-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
2313bbc
commit 927a090
Showing
5 changed files
with
48 additions
and
89 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,32 +1,54 @@ | ||
from palooza_wizard.utils.files import download_or_load_soup | ||
from palooza_wizard.utils.process import process_soup | ||
from palooza_wizard.agent import get_agent_functions | ||
from palooza_wizard.chatgpt import num_tokens_from_string | ||
from palooza_wizard.graph import PaloozaGraph | ||
import palooza_wizard.algorithms as wizard_algos | ||
import palooza_wizard.utils as wizard_utils | ||
import palooza_wizard.agent as agent | ||
import palooza_wizard.constants as ct | ||
from bs4 import BeautifulSoup | ||
# Download or load soup | ||
#url = "https://www.amazon.com/Molblly-Shredded-Standard-Adjustable-Hypoallergenic/dp/B08X4TQJTL/?_encoding=UTF8&pd_rd_w=ABAfK&content-id=amzn1.sym.952cfb50-b01e-485f-be6e-00434541418b%3Aamzn1.symc.e5c80209-769f-4ade-a325-2eaec14b8e0e&pf_rd_p=952cfb50-b01e-485f-be6e-00434541418b&pf_rd_r=DNTYSCCM3CE36ZBNPB9Q&pd_rd_wg=vsfzz&pd_rd_r=a7259352-0e85-4361-934b-108279598eb3&ref_=pd_gw_ci_mcx_mr_hp_atf_m&th=1" | ||
#soup = download_or_load_soup(url, file_path="outputs/html/test.html") | ||
|
||
url = "https://www.amazon.com/" | ||
soup = download_or_load_soup(url, file_path="outputs/html/amazon.html") | ||
#minified = minify_html.minify(str(soup), minify_js=True, remove_processing_instructions=True) | ||
#with open(f"minified.html", "w") as f: | ||
# f.write(minified) | ||
#soup = BeautifulSoup(minified, "html.parser") | ||
if __name__ == "__main__": | ||
|
||
# Clean all output folders | ||
wizard_utils.clean_all_output_folders() | ||
|
||
soup = process_soup(soup) | ||
# remove useless attributes | ||
# prune the classes | ||
website_name = "rubmaps" | ||
|
||
#url = "https://www.rubmaps.ch/erotic-massage-hot-stones-spa-cleveland-oh-184586" | ||
#url = "https://www.amazon.com/Legend-Zelda-Breath-Wild-Nintendo-Switch/dp/B097B2YWFX/ref=sr_1_2?crid=AMBZI2YGNFC3&keywords=zelda%2Bvideo%2Bgame&qid=1698834691&sprefix=zelda%2Bvideogame%2Caps%2C241&sr=8-2&th=1" | ||
#url = "https://www.rubmaps.ch/erotic-massage-windsor-spa-chicago-il-190400#rubmaps" | ||
#soup = wizard_utils.download_or_load_soup(url, f"{website_name}.html", use_proxies=False) | ||
#soup = wizard_utils.process_soup(soup) | ||
|
||
|
||
#soup = soup.find(attrs={"id": "location-container"}) | ||
#soup = BeautifulSoup(f"<body>{str(soup)}</body>") | ||
#soup = wizard_utils.process_soup(soup) | ||
with open("soup3.html", "r") as f: | ||
soup = BeautifulSoup(f.read()) | ||
soup = wizard_utils.process_soup(soup) | ||
|
||
# Each entry pairs an "element" selector (tag / attribute / value) with a "function_name".
# In this view `tasks` is only referenced by the commented-out get_agent_functions(soup, tasks, file_name) call.
tasks = [ ## output of the previous process | ||
#{"element": {"tag": "div", "attribute": "class", "value": "vipEscortsArea"}, "function_name": "get_escorts_info"}, | ||
{"element": {"tag": "div", "attribute": "class", "value": "a-section a-spacing-large"}, "function_name": "get_amazon"} | ||
] | ||
file_name = "outputs/agent/amazon.py" | ||
|
||
string = str(soup.find("div", attrs = {"class": "a-section"})) | ||
palooza_graph = PaloozaGraph() | ||
palooza_graph.get_graph(soup, labels_to_integers = False) | ||
min_depth = 0 | ||
max_candidates = 20 | ||
|
||
# Get candidates | ||
candidates = wizard_algos.degree_importance( | ||
palooza_graph.G, | ||
palooza_graph.root, | ||
min_depth, | ||
max_candidates | ||
) | ||
|
||
# Get soup nodes. | ||
wizard_utils.get_soup_nodes( | ||
website_name, | ||
palooza_graph.G, | ||
soup, | ||
candidates, | ||
verbose = True | ||
) | ||
|
||
# Get agent functions. | ||
agent.get_agent_functions() | ||
|
||
print("Num tokens in string: ", num_tokens_from_string(string, "cl100k_base")) | ||
#get_agent_functions(soup, tasks, file_name) |