diff --git a/.DS_Store b/.DS_Store
index 3e7379c..970cffe 100644
Binary files a/.DS_Store and b/.DS_Store differ
diff --git a/.gitignore b/.gitignore
index 467c327..5f76876 100644
--- a/.gitignore
+++ b/.gitignore
@@ -158,6 +158,5 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
-./outputs/*
-playground
-outputs
\ No newline at end of file
+playground/
+outputs/
\ No newline at end of file
diff --git a/main.py b/main.py
deleted file mode 100644
index 8bc85d0..0000000
--- a/main.py
+++ /dev/null
@@ -1,54 +0,0 @@
-from palooza_wizard.graph import PaloozaGraph
-import palooza_wizard.algorithms as wizard_algos
-import palooza_wizard.utils as wizard_utils
-import palooza_wizard.agent as agent
-import palooza_wizard.constants as ct
-from bs4 import BeautifulSoup
-
-if __name__ == "__main__":
-    # Clean all output folders
-    wizard_utils.clean_all_output_folders()
-
-    website_name = "rubmaps"
-
-    #url = "https://www.rubmaps.ch/erotic-massage-hot-stones-spa-cleveland-oh-184586"
-    #url = "https://www.amazon.com/Legend-Zelda-Breath-Wild-Nintendo-Switch/dp/B097B2YWFX/ref=sr_1_2?crid=AMBZI2YGNFC3&keywords=zelda%2Bvideo%2Bgame&qid=1698834691&sprefix=zelda%2Bvideogame%2Caps%2C241&sr=8-2&th=1"
-    #url = "https://www.rubmaps.ch/erotic-massage-windsor-spa-chicago-il-190400#rubmaps"
-    #soup = wizard_utils.download_or_load_soup(url, f"{website_name}.html", use_proxies=False)
-    #soup = wizard_utils.process_soup(soup)
-
-
-    #soup = soup.find(attrs={"id": "location-container"})
-    #soup = BeautifulSoup(f"{str(soup)}")
-    #soup = wizard_utils.process_soup(soup)
-
-    with open("soup3.html", "r") as f:
-        soup = BeautifulSoup(f.read())
-        soup = wizard_utils.process_soup(soup)
-
-
-    palooza_graph = PaloozaGraph()
-    palooza_graph.get_graph(soup, labels_to_integers = False)
-    min_depth = 0
-    max_candidates = 20
-
-    # Get candidates
-    candidates = wizard_algos.degree_importance(
-        palooza_graph.G,
-        palooza_graph.root,
-        min_depth,
-        max_candidates
-    )
-
-    # Get soup nodes.
-    wizard_utils.get_soup_nodes(
-        website_name,
-        palooza_graph.G,
-        soup,
-        candidates,
-        verbose = True
-    )
-
-    # Get agent functions.
-    agent.get_agent_functions()
-
diff --git a/main2.py b/main2.py
deleted file mode 100644
index 7bc3a60..0000000
--- a/main2.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from palooza_wizard.graph import PaloozaGraph
-import palooza_wizard.algorithms as wizard_algos
-import palooza_wizard.utils as wizard_utils
-import palooza_wizard.agent as agent
-import palooza_wizard.constants as ct
-
-indexes = []
-agent.get_agent_functions(indexes=indexes)
\ No newline at end of file
diff --git a/playground/main.py b/playground/main.py
index 3c0ebc0..8bc85d0 100644
--- a/playground/main.py
+++ b/playground/main.py
@@ -1,32 +1,54 @@
-from palooza_wizard.utils.files import download_or_load_soup
-from palooza_wizard.utils.process import process_soup
-from palooza_wizard.agent import get_agent_functions
-from palooza_wizard.chatgpt import num_tokens_from_string
+from palooza_wizard.graph import PaloozaGraph
+import palooza_wizard.algorithms as wizard_algos
+import palooza_wizard.utils as wizard_utils
+import palooza_wizard.agent as agent
+import palooza_wizard.constants as ct
 from bs4 import BeautifulSoup
 
-# Download or load soup
-#url = "https://www.amazon.com/Molblly-Shredded-Standard-Adjustable-Hypoallergenic/dp/B08X4TQJTL/?_encoding=UTF8&pd_rd_w=ABAfK&content-id=amzn1.sym.952cfb50-b01e-485f-be6e-00434541418b%3Aamzn1.symc.e5c80209-769f-4ade-a325-2eaec14b8e0e&pf_rd_p=952cfb50-b01e-485f-be6e-00434541418b&pf_rd_r=DNTYSCCM3CE36ZBNPB9Q&pd_rd_wg=vsfzz&pd_rd_r=a7259352-0e85-4361-934b-108279598eb3&ref_=pd_gw_ci_mcx_mr_hp_atf_m&th=1"
-#soup = download_or_load_soup(url, file_path="outputs/html/test.html")
-url = "https://www.amazon.com/"
-soup = download_or_load_soup(url, file_path="outputs/html/amazon.html")
-#minified = minify_html.minify(str(soup), minify_js=True, remove_processing_instructions=True)
-#with open(f"minified.html", "w") as f:
-#    f.write(minified)
-#soup = BeautifulSoup(minified, "html.parser")
+if __name__ == "__main__":
+    # Clean all output folders
+    wizard_utils.clean_all_output_folders()
 
-soup = process_soup(soup)
-# remove useless attributes
-# prune the classes
+    website_name = "rubmaps"
+
+    #url = "https://www.rubmaps.ch/erotic-massage-hot-stones-spa-cleveland-oh-184586"
+    #url = "https://www.amazon.com/Legend-Zelda-Breath-Wild-Nintendo-Switch/dp/B097B2YWFX/ref=sr_1_2?crid=AMBZI2YGNFC3&keywords=zelda%2Bvideo%2Bgame&qid=1698834691&sprefix=zelda%2Bvideogame%2Caps%2C241&sr=8-2&th=1"
+    #url = "https://www.rubmaps.ch/erotic-massage-windsor-spa-chicago-il-190400#rubmaps"
+    #soup = wizard_utils.download_or_load_soup(url, f"{website_name}.html", use_proxies=False)
+    #soup = wizard_utils.process_soup(soup)
+
+
+    #soup = soup.find(attrs={"id": "location-container"})
+    #soup = BeautifulSoup(f"{str(soup)}")
+    #soup = wizard_utils.process_soup(soup)
 
-tasks = [ ## output of the previous step
-    #{"element": {"tag": "div", "attribute": "class", "value": "vipEscortsArea"}, "function_name": "get_escorts_info"},
-    {"element": {"tag": "div", "attribute": "class", "value": "a-section a-spacing-large"}, "function_name": "get_amazon"}
-]
-file_name = "outputs/agent/amazon.py"
-string = str(soup.find("div", attrs = {"class": "a-section"}))
+    with open("soup3.html", "r") as f:
+        soup = BeautifulSoup(f.read())
+        soup = wizard_utils.process_soup(soup)
+
+
+    palooza_graph = PaloozaGraph()
+    palooza_graph.get_graph(soup, labels_to_integers = False)
+    min_depth = 0
+    max_candidates = 20
+
+    # Get candidates
+    candidates = wizard_algos.degree_importance(
+        palooza_graph.G,
+        palooza_graph.root,
+        min_depth,
+        max_candidates
+    )
+
+    # Get soup nodes.
+    wizard_utils.get_soup_nodes(
+        website_name,
+        palooza_graph.G,
+        soup,
+        candidates,
+        verbose = True
+    )
+
+    # Get agent functions.
+    agent.get_agent_functions()
 
-print("Num tokens in string: ", num_tokens_from_string(string, "cl100k_base"))
-#get_agent_functions(soup, tasks, file_name)