delete

DatapaloozaCO · Nov 28, 2023 · 927a090 · 927a090
1 parent 2313bbc
commit 927a090
Show file tree

Hide file tree

Showing 5 changed files with 48 additions and 89 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/.gitignore b/.gitignore
@@ -158,6 +158,5 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
-./outputs/*
-playground
-outputs
+playground/
+outputs/
diff --git a/main.py b/main.py
diff --git a/main2.py b/main2.py
diff --git a/playground/main.py b/playground/main.py
@@ -1,32 +1,54 @@
-from palooza_wizard.utils.files import download_or_load_soup
-from palooza_wizard.utils.process import process_soup
-from palooza_wizard.agent import get_agent_functions
-from palooza_wizard.chatgpt import num_tokens_from_string 
+from palooza_wizard.graph import PaloozaGraph
+import palooza_wizard.algorithms as wizard_algos
+import palooza_wizard.utils as wizard_utils
+import palooza_wizard.agent as agent
+import palooza_wizard.constants as ct
 from bs4 import BeautifulSoup
-# Download or load soup
-#url = "https://www.amazon.com/Molblly-Shredded-Standard-Adjustable-Hypoallergenic/dp/B08X4TQJTL/?_encoding=UTF8&pd_rd_w=ABAfK&content-id=amzn1.sym.952cfb50-b01e-485f-be6e-00434541418b%3Aamzn1.symc.e5c80209-769f-4ade-a325-2eaec14b8e0e&pf_rd_p=952cfb50-b01e-485f-be6e-00434541418b&pf_rd_r=DNTYSCCM3CE36ZBNPB9Q&pd_rd_wg=vsfzz&pd_rd_r=a7259352-0e85-4361-934b-108279598eb3&ref_=pd_gw_ci_mcx_mr_hp_atf_m&th=1"
-#soup = download_or_load_soup(url, file_path="outputs/html/test.html")
 
-url = "https://www.amazon.com/"
-soup = download_or_load_soup(url, file_path="outputs/html/amazon.html")
-#minified = minify_html.minify(str(soup), minify_js=True, remove_processing_instructions=True)
-#with open(f"minified.html", "w") as f:
-#    f.write(minified)
-#soup = BeautifulSoup(minified, "html.parser")
+if __name__ == "__main__":
 
+    # Clean all output folders first, before any other step of this script runs.
+    wizard_utils.clean_all_output_folders()
 
-soup = process_soup(soup) 
-# quitar atributos inutiles
-# prune a las clases
+    website_name = "rubmaps"
+    # Alternative inputs tried earlier; all disabled (the page is read from a local file below).
+    #url = "https://www.rubmaps.ch/erotic-massage-hot-stones-spa-cleveland-oh-184586"
+    #url = "https://www.amazon.com/Legend-Zelda-Breath-Wild-Nintendo-Switch/dp/B097B2YWFX/ref=sr_1_2?crid=AMBZI2YGNFC3&keywords=zelda%2Bvideo%2Bgame&qid=1698834691&sprefix=zelda%2Bvideogame%2Caps%2C241&sr=8-2&th=1"
+    #url = "https://www.rubmaps.ch/erotic-massage-windsor-spa-chicago-il-190400#rubmaps"
+    #soup = wizard_utils.download_or_load_soup(url, f"{website_name}.html", use_proxies=False)
+    #soup = wizard_utils.process_soup(soup)
 
+
+    # Earlier experiment (disabled): isolate the "location-container" element.
+    #soup = soup.find(attrs={"id": "location-container"})
+    #soup = BeautifulSoup(f"<body>{str(soup)}</body>")
+    with open("soup3.html", "r", encoding="utf-8") as f:  # fixed encoding, not the locale default
+        soup = BeautifulSoup(f.read(), "html.parser")  # explicit parser: no GuessedAtParserWarning
+    soup = wizard_utils.process_soup(soup)
 
-tasks = [ ## output del proceso anterior
-    #{"element": {"tag": "div", "attribute": "class", "value": "vipEscortsArea"}, "function_name": "get_escorts_info"},
-    {"element": {"tag": "div", "attribute": "class", "value": "a-section a-spacing-large"}, "function_name": "get_amazon"}
-]
-file_name = "outputs/agent/amazon.py"
 
-string = str(soup.find("div", attrs = {"class": "a-section"}))
+    # Build the graph from the processed soup.
+    palooza_graph = PaloozaGraph()
+    palooza_graph.get_graph(soup, labels_to_integers=False)
+
+    # Settings for the degree-importance candidate search, then the search itself.
+    min_depth, max_candidates = 0, 20
+    candidates = wizard_algos.degree_importance(
+        palooza_graph.G,
+        palooza_graph.root,
+        min_depth,
+        max_candidates,
+    )
+
+    # Get the soup nodes for the candidates (verbose mode on).
+    wizard_utils.get_soup_nodes(
+        website_name,
+        palooza_graph.G,
+        soup,
+        candidates,
+        verbose=True,
+    )
+
+    # Get agent functions.
+    agent.get_agent_functions()
 
-print("Num tokens in string: ", num_tokens_from_string(string, "cl100k_base"))
-#get_agent_functions(soup, tasks, file_name)