Skip to content

Commit

Permalink
delete
Browse files Browse the repository at this point in the history
  • Loading branch information
Juanchobanano committed Nov 28, 2023
1 parent 2313bbc commit 927a090
Show file tree
Hide file tree
Showing 5 changed files with 48 additions and 89 deletions.
Binary file modified .DS_Store
Binary file not shown.
5 changes: 2 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,5 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
./outputs/*
playground
outputs
playground/
outputs/
54 changes: 0 additions & 54 deletions main.py

This file was deleted.

8 changes: 0 additions & 8 deletions main2.py

This file was deleted.

70 changes: 46 additions & 24 deletions playground/main.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,54 @@
from palooza_wizard.utils.files import download_or_load_soup
from palooza_wizard.utils.process import process_soup
from palooza_wizard.agent import get_agent_functions
from palooza_wizard.chatgpt import num_tokens_from_string
from palooza_wizard.graph import PaloozaGraph
import palooza_wizard.algorithms as wizard_algos
import palooza_wizard.utils as wizard_utils
import palooza_wizard.agent as agent
import palooza_wizard.constants as ct
from bs4 import BeautifulSoup
# NOTE(review): this span was captured from a unified-diff web rendering.
# Indentation was stripped by the capture and old/new hunk lines appear
# interleaved, so the code below is NOT runnable as shown — confirm against
# the actual playground/main.py before relying on structure. Comments only.
# Download or load soup
#url = "https://www.amazon.com/Molblly-Shredded-Standard-Adjustable-Hypoallergenic/dp/B08X4TQJTL/?_encoding=UTF8&pd_rd_w=ABAfK&content-id=amzn1.sym.952cfb50-b01e-485f-be6e-00434541418b%3Aamzn1.symc.e5c80209-769f-4ade-a325-2eaec14b8e0e&pf_rd_p=952cfb50-b01e-485f-be6e-00434541418b&pf_rd_r=DNTYSCCM3CE36ZBNPB9Q&pd_rd_wg=vsfzz&pd_rd_r=a7259352-0e85-4361-934b-108279598eb3&ref_=pd_gw_ci_mcx_mr_hp_atf_m&th=1"
#soup = download_or_load_soup(url, file_path="outputs/html/test.html")

# Fetch the target page (or reuse the cached copy at file_path) as a soup.
url = "https://www.amazon.com/"
soup = download_or_load_soup(url, file_path="outputs/html/amazon.html")
# Disabled HTML-minification experiment; `minify_html` is not imported in
# the visible import block, so re-enabling this as-is would raise NameError.
#minified = minify_html.minify(str(soup), minify_js=True, remove_processing_instructions=True)
#with open(f"minified.html", "w") as f:
# f.write(minified)
#soup = BeautifulSoup(minified, "html.parser")
# Entry point: clean outputs, load/process a soup, build the Palooza graph,
# rank candidate nodes, dump their HTML, and generate agent functions.
if __name__ == "__main__":

# Clean all output folders
# Resets every output directory so each run starts from a clean slate.
wizard_utils.clean_all_output_folders()

soup = process_soup(soup)
# remove useless attributes
# prune the classes
# Used as the output prefix/identifier for this run's artifacts —
# presumably should match the site being scraped; TODO confirm, since
# the active soup above comes from amazon.com, not rubmaps.
website_name = "rubmaps"

#url = "https://www.rubmaps.ch/erotic-massage-hot-stones-spa-cleveland-oh-184586"
#url = "https://www.amazon.com/Legend-Zelda-Breath-Wild-Nintendo-Switch/dp/B097B2YWFX/ref=sr_1_2?crid=AMBZI2YGNFC3&keywords=zelda%2Bvideo%2Bgame&qid=1698834691&sprefix=zelda%2Bvideogame%2Caps%2C241&sr=8-2&th=1"
#url = "https://www.rubmaps.ch/erotic-massage-windsor-spa-chicago-il-190400#rubmaps"
#soup = wizard_utils.download_or_load_soup(url, f"{website_name}.html", use_proxies=False)
#soup = wizard_utils.process_soup(soup)


#soup = soup.find(attrs={"id": "location-container"})
#soup = BeautifulSoup(f"<body>{str(soup)}</body>")
#soup = wizard_utils.process_soup(soup)
# Rebind `soup` from a locally saved HTML snapshot (replaces the download
# above). NOTE(review): BeautifulSoup is called without an explicit parser
# argument here — consider passing e.g. "html.parser" for determinism.
with open("soup3.html", "r") as f:
soup = BeautifulSoup(f.read())
soup = wizard_utils.process_soup(soup)

# Target elements + generated function names for the agent.
# NOTE(review): `tasks` (and `file_name` below) are only referenced by the
# commented-out get_agent_functions(...) call at the bottom — dead weight
# in the visible code path.
tasks = [ ## output of the previous step
#{"element": {"tag": "div", "attribute": "class", "value": "vipEscortsArea"}, "function_name": "get_escorts_info"},
{"element": {"tag": "div", "attribute": "class", "value": "a-section a-spacing-large"}, "function_name": "get_amazon"}
]
file_name = "outputs/agent/amazon.py"

# HTML of the first "a-section" div, used only for the token count below.
string = str(soup.find("div", attrs = {"class": "a-section"}))
# Build the DOM graph; node labels kept as-is (not mapped to integers).
palooza_graph = PaloozaGraph()
palooza_graph.get_graph(soup, labels_to_integers = False)
min_depth = 0
max_candidates = 20

# Get candidates
# Rank up to max_candidates nodes by degree importance, starting at root.
candidates = wizard_algos.degree_importance(
palooza_graph.G,
palooza_graph.root,
min_depth,
max_candidates
)

# Get soup nodes.
# Extract and persist the HTML for each candidate node under website_name.
wizard_utils.get_soup_nodes(
website_name,
palooza_graph.G,
soup,
candidates,
verbose = True
)

# Get agent functions.
agent.get_agent_functions()

# Report the token footprint of the sampled section (cl100k_base encoding).
print("Num tokens in string: ", num_tokens_from_string(string, "cl100k_base"))
#get_agent_functions(soup, tasks, file_name)

0 comments on commit 927a090

Please sign in to comment.