From 5b0d9590e1b3a6e2da3d7d22f9652b7e693e466b Mon Sep 17 00:00:00 2001 From: Juanchobanano Date: Tue, 28 Nov 2023 16:05:17 +0200 Subject: [PATCH] Add super awersome feature --- .gitignore | 2 +- .pre-commit-config.yaml | 11 +++ LICENSE.md | 2 +- palooza_wizard/__init__.py | 2 +- palooza_wizard/agent.py | 38 ++++++---- palooza_wizard/algorithms/__init__.py | 2 +- .../algorithms/degree_importance.py | 72 +++++++++---------- palooza_wizard/chatgpt/__init__.py | 2 +- palooza_wizard/chatgpt/chatgpt.py | 31 +++++--- palooza_wizard/chatgpt/constants.py | 12 ++-- palooza_wizard/chatgpt/pricing.py | 10 ++- palooza_wizard/chatgpt/tokens.py | 4 +- palooza_wizard/constants.py | 8 +-- palooza_wizard/graph.py | 32 ++++----- palooza_wizard/utils/__init__.py | 2 +- palooza_wizard/utils/files.py | 41 +++++++---- palooza_wizard/utils/process.py | 3 +- palooza_wizard/utils/soup.py | 40 ++++++----- palooza_wizard_cli.py | 50 ++++++++----- setup.py | 16 ++--- ...342\231\202\357\270\217_Palooza_Wizard.py" | 62 ++++++++-------- .../pages/02_\360\237\244\224_About.py" | 10 +-- .../pages/03_\360\237\232\200_Datapalooza.py" | 49 +++++++++---- streamlit/utils/__init__.py | 2 +- streamlit/utils/error_box.py | 3 +- 25 files changed, 300 insertions(+), 206 deletions(-) create mode 100644 .pre-commit-config.yaml diff --git a/.gitignore b/.gitignore index 5f76876..c21b5fd 100644 --- a/.gitignore +++ b/.gitignore @@ -159,4 +159,4 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ playground/ -outputs/ \ No newline at end of file +outputs/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..48ee030 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,11 @@ +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v2.3.0 + hooks: + - id: check-yaml + - id: end-of-file-fixer + - id: trailing-whitespace +- repo: https://github.com/psf/black + rev: 22.10.0 + hooks: + - id: black diff --git a/LICENSE.md b/LICENSE.md index 1a6c82a..b77357c 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -17,4 +17,4 @@ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION -WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/palooza_wizard/__init__.py b/palooza_wizard/__init__.py index 00fd90b..18a8364 100644 --- a/palooza_wizard/__init__.py +++ b/palooza_wizard/__init__.py @@ -2,4 +2,4 @@ import palooza_wizard.constants as constants for folder in constants.FOLDERS: - files.create_folder_if_not_exists(folder) \ No newline at end of file + files.create_folder_if_not_exists(folder) diff --git a/palooza_wizard/agent.py b/palooza_wizard/agent.py index d2b3e09..280e028 100644 --- a/palooza_wizard/agent.py +++ b/palooza_wizard/agent.py @@ -2,10 +2,11 @@ import palooza_wizard.chatgpt as pwc import palooza_wizard.constants as ct from typing import List -import os +import os + def get_agent_code(file_name: str): - python_code = load_code_string(file_name = file_name) + python_code = load_code_string(file_name=file_name) system_message = pwc.get_system_message_for_agent() messages = [ {"role": "system", "content": system_message}, @@ -16,25 +17,33 @@ def get_agent_code(file_name: str): with open(f"./{file_name}", "w") as f: f.write(agent_code) + def get_element_metadata(task: dict) -> tuple: data = task["element"] tag, attribute, value = data["tag"], data["attribute"], data["value"] return tag, attribute, value + def get_agent_function(file_path: str) -> str: - with open(f'{ct.IMPORTANCE_OUTPUT_FOLDER}/{file_path}', "r", encoding='windows-1252') as f: + with open( + f"{ct.IMPORTANCE_OUTPUT_FOLDER}/{file_path}", "r", encoding="windows-1252" + ) as f: user_message = f.read() function_name = file_path - messages = pwc.get_messages_for_function(user_message = user_message, function_name = function_name) - completion = pwc.get_completion_from_messages(messages = messages) + messages = pwc.get_messages_for_function( + user_message=user_message, function_name=function_name + ) + completion = pwc.get_completion_from_messages(messages=messages) return completion + def save_completion(completion: str, file_name: str) -> None: file_name = file_name.replace(".html", ".py") with open(f"{ct.AGENT_OUTPUT_FOLDER}/{file_name}", "a") as f: f.write(completion) f.write("\n\n") + def get_agent_functions() -> None: """Get agent functions @@ -45,23 +54,24 @@ def get_agent_functions() -> None: Return: - None """ - #try: + # try: # #os.remove(f"{ct.FUNCTIONS_OUTPUT_FOLDER}/{file_name}") # os.remove(file_name) - #except: + # except: # pass - + file_paths = os.listdir(ct.IMPORTANCE_OUTPUT_FOLDER) - #file_paths = [file_paths[index] for index in indexes] + # file_paths = [file_paths[index] for index in indexes] for file_path in file_paths: completion = get_agent_function(file_path) - completion = pwc.format_python_completion(completion = completion) - save_completion(completion = completion, file_name = file_path) + completion = pwc.format_python_completion(completion=completion) + save_completion(completion=completion, file_name=file_path) + def load_code_string(file_name: str): - #{ct.FUNCTIONS_OUTPUT_FOLDER} + # {ct.FUNCTIONS_OUTPUT_FOLDER} with open(f"./{file_name}", "r") as f: - python_code = f.read() - print(python_code) + python_code = f.read() + print(python_code) return python_code diff --git a/palooza_wizard/algorithms/__init__.py b/palooza_wizard/algorithms/__init__.py index 80d55f6..46c75fc 100644 --- a/palooza_wizard/algorithms/__init__.py +++ b/palooza_wizard/algorithms/__init__.py @@ -1 +1 @@ -from .degree_importance import degree_importance \ No newline at end of file +from .degree_importance import degree_importance diff --git a/palooza_wizard/algorithms/degree_importance.py b/palooza_wizard/algorithms/degree_importance.py index 37fc8ee..92f4f11 100644 --- a/palooza_wizard/algorithms/degree_importance.py +++ b/palooza_wizard/algorithms/degree_importance.py @@ -2,71 +2,67 @@ from typing import List import palooza_wizard.constants as ct from bs4 import BeautifulSoup -import os -import joblib -import sys - -def filter_candidates_by_containment( - graph: nx.DiGraph, - candidates: List[int] - ): - """Para todo g1, g2 e I, g1 no contiene a g2 ni g2 a g1 - """ +import os +import joblib +import sys + + +def filter_candidates_by_containment(graph: nx.DiGraph, candidates: List[int]): + """Para todo g1, g2 e I, g1 no contiene a g2 ni g2 a g1""" inadmissable_nodes = [] for candidate in candidates: if candidate in inadmissable_nodes: - continue + continue descendants = list(nx.descendants(graph, candidate)) inadmissable_nodes = inadmissable_nodes + descendants yield candidate + def filter_candidates_by_depth( - graph: nx.DiGraph, - root, - candidates: List[int], - min_depth: int - ): - nodes_depth = nx.shortest_path_length(graph, root) - for candidate in candidates: + graph: nx.DiGraph, root, candidates: List[int], min_depth: int +): + nodes_depth = nx.shortest_path_length(graph, root) + for candidate in candidates: if nodes_depth[candidate] > min_depth: - yield candidate + yield candidate + def filter_candidates( - graph: nx.DiGraph, - root: str, - candidates: List[int], - min_depth: int = 3 - ): - candidates = [x for x in filter_candidates_by_depth(graph, root, candidates, min_depth)] + graph: nx.DiGraph, root: str, candidates: List[int], min_depth: int = 3 +): + candidates = [ + x for x in filter_candidates_by_depth(graph, root, candidates, min_depth) + ] candidates = [x for x in filter_candidates_by_containment(graph, candidates)] - return candidates + return candidates + def degree_importance( - graph: nx.DiGraph, - root: str, - min_depth: int, - max_candidates: int, - verbose: bool = False - ) -> List[int]: - - # Get degree of nodes. + graph: nx.DiGraph, + root: str, + min_depth: int, + max_candidates: int, + verbose: bool = False, +) -> List[int]: + + # Get degree of nodes. nodes_degree = list(graph.degree(graph.nodes())) # Sort nodes by degree. - nodes_degree.sort(key = lambda z: z[1], reverse=True) + nodes_degree.sort(key=lambda z: z[1], reverse=True) nodes_degree = nodes_degree[:max_candidates] - #for key, value in nodes_degree: + # for key, value in nodes_degree: # print(f"{key}: {value}") # Get candidates of nodes. candidates = [x[0] for x in nodes_degree] # Filter candidates. - candidates = filter_candidates(graph, root, candidates, min_depth = min_depth) + candidates = filter_candidates(graph, root, candidates, min_depth=min_depth) - if verbose: + if verbose: print("Candidates") print(candidates) diff --git a/palooza_wizard/chatgpt/__init__.py b/palooza_wizard/chatgpt/__init__.py index d29aa16..044686a 100644 --- a/palooza_wizard/chatgpt/__init__.py +++ b/palooza_wizard/chatgpt/__init__.py @@ -1,3 +1,3 @@ from .pricing import * from .tokens import * -from .chatgpt import * \ No newline at end of file +from .chatgpt import * diff --git a/palooza_wizard/chatgpt/chatgpt.py b/palooza_wizard/chatgpt/chatgpt.py index 52bf5f3..5e0d7fa 100644 --- a/palooza_wizard/chatgpt/chatgpt.py +++ b/palooza_wizard/chatgpt/chatgpt.py @@ -1,5 +1,5 @@ import openai -from typing import List +from typing import List from dotenv import dotenv_values import palooza_wizard.constants as ct import palooza_wizard.chatgpt as chatgpt @@ -8,34 +8,40 @@ openai.api_key = ct.OPEN_AI_API_KEY -def get_completion_from_messages(messages, model="gpt-4", temperature=0, max_tokens=500): # gpt-4 + +def get_completion_from_messages( + messages, model="gpt-4", temperature=0, max_tokens=500 +): # gpt-4 response = openai.ChatCompletion.create( model=model, messages=messages, - temperature=temperature, - max_tokens=max_tokens, + temperature=temperature, + max_tokens=max_tokens, ) return response.choices[0].message["content"] + def get_system_message_for_function(function_name: str) -> str: system_message = f""" - Create a Python function with a proper name starting with '{function_name}' that extracts all relevant data from - the provided HTML code by the user and returns the data as a dictionary. The keys of the dictionaries should be as - few as possible. Utilize Beautiful Soup (beautifulsoup4) to implement this Python function. The output should + Create a Python function with a proper name starting with '{function_name}' that extracts all relevant data from + the provided HTML code by the user and returns the data as a dictionary. The keys of the dictionaries should be as + few as possible. Utilize Beautiful Soup (beautifulsoup4) to implement this Python function. The output should consist solely of Python code without any English words. """ return system_message + def get_system_message_for_agent() -> str: system_message = f""" - Create a python class called 'agent' which implements as methods the defined functions passed by the user in the prompt. + Create a python class called 'agent' which implements as methods the defined functions passed by the user in the prompt. In addition, include a new method called 'extract_data' that accepts as argument a beautifulsoup object and a url and which uses all the methods of the class and returns a dictionary with the result of each method. Constructor method must not recieve any parameter, instead, each method within the class must accept as parameter a BeautifulSoup object """ return system_message + def get_messages_for_function(user_message: str, function_name: str) -> List[str]: - system_message = get_system_message_for_function(function_name) + system_message = get_system_message_for_function(function_name) num_tokens = chatgpt.num_tokens_for_model(system_message + user_message) print("Number of tokens to be sent: ", num_tokens) messages = [ @@ -44,8 +50,11 @@ def get_messages_for_function(user_message: str, function_name: str) -> List[str ] return messages + def format_python_completion(completion: str) -> str: if completion.find("```python") != -1: - a, b = completion.find("```python"), completion.find("```", len("```python") + 1, len(completion)) - return completion[a + len("```python"): b] + a, b = completion.find("```python"), completion.find( + "```", len("```python") + 1, len(completion) + ) + return completion[a + len("```python") : b] return completion diff --git a/palooza_wizard/chatgpt/constants.py b/palooza_wizard/chatgpt/constants.py index 22381c2..4a317df 100644 --- a/palooza_wizard/chatgpt/constants.py +++ b/palooza_wizard/chatgpt/constants.py @@ -3,18 +3,18 @@ "training": 0.0004, "input_usage": 0.0016, "output_usage": 0.0016, - "num_tokens": 1000 - }, + "num_tokens": 1000, + }, "davinci-002": { "training": 0.0060, "input_usage": 0.0120, "output_usage": 0.0120, - "num_tokens": 1000 + "num_tokens": 1000, }, "GPT-3.5-Turbo": { "training": 0.0080, "input_usage": 0.0120, "output_usage": 0.0160, - "num_tokens": 1000 - } -} \ No newline at end of file + "num_tokens": 1000, + }, +} diff --git a/palooza_wizard/chatgpt/pricing.py b/palooza_wizard/chatgpt/pricing.py index 22df797..6a8b874 100644 --- a/palooza_wizard/chatgpt/pricing.py +++ b/palooza_wizard/chatgpt/pricing.py @@ -1,27 +1,31 @@ import palooza_wizard.chatgpt.constants as ct -from typing import List +from typing import List + def get_available_models() -> List[str]: return list(ct.PRICING.keys()) + def validate_model(model: str) -> bool: return model in ct.PRICING.keys() + def estimated_training_cost(num_tokens: int, model: str = "GPT-3.5-Turbo") -> float: assert validate_model(model), "Invalid model" num_tokens_per_cost = ct.PRICING[model]["num_tokens"] training_cost = ct.PRICING[model]["training"] return (num_tokens / num_tokens_per_cost) * training_cost + def estimated_input_usage_cost(num_tokens: int, model: str = "GPT-3.5-Turbo") -> float: assert validate_model(model), "Invalid model" num_tokens_per_cost = ct.PRICING[model]["num_tokens"] input_cost = ct.PRICING[model]["input_usage"] return (num_tokens / num_tokens_per_cost) * input_cost -def estimated_output_usage_cost(num_tokens: int, model: str = "GPT-3.5-Turbo") -> float: + +def estimated_output_usage_cost(num_tokens: int, model: str = "GPT-3.5-Turbo") -> float: assert validate_model(model), "Invalid model" num_tokens_per_cost = ct.PRICING[model]["num_tokens"] output_cost = ct.PRICING[model]["output_usage"] return (num_tokens / num_tokens_per_cost) * output_cost - diff --git a/palooza_wizard/chatgpt/tokens.py b/palooza_wizard/chatgpt/tokens.py index 72db2bc..f958296 100644 --- a/palooza_wizard/chatgpt/tokens.py +++ b/palooza_wizard/chatgpt/tokens.py @@ -1,5 +1,6 @@ import tiktoken + def num_tokens_with_encoding(string: str, encoding_name: str = "cl100k_base") -> int: """This function computes the number of tokens in a string #https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb @@ -8,10 +9,11 @@ def num_tokens_with_encoding(string: str, encoding_name: str = "cl100k_base") -> num_tokens = len(encoding.encode(string)) return num_tokens + def num_tokens_for_model(string: str, model_name: str = "gpt-4") -> int: """This function computers the number of token in a string for a specific model name #https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb """ encoding = tiktoken.encoding_for_model(model_name) num_tokens = len(encoding.encode(string)) - return num_tokens \ No newline at end of file + return num_tokens diff --git a/palooza_wizard/constants.py b/palooza_wizard/constants.py index 32c33db..16b11f4 100644 --- a/palooza_wizard/constants.py +++ b/palooza_wizard/constants.py @@ -11,7 +11,7 @@ IMPORTANCE_OUTPUT_FOLDER = "./outputs/importance/" JSONS_OUTPUT_FOLDER = "./outputs/jsons" SOUPS_OUTPUT_FOLDER = "./outputs/soups" -DATA_OUTPUT_FOLDER = "./outputs/data" +DATA_OUTPUT_FOLDER = "./outputs/data" FOLDERS = [ HTML_OUTPUT_FOLDER, @@ -19,7 +19,7 @@ AGENT_OUTPUT_FOLDER, IMPORTANCE_OUTPUT_FOLDER, JSONS_OUTPUT_FOLDER, - SOUPS_OUTPUT_FOLDER + SOUPS_OUTPUT_FOLDER, ] # Proxies data @@ -27,9 +27,9 @@ PROXY_USERNAME = config["USERNAME"] PROXY_PASSWORD = config["PASSWORD"] -FAKE_USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; rv:84.0) Gecko/20100101 Firefox/84.0' +FAKE_USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; rv:84.0) Gecko/20100101 Firefox/84.0" REQUEST_TIMEOUT = 10 # Selector label SELECTOR_LABEL = "selector" -ROOT_LABEL = "root" \ No newline at end of file +ROOT_LABEL = "root" diff --git a/palooza_wizard/graph.py b/palooza_wizard/graph.py index 5e21703..565fb9d 100644 --- a/palooza_wizard/graph.py +++ b/palooza_wizard/graph.py @@ -6,6 +6,7 @@ from typing import List import palooza_wizard.constants as ct + class PaloozaGraph: def __init__(self) -> None: # Initialize global variables @@ -36,35 +37,34 @@ def get_color(self, soup: BeautifulSoup) -> None: self.sizes.append(1) # Get node name - def get_node_name( - self, - soup: BeautifulSoup, - parent_name: str, - index: int - ) -> None: + def get_node_name(self, soup: BeautifulSoup, parent_name: str, index: int) -> None: self.get_color(soup) node_name = f"{parent_name}__{index}__{str(soup.name)}__*" return node_name # Get node properties - def get_node_properties(self, soup: BeautifulSoup, parent_name: str, node_name: str) -> dict: + def get_node_properties( + self, soup: BeautifulSoup, parent_name: str, node_name: str + ) -> dict: properties = { - "tag": soup.name, # h1, h2, p, ..., + "tag": soup.name, # h1, h2, p, ..., "parent_name": parent_name, "node_name": node_name, "number": self.counter, - "class": soup.get("class"), # ['a', 'b'] - "id": soup.get("id"), # + "class": soup.get("class"), # ['a', 'b'] + "id": soup.get("id"), # ct.SELECTOR_LABEL: { "tag": soup.name, "class": soup.get("class"), - "id": soup.get("id") - } + "id": soup.get("id"), + }, } return properties # Add nodes to the graph - def add_nodes(self, soup: BeautifulSoup, parent_name: str = "", index: int = 1, depth: int = 0): + def add_nodes( + self, soup: BeautifulSoup, parent_name: str = "", index: int = 1, depth: int = 0 + ): if soup is None: return @@ -87,12 +87,12 @@ def add_nodes(self, soup: BeautifulSoup, parent_name: str = "", index: int = 1, children = soup.findChildren(recursive=False) # While there is just one children, keep going down - #while len(children) == 1: + # while len(children) == 1: # children = children[0] # children = children.findChildren(recursive=False) # Add children only if there is more than 1 children - #if len(children) > 1: + # if len(children) > 1: for i in range(len(children)): self.add_nodes(children[i], node_name, i + 1, depth + 1) @@ -100,4 +100,4 @@ def get_graph(self, soup: BeautifulSoup, labels_to_integers: bool = True): self.add_nodes(soup, ct.ROOT_LABEL) if labels_to_integers: self.G = nx.convert_node_labels_to_integers(self.G) - self.root = 0 \ No newline at end of file + self.root = 0 diff --git a/palooza_wizard/utils/__init__.py b/palooza_wizard/utils/__init__.py index 1d393f6..ee08e01 100644 --- a/palooza_wizard/utils/__init__.py +++ b/palooza_wizard/utils/__init__.py @@ -1,3 +1,3 @@ from .files import * from .process import * -from .soup import * \ No newline at end of file +from .soup import * diff --git a/palooza_wizard/utils/files.py b/palooza_wizard/utils/files.py index cac4dee..8f85425 100644 --- a/palooza_wizard/utils/files.py +++ b/palooza_wizard/utils/files.py @@ -6,6 +6,7 @@ import shutil from typing import List + def get_files_in_folder(folder_path: str, full_path: bool = True) -> List[str]: if not file_exists(folder_path): raise Exception("Folder not found") @@ -15,76 +16,89 @@ def get_files_in_folder(folder_path: str, full_path: bool = True) -> List[str]: files = [os.path.join(folder_path, file) for file in files] return files + def get_request_with_proxies(url: str, use_proxies: bool = True): if use_proxies: auth = HTTPProxyAuth(ct.PROXY_USERNAME, ct.PROXY_PASSWORD) data = requests.get( - url = url, - proxies = ct.PROXIES, - auth = auth, - headers = {"User-Agent": ct.FAKE_USER_AGENT}, - timeout = ct.REQUEST_TIMEOUT + url=url, + proxies=ct.PROXIES, + auth=auth, + headers={"User-Agent": ct.FAKE_USER_AGENT}, + timeout=ct.REQUEST_TIMEOUT, ) else: data = requests.get( - url = url, - headers = {"User-Agent": ct.FAKE_USER_AGENT}, - timeout = ct.REQUEST_TIMEOUT + url=url, + headers={"User-Agent": ct.FAKE_USER_AGENT}, + timeout=ct.REQUEST_TIMEOUT, ) return data + def create_folder_if_not_exists(folder_path: str) -> bool: """This function check if a folder exists in a given path, otherwise, it creates the folder""" if not os.path.exists(folder_path): os.makedirs(folder_path) return True + def file_exists(file_path: str) -> bool: """This function check if a file exists in a given path""" return os.path.exists(file_path) + def get_html_from_file(file_path: str) -> str: """Load HTML file""" with open(file_path, "r") as f: html = f.read() return html -def get_soup_from_file(file_path: str) -> BeautifulSoup: + +def get_soup_from_file(file_path: str) -> BeautifulSoup: """Load HTML file and return a BeautifulSoup object""" html = get_html_from_file(file_path) soup = BeautifulSoup(html, "html.parser") return soup + def get_html_from_url(url: str, use_proxies: bool = True) -> str: """Download HTML file and return a string object""" data = get_request_with_proxies(url, use_proxies) return data.content + def get_soup_from_url(url: str, use_proxies: bool = True) -> BeautifulSoup: """Download HTML file and return a BeautifulSoup object using get_html_from_url function""" - soup = get_html_from_url(url, use_proxies = use_proxies) + soup = get_html_from_url(url, use_proxies=use_proxies) soup = BeautifulSoup(soup, "html.parser") return soup + def save_html_to_file(html: str, file_path: str) -> None: """Save HTML string to a file""" with open(file_path, "w") as f: f.write(html) + def save_soup_to_file(soup: BeautifulSoup, file_path: str) -> None: """Save BeautifulSoup object to a file""" html = str(soup) save_html_to_file(html, file_path) -def download_or_load_soup(url: str, file_path: str, use_proxies: bool = True) -> BeautifulSoup: + +def download_or_load_soup( + url: str, file_path: str, use_proxies: bool = True +) -> BeautifulSoup: """Download HTML file or load HTML file""" if file_exists(file_path): soup = get_soup_from_file(file_path) else: - soup = get_soup_from_url(url, use_proxies = use_proxies) + soup = get_soup_from_url(url, use_proxies=use_proxies) save_soup_to_file(soup, file_path) return soup + def delete_all_files_in_folder(folder_path: str): for filename in os.listdir(folder_path): file_path = os.path.join(folder_path, filename) @@ -94,7 +108,8 @@ def delete_all_files_in_folder(folder_path: str): elif os.path.isdir(file_path): shutil.rmtree(file_path) except Exception as e: - print('Failed to delete %s. Reason: %s' % (file_path, e)) + print("Failed to delete %s. Reason: %s" % (file_path, e)) + def clean_all_output_folders(): for folder in ct.FOLDERS: diff --git a/palooza_wizard/utils/process.py b/palooza_wizard/utils/process.py index ef59983..3337a83 100644 --- a/palooza_wizard/utils/process.py +++ b/palooza_wizard/utils/process.py @@ -1,8 +1,9 @@ from bs4 import BeautifulSoup + def process_soup(soup: BeautifulSoup) -> BeautifulSoup: """Select HTML body and remove all script tags""" soup = soup.find("body") - for s in soup.select('script'): + for s in soup.select("script"): s.extract() return soup diff --git a/palooza_wizard/utils/soup.py b/palooza_wizard/utils/soup.py index a98bfd8..eaee769 100644 --- a/palooza_wizard/utils/soup.py +++ b/palooza_wizard/utils/soup.py @@ -1,11 +1,12 @@ from bs4 import BeautifulSoup -from typing import List +from typing import List import palooza_wizard.constants as ct import networkx as nx + def process_parent_name(element_path: str) -> dict: """This function maps a string element_path to a dictionary - Input: nodo1__div__*__nodo2__div__*__nodo3__div__* + Input: nodo1__div__*__nodo2__div__*__nodo3__div__* Output: {0: ('div', 'nodo1'), 1: ('div', 'nodo2'), 2: ('div', 'nodo3')} """ path_info = {} @@ -14,8 +15,8 @@ def process_parent_name(element_path: str) -> dict: for edge in element_path: edge = edge.split("__") - edge = [x for x in edge if x != ''] - if len(edge) == 0: + edge = [x for x in edge if x != ""] + if len(edge) == 0: continue tag = edge[1] value = edge[0] @@ -23,8 +24,9 @@ def process_parent_name(element_path: str) -> dict: depth += 1 return path_info + def get_element_with_path(soup: BeautifulSoup, path: dict): - """Based on a soup and a path to a important node, recursively + """Based on a soup and a path to a important node, recursively explore the soup to get the element. {0: ('body', '0'), 1: ('div', '1'), 2: ('div', '4')} #print(path) @@ -32,24 +34,25 @@ def get_element_with_path(soup: BeautifulSoup, path: dict): """ counter = 0 for _, value in path.items(): - counter += 1 + counter += 1 # Skip body tag. if counter == 1: continue soup = soup.findChildren(recursive=False)[int(value[1]) - 1] return soup - + + def get_soup_nodes( - website_name: str, - graph: nx.DiGraph, - soup: BeautifulSoup, - candidates: List[int], - verbose: bool = False - ): + website_name: str, + graph: nx.DiGraph, + soup: BeautifulSoup, + candidates: List[int], + verbose: bool = False, +): """Based on the computed candidates, get the soup of each one of them. Note that this method is agnostic to any node importance algorithm. """ - # Get selectors. + # Get selectors. parents_name = nx.get_node_attributes(graph, "node_name") for key, value in parents_name.items(): parents_name[key] = process_parent_name(value) @@ -61,7 +64,8 @@ def get_soup_nodes( soup_candidate = get_element_with_path(soup, candidate_path) with open( f"{ct.IMPORTANCE_OUTPUT_FOLDER}/{website_name}_{counter + 1}.html", - "w", - encoding="utf-8") as f: - f.write(str(soup_candidate)) - counter += 1 \ No newline at end of file + "w", + encoding="utf-8", + ) as f: + f.write(str(soup_candidate)) + counter += 1 diff --git a/palooza_wizard_cli.py b/palooza_wizard_cli.py index ce44579..034cfe3 100644 --- a/palooza_wizard_cli.py +++ b/palooza_wizard_cli.py @@ -10,28 +10,46 @@ def main(): ap = argparse.ArgumentParser() - ap.add_argument('-q', help='query', required=True) - ap.add_argument('-e', help='search engine(s) - ' + ', '.join(search_engines_dict), default='google') - ap.add_argument('-o', help='output file [html, csv, json]', default='print') - ap.add_argument('-n', help='filename for output file', default=config.OUTPUT_DIR+'output') - ap.add_argument('-p', help='number of pages', default=config.SEARCH_ENGINE_RESULTS_PAGES, type=int) - ap.add_argument('-f', help='filter results [url, title, text, host]', default=None) - ap.add_argument('-i', help='ignore duplicats, useful when multiple search engines are used', action='store_true') - ap.add_argument('-proxy', help='use proxy (protocol://ip:port)', default=config.PROXY) - + ap.add_argument("-q", help="query", required=True) + ap.add_argument( + "-e", + help="search engine(s) - " + ", ".join(search_engines_dict), + default="google", + ) + ap.add_argument("-o", help="output file [html, csv, json]", default="print") + ap.add_argument( + "-n", help="filename for output file", default=config.OUTPUT_DIR + "output" + ) + ap.add_argument( + "-p", + help="number of pages", + default=config.SEARCH_ENGINE_RESULTS_PAGES, + type=int, + ) + ap.add_argument("-f", help="filter results [url, title, text, host]", default=None) + ap.add_argument( + "-i", + help="ignore duplicats, useful when multiple search engines are used", + action="store_true", + ) + ap.add_argument( + "-proxy", help="use proxy (protocol://ip:port)", default=config.PROXY + ) + args = ap.parse_args() proxy = args.proxy timeout = config.TIMEOUT + (10 * bool(proxy)) engines = [ - e.strip() for e in args.e.lower().split(',') - if e.strip() in search_engines_dict or e.strip() == 'all' + e.strip() + for e in args.e.lower().split(",") + if e.strip() in search_engines_dict or e.strip() == "all" ] if not engines: - print('Please choose a search engine: ' + ', '.join(search_engines_dict)) + print("Please choose a search engine: " + ", ".join(search_engines_dict)) else: - if 'all' in engines: + if "all" in engines: engine = AllSearchEngines(proxy, timeout) elif len(engines) > 1: engine = MultipleSearchEngines(engines, proxy, timeout) @@ -41,10 +59,10 @@ def main(): engine.ignore_duplicate_urls = args.i if args.f: engine.set_search_operator(args.f) - + engine.search(args.q, args.p) engine.output(args.o, args.n) -if __name__ == '__main__': - main() +if __name__ == "__main__": + main() diff --git a/setup.py b/setup.py index 12df261..fed7c65 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ from setuptools import setup, find_packages -import codecs -import os +import codecs +import os here = os.path.abspath(os.path.dirname(__file__)) @@ -8,17 +8,17 @@ long_description = "\\n" + fh.read() requirements = [] -with open('requirements.txt') as f: +with open("requirements.txt") as f: requirements = f.readlines() setup( name="palooza_wizard", - version=1, #'{{VERSION_PLACEHOLDER}}', + version=1, #'{{VERSION_PLACEHOLDER}}', description="Datapalooza Scraper Framework", - url = "https://github.com/DatapaloozaCO/datapalooza_scraper_framework", + url="https://github.com/DatapaloozaCO/datapalooza_scraper_framework", long_description_content_type="text/markdown", long_description=long_description, packages=find_packages(), - #install_requires=requirements, - keywords=['scraper', 'crawler'] -) \ No newline at end of file + # install_requires=requirements, + keywords=["scraper", "crawler"], +) diff --git "a/streamlit/01_\360\237\247\231\342\200\215\342\231\202\357\270\217_Palooza_Wizard.py" "b/streamlit/01_\360\237\247\231\342\200\215\342\231\202\357\270\217_Palooza_Wizard.py" index eae27c1..3f6664a 100644 --- "a/streamlit/01_\360\237\247\231\342\200\215\342\231\202\357\270\217_Palooza_Wizard.py" +++ "b/streamlit/01_\360\237\247\231\342\200\215\342\231\202\357\270\217_Palooza_Wizard.py" @@ -1,49 +1,51 @@ import streamlit as st -import trubrics -import validators +import trubrics +import validators import streamlit.components.v1 as components import palooza_wizard as wizard -st.set_page_config(page_title='Palooza Wizard πŸ§™β€β™‚οΈ', page_icon='πŸ§™β€β™‚οΈ', layout='wide') -st.title('Palooza Wizard πŸ§™β€β™‚οΈ') +st.set_page_config(page_title="Palooza Wizard πŸ§™β€β™‚οΈ", page_icon="πŸ§™β€β™‚οΈ", layout="wide") +st.title("Palooza Wizard πŸ§™β€β™‚οΈ") -st.markdown("Palooza Wizard ✨ is a powerful tool that allows users to **create web scrapers with minimal effort**. With **just a URL as input**, this framework generates a Python script 🐍 that enables users to scrape data from a website of their choice 🌐.") +st.markdown( + "Palooza Wizard ✨ is a powerful tool that allows users to **create web scrapers with minimal effort**. With **just a URL as input**, this framework generates a Python script 🐍 that enables users to scrape data from a website of their choice 🌐." +) st.divider() interacted = 0 -url = st.text_input( - "Input an URL", - placeholder = "https://www.google.com/", - value = "" -) -valid_url = (1 if validators.url(url) else 0) +url = st.text_input("Input an URL", placeholder="https://www.google.com/", value="") +valid_url = 1 if validators.url(url) else 0 if not valid_url and url != "": - st.error('The URL is not valid', icon="🚨") -continue_button = st.button("Continue", disabled = not valid_url) + st.error("The URL is not valid", icon="🚨") +continue_button = st.button("Continue", disabled=not valid_url) st.subheader("Wizard Results") col1, col2 = st.columns(2) -with col1: +with col1: st.markdown("Node extracted") - components.html(""" + components.html( + """ - """) -with col2: + """ + ) +with col2: st.markdown("Data preview") - st.json({ - 'foo': 'bar', - 'baz': 'boz', - 'stuff': [ - 'stuff 1', - 'stuff 2', - 'stuff 3', - 'stuff 5', - ], - }) - agree = st.checkbox('I want this section') + st.json( + { + "foo": "bar", + "baz": "boz", + "stuff": [ + "stuff 1", + "stuff 2", + "stuff 3", + "stuff 5", + ], + } + ) + agree = st.checkbox("I want this section") if agree: - st.write('You have selected this section!') + st.write("You have selected this section!") -st.button("Descargar mis datos πŸ“ en formato excel") \ No newline at end of file +st.button("Descargar mis datos πŸ“ en formato excel") diff --git "a/streamlit/pages/02_\360\237\244\224_About.py" "b/streamlit/pages/02_\360\237\244\224_About.py" index 3f3e8a0..5c832a8 100644 --- "a/streamlit/pages/02_\360\237\244\224_About.py" +++ "b/streamlit/pages/02_\360\237\244\224_About.py" @@ -1,9 +1,9 @@ import streamlit as st -import trubrics -import validators +import trubrics +import validators -st.set_page_config(page_title='Palooza Wizard πŸ§™β€β™‚οΈ', page_icon='πŸ§™β€β™‚οΈ', layout='wide') -st.title('About Palooza Wizard πŸ€”') +st.set_page_config(page_title="Palooza Wizard πŸ§™β€β™‚οΈ", page_icon="πŸ§™β€β™‚οΈ", layout="wide") +st.title("About Palooza Wizard πŸ€”") st.header("πŸ€“ Goal") st.markdown("lorem10") @@ -18,4 +18,4 @@ st.markdown("lorem10") st.markdown("### ➑️ Next Page: [πŸš€ ](/)", unsafe_allow_html=False) -st.markdown("### ➑️ Visit our Website: [πŸš€ Datapalooza](https://datapalooza.co)") \ No newline at end of file +st.markdown("### ➑️ Visit our Website: [πŸš€ Datapalooza](https://datapalooza.co)") diff --git "a/streamlit/pages/03_\360\237\232\200_Datapalooza.py" "b/streamlit/pages/03_\360\237\232\200_Datapalooza.py" index 1cc305d..98542a5 100644 --- "a/streamlit/pages/03_\360\237\232\200_Datapalooza.py" +++ "b/streamlit/pages/03_\360\237\232\200_Datapalooza.py" @@ -1,24 +1,32 @@ import streamlit as st -st.set_page_config(page_title="πŸš€ Datapalooza ", page_icon = "https://chainbreaker.riskii.co/assets/img/logo/chain-white.png") +st.set_page_config( + page_title="πŸš€ Datapalooza ", + page_icon="https://chainbreaker.riskii.co/assets/img/logo/chain-white.png", +) st.header("πŸš€ Datapalooza: The driven force behind Chain Breaker") -st.markdown("Discover [πŸš€ Datapalooza](https://datapalooza.co), the force behind Chain Breaker's development. πŸš€ As a data-as-a-service leader, we extract, format, and deliver web data for valuable insights. πŸ’‘ Empowering businesses with data expertise, we unlock opportunities and drive success. Partner with Datapalooza to harness data's potential for transformative insights. 🀝πŸ’ͺ") +st.markdown( + "Discover [πŸš€ Datapalooza](https://datapalooza.co), the force behind Chain Breaker's development. πŸš€ As a data-as-a-service leader, we extract, format, and deliver web data for valuable insights. πŸ’‘ Empowering businesses with data expertise, we unlock opportunities and drive success. Partner with Datapalooza to harness data's potential for transformative insights. 🀝πŸ’ͺ" +) st.header("πŸ’‘ How business use web data to succeed") -st.markdown(""" +st.markdown( + """ - Price and Product Monitoring πŸ’°πŸ” - Know your customer (KYC) πŸ•΅οΈβ€β™€οΈπŸ‘€ - Job Monitoring πŸ§‘β€πŸ’»πŸ” - Real Estate Opportunities πŸ’πŸ” - Financial Data for Investment Decisions πŸ’ΌπŸ“ˆ - Identification of Cybersecurity Threads πŸ›‘οΈπŸ” -""") +""" +) st.header("πŸ₯Έ Our Services") -st.markdown(""" +st.markdown( + """ - Web Data Extraction πŸ“Š - Data Cleansing 🧹 - Data Integration and Consolidation 🧩 @@ -28,16 +36,25 @@ - Data Analytics and Insights πŸ”¬ - Data Visualization and Reporting πŸ“ˆ - Data Security and Compliance πŸ” -""") +""" +) st.header("🀝 Meet Datapalooza Team") col1, col2 = st.columns(2) -with col1: - st.markdown("### [:unicorn_face: Cristhian Pardo](https://www.linkedin.com/in/cristhian-pardo/)") - st.write("Computer scientist and mathematician, passionate about AI, number theory, and using data to tackle society's challenges with new technologies.") -with col2: - st.markdown("### [:frog: Juan Esteban Cepeda](https://www.linkedin.com/in/juan-e-cepeda-gestion/)") - st.write("Computer scientist and business admin with 5 years experience in ML, software engineering, and data analysis.") +with col1: + st.markdown( + "### [:unicorn_face: Cristhian Pardo](https://www.linkedin.com/in/cristhian-pardo/)" + ) + st.write( + "Computer scientist and mathematician, passionate about AI, number theory, and using data to tackle society's challenges with new technologies." + ) +with col2: + st.markdown( + "### [:frog: Juan Esteban Cepeda](https://www.linkedin.com/in/juan-e-cepeda-gestion/)" + ) + st.write( + "Computer scientist and business admin with 5 years experience in ML, software engineering, and data analysis." + ) st.header("πŸ”— Links") @@ -50,7 +67,11 @@ with col2: st.markdown("### [:incoming_envelope: Email](mailto:info@datapalooza.co)") # st.image('images/kaggle.png', width=125) - st.write("Do you have questions or a special inquery? Write us to **info@datapalooza.co**") + st.write( + "Do you have questions or a special inquery? Write us to **info@datapalooza.co**" + ) -st.markdown("### ➑️ Visit Chain Breaker πŸ”— Website: [here](https://chainbreaker.datapalooza.co/)") +st.markdown( + "### ➑️ Visit Chain Breaker πŸ”— Website: [here](https://chainbreaker.datapalooza.co/)" +) st.markdown("### ➑️ Visit our Website: [πŸš€ Datapalooza](https://datapalooza.co)") diff --git a/streamlit/utils/__init__.py b/streamlit/utils/__init__.py index 1be2dea..3bf5017 100644 --- a/streamlit/utils/__init__.py +++ b/streamlit/utils/__init__.py @@ -1 +1 @@ -from error_box import * \ No newline at end of file +from error_box import * diff --git a/streamlit/utils/error_box.py b/streamlit/utils/error_box.py index 0b8a78e..83f3d6c 100644 --- a/streamlit/utils/error_box.py +++ b/streamlit/utils/error_box.py @@ -1,5 +1,6 @@ import streamlit as st + def display_error_box(valid_url: bool = False): if not valid_url: - st.error('The URL is not valid', icon="🚨") \ No newline at end of file + st.error("The URL is not valid", icon="🚨")