Commit
update to latest sync
dads2busy committed Mar 11, 2024
1 parent 4b4973b commit d7d2af7
Showing 11 changed files with 590 additions and 154 deletions.
141 changes: 5 additions & 136 deletions .gitignore
@@ -1,136 +1,5 @@
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-
-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-pip-wheel-metadata/
-share/python-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-
-# PyInstaller
-# Usually these files are written by a python script from a template
-# before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-*.py,cover
-.hypothesis/
-.pytest_cache/
-
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-target/
-
-# Jupyter Notebook
-.ipynb_checkpoints
-
-# IPython
-profile_default/
-ipython_config.py
-
-# pyenv
-.python-version
-
-# pipenv
-# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-# However, in case of collaboration, if having platform-specific dependencies or dependencies
-# having no cross-platform support, pipenv may install dependencies that don't work, or not
-# install all needed dependencies.
-#Pipfile.lock
-
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow
-__pypackages__/
-
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-
-# SageMath parsed files
-*.sage.py
-
-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-
-# Spyder project settings
-.spyderproject
-.spyproject
-
-# Rope project settings
-.ropeproject
-
-# mkdocs documentation
-/site
-
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-
-# Pyre type checker
-.pyre/
-*-env
-*.DS_Store
-*.csc
-
-# Steve: ignore some of my play-around files
-output.*
-test/
+*
+!.gitignore
+!search_engine_robots.csv
+!search_engines.csv
+!*.py
67 changes: 67 additions & 0 deletions find_rss.py
@@ -0,0 +1,67 @@
import pandas as pd
import requests
import os
from urllib.parse import urlsplit, urlunsplit
import urllib.robotparser
import traceback
from tqdm import tqdm
from datetime import datetime

'''
Look for the robots.txt on each site, and see if the robots.txt contains links to the sitemap. Use the robots and the sitemap to search for potential rss feeds
'''


def parse_robot(robot_url):
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(robot_url)
    rp.read()
    return rp.site_maps()


if __name__ == '__main__':
    df = pd.read_csv('search_engines.csv')
    # df = df[df['accessible'] == 1]
    robot_access = []
    time_accessed = []
    sitemaps = []
    for url in tqdm(list(df['href'])):
        robot_appended = False
        try:
            # https://stackoverflow.com/questions/35616434/how-can-i-get-the-base-of-a-url-in-python
            split_url = urlsplit(url)
            print('%s' % (split_url.netloc))
            # print(os.path.dirname(url))
            url = split_url.scheme + r'://' + split_url.netloc
            r = requests.get(url)
            if r.status_code == 200:
                robots = url + '/robots.txt'
                robot_r = requests.get(robots)
                print('\t\t[%s]: %s' % (robots, robot_r))
                if robot_r.status_code == 200:
                    sitemap = parse_robot(robots)
                    robot_access.append(robots)
                    sitemaps.append(sitemap)
                else:
                    robot_access.append('')
                    sitemaps.append('')
                robot_appended = True
                # sitemap = url + '/sitemap.xml'
                # print('\t\t[%s]: %s' % (sitemap, requests.get(sitemap)))

        except:
            print(traceback.format_exc())
            print('\tFAIL')
        finally:
            # keeping the row count the same across the files even during failure
            if not robot_appended:
                robot_access.append('')
                sitemaps.append('')
            time_accessed.append(datetime.now())
            print('-' * 80)
    df['found_robots'] = robot_access
    df['found_sitemaps'] = sitemaps
    df['time_accessed'] = time_accessed
    df['ignore'] = 0  # for manual checking in the future...
    df.to_csv('search_engine_robots.csv', index=False)
    print(df)
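
The docstring above stops at recording robots.txt and sitemap locations; a minimal sketch of the follow-on step it describes, scanning a recorded sitemap for feed-like URLs, could look like the snippet below. The helper name, timeout, and keyword heuristics are assumptions for illustration, not part of this commit.

# Sketch only: fetch a sitemap recorded in search_engine_robots.csv and keep
# <loc> entries whose URL looks like an RSS/Atom feed. FEED_HINTS is an
# assumed heuristic, not taken from the repository.
import requests
import xml.etree.ElementTree as ET

FEED_HINTS = ('rss', 'feed', 'atom')


def candidate_feeds(sitemap_url):
    r = requests.get(sitemap_url, timeout=10)
    r.raise_for_status()
    root = ET.fromstring(r.content)
    # Sitemap namespaces vary, so match on the local tag name only.
    locs = [el.text for el in root.iter() if el.tag.endswith('loc') and el.text]
    return [u for u in locs if any(h in u.lower() for h in FEED_HINTS)]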
33 changes: 33 additions & 0 deletions get_search_engines.py
@@ -0,0 +1,33 @@
import pandas as pd
from bs4 import BeautifulSoup
import requests


if __name__ == '__main__':
    url = 'https://www.stanventures.com/blog/top-search-engines-list/'

    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')

    print(soup)

    # li = soup.find_all('span', {'class': 'ez-toc-section'})

    print('-' * 80)

    li = soup.find_all('span', {'class': 'ez-toc-section'})
    search_engines = []

    for e in li:
        print(e['id'])
        src = soup.find('span', {'id': e['id']}).find_parent('h3')
        children = src.findChildren("a", recursive=False)
        print('Printing children: ')
        for child in children:
            print(child)
            print(child['href'])
            search_engines.append(child['href'])

    df = pd.DataFrame()
    df['href'] = search_engines
    df.to_csv('search_engines.csv', index=False)
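
For context, the selector chain above (ez-toc-section span, then its parent h3, then that h3's direct child links) only works if the blog's table-of-contents markup has roughly the shape shown below; the sample HTML is an illustrative assumption, not copied from stanventures.com.

# Illustration of the markup shape the scraping logic above assumes.
from bs4 import BeautifulSoup

sample = ('<h3><span class="ez-toc-section" id="1_Google"></span>'
          '1. <a href="https://www.google.com/">Google</a></h3>')
soup = BeautifulSoup(sample, 'html.parser')
span = soup.find('span', {'class': 'ez-toc-section'})
h3 = span.find_parent('h3')
print([a['href'] for a in h3.findChildren('a', recursive=False)])
# prints: ['https://www.google.com/']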
26 changes: 26 additions & 0 deletions merge-csv.py
@@ -0,0 +1,26 @@
import os
import pandas as pd
from datetime import datetime

dir = 'output'
errors = 0
success = 0
l = []
for file in os.listdir(dir):
    filepath = os.path.join(dir, file)
    try:
        df = pd.read_csv(filepath)
        l.append(df)
    except pd.errors.EmptyDataError:
        print('Empty data')
        errors += 1
    except UnicodeDecodeError:
        print('Unicode error')
        errors += 1

print('Total errors: %s' % errors)
print('Successes: %s' % len(l))

df = pd.concat(l)
# print(df['link'])
df.to_csv('%s_urls.csv' % datetime.now().strftime('%Y-%m-%d'), index=False)
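
If the same URL can land in more than one file under output/, a small optional post-processing step (an assumption, not part of this script) is to de-duplicate the merged result; the dated filename pattern below matches what merge-csv.py writes above.

# Hypothetical follow-up sketch: load the most recent merged file and drop
# exact-duplicate rows before downstream use.
import glob
import pandas as pd

latest = sorted(glob.glob('*_urls.csv'))[-1]
merged = pd.read_csv(latest)
deduped = merged.drop_duplicates().reset_index(drop=True)
print('%d rows -> %d after de-duplication' % (len(merged), len(deduped)))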