Commit
update to latest sync
dads2busy committed Mar 11, 2024
1 parent 4b4973b commit d7d2af7
Showing 11 changed files with 590 additions and 154 deletions.
141 changes: 5 additions & 136 deletions .gitignore
@@ -1,136 +1,5 @@
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-
-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-pip-wheel-metadata/
-share/python-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-
-# PyInstaller
-# Usually these files are written by a python script from a template
-# before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-*.py,cover
-.hypothesis/
-.pytest_cache/
-
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-target/
-
-# Jupyter Notebook
-.ipynb_checkpoints
-
-# IPython
-profile_default/
-ipython_config.py
-
-# pyenv
-.python-version
-
-# pipenv
-# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-# However, in case of collaboration, if having platform-specific dependencies or dependencies
-# having no cross-platform support, pipenv may install dependencies that don't work, or not
-# install all needed dependencies.
-#Pipfile.lock
-
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow
-__pypackages__/
-
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-
-# SageMath parsed files
-*.sage.py
-
-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-
-# Spyder project settings
-.spyderproject
-.spyproject
-
-# Rope project settings
-.ropeproject
-
-# mkdocs documentation
-/site
-
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-
-# Pyre type checker
-.pyre/
-*-env
-*.DS_Store
-*.csc
-
-# Steve: ignore some of my play-around files
-output.*
-test/
+*
+!.gitignore
+!search_engine_robots.csv
+!search_engines.csv
+!*.py
67 changes: 67 additions & 0 deletions find_rss.py
@@ -0,0 +1,67 @@
import pandas as pd
import requests
import os
from urllib.parse import urlsplit, urlunsplit
import urllib.robotparser
import traceback
from tqdm import tqdm
from datetime import datetime

'''
Look for the robots.txt on each site, and see if the robots.txt contains links to the sitemap. Use the robots and the sitemap to search for potential rss feeds
'''


def parse_robot(robot_url):
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(robot_url)
    rp.read()
    return rp.site_maps()


if __name__ == '__main__':
    df = pd.read_csv('search_engines.csv')
    # df = df[df['accessible'] == 1]
    robot_access = []
    time_accessed = []
    sitemaps = []
    for url in tqdm(list(df['href'])):
        robot_appended = False
        try:
            # https://stackoverflow.com/questions/35616434/how-can-i-get-the-base-of-a-url-in-python
            split_url = urlsplit(url)
            print('%s' % (split_url.netloc))
            # print(os.path.dirname(url))
            url = split_url.scheme + r'://' + split_url.netloc
            r = requests.get(url)
            if r.status_code == 200:
                robots = url + '/robots.txt'
                robot_r = requests.get(robots)
                print('\t\t[%s]: %s' % (robots, robot_r))
                if robot_r.status_code == 200:
                    sitemap = parse_robot(robots)
                    robot_access.append(robots)
                    sitemaps.append(sitemap)
                else:
                    robot_access.append('')
                    sitemaps.append('')
                robot_appended = True
                # sitemap = url + '/sitemap.xml'
                # print('\t\t[%s]: %s' % (sitemap, requests.get(sitemap)))

        except:
            print(traceback.format_exc())
            print('\tFAIL')
        finally:
            # keeping the row count the same across the files even during failure
            if not robot_appended:
                robot_access.append('')
                sitemaps.append('')
            time_accessed.append(datetime.now())
            print('-' * 80)
    df['found_robots'] = robot_access
    df['found_sitemaps'] = sitemaps
    df['time_accessed'] = time_accessed
    df['ignore'] = 0  # for manual checking in the future...
    df.to_csv('search_engine_robots.csv', index=False)
    print(df)
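
The docstring above stops at recording robots.txt and sitemap locations; a minimal sketch of the follow-on step it describes, scanning a recorded sitemap for feed-like URLs, could look like the snippet below. The helper name, timeout, and keyword heuristics are assumptions for illustration, not part of this commit.

# Sketch only: fetch a sitemap recorded in search_engine_robots.csv and keep
# <loc> entries whose URL looks like an RSS/Atom feed. FEED_HINTS is an
# assumed heuristic, not taken from the repository.
import requests
import xml.etree.ElementTree as ET

FEED_HINTS = ('rss', 'feed', 'atom')


def candidate_feeds(sitemap_url):
    r = requests.get(sitemap_url, timeout=10)
    r.raise_for_status()
    root = ET.fromstring(r.content)
    # Sitemap namespaces vary, so match on the local tag name only.
    locs = [el.text for el in root.iter() if el.tag.endswith('loc') and el.text]
    return [u for u in locs if any(h in u.lower() for h in FEED_HINTS)]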
33 changes: 33 additions & 0 deletions get_search_engines.py
@@ -0,0 +1,33 @@
import pandas as pd
from bs4 import BeautifulSoup
import requests


if __name__ == '__main__':
    url = 'https://www.stanventures.com/blog/top-search-engines-list/'

    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')

    print(soup)

    # li = soup.find_all('span', {'class': 'ez-toc-section'})

    print('-' * 80)

    li = soup.find_all('span', {'class': 'ez-toc-section'})
    search_engines = []

    for e in li:
        print(e['id'])
        src = soup.find('span', {'id': e['id']}).find_parent('h3')
        children = src.findChildren("a", recursive=False)
        print('Printing children: ')
        for child in children:
            print(child)
            print(child['href'])
            search_engines.append(child['href'])

    df = pd.DataFrame()
    df['href'] = search_engines
    df.to_csv('search_engines.csv', index=False)
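
For context, the selector chain above (ez-toc-section span, then its parent h3, then that h3's direct child links) only works if the blog's table-of-contents markup has roughly the shape shown below; the sample HTML is an illustrative assumption, not copied from stanventures.com.

# Illustration of the markup shape the scraping logic above assumes.
from bs4 import BeautifulSoup

sample = ('<h3><span class="ez-toc-section" id="1_Google"></span>'
          '1. <a href="https://www.google.com/">Google</a></h3>')
soup = BeautifulSoup(sample, 'html.parser')
span = soup.find('span', {'class': 'ez-toc-section'})
h3 = span.find_parent('h3')
print([a['href'] for a in h3.findChildren('a', recursive=False)])
# prints: ['https://www.google.com/']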
26 changes: 26 additions & 0 deletions merge-csv.py
@@ -0,0 +1,26 @@
import os
import pandas as pd
from datetime import datetime

dir = 'output'
errors = 0
success = 0
l = []
for file in os.listdir(dir):
    filepath = os.path.join(dir, file)
    try:
        df = pd.read_csv(filepath)
        l.append(df)
    except pd.errors.EmptyDataError:
        print('Empty data')
        errors += 1
    except UnicodeDecodeError:
        print('Unicode error')
        errors += 1

print('Total errors: %s' % errors)
print('Successes: %s' % len(l))

df = pd.concat(l)
# print(df['link'])
df.to_csv('%s_urls.csv' % datetime.now().strftime('%Y-%m-%d'), index=False)
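
If the same URL can land in more than one file under output/, a small optional post-processing step (an assumption, not part of this script) is to de-duplicate the merged result; the dated filename pattern below matches what merge-csv.py writes above.

# Hypothetical follow-up sketch: load the most recent merged file and drop
# exact-duplicate rows before downstream use.
import glob
import pandas as pd

latest = sorted(glob.glob('*_urls.csv'))[-1]
merged = pd.read_csv(latest)
deduped = merged.drop_duplicates().reset_index(drop=True)
print('%d rows -> %d after de-duplication' % (len(merged), len(deduped)))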