diff --git a/.gitignore b/.gitignore index cfa5a82..64d42e1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,136 +1,5 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -pip-wheel-metadata/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -.python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ -*-env -*.DS_Store -*.csc - -# Steve: ignore some of my play-around files -output.* -test/ \ No newline at end of file +* +!.gitignore +!search_engine_robots.csv +!search_engines.csv +!*.py diff --git a/find_rss.py b/find_rss.py new file mode 100644 index 0000000..d6a6700 --- /dev/null +++ b/find_rss.py @@ -0,0 +1,67 @@ +import pandas as pd +import requests +import os +from urllib.parse import urlsplit, urlunsplit +import urllib.robotparser +import traceback +from tqdm import tqdm +from datetime import datetime + +''' +Look for the robots.txt on each site, and see if the robots.txt contains links to the sitemap. 
Use the robots.txt and the sitemaps to search for potential RSS feeds
+'''
+
+
+def parse_robot(robot_url):
+    rp = urllib.robotparser.RobotFileParser()
+    rp.set_url(robot_url)
+    rp.read()
+    return rp.site_maps()
+
+
+if __name__ == '__main__':
+    df = pd.read_csv('search_engines.csv')
+    # df = df[df['accessible'] == 1]
+    robot_access = []
+    time_accessed = []
+    sitemaps = []
+    for url in tqdm(list(df['href'])):
+        robot_appended = False
+        try:
+            # https://stackoverflow.com/questions/35616434/how-can-i-get-the-base-of-a-url-in-python
+            split_url = urlsplit(url)
+            print('%s' % (split_url.netloc))
+            # print(os.path.dirname(url))
+            url = split_url.scheme + r'://' + split_url.netloc
+            r = requests.get(url)
+            if r.status_code == 200:
+                robots = url + '/robots.txt'
+                robot_r = requests.get(robots)
+                print('\t\t[%s]: %s' % (robots, robot_r))
+                if robot_r.status_code == 200:
+                    sitemap = parse_robot(robots)
+                    robot_access.append(robots)
+                    sitemaps.append(sitemap)
+                else:
+                    robot_access.append('')
+                    sitemaps.append('')
+                robot_appended = True
+                # sitemap = url + '/sitemap.xml'
+                # print('\t\t[%s]: %s' % (sitemap, requests.get(sitemap)))
+
+        except:
+            print(traceback.format_exc())
+            print('\tFAIL')
+        finally:
+            # keep the row count the same across the files even on failure
+            if not robot_appended:
+                robot_access.append('')
+                sitemaps.append('')
+            time_accessed.append(datetime.now())
+            print('-' * 80)
+    df['found_robots'] = robot_access
+    df['found_sitemaps'] = sitemaps
+    df['time_accessed'] = time_accessed
+    df['ignore'] = 0  # for manual checking in the future...
+    df.to_csv('search_engine_robots.csv', index=False)
+    print(df)
diff --git a/get_search_engines.py b/get_search_engines.py
new file mode 100644
index 0000000..f9bdda8
--- /dev/null
+++ b/get_search_engines.py
@@ -0,0 +1,33 @@
+import pandas as pd
+from bs4 import BeautifulSoup
+import requests
+
+
+if __name__ == '__main__':
+    url = 'https://www.stanventures.com/blog/top-search-engines-list/'
+
+    r = requests.get(url)
+    soup = BeautifulSoup(r.text, 'html.parser')
+
+    print(soup)
+
+    # li = soup.find_all('span', {'class': 'ez-toc-section'})
+
+    print('-' * 80)
+
+    li = soup.find_all('span', {'class': 'ez-toc-section'})
+    search_engines = []
+
+    for e in li:
+        print(e['id'])
+        src = soup.find('span', {'id': e['id']}).find_parent('h3')
+        children = src.findChildren("a", recursive=False)
+        print('Printing children: ')
+        for child in children:
+            print(child)
+            print(child['href'])
+            search_engines.append(child['href'])
+
+    df = pd.DataFrame()
+    df['href'] = search_engines
+    df.to_csv('search_engines.csv', index=False)
diff --git a/merge-csv.py b/merge-csv.py
new file mode 100644
index 0000000..26ef4af
--- /dev/null
+++ b/merge-csv.py
@@ -0,0 +1,26 @@
+import os
+import pandas as pd
+from datetime import datetime
+
+dir = 'output'
+errors = 0
+success = 0
+l = []
+for file in os.listdir(dir):
+    filepath = os.path.join(dir, file)
+    try:
+        df = pd.read_csv(filepath)
+        l.append(df)
+    except pd.errors.EmptyDataError:
+        print('Empty data')
+        errors += 1
+    except UnicodeDecodeError:
+        print('Unicode error')
+        errors += 1
+
+print('Total errors: %s' % errors)
+print('Successes: %s' % len(l))
+
+df = pd.concat(l)
+# print(df['link'])
+df.to_csv('%s_urls.csv' % datetime.now().strftime('%Y-%m-%d'), index=False)
diff --git a/news-get.py b/news-get.py
new file mode 100644
index 0000000..09d7a79
--- /dev/null
+++ b/news-get.py
@@ -0,0 +1,259 @@
+import argparse
+import random
+import time
+import logging
+import glob
+import os
+import validators
+import queue
+
+from threading import Thread
+from threading import Lock
+from tqdm import tqdm
+from urllib.parse import urlparse
+
+import dask.dataframe as dd
+import pandas as pd
+
+import newspaper
+
+from sumy.parsers.html import HtmlParser
+from sumy.nlp.tokenizers import Tokenizer
+from sumy.summarizers.lsa import LsaSummarizer
+from sumy.nlp.stemmers import Stemmer
+from sumy.utils import get_stop_words
+
+from selenium.webdriver import Chrome
+from selenium.webdriver import ChromeOptions
+# from selenium.webdriver.common.keys import Keys
+# from selenium.webdriver.common.by import By
+from webdriver_manager.chrome import ChromeDriverManager
+
+
+class NewsGetter:
+
+    def __init__(self, output_dir, severity=1, **kwargs):
+        self.language = kwargs.get('language', 'english')
+        self.severity = severity
+        self.outdir = output_dir
+        self.tokenizer = Tokenizer(self.language)
+        self.connection_error = []  # list recording the urls that could not be scraped
+        self.recent_sites = []  # tracks recently accessed sites to avoid over-pinging any one site
+        if self.severity >= 2:
+            ops = ChromeOptions()
+            # ops.add_argument('headless')
+            ops.add_argument('--window-size=1920,1080')  # ensure no content is hidden in smaller windows
+            self.driver = Chrome(ChromeDriverManager().install(), options=ops)
+
+    def text_from_url(self, url, sleep=(0, 0.1)):
+        time.sleep(random.uniform(sleep[0], sleep[1]))
+        output_string = ''
+        try:
+            article = newspaper.Article(url, language='en')
+            article.download()
+            output_string = newspaper.fulltext(article.html)
+        except:
+            try:
+                sumy_parser = HtmlParser.from_url(url, self.tokenizer)
+                for paragraph in sumy_parser.document.paragraphs:
+                    for sentence in paragraph.sentences:
+                        output_string += sentence._text
+                    output_string += '\n'
+            except:
+                if self.severity < 2:  # if not falling back to selenium, record the failure
+                    self.connection_error.append('Cannot connect to ' + url)
+                    return
+        if len(output_string) < 400:
+            # 400 chars is an arbitrary threshold to identify "connected successfully but rejected by a robot test"
+            # if connection failed in the previous two attempts, output_string = ''
+            if self.severity < 2:
+                self.connection_error.append('Cannot scrape text from ' + url)
+                return  # if blocked by a robot test, do not return anything
+            else:
+                try:
+                    self.driver.get(url)
+                except:  # when selenium also fails to connect
+                    self.connection_error.append('Cannot connect to ' + url)
+                    return
+                html_source = self.driver.page_source
+                output_string = newspaper.fulltext(html_source)
+        return output_string
+
+    def sum_from_url(self, url, sleep=(0, 0.1), **kwargs):
+        time.sleep(random.uniform(sleep[0], sleep[1]))
+        try:
+            sumy_parser = HtmlParser.from_url(url, self.tokenizer)
+        except:
+            try:
+                article = newspaper.Article(url, language='en')
+                article.download()
+                sumy_parser = HtmlParser.from_string(article.html, article.source_url, self.tokenizer)
+            except:
+                if self.severity < 2:
+                    self.connection_error.append('Cannot connect to ' + url)
+                    return
+                else:
+                    try:
+                        self.driver.get(url)
+                    except:
+                        self.connection_error.append('Cannot connect to ' + url)
+                        return
+                    sumy_parser = HtmlParser.from_string(self.driver.page_source, url, self.tokenizer)
+        summarizer = LsaSummarizer(Stemmer(self.language))
+        summarizer.stop_words = get_stop_words(self.language)
+        try:
+            summary = summarizer(sumy_parser.document, kwargs.get('sentence_count', 10))
+            summarized = ''
+            for sentence in summary:
+                summarized += (sentence._text + " ")
+            return summarized
+        except:
+            # when the scraped text is empty, record an error
+            self.connection_error.append('Cannot scrape text from ' + url)
+            return
+
+    def texts_from_csv(self, csv_dir, summarize=False, thread_count=16):
+
+        class Worker(Thread):
+
+            def __init__(self, url_queue: queue.Queue, getter: NewsGetter, lock: Lock, taskbar):
+                Thread.__init__(self)
+                self.queue = url_queue
+                self.result = pd.Series(dtype=str)
+                self.news_getter = getter
+                self.task_bar = taskbar
+                self.lock = lock
+
+            def run(self):
+                while True:
+                    (index, url) = self.queue.get()
+                    if url == '':
+                        # print(threading.active_count())
+                        break
+                    site = urlparse(url).netloc
+                    site = '.'.join(site.split('.')[-2:])  # ignore the sub-domain
+
+                    # this chunk is to avoid requesting the same site too frequently
+                    if site in self.news_getter.recent_sites:
+                        time.sleep(3)
+                        # print('Sleep triggered for site ' + site)
+                    self.lock.acquire()  # the lock for the outer class's list of visited sites
+                    self.news_getter.recent_sites.append(site)
+                    # the "cushion" for visited urls is avg. 2 urls per thread
+                    if len(self.news_getter.recent_sites) >= 2 * thread_count:
+                        del self.news_getter.recent_sites[0]
+                    self.lock.release()
+
+                    if summarize:
+                        text = self.news_getter.sum_from_url(url)
+                    else:
+                        text = self.news_getter.text_from_url(url)
+                    self.result = pd.concat([self.result,
+                                             pd.Series([text], index=[index])])
+                    self.queue.task_done()
+                    self.task_bar.update(1)
+
+        urls = queue.Queue()
+
+        csv_files = glob.glob(csv_dir + "/*.csv")
+        if len(csv_files) == 0:
+            raise Exception("No .csv files found in directory: " + csv_dir)
+        for file in csv_files:
+            self.connection_error = []
+            self.recent_sites = []
+            df = pd.read_csv(file)
+            outdir = self.outdir + "\\" + file.split("\\")[-1]
+            url_column = df.loc[:, 'link']  # this column name is universal across output files of feedparser
+            print("Processing {0} links from {1}".format(len(url_column), file))
+            queue_size = 0
+            for i in url_column.index:
+                urls.put((i, url_column[i]))
+                queue_size += 1
+            workers = []
+            print("{} threads initiated".format(thread_count))
+            lock = Lock()
+            with tqdm(total=queue_size) as pbar:
+                for i in range(thread_count):
+                    urls.put(('', ''))  # this is the stopper indicating no more urls left
+                    worker = Worker(urls, self, lock, pbar)
+                    worker.start()
+                    workers.append(worker)
+
+                for worker in workers:
+                    worker.join()  # join workers to wait until all finish
+
+            for failed_message in self.connection_error:
+                print(failed_message)
+
+            texts = pd.Series(dtype=str)
+            for worker in workers:
+                texts = pd.concat([texts, worker.result])
+            df['text'] = texts
+            df.to_csv(outdir, mode='w+')
+            print('file saved to ' + outdir)
+
+
+def main(args):
+    getter = NewsGetter(severity=args.severity, output_dir=args.output_dir, language='english')
+    if args.dir_to_csv is not None:
+        getter.texts_from_csv(args.dir_to_csv, args.summarize, args.threads_count)
+    if args.urls is not None:
+        urls_output = pd.DataFrame(columns=['url', 'text'])
+        for url in args.urls:
+            if args.summarize:
+                retrieved = [url, getter.sum_from_url(url)]
+                urls_output = pd.concat([urls_output, pd.DataFrame([retrieved], columns=urls_output.columns)])
+            else:
+                retrieved = [url, getter.text_from_url(url)]
+                urls_output = pd.concat([urls_output, pd.DataFrame([retrieved], columns=urls_output.columns)])
+        urls_output.to_csv(args.output_dir + "\\" + 'url_output.csv')
+        print('file saved to ' + args.output_dir + '\\' + 'url_output.csv')
+    return
+
+
+def check_args(args):
+    if args.urls is None:
+        if args.dir_to_csv is None:
+            raise ValueError("must provide at least one valid url or a .csv file containing urls")
+    else:
+ for url in args.urls: + if not validators.url(url): + raise ValueError(url + 'is not a valid url') + + if args.dir_to_csv is not None: + if not os.path.isdir(args.dir_to_csv): + raise ValueError("dir_to_csv {} is not a valid directory".format(args.dir_to_csv)) + + if args.severity is not None: + if args.severity > 4 or args.severity < 1: + raise ValueError("input severity is not valid") + else: + args.severity = 1 + + if args.summarize is None: + args.summarize = False + + if not args.output_dir is None and not os.path.isdir(args.output_dir): + raise ValueError('Output directory (%s) is not a valid directory' % ( + os.path.abspath(args.outdir))) + + if args.threads_count is None: + args.threads_count = 8 + + return args + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description="Acquiring news articles from urls") + parser.add_argument("-u", "--urls", nargs="+", help="urls to desired news text", required=False) + parser.add_argument("-dir", "--dir_to_csv", help="directory to the csv files generated by rss_get.py", + required=False) + parser.add_argument('-sum', "--summarize", help='whether to summarize text', action=argparse.BooleanOptionalAction) + parser.add_argument("-s", "--severity", + help="level 1: get html directly; level 2: use selenium to bypass robot test; level 3: set " + "minimum wait time to bypass javascript; level 4 (under construction): use credentials", + nargs="?", const=1, type=int, required=False) + parser.add_argument('-o', '--output_dir', type=str, required=True) + parser.add_argument('-t', '--threads_count', nargs='?', const=8, type=int) + args = parser.parse_args() + args = check_args(args) + main(args) diff --git a/requirements.txt b/requirements.txt index 1a4c4cb..543566f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,16 +1,73 @@ +beautifulsoup4==4.11.1 +blis==0.7.7 +breadability==0.1.20 +catalogue==2.0.7 certifi==2022.5.18.1 +chardet==4.0.0 charset-normalizer==2.0.12 +click==8.1.3 +cloudpickle==2.1.0 +cssselect==1.1.0 +cymem==2.0.6 +dask==2022.5.2 +docopt==0.6.2 +et-xmlfile==1.1.0 +feedfinder2==0.0.4 feedparser==6.0.10 +filelock==3.7.1 +fsspec==2022.5.0 idna==3.3 +jieba3k==0.35.1 +Jinja2==3.1.2 +joblib==1.1.0 +langcodes==3.3.0 +locket==1.0.0 +lxml==4.9.0 +MarkupSafe==2.1.1 +murmurhash==1.0.7 +newspaper3k==0.2.8 +nltk==3.7 numpy==1.22.4 +openpyxl==3.0.10 +packaging==21.3 pandas==1.4.2 +partd==1.2.0 +pathy==0.6.1 +Pillow==9.1.1 +preshed==3.0.6 prettytable==3.3.0 +pycountry==22.3.5 +pydantic==1.8.2 +pyparsing==3.0.9 python-dateutil==2.8.2 pytz==2022.1 +PyYAML==6.0 +regex==2022.6.2 requests==2.27.1 +requests-file==1.5.1 +scikit-learn==1.1.1 +scipy==1.8.1 sgmllib3k==1.0.0 six==1.16.0 +sklearn==0.0 +smart-open==5.2.1 +soupsieve==2.3.2.post1 +spacy==3.3.0 +spacy-legacy==3.0.9 +spacy-loggers==1.0.2 +srsly==2.4.3 +sumy==0.10.0 +tabulate==0.8.9 +thinc==8.0.17 +threadpoolctl==3.1.0 +tinysegmenter==0.3 +tldextract==3.3.0 +toolz==0.11.2 tqdm==4.64.0 +typer==0.4.1 +typing_extensions==4.2.0 urllib3==1.26.9 +wasabi==0.9.1 wcwidth==0.2.5 +XlsxWriter==3.0.3 xmltodict==0.13.0 diff --git a/rss-get-test.sh b/rss-get-test.sh index e9d3736..85ece09 100644 --- a/rss-get-test.sh +++ b/rss-get-test.sh @@ -1,5 +1,5 @@ -python rss-get.py -k oranges apples -o test #export to a directory that does not exist -python rss-get.py -k oranges -sd 2500-03-02 #set the start date to after right now +python rss-get.py -k oranges apples -o test +python rss-get.py -k oranges -sd 2500-03-02 python rss-get.py -k apples -sd 2500-02-31 -python rss-get.py -k apples -ed 
2022-03-01 #set the end date but no start date +python rss-get.py -k apples -ed 2022-03-01 python rss-get.py -k bananas -sd 2022-05-01 -ed 2022-05-02 diff --git a/rss-get.py b/rss-get.py index d7f8558..c52b707 100644 --- a/rss-get.py +++ b/rss-get.py @@ -23,10 +23,11 @@ def __init__(self, source, format): self.time_url = format['time_url'] else: self.time_url = None - self.columns = format['columns'] + # self.columns = format['columns'] def search_keyword(self, keyword, date_start=None, date_end=None): - if date_start is not None and date_end is not None: + # if the initial formatting is None + if date_start is None: query = self.base_url.format(keyword) else: if self.time_url is not None: @@ -37,17 +38,17 @@ def search_keyword(self, keyword, date_start=None, date_end=None): logging.debug('query: %s' % query) if query is not None: - return requests.get(query, headers={'Accept': 'application/json'}) + return requests.get(query, headers={'Accept': 'application/json'}), query else: - return None + return None, query def response_to_df(self, response): logging.debug('response: %s' % response.text) d = feedparser.parse(response.text) # convert response to a dictionary # logging.debug(pprint(d)) df = pd.json_normalize(d['entries']) - logging.debug(df.columns) - df = df.loc[:, self.columns] + # logging.debug(df.columns) + # df = df.loc[:, self.columns] return df @@ -71,9 +72,11 @@ def main(args): args.start_date, settings.ALL_FORMATS[source]['dt_format']) if args.end_date is not None and 'dt_format' in settings.ALL_FORMATS[source]: end_date_str = datetime.strftime( - args.start_date, settings.ALL_FORMATS[source]['dt_format']) + args.end_date, settings.ALL_FORMATS[source]['dt_format']) - r = rss.search_keyword(keyword, start_date_str, end_date_str) + r, query = rss.search_keyword( + keyword, start_date_str, end_date_str) + assert query is not None logging.debug(r) # logging.debug(r.text) if r is not None: @@ -88,6 +91,7 @@ def main(args): filename = '%s_%s_%s.csv' % (datetime.now().strftime( '%Y-%m-%d'), source, keyword) filepath = os.path.join(args.outdir, filename) + df['query'] = query df.to_csv(filepath, index=False) saved.append(filepath) logging.info('-' * 80) @@ -118,6 +122,12 @@ def clean_args(args): def check_args(args): + + if not (args.keywords or args.input_file): + raise ValueError('Either keywords or input file needs to be specified') + if args.keywords and args.input_file: + raise ValueError('Cannot input both keywords and input files') + # if the end date is specified if not args.end_date is None: # and if the start date is not specified @@ -136,9 +146,11 @@ def check_args(args): if __name__ == '__main__': - parser = argparse.ArgumentParser(description='DSPG pull from Google RSS') + parser = argparse.ArgumentParser(description='DSPG pull from RSS sources') parser.add_argument('-k', '--keywords', nargs='+', - help='Keywords to parse', required=True) + help='Keywords to parse') + parser.add_argument('-i', '--input_file', type=str, + help='New line delimited keywords file') parser.add_argument('-s', '--sources', nargs='+', help='Sources (%s) to parse from. 
If None, parses all possible sources' % (list(settings.ALL_FORMATS.keys())), required=False) parser.add_argument('-o', '--outdir', type=str) @@ -160,4 +172,7 @@ def check_args(args): logging.basicConfig( format='%(levelname)s:%(message)s', level=logging.INFO) logging.debug(args) + if args.input_file: + args.keywords = ''.join( + open(args.input_file, 'r').readlines()).split('\n') main(args) diff --git a/search_engine_robots.csv b/search_engine_robots.csv new file mode 100644 index 0000000..55bf174 --- /dev/null +++ b/search_engine_robots.csv @@ -0,0 +1,55 @@ +href,accessible,found_robots,found_sitemaps,time_accessed,ignore +https://www.google.com/,1,https://www.google.com/robots.txt,['https://www.google.com/sitemap.xml'],2022-06-07 10:29:39.691276,0 +https://www.bing.com/,1,https://www.bing.com/robots.txt,"['http://cn.bing.com/dict/sitemap-index.xml', 'https://www.bing.com/api/maps/mapcontrol/isdk/sitemap.xml', 'https://www.bing.com/travelguide/sitemaps/sitemap.xml', 'https://www.bing.com/maps/sitemap.xml', 'https://www.bing.com/sitemap/shop/sitemap.xml']",2022-06-07 10:29:40.011759,0 +https://in.search.yahoo.com/,1,https://in.search.yahoo.com/robots.txt,,2022-06-07 10:29:40.488014,0 +https://www.baidu.com/,1,https://www.baidu.com/robots.txt,,2022-06-07 10:29:42.005834,0 +https://duckduckgo.com/,1,https://duckduckgo.com/robots.txt,['https://duckduckgo.com/sitemap.xml'],2022-06-07 10:29:42.162114,0 +https://yandex.com/,1,https://yandex.com/robots.txt,"['https://yandex.com/support/sitemap.xml', 'https://yandex.com/blog/sitemap.xml', 'https://yandex.com/turbo/public/sitemap.xml', 'https://yandex.com/games/sitemaps/sitemap.index.xml', 'https://yandex.com/support/sitemap.xml', 'https://yandex.com/blog/sitemap.xml', 'https://yandex.com/turbo/public/sitemap.xml', 'https://yandex.com/games/sitemaps/sitemap.index.xml', 'https://yandex.com/support/sitemap.xml', 'https://yandex.com/blog/sitemap.xml', 'https://yandex.com/turbo/public/sitemap.xml', 'https://yandex.com/games/sitemaps/sitemap.index.xml']",2022-06-07 10:29:44.021279,0 +https://www.ask.com/,1,https://www.ask.com/robots.txt,['https://www.ask.com/sitemap.xml'],2022-06-07 10:29:44.242983,0 +https://www.naver.com/,1,https://www.naver.com/robots.txt,,2022-06-07 10:29:45.208337,0 +https://www.aol.com/,1,https://www.aol.com/robots.txt,"['https://www.aol.com/sitemap_index.xml', 'https://www.aol.com/o2-seo-sitemap/o2-index-video-sitemap.xml.gz', 'https://www.aol.com/sitemaps/aolnews-sitemap_index_US_en-US.xml.gz', 'https://www.aol.com/sitemaps/aolnews-sitemap_googlenewsindex_US_en-US.xml.gz', 'https://www.aol.com/sitemaps/aolentertainment-sitemap_index_US_en-US.xml.gz', 'https://www.aol.com/sitemaps/aolentertainment-sitemap_googlenewsindex_US_en-US.xml.gz', 'https://www.aol.com/sitemaps/aolfinance-sitemap_index_US_en-US.xml.gz', 'https://www.aol.com/sitemaps/aolfinance-sitemap_googlenewsindex_US_en-US.xml.gz', 'https://www.aol.com/sitemaps/aollifestyle-sitemap_index_US_en-US.xml.gz', 'https://www.aol.com/sitemaps/aollifestyle-sitemap_googlenewsindex_US_en-US.xml.gz', 'https://www.aol.com/sitemaps/aolweather-sitemap_index_US_en-US.xml.gz', 'https://www.aol.com/sitemaps/aolweather-sitemap_googlenewsindex_US_en-US.xml.gz']",2022-06-07 10:29:45.849402,0 +https://www.seznam.cz/,1,,,2022-06-07 10:29:47.704335,0 +https://www.yippy.com/,1,,,2022-06-07 10:29:47.798866,0 +https://ccsearch.creativecommons.org/,1,https://ccsearch.creativecommons.org/robots.txt,,2022-06-07 10:29:49.044924,0 +https://gibiru.com/,1,https://gibiru.com/robots.txt,,2022-06-07 
10:29:49.196015,0 +https://amazon.com,1,,,2022-06-07 10:29:49.421303,0 +https://www.searchencrypt.com/,1,,,2022-06-07 10:29:49.724037,0 +https://www.startpage.com/,1,https://www.startpage.com/robots.txt,,2022-06-07 10:29:51.054612,0 +https://swisscows.com/,1,https://swisscows.com/robots.txt,['https://swisscows.com/sitemap.xml'],2022-06-07 10:29:52.624385,0 +https://www.ecosia.org/?c=en,1,https://www.ecosia.org/robots.txt,,2022-06-07 10:29:53.031030,0 +https://www.gigablast.com/,1,https://www.gigablast.com/robots.txt,,2022-06-07 10:29:54.169442,0 +https://www.lycos.com/,1,https://www.lycos.com/robots.txt,['http://www.lycos.com/sitemap.xml'],2022-06-07 10:29:54.661840,0 +https://www.mojeek.com/,1,https://www.mojeek.com/robots.txt,,2022-06-07 10:29:55.519019,0 +https://searx.me/,0,,,2022-06-07 10:29:55.628944,0 +https://www.webcrawler.com/,1,https://www.webcrawler.com/robots.txt,,2022-06-07 10:29:56.437709,0 +https://www.wolframalpha.com/,1,https://www.wolframalpha.com/robots.txt,,2022-06-07 10:29:57.754816,0 +https://neeva.co/,1,https://neeva.co/robots.txt,['https://neeva.com/sitemap.xml'],2022-06-07 10:29:58.404305,0 +https://metager.org/,1,https://metager.org/robots.txt,,2022-06-07 10:29:59.512509,0 +https://www.qwant.com/?l=en,1,https://www.qwant.com/robots.txt,,2022-06-07 10:30:00.924578,0 +https://qmamu.com/,1,https://qmamu.com/robots.txt,,2022-06-07 10:30:03.067257,0 +https://you.com/,1,,,2022-06-07 10:30:03.186806,0 +https://www.oscobo.com/,1,https://www.oscobo.com/robots.txt,,2022-06-07 10:30:04.354308,0 +https://infinitysearch.co/,1,https://infinitysearch.co/robots.txt,,2022-06-07 10:30:04.829192,0 +https://yep.com,1,https://yep.com/robots.txt,,2022-06-07 10:30:06.994141,0 +https://infinitysearch.co/,1,https://infinitysearch.co/robots.txt,,2022-06-07 10:30:07.402487,0 +https://www.facebook.com/,1,https://www.facebook.com/robots.txt,,2022-06-07 10:30:07.782680,0 +https://www.linkedin.com/,1,https://www.linkedin.com/robots.txt,,2022-06-07 10:30:08.452821,0 +https://twitter.com/explore,1,https://twitter.com/robots.txt,['https://twitter.com/sitemap.xml'],2022-06-07 10:30:08.833829,0 +https://search.brave.com/,1,https://search.brave.com/robots.txt,,2022-06-07 10:30:09.443799,0 +https://www.flickr.com/,1,https://www.flickr.com/robots.txt,"['https://www.flickr.com/sitemap/index/users/sitemap-index-users-00000000.xml.gz', 'https://www.flickr.com/sitemap/index/tags/sitemap-index-tags-00000000.xml.gz', 'https://www.flickr.com/sitemap/index/sets/sitemap-index-sets-00000000.xml.gz', 'https://www.flickr.com/sitemap/index/photos/sitemap-index-photos-00000000.xml.gz', 'https://www.flickr.com/sitemap/index/groups/sitemap-index-groups-00000000.xml.gz']",2022-06-07 10:30:09.833298,0 +https://in.pinterest.com/,1,https://in.pinterest.com/robots.txt,"['https://in.pinterest.com/v3_sitemaps/readable_pin_url_sitemap_in.pinterest.com.xml', 'https://in.pinterest.com/v3_sitemaps/pin_image_sitemap_in.pinterest.com.xml', 'https://in.pinterest.com/v3_sitemaps/promoted_idea_pin_sitemap_in.pinterest.com.xml']",2022-06-07 10:30:10.581134,0 +https://www.bing.com/images/trending?FORM=ILPTRD,1,https://www.bing.com/robots.txt,"['http://cn.bing.com/dict/sitemap-index.xml', 'https://www.bing.com/api/maps/mapcontrol/isdk/sitemap.xml', 'https://www.bing.com/travelguide/sitemaps/sitemap.xml', 'https://www.bing.com/maps/sitemap.xml', 'https://www.bing.com/sitemap/shop/sitemap.xml']",2022-06-07 10:30:10.873709,0 
+https://www.google.com/imghp?hl=en,1,https://www.google.com/robots.txt,['https://www.google.com/sitemap.xml'],2022-06-07 10:30:11.103016,0 +https://tineye.com/,1,https://tineye.com/robots.txt,,2022-06-07 10:30:11.426082,0 +https://commons.wikimedia.org/wiki/Main_Page,1,https://commons.wikimedia.org/robots.txt,,2022-06-07 10:30:11.605162,0 +https://youtube.com,1,https://youtube.com/robots.txt,"['https://www.youtube.com/sitemaps/sitemap.xml', 'https://www.youtube.com/product/sitemap.xml']",2022-06-07 10:30:12.233562,0 +https://www.dailymotion.com/in,1,https://www.dailymotion.com/robots.txt,"['https://www.dailymotion.com/map-videos-latest.xml', 'https://www.dailymotion.com/map-videos-default-strategic.xml', 'https://www.dailymotion.com/map-videos-default-partner.xml', 'https://www.dailymotion.com/map-videos-default-user.xml', 'https://www.dailymotion.com/map-topics-strategic.xml', 'https://www.dailymotion.com/map-channels-strategic.xml', 'https://www.dailymotion.com/map-pages-default.xml', 'https://www.dailymotion.com/map-videos-default-heuristic.xml', 'https://www.dailymotion.com/map-videos-latest-heuristic.xml']",2022-06-07 10:30:12.734535,0 +https://vimeo.com/,1,https://vimeo.com/robots.txt,"['https://vimeo.com/sitemap/master.xml.gz', 'https://vimeo.com/sitemap/latest.xml.gz', 'https://vimeo.com/sitemap/stock.xml.gz', 'https://vimeo.com/sitemap/static-sitemap.xml', 'https://vimeo.com/blog/sitemap_index.xml']",2022-06-07 10:30:13.129346,0 +https://lbry.com/,1,https://lbry.com/robots.txt,,2022-06-07 10:30:13.438039,0 +https://joinpeertube.org/,1,,,2022-06-07 10:30:14.449532,0 +https://d.tube/,0,,,2022-06-07 10:30:14.535807,0 +https://bittube.tv/,1,https://bittube.tv/robots.txt,,2022-06-07 10:30:17.299715,0 +https://www.bitchute.com/,1,https://www.bitchute.com/robots.txt,,2022-06-07 10:30:18.088497,0 +https://www.twitch.tv/,1,https://www.twitch.tv/robots.txt,['https://www.twitch.tv/sitemapv2_index.xml.gz'],2022-06-07 10:30:18.263732,0 +https://www.metacafe.com/,1,https://www.metacafe.com/robots.txt,,2022-06-07 10:30:20.316803,0 +https://archive.org/,1,https://archive.org/robots.txt,['https://archive.org/sitemap/sitemap.xml'],2022-06-07 10:30:21.284901,0 diff --git a/search_engines.csv b/search_engines.csv new file mode 100644 index 0000000..888cd2f --- /dev/null +++ b/search_engines.csv @@ -0,0 +1,55 @@ +href,accessible +https://www.google.com/,1 +https://www.bing.com/,1 +https://in.search.yahoo.com/,1 +https://www.baidu.com/,1 +https://duckduckgo.com/,1 +https://yandex.com/,1 +https://www.ask.com/,1 +https://www.naver.com/,1 +https://www.aol.com/,1 +https://www.seznam.cz/,1 +https://www.yippy.com/,1 +https://ccsearch.creativecommons.org/,1 +https://gibiru.com/,1 +https://amazon.com,1 +https://www.searchencrypt.com/,1 +https://www.startpage.com/,1 +https://swisscows.com/,1 +https://www.ecosia.org/?c=en,1 +https://www.gigablast.com/,1 +https://www.lycos.com/,1 +https://www.mojeek.com/,1 +https://searx.me/,0 +https://www.webcrawler.com/,1 +https://www.wolframalpha.com/,1 +https://neeva.co/,1 +https://metager.org/,1 +https://www.qwant.com/?l=en,1 +https://qmamu.com/,1 +https://you.com/,1 +https://www.oscobo.com/,1 +https://infinitysearch.co/,1 +https://yep.com,1 +https://infinitysearch.co/,1 +https://www.facebook.com/,1 +https://www.linkedin.com/,1 +https://twitter.com/explore,1 +https://search.brave.com/,1 +https://www.flickr.com/,1 +https://in.pinterest.com/,1 +https://www.bing.com/images/trending?FORM=ILPTRD,1 +https://www.google.com/imghp?hl=en,1 +https://tineye.com/,1 
+https://commons.wikimedia.org/wiki/Main_Page,1 +https://youtube.com,1 +https://www.dailymotion.com/in,1 +https://vimeo.com/,1 +https://lbry.com/,1 +https://joinpeertube.org/,1 +https://d.tube/,0 +https://bittube.tv/,1 +https://www.bitchute.com/,1 +https://www.twitch.tv/,1 +https://www.metacafe.com/,1 +https://archive.org/,1 \ No newline at end of file diff --git a/settings.py b/settings.py index 01e6207..15759b2 100644 --- a/settings.py +++ b/settings.py @@ -3,11 +3,11 @@ 'base_url': 'https://news.google.com/rss/search?q={0}', 'time_url': 'https://news.google.com/rss/search?q={0}+after:{1}+before:{2}', 'dt_format': '%Y-%m-%d', - 'columns': ['title', 'links', 'link', 'id', 'guidislink', 'published', - 'published_parsed', 'summary', 'title_detail.type', - 'title_detail.language', 'title_detail.base', 'title_detail.value', - 'summary_detail.type', 'summary_detail.language', 'summary_detail.base', - 'summary_detail.value', 'source.href', 'source.title'] + # 'columns': ['title', 'links', 'link', 'id', 'guidislink', 'published', + # 'published_parsed', 'summary', 'title_detail.type', + # 'title_detail.language', 'title_detail.base', 'title_detail.value', + # 'summary_detail.type', 'summary_detail.language', 'summary_detail.base', + # 'summary_detail.value', 'source.href', 'source.title'] }, 'bing': { 'base_url': 'https://www.bing.com/news/search?q={0}&format=rss',