From 942b9cb178ddd11733bcd0a59480fb5d0c8a2424 Mon Sep 17 00:00:00 2001
From: pgaref
Date: Thu, 25 May 2017 03:58:42 +0100
Subject: [PATCH] Started working on #23 - First version will use flask and
 sqlite3

---
 http_request_randomizer/web/README.md              |  8 ++
 http_request_randomizer/web/common.py              |  0
 http_request_randomizer/web/schedulers/__init__.py |  0
 http_request_randomizer/web/schedulers/health.py   | 80 ++++++++++++++++++++
 http_request_randomizer/web/schedulers/parsing.py  | 80 ++++++++++++++++++++
 http_request_randomizer/web/schema.sql             | 26 ++++++
 requirements.txt                                   |  5 +-
 setup.py                                           |  1 +
 8 files changed, 198 insertions(+), 2 deletions(-)
 create mode 100644 http_request_randomizer/web/README.md
 create mode 100644 http_request_randomizer/web/common.py
 create mode 100644 http_request_randomizer/web/schedulers/__init__.py
 create mode 100644 http_request_randomizer/web/schedulers/health.py
 create mode 100644 http_request_randomizer/web/schedulers/parsing.py
 create mode 100644 http_request_randomizer/web/schema.sql

diff --git a/http_request_randomizer/web/README.md b/http_request_randomizer/web/README.md
new file mode 100644
index 0000000..5f00f8c
--- /dev/null
+++ b/http_request_randomizer/web/README.md
@@ -0,0 +1,8 @@
+# HTTP Request Randomizer WebApp
+
+## Setup instructions
+
+    export FLASK_APP=minitwit.minitwit
+    export FLASK_DEBUG=1
+    flask initdb
+    flask run
diff --git a/http_request_randomizer/web/common.py b/http_request_randomizer/web/common.py
new file mode 100644
index 0000000..e69de29
diff --git a/http_request_randomizer/web/schedulers/__init__.py b/http_request_randomizer/web/schedulers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/http_request_randomizer/web/schedulers/health.py b/http_request_randomizer/web/schedulers/health.py
new file mode 100644
index 0000000..ce6f3da
--- /dev/null
+++ b/http_request_randomizer/web/schedulers/health.py
@@ -0,0 +1,80 @@
+"""
+Uses the APScheduler background scheduler to run the proxy-parsing job at a configurable interval.
+"""
+from http_request_randomizer.requests.parsers.FreeProxyParser import FreeProxyParser
+from http_request_randomizer.requests.parsers.ProxyForEuParser import ProxyForEuParser
+from http_request_randomizer.requests.parsers.RebroWeeblyParser import RebroWeeblyParser
+from http_request_randomizer.requests.parsers.SamairProxyParser import SamairProxyParser
+from apscheduler.schedulers.background import BackgroundScheduler
+from requests.exceptions import ReadTimeout
+from sqlite3 import dbapi2 as sqlite3
+try:
+    from urllib.parse import urlparse  # Python 3
+except ImportError:
+    from urlparse import urlparse  # Python 2
+import logging
+import time
+import os
+
+from http_request_randomizer.web.common import insert_proxy_db
+
+logger = logging.getLogger(__name__)
+logging.basicConfig()
+
+__author__ = 'pgaref'
+
+
+class ParsingScheduler:
+
+    def __init__(self, DATABASE, timeout=5):
+        parsers = []
+        parsers.append(FreeProxyParser('http://free-proxy-list.net', timeout=timeout))
+        parsers.append(ProxyForEuParser('http://proxyfor.eu/geo.php', 1.0, timeout=timeout))
+        parsers.append(RebroWeeblyParser('http://rebro.weebly.com', timeout=timeout))
+        parsers.append(SamairProxyParser('http://samair.ru/proxy/time-01.htm', timeout=timeout))
+        self.parsers = parsers
+        self.scheduler = BackgroundScheduler()
+        self.DATABASE = DATABASE
+
+    def tick(self):
+        print('Tick! The time is: %s' % time.time())
+        for parser in self.parsers:
+            curr_proxy_list = []
+            try:
+                curr_proxy_list = parser.parse_proxyList()
+            except ReadTimeout:
+                print("Proxy Parser: '{}' timed out!".format(parser.url))
+            finally:
+                # Separate db connection per parser
+                sqlite_db = sqlite3.connect(self.DATABASE)
+                sqlite_db.row_factory = sqlite3.Row
+                for current_proxy in curr_proxy_list:
+                    parsed_proxy = urlparse(current_proxy)
+                    insert_proxy_db(sqlite_db, proxy_ip=parsed_proxy.hostname, proxy_port=parsed_proxy.port,
+                                    provider=parser.url)
+                print("Inserted: {} proxies from: {}".format(len(curr_proxy_list), parser.url))
+
+    def add_background_task(self, interval=60):
+        self.scheduler.add_job(self.tick, 'interval', seconds=interval)
+
+    def start_background_task(self):
+        self.scheduler.start()
+
+    def shutdown_background_task(self):
+        self.scheduler.shutdown()
+
+
+if __name__ == '__main__':
+    ps = ParsingScheduler(DATABASE='/tmp/proxylist.db')
+    ps.add_background_task(10)
+    ps.start_background_task()
+
+    print('Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C'))
+
+    try:
+        # This is here to simulate application activity (which keeps the main thread alive).
+        while True:
+            time.sleep(2)
+    except (KeyboardInterrupt, SystemExit):
+        # Not strictly necessary if daemonic mode is enabled but should be done if possible
+        ps.shutdown_background_task()
diff --git a/http_request_randomizer/web/schedulers/parsing.py b/http_request_randomizer/web/schedulers/parsing.py
new file mode 100644
index 0000000..ce6f3da
--- /dev/null
+++ b/http_request_randomizer/web/schedulers/parsing.py
@@ -0,0 +1,80 @@
+"""
+Uses the APScheduler background scheduler to run the proxy-parsing job at a configurable interval.
+"""
+from http_request_randomizer.requests.parsers.FreeProxyParser import FreeProxyParser
+from http_request_randomizer.requests.parsers.ProxyForEuParser import ProxyForEuParser
+from http_request_randomizer.requests.parsers.RebroWeeblyParser import RebroWeeblyParser
+from http_request_randomizer.requests.parsers.SamairProxyParser import SamairProxyParser
+from apscheduler.schedulers.background import BackgroundScheduler
+from requests.exceptions import ReadTimeout
+from sqlite3 import dbapi2 as sqlite3
+try:
+    from urllib.parse import urlparse  # Python 3
+except ImportError:
+    from urlparse import urlparse  # Python 2
+import logging
+import time
+import os
+
+from http_request_randomizer.web.common import insert_proxy_db
+
+logger = logging.getLogger(__name__)
+logging.basicConfig()
+
+__author__ = 'pgaref'
+
+
+class ParsingScheduler:
+
+    def __init__(self, DATABASE, timeout=5):
+        parsers = []
+        parsers.append(FreeProxyParser('http://free-proxy-list.net', timeout=timeout))
+        parsers.append(ProxyForEuParser('http://proxyfor.eu/geo.php', 1.0, timeout=timeout))
+        parsers.append(RebroWeeblyParser('http://rebro.weebly.com', timeout=timeout))
+        parsers.append(SamairProxyParser('http://samair.ru/proxy/time-01.htm', timeout=timeout))
+        self.parsers = parsers
+        self.scheduler = BackgroundScheduler()
+        self.DATABASE = DATABASE
+
+    def tick(self):
+        print('Tick! The time is: %s' % time.time())
+        for parser in self.parsers:
+            curr_proxy_list = []
+            try:
+                curr_proxy_list = parser.parse_proxyList()
+            except ReadTimeout:
+                print("Proxy Parser: '{}' timed out!".format(parser.url))
+            finally:
+                # Separate db connection per parser
+                sqlite_db = sqlite3.connect(self.DATABASE)
+                sqlite_db.row_factory = sqlite3.Row
+                for current_proxy in curr_proxy_list:
+                    parsed_proxy = urlparse(current_proxy)
+                    insert_proxy_db(sqlite_db, proxy_ip=parsed_proxy.hostname, proxy_port=parsed_proxy.port,
+                                    provider=parser.url)
+                print("Inserted: {} proxies from: {}".format(len(curr_proxy_list), parser.url))
+
+    def add_background_task(self, interval=60):
+        self.scheduler.add_job(self.tick, 'interval', seconds=interval)
+
+    def start_background_task(self):
+        self.scheduler.start()
+
+    def shutdown_background_task(self):
+        self.scheduler.shutdown()
+
+
+if __name__ == '__main__':
+    ps = ParsingScheduler(DATABASE='/tmp/proxylist.db')
+    ps.add_background_task(10)
+    ps.start_background_task()
+
+    print('Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C'))
+
+    try:
+        # This is here to simulate application activity (which keeps the main thread alive).
+        while True:
+            time.sleep(2)
+    except (KeyboardInterrupt, SystemExit):
+        # Not strictly necessary if daemonic mode is enabled but should be done if possible
+        ps.shutdown_background_task()
diff --git a/http_request_randomizer/web/schema.sql b/http_request_randomizer/web/schema.sql
new file mode 100644
index 0000000..9284d31
--- /dev/null
+++ b/http_request_randomizer/web/schema.sql
@@ -0,0 +1,26 @@
+drop table if exists user;
+create table user (
+  user_id integer primary key autoincrement,
+  username text not null,
+  email text not null,
+  pw_hash text not null
+);
+
+drop table if exists message;
+create table message (
+  message_id integer primary key autoincrement,
+  author_id integer not null,
+  text text not null,
+  pub_date integer
+);
+
+drop table if exists proxy;
+create table proxy (
+  ip text not null,
+  port integer not null,
+  provider text not null,
+  add_date integer not null,
+  check_date integer,
+  anonymity_level integer,
+  primary key (ip, port)
+);
diff --git a/requirements.txt b/requirements.txt
index 920f979..a27e302 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,9 +2,10 @@ beautifulsoup4 == 4.5.3
 coverage == 4.3.4
 httmock == 1.2.6
 psutil == 5.1.3
-pytest == 3.0.7
+pytest == 3.1.0
 pytest-cov == 2.4.0
 python-dateutil == 2.6.0
 requests == 2.13.0
 schedule == 0.4.2
-flask == 0.12.1
\ No newline at end of file
+flask == 0.12.2
+APScheduler == 3.3.1
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 804580f..08739c3 100644
--- a/setup.py
+++ b/setup.py
@@ -67,6 +67,7 @@ def run_tests(self):
         'python-dateutil>=2.6.0',
         'requests>=2.13.0',
         'schedule>=0.4.2',
+        'flask>=0.12.2'
     ],
     setup_requires=['pytest-runner'],
     zip_safe=False,
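
Note: common.py is added empty here, yet both schedulers import insert_proxy_db from it, so the job cannot run as committed. Below is a minimal sketch of the missing helper, assuming the proxy table from schema.sql and a Unix-epoch integer for add_date; the function name and keyword parameters are taken from the scheduler call sites, everything else is an assumption:

    # http_request_randomizer/web/common.py -- hypothetical sketch, not part of the patch
    import time


    def insert_proxy_db(sqlite_db, proxy_ip, proxy_port, provider):
        """Insert or refresh one row in the proxy table defined in schema.sql."""
        sqlite_db.execute(
            'insert or replace into proxy (ip, port, provider, add_date) '
            'values (?, ?, ?, ?)',
            (proxy_ip, proxy_port, provider, int(time.time())))
        # Commit per insert so each parser's batch persists even if a later parser fails.
        sqlite_db.commit()

The insert-or-replace form leans on the (ip, port) primary key: a proxy reported again by the same provider refreshes its add_date instead of raising an integrity error.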
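
Note: the README's flask initdb step likewise has no backing code yet; no Flask app module is part of this patch, and the FLASK_APP=minitwit.minitwit value is carried over from the Flask minitwit example this is adapted from. A sketch of how that command could be wired up, assuming a hypothetical app module next to schema.sql; the module path and database location are placeholders:

    # http_request_randomizer/web/app.py -- hypothetical sketch, not part of the patch
    from sqlite3 import dbapi2 as sqlite3

    from flask import Flask

    app = Flask(__name__)
    DATABASE = '/tmp/proxylist.db'  # assumed; matches the schedulers' demo path


    @app.cli.command('initdb')
    def initdb_command():
        """Create the tables from schema.sql (invoked as `flask initdb`)."""
        db = sqlite3.connect(DATABASE)
        # open_resource resolves relative to the app package, where schema.sql lives
        with app.open_resource('schema.sql', mode='r') as f:
            db.cursor().executescript(f.read())
        db.commit()
        db.close()
        print('Initialized the database.')

With such a module in place, FLASK_APP would point at it rather than at the minitwit value.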