Skip to content

Commit

Permalink
Started working on #23 - First version will use flask and sqlite3
Browse files Browse the repository at this point in the history
  • Loading branch information
pgaref committed May 25, 2017
1 parent 55e6f05 commit 942b9cb
Show file tree
Hide file tree
Showing 8 changed files with 194 additions and 2 deletions.
8 changes: 8 additions & 0 deletions http_request_randomizer/web/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# HTTP Request Randomizer WebApp


## Setup instructions

```bash
export FLASK_APP=minitwit.minitwit
export FLASK_DEBUG=1
flask initdb
flask run
```

> NOTE: `minitwit.minitwit` appears to be carried over from the Flask
> "minitwit" tutorial app — confirm the actual module path for this project.
Empty file.
Empty file.
78 changes: 78 additions & 0 deletions http_request_randomizer/web/schedulers/health.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
"""
Demonstrates how to use the background scheduler to schedule a job that executes on 3 second
intervals.
"""
from http_request_randomizer.requests.parsers.FreeProxyParser import FreeProxyParser
from http_request_randomizer.requests.parsers.ProxyForEuParser import ProxyForEuParser
from http_request_randomizer.requests.parsers.RebroWeeblyParser import RebroWeeblyParser
from http_request_randomizer.requests.parsers.SamairProxyParser import SamairProxyParser
from apscheduler.schedulers.background import BackgroundScheduler
from requests.exceptions import ReadTimeout
from sqlite3 import dbapi2 as sqlite3
from urlparse import urlparse
import logging
import time
import os

from http_request_randomizer.web.common import insert_proxy_db

logger = logging.getLogger(__name__)
logging.basicConfig()

__author__ = 'pgaref'


class ParsingScheduler:
    """Periodically scrapes free-proxy provider sites and persists the
    discovered proxies into a SQLite database via APScheduler.

    NOTE(review): this file (schedulers/health.py) is byte-identical to
    schedulers/parsing.py — presumably health.py was meant to *check* proxy
    liveness rather than parse provider lists; confirm intent.
    """

    def __init__(self, DATABASE, timeout=5):
        """Build the provider parser list and a background scheduler.

        :param DATABASE: path to the SQLite database file proxies are written to.
        :param timeout: per-request timeout in seconds passed to each parser.
        """
        self.parsers = [
            FreeProxyParser('http://free-proxy-list.net', timeout=timeout),
            ProxyForEuParser('http://proxyfor.eu/geo.php', 1.0, timeout=timeout),
            RebroWeeblyParser('http://rebro.weebly.com', timeout=timeout),
            SamairProxyParser('http://samair.ru/proxy/time-01.htm', timeout=timeout),
        ]
        self.scheduler = BackgroundScheduler()
        self.DATABASE = DATABASE

    def tick(self):
        """Scrape every provider once and persist the results.

        A parser that times out contributes an empty list (an
        "Inserted: 0 proxies ..." line is still printed for it).
        """
        print('Tick! The time is: %s' % time.time())
        for parser in self.parsers:
            curr_proxy_list = []
            try:
                curr_proxy_list = parser.parse_proxyList()
            except ReadTimeout:
                print("Proxy Parser: '{}' TimedOut!".format(parser.url))
            # Separate db connection per parser; always closed afterwards
            # (the original version leaked the connection).
            sqlite_db = sqlite3.connect(self.DATABASE)
            sqlite_db.row_factory = sqlite3.Row
            try:
                for current_proxy in curr_proxy_list:
                    parsed_proxy = urlparse(current_proxy)
                    insert_proxy_db(sqlite_db, proxy_ip=parsed_proxy.hostname,
                                    proxy_port=parsed_proxy.port,
                                    provider=parser.url)
            finally:
                sqlite_db.close()
            print("Inserted: {} proxies from: {}".format(len(curr_proxy_list), parser.url))

    def add_background_task(self, interval=60):
        """Register tick() to run every *interval* seconds (default 60)."""
        self.scheduler.add_job(self.tick, 'interval', seconds=interval)

    def start_background_task(self):
        """Start the APScheduler background thread."""
        self.scheduler.start()

    def shutdown_background_task(self):
        """Stop the scheduler and its background thread."""
        self.scheduler.shutdown()


if __name__ == '__main__':
    # Demo entry point: scrape every provider every 10 seconds until interrupted.
    scheduler = ParsingScheduler(DATABASE='/tmp/proxylist.db')
    scheduler.add_background_task(10)
    scheduler.start_background_task()

    break_key = 'Break' if os.name == 'nt' else 'C'
    print('Press Ctrl+{0} to exit'.format(break_key))

    try:
        # Keep the main thread alive so the daemonic scheduler thread can run.
        while True:
            time.sleep(2)
    except (KeyboardInterrupt, SystemExit):
        # Not strictly necessary in daemonic mode, but shut down cleanly.
        scheduler.shutdown_background_task()
78 changes: 78 additions & 0 deletions http_request_randomizer/web/schedulers/parsing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
"""
Demonstrates how to use the background scheduler to schedule a job that executes on 3 second
intervals.
"""
from http_request_randomizer.requests.parsers.FreeProxyParser import FreeProxyParser
from http_request_randomizer.requests.parsers.ProxyForEuParser import ProxyForEuParser
from http_request_randomizer.requests.parsers.RebroWeeblyParser import RebroWeeblyParser
from http_request_randomizer.requests.parsers.SamairProxyParser import SamairProxyParser
from apscheduler.schedulers.background import BackgroundScheduler
from requests.exceptions import ReadTimeout
from sqlite3 import dbapi2 as sqlite3
from urlparse import urlparse
import logging
import time
import os

from http_request_randomizer.web.common import insert_proxy_db

logger = logging.getLogger(__name__)
logging.basicConfig()

__author__ = 'pgaref'


class ParsingScheduler:
    """Periodically scrapes free-proxy provider sites and persists the
    discovered proxies into a SQLite database via APScheduler.
    """

    def __init__(self, DATABASE, timeout=5):
        """Build the provider parser list and a background scheduler.

        :param DATABASE: path to the SQLite database file proxies are written to.
        :param timeout: per-request timeout in seconds passed to each parser.
        """
        self.parsers = [
            FreeProxyParser('http://free-proxy-list.net', timeout=timeout),
            ProxyForEuParser('http://proxyfor.eu/geo.php', 1.0, timeout=timeout),
            RebroWeeblyParser('http://rebro.weebly.com', timeout=timeout),
            SamairProxyParser('http://samair.ru/proxy/time-01.htm', timeout=timeout),
        ]
        self.scheduler = BackgroundScheduler()
        self.DATABASE = DATABASE

    def tick(self):
        """Scrape every provider once and persist the results.

        A parser that times out contributes an empty list (an
        "Inserted: 0 proxies ..." line is still printed for it).
        """
        print('Tick! The time is: %s' % time.time())
        for parser in self.parsers:
            curr_proxy_list = []
            try:
                curr_proxy_list = parser.parse_proxyList()
            except ReadTimeout:
                print("Proxy Parser: '{}' TimedOut!".format(parser.url))
            # Separate db connection per parser; always closed afterwards
            # (the original version leaked the connection).
            sqlite_db = sqlite3.connect(self.DATABASE)
            sqlite_db.row_factory = sqlite3.Row
            try:
                for current_proxy in curr_proxy_list:
                    parsed_proxy = urlparse(current_proxy)
                    insert_proxy_db(sqlite_db, proxy_ip=parsed_proxy.hostname,
                                    proxy_port=parsed_proxy.port,
                                    provider=parser.url)
            finally:
                sqlite_db.close()
            print("Inserted: {} proxies from: {}".format(len(curr_proxy_list), parser.url))

    def add_background_task(self, interval=60):
        """Register tick() to run every *interval* seconds (default 60)."""
        self.scheduler.add_job(self.tick, 'interval', seconds=interval)

    def start_background_task(self):
        """Start the APScheduler background thread."""
        self.scheduler.start()

    def shutdown_background_task(self):
        """Stop the scheduler and its background thread."""
        self.scheduler.shutdown()


if __name__ == '__main__':
    # Demo entry point: scrape every provider every 10 seconds until interrupted.
    scheduler = ParsingScheduler(DATABASE='/tmp/proxylist.db')
    scheduler.add_background_task(10)
    scheduler.start_background_task()

    break_key = 'Break' if os.name == 'nt' else 'C'
    print('Press Ctrl+{0} to exit'.format(break_key))

    try:
        # Keep the main thread alive so the daemonic scheduler thread can run.
        while True:
            time.sleep(2)
    except (KeyboardInterrupt, SystemExit):
        # Not strictly necessary in daemonic mode, but shut down cleanly.
        scheduler.shutdown_background_task()
26 changes: 26 additions & 0 deletions http_request_randomizer/web/schema.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
-- SQLite schema for the HTTP Request Randomizer web app.
-- NOTE(review): the user/message tables mirror the Flask "minitwit" tutorial
-- schema -- confirm they are actually needed by this application.

-- Registered web-app users; passwords are stored only as hashes.
drop table if exists user;
create table user (
user_id integer primary key autoincrement,
username text not null,
email text not null,
pw_hash text not null
);

-- User-posted messages; author_id references user.user_id (no FK enforced).
-- pub_date is nullable and presumably a unix timestamp -- TODO confirm.
drop table if exists message;
create table message (
message_id integer primary key autoincrement,
author_id integer not null,
text text not null,
pub_date integer
);

-- Scraped proxies, keyed by (ip, port) so re-inserts of the same proxy
-- from repeated scrapes conflict instead of duplicating rows.
drop table if exists proxy;
create table proxy(
ip text not null,
port integer not null,
provider text not null,
add_date integer not null,
check_date integer,
anonymity_level integer,
PRIMARY KEY (ip, port)
);
5 changes: 3 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@ beautifulsoup4 == 4.5.3
coverage == 4.3.4
httmock == 1.2.6
psutil == 5.1.3
pytest == 3.0.7
pytest == 3.1.0
pytest-cov == 2.4.0
python-dateutil == 2.6.0
requests == 2.13.0
schedule == 0.4.2
flask == 0.12.1
flask == 0.12.2
APScheduler == 3.3.1
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ def run_tests(self):
'python-dateutil>=2.6.0',
'requests>=2.13.0',
'schedule>=0.4.2',
'flask>=0.12.2'
],
setup_requires=['pytest-runner'],
zip_safe=False,
Expand Down

0 comments on commit 942b9cb

Please sign in to comment.