From 909e98ecbf945172d192cf6704aaa4e4b632c2bc Mon Sep 17 00:00:00 2001 From: rajat Date: Thu, 3 Sep 2015 18:56:29 +0400 Subject: [PATCH] Persisting headers, method and cookies in SQLAlchemy backend. --- .../contrib/backends/sqlalchemy/__init__.py | 18 ++++++++++++++++-- frontera/utils/tester.py | 10 +++++++++- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/frontera/contrib/backends/sqlalchemy/__init__.py b/frontera/contrib/backends/sqlalchemy/__init__.py index 2bb825f95..ddbdb2e4d 100644 --- a/frontera/contrib/backends/sqlalchemy/__init__.py +++ b/frontera/contrib/backends/sqlalchemy/__init__.py @@ -10,6 +10,7 @@ from frontera import Backend from frontera.utils.misc import load_object +from frontera.core.models import Request, Response # Default settings DEFAULT_ENGINE = 'sqlite:///:memory:' @@ -62,6 +63,9 @@ class State: state = Column(String(12)) error = Column(String(20)) meta = Column(PickleType()) + headers = Column(PickleType()) + cookies = Column(PickleType()) + method = Column(String(6)) @classmethod def query(cls, session): @@ -135,7 +139,8 @@ def get_next_requests(self, max_next_requests, **kwargs): next_pages = [] for db_page in query: db_page.state = Page.State.QUEUED - request = self.manager.request_model(url=db_page.url, meta=db_page.meta) + request = self.manager.request_model(url=db_page.url, meta=db_page.meta, headers=db_page.headers, + cookies=db_page.cookies, method=db_page.method) next_pages.append(request) self.session.commit() return next_pages @@ -161,9 +166,18 @@ def _create_page(self, obj): db_page.fingerprint = obj.meta['fingerprint'] db_page.state = Page.State.NOT_CRAWLED db_page.url = obj.url - db_page.depth = 0 db_page.created_at = datetime.datetime.utcnow() db_page.meta = obj.meta + db_page.depth = 0 + + if isinstance(obj, Request): + db_page.headers = obj.headers + db_page.method = obj.method + db_page.cookies = obj.cookies + elif isinstance(obj, Response): + db_page.headers = obj.request.headers + db_page.method = obj.request.method + db_page.cookies = obj.request.cookies return db_page def _get_or_create_db_page(self, obj): diff --git a/frontera/utils/tester.py b/frontera/utils/tester.py index 3f5519c4e..c913772a7 100644 --- a/frontera/utils/tester.py +++ b/frontera/utils/tester.py @@ -39,7 +39,12 @@ def _add_all(self): self.frontier.add_seeds([self._make_request(link.url)]) def _make_request(self, url): - r = self.frontier.request_model(url=url) + r = self.frontier.request_model(url=url, + headers={ + 'X-Important-Header': 'Frontera' + }, + method='POST', + cookies={'currency': 'USD'}) r.meta['this_param'] = 'should be passed over' return r @@ -67,6 +72,9 @@ def _run_iteration(self): self.frontier.request_error(request=page_to_crawl, error=crawled_page.status) assert page_to_crawl.meta['this_param'] == 'should be passed over' + assert page_to_crawl.headers['X-Important-Header'] == 'Frontera' + assert page_to_crawl.method == 'POST' + assert page_to_crawl.cookies['currency'] == 'USD' return requests