Skip to content

Commit

Permalink
fix bug clobbering own changes to parent_page
Browse files Browse the repository at this point in the history
and some other tweaks (python 3.5+, pytest logging config, ...)
  • Loading branch information
nlevitt committed Oct 17, 2019
1 parent ba85917 commit e23fa68
Show file tree
Hide file tree
Showing 7 changed files with 29 additions and 12 deletions.
3 changes: 2 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
dist: xenial
language: python
python:
- 3.4
- 3.5
- 3.6
- 3.7
Expand All @@ -24,6 +23,8 @@ script:
- DISPLAY=:1 py.test --tb=native -v tests
after_failure:
- chromium-browser --version
- sudo kill -QUIT $(sudo svstat /etc/service/warcprox | egrep -o 'pid [0-9]+' | awk '{print $2}')
- sudo kill -QUIT $(sudo svstat /etc/service/brozzler-worker | egrep -o 'pid [0-9]+' | awk '{print $2}')
- sudo cat /var/log/warcprox.log
- sudo cat /var/log/brozzler-worker.log
- sudo cat /var/log/pywb.log
Expand Down
2 changes: 1 addition & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ Brozzler is designed to work in conjunction with warcprox for web archiving.
Requirements
------------

- Python 3.4 or later
- Python 3.5 or later
- RethinkDB deployment
- Chromium or Google Chrome >= version 64

Expand Down
26 changes: 18 additions & 8 deletions brozzler/frontier.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,7 @@ def _merge_page(self, existing_page, fresh_page):
'''
existing_page.priority += fresh_page.priority
existing_page.hashtags = list(set(
existing_page.hashtags + fresh_page.hashtags))
(existing_page.hashtags or []) + (fresh_page.hashtags or [])))
existing_page.hops_off = min(
existing_page.hops_off, fresh_page.hops_off)

Expand Down Expand Up @@ -375,14 +375,18 @@ def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
decisions['accepted'].add(fresh_page.url)
if fresh_page.id in pages:
page = pages[fresh_page.id]
page.hashtags = list(set((page.hashtags or [])
+ fresh_page.hashtags))
page.priority += fresh_page.priority
self._merge_page(page, fresh_page)
counts['updated'] += 1
else:
pages[fresh_page.id] = fresh_page
counts['added'] += 1

# make sure we're not stepping on our own toes in case we have a link
# back to parent_page, which I think happens because of hashtags
if parent_page.id in pages:
self._merge_page(parent_page, pages[parent_page.id])
del pages[parent_page.id]

# insert/replace in batches of 50 to try to avoid this error:
# "rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:"
# there can be many pages and each one can be very large (many videos,
Expand All @@ -392,8 +396,11 @@ def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
try:
self.logger.debug(
'inserting/replacing batch of %s pages', len(batch))
result = self.rr.table('pages').insert(
batch, conflict='replace').run()
reql = self.rr.table('pages').insert(batch, conflict='replace')
self.logger.trace(
'running query self.rr.table("pages").insert(%r, '
'conflict="replace")', batch)
result = reql.run()
except Exception as e:
self.logger.error(
'problem inserting/replacing batch of %s pages',
Expand Down Expand Up @@ -450,12 +457,15 @@ def site_pages(self, site_id, brozzled=None):
Returns:
iterator of brozzler.Page
'''
results = self.rr.table("pages").between(
query = self.rr.table("pages").between(
[site_id, 1 if brozzled is True else 0,
r.minval, r.minval],
[site_id, 0 if brozzled is False else r.maxval,
r.maxval, r.maxval],
index="priority_by_site").run()
index="priority_by_site")
self.logger.trace("running query: %r", query)
results = query.run()
for result in results:
self.logger.trace("yielding result: %r", result)
yield brozzler.Page(self.rr, result)

6 changes: 6 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# https://docs.pytest.org/en/latest/logging.html
# https://github.com/pytest-dev/pytest/issues/5296
[pytest]
log_format = %(asctime)s.%(msecs)03d %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s
log_date_format = %Y-%m-%d %H:%M:%S

1 change: 0 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,6 @@ def find_package_data(package):
'Development Status :: 5 - Production/Stable',
'Environment :: Console',
'License :: OSI Approved :: Apache Software License',
'Programming Language :: Python :: 3.4',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
Expand Down
1 change: 1 addition & 0 deletions tests/test_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import subprocess
import http.server
import logging
import sys
import warcprox

# https://stackoverflow.com/questions/166506/finding-local-ip-addresses-using-pythons-stdlib
Expand Down
2 changes: 1 addition & 1 deletion tests/test_frontier.py
Original file line number Diff line number Diff line change
Expand Up @@ -733,7 +733,7 @@ def test_hashtag_seed():
assert pages[0].hashtags == ['#hash',]

def test_hashtag_links():
rr = doublethink.Rethinker('localhost', db='ignoreme')
rr = doublethink.Rethinker('localhost', db='test_hashtag_links')
frontier = brozzler.RethinkDbFrontier(rr)

site = brozzler.Site(rr, {'seed': 'http://example.org/'})
Expand Down

0 comments on commit e23fa68

Please sign in to comment.