Merge pull request #2838 from internetarchive/feature/prevent-infobase-null-cover-errors

Use cover.jpg if title.jpg does not exist, and never send empty covers to infobase
mekarpeles authored Jan 13, 2020
2 parents 7413363 + 06df165 commit 836044c
Showing 12 changed files with 92 additions and 189 deletions.
1 change: 1 addition & 0 deletions openlibrary/catalog/add_book/__init__.py
@@ -542,6 +542,7 @@ def load_data(rec, account=None):
     cover_id = None
     if cover_url:
         cover_id = add_cover(cover_url, ekey, account=account)
+    if cover_id:
         edition['covers'] = [cover_id]
 
     edits = [] # Things (Edition, Work, Authors) to be saved
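Why the new guard matters: judging from the hunk above, add_cover() can return a falsy value (e.g. None) when the upload fails, and the old code attached the result whenever a cover_url was present, so a failed upload sent covers=[None] to infobase. A minimal sketch of the guarded flow (the stub and values below are hypothetical):

    def add_cover(cover_url, ekey, account=None):
        # Stand-in for the real helper, which uploads the image and returns
        # a numeric cover id, or None when the upload fails.
        return None  # simulate a failed upload

    edition = {}
    cover_id = add_cover('https://example.org/cover.jpg', '/books/OL1M')
    if cover_id:  # without this guard, edition['covers'] would become [None]
        edition['covers'] = [cover_id]
    assert 'covers' not in edition  # nothing empty reaches infobase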
160 changes: 49 additions & 111 deletions openlibrary/core/ia.py
@@ -1,35 +1,50 @@
-"""Library for interacting wih archive.org.
+"""Library for interacting with archive.org.
 """
 
+import logging
+import os
-import urllib2
 import datetime
-from xml.dom import minidom
-import simplejson
+import requests
 import web
-import logging
 
 from infogami import config
 from infogami.utils import stats
-import cache
 
+from openlibrary.core import cache
+from openlibrary.utils.dateutil import date_n_days_ago
+
+import six
 
 logger = logging.getLogger('openlibrary.ia')
 
+IA_BASE_URL = config.get('ia_base_url')
+VALID_READY_REPUB_STATES = ['4', '19', '20', '22']
 
-def get_item_json(itemid):
+
+def _get_metadata(itemid):
     """Returns metadata by querying the archive.org metadata API.
     """
     itemid = web.safestr(itemid.strip())
-    url = 'http://archive.org/metadata/%s' % itemid
+    url = '%s/metadata/%s' % (IA_BASE_URL, itemid)
     try:
         stats.begin('archive.org', url=url)
-        metadata_json = urllib2.urlopen(url).read()
+        metadata = requests.get(url)
         stats.end()
-        return simplejson.loads(metadata_json)
+        return metadata.json()
     except IOError:
         stats.end()
     return {}
 
+# cache the results in memcache for a minute
+_get_metadata = web.memoize(_get_metadata, expires=60)
+
+
+def get_metadata(itemid):
+    item_json = _get_metadata(itemid)
+    return extract_item_metadata(item_json)
+
+get_metadata = cache.memcache_memoize(get_metadata, key_prefix='ia.get_metadata', timeout=5*60)
+
+
 def extract_item_metadata(item_json):
     metadata = process_metadata_dict(item_json.get('metadata', {}))
     if metadata:
@@ -42,11 +57,6 @@ def extract_item_metadata(item_json):
         metadata['_filenames'] = [f['name'] for f in files]
     return metadata
 
-def get_metadata(itemid):
-    item_json = get_item_json(itemid)
-    return extract_item_metadata(item_json)
-
-get_metadata = cache.memcache_memoize(get_metadata, key_prefix='ia.get_metadata', timeout=5*60)
 
 def process_metadata_dict(metadata):
     """Process metadata dict to make sure multi-valued fields like
@@ -57,95 +67,23 @@ def process_metadata_dict(metadata):
     non-list cases. This function makes sure the known multi-valued fields are
     always lists.
     """
-    mutlivalued = set(['collection', 'external-identifier', 'isbn', 'subject', 'oclc-id'])
+    multivalued = set(['collection', 'external-identifier', 'isbn', 'subject', 'oclc-id'])
     def process_item(k, v):
-        if k in mutlivalued and not isinstance(v, list):
+        if k in multivalued and not isinstance(v, list):
             v = [v]
-        elif k not in mutlivalued and isinstance(v, list):
+        elif k not in multivalued and isinstance(v, list):
             v = v[0]
         return (k, v)
     return dict(process_item(k, v) for k, v in metadata.items() if v)
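For reference, a hypothetical before/after pair for process_metadata_dict, using field names from the multivalued set above (the values are invented):

    raw = {
        'title': ['Only the first value is kept'],  # single-valued field that arrived as a list
        'collection': 'lendinglibrary',             # multi-valued field that arrived as a scalar
        'isbn': ['1234567890'],                     # already a list; left alone
        'notes': '',                                # empty values are dropped
    }
    assert process_metadata_dict(raw) == {
        'title': 'Only the first value is kept',
        'collection': ['lendinglibrary'],
        'isbn': ['1234567890'],
    }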

-def _old_get_meta_xml(itemid):
-    """Returns the contents of meta_xml as JSON.
-    """
-    itemid = web.safestr(itemid.strip())
-    url = 'http://www.archive.org/download/%s/%s_meta.xml' % (itemid, itemid)
-    try:
-        stats.begin('archive.org', url=url)
-        metaxml = urllib2.urlopen(url).read()
-        stats.end()
-    except IOError:
-        logger.error("Failed to download _meta.xml for %s", itemid, exc_info=True)
-        stats.end()
-        return web.storage()
-
-    # archive.org returns html on internal errors.
-    # Checking for valid xml before trying to parse it.
-    if not metaxml.strip().startswith("<?xml"):
-        return web.storage()
-
-    try:
-        defaults = {"collection": [], "external-identifier": []}
-        return web.storage(xml2dict(metaxml, **defaults))
-    except Exception as e:
-        logger.error("Failed to parse metaxml for %s", itemid, exc_info=True)
-        return web.storage()
-
-def get_meta_xml(itemid):
-    # use metadata API instead of parsing meta xml manually
-    return get_metadata(itemid)
-
-def xml2dict(xml, **defaults):
-    """Converts xml to python dictionary assuming that the xml is not nested.
-    To get some tag as a list/set, pass a keyword argument with list/set as value.
-    >>> xml2dict('<doc><x>1</x><x>2</x></doc>')
-    {'x': 2}
-    >>> xml2dict('<doc><x>1</x><x>2</x></doc>', x=[])
-    {'x': [1, 2]}
-    """
-    d = defaults
-    dom = minidom.parseString(xml)
-
-    for node in dom.documentElement.childNodes:
-        if node.nodeType == node.TEXT_NODE or len(node.childNodes) == 0:
-            continue
-        else:
-            key = node.tagName
-            value = node.childNodes[0].data
-
-            if key in d and isinstance(d[key], list):
-                d[key].append(value)
-            elif key in d and isinstance(d[key], set):
-                d[key].add(value)
-            else:
-                d[key] = value
-
-    return d
-
-def _get_metadata(itemid):
-    """Returns metadata by querying the archive.org metadata API.
-    """
-    url = "http://www.archive.org/metadata/%s" % itemid
-    try:
-        stats.begin("archive.org", url=url)
-        text = urllib2.urlopen(url).read()
-        stats.end()
-        return simplejson.loads(text)
-    except (IOError, ValueError):
-        return None
-
-# cache the results in memcache for a minute
-_get_metadata = web.memoize(_get_metadata, expires=60)
-
 def locate_item(itemid):
     """Returns (hostname, path) for the item.
     """
     d = _get_metadata(itemid)
     return d.get('server'), d.get('dir')
 
 
 def edition_from_item_metadata(itemid, metadata):
     """Converts the item metadata into a form suitable to be used as edition
     in Open Library.
@@ -158,24 +96,37 @@ def edition_from_item_metadata(itemid, metadata):
     e.add_metadata(metadata)
     return e
 
+
+def get_cover_url(item_id):
+    """Gets the URL of the archive.org item's title (or cover) page.
+    """
+    base_url = '{0}/download/{1}/page/'.format(IA_BASE_URL, item_id)
+    title_response = requests.head(base_url + 'title.jpg', allow_redirects=True)
+    if title_response.status_code == 404:
+        return base_url + 'cover.jpg'
+    return base_url + 'title.jpg'
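The fallback above assumes archive.org answers a HEAD request for a missing title page with a 404; anything else falls through to title.jpg. A usage sketch (the item identifier is hypothetical, and the call issues a real HTTP request):

    url = get_cover_url('exampleitem00demo')
    # .../download/<item>/page/title.jpg when that page exists,
    # .../download/<item>/page/cover.jpg otherwise.
    assert url.endswith(('title.jpg', 'cover.jpg'))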


 def get_item_manifest(item_id, item_server, item_path):
     url = 'https://%s/BookReader/BookReaderJSON.php' % item_server
-    url += "?itemPath=%s&itemId=%s&server=%s" % (item_path, item_id, item_server)
+    url += '?itemPath=%s&itemId=%s&server=%s' % (item_path, item_id, item_server)
     try:
-        stats.begin("archive.org", url=url)
-        manifest_json = urllib2.urlopen(url).read()
+        stats.begin('archive.org', url=url)
+        manifest = requests.get(url)
         stats.end()
-        return simplejson.loads(manifest_json)
+        return manifest.json()
     except IOError:
         stats.end()
     return {}
 
+
 def get_item_status(itemid, metadata, **server):
     item_server = server.pop('item_server', None)
     item_path = server.pop('item_path', None)
     return ItemEdition.get_item_status(itemid, metadata, item_server=item_server,
                                        item_path=item_path)
 
+
 class ItemEdition(dict):
     """Class to convert item metadata into edition dict.
     """
@@ -249,7 +200,6 @@ def get_item_status(cls, itemid, metadata, item_server=None, item_path=None):
         # items with metadata no_ol_import=true will be not imported
         if metadata.get("no_ol_import", '').lower() == 'true':
             return "no-ol-import"
-
         return "ok"
 
     @classmethod
@@ -264,15 +214,12 @@ def is_valid_item(cls, itemid, metadata):
 
     def add_metadata(self, metadata):
         self.metadata = metadata
-
         self.add('title')
         self.add('description', 'description')
         self.add_list('publisher', 'publishers')
-        self.add_list("creator", "author_names")
+        self.add_list('creator', 'author_names')
         self.add('date', 'publish_date')
-
         self.add_isbns()
         self.add_subjects()

     def add(self, key, key2=None):
         metadata = self.metadata
@@ -298,7 +245,7 @@ def add_list(self, key, key2):
         metadata = self.metadata
 
         key2 = key2 or key
-        # sometimes the empty values are represneted as {} in metadata API. Avoid them.
+        # sometimes the empty values are represented as {} in metadata API. Avoid them.
         if key in metadata and metadata[key] != {}:
             value = metadata[key]
             if not isinstance(value, list):
@@ -321,15 +268,6 @@ def add_isbns(self):
         if isbn_13:
             self["isbn_13"] = isbn_13
 
-    def add_subjects(self):
-        collections = self.metadata.get("collection", [])
-        mapping = {
-            "inlibrary": "In library",
-            "lendinglibrary": "Lending library"
-        }
-        subjects = [subject for c, subject in mapping.items() if c in collections]
-        if subjects:
-            self['subjects'] = subjects
 
 _ia_db = None
 def get_ia_db(configfile=None):
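Note the two cache layers introduced above: the raw _get_metadata fetch is memoized in-process for a minute with web.memoize, while the processed get_metadata result is shared across processes for five minutes through memcache. A minimal sketch of the memcache pattern, reusing the call signature from the diff (the wrapped function here is a stand-in):

    from openlibrary.core import cache

    def get_metadata_uncached(itemid):
        return {'id': itemid}  # stand-in for the real metadata lookup

    # Results are keyed on the function arguments under key_prefix and expire
    # after `timeout` seconds, sparing repeated hits on the metadata API.
    get_metadata_cached = cache.memcache_memoize(
        get_metadata_uncached, key_prefix='ia.get_metadata', timeout=5 * 60)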
18 changes: 9 additions & 9 deletions openlibrary/mocks/mock_ia.py
@@ -10,21 +10,21 @@ def mock_ia(request, monkeypatch):
     from openlibrary.core import ia
     def test_ia(mock_ia):
-        assert ia.get_meta_xml("foo") == {}
+        assert ia.get_metadata("foo") == {}
-        mock_ia.set_meta_xml("foo", {"collection": ["a", "b"]})
-        assert ia.get_meta_xml("foo") == {"collection": ["a", "b"]}
+        mock_ia.set_metadata("foo", {"collection": ["a", "b"]})
+        assert ia.get_metadata("foo") == {"collection": ["a", "b"]}
     """
-    metaxml = {}
+    metadata = {}
 
     class IA:
-        def set_meta_xml(self, itemid, meta):
-            metaxml[itemid] = meta
+        def set_metadata(self, itemid, meta):
+            metadata[itemid] = meta
 
-        def get_meta_xml(self, itemid):
-            return metaxml.get(itemid, {})
+        def get_metadata(self, itemid):
+            return metadata.get(itemid, {})
 
     mock_ia = IA()
-    monkeypatch.setattr(ia, "get_meta_xml", mock_ia.get_meta_xml)
+    monkeypatch.setattr(ia, 'get_metadata', mock_ia.get_metadata)
 
     return mock_ia
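The fixture works because pytest's monkeypatch swaps the module-level ia.get_metadata for the stub's bound method and undoes the swap after each test. The same pattern in miniature (the stdlib target is chosen only for illustration):

    import math

    def test_monkeypatch_pattern(monkeypatch):
        monkeypatch.setattr(math, 'sqrt', lambda x: 42)
        assert math.sqrt(9) == 42  # restored automatically after the test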
2 changes: 1 addition & 1 deletion openlibrary/plugins/books/dynlinks.py
@@ -364,7 +364,7 @@ def process_result_for_viewapi(result):


 def get_ia_availability(itemid):
-    collections = ia.get_meta_xml(itemid).get("collection", [])
+    collections = ia.get_metadata(itemid).get('collection', [])
 
     if 'lendinglibrary' in collections or 'inlibrary' in collections:
         return 'borrow'
2 changes: 1 addition & 1 deletion openlibrary/plugins/books/readlinks.py
@@ -331,7 +331,7 @@ def process(self, req):
         self.wkey_to_iaids = dict((wkey, get_work_iaids(wkey)[:iaid_limit])
                                   for wkey in self.works)
         iaids = sum(self.wkey_to_iaids.values(), [])
-        self.iaid_to_meta = dict((iaid, ia.get_meta_xml(iaid)) for iaid in iaids)
+        self.iaid_to_meta = dict((iaid, ia.get_metadata(iaid)) for iaid in iaids)
 
         def lookup_iaids(iaids):
             step = 10
2 changes: 1 addition & 1 deletion openlibrary/plugins/books/tests/test_dynlinks.py
@@ -236,7 +236,7 @@ def monkeypatch_ol(monkeypatch):
     mock.default = []
     monkeypatch.setattr(dynlinks, "ol_get_many", mock)
 
-    monkeypatch.setattr(ia, "get_meta_xml", lambda itemid: web.storage())
+    monkeypatch.setattr(ia, "get_metadata", lambda itemid: web.storage())
 
 def test_query_keys(monkeypatch):
     monkeypatch_ol(monkeypatch)
