Merge pull request #2838 from internetarchive/feature/prevent-infobase-null-cover-errors

Use cover.jpg if title.jpg does not exist, and never send empty covers to infobase
mekarpeles authored Jan 13, 2020
2 parents 7413363 + 06df165 commit 836044c
Showing 12 changed files with 92 additions and 189 deletions.
1 change: 1 addition & 0 deletions openlibrary/catalog/add_book/__init__.py
@@ -542,6 +542,7 @@ def load_data(rec, account=None):
     cover_id = None
     if cover_url:
         cover_id = add_cover(cover_url, ekey, account=account)
+    if cover_id:
         edition['covers'] = [cover_id]
 
     edits = [] # Things (Edition, Work, Authors) to be saved
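Why the new guard matters: judging from the hunk above, add_cover() can return a falsy value (e.g. None) when the upload fails, and the old code attached the result whenever a cover_url was present, so a failed upload sent covers=[None] to infobase. A minimal sketch of the guarded flow (the stub and values below are hypothetical):

    def add_cover(cover_url, ekey, account=None):
        # Stand-in for the real helper, which uploads the image and returns
        # a numeric cover id, or None when the upload fails.
        return None  # simulate a failed upload

    edition = {}
    cover_id = add_cover('https://example.org/cover.jpg', '/books/OL1M')
    if cover_id:  # without this guard, edition['covers'] would become [None]
        edition['covers'] = [cover_id]
    assert 'covers' not in edition  # nothing empty reaches infobase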
160 changes: 49 additions & 111 deletions openlibrary/core/ia.py
@@ -1,35 +1,50 @@
-"""Library for interacting wih archive.org.
+"""Library for interacting with archive.org.
 """
 
+import logging
+import os
-import urllib2
 import datetime
-from xml.dom import minidom
-import simplejson
+import requests
 import web
-import logging
 
 from infogami import config
 from infogami.utils import stats
-import cache
 
+from openlibrary.core import cache
+from openlibrary.utils.dateutil import date_n_days_ago
+
+import six
 
 logger = logging.getLogger('openlibrary.ia')
 
+IA_BASE_URL = config.get('ia_base_url')
+VALID_READY_REPUB_STATES = ['4', '19', '20', '22']
 
-def get_item_json(itemid):
+
+def _get_metadata(itemid):
     """Returns metadata by querying the archive.org metadata API.
     """
     itemid = web.safestr(itemid.strip())
-    url = 'http://archive.org/metadata/%s' % itemid
+    url = '%s/metadata/%s' % (IA_BASE_URL, itemid)
     try:
         stats.begin('archive.org', url=url)
-        metadata_json = urllib2.urlopen(url).read()
+        metadata = requests.get(url)
         stats.end()
-        return simplejson.loads(metadata_json)
+        return metadata.json()
     except IOError:
         stats.end()
     return {}
 
+# cache the results in memcache for a minute
+_get_metadata = web.memoize(_get_metadata, expires=60)
+
+
+def get_metadata(itemid):
+    item_json = _get_metadata(itemid)
+    return extract_item_metadata(item_json)
+
+get_metadata = cache.memcache_memoize(get_metadata, key_prefix='ia.get_metadata', timeout=5*60)
+
+
 def extract_item_metadata(item_json):
     metadata = process_metadata_dict(item_json.get('metadata', {}))
     if metadata:
@@ -42,11 +57,6 @@ def extract_item_metadata(item_json):
         metadata['_filenames'] = [f['name'] for f in files]
     return metadata
 
-def get_metadata(itemid):
-    item_json = get_item_json(itemid)
-    return extract_item_metadata(item_json)
-
-get_metadata = cache.memcache_memoize(get_metadata, key_prefix='ia.get_metadata', timeout=5*60)
 
 def process_metadata_dict(metadata):
     """Process metadata dict to make sure multi-valued fields like
@@ -57,95 +67,23 @@ def process_metadata_dict(metadata):
     non-list cases. This function makes sure the known multi-valued fields are
     always lists.
     """
-    mutlivalued = set(['collection', 'external-identifier', 'isbn', 'subject', 'oclc-id'])
+    multivalued = set(['collection', 'external-identifier', 'isbn', 'subject', 'oclc-id'])
     def process_item(k, v):
-        if k in mutlivalued and not isinstance(v, list):
+        if k in multivalued and not isinstance(v, list):
             v = [v]
-        elif k not in mutlivalued and isinstance(v, list):
+        elif k not in multivalued and isinstance(v, list):
             v = v[0]
         return (k, v)
     return dict(process_item(k, v) for k, v in metadata.items() if v)
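For reference, a hypothetical before/after pair for process_metadata_dict, using field names from the multivalued set above (the values are invented):

    raw = {
        'title': ['Only the first value is kept'],  # single-valued field that arrived as a list
        'collection': 'lendinglibrary',             # multi-valued field that arrived as a scalar
        'isbn': ['1234567890'],                     # already a list; left alone
        'notes': '',                                # empty values are dropped
    }
    assert process_metadata_dict(raw) == {
        'title': 'Only the first value is kept',
        'collection': ['lendinglibrary'],
        'isbn': ['1234567890'],
    }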

-def _old_get_meta_xml(itemid):
-    """Returns the contents of meta_xml as JSON.
-    """
-    itemid = web.safestr(itemid.strip())
-    url = 'http://www.archive.org/download/%s/%s_meta.xml' % (itemid, itemid)
-    try:
-        stats.begin('archive.org', url=url)
-        metaxml = urllib2.urlopen(url).read()
-        stats.end()
-    except IOError:
-        logger.error("Failed to download _meta.xml for %s", itemid, exc_info=True)
-        stats.end()
-        return web.storage()
-
-    # archive.org returns html on internal errors.
-    # Checking for valid xml before trying to parse it.
-    if not metaxml.strip().startswith("<?xml"):
-        return web.storage()
-
-    try:
-        defaults = {"collection": [], "external-identifier": []}
-        return web.storage(xml2dict(metaxml, **defaults))
-    except Exception as e:
-        logger.error("Failed to parse metaxml for %s", itemid, exc_info=True)
-        return web.storage()
-
-def get_meta_xml(itemid):
-    # use metadata API instead of parsing meta xml manually
-    return get_metadata(itemid)
-
-def xml2dict(xml, **defaults):
-    """Converts xml to python dictionary assuming that the xml is not nested.
-    To get some tag as a list/set, pass a keyword argument with list/set as value.
-    >>> xml2dict('<doc><x>1</x><x>2</x></doc>')
-    {'x': 2}
-    >>> xml2dict('<doc><x>1</x><x>2</x></doc>', x=[])
-    {'x': [1, 2]}
-    """
-    d = defaults
-    dom = minidom.parseString(xml)
-
-    for node in dom.documentElement.childNodes:
-        if node.nodeType == node.TEXT_NODE or len(node.childNodes) == 0:
-            continue
-        else:
-            key = node.tagName
-            value = node.childNodes[0].data
-
-            if key in d and isinstance(d[key], list):
-                d[key].append(value)
-            elif key in d and isinstance(d[key], set):
-                d[key].add(value)
-            else:
-                d[key] = value
-
-    return d
-
-def _get_metadata(itemid):
-    """Returns metadata by querying the archive.org metadata API.
-    """
-    url = "http://www.archive.org/metadata/%s" % itemid
-    try:
-        stats.begin("archive.org", url=url)
-        text = urllib2.urlopen(url).read()
-        stats.end()
-        return simplejson.loads(text)
-    except (IOError, ValueError):
-        return None
-
-# cache the results in memcache for a minute
-_get_metadata = web.memoize(_get_metadata, expires=60)
-
 def locate_item(itemid):
     """Returns (hostname, path) for the item.
     """
     d = _get_metadata(itemid)
     return d.get('server'), d.get('dir')
 
 
 def edition_from_item_metadata(itemid, metadata):
     """Converts the item metadata into a form suitable to be used as edition
     in Open Library.
@@ -158,24 +96,37 @@ def edition_from_item_metadata(itemid, metadata):
     e.add_metadata(metadata)
     return e
 
+
+def get_cover_url(item_id):
+    """Gets the URL of the archive.org item's title (or cover) page.
+    """
+    base_url = '{0}/download/{1}/page/'.format(IA_BASE_URL, item_id)
+    title_response = requests.head(base_url + 'title.jpg', allow_redirects=True)
+    if title_response.status_code == 404:
+        return base_url + 'cover.jpg'
+    return base_url + 'title.jpg'
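The fallback above assumes archive.org answers a HEAD request for a missing title page with a 404; anything else falls through to title.jpg. A usage sketch (the item identifier is hypothetical, and the call issues a real HTTP request):

    url = get_cover_url('exampleitem00demo')
    # .../download/<item>/page/title.jpg when that page exists,
    # .../download/<item>/page/cover.jpg otherwise.
    assert url.endswith(('title.jpg', 'cover.jpg'))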


 def get_item_manifest(item_id, item_server, item_path):
     url = 'https://%s/BookReader/BookReaderJSON.php' % item_server
-    url += "?itemPath=%s&itemId=%s&server=%s" % (item_path, item_id, item_server)
+    url += '?itemPath=%s&itemId=%s&server=%s' % (item_path, item_id, item_server)
     try:
-        stats.begin("archive.org", url=url)
-        manifest_json = urllib2.urlopen(url).read()
+        stats.begin('archive.org', url=url)
+        manifest = requests.get(url)
         stats.end()
-        return simplejson.loads(manifest_json)
+        return manifest.json()
     except IOError:
         stats.end()
     return {}
 
+
 def get_item_status(itemid, metadata, **server):
     item_server = server.pop('item_server', None)
     item_path = server.pop('item_path', None)
     return ItemEdition.get_item_status(itemid, metadata, item_server=item_server,
                                        item_path=item_path)
 
+
 class ItemEdition(dict):
     """Class to convert item metadata into edition dict.
     """
@@ -249,7 +200,6 @@ def get_item_status(cls, itemid, metadata, item_server=None, item_path=None):
         # items with metadata no_ol_import=true will be not imported
         if metadata.get("no_ol_import", '').lower() == 'true':
             return "no-ol-import"
-
         return "ok"
 
     @classmethod
@@ -264,15 +214,12 @@ def is_valid_item(cls, itemid, metadata):
 
     def add_metadata(self, metadata):
         self.metadata = metadata
-
         self.add('title')
         self.add('description', 'description')
         self.add_list('publisher', 'publishers')
-        self.add_list("creator", "author_names")
+        self.add_list('creator', 'author_names')
         self.add('date', 'publish_date')
-
         self.add_isbns()
         self.add_subjects()

     def add(self, key, key2=None):
         metadata = self.metadata
@@ -298,7 +245,7 @@ def add_list(self, key, key2):
         metadata = self.metadata
 
         key2 = key2 or key
-        # sometimes the empty values are represneted as {} in metadata API. Avoid them.
+        # sometimes the empty values are represented as {} in metadata API. Avoid them.
         if key in metadata and metadata[key] != {}:
             value = metadata[key]
             if not isinstance(value, list):
@@ -321,15 +268,6 @@ def add_isbns(self):
         if isbn_13:
             self["isbn_13"] = isbn_13
 
-    def add_subjects(self):
-        collections = self.metadata.get("collection", [])
-        mapping = {
-            "inlibrary": "In library",
-            "lendinglibrary": "Lending library"
-        }
-        subjects = [subject for c, subject in mapping.items() if c in collections]
-        if subjects:
-            self['subjects'] = subjects
 
 _ia_db = None
 def get_ia_db(configfile=None):
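Note the two cache layers introduced above: the raw _get_metadata fetch is memoized in-process for a minute with web.memoize, while the processed get_metadata result is shared across processes for five minutes through memcache. A minimal sketch of the memcache pattern, reusing the call signature from the diff (the wrapped function here is a stand-in):

    from openlibrary.core import cache

    def get_metadata_uncached(itemid):
        return {'id': itemid}  # stand-in for the real metadata lookup

    # Results are keyed on the function arguments under key_prefix and expire
    # after `timeout` seconds, sparing repeated hits on the metadata API.
    get_metadata_cached = cache.memcache_memoize(
        get_metadata_uncached, key_prefix='ia.get_metadata', timeout=5 * 60)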
18 changes: 9 additions & 9 deletions openlibrary/mocks/mock_ia.py
@@ -10,21 +10,21 @@ def mock_ia(request, monkeypatch):
     from openlibrary.core import ia
     def test_ia(mock_ia):
-        assert ia.get_meta_xml("foo") == {}
+        assert ia.get_metadata("foo") == {}
-        mock_ia.set_meta_xml("foo", {"collection": ["a", "b"]})
-        assert ia.get_meta_xml("foo") == {"collection": ["a", "b"]}
+        mock_ia.set_metadata("foo", {"collection": ["a", "b"]})
+        assert ia.get_metadata("foo") == {"collection": ["a", "b"]}
     """
-    metaxml = {}
+    metadata = {}
 
     class IA:
-        def set_meta_xml(self, itemid, meta):
-            metaxml[itemid] = meta
+        def set_metadata(self, itemid, meta):
+            metadata[itemid] = meta
 
-        def get_meta_xml(self, itemid):
-            return metaxml.get(itemid, {})
+        def get_metadata(self, itemid):
+            return metadata.get(itemid, {})
 
     mock_ia = IA()
-    monkeypatch.setattr(ia, "get_meta_xml", mock_ia.get_meta_xml)
+    monkeypatch.setattr(ia, 'get_metadata', mock_ia.get_metadata)
 
     return mock_ia
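The fixture works because pytest's monkeypatch swaps the module-level ia.get_metadata for the stub's bound method and undoes the swap after each test. The same pattern in miniature (the stdlib target is chosen only for illustration):

    import math

    def test_monkeypatch_pattern(monkeypatch):
        monkeypatch.setattr(math, 'sqrt', lambda x: 42)
        assert math.sqrt(9) == 42  # restored automatically after the test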
2 changes: 1 addition & 1 deletion openlibrary/plugins/books/dynlinks.py
@@ -364,7 +364,7 @@ def process_result_for_viewapi(result):


 def get_ia_availability(itemid):
-    collections = ia.get_meta_xml(itemid).get("collection", [])
+    collections = ia.get_metadata(itemid).get('collection', [])
 
     if 'lendinglibrary' in collections or 'inlibrary' in collections:
         return 'borrow'
2 changes: 1 addition & 1 deletion openlibrary/plugins/books/readlinks.py
@@ -331,7 +331,7 @@ def process(self, req):
         self.wkey_to_iaids = dict((wkey, get_work_iaids(wkey)[:iaid_limit])
                                   for wkey in self.works)
         iaids = sum(self.wkey_to_iaids.values(), [])
-        self.iaid_to_meta = dict((iaid, ia.get_meta_xml(iaid)) for iaid in iaids)
+        self.iaid_to_meta = dict((iaid, ia.get_metadata(iaid)) for iaid in iaids)
 
         def lookup_iaids(iaids):
             step = 10
2 changes: 1 addition & 1 deletion openlibrary/plugins/books/tests/test_dynlinks.py
@@ -236,7 +236,7 @@ def monkeypatch_ol(monkeypatch):
     mock.default = []
     monkeypatch.setattr(dynlinks, "ol_get_many", mock)
 
-    monkeypatch.setattr(ia, "get_meta_xml", lambda itemid: web.storage())
+    monkeypatch.setattr(ia, "get_metadata", lambda itemid: web.storage())
 
 def test_query_keys(monkeypatch):
     monkeypatch_ol(monkeypatch)
