From 9d50c87d5dcc193a2a39a1044863bb185337dc35 Mon Sep 17 00:00:00 2001 From: Charles Horn Date: Fri, 10 Jan 2020 11:38:08 +1300 Subject: [PATCH 01/16] don't send covers: [None] to infobase --- openlibrary/catalog/add_book/__init__.py | 1 + openlibrary/plugins/importapi/code.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/openlibrary/catalog/add_book/__init__.py b/openlibrary/catalog/add_book/__init__.py index 8e3c48e07da..f7bc0b7e2f1 100644 --- a/openlibrary/catalog/add_book/__init__.py +++ b/openlibrary/catalog/add_book/__init__.py @@ -542,6 +542,7 @@ def load_data(rec, account=None): cover_id = None if cover_url: cover_id = add_cover(cover_url, ekey, account=account) + if cover_id: edition['covers'] = [cover_id] edits = [] # Things (Edition, Work, Authors) to be saved diff --git a/openlibrary/plugins/importapi/code.py b/openlibrary/plugins/importapi/code.py index 4d2ba2b9800..9106bafccd8 100644 --- a/openlibrary/plugins/importapi/code.py +++ b/openlibrary/plugins/importapi/code.py @@ -313,8 +313,8 @@ def populate_edition_data(self, edition, identifier): :return: Edition record """ edition['ocaid'] = identifier - edition['source_records'] = "ia:" + identifier - edition['cover'] = "{0}/download/{1}/{1}/page/title.jpg".format(IA_BASE_URL, identifier) + edition['source_records'] = 'ia:' + identifier + edition['cover'] = '{0}/download/{1}/page/title.jpg'.format(IA_BASE_URL, identifier) return edition def get_marc_record(self, identifier): From 1d4e612e2e8433cc7f9964b30744b97059023e55 Mon Sep 17 00:00:00 2001 From: Charles Horn Date: Fri, 10 Jan 2020 14:43:03 +1300 Subject: [PATCH 02/16] remove fake subjects from ia code --- openlibrary/core/ia.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/openlibrary/core/ia.py b/openlibrary/core/ia.py index 40a12680bac..fe3b84d8bb9 100644 --- a/openlibrary/core/ia.py +++ b/openlibrary/core/ia.py @@ -1,4 +1,4 @@ -"""Library for interacting wih archive.org. +"""Library for interacting with archive.org. """ import os import urllib2 @@ -18,6 +18,7 @@ VALID_READY_REPUB_STATES = ['4', '19', '20', '22'] + def get_item_json(itemid): itemid = web.safestr(itemid.strip()) url = 'http://archive.org/metadata/%s' % itemid @@ -30,6 +31,7 @@ def get_item_json(itemid): stats.end() return {} + def extract_item_metadata(item_json): metadata = process_metadata_dict(item_json.get('metadata', {})) if metadata: @@ -42,12 +44,14 @@ def extract_item_metadata(item_json): metadata['_filenames'] = [f['name'] for f in files] return metadata + def get_metadata(itemid): item_json = get_item_json(itemid) return extract_item_metadata(item_json) get_metadata = cache.memcache_memoize(get_metadata, key_prefix='ia.get_metadata', timeout=5*60) + def process_metadata_dict(metadata): """Process metadata dict to make sure multi-valued fields like collection and external-identifier are always lists. @@ -66,6 +70,7 @@ def process_item(k, v): return (k, v) return dict(process_item(k, v) for k, v in metadata.items() if v) + def _old_get_meta_xml(itemid): """Returns the contents of meta_xml as JSON. """ @@ -92,10 +97,12 @@ def _old_get_meta_xml(itemid): logger.error("Failed to parse metaxml for %s", itemid, exc_info=True) return web.storage() + def get_meta_xml(itemid): # use metadata API instead of parsing meta xml manually return get_metadata(itemid) + def xml2dict(xml, **defaults): """Converts xml to python dictionary assuming that the xml is not nested. 
@@ -122,9 +129,9 @@ def xml2dict(xml, **defaults): d[key].add(value) else: d[key] = value - return d + def _get_metadata(itemid): """Returns metadata by querying the archive.org metadata API. """ @@ -140,12 +147,14 @@ def _get_metadata(itemid): # cache the results in memcache for a minute _get_metadata = web.memoize(_get_metadata, expires=60) + def locate_item(itemid): """Returns (hostname, path) for the item. """ d = _get_metadata(itemid) return d.get('server'), d.get('dir') + def edition_from_item_metadata(itemid, metadata): """Converts the item metadata into a form suitable to be used as edition in Open Library. @@ -158,6 +167,7 @@ def edition_from_item_metadata(itemid, metadata): e.add_metadata(metadata) return e + def get_item_manifest(item_id, item_server, item_path): url = 'https://%s/BookReader/BookReaderJSON.php' % item_server url += "?itemPath=%s&itemId=%s&server=%s" % (item_path, item_id, item_server) @@ -170,12 +180,14 @@ def get_item_manifest(item_id, item_server, item_path): stats.end() return {} + def get_item_status(itemid, metadata, **server): item_server = server.pop('item_server', None) item_path = server.pop('item_path', None) return ItemEdition.get_item_status(itemid, metadata, item_server=item_server, item_path=item_path) + class ItemEdition(dict): """Class to convert item metadata into edition dict. """ @@ -264,15 +276,12 @@ def is_valid_item(cls, itemid, metadata): def add_metadata(self, metadata): self.metadata = metadata - self.add('title') self.add('description', 'description') self.add_list('publisher', 'publishers') - self.add_list("creator", "author_names") + self.add_list('creator', 'author_names') self.add('date', 'publish_date') - self.add_isbns() - self.add_subjects() def add(self, key, key2=None): metadata = self.metadata @@ -298,7 +307,7 @@ def add_list(self, key, key2): metadata = self.metadata key2 = key2 or key - # sometimes the empty values are represneted as {} in metadata API. Avoid them. + # sometimes the empty values are represented as {} in metadata API. Avoid them. if key in metadata and metadata[key] != {}: value = metadata[key] if not isinstance(value, list): @@ -321,15 +330,6 @@ def add_isbns(self): if isbn_13: self["isbn_13"] = isbn_13 - def add_subjects(self): - collections = self.metadata.get("collection", []) - mapping = { - "inlibrary": "In library", - "lendinglibrary": "Lending library" - } - subjects = [subject for c, subject in mapping.items() if c in collections] - if subjects: - self['subjects'] = subjects _ia_db = None def get_ia_db(configfile=None): From eb5e215a0e9b06c0189006cbe61e1655fac6d4b5 Mon Sep 17 00:00:00 2001 From: Charles Horn Date: Fri, 10 Jan 2020 14:43:47 +1300 Subject: [PATCH 03/16] remove unused old method --- openlibrary/core/ia.py | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/openlibrary/core/ia.py b/openlibrary/core/ia.py index fe3b84d8bb9..30855561ce9 100644 --- a/openlibrary/core/ia.py +++ b/openlibrary/core/ia.py @@ -71,33 +71,6 @@ def process_item(k, v): return dict(process_item(k, v) for k, v in metadata.items() if v) -def _old_get_meta_xml(itemid): - """Returns the contents of meta_xml as JSON. 
- """ - itemid = web.safestr(itemid.strip()) - url = 'http://www.archive.org/download/%s/%s_meta.xml' % (itemid, itemid) - try: - stats.begin('archive.org', url=url) - metaxml = urllib2.urlopen(url).read() - stats.end() - except IOError: - logger.error("Failed to download _meta.xml for %s", itemid, exc_info=True) - stats.end() - return web.storage() - - # archive.org returns html on internal errors. - # Checking for valid xml before trying to parse it. - if not metaxml.strip().startswith(" Date: Fri, 10 Jan 2020 14:52:02 +1300 Subject: [PATCH 04/16] remove custom, and unused, XML parser --- openlibrary/core/ia.py | 30 ------------------------------ openlibrary/tests/core/test_ia.py | 10 ---------- 2 files changed, 40 deletions(-) diff --git a/openlibrary/core/ia.py b/openlibrary/core/ia.py index 30855561ce9..8240b86770c 100644 --- a/openlibrary/core/ia.py +++ b/openlibrary/core/ia.py @@ -3,7 +3,6 @@ import os import urllib2 import datetime -from xml.dom import minidom import simplejson import web import logging @@ -76,35 +75,6 @@ def get_meta_xml(itemid): return get_metadata(itemid) -def xml2dict(xml, **defaults): - """Converts xml to python dictionary assuming that the xml is not nested. - - To get some tag as a list/set, pass a keyword argument with list/set as value. - - >>> xml2dict('12') - {'x': 2} - >>> xml2dict('12', x=[]) - {'x': [1, 2]} - """ - d = defaults - dom = minidom.parseString(xml) - - for node in dom.documentElement.childNodes: - if node.nodeType == node.TEXT_NODE or len(node.childNodes) == 0: - continue - else: - key = node.tagName - value = node.childNodes[0].data - - if key in d and isinstance(d[key], list): - d[key].append(value) - elif key in d and isinstance(d[key], set): - d[key].add(value) - else: - d[key] = value - return d - - def _get_metadata(itemid): """Returns metadata by querying the archive.org metadata API. 
""" diff --git a/openlibrary/tests/core/test_ia.py b/openlibrary/tests/core/test_ia.py index 6a57a22ec26..41b0d9286d1 100644 --- a/openlibrary/tests/core/test_ia.py +++ b/openlibrary/tests/core/test_ia.py @@ -1,13 +1,5 @@ -from __future__ import print_function from openlibrary.core import ia -def test_xml2dict(): - assert ia.xml2dict("12") == {"x": "1", "y": "2"} - assert ia.xml2dict("12", x=[]) == {"x": ["1"], "y": "2"} - - assert ia.xml2dict("12") == {"x": "2"} - assert ia.xml2dict("12", x=[]) == {"x": ["1", "2"]} - def test_get_metaxml(monkeypatch, mock_memcache): import StringIO import urllib2 @@ -27,8 +19,6 @@ def urlopen(url): } } """ - - print(ia.get_meta_xml("foo00bar")) assert ia.get_meta_xml("foo00bar") == { "title": "Foo", "identifier": "foo00bar", From 0e2983c6bdd45282f7b7b22d0832d0d6ed4e4a4c Mon Sep 17 00:00:00 2001 From: Charles Horn Date: Fri, 10 Jan 2020 15:05:03 +1300 Subject: [PATCH 05/16] use ia URL from config, and move near duplicate methods close together --- openlibrary/core/ia.py | 59 +++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/openlibrary/core/ia.py b/openlibrary/core/ia.py index 8240b86770c..eb189a66c18 100644 --- a/openlibrary/core/ia.py +++ b/openlibrary/core/ia.py @@ -15,12 +15,13 @@ logger = logging.getLogger('openlibrary.ia') +IA_BASE_URL = config.get('ia_base_url') VALID_READY_REPUB_STATES = ['4', '19', '20', '22'] def get_item_json(itemid): itemid = web.safestr(itemid.strip()) - url = 'http://archive.org/metadata/%s' % itemid + url = '%s/metadata/%s' % (IA_BASE_URL, itemid) try: stats.begin('archive.org', url=url) metadata_json = urllib2.urlopen(url).read() @@ -31,6 +32,34 @@ def get_item_json(itemid): return {} +def get_metadata(itemid): + item_json = get_item_json(itemid) + return extract_item_metadata(item_json) + +get_metadata = cache.memcache_memoize(get_metadata, key_prefix='ia.get_metadata', timeout=5*60) + + +def _get_metadata(itemid): + """Returns metadata by querying the archive.org metadata API. + """ + url = '%s/metadata/%s' % (IA_BASE_URL, itemid) + try: + stats.begin("archive.org", url=url) + text = urllib2.urlopen(url).read() + stats.end() + return simplejson.loads(text) + except (IOError, ValueError): + return None + +# cache the results in memcache for a minute +_get_metadata = web.memoize(_get_metadata, expires=60) + + +def get_meta_xml(itemid): + # use metadata API instead of parsing meta xml manually + return get_metadata(itemid) + + def extract_item_metadata(item_json): metadata = process_metadata_dict(item_json.get('metadata', {})) if metadata: @@ -44,13 +73,6 @@ def extract_item_metadata(item_json): return metadata -def get_metadata(itemid): - item_json = get_item_json(itemid) - return extract_item_metadata(item_json) - -get_metadata = cache.memcache_memoize(get_metadata, key_prefix='ia.get_metadata', timeout=5*60) - - def process_metadata_dict(metadata): """Process metadata dict to make sure multi-valued fields like collection and external-identifier are always lists. @@ -70,27 +92,6 @@ def process_item(k, v): return dict(process_item(k, v) for k, v in metadata.items() if v) -def get_meta_xml(itemid): - # use metadata API instead of parsing meta xml manually - return get_metadata(itemid) - - -def _get_metadata(itemid): - """Returns metadata by querying the archive.org metadata API. 
- """ - url = "http://www.archive.org/metadata/%s" % itemid - try: - stats.begin("archive.org", url=url) - text = urllib2.urlopen(url).read() - stats.end() - return simplejson.loads(text) - except (IOError, ValueError): - return None - -# cache the results in memcache for a minute -_get_metadata = web.memoize(_get_metadata, expires=60) - - def locate_item(itemid): """Returns (hostname, path) for the item. """ From 0f2dedfa3ef06ebd24c87003acd7cfa0ff40b81f Mon Sep 17 00:00:00 2001 From: Charles Horn Date: Fri, 10 Jan 2020 15:32:06 +1300 Subject: [PATCH 06/16] deprecation warning to one method for getting metadata --- openlibrary/core/ia.py | 12 ++++++++---- openlibrary/tests/core/test_ia.py | 24 +++++++++--------------- 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/openlibrary/core/ia.py b/openlibrary/core/ia.py index eb189a66c18..6de3a7b576c 100644 --- a/openlibrary/core/ia.py +++ b/openlibrary/core/ia.py @@ -1,14 +1,17 @@ """Library for interacting with archive.org. """ + +import cache +import logging import os -import urllib2 -import datetime import simplejson +import urllib2 +import warnings import web -import logging + from infogami import config from infogami.utils import stats -import cache + from openlibrary.utils.dateutil import date_n_days_ago import six @@ -57,6 +60,7 @@ def _get_metadata(itemid): def get_meta_xml(itemid): # use metadata API instead of parsing meta xml manually + warnings.warn('Deprecated, use ia.get_metadata(itemid) instead.', DeprecationWarning) return get_metadata(itemid) diff --git a/openlibrary/tests/core/test_ia.py b/openlibrary/tests/core/test_ia.py index 41b0d9286d1..d58cf2e55f4 100644 --- a/openlibrary/tests/core/test_ia.py +++ b/openlibrary/tests/core/test_ia.py @@ -1,16 +1,9 @@ -from openlibrary.core import ia - -def test_get_metaxml(monkeypatch, mock_memcache): - import StringIO - import urllib2 +from StringIO import StringIO +import urllib2 - metadata_json = None - def urlopen(url): - return StringIO.StringIO(metadata_json) - - monkeypatch.setattr(urllib2, "urlopen", urlopen) +from openlibrary.core import ia - # test with correct xml +def test_get_metadata(monkeypatch, mock_memcache): metadata_json = """{ "metadata": { "title": "Foo", @@ -19,7 +12,8 @@ def urlopen(url): } } """ - assert ia.get_meta_xml("foo00bar") == { + monkeypatch.setattr(urllib2, "urlopen", lambda url: StringIO(metadata_json)) + assert ia.get_metadata('foo00bar') == { "title": "Foo", "identifier": "foo00bar", "collection": ["printdisabled", "inlibrary"], @@ -27,6 +21,6 @@ def urlopen(url): "_filenames": [] } - # test with metadata errors - metadata_json = "{}" - assert ia.get_meta_xml("foo02bar") == {} +def test_get_metadata_empty(monkeypatch, mock_memcache): + monkeypatch.setattr(urllib2, "urlopen", lambda url: StringIO("{}")) + assert ia.get_metadata('foo02bar') == {} From 3f5dbd459ed2165a7252b06f234d776d1c6743cd Mon Sep 17 00:00:00 2001 From: Charles Horn Date: Fri, 10 Jan 2020 15:54:55 +1300 Subject: [PATCH 07/16] replace deprecated metadata call --- openlibrary/plugins/books/dynlinks.py | 2 +- openlibrary/plugins/books/tests/test_dynlinks.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/openlibrary/plugins/books/dynlinks.py b/openlibrary/plugins/books/dynlinks.py index 6d31c9e22be..cc4e1d6a244 100644 --- a/openlibrary/plugins/books/dynlinks.py +++ b/openlibrary/plugins/books/dynlinks.py @@ -364,7 +364,7 @@ def process_result_for_viewapi(result): def get_ia_availability(itemid): - collections = ia.get_meta_xml(itemid).get("collection", 
[]) + collections = ia.get_metadata(itemid).get('collection', []) if 'lendinglibrary' in collections or 'inlibrary' in collections: return 'borrow' diff --git a/openlibrary/plugins/books/tests/test_dynlinks.py b/openlibrary/plugins/books/tests/test_dynlinks.py index fe7f2795962..42e476e6d47 100644 --- a/openlibrary/plugins/books/tests/test_dynlinks.py +++ b/openlibrary/plugins/books/tests/test_dynlinks.py @@ -236,7 +236,7 @@ def monkeypatch_ol(monkeypatch): mock.default = [] monkeypatch.setattr(dynlinks, "ol_get_many", mock) - monkeypatch.setattr(ia, "get_meta_xml", lambda itemid: web.storage()) + monkeypatch.setattr(ia, "get_metadata", lambda itemid: web.storage()) def test_query_keys(monkeypatch): monkeypatch_ol(monkeypatch) From fdea6497803fb42446e89d9e2b8e8214e97c678e Mon Sep 17 00:00:00 2001 From: Charles Horn Date: Fri, 10 Jan 2020 16:11:07 +1300 Subject: [PATCH 08/16] replace deprecated metadata method --- openlibrary/mocks/mock_ia.py | 18 +++++++++--------- openlibrary/plugins/books/readlinks.py | 2 +- openlibrary/plugins/openlibrary/home.py | 2 +- openlibrary/plugins/upstream/models.py | 6 +++--- openlibrary/solr/process_stats.py | 2 +- openlibrary/tests/core/test_ia.py | 4 ++-- scripts/2011/09/generate_deworks.py | 2 +- 7 files changed, 18 insertions(+), 18 deletions(-) diff --git a/openlibrary/mocks/mock_ia.py b/openlibrary/mocks/mock_ia.py index c596663030b..032f6322f88 100644 --- a/openlibrary/mocks/mock_ia.py +++ b/openlibrary/mocks/mock_ia.py @@ -10,21 +10,21 @@ def mock_ia(request, monkeypatch): from openlibrary.core import ia def test_ia(mock_ia): - assert ia.get_meta_xml("foo") == {} + assert ia.get_metadata("foo") == {} - mock_ia.set_meta_xml("foo", {"collection": ["a", "b"]}) - assert ia.get_meta_xml("foo") == {"collection": ["a", "b"]} + mock_ia.set_metadata("foo", {"collection": ["a", "b"]}) + assert ia.get_metadata("foo") == {"collection": ["a", "b"]} """ - metaxml = {} + metadata = {} class IA: - def set_meta_xml(self, itemid, meta): - metaxml[itemid] = meta + def set_metadata(self, itemid, meta): + metadata[itemid] = meta - def get_meta_xml(self, itemid): - return metaxml.get(itemid, {}) + def get_metadata(self, itemid): + return metadata.get(itemid, {}) mock_ia = IA() - monkeypatch.setattr(ia, "get_meta_xml", ia.get_meta_xml) + monkeypatch.setattr(ia, 'get_metadata', ia.get_metadata) return mock_ia diff --git a/openlibrary/plugins/books/readlinks.py b/openlibrary/plugins/books/readlinks.py index 011bcffe5ca..e9ef54da173 100644 --- a/openlibrary/plugins/books/readlinks.py +++ b/openlibrary/plugins/books/readlinks.py @@ -331,7 +331,7 @@ def process(self, req): self.wkey_to_iaids = dict((wkey, get_work_iaids(wkey)[:iaid_limit]) for wkey in self.works) iaids = sum(self.wkey_to_iaids.values(), []) - self.iaid_to_meta = dict((iaid, ia.get_meta_xml(iaid)) for iaid in iaids) + self.iaid_to_meta = dict((iaid, ia.get_metadata(iaid)) for iaid in iaids) def lookup_iaids(iaids): step = 10 diff --git a/openlibrary/plugins/openlibrary/home.py b/openlibrary/plugins/openlibrary/home.py index ff7ce08eae9..d21c079c99c 100644 --- a/openlibrary/plugins/openlibrary/home.py +++ b/openlibrary/plugins/openlibrary/home.py @@ -248,7 +248,7 @@ def get_authors(doc): d.cover_url = 'https://archive.org/services/img/%s' % d.ocaid if d.ocaid: - collections = ia.get_meta_xml(d.ocaid).get("collection", []) + collections = ia.get_metadata(d.ocaid).get('collection', []) if 'lendinglibrary' in collections or 'inlibrary' in collections: d.borrow_url = book.url("/borrow") diff --git 
a/openlibrary/plugins/upstream/models.py b/openlibrary/plugins/upstream/models.py index 3bfe36c5a3a..275639c732b 100644 --- a/openlibrary/plugins/upstream/models.py +++ b/openlibrary/plugins/upstream/models.py @@ -148,9 +148,9 @@ def get_ia_meta_fields(self): if not self.get('ocaid', None): meta = {} else: - meta = ia.get_meta_xml(self.ocaid) - meta.setdefault("external-identifier", []) - meta.setdefault("collection", []) + meta = ia.get_metadata(self.ocaid) + meta.setdefault('external-identifier', []) + meta.setdefault('collection', []) self._ia_meta_fields = meta return self._ia_meta_fields diff --git a/openlibrary/solr/process_stats.py b/openlibrary/solr/process_stats.py index 64ff3b4c3d4..99994d83e21 100644 --- a/openlibrary/solr/process_stats.py +++ b/openlibrary/solr/process_stats.py @@ -76,7 +76,7 @@ def _get_metadata(ia_id): if meta: meta['collection'] = meta['collection'].split(";") else: - meta = ia.get_meta_xml(ia_id) + meta = ia.get_metadata(ia_id) return meta def preload(entries): diff --git a/openlibrary/tests/core/test_ia.py b/openlibrary/tests/core/test_ia.py index d58cf2e55f4..72a6d36f535 100644 --- a/openlibrary/tests/core/test_ia.py +++ b/openlibrary/tests/core/test_ia.py @@ -12,7 +12,7 @@ def test_get_metadata(monkeypatch, mock_memcache): } } """ - monkeypatch.setattr(urllib2, "urlopen", lambda url: StringIO(metadata_json)) + monkeypatch.setattr(urllib2, 'urlopen', lambda url: StringIO(metadata_json)) assert ia.get_metadata('foo00bar') == { "title": "Foo", "identifier": "foo00bar", @@ -22,5 +22,5 @@ def test_get_metadata(monkeypatch, mock_memcache): } def test_get_metadata_empty(monkeypatch, mock_memcache): - monkeypatch.setattr(urllib2, "urlopen", lambda url: StringIO("{}")) + monkeypatch.setattr(urllib2, 'urlopen', lambda url: StringIO('{}')) assert ia.get_metadata('foo02bar') == {} diff --git a/scripts/2011/09/generate_deworks.py b/scripts/2011/09/generate_deworks.py index be6d74e817e..5ee642529e2 100644 --- a/scripts/2011/09/generate_deworks.py +++ b/scripts/2011/09/generate_deworks.py @@ -315,7 +315,7 @@ def make_ia_db(editions_dump_file): print >> f, ocaid """ if ocaid: - metaxml = ia.get_meta_xml(ocaid) + metaxml = ia.get_metadata(ocaid) db[ocaid] = simplejson.dumps(metaxml) """ f.close() From c93d0bb16a5cf746d01114183c047ee0ab0db867 Mon Sep 17 00:00:00 2001 From: Charles Horn Date: Fri, 10 Jan 2020 16:16:52 +1300 Subject: [PATCH 09/16] remove deprecated def now nothing uses it --- openlibrary/core/ia.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/openlibrary/core/ia.py b/openlibrary/core/ia.py index 6de3a7b576c..b843c761625 100644 --- a/openlibrary/core/ia.py +++ b/openlibrary/core/ia.py @@ -6,7 +6,6 @@ import os import simplejson import urllib2 -import warnings import web from infogami import config @@ -58,12 +57,6 @@ def _get_metadata(itemid): _get_metadata = web.memoize(_get_metadata, expires=60) -def get_meta_xml(itemid): - # use metadata API instead of parsing meta xml manually - warnings.warn('Deprecated, use ia.get_metadata(itemid) instead.', DeprecationWarning) - return get_metadata(itemid) - - def extract_item_metadata(item_json): metadata = process_metadata_dict(item_json.get('metadata', {})) if metadata: From f6f6f49a99c2061be62cce22078e1495b2de6eca Mon Sep 17 00:00:00 2001 From: Charles Horn Date: Fri, 10 Jan 2020 16:18:11 +1300 Subject: [PATCH 10/16] fix typo in variable name --- openlibrary/core/ia.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/openlibrary/core/ia.py b/openlibrary/core/ia.py index 
b843c761625..84698d0233e 100644 --- a/openlibrary/core/ia.py +++ b/openlibrary/core/ia.py @@ -79,11 +79,11 @@ def process_metadata_dict(metadata): non-list cases. This function makes sure the known multi-valued fields are always lists. """ - mutlivalued = set(['collection', 'external-identifier', 'isbn', 'subject', 'oclc-id']) + multivalued = set(['collection', 'external-identifier', 'isbn', 'subject', 'oclc-id']) def process_item(k, v): - if k in mutlivalued and not isinstance(v, list): + if k in multivalued and not isinstance(v, list): v = [v] - elif k not in mutlivalued and isinstance(v, list): + elif k not in multivalued and isinstance(v, list): v = v[0] return (k, v) return dict(process_item(k, v) for k, v in metadata.items() if v) From 7ffc79f634497bfdc9b1dec786582cb9615d68c1 Mon Sep 17 00:00:00 2001 From: Charles Horn Date: Fri, 10 Jan 2020 16:48:08 +1300 Subject: [PATCH 11/16] DRY up get_metadata methods --- openlibrary/core/ia.py | 25 +++++++------------------ openlibrary/plugins/importapi/code.py | 19 ++++++------------- 2 files changed, 13 insertions(+), 31 deletions(-) diff --git a/openlibrary/core/ia.py b/openlibrary/core/ia.py index 84698d0233e..307875e69fc 100644 --- a/openlibrary/core/ia.py +++ b/openlibrary/core/ia.py @@ -21,7 +21,9 @@ VALID_READY_REPUB_STATES = ['4', '19', '20', '22'] -def get_item_json(itemid): +def _get_metadata(itemid): + """Returns metadata by querying the archive.org metadata API. + """ itemid = web.safestr(itemid.strip()) url = '%s/metadata/%s' % (IA_BASE_URL, itemid) try: @@ -33,30 +35,17 @@ def get_item_json(itemid): stats.end() return {} +# cache the results in memcache for a minute +_get_metadata = web.memoize(_get_metadata, expires=60) + def get_metadata(itemid): - item_json = get_item_json(itemid) + item_json = _get_metadata(itemid) return extract_item_metadata(item_json) get_metadata = cache.memcache_memoize(get_metadata, key_prefix='ia.get_metadata', timeout=5*60) -def _get_metadata(itemid): - """Returns metadata by querying the archive.org metadata API. - """ - url = '%s/metadata/%s' % (IA_BASE_URL, itemid) - try: - stats.begin("archive.org", url=url) - text = urllib2.urlopen(url).read() - stats.end() - return simplejson.loads(text) - except (IOError, ValueError): - return None - -# cache the results in memcache for a minute -_get_metadata = web.memoize(_get_metadata, expires=60) - - def extract_item_metadata(item_json): metadata = process_metadata_dict(item_json.get('metadata', {})) if metadata: diff --git a/openlibrary/plugins/importapi/code.py b/openlibrary/plugins/importapi/code.py index 9106bafccd8..1cac17f37f4 100644 --- a/openlibrary/plugins/importapi/code.py +++ b/openlibrary/plugins/importapi/code.py @@ -206,31 +206,24 @@ def get_subfield(field, id_subfield): return json.dumps(result) # Case 1 - Is this a valid Archive.org item? - try: - item_json = ia.get_item_json(identifier) - item_server = item_json['server'] - item_path = item_json['dir'] - except KeyError: - return self.error("invalid-ia-identifier", "%s not found" % identifier) - metadata = ia.extract_item_metadata(item_json) + metadata = ia.get_metadata(identifier) if not metadata: - return self.error("invalid-ia-identifier") + return self.error('invalid-ia-identifier', '%s not found' % identifier) # Case 2 - Does the item have an openlibrary field specified? # The scan operators search OL before loading the book and add the # OL key if a match is found. We can trust them and attach the item # to that edition. 
- if metadata.get("mediatype") == "texts" and metadata.get("openlibrary"): + if metadata.get('mediatype') == 'texts' and metadata.get('openlibrary'): edition_data = self.get_ia_record(metadata) - edition_data["openlibrary"] = metadata["openlibrary"] + edition_data['openlibrary'] = metadata['openlibrary'] edition_data = self.populate_edition_data(edition_data, identifier) return self.load_book(edition_data) # Case 3 - Can the item be loaded into Open Library? - status = ia.get_item_status(identifier, metadata, - item_server=item_server, item_path=item_path) + status = ia.get_item_status(identifier, metadata) if status != 'ok': - return self.error(status, "Prohibited Item") + return self.error(status, 'Prohibited Item %s' % identifier) # Case 4 - Does this item have a marc record? marc_record = self.get_marc_record(identifier) From 25e40f0f7bd5878cea1ea2a9dcc6029ece846745 Mon Sep 17 00:00:00 2001 From: Charles Horn Date: Fri, 10 Jan 2020 19:57:16 +1300 Subject: [PATCH 12/16] move ia cover logic to ia module --- openlibrary/core/ia.py | 6 ++++++ openlibrary/plugins/importapi/code.py | 25 +++++++------------------ 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/openlibrary/core/ia.py b/openlibrary/core/ia.py index 307875e69fc..80586084af0 100644 --- a/openlibrary/core/ia.py +++ b/openlibrary/core/ia.py @@ -98,6 +98,12 @@ def edition_from_item_metadata(itemid, metadata): return e +def get_cover_url(item_id): + """Gets the URL of the archive.org item's title (or cover) page. + """ + return '{0}/download/{1}/page/title.jpg'.format(IA_BASE_URL, item_id) + + def get_item_manifest(item_id, item_server, item_path): url = 'https://%s/BookReader/BookReaderJSON.php' % item_server url += "?itemPath=%s&itemId=%s&server=%s" % (item_path, item_id, item_server) diff --git a/openlibrary/plugins/importapi/code.py b/openlibrary/plugins/importapi/code.py index 1cac17f37f4..99a9debfb7f 100644 --- a/openlibrary/plugins/importapi/code.py +++ b/openlibrary/plugins/importapi/code.py @@ -2,15 +2,14 @@ """ from infogami.plugins.api.code import add_hook -from infogami import config + from openlibrary.plugins.openlibrary.code import can_write from openlibrary.catalog.marc.marc_binary import MarcBinary, MarcException from openlibrary.catalog.marc.marc_xml import MarcXml from openlibrary.catalog.marc.parse import read_edition from openlibrary.catalog import add_book from openlibrary.catalog.get_ia import get_marc_record_from_ia, get_from_archive_bulk -from openlibrary import accounts -from openlibrary import records +from openlibrary import accounts, records from openlibrary.core import ia import web @@ -26,7 +25,6 @@ from lxml import etree import logging -IA_BASE_URL = config.get('ia_base_url') MARC_LENGTH_POS = 5 logger = logging.getLogger('openlibrary.importapi') @@ -226,18 +224,16 @@ def get_subfield(field, id_subfield): return self.error(status, 'Prohibited Item %s' % identifier) # Case 4 - Does this item have a marc record? 
- marc_record = self.get_marc_record(identifier) + marc_record = get_marc_record_from_ia(identifier) if marc_record: self.reject_non_book_marc(marc_record) try: edition_data = read_edition(marc_record) except MarcException as e: - logger.error("failed to read from MARC record %s: %s", identifier, str(e)) - return self.error("invalid-marc-record") - + logger.error('failed to read from MARC record %s: %s', identifier, str(e)) + return self.error('invalid-marc-record') elif require_marc: - return self.error("no-marc-record") - + return self.error('no-marc-record') else: try: edition_data = self.get_ia_record(metadata) @@ -246,7 +242,6 @@ def get_subfield(field, id_subfield): # Add IA specific fields: ocaid, source_records, and cover edition_data = self.populate_edition_data(edition_data, identifier) - return self.load_book(edition_data) def get_ia_record(self, metadata): @@ -307,15 +302,9 @@ def populate_edition_data(self, edition, identifier): """ edition['ocaid'] = identifier edition['source_records'] = 'ia:' + identifier - edition['cover'] = '{0}/download/{1}/page/title.jpg'.format(IA_BASE_URL, identifier) + edition['cover'] = ia.get_cover_url(identifier) return edition - def get_marc_record(self, identifier): - try: - return get_marc_record_from_ia(identifier) - except IOError: - return None - def find_edition(self, identifier): """ Checks if the given identifier has already been imported into OL. From 52d9765f91423c6a12c238ff68907c0d0b7b0ca9 Mon Sep 17 00:00:00 2001 From: Charles Horn Date: Fri, 10 Jan 2020 20:21:48 +1300 Subject: [PATCH 13/16] use requests instead of urllib2 for ia module --- openlibrary/core/ia.py | 16 +++++++--------- openlibrary/tests/core/test_ia.py | 11 ++++------- 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/openlibrary/core/ia.py b/openlibrary/core/ia.py index 80586084af0..55c8532b49d 100644 --- a/openlibrary/core/ia.py +++ b/openlibrary/core/ia.py @@ -4,8 +4,7 @@ import cache import logging import os -import simplejson -import urllib2 +import requests import web from infogami import config @@ -28,9 +27,9 @@ def _get_metadata(itemid): url = '%s/metadata/%s' % (IA_BASE_URL, itemid) try: stats.begin('archive.org', url=url) - metadata_json = urllib2.urlopen(url).read() + metadata = requests.get(url) stats.end() - return simplejson.loads(metadata_json) + return metadata.json() except IOError: stats.end() return {} @@ -106,12 +105,12 @@ def get_cover_url(item_id): def get_item_manifest(item_id, item_server, item_path): url = 'https://%s/BookReader/BookReaderJSON.php' % item_server - url += "?itemPath=%s&itemId=%s&server=%s" % (item_path, item_id, item_server) + url += '?itemPath=%s&itemId=%s&server=%s' % (item_path, item_id, item_server) try: - stats.begin("archive.org", url=url) - manifest_json = urllib2.urlopen(url).read() + stats.begin('archive.org', url=url) + manifest = requests.get(url) stats.end() - return simplejson.loads(manifest_json) + return manifest.json() except IOError: stats.end() return {} @@ -197,7 +196,6 @@ def get_item_status(cls, itemid, metadata, item_server=None, item_path=None): # items with metadata no_ol_import=true will be not imported if metadata.get("no_ol_import", '').lower() == 'true': return "no-ol-import" - return "ok" @classmethod diff --git a/openlibrary/tests/core/test_ia.py b/openlibrary/tests/core/test_ia.py index 72a6d36f535..13efe3aebc2 100644 --- a/openlibrary/tests/core/test_ia.py +++ b/openlibrary/tests/core/test_ia.py @@ -1,18 +1,15 @@ -from StringIO import StringIO -import urllib2 - from openlibrary.core 
import ia def test_get_metadata(monkeypatch, mock_memcache): - metadata_json = """{ + metadata = { "metadata": { "title": "Foo", "identifier": "foo00bar", "collection": ["printdisabled", "inlibrary"] } } - """ - monkeypatch.setattr(urllib2, 'urlopen', lambda url: StringIO(metadata_json)) + + monkeypatch.setattr(ia, '_get_metadata', lambda _id: metadata) assert ia.get_metadata('foo00bar') == { "title": "Foo", "identifier": "foo00bar", @@ -22,5 +19,5 @@ def test_get_metadata(monkeypatch, mock_memcache): } def test_get_metadata_empty(monkeypatch, mock_memcache): - monkeypatch.setattr(urllib2, 'urlopen', lambda url: StringIO('{}')) + monkeypatch.setattr(ia, '_get_metadata', lambda _id: {}) assert ia.get_metadata('foo02bar') == {} From 43ee8ac402e7f92d102bfd78a0995cbd83399c81 Mon Sep 17 00:00:00 2001 From: Charles Horn Date: Fri, 10 Jan 2020 20:43:52 +1300 Subject: [PATCH 14/16] fallback to archive.org cover.jpg if title.jpg does not exist --- openlibrary/core/ia.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/openlibrary/core/ia.py b/openlibrary/core/ia.py index 55c8532b49d..40dc2eb2ba7 100644 --- a/openlibrary/core/ia.py +++ b/openlibrary/core/ia.py @@ -100,7 +100,11 @@ def edition_from_item_metadata(itemid, metadata): def get_cover_url(item_id): """Gets the URL of the archive.org item's title (or cover) page. """ - return '{0}/download/{1}/page/title.jpg'.format(IA_BASE_URL, item_id) + base_url = '{0}/download/{1}/page/'.format(IA_BASE_URL, item_id) + title_response = requests.head(base_url + 'title.jpg', allow_redirects=True) + if title_response.status_code == 404: + return base_url + 'cover.jpg' + return base_url + 'title.jpg' def get_item_manifest(item_id, item_server, item_path): From b58237abc0136e0a7c99a9a7ea042b88e0028f40 Mon Sep 17 00:00:00 2001 From: Charles Horn Date: Fri, 10 Jan 2020 20:55:10 +1300 Subject: [PATCH 15/16] add TODO for better response codes --- openlibrary/plugins/importapi/code.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/openlibrary/plugins/importapi/code.py b/openlibrary/plugins/importapi/code.py index 99a9debfb7f..01e19a85a1f 100644 --- a/openlibrary/plugins/importapi/code.py +++ b/openlibrary/plugins/importapi/code.py @@ -118,12 +118,13 @@ def POST(self): try: reply = add_book.load(edition) + # TODO: If any records have been created, return a 201, otherwise 200 + return json.dumps(reply) except add_book.RequiredField as e: return self.error('missing-required-field', str(e)) - return json.dumps(reply) def reject_non_book_marc(self, marc_record, **kwargs): - details = "Item rejected" + details = 'Item rejected' # Is the item a serial instead of a book? marc_leaders = marc_record.leader() if marc_leaders[7] == 's': From 06df165003912899a264548f68bb2d6573f14d6c Mon Sep 17 00:00:00 2001 From: Charles Horn Date: Sat, 11 Jan 2020 01:39:46 +1300 Subject: [PATCH 16/16] improve import: --- openlibrary/core/ia.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openlibrary/core/ia.py b/openlibrary/core/ia.py index 40dc2eb2ba7..a972c115f36 100644 --- a/openlibrary/core/ia.py +++ b/openlibrary/core/ia.py @@ -1,7 +1,6 @@ """Library for interacting with archive.org. """ -import cache import logging import os import requests @@ -10,6 +9,7 @@ from infogami import config from infogami.utils import stats +from openlibrary.core import cache from openlibrary.utils.dateutil import date_n_days_ago import six
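
Taken together, patches 05 and 11-16 leave the metadata and cover helpers in openlibrary/core/ia.py in roughly the following shape. This is a condensed sketch recombined from the hunks above for review convenience, not an additional patch: surrounding code is omitted, variable names are lightly adjusted, and extract_item_metadata() is the existing helper in the same module that this series leaves unchanged.

    import logging
    import requests
    import web

    from infogami import config
    from infogami.utils import stats

    from openlibrary.core import cache

    logger = logging.getLogger('openlibrary.ia')

    IA_BASE_URL = config.get('ia_base_url')


    def _get_metadata(itemid):
        """Queries the archive.org metadata API and returns the raw JSON response."""
        itemid = web.safestr(itemid.strip())
        url = '%s/metadata/%s' % (IA_BASE_URL, itemid)
        try:
            stats.begin('archive.org', url=url)
            response = requests.get(url)
            stats.end()
            return response.json()
        except IOError:
            stats.end()
            return {}

    # cache the raw API response in-process for a minute
    _get_metadata = web.memoize(_get_metadata, expires=60)


    def get_metadata(itemid):
        """Returns processed item metadata, with multi-valued fields normalised
        to lists by the existing extract_item_metadata() helper."""
        return extract_item_metadata(_get_metadata(itemid))

    # cache the processed metadata in memcache for five minutes
    get_metadata = cache.memcache_memoize(
        get_metadata, key_prefix='ia.get_metadata', timeout=5 * 60)


    def get_cover_url(item_id):
        """Returns the URL of the item's title page image, falling back to
        cover.jpg when the item has no title.jpg."""
        base_url = '{0}/download/{1}/page/'.format(IA_BASE_URL, item_id)
        title_response = requests.head(base_url + 'title.jpg', allow_redirects=True)
        if title_response.status_code == 404:
            return base_url + 'cover.jpg'
        return base_url + 'title.jpg'

The caching is layered: the raw JSON response is memoized in-process for sixty seconds via web.memoize, while the processed dict is cached in memcache for five minutes via cache.memcache_memoize, matching the timeouts in the hunks above. Callers now use ia.get_metadata(identifier) directly (PATCH 07, 08 and 11) and ia.get_cover_url(identifier) when populating edition data in the import API (PATCH 12).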