diff --git a/openlibrary/catalog/add_book/__init__.py b/openlibrary/catalog/add_book/__init__.py
index 8e3c48e07da..f7bc0b7e2f1 100644
--- a/openlibrary/catalog/add_book/__init__.py
+++ b/openlibrary/catalog/add_book/__init__.py
@@ -542,6 +542,7 @@ def load_data(rec, account=None):
     cover_id = None
     if cover_url:
         cover_id = add_cover(cover_url, ekey, account=account)
+    if cover_id:
         edition['covers'] = [cover_id]
 
     edits = []  # Things (Edition, Work, Authors) to be saved
diff --git a/openlibrary/core/ia.py b/openlibrary/core/ia.py
index 40a12680bac..a972c115f36 100644
--- a/openlibrary/core/ia.py
+++ b/openlibrary/core/ia.py
@@ -1,35 +1,50 @@
-"""Library for interacting wih archive.org.
+"""Library for interacting with archive.org.
 """
+
+import logging
 import os
-import urllib2
-import datetime
-from xml.dom import minidom
-import simplejson
+
+import requests
 import web
-import logging
+
 from infogami import config
 from infogami.utils import stats
-import cache
+
+from openlibrary.core import cache
 from openlibrary.utils.dateutil import date_n_days_ago
 
 import six
 
 logger = logging.getLogger('openlibrary.ia')
 
+IA_BASE_URL = config.get('ia_base_url')
 VALID_READY_REPUB_STATES = ['4', '19', '20', '22']
 
-def get_item_json(itemid):
+
+def _get_metadata(itemid):
+    """Returns metadata by querying the archive.org metadata API.
+    """
     itemid = web.safestr(itemid.strip())
-    url = 'http://archive.org/metadata/%s' % itemid
+    url = '%s/metadata/%s' % (IA_BASE_URL, itemid)
     try:
         stats.begin('archive.org', url=url)
-        metadata_json = urllib2.urlopen(url).read()
+        metadata = requests.get(url)
         stats.end()
-        return simplejson.loads(metadata_json)
+        return metadata.json()
     except IOError:
         stats.end()
         return {}
 
+# cache the results in memcache for a minute
+_get_metadata = web.memoize(_get_metadata, expires=60)
+
+
+def get_metadata(itemid):
+    item_json = _get_metadata(itemid)
+    return extract_item_metadata(item_json)
+
+get_metadata = cache.memcache_memoize(get_metadata, key_prefix='ia.get_metadata', timeout=5*60)
+
+
 def extract_item_metadata(item_json):
     metadata = process_metadata_dict(item_json.get('metadata', {}))
     if metadata:
@@ -42,11 +57,6 @@ def extract_item_metadata(item_json):
         metadata['_filenames'] = [f['name'] for f in files]
     return metadata
 
-def get_metadata(itemid):
-    item_json = get_item_json(itemid)
-    return extract_item_metadata(item_json)
-
-get_metadata = cache.memcache_memoize(get_metadata, key_prefix='ia.get_metadata', timeout=5*60)
 
 def process_metadata_dict(metadata):
     """Process metadata dict to make sure multi-valued fields like
@@ -57,88 +67,15 @@
     non-list cases. This function makes sure the known multi-valued
     fields are always lists.
     """
-    mutlivalued = set(['collection', 'external-identifier', 'isbn', 'subject', 'oclc-id'])
+    multivalued = set(['collection', 'external-identifier', 'isbn', 'subject', 'oclc-id'])
 
     def process_item(k, v):
-        if k in mutlivalued and not isinstance(v, list):
+        if k in multivalued and not isinstance(v, list):
             v = [v]
-        elif k not in mutlivalued and isinstance(v, list):
+        elif k not in multivalued and isinstance(v, list):
             v = v[0]
         return (k, v)
 
     return dict(process_item(k, v) for k, v in metadata.items() if v)
 
-def _old_get_meta_xml(itemid):
-    """Returns the contents of meta_xml as JSON.
- """ - itemid = web.safestr(itemid.strip()) - url = 'http://www.archive.org/download/%s/%s_meta.xml' % (itemid, itemid) - try: - stats.begin('archive.org', url=url) - metaxml = urllib2.urlopen(url).read() - stats.end() - except IOError: - logger.error("Failed to download _meta.xml for %s", itemid, exc_info=True) - stats.end() - return web.storage() - - # archive.org returns html on internal errors. - # Checking for valid xml before trying to parse it. - if not metaxml.strip().startswith(">> xml2dict('12') - {'x': 2} - >>> xml2dict('12', x=[]) - {'x': [1, 2]} - """ - d = defaults - dom = minidom.parseString(xml) - - for node in dom.documentElement.childNodes: - if node.nodeType == node.TEXT_NODE or len(node.childNodes) == 0: - continue - else: - key = node.tagName - value = node.childNodes[0].data - - if key in d and isinstance(d[key], list): - d[key].append(value) - elif key in d and isinstance(d[key], set): - d[key].add(value) - else: - d[key] = value - - return d - -def _get_metadata(itemid): - """Returns metadata by querying the archive.org metadata API. - """ - url = "http://www.archive.org/metadata/%s" % itemid - try: - stats.begin("archive.org", url=url) - text = urllib2.urlopen(url).read() - stats.end() - return simplejson.loads(text) - except (IOError, ValueError): - return None - -# cache the results in memcache for a minute -_get_metadata = web.memoize(_get_metadata, expires=60) def locate_item(itemid): """Returns (hostname, path) for the item. @@ -146,6 +83,7 @@ def locate_item(itemid): d = _get_metadata(itemid) return d.get('server'), d.get('dir') + def edition_from_item_metadata(itemid, metadata): """Converts the item metadata into a form suitable to be used as edition in Open Library. @@ -158,24 +96,37 @@ def edition_from_item_metadata(itemid, metadata): e.add_metadata(metadata) return e + +def get_cover_url(item_id): + """Gets the URL of the archive.org item's title (or cover) page. + """ + base_url = '{0}/download/{1}/page/'.format(IA_BASE_URL, item_id) + title_response = requests.head(base_url + 'title.jpg', allow_redirects=True) + if title_response.status_code == 404: + return base_url + 'cover.jpg' + return base_url + 'title.jpg' + + def get_item_manifest(item_id, item_server, item_path): url = 'https://%s/BookReader/BookReaderJSON.php' % item_server - url += "?itemPath=%s&itemId=%s&server=%s" % (item_path, item_id, item_server) + url += '?itemPath=%s&itemId=%s&server=%s' % (item_path, item_id, item_server) try: - stats.begin("archive.org", url=url) - manifest_json = urllib2.urlopen(url).read() + stats.begin('archive.org', url=url) + manifest = requests.get(url) stats.end() - return simplejson.loads(manifest_json) + return manifest.json() except IOError: stats.end() return {} + def get_item_status(itemid, metadata, **server): item_server = server.pop('item_server', None) item_path = server.pop('item_path', None) return ItemEdition.get_item_status(itemid, metadata, item_server=item_server, item_path=item_path) + class ItemEdition(dict): """Class to convert item metadata into edition dict. 
""" @@ -249,7 +200,6 @@ def get_item_status(cls, itemid, metadata, item_server=None, item_path=None): # items with metadata no_ol_import=true will be not imported if metadata.get("no_ol_import", '').lower() == 'true': return "no-ol-import" - return "ok" @classmethod @@ -264,15 +214,12 @@ def is_valid_item(cls, itemid, metadata): def add_metadata(self, metadata): self.metadata = metadata - self.add('title') self.add('description', 'description') self.add_list('publisher', 'publishers') - self.add_list("creator", "author_names") + self.add_list('creator', 'author_names') self.add('date', 'publish_date') - self.add_isbns() - self.add_subjects() def add(self, key, key2=None): metadata = self.metadata @@ -298,7 +245,7 @@ def add_list(self, key, key2): metadata = self.metadata key2 = key2 or key - # sometimes the empty values are represneted as {} in metadata API. Avoid them. + # sometimes the empty values are represented as {} in metadata API. Avoid them. if key in metadata and metadata[key] != {}: value = metadata[key] if not isinstance(value, list): @@ -321,15 +268,6 @@ def add_isbns(self): if isbn_13: self["isbn_13"] = isbn_13 - def add_subjects(self): - collections = self.metadata.get("collection", []) - mapping = { - "inlibrary": "In library", - "lendinglibrary": "Lending library" - } - subjects = [subject for c, subject in mapping.items() if c in collections] - if subjects: - self['subjects'] = subjects _ia_db = None def get_ia_db(configfile=None): diff --git a/openlibrary/mocks/mock_ia.py b/openlibrary/mocks/mock_ia.py index c596663030b..032f6322f88 100644 --- a/openlibrary/mocks/mock_ia.py +++ b/openlibrary/mocks/mock_ia.py @@ -10,21 +10,21 @@ def mock_ia(request, monkeypatch): from openlibrary.core import ia def test_ia(mock_ia): - assert ia.get_meta_xml("foo") == {} + assert ia.get_metadata("foo") == {} - mock_ia.set_meta_xml("foo", {"collection": ["a", "b"]}) - assert ia.get_meta_xml("foo") == {"collection": ["a", "b"]} + mock_ia.set_metadata("foo", {"collection": ["a", "b"]}) + assert ia.get_metadata("foo") == {"collection": ["a", "b"]} """ - metaxml = {} + metadata = {} class IA: - def set_meta_xml(self, itemid, meta): - metaxml[itemid] = meta + def set_metadata(self, itemid, meta): + metadata[itemid] = meta - def get_meta_xml(self, itemid): - return metaxml.get(itemid, {}) + def get_metadata(self, itemid): + return metadata.get(itemid, {}) mock_ia = IA() - monkeypatch.setattr(ia, "get_meta_xml", ia.get_meta_xml) + monkeypatch.setattr(ia, 'get_metadata', ia.get_metadata) return mock_ia diff --git a/openlibrary/plugins/books/dynlinks.py b/openlibrary/plugins/books/dynlinks.py index 6d31c9e22be..cc4e1d6a244 100644 --- a/openlibrary/plugins/books/dynlinks.py +++ b/openlibrary/plugins/books/dynlinks.py @@ -364,7 +364,7 @@ def process_result_for_viewapi(result): def get_ia_availability(itemid): - collections = ia.get_meta_xml(itemid).get("collection", []) + collections = ia.get_metadata(itemid).get('collection', []) if 'lendinglibrary' in collections or 'inlibrary' in collections: return 'borrow' diff --git a/openlibrary/plugins/books/readlinks.py b/openlibrary/plugins/books/readlinks.py index 011bcffe5ca..e9ef54da173 100644 --- a/openlibrary/plugins/books/readlinks.py +++ b/openlibrary/plugins/books/readlinks.py @@ -331,7 +331,7 @@ def process(self, req): self.wkey_to_iaids = dict((wkey, get_work_iaids(wkey)[:iaid_limit]) for wkey in self.works) iaids = sum(self.wkey_to_iaids.values(), []) - self.iaid_to_meta = dict((iaid, ia.get_meta_xml(iaid)) for iaid in iaids) + self.iaid_to_meta = 
dict((iaid, ia.get_metadata(iaid)) for iaid in iaids) def lookup_iaids(iaids): step = 10 diff --git a/openlibrary/plugins/books/tests/test_dynlinks.py b/openlibrary/plugins/books/tests/test_dynlinks.py index fe7f2795962..42e476e6d47 100644 --- a/openlibrary/plugins/books/tests/test_dynlinks.py +++ b/openlibrary/plugins/books/tests/test_dynlinks.py @@ -236,7 +236,7 @@ def monkeypatch_ol(monkeypatch): mock.default = [] monkeypatch.setattr(dynlinks, "ol_get_many", mock) - monkeypatch.setattr(ia, "get_meta_xml", lambda itemid: web.storage()) + monkeypatch.setattr(ia, "get_metadata", lambda itemid: web.storage()) def test_query_keys(monkeypatch): monkeypatch_ol(monkeypatch) diff --git a/openlibrary/plugins/importapi/code.py b/openlibrary/plugins/importapi/code.py index 4d2ba2b9800..01e19a85a1f 100644 --- a/openlibrary/plugins/importapi/code.py +++ b/openlibrary/plugins/importapi/code.py @@ -2,15 +2,14 @@ """ from infogami.plugins.api.code import add_hook -from infogami import config + from openlibrary.plugins.openlibrary.code import can_write from openlibrary.catalog.marc.marc_binary import MarcBinary, MarcException from openlibrary.catalog.marc.marc_xml import MarcXml from openlibrary.catalog.marc.parse import read_edition from openlibrary.catalog import add_book from openlibrary.catalog.get_ia import get_marc_record_from_ia, get_from_archive_bulk -from openlibrary import accounts -from openlibrary import records +from openlibrary import accounts, records from openlibrary.core import ia import web @@ -26,7 +25,6 @@ from lxml import etree import logging -IA_BASE_URL = config.get('ia_base_url') MARC_LENGTH_POS = 5 logger = logging.getLogger('openlibrary.importapi') @@ -120,12 +118,13 @@ def POST(self): try: reply = add_book.load(edition) + # TODO: If any records have been created, return a 201, otherwise 200 + return json.dumps(reply) except add_book.RequiredField as e: return self.error('missing-required-field', str(e)) - return json.dumps(reply) def reject_non_book_marc(self, marc_record, **kwargs): - details = "Item rejected" + details = 'Item rejected' # Is the item a serial instead of a book? marc_leaders = marc_record.leader() if marc_leaders[7] == 's': @@ -206,45 +205,36 @@ def get_subfield(field, id_subfield): return json.dumps(result) # Case 1 - Is this a valid Archive.org item? - try: - item_json = ia.get_item_json(identifier) - item_server = item_json['server'] - item_path = item_json['dir'] - except KeyError: - return self.error("invalid-ia-identifier", "%s not found" % identifier) - metadata = ia.extract_item_metadata(item_json) + metadata = ia.get_metadata(identifier) if not metadata: - return self.error("invalid-ia-identifier") + return self.error('invalid-ia-identifier', '%s not found' % identifier) # Case 2 - Does the item have an openlibrary field specified? # The scan operators search OL before loading the book and add the # OL key if a match is found. We can trust them and attach the item # to that edition. - if metadata.get("mediatype") == "texts" and metadata.get("openlibrary"): + if metadata.get('mediatype') == 'texts' and metadata.get('openlibrary'): edition_data = self.get_ia_record(metadata) - edition_data["openlibrary"] = metadata["openlibrary"] + edition_data['openlibrary'] = metadata['openlibrary'] edition_data = self.populate_edition_data(edition_data, identifier) return self.load_book(edition_data) # Case 3 - Can the item be loaded into Open Library? 
-        status = ia.get_item_status(identifier, metadata,
-                                    item_server=item_server, item_path=item_path)
+        status = ia.get_item_status(identifier, metadata)
         if status != 'ok':
-            return self.error(status, "Prohibited Item")
+            return self.error(status, 'Prohibited Item %s' % identifier)
 
         # Case 4 - Does this item have a marc record?
-        marc_record = self.get_marc_record(identifier)
+        marc_record = get_marc_record_from_ia(identifier)
         if marc_record:
             self.reject_non_book_marc(marc_record)
             try:
                 edition_data = read_edition(marc_record)
             except MarcException as e:
-                logger.error("failed to read from MARC record %s: %s", identifier, str(e))
-                return self.error("invalid-marc-record")
-
+                logger.error('failed to read from MARC record %s: %s', identifier, str(e))
+                return self.error('invalid-marc-record')
         elif require_marc:
-            return self.error("no-marc-record")
-
+            return self.error('no-marc-record')
         else:
             try:
                 edition_data = self.get_ia_record(metadata)
@@ -253,7 +243,6 @@ def get_subfield(field, id_subfield):
 
         # Add IA specific fields: ocaid, source_records, and cover
        edition_data = self.populate_edition_data(edition_data, identifier)
-
         return self.load_book(edition_data)
 
     def get_ia_record(self, metadata):
@@ -313,16 +302,10 @@ def populate_edition_data(self, edition, identifier):
         :return: Edition record
         """
         edition['ocaid'] = identifier
-        edition['source_records'] = "ia:" + identifier
-        edition['cover'] = "{0}/download/{1}/{1}/page/title.jpg".format(IA_BASE_URL, identifier)
+        edition['source_records'] = 'ia:' + identifier
+        edition['cover'] = ia.get_cover_url(identifier)
         return edition
 
-    def get_marc_record(self, identifier):
-        try:
-            return get_marc_record_from_ia(identifier)
-        except IOError:
-            return None
-
     def find_edition(self, identifier):
         """ Checks if the given identifier has already been imported into OL.
diff --git a/openlibrary/plugins/openlibrary/home.py b/openlibrary/plugins/openlibrary/home.py
index ff7ce08eae9..d21c079c99c 100644
--- a/openlibrary/plugins/openlibrary/home.py
+++ b/openlibrary/plugins/openlibrary/home.py
@@ -248,7 +248,7 @@ def get_authors(doc):
             d.cover_url = 'https://archive.org/services/img/%s' % d.ocaid
 
         if d.ocaid:
-            collections = ia.get_meta_xml(d.ocaid).get("collection", [])
+            collections = ia.get_metadata(d.ocaid).get('collection', [])
 
             if 'lendinglibrary' in collections or 'inlibrary' in collections:
                 d.borrow_url = book.url("/borrow")
diff --git a/openlibrary/plugins/upstream/models.py b/openlibrary/plugins/upstream/models.py
index 3bfe36c5a3a..275639c732b 100644
--- a/openlibrary/plugins/upstream/models.py
+++ b/openlibrary/plugins/upstream/models.py
@@ -148,9 +148,9 @@ def get_ia_meta_fields(self):
         if not self.get('ocaid', None):
             meta = {}
         else:
-            meta = ia.get_meta_xml(self.ocaid)
-        meta.setdefault("external-identifier", [])
-        meta.setdefault("collection", [])
+            meta = ia.get_metadata(self.ocaid)
+        meta.setdefault('external-identifier', [])
+        meta.setdefault('collection', [])
         self._ia_meta_fields = meta
         return self._ia_meta_fields
 
diff --git a/openlibrary/solr/process_stats.py b/openlibrary/solr/process_stats.py
index 64ff3b4c3d4..99994d83e21 100644
--- a/openlibrary/solr/process_stats.py
+++ b/openlibrary/solr/process_stats.py
@@ -76,7 +76,7 @@ def _get_metadata(ia_id):
         if meta:
             meta['collection'] = meta['collection'].split(";")
     else:
-        meta = ia.get_meta_xml(ia_id)
+        meta = ia.get_metadata(ia_id)
     return meta
 
 def preload(entries):
diff --git a/openlibrary/tests/core/test_ia.py b/openlibrary/tests/core/test_ia.py
index 6a57a22ec26..13efe3aebc2 100644
--- a/openlibrary/tests/core/test_ia.py
+++ b/openlibrary/tests/core/test_ia.py
@@ -1,35 +1,16 @@
-from __future__ import print_function
 from openlibrary.core import ia
 
-def test_xml2dict():
-    assert ia.xml2dict("12") == {"x": "1", "y": "2"}
-    assert ia.xml2dict("12", x=[]) == {"x": ["1"], "y": "2"}
-
-    assert ia.xml2dict("12") == {"x": "2"}
-    assert ia.xml2dict("12", x=[]) == {"x": ["1", "2"]}
-
-def test_get_metaxml(monkeypatch, mock_memcache):
-    import StringIO
-    import urllib2
-
-    metadata_json = None
-    def urlopen(url):
-        return StringIO.StringIO(metadata_json)
-
-    monkeypatch.setattr(urllib2, "urlopen", urlopen)
-
-    # test with correct xml
-    metadata_json = """{
+def test_get_metadata(monkeypatch, mock_memcache):
+    metadata = {
         "metadata": {
             "title": "Foo",
             "identifier": "foo00bar",
             "collection": ["printdisabled", "inlibrary"]
         }
     }
-    """
-    print(ia.get_meta_xml("foo00bar"))
-    assert ia.get_meta_xml("foo00bar") == {
+    monkeypatch.setattr(ia, '_get_metadata', lambda _id: metadata)
+    assert ia.get_metadata('foo00bar') == {
         "title": "Foo",
         "identifier": "foo00bar",
         "collection": ["printdisabled", "inlibrary"],
@@ -37,6 +18,6 @@ def urlopen(url):
         "_filenames": []
     }
 
-    # test with metadata errors
-    metadata_json = "{}"
-    assert ia.get_meta_xml("foo02bar") == {}
+def test_get_metadata_empty(monkeypatch, mock_memcache):
+    monkeypatch.setattr(ia, '_get_metadata', lambda _id: {})
+    assert ia.get_metadata('foo02bar') == {}
diff --git a/scripts/2011/09/generate_deworks.py b/scripts/2011/09/generate_deworks.py
index be6d74e817e..5ee642529e2 100644
--- a/scripts/2011/09/generate_deworks.py
+++ b/scripts/2011/09/generate_deworks.py
@@ -315,7 +315,7 @@ def make_ia_db(editions_dump_file):
             print >> f, ocaid
         """
         if ocaid:
-            metaxml = ia.get_meta_xml(ocaid)
+            metaxml = ia.get_metadata(ocaid)
             db[ocaid] = simplejson.dumps(metaxml)
""" f.close()