diff --git a/openlibrary/catalog/add_book/__init__.py b/openlibrary/catalog/add_book/__init__.py
index 8e3c48e07da..f7bc0b7e2f1 100644
--- a/openlibrary/catalog/add_book/__init__.py
+++ b/openlibrary/catalog/add_book/__init__.py
@@ -542,6 +542,7 @@ def load_data(rec, account=None):
cover_id = None
if cover_url:
cover_id = add_cover(cover_url, ekey, account=account)
+ if cover_id:
edition['covers'] = [cover_id]
edits = [] # Things (Edition, Work, Authors) to be saved
diff --git a/openlibrary/core/ia.py b/openlibrary/core/ia.py
index 40a12680bac..a972c115f36 100644
--- a/openlibrary/core/ia.py
+++ b/openlibrary/core/ia.py
@@ -1,35 +1,50 @@
-"""Library for interacting wih archive.org.
+"""Library for interacting with archive.org.
"""
+
+import logging
import os
-import urllib2
-import datetime
-from xml.dom import minidom
-import simplejson
+import requests
import web
-import logging
+
from infogami import config
from infogami.utils import stats
-import cache
+
+from openlibrary.core import cache
from openlibrary.utils.dateutil import date_n_days_ago
import six
logger = logging.getLogger('openlibrary.ia')
+IA_BASE_URL = config.get('ia_base_url')
VALID_READY_REPUB_STATES = ['4', '19', '20', '22']
-def get_item_json(itemid):
+
+def _get_metadata(itemid):
+ """Returns metadata by querying the archive.org metadata API.
+ """
itemid = web.safestr(itemid.strip())
- url = 'http://archive.org/metadata/%s' % itemid
+ url = '%s/metadata/%s' % (IA_BASE_URL, itemid)
try:
stats.begin('archive.org', url=url)
- metadata_json = urllib2.urlopen(url).read()
+ metadata = requests.get(url)
stats.end()
- return simplejson.loads(metadata_json)
+ return metadata.json()
except IOError:
stats.end()
return {}
+# cache the results in process memory (web.memoize) for a minute
+_get_metadata = web.memoize(_get_metadata, expires=60)
+
+
+def get_metadata(itemid):
+ item_json = _get_metadata(itemid)
+ return extract_item_metadata(item_json)
+
+get_metadata = cache.memcache_memoize(get_metadata, key_prefix='ia.get_metadata', timeout=5*60)
+
+
def extract_item_metadata(item_json):
metadata = process_metadata_dict(item_json.get('metadata', {}))
if metadata:
@@ -42,11 +57,6 @@ def extract_item_metadata(item_json):
metadata['_filenames'] = [f['name'] for f in files]
return metadata
-def get_metadata(itemid):
- item_json = get_item_json(itemid)
- return extract_item_metadata(item_json)
-
-get_metadata = cache.memcache_memoize(get_metadata, key_prefix='ia.get_metadata', timeout=5*60)
def process_metadata_dict(metadata):
"""Process metadata dict to make sure multi-valued fields like
@@ -57,88 +67,15 @@ def process_metadata_dict(metadata):
non-list cases. This function makes sure the known multi-valued fields are
always lists.
"""
- mutlivalued = set(['collection', 'external-identifier', 'isbn', 'subject', 'oclc-id'])
+ multivalued = set(['collection', 'external-identifier', 'isbn', 'subject', 'oclc-id'])
def process_item(k, v):
- if k in mutlivalued and not isinstance(v, list):
+ if k in multivalued and not isinstance(v, list):
v = [v]
- elif k not in mutlivalued and isinstance(v, list):
+ elif k not in multivalued and isinstance(v, list):
v = v[0]
return (k, v)
return dict(process_item(k, v) for k, v in metadata.items() if v)
-def _old_get_meta_xml(itemid):
- """Returns the contents of meta_xml as JSON.
- """
- itemid = web.safestr(itemid.strip())
- url = 'http://www.archive.org/download/%s/%s_meta.xml' % (itemid, itemid)
- try:
- stats.begin('archive.org', url=url)
- metaxml = urllib2.urlopen(url).read()
- stats.end()
- except IOError:
- logger.error("Failed to download _meta.xml for %s", itemid, exc_info=True)
- stats.end()
- return web.storage()
-
- # archive.org returns html on internal errors.
- # Checking for valid xml before trying to parse it.
- if not metaxml.strip().startswith(">> xml2dict('12')
- {'x': 2}
- >>> xml2dict('12', x=[])
- {'x': [1, 2]}
- """
- d = defaults
- dom = minidom.parseString(xml)
-
- for node in dom.documentElement.childNodes:
- if node.nodeType == node.TEXT_NODE or len(node.childNodes) == 0:
- continue
- else:
- key = node.tagName
- value = node.childNodes[0].data
-
- if key in d and isinstance(d[key], list):
- d[key].append(value)
- elif key in d and isinstance(d[key], set):
- d[key].add(value)
- else:
- d[key] = value
-
- return d
-
-def _get_metadata(itemid):
- """Returns metadata by querying the archive.org metadata API.
- """
- url = "http://www.archive.org/metadata/%s" % itemid
- try:
- stats.begin("archive.org", url=url)
- text = urllib2.urlopen(url).read()
- stats.end()
- return simplejson.loads(text)
- except (IOError, ValueError):
- return None
-
-# cache the results in memcache for a minute
-_get_metadata = web.memoize(_get_metadata, expires=60)
def locate_item(itemid):
"""Returns (hostname, path) for the item.
@@ -146,6 +83,7 @@ def locate_item(itemid):
d = _get_metadata(itemid)
return d.get('server'), d.get('dir')
+
def edition_from_item_metadata(itemid, metadata):
"""Converts the item metadata into a form suitable to be used as edition
in Open Library.
@@ -158,24 +96,37 @@ def edition_from_item_metadata(itemid, metadata):
e.add_metadata(metadata)
return e
+
+def get_cover_url(item_id):
+ """Gets the URL of the archive.org item's title (or cover) page.
+ """
+ base_url = '{0}/download/{1}/page/'.format(IA_BASE_URL, item_id)
+ title_response = requests.head(base_url + 'title.jpg', allow_redirects=True)
+ if title_response.status_code == 404:
+ return base_url + 'cover.jpg'
+ return base_url + 'title.jpg'
+
+
def get_item_manifest(item_id, item_server, item_path):
url = 'https://%s/BookReader/BookReaderJSON.php' % item_server
- url += "?itemPath=%s&itemId=%s&server=%s" % (item_path, item_id, item_server)
+ url += '?itemPath=%s&itemId=%s&server=%s' % (item_path, item_id, item_server)
try:
- stats.begin("archive.org", url=url)
- manifest_json = urllib2.urlopen(url).read()
+ stats.begin('archive.org', url=url)
+ manifest = requests.get(url)
stats.end()
- return simplejson.loads(manifest_json)
+ return manifest.json()
except IOError:
stats.end()
return {}
+
def get_item_status(itemid, metadata, **server):
item_server = server.pop('item_server', None)
item_path = server.pop('item_path', None)
return ItemEdition.get_item_status(itemid, metadata, item_server=item_server,
item_path=item_path)
+
class ItemEdition(dict):
"""Class to convert item metadata into edition dict.
"""
@@ -249,7 +200,6 @@ def get_item_status(cls, itemid, metadata, item_server=None, item_path=None):
# items with metadata no_ol_import=true will be not imported
if metadata.get("no_ol_import", '').lower() == 'true':
return "no-ol-import"
-
return "ok"
@classmethod
@@ -264,15 +214,12 @@ def is_valid_item(cls, itemid, metadata):
def add_metadata(self, metadata):
self.metadata = metadata
-
self.add('title')
self.add('description', 'description')
self.add_list('publisher', 'publishers')
- self.add_list("creator", "author_names")
+ self.add_list('creator', 'author_names')
self.add('date', 'publish_date')
-
self.add_isbns()
- self.add_subjects()
def add(self, key, key2=None):
metadata = self.metadata
@@ -298,7 +245,7 @@ def add_list(self, key, key2):
metadata = self.metadata
key2 = key2 or key
- # sometimes the empty values are represneted as {} in metadata API. Avoid them.
+ # sometimes the empty values are represented as {} in metadata API. Avoid them.
if key in metadata and metadata[key] != {}:
value = metadata[key]
if not isinstance(value, list):
@@ -321,15 +268,6 @@ def add_isbns(self):
if isbn_13:
self["isbn_13"] = isbn_13
- def add_subjects(self):
- collections = self.metadata.get("collection", [])
- mapping = {
- "inlibrary": "In library",
- "lendinglibrary": "Lending library"
- }
- subjects = [subject for c, subject in mapping.items() if c in collections]
- if subjects:
- self['subjects'] = subjects
_ia_db = None
def get_ia_db(configfile=None):
diff --git a/openlibrary/mocks/mock_ia.py b/openlibrary/mocks/mock_ia.py
index c596663030b..032f6322f88 100644
--- a/openlibrary/mocks/mock_ia.py
+++ b/openlibrary/mocks/mock_ia.py
@@ -10,21 +10,21 @@ def mock_ia(request, monkeypatch):
from openlibrary.core import ia
def test_ia(mock_ia):
- assert ia.get_meta_xml("foo") == {}
+ assert ia.get_metadata("foo") == {}
- mock_ia.set_meta_xml("foo", {"collection": ["a", "b"]})
- assert ia.get_meta_xml("foo") == {"collection": ["a", "b"]}
+ mock_ia.set_metadata("foo", {"collection": ["a", "b"]})
+ assert ia.get_metadata("foo") == {"collection": ["a", "b"]}
"""
- metaxml = {}
+ metadata = {}
class IA:
- def set_meta_xml(self, itemid, meta):
- metaxml[itemid] = meta
+ def set_metadata(self, itemid, meta):
+ metadata[itemid] = meta
- def get_meta_xml(self, itemid):
- return metaxml.get(itemid, {})
+ def get_metadata(self, itemid):
+ return metadata.get(itemid, {})
mock_ia = IA()
- monkeypatch.setattr(ia, "get_meta_xml", ia.get_meta_xml)
+ monkeypatch.setattr(ia, 'get_metadata', ia.get_metadata)
return mock_ia
diff --git a/openlibrary/plugins/books/dynlinks.py b/openlibrary/plugins/books/dynlinks.py
index 6d31c9e22be..cc4e1d6a244 100644
--- a/openlibrary/plugins/books/dynlinks.py
+++ b/openlibrary/plugins/books/dynlinks.py
@@ -364,7 +364,7 @@ def process_result_for_viewapi(result):
def get_ia_availability(itemid):
- collections = ia.get_meta_xml(itemid).get("collection", [])
+ collections = ia.get_metadata(itemid).get('collection', [])
if 'lendinglibrary' in collections or 'inlibrary' in collections:
return 'borrow'
diff --git a/openlibrary/plugins/books/readlinks.py b/openlibrary/plugins/books/readlinks.py
index 011bcffe5ca..e9ef54da173 100644
--- a/openlibrary/plugins/books/readlinks.py
+++ b/openlibrary/plugins/books/readlinks.py
@@ -331,7 +331,7 @@ def process(self, req):
self.wkey_to_iaids = dict((wkey, get_work_iaids(wkey)[:iaid_limit])
for wkey in self.works)
iaids = sum(self.wkey_to_iaids.values(), [])
- self.iaid_to_meta = dict((iaid, ia.get_meta_xml(iaid)) for iaid in iaids)
+ self.iaid_to_meta = dict((iaid, ia.get_metadata(iaid)) for iaid in iaids)
def lookup_iaids(iaids):
step = 10
diff --git a/openlibrary/plugins/books/tests/test_dynlinks.py b/openlibrary/plugins/books/tests/test_dynlinks.py
index fe7f2795962..42e476e6d47 100644
--- a/openlibrary/plugins/books/tests/test_dynlinks.py
+++ b/openlibrary/plugins/books/tests/test_dynlinks.py
@@ -236,7 +236,7 @@ def monkeypatch_ol(monkeypatch):
mock.default = []
monkeypatch.setattr(dynlinks, "ol_get_many", mock)
- monkeypatch.setattr(ia, "get_meta_xml", lambda itemid: web.storage())
+ monkeypatch.setattr(ia, "get_metadata", lambda itemid: web.storage())
def test_query_keys(monkeypatch):
monkeypatch_ol(monkeypatch)
diff --git a/openlibrary/plugins/importapi/code.py b/openlibrary/plugins/importapi/code.py
index 4d2ba2b9800..01e19a85a1f 100644
--- a/openlibrary/plugins/importapi/code.py
+++ b/openlibrary/plugins/importapi/code.py
@@ -2,15 +2,14 @@
"""
from infogami.plugins.api.code import add_hook
-from infogami import config
+
from openlibrary.plugins.openlibrary.code import can_write
from openlibrary.catalog.marc.marc_binary import MarcBinary, MarcException
from openlibrary.catalog.marc.marc_xml import MarcXml
from openlibrary.catalog.marc.parse import read_edition
from openlibrary.catalog import add_book
from openlibrary.catalog.get_ia import get_marc_record_from_ia, get_from_archive_bulk
-from openlibrary import accounts
-from openlibrary import records
+from openlibrary import accounts, records
from openlibrary.core import ia
import web
@@ -26,7 +25,6 @@
from lxml import etree
import logging
-IA_BASE_URL = config.get('ia_base_url')
MARC_LENGTH_POS = 5
logger = logging.getLogger('openlibrary.importapi')
@@ -120,12 +118,13 @@ def POST(self):
try:
reply = add_book.load(edition)
+ # TODO: If any records have been created, return a 201, otherwise 200
+ return json.dumps(reply)
except add_book.RequiredField as e:
return self.error('missing-required-field', str(e))
- return json.dumps(reply)
def reject_non_book_marc(self, marc_record, **kwargs):
- details = "Item rejected"
+ details = 'Item rejected'
# Is the item a serial instead of a book?
marc_leaders = marc_record.leader()
if marc_leaders[7] == 's':
@@ -206,45 +205,36 @@ def get_subfield(field, id_subfield):
return json.dumps(result)
# Case 1 - Is this a valid Archive.org item?
- try:
- item_json = ia.get_item_json(identifier)
- item_server = item_json['server']
- item_path = item_json['dir']
- except KeyError:
- return self.error("invalid-ia-identifier", "%s not found" % identifier)
- metadata = ia.extract_item_metadata(item_json)
+ metadata = ia.get_metadata(identifier)
if not metadata:
- return self.error("invalid-ia-identifier")
+ return self.error('invalid-ia-identifier', '%s not found' % identifier)
# Case 2 - Does the item have an openlibrary field specified?
# The scan operators search OL before loading the book and add the
# OL key if a match is found. We can trust them and attach the item
# to that edition.
- if metadata.get("mediatype") == "texts" and metadata.get("openlibrary"):
+ if metadata.get('mediatype') == 'texts' and metadata.get('openlibrary'):
edition_data = self.get_ia_record(metadata)
- edition_data["openlibrary"] = metadata["openlibrary"]
+ edition_data['openlibrary'] = metadata['openlibrary']
edition_data = self.populate_edition_data(edition_data, identifier)
return self.load_book(edition_data)
# Case 3 - Can the item be loaded into Open Library?
- status = ia.get_item_status(identifier, metadata,
- item_server=item_server, item_path=item_path)
+ status = ia.get_item_status(identifier, metadata)
if status != 'ok':
- return self.error(status, "Prohibited Item")
+ return self.error(status, 'Prohibited Item %s' % identifier)
# Case 4 - Does this item have a marc record?
- marc_record = self.get_marc_record(identifier)
+ marc_record = get_marc_record_from_ia(identifier)
if marc_record:
self.reject_non_book_marc(marc_record)
try:
edition_data = read_edition(marc_record)
except MarcException as e:
- logger.error("failed to read from MARC record %s: %s", identifier, str(e))
- return self.error("invalid-marc-record")
-
+ logger.error('failed to read from MARC record %s: %s', identifier, str(e))
+ return self.error('invalid-marc-record')
elif require_marc:
- return self.error("no-marc-record")
-
+ return self.error('no-marc-record')
else:
try:
edition_data = self.get_ia_record(metadata)
@@ -253,7 +243,6 @@ def get_subfield(field, id_subfield):
# Add IA specific fields: ocaid, source_records, and cover
edition_data = self.populate_edition_data(edition_data, identifier)
-
return self.load_book(edition_data)
def get_ia_record(self, metadata):
@@ -313,16 +302,10 @@ def populate_edition_data(self, edition, identifier):
:return: Edition record
"""
edition['ocaid'] = identifier
- edition['source_records'] = "ia:" + identifier
- edition['cover'] = "{0}/download/{1}/{1}/page/title.jpg".format(IA_BASE_URL, identifier)
+ edition['source_records'] = 'ia:' + identifier
+ edition['cover'] = ia.get_cover_url(identifier)
return edition
- def get_marc_record(self, identifier):
- try:
- return get_marc_record_from_ia(identifier)
- except IOError:
- return None
-
def find_edition(self, identifier):
"""
Checks if the given identifier has already been imported into OL.
diff --git a/openlibrary/plugins/openlibrary/home.py b/openlibrary/plugins/openlibrary/home.py
index ff7ce08eae9..d21c079c99c 100644
--- a/openlibrary/plugins/openlibrary/home.py
+++ b/openlibrary/plugins/openlibrary/home.py
@@ -248,7 +248,7 @@ def get_authors(doc):
d.cover_url = 'https://archive.org/services/img/%s' % d.ocaid
if d.ocaid:
- collections = ia.get_meta_xml(d.ocaid).get("collection", [])
+ collections = ia.get_metadata(d.ocaid).get('collection', [])
if 'lendinglibrary' in collections or 'inlibrary' in collections:
d.borrow_url = book.url("/borrow")
diff --git a/openlibrary/plugins/upstream/models.py b/openlibrary/plugins/upstream/models.py
index 3bfe36c5a3a..275639c732b 100644
--- a/openlibrary/plugins/upstream/models.py
+++ b/openlibrary/plugins/upstream/models.py
@@ -148,9 +148,9 @@ def get_ia_meta_fields(self):
if not self.get('ocaid', None):
meta = {}
else:
- meta = ia.get_meta_xml(self.ocaid)
- meta.setdefault("external-identifier", [])
- meta.setdefault("collection", [])
+ meta = ia.get_metadata(self.ocaid)
+ meta.setdefault('external-identifier', [])
+ meta.setdefault('collection', [])
self._ia_meta_fields = meta
return self._ia_meta_fields
diff --git a/openlibrary/solr/process_stats.py b/openlibrary/solr/process_stats.py
index 64ff3b4c3d4..99994d83e21 100644
--- a/openlibrary/solr/process_stats.py
+++ b/openlibrary/solr/process_stats.py
@@ -76,7 +76,7 @@ def _get_metadata(ia_id):
if meta:
meta['collection'] = meta['collection'].split(";")
else:
- meta = ia.get_meta_xml(ia_id)
+ meta = ia.get_metadata(ia_id)
return meta
def preload(entries):
diff --git a/openlibrary/tests/core/test_ia.py b/openlibrary/tests/core/test_ia.py
index 6a57a22ec26..13efe3aebc2 100644
--- a/openlibrary/tests/core/test_ia.py
+++ b/openlibrary/tests/core/test_ia.py
@@ -1,35 +1,16 @@
-from __future__ import print_function
from openlibrary.core import ia
-def test_xml2dict():
- assert ia.xml2dict("12") == {"x": "1", "y": "2"}
- assert ia.xml2dict("12", x=[]) == {"x": ["1"], "y": "2"}
-
- assert ia.xml2dict("12") == {"x": "2"}
- assert ia.xml2dict("12", x=[]) == {"x": ["1", "2"]}
-
-def test_get_metaxml(monkeypatch, mock_memcache):
- import StringIO
- import urllib2
-
- metadata_json = None
- def urlopen(url):
- return StringIO.StringIO(metadata_json)
-
- monkeypatch.setattr(urllib2, "urlopen", urlopen)
-
- # test with correct xml
- metadata_json = """{
+def test_get_metadata(monkeypatch, mock_memcache):
+ metadata = {
"metadata": {
"title": "Foo",
"identifier": "foo00bar",
"collection": ["printdisabled", "inlibrary"]
}
}
- """
- print(ia.get_meta_xml("foo00bar"))
- assert ia.get_meta_xml("foo00bar") == {
+ monkeypatch.setattr(ia, '_get_metadata', lambda _id: metadata)
+ assert ia.get_metadata('foo00bar') == {
"title": "Foo",
"identifier": "foo00bar",
"collection": ["printdisabled", "inlibrary"],
@@ -37,6 +18,6 @@ def urlopen(url):
"_filenames": []
}
- # test with metadata errors
- metadata_json = "{}"
- assert ia.get_meta_xml("foo02bar") == {}
+def test_get_metadata_empty(monkeypatch, mock_memcache):
+ monkeypatch.setattr(ia, '_get_metadata', lambda _id: {})
+ assert ia.get_metadata('foo02bar') == {}
diff --git a/scripts/2011/09/generate_deworks.py b/scripts/2011/09/generate_deworks.py
index be6d74e817e..5ee642529e2 100644
--- a/scripts/2011/09/generate_deworks.py
+++ b/scripts/2011/09/generate_deworks.py
@@ -315,7 +315,7 @@ def make_ia_db(editions_dump_file):
print >> f, ocaid
"""
if ocaid:
- metaxml = ia.get_meta_xml(ocaid)
+ metaxml = ia.get_metadata(ocaid)
db[ocaid] = simplejson.dumps(metaxml)
"""
f.close()