Skip to content

Commit

Permalink
Merge branch 'refactor'
Browse files Browse the repository at this point in the history
  • Loading branch information
cwulfman committed Sep 28, 2022
2 parents 847fba2 + 14881f1 commit 2c55d94
Show file tree
Hide file tree
Showing 15 changed files with 584 additions and 409 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -130,3 +130,4 @@ dmypy.json
.pyre/

/schemas
*.*~
1 change: 1 addition & 0 deletions adam/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ Sphinx = "^4.4.0"
tox = "^3.24.5"

[tool.poetry.scripts]
create_graph = "adam.create_graph:run"
analyze_manifests = "adam.cli:run"
analyze_manifest = "adam.analyze_manifest:run"
download_pages = "adam.download_pages:run"
Expand Down
22 changes: 9 additions & 13 deletions adam/src/adam/analyze_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,28 +3,19 @@
from os import defpath
import sys
from pathlib import Path
from adam.manifest import Manifest
from rdflib import container
import spacy
from adam.container import Container


_logger = logging.getLogger(__name__)

_default_output_dir = Path("/tmp/adam")
_default_output_dir.mkdir(parents=True, exist_ok=True)


def analyze_manifest(
    manifest_url, nlp, out_dir=_default_output_dir, image_dir=_default_output_dir
):
    """Build a Container for a manifest and dump it to disk.

    Args:
        manifest_url: URL of the manifest, passed to ``Container``.
        nlp: NLP pipeline object, passed to ``Container`` unchanged.
        out_dir: Directory handed to ``Container.dump``; defaults to the
            module-level ``_default_output_dir``.
        image_dir: Third positional argument to ``Container``; defaults to
            the module-level ``_default_output_dir``.
    """
    container = Container(manifest_url, nlp, image_dir)
    container.dump(out_dir)


# CLI
def parse_args(args):
parser = argparse.ArgumentParser()
parser.add_argument("manifest_url", help="url of manifest")
parser.add_argument("nlp", help="name of spacy model to use")
parser.add_argument("basedir", help="directory to save into")
parser.add_argument(
"-v",
"--verbose",
Expand Down Expand Up @@ -58,10 +49,15 @@ def setup_logging(loglevel):


def main(args):
_logger.info("Script starts here")
args = parse_args(args)
setup_logging(args.loglevel)
manifest = Manifest(args.manifest_url)
nlp = spacy.load(args.nlp)
analyze_manifest(args.manifest_url, nlp, _default_output_dir, _default_output_dir)
container = Container(manifest, nlp)
base_dir = Path(args.basedir)
base_dir.mkdir(parents=True, exist_ok=True)
container.dump(base_dir)
_logger.info("Script ends here")


Expand Down
122 changes: 31 additions & 91 deletions adam/src/adam/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,128 +12,68 @@
import spacy
from adam.page import Page
from adam.graphable import Graphable
from adam.manifest import Manifest


class Container(Graphable):
"""
The Container class.
The Container Class
"""

def __init__(self, manifest_uri, nlp=None, cache_dir_root=None):
def __init__(self, manifest_object, nlp=None):
super().__init__()
self._manifest_uri = manifest_uri
self._manifest = None
self._pages = []
self._manifest = manifest_object
self._nlp = nlp
self._cache_dir_root = cache_dir_root
self._pages = None

@property
def cache_dir(self):
if self._cache_dir_root:
return Path(self._cache_dir_root) / Path(self.container_label)
else:
return None

@property
def _id(self):
def id(self):
"""returns the uuid portion of the manifest @id"""
return self.manifest['@id'].split('/')[-2]
return self._manifest.id

@property
def metadata(self):
"""returns a dict of metadata from the manifest"""
metadata = {}
for item in self.manifest['metadata']:
metadata[item['label']] = item['value']
return metadata
return self._manifest.metadata

@property
def manifest(self):
"""returns a manifest, loading it if necessary"""
if not self._manifest:
self.load_manifest()
return self._manifest
def pages(self):
if self._pages is None:
self.generate_pages()
return self._pages

@property
def nlp(self):
"""Returns a spaCy pipeline, creating it if it does not exist"""
# When no pipeline is stored yet (self._nlp is falsy), load the
# "en_core_web_lg" model and keep it for later calls.
if not self._nlp:
self._nlp = spacy.load("en_core_web_lg")
return self._nlp

@property
def pages(self):
"""Returns the collection of Page objects,
creating them if necessary"""
if not self._pages:
self.generate_pages()
return self._pages
# Setter for the pipeline: stores `value` when it differs from the current
# one and assigns the same pipeline to each page in self.pages.
# NOTE(review): indentation is lost in this listing; the page loop presumably
# sits inside the `if`, so nothing happens when the value is unchanged - confirm.
@nlp.setter
def nlp(self, value):
if self._nlp != value:
self._nlp = value
for page in self.pages:
page.nlp = value

@property
def container_label(self):
if 'Container' in self.metadata.keys():
string_label = self.metadata['Container'][0]
def label(self):
if "Container" in self._manifest.metadata.keys():
string_label = self._manifest.metadata["Container"][0]
else:
string_label = self._id
return re.sub(r"[,. ]", "_", string_label)
string_label = self.id

def load_manifest(self):
logging.info("downloading manifest |%s|" % self._manifest_uri)
try:
with urllib.request.urlopen(self._manifest_uri) as response:
self._manifest = json.loads(response.read())
except urllib.error.HTTPError as error:
uri = self._manifest_uri
msg = f"couldn't download from {uri}"
logging.exception(msg, error)
return re.sub(r"[,. ]", "_", string_label)

def generate_pages(self):
"""Iterates over page images and creates Page objects for each"""
if "sequences" in self.manifest.keys():
for canvas in self.manifest['sequences'][0]['canvases']:
page = Page(canvas, self.nlp, self.metadata, self.cache_dir)
self._pages.append(page)
else:
logging.debug("no sequences found")

def download_pages(self):
"""
Downloads all page images
"""
for page in self.pages:
page.download_image()



def build_graph(self):
"""
Constructs a graph from all the pages.
"""
graph = self.graph
for page in self.pages:
page.build_graph()
graph += page.graph

def export(self, target_dir_name, fmt="txt"):
target_dir = Path(target_dir_name) / Path(self.container_label)
target_dir.mkdir(parents=True, exist_ok=True)
for page in self.pages:
file_name = str(page.id).rsplit('/', maxsplit=1)[-1] + '.' + fmt
page.export(target_dir / file_name, fmt)
self._pages = [Page(canvas) for canvas in self._manifest.canvases]

def dump(self, target_dir_name):
"""
Serializes the container in all formats:
plain text, csv, jsonl, and rdf
"""

for fmt in ['txt', 'csv', 'jsonl']:
logging.info("exporting format %s" % fmt)
self.export(target_dir_name, fmt)

target_dir = Path(target_dir_name) / Path(self.container_label)
target_dir.mkdir(parents=True, exist_ok=True)
self.build_graph()
logging.info("serializing RDF")
rdf_file_name = self.container_label + '.' + 'ttl'
self.serialize(target_dir / rdf_file_name)
base_dir = Path(target_dir_name) / Path(self.id)
base_dir.mkdir(parents=True, exist_ok=True)
for page in self.pages:
page.export_as_txt(base_dir)
page.export_as_csv(base_dir)
page.export_as_jsonl(base_dir)
page.export_as_rdf(base_dir)
59 changes: 18 additions & 41 deletions adam/src/adam/create_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,33 +6,19 @@
import json
import logging
import sys
from urllib.request import urlopen
from pathlib import Path
from adam.manifest import Manifest
from adam.container import Container
from rdflib import container

_logger = logging.getLogger(__name__)

def manifest_id(manifest_url):
    """Return the id from a manifest URI.

    The id is the second-to-last ``/``-separated segment of the URI.

    Given:
    https://figgy.princeton.edu/concern/scanned_resources/2a701cb1-33d4-4112-bf5d-65123e8aa8e7/manifest
    Return: 2a701cb1-33d4-4112-bf5d-65123e8aa8e7
    """
    return manifest_url.split('/')[-2]

def output_graph(manifest_url, outfile):
    """Download a manifest, build its graph, and serialize it.

    Args:
        manifest_url: URL opened with ``urlopen``; the response body is
            parsed as JSON.
        outfile: Destination passed straight to ``Container.serialize``.
    """
    # The response is closed as soon as the JSON has been read.
    with urlopen(manifest_url) as response:
        manifest = json.loads(response.read())

    container = Container(manifest)
    container.build_graph()
    container.serialize(outfile)

def parse_args(args):
"""Parse command line parameters"""
parser = argparse.ArgumentParser(description="Produce a graph from a manifest")
parser.add_argument(dest="url", help="URL of the manifest")
parser.add_argument("url", help="URL of the manifest")
parser.add_argument("outdir", help="output directory")
parser.add_argument(
"-v",
"--verbose",
Expand All @@ -49,41 +35,32 @@ def parse_args(args):
action="store_const",
const=logging.DEBUG,
)
parser.add_argument('-o', '--outfile',
dest="outfile",
default=sys.stdout.buffer,
help="output file (stdout by default)")

return parser.parse_args(args)


def setup_logging(log_level):
log_format = "[%(asctime)s] %(levelname)s:%(name)s:%(message)s"
logging.basicConfig(
level=log_level, stream=sys.stdout, format=log_format, datefmt="%Y-%m-%d %H:%M:%S"
)
level=log_level,
stream=sys.stdout,
format=log_format,
datefmt="%Y-%m-%d %H:%M:%S",
)


def main(args):
args = parse_args(args)
setup_logging(args.loglevel)
output_graph(args.url, args.outfile)
container = Container(Manifest(args.url))
outdir = Path(args.outdir)
outdir.mkdir(parents=True, exist_ok=True)
for page in container.pages:
page.export_as_rdf(outdir)

def run():
"""Calls :func:`main` passing the CLI arguments extracted from :obj:`sys.argv`

This function can be used as entry point to create console scripts with setuptools.
"""
def run():
main(sys.argv[1:])


if __name__ == "__main__":
# ^ This is a guard statement that will prevent the following code from
# being executed in the case someone imports this file instead of
# executing it as a script.
# https://docs.python.org/3/library/__main__.html

# After installing your project with pip, users can also run your Python
# modules as scripts via the ``-m`` flag, as defined in PEP 338::
#
# python -m foo.skeleton 42
#
run()
41 changes: 26 additions & 15 deletions adam/src/adam/graphable.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,41 +2,50 @@
The Graphable Class
"""

import pathlib
from sys import stdout
from typing import Union, IO, TextIO
from rdflib import Graph, Namespace
from shortuuid import uuid


class Graphable:
""" The Graphable class holds info about ontologies"""
"""The Graphable class holds info about ontologies"""

def __init__(self):
"""Initializes a Graphable.
"""Initializes a Graphable.
Sets up namespaces and establishes an id.
"""
self._graph = Graph()
self._graph = None
self._namespaces = {
"ecrm": Namespace("http://erlangen-crm.org/200717/"),
"sc": Namespace("http://iiif.io/api/presentation/2#"),
"page": Namespace("https://figgy.princeton.edu/concerns/pages/"),
"actor": Namespace("https://figgy.princeton.edu/concerns/actors/"),
"appellation": Namespace("https://figgy.princeton.edu/concerns/appellations/"),
"appellation": Namespace(
"https://figgy.princeton.edu/concerns/appellations/"
),
"entity": Namespace("https://figgy.princeton.edu/concerns/entities/"),
"inscription": Namespace("https://figgy.princeton.edu/concerns/inscriptions/"),
"etype": Namespace("https://figgy.princeton.edu/concerns/adam/")
"inscription": Namespace(
"https://figgy.princeton.edu/concerns/inscriptions/"
),
"etype": Namespace("https://figgy.princeton.edu/concerns/adam/"),
}

manager = self._graph.namespace_manager

for prefix, namespace in self._namespaces.items():
manager.bind(prefix, namespace)

@property
def id(self):
return self._id
# @property
# def graph_id(self):
# return self._id

# Accessor for the graph, which is created on demand rather than in __init__.
# NOTE(review): indentation is lost in this listing; everything down to
# build_graph() presumably sits inside the `is None` branch - confirm.
@property
def graph(self):
# No graph yet: create one and bind each prefix from self._namespaces.
if self._graph is None:
self._graph = Graph()
manager = self._graph.namespace_manager

for prefix, namespace in self._namespaces.items():
manager.bind(prefix, namespace)

# self._graph is already assigned at this point, so the `is None` test
# is false if build_graph() reads self.graph.
self.build_graph()
return self._graph

def namespace(self, key):
Expand All @@ -49,5 +58,7 @@ def build_graph(self):
"""Does nothing in the base class; intended to be implemented by each subclass"""
pass

def serialize(self, path=stdout, fmt='ttl'):
def serialize(
self, path: Union[str, pathlib.PurePath, IO[bytes]], fmt: str = "ttl"
):
self.graph.serialize(destination=path, format=fmt)
Loading

0 comments on commit 2c55d94

Please sign in to comment.