Skip to content

Commit

Permalink
Merge pull request #358 from uoregon-libraries/feature/refactor-marc-…
Browse files Browse the repository at this point in the history
…processing

Feature/refactor marc processing
  • Loading branch information
jechols authored Nov 14, 2024
2 parents 055b3d3 + 3be1596 commit 1c205bd
Show file tree
Hide file tree
Showing 7 changed files with 253 additions and 70 deletions.
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ require (
github.com/jessevdk/go-flags v1.4.0
github.com/pressly/goose/v3 v3.13.4
github.com/tidwall/sjson v1.2.5
github.com/uoregon-libraries/gopkg v0.29.0
github.com/uoregon-libraries/gopkg v0.30.2
golang.org/x/crypto v0.10.0
golang.org/x/text v0.11.0
)
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ github.com/uoregon-libraries/gopkg v0.28.0 h1:9JDQ27RSRKSRfuO9wjcNDl3xWTw7p5iMNt
github.com/uoregon-libraries/gopkg v0.28.0/go.mod h1:AQz5Eawxd/FlcIIF1Nan7PVHlxLFSSaF9X+KQhDIvmg=
github.com/uoregon-libraries/gopkg v0.29.0 h1:p4eXvyU+XCQGpz34of+7W8PS8c5Wj21z46HxN+2BQGo=
github.com/uoregon-libraries/gopkg v0.29.0/go.mod h1:AQz5Eawxd/FlcIIF1Nan7PVHlxLFSSaF9X+KQhDIvmg=
github.com/uoregon-libraries/gopkg v0.30.2 h1:PaBywsY0/jZKxX4Qzf6BMNqgs8txlPLLb6MHEnpEnOw=
github.com/uoregon-libraries/gopkg v0.30.2/go.mod h1:AQz5Eawxd/FlcIIF1Nan7PVHlxLFSSaF9X+KQhDIvmg=
golang.org/x/crypto v0.10.0 h1:LKqV2xt9+kDzSTfOhx4FrkEBcMrAgHSYgzywV9zcGmM=
golang.org/x/crypto v0.10.0/go.mod h1:o4eNf7Ede1fv+hwOwZsTHl9EsPFO6q6ZvYR8vYfY45I=
golang.org/x/mod v0.12.0 h1:rmsUpXtvNzj340zd98LZ4KntptpfRHwpFOHG188oHXc=
Expand Down
78 changes: 9 additions & 69 deletions src/cmd/server/internal/titlehandler/marc.go
Original file line number Diff line number Diff line change
@@ -1,41 +1,16 @@
package titlehandler

import (
"encoding/xml"
"fmt"
"io"
"net/http"
"os"
"regexp"
"strings"

"github.com/uoregon-libraries/newspaper-curation-app/src/internal/logger"
"github.com/uoregon-libraries/newspaper-curation-app/src/internal/marc"
)

var marcStripLocRE = regexp.MustCompile(`[ /:,]+$`)

type subfield struct {
Code string `xml:"code,attr"`
Data string `xml:",innerxml"`
}

type datafield struct {
Subfields []subfield `xml:"subfield"`
Ind1 string `xml:"ind1,attr"`
Ind2 string `xml:"ind2,attr"`
Tag string `xml:"tag,attr"`
}

type controlfield struct {
Tag string `xml:"tag,attr"`
Data string `xml:",innerxml"`
}

type marc struct {
Datafields []datafield `xml:"datafield"`
Controlfields []controlfield `xml:"controlfield"`
}

// pullMARCForTitle pulls the MARC record from the library of congress and sets the
// title's data if successful
func pullMARCForTitle(t *Title) {
Expand Down Expand Up @@ -74,60 +49,25 @@ func lookupMARC(t *Title, marcLoc string) error {
} else {
reader, err = getMarcLocal(marcLoc)
}

// An error from the Get call is not a deal-breaker, though we do want to
// report it
if err != nil {
return err
return fmt.Errorf("preparing MARC XML reader: %w", err)
}
defer reader.Close()

var data []byte
data, err = io.ReadAll(reader)
// An error reading the response is also not a deal-breaker, but a bit weirder
if err != nil {
return fmt.Errorf("reading response body: %w", err)
}

var m marc
err = xml.Unmarshal(data, &m)
if err != nil {
return fmt.Errorf("unmarshaling response body: %w", err)
}

for _, df := range m.Datafields {
if df.Tag == "245" {
for _, sf := range df.Subfields {
if sf.Code == "a" {
t.MARCTitle = sf.Data
}
}
}

if df.Tag == "260" || df.Tag == "264" {
for _, sf := range df.Subfields {
if sf.Code == "a" {
t.MARCLocation = marcStripLocRE.ReplaceAllString(sf.Data, "")
}
}
}
}
for _, cf := range m.Controlfields {
if cf.Tag == "008" {
runes := []rune(cf.Data)
t.LangCode3 = string(runes[35:38])
}
}
var m *marc.MARC
m, err = marc.ParseXML(reader)
// ParseXML hands back a nil *MARC alongside any error, and the accessors
// below read through that pointer, so we have to stop here on failure
if err != nil {
	return fmt.Errorf("parsing MARC XML: %w", err)
}
t.MARCTitle = m.Title()
t.MARCLocation = m.Location()
t.LangCode3 = m.Language()
if t.MARCTitle == "" || t.MARCLocation == "" {
return fmt.Errorf("invalid xml response: title and location must not be blank")
return fmt.Errorf("parsing MARC XML: title and location must not be blank")
}

t.ValidLCCN = true

// Hopefully this saves, but if not we're not losing irreplaceable data, so we just log the error and move on
err = t.Save()
if err != nil {
return fmt.Errorf("unable to save title (id %d) after MARC data pull: %w", t.ID, err)
return fmt.Errorf("saving title (id %d) after MARC XML read: %w", t.ID, err)
}

return nil
Expand Down
160 changes: 160 additions & 0 deletions src/internal/marc/marc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
// Package marc has *extremely* rudimentary MARC XML processing for getting at
// a title's name, LCCN, and language code
package marc

import (
"encoding/xml"
"fmt"
"io"
"regexp"
"strings"

"github.com/uoregon-libraries/gopkg/xmlnode"
)

// marcStripLocRE matches a trailing run of spaces, slashes, colons, and
// commas at the very end of a string
var marcStripLocRE = regexp.MustCompile(`[ /:,]+$`)

// subfield is one <subfield> element inside a MARC data field
type subfield struct {
	// Code is the value of the element's "code" attribute
	Code string `xml:"code,attr"`

	// Data is the element's text. It is read as character data rather than
	// inner XML so that escaped entities (e.g., "&amp;") are decoded into the
	// characters they represent instead of being kept as raw markup.
	Data string `xml:",chardata"`
}

// datafield is one <datafield> element: its "tag" attribute, its two
// indicator attributes, and every <subfield> child it contains
type datafield struct {
	Tag       string     `xml:"tag,attr"`
	Ind1      string     `xml:"ind1,attr"`
	Ind2      string     `xml:"ind2,attr"`
	Subfields []subfield `xml:"subfield"`
}

// controlfield is one <controlfield> element, which carries a tag and a
// value but has no subfields
type controlfield struct {
	// Tag is the value of the element's "tag" attribute
	Tag string `xml:"tag,attr"`

	// Data is the element's text. It is read as character data rather than
	// inner XML so an escaped entity is decoded to the single character it
	// represents, which keeps character positions within the value intact.
	Data string `xml:",chardata"`
}

type marcXML struct {
Datafields []datafield `xml:"datafield"`
Controlfields []controlfield `xml:"controlfield"`
}

// MARC wraps a parsed MARC XML record. It keeps the raw parsed structure as
// well as a flat table of field values used for lookups.
type MARC struct {
	// fields maps a control field's tag, or a subfield's "tag$code", to its value
	fields map[string]string

	// raw is the parsed structure this MARC was built from
	raw *marcXML
}

// newMARC builds a MARC from a parsed record by flattening every control
// field and subfield into one lookup table. Control fields are keyed by tag
// alone, subfields by "tag$code". If the same key shows up more than once,
// the value read last is the one that's kept.
func newMARC(raw *marcXML) *MARC {
	var lookup = make(map[string]string)

	for i := range raw.Controlfields {
		lookup[raw.Controlfields[i].Tag] = raw.Controlfields[i].Data
	}

	for i := range raw.Datafields {
		var prefix = raw.Datafields[i].Tag + "$"
		for _, sub := range raw.Datafields[i].Subfields {
			lookup[prefix+sub.Code] = sub.Data
		}
	}

	return &MARC{raw: raw, fields: lookup}
}

// Get looks up a single field value. Control fields, such as "008", have no
// subfields: pass an empty code to request them by tag alone. Data fields are
// made up of subfields, so a code is needed to say which one is wanted, e.g.,
// tag "245" and code "a". An empty string is returned when nothing is stored
// for the requested field.
func (m *MARC) Get(tag, code string) string {
	var key = tag
	if code != "" {
		key += "$" + code
	}

	return m.fields[key]
}

// LCCN returns the value of field 010 $a with every space character removed
func (m *MARC) LCCN() string {
	var raw = m.Get("010", "a")
	return strings.ReplaceAll(raw, " ", "")
}

// Title returns the title built from field 245: the whitespace-trimmed $a
// value, followed by a single space and the whitespace-trimmed $b value when
// $b isn't blank
func (m *MARC) Title() string {
	var main = strings.TrimSpace(m.Get("245", "a"))
	var remainder = strings.TrimSpace(m.Get("245", "b"))

	if remainder == "" {
		return main
	}
	return main + " " + remainder
}

// Location returns the place found in field 264 $a, or in 260 $a when 264 $a
// is empty, with any trailing spaces, slashes, colons, and commas stripped
// off. The fallback is decided before stripping, so a 264 $a made up solely
// of those characters still wins over 260 $a.
func (m *MARC) Location() string {
	for _, tag := range []string{"264", "260"} {
		if place := m.Get(tag, "a"); place != "" {
			return marcStripLocRE.ReplaceAllString(place, "")
		}
	}

	return ""
}

// Language returns the three-character language code held in field 008 at
// zero-based character offsets 35 through 37. Offsets count characters, not
// bytes. An empty string is returned if the field has fewer than 38
// characters.
func (m *MARC) Language() string {
	const start, end = 35, 38

	var chars = []rune(m.Get("008", ""))
	if len(chars) >= end {
		return string(chars[start:end])
	}

	return ""
}

// parse is the low-level XML reader: it produces the raw parsed structure
// without interpreting any of the data in it.
//
// The whole input is read and parsed generically first so the root element's
// name can be checked. A root of "record" is unmarshaled as-is. A root of
// "collection" must have exactly one child node; that child is exported back
// to XML and then unmarshaled as the record. Any other root is an error.
func parse(r io.Reader) (*marcXML, error) {
	var raw, err = io.ReadAll(r)
	if err != nil {
		return nil, fmt.Errorf("reading MARC xml: %w", err)
	}

	var doc xmlnode.Node
	err = xml.Unmarshal(raw, &doc)
	if err != nil {
		return nil, fmt.Errorf("unmarshaling xml into generic structure: %w", err)
	}

	// Figure out which bytes hold the record itself
	var record []byte
	switch rootName := doc.XMLName.Local; rootName {
	case "record":
		record = raw

	case "collection":
		if len(doc.Nodes) == 0 {
			return nil, fmt.Errorf("parsing generic xml: root node has no children")
		}
		if len(doc.Nodes) > 1 {
			return nil, fmt.Errorf("parsing generic xml: root node has too many children")
		}
		record, err = xml.Marshal(doc.Nodes[0])
		if err != nil {
			return nil, fmt.Errorf("parsing generic xml: internal error re-exporting <record>: %w", err)
		}

	default:
		return nil, fmt.Errorf(`unmarshaling xml: root node should be "collection" or "record" (got %q)`, rootName)
	}

	var parsed = new(marcXML)
	err = xml.Unmarshal(record, parsed)
	if err != nil {
		return nil, fmt.Errorf("unmarshaling <record>: %w", err)
	}

	return parsed, nil
}

// ParseXML reads MARC XML from the given [io.Reader] and returns a MARC built
// from it. If the XML can't be read or parsed, the returned MARC is nil and
// the parse error is passed through untouched.
func ParseXML(r io.Reader) (*MARC, error) {
	var record, parseErr = parse(r)
	if parseErr != nil {
		return nil, parseErr
	}

	var m = newMARC(record)
	return m, nil
}
75 changes: 75 additions & 0 deletions src/internal/marc/marc_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
package marc

import (
"os"
"path/filepath"
"testing"
)

// getwd returns the current working directory, aborting the test right away
// if it can't be determined
func getwd(t *testing.T) string {
	dir, err := os.Getwd()
	if err != nil {
		t.Fatalf("Unable to get working directory: %s", err)
	}

	return dir
}

// getFile opens the named file from the "testdata" directory under the
// current working directory, aborting the test if it can't be opened. The
// file is closed automatically when the test (or subtest) that t belongs to
// finishes, so callers don't need to close it themselves.
func getFile(t *testing.T, name string) *os.File {
	// Attribute failures to the calling test's line, not this helper
	t.Helper()

	var f, err = os.Open(filepath.Join(getwd(t), "testdata", name))
	if err != nil {
		// Fatalf stops the test here, so nothing after it needs a return
		t.Fatalf("Unable to read test file %q: %s", name, err)
	}

	// Without this every call would leave an open file handle behind. A close
	// error on a read-only test fixture isn't actionable, so it's ignored.
	t.Cleanup(func() { _ = f.Close() })

	return f
}

// compare records a test error, without stopping the test, when got differs
// from expected. field names the value being checked and is only used in the
// failure message.
func compare(t *testing.T, field, expected, got string) {
	// Report a mismatch at the caller's line so it's clear which check failed
	t.Helper()

	if expected != got {
		t.Errorf("%s should have been %s, got %s", field, expected, got)
	}
}

func TestParseXML(t *testing.T) {
var tests = map[string]struct {
file string
lccn string
title string
location string
language string
}{
"collection-wrapped MARC file": {
file: "2002260445-UnitedAmerican.mrk",
lccn: "2002260445",
title: "The united American : a magazine of good citizenchip.",
location: "Portland, Or.",
language: "eng",
},

"ONI-provided MARC record": {
file: "oni-2024240297-NorthDouglasHerald.xml",
lccn: "2024240297",
title: "North Douglas herald.",
location: "Drain Or",
language: "eng",
},
}

for name, tc := range tests {
t.Run(name, func(t *testing.T) {
var f = getFile(t, tc.file)
var m, err = ParseXML(f)
if err != nil {
t.Fatalf("Unable to parse MARC from %q: %s", tc.file, err)
return
}

compare(t, "LCCN", tc.lccn, m.LCCN())
compare(t, "Title", tc.title, m.Title())
compare(t, "Location", tc.location, m.Location())
compare(t, "Language", tc.language, m.Language())
})
}
}
5 changes: 5 additions & 0 deletions src/internal/marc/testdata/2002260445-UnitedAmerican.mrk
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<?xml version="1.0" encoding="UTF-8" ?><marc:collection xmlns:marc="http://www.loc.gov/MARC21/slim" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd">
<marc:record><marc:leader>01031cas a2200301 a 4500</marc:leader>
<marc:controlfield tag="001">ocm50154090 </marc:controlfield>
<marc:controlfield tag="003">OCoLC</marc:controlfield><marc:controlfield tag="005">20191215042006.0</marc:controlfield><marc:controlfield tag="008">020711d19231927orumr p 0 0eng c</marc:controlfield><marc:datafield tag="010" ind1=" " ind2=" "><marc:subfield code="a"> 2002260445</marc:subfield></marc:datafield><marc:datafield tag="040" ind1=" " ind2=" "><marc:subfield code="a">ORU</marc:subfield><marc:subfield code="b">eng</marc:subfield><marc:subfield code="c">ORU</marc:subfield><marc:subfield code="d">OCLCQ</marc:subfield><marc:subfield code="d">OCLCF</marc:subfield><marc:subfield code="d">OCLCO</marc:subfield></marc:datafield><marc:datafield tag="029" ind1="1" ind2=" "><marc:subfield code="a">AU@</marc:subfield><marc:subfield code="b">000023760614</marc:subfield></marc:datafield><marc:datafield tag="035" ind1=" " ind2=" "><marc:subfield code="a">(OCoLC)50154090</marc:subfield></marc:datafield><marc:datafield tag="042" ind1=" " ind2=" "><marc:subfield code="a">pcc</marc:subfield></marc:datafield><marc:datafield tag="049" ind1=" " ind2=" "><marc:subfield code="a">ORUM</marc:subfield></marc:datafield><marc:datafield tag="130" ind1="0" ind2=" "><marc:subfield code="a">United American (Portland, Or.)</marc:subfield></marc:datafield><marc:datafield tag="245" ind1="1" ind2="4"><marc:subfield code="a">The united American :</marc:subfield><marc:subfield code="b">a magazine of good citizenchip.</marc:subfield></marc:datafield><marc:datafield tag="260" ind1=" " ind2=" "><marc:subfield code="a">Portland, Or. :</marc:subfield><marc:subfield code="b">Northman Pub. Co.,</marc:subfield><marc:subfield code="c">[1923-1927]</marc:subfield></marc:datafield><marc:datafield tag="300" ind1=" " ind2=" "><marc:subfield code="a">5 v. :</marc:subfield><marc:subfield code="b">ill. 
;</marc:subfield><marc:subfield code="c">31 cm.</marc:subfield></marc:datafield><marc:datafield tag="310" ind1=" " ind2=" "><marc:subfield code="a">Monthly</marc:subfield></marc:datafield><marc:datafield tag="362" ind1="0" ind2=" "><marc:subfield code="a">Vol. 1, no. 11 (Aug. 1923)-v. 5, no. 5 (Feb. 1927).</marc:subfield></marc:datafield><marc:datafield tag="500" ind1=" " ind2=" "><marc:subfield code="a">Title from cover.</marc:subfield></marc:datafield><marc:datafield tag="515" ind1=" " ind2=" "><marc:subfield code="a">Vol. 1-v. 5 also called continuous v. 19-continuous v. 22.</marc:subfield></marc:datafield><marc:datafield tag="650" ind1=" " ind2="0"><marc:subfield code="a">Americanization</marc:subfield><marc:subfield code="v">Periodicals.</marc:subfield></marc:datafield><marc:datafield tag="650" ind1=" " ind2="7"><marc:subfield code="a">Americanization.</marc:subfield><marc:subfield code="2">fast</marc:subfield><marc:subfield code="0">(OCoLC)fst00807485</marc:subfield></marc:datafield><marc:datafield tag="655" ind1=" " ind2="7"><marc:subfield code="a">Periodicals.</marc:subfield><marc:subfield code="2">fast</marc:subfield><marc:subfield code="0">(OCoLC)fst01411641</marc:subfield></marc:datafield><marc:datafield tag="780" ind1="0" ind2="0"><marc:subfield code="t">Western American (Portland, Or.)</marc:subfield><marc:subfield code="w">(DLC) 2002260444</marc:subfield><marc:subfield code="w">(OCoLC)50154099</marc:subfield></marc:datafield><marc:datafield tag="994" ind1=" " ind2=" "><marc:subfield code="a">C0</marc:subfield><marc:subfield code="b">ORU</marc:subfield></marc:datafield></marc:record>
</marc:collection>
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<record><leader>01574cas a2200457 i 4500</leader><controlfield tag="001">on1457219712</controlfield><controlfield tag="003">OCoLC</controlfield><controlfield tag="005">20241003040953.0</controlfield><controlfield tag="006">m o d </controlfield><controlfield tag="007">cr |||||||||||</controlfield><controlfield tag="008">240925c20239999orumr noo 0 0eng c</controlfield><datafield ind1=" " ind2=" " tag="010"><subfield code="a"> 2024240297</subfield></datafield><datafield ind1=" " ind2=" " tag="040"><subfield code="a">ORU</subfield><subfield code="b">eng</subfield><subfield code="e">rda</subfield><subfield code="e">pn</subfield><subfield code="c">ORU</subfield><subfield code="d">ORU</subfield></datafield><datafield ind1=" " ind2=" " tag="035"><subfield code="a">(OCoLC)1457219712</subfield></datafield><datafield ind1=" " ind2=" " tag="042"><subfield code="a">pcc</subfield></datafield><datafield ind1=" " ind2=" " tag="043"><subfield code="a">n-us-or</subfield></datafield><datafield ind1=" " ind2=" " tag="049"><subfield code="a">ORUM</subfield></datafield><datafield ind1="0" ind2=" " tag="130"><subfield code="a">North Douglas herald (Drain, Or. 
: 2023)</subfield></datafield><datafield ind1="1" ind2="0" tag="245"><subfield code="a">North Douglas herald.</subfield></datafield><datafield ind1="1" ind2=" " tag="246"><subfield code="a">Herald</subfield></datafield><datafield ind1=" " ind2=" " tag="250"><subfield code="a">North County edition.</subfield></datafield><datafield ind1=" " ind2="1" tag="264"><subfield code="a">Drain Or :</subfield><subfield code="b">North Douglas Herald,</subfield><subfield code="c">2023-</subfield></datafield><datafield ind1=" " ind2=" " tag="300"><subfield code="a">1 online resource</subfield></datafield><datafield ind1=" " ind2=" " tag="310"><subfield code="a">Monthly</subfield></datafield><datafield ind1=" " ind2=" " tag="336"><subfield code="a">text</subfield><subfield code="b">txt</subfield><subfield code="2">rdacontent</subfield></datafield><datafield ind1=" " ind2=" " tag="336"><subfield code="a">still image</subfield><subfield code="b">sti</subfield><subfield code="2">rdacontent</subfield></datafield><datafield ind1=" " ind2=" " tag="337"><subfield code="a">computer</subfield><subfield code="b">c</subfield><subfield code="2">rdamedia</subfield></datafield><datafield ind1=" " ind2=" " tag="338"><subfield code="a">online resource</subfield><subfield code="b">cr</subfield><subfield code="2">rdacarrier</subfield></datafield><datafield ind1="1" ind2=" " tag="362"><subfield code="a">Began with Volume 1 issue 1 (September 1, 2023).</subfield></datafield><datafield ind1=" " ind2=" " tag="500"><subfield code="a">Issues for October 2023 - lack edition statement.</subfield></datafield><datafield ind1="0" ind2=" " tag="588"><subfield code="a">Volume 1 issue 1 (September 1, 2023); title from PDF masthead (publisher's Web site, viewed Sept. 25, 2024).</subfield></datafield><datafield ind1="1" ind2=" " tag="588"><subfield code="a">Vol #2 issue #9 (September 2024) (viewed Sept. 
25, 2024).</subfield></datafield><datafield ind1=" " ind2="0" tag="651"><subfield code="a">Drain (Or.)</subfield><subfield code="v">Newspapers.</subfield></datafield><datafield ind1=" " ind2="0" tag="651"><subfield code="a">Douglas County (Or.)</subfield><subfield code="v">Newspapers.</subfield></datafield><datafield ind1=" " ind2="0" tag="651"><subfield code="a">Lane County (Or.)</subfield><subfield code="v">Newspapers.</subfield></datafield><datafield ind1=" " ind2="7" tag="655"><subfield code="a">Serial publications.</subfield><subfield code="2">lcgft</subfield></datafield><datafield ind1=" " ind2="7" tag="655"><subfield code="a">Newspapers.</subfield><subfield code="2">lcgft</subfield></datafield><datafield ind1=" " ind2=" " tag="752"><subfield code="a">United States</subfield><subfield code="b">Oregon</subfield><subfield code="c">Douglas</subfield><subfield code="d">Drain.</subfield></datafield><datafield ind1="0" ind2="8" tag="776"><subfield code="i">Pint version:</subfield><subfield code="t">North Douglas Herald</subfield></datafield><datafield ind1="4" ind2="0" tag="856"><subfield code="u">https://www.ndherald.com/issues.html</subfield><subfield code="z">Publisher's Web site</subfield></datafield><datafield ind1=" " ind2=" " tag="994"><subfield code="a">C0</subfield><subfield code="b">ORU</subfield></datafield></record>

0 comments on commit 1c205bd

Please sign in to comment.