-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #358 from uoregon-libraries/feature/refactor-marc-…
…processing Feature/refactor marc processing
- Loading branch information
Showing
7 changed files
with
253 additions
and
70 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,160 @@ | ||
// Package marc has *extremely* rudimentary MARC XML processing for getting at
// a title's name, LCCN, location, and language code
package marc | ||
|
||
import ( | ||
"encoding/xml" | ||
"fmt" | ||
"io" | ||
"regexp" | ||
"strings" | ||
|
||
"github.com/uoregon-libraries/gopkg/xmlnode" | ||
) | ||
|
||
// marcStripLocRE matches a trailing run of spaces, slashes, colons, and/or
// commas at the very end of a string
var (
	marcStripLocRE = regexp.MustCompile(`[ /:,]+$`)
)
|
||
// subfield is one <subfield> element of a MARC data field: Code comes from
// the element's "code" attribute and Data from its text
type subfield struct {
	Code string `xml:"code,attr"`

	// Data is read as character data rather than raw inner XML so that escaped
	// entities (e.g., "&amp;") are decoded into the characters they represent
	Data string `xml:",chardata"`
}
|
||
type datafield struct { | ||
Subfields []subfield `xml:"subfield"` | ||
Ind1 string `xml:"ind1,attr"` | ||
Ind2 string `xml:"ind2,attr"` | ||
Tag string `xml:"tag,attr"` | ||
} | ||
|
||
// controlfield is one <controlfield> element: Tag comes from the element's
// "tag" attribute and Data from its text
type controlfield struct {
	Tag string `xml:"tag,attr"`

	// Data is read as character data rather than raw inner XML so that escaped
	// entities (e.g., "&lt;") are decoded into the characters they represent
	Data string `xml:",chardata"`
}
|
||
type marcXML struct { | ||
Datafields []datafield `xml:"datafield"` | ||
Controlfields []controlfield `xml:"controlfield"` | ||
} | ||
|
||
// MARC holds the raw data parsed from an XML source | ||
type MARC struct { | ||
raw *marcXML | ||
fields map[string]string | ||
} | ||
|
||
func newMARC(raw *marcXML) *MARC { | ||
var m = &MARC{raw: raw, fields: make(map[string]string)} | ||
|
||
for _, cf := range raw.Controlfields { | ||
m.fields[cf.Tag] = cf.Data | ||
} | ||
for _, df := range raw.Datafields { | ||
for _, sf := range df.Subfields { | ||
m.fields[df.Tag+"$"+sf.Code] = sf.Data | ||
} | ||
} | ||
|
||
return m | ||
} | ||
|
||
// Get returns the value of the field with the given tag. Control fields, such | ||
// as "008", have no code, and can be requested directly. Data fields have | ||
// subfields, and must include a tag to indicate which subfield, e.g., tag | ||
// "245" and code "a". | ||
func (m *MARC) Get(tag, code string) string { | ||
if code == "" { | ||
return m.fields[tag] | ||
} | ||
return m.fields[tag+"$"+code] | ||
} | ||
|
||
// LCCN returns field 010 $a, stripped of all spaces | ||
func (m *MARC) LCCN() string { | ||
return strings.Replace(m.Get("010", "a"), " ", "", -1) | ||
} | ||
|
||
// Title returns field 245 $a from MARC | ||
func (m *MARC) Title() string { | ||
var a = strings.TrimSpace(m.Get("245", "a")) | ||
var b = strings.TrimSpace(m.Get("245", "b")) | ||
|
||
if b != "" { | ||
return a + " " + b | ||
} | ||
return a | ||
} | ||
|
||
// Location returns the value in field 260 $a or 264 $a, with special | ||
// characters removed. Field 264 is given precedence. | ||
func (m *MARC) Location() string { | ||
var location = m.Get("264", "a") | ||
if location == "" { | ||
location = m.Get("260", "a") | ||
} | ||
|
||
return marcStripLocRE.ReplaceAllString(location, "") | ||
} | ||
|
||
// Language returns the three-character language code from field 008 | ||
func (m *MARC) Language() string { | ||
var lang = []rune(m.Get("008", "")) | ||
if len(lang) < 38 { | ||
return "" | ||
} | ||
|
||
return string(lang[35:38]) | ||
} | ||
|
||
// parse is our low-level XML parser that gets the raw data structure set up, | ||
// but doesn't do any data processing / translating | ||
func parse(r io.Reader) (*marcXML, error) { | ||
var data, err = io.ReadAll(r) | ||
if err != nil { | ||
return nil, fmt.Errorf("reading MARC xml: %w", err) | ||
} | ||
|
||
var mx = new(marcXML) | ||
var root = new(xmlnode.Node) | ||
err = xml.Unmarshal(data, root) | ||
if err != nil { | ||
return nil, fmt.Errorf("unmarshaling xml into generic structure: %w", err) | ||
} | ||
switch root.XMLName.Local { | ||
case "collection": | ||
if len(root.Nodes) == 0 { | ||
return nil, fmt.Errorf("parsing generic xml: root node has no children") | ||
} | ||
if len(root.Nodes) > 1 { | ||
return nil, fmt.Errorf("parsing generic xml: root node has too many children") | ||
} | ||
var data2, err = xml.Marshal(root.Nodes[0]) | ||
if err != nil { | ||
return nil, fmt.Errorf("parsing generic xml: internal error re-exporting <record>: %w", err) | ||
} | ||
err = xml.Unmarshal(data2, mx) | ||
if err != nil { | ||
return nil, fmt.Errorf("unmarshaling <record>: %w", err) | ||
} | ||
|
||
case "record": | ||
err = xml.Unmarshal(data, mx) | ||
if err != nil { | ||
return nil, fmt.Errorf("unmarshaling <record>: %w", err) | ||
} | ||
|
||
default: | ||
return nil, fmt.Errorf(`unmarshaling xml: root node should be "collection" or "record" (got %q)`, root.XMLName.Local) | ||
} | ||
|
||
return mx, nil | ||
} | ||
|
||
// ParseXML returns a new MARC instance from the XML in the given [io.Reader] | ||
func ParseXML(r io.Reader) (*MARC, error) { | ||
var mx, err = parse(r) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
return newMARC(mx), nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
package marc | ||
|
||
import ( | ||
"os" | ||
"path/filepath" | ||
"testing" | ||
) | ||
|
||
// getwd returns the current working directory, failing the test immediately
// if it cannot be determined
func getwd(t *testing.T) string {
	// Report failures at the caller's line rather than inside this helper
	t.Helper()

	var wd, err = os.Getwd()
	if err != nil {
		t.Fatalf("Unable to get working directory: %s", err)
	}

	return wd
}
|
||
func getFile(t *testing.T, name string) *os.File { | ||
var wd = getwd(t) | ||
var f, err = os.Open(filepath.Join(wd, "testdata", name)) | ||
if err != nil { | ||
t.Fatalf("Unable to read test file %q: %s", name, err) | ||
return nil | ||
} | ||
|
||
return f | ||
} | ||
|
||
// compare reports a (non-fatal) test error naming the given field if expected
// and got differ
func compare(t *testing.T, field, expected, got string) {
	// Report failures at the caller's line rather than inside this helper
	t.Helper()

	if expected != got {
		t.Errorf("%s should have been %s, got %s", field, expected, got)
	}
}
|
||
func TestParseXML(t *testing.T) { | ||
var tests = map[string]struct { | ||
file string | ||
lccn string | ||
title string | ||
location string | ||
language string | ||
}{ | ||
"collection-wrapped MARC file": { | ||
file: "2002260445-UnitedAmerican.mrk", | ||
lccn: "2002260445", | ||
title: "The united American : a magazine of good citizenchip.", | ||
location: "Portland, Or.", | ||
language: "eng", | ||
}, | ||
|
||
"ONI-provided MARC record": { | ||
file: "oni-2024240297-NorthDouglasHerald.xml", | ||
lccn: "2024240297", | ||
title: "North Douglas herald.", | ||
location: "Drain Or", | ||
language: "eng", | ||
}, | ||
} | ||
|
||
for name, tc := range tests { | ||
t.Run(name, func(t *testing.T) { | ||
var f = getFile(t, tc.file) | ||
var m, err = ParseXML(f) | ||
if err != nil { | ||
t.Fatalf("Unable to parse MARC from %q: %s", tc.file, err) | ||
return | ||
} | ||
|
||
compare(t, "LCCN", tc.lccn, m.LCCN()) | ||
compare(t, "Title", tc.title, m.Title()) | ||
compare(t, "Location", tc.location, m.Location()) | ||
compare(t, "Language", tc.language, m.Language()) | ||
}) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
<?xml version="1.0" encoding="UTF-8" ?><marc:collection xmlns:marc="http://www.loc.gov/MARC21/slim" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd"> | ||
<marc:record><marc:leader>01031cas a2200301 a 4500</marc:leader> | ||
<marc:controlfield tag="001">ocm50154090 </marc:controlfield> | ||
<marc:controlfield tag="003">OCoLC</marc:controlfield><marc:controlfield tag="005">20191215042006.0</marc:controlfield><marc:controlfield tag="008">020711d19231927orumr p 0 0eng c</marc:controlfield><marc:datafield tag="010" ind1=" " ind2=" "><marc:subfield code="a"> 2002260445</marc:subfield></marc:datafield><marc:datafield tag="040" ind1=" " ind2=" "><marc:subfield code="a">ORU</marc:subfield><marc:subfield code="b">eng</marc:subfield><marc:subfield code="c">ORU</marc:subfield><marc:subfield code="d">OCLCQ</marc:subfield><marc:subfield code="d">OCLCF</marc:subfield><marc:subfield code="d">OCLCO</marc:subfield></marc:datafield><marc:datafield tag="029" ind1="1" ind2=" "><marc:subfield code="a">AU@</marc:subfield><marc:subfield code="b">000023760614</marc:subfield></marc:datafield><marc:datafield tag="035" ind1=" " ind2=" "><marc:subfield code="a">(OCoLC)50154090</marc:subfield></marc:datafield><marc:datafield tag="042" ind1=" " ind2=" "><marc:subfield code="a">pcc</marc:subfield></marc:datafield><marc:datafield tag="049" ind1=" " ind2=" "><marc:subfield code="a">ORUM</marc:subfield></marc:datafield><marc:datafield tag="130" ind1="0" ind2=" "><marc:subfield code="a">United American (Portland, Or.)</marc:subfield></marc:datafield><marc:datafield tag="245" ind1="1" ind2="4"><marc:subfield code="a">The united American :</marc:subfield><marc:subfield code="b">a magazine of good citizenchip.</marc:subfield></marc:datafield><marc:datafield tag="260" ind1=" " ind2=" "><marc:subfield code="a">Portland, Or. :</marc:subfield><marc:subfield code="b">Northman Pub. Co.,</marc:subfield><marc:subfield code="c">[1923-1927]</marc:subfield></marc:datafield><marc:datafield tag="300" ind1=" " ind2=" "><marc:subfield code="a">5 v. :</marc:subfield><marc:subfield code="b">ill. 
;</marc:subfield><marc:subfield code="c">31 cm.</marc:subfield></marc:datafield><marc:datafield tag="310" ind1=" " ind2=" "><marc:subfield code="a">Monthly</marc:subfield></marc:datafield><marc:datafield tag="362" ind1="0" ind2=" "><marc:subfield code="a">Vol. 1, no. 11 (Aug. 1923)-v. 5, no. 5 (Feb. 1927).</marc:subfield></marc:datafield><marc:datafield tag="500" ind1=" " ind2=" "><marc:subfield code="a">Title from cover.</marc:subfield></marc:datafield><marc:datafield tag="515" ind1=" " ind2=" "><marc:subfield code="a">Vol. 1-v. 5 also called continuous v. 19-continuous v. 22.</marc:subfield></marc:datafield><marc:datafield tag="650" ind1=" " ind2="0"><marc:subfield code="a">Americanization</marc:subfield><marc:subfield code="v">Periodicals.</marc:subfield></marc:datafield><marc:datafield tag="650" ind1=" " ind2="7"><marc:subfield code="a">Americanization.</marc:subfield><marc:subfield code="2">fast</marc:subfield><marc:subfield code="0">(OCoLC)fst00807485</marc:subfield></marc:datafield><marc:datafield tag="655" ind1=" " ind2="7"><marc:subfield code="a">Periodicals.</marc:subfield><marc:subfield code="2">fast</marc:subfield><marc:subfield code="0">(OCoLC)fst01411641</marc:subfield></marc:datafield><marc:datafield tag="780" ind1="0" ind2="0"><marc:subfield code="t">Western American (Portland, Or.)</marc:subfield><marc:subfield code="w">(DLC) 2002260444</marc:subfield><marc:subfield code="w">(OCoLC)50154099</marc:subfield></marc:datafield><marc:datafield tag="994" ind1=" " ind2=" "><marc:subfield code="a">C0</marc:subfield><marc:subfield code="b">ORU</marc:subfield></marc:datafield></marc:record> | ||
</marc:collection> |
1 change: 1 addition & 0 deletions
1
src/internal/marc/testdata/oni-2024240297-NorthDouglasHerald.xml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
<record><leader>01574cas a2200457 i 4500</leader><controlfield tag="001">on1457219712</controlfield><controlfield tag="003">OCoLC</controlfield><controlfield tag="005">20241003040953.0</controlfield><controlfield tag="006">m o d </controlfield><controlfield tag="007">cr |||||||||||</controlfield><controlfield tag="008">240925c20239999orumr noo 0 0eng c</controlfield><datafield ind1=" " ind2=" " tag="010"><subfield code="a"> 2024240297</subfield></datafield><datafield ind1=" " ind2=" " tag="040"><subfield code="a">ORU</subfield><subfield code="b">eng</subfield><subfield code="e">rda</subfield><subfield code="e">pn</subfield><subfield code="c">ORU</subfield><subfield code="d">ORU</subfield></datafield><datafield ind1=" " ind2=" " tag="035"><subfield code="a">(OCoLC)1457219712</subfield></datafield><datafield ind1=" " ind2=" " tag="042"><subfield code="a">pcc</subfield></datafield><datafield ind1=" " ind2=" " tag="043"><subfield code="a">n-us-or</subfield></datafield><datafield ind1=" " ind2=" " tag="049"><subfield code="a">ORUM</subfield></datafield><datafield ind1="0" ind2=" " tag="130"><subfield code="a">North Douglas herald (Drain, Or. 
: 2023)</subfield></datafield><datafield ind1="1" ind2="0" tag="245"><subfield code="a">North Douglas herald.</subfield></datafield><datafield ind1="1" ind2=" " tag="246"><subfield code="a">Herald</subfield></datafield><datafield ind1=" " ind2=" " tag="250"><subfield code="a">North County edition.</subfield></datafield><datafield ind1=" " ind2="1" tag="264"><subfield code="a">Drain Or :</subfield><subfield code="b">North Douglas Herald,</subfield><subfield code="c">2023-</subfield></datafield><datafield ind1=" " ind2=" " tag="300"><subfield code="a">1 online resource</subfield></datafield><datafield ind1=" " ind2=" " tag="310"><subfield code="a">Monthly</subfield></datafield><datafield ind1=" " ind2=" " tag="336"><subfield code="a">text</subfield><subfield code="b">txt</subfield><subfield code="2">rdacontent</subfield></datafield><datafield ind1=" " ind2=" " tag="336"><subfield code="a">still image</subfield><subfield code="b">sti</subfield><subfield code="2">rdacontent</subfield></datafield><datafield ind1=" " ind2=" " tag="337"><subfield code="a">computer</subfield><subfield code="b">c</subfield><subfield code="2">rdamedia</subfield></datafield><datafield ind1=" " ind2=" " tag="338"><subfield code="a">online resource</subfield><subfield code="b">cr</subfield><subfield code="2">rdacarrier</subfield></datafield><datafield ind1="1" ind2=" " tag="362"><subfield code="a">Began with Volume 1 issue 1 (September 1, 2023).</subfield></datafield><datafield ind1=" " ind2=" " tag="500"><subfield code="a">Issues for October 2023 - lack edition statement.</subfield></datafield><datafield ind1="0" ind2=" " tag="588"><subfield code="a">Volume 1 issue 1 (September 1, 2023); title from PDF masthead (publisher's Web site, viewed Sept. 25, 2024).</subfield></datafield><datafield ind1="1" ind2=" " tag="588"><subfield code="a">Vol #2 issue #9 (September 2024) (viewed Sept. 
25, 2024).</subfield></datafield><datafield ind1=" " ind2="0" tag="651"><subfield code="a">Drain (Or.)</subfield><subfield code="v">Newspapers.</subfield></datafield><datafield ind1=" " ind2="0" tag="651"><subfield code="a">Douglas County (Or.)</subfield><subfield code="v">Newspapers.</subfield></datafield><datafield ind1=" " ind2="0" tag="651"><subfield code="a">Lane County (Or.)</subfield><subfield code="v">Newspapers.</subfield></datafield><datafield ind1=" " ind2="7" tag="655"><subfield code="a">Serial publications.</subfield><subfield code="2">lcgft</subfield></datafield><datafield ind1=" " ind2="7" tag="655"><subfield code="a">Newspapers.</subfield><subfield code="2">lcgft</subfield></datafield><datafield ind1=" " ind2=" " tag="752"><subfield code="a">United States</subfield><subfield code="b">Oregon</subfield><subfield code="c">Douglas</subfield><subfield code="d">Drain.</subfield></datafield><datafield ind1="0" ind2="8" tag="776"><subfield code="i">Pint version:</subfield><subfield code="t">North Douglas Herald</subfield></datafield><datafield ind1="4" ind2="0" tag="856"><subfield code="u">https://www.ndherald.com/issues.html</subfield><subfield code="z">Publisher's Web site</subfield></datafield><datafield ind1=" " ind2=" " tag="994"><subfield code="a">C0</subfield><subfield code="b">ORU</subfield></datafield></record> |