Skip to content

Commit

Permalink
Merge pull request #187 from rice-crc/develop
Browse files Browse the repository at this point in the history
Merge develop into main
  • Loading branch information
derekjkeller authored Nov 21, 2023
2 parents 48ca222 + ff8ec33 commit 9b46a44
Show file tree
Hide file tree
Showing 41 changed files with 65,677 additions and 319 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@ networks/tmp/
*.sql
dbconf.json


django/document/management/commands/BL_Sources/

django/document/management/commands/*.txt
django/document/management/commands/*.csv
django/document/management/commands/*.json

tmp/

Expand Down
2 changes: 1 addition & 1 deletion api/common/static/Enslaved_options.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion api/common/static/Enslaved_options.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion api/common/static/Enslaver_options.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion api/common/static/Enslaver_options.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion api/common/static/Source_options.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion api/common/static/Source_options.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion api/common/static/Voyage_options.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion api/common/static/Voyage_options.py

Large diffs are not rendered by default.

20 changes: 10 additions & 10 deletions api/document/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,16 @@
from past.models import *
from voyage.models import *

# class PageAdmin(admin.ModelAdmin):
# readonly_fields=['page_url','image_filename','iiif_manifest_url','iiif_baseimage_url']
# search_fields=['page_url','image_filename']
# list_display=['page_url','image_filename']
# model=Page
class PageAdmin(admin.ModelAdmin):
    """Admin configuration for ``Page`` records."""

    model = Page
    # Columns shown in the admin changelist.
    list_display = ['page_url', 'image_filename']
    # Fields matched by the admin search box.
    search_fields = ['page_url', 'image_filename']
    # Displayed on the change form but not editable there.
    readonly_fields = [
        'page_url',
        'image_filename',
        'iiif_manifest_url',
        'iiif_baseimage_url',
    ]

class ShortRefAdmin(admin.ModelAdmin):
    """Admin configuration for ``ShortRef`` records.

    Both the changelist columns and the search box use the ``name`` and
    ``transkribus_docId`` fields.
    """

    model = ShortRef
    # Each attribute is assigned exactly once; an earlier ('name',)-only
    # assignment was immediately overwritten and has been dropped.
    search_fields = ('name', 'transkribus_docId')
    list_display = ('name', 'transkribus_docId')

class SourceEnslavedConnectionInline(admin.StackedInline):
model=SourceEnslavedConnection
Expand Down Expand Up @@ -59,12 +59,12 @@ class SourceAdmin(admin.ModelAdmin):
SourceVoyageConnectionInline,
SourceEnslavementRelationConnectionInline
]
search_fields=['title','zotero_item_id']
search_fields=['title','zotero_item_id','short_ref__name']
autocomplete_fields=['short_ref','date']
readonly_fields=['item_url','zotero_item_id','zotero_group_id']
list_display=('title','short_ref','zotero_item_id')
list_display=('title','short_ref','zotero_item_id','human_reviewed')

# Register each document model with its ModelAdmin class on the default admin site.
admin.site.register(Source, SourceAdmin)
admin.site.register(ShortRef,ShortRefAdmin)
admin.site.register(DocSparseDate,DocSparseDateAdmin)
# Page is registered with the PageAdmin class defined in this module.
admin.site.register(Page,PageAdmin)
324 changes: 324 additions & 0 deletions api/document/management/commands/BL_IMPORT.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,324 @@
import csv
import re
from pyzotero import zotero
from django.core.management.base import BaseCommand, CommandError
from document.models import *
from past.models import *
from voyage.models import *
from voyages3.settings import *
import requests
import json
import os
import time

class Command(BaseCommand):
    """Import British Library "Add MS" volumes from segmentation CSVs.

    Every ``*.csv`` file in ``document/management/commands/BL_Sources`` is
    read row by row. Consecutive rows with a non-empty ``Type`` column are
    accumulated into one sub-document; a row whose ``Type`` is empty closes
    the current sub-document. For each sub-document the command creates a
    Zotero item and the matching ``ShortRef`` / ``Page`` / ``DocSparseDate`` /
    ``Source`` rows, plus the page and voyage connections.
    """

    help = 'imports BL volumes from dd41 student segmentations -- purpose-built'

    # Doctype values (compared after ``.strip()``) that use Zotero's "letter"
    # template; everything else uses the "manuscript" template. The entries
    # must not carry surrounding whitespace or they can never match.
    _LETTER_DOCTYPES = frozenset({
        'Copy of Correspondence',
        'Correspondence',
        'Court Correspondence',
        'Court of Directors Correspondence',
        'Letter',
        'Letters of Correspondence',
    })
    # "dd, mm, yyyy" at the start of the Date column.
    _DATE_RE = re.compile(r'([0-9]+),\s*([0-9]+),\s*([0-9]+)')
    # Pause between attempts when the Zotero call raises.
    _ZOTERO_RETRY_SECONDS = 10

    def handle(self, *args, **options):
        """Import every CSV found in the BL_Sources directory.

        Raises:
            CommandError: when a CSV filename has no ``MS <number>`` part, a
                page label cannot be parsed, or Zotero returns an unexpected
                response.
        """
        basepath = "document/management/commands/BL_Sources"
        csvpaths = [i for i in os.listdir(basepath) if i.endswith('.csv')]

        library_id = '5288953'
        grouplibrary_name = 'sv_british_library'
        library_type = zotero_credentials['library_type']
        api_key = zotero_credentials['api_key']
        zot = zotero.Zotero(library_id, library_type, api_key)

        ms_template = zot.item_template('manuscript')
        letter_template = zot.item_template('letter')

        for csvpath in csvpaths:
            fpath = os.path.join(basepath, csvpath)

            ms_match = re.search(r"(?<=MS\s)[0-9]+", fpath)
            if ms_match is None:
                raise CommandError(
                    "cannot find an 'MS <number>' id in filename %s" % fpath
                )
            ms_id = int(ms_match.group(0))
            print(ms_id)

            doccount = 0
            thisdoc = {}

            def flush():
                # Creates the accumulated sub-document; the 1-based count
                # doubles as its order within the volume's short ref.
                self._create_source(
                    zot, ms_template, letter_template, grouplibrary_name,
                    ms_id, thisdoc, doccount,
                )

            with open(fpath, 'r', encoding='ISO-8859-1') as csvfile:
                for row in csv.DictReader(csvfile):
                    if row['Type'] == '':
                        # A blank Type row separates sub-documents.
                        if thisdoc:
                            doccount += 1
                            flush()
                            thisdoc = {}
                    else:
                        self._accumulate_row(thisdoc, row)

            # A file that does not end with a blank-Type row would otherwise
            # silently lose its final sub-document.
            if thisdoc:
                doccount += 1
                flush()
                thisdoc = {}

            print("%d sub-documents" % doccount)

    def _parse_date(self, raw):
        """Return ``[yyyy, mm, dd]`` parsed from a "dd, mm, yyyy" cell, else None.

        One- and two-digit years are expanded into the 1700s
        (``"5"`` -> 1705, ``"13"`` -> 1713).
        """
        match = self._DATE_RE.match(raw)
        if match is None:
            return None
        dd, mm, yyyy = match.groups()
        if len(yyyy) == 2:
            yyyy = "17" + yyyy
        elif len(yyyy) == 1:
            yyyy = "170" + yyyy
        return [int(yyyy), int(mm), int(dd)]

    def _accumulate_row(self, thisdoc, row):
        """Fold one non-separator CSV row into the sub-document dict ``thisdoc``.

        The last seen date, doctype and UV URL win; page labels, notes and
        voyage ids accumulate across the rows of the sub-document.
        """
        date = self._parse_date(row['Date'])
        if date is not None:
            thisdoc['date'] = date

        thisdoc.setdefault('pagelabels', []).append(row['Page Label'])
        thisdoc['doctype'] = row['Type']

        # The sheets name this column either "Note" or "Notes".
        note = row['Note'] if 'Note' in row else row.get('Notes')
        if note:
            if "notes" in thisdoc:
                thisdoc["notes"] += "\n%s" % note
            else:
                thisdoc["notes"] = note

        if 'VID' in row:
            voyage_ids = [int(vid) for vid in re.findall(r"[0-9]+", row["VID"])]
            if voyage_ids:
                thisdoc.setdefault('voyage_ids', []).extend(voyage_ids)

        uv_url = row['UV URL']
        if uv_url is not None:
            thisdoc["uv_url"] = uv_url

    def _page_for_label(self, ms_id, pagelabel, pagelabels):
        """Return the ``Page`` for one page label, creating it if needed.

        The image filename is built as
        ``add_ms_<ms_id>_fse<3-digit page number><r|v>.jpg``.

        Raises:
            CommandError: if the label has no number or no ``r``/``v`` marker,
                or the page number has more than three digits.
        """
        label_text = pagelabel or ''
        number_match = re.search(r'[0-9]+', label_text)
        side_match = re.search(r'[rv]', label_text)
        if number_match is None or side_match is None:
            raise CommandError("bad page label? %s" % (pagelabels,))

        # int() drops leading zeroes before the number is re-padded.
        pagenumber = str(int(number_match.group(0)))
        if len(pagenumber) > 3:
            raise CommandError(
                "bad page number %s in %s" % (pagenumber, pagelabels)
            )

        image_filename = "add_ms_%d_fse%s%s.jpg" % (
            ms_id, pagenumber.zfill(3), side_match.group(0)
        )
        page, _ = Page.objects.get_or_create(
            image_filename=image_filename,
            defaults={'is_british_library': True},
        )
        return page

    def _create_zotero_item(self, zot, docdict):
        """Create ``docdict`` in Zotero and return the new item's API URL.

        The call is retried indefinitely, pausing between attempts, because
        the original import treated Zotero failures as transient. Each failure
        is printed so a permanent error is visible instead of a silent hang.

        Raises:
            CommandError: if the response does not contain the created item.
        """
        while True:
            try:
                resp = zot.create_items([docdict])
                break
            except Exception as err:
                print(
                    "zotero create_items failed (%r); retrying in %d seconds"
                    % (err, self._ZOTERO_RETRY_SECONDS)
                )
                time.sleep(self._ZOTERO_RETRY_SECONDS)

        try:
            zotero_url = resp['successful']['0']['links']['self']['href']
        except (KeyError, IndexError, TypeError) as err:
            raise CommandError("error with zotero call: %s" % (resp,)) from err
        print(zotero_url)
        return zotero_url

    def _create_source(self, zot, ms_template, letter_template,
                       grouplibrary_name, ms_id, thisdoc, order):
        """Create the Zotero item and database rows for one sub-document.

        Args:
            zot: the pyzotero client used to create the item.
            ms_template: Zotero "manuscript" item template.
            letter_template: Zotero "letter" item template.
            grouplibrary_name: stored on the ``Source`` row.
            ms_id: numeric "Add MS" id taken from the CSV filename.
            thisdoc: accumulated sub-document (see ``_accumulate_row``).
            order: 1-based position of the sub-document within its volume.
        """
        call_number = "Additional Manuscripts %d" % ms_id
        short_title = "SSC Add MS %d" % ms_id

        doctype = thisdoc['doctype'].strip()
        is_letter = doctype in self._LETTER_DOCTYPES
        # Copy the template so per-document fields do not leak between items.
        docdict = dict(letter_template if is_letter else ms_template)

        if doctype.lower() == "meeting minutes":
            thisdoc['doctype'] = "Minutes of the Meeting of the Court of Directors of the South Sea Company"
        # per dd41 -- no dates in the titles.
        title = thisdoc['doctype']

        doc_date = thisdoc.get('date')
        if doc_date is not None:
            docdict["date"] = '/'.join(str(part) for part in doc_date)
        else:
            docdict["date"] = "No Date"

        pagelabels = thisdoc['pagelabels']
        pagelabels_str = '; '.join(pagelabels)

        docdict["archiveLocation"] = "%s, %s" % (call_number, pagelabels_str)
        docdict['url'] = thisdoc['uv_url']
        docdict['shortTitle'] = short_title
        docdict['archive'] = 'British Library'
        if is_letter:
            docdict['extra'] = "Place: London"
        else:
            docdict['numPages'] = len(pagelabels)
            docdict['place'] = "London"
        # The Zotero title carries the page labels; the Source title does not.
        docdict['title'] = '%s: %s' % (title, pagelabels_str)
        docdict['language'] = 'en-UK'

        shortref, _ = ShortRef.objects.get_or_create(name=short_title)

        pages = [
            self._page_for_label(ms_id, pagelabel, pagelabels)
            for pagelabel in pagelabels
        ]

        zotero_url = self._create_zotero_item(zot, docdict)
        group_id = re.search(r"(?<=groups/)[0-9]+", zotero_url).group(0)
        item_id = re.search(r"(?<=items/)[A-Z0-9]+", zotero_url).group(0)

        if doc_date is not None:
            yyyy, mm, dd = doc_date
            docsparsedate = DocSparseDate.objects.create(
                month=mm,
                day=dd,
                year=yyyy,
            )
        else:
            docsparsedate = None

        source = Source.objects.create(
            short_ref=shortref,
            zotero_group_id=group_id,
            zotero_item_id=item_id,
            zotero_grouplibrary_name=grouplibrary_name,
            title=title,
            date=docsparsedate,
            notes=thisdoc.get('notes'),
            order_in_shortref=order,
            is_british_library=True,
        )

        for page_order, page in enumerate(pages, start=1):
            SourcePageConnection.objects.create(
                source=source,
                page=page,
                order=page_order,
            )

        voyage_ids = thisdoc.get('voyage_ids')
        if voyage_ids is not None:
            print(voyage_ids)
            for vid in voyage_ids:
                try:
                    voyage = Voyage.objects.get(voyage_id=vid)
                except Voyage.DoesNotExist:
                    # Unknown voyage ids are reported and skipped.
                    print("voyage %d does not exist" % vid, thisdoc)
                    continue
                SourceVoyageConnection.objects.create(
                    source=source,
                    voyage=voyage,
                )


# made a column named "skip" and flag goofy entries to ignore them
# cleared out rows that seem to be sitting between separate docs
# merged some data into the title column, such as in 25563, where there are details on some of the letters
# tried to split some blocks like on 25561, where we see numerous notes within a block like "same date as previous entry, but different entry"
#
# If there is a column like "VID" then we want to tag that voyage when we create the source!! -- see MS 25575
#
#
# **** 25560 HAD THE HEADER NOTE: "Henry Kelsall, Under-Secretary to the Treasury:
# Letters to, from the South Sea Company: 1721-1730."
#
# **** 25555 HAD THE HEADER NOTE: "The following appear to be letters sent & received by South Sea Co."
#
# **** 25449 AND 25495 HAD A BUNCH OF EXTRA ITEMS AT THE BOTTOM OF THE SHEET THAT APPEAR TO BE BACKMATTER -- BUT WEREN'T GIVEN PAGE NUMBERS. THESE WERE CUT FROM THE IMPORT.
#
# **** 25550 HAS THE HEADER: All following pertain to Committee of Correspondence
#
# **** 25545 HAS THE HEADER, A LITTLE WAYS IN?: "All following meetings pertain to dividends annuities and balances of South Sea Company"
#
# **** 25565 LOOKS WRONG TO ME
#
# **** Without access to the metadata, I had to leave the 2-digit years assigned by the data entry folks. This will have to be updated later.
#
# get filenames from
# * "add" + csv filename + fse??? + 3-digit version of numeric portion of pagenumber + recto or verso + '.jpg'
# * like "add_ms_25570_fse004r.jpg"
# * ... and how do i make the iiif image urls? what's that pattern?
# #
#

Loading

0 comments on commit 9b46a44

Please sign in to comment.