Skip to content

Commit

Permalink
Merge pull request #90 from nationalarchives/scrape-caselaw
Browse files Browse the repository at this point in the history
Get all current documents in a 'csv' file
  • Loading branch information
dragon-dxw authored Oct 4, 2022
2 parents d54c003 + 2526c07 commit e0e3812
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 0 deletions.
6 changes: 6 additions & 0 deletions utilities/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
`run-xquery.py` requests pages of headline details of judgments, 250 at a time.
Sometimes there are Page Cache errors, but rerunning seems to fix them.

Output is semicolon-separated. If a field occurs multiple times, its values are joined with ` *** `:

`https://caselaw.nationalarchives.gov.uk/id/ukut/lc/2022/72;2022-03-14 *** 2022-03-14;Craig Duncan Collins v Richard Howell *** Craig Duncan Collins v Richard Howell;2022-04-25T16:07:56;[2022] UKUT 72 (LC);UKUT-LC`
27 changes: 27 additions & 0 deletions utilities/my_xquery.xqy
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
xquery version "1.0-ml";

import module namespace dls = "http://marklogic.com/xdmp/dls"
at "/MarkLogic/dls.xqy";
import module namespace search = "http://marklogic.com/appservices/search" at "/MarkLogic/appservices/search/search.xqy";


(: Collapse a sequence of values into a single string, placing " *** "
   between consecutive items. An empty sequence yields the empty string. :)
declare function local:output($values) {
    fn:string-join($values, " *** ")
};

declare variable $mindoc as xs:integer external;
declare variable $maxdoc as xs:integer external;

(: For each document in positions $mindoc..$maxdoc of the latest-version
   collection, emit one line of six ";"-separated fields. Each field is
   passed through local:output, so repeated values are joined by " *** ". :)
for $document in fn:collection("http://marklogic.com/collections/dls/latest-version")[$mindoc to $maxdoc]
let $metadata := $document//*:akomaNtoso/*:judgment/*:meta
let $identification := $metadata/*:identification
let $fields := (
    (: FRBRWork identifier and date: direct children of the identification element :)
    local:output($identification/*:FRBRWork/*:FRBRthis/@value),
    local:output($identification/*:FRBRWork/*:FRBRdate/@date),
    (: name and transform date: matched at any depth below the identification element :)
    local:output($identification//*:FRBRWork/*:FRBRname/@value),
    local:output($identification//*:FRBRManifestation/*:FRBRdate[@name="transform"]/@date),
    (: proprietary metadata: citation text and court text :)
    local:output($metadata/*:proprietary/*:cite/text()),
    local:output($metadata/*:proprietary/*:court/text())
)
return fn:string-join($fields, ";")
26 changes: 26 additions & 0 deletions utilities/run-xquery.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import environ
environ.Env.read_env()
import caselawclient.Client
from requests_toolbelt.multipart import decoder
import json
import sys
client = caselawclient.Client.api_client
page = 0
page_size = 250

# Fetch the query results one page at a time. Each result row is written to
# stdout; progress information (and a copy of each row) goes to stderr.
#
# The loop ends only when the server returns an empty body (presumably
# because no documents remain in the requested range). The original code
# used a `part` flag as the loop condition, but that name was overwritten
# by the inner loop variable, so the condition was never meaningfully
# false; the loop is now explicitly infinite with a single `break`.
while True:
    # XQuery positions are 1-based and inclusive at both ends.
    mindoc = 1 + (page_size * page)
    maxdoc = page_size * (page + 1)

    print(f"Page {page}, ({mindoc}-{maxdoc})", file=sys.stderr)

    # The external variables declared by my_xquery.xqy, passed as JSON.
    # Named `query_vars` so the builtin `vars` is not shadowed.
    query_vars = json.dumps({'mindoc': mindoc, 'maxdoc': maxdoc})
    response = client.eval(
        xquery_path="my_xquery.xqy",
        vars=query_vars,
        accept_header="application/xml",
    )
    if not response.content:
        break

    # The response body is multipart; each part carries one output row.
    multipart_data = decoder.MultipartDecoder.from_response(response)
    print(len(multipart_data.parts), file=sys.stderr)
    for part in multipart_data.parts:
        print(part.text)
        print(part.text, file=sys.stderr)

    page = page + 1

0 comments on commit e0e3812

Please sign in to comment.