fix(or, orctapp): scrape new endpoint
- Backscraper now works for any date range (see the usage sketch below)
- Updated example files
- Collecting judges
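
A minimal sketch of driving the backscraper over an arbitrary range. The `backscrape_start`/`backscrape_end` kwarg names and the MM/DD/YYYY format are assumptions about the inherited `make_backscrape_iterable`, which is not shown in this diff:

from importlib import import_module

# "or" is a Python keyword, so the module must be imported dynamically,
# mirroring the `oregon_module` import in orctapp.py
or_module = import_module("juriscraper.opinions.united_states.state.or")

# Assumed kwarg names and date format; make_backscrape_iterable is
# inherited and not shown in this diff
site = or_module.Site(
    backscrape_start="01/01/1998",
    backscrape_end="03/01/1998",
)

# The requested range is chunked into days_interval-sized (15 day) windows
for date_range in site.back_scrape_iterable:
    site._download_backwards(date_range)

for case in site.cases:
    print(case["date"], case["docket"], case["name"], case["judge"])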
grossir committed Oct 10, 2024
1 parent 8e16ded commit ab19c60
Showing 8 changed files with 5,804 additions and 19,753 deletions.
228 changes: 97 additions & 131 deletions juriscraper/opinions/united_states/state/or.py
@@ -12,150 +12,116 @@

 class Site(OpinionSiteLinear):
     court_code = "p17027coll3"
-    detail_url = "https://ojd.contentdm.oclc.org/digital/bl/dmwebservices/index.php?q=dmQuery/{}/identi^{}^all^and/title!subjec!descri!dmrecord/title/1024/1/0/0/0/0/json"
-    download_url = "https://ojd.contentdm.oclc.org/digital/api/collection/{}/id/{}/download"
-    days_interval = 720
-    # Earliest opinion as of development in Oct 2024
-    first_opinion_date = datetime(2023, 4, 1)
+    base_url = "https://cdm17027.contentdm.oclc.org/digital/api/search/collection/{}/searchterm/{}-{}/field/dated/mode/exact/conn/and/maxRecords/200"
+    # technically they have an 1870 case but just one
+    first_opinion_date = datetime(1997, 8, 12)
+    days_interval = 15

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.court_id = self.__module__
-        self.url = (
-            "https://www.courts.oregon.gov/publications/sc/Pages/default.aspx"
-        )
+        today = datetime.today()
+        self.url = self.format_url(today - timedelta(15), today)
         self.make_backscrape_iterable(kwargs)

-        # By default, scrape at most 10 days into the past
-        # It's important to limit regular scrapes, since
-        # this scraper makes secondary requests and the site
-        # loads all opinions back to a year; which would
-        # create a lot of hits to the server each time
-        # the hourly scraper is triggered
-        # The limits will be modified in a backscrape
-        self.start_date = (datetime.today() - timedelta(10)).date()
-        self.end_date = (datetime.today() + timedelta(1)).date()

     def _process_html(self):
-        for date_header in self.html.xpath(
-            "//h4[a[contains(@href, '/dated/')]]"
-        ):
-            date_string = date_header.text_content().strip()
-            if not date_string:
-                logger.info("Skipping section with no date string")
-                continue
-
-            date = datetime.strptime(date_string, "%m/%d/%Y").date()
-            if date > self.end_date:
+        # Opinions come in descending date order
+        for row in self.html["items"]:
+            docket, name, citation, date = (
+                x["value"] for x in row["metadataFields"]
+            )
+            if not name:
+                # Happens on rows like:
+                # "Miscellaneous Supreme Court dispositions, June 10 and 13, 2024"
+                logger.info("Skipping row '%s'", docket)
+                continue
+            if date < self.start_date and not self.test_mode_enabled():
+                logger.info(
+                    "Date %s is out of range [%s to %s]",
+                    date,
+                    self.start_date,
+                    self.end_date,
+                )
+                break

-            self.process_a_date(date_header)

-    def process_a_date(self, date_header) -> None:
-        """Process a section defined by a date header
-        :param date_header: the lxml element containing the date
-        :return None
-        """
-        date_string = date_header.text_content().strip()
-
-        # orctapp has h5 tags which describe the status of the
-        # opinions in the next ul
-        for sibling in date_header.xpath("following-sibling::*"):
-            if sibling.tag not in ["ul", "h5"]:
-                # Time to jump to another date
-                break
-
-            if "orctapp" in self.court_id:
-                if sibling.tag == "h5":
-                    status = sibling.text_content().strip()
-                    if status == "Precedential Opinions":
-                        status = "Published"
-                    elif status == "Nonprecedential Memorandum Opinions":
-                        status = "Unpublished"
-                    else:
-                        status = "Unknown"
-            else:
-                status = "Published"
-
-            for item in sibling.xpath("li"):
-                # Ensure two links are present (skip Petitions
-                # for Review rows)
-                text = item.text_content().strip()
-                anchors = item.xpath(".//a")
-                if not (len(anchors) > 1):
-                    logger.info("Skipping row without 2 links. Row: %s", text)
-                    continue
-
-                detail_url = anchors[0].xpath("./@href")[0]
-                download_url, disposition = self.get_details(detail_url)
-                if not download_url:
-                    # Usually happens for
-                    # "Miscellaneous Supreme Court Dispositions"
-                    logger.info("No records for detail JSON")
-                    continue
-
-                name = text.split(")", 1)[-1]
-                # Clean up names like:
-                # "Knopp v. Griffin-Valade (Certified appeal accepted)"
-                if "(" in name:
-                    name, disposition = name.split("(", 1)
-                    disposition = disposition.strip(")")
-
-                self.cases.append(
-                    {
-                        "date": date_string,
-                        "name": name,
-                        "docket": anchors[1].text_content().strip(),
-                        "url": download_url,
-                        "citation": item.xpath("b/text()")[0].strip(),
-                        "status": status,
-                        "disposition": disposition,
-                    }
-                )
-
-    def get_details(self, detail_url: str) -> tuple[str, str]:
-        """Makes a request to get a case details, including the URL
-        :param detail_url: case detail's page url
-        :return: a tuple: (the pdf download url, the disposition)
-        """
+            judge, disposition, status, lower_court_number = self.get_details(
+                row
+            )
+            per_curiam = False
+            if judge and judge == "PC" or "per curiam" in judge.lower():
+                per_curiam = True
+                judge = ""
+
+            self.cases.append(
+                {
+                    "name": name,
+                    "date": date,
+                    "docket": docket.split(",")[0],
+                    "url": f"https://ojd.contentdm.oclc.org/digital/api/collection/{row['collectionAlias']}/id/{row['itemId']}/download",
+                    "citation": citation,
+                    "judge": judge,
+                    "per_curiam": per_curiam,
+                    "status": status,
+                    "disposition": disposition,
+                    "lower_court_number": lower_court_number,
+                }
+            )
+
+    def get_details(self, row: dict) -> tuple[str, str, str, str]:
+        """Makes a secondary request to get details for a single
+        opinion
+        :param row: the JSON record, to get the item id for the request;
+        or the JSON object in tests
+        :return: a tuple containing, if it has a valid value:
+            - judge
+            - disposition
+            - status
+            - lower court number (only for `or`)
+        """
         if self.test_mode_enabled():
-            return "placeholder url", "placeholder disposition"
-
-        identifier = detail_url.split("=")[-1]
-        detail_url = self.detail_url.format(self.court_code, identifier)
-
-        logger.info("Getting detail JSON from %s", detail_url)
-        json = self.request["session"].get(detail_url).json()
-        logger.debug(json)
-        if not json.get("records"):
-            return "", ""
-
-        disposition = json["records"][0].get("descri") or ""
-        download_url = self.download_url.format(
-            self.court_code, json["records"][0]["pointer"]
-        )
-        return download_url, disposition
+            if not row.get("detailJson"):
+                return (
+                    "placeholder judge",
+                    "placeholder disposition",
+                    "Unknown",
+                    "placeholder lower court number",
+                )
+            # Some test cases have their detail data manually copy pasted
+            json = row["detailJson"]
+        else:
+            item_id = row["itemId"]
+            url = f"https://cdm17027.contentdm.oclc.org/digital/api/collections/{self.court_code}/items/{item_id}/false"
+            json = self.request["session"].get(url).json()
+            logger.debug("Getting detail JSON from %s", url)

+        if len(json["fields"]) == 1:
+            fields = json["parent"]["fields"]
+        else:
+            fields = json["fields"]
+
+        judge, disposition, status, lower_court_number = "", "", "Unknown", ""
+        for field in fields:
+            if field["key"] == "judge":
+                judge = field["value"]
+            elif field["key"] == "type":
+                if field["value"] == "Nonprecedential opinion":
+                    status = "Unpublished"
+                else:
+                    status = "Published"
+            elif field["key"] == "descri":
+                disposition = field["value"]
+            elif field["key"] == "relhapt":
+                # For orctapp this field may be populated with consolidated
+                # docket numbers
+                if self.court_id.endswith("or") and not field[
+                    "value"
+                ].startswith("S"):
+                    lower_court_number = field["value"]
+
+        return judge, disposition, status, lower_court_number

     def _download_backwards(self, dates: tuple) -> None:
-        """The site loads by default the last couple years of data.
-        So it's not necessary to query the page in a special way to
-        target data in these years, only to set the proper date limits
-        To back scrape older opinions, we would need to target another
-        site
-        """
         self.start_date, self.end_date = dates
         logger.info("Backscraping for range %s %s", *dates)
+        self.url = self.format_url(*dates)
+        self.html = self._download()
+        self._process_html()

+    def format_url(self, start_date: datetime, end_date: datetime) -> str:
+        """
+        Creates a date range URL by formatting input dates
+        """
+        start = datetime.strftime(start_date, "%Y%m%d")
+        end = datetime.strftime(end_date, "%Y%m%d")
+        return self.base_url.format(self.court_code, start, end)
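
For reference, a sketch (not part of the commit) of the JSON shapes the new code appears to consume. The key names are inferred from the indexing above; the field order and all concrete values are invented for illustration:

# Assumed shape of one entry in the search endpoint's "items" list
search_row = {
    "itemId": 12345,                   # feeds the detail and download URLs
    "collectionAlias": "p17027coll3",  # feeds the download URL
    "metadataFields": [                # unpacked as docket, name, citation, date
        {"value": "S069997"},
        {"value": "State v. Example"},
        {"value": "371 Or 1"},
        {"value": "10/10/2024"},
    ],
}

# Assumed shape of the detail endpoint's response
detail_json = {
    "fields": [
        {"key": "judge", "value": "Flynn, C. J."},
        {"key": "type", "value": "Nonprecedential opinion"},  # -> "Unpublished"
        {"key": "descri", "value": "Affirmed"},
        {"key": "relhapt", "value": "22CV12345"},  # lower court number for `or`
    ],
}

Note that when "fields" has a single entry, get_details falls back to json["parent"]["fields"], which suggests child items inherit their metadata from a parent record.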
3 changes: 0 additions & 3 deletions juriscraper/opinions/united_states/state/orctapp.py
@@ -18,6 +18,3 @@ class Site(oregon_module.Site):

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.url = (
-            "https://www.courts.oregon.gov/publications/coa/Pages/default.aspx"
-        )
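
With the hardcoded publications-page URL removed, orctapp now inherits the ContentDM search URL, _process_html, and get_details from or.py wholesale; only the collection alias differs. A minimal sketch of the resulting subclass, with the alias value marked as an assumption since it falls outside the lines shown in this diff:

from importlib import import_module

oregon_module = import_module("juriscraper.opinions.united_states.state.or")


class Site(oregon_module.Site):
    # Assumed Court of Appeals collection alias; the real value is
    # defined in orctapp.py outside the lines shown in this diff
    court_code = "p17027coll5"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

Because court_id ends with "orctapp" rather than "or", the relhapt branch in get_details skips the lower-court number for this subclass, matching the "(only for `or`)" note in its docstring.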