fix(or, orctapp): scrape new endpoint
- Backscraper now works for any date range (see the usage sketch below)
- Updated example files
- Collecting judges
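
A minimal sketch of driving the backscraper over an arbitrary range. The `backscrape_start`/`backscrape_end` kwarg names and the MM/DD/YYYY format are assumptions about the inherited `make_backscrape_iterable`, which is not shown in this diff:

from importlib import import_module

# "or" is a Python keyword, so the module must be imported dynamically,
# mirroring the `oregon_module` import in orctapp.py
or_module = import_module("juriscraper.opinions.united_states.state.or")

# Assumed kwarg names and date format; make_backscrape_iterable is
# inherited and not shown in this diff
site = or_module.Site(
    backscrape_start="01/01/1998",
    backscrape_end="03/01/1998",
)

# The requested range is chunked into days_interval-sized (15 day) windows
for date_range in site.back_scrape_iterable:
    site._download_backwards(date_range)

for case in site.cases:
    print(case["date"], case["docket"], case["name"], case["judge"])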
grossir committed Oct 10, 2024
1 parent 8e16ded commit ab19c60
Showing 8 changed files with 5,804 additions and 19,753 deletions.
228 changes: 97 additions & 131 deletions juriscraper/opinions/united_states/state/or.py
@@ -12,150 +12,116 @@

 class Site(OpinionSiteLinear):
     court_code = "p17027coll3"
-    detail_url = "https://ojd.contentdm.oclc.org/digital/bl/dmwebservices/index.php?q=dmQuery/{}/identi^{}^all^and/title!subjec!descri!dmrecord/title/1024/1/0/0/0/0/json"
-    download_url = "https://ojd.contentdm.oclc.org/digital/api/collection/{}/id/{}/download"
-    days_interval = 720
-    # Earliest opinion as of development in Oct 2024
-    first_opinion_date = datetime(2023, 4, 1)
+    base_url = "https://cdm17027.contentdm.oclc.org/digital/api/search/collection/{}/searchterm/{}-{}/field/dated/mode/exact/conn/and/maxRecords/200"
+    # technically they have an 1870 case but just one
+    first_opinion_date = datetime(1997, 8, 12)
+    days_interval = 15

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.court_id = self.__module__
-        self.url = (
-            "https://www.courts.oregon.gov/publications/sc/Pages/default.aspx"
-        )
+        today = datetime.today()
+        self.url = self.format_url(today - timedelta(15), today)
         self.make_backscrape_iterable(kwargs)

-        # By default, scrape at most 10 days into the past
-        # It's important to limit regular scrapes, since
-        # this scraper makes secondary requests and the site
-        # loads all opinions back to a year; which would
-        # create a lot of hits to the server each time
-        # the hourly scraper is triggered
-        # The limits will be modified in a backscrape
-        self.start_date = (datetime.today() - timedelta(10)).date()
-        self.end_date = (datetime.today() + timedelta(1)).date()

     def _process_html(self):
-        for date_header in self.html.xpath(
-            "//h4[a[contains(@href, '/dated/')]]"
-        ):
-            date_string = date_header.text_content().strip()
-            if not date_string:
-                logger.info("Skipping section with no date string")
-                continue
-
-            date = datetime.strptime(date_string, "%m/%d/%Y").date()
-            if date > self.end_date:
+        # Opinions come in descending date order
+        for row in self.html["items"]:
+            docket, name, citation, date = (
+                x["value"] for x in row["metadataFields"]
+            )
+            if not name:
+                # Happens on rows like:
+                # "Miscellaneous Supreme Court dispositions, June 10 and 13, 2024"
+                logger.info("Skipping row '%s'", docket)
+                continue
+            if date < self.start_date and not self.test_mode_enabled():
+                logger.info(
+                    "Date %s is out of range [%s to %s]",
+                    date,
+                    self.start_date,
+                    self.end_date,
+                )
+                break

-            self.process_a_date(date_header)

-    def process_a_date(self, date_header) -> None:
-        """Process a section defined by a date header
-        :param date_header: the lxml element containing the date
-        :return None
-        """
-        date_string = date_header.text_content().strip()
-
-        # orctapp has h5 tags which describe the status of the
-        # opinions in the next ul
-        for sibling in date_header.xpath("following-sibling::*"):
-            if sibling.tag not in ["ul", "h5"]:
-                # Time to jump to another date
-                break
-
-            if "orctapp" in self.court_id:
-                if sibling.tag == "h5":
-                    status = sibling.text_content().strip()
-                    if status == "Precedential Opinions":
-                        status = "Published"
-                    elif status == "Nonprecedential Memorandum Opinions":
-                        status = "Unpublished"
-                    else:
-                        status = "Unknown"
-            else:
-                status = "Published"
-
-            for item in sibling.xpath("li"):
-                # Ensure two links are present (skip Petitions
-                # for Review rows)
-                text = item.text_content().strip()
-                anchors = item.xpath(".//a")
-                if not (len(anchors) > 1):
-                    logger.info("Skipping row without 2 links. Row: %s", text)
-                    continue
-
-                detail_url = anchors[0].xpath("./@href")[0]
-                download_url, disposition = self.get_details(detail_url)
-                if not download_url:
-                    # Usually happens for
-                    # "Miscellaneous Supreme Court Dispositions"
-                    logger.info("No records for detail JSON")
-                    continue
-
-                name = text.split(")", 1)[-1]
-                # Clean up names like:
-                # "Knopp v. Griffin-Valade (Certified appeal accepted)"
-                if "(" in name:
-                    name, disposition = name.split("(", 1)
-                    disposition = disposition.strip(")")
-
-                self.cases.append(
-                    {
-                        "date": date_string,
-                        "name": name,
-                        "docket": anchors[1].text_content().strip(),
-                        "url": download_url,
-                        "citation": item.xpath("b/text()")[0].strip(),
-                        "status": status,
-                        "disposition": disposition,
-                    }
-                )
-
-    def get_details(self, detail_url: str) -> tuple[str, str]:
-        """Makes a request to get a case details, including the URL
-        :param detail_url: case detail's page url
-        :return: a tuple: (the pdf download url, the disposition)
-        """
+            judge, disposition, status, lower_court_number = self.get_details(
+                row
+            )
+            per_curiam = False
+            if judge and judge == "PC" or "per curiam" in judge.lower():
+                per_curiam = True
+                judge = ""
+
+            self.cases.append(
+                {
+                    "name": name,
+                    "date": date,
+                    "docket": docket.split(",")[0],
+                    "url": f"https://ojd.contentdm.oclc.org/digital/api/collection/{row['collectionAlias']}/id/{row['itemId']}/download",
+                    "citation": citation,
+                    "judge": judge,
+                    "per_curiam": per_curiam,
+                    "status": status,
+                    "disposition": disposition,
+                    "lower_court_number": lower_court_number,
+                }
+            )
+
+    def get_details(self, row: dict) -> tuple[str, str, str, str]:
+        """Makes a secondary request to get details for a single
+        opinion
+        :param row: the JSON record, to get the item id for the request;
+        or the JSON object in tests
+        :return: a tuple containing, if it has a valid value:
+            - judge
+            - disposition
+            - status
+            - lower court number (only for `or`)
+        """
         if self.test_mode_enabled():
-            return "placeholder url", "placeholder disposition"
-
-        identifier = detail_url.split("=")[-1]
-        detail_url = self.detail_url.format(self.court_code, identifier)
-
-        logger.info("Getting detail JSON from %s", detail_url)
-        json = self.request["session"].get(detail_url).json()
-        logger.debug(json)
-        if not json.get("records"):
-            return "", ""
-
-        disposition = json["records"][0].get("descri") or ""
-        download_url = self.download_url.format(
-            self.court_code, json["records"][0]["pointer"]
-        )
-        return download_url, disposition
+            if not row.get("detailJson"):
+                return (
+                    "placeholder judge",
+                    "placeholder disposition",
+                    "Unknown",
+                    "placeholder lower court number",
+                )
+            # Some test cases have their detail data manually copy pasted
+            json = row["detailJson"]
+        else:
+            item_id = row["itemId"]
+            url = f"https://cdm17027.contentdm.oclc.org/digital/api/collections/{self.court_code}/items/{item_id}/false"
+            json = self.request["session"].get(url).json()
+            logger.debug("Getting detail JSON from %s", url)

+        if len(json["fields"]) == 1:
+            fields = json["parent"]["fields"]
+        else:
+            fields = json["fields"]
+
+        judge, disposition, status, lower_court_number = "", "", "Unknown", ""
+        for field in fields:
+            if field["key"] == "judge":
+                judge = field["value"]
+            elif field["key"] == "type":
+                if field["value"] == "Nonprecedential opinion":
+                    status = "Unpublished"
+                else:
+                    status = "Published"
+            elif field["key"] == "descri":
+                disposition = field["value"]
+            elif field["key"] == "relhapt":
+                # For orctapp this field may be populated with consolidated
+                # docket numbers
+                if self.court_id.endswith("or") and not field[
+                    "value"
+                ].startswith("S"):
+                    lower_court_number = field["value"]
+
+        return judge, disposition, status, lower_court_number

     def _download_backwards(self, dates: tuple) -> None:
-        """The site loads by default the last couple years of data.
-        So it's not necessary to query the page in a special way to
-        target data in these years, only to set the proper date limits
-        To back scrape older opinions, we would need to target another
-        site
-        """
         self.start_date, self.end_date = dates
         logger.info("Backscraping for range %s %s", *dates)
+        self.url = self.format_url(*dates)
+        self.html = self._download()
+        self._process_html()

+    def format_url(self, start_date: datetime, end_date: datetime) -> str:
+        """
+        Creates a date range URL by formatting input dates
+        """
+        start = datetime.strftime(start_date, "%Y%m%d")
+        end = datetime.strftime(end_date, "%Y%m%d")
+        return self.base_url.format(self.court_code, start, end)
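
For reference, a sketch (not part of the commit) of the JSON shapes the new code appears to consume. The key names are inferred from the indexing above; the field order and all concrete values are invented for illustration:

# Assumed shape of one entry in the search endpoint's "items" list
search_row = {
    "itemId": 12345,                   # feeds the detail and download URLs
    "collectionAlias": "p17027coll3",  # feeds the download URL
    "metadataFields": [                # unpacked as docket, name, citation, date
        {"value": "S069997"},
        {"value": "State v. Example"},
        {"value": "371 Or 1"},
        {"value": "10/10/2024"},
    ],
}

# Assumed shape of the detail endpoint's response
detail_json = {
    "fields": [
        {"key": "judge", "value": "Flynn, C. J."},
        {"key": "type", "value": "Nonprecedential opinion"},  # -> "Unpublished"
        {"key": "descri", "value": "Affirmed"},
        {"key": "relhapt", "value": "22CV12345"},  # lower court number for `or`
    ],
}

Note that when "fields" has a single entry, get_details falls back to json["parent"]["fields"], which suggests child items inherit their metadata from a parent record.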
3 changes: 0 additions & 3 deletions juriscraper/opinions/united_states/state/orctapp.py
@@ -18,6 +18,3 @@ class Site(oregon_module.Site):

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.url = (
-            "https://www.courts.oregon.gov/publications/coa/Pages/default.aspx"
-        )
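
With the hardcoded publications-page URL removed, orctapp now inherits the ContentDM search URL, _process_html, and get_details from or.py wholesale; only the collection alias differs. A minimal sketch of the resulting subclass, with the alias value marked as an assumption since it falls outside the lines shown in this diff:

from importlib import import_module

oregon_module = import_module("juriscraper.opinions.united_states.state.or")


class Site(oregon_module.Site):
    # Assumed Court of Appeals collection alias; the real value is
    # defined in orctapp.py outside the lines shown in this diff
    court_code = "p17027coll5"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

Because court_id ends with "orctapp" rather than "or", the relhapt branch in get_details skips the lower-court number for this subclass, matching the "(only for `or`)" note in its docstring.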