From 75ac9e17d3f706d5667c2f8ce4b11f802281b0cb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Moritz=20M=C3=A4hr?= <maehr@users.noreply.github.com>
Date: Thu, 21 Nov 2024 16:47:57 +0100
Subject: [PATCH 1/4] fix: update thumbnail handling to use placeholder for
 non-public items

---
 .github/workflows/process_data.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/process_data.py b/.github/workflows/process_data.py
index 9c7f711e..4886a5ae 100644
--- a/.github/workflows/process_data.py
+++ b/.github/workflows/process_data.py
@@ -143,9 +143,10 @@ def infer_display_template(format_value):
 
 def extract_item_data(item):
     """Extracts relevant data from an item and downloads its thumbnail if available."""
-    local_image_path = (
-        download_thumbnail(item.get("thumbnail_display_urls", {}).get("large", ""))
-        or "assets/img/no-image.svg"
+    local_image_path = download_thumbnail(
+        item.get("thumbnail_display_urls", {}).get("large", "")
+        if item.get("o:is_public", False)
+        else "assets/img/placeholder.svg"
     )
 
     return {
@@ -180,12 +181,13 @@ def extract_media_data(media, item_dc_identifier):
     format_value = extract_property(media.get("dcterms:format", []), 9)
     display_template = infer_display_template(format_value)
 
-    # Download the thumbnail image if available and valid
+    # Download the thumbnail image if available and valid    # Download the thumbnail image if available and valid
     if "platzhalter" in media.get("o:source", ""):
         local_image_path = "assets/img/placeholder.svg"
     else:
-        local_image_path = download_thumbnail(
-            media.get("thumbnail_display_urls", {}).get("large", "")
+        local_image_path = (
+            download_thumbnail(media.get("thumbnail_display_urls", {}).get("large", ""))
+            or "assets/img/no-image.svg"
         )
 
     # Extract media data

From 0eafb0dd9a9f8ac10ed8dff7542c17a12baad515 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Moritz=20M=C3=A4hr?= <maehr@users.noreply.github.com>
Date: Fri, 22 Nov 2024 14:06:29 +0100
Subject: [PATCH 2/4] feat: normalize string fields in records to Unicode NFC
 form

---
 .github/workflows/process_data.py | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/process_data.py b/.github/workflows/process_data.py
index 4886a5ae..008ebe11 100644
--- a/.github/workflows/process_data.py
+++ b/.github/workflows/process_data.py
@@ -1,6 +1,7 @@
 import json
 import logging
 import os
+import unicodedata
 from urllib.parse import urljoin, urlparse
 
 import pandas as pd
@@ -225,23 +226,36 @@ def extract_media_data(media, item_dc_identifier):
     }
 
 
+def normalize_record(record):
+    """Normalizes all string fields in a record to Unicode NFC form."""
+    return {
+        key: unicodedata.normalize("NFC", value) if isinstance(value, str) else value
+        for key, value in record.items()
+    }
+
+
 # --- Main Processing Function ---
 def main():
     # Fetch item data
     items_data = get_items_from_collection(ITEM_SET_ID)
 
     # Process each item and associated media
-    item_records, media_records = [], []
+    items_processed = []
     for item in items_data:
         item_record = extract_item_data(item)
-        item_records.append(item_record)
+        items_processed.append(item_record)
         media_data = get_media(item.get("o:id", ""))
         if media_data:
             for media in media_data:
-                media_records.append(extract_media_data(media, item_record["objectid"]))
+                items_processed.append(
+                    extract_media_data(media, item_record["objectid"])
+                )
+
+    # Normalize all string fields to NFC so decomposed umlauts (o + combining ¨) become precomposed (ö)
+    items_normalized = [normalize_record(record) for record in items_processed]
 
     # Save data to CSV and JSON formats
-    save_to_files(item_records + media_records, CSV_PATH, JSON_PATH)
+    save_to_files(items_normalized, CSV_PATH, JSON_PATH)
 
 
 def save_to_files(records, csv_path, json_path):

From 4a647a8024b358ac9d7755899ce97ed9e57da875 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Moritz=20M=C3=A4hr?= <maehr@users.noreply.github.com>
Date: Fri, 22 Nov 2024 14:11:56 +0100
Subject: [PATCH 3/4] fix: correct comment formatting in media data extraction
 function

---
 .github/workflows/process_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/process_data.py b/.github/workflows/process_data.py
index 008ebe11..243b04c6 100644
--- a/.github/workflows/process_data.py
+++ b/.github/workflows/process_data.py
@@ -182,7 +182,7 @@ def extract_media_data(media, item_dc_identifier):
     format_value = extract_property(media.get("dcterms:format", []), 9)
     display_template = infer_display_template(format_value)
 
-    # Download the thumbnail image if available and valid    # Download the thumbnail image if available and valid
+    # Download the thumbnail image if available and valid
     if "platzhalter" in media.get("o:source", ""):
         local_image_path = "assets/img/placeholder.svg"
     else:

From d9822d605b9477053420d429033deabd016c8f56 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Moritz=20M=C3=A4hr?= <maehr@users.noreply.github.com>
Date: Fri, 22 Nov 2024 14:14:20 +0100
Subject: [PATCH 4/4] fix: improve readability of thumbnail download logic in
 item data extraction

---
 .github/workflows/process_data.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/process_data.py b/.github/workflows/process_data.py
index 243b04c6..de648088 100644
--- a/.github/workflows/process_data.py
+++ b/.github/workflows/process_data.py
@@ -144,8 +144,8 @@ def infer_display_template(format_value):
 
 def extract_item_data(item):
     """Extracts relevant data from an item and downloads its thumbnail if available."""
-    local_image_path = download_thumbnail(
-        item.get("thumbnail_display_urls", {}).get("large", "")
+    local_image_path = (
+        download_thumbnail(item.get("thumbnail_display_urls", {}).get("large", ""))
         if item.get("o:is_public", False)
         else "assets/img/placeholder.svg"
     )