From 75ac9e17d3f706d5667c2f8ce4b11f802281b0cb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Moritz=20M=C3=A4hr?=
Date: Thu, 21 Nov 2024 16:47:57 +0100
Subject: [PATCH 1/4] fix: update thumbnail handling to use placeholder for non-public items

---
 .github/workflows/process_data.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/process_data.py b/.github/workflows/process_data.py
index 9c7f711e..4886a5ae 100644
--- a/.github/workflows/process_data.py
+++ b/.github/workflows/process_data.py
@@ -143,9 +143,10 @@ def infer_display_template(format_value):
 
 def extract_item_data(item):
     """Extracts relevant data from an item and downloads its thumbnail if available."""
-    local_image_path = (
-        download_thumbnail(item.get("thumbnail_display_urls", {}).get("large", ""))
-        or "assets/img/no-image.svg"
+    local_image_path = download_thumbnail(
+        item.get("thumbnail_display_urls", {}).get("large", "")
+        if item.get("o:is_public", False)
+        else "assets/img/placeholder.svg"
     )
 
     return {
@@ -180,12 +181,13 @@ def extract_media_data(media, item_dc_identifier):
     format_value = extract_property(media.get("dcterms:format", []), 9)
     display_template = infer_display_template(format_value)
 
-    # Download the thumbnail image if available and valid
+    # Download the thumbnail image if available and valid # Download the thumbnail image if available and valid
     if "platzhalter" in media.get("o:source", ""):
         local_image_path = "assets/img/placeholder.svg"
     else:
-        local_image_path = download_thumbnail(
-            media.get("thumbnail_display_urls", {}).get("large", "")
+        local_image_path = (
+            download_thumbnail(media.get("thumbnail_display_urls", {}).get("large", ""))
+            or "assets/img/no-image.svg"
         )
 
     # Extract media data

From 0eafb0dd9a9f8ac10ed8dff7542c17a12baad515 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Moritz=20M=C3=A4hr?=
Date: Fri, 22 Nov 2024 14:06:29 +0100
Subject: [PATCH 2/4] feat: normalize string fields in records to Unicode NFC form

---
 .github/workflows/process_data.py | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/process_data.py b/.github/workflows/process_data.py
index 4886a5ae..008ebe11 100644
--- a/.github/workflows/process_data.py
+++ b/.github/workflows/process_data.py
@@ -1,6 +1,7 @@
 import json
 import logging
 import os
+import unicodedata
 from urllib.parse import urljoin, urlparse
 
 import pandas as pd
@@ -225,23 +226,36 @@ def extract_media_data(media, item_dc_identifier):
     }
 
 
+def normalize_record(record):
+    """Normalizes all string fields in a record to Unicode NFC form."""
+    return {
+        key: unicodedata.normalize("NFC", value) if isinstance(value, str) else value
+        for key, value in record.items()
+    }
+
+
 # --- Main Processing Function ---
 def main():
     # Fetch item data
     items_data = get_items_from_collection(ITEM_SET_ID)
 
     # Process each item and associated media
-    item_records, media_records = [], []
+    items_processed = []
     for item in items_data:
         item_record = extract_item_data(item)
-        item_records.append(item_record)
+        items_processed.append(item_record)
         media_data = get_media(item.get("o:id", ""))
         if media_data:
             for media in media_data:
-                media_records.append(extract_media_data(media, item_record["objectid"]))
+                items_processed.append(
+                    extract_media_data(media, item_record["objectid"])
+                )
+
+    # Normalize all string fields in the records to avoid decomposed Unicode form Umlaute ¨ + o -> ö
+    items_normalized = [normalize_record(record) for record in items_processed]
 
     # Save data to CSV and JSON formats
-    save_to_files(item_records + media_records, CSV_PATH, JSON_PATH)
+    save_to_files(items_normalized, CSV_PATH, JSON_PATH)
 
 
 def save_to_files(records, csv_path, json_path):

From 4a647a8024b358ac9d7755899ce97ed9e57da875 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Moritz=20M=C3=A4hr?=
Date: Fri, 22 Nov 2024 14:11:56 +0100
Subject: [PATCH 3/4] fix: correct comment formatting in media data extraction function

---
 .github/workflows/process_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/process_data.py b/.github/workflows/process_data.py
index 008ebe11..243b04c6 100644
--- a/.github/workflows/process_data.py
+++ b/.github/workflows/process_data.py
@@ -182,7 +182,7 @@ def extract_media_data(media, item_dc_identifier):
     format_value = extract_property(media.get("dcterms:format", []), 9)
     display_template = infer_display_template(format_value)
 
-    # Download the thumbnail image if available and valid # Download the thumbnail image if available and valid
+    # Download the thumbnail image if available and valid
     if "platzhalter" in media.get("o:source", ""):
         local_image_path = "assets/img/placeholder.svg"
     else:

From d9822d605b9477053420d429033deabd016c8f56 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Moritz=20M=C3=A4hr?=
Date: Fri, 22 Nov 2024 14:14:20 +0100
Subject: [PATCH 4/4] fix: improve readability of thumbnail download logic in item data extraction

---
 .github/workflows/process_data.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/process_data.py b/.github/workflows/process_data.py
index 243b04c6..de648088 100644
--- a/.github/workflows/process_data.py
+++ b/.github/workflows/process_data.py
@@ -144,8 +144,8 @@ def infer_display_template(format_value):
 
 def extract_item_data(item):
     """Extracts relevant data from an item and downloads its thumbnail if available."""
-    local_image_path = download_thumbnail(
-        item.get("thumbnail_display_urls", {}).get("large", "")
+    local_image_path = (
+        download_thumbnail(item.get("thumbnail_display_urls", {}).get("large", ""))
         if item.get("o:is_public", False)
         else "assets/img/placeholder.svg"
     )