Issue223 #260

Open. Wants to merge 2 commits into base: main.
3 changes: 3 additions & 0 deletions .gitignore
@@ -365,3 +365,6 @@ FodyWeavers.xsd
 # Pytest outputs
 tests/mock_data/mockcsv.csv
 tests/mock_data/output
+.vscode/settings.json
+.vscode/settings.json
+.vscode/settings.json
8 changes: 7 additions & 1 deletion .vscode/settings.json
@@ -1,3 +1,9 @@
 {
-    "python.formatting.provider": "black"
+    "python.formatting.provider": "black",
+    "python.testing.pytestArgs": [
+        "tests"
+    ],
+    "python.testing.unittestEnabled": false,
+    "python.testing.pytestEnabled": true,
+    "python.defaultInterpreterPath": "F:\\Program Files\\python3.9\\python"
 }

[Review comment, Member, on the "python.defaultInterpreterPath" line] Would remove this as it's dependent on the machine config of whoever's running the scripts.
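A minimal sketch of what that suggestion would leave in the file (an assumed resolution, not a commit in this PR): the portable formatter and pytest settings stay, the machine-specific interpreter path goes.

{
    "python.formatting.provider": "black",
    "python.testing.pytestArgs": [
        "tests"
    ],
    "python.testing.unittestEnabled": false,
    "python.testing.pytestEnabled": true
}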
9 changes: 9 additions & 0 deletions csvProcessor.py
@@ -0,0 +1,9 @@
+try:
+    from processor import Processor
+except:
+    from .processor import Processor
+
+
+class csvProcessor(Processor):
+    def __init__(self, type):
+        super().__init__(type)
9 changes: 9 additions & 0 deletions jsonProcessor.py
@@ -0,0 +1,9 @@
+try:
+    from processor import Processor
+except:
+    from .processor import Processor
+
+
+class jsonProcessor(Processor):
+    def __init__(self, type):
+        super().__init__(type)
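Both new files follow the same pattern: try the flat import first (running as a script from the repo root), then fall back to the relative import (running as a package, e.g. under pytest). A hypothetical usage sketch, not part of the PR; it assumes Processor.process() calls get_urls() and then get_datasets() per URL, as the processor.py diff below suggests, and that subclasses populate self.urls.

# Hypothetical illustration only.
from jsonProcessor import jsonProcessor


class ExampleOrgProcessor(jsonProcessor):
    def get_urls(self):
        # Assumption: subclasses fill self.urls with {name: url} pairs.
        self.urls = {"example_org": "https://example.org/datasets"}

    def get_datasets(self, owner, url, fname):
        # A real implementation would fetch `url` and write rows to `fname`.
        print(f"would scrape {url} for {owner} into {fname}")


ExampleOrgProcessor("json").process(file_type="json")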
46 changes: 46 additions & 0 deletions jsonRow.py
@@ -0,0 +1,46 @@
+import json, dataclasses
+from dataclasses import dataclass
+
+@dataclass
+class jsonRow:
+    Title: str = ""
+    Owner: str = ""
+    PageURL: str = ""
+    AssetURL: str = ""
+    FileName: str = ""
+    DateCreated: str = ""
+    DateUpdated: str = ""
+    FileSize: str = ""
+    FileSizeUnit: str = ""
+    FileType: str = ""
+    NumRecords: str = ""
+    OriginalTags: str = ""
+    ManualTags: str = ""
+    License: str = ""
+    Description: str = ""
+
+    def __init__(self):
+        Title = ""
+        Owner = ""
+        PageURL = ""
+        AssetURL = ""
+        FileName = ""
+        DateCreated = ""
+        DateUpdated = ""
+        FileSize = ""
+        FileSizeUnit = ""
+        FileType = ""
+        NumRecords = ""
+        OriginalTags = ""
+        ManualTags = ""
+        License = ""
+        Description = ""
+
+    class EnhancedJSONEncoder(json.JSONEncoder):
+        def default(self, o):
+            if dataclasses.is_dataclass(o):
+                return dataclasses.asdict(o)
+            return super().default(o)
+
+    def toJSON(self):
+        return json.dumps(self, cls=self.EnhancedJSONEncoder)
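Worth flagging for review: the hand-written __init__ shadows the one @dataclass generates and only binds local names, so it is effectively a no-op; instances simply fall back to the class-level defaults, which is why toJSON still works. A minimal sketch of the behaviour (hypothetical values, consistent with tests/jsonRow_test.py below):

from jsonRow import jsonRow

row = jsonRow()
row.Title = "Example title"

# EnhancedJSONEncoder turns the dataclass into a dict via dataclasses.asdict,
# so unset fields serialize as their "" defaults.
print(row.toJSON())
# {"Title": "Example title", "Owner": "", ..., "Description": ""}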
42 changes: 33 additions & 9 deletions merge_data.py
@@ -49,8 +49,8 @@ def main():
         }
     )
     source_scotgov["Source"] = "sparql"
-    #print("DateUpdated " + source_scotgov["DateUpdated"])
-    #print("DateCreated " + source_scotgov["DateCreated"])
+    # print("DateUpdated " + source_scotgov["DateUpdated"])
+    # print("DateCreated " + source_scotgov["DateCreated"])
     try:
         source_scotgov["DateUpdated"] = pd.to_datetime(
             source_scotgov["DateUpdated"], utc=True
@@ -110,7 +110,7 @@ def main():
         source_usmart = pd.concat(
             [
                 source_usmart,
-                pd.read_csv(
+                pd.read_json(
                     folder + r"/" + filename,
                     parse_dates=["DateCreated", "DateUpdated"],
                 ),

[Review comment, Member, on the parse_dates line] This line caused an error for me. I think it needs to be convert_dates instead.
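A minimal sketch of the reviewer's suggested fix (the suggestion, not merged code): pandas' read_json has no parse_dates keyword, that belongs to read_csv, so the switch to read_json needs convert_dates for the date columns. The path below is hypothetical.

import pandas as pd

# read_json parses dates via convert_dates; passing read_csv's
# parse_dates keyword to read_json raises a TypeError.
df = pd.read_json(
    "data/USMART/example.json",  # hypothetical path for illustration
    convert_dates=["DateCreated", "DateUpdated"],
)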
@@ -165,19 +165,43 @@ def main():
     # From Scottish Parliament
     print("\tMerging Scottish Parliament...")
     path = "data/bespoke_ScottishParliament/Scottish Parliament.json"
-    scottish_parliament_scraped = pd.read_json(path, convert_dates=["dateCreated", "dateUpdated"])
+    scottish_parliament_scraped = pd.read_json(
+        path, convert_dates=["dateCreated", "dateUpdated"]
+    )
 
     for index, row in scottish_parliament_scraped.iterrows():
         resources = pd.json_normalize(row["resources"])
         for resource_index, resource_row in resources.iterrows():
             # TEMP FIX: Need to do this mapping until we modify the merged_output.json schema to support nesting resources inside each dataset entry
             source_scraped = pd.concat(
-                [source_scraped, pd.DataFrame.from_records([{"Title": row["title"], "Owner": row["owner"], "PageURL": row["pageURL"], "AssetURL": resource_row["assetUrl"], "DateCreated": row["dateCreated"], "DateUpdated": row["dateUpdated"], "FileSize": resource_row["fileSize"], "FileType": resource_row["fileType"], "NumRecords": resource_row["numRecords"], "OriginalTags": row["tags"], "ManualTags" : row["tags"], "License": row["licence"], "Description": row["description"], "FileName": resource_row["fileName"]}])]
+                [
+                    source_scraped,
+                    pd.DataFrame.from_records(
+                        [
+                            {
+                                "Title": row["title"],
+                                "Owner": row["owner"],
+                                "PageURL": row["pageURL"],
+                                "AssetURL": resource_row["assetUrl"],
+                                "DateCreated": row["dateCreated"],
+                                "DateUpdated": row["dateUpdated"],
+                                "FileSize": resource_row["fileSize"],
+                                "FileType": resource_row["fileType"],
+                                "NumRecords": resource_row["numRecords"],
+                                "OriginalTags": row["tags"],
+                                "ManualTags": row["tags"],
+                                "License": row["licence"],
+                                "Description": row["description"],
+                                "FileName": resource_row["fileName"],
+                            }
+                        ]
+                    ),
+                ]
+            )
 
     source_scraped["Source"] = "Web Scraped"
     # endregion
 
     ### Combine all data into single table
     print("Concatenating all")
     data = pd.concat(
@@ -191,7 +215,7 @@ def main():
         ]
     )
     data = data.reset_index(drop=True)
-
+    print(f"Output untidy {dt.now()}")
     ### Saves copy of data without cleaning - for analysis purposes
     data.to_json("data/merged_output_untidy.json", orient="records", date_format="iso")
@@ -395,7 +419,7 @@ def tidy_licence(licence_name):
         "Other (Public Domain)": "Public Domain",
         "Public Domain": "Public Domain",
         "Public Sector End User Licence (Scotland)": "Public Sector End User Licence (Scotland)",
-        "Scottish Parliament Copyright Policy": "Scottish Parliament Copyright Policy"
+        "Scottish Parliament Copyright Policy": "Scottish Parliament Copyright Policy",
     }
 
     for key in known_licences.keys():
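The TEMP FIX above flattens each dataset's nested resources list into one output row per resource. A toy illustration of that shape change (hypothetical data, not from the PR):

import pandas as pd

# One dataset holding two nested resources...
dataset = {
    "title": "Example dataset",
    "resources": [
        {"fileName": "a.csv", "fileType": "CSV"},
        {"fileName": "b.json", "fileType": "JSON"},
    ],
}

# ...becomes one flat row per resource after json_normalize, with the
# dataset-level fields copied onto every row (roughly what the loop does).
resources = pd.json_normalize(dataset["resources"])
flat = resources.assign(Title=dataset["title"])
print(flat)
#   fileName fileType            Title
# 0    a.csv      CSV  Example dataset
# 1   b.json     JSON  Example dataset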
18 changes: 14 additions & 4 deletions processor.py
@@ -4,6 +4,14 @@
 import csv
 import json
 import os
+import dataclasses
+
+
+class EnhancedJSONEncoder(json.JSONEncoder):
+    def default(self, o):
+        if dataclasses.is_dataclass(o):
+            return dataclasses.asdict(o)
+        return super().default(o)
 
 
 class Processor:
@@ -107,16 +115,18 @@ def write_csv(self, fname, prepped):
                 r[-1] = r[-1].replace("\n", " ")
                 w.writerow(r)
 
     def write_json(self, fname, prepped):
         with open(fname, "w", encoding="utf8") as json_file:
-            json.dump(prepped, json_file, indent=4)
+            json.dump(prepped, json_file, indent=4, cls=EnhancedJSONEncoder)
 
     def get_datasets(self, owner, url, fname):
         print("Override this method")
 
-    def process(self, file_type = "csv"):
+    def process(self, file_type="csv"):
         self.get_urls()
 
         for name, url in self.urls.items():
             print(name)
-            self.get_datasets(name, url, os.path.join("data", self.type, f"{name}.{file_type}"))
+            self.get_datasets(
+                name, url, os.path.join("data", self.type, f"{name}.{file_type}")
+            )
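A minimal sketch of what the encoder change buys (hypothetical call, not in the PR; it assumes Processor's constructor takes the type string, as the subclass diffs above suggest): write_json can now be handed dataclass instances directly.

from jsonRow import jsonRow
from processor import Processor

row = jsonRow()
row.Title = "Example title"

# EnhancedJSONEncoder converts dataclass instances to dicts on the fly,
# so prepped rows no longer need hand-conversion before json.dump.
Processor("json").write_json("example_output.json", [row])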
39 changes: 39 additions & 0 deletions tests/conftest.py
@@ -60,6 +60,13 @@ def is_valid_tags(str_to_check):
     return is_valid_string(str_to_check)
 
 
+def is_valid_tags_json(tags_to_check):
+    for thistag in tags_to_check:
+        if not is_valid_string(thistag):
+            return False
+    return True
+
+
 def is_valid_licence(str_to_check):
     licences = [
         "http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/",
@@ -110,3 +117,35 @@ def csv_checker(csv_file):
             assert csv_testers[col_idx]["test_func"](cell)
             # result = False
     return result
+
+
+def json_checker(json_file):
+    result = True
+
+    json_testers = {
+        "Title": is_valid_string,
+        "Owner": is_valid_string,
+        "PageURL": is_valid_url,
+        "AssetURL": is_valid_url,
+        "FileName": is_valid_filename,
+        "DateCreated": is_valid_date,
+        "DateUpdated": is_valid_date,
+        "FileSize": is_valid_number,
+        "FileSizeUnit": is_valid_file_size_unit,
+        "FileType": is_valid_file_type,
+        "NumRecords": is_valid_number,
+        "OriginalTags": is_valid_tags_json,
+        "ManualTags": is_valid_tags_json,
+        "License": is_valid_licence,
+        "Description": is_valid_string,
+    }
+
+    header_row = 0
+    for row in json_file:
+        for attribute_key in row.keys():
+            this_test_function = json_testers[attribute_key]
+            this_test_value = row[attribute_key]
+            test_result = this_test_function(this_test_value)
+            debug_output = f"{this_test_function.__name__}( {this_test_value} )"
+            assert test_result, debug_output
+    return result
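A hypothetical sketch of how a test might drive json_checker (not in the PR; it assumes the conftest helpers are importable, which the package-relative imports elsewhere in tests/ suggest):

import json

from .conftest import json_checker


def test_merged_json_output():
    # Hypothetical fixture path for illustration.
    with open("tests/mock_data/example_output.json", encoding="utf8") as f:
        rows = json.load(f)
    # json_checker asserts per attribute internally, with a debug message
    # naming the failing validator and value.
    assert json_checker(rows)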
50 changes: 50 additions & 0 deletions tests/jsonRow_test.py
@@ -0,0 +1,50 @@
+import pytest
+from ..jsonRow import jsonRow
+
+
+def test_json_conversion():
+    # Test 1 - every attribute populated
+
+    example_row1 = jsonRow()
+
+    example_row1.Title = "Example title"
+    example_row1.Owner = "Example owner"
+    example_row1.PageURL = "Example original dataset link"
+    example_row1.AssetURL = "Example resource link"
+    example_row1.FileName = "Example filename"
+    example_row1.DateCreated = "05/Nov/2023"
+    example_row1.DateUpdated = "05/Dec/2023"
+    example_row1.FileSize = "Example size"
+    example_row1.FileSizeUnit = "Example size unit"
+    example_row1.FileType = "Example file type"
+    example_row1.NumRecords = "Example num records"
+    example_row1.OriginalTags = "Example tags"
+    example_row1.ManualTags = "Example manual tags"
+    example_row1.License = "Example licence"
+    example_row1.Description = "Example description"
+
+    outputJson1 = example_row1.toJSON()
+
+    expectedJson1 = '{"Title": "Example title", "Owner": "Example owner", "PageURL": "Example original dataset link", "AssetURL": "Example resource link", "FileName": "Example filename", "DateCreated": "05/Nov/2023", "DateUpdated": "05/Dec/2023", "FileSize": "Example size", "FileSizeUnit": "Example size unit", "FileType": "Example file type", "NumRecords": "Example num records", "OriginalTags": "Example tags", "ManualTags": "Example manual tags", "License": "Example licence", "Description": "Example description"}'
+
+    assert outputJson1 == expectedJson1
+
+    # Test 2 - some attributes left as defaults (blank)
+
+    example_row2 = jsonRow()
+
+    example_row2.Title = "Example title"
+    example_row2.Owner = "Example owner"
+    example_row2.PageURL = "Example original dataset link"
+    example_row2.AssetURL = "Example resource link"
+    example_row2.FileType = "Example file type"
+    example_row2.OriginalTags = "Example tags"
+    example_row2.ManualTags = "Example manual tags"
+    example_row2.License = "Example licence"
+    example_row2.Description = "Example description"
+
+    outputJson2 = example_row2.toJSON()
+
+    expectedJson2 = '{"Title": "Example title", "Owner": "Example owner", "PageURL": "Example original dataset link", "AssetURL": "Example resource link", "FileName": "", "DateCreated": "", "DateUpdated": "", "FileSize": "", "FileSizeUnit": "", "FileType": "Example file type", "NumRecords": "", "OriginalTags": "Example tags", "ManualTags": "Example manual tags", "License": "Example licence", "Description": "Example description"}'
+
+    assert outputJson2 == expectedJson2