Issue223 #260

Open. Wants to merge 2 commits into base: main.
3 changes: 3 additions & 0 deletions .gitignore
@@ -365,3 +365,6 @@ FodyWeavers.xsd
 # Pytest outputs
 tests/mock_data/mockcsv.csv
 tests/mock_data/output
+.vscode/settings.json
+.vscode/settings.json
+.vscode/settings.json
8 changes: 7 additions & 1 deletion .vscode/settings.json
@@ -1,3 +1,9 @@
 {
-    "python.formatting.provider": "black"
+    "python.formatting.provider": "black",
+    "python.testing.pytestArgs": [
+        "tests"
+    ],
+    "python.testing.unittestEnabled": false,
+    "python.testing.pytestEnabled": true,
+    "python.defaultInterpreterPath": "F:\\Program Files\\python3.9\\python"
 }

[Review comment, Member, on the "python.defaultInterpreterPath" line] Would remove this as it's dependent on the machine config of whoever's running the scripts.
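A minimal sketch of what that suggestion would leave in the file (an assumed resolution, not a commit in this PR): the portable formatter and pytest settings stay, the machine-specific interpreter path goes.

{
    "python.formatting.provider": "black",
    "python.testing.pytestArgs": [
        "tests"
    ],
    "python.testing.unittestEnabled": false,
    "python.testing.pytestEnabled": true
}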
9 changes: 9 additions & 0 deletions csvProcessor.py
@@ -0,0 +1,9 @@
+try:
+    from processor import Processor
+except:
+    from .processor import Processor
+
+
+class csvProcessor(Processor):
+    def __init__(self, type):
+        super().__init__(type)
9 changes: 9 additions & 0 deletions jsonProcessor.py
@@ -0,0 +1,9 @@
+try:
+    from processor import Processor
+except:
+    from .processor import Processor
+
+
+class jsonProcessor(Processor):
+    def __init__(self, type):
+        super().__init__(type)
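Both new files follow the same pattern: try the flat import first (running as a script from the repo root), then fall back to the relative import (running as a package, e.g. under pytest). A hypothetical usage sketch, not part of the PR; it assumes Processor.process() calls get_urls() and then get_datasets() per URL, as the processor.py diff below suggests, and that subclasses populate self.urls.

# Hypothetical illustration only.
from jsonProcessor import jsonProcessor


class ExampleOrgProcessor(jsonProcessor):
    def get_urls(self):
        # Assumption: subclasses fill self.urls with {name: url} pairs.
        self.urls = {"example_org": "https://example.org/datasets"}

    def get_datasets(self, owner, url, fname):
        # A real implementation would fetch `url` and write rows to `fname`.
        print(f"would scrape {url} for {owner} into {fname}")


ExampleOrgProcessor("json").process(file_type="json")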
46 changes: 46 additions & 0 deletions jsonRow.py
@@ -0,0 +1,46 @@
+import json, dataclasses
+from dataclasses import dataclass
+
+@dataclass
+class jsonRow:
+    Title: str = ""
+    Owner: str = ""
+    PageURL: str = ""
+    AssetURL: str = ""
+    FileName: str = ""
+    DateCreated: str = ""
+    DateUpdated: str = ""
+    FileSize: str = ""
+    FileSizeUnit: str = ""
+    FileType: str = ""
+    NumRecords: str = ""
+    OriginalTags: str = ""
+    ManualTags: str = ""
+    License: str = ""
+    Description: str = ""
+
+    def __init__(self):
+        Title = ""
+        Owner = ""
+        PageURL = ""
+        AssetURL = ""
+        FileName = ""
+        DateCreated = ""
+        DateUpdated = ""
+        FileSize = ""
+        FileSizeUnit = ""
+        FileType = ""
+        NumRecords = ""
+        OriginalTags = ""
+        ManualTags = ""
+        License = ""
+        Description = ""
+
+    class EnhancedJSONEncoder(json.JSONEncoder):
+        def default(self, o):
+            if dataclasses.is_dataclass(o):
+                return dataclasses.asdict(o)
+            return super().default(o)
+
+    def toJSON(self):
+        return json.dumps(self, cls=self.EnhancedJSONEncoder)
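Worth flagging for review: the hand-written __init__ shadows the one @dataclass generates and only binds local names, so it is effectively a no-op; instances simply fall back to the class-level defaults, which is why toJSON still works. A minimal sketch of the behaviour (hypothetical values, consistent with tests/jsonRow_test.py below):

from jsonRow import jsonRow

row = jsonRow()
row.Title = "Example title"

# EnhancedJSONEncoder turns the dataclass into a dict via dataclasses.asdict,
# so unset fields serialize as their "" defaults.
print(row.toJSON())
# {"Title": "Example title", "Owner": "", ..., "Description": ""}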
42 changes: 33 additions & 9 deletions merge_data.py
@@ -49,8 +49,8 @@ def main():
         }
     )
     source_scotgov["Source"] = "sparql"
-    #print("DateUpdated " + source_scotgov["DateUpdated"])
-    #print("DateCreated " + source_scotgov["DateCreated"])
+    # print("DateUpdated " + source_scotgov["DateUpdated"])
+    # print("DateCreated " + source_scotgov["DateCreated"])
     try:
         source_scotgov["DateUpdated"] = pd.to_datetime(
             source_scotgov["DateUpdated"], utc=True
@@ -110,7 +110,7 @@ def main():
         source_usmart = pd.concat(
             [
                 source_usmart,
-                pd.read_csv(
+                pd.read_json(
                     folder + r"/" + filename,
                     parse_dates=["DateCreated", "DateUpdated"],
                 ),

[Review comment, Member, on the parse_dates line] This line caused an error for me. I think it needs to be convert_dates instead.
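A minimal sketch of the reviewer's suggested fix (the suggestion, not merged code): pandas' read_json has no parse_dates keyword, that belongs to read_csv, so the switch to read_json needs convert_dates for the date columns. The path below is hypothetical.

import pandas as pd

# read_json parses dates via convert_dates; passing read_csv's
# parse_dates keyword to read_json raises a TypeError.
df = pd.read_json(
    "data/USMART/example.json",  # hypothetical path for illustration
    convert_dates=["DateCreated", "DateUpdated"],
)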
@@ -165,19 +165,43 @@ def main():
     # From Scottish Parliament
     print("\tMerging Scottish Parliament...")
     path = "data/bespoke_ScottishParliament/Scottish Parliament.json"
-    scottish_parliament_scraped = pd.read_json(path, convert_dates=["dateCreated", "dateUpdated"])
+    scottish_parliament_scraped = pd.read_json(
+        path, convert_dates=["dateCreated", "dateUpdated"]
+    )
 
     for index, row in scottish_parliament_scraped.iterrows():
         resources = pd.json_normalize(row["resources"])
         for resource_index, resource_row in resources.iterrows():
             # TEMP FIX: Need to do this mapping until we modify the merged_output.json schema to support nesting resources inside each dataset entry
             source_scraped = pd.concat(
-                [source_scraped, pd.DataFrame.from_records([{"Title": row["title"], "Owner": row["owner"], "PageURL": row["pageURL"], "AssetURL": resource_row["assetUrl"], "DateCreated": row["dateCreated"], "DateUpdated": row["dateUpdated"], "FileSize": resource_row["fileSize"], "FileType": resource_row["fileType"], "NumRecords": resource_row["numRecords"], "OriginalTags": row["tags"], "ManualTags" : row["tags"], "License": row["licence"], "Description": row["description"], "FileName": resource_row["fileName"]}])]
+                [
+                    source_scraped,
+                    pd.DataFrame.from_records(
+                        [
+                            {
+                                "Title": row["title"],
+                                "Owner": row["owner"],
+                                "PageURL": row["pageURL"],
+                                "AssetURL": resource_row["assetUrl"],
+                                "DateCreated": row["dateCreated"],
+                                "DateUpdated": row["dateUpdated"],
+                                "FileSize": resource_row["fileSize"],
+                                "FileType": resource_row["fileType"],
+                                "NumRecords": resource_row["numRecords"],
+                                "OriginalTags": row["tags"],
+                                "ManualTags": row["tags"],
+                                "License": row["licence"],
+                                "Description": row["description"],
+                                "FileName": resource_row["fileName"],
+                            }
+                        ]
+                    ),
+                ]
+            )
 
     source_scraped["Source"] = "Web Scraped"
     # endregion
 
     ### Combine all data into single table
     print("Concatenating all")
     data = pd.concat(
@@ -191,7 +215,7 @@ def main():
         ]
     )
     data = data.reset_index(drop=True)
-
+    print(f"Output untidy {dt.now()}")
     ### Saves copy of data without cleaning - for analysis purposes
     data.to_json("data/merged_output_untidy.json", orient="records", date_format="iso")
@@ -395,7 +419,7 @@ def tidy_licence(licence_name):
         "Other (Public Domain)": "Public Domain",
         "Public Domain": "Public Domain",
         "Public Sector End User Licence (Scotland)": "Public Sector End User Licence (Scotland)",
-        "Scottish Parliament Copyright Policy": "Scottish Parliament Copyright Policy"
+        "Scottish Parliament Copyright Policy": "Scottish Parliament Copyright Policy",
     }
 
     for key in known_licences.keys():
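The TEMP FIX above flattens each dataset's nested resources list into one output row per resource. A toy illustration of that shape change (hypothetical data, not from the PR):

import pandas as pd

# One dataset holding two nested resources...
dataset = {
    "title": "Example dataset",
    "resources": [
        {"fileName": "a.csv", "fileType": "CSV"},
        {"fileName": "b.json", "fileType": "JSON"},
    ],
}

# ...becomes one flat row per resource after json_normalize, with the
# dataset-level fields copied onto every row (roughly what the loop does).
resources = pd.json_normalize(dataset["resources"])
flat = resources.assign(Title=dataset["title"])
print(flat)
#   fileName fileType            Title
# 0    a.csv      CSV  Example dataset
# 1   b.json     JSON  Example dataset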
18 changes: 14 additions & 4 deletions processor.py
@@ -4,6 +4,14 @@
 import csv
 import json
 import os
+import dataclasses
+
+
+class EnhancedJSONEncoder(json.JSONEncoder):
+    def default(self, o):
+        if dataclasses.is_dataclass(o):
+            return dataclasses.asdict(o)
+        return super().default(o)
 
 
 class Processor:
@@ -107,16 +115,18 @@ def write_csv(self, fname, prepped):
                 r[-1] = r[-1].replace("\n", " ")
                 w.writerow(r)
 
     def write_json(self, fname, prepped):
         with open(fname, "w", encoding="utf8") as json_file:
-            json.dump(prepped, json_file, indent=4)
+            json.dump(prepped, json_file, indent=4, cls=EnhancedJSONEncoder)
 
     def get_datasets(self, owner, url, fname):
         print("Override this method")
 
-    def process(self, file_type = "csv"):
+    def process(self, file_type="csv"):
         self.get_urls()
 
         for name, url in self.urls.items():
             print(name)
-            self.get_datasets(name, url, os.path.join("data", self.type, f"{name}.{file_type}"))
+            self.get_datasets(
+                name, url, os.path.join("data", self.type, f"{name}.{file_type}")
+            )
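A minimal sketch of what the encoder change buys (hypothetical call, not in the PR; it assumes Processor's constructor takes the type string, as the subclass diffs above suggest): write_json can now be handed dataclass instances directly.

from jsonRow import jsonRow
from processor import Processor

row = jsonRow()
row.Title = "Example title"

# EnhancedJSONEncoder converts dataclass instances to dicts on the fly,
# so prepped rows no longer need hand-conversion before json.dump.
Processor("json").write_json("example_output.json", [row])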
39 changes: 39 additions & 0 deletions tests/conftest.py
@@ -60,6 +60,13 @@ def is_valid_tags(str_to_check):
     return is_valid_string(str_to_check)
 
 
+def is_valid_tags_json(tags_to_check):
+    for thistag in tags_to_check:
+        if not is_valid_string(thistag):
+            return False
+    return True
+
+
 def is_valid_licence(str_to_check):
     licences = [
         "http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/",
@@ -110,3 +117,35 @@ def csv_checker(csv_file):
             assert csv_testers[col_idx]["test_func"](cell)
             # result = False
     return result
+
+
+def json_checker(json_file):
+    result = True
+
+    json_testers = {
+        "Title": is_valid_string,
+        "Owner": is_valid_string,
+        "PageURL": is_valid_url,
+        "AssetURL": is_valid_url,
+        "FileName": is_valid_filename,
+        "DateCreated": is_valid_date,
+        "DateUpdated": is_valid_date,
+        "FileSize": is_valid_number,
+        "FileSizeUnit": is_valid_file_size_unit,
+        "FileType": is_valid_file_type,
+        "NumRecords": is_valid_number,
+        "OriginalTags": is_valid_tags_json,
+        "ManualTags": is_valid_tags_json,
+        "License": is_valid_licence,
+        "Description": is_valid_string,
+    }
+
+    header_row = 0
+    for row in json_file:
+        for attribute_key in row.keys():
+            this_test_function = json_testers[attribute_key]
+            this_test_value = row[attribute_key]
+            test_result = this_test_function(this_test_value)
+            debug_output = f"{this_test_function.__name__}( {this_test_value} )"
+            assert test_result, debug_output
+    return result
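A hypothetical sketch of how a test might drive json_checker (not in the PR; it assumes the conftest helpers are importable, which the package-relative imports elsewhere in tests/ suggest):

import json

from .conftest import json_checker


def test_merged_json_output():
    # Hypothetical fixture path for illustration.
    with open("tests/mock_data/example_output.json", encoding="utf8") as f:
        rows = json.load(f)
    # json_checker asserts per attribute internally, with a debug message
    # naming the failing validator and value.
    assert json_checker(rows)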
50 changes: 50 additions & 0 deletions tests/jsonRow_test.py
@@ -0,0 +1,50 @@
+import pytest
+from ..jsonRow import jsonRow
+
+
+def test_json_conversion():
+    # Test 1 - every attribute populated
+
+    example_row1 = jsonRow()
+
+    example_row1.Title = "Example title"
+    example_row1.Owner = "Example owner"
+    example_row1.PageURL = "Example original dataset link"
+    example_row1.AssetURL = "Example resource link"
+    example_row1.FileName = "Example filename"
+    example_row1.DateCreated = "05/Nov/2023"
+    example_row1.DateUpdated = "05/Dec/2023"
+    example_row1.FileSize = "Example size"
+    example_row1.FileSizeUnit = "Example size unit"
+    example_row1.FileType = "Example file type"
+    example_row1.NumRecords = "Example num records"
+    example_row1.OriginalTags = "Example tags"
+    example_row1.ManualTags = "Example manual tags"
+    example_row1.License = "Example licence"
+    example_row1.Description = "Example description"
+
+    outputJson1 = example_row1.toJSON()
+
+    expectedJson1 = '{"Title": "Example title", "Owner": "Example owner", "PageURL": "Example original dataset link", "AssetURL": "Example resource link", "FileName": "Example filename", "DateCreated": "05/Nov/2023", "DateUpdated": "05/Dec/2023", "FileSize": "Example size", "FileSizeUnit": "Example size unit", "FileType": "Example file type", "NumRecords": "Example num records", "OriginalTags": "Example tags", "ManualTags": "Example manual tags", "License": "Example licence", "Description": "Example description"}'
+
+    assert outputJson1 == expectedJson1
+
+    # Test 2 - some attributes left as defaults (blank)
+
+    example_row2 = jsonRow()
+
+    example_row2.Title = "Example title"
+    example_row2.Owner = "Example owner"
+    example_row2.PageURL = "Example original dataset link"
+    example_row2.AssetURL = "Example resource link"
+    example_row2.FileType = "Example file type"
+    example_row2.OriginalTags = "Example tags"
+    example_row2.ManualTags = "Example manual tags"
+    example_row2.License = "Example licence"
+    example_row2.Description = "Example description"
+
+    outputJson2 = example_row2.toJSON()
+
+    expectedJson2 = '{"Title": "Example title", "Owner": "Example owner", "PageURL": "Example original dataset link", "AssetURL": "Example resource link", "FileName": "", "DateCreated": "", "DateUpdated": "", "FileSize": "", "FileSizeUnit": "", "FileType": "Example file type", "NumRecords": "", "OriginalTags": "Example tags", "ManualTags": "Example manual tags", "License": "Example licence", "Description": "Example description"}'
+
+    assert outputJson2 == expectedJson2