@register.filter
def format_timesince(timesince: str) -> str:
    """
    Keep only the leading unit of a ``timesince``-style string.

    Everything from the first comma onwards is discarded, so
    "3 days, 4 hours" becomes "3 days". A string without a comma is
    returned unchanged.
    """
    leading_unit, _comma, _remainder = timesince.partition(",")
    return leading_unit
display_name, qualified_name = parse_names(response, properties) child_relations = parse_relations( @@ -412,6 +418,7 @@ def get_dashboard_details(self, urn: str) -> Dashboard: tags = parse_tags(response) glossary_terms = parse_glossary_terms(response) created, modified = parse_created_and_modified(properties) + modified = parse_last_modified(response) name, display_name, qualified_name = parse_names(response, properties) children = parse_relations( RelationshipType.CHILD, [response["relationships"]] diff --git a/lib/datahub-client/data_platform_catalogue/client/graphql/getDatasetDetails.graphql b/lib/datahub-client/data_platform_catalogue/client/graphql/getDatasetDetails.graphql index 050af08b..8b7beee7 100644 --- a/lib/datahub-client/data_platform_catalogue/client/graphql/getDatasetDetails.graphql +++ b/lib/datahub-client/data_platform_catalogue/client/graphql/getDatasetDetails.graphql @@ -155,6 +155,15 @@ query getDatasetDetails($urn: String!) { } } } + runs: runs(start: 0, count: 1, direction: OUTGOING) { + runs { + ... 
on DataProcessInstance { + created { + time + } + } + } + } } } diff --git a/lib/datahub-client/data_platform_catalogue/client/graphql_helpers.py b/lib/datahub-client/data_platform_catalogue/client/graphql_helpers.py index e2d87dfa..fe2a77d7 100644 --- a/lib/datahub-client/data_platform_catalogue/client/graphql_helpers.py +++ b/lib/datahub-client/data_platform_catalogue/client/graphql_helpers.py @@ -1,9 +1,11 @@ from collections import defaultdict -from datetime import datetime, timezone +from datetime import datetime from importlib.resources import files +import logging from typing import Any, Tuple from data_platform_catalogue.entities import ( + Audience, AccessInformation, Column, ColumnRef, @@ -20,6 +22,8 @@ UsageRestrictions, ) +logger = logging.getLogger(__name__) + PROPERTIES_EMPTY_STRING_FIELDS = ("description", "externalUrl") # Note: Data owner is missing as an ownershipType entity in Datahub, but it still seems to be @@ -119,7 +123,7 @@ def parse_last_modified(entity: dict[str, Any]) -> datetime | None: timestamp = entity.get("lastIngested") if timestamp is None: return None - return datetime.fromtimestamp(timestamp / 1000, timezone.utc) + return timestamp def parse_created_and_modified( @@ -132,14 +136,21 @@ def parse_created_and_modified( if modified == 0: modified = None - if created is not None: - created = datetime.fromtimestamp(created / 1000, timezone.utc) - if modified is not None: - modified = datetime.fromtimestamp(modified / 1000, timezone.utc) - return created, modified +def parse_updated( + response: dict[str, Any] +) -> datetime | None: + list_of_runs: list = response.get("runs", {}).get("runs", []) + if not list_of_runs: + updated = None + if list_of_runs: + updated = list_of_runs[0].get("created", {}).get("time", {}) + + return updated + + def parse_tags(entity: dict[str, Any]) -> list[TagRef]: """ Parse tag information into a list of TagRef objects for displaying @@ -166,6 +177,28 @@ def parse_tags(entity: dict[str, Any]) -> 
list[TagRef]: return tags +def get_refresh_period_from_cadet_tags( + tags: list[TagRef], + refresh_schedules: list[str] = ["daily", "weekly", "monthly"] +) -> str: + # Check if any of the tags are refresh period tags eg "daily_opg" + relevant_refresh_schedules = [ + schedule + for tag_ref in tags + for schedule in refresh_schedules + if schedule in tag_ref.display_name + ] + if len(relevant_refresh_schedules) > 1: + logger.warn(f"More than one refresh period tag found: {tags=}") + + if relevant_refresh_schedules: + refresh_schedule = relevant_refresh_schedules[0].capitalize() + return refresh_schedule + + if not relevant_refresh_schedules: + return "" + + def parse_glossary_terms(entity: dict[str, Any]) -> list[GlossaryTermRef]: """ Parse glossary_term information into a list of TagRef for displaying @@ -215,21 +248,18 @@ def parse_properties( access_information = AccessInformation.model_validate(custom_properties_dict) usage_restrictions = UsageRestrictions.model_validate(custom_properties_dict) data_summary = DataSummary.model_validate(custom_properties_dict) + tags = parse_tags(entity) + data_summary.refresh_period = get_refresh_period_from_cadet_tags(tags) + audience = custom_properties_dict.get("audience", "Internal") further_information = FurtherInformation.model_validate(custom_properties_dict) - last_updated_timestamp = properties.get("lastRefreshed") - if last_updated_timestamp: - last_updated_date_str = datetime.fromtimestamp(last_updated_timestamp).strftime( - "%d %B %Y" - ) - data_summary.last_updated = last_updated_date_str - custom_properties = CustomEntityProperties( access_information=access_information, usage_restrictions=usage_restrictions, data_summary=data_summary, further_information=further_information, + audience=audience ) return properties, custom_properties diff --git a/lib/datahub-client/data_platform_catalogue/entities.py b/lib/datahub-client/data_platform_catalogue/entities.py index 002a6ca8..69cee134 100644 --- 
a/lib/datahub-client/data_platform_catalogue/entities.py +++ b/lib/datahub-client/data_platform_catalogue/entities.py @@ -14,6 +14,11 @@ class RelationshipType(Enum): CHILD = "CHILD" +class Audience(Enum): + INTERNAL = "Internal" + PUBLISHED = "Published" + + class EntityRef(BaseModel): """ A reference to another entity in the metadata graph. @@ -323,19 +328,12 @@ class DataSummary(BaseModel): default="", examples=["123", 123], ) - refresh_period: str = Field( description="Indicates the frequency that the data are refreshed/updated", default="", examples=["Annually", "Quarterly", "Monthly", "Weekly", "Daily"], ) - last_updated: str = Field( - description="Indicates the date when the data were last refreshed/updated", - default="", - examples=["05 May 2024", "25 December 2023"], - ) - class CustomEntityProperties(BaseModel): """Custom entity properties not part of DataHub's entity model""" @@ -355,6 +353,13 @@ class CustomEntityProperties(BaseModel): description="Routes to further information about the data", default_factory=FurtherInformation, ) + audience: Audience = Field( + description="If the data is published or not", + default="Internal", + ) + + class Config: + use_enum_values = True class Entity(BaseModel): @@ -520,6 +525,11 @@ class Table(Entity): ] ], ) + last_updated: Optional[datetime] = Field( + description="Indicates the time when the data were last refreshed (eg pipeline run with dbt).", + default=None, + examples=[datetime(2011, 10, 2, 3, 0, 0)], + ) class Chart(Entity): diff --git a/lib/datahub-client/tests/client/datahub/test_datahub_client.py b/lib/datahub-client/tests/client/datahub/test_datahub_client.py index 8858c461..b1a0d051 100644 --- a/lib/datahub-client/tests/client/datahub/test_datahub_client.py +++ b/lib/datahub-client/tests/client/datahub/test_datahub_client.py @@ -13,6 +13,7 @@ ReferencedEntityMissing, ) from data_platform_catalogue.entities import ( + Audience, AccessInformation, Chart, Column, @@ -80,8 +81,8 @@ def database(self): 
) ] }, - last_modified=datetime(2020, 5, 17), - created=datetime(2020, 5, 17), + last_modified=1710426920000, + created=1710426920000, tags=[TagRef(urn="test", display_name="test")], platform=EntityRef(urn="urn:li:dataPlatform:athena", display_name="athena"), custom_properties=CustomEntityProperties( @@ -137,7 +138,7 @@ def table(self): ], ), tags=[TagRef(display_name="some-tag", urn="urn:li:tag:Entity")], - last_modified=datetime(2024, 3, 5, 6, 16, 47, 814000, tzinfo=timezone.utc), + last_modified=1710426920000, created=None, column_details=[ Column( @@ -205,7 +206,7 @@ def table2(self): ], ), tags=[TagRef(display_name="some-tag", urn="urn:li:tag:Entity")], - last_modified=datetime(2024, 3, 5, 6, 16, 47, 814000, tzinfo=timezone.utc), + last_modified=1710426920000, created=None, column_details=[ Column( @@ -331,6 +332,7 @@ def test_get_dataset( }, "lastIngested": 1709619407814, "domain": None, + "provider": "LAA", "schemaMetadata": { "fields": [ { @@ -374,6 +376,7 @@ def test_get_dataset( fully_qualified_name="Foo.Dataset", description="Dataset", relationships={ + RelationshipType.DATA_LINEAGE: [], RelationshipType.PARENT: [ EntitySummary( entity_ref=EntityRef( @@ -389,7 +392,6 @@ def test_get_dataset( entity_type="Database", ) ], - RelationshipType.DATA_LINEAGE: [], }, domain=DomainRef(display_name="", urn=""), governance=Governance( @@ -397,7 +399,8 @@ def test_get_dataset( data_stewards=[], ), tags=[TagRef(display_name="some-tag", urn="urn:li:tag:Entity")], - last_modified=datetime(2024, 3, 5, 6, 16, 47, 814000, tzinfo=timezone.utc), + last_modified=1709619407814, + provider="LAA", created=None, platform=EntityRef(urn="datahub", display_name="datahub"), column_details=[ @@ -479,6 +482,7 @@ def test_get_dataset_minimal_properties( ), data_summary=DataSummary(), further_information=FurtherInformation(), + audience=Audience.INTERNAL, ), column_details=[], ) @@ -538,6 +542,7 @@ def test_get_chart_details(self, datahub_client, base_mock_graph): ), 
data_summary=DataSummary(), further_information=FurtherInformation(), + audience=Audience.INTERNAL, ), external_url="https://data.justice.gov.uk/prisons/public-protection/absconds", ) diff --git a/lib/datahub-client/tests/client/datahub/test_graphql_helpers.py b/lib/datahub-client/tests/client/datahub/test_graphql_helpers.py index fbbc4043..61b998d3 100644 --- a/lib/datahub-client/tests/client/datahub/test_graphql_helpers.py +++ b/lib/datahub-client/tests/client/datahub/test_graphql_helpers.py @@ -1,11 +1,13 @@ from datetime import datetime, timezone +import pytest import pytest from data_platform_catalogue.client.graphql_helpers import ( DATA_CUSTODIAN, _make_user_email_from_urn, _parse_owners_by_type, + get_refresh_period_from_cadet_tags, parse_columns, parse_created_and_modified, parse_data_owner, @@ -14,8 +16,10 @@ parse_relations, parse_subtypes, parse_tags, + parse_updated, ) from data_platform_catalogue.entities import ( + Audience, AccessInformation, Column, ColumnRef, @@ -237,8 +241,8 @@ def test_parse_relations_blank(): ( 1710426920000, {"time": 1710426921000, "actor": "Shakira"}, - datetime(2024, 3, 14, 14, 35, 20, tzinfo=timezone.utc), - datetime(2024, 3, 14, 14, 35, 21, tzinfo=timezone.utc), + 1710426920000, + 1710426921000, ), ( 0, @@ -282,6 +286,7 @@ def test_parse_properties(): {"key": "s3_location", "value": "s3://databucket/"}, {"key": "row_count", "value": 100}, {"key": "Not_IN", "value": "dddd"}, + {"key": "audience", "value": "Internal"}, ], "name": "test", "description": "test description", @@ -310,6 +315,7 @@ def test_parse_properties(): further_information=FurtherInformation( dc_slack_channel_name="test-channel", dc_slack_channel_url="test-url" ), + audience=Audience.INTERNAL, ) @@ -325,6 +331,7 @@ def test_parse_properties_with_none_values(): {"key": "s3_location", "value": "s3://databucket/"}, {"key": "row_count", "value": 100}, {"key": "Not_IN", "value": "dddd"}, + {"key": "audience", "value": "Internal"}, ], "name": "test", 
"description": None, @@ -353,6 +360,7 @@ def test_parse_properties_with_none_values(): ), data_summary=DataSummary(row_count=100), further_information=FurtherInformation(), + audience=Audience.INTERNAL, ) @@ -635,3 +643,31 @@ def test_parse_owners(): display_name="", ), ] + + +def test_parse_updated(): + expected_timestamp = 12345678 + example_with_updated = { + "runs": { + "runs": [ + {"created": {"time": expected_timestamp}} + ] + } + } + example_no_updated = {} + + assert parse_updated(example_with_updated) == expected_timestamp + assert parse_updated(example_no_updated) is None + + +@pytest.mark.parametrize( + "tags, expected_refresh_period", + [ + ([TagRef(display_name="daily_opg", urn="urn:li:tag:daily_opg")], "Daily"), + ([TagRef(display_name="monthly", urn="urn:li:tag:monthly")], "Monthly"), + ([TagRef(display_name="dc_cadet", urn="urn:li:tag:dc_cadet")], ""), + ], +) +def test_get_refresh_period_from_cadet_tags(tags, expected_refresh_period): + refresh_period = get_refresh_period_from_cadet_tags(tags) + assert refresh_period == expected_refresh_period diff --git a/lib/datahub-client/tests/client/datahub/test_search.py b/lib/datahub-client/tests/client/datahub/test_search.py index df78ef7f..93b31dbe 100644 --- a/lib/datahub-client/tests/client/datahub/test_search.py +++ b/lib/datahub-client/tests/client/datahub/test_search.py @@ -5,6 +5,7 @@ from data_platform_catalogue.client.search import SearchClient from data_platform_catalogue.entities import ( + Audience, AccessInformation, DataSummary, EntityRef, @@ -145,7 +146,6 @@ def test_one_search_result(mock_graph, searcher): "s3_location": "", "dc_access_requirements": "", "refresh_period": "", - "last_updated": "", "row_count": "", }, tags=[], @@ -236,7 +236,6 @@ def test_dataset_result(mock_graph, searcher): "s3_location": "", "dc_access_requirements": "", "refresh_period": "", - "last_updated": "", "row_count": "", }, tags=[], @@ -385,7 +384,6 @@ def 
test_2_dataset_results_with_one_malformed_result(mock_graph, searcher): "s3_location": "", "dc_access_requirements": "", "refresh_period": "", - "last_updated": "", "row_count": "", }, tags=[], @@ -480,13 +478,10 @@ def test_full_page(mock_graph, searcher): "s3_location": "", "dc_access_requirements": "", "refresh_period": "", - "last_updated": "", "row_count": "", }, tags=[], - last_modified=datetime( - 2024, 1, 23, 6, 15, 2, 353000, tzinfo=timezone.utc - ), + last_modified=1705990502353, created=None, ), SearchResult( @@ -514,7 +509,6 @@ def test_full_page(mock_graph, searcher): "s3_location": "", "dc_access_requirements": "", "refresh_period": "", - "last_updated": "", "row_count": "", }, tags=[], @@ -546,7 +540,6 @@ def test_full_page(mock_graph, searcher): "s3_location": "", "dc_access_requirements": "", "refresh_period": "", - "last_updated": "", "row_count": "", }, tags=[], @@ -628,7 +621,6 @@ def test_query_match(mock_graph, searcher): "s3_location": "", "dc_access_requirements": "", "refresh_period": "", - "last_updated": "", "row_count": "", }, tags=[], @@ -709,7 +701,6 @@ def test_result_with_owner(mock_graph, searcher): "s3_location": "", "dc_access_requirements": "", "refresh_period": "", - "last_updated": "", "row_count": "", }, tags=[], @@ -1082,7 +1073,6 @@ def test_search_for_charts(mock_graph, searcher): "s3_location": "", "dc_access_requirements": "", "refresh_period": "", - "last_updated": "", "row_count": "", }, tags=[], @@ -1189,6 +1179,7 @@ def test_search_for_container(mock_graph, searcher): "name": "test_db", }, metadata={ + "audience": "Internal", "owner": "Shannon Lovett", "owner_email": "shannon@longtail.com", "domain_name": "testdom", diff --git a/lib/datahub-client/tests/snapshots/test_upsert_table.json b/lib/datahub-client/tests/snapshots/test_upsert_table.json index 59801297..4212f6c1 100644 --- a/lib/datahub-client/tests/snapshots/test_upsert_table.json +++ b/lib/datahub-client/tests/snapshots/test_upsert_table.json @@ -15,12 +15,12 
@@ "dc_access_requirements": "", "row_count": "5", "refresh_period": "", - "last_updated": "", "dc_slack_channel_name": "test-channel", "dc_slack_channel_url": "test-url", "dc_teams_channel_name": "", "dc_teams_channel_url": "", - "dc_team_email": "" + "dc_team_email": "", + "audience": "Internal" }, "name": "Dataset", "qualifiedName": "database.Dataset", @@ -123,4 +123,4 @@ } } } -] +] \ No newline at end of file diff --git a/lib/datahub-client/tests/snapshots/test_upsert_table_and_database.json b/lib/datahub-client/tests/snapshots/test_upsert_table_and_database.json index 27c0ef4c..7c88430b 100644 --- a/lib/datahub-client/tests/snapshots/test_upsert_table_and_database.json +++ b/lib/datahub-client/tests/snapshots/test_upsert_table_and_database.json @@ -28,12 +28,12 @@ "dc_access_requirements": "", "row_count": "", "refresh_period": "", - "last_updated": "", "dc_slack_channel_name": "test-channel", "dc_slack_channel_url": "test-url", "dc_teams_channel_name": "", "dc_teams_channel_url": "", - "dc_team_email": "" + "dc_team_email": "", + "audience": "Internal" }, "name": "my_database", "description": "little test db" @@ -115,12 +115,12 @@ "dc_access_requirements": "", "row_count": "5", "refresh_period": "", - "last_updated": "", "dc_slack_channel_name": "test-channel", "dc_slack_channel_url": "test-url", "dc_teams_channel_name": "", "dc_teams_channel_url": "", - "dc_team_email": "" + "dc_team_email": "", + "audience": "Internal" }, "name": "Dataset", "qualifiedName": "database.Dataset", @@ -223,4 +223,4 @@ } } } -] +] \ No newline at end of file diff --git a/templates/details_base.html b/templates/details_base.html index bea31b7e..9a019690 100644 --- a/templates/details_base.html +++ b/templates/details_base.html @@ -5,6 +5,7 @@ {% load waffle_tags %} {% load i18n %} {% load page_title %} +{% load format_timesince %} {% block title %} {% with details_title=h1_value|add:" - "|add:entity_type %} @@ -75,6 +76,18 @@

{{entity.custom_properties.data_summary.refresh_period}} {% endif %} + {% if entity.custom_properties.audience %} +
  • + Audience: + {{entity.custom_properties.audience}} +
  • + {% endif %} +
  • {% translate "Domain:" %} {{entity.domain.display_name | default:_('Not provided') }} @@ -92,6 +105,9 @@

    {% include "partial/esda_info.html" with is_esda=is_esda %} {% endblock metadata_list %} +
    + Updated {{entity.last_modified|timesince|format_timesince}} ago +
    diff --git a/tests/conftest.py b/tests/conftest.py index bcde26af..d3f2e48d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -374,7 +374,7 @@ def generate_table_metadata( description="some description", ) ], - last_modified=datetime(2024, 3, 5, 6, 16, 47, 814000, tzinfo=timezone.utc), + last_modified=1710426920000, created=None, column_details=[ Column( @@ -435,7 +435,7 @@ def generate_chart_metadata( description="some description", ) ], - last_modified=datetime(2024, 3, 5, 6, 16, 47, 814000, tzinfo=timezone.utc), + last_modified=1710426920000, created=None, platform=EntityRef(urn="urn:li:dataPlatform:athena", display_name="athena"), custom_properties=custom_properties or CustomEntityProperties(), @@ -495,7 +495,7 @@ def generate_database_metadata( description="some description", ) ], - last_modified=datetime(2024, 3, 5, 6, 16, 47, 814000, tzinfo=timezone.utc), + last_modified=1710426920000, created=None, platform=EntityRef(urn="urn:li:dataPlatform:athena", display_name="athena"), custom_properties=custom_properties or CustomEntityProperties(), @@ -553,7 +553,7 @@ def generate_dashboard_metadata( description="some description", ) ], - last_modified=datetime(2024, 3, 5, 6, 16, 47, 814000, tzinfo=timezone.utc), + last_modified=1710426920000, created=None, platform=EntityRef(urn="urn:li:dataPlatform:athena", display_name="athena"), custom_properties=custom_properties or CustomEntityProperties(), diff --git a/tests/home/templatetags/test_format_timesince.py b/tests/home/templatetags/test_format_timesince.py new file mode 100644 index 00000000..33f1c025 --- /dev/null +++ b/tests/home/templatetags/test_format_timesince.py @@ -0,0 +1,12 @@ +import pytest + +from home.templatetags.format_timesince import format_timesince + + +@pytest.mark.parametrize("timesince, expected_result", [ + ("30 seconds", "30 seconds"), + ("1 hour, 45 minutes", "1 hour"), + ("1 day, 6 hours, 45 minutes", "1 day"), +]) +def test_format_timesince(timesince, expected_result): + assert 
format_timesince(timesince) == expected_result