From 2e193f0eddec3f5e422e357da3fc76915c3b15be Mon Sep 17 00:00:00 2001 From: HemanthM005 <mhemanthmanikala@gmail.com> Date: Fri, 16 Feb 2024 19:16:27 +0530 Subject: [PATCH 1/3] Fix: fiscal years and dates temporal coverage issue --- .dockerignore | 1 + app/utils/columns_mapping.py | 4 +- app/utils/temporal_coverage.py | 74 +++++++++++++++++++++++++++------- 3 files changed, 63 insertions(+), 16 deletions(-) diff --git a/.dockerignore b/.dockerignore index bf3a88f..904280c 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,5 +1,6 @@ # Volumes volumes/ +.venv/ # taken from : https://github.com/themattrix/python-pypi-template/blob/master/.dockerignore # Git diff --git a/app/utils/columns_mapping.py b/app/utils/columns_mapping.py index b19d917..2626953 100644 --- a/app/utils/columns_mapping.py +++ b/app/utils/columns_mapping.py @@ -7,7 +7,7 @@ NoteSettings, UnitSettings, ) - +from fastapi.logger import logger datetime_settings = DateTimeSettings() geography_settings = GeographySettings() unit_settings = UnitSettings() @@ -62,7 +62,7 @@ async def find_datetime_columns(columns: set): columns, month_pattern ) date_columns, columns = extract_pattern_from_columns(columns, date_pattern) - + logger.info(f"date_columns: {date_columns}") # filter out `as_on_date` from date columns date_columns = { col for col in date_columns if not as_on_date_pattern.match(col) diff --git a/app/utils/temporal_coverage.py b/app/utils/temporal_coverage.py index 760656c..b22dead 100644 --- a/app/utils/temporal_coverage.py +++ b/app/utils/temporal_coverage.py @@ -1,10 +1,12 @@ import re from itertools import chain from typing import List +import pandas as pd from app.core.config import DateTimeSettings datetime_settings = DateTimeSettings() +from fastapi.logger import logger def convert_to_calender_year(other_year): @@ -76,16 +78,53 @@ def is_sequence(year_mapping): def temporal_coverage_representation(is_sequence, year_mapping): year_values_from_mapping = sorted(year_mapping.keys()) + logger.warning(f"Year Values from Mapping: {year_values_from_mapping}") if len(year_values_from_mapping) == 1: return f"{year_values_from_mapping[0]}" if not is_sequence: + logger.warning(f"Year Mapping: {', '.join(str(year) for year in year_values_from_mapping)}") return ", ".join(str(year) for year in year_values_from_mapping) return f"{year_values_from_mapping[0]} to {year_values_from_mapping[-1]}" +def is_fiscal_check(unique_years): + for year in unique_years: + if "-" not in year: + return False + return True + + +def get_time_periods(years, is_fiscal=False): + if not is_fiscal: + years = sorted(map(int, years)) + years = list(map(str, years)) + time_periods = [] + start_year = years[0] + end_year = years[0] + + for year in years[1:]: + if int(year.split('-')[0]) == int(end_year.split('-')[0]) + 1: + end_year = year + else: + if start_year == end_year: + time_periods.append(start_year) + else: + time_periods.append(f"{start_year} to {end_year}") + start_year = year + end_year = year + + # Add the last time period + if start_year == end_year: + time_periods.append(start_year) + else: + time_periods.append(f"{start_year} to {end_year}") + + return ", ".join(time_periods) + + async def get_temporal_coverage(dataset, mapped_columns: dict): year_columns = ( list(mapped_columns["calender_year"]) @@ -93,24 +132,31 @@ async def get_temporal_coverage(dataset, mapped_columns: dict): + list(mapped_columns["other_year"]) ) year_columns = [year_column for year_column in year_columns if year_column] - - # do operation on the first year column - if len(year_columns) == 0: + date_columns = list(mapped_columns["date"]) + + if len(date_columns) != 0: + date_column = date_columns[0] + # Extract unique years + unique_year_values = pd.to_datetime(dataset[date_column], format='%d-%m-%Y').dt.year.unique() + unique_year_values = [str(year) for year in unique_year_values] + elif len(year_columns) != 0: + year_column = year_columns[0] + unique_year_values = [ + f"{year_val}" for year_val in dataset[year_column].unique() if year_val + ] + else: return {"temporal_coverage": ""} - year_column = year_columns[0] - unique_year_values = [ - f"{year_val}" for year_val in dataset[year_column].unique() if year_val - ] - if not verify_proper_format_of_year_values(unique_year_values): return {"temporal_coverage": ""} + is_fiscal = is_fiscal_check(unique_year_values) + temporal_coverage = get_time_periods(unique_year_values, is_fiscal) + # year_mapping = get_list_mappings(unique_year_values) - year_mapping = get_list_mappings(unique_year_values) + # year_in_sequence = is_sequence(year_mapping) - year_in_sequence = is_sequence(year_mapping) - - temporal_coverage = temporal_coverage_representation( - year_in_sequence, year_mapping - ) + # temporal_coverage = temporal_coverage_representation( + # year_in_sequence, year_mapping + # ) + logger.warning(f"Temporal Coverage: {temporal_coverage}") return {"temporal_coverage": temporal_coverage} From 738fcbdd932d92333375594157fd27abc00b4915 Mon Sep 17 00:00:00 2001 From: HemanthM005 <mhemanthmanikala@gmail.com> Date: Fri, 16 Feb 2024 19:28:32 +0530 Subject: [PATCH 2/3] Fix: fiscal years and dates temporal coverage issue --- .pre-commit-config.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f9d834e..04f428c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,14 +1,14 @@ repos: - repo: https://github.com/psf/black - rev: 22.3.0 + rev: 24.2.0 hooks: - id: black language_version: python3 - repo: https://github.com/pycqa/flake8 - rev: 3.9.0 + rev: 7.0.0 hooks: - id: flake8 - repo: https://github.com/timothycrosley/isort - rev: 5.12.0 + rev: 5.13.2 hooks: - id: isort \ No newline at end of file From ce48cc4f81be43c75066ea96d083fcb52b22e505 Mon Sep 17 00:00:00 2001 From: HemanthM005 <mhemanthmanikala@gmail.com> Date: Fri, 16 Feb 2024 19:36:10 +0530 Subject: [PATCH 3/3] Fix: fiscal years and dates temporal coverage issue --- app/utils/columns_mapping.py | 4 +++- app/utils/spatial_coverage.py | 8 +++++--- app/utils/temporal_coverage.py | 19 +++++++++++++------ 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/app/utils/columns_mapping.py b/app/utils/columns_mapping.py index 2626953..d0c339b 100644 --- a/app/utils/columns_mapping.py +++ b/app/utils/columns_mapping.py @@ -1,13 +1,15 @@ import re from itertools import chain +from fastapi.logger import logger + from app.core.config import ( DateTimeSettings, GeographySettings, NoteSettings, UnitSettings, ) -from fastapi.logger import logger + datetime_settings = DateTimeSettings() geography_settings = GeographySettings() unit_settings = UnitSettings() diff --git a/app/utils/spatial_coverage.py b/app/utils/spatial_coverage.py index 813d518..dd663b8 100644 --- a/app/utils/spatial_coverage.py +++ b/app/utils/spatial_coverage.py @@ -74,9 +74,11 @@ async def get_spatial_coverage(dataset): ] part, whole = ( None, - None - if "country" in ordered_geographic_entity - else geography_settings.DEFAULT_SPATIAL_COVERAGE, + ( + None + if "country" in ordered_geographic_entity + else geography_settings.DEFAULT_SPATIAL_COVERAGE + ), ) for entity in ordered_geographic_entity: diff --git a/app/utils/temporal_coverage.py b/app/utils/temporal_coverage.py index b22dead..7f2ad49 100644 --- a/app/utils/temporal_coverage.py +++ b/app/utils/temporal_coverage.py @@ -1,12 +1,13 @@ import re from itertools import chain from typing import List + import pandas as pd +from fastapi.logger import logger from app.core.config import DateTimeSettings datetime_settings = DateTimeSettings() -from fastapi.logger import logger def convert_to_calender_year(other_year): @@ -84,7 +85,9 @@ def temporal_coverage_representation(is_sequence, year_mapping): return f"{year_values_from_mapping[0]}" if not is_sequence: - logger.warning(f"Year Mapping: {', '.join(str(year) for year in year_values_from_mapping)}") + logger.warning( + f"Year Mapping: {', '.join(str(year) for year in year_values_from_mapping)}" + ) return ", ".join(str(year) for year in year_values_from_mapping) return f"{year_values_from_mapping[0]} to {year_values_from_mapping[-1]}" @@ -106,7 +109,7 @@ def get_time_periods(years, is_fiscal=False): end_year = years[0] for year in years[1:]: - if int(year.split('-')[0]) == int(end_year.split('-')[0]) + 1: + if int(year.split("-")[0]) == int(end_year.split("-")[0]) + 1: end_year = year else: if start_year == end_year: @@ -115,7 +118,7 @@ def get_time_periods(years, is_fiscal=False): time_periods.append(f"{start_year} to {end_year}") start_year = year end_year = year - + # Add the last time period if start_year == end_year: time_periods.append(start_year) @@ -137,12 +140,16 @@ async def get_temporal_coverage(dataset, mapped_columns: dict): if len(date_columns) != 0: date_column = date_columns[0] # Extract unique years - unique_year_values = pd.to_datetime(dataset[date_column], format='%d-%m-%Y').dt.year.unique() + unique_year_values = pd.to_datetime( + dataset[date_column], format="%d-%m-%Y" + ).dt.year.unique() unique_year_values = [str(year) for year in unique_year_values] elif len(year_columns) != 0: year_column = year_columns[0] unique_year_values = [ - f"{year_val}" for year_val in dataset[year_column].unique() if year_val + f"{year_val}" + for year_val in dataset[year_column].unique() + if year_val ] else: return {"temporal_coverage": ""}