Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix: fiscal years and dates temporal coverage issue #22

Merged
merged 3 commits into from
Feb 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Volumes
volumes/
.venv/

# taken from : https://github.com/themattrix/python-pypi-template/blob/master/.dockerignore
# Git
Expand Down
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
repos:
- repo: https://github.com/psf/black
rev: 22.3.0
rev: 24.2.0
hooks:
- id: black
language_version: python3
- repo: https://github.com/pycqa/flake8
rev: 3.9.0
rev: 7.0.0
hooks:
- id: flake8
- repo: https://github.com/timothycrosley/isort
rev: 5.12.0
rev: 5.13.2
hooks:
- id: isort
4 changes: 3 additions & 1 deletion app/utils/columns_mapping.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import re
from itertools import chain

from fastapi.logger import logger

from app.core.config import (
DateTimeSettings,
GeographySettings,
Expand Down Expand Up @@ -62,7 +64,7 @@ async def find_datetime_columns(columns: set):
columns, month_pattern
)
date_columns, columns = extract_pattern_from_columns(columns, date_pattern)

logger.info(f"date_columns: {date_columns}")
# filter out `as_on_date` from date columns
date_columns = {
col for col in date_columns if not as_on_date_pattern.match(col)
Expand Down
8 changes: 5 additions & 3 deletions app/utils/spatial_coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,11 @@ async def get_spatial_coverage(dataset):
]
part, whole = (
None,
None
if "country" in ordered_geographic_entity
else geography_settings.DEFAULT_SPATIAL_COVERAGE,
(
None
if "country" in ordered_geographic_entity
else geography_settings.DEFAULT_SPATIAL_COVERAGE
),
)

for entity in ordered_geographic_entity:
Expand Down
81 changes: 67 additions & 14 deletions app/utils/temporal_coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
from itertools import chain
from typing import List

import pandas as pd
from fastapi.logger import logger

from app.core.config import DateTimeSettings

datetime_settings = DateTimeSettings()
Expand Down Expand Up @@ -76,41 +79,91 @@ def is_sequence(year_mapping):

def temporal_coverage_representation(is_sequence, year_mapping):
    """Render the years of ``year_mapping`` as a human-readable coverage string.

    Args:
        is_sequence: Truthy when the years form one consecutive run, in which
            case the result is ``"<first> to <last>"``; otherwise every year
            is listed, separated by ``", "``.
        year_mapping: Mapping whose keys are the year values; only the keys
            are read, and they are sorted before formatting.

    Returns:
        The single year as a string when there is exactly one key, a
        comma-separated list or a ``"<first> to <last>"`` range otherwise,
        and ``""`` when the mapping has no keys.
    """
    year_values_from_mapping = sorted(year_mapping.keys())
    # Lazy %-style arguments: the message is only formatted if it is emitted.
    logger.warning("Year Values from Mapping: %s", year_values_from_mapping)

    if len(year_values_from_mapping) == 1:
        return f"{year_values_from_mapping[0]}"

    if not is_sequence:
        # Build the listing once and reuse it for both the log and the result.
        listed_years = ", ".join(str(year) for year in year_values_from_mapping)
        logger.warning("Year Mapping: %s", listed_years)
        return listed_years

    if not year_values_from_mapping:
        # An empty mapping has no first/last year to index; report no coverage
        # instead of raising IndexError.
        return ""

    return f"{year_values_from_mapping[0]} to {year_values_from_mapping[-1]}"


def is_fiscal_check(unique_years):
    """Return True when every year label contains a ``"-"``.

    A label such as ``"2019-20"`` is treated as a fiscal year; a single label
    without a hyphen makes the whole collection non-fiscal.

    Args:
        unique_years: Iterable of year labels.  Values are converted with
            ``str`` before the check, so non-string years (e.g. ints) are
            accepted rather than raising ``TypeError``.

    Returns:
        ``True`` if all labels contain ``"-"`` (also for an empty iterable),
        ``False`` otherwise.
    """
    return all("-" in str(year) for year in unique_years)


def get_time_periods(years, is_fiscal=False):
    """Collapse year labels into a comma-separated list of periods.

    Consecutive years are merged into ``"<first> to <last>"`` ranges, while
    isolated years are emitted on their own; for example
    ``["2019", "2020", "2022"]`` becomes ``"2019 to 2020, 2022"``.

    Args:
        years: Iterable of year labels.  Calendar years must be convertible
            with ``int``.  Fiscal labels (e.g. ``"2019-20"``) are ordered and
            compared by the integer before the first ``"-"``.
        is_fiscal: ``True`` when the labels are fiscal-year strings, which are
            kept verbatim in the output instead of being normalised via ``int``.

    Returns:
        The joined period string, or ``""`` when ``years`` is empty.

    Raises:
        ValueError: If the year part of a label is not an integer.
    """

    def start_of(label):
        # The year a label starts in: "2019-20" -> 2019, "2019" -> 2019.
        return int(label.split("-")[0])

    # Duplicates are dropped and both kinds of label are sorted so that the
    # result does not depend on the order the values appeared in the data.
    if is_fiscal:
        labels = sorted(
            {str(year) for year in years},
            key=lambda label: (start_of(label), label),
        )
    else:
        labels = [str(value) for value in sorted({int(year) for year in years})]

    if not labels:
        return ""

    # Each run is a [first, last] pair of labels covering consecutive years.
    runs = [[labels[0], labels[0]]]
    for label in labels[1:]:
        if start_of(label) == start_of(runs[-1][1]) + 1:
            runs[-1][1] = label
        else:
            runs.append([label, label])

    return ", ".join(
        first if first == last else f"{first} to {last}" for first, last in runs
    )


# Build the {"temporal_coverage": <str>} result for a dataset from its mapped columns.
# NOTE(review): this span comes from a rendered diff view (the file header above
# reports "67 additions & 14 deletions"); removed and added lines appear interleaved
# without +/- markers and with indentation stripped. Confirm statement order and
# which lines survive against the merged file before relying on it.
async def get_temporal_coverage(dataset, mapped_columns: dict):
# Gather candidate year columns from the three year buckets of the mapping
# (the "calender_year" key is spelled this way in the mapping and must match).
year_columns = (
list(mapped_columns["calender_year"])
+ list(mapped_columns["non_calendar_year"])
+ list(mapped_columns["other_year"])
)
# Drop falsy entries (e.g. empty column names).
year_columns = [year_column for year_column in year_columns if year_column]

# do operation on the first year column
if len(year_columns) == 0:
date_columns = list(mapped_columns["date"])

# A date column, when present, is checked before any year column; only the
# first date column is used.
if len(date_columns) != 0:
date_column = date_columns[0]
# Extract unique years
# Dates are parsed with the fixed day-month-year format "%d-%m-%Y".
unique_year_values = pd.to_datetime(
dataset[date_column], format="%d-%m-%Y"
).dt.year.unique()
unique_year_values = [str(year) for year in unique_year_values]
elif len(year_columns) != 0:
# Otherwise use the unique non-falsy values of the first year column, as strings.
year_column = year_columns[0]
unique_year_values = [
f"{year_val}"
for year_val in dataset[year_column].unique()
if year_val
]
else:
# Neither a date column nor a year column: report empty coverage.
return {"temporal_coverage": ""}

# NOTE(review): the next four lines repeat the year-column extraction in a
# one-line form — presumably the pre-change version of the branch above; verify.
year_column = year_columns[0]
unique_year_values = [
f"{year_val}" for year_val in dataset[year_column].unique() if year_val
]

# Values rejected by verify_proper_format_of_year_values (defined outside this
# view) yield empty coverage.
if not verify_proper_format_of_year_values(unique_year_values):
return {"temporal_coverage": ""}
# Coverage string is produced by get_time_periods, with fiscal labels detected
# by is_fiscal_check.
is_fiscal = is_fiscal_check(unique_year_values)
temporal_coverage = get_time_periods(unique_year_values, is_fiscal)
# year_mapping = get_list_mappings(unique_year_values)

# NOTE(review): the mapping/sequence statements below appear both live and
# commented out — presumably the removed and added sides of the diff; verify.
year_mapping = get_list_mappings(unique_year_values)
# year_in_sequence = is_sequence(year_mapping)

year_in_sequence = is_sequence(year_mapping)

temporal_coverage = temporal_coverage_representation(
year_in_sequence, year_mapping
)
# temporal_coverage = temporal_coverage_representation(
# year_in_sequence, year_mapping
# )
logger.warning(f"Temporal Coverage: {temporal_coverage}")
return {"temporal_coverage": temporal_coverage}
Loading