diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index aebb248..f9d834e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,10 +5,10 @@ repos: - id: black language_version: python3 - repo: https://github.com/pycqa/flake8 - rev: 4.0.1 + rev: 3.9.0 hooks: - id: flake8 - repo: https://github.com/timothycrosley/isort - rev: 5.9.3 + rev: 5.12.0 hooks: - id: isort \ No newline at end of file diff --git a/app/api/api_v1/routers/meta_data.py b/app/api/api_v1/routers/meta_data.py index f4bcabf..46c885d 100644 --- a/app/api/api_v1/routers/meta_data.py +++ b/app/api/api_v1/routers/meta_data.py @@ -110,11 +110,17 @@ async def list_bucket_objects( else: objects = await get_list_of_s3_objects(s3_resource, s3_bucket, prefix) objects_json = [ - {"key": obj.key, "last_modified": obj.last_modified} + { + "key": obj.key, + "last_modified": obj.last_modified, + "size": obj.size / 1e3, + } for obj in objects if obj.key.endswith(file_format) ] return { "total": len(objects_json), + "file_size": "KB", + "bucket": s3_bucket, "objects": objects_json, } diff --git a/app/core/config.py b/app/core/config.py index 280e850..be43837 100644 --- a/app/core/config.py +++ b/app/core/config.py @@ -28,9 +28,10 @@ class Config: class DateTimeSettings(BaseSettings): - CALENDAR_YEAR_KEYWORD = "year" + CALENDAR_YEAR_KEYWORD = "^year$" FISCAL_YEAR_KEYWORD = "fiscal_year" ACADEMIC_YEAR_KEYWORD = "academic_year" + OTHER_YEAR_KEYWORD = ".*_year" QUARTER_KEYWORD = "quarter" MONTH_KEYWORD = "month" DATE_KEYWORD = "date" @@ -46,7 +47,7 @@ class DateTimeSettings(BaseSettings): 2: ["week"], 3: ["month"], 4: ["quarter"], - 5: ["calender_year", "non_calendar_year"], + 5: ["calender_year", "non_calendar_year", "other_year"], } GRANULARITY_REPRESENTATION = { "date": "Daily", @@ -55,6 +56,7 @@ class DateTimeSettings(BaseSettings): "quarter": "Quarterly", "calender_year": "Yearly", "non_calendar_year": "Yearly", + "other_year": "Yearly", } diff --git a/app/models/meta_data.py b/app/models/meta_data.py index c36b048..9f21a33 100644 --- a/app/models/meta_data.py +++ b/app/models/meta_data.py @@ -11,11 +11,3 @@ class MetaData(BaseModel): spatial_coverage: Optional[str] formats_available: Optional[str] is_public: Optional[bool] - - -""" -[ -"https://storage.factly.org/mande/edu-ministry/data/processed/statistics/1_AISHE_report/1_universities_count_by_state/output.csv", -"https://storage.factly.org/mande/edu-ministry/data/processed/statistics/1_AISHE_report/19_enrolment_foreign/output.csv" -] -""" diff --git a/app/utils/columns_mapping.py b/app/utils/columns_mapping.py index 64fdc20..b19d917 100644 --- a/app/utils/columns_mapping.py +++ b/app/utils/columns_mapping.py @@ -32,6 +32,9 @@ async def find_datetime_columns(columns: set): cal_year_pattern = re.compile( r".*({})".format(datetime_settings.CALENDAR_YEAR_KEYWORD) ) + other_year_pattern = re.compile( + r".*({})".format(datetime_settings.OTHER_YEAR_KEYWORD) + ) quarter_pattern = re.compile( r".*({})".format(datetime_settings.QUARTER_KEYWORD) ) @@ -49,6 +52,9 @@ async def find_datetime_columns(columns: set): year_columns, columns = extract_pattern_from_columns( columns, cal_year_pattern ) + other_year_columns, columns = extract_pattern_from_columns( + columns, other_year_pattern + ) quarter_columns, columns = extract_pattern_from_columns( columns, quarter_pattern ) @@ -61,10 +67,10 @@ async def find_datetime_columns(columns: set): date_columns = { col for col in date_columns if not as_on_date_pattern.match(col) } - return { "non_calendar_year": fiscal_year_columns, "calender_year": year_columns, + "other_year": other_year_columns, "quarter": quarter_columns, "month": month_columns, "date": date_columns, @@ -131,6 +137,7 @@ async def find_mapped_columns(columns): **unit_columns, **note_columns, } + not_mapped_columns = list( set(columns).difference( list(chain.from_iterable(mapped_columns.values())) diff --git a/app/utils/temporal_coverage.py b/app/utils/temporal_coverage.py index 9471f41..760656c 100644 --- a/app/utils/temporal_coverage.py +++ b/app/utils/temporal_coverage.py @@ -87,8 +87,10 @@ def temporal_coverage_representation(is_sequence, year_mapping): async def get_temporal_coverage(dataset, mapped_columns: dict): - year_columns = list(mapped_columns["calender_year"]) + list( - mapped_columns["non_calendar_year"] + year_columns = ( + list(mapped_columns["calender_year"]) + + list(mapped_columns["non_calendar_year"]) + + list(mapped_columns["other_year"]) ) year_columns = [year_column for year_column in year_columns if year_column]