diff --git a/.gitignore b/.gitignore index 9f29902..b9a78fa 100644 --- a/.gitignore +++ b/.gitignore @@ -109,3 +109,4 @@ ENV/ notebooks/data/ docs/notebooks data/all_daily_data.tar.gz +.history/ diff --git a/.history/task_geo/data_sources/noaa/__main___20200428005757.py b/.history/task_geo/data_sources/noaa/__main___20200428005757.py deleted file mode 100644 index 9b84dda..0000000 --- a/.history/task_geo/data_sources/noaa/__main___20200428005757.py +++ /dev/null @@ -1,34 +0,0 @@ -import argparse - -from ftp import download_noaa_files, process_noaa_data - - -def get_argparser(): - parser = argparse.ArgumentParser() - parser.add_argument( - '-d', '--download', - action='store_true', help="Wheter download or not the files.") - - parser.add_argument( - '-o', '--output', required=True, - help='Destination file to store the processed dataset.') - - parser.add_argument( - '-c', '--countries', required=True, - nargs='?', help='FIPS Country codes to select data for.') - - return parser - - -def main(): - parser = get_argparser() - args = parser.parse_args() - if args.download: - download_noaa_files() - else: - dataset = process_noaa_data(args.countries) - dataset.to_csv(args.output, index=False, header=True) - - -if __name__ == '__main__': - main() diff --git a/.history/task_geo/data_sources/noaa/__main___20200428031132.py b/.history/task_geo/data_sources/noaa/__main___20200428031132.py deleted file mode 100644 index 119fb10..0000000 --- a/.history/task_geo/data_sources/noaa/__main___20200428031132.py +++ /dev/null @@ -1,34 +0,0 @@ -import argparse - -from ftp_connector import download_noaa_files, process_noaa_data - - -def get_argparser(): - parser = argparse.ArgumentParser() - parser.add_argument( - '-d', '--download', - action='store_true', help="Wheter download or not the files.") - - parser.add_argument( - '-o', '--output', required=True, - help='Destination file to store the processed dataset.') - - parser.add_argument( - '-c', '--countries', required=True, - nargs='?', help='FIPS Country codes to select data for.') - - return parser - - -def main(): - parser = get_argparser() - args = parser.parse_args() - if args.download: - download_noaa_files() - else: - dataset = process_noaa_data(args.countries) - dataset.to_csv(args.output, index=False, header=True) - - -if __name__ == '__main__': - main() diff --git a/.history/task_geo/data_sources/noaa/noaa_api_connector_20200428010121.py b/.history/task_geo/data_sources/noaa/noaa_api_connector_20200428010121.py deleted file mode 100644 index 61502b0..0000000 --- a/.history/task_geo/data_sources/noaa/noaa_api_connector_20200428010121.py +++ /dev/null @@ -1,209 +0,0 @@ -"""Connector to the NOAA API. - - -Contributors: - -The journal article describing GHCN-Daily is: -Menne, M.J., I. Durre, R.S. Vose, B.E. Gleason, and T.G. Houston, 2012: An overview -of the Global Historical Climatology Network-Daily Database. Journal of Atmospheric -and Oceanic Technology, 29, 897-910, doi:10.1175/JTECH-D-11-00103.1. - -To acknowledge the specific version of the dataset used, please cite: -Menne, M.J., I. Durre, B. Korzeniewski, S. McNeal, K. Thomas, X. Yin, S. Anthony, R. Ray, -R.S. Vose, B.E.Gleason, and T.G. Houston, 2012: Global Historical Climatology Network - -Daily (GHCN-Daily), Version 3.26 -NOAA National Climatic Data Center. http://doi.org/10.7289/V5D21VHZ [2020/03/30]. -""" - -import logging -import os -from datetime import datetime - -import pandas as pd -import requests - -from task_geo.data_sources.noaa.ftp_connector import download_noaa_files -from task_geo.data_sources.noaa.references import ( - COUNTRY_AND_TERRITORY_CODES, DATA_DIRECTORY, TERRITORY_ACTIVE_STATIONS_MAP, load_dataset) - -logging.basicConfig(level=logging.DEBUG) -logging.getLogger("urllib3").setLevel(logging.WARNING) - - -DEFAULT_METRICS = ['TMAX', 'TMIN', 'TAVG', 'PCRP', 'SNOW', 'SNWD', 'PSUN', 'TSUN'] - - -def get_stations_by_country(country): - """Get all stations for a given country code. - - Arguments: - country(str) - - Returns: - list[str] - """ - - territory_codes = COUNTRY_AND_TERRITORY_CODES.get(country) - if territory_codes is None: - raise ValueError('Wrong country code %s', country) - - stations = list() - for code in territory_codes: - code_stations = TERRITORY_ACTIVE_STATIONS_MAP.get(code) - if code_stations is not None: - stations.extend(code_stations) - - return stations - - -def get_request_urls(country, start_date, end_date=None, metrics=None): - """Encodes the parameters the URL to make a GET request - - Arguments: - country(str): FIPS Country code - start_date(datetime) - end_date(datetime): Defaults to today - metrics(list[str]): Optional.List of metrics to retrieve,valid values are: - TMIN: Minimum temperature. - TMAX: Maximum temperature. - TAVG: Average of temperature. - SNOW: Snowfall (mm). - SNWD: Snow depth (mm). - PRCP: Precipitation. - PSUN: Daily percent of possible sunshine (percent) - TSUN: Daily total sunshine (minutes) - - Returns: - str - """ - - base_url = 'https://www.ncei.noaa.gov/access/services/data/v1?dataset=daily-summaries' - max_stations_req = 50 - - if metrics is None: - metrics = DEFAULT_METRICS - - request_common_args = ( - f'&format=json' - f'&units=metric' - f'&dataTypes={",".join(metrics)}' - ) - - if end_date is None: - end_date = datetime.now() - - start = start_date.date().isoformat() - end = end_date.date().isoformat() - - stations_list = get_stations_by_country(country) - inventory_data = load_dataset('inventory') - inventory_data = inventory_data[inventory_data.start_date >= start_date.year] - inventory_data = inventory_data[inventory_data.end_date <= end_date.year] - stations_list = [x for x in stations_list if x in inventory_data.ID] - if len(stations_list) < max_stations_req: - stations = ','.join(stations_list) - return [ - f'{base_url}&stations={stations}&startDate={start}&endDate={end}{request_common_args}'] - - else: - chunked_station_list = [ - stations_list[i:i + max_stations_req] - for i in range(0, len(stations_list), max_stations_req) - ] - - return [ - ( - f'{base_url}&stations={",".join(chunk)}&startDate={start}' - f'&endDate={end}{request_common_args}' - ) - for chunk in chunked_station_list - ] - - -def get_parse_response(urls): - """Calls the urls in urls, return responses and errors - - Arguments: - urls(list[str]): Urls as generated by `get_request_urls`. - - Returns: - tuple[list[dict], list[Exception]]: - The first element of the tuple is a list of dictionary with all the responses. - The second element is a list with all the exceptions raised during the calls. - """ - - results = list() - errors = list() - - total = len(urls) - 1 - for i, url in enumerate(urls): - logging.debug('Making request %s / %s', i + 1, total + 1) - response = requests.get(url) - try: - response.raise_for_status() - except requests.exceptions.HTTPError: - errors.append({ - 'url': url, - 'error': response.json(), - }) - continue - - results.extend(response.json()) - - return results, errors - - -def noaa_api_connector(countries, start_date, end_date=None, metrics=None): - """Get data from NOAA API. - - Arguments: - countries(list[str]): List of FIPS country codes to retrieve. - start_date(datetime) - end_date(datetime) - metrics(list[str]): Optional.List of metrics to retrieve,valid values are: - TMIN: Minimum temperature. - TMAX: Maximum temperature. - TAVG: Average of temperature. - SNOW: Snowfall (mm). - SNWD: Snow depth (mm). - PRCP: Precipitation. - PSUN: Daily percent of possible sunshine (percent) - TSUN: Daily total sunshine (minutes) - - Returns: - tuple[list[dict], list[Exception]] - """ - if not os.path.isfile(f'{DATA_DIRECTORY}/stations_metadata.txt'): - download_noaa_files(large_files=False) - - result = list() - for country in countries: - logging.info('Requesting data for %s', country) - urls = get_request_urls(country, start_date, end_date, metrics) - country_results, errors = get_parse_response(urls) - - if errors: - logging.info('The following errors where found during the operation:') - for error in errors: - logging.info(error) - - result.extend(country_results) - - data = pd.DataFrame(result) - stations = load_dataset('stations') - data = data.merge(stations, how='left', left_on='STATION', right_on='ID') - - del data['ID'] - del data['STATE'] - - columns = [ - 'DATE', 'STATION', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'NAME', - 'GSN FLAG', 'HCN/CRN FLAG', 'WMO ID' - ] - - if metrics is None: - metrics = DEFAULT_METRICS - - columns.extend([metric for metric in metrics if metric in data.columns]) - - return data[columns] diff --git a/.history/task_geo/data_sources/noaa/noaa_api_connector_20200428030209.py b/.history/task_geo/data_sources/noaa/noaa_api_connector_20200428030209.py deleted file mode 100644 index 20e35eb..0000000 --- a/.history/task_geo/data_sources/noaa/noaa_api_connector_20200428030209.py +++ /dev/null @@ -1,209 +0,0 @@ -"""Connector to the NOAA API. - - -Contributors: - -The journal article describing GHCN-Daily is: -Menne, M.J., I. Durre, R.S. Vose, B.E. Gleason, and T.G. Houston, 2012: An overview -of the Global Historical Climatology Network-Daily Database. Journal of Atmospheric -and Oceanic Technology, 29, 897-910, doi:10.1175/JTECH-D-11-00103.1. - -To acknowledge the specific version of the dataset used, please cite: -Menne, M.J., I. Durre, B. Korzeniewski, S. McNeal, K. Thomas, X. Yin, S. Anthony, R. Ray, -R.S. Vose, B.E.Gleason, and T.G. Houston, 2012: Global Historical Climatology Network - -Daily (GHCN-Daily), Version 3.26 -NOAA National Climatic Data Center. http://doi.org/10.7289/V5D21VHZ [2020/03/30]. -""" - -import logging -import os -from datetime import datetime - -import pandas as pd -import requests - -from task_geo.data_sources.noaa.ftp_connector import download_noaa_files -from task_geo.data_sources.noaa.references import ( - COUNTRY_AND_TERRITORY_CODES, DATA_DIRECTORY, TERRITORY_ACTIVE_STATIONS_MAP, load_dataset) - -logging.basicConfig(level=logging.DEBUG) -logging.getLogger("urllib3").setLevel(logging.WARNING) - - -DEFAULT_METRICS = ['TMAX', 'TMIN', 'TAVG', 'PCRP', 'SNOW', 'SNWD', 'PSUN', 'TSUN'] - - -def get_stations_by_country(country): - """Get all stations for a given country code. - - Arguments: - country(str) - - Returns: - list[str] - """ - - territory_codes = COUNTRY_AND_TERRITORY_CODES.get(country) - if territory_codes is None: - raise ValueError('Wrong country code %s', country) - - stations = list() - for code in territory_codes: - code_stations = TERRITORY_ACTIVE_STATIONS_MAP.get(code) - if code_stations is not None: - stations.extend(code_stations) - - return stations - - -def get_request_urls(country, start_date, end_date=None, metrics=None): - """Encodes the parameters the URL to make a GET request - - Arguments: - country(str): FIPS Country code - start_date(datetime) - end_date(datetime): Defaults to today - metrics(list[str]): Optional.List of metrics to retrieve,valid values are: - TMIN: Minimum temperature. - TMAX: Maximum temperature. - TAVG: Average of temperature. - SNOW: Snowfall (mm). - SNWD: Snow depth (mm). - PRCP: Precipitation. - PSUN: Daily percent of possible sunshine (percent) - TSUN: Daily total sunshine (minutes) - - Returns: - str - """ - - base_url = 'https://www.ncei.noaa.gov/access/services/data/v1?dataset=daily-summaries' - max_stations_req = 50 - - if metrics is None: - metrics = DEFAULT_METRICS - - request_common_args = ( - f'&format=json' - f'&units=metric' - f'&dataTypes={",".join(metrics)}' - ) - - if end_date is None: - end_date = datetime.now() - - start = start_date.date().isoformat() - end = end_date.date().isoformat() - - stations_list = get_stations_by_country(country) - inventory_data = load_dataset('inventory') - inventory_data = inventory_data[inventory_data.end_date >= start_date.year] - inventory_data = inventory_data[inventory_data.start_date <= end_date.year] - stations_list = [x for x in stations_list if x in inventory_data.ID] - if len(stations_list) < max_stations_req: - stations = ','.join(stations_list) - return [ - f'{base_url}&stations={stations}&startDate={start}&endDate={end}{request_common_args}'] - - else: - chunked_station_list = [ - stations_list[i:i + max_stations_req] - for i in range(0, len(stations_list), max_stations_req) - ] - - return [ - ( - f'{base_url}&stations={",".join(chunk)}&startDate={start}' - f'&endDate={end}{request_common_args}' - ) - for chunk in chunked_station_list - ] - - -def get_parse_response(urls): - """Calls the urls in urls, return responses and errors - - Arguments: - urls(list[str]): Urls as generated by `get_request_urls`. - - Returns: - tuple[list[dict], list[Exception]]: - The first element of the tuple is a list of dictionary with all the responses. - The second element is a list with all the exceptions raised during the calls. - """ - - results = list() - errors = list() - - total = len(urls) - 1 - for i, url in enumerate(urls): - logging.debug('Making request %s / %s', i + 1, total + 1) - response = requests.get(url) - try: - response.raise_for_status() - except requests.exceptions.HTTPError: - errors.append({ - 'url': url, - 'error': response.json(), - }) - continue - - results.extend(response.json()) - - return results, errors - - -def noaa_api_connector(countries, start_date, end_date=None, metrics=None): - """Get data from NOAA API. - - Arguments: - countries(list[str]): List of FIPS country codes to retrieve. - start_date(datetime) - end_date(datetime) - metrics(list[str]): Optional.List of metrics to retrieve,valid values are: - TMIN: Minimum temperature. - TMAX: Maximum temperature. - TAVG: Average of temperature. - SNOW: Snowfall (mm). - SNWD: Snow depth (mm). - PRCP: Precipitation. - PSUN: Daily percent of possible sunshine (percent) - TSUN: Daily total sunshine (minutes) - - Returns: - tuple[list[dict], list[Exception]] - """ - if not os.path.isfile(f'{DATA_DIRECTORY}/stations_metadata.txt'): - download_noaa_files(large_files=False) - - result = list() - for country in countries: - logging.info('Requesting data for %s', country) - urls = get_request_urls(country, start_date, end_date, metrics) - country_results, errors = get_parse_response(urls) - - if errors: - logging.info('The following errors where found during the operation:') - for error in errors: - logging.info(error) - - result.extend(country_results) - - data = pd.DataFrame(result) - stations = load_dataset('stations') - data = data.merge(stations, how='left', left_on='STATION', right_on='ID') - - del data['ID'] - del data['STATE'] - - columns = [ - 'DATE', 'STATION', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'NAME', - 'GSN FLAG', 'HCN/CRN FLAG', 'WMO ID' - ] - - if metrics is None: - metrics = DEFAULT_METRICS - - columns.extend([metric for metric in metrics if metric in data.columns]) - - return data[columns] diff --git a/.history/task_geo/data_sources/noaa/noaa_api_connector_20200428030541.py b/.history/task_geo/data_sources/noaa/noaa_api_connector_20200428030541.py deleted file mode 100644 index 2a0f944..0000000 --- a/.history/task_geo/data_sources/noaa/noaa_api_connector_20200428030541.py +++ /dev/null @@ -1,208 +0,0 @@ -"""Connector to the NOAA API. - - -Contributors: - -The journal article describing GHCN-Daily is: -Menne, M.J., I. Durre, R.S. Vose, B.E. Gleason, and T.G. Houston, 2012: An overview -of the Global Historical Climatology Network-Daily Database. Journal of Atmospheric -and Oceanic Technology, 29, 897-910, doi:10.1175/JTECH-D-11-00103.1. - -To acknowledge the specific version of the dataset used, please cite: -Menne, M.J., I. Durre, B. Korzeniewski, S. McNeal, K. Thomas, X. Yin, S. Anthony, R. Ray, -R.S. Vose, B.E.Gleason, and T.G. Houston, 2012: Global Historical Climatology Network - -Daily (GHCN-Daily), Version 3.26 -NOAA National Climatic Data Center. http://doi.org/10.7289/V5D21VHZ [2020/03/30]. -""" - -import logging -import os -from datetime import datetime - -import pandas as pd -import requests - -from task_geo.data_sources.noaa.ftp_connector import download_noaa_files -from task_geo.data_sources.noaa.references import ( - COUNTRY_AND_TERRITORY_CODES, DATA_DIRECTORY, TERRITORY_ACTIVE_STATIONS_MAP, load_dataset) - -logging.basicConfig(level=logging.DEBUG) -logging.getLogger("urllib3").setLevel(logging.WARNING) - - -DEFAULT_METRICS = ['TMAX', 'TMIN', 'TAVG', 'PCRP', 'SNOW', 'SNWD', 'PSUN', 'TSUN'] - - -def get_stations_by_country(country): - """Get all stations for a given country code. - - Arguments: - country(str) - - Returns: - list[str] - """ - - territory_codes = COUNTRY_AND_TERRITORY_CODES.get(country) - if territory_codes is None: - raise ValueError('Wrong country code %s', country) - - stations = list() - for code in territory_codes: - code_stations = TERRITORY_ACTIVE_STATIONS_MAP.get(code) - if code_stations is not None: - stations.extend(code_stations) - - return stations - - -def get_request_urls(country, start_date, end_date=None, metrics=None): - """Encodes the parameters the URL to make a GET request - - Arguments: - country(str): FIPS Country code - start_date(datetime) - end_date(datetime): Defaults to today - metrics(list[str]): Optional.List of metrics to retrieve,valid values are: - TMIN: Minimum temperature. - TMAX: Maximum temperature. - TAVG: Average of temperature. - SNOW: Snowfall (mm). - SNWD: Snow depth (mm). - PRCP: Precipitation. - PSUN: Daily percent of possible sunshine (percent) - TSUN: Daily total sunshine (minutes) - - Returns: - str - """ - - base_url = 'https://www.ncei.noaa.gov/access/services/data/v1?dataset=daily-summaries' - max_stations_req = 50 - - if metrics is None: - metrics = DEFAULT_METRICS - - request_common_args = ( - f'&format=json' - f'&units=metric' - f'&dataTypes={",".join(metrics)}' - ) - - if end_date is None: - end_date = datetime.now() - - start = start_date.date().isoformat() - end = end_date.date().isoformat() - - stations_list = get_stations_by_country(country) - inventory_data = load_dataset('inventory') - inventory_data = inventory_data[inventory_data.end_date >= start_date.year] - stations_list = [x for x in stations_list if x in inventory_data.ID] - if len(stations_list) < max_stations_req: - stations = ','.join(stations_list) - return [ - f'{base_url}&stations={stations}&startDate={start}&endDate={end}{request_common_args}'] - - else: - chunked_station_list = [ - stations_list[i:i + max_stations_req] - for i in range(0, len(stations_list), max_stations_req) - ] - - return [ - ( - f'{base_url}&stations={",".join(chunk)}&startDate={start}' - f'&endDate={end}{request_common_args}' - ) - for chunk in chunked_station_list - ] - - -def get_parse_response(urls): - """Calls the urls in urls, return responses and errors - - Arguments: - urls(list[str]): Urls as generated by `get_request_urls`. - - Returns: - tuple[list[dict], list[Exception]]: - The first element of the tuple is a list of dictionary with all the responses. - The second element is a list with all the exceptions raised during the calls. - """ - - results = list() - errors = list() - - total = len(urls) - 1 - for i, url in enumerate(urls): - logging.debug('Making request %s / %s', i + 1, total + 1) - response = requests.get(url) - try: - response.raise_for_status() - except requests.exceptions.HTTPError: - errors.append({ - 'url': url, - 'error': response.json(), - }) - continue - - results.extend(response.json()) - - return results, errors - - -def noaa_api_connector(countries, start_date, end_date=None, metrics=None): - """Get data from NOAA API. - - Arguments: - countries(list[str]): List of FIPS country codes to retrieve. - start_date(datetime) - end_date(datetime) - metrics(list[str]): Optional.List of metrics to retrieve,valid values are: - TMIN: Minimum temperature. - TMAX: Maximum temperature. - TAVG: Average of temperature. - SNOW: Snowfall (mm). - SNWD: Snow depth (mm). - PRCP: Precipitation. - PSUN: Daily percent of possible sunshine (percent) - TSUN: Daily total sunshine (minutes) - - Returns: - tuple[list[dict], list[Exception]] - """ - if not os.path.isfile(f'{DATA_DIRECTORY}/stations_metadata.txt'): - download_noaa_files(large_files=False) - - result = list() - for country in countries: - logging.info('Requesting data for %s', country) - urls = get_request_urls(country, start_date, end_date, metrics) - country_results, errors = get_parse_response(urls) - - if errors: - logging.info('The following errors where found during the operation:') - for error in errors: - logging.info(error) - - result.extend(country_results) - - data = pd.DataFrame(result) - stations = load_dataset('stations') - data = data.merge(stations, how='left', left_on='STATION', right_on='ID') - - del data['ID'] - del data['STATE'] - - columns = [ - 'DATE', 'STATION', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'NAME', - 'GSN FLAG', 'HCN/CRN FLAG', 'WMO ID' - ] - - if metrics is None: - metrics = DEFAULT_METRICS - - columns.extend([metric for metric in metrics if metric in data.columns]) - - return data[columns] diff --git a/.history/task_geo/data_sources/noaa/noaa_api_connector_20200428030902.py b/.history/task_geo/data_sources/noaa/noaa_api_connector_20200428030902.py deleted file mode 100644 index bfcda28..0000000 --- a/.history/task_geo/data_sources/noaa/noaa_api_connector_20200428030902.py +++ /dev/null @@ -1,208 +0,0 @@ -"""Connector to the NOAA API. - - -Contributors: - -The journal article describing GHCN-Daily is: -Menne, M.J., I. Durre, R.S. Vose, B.E. Gleason, and T.G. Houston, 2012: An overview -of the Global Historical Climatology Network-Daily Database. Journal of Atmospheric -and Oceanic Technology, 29, 897-910, doi:10.1175/JTECH-D-11-00103.1. - -To acknowledge the specific version of the dataset used, please cite: -Menne, M.J., I. Durre, B. Korzeniewski, S. McNeal, K. Thomas, X. Yin, S. Anthony, R. Ray, -R.S. Vose, B.E.Gleason, and T.G. Houston, 2012: Global Historical Climatology Network - -Daily (GHCN-Daily), Version 3.26 -NOAA National Climatic Data Center. http://doi.org/10.7289/V5D21VHZ [2020/03/30]. -""" - -import logging -import os -from datetime import datetime - -import pandas as pd -import requests - -from task_geo.data_sources.noaa.ftp_connector import download_noaa_files -from task_geo.data_sources.noaa.references import ( - COUNTRY_AND_TERRITORY_CODES, DATA_DIRECTORY, TERRITORY_ACTIVE_STATIONS_MAP, load_dataset) - -logging.basicConfig(level=logging.DEBUG) -logging.getLogger("urllib3").setLevel(logging.WARNING) - - -DEFAULT_METRICS = ['TMAX', 'TMIN', 'TAVG', 'PCRP', 'SNOW', 'SNWD', 'PSUN', 'TSUN'] - - -def get_stations_by_country(country): - """Get all stations for a given country code. - - Arguments: - country(str) - - Returns: - list[str] - """ - - territory_codes = COUNTRY_AND_TERRITORY_CODES.get(country) - if territory_codes is None: - raise ValueError('Wrong country code %s', country) - - stations = list() - for code in territory_codes: - code_stations = TERRITORY_ACTIVE_STATIONS_MAP.get(code) - if code_stations is not None: - stations.extend(code_stations) - - return stations - - -def get_request_urls(country, start_date, end_date=None, metrics=None): - """Encodes the parameters the URL to make a GET request - - Arguments: - country(str): FIPS Country code - start_date(datetime) - end_date(datetime): Defaults to today - metrics(list[str]): Optional.List of metrics to retrieve,valid values are: - TMIN: Minimum temperature. - TMAX: Maximum temperature. - TAVG: Average of temperature. - SNOW: Snowfall (mm). - SNWD: Snow depth (mm). - PRCP: Precipitation. - PSUN: Daily percent of possible sunshine (percent) - TSUN: Daily total sunshine (minutes) - - Returns: - str - """ - - base_url = 'https://www.ncei.noaa.gov/access/services/data/v1?dataset=daily-summaries' - max_stations_req = 50 - - if metrics is None: - metrics = DEFAULT_METRICS - - request_common_args = ( - f'&format=json' - f'&units=metric' - f'&dataTypes={",".join(metrics)}' - ) - - if end_date is None: - end_date = datetime.now() - - start = start_date.date().isoformat() - end = end_date.date().isoformat() - - stations_list = get_stations_by_country(country) - inventory_data = load_dataset('inventory') - inventory_data = inventory_data[inventory_data.end_date >= start_date.year] - stations_list = inventory_data[inventory_data.ID.isin(stations_list)].ID.nunique() - if len(stations_list) < max_stations_req: - stations = ','.join(stations_list) - return [ - f'{base_url}&stations={stations}&startDate={start}&endDate={end}{request_common_args}'] - - else: - chunked_station_list = [ - stations_list[i:i + max_stations_req] - for i in range(0, len(stations_list), max_stations_req) - ] - - return [ - ( - f'{base_url}&stations={",".join(chunk)}&startDate={start}' - f'&endDate={end}{request_common_args}' - ) - for chunk in chunked_station_list - ] - - -def get_parse_response(urls): - """Calls the urls in urls, return responses and errors - - Arguments: - urls(list[str]): Urls as generated by `get_request_urls`. - - Returns: - tuple[list[dict], list[Exception]]: - The first element of the tuple is a list of dictionary with all the responses. - The second element is a list with all the exceptions raised during the calls. - """ - - results = list() - errors = list() - - total = len(urls) - 1 - for i, url in enumerate(urls): - logging.debug('Making request %s / %s', i + 1, total + 1) - response = requests.get(url) - try: - response.raise_for_status() - except requests.exceptions.HTTPError: - errors.append({ - 'url': url, - 'error': response.json(), - }) - continue - - results.extend(response.json()) - - return results, errors - - -def noaa_api_connector(countries, start_date, end_date=None, metrics=None): - """Get data from NOAA API. - - Arguments: - countries(list[str]): List of FIPS country codes to retrieve. - start_date(datetime) - end_date(datetime) - metrics(list[str]): Optional.List of metrics to retrieve,valid values are: - TMIN: Minimum temperature. - TMAX: Maximum temperature. - TAVG: Average of temperature. - SNOW: Snowfall (mm). - SNWD: Snow depth (mm). - PRCP: Precipitation. - PSUN: Daily percent of possible sunshine (percent) - TSUN: Daily total sunshine (minutes) - - Returns: - tuple[list[dict], list[Exception]] - """ - if not os.path.isfile(f'{DATA_DIRECTORY}/stations_metadata.txt'): - download_noaa_files(large_files=False) - - result = list() - for country in countries: - logging.info('Requesting data for %s', country) - urls = get_request_urls(country, start_date, end_date, metrics) - country_results, errors = get_parse_response(urls) - - if errors: - logging.info('The following errors where found during the operation:') - for error in errors: - logging.info(error) - - result.extend(country_results) - - data = pd.DataFrame(result) - stations = load_dataset('stations') - data = data.merge(stations, how='left', left_on='STATION', right_on='ID') - - del data['ID'] - del data['STATE'] - - columns = [ - 'DATE', 'STATION', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'NAME', - 'GSN FLAG', 'HCN/CRN FLAG', 'WMO ID' - ] - - if metrics is None: - metrics = DEFAULT_METRICS - - columns.extend([metric for metric in metrics if metric in data.columns]) - - return data[columns] diff --git a/.history/task_geo/data_sources/noaa/noaa_api_connector_20200428031053.py b/.history/task_geo/data_sources/noaa/noaa_api_connector_20200428031053.py deleted file mode 100644 index 46a0960..0000000 --- a/.history/task_geo/data_sources/noaa/noaa_api_connector_20200428031053.py +++ /dev/null @@ -1,208 +0,0 @@ -"""Connector to the NOAA API. - - -Contributors: - -The journal article describing GHCN-Daily is: -Menne, M.J., I. Durre, R.S. Vose, B.E. Gleason, and T.G. Houston, 2012: An overview -of the Global Historical Climatology Network-Daily Database. Journal of Atmospheric -and Oceanic Technology, 29, 897-910, doi:10.1175/JTECH-D-11-00103.1. - -To acknowledge the specific version of the dataset used, please cite: -Menne, M.J., I. Durre, B. Korzeniewski, S. McNeal, K. Thomas, X. Yin, S. Anthony, R. Ray, -R.S. Vose, B.E.Gleason, and T.G. Houston, 2012: Global Historical Climatology Network - -Daily (GHCN-Daily), Version 3.26 -NOAA National Climatic Data Center. http://doi.org/10.7289/V5D21VHZ [2020/03/30]. -""" - -import logging -import os -from datetime import datetime - -import pandas as pd -import requests - -from task_geo.data_sources.noaa.ftp_connector import download_noaa_files -from task_geo.data_sources.noaa.references import ( - COUNTRY_AND_TERRITORY_CODES, DATA_DIRECTORY, TERRITORY_ACTIVE_STATIONS_MAP, load_dataset) - -logging.basicConfig(level=logging.DEBUG) -logging.getLogger("urllib3").setLevel(logging.WARNING) - - -DEFAULT_METRICS = ['TMAX', 'TMIN', 'TAVG', 'PCRP', 'SNOW', 'SNWD', 'PSUN', 'TSUN'] - - -def get_stations_by_country(country): - """Get all stations for a given country code. - - Arguments: - country(str) - - Returns: - list[str] - """ - - territory_codes = COUNTRY_AND_TERRITORY_CODES.get(country) - if territory_codes is None: - raise ValueError('Wrong country code %s', country) - - stations = list() - for code in territory_codes: - code_stations = TERRITORY_ACTIVE_STATIONS_MAP.get(code) - if code_stations is not None: - stations.extend(code_stations) - - return stations - - -def get_request_urls(country, start_date, end_date=None, metrics=None): - """Encodes the parameters the URL to make a GET request - - Arguments: - country(str): FIPS Country code - start_date(datetime) - end_date(datetime): Defaults to today - metrics(list[str]): Optional.List of metrics to retrieve,valid values are: - TMIN: Minimum temperature. - TMAX: Maximum temperature. - TAVG: Average of temperature. - SNOW: Snowfall (mm). - SNWD: Snow depth (mm). - PRCP: Precipitation. - PSUN: Daily percent of possible sunshine (percent) - TSUN: Daily total sunshine (minutes) - - Returns: - str - """ - - base_url = 'https://www.ncei.noaa.gov/access/services/data/v1?dataset=daily-summaries' - max_stations_req = 50 - - if metrics is None: - metrics = DEFAULT_METRICS - - request_common_args = ( - f'&format=json' - f'&units=metric' - f'&dataTypes={",".join(metrics)}' - ) - - if end_date is None: - end_date = datetime.now() - - start = start_date.date().isoformat() - end = end_date.date().isoformat() - - stations_list = get_stations_by_country(country) - inventory_data = load_dataset('inventory') - inventory_data = inventory_data[inventory_data.end_date >= start_date.year] - stations_list = inventory_data[inventory_data.ID.isin(stations_list)].ID.unique() - if len(stations_list) < max_stations_req: - stations = ','.join(stations_list) - return [ - f'{base_url}&stations={stations}&startDate={start}&endDate={end}{request_common_args}'] - - else: - chunked_station_list = [ - stations_list[i:i + max_stations_req] - for i in range(0, len(stations_list), max_stations_req) - ] - - return [ - ( - f'{base_url}&stations={",".join(chunk)}&startDate={start}' - f'&endDate={end}{request_common_args}' - ) - for chunk in chunked_station_list - ] - - -def get_parse_response(urls): - """Calls the urls in urls, return responses and errors - - Arguments: - urls(list[str]): Urls as generated by `get_request_urls`. - - Returns: - tuple[list[dict], list[Exception]]: - The first element of the tuple is a list of dictionary with all the responses. - The second element is a list with all the exceptions raised during the calls. - """ - - results = list() - errors = list() - - total = len(urls) - 1 - for i, url in enumerate(urls): - logging.debug('Making request %s / %s', i + 1, total + 1) - response = requests.get(url) - try: - response.raise_for_status() - except requests.exceptions.HTTPError: - errors.append({ - 'url': url, - 'error': response.json(), - }) - continue - - results.extend(response.json()) - - return results, errors - - -def noaa_api_connector(countries, start_date, end_date=None, metrics=None): - """Get data from NOAA API. - - Arguments: - countries(list[str]): List of FIPS country codes to retrieve. - start_date(datetime) - end_date(datetime) - metrics(list[str]): Optional.List of metrics to retrieve,valid values are: - TMIN: Minimum temperature. - TMAX: Maximum temperature. - TAVG: Average of temperature. - SNOW: Snowfall (mm). - SNWD: Snow depth (mm). - PRCP: Precipitation. - PSUN: Daily percent of possible sunshine (percent) - TSUN: Daily total sunshine (minutes) - - Returns: - tuple[list[dict], list[Exception]] - """ - if not os.path.isfile(f'{DATA_DIRECTORY}/stations_metadata.txt'): - download_noaa_files(large_files=False) - - result = list() - for country in countries: - logging.info('Requesting data for %s', country) - urls = get_request_urls(country, start_date, end_date, metrics) - country_results, errors = get_parse_response(urls) - - if errors: - logging.info('The following errors where found during the operation:') - for error in errors: - logging.info(error) - - result.extend(country_results) - - data = pd.DataFrame(result) - stations = load_dataset('stations') - data = data.merge(stations, how='left', left_on='STATION', right_on='ID') - - del data['ID'] - del data['STATE'] - - columns = [ - 'DATE', 'STATION', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'NAME', - 'GSN FLAG', 'HCN/CRN FLAG', 'WMO ID' - ] - - if metrics is None: - metrics = DEFAULT_METRICS - - columns.extend([metric for metric in metrics if metric in data.columns]) - - return data[columns] diff --git a/.history/task_geo/data_sources/noaa/noaa_api_connector_20200428031122.py b/.history/task_geo/data_sources/noaa/noaa_api_connector_20200428031122.py deleted file mode 100644 index 46a0960..0000000 --- a/.history/task_geo/data_sources/noaa/noaa_api_connector_20200428031122.py +++ /dev/null @@ -1,208 +0,0 @@ -"""Connector to the NOAA API. - - -Contributors: - -The journal article describing GHCN-Daily is: -Menne, M.J., I. Durre, R.S. Vose, B.E. Gleason, and T.G. Houston, 2012: An overview -of the Global Historical Climatology Network-Daily Database. Journal of Atmospheric -and Oceanic Technology, 29, 897-910, doi:10.1175/JTECH-D-11-00103.1. - -To acknowledge the specific version of the dataset used, please cite: -Menne, M.J., I. Durre, B. Korzeniewski, S. McNeal, K. Thomas, X. Yin, S. Anthony, R. Ray, -R.S. Vose, B.E.Gleason, and T.G. Houston, 2012: Global Historical Climatology Network - -Daily (GHCN-Daily), Version 3.26 -NOAA National Climatic Data Center. http://doi.org/10.7289/V5D21VHZ [2020/03/30]. -""" - -import logging -import os -from datetime import datetime - -import pandas as pd -import requests - -from task_geo.data_sources.noaa.ftp_connector import download_noaa_files -from task_geo.data_sources.noaa.references import ( - COUNTRY_AND_TERRITORY_CODES, DATA_DIRECTORY, TERRITORY_ACTIVE_STATIONS_MAP, load_dataset) - -logging.basicConfig(level=logging.DEBUG) -logging.getLogger("urllib3").setLevel(logging.WARNING) - - -DEFAULT_METRICS = ['TMAX', 'TMIN', 'TAVG', 'PCRP', 'SNOW', 'SNWD', 'PSUN', 'TSUN'] - - -def get_stations_by_country(country): - """Get all stations for a given country code. - - Arguments: - country(str) - - Returns: - list[str] - """ - - territory_codes = COUNTRY_AND_TERRITORY_CODES.get(country) - if territory_codes is None: - raise ValueError('Wrong country code %s', country) - - stations = list() - for code in territory_codes: - code_stations = TERRITORY_ACTIVE_STATIONS_MAP.get(code) - if code_stations is not None: - stations.extend(code_stations) - - return stations - - -def get_request_urls(country, start_date, end_date=None, metrics=None): - """Encodes the parameters the URL to make a GET request - - Arguments: - country(str): FIPS Country code - start_date(datetime) - end_date(datetime): Defaults to today - metrics(list[str]): Optional.List of metrics to retrieve,valid values are: - TMIN: Minimum temperature. - TMAX: Maximum temperature. - TAVG: Average of temperature. - SNOW: Snowfall (mm). - SNWD: Snow depth (mm). - PRCP: Precipitation. - PSUN: Daily percent of possible sunshine (percent) - TSUN: Daily total sunshine (minutes) - - Returns: - str - """ - - base_url = 'https://www.ncei.noaa.gov/access/services/data/v1?dataset=daily-summaries' - max_stations_req = 50 - - if metrics is None: - metrics = DEFAULT_METRICS - - request_common_args = ( - f'&format=json' - f'&units=metric' - f'&dataTypes={",".join(metrics)}' - ) - - if end_date is None: - end_date = datetime.now() - - start = start_date.date().isoformat() - end = end_date.date().isoformat() - - stations_list = get_stations_by_country(country) - inventory_data = load_dataset('inventory') - inventory_data = inventory_data[inventory_data.end_date >= start_date.year] - stations_list = inventory_data[inventory_data.ID.isin(stations_list)].ID.unique() - if len(stations_list) < max_stations_req: - stations = ','.join(stations_list) - return [ - f'{base_url}&stations={stations}&startDate={start}&endDate={end}{request_common_args}'] - - else: - chunked_station_list = [ - stations_list[i:i + max_stations_req] - for i in range(0, len(stations_list), max_stations_req) - ] - - return [ - ( - f'{base_url}&stations={",".join(chunk)}&startDate={start}' - f'&endDate={end}{request_common_args}' - ) - for chunk in chunked_station_list - ] - - -def get_parse_response(urls): - """Calls the urls in urls, return responses and errors - - Arguments: - urls(list[str]): Urls as generated by `get_request_urls`. - - Returns: - tuple[list[dict], list[Exception]]: - The first element of the tuple is a list of dictionary with all the responses. - The second element is a list with all the exceptions raised during the calls. - """ - - results = list() - errors = list() - - total = len(urls) - 1 - for i, url in enumerate(urls): - logging.debug('Making request %s / %s', i + 1, total + 1) - response = requests.get(url) - try: - response.raise_for_status() - except requests.exceptions.HTTPError: - errors.append({ - 'url': url, - 'error': response.json(), - }) - continue - - results.extend(response.json()) - - return results, errors - - -def noaa_api_connector(countries, start_date, end_date=None, metrics=None): - """Get data from NOAA API. - - Arguments: - countries(list[str]): List of FIPS country codes to retrieve. - start_date(datetime) - end_date(datetime) - metrics(list[str]): Optional.List of metrics to retrieve,valid values are: - TMIN: Minimum temperature. - TMAX: Maximum temperature. - TAVG: Average of temperature. - SNOW: Snowfall (mm). - SNWD: Snow depth (mm). - PRCP: Precipitation. - PSUN: Daily percent of possible sunshine (percent) - TSUN: Daily total sunshine (minutes) - - Returns: - tuple[list[dict], list[Exception]] - """ - if not os.path.isfile(f'{DATA_DIRECTORY}/stations_metadata.txt'): - download_noaa_files(large_files=False) - - result = list() - for country in countries: - logging.info('Requesting data for %s', country) - urls = get_request_urls(country, start_date, end_date, metrics) - country_results, errors = get_parse_response(urls) - - if errors: - logging.info('The following errors where found during the operation:') - for error in errors: - logging.info(error) - - result.extend(country_results) - - data = pd.DataFrame(result) - stations = load_dataset('stations') - data = data.merge(stations, how='left', left_on='STATION', right_on='ID') - - del data['ID'] - del data['STATE'] - - columns = [ - 'DATE', 'STATION', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'NAME', - 'GSN FLAG', 'HCN/CRN FLAG', 'WMO ID' - ] - - if metrics is None: - metrics = DEFAULT_METRICS - - columns.extend([metric for metric in metrics if metric in data.columns]) - - return data[columns] diff --git a/task_geo/data_sources/noaa/ftp_formatter.py b/task_geo/data_sources/noaa/ftp_formatter.py new file mode 100644 index 0000000..e69de29