diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..e267d90 --- /dev/null +++ b/.env.example @@ -0,0 +1,2 @@ +MONGODB_URI= +MONGODB_DB_NAME= \ No newline at end of file diff --git a/lambda_function.py b/lambda_function.py deleted file mode 100644 index 01bc6eb..0000000 --- a/lambda_function.py +++ /dev/null @@ -1,53 +0,0 @@ -import requests -import zipfile -import io -import csv -import json -from pymongo import MongoClient - - -def lambda_handler(event, context): - # -------- Step 1: Download the .zip File -------- - zip_file_url = "https://datamall.lta.gov.sg/content/dam/datamall/datasets/Facts_Figures/Vehicle Registration/Monthly New Registration of Cars by Make.zip" - response = requests.get(zip_file_url) - - # -------- Step 2: Extract the .csv File in Memory -------- - with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref: - for file_info in zip_ref.infolist(): - if file_info.filename.endswith(".csv"): - with zip_ref.open( - file_info - ) as csv_file: # Assumes 'data.csv' is in the zip root - csv_data = [] - csv_reader = csv.DictReader(io.TextIOWrapper(csv_file)) - for row in csv_reader: - csv_data.append(row) - - # -------- Step 3: Convert CSV to JSON -------- - json_data = json.dumps(csv_data) - - # -------- Step 4: Connect to MongoDB -------- - client = MongoClient( - "mongodb://localhost:27017/" - ) # Replace this with connection details for MongoDB Atlas or DocumentDB - db = client["local-lta-datasets"] - collection = db["cars"] - - # -------- Step 5: Comparison -------- - existing_data = collection.find({}, {"month": 1}) - existing_data_set = {item["month"] for item in existing_data} - - new_items = [item for item in json_data if item["month"] not in existing_data_set] - - # -------- Step 6: Insert New Items -------- - if new_items: - collection.insert_many(new_items) - print(f"Inserted {len(new_items)} records") - else: - print("No new records") - - return {"statusCode": 200, "body": "Data successfully written to MongoDB!"} - - -if __name__ == "__main__": - lambda_handler(None, None) diff --git a/main.py b/main.py deleted file mode 100644 index e157ec7..0000000 --- a/main.py +++ /dev/null @@ -1,5 +0,0 @@ -import update_cars -import asyncio - - -asyncio.run(update_cars.main()) diff --git a/update_cars.py b/update_cars.py index a74bb38..ef7d37f 100644 --- a/update_cars.py +++ b/update_cars.py @@ -1,60 +1,23 @@ -import boto3 -from dotenv import load_dotenv - -from updater import update - -load_dotenv() - - -COLLECTION_NAME: str = "cars" -ZIP_FILE_NAME: str = "Monthly New Registration of Cars by Make.zip" -ZIP_URL: str = ( - f"https://datamall.lta.gov.sg/content/dam/datamall/datasets/Facts_Figures/Vehicle Registration/{ZIP_FILE_NAME}" -) - -print(f"ZIP_URL {ZIP_URL}") +import asyncio +import updater +from typing import List async def main(): - result = await update() - - if __name__ == "__main__": - # Create a DynamoDB table - my_table = create_dynamodb_table("cars") - - # Put the item into the table - put_item_into_table(my_table, result) - - -def create_dynamodb_table(table_name): - # Create a DynamoDB resource - dynamodb = boto3.resource("dynamodb") - - # Create the table - table = dynamodb.create_table( - TableName=table_name, - KeySchema=[ - {"AttributeName": "id", "KeyType": "HASH"}, # Partition key - {"AttributeName": "timestamp", "KeyType": "RANGE"}, # Sort key - ], - AttributeDefinitions=[ - {"AttributeName": "id", "AttributeType": "S"}, # String data type - { - "AttributeName": "timestamp", - "AttributeType": "N", # Number data type - }, - ], - ProvisionedThroughput={"ReadCapacityUnits": 5, "WriteCapacityUnits": 5}, + collection_name: str = "cars" + zip_file_name: str = "Monthly New Registration of Cars by Make.zip" + zip_url: str = ( + f"https://datamall.lta.gov.sg/content/dam/datamall/datasets/Facts_Figures/Vehicle Registration/{zip_file_name}" ) + key_fields: List[str] = ["month"] - # Wait until the table exists - table.meta.client.get_waiter("table_exists").wait(TableName=table_name) - - print(f"DynamoDB Table {table_name} created") - - return table + await updater.main( + collection_name=collection_name, + zip_url=zip_url, + zip_file_name=zip_file_name, + key_fields=key_fields, + ) -def put_item_into_table(table, item_data): - # Put an item into the DynamoDB table - table.put_item(Item=item_data) +if __name__ == "__main__": + asyncio.run(main()) diff --git a/update_coe.py b/update_coe.py new file mode 100644 index 0000000..38cc367 --- /dev/null +++ b/update_coe.py @@ -0,0 +1,23 @@ +import asyncio +import updater +from typing import List + + +async def main(): + collection_name: str = "coe" + zip_file_name: str = "COE Bidding Results.zip" + zip_url: str = ( + f"https://datamall.lta.gov.sg/content/dam/datamall/datasets/Facts_Figures/Vehicle Registration/{zip_file_name}" + ) + key_fields: List[str] = ["month", "bidding_no"] + + await updater.main( + collection_name=collection_name, + zip_url=zip_url, + zip_file_name=zip_file_name, + key_fields=key_fields, + ) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/updater.py b/updater.py index f0f27d3..0e555a7 100644 --- a/updater.py +++ b/updater.py @@ -1,20 +1,12 @@ -import asyncio import csv import datetime import os from zipfile import ZipFile from pymongo import MongoClient - from download_file import download_file EXTRACT_PATH = "tmp" -csv_file_path = "tmp/Monthly New Registration of Cars by Make/M03-Car_Regn_by_make.csv" - -json_file_path = ( - "tmp/Monthly New Registration of Cars by Make/M03-Car_Regn_by_make.csv.json" -) - csv_data = [] @@ -25,7 +17,7 @@ async def update(collection_name, zip_file_name, zip_url, key_fields): try: zip_file_path = os.path.join(EXTRACT_PATH, zip_file_name) - await download_file(zip_url, zip_file_path) # You'll need to define this + await download_file(zip_url, zip_file_path) extracted_file_name = extract_zip_file(zip_file_path, EXTRACT_PATH) destination_path = os.path.join(EXTRACT_PATH, extracted_file_name) @@ -71,19 +63,14 @@ def extract_zip_file(zip_file_path, extract_to_path): for entry in zip_ref.infolist(): if not entry.is_dir(): return entry.filename - return "" -async def main(): - zip_file_name = "Monthly New Registration of Cars by Make.zip" - zip_url = f"https://datamall.lta.gov.sg/content/dam/datamall/datasets/Facts_Figures/Vehicle Registration/{zip_file_name}" # Replace with the actual URL - collection_name = "cars" - +async def main(collection_name, zip_file_name, zip_url, key_fields): message = await update( collection_name=collection_name, zip_file_name=zip_file_name, zip_url=zip_url, - key_fields=["month"], + key_fields=key_fields, ) response = { @@ -96,7 +83,3 @@ async def main(): print(response) return response - - -if __name__ == "__main__": - asyncio.run(main())