feat: data preparation scripts
maheshkumargangula committed Feb 21, 2024
1 parent edf5f24 commit 8fcc6c8
Showing 8 changed files with 800 additions and 0 deletions.
1 change: 1 addition & 0 deletions data-preparation/.gitignore
@@ -0,0 +1 @@
venv/**
512 changes: 512 additions & 0 deletions data-preparation/IVRS-INPUT.csv

Large diffs are not rendered by default.

20 changes: 20 additions & 0 deletions data-preparation/cleanup.py
@@ -0,0 +1,20 @@
import boto3
import yaml
from rich.progress import track

if __name__ == '__main__':
    with open("config.yaml", "r") as stream:
        CONFIG = yaml.safe_load(stream)
    s3 = boto3.resource(
        's3',
        region_name=CONFIG["region_name"],
        aws_secret_access_key=CONFIG["aws_secret_access_key"],
        aws_access_key_id=CONFIG["aws_access_key_id"],
        endpoint_url=CONFIG["endpoint_url"]
    )

    bucket = s3.Bucket(CONFIG['bucket'])
    objects = bucket.objects.all()
    for obj in track(objects, description="Deleting files"):
        obj.delete()
    print("Completed deleting all the files.")
5 changes: 5 additions & 0 deletions data-preparation/config.yaml
@@ -0,0 +1,5 @@
region_name: ""
aws_secret_access_key: ""
aws_access_key_id: ""
endpoint_url: ""
bucket: ""
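All three scripts load these five values with yaml.safe_load at startup, so an empty field only surfaces later as a boto3 error. A small pre-flight check (hypothetical helper, not part of this commit) can catch that earlier:

import yaml

REQUIRED_KEYS = ["region_name", "aws_secret_access_key", "aws_access_key_id", "endpoint_url", "bucket"]

# Hypothetical pre-flight check: fail fast if any config value is left empty.
with open("config.yaml", "r") as stream:
    config = yaml.safe_load(stream)

missing = [key for key in REQUIRED_KEYS if not config.get(key)]
if missing:
    raise SystemExit("config.yaml is missing values for: " + ", ".join(missing))
print("config.yaml looks complete.")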
81 changes: 81 additions & 0 deletions data-preparation/generate-config.py
@@ -0,0 +1,81 @@
import sys
import csv
import re
import json
# import gdown
import boto3
import yaml


from pathlib import Path

CONFIG = {}


def get_upload_path(file_name, category, language):
    if category != "" and language != "":
        return f'audio/{category}/{language}/{file_name}'
    else:
        return f'audio/{file_name}'

def upload_to_oci_storage(s3, file_name, category, language):
    try:
        file_path = Path(file_name)
        upload_path = get_upload_path(file_path.name, category, language)
        s3.meta.client.upload_file(f"{file_name}", CONFIG['bucket'], upload_path, ExtraArgs={'ContentType': 'application/json'})
        print(upload_path, "uploaded successfully")
    except Exception as e:
        print(e)

def create_config(s3):
    config = {}
    categories = set()
    languages = set()

    invalid_option_link = ["https://objectstorage.ap-hyderabad-1.oraclecloud.com/n/ax2cel5zyviy/b/sbdjp-ivrs/o/audio/invalid_option_english.wav"]

    for obj in s3.Bucket(CONFIG['bucket']).objects.all():
        path = Path(obj.key)

        if len(path.parts) < 4: continue
        if path.parts[0] != "audio": continue

        category = path.parts[2].strip().lower()
        language = path.parts[1].strip().lower()

        categories.add(category)
        languages.add(language)
        audio_key = f"{category}:{language}"
        if audio_key not in config: config[audio_key] = []

        url = s3.meta.client.generate_presigned_url(ClientMethod='get_object', Params={'Bucket': CONFIG['bucket'], 'Key': obj.key})
        url = url.split("?")[0]

        config[audio_key].append(url)

    for lang in languages:
        for cat in categories:
            audio_key = f"{cat}:{lang}:empty"
            config[audio_key] = invalid_option_link

        audio_key = f"invalid_option:{lang}"
        config[audio_key] = invalid_option_link

    with open('converted-drive-files/ivrs_config.json', 'w') as f:
        f.write(json.dumps(config))

if __name__ == '__main__':
    with open("config.yaml", "r") as stream:
        CONFIG = yaml.safe_load(stream)
    s3 = boto3.resource(
        's3',
        region_name=CONFIG["region_name"],
        aws_secret_access_key=CONFIG["aws_secret_access_key"],
        aws_access_key_id=CONFIG["aws_access_key_id"],
        endpoint_url=CONFIG["endpoint_url"]
    )

    create_config(s3)
    upload_to_oci_storage(s3, "converted-drive-files/ivrs_config.json", "", "")
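For context, create_config expects object keys laid out as audio/<language>/<category>/<file> (the layout produced by process-ivrs-audios.py below) and groups them under <category>:<language> keys in ivrs_config.json. A minimal illustration with a hypothetical object key:

from pathlib import Path

# Hypothetical object key in the layout uploaded by process-ivrs-audios.py.
key = "audio/english/weather/advisory_01.wav"
parts = Path(key).parts            # ('audio', 'english', 'weather', 'advisory_01.wav')
language = parts[1].strip().lower()
category = parts[2].strip().lower()
print(f"{category}:{language}")    # "weather:english", the key stored in ivrs_config.json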


Binary file added data-preparation/invalid_option_english.wav
Binary file not shown.
144 changes: 144 additions & 0 deletions data-preparation/process-ivrs-audios.py
@@ -0,0 +1,144 @@
import os
import csv
import re
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from pydub import AudioSegment
import boto3
import yaml
from rich.progress import track
from collections import defaultdict
import shutil

def download_file(drive, file_id, destination_path, language, category):
    # Get the file
    file = drive.CreateFile({'id': file_id})
    file_name = f"{file['title']}"
    # Download the file
    file_destination_path = f"{destination_path}/{language}/{category}"
    os.makedirs(file_destination_path, exist_ok=True)
    final_file_path = f"{file_destination_path}/{file['title']}"
    file.GetContentFile(final_file_path)
    # print('Downloaded', file_name)
    return final_file_path

def get_drive():
    # Authenticate and create the PyDrive client
    gauth = GoogleAuth()
    gauth.LocalWebserverAuth()  # Creates local webserver and auto handles authentication.
    drive = GoogleDrive(gauth)
    return drive

def check_file_existence(file_path):
    if not os.path.exists(file_path):
        raise FileNotFoundError("The file '{}' does not exist.".format(file_path))
    else:
        print("The file given is '{}'.".format(file_path))


def download_audio_files(source_file_path):
    downloaded_files = {}
    # Open the CSV file
    with open(source_file_path) as csvfile:
        files_list = [d for d in csv.DictReader(csvfile)]
    drive = get_drive()
    # for file in files_list:
    for file in track(files_list, description="Downloading files"):
        category = file["Category"].strip().lower()
        language = file["Language"].strip().lower()
        drive_link = file["Drive Link"]
        file_id = re.search(r'/file/d/([^/]+)/', drive_link).group(1)
        local_file_path = download_file(drive, file_id, destination_path, language, category)
        file["Downloaded Path"] = local_file_path
        if local_file_path not in downloaded_files:
            downloaded_files.setdefault(local_file_path, 1)
        else:
            print(f"File already downloaded: {local_file_path}")

    with open("out.csv", "w") as f:
        wr = csv.DictWriter(f, delimiter=",", fieldnames=list(files_list[0].keys()))
        wr.writeheader()
        wr.writerows(files_list)

def get_s3_object_and_bucket():
    with open("config.yaml", "r") as stream:
        CONFIG = yaml.safe_load(stream)
    s3 = boto3.resource(
        's3',
        region_name=CONFIG["region_name"],
        aws_secret_access_key=CONFIG["aws_secret_access_key"],
        aws_access_key_id=CONFIG["aws_access_key_id"],
        endpoint_url=CONFIG["endpoint_url"]
    )
    return s3, CONFIG['bucket']

def is_audio_file(file_path):
    _, extension = os.path.splitext(file_path)
    return extension.lower() == '.mp3' or extension.lower() == '.wav'

def list_files(directory):
    # List to store all files
    all_files = []

    # Walk through all directories and subdirectories recursively
    for root, directories, files in os.walk(directory):
        for filename in files:
            # Append the full path to the list
            full_file_path = os.path.join(root, filename)
            if is_audio_file(full_file_path):
                all_files.append(full_file_path)
    return all_files

def change_extension(file_path):
    # Split the file path into directory, base filename, and extension
    directory, filename = os.path.split(file_path)
    base_name, old_extension = os.path.splitext(filename)
    # Construct the new file path with .wav extension
    return os.path.join(directory, base_name + ".wav")

def convert_to_16bit_mono_8k_pcm_wav(mp3_file, wav_file):
    # Load the MP3 file
    audio = AudioSegment.from_mp3(mp3_file)
    # Set channels to mono
    audio = audio.set_channels(1)
    # Set sample width to 2 bytes (16 bit)
    audio = audio.set_sample_width(2)
    # Set sample rate to 8000 Hz
    audio = audio.set_frame_rate(8000)
    # Export as WAV
    audio.export(wav_file, format="wav", bitrate='8k', parameters=["-ac", "1", "-ar", "8000", "-sample_fmt", "s16"])

def convert_and_upload(s3, base_folder, bucket_name):
    file_list = list_files(base_folder)
    # for mp3_file_path in file_list:
    for mp3_file_path in track(file_list, description="Convert & Uploading"):
        # print(f"Processing {mp3_file_path}")
        converted_wav_file_path = change_extension('converted-drive-files/audio/' + '/'.join(mp3_file_path.split('/')[1:]))
        # print(f"Converting to {converted_wav_file_path}")
        try:
            os.makedirs(os.path.dirname(converted_wav_file_path), exist_ok=True)
            # convert_to_8khz_mono_with_bitrate(mp3_file_path, converted_mp3_file_path)
            convert_to_16bit_mono_8k_pcm_wav(mp3_file_path, converted_wav_file_path)
            # print(f"Converted to {converted_wav_file_path}")
            s3_file_path = '/'.join(converted_wav_file_path.split('/')[1:])
            s3.meta.client.upload_file(converted_wav_file_path, bucket_name, s3_file_path, ExtraArgs={'ContentType': 'audio/wav'})
            # print(f"Uploaded to {s3_file_path}")
        except Exception as e:
            print("Failed to process and upload", mp3_file_path, ":", e)

def copy_invalid_option_audio(s3, bucket):
    source_file = 'invalid_option_english.wav'
    # Upload the fallback "invalid option" audio to the bucket's audio/ prefix
    s3.meta.client.upload_file(source_file, bucket, f"audio/{source_file}", ExtraArgs={'ContentType': 'audio/wav'})

if __name__ == "__main__":
    destination_path = "original-drive-files"
    source_file_path = input("Please enter input file path: ")
    check_file_existence(source_file_path)
    download_audio_files(source_file_path)
    print("Completed downloading all the files.")
    s3, bucket = get_s3_object_and_bucket()
    convert_and_upload(s3, destination_path, bucket)
    print("Completed uploading all the files.")
    copy_invalid_option_audio(s3, bucket)
    print("Completed copying invalid option audio.")
37 changes: 37 additions & 0 deletions data-preparation/requirements.txt
@@ -0,0 +1,37 @@
boto3==1.34.38
botocore==1.34.38
cachetools==5.3.2
certifi==2024.2.2
charset-normalizer==3.3.2
google-api-core==2.16.2
google-api-python-client==2.116.0
google-auth==2.27.0
google-auth-httplib2==0.2.0
googleapis-common-protos==1.62.0
httplib2==0.22.0
idna==3.6
jmespath==1.0.1
markdown-it-py==3.0.0
mdurl==0.1.2
numpy==1.26.4
oauth2client==4.1.3
pandas==2.2.0
protobuf==4.25.2
pyasn1==0.5.1
pyasn1-modules==0.3.0
PyDrive==1.3.1
pydub==0.25.1
Pygments==2.17.2
pyparsing==3.1.1
python-dateutil==2.8.2
pytz==2024.1
PyYAML==6.0.1
requests==2.31.0
rich==13.7.0
rsa==4.9
s3transfer==0.10.0
six==1.16.0
tqdm==4.66.2
tzdata==2023.4
uritemplate==4.1.1
urllib3==2.0.7
