feat: data preparation scripts
maheshkumargangula committed Feb 21, 2024
1 parent edf5f24 commit 8fcc6c8
Showing 8 changed files with 800 additions and 0 deletions.
1 change: 1 addition & 0 deletions data-preparation/.gitignore
@@ -0,0 +1 @@
venv/**
512 changes: 512 additions & 0 deletions data-preparation/IVRS-INPUT.csv

Large diffs are not rendered by default.

20 changes: 20 additions & 0 deletions data-preparation/cleanup.py
@@ -0,0 +1,20 @@
import boto3
import yaml
from rich.progress import track

if __name__ == '__main__':
    with open("config.yaml", "r") as stream:
        CONFIG = yaml.safe_load(stream)
    s3 = boto3.resource(
        's3',
        region_name=CONFIG["region_name"],
        aws_secret_access_key=CONFIG["aws_secret_access_key"],
        aws_access_key_id=CONFIG["aws_access_key_id"],
        endpoint_url=CONFIG["endpoint_url"]
    )

    bucket = s3.Bucket(CONFIG['bucket'])
    objects = bucket.objects.all()
    for obj in track(objects, description="Deleting files"):
        obj.delete()
    print("Completed deleting all the files.")
5 changes: 5 additions & 0 deletions data-preparation/config.yaml
@@ -0,0 +1,5 @@
region_name: ""
aws_secret_access_key: ""
aws_access_key_id: ""
endpoint_url: ""
bucket: ""
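All three scripts load these five values with yaml.safe_load at startup, so an empty field only surfaces later as a boto3 error. A small pre-flight check (hypothetical helper, not part of this commit) can catch that earlier:

import yaml

REQUIRED_KEYS = ["region_name", "aws_secret_access_key", "aws_access_key_id", "endpoint_url", "bucket"]

# Hypothetical pre-flight check: fail fast if any config value is left empty.
with open("config.yaml", "r") as stream:
    config = yaml.safe_load(stream)

missing = [key for key in REQUIRED_KEYS if not config.get(key)]
if missing:
    raise SystemExit("config.yaml is missing values for: " + ", ".join(missing))
print("config.yaml looks complete.")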
81 changes: 81 additions & 0 deletions data-preparation/generate-config.py
@@ -0,0 +1,81 @@
import sys
import csv
import re
import json
# import gdown
import boto3
import yaml


from pathlib import Path

CONFIG = {}


def get_upload_path(file_name, category, language):
    if category != "" and language != "":
        return f'audio/{category}/{language}/{file_name}'
    else:
        return f'audio/{file_name}'

def upload_to_oci_storage(s3, file_name, category, language):
    try:
        file_path = Path(file_name)
        upload_path = get_upload_path(file_path.name, category, language)
        s3.meta.client.upload_file(f"{file_name}", CONFIG['bucket'], upload_path, ExtraArgs={'ContentType': 'application/json'})
        print(upload_path, "uploaded successfully")
    except Exception as e:
        print(e)

def create_config(s3):
    config = {}
    categories = set()
    languages = set()

    invalid_option_link = ["https://objectstorage.ap-hyderabad-1.oraclecloud.com/n/ax2cel5zyviy/b/sbdjp-ivrs/o/audio/invalid_option_english.wav"]

    for obj in s3.Bucket(CONFIG['bucket']).objects.all():
        path = Path(obj.key)

        if len(path.parts) < 4: continue
        if path.parts[0] != "audio": continue

        category = path.parts[2].strip().lower()
        language = path.parts[1].strip().lower()

        categories.add(category)
        languages.add(language)
        audio_key = f"{category}:{language}"
        if audio_key not in config: config[audio_key] = []

        url = s3.meta.client.generate_presigned_url(ClientMethod='get_object', Params={'Bucket': CONFIG['bucket'], 'Key': obj.key})
        url = url.split("?")[0]

        config[audio_key].append(url)

    for lang in languages:
        for cat in categories:
            audio_key = f"{cat}:{lang}:empty"
            config[audio_key] = invalid_option_link

        audio_key = f"invalid_option:{lang}"
        config[audio_key] = invalid_option_link

    with open('converted-drive-files/ivrs_config.json', 'w') as f:
        f.write(json.dumps(config))

if __name__ == '__main__':
    with open("config.yaml", "r") as stream:
        CONFIG = yaml.safe_load(stream)
    s3 = boto3.resource(
        's3',
        region_name=CONFIG["region_name"],
        aws_secret_access_key=CONFIG["aws_secret_access_key"],
        aws_access_key_id=CONFIG["aws_access_key_id"],
        endpoint_url=CONFIG["endpoint_url"]
    )

    create_config(s3)
    upload_to_oci_storage(s3, "converted-drive-files/ivrs_config.json", "", "")
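For context, create_config expects object keys laid out as audio/<language>/<category>/<file> (the layout produced by process-ivrs-audios.py below) and groups them under <category>:<language> keys in ivrs_config.json. A minimal illustration with a hypothetical object key:

from pathlib import Path

# Hypothetical object key in the layout uploaded by process-ivrs-audios.py.
key = "audio/english/weather/advisory_01.wav"
parts = Path(key).parts            # ('audio', 'english', 'weather', 'advisory_01.wav')
language = parts[1].strip().lower()
category = parts[2].strip().lower()
print(f"{category}:{language}")    # "weather:english", the key stored in ivrs_config.json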


Binary file added data-preparation/invalid_option_english.wav
Binary file not shown.
144 changes: 144 additions & 0 deletions data-preparation/process-ivrs-audios.py
@@ -0,0 +1,144 @@
import os
import csv
import re
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from pydub import AudioSegment
import boto3
import yaml
from rich.progress import track
from collections import defaultdict
import shutil

def download_file(drive, file_id, destination_path, language, category):
    # Get the file
    file = drive.CreateFile({'id': file_id})
    file_name = f"{file['title']}"
    # Download the file
    file_destination_path = f"{destination_path}/{language}/{category}"
    os.makedirs(file_destination_path, exist_ok=True)
    final_file_path = f"{file_destination_path}/{file['title']}"
    file.GetContentFile(final_file_path)
    # print('Downloaded', file_name)
    return final_file_path

def get_drive():
    # Authenticate and create the PyDrive client
    gauth = GoogleAuth()
    gauth.LocalWebserverAuth()  # Creates local webserver and auto handles authentication.
    drive = GoogleDrive(gauth)
    return drive

def check_file_existence(file_path):
    if not os.path.exists(file_path):
        raise FileNotFoundError("The file '{}' does not exist.".format(file_path))
    else:
        print("The file given is '{}'.".format(file_path))


def download_audio_files(source_file_path):
    downloaded_files = {}
    # Open the CSV file
    with open(source_file_path) as csvfile:
        files_list = [d for d in csv.DictReader(csvfile)]
    drive = get_drive()
    # for file in files_list:
    for file in track(files_list, description="Downloading files"):
        category = file["Category"].strip().lower()
        language = file["Language"].strip().lower()
        drive_link = file["Drive Link"]
        file_id = re.search(r'/file/d/([^/]+)/', drive_link).group(1)
        local_file_path = download_file(drive, file_id, destination_path, language, category)
        file["Downloaded Path"] = local_file_path
        if local_file_path not in downloaded_files:
            downloaded_files.setdefault(local_file_path, 1)
        else:
            print(f"File already downloaded: {local_file_path}")

    with open("out.csv", "w") as f:
        wr = csv.DictWriter(f, delimiter=",", fieldnames=list(files_list[0].keys()))
        wr.writeheader()
        wr.writerows(files_list)

def get_s3_object_and_bucket():
    with open("config.yaml", "r") as stream:
        CONFIG = yaml.safe_load(stream)
    s3 = boto3.resource(
        's3',
        region_name=CONFIG["region_name"],
        aws_secret_access_key=CONFIG["aws_secret_access_key"],
        aws_access_key_id=CONFIG["aws_access_key_id"],
        endpoint_url=CONFIG["endpoint_url"]
    )
    return s3, CONFIG['bucket']

def is_audio_file(file_path):
    _, extension = os.path.splitext(file_path)
    return extension.lower() == '.mp3' or extension.lower() == '.wav'

def list_files(directory):
    # List to store all files
    all_files = []

    # Walk through all directories and subdirectories recursively
    for root, directories, files in os.walk(directory):
        for filename in files:
            # Append the full path to the list
            full_file_path = os.path.join(root, filename)
            if is_audio_file(full_file_path):
                all_files.append(full_file_path)
    return all_files

def change_extension(file_path):
    # Split the file path into directory, base filename, and extension
    directory, filename = os.path.split(file_path)
    base_name, old_extension = os.path.splitext(filename)
    # Construct the new file path with .wav extension
    return os.path.join(directory, base_name + ".wav")

def convert_to_16bit_mono_8k_pcm_wav(mp3_file, wav_file):
    # Load the MP3 file
    audio = AudioSegment.from_mp3(mp3_file)
    # Set channels to mono
    audio = audio.set_channels(1)
    # Set sample width to 2 bytes (16 bit)
    audio = audio.set_sample_width(2)
    # Set sample rate to 8000 Hz
    audio = audio.set_frame_rate(8000)
    # Export as WAV
    audio.export(wav_file, format="wav", bitrate='8k', parameters=["-ac", "1", "-ar", "8000", "-sample_fmt", "s16"])

def convert_and_upload(s3, base_folder, bucket_name):
    file_list = list_files(base_folder)
    # for mp3_file_path in file_list:
    for mp3_file_path in track(file_list, description="Convert & Uploading"):
        # print(f"Processing {mp3_file_path}")
        converted_wav_file_path = change_extension('converted-drive-files/audio/' + '/'.join(mp3_file_path.split('/')[1:]))
        # print(f"Converting to {converted_wav_file_path}")
        try:
            os.makedirs(os.path.dirname(converted_wav_file_path), exist_ok=True)
            # convert_to_8khz_mono_with_bitrate(mp3_file_path, converted_mp3_file_path)
            convert_to_16bit_mono_8k_pcm_wav(mp3_file_path, converted_wav_file_path)
            # print(f"Converted to {converted_wav_file_path}")
            s3_file_path = '/'.join(converted_wav_file_path.split('/')[1:])
            s3.meta.client.upload_file(converted_wav_file_path, bucket_name, s3_file_path, ExtraArgs={'ContentType': 'audio/wav'})
            # print(f"Uploaded to {s3_file_path}")
        except Exception as e:
            print("Failed to process and upload", mp3_file_path, ":", e)

def copy_invalid_option_audio(s3, bucket):
    source_file = 'invalid_option_english.wav'
    # Upload the fallback "invalid option" audio to the bucket's audio/ prefix
    s3.meta.client.upload_file(source_file, bucket, f"audio/{source_file}", ExtraArgs={'ContentType': 'audio/wav'})

if __name__ == "__main__":
    destination_path = "original-drive-files"
    source_file_path = input("Please enter input file path: ")
    check_file_existence(source_file_path)
    download_audio_files(source_file_path)
    print("Completed downloading all the files.")
    s3, bucket = get_s3_object_and_bucket()
    convert_and_upload(s3, destination_path, bucket)
    print("Completed uploading all the files.")
    copy_invalid_option_audio(s3, bucket)
    print("Completed copying invalid option audio.")
37 changes: 37 additions & 0 deletions data-preparation/requirements.txt
@@ -0,0 +1,37 @@
boto3==1.34.38
botocore==1.34.38
cachetools==5.3.2
certifi==2024.2.2
charset-normalizer==3.3.2
google-api-core==2.16.2
google-api-python-client==2.116.0
google-auth==2.27.0
google-auth-httplib2==0.2.0
googleapis-common-protos==1.62.0
httplib2==0.22.0
idna==3.6
jmespath==1.0.1
markdown-it-py==3.0.0
mdurl==0.1.2
numpy==1.26.4
oauth2client==4.1.3
pandas==2.2.0
protobuf==4.25.2
pyasn1==0.5.1
pyasn1-modules==0.3.0
PyDrive==1.3.1
pydub==0.25.1
Pygments==2.17.2
pyparsing==3.1.1
python-dateutil==2.8.2
pytz==2024.1
PyYAML==6.0.1
requests==2.31.0
rich==13.7.0
rsa==4.9
s3transfer==0.10.0
six==1.16.0
tqdm==4.66.2
tzdata==2023.4
uritemplate==4.1.1
urllib3==2.0.7
