-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
edf5f24
commit 8fcc6c8
Showing
8 changed files
with
800 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
venv/** |
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
import boto3 | ||
import yaml | ||
from rich.progress import track | ||
|
||
if __name__ == '__main__':
    # Credentials, endpoint and the target bucket name come from the local
    # YAML config file.
    with open("config.yaml", "r") as config_file:
        CONFIG = yaml.safe_load(config_file)

    connection_settings = {
        key: CONFIG[key]
        for key in (
            "region_name",
            "aws_secret_access_key",
            "aws_access_key_id",
            "endpoint_url",
        )
    }
    s3 = boto3.resource('s3', **connection_settings)

    # Remove every object in the bucket, one delete call per object, while
    # showing a progress indicator.
    target_bucket = s3.Bucket(CONFIG['bucket'])
    for entry in track(target_bucket.objects.all(), description="Deleting files"):
        entry.delete()
    print("Completed deleting all the files.")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
region_name: "" | ||
aws_secret_access_key: "" | ||
aws_access_key_id: "" | ||
endpoint_url: "" | ||
bucket: "" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
import sys | ||
import csv | ||
import re | ||
import json | ||
# import gdown | ||
import boto3 | ||
import yaml | ||
|
||
|
||
from pathlib import Path | ||
|
||
CONFIG = {} | ||
|
||
|
||
def get_upload_path(file_name, category, language):
    """Return the object key under ``audio/`` for a file to upload.

    Args:
        file_name: Base name of the file (no directory part).
        category: Category folder name; empty or ``None`` for no folder.
        language: Language folder name; empty or ``None`` for no folder.

    Returns:
        ``audio/{category}/{language}/{file_name}`` when both category and
        language are given, otherwise ``audio/{file_name}``.
    """
    # Truthiness rather than `!= ""`, so None is treated like an empty string
    # instead of being formatted into the key as the text "None".
    # NOTE(review): create_config() in this file reads keys as
    # audio/<language>/<category>/<file>, while this helper emits the
    # category first. The only visible caller passes empty strings, so the
    # order has not mattered yet; confirm it before passing real values.
    if category and language:
        return f'audio/{category}/{language}/{file_name}'
    return f'audio/{file_name}'
|
||
def upload_to_oci_storage(s3, file_name, category, language,
                          content_type='application/json'):
    """Upload a local file to the bucket named in the module-level CONFIG.

    The upload is best-effort: any failure is reported on stdout and the
    function returns normally.

    Args:
        s3: boto3 S3 resource used for the upload.
        file_name: Path of the local file to upload.
        category: Category passed to get_upload_path() to build the key.
        language: Language passed to get_upload_path() to build the key.
        content_type: Value stored as the object's Content-Type. Defaults to
            ``application/json``, which is what this function always sent
            before the parameter existed.
    """
    try:
        upload_path = get_upload_path(Path(file_name).name, category, language)
        s3.meta.client.upload_file(
            str(file_name),
            CONFIG['bucket'],
            upload_path,
            ExtraArgs={'ContentType': content_type},
        )
    except Exception as e:
        # Deliberately not re-raised (callers rely on best-effort behaviour),
        # but the message names the file so the failure can be traced.
        print(f"Failed to upload {file_name}: {e}")
    else:
        print(upload_path, "uploaded successfully")
|
||
def create_config(s3):
    """Build the IVRS audio config from the bucket and write it as JSON.

    Every object whose key looks like ``audio/<language>/<category>/<file>``
    is listed under the config key ``"<category>:<language>"`` as a URL with
    the query string removed. For every language/category combination seen,
    ``"<category>:<language>:empty"`` is set to the invalid-option audio
    link, and for every language ``"invalid_option:<language>"`` is set to
    the same link.

    The result is written to ``converted-drive-files/ivrs_config.json``.

    Args:
        s3: boto3 S3 resource; the bucket name is read from the module-level
            CONFIG.
    """
    config = {}
    categories = set()
    languages = set()

    invalid_option_link = ["https://objectstorage.ap-hyderabad-1.oraclecloud.com/n/ax2cel5zyviy/b/sbdjp-ivrs/o/audio/invalid_option_english.wav"]
    bucket_name = CONFIG['bucket']

    for obj in s3.Bucket(bucket_name).objects.all():
        path = Path(obj.key)

        # Only keys with at least four parts that start with "audio" are
        # indexed; anything else in the bucket is ignored.
        if len(path.parts) < 4 or path.parts[0] != "audio":
            continue

        language = path.parts[1].strip().lower()
        category = path.parts[2].strip().lower()

        categories.add(category)
        languages.add(language)

        # A presigned GET URL with its query string dropped leaves just the
        # endpoint/bucket/key part of the URL.
        url = s3.meta.client.generate_presigned_url(
            ClientMethod='get_object',
            Params={'Bucket': bucket_name, 'Key': obj.key},
        )
        url = url.split("?")[0]

        config.setdefault(f"{category}:{language}", []).append(url)

    for lang in languages:
        for cat in categories:
            config[f"{cat}:{lang}:empty"] = invalid_option_link
        config[f"invalid_option:{lang}"] = invalid_option_link

    output_path = Path('converted-drive-files/ivrs_config.json')
    # Create the output directory when it is missing; without this the open()
    # below raises FileNotFoundError on a fresh checkout.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w') as f:
        json.dump(config, f)
|
||
if __name__ == '__main__':
    # CONFIG is rebound at module level here so that the functions in this
    # file, which read it as a global, see the loaded settings.
    with open("config.yaml", "r") as config_file:
        CONFIG = yaml.safe_load(config_file)

    connection_settings = {
        key: CONFIG[key]
        for key in (
            "region_name",
            "aws_secret_access_key",
            "aws_access_key_id",
            "endpoint_url",
        )
    }
    s3 = boto3.resource('s3', **connection_settings)

    # Rebuild the config JSON from the bucket contents, then upload that JSON
    # (empty category/language puts it directly under audio/).
    create_config(s3)
    upload_to_oci_storage(s3, "converted-drive-files/ivrs_config.json", "", "")
|
||
|
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,144 @@ | ||
import os | ||
import csv | ||
import re | ||
from pydrive.auth import GoogleAuth | ||
from pydrive.drive import GoogleDrive | ||
from pydub import AudioSegment | ||
import boto3 | ||
import yaml | ||
from rich.progress import track | ||
from collections import defaultdict | ||
import shutil | ||
|
||
def download_file(drive, file_id, destination_path, language, category):
    """Download one Drive file into ``<destination_path>/<language>/<category>/``.

    Args:
        drive: PyDrive client used to create the file handle.
        file_id: Google Drive file id.
        destination_path: Root folder for downloads.
        language: Sub-folder name under the root.
        category: Sub-folder name under the language folder.

    Returns:
        The local path the file content was written to; the file keeps the
        title reported by Drive.
    """
    remote = drive.CreateFile({'id': file_id})
    title = remote['title']

    target_dir = f"{destination_path}/{language}/{category}"
    os.makedirs(target_dir, exist_ok=True)

    local_path = f"{target_dir}/{title}"
    remote.GetContentFile(local_path)
    return local_path
|
||
def get_drive():
    """Authenticate with Google and return a PyDrive ``GoogleDrive`` client."""
    auth = GoogleAuth()
    # Starts a local webserver and handles the authentication flow itself.
    auth.LocalWebserverAuth()
    return GoogleDrive(auth)
|
||
def check_file_existence(file_path):
    """Verify that ``file_path`` exists and echo it back to the user.

    Args:
        file_path: Path to check.

    Raises:
        FileNotFoundError: If nothing exists at ``file_path``.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError("The file '{}' does not exist.".format(file_path))
    # The original format string had no placeholder, so the path was never
    # actually printed.
    print("The file given is {}".format(file_path))
|
||
|
||
def download_audio_files(source_file_path, destination_path="original-drive-files"):
    """Download every Drive file listed in a CSV and record where each landed.

    The CSV must have ``Category``, ``Language`` and ``Drive Link`` columns.
    Each file is stored via download_file() under
    ``<destination_path>/<language>/<category>/``. A copy of the input rows
    with an extra ``Downloaded Path`` column is written to ``out.csv`` in the
    current directory.

    Args:
        source_file_path: Path of the input CSV.
        destination_path: Root folder for the downloads. It used to be read
            from a module-level global; the default matches the value the
            script sets, so existing calls behave the same.

    Raises:
        ValueError: If a ``Drive Link`` value contains no ``/file/d/<id>``
            segment.
    """
    # newline="" is the csv module's documented way to open CSV files.
    with open(source_file_path, newline="") as csvfile:
        files_list = list(csv.DictReader(csvfile))

    if not files_list:
        # Nothing to do; also avoids authenticating and indexing an empty list.
        print(f"No rows found in {source_file_path}; nothing to download.")
        return

    # Accepts links with or without anything following the file id.
    file_id_pattern = re.compile(r'/file/d/([^/?#]+)')
    drive = get_drive()
    downloaded_files = set()

    for file in track(files_list, description="Downloading files"):
        category = file["Category"].strip().lower()
        language = file["Language"].strip().lower()
        drive_link = file["Drive Link"]

        match = file_id_pattern.search(drive_link)
        if match is None:
            raise ValueError(
                f"Cannot extract a Drive file id from link: {drive_link!r}"
            )

        local_file_path = download_file(
            drive, match.group(1), destination_path, language, category
        )
        file["Downloaded Path"] = local_file_path

        # Two rows resolving to the same local path means the later download
        # has just overwritten the earlier one; report it.
        if local_file_path in downloaded_files:
            print(f"File already downloaded: {local_file_path}")
        downloaded_files.add(local_file_path)

    with open("out.csv", "w", newline="") as f:
        wr = csv.DictWriter(f, delimiter=",", fieldnames=list(files_list[0].keys()))
        wr.writeheader()
        wr.writerows(files_list)
|
||
def get_s3_object_and_bucket():
    """Load ``config.yaml`` and return ``(s3_resource, bucket_name)``.

    The resource is created with the region, credentials and endpoint URL
    found in the config file; the bucket name is the config's ``bucket``
    entry.
    """
    with open("config.yaml", "r") as config_file:
        settings = yaml.safe_load(config_file)

    connection_settings = {
        key: settings[key]
        for key in (
            "region_name",
            "aws_secret_access_key",
            "aws_access_key_id",
            "endpoint_url",
        )
    }
    resource = boto3.resource('s3', **connection_settings)
    return resource, settings['bucket']
|
||
def is_audio_file(file_path):
    """Return True when the path ends in ``.mp3`` or ``.wav`` (any letter case)."""
    suffix = os.path.splitext(file_path)[1].lower()
    return suffix in ('.mp3', '.wav')
def list_files(directory):
    """Recursively collect the audio files found under ``directory``.

    Returns:
        A list of paths (directory joined with file name, in os.walk order)
        for every file that is_audio_file() accepts.
    """
    found = []
    for root, _subdirs, names in os.walk(directory):
        candidates = (os.path.join(root, name) for name in names)
        found.extend(path for path in candidates if is_audio_file(path))
    return found
|
||
def change_extension(file_path):
    """Return ``file_path`` with its last extension replaced by ``.wav``.

    A name without an extension simply gets ``.wav`` appended.
    """
    parent, leaf = os.path.split(file_path)
    stem = os.path.splitext(leaf)[0]
    return os.path.join(parent, f"{stem}.wav")
|
||
def convert_to_16bit_mono_8k_pcm_wav(mp3_file, wav_file):
    """Convert an audio file to a mono, 16-bit, 8000 Hz WAV file.

    Args:
        mp3_file: Path of the source audio. Despite the name it may be any
            format the decoder can detect; list_files() in this file also
            yields ``.wav`` sources.
        wav_file: Path the converted WAV is written to.
    """
    # from_file() lets the input format be detected. The previous from_mp3()
    # call forced MP3 decoding, which is wrong for the .wav inputs that
    # is_audio_file() also accepts.
    audio = AudioSegment.from_file(mp3_file)
    # Mono, 2 bytes per sample (16 bit), 8000 Hz.
    audio = audio.set_channels(1)
    audio = audio.set_sample_width(2)
    audio = audio.set_frame_rate(8000)
    # The same channel/rate/sample-format settings are also passed through
    # as explicit export parameters.
    audio.export(
        wav_file,
        format="wav",
        bitrate='8k',
        parameters=["-ac", "1", "-ar", "8000", "-sample_fmt", "s16"],
    )
|
||
def convert_and_upload(s3, base_folder, bucket_name):
    """Convert every audio file under ``base_folder`` to WAV and upload it.

    Each source ``<base_folder>/<rel>`` is converted to
    ``converted-drive-files/audio/<rel with .wav extension>`` and uploaded to
    the bucket under the key ``audio/<rel with .wav extension>`` with content
    type ``audio/wav``. Processing is best-effort: a file that fails to
    convert or upload is reported and the loop moves on to the next one.

    Args:
        s3: boto3 S3 resource used for the uploads.
        base_folder: Root folder that holds the downloaded source audio.
        bucket_name: Name of the destination bucket.
    """
    for mp3_file_path in track(list_files(base_folder), description="Convert & Uploading"):
        # Path relative to base_folder. relpath copes with a base folder that
        # has several components or a leading "./", which dropping the first
        # '/'-separated component did not.
        relative_wav_path = change_extension(
            os.path.relpath(mp3_file_path, base_folder)
        )
        converted_wav_file_path = os.path.join(
            'converted-drive-files', 'audio', relative_wav_path
        )
        # Object keys use forward slashes whatever the local separator is.
        s3_file_path = 'audio/' + relative_wav_path.replace(os.sep, '/')
        try:
            os.makedirs(os.path.dirname(converted_wav_file_path), exist_ok=True)
            convert_to_16bit_mono_8k_pcm_wav(mp3_file_path, converted_wav_file_path)
            s3.meta.client.upload_file(
                converted_wav_file_path,
                bucket_name,
                s3_file_path,
                ExtraArgs={'ContentType': 'audio/wav'},
            )
        except Exception as e:
            # Keep going with the remaining files, but say why this one failed.
            print(f"Failed to process and upload {mp3_file_path}: {e}")
|
||
def copy_invalid_option_audio(s3, bucket):
    """Upload the local ``invalid_option_english.wav`` to ``audio/`` in ``bucket``.

    The file is read from the current working directory and stored with
    content type ``audio/wav``.
    """
    local_name = 'invalid_option_english.wav'
    object_key = f"audio/{local_name}"
    s3.meta.client.upload_file(
        local_name,
        bucket,
        object_key,
        ExtraArgs={'ContentType': 'audio/wav'},
    )
|
||
if __name__ == "__main__":
    # Root folder for the downloaded originals. Kept as a module-level name
    # so that code in this file which looks it up globally keeps working.
    destination_path = "original-drive-files"

    # Step 1: ask for the CSV listing and download everything it references.
    source_file_path = input("Please enter input file path: ")
    check_file_existence(source_file_path)
    download_audio_files(source_file_path)
    print("Completed downloading all the files.")

    # Step 2: convert the downloads to WAV and push them to the bucket.
    s3, bucket = get_s3_object_and_bucket()
    convert_and_upload(s3, destination_path, bucket)
    print("Completed uploading all the files.")

    # Step 3: upload the shared "invalid option" prompt.
    copy_invalid_option_audio(s3, bucket)
    print("Completed copying invalid option audio.")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
boto3==1.34.38 | ||
botocore==1.34.38 | ||
cachetools==5.3.2 | ||
certifi==2024.2.2 | ||
charset-normalizer==3.3.2 | ||
google-api-core==2.16.2 | ||
google-api-python-client==2.116.0 | ||
google-auth==2.27.0 | ||
google-auth-httplib2==0.2.0 | ||
googleapis-common-protos==1.62.0 | ||
httplib2==0.22.0 | ||
idna==3.6 | ||
jmespath==1.0.1 | ||
markdown-it-py==3.0.0 | ||
mdurl==0.1.2 | ||
numpy==1.26.4 | ||
oauth2client==4.1.3 | ||
pandas==2.2.0 | ||
protobuf==4.25.2 | ||
pyasn1==0.5.1 | ||
pyasn1-modules==0.3.0 | ||
PyDrive==1.3.1 | ||
pydub==0.25.1 | ||
Pygments==2.17.2 | ||
pyparsing==3.1.1 | ||
python-dateutil==2.8.2 | ||
pytz==2024.1 | ||
PyYAML==6.0.1 | ||
requests==2.31.0 | ||
rich==13.7.0 | ||
rsa==4.9 | ||
s3transfer==0.10.0 | ||
six==1.16.0 | ||
tqdm==4.66.2 | ||
tzdata==2023.4 | ||
uritemplate==4.1.1 | ||
urllib3==2.0.7 |