import os
import numpy as np
import pandas as pd
import librosa
import joblib
import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Flatten
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')
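# The metadata loop below expects each clip's audio to exist locally before
# feature extraction. A minimal download helper is sketched here using the
# yt-dlp package (an assumption; the original script only references an
# undefined download_audio()). Requires `pip install yt-dlp` and ffmpeg on PATH.
def download_audio(youtube_url, audio_file_path):
    """Download the audio track of a YouTube video and save it as WAV (sketch)."""
    import yt_dlp  # assumed dependency, not imported elsewhere in this script
    ydl_opts = {
        'format': 'bestaudio/best',
        # Standard yt-dlp template: the postprocessor replaces %(ext)s with .wav
        'outtmpl': os.path.splitext(audio_file_path)[0] + '.%(ext)s',
        'postprocessors': [{'key': 'FFmpegExtractAudio', 'preferredcodec': 'wav'}],
        'quiet': True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([youtube_url])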
# Function to extract features from audio files
def extract_features(file_name):
    try:
        audio, sample_rate = librosa.load(file_name, sr=44100)
        n_fft = min(1024, len(audio))  # Ensure n_fft does not exceed the length of the audio
        # Extract features
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40, fmax=8000, n_fft=n_fft)
        chroma = librosa.feature.chroma_stft(y=audio, sr=sample_rate, n_fft=n_fft)
        mel = librosa.feature.melspectrogram(y=audio, sr=sample_rate, n_fft=n_fft)
        contrast = librosa.feature.spectral_contrast(y=audio, sr=sample_rate, n_fft=n_fft)
        tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(audio), sr=sample_rate)
        # Add additional features
        zcr = librosa.feature.zero_crossing_rate(y=audio)
        rmse = librosa.feature.rms(y=audio)
        # Combine features into a single vector: the mean of each feature over time
        features = np.hstack((np.mean(mfccs.T, axis=0),
                              np.mean(chroma.T, axis=0),
                              np.mean(mel.T, axis=0),
                              np.mean(contrast.T, axis=0),
                              np.mean(tonnetz.T, axis=0),
                              np.mean(zcr.T, axis=0),
                              np.mean(rmse.T, axis=0)))
        return features
    except Exception as e:
        print(f"Error processing {file_name}: {e}")
        return None
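# With librosa's default settings the vector above is 195-dimensional:
# 40 MFCC + 12 chroma + 128 mel bands + 7 spectral-contrast bands + 6 tonnetz
# + 1 zero-crossing rate + 1 RMS energy = 195 features per clip.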
# Load AudioSet metadata
metadata_audioset = pd.read_csv('D:/Sound_Recognition/Audioset/balanced.csv', header=None) # No header
# Display the first few rows of the DataFrame to understand its structure
print(metadata_audioset.head())
# Column indices for the headerless segments file
audio_id_column = 0    # YTID
start_time_column = 1  # start_seconds
end_time_column = 2    # end_seconds
label_column = 3       # positive_labels
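# In the AudioSet segment lists each row is: YTID, start_seconds, end_seconds,
# positive_labels, where positive_labels is a comma-separated string of label
# IDs (e.g. /m/09x0r for "Speech"). If you load Google's raw
# balanced_train_segments.csv rather than a preprocessed copy, pass comment='#'
# to read_csv to skip its leading comment lines.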
features_audioset = []
labels_audioset = []
# Process the metadata for AudioSet
for index, row in metadata_audioset.iterrows():
    youtube_id = row[audio_id_column]  # YouTube video ID (YTID)
    # positive_labels is comma-separated; keep only the first label so each
    # clip gets a single class, which keeps the label array aligned with the
    # feature array for the softmax classifier below
    label = str(row[label_column]).strip().strip('"').split(',')[0].strip()
    youtube_url = f'https://www.youtube.com/watch?v={youtube_id}'
    # Local path where the downloaded clip is expected to live
    # (the audio/ subdirectory is an assumption about your layout)
    audio_file_path = f'D:/Sound_Recognition/Audioset/audio/{youtube_id}.wav'
    # Download the audio from YouTube before extracting features; uncomment the
    # next line to use the download_audio() sketch defined above
    # download_audio(youtube_url, audio_file_path)
    # Extract features from the downloaded audio file
    feature = extract_features(audio_file_path)
    if feature is not None:
        features_audioset.append(feature)
        labels_audioset.append(label)
    else:
        print(f"Failed to extract features from audio with ID: {youtube_id}")
# Convert features and labels to NumPy arrays
features_audioset = np.array(features_audioset)
labels_audioset = np.array(labels_audioset)
# Encode labels (one label per clip, so no flattening is needed and the label
# array stays the same length as the feature array)
le = LabelEncoder()
encoded_labels = to_categorical(le.fit_transform(labels_audioset))
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_audioset, encoded_labels, test_size=0.2, random_state=42)
# Reshape for CNN input: each feature vector becomes a (features, 1, 1) "image"
X_train_cnn = X_train.reshape(X_train.shape[0], X_train.shape[1], 1, 1)
X_test_cnn = X_test.reshape(X_test.shape[0], X_test.shape[1], 1, 1)
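# Optional sanity check: with librosa's defaults each vector is 195-dim, so the
# shapes printed here should be (n_train, 195, 1, 1) and (n_test, 195, 1, 1)
print('CNN input shapes:', X_train_cnn.shape, X_test_cnn.shape)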
# Build the CNN model
model_cnn = Sequential()
model_cnn.add(Conv2D(32, (3, 1), activation='relu', input_shape=(X_train_cnn.shape[1], 1, 1)))
model_cnn.add(MaxPooling2D(pool_size=(2, 1)))
model_cnn.add(Conv2D(64, (3, 1), activation='relu'))
model_cnn.add(MaxPooling2D(pool_size=(2, 1)))
model_cnn.add(Flatten())
model_cnn.add(Dense(128, activation='relu'))
model_cnn.add(Dropout(0.5))
model_cnn.add(Dense(len(le.classes_), activation='softmax'))
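# Optional: print the architecture to verify layer output shapes before training
model_cnn.summary()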
# Compile the model
model_cnn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Train the model
history_cnn = model_cnn.fit(X_train_cnn, y_train, batch_size=32, epochs=120, validation_data=(X_test_cnn, y_test), verbose=1)
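# Optional alternative (an assumption, not in the original script): an
# EarlyStopping callback can end the 120-epoch budget early once validation
# loss stops improving, e.g.:
# from tensorflow.keras.callbacks import EarlyStopping
# early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
# history_cnn = model_cnn.fit(X_train_cnn, y_train, batch_size=32, epochs=120,
#                             validation_data=(X_test_cnn, y_test),
#                             callbacks=[early_stop], verbose=1)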
# Evaluate the model
score = model_cnn.evaluate(X_test_cnn, y_test, verbose=0)
print(f'Test loss: {score[0]} / Test accuracy: {score[1]}')
# Save the trained model and LabelEncoder
model_cnn.save('D:/Sound_Recognition/Audioset/Sound_Classifier_CNN.h5')
joblib.dump(le, 'D:/Sound_Recognition/Audioset/Sound_label.pkl')
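# To reuse the classifier later (a sketch; paths mirror the ones saved above):
# from tensorflow.keras.models import load_model
# model = load_model('D:/Sound_Recognition/Audioset/Sound_Classifier_CNN.h5')
# le = joblib.load('D:/Sound_Recognition/Audioset/Sound_label.pkl')
# feature = extract_features('some_clip.wav')  # hypothetical input file
# probs = model.predict(feature.reshape(1, -1, 1, 1))
# print(le.inverse_transform([np.argmax(probs)]))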
# Save label mappings to a CSV file
label_mapping = pd.DataFrame({
    'encoded_label': range(len(le.classes_)),
    'original_label': le.classes_
})
label_mapping.to_csv('D:/Sound_Recognition/Audioset/label_mapping.csv', index=False)
print("Model, LabelEncoder, and Label mappings have been saved successfully.")
# Optional: Plot training history
plt.plot(history_cnn.history['accuracy'], label='train accuracy')
plt.plot(history_cnn.history['val_accuracy'], label='val accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.title('CNN Training History')
plt.show()
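# Optional: the same history object also holds loss curves, e.g.:
# plt.plot(history_cnn.history['loss'], label='train loss')
# plt.plot(history_cnn.history['val_loss'], label='val loss')
# plt.xlabel('Epochs'); plt.ylabel('Loss'); plt.legend()
# plt.title('CNN Loss History'); plt.show()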