# sound.py
# Forked from codewithroger/Sound-Recognition-
import sys

import joblib
import librosa
import numpy as np
import sounddevice as sd
from tensorflow.keras.models import load_model
# Locations of the trained classifier and its fitted label encoder.
MODEL_PATH = 'D:/MegaProject/saved_models/urban_sound_classifier.h5'
LABEL_ENCODER_PATH = 'D:/MegaProject/saved_models/label_encoder.pkl'

# Both artefacts are loaded once at import time and shared by the functions
# below. NOTE: joblib.load unpickles the file, so only point it at encoder
# files from a trusted source.
model = load_model(MODEL_PATH)
le = joblib.load(LABEL_ENCODER_PATH)
def extract_features_from_audio(audio, sample_rate):
    """Summarise an audio signal as a single flat feature vector.

    Input with more than one dimension is first collapsed by averaging along
    axis 1. MFCC (40 coefficients, ``fmax=8000``), chroma, mel-spectrogram,
    spectral-contrast and tonnetz (computed on the harmonic component)
    features are then extracted with librosa, each is averaged down to one
    value per feature row, and the results are concatenated in that order.

    Args:
        audio: Audio samples as a NumPy array.
        sample_rate: Sampling rate, forwarded to librosa as ``sr``.

    Returns:
        The concatenated feature vector, or ``None`` if any step raised.
        Failures are printed instead of propagated (best-effort behaviour).
    """
    try:
        # Down-mix multi-channel input to a single channel.
        signal = audio if audio.ndim <= 1 else np.mean(audio, axis=1)

        # Tuple elements are evaluated left to right, so the librosa calls
        # run in the same order as the features appear in the output vector.
        feature_maps = (
            librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40, fmax=8000),
            librosa.feature.chroma_stft(y=signal, sr=sample_rate),
            librosa.feature.melspectrogram(y=signal, sr=sample_rate),
            librosa.feature.spectral_contrast(y=signal, sr=sample_rate),
            librosa.feature.tonnetz(
                y=librosa.effects.harmonic(signal), sr=sample_rate
            ),
        )

        # Collapse every feature map to its per-row mean and join them.
        return np.hstack([np.mean(fmap.T, axis=0) for fmap in feature_maps])
    except Exception as err:
        print(f"Error processing audio: {err}")
        return None
def predict_sound_label(audio, sample_rate):
    """Predict the class label of an audio signal.

    Features are extracted with ``extract_features_from_audio``, reshaped
    into a single-row batch, scored by the module-level ``model``, and the
    highest-scoring class index is mapped back to its label through the
    module-level label encoder ``le``.

    Args:
        audio: Audio samples as a NumPy array.
        sample_rate: Sampling rate of ``audio``.

    Returns:
        The predicted label, or ``None`` when feature extraction failed.
    """
    features = extract_features_from_audio(audio, sample_rate)
    if features is None:
        print("Unable to extract features from the audio")
        return None

    # The model is fed one sample at a time: shape (1, n_features).
    batch = features.reshape(1, -1)
    scores = model.predict(batch)
    best_class = np.argmax(scores, axis=1)
    return le.inverse_transform(best_class)[0]
def get_audio(duration=3, sample_rate=44100, device_index=5, channels=5):
    """Record audio from an input device using sounddevice.

    Args:
        duration: Recording length in seconds; fractional values are
            accepted and truncated to whole milliseconds.
        sample_rate: Sampling rate requested from the device.
        device_index: Device identifier passed to ``sd.InputStream``.
        channels: Number of input channels to open. Defaults to 5, the
            value that used to be hard-coded here.

    Returns:
        A tuple ``(audio, sample_rate)`` where ``audio`` is every captured
        block concatenated along the first axis.

    Raises:
        ValueError: If the stream delivered no audio blocks at all.
    """
    # Closure-local buffer: avoids leaking a module-level global and keeps
    # successive calls independent of each other.
    chunks = []

    def callback(indata, frames, time, status):
        # Signature is dictated by sounddevice; only indata/status are used.
        if status:
            print(status, file=sys.stderr)
        # Store a copy so the saved block does not alias the buffer handed
        # to the callback.
        chunks.append(indata.copy())

    with sd.InputStream(
        samplerate=sample_rate,
        channels=channels,
        device=device_index,
        callback=callback,
    ):
        print("Recording...")
        # sd.sleep takes a millisecond count; cast so a float duration
        # (e.g. 2.5 seconds) does not reach it as a float.
        sd.sleep(int(duration * 1000))
    print("Recording complete")

    if not chunks:
        # np.concatenate([]) would raise a less helpful ValueError.
        raise ValueError("No audio was captured from the input device")

    return np.concatenate(chunks), sample_rate
# Script driver: capture a short clip from the microphone, classify it, and
# report the result.
RECORD_SECONDS = 3
INPUT_DEVICE_INDEX = 5

audio, sample_rate = get_audio(
    duration=RECORD_SECONDS,
    device_index=INPUT_DEVICE_INDEX,
)
predicted_label = predict_sound_label(audio, sample_rate)
print(f"The predicted label for the recorded sound is: {predicted_label}")