diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..5ecb04b
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 konas122
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..502d5e4
--- /dev/null
+++ b/README.md
@@ -0,0 +1,42 @@
+# Voiceprint Recognition
+
+## Python Dependencies
+
+```
+python=3.8
+tensorboardX=2.6
+tensorboard=2.11.2
+scipy=1.4.1
+numpy=1.23.5
+librosa=0.9.2
+torch=1.8.1
+torchaudio=0.8.1
+torchvision=0.9.1
+```
+
+## Training
+
+Run `train.py` to start training.
+
+The network adds an LSTM and a linear layer on top of `resnet18` or `vgg19` to perform voiceprint recognition.
+The project also keeps a pure-CNN variant (`net_cnn.py`), which performs reasonably well too.
+
+## Training Data
+
+Here is the dataset I used: https://pan.baidu.com/s/1_KrjPB27AHPrBa_1AeMQSQ?pwd=0mag (extraction code: 0mag)
+
+You can also use your own dataset: create a `data` folder in the same directory as `train.py`, create a `train` subfolder under `data`, and put your training audio there. At present the code only supports `.wav` training audio.
+
+### Acknowledgements
+
+We studied several useful projects while writing this code, including:
+
+[clovaai/voxceleb_trainer](https://github.com/clovaai/voxceleb_trainer)
+
+[lawlict/ECAPA-TDNN](https://github.com/lawlict/ECAPA-TDNN/blob/master/ecapa_tdnn.py)
+
+[TaoRuijie/ECAPA-TDNN](https://github.com/TaoRuijie/ECAPA-TDNN)
+
+Thanks to these authors for open-sourcing their code!
+
+To be continued...
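For reference, here is a minimal sketch of how the `data/train` layout described in the README can be sanity-checked with the `loader` module added in this diff; the split ratio `k` and the folder counts are illustrative values, not requirements:

```python
# Sketch: verify the data/train/<speaker_id>/*.wav layout described above.
import loader

# folder_num speakers, file_num clips per speaker; k controls the
# per-speaker train/held-out split ratio inside load_files().
train_dict, test_dict, n_speakers = loader.load_files(mode="train", folder_num=40,
                                                      file_num=20, k=1.5)
print(n_speakers, len(train_dict), len(test_dict))
```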
diff --git a/audio.py b/audio.py
new file mode 100644
index 0000000..30b0233
--- /dev/null
+++ b/audio.py
@@ -0,0 +1,156 @@
+import torch
+import random
+import librosa
+import numpy as np
+import librosa.display
+from scipy.signal import medfilt
+import matplotlib.pyplot as plt
+# import torchaudio.transforms as T
+
+
+path = '.\\voices'
+name = 'a001.wav'
+audio_filename = ".\\data\\test\\G2231\\T0055G2231S0076.wav"
+
+
+def noise_augmentation(samples, min_db=40, max_db=80):
+    samples = samples.copy()  # frombuffer() makes the array read-only, so work on a copy
+    data_type = samples[0].dtype
+    db = np.random.randint(low=min_db, high=max_db)
+    db *= 1e-6
+    noise = db * np.random.normal(0, 1, len(samples))  # Gaussian noise
+    # print(db)
+    samples = samples + noise
+    samples = samples.astype(data_type)
+    return samples
+
+
+def add_noise(x, snr, method='vectorized', axis=0):
+    # Signal power
+    if method == 'vectorized':
+        N = x.size
+        Ps = np.sum(x ** 2 / N)
+    elif method == 'max_en':
+        N = x.shape[axis]
+        Ps = np.max(np.sum(x ** 2 / N, axis=axis))
+    elif method == 'axial':
+        N = x.shape[axis]
+        Ps = np.sum(x ** 2 / N, axis=axis)
+    else:
+        raise ValueError('method \"' + str(method) + '\" not recognized.')
+
+    Psdb = 10 * np.log10(Ps)  # Signal power, in dB
+    Pn = Psdb - snr  # Noise level necessary
+    n = np.sqrt(10 ** (Pn / 10)) * np.random.normal(0, 1, x.shape)  # Noise vector (or matrix)
+    return x + n
+
+
+def load_spectrogram(filename):
+    wav, fs = librosa.load(filename, sr=16000)
+    mag = librosa.feature.melspectrogram(y=wav, sr=16000, n_fft=512, n_mels=80,
+                                         win_length=400, hop_length=160)
+    mag = librosa.power_to_db(mag, ref=1.0, amin=1e-10, top_db=None)
+    librosa.display.specshow(mag, sr=16000, x_axis='time', y_axis='mel')  # plot the mel spectrogram
+    plt.show()
+
+    return mag
+
+
+def audio_to_wav(filename, sr=16000, noise=False):
+    wav, fs = librosa.load(filename, sr=sr)
+
+    # wav1 = load_spectrogram(wav)
+    # t = T.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400, hop_length=160,
+    #                      f_min=20, f_max=7600, window_fn=torch.hamming_window, n_mels=80)
+    # wav2 = torch.from_numpy(wav)
+    # wav2 = t(wav2)
+
+    extended_wav = np.append(wav, wav)
+    if len(extended_wav) < 41000:
+        extended_wav = np.append(extended_wav, wav)
+    if noise:
+        # Note: fs (16000) is passed as the SNR in dB here, so the added noise is effectively negligible.
+        extended_wav = add_noise(extended_wav, fs)
+    return extended_wav, fs
+
+
+def loadWAV(filename, noise=False):
+    y, sr = audio_to_wav(filename=filename, noise=noise)
+    assert len(y) >= 41000, f'Error: file {filename}\n'
+    num = random.randint(0, len(y) - 41000)
+    y = y[num:num + 41000]
+    y = torch.from_numpy(y).float()
+    return y
+
+
+def load_pure_wav(filename, frame_threshold=10, noise=False):
+    y, sr = audio_to_wav(filename=filename, noise=noise)
+    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=24, win_length=1024, hop_length=512, n_fft=1024)
+    Mfcc1 = medfilt(mfcc[0, :], 9)  # median-filter the first MFCC coefficient
+    pic = Mfcc1
+    start = 0
+    end = 0
+    points = []
+    min_data = min(pic) * 0.9
+    for i in range((pic.shape[0])):
+        if pic[i] < min_data and start == 0:
+            start = i
+        if pic[i] < min_data and start != 0:
+            end = i
+        elif pic[i] > min_data and start != 0:
+            hh = [start, end]
+            points.append(hh)
+            start = 0
+    if pic[-1] < min_data and start != 0:  # handle a file that ends in silence
+        hh = [start, end]
+        points.append(hh)
+    distances = []
+    for i in range(len(points)):
+        two_ends = points[i]
+        distance = two_ends[1] - two_ends[0]
+        if distance > frame_threshold:
+            distances.append(points[i])
+
+    # out, _ = soundfile.read(filename)
+    # out = out.astype(np.float32)
+    if len(distances) == 0:  # no silent segments
+        return y
+    else:
+        silence_data = []
+        for i in range(len(distances)):
+            if i == 0:
+                start, end = distances[i]
+                if start == 1:
+                    internal_clean = y[0:0]
+                else:
+                    start = (start - 1) * 512  # start sample of the first silent frame
+                    # end = (end - 1) * 512 + 1024
+                    internal_clean = y[0:start - 1]
+            else:
+                _, end = distances[i - 1]
+                start, _ = distances[i]
+                start = (start - 1) * 512
+                end = (end - 1) * 512 + 1024  # end sample of the previous silent frame
+                internal_clean = y[end + 1:start]
+            # hhh = np.array(internal_clean)
+            silence_data.extend(internal_clean)
+        ll = len(distances)  # handle the trailing audio
+        _, end = distances[ll - 1]
+        end = (end - 1) * 512 + 1024
+        end_part_clean = y[end:len(y)]
+        silence_data.extend(end_part_clean)
+        y = silence_data
+        y = torch.from_numpy(np.array(y)).float()
+    return y
+
+
+if __name__ == '__main__':
+    a = load_pure_wav(audio_filename, noise=True)
+    print(a.shape, a.dtype)
+    _ = load_spectrogram(audio_filename)
+    # a = np.array([[[-11, -10, -9, -8],
+    #                [-7, -6, -5, -4],
+    #                [-3, -2, -1, 0]],
+    #               [[1, 2, 3, 4],
+    #                [5, 6, 7, 8],
+    #                [9, 10, 11, 12]]])
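A quick usage sketch for the two loading paths above; the `.wav` path is a placeholder, and any 16 kHz mono file works:

```python
# Sketch: loadWAV() takes a random fixed-length crop; load_pure_wav()
# additionally trims silence detected via the first MFCC coefficient.
import audio

clip = audio.loadWAV("example.wav")                 # placeholder path
print(clip.shape, clip.dtype)                       # torch.Size([41000]) torch.float32

trimmed = audio.load_pure_wav("example.wav", frame_threshold=10)
batch = clip.unsqueeze(0)                           # the models expect (batch, samples)
```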
diff --git a/eval.py b/eval.py
new file mode 100644
index 0000000..bd222a4
--- /dev/null
+++ b/eval.py
@@ -0,0 +1,72 @@
+import torch
+import time
+from torch import nn
+# from d2l import torch as d2l
+
+
+class Timer:
+    def __init__(self):
+        self.times = []
+        self.tik = None
+        self.start()
+
+    def start(self):
+        self.tik = time.time()
+
+    def stop(self):
+        self.times.append(time.time() - self.tik)
+        return self.times[-1]
+
+    def avg(self):
+        return sum(self.times) / len(self.times)
+
+    def sum(self):
+        return sum(self.times)
+
+
+class Accumulator:
+    def __init__(self, n):
+        self.data = [0.0] * n
+
+    def add(self, *args):
+        self.data = [a + float(b) for a, b in zip(self.data, args)]
+
+    def reset(self):
+        self.data = [0.0] * len(self.data)
+
+    def __getitem__(self, idx):
+        return self.data[idx]
+
+
+def try_gpu(i=0):
+    if torch.cuda.device_count() >= i + 1:
+        return torch.device(f'cuda:{i}')
+    return torch.device('cpu')
+
+
+def accuracy(y_hat, y):
+    if len(y_hat.shape) > 1 and y_hat.shape[1] > 1:
+        y_hat = y_hat.argmax(axis=1)
+    astype = lambda x, *args, **kwargs: x.type(*args, **kwargs)
+    cmp = astype(y_hat, y.dtype) == y
+    reduce_sum = lambda x, *args, **kwargs: x.sum(*args, **kwargs)
+    return float(reduce_sum(astype(cmp, y.dtype)))
+
+
+def evaluate_accuracy_gpu(net, data_iter, device=None):
+    if isinstance(net, nn.Module):
+        net.eval()
+        if not device:
+            device = next(iter(net.parameters())).device
+    metric = Accumulator(2)
+
+    with torch.no_grad():
+        for X, y in data_iter:
+            if isinstance(X, list):
+                X = [x.to(device) for x in X]
+            else:
+                X = X.to(device)
+            y = y.to(device)
+            size = lambda x, *args, **kwargs: x.numel(*args, **kwargs)
+            metric.add(accuracy(net(X), y), size(y))
+    return metric[0] / metric[1]
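A small worked example of the bookkeeping utilities in `eval.py`:

```python
# Sketch: Accumulator/accuracy on a toy batch.
import torch
from eval import Accumulator, accuracy, try_gpu

y_hat = torch.tensor([[0.9, 0.1], [0.2, 0.8], [0.6, 0.4]])  # 3 samples, 2 classes
y = torch.tensor([0, 1, 1])                                 # third prediction is wrong
metric = Accumulator(2)                                     # (correct, total)
metric.add(accuracy(y_hat, y), y.numel())
print(metric[0] / metric[1])                                # 0.666...
print(try_gpu())                                            # cuda:0 if available, else cpu
```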
diff --git a/fine_tuning.py b/fine_tuning.py
new file mode 100644
index 0000000..d10cf73
--- /dev/null
+++ b/fine_tuning.py
@@ -0,0 +1,69 @@
+import torch
+import loader
+import train as t
+import eval as d2l
+# import torch_directml
+from loss import AAMSoftmax
+# from d2l import torch as d2l
+from tensorboardX import SummaryWriter
+from torch.utils.data import DataLoader
+from models.tdnn_pretrain import Pretrain_TDNN
+
+
+def load_model(path, output_num, device, not_grad=False):
+    load_net = torch.load(path, map_location=device)
+    model = Pretrain_TDNN(output_num, 1024, output_embedding=False, not_grad=not_grad)
+    model.speaker_encoder = load_net.speaker_encoder
+    # Re-apply the freeze flag: the loaded encoder replaces the one that
+    # Pretrain_TDNN.__init__ configured, so its requires_grad must be set again.
+    for param in model.speaker_encoder.parameters():
+        param.requires_grad = not not_grad
+    del load_net
+    return model
+
+
+if __name__ == "__main__":
+    people_num, data_per_people = 420, 10
+    noise, mel, reverse = False, True, False
+    margin, scale, easy_margin = 0.2, 20, False
+    num_epochs, learn_rate, weight_decay = 40, 0.1, 1e-3
+    learn_rate_period, learn_rate_decay = 10, 0.95
+    mode, model_name = "train", "resnet18"
+    hidden_size, num_layers = 64, 2
+
+    # Device = torch_directml.device()
+    # prefetch_factor, batch_size, num_works, persistent = 2, 32, 8, False
+
+    Device = d2l.try_gpu()
+    if Device.type == 'cpu':
+        prefetch_factor, batch_size, num_works, persistent = 2, 8, 8, False
+    elif torch.cuda.is_available():
+        prefetch_factor, batch_size, num_works, persistent = 8, 256, 32, True
+    else:
+        prefetch_factor, batch_size, num_works, persistent = 2, 32, 8, False
+
+    t.init_logs()
+    train_dict, test_dict, people_num = loader.load_files(mode=mode, folder_num=people_num,
+                                                          file_num=data_per_people, k=1)
+    train_dataset = loader.MyDataset(data_dict=train_dict, people_num=people_num, train=True,
+                                     mel=mel, noise=noise)
+    test_dataset = loader.MyDataset(data_dict=test_dict, people_num=people_num, train=False,
+                                    mel=mel, noise=noise)
+    print(len(train_dataset), len(test_dataset))
+    train_ = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True,
+                        drop_last=True, num_workers=num_works, pin_memory=True,
+                        persistent_workers=persistent, prefetch_factor=prefetch_factor)
+    test_ = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True,
+                       drop_last=True, num_workers=num_works, pin_memory=True,
+                       persistent_workers=persistent, prefetch_factor=prefetch_factor)
+
+    # pth_path = 'test.pth'
+    # model2 = load_model(pth_path, people_num, Device, not_grad=True)
+
+    model2 = Pretrain_TDNN(people_num, 1024, output_embedding=False, not_grad=False)
+    model2.load_parameters('param.model', Device)
+
+    loss = AAMSoftmax(192, people_num, margin, scale, easy_margin)
+    writer = SummaryWriter('./logs')
+    t.train(train_, test_, model2, loss, Device, writer, num_epochs, learn_rate, weight_decay)
+    model2.save_parameters('param2.model')
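A sketch of the frozen-encoder path above; `test.pth` is the placeholder name from the commented-out code, standing for a whole `Pretrain_TDNN` saved via `torch.save(model)`:

```python
# Sketch: resume from a whole-model checkpoint with a frozen encoder.
import eval as d2l
from fine_tuning import load_model

device = d2l.try_gpu()
model = load_model('test.pth', output_num=420, device=device, not_grad=True)
# With not_grad=True only the AAM-Softmax projection (model.weight) keeps
# gradients, so the optimizer in train.py updates just that layer.
```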
diff --git a/img/PR.jpg b/img/PR.jpg
new file mode 100644
index 0000000..22ad336
Binary files /dev/null and b/img/PR.jpg differ
diff --git a/img/ROC.jpg b/img/ROC.jpg
new file mode 100644
index 0000000..4c8278b
Binary files /dev/null and b/img/ROC.jpg differ
diff --git a/img/confusion_matrix.jpg b/img/confusion_matrix.jpg
new file mode 100644
index 0000000..d6d3a08
Binary files /dev/null and b/img/confusion_matrix.jpg differ
diff --git a/loader.py b/loader.py
new file mode 100644
index 0000000..5e44675
--- /dev/null
+++ b/loader.py
@@ -0,0 +1,160 @@
+import os
+import torch
+import audio
+import numpy as np
+import multiprocessing
+from joblib import Parallel, delayed
+from torch.utils.data import Dataset, DataLoader
+
+transcript_filename = ".\\data\\transcript.txt"
+test_path = ".\\data\\test"
+train_path = ".\\data\\train"
+dev_path = ".\\data\\dev"
+
+
+class MyDataset(Dataset):
+    def __init__(self, data_dict=None, people_num=None, train=True, mel=True, noise=False):
+        super(MyDataset, self).__init__()
+        self.noise = noise
+        self.mel = mel
+        self.train = train
+        self.data_dict = data_dict
+        self.spect = []
+        self.labels = []
+        if data_dict is None or people_num is None:
+            raise Exception(f'Error: data_dict {data_dict} is empty\n')
+        else:
+            self.people_num = people_num
+            self._preprocess()
+
+    def _preprocess(self):
+        out = Parallel(n_jobs=multiprocessing.cpu_count())(delayed(self._audio)(key) for key in self.data_dict)
+        self.spect = [value for value, _ in out]
+        self.labels = [value for _, value in out]
+        self.labels = torch.from_numpy(np.array(self.labels)).long()
+
+    def _audio(self, key):
+        spec = audio.loadWAV(filename=key)
+        return spec, self.data_dict[key]
+
+    def __getitem__(self, item):
+        label = self.labels[item]
+        spec = self.spect[item]
+        return spec, label
+
+    def __len__(self):
+        return len(self.labels)
+
+
+def load_files(mode="train", folder_num=-1, file_num=-1, k=1.5):
+    path = ".\\data"
+    train, test = {}, {}
+    if mode == "train":
+        path = path + '\\train'
+    elif mode == "test":
+        path = path + '\\test'
+    elif mode == "dev":
+        path = path + "\\dev"
+    else:
+        raise Exception(f'Error: mode {mode} does not exist')
+    dirs = os.listdir(path)
+
+    if 0 < folder_num < len(dirs):
+        if mode == "train":
+            num = np.arange(folder_num)
+        else:
+            num = np.random.choice(len(dirs), folder_num, replace=False)
+    else:
+        num = np.arange(len(dirs))
+        folder_num = len(dirs)
+    if k <= 0 or k >= 9:
+        k = 1.5
+
+    count = 0
+    folder_path = []
+    for i in num:
+        file_path = dirs[i]
+        folder_path.append(file_path)
+        file_path = os.path.join(path, file_path)
+        tmp_files = os.listdir(file_path)
+        sub_files = [tmp_files[file] for file in range(len(tmp_files))
+                     if tmp_files[file][-4:] == ".wav"]
+
+        if file_num > len(sub_files):
+            file_num = len(sub_files)
+        elif file_num < 10:
+            file_num = 10
+        np.random.shuffle(sub_files)
+        train_num = int(file_num // (k + 1) * k + 1)
+        # test_num = file_num - train_num
+
+        for j in range(train_num):
+            wav_file = os.path.join(file_path, sub_files[j])
+            train[wav_file] = count
+        for j in range(train_num, file_num):
+            wav_file = os.path.join(file_path, sub_files[j])
+            test[wav_file] = count
+        count += 1
+    return train, test, folder_num
+
+
+class Vocabulary:
+    def __init__(self, word_to_id, id_to_word, vocab, token, sentence_max):
+        self.word_to_id = word_to_id
+        self.id_to_word = id_to_word
+        self.vocab = vocab
+        self.token = token
+        self.sentence_max = sentence_max
+
+
+def transcript_process(filename, token="word"):
+    id_to_word = {}
+    word_to_id = {}
+    vocab = {}
+    sentence_max = 0
+    f = open(filename, "r", encoding="utf-8")
+    for line in f.readlines():
+        text = line[16:]
+        index = line[:16]
+        vec = np.array([], dtype='int16')
+        text = text.replace("\n", "")
+        if token == "word":
+            words = text.split(' ')
+        elif token == "char":
+            words = text.replace(" ", "")
+        else:
+            raise Exception(f'Error: token {token} does not exist')
+        # print(words)
+        for word in words:
+            if sentence_max < len(words):
+                sentence_max = len(words)
+            if word not in word_to_id:
+                new_id = len(word_to_id)
+                word_to_id[word] = new_id
+                id_to_word[new_id] = word
+            vec = np.append(vec, word_to_id[word])
+        vocab[index] = vec
+    f.close()
+    vocabulary = Vocabulary(word_to_id, id_to_word, vocab, token, sentence_max)
+    return vocabulary
+
+
+if __name__ == '__main__':
+    # vocabulary = transcript_process(transcript_filename, token="word")
+    # print(vocabulary.vocab)
+    # print(vocabulary.sentence_max)
+    Reverse = False
+    train_dict, test_dict, number = load_files("train", 40, 20, 1.5)
+    # for i in train_dict.values():
+    #     print(i)
+    train_dataset = MyDataset(train_dict, number, True, True, False)
+    test_dataset = MyDataset(test_dict, number, False, True, False)
+    print(len(train_dataset), len(test_dataset))
+    train_iter = DataLoader(dataset=train_dataset, batch_size=6, shuffle=True, drop_last=True, num_workers=4)
+    print(len(train_iter))
+    a = None
+    for b, (x, y) in enumerate(train_iter):
+        if b == 0:
+            a = x
+            print(x.shape, y)
+    print(a[0].shape)
diff --git a/logs/acc/test_acc/events.out.tfevents.1680200803.Konas b/logs/acc/test_acc/events.out.tfevents.1680200803.Konas
new file mode 100644
index 0000000..f06cfb8 Binary files /dev/null and b/logs/acc/test_acc/events.out.tfevents.1680200803.Konas differ diff --git a/logs/acc/test_acc/events.out.tfevents.1681549622.Konas b/logs/acc/test_acc/events.out.tfevents.1681549622.Konas new file mode 100644 index 0000000..1f2fd05 Binary files /dev/null and b/logs/acc/test_acc/events.out.tfevents.1681549622.Konas differ diff --git a/logs/acc/train_acc/events.out.tfevents.1678964030.Konas b/logs/acc/train_acc/events.out.tfevents.1678964030.Konas new file mode 100644 index 0000000..caa0452 Binary files /dev/null and b/logs/acc/train_acc/events.out.tfevents.1678964030.Konas differ diff --git a/logs/acc/train_acc/events.out.tfevents.1679050809.Konas b/logs/acc/train_acc/events.out.tfevents.1679050809.Konas new file mode 100644 index 0000000..b08adf6 Binary files /dev/null and b/logs/acc/train_acc/events.out.tfevents.1679050809.Konas differ diff --git a/logs/acc/train_acc/events.out.tfevents.1679832770.Konas b/logs/acc/train_acc/events.out.tfevents.1679832770.Konas new file mode 100644 index 0000000..4612c74 Binary files /dev/null and b/logs/acc/train_acc/events.out.tfevents.1679832770.Konas differ diff --git a/logs/acc/train_acc/events.out.tfevents.1679986062.Konas b/logs/acc/train_acc/events.out.tfevents.1679986062.Konas new file mode 100644 index 0000000..7533348 Binary files /dev/null and b/logs/acc/train_acc/events.out.tfevents.1679986062.Konas differ diff --git a/logs/acc/train_acc/events.out.tfevents.1680200803.Konas b/logs/acc/train_acc/events.out.tfevents.1680200803.Konas new file mode 100644 index 0000000..a8a239d Binary files /dev/null and b/logs/acc/train_acc/events.out.tfevents.1680200803.Konas differ diff --git a/logs/acc/train_acc/events.out.tfevents.1681549622.Konas b/logs/acc/train_acc/events.out.tfevents.1681549622.Konas new file mode 100644 index 0000000..da98fc9 Binary files /dev/null and b/logs/acc/train_acc/events.out.tfevents.1681549622.Konas differ diff --git a/logs/events.out.tfevents.1680192878.Konas b/logs/events.out.tfevents.1680192878.Konas new file mode 100644 index 0000000..792bc4c Binary files /dev/null and b/logs/events.out.tfevents.1680192878.Konas differ diff --git a/logs/events.out.tfevents.1680251006.Konas b/logs/events.out.tfevents.1680251006.Konas new file mode 100644 index 0000000..6524766 Binary files /dev/null and b/logs/events.out.tfevents.1680251006.Konas differ diff --git a/logs/events.out.tfevents.1681549581.Konas b/logs/events.out.tfevents.1681549581.Konas new file mode 100644 index 0000000..2354615 Binary files /dev/null and b/logs/events.out.tfevents.1681549581.Konas differ diff --git a/loss.py b/loss.py new file mode 100644 index 0000000..078216c --- /dev/null +++ b/loss.py @@ -0,0 +1,90 @@ +import math +import torch +import eval as d2l +import torch.nn as nn +# import torch.nn.functional as F + + +def prec_accuracy(output, target, topk=(1,)): + mask = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(mask, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(1 / batch_size)) + return res + + +def evaluate_accuracy_gpu(net, data_iter, device=None): + if isinstance(net, nn.Module): + net.eval() + if not device: + device = d2l.try_gpu() + metric = d2l.Accumulator(2) + + with torch.no_grad(): + for _, (X, y) in enumerate(data_iter): + if isinstance(X, list): + X = [x.to(device) for x in X] + else: + X = X.to(device) + y = 
y.to(device) + size = lambda x, *args, **kwargs: x.numel(*args, **kwargs) + phi = net(X) + + one_hot = torch.zeros(phi.size(), device='cuda' if torch.cuda.is_available() else 'cpu') + one_hot.scatter_(1, y.view(-1, 1), 1) + output = (one_hot * phi) + ((1.0 - one_hot) * phi) + prec = prec_accuracy(output.detach(), y.detach(), topk=(1,))[0] + metric.add(prec * size(y), size(y)) + + return metric[0] / metric[1] + + +class AAMSoftmax(nn.Module): + def __init__(self, nOut, nClasses, margin=0.2, scale=20, easy_margin=False): # or margin=0.2, scale=30 + super(AAMSoftmax, self).__init__() + self.test_normalize = True + self.m = margin + self.s = scale + self.in_feats = nOut + self.output_num = nClasses + # self.weight = torch.nn.Parameter(torch.FloatTensor(nClasses, nOut), requires_grad=True) + # nn.init.xavier_normal_(self.weight, gain=1) + self.ce = nn.CrossEntropyLoss() + self.easy_margin = easy_margin + self.cos_m = math.cos(self.m) + self.sin_m = math.sin(self.m) + # make the function cos(theta+m) monotonic decreasing while theta in [0°,180°] + self.th = math.cos(math.pi - self.m) + self.mm = math.sin(math.pi - self.m) * self.m + print('Initialised AAMSoftmax margin:%.3f scale:%.3f' % (self.m, self.s)) + + def forward(self, cosine, label): + assert cosine.size()[0] == label.size()[0] + assert cosine.size()[1] == self.output_num + # cos(theta) + # cosine = F.linear(F.normalize(cosine), F.normalize(self.weight)) + # cos(theta + m) + sine = torch.sqrt((1.0 - torch.mul(cosine, cosine)).clamp(0, 1)) + phi = cosine * self.cos_m - sine * self.sin_m + + if self.easy_margin: + phi = torch.where(cosine > 0, phi, cosine) + else: + phi = torch.where((cosine - self.th) > 0, phi, cosine - self.mm) + + # one_hot = torch.zeros_like(cosine) + one_hot = torch.zeros(cosine.size(), device='cuda' if torch.cuda.is_available() else 'cpu') + one_hot.scatter_(1, label.view(-1, 1), 1) + output = (one_hot * phi) + ((1.0 - one_hot) * cosine) + output = output * self.s + + loss = self.ce(output, label) + prec = prec_accuracy(output.detach(), label.detach(), topk=(1,))[0] + return loss, prec diff --git a/models/tdnn.py b/models/tdnn.py new file mode 100644 index 0000000..ab94c69 --- /dev/null +++ b/models/tdnn.py @@ -0,0 +1,177 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchaudio.transforms as T +from models.tdnn_module import PreEmphasis, FbankAug + + +class Res2Conv1dReluBn(nn.Module): + def __init__(self, channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=True, scale=4): + super().__init__() + assert channels % scale == 0, "{} % {} != 0".format(channels, scale) + self.scale = scale + self.width = channels // scale + self.nums = scale if scale == 1 else scale - 1 + + self.convs = [] + self.bns = [] + for i in range(self.nums): + self.convs.append(nn.Conv1d(self.width, self.width, kernel_size, stride, padding, dilation, bias=bias)) + self.bns.append(nn.BatchNorm1d(self.width)) + self.convs = nn.ModuleList(self.convs) + self.bns = nn.ModuleList(self.bns) + + def forward(self, x): + out = [] + spx = torch.split(x, self.width, 1) + sp = None + for i in range(self.nums): + if i == 0: + sp = spx[i] + else: + sp = sp + spx[i] + # Order: conv -> relu -> bn + sp = self.convs[i](sp) + sp = self.bns[i](F.relu(sp)) + out.append(sp) + if self.scale != 1: + out.append(spx[self.nums]) + out = torch.cat(out, dim=1) + return out + + +class Conv1dReluBn(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=True): + 
super().__init__() + self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias) + self.bn = nn.BatchNorm1d(out_channels) + + def forward(self, x): + return self.bn(F.relu(self.conv(x))) + + +class SE_Connect(nn.Module): + def __init__(self, channels, s=2): + super().__init__() + assert channels % s == 0, "{} % {} != 0".format(channels, s) + self.linear1 = nn.Linear(channels, channels // s) + self.linear2 = nn.Linear(channels // s, channels) + + def forward(self, x): + out = x.mean(dim=2) + out = F.relu(self.linear1(out)) + out = torch.sigmoid(self.linear2(out)) + out = x * out.unsqueeze(2) + return out + + +def SE_Res2Block(channels, kernel_size, stride, padding, dilation, scale): + return nn.Sequential( + Conv1dReluBn(channels, channels, kernel_size=1, stride=1, padding=0), + Res2Conv1dReluBn(channels, kernel_size, stride, padding, dilation, scale=scale), + Conv1dReluBn(channels, channels, kernel_size=1, stride=1, padding=0), + SE_Connect(channels) + ) + + +class AttentiveStatsPool(nn.Module): + def __init__(self, in_dim, bottleneck_dim, context): + super().__init__() + self.context = context + if self.context: + in_dims = in_dim * 3 + else: + in_dims = in_dim + self.linear = nn.Sequential( + nn.Conv1d(in_dims, bottleneck_dim, kernel_size=1), + nn.Tanh(), + nn.BatchNorm1d(bottleneck_dim), + nn.Conv1d(bottleneck_dim, in_dim, kernel_size=1), + nn.Softmax(dim=2), + ) + + def forward(self, x): + t = x.size()[-1] + if self.context: + global_x = torch.cat( + ( + x, + torch.mean(x, dim=2, keepdim=True).repeat(1, 1, t), + torch.sqrt(torch.var(x, dim=2, keepdim=True).clamp(min=1e-4, max=1e4)).repeat(1, 1, t), + ), + dim=1, + ) + else: + global_x = x + alpha = self.linear(global_x) + mean = torch.sum(alpha * x, dim=2) + residuals = torch.sum(alpha * x ** 2, dim=2) - mean ** 2 + std = torch.sqrt(residuals.clamp(min=1e-9)) + return torch.cat([mean, std], dim=1) + + +class ECAPA_TDNN(nn.Module): + def __init__(self, in_channels=80, channels=512, embd_dim=192, output_num=10, + context=True, aug=True, embedding=True): + super().__init__() + self.context = context + self.aug = aug + self.embedding = embedding + self.layer1 = Conv1dReluBn(in_channels, channels, kernel_size=5, padding=2) + self.layer2 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=2, dilation=2, scale=8) + self.layer3 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=3, dilation=3, scale=8) + self.layer4 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=4, dilation=4, scale=8) + + self.fbank = torch.nn.Sequential( + PreEmphasis(), + T.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400, hop_length=160, + f_min=20, f_max=7600, window_fn=torch.hamming_window, n_mels=80) + ) + self.specaug = FbankAug() # Spec augmentation + + cat_channels = channels * 3 + self.conv = nn.Conv1d(cat_channels, 1536, kernel_size=1) + self.pooling = AttentiveStatsPool(1536, 128, self.context) + self.bn1 = nn.BatchNorm1d(3072) + self.linear = nn.Linear(3072, embd_dim) + self.bn2 = nn.BatchNorm1d(embd_dim) + + self.weight = torch.nn.Parameter(torch.FloatTensor(output_num, embd_dim), requires_grad=True) + nn.init.xavier_normal_(self.weight, gain=1) + + def forward(self, x): + with torch.no_grad(): + x = self.fbank(x) + 1e-6 + x = x.log() + x = x - torch.mean(x, dim=-1, keepdim=True) + if self.aug: + x = self.specaug(x) + out1 = self.layer1(x) + out2 = self.layer2(out1) + out3 = self.layer3(out1 + out2) + out4 = self.layer4(out1 + out2 + out3) + + # out1 = self.layer1(x) + # out2 = 
self.layer2(out1) + out1 + # out3 = self.layer3(out1 + out2) + out1 + out2 + # out4 = self.layer4(out1 + out2 + out3) + out1 + out2 + out3 + + out = torch.cat([out2, out3, out4], dim=1) + out = F.relu(self.conv(out)) + if out.shape[0] == 1: + out = self.linear(self.pooling(out)) + else: + out = self.bn1(self.pooling(out)) + out = self.bn2(self.linear(out)) + + if not self.embedding: + return F.linear(F.normalize(out), F.normalize(self.weight)) + return out + + +if __name__ == '__main__': + X = torch.zeros(2, 90000) + model = ECAPA_TDNN(in_channels=80, channels=512, embd_dim=192, output_num=10, context=True, embedding=False) + output = model(X) + # print(model) + print(output.shape) # [2, 192] or [2, output_num] diff --git a/models/tdnn_l.py b/models/tdnn_l.py new file mode 100644 index 0000000..54fbf54 --- /dev/null +++ b/models/tdnn_l.py @@ -0,0 +1,57 @@ +import torch +import eval as d2l +import torch.nn as nn +# import torch.nn.functional as F +from models.tdnn_module import ECAPA_TDNN + + +class ECAPAModel(nn.Module): + def __init__(self, n_class, C=1024, output_embedding=True, not_grad=False): + super(ECAPAModel, self).__init__() + self.in_features = 192 + self.output_num = n_class + self.output_embedding = output_embedding + self.speaker_encoder = ECAPA_TDNN(C=C) + self.fc = nn.Linear(192, self.output_num) + if not not_grad: + for param in self.speaker_encoder.parameters(): + param.requires_grad = True + else: + for param in self.speaker_encoder.parameters(): + param.requires_grad = False + + def forward(self, x, aug=True): + out = self.speaker_encoder(x, aug=aug) + if not self.output_embedding: + return self.fc(out) + else: + return out + + def save_parameters(self, path): + torch.save(self.state_dict(), path) + + def load_parameters(self, path, device): + self_state = self.state_dict() + loaded_state = torch.load(path, map_location=device) + for name, param in loaded_state.items(): + origname = name + if name not in self_state: + name = name.replace("module.", "") + if name not in self_state: + print("%s is not in the model." 
% origname) + continue + if self_state[name].size() != loaded_state[origname].size(): + print("Wrong parameter length: %s, model: %s, loaded: %s" % ( + origname, self_state[name].size(), loaded_state[origname].size())) + continue + self_state[name].copy_(param) + + +if __name__ == '__main__': + net = ECAPAModel(100, 1024, False) + net.load_parameters("../pretrain.model", d2l.try_gpu()) + X = torch.zeros(2, 90000) + output = net(X) + print(output.shape) + # parameters = torch.load("../pretrain.model", map_location=d2l.try_gpu()) + # print(parameters) diff --git a/models/tdnn_module.py b/models/tdnn_module.py new file mode 100644 index 0000000..a60fded --- /dev/null +++ b/models/tdnn_module.py @@ -0,0 +1,195 @@ +import math +import torch +import torchaudio +import torch.nn as nn +import torch.nn.functional as F + + +class SEModule(nn.Module): + def __init__(self, channels, bottleneck=128): + super(SEModule, self).__init__() + self.se = nn.Sequential( + nn.AdaptiveAvgPool1d(1), + nn.Conv1d(channels, bottleneck, kernel_size=1, padding=0), + nn.ReLU(), + # nn.BatchNorm1d(bottleneck), # I remove this layer + nn.Conv1d(bottleneck, channels, kernel_size=1, padding=0), + nn.Sigmoid(), + ) + + def forward(self, input): + x = self.se(input) + return input * x + + +class Bottle2neck(nn.Module): + def __init__(self, inplanes, planes, kernel_size=None, dilation=None, scale=8): + super(Bottle2neck, self).__init__() + width = int(math.floor(planes / scale)) + self.conv1 = nn.Conv1d(inplanes, width * scale, kernel_size=1) + self.bn1 = nn.BatchNorm1d(width * scale) + self.nums = scale - 1 + convs = [] + bns = [] + num_pad = math.floor(kernel_size / 2) * dilation + for i in range(self.nums): + convs.append(nn.Conv1d(width, width, kernel_size=kernel_size, dilation=dilation, padding=num_pad)) + bns.append(nn.BatchNorm1d(width)) + self.convs = nn.ModuleList(convs) + self.bns = nn.ModuleList(bns) + self.conv3 = nn.Conv1d(width * scale, planes, kernel_size=1) + self.bn3 = nn.BatchNorm1d(planes) + self.relu = nn.ReLU() + self.width = width + self.se = SEModule(planes) + + def forward(self, x): + sp = None + residual = x + out = self.conv1(x) + out = self.relu(out) + out = self.bn1(out) + + spx = torch.split(out, self.width, 1) + for i in range(self.nums): + if i == 0: + sp = spx[i] + else: + sp = sp + spx[i] + sp = self.convs[i](sp) + sp = self.relu(sp) + sp = self.bns[i](sp) + if i == 0: + out = sp + else: + out = torch.cat((out, sp), 1) + out = torch.cat((out, spx[self.nums]), 1) + + out = self.conv3(out) + out = self.relu(out) + out = self.bn3(out) + + out = self.se(out) + out += residual + return out + + +class PreEmphasis(nn.Module): + def __init__(self, coef: float = 0.97): + super().__init__() + self.coef = coef + self.register_buffer( + 'flipped_filter', torch.FloatTensor([-self.coef, 1.]).unsqueeze(0).unsqueeze(0) + ) + + def forward(self, input: torch.tensor) -> torch.tensor: + input = input.unsqueeze(1) + input = F.pad(input, (1, 0), 'reflect') + return F.conv1d(input, self.flipped_filter).squeeze(1) + + +class FbankAug(nn.Module): + def __init__(self, freq_mask_width=(0, 8), time_mask_width=(0, 10)): + self.time_mask_width = time_mask_width + self.freq_mask_width = freq_mask_width + super().__init__() + + def mask_along_axis(self, x, dim): + original_size = x.shape + batch, fea, time = x.shape + if dim == 1: + D = fea + width_range = self.freq_mask_width + else: + D = time + width_range = self.time_mask_width + + mask_len = torch.randint(width_range[0], width_range[1], (batch, 1), 
device=x.device).unsqueeze(2) + mask_pos = torch.randint(0, max(1, D - mask_len.max()), (batch, 1), device=x.device).unsqueeze(2) + arange = torch.arange(D, device=x.device).view(1, 1, -1) + mask = (mask_pos <= arange) * (arange < (mask_pos + mask_len)) + mask = mask.any(dim=1) + + if dim == 1: + mask = mask.unsqueeze(2) + else: + mask = mask.unsqueeze(1) + + x = x.masked_fill_(mask, 0.0) + return x.view(*original_size) + + def forward(self, x): + x = self.mask_along_axis(x, dim=2) + x = self.mask_along_axis(x, dim=1) + return x + + +class ECAPA_TDNN(nn.Module): + def __init__(self, C): + super(ECAPA_TDNN, self).__init__() + + self.torchfbank = torch.nn.Sequential( + PreEmphasis(), + torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400, hop_length=160, + f_min=20, f_max=7600, window_fn=torch.hamming_window, n_mels=80), + ) + + self.specaug = FbankAug() # Spec augmentation + + self.conv1 = nn.Conv1d(80, C, kernel_size=5, stride=1, padding=2) + self.relu = nn.ReLU() + self.bn1 = nn.BatchNorm1d(C) + self.layer1 = Bottle2neck(C, C, kernel_size=3, dilation=2, scale=8) + self.layer2 = Bottle2neck(C, C, kernel_size=3, dilation=3, scale=8) + self.layer3 = Bottle2neck(C, C, kernel_size=3, dilation=4, scale=8) + # I fixed the shape of the output from MFA layer, that is close to the setting from ECAPA paper. + self.layer4 = nn.Conv1d(3 * C, 1536, kernel_size=1) + self.attention = nn.Sequential( + nn.Conv1d(4608, 256, kernel_size=1), + nn.ReLU(), + nn.BatchNorm1d(256), + nn.Tanh(), # I add this layer + nn.Conv1d(256, 1536, kernel_size=1), + nn.Softmax(dim=2), + ) + self.bn5 = nn.BatchNorm1d(3072) + self.fc6 = nn.Linear(3072, 192) + self.bn6 = nn.BatchNorm1d(192) + + def forward(self, x, aug=True): + with torch.no_grad(): + x = self.torchfbank(x) + 1e-6 + x = x.log() + x = x - torch.mean(x, dim=-1, keepdim=True) + if aug: + x = self.specaug(x) + + x = self.conv1(x) + x = self.relu(x) + x = self.bn1(x) + + x1 = self.layer1(x) + x2 = self.layer2(x + x1) + x3 = self.layer3(x + x1 + x2) + + x = self.layer4(torch.cat((x1, x2, x3), dim=1)) + x = self.relu(x) + + t = x.size()[-1] + + global_x = torch.cat((x, torch.mean(x, dim=2, keepdim=True).repeat(1, 1, t), + torch.sqrt(torch.var(x, dim=2, keepdim=True).clamp(min=1e-4)).repeat(1, 1, t)), dim=1) + + w = self.attention(global_x) + + mu = torch.sum(x * w, dim=2) + sg = torch.sqrt((torch.sum((x ** 2) * w, dim=2) - mu ** 2).clamp(min=1e-4)) + + x = torch.cat((mu, sg), 1) + if x.shape[0] > 1: + x = self.bn5(x) + x = self.fc6(x) + if x.shape[0] > 1: + x = self.bn6(x) + + return x diff --git a/models/tdnn_pretrain.py b/models/tdnn_pretrain.py new file mode 100644 index 0000000..1865966 --- /dev/null +++ b/models/tdnn_pretrain.py @@ -0,0 +1,55 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from models.tdnn_module import ECAPA_TDNN as TDNN + + +class Pretrain_TDNN(nn.Module): + def __init__(self, n_class, C=1024, output_embedding=True, not_grad=False, aug=True): + super(Pretrain_TDNN, self).__init__() + self.aug = aug + self.in_features = 192 + self.output_num = n_class + self.output_embedding = output_embedding + self.speaker_encoder = TDNN(C=C) + self.weight = torch.nn.Parameter(torch.FloatTensor(n_class, 192), requires_grad=True) + nn.init.xavier_normal_(self.weight, gain=1) + if not not_grad: + for param in self.speaker_encoder.parameters(): + param.requires_grad = True + else: + for param in self.speaker_encoder.parameters(): + param.requires_grad = False + + def forward(self, x): + out = 
self.speaker_encoder(x, aug=self.aug)
+        if not self.output_embedding:
+            return F.linear(F.normalize(out), F.normalize(self.weight))
+        else:
+            return out
+
+    def save_parameters(self, path):
+        torch.save(self.state_dict(), path)
+
+    def load_parameters(self, path, device):
+        self_state = self.state_dict()
+        loaded_state = torch.load(path, map_location=device)
+        for name, param in loaded_state.items():
+            origname = name
+            if name not in self_state:
+                name = name.replace("module.", "")
+                if name not in self_state:
+                    print("%s is not in the model." % origname)
+                    continue
+            if self_state[name].size() != loaded_state[origname].size():
+                print("Wrong parameter length: %s, model: %s, loaded: %s" % (
+                    origname, self_state[name].size(), loaded_state[origname].size()))
+                continue
+            self_state[name].copy_(param)
+
+
+if __name__ == '__main__':
+    net = Pretrain_TDNN(100, 1024, True)
+    X = torch.zeros(2, 41500)
+    output = net(X)
+    print(output.shape)
diff --git a/param.model b/param.model
new file mode 100644
index 0000000..769a13a
Binary files /dev/null and b/param.model differ
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..679a36a
--- /dev/null
+++ b/test.py
@@ -0,0 +1,17 @@
+import torch
+
+import eval as d2l
+from tools import eval_net
+# from d2l import torch as d2l
+from models.tdnn_pretrain import Pretrain_TDNN
+
+if __name__ == "__main__":
+    model_path = './param.model'
+    Device = d2l.try_gpu()
+
+    model2 = Pretrain_TDNN(420, 1024, False, not_grad=False)
+    model2.load_parameters(model_path, Device)
+    # model2 = torch.load('net.pth')
+
+    EER, minDCF = eval_net(model2, Device, 10, 10)
+    print(f'EER:{EER:.4f} minDCF:{minDCF:.4f}')
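A sketch of a single verification trial, scored the way `tools.eval_net` (added below) scores it: cosine similarity between L2-normalised embeddings. The two `.wav` paths are placeholders:

```python
# Sketch: one enroll/test verification trial with the pretrained encoder.
import torch
import eval as d2l
from tools import get_embedding
from models.tdnn_pretrain import Pretrain_TDNN

device = d2l.try_gpu()
net = Pretrain_TDNN(420, 1024, output_embedding=True)
net.load_parameters('param.model', device)

enroll_emb = get_embedding(net, 'enroll.wav', device)  # (1, 192), unit norm
test_emb = get_embedding(net, 'test.wav', device)
score = torch.matmul(enroll_emb, test_emb.mT).item()   # cosine similarity in [-1, 1]
print(score)
```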
diff --git a/tools.py b/tools.py
new file mode 100644
index 0000000..848e83c
--- /dev/null
+++ b/tools.py
@@ -0,0 +1,157 @@
+import numpy as np
+import torch
+import audio
+import loader
+import random
+from sklearn import metrics
+import torch.nn.functional as F
+import matplotlib.pyplot as plt
+
+
+def tuneThresholdfromScore(scores, labels, target_fa):
+    # compute the ROC curve with scikit-learn
+    fpr, tpr, thresholds = metrics.roc_curve(labels, scores, pos_label=1)
+    # compute the AUC
+    auc = metrics.auc(fpr, tpr)
+
+    plt.plot(fpr, tpr, 'k--', label='ROC (area = {0:.2f})'.format(auc), lw=2)
+    plt.xlim([-0.05, 1.05])
+    plt.ylim([-0.05, 1.05])
+    plt.xlabel('False Positive Rate')
+    plt.ylabel('True Positive Rate')
+    plt.title('ROC Curve')
+    plt.savefig('./img/ROC.jpg', dpi=400)
+
+    prec, recall, _ = metrics.precision_recall_curve(labels, scores, pos_label=1)
+    metrics.PrecisionRecallDisplay(precision=prec, recall=recall).plot()
+    plt.savefig('./img/PR.jpg', dpi=400)
+
+    fnr = 1 - tpr
+    tunedThreshold = []
+
+    for tfa in target_fa:
+        idx = np.nanargmin(np.absolute(tfa - fpr))  # np.where(fpr<=tfa)[0][-1]
+        tunedThreshold.append([thresholds[idx], fpr[idx], fnr[idx]])
+    # the EER point: the index where |fnr - fpr| is smallest (NaNs excluded)
+    idxE = np.nanargmin(np.absolute(fnr - fpr))
+    eer = max(fpr[idxE], fnr[idxE])
+
+    return tunedThreshold[1][0], eer, auc, fpr, fnr
+
+
+def ComputeErrorRates(scores, labels, threshold=0.96695, p=0.01):
+    assert len(scores) == len(labels), f'Error: {scores} {labels}\n'
+    predict = []
+    threshold = threshold if 0.9693 <= threshold < 0.99 else 0.9693
+    for i in range(len(scores)):
+        if scores[i] > threshold:
+            predict.append(1)
+        else:
+            predict.append(0)
+    matrix = metrics.confusion_matrix(labels, predict)
+    [TN, FP], [FN, TP] = matrix
+    matrix = np.array([[TP, FN], [FP, TN]])
+
+    metrics.ConfusionMatrixDisplay(confusion_matrix=matrix,
+                                   display_labels=['Positive', 'Negative']).plot()
+    plt.savefig('./img/confusion_matrix.jpg', dpi=400)
+
+    FAR = FP / (FP + TN)
+    FRR = FN / (TP + FN)
+    minDCF = FAR * (1 - p) + FRR * p
+    return matrix, minDCF
+
+
+def ComputeMinDcf(fnrs, fprs, thresholds, p_target, c_miss, c_fa):
+    min_c_det = float("inf")
+    min_c_det_threshold = thresholds[0]
+    for i in range(0, len(fnrs)):
+        c_det = c_miss * fnrs[i] * p_target + c_fa * fprs[i] * (1 - p_target)
+        if c_det < min_c_det:
+            min_c_det = c_det
+            min_c_det_threshold = thresholds[i]
+    c_def = min(c_miss * p_target, c_fa * (1 - p_target))
+    min_dcf = min_c_det / c_def
+    return min_dcf, min_c_det_threshold
+
+
+def get_embedding(net, name, device):
+    net.aug = False
+    net.output_embedding = True
+    net.to(device)
+    wav = audio.loadWAV(filename=name)
+    wav = wav.unsqueeze(0).to(device)
+    with torch.no_grad():
+        embedding = net(wav)
+        embedding = F.normalize(embedding, p=2, dim=1)
+    return embedding
+
+
+def dic_process(dic):
+    result = {}
+    value = list(dic.values())[0]
+    embedding_list = []
+    for item in dic.items():
+        if item[1] != value:
+            value = item[1]
+            embedding_list = []
+        embedding_list.append(item[0])
+        result[value] = embedding_list
+    return result
+
+
+def eval_net(net, device, folder_num=-1, file_num=-1):
+    labels = []
+    embed_dict = {}
+    score_list = []
+    enroll, test, folder_num = loader.load_files("test", folder_num, file_num, 9)
+    enroll = dic_process(enroll)
+    test = dic_process(test)
+
+    for key in enroll:
+        count = 0
+        embed = None
+        for name in enroll[key]:
+            if count >= len(enroll[key]):
+                break
+            count += 1
+            embedding = get_embedding(net, name, device)
+
+            if count == 1:
+                embed = embedding
+            else:
+                embed = torch.cat([embed, embedding])
+        embed = torch.mean(embed, dim=0).unsqueeze(0)
+        embed_dict[key] = embed
+
+    for item in enroll:
+        dict_key_ls = list(enroll.keys())
+        random.shuffle(dict_key_ls)
+        for label in dict_key_ls:
+            if label == item:
+                y_true = 1
+            else:
+                y_true = 0
+            num = random.randint(0, len(test[label]) - 1)
+            embed1 = get_embedding(net, test[label][num], device)
+            embedding = embed_dict[item]
+
+            score = torch.matmul(embed1, embedding.mT).cpu().numpy().reshape(-1)
+            score_list.append(score)
+            labels.append(y_true)
+
+    threshold, EER, AUC, _, _ = tuneThresholdfromScore(score_list, labels, [1, 0.1])
+    _, minDCF = ComputeErrorRates(score_list, labels, threshold)
+    return EER, minDCF
+
+
+if __name__ == '__main__':
+    train_dict, test_dict, number = loader.load_files("train", 40, 20, 1.5)
+    dic_process(train_dict)
+    # print(train_dict)
+
+    # embed = torch.FloatTensor([[0.1, 0.2, 0.3, 0.4],
+    #                            [0.5, 0.6, 0.7, 0.8]])
+    # sum = torch.matmul(embed, embed.T)
+    # sum = torch.sum(sum, dim=[0, 1], keepdim=False)
+    # print(sum)
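A toy example of `tuneThresholdfromScore`: one high-scoring negative trial yields a nonzero EER. It assumes an `img/` directory exists (the repo ships one), since the function saves ROC/PR plots there:

```python
# Toy EER computation on hand-made trial scores.
from tools import tuneThresholdfromScore

scores = [0.99, 0.97, 0.95, 0.98, 0.40, 0.30]
labels = [1, 1, 1, 0, 0, 0]          # the 0.98 trial is a false accept candidate
threshold, eer, auc, _, _ = tuneThresholdfromScore(scores, labels, [1, 0.1])
print(f'EER={eer:.3f} AUC={auc:.3f} threshold={threshold:.3f}')  # EER=0.333 here
```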
diff --git a/train.py b/train.py
new file mode 100644
index 0000000..8d1842a
--- /dev/null
+++ b/train.py
@@ -0,0 +1,106 @@
+import os
+import torch
+import loader
+import eval as d2l
+# from d2l import torch as d2l
+from tensorboardX import SummaryWriter
+from torch.utils.data import DataLoader
+from models.tdnn import ECAPA_TDNN
+from loss import AAMSoftmax, evaluate_accuracy_gpu
+
+
+def init_logs(path=".\\logs"):
+    for root, dirs, files in os.walk(path, topdown=False):
+        for name in files:
+            os.remove(os.path.join(root, name))
+        for name in dirs:
+            os.rmdir(os.path.join(root, name))
+
+
+def train(train_iter, test_iter, net, loss_func, device, write, num_epoch=10, lr=0.1, wd=2e-4):
+    net.to(device)
+    trainer = torch.optim.Adam(params=(param for param in net.parameters()
+                                       if param.requires_grad), lr=lr, weight_decay=wd)
+    scheduler = torch.optim.lr_scheduler.CyclicLR(trainer, base_lr=1e-3, max_lr=0.1, step_size_up=6250,
+                                                  mode="triangular2", cycle_momentum=False)
+    timer = d2l.Timer()
+    sample_count, img = 0, None  # renamed from `sum` to avoid shadowing the built-in
+    for epoch in range(num_epoch):
+        print(f'\nepoch {epoch + 1}:')
+        train_acc = train_l = 0
+        metric = d2l.Accumulator(3)
+        net.train()
+        for i, (x, y) in enumerate(train_iter):
+            # if i == 0 and epoch == num_epoch - 1:
+            #     img = x.to(device)
+            timer.start()
+            x, y = x.to(device), y.to(device)
+            trainer.zero_grad()
+            y_hat = net(x)
+            l, prec = loss_func(y_hat, y)
+            l.backward()
+            trainer.step()
+            with torch.no_grad():
+                metric.add(l * x.shape[0], prec * x.shape[0], x.shape[0])
+            timer.stop()
+            train_l = metric[0] / metric[2]
+            train_acc = metric[1] / metric[2]
+        scheduler.step()
+        sample_count += metric[2]
+        # test_acc = 0
+        test_acc = evaluate_accuracy_gpu(net, test_iter)
+        print(f'\tloss {train_l:.3f}, train acc {train_acc:.3f}, '
+              f'test acc {test_acc:.3f}')
+        write.add_scalar('loss', train_l, epoch)
+        write.add_scalars('acc', {'test_acc': test_acc, 'train_acc': train_acc}, epoch)
+    print(f'\n{sample_count / timer.sum():.1f} examples/sec '
+          f'on {str(device)}')
+    # write.add_graph(net, img)
+
+
+if __name__ == "__main__":
+    people_num, data_per_people = 420, 150
+    noise, mel = False, True
+    margin, scale, easy_margin = 0.2, 20, False
+    not_grad, bidirectional, reverse = False, True, False
+    num_epochs, learn_rate, weight_decay = 150, 0.125, 1e-3
+    mode, model_name = "train", "dense169"
+    hidden_size, num_layers = 64, 2
+    model_path = './pretrain.model'
+
+    # Device = torch_directml.device()
+    # print(Device)
+    # prefetch_factor, batch_size, num_works, persistent = 2, 32, 8, False
+
+    Device = d2l.try_gpu()
+    if Device.type == 'cpu':
+        prefetch_factor, batch_size, num_works, persistent = 2, 8, 8, False
+    elif torch.cuda.is_available():
+        prefetch_factor, batch_size, num_works, persistent = 8, 256, 32, True
+    else:
+        prefetch_factor, batch_size, num_works, persistent = 2, 32, 8, False
+
+    init_logs()
+    train_dict, test_dict, people_num = loader.load_files(mode=mode, folder_num=people_num,
+                                                          file_num=data_per_people, k=7.5)
+    train_dataset = loader.MyDataset(data_dict=train_dict, people_num=people_num, train=True, mel=mel,
+                                     noise=noise)
+    test_dataset = loader.MyDataset(data_dict=test_dict, people_num=people_num, train=False, mel=mel,
+                                    noise=False)
+    print(len(train_dataset), len(test_dataset))
+    train_ = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True,
+                        drop_last=True, num_workers=num_works, pin_memory=True,
+                        persistent_workers=persistent, prefetch_factor=prefetch_factor)
+    test_ = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True,
+                       drop_last=True, num_workers=num_works, pin_memory=True,
+                       persistent_workers=persistent, prefetch_factor=prefetch_factor)
+    writer = SummaryWriter('./logs')
+
+    # model1 = cnn.get_net(people_num, model_name, not_grad)
+    # model2 = F.CNN_LSTM(model_name, people_num, hidden_size, num_layers, bidirectional, not_grad)
+    model2 = ECAPA_TDNN(in_channels=80, channels=512, embd_dim=192,
+                        output_num=people_num, context=True, embedding=False)
+
+    loss = AAMSoftmax(192, people_num, margin, scale, easy_margin)
+    train(train_, test_, model2, loss, Device, writer, num_epochs, learn_rate, weight_decay)
+    torch.save(model2.state_dict(), "net.model")
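Finally, a sketch of reloading the state dict that `train.py` saves as `net.model` and extracting a speaker embedding; the random tensor stands in for a real 16 kHz clip:

```python
# Sketch: embedding extraction with the trained ECAPA_TDNN from models/tdnn.py.
import torch
import eval as d2l
from models.tdnn import ECAPA_TDNN

device = d2l.try_gpu()
net = ECAPA_TDNN(in_channels=80, channels=512, embd_dim=192,
                 output_num=420, context=True, aug=False, embedding=True)
net.load_state_dict(torch.load('net.model', map_location=device))
net.eval().to(device)

wav = torch.randn(1, 41000, device=device)  # stand-in for a real waveform
with torch.no_grad():
    emb = net(wav)                          # (1, 192) speaker embedding
print(emb.shape)
```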