From 8654bbb6c0aea9c7b04c0b5bf7386f986af6f64e Mon Sep 17 00:00:00 2001
From: sooftware
Date: Sun, 3 Jan 2021 19:24:44 +0900
Subject: [PATCH] bin/dataclass.py => kospeech/dataclass.py & Update Naming

---
 bin/eval.py                       |  4 +--
 bin/main.py                       |  2 +-
 kospeech/checkpoint/checkpoint.py |  1 -
 kospeech/data/__init__.py         |  2 +-
 kospeech/data/audio/augment.py    | 12 ++++-----
 kospeech/data/audio/core.py       |  3 ++-
 kospeech/data/audio/feature.py    | 45 +++++++++++++++++++++++--------
 kospeech/data/audio/parser.py     | 21 ++++++++-------
 kospeech/data/label_loader.py     |  4 ++-
 {bin => kospeech}/dataclass.py    |  0
 10 files changed, 60 insertions(+), 34 deletions(-)
 rename {bin => kospeech}/dataclass.py (100%)

diff --git a/bin/eval.py b/bin/eval.py
index ba3eac52..9a25b9cb 100644
--- a/bin/eval.py
+++ b/bin/eval.py
@@ -8,11 +8,11 @@
 import sys
 import hydra
 import warnings
-
 sys.path.append('..')
+
 from hydra.core.config_store import ConfigStore
 from omegaconf import OmegaConf, DictConfig
-from bin.dataclass import EvalConfig
+from kospeech.dataclass import EvalConfig
 from kospeech.vocabs.ksponspeech import KsponSpeechVocabulary
 from kospeech.vocabs.librispeech import LibriSpeechVocabulary
 from kospeech.data.label_loader import load_dataset
diff --git a/bin/main.py b/bin/main.py
index 8c856b5e..11795a1e 100644
--- a/bin/main.py
+++ b/bin/main.py
@@ -29,7 +29,7 @@
     KsponSpeechVocabulary,
     LibriSpeechVocabulary,
 )
-from bin.dataclass import (
+from kospeech.dataclass import (
     AudioConfig,
     TrainConfig,
     DeepSpeech2Config,
diff --git a/kospeech/checkpoint/checkpoint.py b/kospeech/checkpoint/checkpoint.py
index 4ffffb91..c545416c 100644
--- a/kospeech/checkpoint/checkpoint.py
+++ b/kospeech/checkpoint/checkpoint.py
@@ -6,7 +6,6 @@
 import os
 import time
-import shutil
 import torch
 import torch.nn as nn
 from kospeech.utils import logger
 
diff --git a/kospeech/data/__init__.py b/kospeech/data/__init__.py
index 285dd446..62d0825b 100644
--- a/kospeech/data/__init__.py
+++ b/kospeech/data/__init__.py
@@ -4,5 +4,5 @@
     SpectrogramDataset,
     AudioDataLoader,
     MultiDataLoader,
-    split_dataset
+    split_dataset,
 )
diff --git a/kospeech/data/audio/augment.py b/kospeech/data/audio/augment.py
index 24838c71..9c5f18eb 100644
--- a/kospeech/data/audio/augment.py
+++ b/kospeech/data/audio/augment.py
@@ -33,25 +33,25 @@ def __init__(self, freq_mask_para: int = 18, time_mask_num: int = 10, freq_mask_
         self.time_mask_num = time_mask_num
         self.freq_mask_num = freq_mask_num
 
-    def __call__(self, feature_vector: Tensor) -> Tensor:
+    def __call__(self, feature: Tensor) -> Tensor:
         """ Provides SpecAugmentation for audio """
-        time_axis_length = feature_vector.size(0)
-        freq_axis_length = feature_vector.size(1)
+        time_axis_length = feature.size(0)
+        freq_axis_length = feature.size(1)
         time_mask_para = time_axis_length / 20   # Refer to "Specaugment on large scale dataset" paper
 
         # time mask
         for _ in range(self.time_mask_num):
             t = int(np.random.uniform(low=0.0, high=time_mask_para))
             t0 = random.randint(0, time_axis_length - t)
-            feature_vector[t0: t0 + t, :] = 0
+            feature[t0: t0 + t, :] = 0
 
         # freq mask
         for _ in range(self.freq_mask_num):
             f = int(np.random.uniform(low=0.0, high=self.freq_mask_para))
             f0 = random.randint(0, freq_axis_length - f)
-            feature_vector[:, f0: f0 + f] = 0
+            feature[:, f0: f0 + f] = 0
 
-        return feature_vector
+        return feature
 
 
 class NoiseInjector(object):
diff --git a/kospeech/data/audio/core.py b/kospeech/data/audio/core.py
index 9249883b..69d0c1ec 100644
--- a/kospeech/data/audio/core.py
+++ b/kospeech/data/audio/core.py
@@ -7,12 +7,13 @@
 import warnings
 import numpy as np
 import librosa
+
 from astropy.modeling import ParameterError
 from numpy.lib.stride_tricks import as_strided
 from kospeech.utils import logger
 
 
-def load_audio(audio_path: str, del_silence: bool = False, extension: str = 'pcm'):
+def load_audio(audio_path: str, del_silence: bool = False, extension: str = 'pcm') -> np.ndarray:
     """
     Load audio file (PCM) to sound. if del_silence is True, Eliminate all sounds below 30dB.
     If exception occurs in numpy.memmap(), return None.
diff --git a/kospeech/data/audio/feature.py b/kospeech/data/audio/feature.py
index d13ca448..dc9e6987 100644
--- a/kospeech/data/audio/feature.py
+++ b/kospeech/data/audio/feature.py
@@ -7,10 +7,8 @@
 import torch
 import platform
 import numpy as np
-from torch import (
-    Tensor,
-    FloatTensor
-)
+
+from torch import Tensor, FloatTensor
 
 
 class Spectrogram(object):
@@ -21,10 +19,15 @@ class Spectrogram(object):
         sample_rate (int): Sample rate of audio signal. (Default: 16000)
         frame_length (int): frame length for spectrogram (ms) (Default : 20)
         frame_shift (int): Length of hop between STFT windows. (ms) (Default: 10)
+        feature_extract_by (str): which library to use for feature extraction (default: torch)
     """
-    def __init__(self, sample_rate: int = 16000,
-                 frame_length: int = 20, frame_shift: int = 10,
-                 feature_extract_by: str = 'torch') -> None:
+    def __init__(
+            self,
+            sample_rate: int = 16000,
+            frame_length: int = 20,
+            frame_shift: int = 10,
+            feature_extract_by: str = 'torch'
+    ) -> None:
         self.sample_rate = sample_rate
         self.feature_extract_by = feature_extract_by.lower()
 
@@ -71,9 +74,16 @@ class MelSpectrogram(object):
         n_mels (int): Number of mfc coefficients to retain. (Default: 80)
         frame_length (int): frame length for spectrogram (ms) (Default : 20)
         frame_shift (int): Length of hop between STFT windows. (ms) (Default: 10)
-        feature_extract_by (str): which library to use for feature extraction(default: librosa)
+        feature_extract_by (str): which library to use for feature extraction (default: librosa)
     """
-    def __init__(self, sample_rate=16000, n_mels=80, frame_length=20, frame_shift=10, feature_extract_by='librosa'):
+    def __init__(
+            self,
+            sample_rate: int = 16000,
+            n_mels: int = 80,
+            frame_length: int = 20,
+            frame_shift: int = 10,
+            feature_extract_by: str = 'librosa'
+    ) -> None:
         self.sample_rate = sample_rate
         self.n_mels = n_mels
         self.n_fft = int(round(sample_rate * 0.001 * frame_length))
@@ -129,7 +139,14 @@ class MFCC(object):
         frame_shift (int): Length of hop between STFT windows. (ms) (Default: 10)
         feature_extract_by (str): which library to use for feature extraction(default: librosa)
     """
-    def __init__(self, sample_rate=16000, n_mfcc=40, frame_length=20, frame_shift=10, feature_extract_by='librosa'):
+    def __init__(
+            self,
+            sample_rate: int = 16000,
+            n_mfcc: int = 40,
+            frame_length: int = 20,
+            frame_shift: int = 10,
+            feature_extract_by: str = 'librosa'
+    ) -> None:
         self.sample_rate = sample_rate
         self.n_mfcc = n_mfcc
         self.n_fft = int(round(sample_rate * 0.001 * frame_length))
@@ -177,7 +194,13 @@ class FilterBank(object):
         frame_length (int): frame length for spectrogram (ms) (Default : 20)
         frame_shift (int): Length of hop between STFT windows. (ms) (Default: 10)
     """
-    def __init__(self, sample_rate=16000, n_mels=80, frame_length=20, frame_shift=10):
+    def __init__(
+            self,
+            sample_rate: int = 16000,
+            n_mels: int = 80,
+            frame_length: int = 20,
+            frame_shift: int = 10
+    ) -> None:
         import torchaudio
         self.transforms = torchaudio.compliance.kaldi.fbank
         self.sample_rate = sample_rate
diff --git a/kospeech/data/audio/parser.py b/kospeech/data/audio/parser.py
index 46d6536b..38bd3803 100644
--- a/kospeech/data/audio/parser.py
+++ b/kospeech/data/audio/parser.py
@@ -5,6 +5,8 @@
 # LICENSE file in the root directory of this source tree.
 
 import numpy as np
+
+from kospeech.utils import logger
 from kospeech.data.audio.core import load_audio
 from torch import Tensor, FloatTensor
 from kospeech.data.audio.augment import SpecAugment
@@ -12,9 +14,8 @@
     MelSpectrogram,
     MFCC,
     Spectrogram,
-    FilterBank
+    FilterBank,
 )
-from kospeech.utils import logger
 
 
 class AudioParser(object):
@@ -122,23 +123,23 @@ def parse_audio(self, audio_path: str, augment_method: int) -> Tensor:
             logger.info("Audio is None : {0}".format(audio_path))
             return None
 
-        feature_vector = self.transforms(signal)
+        feature = self.transforms(signal)
 
         if self.normalize:
-            feature_vector -= feature_vector.mean()
-            feature_vector /= np.std(feature_vector)
+            feature -= feature.mean()
+            feature /= np.std(feature)
 
         # Refer to "Sequence to Sequence Learning with Neural Network" paper
         if self.input_reverse:
-            feature_vector = feature_vector[:, ::-1]
-            feature_vector = FloatTensor(np.ascontiguousarray(np.swapaxes(feature_vector, 0, 1)))
+            feature = feature[:, ::-1]
+            feature = FloatTensor(np.ascontiguousarray(np.swapaxes(feature, 0, 1)))
         else:
-            feature_vector = FloatTensor(feature_vector).transpose(0, 1)
+            feature = FloatTensor(feature).transpose(0, 1)
 
         if augment_method == SpectrogramParser.SPEC_AUGMENT:
-            feature_vector = self.spec_augment(feature_vector)
+            feature = self.spec_augment(feature)
 
-        return feature_vector
+        return feature
 
     def parse_transcript(self, *args, **kwargs):
         raise NotImplementedError
diff --git a/kospeech/data/label_loader.py b/kospeech/data/label_loader.py
index ddb150f6..616f254f 100644
--- a/kospeech/data/label_loader.py
+++ b/kospeech/data/label_loader.py
@@ -4,8 +4,10 @@
 # This source code is licensed under the Apache 2.0 License license found in the
 # LICENSE file in the root directory of this source tree.
 
+from typing import Tuple
 
-def load_dataset(transcripts_path):
+
+def load_dataset(transcripts_path: str) -> Tuple[list, list]:
     """
     Provides dictionary of filename and labels
diff --git a/bin/dataclass.py b/kospeech/dataclass.py
similarity index 100%
rename from bin/dataclass.py
rename to kospeech/dataclass.py
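
Not part of the patch: a minimal usage sketch of the two user-visible changes above (the config dataclasses now import from kospeech.dataclass instead of bin.dataclass, and SpecAugment.__call__ takes "feature" rather than "feature_vector"). It assumes a kospeech checkout with this patch applied; the input tensor is random dummy data, not real audio features.

    # Hypothetical example, not taken from the repository.
    import torch

    from kospeech.dataclass import EvalConfig            # moved here from bin/dataclass.py
    from kospeech.data.audio.augment import SpecAugment  # __call__ argument renamed to "feature"

    # SpecAugment zeroes random time spans and frequency bands of a (time, freq) feature map.
    spec_augment = SpecAugment(freq_mask_para=18, time_mask_num=10, freq_mask_num=4)

    feature = torch.rand(300, 80)    # dummy input: 300 frames x 80 frequency bins
    feature = spec_augment(feature)  # masks are applied in place and the same tensor is returned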