Merge pull request #74 from sooftware/hydra

bin/dataclass.py => kospeech/dataclass.py & Update Naming
sooftware · Jan 3, 2021 · 51cd706
2 parents 41375aa + 8654bbb
commit 51cd706
Show file tree

Hide file tree

Showing 10 changed files with 60 additions and 34 deletions.
diff --git a/bin/eval.py b/bin/eval.py
@@ -8,11 +8,11 @@
 import sys
 import hydra
 import warnings
-
 sys.path.append('..')
+
 from hydra.core.config_store import ConfigStore
 from omegaconf import OmegaConf, DictConfig
-from bin.dataclass import EvalConfig
+from kospeech.dataclass import EvalConfig
 from kospeech.vocabs.ksponspeech import KsponSpeechVocabulary
 from kospeech.vocabs.librispeech import LibriSpeechVocabulary
 from kospeech.data.label_loader import load_dataset

diff --git a/bin/main.py b/bin/main.py
@@ -29,7 +29,7 @@
     KsponSpeechVocabulary,
     LibriSpeechVocabulary,
 )
-from bin.dataclass import (
+from kospeech.dataclass import (
     AudioConfig,
     TrainConfig,
     DeepSpeech2Config,

diff --git a/kospeech/checkpoint/checkpoint.py b/kospeech/checkpoint/checkpoint.py
@@ -6,7 +6,6 @@
 
 import os
 import time
-import shutil
 import torch
 import torch.nn as nn
 from kospeech.utils import logger

diff --git a/kospeech/data/__init__.py b/kospeech/data/__init__.py
@@ -4,5 +4,5 @@
     SpectrogramDataset,
     AudioDataLoader,
     MultiDataLoader,
-    split_dataset
+    split_dataset,
 )
diff --git a/kospeech/data/audio/augment.py b/kospeech/data/audio/augment.py
@@ -33,25 +33,25 @@ def __init__(self, freq_mask_para: int = 18, time_mask_num: int = 10, freq_mask_
         self.time_mask_num = time_mask_num
         self.freq_mask_num = freq_mask_num
 
-    def __call__(self, feature_vector: Tensor) -> Tensor:
+    def __call__(self, feature: Tensor) -> Tensor:
         """ Provides SpecAugmentation for audio """
-        time_axis_length = feature_vector.size(0)
-        freq_axis_length = feature_vector.size(1)
+        time_axis_length = feature.size(0)
+        freq_axis_length = feature.size(1)
         time_mask_para = time_axis_length / 20      # Refer to "Specaugment on large scale dataset" paper
 
         # time mask
         for _ in range(self.time_mask_num):
             t = int(np.random.uniform(low=0.0, high=time_mask_para))
             t0 = random.randint(0, time_axis_length - t)
-            feature_vector[t0: t0 + t, :] = 0
+            feature[t0: t0 + t, :] = 0
 
         # freq mask
         for _ in range(self.freq_mask_num):
             f = int(np.random.uniform(low=0.0, high=self.freq_mask_para))
             f0 = random.randint(0, freq_axis_length - f)
-            feature_vector[:, f0: f0 + f] = 0
+            feature[:, f0: f0 + f] = 0
 
-        return feature_vector
+        return feature
 
 
 class NoiseInjector(object):

diff --git a/kospeech/data/audio/core.py b/kospeech/data/audio/core.py
@@ -7,12 +7,13 @@
 import warnings
 import numpy as np
 import librosa
+
 from astropy.modeling import ParameterError
 from numpy.lib.stride_tricks import as_strided
 from kospeech.utils import logger
 
 
-def load_audio(audio_path: str, del_silence: bool = False, extension: str = 'pcm'):
+def load_audio(audio_path: str, del_silence: bool = False, extension: str = 'pcm') -> np.ndarray:
     """
     Load audio file (PCM) to sound. if del_silence is True, Eliminate all sounds below 30dB.
     If exception occurs in numpy.memmap(), return None.

diff --git a/kospeech/data/audio/feature.py b/kospeech/data/audio/feature.py
@@ -7,10 +7,8 @@
 import torch
 import platform
 import numpy as np
-from torch import (
-    Tensor,
-    FloatTensor
-)
+
+from torch import Tensor, FloatTensor
 
 
 class Spectrogram(object):
@@ -21,10 +19,15 @@ class Spectrogram(object):
         sample_rate (int): Sample rate of audio signal. (Default: 16000)
         frame_length (int): frame length for spectrogram (ms) (Default : 20)
         frame_shift (int): Length of hop between STFT windows. (ms) (Default: 10)
+        feature_extract_by (str): which library to use for feature extraction (default: torch)
     """
-    def __init__(self, sample_rate: int = 16000,
-                 frame_length: int = 20, frame_shift: int = 10,
-                 feature_extract_by: str = 'torch') -> None:
+    def __init__(
+            self,
+            sample_rate: int = 16000,
+            frame_length: int = 20,
+            frame_shift: int = 10,
+            feature_extract_by: str = 'torch'
+    ) -> None:
         self.sample_rate = sample_rate
         self.feature_extract_by = feature_extract_by.lower()
 
@@ -71,9 +74,16 @@ class MelSpectrogram(object):
         n_mels (int):  Number of mfc coefficients to retain. (Default: 80)
         frame_length (int): frame length for spectrogram (ms) (Default : 20)
         frame_shift (int): Length of hop between STFT windows. (ms) (Default: 10)
-        feature_extract_by (str): which library to use for feature extraction(default: librosa)
+        feature_extract_by (str): which library to use for feature extraction (default: librosa)
     """
-    def __init__(self, sample_rate=16000, n_mels=80, frame_length=20, frame_shift=10, feature_extract_by='librosa'):
+    def __init__(
+            self,
+            sample_rate: int = 16000,
+            n_mels: int = 80,
+            frame_length: int = 20,
+            frame_shift: int = 10,
+            feature_extract_by: str = 'librosa'
+    ) -> None:
         self.sample_rate = sample_rate
         self.n_mels = n_mels
         self.n_fft = int(round(sample_rate * 0.001 * frame_length))
@@ -129,7 +139,14 @@ class MFCC(object):
         frame_shift (int): Length of hop between STFT windows. (ms) (Default: 10)
         feature_extract_by (str): which library to use for feature extraction(default: librosa)
     """
-    def __init__(self, sample_rate=16000, n_mfcc=40, frame_length=20, frame_shift=10, feature_extract_by='librosa'):
+    def __init__(
+            self,
+            sample_rate: int = 16000,
+            n_mfcc: int = 40,
+            frame_length: int = 20,
+            frame_shift: int = 10,
+            feature_extract_by: str = 'librosa'
+    ) -> None:
         self.sample_rate = sample_rate
         self.n_mfcc = n_mfcc
         self.n_fft = int(round(sample_rate * 0.001 * frame_length))
@@ -177,7 +194,13 @@ class FilterBank(object):
         frame_length (int): frame length for spectrogram (ms) (Default : 20)
         frame_shift (int): Length of hop between STFT windows. (ms) (Default: 10)
     """
-    def __init__(self, sample_rate=16000, n_mels=80, frame_length=20, frame_shift=10):
+    def __init__(
+            self,
+            sample_rate: int = 16000,
+            n_mels: int = 80,
+            frame_length: int = 20,
+            frame_shift: int = 10
+    ) -> None:
         import torchaudio
         self.transforms = torchaudio.compliance.kaldi.fbank
         self.sample_rate = sample_rate

diff --git a/kospeech/data/audio/parser.py b/kospeech/data/audio/parser.py
@@ -5,16 +5,17 @@
 # LICENSE file in the root directory of this source tree.
 
 import numpy as np
+
+from kospeech.utils import logger
 from kospeech.data.audio.core import load_audio
 from torch import Tensor, FloatTensor
 from kospeech.data.audio.augment import SpecAugment
 from kospeech.data.audio.feature import (
     MelSpectrogram,
     MFCC,
     Spectrogram,
-    FilterBank
+    FilterBank,
 )
-from kospeech.utils import logger
 
 
 class AudioParser(object):
@@ -122,23 +123,23 @@ def parse_audio(self, audio_path: str, augment_method: int) -> Tensor:
             logger.info("Audio is None : {0}".format(audio_path))
             return None
 
-        feature_vector = self.transforms(signal)
+        feature = self.transforms(signal)
 
         if self.normalize:
-            feature_vector -= feature_vector.mean()
-            feature_vector /= np.std(feature_vector)
+            feature -= feature.mean()
+            feature /= np.std(feature)
 
         # Refer to "Sequence to Sequence Learning with Neural Network" paper
         if self.input_reverse:
-            feature_vector = feature_vector[:, ::-1]
-            feature_vector = FloatTensor(np.ascontiguousarray(np.swapaxes(feature_vector, 0, 1)))
+            feature = feature[:, ::-1]
+            feature = FloatTensor(np.ascontiguousarray(np.swapaxes(feature, 0, 1)))
         else:
-            feature_vector = FloatTensor(feature_vector).transpose(0, 1)
+            feature = FloatTensor(feature).transpose(0, 1)
 
         if augment_method == SpectrogramParser.SPEC_AUGMENT:
-            feature_vector = self.spec_augment(feature_vector)
+            feature = self.spec_augment(feature)
 
-        return feature_vector
+        return feature
 
     def parse_transcript(self, *args, **kwargs):
         raise NotImplementedError
diff --git a/kospeech/data/label_loader.py b/kospeech/data/label_loader.py
@@ -4,8 +4,10 @@
 # This source code is licensed under the Apache 2.0 License license found in the
 # LICENSE file in the root directory of this source tree.
 
+from typing import Tuple
 
-def load_dataset(transcripts_path):
+
+def load_dataset(transcripts_path: str) -> Tuple[list, list]:
     """
     Provides dictionary of filename and labels
 

diff --git a/bin/dataclass.py b/kospeech/dataclass.py