Skip to content

Commit

Permalink
Merge pull request #74 from sooftware/hydra
Browse files Browse the repository at this point in the history
bin/dataclass.py => kospeech/dataclass.py & Update Naming
  • Loading branch information
sooftware authored Jan 3, 2021
2 parents 41375aa + 8654bbb commit 51cd706
Show file tree
Hide file tree
Showing 10 changed files with 60 additions and 34 deletions.
4 changes: 2 additions & 2 deletions bin/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@
import sys
import hydra
import warnings

sys.path.append('..')

from hydra.core.config_store import ConfigStore
from omegaconf import OmegaConf, DictConfig
from bin.dataclass import EvalConfig
from kospeech.dataclass import EvalConfig
from kospeech.vocabs.ksponspeech import KsponSpeechVocabulary
from kospeech.vocabs.librispeech import LibriSpeechVocabulary
from kospeech.data.label_loader import load_dataset
Expand Down
2 changes: 1 addition & 1 deletion bin/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
KsponSpeechVocabulary,
LibriSpeechVocabulary,
)
from bin.dataclass import (
from kospeech.dataclass import (
AudioConfig,
TrainConfig,
DeepSpeech2Config,
Expand Down
1 change: 0 additions & 1 deletion kospeech/checkpoint/checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@

import os
import time
import shutil
import torch
import torch.nn as nn
from kospeech.utils import logger
Expand Down
2 changes: 1 addition & 1 deletion kospeech/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@
SpectrogramDataset,
AudioDataLoader,
MultiDataLoader,
split_dataset
split_dataset,
)
12 changes: 6 additions & 6 deletions kospeech/data/audio/augment.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,25 +33,25 @@ def __init__(self, freq_mask_para: int = 18, time_mask_num: int = 10, freq_mask_
self.time_mask_num = time_mask_num
self.freq_mask_num = freq_mask_num

def __call__(self, feature_vector: Tensor) -> Tensor:
def __call__(self, feature: Tensor) -> Tensor:
""" Provides SpecAugmentation for audio """
time_axis_length = feature_vector.size(0)
freq_axis_length = feature_vector.size(1)
time_axis_length = feature.size(0)
freq_axis_length = feature.size(1)
time_mask_para = time_axis_length / 20 # Refer to "Specaugment on large scale dataset" paper

# time mask
for _ in range(self.time_mask_num):
t = int(np.random.uniform(low=0.0, high=time_mask_para))
t0 = random.randint(0, time_axis_length - t)
feature_vector[t0: t0 + t, :] = 0
feature[t0: t0 + t, :] = 0

# freq mask
for _ in range(self.freq_mask_num):
f = int(np.random.uniform(low=0.0, high=self.freq_mask_para))
f0 = random.randint(0, freq_axis_length - f)
feature_vector[:, f0: f0 + f] = 0
feature[:, f0: f0 + f] = 0

return feature_vector
return feature


class NoiseInjector(object):
Expand Down
3 changes: 2 additions & 1 deletion kospeech/data/audio/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,13 @@
import warnings
import numpy as np
import librosa

from astropy.modeling import ParameterError
from numpy.lib.stride_tricks import as_strided
from kospeech.utils import logger


def load_audio(audio_path: str, del_silence: bool = False, extension: str = 'pcm'):
def load_audio(audio_path: str, del_silence: bool = False, extension: str = 'pcm') -> np.ndarray:
"""
Load audio file (PCM) to sound. If del_silence is True, eliminate all sounds below 30dB.
If exception occurs in numpy.memmap(), return None.
Expand Down
45 changes: 34 additions & 11 deletions kospeech/data/audio/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,8 @@
import torch
import platform
import numpy as np
from torch import (
Tensor,
FloatTensor
)

from torch import Tensor, FloatTensor


class Spectrogram(object):
Expand All @@ -21,10 +19,15 @@ class Spectrogram(object):
sample_rate (int): Sample rate of audio signal. (Default: 16000)
frame_length (int): frame length for spectrogram (ms) (Default : 20)
frame_shift (int): Length of hop between STFT windows. (ms) (Default: 10)
feature_extract_by (str): which library to use for feature extraction (default: torch)
"""
def __init__(self, sample_rate: int = 16000,
frame_length: int = 20, frame_shift: int = 10,
feature_extract_by: str = 'torch') -> None:
def __init__(
self,
sample_rate: int = 16000,
frame_length: int = 20,
frame_shift: int = 10,
feature_extract_by: str = 'torch'
) -> None:
self.sample_rate = sample_rate
self.feature_extract_by = feature_extract_by.lower()

Expand Down Expand Up @@ -71,9 +74,16 @@ class MelSpectrogram(object):
n_mels (int): Number of mfc coefficients to retain. (Default: 80)
frame_length (int): frame length for spectrogram (ms) (Default : 20)
frame_shift (int): Length of hop between STFT windows. (ms) (Default: 10)
feature_extract_by (str): which library to use for feature extraction(default: librosa)
feature_extract_by (str): which library to use for feature extraction (default: librosa)
"""
def __init__(self, sample_rate=16000, n_mels=80, frame_length=20, frame_shift=10, feature_extract_by='librosa'):
def __init__(
self,
sample_rate: int = 16000,
n_mels: int = 80,
frame_length: int = 20,
frame_shift: int = 10,
feature_extract_by: str = 'librosa'
) -> None:
self.sample_rate = sample_rate
self.n_mels = n_mels
self.n_fft = int(round(sample_rate * 0.001 * frame_length))
Expand Down Expand Up @@ -129,7 +139,14 @@ class MFCC(object):
frame_shift (int): Length of hop between STFT windows. (ms) (Default: 10)
feature_extract_by (str): which library to use for feature extraction (default: librosa)
"""
def __init__(self, sample_rate=16000, n_mfcc=40, frame_length=20, frame_shift=10, feature_extract_by='librosa'):
def __init__(
self,
sample_rate: int = 16000,
n_mfcc: int = 40,
frame_length: int = 20,
frame_shift: int = 10,
feature_extract_by: str = 'librosa'
) -> None:
self.sample_rate = sample_rate
self.n_mfcc = n_mfcc
self.n_fft = int(round(sample_rate * 0.001 * frame_length))
Expand Down Expand Up @@ -177,7 +194,13 @@ class FilterBank(object):
frame_length (int): frame length for spectrogram (ms) (Default : 20)
frame_shift (int): Length of hop between STFT windows. (ms) (Default: 10)
"""
def __init__(self, sample_rate=16000, n_mels=80, frame_length=20, frame_shift=10):
def __init__(
self,
sample_rate: int = 16000,
n_mels: int = 80,
frame_length: int = 20,
frame_shift: int = 10
) -> None:
import torchaudio
self.transforms = torchaudio.compliance.kaldi.fbank
self.sample_rate = sample_rate
Expand Down
21 changes: 11 additions & 10 deletions kospeech/data/audio/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,17 @@
# LICENSE file in the root directory of this source tree.

import numpy as np

from kospeech.utils import logger
from kospeech.data.audio.core import load_audio
from torch import Tensor, FloatTensor
from kospeech.data.audio.augment import SpecAugment
from kospeech.data.audio.feature import (
MelSpectrogram,
MFCC,
Spectrogram,
FilterBank
FilterBank,
)
from kospeech.utils import logger


class AudioParser(object):
Expand Down Expand Up @@ -122,23 +123,23 @@ def parse_audio(self, audio_path: str, augment_method: int) -> Tensor:
logger.info("Audio is None : {0}".format(audio_path))
return None

feature_vector = self.transforms(signal)
feature = self.transforms(signal)

if self.normalize:
feature_vector -= feature_vector.mean()
feature_vector /= np.std(feature_vector)
feature -= feature.mean()
feature /= np.std(feature)

# Refer to "Sequence to Sequence Learning with Neural Network" paper
if self.input_reverse:
feature_vector = feature_vector[:, ::-1]
feature_vector = FloatTensor(np.ascontiguousarray(np.swapaxes(feature_vector, 0, 1)))
feature = feature[:, ::-1]
feature = FloatTensor(np.ascontiguousarray(np.swapaxes(feature, 0, 1)))
else:
feature_vector = FloatTensor(feature_vector).transpose(0, 1)
feature = FloatTensor(feature).transpose(0, 1)

if augment_method == SpectrogramParser.SPEC_AUGMENT:
feature_vector = self.spec_augment(feature_vector)
feature = self.spec_augment(feature)

return feature_vector
return feature

def parse_transcript(self, *args, **kwargs):
raise NotImplementedError
4 changes: 3 additions & 1 deletion kospeech/data/label_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@
# This source code is licensed under the Apache 2.0 License license found in the
# LICENSE file in the root directory of this source tree.

from typing import Tuple

def load_dataset(transcripts_path):

def load_dataset(transcripts_path: str) -> Tuple[list, list]:
"""
Provides dictionary of filename and labels
Expand Down
File renamed without changes.

0 comments on commit 51cd706

Please sign in to comment.