Skip to content

Commit

Permalink
Merge pull request #43 from NavodPeiris/dev
Browse files Browse the repository at this point in the history
fixed some errors
  • Loading branch information
NavodPeiris authored Aug 16, 2024
2 parents bbe32be + f24ff24 commit 4085aa7
Show file tree
Hide file tree
Showing 8 changed files with 19 additions and 17 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ transcript will also indicate the timeframe in seconds where each speaker speaks
```
from speechlib import Transcriptor
file = "obama1.wav" # your audio file
file = "obama_zach.wav" # your audio file
voices_folder = "voices" # voices folder containing voice samples for recognition
language = "en" # language code
log_folder = "logs" # log folder for storing transcripts
Expand Down
Binary file added examples/chinese_wav.wav
Binary file not shown.
4 changes: 2 additions & 2 deletions examples/transcribe.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
from speechlib import Transcriptor

file = "obama1.wav" # your audio file
file = "obama_zach.wav" # your audio file
voices_folder = "voices" # voices folder containing voice samples for recognition
language = "en" # language code
log_folder = "logs" # log folder for storing transcripts
modelSize = "tiny" # size of model to be used [tiny, small, medium, large-v1, large-v2, large-v3]
quantization = False # setting this 'True' may speed up the process but lower the accuracy
ACCESS_TOKEN = "your huggingface access token" # get permission to access pyannote/speaker-diarization@2.1 on huggingface
ACCESS_TOKEN = "your huggingface token" # get permission to access pyannote/speaker-diarization@2.1 on huggingface

# quantization only works on faster-whisper
transcriptor = Transcriptor(file, log_folder, language, modelSize, ACCESS_TOKEN, voices_folder, quantization)
Expand Down
2 changes: 1 addition & 1 deletion library.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ transcript will also indicate the timeframe in seconds where each speaker speaks
```
from speechlib import Transcriptor
file = "obama1.wav" # your audio file
file = "obama_zach.wav" # your audio file
voices_folder = "voices" # voices folder containing voice samples for recognition
language = "en" # language code
log_folder = "logs" # log folder for storing transcripts
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setup(
name="speechlib",
version="1.1.3",
version="1.1.4",
description="speechlib is a library that can do speaker diarization, transcription and speaker recognition on an audio file to create transcripts with actual speaker names. This library also contain audio preprocessor functions.",
packages=find_packages(),
long_description=long_description,
Expand Down
2 changes: 1 addition & 1 deletion setup_instruction.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ for publishing:
pip install twine

to install locally for testing:
pip install dist/speechlib-1.1.3-py3-none-any.whl
pip install dist/speechlib-1.1.4-py3-none-any.whl

finally run:
twine upload dist/*
Expand Down
21 changes: 12 additions & 9 deletions speechlib/speechlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,23 +6,26 @@
class Transcriptor:

def __init__(self, file, log_folder, language, modelSize, ACCESS_TOKEN, voices_folder=None, quantization=False):
'''transcribe a wav file
'''
transcribe a wav file
arguments:
arguments:
file: name of wav file with extension ex: file.wav
file: name of wav file with extension ex: file.wav
log_folder: name of folder where transcript will be stored
log_folder: name of folder where transcript will be stored
language: language of wav file
language: language of wav file
modelSize: tiny, small, medium, large, large-v1, large-v2, large-v3 (bigger model is more accurate but slow!!)
modelSize: tiny, small, medium, large, large-v1, large-v2, large-v3 (bigger model is more accurate but slow!!)
ACCESS_TOKEN: huggingface access token
voices_folder: folder containing subfolders named after each speaker with speaker voice samples in them. This will be used for speaker recognition
voices_folder: folder containing subfolders named after each speaker with speaker voice samples in them. This will be used for speaker recognition
quantization: whether to use int8 quantization or not (default=False)
quantization: whether to use int8 quantization or not (default=False)
see documentation: https://github.com/Navodplayer1/speechlib
see documentation: https://github.com/Navodplayer1/speechlib
supported languages:
Expand Down
3 changes: 1 addition & 2 deletions speechlib/wav_segmenter.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,7 @@ def wav_file_segmentation(file_name, segments, language, modelSize, model_type,
# return -> [[start time, end time, transcript], [start time, end time, transcript], ..]
texts.append([segment[0], segment[1], trans])
except Exception as err:
# to avoid transcription exceptions that occur when transcribing silent segments we have to pass
pass
print("ERROR while transcribing: ", err)
# Delete the WAV file after processing
os.remove(file)

Expand Down

0 comments on commit 4085aa7

Please sign in to comment.