-
Notifications
You must be signed in to change notification settings - Fork 1
/
multi-speaker.js
113 lines (100 loc) · 6.02 KB
/
multi-speaker.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
'use strict'
import fs from 'fs'
import path from 'path'
import async from 'async'
import {exec} from 'child_process'
import jsb from 'json-beautify'
import { normalizePrompt, normalizeText } from './common_functions.js'
// Folder that holds the original .flac recordings together with their .txt label files.
const audioInputFolder = 'originals'
// Folder that holds the per-speaker `<speaker>-prompts.txt` files.
const promptsInputFolder = 'originals'
// Bounds on the clip length (label end - start) for an entry to be used.
const minClipLength = 1
const maxClipLength = 17 // in case trimmed by the sox command
/**
 * Read the label file `<file>.txt` from the audio input folder.
 * Every non-blank line holds three tab-separated numbers: start, end and prompt index.
 *
 * @param {string} file - audio file name without its extension
 * @returns {{start: number, end: number, length: number, file: string, index: number}[]}
 *   one record per label line, in file order; `length` is `end - start`
 */
function loadLabelFile(file) {
  const labelPath = path.join(audioInputFolder, file + '.txt')
  const labels = []
  for (const rawLine of fs.readFileSync(labelPath, 'utf-8').split('\n')) {
    const line = rawLine.trim()
    if (!line) continue // blank lines carry no label
    const [start, end, index] = line.split('\t').map(field => Number(field))
    labels.push({start, end, length: end - start, file, index})
  }
  return labels
}
/**
 * Read `<speaker>-prompts.txt` from the prompts input folder.
 * The file is a list of groups separated by a blank line. The first line of a group is a
 * tab-separated header (number, type, source); the remaining lines are the prompt text.
 *
 * @param {string} speaker - speaker name used as the file name prefix
 * @returns {{text: string, index: number, type: string, source: string}[]} prompts in file order
 */
function loadPrompts(speaker) {
  const promptsPath = path.join(promptsInputFolder, `${speaker}-prompts.txt`)
  return fs.readFileSync(promptsPath, 'utf-8')
    .replace(/\r\n/g, '\n') // with Windows line endings the '\n\n' separator would never match
    .trimEnd() // a final newline must not end up inside the last prompt's text
    .split('\n\n')
    .filter(group => group.trim()) // a blank group would otherwise become a junk prompt with index 0
    .map(group => {
      const [header, ...textLines] = group.split('\n')
      const [number, type, source] = header.split('\t')
      return {text: textLines.join('\n'), index: Number(number), type, source}
    })
}
/**
 * Add one to the tally stored under `prop` in `arr`; a missing (or falsy) tally counts as 0.
 * Mutates `arr` and returns the new tally.
 */
const incCounter = (arr, prop) => {
  const next = (arr[prop] || 0) + 1
  arr[prop] = next
  return next
}
/** Sum of the `length` property over all entries (0 for an empty list). */
const totalDuration = (entries) => {
  let sum = 0
  entries.forEach(({length}) => { sum += length })
  return sum
}
/**
 * Build the clip entries for one speaker.
 *
 * Audio files are expected to be named `<speaker>-<startIndex>-<endIndex>.flac` and to have a
 * label file next to them (see loadLabelFile). The i-th label of a file is paired with the
 * i-th prompt of the range `startIndex..endIndex` taken from the speaker's prompts file.
 *
 * Files whose name does not match the pattern, or that belong to another speaker whose name
 * merely starts with `speaker`, are reported and skipped. Labels without a matching prompt
 * are reported and skipped as well. Index mismatches are reported but still processed.
 *
 * @param {string} speaker - speaker name (file name prefix)
 * @param {number} indexOffset - value added to each label index to form `audioInd`
 * @returns {object[]} entries with the fields audioInd, roman, sinhala, speaker, start, end,
 *   length, type, lengthRatio, file and wavFile, ordered by the start index of their file
 */
function loadSpeaker(speaker, indexOffset) {
  const prompts = loadPrompts(speaker), entries = [], groups = []
  // read the directory and parse the file names
  for (const f of fs.readdirSync(audioInputFolder)) {
    if (!f.endsWith(`.flac`) || !f.startsWith(speaker)) continue
    const parts = f.match(/^(.+?)-(\d+)-(\d+)\.flac$/)
    if (!parts || parts[1] !== speaker) {
      // skip instead of crashing on a null match or mixing in another speaker's recording
      console.error(`malformed file name ${f}`)
      continue
    }
    const startI = Number(parts[2]), endI = Number(parts[3])
    const file = f.slice(0, -5), labels = loadLabelFile(file)
    // an empty label file is reported here; it simply contributes no entries below
    if (labels.length === 0 || startI !== labels[0].index || (endI - startI + 1) !== labels.length) {
      console.error(`malformed labels for file ${f}`)
    }
    console.log(`process ${f} with ${labels.length} labels`)
    // prompt number n is expected at position n - 1 of the prompts list
    groups.push({startI, endI, speaker, file, labels, prompts: prompts.slice(startI - 1, endI)})
  }
  groups.sort((a, b) => a.startI - b.startI)
  for (const g of groups) {
    g.labels.forEach(({start, end, length, index}, i) => {
      const prompt = g.prompts[i]
      if (!prompt) {
        console.error(`no prompt found for label index ${index} in file ${g.file}`)
        return
      }
      const type = prompt.type
      if (index !== prompt.index) console.error(`prompt index ${prompt.index} does not match the label index ${index}`)
      const {sinhala, roman} = normalizePrompt(prompt.text, type), audioInd = index + indexOffset
      // clip length per romanized character, not counting the letter h
      // (presumably because h only marks aspiration in the romanization - TODO confirm)
      const lengthRatio = length / roman.replace(/h/g, '').length
      entries.push({ audioInd, roman, sinhala, speaker: g.speaker, start, end, length, type,
        lengthRatio, file: g.file, wavFile: `pali_${String(audioInd).padStart(4, '0')}` })
    })
  }
  console.log(`total ${entries.length} labels loaded for speaker ${speaker}`)
  return entries
}
// Load all speakers one after the other. The offset passed to loadSpeaker is the number of
// entries loaded so far, so each speaker's label indices are shifted past the previous speakers.
const entries = []
for (const speaker of ['wdevananda', 'oshadir', 'obhasa', 'lankananda']) {
  entries.push(...loadSpeaker(speaker, entries.length))
}
// Drop this many entries from each end of the lengthRatio ordering.
const outliersToRemove = 25
// Sort a copy so that `entries` keeps its load order, and compute the end bound explicitly:
// slice(n, -n) would return an empty list for n = 0 because -0 is treated as 0.
const outlierRemoved = [...entries]
  .sort((a, b) => a.lengthRatio - b.lengthRatio)
  .slice(outliersToRemove, entries.length - outliersToRemove)
// Keep only the clips within the allowed length bounds, back in audio index order.
const usedEntries = outlierRemoved
  .filter(({length}) => length <= maxClipLength && length >= minClipLength)
  .sort((a, b) => a.audioInd - b.audioInd)
// extract content from audio files
// each used entry is cut out of its flac file with sox, then resampled and normalized (original flac is 44100)
// the effect below trims all silences more than 0.75 seconds; it is not part of outputOptions at the moment
// silence -l 1 0.1 0.2% -1 0.75 0.2% reverse silence 1 0.1 0.2% reverse
const extractAudio = true
if (extractAudio) {
  const outputFolder = 'wavs', outputOptions = 'rate 22050 norm -1' //rate 22050 before norm
  // force: true so that a first run, when the output folder does not exist yet, does not throw
  fs.rmSync(outputFolder, {recursive: true, force: true})
  fs.mkdirSync(outputFolder)
  // Run sox for a single entry; `callback` receives exec's (error, stdout, stderr)
  const extractSegment = (e, callback) => {
    const inputFile = path.join(audioInputFolder, e.file + '.flac')
    const command = `sox "${inputFile}" "${path.join(outputFolder, e.wavFile + '.wav')}" trim ${e.start} ${e.length.toFixed(2)} ${outputOptions}`;
    exec(command, callback);
  }
  const startTime = Date.now()
  // run at most 7 sox processes at a time
  async.mapLimit(usedEntries, 7, (e, mapCallback) => {
    extractSegment(e, (error, stdout, stderr) => mapCallback(error || null, stderr || stdout))
  }, (error) => {
    if (error) {
      // do not report a successful extraction when a sox command failed
      console.error(error)
      return
    }
    console.log(`Extracted audio from flac files in ${((Date.now() - startTime) / 1000).toFixed(2)} seconds`);
  })
}
// Tally the characters of both scripts and the number of clips per speaker over the used entries.
const charCountsRoman = {}, charCountsSinhala = {}, speakerCounts = {}
for (const {roman, sinhala, speaker} of usedEntries) {
  for (const char of roman) incCounter(charCountsRoman, char)
  for (const char of sinhala) incCounter(charCountsSinhala, char)
  incCounter(speakerCounts, speaker)
}
// Disabled: dump the roman character frequencies, most frequent first.
// fs.writeFileSync('char-counts.tsv', Object.entries(charCountsRoman)
// .sort((a, b) => b[1] - a[1])
// .map(([char, count]) => char + '\t' + count)
// .join('\n'), 'utf-8')

// One pipe-separated row per used entry: wav file name, roman text, sinhala text, speaker, type.
const metadataRows = usedEntries.map(({wavFile, roman, sinhala, speaker, type}) =>
  [wavFile, roman, sinhala, speaker, type].join('|'))
fs.writeFileSync('metadata.csv', metadataRows.join('\n'), 'utf8')

// Print count, total length in hours and average length for one stage of the selection.
const log = (stat, count, duration) => console.log(`${stat} labels => count: ${count}, length: ${(duration / 3600).toFixed(1)} hours, average length: ${(duration / count).toFixed(2)}`)
const stages = [
  ['Total', entries],
  ['Outliers', outlierRemoved], // the entries that remain after the outliers were dropped
  ['Used', usedEntries],
]
for (const [stat, list] of stages) log(stat, list.length, totalDuration(list))

// Character sets (roman first, then sinhala) and per-speaker clip counts.
for (const counts of [charCountsRoman, charCountsSinhala]) {
  console.log(`characters="${Object.keys(counts).sort().join('')}"`)
}
console.log(`speakers=${JSON.stringify(speakerCounts)}`)
console.log(`run the create-dataset.js next to extract from tipitaka.lk`)
// Character sets kept for reference (presumably the output of earlier runs):
// '(),-.:;?xංඅආඉඊඋඌඑඔකඛගඝඞචඡජඣඤටඨඩඪණතථදධනපඵබභමයරලවසහළ්ාිීුූෙො"
// '(),-.:;?xංඅආඉඊඋඌඑඔකඛගඝඞචඡජඣඤටඨඩඪණතථදධනපඵබභමයරලවසහළ්ාිීුූෙො"