Skip to content

Commit

Permalink
create new pali dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
pathnirvana committed Jul 19, 2023
1 parent 6a938e1 commit e7d7281
Show file tree
Hide file tree
Showing 8 changed files with 6,444 additions and 142 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ orig
*.bz2
node_modules
.DS_Store
originals/*.flac
originals/*.flac
/wavs
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@ Currently there is a lack of publicly available TTS datasets for Sinhala langua
- List of Roman Characters: ` !'(),-.:;?abcdefghijklmnoprstuvyæñāēīōśşūǣḍḥḷṁṅṇṉṛṝṭ`
- Silences have been removed from both the beginning and the end of the recordings
- Silences in the middle of the recording clipped to 0.75 seconds
```
Total labels => count: 6449, length: 13.9 hours, average length: 7.78
Outliers labels => count: 6399, length: 13.8 hours, average length: 7.78
Used labels => count: 6248, length: 13.7 hours, average length: 7.89
characters=" !'(),-.:;=?abcdefghijklmnoprstuvyæñāēīōśşūǣḍḥḷṁṅṇṉṛṝṭ"
characters=" !'(),-.:;=?[]ංඃඅආඇඈඉඊඋඌඍඑඒඓඔඕඖකඛගඝඞඟචඡජඣඤඥටඨඩඪණඬතථදධනඳපඵබභමඹයරලවශෂසහළෆ්ාැෑිීුූෘෙේෛොෝෞෲ‍‘’“”"
```

## metadata.csv
- contains a subset of the recordings which are less than 16 seconds long in the `ljspeech` format
Expand Down
54 changes: 54 additions & 0 deletions char-counts.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
a 98634
68590
i 29055
n 26598
y 22669
v 21396
t 18054
k 17929
m 17504
ā 17049
u 16721
h 16632
d 15364
s 15154
r 15066
e 14016
p 10506
æ 9347
ē 8884
. 7311
g 7247
l 7155
6307
b 5695
o 4230
4205
ī 4069
, 2796
ō 2605
ū 2524
2379
ś 1793
ş 1736
j 1735
c 1697
1489
1359
' 1249
1074
630
ñ 539
( 523
) 514
ǣ 444
: 427
- 330
299
? 224
107
f 49
! 33
; 25
= 13
7
191 changes: 86 additions & 105 deletions create-dataset.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,138 +5,119 @@ import path from 'path'
import async from 'async'
import {exec} from 'child_process'
import jsb from 'json-beautify'
import { normalizePrompt, normalizeText } from './common_functions'

// Absolute, machine-specific input locations.
// Label files and file-map.json are read from here; also in '/Volumes/1TB/audio/final uploaded'
const labelInputFolder = '/Users/janaka/node/tipitaka.lk/public/audio'
// JSON text files (one per entry of the file map) are read from here.
const textInputFolder = '/Users/janaka/node/tipitaka.lk/public/static/text'
// Source .flac recordings.
const audioInputFolder = '/Volumes/1TB/audio/silence-added'
// Bounds on label length for a clip to be usable (presumably seconds - TODO confirm).
const minClipLength = 3
const maxClipLength = 15
// TODO: ap- should not have any gatha, force set to para/default - but dhs was
// Deliberately NOT a global (`g`) regex: it is applied with .test() once per text file,
// and a global regex remembers lastIndex between calls, so after one successful match
// the next matching file name would be tested from the middle of the string and missed.
const forceTypeNoGatha = /^(ap-dhs)/
import {sinhalaToRomanConvert} from '@pnfo/singlish-search/roman_convert.js'

/**
 * Prepares a Sinhala prompt for the dataset and derives its romanized transcript.
 *
 * The returned `sinhala` is the input with newlines replaced by '...'. The `roman`
 * value is built from a further cleaned copy (markup, -pe- markers, dashes, brackets,
 * quotes, whitespace and zero-width joiners normalized) that is passed through
 * sinhalaToRomanConvert and trimmed.
 *
 * @param {string} sinhala - raw prompt text
 * @returns {{sinhala: string, roman: string}} display text and romanized text
 */
export function normalizeText(sinhala) {
    const display = sinhala.replace(/\n/g, '...') // newlines cause issues in displaying text
    let text = display.replace(/\*\*|__|\{\S+?\}/g, '') // remove bold, underline and footnotes
    // A leading -pe- has to be dropped BEFORE the general rule below: that rule matches
    // every -pe-, so running it first would turn the leading one into '...' and leave
    // this replacement with nothing to match.
    text = text.replace(/^-පෙ-/g, '') // beginning with -pe- removed
    text = text.replace(/ ?-පෙ-/g, '...') // -pe- is not pronounced
    text = text.replace(/[-–—]+/g, '-')
    text = text.replace(/[\[\{\(]\s?/g, '(') // only the normal bracket is supported
    text = text.replace(/\s?[\]\}\)]/g, ')')
    text = text.replace(/\(\s?\)/g, ' ') // remove empty brackets
    text = text.replace(/["“”‘’]/g, "'") // all quotes to single straight quotes
    text = text.replace(/\s+/g, ' ').trim() // collapse whitespace
    text = text.replace(/\u200d/g, '') // remove yansa, rakar, bandi
    const roman = sinhalaToRomanConvert(text).trim()
    return {sinhala: display, roman}
}

const fileMap = JSON.parse(fs.readFileSync(path.join(labelInputFolder, 'file-map.json'), 'utf-8'))
// Input folders (relative paths): the recordings with their label .txt files, and the prompt files.
const audioInputFolder = 'originals'
const promptsInputFolder = 'prompts'
// Inclusive bounds on clip length used when selecting entries (presumably seconds - TODO confirm).
const minClipLength = 2
const maxClipLength = 15

function loadLabelFile(file) {
return fs.readFileSync(path.join(labelInputFolder, file + '.txt'), 'utf-8').split('\n')
return fs.readFileSync(path.join(audioInputFolder, file + '.txt'), 'utf-8').split('\n')
.filter(line => line.trim())
.map(line => line.trim().split('\t').map(p => Number(p)))
.map(([start, end, num]) => ({start, end, length: end - start, file, num}))
}

// Breaks a string into Sinhala words. Every character that is neither in the
// Sinhala Unicode block (U+0D80-U+0DFF) nor a plain space is deleted first,
// then the remainder is split on spaces and empty pieces are discarded.
function splitWords(sinhala) {
    const sinhalaOnly = sinhala.replace(/[^\u0D80-\u0DFF ]/g, '')
    const words = []
    for (const piece of sinhalaOnly.split(' ')) {
        if (piece.length) words.push(piece)
    }
    return words
}
function getMedianWordFrequency(words) {
const sorted = words.map(w => wordCounts[w]).sort((a, b) => a - b)
const midI = Math.floor(sorted.length / 2)
return sorted.length % 2 ? sorted[midI] : (sorted[midI - 1] + sorted[midI]) / 2
.map(([start, end, index]) => ({start, end, length: end - start, file, index}))
}
// Adds one to the counter stored at counters[prop], starting it at 1 when the
// slot is missing or falsy. Returns the new count.
const incCounter = (arr, prop) => {
    const updated = arr[prop] ? arr[prop] + 1 : 1
    arr[prop] = updated
    return updated
}



const wordCounts = {}, dedupList = {}
let totalLabels = 0, totalLength = 0
const usableEntries = []
Object.entries(fileMap).forEach(([textFile, labelFiles]) => {
const entries = [], labels = []
labelFiles.forEach(lf => labels.push(...loadLabelFile(lf)))
JSON.parse(fs.readFileSync(path.join(textInputFolder, textFile + '.json'), 'utf-8')).pages.forEach(page => entries.push(...page.pali.entries))

// join labels with entries, leaving only entries with labels
let labeled = entries.filter(entry => !entry.noAudio)
labeled.forEach((e, i) => {
e.label = labels[i]
e.textFile = textFile
})
labeled = labeled.filter(e => e.label) // some text files could be only partially recorded

if (forceTypeNoGatha.test(textFile)) labeled.forEach(e => e.type = (e.type == 'gatha') ? 'paragraph' : e.type)

// get only the labels of length within desired range
let usable = labeled.filter((e, i) => e.label && e.label.length > minClipLength && e.label.length <= maxClipLength)
usable.forEach(e => {
const text = normalizeText(e.text, e.type)
e = Object.assign(e, normalizePrompt(text))
e.words = splitWords(e.sinhala)
function loadPrompts(speaker) {
return fs.readFileSync(path.join(promptsInputFolder, `prompts-${speaker}.txt`), 'utf-8').split('\n\n').map(group => {
const lines = group.split('\n')
const [number, source] = lines[0].split('\t')
return {text: lines.slice(1).join('\n'), index: Number(number), source}
})

usable = usable.filter(e => { // remove any text that occured before
const key = e.words.join(' ')
if (dedupList[key]) return false
for (let word of e.words) incCounter(wordCounts, word) // give higher prob to rare words
return dedupList[key] = true
}
// Bumps the tally kept under `prop`, treating a missing or falsy slot as zero.
// Returns the new tally.
const incCounter = (arr, prop) => {
    const previous = arr[prop] || 0
    arr[prop] = previous + 1
    return arr[prop]
}
// Sums the `length` property over all entries; 0 for an empty list.
const totalDuration = (entries) => {
    let sum = 0
    for (const entry of entries) sum = sum + entry.length
    return sum
}

/**
 * Loads every labelled segment recorded by one speaker and pairs it with its prompt.
 *
 * Audio files in `audioInputFolder` whose name ends with `<speaker>.flac` are parsed as
 * `<start>-<end>-<speaker>.flac`; the labels for each file come from loadLabelFile().
 * Prompts for a file are taken as prompts.slice(start - 1, end), which assumes prompt
 * numbering starts at 1 and is contiguous (TODO confirm against the prompt files).
 * Inconsistencies are reported with console.error; unparseable file names and labels
 * without a prompt are skipped rather than aborting the whole run.
 *
 * @param {string} speaker - speaker name as used in the audio and prompt file names
 * @param {number} indexOffset - added to each label index to form `audioInd`
 * @returns {object[]} one entry per label with audioInd, roman, sinhala, speaker,
 *   start, end, length, lengthRatio, file and wavFile
 */
function loadSpeaker(speaker, indexOffset) {
    const prompts = loadPrompts(speaker), entries = []
    const groups = []
    // read the directory and parse the file names
    for (const f of fs.readdirSync(audioInputFolder)) {
        if (!f.endsWith(`${speaker}.flac`)) continue
        const parts = f.match(/^(\d+)-(\d+)-(.+)\.flac$/)
        if (!parts || parts[3] !== speaker) console.error(`malformed file name ${f}`)
        if (!parts) continue // no indexes to read; carrying on would throw on parts.slice
        const [startI, endI] = parts.slice(1, 3).map(p => Number(p))
        const file = f.slice(0, -5), labels = loadLabelFile(file)
        // the empty check comes first so labels[0] is never read from an empty label file
        if (!labels.length || startI !== labels[0].index || (endI - startI + 1) !== labels.length) console.error(`malformed labels for file ${f}`)
        console.log(`process ${f} with ${labels.length} labels`)
        groups.push({startI, endI, speaker, file, labels, prompts: prompts.slice(startI - 1, endI)})
    }
    groups.sort((a, b) => a.startI - b.startI)

    for (const g of groups) {
        g.labels.forEach(({start, end, length, index}, i) => {
            const prompt = g.prompts[i]
            if (!prompt) { // more labels than prompts: report and skip instead of throwing on prompt.index
                console.error(`no prompt found for label index ${index} in file ${g.file}`)
                return
            }
            if (index !== prompt.index) console.error(`prompt index ${prompt.index} does not match the label index ${index}`)
            const {sinhala, roman} = normalizeText(prompt.text), audioInd = index + indexOffset
            // duration per romanized character, not counting 'h'; used later to drop outliers
            const lengthRatio = length / roman.replace(/h/g, '').length
            entries.push({ audioInd, roman, sinhala, speaker: g.speaker, start, end, length,
                lengthRatio, file: g.file, wavFile: `sinh_${String(audioInd).padStart(4, '0')}` })
        })
    }
    console.log(`total ${entries.length} labels loaded for speaker ${speaker}`)
    return entries
}

totalLength += labeled.reduce((acc, e) => acc + e.label.length, 0)
totalLabels += labeled.length

console.log(`file: ${textFile}, all labels: ${labeled.length}, usable labels: ${usable.length}`)
usableEntries.push(...usable)
})
// Load the speakers one after another; the number of entries collected so far is
// passed as the index offset for the next speaker (0 for the first one).
const entries = []
for (const speaker of ['mettananda', 'oshadi']) {
    entries.push(...loadSpeaker(speaker, entries.length))
}

// remove outliers and sort usable entries based on a score
// score: median frequency of the entry's words (see getMedianWordFrequency)
// lengthRatio: label length divided by the number of roman characters, 'h' not counted
usableEntries.forEach(e => {
e.score = getMedianWordFrequency(e.words)
e.lengthRatio = e.label.length / e.roman.replace(/h/g, '').length
})
// NOTE: sort() reorders usableEntries in place; the slice then drops this many entries
// from each end of the lengthRatio ordering
const outliersToRemove = 100
const outlierRemoved = usableEntries.sort((a, b) => a.lengthRatio - b.lengthRatio).slice(outliersToRemove, -outliersToRemove)
outlierRemoved.sort((a, b) => a.score - b.score) // ascending order of the score
const outlierLength = outlierRemoved.reduce((acc, e) => acc + e.label.length, 0)

// label lengths are presumably in seconds, making this 10 hours - TODO confirm
const requiredLength = 10 * 3600 // collect until this many hours are reached
let collectedLength = 0
// Walks the entries in ascending score order and keeps them while the running total
// stays within requiredLength. The total only grows, so once it is exceeded every later
// entry is rejected. wavFile is assigned to every visited entry, kept or not.
const usedEntries = outlierRemoved.filter((e, i) => {
collectedLength += e.label.length
e.wavFile = 'pali_' + (i + 1).toString().padStart(4, '0')
return collectedLength <= requiredLength
})
// final order of the kept entries: ascending lengthRatio
usedEntries.sort((a, b) => a.lengthRatio - b.lengthRatio)
// Number of entries discarded at EACH end of the lengthRatio ordering.
const outliersToRemove = 25
// Orders `entries` in place by ascending lengthRatio, then cuts both tails.
entries.sort((left, right) => left.lengthRatio - right.lengthRatio)
const outlierRemoved = entries.slice(outliersToRemove, -outliersToRemove)
// Keep clips whose length is within [minClipLength, maxClipLength], ordered by audio index.
const usedEntries = outlierRemoved.filter(({length}) => length <= maxClipLength && length >= minClipLength)
usedEntries.sort((left, right) => left.audioInd - right.audioInd)

// extract content from audio files
// trim all silences more than 0.75 seconds, normalize and set rate (original flac is 44100)
const outputFolder = 'wavs', outputOptions = 'silence -l 1 0.1 1% -1 0.75 1% reverse silence 1 0.1 1% reverse rate 22050 norm -1' //rate 22050 before norm
fs.rmSync(outputFolder, {recursive: true})
fs.mkdirSync(outputFolder)

const extractSegment = (e, callback) => { // Define the function that will extract a single segment
const inputFile = path.join(audioInputFolder, e.label.file + '.flac')
const command = `sox "${inputFile}" "${path.join(outputFolder, e.wavFile + '.wav')}" trim ${e.label.start} ${e.label.length.toFixed(2)} ${outputOptions}`;
exec(command, callback);
// silence -l 1 0.1 1% -1 0.75 1% reverse silence 1 0.1 1% reverse
// Switch for the (slow) audio extraction step; the statements after this block do not depend on it.
const extractAudio = true
if (extractAudio) {
    const outputFolder = 'wavs', outputOptions = 'rate 22050 norm -1' //rate 22050 before norm
    // force: true makes a missing output folder a no-op instead of an ENOENT error
    // (the folder does not exist on a first run, /wavs is git-ignored)
    fs.rmSync(outputFolder, {recursive: true, force: true})
    fs.mkdirSync(outputFolder)

    // Cuts one labelled segment out of its source flac with sox and writes it as
    // <outputFolder>/<wavFile>.wav, applying the rate/normalization options above.
    const extractSegment = (e, callback) => { // Define the function that will extract a single segment
        const inputFile = path.join(audioInputFolder, e.file + '.flac')
        const command = `sox "${inputFile}" "${path.join(outputFolder, e.wavFile + '.wav')}" trim ${e.start} ${e.length.toFixed(2)} ${outputOptions}`;
        exec(command, callback);
    }
    const startTime = Date.now()
    // at most 7 sox processes run at the same time
    async.mapLimit(usedEntries, 7, (e, mapCallback) => {
        extractSegment(e, (error, stdout, stderr) => mapCallback(error || null, stderr || stdout))
    }, (error, results) => {
        if (error) console.error(error)
        console.log(`Extracted audio from flac files in ${((Date.now() - startTime) / 1000).toFixed(2)} seconds`);
    })
}
const startTime = new Date()
// async.mapLimit(usedEntries, 7, (e, mapCallback) => {
// extractSegment(e, (error, stdout, stderr) => mapCallback(error || null, stderr || stdout))
// }, (error, results) => {
// if (error) console.error(error)
// console.log(`Extracted audio from flac files in ${((Date.now() - startTime) / 1000).toFixed(2)} seconds`);
// })


// compute character and type counts
const charCountsRoman = {}, charCountsSinhala = {}, typeCounts = {}
const charCountsRoman = {}, charCountsSinhala = {}
usedEntries.forEach(e => {
for (let char of e.roman) incCounter(charCountsRoman, char)
for (let char of e.sinhala) incCounter(charCountsSinhala, char)
incCounter(typeCounts, e.type)
e.speaker = (e.type == 'gatha') ? 'gatha' : 'default'
})
const usedLength = usedEntries.reduce((acc, e) => acc + e.label.length, 0)

fs.writeFileSync('char-counts.tsv', Object.entries(charCountsRoman)
.sort((a, b) => b[1] - a[1])
.map(([char, count]) => char + '\t' + count)
.join('\n'), 'utf-8')
fs.writeFileSync('metadata.csv', usedEntries.map(e => [e.wavFile, e.sinhala, e.roman, e.speaker].join('|')).join('\n'), 'utf8')
//fs.writeFileSync('word-counts.tsv', Object.entries(wordCounts).map(([word, count]) => word + '\t' + count).join('\n'), 'utf-8')
fs.writeFileSync('text-entries.json', jsb(usedEntries, null, '\t', 100), 'utf-8')

fs.writeFileSync('metadata.csv', usedEntries.map(e => [e.wavFile, e.roman, e.sinhala, e.speaker].join('|')).join('\n'), 'utf8')

// logging stats
const log = (stat, count, duration) => console.log(`${stat} labels => count: ${count}, length: ${(duration / 3600).toFixed(1)} hours, average length: ${(duration / count).toFixed(2)}`)
log('Total', totalLabels, totalLength)
log('Usable', outlierRemoved.length, outlierLength)
log('Used', usedEntries.length, usedLength)
console.log(typeCounts)
log('Total', entries.length, totalDuration(entries))
log('Outliers', outlierRemoved.length, totalDuration(outlierRemoved))
log('Used', usedEntries.length, totalDuration(usedEntries))
console.log(`characters="${Object.keys(charCountsRoman).sort().join('')}"`)
console.log(`characters="${Object.keys(charCountsSinhala).sort().join('')}"`)
console.log(`create dataset using "tar -cjf pali_dataset.tar.bz2 wavs metadata.csv"`)
console.log(`create dataset using "tar -cjf sinh_dataset.tar.bz2 wavs metadata.csv"`)
Loading

0 comments on commit e7d7281

Please sign in to comment.