create new pali dataset

pnfo · Jul 19, 2023 · e7d7281 · e7d7281
1 parent 6a938e1
commit e7d7281
Show file tree

Hide file tree

Showing 8 changed files with 6,444 additions and 142 deletions.
diff --git a/.gitignore b/.gitignore
@@ -2,4 +2,5 @@ orig
 *.bz2
 node_modules
 .DS_Store
-originals/*.flac
+originals/*.flac
+/wavs
diff --git a/README.md b/README.md
@@ -19,6 +19,12 @@ Currently there is a lack of publically availble tts datasets for sinhala langua
 - List of Roman Characters: ` !'(),-.:;?abcdefghijklmnoprstuvyæñāēīōśşūǣḍḥḷṁṅṇṉṛṝṭ`
 - Silences have been removed from both the beginning and the end of the recordings
 - Silences in the middle of the recording clipped to 0.75 seconds
+```
+Total labels => count: 6449, length: 13.9 hours, average length: 7.78
+Outliers labels => count: 6399, length: 13.8 hours, average length: 7.78
+Used labels => count: 6248, length: 13.7 hours, average length: 7.89
+characters=" !'(),-.:;=?abcdefghijklmnoprstuvyæñāēīōśşūǣḍḥḷṁṅṇṉṛṝṭ"
+characters=" !'(),-.:;=?[]ංඃඅආඇඈඉඊඋඌඍඑඒඓඔඕඖකඛගඝඞඟචඡජඣඤඥටඨඩඪණඬතථදධනඳපඵබභමඹයරලවශෂසහළෆ්ාැෑිීුූෘෙේෛොෝෞෲ‍‘’“”"
+```
 
 ## metadata.csv
 - contains a subset of the recordings which are less than 16 seconds long in the `ljspeech` format

diff --git a/char-counts.tsv b/char-counts.tsv
@@ -0,0 +1,54 @@
+a	98634
+ 	68590
+i	29055
+n	26598
+y	22669
+v	21396
+t	18054
+k	17929
+m	17504
+ā	17049
+u	16721
+h	16632
+d	15364
+s	15154
+r	15066
+e	14016
+p	10506
+æ	9347
+ē	8884
+.	7311
+g	7247
+l	7155
+ṭ	6307
+b	5695
+o	4230
+ṇ	4205
+ī	4069
+,	2796
+ō	2605
+ū	2524
+ḷ	2379
+ś	1793
+ş	1736
+j	1735
+c	1697
+ṉ	1489
+ḍ	1359
+'	1249
+ṁ	1074
+ṛ	630
+ñ	539
+(	523
+)	514
+ǣ	444
+:	427
+-	330
+ṅ	299
+?	224
+ḥ	107
+f	49
+!	33
+;	25
+=	13
+ṝ	7
diff --git a/create-dataset.js b/create-dataset.js
@@ -5,138 +5,119 @@ import path from 'path'
 import async from 'async'
 import {exec} from 'child_process'
 import jsb from 'json-beautify'
-import { normalizePrompt, normalizeText } from './common_functions'
-
-const labelInputFolder = '/Users/janaka/node/tipitaka.lk/public/audio', // also in '/Volumes/1TB/audio/final uploaded'
-    textInputFolder = '/Users/janaka/node/tipitaka.lk/public/static/text',
-    audioInputFolder = '/Volumes/1TB/audio/silence-added'
-const minClipLength = 3, maxClipLength = 15
-// TODO: ap- should not have any gatha, force set to para/default - but dhs was 
-const forceTypeNoGatha = /^(ap-dhs)/g
+import {sinhalaToRomanConvert} from '@pnfo/singlish-search/roman_convert.js'
+
+/**
+ * Clean a Sinhala prompt and produce its romanized form.
+ *
+ * @param {string} sinhala - raw prompt text, possibly with markup (bold, underline, footnotes)
+ * @returns {{sinhala: string, roman: string}} `sinhala` is the input with newlines replaced by
+ *      '...'; `roman` is the cleaned text passed through `sinhalaToRomanConvert` and trimmed
+ */
+export function normalizeText(sinhala) {
+    sinhala = sinhala.replace(/\n/g, '...') // newlines cause issues in displaying text
+    let text = sinhala.replace(/\*\*|__|\{\S+?\}/g, '') // remove bold, underline and footnotes
+    // strip a leading -pe- first: the general rule below also matches it and would leave '...' instead
+    text = text.replace(/^-පෙ-/g, '') // beginning with -pe- removed
+    text = text.replace(/ ?-පෙ-/g, '...') // -pe- is not pronounced
+    text = text.replace(/[-–—]+/g, '-')
+    text = text.replace(/[\[\{\(]\s?/g, '(') // only the normal bracket is supported
+    text = text.replace(/\s?[\]\}\)]/g, ')')
+    text = text.replace(/\(\s?\)/g, ' ') // remove empty brackets
+    text = text.replace(/["“”‘’]/g, "'") // all quotes to single straight quotes
+    text = text.replace(/\s+/g, ' ').trim() // collapse whitespace
+    text = text.replace(/\u200d/g, '') // remove yansa, rakar, bandi
+    const roman = sinhalaToRomanConvert(text).trim()
+    return {sinhala, roman}
+}
 
-const fileMap = JSON.parse(fs.readFileSync(path.join(labelInputFolder, 'file-map.json'), 'utf-8'))
+const audioInputFolder = 'originals', promptsInputFolder = 'prompts'
+const minClipLength = 2, maxClipLength = 15
 
 function loadLabelFile(file) {
-    return fs.readFileSync(path.join(labelInputFolder, file + '.txt'), 'utf-8').split('\n')
+    return fs.readFileSync(path.join(audioInputFolder, file + '.txt'), 'utf-8').split('\n')
         .filter(line => line.trim())
         .map(line => line.trim().split('\t').map(p => Number(p)))
-        .map(([start, end, num]) => ({start, end, length: end - start, file, num}))
-}
-
-function splitWords(sinhala) { // anything outside sinhala range deleted
-    return sinhala.replace(/[^\u0D80-\u0DFF ]/g, '').split(' ').filter(w => w.length)
-}
-function getMedianWordFrequency(words) {
-    const sorted = words.map(w => wordCounts[w]).sort((a, b) => a - b)
-    const midI = Math.floor(sorted.length / 2)
-    return sorted.length % 2 ? sorted[midI] : (sorted[midI - 1] + sorted[midI]) / 2
+        .map(([start, end, index]) => ({start, end, length: end - start, file, index}))
 }
-const incCounter = (arr, prop) => arr[prop] = arr[prop] ? arr[prop] + 1 : 1
-
 
-
-const wordCounts = {}, dedupList = {}
-let totalLabels = 0, totalLength = 0
-const usableEntries = []
-Object.entries(fileMap).forEach(([textFile, labelFiles]) => {
-    const entries = [], labels = []
-    labelFiles.forEach(lf => labels.push(...loadLabelFile(lf)))
-    JSON.parse(fs.readFileSync(path.join(textInputFolder, textFile + '.json'), 'utf-8')).pages.forEach(page => entries.push(...page.pali.entries))
-
-    // join labels with entries, leaving only entries with labels
-    let labeled = entries.filter(entry => !entry.noAudio)
-    labeled.forEach((e, i) => {
-            e.label = labels[i]
-            e.textFile = textFile
-    })
-    labeled = labeled.filter(e => e.label) // some text files could be only partially recorded
-
-    if (forceTypeNoGatha.test(textFile)) labeled.forEach(e => e.type = (e.type == 'gatha') ? 'paragraph' : e.type)
-
-    // get only the labels of length within desired range
-    let usable = labeled.filter((e, i) => e.label && e.label.length > minClipLength && e.label.length <= maxClipLength)
-    usable.forEach(e => {
-        const text = normalizeText(e.text, e.type)
-        e = Object.assign(e, normalizePrompt(text))
-        e.words = splitWords(e.sinhala)
+function loadPrompts(speaker) {
+    return fs.readFileSync(path.join(promptsInputFolder, `prompts-${speaker}.txt`), 'utf-8').split('\n\n').map(group => {
+        const lines = group.split('\n')
+        const [number, source] = lines[0].split('\t')
+        return {text: lines.slice(1).join('\n'), index: Number(number), source}
     })
-
-    usable = usable.filter(e => { // remove any text that occured before
-        const key = e.words.join(' ')
-        if (dedupList[key]) return false
-        for (let word of e.words) incCounter(wordCounts, word) // give higher prob to rare words
-        return dedupList[key] = true
+}
+const incCounter = (arr, prop) => arr[prop] = (arr[prop] || 0) + 1
+const totalDuration = (entries) => entries.reduce((acc, e) => acc + e.length, 0)
+
+/**
+ * Load every labelled recording of one speaker and pair each label with its prompt.
+ *
+ * Audio files are looked up in `audioInputFolder` and must be named `<start>-<end>-<speaker>.flac`;
+ * the labels of each file are read through `loadLabelFile` using the same base name.
+ * Problems with a file, its labels or its prompts are reported with `console.error`.
+ *
+ * @param {string} speaker - speaker name used in the audio file names and in the prompts file name
+ * @param {number} indexOffset - added to each label index to obtain the entry's `audioInd`
+ * @returns {Array<Object>} one entry per label (files ordered by their start index) holding
+ *      audioInd, roman, sinhala, speaker, start, end, length, lengthRatio, file and wavFile
+ */
+function loadSpeaker(speaker, indexOffset) {
+    const prompts = loadPrompts(speaker), entries = []
+    // read the directory and parse the file names
+    fs.readdirSync(audioInputFolder).filter(f => f.endsWith(`${speaker}.flac`)).map(f => {
+        const parts = f.match(/^(\d+)-(\d+)-(.+)\.flac$/)
+        if (!parts || parts[3] != speaker) console.error(`malformed file name ${f}`)
+        if (!parts) return null // nothing to parse - skip the file instead of crashing on parts.slice
+        const [startI, endI] = parts.slice(1, 3).map(p => Number(p))
+        const file = f.slice(0, -5), labels = loadLabelFile(file)
+        // an empty label file has no labels[0], so check the length before reading its index
+        if (labels.length == 0 || startI != labels[0].index || (endI - startI + 1) != labels.length) console.error(`malformed labels for file ${f}`)
+        console.log(`process ${f} with ${labels.length} labels`)
+        return {startI, endI, speaker, file, labels, prompts: prompts.slice(startI - 1, endI)}
+    }).filter(g => g) // drop the files skipped above
+    .sort((a, b) => a.startI - b.startI)
+    .forEach(g => {
+        g.labels.forEach(({start, end, length, index}, i) => {
+            const prompt = g.prompts[i]
+            if (!prompt) { // fewer prompts than labels - report and skip rather than crash on prompt.index
+                console.error(`no prompt found for the label index ${index} in file ${g.file}`)
+                return
+            }
+            if (index != prompt.index) console.error(`prompt index ${prompt.index} does not match the label index ${index}`)
+            const {sinhala, roman} = normalizeText(prompt.text), audioInd = index + indexOffset
+            // seconds per roman character, with 'h' left out of the character count
+            const lengthRatio = length / roman.replace(/h/g, '').length
+            entries.push({ audioInd, roman, sinhala, speaker: g.speaker, start, end, length,
+                lengthRatio, file: g.file, wavFile: `sinh_${String(audioInd).padStart(4, '0')}` })
+        })
     })
+    console.log(`total ${entries.length} labels loaded for speaker ${speaker}`)
+    return entries
+}
 
-    totalLength += labeled.reduce((acc, e) => acc + e.label.length, 0)
-    totalLabels += labeled.length
-
-    console.log(`file: ${textFile}, all labels: ${labeled.length}, usable labels: ${usable.length}`)
-    usableEntries.push(...usable)
-})
+const entries = []
+entries.push(...loadSpeaker('mettananda', 0))
+entries.push(...loadSpeaker('oshadi', entries.length))
 
-// remove outliers and sort usable entries based on a score
-usableEntries.forEach(e => {
-    e.score = getMedianWordFrequency(e.words)
-    e.lengthRatio = e.label.length / e.roman.replace(/h/g, '').length
-})
-const outliersToRemove = 100
-const outlierRemoved = usableEntries.sort((a, b) => a.lengthRatio - b.lengthRatio).slice(outliersToRemove, -outliersToRemove)
-outlierRemoved.sort((a, b) => a.score - b.score) // ascending order of the score
-const outlierLength = outlierRemoved.reduce((acc, e) => acc + e.label.length, 0)
-
-const requiredLength = 10 * 3600  // collect until this many hours are reached
-let collectedLength = 0
-const usedEntries = outlierRemoved.filter((e, i) => {
-    collectedLength += e.label.length
-    e.wavFile = 'pali_' + (i + 1).toString().padStart(4, '0')
-    return collectedLength <= requiredLength
-})
-usedEntries.sort((a, b) => a.lengthRatio - b.lengthRatio)
+const outliersToRemove = 25
+const outlierRemoved = entries.sort((a, b) => a.lengthRatio - b.lengthRatio).slice(outliersToRemove, -outliersToRemove)
+const usedEntries = outlierRemoved.filter(({length}) => length <= maxClipLength && length >= minClipLength).sort((a, b) => a.audioInd - b.audioInd)
 
 // extract content from audio files
 // trim all silences more than 0.75 seconds, normalize and set rate (original flac is 44100)
-const outputFolder = 'wavs', outputOptions = 'silence -l 1 0.1 1% -1 0.75 1% reverse silence 1 0.1 1% reverse rate 22050 norm -1' //rate 22050 before norm
-fs.rmSync(outputFolder, {recursive: true})
-fs.mkdirSync(outputFolder)
-
-const extractSegment = (e, callback) => { // Define the function that will extract a single segment
-    const inputFile = path.join(audioInputFolder, e.label.file + '.flac')
-    const command = `sox "${inputFile}" "${path.join(outputFolder, e.wavFile + '.wav')}" trim ${e.label.start} ${e.label.length.toFixed(2)} ${outputOptions}`;
-    exec(command, callback);
+// silence -l 1 0.1 1% -1 0.75 1% reverse silence 1 0.1 1% reverse
+const extractAudio = true
+if (extractAudio) {
+    const outputFolder = 'wavs', outputOptions = 'rate 22050 norm -1' //rate 22050 before norm
+    fs.rmSync(outputFolder, {recursive: true})
+    fs.mkdirSync(outputFolder)
+
+    const extractSegment = (e, callback) => { // Define the function that will extract a single segment
+        const inputFile = path.join(audioInputFolder, e.file + '.flac')
+        const command = `sox "${inputFile}" "${path.join(outputFolder, e.wavFile + '.wav')}" trim ${e.start} ${e.length.toFixed(2)} ${outputOptions}`;
+        exec(command, callback);
+    }
+    const startTime = Date.now()
+    async.mapLimit(usedEntries, 7, (e, mapCallback) => {
+            extractSegment(e, (error, stdout, stderr) => mapCallback(error || null, stderr || stdout))
+        }, (error, results) => {
+            if (error) console.error(error)
+            console.log(`Extracted audio from flac files in ${((Date.now() - startTime) / 1000).toFixed(2)} seconds`);
+        })
 }
-const startTime = new Date()
-// async.mapLimit(usedEntries, 7, (e, mapCallback) => {
-//         extractSegment(e, (error, stdout, stderr) => mapCallback(error || null, stderr || stdout))
-//     }, (error, results) => {
-//         if (error) console.error(error)
-//         console.log(`Extracted audio from flac files in ${((Date.now() - startTime) / 1000).toFixed(2)} seconds`);
-//     })
-
 
 // compute character and type counts
-const charCountsRoman = {}, charCountsSinhala = {}, typeCounts = {}
+const charCountsRoman = {}, charCountsSinhala = {}
 usedEntries.forEach(e => {
     for (let char of e.roman) incCounter(charCountsRoman, char)
     for (let char of e.sinhala) incCounter(charCountsSinhala, char)
-    incCounter(typeCounts, e.type)
-    e.speaker = (e.type == 'gatha') ? 'gatha' : 'default'
 })
-const usedLength = usedEntries.reduce((acc, e) => acc + e.label.length, 0)
 
 fs.writeFileSync('char-counts.tsv', Object.entries(charCountsRoman)
     .sort((a, b) => b[1] - a[1])
     .map(([char, count]) => char + '\t' + count)
     .join('\n'), 'utf-8')
-fs.writeFileSync('metadata.csv', usedEntries.map(e => [e.wavFile, e.sinhala, e.roman, e.speaker].join('|')).join('\n'), 'utf8')
-//fs.writeFileSync('word-counts.tsv', Object.entries(wordCounts).map(([word, count]) => word + '\t' + count).join('\n'), 'utf-8')
-fs.writeFileSync('text-entries.json', jsb(usedEntries, null, '\t', 100), 'utf-8')
 
+fs.writeFileSync('metadata.csv', usedEntries.map(e => [e.wavFile, e.roman, e.sinhala, e.speaker].join('|')).join('\n'), 'utf8')
+
+// logging stats
 const log = (stat, count, duration) => console.log(`${stat} labels => count: ${count}, length: ${(duration / 3600).toFixed(1)} hours, average length: ${(duration / count).toFixed(2)}`)
-log('Total', totalLabels, totalLength)
-log('Usable', outlierRemoved.length, outlierLength)
-log('Used', usedEntries.length, usedLength)
-console.log(typeCounts)
+log('Total', entries.length, totalDuration(entries))
+log('Outliers', outlierRemoved.length, totalDuration(outlierRemoved))
+log('Used', usedEntries.length, totalDuration(usedEntries))
 console.log(`characters="${Object.keys(charCountsRoman).sort().join('')}"`)
 console.log(`characters="${Object.keys(charCountsSinhala).sort().join('')}"`)
-console.log(`create dataset using "tar -cjf pali_dataset.tar.bz2 wavs metadata.csv"`)
+console.log(`create dataset using "tar -cjf sinh_dataset.tar.bz2 wavs metadata.csv"`)