Skip to content

Commit

Permalink
feat(transliterate): add strict mode
Browse files Browse the repository at this point in the history
  • Loading branch information
noomorph committed May 12, 2024
1 parent 3bc0d73 commit 343119a
Show file tree
Hide file tree
Showing 4 changed files with 94 additions and 10 deletions.
2 changes: 2 additions & 0 deletions src/__utils__/fixtures.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,5 @@ export const pronouns_reflexive: FixtureGetter = () =>
jest.requireActual('../__fixtures__/pronouns-reflexive.json');
/** Fixture getter: loads `../__fixtures__/pronouns-relative.json` through `jest.requireActual` each time it is called. */
export const pronouns_relative: FixtureGetter = () =>
jest.requireActual('../__fixtures__/pronouns-relative.json');
/** Fixture getter: loads `../__fixtures__/other.json` through `jest.requireActual` each time it is called. */
export const other: FixtureGetter = () =>
jest.requireActual('../__fixtures__/other.json');
31 changes: 31 additions & 0 deletions src/transliterate/__tests__/integrity.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import * as fixtures from '../../__utils__/fixtures';
import transliterate from '..';

describe('transliteration integrity', () => {
  // Target script used for every comparison in this suite.
  const script = 'isv-Cyrl-x-etymolog';

  // All fixture tables concatenated into a single list of test rows,
  // in the same order as they are spread here.
  const rows = [
    ...fixtures.adjectives(),
    ...fixtures.nouns_feminine(),
    ...fixtures.nouns_masculine_animate(),
    ...fixtures.nouns_masculine(),
    ...fixtures.nouns_misc(),
    ...fixtures.nouns_neuter(),
    ...fixtures.verbs_imperfect(),
    ...fixtures.verbs_perfect(),
    ...fixtures.verbs_misc(),
    ...fixtures.pronouns_demonstrative(),
    ...fixtures.pronouns_indefinite(),
    ...fixtures.pronouns_interrogative(),
    ...fixtures.pronouns_personal(),
    ...fixtures.pronouns_possessive(),
    ...fixtures.pronouns_reciprocal(),
    ...fixtures.pronouns_reflexive(),
    ...fixtures.pronouns_relative(),
    ...fixtures.other(),
  ];

  // For each row, transliterating with `preprocessed = false` and with
  // `preprocessed = true` must produce the same string.
  test.each(rows)('%s', (_id, _morphology, lemma, additional) => {
    const input = `${lemma} ${additional}`;
    const fromRawInput = transliterate(input, script, false);
    const fromPreprocessedInput = transliterate(input, script, true);
    expect(fromRawInput).toBe(fromPreprocessedInput);
  });
});
21 changes: 21 additions & 0 deletions src/transliterate/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,127 +9,148 @@ import { FlavorisationBCP47Code } from '../constants';
export default function transliterate(
text: string,
lang: FlavorisationBCP47Code,
preprocessed = false,
): string {
switch (lang) {
case 'isv-Latn':
return _transliterate(
text,
TransliterationType.Latin,
FlavorizationType.Standard,
preprocessed,
);
case 'isv-Cyrl':
return _transliterate(
text,
TransliterationType.StandardCyrillic,
FlavorizationType.Standard,
preprocessed,
);
case 'isv-Glag':
return _transliterate(
text,
TransliterationType.Glagolitic,
FlavorizationType.Standard,
preprocessed,
);
case 'isv-x-fonipa':
return _transliterate(
text,
TransliterationType.IPA,
FlavorizationType.Etymological,
preprocessed,
);
case 'isv-Latn-x-etymolog':
return _transliterate(
text,
TransliterationType.Latin,
FlavorizationType.Etymological,
preprocessed,
);
case 'isv-Cyrl-x-etymolog':
return _transliterate(
text,
TransliterationType.StandardCyrillic,
FlavorizationType.Etymological,
preprocessed,
);
case 'isv-Glag-x-etymolog':
return _transliterate(
text,
TransliterationType.Glagolitic,
FlavorizationType.Etymological,
preprocessed,
);
case 'isv-Cyrl-x-iotated':
return _transliterate(
text,
TransliterationType.TraditionalIotatedCyrillic,
FlavorizationType.Standard,
preprocessed,
);
case 'isv-Cyrl-x-iotated-ext':
return _transliterate(
text,
TransliterationType.TraditionalIotatedCyrillic,
FlavorizationType.CyrillicExtended,
preprocessed,
);
case 'isv-Cyrl-x-northern':
return _transliterate(
text,
TransliterationType.StandardCyrillic,
FlavorizationType.Northern,
preprocessed,
);
case 'isv-Cyrl-x-sloviant':
return _transliterate(
text,
TransliterationType.StandardCyrillic,
FlavorizationType.Slovianto,
preprocessed,
);
case 'isv-Cyrl-x-southern':
return _transliterate(
text,
TransliterationType.StandardCyrillic,
FlavorizationType.Southern,
preprocessed,
);
case 'isv-Latn-PL':
return _transliterate(
text,
TransliterationType.Polish,
FlavorizationType.Etymological,
preprocessed,
);
case 'isv-Latn-x-ascii':
return _transliterate(
text,
TransliterationType.ASCII,
FlavorizationType.Standard,
preprocessed,
);
case 'isv-Latn-x-northern':
return _transliterate(
text,
TransliterationType.Latin,
FlavorizationType.Northern,
preprocessed,
);
case 'isv-Latn-x-sloviant':
return _transliterate(
text,
TransliterationType.Latin,
FlavorizationType.Slovianto,
preprocessed,
);
case 'isv-Latn-x-southern':
return _transliterate(
text,
TransliterationType.Latin,
FlavorizationType.Southern,
preprocessed,
);
case 'isv-Glag-x-northern':
return _transliterate(
text,
TransliterationType.Glagolitic,
FlavorizationType.Northern,
preprocessed,
);
case 'isv-Glag-x-southern':
return _transliterate(
text,
TransliterationType.Glagolitic,
FlavorizationType.Southern,
preprocessed,
);
case 'isv-Glag-x-sloviant':
return _transliterate(
text,
TransliterationType.Glagolitic,
FlavorizationType.Slovianto,
preprocessed,
);
case 'isv':
return text;
Expand Down
50 changes: 40 additions & 10 deletions src/transliterate/transliterate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,31 +23,38 @@ export enum FlavorizationType {
* @param iSource
* @param type
* @param flav
* @param preprocessed whether the input is already preprocessed; when true, each word only goes through the reduced `nmsifyStrict` replacements instead of the full `nmsifyLoose` normalization
* @see {@link http://steen.free.fr/scripts/transliteration.js}
*/
export function transliterate(
  iSource: string,
  type: number,
  flav: string | number = 2,
  preprocessed = false,
): string {
  // The normalizer depends only on `preprocessed`, so choose it once per call
  // rather than once per matched word: already-preprocessed input gets the
  // reduced strict replacements, anything else gets the full loose set.
  const preprocess = preprocessed ? nmsifyStrict : nmsifyLoose;

  // NFC-normalize the text, then rewrite every maximal run of letters and
  // combining marks; characters outside those runs are left as they are.
  return iSource.normalize('NFC').replace(/[\p{Letter}\p{Mark}]+/gu, (word) => {
    //symbol % marks the borders of the %word%
    const OrigW = `%${word}%`;
    // transliterateWord receives the lower-cased, normalized word and the
    // original-case bordered word as separate arguments.
    return transliterateWord(
      preprocess(OrigW.toLowerCase()),
      OrigW,
      type,
      flav,
    );
  });
}

// Matches a single lowercase vowel letter from the listed set (plain and
// diacritic-bearing forms). No `g` flag, so `.test()` keeps no state between calls.
const VOWEL = /[aeiouyąęųåėȯèòěê]/;

function transliterateWord(
iW: string,
__iW: string,
__OrigW: string,
type: string | number,
flav: string | number,
) {
//symbol % marks the borders of the %word%
iW = '%' + iW + '%';
let OrigW = iW;
iW = nmsify(iW.toLowerCase());
let iW = __iW;
let OrigW = __OrigW;
// 'ŕ' remains between two consonants, in other cases is replaced by 'ř'
iW = iW.replace(/ŕ/g, 'ř');
const aPos = iW.indexOf('ř');
Expand Down Expand Up @@ -603,7 +610,7 @@ function jgedoe(iW: string) {
return result;
}

function nmsify(iW: string) {
function nmsifyLoose(iW: string) {
return (
iW
.replace(/[яꙗ]/g, '#a')
Expand Down Expand Up @@ -683,8 +690,12 @@ function nmsify(iW: string) {
.replace(/zst/g, 'z#st')
.replace(/%izs/g, '%iz#s')
.replace(/%bezs/g, '%bez#s')
.replace(/%obezs/g, '%obez#s')
.replace(/%razs/g, '%raz#s')
.replace(/%råzs/g, '%råz#s')
.replace(/%szadu%/g, '%s#zadu%')
.replace(/%vozs/g, '%voz#s')
.replace(/%vȯzs/g, '%vȯz#s')
.replace(/konjug/g, 'kon#jug')
.replace(/konjun/g, 'kon#jun')
.replace(/injek/g, 'in#jek')
Expand Down Expand Up @@ -726,3 +737,22 @@ function nmsify(iW: string) {
.replace(/jj/g, 'j')
);
}

/**
 * Reduced normalization for a %-bordered, lower-cased word.
 *
 * Applies a fixed list of literal substring substitutions, one after another
 * in the order listed: a `#` separator is inserted into `konjug`, `konjun`
 * and `injek`, the whole word `%wifi%` becomes `%vifi%`, and a handful of
 * accented letters are mapped to their replacements.
 *
 * @param iW word to normalize
 * @returns the word with every listed occurrence replaced
 */
function nmsifyStrict(iW: string) {
  // Ordered [search, replacement] pairs; every occurrence of `search` is replaced.
  const substitutions: ReadonlyArray<readonly [string, string]> = [
    ['konjug', 'kon#jug'],
    ['konjun', 'kon#jun'],
    ['injek', 'in#jek'],
    ['%wifi%', '%vifi%'],
    ['á', 'a'],
    ['d́', 'ď'],
    ['é', 'e'],
    ['ì', 'i'],
    ['í', 'i'],
    ['ĵ', 'j'],
    ['ĺ', 'ľ'],
    ['ó', 'o'],
    ['œ', 'o'],
    ['t́', 'ť'],
    ['ý', 'y'],
  ];

  let result = iW;
  for (const [search, replacement] of substitutions) {
    // split/join replaces all literal occurrences of `search`.
    result = result.split(search).join(replacement);
  }
  return result;
}

0 comments on commit 343119a

Please sign in to comment.