Skip to content

Commit

Permalink
issues #10 and #15: more detailed incode documentation, one more test…
Browse files Browse the repository at this point in the history
… example
  • Loading branch information
leoalenc committed Apr 13, 2018
1 parent c59ab36 commit f6b9311
Show file tree
Hide file tree
Showing 5 changed files with 87 additions and 15 deletions.
1 change: 1 addition & 0 deletions fst/build-suff.foma
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Script for generating diminutives in Portuguese with Foma
# Usage: foma -f build-suff.foma

source suff02.xfst
ss suff02-foma.fst
4 changes: 3 additions & 1 deletion fst/build-suff.xfst
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Script for generating diminutives in Portuguese with XFST
# Script for generating and testing the analysis of diminutives in Portuguese with XFST
# Usage: xfst -f build-suff.xfst -q

source suff02.xfst
down < test-upper.txt > test-upper.out
# inho.txt contains all diminutives in inho and zinho from DELAF-PB
up < inho.txt > inho.out
ss suff02-xfst.fst
9 changes: 0 additions & 9 deletions fst/suff02.lexc
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,6 @@ LEXICON Root
Nouns ;

LEXICON Nouns
!< @bin"lem_m.sample.fst" "@U.GEND.M@" > Suff ;
!< @bin"lem_f.sample.fst" "@U.GEND.F@" > Suff ;

!< @txt"m.txt" "@U.GEND.M@" > Suff ;
!< @txt"f.txt" "@U.GEND.F@" > Suff ;

< @stxt"all_other_m_sg" "@U.GEND.M@" "@U.NUM.SG@" > Cat ;
< @stxt"all_other_m_pl" "@U.GEND.M@" "@U.NUM.PL@" > Cat ;
Expand All @@ -26,15 +21,11 @@ LEXICON Nouns
< @stxt"aug_f_sg" "@U.GEND.F@" "@U.NUM.SG@" "@U.AUG.Y@" > Cat ;
< @stxt"aug_f_pl" "@U.GEND.F@" "@U.NUM.PL@" "@U.AUG.Y@" > Cat ;

!LEXICON Word
!< "@U.CAT.W@" > Suff ;

LEXICON Cat
< %+N:0 "@D.AUG@" > Suff ;
< %+N:0 %+AUG:0 "@R.AUG@" > Suff ;

LEXICON Suff
! <%+DIM 0:%^ "@D.CAT.W@" > Gend ;
<%+DIM 0:%^ "@D.ENDING.S@" "@U.NUM.SG@" "@C.NUM@" > Gend ;
<%+DIM:{^zinh} 0:%^ "@D.ENDING.S@" > Gend ;
<%+DIM:{^inh} 0:%^ "@R.ENDING.S@" > Gend ;
Expand Down
86 changes: 81 additions & 5 deletions fst/suff02.xfst
Original file line number Diff line number Diff line change
Expand Up @@ -4,40 +4,90 @@
# This script runs in both Foma and XFST. It first
# compiles the lexc grammar defined in suff02.lexc.
# This grammar defines a lexicon transducer
# (Beesley & Karttunen 2003), mapping lexical analyses
# to intermediate forms. These intermediate forms are then
# mapped to the final strings by composing the lexicon transducer
# (Beesley & Karttunen 2003), mapping lexical strings
# (abstract representations consisting of lemma and features)
# to intermediate forms. These intermediate forms are then mapped
# to the final strings (inflected words) by composing the lexicon transducer
# with a cascade of alternation rules. For more information on
# finite-state morphology, see Beesley & Karttunen (2003).
# Further references in the README file.

# Author: Leonel F. de Alencar, Federal University of Ceará
# Date: April 11, 2018

set flag-is-epsilon ON
clear
# define AccVow [ á | ã | é | ê | ó | ô | i | u ] ;

# anterior vowels
define AntVow [ e | i ] ;

# posterior vowels
define PostVow [ a | o | u ] ;

# vowels
define Vow [ AntVow | PostVow ] ;

# morpheme separator
define MorphSep %^ ;

# thematic vowels
define ThemVow [ a | e | o ] ;

# convert letter c to the phone [s] when it precedes an anterior vowel
# followed by the morpheme separator; this prevents rule ChangeC
# from applying in cases like face^inha (diminutive of face)
define PhonC [c -> %[ s %] || _ AntVow MorphSep ] ;

# convert phone [s] back to letter c
define OrthC %[ s %] -> c ;

# replace ç with plain c (i.e. drop the cedilla) before
# morpheme separator and anterior vowel
define DeleteCedilla [ ç -> c || _ MorphSep AntVow ];

# convert m to n when it is followed by the morpheme separator
# and then by any symbol other than p or b
define NasalBilabAssim m -> n || _ MorphSep \[p|b];

# delete thematic vowel before morpheme separator and vowel
# casa^inha => cas^inha (diminutive of house)
define ThemVowDel ThemVow -> 0 || _ MorphSep Vow ;

# Note that the mapping examples used throughout these comments are simplified;
# they are meant to help the reader grasp how the different alternation rules work.
# They do not correspond to the actual intermediate strings generated
# by the lexicon transducer.

# faca^inha => fac^inha => faqu^inha (diminutive of knife)
define ChangeC c -> {qu} || _ MorphSep AntVow ;

# manga^inha => mang^inha => mangu^inha (diminutive of mango)
define ChangeG g -> {gu} || _ MorphSep AntVow ;

# optionally delete e in cases like luzes^zinhas (diminutive of light in plural)
define OptDelEStemZ e (->) 0 || [z | s] _ s MorphSep z ;

# optionally delete e in cases like flores^zinhas (diminutive of flower in plural)
define OptDelEStemR e (->) 0 || r _ s MorphSep z ;

# composing the two previous rules
define OptDelE OptDelEStemZ .o. OptDelEStemR ;

# luzes^zinhas => luze^zinhas
define SDeletion s -> 0 || _ MorphSep z ;

# luz^zinhas => lu^zinhas
define ZDeletion z-> 0 || _ MorphSep z ;

# ideia^inha => idei^inha => ide^inha (diminutive of idea)
define IDeletion i-> 0 || _ MorphSep i ;

# delete morpheme separator
define DeleteMorphSep MorphSep -> 0 ;

# remove accents (diacritics marking stress) from derived words
define Unaccent [[á -> a] .o. [é -> e] .o. [ê -> e] .o. [ó -> o]
.o. [ô -> o] .o. [í -> i] .o. [ú -> u]
.o. [â -> a ]];

# compose the cascade of alternation rules in one single transducer
define AltRules NasalBilabAssim .o.
PhonC
.o.
Expand All @@ -62,9 +112,35 @@ define AltRules NasalBilabAssim .o.
DeleteCedilla
.o.
DeleteMorphSep ;

# convert abstract morpheme representation +DIM to inh suffix
# before thematic vowel, e.g.:
# casa^+DIM^a => cas^inh^a (diminutive of house)
# excepting cases where the thematic vowel is preceded by ã or õ, e.g.:
# pão => pão^+DIM^o => *pão^inh^o (diminutive of bread)
define RealizeDim [ %+DIM -> {^inh} || \[ã|õ] ThemVow _ ] ;
read lexc < suff02.lexc

# compiling lexc grammar
read lexc < suff02.lexc

# assigning resulting transducer to variable Lex
define Lex

# remove paths with +DIM in the transducer's lower language
define Filter ~ [$ %+DIM ] ;
# This prevents generation (and analysis) of ungrammatical forms like
# *cafeinho (cafezinho, diminutive of coffee), *pãinho (pãozinho),
# *marinho (marzinho, diminutive of sea), *papelinho (papelzinho,
# diminutive of paper), etc. This is subject to some dialectal variation,
# e.g. casalinho (diminutive of couple or town), which occurs in European Portuguese
# (Villalva & Silvestre 2014:120), and mulherinha (diminutive of woman),
# which is used by some speakers of Brazilian Portuguese. These forms
# can be generated by changing the left context of the RealizeDim rule.

# compose everything into the final lexical transducer
# mapping lexical strings like pão+N+DIM+M+PL
# to surface strings (i.e. inflected words) like
# pãezinhos (plural diminutive of bread)

regex Lex .o. RealizeDim .o. Filter .o. AltRules ;

2 changes: 2 additions & 0 deletions fst/test-upper.txt
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ lápis+N+DIM+M+SG
lápis+N+DIM+M+PL
rapaz+N+DIM+M+SG
rapaz+N+DIM+M+PL
canal+N+DIM+M+SG
canal+N+DIM+M+PL
homem+N+AUG+DIM+M+SG
homem+N+AUG+DIM+M+PL
agulha+N+AUG+DIM+M+SG
Expand Down

0 comments on commit f6b9311

Please sign in to comment.