#!/bin/sh
# affix compressor utility for Hunspell
# 2008 (c) László Németh, version 0.3
# usage: affixcompress sorted_word_list_file [max_affix_rules]
#
# Reads a sorted word list (the first whitespace-separated field of each line
# is the word) and writes two files next to it:
#   <word_list>.aff  "FLAG num" followed by numbered SFX rules
#   <word_list>.dic  entry count, then sorted "stem" or "stem/flag,flag" lines
#
# The work is done in three stages:
#   1. count candidate suffix rules and keep the max_affix_rules most frequent
#   2. pick a stem (and rule number) for every word, write the .aff file
#   3. merge the rule numbers of identical stems and write the .dic file
# Progress messages ("Deleted roots:", "Deleted class:") go to stderr.

if [ "$#" -eq 0 ]; then
  cat <<'EOF'
affixcompress - compress a huge sorted word list to Hunspell format
Usage:

LC_ALL=C sort word_list >sorted_word_list
affixcompress sorted_word_list [max_affix_rules]

Default value of max_affix_rules = 5000

Note: output may need manually added affix parameters (SET character_encoding,
TRY suggestion_characters etc., see man(4) hunspell)
EOF
  exit 0
fi

set -eu

input=$1
MAXAFFIX=${2:-5000}

# profiling
#AWK="pgawk --profile"
AWK="gawk"

# Run the configured awk. AWK is expanded unquoted on purpose so that it may
# carry options, as in the profiling variant above.
run_awk() {
  # shellcheck disable=SC2086
  $AWK "$@"
}

case $MAXAFFIX in
  *[!0-9]*)
    printf 'affixcompress: max_affix_rules must be a non-negative integer: %s\n' \
      "$MAXAFFIX" >&2
    exit 1
    ;;
esac

if [ ! -r "$input" ]; then
  printf 'affixcompress: cannot read word list: %s\n' "$input" >&2
  exit 1
fi

if ! command -v "${AWK%% *}" >/dev/null 2>&1; then
  printf 'affixcompress: %s not found\n' "${AWK%% *}" >&2
  exit 1
fi

# Intermediate files live in a private directory that is removed on any exit
# (the signal trap turns HUP/INT/TERM into a normal exit so EXIT fires).
tmpdir=$(mktemp -d "${TMPDIR:-/tmp}/affixcompress.XXXXXX")
trap 'rm -rf -- "$tmpdir"' EXIT
trap 'exit 1' HUP INT TERM

rules=$tmpdir/rules     # "suffix strip count" lines, most frequent first
roots=$tmpdir/roots     # "stem [rule_number]" lines, one or more per word
entries=$tmpdir/entries # final sorted .dic entries (without the count line)

rm -f -- "$input.aff" "$input.dic"

# ---------------------------------------------------------------------------
# Stage 1: calculate frequent suffixes.
# Output lines are "suffix strip count"; strip is 0 for a plain suffix and a
# single character when that character must be removed from the stem first.
# The list is ordered by descending count and cut to MAXAFFIX lines.
# ---------------------------------------------------------------------------
run_awk '
{
  word = $1
  len = length(word)

  # The lookup tables only cover words sharing the same first two
  # characters. Reset them BEFORE recording the current word so that both
  # of its entries survive the reset.
  if (len > 2) {
    head2 = substr(word, 1, 2)
    if (head2 != prev) {
      delete seen
      delete lastchar
      print "Deleted roots: ", prev > "/dev/stderr"
    }
    prev = head2
  }

  seen[word] = 1
  # lastchar[stem] = final character of a word whose other characters are stem
  if (len > 2) lastchar[substr(word, 1, len - 1)] = substr(word, len, 1)

  for (i = 2; i < len; i++) {
    r = substr(word, 1, i)
    rest = substr(word, i + 1)
    if (r in seen) {
      # the prefix is itself a word: plain suffix
      sfx[rest]++
    } else if ((r in lastchar) && lastchar[r] != substr(word, i + 1, 1)) {
      # prefix + one other character is a word: suffix that strips that char
      sfy[rest, lastchar[r]]++
    }
  }
}
END {
  for (s in sfx) print s, 0, sfx[s]
  for (key in sfy) {
    split(key, part, SUBSEP)
    print part[1], part[2], sfy[key]
  }
}
' <"$input" |
  sort -rn -k 3 |
  run_awk '$3 >= 1' |
  head -n "$MAXAFFIX" >"$rules"

# ---------------------------------------------------------------------------
# Stage 2: choose a stem for every word and write the .aff file.
# The rule table arrives on stdin ("-"); its line number becomes the numeric
# flag of the rule. The word list is then processed in groups of words that
# share their first three characters.
# ---------------------------------------------------------------------------
run_awk '
# Process the current group held in words[]: score potential stems, then
# print one or more "stem [flag]" lines per word. Uses the global rule tables
# sfx, sfxfr, sfy, sfyfr, sfz; score[] and emitted[] are scratch arrays that
# are cleared before returning. The extra parameters are local variables.
function potential_roots(    word, len, i, k, n, root, suff, strip, cand,
                             best, bestroot, bestflag, found) {
  # potential roots with most frequent suffixes
  for (word in words) {
    len = length(word)
    for (i = 2; i < len; i++) {
      root = substr(word, 1, i)
      suff = substr(word, i + 1)
      if ((root in words) && (suff in sfxfr) && sfxfr[suff] > 100)
        score[root]++
      if (suff in sfz) {
        n = split(sfz[suff], strip)
        for (k = 1; k <= n; k++) {
          cand = root strip[k]
          # frequency of the (suffix, strip) rule, keyed like sfy
          if ((cand in words) && sfyfr[suff, strip[k]] > 100)
            score[cand]++
        }
      }
    }
  }

  # calculate roots
  for (word in words) {
    len = length(word)
    found = 0
    # choose most frequent root (maybe the original word)
    best = score[word] + 0
    bestroot = word
    bestflag = 0
    for (i = 2; i < len; i++) {
      root = substr(word, 1, i)
      suff = substr(word, i + 1)
      if ((suff in sfx) && score[root] > best) {
        best = score[root]
        bestroot = root
        bestflag = sfx[suff]
      }
      if (suff in sfz) {
        n = split(sfz[suff], strip)
        for (k = 1; k <= n; k++) {
          cand = root strip[k]
          if (score[cand] > best) {
            best = score[cand]
            bestroot = cand
            bestflag = sfy[suff, strip[k]]
          }
        }
      }
    }
    if (best > 0) {
      if (bestflag > 0) print bestroot, bestflag
      else print bestroot
      emitted[bestroot]++
      found = 1
    } else {
      # no scored stem: reuse a stem already emitted for this group
      for (i = 2; i < len; i++) {
        root = substr(word, 1, i)
        suff = substr(word, i + 1)
        if ((root in emitted) && (suff in sfx)) {
          print root, sfx[suff]
          found = 1
          break
        }
        if (suff in sfz) {
          n = split(sfz[suff], strip)
          # NOTE(review): this break only leaves the strip-character loop,
          # so the scan over i continues and may print further stem/flag
          # pairs for the same word. Kept as in the original.
          for (k = 1; k <= n; k++) {
            cand = root strip[k]
            if (cand in emitted) {
              print cand, sfy[suff, strip[k]]
              found = 1
              break
            }
          }
        }
      }
    }
    if (!found) {
      print word
      emitted[word]++
    }
  }
  delete emitted
  delete score
}

# rule table from stdin: "suffix strip count"
FILENAME == "-" {
  if ($2 == 0) {
    sfx[$1] = NR
    sfxfr[$1] = $3
  } else {
    sfy[$1, $2] = NR
    sfyfr[$1, $2] = $3
    sfz[$1] = sfz[$1] " " $2   # space-separated strip characters per suffix
  }
  next
}

# word list: flush the group whenever the three-character prefix changes
{
  cap = substr($1, 1, 3)
  if (cap != prev) {
    potential_roots()
    delete words
    print "Deleted class:", prev > "/dev/stderr"
  }
  prev = cap
  words[$1] = 1
}

END {
  potential_roots()
  # write out frequent suffixes
  out = FILENAME ".aff"
  print "FLAG num" > out
  for (s in sfx) {
    print "SFX", sfx[s], "Y 1" > out
    print "SFX", sfx[s], "0", s, "." > out
  }
  for (key in sfy) {
    split(key, part, SUBSEP)
    print "SFX", sfy[key], "Y 1" > out
    # strip part[2], add part[1], condition: stem ends in part[2]
    print "SFX", sfy[key], part[2], part[1], part[2] > out
  }
}
' - "$input" <"$rules" >"$roots"

# ---------------------------------------------------------------------------
# Stage 3: merge the flags of identical stems and write the .dic file.
# Sorting by flag number first puts flagless lines ahead of flagged ones and
# lists the flags of each stem in ascending order.
# ---------------------------------------------------------------------------
sort -n -k 2 "$roots" |
  run_awk '
{
  if (flags[$1] == "") flags[$1] = $2
  else if ($2 != "") flags[$1] = flags[$1] "," $2
}
END {
  for (stem in flags) {
    if (flags[stem] == "") print stem
    else print stem "/" flags[stem]
  }
}
' | sort >"$entries"

# The first line of the .dic file is the number of entries; every entry is
# exactly one line, so the line count of the entry file is that number.
{
  run_awk 'END { print NR }' "$entries"
  cat "$entries"
} >"$input.dic"