#!/bin/sh
# affix compressor utility for Hunspell
# 2008 (c) László Németh, version 0.3
# usage: affixcompress sorted_word_list_file [max_affix_rules]
# Without any argument there is nothing to compress: show the help text
# on standard output and finish successfully.
if [ "$#" -eq 0 ]; then
cat <<'USAGE_TEXT'
affixcompress - compress a huge sorted word list to Hunspell format
Usage:
LC_ALL=C sort word_list >sorted_word_list
affixcompress sorted_word_list [max_affix_rules]
Default value of max_affix_rules = 5000
Note: output may need manually added affix parameters (SET character_encoding,
TRY suggestion_characters etc., see man(4) hunspell)
USAGE_TEXT
exit 0
fi
# Upper bound on the number of affix rules that are kept
# (second command line argument, 5000 when omitted).
MAXAFFIX=${2:-5000}
# awk interpreter used by all awk invocations of this script.
# For profiling use instead: AWK="pgawk --profile"
AWK="gawk"
# Remove results of an earlier run; the .dic file is written in append mode
# at the end of the script, so stale content must not survive.
# Quoted so that a word list name with blanks or glob characters is safe.
rm -f -- "$1.aff" "$1.dic"
# Collect candidate suffix rules from the sorted word list.
# Lines written to affixcompress0.tmp:
#   "suffix 0 count"     - suffix appended to a shorter word of the list
#   "suffix char count"  - suffix replacing the final character "char" of a
#                          shorter word of the list
# The word list is fed on standard input with a quoted redirection, so file
# names containing blanks or glob characters work.
# $AWK stays unquoted on purpose: a value such as "pgawk --profile" has to
# split into a command and its option.
$AWK '
{
    # calculate frequent suffixes
    # A: words seen since the last reset (see below)
    A[$1] = 1
    len = length($1)
    if (len > 2) {
        # B: word without its last character -> that last character
        # print $1, substr($1, 1, len - 1), substr($1, len, 1) >"/dev/stderr"
        B[substr($1, 1, len - 1)] = substr($1, len, 1);
    }
    # try every proper prefix of at least two characters as a root
    for(i = 2; i < len; i++) {
        r = substr($1, 1, i)
        if (i == 2) {
            # a new two character prefix starts: forget the words collected
            # so far, this keeps the arrays small on sorted input
            if (prev != r) {
                delete A
                delete B
                print "Deleted roots: ", prev > "/dev/stderr"
                A[$1] = 1
            }
            prev = r
        }
        if (A[r]) {
            # the prefix is itself a word: the rest is a plain suffix
            # print $1 ": " r " and " substr($1, i + 1, len - i + 1) >"/dev/stderr"
            sfx[substr($1, i + 1, len - i + 1)]++
        } else if (B[r] && B[r] != substr($1, i + 1, 1)) {
            # the prefix plus the character B[r] is a word and the current
            # word continues differently: the rest replaces that character
            r2 = substr($1, i + 1, len - i + 1)
            sfy[r2,B[r]]++
        }
    }
}
END {
    for (i in sfx) print i, 0, sfx[i]
    # the two parts of an sfy index are joined by SUBSEP (octal 034);
    # the tr command after this program turns it into a blank
    for (i in sfy) print i, sfy[i]
}
' <"$1" | tr '\034' ' ' >affixcompress0.tmp
# Rank the candidate rules by descending frequency (third field), drop lines
# whose count is below 1 and keep at most $MAXAFFIX of them.
# "head -n N" replaces the obsolete "head -N" form; the limit is quoted.
sort -rnk 3 affixcompress0.tmp | $AWK '$3 >= 1{print $0}' |
head -n "$MAXAFFIX" >affixcompress1.tmp
# Choose a root (and, where possible, a suffix rule) for every word and
# write the affix file.
# Input 1 (standard input, named "-"): the ranked rules of affixcompress1.tmp,
#   one "suffix 0 count" or "suffix char count" per line; the line number
#   becomes the numeric flag of the rule.
# Input 2: the sorted word list "$1".
# Output: "root" or "root flag" lines in affixcompress2.tmp and the rule
#   definitions in "$1.aff".
# $AWK stays unquoted on purpose so that "pgawk --profile" splits into words.
$AWK '
# Process the words collected in W (all sharing the same first three
# characters) and print one or more dictionary lines for each of them.
# Uses the globals W, sfx, sfxfr, sfy, sfyfr, sfz; A and C are scratch arrays
# that are emptied again before returning.
function potential_roots() {
    # potential roots with most frequent suffixes
    # Only entries with value 1 are real words: merely testing W[root]
    # creates empty entries that have to be skipped.
    for(word in W) if (W[word]==1) {
        len = length(word);
        for(i = 2; i < len; i++) {
            root = substr(word, 1, i)
            suff = substr(word, i + 1, len - i + 1)
            # plain suffix rule used more than 100 times, root is a word
            if ((W[root]!="") && (sfxfr[suff] > 100)) C[root]++
            if (sfz[suff]) {
                # sfz lists the characters this suffix may replace
                l = split(sfz[suff], a)
                # the frequency table is indexed like sfy: (suffix, character)
                for (k=1; k <= l; k++) if ((W[root a[k]]!="") && (sfyfr[suff,a[k]] > 100)) {
                    C[root a[k]]++
                }
            }
        }
    }
    # calculate roots
    for(word in W) if (W[word]==1) {
        len = length(word);
        # z becomes 1 as soon as a line was printed for this word
        z = 0
        # choose most frequent root (maybe the original word)
        max = C[word]
        maxword = word
        maxsuff = 0
        for(i = 2; i < len; i++) {
            root = substr(word, 1, i)
            suff = substr(word, i + 1, len - i + 1)
            if ((sfx[suff] != "") && (C[root] > max)) {
                max = C[root]
                maxword = root
                maxsuff = sfx[suff]
            }
            if (sfz[suff] != "") {
                l = split(sfz[suff], a)
                for (k=1; k <= l; k++) if (C[root a[k]] > max) {
                    max = C[root a[k]]
                    maxword = root a[k]
                    maxsuff = sfy[suff,a[k]]
                }
            }
        }
        if (max > 0) {
            # a counted root exists; maxsuff is 0 when it is the word itself
            if (maxsuff > 0) print maxword, maxsuff; else print maxword
            A[maxword]++
            z=1
        } else {
            # no counted root: fall back to roots already printed (array A)
            for(i = 2; i < len; i++) {
                root = substr(word, 1, i)
                suff = substr(word, i + 1, len - i + 1)
                if ((A[root] > 0) && sfx[suff]!="") {
                    print root, sfx[suff]
                    z = 1
                    break
                }
                if (sfz[suff]) {
                    l = split(sfz[suff], a)
                    for (k=1; k <= l; k++) if (A[root a[k]]!="") {
                        print root a[k], sfy[suff,a[k]]
                        z = 1
                        # leaves only the inner loop over k
                        break
                    }
                }
            }
        }
        if (z == 0) {
            # nothing matched: the word is its own root
            print word
            A[word]++
        }
    }
    delete A
    delete C
}
# First input (standard input): load the rule tables.
FILENAME == "-" {
    if ($2 == 0) {
        # plain suffix: flag number and frequency
        sfx[$1] = NR
        sfxfr[$1] = $3
    } else {
        # suffix replacing the character $2: flag number and frequency
        sfy[$1,$2] = NR
        sfyfr[$1,$2] = $3
        # blank separated list of replaceable characters per suffix
        sfz[$1] = sfz[$1] " " $2
    }
    maxsuf = NR
    next
}
# Second input (word list): gather words sharing their first three
# characters and process each such class when the next one begins.
{
    cap = substr($1, 1, 3)
    if (cap != prev) {
        potential_roots()
        delete W
        print "Deleted class:", prev > "/dev/stderr"
    }
    prev = cap
    W[$1] = 1
}
END {
    # process the last class
    potential_roots()
    # write out frequent suffixes
    # FILENAME holds the name of the last input here, i.e. the word list
    out=FILENAME ".aff"
    print "FLAG num" >out
    # entries created by mere lookups are empty and skipped by the > 0 test
    for (i in sfx) if (sfx[i] > 0) {
        print "SFX", sfx[i], "Y 1" >out
        print "SFX", sfx[i], "0", i, "." >out
    }
    for (i in sfy) if (sfy[i] > 0) {
        print "SFX", sfy[i], "Y 1" >out
        # split the combined index at SUBSEP: c[1] suffix, c[2] character
        split(i, c, "\034");
        # strip c[2], add c[1], condition: the root ends in c[2]
        print "SFX", sfy[i], c[2], c[1], c[2] >out
    }
}
' - "$1" <affixcompress1.tmp >affixcompress2.tmp
# Build the dictionary file: merge all flags of a root into one
# "root/flag,flag,..." entry, write the number of entries as first line of
# "$1.dic" and append the sorted entries after it.
# Sorting numerically by the flag field first makes the flags of a root
# arrive in ascending order.
sort -nk 2 affixcompress2.tmp >affixcompress3.tmp
# $AWK stays unquoted on purpose so that "pgawk --profile" splits into words.
$AWK -v out="$1.dic" '
{
    # A: root -> comma separated flag list (empty for a root without flag)
    if (A[$1]=="") A[$1]=$2;
    else if ($2!="") A[$1] = A[$1] "," $2
}
END {
    # count the entries; start from 0 so that empty input yields "0"
    # instead of an empty first line
    n = 0
    for (i in A) n++
    print n >out
    for (i in A) {
        if (A[i]=="") print i
        else print i "/" A[i]
    }
}
' <affixcompress3.tmp | sort >>"$1.dic"