#!/bin/sh
#
# (C) 2008 Caolán McNamara <caolanm@redhat.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

# This creates a LANG_TERRITORY .aff & .dic from a wordlist.
# It is only a simple wordlist spellchecking dictionary output, no
# knowledge of language rules can be extrapolated to shrink the
# wordlist or provide .aff rules for extending wordstems
#
# Usage: wordlist2hunspell wordlist_file locale
#   wordlist_file  text file; every non-empty line becomes one .dic entry
#   locale         LANG_TERRITORY name (e.g. br_FR); used as the basename
#                  of the output files and, with ".utf8" appended, as LANG
# Outputs: <locale>.aff and <locale>.dic in the current directory.
# Exit status: 0 on success; 1 on a usage error, an unreadable wordlist,
#              or when an output file cannot be created.

if [ $# -lt 2 ]; then
  echo "Usage: wordlist2hunspell wordlist_file locale"
  echo "e.g. wordlist2hunspell breton.words br_FR to create br_FR.dic and br_FR.aff in cwd"
  exit 1
fi

wordlist=$1
locale_name=$2

# Fail early instead of leaving behind a bogus .aff/.dic pair.
if [ ! -r "$wordlist" ]; then
  printf 'wordlist2hunspell: cannot read wordlist: %s\n' "$wordlist" >&2
  exit 1
fi

# The character-splitting sed below needs a UTF-8 locale so that "." matches
# whole characters rather than single bytes.
export LANG="$locale_name.utf8"

# Build the TRY string: every distinct non-whitespace character of the
# wordlist, most frequent first.
#  - The first sed puts each character on its own line. The replacement uses
#    the POSIX backslash-newline form, so the "/g'" line MUST stay at column 0
#    (anything before it would become part of the replacement text).
#  - Whitespace characters are dropped; they are of no use in a TRY line.
#  - see https://bugzilla.redhat.com/show_bug.cgi?id=462184 for the "C" hacks
#    (sort/uniq are forced to byte-wise comparison).
#  - The count that "uniq -c" prepends is removed with an anchored pattern
#    rather than by field position, because the amount of left padding
#    differs between implementations and disappears for large counts.
try_chars=$(
  sed 's/./&\
/g' < "$wordlist" \
    | sed '/^[[:space:]]*$/d' \
    | LC_ALL=C sort \
    | LC_ALL=C uniq -c \
    | LC_ALL=C sort -rn \
    | LC_ALL=C sed 's/^ *[0-9][0-9]* //' \
    | tr -d '\n'
)

# printf (not echo) so that backslashes and glob characters found in the
# wordlist are written literally. For an empty character set the line is
# just "TRY", without a trailing blank.
{
  echo "# A basic .aff for a raw wordlist, created through wordlist2hunspell"
  echo "SET UTF-8"
  printf 'TRY%s\n' "${try_chars:+ $try_chars}"
} > "$locale_name.aff" || exit 1

# The .dic starts with the number of entries, followed by the byte-sorted
# non-empty lines. awk counts a final line that lacks a trailing newline and
# prints the number without padding.
{
  LC_ALL=C awk 'length($0) > 0 { n++ } END { print n + 0 }' < "$wordlist"
  LC_ALL=C sort < "$wordlist" | sed '/^$/d'
} > "$locale_name.dic" || exit 1

printf 'Basic %s and %s created\n' "$locale_name.dic" "$locale_name.aff"