blob: 6bbd09c8f5031fe7008c92b130de0489f1bf2608 [file] [log] [blame]
/* Un-munch a root word list with affix tags
* to recreate the original word list
*/
#include <ctype.h>
#include <string.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#ifdef __linux__
#include <error.h>
#include <errno.h>
#include <sys/mman.h>
#endif
#include "unmunch.h"
/* Program entry point.
 *
 * Usage: unmunch dic_file affix_file
 *
 * Parses the affix file (argv[2]) into the global prefix/suffix tables, then
 * reads the word list (argv[1]) line by line, skipping its first line, and
 * prints each root word followed by every form expand_rootword() generates
 * for it to stdout, one word per line. Progress and errors go to stderr.
 *
 * Returns 0 on success and 2 when the word list has no first line; exits
 * with status 1 on a usage error, an unopenable file, an affix parse failure
 * or when a root word cannot be copied.
 */
int main(int argc, char** argv)
{
  int i;
  int al, wl;
  FILE * wrdlst;
  FILE * afflst;
  const char * wf;
  const char * af;
  char * ap;
  char ts[MAX_LN_LEN];

  /* first parse the command line options */
  /* arg1 - munched wordlist, arg2 - affix file */
  if (argc < 3) {
    fprintf(stderr,"correct syntax is:\n");
    fprintf(stderr,"unmunch dic_file affix_file\n");
    exit(1);
  }
  /* the argv strings outlive main's use of them, so no copies are needed */
  wf = argv[1];
  af = argv[2];

  /* open the affix file */
  afflst = fopen(af,"r");
  if (!afflst) {
    fprintf(stderr,"Error - could not open affix description file\n");
    exit(1);
  }

  /* step one is to parse the affix file building up the internal
     affix data structures */
  numpfx = 0;
  numsfx = 0;
  fullstrip = 0;
  if (parse_aff_file(afflst)) {
    fprintf(stderr,"Error - in affix file loading\n");
    fclose(afflst);
    exit(1);
  }
  fclose(afflst);
  fprintf(stderr,"parsed in %d prefixes and %d suffixes\n",numpfx,numsfx);

  /* affix file is now parsed so expand the wordlist on the fly */
  /* open the wordlist */
  wrdlst = fopen(wf,"r");
  if (!wrdlst) {
    fprintf(stderr,"Error - could not open word list file\n");
    exit(1);
  }

  /* skip over the hash table size */
  if (! fgets(ts, MAX_LN_LEN-1,wrdlst)) {
    fclose(wrdlst);
    return 2;
  }

  while (fgets(ts,MAX_LN_LEN-1,wrdlst)) {
    mychomp(ts);

    /* split each line into word and affix char strings */
    ap = strchr(ts,'/');
    if (ap) {
      *ap = '\0';
      ap++;
      al = strlen(ap);
    } else {
      al = 0;
      ap = NULL;
    }
    wl = strlen(ts);

    /* slot 0 of the per-line word list always holds the root word itself */
    numwords = 0;
    wlist[numwords].word = mystrdup(ts);
    if (!wlist[numwords].word) {
      fprintf(stderr,"Error - out of memory\n");
      fclose(wrdlst);
      exit(1);
    }
    wlist[numwords].pallow = 0;
    numwords++;

    if (al)
      expand_rootword(ts,wl,ap,al);

    /* emit and release everything generated for this line */
    for (i=0; i < numwords; i++) {
      fprintf(stdout,"%s\n",wlist[i].word);
      free(wlist[i].word);
      wlist[i].word = NULL;
      wlist[i].pallow = 0;
    }
  }

  fclose(wrdlst);
  return 0;
}
/* Release an affix entry array together with the strip/append strings its
 * first n entries own. The array must have been zero-initialised so that
 * string pointers which were never set are NULL. Accepts ents == NULL.
 */
static void free_affents(struct affent * ents, int n)
{
  int k;
  if (!ents) return;
  for (k = 0; k < n; k++) {
    free(ents[k].strip);
    free(ents[k].appnd);
  }
  free(ents);
}

/* Parse an affix description stream into the global ptable/stable arrays.
 *
 * Recognised lines (fields are the non-empty space-separated tokens):
 *   FULLSTRIP                    sets the global fullstrip flag
 *   PFX|SFX flag Y|N count       group header, followed by `count` entries
 *   PFX|SFX flag strip append [condition]
 *                                one entry; "0" for strip or append means
 *                                the empty string
 * Other lines are ignored. Every line read is echoed to stderr.
 *
 * A header whose count is missing or not a positive int is reported and
 * skipped. An entry without a condition field keeps numconds == 0 (the
 * entry array is zero-initialised), i.e. it has no conditions to fail.
 *
 * Returns 0 on success, 1 when the stream ends inside a group, an entry
 * lacks its strip/append fields, or memory runs out. On failure the group
 * being built is freed; groups already stored in the tables are kept.
 */
int parse_aff_file(FILE * afflst)
{
  /* largest value an int can hold, spelled without <limits.h> */
  const unsigned long int_max = ((unsigned int) -1) >> 1;
  int i, j;
  int numents = 0;
  char achar = '\0';
  short ff = 0;
  char ft;
  struct affent * ptr = NULL;
  struct affent * nptr = NULL;
  char * tp;
  char * piece = NULL;
  char line[MAX_LN_LEN];

  while (fgets(line,MAX_LN_LEN,afflst)) {
    mychomp(line);
    ft = ' ';
    fprintf(stderr,"parsing line: %s\n",line);
    if (strncmp(line,"FULLSTRIP",9) == 0) fullstrip = 1;
    if (strncmp(line,"PFX",3) == 0) ft = 'P';
    if (strncmp(line,"SFX",3) == 0) ft = 'S';
    if (ft == ' ') continue;

    /* group header: field 1 = flag char, 2 = cross product, 3 = count */
    tp = line;
    ff = 0;
    i = 0;
    while ((piece=mystrsep(&tp,' '))) {
      if (*piece != '\0') {
        switch(i) {
          case 0: break;
          case 1: achar = *piece; break;
          case 2: if (*piece == 'Y') ff = XPRODUCT; break;
          case 3: {
                   long count = strtol(piece, NULL, 10);
                   if ((count > 0) && ((unsigned long) count <= int_max)) {
                     numents = (int) count;
                     /* calloc: unset strings stay NULL, numconds stays 0 */
                     ptr = calloc((size_t) numents, sizeof(struct affent));
                     if (!ptr) goto fail;
                     ptr->achar = achar;
                     ptr->xpflg = ff;
                     fprintf(stderr,"parsing %c entries %d\n",achar,numents);
                   }
                   break;
                 }
          default: break;
        }
        i++;
      }
      free(piece);
    }

    if (!ptr) {
      /* no usable count: nothing was allocated, so skip this header */
      fprintf(stderr,"Error - invalid affix header: %s\n",line);
      numents = 0;
      achar = '\0';
      continue;
    }

    /* now parse all of the sub entries */
    nptr = ptr;
    for (j=0; j < numents; j++, nptr++) {
      if (!fgets(line,MAX_LN_LEN,afflst)) goto fail;
      mychomp(line);
      tp = line;
      i = 0;
      while ((piece=mystrsep(&tp,' '))) {
        if (*piece != '\0') {
          switch(i) {
            case 0:
              /* every entry carries the flag and cross-product bit of
                 its group header */
              nptr->achar = ptr->achar;
              nptr->xpflg = ptr->xpflg;
              break;
            case 1: break;
            case 2:
              nptr->strip = mystrdup((strcmp(piece,"0") == 0) ? "" : piece);
              if (!nptr->strip) goto fail;
              nptr->stripl = strlen(nptr->strip);
              break;
            case 3:
              nptr->appnd = mystrdup((strcmp(piece,"0") == 0) ? "" : piece);
              if (!nptr->appnd) goto fail;
              nptr->appndl = strlen(nptr->appnd);
              break;
            case 4:
              encodeit(nptr,piece);
              fprintf(stderr, " affix: %s %d, strip: %s %d\n",nptr->appnd,
                      nptr->appndl,nptr->strip,nptr->stripl);
              break;
            default: break;
          }
          i++;
        }
        free(piece);
      }
      if (!nptr->strip || !nptr->appnd) {
        fprintf(stderr,"Error - incomplete affix entry: %s\n",line);
        goto fail;
      }
    }

    /* NOTE(review): numpfx/numsfx are not checked against the capacity of
       ptable/stable, which are declared outside this file - confirm the
       limits and add a bound check there. */
    if (ft == 'P') {
      ptable[numpfx].aep = ptr;
      ptable[numpfx].num = numents;
      fprintf(stderr,"ptable %d num is %d flag %c\n",numpfx,ptable[numpfx].num,ptr->achar);
      numpfx++;
    } else {
      stable[numsfx].aep = ptr;
      stable[numsfx].num = numents;
      fprintf(stderr,"stable %d num is %d flag %c\n",numsfx,stable[numsfx].num,ptr->achar);
      numsfx++;
    }

    /* ownership of the group moved to the table */
    ptr = NULL;
    nptr = NULL;
    numents = 0;
    achar='\0';
  }
  return 0;

fail:
  /* piece is either the token being processed or NULL here */
  free(piece);
  free_affents(ptr, numents);
  return 1;
}
/* Compile the condition string cs into ptr->conds and ptr->numconds.
 *
 * The string is read left to right as a sequence of positions. A position
 * is either a single character, '.' (any character) or a bracket group
 * "[abc]" / "[^abc]" (any listed / any unlisted character). For position n,
 * bit n of ptr->conds[ch] is set for every character value ch accepted at
 * that position. ptr->numconds receives the number of positions; the
 * string "." alone yields zero positions.
 *
 * Group members beyond the capacity of the local member buffer are ignored,
 * and positions whose bit index cannot be shifted in an int set no bits.
 * NOTE(review): positions at or beyond the bit width of a conds cell are
 * still counted in numconds although their bits are lost on store - confirm
 * the intended limit on condition length.
 */
void encodeit(struct affent * ptr, char * cs)
{
  int nc;
  int neg;
  int grp;
  unsigned char c;
  int n;
  int ec;
  int nm;
  int i, j, k;
  int bit;
  unsigned char mbr[MAX_WD_LEN];

  /* now clear the conditions array */
  for (i=0;i<SET_SIZE;i++) ptr->conds[i] = (unsigned char) 0;

  /* now parse the string to create the conds array */
  nc = strlen(cs);
  neg = 0; /* complement indicator */
  grp = 0; /* group indicator */
  n = 0; /* number of conditions */
  ec = 0; /* end condition indicator */
  nm = 0; /* number of member in group */

  if (strcmp(cs,".")==0) {
    ptr->numconds = 0;
    return;
  }

  for (i = 0; i < nc; i++) {
    c = (unsigned char) cs[i];

    /* c is zeroed once a character has been consumed as syntax or as a
       group member; the order of the tests below matters */
    if (c == '[') {
      grp = 1;
      c = 0;
    }
    if ((grp == 1) && (c == '^')) {
      neg = 1;
      c = 0;
    }
    if (c == ']') {
      ec = 1;
      c = 0;
    }
    if ((grp == 1) && (c != 0)) {
      /* collect the member, dropping any that would overflow mbr */
      if (nm < (int) sizeof mbr) {
        mbr[nm] = c;
        nm++;
      }
      c = 0;
    }
    if (c != 0) {
      ec = 1;
    }
    if (!ec) continue;

    /* 1 << n is undefined once n reaches the width of int, so such
       positions contribute no bit */
    bit = (n < (int) (8 * sizeof(int)) - 1) ? (1 << n) : 0;

    if (grp == 1) {
      if (neg == 0) {
        /* plain group: accept exactly the listed members */
        for (j=0;j<nm;j++) {
          k = (unsigned int) mbr[j];
          ptr->conds[k] = ptr->conds[k] | bit;
        }
      } else {
        /* complemented group: accept everything, then remove members */
        for (j=0;j<SET_SIZE;j++) ptr->conds[j] = ptr->conds[j] | bit;
        for (j=0;j<nm;j++) {
          k = (unsigned int) mbr[j];
          ptr->conds[k] = ptr->conds[k] & ~bit;
        }
      }
      neg = 0;
      grp = 0;
      nm = 0;
    } else {
      /* not a group so just set the proper bit for this char */
      /* but first handle special case of . inside condition */
      if (c == '.') {
        /* wild card character so set them all */
        for (j=0;j<SET_SIZE;j++) ptr->conds[j] = ptr->conds[j] | bit;
      } else {
        ptr->conds[(unsigned int) c] = ptr->conds[(unsigned int) c] | bit;
      }
    }
    n++;
    ec = 0;
  }
  ptr->numconds = n;
  return;
}
/* add a prefix to word
 *
 * Tries each of the num entries starting at ep against word (length len).
 * An entry applies when the word is long enough, begins with the entry's
 * strip string, and its leading characters satisfy all of the entry's
 * conditions. For each applicable entry the strip string is removed from
 * the front, the append string is put in its place, and a heap copy of the
 * result is appended to the global wlist with pallow = 0.
 *
 * Results that would not fit in the local buffer are reported on stderr and
 * skipped; results are also dropped once wlist holds MAX_WORDS entries or
 * when the copy cannot be allocated.
 */
void pfx_add (const char * word, int len, struct affent* ep, int num)
{
  struct affent * aent;
  int cond;
  int rest;
  const unsigned char * cp;
  int i;
  char * copy;
  char tword[MAX_WD_LEN];

  for (aent = ep, i = num; i > 0; aent++, i--) {
    /* now make sure all conditions match */
    if (!((len + fullstrip > aent->stripl) && (len >= aent->numconds) &&
          ((aent->stripl == 0) ||
           (strncmp(aent->strip, word, aent->stripl) == 0))))
      continue;

    cp = (const unsigned char *) word;
    for (cond = 0; cond < aent->numconds; cond++) {
      if ((aent->conds[*cp++] & (1 << cond)) == 0)
        break;
    }
    if (cond < aent->numconds) continue;

    /* we have a match so add prefix */
    rest = len - aent->stripl;   /* characters kept from the word */
    if ((rest < 0) ||
        ((size_t) aent->appndl + (size_t) rest >= sizeof tword)) {
      fprintf(stderr,"Error - expanded word too long, skipped: %s\n",word);
      continue;
    }
    if (aent->appndl) {
      memcpy(tword, aent->appnd, (size_t) aent->appndl);
    }
    memcpy(tword + aent->appndl, word + aent->stripl, (size_t) rest);
    tword[aent->appndl + rest] = '\0';

    if (numwords < MAX_WORDS) {
      copy = mystrdup(tword);
      if (copy) {
        wlist[numwords].word = copy;
        wlist[numwords].pallow = 0;
        numwords++;
      }
    }
  }
}
/* add a suffix to a word
 *
 * Tries each of the num entries starting at ep against word (length len).
 * An entry applies when the word is long enough, ends with the entry's
 * strip string, and its trailing characters satisfy all of the entry's
 * conditions (the last condition is tested against the last character).
 * For each applicable entry the strip string is removed from the end, the
 * append string is added, and a heap copy of the result is appended to the
 * global wlist; pallow records the entry's XPRODUCT bit.
 *
 * Results that would not fit in the local buffer are reported on stderr and
 * skipped; results are also dropped once wlist holds MAX_WORDS entries or
 * when the copy cannot be allocated.
 */
void suf_add (const char * word, int len, struct affent * ep, int num)
{
  struct affent * aent;
  int base;
  int cond;
  const unsigned char * cp;
  int i;
  char * copy;
  char tword[MAX_WD_LEN];

  for (aent = ep, i = num; i > 0; aent++, i--) {
    /* if conditions hold on root word
     * then strip off strip string and add suffix
     */
    if (!((len + fullstrip > aent->stripl) && (len >= aent->numconds) &&
          ((aent->stripl == 0) ||
           (strcmp(aent->strip, word + len - aent->stripl) == 0))))
      continue;

    cp = (const unsigned char *) (word + len);
    for (cond = aent->numconds; --cond >= 0; ) {
      if ((aent->conds[*--cp] & (1 << cond)) == 0) break;
    }
    if (cond >= 0) continue;

    /* we have a matching condition */
    base = len - aent->stripl;   /* characters kept from the word */
    if ((base < 0) ||
        ((size_t) base + (size_t) aent->appndl >= sizeof tword)) {
      fprintf(stderr,"Error - expanded word too long, skipped: %s\n",word);
      continue;
    }
    memcpy(tword, word, (size_t) base);
    if (aent->appndl) {
      memcpy(tword + base, aent->appnd, (size_t) aent->appndl);
    }
    tword[base + aent->appndl] = '\0';

    if (numwords < MAX_WORDS) {
      copy = mystrdup(tword);
      if (copy) {
        wlist[numwords].word = copy;
        wlist[numwords].pallow = (aent->xpflg & XPRODUCT);
        numwords++;
      }
    }
  }
}
int expand_rootword(const char * ts, int wl, const char * ap, int al)
{
int i;
int j;
int nh=0;
int nwl;
for (i=0; i < numsfx; i++) {
if (strchr(ap,(stable[i].aep)->achar)) {
suf_add(ts, wl, stable[i].aep, stable[i].num);
}
}
nh = numwords;
if (nh > 1) {
for (j=1;j<nh;j++){
if (wlist[j].pallow) {
for (i=0; i < numpfx; i++) {
if (strchr(ap,(ptable[i].aep)->achar)) {
if ((ptable[i].aep)->xpflg & XPRODUCT) {
nwl = strlen(wlist[j].word);
pfx_add(wlist[j].word, nwl, ptable[i].aep, ptable[i].num);
}
}
}
}
}
}
for (i=0; i < numpfx; i++) {
if (strchr(ap,(ptable[i].aep)->achar)) {
pfx_add(ts, wl, ptable[i].aep, ptable[i].num);
}
}
return 0;
}
/* strip strings into token based on single char delimiter
 * acts like strsep() but only uses a delim char and not
 * a delim string
 *
 * Returns a newly allocated copy of the text from *stringp up to (not
 * including) the next delim, or up to the end of the string when no delim
 * remains, and advances *stringp past what was consumed. The caller frees
 * the result. An empty token is returned for each of two adjacent delims.
 *
 * Returns NULL when stringp or *stringp is NULL, when *stringp points at
 * the end of the string, or when allocation fails. On allocation failure
 * *stringp is still advanced if a delim was found and left unchanged
 * otherwise.
 */
char * mystrsep(char ** stringp, const char delim)
{
  char * start;
  char * stop;
  char * token;
  size_t remaining;
  size_t toklen;

  if (!stringp || !*stringp) return NULL;

  start = *stringp;
  remaining = strlen(start);
  if (remaining == 0) return NULL;

  /* memchr over the string body only, so a '\0' delim never matches */
  stop = memchr(start, (unsigned char) delim, remaining);
  toklen = stop ? (size_t) (stop - start) : remaining;

  token = malloc(toklen + 1);
  if (token) {
    memcpy(token, start, toklen);
    token[toklen] = '\0';
  }

  if (stop) {
    *stringp = stop + 1;
  } else if (token) {
    *stringp = start + remaining;
  }
  return token;
}
/* Return a newly allocated copy of the string s, to be released with
 * free(). Returns NULL when s is NULL or when allocation fails.
 */
char * mystrdup(const char * s)
{
  char * copy;
  size_t size;

  if (!s) return NULL;

  /* size_t keeps the byte count exact for any string length */
  size = strlen(s) + 1;
  copy = malloc(size);
  if (copy) memcpy(copy, s, size);
  return copy;
}
/* Remove one trailing line ending from s in place: either "\n" or "\r\n".
 * A '\r' is removed only when it directly precedes the removed '\n', so
 * text that merely contains a carriage return is left untouched.
 */
void mychomp(char * s)
{
  size_t k = strlen(s);

  if ((k > 0) && (s[k-1] == '\n')) {
    s[--k] = '\0';
    if ((k > 0) && (s[k-1] == '\r')) s[k-1] = '\0';
  }
}