| #include <cstdlib> |
| #include <cstring> |
| #include <cstdio> |
| #include <ctype.h> |
| |
| #include "../hunspell/csutil.hxx" |
| #include "textparser.hxx" |
| |
| #ifndef W32 |
| using namespace std; |
| #endif |
| |
// ISO-8859-1 HTML character entities.
//
// Each entry is the literal entity text ("&name;") as it appears in the
// input. get_latin1() only considers text that starts with '&' and compares
// it against these strings by prefix, so the table must contain the entity
// spellings themselves, not the characters they stand for.

static const char * LATIN1[] = {
    "&Agrave;",
    "&Atilde;",
    "&Aring;",
    "&AElig;",
    "&Egrave;",
    "&Ecirc;",
    "&Igrave;",
    "&Iuml;",
    "&ETH;",
    "&Ntilde;",
    "&Ograve;",
    "&Oslash;",
    "&Ugrave;",
    "&THORN;",
    "&agrave;",
    "&atilde;",
    "&aring;",
    "&aelig;",
    "&egrave;",
    "&ecirc;",
    "&igrave;",
    "&iuml;",
    "&eth;",
    "&ntilde;",
    "&ograve;",
    "&oslash;",
    "&ugrave;",
    "&thorn;",
    "&yuml;"
};

// Number of entries in LATIN1.
#define LATIN1_LEN (sizeof(LATIN1) / sizeof(char *))
| |
| TextParser::TextParser() { |
| init((char *) NULL); |
| } |
| |
| TextParser::TextParser(const char * wordchars) |
| { |
| init(wordchars); |
| } |
| |
| TextParser::TextParser(unsigned short * wordchars, int len, int text_len) |
| { |
| // NOTE: Dynamically allocate space according to text_len |
| // to avoid segfault for too long input. |
| for (int i = 0; i < MAXPREVLINE; i++) { |
| line[i] = (char*)malloc(text_len + 1); |
| } |
| urlline = (char*)malloc(text_len + 1); |
| |
| init(wordchars, len); |
| } |
| |
| TextParser::~TextParser() |
| { |
| // NOTE: Free dynamically allocated space. |
| for (int i = 0; i < MAXPREVLINE; i++) { |
| free(line[i]); |
| } |
| free(urlline); |
| } |
| |
| int TextParser::is_wordchar(char * w) |
| { |
| if (*w == '\0') return 0; |
| if (utf8) { |
| w_char wc; |
| unsigned short idx; |
| u8_u16(&wc, 1, w); |
| idx = (wc.h << 8) + wc.l; |
| return (unicodeisalpha(idx) || (wordchars_utf16 && flag_bsearch(wordchars_utf16, *((unsigned short *) &wc), wclen))); |
| } else { |
| return wordcharacters[(*w + 256) % 256]; |
| } |
| } |
| |
| const char * TextParser::get_latin1(char * s) |
| { |
| if (s[0] == '&') { |
| unsigned int i = 0; |
| while ((i < LATIN1_LEN) && |
| strncmp(LATIN1[i], s, strlen(LATIN1[i]))) i++; |
| if (i != LATIN1_LEN) return LATIN1[i]; |
| } |
| return NULL; |
| } |
| |
| void TextParser::init(const char * wordchars) |
| { |
| for (int i = 0; i < MAXPREVLINE; i++) { |
| line[i][0] = '\0'; |
| } |
| actual = 0; |
| head = 0; |
| token = 0; |
| state = 0; |
| utf8 = 0; |
| checkurl = 0; |
| unsigned int j; |
| for (j = 0; j < 256; j++) { |
| wordcharacters[j] = 0; |
| } |
| if (!wordchars) wordchars = "qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM"; |
| for (j = 0; j < strlen(wordchars); j++) { |
| wordcharacters[(wordchars[j] + 256) % 256] = 1; |
| } |
| } |
| |
| void TextParser::init(unsigned short * wc, int len) |
| { |
| for (int i = 0; i < MAXPREVLINE; i++) { |
| line[i][0] = '\0'; |
| } |
| actual = 0; |
| head = 0; |
| token = 0; |
| state = 0; |
| utf8 = 1; |
| checkurl = 0; |
| wordchars_utf16 = wc; |
| wclen = len; |
| } |
| |
| int TextParser::next_char(char * line, int * pos) { |
| if (*(line + *pos) == '\0') return 1; |
| if (utf8) { |
| if (*(line + *pos) >> 7) { |
| // jump to next UTF-8 character |
| for((*pos)++; (*(line + *pos) & 0xc0) == 0x80; (*pos)++); |
| } else { |
| (*pos)++; |
| } |
| } else (*pos)++; |
| return 0; |
| } |
| |
| void TextParser::put_line(char * word) |
| { |
| actual = (actual + 1) % MAXPREVLINE; |
| strcpy(line[actual], word); |
| token = 0; |
| head = 0; |
| check_urls(); |
| } |
| |
| char * TextParser::get_prevline(int n) |
| { |
| return mystrdup(line[(actual + MAXPREVLINE - n) % MAXPREVLINE]); |
| } |
| |
| char * TextParser::get_line() |
| { |
| return get_prevline(0); |
| } |
| |
| char * TextParser::next_token() |
| { |
| const char * latin1; |
| |
| for (;;) { |
| switch (state) |
| { |
| case 0: // non word chars |
| if (is_wordchar(line[actual] + head)) { |
| state = 1; |
| token = head; |
| } else if ((latin1 = get_latin1(line[actual] + head))) { |
| state = 1; |
| token = head; |
| head += strlen(latin1); |
| } |
| break; |
| case 1: // wordchar |
| if ((latin1 = get_latin1(line[actual] + head))) { |
| head += strlen(latin1); |
| } else if (! is_wordchar(line[actual] + head)) { |
| state = 0; |
| char * t = alloc_token(token, &head); |
| if (t) return t; |
| } |
| break; |
| } |
| if (next_char(line[actual], &head)) return NULL; |
| } |
| } |
| |
| int TextParser::get_tokenpos() |
| { |
| return token; |
| } |
| |
| int TextParser::change_token(const char * word) |
| { |
| if (word) { |
| char * r = mystrdup(line[actual] + head); |
| strcpy(line[actual] + token, word); |
| strcat(line[actual], r); |
| head = token; |
| free(r); |
| return 1; |
| } |
| return 0; |
| } |
| |
| void TextParser::check_urls() |
| { |
| int url_state = 0; |
| int url_head = 0; |
| int url_token = 0; |
| int url = 0; |
| for (;;) { |
| switch (url_state) |
| { |
| case 0: // non word chars |
| if (is_wordchar(line[actual] + url_head)) { |
| url_state = 1; |
| url_token = url_head; |
| // Unix path |
| } else if (*(line[actual] + url_head) == '/') { |
| url_state = 1; |
| url_token = url_head; |
| url = 1; |
| } |
| break; |
| case 1: // wordchar |
| char ch = *(line[actual] + url_head); |
| // e-mail address |
| if ((ch == '@') || |
| // MS-DOS, Windows path |
| (strncmp(line[actual] + url_head, ":\\", 2) == 0) || |
| // URL |
| (strncmp(line[actual] + url_head, "://", 3) == 0)) { |
| url = 1; |
| } else if (! (is_wordchar(line[actual] + url_head) || |
| (ch == '-') || (ch == '_') || (ch == '\\') || |
| (ch == '.') || (ch == ':') || (ch == '/') || |
| (ch == '~') || (ch == '%') || (ch == '*') || |
| (ch == '$') || (ch == '[') || (ch == ']') || |
| (ch == '?') || (ch == '!') || |
| ((ch >= '0') && (ch <= '9')))) { |
| url_state = 0; |
| if (url == 1) { |
| for (int i = url_token; i < url_head; i++) { |
| *(urlline + i) = 1; |
| } |
| } |
| url = 0; |
| } |
| break; |
| } |
| *(urlline + url_head) = 0; |
| if (next_char(line[actual], &url_head)) return; |
| } |
| } |
| |
| int TextParser::get_url(int token_pos, int * head) |
| { |
| for (int i = *head; urlline[i] && *(line[actual]+i); i++, (*head)++); |
| return checkurl ? 0 : urlline[token_pos]; |
| } |
| |
| void TextParser::set_url_checking(int check) |
| { |
| checkurl = check; |
| } |
| |
| |
| char * TextParser::alloc_token(int token, int * head) |
| { |
| if (get_url(token, head)) return NULL; |
| char * t = (char *) malloc(*head - token + 1); |
| if (t) { |
| t[*head - token] = '\0'; |
| strncpy(t, line[actual] + token, *head - token); |
| // remove colon for Finnish and Swedish language |
| if (t[*head - token - 1] == ':') { |
| t[*head - token - 1] = '\0'; |
| if (!t[0]) { |
| free(t); |
| return NULL; |
| } |
| } |
| return t; |
| } |
| fprintf(stderr,"Error - Insufficient Memory\n"); |
| return NULL; |
| } |