| /* Fast fuzzy searching among messages. |
| Copyright (C) 2006, 2008, 2011, 2013 Free Software Foundation, Inc. |
| Written by Bruno Haible <bruno@clisp.org>, 2006. |
| |
| This program is free software: you can redistribute it and/or modify |
| it under the terms of the GNU General Public License as published by |
| the Free Software Foundation; either version 3 of the License, or |
| (at your option) any later version. |
| |
| This program is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| GNU General Public License for more details. |
| |
| You should have received a copy of the GNU General Public License |
| along with this program. If not, see <https://www.gnu.org/licenses/>. */ |
| |
| #ifdef HAVE_CONFIG_H |
| # include <config.h> |
| #endif |
| |
| /* Specification. */ |
| #include "msgl-fsearch.h" |
| |
| #include <math.h> |
| #include <stdlib.h> |
| |
| #include "xalloc.h" |
| #include "po-charset.h" |
| |
| |
| /* Fuzzy searching of L strings in a large set of N messages (assuming |
| they have all the same small size) takes O(L * N) when using repeated |
| fstrcmp() calls. When for example L = 800 and N = 69000, this is slow. |
| |
| So we preprocess the set of N messages, yielding a data structure |
| that allows to select the similar messages for any given string, and |
| apply fstrcmp() only to the search result. This allows to reduce the |
| time to something between O(L * 1) and O(L * N) - depending on the |
| structure of the strings. In the example with L = 800 and N = 69000, |
| memory use increases by a factor of 2 and the time decreases from |
| 9068 s to 230 s. |
| |
| The data structure is a hash table mapping each n-gram (here with n=4) |
| occurring in the message strings to the list of messages that contains |
| it. For example, if the message list is |
| [0] "close" |
| [1] "losers" |
| the index is a hash table mapping |
| "clos" -> { 0 } |
| "lose" -> { 0, 1 } |
| "oser" -> { 1 } |
| "sers" -> { 1 } |
| Searching the similar messages to, say, "closest", is done by looking up |
| all its 4-grams in the hash table and summing up the results: |
| "clos" -> { 0 } |
| "lose" -> { 0, 1 } |
| "oses" -> {} |
| "sest" -> {} |
| => total: { 2x0, 1x1 } |
| The list is sorted according to decreasing frequency: { 0, 1, ... } |
| and only the first few messages from this frequency list are passed to |
| fstrcmp(). |
| |
| The n-gram similarity and the fstrcmp() similarity are quite different |
| metrics. For example, "close" and "c l o s e" have no n-grams in common |
| (even for n as small as n = 2); however, fstrcmp() will find that they |
| are quite similar (= 0.71). Conversely, "AAAA BBBB ... ZZZZ" and |
| "ZZZZ ... BBBB AAAA" have many 4-grams in common, but their fstrcmp() is |
| only 0.02. Also, repeated n-grams don't have the same effect on the |
| two similarity measures. But all this doesn't matter much in practice. |
| |
| We chose n = 4 because for alphabetic languages, with n = 3 the occurrence |
| lists are likely too long. (Well, with ideographic languages like Chinese, |
| n = 3 might actually be quite good. Anyway, n = 4 is not bad in this case |
| either.) |
| |
| The units are characters in the current encoding. Not just bytes, |
| because 4 consecutive bytes in UTF-8 or GB18030 don't mean much. */ |
| |
| /* Each message is represented by its index in the message list. */ |
| typedef unsigned int index_ty; |
| |
| /* An index list has its allocated size and length tacked at the front. |
| The indices are sorted in ascending order. */ |
| typedef index_ty *index_list_ty; |
| #define IL_ALLOCATED 0 |
| #define IL_LENGTH 1 |
| |
| /* Create a new index list containing only a given index. */ |
| static inline index_list_ty |
| new_index (index_ty idx) |
| { |
| index_ty *list = XNMALLOC (2 + 1, index_ty); |
| list[IL_ALLOCATED] = 1; |
| list[IL_LENGTH] = 1; |
| list[2] = idx; |
| |
| return list; |
| } |
| |
| /* Add a given index, greater or equal than all previous indices, to an |
| index list. |
| Return the new index list, if it had to be reallocated, or NULL if it |
| didn't change. */ |
| static inline index_list_ty |
| addlast_index (index_list_ty list, index_ty idx) |
| { |
| index_list_ty result; |
| size_t length = list[IL_LENGTH]; |
| |
| /* Look whether it should be inserted. */ |
| if (length > 0 && list[2 + (length - 1)] == idx) |
| return NULL; |
| |
| /* Now make room for one more list element. */ |
| result = NULL; |
| if (length == list[IL_ALLOCATED]) |
| { |
| size_t new_allocated = 2 * length - (length >> 6); |
| list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty)); |
| list[IL_ALLOCATED] = new_allocated; |
| result = list; |
| } |
| list[2 + length] = idx; |
| list[IL_LENGTH] = length + 1; |
| return result; |
| } |
| |
| /* Add a given index to an index list. |
| Return the new index list, if it had to be reallocated, or NULL if it |
| didn't change. */ |
| static inline index_list_ty |
| add_index (index_list_ty list, index_ty idx) |
| { |
| index_list_ty result; |
| size_t length = list[IL_LENGTH]; |
| |
| /* Look where it should be inserted. */ |
| size_t lo = 0; |
| size_t hi = length; |
| while (lo < hi) |
| { |
| /* Here we know that idx must be inserted at a position >= lo, <= hi. */ |
| size_t mid = (lo + hi) / 2; /* lo <= mid < hi */ |
| index_ty val = list[2 + mid]; |
| if (val < idx) |
| lo = mid + 1; |
| else if (val > idx) |
| hi = mid; |
| else |
| return NULL; |
| } |
| |
| /* Now make room for one more list element. */ |
| result = NULL; |
| if (length == list[IL_ALLOCATED]) |
| { |
| size_t new_allocated = 2 * length - (length >> 6); |
| list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty)); |
| list[IL_ALLOCATED] = new_allocated; |
| result = list; |
| } |
| list[IL_LENGTH] = length + 1; |
| for (; length > hi; length--) |
| list[2 + length] = list[1 + length]; |
| list[2 + length] = idx; |
| return result; |
| } |
| |
| /* We use 4-grams, therefore strings with less than 4 characters cannot be |
| handled through the 4-grams table and need to be handled specially. |
| Since every character occupies at most 4 bytes (see po-charset.c), |
| this means the size of such short strings is bounded by: */ |
| #define SHORT_STRING_MAX_CHARACTERS (4 - 1) |
| #define SHORT_STRING_MAX_BYTES (SHORT_STRING_MAX_CHARACTERS * 4) |
| |
| /* Such short strings are handled by direct comparison with all messages |
| of appropriate size. Note that for two strings of length 0 <= l1 <= l2, |
| fstrcmp() is <= 2 * l1 / (l1 + l2). Since we are only interested in |
| fstrcmp() values >= FUZZY_THRESHOLD, we can for a given string of length l |
| limit the search to lengths l' in the range |
| l / (2 / FUZZY_THRESHOLD - 1) <= l' <= l * (2 / FUZZY_THRESHOLD - 1) |
| Thus we need the list of the short strings up to length: */ |
| #if !defined __SUNPRO_C |
| # define SHORT_MSG_MAX (int) (SHORT_STRING_MAX_BYTES * (2 / FUZZY_THRESHOLD - 1)) |
| #else |
| /* Sun C on Solaris 8 cannot compute this constant expression. */ |
| # define SHORT_MSG_MAX 28 |
| #endif |
| |
| /* A fuzzy index contains a hash table mapping all n-grams to their |
| occurrences list. */ |
| struct message_fuzzy_index_ty |
| { |
| message_ty **messages; |
| character_iterator_t iterator; |
| hash_table gram4; |
| size_t firstfew; |
| message_list_ty **short_messages; |
| }; |
| |
| /* Allocate a fuzzy index corresponding to a given list of messages. |
| The list of messages and the msgctxt and msgid fields of the messages |
| inside it must not be modified while the returned fuzzy index is in use. */ |
| message_fuzzy_index_ty * |
| message_fuzzy_index_alloc (const message_list_ty *mlp, |
| const char *canon_charset) |
| { |
| message_fuzzy_index_ty *findex = XMALLOC (message_fuzzy_index_ty); |
| size_t count = mlp->nitems; |
| size_t j; |
| size_t l; |
| |
| findex->messages = mlp->item; |
| findex->iterator = po_charset_character_iterator (canon_charset); |
| |
| /* Setup hash table. */ |
| if (hash_init (&findex->gram4, 10 * count) < 0) |
| xalloc_die (); |
| for (j = 0; j < count; j++) |
| { |
| message_ty *mp = mlp->item[j]; |
| |
| if (mp->msgstr != NULL && mp->msgstr[0] != '\0') |
| { |
| const char *str = mp->msgid; |
| |
| /* Let p0 < p1 < p2 < p3 < p4 walk through the string. */ |
| const char *p0 = str; |
| if (*p0 != '\0') |
| { |
| const char *p1 = p0 + findex->iterator (p0); |
| if (*p1 != '\0') |
| { |
| const char *p2 = p1 + findex->iterator (p1); |
| if (*p2 != '\0') |
| { |
| const char *p3 = p2 + findex->iterator (p2); |
| if (*p3 != '\0') |
| { |
| const char *p4 = p3 + findex->iterator (p3); |
| for (;;) |
| { |
| /* The segment from p0 to p4 is a 4-gram of |
| characters. Add a hash table entry that maps |
| it to the index j, or extend the existing |
| hash table entry accordingly. */ |
| void *found; |
| |
| if (hash_find_entry (&findex->gram4, p0, p4 - p0, |
| &found) == 0) |
| { |
| index_list_ty list = (index_list_ty) found; |
| list = addlast_index (list, j); |
| if (list != NULL) |
| hash_set_value (&findex->gram4, p0, p4 - p0, |
| list); |
| } |
| else |
| hash_insert_entry (&findex->gram4, p0, p4 - p0, |
| new_index (j)); |
| |
| /* Advance. */ |
| if (*p4 == '\0') |
| break; |
| p0 = p1; |
| p1 = p2; |
| p2 = p3; |
| p3 = p4; |
| p4 = p4 + findex->iterator (p4); |
| } |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| /* Shrink memory used by the hash table. */ |
| { |
| void *iter; |
| const void *key; |
| size_t keylen; |
| void **valuep; |
| |
| iter = NULL; |
| while (hash_iterate_modify (&findex->gram4, &iter, &key, &keylen, &valuep) |
| == 0) |
| { |
| index_list_ty list = (index_list_ty) *valuep; |
| index_ty length = list[IL_LENGTH]; |
| |
| if (length < list[IL_ALLOCATED]) |
| { |
| list[IL_ALLOCATED] = length; |
| *valuep = xrealloc (list, (2 + length) * sizeof (index_ty)); |
| } |
| } |
| } |
| |
| findex->firstfew = (int) sqrt ((double) count); |
| if (findex->firstfew < 10) |
| findex->firstfew = 10; |
| |
| /* Setup lists of short messages. */ |
| findex->short_messages = XNMALLOC (SHORT_MSG_MAX + 1, message_list_ty *); |
| for (l = 0; l <= SHORT_MSG_MAX; l++) |
| findex->short_messages[l] = message_list_alloc (false); |
| for (j = 0; j < count; j++) |
| { |
| message_ty *mp = mlp->item[j]; |
| |
| if (mp->msgstr != NULL && mp->msgstr[0] != '\0') |
| { |
| const char *str = mp->msgid; |
| size_t len = strlen (str); |
| |
| if (len <= SHORT_MSG_MAX) |
| message_list_append (findex->short_messages[len], mp); |
| } |
| } |
| |
| /* Shrink memory used by the lists of short messages. */ |
| for (l = 0; l <= SHORT_MSG_MAX; l++) |
| { |
| message_list_ty *mlp = findex->short_messages[l]; |
| |
| if (mlp->nitems < mlp->nitems_max) |
| { |
| mlp->nitems_max = mlp->nitems; |
| mlp->item = |
| (message_ty **) |
| xrealloc (mlp->item, mlp->nitems_max * sizeof (message_ty *)); |
| } |
| } |
| |
| return findex; |
| } |
| |
| /* An index with multiplicity. */ |
| struct mult_index |
| { |
| index_ty index; |
| unsigned int count; |
| }; |
| |
| /* A list of indices with multiplicity. |
| The indices are sorted in ascending order. */ |
| struct mult_index_list |
| { |
| struct mult_index *item; |
| size_t nitems; |
| size_t nitems_max; |
| /* Work area. */ |
| struct mult_index *item2; |
| size_t nitems2_max; |
| }; |
| |
| /* Initialize an empty list of indices with multiplicity. */ |
| static inline void |
| mult_index_list_init (struct mult_index_list *accu) |
| { |
| accu->nitems = 0; |
| accu->nitems_max = 0; |
| accu->item = NULL; |
| accu->nitems2_max = 0; |
| accu->item2 = NULL; |
| } |
| |
| /* Add an index list to a list of indices with multiplicity. */ |
| static inline void |
| mult_index_list_accumulate (struct mult_index_list *accu, index_list_ty list) |
| { |
| size_t len1 = accu->nitems; |
| size_t len2 = list[IL_LENGTH]; |
| size_t need = len1 + len2; |
| struct mult_index *ptr1; |
| struct mult_index *ptr1_end; |
| index_ty *ptr2; |
| index_ty *ptr2_end; |
| struct mult_index *destptr; |
| |
| /* Make the work area large enough. */ |
| if (accu->nitems2_max < need) |
| { |
| size_t new_max = 2 * accu->nitems2_max + 1; |
| |
| if (new_max < need) |
| new_max = need; |
| if (accu->item2 != NULL) |
| free (accu->item2); |
| accu->item2 = XNMALLOC (new_max, struct mult_index); |
| accu->nitems2_max = new_max; |
| } |
| |
| /* Make a linear pass through accu and list simultaneously. */ |
| ptr1 = accu->item; |
| ptr1_end = ptr1 + len1; |
| ptr2 = list + 2; |
| ptr2_end = ptr2 + len2; |
| destptr = accu->item2; |
| while (ptr1 < ptr1_end && ptr2 < ptr2_end) |
| { |
| if (ptr1->index < *ptr2) |
| { |
| *destptr = *ptr1; |
| ptr1++; |
| } |
| else if (ptr1->index > *ptr2) |
| { |
| destptr->index = *ptr2; |
| destptr->count = 1; |
| ptr2++; |
| } |
| else /* ptr1->index == list[2 + i2] */ |
| { |
| destptr->index = ptr1->index; |
| destptr->count = ptr1->count + 1; |
| ptr1++; |
| ptr2++; |
| } |
| destptr++; |
| } |
| while (ptr1 < ptr1_end) |
| { |
| *destptr = *ptr1; |
| ptr1++; |
| destptr++; |
| } |
| while (ptr2 < ptr2_end) |
| { |
| destptr->index = *ptr2; |
| destptr->count = 1; |
| ptr2++; |
| destptr++; |
| } |
| |
| /* Swap accu->item and accu->item2. */ |
| { |
| struct mult_index *dest = accu->item2; |
| size_t dest_max = accu->nitems2_max; |
| |
| accu->item2 = accu->item; |
| accu->nitems2_max = accu->nitems_max; |
| |
| accu->item = dest; |
| accu->nitems = destptr - dest; |
| accu->nitems_max = dest_max; |
| } |
| } |
| |
| /* Compares two indices with multiplicity, according to their multiplicity. */ |
| static int |
| mult_index_compare (const void *p1, const void *p2) |
| { |
| const struct mult_index *ptr1 = (const struct mult_index *) p1; |
| const struct mult_index *ptr2 = (const struct mult_index *) p2; |
| |
| if (ptr1->count < ptr2->count) |
| return 1; |
| if (ptr1->count > ptr2->count) |
| return -1; |
| /* For reproduceable results, also take into account the indices. */ |
| if (ptr1->index > ptr2->index) |
| return 1; |
| if (ptr1->index < ptr2->index) |
| return -1; |
| return 0; |
| } |
| |
| /* Sort a list of indices with multiplicity according to decreasing |
| multiplicity. */ |
| static inline void |
| mult_index_list_sort (struct mult_index_list *accu) |
| { |
| if (accu->nitems > 1) |
| qsort (accu->item, accu->nitems, sizeof (struct mult_index), |
| mult_index_compare); |
| } |
| |
| /* Frees a list of indices with multiplicity. */ |
| static inline void |
| mult_index_list_free (struct mult_index_list *accu) |
| { |
| if (accu->item != NULL) |
| free (accu->item); |
| if (accu->item2 != NULL) |
| free (accu->item2); |
| } |
| |
| /* Find a good match for the given msgctxt and msgid in the given fuzzy index. |
| The match does not need to be optimal. |
| Ignore matches for which the fuzzy_search_goal_function is < LOWER_BOUND. |
| LOWER_BOUND must be >= FUZZY_THRESHOLD. |
| If HEURISTIC is true, only the few best messages among the list - according |
| to a certain heuristic - are considered. If HEURISTIC is false, all |
| messages with a fuzzy_search_goal_function > FUZZY_THRESHOLD are considered, |
| like in message_list_search_fuzzy (except that in ambiguous cases where |
| several best matches exist, message_list_search_fuzzy chooses the one with |
| the smallest index whereas message_fuzzy_index_search makes a better |
| choice). */ |
| message_ty * |
| message_fuzzy_index_search (message_fuzzy_index_ty *findex, |
| const char *msgctxt, const char *msgid, |
| double lower_bound, |
| bool heuristic) |
| { |
| const char *str = msgid; |
| |
| /* Let p0 < p1 < p2 < p3 < p4 walk through the string. */ |
| const char *p0 = str; |
| if (*p0 != '\0') |
| { |
| const char *p1 = p0 + findex->iterator (p0); |
| if (*p1 != '\0') |
| { |
| const char *p2 = p1 + findex->iterator (p1); |
| if (*p2 != '\0') |
| { |
| const char *p3 = p2 + findex->iterator (p2); |
| if (*p3 != '\0') |
| { |
| const char *p4 = p3 + findex->iterator (p3); |
| struct mult_index_list accu; |
| |
| mult_index_list_init (&accu); |
| for (;;) |
| { |
| /* The segment from p0 to p4 is a 4-gram of |
| characters. Get the hash table entry containing |
| a list of indices, and add it to the accu. */ |
| void *found; |
| |
| if (hash_find_entry (&findex->gram4, p0, p4 - p0, |
| &found) == 0) |
| { |
| index_list_ty list = (index_list_ty) found; |
| mult_index_list_accumulate (&accu, list); |
| } |
| |
| /* Advance. */ |
| if (*p4 == '\0') |
| break; |
| p0 = p1; |
| p1 = p2; |
| p2 = p3; |
| p3 = p4; |
| p4 = p4 + findex->iterator (p4); |
| } |
| |
| /* Sort in decreasing count order. */ |
| mult_index_list_sort (&accu); |
| |
| /* Iterate over this sorted list, and maximize the |
| fuzzy_search_goal_function() result. |
| If HEURISTIC is true, take only the first few messages. |
| If HEURISTIC is false, consider all messages - to match |
| the behaviour of message_list_search_fuzzy -, but process |
| them in the order of the sorted list. This increases |
| the chances that the later calls to fstrcmp_bounded() (via |
| fuzzy_search_goal_function()) terminate quickly, thanks |
| to the best_weight which will be quite high already after |
| the first few messages. */ |
| { |
| size_t count; |
| struct mult_index *ptr; |
| message_ty *best_mp; |
| double best_weight; |
| |
| count = accu.nitems; |
| if (heuristic) |
| { |
| if (count > findex->firstfew) |
| count = findex->firstfew; |
| } |
| |
| best_weight = lower_bound; |
| best_mp = NULL; |
| for (ptr = accu.item; count > 0; ptr++, count--) |
| { |
| message_ty *mp = findex->messages[ptr->index]; |
| double weight = |
| fuzzy_search_goal_function (mp, msgctxt, msgid, |
| best_weight); |
| |
| if (weight > best_weight) |
| { |
| best_weight = weight; |
| best_mp = mp; |
| } |
| } |
| |
| mult_index_list_free (&accu); |
| |
| return best_mp; |
| } |
| } |
| } |
| } |
| } |
| |
| /* The string had less than 4 characters. */ |
| { |
| size_t l = strlen (str); |
| size_t lmin, lmax; |
| message_ty *best_mp; |
| double best_weight; |
| |
| if (!(l <= SHORT_STRING_MAX_BYTES)) |
| abort (); |
| |
| /* Walk through those short messages which have an appropriate length. |
| See the comment before SHORT_MSG_MAX. */ |
| lmin = (int) ceil (l / (2 / FUZZY_THRESHOLD - 1)); |
| lmax = (int) (l * (2 / FUZZY_THRESHOLD - 1)); |
| if (!(lmax <= SHORT_MSG_MAX)) |
| abort (); |
| |
| best_weight = lower_bound; |
| best_mp = NULL; |
| for (l = lmin; l <= lmax; l++) |
| { |
| message_list_ty *mlp = findex->short_messages[l]; |
| size_t j; |
| |
| for (j = 0; j < mlp->nitems; j++) |
| { |
| message_ty *mp = mlp->item[j]; |
| double weight = |
| fuzzy_search_goal_function (mp, msgctxt, msgid, best_weight); |
| |
| if (weight > best_weight) |
| { |
| best_weight = weight; |
| best_mp = mp; |
| } |
| } |
| } |
| |
| return best_mp; |
| } |
| } |
| |
| /* Free a fuzzy index. */ |
| void |
| message_fuzzy_index_free (message_fuzzy_index_ty *findex) |
| { |
| size_t l; |
| void *iter; |
| const void *key; |
| size_t keylen; |
| void *data; |
| |
| /* Free the short lists. */ |
| for (l = 0; l <= SHORT_MSG_MAX; l++) |
| message_list_free (findex->short_messages[l], 1); |
| free (findex->short_messages); |
| |
| /* Free the index lists occurring as values in the hash tables. */ |
| iter = NULL; |
| while (hash_iterate (&findex->gram4, &iter, &key, &keylen, &data) == 0) |
| free ((index_list_ty *) data); |
| /* Free the hash table itself. */ |
| hash_destroy (&findex->gram4); |
| |
| free (findex); |
| } |