1 /* Fast fuzzy searching among messages.
2 Copyright (C) 2006 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2006.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
8 any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
18
19 #ifdef HAVE_CONFIG_H
20 # include <config.h>
21 #endif
22
23 /* Specification. */
24 #include "msgl-fsearch.h"
25
26 #include <math.h>
27 #include <stdlib.h>
28
29 #include "xalloc.h"
30 #include "po-charset.h"
31
32
33 /* Fuzzy searching of L strings in a large set of N messages (assuming
34 they have all the same small size) takes O(L * N) when using repeated
35 fstrcmp() calls. When for example L = 800 and N = 69000, this is slow.
36
37 So we preprocess the set of N messages, yielding a data structure
38 that allows to select the similar messages for any given string, and
39 apply fstrcmp() only to the search result. This allows to reduce the
40 time to something between O(L * 1) and O(L * N) - depending on the
41 structure of the strings. In the example with L = 800 and N = 69000,
42 memory use increases by a factor of 2 and the time decreases from
43 9068 s to 230 s.
44
45 The data structure is a hash table mapping each n-gram (here with n=4)
46 occurring in the message strings to the list of messages that contains
47 it. For example, if the message list is
48 [0] "close"
49 [1] "losers"
50 the index is a hash table mapping
51 "clos" -> { 0 }
52 "lose" -> { 0, 1 }
53 "oser" -> { 1 }
54 "sers" -> { 1 }
55 Searching the similar messages to, say, "closest", is done by looking up
56 all its 4-grams in the hash table and summing up the results:
57 "clos" -> { 0 }
58 "lose" -> { 0, 1 }
59 "oses" -> {}
60 "sest" -> {}
61 => total: { 2x0, 1x1 }
62 The list is sorted according to decreasing frequency: { 0, 1, ... }
63 and only the first few messages from this frequency list are passed to
64 fstrcmp().
65
66 The n-gram similarity and the fstrcmp() similarity are quite different
67 metrics. For example, "close" and "c l o s e" have no n-grams in common
68 (even for n as small as n = 2); however, fstrcmp() will find that they
69 are quite similar (= 0.71). Conversely, "AAAA BBBB ... ZZZZ" and
70 "ZZZZ ... BBBB AAAA" have many 4-grams in common, but their fstrcmp() is
71 only 0.02. Also, repeated n-grams don't have the same effect on the
72 two similarity measures. But all this doesn't matter much in practice.
73
74 We chose n = 4 because for alphabetic languages, with n = 3 the occurrence
75 lists are likely too long. (Well, with ideographic languages like Chinese,
76 n = 3 might actually be quite good. Anyway, n = 4 is not bad in this case
77 either.)
78
79 The units are characters in the current encoding. Not just bytes,
80 because 4 consecutive bytes in UTF-8 or GB18030 don't mean much. */
81
82 /* Each message is represented by its index in the message list. */
83 typedef unsigned int index_ty;
84
85 /* An index list has its allocated size and length tacked at the front.
86 The indices are sorted in ascending order. */
87 typedef index_ty *index_list_ty;
88 #define IL_ALLOCATED 0
89 #define IL_LENGTH 1
90
91 /* Create a new index list containing only a given index. */
92 static inline index_list_ty
new_index(index_ty idx)93 new_index (index_ty idx)
94 {
95 index_ty *list = (index_ty *) xmalloc ((2 + 1) * sizeof (index_ty));
96 list[IL_ALLOCATED] = 1;
97 list[IL_LENGTH] = 1;
98 list[2] = idx;
99
100 return list;
101 }
102
103 /* Add a given index, greater or equal than all previous indices, to an
104 index list.
105 Return the new index list, if it had to be reallocated, or NULL if it
106 didn't change. */
107 static inline index_list_ty
addlast_index(index_list_ty list,index_ty idx)108 addlast_index (index_list_ty list, index_ty idx)
109 {
110 index_list_ty result;
111 size_t length = list[IL_LENGTH];
112
113 /* Look whether it should be inserted. */
114 if (length > 0 && list[2 + (length - 1)] == idx)
115 return NULL;
116
117 /* Now make room for one more list element. */
118 result = NULL;
119 if (length == list[IL_ALLOCATED])
120 {
121 size_t new_allocated = 2 * length - (length >> 6);
122 list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty));
123 list[IL_ALLOCATED] = new_allocated;
124 result = list;
125 }
126 list[2 + length] = idx;
127 list[IL_LENGTH] = length + 1;
128 return result;
129 }
130
131 /* Add a given index to an index list.
132 Return the new index list, if it had to be reallocated, or NULL if it
133 didn't change. */
134 static inline index_list_ty
add_index(index_list_ty list,index_ty idx)135 add_index (index_list_ty list, index_ty idx)
136 {
137 index_list_ty result;
138 size_t length = list[IL_LENGTH];
139
140 /* Look where it should be inserted. */
141 size_t lo = 0;
142 size_t hi = length;
143 while (lo < hi)
144 {
145 /* Here we know that idx must be inserted at a position >= lo, <= hi. */
146 size_t mid = (lo + hi) / 2; /* lo <= mid < hi */
147 index_ty val = list[2 + mid];
148 if (val < idx)
149 lo = mid + 1;
150 else if (val > idx)
151 hi = mid;
152 else
153 return NULL;
154 }
155
156 /* Now make room for one more list element. */
157 result = NULL;
158 if (length == list[IL_ALLOCATED])
159 {
160 size_t new_allocated = 2 * length - (length >> 6);
161 list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty));
162 list[IL_ALLOCATED] = new_allocated;
163 result = list;
164 }
165 list[IL_LENGTH] = length + 1;
166 for (; length > hi; length--)
167 list[2 + length] = list[1 + length];
168 list[2 + length] = idx;
169 return result;
170 }
171
172 /* We use 4-grams, therefore strings with less than 4 characters cannot be
173 handled through the 4-grams table and need to be handled specially.
174 Since every character occupies at most 4 bytes (see po-charset.c),
175 this means the size of such short strings is bounded by: */
176 #define SHORT_STRING_MAX_CHARACTERS (4 - 1)
177 #define SHORT_STRING_MAX_BYTES (SHORT_STRING_MAX_CHARACTERS * 4)
178
179 /* Such short strings are handled by direct comparison with all messages
180 of appropriate size. Note that for two strings of length 0 <= l1 <= l2,
181 fstrcmp() is <= 2 * l1 / (l1 + l2). Since we are only interested in
182 fstrcmp() values >= FUZZY_THRESHOLD, we can for a given string of length l
183 limit the search to lengths l' in the range
184 l / (2 / FUZZY_THRESHOLD - 1) <= l' <= l * (2 / FUZZY_THRESHOLD - 1)
185 Thus we need the list of the short strings up to length: */
186 #define SHORT_MSG_MAX (int) (SHORT_STRING_MAX_BYTES * (2 * FUZZY_THRESHOLD_INV - 1))
187
188 /* A fuzzy index contains a hash table mapping all n-grams to their
189 occurrences list. */
190 struct message_fuzzy_index_ty
191 {
192 message_ty **messages;
193 character_iterator_t iterator;
194 hash_table gram4;
195 size_t firstfew;
196 message_list_ty *short_messages[SHORT_MSG_MAX + 1];
197 };
198
199 /* Allocate a fuzzy index corresponding to a given list of messages.
200 The list of messages and the msgctxt and msgid fields of the messages
201 inside it must not be modified while the returned fuzzy index is in use. */
202 message_fuzzy_index_ty *
message_fuzzy_index_alloc(const message_list_ty * mlp,const char * canon_charset)203 message_fuzzy_index_alloc (const message_list_ty *mlp,
204 const char *canon_charset)
205 {
206 message_fuzzy_index_ty *findex =
207 (message_fuzzy_index_ty *) xmalloc (sizeof (message_fuzzy_index_ty));
208 size_t count = mlp->nitems;
209 size_t j;
210 size_t l;
211
212 findex->messages = mlp->item;
213 findex->iterator = po_charset_character_iterator (canon_charset);
214
215 /* Setup hash table. */
216 if (hash_init (&findex->gram4, 10 * count) < 0)
217 xalloc_die ();
218 for (j = 0; j < count; j++)
219 {
220 message_ty *mp = mlp->item[j];
221
222 if (mp->msgstr != NULL && mp->msgstr[0] != '\0')
223 {
224 const char *str = mp->msgid;
225
226 /* Let p0 < p1 < p2 < p3 < p4 walk through the string. */
227 const char *p0 = str;
228 if (*p0 != '\0')
229 {
230 const char *p1 = p0 + findex->iterator (p0);
231 if (*p1 != '\0')
232 {
233 const char *p2 = p1 + findex->iterator (p1);
234 if (*p2 != '\0')
235 {
236 const char *p3 = p2 + findex->iterator (p2);
237 if (*p3 != '\0')
238 {
239 const char *p4 = p3 + findex->iterator (p3);
240 for (;;)
241 {
242 /* The segment from p0 to p4 is a 4-gram of
243 characters. Add a hash table entry that maps
244 it to the index j, or extend the existing
245 hash table entry accordingly. */
246 void *found;
247
248 if (hash_find_entry (&findex->gram4, p0, p4 - p0,
249 &found) == 0)
250 {
251 index_list_ty list = (index_list_ty) found;
252 list = addlast_index (list, j);
253 if (list != NULL)
254 hash_set_value (&findex->gram4, p0, p4 - p0,
255 list);
256 }
257 else
258 hash_insert_entry (&findex->gram4, p0, p4 - p0,
259 new_index (j));
260
261 /* Advance. */
262 if (*p4 == '\0')
263 break;
264 p0 = p1;
265 p1 = p2;
266 p2 = p3;
267 p3 = p4;
268 p4 = p4 + findex->iterator (p4);
269 }
270 }
271 }
272 }
273 }
274 }
275 }
276
277 /* Shrink memory used by the hash table. */
278 {
279 void *iter;
280 const void *key;
281 size_t keylen;
282 void **valuep;
283
284 iter = NULL;
285 while (hash_iterate_modify (&findex->gram4, &iter, &key, &keylen, &valuep)
286 == 0)
287 {
288 index_list_ty list = (index_list_ty) *valuep;
289 index_ty length = list[IL_LENGTH];
290
291 if (length < list[IL_ALLOCATED])
292 {
293 list[IL_ALLOCATED] = length;
294 *valuep = xrealloc (list, (2 + length) * sizeof (index_ty));
295 }
296 }
297 }
298
299 findex->firstfew = (int) sqrt ((double) count);
300 if (findex->firstfew < 10)
301 findex->firstfew = 10;
302
303 /* Setup lists of short messages. */
304 for (l = 0; l <= SHORT_MSG_MAX; l++)
305 findex->short_messages[l] = message_list_alloc (false);
306 for (j = 0; j < count; j++)
307 {
308 message_ty *mp = mlp->item[j];
309
310 if (mp->msgstr != NULL && mp->msgstr[0] != '\0')
311 {
312 const char *str = mp->msgid;
313 size_t len = strlen (str);
314
315 if (len <= SHORT_MSG_MAX)
316 message_list_append (findex->short_messages[len], mp);
317 }
318 }
319
320 /* Shrink memory used by the lists of short messages. */
321 for (l = 0; l <= SHORT_MSG_MAX; l++)
322 {
323 message_list_ty *mlp = findex->short_messages[l];
324
325 if (mlp->nitems < mlp->nitems_max)
326 {
327 mlp->nitems_max = mlp->nitems;
328 mlp->item =
329 (message_ty **)
330 xrealloc (mlp->item, mlp->nitems_max * sizeof (message_ty *));
331 }
332 }
333
334 return findex;
335 }
336
337 /* An index with multiplicity. */
338 struct mult_index
339 {
340 index_ty index;
341 unsigned int count;
342 };
343
344 /* A list of indices with multiplicity.
345 The indices are sorted in ascending order. */
346 struct mult_index_list
347 {
348 struct mult_index *item;
349 size_t nitems;
350 size_t nitems_max;
351 /* Work area. */
352 struct mult_index *item2;
353 size_t nitems2_max;
354 };
355
356 /* Initialize an empty list of indices with multiplicity. */
357 static inline void
mult_index_list_init(struct mult_index_list * accu)358 mult_index_list_init (struct mult_index_list *accu)
359 {
360 accu->nitems = 0;
361 accu->nitems_max = 0;
362 accu->item = NULL;
363 accu->nitems2_max = 0;
364 accu->item2 = NULL;
365 }
366
367 /* Add an index list to a list of indices with multiplicity. */
368 static inline void
mult_index_list_accumulate(struct mult_index_list * accu,index_list_ty list)369 mult_index_list_accumulate (struct mult_index_list *accu, index_list_ty list)
370 {
371 size_t len1 = accu->nitems;
372 size_t len2 = list[IL_LENGTH];
373 size_t need = len1 + len2;
374 struct mult_index *ptr1;
375 struct mult_index *ptr1_end;
376 index_ty *ptr2;
377 index_ty *ptr2_end;
378 struct mult_index *destptr;
379
380 /* Make the work area large enough. */
381 if (accu->nitems2_max < need)
382 {
383 size_t new_max = 2 * accu->nitems2_max + 1;
384
385 if (new_max < need)
386 new_max = need;
387 if (accu->item2 != NULL)
388 free (accu->item2);
389 accu->item2 =
390 (struct mult_index *) xmalloc (new_max * sizeof (struct mult_index));
391 accu->nitems2_max = new_max;
392 }
393
394 /* Make a linear pass through accu and list simultaneously. */
395 ptr1 = accu->item;
396 ptr1_end = ptr1 + len1;
397 ptr2 = list + 2;
398 ptr2_end = ptr2 + len2;
399 destptr = accu->item2;
400 while (ptr1 < ptr1_end && ptr2 < ptr2_end)
401 {
402 if (ptr1->index < *ptr2)
403 {
404 *destptr = *ptr1;
405 ptr1++;
406 }
407 else if (ptr1->index > *ptr2)
408 {
409 destptr->index = *ptr2;
410 destptr->count = 1;
411 ptr2++;
412 }
413 else /* ptr1->index == list[2 + i2] */
414 {
415 destptr->index = ptr1->index;
416 destptr->count = ptr1->count + 1;
417 ptr1++;
418 ptr2++;
419 }
420 destptr++;
421 }
422 while (ptr1 < ptr1_end)
423 {
424 *destptr = *ptr1;
425 ptr1++;
426 destptr++;
427 }
428 while (ptr2 < ptr2_end)
429 {
430 destptr->index = *ptr2;
431 destptr->count = 1;
432 ptr2++;
433 destptr++;
434 }
435
436 /* Swap accu->item and accu->item2. */
437 {
438 struct mult_index *dest = accu->item2;
439 size_t dest_max = accu->nitems2_max;
440
441 accu->item2 = accu->item;
442 accu->nitems2_max = accu->nitems_max;
443
444 accu->item = dest;
445 accu->nitems = destptr - dest;
446 accu->nitems_max = dest_max;
447 }
448 }
449
450 /* Compares two indices with multiplicity, according to their multiplicity. */
451 static int
mult_index_compare(const void * p1,const void * p2)452 mult_index_compare (const void *p1, const void *p2)
453 {
454 const struct mult_index *ptr1 = (const struct mult_index *) p1;
455 const struct mult_index *ptr2 = (const struct mult_index *) p2;
456
457 if (ptr1->count < ptr2->count)
458 return 1;
459 if (ptr1->count > ptr2->count)
460 return -1;
461 /* For reproduceable results, also take into account the indices. */
462 if (ptr1->index > ptr2->index)
463 return 1;
464 if (ptr1->index < ptr2->index)
465 return -1;
466 return 0;
467 }
468
469 /* Sort a list of indices with multiplicity according to decreasing
470 multiplicity. */
471 static inline void
mult_index_list_sort(struct mult_index_list * accu)472 mult_index_list_sort (struct mult_index_list *accu)
473 {
474 if (accu->nitems > 1)
475 qsort (accu->item, accu->nitems, sizeof (struct mult_index),
476 mult_index_compare);
477 }
478
479 /* Frees a list of indices with multiplicity. */
480 static inline void
mult_index_list_free(struct mult_index_list * accu)481 mult_index_list_free (struct mult_index_list *accu)
482 {
483 if (accu->item != NULL)
484 free (accu->item);
485 if (accu->item2 != NULL)
486 free (accu->item2);
487 }
488
489 /* Find a good match for the given msgctxt and msgid in the given fuzzy index.
490 The match does not need to be optimal. */
491 message_ty *
message_fuzzy_index_search(message_fuzzy_index_ty * findex,const char * msgctxt,const char * msgid)492 message_fuzzy_index_search (message_fuzzy_index_ty *findex,
493 const char *msgctxt, const char *msgid)
494 {
495 const char *str = msgid;
496
497 /* Let p0 < p1 < p2 < p3 < p4 walk through the string. */
498 const char *p0 = str;
499 if (*p0 != '\0')
500 {
501 const char *p1 = p0 + findex->iterator (p0);
502 if (*p1 != '\0')
503 {
504 const char *p2 = p1 + findex->iterator (p1);
505 if (*p2 != '\0')
506 {
507 const char *p3 = p2 + findex->iterator (p2);
508 if (*p3 != '\0')
509 {
510 const char *p4 = p3 + findex->iterator (p3);
511 struct mult_index_list accu;
512
513 mult_index_list_init (&accu);
514 for (;;)
515 {
516 /* The segment from p0 to p4 is a 4-gram of
517 characters. Get the hash table entry containing
518 a list of indices, and add it to the accu. */
519 void *found;
520
521 if (hash_find_entry (&findex->gram4, p0, p4 - p0,
522 &found) == 0)
523 {
524 index_list_ty list = (index_list_ty) found;
525 mult_index_list_accumulate (&accu, list);
526 }
527
528 /* Advance. */
529 if (*p4 == '\0')
530 break;
531 p0 = p1;
532 p1 = p2;
533 p2 = p3;
534 p3 = p4;
535 p4 = p4 + findex->iterator (p4);
536 }
537
538 /* Sort in decreasing count order. */
539 mult_index_list_sort (&accu);
540
541 /* Take the first few messages from this sorted list, and
542 maximize the fstrcmp() result. */
543 {
544 size_t count;
545 struct mult_index *ptr;
546 message_ty *best_mp;
547 double best_weight;
548
549 count = findex->firstfew;
550 if (count > accu.nitems)
551 count = accu.nitems;
552
553 best_weight = FUZZY_THRESHOLD;
554 best_mp = NULL;
555 for (ptr = accu.item; count > 0; ptr++, count--)
556 {
557 message_ty *mp = findex->messages[ptr->index];
558 double weight =
559 fuzzy_search_goal_function (mp, msgctxt, msgid);
560
561 if (weight > best_weight)
562 {
563 best_weight = weight;
564 best_mp = mp;
565 }
566 }
567
568 mult_index_list_free (&accu);
569
570 return best_mp;
571 }
572 }
573 }
574 }
575 }
576
577 /* The string had less than 4 characters. */
578 {
579 size_t l = strlen (str);
580 size_t lmin, lmax;
581 message_ty *best_mp;
582 double best_weight;
583
584 if (!(l <= SHORT_STRING_MAX_BYTES))
585 abort ();
586
587 /* Walk through those short messages which have an appropriate length.
588 See the comment before SHORT_MSG_MAX. */
589 lmin = (int) ceil (l / (2 / FUZZY_THRESHOLD - 1));
590 lmax = (int) (l * (2 / FUZZY_THRESHOLD - 1));
591 if (!(lmax <= SHORT_MSG_MAX))
592 abort ();
593
594 best_weight = FUZZY_THRESHOLD;
595 best_mp = NULL;
596 for (l = lmin; l <= lmax; l++)
597 {
598 message_list_ty *mlp = findex->short_messages[l];
599 size_t j;
600
601 for (j = 0; j < mlp->nitems; j++)
602 {
603 message_ty *mp = mlp->item[j];
604 double weight = fuzzy_search_goal_function (mp, msgctxt, msgid);
605
606 if (weight > best_weight)
607 {
608 best_weight = weight;
609 best_mp = mp;
610 }
611 }
612 }
613
614 return best_mp;
615 }
616 }
617
618 /* Free a fuzzy index. */
619 void
message_fuzzy_index_free(message_fuzzy_index_ty * findex)620 message_fuzzy_index_free (message_fuzzy_index_ty *findex)
621 {
622 size_t l;
623 void *iter;
624 const void *key;
625 size_t keylen;
626 void *data;
627
628 /* Free the short lists. */
629 for (l = 0; l <= SHORT_MSG_MAX; l++)
630 message_list_free (findex->short_messages[l], 1);
631
632 /* Free the index lists occurring as values in the hash tables. */
633 iter = NULL;
634 while (hash_iterate (&findex->gram4, &iter, &key, &keylen, &data) == 0)
635 free ((index_list_ty *) data);
636 /* Free the hash table itself. */
637 hash_destroy (&findex->gram4);
638
639 free (findex);
640 }
641