xref: /netbsd-src/external/gpl2/gettext/dist/gettext-tools/src/msgl-fsearch.c (revision 95b39c65ca575fb40c6bb7083e0eb7ec28eabef1)
1 /* Fast fuzzy searching among messages.
2    Copyright (C) 2006 Free Software Foundation, Inc.
3    Written by Bruno Haible <bruno@clisp.org>, 2006.
4 
5    This program is free software; you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation; either version 2, or (at your option)
8    any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program; if not, write to the Free Software Foundation,
17    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
18 
19 #ifdef HAVE_CONFIG_H
20 # include <config.h>
21 #endif
22 
23 /* Specification.  */
24 #include "msgl-fsearch.h"
25 
26 #include <math.h>
27 #include <stdlib.h>
28 
29 #include "xalloc.h"
30 #include "po-charset.h"
31 
32 
33 /* Fuzzy searching of L strings in a large set of N messages (assuming
34    they have all the same small size) takes O(L * N) when using repeated
35    fstrcmp() calls.  When for example L = 800 and N = 69000, this is slow.
36 
37    So we preprocess the set of N messages, yielding a data structure
38    that allows to select the similar messages for any given string, and
39    apply fstrcmp() only to the search result.  This allows to reduce the
40    time to something between O(L * 1) and O(L * N) - depending on the
41    structure of the strings.  In the example with L = 800 and N = 69000,
42    memory use increases by a factor of 2 and the time decreases from
43    9068 s to 230 s.
44 
45    The data structure is a hash table mapping each n-gram (here with n=4)
46    occurring in the message strings to the list of messages that contains
47    it.  For example, if the message list is
48       [0] "close"
49       [1] "losers"
50    the index is a hash table mapping
51       "clos" -> { 0 }
52       "lose" -> { 0, 1 }
53       "oser" -> { 1 }
54       "sers" -> { 1 }
55    Searching the similar messages to, say, "closest", is done by looking up
56    all its 4-grams in the hash table and summing up the results:
57       "clos" -> { 0 }
58       "lose" -> { 0, 1 }
59       "oses" -> {}
60       "sest" -> {}
61    => total: { 2x0, 1x1 }
62    The list is sorted according to decreasing frequency: { 0, 1, ... }
63    and only the first few messages from this frequency list are passed to
64    fstrcmp().
65 
66    The n-gram similarity and the fstrcmp() similarity are quite different
67    metrics.  For example, "close" and "c l o s e" have no n-grams in common
68    (even for n as small as n = 2); however, fstrcmp() will find that they
69    are quite similar (= 0.71).  Conversely, "AAAA BBBB ... ZZZZ" and
70    "ZZZZ ... BBBB AAAA" have many 4-grams in common, but their fstrcmp() is
71    only 0.02.  Also, repeated n-grams don't have the same effect on the
72    two similarity measures.  But all this doesn't matter much in practice.
73 
74    We chose n = 4 because for alphabetic languages, with n = 3 the occurrence
75    lists are likely too long.  (Well, with ideographic languages like Chinese,
76    n = 3 might actually be quite good.  Anyway, n = 4 is not bad in this case
77    either.)
78 
79    The units are characters in the current encoding.  Not just bytes,
80    because 4 consecutive bytes in UTF-8 or GB18030 don't mean much.  */
81 
82 /* Each message is represented by its index in the message list.  */
83 typedef unsigned int index_ty;
84 
85 /* An index list has its allocated size and length tacked at the front.
86    The indices are sorted in ascending order.  */
87 typedef index_ty *index_list_ty;
88 #define IL_ALLOCATED 0
89 #define IL_LENGTH 1
90 
91 /* Create a new index list containing only a given index.  */
92 static inline index_list_ty
new_index(index_ty idx)93 new_index (index_ty idx)
94 {
95   index_ty *list = (index_ty *) xmalloc ((2 + 1) * sizeof (index_ty));
96   list[IL_ALLOCATED] = 1;
97   list[IL_LENGTH] = 1;
98   list[2] = idx;
99 
100   return list;
101 }
102 
103 /* Add a given index, greater or equal than all previous indices, to an
104    index list.
105    Return the new index list, if it had to be reallocated, or NULL if it
106    didn't change.  */
107 static inline index_list_ty
addlast_index(index_list_ty list,index_ty idx)108 addlast_index (index_list_ty list, index_ty idx)
109 {
110   index_list_ty result;
111   size_t length = list[IL_LENGTH];
112 
113   /* Look whether it should be inserted.  */
114   if (length > 0 && list[2 + (length - 1)] == idx)
115     return NULL;
116 
117   /* Now make room for one more list element.  */
118   result = NULL;
119   if (length == list[IL_ALLOCATED])
120     {
121       size_t new_allocated = 2 * length - (length >> 6);
122       list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty));
123       list[IL_ALLOCATED] = new_allocated;
124       result = list;
125     }
126   list[2 + length] = idx;
127   list[IL_LENGTH] = length + 1;
128   return result;
129 }
130 
131 /* Add a given index to an index list.
132    Return the new index list, if it had to be reallocated, or NULL if it
133    didn't change.  */
134 static inline index_list_ty
add_index(index_list_ty list,index_ty idx)135 add_index (index_list_ty list, index_ty idx)
136 {
137   index_list_ty result;
138   size_t length = list[IL_LENGTH];
139 
140   /* Look where it should be inserted.  */
141   size_t lo = 0;
142   size_t hi = length;
143   while (lo < hi)
144     {
145       /* Here we know that idx must be inserted at a position >= lo, <= hi.  */
146       size_t mid = (lo + hi) / 2; /* lo <= mid < hi */
147       index_ty val = list[2 + mid];
148       if (val < idx)
149 	lo = mid + 1;
150       else if (val > idx)
151 	hi = mid;
152       else
153 	return NULL;
154     }
155 
156   /* Now make room for one more list element.  */
157   result = NULL;
158   if (length == list[IL_ALLOCATED])
159     {
160       size_t new_allocated = 2 * length - (length >> 6);
161       list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty));
162       list[IL_ALLOCATED] = new_allocated;
163       result = list;
164     }
165   list[IL_LENGTH] = length + 1;
166   for (; length > hi; length--)
167     list[2 + length] = list[1 + length];
168   list[2 + length] = idx;
169   return result;
170 }
171 
172 /* We use 4-grams, therefore strings with less than 4 characters cannot be
173    handled through the 4-grams table and need to be handled specially.
174    Since every character occupies at most 4 bytes (see po-charset.c),
175    this means the size of such short strings is bounded by:  */
176 #define SHORT_STRING_MAX_CHARACTERS (4 - 1)
177 #define SHORT_STRING_MAX_BYTES (SHORT_STRING_MAX_CHARACTERS * 4)
178 
179 /* Such short strings are handled by direct comparison with all messages
180    of appropriate size.  Note that for two strings of length 0 <= l1 <= l2,
181    fstrcmp() is <= 2 * l1 / (l1 + l2).  Since we are only interested in
182    fstrcmp() values >= FUZZY_THRESHOLD, we can for a given string of length l
183    limit the search to lengths l' in the range
184      l / (2 / FUZZY_THRESHOLD - 1) <= l' <= l * (2 / FUZZY_THRESHOLD - 1)
185    Thus we need the list of the short strings up to length:  */
186 #define SHORT_MSG_MAX (int) (SHORT_STRING_MAX_BYTES * (2 * FUZZY_THRESHOLD_INV - 1))
187 
188 /* A fuzzy index contains a hash table mapping all n-grams to their
189    occurrences list.  */
190 struct message_fuzzy_index_ty
191 {
192   message_ty **messages;
193   character_iterator_t iterator;
194   hash_table gram4;
195   size_t firstfew;
196   message_list_ty *short_messages[SHORT_MSG_MAX + 1];
197 };
198 
199 /* Allocate a fuzzy index corresponding to a given list of messages.
200    The list of messages and the msgctxt and msgid fields of the messages
201    inside it must not be modified while the returned fuzzy index is in use.  */
202 message_fuzzy_index_ty *
message_fuzzy_index_alloc(const message_list_ty * mlp,const char * canon_charset)203 message_fuzzy_index_alloc (const message_list_ty *mlp,
204 			   const char *canon_charset)
205 {
206   message_fuzzy_index_ty *findex =
207     (message_fuzzy_index_ty *) xmalloc (sizeof (message_fuzzy_index_ty));
208   size_t count = mlp->nitems;
209   size_t j;
210   size_t l;
211 
212   findex->messages = mlp->item;
213   findex->iterator = po_charset_character_iterator (canon_charset);
214 
215   /* Setup hash table.  */
216   if (hash_init (&findex->gram4, 10 * count) < 0)
217     xalloc_die ();
218   for (j = 0; j < count; j++)
219     {
220       message_ty *mp = mlp->item[j];
221 
222       if (mp->msgstr != NULL && mp->msgstr[0] != '\0')
223 	{
224 	  const char *str = mp->msgid;
225 
226 	  /* Let p0 < p1 < p2 < p3 < p4 walk through the string.  */
227 	  const char *p0 = str;
228 	  if (*p0 != '\0')
229 	    {
230 	      const char *p1 = p0 + findex->iterator (p0);
231 	      if (*p1 != '\0')
232 		{
233 		  const char *p2 = p1 + findex->iterator (p1);
234 		  if (*p2 != '\0')
235 		    {
236 		      const char *p3 = p2 + findex->iterator (p2);
237 		      if (*p3 != '\0')
238 			{
239 			  const char *p4 = p3 + findex->iterator (p3);
240 			  for (;;)
241 			    {
242 			      /* The segment from p0 to p4 is a 4-gram of
243 				 characters.  Add a hash table entry that maps
244 				 it to the index j, or extend the existing
245 				 hash table entry accordingly.  */
246 			      void *found;
247 
248 			      if (hash_find_entry (&findex->gram4, p0, p4 - p0,
249 						   &found) == 0)
250 				{
251 				  index_list_ty list = (index_list_ty) found;
252 				  list = addlast_index (list, j);
253 				  if (list != NULL)
254 				    hash_set_value (&findex->gram4, p0, p4 - p0,
255 						    list);
256 				}
257 			      else
258 				hash_insert_entry (&findex->gram4, p0, p4 - p0,
259 						   new_index (j));
260 
261 			      /* Advance.  */
262 			      if (*p4 == '\0')
263 				break;
264 			      p0 = p1;
265 			      p1 = p2;
266 			      p2 = p3;
267 			      p3 = p4;
268 			      p4 = p4 + findex->iterator (p4);
269 			    }
270 			}
271 		    }
272 		}
273 	    }
274 	}
275     }
276 
277   /* Shrink memory used by the hash table.  */
278   {
279     void *iter;
280     const void *key;
281     size_t keylen;
282     void **valuep;
283 
284     iter = NULL;
285     while (hash_iterate_modify (&findex->gram4, &iter, &key, &keylen, &valuep)
286 	   == 0)
287       {
288 	index_list_ty list = (index_list_ty) *valuep;
289 	index_ty length = list[IL_LENGTH];
290 
291 	if (length < list[IL_ALLOCATED])
292 	  {
293 	    list[IL_ALLOCATED] = length;
294 	    *valuep = xrealloc (list, (2 + length) * sizeof (index_ty));
295 	  }
296       }
297   }
298 
299   findex->firstfew = (int) sqrt ((double) count);
300   if (findex->firstfew < 10)
301     findex->firstfew = 10;
302 
303   /* Setup lists of short messages.  */
304   for (l = 0; l <= SHORT_MSG_MAX; l++)
305     findex->short_messages[l] = message_list_alloc (false);
306   for (j = 0; j < count; j++)
307     {
308       message_ty *mp = mlp->item[j];
309 
310       if (mp->msgstr != NULL && mp->msgstr[0] != '\0')
311 	{
312 	  const char *str = mp->msgid;
313 	  size_t len = strlen (str);
314 
315 	  if (len <= SHORT_MSG_MAX)
316 	    message_list_append (findex->short_messages[len], mp);
317 	}
318     }
319 
320   /* Shrink memory used by the lists of short messages.  */
321   for (l = 0; l <= SHORT_MSG_MAX; l++)
322     {
323       message_list_ty *mlp = findex->short_messages[l];
324 
325       if (mlp->nitems < mlp->nitems_max)
326 	{
327 	  mlp->nitems_max = mlp->nitems;
328 	  mlp->item =
329 	    (message_ty **)
330 	    xrealloc (mlp->item, mlp->nitems_max * sizeof (message_ty *));
331 	}
332     }
333 
334   return findex;
335 }
336 
337 /* An index with multiplicity.  */
338 struct mult_index
339 {
340   index_ty index;
341   unsigned int count;
342 };
343 
344 /* A list of indices with multiplicity.
345    The indices are sorted in ascending order.  */
346 struct mult_index_list
347 {
348   struct mult_index *item;
349   size_t nitems;
350   size_t nitems_max;
351   /* Work area.  */
352   struct mult_index *item2;
353   size_t nitems2_max;
354 };
355 
356 /* Initialize an empty list of indices with multiplicity.  */
357 static inline void
mult_index_list_init(struct mult_index_list * accu)358 mult_index_list_init (struct mult_index_list *accu)
359 {
360   accu->nitems = 0;
361   accu->nitems_max = 0;
362   accu->item = NULL;
363   accu->nitems2_max = 0;
364   accu->item2 = NULL;
365 }
366 
367 /* Add an index list to a list of indices with multiplicity.  */
368 static inline void
mult_index_list_accumulate(struct mult_index_list * accu,index_list_ty list)369 mult_index_list_accumulate (struct mult_index_list *accu, index_list_ty list)
370 {
371   size_t len1 = accu->nitems;
372   size_t len2 = list[IL_LENGTH];
373   size_t need = len1 + len2;
374   struct mult_index *ptr1;
375   struct mult_index *ptr1_end;
376   index_ty *ptr2;
377   index_ty *ptr2_end;
378   struct mult_index *destptr;
379 
380   /* Make the work area large enough.  */
381   if (accu->nitems2_max < need)
382     {
383       size_t new_max = 2 * accu->nitems2_max + 1;
384 
385       if (new_max < need)
386 	new_max = need;
387       if (accu->item2 != NULL)
388 	free (accu->item2);
389       accu->item2 =
390 	(struct mult_index *) xmalloc (new_max * sizeof (struct mult_index));
391       accu->nitems2_max = new_max;
392     }
393 
394   /* Make a linear pass through accu and list simultaneously.  */
395   ptr1 = accu->item;
396   ptr1_end = ptr1 + len1;
397   ptr2 = list + 2;
398   ptr2_end = ptr2 + len2;
399   destptr = accu->item2;
400   while (ptr1 < ptr1_end && ptr2 < ptr2_end)
401     {
402       if (ptr1->index < *ptr2)
403 	{
404 	  *destptr = *ptr1;
405 	  ptr1++;
406 	}
407       else if (ptr1->index > *ptr2)
408 	{
409 	  destptr->index = *ptr2;
410 	  destptr->count = 1;
411 	  ptr2++;
412 	}
413       else /* ptr1->index == list[2 + i2] */
414 	{
415 	  destptr->index = ptr1->index;
416 	  destptr->count = ptr1->count + 1;
417 	  ptr1++;
418 	  ptr2++;
419 	}
420       destptr++;
421     }
422   while (ptr1 < ptr1_end)
423     {
424       *destptr = *ptr1;
425       ptr1++;
426       destptr++;
427     }
428   while (ptr2 < ptr2_end)
429     {
430       destptr->index = *ptr2;
431       destptr->count = 1;
432       ptr2++;
433       destptr++;
434     }
435 
436   /* Swap accu->item and accu->item2.  */
437   {
438     struct mult_index *dest = accu->item2;
439     size_t dest_max = accu->nitems2_max;
440 
441     accu->item2 = accu->item;
442     accu->nitems2_max = accu->nitems_max;
443 
444     accu->item = dest;
445     accu->nitems = destptr - dest;
446     accu->nitems_max = dest_max;
447   }
448 }
449 
450 /* Compares two indices with multiplicity, according to their multiplicity.  */
451 static int
mult_index_compare(const void * p1,const void * p2)452 mult_index_compare (const void *p1, const void *p2)
453 {
454   const struct mult_index *ptr1 = (const struct mult_index *) p1;
455   const struct mult_index *ptr2 = (const struct mult_index *) p2;
456 
457   if (ptr1->count < ptr2->count)
458     return 1;
459   if (ptr1->count > ptr2->count)
460     return -1;
461   /* For reproduceable results, also take into account the indices.  */
462   if (ptr1->index > ptr2->index)
463     return 1;
464   if (ptr1->index < ptr2->index)
465     return -1;
466   return 0;
467 }
468 
469 /* Sort a list of indices with multiplicity according to decreasing
470    multiplicity.  */
471 static inline void
mult_index_list_sort(struct mult_index_list * accu)472 mult_index_list_sort (struct mult_index_list *accu)
473 {
474   if (accu->nitems > 1)
475     qsort (accu->item, accu->nitems, sizeof (struct mult_index),
476 	   mult_index_compare);
477 }
478 
479 /* Frees a list of indices with multiplicity.  */
480 static inline void
mult_index_list_free(struct mult_index_list * accu)481 mult_index_list_free (struct mult_index_list *accu)
482 {
483   if (accu->item != NULL)
484     free (accu->item);
485   if (accu->item2 != NULL)
486     free (accu->item2);
487 }
488 
489 /* Find a good match for the given msgctxt and msgid in the given fuzzy index.
490    The match does not need to be optimal.  */
491 message_ty *
message_fuzzy_index_search(message_fuzzy_index_ty * findex,const char * msgctxt,const char * msgid)492 message_fuzzy_index_search (message_fuzzy_index_ty *findex,
493 			    const char *msgctxt, const char *msgid)
494 {
495   const char *str = msgid;
496 
497   /* Let p0 < p1 < p2 < p3 < p4 walk through the string.  */
498   const char *p0 = str;
499   if (*p0 != '\0')
500     {
501       const char *p1 = p0 + findex->iterator (p0);
502       if (*p1 != '\0')
503 	{
504 	  const char *p2 = p1 + findex->iterator (p1);
505 	  if (*p2 != '\0')
506 	    {
507 	      const char *p3 = p2 + findex->iterator (p2);
508 	      if (*p3 != '\0')
509 		{
510 		  const char *p4 = p3 + findex->iterator (p3);
511 		  struct mult_index_list accu;
512 
513 		  mult_index_list_init (&accu);
514 		  for (;;)
515 		    {
516 		      /* The segment from p0 to p4 is a 4-gram of
517 			 characters.  Get the hash table entry containing
518 			 a list of indices, and add it to the accu.  */
519 		      void *found;
520 
521 		      if (hash_find_entry (&findex->gram4, p0, p4 - p0,
522 					   &found) == 0)
523 			{
524 			  index_list_ty list = (index_list_ty) found;
525 			  mult_index_list_accumulate (&accu, list);
526 			}
527 
528 		      /* Advance.  */
529 		      if (*p4 == '\0')
530 			break;
531 		      p0 = p1;
532 		      p1 = p2;
533 		      p2 = p3;
534 		      p3 = p4;
535 		      p4 = p4 + findex->iterator (p4);
536 		    }
537 
538 		  /* Sort in decreasing count order.  */
539 		  mult_index_list_sort (&accu);
540 
541 		  /* Take the first few messages from this sorted list, and
542 		     maximize the fstrcmp() result.  */
543 		  {
544 		    size_t count;
545 		    struct mult_index *ptr;
546 		    message_ty *best_mp;
547 		    double best_weight;
548 
549 		    count = findex->firstfew;
550 		    if (count > accu.nitems)
551 		      count = accu.nitems;
552 
553 		    best_weight = FUZZY_THRESHOLD;
554 		    best_mp = NULL;
555 		    for (ptr = accu.item; count > 0; ptr++, count--)
556 		      {
557 			message_ty *mp = findex->messages[ptr->index];
558 			double weight =
559 			  fuzzy_search_goal_function (mp, msgctxt, msgid);
560 
561 			if (weight > best_weight)
562 			  {
563 			    best_weight = weight;
564 			    best_mp = mp;
565 			  }
566 		      }
567 
568 		    mult_index_list_free (&accu);
569 
570 		    return best_mp;
571 		  }
572 		}
573 	    }
574 	}
575     }
576 
577   /* The string had less than 4 characters.  */
578   {
579     size_t l = strlen (str);
580     size_t lmin, lmax;
581     message_ty *best_mp;
582     double best_weight;
583 
584     if (!(l <= SHORT_STRING_MAX_BYTES))
585       abort ();
586 
587     /* Walk through those short messages which have an appropriate length.
588        See the comment before SHORT_MSG_MAX.  */
589     lmin = (int) ceil (l / (2 / FUZZY_THRESHOLD - 1));
590     lmax = (int) (l * (2 / FUZZY_THRESHOLD - 1));
591     if (!(lmax <= SHORT_MSG_MAX))
592       abort ();
593 
594     best_weight = FUZZY_THRESHOLD;
595     best_mp = NULL;
596     for (l = lmin; l <= lmax; l++)
597       {
598 	message_list_ty *mlp = findex->short_messages[l];
599 	size_t j;
600 
601 	for (j = 0; j < mlp->nitems; j++)
602 	  {
603 	    message_ty *mp = mlp->item[j];
604 	    double weight = fuzzy_search_goal_function (mp, msgctxt, msgid);
605 
606 	    if (weight > best_weight)
607 	      {
608 		best_weight = weight;
609 		best_mp = mp;
610 	      }
611 	  }
612       }
613 
614     return best_mp;
615   }
616 }
617 
618 /* Free a fuzzy index.  */
619 void
message_fuzzy_index_free(message_fuzzy_index_ty * findex)620 message_fuzzy_index_free (message_fuzzy_index_ty *findex)
621 {
622   size_t l;
623   void *iter;
624   const void *key;
625   size_t keylen;
626   void *data;
627 
628   /* Free the short lists.  */
629   for (l = 0; l <= SHORT_MSG_MAX; l++)
630     message_list_free (findex->short_messages[l], 1);
631 
632   /* Free the index lists occurring as values in the hash tables.  */
633   iter = NULL;
634   while (hash_iterate (&findex->gram4, &iter, &key, &keylen, &data) == 0)
635     free ((index_list_ty *) data);
636   /* Free the hash table itself.  */
637   hash_destroy (&findex->gram4);
638 
639   free (findex);
640 }
641