xref: /netbsd-src/external/gpl2/gettext/dist/gnulib-local/lib/fstrcmp.c (revision 946379e7b37692fc43f68eb0d1c10daa0a7f3b6c)
1 /* Functions to make fuzzy comparisons between strings
2    Copyright (C) 1988-1989, 1992-1993, 1995, 2001-2003, 2006 Free Software Foundation, Inc.
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation; either version 2 of the License, or (at
7    your option) any later version.
8 
9    This program is distributed in the hope that it will be useful, but
10    WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12    General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program; if not, write to the Free Software
16    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17 
18 
19    Derived from GNU diff 2.7, analyze.c et al.
20 
21    The basic idea is to consider two strings as similar if, when
22    transforming the first string into the second string through a
23    sequence of edits (inserts and deletes of one character each),
24    this sequence is short - or equivalently, if the ordered list
25    of characters that are untouched by these edits is long.  For a
26    good introduction to the subject, read about the "Levenshtein
27    distance" in Wikipedia.
28 
29    The basic algorithm is described in:
30    "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
31    Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
32    see especially section 4.2, which describes the variation used below.
33 
34    The basic algorithm was independently discovered as described in:
35    "Algorithms for Approximate String Matching", E. Ukkonen,
36    Information and Control Vol. 64, 1985, pp. 100-118.
37 
38    Unless the 'minimal' flag is set, this code uses the TOO_EXPENSIVE
39    heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
40    at the price of producing suboptimal output for large inputs with
41    many differences.
42 
43    Modified to work on strings rather than files
44    by Peter Miller <pmiller@agso.gov.au>, October 1995 */
45 
46 #include <config.h>
47 
48 /* Specification.  */
49 #include "fstrcmp.h"
50 
51 #include <string.h>
52 #include <stdio.h>
53 #include <stdlib.h>
54 #include <limits.h>
55 
56 #include "lock.h"
57 #include "tls.h"
58 #include "xalloc.h"
59 
60 #ifndef uintptr_t
61 # define uintptr_t unsigned long
62 #endif
63 
64 
65 /*
66  * Context of comparison operation.
67  */
68 struct context
69 {
70   /*
71    * Data on one input string being compared.
72    */
73   struct string_data
74   {
75     /* The string to be compared. */
76     const char *data;
77 
78     /* The length of the string to be compared. */
79     int data_length;
80 
81     /* The number of characters inserted or deleted. */
82     int edit_count;
83   }
84   string[2];
85 
86   #ifdef MINUS_H_FLAG
87 
88   /* This corresponds to the diff -H flag.  With this heuristic, for
89      strings with a constant small density of changes, the algorithm is
90      linear in the strings size.  This is unlikely in typical uses of
91      fstrcmp, and so is usually compiled out.  Besides, there is no
92      interface to set it true.  */
93   int heuristic;
94 
95   #endif
96 
97   /* Vector, indexed by diagonal, containing 1 + the X coordinate of the
98      point furthest along the given diagonal in the forward search of the
99      edit matrix.  */
100   int *fdiag;
101 
102   /* Vector, indexed by diagonal, containing the X coordinate of the point
103      furthest along the given diagonal in the backward search of the edit
104      matrix.  */
105   int *bdiag;
106 
107   /* Edit scripts longer than this are too expensive to compute.  */
108   int too_expensive;
109 
110   /* Snakes bigger than this are considered `big'.  */
111   #define SNAKE_LIMIT	20
112 };
113 
114 struct partition
115 {
116   /* Midpoints of this partition.  */
117   int xmid, ymid;
118 
119   /* Nonzero if low half will be analyzed minimally.  */
120   int lo_minimal;
121 
122   /* Likewise for high half.  */
123   int hi_minimal;
124 };
125 
126 
127 /* NAME
128 	diag - find diagonal path
129 
130    SYNOPSIS
131 	int diag(int xoff, int xlim, int yoff, int ylim, int minimal,
132 		 struct partition *part, struct context *ctxt);
133 
134    DESCRIPTION
135 	Find the midpoint of the shortest edit script for a specified
136 	portion of the two strings.
137 
138 	Scan from the beginnings of the strings, and simultaneously from
139 	the ends, doing a breadth-first search through the space of
140 	edit-sequence.  When the two searches meet, we have found the
141 	midpoint of the shortest edit sequence.
142 
143 	If MINIMAL is nonzero, find the minimal edit script regardless
144 	of expense.  Otherwise, if the search is too expensive, use
145 	heuristics to stop the search and report a suboptimal answer.
146 
147    RETURNS
148 	Set PART->(XMID,YMID) to the midpoint (XMID,YMID).  The diagonal
149 	number XMID - YMID equals the number of inserted characters
150 	minus the number of deleted characters (counting only characters
151 	before the midpoint).  Return the approximate edit cost; this is
152 	the total number of characters inserted or deleted (counting
153 	only characters before the midpoint), unless a heuristic is used
154 	to terminate the search prematurely.
155 
156 	Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script
157 	for the left half of the partition is known; similarly for
158 	PART->RIGHT_MINIMAL.
159 
160    CAVEAT
161 	This function assumes that the first characters of the specified
162 	portions of the two strings do not match, and likewise that the
163 	last characters do not match.  The caller must trim matching
164 	characters from the beginning and end of the portions it is
165 	going to specify.
166 
167 	If we return the "wrong" partitions, the worst this can do is
168 	cause suboptimal diff output.  It cannot cause incorrect diff
169 	output.  */
170 
171 static int
diag(int xoff,int xlim,int yoff,int ylim,int minimal,struct partition * part,struct context * ctxt)172 diag (int xoff, int xlim, int yoff, int ylim, int minimal,
173       struct partition *part, struct context *ctxt)
174 {
175   int *const fd = ctxt->fdiag;	/* Give the compiler a chance. */
176   int *const bd = ctxt->bdiag;	/* Additional help for the compiler. */
177   const char *const xv = ctxt->string[0].data;	/* Still more help for the compiler. */
178   const char *const yv = ctxt->string[1].data;	/* And more and more . . . */
179   const int dmin = xoff - ylim;	/* Minimum valid diagonal. */
180   const int dmax = xlim - yoff;	/* Maximum valid diagonal. */
181   const int fmid = xoff - yoff;	/* Center diagonal of top-down search. */
182   const int bmid = xlim - ylim;	/* Center diagonal of bottom-up search. */
183   int fmin = fmid;
184   int fmax = fmid;		/* Limits of top-down search. */
185   int bmin = bmid;
186   int bmax = bmid;		/* Limits of bottom-up search. */
187   int c;			/* Cost. */
188   int odd = (fmid - bmid) & 1;
189 
190   /*
191    * True if southeast corner is on an odd diagonal with respect
192    * to the northwest.
193    */
194   fd[fmid] = xoff;
195   bd[bmid] = xlim;
196   for (c = 1;; ++c)
197     {
198       int d;			/* Active diagonal. */
199       int big_snake;
200 
201       big_snake = 0;
202       /* Extend the top-down search by an edit step in each diagonal. */
203       if (fmin > dmin)
204 	fd[--fmin - 1] = -1;
205       else
206 	++fmin;
207       if (fmax < dmax)
208 	fd[++fmax + 1] = -1;
209       else
210 	--fmax;
211       for (d = fmax; d >= fmin; d -= 2)
212 	{
213 	  int x;
214 	  int y;
215 	  int oldx;
216 	  int tlo;
217 	  int thi;
218 
219 	  tlo = fd[d - 1],
220 	    thi = fd[d + 1];
221 
222 	  if (tlo >= thi)
223 	    x = tlo + 1;
224 	  else
225 	    x = thi;
226 	  oldx = x;
227 	  y = x - d;
228 	  while (x < xlim && y < ylim && xv[x] == yv[y])
229 	    {
230 	      ++x;
231 	      ++y;
232 	    }
233 	  if (x - oldx > SNAKE_LIMIT)
234 	    big_snake = 1;
235 	  fd[d] = x;
236 	  if (odd && bmin <= d && d <= bmax && bd[d] <= x)
237 	    {
238 	      part->xmid = x;
239 	      part->ymid = y;
240 	      part->lo_minimal = part->hi_minimal = 1;
241 	      return 2 * c - 1;
242 	    }
243 	}
244       /* Similarly extend the bottom-up search.  */
245       if (bmin > dmin)
246 	bd[--bmin - 1] = INT_MAX;
247       else
248 	++bmin;
249       if (bmax < dmax)
250 	bd[++bmax + 1] = INT_MAX;
251       else
252 	--bmax;
253       for (d = bmax; d >= bmin; d -= 2)
254 	{
255 	  int x;
256 	  int y;
257 	  int oldx;
258 	  int tlo;
259 	  int thi;
260 
261 	  tlo = bd[d - 1],
262 	    thi = bd[d + 1];
263 	  if (tlo < thi)
264 	    x = tlo;
265 	  else
266 	    x = thi - 1;
267 	  oldx = x;
268 	  y = x - d;
269 	  while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
270 	    {
271 	      --x;
272 	      --y;
273 	    }
274 	  if (oldx - x > SNAKE_LIMIT)
275 	    big_snake = 1;
276 	  bd[d] = x;
277 	  if (!odd && fmin <= d && d <= fmax && x <= fd[d])
278 	    {
279 	      part->xmid = x;
280 	      part->ymid = y;
281 	      part->lo_minimal = part->hi_minimal = 1;
282 	      return 2 * c;
283 	    }
284 	}
285 
286       if (minimal)
287 	continue;
288 
289 #ifdef MINUS_H_FLAG
290       /* Heuristic: check occasionally for a diagonal that has made lots
291          of progress compared with the edit distance.  If we have any
292          such, find the one that has made the most progress and return
293          it as if it had succeeded.
294 
295          With this heuristic, for strings with a constant small density
296          of changes, the algorithm is linear in the strings size.  */
297       if (c > 200 && big_snake && ctxt->heuristic)
298 	{
299 	  int best;
300 
301 	  best = 0;
302 	  for (d = fmax; d >= fmin; d -= 2)
303 	    {
304 	      int dd;
305 	      int x;
306 	      int y;
307 	      int v;
308 
309 	      dd = d - fmid;
310 	      x = fd[d];
311 	      y = x - d;
312 	      v = (x - xoff) * 2 - dd;
313 
314 	      if (v > 12 * (c + (dd < 0 ? -dd : dd)))
315 		{
316 		  if
317 		    (
318 		      v > best
319 		      &&
320 		      xoff + SNAKE_LIMIT <= x
321 		      &&
322 		      x < xlim
323 		      &&
324 		      yoff + SNAKE_LIMIT <= y
325 		      &&
326 		      y < ylim
327 		    )
328 		    {
329 		      /* We have a good enough best diagonal; now insist
330 			 that it end with a significant snake.  */
331 		      int k;
332 
333 		      for (k = 1; xv[x - k] == yv[y - k]; k++)
334 			{
335 			  if (k == SNAKE_LIMIT)
336 			    {
337 			      best = v;
338 			      part->xmid = x;
339 			      part->ymid = y;
340 			      break;
341 			    }
342 			}
343 		    }
344 		}
345 	    }
346 	  if (best > 0)
347 	    {
348 	      part->lo_minimal = 1;
349 	      part->hi_minimal = 0;
350 	      return 2 * c - 1;
351 	    }
352 	  best = 0;
353 	  for (d = bmax; d >= bmin; d -= 2)
354 	    {
355 	      int dd;
356 	      int x;
357 	      int y;
358 	      int v;
359 
360 	      dd = d - bmid;
361 	      x = bd[d];
362 	      y = x - d;
363 	      v = (xlim - x) * 2 + dd;
364 
365 	      if (v > 12 * (c + (dd < 0 ? -dd : dd)))
366 		{
367 		  if (v > best && xoff < x && x <= xlim - SNAKE_LIMIT &&
368 		      yoff < y && y <= ylim - SNAKE_LIMIT)
369 		    {
370 		      /* We have a good enough best diagonal; now insist
371 			 that it end with a significant snake.  */
372 		      int k;
373 
374 		      for (k = 0; xv[x + k] == yv[y + k]; k++)
375 			{
376 			  if (k == SNAKE_LIMIT - 1)
377 			    {
378 			      best = v;
379 			      part->xmid = x;
380 			      part->ymid = y;
381 			      break;
382 			    }
383 			}
384 		    }
385 		}
386 	    }
387 	  if (best > 0)
388 	    {
389 	      part->lo_minimal = 0;
390 	      part->hi_minimal = 1;
391 	      return 2 * c - 1;
392 	    }
393 	}
394 #endif /* MINUS_H_FLAG */
395 
396       /* Heuristic: if we've gone well beyond the call of duty, give up
397 	 and report halfway between our best results so far.  */
398       if (c >= ctxt->too_expensive)
399 	{
400 	  int fxybest;
401 	  int fxbest;
402 	  int bxybest;
403 	  int bxbest;
404 
405 	  /* Pacify `gcc -Wall'. */
406 	  fxbest = 0;
407 	  bxbest = 0;
408 
409 	  /* Find forward diagonal that maximizes X + Y.  */
410 	  fxybest = -1;
411 	  for (d = fmax; d >= fmin; d -= 2)
412 	    {
413 	      int x;
414 	      int y;
415 
416 	      x = fd[d] < xlim ? fd[d] : xlim;
417 	      y = x - d;
418 
419 	      if (ylim < y)
420 		{
421 		  x = ylim + d;
422 		  y = ylim;
423 		}
424 	      if (fxybest < x + y)
425 		{
426 		  fxybest = x + y;
427 		  fxbest = x;
428 		}
429 	    }
430 	  /* Find backward diagonal that minimizes X + Y.  */
431 	  bxybest = INT_MAX;
432 	  for (d = bmax; d >= bmin; d -= 2)
433 	    {
434 	      int x;
435 	      int y;
436 
437 	      x = xoff > bd[d] ? xoff : bd[d];
438 	      y = x - d;
439 
440 	      if (y < yoff)
441 		{
442 		  x = yoff + d;
443 		  y = yoff;
444 		}
445 	      if (x + y < bxybest)
446 		{
447 		  bxybest = x + y;
448 		  bxbest = x;
449 		}
450 	    }
451 	  /* Use the better of the two diagonals.  */
452 	  if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
453 	    {
454 	      part->xmid = fxbest;
455 	      part->ymid = fxybest - fxbest;
456 	      part->lo_minimal = 1;
457 	      part->hi_minimal = 0;
458 	    }
459 	  else
460 	    {
461 	      part->xmid = bxbest;
462 	      part->ymid = bxybest - bxbest;
463 	      part->lo_minimal = 0;
464 	      part->hi_minimal = 1;
465 	    }
466 	  return 2 * c - 1;
467 	}
468     }
469 }
470 
471 
472 /* NAME
473 	compareseq - find edit sequence
474 
475    SYNOPSIS
476 	void compareseq(int xoff, int xlim, int yoff, int ylim, int minimal,
477 			struct context *ctxt);
478 
479    DESCRIPTION
480 	Compare in detail contiguous subsequences of the two strings
481 	which are known, as a whole, to match each other.
482 
483 	The subsequence of string 0 is [XOFF, XLIM) and likewise for
484 	string 1.
485 
486 	Note that XLIM, YLIM are exclusive bounds.  All character
487 	numbers are origin-0.
488 
489 	If MINIMAL is nonzero, find a minimal difference no matter how
490 	expensive it is.  */
491 
492 static void
compareseq(int xoff,int xlim,int yoff,int ylim,int minimal,struct context * ctxt)493 compareseq (int xoff, int xlim, int yoff, int ylim, int minimal,
494 	    struct context *ctxt)
495 {
496   const char *const xv = ctxt->string[0].data;	/* Help the compiler.  */
497   const char *const yv = ctxt->string[1].data;
498 
499   /* Slide down the bottom initial diagonal. */
500   while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
501     {
502       ++xoff;
503       ++yoff;
504     }
505 
506   /* Slide up the top initial diagonal. */
507   while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
508     {
509       --xlim;
510       --ylim;
511     }
512 
513   /* Handle simple cases. */
514   if (xoff == xlim)
515     {
516       while (yoff < ylim)
517 	{
518 	  ctxt->string[1].edit_count++;
519 	  ++yoff;
520 	}
521     }
522   else if (yoff == ylim)
523     {
524       while (xoff < xlim)
525 	{
526 	  ctxt->string[0].edit_count++;
527 	  ++xoff;
528 	}
529     }
530   else
531     {
532       int c;
533       struct partition part;
534 
535       /* Find a point of correspondence in the middle of the strings.  */
536       c = diag (xoff, xlim, yoff, ylim, minimal, &part, ctxt);
537       if (c == 1)
538 	{
539 #if 0
540 	  /* This should be impossible, because it implies that one of
541 	     the two subsequences is empty, and that case was handled
542 	     above without calling `diag'.  Let's verify that this is
543 	     true.  */
544 	  abort ();
545 #else
546 	  /* The two subsequences differ by a single insert or delete;
547 	     record it and we are done.  */
548 	  if (part.xmid - part.ymid < xoff - yoff)
549 	    ctxt->string[1].edit_count++;
550 	  else
551 	    ctxt->string[0].edit_count++;
552 #endif
553 	}
554       else
555 	{
556 	  /* Use the partitions to split this problem into subproblems.  */
557 	  compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal, ctxt);
558 	  compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal, ctxt);
559 	}
560     }
561 }
562 
563 
564 /* Because fstrcmp is typically called multiple times, attempt to minimize
565    the number of memory allocations performed.  Thus, let a call reuse the
566    memory already allocated by the previous call, if it is sufficient.
567    To make it multithread-safe, without need for a lock that protects the
568    already allocated memory, store the allocated memory per thread.  Free
569    it only when the thread exits.  */
570 
571 static gl_tls_key_t buffer_key;	/* TLS key for a 'int *' */
572 static gl_tls_key_t bufmax_key;	/* TLS key for a 'size_t' */
573 
574 static void
keys_init(void)575 keys_init (void)
576 {
577   gl_tls_key_init (buffer_key, free);
578   gl_tls_key_init (bufmax_key, NULL);
579   /* The per-thread initial values are NULL and 0, respectively.  */
580 }
581 
582 /* Ensure that keys_init is called once only.  */
gl_once_define(static,keys_init_once)583 gl_once_define(static, keys_init_once)
584 
585 
586 /* NAME
587 	fstrcmp - fuzzy string compare
588 
589    SYNOPSIS
590 	double fstrcmp(const char *, const char *);
591 
592    DESCRIPTION
593 	The fstrcmp function may be used to compare two string for
594 	similarity.  It is very useful in reducing "cascade" or
595 	"secondary" errors in compilers or other situations where
596 	symbol tables occur.
597 
598    RETURNS
599 	double; 0 if the strings are entirly dissimilar, 1 if the
600 	strings are identical, and a number in between if they are
601 	similar.  */
602 
603 double
604 fstrcmp (const char *string1, const char *string2)
605 {
606   struct context ctxt;
607   int i;
608 
609   size_t fdiag_len;
610   int *buffer;
611   size_t bufmax;
612 
613   /* set the info for each string.  */
614   ctxt.string[0].data = string1;
615   ctxt.string[0].data_length = strlen (string1);
616   ctxt.string[1].data = string2;
617   ctxt.string[1].data_length = strlen (string2);
618 
619   /* short-circuit obvious comparisons */
620   if (ctxt.string[0].data_length == 0 && ctxt.string[1].data_length == 0)
621     return 1.0;
622   if (ctxt.string[0].data_length == 0 || ctxt.string[1].data_length == 0)
623     return 0.0;
624 
625   /* Set TOO_EXPENSIVE to be approximate square root of input size,
626      bounded below by 256.  */
627   ctxt.too_expensive = 1;
628   for (i = ctxt.string[0].data_length + ctxt.string[1].data_length;
629        i != 0;
630        i >>= 2)
631     ctxt.too_expensive <<= 1;
632   if (ctxt.too_expensive < 256)
633     ctxt.too_expensive = 256;
634 
635   /* Allocate memory for fdiag and bdiag from a thread-local pool.  */
636   fdiag_len = ctxt.string[0].data_length + ctxt.string[1].data_length + 3;
637   gl_once (keys_init_once, keys_init);
638   buffer = (int *) gl_tls_get (buffer_key);
639   bufmax = (size_t) (uintptr_t) gl_tls_get (bufmax_key);
640   if (fdiag_len > bufmax)
641     {
642       /* Need more memory.  */
643       bufmax = 2 * bufmax;
644       if (fdiag_len > bufmax)
645 	bufmax = fdiag_len;
646       /* Calling xrealloc would be a waste: buffer's contents does not need
647 	 to be preserved.  */
648       if (buffer != NULL)
649 	free (buffer);
650       buffer = (int *) xmalloc (bufmax * (2 * sizeof (int)));
651       gl_tls_set (buffer_key, buffer);
652       gl_tls_set (bufmax_key, (void *) (uintptr_t) bufmax);
653     }
654   ctxt.fdiag = buffer + ctxt.string[1].data_length + 1;
655   ctxt.bdiag = ctxt.fdiag + fdiag_len;
656 
657   /* Now do the main comparison algorithm */
658   ctxt.string[0].edit_count = 0;
659   ctxt.string[1].edit_count = 0;
660   compareseq (0, ctxt.string[0].data_length, 0, ctxt.string[1].data_length, 0,
661 	      &ctxt);
662 
663   /* The result is
664 	((number of chars in common) / (average length of the strings)).
665      This is admittedly biased towards finding that the strings are
666      similar, however it does produce meaningful results.  */
667   return ((double) (ctxt.string[0].data_length + ctxt.string[1].data_length
668 		    - ctxt.string[1].edit_count - ctxt.string[0].edit_count)
669 	  / (ctxt.string[0].data_length + ctxt.string[1].data_length));
670 }
671