1 /* Functions to make fuzzy comparisons between strings
2 Copyright (C) 1988-1989, 1992-1993, 1995, 2001-2003, 2006 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or (at
7 your option) any later version.
8
9 This program is distributed in the hope that it will be useful, but
10 WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17
18
19 Derived from GNU diff 2.7, analyze.c et al.
20
21 The basic idea is to consider two strings as similar if, when
22 transforming the first string into the second string through a
23 sequence of edits (inserts and deletes of one character each),
24 this sequence is short - or equivalently, if the ordered list
25 of characters that are untouched by these edits is long. For a
26 good introduction to the subject, read about the "Levenshtein
27 distance" in Wikipedia.
28
29 The basic algorithm is described in:
30 "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
31 Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
32 see especially section 4.2, which describes the variation used below.
33
34 The basic algorithm was independently discovered as described in:
35 "Algorithms for Approximate String Matching", E. Ukkonen,
36 Information and Control Vol. 64, 1985, pp. 100-118.
37
38 Unless the 'minimal' flag is set, this code uses the TOO_EXPENSIVE
39 heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
40 at the price of producing suboptimal output for large inputs with
41 many differences.
42
43 Modified to work on strings rather than files
44 by Peter Miller <pmiller@agso.gov.au>, October 1995 */
45
46 #include <config.h>
47
48 /* Specification. */
49 #include "fstrcmp.h"
50
51 #include <string.h>
52 #include <stdio.h>
53 #include <stdlib.h>
54 #include <limits.h>
55
56 #include "lock.h"
57 #include "tls.h"
58 #include "xalloc.h"
59
60 #ifndef uintptr_t
61 # define uintptr_t unsigned long
62 #endif
63
64
65 /*
66 * Context of comparison operation.
67 */
68 struct context
69 {
70 /*
71 * Data on one input string being compared.
72 */
73 struct string_data
74 {
75 /* The string to be compared. */
76 const char *data;
77
78 /* The length of the string to be compared. */
79 int data_length;
80
81 /* The number of characters inserted or deleted. */
82 int edit_count;
83 }
84 string[2];
85
86 #ifdef MINUS_H_FLAG
87
88 /* This corresponds to the diff -H flag. With this heuristic, for
89 strings with a constant small density of changes, the algorithm is
90 linear in the strings size. This is unlikely in typical uses of
91 fstrcmp, and so is usually compiled out. Besides, there is no
92 interface to set it true. */
93 int heuristic;
94
95 #endif
96
97 /* Vector, indexed by diagonal, containing 1 + the X coordinate of the
98 point furthest along the given diagonal in the forward search of the
99 edit matrix. */
100 int *fdiag;
101
102 /* Vector, indexed by diagonal, containing the X coordinate of the point
103 furthest along the given diagonal in the backward search of the edit
104 matrix. */
105 int *bdiag;
106
107 /* Edit scripts longer than this are too expensive to compute. */
108 int too_expensive;
109
110 /* Snakes bigger than this are considered `big'. */
111 #define SNAKE_LIMIT 20
112 };
113
114 struct partition
115 {
116 /* Midpoints of this partition. */
117 int xmid, ymid;
118
119 /* Nonzero if low half will be analyzed minimally. */
120 int lo_minimal;
121
122 /* Likewise for high half. */
123 int hi_minimal;
124 };
125
126
127 /* NAME
128 diag - find diagonal path
129
130 SYNOPSIS
131 int diag(int xoff, int xlim, int yoff, int ylim, int minimal,
132 struct partition *part, struct context *ctxt);
133
134 DESCRIPTION
135 Find the midpoint of the shortest edit script for a specified
136 portion of the two strings.
137
138 Scan from the beginnings of the strings, and simultaneously from
139 the ends, doing a breadth-first search through the space of
140 edit-sequence. When the two searches meet, we have found the
141 midpoint of the shortest edit sequence.
142
143 If MINIMAL is nonzero, find the minimal edit script regardless
144 of expense. Otherwise, if the search is too expensive, use
145 heuristics to stop the search and report a suboptimal answer.
146
147 RETURNS
148 Set PART->(XMID,YMID) to the midpoint (XMID,YMID). The diagonal
149 number XMID - YMID equals the number of inserted characters
150 minus the number of deleted characters (counting only characters
151 before the midpoint). Return the approximate edit cost; this is
152 the total number of characters inserted or deleted (counting
153 only characters before the midpoint), unless a heuristic is used
154 to terminate the search prematurely.
155
156 Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script
157 for the left half of the partition is known; similarly for
158 PART->RIGHT_MINIMAL.
159
160 CAVEAT
161 This function assumes that the first characters of the specified
162 portions of the two strings do not match, and likewise that the
163 last characters do not match. The caller must trim matching
164 characters from the beginning and end of the portions it is
165 going to specify.
166
167 If we return the "wrong" partitions, the worst this can do is
168 cause suboptimal diff output. It cannot cause incorrect diff
169 output. */
170
171 static int
diag(int xoff,int xlim,int yoff,int ylim,int minimal,struct partition * part,struct context * ctxt)172 diag (int xoff, int xlim, int yoff, int ylim, int minimal,
173 struct partition *part, struct context *ctxt)
174 {
175 int *const fd = ctxt->fdiag; /* Give the compiler a chance. */
176 int *const bd = ctxt->bdiag; /* Additional help for the compiler. */
177 const char *const xv = ctxt->string[0].data; /* Still more help for the compiler. */
178 const char *const yv = ctxt->string[1].data; /* And more and more . . . */
179 const int dmin = xoff - ylim; /* Minimum valid diagonal. */
180 const int dmax = xlim - yoff; /* Maximum valid diagonal. */
181 const int fmid = xoff - yoff; /* Center diagonal of top-down search. */
182 const int bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
183 int fmin = fmid;
184 int fmax = fmid; /* Limits of top-down search. */
185 int bmin = bmid;
186 int bmax = bmid; /* Limits of bottom-up search. */
187 int c; /* Cost. */
188 int odd = (fmid - bmid) & 1;
189
190 /*
191 * True if southeast corner is on an odd diagonal with respect
192 * to the northwest.
193 */
194 fd[fmid] = xoff;
195 bd[bmid] = xlim;
196 for (c = 1;; ++c)
197 {
198 int d; /* Active diagonal. */
199 int big_snake;
200
201 big_snake = 0;
202 /* Extend the top-down search by an edit step in each diagonal. */
203 if (fmin > dmin)
204 fd[--fmin - 1] = -1;
205 else
206 ++fmin;
207 if (fmax < dmax)
208 fd[++fmax + 1] = -1;
209 else
210 --fmax;
211 for (d = fmax; d >= fmin; d -= 2)
212 {
213 int x;
214 int y;
215 int oldx;
216 int tlo;
217 int thi;
218
219 tlo = fd[d - 1],
220 thi = fd[d + 1];
221
222 if (tlo >= thi)
223 x = tlo + 1;
224 else
225 x = thi;
226 oldx = x;
227 y = x - d;
228 while (x < xlim && y < ylim && xv[x] == yv[y])
229 {
230 ++x;
231 ++y;
232 }
233 if (x - oldx > SNAKE_LIMIT)
234 big_snake = 1;
235 fd[d] = x;
236 if (odd && bmin <= d && d <= bmax && bd[d] <= x)
237 {
238 part->xmid = x;
239 part->ymid = y;
240 part->lo_minimal = part->hi_minimal = 1;
241 return 2 * c - 1;
242 }
243 }
244 /* Similarly extend the bottom-up search. */
245 if (bmin > dmin)
246 bd[--bmin - 1] = INT_MAX;
247 else
248 ++bmin;
249 if (bmax < dmax)
250 bd[++bmax + 1] = INT_MAX;
251 else
252 --bmax;
253 for (d = bmax; d >= bmin; d -= 2)
254 {
255 int x;
256 int y;
257 int oldx;
258 int tlo;
259 int thi;
260
261 tlo = bd[d - 1],
262 thi = bd[d + 1];
263 if (tlo < thi)
264 x = tlo;
265 else
266 x = thi - 1;
267 oldx = x;
268 y = x - d;
269 while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
270 {
271 --x;
272 --y;
273 }
274 if (oldx - x > SNAKE_LIMIT)
275 big_snake = 1;
276 bd[d] = x;
277 if (!odd && fmin <= d && d <= fmax && x <= fd[d])
278 {
279 part->xmid = x;
280 part->ymid = y;
281 part->lo_minimal = part->hi_minimal = 1;
282 return 2 * c;
283 }
284 }
285
286 if (minimal)
287 continue;
288
289 #ifdef MINUS_H_FLAG
290 /* Heuristic: check occasionally for a diagonal that has made lots
291 of progress compared with the edit distance. If we have any
292 such, find the one that has made the most progress and return
293 it as if it had succeeded.
294
295 With this heuristic, for strings with a constant small density
296 of changes, the algorithm is linear in the strings size. */
297 if (c > 200 && big_snake && ctxt->heuristic)
298 {
299 int best;
300
301 best = 0;
302 for (d = fmax; d >= fmin; d -= 2)
303 {
304 int dd;
305 int x;
306 int y;
307 int v;
308
309 dd = d - fmid;
310 x = fd[d];
311 y = x - d;
312 v = (x - xoff) * 2 - dd;
313
314 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
315 {
316 if
317 (
318 v > best
319 &&
320 xoff + SNAKE_LIMIT <= x
321 &&
322 x < xlim
323 &&
324 yoff + SNAKE_LIMIT <= y
325 &&
326 y < ylim
327 )
328 {
329 /* We have a good enough best diagonal; now insist
330 that it end with a significant snake. */
331 int k;
332
333 for (k = 1; xv[x - k] == yv[y - k]; k++)
334 {
335 if (k == SNAKE_LIMIT)
336 {
337 best = v;
338 part->xmid = x;
339 part->ymid = y;
340 break;
341 }
342 }
343 }
344 }
345 }
346 if (best > 0)
347 {
348 part->lo_minimal = 1;
349 part->hi_minimal = 0;
350 return 2 * c - 1;
351 }
352 best = 0;
353 for (d = bmax; d >= bmin; d -= 2)
354 {
355 int dd;
356 int x;
357 int y;
358 int v;
359
360 dd = d - bmid;
361 x = bd[d];
362 y = x - d;
363 v = (xlim - x) * 2 + dd;
364
365 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
366 {
367 if (v > best && xoff < x && x <= xlim - SNAKE_LIMIT &&
368 yoff < y && y <= ylim - SNAKE_LIMIT)
369 {
370 /* We have a good enough best diagonal; now insist
371 that it end with a significant snake. */
372 int k;
373
374 for (k = 0; xv[x + k] == yv[y + k]; k++)
375 {
376 if (k == SNAKE_LIMIT - 1)
377 {
378 best = v;
379 part->xmid = x;
380 part->ymid = y;
381 break;
382 }
383 }
384 }
385 }
386 }
387 if (best > 0)
388 {
389 part->lo_minimal = 0;
390 part->hi_minimal = 1;
391 return 2 * c - 1;
392 }
393 }
394 #endif /* MINUS_H_FLAG */
395
396 /* Heuristic: if we've gone well beyond the call of duty, give up
397 and report halfway between our best results so far. */
398 if (c >= ctxt->too_expensive)
399 {
400 int fxybest;
401 int fxbest;
402 int bxybest;
403 int bxbest;
404
405 /* Pacify `gcc -Wall'. */
406 fxbest = 0;
407 bxbest = 0;
408
409 /* Find forward diagonal that maximizes X + Y. */
410 fxybest = -1;
411 for (d = fmax; d >= fmin; d -= 2)
412 {
413 int x;
414 int y;
415
416 x = fd[d] < xlim ? fd[d] : xlim;
417 y = x - d;
418
419 if (ylim < y)
420 {
421 x = ylim + d;
422 y = ylim;
423 }
424 if (fxybest < x + y)
425 {
426 fxybest = x + y;
427 fxbest = x;
428 }
429 }
430 /* Find backward diagonal that minimizes X + Y. */
431 bxybest = INT_MAX;
432 for (d = bmax; d >= bmin; d -= 2)
433 {
434 int x;
435 int y;
436
437 x = xoff > bd[d] ? xoff : bd[d];
438 y = x - d;
439
440 if (y < yoff)
441 {
442 x = yoff + d;
443 y = yoff;
444 }
445 if (x + y < bxybest)
446 {
447 bxybest = x + y;
448 bxbest = x;
449 }
450 }
451 /* Use the better of the two diagonals. */
452 if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
453 {
454 part->xmid = fxbest;
455 part->ymid = fxybest - fxbest;
456 part->lo_minimal = 1;
457 part->hi_minimal = 0;
458 }
459 else
460 {
461 part->xmid = bxbest;
462 part->ymid = bxybest - bxbest;
463 part->lo_minimal = 0;
464 part->hi_minimal = 1;
465 }
466 return 2 * c - 1;
467 }
468 }
469 }
470
471
472 /* NAME
473 compareseq - find edit sequence
474
475 SYNOPSIS
476 void compareseq(int xoff, int xlim, int yoff, int ylim, int minimal,
477 struct context *ctxt);
478
479 DESCRIPTION
480 Compare in detail contiguous subsequences of the two strings
481 which are known, as a whole, to match each other.
482
483 The subsequence of string 0 is [XOFF, XLIM) and likewise for
484 string 1.
485
486 Note that XLIM, YLIM are exclusive bounds. All character
487 numbers are origin-0.
488
489 If MINIMAL is nonzero, find a minimal difference no matter how
490 expensive it is. */
491
492 static void
compareseq(int xoff,int xlim,int yoff,int ylim,int minimal,struct context * ctxt)493 compareseq (int xoff, int xlim, int yoff, int ylim, int minimal,
494 struct context *ctxt)
495 {
496 const char *const xv = ctxt->string[0].data; /* Help the compiler. */
497 const char *const yv = ctxt->string[1].data;
498
499 /* Slide down the bottom initial diagonal. */
500 while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
501 {
502 ++xoff;
503 ++yoff;
504 }
505
506 /* Slide up the top initial diagonal. */
507 while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
508 {
509 --xlim;
510 --ylim;
511 }
512
513 /* Handle simple cases. */
514 if (xoff == xlim)
515 {
516 while (yoff < ylim)
517 {
518 ctxt->string[1].edit_count++;
519 ++yoff;
520 }
521 }
522 else if (yoff == ylim)
523 {
524 while (xoff < xlim)
525 {
526 ctxt->string[0].edit_count++;
527 ++xoff;
528 }
529 }
530 else
531 {
532 int c;
533 struct partition part;
534
535 /* Find a point of correspondence in the middle of the strings. */
536 c = diag (xoff, xlim, yoff, ylim, minimal, &part, ctxt);
537 if (c == 1)
538 {
539 #if 0
540 /* This should be impossible, because it implies that one of
541 the two subsequences is empty, and that case was handled
542 above without calling `diag'. Let's verify that this is
543 true. */
544 abort ();
545 #else
546 /* The two subsequences differ by a single insert or delete;
547 record it and we are done. */
548 if (part.xmid - part.ymid < xoff - yoff)
549 ctxt->string[1].edit_count++;
550 else
551 ctxt->string[0].edit_count++;
552 #endif
553 }
554 else
555 {
556 /* Use the partitions to split this problem into subproblems. */
557 compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal, ctxt);
558 compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal, ctxt);
559 }
560 }
561 }
562
563
564 /* Because fstrcmp is typically called multiple times, attempt to minimize
565 the number of memory allocations performed. Thus, let a call reuse the
566 memory already allocated by the previous call, if it is sufficient.
567 To make it multithread-safe, without need for a lock that protects the
568 already allocated memory, store the allocated memory per thread. Free
569 it only when the thread exits. */
570
571 static gl_tls_key_t buffer_key; /* TLS key for a 'int *' */
572 static gl_tls_key_t bufmax_key; /* TLS key for a 'size_t' */
573
574 static void
keys_init(void)575 keys_init (void)
576 {
577 gl_tls_key_init (buffer_key, free);
578 gl_tls_key_init (bufmax_key, NULL);
579 /* The per-thread initial values are NULL and 0, respectively. */
580 }
581
582 /* Ensure that keys_init is called once only. */
gl_once_define(static,keys_init_once)583 gl_once_define(static, keys_init_once)
584
585
586 /* NAME
587 fstrcmp - fuzzy string compare
588
589 SYNOPSIS
590 double fstrcmp(const char *, const char *);
591
592 DESCRIPTION
593 The fstrcmp function may be used to compare two string for
594 similarity. It is very useful in reducing "cascade" or
595 "secondary" errors in compilers or other situations where
596 symbol tables occur.
597
598 RETURNS
599 double; 0 if the strings are entirly dissimilar, 1 if the
600 strings are identical, and a number in between if they are
601 similar. */
602
603 double
604 fstrcmp (const char *string1, const char *string2)
605 {
606 struct context ctxt;
607 int i;
608
609 size_t fdiag_len;
610 int *buffer;
611 size_t bufmax;
612
613 /* set the info for each string. */
614 ctxt.string[0].data = string1;
615 ctxt.string[0].data_length = strlen (string1);
616 ctxt.string[1].data = string2;
617 ctxt.string[1].data_length = strlen (string2);
618
619 /* short-circuit obvious comparisons */
620 if (ctxt.string[0].data_length == 0 && ctxt.string[1].data_length == 0)
621 return 1.0;
622 if (ctxt.string[0].data_length == 0 || ctxt.string[1].data_length == 0)
623 return 0.0;
624
625 /* Set TOO_EXPENSIVE to be approximate square root of input size,
626 bounded below by 256. */
627 ctxt.too_expensive = 1;
628 for (i = ctxt.string[0].data_length + ctxt.string[1].data_length;
629 i != 0;
630 i >>= 2)
631 ctxt.too_expensive <<= 1;
632 if (ctxt.too_expensive < 256)
633 ctxt.too_expensive = 256;
634
635 /* Allocate memory for fdiag and bdiag from a thread-local pool. */
636 fdiag_len = ctxt.string[0].data_length + ctxt.string[1].data_length + 3;
637 gl_once (keys_init_once, keys_init);
638 buffer = (int *) gl_tls_get (buffer_key);
639 bufmax = (size_t) (uintptr_t) gl_tls_get (bufmax_key);
640 if (fdiag_len > bufmax)
641 {
642 /* Need more memory. */
643 bufmax = 2 * bufmax;
644 if (fdiag_len > bufmax)
645 bufmax = fdiag_len;
646 /* Calling xrealloc would be a waste: buffer's contents does not need
647 to be preserved. */
648 if (buffer != NULL)
649 free (buffer);
650 buffer = (int *) xmalloc (bufmax * (2 * sizeof (int)));
651 gl_tls_set (buffer_key, buffer);
652 gl_tls_set (bufmax_key, (void *) (uintptr_t) bufmax);
653 }
654 ctxt.fdiag = buffer + ctxt.string[1].data_length + 1;
655 ctxt.bdiag = ctxt.fdiag + fdiag_len;
656
657 /* Now do the main comparison algorithm */
658 ctxt.string[0].edit_count = 0;
659 ctxt.string[1].edit_count = 0;
660 compareseq (0, ctxt.string[0].data_length, 0, ctxt.string[1].data_length, 0,
661 &ctxt);
662
663 /* The result is
664 ((number of chars in common) / (average length of the strings)).
665 This is admittedly biased towards finding that the strings are
666 similar, however it does produce meaningful results. */
667 return ((double) (ctxt.string[0].data_length + ctxt.string[1].data_length
668 - ctxt.string[1].edit_count - ctxt.string[0].edit_count)
669 / (ctxt.string[0].data_length + ctxt.string[1].data_length));
670 }
671