xref: /netbsd-src/external/gpl3/gcc/dist/gcc/spellcheck.cc (revision b1e838363e3c6fc78a55519254d99869742dd33c)
1 /* Find near-matches for strings.
2    Copyright (C) 2015-2022 Free Software Foundation, Inc.
3 
4 This file is part of GCC.
5 
6 GCC is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 3, or (at your option) any later
9 version.
10 
11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3.  If not see
18 <http://www.gnu.org/licenses/>.  */
19 
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "tree.h"
25 #include "spellcheck.h"
26 #include "selftest.h"
27 
28 /* Cost of a case transformation.  */
29 #define CASE_COST 1
30 
31 /* Cost of another kind of edit.  */
32 #define BASE_COST 2
33 
34 /* Get the edit distance between the two strings: the minimal
35    number of edits that are needed to change one string into another,
36    where edits can be one-character insertions, removals, or substitutions,
37    or transpositions of two adjacent characters (counting as one "edit").
38 
39    This implementation uses a modified variant of the Wagner-Fischer
40    algorithm for the Damerau-Levenshtein distance; specifically, the
41    "optimal string alignment distance" or "restricted edit distance"
42    variant.  This implementation has been further modified to take
43    case into account.  */
44 
45 edit_distance_t
get_edit_distance(const char * s,int len_s,const char * t,int len_t)46 get_edit_distance (const char *s, int len_s,
47 		   const char *t, int len_t)
48 {
49   const bool debug = false;
50 
51   if (debug)
52     {
53       printf ("s: \"%s\" (len_s=%i)\n", s, len_s);
54       printf ("t: \"%s\" (len_t=%i)\n", t, len_t);
55     }
56 
57   if (len_s == 0)
58     return BASE_COST * len_t;
59   if (len_t == 0)
60     return BASE_COST * len_s;
61 
62   /* We effectively build a matrix where each (i, j) contains the
63      distance between the prefix strings s[0:j] and t[0:i].
64      Rather than actually build an (len_t + 1) * (len_s + 1) matrix,
65      we simply keep track of the last two rows, v_one_ago and v_two_ago,
66      and a new row, v_next, which avoids an (len_t + 1) * (len_s + 1)
67      allocation and memory accesses in favor of three (len_s + 1)
68      allocations.  These could potentially be
69      statically-allocated if we impose a maximum length on the
70      strings of interest.  */
71   edit_distance_t *v_two_ago = new edit_distance_t[len_s + 1];
72   edit_distance_t *v_one_ago = new edit_distance_t[len_s + 1];
73   edit_distance_t *v_next = new edit_distance_t[len_s + 1];
74 
75   /* The first row is for the case of an empty target string, which
76      we can reach by deleting every character in the source string.  */
77   for (int i = 0; i < len_s + 1; i++)
78     v_one_ago[i] = i * BASE_COST;
79 
80   /* Build successive rows.  */
81   for (int i = 0; i < len_t; i++)
82     {
83       if (debug)
84 	{
85 	  printf ("i:%i v_one_ago = ", i);
86 	  for (int j = 0; j < len_s + 1; j++)
87 	    printf ("%i ", v_one_ago[j]);
88 	  printf ("\n");
89 	}
90 
91       /* The initial column is for the case of an empty source string; we
92 	 can reach prefixes of the target string of length i
93 	 by inserting i characters.  */
94       v_next[0] = (i + 1) * BASE_COST;
95 
96       /* Build the rest of the row by considering neighbors to
97 	 the north, west and northwest.  */
98       for (int j = 0; j < len_s; j++)
99 	{
100 	  edit_distance_t cost;
101 
102 	  if (s[j] == t[i])
103 	    cost = 0;
104 	  else if (TOLOWER (s[j]) == TOLOWER (t[i]))
105 	    cost = CASE_COST;
106 	  else
107 	    cost = BASE_COST;
108 	  edit_distance_t deletion     = v_next[j] + BASE_COST;
109 	  edit_distance_t insertion    = v_one_ago[j + 1] + BASE_COST;
110 	  edit_distance_t substitution = v_one_ago[j] + cost;
111 	  edit_distance_t cheapest = MIN (deletion, insertion);
112 	  cheapest = MIN (cheapest, substitution);
113 	  if (i > 0 && j > 0 && s[j] == t[i - 1] && s[j - 1] == t[i])
114 	    {
115 	      edit_distance_t transposition = v_two_ago[j - 1] + BASE_COST;
116 	      cheapest = MIN (cheapest, transposition);
117 	    }
118 	  v_next[j + 1] = cheapest;
119 	}
120 
121       /* Prepare to move on to next row.  */
122       for (int j = 0; j < len_s + 1; j++)
123 	{
124 	  v_two_ago[j] = v_one_ago[j];
125 	  v_one_ago[j] = v_next[j];
126 	}
127     }
128 
129   if (debug)
130     {
131       printf ("final v_next = ");
132       for (int j = 0; j < len_s + 1; j++)
133 	printf ("%i ", v_next[j]);
134       printf ("\n");
135     }
136 
137   edit_distance_t result = v_next[len_s];
138   delete[] v_two_ago;
139   delete[] v_one_ago;
140   delete[] v_next;
141   return result;
142 }
143 
144 /* Get the edit distance between two nil-terminated strings.  */
145 
146 edit_distance_t
get_edit_distance(const char * s,const char * t)147 get_edit_distance (const char *s, const char *t)
148 {
149   return get_edit_distance (s, strlen (s), t, strlen (t));
150 }
151 
152 /* Given TARGET, a non-NULL string, and CANDIDATES, a non-NULL ptr to
153    an autovec of non-NULL strings, determine which element within
154    CANDIDATES has the lowest edit distance to TARGET.  If there are
155    multiple elements with the same minimal distance, the first in the
156    vector wins.
157 
158    If more than half of the letters were misspelled, the suggestion is
159    likely to be meaningless, so return NULL for this case.  */
160 
161 const char *
find_closest_string(const char * target,const auto_vec<const char * > * candidates)162 find_closest_string (const char *target,
163 		     const auto_vec<const char *> *candidates)
164 {
165   gcc_assert (target);
166   gcc_assert (candidates);
167 
168   int i;
169   const char *candidate;
170   best_match<const char *, const char *> bm (target);
171   FOR_EACH_VEC_ELT (*candidates, i, candidate)
172     {
173       gcc_assert (candidate);
174       bm.consider (candidate);
175     }
176 
177   return bm.get_best_meaningful_candidate ();
178 }
179 
180 /* Generate the maximum edit distance for which we consider a suggestion
181    to be meaningful, given a goal of length GOAL_LEN and a candidate of
182    length CANDIDATE_LEN.
183 
184    This is a third of the length of the candidate or of the goal,
185    whichever is bigger.  */
186 
187 edit_distance_t
get_edit_distance_cutoff(size_t goal_len,size_t candidate_len)188 get_edit_distance_cutoff (size_t goal_len, size_t candidate_len)
189 {
190   size_t max_length = MAX (goal_len, candidate_len);
191   size_t min_length = MIN (goal_len, candidate_len);
192 
193   gcc_assert (max_length >= min_length);
194 
195   /* Special case: don't offer suggestions for a pair of
196      length == 1 strings (or empty strings).  */
197   if (max_length <= 1)
198     return 0;
199 
200   /* If the lengths are close, then round down.  */
201   if (max_length - min_length <= 1)
202     /* ...but allow an edit distance of at least 1.  */
203     return BASE_COST * MAX (max_length / 3, 1);
204 
205   /* Otherwise, round up (thus giving a little extra leeway to some cases
206      involving insertions/deletions).  */
207   return BASE_COST * (max_length + 2) / 3;
208 }
209 
210 #if CHECKING_P
211 
212 namespace selftest {
213 
214 /* Selftests.  */
215 
216 /* Verify that get_edit_distance (A, B) equals the expected value.  */
217 
218 static void
test_get_edit_distance_one_way(const char * a,const char * b,edit_distance_t expected)219 test_get_edit_distance_one_way (const char *a, const char *b,
220 				edit_distance_t expected)
221 {
222   edit_distance_t actual = get_edit_distance (a, b);
223   ASSERT_EQ (actual, expected);
224 }
225 
226 /* Verify that both
227      get_edit_distance (A, B)
228    and
229      get_edit_distance (B, A)
230    equal the expected value, to ensure that the function is symmetric.  */
231 
232 static void
test_get_edit_distance_both_ways(const char * a,const char * b,edit_distance_t expected)233 test_get_edit_distance_both_ways (const char *a, const char *b,
234 			     edit_distance_t expected)
235 {
236   test_get_edit_distance_one_way (a, b, expected);
237   test_get_edit_distance_one_way (b, a, expected);
238 }
239 
240 /* Verify get_edit_distance for a variety of pairs of pre-canned
241    inputs, comparing against known-good values.  */
242 
243 static void
test_edit_distances()244 test_edit_distances ()
245 {
246   test_get_edit_distance_both_ways ("", "nonempty",
247 				    BASE_COST * strlen ("nonempty"));
248   test_get_edit_distance_both_ways ("saturday", "sunday",
249 				    BASE_COST * 3);
250   test_get_edit_distance_both_ways ("foo", "m_foo", BASE_COST * 2);
251   test_get_edit_distance_both_ways ("hello_world", "HelloWorld", 4);
252   test_get_edit_distance_both_ways
253     ("the quick brown fox jumps over the lazy dog", "dog", BASE_COST * 40);
254   test_get_edit_distance_both_ways
255     ("the quick brown fox jumps over the lazy dog",
256      "the quick brown dog jumps over the lazy fox",
257      BASE_COST * 4);
258   test_get_edit_distance_both_ways
259     ("Lorem ipsum dolor sit amet, consectetur adipiscing elit,",
260      "All your base are belong to us",
261      BASE_COST * 44);
262   test_get_edit_distance_both_ways ("foo", "FOO", 3);
263   test_get_edit_distance_both_ways ("fee", "deed", BASE_COST * 2);
264   test_get_edit_distance_both_ways ("coorzd1", "coordx1", BASE_COST * 2);
265   test_get_edit_distance_both_ways ("assert", "sqrt", BASE_COST * 3);
266   test_get_edit_distance_both_ways ("PATH_MAX", "INT8_MAX", BASE_COST * 3);
267   test_get_edit_distance_both_ways ("time", "nice", BASE_COST * 2);
268   test_get_edit_distance_both_ways ("bar", "carg", BASE_COST * 2);
269   test_get_edit_distance_both_ways ("gtk_widget_show_all",
270 				    "GtkWidgetShowAll", 10);
271   test_get_edit_distance_both_ways ("m_bar", "bar", BASE_COST * 2);
272   test_get_edit_distance_both_ways ("MACRO", "MACRAME", BASE_COST * 3);
273   test_get_edit_distance_both_ways ("ab", "ac", BASE_COST * 1);
274   test_get_edit_distance_both_ways ("ab", "a", BASE_COST * 1);
275   test_get_edit_distance_both_ways ("a", "b", BASE_COST * 1);
276   test_get_edit_distance_both_ways ("nanl", "name", BASE_COST * 2);
277   test_get_edit_distance_both_ways ("char", "bar", BASE_COST * 2);
278   test_get_edit_distance_both_ways ("-optimize", "fsanitize", BASE_COST * 5);
279   test_get_edit_distance_both_ways ("__DATE__", "__i386__", BASE_COST * 4);
280 
281   /* Examples where transposition helps.  */
282   test_get_edit_distance_both_ways ("ab", "ba", BASE_COST * 1);
283   test_get_edit_distance_both_ways ("ba", "abc", BASE_COST * 2);
284   test_get_edit_distance_both_ways ("coorzd1", "coordz1", BASE_COST * 1);
285   test_get_edit_distance_both_ways ("abcdefghijklmnopqrstuvwxyz",
286 				    "bacdefghijklmnopqrstuvwxzy",
287 				    BASE_COST * 2);
288   test_get_edit_distance_both_ways ("saturday", "sundya", BASE_COST * 4);
289   test_get_edit_distance_both_ways ("signed", "singed", BASE_COST * 1);
290 }
291 
292 /* Subroutine of test_get_edit_distance_cutoff, for emulating the
293    spellchecking cutoff in up to GCC 8.  */
294 
295 static edit_distance_t
get_old_cutoff(size_t goal_len,size_t candidate_len)296 get_old_cutoff (size_t goal_len, size_t candidate_len)
297 {
298   return BASE_COST * MAX (goal_len, candidate_len) / 2;
299 }
300 
301 /* Verify that the cutoff for "meaningfulness" of suggestions is at least as
302    conservative as in older GCC releases.
303 
304    This should ensure that we don't offer additional meaningless
305    suggestions (apart from those for which transposition has helped).  */
306 
307 static void
test_get_edit_distance_cutoff()308 test_get_edit_distance_cutoff ()
309 {
310   for (size_t goal_len = 0; goal_len < 30; goal_len++)
311     for (size_t candidate_len = 0; candidate_len < 30; candidate_len++)
312       ASSERT_TRUE (get_edit_distance_cutoff (goal_len, candidate_len)
313 		   <= get_old_cutoff (goal_len, candidate_len));
314 }
315 
316 /* Assert that CANDIDATE is offered as a suggestion for TARGET.  */
317 
318 static void
assert_suggested_for(const location & loc,const char * candidate,const char * target)319 assert_suggested_for (const location &loc, const char *candidate,
320 		      const char *target)
321 {
322   auto_vec<const char *> candidates;
323   candidates.safe_push (candidate);
324   ASSERT_EQ_AT (loc, candidate, find_closest_string (target, &candidates));
325 }
326 
327 /* Assert that CANDIDATE is offered as a suggestion for TARGET.  */
328 
329 #define ASSERT_SUGGESTED_FOR(CANDIDATE, TARGET)			\
330   SELFTEST_BEGIN_STMT							\
331     assert_suggested_for (SELFTEST_LOCATION, CANDIDATE, TARGET);	\
332   SELFTEST_END_STMT
333 
334 /* Assert that CANDIDATE is not offered as a suggestion for TARGET.  */
335 
336 static void
assert_not_suggested_for(const location & loc,const char * candidate,const char * target)337 assert_not_suggested_for (const location &loc, const char *candidate,
338 			  const char *target)
339 {
340   auto_vec<const char *> candidates;
341   candidates.safe_push (candidate);
342   ASSERT_EQ_AT (loc, NULL, find_closest_string (target, &candidates));
343 }
344 
345 /* Assert that CANDIDATE is not offered as a suggestion for TARGET.  */
346 
347 #define ASSERT_NOT_SUGGESTED_FOR(CANDIDATE, TARGET)			\
348   SELFTEST_BEGIN_STMT							\
349     assert_not_suggested_for (SELFTEST_LOCATION, CANDIDATE, TARGET);	\
350   SELFTEST_END_STMT
351 
352 /* Verify that we offer varous suggestions that are meaningful,
353    and that we don't offer various other ones that aren't (PR c/82967).  */
354 
355 static void
test_suggestions()356 test_suggestions ()
357 {
358   /* Good suggestions.  */
359 
360   ASSERT_SUGGESTED_FOR ("m_bar", "bar");
361   // dist == 2, max_length == 5, min_length == 3
362 
363   ASSERT_SUGGESTED_FOR ("MACRO", "MACRAME");
364   // dist == 3, max_length == 7, min_length == 5
365 
366   ASSERT_SUGGESTED_FOR ("gtk_widget_show_all", "GtkWidgetShowAll");
367   // dist == 7, max_length == 16, min_length = 19
368 
369   ASSERT_SUGGESTED_FOR ("ab", "ac");
370   // dist == 1, max_length == min_length = 2
371 
372   ASSERT_SUGGESTED_FOR ("ab", "a");
373   // dist == 1, max_length == 2, min_length = 1
374 
375   /* Bad suggestions.  */
376 
377   ASSERT_NOT_SUGGESTED_FOR ("a", "b");
378   // dist == 1, max_length == min_length = 1
379 
380   ASSERT_NOT_SUGGESTED_FOR ("sqrt", "assert");
381   // dist == 3, max_length 6, min_length == 4
382 
383   ASSERT_NOT_SUGGESTED_FOR ("INT8_MAX", "PATH_MAX");
384   // dist == 3, max_length == min_length == 8
385 
386   ASSERT_NOT_SUGGESTED_FOR ("nice", "time");
387   ASSERT_NOT_SUGGESTED_FOR ("nanl", "name");
388   // dist == 2, max_length == min_length == 4
389 
390   ASSERT_NOT_SUGGESTED_FOR ("carg", "bar");
391   ASSERT_NOT_SUGGESTED_FOR ("char", "bar");
392   // dist == 2, max_length == 4, min_length == 3
393 
394   ASSERT_NOT_SUGGESTED_FOR ("-optimize", "fsanitize");
395   // dist == 5, max_length == min_length == 9
396 
397   ASSERT_NOT_SUGGESTED_FOR ("__DATE__", "__i386__");
398   // dist == 4, max_length == min_length == 8
399 
400   ASSERT_NOT_SUGGESTED_FOR ("start_input_device", "InputDevice");
401   // dist == 9, max_length == 18, min_length == 11
402 }
403 
404 /* Verify that find_closest_string is sane.  */
405 
406 static void
test_find_closest_string()407 test_find_closest_string ()
408 {
409   auto_vec<const char *> candidates;
410 
411   /* Verify that it can handle an empty vec.  */
412   ASSERT_EQ (NULL, find_closest_string ("", &candidates));
413 
414   /* Verify that it works sanely for non-empty vecs.  */
415   candidates.safe_push ("apple");
416   candidates.safe_push ("banana");
417   candidates.safe_push ("cherry");
418 
419   ASSERT_STREQ ("apple", find_closest_string ("app", &candidates));
420   ASSERT_STREQ ("banana", find_closest_string ("banyan", &candidates));
421   ASSERT_STREQ ("cherry", find_closest_string ("berry", &candidates));
422   ASSERT_EQ (NULL, find_closest_string ("not like the others", &candidates));
423 
424   /* The order of the vec can matter, but it should not matter for these
425      inputs.  */
426   candidates.truncate (0);
427   candidates.safe_push ("cherry");
428   candidates.safe_push ("banana");
429   candidates.safe_push ("apple");
430   ASSERT_STREQ ("apple", find_closest_string ("app", &candidates));
431   ASSERT_STREQ ("banana", find_closest_string ("banyan", &candidates));
432   ASSERT_STREQ ("cherry", find_closest_string ("berry", &candidates));
433   ASSERT_EQ (NULL, find_closest_string ("not like the others", &candidates));
434 
435   /* If the goal string somehow makes it into the candidate list, offering
436      it as a suggestion will be nonsensical.  Verify that we don't offer such
437      suggestions.  */
438   ASSERT_EQ (NULL, find_closest_string ("banana", &candidates));
439 
440   /* Example from PR 69968 where transposition helps.  */
441   candidates.truncate (0);
442   candidates.safe_push("coordx");
443   candidates.safe_push("coordy");
444   candidates.safe_push("coordz");
445   candidates.safe_push("coordx1");
446   candidates.safe_push("coordy1");
447   candidates.safe_push("coordz1");
448   ASSERT_STREQ ("coordz1", find_closest_string ("coorzd1", &candidates));
449 
450   candidates.truncate (0);
451   candidates.safe_push ("DWARF_GNAT_ENCODINGS_GDB");
452   candidates.safe_push ("DWARF_GNAT_ENCODINGS_ALL");
453   candidates.safe_push ("DWARF_GNAT_ENCODINGS_MINIMAL");
454   ASSERT_STREQ ("DWARF_GNAT_ENCODINGS_ALL",
455 		find_closest_string ("DWARF_GNAT_ENCODINGS_all",
456 				     &candidates));
457 
458   /* The same as the previous test, but with a different order of
459      candidates.  */
460   candidates.truncate (0);
461   candidates.safe_push ("DWARF_GNAT_ENCODINGS_ALL");
462   candidates.safe_push ("DWARF_GNAT_ENCODINGS_GDB");
463   candidates.safe_push ("DWARF_GNAT_ENCODINGS_MINIMAL");
464   ASSERT_STREQ ("DWARF_GNAT_ENCODINGS_ALL",
465 		find_closest_string ("DWARF_GNAT_ENCODINGS_all",
466 				     &candidates));
467 }
468 
469 /* Test data for test_metric_conditions.  */
470 
471 static const char * const test_data[] = {
472   "",
473   "foo",
474   "food",
475   "boo",
476   "1234567890123456789012345678901234567890123456789012345678901234567890",
477   "abc",
478   "ac",
479   "ca",
480 };
481 
482 /* Verify that get_edit_distance appears to be a sane distance function,
483    even though it doesn't satisfy the conditions for being a metric.  (This
484    is because the triangle inequality fails to hold: the distance between
485    "ca" and "ac" is 1, and so is the distance between "abc" and "ac", but
486    the distance between "abc" and "ca" is 3. Algorithms that calculate the
487    true Levenshtein-Damerau metric are much more expensive.)  */
488 
489 static void
test_metric_conditions()490 test_metric_conditions ()
491 {
492   const int num_test_cases = sizeof (test_data) / sizeof (test_data[0]);
493 
494   for (int i = 0; i < num_test_cases; i++)
495     {
496       for (int j = 0; j < num_test_cases; j++)
497 	{
498 	  edit_distance_t dist_ij
499 	    = get_edit_distance (test_data[i], test_data[j]);
500 
501 	  /* Identity of indiscernibles: d(i, j) > 0 iff i == j.  */
502 	  if (i == j)
503 	    ASSERT_EQ (dist_ij, 0);
504 	  else
505 	    ASSERT_TRUE (dist_ij > 0);
506 
507 	  /* Symmetry: d(i, j) == d(j, i).  */
508 	  edit_distance_t dist_ji
509 	    = get_edit_distance (test_data[j], test_data[i]);
510 	  ASSERT_EQ (dist_ij, dist_ji);
511 	}
512     }
513 }
514 
515 /* Run all of the selftests within this file.  */
516 
517 void
spellcheck_cc_tests()518 spellcheck_cc_tests ()
519 {
520   test_edit_distances ();
521   test_get_edit_distance_cutoff ();
522   test_suggestions ();
523   test_find_closest_string ();
524   test_metric_conditions ();
525 }
526 
527 } // namespace selftest
528 
529 #endif /* #if CHECKING_P */
530