xref: /dflybsd-src/contrib/gcc-8.0/gcc/spellcheck.c (revision 38fd149817dfbff97799f62fcb70be98c4e32523)
1*38fd1498Szrj /* Find near-matches for strings.
2*38fd1498Szrj    Copyright (C) 2015-2018 Free Software Foundation, Inc.
3*38fd1498Szrj 
4*38fd1498Szrj This file is part of GCC.
5*38fd1498Szrj 
6*38fd1498Szrj GCC is free software; you can redistribute it and/or modify it under
7*38fd1498Szrj the terms of the GNU General Public License as published by the Free
8*38fd1498Szrj Software Foundation; either version 3, or (at your option) any later
9*38fd1498Szrj version.
10*38fd1498Szrj 
11*38fd1498Szrj GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12*38fd1498Szrj WARRANTY; without even the implied warranty of MERCHANTABILITY or
13*38fd1498Szrj FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14*38fd1498Szrj for more details.
15*38fd1498Szrj 
16*38fd1498Szrj You should have received a copy of the GNU General Public License
17*38fd1498Szrj along with GCC; see the file COPYING3.  If not see
18*38fd1498Szrj <http://www.gnu.org/licenses/>.  */
19*38fd1498Szrj 
20*38fd1498Szrj #include "config.h"
21*38fd1498Szrj #include "system.h"
22*38fd1498Szrj #include "coretypes.h"
23*38fd1498Szrj #include "tm.h"
24*38fd1498Szrj #include "tree.h"
25*38fd1498Szrj #include "spellcheck.h"
26*38fd1498Szrj #include "selftest.h"
27*38fd1498Szrj 
28*38fd1498Szrj /* The Levenshtein distance is an "edit-distance": the minimal
29*38fd1498Szrj    number of one-character insertions, removals or substitutions
30*38fd1498Szrj    that are needed to change one string into another.
31*38fd1498Szrj 
32*38fd1498Szrj    This implementation uses the Wagner-Fischer algorithm.  */
33*38fd1498Szrj 
34*38fd1498Szrj edit_distance_t
levenshtein_distance(const char * s,int len_s,const char * t,int len_t)35*38fd1498Szrj levenshtein_distance (const char *s, int len_s,
36*38fd1498Szrj 		      const char *t, int len_t)
37*38fd1498Szrj {
38*38fd1498Szrj   const bool debug = false;
39*38fd1498Szrj 
40*38fd1498Szrj   if (debug)
41*38fd1498Szrj     {
42*38fd1498Szrj       printf ("s: \"%s\" (len_s=%i)\n", s, len_s);
43*38fd1498Szrj       printf ("t: \"%s\" (len_t=%i)\n", t, len_t);
44*38fd1498Szrj     }
45*38fd1498Szrj 
46*38fd1498Szrj   if (len_s == 0)
47*38fd1498Szrj     return len_t;
48*38fd1498Szrj   if (len_t == 0)
49*38fd1498Szrj     return len_s;
50*38fd1498Szrj 
51*38fd1498Szrj   /* We effectively build a matrix where each (i, j) contains the
52*38fd1498Szrj      Levenshtein distance between the prefix strings s[0:j]
53*38fd1498Szrj      and t[0:i].
54*38fd1498Szrj      Rather than actually build an (len_t + 1) * (len_s + 1) matrix,
55*38fd1498Szrj      we simply keep track of the last row, v0 and a new row, v1,
56*38fd1498Szrj      which avoids an (len_t + 1) * (len_s + 1) allocation and memory accesses
57*38fd1498Szrj      in favor of two (len_s + 1) allocations.  These could potentially be
58*38fd1498Szrj      statically-allocated if we impose a maximum length on the
59*38fd1498Szrj      strings of interest.  */
60*38fd1498Szrj   edit_distance_t *v0 = new edit_distance_t[len_s + 1];
61*38fd1498Szrj   edit_distance_t *v1 = new edit_distance_t[len_s + 1];
62*38fd1498Szrj 
63*38fd1498Szrj   /* The first row is for the case of an empty target string, which
64*38fd1498Szrj      we can reach by deleting every character in the source string.  */
65*38fd1498Szrj   for (int i = 0; i < len_s + 1; i++)
66*38fd1498Szrj     v0[i] = i;
67*38fd1498Szrj 
68*38fd1498Szrj   /* Build successive rows.  */
69*38fd1498Szrj   for (int i = 0; i < len_t; i++)
70*38fd1498Szrj     {
71*38fd1498Szrj       if (debug)
72*38fd1498Szrj 	{
73*38fd1498Szrj 	  printf ("i:%i v0 = ", i);
74*38fd1498Szrj 	  for (int j = 0; j < len_s + 1; j++)
75*38fd1498Szrj 	    printf ("%i ", v0[j]);
76*38fd1498Szrj 	  printf ("\n");
77*38fd1498Szrj 	}
78*38fd1498Szrj 
79*38fd1498Szrj       /* The initial column is for the case of an empty source string; we
80*38fd1498Szrj 	 can reach prefixes of the target string of length i
81*38fd1498Szrj 	 by inserting i characters.  */
82*38fd1498Szrj       v1[0] = i + 1;
83*38fd1498Szrj 
84*38fd1498Szrj       /* Build the rest of the row by considering neighbors to
85*38fd1498Szrj 	 the north, west and northwest.  */
86*38fd1498Szrj       for (int j = 0; j < len_s; j++)
87*38fd1498Szrj 	{
88*38fd1498Szrj 	  edit_distance_t cost = (s[j] == t[i] ? 0 : 1);
89*38fd1498Szrj 	  edit_distance_t deletion     = v1[j] + 1;
90*38fd1498Szrj 	  edit_distance_t insertion    = v0[j + 1] + 1;
91*38fd1498Szrj 	  edit_distance_t substitution = v0[j] + cost;
92*38fd1498Szrj 	  edit_distance_t cheapest = MIN (deletion, insertion);
93*38fd1498Szrj 	  cheapest = MIN (cheapest, substitution);
94*38fd1498Szrj 	  v1[j + 1] = cheapest;
95*38fd1498Szrj 	}
96*38fd1498Szrj 
97*38fd1498Szrj       /* Prepare to move on to next row.  */
98*38fd1498Szrj       for (int j = 0; j < len_s + 1; j++)
99*38fd1498Szrj 	v0[j] = v1[j];
100*38fd1498Szrj     }
101*38fd1498Szrj 
102*38fd1498Szrj   if (debug)
103*38fd1498Szrj     {
104*38fd1498Szrj       printf ("final v1 = ");
105*38fd1498Szrj       for (int j = 0; j < len_s + 1; j++)
106*38fd1498Szrj 	printf ("%i ", v1[j]);
107*38fd1498Szrj       printf ("\n");
108*38fd1498Szrj     }
109*38fd1498Szrj 
110*38fd1498Szrj   edit_distance_t result = v1[len_s];
111*38fd1498Szrj   delete[] v0;
112*38fd1498Szrj   delete[] v1;
113*38fd1498Szrj   return result;
114*38fd1498Szrj }
115*38fd1498Szrj 
116*38fd1498Szrj /* Calculate Levenshtein distance between two nil-terminated strings.  */
117*38fd1498Szrj 
118*38fd1498Szrj edit_distance_t
levenshtein_distance(const char * s,const char * t)119*38fd1498Szrj levenshtein_distance (const char *s, const char *t)
120*38fd1498Szrj {
121*38fd1498Szrj   return levenshtein_distance (s, strlen (s), t, strlen (t));
122*38fd1498Szrj }
123*38fd1498Szrj 
124*38fd1498Szrj /* Given TARGET, a non-NULL string, and CANDIDATES, a non-NULL ptr to
125*38fd1498Szrj    an autovec of non-NULL strings, determine which element within
126*38fd1498Szrj    CANDIDATES has the lowest edit distance to TARGET.  If there are
127*38fd1498Szrj    multiple elements with the same minimal distance, the first in the
128*38fd1498Szrj    vector wins.
129*38fd1498Szrj 
130*38fd1498Szrj    If more than half of the letters were misspelled, the suggestion is
131*38fd1498Szrj    likely to be meaningless, so return NULL for this case.  */
132*38fd1498Szrj 
133*38fd1498Szrj const char *
find_closest_string(const char * target,const auto_vec<const char * > * candidates)134*38fd1498Szrj find_closest_string (const char *target,
135*38fd1498Szrj 		     const auto_vec<const char *> *candidates)
136*38fd1498Szrj {
137*38fd1498Szrj   gcc_assert (target);
138*38fd1498Szrj   gcc_assert (candidates);
139*38fd1498Szrj 
140*38fd1498Szrj   int i;
141*38fd1498Szrj   const char *candidate;
142*38fd1498Szrj   best_match<const char *, const char *> bm (target);
143*38fd1498Szrj   FOR_EACH_VEC_ELT (*candidates, i, candidate)
144*38fd1498Szrj     {
145*38fd1498Szrj       gcc_assert (candidate);
146*38fd1498Szrj       bm.consider (candidate);
147*38fd1498Szrj     }
148*38fd1498Szrj 
149*38fd1498Szrj   return bm.get_best_meaningful_candidate ();
150*38fd1498Szrj }
151*38fd1498Szrj 
152*38fd1498Szrj #if CHECKING_P
153*38fd1498Szrj 
154*38fd1498Szrj namespace selftest {
155*38fd1498Szrj 
156*38fd1498Szrj /* Selftests.  */
157*38fd1498Szrj 
158*38fd1498Szrj /* Verify that the levenshtein_distance (A, B) equals the expected
159*38fd1498Szrj    value.  */
160*38fd1498Szrj 
161*38fd1498Szrj static void
levenshtein_distance_unit_test_oneway(const char * a,const char * b,edit_distance_t expected)162*38fd1498Szrj levenshtein_distance_unit_test_oneway (const char *a, const char *b,
163*38fd1498Szrj 				       edit_distance_t expected)
164*38fd1498Szrj {
165*38fd1498Szrj   edit_distance_t actual = levenshtein_distance (a, b);
166*38fd1498Szrj   ASSERT_EQ (actual, expected);
167*38fd1498Szrj }
168*38fd1498Szrj 
169*38fd1498Szrj /* Verify that both
170*38fd1498Szrj      levenshtein_distance (A, B)
171*38fd1498Szrj    and
172*38fd1498Szrj      levenshtein_distance (B, A)
173*38fd1498Szrj    equal the expected value, to ensure that the function is symmetric.  */
174*38fd1498Szrj 
175*38fd1498Szrj static void
levenshtein_distance_unit_test(const char * a,const char * b,edit_distance_t expected)176*38fd1498Szrj levenshtein_distance_unit_test (const char *a, const char *b,
177*38fd1498Szrj 				edit_distance_t expected)
178*38fd1498Szrj {
179*38fd1498Szrj   levenshtein_distance_unit_test_oneway (a, b, expected);
180*38fd1498Szrj   levenshtein_distance_unit_test_oneway (b, a, expected);
181*38fd1498Szrj }
182*38fd1498Szrj 
183*38fd1498Szrj /* Verify that find_closest_string is sane.  */
184*38fd1498Szrj 
185*38fd1498Szrj static void
test_find_closest_string()186*38fd1498Szrj test_find_closest_string ()
187*38fd1498Szrj {
188*38fd1498Szrj   auto_vec<const char *> candidates;
189*38fd1498Szrj 
190*38fd1498Szrj   /* Verify that it can handle an empty vec.  */
191*38fd1498Szrj   ASSERT_EQ (NULL, find_closest_string ("", &candidates));
192*38fd1498Szrj 
193*38fd1498Szrj   /* Verify that it works sanely for non-empty vecs.  */
194*38fd1498Szrj   candidates.safe_push ("apple");
195*38fd1498Szrj   candidates.safe_push ("banana");
196*38fd1498Szrj   candidates.safe_push ("cherry");
197*38fd1498Szrj 
198*38fd1498Szrj   ASSERT_STREQ ("apple", find_closest_string ("app", &candidates));
199*38fd1498Szrj   ASSERT_STREQ ("banana", find_closest_string ("banyan", &candidates));
200*38fd1498Szrj   ASSERT_STREQ ("cherry", find_closest_string ("berry", &candidates));
201*38fd1498Szrj   ASSERT_EQ (NULL, find_closest_string ("not like the others", &candidates));
202*38fd1498Szrj 
203*38fd1498Szrj   /* The order of the vec can matter, but it should not matter for these
204*38fd1498Szrj      inputs.  */
205*38fd1498Szrj   candidates.truncate (0);
206*38fd1498Szrj   candidates.safe_push ("cherry");
207*38fd1498Szrj   candidates.safe_push ("banana");
208*38fd1498Szrj   candidates.safe_push ("apple");
209*38fd1498Szrj   ASSERT_STREQ ("apple", find_closest_string ("app", &candidates));
210*38fd1498Szrj   ASSERT_STREQ ("banana", find_closest_string ("banyan", &candidates));
211*38fd1498Szrj   ASSERT_STREQ ("cherry", find_closest_string ("berry", &candidates));
212*38fd1498Szrj   ASSERT_EQ (NULL, find_closest_string ("not like the others", &candidates));
213*38fd1498Szrj 
214*38fd1498Szrj   /* If the goal string somehow makes it into the candidate list, offering
215*38fd1498Szrj      it as a suggestion will be nonsensical.  Verify that we don't offer such
216*38fd1498Szrj      suggestions.  */
217*38fd1498Szrj   ASSERT_EQ (NULL, find_closest_string ("banana", &candidates));
218*38fd1498Szrj }
219*38fd1498Szrj 
220*38fd1498Szrj /* Test data for test_metric_conditions.  */
221*38fd1498Szrj 
222*38fd1498Szrj static const char * const test_data[] = {
223*38fd1498Szrj   "",
224*38fd1498Szrj   "foo",
225*38fd1498Szrj   "food",
226*38fd1498Szrj   "boo",
227*38fd1498Szrj   "1234567890123456789012345678901234567890123456789012345678901234567890"
228*38fd1498Szrj };
229*38fd1498Szrj 
230*38fd1498Szrj /* Verify that levenshtein_distance appears to be a sane distance function,
231*38fd1498Szrj    i.e. the conditions for being a metric.  This is done directly for a
232*38fd1498Szrj    small set of examples, using test_data above.  This is O(N^3) in the size
233*38fd1498Szrj    of the array, due to the test for the triangle inequality, so we keep the
234*38fd1498Szrj    array small.  */
235*38fd1498Szrj 
236*38fd1498Szrj static void
test_metric_conditions()237*38fd1498Szrj test_metric_conditions ()
238*38fd1498Szrj {
239*38fd1498Szrj   const int num_test_cases = sizeof (test_data) / sizeof (test_data[0]);
240*38fd1498Szrj 
241*38fd1498Szrj   for (int i = 0; i < num_test_cases; i++)
242*38fd1498Szrj     {
243*38fd1498Szrj       for (int j = 0; j < num_test_cases; j++)
244*38fd1498Szrj 	{
245*38fd1498Szrj 	  edit_distance_t dist_ij
246*38fd1498Szrj 	    = levenshtein_distance (test_data[i], test_data[j]);
247*38fd1498Szrj 
248*38fd1498Szrj 	  /* Identity of indiscernibles: d(i, j) > 0 iff i == j.  */
249*38fd1498Szrj 	  if (i == j)
250*38fd1498Szrj 	    ASSERT_EQ (dist_ij, 0);
251*38fd1498Szrj 	  else
252*38fd1498Szrj 	    ASSERT_TRUE (dist_ij > 0);
253*38fd1498Szrj 
254*38fd1498Szrj 	  /* Symmetry: d(i, j) == d(j, i).  */
255*38fd1498Szrj 	  edit_distance_t dist_ji
256*38fd1498Szrj 	    = levenshtein_distance (test_data[j], test_data[i]);
257*38fd1498Szrj 	  ASSERT_EQ (dist_ij, dist_ji);
258*38fd1498Szrj 
259*38fd1498Szrj 	  /* Triangle inequality.  */
260*38fd1498Szrj 	  for (int k = 0; k < num_test_cases; k++)
261*38fd1498Szrj 	    {
262*38fd1498Szrj 	      edit_distance_t dist_ik
263*38fd1498Szrj 		= levenshtein_distance (test_data[i], test_data[k]);
264*38fd1498Szrj 	      edit_distance_t dist_jk
265*38fd1498Szrj 		= levenshtein_distance (test_data[j], test_data[k]);
266*38fd1498Szrj 	      ASSERT_TRUE (dist_ik <= dist_ij + dist_jk);
267*38fd1498Szrj 	    }
268*38fd1498Szrj 	}
269*38fd1498Szrj     }
270*38fd1498Szrj }
271*38fd1498Szrj 
272*38fd1498Szrj /* Verify levenshtein_distance for a variety of pairs of pre-canned
273*38fd1498Szrj    inputs, comparing against known-good values.  */
274*38fd1498Szrj 
275*38fd1498Szrj void
spellcheck_c_tests()276*38fd1498Szrj spellcheck_c_tests ()
277*38fd1498Szrj {
278*38fd1498Szrj   levenshtein_distance_unit_test ("", "nonempty", strlen ("nonempty"));
279*38fd1498Szrj   levenshtein_distance_unit_test ("saturday", "sunday", 3);
280*38fd1498Szrj   levenshtein_distance_unit_test ("foo", "m_foo", 2);
281*38fd1498Szrj   levenshtein_distance_unit_test ("hello_world", "HelloWorld", 3);
282*38fd1498Szrj   levenshtein_distance_unit_test
283*38fd1498Szrj     ("the quick brown fox jumps over the lazy dog", "dog", 40);
284*38fd1498Szrj   levenshtein_distance_unit_test
285*38fd1498Szrj     ("the quick brown fox jumps over the lazy dog",
286*38fd1498Szrj      "the quick brown dog jumps over the lazy fox",
287*38fd1498Szrj      4);
288*38fd1498Szrj   levenshtein_distance_unit_test
289*38fd1498Szrj     ("Lorem ipsum dolor sit amet, consectetur adipiscing elit,",
290*38fd1498Szrj      "All your base are belong to us",
291*38fd1498Szrj      44);
292*38fd1498Szrj   levenshtein_distance_unit_test ("foo", "FOO", 3);
293*38fd1498Szrj 
294*38fd1498Szrj   test_find_closest_string ();
295*38fd1498Szrj   test_metric_conditions ();
296*38fd1498Szrj }
297*38fd1498Szrj 
298*38fd1498Szrj } // namespace selftest
299*38fd1498Szrj 
300*38fd1498Szrj #endif /* #if CHECKING_P */
301