1*38fd1498Szrj /* Find near-matches for strings.
2*38fd1498Szrj Copyright (C) 2015-2018 Free Software Foundation, Inc.
3*38fd1498Szrj
4*38fd1498Szrj This file is part of GCC.
5*38fd1498Szrj
6*38fd1498Szrj GCC is free software; you can redistribute it and/or modify it under
7*38fd1498Szrj the terms of the GNU General Public License as published by the Free
8*38fd1498Szrj Software Foundation; either version 3, or (at your option) any later
9*38fd1498Szrj version.
10*38fd1498Szrj
11*38fd1498Szrj GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12*38fd1498Szrj WARRANTY; without even the implied warranty of MERCHANTABILITY or
13*38fd1498Szrj FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14*38fd1498Szrj for more details.
15*38fd1498Szrj
16*38fd1498Szrj You should have received a copy of the GNU General Public License
17*38fd1498Szrj along with GCC; see the file COPYING3. If not see
18*38fd1498Szrj <http://www.gnu.org/licenses/>. */
19*38fd1498Szrj
20*38fd1498Szrj #include "config.h"
21*38fd1498Szrj #include "system.h"
22*38fd1498Szrj #include "coretypes.h"
23*38fd1498Szrj #include "tm.h"
24*38fd1498Szrj #include "tree.h"
25*38fd1498Szrj #include "spellcheck.h"
26*38fd1498Szrj #include "selftest.h"
27*38fd1498Szrj
28*38fd1498Szrj /* The Levenshtein distance is an "edit-distance": the minimal
29*38fd1498Szrj number of one-character insertions, removals or substitutions
30*38fd1498Szrj that are needed to change one string into another.
31*38fd1498Szrj
32*38fd1498Szrj This implementation uses the Wagner-Fischer algorithm. */
33*38fd1498Szrj
34*38fd1498Szrj edit_distance_t
levenshtein_distance(const char * s,int len_s,const char * t,int len_t)35*38fd1498Szrj levenshtein_distance (const char *s, int len_s,
36*38fd1498Szrj const char *t, int len_t)
37*38fd1498Szrj {
38*38fd1498Szrj const bool debug = false;
39*38fd1498Szrj
40*38fd1498Szrj if (debug)
41*38fd1498Szrj {
42*38fd1498Szrj printf ("s: \"%s\" (len_s=%i)\n", s, len_s);
43*38fd1498Szrj printf ("t: \"%s\" (len_t=%i)\n", t, len_t);
44*38fd1498Szrj }
45*38fd1498Szrj
46*38fd1498Szrj if (len_s == 0)
47*38fd1498Szrj return len_t;
48*38fd1498Szrj if (len_t == 0)
49*38fd1498Szrj return len_s;
50*38fd1498Szrj
51*38fd1498Szrj /* We effectively build a matrix where each (i, j) contains the
52*38fd1498Szrj Levenshtein distance between the prefix strings s[0:j]
53*38fd1498Szrj and t[0:i].
54*38fd1498Szrj Rather than actually build an (len_t + 1) * (len_s + 1) matrix,
55*38fd1498Szrj we simply keep track of the last row, v0 and a new row, v1,
56*38fd1498Szrj which avoids an (len_t + 1) * (len_s + 1) allocation and memory accesses
57*38fd1498Szrj in favor of two (len_s + 1) allocations. These could potentially be
58*38fd1498Szrj statically-allocated if we impose a maximum length on the
59*38fd1498Szrj strings of interest. */
60*38fd1498Szrj edit_distance_t *v0 = new edit_distance_t[len_s + 1];
61*38fd1498Szrj edit_distance_t *v1 = new edit_distance_t[len_s + 1];
62*38fd1498Szrj
63*38fd1498Szrj /* The first row is for the case of an empty target string, which
64*38fd1498Szrj we can reach by deleting every character in the source string. */
65*38fd1498Szrj for (int i = 0; i < len_s + 1; i++)
66*38fd1498Szrj v0[i] = i;
67*38fd1498Szrj
68*38fd1498Szrj /* Build successive rows. */
69*38fd1498Szrj for (int i = 0; i < len_t; i++)
70*38fd1498Szrj {
71*38fd1498Szrj if (debug)
72*38fd1498Szrj {
73*38fd1498Szrj printf ("i:%i v0 = ", i);
74*38fd1498Szrj for (int j = 0; j < len_s + 1; j++)
75*38fd1498Szrj printf ("%i ", v0[j]);
76*38fd1498Szrj printf ("\n");
77*38fd1498Szrj }
78*38fd1498Szrj
79*38fd1498Szrj /* The initial column is for the case of an empty source string; we
80*38fd1498Szrj can reach prefixes of the target string of length i
81*38fd1498Szrj by inserting i characters. */
82*38fd1498Szrj v1[0] = i + 1;
83*38fd1498Szrj
84*38fd1498Szrj /* Build the rest of the row by considering neighbors to
85*38fd1498Szrj the north, west and northwest. */
86*38fd1498Szrj for (int j = 0; j < len_s; j++)
87*38fd1498Szrj {
88*38fd1498Szrj edit_distance_t cost = (s[j] == t[i] ? 0 : 1);
89*38fd1498Szrj edit_distance_t deletion = v1[j] + 1;
90*38fd1498Szrj edit_distance_t insertion = v0[j + 1] + 1;
91*38fd1498Szrj edit_distance_t substitution = v0[j] + cost;
92*38fd1498Szrj edit_distance_t cheapest = MIN (deletion, insertion);
93*38fd1498Szrj cheapest = MIN (cheapest, substitution);
94*38fd1498Szrj v1[j + 1] = cheapest;
95*38fd1498Szrj }
96*38fd1498Szrj
97*38fd1498Szrj /* Prepare to move on to next row. */
98*38fd1498Szrj for (int j = 0; j < len_s + 1; j++)
99*38fd1498Szrj v0[j] = v1[j];
100*38fd1498Szrj }
101*38fd1498Szrj
102*38fd1498Szrj if (debug)
103*38fd1498Szrj {
104*38fd1498Szrj printf ("final v1 = ");
105*38fd1498Szrj for (int j = 0; j < len_s + 1; j++)
106*38fd1498Szrj printf ("%i ", v1[j]);
107*38fd1498Szrj printf ("\n");
108*38fd1498Szrj }
109*38fd1498Szrj
110*38fd1498Szrj edit_distance_t result = v1[len_s];
111*38fd1498Szrj delete[] v0;
112*38fd1498Szrj delete[] v1;
113*38fd1498Szrj return result;
114*38fd1498Szrj }
115*38fd1498Szrj
116*38fd1498Szrj /* Calculate Levenshtein distance between two nil-terminated strings. */
117*38fd1498Szrj
118*38fd1498Szrj edit_distance_t
levenshtein_distance(const char * s,const char * t)119*38fd1498Szrj levenshtein_distance (const char *s, const char *t)
120*38fd1498Szrj {
121*38fd1498Szrj return levenshtein_distance (s, strlen (s), t, strlen (t));
122*38fd1498Szrj }
123*38fd1498Szrj
124*38fd1498Szrj /* Given TARGET, a non-NULL string, and CANDIDATES, a non-NULL ptr to
125*38fd1498Szrj an autovec of non-NULL strings, determine which element within
126*38fd1498Szrj CANDIDATES has the lowest edit distance to TARGET. If there are
127*38fd1498Szrj multiple elements with the same minimal distance, the first in the
128*38fd1498Szrj vector wins.
129*38fd1498Szrj
130*38fd1498Szrj If more than half of the letters were misspelled, the suggestion is
131*38fd1498Szrj likely to be meaningless, so return NULL for this case. */
132*38fd1498Szrj
133*38fd1498Szrj const char *
find_closest_string(const char * target,const auto_vec<const char * > * candidates)134*38fd1498Szrj find_closest_string (const char *target,
135*38fd1498Szrj const auto_vec<const char *> *candidates)
136*38fd1498Szrj {
137*38fd1498Szrj gcc_assert (target);
138*38fd1498Szrj gcc_assert (candidates);
139*38fd1498Szrj
140*38fd1498Szrj int i;
141*38fd1498Szrj const char *candidate;
142*38fd1498Szrj best_match<const char *, const char *> bm (target);
143*38fd1498Szrj FOR_EACH_VEC_ELT (*candidates, i, candidate)
144*38fd1498Szrj {
145*38fd1498Szrj gcc_assert (candidate);
146*38fd1498Szrj bm.consider (candidate);
147*38fd1498Szrj }
148*38fd1498Szrj
149*38fd1498Szrj return bm.get_best_meaningful_candidate ();
150*38fd1498Szrj }
151*38fd1498Szrj
152*38fd1498Szrj #if CHECKING_P
153*38fd1498Szrj
154*38fd1498Szrj namespace selftest {
155*38fd1498Szrj
156*38fd1498Szrj /* Selftests. */
157*38fd1498Szrj
158*38fd1498Szrj /* Verify that the levenshtein_distance (A, B) equals the expected
159*38fd1498Szrj value. */
160*38fd1498Szrj
161*38fd1498Szrj static void
levenshtein_distance_unit_test_oneway(const char * a,const char * b,edit_distance_t expected)162*38fd1498Szrj levenshtein_distance_unit_test_oneway (const char *a, const char *b,
163*38fd1498Szrj edit_distance_t expected)
164*38fd1498Szrj {
165*38fd1498Szrj edit_distance_t actual = levenshtein_distance (a, b);
166*38fd1498Szrj ASSERT_EQ (actual, expected);
167*38fd1498Szrj }
168*38fd1498Szrj
169*38fd1498Szrj /* Verify that both
170*38fd1498Szrj levenshtein_distance (A, B)
171*38fd1498Szrj and
172*38fd1498Szrj levenshtein_distance (B, A)
173*38fd1498Szrj equal the expected value, to ensure that the function is symmetric. */
174*38fd1498Szrj
175*38fd1498Szrj static void
levenshtein_distance_unit_test(const char * a,const char * b,edit_distance_t expected)176*38fd1498Szrj levenshtein_distance_unit_test (const char *a, const char *b,
177*38fd1498Szrj edit_distance_t expected)
178*38fd1498Szrj {
179*38fd1498Szrj levenshtein_distance_unit_test_oneway (a, b, expected);
180*38fd1498Szrj levenshtein_distance_unit_test_oneway (b, a, expected);
181*38fd1498Szrj }
182*38fd1498Szrj
183*38fd1498Szrj /* Verify that find_closest_string is sane. */
184*38fd1498Szrj
185*38fd1498Szrj static void
test_find_closest_string()186*38fd1498Szrj test_find_closest_string ()
187*38fd1498Szrj {
188*38fd1498Szrj auto_vec<const char *> candidates;
189*38fd1498Szrj
190*38fd1498Szrj /* Verify that it can handle an empty vec. */
191*38fd1498Szrj ASSERT_EQ (NULL, find_closest_string ("", &candidates));
192*38fd1498Szrj
193*38fd1498Szrj /* Verify that it works sanely for non-empty vecs. */
194*38fd1498Szrj candidates.safe_push ("apple");
195*38fd1498Szrj candidates.safe_push ("banana");
196*38fd1498Szrj candidates.safe_push ("cherry");
197*38fd1498Szrj
198*38fd1498Szrj ASSERT_STREQ ("apple", find_closest_string ("app", &candidates));
199*38fd1498Szrj ASSERT_STREQ ("banana", find_closest_string ("banyan", &candidates));
200*38fd1498Szrj ASSERT_STREQ ("cherry", find_closest_string ("berry", &candidates));
201*38fd1498Szrj ASSERT_EQ (NULL, find_closest_string ("not like the others", &candidates));
202*38fd1498Szrj
203*38fd1498Szrj /* The order of the vec can matter, but it should not matter for these
204*38fd1498Szrj inputs. */
205*38fd1498Szrj candidates.truncate (0);
206*38fd1498Szrj candidates.safe_push ("cherry");
207*38fd1498Szrj candidates.safe_push ("banana");
208*38fd1498Szrj candidates.safe_push ("apple");
209*38fd1498Szrj ASSERT_STREQ ("apple", find_closest_string ("app", &candidates));
210*38fd1498Szrj ASSERT_STREQ ("banana", find_closest_string ("banyan", &candidates));
211*38fd1498Szrj ASSERT_STREQ ("cherry", find_closest_string ("berry", &candidates));
212*38fd1498Szrj ASSERT_EQ (NULL, find_closest_string ("not like the others", &candidates));
213*38fd1498Szrj
214*38fd1498Szrj /* If the goal string somehow makes it into the candidate list, offering
215*38fd1498Szrj it as a suggestion will be nonsensical. Verify that we don't offer such
216*38fd1498Szrj suggestions. */
217*38fd1498Szrj ASSERT_EQ (NULL, find_closest_string ("banana", &candidates));
218*38fd1498Szrj }
219*38fd1498Szrj
220*38fd1498Szrj /* Test data for test_metric_conditions. */
221*38fd1498Szrj
222*38fd1498Szrj static const char * const test_data[] = {
223*38fd1498Szrj "",
224*38fd1498Szrj "foo",
225*38fd1498Szrj "food",
226*38fd1498Szrj "boo",
227*38fd1498Szrj "1234567890123456789012345678901234567890123456789012345678901234567890"
228*38fd1498Szrj };
229*38fd1498Szrj
230*38fd1498Szrj /* Verify that levenshtein_distance appears to be a sane distance function,
231*38fd1498Szrj i.e. the conditions for being a metric. This is done directly for a
232*38fd1498Szrj small set of examples, using test_data above. This is O(N^3) in the size
233*38fd1498Szrj of the array, due to the test for the triangle inequality, so we keep the
234*38fd1498Szrj array small. */
235*38fd1498Szrj
236*38fd1498Szrj static void
test_metric_conditions()237*38fd1498Szrj test_metric_conditions ()
238*38fd1498Szrj {
239*38fd1498Szrj const int num_test_cases = sizeof (test_data) / sizeof (test_data[0]);
240*38fd1498Szrj
241*38fd1498Szrj for (int i = 0; i < num_test_cases; i++)
242*38fd1498Szrj {
243*38fd1498Szrj for (int j = 0; j < num_test_cases; j++)
244*38fd1498Szrj {
245*38fd1498Szrj edit_distance_t dist_ij
246*38fd1498Szrj = levenshtein_distance (test_data[i], test_data[j]);
247*38fd1498Szrj
248*38fd1498Szrj /* Identity of indiscernibles: d(i, j) > 0 iff i == j. */
249*38fd1498Szrj if (i == j)
250*38fd1498Szrj ASSERT_EQ (dist_ij, 0);
251*38fd1498Szrj else
252*38fd1498Szrj ASSERT_TRUE (dist_ij > 0);
253*38fd1498Szrj
254*38fd1498Szrj /* Symmetry: d(i, j) == d(j, i). */
255*38fd1498Szrj edit_distance_t dist_ji
256*38fd1498Szrj = levenshtein_distance (test_data[j], test_data[i]);
257*38fd1498Szrj ASSERT_EQ (dist_ij, dist_ji);
258*38fd1498Szrj
259*38fd1498Szrj /* Triangle inequality. */
260*38fd1498Szrj for (int k = 0; k < num_test_cases; k++)
261*38fd1498Szrj {
262*38fd1498Szrj edit_distance_t dist_ik
263*38fd1498Szrj = levenshtein_distance (test_data[i], test_data[k]);
264*38fd1498Szrj edit_distance_t dist_jk
265*38fd1498Szrj = levenshtein_distance (test_data[j], test_data[k]);
266*38fd1498Szrj ASSERT_TRUE (dist_ik <= dist_ij + dist_jk);
267*38fd1498Szrj }
268*38fd1498Szrj }
269*38fd1498Szrj }
270*38fd1498Szrj }
271*38fd1498Szrj
272*38fd1498Szrj /* Verify levenshtein_distance for a variety of pairs of pre-canned
273*38fd1498Szrj inputs, comparing against known-good values. */
274*38fd1498Szrj
275*38fd1498Szrj void
spellcheck_c_tests()276*38fd1498Szrj spellcheck_c_tests ()
277*38fd1498Szrj {
278*38fd1498Szrj levenshtein_distance_unit_test ("", "nonempty", strlen ("nonempty"));
279*38fd1498Szrj levenshtein_distance_unit_test ("saturday", "sunday", 3);
280*38fd1498Szrj levenshtein_distance_unit_test ("foo", "m_foo", 2);
281*38fd1498Szrj levenshtein_distance_unit_test ("hello_world", "HelloWorld", 3);
282*38fd1498Szrj levenshtein_distance_unit_test
283*38fd1498Szrj ("the quick brown fox jumps over the lazy dog", "dog", 40);
284*38fd1498Szrj levenshtein_distance_unit_test
285*38fd1498Szrj ("the quick brown fox jumps over the lazy dog",
286*38fd1498Szrj "the quick brown dog jumps over the lazy fox",
287*38fd1498Szrj 4);
288*38fd1498Szrj levenshtein_distance_unit_test
289*38fd1498Szrj ("Lorem ipsum dolor sit amet, consectetur adipiscing elit,",
290*38fd1498Szrj "All your base are belong to us",
291*38fd1498Szrj 44);
292*38fd1498Szrj levenshtein_distance_unit_test ("foo", "FOO", 3);
293*38fd1498Szrj
294*38fd1498Szrj test_find_closest_string ();
295*38fd1498Szrj test_metric_conditions ();
296*38fd1498Szrj }
297*38fd1498Szrj
298*38fd1498Szrj } // namespace selftest
299*38fd1498Szrj
300*38fd1498Szrj #endif /* #if CHECKING_P */
301