xref: /dflybsd-src/contrib/grep/lib/str-kmp.h (revision 91b9ed38d3db6a8a8ac5b66da1d43e6e331e259a)
1200fbe8dSJohn Marino /* Substring search in a NUL terminated string of UNIT elements,
295b7b453SJohn Marino    using the Knuth-Morris-Pratt algorithm.
3*09d4459fSDaniel Fojt    Copyright (C) 2005-2020 Free Software Foundation, Inc.
495b7b453SJohn Marino    Written by Bruno Haible <bruno@clisp.org>, 2005.
595b7b453SJohn Marino 
695b7b453SJohn Marino    This program is free software; you can redistribute it and/or modify
795b7b453SJohn Marino    it under the terms of the GNU General Public License as published by
895b7b453SJohn Marino    the Free Software Foundation; either version 3, or (at your option)
995b7b453SJohn Marino    any later version.
1095b7b453SJohn Marino 
1195b7b453SJohn Marino    This program is distributed in the hope that it will be useful,
1295b7b453SJohn Marino    but WITHOUT ANY WARRANTY; without even the implied warranty of
1395b7b453SJohn Marino    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
1495b7b453SJohn Marino    GNU General Public License for more details.
1595b7b453SJohn Marino 
1695b7b453SJohn Marino    You should have received a copy of the GNU General Public License
17*09d4459fSDaniel Fojt    along with this program; if not, see <https://www.gnu.org/licenses/>.  */
1895b7b453SJohn Marino 
1995b7b453SJohn Marino /* Before including this file, you need to define:
20200fbe8dSJohn Marino      UNIT                    The element type of the needle and haystack.
2195b7b453SJohn Marino      CANON_ELEMENT(c)        A macro that canonicalizes an element right after
22200fbe8dSJohn Marino                              it has been fetched from needle or haystack.
23200fbe8dSJohn Marino                              The argument is of type UNIT; the result must be
24200fbe8dSJohn Marino                              of type UNIT as well.  */
2595b7b453SJohn Marino 
2695b7b453SJohn Marino /* Knuth-Morris-Pratt algorithm.
27*09d4459fSDaniel Fojt    See https://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm
28200fbe8dSJohn Marino    HAYSTACK is the NUL terminated string in which to search for.
29200fbe8dSJohn Marino    NEEDLE is the string to search for in HAYSTACK, consisting of NEEDLE_LEN
30200fbe8dSJohn Marino    units.
3195b7b453SJohn Marino    Return a boolean indicating success:
3295b7b453SJohn Marino    Return true and set *RESULTP if the search was completed.
3395b7b453SJohn Marino    Return false if it was aborted because not enough memory was available.  */
3495b7b453SJohn Marino static bool
knuth_morris_pratt(const UNIT * haystack,const UNIT * needle,size_t needle_len,const UNIT ** resultp)35200fbe8dSJohn Marino knuth_morris_pratt (const UNIT *haystack,
36200fbe8dSJohn Marino                     const UNIT *needle, size_t needle_len,
37200fbe8dSJohn Marino                     const UNIT **resultp)
3895b7b453SJohn Marino {
39200fbe8dSJohn Marino   size_t m = needle_len;
4095b7b453SJohn Marino 
4195b7b453SJohn Marino   /* Allocate the table.  */
4295b7b453SJohn Marino   size_t *table = (size_t *) nmalloca (m, sizeof (size_t));
4395b7b453SJohn Marino   if (table == NULL)
4495b7b453SJohn Marino     return false;
4595b7b453SJohn Marino   /* Fill the table.
4695b7b453SJohn Marino      For 0 < i < m:
4795b7b453SJohn Marino        0 < table[i] <= i is defined such that
4895b7b453SJohn Marino        forall 0 < x < table[i]: needle[x..i-1] != needle[0..i-1-x],
4995b7b453SJohn Marino        and table[i] is as large as possible with this property.
5095b7b453SJohn Marino      This implies:
5195b7b453SJohn Marino      1) For 0 < i < m:
5295b7b453SJohn Marino           If table[i] < i,
5395b7b453SJohn Marino           needle[table[i]..i-1] = needle[0..i-1-table[i]].
5495b7b453SJohn Marino      2) For 0 < i < m:
5595b7b453SJohn Marino           rhaystack[0..i-1] == needle[0..i-1]
5695b7b453SJohn Marino           and exists h, i <= h < m: rhaystack[h] != needle[h]
5795b7b453SJohn Marino           implies
5895b7b453SJohn Marino           forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1].
5995b7b453SJohn Marino      table[0] remains uninitialized.  */
6095b7b453SJohn Marino   {
6195b7b453SJohn Marino     size_t i, j;
6295b7b453SJohn Marino 
6395b7b453SJohn Marino     /* i = 1: Nothing to verify for x = 0.  */
6495b7b453SJohn Marino     table[1] = 1;
6595b7b453SJohn Marino     j = 0;
6695b7b453SJohn Marino 
6795b7b453SJohn Marino     for (i = 2; i < m; i++)
6895b7b453SJohn Marino       {
6995b7b453SJohn Marino         /* Here: j = i-1 - table[i-1].
7095b7b453SJohn Marino            The inequality needle[x..i-1] != needle[0..i-1-x] is known to hold
7195b7b453SJohn Marino            for x < table[i-1], by induction.
7295b7b453SJohn Marino            Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1].  */
73200fbe8dSJohn Marino         UNIT b = CANON_ELEMENT (needle[i - 1]);
7495b7b453SJohn Marino 
7595b7b453SJohn Marino         for (;;)
7695b7b453SJohn Marino           {
7795b7b453SJohn Marino             /* Invariants: The inequality needle[x..i-1] != needle[0..i-1-x]
7895b7b453SJohn Marino                is known to hold for x < i-1-j.
7995b7b453SJohn Marino                Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1].  */
80200fbe8dSJohn Marino             if (b == CANON_ELEMENT (needle[j]))
8195b7b453SJohn Marino               {
8295b7b453SJohn Marino                 /* Set table[i] := i-1-j.  */
8395b7b453SJohn Marino                 table[i] = i - ++j;
8495b7b453SJohn Marino                 break;
8595b7b453SJohn Marino               }
8695b7b453SJohn Marino             /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds
8795b7b453SJohn Marino                for x = i-1-j, because
8895b7b453SJohn Marino                  needle[i-1] != needle[j] = needle[i-1-x].  */
8995b7b453SJohn Marino             if (j == 0)
9095b7b453SJohn Marino               {
9195b7b453SJohn Marino                 /* The inequality holds for all possible x.  */
9295b7b453SJohn Marino                 table[i] = i;
9395b7b453SJohn Marino                 break;
9495b7b453SJohn Marino               }
9595b7b453SJohn Marino             /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds
9695b7b453SJohn Marino                for i-1-j < x < i-1-j+table[j], because for these x:
9795b7b453SJohn Marino                  needle[x..i-2]
9895b7b453SJohn Marino                  = needle[x-(i-1-j)..j-1]
9995b7b453SJohn Marino                  != needle[0..j-1-(x-(i-1-j))]  (by definition of table[j])
10095b7b453SJohn Marino                     = needle[0..i-2-x],
10195b7b453SJohn Marino                hence needle[x..i-1] != needle[0..i-1-x].
10295b7b453SJohn Marino                Furthermore
10395b7b453SJohn Marino                  needle[i-1-j+table[j]..i-2]
10495b7b453SJohn Marino                  = needle[table[j]..j-1]
10595b7b453SJohn Marino                  = needle[0..j-1-table[j]]  (by definition of table[j]).  */
10695b7b453SJohn Marino             j = j - table[j];
10795b7b453SJohn Marino           }
10895b7b453SJohn Marino         /* Here: j = i - table[i].  */
10995b7b453SJohn Marino       }
11095b7b453SJohn Marino   }
11195b7b453SJohn Marino 
11295b7b453SJohn Marino   /* Search, using the table to accelerate the processing.  */
11395b7b453SJohn Marino   {
11495b7b453SJohn Marino     size_t j;
115200fbe8dSJohn Marino     const UNIT *rhaystack;
116200fbe8dSJohn Marino     const UNIT *phaystack;
11795b7b453SJohn Marino 
11895b7b453SJohn Marino     *resultp = NULL;
11995b7b453SJohn Marino     j = 0;
12095b7b453SJohn Marino     rhaystack = haystack;
12195b7b453SJohn Marino     phaystack = haystack;
12295b7b453SJohn Marino     /* Invariant: phaystack = rhaystack + j.  */
123200fbe8dSJohn Marino     while (*phaystack != 0)
124200fbe8dSJohn Marino       if (CANON_ELEMENT (needle[j]) == CANON_ELEMENT (*phaystack))
12595b7b453SJohn Marino         {
12695b7b453SJohn Marino           j++;
12795b7b453SJohn Marino           phaystack++;
12895b7b453SJohn Marino           if (j == m)
12995b7b453SJohn Marino             {
13095b7b453SJohn Marino               /* The entire needle has been found.  */
13195b7b453SJohn Marino               *resultp = rhaystack;
13295b7b453SJohn Marino               break;
13395b7b453SJohn Marino             }
13495b7b453SJohn Marino         }
13595b7b453SJohn Marino       else if (j > 0)
13695b7b453SJohn Marino         {
13795b7b453SJohn Marino           /* Found a match of needle[0..j-1], mismatch at needle[j].  */
13895b7b453SJohn Marino           rhaystack += table[j];
13995b7b453SJohn Marino           j -= table[j];
14095b7b453SJohn Marino         }
14195b7b453SJohn Marino       else
14295b7b453SJohn Marino         {
14395b7b453SJohn Marino           /* Found a mismatch at needle[0] already.  */
14495b7b453SJohn Marino           rhaystack++;
14595b7b453SJohn Marino           phaystack++;
14695b7b453SJohn Marino         }
14795b7b453SJohn Marino   }
14895b7b453SJohn Marino 
14995b7b453SJohn Marino   freea (table);
15095b7b453SJohn Marino   return true;
15195b7b453SJohn Marino }
15295b7b453SJohn Marino 
15395b7b453SJohn Marino #undef CANON_ELEMENT
154