1dc7c36e4SJohn Marino /* Searching in a string. -*- coding: utf-8 -*-
2*09d4459fSDaniel Fojt Copyright (C) 2005-2020 Free Software Foundation, Inc.
395b7b453SJohn Marino Written by Bruno Haible <bruno@clisp.org>, 2005.
495b7b453SJohn Marino
595b7b453SJohn Marino This program is free software: you can redistribute it and/or modify
695b7b453SJohn Marino it under the terms of the GNU General Public License as published by
795b7b453SJohn Marino the Free Software Foundation; either version 3 of the License, or
895b7b453SJohn Marino (at your option) any later version.
995b7b453SJohn Marino
1095b7b453SJohn Marino This program is distributed in the hope that it will be useful,
1195b7b453SJohn Marino but WITHOUT ANY WARRANTY; without even the implied warranty of
1295b7b453SJohn Marino MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1395b7b453SJohn Marino GNU General Public License for more details.
1495b7b453SJohn Marino
1595b7b453SJohn Marino You should have received a copy of the GNU General Public License
16*09d4459fSDaniel Fojt along with this program. If not, see <https://www.gnu.org/licenses/>. */
1795b7b453SJohn Marino
1895b7b453SJohn Marino #include <config.h>
1995b7b453SJohn Marino
2095b7b453SJohn Marino /* Specification. */
2195b7b453SJohn Marino #include <string.h>
2295b7b453SJohn Marino
2395b7b453SJohn Marino #include <stdbool.h>
2495b7b453SJohn Marino #include <stddef.h> /* for NULL, in case a nonstandard string.h lacks it */
25*09d4459fSDaniel Fojt #include <stdlib.h>
2695b7b453SJohn Marino
2795b7b453SJohn Marino #include "malloca.h"
2895b7b453SJohn Marino #include "mbuiter.h"
2995b7b453SJohn Marino
3095b7b453SJohn Marino /* Knuth-Morris-Pratt algorithm. */
31200fbe8dSJohn Marino #define UNIT unsigned char
3295b7b453SJohn Marino #define CANON_ELEMENT(c) c
3395b7b453SJohn Marino #include "str-kmp.h"
3495b7b453SJohn Marino
3595b7b453SJohn Marino /* Knuth-Morris-Pratt algorithm.
36*09d4459fSDaniel Fojt See https://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm
3795b7b453SJohn Marino Return a boolean indicating success:
3895b7b453SJohn Marino Return true and set *RESULTP if the search was completed.
3995b7b453SJohn Marino Return false if it was aborted because not enough memory was available. */
4095b7b453SJohn Marino static bool
knuth_morris_pratt_multibyte(const char * haystack,const char * needle,const char ** resultp)4195b7b453SJohn Marino knuth_morris_pratt_multibyte (const char *haystack, const char *needle,
4295b7b453SJohn Marino const char **resultp)
4395b7b453SJohn Marino {
4495b7b453SJohn Marino size_t m = mbslen (needle);
4595b7b453SJohn Marino mbchar_t *needle_mbchars;
4695b7b453SJohn Marino size_t *table;
4795b7b453SJohn Marino
4895b7b453SJohn Marino /* Allocate room for needle_mbchars and the table. */
49680a9cb8SJohn Marino void *memory = nmalloca (m, sizeof (mbchar_t) + sizeof (size_t));
50680a9cb8SJohn Marino void *table_memory;
5195b7b453SJohn Marino if (memory == NULL)
5295b7b453SJohn Marino return false;
53680a9cb8SJohn Marino needle_mbchars = memory;
54680a9cb8SJohn Marino table_memory = needle_mbchars + m;
55680a9cb8SJohn Marino table = table_memory;
5695b7b453SJohn Marino
5795b7b453SJohn Marino /* Fill needle_mbchars. */
5895b7b453SJohn Marino {
5995b7b453SJohn Marino mbui_iterator_t iter;
6095b7b453SJohn Marino size_t j;
6195b7b453SJohn Marino
6295b7b453SJohn Marino j = 0;
6395b7b453SJohn Marino for (mbui_init (iter, needle); mbui_avail (iter); mbui_advance (iter), j++)
6495b7b453SJohn Marino mb_copy (&needle_mbchars[j], &mbui_cur (iter));
6595b7b453SJohn Marino }
6695b7b453SJohn Marino
6795b7b453SJohn Marino /* Fill the table.
6895b7b453SJohn Marino For 0 < i < m:
6995b7b453SJohn Marino 0 < table[i] <= i is defined such that
7095b7b453SJohn Marino forall 0 < x < table[i]: needle[x..i-1] != needle[0..i-1-x],
7195b7b453SJohn Marino and table[i] is as large as possible with this property.
7295b7b453SJohn Marino This implies:
7395b7b453SJohn Marino 1) For 0 < i < m:
7495b7b453SJohn Marino If table[i] < i,
7595b7b453SJohn Marino needle[table[i]..i-1] = needle[0..i-1-table[i]].
7695b7b453SJohn Marino 2) For 0 < i < m:
7795b7b453SJohn Marino rhaystack[0..i-1] == needle[0..i-1]
7895b7b453SJohn Marino and exists h, i <= h < m: rhaystack[h] != needle[h]
7995b7b453SJohn Marino implies
8095b7b453SJohn Marino forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1].
8195b7b453SJohn Marino table[0] remains uninitialized. */
8295b7b453SJohn Marino {
8395b7b453SJohn Marino size_t i, j;
8495b7b453SJohn Marino
8595b7b453SJohn Marino /* i = 1: Nothing to verify for x = 0. */
8695b7b453SJohn Marino table[1] = 1;
8795b7b453SJohn Marino j = 0;
8895b7b453SJohn Marino
8995b7b453SJohn Marino for (i = 2; i < m; i++)
9095b7b453SJohn Marino {
9195b7b453SJohn Marino /* Here: j = i-1 - table[i-1].
9295b7b453SJohn Marino The inequality needle[x..i-1] != needle[0..i-1-x] is known to hold
9395b7b453SJohn Marino for x < table[i-1], by induction.
9495b7b453SJohn Marino Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */
9595b7b453SJohn Marino mbchar_t *b = &needle_mbchars[i - 1];
9695b7b453SJohn Marino
9795b7b453SJohn Marino for (;;)
9895b7b453SJohn Marino {
9995b7b453SJohn Marino /* Invariants: The inequality needle[x..i-1] != needle[0..i-1-x]
10095b7b453SJohn Marino is known to hold for x < i-1-j.
10195b7b453SJohn Marino Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */
10295b7b453SJohn Marino if (mb_equal (*b, needle_mbchars[j]))
10395b7b453SJohn Marino {
10495b7b453SJohn Marino /* Set table[i] := i-1-j. */
10595b7b453SJohn Marino table[i] = i - ++j;
10695b7b453SJohn Marino break;
10795b7b453SJohn Marino }
10895b7b453SJohn Marino /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds
10995b7b453SJohn Marino for x = i-1-j, because
11095b7b453SJohn Marino needle[i-1] != needle[j] = needle[i-1-x]. */
11195b7b453SJohn Marino if (j == 0)
11295b7b453SJohn Marino {
11395b7b453SJohn Marino /* The inequality holds for all possible x. */
11495b7b453SJohn Marino table[i] = i;
11595b7b453SJohn Marino break;
11695b7b453SJohn Marino }
11795b7b453SJohn Marino /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds
11895b7b453SJohn Marino for i-1-j < x < i-1-j+table[j], because for these x:
11995b7b453SJohn Marino needle[x..i-2]
12095b7b453SJohn Marino = needle[x-(i-1-j)..j-1]
12195b7b453SJohn Marino != needle[0..j-1-(x-(i-1-j))] (by definition of table[j])
12295b7b453SJohn Marino = needle[0..i-2-x],
12395b7b453SJohn Marino hence needle[x..i-1] != needle[0..i-1-x].
12495b7b453SJohn Marino Furthermore
12595b7b453SJohn Marino needle[i-1-j+table[j]..i-2]
12695b7b453SJohn Marino = needle[table[j]..j-1]
12795b7b453SJohn Marino = needle[0..j-1-table[j]] (by definition of table[j]). */
12895b7b453SJohn Marino j = j - table[j];
12995b7b453SJohn Marino }
13095b7b453SJohn Marino /* Here: j = i - table[i]. */
13195b7b453SJohn Marino }
13295b7b453SJohn Marino }
13395b7b453SJohn Marino
13495b7b453SJohn Marino /* Search, using the table to accelerate the processing. */
13595b7b453SJohn Marino {
13695b7b453SJohn Marino size_t j;
13795b7b453SJohn Marino mbui_iterator_t rhaystack;
13895b7b453SJohn Marino mbui_iterator_t phaystack;
13995b7b453SJohn Marino
14095b7b453SJohn Marino *resultp = NULL;
14195b7b453SJohn Marino j = 0;
14295b7b453SJohn Marino mbui_init (rhaystack, haystack);
14395b7b453SJohn Marino mbui_init (phaystack, haystack);
14495b7b453SJohn Marino /* Invariant: phaystack = rhaystack + j. */
14595b7b453SJohn Marino while (mbui_avail (phaystack))
14695b7b453SJohn Marino if (mb_equal (needle_mbchars[j], mbui_cur (phaystack)))
14795b7b453SJohn Marino {
14895b7b453SJohn Marino j++;
14995b7b453SJohn Marino mbui_advance (phaystack);
15095b7b453SJohn Marino if (j == m)
15195b7b453SJohn Marino {
15295b7b453SJohn Marino /* The entire needle has been found. */
15395b7b453SJohn Marino *resultp = mbui_cur_ptr (rhaystack);
15495b7b453SJohn Marino break;
15595b7b453SJohn Marino }
15695b7b453SJohn Marino }
15795b7b453SJohn Marino else if (j > 0)
15895b7b453SJohn Marino {
15995b7b453SJohn Marino /* Found a match of needle[0..j-1], mismatch at needle[j]. */
16095b7b453SJohn Marino size_t count = table[j];
16195b7b453SJohn Marino j -= count;
16295b7b453SJohn Marino for (; count > 0; count--)
16395b7b453SJohn Marino {
16495b7b453SJohn Marino if (!mbui_avail (rhaystack))
16595b7b453SJohn Marino abort ();
16695b7b453SJohn Marino mbui_advance (rhaystack);
16795b7b453SJohn Marino }
16895b7b453SJohn Marino }
16995b7b453SJohn Marino else
17095b7b453SJohn Marino {
17195b7b453SJohn Marino /* Found a mismatch at needle[0] already. */
17295b7b453SJohn Marino if (!mbui_avail (rhaystack))
17395b7b453SJohn Marino abort ();
17495b7b453SJohn Marino mbui_advance (rhaystack);
17595b7b453SJohn Marino mbui_advance (phaystack);
17695b7b453SJohn Marino }
17795b7b453SJohn Marino }
17895b7b453SJohn Marino
17995b7b453SJohn Marino freea (memory);
18095b7b453SJohn Marino return true;
18195b7b453SJohn Marino }
18295b7b453SJohn Marino
18395b7b453SJohn Marino /* Find the first occurrence of the character string NEEDLE in the character
18495b7b453SJohn Marino string HAYSTACK. Return NULL if NEEDLE is not found in HAYSTACK. */
18595b7b453SJohn Marino char *
mbsstr(const char * haystack,const char * needle)18695b7b453SJohn Marino mbsstr (const char *haystack, const char *needle)
18795b7b453SJohn Marino {
18895b7b453SJohn Marino /* Be careful not to look at the entire extent of haystack or needle
18995b7b453SJohn Marino until needed. This is useful because of these two cases:
19095b7b453SJohn Marino - haystack may be very long, and a match of needle found early,
19195b7b453SJohn Marino - needle may be very long, and not even a short initial segment of
19295b7b453SJohn Marino needle may be found in haystack. */
19395b7b453SJohn Marino if (MB_CUR_MAX > 1)
19495b7b453SJohn Marino {
19595b7b453SJohn Marino mbui_iterator_t iter_needle;
19695b7b453SJohn Marino
19795b7b453SJohn Marino mbui_init (iter_needle, needle);
19895b7b453SJohn Marino if (mbui_avail (iter_needle))
19995b7b453SJohn Marino {
20095b7b453SJohn Marino /* Minimizing the worst-case complexity:
20195b7b453SJohn Marino Let n = mbslen(haystack), m = mbslen(needle).
20295b7b453SJohn Marino The naïve algorithm is O(n*m) worst-case.
20395b7b453SJohn Marino The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a
20495b7b453SJohn Marino memory allocation.
20595b7b453SJohn Marino To achieve linear complexity and yet amortize the cost of the
20695b7b453SJohn Marino memory allocation, we activate the Knuth-Morris-Pratt algorithm
20795b7b453SJohn Marino only once the naïve algorithm has already run for some time; more
20895b7b453SJohn Marino precisely, when
20995b7b453SJohn Marino - the outer loop count is >= 10,
21095b7b453SJohn Marino - the average number of comparisons per outer loop is >= 5,
21195b7b453SJohn Marino - the total number of comparisons is >= m.
21295b7b453SJohn Marino But we try it only once. If the memory allocation attempt failed,
21395b7b453SJohn Marino we don't retry it. */
21495b7b453SJohn Marino bool try_kmp = true;
21595b7b453SJohn Marino size_t outer_loop_count = 0;
21695b7b453SJohn Marino size_t comparison_count = 0;
21795b7b453SJohn Marino size_t last_ccount = 0; /* last comparison count */
21895b7b453SJohn Marino mbui_iterator_t iter_needle_last_ccount; /* = needle + last_ccount */
21995b7b453SJohn Marino
22095b7b453SJohn Marino mbui_iterator_t iter_haystack;
22195b7b453SJohn Marino
22295b7b453SJohn Marino mbui_init (iter_needle_last_ccount, needle);
22395b7b453SJohn Marino mbui_init (iter_haystack, haystack);
22495b7b453SJohn Marino for (;; mbui_advance (iter_haystack))
22595b7b453SJohn Marino {
22695b7b453SJohn Marino if (!mbui_avail (iter_haystack))
22795b7b453SJohn Marino /* No match. */
22895b7b453SJohn Marino return NULL;
22995b7b453SJohn Marino
23095b7b453SJohn Marino /* See whether it's advisable to use an asymptotically faster
23195b7b453SJohn Marino algorithm. */
23295b7b453SJohn Marino if (try_kmp
23395b7b453SJohn Marino && outer_loop_count >= 10
23495b7b453SJohn Marino && comparison_count >= 5 * outer_loop_count)
23595b7b453SJohn Marino {
23695b7b453SJohn Marino /* See if needle + comparison_count now reaches the end of
23795b7b453SJohn Marino needle. */
23895b7b453SJohn Marino size_t count = comparison_count - last_ccount;
23995b7b453SJohn Marino for (;
24095b7b453SJohn Marino count > 0 && mbui_avail (iter_needle_last_ccount);
24195b7b453SJohn Marino count--)
24295b7b453SJohn Marino mbui_advance (iter_needle_last_ccount);
24395b7b453SJohn Marino last_ccount = comparison_count;
24495b7b453SJohn Marino if (!mbui_avail (iter_needle_last_ccount))
24595b7b453SJohn Marino {
24695b7b453SJohn Marino /* Try the Knuth-Morris-Pratt algorithm. */
24795b7b453SJohn Marino const char *result;
24895b7b453SJohn Marino bool success =
24995b7b453SJohn Marino knuth_morris_pratt_multibyte (haystack, needle,
25095b7b453SJohn Marino &result);
25195b7b453SJohn Marino if (success)
25295b7b453SJohn Marino return (char *) result;
25395b7b453SJohn Marino try_kmp = false;
25495b7b453SJohn Marino }
25595b7b453SJohn Marino }
25695b7b453SJohn Marino
25795b7b453SJohn Marino outer_loop_count++;
25895b7b453SJohn Marino comparison_count++;
25995b7b453SJohn Marino if (mb_equal (mbui_cur (iter_haystack), mbui_cur (iter_needle)))
26095b7b453SJohn Marino /* The first character matches. */
26195b7b453SJohn Marino {
26295b7b453SJohn Marino mbui_iterator_t rhaystack;
26395b7b453SJohn Marino mbui_iterator_t rneedle;
26495b7b453SJohn Marino
26595b7b453SJohn Marino memcpy (&rhaystack, &iter_haystack, sizeof (mbui_iterator_t));
26695b7b453SJohn Marino mbui_advance (rhaystack);
26795b7b453SJohn Marino
26895b7b453SJohn Marino mbui_init (rneedle, needle);
26995b7b453SJohn Marino if (!mbui_avail (rneedle))
27095b7b453SJohn Marino abort ();
27195b7b453SJohn Marino mbui_advance (rneedle);
27295b7b453SJohn Marino
27395b7b453SJohn Marino for (;; mbui_advance (rhaystack), mbui_advance (rneedle))
27495b7b453SJohn Marino {
27595b7b453SJohn Marino if (!mbui_avail (rneedle))
27695b7b453SJohn Marino /* Found a match. */
27795b7b453SJohn Marino return (char *) mbui_cur_ptr (iter_haystack);
27895b7b453SJohn Marino if (!mbui_avail (rhaystack))
27995b7b453SJohn Marino /* No match. */
28095b7b453SJohn Marino return NULL;
28195b7b453SJohn Marino comparison_count++;
28295b7b453SJohn Marino if (!mb_equal (mbui_cur (rhaystack), mbui_cur (rneedle)))
28395b7b453SJohn Marino /* Nothing in this round. */
28495b7b453SJohn Marino break;
28595b7b453SJohn Marino }
28695b7b453SJohn Marino }
28795b7b453SJohn Marino }
28895b7b453SJohn Marino }
28995b7b453SJohn Marino else
29095b7b453SJohn Marino return (char *) haystack;
29195b7b453SJohn Marino }
29295b7b453SJohn Marino else
29395b7b453SJohn Marino {
29495b7b453SJohn Marino if (*needle != '\0')
29595b7b453SJohn Marino {
29695b7b453SJohn Marino /* Minimizing the worst-case complexity:
29795b7b453SJohn Marino Let n = strlen(haystack), m = strlen(needle).
29895b7b453SJohn Marino The naïve algorithm is O(n*m) worst-case.
29995b7b453SJohn Marino The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a
30095b7b453SJohn Marino memory allocation.
30195b7b453SJohn Marino To achieve linear complexity and yet amortize the cost of the
30295b7b453SJohn Marino memory allocation, we activate the Knuth-Morris-Pratt algorithm
30395b7b453SJohn Marino only once the naïve algorithm has already run for some time; more
30495b7b453SJohn Marino precisely, when
30595b7b453SJohn Marino - the outer loop count is >= 10,
30695b7b453SJohn Marino - the average number of comparisons per outer loop is >= 5,
30795b7b453SJohn Marino - the total number of comparisons is >= m.
30895b7b453SJohn Marino But we try it only once. If the memory allocation attempt failed,
30995b7b453SJohn Marino we don't retry it. */
31095b7b453SJohn Marino bool try_kmp = true;
31195b7b453SJohn Marino size_t outer_loop_count = 0;
31295b7b453SJohn Marino size_t comparison_count = 0;
31395b7b453SJohn Marino size_t last_ccount = 0; /* last comparison count */
31495b7b453SJohn Marino const char *needle_last_ccount = needle; /* = needle + last_ccount */
31595b7b453SJohn Marino
31695b7b453SJohn Marino /* Speed up the following searches of needle by caching its first
31795b7b453SJohn Marino character. */
31895b7b453SJohn Marino char b = *needle++;
31995b7b453SJohn Marino
32095b7b453SJohn Marino for (;; haystack++)
32195b7b453SJohn Marino {
32295b7b453SJohn Marino if (*haystack == '\0')
32395b7b453SJohn Marino /* No match. */
32495b7b453SJohn Marino return NULL;
32595b7b453SJohn Marino
32695b7b453SJohn Marino /* See whether it's advisable to use an asymptotically faster
32795b7b453SJohn Marino algorithm. */
32895b7b453SJohn Marino if (try_kmp
32995b7b453SJohn Marino && outer_loop_count >= 10
33095b7b453SJohn Marino && comparison_count >= 5 * outer_loop_count)
33195b7b453SJohn Marino {
33295b7b453SJohn Marino /* See if needle + comparison_count now reaches the end of
33395b7b453SJohn Marino needle. */
33495b7b453SJohn Marino if (needle_last_ccount != NULL)
33595b7b453SJohn Marino {
33695b7b453SJohn Marino needle_last_ccount +=
33795b7b453SJohn Marino strnlen (needle_last_ccount,
33895b7b453SJohn Marino comparison_count - last_ccount);
33995b7b453SJohn Marino if (*needle_last_ccount == '\0')
34095b7b453SJohn Marino needle_last_ccount = NULL;
34195b7b453SJohn Marino last_ccount = comparison_count;
34295b7b453SJohn Marino }
34395b7b453SJohn Marino if (needle_last_ccount == NULL)
34495b7b453SJohn Marino {
34595b7b453SJohn Marino /* Try the Knuth-Morris-Pratt algorithm. */
346200fbe8dSJohn Marino const unsigned char *result;
34795b7b453SJohn Marino bool success =
348200fbe8dSJohn Marino knuth_morris_pratt ((const unsigned char *) haystack,
349200fbe8dSJohn Marino (const unsigned char *) (needle - 1),
350200fbe8dSJohn Marino strlen (needle - 1),
35195b7b453SJohn Marino &result);
35295b7b453SJohn Marino if (success)
35395b7b453SJohn Marino return (char *) result;
35495b7b453SJohn Marino try_kmp = false;
35595b7b453SJohn Marino }
35695b7b453SJohn Marino }
35795b7b453SJohn Marino
35895b7b453SJohn Marino outer_loop_count++;
35995b7b453SJohn Marino comparison_count++;
36095b7b453SJohn Marino if (*haystack == b)
36195b7b453SJohn Marino /* The first character matches. */
36295b7b453SJohn Marino {
36395b7b453SJohn Marino const char *rhaystack = haystack + 1;
36495b7b453SJohn Marino const char *rneedle = needle;
36595b7b453SJohn Marino
36695b7b453SJohn Marino for (;; rhaystack++, rneedle++)
36795b7b453SJohn Marino {
36895b7b453SJohn Marino if (*rneedle == '\0')
36995b7b453SJohn Marino /* Found a match. */
37095b7b453SJohn Marino return (char *) haystack;
37195b7b453SJohn Marino if (*rhaystack == '\0')
37295b7b453SJohn Marino /* No match. */
37395b7b453SJohn Marino return NULL;
37495b7b453SJohn Marino comparison_count++;
37595b7b453SJohn Marino if (*rhaystack != *rneedle)
37695b7b453SJohn Marino /* Nothing in this round. */
37795b7b453SJohn Marino break;
37895b7b453SJohn Marino }
37995b7b453SJohn Marino }
38095b7b453SJohn Marino }
38195b7b453SJohn Marino }
38295b7b453SJohn Marino else
38395b7b453SJohn Marino return (char *) haystack;
38495b7b453SJohn Marino }
38595b7b453SJohn Marino }
386