1*dc7c36e4SJohn Marino /* Searching in a string. -*- coding: utf-8 -*- 2*dc7c36e4SJohn Marino Copyright (C) 2005-2015 Free Software Foundation, Inc. 395b7b453SJohn Marino Written by Bruno Haible <bruno@clisp.org>, 2005. 495b7b453SJohn Marino 595b7b453SJohn Marino This program is free software: you can redistribute it and/or modify 695b7b453SJohn Marino it under the terms of the GNU General Public License as published by 795b7b453SJohn Marino the Free Software Foundation; either version 3 of the License, or 895b7b453SJohn Marino (at your option) any later version. 995b7b453SJohn Marino 1095b7b453SJohn Marino This program is distributed in the hope that it will be useful, 1195b7b453SJohn Marino but WITHOUT ANY WARRANTY; without even the implied warranty of 1295b7b453SJohn Marino MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 1395b7b453SJohn Marino GNU General Public License for more details. 1495b7b453SJohn Marino 1595b7b453SJohn Marino You should have received a copy of the GNU General Public License 1695b7b453SJohn Marino along with this program. If not, see <http://www.gnu.org/licenses/>. */ 1795b7b453SJohn Marino 1895b7b453SJohn Marino #include <config.h> 1995b7b453SJohn Marino 2095b7b453SJohn Marino /* Specification. */ 2195b7b453SJohn Marino #include <string.h> 2295b7b453SJohn Marino 2395b7b453SJohn Marino #include <stdbool.h> 2495b7b453SJohn Marino #include <stddef.h> /* for NULL, in case a nonstandard string.h lacks it */ 2595b7b453SJohn Marino 2695b7b453SJohn Marino #include "malloca.h" 2795b7b453SJohn Marino #include "mbuiter.h" 2895b7b453SJohn Marino 2995b7b453SJohn Marino /* Knuth-Morris-Pratt algorithm. */ 30200fbe8dSJohn Marino #define UNIT unsigned char 3195b7b453SJohn Marino #define CANON_ELEMENT(c) c 3295b7b453SJohn Marino #include "str-kmp.h" 3395b7b453SJohn Marino 3495b7b453SJohn Marino /* Knuth-Morris-Pratt algorithm. 3595b7b453SJohn Marino See http://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm 3695b7b453SJohn Marino Return a boolean indicating success: 3795b7b453SJohn Marino Return true and set *RESULTP if the search was completed. 3895b7b453SJohn Marino Return false if it was aborted because not enough memory was available. */ 3995b7b453SJohn Marino static bool 4095b7b453SJohn Marino knuth_morris_pratt_multibyte (const char *haystack, const char *needle, 4195b7b453SJohn Marino const char **resultp) 4295b7b453SJohn Marino { 4395b7b453SJohn Marino size_t m = mbslen (needle); 4495b7b453SJohn Marino mbchar_t *needle_mbchars; 4595b7b453SJohn Marino size_t *table; 4695b7b453SJohn Marino 4795b7b453SJohn Marino /* Allocate room for needle_mbchars and the table. */ 48680a9cb8SJohn Marino void *memory = nmalloca (m, sizeof (mbchar_t) + sizeof (size_t)); 49680a9cb8SJohn Marino void *table_memory; 5095b7b453SJohn Marino if (memory == NULL) 5195b7b453SJohn Marino return false; 52680a9cb8SJohn Marino needle_mbchars = memory; 53680a9cb8SJohn Marino table_memory = needle_mbchars + m; 54680a9cb8SJohn Marino table = table_memory; 5595b7b453SJohn Marino 5695b7b453SJohn Marino /* Fill needle_mbchars. */ 5795b7b453SJohn Marino { 5895b7b453SJohn Marino mbui_iterator_t iter; 5995b7b453SJohn Marino size_t j; 6095b7b453SJohn Marino 6195b7b453SJohn Marino j = 0; 6295b7b453SJohn Marino for (mbui_init (iter, needle); mbui_avail (iter); mbui_advance (iter), j++) 6395b7b453SJohn Marino mb_copy (&needle_mbchars[j], &mbui_cur (iter)); 6495b7b453SJohn Marino } 6595b7b453SJohn Marino 6695b7b453SJohn Marino /* Fill the table. 6795b7b453SJohn Marino For 0 < i < m: 6895b7b453SJohn Marino 0 < table[i] <= i is defined such that 6995b7b453SJohn Marino forall 0 < x < table[i]: needle[x..i-1] != needle[0..i-1-x], 7095b7b453SJohn Marino and table[i] is as large as possible with this property. 7195b7b453SJohn Marino This implies: 7295b7b453SJohn Marino 1) For 0 < i < m: 7395b7b453SJohn Marino If table[i] < i, 7495b7b453SJohn Marino needle[table[i]..i-1] = needle[0..i-1-table[i]]. 7595b7b453SJohn Marino 2) For 0 < i < m: 7695b7b453SJohn Marino rhaystack[0..i-1] == needle[0..i-1] 7795b7b453SJohn Marino and exists h, i <= h < m: rhaystack[h] != needle[h] 7895b7b453SJohn Marino implies 7995b7b453SJohn Marino forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1]. 8095b7b453SJohn Marino table[0] remains uninitialized. */ 8195b7b453SJohn Marino { 8295b7b453SJohn Marino size_t i, j; 8395b7b453SJohn Marino 8495b7b453SJohn Marino /* i = 1: Nothing to verify for x = 0. */ 8595b7b453SJohn Marino table[1] = 1; 8695b7b453SJohn Marino j = 0; 8795b7b453SJohn Marino 8895b7b453SJohn Marino for (i = 2; i < m; i++) 8995b7b453SJohn Marino { 9095b7b453SJohn Marino /* Here: j = i-1 - table[i-1]. 9195b7b453SJohn Marino The inequality needle[x..i-1] != needle[0..i-1-x] is known to hold 9295b7b453SJohn Marino for x < table[i-1], by induction. 9395b7b453SJohn Marino Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */ 9495b7b453SJohn Marino mbchar_t *b = &needle_mbchars[i - 1]; 9595b7b453SJohn Marino 9695b7b453SJohn Marino for (;;) 9795b7b453SJohn Marino { 9895b7b453SJohn Marino /* Invariants: The inequality needle[x..i-1] != needle[0..i-1-x] 9995b7b453SJohn Marino is known to hold for x < i-1-j. 10095b7b453SJohn Marino Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */ 10195b7b453SJohn Marino if (mb_equal (*b, needle_mbchars[j])) 10295b7b453SJohn Marino { 10395b7b453SJohn Marino /* Set table[i] := i-1-j. */ 10495b7b453SJohn Marino table[i] = i - ++j; 10595b7b453SJohn Marino break; 10695b7b453SJohn Marino } 10795b7b453SJohn Marino /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds 10895b7b453SJohn Marino for x = i-1-j, because 10995b7b453SJohn Marino needle[i-1] != needle[j] = needle[i-1-x]. */ 11095b7b453SJohn Marino if (j == 0) 11195b7b453SJohn Marino { 11295b7b453SJohn Marino /* The inequality holds for all possible x. */ 11395b7b453SJohn Marino table[i] = i; 11495b7b453SJohn Marino break; 11595b7b453SJohn Marino } 11695b7b453SJohn Marino /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds 11795b7b453SJohn Marino for i-1-j < x < i-1-j+table[j], because for these x: 11895b7b453SJohn Marino needle[x..i-2] 11995b7b453SJohn Marino = needle[x-(i-1-j)..j-1] 12095b7b453SJohn Marino != needle[0..j-1-(x-(i-1-j))] (by definition of table[j]) 12195b7b453SJohn Marino = needle[0..i-2-x], 12295b7b453SJohn Marino hence needle[x..i-1] != needle[0..i-1-x]. 12395b7b453SJohn Marino Furthermore 12495b7b453SJohn Marino needle[i-1-j+table[j]..i-2] 12595b7b453SJohn Marino = needle[table[j]..j-1] 12695b7b453SJohn Marino = needle[0..j-1-table[j]] (by definition of table[j]). */ 12795b7b453SJohn Marino j = j - table[j]; 12895b7b453SJohn Marino } 12995b7b453SJohn Marino /* Here: j = i - table[i]. */ 13095b7b453SJohn Marino } 13195b7b453SJohn Marino } 13295b7b453SJohn Marino 13395b7b453SJohn Marino /* Search, using the table to accelerate the processing. */ 13495b7b453SJohn Marino { 13595b7b453SJohn Marino size_t j; 13695b7b453SJohn Marino mbui_iterator_t rhaystack; 13795b7b453SJohn Marino mbui_iterator_t phaystack; 13895b7b453SJohn Marino 13995b7b453SJohn Marino *resultp = NULL; 14095b7b453SJohn Marino j = 0; 14195b7b453SJohn Marino mbui_init (rhaystack, haystack); 14295b7b453SJohn Marino mbui_init (phaystack, haystack); 14395b7b453SJohn Marino /* Invariant: phaystack = rhaystack + j. */ 14495b7b453SJohn Marino while (mbui_avail (phaystack)) 14595b7b453SJohn Marino if (mb_equal (needle_mbchars[j], mbui_cur (phaystack))) 14695b7b453SJohn Marino { 14795b7b453SJohn Marino j++; 14895b7b453SJohn Marino mbui_advance (phaystack); 14995b7b453SJohn Marino if (j == m) 15095b7b453SJohn Marino { 15195b7b453SJohn Marino /* The entire needle has been found. */ 15295b7b453SJohn Marino *resultp = mbui_cur_ptr (rhaystack); 15395b7b453SJohn Marino break; 15495b7b453SJohn Marino } 15595b7b453SJohn Marino } 15695b7b453SJohn Marino else if (j > 0) 15795b7b453SJohn Marino { 15895b7b453SJohn Marino /* Found a match of needle[0..j-1], mismatch at needle[j]. */ 15995b7b453SJohn Marino size_t count = table[j]; 16095b7b453SJohn Marino j -= count; 16195b7b453SJohn Marino for (; count > 0; count--) 16295b7b453SJohn Marino { 16395b7b453SJohn Marino if (!mbui_avail (rhaystack)) 16495b7b453SJohn Marino abort (); 16595b7b453SJohn Marino mbui_advance (rhaystack); 16695b7b453SJohn Marino } 16795b7b453SJohn Marino } 16895b7b453SJohn Marino else 16995b7b453SJohn Marino { 17095b7b453SJohn Marino /* Found a mismatch at needle[0] already. */ 17195b7b453SJohn Marino if (!mbui_avail (rhaystack)) 17295b7b453SJohn Marino abort (); 17395b7b453SJohn Marino mbui_advance (rhaystack); 17495b7b453SJohn Marino mbui_advance (phaystack); 17595b7b453SJohn Marino } 17695b7b453SJohn Marino } 17795b7b453SJohn Marino 17895b7b453SJohn Marino freea (memory); 17995b7b453SJohn Marino return true; 18095b7b453SJohn Marino } 18195b7b453SJohn Marino 18295b7b453SJohn Marino /* Find the first occurrence of the character string NEEDLE in the character 18395b7b453SJohn Marino string HAYSTACK. Return NULL if NEEDLE is not found in HAYSTACK. */ 18495b7b453SJohn Marino char * 18595b7b453SJohn Marino mbsstr (const char *haystack, const char *needle) 18695b7b453SJohn Marino { 18795b7b453SJohn Marino /* Be careful not to look at the entire extent of haystack or needle 18895b7b453SJohn Marino until needed. This is useful because of these two cases: 18995b7b453SJohn Marino - haystack may be very long, and a match of needle found early, 19095b7b453SJohn Marino - needle may be very long, and not even a short initial segment of 19195b7b453SJohn Marino needle may be found in haystack. */ 19295b7b453SJohn Marino if (MB_CUR_MAX > 1) 19395b7b453SJohn Marino { 19495b7b453SJohn Marino mbui_iterator_t iter_needle; 19595b7b453SJohn Marino 19695b7b453SJohn Marino mbui_init (iter_needle, needle); 19795b7b453SJohn Marino if (mbui_avail (iter_needle)) 19895b7b453SJohn Marino { 19995b7b453SJohn Marino /* Minimizing the worst-case complexity: 20095b7b453SJohn Marino Let n = mbslen(haystack), m = mbslen(needle). 20195b7b453SJohn Marino The naïve algorithm is O(n*m) worst-case. 20295b7b453SJohn Marino The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a 20395b7b453SJohn Marino memory allocation. 20495b7b453SJohn Marino To achieve linear complexity and yet amortize the cost of the 20595b7b453SJohn Marino memory allocation, we activate the Knuth-Morris-Pratt algorithm 20695b7b453SJohn Marino only once the naïve algorithm has already run for some time; more 20795b7b453SJohn Marino precisely, when 20895b7b453SJohn Marino - the outer loop count is >= 10, 20995b7b453SJohn Marino - the average number of comparisons per outer loop is >= 5, 21095b7b453SJohn Marino - the total number of comparisons is >= m. 21195b7b453SJohn Marino But we try it only once. If the memory allocation attempt failed, 21295b7b453SJohn Marino we don't retry it. */ 21395b7b453SJohn Marino bool try_kmp = true; 21495b7b453SJohn Marino size_t outer_loop_count = 0; 21595b7b453SJohn Marino size_t comparison_count = 0; 21695b7b453SJohn Marino size_t last_ccount = 0; /* last comparison count */ 21795b7b453SJohn Marino mbui_iterator_t iter_needle_last_ccount; /* = needle + last_ccount */ 21895b7b453SJohn Marino 21995b7b453SJohn Marino mbui_iterator_t iter_haystack; 22095b7b453SJohn Marino 22195b7b453SJohn Marino mbui_init (iter_needle_last_ccount, needle); 22295b7b453SJohn Marino mbui_init (iter_haystack, haystack); 22395b7b453SJohn Marino for (;; mbui_advance (iter_haystack)) 22495b7b453SJohn Marino { 22595b7b453SJohn Marino if (!mbui_avail (iter_haystack)) 22695b7b453SJohn Marino /* No match. */ 22795b7b453SJohn Marino return NULL; 22895b7b453SJohn Marino 22995b7b453SJohn Marino /* See whether it's advisable to use an asymptotically faster 23095b7b453SJohn Marino algorithm. */ 23195b7b453SJohn Marino if (try_kmp 23295b7b453SJohn Marino && outer_loop_count >= 10 23395b7b453SJohn Marino && comparison_count >= 5 * outer_loop_count) 23495b7b453SJohn Marino { 23595b7b453SJohn Marino /* See if needle + comparison_count now reaches the end of 23695b7b453SJohn Marino needle. */ 23795b7b453SJohn Marino size_t count = comparison_count - last_ccount; 23895b7b453SJohn Marino for (; 23995b7b453SJohn Marino count > 0 && mbui_avail (iter_needle_last_ccount); 24095b7b453SJohn Marino count--) 24195b7b453SJohn Marino mbui_advance (iter_needle_last_ccount); 24295b7b453SJohn Marino last_ccount = comparison_count; 24395b7b453SJohn Marino if (!mbui_avail (iter_needle_last_ccount)) 24495b7b453SJohn Marino { 24595b7b453SJohn Marino /* Try the Knuth-Morris-Pratt algorithm. */ 24695b7b453SJohn Marino const char *result; 24795b7b453SJohn Marino bool success = 24895b7b453SJohn Marino knuth_morris_pratt_multibyte (haystack, needle, 24995b7b453SJohn Marino &result); 25095b7b453SJohn Marino if (success) 25195b7b453SJohn Marino return (char *) result; 25295b7b453SJohn Marino try_kmp = false; 25395b7b453SJohn Marino } 25495b7b453SJohn Marino } 25595b7b453SJohn Marino 25695b7b453SJohn Marino outer_loop_count++; 25795b7b453SJohn Marino comparison_count++; 25895b7b453SJohn Marino if (mb_equal (mbui_cur (iter_haystack), mbui_cur (iter_needle))) 25995b7b453SJohn Marino /* The first character matches. */ 26095b7b453SJohn Marino { 26195b7b453SJohn Marino mbui_iterator_t rhaystack; 26295b7b453SJohn Marino mbui_iterator_t rneedle; 26395b7b453SJohn Marino 26495b7b453SJohn Marino memcpy (&rhaystack, &iter_haystack, sizeof (mbui_iterator_t)); 26595b7b453SJohn Marino mbui_advance (rhaystack); 26695b7b453SJohn Marino 26795b7b453SJohn Marino mbui_init (rneedle, needle); 26895b7b453SJohn Marino if (!mbui_avail (rneedle)) 26995b7b453SJohn Marino abort (); 27095b7b453SJohn Marino mbui_advance (rneedle); 27195b7b453SJohn Marino 27295b7b453SJohn Marino for (;; mbui_advance (rhaystack), mbui_advance (rneedle)) 27395b7b453SJohn Marino { 27495b7b453SJohn Marino if (!mbui_avail (rneedle)) 27595b7b453SJohn Marino /* Found a match. */ 27695b7b453SJohn Marino return (char *) mbui_cur_ptr (iter_haystack); 27795b7b453SJohn Marino if (!mbui_avail (rhaystack)) 27895b7b453SJohn Marino /* No match. */ 27995b7b453SJohn Marino return NULL; 28095b7b453SJohn Marino comparison_count++; 28195b7b453SJohn Marino if (!mb_equal (mbui_cur (rhaystack), mbui_cur (rneedle))) 28295b7b453SJohn Marino /* Nothing in this round. */ 28395b7b453SJohn Marino break; 28495b7b453SJohn Marino } 28595b7b453SJohn Marino } 28695b7b453SJohn Marino } 28795b7b453SJohn Marino } 28895b7b453SJohn Marino else 28995b7b453SJohn Marino return (char *) haystack; 29095b7b453SJohn Marino } 29195b7b453SJohn Marino else 29295b7b453SJohn Marino { 29395b7b453SJohn Marino if (*needle != '\0') 29495b7b453SJohn Marino { 29595b7b453SJohn Marino /* Minimizing the worst-case complexity: 29695b7b453SJohn Marino Let n = strlen(haystack), m = strlen(needle). 29795b7b453SJohn Marino The naïve algorithm is O(n*m) worst-case. 29895b7b453SJohn Marino The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a 29995b7b453SJohn Marino memory allocation. 30095b7b453SJohn Marino To achieve linear complexity and yet amortize the cost of the 30195b7b453SJohn Marino memory allocation, we activate the Knuth-Morris-Pratt algorithm 30295b7b453SJohn Marino only once the naïve algorithm has already run for some time; more 30395b7b453SJohn Marino precisely, when 30495b7b453SJohn Marino - the outer loop count is >= 10, 30595b7b453SJohn Marino - the average number of comparisons per outer loop is >= 5, 30695b7b453SJohn Marino - the total number of comparisons is >= m. 30795b7b453SJohn Marino But we try it only once. If the memory allocation attempt failed, 30895b7b453SJohn Marino we don't retry it. */ 30995b7b453SJohn Marino bool try_kmp = true; 31095b7b453SJohn Marino size_t outer_loop_count = 0; 31195b7b453SJohn Marino size_t comparison_count = 0; 31295b7b453SJohn Marino size_t last_ccount = 0; /* last comparison count */ 31395b7b453SJohn Marino const char *needle_last_ccount = needle; /* = needle + last_ccount */ 31495b7b453SJohn Marino 31595b7b453SJohn Marino /* Speed up the following searches of needle by caching its first 31695b7b453SJohn Marino character. */ 31795b7b453SJohn Marino char b = *needle++; 31895b7b453SJohn Marino 31995b7b453SJohn Marino for (;; haystack++) 32095b7b453SJohn Marino { 32195b7b453SJohn Marino if (*haystack == '\0') 32295b7b453SJohn Marino /* No match. */ 32395b7b453SJohn Marino return NULL; 32495b7b453SJohn Marino 32595b7b453SJohn Marino /* See whether it's advisable to use an asymptotically faster 32695b7b453SJohn Marino algorithm. */ 32795b7b453SJohn Marino if (try_kmp 32895b7b453SJohn Marino && outer_loop_count >= 10 32995b7b453SJohn Marino && comparison_count >= 5 * outer_loop_count) 33095b7b453SJohn Marino { 33195b7b453SJohn Marino /* See if needle + comparison_count now reaches the end of 33295b7b453SJohn Marino needle. */ 33395b7b453SJohn Marino if (needle_last_ccount != NULL) 33495b7b453SJohn Marino { 33595b7b453SJohn Marino needle_last_ccount += 33695b7b453SJohn Marino strnlen (needle_last_ccount, 33795b7b453SJohn Marino comparison_count - last_ccount); 33895b7b453SJohn Marino if (*needle_last_ccount == '\0') 33995b7b453SJohn Marino needle_last_ccount = NULL; 34095b7b453SJohn Marino last_ccount = comparison_count; 34195b7b453SJohn Marino } 34295b7b453SJohn Marino if (needle_last_ccount == NULL) 34395b7b453SJohn Marino { 34495b7b453SJohn Marino /* Try the Knuth-Morris-Pratt algorithm. */ 345200fbe8dSJohn Marino const unsigned char *result; 34695b7b453SJohn Marino bool success = 347200fbe8dSJohn Marino knuth_morris_pratt ((const unsigned char *) haystack, 348200fbe8dSJohn Marino (const unsigned char *) (needle - 1), 349200fbe8dSJohn Marino strlen (needle - 1), 35095b7b453SJohn Marino &result); 35195b7b453SJohn Marino if (success) 35295b7b453SJohn Marino return (char *) result; 35395b7b453SJohn Marino try_kmp = false; 35495b7b453SJohn Marino } 35595b7b453SJohn Marino } 35695b7b453SJohn Marino 35795b7b453SJohn Marino outer_loop_count++; 35895b7b453SJohn Marino comparison_count++; 35995b7b453SJohn Marino if (*haystack == b) 36095b7b453SJohn Marino /* The first character matches. */ 36195b7b453SJohn Marino { 36295b7b453SJohn Marino const char *rhaystack = haystack + 1; 36395b7b453SJohn Marino const char *rneedle = needle; 36495b7b453SJohn Marino 36595b7b453SJohn Marino for (;; rhaystack++, rneedle++) 36695b7b453SJohn Marino { 36795b7b453SJohn Marino if (*rneedle == '\0') 36895b7b453SJohn Marino /* Found a match. */ 36995b7b453SJohn Marino return (char *) haystack; 37095b7b453SJohn Marino if (*rhaystack == '\0') 37195b7b453SJohn Marino /* No match. */ 37295b7b453SJohn Marino return NULL; 37395b7b453SJohn Marino comparison_count++; 37495b7b453SJohn Marino if (*rhaystack != *rneedle) 37595b7b453SJohn Marino /* Nothing in this round. */ 37695b7b453SJohn Marino break; 37795b7b453SJohn Marino } 37895b7b453SJohn Marino } 37995b7b453SJohn Marino } 38095b7b453SJohn Marino } 38195b7b453SJohn Marino else 38295b7b453SJohn Marino return (char *) haystack; 38395b7b453SJohn Marino } 38495b7b453SJohn Marino } 385