1*95b7b453SJohn Marino /* Searching in a string. 2*95b7b453SJohn Marino Copyright (C) 2005-2010 Free Software Foundation, Inc. 3*95b7b453SJohn Marino Written by Bruno Haible <bruno@clisp.org>, 2005. 4*95b7b453SJohn Marino 5*95b7b453SJohn Marino This program is free software: you can redistribute it and/or modify 6*95b7b453SJohn Marino it under the terms of the GNU General Public License as published by 7*95b7b453SJohn Marino the Free Software Foundation; either version 3 of the License, or 8*95b7b453SJohn Marino (at your option) any later version. 9*95b7b453SJohn Marino 10*95b7b453SJohn Marino This program is distributed in the hope that it will be useful, 11*95b7b453SJohn Marino but WITHOUT ANY WARRANTY; without even the implied warranty of 12*95b7b453SJohn Marino MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13*95b7b453SJohn Marino GNU General Public License for more details. 14*95b7b453SJohn Marino 15*95b7b453SJohn Marino You should have received a copy of the GNU General Public License 16*95b7b453SJohn Marino along with this program. If not, see <http://www.gnu.org/licenses/>. */ 17*95b7b453SJohn Marino 18*95b7b453SJohn Marino #include <config.h> 19*95b7b453SJohn Marino 20*95b7b453SJohn Marino /* Specification. */ 21*95b7b453SJohn Marino #include <string.h> 22*95b7b453SJohn Marino 23*95b7b453SJohn Marino #include <stdbool.h> 24*95b7b453SJohn Marino #include <stddef.h> /* for NULL, in case a nonstandard string.h lacks it */ 25*95b7b453SJohn Marino 26*95b7b453SJohn Marino #include "malloca.h" 27*95b7b453SJohn Marino #include "mbuiter.h" 28*95b7b453SJohn Marino 29*95b7b453SJohn Marino /* Knuth-Morris-Pratt algorithm. */ 30*95b7b453SJohn Marino #define CANON_ELEMENT(c) c 31*95b7b453SJohn Marino #include "str-kmp.h" 32*95b7b453SJohn Marino 33*95b7b453SJohn Marino /* Knuth-Morris-Pratt algorithm. 34*95b7b453SJohn Marino See http://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm 35*95b7b453SJohn Marino Return a boolean indicating success: 36*95b7b453SJohn Marino Return true and set *RESULTP if the search was completed. 37*95b7b453SJohn Marino Return false if it was aborted because not enough memory was available. */ 38*95b7b453SJohn Marino static bool 39*95b7b453SJohn Marino knuth_morris_pratt_multibyte (const char *haystack, const char *needle, 40*95b7b453SJohn Marino const char **resultp) 41*95b7b453SJohn Marino { 42*95b7b453SJohn Marino size_t m = mbslen (needle); 43*95b7b453SJohn Marino mbchar_t *needle_mbchars; 44*95b7b453SJohn Marino size_t *table; 45*95b7b453SJohn Marino 46*95b7b453SJohn Marino /* Allocate room for needle_mbchars and the table. */ 47*95b7b453SJohn Marino char *memory = (char *) nmalloca (m, sizeof (mbchar_t) + sizeof (size_t)); 48*95b7b453SJohn Marino if (memory == NULL) 49*95b7b453SJohn Marino return false; 50*95b7b453SJohn Marino needle_mbchars = (mbchar_t *) memory; 51*95b7b453SJohn Marino table = (size_t *) (memory + m * sizeof (mbchar_t)); 52*95b7b453SJohn Marino 53*95b7b453SJohn Marino /* Fill needle_mbchars. */ 54*95b7b453SJohn Marino { 55*95b7b453SJohn Marino mbui_iterator_t iter; 56*95b7b453SJohn Marino size_t j; 57*95b7b453SJohn Marino 58*95b7b453SJohn Marino j = 0; 59*95b7b453SJohn Marino for (mbui_init (iter, needle); mbui_avail (iter); mbui_advance (iter), j++) 60*95b7b453SJohn Marino mb_copy (&needle_mbchars[j], &mbui_cur (iter)); 61*95b7b453SJohn Marino } 62*95b7b453SJohn Marino 63*95b7b453SJohn Marino /* Fill the table. 64*95b7b453SJohn Marino For 0 < i < m: 65*95b7b453SJohn Marino 0 < table[i] <= i is defined such that 66*95b7b453SJohn Marino forall 0 < x < table[i]: needle[x..i-1] != needle[0..i-1-x], 67*95b7b453SJohn Marino and table[i] is as large as possible with this property. 68*95b7b453SJohn Marino This implies: 69*95b7b453SJohn Marino 1) For 0 < i < m: 70*95b7b453SJohn Marino If table[i] < i, 71*95b7b453SJohn Marino needle[table[i]..i-1] = needle[0..i-1-table[i]]. 72*95b7b453SJohn Marino 2) For 0 < i < m: 73*95b7b453SJohn Marino rhaystack[0..i-1] == needle[0..i-1] 74*95b7b453SJohn Marino and exists h, i <= h < m: rhaystack[h] != needle[h] 75*95b7b453SJohn Marino implies 76*95b7b453SJohn Marino forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1]. 77*95b7b453SJohn Marino table[0] remains uninitialized. */ 78*95b7b453SJohn Marino { 79*95b7b453SJohn Marino size_t i, j; 80*95b7b453SJohn Marino 81*95b7b453SJohn Marino /* i = 1: Nothing to verify for x = 0. */ 82*95b7b453SJohn Marino table[1] = 1; 83*95b7b453SJohn Marino j = 0; 84*95b7b453SJohn Marino 85*95b7b453SJohn Marino for (i = 2; i < m; i++) 86*95b7b453SJohn Marino { 87*95b7b453SJohn Marino /* Here: j = i-1 - table[i-1]. 88*95b7b453SJohn Marino The inequality needle[x..i-1] != needle[0..i-1-x] is known to hold 89*95b7b453SJohn Marino for x < table[i-1], by induction. 90*95b7b453SJohn Marino Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */ 91*95b7b453SJohn Marino mbchar_t *b = &needle_mbchars[i - 1]; 92*95b7b453SJohn Marino 93*95b7b453SJohn Marino for (;;) 94*95b7b453SJohn Marino { 95*95b7b453SJohn Marino /* Invariants: The inequality needle[x..i-1] != needle[0..i-1-x] 96*95b7b453SJohn Marino is known to hold for x < i-1-j. 97*95b7b453SJohn Marino Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */ 98*95b7b453SJohn Marino if (mb_equal (*b, needle_mbchars[j])) 99*95b7b453SJohn Marino { 100*95b7b453SJohn Marino /* Set table[i] := i-1-j. */ 101*95b7b453SJohn Marino table[i] = i - ++j; 102*95b7b453SJohn Marino break; 103*95b7b453SJohn Marino } 104*95b7b453SJohn Marino /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds 105*95b7b453SJohn Marino for x = i-1-j, because 106*95b7b453SJohn Marino needle[i-1] != needle[j] = needle[i-1-x]. */ 107*95b7b453SJohn Marino if (j == 0) 108*95b7b453SJohn Marino { 109*95b7b453SJohn Marino /* The inequality holds for all possible x. */ 110*95b7b453SJohn Marino table[i] = i; 111*95b7b453SJohn Marino break; 112*95b7b453SJohn Marino } 113*95b7b453SJohn Marino /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds 114*95b7b453SJohn Marino for i-1-j < x < i-1-j+table[j], because for these x: 115*95b7b453SJohn Marino needle[x..i-2] 116*95b7b453SJohn Marino = needle[x-(i-1-j)..j-1] 117*95b7b453SJohn Marino != needle[0..j-1-(x-(i-1-j))] (by definition of table[j]) 118*95b7b453SJohn Marino = needle[0..i-2-x], 119*95b7b453SJohn Marino hence needle[x..i-1] != needle[0..i-1-x]. 120*95b7b453SJohn Marino Furthermore 121*95b7b453SJohn Marino needle[i-1-j+table[j]..i-2] 122*95b7b453SJohn Marino = needle[table[j]..j-1] 123*95b7b453SJohn Marino = needle[0..j-1-table[j]] (by definition of table[j]). */ 124*95b7b453SJohn Marino j = j - table[j]; 125*95b7b453SJohn Marino } 126*95b7b453SJohn Marino /* Here: j = i - table[i]. */ 127*95b7b453SJohn Marino } 128*95b7b453SJohn Marino } 129*95b7b453SJohn Marino 130*95b7b453SJohn Marino /* Search, using the table to accelerate the processing. */ 131*95b7b453SJohn Marino { 132*95b7b453SJohn Marino size_t j; 133*95b7b453SJohn Marino mbui_iterator_t rhaystack; 134*95b7b453SJohn Marino mbui_iterator_t phaystack; 135*95b7b453SJohn Marino 136*95b7b453SJohn Marino *resultp = NULL; 137*95b7b453SJohn Marino j = 0; 138*95b7b453SJohn Marino mbui_init (rhaystack, haystack); 139*95b7b453SJohn Marino mbui_init (phaystack, haystack); 140*95b7b453SJohn Marino /* Invariant: phaystack = rhaystack + j. */ 141*95b7b453SJohn Marino while (mbui_avail (phaystack)) 142*95b7b453SJohn Marino if (mb_equal (needle_mbchars[j], mbui_cur (phaystack))) 143*95b7b453SJohn Marino { 144*95b7b453SJohn Marino j++; 145*95b7b453SJohn Marino mbui_advance (phaystack); 146*95b7b453SJohn Marino if (j == m) 147*95b7b453SJohn Marino { 148*95b7b453SJohn Marino /* The entire needle has been found. */ 149*95b7b453SJohn Marino *resultp = mbui_cur_ptr (rhaystack); 150*95b7b453SJohn Marino break; 151*95b7b453SJohn Marino } 152*95b7b453SJohn Marino } 153*95b7b453SJohn Marino else if (j > 0) 154*95b7b453SJohn Marino { 155*95b7b453SJohn Marino /* Found a match of needle[0..j-1], mismatch at needle[j]. */ 156*95b7b453SJohn Marino size_t count = table[j]; 157*95b7b453SJohn Marino j -= count; 158*95b7b453SJohn Marino for (; count > 0; count--) 159*95b7b453SJohn Marino { 160*95b7b453SJohn Marino if (!mbui_avail (rhaystack)) 161*95b7b453SJohn Marino abort (); 162*95b7b453SJohn Marino mbui_advance (rhaystack); 163*95b7b453SJohn Marino } 164*95b7b453SJohn Marino } 165*95b7b453SJohn Marino else 166*95b7b453SJohn Marino { 167*95b7b453SJohn Marino /* Found a mismatch at needle[0] already. */ 168*95b7b453SJohn Marino if (!mbui_avail (rhaystack)) 169*95b7b453SJohn Marino abort (); 170*95b7b453SJohn Marino mbui_advance (rhaystack); 171*95b7b453SJohn Marino mbui_advance (phaystack); 172*95b7b453SJohn Marino } 173*95b7b453SJohn Marino } 174*95b7b453SJohn Marino 175*95b7b453SJohn Marino freea (memory); 176*95b7b453SJohn Marino return true; 177*95b7b453SJohn Marino } 178*95b7b453SJohn Marino 179*95b7b453SJohn Marino /* Find the first occurrence of the character string NEEDLE in the character 180*95b7b453SJohn Marino string HAYSTACK. Return NULL if NEEDLE is not found in HAYSTACK. */ 181*95b7b453SJohn Marino char * 182*95b7b453SJohn Marino mbsstr (const char *haystack, const char *needle) 183*95b7b453SJohn Marino { 184*95b7b453SJohn Marino /* Be careful not to look at the entire extent of haystack or needle 185*95b7b453SJohn Marino until needed. This is useful because of these two cases: 186*95b7b453SJohn Marino - haystack may be very long, and a match of needle found early, 187*95b7b453SJohn Marino - needle may be very long, and not even a short initial segment of 188*95b7b453SJohn Marino needle may be found in haystack. */ 189*95b7b453SJohn Marino if (MB_CUR_MAX > 1) 190*95b7b453SJohn Marino { 191*95b7b453SJohn Marino mbui_iterator_t iter_needle; 192*95b7b453SJohn Marino 193*95b7b453SJohn Marino mbui_init (iter_needle, needle); 194*95b7b453SJohn Marino if (mbui_avail (iter_needle)) 195*95b7b453SJohn Marino { 196*95b7b453SJohn Marino /* Minimizing the worst-case complexity: 197*95b7b453SJohn Marino Let n = mbslen(haystack), m = mbslen(needle). 198*95b7b453SJohn Marino The naïve algorithm is O(n*m) worst-case. 199*95b7b453SJohn Marino The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a 200*95b7b453SJohn Marino memory allocation. 201*95b7b453SJohn Marino To achieve linear complexity and yet amortize the cost of the 202*95b7b453SJohn Marino memory allocation, we activate the Knuth-Morris-Pratt algorithm 203*95b7b453SJohn Marino only once the naïve algorithm has already run for some time; more 204*95b7b453SJohn Marino precisely, when 205*95b7b453SJohn Marino - the outer loop count is >= 10, 206*95b7b453SJohn Marino - the average number of comparisons per outer loop is >= 5, 207*95b7b453SJohn Marino - the total number of comparisons is >= m. 208*95b7b453SJohn Marino But we try it only once. If the memory allocation attempt failed, 209*95b7b453SJohn Marino we don't retry it. */ 210*95b7b453SJohn Marino bool try_kmp = true; 211*95b7b453SJohn Marino size_t outer_loop_count = 0; 212*95b7b453SJohn Marino size_t comparison_count = 0; 213*95b7b453SJohn Marino size_t last_ccount = 0; /* last comparison count */ 214*95b7b453SJohn Marino mbui_iterator_t iter_needle_last_ccount; /* = needle + last_ccount */ 215*95b7b453SJohn Marino 216*95b7b453SJohn Marino mbui_iterator_t iter_haystack; 217*95b7b453SJohn Marino 218*95b7b453SJohn Marino mbui_init (iter_needle_last_ccount, needle); 219*95b7b453SJohn Marino mbui_init (iter_haystack, haystack); 220*95b7b453SJohn Marino for (;; mbui_advance (iter_haystack)) 221*95b7b453SJohn Marino { 222*95b7b453SJohn Marino if (!mbui_avail (iter_haystack)) 223*95b7b453SJohn Marino /* No match. */ 224*95b7b453SJohn Marino return NULL; 225*95b7b453SJohn Marino 226*95b7b453SJohn Marino /* See whether it's advisable to use an asymptotically faster 227*95b7b453SJohn Marino algorithm. */ 228*95b7b453SJohn Marino if (try_kmp 229*95b7b453SJohn Marino && outer_loop_count >= 10 230*95b7b453SJohn Marino && comparison_count >= 5 * outer_loop_count) 231*95b7b453SJohn Marino { 232*95b7b453SJohn Marino /* See if needle + comparison_count now reaches the end of 233*95b7b453SJohn Marino needle. */ 234*95b7b453SJohn Marino size_t count = comparison_count - last_ccount; 235*95b7b453SJohn Marino for (; 236*95b7b453SJohn Marino count > 0 && mbui_avail (iter_needle_last_ccount); 237*95b7b453SJohn Marino count--) 238*95b7b453SJohn Marino mbui_advance (iter_needle_last_ccount); 239*95b7b453SJohn Marino last_ccount = comparison_count; 240*95b7b453SJohn Marino if (!mbui_avail (iter_needle_last_ccount)) 241*95b7b453SJohn Marino { 242*95b7b453SJohn Marino /* Try the Knuth-Morris-Pratt algorithm. */ 243*95b7b453SJohn Marino const char *result; 244*95b7b453SJohn Marino bool success = 245*95b7b453SJohn Marino knuth_morris_pratt_multibyte (haystack, needle, 246*95b7b453SJohn Marino &result); 247*95b7b453SJohn Marino if (success) 248*95b7b453SJohn Marino return (char *) result; 249*95b7b453SJohn Marino try_kmp = false; 250*95b7b453SJohn Marino } 251*95b7b453SJohn Marino } 252*95b7b453SJohn Marino 253*95b7b453SJohn Marino outer_loop_count++; 254*95b7b453SJohn Marino comparison_count++; 255*95b7b453SJohn Marino if (mb_equal (mbui_cur (iter_haystack), mbui_cur (iter_needle))) 256*95b7b453SJohn Marino /* The first character matches. */ 257*95b7b453SJohn Marino { 258*95b7b453SJohn Marino mbui_iterator_t rhaystack; 259*95b7b453SJohn Marino mbui_iterator_t rneedle; 260*95b7b453SJohn Marino 261*95b7b453SJohn Marino memcpy (&rhaystack, &iter_haystack, sizeof (mbui_iterator_t)); 262*95b7b453SJohn Marino mbui_advance (rhaystack); 263*95b7b453SJohn Marino 264*95b7b453SJohn Marino mbui_init (rneedle, needle); 265*95b7b453SJohn Marino if (!mbui_avail (rneedle)) 266*95b7b453SJohn Marino abort (); 267*95b7b453SJohn Marino mbui_advance (rneedle); 268*95b7b453SJohn Marino 269*95b7b453SJohn Marino for (;; mbui_advance (rhaystack), mbui_advance (rneedle)) 270*95b7b453SJohn Marino { 271*95b7b453SJohn Marino if (!mbui_avail (rneedle)) 272*95b7b453SJohn Marino /* Found a match. */ 273*95b7b453SJohn Marino return (char *) mbui_cur_ptr (iter_haystack); 274*95b7b453SJohn Marino if (!mbui_avail (rhaystack)) 275*95b7b453SJohn Marino /* No match. */ 276*95b7b453SJohn Marino return NULL; 277*95b7b453SJohn Marino comparison_count++; 278*95b7b453SJohn Marino if (!mb_equal (mbui_cur (rhaystack), mbui_cur (rneedle))) 279*95b7b453SJohn Marino /* Nothing in this round. */ 280*95b7b453SJohn Marino break; 281*95b7b453SJohn Marino } 282*95b7b453SJohn Marino } 283*95b7b453SJohn Marino } 284*95b7b453SJohn Marino } 285*95b7b453SJohn Marino else 286*95b7b453SJohn Marino return (char *) haystack; 287*95b7b453SJohn Marino } 288*95b7b453SJohn Marino else 289*95b7b453SJohn Marino { 290*95b7b453SJohn Marino if (*needle != '\0') 291*95b7b453SJohn Marino { 292*95b7b453SJohn Marino /* Minimizing the worst-case complexity: 293*95b7b453SJohn Marino Let n = strlen(haystack), m = strlen(needle). 294*95b7b453SJohn Marino The naïve algorithm is O(n*m) worst-case. 295*95b7b453SJohn Marino The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a 296*95b7b453SJohn Marino memory allocation. 297*95b7b453SJohn Marino To achieve linear complexity and yet amortize the cost of the 298*95b7b453SJohn Marino memory allocation, we activate the Knuth-Morris-Pratt algorithm 299*95b7b453SJohn Marino only once the naïve algorithm has already run for some time; more 300*95b7b453SJohn Marino precisely, when 301*95b7b453SJohn Marino - the outer loop count is >= 10, 302*95b7b453SJohn Marino - the average number of comparisons per outer loop is >= 5, 303*95b7b453SJohn Marino - the total number of comparisons is >= m. 304*95b7b453SJohn Marino But we try it only once. If the memory allocation attempt failed, 305*95b7b453SJohn Marino we don't retry it. */ 306*95b7b453SJohn Marino bool try_kmp = true; 307*95b7b453SJohn Marino size_t outer_loop_count = 0; 308*95b7b453SJohn Marino size_t comparison_count = 0; 309*95b7b453SJohn Marino size_t last_ccount = 0; /* last comparison count */ 310*95b7b453SJohn Marino const char *needle_last_ccount = needle; /* = needle + last_ccount */ 311*95b7b453SJohn Marino 312*95b7b453SJohn Marino /* Speed up the following searches of needle by caching its first 313*95b7b453SJohn Marino character. */ 314*95b7b453SJohn Marino char b = *needle++; 315*95b7b453SJohn Marino 316*95b7b453SJohn Marino for (;; haystack++) 317*95b7b453SJohn Marino { 318*95b7b453SJohn Marino if (*haystack == '\0') 319*95b7b453SJohn Marino /* No match. */ 320*95b7b453SJohn Marino return NULL; 321*95b7b453SJohn Marino 322*95b7b453SJohn Marino /* See whether it's advisable to use an asymptotically faster 323*95b7b453SJohn Marino algorithm. */ 324*95b7b453SJohn Marino if (try_kmp 325*95b7b453SJohn Marino && outer_loop_count >= 10 326*95b7b453SJohn Marino && comparison_count >= 5 * outer_loop_count) 327*95b7b453SJohn Marino { 328*95b7b453SJohn Marino /* See if needle + comparison_count now reaches the end of 329*95b7b453SJohn Marino needle. */ 330*95b7b453SJohn Marino if (needle_last_ccount != NULL) 331*95b7b453SJohn Marino { 332*95b7b453SJohn Marino needle_last_ccount += 333*95b7b453SJohn Marino strnlen (needle_last_ccount, 334*95b7b453SJohn Marino comparison_count - last_ccount); 335*95b7b453SJohn Marino if (*needle_last_ccount == '\0') 336*95b7b453SJohn Marino needle_last_ccount = NULL; 337*95b7b453SJohn Marino last_ccount = comparison_count; 338*95b7b453SJohn Marino } 339*95b7b453SJohn Marino if (needle_last_ccount == NULL) 340*95b7b453SJohn Marino { 341*95b7b453SJohn Marino /* Try the Knuth-Morris-Pratt algorithm. */ 342*95b7b453SJohn Marino const char *result; 343*95b7b453SJohn Marino bool success = 344*95b7b453SJohn Marino knuth_morris_pratt_unibyte (haystack, needle - 1, 345*95b7b453SJohn Marino &result); 346*95b7b453SJohn Marino if (success) 347*95b7b453SJohn Marino return (char *) result; 348*95b7b453SJohn Marino try_kmp = false; 349*95b7b453SJohn Marino } 350*95b7b453SJohn Marino } 351*95b7b453SJohn Marino 352*95b7b453SJohn Marino outer_loop_count++; 353*95b7b453SJohn Marino comparison_count++; 354*95b7b453SJohn Marino if (*haystack == b) 355*95b7b453SJohn Marino /* The first character matches. */ 356*95b7b453SJohn Marino { 357*95b7b453SJohn Marino const char *rhaystack = haystack + 1; 358*95b7b453SJohn Marino const char *rneedle = needle; 359*95b7b453SJohn Marino 360*95b7b453SJohn Marino for (;; rhaystack++, rneedle++) 361*95b7b453SJohn Marino { 362*95b7b453SJohn Marino if (*rneedle == '\0') 363*95b7b453SJohn Marino /* Found a match. */ 364*95b7b453SJohn Marino return (char *) haystack; 365*95b7b453SJohn Marino if (*rhaystack == '\0') 366*95b7b453SJohn Marino /* No match. */ 367*95b7b453SJohn Marino return NULL; 368*95b7b453SJohn Marino comparison_count++; 369*95b7b453SJohn Marino if (*rhaystack != *rneedle) 370*95b7b453SJohn Marino /* Nothing in this round. */ 371*95b7b453SJohn Marino break; 372*95b7b453SJohn Marino } 373*95b7b453SJohn Marino } 374*95b7b453SJohn Marino } 375*95b7b453SJohn Marino } 376*95b7b453SJohn Marino else 377*95b7b453SJohn Marino return (char *) haystack; 378*95b7b453SJohn Marino } 379*95b7b453SJohn Marino } 380