1/* Searching in a string. 2 Copyright (C) 2005-2007 Free Software Foundation, Inc. 3 Written by Bruno Haible <bruno@clisp.org>, 2005. 4 5 This program is free software: you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 3 of the License, or 8 (at your option) any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 17 18#include <config.h> 19 20/* Specification. */ 21#include <string.h> 22 23#include <stdbool.h> 24#include <stddef.h> /* for NULL, in case a nonstandard string.h lacks it */ 25 26#include "malloca.h" 27#if HAVE_MBRTOWC 28# include "mbuiter.h" 29#endif 30 31/* Knuth-Morris-Pratt algorithm. 32 See http://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm 33 Return a boolean indicating success. */ 34 35static bool 36knuth_morris_pratt_unibyte (const char *haystack, const char *needle, 37 const char **resultp) 38{ 39 size_t m = strlen (needle); 40 41 /* Allocate the table. */ 42 size_t *table = (size_t *) malloca (m * sizeof (size_t)); 43 if (table == NULL) 44 return false; 45 /* Fill the table. 46 For 0 < i < m: 47 0 < table[i] <= i is defined such that 48 rhaystack[0..i-1] == needle[0..i-1] and rhaystack[i] != needle[i] 49 implies 50 forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1], 51 and table[i] is as large as possible with this property. 52 table[0] remains uninitialized. */ 53 { 54 size_t i, j; 55 56 table[1] = 1; 57 j = 0; 58 for (i = 2; i < m; i++) 59 { 60 unsigned char b = (unsigned char) needle[i - 1]; 61 62 for (;;) 63 { 64 if (b == (unsigned char) needle[j]) 65 { 66 table[i] = i - ++j; 67 break; 68 } 69 if (j == 0) 70 { 71 table[i] = i; 72 break; 73 } 74 j = j - table[j]; 75 } 76 } 77 } 78 79 /* Search, using the table to accelerate the processing. */ 80 { 81 size_t j; 82 const char *rhaystack; 83 const char *phaystack; 84 85 *resultp = NULL; 86 j = 0; 87 rhaystack = haystack; 88 phaystack = haystack; 89 /* Invariant: phaystack = rhaystack + j. */ 90 while (*phaystack != '\0') 91 if ((unsigned char) needle[j] == (unsigned char) *phaystack) 92 { 93 j++; 94 phaystack++; 95 if (j == m) 96 { 97 /* The entire needle has been found. */ 98 *resultp = rhaystack; 99 break; 100 } 101 } 102 else if (j > 0) 103 { 104 /* Found a match of needle[0..j-1], mismatch at needle[j]. */ 105 rhaystack += table[j]; 106 j -= table[j]; 107 } 108 else 109 { 110 /* Found a mismatch at needle[0] already. */ 111 rhaystack++; 112 phaystack++; 113 } 114 } 115 116 freea (table); 117 return true; 118} 119 120#if HAVE_MBRTOWC 121static bool 122knuth_morris_pratt_multibyte (const char *haystack, const char *needle, 123 const char **resultp) 124{ 125 size_t m = mbslen (needle); 126 mbchar_t *needle_mbchars; 127 size_t *table; 128 129 /* Allocate room for needle_mbchars and the table. */ 130 char *memory = (char *) malloca (m * (sizeof (mbchar_t) + sizeof (size_t))); 131 if (memory == NULL) 132 return false; 133 needle_mbchars = (mbchar_t *) memory; 134 table = (size_t *) (memory + m * sizeof (mbchar_t)); 135 136 /* Fill needle_mbchars. */ 137 { 138 mbui_iterator_t iter; 139 size_t j; 140 141 j = 0; 142 for (mbui_init (iter, needle); mbui_avail (iter); mbui_advance (iter), j++) 143 mb_copy (&needle_mbchars[j], &mbui_cur (iter)); 144 } 145 146 /* Fill the table. 147 For 0 < i < m: 148 0 < table[i] <= i is defined such that 149 rhaystack[0..i-1] == needle[0..i-1] and rhaystack[i] != needle[i] 150 implies 151 forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1], 152 and table[i] is as large as possible with this property. 153 table[0] remains uninitialized. */ 154 { 155 size_t i, j; 156 157 table[1] = 1; 158 j = 0; 159 for (i = 2; i < m; i++) 160 { 161 mbchar_t *b = &needle_mbchars[i - 1]; 162 163 for (;;) 164 { 165 if (mb_equal (*b, needle_mbchars[j])) 166 { 167 table[i] = i - ++j; 168 break; 169 } 170 if (j == 0) 171 { 172 table[i] = i; 173 break; 174 } 175 j = j - table[j]; 176 } 177 } 178 } 179 180 /* Search, using the table to accelerate the processing. */ 181 { 182 size_t j; 183 mbui_iterator_t rhaystack; 184 mbui_iterator_t phaystack; 185 186 *resultp = NULL; 187 j = 0; 188 mbui_init (rhaystack, haystack); 189 mbui_init (phaystack, haystack); 190 /* Invariant: phaystack = rhaystack + j. */ 191 while (mbui_avail (phaystack)) 192 if (mb_equal (needle_mbchars[j], mbui_cur (phaystack))) 193 { 194 j++; 195 mbui_advance (phaystack); 196 if (j == m) 197 { 198 /* The entire needle has been found. */ 199 *resultp = mbui_cur_ptr (rhaystack); 200 break; 201 } 202 } 203 else if (j > 0) 204 { 205 /* Found a match of needle[0..j-1], mismatch at needle[j]. */ 206 size_t count = table[j]; 207 j -= count; 208 for (; count > 0; count--) 209 { 210 if (!mbui_avail (rhaystack)) 211 abort (); 212 mbui_advance (rhaystack); 213 } 214 } 215 else 216 { 217 /* Found a mismatch at needle[0] already. */ 218 if (!mbui_avail (rhaystack)) 219 abort (); 220 mbui_advance (rhaystack); 221 mbui_advance (phaystack); 222 } 223 } 224 225 freea (memory); 226 return true; 227} 228#endif 229 230/* Find the first occurrence of the character string NEEDLE in the character 231 string HAYSTACK. Return NULL if NEEDLE is not found in HAYSTACK. */ 232char * 233mbsstr (const char *haystack, const char *needle) 234{ 235 /* Be careful not to look at the entire extent of haystack or needle 236 until needed. This is useful because of these two cases: 237 - haystack may be very long, and a match of needle found early, 238 - needle may be very long, and not even a short initial segment of 239 needle may be found in haystack. */ 240#if HAVE_MBRTOWC 241 if (MB_CUR_MAX > 1) 242 { 243 mbui_iterator_t iter_needle; 244 245 mbui_init (iter_needle, needle); 246 if (mbui_avail (iter_needle)) 247 { 248 /* Minimizing the worst-case complexity: 249 Let n = mbslen(haystack), m = mbslen(needle). 250 The na��ve algorithm is O(n*m) worst-case. 251 The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a 252 memory allocation. 253 To achieve linear complexity and yet amortize the cost of the 254 memory allocation, we activate the Knuth-Morris-Pratt algorithm 255 only once the na��ve algorithm has already run for some time; more 256 precisely, when 257 - the outer loop count is >= 10, 258 - the average number of comparisons per outer loop is >= 5, 259 - the total number of comparisons is >= m. 260 But we try it only once. If the memory allocation attempt failed, 261 we don't retry it. */ 262 bool try_kmp = true; 263 size_t outer_loop_count = 0; 264 size_t comparison_count = 0; 265 size_t last_ccount = 0; /* last comparison count */ 266 mbui_iterator_t iter_needle_last_ccount; /* = needle + last_ccount */ 267 268 mbui_iterator_t iter_haystack; 269 270 mbui_init (iter_needle_last_ccount, needle); 271 mbui_init (iter_haystack, haystack); 272 for (;; mbui_advance (iter_haystack)) 273 { 274 if (!mbui_avail (iter_haystack)) 275 /* No match. */ 276 return NULL; 277 278 /* See whether it's advisable to use an asymptotically faster 279 algorithm. */ 280 if (try_kmp 281 && outer_loop_count >= 10 282 && comparison_count >= 5 * outer_loop_count) 283 { 284 /* See if needle + comparison_count now reaches the end of 285 needle. */ 286 size_t count = comparison_count - last_ccount; 287 for (; 288 count > 0 && mbui_avail (iter_needle_last_ccount); 289 count--) 290 mbui_advance (iter_needle_last_ccount); 291 last_ccount = comparison_count; 292 if (!mbui_avail (iter_needle_last_ccount)) 293 { 294 /* Try the Knuth-Morris-Pratt algorithm. */ 295 const char *result; 296 bool success = 297 knuth_morris_pratt_multibyte (haystack, needle, 298 &result); 299 if (success) 300 return (char *) result; 301 try_kmp = false; 302 } 303 } 304 305 outer_loop_count++; 306 comparison_count++; 307 if (mb_equal (mbui_cur (iter_haystack), mbui_cur (iter_needle))) 308 /* The first character matches. */ 309 { 310 mbui_iterator_t rhaystack; 311 mbui_iterator_t rneedle; 312 313 memcpy (&rhaystack, &iter_haystack, sizeof (mbui_iterator_t)); 314 mbui_advance (rhaystack); 315 316 mbui_init (rneedle, needle); 317 if (!mbui_avail (rneedle)) 318 abort (); 319 mbui_advance (rneedle); 320 321 for (;; mbui_advance (rhaystack), mbui_advance (rneedle)) 322 { 323 if (!mbui_avail (rneedle)) 324 /* Found a match. */ 325 return (char *) mbui_cur_ptr (iter_haystack); 326 if (!mbui_avail (rhaystack)) 327 /* No match. */ 328 return NULL; 329 comparison_count++; 330 if (!mb_equal (mbui_cur (rhaystack), mbui_cur (rneedle))) 331 /* Nothing in this round. */ 332 break; 333 } 334 } 335 } 336 } 337 else 338 return (char *) haystack; 339 } 340 else 341#endif 342 { 343 if (*needle != '\0') 344 { 345 /* Minimizing the worst-case complexity: 346 Let n = strlen(haystack), m = strlen(needle). 347 The na��ve algorithm is O(n*m) worst-case. 348 The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a 349 memory allocation. 350 To achieve linear complexity and yet amortize the cost of the 351 memory allocation, we activate the Knuth-Morris-Pratt algorithm 352 only once the na��ve algorithm has already run for some time; more 353 precisely, when 354 - the outer loop count is >= 10, 355 - the average number of comparisons per outer loop is >= 5, 356 - the total number of comparisons is >= m. 357 But we try it only once. If the memory allocation attempt failed, 358 we don't retry it. */ 359 bool try_kmp = true; 360 size_t outer_loop_count = 0; 361 size_t comparison_count = 0; 362 size_t last_ccount = 0; /* last comparison count */ 363 const char *needle_last_ccount = needle; /* = needle + last_ccount */ 364 365 /* Speed up the following searches of needle by caching its first 366 character. */ 367 char b = *needle++; 368 369 for (;; haystack++) 370 { 371 if (*haystack == '\0') 372 /* No match. */ 373 return NULL; 374 375 /* See whether it's advisable to use an asymptotically faster 376 algorithm. */ 377 if (try_kmp 378 && outer_loop_count >= 10 379 && comparison_count >= 5 * outer_loop_count) 380 { 381 /* See if needle + comparison_count now reaches the end of 382 needle. */ 383 if (needle_last_ccount != NULL) 384 { 385 needle_last_ccount += 386 strnlen (needle_last_ccount, 387 comparison_count - last_ccount); 388 if (*needle_last_ccount == '\0') 389 needle_last_ccount = NULL; 390 last_ccount = comparison_count; 391 } 392 if (needle_last_ccount == NULL) 393 { 394 /* Try the Knuth-Morris-Pratt algorithm. */ 395 const char *result; 396 bool success = 397 knuth_morris_pratt_unibyte (haystack, needle - 1, 398 &result); 399 if (success) 400 return (char *) result; 401 try_kmp = false; 402 } 403 } 404 405 outer_loop_count++; 406 comparison_count++; 407 if (*haystack == b) 408 /* The first character matches. */ 409 { 410 const char *rhaystack = haystack + 1; 411 const char *rneedle = needle; 412 413 for (;; rhaystack++, rneedle++) 414 { 415 if (*rneedle == '\0') 416 /* Found a match. */ 417 return (char *) haystack; 418 if (*rhaystack == '\0') 419 /* No match. */ 420 return NULL; 421 comparison_count++; 422 if (*rhaystack != *rneedle) 423 /* Nothing in this round. */ 424 break; 425 } 426 } 427 } 428 } 429 else 430 return (char *) haystack; 431 } 432} 433