1259698Sdim/* Byte-wise substring search, using the Two-Way algorithm. 2259698Sdim Copyright (C) 2008-2022 Free Software Foundation, Inc. 3259698Sdim This file is part of the GNU C Library. 4259698Sdim Written by Eric Blake <ebb9@byu.net>, 2008. 5259698Sdim 6259698Sdim This file is free software: you can redistribute it and/or modify 7259698Sdim it under the terms of the GNU Lesser General Public License as 8259698Sdim published by the Free Software Foundation; either version 2.1 of the 9259698Sdim License, or (at your option) any later version. 10259698Sdim 11259698Sdim This file is distributed in the hope that it will be useful, 12259698Sdim but WITHOUT ANY WARRANTY; without even the implied warranty of 13259698Sdim MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14259698Sdim GNU Lesser General Public License for more details. 15259698Sdim 16259698Sdim You should have received a copy of the GNU Lesser General Public License 17259698Sdim along with this program. If not, see <https://www.gnu.org/licenses/>. */ 18259698Sdim 19259698Sdim/* Before including this file, you need to include <config.h> and 20259698Sdim <string.h>, and define: 21259698Sdim RETURN_TYPE A macro that expands to the return type. 22259698Sdim AVAILABLE(h, h_l, j, n_l) 23259698Sdim A macro that returns nonzero if there are 24259698Sdim at least N_L bytes left starting at H[J]. 25259698Sdim H is 'unsigned char *', H_L, J, and N_L 26259698Sdim are 'size_t'; H_L is an lvalue. For 27259698Sdim NUL-terminated searches, H_L can be 28259698Sdim modified each iteration to avoid having 29259698Sdim to compute the end of H up front. 30259698Sdim 31259698Sdim For case-insensitivity, you may optionally define: 32259698Sdim CMP_FUNC(p1, p2, l) A macro that returns 0 iff the first L 33259698Sdim characters of P1 and P2 are equal. 34259698Sdim CANON_ELEMENT(c) A macro that canonicalizes an element right after 35259698Sdim it has been fetched from one of the two strings. 36259698Sdim The argument is an 'unsigned char'; the result 37259698Sdim must be an 'unsigned char' as well. 38259698Sdim 39259698Sdim This file undefines the macros documented above, and defines 40259698Sdim LONG_NEEDLE_THRESHOLD. 41259698Sdim*/ 42259698Sdim 43259698Sdim#include <limits.h> 44259698Sdim#include <stdint.h> 45259698Sdim 46259698Sdim/* We use the Two-Way string matching algorithm (also known as 47259698Sdim Chrochemore-Perrin), which guarantees linear complexity with 48259698Sdim constant space. Additionally, for long needles, we also use a bad 49259698Sdim character shift table similar to the Boyer-Moore algorithm to 50259698Sdim achieve improved (potentially sub-linear) performance. 51259698Sdim 52259698Sdim See https://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260, 53259698Sdim https://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm, 54259698Sdim https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.34.6641&rep=rep1&type=pdf 55259698Sdim*/ 56259698Sdim 57259698Sdim/* Point at which computing a bad-byte shift table is likely to be 58259698Sdim worthwhile. Small needles should not compute a table, since it 59259698Sdim adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a 60259698Sdim speedup no greater than a factor of NEEDLE_LEN. The larger the 61259698Sdim needle, the better the potential performance gain. On the other 62259698Sdim hand, on non-POSIX systems with CHAR_BIT larger than eight, the 63259698Sdim memory required for the table is prohibitive. */ 64259698Sdim#if CHAR_BIT < 10 65259698Sdim# define LONG_NEEDLE_THRESHOLD 32U 66259698Sdim#else 67259698Sdim# define LONG_NEEDLE_THRESHOLD SIZE_MAX 68259698Sdim#endif 69259698Sdim 70259698Sdim#ifndef MAX 71259698Sdim# define MAX(a, b) ((a < b) ? (b) : (a)) 72259698Sdim#endif 73259698Sdim 74259698Sdim#ifndef CANON_ELEMENT 75259698Sdim# define CANON_ELEMENT(c) c 76259698Sdim#endif 77259698Sdim#ifndef CMP_FUNC 78259698Sdim# define CMP_FUNC memcmp 79259698Sdim#endif 80259698Sdim 81259698Sdim/* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN. 82259698Sdim Return the index of the first byte in the right half, and set 83259698Sdim *PERIOD to the global period of the right half. 84259698Sdim 85259698Sdim The global period of a string is the smallest index (possibly its 86259698Sdim length) at which all remaining bytes in the string are repetitions 87259698Sdim of the prefix (the last repetition may be a subset of the prefix). 88259698Sdim 89259698Sdim When NEEDLE is factored into two halves, a local period is the 90259698Sdim length of the smallest word that shares a suffix with the left half 91259698Sdim and shares a prefix with the right half. All factorizations of a 92259698Sdim non-empty NEEDLE have a local period of at least 1 and no greater 93259698Sdim than NEEDLE_LEN. 94259698Sdim 95259698Sdim A critical factorization has the property that the local period 96259698Sdim equals the global period. All strings have at least one critical 97259698Sdim factorization with the left half smaller than the global period. 98259698Sdim And while some strings have more than one critical factorization, 99259698Sdim it is provable that with an ordered alphabet, at least one of the 100259698Sdim critical factorizations corresponds to a maximal suffix. 101259698Sdim 102259698Sdim Given an ordered alphabet, a critical factorization can be computed 103259698Sdim in linear time, with 2 * NEEDLE_LEN comparisons, by computing the 104259698Sdim shorter of two ordered maximal suffixes. The ordered maximal 105259698Sdim suffixes are determined by lexicographic comparison while tracking 106259698Sdim periodicity. */ 107259698Sdimstatic size_t 108259698Sdimcritical_factorization (const unsigned char *needle, size_t needle_len, 109259698Sdim size_t *period) 110259698Sdim{ 111259698Sdim /* Index of last byte of left half, or SIZE_MAX. */ 112259698Sdim size_t max_suffix, max_suffix_rev; 113259698Sdim size_t j; /* Index into NEEDLE for current candidate suffix. */ 114259698Sdim size_t k; /* Offset into current period. */ 115259698Sdim size_t p; /* Intermediate period. */ 116259698Sdim unsigned char a, b; /* Current comparison bytes. */ 117259698Sdim 118259698Sdim /* Special case NEEDLE_LEN of 1 or 2 (all callers already filtered 119259698Sdim out 0-length needles. */ 120259698Sdim if (needle_len < 3) 121259698Sdim { 122259698Sdim *period = 1; 123259698Sdim return needle_len - 1; 124259698Sdim } 125259698Sdim 126259698Sdim /* Invariants: 127259698Sdim 0 <= j < NEEDLE_LEN - 1 128259698Sdim -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed) 129259698Sdim min(max_suffix, max_suffix_rev) < global period of NEEDLE 130259698Sdim 1 <= p <= global period of NEEDLE 131259698Sdim p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j] 132259698Sdim 1 <= k <= p 133259698Sdim */ 134259698Sdim 135259698Sdim /* Perform lexicographic search. */ 136259698Sdim max_suffix = SIZE_MAX; 137259698Sdim j = 0; 138259698Sdim k = p = 1; 139259698Sdim while (j + k < needle_len) 140259698Sdim { 141259698Sdim a = CANON_ELEMENT (needle[j + k]); 142259698Sdim b = CANON_ELEMENT (needle[max_suffix + k]); 143259698Sdim if (a < b) 144259698Sdim { 145259698Sdim /* Suffix is smaller, period is entire prefix so far. */ 146259698Sdim j += k; 147259698Sdim k = 1; 148259698Sdim p = j - max_suffix; 149259698Sdim } 150259698Sdim else if (a == b) 151259698Sdim { 152259698Sdim /* Advance through repetition of the current period. */ 153259698Sdim if (k != p) 154259698Sdim ++k; 155259698Sdim else 156259698Sdim { 157259698Sdim j += p; 158259698Sdim k = 1; 159259698Sdim } 160259698Sdim } 161259698Sdim else /* b < a */ 162259698Sdim { 163259698Sdim /* Suffix is larger, start over from current location. */ 164259698Sdim max_suffix = j++; 165259698Sdim k = p = 1; 166259698Sdim } 167259698Sdim } 168259698Sdim *period = p; 169259698Sdim 170259698Sdim /* Perform reverse lexicographic search. */ 171259698Sdim max_suffix_rev = SIZE_MAX; 172259698Sdim j = 0; 173259698Sdim k = p = 1; 174259698Sdim while (j + k < needle_len) 175259698Sdim { 176259698Sdim a = CANON_ELEMENT (needle[j + k]); 177259698Sdim b = CANON_ELEMENT (needle[max_suffix_rev + k]); 178259698Sdim if (b < a) 179259698Sdim { 180259698Sdim /* Suffix is smaller, period is entire prefix so far. */ 181259698Sdim j += k; 182259698Sdim k = 1; 183259698Sdim p = j - max_suffix_rev; 184259698Sdim } 185259698Sdim else if (a == b) 186259698Sdim { 187259698Sdim /* Advance through repetition of the current period. */ 188259698Sdim if (k != p) 189259698Sdim ++k; 190259698Sdim else 191259698Sdim { 192259698Sdim j += p; 193259698Sdim k = 1; 194259698Sdim } 195259698Sdim } 196259698Sdim else /* a < b */ 197259698Sdim { 198259698Sdim /* Suffix is larger, start over from current location. */ 199259698Sdim max_suffix_rev = j++; 200259698Sdim k = p = 1; 201259698Sdim } 202259698Sdim } 203259698Sdim 204259698Sdim /* Choose the shorter suffix. Return the index of the first byte of 205 the right half, rather than the last byte of the left half. 206 207 For some examples, 'banana' has two critical factorizations, both 208 exposed by the two lexicographic extreme suffixes of 'anana' and 209 'nana', where both suffixes have a period of 2. On the other 210 hand, with 'aab' and 'bba', both strings have a single critical 211 factorization of the last byte, with the suffix having a period 212 of 1. While the maximal lexicographic suffix of 'aab' is 'b', 213 the maximal lexicographic suffix of 'bba' is 'ba', which is not a 214 critical factorization. Conversely, the maximal reverse 215 lexicographic suffix of 'a' works for 'bba', but not 'ab' for 216 'aab'. The shorter suffix of the two will always be a critical 217 factorization. */ 218 if (max_suffix_rev + 1 < max_suffix + 1) 219 return max_suffix + 1; 220 *period = p; 221 return max_suffix_rev + 1; 222} 223 224/* Return the first location of non-empty NEEDLE within HAYSTACK, or 225 NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This 226 method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD. 227 Performance is guaranteed to be linear, with an initialization cost 228 of 2 * NEEDLE_LEN comparisons. 229 230 If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at 231 most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. 232 If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * 233 HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. */ 234static RETURN_TYPE 235two_way_short_needle (const unsigned char *haystack, size_t haystack_len, 236 const unsigned char *needle, size_t needle_len) 237{ 238 size_t i; /* Index into current byte of NEEDLE. */ 239 size_t j; /* Index into current window of HAYSTACK. */ 240 size_t period; /* The period of the right half of needle. */ 241 size_t suffix; /* The index of the right half of needle. */ 242 243 /* Factor the needle into two halves, such that the left half is 244 smaller than the global period, and the right half is 245 periodic (with a period as large as NEEDLE_LEN - suffix). */ 246 suffix = critical_factorization (needle, needle_len, &period); 247 248 /* Perform the search. Each iteration compares the right half 249 first. */ 250 if (CMP_FUNC (needle, needle + period, suffix) == 0) 251 { 252 /* Entire needle is periodic; a mismatch in the left half can 253 only advance by the period, so use memory to avoid rescanning 254 known occurrences of the period in the right half. */ 255 size_t memory = 0; 256 j = 0; 257 while (AVAILABLE (haystack, haystack_len, j, needle_len)) 258 { 259 /* Scan for matches in right half. */ 260 i = MAX (suffix, memory); 261 while (i < needle_len && (CANON_ELEMENT (needle[i]) 262 == CANON_ELEMENT (haystack[i + j]))) 263 ++i; 264 if (needle_len <= i) 265 { 266 /* Scan for matches in left half. */ 267 i = suffix - 1; 268 while (memory < i + 1 && (CANON_ELEMENT (needle[i]) 269 == CANON_ELEMENT (haystack[i + j]))) 270 --i; 271 if (i + 1 < memory + 1) 272 return (RETURN_TYPE) (haystack + j); 273 /* No match, so remember how many repetitions of period 274 on the right half were scanned. */ 275 j += period; 276 memory = needle_len - period; 277 } 278 else 279 { 280 j += i - suffix + 1; 281 memory = 0; 282 } 283 } 284 } 285 else 286 { 287 /* The two halves of needle are distinct; no extra memory is 288 required, and any mismatch results in a maximal shift. */ 289 period = MAX (suffix, needle_len - suffix) + 1; 290 j = 0; 291 while (AVAILABLE (haystack, haystack_len, j, needle_len)) 292 { 293 /* Scan for matches in right half. */ 294 i = suffix; 295 while (i < needle_len && (CANON_ELEMENT (needle[i]) 296 == CANON_ELEMENT (haystack[i + j]))) 297 ++i; 298 if (needle_len <= i) 299 { 300 /* Scan for matches in left half. */ 301 i = suffix - 1; 302 while (i != SIZE_MAX && (CANON_ELEMENT (needle[i]) 303 == CANON_ELEMENT (haystack[i + j]))) 304 --i; 305 if (i == SIZE_MAX) 306 return (RETURN_TYPE) (haystack + j); 307 j += period; 308 } 309 else 310 j += i - suffix + 1; 311 } 312 } 313 return NULL; 314} 315 316/* Return the first location of non-empty NEEDLE within HAYSTACK, or 317 NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This 318 method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN. 319 Performance is guaranteed to be linear, with an initialization cost 320 of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations. 321 322 If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at 323 most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, 324 and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible. 325 If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * 326 HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and 327 sublinear performance is not possible. */ 328static RETURN_TYPE 329two_way_long_needle (const unsigned char *haystack, size_t haystack_len, 330 const unsigned char *needle, size_t needle_len) 331{ 332 size_t i; /* Index into current byte of NEEDLE. */ 333 size_t j; /* Index into current window of HAYSTACK. */ 334 size_t period; /* The period of the right half of needle. */ 335 size_t suffix; /* The index of the right half of needle. */ 336 size_t shift_table[1U << CHAR_BIT]; /* See below. */ 337 338 /* Factor the needle into two halves, such that the left half is 339 smaller than the global period, and the right half is 340 periodic (with a period as large as NEEDLE_LEN - suffix). */ 341 suffix = critical_factorization (needle, needle_len, &period); 342 343 /* Populate shift_table. For each possible byte value c, 344 shift_table[c] is the distance from the last occurrence of c to 345 the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE. 346 shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0. */ 347 for (i = 0; i < 1U << CHAR_BIT; i++) 348 shift_table[i] = needle_len; 349 for (i = 0; i < needle_len; i++) 350 shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1; 351 352 /* Perform the search. Each iteration compares the right half 353 first. */ 354 if (CMP_FUNC (needle, needle + period, suffix) == 0) 355 { 356 /* Entire needle is periodic; a mismatch in the left half can 357 only advance by the period, so use memory to avoid rescanning 358 known occurrences of the period in the right half. */ 359 size_t memory = 0; 360 size_t shift; 361 j = 0; 362 while (AVAILABLE (haystack, haystack_len, j, needle_len)) 363 { 364 /* Check the last byte first; if it does not match, then 365 shift to the next possible match location. */ 366 shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])]; 367 if (0 < shift) 368 { 369 if (memory && shift < period) 370 { 371 /* Since needle is periodic, but the last period has 372 a byte out of place, there can be no match until 373 after the mismatch. */ 374 shift = needle_len - period; 375 } 376 memory = 0; 377 j += shift; 378 continue; 379 } 380 /* Scan for matches in right half. The last byte has 381 already been matched, by virtue of the shift table. */ 382 i = MAX (suffix, memory); 383 while (i < needle_len - 1 && (CANON_ELEMENT (needle[i]) 384 == CANON_ELEMENT (haystack[i + j]))) 385 ++i; 386 if (needle_len - 1 <= i) 387 { 388 /* Scan for matches in left half. */ 389 i = suffix - 1; 390 while (memory < i + 1 && (CANON_ELEMENT (needle[i]) 391 == CANON_ELEMENT (haystack[i + j]))) 392 --i; 393 if (i + 1 < memory + 1) 394 return (RETURN_TYPE) (haystack + j); 395 /* No match, so remember how many repetitions of period 396 on the right half were scanned. */ 397 j += period; 398 memory = needle_len - period; 399 } 400 else 401 { 402 j += i - suffix + 1; 403 memory = 0; 404 } 405 } 406 } 407 else 408 { 409 /* The two halves of needle are distinct; no extra memory is 410 required, and any mismatch results in a maximal shift. */ 411 size_t shift; 412 period = MAX (suffix, needle_len - suffix) + 1; 413 j = 0; 414 while (AVAILABLE (haystack, haystack_len, j, needle_len)) 415 { 416 /* Check the last byte first; if it does not match, then 417 shift to the next possible match location. */ 418 shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])]; 419 if (0 < shift) 420 { 421 j += shift; 422 continue; 423 } 424 /* Scan for matches in right half. The last byte has 425 already been matched, by virtue of the shift table. */ 426 i = suffix; 427 while (i < needle_len - 1 && (CANON_ELEMENT (needle[i]) 428 == CANON_ELEMENT (haystack[i + j]))) 429 ++i; 430 if (needle_len - 1 <= i) 431 { 432 /* Scan for matches in left half. */ 433 i = suffix - 1; 434 while (i != SIZE_MAX && (CANON_ELEMENT (needle[i]) 435 == CANON_ELEMENT (haystack[i + j]))) 436 --i; 437 if (i == SIZE_MAX) 438 return (RETURN_TYPE) (haystack + j); 439 j += period; 440 } 441 else 442 j += i - suffix + 1; 443 } 444 } 445 return NULL; 446} 447 448#undef AVAILABLE 449#undef CANON_ELEMENT 450#undef CMP_FUNC 451#undef MAX 452#undef RETURN_TYPE 453