1259698Sdim/* Byte-wise substring search, using the Two-Way algorithm.
2259698Sdim   Copyright (C) 2008-2022 Free Software Foundation, Inc.
3259698Sdim   This file is part of the GNU C Library.
4259698Sdim   Written by Eric Blake <ebb9@byu.net>, 2008.
5259698Sdim
6259698Sdim   This file is free software: you can redistribute it and/or modify
7259698Sdim   it under the terms of the GNU Lesser General Public License as
8259698Sdim   published by the Free Software Foundation; either version 2.1 of the
9259698Sdim   License, or (at your option) any later version.
10259698Sdim
11259698Sdim   This file is distributed in the hope that it will be useful,
12259698Sdim   but WITHOUT ANY WARRANTY; without even the implied warranty of
13259698Sdim   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14259698Sdim   GNU Lesser General Public License for more details.
15259698Sdim
16259698Sdim   You should have received a copy of the GNU Lesser General Public License
17259698Sdim   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
18259698Sdim
19259698Sdim/* Before including this file, you need to include <config.h> and
20259698Sdim   <string.h>, and define:
21259698Sdim     RETURN_TYPE             A macro that expands to the return type.
22259698Sdim     AVAILABLE(h, h_l, j, n_l)
23259698Sdim                             A macro that returns nonzero if there are
24259698Sdim                             at least N_L bytes left starting at H[J].
25259698Sdim                             H is 'unsigned char *', H_L, J, and N_L
26259698Sdim                             are 'size_t'; H_L is an lvalue.  For
27259698Sdim                             NUL-terminated searches, H_L can be
28259698Sdim                             modified each iteration to avoid having
29259698Sdim                             to compute the end of H up front.
30259698Sdim
31259698Sdim  For case-insensitivity, you may optionally define:
32259698Sdim     CMP_FUNC(p1, p2, l)     A macro that returns 0 iff the first L
33259698Sdim                             characters of P1 and P2 are equal.
34259698Sdim     CANON_ELEMENT(c)        A macro that canonicalizes an element right after
35259698Sdim                             it has been fetched from one of the two strings.
36259698Sdim                             The argument is an 'unsigned char'; the result
37259698Sdim                             must be an 'unsigned char' as well.
38259698Sdim
39259698Sdim  This file undefines the macros documented above, and defines
40259698Sdim  LONG_NEEDLE_THRESHOLD.
41259698Sdim*/
42259698Sdim
43259698Sdim#include <limits.h>
44259698Sdim#include <stdint.h>
45259698Sdim
46259698Sdim/* We use the Two-Way string matching algorithm (also known as
47259698Sdim   Chrochemore-Perrin), which guarantees linear complexity with
48259698Sdim   constant space.  Additionally, for long needles, we also use a bad
49259698Sdim   character shift table similar to the Boyer-Moore algorithm to
50259698Sdim   achieve improved (potentially sub-linear) performance.
51259698Sdim
52259698Sdim   See https://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260,
53259698Sdim   https://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm,
54259698Sdim   https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.34.6641&rep=rep1&type=pdf
55259698Sdim*/
56259698Sdim
57259698Sdim/* Point at which computing a bad-byte shift table is likely to be
58259698Sdim   worthwhile.  Small needles should not compute a table, since it
59259698Sdim   adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a
60259698Sdim   speedup no greater than a factor of NEEDLE_LEN.  The larger the
61259698Sdim   needle, the better the potential performance gain.  On the other
62259698Sdim   hand, on non-POSIX systems with CHAR_BIT larger than eight, the
63259698Sdim   memory required for the table is prohibitive.  */
64259698Sdim#if CHAR_BIT < 10
65259698Sdim# define LONG_NEEDLE_THRESHOLD 32U
66259698Sdim#else
67259698Sdim# define LONG_NEEDLE_THRESHOLD SIZE_MAX
68259698Sdim#endif
69259698Sdim
70259698Sdim#ifndef MAX
71259698Sdim# define MAX(a, b) ((a < b) ? (b) : (a))
72259698Sdim#endif
73259698Sdim
74259698Sdim#ifndef CANON_ELEMENT
75259698Sdim# define CANON_ELEMENT(c) c
76259698Sdim#endif
77259698Sdim#ifndef CMP_FUNC
78259698Sdim# define CMP_FUNC memcmp
79259698Sdim#endif
80259698Sdim
81259698Sdim/* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN.
82259698Sdim   Return the index of the first byte in the right half, and set
83259698Sdim   *PERIOD to the global period of the right half.
84259698Sdim
85259698Sdim   The global period of a string is the smallest index (possibly its
86259698Sdim   length) at which all remaining bytes in the string are repetitions
87259698Sdim   of the prefix (the last repetition may be a subset of the prefix).
88259698Sdim
89259698Sdim   When NEEDLE is factored into two halves, a local period is the
90259698Sdim   length of the smallest word that shares a suffix with the left half
91259698Sdim   and shares a prefix with the right half.  All factorizations of a
92259698Sdim   non-empty NEEDLE have a local period of at least 1 and no greater
93259698Sdim   than NEEDLE_LEN.
94259698Sdim
95259698Sdim   A critical factorization has the property that the local period
96259698Sdim   equals the global period.  All strings have at least one critical
97259698Sdim   factorization with the left half smaller than the global period.
98259698Sdim   And while some strings have more than one critical factorization,
99259698Sdim   it is provable that with an ordered alphabet, at least one of the
100259698Sdim   critical factorizations corresponds to a maximal suffix.
101259698Sdim
102259698Sdim   Given an ordered alphabet, a critical factorization can be computed
103259698Sdim   in linear time, with 2 * NEEDLE_LEN comparisons, by computing the
104259698Sdim   shorter of two ordered maximal suffixes.  The ordered maximal
105259698Sdim   suffixes are determined by lexicographic comparison while tracking
106259698Sdim   periodicity.  */
107259698Sdimstatic size_t
108259698Sdimcritical_factorization (const unsigned char *needle, size_t needle_len,
109259698Sdim                        size_t *period)
110259698Sdim{
111259698Sdim  /* Index of last byte of left half, or SIZE_MAX.  */
112259698Sdim  size_t max_suffix, max_suffix_rev;
113259698Sdim  size_t j; /* Index into NEEDLE for current candidate suffix.  */
114259698Sdim  size_t k; /* Offset into current period.  */
115259698Sdim  size_t p; /* Intermediate period.  */
116259698Sdim  unsigned char a, b; /* Current comparison bytes.  */
117259698Sdim
118259698Sdim  /* Special case NEEDLE_LEN of 1 or 2 (all callers already filtered
119259698Sdim     out 0-length needles.  */
120259698Sdim  if (needle_len < 3)
121259698Sdim    {
122259698Sdim      *period = 1;
123259698Sdim      return needle_len - 1;
124259698Sdim    }
125259698Sdim
126259698Sdim  /* Invariants:
127259698Sdim     0 <= j < NEEDLE_LEN - 1
128259698Sdim     -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed)
129259698Sdim     min(max_suffix, max_suffix_rev) < global period of NEEDLE
130259698Sdim     1 <= p <= global period of NEEDLE
131259698Sdim     p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j]
132259698Sdim     1 <= k <= p
133259698Sdim  */
134259698Sdim
135259698Sdim  /* Perform lexicographic search.  */
136259698Sdim  max_suffix = SIZE_MAX;
137259698Sdim  j = 0;
138259698Sdim  k = p = 1;
139259698Sdim  while (j + k < needle_len)
140259698Sdim    {
141259698Sdim      a = CANON_ELEMENT (needle[j + k]);
142259698Sdim      b = CANON_ELEMENT (needle[max_suffix + k]);
143259698Sdim      if (a < b)
144259698Sdim        {
145259698Sdim          /* Suffix is smaller, period is entire prefix so far.  */
146259698Sdim          j += k;
147259698Sdim          k = 1;
148259698Sdim          p = j - max_suffix;
149259698Sdim        }
150259698Sdim      else if (a == b)
151259698Sdim        {
152259698Sdim          /* Advance through repetition of the current period.  */
153259698Sdim          if (k != p)
154259698Sdim            ++k;
155259698Sdim          else
156259698Sdim            {
157259698Sdim              j += p;
158259698Sdim              k = 1;
159259698Sdim            }
160259698Sdim        }
161259698Sdim      else /* b < a */
162259698Sdim        {
163259698Sdim          /* Suffix is larger, start over from current location.  */
164259698Sdim          max_suffix = j++;
165259698Sdim          k = p = 1;
166259698Sdim        }
167259698Sdim    }
168259698Sdim  *period = p;
169259698Sdim
170259698Sdim  /* Perform reverse lexicographic search.  */
171259698Sdim  max_suffix_rev = SIZE_MAX;
172259698Sdim  j = 0;
173259698Sdim  k = p = 1;
174259698Sdim  while (j + k < needle_len)
175259698Sdim    {
176259698Sdim      a = CANON_ELEMENT (needle[j + k]);
177259698Sdim      b = CANON_ELEMENT (needle[max_suffix_rev + k]);
178259698Sdim      if (b < a)
179259698Sdim        {
180259698Sdim          /* Suffix is smaller, period is entire prefix so far.  */
181259698Sdim          j += k;
182259698Sdim          k = 1;
183259698Sdim          p = j - max_suffix_rev;
184259698Sdim        }
185259698Sdim      else if (a == b)
186259698Sdim        {
187259698Sdim          /* Advance through repetition of the current period.  */
188259698Sdim          if (k != p)
189259698Sdim            ++k;
190259698Sdim          else
191259698Sdim            {
192259698Sdim              j += p;
193259698Sdim              k = 1;
194259698Sdim            }
195259698Sdim        }
196259698Sdim      else /* a < b */
197259698Sdim        {
198259698Sdim          /* Suffix is larger, start over from current location.  */
199259698Sdim          max_suffix_rev = j++;
200259698Sdim          k = p = 1;
201259698Sdim        }
202259698Sdim    }
203259698Sdim
204259698Sdim  /* Choose the shorter suffix.  Return the index of the first byte of
205     the right half, rather than the last byte of the left half.
206
207     For some examples, 'banana' has two critical factorizations, both
208     exposed by the two lexicographic extreme suffixes of 'anana' and
209     'nana', where both suffixes have a period of 2.  On the other
210     hand, with 'aab' and 'bba', both strings have a single critical
211     factorization of the last byte, with the suffix having a period
212     of 1.  While the maximal lexicographic suffix of 'aab' is 'b',
213     the maximal lexicographic suffix of 'bba' is 'ba', which is not a
214     critical factorization.  Conversely, the maximal reverse
215     lexicographic suffix of 'a' works for 'bba', but not 'ab' for
216     'aab'.  The shorter suffix of the two will always be a critical
217     factorization.  */
218  if (max_suffix_rev + 1 < max_suffix + 1)
219    return max_suffix + 1;
220  *period = p;
221  return max_suffix_rev + 1;
222}
223
224/* Return the first location of non-empty NEEDLE within HAYSTACK, or
225   NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
226   method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD.
227   Performance is guaranteed to be linear, with an initialization cost
228   of 2 * NEEDLE_LEN comparisons.
229
230   If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
231   most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.
232   If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
233   HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.  */
234static RETURN_TYPE
235two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
236                      const unsigned char *needle, size_t needle_len)
237{
238  size_t i; /* Index into current byte of NEEDLE.  */
239  size_t j; /* Index into current window of HAYSTACK.  */
240  size_t period; /* The period of the right half of needle.  */
241  size_t suffix; /* The index of the right half of needle.  */
242
243  /* Factor the needle into two halves, such that the left half is
244     smaller than the global period, and the right half is
245     periodic (with a period as large as NEEDLE_LEN - suffix).  */
246  suffix = critical_factorization (needle, needle_len, &period);
247
248  /* Perform the search.  Each iteration compares the right half
249     first.  */
250  if (CMP_FUNC (needle, needle + period, suffix) == 0)
251    {
252      /* Entire needle is periodic; a mismatch in the left half can
253         only advance by the period, so use memory to avoid rescanning
254         known occurrences of the period in the right half.  */
255      size_t memory = 0;
256      j = 0;
257      while (AVAILABLE (haystack, haystack_len, j, needle_len))
258        {
259          /* Scan for matches in right half.  */
260          i = MAX (suffix, memory);
261          while (i < needle_len && (CANON_ELEMENT (needle[i])
262                                    == CANON_ELEMENT (haystack[i + j])))
263            ++i;
264          if (needle_len <= i)
265            {
266              /* Scan for matches in left half.  */
267              i = suffix - 1;
268              while (memory < i + 1 && (CANON_ELEMENT (needle[i])
269                                        == CANON_ELEMENT (haystack[i + j])))
270                --i;
271              if (i + 1 < memory + 1)
272                return (RETURN_TYPE) (haystack + j);
273              /* No match, so remember how many repetitions of period
274                 on the right half were scanned.  */
275              j += period;
276              memory = needle_len - period;
277            }
278          else
279            {
280              j += i - suffix + 1;
281              memory = 0;
282            }
283        }
284    }
285  else
286    {
287      /* The two halves of needle are distinct; no extra memory is
288         required, and any mismatch results in a maximal shift.  */
289      period = MAX (suffix, needle_len - suffix) + 1;
290      j = 0;
291      while (AVAILABLE (haystack, haystack_len, j, needle_len))
292        {
293          /* Scan for matches in right half.  */
294          i = suffix;
295          while (i < needle_len && (CANON_ELEMENT (needle[i])
296                                    == CANON_ELEMENT (haystack[i + j])))
297            ++i;
298          if (needle_len <= i)
299            {
300              /* Scan for matches in left half.  */
301              i = suffix - 1;
302              while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
303                                       == CANON_ELEMENT (haystack[i + j])))
304                --i;
305              if (i == SIZE_MAX)
306                return (RETURN_TYPE) (haystack + j);
307              j += period;
308            }
309          else
310            j += i - suffix + 1;
311        }
312    }
313  return NULL;
314}
315
316/* Return the first location of non-empty NEEDLE within HAYSTACK, or
317   NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
318   method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN.
319   Performance is guaranteed to be linear, with an initialization cost
320   of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations.
321
322   If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
323   most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching,
324   and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible.
325   If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
326   HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
327   sublinear performance is not possible.  */
328static RETURN_TYPE
329two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
330                     const unsigned char *needle, size_t needle_len)
331{
332  size_t i; /* Index into current byte of NEEDLE.  */
333  size_t j; /* Index into current window of HAYSTACK.  */
334  size_t period; /* The period of the right half of needle.  */
335  size_t suffix; /* The index of the right half of needle.  */
336  size_t shift_table[1U << CHAR_BIT]; /* See below.  */
337
338  /* Factor the needle into two halves, such that the left half is
339     smaller than the global period, and the right half is
340     periodic (with a period as large as NEEDLE_LEN - suffix).  */
341  suffix = critical_factorization (needle, needle_len, &period);
342
343  /* Populate shift_table.  For each possible byte value c,
344     shift_table[c] is the distance from the last occurrence of c to
345     the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE.
346     shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0.  */
347  for (i = 0; i < 1U << CHAR_BIT; i++)
348    shift_table[i] = needle_len;
349  for (i = 0; i < needle_len; i++)
350    shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1;
351
352  /* Perform the search.  Each iteration compares the right half
353     first.  */
354  if (CMP_FUNC (needle, needle + period, suffix) == 0)
355    {
356      /* Entire needle is periodic; a mismatch in the left half can
357         only advance by the period, so use memory to avoid rescanning
358         known occurrences of the period in the right half.  */
359      size_t memory = 0;
360      size_t shift;
361      j = 0;
362      while (AVAILABLE (haystack, haystack_len, j, needle_len))
363        {
364          /* Check the last byte first; if it does not match, then
365             shift to the next possible match location.  */
366          shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
367          if (0 < shift)
368            {
369              if (memory && shift < period)
370                {
371                  /* Since needle is periodic, but the last period has
372                     a byte out of place, there can be no match until
373                     after the mismatch.  */
374                  shift = needle_len - period;
375                }
376              memory = 0;
377              j += shift;
378              continue;
379            }
380          /* Scan for matches in right half.  The last byte has
381             already been matched, by virtue of the shift table.  */
382          i = MAX (suffix, memory);
383          while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
384                                        == CANON_ELEMENT (haystack[i + j])))
385            ++i;
386          if (needle_len - 1 <= i)
387            {
388              /* Scan for matches in left half.  */
389              i = suffix - 1;
390              while (memory < i + 1 && (CANON_ELEMENT (needle[i])
391                                        == CANON_ELEMENT (haystack[i + j])))
392                --i;
393              if (i + 1 < memory + 1)
394                return (RETURN_TYPE) (haystack + j);
395              /* No match, so remember how many repetitions of period
396                 on the right half were scanned.  */
397              j += period;
398              memory = needle_len - period;
399            }
400          else
401            {
402              j += i - suffix + 1;
403              memory = 0;
404            }
405        }
406    }
407  else
408    {
409      /* The two halves of needle are distinct; no extra memory is
410         required, and any mismatch results in a maximal shift.  */
411      size_t shift;
412      period = MAX (suffix, needle_len - suffix) + 1;
413      j = 0;
414      while (AVAILABLE (haystack, haystack_len, j, needle_len))
415        {
416          /* Check the last byte first; if it does not match, then
417             shift to the next possible match location.  */
418          shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
419          if (0 < shift)
420            {
421              j += shift;
422              continue;
423            }
424          /* Scan for matches in right half.  The last byte has
425             already been matched, by virtue of the shift table.  */
426          i = suffix;
427          while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
428                                        == CANON_ELEMENT (haystack[i + j])))
429            ++i;
430          if (needle_len - 1 <= i)
431            {
432              /* Scan for matches in left half.  */
433              i = suffix - 1;
434              while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
435                                       == CANON_ELEMENT (haystack[i + j])))
436                --i;
437              if (i == SIZE_MAX)
438                return (RETURN_TYPE) (haystack + j);
439              j += period;
440            }
441          else
442            j += i - suffix + 1;
443        }
444    }
445  return NULL;
446}
447
448#undef AVAILABLE
449#undef CANON_ELEMENT
450#undef CMP_FUNC
451#undef MAX
452#undef RETURN_TYPE
453