1/* Byte-wise substring search, using the Two-Way algorithm.
2   Copyright (C) 2008-2014 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4   Written by Eric Blake <ebb9@byu.net>, 2008.
5
6   This program is free software; you can redistribute it and/or modify
7   it under the terms of the GNU General Public License as published by
8   the Free Software Foundation; either version 3, or (at your option)
9   any later version.
10
11   This program is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14   GNU General Public License for more details.
15
16   You should have received a copy of the GNU General Public License along
17   with this program; if not, see <http://www.gnu.org/licenses/>.  */
18
19/* Before including this file, you need to include <config.h> and
20   <string.h>, and define:
21     RESULT_TYPE             A macro that expands to the return type.
22     AVAILABLE(h, h_l, j, n_l)
23                             A macro that returns nonzero if there are
24                             at least N_L bytes left starting at H[J].
25                             H is 'unsigned char *', H_L, J, and N_L
26                             are 'size_t'; H_L is an lvalue.  For
27                             NUL-terminated searches, H_L can be
28                             modified each iteration to avoid having
29                             to compute the end of H up front.
30
31  For case-insensitivity, you may optionally define:
32     CMP_FUNC(p1, p2, l)     A macro that returns 0 iff the first L
33                             characters of P1 and P2 are equal.
34     CANON_ELEMENT(c)        A macro that canonicalizes an element right after
35                             it has been fetched from one of the two strings.
36                             The argument is an 'unsigned char'; the result
37                             must be an 'unsigned char' as well.
38
39  This file undefines the macros documented above, and defines
40  LONG_NEEDLE_THRESHOLD.
41*/
42
43#include <limits.h>
44#include <stdint.h>
45
46/* We use the Two-Way string matching algorithm (also known as
47   Chrochemore-Perrin), which guarantees linear complexity with
48   constant space.  Additionally, for long needles, we also use a bad
49   character shift table similar to the Boyer-Moore algorithm to
50   achieve improved (potentially sub-linear) performance.
51
52   See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260,
53   http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm,
54   http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.34.6641&rep=rep1&type=pdf
55*/
56
57/* Point at which computing a bad-byte shift table is likely to be
58   worthwhile.  Small needles should not compute a table, since it
59   adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a
60   speedup no greater than a factor of NEEDLE_LEN.  The larger the
61   needle, the better the potential performance gain.  On the other
62   hand, on non-POSIX systems with CHAR_BIT larger than eight, the
63   memory required for the table is prohibitive.  */
64#if CHAR_BIT < 10
65# define LONG_NEEDLE_THRESHOLD 32U
66#else
67# define LONG_NEEDLE_THRESHOLD SIZE_MAX
68#endif
69
70#ifndef MAX
71# define MAX(a, b) ((a < b) ? (b) : (a))
72#endif
73
74#ifndef CANON_ELEMENT
75# define CANON_ELEMENT(c) c
76#endif
77#ifndef CMP_FUNC
78# define CMP_FUNC memcmp
79#endif
80
81/* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN.
82   Return the index of the first byte in the right half, and set
83   *PERIOD to the global period of the right half.
84
85   The global period of a string is the smallest index (possibly its
86   length) at which all remaining bytes in the string are repetitions
87   of the prefix (the last repetition may be a subset of the prefix).
88
89   When NEEDLE is factored into two halves, a local period is the
90   length of the smallest word that shares a suffix with the left half
91   and shares a prefix with the right half.  All factorizations of a
92   non-empty NEEDLE have a local period of at least 1 and no greater
93   than NEEDLE_LEN.
94
95   A critical factorization has the property that the local period
96   equals the global period.  All strings have at least one critical
97   factorization with the left half smaller than the global period.
98   And while some strings have more than one critical factorization,
99   it is provable that with an ordered alphabet, at least one of the
100   critical factorizations corresponds to a maximal suffix.
101
102   Given an ordered alphabet, a critical factorization can be computed
103   in linear time, with 2 * NEEDLE_LEN comparisons, by computing the
104   shorter of two ordered maximal suffixes.  The ordered maximal
105   suffixes are determined by lexicographic comparison while tracking
106   periodicity.  */
107static size_t
108critical_factorization (const unsigned char *needle, size_t needle_len,
109                        size_t *period)
110{
111  /* Index of last byte of left half, or SIZE_MAX.  */
112  size_t max_suffix, max_suffix_rev;
113  size_t j; /* Index into NEEDLE for current candidate suffix.  */
114  size_t k; /* Offset into current period.  */
115  size_t p; /* Intermediate period.  */
116  unsigned char a, b; /* Current comparison bytes.  */
117
118  /* Special case NEEDLE_LEN of 1 or 2 (all callers already filtered
119     out 0-length needles.  */
120  if (needle_len < 3)
121    {
122      *period = 1;
123      return needle_len - 1;
124    }
125
126  /* Invariants:
127     0 <= j < NEEDLE_LEN - 1
128     -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed)
129     min(max_suffix, max_suffix_rev) < global period of NEEDLE
130     1 <= p <= global period of NEEDLE
131     p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j]
132     1 <= k <= p
133  */
134
135  /* Perform lexicographic search.  */
136  max_suffix = SIZE_MAX;
137  j = 0;
138  k = p = 1;
139  while (j + k < needle_len)
140    {
141      a = CANON_ELEMENT (needle[j + k]);
142      b = CANON_ELEMENT (needle[max_suffix + k]);
143      if (a < b)
144        {
145          /* Suffix is smaller, period is entire prefix so far.  */
146          j += k;
147          k = 1;
148          p = j - max_suffix;
149        }
150      else if (a == b)
151        {
152          /* Advance through repetition of the current period.  */
153          if (k != p)
154            ++k;
155          else
156            {
157              j += p;
158              k = 1;
159            }
160        }
161      else /* b < a */
162        {
163          /* Suffix is larger, start over from current location.  */
164          max_suffix = j++;
165          k = p = 1;
166        }
167    }
168  *period = p;
169
170  /* Perform reverse lexicographic search.  */
171  max_suffix_rev = SIZE_MAX;
172  j = 0;
173  k = p = 1;
174  while (j + k < needle_len)
175    {
176      a = CANON_ELEMENT (needle[j + k]);
177      b = CANON_ELEMENT (needle[max_suffix_rev + k]);
178      if (b < a)
179        {
180          /* Suffix is smaller, period is entire prefix so far.  */
181          j += k;
182          k = 1;
183          p = j - max_suffix_rev;
184        }
185      else if (a == b)
186        {
187          /* Advance through repetition of the current period.  */
188          if (k != p)
189            ++k;
190          else
191            {
192              j += p;
193              k = 1;
194            }
195        }
196      else /* a < b */
197        {
198          /* Suffix is larger, start over from current location.  */
199          max_suffix_rev = j++;
200          k = p = 1;
201        }
202    }
203
204  /* Choose the shorter suffix.  Return the index of the first byte of
205     the right half, rather than the last byte of the left half.
206
207     For some examples, 'banana' has two critical factorizations, both
208     exposed by the two lexicographic extreme suffixes of 'anana' and
209     'nana', where both suffixes have a period of 2.  On the other
210     hand, with 'aab' and 'bba', both strings have a single critical
211     factorization of the last byte, with the suffix having a period
212     of 1.  While the maximal lexicographic suffix of 'aab' is 'b',
213     the maximal lexicographic suffix of 'bba' is 'ba', which is not a
214     critical factorization.  Conversely, the maximal reverse
215     lexicographic suffix of 'a' works for 'bba', but not 'ab' for
216     'aab'.  The shorter suffix of the two will always be a critical
217     factorization.  */
218  if (max_suffix_rev + 1 < max_suffix + 1)
219    return max_suffix + 1;
220  *period = p;
221  return max_suffix_rev + 1;
222}
223
224/* Return the first location of non-empty NEEDLE within HAYSTACK, or
225   NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
226   method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD.
227   Performance is guaranteed to be linear, with an initialization cost
228   of 2 * NEEDLE_LEN comparisons.
229
230   If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
231   most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.
232   If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
233   HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.  */
234static RETURN_TYPE
235two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
236                      const unsigned char *needle, size_t needle_len)
237{
238  size_t i; /* Index into current byte of NEEDLE.  */
239  size_t j; /* Index into current window of HAYSTACK.  */
240  size_t period; /* The period of the right half of needle.  */
241  size_t suffix; /* The index of the right half of needle.  */
242
243  /* Factor the needle into two halves, such that the left half is
244     smaller than the global period, and the right half is
245     periodic (with a period as large as NEEDLE_LEN - suffix).  */
246  suffix = critical_factorization (needle, needle_len, &period);
247
248  /* Perform the search.  Each iteration compares the right half
249     first.  */
250  if (CMP_FUNC (needle, needle + period, suffix) == 0)
251    {
252      /* Entire needle is periodic; a mismatch in the left half can
253         only advance by the period, so use memory to avoid rescanning
254         known occurrences of the period in the right half.  */
255      size_t memory = 0;
256      j = 0;
257      while (AVAILABLE (haystack, haystack_len, j, needle_len))
258        {
259          /* Scan for matches in right half.  */
260          i = MAX (suffix, memory);
261          while (i < needle_len && (CANON_ELEMENT (needle[i])
262                                    == CANON_ELEMENT (haystack[i + j])))
263            ++i;
264          if (needle_len <= i)
265            {
266              /* Scan for matches in left half.  */
267              i = suffix - 1;
268              while (memory < i + 1 && (CANON_ELEMENT (needle[i])
269                                        == CANON_ELEMENT (haystack[i + j])))
270                --i;
271              if (i + 1 < memory + 1)
272                return (RETURN_TYPE) (haystack + j);
273              /* No match, so remember how many repetitions of period
274                 on the right half were scanned.  */
275              j += period;
276              memory = needle_len - period;
277            }
278          else
279            {
280              j += i - suffix + 1;
281              memory = 0;
282            }
283        }
284    }
285  else
286    {
287      /* The two halves of needle are distinct; no extra memory is
288         required, and any mismatch results in a maximal shift.  */
289      period = MAX (suffix, needle_len - suffix) + 1;
290      j = 0;
291      while (AVAILABLE (haystack, haystack_len, j, needle_len))
292        {
293          /* Scan for matches in right half.  */
294          i = suffix;
295          while (i < needle_len && (CANON_ELEMENT (needle[i])
296                                    == CANON_ELEMENT (haystack[i + j])))
297            ++i;
298          if (needle_len <= i)
299            {
300              /* Scan for matches in left half.  */
301              i = suffix - 1;
302              while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
303                                       == CANON_ELEMENT (haystack[i + j])))
304                --i;
305              if (i == SIZE_MAX)
306                return (RETURN_TYPE) (haystack + j);
307              j += period;
308            }
309          else
310            j += i - suffix + 1;
311        }
312    }
313  return NULL;
314}
315
316/* Return the first location of non-empty NEEDLE within HAYSTACK, or
317   NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
318   method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN.
319   Performance is guaranteed to be linear, with an initialization cost
320   of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations.
321
322   If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
323   most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching,
324   and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible.
325   If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
326   HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
327   sublinear performance is not possible.  */
328static RETURN_TYPE
329two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
330                     const unsigned char *needle, size_t needle_len)
331{
332  size_t i; /* Index into current byte of NEEDLE.  */
333  size_t j; /* Index into current window of HAYSTACK.  */
334  size_t period; /* The period of the right half of needle.  */
335  size_t suffix; /* The index of the right half of needle.  */
336  size_t shift_table[1U << CHAR_BIT]; /* See below.  */
337
338  /* Factor the needle into two halves, such that the left half is
339     smaller than the global period, and the right half is
340     periodic (with a period as large as NEEDLE_LEN - suffix).  */
341  suffix = critical_factorization (needle, needle_len, &period);
342
343  /* Populate shift_table.  For each possible byte value c,
344     shift_table[c] is the distance from the last occurrence of c to
345     the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE.
346     shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0.  */
347  for (i = 0; i < 1U << CHAR_BIT; i++)
348    shift_table[i] = needle_len;
349  for (i = 0; i < needle_len; i++)
350    shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1;
351
352  /* Perform the search.  Each iteration compares the right half
353     first.  */
354  if (CMP_FUNC (needle, needle + period, suffix) == 0)
355    {
356      /* Entire needle is periodic; a mismatch in the left half can
357         only advance by the period, so use memory to avoid rescanning
358         known occurrences of the period in the right half.  */
359      size_t memory = 0;
360      size_t shift;
361      j = 0;
362      while (AVAILABLE (haystack, haystack_len, j, needle_len))
363        {
364          /* Check the last byte first; if it does not match, then
365             shift to the next possible match location.  */
366          shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
367          if (0 < shift)
368            {
369              if (memory && shift < period)
370                {
371                  /* Since needle is periodic, but the last period has
372                     a byte out of place, there can be no match until
373                     after the mismatch.  */
374                  shift = needle_len - period;
375                }
376              memory = 0;
377              j += shift;
378              continue;
379            }
380          /* Scan for matches in right half.  The last byte has
381             already been matched, by virtue of the shift table.  */
382          i = MAX (suffix, memory);
383          while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
384                                        == CANON_ELEMENT (haystack[i + j])))
385            ++i;
386          if (needle_len - 1 <= i)
387            {
388              /* Scan for matches in left half.  */
389              i = suffix - 1;
390              while (memory < i + 1 && (CANON_ELEMENT (needle[i])
391                                        == CANON_ELEMENT (haystack[i + j])))
392                --i;
393              if (i + 1 < memory + 1)
394                return (RETURN_TYPE) (haystack + j);
395              /* No match, so remember how many repetitions of period
396                 on the right half were scanned.  */
397              j += period;
398              memory = needle_len - period;
399            }
400          else
401            {
402              j += i - suffix + 1;
403              memory = 0;
404            }
405        }
406    }
407  else
408    {
409      /* The two halves of needle are distinct; no extra memory is
410         required, and any mismatch results in a maximal shift.  */
411      size_t shift;
412      period = MAX (suffix, needle_len - suffix) + 1;
413      j = 0;
414      while (AVAILABLE (haystack, haystack_len, j, needle_len))
415        {
416          /* Check the last byte first; if it does not match, then
417             shift to the next possible match location.  */
418          shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
419          if (0 < shift)
420            {
421              j += shift;
422              continue;
423            }
424          /* Scan for matches in right half.  The last byte has
425             already been matched, by virtue of the shift table.  */
426          i = suffix;
427          while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
428                                        == CANON_ELEMENT (haystack[i + j])))
429            ++i;
430          if (needle_len - 1 <= i)
431            {
432              /* Scan for matches in left half.  */
433              i = suffix - 1;
434              while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
435                                       == CANON_ELEMENT (haystack[i + j])))
436                --i;
437              if (i == SIZE_MAX)
438                return (RETURN_TYPE) (haystack + j);
439              j += period;
440            }
441          else
442            j += i - suffix + 1;
443        }
444    }
445  return NULL;
446}
447
448#undef AVAILABLE
449#undef CANON_ELEMENT
450#undef CMP_FUNC
451#undef MAX
452#undef RETURN_TYPE
453