1/* c-strstr.c -- substring search in C locale 2 Copyright (C) 2005-2007 Free Software Foundation, Inc. 3 Written by Bruno Haible <bruno@clisp.org>, 2005, 2007. 4 5 This program is free software: you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 3 of the License, or 8 (at your option) any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 17 18#include <config.h> 19 20/* Specification. */ 21#include "c-strstr.h" 22 23#include <stdbool.h> 24#include <stdlib.h> 25#include <string.h> 26 27#include "malloca.h" 28 29/* Knuth-Morris-Pratt algorithm. 30 See http://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm 31 Return a boolean indicating success. */ 32static bool 33knuth_morris_pratt (const char *haystack, const char *needle, 34 const char **resultp) 35{ 36 size_t m = strlen (needle); 37 38 /* Allocate the table. */ 39 size_t *table = (size_t *) malloca (m * sizeof (size_t)); 40 if (table == NULL) 41 return false; 42 /* Fill the table. 43 For 0 < i < m: 44 0 < table[i] <= i is defined such that 45 rhaystack[0..i-1] == needle[0..i-1] and rhaystack[i] != needle[i] 46 implies 47 forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1], 48 and table[i] is as large as possible with this property. 49 table[0] remains uninitialized. */ 50 { 51 size_t i, j; 52 53 table[1] = 1; 54 j = 0; 55 for (i = 2; i < m; i++) 56 { 57 unsigned char b = (unsigned char) needle[i - 1]; 58 59 for (;;) 60 { 61 if (b == (unsigned char) needle[j]) 62 { 63 table[i] = i - ++j; 64 break; 65 } 66 if (j == 0) 67 { 68 table[i] = i; 69 break; 70 } 71 j = j - table[j]; 72 } 73 } 74 } 75 76 /* Search, using the table to accelerate the processing. */ 77 { 78 size_t j; 79 const char *rhaystack; 80 const char *phaystack; 81 82 *resultp = NULL; 83 j = 0; 84 rhaystack = haystack; 85 phaystack = haystack; 86 /* Invariant: phaystack = rhaystack + j. */ 87 while (*phaystack != '\0') 88 if ((unsigned char) needle[j] == (unsigned char) *phaystack) 89 { 90 j++; 91 phaystack++; 92 if (j == m) 93 { 94 /* The entire needle has been found. */ 95 *resultp = rhaystack; 96 break; 97 } 98 } 99 else if (j > 0) 100 { 101 /* Found a match of needle[0..j-1], mismatch at needle[j]. */ 102 rhaystack += table[j]; 103 j -= table[j]; 104 } 105 else 106 { 107 /* Found a mismatch at needle[0] already. */ 108 rhaystack++; 109 phaystack++; 110 } 111 } 112 113 freea (table); 114 return true; 115} 116 117/* Find the first occurrence of NEEDLE in HAYSTACK. */ 118char * 119c_strstr (const char *haystack, const char *needle) 120{ 121 /* Be careful not to look at the entire extent of haystack or needle 122 until needed. This is useful because of these two cases: 123 - haystack may be very long, and a match of needle found early, 124 - needle may be very long, and not even a short initial segment of 125 needle may be found in haystack. */ 126 if (*needle != '\0') 127 { 128 /* Minimizing the worst-case complexity: 129 Let n = strlen(haystack), m = strlen(needle). 130 The na��ve algorithm is O(n*m) worst-case. 131 The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a 132 memory allocation. 133 To achieve linear complexity and yet amortize the cost of the memory 134 allocation, we activate the Knuth-Morris-Pratt algorithm only once 135 the na��ve algorithm has already run for some time; more precisely, 136 when 137 - the outer loop count is >= 10, 138 - the average number of comparisons per outer loop is >= 5, 139 - the total number of comparisons is >= m. 140 But we try it only once. If the memory allocation attempt failed, 141 we don't retry it. */ 142 bool try_kmp = true; 143 size_t outer_loop_count = 0; 144 size_t comparison_count = 0; 145 size_t last_ccount = 0; /* last comparison count */ 146 const char *needle_last_ccount = needle; /* = needle + last_ccount */ 147 148 /* Speed up the following searches of needle by caching its first 149 character. */ 150 unsigned char b = (unsigned char) *needle; 151 152 needle++; 153 for (;; haystack++) 154 { 155 if (*haystack == '\0') 156 /* No match. */ 157 return NULL; 158 159 /* See whether it's advisable to use an asymptotically faster 160 algorithm. */ 161 if (try_kmp 162 && outer_loop_count >= 10 163 && comparison_count >= 5 * outer_loop_count) 164 { 165 /* See if needle + comparison_count now reaches the end of 166 needle. */ 167 if (needle_last_ccount != NULL) 168 { 169 needle_last_ccount += 170 strnlen (needle_last_ccount, comparison_count - last_ccount); 171 if (*needle_last_ccount == '\0') 172 needle_last_ccount = NULL; 173 last_ccount = comparison_count; 174 } 175 if (needle_last_ccount == NULL) 176 { 177 /* Try the Knuth-Morris-Pratt algorithm. */ 178 const char *result; 179 bool success = 180 knuth_morris_pratt (haystack, needle - 1, &result); 181 if (success) 182 return (char *) result; 183 try_kmp = false; 184 } 185 } 186 187 outer_loop_count++; 188 comparison_count++; 189 if ((unsigned char) *haystack == b) 190 /* The first character matches. */ 191 { 192 const char *rhaystack = haystack + 1; 193 const char *rneedle = needle; 194 195 for (;; rhaystack++, rneedle++) 196 { 197 if (*rneedle == '\0') 198 /* Found a match. */ 199 return (char *) haystack; 200 if (*rhaystack == '\0') 201 /* No match. */ 202 return NULL; 203 comparison_count++; 204 if ((unsigned char) *rhaystack != (unsigned char) *rneedle) 205 /* Nothing in this round. */ 206 break; 207 } 208 } 209 } 210 } 211 else 212 return (char *) haystack; 213} 214