1/* c-strcasestr.c -- case insensitive substring search in C locale 2 Copyright (C) 2005-2007 Free Software Foundation, Inc. 3 Written by Bruno Haible <bruno@clisp.org>, 2005. 4 5 This program is free software: you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 3 of the License, or 8 (at your option) any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 17 18#include <config.h> 19 20/* Specification. */ 21#include "c-strcasestr.h" 22 23#include <stdbool.h> 24#include <stddef.h> 25#include <string.h> 26 27#include "malloca.h" 28#include "c-ctype.h" 29 30/* Knuth-Morris-Pratt algorithm. 31 See http://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm 32 Return a boolean indicating success. */ 33static bool 34knuth_morris_pratt (const char *haystack, const char *needle, 35 const char **resultp) 36{ 37 size_t m = strlen (needle); 38 39 /* Allocate the table. */ 40 size_t *table = (size_t *) malloca (m * sizeof (size_t)); 41 if (table == NULL) 42 return false; 43 /* Fill the table. 44 For 0 < i < m: 45 0 < table[i] <= i is defined such that 46 rhaystack[0..i-1] == needle[0..i-1] and rhaystack[i] != needle[i] 47 implies 48 forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1], 49 and table[i] is as large as possible with this property. 50 table[0] remains uninitialized. */ 51 { 52 size_t i, j; 53 54 table[1] = 1; 55 j = 0; 56 for (i = 2; i < m; i++) 57 { 58 unsigned char b = c_tolower ((unsigned char) needle[i - 1]); 59 60 for (;;) 61 { 62 if (b == c_tolower ((unsigned char) needle[j])) 63 { 64 table[i] = i - ++j; 65 break; 66 } 67 if (j == 0) 68 { 69 table[i] = i; 70 break; 71 } 72 j = j - table[j]; 73 } 74 } 75 } 76 77 /* Search, using the table to accelerate the processing. */ 78 { 79 size_t j; 80 const char *rhaystack; 81 const char *phaystack; 82 83 *resultp = NULL; 84 j = 0; 85 rhaystack = haystack; 86 phaystack = haystack; 87 /* Invariant: phaystack = rhaystack + j. */ 88 while (*phaystack != '\0') 89 if (c_tolower ((unsigned char) needle[j]) 90 == c_tolower ((unsigned char) *phaystack)) 91 { 92 j++; 93 phaystack++; 94 if (j == m) 95 { 96 /* The entire needle has been found. */ 97 *resultp = rhaystack; 98 break; 99 } 100 } 101 else if (j > 0) 102 { 103 /* Found a match of needle[0..j-1], mismatch at needle[j]. */ 104 rhaystack += table[j]; 105 j -= table[j]; 106 } 107 else 108 { 109 /* Found a mismatch at needle[0] already. */ 110 rhaystack++; 111 phaystack++; 112 } 113 } 114 115 freea (table); 116 return true; 117} 118 119/* Find the first occurrence of NEEDLE in HAYSTACK, using case-insensitive 120 comparison. 121 Note: This function may, in multibyte locales, return success even if 122 strlen (haystack) < strlen (needle) ! */ 123char * 124c_strcasestr (const char *haystack, const char *needle) 125{ 126 /* Be careful not to look at the entire extent of haystack or needle 127 until needed. This is useful because of these two cases: 128 - haystack may be very long, and a match of needle found early, 129 - needle may be very long, and not even a short initial segment of 130 needle may be found in haystack. */ 131 if (*needle != '\0') 132 { 133 /* Minimizing the worst-case complexity: 134 Let n = strlen(haystack), m = strlen(needle). 135 The na��ve algorithm is O(n*m) worst-case. 136 The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a 137 memory allocation. 138 To achieve linear complexity and yet amortize the cost of the memory 139 allocation, we activate the Knuth-Morris-Pratt algorithm only once 140 the na��ve algorithm has already run for some time; more precisely, 141 when 142 - the outer loop count is >= 10, 143 - the average number of comparisons per outer loop is >= 5, 144 - the total number of comparisons is >= m. 145 But we try it only once. If the memory allocation attempt failed, 146 we don't retry it. */ 147 bool try_kmp = true; 148 size_t outer_loop_count = 0; 149 size_t comparison_count = 0; 150 size_t last_ccount = 0; /* last comparison count */ 151 const char *needle_last_ccount = needle; /* = needle + last_ccount */ 152 153 /* Speed up the following searches of needle by caching its first 154 character. */ 155 unsigned char b = c_tolower ((unsigned char) *needle); 156 157 needle++; 158 for (;; haystack++) 159 { 160 if (*haystack == '\0') 161 /* No match. */ 162 return NULL; 163 164 /* See whether it's advisable to use an asymptotically faster 165 algorithm. */ 166 if (try_kmp 167 && outer_loop_count >= 10 168 && comparison_count >= 5 * outer_loop_count) 169 { 170 /* See if needle + comparison_count now reaches the end of 171 needle. */ 172 if (needle_last_ccount != NULL) 173 { 174 needle_last_ccount += 175 strnlen (needle_last_ccount, comparison_count - last_ccount); 176 if (*needle_last_ccount == '\0') 177 needle_last_ccount = NULL; 178 last_ccount = comparison_count; 179 } 180 if (needle_last_ccount == NULL) 181 { 182 /* Try the Knuth-Morris-Pratt algorithm. */ 183 const char *result; 184 bool success = 185 knuth_morris_pratt (haystack, needle - 1, &result); 186 if (success) 187 return (char *) result; 188 try_kmp = false; 189 } 190 } 191 192 outer_loop_count++; 193 comparison_count++; 194 if (c_tolower ((unsigned char) *haystack) == b) 195 /* The first character matches. */ 196 { 197 const char *rhaystack = haystack + 1; 198 const char *rneedle = needle; 199 200 for (;; rhaystack++, rneedle++) 201 { 202 if (*rneedle == '\0') 203 /* Found a match. */ 204 return (char *) haystack; 205 if (*rhaystack == '\0') 206 /* No match. */ 207 return NULL; 208 comparison_count++; 209 if (c_tolower ((unsigned char) *rhaystack) 210 != c_tolower ((unsigned char) *rneedle)) 211 /* Nothing in this round. */ 212 break; 213 } 214 } 215 } 216 } 217 else 218 return (char *) haystack; 219} 220