1/* $NetBSD$ */ 2 3/* search.c -- searching large bodies of text. 4 Id: search.c,v 1.3 2004/04/11 17:56:46 karl Exp 5 6 Copyright (C) 1993, 1997, 1998, 2002, 2004 Free Software Foundation, Inc. 7 8 This program is free software; you can redistribute it and/or modify 9 it under the terms of the GNU General Public License as published by 10 the Free Software Foundation; either version 2, or (at your option) 11 any later version. 12 13 This program is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with this program; if not, write to the Free Software 20 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 21 22 Written by Brian Fox (bfox@ai.mit.edu). */ 23 24#include "info.h" 25 26#include "search.h" 27#include "nodes.h" 28 29/* The search functions take two arguments: 30 31 1) a string to search for, and 32 33 2) a pointer to a SEARCH_BINDING which contains the buffer, start, 34 and end of the search. 35 36 They return a long, which is the offset from the start of the buffer 37 at which the match was found. An offset of -1 indicates failure. */ 38 39/* A function which makes a binding with buffer and bounds. */ 40SEARCH_BINDING * 41make_binding (char *buffer, long int start, long int end) 42{ 43 SEARCH_BINDING *binding; 44 45 binding = (SEARCH_BINDING *)xmalloc (sizeof (SEARCH_BINDING)); 46 binding->buffer = buffer; 47 binding->start = start; 48 binding->end = end; 49 binding->flags = 0; 50 51 return (binding); 52} 53 54/* Make a copy of BINDING without duplicating the data. */ 55SEARCH_BINDING * 56copy_binding (SEARCH_BINDING *binding) 57{ 58 SEARCH_BINDING *copy; 59 60 copy = make_binding (binding->buffer, binding->start, binding->end); 61 copy->flags = binding->flags; 62 return (copy); 63} 64 65 66/* **************************************************************** */ 67/* */ 68/* The Actual Searching Functions */ 69/* */ 70/* **************************************************************** */ 71 72/* Search forwards or backwards for the text delimited by BINDING. 73 The search is forwards if BINDING->start is greater than BINDING->end. */ 74long 75search (char *string, SEARCH_BINDING *binding) 76{ 77 long result; 78 79 /* If the search is backwards, then search backwards, otherwise forwards. */ 80 if (binding->start > binding->end) 81 result = search_backward (string, binding); 82 else 83 result = search_forward (string, binding); 84 85 return (result); 86} 87 88/* Search forwards for STRING through the text delimited in BINDING. */ 89long 90search_forward (char *string, SEARCH_BINDING *binding) 91{ 92 register int c, i, len; 93 register char *buff, *end; 94 char *alternate = (char *)NULL; 95 96 len = strlen (string); 97 98 /* We match characters in the search buffer against STRING and ALTERNATE. 99 ALTERNATE is a case reversed version of STRING; this is cheaper than 100 case folding each character before comparison. Alternate is only 101 used if the case folding bit is turned on in the passed BINDING. */ 102 103 if (binding->flags & S_FoldCase) 104 { 105 alternate = xstrdup (string); 106 107 for (i = 0; i < len; i++) 108 { 109 if (islower (alternate[i])) 110 alternate[i] = toupper (alternate[i]); 111 else if (isupper (alternate[i])) 112 alternate[i] = tolower (alternate[i]); 113 } 114 } 115 116 buff = binding->buffer + binding->start; 117 end = binding->buffer + binding->end + 1; 118 119 while (buff < (end - len)) 120 { 121 for (i = 0; i < len; i++) 122 { 123 c = buff[i]; 124 125 if ((c != string[i]) && (!alternate || c != alternate[i])) 126 break; 127 } 128 129 if (!string[i]) 130 { 131 if (alternate) 132 free (alternate); 133 if (binding->flags & S_SkipDest) 134 buff += len; 135 return ((long) (buff - binding->buffer)); 136 } 137 138 buff++; 139 } 140 141 if (alternate) 142 free (alternate); 143 144 return ((long) -1); 145} 146 147/* Search for STRING backwards through the text delimited in BINDING. */ 148long 149search_backward (char *input_string, SEARCH_BINDING *binding) 150{ 151 register int c, i, len; 152 register char *buff, *end; 153 char *string; 154 char *alternate = (char *)NULL; 155 156 len = strlen (input_string); 157 158 /* Reverse the characters in the search string. */ 159 string = (char *)xmalloc (1 + len); 160 for (c = 0, i = len - 1; input_string[c]; c++, i--) 161 string[i] = input_string[c]; 162 163 string[c] = '\0'; 164 165 /* We match characters in the search buffer against STRING and ALTERNATE. 166 ALTERNATE is a case reversed version of STRING; this is cheaper than 167 case folding each character before comparison. ALTERNATE is only 168 used if the case folding bit is turned on in the passed BINDING. */ 169 170 if (binding->flags & S_FoldCase) 171 { 172 alternate = xstrdup (string); 173 174 for (i = 0; i < len; i++) 175 { 176 if (islower (alternate[i])) 177 alternate[i] = toupper (alternate[i]); 178 else if (isupper (alternate[i])) 179 alternate[i] = tolower (alternate[i]); 180 } 181 } 182 183 buff = binding->buffer + binding->start - 1; 184 end = binding->buffer + binding->end; 185 186 while (buff > (end + len)) 187 { 188 for (i = 0; i < len; i++) 189 { 190 c = *(buff - i); 191 192 if (c != string[i] && (!alternate || c != alternate[i])) 193 break; 194 } 195 196 if (!string[i]) 197 { 198 free (string); 199 if (alternate) 200 free (alternate); 201 202 if (binding->flags & S_SkipDest) 203 buff -= len; 204 return ((long) (1 + (buff - binding->buffer))); 205 } 206 207 buff--; 208 } 209 210 free (string); 211 if (alternate) 212 free (alternate); 213 214 return ((long) -1); 215} 216 217/* Find STRING in LINE, returning the offset of the end of the string. 218 Return an offset of -1 if STRING does not appear in LINE. The search 219 is bound by the end of the line (i.e., either NEWLINE or 0). */ 220int 221string_in_line (char *string, char *line) 222{ 223 register int end; 224 SEARCH_BINDING binding; 225 226 /* Find the end of the line. */ 227 for (end = 0; line[end] && line[end] != '\n'; end++); 228 229 /* Search for STRING within these confines. */ 230 binding.buffer = line; 231 binding.start = 0; 232 binding.end = end; 233 binding.flags = S_FoldCase | S_SkipDest; 234 235 return (search_forward (string, &binding)); 236} 237 238/* Return non-zero if STRING is the first text to appear at BINDING. */ 239int 240looking_at (char *string, SEARCH_BINDING *binding) 241{ 242 long search_end; 243 244 search_end = search (string, binding); 245 246 /* If the string was not found, SEARCH_END is -1. If the string was found, 247 but not right away, SEARCH_END is != binding->start. Otherwise, the 248 string was found at binding->start. */ 249 return (search_end == binding->start); 250} 251 252/* **************************************************************** */ 253/* */ 254/* Small String Searches */ 255/* */ 256/* **************************************************************** */ 257 258/* Function names that start with "skip" are passed a string, and return 259 an offset from the start of that string. Function names that start 260 with "find" are passed a SEARCH_BINDING, and return an absolute position 261 marker of the item being searched for. "Find" functions return a value 262 of -1 if the item being looked for couldn't be found. */ 263 264/* Return the index of the first non-whitespace character in STRING. */ 265int 266skip_whitespace (char *string) 267{ 268 register int i; 269 270 for (i = 0; string && whitespace (string[i]); i++); 271 return (i); 272} 273 274/* Return the index of the first non-whitespace or newline character in 275 STRING. */ 276int 277skip_whitespace_and_newlines (char *string) 278{ 279 register int i; 280 281 for (i = 0; string && whitespace_or_newline (string[i]); i++); 282 return (i); 283} 284 285/* Return the index of the first whitespace character in STRING. */ 286int 287skip_non_whitespace (char *string) 288{ 289 register int i; 290 291 for (i = 0; string && string[i] && !whitespace (string[i]); i++); 292 return (i); 293} 294 295/* Return the index of the first non-node character in STRING. Note that 296 this function contains quite a bit of hair to ignore periods in some 297 special cases. This is because we here at GNU ship some info files which 298 contain nodenames that contain periods. No such nodename can start with 299 a period, or continue with whitespace, newline, or ')' immediately following 300 the period. If second argument NEWLINES_OKAY is non-zero, newlines should 301 be skipped while parsing out the nodename specification. */ 302int 303skip_node_characters (char *string, int newlines_okay) 304{ 305 register int c, i = 0; 306 int paren_seen = 0; 307 int paren = 0; 308 309 /* Handle special case. This is when another function has parsed out the 310 filename component of the node name, and we just want to parse out the 311 nodename proper. In that case, a period at the start of the nodename 312 indicates an empty nodename. */ 313 if (string && *string == '.') 314 return (0); 315 316 if (string && *string == '(') 317 { 318 paren++; 319 paren_seen++; 320 i++; 321 } 322 323 for (; string && (c = string[i]); i++) 324 { 325 if (paren) 326 { 327 if (c == '(') 328 paren++; 329 else if (c == ')') 330 paren--; 331 332 continue; 333 } 334 335 /* If the character following the close paren is a space or period, 336 then this node name has no more characters associated with it. */ 337 if (c == '\t' || 338 c == ',' || 339 c == INFO_TAGSEP || 340 ((!newlines_okay) && (c == '\n')) || 341 ((paren_seen && string[i - 1] == ')') && 342 (c == ' ' || c == '.')) || 343 (c == '.' && 344 ( 345#if 0 346/* This test causes a node name ending in a period, like `This.', not to 347 be found. The trailing . is stripped. This occurs in the jargon 348 file (`I see no X here.' is a node name). */ 349 (!string[i + 1]) || 350#endif 351 (whitespace_or_newline (string[i + 1])) || 352 (string[i + 1] == ')')))) 353 break; 354 } 355 return (i); 356} 357 358 359/* **************************************************************** */ 360/* */ 361/* Searching FILE_BUFFER's */ 362/* */ 363/* **************************************************************** */ 364 365/* Return the absolute position of the first occurence of a node separator in 366 BINDING-buffer. The search starts at BINDING->start. Return -1 if no node 367 separator was found. */ 368long 369find_node_separator (SEARCH_BINDING *binding) 370{ 371 register long i; 372 char *body; 373 374 body = binding->buffer; 375 376 /* A node is started by [^L]^_[^L]\n. That is to say, the C-l's are 377 optional, but the DELETE and NEWLINE are not. This separator holds 378 true for all separated elements in an Info file, including the tags 379 table (if present) and the indirect tags table (if present). */ 380 for (i = binding->start; i < binding->end - 1; i++) 381 if (((body[i] == INFO_FF && body[i + 1] == INFO_COOKIE) && 382 (body[i + 2] == '\n' || 383 (body[i + 2] == INFO_FF && body[i + 3] == '\n'))) || 384 ((body[i] == INFO_COOKIE) && 385 (body[i + 1] == '\n' || 386 (body[i + 1] == INFO_FF && body[i + 2] == '\n')))) 387 return (i); 388 return (-1); 389} 390 391/* Return the length of the node separator characters that BODY is 392 currently pointing at. */ 393int 394skip_node_separator (char *body) 395{ 396 register int i; 397 398 i = 0; 399 400 if (body[i] == INFO_FF) 401 i++; 402 403 if (body[i++] != INFO_COOKIE) 404 return (0); 405 406 if (body[i] == INFO_FF) 407 i++; 408 409 if (body[i++] != '\n') 410 return (0); 411 412 return (i); 413} 414 415/* Return the number of characters from STRING to the start of 416 the next line. */ 417int 418skip_line (char *string) 419{ 420 register int i; 421 422 for (i = 0; string && string[i] && string[i] != '\n'; i++); 423 424 if (string[i] == '\n') 425 i++; 426 427 return (i); 428} 429 430/* Return the absolute position of the beginning of a tags table in this 431 binding starting the search at binding->start. */ 432long 433find_tags_table (SEARCH_BINDING *binding) 434{ 435 SEARCH_BINDING tmp_search; 436 long position; 437 438 tmp_search.buffer = binding->buffer; 439 tmp_search.start = binding->start; 440 tmp_search.end = binding->end; 441 tmp_search.flags = S_FoldCase; 442 443 while ((position = find_node_separator (&tmp_search)) != -1 ) 444 { 445 tmp_search.start = position; 446 tmp_search.start += skip_node_separator (tmp_search.buffer 447 + tmp_search.start); 448 449 if (looking_at (TAGS_TABLE_BEG_LABEL, &tmp_search)) 450 return (position); 451 } 452 return (-1); 453} 454 455/* Return the absolute position of the node named NODENAME in BINDING. 456 This is a brute force search, and we wish to avoid it when possible. 457 This function is called when a tag (indirect or otherwise) doesn't 458 really point to the right node. It returns the absolute position of 459 the separator preceding the node. */ 460long 461find_node_in_binding (char *nodename, SEARCH_BINDING *binding) 462{ 463 long position; 464 int offset, namelen; 465 SEARCH_BINDING tmp_search; 466 467 namelen = strlen (nodename); 468 469 tmp_search.buffer = binding->buffer; 470 tmp_search.start = binding->start; 471 tmp_search.end = binding->end; 472 tmp_search.flags = 0; 473 474 while ((position = find_node_separator (&tmp_search)) != -1) 475 { 476 tmp_search.start = position; 477 tmp_search.start += skip_node_separator 478 (tmp_search.buffer + tmp_search.start); 479 480 offset = string_in_line 481 (INFO_NODE_LABEL, tmp_search.buffer + tmp_search.start); 482 483 if (offset == -1) 484 continue; 485 486 tmp_search.start += offset; 487 tmp_search.start += skip_whitespace (tmp_search.buffer + tmp_search.start); 488 offset = skip_node_characters 489 (tmp_search.buffer + tmp_search.start, DONT_SKIP_NEWLINES); 490 491 /* Notice that this is an exact match. You cannot grovel through 492 the buffer with this function looking for random nodes. */ 493 if ((offset == namelen) && 494 (tmp_search.buffer[tmp_search.start] == nodename[0]) && 495 (strncmp (tmp_search.buffer + tmp_search.start, nodename, offset) == 0)) 496 return (position); 497 } 498 return (-1); 499} 500