search.c revision 42660
1/* search.c -- How to search large bodies of text. */ 2 3/* This file is part of GNU Info, a program for reading online documentation 4 stored in Info format. 5 6 Copyright (C) 1993, 97 Free Software Foundation, Inc. 7 8 This program is free software; you can redistribute it and/or modify 9 it under the terms of the GNU General Public License as published by 10 the Free Software Foundation; either version 2, or (at your option) 11 any later version. 12 13 This program is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with this program; if not, write to the Free Software 20 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 21 22 Written by Brian Fox (bfox@ai.mit.edu). */ 23 24#include "info.h" 25 26#include "search.h" 27#include "nodes.h" 28 29/* The search functions take two arguments: 30 31 1) a string to search for, and 32 33 2) a pointer to a SEARCH_BINDING which contains the buffer, start, 34 and end of the search. 35 36 They return a long, which is the offset from the start of the buffer 37 at which the match was found. An offset of -1 indicates failure. */ 38 39/* A function which makes a binding with buffer and bounds. */ 40SEARCH_BINDING * 41make_binding (buffer, start, end) 42 char *buffer; 43 long start, end; 44{ 45 SEARCH_BINDING *binding; 46 47 binding = (SEARCH_BINDING *)xmalloc (sizeof (SEARCH_BINDING)); 48 binding->buffer = buffer; 49 binding->start = start; 50 binding->end = end; 51 binding->flags = 0; 52 53 return (binding); 54} 55 56/* Make a copy of BINDING without duplicating the data. */ 57SEARCH_BINDING * 58copy_binding (binding) 59 SEARCH_BINDING *binding; 60{ 61 SEARCH_BINDING *copy; 62 63 copy = make_binding (binding->buffer, binding->start, binding->end); 64 copy->flags = binding->flags; 65 return (copy); 66} 67 68 69/* **************************************************************** */ 70/* */ 71/* The Actual Searching Functions */ 72/* */ 73/* **************************************************************** */ 74 75/* Search forwards or backwards for the text delimited by BINDING. 76 The search is forwards if BINDING->start is greater than BINDING->end. */ 77long 78search (string, binding) 79 char *string; 80 SEARCH_BINDING *binding; 81{ 82 long result; 83 84 /* If the search is backwards, then search backwards, otherwise forwards. */ 85 if (binding->start > binding->end) 86 result = search_backward (string, binding); 87 else 88 result = search_forward (string, binding); 89 90 return (result); 91} 92 93/* Search forwards for STRING through the text delimited in BINDING. */ 94long 95search_forward (string, binding) 96 char *string; 97 SEARCH_BINDING *binding; 98{ 99 register int c, i, len; 100 register char *buff, *end; 101 char *alternate = (char *)NULL; 102 103 len = strlen (string); 104 105 /* We match characters in the search buffer against STRING and ALTERNATE. 106 ALTERNATE is a case reversed version of STRING; this is cheaper than 107 case folding each character before comparison. Alternate is only 108 used if the case folding bit is turned on in the passed BINDING. */ 109 110 if (binding->flags & S_FoldCase) 111 { 112 alternate = xstrdup (string); 113 114 for (i = 0; i < len; i++) 115 { 116 if (islower (alternate[i])) 117 alternate[i] = toupper (alternate[i]); 118 else if (isupper (alternate[i])) 119 alternate[i] = tolower (alternate[i]); 120 } 121 } 122 123 buff = binding->buffer + binding->start; 124 end = binding->buffer + binding->end + 1; 125 126 while (buff < (end - len)) 127 { 128 for (i = 0; i < len; i++) 129 { 130 c = buff[i]; 131 132 if ((c != string[i]) && (!alternate || c != alternate[i])) 133 break; 134 } 135 136 if (!string[i]) 137 { 138 if (alternate) 139 free (alternate); 140 if (binding->flags & S_SkipDest) 141 buff += len; 142 return ((long) (buff - binding->buffer)); 143 } 144 145 buff++; 146 } 147 148 if (alternate) 149 free (alternate); 150 151 return ((long) -1); 152} 153 154/* Search for STRING backwards through the text delimited in BINDING. */ 155long 156search_backward (input_string, binding) 157 char *input_string; 158 SEARCH_BINDING *binding; 159{ 160 register int c, i, len; 161 register char *buff, *end; 162 char *string; 163 char *alternate = (char *)NULL; 164 165 len = strlen (input_string); 166 167 /* Reverse the characters in the search string. */ 168 string = (char *)xmalloc (1 + len); 169 for (c = 0, i = len - 1; input_string[c]; c++, i--) 170 string[i] = input_string[c]; 171 172 string[c] = '\0'; 173 174 /* We match characters in the search buffer against STRING and ALTERNATE. 175 ALTERNATE is a case reversed version of STRING; this is cheaper than 176 case folding each character before comparison. ALTERNATE is only 177 used if the case folding bit is turned on in the passed BINDING. */ 178 179 if (binding->flags & S_FoldCase) 180 { 181 alternate = xstrdup (string); 182 183 for (i = 0; i < len; i++) 184 { 185 if (islower (alternate[i])) 186 alternate[i] = toupper (alternate[i]); 187 else if (isupper (alternate[i])) 188 alternate[i] = tolower (alternate[i]); 189 } 190 } 191 192 buff = binding->buffer + binding->start - 1; 193 end = binding->buffer + binding->end; 194 195 while (buff > (end + len)) 196 { 197 for (i = 0; i < len; i++) 198 { 199 c = *(buff - i); 200 201 if (c != string[i] && (alternate && c != alternate[i])) 202 break; 203 } 204 205 if (!string[i]) 206 { 207 free (string); 208 if (alternate) 209 free (alternate); 210 211 if (binding->flags & S_SkipDest) 212 buff -= len; 213 return ((long) (1 + (buff - binding->buffer))); 214 } 215 216 buff--; 217 } 218 219 free (string); 220 if (alternate) 221 free (alternate); 222 223 return ((long) -1); 224} 225 226/* Find STRING in LINE, returning the offset of the end of the string. 227 Return an offset of -1 if STRING does not appear in LINE. The search 228 is bound by the end of the line (i.e., either NEWLINE or 0). */ 229int 230string_in_line (string, line) 231 char *string, *line; 232{ 233 register int end; 234 SEARCH_BINDING binding; 235 236 /* Find the end of the line. */ 237 for (end = 0; line[end] && line[end] != '\n'; end++); 238 239 /* Search for STRING within these confines. */ 240 binding.buffer = line; 241 binding.start = 0; 242 binding.end = end; 243 binding.flags = S_FoldCase | S_SkipDest; 244 245 return (search_forward (string, &binding)); 246} 247 248/* Return non-zero if STRING is the first text to appear at BINDING. */ 249int 250looking_at (string, binding) 251 char *string; 252 SEARCH_BINDING *binding; 253{ 254 long search_end; 255 256 search_end = search (string, binding); 257 258 /* If the string was not found, SEARCH_END is -1. If the string was found, 259 but not right away, SEARCH_END is != binding->start. Otherwise, the 260 string was found at binding->start. */ 261 return (search_end == binding->start); 262} 263 264/* **************************************************************** */ 265/* */ 266/* Small String Searches */ 267/* */ 268/* **************************************************************** */ 269 270/* Function names that start with "skip" are passed a string, and return 271 an offset from the start of that string. Function names that start 272 with "find" are passed a SEARCH_BINDING, and return an absolute position 273 marker of the item being searched for. "Find" functions return a value 274 of -1 if the item being looked for couldn't be found. */ 275 276/* Return the index of the first non-whitespace character in STRING. */ 277int 278skip_whitespace (string) 279 char *string; 280{ 281 register int i; 282 283 for (i = 0; string && whitespace (string[i]); i++); 284 return (i); 285} 286 287/* Return the index of the first non-whitespace or newline character in 288 STRING. */ 289int 290skip_whitespace_and_newlines (string) 291 char *string; 292{ 293 register int i; 294 295 for (i = 0; string && (whitespace (string[i]) || string[i] == '\n'); i++); 296 return (i); 297} 298 299/* Return the index of the first whitespace character in STRING. */ 300int 301skip_non_whitespace (string) 302 char *string; 303{ 304 register int i; 305 306 for (i = 0; string && !whitespace (string[i]); i++); 307 return (i); 308} 309 310/* Return the index of the first non-node character in STRING. Note that 311 this function contains quite a bit of hair to ignore periods in some 312 special cases. This is because we here at GNU ship some info files which 313 contain nodenames that contain periods. No such nodename can start with 314 a period, or continue with whitespace, newline, or ')' immediately following 315 the period. If second argument NEWLINES_OKAY is non-zero, newlines should 316 be skipped while parsing out the nodename specification. */ 317int 318skip_node_characters (string, newlines_okay) 319 char *string; 320 int newlines_okay; 321{ 322 register int c, i = 0; 323 int paren_seen = 0; 324 int paren = 0; 325 326 /* Handle special case. This is when another function has parsed out the 327 filename component of the node name, and we just want to parse out the 328 nodename proper. In that case, a period at the start of the nodename 329 indicates an empty nodename. */ 330 if (string && *string == '.') 331 return (0); 332 333 if (string && *string == '(') 334 { 335 paren++; 336 paren_seen++; 337 i++; 338 } 339 340 for (; string && (c = string[i]); i++) 341 { 342 if (paren) 343 { 344 if (c == '(') 345 paren++; 346 else if (c == ')') 347 paren--; 348 349 continue; 350 } 351 352 /* If the character following the close paren is a space or period, 353 then this node name has no more characters associated with it. */ 354 if (c == '\t' || 355 c == ',' || 356 c == INFO_TAGSEP || 357 ((!newlines_okay) && (c == '\n')) || 358 ((paren_seen && string[i - 1] == ')') && 359 (c == ' ' || c == '.')) || 360 (c == '.' && 361 ( 362#if 0 363/* This test causes a node name ending in a period, like `This.', not to 364 be found. The trailing . is stripped. This occurs in the jargon 365 file (`I see no X here.' is a node name). */ 366 (!string[i + 1]) || 367#endif 368 (whitespace_or_newline (string[i + 1])) || 369 (string[i + 1] == ')')))) 370 break; 371 } 372 return (i); 373} 374 375 376/* **************************************************************** */ 377/* */ 378/* Searching FILE_BUFFER's */ 379/* */ 380/* **************************************************************** */ 381 382/* Return the absolute position of the first occurence of a node separator in 383 BINDING-buffer. The search starts at BINDING->start. Return -1 if no node 384 separator was found. */ 385long 386find_node_separator (binding) 387 SEARCH_BINDING *binding; 388{ 389 register long i; 390 char *body; 391 392 body = binding->buffer; 393 394 /* A node is started by [^L]^_[^L]\n. That is to say, the C-l's are 395 optional, but the DELETE and NEWLINE are not. This separator holds 396 true for all separated elements in an Info file, including the tags 397 table (if present) and the indirect tags table (if present). */ 398 for (i = binding->start; i < binding->end - 1; i++) 399 if (((body[i] == INFO_FF && body[i + 1] == INFO_COOKIE) && 400 (body[i + 2] == '\n' || 401 (body[i + 2] == INFO_FF && body[i + 3] == '\n'))) || 402 ((body[i] == INFO_COOKIE) && 403 (body[i + 1] == '\n' || 404 (body[i + 1] == INFO_FF && body[i + 2] == '\n')))) 405 return (i); 406 return (-1); 407} 408 409/* Return the length of the node separator characters that BODY is 410 currently pointing at. */ 411int 412skip_node_separator (body) 413 char *body; 414{ 415 register int i; 416 417 i = 0; 418 419 if (body[i] == INFO_FF) 420 i++; 421 422 if (body[i++] != INFO_COOKIE) 423 return (0); 424 425 if (body[i] == INFO_FF) 426 i++; 427 428 if (body[i++] != '\n') 429 return (0); 430 431 return (i); 432} 433 434/* Return the number of characters from STRING to the start of 435 the next line. */ 436int 437skip_line (string) 438 char *string; 439{ 440 register int i; 441 442 for (i = 0; string && string[i] && string[i] != '\n'; i++); 443 444 if (string[i] == '\n') 445 i++; 446 447 return (i); 448} 449 450/* Return the absolute position of the beginning of a tags table in this 451 binding starting the search at binding->start. */ 452long 453find_tags_table (binding) 454 SEARCH_BINDING *binding; 455{ 456 SEARCH_BINDING search; 457 long position; 458 459 search.buffer = binding->buffer; 460 search.start = binding->start; 461 search.end = binding->end; 462 search.flags = S_FoldCase; 463 464 while ((position = find_node_separator (&search)) != -1 ) 465 { 466 search.start = position; 467 search.start += skip_node_separator (search.buffer + search.start); 468 469 if (looking_at (TAGS_TABLE_BEG_LABEL, &search)) 470 return (position); 471 } 472 return (-1); 473} 474 475/* Return the absolute position of the node named NODENAME in BINDING. 476 This is a brute force search, and we wish to avoid it when possible. 477 This function is called when a tag (indirect or otherwise) doesn't 478 really point to the right node. It returns the absolute position of 479 the separator preceding the node. */ 480long 481find_node_in_binding (nodename, binding) 482 char *nodename; 483 SEARCH_BINDING *binding; 484{ 485 long position; 486 int offset, namelen; 487 SEARCH_BINDING search; 488 489 namelen = strlen (nodename); 490 491 search.buffer = binding->buffer; 492 search.start = binding->start; 493 search.end = binding->end; 494 search.flags = 0; 495 496 while ((position = find_node_separator (&search)) != -1) 497 { 498 search.start = position; 499 search.start += skip_node_separator (search.buffer + search.start); 500 501 offset = string_in_line (INFO_NODE_LABEL, search.buffer + search.start); 502 503 if (offset == -1) 504 continue; 505 506 search.start += offset; 507 search.start += skip_whitespace (search.buffer + search.start); 508 offset = skip_node_characters 509 (search.buffer + search.start, DONT_SKIP_NEWLINES); 510 511 /* Notice that this is an exact match. You cannot grovel through 512 the buffer with this function looking for random nodes. */ 513 if ((offset == namelen) && 514 (search.buffer[search.start] == nodename[0]) && 515 (strncmp (search.buffer + search.start, nodename, offset) == 0)) 516 return (position); 517 } 518 return (-1); 519} 520