search.c revision 146515
10SN/A/* search.c -- searching large bodies of text. 20SN/A $Id: search.c,v 1.3 2004/04/11 17:56:46 karl Exp $ 30SN/A 40SN/A Copyright (C) 1993, 1997, 1998, 2002, 2004 Free Software Foundation, Inc. 50SN/A 67433SJohn.Ojemann@Sun.COM This program is free software; you can redistribute it and/or modify 70SN/A it under the terms of the GNU General Public License as published by 80SN/A the Free Software Foundation; either version 2, or (at your option) 90SN/A any later version. 100SN/A 110SN/A This program is distributed in the hope that it will be useful, 120SN/A but WITHOUT ANY WARRANTY; without even the implied warranty of 130SN/A MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 140SN/A GNU General Public License for more details. 150SN/A 160SN/A You should have received a copy of the GNU General Public License 170SN/A along with this program; if not, write to the Free Software 180SN/A Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 190SN/A 200SN/A Written by Brian Fox (bfox@ai.mit.edu). */ 212393Syz155240 222393Syz155240#include "info.h" 232393Syz155240 240SN/A#include "search.h" 250SN/A#include "nodes.h" 260SN/A 270SN/A/* The search functions take two arguments: 280SN/A 290SN/A 1) a string to search for, and 300SN/A 310SN/A 2) a pointer to a SEARCH_BINDING which contains the buffer, start, 320SN/A and end of the search. 330SN/A 340SN/A They return a long, which is the offset from the start of the buffer 350SN/A at which the match was found. An offset of -1 indicates failure. */ 362393Syz155240 372393Syz155240/* A function which makes a binding with buffer and bounds. */ 382393Syz155240SEARCH_BINDING * 390SN/Amake_binding (char *buffer, long int start, long int end) 400SN/A{ 410SN/A SEARCH_BINDING *binding; 422393Syz155240 432393Syz155240 binding = (SEARCH_BINDING *)xmalloc (sizeof (SEARCH_BINDING)); 440SN/A binding->buffer = buffer; 450SN/A binding->start = start; 460SN/A binding->end = end; 470SN/A binding->flags = 0; 480SN/A 490SN/A return (binding); 500SN/A} 510SN/A 520SN/A/* Make a copy of BINDING without duplicating the data. */ 530SN/ASEARCH_BINDING * 540SN/Acopy_binding (SEARCH_BINDING *binding) 550SN/A{ 560SN/A SEARCH_BINDING *copy; 570SN/A 580SN/A copy = make_binding (binding->buffer, binding->start, binding->end); 590SN/A copy->flags = binding->flags; 600SN/A return (copy); 610SN/A} 620SN/A 630SN/A 645274Syx160601/* **************************************************************** */ 650SN/A/* */ 660SN/A/* The Actual Searching Functions */ 670SN/A/* */ 685274Syx160601/* **************************************************************** */ 690SN/A 700SN/A/* Search forwards or backwards for the text delimited by BINDING. 710SN/A The search is forwards if BINDING->start is greater than BINDING->end. */ 720SN/Along 730SN/Asearch (char *string, SEARCH_BINDING *binding) 740SN/A{ 750SN/A long result; 760SN/A 770SN/A /* If the search is backwards, then search backwards, otherwise forwards. */ 780SN/A if (binding->start > binding->end) 790SN/A result = search_backward (string, binding); 800SN/A else 812393Syz155240 result = search_forward (string, binding); 822393Syz155240 832393Syz155240 return (result); 840SN/A} 850SN/A 860SN/A/* Search forwards for STRING through the text delimited in BINDING. */ 872393Syz155240long 880SN/Asearch_forward (char *string, SEARCH_BINDING *binding) 893448Sdh155122{ 900SN/A register int c, i, len; 910SN/A register char *buff, *end; 920SN/A char *alternate = (char *)NULL; 930SN/A 940SN/A len = strlen (string); 950SN/A 960SN/A /* We match characters in the search buffer against STRING and ALTERNATE. 970SN/A ALTERNATE is a case reversed version of STRING; this is cheaper than 982393Syz155240 case folding each character before comparison. Alternate is only 992393Syz155240 used if the case folding bit is turned on in the passed BINDING. */ 1002393Syz155240 1012393Syz155240 if (binding->flags & S_FoldCase) 1022393Syz155240 { 1032393Syz155240 alternate = xstrdup (string); 1042393Syz155240 1052393Syz155240 for (i = 0; i < len; i++) 1062393Syz155240 { 1072393Syz155240 if (islower (alternate[i])) 1082393Syz155240 alternate[i] = toupper (alternate[i]); 1092393Syz155240 else if (isupper (alternate[i])) 1102393Syz155240 alternate[i] = tolower (alternate[i]); 1112393Syz155240 } 1120SN/A } 1132393Syz155240 1140SN/A buff = binding->buffer + binding->start; 1150SN/A end = binding->buffer + binding->end + 1; 1160SN/A 1170SN/A while (buff < (end - len)) 1180SN/A { 1190SN/A for (i = 0; i < len; i++) 1200SN/A { 1212393Syz155240 c = buff[i]; 1220SN/A 1232393Syz155240 if ((c != string[i]) && (!alternate || c != alternate[i])) 1240SN/A break; 1253448Sdh155122 } 1263448Sdh155122 1270SN/A if (!string[i]) 1283448Sdh155122 { 1290SN/A if (alternate) 1300SN/A free (alternate); 1310SN/A if (binding->flags & S_SkipDest) 1323448Sdh155122 buff += len; 1330SN/A return ((long) (buff - binding->buffer)); 1340SN/A } 1350SN/A 1363448Sdh155122 buff++; 1370SN/A } 1380SN/A 1390SN/A if (alternate) 1403448Sdh155122 free (alternate); 1410SN/A 1420SN/A return ((long) -1); 1430SN/A} 1443448Sdh155122 1450SN/A/* Search for STRING backwards through the text delimited in BINDING. */ 1460SN/Along 1470SN/Asearch_backward (char *input_string, SEARCH_BINDING *binding) 1483448Sdh155122{ 1490SN/A register int c, i, len; 1500SN/A register char *buff, *end; 1510SN/A char *string; 1523448Sdh155122 char *alternate = (char *)NULL; 1530SN/A 1542393Syz155240 len = strlen (input_string); 1552393Syz155240 1562393Syz155240 /* Reverse the characters in the search string. */ 1573448Sdh155122 string = (char *)xmalloc (1 + len); 1582393Syz155240 for (c = 0, i = len - 1; input_string[c]; c++, i--) 1592393Syz155240 string[i] = input_string[c]; 1600SN/A 1610SN/A string[c] = '\0'; 1623448Sdh155122 1630SN/A /* We match characters in the search buffer against STRING and ALTERNATE. 1643448Sdh155122 ALTERNATE is a case reversed version of STRING; this is cheaper than 1650SN/A case folding each character before comparison. ALTERNATE is only 1660SN/A used if the case folding bit is turned on in the passed BINDING. */ 1670SN/A 1680SN/A if (binding->flags & S_FoldCase) 1693448Sdh155122 { 1700SN/A alternate = xstrdup (string); 1710SN/A 1720SN/A for (i = 0; i < len; i++) 1733448Sdh155122 { 1740SN/A if (islower (alternate[i])) 1750SN/A alternate[i] = toupper (alternate[i]); 1760SN/A else if (isupper (alternate[i])) 1773448Sdh155122 alternate[i] = tolower (alternate[i]); 1780SN/A } 1790SN/A } 1800SN/A 1810SN/A buff = binding->buffer + binding->start - 1; 1820SN/A end = binding->buffer + binding->end; 1830SN/A 1843448Sdh155122 while (buff > (end + len)) 1850SN/A { 1863448Sdh155122 for (i = 0; i < len; i++) 1870SN/A { 1880SN/A c = *(buff - i); 1890SN/A 1903448Sdh155122 if (c != string[i] && (!alternate || c != alternate[i])) 1910SN/A break; 1920SN/A } 1932393Syz155240 1942393Syz155240 if (!string[i]) 1952393Syz155240 { 1962393Syz155240 free (string); 1970SN/A if (alternate) 1982393Syz155240 free (alternate); 1990SN/A 2003448Sdh155122 if (binding->flags & S_SkipDest) 2010SN/A buff -= len; 2020SN/A return ((long) (1 + (buff - binding->buffer))); 2032393Syz155240 } 2042393Syz155240 2052393Syz155240 buff--; 2062393Syz155240 } 2070SN/A 2082393Syz155240 free (string); 2093448Sdh155122 if (alternate) 2103448Sdh155122 free (alternate); 2110SN/A 2123448Sdh155122 return ((long) -1); 2133448Sdh155122} 2140SN/A 2150SN/A/* Find STRING in LINE, returning the offset of the end of the string. 2160SN/A Return an offset of -1 if STRING does not appear in LINE. The search 2170SN/A is bound by the end of the line (i.e., either NEWLINE or 0). */ 2180SN/Aint 2190SN/Astring_in_line (char *string, char *line) 2200SN/A{ 2210SN/A register int end; 2220SN/A SEARCH_BINDING binding; 2233448Sdh155122 2240SN/A /* Find the end of the line. */ 2253448Sdh155122 for (end = 0; line[end] && line[end] != '\n'; end++); 2260SN/A 2270SN/A /* Search for STRING within these confines. */ 2280SN/A binding.buffer = line; 2290SN/A binding.start = 0; 2303448Sdh155122 binding.end = end; 2312393Syz155240 binding.flags = S_FoldCase | S_SkipDest; 2322393Syz155240 2332393Syz155240 return (search_forward (string, &binding)); 2342393Syz155240} 2350SN/A 2362393Syz155240/* Return non-zero if STRING is the first text to appear at BINDING. */ 2372393Syz155240int 2382393Syz155240looking_at (char *string, SEARCH_BINDING *binding) 2392393Syz155240{ 2400SN/A long search_end; 2412393Syz155240 2423448Sdh155122 search_end = search (string, binding); 2432393Syz155240 2442393Syz155240 /* If the string was not found, SEARCH_END is -1. If the string was found, 2452393Syz155240 but not right away, SEARCH_END is != binding->start. Otherwise, the 2462393Syz155240 string was found at binding->start. */ 2470SN/A return (search_end == binding->start); 2480SN/A} 2490SN/A 2500SN/A/* **************************************************************** */ 2510SN/A/* */ 2520SN/A/* Small String Searches */ 2530SN/A/* */ 2540SN/A/* **************************************************************** */ 2550SN/A 2563448Sdh155122/* Function names that start with "skip" are passed a string, and return 2570SN/A an offset from the start of that string. Function names that start 2583448Sdh155122 with "find" are passed a SEARCH_BINDING, and return an absolute position 2590SN/A marker of the item being searched for. "Find" functions return a value 2600SN/A of -1 if the item being looked for couldn't be found. */ 2610SN/A 2623448Sdh155122/* Return the index of the first non-whitespace character in STRING. */ 2633448Sdh155122int 2640SN/Askip_whitespace (char *string) 2650SN/A{ 2660SN/A register int i; 2672393Syz155240 2682393Syz155240 for (i = 0; string && whitespace (string[i]); i++); 2692393Syz155240 return (i); 2702393Syz155240} 2710SN/A 2722393Syz155240/* Return the index of the first non-whitespace or newline character in 2730SN/A STRING. */ 2740SN/Aint 2752393Syz155240skip_whitespace_and_newlines (char *string) 2762393Syz155240{ 2770SN/A register int i; 2780SN/A 2790SN/A for (i = 0; string && whitespace_or_newline (string[i]); i++); 2800SN/A return (i); 2810SN/A} 2820SN/A 2830SN/A/* Return the index of the first whitespace character in STRING. */ 2840SN/Aint 2850SN/Askip_non_whitespace (char *string) 2860SN/A{ 2870SN/A register int i; 2880SN/A 2890SN/A for (i = 0; string && string[i] && !whitespace (string[i]); i++); 2900SN/A return (i); 2910SN/A} 2920SN/A 2930SN/A/* Return the index of the first non-node character in STRING. Note that 2940SN/A this function contains quite a bit of hair to ignore periods in some 2950SN/A special cases. This is because we here at GNU ship some info files which 2960SN/A contain nodenames that contain periods. No such nodename can start with 2970SN/A a period, or continue with whitespace, newline, or ')' immediately following 2980SN/A the period. If second argument NEWLINES_OKAY is non-zero, newlines should 2990SN/A be skipped while parsing out the nodename specification. */ 3000SN/Aint 3013448Sdh155122skip_node_characters (char *string, int newlines_okay) 3020SN/A{ 3032393Syz155240 register int c, i = 0; 3040SN/A int paren_seen = 0; 3053448Sdh155122 int paren = 0; 3060SN/A 3070SN/A /* Handle special case. This is when another function has parsed out the 3080SN/A filename component of the node name, and we just want to parse out the 3090SN/A nodename proper. In that case, a period at the start of the nodename 3100SN/A indicates an empty nodename. */ 3110SN/A if (string && *string == '.') 3120SN/A return (0); 3130SN/A 3140SN/A if (string && *string == '(') 3150SN/A { 3167433SJohn.Ojemann@Sun.COM paren++; 3177433SJohn.Ojemann@Sun.COM paren_seen++; 3187433SJohn.Ojemann@Sun.COM i++; 3197433SJohn.Ojemann@Sun.COM } 3200SN/A 3210SN/A for (; string && (c = string[i]); i++) 3220SN/A { 3230SN/A if (paren) 3240SN/A { 3250SN/A if (c == '(') 3260SN/A paren++; 3270SN/A else if (c == ')') 3280SN/A paren--; 3290SN/A 3300SN/A continue; 3310SN/A } 3320SN/A 3330SN/A /* If the character following the close paren is a space or period, 3340SN/A then this node name has no more characters associated with it. */ 3350SN/A if (c == '\t' || 3360SN/A c == ',' || 3370SN/A c == INFO_TAGSEP || 3383448Sdh155122 ((!newlines_okay) && (c == '\n')) || 3390SN/A ((paren_seen && string[i - 1] == ')') && 3402393Syz155240 (c == ' ' || c == '.')) || 3412393Syz155240 (c == '.' && 3420SN/A ( 3430SN/A#if 0 3440SN/A/* This test causes a node name ending in a period, like `This.', not to 3450SN/A be found. The trailing . is stripped. This occurs in the jargon 3460SN/A file (`I see no X here.' is a node name). */ 3470SN/A (!string[i + 1]) || 3480SN/A#endif 3490SN/A (whitespace_or_newline (string[i + 1])) || 3500SN/A (string[i + 1] == ')')))) 3510SN/A break; 3520SN/A } 3530SN/A return (i); 3540SN/A} 3550SN/A 3560SN/A 3570SN/A/* **************************************************************** */ 3580SN/A/* */ 3590SN/A/* Searching FILE_BUFFER's */ 3600SN/A/* */ 3610SN/A/* **************************************************************** */ 3620SN/A 3632393Syz155240/* Return the absolute position of the first occurence of a node separator in 3640SN/A BINDING-buffer. The search starts at BINDING->start. Return -1 if no node 3650SN/A separator was found. */ 3662393Syz155240long 3672393Syz155240find_node_separator (SEARCH_BINDING *binding) 3682393Syz155240{ 3692393Syz155240 register long i; 3702393Syz155240 char *body; 3712393Syz155240 3722393Syz155240 body = binding->buffer; 3732393Syz155240 3742393Syz155240 /* A node is started by [^L]^_[^L]\n. That is to say, the C-l's are 3750SN/A optional, but the DELETE and NEWLINE are not. This separator holds 3762393Syz155240 true for all separated elements in an Info file, including the tags 3772393Syz155240 table (if present) and the indirect tags table (if present). */ 3780SN/A for (i = binding->start; i < binding->end - 1; i++) 3792393Syz155240 if (((body[i] == INFO_FF && body[i + 1] == INFO_COOKIE) && 3802393Syz155240 (body[i + 2] == '\n' || 3812393Syz155240 (body[i + 2] == INFO_FF && body[i + 3] == '\n'))) || 3822393Syz155240 ((body[i] == INFO_COOKIE) && 3830SN/A (body[i + 1] == '\n' || 3842393Syz155240 (body[i + 1] == INFO_FF && body[i + 2] == '\n')))) 3852393Syz155240 return (i); 3862393Syz155240 return (-1); 3873448Sdh155122} 3882393Syz155240 3892393Syz155240/* Return the length of the node separator characters that BODY is 3902393Syz155240 currently pointing at. */ 3910SN/Aint 3922393Syz155240skip_node_separator (char *body) 3932393Syz155240{ 3940SN/A register int i; 3950SN/A 3960SN/A i = 0; 3970SN/A 3980SN/A if (body[i] == INFO_FF) 3990SN/A i++; 4000SN/A 4010SN/A if (body[i++] != INFO_COOKIE) 4020SN/A return (0); 4030SN/A 4040SN/A if (body[i] == INFO_FF) 4050SN/A i++; 4060SN/A 4070SN/A if (body[i++] != '\n') 4080SN/A return (0); 4093448Sdh155122 4100SN/A return (i); 4112393Syz155240} 4122393Syz155240 4132393Syz155240/* Return the number of characters from STRING to the start of 4142393Syz155240 the next line. */ 4152393Syz155240int 4162393Syz155240skip_line (char *string) 4172393Syz155240{ 4180SN/A register int i; 4192393Syz155240 4200SN/A for (i = 0; string && string[i] && string[i] != '\n'; i++); 4210SN/A 4220SN/A if (string[i] == '\n') 4232393Syz155240 i++; 4242393Syz155240 4252393Syz155240 return (i); 4262393Syz155240} 4272393Syz155240 4280SN/A/* Return the absolute position of the beginning of a tags table in this 4292393Syz155240 binding starting the search at binding->start. */ 4300SN/Along 4310SN/Afind_tags_table (SEARCH_BINDING *binding) 4322393Syz155240{ 4332393Syz155240 SEARCH_BINDING tmp_search; 4342393Syz155240 long position; 4352393Syz155240 4360SN/A tmp_search.buffer = binding->buffer; 4372393Syz155240 tmp_search.start = binding->start; 4382393Syz155240 tmp_search.end = binding->end; 4390SN/A tmp_search.flags = S_FoldCase; 4400SN/A 4410SN/A while ((position = find_node_separator (&tmp_search)) != -1 ) 4420SN/A { 4430SN/A tmp_search.start = position; 4440SN/A tmp_search.start += skip_node_separator (tmp_search.buffer 4453448Sdh155122 + tmp_search.start); 4460SN/A 4470SN/A if (looking_at (TAGS_TABLE_BEG_LABEL, &tmp_search)) 4480SN/A return (position); 4490SN/A } 4502393Syz155240 return (-1); 4512393Syz155240} 4522393Syz155240 4530SN/A/* Return the absolute position of the node named NODENAME in BINDING. 4540SN/A This is a brute force search, and we wish to avoid it when possible. 4550SN/A This function is called when a tag (indirect or otherwise) doesn't 4563448Sdh155122 really point to the right node. It returns the absolute position of 4573448Sdh155122 the separator preceding the node. */ 4580SN/Along 4590SN/Afind_node_in_binding (char *nodename, SEARCH_BINDING *binding) 4600SN/A{ 4610SN/A long position; 4620SN/A int offset, namelen; 4630SN/A SEARCH_BINDING tmp_search; 4640SN/A 4650SN/A namelen = strlen (nodename); 4660SN/A 4670SN/A tmp_search.buffer = binding->buffer; 4680SN/A tmp_search.start = binding->start; 4690SN/A tmp_search.end = binding->end; 4700SN/A tmp_search.flags = 0; 4710SN/A 4720SN/A while ((position = find_node_separator (&tmp_search)) != -1) 4730SN/A { 4742958Sdr146992 tmp_search.start = position; 4750SN/A tmp_search.start += skip_node_separator 4760SN/A (tmp_search.buffer + tmp_search.start); 4770SN/A 4780SN/A offset = string_in_line 4790SN/A (INFO_NODE_LABEL, tmp_search.buffer + tmp_search.start); 4805274Syx160601 4815274Syx160601 if (offset == -1) 4820SN/A continue; 4830SN/A 4840SN/A tmp_search.start += offset; 4850SN/A tmp_search.start += skip_whitespace (tmp_search.buffer + tmp_search.start); 4860SN/A offset = skip_node_characters 4870SN/A (tmp_search.buffer + tmp_search.start, DONT_SKIP_NEWLINES); 4880SN/A 4890SN/A /* Notice that this is an exact match. You cannot grovel through 4903448Sdh155122 the buffer with this function looking for random nodes. */ 4913448Sdh155122 if ((offset == namelen) && 4922958Sdr146992 (tmp_search.buffer[tmp_search.start] == nodename[0]) && 4937513SDarren.Reed@Sun.COM (strncmp (tmp_search.buffer + tmp_search.start, nodename, offset) == 0)) 4942958Sdr146992 return (position); 4953448Sdh155122 } 4962958Sdr146992 return (-1); 4973448Sdh155122} 4982958Sdr146992