156160Sru/* search.c -- searching large bodies of text. 2146515Sru $Id: search.c,v 1.3 2004/04/11 17:56:46 karl Exp $ 321495Sjmacd 4146515Sru Copyright (C) 1993, 1997, 1998, 2002, 2004 Free Software Foundation, Inc. 521495Sjmacd 621495Sjmacd This program is free software; you can redistribute it and/or modify 721495Sjmacd it under the terms of the GNU General Public License as published by 821495Sjmacd the Free Software Foundation; either version 2, or (at your option) 921495Sjmacd any later version. 1021495Sjmacd 1121495Sjmacd This program is distributed in the hope that it will be useful, 1221495Sjmacd but WITHOUT ANY WARRANTY; without even the implied warranty of 1321495Sjmacd MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 1421495Sjmacd GNU General Public License for more details. 1521495Sjmacd 1621495Sjmacd You should have received a copy of the GNU General Public License 1721495Sjmacd along with this program; if not, write to the Free Software 1821495Sjmacd Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 1921495Sjmacd 2021495Sjmacd Written by Brian Fox (bfox@ai.mit.edu). */ 2121495Sjmacd 2242660Smarkm#include "info.h" 2342660Smarkm 2421495Sjmacd#include "search.h" 2521495Sjmacd#include "nodes.h" 2621495Sjmacd 2721495Sjmacd/* The search functions take two arguments: 2821495Sjmacd 2921495Sjmacd 1) a string to search for, and 3021495Sjmacd 3121495Sjmacd 2) a pointer to a SEARCH_BINDING which contains the buffer, start, 3221495Sjmacd and end of the search. 3321495Sjmacd 3421495Sjmacd They return a long, which is the offset from the start of the buffer 3521495Sjmacd at which the match was found. An offset of -1 indicates failure. */ 3621495Sjmacd 3721495Sjmacd/* A function which makes a binding with buffer and bounds. */ 3821495SjmacdSEARCH_BINDING * 39146515Srumake_binding (char *buffer, long int start, long int end) 4021495Sjmacd{ 4121495Sjmacd SEARCH_BINDING *binding; 4221495Sjmacd 4321495Sjmacd binding = (SEARCH_BINDING *)xmalloc (sizeof (SEARCH_BINDING)); 4421495Sjmacd binding->buffer = buffer; 4521495Sjmacd binding->start = start; 4621495Sjmacd binding->end = end; 4721495Sjmacd binding->flags = 0; 4821495Sjmacd 4921495Sjmacd return (binding); 5021495Sjmacd} 5121495Sjmacd 5221495Sjmacd/* Make a copy of BINDING without duplicating the data. */ 5321495SjmacdSEARCH_BINDING * 54146515Srucopy_binding (SEARCH_BINDING *binding) 5521495Sjmacd{ 5621495Sjmacd SEARCH_BINDING *copy; 5721495Sjmacd 5821495Sjmacd copy = make_binding (binding->buffer, binding->start, binding->end); 5921495Sjmacd copy->flags = binding->flags; 6021495Sjmacd return (copy); 6121495Sjmacd} 6221495Sjmacd 6321495Sjmacd 6421495Sjmacd/* **************************************************************** */ 6542660Smarkm/* */ 6642660Smarkm/* The Actual Searching Functions */ 6742660Smarkm/* */ 6821495Sjmacd/* **************************************************************** */ 6921495Sjmacd 7021495Sjmacd/* Search forwards or backwards for the text delimited by BINDING. 7121495Sjmacd The search is forwards if BINDING->start is greater than BINDING->end. */ 7221495Sjmacdlong 73146515Srusearch (char *string, SEARCH_BINDING *binding) 7421495Sjmacd{ 7521495Sjmacd long result; 7621495Sjmacd 7721495Sjmacd /* If the search is backwards, then search backwards, otherwise forwards. */ 7821495Sjmacd if (binding->start > binding->end) 7921495Sjmacd result = search_backward (string, binding); 8021495Sjmacd else 8121495Sjmacd result = search_forward (string, binding); 8221495Sjmacd 8321495Sjmacd return (result); 8421495Sjmacd} 8521495Sjmacd 8621495Sjmacd/* Search forwards for STRING through the text delimited in BINDING. */ 8721495Sjmacdlong 88146515Srusearch_forward (char *string, SEARCH_BINDING *binding) 8921495Sjmacd{ 9021495Sjmacd register int c, i, len; 9121495Sjmacd register char *buff, *end; 9221495Sjmacd char *alternate = (char *)NULL; 9321495Sjmacd 9421495Sjmacd len = strlen (string); 9521495Sjmacd 9621495Sjmacd /* We match characters in the search buffer against STRING and ALTERNATE. 9721495Sjmacd ALTERNATE is a case reversed version of STRING; this is cheaper than 9821495Sjmacd case folding each character before comparison. Alternate is only 9921495Sjmacd used if the case folding bit is turned on in the passed BINDING. */ 10021495Sjmacd 10121495Sjmacd if (binding->flags & S_FoldCase) 10221495Sjmacd { 10342660Smarkm alternate = xstrdup (string); 10421495Sjmacd 10521495Sjmacd for (i = 0; i < len; i++) 10642660Smarkm { 10742660Smarkm if (islower (alternate[i])) 10842660Smarkm alternate[i] = toupper (alternate[i]); 10942660Smarkm else if (isupper (alternate[i])) 11042660Smarkm alternate[i] = tolower (alternate[i]); 11142660Smarkm } 11221495Sjmacd } 11321495Sjmacd 11421495Sjmacd buff = binding->buffer + binding->start; 11521495Sjmacd end = binding->buffer + binding->end + 1; 11621495Sjmacd 11721495Sjmacd while (buff < (end - len)) 11821495Sjmacd { 11921495Sjmacd for (i = 0; i < len; i++) 12042660Smarkm { 12142660Smarkm c = buff[i]; 12221495Sjmacd 12342660Smarkm if ((c != string[i]) && (!alternate || c != alternate[i])) 12442660Smarkm break; 12542660Smarkm } 12621495Sjmacd 12721495Sjmacd if (!string[i]) 12842660Smarkm { 12942660Smarkm if (alternate) 13042660Smarkm free (alternate); 13142660Smarkm if (binding->flags & S_SkipDest) 13242660Smarkm buff += len; 13342660Smarkm return ((long) (buff - binding->buffer)); 13442660Smarkm } 13521495Sjmacd 13621495Sjmacd buff++; 13721495Sjmacd } 13821495Sjmacd 13921495Sjmacd if (alternate) 14021495Sjmacd free (alternate); 14121495Sjmacd 14221495Sjmacd return ((long) -1); 14321495Sjmacd} 14421495Sjmacd 14521495Sjmacd/* Search for STRING backwards through the text delimited in BINDING. */ 14621495Sjmacdlong 147146515Srusearch_backward (char *input_string, SEARCH_BINDING *binding) 14821495Sjmacd{ 14921495Sjmacd register int c, i, len; 15021495Sjmacd register char *buff, *end; 15121495Sjmacd char *string; 15221495Sjmacd char *alternate = (char *)NULL; 15321495Sjmacd 15421495Sjmacd len = strlen (input_string); 15521495Sjmacd 15621495Sjmacd /* Reverse the characters in the search string. */ 15721495Sjmacd string = (char *)xmalloc (1 + len); 15821495Sjmacd for (c = 0, i = len - 1; input_string[c]; c++, i--) 15921495Sjmacd string[i] = input_string[c]; 16021495Sjmacd 16121495Sjmacd string[c] = '\0'; 16221495Sjmacd 16321495Sjmacd /* We match characters in the search buffer against STRING and ALTERNATE. 16421495Sjmacd ALTERNATE is a case reversed version of STRING; this is cheaper than 16521495Sjmacd case folding each character before comparison. ALTERNATE is only 16621495Sjmacd used if the case folding bit is turned on in the passed BINDING. */ 16721495Sjmacd 16821495Sjmacd if (binding->flags & S_FoldCase) 16921495Sjmacd { 17042660Smarkm alternate = xstrdup (string); 17121495Sjmacd 17221495Sjmacd for (i = 0; i < len; i++) 17342660Smarkm { 17442660Smarkm if (islower (alternate[i])) 17542660Smarkm alternate[i] = toupper (alternate[i]); 17642660Smarkm else if (isupper (alternate[i])) 17742660Smarkm alternate[i] = tolower (alternate[i]); 17842660Smarkm } 17921495Sjmacd } 18021495Sjmacd 18121495Sjmacd buff = binding->buffer + binding->start - 1; 18221495Sjmacd end = binding->buffer + binding->end; 18321495Sjmacd 18421495Sjmacd while (buff > (end + len)) 18521495Sjmacd { 18621495Sjmacd for (i = 0; i < len; i++) 18742660Smarkm { 18842660Smarkm c = *(buff - i); 18921495Sjmacd 19056160Sru if (c != string[i] && (!alternate || c != alternate[i])) 19142660Smarkm break; 19242660Smarkm } 19321495Sjmacd 19421495Sjmacd if (!string[i]) 19542660Smarkm { 19642660Smarkm free (string); 19742660Smarkm if (alternate) 19842660Smarkm free (alternate); 19921495Sjmacd 20042660Smarkm if (binding->flags & S_SkipDest) 20142660Smarkm buff -= len; 20242660Smarkm return ((long) (1 + (buff - binding->buffer))); 20342660Smarkm } 20421495Sjmacd 20521495Sjmacd buff--; 20621495Sjmacd } 20721495Sjmacd 20821495Sjmacd free (string); 20921495Sjmacd if (alternate) 21021495Sjmacd free (alternate); 21121495Sjmacd 21221495Sjmacd return ((long) -1); 21321495Sjmacd} 21421495Sjmacd 21521495Sjmacd/* Find STRING in LINE, returning the offset of the end of the string. 21621495Sjmacd Return an offset of -1 if STRING does not appear in LINE. The search 21721495Sjmacd is bound by the end of the line (i.e., either NEWLINE or 0). */ 21821495Sjmacdint 219146515Srustring_in_line (char *string, char *line) 22021495Sjmacd{ 22121495Sjmacd register int end; 22221495Sjmacd SEARCH_BINDING binding; 22321495Sjmacd 22421495Sjmacd /* Find the end of the line. */ 22521495Sjmacd for (end = 0; line[end] && line[end] != '\n'; end++); 22621495Sjmacd 22721495Sjmacd /* Search for STRING within these confines. */ 22821495Sjmacd binding.buffer = line; 22921495Sjmacd binding.start = 0; 23021495Sjmacd binding.end = end; 23121495Sjmacd binding.flags = S_FoldCase | S_SkipDest; 23221495Sjmacd 23321495Sjmacd return (search_forward (string, &binding)); 23421495Sjmacd} 23521495Sjmacd 23621495Sjmacd/* Return non-zero if STRING is the first text to appear at BINDING. */ 23721495Sjmacdint 238146515Srulooking_at (char *string, SEARCH_BINDING *binding) 23921495Sjmacd{ 24021495Sjmacd long search_end; 24121495Sjmacd 24221495Sjmacd search_end = search (string, binding); 24321495Sjmacd 24421495Sjmacd /* If the string was not found, SEARCH_END is -1. If the string was found, 24521495Sjmacd but not right away, SEARCH_END is != binding->start. Otherwise, the 24621495Sjmacd string was found at binding->start. */ 24721495Sjmacd return (search_end == binding->start); 24821495Sjmacd} 24921495Sjmacd 25021495Sjmacd/* **************************************************************** */ 25142660Smarkm/* */ 25242660Smarkm/* Small String Searches */ 25342660Smarkm/* */ 25421495Sjmacd/* **************************************************************** */ 25521495Sjmacd 25621495Sjmacd/* Function names that start with "skip" are passed a string, and return 25721495Sjmacd an offset from the start of that string. Function names that start 25821495Sjmacd with "find" are passed a SEARCH_BINDING, and return an absolute position 25921495Sjmacd marker of the item being searched for. "Find" functions return a value 26021495Sjmacd of -1 if the item being looked for couldn't be found. */ 26121495Sjmacd 26221495Sjmacd/* Return the index of the first non-whitespace character in STRING. */ 26321495Sjmacdint 264146515Sruskip_whitespace (char *string) 26521495Sjmacd{ 26621495Sjmacd register int i; 26721495Sjmacd 26821495Sjmacd for (i = 0; string && whitespace (string[i]); i++); 26921495Sjmacd return (i); 27021495Sjmacd} 27121495Sjmacd 27221495Sjmacd/* Return the index of the first non-whitespace or newline character in 27321495Sjmacd STRING. */ 27421495Sjmacdint 275146515Sruskip_whitespace_and_newlines (char *string) 27621495Sjmacd{ 27721495Sjmacd register int i; 27821495Sjmacd 27956160Sru for (i = 0; string && whitespace_or_newline (string[i]); i++); 28021495Sjmacd return (i); 28121495Sjmacd} 28221495Sjmacd 28321495Sjmacd/* Return the index of the first whitespace character in STRING. */ 28421495Sjmacdint 285146515Sruskip_non_whitespace (char *string) 28621495Sjmacd{ 28721495Sjmacd register int i; 28821495Sjmacd 289100513Sru for (i = 0; string && string[i] && !whitespace (string[i]); i++); 29021495Sjmacd return (i); 29121495Sjmacd} 29221495Sjmacd 29321495Sjmacd/* Return the index of the first non-node character in STRING. Note that 29421495Sjmacd this function contains quite a bit of hair to ignore periods in some 29521495Sjmacd special cases. This is because we here at GNU ship some info files which 29621495Sjmacd contain nodenames that contain periods. No such nodename can start with 29721495Sjmacd a period, or continue with whitespace, newline, or ')' immediately following 29821495Sjmacd the period. If second argument NEWLINES_OKAY is non-zero, newlines should 29921495Sjmacd be skipped while parsing out the nodename specification. */ 30021495Sjmacdint 301146515Sruskip_node_characters (char *string, int newlines_okay) 30221495Sjmacd{ 30321495Sjmacd register int c, i = 0; 30421495Sjmacd int paren_seen = 0; 30521495Sjmacd int paren = 0; 30621495Sjmacd 30721495Sjmacd /* Handle special case. This is when another function has parsed out the 30821495Sjmacd filename component of the node name, and we just want to parse out the 30921495Sjmacd nodename proper. In that case, a period at the start of the nodename 31021495Sjmacd indicates an empty nodename. */ 31121495Sjmacd if (string && *string == '.') 31221495Sjmacd return (0); 31321495Sjmacd 31421495Sjmacd if (string && *string == '(') 31521495Sjmacd { 31621495Sjmacd paren++; 31721495Sjmacd paren_seen++; 31821495Sjmacd i++; 31921495Sjmacd } 32021495Sjmacd 32121495Sjmacd for (; string && (c = string[i]); i++) 32221495Sjmacd { 32321495Sjmacd if (paren) 32442660Smarkm { 32542660Smarkm if (c == '(') 32642660Smarkm paren++; 32742660Smarkm else if (c == ')') 32842660Smarkm paren--; 32921495Sjmacd 33042660Smarkm continue; 33142660Smarkm } 33221495Sjmacd 33321495Sjmacd /* If the character following the close paren is a space or period, 33442660Smarkm then this node name has no more characters associated with it. */ 33521495Sjmacd if (c == '\t' || 33642660Smarkm c == ',' || 33742660Smarkm c == INFO_TAGSEP || 33842660Smarkm ((!newlines_okay) && (c == '\n')) || 33942660Smarkm ((paren_seen && string[i - 1] == ')') && 34042660Smarkm (c == ' ' || c == '.')) || 34142660Smarkm (c == '.' && 34242660Smarkm ( 34342660Smarkm#if 0 34442660Smarkm/* This test causes a node name ending in a period, like `This.', not to 34542660Smarkm be found. The trailing . is stripped. This occurs in the jargon 34642660Smarkm file (`I see no X here.' is a node name). */ 34742660Smarkm (!string[i + 1]) || 34842660Smarkm#endif 34942660Smarkm (whitespace_or_newline (string[i + 1])) || 35042660Smarkm (string[i + 1] == ')')))) 35142660Smarkm break; 35221495Sjmacd } 35321495Sjmacd return (i); 35421495Sjmacd} 35521495Sjmacd 35621495Sjmacd 35721495Sjmacd/* **************************************************************** */ 35842660Smarkm/* */ 35942660Smarkm/* Searching FILE_BUFFER's */ 36042660Smarkm/* */ 36121495Sjmacd/* **************************************************************** */ 36221495Sjmacd 36321495Sjmacd/* Return the absolute position of the first occurence of a node separator in 36421495Sjmacd BINDING-buffer. The search starts at BINDING->start. Return -1 if no node 36521495Sjmacd separator was found. */ 36621495Sjmacdlong 367146515Srufind_node_separator (SEARCH_BINDING *binding) 36821495Sjmacd{ 36921495Sjmacd register long i; 37021495Sjmacd char *body; 37121495Sjmacd 37221495Sjmacd body = binding->buffer; 37321495Sjmacd 37421495Sjmacd /* A node is started by [^L]^_[^L]\n. That is to say, the C-l's are 37521495Sjmacd optional, but the DELETE and NEWLINE are not. This separator holds 37621495Sjmacd true for all separated elements in an Info file, including the tags 37721495Sjmacd table (if present) and the indirect tags table (if present). */ 37821495Sjmacd for (i = binding->start; i < binding->end - 1; i++) 37921495Sjmacd if (((body[i] == INFO_FF && body[i + 1] == INFO_COOKIE) && 38042660Smarkm (body[i + 2] == '\n' || 38142660Smarkm (body[i + 2] == INFO_FF && body[i + 3] == '\n'))) || 38242660Smarkm ((body[i] == INFO_COOKIE) && 38342660Smarkm (body[i + 1] == '\n' || 38442660Smarkm (body[i + 1] == INFO_FF && body[i + 2] == '\n')))) 38521495Sjmacd return (i); 38621495Sjmacd return (-1); 38721495Sjmacd} 38821495Sjmacd 38921495Sjmacd/* Return the length of the node separator characters that BODY is 39021495Sjmacd currently pointing at. */ 39121495Sjmacdint 392146515Sruskip_node_separator (char *body) 39321495Sjmacd{ 39421495Sjmacd register int i; 39521495Sjmacd 39621495Sjmacd i = 0; 39721495Sjmacd 39821495Sjmacd if (body[i] == INFO_FF) 39921495Sjmacd i++; 40021495Sjmacd 40121495Sjmacd if (body[i++] != INFO_COOKIE) 40221495Sjmacd return (0); 40321495Sjmacd 40421495Sjmacd if (body[i] == INFO_FF) 40521495Sjmacd i++; 40621495Sjmacd 40721495Sjmacd if (body[i++] != '\n') 40821495Sjmacd return (0); 40921495Sjmacd 41021495Sjmacd return (i); 41121495Sjmacd} 41221495Sjmacd 41321495Sjmacd/* Return the number of characters from STRING to the start of 41421495Sjmacd the next line. */ 41521495Sjmacdint 416146515Sruskip_line (char *string) 41721495Sjmacd{ 41821495Sjmacd register int i; 41921495Sjmacd 42021495Sjmacd for (i = 0; string && string[i] && string[i] != '\n'; i++); 42121495Sjmacd 42221495Sjmacd if (string[i] == '\n') 42321495Sjmacd i++; 42421495Sjmacd 42521495Sjmacd return (i); 42621495Sjmacd} 42721495Sjmacd 42821495Sjmacd/* Return the absolute position of the beginning of a tags table in this 42921495Sjmacd binding starting the search at binding->start. */ 43021495Sjmacdlong 431146515Srufind_tags_table (SEARCH_BINDING *binding) 43221495Sjmacd{ 433146515Sru SEARCH_BINDING tmp_search; 43421495Sjmacd long position; 43521495Sjmacd 436146515Sru tmp_search.buffer = binding->buffer; 437146515Sru tmp_search.start = binding->start; 438146515Sru tmp_search.end = binding->end; 439146515Sru tmp_search.flags = S_FoldCase; 44021495Sjmacd 441146515Sru while ((position = find_node_separator (&tmp_search)) != -1 ) 44221495Sjmacd { 443146515Sru tmp_search.start = position; 444146515Sru tmp_search.start += skip_node_separator (tmp_search.buffer 445146515Sru + tmp_search.start); 44621495Sjmacd 447146515Sru if (looking_at (TAGS_TABLE_BEG_LABEL, &tmp_search)) 44842660Smarkm return (position); 44921495Sjmacd } 45021495Sjmacd return (-1); 45121495Sjmacd} 45221495Sjmacd 45321495Sjmacd/* Return the absolute position of the node named NODENAME in BINDING. 45421495Sjmacd This is a brute force search, and we wish to avoid it when possible. 45521495Sjmacd This function is called when a tag (indirect or otherwise) doesn't 45621495Sjmacd really point to the right node. It returns the absolute position of 45721495Sjmacd the separator preceding the node. */ 45821495Sjmacdlong 459146515Srufind_node_in_binding (char *nodename, SEARCH_BINDING *binding) 46021495Sjmacd{ 46142660Smarkm long position; 46242660Smarkm int offset, namelen; 463146515Sru SEARCH_BINDING tmp_search; 46421495Sjmacd 46521495Sjmacd namelen = strlen (nodename); 46621495Sjmacd 467146515Sru tmp_search.buffer = binding->buffer; 468146515Sru tmp_search.start = binding->start; 469146515Sru tmp_search.end = binding->end; 470146515Sru tmp_search.flags = 0; 47121495Sjmacd 472146515Sru while ((position = find_node_separator (&tmp_search)) != -1) 47321495Sjmacd { 474146515Sru tmp_search.start = position; 475146515Sru tmp_search.start += skip_node_separator 476146515Sru (tmp_search.buffer + tmp_search.start); 47721495Sjmacd 478146515Sru offset = string_in_line 479146515Sru (INFO_NODE_LABEL, tmp_search.buffer + tmp_search.start); 48021495Sjmacd 48121495Sjmacd if (offset == -1) 48242660Smarkm continue; 48321495Sjmacd 484146515Sru tmp_search.start += offset; 485146515Sru tmp_search.start += skip_whitespace (tmp_search.buffer + tmp_search.start); 48621495Sjmacd offset = skip_node_characters 487146515Sru (tmp_search.buffer + tmp_search.start, DONT_SKIP_NEWLINES); 48821495Sjmacd 48921495Sjmacd /* Notice that this is an exact match. You cannot grovel through 49042660Smarkm the buffer with this function looking for random nodes. */ 49121495Sjmacd if ((offset == namelen) && 492146515Sru (tmp_search.buffer[tmp_search.start] == nodename[0]) && 493146515Sru (strncmp (tmp_search.buffer + tmp_search.start, nodename, offset) == 0)) 49442660Smarkm return (position); 49521495Sjmacd } 49621495Sjmacd return (-1); 49721495Sjmacd} 498