search.c revision 146515
10SN/A/* search.c -- searching large bodies of text.
20SN/A   $Id: search.c,v 1.3 2004/04/11 17:56:46 karl Exp $
30SN/A
40SN/A   Copyright (C) 1993, 1997, 1998, 2002, 2004 Free Software Foundation, Inc.
50SN/A
67433SJohn.Ojemann@Sun.COM   This program is free software; you can redistribute it and/or modify
70SN/A   it under the terms of the GNU General Public License as published by
80SN/A   the Free Software Foundation; either version 2, or (at your option)
90SN/A   any later version.
100SN/A
110SN/A   This program is distributed in the hope that it will be useful,
120SN/A   but WITHOUT ANY WARRANTY; without even the implied warranty of
130SN/A   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
140SN/A   GNU General Public License for more details.
150SN/A
160SN/A   You should have received a copy of the GNU General Public License
170SN/A   along with this program; if not, write to the Free Software
180SN/A   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
190SN/A
200SN/A   Written by Brian Fox (bfox@ai.mit.edu). */
212393Syz155240
222393Syz155240#include "info.h"
232393Syz155240
240SN/A#include "search.h"
250SN/A#include "nodes.h"
260SN/A
270SN/A/* The search functions take two arguments:
280SN/A
290SN/A     1) a string to search for, and
300SN/A
310SN/A     2) a pointer to a SEARCH_BINDING which contains the buffer, start,
320SN/A        and end of the search.
330SN/A
340SN/A   They return a long, which is the offset from the start of the buffer
350SN/A   at which the match was found.  An offset of -1 indicates failure. */
362393Syz155240
372393Syz155240/* A function which makes a binding with buffer and bounds. */
382393Syz155240SEARCH_BINDING *
390SN/Amake_binding (char *buffer, long int start, long int end)
400SN/A{
410SN/A  SEARCH_BINDING *binding;
422393Syz155240
432393Syz155240  binding = (SEARCH_BINDING *)xmalloc (sizeof (SEARCH_BINDING));
440SN/A  binding->buffer = buffer;
450SN/A  binding->start = start;
460SN/A  binding->end = end;
470SN/A  binding->flags = 0;
480SN/A
490SN/A  return (binding);
500SN/A}
510SN/A
520SN/A/* Make a copy of BINDING without duplicating the data. */
530SN/ASEARCH_BINDING *
540SN/Acopy_binding (SEARCH_BINDING *binding)
550SN/A{
560SN/A  SEARCH_BINDING *copy;
570SN/A
580SN/A  copy = make_binding (binding->buffer, binding->start, binding->end);
590SN/A  copy->flags = binding->flags;
600SN/A  return (copy);
610SN/A}
620SN/A
630SN/A
645274Syx160601/* **************************************************************** */
650SN/A/*                                                                  */
660SN/A/*                 The Actual Searching Functions                   */
670SN/A/*                                                                  */
685274Syx160601/* **************************************************************** */
690SN/A
700SN/A/* Search forwards or backwards for the text delimited by BINDING.
710SN/A   The search is forwards if BINDING->start is greater than BINDING->end. */
720SN/Along
730SN/Asearch (char *string, SEARCH_BINDING *binding)
740SN/A{
750SN/A  long result;
760SN/A
770SN/A  /* If the search is backwards, then search backwards, otherwise forwards. */
780SN/A  if (binding->start > binding->end)
790SN/A    result = search_backward (string, binding);
800SN/A  else
812393Syz155240    result = search_forward (string, binding);
822393Syz155240
832393Syz155240  return (result);
840SN/A}
850SN/A
860SN/A/* Search forwards for STRING through the text delimited in BINDING. */
872393Syz155240long
880SN/Asearch_forward (char *string, SEARCH_BINDING *binding)
893448Sdh155122{
900SN/A  register int c, i, len;
910SN/A  register char *buff, *end;
920SN/A  char *alternate = (char *)NULL;
930SN/A
940SN/A  len = strlen (string);
950SN/A
960SN/A  /* We match characters in the search buffer against STRING and ALTERNATE.
970SN/A     ALTERNATE is a case reversed version of STRING; this is cheaper than
982393Syz155240     case folding each character before comparison.   Alternate is only
992393Syz155240     used if the case folding bit is turned on in the passed BINDING. */
1002393Syz155240
1012393Syz155240  if (binding->flags & S_FoldCase)
1022393Syz155240    {
1032393Syz155240      alternate = xstrdup (string);
1042393Syz155240
1052393Syz155240      for (i = 0; i < len; i++)
1062393Syz155240        {
1072393Syz155240          if (islower (alternate[i]))
1082393Syz155240            alternate[i] = toupper (alternate[i]);
1092393Syz155240          else if (isupper (alternate[i]))
1102393Syz155240            alternate[i] = tolower (alternate[i]);
1112393Syz155240        }
1120SN/A    }
1132393Syz155240
1140SN/A  buff = binding->buffer + binding->start;
1150SN/A  end = binding->buffer + binding->end + 1;
1160SN/A
1170SN/A  while (buff < (end - len))
1180SN/A    {
1190SN/A      for (i = 0; i < len; i++)
1200SN/A        {
1212393Syz155240          c = buff[i];
1220SN/A
1232393Syz155240          if ((c != string[i]) && (!alternate || c != alternate[i]))
1240SN/A            break;
1253448Sdh155122        }
1263448Sdh155122
1270SN/A      if (!string[i])
1283448Sdh155122        {
1290SN/A          if (alternate)
1300SN/A            free (alternate);
1310SN/A          if (binding->flags & S_SkipDest)
1323448Sdh155122            buff += len;
1330SN/A          return ((long) (buff - binding->buffer));
1340SN/A        }
1350SN/A
1363448Sdh155122      buff++;
1370SN/A    }
1380SN/A
1390SN/A  if (alternate)
1403448Sdh155122    free (alternate);
1410SN/A
1420SN/A  return ((long) -1);
1430SN/A}
1443448Sdh155122
1450SN/A/* Search for STRING backwards through the text delimited in BINDING. */
1460SN/Along
1470SN/Asearch_backward (char *input_string, SEARCH_BINDING *binding)
1483448Sdh155122{
1490SN/A  register int c, i, len;
1500SN/A  register char *buff, *end;
1510SN/A  char *string;
1523448Sdh155122  char *alternate = (char *)NULL;
1530SN/A
1542393Syz155240  len = strlen (input_string);
1552393Syz155240
1562393Syz155240  /* Reverse the characters in the search string. */
1573448Sdh155122  string = (char *)xmalloc (1 + len);
1582393Syz155240  for (c = 0, i = len - 1; input_string[c]; c++, i--)
1592393Syz155240    string[i] = input_string[c];
1600SN/A
1610SN/A  string[c] = '\0';
1623448Sdh155122
1630SN/A  /* We match characters in the search buffer against STRING and ALTERNATE.
1643448Sdh155122     ALTERNATE is a case reversed version of STRING; this is cheaper than
1650SN/A     case folding each character before comparison.   ALTERNATE is only
1660SN/A     used if the case folding bit is turned on in the passed BINDING. */
1670SN/A
1680SN/A  if (binding->flags & S_FoldCase)
1693448Sdh155122    {
1700SN/A      alternate = xstrdup (string);
1710SN/A
1720SN/A      for (i = 0; i < len; i++)
1733448Sdh155122        {
1740SN/A          if (islower (alternate[i]))
1750SN/A            alternate[i] = toupper (alternate[i]);
1760SN/A          else if (isupper (alternate[i]))
1773448Sdh155122            alternate[i] = tolower (alternate[i]);
1780SN/A        }
1790SN/A    }
1800SN/A
1810SN/A  buff = binding->buffer + binding->start - 1;
1820SN/A  end = binding->buffer + binding->end;
1830SN/A
1843448Sdh155122  while (buff > (end + len))
1850SN/A    {
1863448Sdh155122      for (i = 0; i < len; i++)
1870SN/A        {
1880SN/A          c = *(buff - i);
1890SN/A
1903448Sdh155122          if (c != string[i] && (!alternate || c != alternate[i]))
1910SN/A            break;
1920SN/A        }
1932393Syz155240
1942393Syz155240      if (!string[i])
1952393Syz155240        {
1962393Syz155240          free (string);
1970SN/A          if (alternate)
1982393Syz155240            free (alternate);
1990SN/A
2003448Sdh155122          if (binding->flags & S_SkipDest)
2010SN/A            buff -= len;
2020SN/A          return ((long) (1 + (buff - binding->buffer)));
2032393Syz155240        }
2042393Syz155240
2052393Syz155240      buff--;
2062393Syz155240    }
2070SN/A
2082393Syz155240  free (string);
2093448Sdh155122  if (alternate)
2103448Sdh155122    free (alternate);
2110SN/A
2123448Sdh155122  return ((long) -1);
2133448Sdh155122}
2140SN/A
2150SN/A/* Find STRING in LINE, returning the offset of the end of the string.
2160SN/A   Return an offset of -1 if STRING does not appear in LINE.  The search
2170SN/A   is bound by the end of the line (i.e., either NEWLINE or 0). */
2180SN/Aint
2190SN/Astring_in_line (char *string, char *line)
2200SN/A{
2210SN/A  register int end;
2220SN/A  SEARCH_BINDING binding;
2233448Sdh155122
2240SN/A  /* Find the end of the line. */
2253448Sdh155122  for (end = 0; line[end] && line[end] != '\n'; end++);
2260SN/A
2270SN/A  /* Search for STRING within these confines. */
2280SN/A  binding.buffer = line;
2290SN/A  binding.start = 0;
2303448Sdh155122  binding.end = end;
2312393Syz155240  binding.flags = S_FoldCase | S_SkipDest;
2322393Syz155240
2332393Syz155240  return (search_forward (string, &binding));
2342393Syz155240}
2350SN/A
2362393Syz155240/* Return non-zero if STRING is the first text to appear at BINDING. */
2372393Syz155240int
2382393Syz155240looking_at (char *string, SEARCH_BINDING *binding)
2392393Syz155240{
2400SN/A  long search_end;
2412393Syz155240
2423448Sdh155122  search_end = search (string, binding);
2432393Syz155240
2442393Syz155240  /* If the string was not found, SEARCH_END is -1.  If the string was found,
2452393Syz155240     but not right away, SEARCH_END is != binding->start.  Otherwise, the
2462393Syz155240     string was found at binding->start. */
2470SN/A  return (search_end == binding->start);
2480SN/A}
2490SN/A
2500SN/A/* **************************************************************** */
2510SN/A/*                                                                  */
2520SN/A/*                    Small String Searches                         */
2530SN/A/*                                                                  */
2540SN/A/* **************************************************************** */
2550SN/A
2563448Sdh155122/* Function names that start with "skip" are passed a string, and return
2570SN/A   an offset from the start of that string.  Function names that start
2583448Sdh155122   with "find" are passed a SEARCH_BINDING, and return an absolute position
2590SN/A   marker of the item being searched for.  "Find" functions return a value
2600SN/A   of -1 if the item being looked for couldn't be found. */
2610SN/A
2623448Sdh155122/* Return the index of the first non-whitespace character in STRING. */
2633448Sdh155122int
2640SN/Askip_whitespace (char *string)
2650SN/A{
2660SN/A  register int i;
2672393Syz155240
2682393Syz155240  for (i = 0; string && whitespace (string[i]); i++);
2692393Syz155240  return (i);
2702393Syz155240}
2710SN/A
2722393Syz155240/* Return the index of the first non-whitespace or newline character in
2730SN/A   STRING. */
2740SN/Aint
2752393Syz155240skip_whitespace_and_newlines (char *string)
2762393Syz155240{
2770SN/A  register int i;
2780SN/A
2790SN/A  for (i = 0; string && whitespace_or_newline (string[i]); i++);
2800SN/A  return (i);
2810SN/A}
2820SN/A
2830SN/A/* Return the index of the first whitespace character in STRING. */
2840SN/Aint
2850SN/Askip_non_whitespace (char *string)
2860SN/A{
2870SN/A  register int i;
2880SN/A
2890SN/A  for (i = 0; string && string[i] && !whitespace (string[i]); i++);
2900SN/A  return (i);
2910SN/A}
2920SN/A
2930SN/A/* Return the index of the first non-node character in STRING.  Note that
2940SN/A   this function contains quite a bit of hair to ignore periods in some
2950SN/A   special cases.  This is because we here at GNU ship some info files which
2960SN/A   contain nodenames that contain periods.  No such nodename can start with
2970SN/A   a period, or continue with whitespace, newline, or ')' immediately following
2980SN/A   the period.  If second argument NEWLINES_OKAY is non-zero, newlines should
2990SN/A   be skipped while parsing out the nodename specification. */
3000SN/Aint
3013448Sdh155122skip_node_characters (char *string, int newlines_okay)
3020SN/A{
3032393Syz155240  register int c, i = 0;
3040SN/A  int paren_seen = 0;
3053448Sdh155122  int paren = 0;
3060SN/A
3070SN/A  /* Handle special case.  This is when another function has parsed out the
3080SN/A     filename component of the node name, and we just want to parse out the
3090SN/A     nodename proper.  In that case, a period at the start of the nodename
3100SN/A     indicates an empty nodename. */
3110SN/A  if (string && *string == '.')
3120SN/A    return (0);
3130SN/A
3140SN/A  if (string && *string == '(')
3150SN/A    {
3167433SJohn.Ojemann@Sun.COM      paren++;
3177433SJohn.Ojemann@Sun.COM      paren_seen++;
3187433SJohn.Ojemann@Sun.COM      i++;
3197433SJohn.Ojemann@Sun.COM    }
3200SN/A
3210SN/A  for (; string && (c = string[i]); i++)
3220SN/A    {
3230SN/A      if (paren)
3240SN/A        {
3250SN/A          if (c == '(')
3260SN/A            paren++;
3270SN/A          else if (c == ')')
3280SN/A            paren--;
3290SN/A
3300SN/A          continue;
3310SN/A        }
3320SN/A
3330SN/A      /* If the character following the close paren is a space or period,
3340SN/A         then this node name has no more characters associated with it. */
3350SN/A      if (c == '\t' ||
3360SN/A          c == ','  ||
3370SN/A          c == INFO_TAGSEP ||
3383448Sdh155122          ((!newlines_okay) && (c == '\n')) ||
3390SN/A          ((paren_seen && string[i - 1] == ')') &&
3402393Syz155240           (c == ' ' || c == '.')) ||
3412393Syz155240          (c == '.' &&
3420SN/A           (
3430SN/A#if 0
3440SN/A/* This test causes a node name ending in a period, like `This.', not to
3450SN/A   be found.  The trailing . is stripped.  This occurs in the jargon
3460SN/A   file (`I see no X here.' is a node name).  */
3470SN/A           (!string[i + 1]) ||
3480SN/A#endif
3490SN/A            (whitespace_or_newline (string[i + 1])) ||
3500SN/A            (string[i + 1] == ')'))))
3510SN/A        break;
3520SN/A    }
3530SN/A  return (i);
3540SN/A}
3550SN/A
3560SN/A
3570SN/A/* **************************************************************** */
3580SN/A/*                                                                  */
3590SN/A/*                   Searching FILE_BUFFER's                        */
3600SN/A/*                                                                  */
3610SN/A/* **************************************************************** */
3620SN/A
3632393Syz155240/* Return the absolute position of the first occurence of a node separator in
3640SN/A   BINDING-buffer.  The search starts at BINDING->start.  Return -1 if no node
3650SN/A   separator was found. */
3662393Syz155240long
3672393Syz155240find_node_separator (SEARCH_BINDING *binding)
3682393Syz155240{
3692393Syz155240  register long i;
3702393Syz155240  char *body;
3712393Syz155240
3722393Syz155240  body = binding->buffer;
3732393Syz155240
3742393Syz155240  /* A node is started by [^L]^_[^L]\n.  That is to say, the C-l's are
3750SN/A     optional, but the DELETE and NEWLINE are not.  This separator holds
3762393Syz155240     true for all separated elements in an Info file, including the tags
3772393Syz155240     table (if present) and the indirect tags table (if present). */
3780SN/A  for (i = binding->start; i < binding->end - 1; i++)
3792393Syz155240    if (((body[i] == INFO_FF && body[i + 1] == INFO_COOKIE) &&
3802393Syz155240         (body[i + 2] == '\n' ||
3812393Syz155240          (body[i + 2] == INFO_FF && body[i + 3] == '\n'))) ||
3822393Syz155240        ((body[i] == INFO_COOKIE) &&
3830SN/A         (body[i + 1] == '\n' ||
3842393Syz155240          (body[i + 1] == INFO_FF && body[i + 2] == '\n'))))
3852393Syz155240      return (i);
3862393Syz155240  return (-1);
3873448Sdh155122}
3882393Syz155240
3892393Syz155240/* Return the length of the node separator characters that BODY is
3902393Syz155240   currently pointing at. */
3910SN/Aint
3922393Syz155240skip_node_separator (char *body)
3932393Syz155240{
3940SN/A  register int i;
3950SN/A
3960SN/A  i = 0;
3970SN/A
3980SN/A  if (body[i] == INFO_FF)
3990SN/A    i++;
4000SN/A
4010SN/A  if (body[i++] != INFO_COOKIE)
4020SN/A    return (0);
4030SN/A
4040SN/A  if (body[i] == INFO_FF)
4050SN/A    i++;
4060SN/A
4070SN/A  if (body[i++] != '\n')
4080SN/A    return (0);
4093448Sdh155122
4100SN/A  return (i);
4112393Syz155240}
4122393Syz155240
4132393Syz155240/* Return the number of characters from STRING to the start of
4142393Syz155240   the next line. */
4152393Syz155240int
4162393Syz155240skip_line (char *string)
4172393Syz155240{
4180SN/A  register int i;
4192393Syz155240
4200SN/A  for (i = 0; string && string[i] && string[i] != '\n'; i++);
4210SN/A
4220SN/A  if (string[i] == '\n')
4232393Syz155240    i++;
4242393Syz155240
4252393Syz155240  return (i);
4262393Syz155240}
4272393Syz155240
4280SN/A/* Return the absolute position of the beginning of a tags table in this
4292393Syz155240   binding starting the search at binding->start. */
4300SN/Along
4310SN/Afind_tags_table (SEARCH_BINDING *binding)
4322393Syz155240{
4332393Syz155240  SEARCH_BINDING tmp_search;
4342393Syz155240  long position;
4352393Syz155240
4360SN/A  tmp_search.buffer = binding->buffer;
4372393Syz155240  tmp_search.start = binding->start;
4382393Syz155240  tmp_search.end = binding->end;
4390SN/A  tmp_search.flags = S_FoldCase;
4400SN/A
4410SN/A  while ((position = find_node_separator (&tmp_search)) != -1 )
4420SN/A    {
4430SN/A      tmp_search.start = position;
4440SN/A      tmp_search.start += skip_node_separator (tmp_search.buffer
4453448Sdh155122          + tmp_search.start);
4460SN/A
4470SN/A      if (looking_at (TAGS_TABLE_BEG_LABEL, &tmp_search))
4480SN/A        return (position);
4490SN/A    }
4502393Syz155240  return (-1);
4512393Syz155240}
4522393Syz155240
4530SN/A/* Return the absolute position of the node named NODENAME in BINDING.
4540SN/A   This is a brute force search, and we wish to avoid it when possible.
4550SN/A   This function is called when a tag (indirect or otherwise) doesn't
4563448Sdh155122   really point to the right node.  It returns the absolute position of
4573448Sdh155122   the separator preceding the node. */
4580SN/Along
4590SN/Afind_node_in_binding (char *nodename, SEARCH_BINDING *binding)
4600SN/A{
4610SN/A  long position;
4620SN/A  int offset, namelen;
4630SN/A  SEARCH_BINDING tmp_search;
4640SN/A
4650SN/A  namelen = strlen (nodename);
4660SN/A
4670SN/A  tmp_search.buffer = binding->buffer;
4680SN/A  tmp_search.start = binding->start;
4690SN/A  tmp_search.end = binding->end;
4700SN/A  tmp_search.flags = 0;
4710SN/A
4720SN/A  while ((position = find_node_separator (&tmp_search)) != -1)
4730SN/A    {
4742958Sdr146992      tmp_search.start = position;
4750SN/A      tmp_search.start += skip_node_separator
4760SN/A        (tmp_search.buffer + tmp_search.start);
4770SN/A
4780SN/A      offset = string_in_line
4790SN/A        (INFO_NODE_LABEL, tmp_search.buffer + tmp_search.start);
4805274Syx160601
4815274Syx160601      if (offset == -1)
4820SN/A        continue;
4830SN/A
4840SN/A      tmp_search.start += offset;
4850SN/A      tmp_search.start += skip_whitespace (tmp_search.buffer + tmp_search.start);
4860SN/A      offset = skip_node_characters
4870SN/A        (tmp_search.buffer + tmp_search.start, DONT_SKIP_NEWLINES);
4880SN/A
4890SN/A      /* Notice that this is an exact match.  You cannot grovel through
4903448Sdh155122         the buffer with this function looking for random nodes. */
4913448Sdh155122       if ((offset == namelen) &&
4922958Sdr146992           (tmp_search.buffer[tmp_search.start] == nodename[0]) &&
4937513SDarren.Reed@Sun.COM           (strncmp (tmp_search.buffer + tmp_search.start, nodename, offset) == 0))
4942958Sdr146992         return (position);
4953448Sdh155122    }
4962958Sdr146992  return (-1);
4973448Sdh155122}
4982958Sdr146992