156160Sru/* search.c -- searching large bodies of text.
2146515Sru   $Id: search.c,v 1.3 2004/04/11 17:56:46 karl Exp $
321495Sjmacd
4146515Sru   Copyright (C) 1993, 1997, 1998, 2002, 2004 Free Software Foundation, Inc.
521495Sjmacd
621495Sjmacd   This program is free software; you can redistribute it and/or modify
721495Sjmacd   it under the terms of the GNU General Public License as published by
821495Sjmacd   the Free Software Foundation; either version 2, or (at your option)
921495Sjmacd   any later version.
1021495Sjmacd
1121495Sjmacd   This program is distributed in the hope that it will be useful,
1221495Sjmacd   but WITHOUT ANY WARRANTY; without even the implied warranty of
1321495Sjmacd   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
1421495Sjmacd   GNU General Public License for more details.
1521495Sjmacd
1621495Sjmacd   You should have received a copy of the GNU General Public License
1721495Sjmacd   along with this program; if not, write to the Free Software
1821495Sjmacd   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
1921495Sjmacd
2021495Sjmacd   Written by Brian Fox (bfox@ai.mit.edu). */
2121495Sjmacd
2242660Smarkm#include "info.h"
2342660Smarkm
2421495Sjmacd#include "search.h"
2521495Sjmacd#include "nodes.h"
2621495Sjmacd
2721495Sjmacd/* The search functions take two arguments:
2821495Sjmacd
2921495Sjmacd     1) a string to search for, and
3021495Sjmacd
3121495Sjmacd     2) a pointer to a SEARCH_BINDING which contains the buffer, start,
3221495Sjmacd        and end of the search.
3321495Sjmacd
3421495Sjmacd   They return a long, which is the offset from the start of the buffer
3521495Sjmacd   at which the match was found.  An offset of -1 indicates failure. */
3621495Sjmacd
3721495Sjmacd/* A function which makes a binding with buffer and bounds. */
3821495SjmacdSEARCH_BINDING *
39146515Srumake_binding (char *buffer, long int start, long int end)
4021495Sjmacd{
4121495Sjmacd  SEARCH_BINDING *binding;
4221495Sjmacd
4321495Sjmacd  binding = (SEARCH_BINDING *)xmalloc (sizeof (SEARCH_BINDING));
4421495Sjmacd  binding->buffer = buffer;
4521495Sjmacd  binding->start = start;
4621495Sjmacd  binding->end = end;
4721495Sjmacd  binding->flags = 0;
4821495Sjmacd
4921495Sjmacd  return (binding);
5021495Sjmacd}
5121495Sjmacd
5221495Sjmacd/* Make a copy of BINDING without duplicating the data. */
5321495SjmacdSEARCH_BINDING *
54146515Srucopy_binding (SEARCH_BINDING *binding)
5521495Sjmacd{
5621495Sjmacd  SEARCH_BINDING *copy;
5721495Sjmacd
5821495Sjmacd  copy = make_binding (binding->buffer, binding->start, binding->end);
5921495Sjmacd  copy->flags = binding->flags;
6021495Sjmacd  return (copy);
6121495Sjmacd}
6221495Sjmacd
6321495Sjmacd
6421495Sjmacd/* **************************************************************** */
6542660Smarkm/*                                                                  */
6642660Smarkm/*                 The Actual Searching Functions                   */
6742660Smarkm/*                                                                  */
6821495Sjmacd/* **************************************************************** */
6921495Sjmacd
7021495Sjmacd/* Search forwards or backwards for the text delimited by BINDING.
7121495Sjmacd   The search is forwards if BINDING->start is greater than BINDING->end. */
7221495Sjmacdlong
73146515Srusearch (char *string, SEARCH_BINDING *binding)
7421495Sjmacd{
7521495Sjmacd  long result;
7621495Sjmacd
7721495Sjmacd  /* If the search is backwards, then search backwards, otherwise forwards. */
7821495Sjmacd  if (binding->start > binding->end)
7921495Sjmacd    result = search_backward (string, binding);
8021495Sjmacd  else
8121495Sjmacd    result = search_forward (string, binding);
8221495Sjmacd
8321495Sjmacd  return (result);
8421495Sjmacd}
8521495Sjmacd
8621495Sjmacd/* Search forwards for STRING through the text delimited in BINDING. */
8721495Sjmacdlong
88146515Srusearch_forward (char *string, SEARCH_BINDING *binding)
8921495Sjmacd{
9021495Sjmacd  register int c, i, len;
9121495Sjmacd  register char *buff, *end;
9221495Sjmacd  char *alternate = (char *)NULL;
9321495Sjmacd
9421495Sjmacd  len = strlen (string);
9521495Sjmacd
9621495Sjmacd  /* We match characters in the search buffer against STRING and ALTERNATE.
9721495Sjmacd     ALTERNATE is a case reversed version of STRING; this is cheaper than
9821495Sjmacd     case folding each character before comparison.   Alternate is only
9921495Sjmacd     used if the case folding bit is turned on in the passed BINDING. */
10021495Sjmacd
10121495Sjmacd  if (binding->flags & S_FoldCase)
10221495Sjmacd    {
10342660Smarkm      alternate = xstrdup (string);
10421495Sjmacd
10521495Sjmacd      for (i = 0; i < len; i++)
10642660Smarkm        {
10742660Smarkm          if (islower (alternate[i]))
10842660Smarkm            alternate[i] = toupper (alternate[i]);
10942660Smarkm          else if (isupper (alternate[i]))
11042660Smarkm            alternate[i] = tolower (alternate[i]);
11142660Smarkm        }
11221495Sjmacd    }
11321495Sjmacd
11421495Sjmacd  buff = binding->buffer + binding->start;
11521495Sjmacd  end = binding->buffer + binding->end + 1;
11621495Sjmacd
11721495Sjmacd  while (buff < (end - len))
11821495Sjmacd    {
11921495Sjmacd      for (i = 0; i < len; i++)
12042660Smarkm        {
12142660Smarkm          c = buff[i];
12221495Sjmacd
12342660Smarkm          if ((c != string[i]) && (!alternate || c != alternate[i]))
12442660Smarkm            break;
12542660Smarkm        }
12621495Sjmacd
12721495Sjmacd      if (!string[i])
12842660Smarkm        {
12942660Smarkm          if (alternate)
13042660Smarkm            free (alternate);
13142660Smarkm          if (binding->flags & S_SkipDest)
13242660Smarkm            buff += len;
13342660Smarkm          return ((long) (buff - binding->buffer));
13442660Smarkm        }
13521495Sjmacd
13621495Sjmacd      buff++;
13721495Sjmacd    }
13821495Sjmacd
13921495Sjmacd  if (alternate)
14021495Sjmacd    free (alternate);
14121495Sjmacd
14221495Sjmacd  return ((long) -1);
14321495Sjmacd}
14421495Sjmacd
14521495Sjmacd/* Search for STRING backwards through the text delimited in BINDING. */
14621495Sjmacdlong
147146515Srusearch_backward (char *input_string, SEARCH_BINDING *binding)
14821495Sjmacd{
14921495Sjmacd  register int c, i, len;
15021495Sjmacd  register char *buff, *end;
15121495Sjmacd  char *string;
15221495Sjmacd  char *alternate = (char *)NULL;
15321495Sjmacd
15421495Sjmacd  len = strlen (input_string);
15521495Sjmacd
15621495Sjmacd  /* Reverse the characters in the search string. */
15721495Sjmacd  string = (char *)xmalloc (1 + len);
15821495Sjmacd  for (c = 0, i = len - 1; input_string[c]; c++, i--)
15921495Sjmacd    string[i] = input_string[c];
16021495Sjmacd
16121495Sjmacd  string[c] = '\0';
16221495Sjmacd
16321495Sjmacd  /* We match characters in the search buffer against STRING and ALTERNATE.
16421495Sjmacd     ALTERNATE is a case reversed version of STRING; this is cheaper than
16521495Sjmacd     case folding each character before comparison.   ALTERNATE is only
16621495Sjmacd     used if the case folding bit is turned on in the passed BINDING. */
16721495Sjmacd
16821495Sjmacd  if (binding->flags & S_FoldCase)
16921495Sjmacd    {
17042660Smarkm      alternate = xstrdup (string);
17121495Sjmacd
17221495Sjmacd      for (i = 0; i < len; i++)
17342660Smarkm        {
17442660Smarkm          if (islower (alternate[i]))
17542660Smarkm            alternate[i] = toupper (alternate[i]);
17642660Smarkm          else if (isupper (alternate[i]))
17742660Smarkm            alternate[i] = tolower (alternate[i]);
17842660Smarkm        }
17921495Sjmacd    }
18021495Sjmacd
18121495Sjmacd  buff = binding->buffer + binding->start - 1;
18221495Sjmacd  end = binding->buffer + binding->end;
18321495Sjmacd
18421495Sjmacd  while (buff > (end + len))
18521495Sjmacd    {
18621495Sjmacd      for (i = 0; i < len; i++)
18742660Smarkm        {
18842660Smarkm          c = *(buff - i);
18921495Sjmacd
19056160Sru          if (c != string[i] && (!alternate || c != alternate[i]))
19142660Smarkm            break;
19242660Smarkm        }
19321495Sjmacd
19421495Sjmacd      if (!string[i])
19542660Smarkm        {
19642660Smarkm          free (string);
19742660Smarkm          if (alternate)
19842660Smarkm            free (alternate);
19921495Sjmacd
20042660Smarkm          if (binding->flags & S_SkipDest)
20142660Smarkm            buff -= len;
20242660Smarkm          return ((long) (1 + (buff - binding->buffer)));
20342660Smarkm        }
20421495Sjmacd
20521495Sjmacd      buff--;
20621495Sjmacd    }
20721495Sjmacd
20821495Sjmacd  free (string);
20921495Sjmacd  if (alternate)
21021495Sjmacd    free (alternate);
21121495Sjmacd
21221495Sjmacd  return ((long) -1);
21321495Sjmacd}
21421495Sjmacd
21521495Sjmacd/* Find STRING in LINE, returning the offset of the end of the string.
21621495Sjmacd   Return an offset of -1 if STRING does not appear in LINE.  The search
21721495Sjmacd   is bound by the end of the line (i.e., either NEWLINE or 0). */
21821495Sjmacdint
219146515Srustring_in_line (char *string, char *line)
22021495Sjmacd{
22121495Sjmacd  register int end;
22221495Sjmacd  SEARCH_BINDING binding;
22321495Sjmacd
22421495Sjmacd  /* Find the end of the line. */
22521495Sjmacd  for (end = 0; line[end] && line[end] != '\n'; end++);
22621495Sjmacd
22721495Sjmacd  /* Search for STRING within these confines. */
22821495Sjmacd  binding.buffer = line;
22921495Sjmacd  binding.start = 0;
23021495Sjmacd  binding.end = end;
23121495Sjmacd  binding.flags = S_FoldCase | S_SkipDest;
23221495Sjmacd
23321495Sjmacd  return (search_forward (string, &binding));
23421495Sjmacd}
23521495Sjmacd
23621495Sjmacd/* Return non-zero if STRING is the first text to appear at BINDING. */
23721495Sjmacdint
238146515Srulooking_at (char *string, SEARCH_BINDING *binding)
23921495Sjmacd{
24021495Sjmacd  long search_end;
24121495Sjmacd
24221495Sjmacd  search_end = search (string, binding);
24321495Sjmacd
24421495Sjmacd  /* If the string was not found, SEARCH_END is -1.  If the string was found,
24521495Sjmacd     but not right away, SEARCH_END is != binding->start.  Otherwise, the
24621495Sjmacd     string was found at binding->start. */
24721495Sjmacd  return (search_end == binding->start);
24821495Sjmacd}
24921495Sjmacd
25021495Sjmacd/* **************************************************************** */
25142660Smarkm/*                                                                  */
25242660Smarkm/*                    Small String Searches                         */
25342660Smarkm/*                                                                  */
25421495Sjmacd/* **************************************************************** */
25521495Sjmacd
25621495Sjmacd/* Function names that start with "skip" are passed a string, and return
25721495Sjmacd   an offset from the start of that string.  Function names that start
25821495Sjmacd   with "find" are passed a SEARCH_BINDING, and return an absolute position
25921495Sjmacd   marker of the item being searched for.  "Find" functions return a value
26021495Sjmacd   of -1 if the item being looked for couldn't be found. */
26121495Sjmacd
26221495Sjmacd/* Return the index of the first non-whitespace character in STRING. */
26321495Sjmacdint
264146515Sruskip_whitespace (char *string)
26521495Sjmacd{
26621495Sjmacd  register int i;
26721495Sjmacd
26821495Sjmacd  for (i = 0; string && whitespace (string[i]); i++);
26921495Sjmacd  return (i);
27021495Sjmacd}
27121495Sjmacd
27221495Sjmacd/* Return the index of the first non-whitespace or newline character in
27321495Sjmacd   STRING. */
27421495Sjmacdint
275146515Sruskip_whitespace_and_newlines (char *string)
27621495Sjmacd{
27721495Sjmacd  register int i;
27821495Sjmacd
27956160Sru  for (i = 0; string && whitespace_or_newline (string[i]); i++);
28021495Sjmacd  return (i);
28121495Sjmacd}
28221495Sjmacd
28321495Sjmacd/* Return the index of the first whitespace character in STRING. */
28421495Sjmacdint
285146515Sruskip_non_whitespace (char *string)
28621495Sjmacd{
28721495Sjmacd  register int i;
28821495Sjmacd
289100513Sru  for (i = 0; string && string[i] && !whitespace (string[i]); i++);
29021495Sjmacd  return (i);
29121495Sjmacd}
29221495Sjmacd
29321495Sjmacd/* Return the index of the first non-node character in STRING.  Note that
29421495Sjmacd   this function contains quite a bit of hair to ignore periods in some
29521495Sjmacd   special cases.  This is because we here at GNU ship some info files which
29621495Sjmacd   contain nodenames that contain periods.  No such nodename can start with
29721495Sjmacd   a period, or continue with whitespace, newline, or ')' immediately following
29821495Sjmacd   the period.  If second argument NEWLINES_OKAY is non-zero, newlines should
29921495Sjmacd   be skipped while parsing out the nodename specification. */
30021495Sjmacdint
301146515Sruskip_node_characters (char *string, int newlines_okay)
30221495Sjmacd{
30321495Sjmacd  register int c, i = 0;
30421495Sjmacd  int paren_seen = 0;
30521495Sjmacd  int paren = 0;
30621495Sjmacd
30721495Sjmacd  /* Handle special case.  This is when another function has parsed out the
30821495Sjmacd     filename component of the node name, and we just want to parse out the
30921495Sjmacd     nodename proper.  In that case, a period at the start of the nodename
31021495Sjmacd     indicates an empty nodename. */
31121495Sjmacd  if (string && *string == '.')
31221495Sjmacd    return (0);
31321495Sjmacd
31421495Sjmacd  if (string && *string == '(')
31521495Sjmacd    {
31621495Sjmacd      paren++;
31721495Sjmacd      paren_seen++;
31821495Sjmacd      i++;
31921495Sjmacd    }
32021495Sjmacd
32121495Sjmacd  for (; string && (c = string[i]); i++)
32221495Sjmacd    {
32321495Sjmacd      if (paren)
32442660Smarkm        {
32542660Smarkm          if (c == '(')
32642660Smarkm            paren++;
32742660Smarkm          else if (c == ')')
32842660Smarkm            paren--;
32921495Sjmacd
33042660Smarkm          continue;
33142660Smarkm        }
33221495Sjmacd
33321495Sjmacd      /* If the character following the close paren is a space or period,
33442660Smarkm         then this node name has no more characters associated with it. */
33521495Sjmacd      if (c == '\t' ||
33642660Smarkm          c == ','  ||
33742660Smarkm          c == INFO_TAGSEP ||
33842660Smarkm          ((!newlines_okay) && (c == '\n')) ||
33942660Smarkm          ((paren_seen && string[i - 1] == ')') &&
34042660Smarkm           (c == ' ' || c == '.')) ||
34142660Smarkm          (c == '.' &&
34242660Smarkm           (
34342660Smarkm#if 0
34442660Smarkm/* This test causes a node name ending in a period, like `This.', not to
34542660Smarkm   be found.  The trailing . is stripped.  This occurs in the jargon
34642660Smarkm   file (`I see no X here.' is a node name).  */
34742660Smarkm           (!string[i + 1]) ||
34842660Smarkm#endif
34942660Smarkm            (whitespace_or_newline (string[i + 1])) ||
35042660Smarkm            (string[i + 1] == ')'))))
35142660Smarkm        break;
35221495Sjmacd    }
35321495Sjmacd  return (i);
35421495Sjmacd}
35521495Sjmacd
35621495Sjmacd
35721495Sjmacd/* **************************************************************** */
35842660Smarkm/*                                                                  */
35942660Smarkm/*                   Searching FILE_BUFFER's                        */
36042660Smarkm/*                                                                  */
36121495Sjmacd/* **************************************************************** */
36221495Sjmacd
36321495Sjmacd/* Return the absolute position of the first occurence of a node separator in
36421495Sjmacd   BINDING-buffer.  The search starts at BINDING->start.  Return -1 if no node
36521495Sjmacd   separator was found. */
36621495Sjmacdlong
367146515Srufind_node_separator (SEARCH_BINDING *binding)
36821495Sjmacd{
36921495Sjmacd  register long i;
37021495Sjmacd  char *body;
37121495Sjmacd
37221495Sjmacd  body = binding->buffer;
37321495Sjmacd
37421495Sjmacd  /* A node is started by [^L]^_[^L]\n.  That is to say, the C-l's are
37521495Sjmacd     optional, but the DELETE and NEWLINE are not.  This separator holds
37621495Sjmacd     true for all separated elements in an Info file, including the tags
37721495Sjmacd     table (if present) and the indirect tags table (if present). */
37821495Sjmacd  for (i = binding->start; i < binding->end - 1; i++)
37921495Sjmacd    if (((body[i] == INFO_FF && body[i + 1] == INFO_COOKIE) &&
38042660Smarkm         (body[i + 2] == '\n' ||
38142660Smarkm          (body[i + 2] == INFO_FF && body[i + 3] == '\n'))) ||
38242660Smarkm        ((body[i] == INFO_COOKIE) &&
38342660Smarkm         (body[i + 1] == '\n' ||
38442660Smarkm          (body[i + 1] == INFO_FF && body[i + 2] == '\n'))))
38521495Sjmacd      return (i);
38621495Sjmacd  return (-1);
38721495Sjmacd}
38821495Sjmacd
38921495Sjmacd/* Return the length of the node separator characters that BODY is
39021495Sjmacd   currently pointing at. */
39121495Sjmacdint
392146515Sruskip_node_separator (char *body)
39321495Sjmacd{
39421495Sjmacd  register int i;
39521495Sjmacd
39621495Sjmacd  i = 0;
39721495Sjmacd
39821495Sjmacd  if (body[i] == INFO_FF)
39921495Sjmacd    i++;
40021495Sjmacd
40121495Sjmacd  if (body[i++] != INFO_COOKIE)
40221495Sjmacd    return (0);
40321495Sjmacd
40421495Sjmacd  if (body[i] == INFO_FF)
40521495Sjmacd    i++;
40621495Sjmacd
40721495Sjmacd  if (body[i++] != '\n')
40821495Sjmacd    return (0);
40921495Sjmacd
41021495Sjmacd  return (i);
41121495Sjmacd}
41221495Sjmacd
41321495Sjmacd/* Return the number of characters from STRING to the start of
41421495Sjmacd   the next line. */
41521495Sjmacdint
416146515Sruskip_line (char *string)
41721495Sjmacd{
41821495Sjmacd  register int i;
41921495Sjmacd
42021495Sjmacd  for (i = 0; string && string[i] && string[i] != '\n'; i++);
42121495Sjmacd
42221495Sjmacd  if (string[i] == '\n')
42321495Sjmacd    i++;
42421495Sjmacd
42521495Sjmacd  return (i);
42621495Sjmacd}
42721495Sjmacd
42821495Sjmacd/* Return the absolute position of the beginning of a tags table in this
42921495Sjmacd   binding starting the search at binding->start. */
43021495Sjmacdlong
431146515Srufind_tags_table (SEARCH_BINDING *binding)
43221495Sjmacd{
433146515Sru  SEARCH_BINDING tmp_search;
43421495Sjmacd  long position;
43521495Sjmacd
436146515Sru  tmp_search.buffer = binding->buffer;
437146515Sru  tmp_search.start = binding->start;
438146515Sru  tmp_search.end = binding->end;
439146515Sru  tmp_search.flags = S_FoldCase;
44021495Sjmacd
441146515Sru  while ((position = find_node_separator (&tmp_search)) != -1 )
44221495Sjmacd    {
443146515Sru      tmp_search.start = position;
444146515Sru      tmp_search.start += skip_node_separator (tmp_search.buffer
445146515Sru          + tmp_search.start);
44621495Sjmacd
447146515Sru      if (looking_at (TAGS_TABLE_BEG_LABEL, &tmp_search))
44842660Smarkm        return (position);
44921495Sjmacd    }
45021495Sjmacd  return (-1);
45121495Sjmacd}
45221495Sjmacd
45321495Sjmacd/* Return the absolute position of the node named NODENAME in BINDING.
45421495Sjmacd   This is a brute force search, and we wish to avoid it when possible.
45521495Sjmacd   This function is called when a tag (indirect or otherwise) doesn't
45621495Sjmacd   really point to the right node.  It returns the absolute position of
45721495Sjmacd   the separator preceding the node. */
45821495Sjmacdlong
459146515Srufind_node_in_binding (char *nodename, SEARCH_BINDING *binding)
46021495Sjmacd{
46142660Smarkm  long position;
46242660Smarkm  int offset, namelen;
463146515Sru  SEARCH_BINDING tmp_search;
46421495Sjmacd
46521495Sjmacd  namelen = strlen (nodename);
46621495Sjmacd
467146515Sru  tmp_search.buffer = binding->buffer;
468146515Sru  tmp_search.start = binding->start;
469146515Sru  tmp_search.end = binding->end;
470146515Sru  tmp_search.flags = 0;
47121495Sjmacd
472146515Sru  while ((position = find_node_separator (&tmp_search)) != -1)
47321495Sjmacd    {
474146515Sru      tmp_search.start = position;
475146515Sru      tmp_search.start += skip_node_separator
476146515Sru        (tmp_search.buffer + tmp_search.start);
47721495Sjmacd
478146515Sru      offset = string_in_line
479146515Sru        (INFO_NODE_LABEL, tmp_search.buffer + tmp_search.start);
48021495Sjmacd
48121495Sjmacd      if (offset == -1)
48242660Smarkm        continue;
48321495Sjmacd
484146515Sru      tmp_search.start += offset;
485146515Sru      tmp_search.start += skip_whitespace (tmp_search.buffer + tmp_search.start);
48621495Sjmacd      offset = skip_node_characters
487146515Sru        (tmp_search.buffer + tmp_search.start, DONT_SKIP_NEWLINES);
48821495Sjmacd
48921495Sjmacd      /* Notice that this is an exact match.  You cannot grovel through
49042660Smarkm         the buffer with this function looking for random nodes. */
49121495Sjmacd       if ((offset == namelen) &&
492146515Sru           (tmp_search.buffer[tmp_search.start] == nodename[0]) &&
493146515Sru           (strncmp (tmp_search.buffer + tmp_search.start, nodename, offset) == 0))
49442660Smarkm         return (position);
49521495Sjmacd    }
49621495Sjmacd  return (-1);
49721495Sjmacd}
498