1/* search.c -- How to search large bodies of text. */
2
3/* This file is part of GNU Info, a program for reading online documentation
4   stored in Info format.
5
6   Copyright (C) 1993, 97 Free Software Foundation, Inc.
7
8   This program is free software; you can redistribute it and/or modify
9   it under the terms of the GNU General Public License as published by
10   the Free Software Foundation; either version 2, or (at your option)
11   any later version.
12
13   This program is distributed in the hope that it will be useful,
14   but WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16   GNU General Public License for more details.
17
18   You should have received a copy of the GNU General Public License
19   along with this program; if not, write to the Free Software
20   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21
22   Written by Brian Fox (bfox@ai.mit.edu). */
23
24#include "info.h"
25
26#include "search.h"
27#include "nodes.h"
28
29/* The search functions take two arguments:
30
31     1) a string to search for, and
32
33     2) a pointer to a SEARCH_BINDING which contains the buffer, start,
34        and end of the search.
35
36   They return a long, which is the offset from the start of the buffer
37   at which the match was found.  An offset of -1 indicates failure. */
38
39/* A function which makes a binding with buffer and bounds. */
40SEARCH_BINDING *
41make_binding (buffer, start, end)
42     char *buffer;
43     long start, end;
44{
45  SEARCH_BINDING *binding;
46
47  binding = (SEARCH_BINDING *)xmalloc (sizeof (SEARCH_BINDING));
48  binding->buffer = buffer;
49  binding->start = start;
50  binding->end = end;
51  binding->flags = 0;
52
53  return (binding);
54}
55
56/* Make a copy of BINDING without duplicating the data. */
57SEARCH_BINDING *
58copy_binding (binding)
59     SEARCH_BINDING *binding;
60{
61  SEARCH_BINDING *copy;
62
63  copy = make_binding (binding->buffer, binding->start, binding->end);
64  copy->flags = binding->flags;
65  return (copy);
66}
67
68
69/* **************************************************************** */
70/*                                                                  */
71/*                 The Actual Searching Functions                   */
72/*                                                                  */
73/* **************************************************************** */
74
75/* Search forwards or backwards for the text delimited by BINDING.
76   The search is forwards if BINDING->start is greater than BINDING->end. */
77long
78search (string, binding)
79     char *string;
80     SEARCH_BINDING *binding;
81{
82  long result;
83
84  /* If the search is backwards, then search backwards, otherwise forwards. */
85  if (binding->start > binding->end)
86    result = search_backward (string, binding);
87  else
88    result = search_forward (string, binding);
89
90  return (result);
91}
92
93/* Search forwards for STRING through the text delimited in BINDING. */
94long
95search_forward (string, binding)
96     char *string;
97     SEARCH_BINDING *binding;
98{
99  register int c, i, len;
100  register char *buff, *end;
101  char *alternate = (char *)NULL;
102
103  len = strlen (string);
104
105  /* We match characters in the search buffer against STRING and ALTERNATE.
106     ALTERNATE is a case reversed version of STRING; this is cheaper than
107     case folding each character before comparison.   Alternate is only
108     used if the case folding bit is turned on in the passed BINDING. */
109
110  if (binding->flags & S_FoldCase)
111    {
112      alternate = xstrdup (string);
113
114      for (i = 0; i < len; i++)
115        {
116          if (islower (alternate[i]))
117            alternate[i] = toupper (alternate[i]);
118          else if (isupper (alternate[i]))
119            alternate[i] = tolower (alternate[i]);
120        }
121    }
122
123  buff = binding->buffer + binding->start;
124  end = binding->buffer + binding->end + 1;
125
126  while (buff < (end - len))
127    {
128      for (i = 0; i < len; i++)
129        {
130          c = buff[i];
131
132          if ((c != string[i]) && (!alternate || c != alternate[i]))
133            break;
134        }
135
136      if (!string[i])
137        {
138          if (alternate)
139            free (alternate);
140          if (binding->flags & S_SkipDest)
141            buff += len;
142          return ((long) (buff - binding->buffer));
143        }
144
145      buff++;
146    }
147
148  if (alternate)
149    free (alternate);
150
151  return ((long) -1);
152}
153
154/* Search for STRING backwards through the text delimited in BINDING. */
155long
156search_backward (input_string, binding)
157     char *input_string;
158     SEARCH_BINDING *binding;
159{
160  register int c, i, len;
161  register char *buff, *end;
162  char *string;
163  char *alternate = (char *)NULL;
164
165  len = strlen (input_string);
166
167  /* Reverse the characters in the search string. */
168  string = (char *)xmalloc (1 + len);
169  for (c = 0, i = len - 1; input_string[c]; c++, i--)
170    string[i] = input_string[c];
171
172  string[c] = '\0';
173
174  /* We match characters in the search buffer against STRING and ALTERNATE.
175     ALTERNATE is a case reversed version of STRING; this is cheaper than
176     case folding each character before comparison.   ALTERNATE is only
177     used if the case folding bit is turned on in the passed BINDING. */
178
179  if (binding->flags & S_FoldCase)
180    {
181      alternate = xstrdup (string);
182
183      for (i = 0; i < len; i++)
184        {
185          if (islower (alternate[i]))
186            alternate[i] = toupper (alternate[i]);
187          else if (isupper (alternate[i]))
188            alternate[i] = tolower (alternate[i]);
189        }
190    }
191
192  buff = binding->buffer + binding->start - 1;
193  end = binding->buffer + binding->end;
194
195  while (buff > (end + len))
196    {
197      for (i = 0; i < len; i++)
198        {
199          c = *(buff - i);
200
201          if (c != string[i] && (alternate && c != alternate[i]))
202            break;
203        }
204
205      if (!string[i])
206        {
207          free (string);
208          if (alternate)
209            free (alternate);
210
211          if (binding->flags & S_SkipDest)
212            buff -= len;
213          return ((long) (1 + (buff - binding->buffer)));
214        }
215
216      buff--;
217    }
218
219  free (string);
220  if (alternate)
221    free (alternate);
222
223  return ((long) -1);
224}
225
226/* Find STRING in LINE, returning the offset of the end of the string.
227   Return an offset of -1 if STRING does not appear in LINE.  The search
228   is bound by the end of the line (i.e., either NEWLINE or 0). */
229int
230string_in_line (string, line)
231     char *string, *line;
232{
233  register int end;
234  SEARCH_BINDING binding;
235
236  /* Find the end of the line. */
237  for (end = 0; line[end] && line[end] != '\n'; end++);
238
239  /* Search for STRING within these confines. */
240  binding.buffer = line;
241  binding.start = 0;
242  binding.end = end;
243  binding.flags = S_FoldCase | S_SkipDest;
244
245  return (search_forward (string, &binding));
246}
247
248/* Return non-zero if STRING is the first text to appear at BINDING. */
249int
250looking_at (string, binding)
251     char *string;
252     SEARCH_BINDING *binding;
253{
254  long search_end;
255
256  search_end = search (string, binding);
257
258  /* If the string was not found, SEARCH_END is -1.  If the string was found,
259     but not right away, SEARCH_END is != binding->start.  Otherwise, the
260     string was found at binding->start. */
261  return (search_end == binding->start);
262}
263
264/* **************************************************************** */
265/*                                                                  */
266/*                    Small String Searches                         */
267/*                                                                  */
268/* **************************************************************** */
269
270/* Function names that start with "skip" are passed a string, and return
271   an offset from the start of that string.  Function names that start
272   with "find" are passed a SEARCH_BINDING, and return an absolute position
273   marker of the item being searched for.  "Find" functions return a value
274   of -1 if the item being looked for couldn't be found. */
275
276/* Return the index of the first non-whitespace character in STRING. */
277int
278skip_whitespace (string)
279     char *string;
280{
281  register int i;
282
283  for (i = 0; string && whitespace (string[i]); i++);
284  return (i);
285}
286
287/* Return the index of the first non-whitespace or newline character in
288   STRING. */
289int
290skip_whitespace_and_newlines (string)
291     char *string;
292{
293  register int i;
294
295  for (i = 0; string && (whitespace (string[i]) || string[i] == '\n'); i++);
296  return (i);
297}
298
299/* Return the index of the first whitespace character in STRING. */
300int
301skip_non_whitespace (string)
302     char *string;
303{
304  register int i;
305
306  for (i = 0; string && !whitespace (string[i]); i++);
307  return (i);
308}
309
310/* Return the index of the first non-node character in STRING.  Note that
311   this function contains quite a bit of hair to ignore periods in some
312   special cases.  This is because we here at GNU ship some info files which
313   contain nodenames that contain periods.  No such nodename can start with
314   a period, or continue with whitespace, newline, or ')' immediately following
315   the period.  If second argument NEWLINES_OKAY is non-zero, newlines should
316   be skipped while parsing out the nodename specification. */
317int
318skip_node_characters (string, newlines_okay)
319     char *string;
320     int newlines_okay;
321{
322  register int c, i = 0;
323  int paren_seen = 0;
324  int paren = 0;
325
326  /* Handle special case.  This is when another function has parsed out the
327     filename component of the node name, and we just want to parse out the
328     nodename proper.  In that case, a period at the start of the nodename
329     indicates an empty nodename. */
330  if (string && *string == '.')
331    return (0);
332
333  if (string && *string == '(')
334    {
335      paren++;
336      paren_seen++;
337      i++;
338    }
339
340  for (; string && (c = string[i]); i++)
341    {
342      if (paren)
343        {
344          if (c == '(')
345            paren++;
346          else if (c == ')')
347            paren--;
348
349          continue;
350        }
351
352      /* If the character following the close paren is a space or period,
353         then this node name has no more characters associated with it. */
354      if (c == '\t' ||
355          c == ','  ||
356          c == INFO_TAGSEP ||
357          ((!newlines_okay) && (c == '\n')) ||
358          ((paren_seen && string[i - 1] == ')') &&
359           (c == ' ' || c == '.')) ||
360          (c == '.' &&
361           (
362#if 0
363/* This test causes a node name ending in a period, like `This.', not to
364   be found.  The trailing . is stripped.  This occurs in the jargon
365   file (`I see no X here.' is a node name).  */
366           (!string[i + 1]) ||
367#endif
368            (whitespace_or_newline (string[i + 1])) ||
369            (string[i + 1] == ')'))))
370        break;
371    }
372  return (i);
373}
374
375
376/* **************************************************************** */
377/*                                                                  */
378/*                   Searching FILE_BUFFER's                        */
379/*                                                                  */
380/* **************************************************************** */
381
382/* Return the absolute position of the first occurence of a node separator in
383   BINDING-buffer.  The search starts at BINDING->start.  Return -1 if no node
384   separator was found. */
385long
386find_node_separator (binding)
387     SEARCH_BINDING *binding;
388{
389  register long i;
390  char *body;
391
392  body = binding->buffer;
393
394  /* A node is started by [^L]^_[^L]\n.  That is to say, the C-l's are
395     optional, but the DELETE and NEWLINE are not.  This separator holds
396     true for all separated elements in an Info file, including the tags
397     table (if present) and the indirect tags table (if present). */
398  for (i = binding->start; i < binding->end - 1; i++)
399    if (((body[i] == INFO_FF && body[i + 1] == INFO_COOKIE) &&
400         (body[i + 2] == '\n' ||
401          (body[i + 2] == INFO_FF && body[i + 3] == '\n'))) ||
402        ((body[i] == INFO_COOKIE) &&
403         (body[i + 1] == '\n' ||
404          (body[i + 1] == INFO_FF && body[i + 2] == '\n'))))
405      return (i);
406  return (-1);
407}
408
409/* Return the length of the node separator characters that BODY is
410   currently pointing at. */
411int
412skip_node_separator (body)
413     char *body;
414{
415  register int i;
416
417  i = 0;
418
419  if (body[i] == INFO_FF)
420    i++;
421
422  if (body[i++] != INFO_COOKIE)
423    return (0);
424
425  if (body[i] == INFO_FF)
426    i++;
427
428  if (body[i++] != '\n')
429    return (0);
430
431  return (i);
432}
433
434/* Return the number of characters from STRING to the start of
435   the next line. */
436int
437skip_line (string)
438     char *string;
439{
440  register int i;
441
442  for (i = 0; string && string[i] && string[i] != '\n'; i++);
443
444  if (string[i] == '\n')
445    i++;
446
447  return (i);
448}
449
450/* Return the absolute position of the beginning of a tags table in this
451   binding starting the search at binding->start. */
452long
453find_tags_table (binding)
454     SEARCH_BINDING *binding;
455{
456  SEARCH_BINDING search;
457  long position;
458
459  search.buffer = binding->buffer;
460  search.start = binding->start;
461  search.end = binding->end;
462  search.flags = S_FoldCase;
463
464  while ((position = find_node_separator (&search)) != -1 )
465    {
466      search.start = position;
467      search.start += skip_node_separator (search.buffer + search.start);
468
469      if (looking_at (TAGS_TABLE_BEG_LABEL, &search))
470        return (position);
471    }
472  return (-1);
473}
474
475/* Return the absolute position of the node named NODENAME in BINDING.
476   This is a brute force search, and we wish to avoid it when possible.
477   This function is called when a tag (indirect or otherwise) doesn't
478   really point to the right node.  It returns the absolute position of
479   the separator preceding the node. */
480long
481find_node_in_binding (nodename, binding)
482     char *nodename;
483     SEARCH_BINDING *binding;
484{
485  long position;
486  int offset, namelen;
487  SEARCH_BINDING search;
488
489  namelen = strlen (nodename);
490
491  search.buffer = binding->buffer;
492  search.start = binding->start;
493  search.end = binding->end;
494  search.flags = 0;
495
496  while ((position = find_node_separator (&search)) != -1)
497    {
498      search.start = position;
499      search.start += skip_node_separator (search.buffer + search.start);
500
501      offset = string_in_line (INFO_NODE_LABEL, search.buffer + search.start);
502
503      if (offset == -1)
504        continue;
505
506      search.start += offset;
507      search.start += skip_whitespace (search.buffer + search.start);
508      offset = skip_node_characters
509        (search.buffer + search.start, DONT_SKIP_NEWLINES);
510
511      /* Notice that this is an exact match.  You cannot grovel through
512         the buffer with this function looking for random nodes. */
513       if ((offset == namelen) &&
514           (search.buffer[search.start] == nodename[0]) &&
515           (strncmp (search.buffer + search.start, nodename, offset) == 0))
516         return (position);
517    }
518  return (-1);
519}
520