1/* search.c -- searching large bodies of text.
2   $Id: search.c,v 1.3 2004/04/11 17:56:46 karl Exp $
3
4   Copyright (C) 1993, 1997, 1998, 2002, 2004 Free Software Foundation, Inc.
5
6   This program is free software; you can redistribute it and/or modify
7   it under the terms of the GNU General Public License as published by
8   the Free Software Foundation; either version 2, or (at your option)
9   any later version.
10
11   This program is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14   GNU General Public License for more details.
15
16   You should have received a copy of the GNU General Public License
17   along with this program; if not, write to the Free Software
18   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
20   Written by Brian Fox (bfox@ai.mit.edu). */
21
22#include "info.h"
23
24#include "search.h"
25#include "nodes.h"
26
27/* The search functions take two arguments:
28
29     1) a string to search for, and
30
31     2) a pointer to a SEARCH_BINDING which contains the buffer, start,
32        and end of the search.
33
34   They return a long, which is the offset from the start of the buffer
35   at which the match was found.  An offset of -1 indicates failure. */
36
37/* A function which makes a binding with buffer and bounds. */
38SEARCH_BINDING *
39make_binding (char *buffer, long int start, long int end)
40{
41  SEARCH_BINDING *binding;
42
43  binding = (SEARCH_BINDING *)xmalloc (sizeof (SEARCH_BINDING));
44  binding->buffer = buffer;
45  binding->start = start;
46  binding->end = end;
47  binding->flags = 0;
48
49  return (binding);
50}
51
52/* Make a copy of BINDING without duplicating the data. */
53SEARCH_BINDING *
54copy_binding (SEARCH_BINDING *binding)
55{
56  SEARCH_BINDING *copy;
57
58  copy = make_binding (binding->buffer, binding->start, binding->end);
59  copy->flags = binding->flags;
60  return (copy);
61}
62
63
64/* **************************************************************** */
65/*                                                                  */
66/*                 The Actual Searching Functions                   */
67/*                                                                  */
68/* **************************************************************** */
69
70/* Search forwards or backwards for the text delimited by BINDING.
71   The search is forwards if BINDING->start is greater than BINDING->end. */
72long
73search (char *string, SEARCH_BINDING *binding)
74{
75  long result;
76
77  /* If the search is backwards, then search backwards, otherwise forwards. */
78  if (binding->start > binding->end)
79    result = search_backward (string, binding);
80  else
81    result = search_forward (string, binding);
82
83  return (result);
84}
85
86/* Search forwards for STRING through the text delimited in BINDING. */
87long
88search_forward (char *string, SEARCH_BINDING *binding)
89{
90  register int c, i, len;
91  register char *buff, *end;
92  char *alternate = (char *)NULL;
93
94  len = strlen (string);
95
96  /* We match characters in the search buffer against STRING and ALTERNATE.
97     ALTERNATE is a case reversed version of STRING; this is cheaper than
98     case folding each character before comparison.   Alternate is only
99     used if the case folding bit is turned on in the passed BINDING. */
100
101  if (binding->flags & S_FoldCase)
102    {
103      alternate = xstrdup (string);
104
105      for (i = 0; i < len; i++)
106        {
107          if (islower (alternate[i]))
108            alternate[i] = toupper (alternate[i]);
109          else if (isupper (alternate[i]))
110            alternate[i] = tolower (alternate[i]);
111        }
112    }
113
114  buff = binding->buffer + binding->start;
115  end = binding->buffer + binding->end + 1;
116
117  while (buff < (end - len))
118    {
119      for (i = 0; i < len; i++)
120        {
121          c = buff[i];
122
123          if ((c != string[i]) && (!alternate || c != alternate[i]))
124            break;
125        }
126
127      if (!string[i])
128        {
129          if (alternate)
130            free (alternate);
131          if (binding->flags & S_SkipDest)
132            buff += len;
133          return ((long) (buff - binding->buffer));
134        }
135
136      buff++;
137    }
138
139  if (alternate)
140    free (alternate);
141
142  return ((long) -1);
143}
144
145/* Search for STRING backwards through the text delimited in BINDING. */
146long
147search_backward (char *input_string, SEARCH_BINDING *binding)
148{
149  register int c, i, len;
150  register char *buff, *end;
151  char *string;
152  char *alternate = (char *)NULL;
153
154  len = strlen (input_string);
155
156  /* Reverse the characters in the search string. */
157  string = (char *)xmalloc (1 + len);
158  for (c = 0, i = len - 1; input_string[c]; c++, i--)
159    string[i] = input_string[c];
160
161  string[c] = '\0';
162
163  /* We match characters in the search buffer against STRING and ALTERNATE.
164     ALTERNATE is a case reversed version of STRING; this is cheaper than
165     case folding each character before comparison.   ALTERNATE is only
166     used if the case folding bit is turned on in the passed BINDING. */
167
168  if (binding->flags & S_FoldCase)
169    {
170      alternate = xstrdup (string);
171
172      for (i = 0; i < len; i++)
173        {
174          if (islower (alternate[i]))
175            alternate[i] = toupper (alternate[i]);
176          else if (isupper (alternate[i]))
177            alternate[i] = tolower (alternate[i]);
178        }
179    }
180
181  buff = binding->buffer + binding->start - 1;
182  end = binding->buffer + binding->end;
183
184  while (buff > (end + len))
185    {
186      for (i = 0; i < len; i++)
187        {
188          c = *(buff - i);
189
190          if (c != string[i] && (!alternate || c != alternate[i]))
191            break;
192        }
193
194      if (!string[i])
195        {
196          free (string);
197          if (alternate)
198            free (alternate);
199
200          if (binding->flags & S_SkipDest)
201            buff -= len;
202          return ((long) (1 + (buff - binding->buffer)));
203        }
204
205      buff--;
206    }
207
208  free (string);
209  if (alternate)
210    free (alternate);
211
212  return ((long) -1);
213}
214
215/* Find STRING in LINE, returning the offset of the end of the string.
216   Return an offset of -1 if STRING does not appear in LINE.  The search
217   is bound by the end of the line (i.e., either NEWLINE or 0). */
218int
219string_in_line (char *string, char *line)
220{
221  register int end;
222  SEARCH_BINDING binding;
223
224  /* Find the end of the line. */
225  for (end = 0; line[end] && line[end] != '\n'; end++);
226
227  /* Search for STRING within these confines. */
228  binding.buffer = line;
229  binding.start = 0;
230  binding.end = end;
231  binding.flags = S_FoldCase | S_SkipDest;
232
233  return (search_forward (string, &binding));
234}
235
236/* Return non-zero if STRING is the first text to appear at BINDING. */
237int
238looking_at (char *string, SEARCH_BINDING *binding)
239{
240  long search_end;
241
242  search_end = search (string, binding);
243
244  /* If the string was not found, SEARCH_END is -1.  If the string was found,
245     but not right away, SEARCH_END is != binding->start.  Otherwise, the
246     string was found at binding->start. */
247  return (search_end == binding->start);
248}
249
250/* **************************************************************** */
251/*                                                                  */
252/*                    Small String Searches                         */
253/*                                                                  */
254/* **************************************************************** */
255
256/* Function names that start with "skip" are passed a string, and return
257   an offset from the start of that string.  Function names that start
258   with "find" are passed a SEARCH_BINDING, and return an absolute position
259   marker of the item being searched for.  "Find" functions return a value
260   of -1 if the item being looked for couldn't be found. */
261
262/* Return the index of the first non-whitespace character in STRING. */
263int
264skip_whitespace (char *string)
265{
266  register int i;
267
268  for (i = 0; string && whitespace (string[i]); i++);
269  return (i);
270}
271
272/* Return the index of the first non-whitespace or newline character in
273   STRING. */
274int
275skip_whitespace_and_newlines (char *string)
276{
277  register int i;
278
279  for (i = 0; string && whitespace_or_newline (string[i]); i++);
280  return (i);
281}
282
283/* Return the index of the first whitespace character in STRING. */
284int
285skip_non_whitespace (char *string)
286{
287  register int i;
288
289  for (i = 0; string && string[i] && !whitespace (string[i]); i++);
290  return (i);
291}
292
293/* Return the index of the first non-node character in STRING.  Note that
294   this function contains quite a bit of hair to ignore periods in some
295   special cases.  This is because we here at GNU ship some info files which
296   contain nodenames that contain periods.  No such nodename can start with
297   a period, or continue with whitespace, newline, or ')' immediately following
298   the period.  If second argument NEWLINES_OKAY is non-zero, newlines should
299   be skipped while parsing out the nodename specification. */
300int
301skip_node_characters (char *string, int newlines_okay)
302{
303  register int c, i = 0;
304  int paren_seen = 0;
305  int paren = 0;
306
307  /* Handle special case.  This is when another function has parsed out the
308     filename component of the node name, and we just want to parse out the
309     nodename proper.  In that case, a period at the start of the nodename
310     indicates an empty nodename. */
311  if (string && *string == '.')
312    return (0);
313
314  if (string && *string == '(')
315    {
316      paren++;
317      paren_seen++;
318      i++;
319    }
320
321  for (; string && (c = string[i]); i++)
322    {
323      if (paren)
324        {
325          if (c == '(')
326            paren++;
327          else if (c == ')')
328            paren--;
329
330          continue;
331        }
332
333      /* If the character following the close paren is a space or period,
334         then this node name has no more characters associated with it. */
335      if (c == '\t' ||
336          c == ','  ||
337          c == INFO_TAGSEP ||
338          ((!newlines_okay) && (c == '\n')) ||
339          ((paren_seen && string[i - 1] == ')') &&
340           (c == ' ' || c == '.')) ||
341          (c == '.' &&
342           (
343#if 0
344/* This test causes a node name ending in a period, like `This.', not to
345   be found.  The trailing . is stripped.  This occurs in the jargon
346   file (`I see no X here.' is a node name).  */
347           (!string[i + 1]) ||
348#endif
349            (whitespace_or_newline (string[i + 1])) ||
350            (string[i + 1] == ')'))))
351        break;
352    }
353  return (i);
354}
355
356
357/* **************************************************************** */
358/*                                                                  */
359/*                   Searching FILE_BUFFER's                        */
360/*                                                                  */
361/* **************************************************************** */
362
363/* Return the absolute position of the first occurence of a node separator in
364   BINDING-buffer.  The search starts at BINDING->start.  Return -1 if no node
365   separator was found. */
366long
367find_node_separator (SEARCH_BINDING *binding)
368{
369  register long i;
370  char *body;
371
372  body = binding->buffer;
373
374  /* A node is started by [^L]^_[^L]\n.  That is to say, the C-l's are
375     optional, but the DELETE and NEWLINE are not.  This separator holds
376     true for all separated elements in an Info file, including the tags
377     table (if present) and the indirect tags table (if present). */
378  for (i = binding->start; i < binding->end - 1; i++)
379    if (((body[i] == INFO_FF && body[i + 1] == INFO_COOKIE) &&
380         (body[i + 2] == '\n' ||
381          (body[i + 2] == INFO_FF && body[i + 3] == '\n'))) ||
382        ((body[i] == INFO_COOKIE) &&
383         (body[i + 1] == '\n' ||
384          (body[i + 1] == INFO_FF && body[i + 2] == '\n'))))
385      return (i);
386  return (-1);
387}
388
389/* Return the length of the node separator characters that BODY is
390   currently pointing at. */
391int
392skip_node_separator (char *body)
393{
394  register int i;
395
396  i = 0;
397
398  if (body[i] == INFO_FF)
399    i++;
400
401  if (body[i++] != INFO_COOKIE)
402    return (0);
403
404  if (body[i] == INFO_FF)
405    i++;
406
407  if (body[i++] != '\n')
408    return (0);
409
410  return (i);
411}
412
413/* Return the number of characters from STRING to the start of
414   the next line. */
415int
416skip_line (char *string)
417{
418  register int i;
419
420  for (i = 0; string && string[i] && string[i] != '\n'; i++);
421
422  if (string[i] == '\n')
423    i++;
424
425  return (i);
426}
427
428/* Return the absolute position of the beginning of a tags table in this
429   binding starting the search at binding->start. */
430long
431find_tags_table (SEARCH_BINDING *binding)
432{
433  SEARCH_BINDING tmp_search;
434  long position;
435
436  tmp_search.buffer = binding->buffer;
437  tmp_search.start = binding->start;
438  tmp_search.end = binding->end;
439  tmp_search.flags = S_FoldCase;
440
441  while ((position = find_node_separator (&tmp_search)) != -1 )
442    {
443      tmp_search.start = position;
444      tmp_search.start += skip_node_separator (tmp_search.buffer
445          + tmp_search.start);
446
447      if (looking_at (TAGS_TABLE_BEG_LABEL, &tmp_search))
448        return (position);
449    }
450  return (-1);
451}
452
453/* Return the absolute position of the node named NODENAME in BINDING.
454   This is a brute force search, and we wish to avoid it when possible.
455   This function is called when a tag (indirect or otherwise) doesn't
456   really point to the right node.  It returns the absolute position of
457   the separator preceding the node. */
458long
459find_node_in_binding (char *nodename, SEARCH_BINDING *binding)
460{
461  long position;
462  int offset, namelen;
463  SEARCH_BINDING tmp_search;
464
465  namelen = strlen (nodename);
466
467  tmp_search.buffer = binding->buffer;
468  tmp_search.start = binding->start;
469  tmp_search.end = binding->end;
470  tmp_search.flags = 0;
471
472  while ((position = find_node_separator (&tmp_search)) != -1)
473    {
474      tmp_search.start = position;
475      tmp_search.start += skip_node_separator
476        (tmp_search.buffer + tmp_search.start);
477
478      offset = string_in_line
479        (INFO_NODE_LABEL, tmp_search.buffer + tmp_search.start);
480
481      if (offset == -1)
482        continue;
483
484      tmp_search.start += offset;
485      tmp_search.start += skip_whitespace (tmp_search.buffer + tmp_search.start);
486      offset = skip_node_characters
487        (tmp_search.buffer + tmp_search.start, DONT_SKIP_NEWLINES);
488
489      /* Notice that this is an exact match.  You cannot grovel through
490         the buffer with this function looking for random nodes. */
491       if ((offset == namelen) &&
492           (tmp_search.buffer[tmp_search.start] == nodename[0]) &&
493           (strncmp (tmp_search.buffer + tmp_search.start, nodename, offset) == 0))
494         return (position);
495    }
496  return (-1);
497}
498