1114402Sru// -*- C++ -*-
2114402Sru/* Copyright (C) 1989, 1990, 1991, 1992, 2001 Free Software Foundation, Inc.
3114402Sru     Written by James Clark (jjc@jclark.com)
4114402Sru
5114402SruThis file is part of groff.
6114402Sru
7114402Srugroff is free software; you can redistribute it and/or modify it under
8114402Sruthe terms of the GNU General Public License as published by the Free
9114402SruSoftware Foundation; either version 2, or (at your option) any later
10114402Sruversion.
11114402Sru
12114402Srugroff is distributed in the hope that it will be useful, but WITHOUT ANY
13114402SruWARRANTY; without even the implied warranty of MERCHANTABILITY or
14114402SruFITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15114402Srufor more details.
16114402Sru
17114402SruYou should have received a copy of the GNU General Public License along
18114402Sruwith groff; see the file COPYING.  If not, write to the Free Software
19151497SruFoundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
20114402Sru
21114402Sru#include "refer.h"
22114402Sru#include "token.h"
23114402Sru
24114402Sru#define TOKEN_TABLE_SIZE 1009
// I believe that in Icelandic, thorn sorts after z; "{" is the ASCII
// character immediately following "z", so it is used as thorn's sort key.
26114402Sru#define THORN_SORT_KEY "{"
27114402Sru
28114402Srustruct token_table_entry {
29114402Sru  const char *tok;
30114402Sru  token_info ti;
31114402Sru  token_table_entry();
32114402Sru};
33114402Sru
34114402Srutoken_table_entry token_table[TOKEN_TABLE_SIZE];
35114402Sruint ntokens = 0;
36114402Sru
// Advance *ptr over a name, never moving it past `end'.
//
// One character is always consumed (if any is available).  If that
// character is '(' up to two further characters are consumed; if it is
// '[' everything up to and including the next ']' is consumed (or up to
// `end' when there is no ']').
static void skip_name(const char **ptr, const char *end)
{
  const char *cur = *ptr;
  if (cur >= end)
    return;
  const char introducer = *cur++;
  if (introducer == '(') {
    // Two-character form: take as many of the two as are present.
    for (int taken = 0; taken < 2 && cur < end; taken++)
      cur++;
  }
  else if (introducer == '[') {
    // Bracketed form: the closing ']' is consumed as well.
    while (cur < end && *cur++ != ']')
      ;
  }
  *ptr = cur;
}
56114402Sru
57114402Sruint get_token(const char **ptr, const char *end)
58114402Sru{
59114402Sru  if (*ptr >= end)
60114402Sru    return 0;
61114402Sru  char c = *(*ptr)++;
62114402Sru  if (c == '\\' && *ptr < end) {
63114402Sru    switch (**ptr) {
64114402Sru    default:
65114402Sru      *ptr += 1;
66114402Sru      break;
67114402Sru    case '(':
68114402Sru    case '[':
69114402Sru      skip_name(ptr, end);
70114402Sru      break;
71114402Sru    case '*':
72114402Sru    case 'f':
73114402Sru      *ptr += 1;
74114402Sru      skip_name(ptr, end);
75114402Sru      break;
76114402Sru    }
77114402Sru  }
78114402Sru  return 1;
79114402Sru}
80114402Sru
81114402Srutoken_info::token_info()
82114402Sru: type(TOKEN_OTHER), sort_key(0), other_case(0)
83114402Sru{
84114402Sru}
85114402Sru
86114402Sruvoid token_info::set(token_type t, const char *sk, const char *oc)
87114402Sru{
88114402Sru  assert(oc == 0 || t == TOKEN_UPPER || t == TOKEN_LOWER);
89114402Sru  type = t;
90114402Sru  sort_key = sk;
91114402Sru  other_case = oc;
92114402Sru}
93114402Sru
94114402Sruvoid token_info::sortify(const char *start, const char *end, string &result)
95114402Sru     const
96114402Sru{
97114402Sru  if (sort_key)
98114402Sru    result += sort_key;
99114402Sru  else if (type == TOKEN_UPPER || type == TOKEN_LOWER) {
100114402Sru    for (; start < end; start++)
101114402Sru      if (csalpha(*start))
102114402Sru	result += cmlower(*start);
103114402Sru  }
104114402Sru}
105114402Sru
106114402Sruint token_info::sortify_non_empty(const char *start, const char *end) const
107114402Sru{
108114402Sru  if (sort_key)
109114402Sru    return *sort_key != '\0';
110114402Sru  if (type != TOKEN_UPPER && type != TOKEN_LOWER)
111114402Sru    return 0;
112114402Sru  for (; start < end; start++)
113114402Sru    if (csalpha(*start))
114114402Sru      return 1;
115114402Sru  return 0;
116114402Sru}
117114402Sru
118114402Sru
119114402Sruvoid token_info::lower_case(const char *start, const char *end,
120114402Sru			    string &result) const
121114402Sru{
122114402Sru  if (type != TOKEN_UPPER) {
123114402Sru    while (start < end)
124114402Sru      result += *start++;
125114402Sru  }
126114402Sru  else if (other_case)
127114402Sru    result += other_case;
128114402Sru  else {
129114402Sru    while (start < end)
130114402Sru      result += cmlower(*start++);
131114402Sru  }
132114402Sru}
133114402Sru
134114402Sruvoid token_info::upper_case(const char *start, const char *end,
135114402Sru			    string &result) const
136114402Sru{
137114402Sru  if (type != TOKEN_LOWER) {
138114402Sru    while (start < end)
139114402Sru      result += *start++;
140114402Sru  }
141114402Sru  else if (other_case)
142114402Sru    result += other_case;
143114402Sru  else {
144114402Sru    while (start < end)
145114402Sru      result += cmupper(*start++);
146114402Sru  }
147114402Sru}
148114402Sru
149114402Srutoken_table_entry::token_table_entry()
150114402Sru: tok(0)
151114402Sru{
152114402Sru}
153114402Sru
154114402Srustatic void store_token(const char *tok, token_type typ,
155114402Sru			const char *sk = 0, const char *oc = 0)
156114402Sru{
157114402Sru  unsigned n = hash_string(tok, strlen(tok)) % TOKEN_TABLE_SIZE;
158114402Sru  for (;;) {
159114402Sru    if (token_table[n].tok == 0) {
160114402Sru      if (++ntokens == TOKEN_TABLE_SIZE)
161114402Sru	assert(0);
162114402Sru      token_table[n].tok = tok;
163114402Sru      break;
164114402Sru    }
165114402Sru    if (strcmp(tok, token_table[n].tok) == 0)
166114402Sru      break;
167114402Sru    if (n == 0)
168114402Sru      n = TOKEN_TABLE_SIZE - 1;
169114402Sru    else
170114402Sru      --n;
171114402Sru  }
172114402Sru  token_table[n].ti.set(typ, sk, oc);
173114402Sru}
174114402Sru
175114402Sru
176114402Srutoken_info default_token_info;
177114402Sru
178114402Sruconst token_info *lookup_token(const char *start, const char *end)
179114402Sru{
180114402Sru  unsigned n = hash_string(start, end - start) % TOKEN_TABLE_SIZE;
181114402Sru  for (;;) {
182114402Sru    if (token_table[n].tok == 0)
183114402Sru      break;
184114402Sru    if (strlen(token_table[n].tok) == size_t(end - start)
185114402Sru	&& memcmp(token_table[n].tok, start, end - start) == 0)
186114402Sru      return &(token_table[n].ti);
187114402Sru    if (n == 0)
188114402Sru      n = TOKEN_TABLE_SIZE - 1;
189114402Sru    else
190114402Sru      --n;
191114402Sru  }
192114402Sru  return &default_token_info;
193114402Sru}
194114402Sru
195114402Srustatic void init_ascii()
196114402Sru{
197114402Sru  const char *p;
198114402Sru  for (p = "abcdefghijklmnopqrstuvwxyz"; *p; p++) {
199114402Sru    char buf[2];
200114402Sru    buf[0] = *p;
201114402Sru    buf[1] = '\0';
202114402Sru    store_token(strsave(buf), TOKEN_LOWER);
203114402Sru    buf[0] = cmupper(buf[0]);
204114402Sru    store_token(strsave(buf), TOKEN_UPPER);
205114402Sru  }
206114402Sru  for (p = "0123456789"; *p; p++) {
207114402Sru    char buf[2];
208114402Sru    buf[0] = *p;
209114402Sru    buf[1] = '\0';
210114402Sru    const char *s = strsave(buf);
211114402Sru    store_token(s, TOKEN_OTHER, s);
212114402Sru  }
213114402Sru  for (p = ".,:;?!"; *p; p++) {
214114402Sru    char buf[2];
215114402Sru    buf[0] = *p;
216114402Sru    buf[1] = '\0';
217114402Sru    store_token(strsave(buf), TOKEN_PUNCT);
218114402Sru  }
219114402Sru  store_token("-", TOKEN_HYPHEN);
220114402Sru}
221114402Sru
222114402Srustatic void store_letter(const char *lower, const char *upper,
223114402Sru		  const char *sort_key = 0)
224114402Sru{
225114402Sru  store_token(lower, TOKEN_LOWER, sort_key, upper);
226114402Sru  store_token(upper, TOKEN_UPPER, sort_key, lower);
227114402Sru}
228114402Sru
229114402Srustatic void init_letter(unsigned char uc_code, unsigned char lc_code,
230114402Sru		 const char *sort_key)
231114402Sru{
232114402Sru  char lbuf[2];
233114402Sru  lbuf[0] = lc_code;
234114402Sru  lbuf[1] = 0;
235114402Sru  char ubuf[2];
236114402Sru  ubuf[0] = uc_code;
237114402Sru  ubuf[1] = 0;
238114402Sru  store_letter(strsave(lbuf), strsave(ubuf), sort_key);
239114402Sru}
240114402Sru
241114402Srustatic void init_latin1()
242114402Sru{
243114402Sru  init_letter(0xc0, 0xe0, "a");
244114402Sru  init_letter(0xc1, 0xe1, "a");
245114402Sru  init_letter(0xc2, 0xe2, "a");
246114402Sru  init_letter(0xc3, 0xe3, "a");
247114402Sru  init_letter(0xc4, 0xe4, "a");
248114402Sru  init_letter(0xc5, 0xe5, "a");
249114402Sru  init_letter(0xc6, 0xe6, "ae");
250114402Sru  init_letter(0xc7, 0xe7, "c");
251114402Sru  init_letter(0xc8, 0xe8, "e");
252114402Sru  init_letter(0xc9, 0xe9, "e");
253114402Sru  init_letter(0xca, 0xea, "e");
254114402Sru  init_letter(0xcb, 0xeb, "e");
255114402Sru  init_letter(0xcc, 0xec, "i");
256114402Sru  init_letter(0xcd, 0xed, "i");
257114402Sru  init_letter(0xce, 0xee, "i");
258114402Sru  init_letter(0xcf, 0xef, "i");
259114402Sru
260114402Sru  init_letter(0xd0, 0xf0, "d");
261114402Sru  init_letter(0xd1, 0xf1, "n");
262114402Sru  init_letter(0xd2, 0xf2, "o");
263114402Sru  init_letter(0xd3, 0xf3, "o");
264114402Sru  init_letter(0xd4, 0xf4, "o");
265114402Sru  init_letter(0xd5, 0xf5, "o");
266114402Sru  init_letter(0xd6, 0xf6, "o");
267114402Sru  init_letter(0xd8, 0xf8, "o");
268114402Sru  init_letter(0xd9, 0xf9, "u");
269114402Sru  init_letter(0xda, 0xfa, "u");
270114402Sru  init_letter(0xdb, 0xfb, "u");
271114402Sru  init_letter(0xdc, 0xfc, "u");
272114402Sru  init_letter(0xdd, 0xfd, "y");
273114402Sru  init_letter(0xde, 0xfe, THORN_SORT_KEY);
274114402Sru
275114402Sru  store_token("\337", TOKEN_LOWER, "ss", "SS");
276114402Sru  store_token("\377", TOKEN_LOWER, "y", "Y");
277114402Sru}
278114402Sru
279114402Srustatic void init_two_char_letter(char l1, char l2, char u1, char u2,
280114402Sru				 const char *sk = 0)
281114402Sru{
282114402Sru  char buf[6];
283114402Sru  buf[0] = '\\';
284114402Sru  buf[1] = '(';
285114402Sru  buf[2] = l1;
286114402Sru  buf[3] = l2;
287114402Sru  buf[4] = '\0';
288114402Sru  const char *p = strsave(buf);
289114402Sru  buf[2] = u1;
290114402Sru  buf[3] = u2;
291114402Sru  store_letter(p, strsave(buf), sk);
292114402Sru  buf[1] = '[';
293114402Sru  buf[4] = ']';
294114402Sru  buf[5] = '\0';
295114402Sru  p = strsave(buf);
296114402Sru  buf[2] = l1;
297114402Sru  buf[3] = l2;
298114402Sru  store_letter(strsave(buf), p, sk);
299114402Sru
300114402Sru}
301114402Sru
302114402Srustatic void init_special_chars()
303114402Sru{
304114402Sru  const char *p;
305114402Sru  for (p = "':^`~"; *p; p++)
306114402Sru    for (const char *q = "aeiouy"; *q; q++) {
307114402Sru      // Use a variable to work around bug in gcc 2.0
308114402Sru      char c = cmupper(*q);
309114402Sru      init_two_char_letter(*p, *q, *p, c);
310114402Sru    }
311114402Sru  for (p = "/l/o~n,coeaeij"; *p; p += 2) {
312114402Sru    // Use variables to work around bug in gcc 2.0
313114402Sru    char c0 = cmupper(p[0]);
314114402Sru    char c1 = cmupper(p[1]);
315114402Sru    init_two_char_letter(p[0], p[1], c0, c1);
316114402Sru  }
317114402Sru  init_two_char_letter('v', 's', 'v', 'S', "s");
318114402Sru  init_two_char_letter('v', 'z', 'v', 'Z', "z");
319114402Sru  init_two_char_letter('o', 'a', 'o', 'A', "a");
320114402Sru  init_two_char_letter('T', 'p', 'T', 'P', THORN_SORT_KEY);
321114402Sru  init_two_char_letter('-', 'd', '-', 'D');
322114402Sru
323114402Sru  store_token("\\(ss", TOKEN_LOWER, 0, "SS");
324114402Sru  store_token("\\[ss]", TOKEN_LOWER, 0, "SS");
325114402Sru
326114402Sru  store_token("\\(Sd", TOKEN_LOWER, "d", "\\(-D");
327114402Sru  store_token("\\[Sd]", TOKEN_LOWER, "d", "\\[-D]");
328114402Sru  store_token("\\(hy", TOKEN_HYPHEN);
329114402Sru  store_token("\\[hy]", TOKEN_HYPHEN);
330114402Sru  store_token("\\(en", TOKEN_RANGE_SEP);
331114402Sru  store_token("\\[en]", TOKEN_RANGE_SEP);
332114402Sru}
333114402Sru
334114402Srustatic void init_strings()
335114402Sru{
336114402Sru  char buf[6];
337114402Sru  buf[0] = '\\';
338114402Sru  buf[1] = '*';
339114402Sru  for (const char *p = "'`^^,:~v_o./;"; *p; p++) {
340114402Sru    buf[2] = *p;
341114402Sru    buf[3] = '\0';
342114402Sru    store_token(strsave(buf), TOKEN_ACCENT);
343114402Sru    buf[2] = '[';
344114402Sru    buf[3] = *p;
345114402Sru    buf[4] = ']';
346114402Sru    buf[5] = '\0';
347114402Sru    store_token(strsave(buf), TOKEN_ACCENT);
348114402Sru  }
349114402Sru
350114402Sru  // -ms special letters
351114402Sru  store_letter("\\*(th", "\\*(Th", THORN_SORT_KEY);
352114402Sru  store_letter("\\*[th]", "\\*[Th]", THORN_SORT_KEY);
353114402Sru  store_letter("\\*(d-", "\\*(D-");
354114402Sru  store_letter("\\*[d-]", "\\*[D-]");
355114402Sru  store_letter("\\*(ae", "\\*(Ae", "ae");
356114402Sru  store_letter("\\*[ae]", "\\*[Ae]", "ae");
357114402Sru  store_letter("\\*(oe", "\\*(Oe", "oe");
358114402Sru  store_letter("\\*[oe]", "\\*[Oe]", "oe");
359114402Sru
360114402Sru  store_token("\\*3", TOKEN_LOWER, "y", "Y");
361114402Sru  store_token("\\*8", TOKEN_LOWER, "ss", "SS");
362114402Sru  store_token("\\*q", TOKEN_LOWER, "o", "O");
363114402Sru}
364114402Sru
365114402Srustruct token_initer {
366114402Sru  token_initer();
367114402Sru};
368114402Sru
369114402Srustatic token_initer the_token_initer;
370114402Sru
371114402Srutoken_initer::token_initer()
372114402Sru{
373114402Sru  init_ascii();
374114402Sru  init_latin1();
375114402Sru  init_special_chars();
376114402Sru  init_strings();
377114402Sru  default_token_info.set(TOKEN_OTHER);
378114402Sru}
379