1// -*- C++ -*- 2/* Copyright (C) 1989, 1990, 1991, 1992, 2001 Free Software Foundation, Inc. 3 Written by James Clark (jjc@jclark.com) 4 5This file is part of groff. 6 7groff is free software; you can redistribute it and/or modify it under 8the terms of the GNU General Public License as published by the Free 9Software Foundation; either version 2, or (at your option) any later 10version. 11 12groff is distributed in the hope that it will be useful, but WITHOUT ANY 13WARRANTY; without even the implied warranty of MERCHANTABILITY or 14FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 15for more details. 16 17You should have received a copy of the GNU General Public License along 18with groff; see the file COPYING. If not, write to the Free Software 19Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */ 20 21#include "refer.h" 22#include "token.h" 23 24#define TOKEN_TABLE_SIZE 1009 25// I believe in Icelandic thorn sorts after z. 26#define THORN_SORT_KEY "{" 27 28struct token_table_entry { 29 const char *tok; 30 token_info ti; 31 token_table_entry(); 32}; 33 34token_table_entry token_table[TOKEN_TABLE_SIZE]; 35int ntokens = 0; 36 37static void skip_name(const char **ptr, const char *end) 38{ 39 if (*ptr < end) { 40 switch (*(*ptr)++) { 41 case '(': 42 if (*ptr < end) { 43 *ptr += 1; 44 if (*ptr < end) 45 *ptr += 1; 46 } 47 break; 48 case '[': 49 while (*ptr < end) 50 if (*(*ptr)++ == ']') 51 break; 52 break; 53 } 54 } 55} 56 57int get_token(const char **ptr, const char *end) 58{ 59 if (*ptr >= end) 60 return 0; 61 char c = *(*ptr)++; 62 if (c == '\\' && *ptr < end) { 63 switch (**ptr) { 64 default: 65 *ptr += 1; 66 break; 67 case '(': 68 case '[': 69 skip_name(ptr, end); 70 break; 71 case '*': 72 case 'f': 73 *ptr += 1; 74 skip_name(ptr, end); 75 break; 76 } 77 } 78 return 1; 79} 80 81token_info::token_info() 82: type(TOKEN_OTHER), sort_key(0), other_case(0) 83{ 84} 85 86void token_info::set(token_type t, const char *sk, const char *oc) 87{ 88 assert(oc == 0 || t == TOKEN_UPPER || t == TOKEN_LOWER); 89 type = t; 90 sort_key = sk; 91 other_case = oc; 92} 93 94void token_info::sortify(const char *start, const char *end, string &result) 95 const 96{ 97 if (sort_key) 98 result += sort_key; 99 else if (type == TOKEN_UPPER || type == TOKEN_LOWER) { 100 for (; start < end; start++) 101 if (csalpha(*start)) 102 result += cmlower(*start); 103 } 104} 105 106int token_info::sortify_non_empty(const char *start, const char *end) const 107{ 108 if (sort_key) 109 return *sort_key != '\0'; 110 if (type != TOKEN_UPPER && type != TOKEN_LOWER) 111 return 0; 112 for (; start < end; start++) 113 if (csalpha(*start)) 114 return 1; 115 return 0; 116} 117 118 119void token_info::lower_case(const char *start, const char *end, 120 string &result) const 121{ 122 if (type != TOKEN_UPPER) { 123 while (start < end) 124 result += *start++; 125 } 126 else if (other_case) 127 result += other_case; 128 else { 129 while (start < end) 130 result += cmlower(*start++); 131 } 132} 133 134void token_info::upper_case(const char *start, const char *end, 135 string &result) const 136{ 137 if (type != TOKEN_LOWER) { 138 while (start < end) 139 result += *start++; 140 } 141 else if (other_case) 142 result += other_case; 143 else { 144 while (start < end) 145 result += cmupper(*start++); 146 } 147} 148 149token_table_entry::token_table_entry() 150: tok(0) 151{ 152} 153 154static void store_token(const char *tok, token_type typ, 155 const char *sk = 0, const char *oc = 0) 156{ 157 unsigned n = hash_string(tok, strlen(tok)) % TOKEN_TABLE_SIZE; 158 for (;;) { 159 if (token_table[n].tok == 0) { 160 if (++ntokens == TOKEN_TABLE_SIZE) 161 assert(0); 162 token_table[n].tok = tok; 163 break; 164 } 165 if (strcmp(tok, token_table[n].tok) == 0) 166 break; 167 if (n == 0) 168 n = TOKEN_TABLE_SIZE - 1; 169 else 170 --n; 171 } 172 token_table[n].ti.set(typ, sk, oc); 173} 174 175 176token_info default_token_info; 177 178const token_info *lookup_token(const char *start, const char *end) 179{ 180 unsigned n = hash_string(start, end - start) % TOKEN_TABLE_SIZE; 181 for (;;) { 182 if (token_table[n].tok == 0) 183 break; 184 if (strlen(token_table[n].tok) == size_t(end - start) 185 && memcmp(token_table[n].tok, start, end - start) == 0) 186 return &(token_table[n].ti); 187 if (n == 0) 188 n = TOKEN_TABLE_SIZE - 1; 189 else 190 --n; 191 } 192 return &default_token_info; 193} 194 195static void init_ascii() 196{ 197 const char *p; 198 for (p = "abcdefghijklmnopqrstuvwxyz"; *p; p++) { 199 char buf[2]; 200 buf[0] = *p; 201 buf[1] = '\0'; 202 store_token(strsave(buf), TOKEN_LOWER); 203 buf[0] = cmupper(buf[0]); 204 store_token(strsave(buf), TOKEN_UPPER); 205 } 206 for (p = "0123456789"; *p; p++) { 207 char buf[2]; 208 buf[0] = *p; 209 buf[1] = '\0'; 210 const char *s = strsave(buf); 211 store_token(s, TOKEN_OTHER, s); 212 } 213 for (p = ".,:;?!"; *p; p++) { 214 char buf[2]; 215 buf[0] = *p; 216 buf[1] = '\0'; 217 store_token(strsave(buf), TOKEN_PUNCT); 218 } 219 store_token("-", TOKEN_HYPHEN); 220} 221 222static void store_letter(const char *lower, const char *upper, 223 const char *sort_key = 0) 224{ 225 store_token(lower, TOKEN_LOWER, sort_key, upper); 226 store_token(upper, TOKEN_UPPER, sort_key, lower); 227} 228 229static void init_letter(unsigned char uc_code, unsigned char lc_code, 230 const char *sort_key) 231{ 232 char lbuf[2]; 233 lbuf[0] = lc_code; 234 lbuf[1] = 0; 235 char ubuf[2]; 236 ubuf[0] = uc_code; 237 ubuf[1] = 0; 238 store_letter(strsave(lbuf), strsave(ubuf), sort_key); 239} 240 241static void init_latin1() 242{ 243 init_letter(0xc0, 0xe0, "a"); 244 init_letter(0xc1, 0xe1, "a"); 245 init_letter(0xc2, 0xe2, "a"); 246 init_letter(0xc3, 0xe3, "a"); 247 init_letter(0xc4, 0xe4, "a"); 248 init_letter(0xc5, 0xe5, "a"); 249 init_letter(0xc6, 0xe6, "ae"); 250 init_letter(0xc7, 0xe7, "c"); 251 init_letter(0xc8, 0xe8, "e"); 252 init_letter(0xc9, 0xe9, "e"); 253 init_letter(0xca, 0xea, "e"); 254 init_letter(0xcb, 0xeb, "e"); 255 init_letter(0xcc, 0xec, "i"); 256 init_letter(0xcd, 0xed, "i"); 257 init_letter(0xce, 0xee, "i"); 258 init_letter(0xcf, 0xef, "i"); 259 260 init_letter(0xd0, 0xf0, "d"); 261 init_letter(0xd1, 0xf1, "n"); 262 init_letter(0xd2, 0xf2, "o"); 263 init_letter(0xd3, 0xf3, "o"); 264 init_letter(0xd4, 0xf4, "o"); 265 init_letter(0xd5, 0xf5, "o"); 266 init_letter(0xd6, 0xf6, "o"); 267 init_letter(0xd8, 0xf8, "o"); 268 init_letter(0xd9, 0xf9, "u"); 269 init_letter(0xda, 0xfa, "u"); 270 init_letter(0xdb, 0xfb, "u"); 271 init_letter(0xdc, 0xfc, "u"); 272 init_letter(0xdd, 0xfd, "y"); 273 init_letter(0xde, 0xfe, THORN_SORT_KEY); 274 275 store_token("\337", TOKEN_LOWER, "ss", "SS"); 276 store_token("\377", TOKEN_LOWER, "y", "Y"); 277} 278 279static void init_two_char_letter(char l1, char l2, char u1, char u2, 280 const char *sk = 0) 281{ 282 char buf[6]; 283 buf[0] = '\\'; 284 buf[1] = '('; 285 buf[2] = l1; 286 buf[3] = l2; 287 buf[4] = '\0'; 288 const char *p = strsave(buf); 289 buf[2] = u1; 290 buf[3] = u2; 291 store_letter(p, strsave(buf), sk); 292 buf[1] = '['; 293 buf[4] = ']'; 294 buf[5] = '\0'; 295 p = strsave(buf); 296 buf[2] = l1; 297 buf[3] = l2; 298 store_letter(strsave(buf), p, sk); 299 300} 301 302static void init_special_chars() 303{ 304 const char *p; 305 for (p = "':^`~"; *p; p++) 306 for (const char *q = "aeiouy"; *q; q++) { 307 // Use a variable to work around bug in gcc 2.0 308 char c = cmupper(*q); 309 init_two_char_letter(*p, *q, *p, c); 310 } 311 for (p = "/l/o~n,coeaeij"; *p; p += 2) { 312 // Use variables to work around bug in gcc 2.0 313 char c0 = cmupper(p[0]); 314 char c1 = cmupper(p[1]); 315 init_two_char_letter(p[0], p[1], c0, c1); 316 } 317 init_two_char_letter('v', 's', 'v', 'S', "s"); 318 init_two_char_letter('v', 'z', 'v', 'Z', "z"); 319 init_two_char_letter('o', 'a', 'o', 'A', "a"); 320 init_two_char_letter('T', 'p', 'T', 'P', THORN_SORT_KEY); 321 init_two_char_letter('-', 'd', '-', 'D'); 322 323 store_token("\\(ss", TOKEN_LOWER, 0, "SS"); 324 store_token("\\[ss]", TOKEN_LOWER, 0, "SS"); 325 326 store_token("\\(Sd", TOKEN_LOWER, "d", "\\(-D"); 327 store_token("\\[Sd]", TOKEN_LOWER, "d", "\\[-D]"); 328 store_token("\\(hy", TOKEN_HYPHEN); 329 store_token("\\[hy]", TOKEN_HYPHEN); 330 store_token("\\(en", TOKEN_RANGE_SEP); 331 store_token("\\[en]", TOKEN_RANGE_SEP); 332} 333 334static void init_strings() 335{ 336 char buf[6]; 337 buf[0] = '\\'; 338 buf[1] = '*'; 339 for (const char *p = "'`^^,:~v_o./;"; *p; p++) { 340 buf[2] = *p; 341 buf[3] = '\0'; 342 store_token(strsave(buf), TOKEN_ACCENT); 343 buf[2] = '['; 344 buf[3] = *p; 345 buf[4] = ']'; 346 buf[5] = '\0'; 347 store_token(strsave(buf), TOKEN_ACCENT); 348 } 349 350 // -ms special letters 351 store_letter("\\*(th", "\\*(Th", THORN_SORT_KEY); 352 store_letter("\\*[th]", "\\*[Th]", THORN_SORT_KEY); 353 store_letter("\\*(d-", "\\*(D-"); 354 store_letter("\\*[d-]", "\\*[D-]"); 355 store_letter("\\*(ae", "\\*(Ae", "ae"); 356 store_letter("\\*[ae]", "\\*[Ae]", "ae"); 357 store_letter("\\*(oe", "\\*(Oe", "oe"); 358 store_letter("\\*[oe]", "\\*[Oe]", "oe"); 359 360 store_token("\\*3", TOKEN_LOWER, "y", "Y"); 361 store_token("\\*8", TOKEN_LOWER, "ss", "SS"); 362 store_token("\\*q", TOKEN_LOWER, "o", "O"); 363} 364 365struct token_initer { 366 token_initer(); 367}; 368 369static token_initer the_token_initer; 370 371token_initer::token_initer() 372{ 373 init_ascii(); 374 init_latin1(); 375 init_special_chars(); 376 init_strings(); 377 default_token_info.set(TOKEN_OTHER); 378} 379