// -*- C++ -*-
/* Copyright (C) 1989, 1990, 1991, 1992, 2001 Free Software Foundation, Inc.
     Written by James Clark (jjc@jclark.com)

This file is part of groff.

groff is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2, or (at your option) any later
version.

groff is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License along
with groff; see the file COPYING.  If not, write to the Free Software
Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */

#include "refer.h"
#include "token.h"

#define TOKEN_TABLE_SIZE 1009
// I believe in Icelandic thorn sorts after z.
26114402Sru#define THORN_SORT_KEY "{" 27114402Sru 28114402Srustruct token_table_entry { 29114402Sru const char *tok; 30114402Sru token_info ti; 31114402Sru token_table_entry(); 32114402Sru}; 33114402Sru 34114402Srutoken_table_entry token_table[TOKEN_TABLE_SIZE]; 35114402Sruint ntokens = 0; 36114402Sru 37114402Srustatic void skip_name(const char **ptr, const char *end) 38114402Sru{ 39114402Sru if (*ptr < end) { 40114402Sru switch (*(*ptr)++) { 41114402Sru case '(': 42114402Sru if (*ptr < end) { 43114402Sru *ptr += 1; 44114402Sru if (*ptr < end) 45114402Sru *ptr += 1; 46114402Sru } 47114402Sru break; 48114402Sru case '[': 49114402Sru while (*ptr < end) 50114402Sru if (*(*ptr)++ == ']') 51114402Sru break; 52114402Sru break; 53114402Sru } 54114402Sru } 55114402Sru} 56114402Sru 57114402Sruint get_token(const char **ptr, const char *end) 58114402Sru{ 59114402Sru if (*ptr >= end) 60114402Sru return 0; 61114402Sru char c = *(*ptr)++; 62114402Sru if (c == '\\' && *ptr < end) { 63114402Sru switch (**ptr) { 64114402Sru default: 65114402Sru *ptr += 1; 66114402Sru break; 67114402Sru case '(': 68114402Sru case '[': 69114402Sru skip_name(ptr, end); 70114402Sru break; 71114402Sru case '*': 72114402Sru case 'f': 73114402Sru *ptr += 1; 74114402Sru skip_name(ptr, end); 75114402Sru break; 76114402Sru } 77114402Sru } 78114402Sru return 1; 79114402Sru} 80114402Sru 81114402Srutoken_info::token_info() 82114402Sru: type(TOKEN_OTHER), sort_key(0), other_case(0) 83114402Sru{ 84114402Sru} 85114402Sru 86114402Sruvoid token_info::set(token_type t, const char *sk, const char *oc) 87114402Sru{ 88114402Sru assert(oc == 0 || t == TOKEN_UPPER || t == TOKEN_LOWER); 89114402Sru type = t; 90114402Sru sort_key = sk; 91114402Sru other_case = oc; 92114402Sru} 93114402Sru 94114402Sruvoid token_info::sortify(const char *start, const char *end, string &result) 95114402Sru const 96114402Sru{ 97114402Sru if (sort_key) 98114402Sru result += sort_key; 99114402Sru else if (type == TOKEN_UPPER || type == 
TOKEN_LOWER) { 100114402Sru for (; start < end; start++) 101114402Sru if (csalpha(*start)) 102114402Sru result += cmlower(*start); 103114402Sru } 104114402Sru} 105114402Sru 106114402Sruint token_info::sortify_non_empty(const char *start, const char *end) const 107114402Sru{ 108114402Sru if (sort_key) 109114402Sru return *sort_key != '\0'; 110114402Sru if (type != TOKEN_UPPER && type != TOKEN_LOWER) 111114402Sru return 0; 112114402Sru for (; start < end; start++) 113114402Sru if (csalpha(*start)) 114114402Sru return 1; 115114402Sru return 0; 116114402Sru} 117114402Sru 118114402Sru 119114402Sruvoid token_info::lower_case(const char *start, const char *end, 120114402Sru string &result) const 121114402Sru{ 122114402Sru if (type != TOKEN_UPPER) { 123114402Sru while (start < end) 124114402Sru result += *start++; 125114402Sru } 126114402Sru else if (other_case) 127114402Sru result += other_case; 128114402Sru else { 129114402Sru while (start < end) 130114402Sru result += cmlower(*start++); 131114402Sru } 132114402Sru} 133114402Sru 134114402Sruvoid token_info::upper_case(const char *start, const char *end, 135114402Sru string &result) const 136114402Sru{ 137114402Sru if (type != TOKEN_LOWER) { 138114402Sru while (start < end) 139114402Sru result += *start++; 140114402Sru } 141114402Sru else if (other_case) 142114402Sru result += other_case; 143114402Sru else { 144114402Sru while (start < end) 145114402Sru result += cmupper(*start++); 146114402Sru } 147114402Sru} 148114402Sru 149114402Srutoken_table_entry::token_table_entry() 150114402Sru: tok(0) 151114402Sru{ 152114402Sru} 153114402Sru 154114402Srustatic void store_token(const char *tok, token_type typ, 155114402Sru const char *sk = 0, const char *oc = 0) 156114402Sru{ 157114402Sru unsigned n = hash_string(tok, strlen(tok)) % TOKEN_TABLE_SIZE; 158114402Sru for (;;) { 159114402Sru if (token_table[n].tok == 0) { 160114402Sru if (++ntokens == TOKEN_TABLE_SIZE) 161114402Sru assert(0); 162114402Sru token_table[n].tok = tok; 
163114402Sru break; 164114402Sru } 165114402Sru if (strcmp(tok, token_table[n].tok) == 0) 166114402Sru break; 167114402Sru if (n == 0) 168114402Sru n = TOKEN_TABLE_SIZE - 1; 169114402Sru else 170114402Sru --n; 171114402Sru } 172114402Sru token_table[n].ti.set(typ, sk, oc); 173114402Sru} 174114402Sru 175114402Sru 176114402Srutoken_info default_token_info; 177114402Sru 178114402Sruconst token_info *lookup_token(const char *start, const char *end) 179114402Sru{ 180114402Sru unsigned n = hash_string(start, end - start) % TOKEN_TABLE_SIZE; 181114402Sru for (;;) { 182114402Sru if (token_table[n].tok == 0) 183114402Sru break; 184114402Sru if (strlen(token_table[n].tok) == size_t(end - start) 185114402Sru && memcmp(token_table[n].tok, start, end - start) == 0) 186114402Sru return &(token_table[n].ti); 187114402Sru if (n == 0) 188114402Sru n = TOKEN_TABLE_SIZE - 1; 189114402Sru else 190114402Sru --n; 191114402Sru } 192114402Sru return &default_token_info; 193114402Sru} 194114402Sru 195114402Srustatic void init_ascii() 196114402Sru{ 197114402Sru const char *p; 198114402Sru for (p = "abcdefghijklmnopqrstuvwxyz"; *p; p++) { 199114402Sru char buf[2]; 200114402Sru buf[0] = *p; 201114402Sru buf[1] = '\0'; 202114402Sru store_token(strsave(buf), TOKEN_LOWER); 203114402Sru buf[0] = cmupper(buf[0]); 204114402Sru store_token(strsave(buf), TOKEN_UPPER); 205114402Sru } 206114402Sru for (p = "0123456789"; *p; p++) { 207114402Sru char buf[2]; 208114402Sru buf[0] = *p; 209114402Sru buf[1] = '\0'; 210114402Sru const char *s = strsave(buf); 211114402Sru store_token(s, TOKEN_OTHER, s); 212114402Sru } 213114402Sru for (p = ".,:;?!"; *p; p++) { 214114402Sru char buf[2]; 215114402Sru buf[0] = *p; 216114402Sru buf[1] = '\0'; 217114402Sru store_token(strsave(buf), TOKEN_PUNCT); 218114402Sru } 219114402Sru store_token("-", TOKEN_HYPHEN); 220114402Sru} 221114402Sru 222114402Srustatic void store_letter(const char *lower, const char *upper, 223114402Sru const char *sort_key = 0) 224114402Sru{ 
225114402Sru store_token(lower, TOKEN_LOWER, sort_key, upper); 226114402Sru store_token(upper, TOKEN_UPPER, sort_key, lower); 227114402Sru} 228114402Sru 229114402Srustatic void init_letter(unsigned char uc_code, unsigned char lc_code, 230114402Sru const char *sort_key) 231114402Sru{ 232114402Sru char lbuf[2]; 233114402Sru lbuf[0] = lc_code; 234114402Sru lbuf[1] = 0; 235114402Sru char ubuf[2]; 236114402Sru ubuf[0] = uc_code; 237114402Sru ubuf[1] = 0; 238114402Sru store_letter(strsave(lbuf), strsave(ubuf), sort_key); 239114402Sru} 240114402Sru 241114402Srustatic void init_latin1() 242114402Sru{ 243114402Sru init_letter(0xc0, 0xe0, "a"); 244114402Sru init_letter(0xc1, 0xe1, "a"); 245114402Sru init_letter(0xc2, 0xe2, "a"); 246114402Sru init_letter(0xc3, 0xe3, "a"); 247114402Sru init_letter(0xc4, 0xe4, "a"); 248114402Sru init_letter(0xc5, 0xe5, "a"); 249114402Sru init_letter(0xc6, 0xe6, "ae"); 250114402Sru init_letter(0xc7, 0xe7, "c"); 251114402Sru init_letter(0xc8, 0xe8, "e"); 252114402Sru init_letter(0xc9, 0xe9, "e"); 253114402Sru init_letter(0xca, 0xea, "e"); 254114402Sru init_letter(0xcb, 0xeb, "e"); 255114402Sru init_letter(0xcc, 0xec, "i"); 256114402Sru init_letter(0xcd, 0xed, "i"); 257114402Sru init_letter(0xce, 0xee, "i"); 258114402Sru init_letter(0xcf, 0xef, "i"); 259114402Sru 260114402Sru init_letter(0xd0, 0xf0, "d"); 261114402Sru init_letter(0xd1, 0xf1, "n"); 262114402Sru init_letter(0xd2, 0xf2, "o"); 263114402Sru init_letter(0xd3, 0xf3, "o"); 264114402Sru init_letter(0xd4, 0xf4, "o"); 265114402Sru init_letter(0xd5, 0xf5, "o"); 266114402Sru init_letter(0xd6, 0xf6, "o"); 267114402Sru init_letter(0xd8, 0xf8, "o"); 268114402Sru init_letter(0xd9, 0xf9, "u"); 269114402Sru init_letter(0xda, 0xfa, "u"); 270114402Sru init_letter(0xdb, 0xfb, "u"); 271114402Sru init_letter(0xdc, 0xfc, "u"); 272114402Sru init_letter(0xdd, 0xfd, "y"); 273114402Sru init_letter(0xde, 0xfe, THORN_SORT_KEY); 274114402Sru 275114402Sru store_token("\337", TOKEN_LOWER, "ss", "SS"); 276114402Sru 
store_token("\377", TOKEN_LOWER, "y", "Y"); 277114402Sru} 278114402Sru 279114402Srustatic void init_two_char_letter(char l1, char l2, char u1, char u2, 280114402Sru const char *sk = 0) 281114402Sru{ 282114402Sru char buf[6]; 283114402Sru buf[0] = '\\'; 284114402Sru buf[1] = '('; 285114402Sru buf[2] = l1; 286114402Sru buf[3] = l2; 287114402Sru buf[4] = '\0'; 288114402Sru const char *p = strsave(buf); 289114402Sru buf[2] = u1; 290114402Sru buf[3] = u2; 291114402Sru store_letter(p, strsave(buf), sk); 292114402Sru buf[1] = '['; 293114402Sru buf[4] = ']'; 294114402Sru buf[5] = '\0'; 295114402Sru p = strsave(buf); 296114402Sru buf[2] = l1; 297114402Sru buf[3] = l2; 298114402Sru store_letter(strsave(buf), p, sk); 299114402Sru 300114402Sru} 301114402Sru 302114402Srustatic void init_special_chars() 303114402Sru{ 304114402Sru const char *p; 305114402Sru for (p = "':^`~"; *p; p++) 306114402Sru for (const char *q = "aeiouy"; *q; q++) { 307114402Sru // Use a variable to work around bug in gcc 2.0 308114402Sru char c = cmupper(*q); 309114402Sru init_two_char_letter(*p, *q, *p, c); 310114402Sru } 311114402Sru for (p = "/l/o~n,coeaeij"; *p; p += 2) { 312114402Sru // Use variables to work around bug in gcc 2.0 313114402Sru char c0 = cmupper(p[0]); 314114402Sru char c1 = cmupper(p[1]); 315114402Sru init_two_char_letter(p[0], p[1], c0, c1); 316114402Sru } 317114402Sru init_two_char_letter('v', 's', 'v', 'S', "s"); 318114402Sru init_two_char_letter('v', 'z', 'v', 'Z', "z"); 319114402Sru init_two_char_letter('o', 'a', 'o', 'A', "a"); 320114402Sru init_two_char_letter('T', 'p', 'T', 'P', THORN_SORT_KEY); 321114402Sru init_two_char_letter('-', 'd', '-', 'D'); 322114402Sru 323114402Sru store_token("\\(ss", TOKEN_LOWER, 0, "SS"); 324114402Sru store_token("\\[ss]", TOKEN_LOWER, 0, "SS"); 325114402Sru 326114402Sru store_token("\\(Sd", TOKEN_LOWER, "d", "\\(-D"); 327114402Sru store_token("\\[Sd]", TOKEN_LOWER, "d", "\\[-D]"); 328114402Sru store_token("\\(hy", TOKEN_HYPHEN); 329114402Sru 
store_token("\\[hy]", TOKEN_HYPHEN); 330114402Sru store_token("\\(en", TOKEN_RANGE_SEP); 331114402Sru store_token("\\[en]", TOKEN_RANGE_SEP); 332114402Sru} 333114402Sru 334114402Srustatic void init_strings() 335114402Sru{ 336114402Sru char buf[6]; 337114402Sru buf[0] = '\\'; 338114402Sru buf[1] = '*'; 339114402Sru for (const char *p = "'`^^,:~v_o./;"; *p; p++) { 340114402Sru buf[2] = *p; 341114402Sru buf[3] = '\0'; 342114402Sru store_token(strsave(buf), TOKEN_ACCENT); 343114402Sru buf[2] = '['; 344114402Sru buf[3] = *p; 345114402Sru buf[4] = ']'; 346114402Sru buf[5] = '\0'; 347114402Sru store_token(strsave(buf), TOKEN_ACCENT); 348114402Sru } 349114402Sru 350114402Sru // -ms special letters 351114402Sru store_letter("\\*(th", "\\*(Th", THORN_SORT_KEY); 352114402Sru store_letter("\\*[th]", "\\*[Th]", THORN_SORT_KEY); 353114402Sru store_letter("\\*(d-", "\\*(D-"); 354114402Sru store_letter("\\*[d-]", "\\*[D-]"); 355114402Sru store_letter("\\*(ae", "\\*(Ae", "ae"); 356114402Sru store_letter("\\*[ae]", "\\*[Ae]", "ae"); 357114402Sru store_letter("\\*(oe", "\\*(Oe", "oe"); 358114402Sru store_letter("\\*[oe]", "\\*[Oe]", "oe"); 359114402Sru 360114402Sru store_token("\\*3", TOKEN_LOWER, "y", "Y"); 361114402Sru store_token("\\*8", TOKEN_LOWER, "ss", "SS"); 362114402Sru store_token("\\*q", TOKEN_LOWER, "o", "O"); 363114402Sru} 364114402Sru 365114402Srustruct token_initer { 366114402Sru token_initer(); 367114402Sru}; 368114402Sru 369114402Srustatic token_initer the_token_initer; 370114402Sru 371114402Srutoken_initer::token_initer() 372114402Sru{ 373114402Sru init_ascii(); 374114402Sru init_latin1(); 375114402Sru init_special_chars(); 376114402Sru init_strings(); 377114402Sru default_token_info.set(TOKEN_OTHER); 378114402Sru} 379