1114402Sru// -*- C++ -*- 2151497Sru/* Copyright (C) 1989, 1990, 1991, 1992, 2001, 2003 3151497Sru Free Software Foundation, Inc. 4114402SruWritten by James Clark (jjc@jclark.com) 5114402Sru 6114402SruThis file is part of groff. 7114402Sru 8114402Srugroff is free software; you can redistribute it and/or modify it under 9114402Sruthe terms of the GNU General Public License as published by the Free 10114402SruSoftware Foundation; either version 2, or (at your option) any later 11114402Sruversion. 12114402Sru 13114402Srugroff is distributed in the hope that it will be useful, but WITHOUT ANY 14114402SruWARRANTY; without even the implied warranty of MERCHANTABILITY or 15114402SruFITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 16114402Srufor more details. 17114402Sru 18114402SruYou should have received a copy of the GNU General Public License along 19114402Sruwith groff; see the file COPYING. If not, write to the Free Software 20151497SruFoundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */ 21114402Sru 22114402Sru#include "refer.h" 23114402Sru#include "refid.h" 24114402Sru#include "ref.h" 25114402Sru#include "token.h" 26114402Sru 27114402Srustatic const char *find_day(const char *, const char *, const char **); 28114402Srustatic int find_month(const char *start, const char *end); 29114402Srustatic void abbreviate_names(string &); 30114402Sru 31114402Sru#define DEFAULT_ARTICLES "the\000a\000an" 32114402Sru 33114402Srustring articles(DEFAULT_ARTICLES, sizeof(DEFAULT_ARTICLES)); 34114402Sru 35114402Sru// Multiple occurrences of fields are separated by FIELD_SEPARATOR. 36114402Sruconst char FIELD_SEPARATOR = '\0'; 37114402Sru 38114402Sruconst char MULTI_FIELD_NAMES[] = "AE"; 39114402Sruconst char *AUTHOR_FIELDS = "AQ"; 40114402Sru 41114402Sruenum { OTHER, JOURNAL_ARTICLE, BOOK, ARTICLE_IN_BOOK, TECH_REPORT, BELL_TM }; 42114402Sru 43114402Sruconst char *reference_types[] = { 44114402Sru "other", 45114402Sru "journal-article", 46114402Sru "book", 47114402Sru "article-in-book", 48114402Sru "tech-report", 49114402Sru "bell-tm", 50114402Sru}; 51114402Sru 52114402Srustatic string temp_fields[256]; 53114402Sru 54114402Srureference::reference(const char *start, int len, reference_id *ridp) 55114402Sru: h(0), merged(0), no(-1), field(0), nfields(0), label_ptr(0), 56114402Sru computed_authors(0), last_needed_author(-1), nauthors(-1) 57114402Sru{ 58114402Sru int i; 59114402Sru for (i = 0; i < 256; i++) 60114402Sru field_index[i] = NULL_FIELD_INDEX; 61114402Sru if (ridp) 62114402Sru rid = *ridp; 63114402Sru if (start == 0) 64114402Sru return; 65114402Sru if (len <= 0) 66114402Sru return; 67114402Sru const char *end = start + len; 68114402Sru const char *ptr = start; 69114402Sru assert(*ptr == '%'); 70114402Sru while (ptr < end) { 71114402Sru if (ptr + 1 < end && ptr[1] != '\0' 72114402Sru && ((ptr[1] != '%' && ptr[1] == annotation_field) 73114402Sru || (ptr + 2 < end && ptr[1] == '%' && ptr[2] != '\0' 74114402Sru && discard_fields.search(ptr[2]) < 0))) { 75114402Sru if (ptr[1] == '%') 76114402Sru ptr++; 77114402Sru string &f = temp_fields[(unsigned char)ptr[1]]; 78114402Sru ptr += 2; 79114402Sru while (ptr < end && csspace(*ptr)) 80114402Sru ptr++; 81114402Sru for (;;) { 82114402Sru for (;;) { 83114402Sru if (ptr >= end) { 84114402Sru f += '\n'; 85114402Sru break; 86114402Sru } 87114402Sru f += *ptr; 88114402Sru if (*ptr++ == '\n') 89114402Sru break; 90114402Sru } 91114402Sru if (ptr >= end || *ptr == '%') 92114402Sru break; 93114402Sru } 94114402Sru } 95114402Sru else if (ptr + 1 < end && ptr[1] != '\0' && ptr[1] != '%' 96114402Sru && discard_fields.search(ptr[1]) < 0) { 97114402Sru string &f = temp_fields[(unsigned char)ptr[1]]; 98114402Sru if (f.length() > 0) { 99114402Sru if (strchr(MULTI_FIELD_NAMES, ptr[1]) != 0) 100114402Sru f += FIELD_SEPARATOR; 101114402Sru else 102114402Sru f.clear(); 103114402Sru } 104114402Sru ptr += 2; 105114402Sru if (ptr < end) { 106114402Sru if (*ptr == ' ') 107114402Sru ptr++; 108114402Sru for (;;) { 109114402Sru const char *p = ptr; 110114402Sru while (ptr < end && *ptr != '\n') 111114402Sru ptr++; 112114402Sru // strip trailing white space 113114402Sru const char *q = ptr; 114114402Sru while (q > p && q[-1] != '\n' && csspace(q[-1])) 115114402Sru q--; 116114402Sru while (p < q) 117114402Sru f += *p++; 118114402Sru if (ptr >= end) 119114402Sru break; 120114402Sru ptr++; 121114402Sru if (ptr >= end) 122114402Sru break; 123114402Sru if (*ptr == '%') 124114402Sru break; 125114402Sru f += ' '; 126114402Sru } 127114402Sru } 128114402Sru } 129114402Sru else { 130114402Sru // skip this field 131114402Sru for (;;) { 132114402Sru while (ptr < end && *ptr++ != '\n') 133114402Sru ; 134114402Sru if (ptr >= end || *ptr == '%') 135114402Sru break; 136114402Sru } 137114402Sru } 138114402Sru } 139114402Sru for (i = 0; i < 256; i++) 140114402Sru if (temp_fields[i].length() > 0) 141114402Sru nfields++; 142114402Sru field = new string[nfields]; 143114402Sru int j = 0; 144114402Sru for (i = 0; i < 256; i++) 145114402Sru if (temp_fields[i].length() > 0) { 146114402Sru field[j].move(temp_fields[i]); 147114402Sru if (abbreviate_fields.search(i) >= 0) 148114402Sru abbreviate_names(field[j]); 149114402Sru field_index[i] = j; 150114402Sru j++; 151114402Sru } 152114402Sru} 153114402Sru 154114402Srureference::~reference() 155114402Sru{ 156114402Sru if (nfields > 0) 157114402Sru ad_delete(nfields) field; 158114402Sru} 159114402Sru 160114402Sru// ref is the inline, this is the database ref 161114402Sru 162114402Sruvoid reference::merge(reference &ref) 163114402Sru{ 164114402Sru int i; 165114402Sru for (i = 0; i < 256; i++) 166114402Sru if (field_index[i] != NULL_FIELD_INDEX) 167114402Sru temp_fields[i].move(field[field_index[i]]); 168114402Sru for (i = 0; i < 256; i++) 169114402Sru if (ref.field_index[i] != NULL_FIELD_INDEX) 170114402Sru temp_fields[i].move(ref.field[ref.field_index[i]]); 171114402Sru for (i = 0; i < 256; i++) 172114402Sru field_index[i] = NULL_FIELD_INDEX; 173114402Sru int old_nfields = nfields; 174114402Sru nfields = 0; 175114402Sru for (i = 0; i < 256; i++) 176114402Sru if (temp_fields[i].length() > 0) 177114402Sru nfields++; 178114402Sru if (nfields != old_nfields) { 179114402Sru if (old_nfields > 0) 180114402Sru ad_delete(old_nfields) field; 181114402Sru field = new string[nfields]; 182114402Sru } 183114402Sru int j = 0; 184114402Sru for (i = 0; i < 256; i++) 185114402Sru if (temp_fields[i].length() > 0) { 186114402Sru field[j].move(temp_fields[i]); 187114402Sru field_index[i] = j; 188114402Sru j++; 189114402Sru } 190114402Sru merged = 1; 191114402Sru} 192114402Sru 193114402Sruvoid reference::insert_field(unsigned char c, string &s) 194114402Sru{ 195114402Sru assert(s.length() > 0); 196114402Sru if (field_index[c] != NULL_FIELD_INDEX) { 197114402Sru field[field_index[c]].move(s); 198114402Sru return; 199114402Sru } 200114402Sru assert(field_index[c] == NULL_FIELD_INDEX); 201114402Sru string *old_field = field; 202114402Sru field = new string[nfields + 1]; 203114402Sru int pos = 0; 204114402Sru int i; 205114402Sru for (i = 0; i < int(c); i++) 206114402Sru if (field_index[i] != NULL_FIELD_INDEX) 207114402Sru pos++; 208114402Sru for (i = 0; i < pos; i++) 209114402Sru field[i].move(old_field[i]); 210114402Sru field[pos].move(s); 211114402Sru for (i = pos; i < nfields; i++) 212114402Sru field[i + 1].move(old_field[i]); 213114402Sru if (nfields > 0) 214114402Sru ad_delete(nfields) old_field; 215114402Sru nfields++; 216114402Sru field_index[c] = pos; 217114402Sru for (i = c + 1; i < 256; i++) 218114402Sru if (field_index[i] != NULL_FIELD_INDEX) 219114402Sru field_index[i] += 1; 220114402Sru} 221114402Sru 222114402Sruvoid reference::delete_field(unsigned char c) 223114402Sru{ 224114402Sru if (field_index[c] == NULL_FIELD_INDEX) 225114402Sru return; 226114402Sru string *old_field = field; 227114402Sru field = new string[nfields - 1]; 228114402Sru int i; 229114402Sru for (i = 0; i < int(field_index[c]); i++) 230114402Sru field[i].move(old_field[i]); 231114402Sru for (i = field_index[c]; i < nfields - 1; i++) 232114402Sru field[i].move(old_field[i + 1]); 233114402Sru if (nfields > 0) 234114402Sru ad_delete(nfields) old_field; 235114402Sru nfields--; 236114402Sru field_index[c] = NULL_FIELD_INDEX; 237114402Sru for (i = c + 1; i < 256; i++) 238114402Sru if (field_index[i] != NULL_FIELD_INDEX) 239114402Sru field_index[i] -= 1; 240114402Sru} 241114402Sru 242114402Sruvoid reference::compute_hash_code() 243114402Sru{ 244114402Sru if (!rid.is_null()) 245114402Sru h = rid.hash(); 246114402Sru else { 247114402Sru h = 0; 248114402Sru for (int i = 0; i < nfields; i++) 249114402Sru if (field[i].length() > 0) { 250114402Sru h <<= 4; 251114402Sru h ^= hash_string(field[i].contents(), field[i].length()); 252114402Sru } 253114402Sru } 254114402Sru} 255114402Sru 256114402Sruvoid reference::set_number(int n) 257114402Sru{ 258114402Sru no = n; 259114402Sru} 260114402Sru 261114402Sruconst char SORT_SEP = '\001'; 262114402Sruconst char SORT_SUB_SEP = '\002'; 263114402Sruconst char SORT_SUB_SUB_SEP = '\003'; 264114402Sru 265114402Sru// sep specifies additional word separators 266114402Sru 267114402Sruvoid sortify_words(const char *s, const char *end, const char *sep, 268114402Sru string &result) 269114402Sru{ 270114402Sru int non_empty = 0; 271114402Sru int need_separator = 0; 272114402Sru for (;;) { 273114402Sru const char *token_start = s; 274114402Sru if (!get_token(&s, end)) 275114402Sru break; 276114402Sru if ((s - token_start == 1 277114402Sru && (*token_start == ' ' 278114402Sru || *token_start == '\n' 279114402Sru || (sep && *token_start != '\0' 280114402Sru && strchr(sep, *token_start) != 0))) 281114402Sru || (s - token_start == 2 282114402Sru && token_start[0] == '\\' && token_start[1] == ' ')) { 283114402Sru if (non_empty) 284114402Sru need_separator = 1; 285114402Sru } 286114402Sru else { 287114402Sru const token_info *ti = lookup_token(token_start, s); 288114402Sru if (ti->sortify_non_empty(token_start, s)) { 289114402Sru if (need_separator) { 290114402Sru result += ' '; 291114402Sru need_separator = 0; 292114402Sru } 293114402Sru ti->sortify(token_start, s, result); 294114402Sru non_empty = 1; 295114402Sru } 296114402Sru } 297114402Sru } 298114402Sru} 299114402Sru 300114402Sruvoid sortify_word(const char *s, const char *end, string &result) 301114402Sru{ 302114402Sru for (;;) { 303114402Sru const char *token_start = s; 304114402Sru if (!get_token(&s, end)) 305114402Sru break; 306114402Sru const token_info *ti = lookup_token(token_start, s); 307114402Sru ti->sortify(token_start, s, result); 308114402Sru } 309114402Sru} 310114402Sru 311114402Sruvoid sortify_other(const char *s, int len, string &key) 312114402Sru{ 313114402Sru sortify_words(s, s + len, 0, key); 314114402Sru} 315114402Sru 316114402Sruvoid sortify_title(const char *s, int len, string &key) 317114402Sru{ 318114402Sru const char *end = s + len; 319114402Sru for (; s < end && (*s == ' ' || *s == '\n'); s++) 320114402Sru ; 321114402Sru const char *ptr = s; 322114402Sru for (;;) { 323114402Sru const char *token_start = ptr; 324114402Sru if (!get_token(&ptr, end)) 325114402Sru break; 326114402Sru if (ptr - token_start == 1 327114402Sru && (*token_start == ' ' || *token_start == '\n')) 328114402Sru break; 329114402Sru } 330114402Sru if (ptr < end) { 331114402Sru unsigned int first_word_len = ptr - s - 1; 332114402Sru const char *ae = articles.contents() + articles.length(); 333114402Sru for (const char *a = articles.contents(); 334114402Sru a < ae; 335114402Sru a = strchr(a, '\0') + 1) 336114402Sru if (first_word_len == strlen(a)) { 337114402Sru unsigned int j; 338114402Sru for (j = 0; j < first_word_len; j++) 339114402Sru if (a[j] != cmlower(s[j])) 340114402Sru break; 341114402Sru if (j >= first_word_len) { 342114402Sru s = ptr; 343114402Sru for (; s < end && (*s == ' ' || *s == '\n'); s++) 344114402Sru ; 345114402Sru break; 346114402Sru } 347114402Sru } 348114402Sru } 349114402Sru sortify_words(s, end, 0, key); 350114402Sru} 351114402Sru 352114402Sruvoid sortify_name(const char *s, int len, string &key) 353114402Sru{ 354114402Sru const char *last_name_end; 355114402Sru const char *last_name = find_last_name(s, s + len, &last_name_end); 356114402Sru sortify_word(last_name, last_name_end, key); 357114402Sru key += SORT_SUB_SUB_SEP; 358114402Sru if (last_name > s) 359114402Sru sortify_words(s, last_name, ".", key); 360114402Sru key += SORT_SUB_SUB_SEP; 361114402Sru if (last_name_end < s + len) 362114402Sru sortify_words(last_name_end, s + len, ".,", key); 363114402Sru} 364114402Sru 365114402Sruvoid sortify_date(const char *s, int len, string &key) 366114402Sru{ 367114402Sru const char *year_end; 368114402Sru const char *year_start = find_year(s, s + len, &year_end); 369114402Sru if (!year_start) { 370114402Sru // Things without years are often `forthcoming', so it makes sense 371114402Sru // that they sort after things with explicit years. 372114402Sru key += 'A'; 373114402Sru sortify_words(s, s + len, 0, key); 374114402Sru return; 375114402Sru } 376114402Sru int n = year_end - year_start; 377114402Sru while (n < 4) { 378114402Sru key += '0'; 379114402Sru n++; 380114402Sru } 381114402Sru while (year_start < year_end) 382114402Sru key += *year_start++; 383114402Sru int m = find_month(s, s + len); 384114402Sru if (m < 0) 385114402Sru return; 386114402Sru key += 'A' + m; 387114402Sru const char *day_end; 388114402Sru const char *day_start = find_day(s, s + len, &day_end); 389114402Sru if (!day_start) 390114402Sru return; 391114402Sru if (day_end - day_start == 1) 392114402Sru key += '0'; 393114402Sru while (day_start < day_end) 394114402Sru key += *day_start++; 395114402Sru} 396114402Sru 397114402Sru// SORT_{SUB,SUB_SUB}_SEP can creep in from use of @ in label specification. 398114402Sru 399114402Sruvoid sortify_label(const char *s, int len, string &key) 400114402Sru{ 401114402Sru const char *end = s + len; 402114402Sru for (;;) { 403114402Sru const char *ptr; 404114402Sru for (ptr = s; 405114402Sru ptr < end && *ptr != SORT_SUB_SEP && *ptr != SORT_SUB_SUB_SEP; 406114402Sru ptr++) 407114402Sru ; 408114402Sru if (ptr > s) 409114402Sru sortify_words(s, ptr, 0, key); 410114402Sru s = ptr; 411114402Sru if (s >= end) 412114402Sru break; 413114402Sru key += *s++; 414114402Sru } 415114402Sru} 416114402Sru 417114402Sruvoid reference::compute_sort_key() 418114402Sru{ 419114402Sru if (sort_fields.length() == 0) 420114402Sru return; 421114402Sru sort_fields += '\0'; 422114402Sru const char *sf = sort_fields.contents(); 423114402Sru while (*sf != '\0') { 424151497Sru sort_key += SORT_SEP; 425114402Sru char f = *sf++; 426114402Sru int n = 1; 427114402Sru if (*sf == '+') { 428114402Sru n = INT_MAX; 429114402Sru sf++; 430114402Sru } 431114402Sru else if (csdigit(*sf)) { 432114402Sru char *ptr; 433114402Sru long l = strtol(sf, &ptr, 10); 434114402Sru if (l == 0 && ptr == sf) 435114402Sru ; 436114402Sru else { 437114402Sru sf = ptr; 438114402Sru if (l < 0) { 439114402Sru n = 1; 440114402Sru } 441114402Sru else { 442114402Sru n = int(l); 443114402Sru } 444114402Sru } 445114402Sru } 446114402Sru if (f == '.') 447114402Sru sortify_label(label.contents(), label.length(), sort_key); 448114402Sru else if (f == AUTHOR_FIELDS[0]) 449114402Sru sortify_authors(n, sort_key); 450114402Sru else 451114402Sru sortify_field(f, n, sort_key); 452114402Sru } 453114402Sru sort_fields.set_length(sort_fields.length() - 1); 454114402Sru} 455114402Sru 456114402Sruvoid reference::sortify_authors(int n, string &result) const 457114402Sru{ 458114402Sru for (const char *p = AUTHOR_FIELDS; *p != '\0'; p++) 459114402Sru if (contains_field(*p)) { 460114402Sru sortify_field(*p, n, result); 461114402Sru return; 462114402Sru } 463114402Sru sortify_field(AUTHOR_FIELDS[0], n, result); 464114402Sru} 465114402Sru 466114402Sruvoid reference::canonicalize_authors(string &result) const 467114402Sru{ 468114402Sru int len = result.length(); 469114402Sru sortify_authors(INT_MAX, result); 470114402Sru if (result.length() > len) 471114402Sru result += SORT_SUB_SEP; 472114402Sru} 473114402Sru 474114402Sruvoid reference::sortify_field(unsigned char f, int n, string &result) const 475114402Sru{ 476114402Sru typedef void (*sortify_t)(const char *, int, string &); 477114402Sru sortify_t sortifier = sortify_other; 478114402Sru switch (f) { 479114402Sru case 'A': 480114402Sru case 'E': 481114402Sru sortifier = sortify_name; 482114402Sru break; 483114402Sru case 'D': 484114402Sru sortifier = sortify_date; 485114402Sru break; 486114402Sru case 'B': 487114402Sru case 'J': 488114402Sru case 'T': 489114402Sru sortifier = sortify_title; 490114402Sru break; 491114402Sru } 492114402Sru int fi = field_index[(unsigned char)f]; 493114402Sru if (fi != NULL_FIELD_INDEX) { 494114402Sru string &str = field[fi]; 495114402Sru const char *start = str.contents(); 496114402Sru const char *end = start + str.length(); 497114402Sru for (int i = 0; i < n && start < end; i++) { 498114402Sru const char *p = start; 499114402Sru while (start < end && *start != FIELD_SEPARATOR) 500114402Sru start++; 501114402Sru if (i > 0) 502114402Sru result += SORT_SUB_SEP; 503114402Sru (*sortifier)(p, start - p, result); 504114402Sru if (start < end) 505114402Sru start++; 506114402Sru } 507114402Sru } 508114402Sru} 509114402Sru 510114402Sruint compare_reference(const reference &r1, const reference &r2) 511114402Sru{ 512114402Sru assert(r1.no >= 0); 513114402Sru assert(r2.no >= 0); 514114402Sru const char *s1 = r1.sort_key.contents(); 515114402Sru int n1 = r1.sort_key.length(); 516114402Sru const char *s2 = r2.sort_key.contents(); 517114402Sru int n2 = r2.sort_key.length(); 518114402Sru for (; n1 > 0 && n2 > 0; --n1, --n2, ++s1, ++s2) 519114402Sru if (*s1 != *s2) 520114402Sru return (int)(unsigned char)*s1 - (int)(unsigned char)*s2; 521114402Sru if (n2 > 0) 522114402Sru return -1; 523114402Sru if (n1 > 0) 524114402Sru return 1; 525114402Sru return r1.no - r2.no; 526114402Sru} 527114402Sru 528114402Sruint same_reference(const reference &r1, const reference &r2) 529114402Sru{ 530114402Sru if (!r1.rid.is_null() && r1.rid == r2.rid) 531114402Sru return 1; 532114402Sru if (r1.h != r2.h) 533114402Sru return 0; 534114402Sru if (r1.nfields != r2.nfields) 535114402Sru return 0; 536114402Sru int i = 0; 537114402Sru for (i = 0; i < 256; i++) 538114402Sru if (r1.field_index != r2.field_index) 539114402Sru return 0; 540114402Sru for (i = 0; i < r1.nfields; i++) 541114402Sru if (r1.field[i] != r2.field[i]) 542114402Sru return 0; 543114402Sru return 1; 544114402Sru} 545114402Sru 546114402Sruconst char *find_last_name(const char *start, const char *end, 547114402Sru const char **endp) 548114402Sru{ 549114402Sru const char *ptr = start; 550114402Sru const char *last_word = start; 551114402Sru for (;;) { 552114402Sru const char *token_start = ptr; 553114402Sru if (!get_token(&ptr, end)) 554114402Sru break; 555114402Sru if (ptr - token_start == 1) { 556114402Sru if (*token_start == ',') { 557114402Sru *endp = token_start; 558114402Sru return last_word; 559114402Sru } 560114402Sru else if (*token_start == ' ' || *token_start == '\n') { 561114402Sru if (ptr < end && *ptr != ' ' && *ptr != '\n') 562114402Sru last_word = ptr; 563114402Sru } 564114402Sru } 565114402Sru } 566114402Sru *endp = end; 567114402Sru return last_word; 568114402Sru} 569114402Sru 570114402Sruvoid abbreviate_name(const char *ptr, const char *end, string &result) 571114402Sru{ 572114402Sru const char *last_name_end; 573114402Sru const char *last_name_start = find_last_name(ptr, end, &last_name_end); 574114402Sru int need_period = 0; 575114402Sru for (;;) { 576114402Sru const char *token_start = ptr; 577114402Sru if (!get_token(&ptr, last_name_start)) 578114402Sru break; 579114402Sru const token_info *ti = lookup_token(token_start, ptr); 580114402Sru if (need_period) { 581114402Sru if ((ptr - token_start == 1 && *token_start == ' ') 582114402Sru || (ptr - token_start == 2 && token_start[0] == '\\' 583114402Sru && token_start[1] == ' ')) 584114402Sru continue; 585114402Sru if (ti->is_upper()) 586114402Sru result += period_before_initial; 587114402Sru else 588114402Sru result += period_before_other; 589114402Sru need_period = 0; 590114402Sru } 591114402Sru result.append(token_start, ptr - token_start); 592114402Sru if (ti->is_upper()) { 593114402Sru const char *lower_ptr = ptr; 594114402Sru int first_token = 1; 595114402Sru for (;;) { 596114402Sru token_start = ptr; 597114402Sru if (!get_token(&ptr, last_name_start)) 598114402Sru break; 599114402Sru if ((ptr - token_start == 1 && *token_start == ' ') 600114402Sru || (ptr - token_start == 2 && token_start[0] == '\\' 601114402Sru && token_start[1] == ' ')) 602114402Sru break; 603114402Sru ti = lookup_token(token_start, ptr); 604114402Sru if (ti->is_hyphen()) { 605114402Sru const char *ptr1 = ptr; 606114402Sru if (get_token(&ptr1, last_name_start)) { 607114402Sru ti = lookup_token(ptr, ptr1); 608114402Sru if (ti->is_upper()) { 609114402Sru result += period_before_hyphen; 610114402Sru result.append(token_start, ptr1 - token_start); 611114402Sru ptr = ptr1; 612114402Sru } 613114402Sru } 614114402Sru } 615114402Sru else if (ti->is_upper()) { 616114402Sru // MacDougal -> MacD. 617114402Sru result.append(lower_ptr, ptr - lower_ptr); 618114402Sru lower_ptr = ptr; 619114402Sru first_token = 1; 620114402Sru } 621114402Sru else if (first_token && ti->is_accent()) { 622114402Sru result.append(token_start, ptr - token_start); 623114402Sru lower_ptr = ptr; 624114402Sru } 625114402Sru first_token = 0; 626114402Sru } 627114402Sru need_period = 1; 628114402Sru } 629114402Sru } 630114402Sru if (need_period) 631114402Sru result += period_before_last_name; 632114402Sru result.append(last_name_start, end - last_name_start); 633114402Sru} 634114402Sru 635114402Srustatic void abbreviate_names(string &result) 636114402Sru{ 637114402Sru string str; 638114402Sru str.move(result); 639114402Sru const char *ptr = str.contents(); 640114402Sru const char *end = ptr + str.length(); 641114402Sru while (ptr < end) { 642114402Sru const char *name_end = (char *)memchr(ptr, FIELD_SEPARATOR, end - ptr); 643114402Sru if (name_end == 0) 644114402Sru name_end = end; 645114402Sru abbreviate_name(ptr, name_end, result); 646114402Sru if (name_end >= end) 647114402Sru break; 648114402Sru ptr = name_end + 1; 649114402Sru result += FIELD_SEPARATOR; 650114402Sru } 651114402Sru} 652114402Sru 653114402Sruvoid reverse_name(const char *ptr, const char *name_end, string &result) 654114402Sru{ 655114402Sru const char *last_name_end; 656114402Sru const char *last_name_start = find_last_name(ptr, name_end, &last_name_end); 657114402Sru result.append(last_name_start, last_name_end - last_name_start); 658114402Sru while (last_name_start > ptr 659114402Sru && (last_name_start[-1] == ' ' || last_name_start[-1] == '\n')) 660114402Sru last_name_start--; 661114402Sru if (last_name_start > ptr) { 662114402Sru result += ", "; 663114402Sru result.append(ptr, last_name_start - ptr); 664114402Sru } 665114402Sru if (last_name_end < name_end) 666114402Sru result.append(last_name_end, name_end - last_name_end); 667114402Sru} 668114402Sru 669114402Sruvoid reverse_names(string &result, int n) 670114402Sru{ 671114402Sru if (n <= 0) 672114402Sru return; 673114402Sru string str; 674114402Sru str.move(result); 675114402Sru const char *ptr = str.contents(); 676114402Sru const char *end = ptr + str.length(); 677114402Sru while (ptr < end) { 678114402Sru if (--n < 0) { 679114402Sru result.append(ptr, end - ptr); 680114402Sru break; 681114402Sru } 682114402Sru const char *name_end = (char *)memchr(ptr, FIELD_SEPARATOR, end - ptr); 683114402Sru if (name_end == 0) 684114402Sru name_end = end; 685114402Sru reverse_name(ptr, name_end, result); 686114402Sru if (name_end >= end) 687114402Sru break; 688114402Sru ptr = name_end + 1; 689114402Sru result += FIELD_SEPARATOR; 690114402Sru } 691114402Sru} 692114402Sru 693114402Sru// Return number of field separators. 694114402Sru 695114402Sruint join_fields(string &f) 696114402Sru{ 697114402Sru const char *ptr = f.contents(); 698114402Sru int len = f.length(); 699114402Sru int nfield_seps = 0; 700114402Sru int j; 701114402Sru for (j = 0; j < len; j++) 702114402Sru if (ptr[j] == FIELD_SEPARATOR) 703114402Sru nfield_seps++; 704114402Sru if (nfield_seps == 0) 705114402Sru return 0; 706114402Sru string temp; 707114402Sru int field_seps_left = nfield_seps; 708114402Sru for (j = 0; j < len; j++) { 709114402Sru if (ptr[j] == FIELD_SEPARATOR) { 710114402Sru if (nfield_seps == 1) 711114402Sru temp += join_authors_exactly_two; 712114402Sru else if (--field_seps_left == 0) 713114402Sru temp += join_authors_last_two; 714114402Sru else 715114402Sru temp += join_authors_default; 716114402Sru } 717114402Sru else 718114402Sru temp += ptr[j]; 719114402Sru } 720114402Sru f = temp; 721114402Sru return nfield_seps; 722114402Sru} 723114402Sru 724114402Sruvoid uppercase(const char *start, const char *end, string &result) 725114402Sru{ 726114402Sru for (;;) { 727114402Sru const char *token_start = start; 728114402Sru if (!get_token(&start, end)) 729114402Sru break; 730114402Sru const token_info *ti = lookup_token(token_start, start); 731114402Sru ti->upper_case(token_start, start, result); 732114402Sru } 733114402Sru} 734114402Sru 735114402Sruvoid lowercase(const char *start, const char *end, string &result) 736114402Sru{ 737114402Sru for (;;) { 738114402Sru const char *token_start = start; 739114402Sru if (!get_token(&start, end)) 740114402Sru break; 741114402Sru const token_info *ti = lookup_token(token_start, start); 742114402Sru ti->lower_case(token_start, start, result); 743114402Sru } 744114402Sru} 745114402Sru 746114402Sruvoid capitalize(const char *ptr, const char *end, string &result) 747114402Sru{ 748114402Sru int in_small_point_size = 0; 749114402Sru for (;;) { 750114402Sru const char *start = ptr; 751114402Sru if (!get_token(&ptr, end)) 752114402Sru break; 753114402Sru const token_info *ti = lookup_token(start, ptr); 754114402Sru const char *char_end = ptr; 755114402Sru int is_lower = ti->is_lower(); 756114402Sru if ((is_lower || ti->is_upper()) && get_token(&ptr, end)) { 757114402Sru const token_info *ti2 = lookup_token(char_end, ptr); 758114402Sru if (!ti2->is_accent()) 759114402Sru ptr = char_end; 760114402Sru } 761114402Sru if (is_lower) { 762114402Sru if (!in_small_point_size) { 763114402Sru result += "\\s-2"; 764114402Sru in_small_point_size = 1; 765114402Sru } 766114402Sru ti->upper_case(start, char_end, result); 767114402Sru result.append(char_end, ptr - char_end); 768114402Sru } 769114402Sru else { 770114402Sru if (in_small_point_size) { 771114402Sru result += "\\s+2"; 772114402Sru in_small_point_size = 0; 773114402Sru } 774114402Sru result.append(start, ptr - start); 775114402Sru } 776114402Sru } 777114402Sru if (in_small_point_size) 778114402Sru result += "\\s+2"; 779114402Sru} 780114402Sru 781114402Sruvoid capitalize_field(string &str) 782114402Sru{ 783114402Sru string temp; 784114402Sru capitalize(str.contents(), str.contents() + str.length(), temp); 785114402Sru str.move(temp); 786114402Sru} 787114402Sru 788114402Sruint is_terminated(const char *ptr, const char *end) 789114402Sru{ 790114402Sru const char *last_token = end; 791114402Sru for (;;) { 792114402Sru const char *p = ptr; 793114402Sru if (!get_token(&ptr, end)) 794114402Sru break; 795114402Sru last_token = p; 796114402Sru } 797114402Sru return end - last_token == 1 798114402Sru && (*last_token == '.' || *last_token == '!' || *last_token == '?'); 799114402Sru} 800114402Sru 801114402Sruvoid reference::output(FILE *fp) 802114402Sru{ 803114402Sru fputs(".]-\n", fp); 804114402Sru for (int i = 0; i < 256; i++) 805114402Sru if (field_index[i] != NULL_FIELD_INDEX && i != annotation_field) { 806114402Sru string &f = field[field_index[i]]; 807114402Sru if (!csdigit(i)) { 808114402Sru int j = reverse_fields.search(i); 809114402Sru if (j >= 0) { 810114402Sru int n; 811114402Sru int len = reverse_fields.length(); 812114402Sru if (++j < len && csdigit(reverse_fields[j])) { 813114402Sru n = reverse_fields[j] - '0'; 814114402Sru for (++j; j < len && csdigit(reverse_fields[j]); j++) 815114402Sru // should check for overflow 816114402Sru n = n*10 + reverse_fields[j] - '0'; 817114402Sru } 818114402Sru else 819114402Sru n = INT_MAX; 820114402Sru reverse_names(f, n); 821114402Sru } 822114402Sru } 823114402Sru int is_multiple = join_fields(f) > 0; 824114402Sru if (capitalize_fields.search(i) >= 0) 825114402Sru capitalize_field(f); 826114402Sru if (memchr(f.contents(), '\n', f.length()) == 0) { 827114402Sru fprintf(fp, ".ds [%c ", i); 828114402Sru if (f[0] == ' ' || f[0] == '\\' || f[0] == '"') 829114402Sru putc('"', fp); 830114402Sru put_string(f, fp); 831114402Sru putc('\n', fp); 832114402Sru } 833114402Sru else { 834114402Sru fprintf(fp, ".de [%c\n", i); 835114402Sru put_string(f, fp); 836114402Sru fputs("..\n", fp); 837114402Sru } 838114402Sru if (i == 'P') { 839114402Sru int multiple_pages = 0; 840114402Sru const char *s = f.contents(); 841114402Sru const char *end = f.contents() + f.length(); 842114402Sru for (;;) { 843114402Sru const char *token_start = s; 844114402Sru if (!get_token(&s, end)) 845114402Sru break; 846114402Sru const token_info *ti = lookup_token(token_start, s); 847114402Sru if (ti->is_hyphen() || ti->is_range_sep()) { 848114402Sru multiple_pages = 1; 849114402Sru break; 850114402Sru } 851114402Sru } 852114402Sru fprintf(fp, ".nr [P %d\n", multiple_pages); 853114402Sru } 854114402Sru else if (i == 'E') 855114402Sru fprintf(fp, ".nr [E %d\n", is_multiple); 856114402Sru } 857114402Sru for (const char *p = "TAO"; *p; p++) { 858114402Sru int fi = field_index[(unsigned char)*p]; 859114402Sru if (fi != NULL_FIELD_INDEX) { 860114402Sru string &f = field[fi]; 861114402Sru fprintf(fp, ".nr [%c %d\n", *p, 862114402Sru is_terminated(f.contents(), f.contents() + f.length())); 863114402Sru } 864114402Sru } 865114402Sru int t = classify(); 866114402Sru fprintf(fp, ".][ %d %s\n", t, reference_types[t]); 867114402Sru if (annotation_macro.length() > 0 && annotation_field >= 0 868114402Sru && field_index[annotation_field] != NULL_FIELD_INDEX) { 869114402Sru putc('.', fp); 870114402Sru put_string(annotation_macro, fp); 871114402Sru putc('\n', fp); 872114402Sru put_string(field[field_index[annotation_field]], fp); 873114402Sru } 874114402Sru} 875114402Sru 876114402Sruvoid reference::print_sort_key_comment(FILE *fp) 877114402Sru{ 878114402Sru fputs(".\\\"", fp); 879114402Sru put_string(sort_key, fp); 880114402Sru putc('\n', fp); 881114402Sru} 882114402Sru 883114402Sruconst char *find_year(const char *start, const char *end, const char **endp) 884114402Sru{ 885114402Sru for (;;) { 886114402Sru while (start < end && !csdigit(*start)) 887114402Sru start++; 888114402Sru const char *ptr = start; 889114402Sru if (start == end) 890114402Sru break; 891114402Sru while (ptr < end && csdigit(*ptr)) 892114402Sru ptr++; 893114402Sru if (ptr - start == 4 || ptr - start == 3 894114402Sru || (ptr - start == 2 895114402Sru && (start[0] >= '4' || (start[0] == '3' && start[1] >= '2')))) { 896114402Sru *endp = ptr; 897114402Sru return start; 898114402Sru } 899114402Sru start = ptr; 900114402Sru } 901114402Sru return 0; 902114402Sru} 903114402Sru 904114402Srustatic const char *find_day(const char *start, const char *end, 905114402Sru const char **endp) 906114402Sru{ 907114402Sru for (;;) { 908114402Sru while (start < end && !csdigit(*start)) 909114402Sru start++; 910114402Sru const char *ptr = start; 911114402Sru if (start == end) 912114402Sru break; 913114402Sru while (ptr < end && csdigit(*ptr)) 914114402Sru ptr++; 915114402Sru if ((ptr - start == 1 && start[0] != '0') 916114402Sru || (ptr - start == 2 && 917114402Sru (start[0] == '1' 918114402Sru || start[0] == '2' 919114402Sru || (start[0] == '3' && start[1] <= '1') 920114402Sru || (start[0] == '0' && start[1] != '0')))) { 921114402Sru *endp = ptr; 922114402Sru return start; 923114402Sru } 924114402Sru start = ptr; 925114402Sru } 926114402Sru return 0; 927114402Sru} 928114402Sru 929114402Srustatic int find_month(const char *start, const char *end) 930114402Sru{ 931114402Sru static const char *months[] = { 932114402Sru "january", 933114402Sru "february", 934114402Sru "march", 935114402Sru "april", 936114402Sru "may", 937114402Sru "june", 938114402Sru "july", 939114402Sru "august", 940114402Sru "september", 941114402Sru "october", 942114402Sru "november", 943114402Sru "december", 944114402Sru }; 945114402Sru for (;;) { 946114402Sru while (start < end && !csalpha(*start)) 947114402Sru start++; 948114402Sru const char *ptr = start; 949114402Sru if (start == end) 950114402Sru break; 951114402Sru while (ptr < end && csalpha(*ptr)) 952114402Sru ptr++; 953114402Sru if (ptr - start >= 3) { 954114402Sru for (unsigned int i = 0; i < sizeof(months)/sizeof(months[0]); i++) { 955114402Sru const char *q = months[i]; 956114402Sru const char *p = start; 957114402Sru for (; p < ptr; p++, q++) 958114402Sru if (cmlower(*p) != *q) 959114402Sru break; 960114402Sru if (p >= ptr) 961114402Sru return i; 962114402Sru } 963114402Sru } 964114402Sru start = ptr; 965114402Sru } 966114402Sru return -1; 967114402Sru} 968114402Sru 969114402Sruint reference::contains_field(char c) const 970114402Sru{ 971114402Sru return field_index[(unsigned char)c] != NULL_FIELD_INDEX; 972114402Sru} 973114402Sru 974114402Sruint reference::classify() 975114402Sru{ 976114402Sru if (contains_field('J')) 977114402Sru return JOURNAL_ARTICLE; 978114402Sru if (contains_field('B')) 979114402Sru return ARTICLE_IN_BOOK; 980114402Sru if (contains_field('G')) 981114402Sru return TECH_REPORT; 982114402Sru if (contains_field('R')) 983114402Sru return TECH_REPORT; 984114402Sru if (contains_field('I')) 985114402Sru return BOOK; 986114402Sru if (contains_field('M')) 987114402Sru return BELL_TM; 988114402Sru return OTHER; 989114402Sru} 990114402Sru 991114402Sruconst char *reference::get_year(const char **endp) const 992114402Sru{ 993114402Sru if (field_index['D'] != NULL_FIELD_INDEX) { 994114402Sru string &date = field[field_index['D']]; 995114402Sru const char *start = date.contents(); 996114402Sru const char *end = start + date.length(); 997114402Sru return find_year(start, end, endp); 998114402Sru } 999114402Sru else 1000114402Sru return 0; 1001114402Sru} 1002114402Sru 1003114402Sruconst char *reference::get_field(unsigned char c, const char **endp) const 1004114402Sru{ 1005114402Sru if (field_index[c] != NULL_FIELD_INDEX) { 1006114402Sru string &f = field[field_index[c]]; 1007114402Sru const char *start = f.contents(); 1008114402Sru *endp = start + f.length(); 1009114402Sru return start; 1010114402Sru } 1011114402Sru else 1012114402Sru return 0; 1013114402Sru} 1014114402Sru 1015114402Sruconst char *reference::get_date(const char **endp) const 1016114402Sru{ 1017114402Sru return get_field('D', endp); 1018114402Sru} 1019114402Sru 1020114402Sruconst char *nth_field(int i, const char *start, const char **endp) 1021114402Sru{ 1022114402Sru while (--i >= 0) { 1023114402Sru start = (char *)memchr(start, FIELD_SEPARATOR, *endp - start); 1024114402Sru if (!start) 1025114402Sru return 0; 1026114402Sru start++; 1027114402Sru } 1028114402Sru const char *e = (char *)memchr(start, FIELD_SEPARATOR, *endp - start); 1029114402Sru if (e) 1030114402Sru *endp = e; 1031114402Sru return start; 1032114402Sru} 1033114402Sru 1034114402Sruconst char *reference::get_author(int i, const char **endp) const 1035114402Sru{ 1036114402Sru for (const char *f = AUTHOR_FIELDS; *f != '\0'; f++) { 1037114402Sru const char *start = get_field(*f, endp); 1038114402Sru if (start) { 1039114402Sru if (strchr(MULTI_FIELD_NAMES, *f) != 0) 1040114402Sru return nth_field(i, start, endp); 1041114402Sru else if (i == 0) 1042114402Sru return start; 1043114402Sru else 1044114402Sru return 0; 1045114402Sru } 1046114402Sru } 1047114402Sru return 0; 1048114402Sru} 1049114402Sru 1050114402Sruconst char *reference::get_author_last_name(int i, const char **endp) const 1051114402Sru{ 1052114402Sru for (const char *f = AUTHOR_FIELDS; *f != '\0'; f++) { 1053114402Sru const char *start = get_field(*f, endp); 1054114402Sru if (start) { 1055114402Sru if (strchr(MULTI_FIELD_NAMES, *f) != 0) { 1056114402Sru start = nth_field(i, start, endp); 1057114402Sru if (!start) 1058114402Sru return 0; 1059114402Sru } 1060114402Sru if (*f == 'A') 1061114402Sru return find_last_name(start, *endp, endp); 1062114402Sru else 1063114402Sru return start; 1064114402Sru } 1065114402Sru } 1066114402Sru return 0; 1067114402Sru} 1068114402Sru 1069114402Sruvoid reference::set_date(string &d) 1070114402Sru{ 1071114402Sru if (d.length() == 0) 1072114402Sru delete_field('D'); 1073114402Sru else 1074114402Sru insert_field('D', d); 1075114402Sru} 1076114402Sru 1077114402Sruint same_year(const reference &r1, const reference &r2) 1078114402Sru{ 1079114402Sru const char *ye1; 1080114402Sru const char *ys1 = r1.get_year(&ye1); 1081114402Sru const char *ye2; 1082114402Sru const char *ys2 = r2.get_year(&ye2); 1083114402Sru if (ys1 == 0) { 1084114402Sru if (ys2 == 0) 1085114402Sru return same_date(r1, r2); 1086114402Sru else 1087114402Sru return 0; 1088114402Sru } 1089114402Sru else if (ys2 == 0) 1090114402Sru return 0; 1091114402Sru else if (ye1 - ys1 != ye2 - ys2) 1092114402Sru return 0; 1093114402Sru else 1094114402Sru return memcmp(ys1, ys2, ye1 - ys1) == 0; 1095114402Sru} 1096114402Sru 1097114402Sruint same_date(const reference &r1, const reference &r2) 1098114402Sru{ 1099114402Sru const char *e1; 1100114402Sru const char *s1 = r1.get_date(&e1); 1101114402Sru const char *e2; 1102114402Sru const char *s2 = r2.get_date(&e2); 1103114402Sru if (s1 == 0) 1104114402Sru return s2 == 0; 1105114402Sru else if (s2 == 0) 1106114402Sru return 0; 1107114402Sru else if (e1 - s1 != e2 - s2) 1108114402Sru return 0; 1109114402Sru else 1110114402Sru return memcmp(s1, s2, e1 - s1) == 0; 1111114402Sru} 1112114402Sru 1113114402Sruconst char *reference::get_sort_field(int i, int si, int ssi, 1114114402Sru const char **endp) const 1115114402Sru{ 1116114402Sru const char *start = sort_key.contents(); 1117114402Sru const char *end = start + sort_key.length(); 1118114402Sru if (i < 0) { 1119114402Sru *endp = end; 1120114402Sru return start; 1121114402Sru } 1122114402Sru while (--i >= 0) { 1123114402Sru start = (char *)memchr(start, SORT_SEP, end - start); 1124114402Sru if (!start) 1125114402Sru return 0; 1126114402Sru start++; 1127114402Sru } 1128114402Sru const char *e = (char *)memchr(start, SORT_SEP, end - start); 1129114402Sru if (e) 1130114402Sru end = e; 1131114402Sru if (si < 0) { 1132114402Sru *endp = end; 1133114402Sru return start; 1134114402Sru } 1135114402Sru while (--si >= 0) { 1136114402Sru start = (char *)memchr(start, SORT_SUB_SEP, end - start); 1137114402Sru if (!start) 1138114402Sru return 0; 1139114402Sru start++; 1140114402Sru } 1141114402Sru e = (char *)memchr(start, SORT_SUB_SEP, end - start); 1142114402Sru if (e) 1143114402Sru end = e; 1144114402Sru if (ssi < 0) { 1145114402Sru *endp = end; 1146114402Sru return start; 1147114402Sru } 1148114402Sru while (--ssi >= 0) { 1149114402Sru start = (char *)memchr(start, SORT_SUB_SUB_SEP, end - start); 1150114402Sru if (!start) 1151114402Sru return 0; 1152114402Sru start++; 1153114402Sru } 1154114402Sru e = (char *)memchr(start, SORT_SUB_SUB_SEP, end - start); 1155114402Sru if (e) 1156114402Sru end = e; 1157114402Sru *endp = end; 1158114402Sru return start; 1159114402Sru} 1160114402Sru 1161