1/* -*- C++ -*- 2 Copyright (C) 1989, 1990, 1991, 1992, 2000, 2004 3 Free Software Foundation, Inc. 4 Written by James Clark (jjc@jclark.com) 5 6This file is part of groff. 7 8groff is free software; you can redistribute it and/or modify it under 9the terms of the GNU General Public License as published by the Free 10Software Foundation; either version 2, or (at your option) any later 11version. 12 13groff is distributed in the hope that it will be useful, but WITHOUT ANY 14WARRANTY; without even the implied warranty of MERCHANTABILITY or 15FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 16for more details. 17 18You should have received a copy of the GNU General Public License along 19with groff; see the file COPYING. If not, write to the Free Software 20Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */ 21 22%{ 23 24#include "refer.h" 25#include "refid.h" 26#include "ref.h" 27#include "token.h" 28 29int yylex(); 30void yyerror(const char *); 31int yyparse(); 32 33static const char *format_serial(char c, int n); 34 35struct label_info { 36 int start; 37 int length; 38 int count; 39 int total; 40 label_info(const string &); 41}; 42 43label_info *lookup_label(const string &label); 44 45struct expression { 46 enum { 47 // Does the tentative label depend on the reference? 48 CONTAINS_VARIABLE = 01, 49 CONTAINS_STAR = 02, 50 CONTAINS_FORMAT = 04, 51 CONTAINS_AT = 010 52 }; 53 virtual ~expression() { } 54 virtual void evaluate(int, const reference &, string &, 55 substring_position &) = 0; 56 virtual unsigned analyze() { return 0; } 57}; 58 59class at_expr : public expression { 60public: 61 at_expr() { } 62 void evaluate(int, const reference &, string &, substring_position &); 63 unsigned analyze() { return CONTAINS_VARIABLE|CONTAINS_AT; } 64}; 65 66class format_expr : public expression { 67 char type; 68 int width; 69 int first_number; 70public: 71 format_expr(char c, int w = 0, int f = 1) 72 : type(c), width(w), first_number(f) { } 73 void evaluate(int, const reference &, string &, substring_position &); 74 unsigned analyze() { return CONTAINS_FORMAT; } 75}; 76 77class field_expr : public expression { 78 int number; 79 char name; 80public: 81 field_expr(char nm, int num) : number(num), name(nm) { } 82 void evaluate(int, const reference &, string &, substring_position &); 83 unsigned analyze() { return CONTAINS_VARIABLE; } 84}; 85 86class literal_expr : public expression { 87 string s; 88public: 89 literal_expr(const char *ptr, int len) : s(ptr, len) { } 90 void evaluate(int, const reference &, string &, substring_position &); 91}; 92 93class unary_expr : public expression { 94protected: 95 expression *expr; 96public: 97 unary_expr(expression *e) : expr(e) { } 98 ~unary_expr() { delete expr; } 99 void evaluate(int, const reference &, string &, substring_position &) = 0; 100 unsigned analyze() { return expr ? expr->analyze() : 0; } 101}; 102 103// This caches the analysis of an expression. 104 105class analyzed_expr : public unary_expr { 106 unsigned flags; 107public: 108 analyzed_expr(expression *); 109 void evaluate(int, const reference &, string &, substring_position &); 110 unsigned analyze() { return flags; } 111}; 112 113class star_expr : public unary_expr { 114public: 115 star_expr(expression *e) : unary_expr(e) { } 116 void evaluate(int, const reference &, string &, substring_position &); 117 unsigned analyze() { 118 return ((expr ? (expr->analyze() & ~CONTAINS_VARIABLE) : 0) 119 | CONTAINS_STAR); 120 } 121}; 122 123typedef void map_func(const char *, const char *, string &); 124 125class map_expr : public unary_expr { 126 map_func *func; 127public: 128 map_expr(expression *e, map_func *f) : unary_expr(e), func(f) { } 129 void evaluate(int, const reference &, string &, substring_position &); 130}; 131 132typedef const char *extractor_func(const char *, const char *, const char **); 133 134class extractor_expr : public unary_expr { 135 int part; 136 extractor_func *func; 137public: 138 enum { BEFORE = +1, MATCH = 0, AFTER = -1 }; 139 extractor_expr(expression *e, extractor_func *f, int pt) 140 : unary_expr(e), part(pt), func(f) { } 141 void evaluate(int, const reference &, string &, substring_position &); 142}; 143 144class truncate_expr : public unary_expr { 145 int n; 146public: 147 truncate_expr(expression *e, int i) : unary_expr(e), n(i) { } 148 void evaluate(int, const reference &, string &, substring_position &); 149}; 150 151class separator_expr : public unary_expr { 152public: 153 separator_expr(expression *e) : unary_expr(e) { } 154 void evaluate(int, const reference &, string &, substring_position &); 155}; 156 157class binary_expr : public expression { 158protected: 159 expression *expr1; 160 expression *expr2; 161public: 162 binary_expr(expression *e1, expression *e2) : expr1(e1), expr2(e2) { } 163 ~binary_expr() { delete expr1; delete expr2; } 164 void evaluate(int, const reference &, string &, substring_position &) = 0; 165 unsigned analyze() { 166 return (expr1 ? expr1->analyze() : 0) | (expr2 ? expr2->analyze() : 0); 167 } 168}; 169 170class alternative_expr : public binary_expr { 171public: 172 alternative_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { } 173 void evaluate(int, const reference &, string &, substring_position &); 174}; 175 176class list_expr : public binary_expr { 177public: 178 list_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { } 179 void evaluate(int, const reference &, string &, substring_position &); 180}; 181 182class substitute_expr : public binary_expr { 183public: 184 substitute_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { } 185 void evaluate(int, const reference &, string &, substring_position &); 186}; 187 188class ternary_expr : public expression { 189protected: 190 expression *expr1; 191 expression *expr2; 192 expression *expr3; 193public: 194 ternary_expr(expression *e1, expression *e2, expression *e3) 195 : expr1(e1), expr2(e2), expr3(e3) { } 196 ~ternary_expr() { delete expr1; delete expr2; delete expr3; } 197 void evaluate(int, const reference &, string &, substring_position &) = 0; 198 unsigned analyze() { 199 return ((expr1 ? expr1->analyze() : 0) 200 | (expr2 ? expr2->analyze() : 0) 201 | (expr3 ? expr3->analyze() : 0)); 202 } 203}; 204 205class conditional_expr : public ternary_expr { 206public: 207 conditional_expr(expression *e1, expression *e2, expression *e3) 208 : ternary_expr(e1, e2, e3) { } 209 void evaluate(int, const reference &, string &, substring_position &); 210}; 211 212static expression *parsed_label = 0; 213static expression *parsed_date_label = 0; 214static expression *parsed_short_label = 0; 215 216static expression *parse_result; 217 218string literals; 219 220%} 221 222%union { 223 int num; 224 expression *expr; 225 struct { int ndigits; int val; } dig; 226 struct { int start; int len; } str; 227} 228 229/* uppercase or lowercase letter */ 230%token <num> TOKEN_LETTER 231/* literal characters */ 232%token <str> TOKEN_LITERAL 233/* digit */ 234%token <num> TOKEN_DIGIT 235 236%type <expr> conditional 237%type <expr> alternative 238%type <expr> list 239%type <expr> string 240%type <expr> substitute 241%type <expr> optional_conditional 242%type <num> number 243%type <dig> digits 244%type <num> optional_number 245%type <num> flag 246 247%% 248 249expr: 250 optional_conditional 251 { parse_result = ($1 ? new analyzed_expr($1) : 0); } 252 ; 253 254conditional: 255 alternative 256 { $$ = $1; } 257 | alternative '?' optional_conditional ':' conditional 258 { $$ = new conditional_expr($1, $3, $5); } 259 ; 260 261optional_conditional: 262 /* empty */ 263 { $$ = 0; } 264 | conditional 265 { $$ = $1; } 266 ; 267 268alternative: 269 list 270 { $$ = $1; } 271 | alternative '|' list 272 { $$ = new alternative_expr($1, $3); } 273 | alternative '&' list 274 { $$ = new conditional_expr($1, $3, 0); } 275 ; 276 277list: 278 substitute 279 { $$ = $1; } 280 | list substitute 281 { $$ = new list_expr($1, $2); } 282 ; 283 284substitute: 285 string 286 { $$ = $1; } 287 | substitute '~' string 288 { $$ = new substitute_expr($1, $3); } 289 ; 290 291string: 292 '@' 293 { $$ = new at_expr; } 294 | TOKEN_LITERAL 295 { 296 $$ = new literal_expr(literals.contents() + $1.start, 297 $1.len); 298 } 299 | TOKEN_LETTER 300 { $$ = new field_expr($1, 0); } 301 | TOKEN_LETTER number 302 { $$ = new field_expr($1, $2 - 1); } 303 | '%' TOKEN_LETTER 304 { 305 switch ($2) { 306 case 'I': 307 case 'i': 308 case 'A': 309 case 'a': 310 $$ = new format_expr($2); 311 break; 312 default: 313 command_error("unrecognized format `%1'", char($2)); 314 $$ = new format_expr('a'); 315 break; 316 } 317 } 318 319 | '%' digits 320 { 321 $$ = new format_expr('0', $2.ndigits, $2.val); 322 } 323 | string '.' flag TOKEN_LETTER optional_number 324 { 325 switch ($4) { 326 case 'l': 327 $$ = new map_expr($1, lowercase); 328 break; 329 case 'u': 330 $$ = new map_expr($1, uppercase); 331 break; 332 case 'c': 333 $$ = new map_expr($1, capitalize); 334 break; 335 case 'r': 336 $$ = new map_expr($1, reverse_name); 337 break; 338 case 'a': 339 $$ = new map_expr($1, abbreviate_name); 340 break; 341 case 'y': 342 $$ = new extractor_expr($1, find_year, $3); 343 break; 344 case 'n': 345 $$ = new extractor_expr($1, find_last_name, $3); 346 break; 347 default: 348 $$ = $1; 349 command_error("unknown function `%1'", char($4)); 350 break; 351 } 352 } 353 354 | string '+' number 355 { $$ = new truncate_expr($1, $3); } 356 | string '-' number 357 { $$ = new truncate_expr($1, -$3); } 358 | string '*' 359 { $$ = new star_expr($1); } 360 | '(' optional_conditional ')' 361 { $$ = $2; } 362 | '<' optional_conditional '>' 363 { $$ = new separator_expr($2); } 364 ; 365 366optional_number: 367 /* empty */ 368 { $$ = -1; } 369 | number 370 { $$ = $1; } 371 ; 372 373number: 374 TOKEN_DIGIT 375 { $$ = $1; } 376 | number TOKEN_DIGIT 377 { $$ = $1*10 + $2; } 378 ; 379 380digits: 381 TOKEN_DIGIT 382 { $$.ndigits = 1; $$.val = $1; } 383 | digits TOKEN_DIGIT 384 { $$.ndigits = $1.ndigits + 1; $$.val = $1.val*10 + $2; } 385 ; 386 387 388flag: 389 /* empty */ 390 { $$ = 0; } 391 | '+' 392 { $$ = 1; } 393 | '-' 394 { $$ = -1; } 395 ; 396 397%% 398 399/* bison defines const to be empty unless __STDC__ is defined, which it 400isn't under cfront */ 401 402#ifdef const 403#undef const 404#endif 405 406const char *spec_ptr; 407const char *spec_end; 408const char *spec_cur; 409 410static char uppercase_array[] = { 411 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 412 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 413 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 414 'Y', 'Z', 415}; 416 417static char lowercase_array[] = { 418 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 419 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 420 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 421 'y', 'z', 422}; 423 424int yylex() 425{ 426 while (spec_ptr < spec_end && csspace(*spec_ptr)) 427 spec_ptr++; 428 spec_cur = spec_ptr; 429 if (spec_ptr >= spec_end) 430 return 0; 431 unsigned char c = *spec_ptr++; 432 if (csalpha(c)) { 433 yylval.num = c; 434 return TOKEN_LETTER; 435 } 436 if (csdigit(c)) { 437 yylval.num = c - '0'; 438 return TOKEN_DIGIT; 439 } 440 if (c == '\'') { 441 yylval.str.start = literals.length(); 442 for (; spec_ptr < spec_end; spec_ptr++) { 443 if (*spec_ptr == '\'') { 444 if (++spec_ptr < spec_end && *spec_ptr == '\'') 445 literals += '\''; 446 else { 447 yylval.str.len = literals.length() - yylval.str.start; 448 return TOKEN_LITERAL; 449 } 450 } 451 else 452 literals += *spec_ptr; 453 } 454 yylval.str.len = literals.length() - yylval.str.start; 455 return TOKEN_LITERAL; 456 } 457 return c; 458} 459 460int set_label_spec(const char *label_spec) 461{ 462 spec_cur = spec_ptr = label_spec; 463 spec_end = strchr(label_spec, '\0'); 464 literals.clear(); 465 if (yyparse()) 466 return 0; 467 delete parsed_label; 468 parsed_label = parse_result; 469 return 1; 470} 471 472int set_date_label_spec(const char *label_spec) 473{ 474 spec_cur = spec_ptr = label_spec; 475 spec_end = strchr(label_spec, '\0'); 476 literals.clear(); 477 if (yyparse()) 478 return 0; 479 delete parsed_date_label; 480 parsed_date_label = parse_result; 481 return 1; 482} 483 484int set_short_label_spec(const char *label_spec) 485{ 486 spec_cur = spec_ptr = label_spec; 487 spec_end = strchr(label_spec, '\0'); 488 literals.clear(); 489 if (yyparse()) 490 return 0; 491 delete parsed_short_label; 492 parsed_short_label = parse_result; 493 return 1; 494} 495 496void yyerror(const char *message) 497{ 498 if (spec_cur < spec_end) 499 command_error("label specification %1 before `%2'", message, spec_cur); 500 else 501 command_error("label specification %1 at end of string", 502 message, spec_cur); 503} 504 505void at_expr::evaluate(int tentative, const reference &ref, 506 string &result, substring_position &) 507{ 508 if (tentative) 509 ref.canonicalize_authors(result); 510 else { 511 const char *end, *start = ref.get_authors(&end); 512 if (start) 513 result.append(start, end - start); 514 } 515} 516 517void format_expr::evaluate(int tentative, const reference &ref, 518 string &result, substring_position &) 519{ 520 if (tentative) 521 return; 522 const label_info *lp = ref.get_label_ptr(); 523 int num = lp == 0 ? ref.get_number() : lp->count; 524 if (type != '0') 525 result += format_serial(type, num + 1); 526 else { 527 const char *ptr = i_to_a(num + first_number); 528 int pad = width - strlen(ptr); 529 while (--pad >= 0) 530 result += '0'; 531 result += ptr; 532 } 533} 534 535static const char *format_serial(char c, int n) 536{ 537 assert(n > 0); 538 static char buf[128]; // more than enough. 539 switch (c) { 540 case 'i': 541 case 'I': 542 { 543 char *p = buf; 544 // troff uses z and w to represent 10000 and 5000 in Roman 545 // numerals; I can find no historical basis for this usage 546 const char *s = c == 'i' ? "zwmdclxvi" : "ZWMDCLXVI"; 547 if (n >= 40000) 548 return i_to_a(n); 549 while (n >= 10000) { 550 *p++ = s[0]; 551 n -= 10000; 552 } 553 for (int i = 1000; i > 0; i /= 10, s += 2) { 554 int m = n/i; 555 n -= m*i; 556 switch (m) { 557 case 3: 558 *p++ = s[2]; 559 /* falls through */ 560 case 2: 561 *p++ = s[2]; 562 /* falls through */ 563 case 1: 564 *p++ = s[2]; 565 break; 566 case 4: 567 *p++ = s[2]; 568 *p++ = s[1]; 569 break; 570 case 8: 571 *p++ = s[1]; 572 *p++ = s[2]; 573 *p++ = s[2]; 574 *p++ = s[2]; 575 break; 576 case 7: 577 *p++ = s[1]; 578 *p++ = s[2]; 579 *p++ = s[2]; 580 break; 581 case 6: 582 *p++ = s[1]; 583 *p++ = s[2]; 584 break; 585 case 5: 586 *p++ = s[1]; 587 break; 588 case 9: 589 *p++ = s[2]; 590 *p++ = s[0]; 591 } 592 } 593 *p = 0; 594 break; 595 } 596 case 'a': 597 case 'A': 598 { 599 char *p = buf; 600 // this is derived from troff/reg.c 601 while (n > 0) { 602 int d = n % 26; 603 if (d == 0) 604 d = 26; 605 n -= d; 606 n /= 26; 607 *p++ = c == 'a' ? lowercase_array[d - 1] : 608 uppercase_array[d - 1]; 609 } 610 *p-- = 0; 611 // Reverse it. 612 char *q = buf; 613 while (q < p) { 614 char temp = *q; 615 *q = *p; 616 *p = temp; 617 --p; 618 ++q; 619 } 620 break; 621 } 622 default: 623 assert(0); 624 } 625 return buf; 626} 627 628void field_expr::evaluate(int, const reference &ref, 629 string &result, substring_position &) 630{ 631 const char *end; 632 const char *start = ref.get_field(name, &end); 633 if (start) { 634 start = nth_field(number, start, &end); 635 if (start) 636 result.append(start, end - start); 637 } 638} 639 640void literal_expr::evaluate(int, const reference &, 641 string &result, substring_position &) 642{ 643 result += s; 644} 645 646analyzed_expr::analyzed_expr(expression *e) 647: unary_expr(e), flags(e ? e->analyze() : 0) 648{ 649} 650 651void analyzed_expr::evaluate(int tentative, const reference &ref, 652 string &result, substring_position &pos) 653{ 654 if (expr) 655 expr->evaluate(tentative, ref, result, pos); 656} 657 658void star_expr::evaluate(int tentative, const reference &ref, 659 string &result, substring_position &pos) 660{ 661 const label_info *lp = ref.get_label_ptr(); 662 if (!tentative 663 && (lp == 0 || lp->total > 1) 664 && expr) 665 expr->evaluate(tentative, ref, result, pos); 666} 667 668void separator_expr::evaluate(int tentative, const reference &ref, 669 string &result, substring_position &pos) 670{ 671 int start_length = result.length(); 672 int is_first = pos.start < 0; 673 if (expr) 674 expr->evaluate(tentative, ref, result, pos); 675 if (is_first) { 676 pos.start = start_length; 677 pos.length = result.length() - start_length; 678 } 679} 680 681void map_expr::evaluate(int tentative, const reference &ref, 682 string &result, substring_position &) 683{ 684 if (expr) { 685 string temp; 686 substring_position temp_pos; 687 expr->evaluate(tentative, ref, temp, temp_pos); 688 (*func)(temp.contents(), temp.contents() + temp.length(), result); 689 } 690} 691 692void extractor_expr::evaluate(int tentative, const reference &ref, 693 string &result, substring_position &) 694{ 695 if (expr) { 696 string temp; 697 substring_position temp_pos; 698 expr->evaluate(tentative, ref, temp, temp_pos); 699 const char *end, *start = (*func)(temp.contents(), 700 temp.contents() + temp.length(), 701 &end); 702 switch (part) { 703 case BEFORE: 704 if (start) 705 result.append(temp.contents(), start - temp.contents()); 706 else 707 result += temp; 708 break; 709 case MATCH: 710 if (start) 711 result.append(start, end - start); 712 break; 713 case AFTER: 714 if (start) 715 result.append(end, temp.contents() + temp.length() - end); 716 break; 717 default: 718 assert(0); 719 } 720 } 721} 722 723static void first_part(int len, const char *ptr, const char *end, 724 string &result) 725{ 726 for (;;) { 727 const char *token_start = ptr; 728 if (!get_token(&ptr, end)) 729 break; 730 const token_info *ti = lookup_token(token_start, ptr); 731 int counts = ti->sortify_non_empty(token_start, ptr); 732 if (counts && --len < 0) 733 break; 734 if (counts || ti->is_accent()) 735 result.append(token_start, ptr - token_start); 736 } 737} 738 739static void last_part(int len, const char *ptr, const char *end, 740 string &result) 741{ 742 const char *start = ptr; 743 int count = 0; 744 for (;;) { 745 const char *token_start = ptr; 746 if (!get_token(&ptr, end)) 747 break; 748 const token_info *ti = lookup_token(token_start, ptr); 749 if (ti->sortify_non_empty(token_start, ptr)) 750 count++; 751 } 752 ptr = start; 753 int skip = count - len; 754 if (skip > 0) { 755 for (;;) { 756 const char *token_start = ptr; 757 if (!get_token(&ptr, end)) 758 assert(0); 759 const token_info *ti = lookup_token(token_start, ptr); 760 if (ti->sortify_non_empty(token_start, ptr) && --skip < 0) { 761 ptr = token_start; 762 break; 763 } 764 } 765 } 766 first_part(len, ptr, end, result); 767} 768 769void truncate_expr::evaluate(int tentative, const reference &ref, 770 string &result, substring_position &) 771{ 772 if (expr) { 773 string temp; 774 substring_position temp_pos; 775 expr->evaluate(tentative, ref, temp, temp_pos); 776 const char *start = temp.contents(); 777 const char *end = start + temp.length(); 778 if (n > 0) 779 first_part(n, start, end, result); 780 else if (n < 0) 781 last_part(-n, start, end, result); 782 } 783} 784 785void alternative_expr::evaluate(int tentative, const reference &ref, 786 string &result, substring_position &pos) 787{ 788 int start_length = result.length(); 789 if (expr1) 790 expr1->evaluate(tentative, ref, result, pos); 791 if (result.length() == start_length && expr2) 792 expr2->evaluate(tentative, ref, result, pos); 793} 794 795void list_expr::evaluate(int tentative, const reference &ref, 796 string &result, substring_position &pos) 797{ 798 if (expr1) 799 expr1->evaluate(tentative, ref, result, pos); 800 if (expr2) 801 expr2->evaluate(tentative, ref, result, pos); 802} 803 804void substitute_expr::evaluate(int tentative, const reference &ref, 805 string &result, substring_position &pos) 806{ 807 int start_length = result.length(); 808 if (expr1) 809 expr1->evaluate(tentative, ref, result, pos); 810 if (result.length() > start_length && result[result.length() - 1] == '-') { 811 // ought to see if pos covers the - 812 result.set_length(result.length() - 1); 813 if (expr2) 814 expr2->evaluate(tentative, ref, result, pos); 815 } 816} 817 818void conditional_expr::evaluate(int tentative, const reference &ref, 819 string &result, substring_position &pos) 820{ 821 string temp; 822 substring_position temp_pos; 823 if (expr1) 824 expr1->evaluate(tentative, ref, temp, temp_pos); 825 if (temp.length() > 0) { 826 if (expr2) 827 expr2->evaluate(tentative, ref, result, pos); 828 } 829 else { 830 if (expr3) 831 expr3->evaluate(tentative, ref, result, pos); 832 } 833} 834 835void reference::pre_compute_label() 836{ 837 if (parsed_label != 0 838 && (parsed_label->analyze() & expression::CONTAINS_VARIABLE)) { 839 label.clear(); 840 substring_position temp_pos; 841 parsed_label->evaluate(1, *this, label, temp_pos); 842 label_ptr = lookup_label(label); 843 } 844} 845 846void reference::compute_label() 847{ 848 label.clear(); 849 if (parsed_label) 850 parsed_label->evaluate(0, *this, label, separator_pos); 851 if (short_label_flag && parsed_short_label) 852 parsed_short_label->evaluate(0, *this, short_label, short_separator_pos); 853 if (date_as_label) { 854 string new_date; 855 if (parsed_date_label) { 856 substring_position temp_pos; 857 parsed_date_label->evaluate(0, *this, new_date, temp_pos); 858 } 859 set_date(new_date); 860 } 861 if (label_ptr) 862 label_ptr->count += 1; 863} 864 865void reference::immediate_compute_label() 866{ 867 if (label_ptr) 868 label_ptr->total = 2; // force use of disambiguator 869 compute_label(); 870} 871 872int reference::merge_labels(reference **v, int n, label_type type, 873 string &result) 874{ 875 if (abbreviate_label_ranges) 876 return merge_labels_by_number(v, n, type, result); 877 else 878 return merge_labels_by_parts(v, n, type, result); 879} 880 881int reference::merge_labels_by_number(reference **v, int n, label_type type, 882 string &result) 883{ 884 if (n <= 1) 885 return 0; 886 int num = get_number(); 887 // Only merge three or more labels. 888 if (v[0]->get_number() != num + 1 889 || v[1]->get_number() != num + 2) 890 return 0; 891 int i; 892 for (i = 2; i < n; i++) 893 if (v[i]->get_number() != num + i + 1) 894 break; 895 result = get_label(type); 896 result += label_range_indicator; 897 result += v[i - 1]->get_label(type); 898 return i; 899} 900 901const substring_position &reference::get_separator_pos(label_type type) const 902{ 903 if (type == SHORT_LABEL && short_label_flag) 904 return short_separator_pos; 905 else 906 return separator_pos; 907} 908 909const string &reference::get_label(label_type type) const 910{ 911 if (type == SHORT_LABEL && short_label_flag) 912 return short_label; 913 else 914 return label; 915} 916 917int reference::merge_labels_by_parts(reference **v, int n, label_type type, 918 string &result) 919{ 920 if (n <= 0) 921 return 0; 922 const string &lb = get_label(type); 923 const substring_position &sp = get_separator_pos(type); 924 if (sp.start < 0 925 || sp.start != v[0]->get_separator_pos(type).start 926 || memcmp(lb.contents(), v[0]->get_label(type).contents(), 927 sp.start) != 0) 928 return 0; 929 result = lb; 930 int i = 0; 931 do { 932 result += separate_label_second_parts; 933 const substring_position &s = v[i]->get_separator_pos(type); 934 int sep_end_pos = s.start + s.length; 935 result.append(v[i]->get_label(type).contents() + sep_end_pos, 936 v[i]->get_label(type).length() - sep_end_pos); 937 } while (++i < n 938 && sp.start == v[i]->get_separator_pos(type).start 939 && memcmp(lb.contents(), v[i]->get_label(type).contents(), 940 sp.start) == 0); 941 return i; 942} 943 944string label_pool; 945 946label_info::label_info(const string &s) 947: start(label_pool.length()), length(s.length()), count(0), total(1) 948{ 949 label_pool += s; 950} 951 952static label_info **label_table = 0; 953static int label_table_size = 0; 954static int label_table_used = 0; 955 956label_info *lookup_label(const string &label) 957{ 958 if (label_table == 0) { 959 label_table = new label_info *[17]; 960 label_table_size = 17; 961 for (int i = 0; i < 17; i++) 962 label_table[i] = 0; 963 } 964 unsigned h = hash_string(label.contents(), label.length()) % label_table_size; 965 label_info **ptr; 966 for (ptr = label_table + h; 967 *ptr != 0; 968 (ptr == label_table) 969 ? (ptr = label_table + label_table_size - 1) 970 : ptr--) 971 if ((*ptr)->length == label.length() 972 && memcmp(label_pool.contents() + (*ptr)->start, label.contents(), 973 label.length()) == 0) { 974 (*ptr)->total += 1; 975 return *ptr; 976 } 977 label_info *result = *ptr = new label_info(label); 978 if (++label_table_used * 2 > label_table_size) { 979 // Rehash the table. 980 label_info **old_table = label_table; 981 int old_size = label_table_size; 982 label_table_size = next_size(label_table_size); 983 label_table = new label_info *[label_table_size]; 984 int i; 985 for (i = 0; i < label_table_size; i++) 986 label_table[i] = 0; 987 for (i = 0; i < old_size; i++) 988 if (old_table[i]) { 989 h = hash_string(label_pool.contents() + old_table[i]->start, 990 old_table[i]->length); 991 label_info **p; 992 for (p = label_table + (h % label_table_size); 993 *p != 0; 994 (p == label_table) 995 ? (p = label_table + label_table_size - 1) 996 : --p) 997 ; 998 *p = old_table[i]; 999 } 1000 a_delete old_table; 1001 } 1002 return result; 1003} 1004 1005void clear_labels() 1006{ 1007 for (int i = 0; i < label_table_size; i++) { 1008 delete label_table[i]; 1009 label_table[i] = 0; 1010 } 1011 label_table_used = 0; 1012 label_pool.clear(); 1013} 1014 1015static void consider_authors(reference **start, reference **end, int i); 1016 1017void compute_labels(reference **v, int n) 1018{ 1019 if (parsed_label 1020 && (parsed_label->analyze() & expression::CONTAINS_AT) 1021 && sort_fields.length() >= 2 1022 && sort_fields[0] == 'A' 1023 && sort_fields[1] == '+') 1024 consider_authors(v, v + n, 0); 1025 for (int i = 0; i < n; i++) 1026 v[i]->compute_label(); 1027} 1028 1029 1030/* A reference with a list of authors <A0,A1,...,AN> _needs_ author i 1031where 0 <= i <= N if there exists a reference with a list of authors 1032<B0,B1,...,BM> such that <A0,A1,...,AN> != <B0,B1,...,BM> and M >= i 1033and Aj = Bj for 0 <= j < i. In this case if we can't say ``A0, 1034A1,...,A(i-1) et al'' because this would match both <A0,A1,...,AN> and 1035<B0,B1,...,BM>. If a reference needs author i we only have to call 1036need_author(j) for some j >= i such that the reference also needs 1037author j. */ 1038 1039/* This function handles 2 tasks: 1040determine which authors are needed (cannot be elided with et al.); 1041determine which authors can have only last names in the labels. 1042 1043References >= start and < end have the same first i author names. 1044Also they're sorted by A+. */ 1045 1046static void consider_authors(reference **start, reference **end, int i) 1047{ 1048 if (start >= end) 1049 return; 1050 reference **p = start; 1051 if (i >= (*p)->get_nauthors()) { 1052 for (++p; p < end && i >= (*p)->get_nauthors(); p++) 1053 ; 1054 if (p < end && i > 0) { 1055 // If we have an author list <A B C> and an author list <A B C D>, 1056 // then both lists need C. 1057 for (reference **q = start; q < end; q++) 1058 (*q)->need_author(i - 1); 1059 } 1060 start = p; 1061 } 1062 while (p < end) { 1063 reference **last_name_start = p; 1064 reference **name_start = p; 1065 for (++p; 1066 p < end && i < (*p)->get_nauthors() 1067 && same_author_last_name(**last_name_start, **p, i); 1068 p++) { 1069 if (!same_author_name(**name_start, **p, i)) { 1070 consider_authors(name_start, p, i + 1); 1071 name_start = p; 1072 } 1073 } 1074 consider_authors(name_start, p, i + 1); 1075 if (last_name_start == name_start) { 1076 for (reference **q = last_name_start; q < p; q++) 1077 (*q)->set_last_name_unambiguous(i); 1078 } 1079 // If we have an author list <A B C D> and <A B C E>, then the lists 1080 // need author D and E respectively. 1081 if (name_start > start || p < end) { 1082 for (reference **q = last_name_start; q < p; q++) 1083 (*q)->need_author(i); 1084 } 1085 } 1086} 1087 1088int same_author_last_name(const reference &r1, const reference &r2, int n) 1089{ 1090 const char *ae1; 1091 const char *as1 = r1.get_sort_field(0, n, 0, &ae1); 1092 const char *ae2; 1093 const char *as2 = r2.get_sort_field(0, n, 0, &ae2); 1094 if (!as1 && !as2) return 1; // they are the same 1095 if (!as1 || !as2) return 0; 1096 return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0; 1097} 1098 1099int same_author_name(const reference &r1, const reference &r2, int n) 1100{ 1101 const char *ae1; 1102 const char *as1 = r1.get_sort_field(0, n, -1, &ae1); 1103 const char *ae2; 1104 const char *as2 = r2.get_sort_field(0, n, -1, &ae2); 1105 if (!as1 && !as2) return 1; // they are the same 1106 if (!as1 || !as2) return 0; 1107 return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0; 1108} 1109 1110 1111void int_set::set(int i) 1112{ 1113 assert(i >= 0); 1114 int bytei = i >> 3; 1115 if (bytei >= v.length()) { 1116 int old_length = v.length(); 1117 v.set_length(bytei + 1); 1118 for (int j = old_length; j <= bytei; j++) 1119 v[j] = 0; 1120 } 1121 v[bytei] |= 1 << (i & 7); 1122} 1123 1124int int_set::get(int i) const 1125{ 1126 assert(i >= 0); 1127 int bytei = i >> 3; 1128 return bytei >= v.length() ? 0 : (v[bytei] & (1 << (i & 7))) != 0; 1129} 1130 1131void reference::set_last_name_unambiguous(int i) 1132{ 1133 last_name_unambiguous.set(i); 1134} 1135 1136void reference::need_author(int n) 1137{ 1138 if (n > last_needed_author) 1139 last_needed_author = n; 1140} 1141 1142const char *reference::get_authors(const char **end) const 1143{ 1144 if (!computed_authors) { 1145 ((reference *)this)->computed_authors = 1; 1146 string &result = ((reference *)this)->authors; 1147 int na = get_nauthors(); 1148 result.clear(); 1149 for (int i = 0; i < na; i++) { 1150 if (last_name_unambiguous.get(i)) { 1151 const char *e, *start = get_author_last_name(i, &e); 1152 assert(start != 0); 1153 result.append(start, e - start); 1154 } 1155 else { 1156 const char *e, *start = get_author(i, &e); 1157 assert(start != 0); 1158 result.append(start, e - start); 1159 } 1160 if (i == last_needed_author 1161 && et_al.length() > 0 1162 && et_al_min_elide > 0 1163 && last_needed_author + et_al_min_elide < na 1164 && na >= et_al_min_total) { 1165 result += et_al; 1166 break; 1167 } 1168 if (i < na - 1) { 1169 if (na == 2) 1170 result += join_authors_exactly_two; 1171 else if (i < na - 2) 1172 result += join_authors_default; 1173 else 1174 result += join_authors_last_two; 1175 } 1176 } 1177 } 1178 const char *start = authors.contents(); 1179 *end = start + authors.length(); 1180 return start; 1181} 1182 1183int reference::get_nauthors() const 1184{ 1185 if (nauthors < 0) { 1186 const char *dummy; 1187 int na; 1188 for (na = 0; get_author(na, &dummy) != 0; na++) 1189 ; 1190 ((reference *)this)->nauthors = na; 1191 } 1192 return nauthors; 1193} 1194