1/* linebreak.c - line breaking of Unicode strings 2 Copyright (C) 2001-2003, 2006-2007 Free Software Foundation, Inc. 3 Written by Bruno Haible <haible@clisp.cons.org>, 2001. 4 5This program is free software: you can redistribute it and/or modify 6it under the terms of the GNU General Public License as published by 7the Free Software Foundation; either version 3 of the License, or 8(at your option) any later version. 9 10This program is distributed in the hope that it will be useful, 11but WITHOUT ANY WARRANTY; without even the implied warranty of 12MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13GNU General Public License for more details. 14 15You should have received a copy of the GNU General Public License 16along with this program. If not, see <http://www.gnu.org/licenses/>. */ 17 18#include <config.h> 19 20/* Specification. */ 21#include "linebreak.h" 22 23#include <stdlib.h> 24#include <string.h> 25#include "c-ctype.h" 26#include "xsize.h" 27#include "unistr.h" 28#include "uniwidth.h" 29#include "uniwidth/cjk.h" 30#include "streq.h" 31 32 33static int 34is_utf8_encoding (const char *encoding) 35{ 36 if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0 ,0)) 37 return 1; 38 return 0; 39} 40 41 42/* Determine the line break points in S, and store the result at p[0..n-1]. */ 43/* We don't support line breaking of complex-context dependent characters 44 (Thai, Lao, Myanmar, Khmer) yet, because it requires dictionary lookup. */ 45 46/* Line breaking classification. */ 47 48enum 49{ 50 /* Values >= 20 are resolved at run time. */ 51 LBP_BK = 0, /* mandatory break */ 52/*LBP_CR, carriage return - not used here because it's a DOSism */ 53/*LBP_LF, line feed - not used here because it's a DOSism */ 54 LBP_CM = 20, /* attached characters and combining marks */ 55/*LBP_SG, surrogates - not used here because they are not characters */ 56 LBP_ZW = 1, /* zero width space */ 57 LBP_IN = 2, /* inseparable */ 58 LBP_GL = 3, /* non-breaking (glue) */ 59 LBP_CB = 22, /* contingent break opportunity */ 60 LBP_SP = 21, /* space */ 61 LBP_BA = 4, /* break opportunity after */ 62 LBP_BB = 5, /* break opportunity before */ 63 LBP_B2 = 6, /* break opportunity before and after */ 64 LBP_HY = 7, /* hyphen */ 65 LBP_NS = 8, /* non starter */ 66 LBP_OP = 9, /* opening punctuation */ 67 LBP_CL = 10, /* closing punctuation */ 68 LBP_QU = 11, /* ambiguous quotation */ 69 LBP_EX = 12, /* exclamation/interrogation */ 70 LBP_ID = 13, /* ideographic */ 71 LBP_NU = 14, /* numeric */ 72 LBP_IS = 15, /* infix separator (numeric) */ 73 LBP_SY = 16, /* symbols allowing breaks */ 74 LBP_AL = 17, /* ordinary alphabetic and symbol characters */ 75 LBP_PR = 18, /* prefix (numeric) */ 76 LBP_PO = 19, /* postfix (numeric) */ 77 LBP_SA = 23, /* complex context (South East Asian) */ 78 LBP_AI = 24, /* ambiguous (alphabetic or ideograph) */ 79 LBP_XX = 25 /* unknown */ 80}; 81 82#include "lbrkprop.h" 83 84static inline unsigned char 85lbrkprop_lookup (unsigned int uc) 86{ 87 unsigned int index1 = uc >> lbrkprop_header_0; 88 if (index1 < lbrkprop_header_1) 89 { 90 int lookup1 = lbrkprop.level1[index1]; 91 if (lookup1 >= 0) 92 { 93 unsigned int index2 = (uc >> lbrkprop_header_2) & lbrkprop_header_3; 94 int lookup2 = lbrkprop.level2[lookup1 + index2]; 95 if (lookup2 >= 0) 96 { 97 unsigned int index3 = uc & lbrkprop_header_4; 98 return lbrkprop.level3[lookup2 + index3]; 99 } 100 } 101 } 102 return LBP_XX; 103} 104 105/* Table indexed by two line breaking classifications. */ 106#define D 1 /* direct break opportunity, empty in table 7.3 of UTR #14 */ 107#define I 2 /* indirect break opportunity, '%' in table 7.3 of UTR #14 */ 108#define P 3 /* prohibited break, '^' in table 7.3 of UTR #14 */ 109static const unsigned char lbrk_table[19][19] = { 110 /* after */ 111 /* ZW IN GL BA BB B2 HY NS OP CL QU EX ID NU IS SY AL PR PO */ 112/* ZW */ { P, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, }, 113/* IN */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, }, 114/* GL */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, }, 115/* BA */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, }, 116/* BB */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, }, 117/* B2 */ { P, D, I, I, D, P, I, I, D, P, I, P, D, D, P, P, D, D, D, }, 118/* HY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, }, 119/* NS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, }, 120/* OP */ { P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, }, 121/* CL */ { P, D, I, I, D, D, I, P, D, P, I, P, D, D, P, P, D, D, I, }, 122/* QU */ { P, I, I, I, I, I, I, I, P, P, I, P, I, I, P, P, I, I, I, }, 123/* EX */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, }, 124/* ID */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, I, }, 125/* NU */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, I, }, 126/* IS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, }, 127/* SY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, }, 128/* AL */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, D, }, 129/* PR */ { P, D, I, I, D, D, I, I, I, P, I, P, I, I, P, P, I, D, D, }, 130/* PO */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, }, 131/* "" */ 132/* before */ 133}; 134/* Note: The (B2,B2) entry should probably be D instead of P. */ 135/* Note: The (PR,ID) entry should probably be D instead of I. */ 136 137void 138u8_possible_linebreaks (const unsigned char *s, size_t n, const char *encoding, char *p) 139{ 140 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL); 141 const unsigned char *s_end = s + n; 142 int last_prop = LBP_BK; /* line break property of last non-space character */ 143 char *seen_space = NULL; /* Was a space seen after the last non-space character? */ 144 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */ 145 146 /* Don't break inside multibyte characters. */ 147 memset (p, UC_BREAK_PROHIBITED, n); 148 149 while (s < s_end) 150 { 151 unsigned int uc; 152 int count = u8_mbtouc_unsafe (&uc, s, s_end - s); 153 int prop = lbrkprop_lookup (uc); 154 155 if (prop == LBP_BK) 156 { 157 /* Mandatory break. */ 158 *p = UC_BREAK_MANDATORY; 159 last_prop = LBP_BK; 160 seen_space = NULL; 161 seen_space2 = NULL; 162 } 163 else 164 { 165 char *q; 166 167 /* Resolve property values whose behaviour is not fixed. */ 168 switch (prop) 169 { 170 case LBP_AI: 171 /* Resolve ambiguous. */ 172 prop = LBP_AI_REPLACEMENT; 173 break; 174 case LBP_CB: 175 /* This is arbitrary. */ 176 prop = LBP_ID; 177 break; 178 case LBP_SA: 179 /* We don't handle complex scripts yet. 180 Treat LBP_SA like LBP_XX. */ 181 case LBP_XX: 182 /* This is arbitrary. */ 183 prop = LBP_AL; 184 break; 185 } 186 187 /* Deal with combining characters. */ 188 q = p; 189 if (prop == LBP_CM) 190 { 191 /* Don't break just before a combining character. */ 192 *p = UC_BREAK_PROHIBITED; 193 /* A combining character turns a preceding space into LBP_AL. */ 194 if (seen_space != NULL) 195 { 196 q = seen_space; 197 seen_space = seen_space2; 198 prop = LBP_AL; 199 goto lookup_via_table; 200 } 201 } 202 else if (prop == LBP_SP) 203 { 204 /* Don't break just before a space. */ 205 *p = UC_BREAK_PROHIBITED; 206 seen_space2 = seen_space; 207 seen_space = p; 208 } 209 else 210 { 211 lookup_via_table: 212 /* prop must be usable as an index for table 7.3 of UTR #14. */ 213 if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0]))) 214 abort (); 215 216 if (last_prop == LBP_BK) 217 { 218 /* Don't break at the beginning of a line. */ 219 *q = UC_BREAK_PROHIBITED; 220 } 221 else 222 { 223 switch (lbrk_table [last_prop-1] [prop-1]) 224 { 225 case D: 226 *q = UC_BREAK_POSSIBLE; 227 break; 228 case I: 229 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED); 230 break; 231 case P: 232 *q = UC_BREAK_PROHIBITED; 233 break; 234 default: 235 abort (); 236 } 237 } 238 last_prop = prop; 239 seen_space = NULL; 240 seen_space2 = NULL; 241 } 242 } 243 244 s += count; 245 p += count; 246 } 247} 248 249#ifdef unused 250 251void 252u16_possible_linebreaks (const unsigned short *s, size_t n, const char *encoding, char *p) 253{ 254 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL); 255 const unsigned short *s_end = s + n; 256 int last_prop = LBP_BK; /* line break property of last non-space character */ 257 char *seen_space = NULL; /* Was a space seen after the last non-space character? */ 258 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */ 259 260 /* Don't break inside multibyte characters. */ 261 memset (p, UC_BREAK_PROHIBITED, n); 262 263 while (s < s_end) 264 { 265 unsigned int uc; 266 int count = u16_mbtouc_unsafe (&uc, s, s_end - s); 267 int prop = lbrkprop_lookup (uc); 268 269 if (prop == LBP_BK) 270 { 271 /* Mandatory break. */ 272 *p = UC_BREAK_MANDATORY; 273 last_prop = LBP_BK; 274 seen_space = NULL; 275 seen_space2 = NULL; 276 } 277 else 278 { 279 char *q; 280 281 /* Resolve property values whose behaviour is not fixed. */ 282 switch (prop) 283 { 284 case LBP_AI: 285 /* Resolve ambiguous. */ 286 prop = LBP_AI_REPLACEMENT; 287 break; 288 case LBP_CB: 289 /* This is arbitrary. */ 290 prop = LBP_ID; 291 break; 292 case LBP_SA: 293 /* We don't handle complex scripts yet. 294 Treat LBP_SA like LBP_XX. */ 295 case LBP_XX: 296 /* This is arbitrary. */ 297 prop = LBP_AL; 298 break; 299 } 300 301 /* Deal with combining characters. */ 302 q = p; 303 if (prop == LBP_CM) 304 { 305 /* Don't break just before a combining character. */ 306 *p = UC_BREAK_PROHIBITED; 307 /* A combining character turns a preceding space into LBP_AL. */ 308 if (seen_space != NULL) 309 { 310 q = seen_space; 311 seen_space = seen_space2; 312 prop = LBP_AL; 313 goto lookup_via_table; 314 } 315 } 316 else if (prop == LBP_SP) 317 { 318 /* Don't break just before a space. */ 319 *p = UC_BREAK_PROHIBITED; 320 seen_space2 = seen_space; 321 seen_space = p; 322 } 323 else 324 { 325 lookup_via_table: 326 /* prop must be usable as an index for table 7.3 of UTR #14. */ 327 if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0]))) 328 abort (); 329 330 if (last_prop == LBP_BK) 331 { 332 /* Don't break at the beginning of a line. */ 333 *q = UC_BREAK_PROHIBITED; 334 } 335 else 336 { 337 switch (lbrk_table [last_prop-1] [prop-1]) 338 { 339 case D: 340 *q = UC_BREAK_POSSIBLE; 341 break; 342 case I: 343 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED); 344 break; 345 case P: 346 *q = UC_BREAK_PROHIBITED; 347 break; 348 default: 349 abort (); 350 } 351 } 352 last_prop = prop; 353 seen_space = NULL; 354 seen_space2 = NULL; 355 } 356 } 357 358 s += count; 359 p += count; 360 } 361} 362 363void 364u32_possible_linebreaks (const unsigned int *s, size_t n, const char *encoding, char *p) 365{ 366 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL); 367 const unsigned int *s_end = s + n; 368 int last_prop = LBP_BK; /* line break property of last non-space character */ 369 char *seen_space = NULL; /* Was a space seen after the last non-space character? */ 370 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */ 371 372 while (s < s_end) 373 { 374 unsigned int uc = *s; 375 int prop = lbrkprop_lookup (uc); 376 377 if (prop == LBP_BK) 378 { 379 /* Mandatory break. */ 380 *p = UC_BREAK_MANDATORY; 381 last_prop = LBP_BK; 382 seen_space = NULL; 383 seen_space2 = NULL; 384 } 385 else 386 { 387 char *q; 388 389 /* Resolve property values whose behaviour is not fixed. */ 390 switch (prop) 391 { 392 case LBP_AI: 393 /* Resolve ambiguous. */ 394 prop = LBP_AI_REPLACEMENT; 395 break; 396 case LBP_CB: 397 /* This is arbitrary. */ 398 prop = LBP_ID; 399 break; 400 case LBP_SA: 401 /* We don't handle complex scripts yet. 402 Treat LBP_SA like LBP_XX. */ 403 case LBP_XX: 404 /* This is arbitrary. */ 405 prop = LBP_AL; 406 break; 407 } 408 409 /* Deal with combining characters. */ 410 q = p; 411 if (prop == LBP_CM) 412 { 413 /* Don't break just before a combining character. */ 414 *p = UC_BREAK_PROHIBITED; 415 /* A combining character turns a preceding space into LBP_AL. */ 416 if (seen_space != NULL) 417 { 418 q = seen_space; 419 seen_space = seen_space2; 420 prop = LBP_AL; 421 goto lookup_via_table; 422 } 423 } 424 else if (prop == LBP_SP) 425 { 426 /* Don't break just before a space. */ 427 *p = UC_BREAK_PROHIBITED; 428 seen_space2 = seen_space; 429 seen_space = p; 430 } 431 else 432 { 433 lookup_via_table: 434 /* prop must be usable as an index for table 7.3 of UTR #14. */ 435 if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0]))) 436 abort (); 437 438 if (last_prop == LBP_BK) 439 { 440 /* Don't break at the beginning of a line. */ 441 *q = UC_BREAK_PROHIBITED; 442 } 443 else 444 { 445 switch (lbrk_table [last_prop-1] [prop-1]) 446 { 447 case D: 448 *q = UC_BREAK_POSSIBLE; 449 break; 450 case I: 451 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED); 452 break; 453 case P: 454 *q = UC_BREAK_PROHIBITED; 455 break; 456 default: 457 abort (); 458 } 459 } 460 last_prop = prop; 461 seen_space = NULL; 462 seen_space2 = NULL; 463 } 464 } 465 466 s++; 467 p++; 468 } 469} 470 471#endif 472 473 474/* Choose the best line breaks, assuming the uc_width function. 475 Return the column after the end of the string. */ 476 477int 478u8_width_linebreaks (const unsigned char *s, size_t n, 479 int width, int start_column, int at_end_columns, 480 const char *o, const char *encoding, 481 char *p) 482{ 483 const unsigned char *s_end; 484 char *last_p; 485 int last_column; 486 int piece_width; 487 488 u8_possible_linebreaks (s, n, encoding, p); 489 490 s_end = s + n; 491 last_p = NULL; 492 last_column = start_column; 493 piece_width = 0; 494 while (s < s_end) 495 { 496 unsigned int uc; 497 int count = u8_mbtouc_unsafe (&uc, s, s_end - s); 498 499 /* Respect the override. */ 500 if (o != NULL && *o != UC_BREAK_UNDEFINED) 501 *p = *o; 502 503 if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY) 504 { 505 /* An atomic piece of text ends here. */ 506 if (last_p != NULL && last_column + piece_width > width) 507 { 508 /* Insert a line break. */ 509 *last_p = UC_BREAK_POSSIBLE; 510 last_column = 0; 511 } 512 } 513 514 if (*p == UC_BREAK_MANDATORY) 515 { 516 /* uc is a line break character. */ 517 /* Start a new piece at column 0. */ 518 last_p = NULL; 519 last_column = 0; 520 piece_width = 0; 521 } 522 else 523 { 524 /* uc is not a line break character. */ 525 int w; 526 527 if (*p == UC_BREAK_POSSIBLE) 528 { 529 /* Start a new piece. */ 530 last_p = p; 531 last_column += piece_width; 532 piece_width = 0; 533 /* No line break for the moment, may be turned into 534 UC_BREAK_POSSIBLE later, via last_p. */ 535 } 536 537 *p = UC_BREAK_PROHIBITED; 538 539 w = uc_width (uc, encoding); 540 if (w >= 0) /* ignore control characters in the string */ 541 piece_width += w; 542 } 543 544 s += count; 545 p += count; 546 if (o != NULL) 547 o += count; 548 } 549 550 /* The last atomic piece of text ends here. */ 551 if (last_p != NULL && last_column + piece_width + at_end_columns > width) 552 { 553 /* Insert a line break. */ 554 *last_p = UC_BREAK_POSSIBLE; 555 last_column = 0; 556 } 557 558 return last_column + piece_width; 559} 560 561#ifdef unused 562 563int 564u16_width_linebreaks (const unsigned short *s, size_t n, 565 int width, int start_column, int at_end_columns, 566 const char *o, const char *encoding, 567 char *p) 568{ 569 const unsigned short *s_end; 570 char *last_p; 571 int last_column; 572 int piece_width; 573 574 u16_possible_linebreaks (s, n, encoding, p); 575 576 s_end = s + n; 577 last_p = NULL; 578 last_column = start_column; 579 piece_width = 0; 580 while (s < s_end) 581 { 582 unsigned int uc; 583 int count = u16_mbtouc_unsafe (&uc, s, s_end - s); 584 585 /* Respect the override. */ 586 if (o != NULL && *o != UC_BREAK_UNDEFINED) 587 *p = *o; 588 589 if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY) 590 { 591 /* An atomic piece of text ends here. */ 592 if (last_p != NULL && last_column + piece_width > width) 593 { 594 /* Insert a line break. */ 595 *last_p = UC_BREAK_POSSIBLE; 596 last_column = 0; 597 } 598 } 599 600 if (*p == UC_BREAK_MANDATORY) 601 { 602 /* uc is a line break character. */ 603 /* Start a new piece at column 0. */ 604 last_p = NULL; 605 last_column = 0; 606 piece_width = 0; 607 } 608 else 609 { 610 /* uc is not a line break character. */ 611 int w; 612 613 if (*p == UC_BREAK_POSSIBLE) 614 { 615 /* Start a new piece. */ 616 last_p = p; 617 last_column += piece_width; 618 piece_width = 0; 619 /* No line break for the moment, may be turned into 620 UC_BREAK_POSSIBLE later, via last_p. */ 621 } 622 623 *p = UC_BREAK_PROHIBITED; 624 625 w = uc_width (uc, encoding); 626 if (w >= 0) /* ignore control characters in the string */ 627 piece_width += w; 628 } 629 630 s += count; 631 p += count; 632 if (o != NULL) 633 o += count; 634 } 635 636 /* The last atomic piece of text ends here. */ 637 if (last_p != NULL && last_column + piece_width + at_end_columns > width) 638 { 639 /* Insert a line break. */ 640 *last_p = UC_BREAK_POSSIBLE; 641 last_column = 0; 642 } 643 644 return last_column + piece_width; 645} 646 647int 648u32_width_linebreaks (const unsigned int *s, size_t n, 649 int width, int start_column, int at_end_columns, 650 const char *o, const char *encoding, 651 char *p) 652{ 653 const unsigned int *s_end; 654 char *last_p; 655 int last_column; 656 int piece_width; 657 658 u32_possible_linebreaks (s, n, encoding, p); 659 660 s_end = s + n; 661 last_p = NULL; 662 last_column = start_column; 663 piece_width = 0; 664 while (s < s_end) 665 { 666 unsigned int uc = *s; 667 668 /* Respect the override. */ 669 if (o != NULL && *o != UC_BREAK_UNDEFINED) 670 *p = *o; 671 672 if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY) 673 { 674 /* An atomic piece of text ends here. */ 675 if (last_p != NULL && last_column + piece_width > width) 676 { 677 /* Insert a line break. */ 678 *last_p = UC_BREAK_POSSIBLE; 679 last_column = 0; 680 } 681 } 682 683 if (*p == UC_BREAK_MANDATORY) 684 { 685 /* uc is a line break character. */ 686 /* Start a new piece at column 0. */ 687 last_p = NULL; 688 last_column = 0; 689 piece_width = 0; 690 } 691 else 692 { 693 /* uc is not a line break character. */ 694 int w; 695 696 if (*p == UC_BREAK_POSSIBLE) 697 { 698 /* Start a new piece. */ 699 last_p = p; 700 last_column += piece_width; 701 piece_width = 0; 702 /* No line break for the moment, may be turned into 703 UC_BREAK_POSSIBLE later, via last_p. */ 704 } 705 706 *p = UC_BREAK_PROHIBITED; 707 708 w = uc_width (uc, encoding); 709 if (w >= 0) /* ignore control characters in the string */ 710 piece_width += w; 711 } 712 713 s++; 714 p++; 715 if (o != NULL) 716 o++; 717 } 718 719 /* The last atomic piece of text ends here. */ 720 if (last_p != NULL && last_column + piece_width + at_end_columns > width) 721 { 722 /* Insert a line break. */ 723 *last_p = UC_BREAK_POSSIBLE; 724 last_column = 0; 725 } 726 727 return last_column + piece_width; 728} 729 730#endif 731 732 733#ifdef TEST1 734 735#include <stdio.h> 736 737/* Read the contents of an input stream, and return it, terminated with a NUL 738 byte. */ 739char * 740read_file (FILE *stream) 741{ 742#define BUFSIZE 4096 743 char *buf = NULL; 744 int alloc = 0; 745 int size = 0; 746 int count; 747 748 while (! feof (stream)) 749 { 750 if (size + BUFSIZE > alloc) 751 { 752 alloc = alloc + alloc / 2; 753 if (alloc < size + BUFSIZE) 754 alloc = size + BUFSIZE; 755 buf = realloc (buf, alloc); 756 if (buf == NULL) 757 { 758 fprintf (stderr, "out of memory\n"); 759 exit (1); 760 } 761 } 762 count = fread (buf + size, 1, BUFSIZE, stream); 763 if (count == 0) 764 { 765 if (ferror (stream)) 766 { 767 perror ("fread"); 768 exit (1); 769 } 770 } 771 else 772 size += count; 773 } 774 buf = realloc (buf, size + 1); 775 if (buf == NULL) 776 { 777 fprintf (stderr, "out of memory\n"); 778 exit (1); 779 } 780 buf[size] = '\0'; 781 return buf; 782#undef BUFSIZE 783} 784 785int 786main (int argc, char * argv[]) 787{ 788 if (argc == 1) 789 { 790 /* Display all the break opportunities in the input string. */ 791 char *input = read_file (stdin); 792 int length = strlen (input); 793 char *breaks = malloc (length); 794 int i; 795 796 u8_possible_linebreaks ((unsigned char *) input, length, "UTF-8", breaks); 797 798 for (i = 0; i < length; i++) 799 { 800 switch (breaks[i]) 801 { 802 case UC_BREAK_POSSIBLE: 803 /* U+2027 in UTF-8 encoding */ 804 putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout); 805 break; 806 case UC_BREAK_MANDATORY: 807 /* U+21B2 (or U+21B5) in UTF-8 encoding */ 808 putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout); 809 break; 810 case UC_BREAK_PROHIBITED: 811 break; 812 default: 813 abort (); 814 } 815 putc (input[i], stdout); 816 } 817 818 free (breaks); 819 820 return 0; 821 } 822 else if (argc == 2) 823 { 824 /* Insert line breaks for a given width. */ 825 int width = atoi (argv[1]); 826 char *input = read_file (stdin); 827 int length = strlen (input); 828 char *breaks = malloc (length); 829 int i; 830 831 u8_width_linebreaks ((unsigned char *) input, length, width, 0, 0, NULL, "UTF-8", breaks); 832 833 for (i = 0; i < length; i++) 834 { 835 switch (breaks[i]) 836 { 837 case UC_BREAK_POSSIBLE: 838 putc ('\n', stdout); 839 break; 840 case UC_BREAK_MANDATORY: 841 break; 842 case UC_BREAK_PROHIBITED: 843 break; 844 default: 845 abort (); 846 } 847 putc (input[i], stdout); 848 } 849 850 free (breaks); 851 852 return 0; 853 } 854 else 855 return 1; 856} 857 858#endif /* TEST1 */ 859 860 861/* Now the same thing with an arbitrary encoding. 862 863 We convert the input string to Unicode. 864 865 The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16, 866 UTF-16BE, UTF-16LE, UTF-7. UCS-2 supports only characters up to 867 \U0000FFFF. UTF-16 and variants support only characters up to 868 \U0010FFFF. UTF-7 is way too complex and not supported by glibc-2.1. 869 UCS-4 specification leaves doubts about endianness and byte order mark. 870 glibc currently interprets it as big endian without byte order mark, 871 but this is not backed by an RFC. So we use UTF-8. It supports 872 characters up to \U7FFFFFFF and is unambiguously defined. */ 873 874#if HAVE_ICONV 875 876#include <iconv.h> 877#include <errno.h> 878 879/* Luckily, the encoding's name is platform independent. */ 880#define UTF8_NAME "UTF-8" 881 882/* Return the length of a string after conversion through an iconv_t. */ 883static size_t 884iconv_string_length (iconv_t cd, const char *s, size_t n) 885{ 886#define TMPBUFSIZE 4096 887 size_t count = 0; 888 char tmpbuf[TMPBUFSIZE]; 889 const char *inptr = s; 890 size_t insize = n; 891 while (insize > 0) 892 { 893 char *outptr = tmpbuf; 894 size_t outsize = TMPBUFSIZE; 895 size_t res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize); 896 if (res == (size_t)(-1) && errno != E2BIG) 897 return (size_t)(-1); 898 count += outptr - tmpbuf; 899 } 900 /* Avoid glibc-2.1 bug and Solaris 7 through 9 bug. */ 901#if defined _LIBICONV_VERSION \ 902 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun) 903 { 904 char *outptr = tmpbuf; 905 size_t outsize = TMPBUFSIZE; 906 size_t res = iconv (cd, NULL, NULL, &outptr, &outsize); 907 if (res == (size_t)(-1)) 908 return (size_t)(-1); 909 count += outptr - tmpbuf; 910 } 911 /* Return to the initial state. */ 912 iconv (cd, NULL, NULL, NULL, NULL); 913#endif 914 return count; 915#undef TMPBUFSIZE 916} 917 918static void 919iconv_string_keeping_offsets (iconv_t cd, const char *s, size_t n, 920 size_t *offtable, char *t, size_t m) 921{ 922 size_t i; 923 const char *s_end; 924 const char *inptr; 925 char *outptr; 926 size_t outsize; 927 /* Avoid glibc-2.1 bug. */ 928#if !defined _LIBICONV_VERSION && (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) 929 const size_t extra = 1; 930#else 931 const size_t extra = 0; 932#endif 933 934 for (i = 0; i < n; i++) 935 offtable[i] = (size_t)(-1); 936 937 s_end = s + n; 938 inptr = s; 939 outptr = t; 940 outsize = m + extra; 941 while (inptr < s_end) 942 { 943 const char *saved_inptr; 944 size_t insize; 945 size_t res; 946 947 offtable[inptr - s] = outptr - t; 948 949 saved_inptr = inptr; 950 res = (size_t)(-1); 951 for (insize = 1; inptr + insize <= s_end; insize++) 952 { 953 res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize); 954 if (!(res == (size_t)(-1) && errno == EINVAL)) 955 break; 956 /* We expect that no input bytes have been consumed so far. */ 957 if (inptr != saved_inptr) 958 abort (); 959 } 960 /* After we verified the convertibility and computed the translation's 961 size m, there shouldn't be any conversion error here. */ 962 if (res == (size_t)(-1)) 963 abort (); 964 } 965 /* Avoid glibc-2.1 bug and Solaris 7 bug. */ 966#if defined _LIBICONV_VERSION \ 967 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun) 968 if (iconv (cd, NULL, NULL, &outptr, &outsize) == (size_t)(-1)) 969 abort (); 970#endif 971 /* We should have produced exactly m output bytes. */ 972 if (outsize != extra) 973 abort (); 974} 975 976#endif /* HAVE_ICONV */ 977 978#if C_CTYPE_ASCII 979 980/* Tests whether a string is entirely ASCII. Returns 1 if yes. 981 Returns 0 if the string is in an 8-bit encoding or an ISO-2022 encoding. */ 982static int 983is_all_ascii (const char *s, size_t n) 984{ 985 for (; n > 0; s++, n--) 986 { 987 unsigned char c = (unsigned char) *s; 988 989 if (!(c_isprint (c) || c_isspace (c))) 990 return 0; 991 } 992 return 1; 993} 994 995#endif /* C_CTYPE_ASCII */ 996 997#if defined unused || defined TEST2 998 999void 1000mbs_possible_linebreaks (const char *s, size_t n, const char *encoding, 1001 char *p) 1002{ 1003 if (n == 0) 1004 return; 1005 if (is_utf8_encoding (encoding)) 1006 u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p); 1007 else 1008 { 1009#if HAVE_ICONV 1010 iconv_t to_utf8; 1011 /* Avoid glibc-2.1 bug with EUC-KR. */ 1012# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION 1013 if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)) 1014 to_utf8 = (iconv_t)(-1); 1015 else 1016# endif 1017 /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK, 1018 GB18030. */ 1019# if defined __sun && !defined _LIBICONV_VERSION 1020 if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0) 1021 || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0) 1022 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0) 1023 || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C') 1024 || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0) 1025 || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0)) 1026 to_utf8 = (iconv_t)(-1); 1027 else 1028# endif 1029 to_utf8 = iconv_open (UTF8_NAME, encoding); 1030 if (to_utf8 != (iconv_t)(-1)) 1031 { 1032 /* Determine the length of the resulting UTF-8 string. */ 1033 size_t m = iconv_string_length (to_utf8, s, n); 1034 if (m != (size_t)(-1)) 1035 { 1036 /* Convert the string to UTF-8 and build a translation table 1037 from offsets into s to offsets into the translated string. */ 1038 size_t memory_size = xsum3 (xtimes (n, sizeof (size_t)), m, m); 1039 char *memory = 1040 (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL); 1041 if (memory != NULL) 1042 { 1043 size_t *offtable = (size_t *) memory; 1044 char *t = (char *) (offtable + n); 1045 char *q = (char *) (t + m); 1046 size_t i; 1047 1048 iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m); 1049 1050 /* Determine the possible line breaks of the UTF-8 string. */ 1051 u8_possible_linebreaks ((const unsigned char *) t, m, encoding, q); 1052 1053 /* Translate the result back to the original string. */ 1054 memset (p, UC_BREAK_PROHIBITED, n); 1055 for (i = 0; i < n; i++) 1056 if (offtable[i] != (size_t)(-1)) 1057 p[i] = q[offtable[i]]; 1058 1059 free (memory); 1060 iconv_close (to_utf8); 1061 return; 1062 } 1063 } 1064 iconv_close (to_utf8); 1065 } 1066#endif 1067 /* Impossible to convert. */ 1068#if C_CTYPE_ASCII 1069 if (is_all_ascii (s, n)) 1070 { 1071 /* ASCII is a subset of UTF-8. */ 1072 u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p); 1073 return; 1074 } 1075#endif 1076 /* We have a non-ASCII string and cannot convert it. 1077 Don't produce line breaks except those already present in the 1078 input string. All we assume here is that the encoding is 1079 minimally ASCII compatible. */ 1080 { 1081 const char *s_end = s + n; 1082 while (s < s_end) 1083 { 1084 *p = (*s == '\n' ? UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED); 1085 s++; 1086 p++; 1087 } 1088 } 1089 } 1090} 1091 1092#endif 1093 1094int 1095mbs_width_linebreaks (const char *s, size_t n, 1096 int width, int start_column, int at_end_columns, 1097 const char *o, const char *encoding, 1098 char *p) 1099{ 1100 if (n == 0) 1101 return start_column; 1102 if (is_utf8_encoding (encoding)) 1103 return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p); 1104 else 1105 { 1106#if HAVE_ICONV 1107 iconv_t to_utf8; 1108 /* Avoid glibc-2.1 bug with EUC-KR. */ 1109# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION 1110 if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)) 1111 to_utf8 = (iconv_t)(-1); 1112 else 1113# endif 1114 /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK, 1115 GB18030. */ 1116# if defined __sun && !defined _LIBICONV_VERSION 1117 if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0) 1118 || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0) 1119 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0) 1120 || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C') 1121 || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0) 1122 || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0)) 1123 to_utf8 = (iconv_t)(-1); 1124 else 1125# endif 1126 to_utf8 = iconv_open (UTF8_NAME, encoding); 1127 if (to_utf8 != (iconv_t)(-1)) 1128 { 1129 /* Determine the length of the resulting UTF-8 string. */ 1130 size_t m = iconv_string_length (to_utf8, s, n); 1131 if (m != (size_t)(-1)) 1132 { 1133 /* Convert the string to UTF-8 and build a translation table 1134 from offsets into s to offsets into the translated string. */ 1135 size_t memory_size = 1136 xsum4 (xtimes (n, sizeof (size_t)), m, m, 1137 (o != NULL ? m : 0)); 1138 char *memory = 1139 (char *) 1140 (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL); 1141 if (memory != NULL) 1142 { 1143 size_t *offtable = (size_t *) memory; 1144 char *t = (char *) (offtable + n); 1145 char *q = (char *) (t + m); 1146 char *o8 = (o != NULL ? (char *) (q + m) : NULL); 1147 int res_column; 1148 size_t i; 1149 1150 iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m); 1151 1152 /* Translate the overrides to the UTF-8 string. */ 1153 if (o != NULL) 1154 { 1155 memset (o8, UC_BREAK_UNDEFINED, m); 1156 for (i = 0; i < n; i++) 1157 if (offtable[i] != (size_t)(-1)) 1158 o8[offtable[i]] = o[i]; 1159 } 1160 1161 /* Determine the line breaks of the UTF-8 string. */ 1162 res_column = 1163 u8_width_linebreaks ((const unsigned char *) t, m, width, start_column, at_end_columns, o8, encoding, q); 1164 1165 /* Translate the result back to the original string. */ 1166 memset (p, UC_BREAK_PROHIBITED, n); 1167 for (i = 0; i < n; i++) 1168 if (offtable[i] != (size_t)(-1)) 1169 p[i] = q[offtable[i]]; 1170 1171 free (memory); 1172 iconv_close (to_utf8); 1173 return res_column; 1174 } 1175 } 1176 iconv_close (to_utf8); 1177 } 1178#endif 1179 /* Impossible to convert. */ 1180#if C_CTYPE_ASCII 1181 if (is_all_ascii (s, n)) 1182 { 1183 /* ASCII is a subset of UTF-8. */ 1184 return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p); 1185 } 1186#endif 1187 /* We have a non-ASCII string and cannot convert it. 1188 Don't produce line breaks except those already present in the 1189 input string. All we assume here is that the encoding is 1190 minimally ASCII compatible. */ 1191 { 1192 const char *s_end = s + n; 1193 while (s < s_end) 1194 { 1195 *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n' 1196 ? UC_BREAK_MANDATORY 1197 : UC_BREAK_PROHIBITED); 1198 s++; 1199 p++; 1200 if (o != NULL) 1201 o++; 1202 } 1203 /* We cannot compute widths in this case. */ 1204 return start_column; 1205 } 1206 } 1207} 1208 1209 1210#ifdef TEST2 1211 1212#include <stdio.h> 1213#include <locale.h> 1214 1215/* Read the contents of an input stream, and return it, terminated with a NUL 1216 byte. */ 1217char * 1218read_file (FILE *stream) 1219{ 1220#define BUFSIZE 4096 1221 char *buf = NULL; 1222 int alloc = 0; 1223 int size = 0; 1224 int count; 1225 1226 while (! feof (stream)) 1227 { 1228 if (size + BUFSIZE > alloc) 1229 { 1230 alloc = alloc + alloc / 2; 1231 if (alloc < size + BUFSIZE) 1232 alloc = size + BUFSIZE; 1233 buf = realloc (buf, alloc); 1234 if (buf == NULL) 1235 { 1236 fprintf (stderr, "out of memory\n"); 1237 exit (1); 1238 } 1239 } 1240 count = fread (buf + size, 1, BUFSIZE, stream); 1241 if (count == 0) 1242 { 1243 if (ferror (stream)) 1244 { 1245 perror ("fread"); 1246 exit (1); 1247 } 1248 } 1249 else 1250 size += count; 1251 } 1252 buf = realloc (buf, size + 1); 1253 if (buf == NULL) 1254 { 1255 fprintf (stderr, "out of memory\n"); 1256 exit (1); 1257 } 1258 buf[size] = '\0'; 1259 return buf; 1260#undef BUFSIZE 1261} 1262 1263int 1264main (int argc, char * argv[]) 1265{ 1266 setlocale (LC_CTYPE, ""); 1267 if (argc == 1) 1268 { 1269 /* Display all the break opportunities in the input string. */ 1270 char *input = read_file (stdin); 1271 int length = strlen (input); 1272 char *breaks = malloc (length); 1273 int i; 1274 1275 mbs_possible_linebreaks (input, length, locale_charset (), breaks); 1276 1277 for (i = 0; i < length; i++) 1278 { 1279 switch (breaks[i]) 1280 { 1281 case UC_BREAK_POSSIBLE: 1282 putc ('|', stdout); 1283 break; 1284 case UC_BREAK_MANDATORY: 1285 break; 1286 case UC_BREAK_PROHIBITED: 1287 break; 1288 default: 1289 abort (); 1290 } 1291 putc (input[i], stdout); 1292 } 1293 1294 free (breaks); 1295 1296 return 0; 1297 } 1298 else if (argc == 2) 1299 { 1300 /* Insert line breaks for a given width. */ 1301 int width = atoi (argv[1]); 1302 char *input = read_file (stdin); 1303 int length = strlen (input); 1304 char *breaks = malloc (length); 1305 int i; 1306 1307 mbs_width_linebreaks (input, length, width, 0, 0, NULL, locale_charset (), breaks); 1308 1309 for (i = 0; i < length; i++) 1310 { 1311 switch (breaks[i]) 1312 { 1313 case UC_BREAK_POSSIBLE: 1314 putc ('\n', stdout); 1315 break; 1316 case UC_BREAK_MANDATORY: 1317 break; 1318 case UC_BREAK_PROHIBITED: 1319 break; 1320 default: 1321 abort (); 1322 } 1323 putc (input[i], stdout); 1324 } 1325 1326 free (breaks); 1327 1328 return 0; 1329 } 1330 else 1331 return 1; 1332} 1333 1334#endif /* TEST2 */ 1335