• Home
  • History
  • Annotate
  • Line#
  • Navigate
  • Raw
  • Download
  • only in /netgear-WNDR4500v2-V1.0.0.60_1.0.38/ap/gpl/timemachine/gettext-0.17/gettext-tools/gnulib-lib/
1/* linebreak.c - line breaking of Unicode strings
2   Copyright (C) 2001-2003, 2006-2007 Free Software Foundation, Inc.
3   Written by Bruno Haible <haible@clisp.cons.org>, 2001.
4
5This program is free software: you can redistribute it and/or modify
6it under the terms of the GNU General Public License as published by
7the Free Software Foundation; either version 3 of the License, or
8(at your option) any later version.
9
10This program is distributed in the hope that it will be useful,
11but WITHOUT ANY WARRANTY; without even the implied warranty of
12MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13GNU General Public License for more details.
14
15You should have received a copy of the GNU General Public License
16along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
17
18#include <config.h>
19
20/* Specification.  */
21#include "linebreak.h"
22
23#include <stdlib.h>
24#include <string.h>
25#include "c-ctype.h"
26#include "xsize.h"
27#include "unistr.h"
28#include "uniwidth.h"
29#include "uniwidth/cjk.h"
30#include "streq.h"
31
32
/* Return 1 if the STREQ macro reports ENCODING as equal to "UTF-8",
   and 0 otherwise.  */
static int
is_utf8_encoding (const char *encoding)
{
  return (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0)
          ? 1
          : 0);
}
40
41
42/* Determine the line break points in S, and store the result at p[0..n-1].  */
43/* We don't support line breaking of complex-context dependent characters
44   (Thai, Lao, Myanmar, Khmer) yet, because it requires dictionary lookup. */
45
/* Line breaking classification.
   The enumerators are listed in numeric order.  The classes CR (carriage
   return) and LF (line feed) are not used here because they are DOSisms,
   and SG (surrogates) is not used because surrogates are not characters.  */

enum
{
  LBP_BK =  0, /* mandatory break */
  LBP_ZW =  1, /* zero width space */
  LBP_IN =  2, /* inseparable */
  LBP_GL =  3, /* non-breaking (glue) */
  LBP_BA =  4, /* break opportunity after */
  LBP_BB =  5, /* break opportunity before */
  LBP_B2 =  6, /* break opportunity before and after */
  LBP_HY =  7, /* hyphen */
  LBP_NS =  8, /* non starter */
  LBP_OP =  9, /* opening punctuation */
  LBP_CL = 10, /* closing punctuation */
  LBP_QU = 11, /* ambiguous quotation */
  LBP_EX = 12, /* exclamation/interrogation */
  LBP_ID = 13, /* ideographic */
  LBP_NU = 14, /* numeric */
  LBP_IS = 15, /* infix separator (numeric) */
  LBP_SY = 16, /* symbols allowing breaks */
  LBP_AL = 17, /* ordinary alphabetic and symbol characters */
  LBP_PR = 18, /* prefix (numeric) */
  LBP_PO = 19, /* postfix (numeric) */

  /* Values >= 20 are resolved at run time. */
  LBP_CM = 20, /* attached characters and combining marks */
  LBP_SP = 21, /* space */
  LBP_CB = 22, /* contingent break opportunity */
  LBP_SA = 23, /* complex context (South East Asian) */
  LBP_AI = 24, /* ambiguous (alphabetic or ideograph) */
  LBP_XX = 25  /* unknown */
};
81
82#include "lbrkprop.h"
83
84static inline unsigned char
85lbrkprop_lookup (unsigned int uc)
86{
87  unsigned int index1 = uc >> lbrkprop_header_0;
88  if (index1 < lbrkprop_header_1)
89    {
90      int lookup1 = lbrkprop.level1[index1];
91      if (lookup1 >= 0)
92        {
93          unsigned int index2 = (uc >> lbrkprop_header_2) & lbrkprop_header_3;
94          int lookup2 = lbrkprop.level2[lookup1 + index2];
95          if (lookup2 >= 0)
96            {
97              unsigned int index3 = uc & lbrkprop_header_4;
98              return lbrkprop.level3[lookup2 + index3];
99            }
100        }
101    }
102  return LBP_XX;
103}
104
/* Table indexed by two line breaking classifications: the row is the class
   of the character before the candidate break, the column the class of the
   character after it.  Both indices are the LBP_* value minus 1.  */
#define D 1  /* direct break opportunity, empty in table 7.3 of UTR #14 */
#define I 2  /* indirect break opportunity, '%' in table 7.3 of UTR #14 */
#define P 3  /* prohibited break,           '^' in table 7.3 of UTR #14 */
static const unsigned char lbrk_table[19][19] = {
  /* after:
    ZW IN GL BA BB B2 HY NS OP CL QU EX ID NU IS SY AL PR PO      before: */
  { P, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D },  /* ZW */
  { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D },  /* IN */
  { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I },  /* GL */
  { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D },  /* BA */
  { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I },  /* BB */
  { P, D, I, I, D, P, I, I, D, P, I, P, D, D, P, P, D, D, D },  /* B2 */
  { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D },  /* HY */
  { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D },  /* NS */
  { P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P },  /* OP */
  { P, D, I, I, D, D, I, P, D, P, I, P, D, D, P, P, D, D, I },  /* CL */
  { P, I, I, I, I, I, I, I, P, P, I, P, I, I, P, P, I, I, I },  /* QU */
  { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D },  /* EX */
  { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, I },  /* ID */
  { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, I },  /* NU */
  { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D },  /* IS */
  { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D },  /* SY */
  { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, D },  /* AL */
  { P, D, I, I, D, D, I, I, I, P, I, P, I, I, P, P, I, D, D },  /* PR */
  { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D }   /* PO */
};
134/* Note: The (B2,B2) entry should probably be D instead of P.  */
135/* Note: The (PR,ID) entry should probably be D instead of I.  */
136
137void
138u8_possible_linebreaks (const unsigned char *s, size_t n, const char *encoding, char *p)
139{
140  int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
141  const unsigned char *s_end = s + n;
142  int last_prop = LBP_BK; /* line break property of last non-space character */
143  char *seen_space = NULL; /* Was a space seen after the last non-space character? */
144  char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
145
146  /* Don't break inside multibyte characters.  */
147  memset (p, UC_BREAK_PROHIBITED, n);
148
149  while (s < s_end)
150    {
151      unsigned int uc;
152      int count = u8_mbtouc_unsafe (&uc, s, s_end - s);
153      int prop = lbrkprop_lookup (uc);
154
155      if (prop == LBP_BK)
156        {
157          /* Mandatory break.  */
158          *p = UC_BREAK_MANDATORY;
159          last_prop = LBP_BK;
160          seen_space = NULL;
161          seen_space2 = NULL;
162        }
163      else
164        {
165          char *q;
166
167          /* Resolve property values whose behaviour is not fixed.  */
168          switch (prop)
169            {
170              case LBP_AI:
171                /* Resolve ambiguous.  */
172                prop = LBP_AI_REPLACEMENT;
173                break;
174              case LBP_CB:
175                /* This is arbitrary.  */
176                prop = LBP_ID;
177                break;
178              case LBP_SA:
179                /* We don't handle complex scripts yet.
180                   Treat LBP_SA like LBP_XX.  */
181              case LBP_XX:
182                /* This is arbitrary.  */
183                prop = LBP_AL;
184                break;
185            }
186
187          /* Deal with combining characters.  */
188          q = p;
189          if (prop == LBP_CM)
190            {
191              /* Don't break just before a combining character.  */
192              *p = UC_BREAK_PROHIBITED;
193              /* A combining character turns a preceding space into LBP_AL.  */
194              if (seen_space != NULL)
195                {
196                  q = seen_space;
197                  seen_space = seen_space2;
198                  prop = LBP_AL;
199                  goto lookup_via_table;
200                }
201            }
202          else if (prop == LBP_SP)
203            {
204              /* Don't break just before a space.  */
205              *p = UC_BREAK_PROHIBITED;
206              seen_space2 = seen_space;
207              seen_space = p;
208            }
209          else
210            {
211             lookup_via_table:
212              /* prop must be usable as an index for table 7.3 of UTR #14.  */
213              if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
214                abort ();
215
216              if (last_prop == LBP_BK)
217                {
218                  /* Don't break at the beginning of a line.  */
219                  *q = UC_BREAK_PROHIBITED;
220                }
221              else
222                {
223                  switch (lbrk_table [last_prop-1] [prop-1])
224                    {
225                      case D:
226                        *q = UC_BREAK_POSSIBLE;
227                        break;
228                      case I:
229                        *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
230                        break;
231                      case P:
232                        *q = UC_BREAK_PROHIBITED;
233                        break;
234                      default:
235                        abort ();
236                    }
237                }
238              last_prop = prop;
239              seen_space = NULL;
240              seen_space2 = NULL;
241            }
242        }
243
244      s += count;
245      p += count;
246    }
247}
248
249#ifdef unused
250
251void
252u16_possible_linebreaks (const unsigned short *s, size_t n, const char *encoding, char *p)
253{
254  int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
255  const unsigned short *s_end = s + n;
256  int last_prop = LBP_BK; /* line break property of last non-space character */
257  char *seen_space = NULL; /* Was a space seen after the last non-space character? */
258  char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
259
260  /* Don't break inside multibyte characters.  */
261  memset (p, UC_BREAK_PROHIBITED, n);
262
263  while (s < s_end)
264    {
265      unsigned int uc;
266      int count = u16_mbtouc_unsafe (&uc, s, s_end - s);
267      int prop = lbrkprop_lookup (uc);
268
269      if (prop == LBP_BK)
270        {
271          /* Mandatory break.  */
272          *p = UC_BREAK_MANDATORY;
273          last_prop = LBP_BK;
274          seen_space = NULL;
275          seen_space2 = NULL;
276        }
277      else
278        {
279          char *q;
280
281          /* Resolve property values whose behaviour is not fixed.  */
282          switch (prop)
283            {
284              case LBP_AI:
285                /* Resolve ambiguous.  */
286                prop = LBP_AI_REPLACEMENT;
287                break;
288              case LBP_CB:
289                /* This is arbitrary.  */
290                prop = LBP_ID;
291                break;
292              case LBP_SA:
293                /* We don't handle complex scripts yet.
294                   Treat LBP_SA like LBP_XX.  */
295              case LBP_XX:
296                /* This is arbitrary.  */
297                prop = LBP_AL;
298                break;
299            }
300
301          /* Deal with combining characters.  */
302          q = p;
303          if (prop == LBP_CM)
304            {
305              /* Don't break just before a combining character.  */
306              *p = UC_BREAK_PROHIBITED;
307              /* A combining character turns a preceding space into LBP_AL.  */
308              if (seen_space != NULL)
309                {
310                  q = seen_space;
311                  seen_space = seen_space2;
312                  prop = LBP_AL;
313                  goto lookup_via_table;
314                }
315            }
316          else if (prop == LBP_SP)
317            {
318              /* Don't break just before a space.  */
319              *p = UC_BREAK_PROHIBITED;
320              seen_space2 = seen_space;
321              seen_space = p;
322            }
323          else
324            {
325             lookup_via_table:
326              /* prop must be usable as an index for table 7.3 of UTR #14.  */
327              if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
328                abort ();
329
330              if (last_prop == LBP_BK)
331                {
332                  /* Don't break at the beginning of a line.  */
333                  *q = UC_BREAK_PROHIBITED;
334                }
335              else
336                {
337                  switch (lbrk_table [last_prop-1] [prop-1])
338                    {
339                      case D:
340                        *q = UC_BREAK_POSSIBLE;
341                        break;
342                      case I:
343                        *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
344                        break;
345                      case P:
346                        *q = UC_BREAK_PROHIBITED;
347                        break;
348                      default:
349                        abort ();
350                    }
351                }
352              last_prop = prop;
353              seen_space = NULL;
354              seen_space2 = NULL;
355            }
356        }
357
358      s += count;
359      p += count;
360    }
361}
362
363void
364u32_possible_linebreaks (const unsigned int *s, size_t n, const char *encoding, char *p)
365{
366  int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
367  const unsigned int *s_end = s + n;
368  int last_prop = LBP_BK; /* line break property of last non-space character */
369  char *seen_space = NULL; /* Was a space seen after the last non-space character? */
370  char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
371
372  while (s < s_end)
373    {
374      unsigned int uc = *s;
375      int prop = lbrkprop_lookup (uc);
376
377      if (prop == LBP_BK)
378        {
379          /* Mandatory break.  */
380          *p = UC_BREAK_MANDATORY;
381          last_prop = LBP_BK;
382          seen_space = NULL;
383          seen_space2 = NULL;
384        }
385      else
386        {
387          char *q;
388
389          /* Resolve property values whose behaviour is not fixed.  */
390          switch (prop)
391            {
392              case LBP_AI:
393                /* Resolve ambiguous.  */
394                prop = LBP_AI_REPLACEMENT;
395                break;
396              case LBP_CB:
397                /* This is arbitrary.  */
398                prop = LBP_ID;
399                break;
400              case LBP_SA:
401                /* We don't handle complex scripts yet.
402                   Treat LBP_SA like LBP_XX.  */
403              case LBP_XX:
404                /* This is arbitrary.  */
405                prop = LBP_AL;
406                break;
407            }
408
409          /* Deal with combining characters.  */
410          q = p;
411          if (prop == LBP_CM)
412            {
413              /* Don't break just before a combining character.  */
414              *p = UC_BREAK_PROHIBITED;
415              /* A combining character turns a preceding space into LBP_AL.  */
416              if (seen_space != NULL)
417                {
418                  q = seen_space;
419                  seen_space = seen_space2;
420                  prop = LBP_AL;
421                  goto lookup_via_table;
422                }
423            }
424          else if (prop == LBP_SP)
425            {
426              /* Don't break just before a space.  */
427              *p = UC_BREAK_PROHIBITED;
428              seen_space2 = seen_space;
429              seen_space = p;
430            }
431          else
432            {
433             lookup_via_table:
434              /* prop must be usable as an index for table 7.3 of UTR #14.  */
435              if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
436                abort ();
437
438              if (last_prop == LBP_BK)
439                {
440                  /* Don't break at the beginning of a line.  */
441                  *q = UC_BREAK_PROHIBITED;
442                }
443              else
444                {
445                  switch (lbrk_table [last_prop-1] [prop-1])
446                    {
447                      case D:
448                        *q = UC_BREAK_POSSIBLE;
449                        break;
450                      case I:
451                        *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
452                        break;
453                      case P:
454                        *q = UC_BREAK_PROHIBITED;
455                        break;
456                      default:
457                        abort ();
458                    }
459                }
460              last_prop = prop;
461              seen_space = NULL;
462              seen_space2 = NULL;
463            }
464        }
465
466      s++;
467      p++;
468    }
469}
470
471#endif
472
473
474/* Choose the best line breaks, assuming the uc_width function.
475   Return the column after the end of the string.  */
476
477int
478u8_width_linebreaks (const unsigned char *s, size_t n,
479                     int width, int start_column, int at_end_columns,
480                     const char *o, const char *encoding,
481                     char *p)
482{
483  const unsigned char *s_end;
484  char *last_p;
485  int last_column;
486  int piece_width;
487
488  u8_possible_linebreaks (s, n, encoding, p);
489
490  s_end = s + n;
491  last_p = NULL;
492  last_column = start_column;
493  piece_width = 0;
494  while (s < s_end)
495    {
496      unsigned int uc;
497      int count = u8_mbtouc_unsafe (&uc, s, s_end - s);
498
499      /* Respect the override.  */
500      if (o != NULL && *o != UC_BREAK_UNDEFINED)
501        *p = *o;
502
503      if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
504        {
505          /* An atomic piece of text ends here.  */
506          if (last_p != NULL && last_column + piece_width > width)
507            {
508              /* Insert a line break.  */
509              *last_p = UC_BREAK_POSSIBLE;
510              last_column = 0;
511            }
512        }
513
514      if (*p == UC_BREAK_MANDATORY)
515        {
516          /* uc is a line break character.  */
517          /* Start a new piece at column 0.  */
518          last_p = NULL;
519          last_column = 0;
520          piece_width = 0;
521        }
522      else
523        {
524          /* uc is not a line break character.  */
525          int w;
526
527          if (*p == UC_BREAK_POSSIBLE)
528            {
529              /* Start a new piece.  */
530              last_p = p;
531              last_column += piece_width;
532              piece_width = 0;
533              /* No line break for the moment, may be turned into
534                 UC_BREAK_POSSIBLE later, via last_p. */
535            }
536
537          *p = UC_BREAK_PROHIBITED;
538
539          w = uc_width (uc, encoding);
540          if (w >= 0) /* ignore control characters in the string */
541            piece_width += w;
542         }
543
544      s += count;
545      p += count;
546      if (o != NULL)
547        o += count;
548    }
549
550  /* The last atomic piece of text ends here.  */
551  if (last_p != NULL && last_column + piece_width + at_end_columns > width)
552    {
553      /* Insert a line break.  */
554      *last_p = UC_BREAK_POSSIBLE;
555      last_column = 0;
556    }
557
558  return last_column + piece_width;
559}
560
561#ifdef unused
562
563int
564u16_width_linebreaks (const unsigned short *s, size_t n,
565                      int width, int start_column, int at_end_columns,
566                      const char *o, const char *encoding,
567                      char *p)
568{
569  const unsigned short *s_end;
570  char *last_p;
571  int last_column;
572  int piece_width;
573
574  u16_possible_linebreaks (s, n, encoding, p);
575
576  s_end = s + n;
577  last_p = NULL;
578  last_column = start_column;
579  piece_width = 0;
580  while (s < s_end)
581    {
582      unsigned int uc;
583      int count = u16_mbtouc_unsafe (&uc, s, s_end - s);
584
585      /* Respect the override.  */
586      if (o != NULL && *o != UC_BREAK_UNDEFINED)
587        *p = *o;
588
589      if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
590        {
591          /* An atomic piece of text ends here.  */
592          if (last_p != NULL && last_column + piece_width > width)
593            {
594              /* Insert a line break.  */
595              *last_p = UC_BREAK_POSSIBLE;
596              last_column = 0;
597            }
598        }
599
600      if (*p == UC_BREAK_MANDATORY)
601        {
602          /* uc is a line break character.  */
603          /* Start a new piece at column 0.  */
604          last_p = NULL;
605          last_column = 0;
606          piece_width = 0;
607        }
608      else
609        {
610          /* uc is not a line break character.  */
611          int w;
612
613          if (*p == UC_BREAK_POSSIBLE)
614            {
615              /* Start a new piece.  */
616              last_p = p;
617              last_column += piece_width;
618              piece_width = 0;
619              /* No line break for the moment, may be turned into
620                 UC_BREAK_POSSIBLE later, via last_p. */
621            }
622
623          *p = UC_BREAK_PROHIBITED;
624
625          w = uc_width (uc, encoding);
626          if (w >= 0) /* ignore control characters in the string */
627            piece_width += w;
628         }
629
630      s += count;
631      p += count;
632      if (o != NULL)
633        o += count;
634    }
635
636  /* The last atomic piece of text ends here.  */
637  if (last_p != NULL && last_column + piece_width + at_end_columns > width)
638    {
639      /* Insert a line break.  */
640      *last_p = UC_BREAK_POSSIBLE;
641      last_column = 0;
642    }
643
644  return last_column + piece_width;
645}
646
647int
648u32_width_linebreaks (const unsigned int *s, size_t n,
649                      int width, int start_column, int at_end_columns,
650                      const char *o, const char *encoding,
651                      char *p)
652{
653  const unsigned int *s_end;
654  char *last_p;
655  int last_column;
656  int piece_width;
657
658  u32_possible_linebreaks (s, n, encoding, p);
659
660  s_end = s + n;
661  last_p = NULL;
662  last_column = start_column;
663  piece_width = 0;
664  while (s < s_end)
665    {
666      unsigned int uc = *s;
667
668      /* Respect the override.  */
669      if (o != NULL && *o != UC_BREAK_UNDEFINED)
670        *p = *o;
671
672      if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
673        {
674          /* An atomic piece of text ends here.  */
675          if (last_p != NULL && last_column + piece_width > width)
676            {
677              /* Insert a line break.  */
678              *last_p = UC_BREAK_POSSIBLE;
679              last_column = 0;
680            }
681        }
682
683      if (*p == UC_BREAK_MANDATORY)
684        {
685          /* uc is a line break character.  */
686          /* Start a new piece at column 0.  */
687          last_p = NULL;
688          last_column = 0;
689          piece_width = 0;
690        }
691      else
692        {
693          /* uc is not a line break character.  */
694          int w;
695
696          if (*p == UC_BREAK_POSSIBLE)
697            {
698              /* Start a new piece.  */
699              last_p = p;
700              last_column += piece_width;
701              piece_width = 0;
702              /* No line break for the moment, may be turned into
703                 UC_BREAK_POSSIBLE later, via last_p. */
704            }
705
706          *p = UC_BREAK_PROHIBITED;
707
708          w = uc_width (uc, encoding);
709          if (w >= 0) /* ignore control characters in the string */
710            piece_width += w;
711         }
712
713      s++;
714      p++;
715      if (o != NULL)
716        o++;
717    }
718
719  /* The last atomic piece of text ends here.  */
720  if (last_p != NULL && last_column + piece_width + at_end_columns > width)
721    {
722      /* Insert a line break.  */
723      *last_p = UC_BREAK_POSSIBLE;
724      last_column = 0;
725    }
726
727  return last_column + piece_width;
728}
729
730#endif
731
732
733#ifdef TEST1
734
735#include <stdio.h>
736
/* Read the contents of an input stream, and return it, terminated with a NUL
   byte.  The result is allocated with realloc; the caller should free it.
   On a read error or when memory runs out, a message is printed to stderr
   and the program exits with status 1.
   All sizes are kept in size_t so that large inputs neither truncate the
   fread result nor overflow the buffer bookkeeping.  */
char *
read_file (FILE *stream)
{
  enum { CHUNK_SIZE = 4096 };
  char *buf = NULL;
  size_t alloc = 0;   /* bytes allocated for buf */
  size_t size = 0;    /* bytes of buf filled so far; always <= alloc */
  char *result;

  for (;;)
    {
      size_t count;

      /* Make sure there is room for one more full chunk.  */
      if (alloc - size < CHUNK_SIZE)
        {
          size_t new_alloc = alloc + alloc / 2;
          char *new_buf;

          /* Fall back to the minimum if growing by half overflowed or is
             still too small.  */
          if (new_alloc < alloc || new_alloc - size < CHUNK_SIZE)
            new_alloc = size + CHUNK_SIZE;
          /* new_alloc <= size means the addition above wrapped around.  */
          new_buf = (new_alloc > size ? realloc (buf, new_alloc) : NULL);
          if (new_buf == NULL)
            {
              free (buf);
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          buf = new_buf;
          alloc = new_alloc;
        }

      count = fread (buf + size, 1, CHUNK_SIZE, stream);
      size += count;
      if (count < CHUNK_SIZE)
        {
          /* A short read means either an error or end of file.  */
          if (ferror (stream))
            {
              perror ("fread");
              exit (1);
            }
          if (feof (stream))
            break;
        }
    }

  /* Trim the buffer to the data plus the terminating NUL.  */
  result = realloc (buf, size + 1);
  if (result == NULL)
    {
      free (buf);
      fprintf (stderr, "out of memory\n");
      exit (1);
    }
  result[size] = '\0';
  return result;
}
784
785int
786main (int argc, char * argv[])
787{
788  if (argc == 1)
789    {
790      /* Display all the break opportunities in the input string.  */
791      char *input = read_file (stdin);
792      int length = strlen (input);
793      char *breaks = malloc (length);
794      int i;
795
796      u8_possible_linebreaks ((unsigned char *) input, length, "UTF-8", breaks);
797
798      for (i = 0; i < length; i++)
799        {
800          switch (breaks[i])
801            {
802              case UC_BREAK_POSSIBLE:
803                /* U+2027 in UTF-8 encoding */
804                putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout);
805                break;
806              case UC_BREAK_MANDATORY:
807                /* U+21B2 (or U+21B5) in UTF-8 encoding */
808                putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout);
809                break;
810              case UC_BREAK_PROHIBITED:
811                break;
812              default:
813                abort ();
814            }
815          putc (input[i], stdout);
816        }
817
818      free (breaks);
819
820      return 0;
821    }
822  else if (argc == 2)
823    {
824      /* Insert line breaks for a given width.  */
825      int width = atoi (argv[1]);
826      char *input = read_file (stdin);
827      int length = strlen (input);
828      char *breaks = malloc (length);
829      int i;
830
831      u8_width_linebreaks ((unsigned char *) input, length, width, 0, 0, NULL, "UTF-8", breaks);
832
833      for (i = 0; i < length; i++)
834        {
835          switch (breaks[i])
836            {
837              case UC_BREAK_POSSIBLE:
838                putc ('\n', stdout);
839                break;
840              case UC_BREAK_MANDATORY:
841                break;
842              case UC_BREAK_PROHIBITED:
843                break;
844              default:
845                abort ();
846            }
847          putc (input[i], stdout);
848        }
849
850      free (breaks);
851
852      return 0;
853    }
854  else
855    return 1;
856}
857
858#endif /* TEST1 */
859
860
861/* Now the same thing with an arbitrary encoding.
862
863   We convert the input string to Unicode.
864
865   The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16,
866   UTF-16BE, UTF-16LE, UTF-7.  UCS-2 supports only characters up to
867   \U0000FFFF.  UTF-16 and variants support only characters up to
868   \U0010FFFF.  UTF-7 is way too complex and not supported by glibc-2.1.
869   UCS-4 specification leaves doubts about endianness and byte order mark.
870   glibc currently interprets it as big endian without byte order mark,
871   but this is not backed by an RFC.  So we use UTF-8. It supports
872   characters up to \U7FFFFFFF and is unambiguously defined.  */
873
874#if HAVE_ICONV
875
876#include <iconv.h>
877#include <errno.h>
878
879/* Luckily, the encoding's name is platform independent.  */
880#define UTF8_NAME "UTF-8"
881
882/* Return the length of a string after conversion through an iconv_t.  */
883static size_t
884iconv_string_length (iconv_t cd, const char *s, size_t n)
885{
886#define TMPBUFSIZE 4096
887  size_t count = 0;
888  char tmpbuf[TMPBUFSIZE];
889  const char *inptr = s;
890  size_t insize = n;
891  while (insize > 0)
892    {
893      char *outptr = tmpbuf;
894      size_t outsize = TMPBUFSIZE;
895      size_t res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
896      if (res == (size_t)(-1) && errno != E2BIG)
897        return (size_t)(-1);
898      count += outptr - tmpbuf;
899    }
900  /* Avoid glibc-2.1 bug and Solaris 7 through 9 bug.  */
901#if defined _LIBICONV_VERSION \
902    || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
903  {
904    char *outptr = tmpbuf;
905    size_t outsize = TMPBUFSIZE;
906    size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);
907    if (res == (size_t)(-1))
908      return (size_t)(-1);
909    count += outptr - tmpbuf;
910  }
911  /* Return to the initial state.  */
912  iconv (cd, NULL, NULL, NULL, NULL);
913#endif
914  return count;
915#undef TMPBUFSIZE
916}
917
918static void
919iconv_string_keeping_offsets (iconv_t cd, const char *s, size_t n,
920                              size_t *offtable, char *t, size_t m)
921{
922  size_t i;
923  const char *s_end;
924  const char *inptr;
925  char *outptr;
926  size_t outsize;
927  /* Avoid glibc-2.1 bug.  */
928#if !defined _LIBICONV_VERSION && (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1)
929  const size_t extra = 1;
930#else
931  const size_t extra = 0;
932#endif
933
934  for (i = 0; i < n; i++)
935    offtable[i] = (size_t)(-1);
936
937  s_end = s + n;
938  inptr = s;
939  outptr = t;
940  outsize = m + extra;
941  while (inptr < s_end)
942    {
943      const char *saved_inptr;
944      size_t insize;
945      size_t res;
946
947      offtable[inptr - s] = outptr - t;
948
949      saved_inptr = inptr;
950      res = (size_t)(-1);
951      for (insize = 1; inptr + insize <= s_end; insize++)
952        {
953          res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
954          if (!(res == (size_t)(-1) && errno == EINVAL))
955            break;
956          /* We expect that no input bytes have been consumed so far.  */
957          if (inptr != saved_inptr)
958            abort ();
959        }
960      /* After we verified the convertibility and computed the translation's
961         size m, there shouldn't be any conversion error here. */
962      if (res == (size_t)(-1))
963        abort ();
964    }
965  /* Avoid glibc-2.1 bug and Solaris 7 bug.  */
966#if defined _LIBICONV_VERSION \
967    || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
968  if (iconv (cd, NULL, NULL, &outptr, &outsize) == (size_t)(-1))
969    abort ();
970#endif
971  /* We should have produced exactly m output bytes.  */
972  if (outsize != extra)
973    abort ();
974}
975
976#endif /* HAVE_ICONV */
977
978#if C_CTYPE_ASCII
979
/* Tests whether a string is entirely ASCII.  Returns 1 if yes, that is, if
   each of the N bytes at S satisfies c_isprint or c_isspace.
   Returns 0 if the string is in an 8-bit encoding or an ISO-2022 encoding.  */
static int
is_all_ascii (const char *s, size_t n)
{
  size_t k;

  for (k = 0; k < n; k++)
    {
      unsigned char c = (unsigned char) s[k];

      if (!c_isprint (c) && !c_isspace (c))
        return 0;
    }
  return 1;
}
994
995#endif /* C_CTYPE_ASCII */
996
997#if defined unused || defined TEST2
998
999void
1000mbs_possible_linebreaks (const char *s, size_t n, const char *encoding,
1001                         char *p)
1002{
1003  if (n == 0)
1004    return;
1005  if (is_utf8_encoding (encoding))
1006    u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
1007  else
1008    {
1009#if HAVE_ICONV
1010      iconv_t to_utf8;
1011      /* Avoid glibc-2.1 bug with EUC-KR.  */
1012# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1013      if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1014	to_utf8 = (iconv_t)(-1);
1015      else
1016# endif
1017      /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
1018         GB18030.  */
1019# if defined __sun && !defined _LIBICONV_VERSION
1020      if (   STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1021          || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1022          || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1023          || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1024          || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1025          || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1026        to_utf8 = (iconv_t)(-1);
1027      else
1028# endif
1029      to_utf8 = iconv_open (UTF8_NAME, encoding);
1030      if (to_utf8 != (iconv_t)(-1))
1031        {
1032          /* Determine the length of the resulting UTF-8 string.  */
1033          size_t m = iconv_string_length (to_utf8, s, n);
1034          if (m != (size_t)(-1))
1035            {
1036              /* Convert the string to UTF-8 and build a translation table
1037                 from offsets into s to offsets into the translated string.  */
1038	      size_t memory_size = xsum3 (xtimes (n, sizeof (size_t)), m, m);
1039              char *memory =
1040		(size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
1041              if (memory != NULL)
1042                {
1043                  size_t *offtable = (size_t *) memory;
1044                  char *t = (char *) (offtable + n);
1045                  char *q = (char *) (t + m);
1046                  size_t i;
1047
1048                  iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1049
1050                  /* Determine the possible line breaks of the UTF-8 string.  */
1051                  u8_possible_linebreaks ((const unsigned char *) t, m, encoding, q);
1052
1053                  /* Translate the result back to the original string.  */
1054                  memset (p, UC_BREAK_PROHIBITED, n);
1055                  for (i = 0; i < n; i++)
1056                    if (offtable[i] != (size_t)(-1))
1057                      p[i] = q[offtable[i]];
1058
1059                  free (memory);
1060                  iconv_close (to_utf8);
1061                  return;
1062                }
1063            }
1064          iconv_close (to_utf8);
1065        }
1066#endif
1067      /* Impossible to convert.  */
1068#if C_CTYPE_ASCII
1069      if (is_all_ascii (s, n))
1070	{
1071	  /* ASCII is a subset of UTF-8.  */
1072	  u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
1073	  return;
1074	}
1075#endif
1076      /* We have a non-ASCII string and cannot convert it.
1077	 Don't produce line breaks except those already present in the
1078	 input string.  All we assume here is that the encoding is
1079	 minimally ASCII compatible.  */
1080      {
1081        const char *s_end = s + n;
1082        while (s < s_end)
1083          {
1084            *p = (*s == '\n' ? UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED);
1085            s++;
1086            p++;
1087          }
1088      }
1089    }
1090}
1091
1092#endif
1093
1094int
1095mbs_width_linebreaks (const char *s, size_t n,
1096                      int width, int start_column, int at_end_columns,
1097                      const char *o, const char *encoding,
1098                      char *p)
1099{
1100  if (n == 0)
1101    return start_column;
1102  if (is_utf8_encoding (encoding))
1103    return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1104  else
1105    {
1106#if HAVE_ICONV
1107      iconv_t to_utf8;
1108      /* Avoid glibc-2.1 bug with EUC-KR.  */
1109# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1110      if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1111	to_utf8 = (iconv_t)(-1);
1112      else
1113# endif
1114      /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
1115         GB18030.  */
1116# if defined __sun && !defined _LIBICONV_VERSION
1117      if (   STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1118          || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1119          || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1120          || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1121          || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1122          || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1123        to_utf8 = (iconv_t)(-1);
1124      else
1125# endif
1126      to_utf8 = iconv_open (UTF8_NAME, encoding);
1127      if (to_utf8 != (iconv_t)(-1))
1128        {
1129          /* Determine the length of the resulting UTF-8 string.  */
1130          size_t m = iconv_string_length (to_utf8, s, n);
1131          if (m != (size_t)(-1))
1132            {
1133              /* Convert the string to UTF-8 and build a translation table
1134                 from offsets into s to offsets into the translated string.  */
1135	      size_t memory_size =
1136		xsum4 (xtimes (n, sizeof (size_t)), m, m,
1137		       (o != NULL ? m : 0));
1138	      char *memory =
1139		(char *)
1140		(size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
1141              if (memory != NULL)
1142                {
1143                  size_t *offtable = (size_t *) memory;
1144                  char *t = (char *) (offtable + n);
1145                  char *q = (char *) (t + m);
1146                  char *o8 = (o != NULL ? (char *) (q + m) : NULL);
1147                  int res_column;
1148                  size_t i;
1149
1150                  iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1151
1152                  /* Translate the overrides to the UTF-8 string.  */
1153                  if (o != NULL)
1154                    {
1155                      memset (o8, UC_BREAK_UNDEFINED, m);
1156                      for (i = 0; i < n; i++)
1157                        if (offtable[i] != (size_t)(-1))
1158                          o8[offtable[i]] = o[i];
1159                    }
1160
1161                  /* Determine the line breaks of the UTF-8 string.  */
1162                  res_column =
1163                    u8_width_linebreaks ((const unsigned char *) t, m, width, start_column, at_end_columns, o8, encoding, q);
1164
1165                  /* Translate the result back to the original string.  */
1166                  memset (p, UC_BREAK_PROHIBITED, n);
1167                  for (i = 0; i < n; i++)
1168                    if (offtable[i] != (size_t)(-1))
1169                      p[i] = q[offtable[i]];
1170
1171                  free (memory);
1172                  iconv_close (to_utf8);
1173                  return res_column;
1174                }
1175            }
1176          iconv_close (to_utf8);
1177        }
1178#endif
1179      /* Impossible to convert.  */
1180#if C_CTYPE_ASCII
1181      if (is_all_ascii (s, n))
1182	{
1183	  /* ASCII is a subset of UTF-8.  */
1184	  return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1185	}
1186#endif
1187      /* We have a non-ASCII string and cannot convert it.
1188	 Don't produce line breaks except those already present in the
1189	 input string.  All we assume here is that the encoding is
1190	 minimally ASCII compatible.  */
1191      {
1192        const char *s_end = s + n;
1193        while (s < s_end)
1194          {
1195            *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n'
1196                  ? UC_BREAK_MANDATORY
1197                  : UC_BREAK_PROHIBITED);
1198            s++;
1199            p++;
1200            if (o != NULL)
1201              o++;
1202          }
1203        /* We cannot compute widths in this case.  */
1204        return start_column;
1205      }
1206    }
1207}
1208
1209
1210#ifdef TEST2
1211
1212#include <stdio.h>
1213#include <locale.h>
1214
/* Read the entire contents of STREAM and return it in a freshly allocated
   buffer, terminated with a NUL byte.  The caller owns the buffer and
   releases it with free().

   Never returns NULL: on a read error it reports the error with perror and
   exits with status 1; when memory runs out it prints "out of memory" to
   stderr and exits with status 1.  */
char *
read_file (FILE *stream)
{
#define BUFSIZE 4096
  char *buf = NULL;
  size_t alloc = 0;   /* bytes allocated at buf */
  size_t size = 0;    /* bytes read so far; always <= alloc */
  char *result;

  for (;;)
    {
      size_t count;

      /* Make sure there is room for one more chunk of BUFSIZE bytes.  */
      if (alloc - size < BUFSIZE)
        {
          size_t new_alloc = alloc + alloc / 2;
          char *new_buf;

          /* Refuse sizes for which size + BUFSIZE + 1 would wrap around.  */
          if (size > (size_t)(-1) - BUFSIZE - 1)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          if (new_alloc < size + BUFSIZE)
            new_alloc = size + BUFSIZE;
          new_buf = realloc (buf, new_alloc);
          if (new_buf == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          buf = new_buf;
          alloc = new_alloc;
        }

      count = fread (buf + size, 1, BUFSIZE, stream);
      size += count;

      /* A short read means end of file or an error; tell them apart.  */
      if (count < BUFSIZE)
        {
          if (ferror (stream))
            {
              perror ("fread");
              exit (1);
            }
          if (feof (stream))
            break;
        }
    }

  /* Trim the buffer to the data plus the terminating NUL byte.  */
  result = realloc (buf, size + 1);
  if (result == NULL)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }
  result[size] = '\0';
  return result;
#undef BUFSIZE
}
1262
1263int
1264main (int argc, char * argv[])
1265{
1266  setlocale (LC_CTYPE, "");
1267  if (argc == 1)
1268    {
1269      /* Display all the break opportunities in the input string.  */
1270      char *input = read_file (stdin);
1271      int length = strlen (input);
1272      char *breaks = malloc (length);
1273      int i;
1274
1275      mbs_possible_linebreaks (input, length, locale_charset (), breaks);
1276
1277      for (i = 0; i < length; i++)
1278        {
1279          switch (breaks[i])
1280            {
1281              case UC_BREAK_POSSIBLE:
1282                putc ('|', stdout);
1283                break;
1284              case UC_BREAK_MANDATORY:
1285                break;
1286              case UC_BREAK_PROHIBITED:
1287                break;
1288              default:
1289                abort ();
1290            }
1291          putc (input[i], stdout);
1292        }
1293
1294      free (breaks);
1295
1296      return 0;
1297    }
1298  else if (argc == 2)
1299    {
1300      /* Insert line breaks for a given width.  */
1301      int width = atoi (argv[1]);
1302      char *input = read_file (stdin);
1303      int length = strlen (input);
1304      char *breaks = malloc (length);
1305      int i;
1306
1307      mbs_width_linebreaks (input, length, width, 0, 0, NULL, locale_charset (), breaks);
1308
1309      for (i = 0; i < length; i++)
1310        {
1311          switch (breaks[i])
1312            {
1313              case UC_BREAK_POSSIBLE:
1314                putc ('\n', stdout);
1315                break;
1316              case UC_BREAK_MANDATORY:
1317                break;
1318              case UC_BREAK_PROHIBITED:
1319                break;
1320              default:
1321                abort ();
1322            }
1323          putc (input[i], stdout);
1324        }
1325
1326      free (breaks);
1327
1328      return 0;
1329    }
1330  else
1331    return 1;
1332}
1333
1334#endif /* TEST2 */
1335