1/* Line breaking of strings.
2   Copyright (C) 2001-2003, 2006-2010 Free Software Foundation, Inc.
3   Written by Bruno Haible <bruno@clisp.org>, 2001.
4
5   This program is free software: you can redistribute it and/or modify it
6   under the terms of the GNU Lesser General Public License as published
7   by the Free Software Foundation; either version 3 of the License, or
8   (at your option) any later version.
9
10   This program is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public License
16   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
17
18#include <config.h>
19
20/* Specification.  */
21#include "unilbrk.h"
22
23#include <stdlib.h>
24#include <string.h>
25
26#include "c-ctype.h"
27#include "uniconv.h"
28#include "unilbrk/ulc-common.h"
29
30/* Line breaking of a string in an arbitrary encoding.
31
32   We convert the input string to Unicode.
33
34   The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16,
35   UTF-16BE, UTF-16LE, UTF-7.  UCS-2 supports only characters up to
36   \U0000FFFF.  UTF-16 and variants support only characters up to
37   \U0010FFFF.  UTF-7 is way too complex and not supported by glibc-2.1.
38   UCS-4 specification leaves doubts about endianness and byte order mark.
39   glibc currently interprets it as big endian without byte order mark,
40   but this is not backed by an RFC.  So we use UTF-8. It supports
41   characters up to \U7FFFFFFF and is unambiguously defined.  */
42
43int
44ulc_width_linebreaks (const char *s, size_t n,
45                      int width, int start_column, int at_end_columns,
46                      const char *o, const char *encoding,
47                      char *p)
48{
49  if (n > 0)
50    {
51      if (is_utf8_encoding (encoding))
52        return u8_width_linebreaks ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, p);
53      else
54        {
55          /* Convert the string to UTF-8 and build a translation table
56             from offsets into s to offsets into the translated string.  */
57          size_t *offsets = (size_t *) malloc (n * sizeof (size_t));
58
59          if (offsets != NULL)
60            {
61              uint8_t *t;
62              size_t m;
63
64              t = u8_conv_from_encoding (encoding, iconveh_question_mark,
65                                         s, n, offsets, NULL, &m);
66              if (t != NULL)
67                {
68                  char *memory =
69                    (char *) (m > 0 ? malloc (m + (o != NULL ? m : 0)) : NULL);
70
71                  if (m == 0 || memory != NULL)
72                    {
73                      char *q = (char *) memory;
74                      char *o8 = (o != NULL ? (char *) (q + m) : NULL);
75                      int res_column;
76                      size_t i;
77
78                      /* Translate the overrides to the UTF-8 string.  */
79                      if (o != NULL)
80                        {
81                          memset (o8, UC_BREAK_UNDEFINED, m);
82                          for (i = 0; i < n; i++)
83                            if (offsets[i] != (size_t)(-1))
84                              o8[offsets[i]] = o[i];
85                        }
86
87                      /* Determine the line breaks of the UTF-8 string.  */
88                      res_column =
89                        u8_width_linebreaks (t, m, width, start_column, at_end_columns, o8, encoding, q);
90
91                      /* Translate the result back to the original string.  */
92                      memset (p, UC_BREAK_PROHIBITED, n);
93                      for (i = 0; i < n; i++)
94                        if (offsets[i] != (size_t)(-1))
95                          p[i] = q[offsets[i]];
96
97                      free (memory);
98                      free (t);
99                      free (offsets);
100                      return res_column;
101                    }
102                  free (t);
103                }
104              free (offsets);
105            }
106          /* Impossible to convert.  */
107#if C_CTYPE_ASCII
108          if (is_all_ascii (s, n))
109            {
110              /* ASCII is a subset of UTF-8.  */
111              return u8_width_linebreaks ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, p);
112            }
113#endif
114          /* We have a non-ASCII string and cannot convert it.
115             Don't produce line breaks except those already present in the
116             input string.  All we assume here is that the encoding is
117             minimally ASCII compatible.  */
118          {
119            const char *s_end = s + n;
120            while (s < s_end)
121              {
122                *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n'
123                      ? UC_BREAK_MANDATORY
124                      : UC_BREAK_PROHIBITED);
125                s++;
126                p++;
127                if (o != NULL)
128                  o++;
129              }
130            /* We cannot compute widths in this case.  */
131          }
132        }
133    }
134  return start_column;
135}
136
137
138#ifdef TEST
139
140#include <stdio.h>
141#include <locale.h>
142
/* Read everything that remains on STREAM and return it as a freshly
   allocated buffer, terminated with a NUL byte.  The caller frees the
   result.  On a read error or when memory runs out, print a message to
   stderr and exit with status 1.  */
char *
read_file (FILE *stream)
{
  enum { CHUNK_SIZE = 4096 };
  char *buf = NULL;
  size_t alloc = 0;   /* number of bytes allocated at buf */
  size_t size = 0;    /* number of bytes read so far */
  char *result;

  for (;;)
    {
      size_t count;

      /* Make sure there is room for one more full chunk.  Sizes are kept
         in size_t and checked for wrap-around, so that huge inputs end in
         "out of memory" rather than in arithmetic overflow.  */
      if (alloc - size < CHUNK_SIZE)
        {
          size_t new_alloc = alloc + alloc / 2;
          char *new_buf;

          if (new_alloc < alloc || size > (size_t)(-1) - CHUNK_SIZE)
            new_buf = NULL;
          else
            {
              if (new_alloc < size + CHUNK_SIZE)
                new_alloc = size + CHUNK_SIZE;
              new_buf = realloc (buf, new_alloc);
            }
          if (new_buf == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          buf = new_buf;
          alloc = new_alloc;
        }

      count = fread (buf + size, 1, CHUNK_SIZE, stream);
      size += count;

      /* Look at the stream state only after reading: a short count alone
         does not tell an error from the end of the input.  */
      if (ferror (stream))
        {
          perror ("fread");
          exit (1);
        }
      if (feof (stream))
        break;
    }

  /* Resize to the exact length, plus one byte for the terminating NUL.  */
  result = realloc (buf, size + 1);
  if (result == NULL)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }
  result[size] = '\0';
  return result;
}
190
191int
192main (int argc, char * argv[])
193{
194  setlocale (LC_CTYPE, "");
195  if (argc == 2)
196    {
197      /* Insert line breaks for a given width.  */
198      int width = atoi (argv[1]);
199      char *input = read_file (stdin);
200      int length = strlen (input);
201      char *breaks = malloc (length);
202      int i;
203
204      ulc_width_linebreaks (input, length, width, 0, 0, NULL, locale_charset (), breaks);
205
206      for (i = 0; i < length; i++)
207        {
208          switch (breaks[i])
209            {
210            case UC_BREAK_POSSIBLE:
211              putc ('\n', stdout);
212              break;
213            case UC_BREAK_MANDATORY:
214              break;
215            case UC_BREAK_PROHIBITED:
216              break;
217            default:
218              abort ();
219            }
220          putc (input[i], stdout);
221        }
222
223      free (breaks);
224
225      return 0;
226    }
227  else
228    return 1;
229}
230
231#endif /* TEST */
232