1/* Line breaking of strings. 2 Copyright (C) 2001-2003, 2006-2010 Free Software Foundation, Inc. 3 Written by Bruno Haible <bruno@clisp.org>, 2001. 4 5 This program is free software: you can redistribute it and/or modify it 6 under the terms of the GNU Lesser General Public License as published 7 by the Free Software Foundation; either version 3 of the License, or 8 (at your option) any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public License 16 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 17 18#include <config.h> 19 20/* Specification. */ 21#include "unilbrk.h" 22 23#include <stdlib.h> 24#include <string.h> 25 26#include "c-ctype.h" 27#include "uniconv.h" 28#include "unilbrk/ulc-common.h" 29 30/* Line breaking of a string in an arbitrary encoding. 31 32 We convert the input string to Unicode. 33 34 The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16, 35 UTF-16BE, UTF-16LE, UTF-7. UCS-2 supports only characters up to 36 \U0000FFFF. UTF-16 and variants support only characters up to 37 \U0010FFFF. UTF-7 is way too complex and not supported by glibc-2.1. 38 UCS-4 specification leaves doubts about endianness and byte order mark. 39 glibc currently interprets it as big endian without byte order mark, 40 but this is not backed by an RFC. So we use UTF-8. It supports 41 characters up to \U7FFFFFFF and is unambiguously defined. */ 42 43int 44ulc_width_linebreaks (const char *s, size_t n, 45 int width, int start_column, int at_end_columns, 46 const char *o, const char *encoding, 47 char *p) 48{ 49 if (n > 0) 50 { 51 if (is_utf8_encoding (encoding)) 52 return u8_width_linebreaks ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, p); 53 else 54 { 55 /* Convert the string to UTF-8 and build a translation table 56 from offsets into s to offsets into the translated string. */ 57 size_t *offsets = (size_t *) malloc (n * sizeof (size_t)); 58 59 if (offsets != NULL) 60 { 61 uint8_t *t; 62 size_t m; 63 64 t = u8_conv_from_encoding (encoding, iconveh_question_mark, 65 s, n, offsets, NULL, &m); 66 if (t != NULL) 67 { 68 char *memory = 69 (char *) (m > 0 ? malloc (m + (o != NULL ? m : 0)) : NULL); 70 71 if (m == 0 || memory != NULL) 72 { 73 char *q = (char *) memory; 74 char *o8 = (o != NULL ? (char *) (q + m) : NULL); 75 int res_column; 76 size_t i; 77 78 /* Translate the overrides to the UTF-8 string. */ 79 if (o != NULL) 80 { 81 memset (o8, UC_BREAK_UNDEFINED, m); 82 for (i = 0; i < n; i++) 83 if (offsets[i] != (size_t)(-1)) 84 o8[offsets[i]] = o[i]; 85 } 86 87 /* Determine the line breaks of the UTF-8 string. */ 88 res_column = 89 u8_width_linebreaks (t, m, width, start_column, at_end_columns, o8, encoding, q); 90 91 /* Translate the result back to the original string. */ 92 memset (p, UC_BREAK_PROHIBITED, n); 93 for (i = 0; i < n; i++) 94 if (offsets[i] != (size_t)(-1)) 95 p[i] = q[offsets[i]]; 96 97 free (memory); 98 free (t); 99 free (offsets); 100 return res_column; 101 } 102 free (t); 103 } 104 free (offsets); 105 } 106 /* Impossible to convert. */ 107#if C_CTYPE_ASCII 108 if (is_all_ascii (s, n)) 109 { 110 /* ASCII is a subset of UTF-8. */ 111 return u8_width_linebreaks ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, p); 112 } 113#endif 114 /* We have a non-ASCII string and cannot convert it. 115 Don't produce line breaks except those already present in the 116 input string. All we assume here is that the encoding is 117 minimally ASCII compatible. */ 118 { 119 const char *s_end = s + n; 120 while (s < s_end) 121 { 122 *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n' 123 ? UC_BREAK_MANDATORY 124 : UC_BREAK_PROHIBITED); 125 s++; 126 p++; 127 if (o != NULL) 128 o++; 129 } 130 /* We cannot compute widths in this case. */ 131 } 132 } 133 } 134 return start_column; 135} 136 137 138#ifdef TEST 139 140#include <stdio.h> 141#include <locale.h> 142 143/* Read the contents of an input stream, and return it, terminated with a NUL 144 byte. */ 145char * 146read_file (FILE *stream) 147{ 148#define BUFSIZE 4096 149 char *buf = NULL; 150 int alloc = 0; 151 int size = 0; 152 int count; 153 154 while (! feof (stream)) 155 { 156 if (size + BUFSIZE > alloc) 157 { 158 alloc = alloc + alloc / 2; 159 if (alloc < size + BUFSIZE) 160 alloc = size + BUFSIZE; 161 buf = realloc (buf, alloc); 162 if (buf == NULL) 163 { 164 fprintf (stderr, "out of memory\n"); 165 exit (1); 166 } 167 } 168 count = fread (buf + size, 1, BUFSIZE, stream); 169 if (count == 0) 170 { 171 if (ferror (stream)) 172 { 173 perror ("fread"); 174 exit (1); 175 } 176 } 177 else 178 size += count; 179 } 180 buf = realloc (buf, size + 1); 181 if (buf == NULL) 182 { 183 fprintf (stderr, "out of memory\n"); 184 exit (1); 185 } 186 buf[size] = '\0'; 187 return buf; 188#undef BUFSIZE 189} 190 191int 192main (int argc, char * argv[]) 193{ 194 setlocale (LC_CTYPE, ""); 195 if (argc == 2) 196 { 197 /* Insert line breaks for a given width. */ 198 int width = atoi (argv[1]); 199 char *input = read_file (stdin); 200 int length = strlen (input); 201 char *breaks = malloc (length); 202 int i; 203 204 ulc_width_linebreaks (input, length, width, 0, 0, NULL, locale_charset (), breaks); 205 206 for (i = 0; i < length; i++) 207 { 208 switch (breaks[i]) 209 { 210 case UC_BREAK_POSSIBLE: 211 putc ('\n', stdout); 212 break; 213 case UC_BREAK_MANDATORY: 214 break; 215 case UC_BREAK_PROHIBITED: 216 break; 217 default: 218 abort (); 219 } 220 putc (input[i], stdout); 221 } 222 223 free (breaks); 224 225 return 0; 226 } 227 else 228 return 1; 229} 230 231#endif /* TEST */ 232