1/* Line breaking of UTF-16 strings. 2 Copyright (C) 2001-2003, 2006-2010 Free Software Foundation, Inc. 3 Written by Bruno Haible <bruno@clisp.org>, 2001. 4 5 This program is free software: you can redistribute it and/or modify it 6 under the terms of the GNU Lesser General Public License as published 7 by the Free Software Foundation; either version 3 of the License, or 8 (at your option) any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public License 16 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 17 18#include <config.h> 19 20/* Specification. */ 21#include "unilbrk.h" 22 23#include <stdlib.h> 24#include <string.h> 25 26#include "unilbrk/lbrktables.h" 27#include "uniwidth/cjk.h" 28#include "unistr.h" 29 30void 31u16_possible_linebreaks (const uint16_t *s, size_t n, const char *encoding, char *p) 32{ 33 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL); 34 const uint16_t *s_end = s + n; 35 int last_prop = LBP_BK; /* line break property of last non-space character */ 36 char *seen_space = NULL; /* Was a space seen after the last non-space character? */ 37 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */ 38 39 /* Don't break inside multibyte characters. */ 40 memset (p, UC_BREAK_PROHIBITED, n); 41 42 while (s < s_end) 43 { 44 ucs4_t uc; 45 int count = u16_mbtouc_unsafe (&uc, s, s_end - s); 46 int prop = unilbrkprop_lookup (uc); 47 48 if (prop == LBP_BK) 49 { 50 /* Mandatory break. */ 51 *p = UC_BREAK_MANDATORY; 52 last_prop = LBP_BK; 53 seen_space = NULL; 54 seen_space2 = NULL; 55 } 56 else 57 { 58 char *q; 59 60 /* Resolve property values whose behaviour is not fixed. */ 61 switch (prop) 62 { 63 case LBP_AI: 64 /* Resolve ambiguous. */ 65 prop = LBP_AI_REPLACEMENT; 66 break; 67 case LBP_CB: 68 /* This is arbitrary. */ 69 prop = LBP_ID; 70 break; 71 case LBP_SA: 72 /* We don't handle complex scripts yet. 73 Treat LBP_SA like LBP_XX. */ 74 case LBP_XX: 75 /* This is arbitrary. */ 76 prop = LBP_AL; 77 break; 78 } 79 80 /* Deal with spaces and combining characters. */ 81 q = p; 82 if (prop == LBP_SP) 83 { 84 /* Don't break just before a space. */ 85 *p = UC_BREAK_PROHIBITED; 86 seen_space2 = seen_space; 87 seen_space = p; 88 } 89 else if (prop == LBP_ZW) 90 { 91 /* Don't break just before a zero-width space. */ 92 *p = UC_BREAK_PROHIBITED; 93 last_prop = LBP_ZW; 94 seen_space = NULL; 95 seen_space2 = NULL; 96 } 97 else if (prop == LBP_CM) 98 { 99 /* Don't break just before a combining character, except immediately after a 100 zero-width space. */ 101 if (last_prop == LBP_ZW) 102 { 103 /* Break after zero-width space. */ 104 *p = UC_BREAK_POSSIBLE; 105 /* A combining character turns a preceding space into LBP_ID. */ 106 last_prop = LBP_ID; 107 } 108 else 109 { 110 *p = UC_BREAK_PROHIBITED; 111 /* A combining character turns a preceding space into LBP_ID. */ 112 if (seen_space != NULL) 113 { 114 q = seen_space; 115 seen_space = seen_space2; 116 prop = LBP_ID; 117 goto lookup_via_table; 118 } 119 } 120 } 121 else 122 { 123 lookup_via_table: 124 /* prop must be usable as an index for table 7.3 of UTR #14. */ 125 if (!(prop >= 0 && prop < sizeof (unilbrk_table) / sizeof (unilbrk_table[0]))) 126 abort (); 127 128 if (last_prop == LBP_BK) 129 { 130 /* Don't break at the beginning of a line. */ 131 *q = UC_BREAK_PROHIBITED; 132 } 133 else if (last_prop == LBP_ZW) 134 { 135 /* Break after zero-width space. */ 136 *q = UC_BREAK_POSSIBLE; 137 } 138 else 139 { 140 switch (unilbrk_table [last_prop] [prop]) 141 { 142 case D: 143 *q = UC_BREAK_POSSIBLE; 144 break; 145 case I: 146 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED); 147 break; 148 case P: 149 *q = UC_BREAK_PROHIBITED; 150 break; 151 default: 152 abort (); 153 } 154 } 155 last_prop = prop; 156 seen_space = NULL; 157 seen_space2 = NULL; 158 } 159 } 160 161 s += count; 162 p += count; 163 } 164} 165