1/* Line breaking of UTF-32 strings. 2 Copyright (C) 2001-2003, 2006-2010 Free Software Foundation, Inc. 3 Written by Bruno Haible <bruno@clisp.org>, 2001. 4 5 This program is free software: you can redistribute it and/or modify it 6 under the terms of the GNU Lesser General Public License as published 7 by the Free Software Foundation; either version 3 of the License, or 8 (at your option) any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public License 16 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 17 18#include <config.h> 19 20/* Specification. */ 21#include "unilbrk.h" 22 23#include <stdlib.h> 24 25#include "unilbrk/lbrktables.h" 26#include "uniwidth/cjk.h" 27 28void 29u32_possible_linebreaks (const uint32_t *s, size_t n, const char *encoding, char *p) 30{ 31 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL); 32 const uint32_t *s_end = s + n; 33 int last_prop = LBP_BK; /* line break property of last non-space character */ 34 char *seen_space = NULL; /* Was a space seen after the last non-space character? */ 35 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */ 36 37 while (s < s_end) 38 { 39 ucs4_t uc = *s; 40 int prop = unilbrkprop_lookup (uc); 41 42 if (prop == LBP_BK) 43 { 44 /* Mandatory break. */ 45 *p = UC_BREAK_MANDATORY; 46 last_prop = LBP_BK; 47 seen_space = NULL; 48 seen_space2 = NULL; 49 } 50 else 51 { 52 char *q; 53 54 /* Resolve property values whose behaviour is not fixed. */ 55 switch (prop) 56 { 57 case LBP_AI: 58 /* Resolve ambiguous. */ 59 prop = LBP_AI_REPLACEMENT; 60 break; 61 case LBP_CB: 62 /* This is arbitrary. */ 63 prop = LBP_ID; 64 break; 65 case LBP_SA: 66 /* We don't handle complex scripts yet. 67 Treat LBP_SA like LBP_XX. */ 68 case LBP_XX: 69 /* This is arbitrary. */ 70 prop = LBP_AL; 71 break; 72 } 73 74 /* Deal with spaces and combining characters. */ 75 q = p; 76 if (prop == LBP_SP) 77 { 78 /* Don't break just before a space. */ 79 *p = UC_BREAK_PROHIBITED; 80 seen_space2 = seen_space; 81 seen_space = p; 82 } 83 else if (prop == LBP_ZW) 84 { 85 /* Don't break just before a zero-width space. */ 86 *p = UC_BREAK_PROHIBITED; 87 last_prop = LBP_ZW; 88 seen_space = NULL; 89 seen_space2 = NULL; 90 } 91 else if (prop == LBP_CM) 92 { 93 /* Don't break just before a combining character, except immediately after a 94 zero-width space. */ 95 if (last_prop == LBP_ZW) 96 { 97 /* Break after zero-width space. */ 98 *p = UC_BREAK_POSSIBLE; 99 /* A combining character turns a preceding space into LBP_ID. */ 100 last_prop = LBP_ID; 101 } 102 else 103 { 104 *p = UC_BREAK_PROHIBITED; 105 /* A combining character turns a preceding space into LBP_ID. */ 106 if (seen_space != NULL) 107 { 108 q = seen_space; 109 seen_space = seen_space2; 110 prop = LBP_ID; 111 goto lookup_via_table; 112 } 113 } 114 } 115 else 116 { 117 lookup_via_table: 118 /* prop must be usable as an index for table 7.3 of UTR #14. */ 119 if (!(prop >= 0 && prop < sizeof (unilbrk_table) / sizeof (unilbrk_table[0]))) 120 abort (); 121 122 if (last_prop == LBP_BK) 123 { 124 /* Don't break at the beginning of a line. */ 125 *q = UC_BREAK_PROHIBITED; 126 } 127 else if (last_prop == LBP_ZW) 128 { 129 /* Break after zero-width space. */ 130 *q = UC_BREAK_POSSIBLE; 131 } 132 else 133 { 134 switch (unilbrk_table [last_prop] [prop]) 135 { 136 case D: 137 *q = UC_BREAK_POSSIBLE; 138 break; 139 case I: 140 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED); 141 break; 142 case P: 143 *q = UC_BREAK_PROHIBITED; 144 break; 145 default: 146 abort (); 147 } 148 } 149 last_prop = prop; 150 seen_space = NULL; 151 seen_space2 = NULL; 152 } 153 } 154 155 s++; 156 p++; 157 } 158} 159