1/* Word breaks in UTF-8/UTF-16/UTF-32 strings. 2 Copyright (C) 2009-2010 Free Software Foundation, Inc. 3 Written by Bruno Haible <bruno@clisp.org>, 2009. 4 5 This program is free software: you can redistribute it and/or modify it 6 under the terms of the GNU Lesser General Public License as published 7 by the Free Software Foundation; either version 3 of the License, or 8 (at your option) any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public License 16 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 17 18void 19FUNC (const UNIT *s, size_t n, char *p) 20{ 21 if (n > 0) 22 { 23 const UNIT *s_end = s + n; 24 25 /* Word break property of the last character. 26 -1 at the very beginning of the string. */ 27 int last_char_prop = -1; 28 29 /* Format and Extend characters are ignored; this means, the mostly used 30 unit is the complex character (= character with subsequent ignored 31 characters). 32 Word break property of the last complex character. 33 -1 at the very beginning of the string. */ 34 int last_compchar_prop = -1; 35 char *last_compchar_ptr = NULL; 36 37 /* For recognizing rules involving 3 complex characters: 38 Word break property of the second-to-last complex character. 39 -1 at the very beginning of the string. */ 40 int secondlast_compchar_prop = -1; 41 42 /* Don't break inside multibyte characters. */ 43 memset (p, 0, n); 44 45 while (s < s_end) 46 { 47 ucs4_t uc; 48 int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s); 49 int prop = uc_wordbreak_property (uc); 50 51 /* No break at the start of the string. */ 52 if (last_char_prop >= 0) 53 { 54 /* No break between CR and LF. */ 55 if (last_char_prop == WBP_CR && prop == WBP_LF) 56 /* *p = 0 */; 57 /* Break before and after newlines. */ 58 else if (last_char_prop >= WBP_NEWLINE 59 /* same as: 60 last_char_prop == WBP_CR 61 || last_char_prop == WBP_LF 62 || last_char_prop == WBP_NEWLINE */ 63 || prop >= WBP_NEWLINE 64 /* same as: 65 prop == WBP_CR 66 || prop == WBP_LF 67 || prop == WBP_NEWLINE */) 68 *p = 1; 69 /* Ignore Format and Extend characters. */ 70 else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT)) 71 { 72 /* No break in these situations (see UAX #29): 73 74 secondlast last current 75 76 ALetter (MidLetter | MidNumLet) �� ALetter (WB7) 77 ALetter �� (MidLetter | MidNumLet) ALetter (WB6) 78 Numeric (MidNum | MidNumLet) �� Numeric (WB11) 79 Numeric �� (MidNum | MidNumLet) Numeric (WB12) 80 ALetter �� ALetter (WB5) 81 ALetter �� Numeric (WB9) 82 Numeric �� ALetter (WB10) 83 Numeric �� Numeric (WB8) 84 Katakana �� Katakana (WB13) 85 (ALetter | Numeric | Katakana) �� ExtendNumLet (WB13a) 86 ExtendNumLet �� ExtendNumLet (WB13a) 87 ExtendNumLet �� (ALetter | Numeric | Katakana) (WB13b) 88 */ 89 /* No break across certain punctuation. Also, disable word 90 breaks that were recognized earlier (due to lookahead of 91 only one complex character). */ 92 if ((prop == WBP_ALETTER 93 && (last_compchar_prop == WBP_MIDLETTER 94 || last_compchar_prop == WBP_MIDNUMLET) 95 && secondlast_compchar_prop == WBP_ALETTER) 96 || (prop == WBP_NUMERIC 97 && (last_compchar_prop == WBP_MIDNUM 98 || last_compchar_prop == WBP_MIDNUMLET) 99 && secondlast_compchar_prop == WBP_NUMERIC)) 100 { 101 *last_compchar_ptr = 0; 102 /* *p = 0; */ 103 } 104 else 105 { 106 /* Perform a single table lookup. */ 107 if (uniwbrk_table[last_compchar_prop][prop]) 108 *p = 1; 109 /* else *p = 0; */ 110 } 111 } 112 } 113 114 last_char_prop = prop; 115 /* Ignore Format and Extend characters, except at the start of the string. */ 116 if (last_compchar_prop < 0 || !(prop == WBP_EXTEND || prop == WBP_FORMAT)) 117 { 118 secondlast_compchar_prop = last_compchar_prop; 119 last_compchar_prop = prop; 120 last_compchar_ptr = p; 121 } 122 123 s += count; 124 p += count; 125 } 126 } 127} 128