1/*
2 * Copyright (C) 1999-2001 Free Software Foundation, Inc.
3 * This file is part of the GNU LIBICONV Library.
4 *
5 * The GNU LIBICONV Library is free software; you can redistribute it
6 * and/or modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
9 *
10 * The GNU LIBICONV Library is distributed in the hope that it will be
11 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public
16 * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
17 * If not, write to the Free Software Foundation, Inc., 51 Franklin Street,
18 * Fifth Floor, Boston, MA 02110-1301, USA.
19 */
20
21/*
22 * JOHAB Hangul
23 *
24 * Ken Lunde writes in his "CJKV Information Processing" book, p. 114:
25 * "Hangul can be composed of two or three jamo (some jamo are considered
26 *  compound). Johab uses 19 initial jamo (consonants), 21 medial jamo (vowels)
27 *  and 27 final jamo (consonants; 28 when you include the "fill" character
28 *  for Hangul containing only two jamo). Multiplying these numbers results in
29 *  11172."
30 *
31 * Structure of the Johab encoding (see p. 181-184):
32 *   bit 15 = 1
33 *   bit 14..10 = initial jamo, only 19+1 out of 32 possible values are used
34 *   bit 9..5 = medial jamo, only 21+1 out of 32 possible values are used
35 *   bit 4..0 = final jamo, only 27+1 out of 32 possible values are used
36 *
37 * Structure of the Unicode encoding:
38 * grep '^0x\([8-C]...\|D[0-7]..\)' unicode.org-mappings/EASTASIA/KSC/JOHAB.TXT
39 * You see that all characters there are marked "HANGUL LETTER" or "HANGUL
40 * SYLLABLE". If you eliminate the "HANGUL LETTER"s, the table is sorted
41 * in ascending order according to Johab encoding and according to the Unicode
42 * encoding. Now look a little more carefully, and you see that the following
43 * formula holds:
44 *     unicode == 0xAC00
45 *                + 21 * 28 * (jamo_initial_index[(johab >> 10) & 31] - 1)
46 *                + 28 * (jamo_medial_index[(johab >> 5) & 31] - 1)
47 *                + jamo_final_index[johab & 31]
48 * where the index tables are defined as below.
49 */
50
/* Lookup tables from a 5-bit Johab field to a jamo letter offset.
   An offset XX stands for the letter at UHC 0xA4A0+XX, which is
   Unicode 0x3130+XX.
   NONE marks a 5-bit pattern that is not assigned in that position;
   FILL marks the pattern meaning "no jamo in this position". */
#define NONE 0xfd
#define FILL 0xff

/* Initial position (bits 14..10): FILL plus 19 consonants. */
static const unsigned char jamo_initial[32] = {
  /* 0x00 */ NONE, FILL, 0x01, 0x02, 0x04, 0x07, 0x08, 0x09,
  /* 0x08 */ 0x11, 0x12, 0x13, 0x15, 0x16, 0x17, 0x18, 0x19,
  /* 0x10 */ 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, NONE, NONE, NONE,
  /* 0x18 */ NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
};

/* Medial position (bits 9..5): FILL plus 21 vowels. */
static const unsigned char jamo_medial[32] = {
  /* 0x00 */ NONE, NONE, FILL, 0x1f, 0x20, 0x21, 0x22, 0x23,
  /* 0x08 */ NONE, NONE, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29,
  /* 0x10 */ NONE, NONE, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
  /* 0x18 */ NONE, NONE, 0x30, 0x31, 0x32, 0x33, NONE, NONE,
};

/* Final position (bits 4..0): FILL plus 27 consonants. */
static const unsigned char jamo_final[32] = {
  /* 0x00 */ NONE, FILL, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
  /* 0x08 */ 0x07, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  /* 0x10 */ 0x10, 0x11, NONE, 0x12, 0x14, 0x15, 0x16, 0x17,
  /* 0x18 */ 0x18, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, NONE, NONE,
};

/* Subset of jamo_final: only the 11 letters that do not also occur in
   jamo_initial.  Every other slot, including the FILL slot, is NONE. */
static const unsigned char jamo_final_notinitial[32] = {
  /* 0x00 */ NONE, NONE, NONE, NONE, 0x03, NONE, 0x05, 0x06,
  /* 0x08 */ NONE, NONE, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  /* 0x10 */ 0x10, NONE, NONE, NONE, 0x14, NONE, NONE, NONE,
  /* 0x18 */ NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
};
81
/* Lookup tables from a 5-bit Johab field to a packed index.
   `none' marks a 5-bit pattern that is not assigned in that position;
   `fill' (index 0) marks the pattern meaning "no jamo in this position".
   Real letters are numbered consecutively starting at 1. */
#define none -1
#define fill 0

/* Initial position: indices 1..19. */
static const signed char jamo_initial_index[32] = {
  /* 0x00 */ none, fill, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
  /* 0x08 */ 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
  /* 0x10 */ 0x0f, 0x10, 0x11, 0x12, 0x13, none, none, none,
  /* 0x18 */ none, none, none, none, none, none, none, none,
};

/* Medial position: indices 1..21. */
static const signed char jamo_medial_index[32] = {
  /* 0x00 */ none, none, fill, 0x01, 0x02, 0x03, 0x04, 0x05,
  /* 0x08 */ none, none, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b,
  /* 0x10 */ none, none, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11,
  /* 0x18 */ none, none, 0x12, 0x13, 0x14, 0x15, none, none,
};

/* Final position: indices 1..27. */
static const signed char jamo_final_index[32] = {
  /* 0x00 */ none, fill, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
  /* 0x08 */ 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
  /* 0x10 */ 0x0f, 0x10, none, 0x11, 0x12, 0x13, 0x14, 0x15,
  /* 0x18 */ 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, none, none,
};
103
104static int
105johab_hangul_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)
106{
107  unsigned char c1 = s[0];
108  if ((c1 >= 0x84 && c1 <= 0xd3)) {
109    if (n >= 2) {
110      unsigned char c2 = s[1];
111      if ((c2 >= 0x41 && c2 < 0x7f) || (c2 >= 0x81 && c2 < 0xff)) {
112        unsigned int johab = (c1 << 8) | c2;
113        unsigned int bitspart1 = (johab >> 10) & 31;
114        unsigned int bitspart2 = (johab >> 5) & 31;
115        unsigned int bitspart3 = johab & 31;
116        int index1 = jamo_initial_index[bitspart1];
117        int index2 = jamo_medial_index[bitspart2];
118        int index3 = jamo_final_index[bitspart3];
119        /* Exclude "none" values. */
120        if (index1 >= 0 && index2 >= 0 && index3 >= 0) {
121          /* Deal with "fill" values in initial or medial position. */
122          if (index1 == fill) {
123            if (index2 == fill) {
124              unsigned char jamo3 = jamo_final_notinitial[bitspart3];
125              if (jamo3 != NONE) {
126                *pwc = (ucs4_t) 0x3130 + jamo3;
127                return 2;
128              }
129            } else if (index3 == fill) {
130              unsigned char jamo2 = jamo_medial[bitspart2];
131              if (jamo2 != NONE && jamo2 != FILL) {
132                *pwc = (ucs4_t) 0x3130 + jamo2;
133                return 2;
134              }
135            }
136            /* Syllables composed only of medial and final don't exist. */
137          } else if (index2 == fill) {
138            if (index3 == fill) {
139              unsigned char jamo1 = jamo_initial[bitspart1];
140              if (jamo1 != NONE && jamo1 != FILL) {
141                *pwc = (ucs4_t) 0x3130 + jamo1;
142                return 2;
143              }
144            }
145            /* Syllables composed only of initial and final don't exist. */
146          } else {
147             /* index1 and index2 are not fill, but index3 may be fill. */
148             /* Nothing more to exclude. All 11172 code points are valid. */
149             *pwc = 0xac00 + ((index1 - 1) * 21 + (index2 - 1)) * 28 + index3;
150             return 2;
151          }
152        }
153      }
154      return RET_ILSEQ;
155    }
156    return RET_TOOFEW(0);
157  }
158  return RET_ILSEQ;
159}
160
/* Johab codes of the 51 jamo letters U+3131..U+3163, indexed by
   (code point - 0x3131): 19 initial, 21 medial, 11 final not initial. */
static const unsigned short johab_hangul_page31[51] = {
  /* U+3131..U+3137 */
  0x8841, 0x8c41, 0x8444, 0x9041, 0x8446, 0x8447, 0x9441,
  /* U+3138..U+313F */
  0x9841, 0x9c41, 0x844a, 0x844b, 0x844c, 0x844d, 0x844e, 0x844f,
  /* U+3140..U+3147 */
  0x8450, 0xa041, 0xa441, 0xa841, 0x8454, 0xac41, 0xb041, 0xb441,
  /* U+3148..U+314F */
  0xb841, 0xbc41, 0xc041, 0xc441, 0xc841, 0xcc41, 0xd041, 0x8461,
  /* U+3150..U+3157 */
  0x8481, 0x84a1, 0x84c1, 0x84e1, 0x8541, 0x8561, 0x8581, 0x85a1,
  /* U+3158..U+315F */
  0x85c1, 0x85e1, 0x8641, 0x8661, 0x8681, 0x86a1, 0x86c1, 0x86e1,
  /* U+3160..U+3163 */
  0x8741, 0x8761, 0x8781, 0x87a1,
};
171
/* Lookup tables from a zero-based packed index back to the 5-bit Johab
   field.  Each row below is one run of consecutive 5-bit values. */

/* bitspart1 = jamo_initial_index_inverse[index1]
   <==>  jamo_initial_index[bitspart1] = index1+1 */
static const char jamo_initial_index_inverse[19] = {
  0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b,
  0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14,
};

/* bitspart2 = jamo_medial_index_inverse[index2]
   <==>  jamo_medial_index[bitspart2] = index2+1 */
static const char jamo_medial_index_inverse[21] = {
  0x03, 0x04, 0x05, 0x06, 0x07,
  0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x1a, 0x1b, 0x1c, 0x1d,
};

/* bitspart3 = jamo_final_index_inverse[index3]
   <==>  jamo_final_index[bitspart3] = index3
   (index3 == 0 maps to the fill pattern 0x01) */
static const char jamo_final_index_inverse[28] = {
  0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09,
  0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11,
  0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d,
};
196
197static int
198johab_hangul_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)
199{
200  if (n >= 2) {
201    if (wc >= 0x3131 && wc < 0x3164) {
202      unsigned short c = johab_hangul_page31[wc-0x3131];
203      r[0] = (c >> 8); r[1] = (c & 0xff);
204      return 2;
205    } else if (wc >= 0xac00 && wc < 0xd7a4) {
206      unsigned int index1;
207      unsigned int index2;
208      unsigned int index3;
209      unsigned short c;
210      unsigned int tmp = wc - 0xac00;
211      index3 = tmp % 28; tmp = tmp / 28;
212      index2 = tmp % 21; tmp = tmp / 21;
213      index1 = tmp;
214      c = (((((1 << 5)
215              | jamo_initial_index_inverse[index1]) << 5)
216            | jamo_medial_index_inverse[index2]) << 5)
217          | jamo_final_index_inverse[index3];
218      r[0] = (c >> 8); r[1] = (c & 0xff);
219      return 2;
220    }
221    return RET_ILUNI;
222  }
223  return RET_TOOSMALL;
224}
225
226/*
227 * Decomposition of JOHAB Hangul in one to three Johab Jamo elements.
228 */
229
230/* Decompose wc into r[0..2], and return the number of resulting Jamo elements.
231   Return RET_ILUNI if decomposition is not possible. */
232
233static int johab_hangul_decompose (conv_t conv, ucs4_t* r, ucs4_t wc)
234{
235  unsigned char buf[2];
236  int ret = johab_hangul_wctomb(conv,buf,wc,2);
237  if (ret != RET_ILUNI) {
238    unsigned int hangul = (buf[0] << 8) | buf[1];
239    unsigned char jamo1 = jamo_initial[(hangul >> 10) & 31];
240    unsigned char jamo2 = jamo_medial[(hangul >> 5) & 31];
241    unsigned char jamo3 = jamo_final[hangul & 31];
242    if ((hangul >> 15) != 1) abort();
243    if (jamo1 != NONE && jamo2 != NONE && jamo3 != NONE) {
244      /* They are not all three == FILL because that would correspond to
245         johab = 0x8441, which doesn't exist. */
246      ucs4_t* p = r;
247      if (jamo1 != FILL)
248        *p++ = 0x3130 + jamo1;
249      if (jamo2 != FILL)
250        *p++ = 0x3130 + jamo2;
251      if (jamo3 != FILL)
252        *p++ = 0x3130 + jamo3;
253      return p-r;
254    }
255  }
256  return RET_ILUNI;
257}
258
/* Retire the table marker macros defined earlier in this file. */
#undef fill
#undef none
#undef FILL
#undef NONE
263