1/*
2 * Copyright (C) 1999-2001 Free Software Foundation, Inc.
3 * This file is part of the GNU LIBICONV Library.
4 *
5 * The GNU LIBICONV Library is free software; you can redistribute it
6 * and/or modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
9 *
10 * The GNU LIBICONV Library is distributed in the hope that it will be
11 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public
16 * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
17 * If not, write to the Free Software Foundation, Inc., 59 Temple Place -
18 * Suite 330, Boston, MA 02111-1307, USA.
19 */
20
21/*
22 * CP950
23 */
24
25/*
26 * Microsoft CP950 is a slightly extended and slightly modified version of
27 * BIG5. The differences between the EASTASIA/OTHER/BIG5.TXT and
28 * VENDORS/MICSFT/WINDOWS/CP950.TXT tables found on ftp.unicode.org are
29 * as follows:
30 *
31 * 1. Some characters in the BIG5 range are defined differently:
32 *
33 *     code   BIG5.TXT                       CP950.TXT
34 *    0xA145  0x2022 # BULLET                0x2027 # HYPHENATION POINT
35 *    0xA14E  0xFF64 # HALFWIDTH IDEOGRAPHIC COMMA
36 *                                           0xFE51 # SMALL IDEOGRAPHIC COMMA
37 *    0xA15A    ---                          0x2574 # BOX DRAWINGS LIGHT LEFT
38 *    0xA1C2  0x203E # OVERLINE              0x00AF # MACRON
39 *    0xA1C3    ---                          0xFFE3 # FULLWIDTH MACRON
40 *    0xA1C5    ---                          0x02CD # MODIFIER LETTER LOW MACRON
41 *    0xA1E3  0x223C # TILDE OPERATOR        0xFF5E # FULLWIDTH TILDE
42 *    0xA1F2  0x2641 # EARTH                 0x2295 # CIRCLED PLUS
43 *    0xA1F3  0x2609 # SUN                   0x2299 # CIRCLED DOT OPERATOR
44 *    0xA1FE    ---                          0xFF0F # FULLWIDTH SOLIDUS
45 *    0xA240    ---                          0xFF3C # FULLWIDTH REVERSE SOLIDUS
46 *    0xA241  0xFF0F # FULLWIDTH SOLIDUS     0x2215 # DIVISION SLASH
47 *    0xA242  0xFF3C # FULLWIDTH REVERSE SOLIDUS
48 *                                           0xFE68 # SMALL REVERSE SOLIDUS
49 *    0xA244  0x00A5 # YEN SIGN              0xFFE5 # FULLWIDTH YEN SIGN
50 *    0xA246  0x00A2 # CENT SIGN             0xFFE0 # FULLWIDTH CENT SIGN
51 *    0xA247  0x00A3 # POUND SIGN            0xFFE1 # FULLWIDTH POUND SIGN
52 *    0xA2CC    ---                          0x5341
53 *    0xA2CE    ---                          0x5345
54 *
55 * 2. A small new row. See cp950ext.h.
56 *
57 * 3. CP950.TXT is lacking the range 0xC6A1..0xC7FC (Hiragana, Katakana,
58 *    Cyrillic, circled digits, parenthesized digits).
59 *
 *    We follow CP950.TXT and leave this range unmapped, because said range
 *    is marked "uncertain" in the unicode.org BIG5 table.
62 */
63
/*
 * Decoding table for the two rows whose lead byte is 0xa1 or 0xa2.
 *
 * Each row has 157 entries: trail bytes 0x40..0x7e occupy slots 0..62 and
 * trail bytes 0xa1..0xfe occupy slots 63..156.  cp950_mbtowc therefore
 * indexes the table with
 *     157 * (lead - 0xa1) + (trail - (trail >= 0xa1 ? 0x62 : 0x40)).
 * Every entry is the 16-bit Unicode value stored into *pwc for that code.
 * cp950_mbtowc consults this table before falling back to big5_mbtowc, so
 * the entries here take precedence for these two rows.
 */
static const unsigned short cp950_2uni_pagea1[314] = {
  /* 0xa1 (lead byte 0xa1, 157 entries) */
  0x3000, 0xff0c, 0x3001, 0x3002, 0xff0e, 0x2027, 0xff1b, 0xff1a,
  0xff1f, 0xff01, 0xfe30, 0x2026, 0x2025, 0xfe50, 0xfe51, 0xfe52,
  0x00b7, 0xfe54, 0xfe55, 0xfe56, 0xfe57, 0xff5c, 0x2013, 0xfe31,
  0x2014, 0xfe33, 0x2574, 0xfe34, 0xfe4f, 0xff08, 0xff09, 0xfe35,
  0xfe36, 0xff5b, 0xff5d, 0xfe37, 0xfe38, 0x3014, 0x3015, 0xfe39,
  0xfe3a, 0x3010, 0x3011, 0xfe3b, 0xfe3c, 0x300a, 0x300b, 0xfe3d,
  0xfe3e, 0x3008, 0x3009, 0xfe3f, 0xfe40, 0x300c, 0x300d, 0xfe41,
  0xfe42, 0x300e, 0x300f, 0xfe43, 0xfe44, 0xfe59, 0xfe5a, 0xfe5b,
  0xfe5c, 0xfe5d, 0xfe5e, 0x2018, 0x2019, 0x201c, 0x201d, 0x301d,
  0x301e, 0x2035, 0x2032, 0xff03, 0xff06, 0xff0a, 0x203b, 0x00a7,
  0x3003, 0x25cb, 0x25cf, 0x25b3, 0x25b2, 0x25ce, 0x2606, 0x2605,
  0x25c7, 0x25c6, 0x25a1, 0x25a0, 0x25bd, 0x25bc, 0x32a3, 0x2105,
  0x00af, 0xffe3, 0xff3f, 0x02cd, 0xfe49, 0xfe4a, 0xfe4d, 0xfe4e,
  0xfe4b, 0xfe4c, 0xfe5f, 0xfe60, 0xfe61, 0xff0b, 0xff0d, 0x00d7,
  0x00f7, 0x00b1, 0x221a, 0xff1c, 0xff1e, 0xff1d, 0x2266, 0x2267,
  0x2260, 0x221e, 0x2252, 0x2261, 0xfe62, 0xfe63, 0xfe64, 0xfe65,
  0xfe66, 0xff5e, 0x2229, 0x222a, 0x22a5, 0x2220, 0x221f, 0x22bf,
  0x33d2, 0x33d1, 0x222b, 0x222e, 0x2235, 0x2234, 0x2640, 0x2642,
  0x2295, 0x2299, 0x2191, 0x2193, 0x2190, 0x2192, 0x2196, 0x2197,
  0x2199, 0x2198, 0x2225, 0x2223, 0xff0f,
  /* 0xa2 (lead byte 0xa2, 157 entries) */
  0xff3c, 0x2215, 0xfe68, 0xff04, 0xffe5, 0x3012, 0xffe0, 0xffe1,
  0xff05, 0xff20, 0x2103, 0x2109, 0xfe69, 0xfe6a, 0xfe6b, 0x33d5,
  0x339c, 0x339d, 0x339e, 0x33ce, 0x33a1, 0x338e, 0x338f, 0x33c4,
  0x00b0, 0x5159, 0x515b, 0x515e, 0x515d, 0x5161, 0x5163, 0x55e7,
  0x74e9, 0x7cce, 0x2581, 0x2582, 0x2583, 0x2584, 0x2585, 0x2586,
  0x2587, 0x2588, 0x258f, 0x258e, 0x258d, 0x258c, 0x258b, 0x258a,
  0x2589, 0x253c, 0x2534, 0x252c, 0x2524, 0x251c, 0x2594, 0x2500,
  0x2502, 0x2595, 0x250c, 0x2510, 0x2514, 0x2518, 0x256d, 0x256e,
  0x2570, 0x256f, 0x2550, 0x255e, 0x256a, 0x2561, 0x25e2, 0x25e3,
  0x25e5, 0x25e4, 0x2571, 0x2572, 0x2573, 0xff10, 0xff11, 0xff12,
  0xff13, 0xff14, 0xff15, 0xff16, 0xff17, 0xff18, 0xff19, 0x2160,
  0x2161, 0x2162, 0x2163, 0x2164, 0x2165, 0x2166, 0x2167, 0x2168,
  0x2169, 0x3021, 0x3022, 0x3023, 0x3024, 0x3025, 0x3026, 0x3027,
  0x3028, 0x3029, 0x5341, 0x5344, 0x5345, 0xff21, 0xff22, 0xff23,
  0xff24, 0xff25, 0xff26, 0xff27, 0xff28, 0xff29, 0xff2a, 0xff2b,
  0xff2c, 0xff2d, 0xff2e, 0xff2f, 0xff30, 0xff31, 0xff32, 0xff33,
  0xff34, 0xff35, 0xff36, 0xff37, 0xff38, 0xff39, 0xff3a, 0xff41,
  0xff42, 0xff43, 0xff44, 0xff45, 0xff46, 0xff47, 0xff48, 0xff49,
  0xff4a, 0xff4b, 0xff4c, 0xff4d, 0xff4e, 0xff4f, 0xff50, 0xff51,
  0xff52, 0xff53, 0xff54, 0xff55, 0xff56,
};
108
109#include "cp950ext.h"
110
111static int
112cp950_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)
113{
114  unsigned char c = *s;
115  /* Code set 0 (ASCII) */
116  if (c < 0x80)
117    return ascii_mbtowc(conv,pwc,s,n);
118  /* Code set 1 (BIG5 extended) */
119  if (c >= 0xa1 && c < 0xff) {
120    if (n < 2)
121      return RET_TOOFEW(0);
122    {
123      unsigned char c2 = s[1];
124      if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0xa1 && c2 < 0xff)) {
125        if (c < 0xa3) {
126          unsigned int i = 157 * (c - 0xa1) + (c2 - (c2 >= 0xa1 ? 0x62 : 0x40));
127          unsigned short wc = cp950_2uni_pagea1[i];
128          if (wc != 0xfffd) {
129            *pwc = (ucs4_t) wc;
130            return 2;
131          }
132        }
133        if (!((c == 0xc6 && c2 >= 0xa1) || c == 0xc7)) {
134          int ret = big5_mbtowc(conv,pwc,s,2);
135          if (ret != RET_ILSEQ)
136            return ret;
137        }
138      }
139    }
140    if (c == 0xf9) {
141      int ret = cp950ext_mbtowc(conv,pwc,s,2);
142      if (ret != RET_ILSEQ)
143        return ret;
144    }
145  }
146  return RET_ILSEQ;
147}
148
149static int
150cp950_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)
151{
152  unsigned char buf[2];
153  int ret;
154
155  /* Code set 0 (ASCII) */
156  ret = ascii_wctomb(conv,r,wc,n);
157  if (ret != RET_ILUNI)
158    return ret;
159
160  /* Code set 1 (BIG5 extended) */
161  switch (wc >> 8) {
162    case 0x00:
163      if (wc == 0x00af) { buf[0] = 0xa1; buf[1] = 0xc2; ret = 2; break; }
164      if (wc == 0x00a2 || wc == 0x00a3 || wc == 0x00a4)
165        return RET_ILUNI;
166      break;
167    case 0x02:
168      if (wc == 0x02cd) { buf[0] = 0xa1; buf[1] = 0xc5; ret = 2; break; }
169      break;
170    case 0x20:
171      if (wc == 0x2027) { buf[0] = 0xa1; buf[1] = 0x45; ret = 2; break; }
172      if (wc == 0x2022 || wc == 0x203e)
173        return RET_ILUNI;
174      break;
175    case 0x22:
176      if (wc == 0x2215) { buf[0] = 0xa2; buf[1] = 0x41; ret = 2; break; }
177      if (wc == 0x2295) { buf[0] = 0xa1; buf[1] = 0xf2; ret = 2; break; }
178      if (wc == 0x2299) { buf[0] = 0xa1; buf[1] = 0xf3; ret = 2; break; }
179      if (wc == 0x223c)
180        return RET_ILUNI;
181      break;
182    case 0x25:
183      if (wc == 0x2574) { buf[0] = 0xa1; buf[1] = 0x5a; ret = 2; break; }
184      break;
185    case 0x26:
186      if (wc == 0x2609 || wc == 0x2641)
187        return RET_ILUNI;
188      break;
189    case 0xfe:
190      if (wc == 0xfe51) { buf[0] = 0xa1; buf[1] = 0x4e; ret = 2; break; }
191      if (wc == 0xfe68) { buf[0] = 0xa2; buf[1] = 0x42; ret = 2; break; }
192      break;
193    case 0xff:
194      if (wc == 0xff0f) { buf[0] = 0xa1; buf[1] = 0xfe; ret = 2; break; }
195      if (wc == 0xff3c) { buf[0] = 0xa2; buf[1] = 0x40; ret = 2; break; }
196      if (wc == 0xff5e) { buf[0] = 0xa1; buf[1] = 0xe3; ret = 2; break; }
197      if (wc == 0xffe0) { buf[0] = 0xa2; buf[1] = 0x46; ret = 2; break; }
198      if (wc == 0xffe1) { buf[0] = 0xa2; buf[1] = 0x47; ret = 2; break; }
199      if (wc == 0xffe3) { buf[0] = 0xa1; buf[1] = 0xc3; ret = 2; break; }
200      if (wc == 0xffe5) { buf[0] = 0xa2; buf[1] = 0x44; ret = 2; break; }
201      if (wc == 0xff64)
202        return RET_ILUNI;
203      break;
204  }
205  if (ret == RET_ILUNI)
206    ret = big5_wctomb(conv,buf,wc,2);
207  if (ret != RET_ILUNI) {
208    if (ret != 2) abort();
209    if (!((buf[0] == 0xc6 && buf[1] >= 0xa1) || buf[0] == 0xc7)) {
210      if (n < 2)
211        return RET_TOOSMALL;
212      r[0] = buf[0];
213      r[1] = buf[1];
214      return 2;
215    }
216  }
217  ret = cp950ext_wctomb(conv,buf,wc,2);
218  if (ret != RET_ILUNI) {
219    if (ret != 2) abort();
220    if (n < 2)
221      return RET_TOOSMALL;
222    r[0] = buf[0];
223    r[1] = buf[1];
224    return 2;
225  }
226
227  return RET_ILUNI;
228}
229