1/*
2**********************************************************************
3*   Copyright (C) 2000-2014, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6*   file name:  ucnv2022.cpp
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11*   created on: 2000feb03
12*   created by: Markus W. Scherer
13*
14*   Change history:
15*
16*   06/29/2000  helena  Major rewrite of the callback APIs.
17*   08/08/2000  Ram     Included support for ISO-2022-JP-2
18*                       Changed implementation of toUnicode
19*                       function
20*   08/21/2000  Ram     Added support for ISO-2022-KR
*   08/29/2000  Ram     Separated implementation of EBCDIC to
22*                       ucnvebdc.c
23*   09/20/2000  Ram     Added support for ISO-2022-CN
24*                       Added implementations for getNextUChar()
25*                       for specific 2022 country variants.
26*   10/31/2000  Ram     Implemented offsets logic functions
27*/
28
29#include "unicode/utypes.h"
30
31#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
32
33#include "unicode/ucnv.h"
34#include "unicode/uset.h"
35#include "unicode/ucnv_err.h"
36#include "unicode/ucnv_cb.h"
37#include "unicode/utf16.h"
38#include "ucnv_imp.h"
39#include "ucnv_bld.h"
40#include "ucnv_cnv.h"
41#include "ucnvmbcs.h"
42#include "cstring.h"
43#include "cmemory.h"
44#include "uassert.h"
45
/*
 * Number of elements in a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that it behaves as a single
 * primary expression wherever it is used (e.g. as the operand of sizeof).
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
47
48#ifdef U_ENABLE_GENERIC_ISO_2022
49/*
50 * I am disabling the generic ISO-2022 converter after proposing to do so on
51 * the icu mailing list two days ago.
52 *
53 * Reasons:
54 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
55 *    its designation sequences, single shifts with return to the previous state,
56 *    switch-with-no-return to UTF-16BE or similar, etc.
57 *    This is unlike the language-specific variants like ISO-2022-JP which
58 *    require a much smaller repertoire of ISO-2022 features.
59 *    These variants continue to be supported.
60 * 2. I believe that no one is really using the generic ISO-2022 converter
61 *    but rather always one of the language-specific variants.
62 *    Note that ICU's generic ISO-2022 converter has always output one escape
63 *    sequence followed by UTF-8 for the whole stream.
64 * 3. Switching between subcharsets is extremely slow, because each time
65 *    the previous converter is closed and a new one opened,
66 *    without any kind of caching, least-recently-used list, etc.
67 * 4. The code is currently buggy, and given the above it does not seem
68 *    reasonable to spend the time on maintenance.
69 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
70 *    This means, for example, that when ISO-8859-7 is designated, the following
71 *    ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
72 *    The ICU ISO-2022 converter does not handle this - and has no information
73 *    about which subconverter would have to be shifted vs. which is designed
74 *    for 7-bit ISO-2022.
75 *
76 * Markus Scherer 2003-dec-03
77 */
78#endif
79
/* The Shift-In control byte (0x0F) as a NUL-terminated string. */
static const char SHIFT_IN_STR[]  = "\x0F";
// static const char SHIFT_OUT_STR[] = "\x0E";

/* Control and whitespace byte values, listed in ascending order. */
#define H_TAB   0x09
#define LF      0x0A
#define V_TAB   0x0B
#define CR      0x0D
#define SPACE   0x20
88
/* First and last code point of the range named HWKANA (cf. HWKANA_7BIT, "Halfwidth Katakana"). */
enum {
    HWKANA_START = 0xff61,
    HWKANA_END   = 0xff9f
};
93
/*
 * Native (GR) byte ranges of the character sets that ISO 2022 carries in
 * 7-bit form; the ISO 2022 byte is the native byte minus 0x80:
 *   94-character sets: native A1..FE  ->  ISO 2022 bytes 21..7E
 *   96-character sets: native A0..FF  ->  ISO 2022 bytes 20..7F
 * C1 control codes (native bytes 80..9F) must not be encoded as
 * bytes 00..1F (C0 control codes).
 */
enum {
    GR94_START = 0xa1,
    GR94_END   = 0xfe,

    GR96_START = 0xa0,
    GR96_END   = 0xff
};
108
/*
 * ISO 2022 control codes must not be converted from Unicode
 * because they would mess up the byte stream.
 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
 * corresponding to SO, SI, and ESC.
 *
 * The range test is done on the value converted to uint32_t so that a
 * negative signed argument is rejected instead of being used as a shift
 * count, and the shift count itself is masked to 0..31.
 * For every argument in 0..0x1f the result is the same as before.
 * Note: the argument is evaluated more than once.
 */
#define IS_2022_CONTROL(c) (((uint32_t)(c)<0x20) && (((uint32_t)1<<((c)&0x1f))&0x0800c000)!=0)
116
/*
 * Charset and state identifiers for the ISO-2022-JP and ISO-2022-CN
 * implementations. The JP and CN groups deliberately reuse the same
 * small values; every constant is spelled out with its numeric value.
 */
typedef enum {
    /* values shared by both variants */
    INVALID_STATE = -1,
    ASCII         = 0,

    SS2_STATE     = 0x10,
    SS3_STATE     = 0x11,

    /* JP */
    ISO8859_1     = 1,
    ISO8859_7     = 2,
    JISX201       = 3,
    JISX208       = 4,
    JISX212       = 5,
    GB2312        = 6,
    KSC5601       = 7,
    HWKANA_7BIT   = 8,    /* Halfwidth Katakana 7 bit */

    /* CN */
    /* these first constants must keep their values because they correspond to myConverterArray[] */
    GB2312_1      = 1,
    ISO_IR_165    = 2,
    CNS_11643     = 3,

    /*
     * The following are used in StateEnum and ISO2022State variables,
     * but CNS_11643 must be used to index into myConverterArray[].
     */
    CNS_11643_0   = 0x20,
    CNS_11643_1   = 0x21,
    CNS_11643_2   = 0x22,
    CNS_11643_3   = 0x23,
    CNS_11643_4   = 0x24,
    CNS_11643_5   = 0x25,
    CNS_11643_6   = 0x26,
    CNS_11643_7   = 0x27
} StateEnum;
155
/*
 * True if the StateEnum charset value cs lies in JISX208..KSC5601,
 * the range treated as DBCS charsets for ISO-2022-JP.
 * Note: cs is evaluated twice.
 */
#define IS_JP_DBCS(cs) (JISX208<=(cs) && KSC5601>=(cs))
158
/* Charset mask: the single bit at position cs, used to build and test jpCharsetMasks[] entries. */
#define CSM(cs) ((uint16_t)1 << (cs))
160
161/*
162 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
163 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
164 *
165 * Note: The converter uses some leniency:
166 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
167 *   all versions, not just JIS7 and JIS8.
168 * - ICU does not distinguish between different versions of JIS X 0208.
169 */
170enum { MAX_JA_VERSION=4 };
171static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
172    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
173    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
174    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
175    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
176    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
177};
178
/* Converter type tags; stored in UConverterDataISO2022.currentType. */
typedef enum {
    ASCII1 = 0,
    LATIN1 = 1,
    SBCS   = 2,
    DBCS   = 3,
    MBCS   = 4,
    HWKANA = 5
} Cnv2022Type;
187
/* Designation and shift state for one conversion direction. */
typedef struct ISO2022State {
    /* charset number for each of SI (G0), SO (G1), SS2 (G2), SS3 (G3) */
    int8_t cs[4];
    /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t g;
    /* value of g before a single shift (SS2 or SS3) */
    int8_t prevG;
} ISO2022State;
193
/* Bits of the converter open options that hold the variant version number. */
#define UCNV_OPTIONS_VERSION_MASK 0xf
/* Number of slots in UConverterDataISO2022.myConverterArray[]. */
#define UCNV_2022_MAX_CONVERTERS 10
196
197typedef struct{
198    UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
199    UConverter *currentConverter;
200    Cnv2022Type currentType;
201    ISO2022State toU2022State, fromU2022State;
202    uint32_t key;
203    uint32_t version;
204#ifdef U_ENABLE_GENERIC_ISO_2022
205    UBool isFirstBuffer;
206#endif
207    UBool isEmptySegment;
208    char name[30];
209    char locale[3];
210}UConverterDataISO2022;
211
212/* Protos */
213/* ISO-2022 ----------------------------------------------------------------- */
214
215/*Forward declaration */
216U_CFUNC void
217ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
218                      UErrorCode * err);
219U_CFUNC void
220ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
221                                    UErrorCode * err);
222
/* The ESC byte that introduces every ISO 2022 escape sequence. */
#define ESC_2022 0x1B
224
/* Classification of an escape-sequence prefix; these are the values stored in escSeqStateTable_Value_2022[]. */
typedef enum
{
    /* does not correspond to a valid ISO 2022 escape sequence */
    INVALID_2022 = -1,

    /* so far corresponds to a valid ISO 2022 escape sequence, but is not complete yet */
    VALID_NON_TERMINAL_2022 = 0,

    /* corresponds to a complete, valid ISO 2022 escape sequence */
    VALID_TERMINAL_2022 = 1,

    /*
     * so far matches one ISO 2022 escape sequence, but adding more
     * characters might match another escape sequence
     */
    VALID_MAYBE_TERMINAL_2022 = 2
} UCNV_TableStates_2022;
232
233/*
234* The way these state transition arrays work is:
235* ex : ESC$B is the sequence for JISX208
236*      a) First Iteration: char is ESC
237*          i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
238*             int x = normalize_esq_chars_2022[27] which is equal to 1
239*         ii) Search for this value in escSeqStateTable_Key_2022[]
240*             value of x is stored at escSeqStateTable_Key_2022[0]
241*        iii) Save this index as offset
242*         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
243*             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
244*     b) Switch on this state and continue to next char
245*          i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
246*             which is normalize_esq_chars_2022[36] == 4
247*         ii) x is currently 1(from above)
248*               x<<=5 -- x is now 32
249*               x+=normalize_esq_chars_2022[36]
250*               now x is 36
251*        iii) Search for this value in escSeqStateTable_Key_2022[]
252*             value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
253*         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
254*             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
255*     c) Switch on this state and continue to next char
256*        i)  Get the value of B from normalize_esq_chars_2022[] with int value of B as index
257*        ii) x is currently 36 (from above)
258*            x<<=5 -- x is now 1152
259*            x+=normalize_esq_chars_2022[66]
260*            now x is 1161
*       iii) Search for this value in escSeqStateTable_Key_2022[]
*            value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
*        iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
*            escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
*         v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208
266*/
267
268
269/*Below are the 3 arrays depicting a state transition table*/
/*
 * Maps a byte of an escape sequence to a small code (1..29) that is used to
 * build the lookup key for escSeqStateTable_Key_2022[]; 0 means the byte
 * does not take part in any recognized escape sequence.
 * Laid out 16 entries per row, so row and column give the byte value in hex.
 * The nonzero entries are at 0x1B, 0x24..0x26, 0x28..0x2B, 0x2D..0x2F,
 * 0x40..0x4F, 0x52 and 0x5A.
 */
static const int8_t normalize_esq_chars_2022[256] = {
/*          x0  x1  x2  x3  x4  x5  x6  x7  x8  x9  xA  xB  xC  xD  xE  xF */
/* 0x00 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0x10 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,
/* 0x20 */   0,  0,  0,  0,  4,  7, 29,  0,  2, 24, 26, 27,  0,  3, 23,  6,
/* 0x30 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0x40 */   5,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 25, 28,
/* 0x50 */   0,  0, 21,  0,  0,  0,  0,  0,  0,  0, 22,  0,  0,  0,  0,  0,
/* 0x60 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0x70 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0x80 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0x90 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0xA0 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0xB0 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0xC0 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0xD0 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0xE0 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0xF0 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
};
300
301#ifdef U_ENABLE_GENERIC_ISO_2022
302/*
303 * When the generic ISO-2022 converter is completely removed, not just disabled
304 * per #ifdef, then the following state table and the associated tables that are
305 * dimensioned with MAX_STATES_2022 should be trimmed.
306 *
307 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
308 * the associated escape sequences starting with ESC ( B should be removed.
309 * This includes the ones with key values 1097 and all of the ones above 1000000.
310 *
311 * For the latter, the tables can simply be truncated.
312 * For the former, since the tables must be kept parallel, it is probably best
313 * to simply duplicate an adjacent table cell, parallel in all tables.
314 *
315 * It may make sense to restructure the tables, especially by using small search
316 * tables for the variants instead of indexing them parallel to the table here.
317 */
318#endif
319
/* Number of entries in each of the parallel escape-sequence state tables. */
#define MAX_STATES_2022 74

/*
 * Lookup keys of all recognized escape-sequence prefixes, in ascending order.
 * A key is built from the normalize_esq_chars_2022[] codes of the bytes seen
 * so far (key = (key << 5) + code). The index of a key in this table is the
 * index into the parallel tables.
 */
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
    /*  0.. 9 */ 1,        34,       36,       39,       55,       57,       60,       61,       1093,     1096,
    /* 10..19 */ 1097,     1098,     1099,     1100,     1101,     1102,     1103,     1104,     1105,     1106,
    /* 20..29 */ 1109,     1154,     1157,     1160,     1161,     1176,     1178,     1179,     1254,     1257,
    /* 30..39 */ 1768,     1773,     1957,     35105,    36933,    36936,    36937,    36938,    36939,    36940,
    /* 40..49 */ 36942,    36943,    36944,    36945,    36946,    36947,    36948,    37640,    37642,    37644,
    /* 50..59 */ 37646,    37711,    37744,    37745,    37746,    37747,    37748,    40133,    40136,    40138,
    /* 60..69 */ 40139,    40140,    40141,    1123363,  35947624, 35947625, 35947626, 35947627, 35947629, 35947630,
    /* 70..73 */ 35947631, 35947635, 35947636, 35947638
};
333
#ifdef U_ENABLE_GENERIC_ISO_2022

/*
 * Converter name for each escape sequence, parallel to
 * escSeqStateTable_Key_2022[]; NULL where the entry names no converter.
 * Only compiled for the generic ISO-2022 converter.
 */
static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
    /*  0.. 4 */ NULL,                   NULL,                   NULL,                   NULL,                   NULL,
    /*  5.. 9 */ NULL,                   NULL,                   NULL,                   "latin1",               "latin1",
    /* 10..14 */ "latin1",               "ibm-865",              "ibm-865",              "ibm-865",              "ibm-865",
    /* 15..19 */ "ibm-865",              "ibm-865",              "JISX0201",             "JISX0201",             "latin1",
    /* 20..24 */ "latin1",               NULL,                   "JISX-208",             "ibm-5478",             "JISX-208",
    /* 25..29 */ NULL,                   NULL,                   NULL,                   NULL,                   "UTF8",
    /* 30..34 */ "ISO-8859-1",           "ISO-8859-7",           "JIS-X-208",            NULL,                   "ibm-955",
    /* 35..39 */ "ibm-367",              "ibm-952",              "ibm-949",              "JISX-212",             "ibm-1383",
    /* 40..44 */ "ibm-952",              "ibm-964",              "ibm-964",              "ibm-964",              "ibm-964",
    /* 45..49 */ "ibm-964",              "ibm-964",              "ibm-5478",             "ibm-949",              "ISO-IR-165",
    /* 50..54 */ "CNS-11643-1992,1",     "CNS-11643-1992,2",     "CNS-11643-1992,3",     "CNS-11643-1992,4",     "CNS-11643-1992,5",
    /* 55..59 */ "CNS-11643-1992,6",     "CNS-11643-1992,7",     "UTF16_PlatformEndian", "UTF16_PlatformEndian", "UTF16_PlatformEndian",
    /* 60..64 */ "UTF16_PlatformEndian", "UTF16_PlatformEndian", "UTF16_PlatformEndian", NULL,                   "latin1",
    /* 65..69 */ "ibm-912",              "ibm-913",              "ibm-914",              "ibm-813",              "ibm-1089",
    /* 70..73 */ "ibm-920",              "ibm-915",              "ibm-915",              "latin1"
};

#endif
350
351static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
352/*          0                           1                         2                             3                           4                           5                               6                        7                          8                           9       */
353     VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022     ,VALID_NON_TERMINAL_2022   ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
354    ,VALID_MAYBE_TERMINAL_2022  ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
355    ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022
356    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
357    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
358    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
359    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
360    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
361};
362
363
/*
 * Identifies the ISO-2022 variant; introduced for refactoring the
 * changeState_2022 code. The generic variant exists only when
 * U_ENABLE_GENERIC_ISO_2022 is defined.
 */
typedef enum {
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022    = 0,
#endif
    ISO_2022_JP = 1,
    ISO_2022_KR = 2,
    ISO_2022_CN = 3
} Variant2022;
373
374/*********** ISO 2022 Converter Protos ***********/
375static void
376_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);
377
378static void
379 _ISO2022Close(UConverter *converter);
380
381static void
382_ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
383
384static const char*
385_ISO2022getName(const UConverter* cnv);
386
387static void
388_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
389
390static UConverter *
391_ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
392
393#ifdef U_ENABLE_GENERIC_ISO_2022
394static void
395T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
396#endif
397
398namespace {
399
400/*const UConverterSharedData _ISO2022Data;*/
401extern const UConverterSharedData _ISO2022JPData;
402extern const UConverterSharedData _ISO2022KRData;
403extern const UConverterSharedData _ISO2022CNData;
404
405}  // namespace
406
407/*************** Converter implementations ******************/
408
409/* The purpose of this function is to get around gcc compiler warnings. */
410static inline void
411fromUWriteUInt8(UConverter *cnv,
412                 const char *bytes, int32_t length,
413                 uint8_t **target, const char *targetLimit,
414                 int32_t **offsets,
415                 int32_t sourceIndex,
416                 UErrorCode *pErrorCode)
417{
418    char *targetChars = (char *)*target;
419    ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
420                         offsets, sourceIndex, pErrorCode);
421    *target = (uint8_t*)targetChars;
422
423}
424
425static inline void
426setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){
427    if(myConverterData->version == 1) {
428        UConverter *cnv = myConverterData->currentConverter;
429
430        cnv->toUnicodeStatus=0;     /* offset */
431        cnv->mode=0;                /* state */
432        cnv->toULength=0;           /* byteIndex */
433    }
434}
435
436static inline void
437setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
438   /* in ISO-2022-KR the designator sequence appears only once
439    * in a file so we append it only once
440    */
441    if( converter->charErrorBufferLength==0){
442
443        converter->charErrorBufferLength = 4;
444        converter->charErrorBuffer[0] = 0x1b;
445        converter->charErrorBuffer[1] = 0x24;
446        converter->charErrorBuffer[2] = 0x29;
447        converter->charErrorBuffer[3] = 0x43;
448    }
449    if(myConverterData->version == 1) {
450        UConverter *cnv = myConverterData->currentConverter;
451
452        cnv->fromUChar32=0;
453        cnv->fromUnicodeStatus=1;   /* prevLength */
454    }
455}
456
457static void
458_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
459
460    char myLocale[6]={' ',' ',' ',' ',' ',' '};
461
462    cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
463    if(cnv->extraInfo != NULL) {
464        UConverterNamePieces stackPieces;
465        UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER;
466        UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
467        uint32_t version;
468
469        stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
470
471        uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
472        myConverterData->currentType = ASCII1;
473        cnv->fromUnicodeStatus =FALSE;
474        if(pArgs->locale){
475            uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale));
476        }
477        version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
478        myConverterData->version = version;
479        if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
480            (myLocale[2]=='_' || myLocale[2]=='\0'))
481        {
482            size_t len=0;
483            /* open the required converters and cache them */
484            if(version>MAX_JA_VERSION) {
485                /* prevent indexing beyond jpCharsetMasks[] */
486                myConverterData->version = version = 0;
487            }
488            if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
489                myConverterData->myConverterArray[ISO8859_7] =
490                    ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
491            }
492            myConverterData->myConverterArray[JISX208] =
493                ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode);
494            if(jpCharsetMasks[version]&CSM(JISX212)) {
495                myConverterData->myConverterArray[JISX212] =
496                    ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
497            }
498            if(jpCharsetMasks[version]&CSM(GB2312)) {
499                myConverterData->myConverterArray[GB2312] =
500                    ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);   /* gb_2312_80-1 */
501            }
502            if(jpCharsetMasks[version]&CSM(KSC5601)) {
503                myConverterData->myConverterArray[KSC5601] =
504                    ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
505            }
506
507            /* set the function pointers to appropriate funtions */
508            cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
509            uprv_strcpy(myConverterData->locale,"ja");
510
511            (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
512            len = uprv_strlen(myConverterData->name);
513            myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
514            myConverterData->name[len+1]='\0';
515        }
516        else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
517            (myLocale[2]=='_' || myLocale[2]=='\0'))
518        {
519            const char *cnvName;
520            if(version==1) {
521                cnvName="icu-internal-25546";
522            } else {
523                cnvName="ibm-949";
524                myConverterData->version=version=0;
525            }
526            if(pArgs->onlyTestIsLoadable) {
527                ucnv_canCreateConverter(cnvName, errorCode);  /* errorCode carries result */
528                uprv_free(cnv->extraInfo);
529                cnv->extraInfo=NULL;
530                return;
531            } else {
532                myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
533                if (U_FAILURE(*errorCode)) {
534                    _ISO2022Close(cnv);
535                    return;
536                }
537
538                if(version==1) {
539                    (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
540                    uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
541                    cnv->subCharLen = myConverterData->currentConverter->subCharLen;
542                }else{
543                    (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
544                }
545
546                /* initialize the state variables */
547                setInitialStateToUnicodeKR(cnv, myConverterData);
548                setInitialStateFromUnicodeKR(cnv, myConverterData);
549
550                /* set the function pointers to appropriate funtions */
551                cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
552                uprv_strcpy(myConverterData->locale,"ko");
553            }
554        }
555        else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
556            (myLocale[2]=='_' || myLocale[2]=='\0'))
557        {
558
559            /* open the required converters and cache them */
560            myConverterData->myConverterArray[GB2312_1] =
561                ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);
562            if(version==1) {
563                myConverterData->myConverterArray[ISO_IR_165] =
564                    ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode);
565            }
566            myConverterData->myConverterArray[CNS_11643] =
567                ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode);
568
569
570            /* set the function pointers to appropriate funtions */
571            cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
572            uprv_strcpy(myConverterData->locale,"cn");
573
574            if (version==0){
575                myConverterData->version = 0;
576                (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
577            }else if (version==1){
578                myConverterData->version = 1;
579                (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
580            }else {
581                myConverterData->version = 2;
582                (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
583            }
584        }
585        else{
586#ifdef U_ENABLE_GENERIC_ISO_2022
587            myConverterData->isFirstBuffer = TRUE;
588
589            /* append the UTF-8 escape sequence */
590            cnv->charErrorBufferLength = 3;
591            cnv->charErrorBuffer[0] = 0x1b;
592            cnv->charErrorBuffer[1] = 0x25;
593            cnv->charErrorBuffer[2] = 0x42;
594
595            cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
596            /* initialize the state variables */
597            uprv_strcpy(myConverterData->name,"ISO_2022");
598#else
599            *errorCode = U_UNSUPPORTED_ERROR;
600            return;
601#endif
602        }
603
604        cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
605
606        if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
607            _ISO2022Close(cnv);
608        }
609    } else {
610        *errorCode = U_MEMORY_ALLOCATION_ERROR;
611    }
612}
613
614
615static void
616_ISO2022Close(UConverter *converter) {
617    UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
618    UConverterSharedData **array = myData->myConverterArray;
619    int32_t i;
620
621    if (converter->extraInfo != NULL) {
622        /*close the array of converter pointers and free the memory*/
623        for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
624            if(array[i]!=NULL) {
625                ucnv_unloadSharedDataIfReady(array[i]);
626            }
627        }
628
629        ucnv_close(myData->currentConverter);
630
631        if(!converter->isExtraLocal){
632            uprv_free (converter->extraInfo);
633            converter->extraInfo = NULL;
634        }
635    }
636}
637
638static void
639_ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
640    UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
641    if(choice<=UCNV_RESET_TO_UNICODE) {
642        uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
643        myConverterData->key = 0;
644        myConverterData->isEmptySegment = FALSE;
645    }
646    if(choice!=UCNV_RESET_TO_UNICODE) {
647        uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
648    }
649#ifdef U_ENABLE_GENERIC_ISO_2022
650    if(myConverterData->locale[0] == 0){
651        if(choice<=UCNV_RESET_TO_UNICODE) {
652            myConverterData->isFirstBuffer = TRUE;
653            myConverterData->key = 0;
654            if (converter->mode == UCNV_SO){
655                ucnv_close (myConverterData->currentConverter);
656                myConverterData->currentConverter=NULL;
657            }
658            converter->mode = UCNV_SI;
659        }
660        if(choice!=UCNV_RESET_TO_UNICODE) {
661            /* re-append UTF-8 escape sequence */
662            converter->charErrorBufferLength = 3;
663            converter->charErrorBuffer[0] = 0x1b;
664            converter->charErrorBuffer[1] = 0x28;
665            converter->charErrorBuffer[2] = 0x42;
666        }
667    }
668    else
669#endif
670    {
671        /* reset the state variables */
672        if(myConverterData->locale[0] == 'k'){
673            if(choice<=UCNV_RESET_TO_UNICODE) {
674                setInitialStateToUnicodeKR(converter, myConverterData);
675            }
676            if(choice!=UCNV_RESET_TO_UNICODE) {
677                setInitialStateFromUnicodeKR(converter, myConverterData);
678            }
679        }
680    }
681}
682
683static const char*
684_ISO2022getName(const UConverter* cnv){
685    if(cnv->extraInfo){
686        UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
687        return myData->name;
688    }
689    return NULL;
690}
691
692
693/*************** to unicode *******************/
694/****************************************************************************
695 * Recognized escape sequences are
696 * <ESC>(B  ASCII
697 * <ESC>.A  ISO-8859-1
698 * <ESC>.F  ISO-8859-7
699 * <ESC>(J  JISX-201
700 * <ESC>(I  JISX-201
701 * <ESC>$B  JISX-208
702 * <ESC>$@  JISX-208
703 * <ESC>$(D JISX-212
704 * <ESC>$A  GB2312
705 * <ESC>$(C KSC5601
706 */
707static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
708/*      0                1               2               3               4               5               6               7               8               9    */
709    INVALID_STATE   ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
710    ,ASCII          ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,JISX201        ,HWKANA_7BIT    ,JISX201        ,INVALID_STATE
711    ,INVALID_STATE  ,INVALID_STATE  ,JISX208        ,GB2312         ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
712    ,ISO8859_1      ,ISO8859_7      ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,KSC5601        ,JISX212        ,INVALID_STATE
713    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
714    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
715    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
716    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
717};
718
719/*************** to unicode *******************/
720static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
721/*      0                1               2               3               4               5               6               7               8               9    */
722     INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,SS3_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
723    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
724    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
725    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
726    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,GB2312_1       ,INVALID_STATE  ,ISO_IR_165
727    ,CNS_11643_1    ,CNS_11643_2    ,CNS_11643_3    ,CNS_11643_4    ,CNS_11643_5    ,CNS_11643_6    ,CNS_11643_7    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
728    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
729    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
730};
731
732
733static UCNV_TableStates_2022
734getKey_2022(char c,int32_t* key,int32_t* offset){
735    int32_t togo;
736    int32_t low = 0;
737    int32_t hi = MAX_STATES_2022;
738    int32_t oldmid=0;
739
740    togo = normalize_esq_chars_2022[(uint8_t)c];
741    if(togo == 0) {
742        /* not a valid character anywhere in an escape sequence */
743        *key = 0;
744        *offset = 0;
745        return INVALID_2022;
746    }
747    togo = (*key << 5) + togo;
748
749    while (hi != low)  /*binary search*/{
750
751        int32_t mid = (hi+low) >> 1; /*Finds median*/
752
753        if (mid == oldmid)
754            break;
755
756        if (escSeqStateTable_Key_2022[mid] > togo){
757            hi = mid;
758        }
759        else if (escSeqStateTable_Key_2022[mid] < togo){
760            low = mid;
761        }
762        else /*we found it*/{
763            *key = togo;
764            *offset = mid;
765            return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
766        }
767        oldmid = mid;
768
769    }
770
771    *key = 0;
772    *offset = 0;
773    return INVALID_2022;
774}
775
776/*runs through a state machine to determine the escape sequence - codepage correspondance
777 */
778static void
779changeState_2022(UConverter* _this,
780                const char** source,
781                const char* sourceLimit,
782                Variant2022 var,
783                UErrorCode* err){
784    UCNV_TableStates_2022 value;
785    UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
786    uint32_t key = myData2022->key;
787    int32_t offset = 0;
788    int8_t initialToULength = _this->toULength;
789    char c;
790
791    value = VALID_NON_TERMINAL_2022;
792    while (*source < sourceLimit) {
793        c = *(*source)++;
794        _this->toUBytes[_this->toULength++]=(uint8_t)c;
795        value = getKey_2022(c,(int32_t *) &key, &offset);
796
797        switch (value){
798
799        case VALID_NON_TERMINAL_2022 :
800            /* continue with the loop */
801            break;
802
803        case VALID_TERMINAL_2022:
804            key = 0;
805            goto DONE;
806
807        case INVALID_2022:
808            goto DONE;
809
810        case VALID_MAYBE_TERMINAL_2022:
811#ifdef U_ENABLE_GENERIC_ISO_2022
812            /* ESC ( B is ambiguous only for ISO_2022 itself */
813            if(var == ISO_2022) {
814                /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
815                _this->toULength = 0;
816
817                /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
818
819                /* continue with the loop */
820                value = VALID_NON_TERMINAL_2022;
821                break;
822            } else
823#endif
824            {
825                /* not ISO_2022 itself, finish here */
826                value = VALID_TERMINAL_2022;
827                key = 0;
828                goto DONE;
829            }
830        }
831    }
832
833DONE:
834    myData2022->key = key;
835
836    if (value == VALID_NON_TERMINAL_2022) {
837        /* indicate that the escape sequence is incomplete: key!=0 */
838        return;
839    } else if (value == INVALID_2022 ) {
840        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
841    } else /* value == VALID_TERMINAL_2022 */ {
842        switch(var){
843#ifdef U_ENABLE_GENERIC_ISO_2022
844        case ISO_2022:
845        {
846            const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
847            if(chosenConverterName == NULL) {
848                /* SS2 or SS3 */
849                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
850                _this->toUCallbackReason = UCNV_UNASSIGNED;
851                return;
852            }
853
854            _this->mode = UCNV_SI;
855            ucnv_close(myData2022->currentConverter);
856            myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
857            if(U_SUCCESS(*err)) {
858                myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
859                _this->mode = UCNV_SO;
860            }
861            break;
862        }
863#endif
864        case ISO_2022_JP:
865            {
866                StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
867                switch(tempState) {
868                case INVALID_STATE:
869                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
870                    break;
871                case SS2_STATE:
872                    if(myData2022->toU2022State.cs[2]!=0) {
873                        if(myData2022->toU2022State.g<2) {
874                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
875                        }
876                        myData2022->toU2022State.g=2;
877                    } else {
878                        /* illegal to have SS2 before a matching designator */
879                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
880                    }
881                    break;
882                /* case SS3_STATE: not used in ISO-2022-JP-x */
883                case ISO8859_1:
884                case ISO8859_7:
885                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
886                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
887                    } else {
888                        /* G2 charset for SS2 */
889                        myData2022->toU2022State.cs[2]=(int8_t)tempState;
890                    }
891                    break;
892                default:
893                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
894                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
895                    } else {
896                        /* G0 charset */
897                        myData2022->toU2022State.cs[0]=(int8_t)tempState;
898                    }
899                    break;
900                }
901            }
902            break;
903        case ISO_2022_CN:
904            {
905                StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
906                switch(tempState) {
907                case INVALID_STATE:
908                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
909                    break;
910                case SS2_STATE:
911                    if(myData2022->toU2022State.cs[2]!=0) {
912                        if(myData2022->toU2022State.g<2) {
913                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
914                        }
915                        myData2022->toU2022State.g=2;
916                    } else {
917                        /* illegal to have SS2 before a matching designator */
918                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
919                    }
920                    break;
921                case SS3_STATE:
922                    if(myData2022->toU2022State.cs[3]!=0) {
923                        if(myData2022->toU2022State.g<2) {
924                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
925                        }
926                        myData2022->toU2022State.g=3;
927                    } else {
928                        /* illegal to have SS3 before a matching designator */
929                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
930                    }
931                    break;
932                case ISO_IR_165:
933                    if(myData2022->version==0) {
934                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
935                        break;
936                    }
937                    /*fall through*/
938                case GB2312_1:
939                    /*fall through*/
940                case CNS_11643_1:
941                    myData2022->toU2022State.cs[1]=(int8_t)tempState;
942                    break;
943                case CNS_11643_2:
944                    myData2022->toU2022State.cs[2]=(int8_t)tempState;
945                    break;
946                default:
947                    /* other CNS 11643 planes */
948                    if(myData2022->version==0) {
949                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
950                    } else {
951                       myData2022->toU2022State.cs[3]=(int8_t)tempState;
952                    }
953                    break;
954                }
955            }
956            break;
957        case ISO_2022_KR:
958            if(offset==0x30){
959                /* nothing to be done, just accept this one escape sequence */
960            } else {
961                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
962            }
963            break;
964
965        default:
966            *err = U_ILLEGAL_ESCAPE_SEQUENCE;
967            break;
968        }
969    }
970    if(U_SUCCESS(*err)) {
971        _this->toULength = 0;
972    } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
973        if(_this->toULength>1) {
974            /*
975             * Ticket 5691: consistent illegal sequences:
976             * - We include at least the first byte (ESC) in the illegal sequence.
977             * - If any of the non-initial bytes could be the start of a character,
978             *   we stop the illegal sequence before the first one of those.
979             *   In escape sequences, all following bytes are "printable", that is,
980             *   unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
981             *   they are valid single/lead bytes.
982             *   For simplicity, we always only report the initial ESC byte as the
983             *   illegal sequence and back out all other bytes we looked at.
984             */
985            /* Back out some bytes. */
986            int8_t backOutDistance=_this->toULength-1;
987            int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
988            if(backOutDistance<=bytesFromThisBuffer) {
989                /* same as initialToULength<=1 */
990                *source-=backOutDistance;
991            } else {
992                /* Back out bytes from the previous buffer: Need to replay them. */
993                _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
994                /* same as -(initialToULength-1) */
995                /* preToULength is negative! */
996                uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
997                *source-=bytesFromThisBuffer;
998            }
999            _this->toULength=1;
1000        }
1001    } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
1002        _this->toUCallbackReason = UCNV_UNASSIGNED;
1003    }
1004}
1005
1006/*Checks the characters of the buffer against valid 2022 escape sequences
1007*if the match we return a pointer to the initial start of the sequence otherwise
1008*we return sourceLimit
1009*/
1010/*for 2022 looks ahead in the stream
1011 *to determine the longest possible convertible
1012 *data stream
1013 */
1014static inline const char*
1015getEndOfBuffer_2022(const char** source,
1016                   const char* sourceLimit,
1017                   UBool /*flush*/){
1018
1019    const char* mySource = *source;
1020
1021#ifdef U_ENABLE_GENERIC_ISO_2022
1022    if (*source >= sourceLimit)
1023        return sourceLimit;
1024
1025    do{
1026
1027        if (*mySource == ESC_2022){
1028            int8_t i;
1029            int32_t key = 0;
1030            int32_t offset;
1031            UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
1032
1033            /* Kludge: I could not
1034            * figure out the reason for validating an escape sequence
1035            * twice - once here and once in changeState_2022().
1036            * is it possible to have an ESC character in a ISO2022
1037            * byte stream which is valid in a code page? Is it legal?
1038            */
1039            for (i=0;
1040            (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
1041            i++) {
1042                value =  getKey_2022(*(mySource+i), &key, &offset);
1043            }
1044            if (value > 0 || *mySource==ESC_2022)
1045                return mySource;
1046
1047            if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
1048                return sourceLimit;
1049        }
1050    }while (++mySource < sourceLimit);
1051
1052    return sourceLimit;
1053#else
1054    while(mySource < sourceLimit && *mySource != ESC_2022) {
1055        ++mySource;
1056    }
1057    return mySource;
1058#endif
1059}
1060
1061
1062/* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1063 * any future change in _MBCSFromUChar32() function should be reflected here.
1064 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1065 */
1066static inline int32_t
1067MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1068                                         UChar32 c,
1069                                         uint32_t* value,
1070                                         UBool useFallback,
1071                                         int outputType)
1072{
1073    const int32_t *cx;
1074    const uint16_t *table;
1075    uint32_t stage2Entry;
1076    uint32_t myValue;
1077    int32_t length;
1078    const uint8_t *p;
1079    /*
1080     * TODO(markus): Use and require new, faster MBCS conversion table structures.
1081     * Use internal version of ucnv_open() that verifies that the new structures are available,
1082     * else U_INTERNAL_PROGRAM_ERROR.
1083     */
1084    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1085    if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1086        table=sharedData->mbcs.fromUnicodeTable;
1087        stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1088        /* get the bytes and the length for the output */
1089        if(outputType==MBCS_OUTPUT_2){
1090            myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1091            if(myValue<=0xff) {
1092                length=1;
1093            } else {
1094                length=2;
1095            }
1096        } else /* outputType==MBCS_OUTPUT_3 */ {
1097            p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1098            myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1099            if(myValue<=0xff) {
1100                length=1;
1101            } else if(myValue<=0xffff) {
1102                length=2;
1103            } else {
1104                length=3;
1105            }
1106        }
1107        /* is this code point assigned, or do we use fallbacks? */
1108        if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1109            /* assigned */
1110            *value=myValue;
1111            return length;
1112        } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
1113            /*
1114             * We allow a 0 byte output if the "assigned" bit is set for this entry.
1115             * There is no way with this data structure for fallback output
1116             * to be a zero byte.
1117             */
1118            *value=myValue;
1119            return -length;
1120        }
1121    }
1122
1123    cx=sharedData->mbcs.extIndexes;
1124    if(cx!=NULL) {
1125        return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1126    }
1127
1128    /* unassigned */
1129    return 0;
1130}
1131
1132/* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1133 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1134 * @param retval pointer to output byte
1135 * @return 1 roundtrip byte  0 no mapping  -1 fallback byte
1136 */
1137static inline int32_t
1138MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1139                                       UChar32 c,
1140                                       uint32_t* retval,
1141                                       UBool useFallback)
1142{
1143    const uint16_t *table;
1144    int32_t value;
1145    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1146    if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1147        return 0;
1148    }
1149    /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1150    table=sharedData->mbcs.fromUnicodeTable;
1151    /* get the byte for the output */
1152    value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1153    /* is this code point assigned, or do we use fallbacks? */
1154    *retval=(uint32_t)(value&0xff);
1155    if(value>=0xf00) {
1156        return 1;  /* roundtrip */
1157    } else if(useFallback ? value>=0x800 : value>=0xc00) {
1158        return -1;  /* fallback taken */
1159    } else {
1160        return 0;  /* no mapping */
1161    }
1162}
1163
/*
 * Converts a strict EUC DBCS value to the ISO 2022 byte range.
 *
 * The low 16 bits must lie in A1A1..FEFE and the low byte in A1..FE, i.e.
 * both bytes of the pair are in A1..FE. If so, 0x80 is subtracted from each
 * byte (value - 0x8080) to move them to 21..7E.
 *
 * @param value codepage value to check and shift
 * @return the shifted value, or 0 if value is out of range
 */
static inline uint32_t
_2022FromGR94DBCS(uint32_t value) {
    const uint16_t pairDelta = (uint16_t)(value - 0xa1a1);
    const uint8_t trailDelta = (uint8_t)(value - 0xa1);

    if(pairDelta > (0xfefe - 0xa1a1) || trailDelta > (0xfe - 0xa1)) {
        return 0;  /* not valid for ISO 2022 */
    }
    return value - 0x8080;  /* shift down to 21..7e byte range */
}
1180
#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
/*
 * Reverse of _2022FromGR94DBCS(): adds 0x8080 to an ISO 2022 value and returns
 * the sum if each of its two bytes is in the range A1..FE. Otherwise the
 * input value is returned unchanged.
 */
static inline uint32_t
_2022ToGR94DBCS(uint32_t value) {
    const uint32_t shifted = value + 0x8080;

    if( (uint16_t)(shifted - 0xa1a1) > (0xfefe - 0xa1a1) ||
        (uint8_t)(shifted - 0xa1) > (0xfe - 0xa1)) {
        return value;
    }
    return shifted;
}
#endif
1198
1199#ifdef U_ENABLE_GENERIC_ISO_2022
1200
1201/**********************************************************************************
1202*  ISO-2022 Converter
1203*
1204*
1205*/
1206
1207static void
1208T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
1209                                                           UErrorCode* err){
1210    const char* mySourceLimit, *realSourceLimit;
1211    const char* sourceStart;
1212    const UChar* myTargetStart;
1213    UConverter* saveThis;
1214    UConverterDataISO2022* myData;
1215    int8_t length;
1216
1217    saveThis = args->converter;
1218    myData=((UConverterDataISO2022*)(saveThis->extraInfo));
1219
1220    realSourceLimit = args->sourceLimit;
1221    while (args->source < realSourceLimit) {
1222        if(myData->key == 0) { /* are we in the middle of an escape sequence? */
1223            /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
1224            mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);
1225
1226            if(args->source < mySourceLimit) {
1227                if(myData->currentConverter==NULL) {
1228                    myData->currentConverter = ucnv_open("ASCII",err);
1229                    if(U_FAILURE(*err)){
1230                        return;
1231                    }
1232
1233                    myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
1234                    saveThis->mode = UCNV_SO;
1235                }
1236
1237                /* convert to before the ESC or until the end of the buffer */
1238                myData->isFirstBuffer=FALSE;
1239                sourceStart = args->source;
1240                myTargetStart = args->target;
1241                args->converter = myData->currentConverter;
1242                ucnv_toUnicode(args->converter,
1243                    &args->target,
1244                    args->targetLimit,
1245                    &args->source,
1246                    mySourceLimit,
1247                    args->offsets,
1248                    (UBool)(args->flush && mySourceLimit == realSourceLimit),
1249                    err);
1250                args->converter = saveThis;
1251
1252                if (*err == U_BUFFER_OVERFLOW_ERROR) {
1253                    /* move the overflow buffer */
1254                    length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
1255                    myData->currentConverter->UCharErrorBufferLength = 0;
1256                    if(length > 0) {
1257                        uprv_memcpy(saveThis->UCharErrorBuffer,
1258                                    myData->currentConverter->UCharErrorBuffer,
1259                                    length*U_SIZEOF_UCHAR);
1260                    }
1261                    return;
1262                }
1263
1264                /*
1265                 * At least one of:
1266                 * -Error while converting
1267                 * -Done with entire buffer
1268                 * -Need to write offsets or update the current offset
1269                 *  (leave that up to the code in ucnv.c)
1270                 *
1271                 * or else we just stopped at an ESC byte and continue with changeState_2022()
1272                 */
1273                if (U_FAILURE(*err) ||
1274                    (args->source == realSourceLimit) ||
1275                    (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
1276                    (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
1277                ) {
1278                    /* copy partial or error input for truncated detection and error handling */
1279                    if(U_FAILURE(*err)) {
1280                        length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
1281                        if(length > 0) {
1282                            uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
1283                        }
1284                    } else {
1285                        length = saveThis->toULength = myData->currentConverter->toULength;
1286                        if(length > 0) {
1287                            uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
1288                            if(args->source < mySourceLimit) {
1289                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
1290                            }
1291                        }
1292                    }
1293                    return;
1294                }
1295            }
1296        }
1297
1298        sourceStart = args->source;
1299        changeState_2022(args->converter,
1300               &(args->source),
1301               realSourceLimit,
1302               ISO_2022,
1303               err);
1304        if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
1305            /* let the ucnv.c code update its current offset */
1306            return;
1307        }
1308    }
1309}
1310
1311#endif
1312
1313/*
1314 * To Unicode Callback helper function
1315 */
1316static void
1317toUnicodeCallback(UConverter *cnv,
1318                  const uint32_t sourceChar, const uint32_t targetUniChar,
1319                  UErrorCode* err){
1320    if(sourceChar>0xff){
1321        cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1322        cnv->toUBytes[1] = (uint8_t)sourceChar;
1323        cnv->toULength = 2;
1324    }
1325    else{
1326        cnv->toUBytes[0] =(char) sourceChar;
1327        cnv->toULength = 1;
1328    }
1329
1330    if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1331        *err = U_INVALID_CHAR_FOUND;
1332    }
1333    else{
1334        *err = U_ILLEGAL_CHAR_FOUND;
1335    }
1336}
1337
1338/**************************************ISO-2022-JP*************************************************/
1339
1340/************************************** IMPORTANT **************************************************
1341* The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1342* MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1343* The converter iterates over each Unicode codepoint
1344* to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1345* processed one char at a time it would make sense to reduce the extra processing a canned converter
1346* would do as far as possible.
1347*
1348* If the implementation of these macros or structure of sharedData struct change in the future, make
1349* sure that ISO-2022 is also changed.
1350***************************************************************************************************
1351*/
1352
1353/***************************************************************************************************
1354* Rules for ISO-2022-jp encoding
1355* (i)   Escape sequences must be fully contained within a line they should not
1356*       span new lines or CRs
1357* (ii)  If the last character on a line is represented by two bytes then an ASCII or
1358*       JIS-Roman character escape sequence should follow before the line terminates
1359* (iii) If the first character on the line is represented by two bytes then a two
1360*       byte character escape sequence should precede it
1361* (iv)  If no escape sequence is encountered then the characters are ASCII
1362* (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1363*       and invoked with SS2 (ESC N).
1364* (vi)  If there is any G0 designation in text, there must be a switch to
1365*       ASCII or to JIS X 0201-Roman before a space character (but not
1366*       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1367*       characters such as tab or CRLF.
* (vii) Supported encodings:
1369*          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1370*
1371*  source : RFC-1554
1372*
1373*          JISX201, JISX208,JISX212 : new .cnv data files created
1374*          KSC5601 : alias to ibm-949 mapping table
1375*          GB2312 : alias to ibm-1386 mapping table
1376*          ISO-8859-1 : Algorithmic implemented as LATIN1 case
*          ISO-8859-7 : alias to ibm-9409 mapping table
1378*/
1379
1380/* preference order of JP charsets */
1381static const StateEnum jpCharsetPref[]={
1382    ASCII,
1383    JISX201,
1384    ISO8859_1,
1385    ISO8859_7,
1386    JISX208,
1387    JISX212,
1388    GB2312,
1389    KSC5601,
1390    HWKANA_7BIT
1391};
1392
/*
 * Designation escape sequence for each JP charset, as NUL-terminated strings.
 * The entries must be in order of the enum constants like JISX201  = 3,
 * not in order of jpCharsetPref[]!
 */
static const char escSeqChars[][6] ={
    "\x1B\x28\x42",         /* ASCII:       <ESC>(B  */
    "\x1B\x2E\x41",         /* ISO-8859-1:  <ESC>.A  */
    "\x1B\x2E\x46",         /* ISO-8859-7:  <ESC>.F  */
    "\x1B\x28\x4A",         /* JISX-201:    <ESC>(J  */
    "\x1B\x24\x42",         /* JISX-208:    <ESC>$B  */
    "\x1B\x24\x28\x44",     /* JISX-212:    <ESC>$(D */
    "\x1B\x24\x41",         /* GB2312:      <ESC>$A  */
    "\x1B\x24\x28\x43",     /* KSC5601:     <ESC>$(C */
    "\x1B\x28\x49"          /* HWKANA_7BIT: <ESC>(I  */
};
1409static  const int8_t escSeqCharsLen[] ={
1410    3, /* length of <ESC>(B  ASCII       */
1411    3, /* length of <ESC>.A  ISO-8859-1  */
1412    3, /* length of <ESC>.F  ISO-8859-7  */
1413    3, /* length of <ESC>(J  JISX-201    */
1414    3, /* length of <ESC>$B  JISX-208    */
1415    4, /* length of <ESC>$(D JISX-212    */
1416    3, /* length of <ESC>$A  GB2312      */
1417    4, /* length of <ESC>$(C KSC5601     */
1418    3  /* length of <ESC>(I  HWKANA_7BIT */
1419};
1420
/*
* The iteration over various code pages works this way:
* i)   Get the currentState from myConverterData->currentState
* ii)  Check if the character is mapped to a valid character in the currentState
*      Yes ->  a) set the initIterState to currentState
*              b) remain in this state until an invalid character is found
*      No  ->  a) go to the next code page and find the character
* iii) Before changing the state, increment the current state and check if it
*      is equal to the initIterState
*      Yes ->  The character cannot be represented in any of the supported encodings:
*              break and return a U_INVALID_CHAR_FOUND error
*      No  ->  Continue and find the character in the next code page
*
*
* TODO: Implement a priority technique where the users are allowed to set the priority of code pages
*/
1437
/*
 * Map 00..7F to Unicode according to JIS X 0201.
 * The mapping is the identity except for two cells:
 *   5C -> U+00A5 and 7E -> U+203E.
 * Any other value, including one above 7F, is returned unchanged; the
 * caller is responsible for passing only 00..7F.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}
1451
/*
 * Map Unicode to 00..7F according to JIS X 0201.
 * U+00A5 maps to 5C and U+203E maps to 7E; U+005C and U+007E themselves are
 * not representable. Every other code point up to U+007F maps to itself.
 * Return U+FFFE if unmappable.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    if(value == 0xa5) {
        return 0x5c;
    }
    if(value == 0x203e) {
        return 0x7e;
    }
    if(value <= 0x7f && value != 0x5c && value != 0x7e) {
        return value;
    }
    return 0xfffe;  /* unmappable */
}
1466
/*
 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
 * to JIS X 0208, and convert it to a pair of 21..7E bytes.
 *
 * value holds the pair as (lead << 8) | trail. The result uses the same
 * layout with both bytes in 21..7E.
 * Return 0 if the byte pair is out of range (above EFFC).
 */
static inline uint32_t
_2022FromSJIS(uint32_t value) {
    uint8_t trail;

    if(value > 0xEFFC) {
        return 0;  /* beyond JIS X 0208 */
    }

    trail = (uint8_t)value;

    /*
     * Each Shift-JIS lead byte covers two JIS rows: remove the lead byte
     * offset of its range, then double to get the second row of the pair.
     */
    value &= 0xff00;  /* lead byte */
    if(value <= 0x9f00) {
        value -= 0x7000;
    } else /* 0xe000 <= value <= 0xef00 */ {
        value -= 0xb000;
    }
    value <<= 1;

    if(trail <= 0x9e) {
        /* trail bytes up to 9E belong to the first row of the pair */
        value -= 0x100;
        if(trail <= 0x7e) {
            value |= trail - 0x1f;
        } else {
            /* one more is subtracted for trail bytes above 7E */
            value |= trail - 0x20;
        }
    } else /* trail <= 0xfc */ {
        /* second row of the pair */
        value |= trail - 0x7e;
    }
    return value;
}
1502
/*
 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
 * If either byte is outside 21..7E make sure that the result is not valid
 * for Shift-JIS so that the converter catches it.
 * Some invalid byte values already turn into equally invalid Shift-JIS
 * byte values and need not be tested explicitly.
 *
 * c1, c2: first and second JIS X 0208 byte.
 * bytes:  receives the Shift-JIS lead byte in bytes[0] and the trail byte in
 *         bytes[1]; a byte that was detected as invalid is written as 0.
 */
static inline void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    if(c1&1) {
        /* odd row: first half of the Shift-JIS trail byte range */
        ++c1;
        if(c2 <= 0x5f) {
            c2 += 0x1f;
        } else if(c2 <= 0x7e) {
            /* one more is added for second bytes above 5F */
            c2 += 0x20;
        } else {
            c2 = 0;  /* invalid */
        }
    } else {
        /* even row: second half of the trail byte range */
        if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {
            c2 += 0x7e;
        } else {
            c2 = 0;  /* invalid */
        }
    }
    /* two JIS rows share one Shift-JIS lead byte */
    c1 >>= 1;
    if(c1 <= 0x2f) {
        c1 += 0x70;
    } else if(c1 <= 0x3f) {
        c1 += 0xb0;
    } else {
        c1 = 0;  /* invalid */
    }
    bytes[0] = (char)c1;
    bytes[1] = (char)c2;
}
1539
1540/*
1541 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
1542 * Katakana.
1543 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
1544 * because Shift-JIS roundtrips half-width Katakana to single bytes.
1545 * These were the only fallbacks in ICU's jisx-208.ucm file.
1546 */
1547static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
1548    0x2123,  /* U+FF61 */
1549    0x2156,
1550    0x2157,
1551    0x2122,
1552    0x2126,
1553    0x2572,
1554    0x2521,
1555    0x2523,
1556    0x2525,
1557    0x2527,
1558    0x2529,
1559    0x2563,
1560    0x2565,
1561    0x2567,
1562    0x2543,
1563    0x213C,  /* U+FF70 */
1564    0x2522,
1565    0x2524,
1566    0x2526,
1567    0x2528,
1568    0x252A,
1569    0x252B,
1570    0x252D,
1571    0x252F,
1572    0x2531,
1573    0x2533,
1574    0x2535,
1575    0x2537,
1576    0x2539,
1577    0x253B,
1578    0x253D,
1579    0x253F,  /* U+FF80 */
1580    0x2541,
1581    0x2544,
1582    0x2546,
1583    0x2548,
1584    0x254A,
1585    0x254B,
1586    0x254C,
1587    0x254D,
1588    0x254E,
1589    0x254F,
1590    0x2552,
1591    0x2555,
1592    0x2558,
1593    0x255B,
1594    0x255E,
1595    0x255F,  /* U+FF90 */
1596    0x2560,
1597    0x2561,
1598    0x2562,
1599    0x2564,
1600    0x2566,
1601    0x2568,
1602    0x2569,
1603    0x256A,
1604    0x256B,
1605    0x256C,
1606    0x256D,
1607    0x256F,
1608    0x2573,
1609    0x212B,
1610    0x212C   /* U+FF9F */
1611};
1612
1613static void
1614UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
1615    UConverter *cnv = args->converter;
1616    UConverterDataISO2022 *converterData;
1617    ISO2022State *pFromU2022State;
1618    uint8_t *target = (uint8_t *) args->target;
1619    const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
1620    const UChar* source = args->source;
1621    const UChar* sourceLimit = args->sourceLimit;
1622    int32_t* offsets = args->offsets;
1623    UChar32 sourceChar;
1624    char buffer[8];
1625    int32_t len, outLen;
1626    int8_t choices[10];
1627    int32_t choiceCount;
1628    uint32_t targetValue = 0;
1629    UBool useFallback;
1630
1631    int32_t i;
1632    int8_t cs, g;
1633
1634    /* set up the state */
1635    converterData     = (UConverterDataISO2022*)cnv->extraInfo;
1636    pFromU2022State   = &converterData->fromU2022State;
1637
1638    choiceCount = 0;
1639
1640    /* check if the last codepoint of previous buffer was a lead surrogate*/
1641    if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
1642        goto getTrail;
1643    }
1644
1645    while(source < sourceLimit) {
1646        if(target < targetLimit) {
1647
1648            sourceChar  = *(source++);
1649            /*check if the char is a First surrogate*/
1650            if(U16_IS_SURROGATE(sourceChar)) {
1651                if(U16_IS_SURROGATE_LEAD(sourceChar)) {
1652getTrail:
1653                    /*look ahead to find the trail surrogate*/
1654                    if(source < sourceLimit) {
1655                        /* test the following code unit */
1656                        UChar trail=(UChar) *source;
1657                        if(U16_IS_TRAIL(trail)) {
1658                            source++;
1659                            sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
1660                            cnv->fromUChar32=0x00;
1661                            /* convert this supplementary code point */
1662                            /* exit this condition tree */
1663                        } else {
1664                            /* this is an unmatched lead code unit (1st surrogate) */
1665                            /* callback(illegal) */
1666                            *err=U_ILLEGAL_CHAR_FOUND;
1667                            cnv->fromUChar32=sourceChar;
1668                            break;
1669                        }
1670                    } else {
1671                        /* no more input */
1672                        cnv->fromUChar32=sourceChar;
1673                        break;
1674                    }
1675                } else {
1676                    /* this is an unmatched trail code unit (2nd surrogate) */
1677                    /* callback(illegal) */
1678                    *err=U_ILLEGAL_CHAR_FOUND;
1679                    cnv->fromUChar32=sourceChar;
1680                    break;
1681                }
1682            }
1683
1684            /* do not convert SO/SI/ESC */
1685            if(IS_2022_CONTROL(sourceChar)) {
1686                /* callback(illegal) */
1687                *err=U_ILLEGAL_CHAR_FOUND;
1688                cnv->fromUChar32=sourceChar;
1689                break;
1690            }
1691
1692            /* do the conversion */
1693
1694            if(choiceCount == 0) {
1695                uint16_t csm;
1696
1697                /*
1698                 * The csm variable keeps track of which charsets are allowed
1699                 * and not used yet while building the choices[].
1700                 */
1701                csm = jpCharsetMasks[converterData->version];
1702                choiceCount = 0;
1703
1704                /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1705                if(converterData->version == 3 || converterData->version == 4) {
1706                    choices[choiceCount++] = (int8_t)HWKANA_7BIT;
1707                }
1708                /* Do not try single-byte half-width Katakana for other versions. */
1709                csm &= ~CSM(HWKANA_7BIT);
1710
1711                /* try the current G0 charset */
1712                choices[choiceCount++] = cs = pFromU2022State->cs[0];
1713                csm &= ~CSM(cs);
1714
1715                /* try the current G2 charset */
1716                if((cs = pFromU2022State->cs[2]) != 0) {
1717                    choices[choiceCount++] = cs;
1718                    csm &= ~CSM(cs);
1719                }
1720
1721                /* try all the other possible charsets */
1722                for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) {
1723                    cs = (int8_t)jpCharsetPref[i];
1724                    if(CSM(cs) & csm) {
1725                        choices[choiceCount++] = cs;
1726                        csm &= ~CSM(cs);
1727                    }
1728                }
1729            }
1730
1731            cs = g = 0;
1732            /*
1733             * len==0: no mapping found yet
1734             * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1735             * len>0: found a roundtrip result, done
1736             */
1737            len = 0;
1738            /*
1739             * We will turn off useFallback after finding a fallback,
1740             * but we still get fallbacks from PUA code points as usual.
1741             * Therefore, we will also need to check that we don't overwrite
1742             * an early fallback with a later one.
1743             */
1744            useFallback = cnv->useFallback;
1745
1746            for(i = 0; i < choiceCount && len <= 0; ++i) {
1747                uint32_t value;
1748                int32_t len2;
1749                int8_t cs0 = choices[i];
1750                switch(cs0) {
1751                case ASCII:
1752                    if(sourceChar <= 0x7f) {
1753                        targetValue = (uint32_t)sourceChar;
1754                        len = 1;
1755                        cs = cs0;
1756                        g = 0;
1757                    }
1758                    break;
1759                case ISO8859_1:
1760                    if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
1761                        targetValue = (uint32_t)sourceChar - 0x80;
1762                        len = 1;
1763                        cs = cs0;
1764                        g = 2;
1765                    }
1766                    break;
1767                case HWKANA_7BIT:
1768                    if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1769                        if(converterData->version==3) {
1770                            /* JIS7: use G1 (SO) */
1771                            /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1772                            targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
1773                            len = 1;
1774                            pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
1775                            g = 1;
1776                        } else if(converterData->version==4) {
1777                            /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1778                            /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1779                            targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
1780                            len = 1;
1781
1782                            cs = pFromU2022State->cs[0];
1783                            if(IS_JP_DBCS(cs)) {
1784                                /* switch from a DBCS charset to JISX201 */
1785                                cs = (int8_t)JISX201;
1786                            }
1787                            /* else stay in the current G0 charset */
1788                            g = 0;
1789                        }
1790                        /* else do not use HWKANA_7BIT with other versions */
1791                    }
1792                    break;
1793                case JISX201:
1794                    /* G0 SBCS */
1795                    value = jisx201FromU(sourceChar);
1796                    if(value <= 0x7f) {
1797                        targetValue = value;
1798                        len = 1;
1799                        cs = cs0;
1800                        g = 0;
1801                        useFallback = FALSE;
1802                    }
1803                    break;
1804                case JISX208:
1805                    /* G0 DBCS from Shift-JIS table */
1806                    len2 = MBCS_FROM_UCHAR32_ISO2022(
1807                                converterData->myConverterArray[cs0],
1808                                sourceChar, &value,
1809                                useFallback, MBCS_OUTPUT_2);
1810                    if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
1811                        value = _2022FromSJIS(value);
1812                        if(value != 0) {
1813                            targetValue = value;
1814                            len = len2;
1815                            cs = cs0;
1816                            g = 0;
1817                            useFallback = FALSE;
1818                        }
1819                    } else if(len == 0 && useFallback &&
1820                              (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1821                        targetValue = hwkana_fb[sourceChar - HWKANA_START];
1822                        len = -2;
1823                        cs = cs0;
1824                        g = 0;
1825                        useFallback = FALSE;
1826                    }
1827                    break;
1828                case ISO8859_7:
1829                    /* G0 SBCS forced to 7-bit output */
1830                    len2 = MBCS_SINGLE_FROM_UCHAR32(
1831                                converterData->myConverterArray[cs0],
1832                                sourceChar, &value,
1833                                useFallback);
1834                    if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
1835                        targetValue = value - 0x80;
1836                        len = len2;
1837                        cs = cs0;
1838                        g = 2;
1839                        useFallback = FALSE;
1840                    }
1841                    break;
1842                default:
1843                    /* G0 DBCS */
1844                    len2 = MBCS_FROM_UCHAR32_ISO2022(
1845                                converterData->myConverterArray[cs0],
1846                                sourceChar, &value,
1847                                useFallback, MBCS_OUTPUT_2);
1848                    if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
1849                        if(cs0 == KSC5601) {
1850                            /*
1851                             * Check for valid bytes for the encoding scheme.
1852                             * This is necessary because the sub-converter (windows-949)
1853                             * has a broader encoding scheme than is valid for 2022.
1854                             */
1855                            value = _2022FromGR94DBCS(value);
1856                            if(value == 0) {
1857                                break;
1858                            }
1859                        }
1860                        targetValue = value;
1861                        len = len2;
1862                        cs = cs0;
1863                        g = 0;
1864                        useFallback = FALSE;
1865                    }
1866                    break;
1867                }
1868            }
1869
1870            if(len != 0) {
1871                if(len < 0) {
1872                    len = -len;  /* fallback */
1873                }
1874                outLen = 0; /* count output bytes */
1875
1876                /* write SI if necessary (only for JIS7) */
1877                if(pFromU2022State->g == 1 && g == 0) {
1878                    buffer[outLen++] = UCNV_SI;
1879                    pFromU2022State->g = 0;
1880                }
1881
1882                /* write the designation sequence if necessary */
1883                if(cs != pFromU2022State->cs[g]) {
1884                    int32_t escLen = escSeqCharsLen[cs];
1885                    uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1886                    outLen += escLen;
1887                    pFromU2022State->cs[g] = cs;
1888
1889                    /* invalidate the choices[] */
1890                    choiceCount = 0;
1891                }
1892
1893                /* write the shift sequence if necessary */
1894                if(g != pFromU2022State->g) {
1895                    switch(g) {
1896                    /* case 0 handled before writing escapes */
1897                    case 1:
1898                        buffer[outLen++] = UCNV_SO;
1899                        pFromU2022State->g = 1;
1900                        break;
1901                    default: /* case 2 */
1902                        buffer[outLen++] = 0x1b;
1903                        buffer[outLen++] = 0x4e;
1904                        break;
1905                    /* no case 3: no SS3 in ISO-2022-JP-x */
1906                    }
1907                }
1908
1909                /* write the output bytes */
1910                if(len == 1) {
1911                    buffer[outLen++] = (char)targetValue;
1912                } else /* len == 2 */ {
1913                    buffer[outLen++] = (char)(targetValue >> 8);
1914                    buffer[outLen++] = (char)targetValue;
1915                }
1916            } else {
1917                /*
1918                 * if we cannot find the character after checking all codepages
1919                 * then this is an error
1920                 */
1921                *err = U_INVALID_CHAR_FOUND;
1922                cnv->fromUChar32=sourceChar;
1923                break;
1924            }
1925
1926            if(sourceChar == CR || sourceChar == LF) {
1927                /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1928                pFromU2022State->cs[2] = 0;
1929                choiceCount = 0;
1930            }
1931
1932            /* output outLen>0 bytes in buffer[] */
1933            if(outLen == 1) {
1934                *target++ = buffer[0];
1935                if(offsets) {
1936                    *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
1937                }
1938            } else if(outLen == 2 && (target + 2) <= targetLimit) {
1939                *target++ = buffer[0];
1940                *target++ = buffer[1];
1941                if(offsets) {
1942                    int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1943                    *offsets++ = sourceIndex;
1944                    *offsets++ = sourceIndex;
1945                }
1946            } else {
1947                fromUWriteUInt8(
1948                    cnv,
1949                    buffer, outLen,
1950                    &target, (const char *)targetLimit,
1951                    &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
1952                    err);
1953                if(U_FAILURE(*err)) {
1954                    break;
1955                }
1956            }
1957        } /* end if(myTargetIndex<myTargetLength) */
1958        else{
1959            *err =U_BUFFER_OVERFLOW_ERROR;
1960            break;
1961        }
1962
1963    }/* end while(mySourceIndex<mySourceLength) */
1964
1965    /*
1966     * the end of the input stream and detection of truncated input
1967     * are handled by the framework, but for ISO-2022-JP conversion
1968     * we need to be in ASCII mode at the very end
1969     *
1970     * conditions:
1971     *   successful
1972     *   in SO mode or not in ASCII mode
1973     *   end of input and no truncated input
1974     */
1975    if( U_SUCCESS(*err) &&
1976        (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
1977        args->flush && source>=sourceLimit && cnv->fromUChar32==0
1978    ) {
1979        int32_t sourceIndex;
1980
1981        outLen = 0;
1982
1983        if(pFromU2022State->g != 0) {
1984            buffer[outLen++] = UCNV_SI;
1985            pFromU2022State->g = 0;
1986        }
1987
1988        if(pFromU2022State->cs[0] != ASCII) {
1989            int32_t escLen = escSeqCharsLen[ASCII];
1990            uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
1991            outLen += escLen;
1992            pFromU2022State->cs[0] = (int8_t)ASCII;
1993        }
1994
1995        /* get the source index of the last input character */
1996        /*
1997         * TODO this would be simpler and more reliable if we used a pair
1998         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
1999         * so that we could simply use the prevSourceIndex here;
2000         * this code gives an incorrect result for the rare case of an unmatched
2001         * trail surrogate that is alone in the last buffer of the text stream
2002         */
2003        sourceIndex=(int32_t)(source-args->source);
2004        if(sourceIndex>0) {
2005            --sourceIndex;
2006            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2007                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2008            ) {
2009                --sourceIndex;
2010            }
2011        } else {
2012            sourceIndex=-1;
2013        }
2014
2015        fromUWriteUInt8(
2016            cnv,
2017            buffer, outLen,
2018            &target, (const char *)targetLimit,
2019            &offsets, sourceIndex,
2020            err);
2021    }
2022
2023    /*save the state and return */
2024    args->source = source;
2025    args->target = (char*)target;
2026}
2027
2028/*************** to unicode *******************/
2029
2030static void
2031UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2032                                               UErrorCode* err){
2033    char tempBuf[2];
2034    const char *mySource = (char *) args->source;
2035    UChar *myTarget = args->target;
2036    const char *mySourceLimit = args->sourceLimit;
2037    uint32_t targetUniChar = 0x0000;
2038    uint32_t mySourceChar = 0x0000;
2039    uint32_t tmpSourceChar = 0x0000;
2040    UConverterDataISO2022* myData;
2041    ISO2022State *pToU2022State;
2042    StateEnum cs;
2043
2044    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2045    pToU2022State = &myData->toU2022State;
2046
2047    if(myData->key != 0) {
2048        /* continue with a partial escape sequence */
2049        goto escape;
2050    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2051        /* continue with a partial double-byte character */
2052        mySourceChar = args->converter->toUBytes[0];
2053        args->converter->toULength = 0;
2054        cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2055        targetUniChar = missingCharMarker;
2056        goto getTrailByte;
2057    }
2058
2059    while(mySource < mySourceLimit){
2060
2061        targetUniChar =missingCharMarker;
2062
2063        if(myTarget < args->targetLimit){
2064
2065            mySourceChar= (unsigned char) *mySource++;
2066
2067            switch(mySourceChar) {
2068            case UCNV_SI:
2069                if(myData->version==3) {
2070                    pToU2022State->g=0;
2071                    continue;
2072                } else {
2073                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2074                    myData->isEmptySegment = FALSE;	/* reset this, we have a different error */
2075                    break;
2076                }
2077
2078            case UCNV_SO:
2079                if(myData->version==3) {
2080                    /* JIS7: switch to G1 half-width Katakana */
2081                    pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
2082                    pToU2022State->g=1;
2083                    continue;
2084                } else {
2085                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2086                    myData->isEmptySegment = FALSE;	/* reset this, we have a different error */
2087                    break;
2088                }
2089
2090            case ESC_2022:
2091                mySource--;
2092escape:
2093                {
2094                    const char * mySourceBefore = mySource;
2095                    int8_t toULengthBefore = args->converter->toULength;
2096
2097                    changeState_2022(args->converter,&(mySource),
2098                        mySourceLimit, ISO_2022_JP,err);
2099
2100                    /* If in ISO-2022-JP only and we successully completed an escape sequence, but previous segment was empty, create an error */
2101                    if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
2102                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2103                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
2104                        args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
2105                    }
2106                }
2107
2108                /* invalid or illegal escape sequence */
2109                if(U_FAILURE(*err)){
2110                    args->target = myTarget;
2111                    args->source = mySource;
2112                    myData->isEmptySegment = FALSE;	/* Reset to avoid future spurious errors */
2113                    return;
2114                }
2115                /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
2116                if(myData->key==0) {
2117                    myData->isEmptySegment = TRUE;
2118                }
2119                continue;
2120
2121            /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
2122
2123            case CR:
2124                /*falls through*/
2125            case LF:
2126                /* automatically reset to single-byte mode */
2127                if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
2128                    pToU2022State->cs[0] = (int8_t)ASCII;
2129                }
2130                pToU2022State->cs[2] = 0;
2131                pToU2022State->g = 0;
2132                /* falls through */
2133            default:
2134                /* convert one or two bytes */
2135                myData->isEmptySegment = FALSE;
2136                cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2137                if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
2138                    !IS_JP_DBCS(cs)
2139                ) {
2140                    /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
2141                    targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
2142
2143                    /* return from a single-shift state to the previous one */
2144                    if(pToU2022State->g >= 2) {
2145                        pToU2022State->g=pToU2022State->prevG;
2146                    }
2147                } else switch(cs) {
2148                case ASCII:
2149                    if(mySourceChar <= 0x7f) {
2150                        targetUniChar = mySourceChar;
2151                    }
2152                    break;
2153                case ISO8859_1:
2154                    if(mySourceChar <= 0x7f) {
2155                        targetUniChar = mySourceChar + 0x80;
2156                    }
2157                    /* return from a single-shift state to the previous one */
2158                    pToU2022State->g=pToU2022State->prevG;
2159                    break;
2160                case ISO8859_7:
2161                    if(mySourceChar <= 0x7f) {
2162                        /* convert mySourceChar+0x80 to use a normal 8-bit table */
2163                        targetUniChar =
2164                            _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2165                                myData->myConverterArray[cs],
2166                                mySourceChar + 0x80);
2167                    }
2168                    /* return from a single-shift state to the previous one */
2169                    pToU2022State->g=pToU2022State->prevG;
2170                    break;
2171                case JISX201:
2172                    if(mySourceChar <= 0x7f) {
2173                        targetUniChar = jisx201ToU(mySourceChar);
2174                    }
2175                    break;
2176                case HWKANA_7BIT:
2177                    if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
2178                        /* 7-bit halfwidth Katakana */
2179                        targetUniChar = mySourceChar + (HWKANA_START - 0x21);
2180                    }
2181                    break;
2182                default:
2183                    /* G0 DBCS */
2184                    if(mySource < mySourceLimit) {
2185                        int leadIsOk, trailIsOk;
2186                        uint8_t trailByte;
2187getTrailByte:
2188                        trailByte = (uint8_t)*mySource;
2189                        /*
2190                         * Ticket 5691: consistent illegal sequences:
2191                         * - We include at least the first byte in the illegal sequence.
2192                         * - If any of the non-initial bytes could be the start of a character,
2193                         *   we stop the illegal sequence before the first one of those.
2194                         *
2195                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2196                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2197                         * Otherwise we convert or report the pair of bytes.
2198                         */
2199                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2200                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2201                        if (leadIsOk && trailIsOk) {
2202                            ++mySource;
2203                            tmpSourceChar = (mySourceChar << 8) | trailByte;
2204                            if(cs == JISX208) {
2205                                _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
2206                                mySourceChar = tmpSourceChar;
2207                            } else {
2208                                /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2209                                mySourceChar = tmpSourceChar;
2210                                if (cs == KSC5601) {
2211                                    tmpSourceChar += 0x8080;  /* = _2022ToGR94DBCS(tmpSourceChar) */
2212                                }
2213                                tempBuf[0] = (char)(tmpSourceChar >> 8);
2214                                tempBuf[1] = (char)(tmpSourceChar);
2215                            }
2216                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
2217                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2218                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2219                            ++mySource;
2220                            /* add another bit so that the code below writes 2 bytes in case of error */
2221                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2222                        }
2223                    } else {
2224                        args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2225                        args->converter->toULength = 1;
2226                        goto endloop;
2227                    }
2228                }  /* End of inner switch */
2229                break;
2230            }  /* End of outer switch */
2231            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2232                if(args->offsets){
2233                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2234                }
2235                *(myTarget++)=(UChar)targetUniChar;
2236            }
2237            else if(targetUniChar > missingCharMarker){
2238                /* disassemble the surrogate pair and write to output*/
2239                targetUniChar-=0x0010000;
2240                *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
2241                if(args->offsets){
2242                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2243                }
2244                ++myTarget;
2245                if(myTarget< args->targetLimit){
2246                    *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2247                    if(args->offsets){
2248                        args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2249                    }
2250                    ++myTarget;
2251                }else{
2252                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2253                                    (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2254                }
2255
2256            }
2257            else{
2258                /* Call the callback function*/
2259                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2260                break;
2261            }
2262        }
2263        else{    /* goes with "if(myTarget < args->targetLimit)"  way up near top of function */
2264            *err =U_BUFFER_OVERFLOW_ERROR;
2265            break;
2266        }
2267    }
2268endloop:
2269    args->target = myTarget;
2270    args->source = mySource;
2271}
2272
2273
2274/***************************************************************
2275*   Rules for ISO-2022-KR encoding
*   i) The KSC5601 designator sequence should appear only once in a file,
*      at the beginning of a line before any KSC5601 characters. This usually
*      means that it appears by itself on the first line of the file.
*  ii) There are only 2 shifting sequences: SO to shift into double-byte mode
*      and SI to shift into single-byte mode.
2281*/
2282static void
2283UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2284
2285    UConverter* saveConv = args->converter;
2286    UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
2287    args->converter=myConverterData->currentConverter;
2288
2289    myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2290    ucnv_MBCSFromUnicodeWithOffsets(args,err);
2291    saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2292
2293    if(*err == U_BUFFER_OVERFLOW_ERROR) {
2294        if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2295            uprv_memcpy(
2296                saveConv->charErrorBuffer,
2297                myConverterData->currentConverter->charErrorBuffer,
2298                myConverterData->currentConverter->charErrorBufferLength);
2299        }
2300        saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2301        myConverterData->currentConverter->charErrorBufferLength = 0;
2302    }
2303    args->converter=saveConv;
2304}
2305
/*
 * Converts UTF-16 input to ISO-2022-KR bytes, optionally recording for each
 * output byte the index of the source code unit it came from.
 *
 * args  source/sourceLimit, target/targetLimit, offsets (may be NULL), flush and
 *       the converter; args->source and args->target are advanced on return.
 * err   receives U_BUFFER_OVERFLOW_ERROR, U_ILLEGAL_CHAR_FOUND or
 *       U_INVALID_CHAR_FOUND as described below.
 *
 * State kept on the converter between calls:
 *   fromUnicodeStatus  non-zero while the output is in double-byte (SO) mode
 *   fromUChar32        a code unit/point left over from the previous call or
 *                      the one that caused the current error
 *
 * For converter data version 1 the work is delegated to
 * UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM().
 */
static void
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){

    const UChar *source = args->source;
    const UChar *sourceLimit = args->sourceLimit;
    unsigned char *target = (unsigned char *) args->target;
    unsigned char *targetLimit = (unsigned char *) args->targetLimit;
    int32_t* offsets = args->offsets;
    uint32_t targetByteUnit = 0x0000;
    UChar32 sourceChar = 0x0000;
    UBool isTargetByteDBCS;
    UBool oldIsTargetByteDBCS;
    UConverterDataISO2022 *converterData;
    UConverterSharedData* sharedData;
    UBool useFallback;
    int32_t length =0;

    converterData=(UConverterDataISO2022*)args->converter->extraInfo;
    /* if the version is 1 then the user is requesting
     * conversion with ibm-25546 pass the arguments to
     * MBCS converter and return
     */
    if(converterData->version==1){
        UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
        return;
    }

    /* initialize data */
    sharedData = converterData->currentConverter->sharedData;
    useFallback = args->converter->useFallback;
    isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
    oldIsTargetByteDBCS = isTargetByteDBCS;

    /* NOTE(review): repeats the assignment made two statements above */
    isTargetByteDBCS   = (UBool) args->converter->fromUnicodeStatus;
    /* a code unit is pending from the previous call: resume at the trail-surrogate lookup */
    if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
        goto getTrail;
    }
    while(source < sourceLimit){

        targetByteUnit = missingCharMarker;

        /* at least one byte of room is required before a code unit is consumed */
        if(target < (unsigned char*) args->targetLimit){
            sourceChar = *source++;

            /* do not convert SO/SI/ESC */
            if(IS_2022_CONTROL(sourceChar)) {
                /* callback(illegal) */
                *err=U_ILLEGAL_CHAR_FOUND;
                args->converter->fromUChar32=sourceChar;
                break;
            }

            length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
            if(length < 0) {
                length = -length;  /* fallback */
            }
            /* only DBCS or SBCS characters are expected*/
            /* DB characters with high bit set to 1 are expected */
            /*
             * accepted results: one byte <=0x7f, or two bytes that are each in 0xa1..0xfe;
             * anything else is treated as unassigned
             */
            if( length > 2 || length==0 ||
                (length == 1 && targetByteUnit > 0x7f) ||
                (length == 2 &&
                    ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
                    (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
            ) {
                targetByteUnit=missingCharMarker;
            }
            if (targetByteUnit != missingCharMarker){

                oldIsTargetByteDBCS = isTargetByteDBCS;
                isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
                  /* append the shift sequence */
                /*
                 * the shift byte is written without a bounds check of its own;
                 * the enclosing target<limit test guarantees room for this one byte
                 */
                if (oldIsTargetByteDBCS != isTargetByteDBCS ){

                    if (isTargetByteDBCS)
                        *target++ = UCNV_SO;
                    else
                        *target++ = UCNV_SI;
                    if(offsets)
                        *(offsets++) = (int32_t)(source - args->source-1);
                }
                /* write the converted byte(s) to the target; bytes that do not fit go to charErrorBuffer */
                if(targetByteUnit <= 0x00FF){
                    if( target < targetLimit){
                        *(target++) = (unsigned char) targetByteUnit;
                        if(offsets){
                            *(offsets++) = (int32_t)(source - args->source-1);
                        }

                    }else{
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
                        *err = U_BUFFER_OVERFLOW_ERROR;
                    }
                }else{
                    /* double-byte output: 0x80 is subtracted from each byte to get the 7-bit form */
                    if(target < targetLimit){
                        *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
                        if(offsets){
                            *(offsets++) = (int32_t)(source - args->source-1);
                        }
                        if(target < targetLimit){
                            *(target++) =(unsigned char) (targetByteUnit -0x80);
                            if(offsets){
                                *(offsets++) = (int32_t)(source - args->source-1);
                            }
                        }else{
                            args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
                            *err = U_BUFFER_OVERFLOW_ERROR;
                        }
                    }else{
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
                        *err = U_BUFFER_OVERFLOW_ERROR;
                    }
                }

            }
            else{
                /* oops.. the code point is unassigned
                 * set the error and reason
                 */

                /*check if the char is a First surrogate*/
                if(U16_IS_SURROGATE(sourceChar)) {
                    if(U16_IS_SURROGATE_LEAD(sourceChar)) {
getTrail:
                        /*look ahead to find the trail surrogate*/
                        if(source <  sourceLimit) {
                            /* test the following code unit */
                            UChar trail=(UChar) *source;
                            if(U16_IS_TRAIL(trail)) {
                                source++;
                                sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
                                /* the combined supplementary code point is reported as unassigned */
                                *err = U_INVALID_CHAR_FOUND;
                                /* convert this surrogate code point */
                                /* exit this condition tree */
                            } else {
                                /* this is an unmatched lead code unit (1st surrogate) */
                                /* callback(illegal) */
                                *err=U_ILLEGAL_CHAR_FOUND;
                            }
                        } else {
                            /* no more input */
                            /* not an error here: the lead surrogate is kept in fromUChar32 below */
                            *err = U_ZERO_ERROR;
                        }
                    } else {
                        /* this is an unmatched trail code unit (2nd surrogate) */
                        /* callback(illegal) */
                        *err=U_ILLEGAL_CHAR_FOUND;
                    }
                } else {
                    /* callback(unassigned) for a BMP code point */
                    *err = U_INVALID_CHAR_FOUND;
                }

                /* remember the offending (or pending) code point and stop converting */
                args->converter->fromUChar32=sourceChar;
                break;
            }
        } /* end if(target < targetLimit) */
        else{
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }

    }/* end while(source < sourceLimit) */

    /*
     * the end of the input stream and detection of truncated input
     * are handled by the framework, but for ISO-2022-KR conversion
     * we need to be in ASCII mode at the very end
     *
     * conditions:
     *   successful
     *   not in ASCII mode
     *   end of input and no truncated input
     */
    if( U_SUCCESS(*err) &&
        isTargetByteDBCS &&
        args->flush && source>=sourceLimit && args->converter->fromUChar32==0
    ) {
        int32_t sourceIndex;

        /* we are switching to ASCII */
        isTargetByteDBCS=FALSE;

        /* get the source index of the last input character */
        /*
         * TODO this would be simpler and more reliable if we used a pair
         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
         * so that we could simply use the prevSourceIndex here;
         * this code gives an incorrect result for the rare case of an unmatched
         * trail surrogate that is alone in the last buffer of the text stream
         */
        sourceIndex=(int32_t)(source-args->source);
        if(sourceIndex>0) {
            --sourceIndex;
            /* step back once more when the last code unit looks like the trail of a surrogate pair */
            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
            ) {
                --sourceIndex;
            }
        } else {
            sourceIndex=-1;
        }

        /* write the final shift-in, with sourceIndex as its offset */
        fromUWriteUInt8(
            args->converter,
            SHIFT_IN_STR, 1,
            &target, (const char *)targetLimit,
            &offsets, sourceIndex,
            err);
    }

    /*save the state and return */
    args->source = source;
    args->target = (char*)target;
    args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
}
2522
2523/************************ To Unicode ***************************************/
2524
2525static void
2526UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2527                                                            UErrorCode* err){
2528    char const* sourceStart;
2529    UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2530
2531    UConverterToUnicodeArgs subArgs;
2532    int32_t minArgsSize;
2533
2534    /* set up the subconverter arguments */
2535    if(args->size<sizeof(UConverterToUnicodeArgs)) {
2536        minArgsSize = args->size;
2537    } else {
2538        minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2539    }
2540
2541    uprv_memcpy(&subArgs, args, minArgsSize);
2542    subArgs.size = (uint16_t)minArgsSize;
2543    subArgs.converter = myData->currentConverter;
2544
2545    /* remember the original start of the input for offsets */
2546    sourceStart = args->source;
2547
2548    if(myData->key != 0) {
2549        /* continue with a partial escape sequence */
2550        goto escape;
2551    }
2552
2553    while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2554        /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2555        subArgs.source = args->source;
2556        subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2557        if(subArgs.source != subArgs.sourceLimit) {
2558            /*
2559             * get the current partial byte sequence
2560             *
2561             * it needs to be moved between the public and the subconverter
2562             * so that the conversion framework, which only sees the public
2563             * converter, can handle truncated and illegal input etc.
2564             */
2565            if(args->converter->toULength > 0) {
2566                uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2567            }
2568            subArgs.converter->toULength = args->converter->toULength;
2569
2570            /*
2571             * Convert up to the end of the input, or to before the next escape character.
2572             * Does not handle conversion extensions because the preToU[] state etc.
2573             * is not copied.
2574             */
2575            ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2576
2577            if(args->offsets != NULL && sourceStart != args->source) {
2578                /* update offsets to base them on the actual start of the input */
2579                int32_t *offsets = args->offsets;
2580                UChar *target = args->target;
2581                int32_t delta = (int32_t)(args->source - sourceStart);
2582                while(target < subArgs.target) {
2583                    if(*offsets >= 0) {
2584                        *offsets += delta;
2585                    }
2586                    ++offsets;
2587                    ++target;
2588                }
2589            }
2590            args->source = subArgs.source;
2591            args->target = subArgs.target;
2592            args->offsets = subArgs.offsets;
2593
2594            /* copy input/error/overflow buffers */
2595            if(subArgs.converter->toULength > 0) {
2596                uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2597            }
2598            args->converter->toULength = subArgs.converter->toULength;
2599
2600            if(*err == U_BUFFER_OVERFLOW_ERROR) {
2601                if(subArgs.converter->UCharErrorBufferLength > 0) {
2602                    uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2603                                subArgs.converter->UCharErrorBufferLength);
2604                }
2605                args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2606                subArgs.converter->UCharErrorBufferLength = 0;
2607            }
2608        }
2609
2610        if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
2611            return;
2612        }
2613
2614escape:
2615        changeState_2022(args->converter,
2616               &(args->source),
2617               args->sourceLimit,
2618               ISO_2022_KR,
2619               err);
2620    }
2621}
2622
2623static void
2624UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2625                                                            UErrorCode* err){
2626    char tempBuf[2];
2627    const char *mySource = ( char *) args->source;
2628    UChar *myTarget = args->target;
2629    const char *mySourceLimit = args->sourceLimit;
2630    UChar32 targetUniChar = 0x0000;
2631    UChar mySourceChar = 0x0000;
2632    UConverterDataISO2022* myData;
2633    UConverterSharedData* sharedData ;
2634    UBool useFallback;
2635
2636    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2637    if(myData->version==1){
2638        UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2639        return;
2640    }
2641
2642    /* initialize state */
2643    sharedData = myData->currentConverter->sharedData;
2644    useFallback = args->converter->useFallback;
2645
2646    if(myData->key != 0) {
2647        /* continue with a partial escape sequence */
2648        goto escape;
2649    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2650        /* continue with a partial double-byte character */
2651        mySourceChar = args->converter->toUBytes[0];
2652        args->converter->toULength = 0;
2653        goto getTrailByte;
2654    }
2655
2656    while(mySource< mySourceLimit){
2657
2658        if(myTarget < args->targetLimit){
2659
2660            mySourceChar= (unsigned char) *mySource++;
2661
2662            if(mySourceChar==UCNV_SI){
2663                myData->toU2022State.g = 0;
2664                if (myData->isEmptySegment) {
2665                    myData->isEmptySegment = FALSE;	/* we are handling it, reset to avoid future spurious errors */
2666                    *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2667                    args->converter->toUCallbackReason = UCNV_IRREGULAR;
2668                    args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2669                    args->converter->toULength = 1;
2670                    args->target = myTarget;
2671                    args->source = mySource;
2672                    return;
2673                }
2674                /*consume the source */
2675                continue;
2676            }else if(mySourceChar==UCNV_SO){
2677                myData->toU2022State.g = 1;
2678                myData->isEmptySegment = TRUE;	/* Begin a new segment, empty so far */
2679                /*consume the source */
2680                continue;
2681            }else if(mySourceChar==ESC_2022){
2682                mySource--;
2683escape:
2684                myData->isEmptySegment = FALSE;	/* Any invalid ESC sequences will be detected separately, so just reset this */
2685                changeState_2022(args->converter,&(mySource),
2686                                mySourceLimit, ISO_2022_KR, err);
2687                if(U_FAILURE(*err)){
2688                    args->target = myTarget;
2689                    args->source = mySource;
2690                    return;
2691                }
2692                continue;
2693            }
2694
2695            myData->isEmptySegment = FALSE;	/* Any invalid char errors will be detected separately, so just reset this */
2696            if(myData->toU2022State.g == 1) {
2697                if(mySource < mySourceLimit) {
2698                    int leadIsOk, trailIsOk;
2699                    uint8_t trailByte;
2700getTrailByte:
2701                    targetUniChar = missingCharMarker;
2702                    trailByte = (uint8_t)*mySource;
2703                    /*
2704                     * Ticket 5691: consistent illegal sequences:
2705                     * - We include at least the first byte in the illegal sequence.
2706                     * - If any of the non-initial bytes could be the start of a character,
2707                     *   we stop the illegal sequence before the first one of those.
2708                     *
2709                     * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2710                     * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2711                     * Otherwise we convert or report the pair of bytes.
2712                     */
2713                    leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2714                    trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2715                    if (leadIsOk && trailIsOk) {
2716                        ++mySource;
2717                        tempBuf[0] = (char)(mySourceChar + 0x80);
2718                        tempBuf[1] = (char)(trailByte + 0x80);
2719                        targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2720                        mySourceChar = (mySourceChar << 8) | trailByte;
2721                    } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2722                        /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2723                        ++mySource;
2724                        /* add another bit so that the code below writes 2 bytes in case of error */
2725                        mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2726                    }
2727                } else {
2728                    args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2729                    args->converter->toULength = 1;
2730                    break;
2731                }
2732            }
2733            else if(mySourceChar <= 0x7f) {
2734                targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2735            } else {
2736                targetUniChar = 0xffff;
2737            }
2738            if(targetUniChar < 0xfffe){
2739                if(args->offsets) {
2740                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2741                }
2742                *(myTarget++)=(UChar)targetUniChar;
2743            }
2744            else {
2745                /* Call the callback function*/
2746                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2747                break;
2748            }
2749        }
2750        else{
2751            *err =U_BUFFER_OVERFLOW_ERROR;
2752            break;
2753        }
2754    }
2755    args->target = myTarget;
2756    args->source = mySource;
2757}
2758
2759/*************************** END ISO2022-KR *********************************/
2760
2761/*************************** ISO-2022-CN *********************************
2762*
2763* Rules for ISO-2022-CN Encoding:
2764* i)   The designator sequence must appear once on a line before any instance
2765*      of character set it designates.
2766* ii)  If two lines contain characters from the same character set, both lines
2767*      must include the designator sequence.
* iii) Once the designator sequence is known, a shifting sequence has to be found
*      to invoke the designated character set.
2770* iv)  All lines start in ASCII and end in ASCII.
2771* v)   Four shifting sequences are employed for this purpose:
2772*
*      Sequence    ASCII Eq    Charsets
2774*      ----------  -------    ---------
2775*      SI           <SI>        US-ASCII
2776*      SO           <SO>        CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2777*      SS2          <ESC>N      CNS-11643-1992 Plane 2
2778*      SS3          <ESC>O      CNS-11643-1992 Planes 3-7
2779*
2780* vi)
2781*      SOdesignator  : ESC "$" ")" finalchar_for_SO
2782*      SS2designator : ESC "$" "*" finalchar_for_SS2
2783*      SS3designator : ESC "$" "+" finalchar_for_SS3
2784*
2785*      ESC $ ) A       Indicates the bytes following SO are Chinese
2786*       characters as defined in GB 2312-80, until
2787*       another SOdesignation appears
2788*
2789*
2790*      ESC $ ) E       Indicates the bytes following SO are as defined
2791*       in ISO-IR-165 (for details, see section 2.1),
2792*       until another SOdesignation appears
2793*
2794*      ESC $ ) G       Indicates the bytes following SO are as defined
2795*       in CNS 11643-plane-1, until another
2796*       SOdesignation appears
2797*
2798*      ESC $ * H       Indicates the two bytes immediately following
2799*       SS2 is a Chinese character as defined in CNS
2800*       11643-plane-2, until another SS2designation
2801*       appears
*       (Meaning <ESC>N must precede every 2-byte
*        sequence.)
2804*
2805*      ESC $ + I       Indicates the immediate two bytes following SS3
2806*       is a Chinese character as defined in CNS
2807*       11643-plane-3, until another SS3designation
2808*       appears
*       (Meaning <ESC>O must precede every 2-byte
*        sequence.)
2811*
2812*      ESC $ + J       Indicates the immediate two bytes following SS3
2813*       is a Chinese character as defined in CNS
2814*       11643-plane-4, until another SS3designation
2815*       appears
*       (In English: <ESC>O must precede every 2-byte
*        sequence.)
2818*
2819*      ESC $ + K       Indicates the immediate two bytes following SS3
2820*       is a Chinese character as defined in CNS
2821*       11643-plane-5, until another SS3designation
2822*       appears
2823*
2824*      ESC $ + L       Indicates the immediate two bytes following SS3
2825*       is a Chinese character as defined in CNS
2826*       11643-plane-6, until another SS3designation
2827*       appears
2828*
2829*      ESC $ + M       Indicates the immediate two bytes following SS3
2830*       is a Chinese character as defined in CNS
2831*       11643-plane-7, until another SS3designation
2832*       appears
2833*
2834*       As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2835*       has its own designation information before any Chinese characters
2836*       appear
2837*
2838*/
2839
/* The following are defined this way to make the strings truly readonly */
/*
 * Each string below is a 4-byte designator: 0x1B 0x24, then 0x29, 0x2A or 0x2B,
 * then a final byte 0x41..0x4D. The bytes are spelled as hex escapes.
 */
static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";
static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";
static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47";
static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48";
static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49";
static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A";
static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B";
static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C";
static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D";

/********************** ISO2022-CN Data **************************/
/*
 * Table of the ten sequences above, in a fixed order; entry 0 is the shift-in string.
 * NOTE(review): presumably indexed by the ISO-2022-CN charset constants defined
 * elsewhere in this file -- confirm against the code that reads this table.
 */
static const char* const escSeqCharsCN[10] ={
        SHIFT_IN_STR,                   /* 0 ASCII */
        GB_2312_80_STR,                 /* 1 GB2312_1 */
        ISO_IR_165_STR,                 /* 2 ISO_IR_165 */
        CNS_11643_1992_Plane_1_STR,     /* 3 CNS 11643-1992 plane 1 */
        CNS_11643_1992_Plane_2_STR,     /* 4 CNS 11643-1992 plane 2 */
        CNS_11643_1992_Plane_3_STR,     /* 5 CNS 11643-1992 plane 3 */
        CNS_11643_1992_Plane_4_STR,     /* 6 CNS 11643-1992 plane 4 */
        CNS_11643_1992_Plane_5_STR,     /* 7 CNS 11643-1992 plane 5 */
        CNS_11643_1992_Plane_6_STR,     /* 8 CNS 11643-1992 plane 6 */
        CNS_11643_1992_Plane_7_STR      /* 9 CNS 11643-1992 plane 7 */
};
2864
2865static void
2866UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2867    UConverter *cnv = args->converter;
2868    UConverterDataISO2022 *converterData;
2869    ISO2022State *pFromU2022State;
2870    uint8_t *target = (uint8_t *) args->target;
2871    const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
2872    const UChar* source = args->source;
2873    const UChar* sourceLimit = args->sourceLimit;
2874    int32_t* offsets = args->offsets;
2875    UChar32 sourceChar;
2876    char buffer[8];
2877    int32_t len;
2878    int8_t choices[3];
2879    int32_t choiceCount;
2880    uint32_t targetValue = 0;
2881    UBool useFallback;
2882
2883    /* set up the state */
2884    converterData     = (UConverterDataISO2022*)cnv->extraInfo;
2885    pFromU2022State   = &converterData->fromU2022State;
2886
2887    choiceCount = 0;
2888
2889    /* check if the last codepoint of previous buffer was a lead surrogate*/
2890    if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
2891        goto getTrail;
2892    }
2893
2894    while( source < sourceLimit){
2895        if(target < targetLimit){
2896
2897            sourceChar  = *(source++);
2898            /*check if the char is a First surrogate*/
2899             if(U16_IS_SURROGATE(sourceChar)) {
2900                if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2901getTrail:
2902                    /*look ahead to find the trail surrogate*/
2903                    if(source < sourceLimit) {
2904                        /* test the following code unit */
2905                        UChar trail=(UChar) *source;
2906                        if(U16_IS_TRAIL(trail)) {
2907                            source++;
2908                            sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2909                            cnv->fromUChar32=0x00;
2910                            /* convert this supplementary code point */
2911                            /* exit this condition tree */
2912                        } else {
2913                            /* this is an unmatched lead code unit (1st surrogate) */
2914                            /* callback(illegal) */
2915                            *err=U_ILLEGAL_CHAR_FOUND;
2916                            cnv->fromUChar32=sourceChar;
2917                            break;
2918                        }
2919                    } else {
2920                        /* no more input */
2921                        cnv->fromUChar32=sourceChar;
2922                        break;
2923                    }
2924                } else {
2925                    /* this is an unmatched trail code unit (2nd surrogate) */
2926                    /* callback(illegal) */
2927                    *err=U_ILLEGAL_CHAR_FOUND;
2928                    cnv->fromUChar32=sourceChar;
2929                    break;
2930                }
2931            }
2932
2933            /* do the conversion */
2934            if(sourceChar <= 0x007f ){
2935                /* do not convert SO/SI/ESC */
2936                if(IS_2022_CONTROL(sourceChar)) {
2937                    /* callback(illegal) */
2938                    *err=U_ILLEGAL_CHAR_FOUND;
2939                    cnv->fromUChar32=sourceChar;
2940                    break;
2941                }
2942
2943                /* US-ASCII */
2944                if(pFromU2022State->g == 0) {
2945                    buffer[0] = (char)sourceChar;
2946                    len = 1;
2947                } else {
2948                    buffer[0] = UCNV_SI;
2949                    buffer[1] = (char)sourceChar;
2950                    len = 2;
2951                    pFromU2022State->g = 0;
2952                    choiceCount = 0;
2953                }
2954                if(sourceChar == CR || sourceChar == LF) {
2955                    /* reset the state at the end of a line */
2956                    uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
2957                    choiceCount = 0;
2958                }
2959            }
2960            else{
2961                /* convert U+0080..U+10ffff */
2962                int32_t i;
2963                int8_t cs, g;
2964
2965                if(choiceCount == 0) {
2966                    /* try the current SO/G1 converter first */
2967                    choices[0] = pFromU2022State->cs[1];
2968
2969                    /* default to GB2312_1 if none is designated yet */
2970                    if(choices[0] == 0) {
2971                        choices[0] = GB2312_1;
2972                    }
2973
2974                    if(converterData->version == 0) {
2975                        /* ISO-2022-CN */
2976
2977                        /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
2978                        if(choices[0] == GB2312_1) {
2979                            choices[1] = (int8_t)CNS_11643_1;
2980                        } else {
2981                            choices[1] = (int8_t)GB2312_1;
2982                        }
2983
2984                        choiceCount = 2;
2985                    } else if (converterData->version == 1) {
2986                        /* ISO-2022-CN-EXT */
2987
2988                        /* try one of the other converters */
2989                        switch(choices[0]) {
2990                        case GB2312_1:
2991                            choices[1] = (int8_t)CNS_11643_1;
2992                            choices[2] = (int8_t)ISO_IR_165;
2993                            break;
2994                        case ISO_IR_165:
2995                            choices[1] = (int8_t)GB2312_1;
2996                            choices[2] = (int8_t)CNS_11643_1;
2997                            break;
2998                        default: /* CNS_11643_x */
2999                            choices[1] = (int8_t)GB2312_1;
3000                            choices[2] = (int8_t)ISO_IR_165;
3001                            break;
3002                        }
3003
3004                        choiceCount = 3;
3005                    } else {
3006                        choices[0] = (int8_t)CNS_11643_1;
3007                        choices[1] = (int8_t)GB2312_1;
3008                    }
3009                }
3010
3011                cs = g = 0;
3012                /*
3013                 * len==0: no mapping found yet
3014                 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
3015                 * len>0: found a roundtrip result, done
3016                 */
3017                len = 0;
3018                /*
3019                 * We will turn off useFallback after finding a fallback,
3020                 * but we still get fallbacks from PUA code points as usual.
3021                 * Therefore, we will also need to check that we don't overwrite
3022                 * an early fallback with a later one.
3023                 */
3024                useFallback = cnv->useFallback;
3025
3026                for(i = 0; i < choiceCount && len <= 0; ++i) {
3027                    int8_t cs0 = choices[i];
3028                    if(cs0 > 0) {
3029                        uint32_t value;
3030                        int32_t len2;
3031                        if(cs0 >= CNS_11643_0) {
3032                            len2 = MBCS_FROM_UCHAR32_ISO2022(
3033                                        converterData->myConverterArray[CNS_11643],
3034                                        sourceChar,
3035                                        &value,
3036                                        useFallback,
3037                                        MBCS_OUTPUT_3);
3038                            if(len2 == 3 || (len2 == -3 && len == 0)) {
3039                                targetValue = value;
3040                                cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
3041                                if(len2 >= 0) {
3042                                    len = 2;
3043                                } else {
3044                                    len = -2;
3045                                    useFallback = FALSE;
3046                                }
3047                                if(cs == CNS_11643_1) {
3048                                    g = 1;
3049                                } else if(cs == CNS_11643_2) {
3050                                    g = 2;
3051                                } else /* plane 3..7 */ if(converterData->version == 1) {
3052                                    g = 3;
3053                                } else {
3054                                    /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
3055                                    len = 0;
3056                                }
3057                            }
3058                        } else {
3059                            /* GB2312_1 or ISO-IR-165 */
3060                            U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS);
3061                            len2 = MBCS_FROM_UCHAR32_ISO2022(
3062                                        converterData->myConverterArray[cs0],
3063                                        sourceChar,
3064                                        &value,
3065                                        useFallback,
3066                                        MBCS_OUTPUT_2);
3067                            if(len2 == 2 || (len2 == -2 && len == 0)) {
3068                                targetValue = value;
3069                                len = len2;
3070                                cs = cs0;
3071                                g = 1;
3072                                useFallback = FALSE;
3073                            }
3074                        }
3075                    }
3076                }
3077
3078                if(len != 0) {
3079                    len = 0; /* count output bytes; it must have been abs(len) == 2 */
3080
3081                    /* write the designation sequence if necessary */
3082                    if(cs != pFromU2022State->cs[g]) {
3083                        if(cs < CNS_11643) {
3084                            uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
3085                        } else {
3086                            U_ASSERT(cs >= CNS_11643_1);
3087                            uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
3088                        }
3089                        len = 4;
3090                        pFromU2022State->cs[g] = cs;
3091                        if(g == 1) {
3092                            /* changing the SO/G1 charset invalidates the choices[] */
3093                            choiceCount = 0;
3094                        }
3095                    }
3096
3097                    /* write the shift sequence if necessary */
3098                    if(g != pFromU2022State->g) {
3099                        switch(g) {
3100                        case 1:
3101                            buffer[len++] = UCNV_SO;
3102
3103                            /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
3104                            pFromU2022State->g = 1;
3105                            break;
3106                        case 2:
3107                            buffer[len++] = 0x1b;
3108                            buffer[len++] = 0x4e;
3109                            break;
3110                        default: /* case 3 */
3111                            buffer[len++] = 0x1b;
3112                            buffer[len++] = 0x4f;
3113                            break;
3114                        }
3115                    }
3116
3117                    /* write the two output bytes */
3118                    buffer[len++] = (char)(targetValue >> 8);
3119                    buffer[len++] = (char)targetValue;
3120                } else {
3121                    /* if we cannot find the character after checking all codepages
3122                     * then this is an error
3123                     */
3124                    *err = U_INVALID_CHAR_FOUND;
3125                    cnv->fromUChar32=sourceChar;
3126                    break;
3127                }
3128            }
3129
3130            /* output len>0 bytes in buffer[] */
3131            if(len == 1) {
3132                *target++ = buffer[0];
3133                if(offsets) {
3134                    *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
3135                }
3136            } else if(len == 2 && (target + 2) <= targetLimit) {
3137                *target++ = buffer[0];
3138                *target++ = buffer[1];
3139                if(offsets) {
3140                    int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
3141                    *offsets++ = sourceIndex;
3142                    *offsets++ = sourceIndex;
3143                }
3144            } else {
3145                fromUWriteUInt8(
3146                    cnv,
3147                    buffer, len,
3148                    &target, (const char *)targetLimit,
3149                    &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
3150                    err);
3151                if(U_FAILURE(*err)) {
3152                    break;
3153                }
3154            }
3155        } /* end if(myTargetIndex<myTargetLength) */
3156        else{
3157            *err =U_BUFFER_OVERFLOW_ERROR;
3158            break;
3159        }
3160
3161    }/* end while(mySourceIndex<mySourceLength) */
3162
3163    /*
3164     * the end of the input stream and detection of truncated input
3165     * are handled by the framework, but for ISO-2022-CN conversion
3166     * we need to be in ASCII mode at the very end
3167     *
3168     * conditions:
3169     *   successful
3170     *   not in ASCII mode
3171     *   end of input and no truncated input
3172     */
3173    if( U_SUCCESS(*err) &&
3174        pFromU2022State->g!=0 &&
3175        args->flush && source>=sourceLimit && cnv->fromUChar32==0
3176    ) {
3177        int32_t sourceIndex;
3178
3179        /* we are switching to ASCII */
3180        pFromU2022State->g=0;
3181
3182        /* get the source index of the last input character */
3183        /*
3184         * TODO this would be simpler and more reliable if we used a pair
3185         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3186         * so that we could simply use the prevSourceIndex here;
3187         * this code gives an incorrect result for the rare case of an unmatched
3188         * trail surrogate that is alone in the last buffer of the text stream
3189         */
3190        sourceIndex=(int32_t)(source-args->source);
3191        if(sourceIndex>0) {
3192            --sourceIndex;
3193            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
3194                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
3195            ) {
3196                --sourceIndex;
3197            }
3198        } else {
3199            sourceIndex=-1;
3200        }
3201
3202        fromUWriteUInt8(
3203            cnv,
3204            SHIFT_IN_STR, 1,
3205            &target, (const char *)targetLimit,
3206            &offsets, sourceIndex,
3207            err);
3208    }
3209
3210    /*save the state and return */
3211    args->source = source;
3212    args->target = (char*)target;
3213}
3214
3215
/*
 * Converts ISO-2022-CN bytes in [args->source, args->sourceLimit) to UTF-16
 * in [args->target, args->targetLimit), optionally recording source offsets
 * in args->offsets.
 *
 * Byte handling:
 *  - SI selects G0 (single-byte); SO selects G1 if a charset has been
 *    designated for it, otherwise the SO byte is treated as an error.
 *  - ESC starts an escape sequence which is parsed by changeState_2022();
 *    a partially read sequence (myData->key != 0) is resumed on the next call.
 *  - CR and LF reset the whole toUnicode ISO-2022 state.
 *  - Any other byte is converted as a single byte (G0) or, together with the
 *    following byte, as a double-byte character of the current charset.
 *    A lead byte at the very end of the input is stored in toUBytes[] and
 *    resumed on the next call.
 *
 * An SI or a complete escape sequence that arrives while the segment opened
 * by SO is still empty is reported as U_ILLEGAL_ESCAPE_SEQUENCE with reason
 * UCNV_IRREGULAR. Unconvertible input is handed to toUnicodeCallback(), and
 * a full target yields U_BUFFER_OVERFLOW_ERROR.
 *
 * args->source and args->target are updated before returning.
 */
static void
UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                                               UErrorCode* err){
    char tempBuf[3];
    const char *mySource = (char *) args->source;
    UChar *myTarget = args->target;
    const char *mySourceLimit = args->sourceLimit;
    uint32_t targetUniChar = 0x0000;
    uint32_t mySourceChar = 0x0000;
    UConverterDataISO2022* myData;
    ISO2022State *pToU2022State;

    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
    pToU2022State = &myData->toU2022State;

    if(myData->key != 0) {
        /* continue with a partial escape sequence */
        goto escape;
    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
        /* continue with a partial double-byte character */
        mySourceChar = args->converter->toUBytes[0];
        args->converter->toULength = 0;
        targetUniChar = missingCharMarker;
        goto getTrailByte;
    }

    while(mySource < mySourceLimit){

        /* assume "unmappable" until a conversion below succeeds */
        targetUniChar =missingCharMarker;

        if(myTarget < args->targetLimit){

            mySourceChar= (unsigned char) *mySource++;

            switch(mySourceChar){
            case UCNV_SI:
                pToU2022State->g=0;
                if (myData->isEmptySegment) {
                    myData->isEmptySegment = FALSE;	/* we are handling it, reset to avoid future spurious errors */
                    *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    args->converter->toUCallbackReason = UCNV_IRREGULAR;
                    args->converter->toUBytes[0] = mySourceChar;
                    args->converter->toULength = 1;
                    args->target = myTarget;
                    args->source = mySource;
                    return;
                }
                continue;

            case UCNV_SO:
                if(pToU2022State->cs[1] != 0) {
                    pToU2022State->g=1;
                    myData->isEmptySegment = TRUE;	/* Begin a new segment, empty so far */
                    continue;
                } else {
                    /* illegal to have SO before a matching designator */
                    myData->isEmptySegment = FALSE;	/* Handling a different error, reset this to avoid future spurious errs */
                    /* leaves the switch with targetUniChar==missingCharMarker so that the callback code below runs */
                    break;
                }

            case ESC_2022:
                /* back up so that changeState_2022() sees the ESC byte itself */
                mySource--;
escape:
                {
                    /* remember where we were so that the length of the consumed sequence can be computed */
                    const char * mySourceBefore = mySource;
                    int8_t toULengthBefore = args->converter->toULength;

                    changeState_2022(args->converter,&(mySource),
                        mySourceLimit, ISO_2022_CN,err);

                    /* After SO there must be at least one character before a designator (designator error handled separately) */
                    if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
                        args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
                    }
                }

                /* invalid or illegal escape sequence */
                if(U_FAILURE(*err)){
                    args->target = myTarget;
                    args->source = mySource;
                    myData->isEmptySegment = FALSE;	/* Reset to avoid future spurious errors */
                    return;
                }
                continue;

            /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */

            case CR:
                /*falls through*/
            case LF:
                /* a line end resets all designations and shifts */
                uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
                /* falls through */
            default:
                /* convert one or two bytes */
                myData->isEmptySegment = FALSE;
                if(pToU2022State->g != 0) {
                    if(mySource < mySourceLimit) {
                        UConverterSharedData *cnv;
                        StateEnum tempState;
                        int32_t tempBufLen;
                        int leadIsOk, trailIsOk;
                        uint8_t trailByte;
getTrailByte:
                        /* peek at the second byte; it is consumed only in the branches below */
                        trailByte = (uint8_t)*mySource;
                        /*
                         * Ticket 5691: consistent illegal sequences:
                         * - We include at least the first byte in the illegal sequence.
                         * - If any of the non-initial bytes could be the start of a character,
                         *   we stop the illegal sequence before the first one of those.
                         *
                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
                         * Otherwise we convert or report the pair of bytes.
                         */
                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
                        if (leadIsOk && trailIsOk) {
                            ++mySource;
                            tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
                            if(tempState >= CNS_11643_0) {
                                /* CNS planes share one converter; a plane byte 0x80+plane is prepended */
                                cnv = myData->myConverterArray[CNS_11643];
                                tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
                                tempBuf[1] = (char) (mySourceChar);
                                tempBuf[2] = (char) trailByte;
                                tempBufLen = 3;

                            }else{
                                U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS);
                                cnv = myData->myConverterArray[tempState];
                                tempBuf[0] = (char) (mySourceChar);
                                tempBuf[1] = (char) trailByte;
                                tempBufLen = 2;
                            }
                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
                            /* keep both bytes so that offsets and error reporting see a 2-byte character */
                            mySourceChar = (mySourceChar << 8) | trailByte;
                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
                            ++mySource;
                            /* add another bit so that the code below writes 2 bytes in case of error */
                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
                        }
                        if(pToU2022State->g>=2) {
                            /* return from a single-shift state to the previous one */
                            pToU2022State->g=pToU2022State->prevG;
                        }
                    } else {
                        /* the trail byte is not available yet: save the lead byte for the next call */
                        args->converter->toUBytes[0] = (uint8_t)mySourceChar;
                        args->converter->toULength = 1;
                        goto endloop;
                    }
                }
                else{
                    /* G0: only 7-bit bytes are converted; others stay at missingCharMarker */
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = (UChar) mySourceChar;
                    }
                }
                break;
            }
            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
                /* BMP result: one UChar; the offset points at the first byte of the character */
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                *(myTarget++)=(UChar)targetUniChar;
            }
            else if(targetUniChar > missingCharMarker){
                /* disassemble the surrogate pair and write to output*/
                targetUniChar-=0x0010000;
                *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                ++myTarget;
                if(myTarget< args->targetLimit){
                    *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                    if(args->offsets){
                        args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                    }
                    ++myTarget;
                }else{
                    /* no room for the trail surrogate: park it in the converter's overflow buffer */
                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
                                    (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                }

            }
            else{
                /* Call the callback function*/
                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
                break;
            }
        }
        else{
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }
endloop:
    /* write the advanced positions back for the caller */
    args->target = myTarget;
    args->source = mySource;
}
3417
3418static void
3419_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3420    UConverter *cnv = args->converter;
3421    UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
3422    ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3423    char *p, *subchar;
3424    char buffer[8];
3425    int32_t length;
3426
3427    subchar=(char *)cnv->subChars;
3428    length=cnv->subCharLen; /* assume length==1 for most variants */
3429
3430    p = buffer;
3431    switch(myConverterData->locale[0]){
3432    case 'j':
3433        {
3434            int8_t cs;
3435
3436            if(pFromU2022State->g == 1) {
3437                /* JIS7: switch from G1 to G0 */
3438                pFromU2022State->g = 0;
3439                *p++ = UCNV_SI;
3440            }
3441
3442            cs = pFromU2022State->cs[0];
3443            if(cs != ASCII && cs != JISX201) {
3444                /* not in ASCII or JIS X 0201: switch to ASCII */
3445                pFromU2022State->cs[0] = (int8_t)ASCII;
3446                *p++ = '\x1b';
3447                *p++ = '\x28';
3448                *p++ = '\x42';
3449            }
3450
3451            *p++ = subchar[0];
3452            break;
3453        }
3454    case 'c':
3455        if(pFromU2022State->g != 0) {
3456            /* not in ASCII mode: switch to ASCII */
3457            pFromU2022State->g = 0;
3458            *p++ = UCNV_SI;
3459        }
3460        *p++ = subchar[0];
3461        break;
3462    case 'k':
3463        if(myConverterData->version == 0) {
3464            if(length == 1) {
3465                if((UBool)args->converter->fromUnicodeStatus) {
3466                    /* in DBCS mode: switch to SBCS */
3467                    args->converter->fromUnicodeStatus = 0;
3468                    *p++ = UCNV_SI;
3469                }
3470                *p++ = subchar[0];
3471            } else /* length == 2*/ {
3472                if(!(UBool)args->converter->fromUnicodeStatus) {
3473                    /* in SBCS mode: switch to DBCS */
3474                    args->converter->fromUnicodeStatus = 1;
3475                    *p++ = UCNV_SO;
3476                }
3477                *p++ = subchar[0];
3478                *p++ = subchar[1];
3479            }
3480            break;
3481        } else {
3482            /* save the subconverter's substitution string */
3483            uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3484            int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3485
3486            /* set our substitution string into the subconverter */
3487            myConverterData->currentConverter->subChars = (uint8_t *)subchar;
3488            myConverterData->currentConverter->subCharLen = (int8_t)length;
3489
3490            /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3491            args->converter = myConverterData->currentConverter;
3492            myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3493            ucnv_cbFromUWriteSub(args, 0, err);
3494            cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3495            args->converter = cnv;
3496
3497            /* restore the subconverter's substitution string */
3498            myConverterData->currentConverter->subChars = currentSubChars;
3499            myConverterData->currentConverter->subCharLen = currentSubCharLen;
3500
3501            if(*err == U_BUFFER_OVERFLOW_ERROR) {
3502                if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3503                    uprv_memcpy(
3504                        cnv->charErrorBuffer,
3505                        myConverterData->currentConverter->charErrorBuffer,
3506                        myConverterData->currentConverter->charErrorBufferLength);
3507                }
3508                cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3509                myConverterData->currentConverter->charErrorBufferLength = 0;
3510            }
3511            return;
3512        }
3513    default:
3514        /* not expected */
3515        break;
3516    }
3517    ucnv_cbFromUWriteBytes(args,
3518                           buffer, (int32_t)(p - buffer),
3519                           offsetIndex, err);
3520}
3521
3522/*
3523 * Structure for cloning an ISO 2022 converter into a single memory block.
3524 * ucnv_safeClone() of the converter will align the entire cloneStruct,
3525 * and then ucnv_safeClone() of the sub-converter may additionally align
3526 * currentConverter inside the cloneStruct, for which we need the deadSpace
3527 * after currentConverter.
3528 * This is because UAlignedMemory may be larger than the actually
3529 * necessary alignment size for the platform.
3530 * The other cloneStruct fields will not be moved around,
3531 * and are aligned properly with cloneStruct's alignment.
3532 */
struct cloneStruct
{
    UConverter cnv;                 /* the cloned ISO 2022 converter; its address is what the clone function returns */
    UConverter currentConverter;    /* buffer handed to ucnv_safeClone() for the sub-converter clone */
    UAlignedMemory deadSpace;       /* slack after currentConverter, counted in the size passed to ucnv_safeClone() */
    UConverterDataISO2022 mydata;   /* copy of the original's extraInfo; cnv.extraInfo points here */
};
3540
3541
3542static UConverter *
3543_ISO_2022_SafeClone(
3544            const UConverter *cnv,
3545            void *stackBuffer,
3546            int32_t *pBufferSize,
3547            UErrorCode *status)
3548{
3549    struct cloneStruct * localClone;
3550    UConverterDataISO2022 *cnvData;
3551    int32_t i, size;
3552
3553    if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3554        *pBufferSize = (int32_t)sizeof(struct cloneStruct);
3555        return NULL;
3556    }
3557
3558    cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
3559    localClone = (struct cloneStruct *)stackBuffer;
3560
3561    /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
3562
3563    uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
3564    localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3565    localClone->cnv.isExtraLocal = TRUE;
3566
3567    /* share the subconverters */
3568
3569    if(cnvData->currentConverter != NULL) {
3570        size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
3571        localClone->mydata.currentConverter =
3572            ucnv_safeClone(cnvData->currentConverter,
3573                            &localClone->currentConverter,
3574                            &size, status);
3575        if(U_FAILURE(*status)) {
3576            return NULL;
3577        }
3578    }
3579
3580    for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3581        if(cnvData->myConverterArray[i] != NULL) {
3582            ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3583        }
3584    }
3585
3586    return &localClone->cnv;
3587}
3588
3589static void
3590_ISO_2022_GetUnicodeSet(const UConverter *cnv,
3591                    const USetAdder *sa,
3592                    UConverterUnicodeSet which,
3593                    UErrorCode *pErrorCode)
3594{
3595    int32_t i;
3596    UConverterDataISO2022* cnvData;
3597
3598    if (U_FAILURE(*pErrorCode)) {
3599        return;
3600    }
3601#ifdef U_ENABLE_GENERIC_ISO_2022
3602    if (cnv->sharedData == &_ISO2022Data) {
3603        /* We use UTF-8 in this case */
3604        sa->addRange(sa->set, 0, 0xd7FF);
3605        sa->addRange(sa->set, 0xE000, 0x10FFFF);
3606        return;
3607    }
3608#endif
3609
3610    cnvData = (UConverterDataISO2022*)cnv->extraInfo;
3611
3612    /* open a set and initialize it with code points that are algorithmically round-tripped */
3613    switch(cnvData->locale[0]){
3614    case 'j':
3615        /* include JIS X 0201 which is hardcoded */
3616        sa->add(sa->set, 0xa5);
3617        sa->add(sa->set, 0x203e);
3618        if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3619            /* include Latin-1 for some variants of JP */
3620            sa->addRange(sa->set, 0, 0xff);
3621        } else {
3622            /* include ASCII for JP */
3623            sa->addRange(sa->set, 0, 0x7f);
3624        }
3625        if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3626            /*
3627             * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3628             * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3629             * use half-width Katakana.
3630             * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3631             * half-width Katakana via the ESC ( I sequence.
3632             * However, we only emit (fromUnicode) half-width Katakana according to the
3633             * definition of each variant.
3634             *
3635             * When including fallbacks,
3636             * we need to include half-width Katakana Unicode code points for all JP variants because
3637             * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3638             */
3639            /* include half-width Katakana for JP */
3640            sa->addRange(sa->set, HWKANA_START, HWKANA_END);
3641        }
3642        break;
3643    case 'c':
3644    case 'z':
3645        /* include ASCII for CN */
3646        sa->addRange(sa->set, 0, 0x7f);
3647        break;
3648    case 'k':
3649        /* there is only one converter for KR, and it is not in the myConverterArray[] */
3650        cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3651                cnvData->currentConverter, sa, which, pErrorCode);
3652        /* the loop over myConverterArray[] will simply not find another converter */
3653        break;
3654    default:
3655        break;
3656    }
3657
3658#if 0  /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
3659            if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3660                cnvData->version==0 && i==CNS_11643
3661            ) {
3662                /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3663                ucnv_MBCSGetUnicodeSetForBytes(
3664                        cnvData->myConverterArray[i],
3665                        sa, UCNV_ROUNDTRIP_SET,
3666                        0, 0x81, 0x82,
3667                        pErrorCode);
3668            }
3669#endif
3670
3671    for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3672        UConverterSetFilter filter;
3673        if(cnvData->myConverterArray[i]!=NULL) {
3674            if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3675                cnvData->version==0 && i==CNS_11643
3676            ) {
3677                /*
3678                 * Version-specific for CN:
3679                 * CN version 0 does not map CNS planes 3..7 although
3680                 * they are all available in the CNS conversion table;
3681                 * CN version 1 (-EXT) does map them all.
3682                 * The two versions create different Unicode sets.
3683                 */
3684                filter=UCNV_SET_FILTER_2022_CN;
3685            } else if(cnvData->locale[0]=='j' && i==JISX208) {
3686                /*
3687                 * Only add code points that map to Shift-JIS codes
3688                 * corresponding to JIS X 0208.
3689                 */
3690                filter=UCNV_SET_FILTER_SJIS;
3691            } else if(i==KSC5601) {
3692                /*
3693                 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3694                 * are broader than GR94.
3695                 */
3696                filter=UCNV_SET_FILTER_GR94DBCS;
3697            } else {
3698                filter=UCNV_SET_FILTER_NONE;
3699            }
3700            ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
3701        }
3702    }
3703
3704    /*
3705     * ISO 2022 converters must not convert SO/SI/ESC despite what
3706     * sub-converters do by themselves.
3707     * Remove these characters from the set.
3708     */
3709    sa->remove(sa->set, 0x0e);
3710    sa->remove(sa->set, 0x0f);
3711    sa->remove(sa->set, 0x1b);
3712
3713    /* ISO 2022 converters do not convert C1 controls either */
3714    sa->removeRange(sa->set, 0x80, 0x9f);
3715}
3716
3717static const UConverterImpl _ISO2022Impl={
3718    UCNV_ISO_2022,
3719
3720    NULL,
3721    NULL,
3722
3723    _ISO2022Open,
3724    _ISO2022Close,
3725    _ISO2022Reset,
3726
3727#ifdef U_ENABLE_GENERIC_ISO_2022
3728    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3729    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3730    ucnv_fromUnicode_UTF8,
3731    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
3732#else
3733    NULL,
3734    NULL,
3735    NULL,
3736    NULL,
3737#endif
3738    NULL,
3739
3740    NULL,
3741    _ISO2022getName,
3742    _ISO_2022_WriteSub,
3743    _ISO_2022_SafeClone,
3744    _ISO_2022_GetUnicodeSet,
3745
3746    NULL,
3747    NULL
3748};
3749static const UConverterStaticData _ISO2022StaticData={
3750    sizeof(UConverterStaticData),
3751    "ISO_2022",
3752    2022,
3753    UCNV_IBM,
3754    UCNV_ISO_2022,
3755    1,
3756    3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
3757    { 0x1a, 0, 0, 0 },
3758    1,
3759    FALSE,
3760    FALSE,
3761    0,
3762    0,
3763    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3764};
3765const UConverterSharedData _ISO2022Data={
3766    sizeof(UConverterSharedData),
3767    ~((uint32_t) 0),
3768    NULL,
3769    NULL,
3770    &_ISO2022StaticData,
3771    FALSE,
3772    &_ISO2022Impl,
3773    0, UCNV_MBCS_TABLE_INITIALIZER
3774};
3775
3776/*************JP****************/
3777static const UConverterImpl _ISO2022JPImpl={
3778    UCNV_ISO_2022,
3779
3780    NULL,
3781    NULL,
3782
3783    _ISO2022Open,
3784    _ISO2022Close,
3785    _ISO2022Reset,
3786
3787    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3788    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3789    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3790    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3791    NULL,
3792
3793    NULL,
3794    _ISO2022getName,
3795    _ISO_2022_WriteSub,
3796    _ISO_2022_SafeClone,
3797    _ISO_2022_GetUnicodeSet,
3798
3799    NULL,
3800    NULL
3801};
3802static const UConverterStaticData _ISO2022JPStaticData={
3803    sizeof(UConverterStaticData),
3804    "ISO_2022_JP",
3805    0,
3806    UCNV_IBM,
3807    UCNV_ISO_2022,
3808    1,
3809    6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3810    { 0x1a, 0, 0, 0 },
3811    1,
3812    FALSE,
3813    FALSE,
3814    0,
3815    0,
3816    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3817};
3818
3819namespace {
3820
3821const UConverterSharedData _ISO2022JPData={
3822    sizeof(UConverterSharedData),
3823    ~((uint32_t) 0),
3824    NULL,
3825    NULL,
3826    &_ISO2022JPStaticData,
3827    FALSE,
3828    &_ISO2022JPImpl,
3829    0, UCNV_MBCS_TABLE_INITIALIZER
3830};
3831
3832}  // namespace
3833
3834/************* KR ***************/
3835static const UConverterImpl _ISO2022KRImpl={
3836    UCNV_ISO_2022,
3837
3838    NULL,
3839    NULL,
3840
3841    _ISO2022Open,
3842    _ISO2022Close,
3843    _ISO2022Reset,
3844
3845    UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3846    UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3847    UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3848    UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3849    NULL,
3850
3851    NULL,
3852    _ISO2022getName,
3853    _ISO_2022_WriteSub,
3854    _ISO_2022_SafeClone,
3855    _ISO_2022_GetUnicodeSet,
3856
3857    NULL,
3858    NULL
3859};
3860static const UConverterStaticData _ISO2022KRStaticData={
3861    sizeof(UConverterStaticData),
3862    "ISO_2022_KR",
3863    0,
3864    UCNV_IBM,
3865    UCNV_ISO_2022,
3866    1,
3867    3, /* max 3 bytes per UChar: SO+DBCS */
3868    { 0x1a, 0, 0, 0 },
3869    1,
3870    FALSE,
3871    FALSE,
3872    0,
3873    0,
3874    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3875};
3876
3877namespace {
3878
3879const UConverterSharedData _ISO2022KRData={
3880    sizeof(UConverterSharedData),
3881    ~((uint32_t) 0),
3882    NULL,
3883    NULL,
3884    &_ISO2022KRStaticData,
3885    FALSE,
3886    &_ISO2022KRImpl,
3887    0, UCNV_MBCS_TABLE_INITIALIZER
3888};
3889
3890}  // namespace
3891
3892/*************** CN ***************/
3893static const UConverterImpl _ISO2022CNImpl={
3894
3895    UCNV_ISO_2022,
3896
3897    NULL,
3898    NULL,
3899
3900    _ISO2022Open,
3901    _ISO2022Close,
3902    _ISO2022Reset,
3903
3904    UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3905    UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3906    UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3907    UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3908    NULL,
3909
3910    NULL,
3911    _ISO2022getName,
3912    _ISO_2022_WriteSub,
3913    _ISO_2022_SafeClone,
3914    _ISO_2022_GetUnicodeSet,
3915
3916    NULL,
3917    NULL
3918};
3919static const UConverterStaticData _ISO2022CNStaticData={
3920    sizeof(UConverterStaticData),
3921    "ISO_2022_CN",
3922    0,
3923    UCNV_IBM,
3924    UCNV_ISO_2022,
3925    1,
3926    8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3927    { 0x1a, 0, 0, 0 },
3928    1,
3929    FALSE,
3930    FALSE,
3931    0,
3932    0,
3933    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3934};
3935
3936namespace {
3937
3938const UConverterSharedData _ISO2022CNData={
3939    sizeof(UConverterSharedData),
3940    ~((uint32_t) 0),
3941    NULL,
3942    NULL,
3943    &_ISO2022CNStaticData,
3944    FALSE,
3945    &_ISO2022CNImpl,
3946    0, UCNV_MBCS_TABLE_INITIALIZER
3947};
3948
3949}  // namespace
3950
3951#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
3952