1/*
2 **********************************************************************
3 *   Copyright (C) 2005-2012, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 */
7
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_CONVERSION
11
12#include "unicode/ucsdet.h"
13
14#include "csdetect.h"
15#include "csmatch.h"
16#include "uenumimp.h"
17
18#include "cmemory.h"
19#include "cstring.h"
20#include "umutex.h"
21#include "ucln_in.h"
22#include "uarrsort.h"
23#include "inputext.h"
24#include "csrsbcs.h"
25#include "csrmbcs.h"
26#include "csrutf8.h"
27#include "csrucode.h"
28#include "csr2022.h"
29
/*
 * Element count of a true array (never a pointer). The argument is fully
 * parenthesized so that any array-valued expression can be passed safely.
 */
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

/*
 * Raw storage for `count` objects of `type`, obtained from uprv_malloc().
 * No constructors run; callers in this file check the result against NULL.
 * The whole expansion is parenthesized so it composes with postfix operators.
 */
#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))

/* Hands storage obtained through NEW_ARRAY back to uprv_free(). */
#define DELETE_ARRAY(array) uprv_free((void *) (array))
34
35U_CDECL_BEGIN
36static icu::CharsetRecognizer **fCSRecognizers = NULL;
37
38static int32_t fCSRecognizers_size = 0;
39
40static UBool U_CALLCONV csdet_cleanup(void)
41{
42    if (fCSRecognizers != NULL) {
43        for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
44            delete fCSRecognizers[r];
45            fCSRecognizers[r] = NULL;
46        }
47
48        DELETE_ARRAY(fCSRecognizers);
49        fCSRecognizers = NULL;
50        fCSRecognizers_size = 0;
51    }
52
53    return TRUE;
54}
55
56static int32_t U_CALLCONV
57charsetMatchComparator(const void * /*context*/, const void *left, const void *right)
58{
59    U_NAMESPACE_USE
60
61    const CharsetMatch **csm_l = (const CharsetMatch **) left;
62    const CharsetMatch **csm_r = (const CharsetMatch **) right;
63
64    // NOTE: compare is backwards to sort from highest to lowest.
65    return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
66}
67
68U_CDECL_END
69
70U_NAMESPACE_BEGIN
71
72void CharsetDetector::setRecognizers(UErrorCode &status)
73{
74    UBool needsInit;
75    CharsetRecognizer **recognizers;
76
77    if (U_FAILURE(status)) {
78        return;
79    }
80
81    UMTX_CHECK(NULL, (UBool) (fCSRecognizers == NULL), needsInit);
82
83    if (needsInit) {
84        CharsetRecognizer *tempArray[] = {
85            new CharsetRecog_UTF8(),
86
87            new CharsetRecog_UTF_16_BE(),
88            new CharsetRecog_UTF_16_LE(),
89            new CharsetRecog_UTF_32_BE(),
90            new CharsetRecog_UTF_32_LE(),
91
92            new CharsetRecog_8859_1(),
93            new CharsetRecog_8859_2(),
94            new CharsetRecog_8859_5_ru(),
95            new CharsetRecog_8859_6_ar(),
96            new CharsetRecog_8859_7_el(),
97            new CharsetRecog_8859_8_I_he(),
98            new CharsetRecog_8859_8_he(),
99            new CharsetRecog_windows_1251(),
100            new CharsetRecog_windows_1256(),
101            new CharsetRecog_KOI8_R(),
102            new CharsetRecog_8859_9_tr(),
103            new CharsetRecog_sjis(),
104            new CharsetRecog_gb_18030(),
105            new CharsetRecog_euc_jp(),
106            new CharsetRecog_euc_kr(),
107            new CharsetRecog_big5(),
108
109            new CharsetRecog_2022JP(),
110            new CharsetRecog_2022KR(),
111            new CharsetRecog_2022CN(),
112
113            new CharsetRecog_IBM424_he_rtl(),
114            new CharsetRecog_IBM424_he_ltr(),
115            new CharsetRecog_IBM420_ar_rtl(),
116            new CharsetRecog_IBM420_ar_ltr()
117        };
118        int32_t rCount = ARRAY_SIZE(tempArray);
119        int32_t r;
120
121        recognizers = NEW_ARRAY(CharsetRecognizer *, rCount);
122
123        if (recognizers == NULL) {
124            status = U_MEMORY_ALLOCATION_ERROR;
125            return;
126        } else {
127            for (r = 0; r < rCount; r += 1) {
128                recognizers[r] = tempArray[r];
129
130                if (recognizers[r] == NULL) {
131                    status = U_MEMORY_ALLOCATION_ERROR;
132                    break;
133                }
134            }
135        }
136
137        if (U_SUCCESS(status)) {
138            umtx_lock(NULL);
139            if (fCSRecognizers == NULL) {
140                fCSRecognizers_size = rCount;
141                fCSRecognizers = recognizers;
142            }
143            umtx_unlock(NULL);
144        }
145
146        if (fCSRecognizers != recognizers) {
147            for (r = 0; r < rCount; r += 1) {
148                delete recognizers[r];
149                recognizers[r] = NULL;
150            }
151
152            DELETE_ARRAY(recognizers);
153        }
154
155        recognizers = NULL;
156        ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
157    }
158}
159
160CharsetDetector::CharsetDetector(UErrorCode &status)
161  : textIn(new InputText(status)), resultArray(NULL),
162    resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE)
163{
164    if (U_FAILURE(status)) {
165        return;
166    }
167
168    setRecognizers(status);
169
170    if (U_FAILURE(status)) {
171        return;
172    }
173
174    resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
175
176    if (resultArray == NULL) {
177        status = U_MEMORY_ALLOCATION_ERROR;
178        return;
179    }
180
181    for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
182        resultArray[i] = new CharsetMatch();
183
184        if (resultArray[i] == NULL) {
185            status = U_MEMORY_ALLOCATION_ERROR;
186            break;
187        }
188    }
189}
190
191CharsetDetector::~CharsetDetector()
192{
193    delete textIn;
194
195    for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
196        delete resultArray[i];
197    }
198
199    uprv_free(resultArray);
200}
201
202void CharsetDetector::setText(const char *in, int32_t len)
203{
204    textIn->setText(in, len);
205    fFreshTextSet = TRUE;
206}
207
208UBool CharsetDetector::setStripTagsFlag(UBool flag)
209{
210    UBool temp = fStripTags;
211    fStripTags = flag;
212    fFreshTextSet = TRUE;
213    return temp;
214}
215
216UBool CharsetDetector::getStripTagsFlag() const
217{
218    return fStripTags;
219}
220
221void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
222{
223    textIn->setDeclaredEncoding(encoding,len);
224}
225
226int32_t CharsetDetector::getDetectableCount()
227{
228    UErrorCode status = U_ZERO_ERROR;
229
230    setRecognizers(status);
231
232    return fCSRecognizers_size;
233}
234
235const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
236{
237    int32_t maxMatchesFound = 0;
238
239    detectAll(maxMatchesFound, status);
240
241    if(maxMatchesFound > 0) {
242        return resultArray[0];
243    } else {
244        return NULL;
245    }
246}
247
248const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
249{
250    if(!textIn->isSet()) {
251        status = U_MISSING_RESOURCE_ERROR;// TODO:  Need to set proper status code for input text not set
252
253        return NULL;
254    } else if (fFreshTextSet) {
255        CharsetRecognizer *csr;
256        int32_t            i;
257
258        textIn->MungeInput(fStripTags);
259
260        // Iterate over all possible charsets, remember all that
261        // give a match quality > 0.
262        resultCount = 0;
263        for (i = 0; i < fCSRecognizers_size; i += 1) {
264            csr = fCSRecognizers[i];
265            if (csr->match(textIn, resultArray[resultCount])) {
266                resultCount++;
267            }
268        }
269
270        if (resultCount > 1) {
271            uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
272        }
273        fFreshTextSet = FALSE;
274    }
275
276    maxMatchesFound = resultCount;
277
278    return resultArray;
279}
280
281/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
282{
283    if( index > fCSRecognizers_size-1 || index < 0) {
284        status = U_INDEX_OUTOFBOUNDS_ERROR;
285
286        return 0;
287    } else {
288        return fCSRecognizers[index]->getName();
289    }
290}*/
291
292U_NAMESPACE_END
293
294U_CDECL_BEGIN
/*
 * Per-enumeration cursor stored in UEnumeration::context by
 * ucsdet_getAllDetectableCharsets().
 */
typedef struct {
    /* Index into fCSRecognizers of the next name enumNext() will return;
       enumReset() puts it back to 0. */
    int32_t currIndex;
} Context;
298
299
300
301static void U_CALLCONV
302enumClose(UEnumeration *en) {
303    if(en->context != NULL) {
304        DELETE_ARRAY(en->context);
305    }
306
307    DELETE_ARRAY(en);
308}
309
310static int32_t U_CALLCONV
311enumCount(UEnumeration *, UErrorCode *) {
312    return fCSRecognizers_size;
313}
314
315static const char* U_CALLCONV
316enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
317    if(((Context *)en->context)->currIndex >= fCSRecognizers_size) {
318        if(resultLength != NULL) {
319            *resultLength = 0;
320        }
321        return NULL;
322    }
323    const char *currName = fCSRecognizers[((Context *)en->context)->currIndex]->getName();
324    if(resultLength != NULL) {
325        *resultLength = (int32_t)uprv_strlen(currName);
326    }
327    ((Context *)en->context)->currIndex++;
328
329    return currName;
330}
331
332static void U_CALLCONV
333enumReset(UEnumeration *en, UErrorCode *) {
334    ((Context *)en->context)->currIndex = 0;
335}
336
337static const UEnumeration gCSDetEnumeration = {
338    NULL,
339    NULL,
340    enumClose,
341    enumCount,
342    uenum_unextDefault,
343    enumNext,
344    enumReset
345};
346
347U_CAPI  UEnumeration * U_EXPORT2
348ucsdet_getAllDetectableCharsets(const UCharsetDetector * /*ucsd*/, UErrorCode *status)
349{
350    U_NAMESPACE_USE
351
352    if(U_FAILURE(*status)) {
353        return 0;
354    }
355
356    /* Initialize recognized charsets. */
357    CharsetDetector::getDetectableCount();
358
359    UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
360    memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
361    en->context = (void*)NEW_ARRAY(Context, 1);
362    uprv_memset(en->context, 0, sizeof(Context));
363    return en;
364}
365U_CDECL_END
366
367#endif
368
369