1/*
2*******************************************************************************
3* Copyright (C) 2012, International Business Machines
4* Corporation and others.  All Rights Reserved.
5*******************************************************************************
* dictionarydata.cpp
7*
8* created on: 2012may31
9* created by: Markus W. Scherer & Maxime Serrano
10*/
11
12#include "dictionarydata.h"
13#include "unicode/ucharstrie.h"
14#include "unicode/bytestrie.h"
15#include "unicode/udata.h"
16#include "cmemory.h"
17
18#if !UCONFIG_NO_BREAK_ITERATION
19
20U_NAMESPACE_BEGIN
21
// Namespace-scope definitions of the DictionaryData trie-type constants
// (presumably declared with in-class initializers in dictionarydata.h -- confirm).
#ifndef CYGWINMSVC /* On Cygwin/MSVC, these definitions cause symbol redefinition errors. */
const int32_t DictionaryData::TRIE_TYPE_BYTES;
const int32_t DictionaryData::TRIE_TYPE_UCHARS;
#endif
26
27DictionaryMatcher::~DictionaryMatcher() {
28}
29
30UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
31    udata_close(file);
32}
33
34int32_t UCharsDictionaryMatcher::getType() const {
35    return DictionaryData::TRIE_TYPE_UCHARS;
36}
37
38int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
39    UCharsTrie uct(characters);
40    UChar32 c = utext_next32(text);
41    if (c < 0) {
42        return 0;
43    }
44    UStringTrieResult result = uct.first(c);
45    int32_t numChars = 1;
46    count = 0;
47    for (;;) {
48        if (USTRINGTRIE_HAS_VALUE(result)) {
49            if (count < limit) {
50                if (values != NULL) {
51                    values[count] = uct.getValue();
52                }
53                lengths[count++] = numChars;
54            }
55            if (result == USTRINGTRIE_FINAL_VALUE) {
56                break;
57            }
58        }
59        else if (result == USTRINGTRIE_NO_MATCH) {
60            break;
61        }
62
63        // TODO: why do we have a text limit if the UText knows its length?
64        if (numChars >= maxLength) {
65            break;
66        }
67
68        c = utext_next32(text);
69        if (c < 0) {
70            break;
71        }
72        ++numChars;
73        result = uct.next(c);
74    }
75    return numChars;
76}
77
78BytesDictionaryMatcher::~BytesDictionaryMatcher() {
79    udata_close(file);
80}
81
82UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
83    if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {
84        if (c == 0x200D) {
85            return 0xFF;
86        } else if (c == 0x200C) {
87            return 0xFE;
88        }
89        int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
90        if (delta < 0 || 0xFD < delta) {
91            return U_SENTINEL;
92        }
93        return (UChar32)delta;
94    }
95    return c;
96}
97
98int32_t BytesDictionaryMatcher::getType() const {
99    return DictionaryData::TRIE_TYPE_BYTES;
100}
101
102int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
103    BytesTrie bt(characters);
104    UChar32 c = utext_next32(text);
105    if (c < 0) {
106        return 0;
107    }
108    UStringTrieResult result = bt.first(transform(c));
109    int32_t numChars = 1;
110    count = 0;
111    for (;;) {
112        if (USTRINGTRIE_HAS_VALUE(result)) {
113            if (count < limit) {
114                if (values != NULL) {
115                    values[count] = bt.getValue();
116            }
117                lengths[count++] = numChars;
118            }
119            if (result == USTRINGTRIE_FINAL_VALUE) {
120                break;
121            }
122        }
123        else if (result == USTRINGTRIE_NO_MATCH) {
124            break;
125        }
126
127        // TODO: why do we have a text limit if the UText knows its length?
128        if (numChars >= maxLength) {
129            break;
130        }
131
132        c = utext_next32(text);
133        if (c < 0) {
134            break;
135        }
136        ++numChars;
137        result = bt.next(transform(c));
138    }
139    return numChars;
140}
141
142
143U_NAMESPACE_END
144
145U_NAMESPACE_USE
146
147U_CAPI int32_t U_EXPORT2
148udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
149           void *outData, UErrorCode *pErrorCode) {
150    const UDataInfo *pInfo;
151    int32_t headerSize;
152    const uint8_t *inBytes;
153    uint8_t *outBytes;
154    const int32_t *inIndexes;
155    int32_t indexes[DictionaryData::IX_COUNT];
156    int32_t i, offset, size;
157
158    headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
159    if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0;
160    pInfo = (const UDataInfo *)((const char *)inData + 4);
161    if (!(pInfo->dataFormat[0] == 0x44 &&
162          pInfo->dataFormat[1] == 0x69 &&
163          pInfo->dataFormat[2] == 0x63 &&
164          pInfo->dataFormat[3] == 0x74 &&
165          pInfo->formatVersion[0] == 1)) {
166        udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
167                         pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
168        *pErrorCode = U_UNSUPPORTED_ERROR;
169        return 0;
170    }
171
172    inBytes = (const uint8_t *)inData + headerSize;
173    outBytes = (uint8_t *)outData + headerSize;
174
175    inIndexes = (const int32_t *)inBytes;
176    if (length >= 0) {
177        length -= headerSize;
178        if (length < (int32_t)(sizeof(indexes))) {
179            udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
180            *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
181            return 0;
182        }
183    }
184
185    for (i = 0; i < DictionaryData::IX_COUNT; i++) {
186        indexes[i] = udata_readInt32(ds, inIndexes[i]);
187    }
188
189    size = indexes[DictionaryData::IX_TOTAL_SIZE];
190
191    if (length >= 0) {
192        if (length < size) {
193            udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
194            *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
195            return 0;
196        }
197
198        if (inBytes != outBytes) {
199            uprv_memcpy(outBytes, inBytes, size);
200        }
201
202        offset = 0;
203        ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);
204        offset = (int32_t)sizeof(indexes);
205        int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
206        int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];
207
208        if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
209            ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);
210        } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
211            // nothing to do
212        } else {
213            udata_printError(ds, "udict_swap(): unknown trie type!\n");
214            *pErrorCode = U_UNSUPPORTED_ERROR;
215            return 0;
216        }
217
218        // these next two sections are empty in the current format,
219        // but may be used later.
220        offset = nextOffset;
221        nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];
222        offset = nextOffset;
223        nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];
224        offset = nextOffset;
225    }
226    return headerSize + size;
227}
228#endif
229