1/*
2*******************************************************************************
3* Copyright (C) 2013-2014, International Business Machines
4* Corporation and others.  All Rights Reserved.
5*******************************************************************************
6* collationdatawriter.cpp
7*
8* created on: 2013aug06
9* created by: Markus W. Scherer
10*/
11
12#include "unicode/utypes.h"
13
14#if !UCONFIG_NO_COLLATION
15
16#include "unicode/tblcoll.h"
17#include "unicode/udata.h"
18#include "unicode/uniset.h"
19#include "cmemory.h"
20#include "collationdata.h"
21#include "collationdatabuilder.h"
22#include "collationdatareader.h"
23#include "collationdatawriter.h"
24#include "collationfastlatin.h"
25#include "collationsettings.h"
26#include "collationtailoring.h"
27#include "uassert.h"
28#include "ucmndata.h"
29
30U_NAMESPACE_BEGIN
31
32uint8_t *
33RuleBasedCollator::cloneRuleData(int32_t &length, UErrorCode &errorCode) const {
34    if(U_FAILURE(errorCode)) { return NULL; }
35    LocalMemory<uint8_t> buffer((uint8_t *)uprv_malloc(20000));
36    if(buffer.isNull()) {
37        errorCode = U_MEMORY_ALLOCATION_ERROR;
38        return NULL;
39    }
40    length = cloneBinary(buffer.getAlias(), 20000, errorCode);
41    if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
42        if(buffer.allocateInsteadAndCopy(length, 0) == NULL) {
43            errorCode = U_MEMORY_ALLOCATION_ERROR;
44            return NULL;
45        }
46        errorCode = U_ZERO_ERROR;
47        length = cloneBinary(buffer.getAlias(), length, errorCode);
48    }
49    if(U_FAILURE(errorCode)) { return NULL; }
50    return buffer.orphan();
51}
52
53int32_t
54RuleBasedCollator::cloneBinary(uint8_t *dest, int32_t capacity, UErrorCode &errorCode) const {
55    int32_t indexes[CollationDataReader::IX_TOTAL_SIZE + 1];
56    return CollationDataWriter::writeTailoring(
57            *tailoring, *settings, indexes, dest, capacity,
58            errorCode);
59}
60
61static const UDataInfo dataInfo = {
62    sizeof(UDataInfo),
63    0,
64
65    U_IS_BIG_ENDIAN,
66    U_CHARSET_FAMILY,
67    U_SIZEOF_UCHAR,
68    0,
69
70    { 0x55, 0x43, 0x6f, 0x6c },         // dataFormat="UCol"
71    { 4, 0, 0, 0 },                     // formatVersion
72    { 6, 3, 0, 0 }                      // dataVersion
73};
74
75int32_t
76CollationDataWriter::writeBase(const CollationData &data, const CollationSettings &settings,
77                               const void *rootElements, int32_t rootElementsLength,
78                               int32_t indexes[], uint8_t *dest, int32_t capacity,
79                               UErrorCode &errorCode) {
80    return write(TRUE, NULL,
81                 data, settings,
82                 rootElements, rootElementsLength,
83                 indexes, dest, capacity, errorCode);
84}
85
86int32_t
87CollationDataWriter::writeTailoring(const CollationTailoring &t, const CollationSettings &settings,
88                                    int32_t indexes[], uint8_t *dest, int32_t capacity,
89                                    UErrorCode &errorCode) {
90    return write(FALSE, t.version,
91                 *t.data, settings,
92                 NULL, 0,
93                 indexes, dest, capacity, errorCode);
94}
95
96int32_t
97CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion,
98                           const CollationData &data, const CollationSettings &settings,
99                           const void *rootElements, int32_t rootElementsLength,
100                           int32_t indexes[], uint8_t *dest, int32_t capacity,
101                           UErrorCode &errorCode) {
102    if(U_FAILURE(errorCode)) { return 0; }
103    if(capacity < 0 || (capacity > 0 && dest == NULL)) {
104        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
105        return 0;
106    }
107
108    // Figure out which data items to write before settling on
109    // the indexes length and writing offsets.
110    // For any data item, we need to write the start and limit offsets,
111    // so the indexes length must be at least index-of-start-offset + 2.
112    int32_t indexesLength;
113    UBool hasMappings;
114    UnicodeSet unsafeBackwardSet;
115    const CollationData *baseData = data.base;
116
117    int32_t fastLatinVersion;
118    if(data.fastLatinTable != NULL) {
119        fastLatinVersion = (int32_t)CollationFastLatin::VERSION << 16;
120    } else {
121        fastLatinVersion = 0;
122    }
123    int32_t fastLatinTableLength = 0;
124
125    if(isBase) {
126        // For the root collator, we write an even number of indexes
127        // so that we start with an 8-aligned offset.
128        indexesLength = CollationDataReader::IX_TOTAL_SIZE + 1;
129        U_ASSERT(settings.reorderCodesLength == 0);
130        hasMappings = TRUE;
131        unsafeBackwardSet = *data.unsafeBackwardSet;
132        fastLatinTableLength = data.fastLatinTableLength;
133    } else if(baseData == NULL) {
134        hasMappings = FALSE;
135        if(settings.reorderCodesLength == 0) {
136            // only options
137            indexesLength = CollationDataReader::IX_OPTIONS + 1;  // no limit offset here
138        } else {
139            // only options, reorder codes, and the reorder table
140            indexesLength = CollationDataReader::IX_REORDER_TABLE_OFFSET + 2;
141        }
142    } else {
143        hasMappings = TRUE;
144        // Tailored mappings, and what else?
145        // Check in ascending order of optional tailoring data items.
146        indexesLength = CollationDataReader::IX_CE32S_OFFSET + 2;
147        if(data.contextsLength != 0) {
148            indexesLength = CollationDataReader::IX_CONTEXTS_OFFSET + 2;
149        }
150        unsafeBackwardSet.addAll(*data.unsafeBackwardSet).removeAll(*baseData->unsafeBackwardSet);
151        if(!unsafeBackwardSet.isEmpty()) {
152            indexesLength = CollationDataReader::IX_UNSAFE_BWD_OFFSET + 2;
153        }
154        if(data.fastLatinTable != baseData->fastLatinTable) {
155            fastLatinTableLength = data.fastLatinTableLength;
156            indexesLength = CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET + 2;
157        }
158    }
159
160    int32_t headerSize;
161    if(isBase) {
162        headerSize = 0;  // udata_create() writes the header
163    } else {
164        DataHeader header;
165        header.dataHeader.magic1 = 0xda;
166        header.dataHeader.magic2 = 0x27;
167        uprv_memcpy(&header.info, &dataInfo, sizeof(UDataInfo));
168        uprv_memcpy(header.info.dataVersion, dataVersion, sizeof(UVersionInfo));
169        headerSize = (int32_t)sizeof(header);
170        U_ASSERT((headerSize & 3) == 0);  // multiple of 4 bytes
171        if(hasMappings && data.cesLength != 0) {
172            // Sum of the sizes of the data items which are
173            // not automatically multiples of 8 bytes and which are placed before the CEs.
174            int32_t sum = headerSize + (indexesLength + settings.reorderCodesLength) * 4;
175            if((sum & 7) != 0) {
176                // We need to add padding somewhere so that the 64-bit CEs are 8-aligned.
177                // We add to the header size here.
178                // Alternatively, we could increment the indexesLength
179                // or add a few bytes to the reorderTable.
180                headerSize += 4;
181            }
182        }
183        header.dataHeader.headerSize = (uint16_t)headerSize;
184        if(headerSize <= capacity) {
185            uprv_memcpy(dest, &header, sizeof(header));
186            // Write 00 bytes so that the padding is not mistaken for a copyright string.
187            uprv_memset(dest + sizeof(header), 0, headerSize - (int32_t)sizeof(header));
188            dest += headerSize;
189            capacity -= headerSize;
190        } else {
191            dest = NULL;
192            capacity = 0;
193        }
194    }
195
196    indexes[CollationDataReader::IX_INDEXES_LENGTH] = indexesLength;
197    U_ASSERT((settings.options & ~0xffff) == 0);
198    indexes[CollationDataReader::IX_OPTIONS] =
199            data.numericPrimary | fastLatinVersion | settings.options;
200    indexes[CollationDataReader::IX_RESERVED2] = 0;
201    indexes[CollationDataReader::IX_RESERVED3] = 0;
202
203    // Byte offsets of data items all start from the start of the indexes.
204    // We add the headerSize at the very end.
205    int32_t totalSize = indexesLength * 4;
206
207    if(hasMappings && (isBase || data.jamoCE32s != baseData->jamoCE32s)) {
208        indexes[CollationDataReader::IX_JAMO_CE32S_START] = data.jamoCE32s - data.ce32s;
209    } else {
210        indexes[CollationDataReader::IX_JAMO_CE32S_START] = -1;
211    }
212
213    indexes[CollationDataReader::IX_REORDER_CODES_OFFSET] = totalSize;
214    totalSize += settings.reorderCodesLength * 4;
215
216    indexes[CollationDataReader::IX_REORDER_TABLE_OFFSET] = totalSize;
217    if(settings.reorderTable != NULL) {
218        totalSize += 256;
219    }
220
221    indexes[CollationDataReader::IX_TRIE_OFFSET] = totalSize;
222    if(hasMappings) {
223        UErrorCode errorCode2 = U_ZERO_ERROR;
224        int32_t length;
225        if(totalSize < capacity) {
226            length = utrie2_serialize(data.trie, dest + totalSize,
227                                      capacity - totalSize, &errorCode2);
228        } else {
229            length = utrie2_serialize(data.trie, NULL, 0, &errorCode2);
230        }
231        if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
232            errorCode = errorCode2;
233            return 0;
234        }
235        // The trie size should be a multiple of 8 bytes due to the way
236        // compactIndex2(UNewTrie2 *trie) currently works.
237        U_ASSERT((length & 7) == 0);
238        totalSize += length;
239    }
240
241    indexes[CollationDataReader::IX_RESERVED8_OFFSET] = totalSize;
242    indexes[CollationDataReader::IX_CES_OFFSET] = totalSize;
243    if(hasMappings && data.cesLength != 0) {
244        U_ASSERT(((headerSize + totalSize) & 7) == 0);
245        totalSize += data.cesLength * 8;
246    }
247
248    indexes[CollationDataReader::IX_RESERVED10_OFFSET] = totalSize;
249    indexes[CollationDataReader::IX_CE32S_OFFSET] = totalSize;
250    if(hasMappings) {
251        totalSize += data.ce32sLength * 4;
252    }
253
254    indexes[CollationDataReader::IX_ROOT_ELEMENTS_OFFSET] = totalSize;
255    totalSize += rootElementsLength * 4;
256
257    indexes[CollationDataReader::IX_CONTEXTS_OFFSET] = totalSize;
258    if(hasMappings) {
259        totalSize += data.contextsLength * 2;
260    }
261
262    indexes[CollationDataReader::IX_UNSAFE_BWD_OFFSET] = totalSize;
263    if(hasMappings && !unsafeBackwardSet.isEmpty()) {
264        UErrorCode errorCode2 = U_ZERO_ERROR;
265        int32_t length;
266        if(totalSize < capacity) {
267            uint16_t *p = reinterpret_cast<uint16_t *>(dest + totalSize);
268            length = unsafeBackwardSet.serialize(
269                    p, (capacity - totalSize) / 2, errorCode2);
270        } else {
271            length = unsafeBackwardSet.serialize(NULL, 0, errorCode2);
272        }
273        if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
274            errorCode = errorCode2;
275            return 0;
276        }
277        totalSize += length * 2;
278    }
279
280    indexes[CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET] = totalSize;
281    totalSize += fastLatinTableLength * 2;
282
283    indexes[CollationDataReader::IX_SCRIPTS_OFFSET] = totalSize;
284    if(isBase) {
285        totalSize += data.scriptsLength * 2;
286    }
287
288    indexes[CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET] = totalSize;
289    if(isBase) {
290        totalSize += 256;
291    }
292
293    indexes[CollationDataReader::IX_RESERVED18_OFFSET] = totalSize;
294    indexes[CollationDataReader::IX_TOTAL_SIZE] = totalSize;
295
296    if(totalSize > capacity) {
297        errorCode = U_BUFFER_OVERFLOW_ERROR;
298        return headerSize + totalSize;
299    }
300
301    uprv_memcpy(dest, indexes, indexesLength * 4);
302    copyData(indexes, CollationDataReader::IX_REORDER_CODES_OFFSET, settings.reorderCodes, dest);
303    copyData(indexes, CollationDataReader::IX_REORDER_TABLE_OFFSET, settings.reorderTable, dest);
304    // The trie has already been serialized into the dest buffer.
305    copyData(indexes, CollationDataReader::IX_CES_OFFSET, data.ces, dest);
306    copyData(indexes, CollationDataReader::IX_CE32S_OFFSET, data.ce32s, dest);
307    copyData(indexes, CollationDataReader::IX_ROOT_ELEMENTS_OFFSET, rootElements, dest);
308    copyData(indexes, CollationDataReader::IX_CONTEXTS_OFFSET, data.contexts, dest);
309    // The unsafeBackwardSet has already been serialized into the dest buffer.
310    copyData(indexes, CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET, data.fastLatinTable, dest);
311    copyData(indexes, CollationDataReader::IX_SCRIPTS_OFFSET, data.scripts, dest);
312    copyData(indexes, CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET, data.compressibleBytes, dest);
313
314    return headerSize + totalSize;
315}
316
317void
318CollationDataWriter::copyData(const int32_t indexes[], int32_t startIndex,
319                              const void *src, uint8_t *dest) {
320    int32_t start = indexes[startIndex];
321    int32_t limit = indexes[startIndex + 1];
322    if(start < limit) {
323        uprv_memcpy(dest + start, src, limit - start);
324    }
325}
326
327U_NAMESPACE_END
328
329#endif  // !UCONFIG_NO_COLLATION
330