1/* Test of compatibility normalization of UTF-8 strings.
2   Copyright (C) 2009, 2010 Free Software Foundation, Inc.
3
4   This program is free software: you can redistribute it and/or modify
5   it under the terms of the GNU General Public License as published by
6   the Free Software Foundation; either version 3 of the License, or
7   (at your option) any later version.
8
9   This program is distributed in the hope that it will be useful,
10   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12   GNU General Public License for more details.
13
14   You should have received a copy of the GNU General Public License
15   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
16
17/* Written by Bruno Haible <bruno@clisp.org>, 2009.  */
18
19#include <config.h>
20
21#if GNULIB_TEST_UNINORM_U8_NORMALIZE
22
23#include "uninorm.h"
24
25#include <signal.h>
26#include <stdlib.h>
27#include <unistd.h>
28
29#include "unistr.h"
30#include "macros.h"
31
32static int
33check (const uint8_t *input, size_t input_length,
34       const uint8_t *expected, size_t expected_length)
35{
36  size_t length;
37  uint8_t *result;
38
39  /* Test return conventions with resultbuf == NULL.  */
40  result = u8_normalize (UNINORM_NFKC, input, input_length, NULL, &length);
41  if (!(result != NULL))
42    return 1;
43  if (!(length == expected_length))
44    return 2;
45  if (!(u8_cmp (result, expected, expected_length) == 0))
46    return 3;
47  free (result);
48
49  /* Test return conventions with resultbuf too small.  */
50  if (expected_length > 0)
51    {
52      uint8_t *preallocated;
53
54      length = expected_length - 1;
55      preallocated = (uint8_t *) malloc (length * sizeof (uint8_t));
56      result = u8_normalize (UNINORM_NFKC, input, input_length, preallocated, &length);
57      if (!(result != NULL))
58        return 4;
59      if (!(result != preallocated))
60        return 5;
61      if (!(length == expected_length))
62        return 6;
63      if (!(u8_cmp (result, expected, expected_length) == 0))
64        return 7;
65      free (result);
66      free (preallocated);
67    }
68
69  /* Test return conventions with resultbuf large enough.  */
70  {
71    uint8_t *preallocated;
72
73    length = expected_length;
74    preallocated = (uint8_t *) malloc (length * sizeof (uint8_t));
75    result = u8_normalize (UNINORM_NFKC, input, input_length, preallocated, &length);
76    if (!(result != NULL))
77      return 8;
78    if (!(preallocated == NULL || result == preallocated))
79      return 9;
80    if (!(length == expected_length))
81      return 10;
82    if (!(u8_cmp (result, expected, expected_length) == 0))
83      return 11;
84    free (preallocated);
85  }
86
87  return 0;
88}
89
/* Check that the sorting of combining marks is not O(n^2) but O(n log n).

   Each of the three passes builds an input of 2*m-1 bytes: the letter 0x41
   followed by m-1 two-byte sequences, m1 of them 0xCC 0x99 and m2 of them
   0xCC 0x80, arranged as
     pass 0: all 0xCC 0x99 first, then all 0xCC 0x80,
     pass 1: all 0xCC 0x80 first, then all 0xCC 0x99,
     pass 2: the two kinds alternating.
   The expected result (2*m-2 bytes) is the same for every pass: 0xC3 0x80,
   then the m1 sequences 0xCC 0x99, then the remaining m2-1 sequences
   0xCC 0x80.

   Input and expected output share a single allocation.  If that allocation
   fails, the pass is skipped without reporting an error.  */
static void
test_u8_nfkc_sorting_speed (void)
{
  int pass;

  for (pass = 0; pass < 3; pass++)
    {
      size_t repeat = 1;
      size_t m = 100000;
      uint8_t *input = (uint8_t *) malloc (2 * (2 * m - 1) * sizeof (uint8_t));
      if (input != NULL)
        {
          /* The second half of the allocation holds the expected output.  */
          uint8_t *expected = input + (2 * m - 1);
          size_t m1 = m / 2;
          size_t m2 = (m - 1) / 2;
          /* NB: m1 + m2 == m - 1.  */
          uint8_t *p;
          size_t i;

          input[0] = 0x41;
          p = input + 1;
          switch (pass)
            {
            case 0:
              for (i = 0; i < m1; i++)
                {
                  *p++ = 0xCC;
                  *p++ = 0x99;
                }
              for (i = 0; i < m2; i++)
                {
                  *p++ = 0xCC;
                  *p++ = 0x80;
                }
              break;

            case 1:
              for (i = 0; i < m2; i++)
                {
                  *p++ = 0xCC;
                  *p++ = 0x80;
                }
              for (i = 0; i < m1; i++)
                {
                  *p++ = 0xCC;
                  *p++ = 0x99;
                }
              break;

            case 2:
              for (i = 0; i < m2; i++)
                {
                  *p++ = 0xCC;
                  *p++ = 0x99;
                  *p++ = 0xCC;
                  *p++ = 0x80;
                }
              /* m1 may exceed m2 by one; emit the leftover 0xCC 0x99.  */
              for (; i < m1; i++)
                {
                  *p++ = 0xCC;
                  *p++ = 0x99;
                }
              break;

            default:
              abort ();
            }

          /* The base letter and one 0xCC 0x80 are expected to merge into
             0xC3 0x80, hence only m2 - 1 of those sequences remain.  */
          expected[0] = 0xC3;
          expected[1] = 0x80;
          p = expected + 2;
          for (i = 0; i < m1; i++)
            {
              *p++ = 0xCC;
              *p++ = 0x99;
            }
          for (i = 0; i < m2 - 1; i++)
            {
              *p++ = 0xCC;
              *p++ = 0x80;
            }

          for (; repeat > 0; repeat--)
            {
              ASSERT (check (input, 2 * m - 1,    expected, 2 * m - 2) == 0);
              ASSERT (check (expected, 2 * m - 2, expected, 2 * m - 2) == 0);
            }

          free (input);
        }
    }
}

/* Run the NFKC tests on UTF-8 strings.

   First a series of fixed cases is passed through check(): each case
   normalizes a string (and usually a second, differently composed spelling
   of it) and compares the result byte for byte with the expected form.
   Then, under a time limit where alarm() is declared, a large input made of
   many combining marks is normalized.  Every check is wrapped in ASSERT
   (from macros.h).  */
void
test_u8_nfkc (void)
{
  { /* Empty string.  */
    ASSERT (check (NULL, 0, NULL, 0) == 0);
  }
  { /* SPACE */
    static const uint8_t input[]    = { 0x20 };
    ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
  }

  { /* LATIN CAPITAL LETTER A WITH DIAERESIS */
    static const uint8_t input[]      = { 0xC3, 0x84 };
    static const uint8_t decomposed[] = { 0x41, 0xCC, 0x88 };
    ASSERT (check (input, SIZEOF (input),           input, SIZEOF (input)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0);
  }

  { /* LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON */
    static const uint8_t input[]      = { 0xC7, 0x9E };
    static const uint8_t decomposed[] = { 0x41, 0xCC, 0x88, 0xCC, 0x84 };
    ASSERT (check (input, SIZEOF (input),           input, SIZEOF (input)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0);
  }

  { /* ANGSTROM SIGN */
    static const uint8_t input[]      = { 0xE2, 0x84, 0xAB };
    static const uint8_t decomposed[] = { 0x41, 0xCC, 0x8A };
    static const uint8_t expected[]   = { 0xC3, 0x85 };
    ASSERT (check (input, SIZEOF (input),           expected, SIZEOF (expected)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), expected, SIZEOF (expected)) == 0);
    ASSERT (check (expected, SIZEOF (expected),     expected, SIZEOF (expected)) == 0);
  }

  { /* GREEK DIALYTIKA AND PERISPOMENI */
    static const uint8_t input[]      = { 0xE1, 0xBF, 0x81 };
    static const uint8_t decomposed[] = { 0x20, 0xCC, 0x88, 0xCD, 0x82 };
    ASSERT (check (input, SIZEOF (input),           decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* SCRIPT SMALL L */
    static const uint8_t input[]      = { 0xE2, 0x84, 0x93 };
    static const uint8_t decomposed[] = { 0x6C };
    ASSERT (check (input, SIZEOF (input),           decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* NO-BREAK SPACE */
    static const uint8_t input[]      = { 0xC2, 0xA0 };
    static const uint8_t decomposed[] = { 0x20 };
    ASSERT (check (input, SIZEOF (input),           decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* ARABIC LETTER VEH INITIAL FORM */
    static const uint8_t input[]      = { 0xEF, 0xAD, 0xAC };
    static const uint8_t decomposed[] = { 0xDA, 0xA4 };
    ASSERT (check (input, SIZEOF (input),           decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* ARABIC LETTER VEH MEDIAL FORM */
    static const uint8_t input[]      = { 0xEF, 0xAD, 0xAD };
    static const uint8_t decomposed[] = { 0xDA, 0xA4 };
    ASSERT (check (input, SIZEOF (input),           decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* ARABIC LETTER VEH FINAL FORM */
    static const uint8_t input[]      = { 0xEF, 0xAD, 0xAB };
    static const uint8_t decomposed[] = { 0xDA, 0xA4 };
    ASSERT (check (input, SIZEOF (input),           decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* ARABIC LETTER VEH ISOLATED FORM */
    static const uint8_t input[]      = { 0xEF, 0xAD, 0xAA };
    static const uint8_t decomposed[] = { 0xDA, 0xA4 };
    ASSERT (check (input, SIZEOF (input),           decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* CIRCLED NUMBER FIFTEEN */
    static const uint8_t input[]      = { 0xE2, 0x91, 0xAE };
    static const uint8_t decomposed[] = { 0x31, 0x35 };
    ASSERT (check (input, SIZEOF (input),           decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* TRADE MARK SIGN */
    static const uint8_t input[]      = { 0xE2, 0x84, 0xA2 };
    static const uint8_t decomposed[] = { 0x54, 0x4D };
    ASSERT (check (input, SIZEOF (input),           decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* LATIN SUBSCRIPT SMALL LETTER I */
    static const uint8_t input[]      = { 0xE1, 0xB5, 0xA2 };
    static const uint8_t decomposed[] = { 0x69 };
    ASSERT (check (input, SIZEOF (input),           decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS */
    static const uint8_t input[]      = { 0xEF, 0xB8, 0xB5 };
    static const uint8_t decomposed[] = { 0x28 };
    ASSERT (check (input, SIZEOF (input),           decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* FULLWIDTH LATIN CAPITAL LETTER A */
    static const uint8_t input[]      = { 0xEF, 0xBC, 0xA1 };
    static const uint8_t decomposed[] = { 0x41 };
    ASSERT (check (input, SIZEOF (input),           decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* HALFWIDTH IDEOGRAPHIC COMMA */
    static const uint8_t input[]      = { 0xEF, 0xBD, 0xA4 };
    static const uint8_t decomposed[] = { 0xE3, 0x80, 0x81 };
    ASSERT (check (input, SIZEOF (input),           decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* SMALL IDEOGRAPHIC COMMA */
    static const uint8_t input[]      = { 0xEF, 0xB9, 0x91 };
    static const uint8_t decomposed[] = { 0xE3, 0x80, 0x81 };
    ASSERT (check (input, SIZEOF (input),           decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* SQUARE MHZ */
    static const uint8_t input[]      = { 0xE3, 0x8E, 0x92 };
    static const uint8_t decomposed[] = { 0x4D, 0x48, 0x7A };
    ASSERT (check (input, SIZEOF (input),           decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* VULGAR FRACTION THREE EIGHTHS */
    static const uint8_t input[]      = { 0xE2, 0x85, 0x9C };
    static const uint8_t decomposed[] = { 0x33, 0xE2, 0x81, 0x84, 0x38 };
    ASSERT (check (input, SIZEOF (input),           decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* MICRO SIGN */
    static const uint8_t input[]      = { 0xC2, 0xB5 };
    static const uint8_t decomposed[] = { 0xCE, 0xBC };
    ASSERT (check (input, SIZEOF (input),           decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM */
    static const uint8_t input[]      = { 0xEF, 0xB7, 0xBA };
    static const uint8_t decomposed[] =
      { 0xD8, 0xB5, 0xD9, 0x84, 0xD9, 0x89, 0x20, 0xD8, 0xA7, 0xD9, 0x84, 0xD9,
        0x84, 0xD9, 0x87, 0x20, 0xD8, 0xB9, 0xD9, 0x84, 0xD9, 0x8A, 0xD9, 0x87,
        0x20, 0xD9, 0x88, 0xD8, 0xB3, 0xD9, 0x84, 0xD9, 0x85
      };
    ASSERT (check (input, SIZEOF (input),           decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* HANGUL SYLLABLE GEUL */
    static const uint8_t input[]      = { 0xEA, 0xB8, 0x80 };
    static const uint8_t decomposed[] =
      { 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3, 0xE1, 0x86, 0xAF };
    ASSERT (check (input, SIZEOF (input),           input, SIZEOF (input)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0);
  }

  { /* HANGUL SYLLABLE GEU */
    static const uint8_t input[]      = { 0xEA, 0xB7, 0xB8 };
    static const uint8_t decomposed[] = { 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3 };
    ASSERT (check (input, SIZEOF (input),           input, SIZEOF (input)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0);
  }

  { /* Mixed-script sample (Latin, Cyrillic, mathematical, CJK, Hangul):
       "Grüß Gott. Здравствуйте! x=(-b±sqrt(b²-4ac))/(2a)  日本語,中文,한글" */
    static const uint8_t input[] =
      { 'G', 'r', 0xC3, 0xBC, 0xC3, 0x9F, ' ', 'G', 'o', 't', 't', '.',
        ' ', 0xD0, 0x97, 0xD0, 0xB4, 0xD1, 0x80, 0xD0, 0xB0, 0xD0, 0xB2, 0xD1,
        0x81, 0xD1, 0x82, 0xD0, 0xB2, 0xD1, 0x83, 0xD0, 0xB9,
        0xD1, 0x82, 0xD0, 0xB5, '!', ' ', 'x', '=', '(', '-', 'b', 0xC2, 0xB1,
        's', 'q', 'r', 't', '(', 'b', 0xC2, 0xB2, '-', '4', 'a', 'c', ')', ')',
        '/', '(', '2', 'a', ')', ' ', ' ', 0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC,
        0xE8, 0xAA, 0x9E, ',', 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, ',',
        0xED, 0x95, 0x9C,
        0xEA, 0xB8, 0x80, '\n'
      };
    static const uint8_t decomposed[] =
      { 'G', 'r', 0x75, 0xCC, 0x88, 0xC3, 0x9F, ' ', 'G', 'o', 't', 't', '.',
        ' ', 0xD0, 0x97, 0xD0, 0xB4, 0xD1, 0x80, 0xD0, 0xB0, 0xD0, 0xB2, 0xD1,
        0x81, 0xD1, 0x82, 0xD0, 0xB2, 0xD1, 0x83, 0xD0, 0xB8, 0xCC, 0x86,
        0xD1, 0x82, 0xD0, 0xB5, '!', ' ', 'x', '=', '(', '-', 'b', 0xC2, 0xB1,
        's', 'q', 'r', 't', '(', 'b', 0x32, '-', '4', 'a', 'c', ')', ')',
        '/', '(', '2', 'a', ')', ' ', ' ', 0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC,
        0xE8, 0xAA, 0x9E, ',', 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, ',',
        0xE1, 0x84, 0x92, 0xE1, 0x85, 0xA1, 0xE1, 0x86, 0xAB,
        0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3, 0xE1, 0x86, 0xAF, '\n'
      };
    static const uint8_t expected[] =
      { 'G', 'r', 0xC3, 0xBC, 0xC3, 0x9F, ' ', 'G', 'o', 't', 't', '.',
        ' ', 0xD0, 0x97, 0xD0, 0xB4, 0xD1, 0x80, 0xD0, 0xB0, 0xD0, 0xB2, 0xD1,
        0x81, 0xD1, 0x82, 0xD0, 0xB2, 0xD1, 0x83, 0xD0, 0xB9,
        0xD1, 0x82, 0xD0, 0xB5, '!', ' ', 'x', '=', '(', '-', 'b', 0xC2, 0xB1,
        's', 'q', 'r', 't', '(', 'b', 0x32, '-', '4', 'a', 'c', ')', ')',
        '/', '(', '2', 'a', ')', ' ', ' ', 0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC,
        0xE8, 0xAA, 0x9E, ',', 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, ',',
        0xED, 0x95, 0x9C,
        0xEA, 0xB8, 0x80, '\n'
      };
    ASSERT (check (input, SIZEOF (input),           expected, SIZEOF (expected)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), expected, SIZEOF (expected)) == 0);
    ASSERT (check (expected, SIZEOF (expected),     expected, SIZEOF (expected)) == 0);
  }

#if HAVE_DECL_ALARM
  /* Declare failure if test takes too long, by using default abort
     caused by SIGALRM.  */
  signal (SIGALRM, SIG_DFL);
  alarm (50);
#endif

  /* Check that the sorting is not O(n^2) but O(n log n).  */
  test_u8_nfkc_sorting_speed ();
}
406
407#else
408
/* Fallback used when GNULIB_TEST_UNINORM_U8_NORMALIZE is false: the
   u8_normalize checks are compiled out, so there is nothing to run.  The
   function is still defined, presumably so that its callers keep linking.  */
void
test_u8_nfkc (void)
{
}
413
414#endif
415