1/* Test of canonical normalization of UTF-8 strings.
2   Copyright (C) 2009, 2010 Free Software Foundation, Inc.
3
4   This program is free software: you can redistribute it and/or modify
5   it under the terms of the GNU General Public License as published by
6   the Free Software Foundation; either version 3 of the License, or
7   (at your option) any later version.
8
9   This program is distributed in the hope that it will be useful,
10   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12   GNU General Public License for more details.
13
14   You should have received a copy of the GNU General Public License
15   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
16
17/* Written by Bruno Haible <bruno@clisp.org>, 2009.  */
18
19#include <config.h>
20
21#if GNULIB_TEST_UNINORM_U8_NORMALIZE
22
23#include "uninorm.h"
24
25#include <signal.h>
26#include <stdlib.h>
27#include <unistd.h>
28
29#include "unistr.h"
30#include "macros.h"
31
32static int
33check (const uint8_t *input, size_t input_length,
34       const uint8_t *expected, size_t expected_length)
35{
36  size_t length;
37  uint8_t *result;
38
39  /* Test return conventions with resultbuf == NULL.  */
40  result = u8_normalize (UNINORM_NFC, input, input_length, NULL, &length);
41  if (!(result != NULL))
42    return 1;
43  if (!(length == expected_length))
44    return 2;
45  if (!(u8_cmp (result, expected, expected_length) == 0))
46    return 3;
47  free (result);
48
49  /* Test return conventions with resultbuf too small.  */
50  if (expected_length > 0)
51    {
52      uint8_t *preallocated;
53
54      length = expected_length - 1;
55      preallocated = (uint8_t *) malloc (length * sizeof (uint8_t));
56      result = u8_normalize (UNINORM_NFC, input, input_length, preallocated, &length);
57      if (!(result != NULL))
58        return 4;
59      if (!(result != preallocated))
60        return 5;
61      if (!(length == expected_length))
62        return 6;
63      if (!(u8_cmp (result, expected, expected_length) == 0))
64        return 7;
65      free (result);
66      free (preallocated);
67    }
68
69  /* Test return conventions with resultbuf large enough.  */
70  {
71    uint8_t *preallocated;
72
73    length = expected_length;
74    preallocated = (uint8_t *) malloc (length * sizeof (uint8_t));
75    result = u8_normalize (UNINORM_NFC, input, input_length, preallocated, &length);
76    if (!(result != NULL))
77      return 8;
78    if (!(preallocated == NULL || result == preallocated))
79      return 9;
80    if (!(length == expected_length))
81      return 10;
82    if (!(u8_cmp (result, expected, expected_length) == 0))
83      return 11;
84    free (preallocated);
85  }
86
87  return 0;
88}
89
/* Store COUNT copies of the two-byte sequence { 0xCC, TRAIL } starting at P.
   Return the position just past the last byte written.  */
static uint8_t *
fill_cc_pairs (uint8_t *p, size_t count, uint8_t trail)
{
  size_t k;

  for (k = 0; k < count; k++)
    {
      p[0] = 0xCC;
      p[1] = trail;
      p += 2;
    }
  return p;
}

/* Run check() on a series of fixed UTF-8 samples and on three large
   generated inputs.  Each sample passes only if check() returns 0; the
   verdict is enforced through ASSERT (presumably provided by "macros.h",
   as is SIZEOF).  */
void
test_u8_nfc (void)
{
  { /* Empty string.  */
    ASSERT (check (NULL, 0, NULL, 0) == 0);
  }
  { /* SPACE */
    static const uint8_t space[] = { 0x20 };
    ASSERT (check (space, SIZEOF (space), space, SIZEOF (space)) == 0);
  }

  { /* LATIN CAPITAL LETTER A WITH DIAERESIS */
    static const uint8_t composed[]   = { 0xC3, 0x84 };
    static const uint8_t decomposed[] = { 0x41, 0xCC, 0x88 };
    ASSERT (check (composed, SIZEOF (composed),
                   composed, SIZEOF (composed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed),
                   composed, SIZEOF (composed)) == 0);
  }

  { /* LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON */
    static const uint8_t composed[]   = { 0xC7, 0x9E };
    static const uint8_t decomposed[] = { 0x41, 0xCC, 0x88, 0xCC, 0x84 };
    ASSERT (check (composed, SIZEOF (composed),
                   composed, SIZEOF (composed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed),
                   composed, SIZEOF (composed)) == 0);
  }

  { /* ANGSTROM SIGN */
    static const uint8_t angstrom[]   = { 0xE2, 0x84, 0xAB };
    static const uint8_t decomposed[] = { 0x41, 0xCC, 0x8A };
    static const uint8_t normalized[] = { 0xC3, 0x85 };
    ASSERT (check (angstrom, SIZEOF (angstrom),
                   normalized, SIZEOF (normalized)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed),
                   normalized, SIZEOF (normalized)) == 0);
    ASSERT (check (normalized, SIZEOF (normalized),
                   normalized, SIZEOF (normalized)) == 0);
  }

  { /* GREEK DIALYTIKA AND PERISPOMENI */
    static const uint8_t dialytika[] = { 0xE1, 0xBF, 0x81 };
    ASSERT (check (dialytika, SIZEOF (dialytika),
                   dialytika, SIZEOF (dialytika)) == 0);
  }

  { /* SCRIPT SMALL L */
    static const uint8_t script_l[] = { 0xE2, 0x84, 0x93 };
    ASSERT (check (script_l, SIZEOF (script_l),
                   script_l, SIZEOF (script_l)) == 0);
  }

  { /* NO-BREAK SPACE */
    static const uint8_t nbsp[] = { 0xC2, 0xA0 };
    ASSERT (check (nbsp, SIZEOF (nbsp), nbsp, SIZEOF (nbsp)) == 0);
  }

  { /* ARABIC LETTER VEH INITIAL FORM */
    static const uint8_t veh_initial[] = { 0xEF, 0xAD, 0xAC };
    ASSERT (check (veh_initial, SIZEOF (veh_initial),
                   veh_initial, SIZEOF (veh_initial)) == 0);
  }

  { /* ARABIC LETTER VEH MEDIAL FORM */
    static const uint8_t veh_medial[] = { 0xEF, 0xAD, 0xAD };
    ASSERT (check (veh_medial, SIZEOF (veh_medial),
                   veh_medial, SIZEOF (veh_medial)) == 0);
  }

  { /* ARABIC LETTER VEH FINAL FORM */
    static const uint8_t veh_final[] = { 0xEF, 0xAD, 0xAB };
    ASSERT (check (veh_final, SIZEOF (veh_final),
                   veh_final, SIZEOF (veh_final)) == 0);
  }

  { /* ARABIC LETTER VEH ISOLATED FORM */
    static const uint8_t veh_isolated[] = { 0xEF, 0xAD, 0xAA };
    ASSERT (check (veh_isolated, SIZEOF (veh_isolated),
                   veh_isolated, SIZEOF (veh_isolated)) == 0);
  }

  { /* CIRCLED NUMBER FIFTEEN */
    static const uint8_t circled_15[] = { 0xE2, 0x91, 0xAE };
    ASSERT (check (circled_15, SIZEOF (circled_15),
                   circled_15, SIZEOF (circled_15)) == 0);
  }

  { /* TRADE MARK SIGN */
    static const uint8_t trade_mark[] = { 0xE2, 0x84, 0xA2 };
    ASSERT (check (trade_mark, SIZEOF (trade_mark),
                   trade_mark, SIZEOF (trade_mark)) == 0);
  }

  { /* LATIN SUBSCRIPT SMALL LETTER I */
    static const uint8_t subscript_i[] = { 0xE1, 0xB5, 0xA2 };
    ASSERT (check (subscript_i, SIZEOF (subscript_i),
                   subscript_i, SIZEOF (subscript_i)) == 0);
  }

  { /* PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS */
    static const uint8_t vertical_paren[] = { 0xEF, 0xB8, 0xB5 };
    ASSERT (check (vertical_paren, SIZEOF (vertical_paren),
                   vertical_paren, SIZEOF (vertical_paren)) == 0);
  }

  { /* FULLWIDTH LATIN CAPITAL LETTER A */
    static const uint8_t fullwidth_a[] = { 0xEF, 0xBC, 0xA1 };
    ASSERT (check (fullwidth_a, SIZEOF (fullwidth_a),
                   fullwidth_a, SIZEOF (fullwidth_a)) == 0);
  }

  { /* HALFWIDTH IDEOGRAPHIC COMMA */
    static const uint8_t halfwidth_comma[] = { 0xEF, 0xBD, 0xA4 };
    ASSERT (check (halfwidth_comma, SIZEOF (halfwidth_comma),
                   halfwidth_comma, SIZEOF (halfwidth_comma)) == 0);
  }

  { /* SMALL IDEOGRAPHIC COMMA */
    static const uint8_t small_comma[] = { 0xEF, 0xB9, 0x91 };
    ASSERT (check (small_comma, SIZEOF (small_comma),
                   small_comma, SIZEOF (small_comma)) == 0);
  }

  { /* SQUARE MHZ */
    static const uint8_t square_mhz[] = { 0xE3, 0x8E, 0x92 };
    ASSERT (check (square_mhz, SIZEOF (square_mhz),
                   square_mhz, SIZEOF (square_mhz)) == 0);
  }

  { /* VULGAR FRACTION THREE EIGHTHS */
    static const uint8_t three_eighths[] = { 0xE2, 0x85, 0x9C };
    ASSERT (check (three_eighths, SIZEOF (three_eighths),
                   three_eighths, SIZEOF (three_eighths)) == 0);
  }

  { /* MICRO SIGN */
    static const uint8_t micro[] = { 0xC2, 0xB5 };
    ASSERT (check (micro, SIZEOF (micro), micro, SIZEOF (micro)) == 0);
  }

  { /* ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM */
    static const uint8_t ligature[] = { 0xEF, 0xB7, 0xBA };
    ASSERT (check (ligature, SIZEOF (ligature),
                   ligature, SIZEOF (ligature)) == 0);
  }

  { /* HANGUL SYLLABLE GEUL */
    static const uint8_t syllable[] = { 0xEA, 0xB8, 0x80 };
    static const uint8_t jamos[] =
      { 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3, 0xE1, 0x86, 0xAF };
    ASSERT (check (syllable, SIZEOF (syllable),
                   syllable, SIZEOF (syllable)) == 0);
    ASSERT (check (jamos, SIZEOF (jamos),
                   syllable, SIZEOF (syllable)) == 0);
  }

  { /* HANGUL SYLLABLE GEU */
    static const uint8_t syllable[] = { 0xEA, 0xB7, 0xB8 };
    static const uint8_t jamos[] = { 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3 };
    ASSERT (check (syllable, SIZEOF (syllable),
                   syllable, SIZEOF (syllable)) == 0);
    ASSERT (check (jamos, SIZEOF (jamos),
                   syllable, SIZEOF (syllable)) == 0);
  }

  { /* "Grüß Gott. Здравствуйте! x=(-b±sqrt(b²-4ac))/(2a)  日本語,中文,한글" */
    static const uint8_t text[] =
      { 'G', 'r', 0xC3, 0xBC, 0xC3, 0x9F, ' ', 'G', 'o', 't', 't', '.',
        ' ', 0xD0, 0x97, 0xD0, 0xB4, 0xD1, 0x80, 0xD0, 0xB0, 0xD0, 0xB2, 0xD1,
        0x81, 0xD1, 0x82, 0xD0, 0xB2, 0xD1, 0x83, 0xD0, 0xB9,
        0xD1, 0x82, 0xD0, 0xB5, '!', ' ', 'x', '=', '(', '-', 'b', 0xC2, 0xB1,
        's', 'q', 'r', 't', '(', 'b', 0xC2, 0xB2, '-', '4', 'a', 'c', ')', ')',
        '/', '(', '2', 'a', ')', ' ', ' ', 0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC,
        0xE8, 0xAA, 0x9E, ',', 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, ',',
        0xED, 0x95, 0x9C,
        0xEA, 0xB8, 0x80, '\n'
      };
    static const uint8_t text_decomposed[] =
      { 'G', 'r', 0x75, 0xCC, 0x88, 0xC3, 0x9F, ' ', 'G', 'o', 't', 't', '.',
        ' ', 0xD0, 0x97, 0xD0, 0xB4, 0xD1, 0x80, 0xD0, 0xB0, 0xD0, 0xB2, 0xD1,
        0x81, 0xD1, 0x82, 0xD0, 0xB2, 0xD1, 0x83, 0xD0, 0xB8, 0xCC, 0x86,
        0xD1, 0x82, 0xD0, 0xB5, '!', ' ', 'x', '=', '(', '-', 'b', 0xC2, 0xB1,
        's', 'q', 'r', 't', '(', 'b', 0xC2, 0xB2, '-', '4', 'a', 'c', ')', ')',
        '/', '(', '2', 'a', ')', ' ', ' ', 0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC,
        0xE8, 0xAA, 0x9E, ',', 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, ',',
        0xE1, 0x84, 0x92, 0xE1, 0x85, 0xA1, 0xE1, 0x86, 0xAB,
        0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3, 0xE1, 0x86, 0xAF, '\n'
      };
    ASSERT (check (text, SIZEOF (text), text, SIZEOF (text)) == 0);
    ASSERT (check (text_decomposed, SIZEOF (text_decomposed),
                   text, SIZEOF (text)) == 0);
  }

#if HAVE_DECL_ALARM
  /* Guard against a runaway test: restore the default SIGALRM disposition
     and arm a 50 second alarm, so that an overly slow run is terminated
     and thereby counted as a failure.  */
  signal (SIGALRM, SIG_DFL);
  alarm (50);
#endif

  /* Check that the sorting is not O(n²) but O(n log n).
     Each pass builds 'A' followed by m - 1 two-byte marks, namely
     m1 copies of { 0xCC, 0x99 } and m2 copies of { 0xCC, 0x80 }, arranged
     differently per pass.  The expected output is the same for all passes:
     { 0xC3, 0x80 }, then the m1 { 0xCC, 0x99 } marks, then the remaining
     m2 - 1 { 0xCC, 0x80 } marks.  */
  {
    const size_t m = 100000;
    const size_t input_len = 2 * m - 1;
    const size_t expected_len = 2 * m - 2;
    const size_t m1 = m / 2;
    const size_t m2 = (m - 1) / 2;
    /* NB: m1 + m2 == m - 1.  */
    int pass;

    for (pass = 0; pass < 3; pass++)
      {
        /* One allocation holds the input in its first half and the
           expected output in its second half.  */
        uint8_t *input = (uint8_t *) malloc (2 * input_len * sizeof (uint8_t));
        uint8_t *expected;
        uint8_t *p;
        size_t repeat;
        size_t i;

        /* Without memory this pass is silently skipped.  */
        if (input == NULL)
          continue;

        expected = input + input_len;

        input[0] = 0x41;
        p = input + 1;
        switch (pass)
          {
          case 0:
            /* All 0x99 marks first, then all 0x80 marks.  */
            p = fill_cc_pairs (p, m1, 0x99);
            p = fill_cc_pairs (p, m2, 0x80);
            break;

          case 1:
            /* All 0x80 marks first, then all 0x99 marks.  */
            p = fill_cc_pairs (p, m2, 0x80);
            p = fill_cc_pairs (p, m1, 0x99);
            break;

          case 2:
            /* Alternate the two marks, then append the 0x99 marks that
               are left over.  */
            i = 0;
            while (i < m2)
              {
                p = fill_cc_pairs (p, 1, 0x99);
                p = fill_cc_pairs (p, 1, 0x80);
                i++;
              }
            while (i < m1)
              {
                p = fill_cc_pairs (p, 1, 0x99);
                i++;
              }
            break;

          default:
            abort ();
          }

        expected[0] = 0xC3;
        expected[1] = 0x80;
        p = fill_cc_pairs (expected + 2, m1, 0x99);
        p = fill_cc_pairs (p, m2 - 1, 0x80);

        for (repeat = 1; repeat > 0; repeat--)
          {
            ASSERT (check (input, input_len, expected, expected_len) == 0);
            ASSERT (check (expected, expected_len,
                           expected, expected_len) == 0);
          }

        free (input);
      }
  }
}
354
355#else
356
/* Fallback used when GNULIB_TEST_UNINORM_U8_NORMALIZE is false:
   there is nothing to test, so this does nothing.  */
void
test_u8_nfc (void)
{
  return;
}
361
362#endif
363