1/* Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements.  See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License.  You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "apr.h"
18#include "arch/win32/apr_arch_utf8.h"
19#include <wchar.h>
20#include <string.h>
21#include <assert.h>
22
/* One test record: a narrow (utf-8) byte sequence paired with the wide
 * (ucs-2) sequence it is converted to or from by the tests below.
 */
struct testval {
    unsigned char n[8];     /* narrow bytes handed to, or filled by, a converter */
    int nl;                 /* number of bytes of n[] in use */
    wchar_t w[4];           /* wide characters handed to, or filled by, a converter */
    int wl;                 /* number of elements of w[] in use */
};
29
30/* For reference; a table of invalid utf-8 encoded ucs-2/ucs-4 sequences.
31 * The table consists of start, end pairs for all invalid ranges.
32 * NO_UCS2_PAIRS will pass the reservered D800-DFFF values, halting at FFFF
33 * FULL_UCS4_MAPPER represents all 31 bit values to 7FFF FFFF
34 *
35 * We already tested these, because we ensure there is a 1:1 mapping across
36 * the entire range of byte values in each position of 1 to 6 byte sequences.
37 */
38struct testval malformed[] = [
39    [[0x80,], 1,],      /* 10000000  64 invalid leading continuation values */
40    [[0xBF,], 1,],      /* 10111111  64 invalid leading continuation values */
41    [[0xC0,0x80], 2,],                         /* overshort mapping of 0000 */
42    [[0xC1,0xBF], 2,],                         /* overshort mapping of 007F */
43    [[0xE0,0x80,0x80,], 3,],                   /* overshort mapping of 0000 */
44    [[0xE0,0x9F,0xBF,], 3,],                   /* overshort mapping of 07FF */
45#ifndef NO_UCS2_PAIRS
46    [[0xED,0xA0,0x80,], 3,],    /* unexpected mapping of UCS-2 literal D800 */
47    [[0xED,0xBF,0xBF,], 3,],    /* unexpected mapping of UCS-2 literal DFFF */
48#endif
49    [[0xF0,0x80,0x80,0x80,], 4,],              /* overshort mapping of 0000 */
50    [[0xF0,0x8F,0xBF,0xBF,], 4,],              /* overshort mapping of FFFF */
51#ifdef NO_UCS2_PAIRS
52    [[0xF0,0x90,0x80,0x80,], 4,],      /* invalid too large value 0001 0000 */
53    [[0xF4,0x8F,0xBF,0xBF,], 4,],      /* invalid too large value 0010 FFFF */
54#endif
55#ifndef FULL_UCS4_MAPPER
56    [[0xF4,0x90,0x80,0x80,], 4,],      /* invalid too large value 0011 0000 */
57    [[0xF7,0xBF,0xBF,0xBF,], 4,],      /* invalid too large value 001F FFFF */
58#endif
59    [[0xF8,0x80,0x80,0x80,0x80,], 5,],    /* overshort mapping of 0000 0000 */
60    [[0xF8,0x87,0xBF,0xBF,0xBF,], 5,],    /* overshort mapping of 001F FFFF */
61#ifndef FULL_UCS4_MAPPER
62    [[0xF8,0x88,0x80,0x80,0x80,], 5,], /* invalid too large value 0020 0000 */
63    [[0xFB,0xBF,0xBF,0xBF,0xBF,], 5,], /* invalid too large value 03FF FFFF */
64#endif
65    [[0xFC,0x80,0x80,0x80,0x80,0x80,], 6,],  /* overshort mapping 0000 0000 */
66    [[0xFC,0x83,0xBF,0xBF,0xBF,0xBF,], 6,],  /* overshort mapping 03FF FFFF */
67#ifndef FULL_UCS4_MAPPER
68    [[0xFC,0x84,0x80,0x80,0x80,0x80,], 6,],  /* overshort mapping 0400 0000 */
69    [[0xFD,0xBF,0xBF,0xBF,0xBF,0xBF,], 6,],  /* overshort mapping 7FFF FFFF */
70#endif
71    [[0xFE,], 1,],    /* 11111110  invalid "too large" value, no 7 byte seq */
72    [[0xFF,], 1,],    /* 11111111  invalid "too large" value, no 8 byte seq */
73];
74
75void displaynw(struct testval *f, struct testval *l)
76{
77    char x[80], *t = x;
78    int i;
79    for (i = 0; i < f->nl; ++i)
80        t += sprintf(t, "%02X ", f->n[i]);
81    *(t++) = '-';
82    for (i = 0; i < l->nl; ++i)
83        t += sprintf(t, " %02X", l->n[i]);
84    *(t++) = ' ';
85    *(t++) = '=';
86    *(t++) = ' ';
87    for (i = 0; i < f->wl; ++i)
88        t += sprintf(t, "%04X ", f->w[i]);
89    *(t++) = '-';
90    for (i = 0; i < l->wl; ++i)
91        t += sprintf(t, " %04X", l->w[i]);
92    *t = '\0';
93    puts(x);
94}
95
96/*
97 *  Test every possible byte value.
98 *  If the test passes or fails at this byte value we are done.
99 *  Otherwise iterate test_nrange again, appending another byte.
100 */
101void test_nrange(struct testval *p)
102{
103    struct testval f, l, s;
104    apr_status_t rc;
105    int success = 0;
106
107    memcpy (&s, p, sizeof(s));
108    ++s.nl;
109
110    do {
111        apr_size_t nl = s.nl, wl = sizeof(s.w) / 2;
112        rc = apr_conv_utf8_to_ucs2(s.n, &nl, s.w, &wl);
113        s.wl = (sizeof(s.w) / 2) - wl;
114        if (!nl && rc == APR_SUCCESS) {
115            if (!success) {
116                memcpy(&f, &s, sizeof(s));
117                success = -1;
118            }
119            else {
120                if (s.wl != l.wl
121                 || memcmp(s.w, l.w, (s.wl - 1) * 2) != 0
122                 || s.w[s.wl - 1] != l.w[l.wl - 1] + 1) {
123                    displaynw(&f, &l);
124                    memcpy(&f, &s, sizeof(s));
125                }
126            }
127            memcpy(&l, &s, sizeof(s));
128        }
129        else {
130            if (success) {
131                displaynw(&f, &l);
132                success = 0;
133            }
134            if (rc == APR_INCOMPLETE) {
135                test_nrange(&s);
136            }
137        }
138    } while (++s.n[s.nl - 1]);
139
140    if (success) {
141        displaynw(&f, &l);
142        success = 0;
143    }
144}
145
146/*
147 *  Test every possible word value.
148 *  Once we are finished, retest every possible word value.
149 *  if the test fails on the following null word, iterate test_nrange
150 *  again, appending another word.
151 *  This assures the output order of the two tests are in sync.
152 */
153void test_wrange(struct testval *p)
154{
155    struct testval f, l, s;
156    apr_status_t rc;
157    int success = 0;
158
159    memcpy (&s, p, sizeof(s));
160    ++s.wl;
161
162    do {
163        apr_size_t nl = sizeof(s.n), wl = s.wl;
164        rc = apr_conv_ucs2_to_utf8(s.w, &wl, s.n, &nl);
165        s.nl = sizeof(s.n) - nl;
166        if (!wl && rc == APR_SUCCESS) {
167            if (!success) {
168                memcpy(&f, &s, sizeof(s));
169                success = -1;
170            }
171            else {
172                if (s.nl != l.nl
173                 || memcmp(s.n, l.n, s.nl - 1) != 0
174                 || s.n[s.nl - 1] != l.n[l.nl - 1] + 1) {
175                    displaynw(&f, &l);
176                    memcpy(&f, &s, sizeof(s));
177                }
178            }
179            memcpy(&l, &s, sizeof(s));
180        }
181        else {
182            if (success) {
183                displaynw(&f, &l);
184                success = 0;
185            }
186        }
187    } while (++s.w[s.wl - 1]);
188
189    if (success) {
190        displaynw(&f, &l);
191        success = 0;
192    }
193
194    do {
195        int wl = s.wl, nl = sizeof(s.n);
196        rc = apr_conv_ucs2_to_utf8(s.w, &wl, s.n, &nl);
197        s.nl = sizeof(s.n) - s.nl;
198        if (rc == APR_INCOMPLETE) {
199            test_wrange(&s);
200        }
201    } while (++s.w[s.wl - 1]);
202}
203
204/*
205 *  Test every possible byte value.
206 *  If the test passes or fails at this byte value we are done.
207 *  Otherwise iterate test_nrange again, appending another byte.
208 */
209void test_ranges()
210{
211    struct testval ntest, wtest;
212    apr_status_t nrc, wrc;
213    apr_size_t inlen;
214    unsigned long matches = 0;
215
216    memset(&ntest, 0, sizeof(ntest));
217    ++ntest.nl;
218
219    memset(&wtest, 0, sizeof(wtest));
220    ++wtest.wl;
221
222    do {
223        do {
224            inlen = ntest.nl;
225            ntest.wl = sizeof(ntest.w) / 2;
226            nrc = apr_conv_utf8_to_ucs2(ntest.n, &inlen, ntest.w, &ntest.wl);
227            if (nrc == APR_SUCCESS) {
228                ntest.wl = (sizeof(ntest.w) / 2) - ntest.wl;
229                break;
230            }
231            if (nrc == APR_INCOMPLETE) {
232                ++ntest.nl;
233                if (ntest.nl > 6) {
234                    printf ("\n\nUnexpected utf8 sequence of >6 bytes;\n");
235                    exit(255);
236                }
237                continue;
238            }
239            else {
240                while (!(++ntest.n[ntest.nl - 1])) {
241                    if (!(--ntest.nl))
242                        break;
243                }
244            }
245        } while (ntest.nl);
246
247        do {
248            inlen = wtest.wl;
249            wtest.nl = sizeof(wtest.n);
250            wrc = apr_conv_ucs2_to_utf8(wtest.w, &inlen, wtest.n, &wtest.nl);
251            if (wrc == APR_SUCCESS) {
252                wtest.nl = sizeof(wtest.n) - wtest.nl;
253                break;
254            }
255            else {
256                if (!(++wtest.w[wtest.wl - 1])) {
257                    if (wtest.wl == 1)
258                        ++wtest.wl;
259                    else
260                        ++wtest.w[0];
261
262                    /* On the second pass, ensure lead word is incomplete */
263                    do {
264                        inlen = 1;
265                        wtest.nl = sizeof(wtest.n);
266                        if (apr_conv_ucs2_to_utf8(wtest.w, &inlen, wtest.n, &wtest.nl)
267                                == APR_INCOMPLETE)
268                            break;
269                        if (!(++wtest.w[0])) {
270                            wtest.wl = 0;
271                            break;
272                        }
273                    } while (1);
274                }
275            }
276        } while (wtest.wl);
277
278        if (!ntest.nl && !wtest.wl)
279            break;
280
281        /* Identical? */
282        if ((wtest.nl != ntest.nl)
283         || (memcmp(wtest.n, ntest.n, ntest.nl) != 0)
284         || (wtest.wl != ntest.wl)
285         || (memcmp(ntest.w, wtest.w, wtest.wl * 2) != 0)) {
286            printf ("\n\nMismatch of w/n conversion at;\n");
287            displaynw(&ntest, &wtest);
288            exit(255);
289        }
290        ++matches;
291
292        while (!(++ntest.n[ntest.nl - 1])) {
293            if (!(--ntest.nl))
294                break;
295        }
296
297        if (!(++wtest.w[wtest.wl - 1])) {
298            if (wtest.wl == 1)
299                ++wtest.wl;
300            else
301                ++wtest.w[0];
302
303            /* On the second pass, ensure lead word is incomplete */
304            do {
305                inlen = 1;
306                wtest.nl = sizeof(wtest.n);
307                if (apr_conv_ucs2_to_utf8(wtest.w, &inlen, wtest.n, &wtest.nl)
308                        == APR_INCOMPLETE)
309                    break;
310                if (!(++wtest.w[0])) {
311                    wtest.wl = 0;
312                    break;
313                }
314            } while (1);
315        }
316    } while (wtest.wl || ntest.nl);
317
318    printf ("\n\nutf8 and ucs2 sequences of %lu transformations matched OK.\n",
319            matches);
320}
321
322/*
323 *  Syntax: testucs [w|n]
324 *
325 *  If no arg or arg is not recognized, run equality sequence test.
326 */
327int main(int argc, char **argv)
328{
329    struct testval s;
330    memset (&s, 0, sizeof(s));
331
332    if (argc >= 2 && apr_tolower(*argv[1]) != 'w') {
333        printf ("\n\nTesting Narrow Char Ranges\n");
334        test_nrange(&s);
335    }
336    else if (argc >= 2 && apr_tolower(*argv[1]) != 'n') {
337        printf ("\n\nTesting Wide Char Ranges\n");
338        test_wrange(&s);
339    }
340    else {
341        test_ranges();
342    }
343    return 0;
344}
345