1/* Normalization forms (composition and decomposition) of Unicode strings.
2   Copyright (C) 2001-2002, 2009-2010 Free Software Foundation, Inc.
3   Written by Bruno Haible <bruno@clisp.org>, 2009.
4
5   This program is free software: you can redistribute it and/or modify it
6   under the terms of the GNU Lesser General Public License as published
7   by the Free Software Foundation; either version 3 of the License, or
8   (at your option) any later version.
9
10   This program is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public License
16   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
17
18#ifndef _UNINORM_H
19#define _UNINORM_H
20
21/* Get LIBUNISTRING_DLL_VARIABLE.  */
22#include <unistring/woe32dll.h>
23
24/* Get size_t.  */
25#include <stddef.h>
26
27#include "unitypes.h"
28
29
30#ifdef __cplusplus
31extern "C" {
32#endif
33
34
35/* Conventions:
36
   All functions prefixed with u8_ operate on UTF-8 encoded strings.
   Their unit is a uint8_t (1 byte).

   All functions prefixed with u16_ operate on UTF-16 encoded strings.
   Their unit is a uint16_t (a 2-byte word).

   All functions prefixed with u32_ operate on UCS-4 encoded strings.
   Their unit is a uint32_t (a 4-byte word).
45
46   All argument pairs (s, n) denote a Unicode string s[0..n-1] with exactly
47   n units.
48
49   Functions returning a string result take a (resultbuf, lengthp) argument
50   pair.  If resultbuf is not NULL and the result fits into *lengthp units,
51   it is put in resultbuf, and resultbuf is returned.  Otherwise, a freshly
52   allocated string is returned.  In both cases, *lengthp is set to the
53   length (number of units) of the returned string.  In case of error,
54   NULL is returned and errno is set.  */
55
56
/* Tags describing the kind of a character decomposition.
   Where a bracketed name is given, it is the tag name associated with the
   enumerator.  The values are consecutive, starting at 0.  */
enum
{
  /* Canonical decomposition.  */
  UC_DECOMP_CANONICAL = 0,
  /* <font>: A font variant (e.g. a blackletter form).  */
  UC_DECOMP_FONT,
  /* <noBreak>: A no-break version of a space or hyphen.  */
  UC_DECOMP_NOBREAK,
  /* <initial>: An initial presentation form (Arabic).  */
  UC_DECOMP_INITIAL,
  /* <medial>: A medial presentation form (Arabic).  */
  UC_DECOMP_MEDIAL,
  /* <final>: A final presentation form (Arabic).  */
  UC_DECOMP_FINAL,
  /* <isolated>: An isolated presentation form (Arabic).  */
  UC_DECOMP_ISOLATED,
  /* <circle>: An encircled form.  */
  UC_DECOMP_CIRCLE,
  /* <super>: A superscript form.  */
  UC_DECOMP_SUPER,
  /* <sub>: A subscript form.  */
  UC_DECOMP_SUB,
  /* <vertical>: A vertical layout presentation form.  */
  UC_DECOMP_VERTICAL,
  /* <wide>: A wide (or zenkaku) compatibility character.  */
  UC_DECOMP_WIDE,
  /* <narrow>: A narrow (or hankaku) compatibility character.  */
  UC_DECOMP_NARROW,
  /* <small>: A small variant form (CNS compatibility).  */
  UC_DECOMP_SMALL,
  /* <square>: A CJK squared font variant.  */
  UC_DECOMP_SQUARE,
  /* <fraction>: A vulgar fraction form.  */
  UC_DECOMP_FRACTION,
  /* <compat>: Otherwise unspecified compatibility character.  */
  UC_DECOMP_COMPAT
};
77
/* Upper bound on the size of the decomposition of a single Unicode
   character.  Buffers passed to uc_decomposition and
   uc_canonical_decomposition must have room for at least this many
   elements.  */
#define UC_DECOMPOSITION_MAX_LENGTH 32
80
81/* Return the character decomposition mapping of a Unicode character.
82   DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH
83   ucs_t elements.
84   When a decomposition exists, DECOMPOSITION[0..N-1] and *DECOMP_TAG are
85   filled and N is returned.  Otherwise -1 is returned.  */
86extern int
87       uc_decomposition (ucs4_t uc, int *decomp_tag, ucs4_t *decomposition);
88
89/* Return the canonical character decomposition mapping of a Unicode character.
90   DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH
91   ucs_t elements.
92   When a decomposition exists, DECOMPOSITION[0..N-1] is filled and N is
93   returned.  Otherwise -1 is returned.  */
94extern int
95       uc_canonical_decomposition (ucs4_t uc, ucs4_t *decomposition);
96
97
98/* Attempt to combine the Unicode characters uc1, uc2.
99   uc1 is known to have canonical combining class 0.
100   Return the combination of uc1 and uc2, if it exists.
101   Return 0 otherwise.
102   Not all decompositions can be recombined using this function.  See the
103   Unicode file CompositionExclusions.txt for details.  */
104extern ucs4_t
105       uc_composition (ucs4_t uc1, ucs4_t uc2);
106
107
108/* An object of type uninorm_t denotes a Unicode normalization form.  */
109struct unicode_normalization_form;
110typedef const struct unicode_normalization_form *uninorm_t;
111
112/* UNINORM_NFD: Normalization form D: canonical decomposition.  */
113extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfd;
114#define UNINORM_NFD (&uninorm_nfd)
115
116/* UNINORM_NFC: Normalization form C: canonical decomposition, then
117   canonical composition.  */
118extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfc;
119#define UNINORM_NFC (&uninorm_nfc)
120
121/* UNINORM_NFKD: Normalization form KD: compatibility decomposition.  */
122extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfkd;
123#define UNINORM_NFKD (&uninorm_nfkd)
124
125/* UNINORM_NFKC: Normalization form KC: compatibility decomposition, then
126   canonical composition.  */
127extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfkc;
128#define UNINORM_NFKC (&uninorm_nfkc)
129
/* Test whether the normalization form NF does compatibility decomposition.
   Reads the unsigned int found at the address NF and yields its bit 0
   (0 or 1).  NF is evaluated once.  */
#define uninorm_is_compat_decomposing(nf) \
  (((*(const unsigned int *) (nf)) >> 0) & 1)

/* Test whether the normalization form NF includes canonical composition.
   Reads the unsigned int found at the address NF and yields its bit 1
   (0 or 1).  NF is evaluated once.  */
#define uninorm_is_composing(nf) \
  (((*(const unsigned int *) (nf)) >> 1) & 1)
137
138/* Return the decomposing variant of a normalization form.
139   This maps NFC,NFD -> NFD and NFKC,NFKD -> NFKD.  */
140extern uninorm_t uninorm_decomposing_form (uninorm_t nf);
141
142
143/* Return the specified normalization form of a string.  */
144extern uint8_t *
145       u8_normalize (uninorm_t nf, const uint8_t *s, size_t n,
146                     uint8_t *resultbuf, size_t *lengthp);
147extern uint16_t *
148       u16_normalize (uninorm_t nf, const uint16_t *s, size_t n,
149                      uint16_t *resultbuf, size_t *lengthp);
150extern uint32_t *
151       u32_normalize (uninorm_t nf, const uint32_t *s, size_t n,
152                      uint32_t *resultbuf, size_t *lengthp);
153
154
155/* Compare S1 and S2, ignoring differences in normalization.
156   NF must be either UNINORM_NFD or UNINORM_NFKD.
157   If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
158   return 0.  Upon failure, return -1 with errno set.  */
159extern int
160       u8_normcmp (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2,
161                   uninorm_t nf, int *resultp);
162extern int
163       u16_normcmp (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2,
164                    uninorm_t nf, int *resultp);
165extern int
166       u32_normcmp (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2,
167                    uninorm_t nf, int *resultp);
168
169
170/* Converts the string S of length N to a NUL-terminated byte sequence, in such
171   a way that comparing uN_normxfrm (S1) and uN_normxfrm (S2) with uN_cmp2() is
172   equivalent to comparing S1 and S2 with uN_normcoll().
173   NF must be either UNINORM_NFC or UNINORM_NFKC.  */
174extern char *
175       u8_normxfrm (const uint8_t *s, size_t n, uninorm_t nf,
176                    char *resultbuf, size_t *lengthp);
177extern char *
178       u16_normxfrm (const uint16_t *s, size_t n, uninorm_t nf,
179                     char *resultbuf, size_t *lengthp);
180extern char *
181       u32_normxfrm (const uint32_t *s, size_t n, uninorm_t nf,
182                     char *resultbuf, size_t *lengthp);
183
184
185/* Compare S1 and S2, ignoring differences in normalization, using the
186   collation rules of the current locale.
187   NF must be either UNINORM_NFC or UNINORM_NFKC.
188   If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
189   return 0.  Upon failure, return -1 with errno set.  */
190extern int
191       u8_normcoll (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2,
192                    uninorm_t nf, int *resultp);
193extern int
194       u16_normcoll (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2,
195                     uninorm_t nf, int *resultp);
196extern int
197       u32_normcoll (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2,
198                     uninorm_t nf, int *resultp);
199
200
/* Normalization of a stream of Unicode characters.

   A "stream of Unicode characters" is essentially a function that is called
   repeatedly with a ucs4_t argument, optionally combined with a function that
   "flushes" the stream.  */
206
/* A normalizing stream of Unicode characters: it normalizes the characters
   written to it according to a given normalization form and hands the
   normalized character sequence on to the stream it encapsulates.
   The struct is only declared in this header, not defined.  */
struct uninorm_filter;
211
212/* Create and return a normalization filter for Unicode characters.
213   The pair (stream_func, stream_data) is the encapsulated stream.
214   stream_func (stream_data, uc) receives the Unicode character uc
215   and returns 0 if successful, or -1 with errno set upon failure.
216   Return the new filter, or NULL with errno set upon failure.  */
217extern struct uninorm_filter *
218       uninorm_filter_create (uninorm_t nf,
219                              int (*stream_func) (void *stream_data, ucs4_t uc),
220                              void *stream_data);
221
222/* Stuff a Unicode character into a normalizing filter.
223   Return 0 if successful, or -1 with errno set upon failure.  */
224extern int
225       uninorm_filter_write (struct uninorm_filter *filter, ucs4_t uc);
226
/* Deliver the data buffered in FILTER to its destination, the encapsulated
   stream.
   Return 0 if successful, or -1 with errno set upon failure.
   Caution: if further characters are written into the filter after this
   call, the character sequence that arrives in the encapsulated stream is
   not necessarily normalized.  */
extern int uninorm_filter_flush (struct uninorm_filter *filter);
235
/* Deliver the data buffered in FILTER to its destination, the encapsulated
   stream, and then close and free the filter.
   Return 0 if successful, or -1 with errno set upon failure.  */
extern int uninorm_filter_free (struct uninorm_filter *filter);
241
242
243#ifdef __cplusplus
244}
245#endif
246
247
248#endif /* _UNINORM_H */
249