1/*
2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
3 *
4 * This code is free software; you can redistribute it and/or modify it
5 * under the terms of the GNU General Public License version 2 only, as
6 * published by the Free Software Foundation.  Oracle designates this
7 * particular file as subject to the "Classpath" exception as provided
8 * by Oracle in the LICENSE file that accompanied this code.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 */
24/*
25/*
26 *******************************************************************************
27 * Copyright (C) 2003-2004, International Business Machines Corporation and         *
28 * others. All Rights Reserved.                                                *
29 *******************************************************************************
30 */
31//
32// CHANGELOG
33//      2005-05-19 Edward Wang
34//          - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/StringPrep.java
35//          - move from package com.ibm.icu.text to package sun.net.idn
36//          - use ParseException instead of StringPrepParseException
37//          - change 'Normalizer.getUnicodeVersion()' to 'NormalizerImpl.getUnicodeVersion()'
38//          - remove all @deprecated tag to make compiler happy
39//      2007-08-14 Martin Buchholz
40//          - remove redundant casts
41//
42package sun.net.idn;
43
44import java.io.BufferedInputStream;
45import java.io.ByteArrayInputStream;
46import java.io.IOException;
47import java.io.InputStream;
48import java.text.ParseException;
49
50import sun.text.Normalizer;
51import sun.text.normalizer.CharTrie;
52import sun.text.normalizer.Trie;
53import sun.text.normalizer.VersionInfo;
54import sun.text.normalizer.UCharacter;
55import sun.text.normalizer.UCharacterIterator;
56import sun.text.normalizer.UTF16;
57import sun.net.idn.UCharacterDirection;
58import sun.net.idn.StringPrepDataReader;
59
60/**
 * StringPrep API implements the StringPrep framework as described by
 * <a href="http://www.ietf.org/rfc/rfc3454.txt">RFC 3454</a>.
 * StringPrep prepares Unicode strings for use in network protocols.
 * Profiles of StringPrep are sets of rules and data according to which the
 * Unicode strings are prepared. Each profile contains tables which describe
 * how a code point should be treated. The tables are broadly classified into
67 * <ul>
68 *     <li> Unassigned Table: Contains code points that are unassigned
69 *          in the Unicode Version supported by StringPrep. Currently
70 *          RFC 3454 supports Unicode 3.2. </li>
 *     <li> Prohibited Table: Contains code points that are prohibited from
 *          the output of the StringPrep processing function. </li>
 *     <li> Mapping Table: Contains code points that are deleted from the output or case mapped. </li>
74 * </ul>
75 *
76 * The procedure for preparing Unicode strings:
77 * <ol>
78 *      <li> Map: For each character in the input, check if it has a mapping
79 *           and, if so, replace it with its mapping. </li>
80 *      <li> Normalize: Possibly normalize the result of step 1 using Unicode
81 *           normalization. </li>
82 *      <li> Prohibit: Check for any characters that are not allowed in the
83 *           output.  If any are found, return an error.</li>
84 *      <li> Check bidi: Possibly check for right-to-left characters, and if
85 *           any are found, make sure that the whole string satisfies the
86 *           requirements for bidirectional strings.  If the string does not
87 *           satisfy the requirements for bidirectional strings, return an
88 *           error.  </li>
89 * </ol>
90 * @author Ram Viswanadha
91 * @draft ICU 2.8
92 */
93public final class StringPrep {
    /**
     * Option to prohibit processing of unassigned code points in the input.
     * With this option (no bits set) an unassigned code point in the input
     * makes the mapping step throw a ParseException.
     *
     * @see   #prepare
     * @draft ICU 2.8
     */
    public static final int DEFAULT = 0x0000;

    /**
     * Option to allow processing of unassigned code points in the input.
     * When this bit is set the mapping step copies unassigned code points
     * to the output instead of rejecting them.
     *
     * @see   #prepare
     * @draft ICU 2.8
     */
    public static final int ALLOW_UNASSIGNED = 0x0001;

    /* Types decoded from a trie word by getValues() and stored in Values.type */
    private static final int UNASSIGNED        = 0x0000;
    private static final int MAP               = 0x0001;
    private static final int PROHIBITED        = 0x0002;
    private static final int DELETE            = 0x0003;
    /* Type assigned to a trie word of 0: the code point is copied unchanged */
    private static final int TYPE_LIMIT        = 0x0004;

    /* Bits tested in indexes[OPTIONS] by the constructor */
    private static final int NORMALIZATION_ON  = 0x0001;
    private static final int CHECK_BIDI_ON     = 0x0002;

    /* Trie words >= TYPE_THRESHOLD carry a type, computed as (word - TYPE_THRESHOLD) */
    private static final int TYPE_THRESHOLD       = 0xFFF0;
    /* A trie word whose value (word >> 2) equals this marks the code point as DELETE */
    private static final int MAX_INDEX_VALUE      = 0x3FBF;   /*16139*/
    /* NOTE(review): not referenced anywhere in this class */
    private static final int MAX_INDEX_TOP_LENGTH = 0x0003;

    /* indexes[] value names */
    private static final int INDEX_TRIE_SIZE                  =  0; /* number of bytes in the StringPrep trie */
    private static final int INDEX_MAPPING_DATA_SIZE          =  1; /* size in bytes of the array that contains the mapping */
    private static final int NORM_CORRECTNS_LAST_UNI_VERSION  =  2; /* The index of Unicode version of last entry in NormalizationCorrections.txt */
    private static final int ONE_UCHAR_MAPPING_INDEX_START    =  3; /* The starting index of 1 UChar mapping index in the mapping data array */
    private static final int TWO_UCHARS_MAPPING_INDEX_START   =  4; /* The starting index of 2 UChars mapping index in the mapping data array */
    private static final int THREE_UCHARS_MAPPING_INDEX_START =  5; /* The starting index of 3 UChars mapping index in the mapping data array */
    private static final int FOUR_UCHARS_MAPPING_INDEX_START  =  6; /* The starting index of 4 UChars mapping index in the mapping data array */
    private static final int OPTIONS                          =  7; /* Bit set of options to turn on in the profile */
    private static final int INDEX_TOP                        = 16;                          /* number of index entries requested from the reader; changing this requires a new formatVersion */


    /**
     * Buffer size, in bytes, of the BufferedInputStream that the constructor
     * wraps around the profile data stream.
     */
    private static final int DATA_BUFFER_SIZE = 25000;
139
140    /* Wrappers for Trie implementations */
141    private static final class StringPrepTrieImpl implements Trie.DataManipulate{
142        private CharTrie sprepTrie = null;
143       /**
144        * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
145        * data the index array offset of the indexes for that lead surrogate.
146        * @param property data value for a surrogate from the trie, including
147        *        the folding offset
148        * @return data offset or 0 if there is no data for the lead surrogate
149        */
150         public int getFoldingOffset(int value){
151            return value;
152        }
153    }
154
    // Wrapper holding the CharTrie built from the trie bytes of the data file
    private StringPrepTrieImpl sprepTrieImpl;
    // Indexes read from the data file; addressed with the INDEX_* / *_INDEX_START / OPTIONS constants
    private int[] indexes;
    // mapping data read from the data file; map() copies replacement chars out of it
    private char[] mappingData;
    // format version of the data file, as returned by the data reader
    private byte[] formatVersion;
    // the version of Unicode supported by the data file
    private VersionInfo sprepUniVer;
    // the Unicode version of last entry in the
    // NormalizationCorrections.txt file if normalization
    // is turned on
    private VersionInfo normCorrVer;
    // Option to turn on Normalization; set from the NORMALIZATION_ON bit of indexes[OPTIONS]
    private boolean doNFKC;
    // Option to turn on checking for BiDi rules; set from the CHECK_BIDI_ON bit of indexes[OPTIONS]
    private boolean checkBiDi;
173
174
175    private char getCodePointValue(int ch){
176        return sprepTrieImpl.sprepTrie.getCodePointValue(ch);
177    }
178
179    private static VersionInfo getVersionInfo(int comp){
180        int micro = comp & 0xFF;
181        int milli =(comp >> 8)  & 0xFF;
182        int minor =(comp >> 16) & 0xFF;
183        int major =(comp >> 24) & 0xFF;
184        return VersionInfo.getInstance(major,minor,milli,micro);
185    }
186    private static VersionInfo getVersionInfo(byte[] version){
187        if(version.length != 4){
188            return null;
189        }
190        return VersionInfo.getInstance((int)version[0],(int) version[1],(int) version[2],(int) version[3]);
191    }
192    /**
193     * Creates an StringPrep object after reading the input stream.
194     * The object does not hold a reference to the input steam, so the stream can be
195     * closed after the method returns.
196     *
197     * @param inputStream The stream for reading the StringPrep profile binarySun
198     * @throws IOException
199     * @draft ICU 2.8
200     */
201    public StringPrep(InputStream inputStream) throws IOException{
202
203        BufferedInputStream b = new BufferedInputStream(inputStream,DATA_BUFFER_SIZE);
204
205        StringPrepDataReader reader = new StringPrepDataReader(b);
206
207        // read the indexes
208        indexes = reader.readIndexes(INDEX_TOP);
209
210        byte[] sprepBytes = new byte[indexes[INDEX_TRIE_SIZE]];
211
212
213        //indexes[INDEX_MAPPING_DATA_SIZE] store the size of mappingData in bytes
214        mappingData = new char[indexes[INDEX_MAPPING_DATA_SIZE]/2];
215        // load the rest of the data data and initialize the data members
216        reader.read(sprepBytes,mappingData);
217
218        sprepTrieImpl           = new StringPrepTrieImpl();
219        sprepTrieImpl.sprepTrie = new CharTrie( new ByteArrayInputStream(sprepBytes),sprepTrieImpl  );
220
221        // get the data format version
222        formatVersion = reader.getDataFormatVersion();
223
224        // get the options
225        doNFKC            = ((indexes[OPTIONS] & NORMALIZATION_ON) > 0);
226        checkBiDi         = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0);
227        sprepUniVer   = getVersionInfo(reader.getUnicodeVersion());
228        normCorrVer   = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]);
229        VersionInfo normUniVer = UCharacter.getUnicodeVersion();
230        if(normUniVer.compareTo(sprepUniVer) < 0 && /* the Unicode version of SPREP file must be less than the Unicode Vesion of the normalization data */
231           normUniVer.compareTo(normCorrVer) < 0 && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Vesion of the normalization data */
232           ((indexes[OPTIONS] & NORMALIZATION_ON) > 0) /* normalization turned on*/
233           ){
234            throw new IOException("Normalization Correction version not supported");
235        }
236        b.close();
237    }
238
239    private static final class Values{
240        boolean isIndex;
241        int value;
242        int type;
243        public void reset(){
244            isIndex = false;
245            value = 0;
246            type = -1;
247        }
248    }
249
250    private static final void getValues(char trieWord,Values values){
251        values.reset();
252        if(trieWord == 0){
253            /*
254             * Initial value stored in the mapping table
255             * just return TYPE_LIMIT .. so that
256             * the source codepoint is copied to the destination
257             */
258            values.type = TYPE_LIMIT;
259        }else if(trieWord >= TYPE_THRESHOLD){
260            values.type = (trieWord - TYPE_THRESHOLD);
261        }else{
262            /* get the type */
263            values.type = MAP;
264            /* ascertain if the value is index or delta */
265            if((trieWord & 0x02)>0){
266                values.isIndex = true;
267                values.value = trieWord  >> 2; //mask off the lower 2 bits and shift
268
269            }else{
270                values.isIndex = false;
271                values.value = (trieWord<<16)>>16;
272                values.value =  (values.value >> 2);
273
274            }
275
276            if((trieWord>>2) == MAX_INDEX_VALUE){
277                values.type = DELETE;
278                values.isIndex = false;
279                values.value = 0;
280            }
281        }
282    }
283
284
285
286    private StringBuffer map( UCharacterIterator iter, int options)
287                            throws ParseException {
288
289        Values val = new Values();
290        char result = 0;
291        int ch  = UCharacterIterator.DONE;
292        StringBuffer dest = new StringBuffer();
293        boolean allowUnassigned = ((options & ALLOW_UNASSIGNED)>0);
294
295        while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){
296
297            result = getCodePointValue(ch);
298            getValues(result,val);
299
300            // check if the source codepoint is unassigned
301            if(val.type == UNASSIGNED && allowUnassigned == false){
302                 throw new ParseException("An unassigned code point was found in the input " +
303                                          iter.getText(), iter.getIndex());
304            }else if((val.type == MAP)){
305                int index, length;
306
307                if(val.isIndex){
308                    index = val.value;
309                    if(index >= indexes[ONE_UCHAR_MAPPING_INDEX_START] &&
310                             index < indexes[TWO_UCHARS_MAPPING_INDEX_START]){
311                        length = 1;
312                    }else if(index >= indexes[TWO_UCHARS_MAPPING_INDEX_START] &&
313                             index < indexes[THREE_UCHARS_MAPPING_INDEX_START]){
314                        length = 2;
315                    }else if(index >= indexes[THREE_UCHARS_MAPPING_INDEX_START] &&
316                             index < indexes[FOUR_UCHARS_MAPPING_INDEX_START]){
317                        length = 3;
318                    }else{
319                        length = mappingData[index++];
320                    }
321                    /* copy mapping to destination */
322                    dest.append(mappingData,index,length);
323                    continue;
324
325                }else{
326                    ch -= val.value;
327                }
328            }else if(val.type == DELETE){
329                // just consume the codepoint and contine
330                continue;
331            }
332            //copy the source into destination
333            UTF16.append(dest,ch);
334        }
335
336        return dest;
337    }
338
339
340    private StringBuffer normalize(StringBuffer src){
341        /*
342         * Option UNORM_BEFORE_PRI_29:
343         *
344         * IDNA as interpreted by IETF members (see unicode mailing list 2004H1)
345         * requires strict adherence to Unicode 3.2 normalization,
346         * including buggy composition from before fixing Public Review Issue #29.
347         * Note that this results in some valid but nonsensical text to be
348         * either corrupted or rejected, depending on the text.
349         * See http://www.unicode.org/review/resolved-pri.html#pri29
350         * See unorm.cpp and cnormtst.c
351         */
352        return new StringBuffer(
353            Normalizer.normalize(
354                src.toString(),
355                java.text.Normalizer.Form.NFKC,
356                Normalizer.UNICODE_3_2));
357    }
358    /*
359    boolean isLabelSeparator(int ch){
360        int result = getCodePointValue(ch);
361        if( (result & 0x07)  == LABEL_SEPARATOR){
362            return true;
363        }
364        return false;
365    }
366    */
367     /*
368       1) Map -- For each character in the input, check if it has a mapping
369          and, if so, replace it with its mapping.
370
371       2) Normalize -- Possibly normalize the result of step 1 using Unicode
372          normalization.
373
374       3) Prohibit -- Check for any characters that are not allowed in the
375          output.  If any are found, return an error.
376
377       4) Check bidi -- Possibly check for right-to-left characters, and if
378          any are found, make sure that the whole string satisfies the
379          requirements for bidirectional strings.  If the string does not
380          satisfy the requirements for bidirectional strings, return an
381          error.
382          [Unicode3.2] defines several bidirectional categories; each character
383           has one bidirectional category assigned to it.  For the purposes of
384           the requirements below, an "RandALCat character" is a character that
385           has Unicode bidirectional categories "R" or "AL"; an "LCat character"
386           is a character that has Unicode bidirectional category "L".  Note
387
388
389           that there are many characters which fall in neither of the above
390           definitions; Latin digits (<U+0030> through <U+0039>) are examples of
391           this because they have bidirectional category "EN".
392
393           In any profile that specifies bidirectional character handling, all
394           three of the following requirements MUST be met:
395
396           1) The characters in section 5.8 MUST be prohibited.
397
398           2) If a string contains any RandALCat character, the string MUST NOT
399              contain any LCat character.
400
401           3) If a string contains any RandALCat character, a RandALCat
402              character MUST be the first character of the string, and a
403              RandALCat character MUST be the last character of the string.
404    */
405    /**
406     * Prepare the input buffer for use in applications with the given profile. This operation maps, normalizes(NFKC),
407     * checks for prohited and BiDi characters in the order defined by RFC 3454
408     * depending on the options specified in the profile.
409     *
410     * @param src           A UCharacterIterator object containing the source string
411     * @param options       A bit set of options:
412     *
413     *  - StringPrep.NONE               Prohibit processing of unassigned code points in the input
414     *
415     *  - StringPrep.ALLOW_UNASSIGNED   Treat the unassigned code points are in the input
416     *                                  as normal Unicode code points.
417     *
418     * @return StringBuffer A StringBuffer containing the output
419     * @throws ParseException
420     * @draft ICU 2.8
421     */
422    public StringBuffer prepare(UCharacterIterator src, int options)
423                        throws ParseException{
424
425        // map
426        StringBuffer mapOut = map(src,options);
427        StringBuffer normOut = mapOut;// initialize
428
429        if(doNFKC){
430            // normalize
431            normOut = normalize(mapOut);
432        }
433
434        int ch;
435        char result;
436        UCharacterIterator iter = UCharacterIterator.getInstance(normOut);
437        Values val = new Values();
438        int direction=UCharacterDirection.CHAR_DIRECTION_COUNT,
439            firstCharDir=UCharacterDirection.CHAR_DIRECTION_COUNT;
440        int rtlPos=-1, ltrPos=-1;
441        boolean rightToLeft=false, leftToRight=false;
442
443        while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){
444            result = getCodePointValue(ch);
445            getValues(result,val);
446
447            if(val.type == PROHIBITED ){
448                throw new ParseException("A prohibited code point was found in the input" +
449                                         iter.getText(), val.value);
450            }
451
452            direction = UCharacter.getDirection(ch);
453            if(firstCharDir == UCharacterDirection.CHAR_DIRECTION_COUNT){
454                firstCharDir = direction;
455            }
456            if(direction == UCharacterDirection.LEFT_TO_RIGHT){
457                leftToRight = true;
458                ltrPos = iter.getIndex()-1;
459            }
460            if(direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC){
461                rightToLeft = true;
462                rtlPos = iter.getIndex()-1;
463            }
464        }
465        if(checkBiDi == true){
466            // satisfy 2
467            if( leftToRight == true && rightToLeft == true){
468                throw new ParseException("The input does not conform to the rules for BiDi code points." +
469                                         iter.getText(),
470                                         (rtlPos>ltrPos) ? rtlPos : ltrPos);
471             }
472
473            //satisfy 3
474            if( rightToLeft == true &&
475                !((firstCharDir == UCharacterDirection.RIGHT_TO_LEFT || firstCharDir == UCharacterDirection.RIGHT_TO_LEFT_ARABIC) &&
476                (direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC))
477              ){
478                throw new ParseException("The input does not conform to the rules for BiDi code points." +
479                                         iter.getText(),
480                                         (rtlPos>ltrPos) ? rtlPos : ltrPos);
481            }
482        }
483        return normOut;
484
485      }
486}
487