1/*
2 * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26package com.sun.xml.internal.ws.encoding;
27
28import javax.xml.ws.WebServiceException;
29
30/**
31 * This class tokenizes RFC822 and MIME headers into the basic
32 * symbols specified by RFC822 and MIME. <p>
33 *
34 * This class handles folded headers (ie headers with embedded
35 * CRLF SPACE sequences). The folds are removed in the returned
36 * tokens.
37 *
38 * @version 1.9, 02/03/27
39 * @author  John Mani
40 */
41
42class HeaderTokenizer {
43
44    /**
45     * The Token class represents tokens returned by the
46     * HeaderTokenizer.
47     */
48    static class Token {
49
50        private int type;
51        private String value;
52
53        /**
54         * Token type indicating an ATOM.
55         */
56        public static final int ATOM            = -1;
57
58        /**
59         * Token type indicating a quoted string. The value
60         * field contains the string without the quotes.
61         */
62        public static final int QUOTEDSTRING    = -2;
63
64        /**
65         * Token type indicating a comment. The value field
66         * contains the comment string without the comment
67         * start and end symbols.
68         */
69        public static final int COMMENT         = -3;
70
71        /**
72         * Token type indicating end of input.
73         */
74        public static final int  EOF            = -4;
75
76        /**
77         * Constructor.
78         * @param       type    Token type
79         * @param       value   Token value
80         */
81        public Token(int type, String value) {
82             this.type = type;
83             this.value = value;
84        }
85
86        /**
87         * Return the type of the token. If the token represents a
88         * delimiter or a control character, the type is that character
89         * itself, converted to an integer. Otherwise, it's value is
90         * one of the following:
91         * <ul>
92         * <li><code>ATOM</code> A sequence of ASCII characters
93         *      delimited by either SPACE, CTL, "(", <"> or the
94         *      specified SPECIALS
95         * <li><code>QUOTEDSTRING</code> A sequence of ASCII characters
96         *      within quotes
97         * <li><code>COMMENT</code> A sequence of ASCII characters
98         *      within "(" and ")".
99         * <li><code>EOF</code> End of header
100         * </ul>
101         */
102        public int getType() {
103            return type;
104        }
105
106        /**
107         * Returns the value of the token just read. When the current
108         * token is a quoted string, this field contains the body of the
109         * string, without the quotes. When the current token is a comment,
110         * this field contains the body of the comment.
111         *
112         * @return      token value
113         */
114        public String getValue() {
115            return value;
116        }
117    }
118
119    private String string; // the string to be tokenized
120    private boolean skipComments; // should comments be skipped ?
121    private String delimiters; // delimiter string
122    private int currentPos; // current parse position
123    private int maxPos; // string length
124    private int nextPos; // track start of next Token for next()
125    private int peekPos; // track start of next Token for peek()
126
127    /**
128     * RFC822 specials
129     */
130    private final static String RFC822 = "()<>@,;:\\\"\t .[]";
131
132    /**
133     * MIME specials
134     */
135    final static String MIME = "()<>@,;:\\\"\t []/?=";
136
137    // The EOF Token
138    private final static Token EOFToken = new Token(Token.EOF, null);
139
140    /**
141     * Constructor that takes a rfc822 style header.
142     *
143     * @param   header  The rfc822 header to be tokenized
144     * @param   delimiters      Set of delimiter characters
145     *                          to be used to delimit ATOMS. These
146     *                          are usually <code>RFC822</code> or
147     *                          <code>MIME</code>
148     * @param   skipComments  If true, comments are skipped and
149     *                          not returned as tokens
150     */
151    HeaderTokenizer(String header, String delimiters,
152                           boolean skipComments) {
153        string = (header == null) ? "" : header; // paranoia ?!
154        this.skipComments = skipComments;
155        this.delimiters = delimiters;
156        currentPos = nextPos = peekPos = 0;
157        maxPos = string.length();
158    }
159
160    /**
161     * Constructor. Comments are ignored and not returned as tokens
162     *
163     * @param   header  The header that is tokenized
164     * @param   delimiters  The delimiters to be used
165     */
166    HeaderTokenizer(String header, String delimiters) {
167            this(header, delimiters, true);
168    }
169
170    /**
171     * Constructor. The RFC822 defined delimiters - RFC822 - are
172     * used to delimit ATOMS. Also comments are skipped and not
173     * returned as tokens
174     */
175    HeaderTokenizer(String header)  {
176            this(header, RFC822);
177    }
178
179    /**
180     * Parses the next token from this String. <p>
181     *
182     * Clients sit in a loop calling next() to parse successive
183     * tokens until an EOF Token is returned.
184     *
185     * @return          the next Token
186     * @exception WebServiceException if the parse fails
187     */
188    Token next() throws WebServiceException {
189        Token tk;
190
191        currentPos = nextPos; // setup currentPos
192        tk = getNext();
193        nextPos = peekPos = currentPos; // update currentPos and peekPos
194        return tk;
195    }
196
197    /**
198     * Peek at the next token, without actually removing the token
199     * from the parse stream. Invoking this method multiple times
200     * will return successive tokens, until <code>next()</code> is
201     * called. <p>
202     *
203     * @return          the next Token
204     * @exception       WebServiceException if the parse fails
205     */
206    Token peek() throws WebServiceException {
207        Token tk;
208
209        currentPos = peekPos; // setup currentPos
210        tk = getNext();
211        peekPos = currentPos; // update peekPos
212        return tk;
213    }
214
215    /**
216     * Return the rest of the Header.
217     *
218     * @return String   rest of header. null is returned if we are
219     *                  already at end of header
220     */
221    String getRemainder() {
222            return string.substring(nextPos);
223    }
224
225    /*
226     * Return the next token starting from 'currentPos'. After the
227     * parse, 'currentPos' is updated to point to the start of the
228     * next token.
229     */
230    private Token getNext() throws WebServiceException {
231        // If we're already at end of string, return EOF
232        if (currentPos >= maxPos)
233            return EOFToken;
234
235        // Skip white-space, position currentPos beyond the space
236        if (skipWhiteSpace() == Token.EOF)
237            return EOFToken;
238
239        char c;
240        int start;
241        boolean filter = false;
242
243        c = string.charAt(currentPos);
244
245        // Check or Skip comments and position currentPos
246        // beyond the comment
247        while (c == '(') {
248            // Parsing comment ..
249            int nesting;
250            for (start = ++currentPos, nesting = 1;
251             nesting > 0 && currentPos < maxPos;
252             currentPos++) {
253            c = string.charAt(currentPos);
254            if (c == '\\') {  // Escape sequence
255                currentPos++; // skip the escaped character
256                filter = true;
257            } else if (c == '\r')
258                filter = true;
259            else if (c == '(')
260                nesting++;
261            else if (c == ')')
262                nesting--;
263            }
264            if (nesting != 0)
265            throw new WebServiceException("Unbalanced comments");
266
267            if (!skipComments) {
268            // Return the comment, if we are asked to.
269            // Note that the comment start & end markers are ignored.
270            String s;
271            if (filter) // need to go thru the token again.
272                s = filterToken(string, start, currentPos-1);
273            else
274                s = string.substring(start,currentPos-1);
275
276            return new Token(Token.COMMENT, s);
277            }
278
279            // Skip any whitespace after the comment.
280            if (skipWhiteSpace() == Token.EOF)
281            return EOFToken;
282            c = string.charAt(currentPos);
283        }
284
285        // Check for quoted-string and position currentPos
286        //  beyond the terminating quote
287        if (c == '"') {
288            for (start = ++currentPos; currentPos < maxPos; currentPos++) {
289            c = string.charAt(currentPos);
290            if (c == '\\') { // Escape sequence
291                currentPos++;
292                filter = true;
293            } else if (c == '\r')
294                filter = true;
295            else if (c == '"') {
296                currentPos++;
297                String s;
298
299                if (filter)
300                s = filterToken(string, start, currentPos-1);
301                else
302                s = string.substring(start,currentPos-1);
303
304                return new Token(Token.QUOTEDSTRING, s);
305            }
306            }
307            throw new WebServiceException("Unbalanced quoted string");
308        }
309
310        // Check for SPECIAL or CTL
311        if (c < 040 || c >= 0177 || delimiters.indexOf(c) >= 0) {
312            currentPos++; // re-position currentPos
313            char ch[] = new char[1];
314            ch[0] = c;
315            return new Token((int)c, new String(ch));
316        }
317
318        // Check for ATOM
319        for (start = currentPos; currentPos < maxPos; currentPos++) {
320            c = string.charAt(currentPos);
321            // ATOM is delimited by either SPACE, CTL, "(", <">
322            // or the specified SPECIALS
323            if (c < 040 || c >= 0177 || c == '(' || c == ' ' ||
324            c == '"' || delimiters.indexOf(c) >= 0)
325            break;
326        }
327        return new Token(Token.ATOM, string.substring(start, currentPos));
328        }
329
330        // Skip SPACE, HT, CR and NL
331        private int skipWhiteSpace() {
332        char c;
333        for (; currentPos < maxPos; currentPos++)
334            if (((c = string.charAt(currentPos)) != ' ') &&
335            (c != '\t') && (c != '\r') && (c != '\n'))
336            return currentPos;
337        return Token.EOF;
338    }
339
340    /* Process escape sequences and embedded LWSPs from a comment or
341     * quoted string.
342     */
343    private static String filterToken(String s, int start, int end) {
344        StringBuffer sb = new StringBuffer();
345        char c;
346        boolean gotEscape = false;
347        boolean gotCR = false;
348
349        for (int i = start; i < end; i++) {
350            c = s.charAt(i);
351            if (c == '\n' && gotCR) {
352            // This LF is part of an unescaped
353            // CRLF sequence (i.e, LWSP). Skip it.
354            gotCR = false;
355            continue;
356            }
357
358            gotCR = false;
359            if (!gotEscape) {
360            // Previous character was NOT '\'
361            if (c == '\\') // skip this character
362                gotEscape = true;
363            else if (c == '\r') // skip this character
364                gotCR = true;
365            else // append this character
366                sb.append(c);
367            } else {
368            // Previous character was '\'. So no need to
369            // bother with any special processing, just
370            // append this character
371            sb.append(c);
372            gotEscape = false;
373            }
374        }
375        return sb.toString();
376    }
377}
378