1/*
2 * Copyright (c) 1997, 2017, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26/*
27 * @(#)HeaderTokenizer.java   1.9 02/03/27
28 */
29
30
31
32package com.sun.xml.internal.messaging.saaj.packaging.mime.internet;
33
34
/**
 * This class tokenizes RFC822 and MIME headers into the basic
 * symbols specified by RFC822 and MIME. <p>
 *
 * This class handles folded headers (i.e., headers with embedded
 * CRLF SPACE sequences). The folds are removed in the returned
 * tokens.
 *
 * @version 1.9, 02/03/27
 * @author  John Mani
 */
46
47public class HeaderTokenizer {
48
49    /**
50     * The Token class represents tokens returned by the
51     * HeaderTokenizer.
52     */
53    public static class Token {
54
55        private int type;
56        private String value;
57
58        /**
59         * Token type indicating an ATOM.
60         */
61        public static final int ATOM            = -1;
62
63        /**
64         * Token type indicating a quoted string. The value
65         * field contains the string without the quotes.
66         */
67        public static final int QUOTEDSTRING    = -2;
68
69        /**
70         * Token type indicating a comment. The value field
71         * contains the comment string without the comment
72         * start and end symbols.
73         */
74        public static final int COMMENT         = -3;
75
76        /**
77         * Token type indicating end of input.
78         */
79        public static final int  EOF            = -4;
80
81        /**
82         * Constructor.
83         * @param       type    Token type
84         * @param       value   Token value
85         */
86        public Token(int type, String value) {
87             this.type = type;
88             this.value = value;
89        }
90
91        /**
92         * Return the type of the token. If the token represents a
93         * delimiter or a control character, the type is that character
94         * itself, converted to an integer. Otherwise, it's value is
95         * one of the following:
96         * <ul>
97         * <li><code>ATOM</code> A sequence of ASCII characters
98         *      delimited by either SPACE, CTL, "(", &lt;"&gt; or the
99         *      specified SPECIALS</li>
100         * <li><code>QUOTEDSTRING</code> A sequence of ASCII characters
101         *      within quotes</li>
102         * <li><code>COMMENT</code> A sequence of ASCII characters
103         *      within "(" and ")".</li>
104         * <li><code>EOF</code> End of header</li>
105         * </ul>
106         * @return type
107         */
108        public int getType() {
109            return type;
110        }
111
112        /**
113         * Returns the value of the token just read. When the current
114         * token is a quoted string, this field contains the body of the
115         * string, without the quotes. When the current token is a comment,
116         * this field contains the body of the comment.
117         *
118         * @return      token value
119         */
120        public String getValue() {
121            return value;
122        }
123    }
124
125    private String string; // the string to be tokenized
126    private boolean skipComments; // should comments be skipped ?
127    private String delimiters; // delimiter string
128    private int currentPos; // current parse position
129    private int maxPos; // string length
130    private int nextPos; // track start of next Token for next()
131    private int peekPos; // track start of next Token for peek()
132
133    /**
134     * RFC822 specials
135     */
136    public final static String RFC822 = "()<>@,;:\\\"\t .[]";
137
138    /**
139     * MIME specials
140     */
141    public final static String MIME = "()<>@,;:\\\"\t []/?=";
142
143    // The EOF Token
144    private final static Token EOFToken = new Token(Token.EOF, null);
145
146    /**
147     * Constructor that takes a rfc822 style header.
148     *
149     * @param   header  The rfc822 header to be tokenized
150     * @param   delimiters      Set of delimiter characters
151     *                          to be used to delimit ATOMS. These
152     *                          are usually <code>RFC822</code> or
153     *                          <code>MIME</code>
154     * @param   skipComments  If true, comments are skipped and
155     *                          not returned as tokens
156     */
157    public HeaderTokenizer(String header, String delimiters,
158                           boolean skipComments) {
159        string = (header == null) ? "" : header; // paranoia ?!
160        this.skipComments = skipComments;
161        this.delimiters = delimiters;
162        currentPos = nextPos = peekPos = 0;
163        maxPos = string.length();
164    }
165
166    /**
167     * Constructor. Comments are ignored and not returned as tokens
168     *
169     * @param   header  The header that is tokenized
170     * @param   delimiters  The delimiters to be used
171     */
172    public HeaderTokenizer(String header, String delimiters) {
173        this(header, delimiters, true);
174    }
175
176    /**
177     * Constructor. The RFC822 defined delimiters - RFC822 - are
178     * used to delimit ATOMS. Also comments are skipped and not
179     * returned as tokens
180     * @param header The header that is tokenized.
181     */
182    public HeaderTokenizer(String header)  {
183        this(header, RFC822);
184    }
185
186    /**
187     * Parses the next token from this String. <p>
188     *
189     * Clients sit in a loop calling next() to parse successive
190     * tokens until an EOF Token is returned.
191     *
192     * @return          the next Token
193     * @exception       ParseException if the parse fails
194     */
195    public Token next() throws ParseException {
196        Token tk;
197
198        currentPos = nextPos; // setup currentPos
199        tk = getNext();
200        nextPos = peekPos = currentPos; // update currentPos and peekPos
201        return tk;
202    }
203
204    /**
205     * Peek at the next token, without actually removing the token
206     * from the parse stream. Invoking this method multiple times
207     * will return successive tokens, until <code>next()</code> is
208     * called. <p>
209     *
210     * @return          the next Token
211     * @exception       ParseException if the parse fails
212     */
213    public Token peek() throws ParseException {
214        Token tk;
215
216        currentPos = peekPos; // setup currentPos
217        tk = getNext();
218        peekPos = currentPos; // update peekPos
219        return tk;
220    }
221
222    /**
223     * Return the rest of the Header.
224     *
225     * @return String   rest of header. null is returned if we are
226     *                  already at end of header
227     */
228    public String getRemainder() {
229        return string.substring(nextPos);
230    }
231
232    /*
233     * Return the next token starting from 'currentPos'. After the
234     * parse, 'currentPos' is updated to point to the start of the
235     * next token.
236     */
237    private Token getNext() throws ParseException {
238        // If we're already at end of string, return EOF
239        if (currentPos >= maxPos)
240            return EOFToken;
241
242        // Skip white-space, position currentPos beyond the space
243        if (skipWhiteSpace() == Token.EOF)
244            return EOFToken;
245
246        char c;
247        int start;
248        boolean filter = false;
249
250        c = string.charAt(currentPos);
251
252        // Check or Skip comments and position currentPos
253        // beyond the comment
254        while (c == '(') {
255            // Parsing comment ..
256            int nesting;
257            for (start = ++currentPos, nesting = 1;
258                 nesting > 0 && currentPos < maxPos;
259                 currentPos++) {
260                c = string.charAt(currentPos);
261                if (c == '\\') {  // Escape sequence
262                    currentPos++; // skip the escaped character
263                    filter = true;
264                } else if (c == '\r')
265                    filter = true;
266                else if (c == '(')
267                    nesting++;
268                else if (c == ')')
269                    nesting--;
270            }
271            if (nesting != 0)
272                throw new ParseException("Unbalanced comments");
273
274            if (!skipComments) {
275                // Return the comment, if we are asked to.
276                // Note that the comment start & end markers are ignored.
277                String s;
278                if (filter) // need to go thru the token again.
279                    s = filterToken(string, start, currentPos-1);
280                else
281                    s = string.substring(start,currentPos-1);
282
283                return new Token(Token.COMMENT, s);
284            }
285
286            // Skip any whitespace after the comment.
287            if (skipWhiteSpace() == Token.EOF)
288                return EOFToken;
289            c = string.charAt(currentPos);
290        }
291
292        // Check for quoted-string and position currentPos
293        //  beyond the terminating quote
294        if (c == '"') {
295            for (start = ++currentPos; currentPos < maxPos; currentPos++) {
296                c = string.charAt(currentPos);
297                if (c == '\\') { // Escape sequence
298                    currentPos++;
299                    filter = true;
300                } else if (c == '\r')
301                    filter = true;
302                else if (c == '"') {
303                    currentPos++;
304                    String s;
305
306                    if (filter)
307                        s = filterToken(string, start, currentPos-1);
308                    else
309                        s = string.substring(start,currentPos-1);
310
311                    return new Token(Token.QUOTEDSTRING, s);
312                }
313            }
314            throw new ParseException("Unbalanced quoted string");
315        }
316
317        // Check for SPECIAL or CTL
318        if (c < 040 || c >= 0177 || delimiters.indexOf(c) >= 0) {
319            currentPos++; // re-position currentPos
320            char ch[] = new char[1];
321            ch[0] = c;
322            return new Token(c, new String(ch));
323        }
324
325        // Check for ATOM
326        for (start = currentPos; currentPos < maxPos; currentPos++) {
327            c = string.charAt(currentPos);
328            // ATOM is delimited by either SPACE, CTL, "(", <">
329            // or the specified SPECIALS
330            if (c < 040 || c >= 0177 || c == '(' || c == ' ' ||
331                c == '"' || delimiters.indexOf(c) >= 0)
332                break;
333        }
334        return new Token(Token.ATOM, string.substring(start, currentPos));
335    }
336
337    // Skip SPACE, HT, CR and NL
338    private int skipWhiteSpace() {
339        char c;
340        for (; currentPos < maxPos; currentPos++)
341            if (((c = string.charAt(currentPos)) != ' ') &&
342                (c != '\t') && (c != '\r') && (c != '\n'))
343                return currentPos;
344        return Token.EOF;
345    }
346
347    /* Process escape sequences and embedded LWSPs from a comment or
348     * quoted string.
349     */
350    private static String filterToken(String s, int start, int end) {
351        StringBuilder sb = new StringBuilder();
352        char c;
353        boolean gotEscape = false;
354        boolean gotCR = false;
355
356        for (int i = start; i < end; i++) {
357            c = s.charAt(i);
358            if (c == '\n' && gotCR) {
359                // This LF is part of an unescaped
360                // CRLF sequence (i.e, LWSP). Skip it.
361                gotCR = false;
362                continue;
363            }
364
365            gotCR = false;
366            if (!gotEscape) {
367                // Previous character was NOT '\'
368                if (c == '\\') // skip this character
369                    gotEscape = true;
370                else if (c == '\r') // skip this character
371                    gotCR = true;
372                else // append this character
373                    sb.append(c);
374            } else {
375                // Previous character was '\'. So no need to
376                // bother with any special processing, just
377                // append this character
378                sb.append(c);
379                gotEscape = false;
380            }
381        }
382        return sb.toString();
383    }
384}
385