1/* 2 * Copyright (c) 1997, 2017, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26/* 27 * @(#)HeaderTokenizer.java 1.9 02/03/27 28 */ 29 30 31 32package com.sun.xml.internal.messaging.saaj.packaging.mime.internet; 33 34 35/** 36 * This class tokenizes RFC822 and MIME headers into the basic 37 * symbols specified by RFC822 and MIME. <p> 38 * 39 * This class handles folded headers (ie headers with embedded 40 * CRLF SPACE sequences). The folds are removed in the returned 41 * tokens. 42 * 43 * @version 1.9, 02/03/27 44 * @author John Mani 45 */ 46 47public class HeaderTokenizer { 48 49 /** 50 * The Token class represents tokens returned by the 51 * HeaderTokenizer. 52 */ 53 public static class Token { 54 55 private int type; 56 private String value; 57 58 /** 59 * Token type indicating an ATOM. 60 */ 61 public static final int ATOM = -1; 62 63 /** 64 * Token type indicating a quoted string. The value 65 * field contains the string without the quotes. 66 */ 67 public static final int QUOTEDSTRING = -2; 68 69 /** 70 * Token type indicating a comment. The value field 71 * contains the comment string without the comment 72 * start and end symbols. 73 */ 74 public static final int COMMENT = -3; 75 76 /** 77 * Token type indicating end of input. 78 */ 79 public static final int EOF = -4; 80 81 /** 82 * Constructor. 83 * @param type Token type 84 * @param value Token value 85 */ 86 public Token(int type, String value) { 87 this.type = type; 88 this.value = value; 89 } 90 91 /** 92 * Return the type of the token. If the token represents a 93 * delimiter or a control character, the type is that character 94 * itself, converted to an integer. Otherwise, it's value is 95 * one of the following: 96 * <ul> 97 * <li><code>ATOM</code> A sequence of ASCII characters 98 * delimited by either SPACE, CTL, "(", <"> or the 99 * specified SPECIALS</li> 100 * <li><code>QUOTEDSTRING</code> A sequence of ASCII characters 101 * within quotes</li> 102 * <li><code>COMMENT</code> A sequence of ASCII characters 103 * within "(" and ")".</li> 104 * <li><code>EOF</code> End of header</li> 105 * </ul> 106 * @return type 107 */ 108 public int getType() { 109 return type; 110 } 111 112 /** 113 * Returns the value of the token just read. When the current 114 * token is a quoted string, this field contains the body of the 115 * string, without the quotes. When the current token is a comment, 116 * this field contains the body of the comment. 117 * 118 * @return token value 119 */ 120 public String getValue() { 121 return value; 122 } 123 } 124 125 private String string; // the string to be tokenized 126 private boolean skipComments; // should comments be skipped ? 127 private String delimiters; // delimiter string 128 private int currentPos; // current parse position 129 private int maxPos; // string length 130 private int nextPos; // track start of next Token for next() 131 private int peekPos; // track start of next Token for peek() 132 133 /** 134 * RFC822 specials 135 */ 136 public final static String RFC822 = "()<>@,;:\\\"\t .[]"; 137 138 /** 139 * MIME specials 140 */ 141 public final static String MIME = "()<>@,;:\\\"\t []/?="; 142 143 // The EOF Token 144 private final static Token EOFToken = new Token(Token.EOF, null); 145 146 /** 147 * Constructor that takes a rfc822 style header. 148 * 149 * @param header The rfc822 header to be tokenized 150 * @param delimiters Set of delimiter characters 151 * to be used to delimit ATOMS. These 152 * are usually <code>RFC822</code> or 153 * <code>MIME</code> 154 * @param skipComments If true, comments are skipped and 155 * not returned as tokens 156 */ 157 public HeaderTokenizer(String header, String delimiters, 158 boolean skipComments) { 159 string = (header == null) ? "" : header; // paranoia ?! 160 this.skipComments = skipComments; 161 this.delimiters = delimiters; 162 currentPos = nextPos = peekPos = 0; 163 maxPos = string.length(); 164 } 165 166 /** 167 * Constructor. Comments are ignored and not returned as tokens 168 * 169 * @param header The header that is tokenized 170 * @param delimiters The delimiters to be used 171 */ 172 public HeaderTokenizer(String header, String delimiters) { 173 this(header, delimiters, true); 174 } 175 176 /** 177 * Constructor. The RFC822 defined delimiters - RFC822 - are 178 * used to delimit ATOMS. Also comments are skipped and not 179 * returned as tokens 180 * @param header The header that is tokenized. 181 */ 182 public HeaderTokenizer(String header) { 183 this(header, RFC822); 184 } 185 186 /** 187 * Parses the next token from this String. <p> 188 * 189 * Clients sit in a loop calling next() to parse successive 190 * tokens until an EOF Token is returned. 191 * 192 * @return the next Token 193 * @exception ParseException if the parse fails 194 */ 195 public Token next() throws ParseException { 196 Token tk; 197 198 currentPos = nextPos; // setup currentPos 199 tk = getNext(); 200 nextPos = peekPos = currentPos; // update currentPos and peekPos 201 return tk; 202 } 203 204 /** 205 * Peek at the next token, without actually removing the token 206 * from the parse stream. Invoking this method multiple times 207 * will return successive tokens, until <code>next()</code> is 208 * called. <p> 209 * 210 * @return the next Token 211 * @exception ParseException if the parse fails 212 */ 213 public Token peek() throws ParseException { 214 Token tk; 215 216 currentPos = peekPos; // setup currentPos 217 tk = getNext(); 218 peekPos = currentPos; // update peekPos 219 return tk; 220 } 221 222 /** 223 * Return the rest of the Header. 224 * 225 * @return String rest of header. null is returned if we are 226 * already at end of header 227 */ 228 public String getRemainder() { 229 return string.substring(nextPos); 230 } 231 232 /* 233 * Return the next token starting from 'currentPos'. After the 234 * parse, 'currentPos' is updated to point to the start of the 235 * next token. 236 */ 237 private Token getNext() throws ParseException { 238 // If we're already at end of string, return EOF 239 if (currentPos >= maxPos) 240 return EOFToken; 241 242 // Skip white-space, position currentPos beyond the space 243 if (skipWhiteSpace() == Token.EOF) 244 return EOFToken; 245 246 char c; 247 int start; 248 boolean filter = false; 249 250 c = string.charAt(currentPos); 251 252 // Check or Skip comments and position currentPos 253 // beyond the comment 254 while (c == '(') { 255 // Parsing comment .. 256 int nesting; 257 for (start = ++currentPos, nesting = 1; 258 nesting > 0 && currentPos < maxPos; 259 currentPos++) { 260 c = string.charAt(currentPos); 261 if (c == '\\') { // Escape sequence 262 currentPos++; // skip the escaped character 263 filter = true; 264 } else if (c == '\r') 265 filter = true; 266 else if (c == '(') 267 nesting++; 268 else if (c == ')') 269 nesting--; 270 } 271 if (nesting != 0) 272 throw new ParseException("Unbalanced comments"); 273 274 if (!skipComments) { 275 // Return the comment, if we are asked to. 276 // Note that the comment start & end markers are ignored. 277 String s; 278 if (filter) // need to go thru the token again. 279 s = filterToken(string, start, currentPos-1); 280 else 281 s = string.substring(start,currentPos-1); 282 283 return new Token(Token.COMMENT, s); 284 } 285 286 // Skip any whitespace after the comment. 287 if (skipWhiteSpace() == Token.EOF) 288 return EOFToken; 289 c = string.charAt(currentPos); 290 } 291 292 // Check for quoted-string and position currentPos 293 // beyond the terminating quote 294 if (c == '"') { 295 for (start = ++currentPos; currentPos < maxPos; currentPos++) { 296 c = string.charAt(currentPos); 297 if (c == '\\') { // Escape sequence 298 currentPos++; 299 filter = true; 300 } else if (c == '\r') 301 filter = true; 302 else if (c == '"') { 303 currentPos++; 304 String s; 305 306 if (filter) 307 s = filterToken(string, start, currentPos-1); 308 else 309 s = string.substring(start,currentPos-1); 310 311 return new Token(Token.QUOTEDSTRING, s); 312 } 313 } 314 throw new ParseException("Unbalanced quoted string"); 315 } 316 317 // Check for SPECIAL or CTL 318 if (c < 040 || c >= 0177 || delimiters.indexOf(c) >= 0) { 319 currentPos++; // re-position currentPos 320 char ch[] = new char[1]; 321 ch[0] = c; 322 return new Token(c, new String(ch)); 323 } 324 325 // Check for ATOM 326 for (start = currentPos; currentPos < maxPos; currentPos++) { 327 c = string.charAt(currentPos); 328 // ATOM is delimited by either SPACE, CTL, "(", <"> 329 // or the specified SPECIALS 330 if (c < 040 || c >= 0177 || c == '(' || c == ' ' || 331 c == '"' || delimiters.indexOf(c) >= 0) 332 break; 333 } 334 return new Token(Token.ATOM, string.substring(start, currentPos)); 335 } 336 337 // Skip SPACE, HT, CR and NL 338 private int skipWhiteSpace() { 339 char c; 340 for (; currentPos < maxPos; currentPos++) 341 if (((c = string.charAt(currentPos)) != ' ') && 342 (c != '\t') && (c != '\r') && (c != '\n')) 343 return currentPos; 344 return Token.EOF; 345 } 346 347 /* Process escape sequences and embedded LWSPs from a comment or 348 * quoted string. 349 */ 350 private static String filterToken(String s, int start, int end) { 351 StringBuilder sb = new StringBuilder(); 352 char c; 353 boolean gotEscape = false; 354 boolean gotCR = false; 355 356 for (int i = start; i < end; i++) { 357 c = s.charAt(i); 358 if (c == '\n' && gotCR) { 359 // This LF is part of an unescaped 360 // CRLF sequence (i.e, LWSP). Skip it. 361 gotCR = false; 362 continue; 363 } 364 365 gotCR = false; 366 if (!gotEscape) { 367 // Previous character was NOT '\' 368 if (c == '\\') // skip this character 369 gotEscape = true; 370 else if (c == '\r') // skip this character 371 gotCR = true; 372 else // append this character 373 sb.append(c); 374 } else { 375 // Previous character was '\'. So no need to 376 // bother with any special processing, just 377 // append this character 378 sb.append(c); 379 gotEscape = false; 380 } 381 } 382 return sb.toString(); 383 } 384} 385