1/*
2 * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26package com.sun.xml.internal.org.jvnet.mimepull;
27
28import java.io.InputStream;
29import java.io.IOException;
30import java.util.*;
31import java.util.logging.Logger;
32import java.nio.ByteBuffer;
33import java.util.logging.Level;
34
35/**
36 * Pull parser for the MIME messages. Applications can use pull API to continue
37 * the parsing MIME messages lazily.
38 *
39 * <pre>
40 * for e.g.:
41 * <p>
42 *
43 * MIMEParser parser = ...
44 * Iterator<MIMEEvent> it = parser.iterator();
45 * while(it.hasNext()) {
46 *   MIMEEvent event = it.next();
47 *   ...
48 * }
49 * </pre>
50 *
51 * @author Jitendra Kotamraju
52 */
53class MIMEParser implements Iterable<MIMEEvent> {
54
55    private static final Logger LOGGER = Logger.getLogger(MIMEParser.class.getName());
56
57    private static final String HEADER_ENCODING = "ISO8859-1";
58
59    // Actually, the grammar doesn't support whitespace characters
60    // after boundary. But the mail implementation checks for it.
61    // We will only check for these many whitespace characters after boundary
62    private static final int NO_LWSP = 1000;
63    private enum STATE {START_MESSAGE, SKIP_PREAMBLE, START_PART, HEADERS, BODY, END_PART, END_MESSAGE}
64    private STATE state = STATE.START_MESSAGE;
65
66    private final InputStream in;
67    private final byte[] bndbytes;
68    private final int bl;
69    private final MIMEConfig config;
70    private final int[] bcs = new int[128]; // BnM algo: Bad Character Shift table
71    private final int[] gss;                // BnM algo : Good Suffix Shift table
72
73    /**
74     * Have we parsed the data from our InputStream yet?
75     */
76    private boolean parsed;
77
78    /*
79     * Read and process body partsList until we see the
80     * terminating boundary line (or EOF).
81     */
82    private boolean done = false;
83
84    private boolean eof;
85    private final int capacity;
86    private byte[] buf;
87    private int len;
88    private boolean bol;        // beginning of the line
89
90    /*
91     * Parses the MIME content. At the EOF, it also closes input stream
92     */
93    MIMEParser(InputStream in, String boundary, MIMEConfig config) {
94        this.in = in;
95        this.bndbytes = getBytes("--"+boundary);
96        bl = bndbytes.length;
97        this.config = config;
98        gss = new int[bl];
99        compileBoundaryPattern();
100
101        // \r\n + boundary + "--\r\n" + lots of LWSP
102        capacity = config.chunkSize+2+bl+4+NO_LWSP;
103        createBuf(capacity);
104    }
105
106    /**
107     * Returns iterator for the parsing events. Use the iterator to advance
108     * the parsing.
109     *
110     * @return iterator for parsing events
111     */
112    @Override
113    public Iterator<MIMEEvent> iterator() {
114        return new MIMEEventIterator();
115    }
116
117    class MIMEEventIterator implements Iterator<MIMEEvent> {
118
119        @Override
120        public boolean hasNext() {
121            return !parsed;
122        }
123
124        @Override
125        public MIMEEvent next() {
126
127            if (parsed) {
128                throw new NoSuchElementException();
129            }
130
131            switch(state) {
132                case START_MESSAGE :
133                    if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.START_MESSAGE);}
134                    state = STATE.SKIP_PREAMBLE;
135                    return MIMEEvent.START_MESSAGE;
136
137                case SKIP_PREAMBLE :
138                    if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.SKIP_PREAMBLE);}
139                    skipPreamble();
140                    // fall through
141                case START_PART :
142                    if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.START_PART);}
143                    state = STATE.HEADERS;
144                    return MIMEEvent.START_PART;
145
146                case HEADERS :
147                    if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.HEADERS);}
148                    InternetHeaders ih = readHeaders();
149                    state = STATE.BODY;
150                    bol = true;
151                    return new MIMEEvent.Headers(ih);
152
153                case BODY :
154                    if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.BODY);}
155                    ByteBuffer buf = readBody();
156                    bol = false;
157                    return new MIMEEvent.Content(buf);
158
159                case END_PART :
160                    if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.END_PART);}
161                    if (done) {
162                        state = STATE.END_MESSAGE;
163                    } else {
164                        state = STATE.START_PART;
165                    }
166                    return MIMEEvent.END_PART;
167
168                case END_MESSAGE :
169                    if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.END_MESSAGE);}
170                    parsed = true;
171                    return MIMEEvent.END_MESSAGE;
172
173                default :
174                    throw new MIMEParsingException("Unknown Parser state = "+state);
175            }
176        }
177
178        @Override
179        public void remove() {
180            throw new UnsupportedOperationException();
181        }
182    }
183
184    /**
185     * Collects the headers for the current part by parsing mesage stream.
186     *
187     * @return headers for the current part
188     */
189    private InternetHeaders readHeaders() {
190        if (!eof) {
191            fillBuf();
192        }
193        return new InternetHeaders(new LineInputStream());
194    }
195
196    /**
197     * Reads and saves the part of the current attachment part's content.
198     * At the end of this method, buf should have the remaining data
199     * at index 0.
200     *
201     * @return a chunk of the part's content
202     *
203     */
204    private ByteBuffer readBody() {
205        if (!eof) {
206            fillBuf();
207        }
208        int start = match(buf, 0, len);     // matches boundary
209        if (start == -1) {
210            // No boundary is found
211            assert eof || len >= config.chunkSize;
212            int chunkSize = eof ? len : config.chunkSize;
213            if (eof) {
214                done = true;
215                throw new MIMEParsingException("Reached EOF, but there is no closing MIME boundary.");
216            }
217            return adjustBuf(chunkSize, len-chunkSize);
218        }
219        // Found boundary.
220        // Is it at the start of a line ?
221        int chunkLen = start;
222        if (bol && start == 0) {
223            // nothing to do
224        } else if (start > 0 && (buf[start-1] == '\n' || buf[start-1] =='\r')) {
225            --chunkLen;
226            if (buf[start-1] == '\n' && start >1 && buf[start-2] == '\r') {
227                --chunkLen;
228            }
229        } else {
230           return adjustBuf(start+1, len-start-1);  // boundary is not at beginning of a line
231        }
232
233        if (start+bl+1 < len && buf[start+bl] == '-' && buf[start+bl+1] == '-') {
234            state = STATE.END_PART;
235            done = true;
236            return adjustBuf(chunkLen, 0);
237        }
238
239        // Consider all the whitespace in boundary+whitespace+"\r\n"
240        int lwsp = 0;
241        for(int i=start+bl; i < len && (buf[i] == ' ' || buf[i] == '\t'); i++) {
242            ++lwsp;
243        }
244
245        // Check for \n or \r\n in boundary+whitespace+"\n" or boundary+whitespace+"\r\n"
246        if (start+bl+lwsp < len && buf[start+bl+lwsp] == '\n') {
247            state = STATE.END_PART;
248            return adjustBuf(chunkLen, len-start-bl-lwsp-1);
249        } else if (start+bl+lwsp+1 < len && buf[start+bl+lwsp] == '\r' && buf[start+bl+lwsp+1] == '\n') {
250            state = STATE.END_PART;
251            return adjustBuf(chunkLen, len-start-bl-lwsp-2);
252        } else if (start+bl+lwsp+1 < len) {
253            return adjustBuf(chunkLen+1, len-chunkLen-1);       // boundary string in a part data
254        } else if (eof) {
255            done = true;
256            throw new MIMEParsingException("Reached EOF, but there is no closing MIME boundary.");
257        }
258
259        // Some more data needed to determine if it is indeed a proper boundary
260        return adjustBuf(chunkLen, len-chunkLen);
261    }
262
263    /**
264     * Returns a chunk from the original buffer. A new buffer is
265     * created with the remaining bytes.
266     *
267     * @param chunkSize create a chunk with these many bytes
268     * @param remaining bytes from the end of the buffer that need to be copied to
269     *        the beginning of the new buffer
270     * @return chunk
271     */
272    private ByteBuffer adjustBuf(int chunkSize, int remaining) {
273        assert buf != null;
274        assert chunkSize >= 0;
275        assert remaining >= 0;
276
277        byte[] temp = buf;
278        // create a new buf and adjust it without this chunk
279        createBuf(remaining);
280        System.arraycopy(temp, len-remaining, buf, 0, remaining);
281        len = remaining;
282
283        return ByteBuffer.wrap(temp, 0, chunkSize);
284    }
285
286    private void createBuf(int min) {
287        buf = new byte[min < capacity ? capacity : min];
288    }
289
290    /**
291     * Skips the preamble to find the first attachment part
292     */
293    private void skipPreamble() {
294
295        while(true) {
296            if (!eof) {
297                fillBuf();
298            }
299            int start = match(buf, 0, len);     // matches boundary
300            if (start == -1) {
301                // No boundary is found
302                if (eof) {
303                    throw new MIMEParsingException("Missing start boundary");
304                } else {
305                    adjustBuf(len-bl+1, bl-1);
306                    continue;
307                }
308            }
309
310            if (start > config.chunkSize) {
311                adjustBuf(start, len-start);
312                continue;
313            }
314            // Consider all the whitespace boundary+whitespace+"\r\n"
315            int lwsp = 0;
316            for(int i=start+bl; i < len && (buf[i] == ' ' || buf[i] == '\t'); i++) {
317                ++lwsp;
318            }
319            // Check for \n or \r\n
320            if (start+bl+lwsp < len && (buf[start+bl+lwsp] == '\n' || buf[start+bl+lwsp] == '\r') ) {
321                if (buf[start+bl+lwsp] == '\n') {
322                    adjustBuf(start+bl+lwsp+1, len-start-bl-lwsp-1);
323                    break;
324                } else if (start+bl+lwsp+1 < len && buf[start+bl+lwsp+1] == '\n') {
325                    adjustBuf(start+bl+lwsp+2, len-start-bl-lwsp-2);
326                    break;
327                }
328            }
329            adjustBuf(start+1, len-start-1);
330        }
331        if (LOGGER.isLoggable(Level.FINE)) {LOGGER.log(Level.FINE, "Skipped the preamble. buffer len={0}", len);}
332    }
333
334    private static byte[] getBytes(String s) {
335        char [] chars= s.toCharArray();
336        int size = chars.length;
337        byte[] bytes = new byte[size];
338
339        for (int i = 0; i < size;) {
340            bytes[i] = (byte) chars[i++];
341        }
342        return bytes;
343    }
344
345        /**
346     * Boyer-Moore search method. Copied from java.util.regex.Pattern.java
347     *
348     * Pre calculates arrays needed to generate the bad character
349     * shift and the good suffix shift. Only the last seven bits
350     * are used to see if chars match; This keeps the tables small
351     * and covers the heavily used ASCII range, but occasionally
352     * results in an aliased match for the bad character shift.
353     */
354    private void compileBoundaryPattern() {
355        int i, j;
356
357        // Precalculate part of the bad character shift
358        // It is a table for where in the pattern each
359        // lower 7-bit value occurs
360        for (i = 0; i < bndbytes.length; i++) {
361            bcs[bndbytes[i]&0x7F] = i + 1;
362        }
363
364        // Precalculate the good suffix shift
365        // i is the shift amount being considered
366NEXT:   for (i = bndbytes.length; i > 0; i--) {
367            // j is the beginning index of suffix being considered
368            for (j = bndbytes.length - 1; j >= i; j--) {
369                // Testing for good suffix
370                if (bndbytes[j] == bndbytes[j-i]) {
371                    // src[j..len] is a good suffix
372                    gss[j-1] = i;
373                } else {
374                    // No match. The array has already been
375                    // filled up with correct values before.
376                    continue NEXT;
377                }
378            }
379            // This fills up the remaining of optoSft
380            // any suffix can not have larger shift amount
381            // then its sub-suffix. Why???
382            while (j > 0) {
383                gss[--j] = i;
384            }
385        }
386        // Set the guard value because of unicode compression
387        gss[bndbytes.length -1] = 1;
388    }
389
390    /**
391     * Finds the boundary in the given buffer using Boyer-Moore algo.
392     * Copied from java.util.regex.Pattern.java
393     *
394     * @param mybuf boundary to be searched in this mybuf
395     * @param off start index in mybuf
396     * @param len number of bytes in mybuf
397     *
398     * @return -1 if there is no match or index where the match starts
399     */
400    private int match(byte[] mybuf, int off, int len) {
401        int last = len - bndbytes.length;
402
403        // Loop over all possible match positions in text
404NEXT:   while (off <= last) {
405            // Loop over pattern from right to left
406            for (int j = bndbytes.length - 1; j >= 0; j--) {
407                byte ch = mybuf[off+j];
408                if (ch != bndbytes[j]) {
409                    // Shift search to the right by the maximum of the
410                    // bad character shift and the good suffix shift
411                    off += Math.max(j + 1 - bcs[ch&0x7F], gss[j]);
412                    continue NEXT;
413                }
414            }
415            // Entire pattern matched starting at off
416            return off;
417        }
418        return -1;
419    }
420
421    /**
422     * Fills the remaining buf to the full capacity
423     */
424    private void fillBuf() {
425        if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "Before fillBuf() buffer len={0}", len);}
426        assert !eof;
427        while(len < buf.length) {
428            int read;
429            try {
430                read = in.read(buf, len, buf.length-len);
431            } catch(IOException ioe) {
432                throw new MIMEParsingException(ioe);
433            }
434            if (read == -1) {
435                eof = true;
436                try {
437                    if (LOGGER.isLoggable(Level.FINE)) {LOGGER.fine("Closing the input stream.");}
438                    in.close();
439                } catch(IOException ioe) {
440                    throw new MIMEParsingException(ioe);
441                }
442                break;
443            } else {
444                len += read;
445            }
446        }
447        if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "After fillBuf() buffer len={0}", len);}
448    }
449
450    private void doubleBuf() {
451        byte[] temp = new byte[2*len];
452        System.arraycopy(buf, 0, temp, 0, len);
453        buf = temp;
454        if (!eof) {
455            fillBuf();
456        }
457    }
458
459    class LineInputStream {
460        private int offset;
461
462        /*
463         * Read a line containing only ASCII characters from the input
464         * stream. A line is terminated by a CR or NL or CR-NL sequence.
465         * A common error is a CR-CR-NL sequence, which will also terminate
466         * a line.
467         * The line terminator is not returned as part of the returned
468         * String. Returns null if no data is available. <p>
469         *
470         * This class is similar to the deprecated
471         * <code>DataInputStream.readLine()</code>
472         */
473        public String readLine() throws IOException {
474
475            int hdrLen = 0;
476            int lwsp = 0;
477            while(offset+hdrLen < len) {
478                if (buf[offset+hdrLen] == '\n') {
479                    lwsp = 1;
480                    break;
481                }
482                if (offset+hdrLen+1 == len) {
483                    doubleBuf();
484                }
485                if (offset+hdrLen+1 >= len) {   // No more data in the stream
486                    assert eof;
487                    return null;
488                }
489                if (buf[offset+hdrLen] == '\r' && buf[offset+hdrLen+1] == '\n') {
490                    lwsp = 2;
491                    break;
492                }
493                ++hdrLen;
494            }
495            if (hdrLen == 0) {
496                adjustBuf(offset+lwsp, len-offset-lwsp);
497                return null;
498            }
499
500            String hdr = new String(buf, offset, hdrLen, HEADER_ENCODING);
501            offset += hdrLen+lwsp;
502            return hdr;
503        }
504
505    }
506
507}
508