reader/internalizer/DOMForest.java

/*
 * Copyright (c) 1997, 2017, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package com.sun.tools.internal.xjc.reader.internalizer;

import com.sun.istack.internal.NotNull;
import com.sun.istack.internal.XMLStreamReaderToContentHandler;
import com.sun.tools.internal.xjc.ErrorReceiver;
import com.sun.tools.internal.xjc.Options;
import com.sun.tools.internal.xjc.reader.Const;
import com.sun.tools.internal.xjc.reader.xmlschema.parser.SchemaConstraintChecker;
import com.sun.tools.internal.xjc.util.ErrorReceiverFilter;
import com.sun.xml.internal.bind.marshaller.DataWriter;
import com.sun.xml.internal.bind.v2.util.XmlFactory;
import com.sun.xml.internal.xsom.parser.JAXPParser;
import com.sun.xml.internal.xsom.parser.XMLParser;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.xml.sax.*;
import org.xml.sax.helpers.XMLFilterImpl;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.sax.SAXResult;
import javax.xml.transform.sax.SAXSource;
import javax.xml.validation.SchemaFactory;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.util.*;

import static com.sun.xml.internal.bind.v2.util.XmlFactory.allowExternalAccess;
import static javax.xml.XMLConstants.W3C_XML_SCHEMA_NS_URI;


/**
 * Builds a DOM forest and maintains association from
 * system IDs to DOM trees.
 *
 * <p>
 * A forest is a transitive reflexive closure of referenced documents.
 * IOW, if a document is in a forest, all the documents referenced from
 * it is in a forest, too. To support this semantics, {@link DOMForest}
 * uses {@link InternalizationLogic} to find referenced documents.
 *
 * <p>
 * Some documents are marked as "root"s, meaning those documents were
 * put into a forest explicitly, not because it is referenced from another
 * document. (However, a root document can be referenced from other
 * documents, too.)
 *
 * @author
 *     Kohsuke Kawaguchi (kohsuke.kawaguchi@sun.com)
 */
public final class DOMForest {
    /** actual data storage {@code map<SystemId,Document>}. */
    private final Map<String,Document> core = new LinkedHashMap<>();

    /**
     * To correctly feed documents to a schema parser, we need to remember
     * which documents (of the forest) were given as the root
     * documents, and which of them are read as included/imported
     * documents.
     *
     * <p>
     * Set of system ids as strings.
     */
    private final Set<String> rootDocuments = new LinkedHashSet<String>();

    /** Stores location information for all the trees in this forest. */
    public final LocatorTable locatorTable = new LocatorTable();

    /** Stores all the outer-most {@code <jaxb:bindings>} customizations. */
    public final Set<Element> outerMostBindings = new HashSet<Element>();

    /** Used to resolve references to other schema documents. */
    private EntityResolver entityResolver = null;

    /** Errors encountered during the parsing will be sent to this object. */
    private ErrorReceiver errorReceiver = null;

    /** Schema language dependent part of the processing. */
    protected final InternalizationLogic logic;

    private final SAXParserFactory parserFactory;
    private final DocumentBuilder documentBuilder;

    private final Options options;

    public DOMForest(
        SAXParserFactory parserFactory, DocumentBuilder documentBuilder,
        InternalizationLogic logic ) {

        this.parserFactory = parserFactory;
        this.documentBuilder = documentBuilder;
        this.logic = logic;
        this.options = null;
    }

    public DOMForest( InternalizationLogic logic, Options opt ) {

        if (opt == null) throw new AssertionError("Options object null");
        this.options = opt;

        try {
            DocumentBuilderFactory dbf = XmlFactory.createDocumentBuilderFactory(opt.disableXmlSecurity);
            this.documentBuilder = dbf.newDocumentBuilder();
            this.parserFactory = XmlFactory.createParserFactory(opt.disableXmlSecurity);
        } catch( ParserConfigurationException e ) {
            throw new AssertionError(e);
        }

        this.logic = logic;
    }

    /**
     * Gets the DOM tree associated with the specified system ID,
     * or null if none is found.
     */
    public Document get( String systemId ) {
        Document doc = core.get(systemId);

        if( doc==null && systemId.startsWith("file:/") && !systemId.startsWith("file://") ) {
            // As of JDK1.4, java.net.URL.toExternal method returns URLs like
            // "file:/abc/def/ghi" which is an incorrect file protocol URL according to RFC1738.
            // Some other correctly functioning parts return the correct URLs ("file:///abc/def/ghi"),
            // and this descripancy breaks DOM look up by system ID.

            // this extra check solves this problem.
            doc = core.get( "file://"+systemId.substring(5) );
        }

        if( doc==null && systemId.startsWith("file:") ) {
            // on Windows, filenames are case insensitive.
            // perform case-insensitive search for improved user experience
            String systemPath = getPath(systemId);
            for (String key : core.keySet()) {
                if(key.startsWith("file:") && getPath(key).equalsIgnoreCase(systemPath)) {
                    doc = core.get(key);
                    break;
                }
            }
        }

        return doc;
    }

    /**
     * Strips off the leading 'file:///' portion from an URL.
     */
    private String getPath(String key) {
        key = key.substring(5); // skip 'file:'
        while(key.length()>0 && key.charAt(0)=='/') {
            key = key.substring(1);
        }
        return key;
    }

    /**
     * Returns a read-only set of root document system IDs.
     */
    public Set<String> getRootDocuments() {
        return Collections.unmodifiableSet(rootDocuments);
    }

    /**
     * Picks one document at random and returns it.
     */
    public Document getOneDocument() {
        for (Document dom : core.values()) {
            if (!dom.getDocumentElement().getNamespaceURI().equals(Const.JAXB_NSURI))
                return dom;
        }
        // we should have caught this error very early on
        throw new AssertionError();
    }

    /**
     * Checks the correctness of the XML Schema documents and return true
     * if it's OK.
     *
     * <p>
     * This method performs a weaker version of the tests where error messages
     * are provided without line number information. So whenever possible
     * use {@link SchemaConstraintChecker}.
     *
     * @see SchemaConstraintChecker
     */
    public boolean checkSchemaCorrectness(ErrorReceiver errorHandler) {
        try {
            boolean disableXmlSecurity = false;
            if (options != null) {
                disableXmlSecurity = options.disableXmlSecurity;
            }
            SchemaFactory sf = XmlFactory.createSchemaFactory(W3C_XML_SCHEMA_NS_URI, disableXmlSecurity);
            ErrorReceiverFilter filter = new ErrorReceiverFilter(errorHandler);
            sf.setErrorHandler(filter);
            Set<String> roots = getRootDocuments();
            Source[] sources = new Source[roots.size()];
            int i=0;
            for (String root : roots) {
                sources[i++] = new DOMSource(get(root),root);
            }
            sf.newSchema(sources);
            return !filter.hadError();
        } catch (SAXException e) {
            // the errors should have been reported
            return false;
        }
    }

    /**
     * Gets the system ID from which the given DOM is parsed.
     * <p>
     * Poor-man's base URI.
     */
    public String getSystemId( Document dom ) {
        for (Map.Entry<String,Document> e : core.entrySet()) {
            if (e.getValue() == dom)
                return e.getKey();
        }
        return null;
    }

    public Document parse( InputSource source, boolean root ) throws SAXException {
        if( source.getSystemId()==null )
            throw new IllegalArgumentException();

        return parse( source.getSystemId(), source, root );
    }

    /**
     * Parses an XML at the given location (
     * and XMLs referenced by it) into DOM trees
     * and stores them to this forest.
     *
     * @return the parsed DOM document object.
     */
    public Document parse( String systemId, boolean root ) throws SAXException, IOException {

        systemId = Options.normalizeSystemId(systemId);

        if( core.containsKey(systemId) )
            // this document has already been parsed. Just ignore.
            return core.get(systemId);

        InputSource is=null;

        // allow entity resolver to find the actual byte stream.
        if( entityResolver!=null )
            is = entityResolver.resolveEntity(null,systemId);
        if( is==null )
            is = new InputSource(systemId);

        // but we still use the original system Id as the key.
        return parse( systemId, is, root );
    }

    /**
     * Returns a {@link ContentHandler} to feed SAX events into.
     *
     * <p>
     * The client of this class can feed SAX events into the handler
     * to parse a document into this DOM forest.
     *
     * This version requires that the DOM object to be created and registered
     * to the map beforehand.
     */
    private ContentHandler getParserHandler( Document dom ) {
        ContentHandler handler = new DOMBuilder(dom,locatorTable,outerMostBindings);
        handler = new WhitespaceStripper(handler,errorReceiver,entityResolver);
        handler = new VersionChecker(handler,errorReceiver,entityResolver);

        // insert the reference finder so that
        // included/imported schemas will be also parsed
        XMLFilterImpl f = logic.createExternalReferenceFinder(this);
        f.setContentHandler(handler);

        if(errorReceiver!=null)
            f.setErrorHandler(errorReceiver);
        if(entityResolver!=null)
            f.setEntityResolver(entityResolver);

        return f;
    }

    public interface Handler extends ContentHandler {
        /**
         * Gets the DOM that was built.
         */
        public Document getDocument();
    }

    private static abstract class HandlerImpl extends XMLFilterImpl implements Handler {
    }

    /**
     * Returns a {@link ContentHandler} to feed SAX events into.
     *
     * <p>
     * The client of this class can feed SAX events into the handler
     * to parse a document into this DOM forest.
     */
    public Handler getParserHandler( String systemId, boolean root ) {
        final Document dom = documentBuilder.newDocument();
        core.put( systemId, dom );
        if(root)
            rootDocuments.add(systemId);

        ContentHandler handler = getParserHandler(dom);

        // we will register the DOM to the map once the system ID becomes available.
        // but the SAX allows the event source to not to provide that information,
        // so be prepared for such case.
        HandlerImpl x = new HandlerImpl() {
            public Document getDocument() {
                return dom;
            }
        };
        x.setContentHandler(handler);

        return x;
   }

    /**
     * Parses the given document and add it to the DOM forest.
     *
     * @return
     *      null if there was a parse error. otherwise non-null.
     */
    public Document parse( String systemId, InputSource inputSource, boolean root ) throws SAXException {
        Document dom = documentBuilder.newDocument();

        systemId = Options.normalizeSystemId(systemId);

        // put into the map before growing a tree, to
        // prevent recursive reference from causing infinite loop.
        core.put( systemId, dom );
        if(root)
            rootDocuments.add(systemId);

        try {
            XMLReader reader = parserFactory.newSAXParser().getXMLReader();
            reader.setContentHandler(getParserHandler(dom));
            if(errorReceiver!=null)
                reader.setErrorHandler(errorReceiver);
            if(entityResolver!=null)
                reader.setEntityResolver(entityResolver);
            reader.parse(inputSource);
        } catch( ParserConfigurationException e ) {
            // in practice, this exception won't happen.
            errorReceiver.error(e.getMessage(),e);
            core.remove(systemId);
            rootDocuments.remove(systemId);
            return null;
        } catch( IOException e ) {
            errorReceiver.error(Messages.format(Messages.DOMFOREST_INPUTSOURCE_IOEXCEPTION, systemId, e.toString()),e);
            core.remove(systemId);
            rootDocuments.remove(systemId);
            return null;
        }

        return dom;
    }

    public Document parse( String systemId, XMLStreamReader parser, boolean root ) throws XMLStreamException {
        Document dom = documentBuilder.newDocument();

        systemId = Options.normalizeSystemId(systemId);

        if(root)
            rootDocuments.add(systemId);

        if(systemId==null)
            throw new IllegalArgumentException("system id cannot be null");
        core.put( systemId, dom );

        new XMLStreamReaderToContentHandler(parser,getParserHandler(dom),false,false).bridge();

        return dom;
    }

    /**
     * Performs internalization.
     *
     * This method should be called only once, only after all the
     * schemas are parsed.
     *
     * @return
     *      the returned bindings need to be applied after schema
     *      components are built.
     */
    public SCDBasedBindingSet transform(boolean enableSCD) {
        return Internalizer.transform(this, enableSCD, options.disableXmlSecurity);
    }

    /**
     * Performs the schema correctness check by using JAXP 1.3.
     *
     * <p>
     * This is "weak", because {@link SchemaFactory#newSchema(Source[])}
     * doesn't handle inclusions very correctly (it ends up parsing it
     * from its original source, not in this tree), and because
     * it doesn't handle two documents for the same namespace very
     * well.
     *
     * <p>
     * We should eventually fix JAXP (and Xerces), but meanwhile
     * this weaker and potentially wrong correctness check is still
     * better than nothing when used inside JAX-WS (JAXB CLI and Ant
     * does a better job of checking this.)
     *
     * <p>
     * To receive errors, use {@link SchemaFactory#setErrorHandler(ErrorHandler)}.
     */
    public void weakSchemaCorrectnessCheck(SchemaFactory sf) {
        List<SAXSource> sources = new ArrayList<SAXSource>();
        for( String systemId : getRootDocuments() ) {
            Document dom = get(systemId);
            if (dom.getDocumentElement().getNamespaceURI().equals(Const.JAXB_NSURI))
                continue;   // this isn't a schema. we have to do a negative check because if we see completely unrelated ns, we want to report that as an error

            SAXSource ss = createSAXSource(systemId);
            try {
                ss.getXMLReader().setFeature("http://xml.org/sax/features/namespace-prefixes",true);
            } catch (SAXException e) {
                throw new AssertionError(e);    // Xerces wants this. See 6395322.
            }
            sources.add(ss);
        }

        try {
            allowExternalAccess(sf, "file,http", options.disableXmlSecurity).newSchema(sources.toArray(new SAXSource[0]));
        } catch (SAXException e) {
            // error should have been reported.
        } catch (RuntimeException re) {
            // JAXP RI isn't very trustworthy when it comes to schema error check,
            // and we know some cases where it just dies with NPE. So handle it gracefully.
            // this masks a bug in the JAXP RI, but we need a release that we have to make.
            try {
                sf.getErrorHandler().warning(
                    new SAXParseException(Messages.format(
                        Messages.ERR_GENERAL_SCHEMA_CORRECTNESS_ERROR,re.getMessage()),
                        null,null,-1,-1,re));
            } catch (SAXException e) {
                // ignore
            }
        }
    }

    /**
     * Creates a {@link SAXSource} that, when parsed, reads from this {@link DOMForest}
     * (instead of parsing the original source identified by the system ID.)
     */
    public @NotNull SAXSource createSAXSource(String systemId) {
        ContentHandlerNamespacePrefixAdapter reader = new ContentHandlerNamespacePrefixAdapter(new XMLFilterImpl() {
            // XMLReader that uses XMLParser to parse. We need to use XMLFilter to indrect
            // handlers, since SAX allows handlers to be changed while parsing.
            @Override
            public void parse(InputSource input) throws SAXException, IOException {
                createParser().parse(input, this, this, this);
            }

            @Override
            public void parse(String systemId) throws SAXException, IOException {
                parse(new InputSource(systemId));
            }
        });

        return new SAXSource(reader,new InputSource(systemId));
    }

    /**
     * Creates {@link XMLParser} for XSOM which reads documents from
     * this DOMForest rather than doing a fresh parse.
     *
     * The net effect is that XSOM will read transformed XML Schemas
     * instead of the original documents.
     */
    public XMLParser createParser() {
        return new DOMForestParser(this, new JAXPParser(XmlFactory.createParserFactory(options.disableXmlSecurity)));
    }

    public EntityResolver getEntityResolver() {
        return entityResolver;
    }

    public void setEntityResolver(EntityResolver entityResolver) {
        this.entityResolver = entityResolver;
    }

    public ErrorReceiver getErrorHandler() {
        return errorReceiver;
    }

    public void setErrorHandler(ErrorReceiver errorHandler) {
        this.errorReceiver = errorHandler;
    }

    /**
     * Gets all the parsed documents.
     */
    public Document[] listDocuments() {
        return core.values().toArray(new Document[core.size()]);
    }

    /**
     * Gets all the system IDs of the documents.
     */
    public String[] listSystemIDs() {
        return core.keySet().toArray(new String[core.keySet().size()]);
    }

    /**
     * Dumps the contents of the forest to the specified stream.
     *
     * This is a debug method. As such, error handling is sloppy.
     */
    @SuppressWarnings("CallToThreadDumpStack")
    public void dump( OutputStream out ) throws IOException {
        try {
            // create identity transformer
            boolean disableXmlSecurity = false;
            if (options != null) {
                disableXmlSecurity = options.disableXmlSecurity;
            }
            TransformerFactory tf = XmlFactory.createTransformerFactory(disableXmlSecurity);
            Transformer it = tf.newTransformer();

            for (Map.Entry<String, Document> e : core.entrySet()) {
                out.write( ("---<< "+e.getKey()+'\n').getBytes() );

                DataWriter dw = new DataWriter(new OutputStreamWriter(out),null);
                dw.setIndentStep("  ");
                it.transform( new DOMSource(e.getValue()),
                    new SAXResult(dw));

                out.write( "\n\n\n".getBytes() );
            }
        } catch( TransformerException e ) {
            e.printStackTrace();
        }
    }
}