cogent/regression/testspec.py

#!/usr/bin/python
#
# Copyright 2014, NICTA
#
# This software may be distributed and modified according to the terms of
# the BSD 2-Clause license. Note that NO WARRANTY is provided.
# See "LICENSE_BSD2.txt" for details.
#
# @TAG(NICTA_BSD)
#

import os
import copy
import heapq
import glob
import subprocess
import itertools
import argparse
import sys

from lxml import etree

REGRESSION_DIR = os.path.dirname(os.path.realpath(__file__))
REGRESSION_DTD = os.path.join(REGRESSION_DIR, "regression.dtd")

class TestSpecParseException(Exception):
    pass

class TestEnv():
    def __init__(self, pwd):
        self.pwd = pwd
        self.cwd = "."
        self.timeout = 0
        self.depends = set()

class Test():
    def __init__(self, name, command, timeout=0, cwd="", depends=None):
        self.name = name
        self.command = command
        self.timeout = timeout
        self.cwd = cwd

        if depends == None:
            depends = set([])
        self.depends = depends

def parse_attributes(tag, env, strict=True):
    """Parse attributes such as "timeout" in the given XML tag,
    updating the given "env" to reflect them."""
    if tag.get("timeout"):
        try:
            env.timeout = int(tag.get("timeout"))
        except:
            if strict:
                raise
    if tag.get("cwd"):
        env.cwd = tag.get("cwd")
    if tag.get("depends"):
        env.depends |= set(tag.get("depends").split())

def parse_test(doc, env, strict=True):
    """Parse a <test> tag."""
    env = copy.deepcopy(env)
    parse_attributes(doc, env, strict=strict)
    return Test(doc.get("name"), doc.text.strip(),
            timeout=env.timeout,
            cwd=os.path.normpath(os.path.join(env.pwd, env.cwd)),
            depends=env.depends)

def parse_sequence(doc, env, strict=True):
    # Create a copy of env so that the scope is restored.
    env = copy.deepcopy(env)

    # Parse attributes.
    parse_attributes(doc, env)

    # Parse children.
    tests = []
    for child in doc:
        if child.tag == "set":
            # Parse set, recording dependencies of the tests inside the set.
            new_tests = parse_set(child, env, strict=strict)
            for x in new_tests:
                env.depends.add(x.name)
            tests += new_tests
        elif child.tag == "sequence":
            # Parse sequence, recording dependencies of the tests inside the set.
            new_tests = parse_sequence(child, env, strict=strict)
            for x in new_tests:
                env.depends.add(x.name)
            tests += new_tests
        elif child.tag == "test":
            tests.append(parse_test(child, env, strict=strict))
            env.depends.add(tests[-1].name)
        elif strict:
            raise TestSpecParseException("Unknown tag '%s'" % child.tag)

    return tests

def parse_set(doc, env, strict=True):
    # Create a copy of env so that the scope is restored.
    env = copy.deepcopy(env)

    # Parse attributes.
    parse_attributes(doc, env, strict=strict)

    # Parse children.
    tests = []
    for child in doc:
        if child.tag == "set":
            tests += parse_set(child, env, strict=strict)
        elif child.tag == "sequence":
            tests += parse_sequence(child, env, strict=strict)
        elif child.tag == "test":
            tests.append(parse_test(child, env, strict=strict))
        elif strict:
            raise TestSpecParseException("Unknown tag '%s'" % child.tag)

    return tests

def find_cycle(keys, depends_on):
    """Find the shortest cycle in the input graph. Unnecessarily O(n**2)."""
    def dfs(n):
        safe = set()
        active = set()
        def do_dfs(n):
            if n in safe:
                return None
            if n in active:
                return [n]
            active.add(n)
            for c in depends_on(n):
                x = do_dfs(c)
                if x != None:
                    return [n] + x
            active.discard(n)
            safe.add(n)
        return do_dfs(n)
    shortest_cycle = None
    for i in keys:
        x = dfs(i)
        if x != None and (shortest_cycle == None
                or len(x) < len(shortest_cycle)):
            shortest_cycle = x
    return shortest_cycle

def toposort(keys, prio, depends_on):
    """topological sort of keys.

    Perform a toposort for keys, trying to order elements by the priority
    returned by function "prio" as closely as possible without breaking
    dependencies.
    """
    #
    # We start by creating a dictionary of which tests are dependent on others,
    # and then how many outstanding dependencies each test has.
    #
    # Instead of using "dependents" and "dependencies", we use "parents" and
    # "children". A parent must be processed before its child.
    #
    keys = sorted(keys, key=prio)
    children = {}
    num_parents = {}
    for key in keys:
        num_parents[key] = len(depends_on(key))
        for parent in depends_on(key):
            children.setdefault(parent, set()).add(key)

    #
    # Generate heap of tests without a parent, and keep popping off
    # the heap and processing the tests.
    #
    final_order = []
    parentless = sorted([(prio(k), k) for k in keys if num_parents[k] == 0])
    while len(parentless) > 0:
        (p, k) = heapq.heappop(parentless)
        final_order.append(k)
        for s in children.get(k, []):
            num_parents[s] -= 1
            if num_parents[s] == 0:
                heapq.heappush(parentless, (prio(s), s))

    # Ensure we saw everybody. If we didn't, there is a cycle.
    if len(keys) != len(final_order):
        shortest_cycle = find_cycle(keys, depends_on)
        raise ValueError("Circular dependency involving: %s" %
                (" -> ".join(shortest_cycle)))

    return final_order

def validate_xml(filename):
    """Ensure the XML matches the regression DTD."""

    # Read in the DTD
    with open(REGRESSION_DTD) as dtd_file:
        dtd = etree.DTD(dtd_file)

    # Parse the file, and validate against the DTD.
    parser = etree.XMLParser(remove_comments=True)
    doc = etree.parse(filename, parser=parser)
    if not dtd.validate(doc):
        raise Exception(
                "%s does not validate against DTD:\n\n" % filename
                + str(dtd.error_log))

def parse_testsuite_xml(filename, strict=True):

    # Validate the XML if requested.
    if strict:
        validate_xml(filename)

    # Parse the file. We try to keep reading broken XML. If "strict" is false,
    # keep trying to parse over broken XML.
    parser = etree.XMLParser(remove_comments=True, recover=(not strict))
    doc = etree.parse(filename, parser=parser).getroot()

    # Setup an empty environment
    env = TestEnv(os.path.dirname(filename))

    # Parse this tag as a set of tests.
    return parse_set(doc, env, strict=strict)

def process_tests(tests, strict=False):
    """Given a list of tests (possibly from multiple XML file), check for
    errors and return a list of tests in dependency-satisfying order."""

    # Check for duplicate names.
    seen_names = set()
    for t in tests:
        if t.name in seen_names:
            if strict:
                raise TestSpecParseException("Duplicate test name detected: %s" % t.name)
            for x in itertools.count(2):
                proposed_name = "%s_%d" % (t.name, x)
                if not proposed_name in seen_names:
                    t.name = proposed_name
                    break
        seen_names.add(t.name)

    # Check dependencies.
    valid_names = set()
    for test in tests:
        valid_names.add(test.name)
    for test in tests:
        test_depends = sorted(test.depends)
        for dependency_name in test_depends:
            if not dependency_name in valid_names:
                if strict:
                    raise TestSpecParseException(
                            "Depedency '%s' invalid." % dependency_name)
                test.depends.remove(dependency_name)

   # Toposort.
    test_ordering = {}
    for (n, t) in enumerate(tests):
        test_ordering[t.name] = n
    test_depends = {}
    for t in tests:
        test_depends[t.name] = t.depends
    try:
        ordering = toposort([t.name for t in tests],
                lambda x: test_ordering[x],
                lambda x: test_depends[x])
    except ValueError, e:
        if strict:
            raise TestSpecParseException(
                    "Cycle in dependencies: %s" % e.message)
        else:
            # There is a cycle, but we want to continue anyway.
            # Just ignore all deps and hope for the best.
            ordering = dict((t, n) for (n, t)
                    in enumerate(sorted([t.name for t in tests])))
    ordering = dict((t, n) for (n, t) in enumerate(ordering))
    tests = sorted(tests, key=lambda k: ordering[k.name])

    return tests

def legacy_testspec(root):
    """Find tests inside makefiles."""

    # Find candidate "IsaMakefile"s
    candidates = sorted(
        glob.glob(os.path.join(root, "*", "IsaMakefile"))
        + glob.glob(os.path.join(root, "*", "*", "IsaMakefile")))

    # Get isabelle binary.
    isabelle_bin = os.path.abspath(os.path.join(root, "isabelle", "bin", "isabelle"))

    # Run "isabelle make report-regression" on each.
    def report_regression(filename):
        filename = os.path.abspath(filename)
        base_name = os.path.split(os.path.dirname(filename))[1]
        try:
            with open("/dev/null", "w") as devnull:
                results = subprocess.check_output(
                    [isabelle_bin, "make", "-f", filename, "report-regression"],
                    cwd=os.path.dirname(filename),
                    stderr=devnull)
            return [(base_name + "/" + x, x) for x in results.strip().split()]
        except subprocess.CalledProcessError:
            return []

    # Search for tests.
    tests = []
    for candidate in candidates:
        targets = report_regression(os.path.abspath(candidate))
        for (name, target) in targets:
            new_test = Test(name, "isabelle make " + target, timeout=4*3600,
                        cwd=os.path.dirname(os.path.abspath(candidate)))
            tests.append(new_test)
    return tests

def parse_test_files(xml_files, strict=False):
    tests = []
    seen_files = set()
    for x in xml_files:
        # Some files may be symlinked; don't process them multiple times.
        try:
            st = os.stat(x)
            if (st.st_dev, st.st_ino) in seen_files:
                continue
            seen_files.add((st.st_dev, st.st_ino))
        except OSError:
            pass

        try:
            tests += parse_testsuite_xml(x)
        except:
            sys.stderr.write("Exception while parsing file: %s.\n" % x)
            if strict:
                raise
    return process_tests(tests, strict=strict)

def main():
    # Parse arguments
    parser = argparse.ArgumentParser(description="Regression Framework Testspec Parser")
    parser.add_argument("file", metavar="FILE", type=str, nargs="*",
            help="a regression XML file to parse")
    parser.add_argument("-r", "--relax", action="store_false", dest="strict",
            help="be less strict when parsing XML files")
    parser.add_argument("-l", "--legacy", action="store_true",
            help="use legacy 'IsaMakefile' specs")
    args = parser.parse_args()

    # Ensure we are either in legacy more or we have at least one file.
    if not args.legacy and len(args.file) == 0:
        parser.error("Please provide at least one XML file.")
    if args.legacy and len(args.file) > 0:
        parser.error("Can not use both legacy mode and XML files.")

    if args.legacy:
        # Fetch legacy tests.
        tests = legacy_testspec(os.getcwd())
    else:
        # Fetch XML tests.
        tests = parse_test_files(args.file, strict=args.strict)

    # Print results
    for test in tests:
        print("\"%s\" [timeout=%d, parents=%s, cwd=%s]" % (
            test.command, test.timeout, ",".join(test.depends), test.cwd))


if __name__ == "__main__":
    main()