1#!/usr/bin/env python3
2#
3#  Copyright 2020, Data61
4#  Commonwealth Scientific and Industrial Research Organisation (CSIRO)
5#  ABN 41 687 119 230.
6#
7#  This software may be distributed and modified according to the terms of
8#  the BSD 2-Clause license. Note that NO WARRANTY is provided.
9#  See "LICENSE_BSD2.txt" for details.
10#
11#  @TAG(DATA61_BSD)
12import argparse
13import bs4
14import functools
15import re
16import sys
17
# Name of the tag whose text content print_tag re-escapes by hand instead of
# dumping the parsed markup verbatim.
SYS_OUT = 'system-out'

# Outermost tag of the report; main() discards everything outside it.
TOP_TAG = 'testsuite'

# Characters with a special meaning in XML, mapped to their entity references.
XML_SPECIAL_CHARS = {
    '<': "&lt;",
    '&': "&amp;",
    '>': "&gt;",
    '"': "&quot;",
    "'": "&apos;",
}

# Tags that are allowed through to the output. The value says whether the
# tag's inner content is emitted (True) or only the tag itself, its
# attributes, and its whitelisted children (False).
TAG_WHITELIST = {
    'error': True,
    'failure': True,
    'testsuite': False,
    'testcase': False,
    SYS_OUT: True,
}
31
32
def print_tag(f, tag):
    """Write a cleaned-up rendering of `tag` to the file object `f`.

    Tags whose name is not in TAG_WHITELIST are dropped together with their
    whole subtree. For whitelisted tags the TAG_WHITELIST value selects the
    rendering:

    * True:  the tag is emitted with its content. SYS_OUT tags are rebuilt
      from their plain text with XML special characters escaped; any other
      tag is dumped exactly as BeautifulSoup serialises it.
    * False: only the opening tag (with its attributes) and the closing tag
      are emitted, and the function recurses into the child tags.

    Args:
        f: writable text file object receiving the output.
        tag: the bs4.element.Tag to render.

    Raises:
        TypeError: if `tag` is not a bs4.element.Tag.
    """
    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if not isinstance(tag, bs4.element.Tag):
        raise TypeError('expected a bs4 Tag, got %s' % type(tag).__name__)

    # Skip non-whitelisted tags.
    if tag.name not in TAG_WHITELIST:
        return

    # One translation table serves both text and attribute escaping.
    escapes = str.maketrans(XML_SPECIAL_CHARS)

    # If we want the inner text, just blindly dump the soup.
    if TAG_WHITELIST[tag.name]:
        if tag.name != SYS_OUT:
            print(tag, file=f)
        else:
            print('<%s>' % tag.name, file=f)
            f.write(tag.get_text().translate(escapes))
            print('</%s>' % tag.name, file=f)
        return

    # BeautifulSoup hands back attribute values with entities already decoded,
    # so they must be escaped again or a value containing '&', '<' or '"'
    # would produce malformed XML.
    rendered_attrs = []
    for key, value in tag.attrs.items():
        if isinstance(value, (list, tuple)):
            # Multi-valued attributes are stored as lists; emit them
            # space-separated instead of as a Python list repr.
            value = ' '.join(value)
        rendered_attrs.append('%s="%s"' % (key, str(value).translate(escapes)))

    print('<%(name)s %(attrs)s>' % {
        'name': tag.name,
        'attrs': ' '.join(rendered_attrs),
    }, file=f)

    # Recurse for our children; bare strings between the tags are ignored.
    for child in tag.children:
        if isinstance(child, bs4.element.Tag):
            print_tag(f, child)

    print('</%s>' % tag.name, file=f)
64
65
def main():
    """Extract the test report from noisy sel4test output and re-emit it cleanly.

    Reads the input (a file or stdin), cuts out the span from the first
    '<testsuite>' to the last '</testsuite>', optionally echoes that span
    unmodified to stdout, parses it with BeautifulSoup, and writes the
    whitelisted tags to the output (a file or stdout) via print_tag.

    Returns:
        0 on success, -1 if the report could not be located, parsed, or
        written out. The value is meant to be passed to sys.exit().
    """
    parser = argparse.ArgumentParser('Cleanup messy XML output from sel4test')
    parser.add_argument('input',
                        nargs='?', help='Input file', type=argparse.FileType('r', errors="ignore"),
                        default=sys.stdin)
    parser.add_argument('output',
                        nargs='?', help='Output file', type=argparse.FileType('w'),
                        default=sys.stdout)
    parser.add_argument('--quiet', '-q',
                        help='Suppress unmodified output to stdout', action='store_true',
                        default=False)
    args = parser.parse_args()

    data = args.input.read()

    # Strip leading and trailing garbage around the XML we want to parse.
    # Without this, even BeautifulSoup sometimes fails on the input. re.S lets
    # '.' span newlines, and the greedy match runs to the *last* closing tag.
    regexp = re.compile(r'(<%(top)s>.*</%(top)s>)' % {'top': TOP_TAG}, re.S)
    matches = regexp.search(data)
    if matches is None:
        print('Failed to strip leading and trailing garbage', file=sys.stderr)
        return -1
    data = matches.group(0)

    # Dump input data *before* parsing in case we choke during parsing. This
    # means end users have a chance of determining what went wrong from the
    # original output.
    if not args.quiet:
        print(data)

    # Parse the input as HTML even though BS supports XML. It seems the XML
    # parser is a bit more precious about the input.
    try:
        soup = bs4.BeautifulSoup(data, "lxml")
    except Exception as inst:
        print('Failed to parse input: %s' % inst, file=sys.stderr)
        return -1

    try:
        top = soup.find_all(TOP_TAG)[0]
    except Exception as inst:
        print('Failed to find initial %s tag: %s' % (TOP_TAG, inst), file=sys.stderr)
        return -1

    try:
        print_tag(args.output, top)
    except Exception as inst:
        print('While navigating XML: %s' % inst, file=sys.stderr)
        # The output is incomplete at this point, so report failure like every
        # other error path instead of exiting with success.
        return -1

    return 0
116
117
# Script entry point: run main() and use its return value as the process
# exit status.
if __name__ == '__main__':
    raise SystemExit(main())
120