#!/usr/bin/env python3
#
# Copyright 2020, Data61
# Commonwealth Scientific and Industrial Research Organisation (CSIRO)
# ABN 41 687 119 230.
#
# This software may be distributed and modified according to the terms of
# the BSD 2-Clause license. Note that NO WARRANTY is provided.
# See "LICENSE_BSD2.txt" for details.
#
# @TAG(DATA61_BSD)

"""Clean up messy XML output from sel4test.

Reads raw test-run output, strips any leading/trailing garbage around the
<testsuite> element, re-parses it with BeautifulSoup, and re-emits only a
whitelisted subset of tags, with inner text escaped for XML where needed.
"""

import argparse
import re
import sys

import bs4

SYS_OUT = 'system-out'

# Characters that must be escaped when emitted inside an XML text node,
# mapped to their entity references.
# NOTE(review): these values had been entity-decoded into the bare
# characters somewhere in transit (e.g. '<': "<"), which made the escaping
# a no-op and left the dict syntactically broken ('"': """); restored to
# the standard XML entities.
XML_SPECIAL_CHARS = {
    '<': '&lt;',
    '&': '&amp;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&apos;',
}

TAG_WHITELIST = {
    # Keys are tags to be emitted, values are whether to emit their inner text.
    'error': True,
    'failure': True,
    'testsuite': False,
    'testcase': False,
    SYS_OUT: True,
}

TOP_TAG = 'testsuite'


def print_tag(f, tag):
    """Recursively emit a whitelisted `bs4` tag tree to the file object `f`.

    Tags not in TAG_WHITELIST are skipped entirely (including their
    children). For tags whose whitelist value is True the inner content is
    emitted: 'system-out' gets its text re-escaped character by character,
    anything else is dumped verbatim. For tags whose value is False only
    the element itself (with attributes) is emitted, and we recurse into
    its child tags.
    """
    assert isinstance(tag, bs4.element.Tag)

    # Skip non-whitelisted tags.
    if tag.name not in TAG_WHITELIST:
        return

    if TAG_WHITELIST[tag.name]:
        # We want the inner text.
        if tag.name != SYS_OUT:
            # Blindly dump the soup, closing tag included.
            print(tag, file=f)
        else:
            # system-out may contain arbitrary console noise; re-escape it
            # so the output stays well-formed XML.
            print('<%s>' % tag.name, file=f)
            for ch in tag.get_text():
                f.write(XML_SPECIAL_CHARS.get(ch, ch))
            print('</%s>' % tag.name, file=f)
    else:
        # Emit only the element itself, then recurse into child tags.
        attrs = ' '.join('%s="%s"' % (key, value)
                         for key, value in tag.attrs.items())
        if attrs:
            print('<%s %s>' % (tag.name, attrs), file=f)
        else:
            # Avoid the stray space ('<testsuite >') the old format
            # string produced for attribute-less tags.
            print('<%s>' % tag.name, file=f)

        for child in tag.children:
            if isinstance(child, bs4.element.Tag):
                print_tag(f, child)

        print('</%s>' % tag.name, file=f)


def main():
    """Entry point: parse arguments, sanitise input, emit cleaned XML.

    Returns 0 on success, -1 on any parse failure (reported to stderr).
    """
    parser = argparse.ArgumentParser('Cleanup messy XML output from sel4test')
    parser.add_argument('input',
                        nargs='?', help='Input file',
                        type=argparse.FileType('r', errors="ignore"),
                        default=sys.stdin)
    parser.add_argument('output',
                        nargs='?', help='Output file',
                        type=argparse.FileType('w'),
                        default=sys.stdout)
    parser.add_argument('--quiet', '-q',
                        help='Suppress unmodified output to stdout',
                        action='store_true', default=False)
    args = parser.parse_args()

    data = args.input.read()

    # Strip trailing crap around the XML we want to parse. Without this, even
    # BeautifulSoup sometimes backs away in horror. re.S lets '.*' span
    # newlines, so this greedily captures from the first <testsuite> to the
    # last </testsuite>.
    regexp = re.compile(r'(<%(top)s>.*</%(top)s>)' % {'top': TOP_TAG}, re.S)
    matches = regexp.search(data)
    if not matches or len(matches.groups()) != 1:
        print('Failed to strip leading and trailing garbage', file=sys.stderr)
        return -1
    data = matches.group(0)

    # Dump input data *before* parsing in case we choke during parsing. This
    # means end users have a chance of determining what went wrong from the
    # original output.
    if not args.quiet:
        print(data)

    # Parse the input as HTML even though BS supports XML. It seems the XML
    # parser is a bit more precious about the input.
    try:
        soup = bs4.BeautifulSoup(data, "lxml")
    except Exception as inst:
        print('Failed to parse input: %s' % inst, file=sys.stderr)
        return -1

    try:
        top = soup.find_all(TOP_TAG)[0]
    except Exception as inst:
        print('Failed to find initial %s tag: %s' % (TOP_TAG, inst),
              file=sys.stderr)
        return -1

    try:
        print_tag(args.output, top)
    except Exception as inst:
        # Best-effort: report but still exit 0, matching historic behaviour.
        print('While navigating XML: %s' % inst, file=sys.stderr)

    return 0


if __name__ == '__main__':
    sys.exit(main())