#!/usr/bin/env python

# This is a script to extract given named nodes from a dot file, together
# with the associated edges. An edge x -> y is kept iff x and y are both
# nodes specified to be kept.
#
# Known issues: if a line contains '->' and is not an edge line, problems
# will occur. If node labels do not begin with "Node" this also will not
# work. Since this is designed to work on DSA dot output and not general
# dot files this is OK. If you want to use this on other files, rename the
# node labels to Node[.*] with a script or something. This also relies on
# the length of a node name being 13 characters (as it is in all DSA dot
# output files) and on node lines starting with whitespace.
#
# Note that the name of the node can be any substring of the actual name in
# the dot file (each name is compiled as a regular expression and searched
# for in every non-edge line). Thus if you specify COLLAPSED as a parameter
# this script will pull out all COLLAPSED nodes in the file.
#
# Specifying escape characters in the name like \n also will not work, as
# Python will make it \\n; I'm not really sure how to fix this.
#
# Currently the script prints the names it is searching for to STDOUT, so
# you can check to see if they are what you intend.

from __future__ import print_function

import re
import string
import sys


USAGE = ('usage is ./DSAextract <dot_file_to_modify> '
         '<output_file> [list of nodes to extract]')

# Marker used to recognise node lines.
NODE_MARKER = 'Node'
# Marker used to recognise edge lines.
EDGE_MARKER = '->'
# Length of a node name in DSA dot output ("Node0x" plus seven hex digits).
NODE_NAME_LENGTH = 13

_WHITESPACE = re.compile(r'\s+')


def _node_id(line):
    """Return the dot variable name on *line*, or None if there is none.

    The name is the second field when the line is split on runs of
    whitespace, i.e. the first token of a line that starts with whitespace.
    """
    fields = _WHITESPACE.split(line, 2)
    if len(fields) < 2:
        # No whitespace at all (e.g. a final line without a newline).
        return None
    return fields[1]


def find_selected_nodes(lines, names):
    """Collect the dot variable names of the nodes matching *names*.

    lines -- iterable of lines of the dot file
    names -- iterable of regular expression strings; a non-edge line is
             selected when any of them is found anywhere in the line

    Returns a set of node variable names (Node<hex number> in DSA output).
    Raises re.error if one of the names is not a valid regular expression.
    """
    patterns = [re.compile(name) for name in names]
    selected = set()
    for line in lines:
        # Edge lines never declare a node, so skip the checks on them.
        if EDGE_MARKER in line:
            continue
        if any(pattern.search(line) for pattern in patterns):
            node = _node_id(line)
            if node is not None:
                selected.add(node)
    return selected


def filter_dot_lines(lines, node_set, name_length=NODE_NAME_LENGTH):
    """Yield the lines of a dot file restricted to the nodes in *node_set*.

    There are three types of lines:
      1) edge lines, kept only when both end points are in node_set;
      2) node lines, kept only when the node is in node_set;
      3) support lines (page size, braces, etc.), always kept.

    Edge detection is not completely robust: a non-edge line that happens
    to contain '->' will be misidentified. Hand edit the file if this
    happens.

    lines       -- iterable of lines of the dot file
    node_set    -- set of node variable names to keep
    name_length -- number of leading characters of each edge end point
                   that make up the node name (ports such as ":s0" and
                   trailing ";" follow it)
    """
    for line in lines:
        if EDGE_MARKER in line:
            ends = line.split(EDGE_MARKER)
            source = ends[0].strip()[:name_length]
            target = ends[1].strip()[:name_length]
            if source in node_set and target in node_set:
                yield line
        elif NODE_MARKER in line:
            if _node_id(line) in node_set:
                yield line
        else:
            yield line


def main(argv=None):
    """Run the extraction; *argv* has the same layout as sys.argv.

    Returns the process exit status: 0 on success, 1 on a usage error.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        print(USAGE)
        return 1

    dot_path = argv[1]
    out_path = argv[2]
    node_name_set = set(argv[3:])

    # First pass: find the dot variable names of the requested nodes.
    with open(dot_path, 'r') as dot_file:
        node_set = find_selected_nodes(dot_file, node_name_set)

    # Show the names being searched for so the user can check them.
    print(node_name_set)

    # Second pass: copy the lines that survive the filter.
    with open(out_path, 'w') as out_file:
        with open(dot_path, 'r') as dot_file:
            out_file.writelines(filter_dot_lines(dot_file, node_set))
    return 0


if __name__ == '__main__':
    sys.exit(main())