1#!/usr/bin/env python 2 3# this is a script to extract given named nodes from a dot file, with 4# the associated edges. An edge is kept iff for edge x -> y 5# x and y are both nodes specified to be kept. 6 7# known issues: if a line contains '->' and is not an edge line 8# problems will occur. If node labels do not begin with 9# Node this also will not work. Since this is designed to work 10# on DSA dot output and not general dot files this is ok. 11# If you want to use this on other files rename the node labels 12# to Node[.*] with a script or something. This also relies on 13# the length of a node name being 13 characters (as it is in all 14# DSA dot output files) 15 16# Note that the name of the node can be any substring of the actual 17# name in the dot file. Thus if you say specify COLLAPSED 18# as a parameter this script will pull out all COLLAPSED 19# nodes in the file 20 21# Specifying escape characters in the name like \n also will not work, 22# as Python 23# will make it \\n, I'm not really sure how to fix this 24 25# currently the script prints the names it is searching for 26# to STDOUT, so you can check to see if they are what you intend 27 28from __future__ import print_function 29 30import re 31import string 32import sys 33 34 35if len(sys.argv) < 3: 36 print( 37 "usage is ./DSAextract <dot_file_to_modify> \ 38 <output_file> [list of nodes to extract]" 39 ) 40 41# open the input file 42input = open(sys.argv[1], "r") 43 44# construct a set of node names 45node_name_set = set() 46for name in sys.argv[3:]: 47 node_name_set |= set([name]) 48 49# construct a list of compiled regular expressions from the 50# node_name_set 51regexp_list = [] 52for name in node_name_set: 53 regexp_list.append(re.compile(name)) 54 55# used to see what kind of line we are on 56nodeexp = re.compile("Node") 57# used to check to see if the current line is an edge line 58arrowexp = re.compile("->") 59 60node_set = set() 61 62# read the file one line at a time 63buffer = input.readline() 64while buffer != "": 65 # filter out the unnecessary checks on all the edge lines 66 if not arrowexp.search(buffer): 67 # check to see if this is a node we are looking for 68 for regexp in regexp_list: 69 # if this name is for the current node, add the dot variable name 70 # for the node (it will be Node(hex number)) to our set of nodes 71 if regexp.search(buffer): 72 node_set |= set([re.split("\s+", buffer, 2)[1]]) 73 break 74 buffer = input.readline() 75 76 77# test code 78# print '\n' 79 80print(node_name_set) 81 82# print node_set 83 84 85# open the output file 86output = open(sys.argv[2], "w") 87# start the second pass over the file 88input = open(sys.argv[1], "r") 89 90buffer = input.readline() 91while buffer != "": 92 # there are three types of lines we are looking for 93 # 1) node lines, 2) edge lines 3) support lines (like page size, etc) 94 95 # is this an edge line? 96 # note that this is no completely robust, if a none edge line 97 # for some reason contains -> it will be missidentified 98 # hand edit the file if this happens 99 if arrowexp.search(buffer): 100 # check to make sure that both nodes are in the node list 101 # if they are print this to output 102 nodes = arrowexp.split(buffer) 103 nodes[0] = string.strip(nodes[0]) 104 nodes[1] = string.strip(nodes[1]) 105 if nodes[0][:13] in node_set and nodes[1][:13] in node_set: 106 output.write(buffer) 107 elif nodeexp.search(buffer): # this is a node line 108 node = re.split("\s+", buffer, 2)[1] 109 if node in node_set: 110 output.write(buffer) 111 else: # this is a support line 112 output.write(buffer) 113 buffer = input.readline() 114