-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdingII_parsesol.py
executable file
·185 lines (170 loc) · 6.69 KB
/
dingII_parsesol.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
#!/usr/bin/python3
from dingII_util import *
from argparse import ArgumentParser, FileType
from ilp_util_adj import *
import sys
import logging
def set_edges(rd, vrs):
for eid, val in vrs['x'].items():
u,v,k = tuple([int(s) for s in eid.strip().split('_')])
if val == 1:
rd[u][v][k]['selected'] = True
else:
rd[u][v][k]['selected'] = False
DELETION = 'x'
def get_matching(rd):
#record number of occs seen per family
genes_seen = {}
match ={}
#print(rd.edges(data=True))
for u,v, d in rd.edges(data=True):
if u in match or v in match:
continue
if not d['etype'] == SELFEDGE:
continue
u_ = [partner for partner, edges in rd[u].items() for e, edgedata in edges.items() if edgedata['selected'] and edgedata['etype']!= ADJACENCY]
v_ = [partner for partner, edges in rd[v].items() for e, edgedata in edges.items() if edgedata['selected'] and edgedata['etype']!= ADJACENCY]
if len(u_) != 1:
LOG.fatal('Vertex %i has not degree 2!'%u)
sys.exit(1)
if len(v_) != 1:
LOG.fatal('Vertex %i has not degree 2!'%v)
sys.exit(1)
u_ = u_[0]
v_ = v_[0]
if u_ == v:
match[u] = DELETION
match[v] = DELETION
else:
gene = rd.nodes[u]['gene']
dadd(genes_seen, gene,1)
matchid = dgetc(genes_seen,gene)
match[u] = matchid
match[v] = matchid
match[v_] = matchid
match[u_] = matchid
return match
def relabel(extremitygenomes, gnmnames, match):
gnms = []
for name, e_gnm in zip(gnmnames, extremitygenomes):
gnm = []
for e_chrm in e_gnm:
chrm = []
chr_type = CHR_CIRCULAR
if e_chrm[0][1] == TELOGENE:
e_chrm_ = e_chrm[1:len(e_chrm)-1]
chr_type = CHR_LINEAR
else:
e_chrm_ = e_chrm
for i in range(1, len(e_chrm_),2):
eid1 ,g, e = e_chrm_[i-1]
eid2 ,_, _ = e_chrm_[i]
m1 = match[eid1]
m2 = match[eid2]
if not m1 == m2:
LOG.fatal('Doubly matched gene {} with indices {} and {}'.format(g,m1,m2))
sys.exit(1)
orient = ORIENT_POSITIVE
if e == EXTREMITY_HEAD:
orient = ORIENT_NEGATIVE
chrm.append((orient,'{}_{}'.format(g, m1)))
gnm.append((chr_type,chrm))
gnms.append((name,gnm))
return gnms
def print_genomes(genomes, file=sys.stdout):
for name, chrs in genomes:
print('>%s'%name, file=file)
for c_type, chrm in chrs:
for g in chrm:
print('%s%s '%(g), end='', file=file)
print('%s'%c_type, file=file)
print('', file=file)
def get_runs(rd):
rd_ = rd.copy()
nonselected = [(u,v,k) for u,v,k in rd_.edges if not rd_.get_edge_data(u, v, key=k)['selected']]
rd_.remove_edges_from(nonselected)
ext_to_orient = lambda xt: ORIENT_POSITIVE if xt==EXTREMITY_TAIL else ORIENT_NEGATIVE
cycle_runs = []
for cyc in [rd_.subgraph(c).copy() for c in nx.connected_components(rd_)]:
runs = []
genome = None
run = []
for u,v,k in nx.eulerian_circuit(cyc, keys=True):
data = rd_.get_edge_data(u,v,key=k)
if data['etype'] == SELFEDGE:
gnm_u = cyc.nodes[u]['genome']
if gnm_u != genome:
if genome != None:
runs.append((genome, run))
genome = gnm_u
run=[]
run.append((ext_to_orient(cyc.nodes[u]['extremity']),cyc.nodes[u]['gene']))
if len(runs) > 0:
#if the last run has the same genome as the first,they are actually the same run
if runs[0][0] == genome:
run.extend(runs[0][1])
runs[0] = (genome, run)
elif genome != None:
runs.append((genome, run))
elif genome != None:
runs.append((genome, run))
if len(runs) > 0:
cycle_runs.append(runs)
return cycle_runs
def small_lambda(c):
n_runs = len(c)
if n_runs ==0:
return 0
elif n_runs ==1:
return 1
else:
return n_runs//2 +1
def canonize(cyc):
'''
Make every cycle start with a (possibly empty) A-run
'''
if cyc[0][0] == GENOME1:
return cyc
elif len(cyc) <= 1:
cyc_=[(GENOME1,[])]+cyc
return cyc_
else:
fst = cyc[0]
cyc_ = cyc[1::]
cyc_.append(fst)
return cyc_
def print_runs(runs, file=sys.stdout):
for cyc in runs:
cyc = canonize(cyc)
print('\t'.join([''.join(['%s%s'%kv for kv in r[1]]) for r in cyc]), file=file)
def main():
parser = ArgumentParser('Parse a gurobi solution into a distance and optionally give a matching')
add_unimog_parsing_groups(parser)
parser.add_argument('-m','--matching', type=FileType('w'), help='Give the matching as a pair of indexed genomes.')
g = parser.add_mutually_exclusive_group(required=True)
g.add_argument('--solgur', type=FileType('r'), help='Gurobi solution file with a single solution.')
parser.add_argument('--runs', type=FileType('w'), help='Write runs of indels to the specified file. Format: Each line represents a cycle, each consecutive sequence of oriented markers is a run. Runs in the same cycle are separated by tab characters an begin with an A-run or a tab character if no A-run exists.')
parser.add_argument('--numindels', action='store_true', help='Give a possible number of indels in the sorting scenario. Note that this number is NOT the same for all optimal scenarios.')
args = parser.parse_args()
obj, vrs = read_gurobi(args.solgur)
genomes = read_genomes(args)
print('d(%s,%s) = %i'%(genomes[0][0], genomes[1][0], obj))
rd, _ , _, _, extr_genomes = full_relational_diagram(genomes, LOG)
set_edges(rd, vrs)
if args.runs or args.numindels:
runs = get_runs(rd)
if args.numindels:
print('sum(lambda(C)) = %i'%sum([small_lambda(c) for c in runs]))
if args.runs:
print_runs(runs,file=args.runs)
if args.matching:
m = get_matching(rd)
matched_gnms = relabel(extr_genomes, [gnm[0] for gnm in genomes] , m)
print_genomes(matched_gnms, file=args.matching)
st = logging.StreamHandler(sys.stderr)
LOG = logging.getLogger(__name__)
LOG.setLevel(logging.DEBUG)
st.setLevel(logging.DEBUG)
st.setFormatter(logging.Formatter('%(levelname)s\t%(asctime)s\t%(message)s'))
LOG.addHandler(st)
main()