-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathauto_clone.py
executable file
·339 lines (281 loc) · 16 KB
/
auto_clone.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
# -*- coding: UTF-8 -*-
import pandas
import Bio
import Bio.SeqIO
import Bio.Seq
import Bio.SeqRecord
import Bio.Alphabet
import time
import os
import datetime
import collections
class IDT_plate_object():
def __init__(self,PrimerNamePrefix,PrimerIndex,StartWellPosition,direction,ignoreWellOverlap=True):
self.PrimerNamePrefix = PrimerNamePrefix
self.PrimerIndex = PrimerIndex
self.WellPosition = StartWellPosition
self.plate_direction = direction ##alphabetical (A1,B1,C1... A2,B2,C2...) or numerical (A1,A2,A3...B1,B2,C3...)
self.usedWells = []
self.df = pandas.DataFrame(columns=["WellPosition","Name","Sequence","Notes"]) ##Write the header
self.ignoreWellOverlap = ignoreWellOverlap
def incrementWellPositionCol(self,increment):
self.usedWells.append(self.WellPosition)
letter = self.WellPosition[0]
number = int(self.WellPosition[1:])
number += increment
##Handle wrapping
if number > 12: ##If the number is too large
while number > 12:
letter = chr(ord(letter) + 1)
number -= 12
if number < 1: ##If the number is too small
while number < 1:
letter = chr(ord(letter) - 1)
number += 12
if ord(letter) > ord("I"):
print("plate well out of range: too high")
if ord(letter) < ord("A"):
print("plate well out of range: too low")
self.WellPosition = letter+str(number)
def incrementWellPositionRow(self,increment):
self.usedWells.append(self.WellPosition)
letter = self.WellPosition[0]
number = int(self.WellPosition[1:])
##No wrapping handling implemented. This is because running off the letters of the plate is probably not intended (have to implement having multiple plates)
letter = chr(ord(letter) + increment)
if ord(letter) > ord("I"):
print("plate well out of range: too high")
print("Exiting program...")
exit()
if ord(letter) < ord("A"):
print("plate well out of range: too low")
print("Exiting program...")
exit()
self.WellPosition = letter+str(number)
def incrementWellPosition(self,increment):
if self.plate_direction == "numerical":
self.incrementWellPositionCol(increment)
elif self.plate_direction == "alphabetical":
self.incrementWellPositionRow(increment)
print("Alphabetical incrementing not properly implemented!")
exit()
else:
print("Plate direction not properly set!")
exit()
def formatPrimerRow(self,record,plasmid,direction):
newRow = dict()
newRow['WellPosition'] = self.WellPosition[0]+str(self.WellPosition[1:])
if direction == "+":
newRow['Notes'] = "Forward "+plasmid.description+record.id+record.description
newRow['Sequence'] = plasmid.gibson_forward_overlap+str(record.Sequence)
if direction == "-":
newRow['Notes'] = "Reverse "+plasmid.description+record.id+record.description
newRow['Sequence'] = plasmid.gibson_reverse_overlap+str(record.Sequence.reverse_complement())
newRow['Name'] = Name
return newRow
def newPrimer(self,record):
if self.WellPosition in self.usedWells and self.ignoreWellOverlap == False:
print("**Primer collision** Tried to put a primer in a well that was already marked as used.")
print("Exiting...")
exit()
if record.description == None:
record.description = str(len(record.seq))
##Layout is ["WellPosition","Name","Sequence","Notes"]
newRow = {'WellPosition':self.WellPosition,"Name":str(self.PrimerNamePrefix)+format(self.PrimerIndex,'04'),"Sequence":str(record.seq),"Notes":str(record.description)}
self.df = self.df.append(newRow,ignore_index=True)
self.usedWells = list(self.df['WellPosition'].values)
self.PrimerIndex += 1
def newPrimerPairVertical(self,record_1,record_2):
self.newPrimer(record_1) ##Add primer A1
self.incrementWellPositionRow(1) ##+1 for the letter A1->B1
self.newPrimer(record_2) ##add primer B1
self.incrementWellPositionRow(-1) ## B1->A1
if int(self.WellPosition[1:]) == 12: ##Wrapping support. Go an extra row to not run into R primer of previous primers. E.g, if at A12, go to B12, then wrap to C1 when incremeting
self.incrementWellPositionRow(1)
self.incrementWellPositionCol(1) ##A1->A2
def getCSV(self):
##Need name, sequence, scale, purification
##Scale = 25nm
##Purification = STD
rows = self.df.values.tolist()
buffer = ''
for row in rows:
name = row[1]
sequence = row[2]
scale = "25nm"
purity = "STD"
buffer += name+","+sequence+","+scale+","+purity+'\n'
return buffer
class plasmid_object():
def __init__(self,name):
if name != None:
###Load plasmids to clone into:
###Expects the plasmid to have a stretch of "N"s, which is where it will stick the insert.
plasmid_path="../../plasmids/"+name+".gb"
if not os.path.isfile(plasmid_path):
print("No plasmid file found:",plasmid_path)
self.plasmid = None
self.forward_plasmid = None
self.reverse_plasmid = None
elif os.path.isfile(plasmid_path):
handle = open(plasmid_path,"rU")
self.plasmid = list(Bio.SeqIO.parse(handle,"gb"))[0]
handle.close()
if len(self.plasmid.seq) == 0:
print("Empty plasmid.")
self.plasmid = None
self.forward_plasmid = None
self.reverse_plasmid = None
elif len(self.plasmid.seq) > 0:
self.plasmid.id = name
Nstart = self.plasmid.seq.find("N")
Nend = self.plasmid.seq.rfind("N")+1
self.forward_plasmid = self.plasmid[0:Nstart]
self.reverse_plasmid = self.plasmid[Nend:]
##Load the plasmid overlap & comments. Also possible to derive from plasmid file.
text_path = "../../plasmids/"+name+".txt"
handle = open(text_path,"rU")
forwardString = handle.readline() ##Line 1
reverseString = handle.readline() ##Line 2
commentString = handle.readline() ##Line 3
handle.close()
self.gibson_forward_overlap = forwardString[forwardString.find(":")+1:].strip()
self.gibson_reverse_overlap = reverseString[reverseString.find(":")+1:].strip() ##Overlap Not including stop codon
self.description = commentString[commentString.find(":")+1:].strip()
else:
print("Couldn't match plasmid file!")
self.plasmid = None
self.forward_plasmid = None
self.reverse_plasmid = None
self.gibson_forward_overlap = None
self.gibson_reverse_overlap = None
self.description = "uninitialized plasmid"
##This function concatenates multiple records to form a final plasmid & writes it to disk (genbank format)
def writePlasmidFile(self,record,plate=None):
if self.plasmid is None:
print("Can't write. Plasmid wasn't loaded.")
return None
##Have to get the record sequence type to IUPACAmbiguousDNA
seq_reformatted = Bio.Seq.Seq(str(record.seq), Bio.Alphabet.IUPAC.IUPACAmbiguousDNA())
record_reformatted = Bio.SeqRecord.SeqRecord(seq_reformatted,id=record.id, name=record.name,description=record.description)
##Have to add a feature so it looks nice in APE & other plasmid editors. This site https://www.biostars.org/p/57549/ explains how.
##http://biopython.org/DIST/docs/api/Bio.SeqFeature.SeqFeature-class.html also useful.
my_start_pos = Bio.SeqFeature.ExactPosition(0)
my_end_pos = Bio.SeqFeature.ExactPosition(len(record.seq))
my_feature_location = Bio.SeqFeature.FeatureLocation(my_start_pos,my_end_pos)
my_feature_type = "CDS"
translation_seq = str(record_reformatted[my_start_pos:my_end_pos].seq.translate()).replace("\n","").replace("*","")
if plate != None:
my_label = plate.WellPosition+"_"+record.id+" CDS "+str(len(record.seq))+"bp"
else:
my_label = record.id+" CDS "+str(len(record.seq))+"bp"
feature_annotations = collections.OrderedDict()
feature_annotations = {"codon_start":"1","label":my_label,"note":"color: #00ccff","translation":translation_seq}
my_feature = Bio.SeqFeature.SeqFeature(my_feature_location,type=my_feature_type,strand=1,qualifiers=feature_annotations)
record_reformatted.features.append(my_feature)
complete_plasmid = self.forward_plasmid+record_reformatted+self.reverse_plasmid ##Concatenated records is as simple as addition. Thanks BioPython!
##Add appropriate annotations to plasmid
complete_plasmid.name = "Exported"
complete_plasmid.annotations["topology"] = "circular"
complete_plasmid.annotations["molecule_type"] = "ds-DNA"
today = datetime.datetime.today()
today_string = today.strftime("%d-%b-%Y").upper() ##%d-%b-%Y -> uppercase
complete_plasmid.annotations["date"] = today_string
complete_plasmid.annotations["source"] = "natural DNA sequence"
complete_plasmid.annotations["accession"] = "<unknown id>"
complete_plasmid.annotations["organism"] = "unspecified"
##Generate generic reference object
genbank_reference = Bio.SeqFeature.Reference()
genbank_reference.location = [Bio.SeqFeature.FeatureLocation(Bio.SeqFeature.ExactPosition(0), Bio.SeqFeature.ExactPosition(len(complete_plasmid)))]
genbank_reference.authors = "."
genbank_reference.title = "Direct Submission"
genbank_reference_date_string = today.strftime("%b %d, %Y")
##Exported Apr 20, 2018 from SnapGene 4.1.8
genbank_reference.journal="Exported "+genbank_reference_date_string+" from SnapGene 4.1.8 <- String to trick SnapGene, actually exported with auto_clone https://github.com/photocyte/auto_clone"
##genbank_reference.journal = "Exported "+genbank_reference_date_string+" using auto_clone https://github.com/photocyte/auto_clone. Formatted for SnapGene 4.1.8\nhttp://www.snapgene.com"
complete_plasmid.annotations["references"] = [genbank_reference]
if plate != None:
file_string = plate.WellPosition+"_"+record.id+"_"+self.forward_plasmid.id+".gb"
else:
file_string = record.id+"_"+self.forward_plasmid.id+".gb"
Bio.SeqIO.write(complete_plasmid, file_string, "gb")
return file_string
def formatGibsonPrimerForward(self,record,forwardPrimerLength):
primerDescription = "Forward "+self.description+record.id+record.description
sequence = str(self.gibson_forward_overlap).upper()+str(record.seq[:forwardPrimerLength]).lower()
seq_reformatted = Bio.Seq.Seq(str(sequence), Bio.Alphabet.IUPAC.IUPACAmbiguousDNA())
newRecord = Bio.SeqRecord.SeqRecord(seq_reformatted,id=record.id, name=record.name,description=primerDescription)
return newRecord
def formatGibsonPrimerReverse(self,record,reversePrimerLength):
primerDescription = "Reverse "+self.description+record.id+record.description
sequence = str(self.gibson_reverse_overlap).upper()+str(record.seq[-reversePrimerLength:].reverse_complement()).lower()
seq_reformatted = Bio.Seq.Seq(str(sequence), Bio.Alphabet.IUPAC.IUPACAmbiguousDNA())
newRecord = Bio.SeqRecord.SeqRecord(seq_reformatted,id=record.id, name=record.name,description=primerDescription)
return newRecord
def plate_vertical_primer_order_fasta(record_iterator,plasmid_name,prefix,startindex,startwell,platetype="numerical"):
plate = IDT_plate_object(prefix,startindex,startwell,platetype,ignoreWellOverlap=False)
for record in record_iterator:
plasmid = plasmid_object(plasmid_name) ##Needs to know which plasmid is being cloned into
##Write a plasmid file
plasmid.writePlasmidFile(record,plate=plate) ##Inserts CDS into plasmid. Uses plate for description type stuff. Plate optional.
print("________________________________________________________")
#if record.seq[:45].translate()[0] != "M" or record.seq[-45:].translate()[14] != "*":
# print "***",record.id ,"*** not properly translated"
#else:
print("Record id:",record.id)
print("Record description:",record.description)
print("")
if "GGTCTC" in record.seq or "CCAGAG" in record.seq:
print("Incompatible with BsaI")
print("GGTCTC:",record.seq.find("GGTCTC"))
print("CCAGAG:",record.seq.find("CCAGAG"))
if "CGTCTC" in record.seq or "GCAGAG" in record.seq:
print("Incompatible with BsmBI")
print("CGTCTC:",record.seq.find("CGTCTC"))
print("GCAGAG:",record.seq.find("GCAGAG"))
if "GCGGCCGC" in record.seq:
print("Incompatible with NotI")
print("GCGGCCGC:",record.seq.find("GCGGCCGC"))
###Related primer pairs in a vertical orientation. E.g. A1,B1 (numeric incrementing) or A1,A2 (alphabet incremeting)
#plate.newPrimer(record)
forwardPrimer = plasmid.formatGibsonPrimerForward(record,24)
reversePrimer = plasmid.formatGibsonPrimerReverse(record,24)
plate.newPrimerPairVertical(forwardPrimer,reversePrimer)
###Ordering only single primers at a time.
#plate.newPrimer(record)
plate.df.to_excel("IDT.xls",index=False)
def single_primer_order_fasta(record_iterator,plasmid_name,prefix,startindex,startwell,platetype="numerical"):
plate = IDT_plate_object(prefix,startindex,startwell,platetype,ignoreWellOverlap=True)
for record in record_iterator:
plasmid = plasmid_object(plasmid_name) ##Needs to know which plasmid is being cloned into
##Write a plasmid file
plasmid.writePlasmidFile(record,plate=plate) ##Inserts CDS into plasmid. Uses plate for description type stuff. Plate optional.
print("________________________________________________________")
#if record.seq[:45].translate()[0] != "M" or record.seq[-45:].translate()[14] != "*":
# print "***",record.id ,"*** not properly translated"
print("Record id:",record.id)
print("Record description:",record.description)
print("")
if "GGTCTC" in record.seq or "CCAGAG" in record.seq:
print("Incompatible with BsaI")
print("GGTCTC:",record.seq.find("GGTCTC"))
print("CCAGAG:",record.seq.find("CCAGAG"))
if "CGTCTC" in record.seq or "GCAGAG" in record.seq:
print("Incompatible with BsmBI")
print("CGTCTC:",record.seq.find("CGTCTC"))
print("GCAGAG:",record.seq.find("GCAGAG"))
if "GCGGCCGC" in record.seq:
print("Incompatible with NotI")
print("GCGGCCGC:",record.seq.find("GCGGCCGC"))
forwardPrimer = plasmid.formatGibsonPrimerForward(record,24)
reversePrimer = plasmid.formatGibsonPrimerReverse(record,24)
plate.newPrimer(forwardPrimer)
plate.newPrimer(reversePrimer)
return plate.getCSV()
if __name__ == "__main__":
CDS_fasta_file = "photinus_pyralyis_151_v1_Trinity_cleaned.fasta.transdecoder.cds_annotated"
handle = open(CDS_fasta_file,"rb")
record_iterator = list(Bio.SeqIO.parse(fasta_handle,"fasta"))
plate_vertical_primer_order_fasta(record_iterator,"pHis8_4","TRF_x",1,"A1")
else:
print("importing auto_clone")