1 """PineGenomeComparisonUsingEntrez script by [[yong27/2004-02-27]]
2 """
3 import unittest, tempfile, os
4 from Bio import GenBank, Fasta
5 from Bio.Blast import NCBIStandalone
6
7 gi={
8 'Pinus koraiensis':'29469655',
9 'Pinus thunbergii':'529643',
10 'Nicotiana tabacum':'11465934',
11 }
12
13 BL2SEQ = '../bl2seq'
14
15 fpser = GenBank.FeatureParser()
16 ncbi = GenBank.NCBIDictionary('nucleotide', 'genbank', parser=fpser)
17
18 def parseIdentity(aHandle):
19 origin = []
20 identities = []
21 average = []
22 for line in aHandle.readlines():
23 try:
24 if line[1] == 'I':
25 identities.append(int(line.split()[2].split('/')[0]))
26 average.append(int(line.split()[3].replace('%','').replace('(','').replace(')','').replace(',','')))
27 except IndexError:
28 pass
29 origin.append(line)
30 sum=0
31 for each in average:
32 sum+=each
33 try:
34 average = sum/float(len(average))
35 except ZeroDivisionError:
36 average =0
37 return ' '.join(origin), identities, average
38
39 def bl2seq(fasta1, fasta2):
40 ff1Name=tempfile.mktemp('fasta')
41 ff2Name=tempfile.mktemp('fasta')
42 ff1=open(ff1Name,'w')
43 ff2=open(ff2Name,'w')
44 ff1.write(str(fasta1))
45 ff2.write(str(fasta2))
46 ff1.close()
47 ff2.close()
48
49 cmd = " ".join([BL2SEQ,'-i',ff1Name,'-j',ff2Name,'-p blastn','-X 30','-W 11'])
50 w,r,e = os.popen3(cmd)
51 w.close()
52 return parseIdentity(r)
53
54
55 class MyFeature:
56 def __init__(self, id, position, sequence, species):
57 self.id = id
58 self.position = position
59 self.sequence = sequence
60 self.partner = None
61 self.species = species
62 def compareWith(self,aFeature):
63 assert self.id == aFeature.id
64 self.partner = aFeature
65 def makeFasta(self):
66 fasta = Fasta.Record()
67 try:
68 fasta.title = self.id +' '+str(self.species)
69 except TypeError:
70 fasta.title = self.id
71 fasta.sequence = self.sequence
72 return fasta
73 def getTwoSequence(self):
74 return (self.sequence, self.partner.sequence)
75 def align(self):
76 self.record, self.identities, self.average = bl2seq(self.makeFasta(), self.partner.makeFasta())
77 def getSequenceLength(self):
78 return len(self.sequence)
79 def getResult(self):
80 return self.record
81 def getIdentities(self):
82 return self.identities
83 def getIdentity(self):
84 sum=0
85 for each in self.identities:
86 sum+=each
87 return sum/float(self.getSequenceLength()) * 100
88 def getAverage(self):
89 return self.average
90
91 def getGenomeDict(aSpecies):
92 genome = ncbi[gi[aSpecies]]
93 genomeDict = {}
94 for feature in genome.features:
95 id = feature.qualifiers.get('gene')
96 if not id:
97 id = feature.qualifiers.get('product')
98 try:
99 id = id[0]
100 except:
101 pass
102 if genomeDict.has_key(id):
103 continue
104 start = feature.location._start.position
105 end = feature.location._end.position
106 sequence = genome.seq[start-1:end].tostring()
107 genomeDict[id] = MyFeature(id, (start,end), sequence, aSpecies)
108 return genomeDict
109
110 def combine(dict1, dict2):
111 combDict = {}
112 solo = []
113 for id,feature in dict1.copy().iteritems():
114 if id in dict2.keys():
115 feature.compareWith(dict2.pop(id))
116 combDict[id]=feature
117 dict1.pop(id)
118 return combDict,dict1,dict2
119
120 def main():
121 pk=getGenomeDict('Pinus koraiensis')
122 pt=getGenomeDict('Pinus thunbergii')
123 pkpt,pkpt_pk,pkpt_pt=combine(pk,pt)
124
125
126 pk=getGenomeDict('Pinus koraiensis')
127 nt=getGenomeDict('Nicotiana tabacum')
128 pknt,pknt_pk,pknt_nt=combine(pk,nt)
129
130 tp_pkpt = open('pk_pt','w')
131 tp_pknt = open('pk_nt','w')
132
133 result={}
134 for id,feature in pkpt.iteritems():
135 if id is None:
136 continue
137 feature.align()
138 tp_pkpt.write(feature.getResult())
139 print '\t'.join([id, str(feature.getIdentities()), str(feature.getIdentity())])
140 result[id]= [feature.getAverage()]
141
142 print "================"
143 for id,feature in pknt.iteritems():
144 if id is None:
145 continue
146 feature.align()
147 tp_pknt.write(feature.getResult())
148 print '\t'.join([id, str(feature.getIdentities()), str(feature.getIdentity())])
149 try:
150 result[id].append(feature.getAverage())
151 except:
152 pass
153 print "================"
154 for id,feature in result.iteritems():
155 if id is None:
156 continue
157 ptpk=''
158 ptnt=''
159 try:
160 ptpk = str(feature[0])
161 ptnt = str(feature[1])
162 except IndexError:
163 pass
164 print '\t'.join([id,ptpk,ptnt])
165
166 class PineTest(unittest.TestCase):
167 def setUp(self):
168 self.td1 = {
169 'psbT': MyFeature('psbT', (52109, 52217), 'TATGGAAGCATTGGTTTATAA'),
170 'ORF44d': MyFeature('ORF44d', (68128, 68263), 'ATTAATGAATTGTTGCAT'),
171 'ORF63a': MyFeature('ORF63a', (26349, 26541), 'TATGGTTAATAGTTCCATGA'),
172 }
173 self.td2 = {
174 'psbT': MyFeature('psbT', (52109, 52217), 'TATGGAAGCATTGGTTTATAA'),
175 'ORF44d': MyFeature('ORF44d', (68128, 68263), 'ATTAATGAATTGTTGCAT'),
176 'ORF63': MyFeature('ORF63', (26349, 26541), 'TATGGTTAATAGTTCCATGA'),
177 }
178 def testCombine(self):
179 c,d1,d2=combine(self.td1,self.td2)
180 expected = ('TATGGAAGCATTGGTTTATAA','TATGGAAGCATTGGTTTATAA')
181 self.assertEquals(expected,c['psbT'].getTwoSequence())
182 self.assertEquals('ORF63a',d1['ORF63a'].id)
183 self.assertEquals('ORF63',d2['ORF63'].id)
184
185 def testBl2seq(self):
186 a=MyFeature('psbT', (52109, 52217), 'TTATGGGCATTGGTTTATAATATGGAAGCATTGGTTTATAATGGAAGCATTGGTTTATAATATGGAAGCATTGGTTTATAAATGGAAGCATTGGTTTATAA', 'abcd')
187 b=MyFeature('psbT', (52109, 52217), 'TTATGGAAGCATTGGTTTATAATATGGAAGCATTGGTTTATAATGGAAGCATTGGTTTATAATATGGGCATTGGTTTATAAATGGAAGCATTGTATAA','efgh')
188 a.compareWith(b)
189 a.align()
190 self.assertEquals(3,a.getIdentity())
191
192 if __name__=='__main__':
193
194 main()