#format python #!/usr/local/bin/python """PineGenomeComparisonUsingEntrez script by [[yong27/2004-02-27]] """ import unittest, tempfile, os from Bio import GenBank, Fasta from Bio.Blast import NCBIStandalone gi={ #NCBI Nucleotide's gi 'Pinus koraiensis':'29469655', 'Pinus thunbergii':'529643', 'Nicotiana tabacum':'11465934', } BL2SEQ = '../bl2seq' fpser = GenBank.FeatureParser() ncbi = GenBank.NCBIDictionary('nucleotide', 'genbank', parser=fpser) def parseIdentity(aHandle): origin = [] identities = [] average = [] for line in aHandle.readlines(): try: if line[1] == 'I': identities.append(int(line.split()[2].split('/')[0])) average.append(int(line.split()[3].replace('%','').replace('(','').replace(')','').replace(',',''))) except IndexError: pass origin.append(line) sum=0 for each in average: sum+=each try: average = sum/float(len(average)) except ZeroDivisionError: average =0 return ' '.join(origin), identities, average def bl2seq(fasta1, fasta2): ff1Name=tempfile.mktemp('fasta') ff2Name=tempfile.mktemp('fasta') ff1=open(ff1Name,'w') ff2=open(ff2Name,'w') ff1.write(str(fasta1)) ff2.write(str(fasta2)) ff1.close() ff2.close() cmd = " ".join([BL2SEQ,'-i',ff1Name,'-j',ff2Name,'-p blastn','-X 30','-W 11']) w,r,e = os.popen3(cmd) w.close() return parseIdentity(r) class MyFeature: def __init__(self, id, position, sequence, species): self.id = id self.position = position self.sequence = sequence self.partner = None self.species = species def compareWith(self,aFeature): assert self.id == aFeature.id self.partner = aFeature def makeFasta(self): fasta = Fasta.Record() try: fasta.title = self.id +' '+str(self.species) except TypeError: fasta.title = self.id fasta.sequence = self.sequence return fasta def getTwoSequence(self): return (self.sequence, self.partner.sequence) def align(self): self.record, self.identities, self.average = bl2seq(self.makeFasta(), self.partner.makeFasta()) def getSequenceLength(self): return len(self.sequence) def getResult(self): return self.record def getIdentities(self): return self.identities def getIdentity(self): sum=0 for each in self.identities: sum+=each return sum/float(self.getSequenceLength()) * 100 def getAverage(self): return self.average def getGenomeDict(aSpecies): genome = ncbi[gi[aSpecies]] genomeDict = {} for feature in genome.features: id = feature.qualifiers.get('gene') if not id: id = feature.qualifiers.get('product') try: id = id[0] except: pass if genomeDict.has_key(id): continue start = feature.location._start.position end = feature.location._end.position sequence = genome.seq[start-1:end].tostring() genomeDict[id] = MyFeature(id, (start,end), sequence, aSpecies) return genomeDict def combine(dict1, dict2): combDict = {} solo = [] for id,feature in dict1.copy().iteritems(): if id in dict2.keys(): feature.compareWith(dict2.pop(id)) combDict[id]=feature dict1.pop(id) return combDict,dict1,dict2 def main(): pk=getGenomeDict('Pinus koraiensis') pt=getGenomeDict('Pinus thunbergii') pkpt,pkpt_pk,pkpt_pt=combine(pk,pt) pk=getGenomeDict('Pinus koraiensis') nt=getGenomeDict('Nicotiana tabacum') pknt,pknt_pk,pknt_nt=combine(pk,nt) tp_pkpt = open('pk_pt','w') tp_pknt = open('pk_nt','w') result={} for id,feature in pkpt.iteritems(): if id is None: continue feature.align() tp_pkpt.write(feature.getResult()) print '\t'.join([id, str(feature.getIdentities()), str(feature.getIdentity())]) result[id]= [feature.getAverage()] print "================" for id,feature in pknt.iteritems(): if id is None: continue feature.align() tp_pknt.write(feature.getResult()) print '\t'.join([id, str(feature.getIdentities()), str(feature.getIdentity())]) try: result[id].append(feature.getAverage()) except: pass print "================" for id,feature in result.iteritems(): if id is None: continue ptpk='' ptnt='' try: ptpk = str(feature[0]) ptnt = str(feature[1]) except IndexError: pass print '\t'.join([id,ptpk,ptnt]) class PineTest(unittest.TestCase): def setUp(self): self.td1 = { 'psbT': MyFeature('psbT', (52109, 52217), 'TATGGAAGCATTGGTTTATAA'), 'ORF44d': MyFeature('ORF44d', (68128, 68263), 'ATTAATGAATTGTTGCAT'), 'ORF63a': MyFeature('ORF63a', (26349, 26541), 'TATGGTTAATAGTTCCATGA'), } self.td2 = { 'psbT': MyFeature('psbT', (52109, 52217), 'TATGGAAGCATTGGTTTATAA'), 'ORF44d': MyFeature('ORF44d', (68128, 68263), 'ATTAATGAATTGTTGCAT'), 'ORF63': MyFeature('ORF63', (26349, 26541), 'TATGGTTAATAGTTCCATGA'), } def testCombine(self): c,d1,d2=combine(self.td1,self.td2) expected = ('TATGGAAGCATTGGTTTATAA','TATGGAAGCATTGGTTTATAA') self.assertEquals(expected,c['psbT'].getTwoSequence()) self.assertEquals('ORF63a',d1['ORF63a'].id) self.assertEquals('ORF63',d2['ORF63'].id) def testBl2seq(self): a=MyFeature('psbT', (52109, 52217), 'TTATGGGCATTGGTTTATAATATGGAAGCATTGGTTTATAATGGAAGCATTGGTTTATAATATGGAAGCATTGGTTTATAAATGGAAGCATTGGTTTATAA', 'abcd') b=MyFeature('psbT', (52109, 52217), 'TTATGGAAGCATTGGTTTATAATATGGAAGCATTGGTTTATAATGGAAGCATTGGTTTATAATATGGGCATTGGTTTATAAATGGAAGCATTGTATAA','efgh') a.compareWith(b) a.align() self.assertEquals(3,a.getIdentity()) if __name__=='__main__': #unittest.main(argv=('','-v')) main()