1 """PineGenomeComparisonUsingEntrez script by [[yong27/2004-02-27]]
   2 """
   3 import unittest, tempfile, os
   4 from Bio import GenBank, Fasta
   5 from Bio.Blast import NCBIStandalone
   6 
   7 gi={ #NCBI Nucleotide's gi 
   8     'Pinus koraiensis':'29469655',
   9     'Pinus thunbergii':'529643',
  10     'Nicotiana tabacum':'11465934',
  11 }
  12 
  13 BL2SEQ = '../bl2seq'
  14 
  15 fpser = GenBank.FeatureParser()
  16 ncbi = GenBank.NCBIDictionary('nucleotide', 'genbank', parser=fpser)
  17 
  18 def parseIdentity(aHandle):
  19     origin = []
  20     identities = []
  21     average = []
  22     for line in aHandle.readlines():
  23         try:
  24             if line[1] == 'I':
  25                 identities.append(int(line.split()[2].split('/')[0]))
  26                 average.append(int(line.split()[3].replace('%','').replace('(','').replace(')','').replace(',','')))
  27         except IndexError:
  28             pass
  29         origin.append(line)
  30     sum=0
  31     for each in average:
  32         sum+=each
  33     try:
  34         average = sum/float(len(average))
  35     except ZeroDivisionError:
  36         average =0
  37     return ' '.join(origin), identities, average
  38 
  39 def bl2seq(fasta1, fasta2):
  40     ff1Name=tempfile.mktemp('fasta')
  41     ff2Name=tempfile.mktemp('fasta')
  42     ff1=open(ff1Name,'w')
  43     ff2=open(ff2Name,'w')
  44     ff1.write(str(fasta1))
  45     ff2.write(str(fasta2))
  46     ff1.close()
  47     ff2.close()
  48 
  49     cmd = " ".join([BL2SEQ,'-i',ff1Name,'-j',ff2Name,'-p blastn','-X 30','-W 11'])
  50     w,r,e = os.popen3(cmd)
  51     w.close()
  52     return parseIdentity(r)
  53 
  54     
  55 class MyFeature:
  56     def __init__(self, id, position, sequence, species):
  57         self.id = id
  58         self.position = position
  59         self.sequence = sequence
  60         self.partner = None
  61         self.species = species
  62     def compareWith(self,aFeature):
  63         assert self.id == aFeature.id
  64         self.partner = aFeature
  65     def makeFasta(self):
  66         fasta = Fasta.Record()
  67         try:
  68             fasta.title = self.id +' '+str(self.species)
  69         except TypeError:
  70             fasta.title = self.id
  71         fasta.sequence = self.sequence
  72         return fasta
  73     def getTwoSequence(self):
  74         return (self.sequence, self.partner.sequence)
  75     def align(self):
  76         self.record, self.identities, self.average = bl2seq(self.makeFasta(), self.partner.makeFasta())
  77     def getSequenceLength(self):
  78         return len(self.sequence)
  79     def getResult(self):
  80         return self.record
  81     def getIdentities(self):
  82         return self.identities
  83     def getIdentity(self):
  84         sum=0
  85         for each in self.identities:
  86             sum+=each
  87         return sum/float(self.getSequenceLength()) * 100
  88     def getAverage(self):
  89         return self.average
  90 
  91 def getGenomeDict(aSpecies):
  92     genome = ncbi[gi[aSpecies]]
  93     genomeDict = {}
  94     for feature in genome.features:
  95         id = feature.qualifiers.get('gene')
  96         if not id:
  97             id = feature.qualifiers.get('product')
  98         try:
  99             id = id[0]
 100         except:
 101             pass
 102         if genomeDict.has_key(id):
 103             continue
 104         start = feature.location._start.position
 105         end = feature.location._end.position
 106         sequence = genome.seq[start-1:end].tostring()
 107         genomeDict[id] = MyFeature(id, (start,end), sequence, aSpecies)
 108     return genomeDict
 109 
 110 def combine(dict1, dict2):
 111     combDict = {}
 112     solo = []
 113     for id,feature in dict1.copy().iteritems():
 114         if id in dict2.keys():
 115             feature.compareWith(dict2.pop(id))
 116             combDict[id]=feature
 117             dict1.pop(id)
 118     return combDict,dict1,dict2
 119 
 120 def main():    
 121     pk=getGenomeDict('Pinus koraiensis')
 122     pt=getGenomeDict('Pinus thunbergii')
 123     pkpt,pkpt_pk,pkpt_pt=combine(pk,pt)
 124 
 125 
 126     pk=getGenomeDict('Pinus koraiensis')
 127     nt=getGenomeDict('Nicotiana tabacum')
 128     pknt,pknt_pk,pknt_nt=combine(pk,nt)
 129 
 130     tp_pkpt = open('pk_pt','w')
 131     tp_pknt = open('pk_nt','w')
 132 
 133     result={}
 134     for id,feature in pkpt.iteritems():
 135         if id is None:
 136             continue
 137         feature.align()
 138         tp_pkpt.write(feature.getResult())
 139         print '\t'.join([id, str(feature.getIdentities()), str(feature.getIdentity())])
 140         result[id]= [feature.getAverage()]
 141 
 142     print "================"
 143     for id,feature in pknt.iteritems():
 144         if id is None:
 145             continue
 146         feature.align()
 147         tp_pknt.write(feature.getResult())
 148         print '\t'.join([id, str(feature.getIdentities()), str(feature.getIdentity())])
 149         try:
 150             result[id].append(feature.getAverage())
 151         except:
 152             pass
 153     print "================"
 154     for id,feature in result.iteritems():
 155         if id is None:
 156             continue
 157         ptpk=''
 158         ptnt=''
 159         try:
 160             ptpk = str(feature[0])
 161             ptnt = str(feature[1])
 162         except IndexError:
 163             pass
 164         print '\t'.join([id,ptpk,ptnt])
 165 
 166 class PineTest(unittest.TestCase):
 167     def setUp(self):
 168         self.td1 = {
 169             'psbT': MyFeature('psbT', (52109, 52217), 'TATGGAAGCATTGGTTTATAA'), 
 170             'ORF44d': MyFeature('ORF44d', (68128, 68263), 'ATTAATGAATTGTTGCAT'), 
 171             'ORF63a': MyFeature('ORF63a', (26349, 26541), 'TATGGTTAATAGTTCCATGA'),
 172         }
 173         self.td2 = {
 174             'psbT': MyFeature('psbT', (52109, 52217), 'TATGGAAGCATTGGTTTATAA'), 
 175             'ORF44d': MyFeature('ORF44d', (68128, 68263), 'ATTAATGAATTGTTGCAT'), 
 176             'ORF63': MyFeature('ORF63', (26349, 26541), 'TATGGTTAATAGTTCCATGA'),
 177         }
 178     def testCombine(self):
 179         c,d1,d2=combine(self.td1,self.td2)
 180         expected = ('TATGGAAGCATTGGTTTATAA','TATGGAAGCATTGGTTTATAA')
 181         self.assertEquals(expected,c['psbT'].getTwoSequence())
 182         self.assertEquals('ORF63a',d1['ORF63a'].id)
 183         self.assertEquals('ORF63',d2['ORF63'].id)
 184 
 185     def testBl2seq(self):
 186         a=MyFeature('psbT', (52109, 52217), 'TTATGGGCATTGGTTTATAATATGGAAGCATTGGTTTATAATGGAAGCATTGGTTTATAATATGGAAGCATTGGTTTATAAATGGAAGCATTGGTTTATAA', 'abcd')
 187         b=MyFeature('psbT', (52109, 52217), 'TTATGGAAGCATTGGTTTATAATATGGAAGCATTGGTTTATAATGGAAGCATTGGTTTATAATATGGGCATTGGTTTATAAATGGAAGCATTGTATAA','efgh') 
 188         a.compareWith(b)
 189         a.align()
 190         self.assertEquals(3,a.getIdentity())
 191 
 192 if __name__=='__main__':
 193     #unittest.main(argv=('','-v'))
 194     main()

PineGenomeComparisonUsingEntrez.py (last edited 2012-08-24 11:33:25 by 182)

web biohackers.net