EstClustering으로 ExpressionProfile을 계산
동일종에서 유래한 각기 다른 라이브러리들을 합쳐서 EST Clustering 한 뒤에 각 라이브러리 별로 특정 유전자(contig)의 발현빈도를 계산하는 방법
Symbols description
m : number of cDNA libraries
xij : number of transcript(est) copies of gene(contig) j in the i th library
Ni : total number of cDNA clones sequenced in the i th library
fj : frequency of gene transcript copies of gene j in all the libraries
Python script
1 """ExpressionProfileByEstClustering by -- ["yong27"] [[DateTime(2006-08-23T16:57:50Z)]]"""
2
3 import sys, math, unittest
4
5 class SummableDict(dict):
6 def sum(self):
7 return sum(self.values())
8
9 def __iadd__(self, sdict):
10 for key in sdict:
11 if key in self:
12 self[key]+=sdict[key]
13 else:
14 self[key] = sdict[key]
15 return self
16
17
18 class Contig:
19 def __init__(self, lines):
20 self.name = lines[0]
21 self.ests = lines[1:]
22 self.libraries = SummableDict(PLM=0, BF=0, PAF=0, PFC1=0)
23 for est in self.ests:
24 swords = est.split('_')
25 if swords[2] !="N":
26 self.libraries[swords[1]] += 1
27
28 self.nAllEsts = self.libraries.sum()
29 self.r = 0
30
31 def nEsts(self, library):
32 return self.libraries.get(library)
33
34 def withTotalEstsDict(self, total_ests):
35 f = self.nAllEsts / float(total_ests.sum())
36 for library, nEsts in self.libraries.iteritems():
37 if nEsts != 0:
38 self.r += nEsts * math.log(nEsts / (total_ests.get(library) * f))
39
40 def __str__(self):
41 return '\t'.join(map(str, [self.name,
42 self.nEsts('PLM'),
43 self.nEsts('BF'),
44 self.nEsts('PAF'),
45 self.nEsts('PFC1'),
46 self.nAllEsts, self.r,
47 ]))
48
49
50 class FastaIterator:
51 def __init__(self, ifile):
52 self.ifile = ifile
53 self.g = self.getGenerator()
54 def getGenerator(self):
55 lines = [self.ifile.next().strip()]
56 for line in self.ifile:
57 if line.startswith('>'):
58 yield Contig(lines)
59 lines = [line.strip()]
60 else:
61 lines.append(line.strip())
62 else:
63 yield Contig(lines)
64 def __iter__(self):
65 return self.g
66
67 def next(self):
68 return self.g.next()
69
70
71 from cStringIO import StringIO
72 class ContigStatTest(unittest.TestCase):
73 def setUp(self):
74 input = """\
75 >TOTAL_contig00001
76 susfleck_BF_8184_P21.ab1
77 susfleck_PFC1_6568_A14.ab1
78 susfleck_PLM_N_7376_F15.ab1
79 susfleck_PLM_7780_K07.ab1
80 susfleck_PLM_7781_K07.ab1
81 >TOTAL_contig00002
82 susfleck_BF_0021_0024_D23.ab1
83 susfleck_PAF_2932_E15.ab1
84 susfleck_BF_0073_0076_C12.ab
85 """
86 self.fi = FastaIterator(StringIO(input))
87
88 def testCount(self):
89 c1 = self.fi.next()
90 self.assertEquals('>TOTAL_contig00001', c1.name)
91 self.assertEquals(1, c1.nEsts('BF'))
92 self.assertEquals(2, c1.nEsts('PLM'))
93 self.assertEquals(4, c1.nAllEsts)
94
95 c2 = self.fi.next()
96 self.assertEquals('>TOTAL_contig00002', c2.name)
97 self.assertEquals(2, c2.nEsts('BF'))
98 self.assertEquals(0, c2.nEsts('PLM'))
99 self.assertEquals(1, c2.nEsts('PAF'))
100 self.assertEquals(3, c2.nAllEsts)
101
102 def testSummableDict(self):
103 sd1 = SummableDict(a=0, b=2, d=2)
104 sd2 = SummableDict(a=1, c=4, d=3)
105 sd1 += sd2
106 self.assertEquals(SummableDict(a=1, b=2, c=4, d=5), sd1)
107
108
109 def main(aFile):
110 contigs = []
111 total_ests = SummableDict(PLM=0, BF=0, PAF=0, PFC1=0)
112 for contig in FastaIterator(aFile):
113 total_ests += contig.libraries
114 contigs.append(contig)
115
116 for contig in contigs:
117 contig.withTotalEstsDict(total_ests)
118
119 contigs.sort(key=lambda x: x.r, reverse=True)
120
121 print '\t'.join(['Contig name', 'PLM', 'BF','PAF','PFC1', 'Sum', 'R'])
122 for contig in contigs:
123 print contig
124
125 if __name__=='__main__':
126 #unittest.main()
127 #main(file('total_contig_info.txt'))
128 main(sys.stdin)