Phred score 파일 Parsing하여, 통계내기 스크립트.
Phred score파일은 FastaFormat과 유사하므로, BioPython을 이용할 수도 있으나, 다음 라인으로 건너뛸때, 전 라인의 마지막수치값과 다음라인의 첫수치값이 하나의 문자열로 결합되는 문제 때문에 새로 만들 필요가 있다. 참고로, qual파일이 아닌 phd파일일 경우, 해당 파서가 만들어져 있다. (Bio.Sequencing.Phred)
ReverseComplementIterator.py와 유사하며, PythonNewFunction인 클래스Iterator를 이용하여 구현
1 """PhredScoreParsing by [yong27], 2004-03-29
2 """
3
4 import sys,unittest,math
5 from cStringIO import StringIO
6
7 class QualRecord:
8 def __init__(self):
9 self.ori_title = ''
10 self.title = ''
11 self.quals = []
12
13 def setTitle(self, aTitle):
14 self.ori_title = aTitle
15 self.title = self.ori_title.split()[0][1:]
16
17 def setQuals(self, aStrLines):
18 for line in aStrLines:
19 self.quals.extend(map(int,line.split()))
20
21 def getLength(self):
22 length = len(self.quals)
23 assert length == int(self.ori_title.split()[1])
24 return length
25
26 def getAverage(self):
27 sum=0
28 for each in self.quals:
29 sum+=each
30 return sum / float(self.getLength())
31
32 def getStd(self):
33 avg = self.getAverage()
34 sum=0
35 for each in self.quals:
36 sum+=(avg-each)*(avg-each)
37 return math.sqrt(sum/float(self.getLength()))
38
39 def getNumberMoreThan20(self):
40 return len([n for n in self.quals if n>=20])
41
42 def getStatisticalResult(self):
43 return self.getLength(), self.getAverage(), self.getStd()
44
45
46 class QualIterator:
47 def __init__(self, aFileHandler):
48 self.file = aFileHandler
49 self._saved = []
50 def __iter__(self):
51 return self
52
53 def next(self):
54 lines=[]
55 while True:
56 line = self.file.readline()
57 if not line:
58 break
59 if lines and line[0]=='>':
60 self._saved.append(line)
61 break
62 lines.append(line.strip())
63 if not lines:
64 raise StopIteration
65 return self.getRecord(lines)
66
67 def getRecord(self, lines):
68 record = QualRecord()
69 if lines[0][0]=='>': ## when first iteration
70 record.setTitle(lines[0])
71 lines = lines[1:]
72 else:
73 record.setTitle(self._saved[0])
74 self._saved = self._saved[1:]
75 record.setQuals(lines)
76 return record
77
78
79 class QualIteratorTest(unittest.TestCase):
80 def setUp(self):
81 instr = """\
82 >1013C2SP6 36 0 226 ABI
83 10 24 29 40 40 40 40 40 40 46 51 56 51 51 40 40 45
84 45 45 45 51 46 46 46 32 35 23 23 11 11 11 45 31 35
85 33 32
86 >1013C2SP7 38 0 226 ABI
87 10 24 29 40 40 40 40 40 40 46 51 56 51 51 40 40 45
88 45 45 45 51 46 46 46 32 35 23 23 11 11 11 45 31 35
89 33 32 11 19
90 """
91 self.input = StringIO(instr)
92 def testIterationAndTitle(self):
93 qi=QualIterator(self.input)
94 self.assertEquals('1013C2SP6',qi.next().title)
95 self.assertEquals('1013C2SP7',qi.next().title)
96 self.failUnlessRaises(StopIteration,qi.next)
97
98 def testQuals(self):
99 qi=QualIterator(self.input)
100 expected=[10, 24, 29, 40, 40, 40, 40, 40, 40, 46,
101 51, 56, 51, 51, 40, 40, 45, 45, 45, 45,
102 51, 46, 46, 46, 32, 35, 23, 23, 11, 11,
103 11, 45, 31, 35, 33, 32]
104 self.assertEquals(expected, qi.next().quals)
105 self.assertEquals(expected+[11,19], qi.next().quals)
106 self.failUnlessRaises(StopIteration,qi.next)
107
108 def testNumberMoreThan20(self):
109 qi=QualIterator(self.input)
110 qr = qi.next()
111 self.assertEquals(32, qr.getNumberMoreThan20())
112
113
114 def testStatisticalResult(self):
115 qi=QualIterator(self.input)
116 len,avg,std=qi.next().getStatisticalResult()
117 self.assertEquals((36, 36.916666666666664, 12.193748398257197),(len,avg,std))
118
119 if __name__=='__main__':
120 #unittest.main(argv=('','-v'))
121 sys.stdout.write('\t'.join([
122 'filename','length','average','std','more than 20',
123 ])+'\n')
124 for record in QualIterator(sys.stdin):
125 result = [record.title]
126 result.extend(map(str, record.getStatisticalResult()))
127 result.append(str(record.getNumberMoreThan20()))
128 sys.stdout.write('\t'.join(result)+'\n')