Phred score 파일 Parsing하여, 통계내기 스크립트.

Phred score파일은 FastaFormat과 유사하므로, BioPython을 이용할 수도 있으나, 다음 라인으로 건너뛸때, 전 라인의 마지막수치값과 다음라인의 첫수치값이 하나의 문자열로 결합되는 문제 때문에 새로 만들 필요가 있다. 참고로, qual파일이 아닌 phd파일일 경우, 해당 파서가 만들어져 있다. (Bio.Sequencing.Phred)

ReverseComplementIterator.py와 유사하며, PythonNewFunction인 클래스Iterator를 이용하여 구현

   1 """PhredScoreParsing by [yong27], 2004-03-29
   2 """
   3 
   4 import sys,unittest,math
   5 from cStringIO import StringIO
   6 
   7 class QualRecord:
   8     def __init__(self):
   9         self.ori_title = ''
  10         self.title = ''
  11         self.quals = []
  12 
  13     def setTitle(self, aTitle):
  14         self.ori_title = aTitle
  15         self.title = self.ori_title.split()[0][1:]
  16 
  17     def setQuals(self, aStrLines):
  18         for line in aStrLines:
  19             self.quals.extend(map(int,line.split()))
  20 
  21     def getLength(self):
  22         length = len(self.quals)
  23         assert length == int(self.ori_title.split()[1])
  24         return length
  25 
  26     def getAverage(self):
  27         sum=0
  28         for each in self.quals:
  29             sum+=each
  30         return sum / float(self.getLength())
  31 
  32     def getStd(self):
  33         avg = self.getAverage()
  34         sum=0
  35         for each in self.quals:
  36             sum+=(avg-each)*(avg-each)
  37         return math.sqrt(sum/float(self.getLength()))
  38 
  39     def getNumberMoreThan20(self):
  40         return len([n for n in self.quals if n>=20])
  41 
  42     def getStatisticalResult(self):
  43         return self.getLength(), self.getAverage(), self.getStd()
  44 
  45 
  46 class QualIterator:
  47     def __init__(self, aFileHandler):
  48         self.file = aFileHandler
  49         self._saved = []
  50     def __iter__(self):
  51         return self
  52 
  53     def next(self):
  54         lines=[]
  55         while True:
  56             line = self.file.readline()
  57             if not line:
  58                 break
  59             if lines and line[0]=='>':
  60                 self._saved.append(line)
  61                 break
  62             lines.append(line.strip())
  63         if not lines:
  64             raise StopIteration
  65         return self.getRecord(lines)
  66 
  67     def getRecord(self, lines):
  68         record = QualRecord()
  69         if lines[0][0]=='>': ## when first iteration
  70             record.setTitle(lines[0])
  71             lines = lines[1:]
  72         else:
  73             record.setTitle(self._saved[0])
  74             self._saved = self._saved[1:]
  75         record.setQuals(lines)
  76         return record
  77 
  78 
  79 class QualIteratorTest(unittest.TestCase):
  80     def setUp(self):
  81         instr = """\
  82 >1013C2SP6    36      0    226  ABI
  83 10 24 29 40 40 40 40 40 40 46 51 56 51 51 40 40 45
  84 45 45 45 51 46 46 46 32 35 23 23 11 11 11 45 31 35
  85 33 32
  86 >1013C2SP7    38      0    226  ABI
  87 10 24 29 40 40 40 40 40 40 46 51 56 51 51 40 40 45
  88 45 45 45 51 46 46 46 32 35 23 23 11 11 11 45 31 35
  89 33 32 11 19
  90 """
  91         self.input = StringIO(instr)
  92     def testIterationAndTitle(self):
  93         qi=QualIterator(self.input)
  94         self.assertEquals('1013C2SP6',qi.next().title)
  95         self.assertEquals('1013C2SP7',qi.next().title)
  96         self.failUnlessRaises(StopIteration,qi.next)
  97 
  98     def testQuals(self):
  99         qi=QualIterator(self.input)
 100         expected=[10, 24, 29, 40, 40, 40, 40, 40, 40, 46,
 101                   51, 56, 51, 51, 40, 40, 45, 45, 45, 45,
 102                   51, 46, 46, 46, 32, 35, 23, 23, 11, 11,
 103                   11, 45, 31, 35, 33, 32]
 104         self.assertEquals(expected, qi.next().quals)
 105         self.assertEquals(expected+[11,19], qi.next().quals)
 106         self.failUnlessRaises(StopIteration,qi.next)
 107 
 108     def testNumberMoreThan20(self):
 109         qi=QualIterator(self.input)
 110         qr = qi.next()
 111         self.assertEquals(32, qr.getNumberMoreThan20())
 112 
 113 
 114     def testStatisticalResult(self):
 115         qi=QualIterator(self.input)
 116         len,avg,std=qi.next().getStatisticalResult()
 117         self.assertEquals((36, 36.916666666666664, 12.193748398257197),(len,avg,std))
 118 
 119 if __name__=='__main__':
 120     #unittest.main(argv=('','-v'))
 121     sys.stdout.write('\t'.join([
 122         'filename','length','average','std','more than 20',
 123         ])+'\n')
 124     for record in QualIterator(sys.stdin):
 125         result = [record.title]
 126         result.extend(map(str, record.getStatisticalResult()))
 127         result.append(str(record.getNumberMoreThan20()))
 128         sys.stdout.write('\t'.join(result)+'\n')

PhredScoreParsing (last edited 2012-12-26 16:46:22 by 182)

web biohackers.net