FastaFormat - BioHackersNet

FASTA format.

A sequence in FastaFormat begins with a single-line description, followed by lines of sequence data. The description line is distinguished from the sequence data by a greater-than (">") symbol in the first column. It is recommended that all lines of text be shorter than 80 characters in length. An example sequence is:

>gi|532319|pir|TVFV2E|TVFV2E envelope protein
ELRLRYCAPAGFALLKCNDADYDGFKTNCSNVSVVHCTNLMNTTVTTGLLLNGSYSENRT
QIWQKHRTSNDSALILLNKHYNLTVTCKRPGNKTVLPVTIMAGLVFHSQKYNLRLRQAWC
HFPSNWKGAWKEVKEEIVNLPKERYRGTNDPKRIFFQRQWGDPETANLWFNCHGEFFYCK
MDWFLNYLNNLTVDADHNECKNTSGTKSGNKRAPGPCVQRTYVACHIRSVIIWLETISKK
TYAPPREGHLECTSTVTGMTVELNYIPKNRTNVTLSPQIESIWAAELDRYKLVEITPIGF
APTEVRRYTGGHERQKRVPFVXXXXXXXXXXXXXXXXXXXXXXVQSQHLLAGILQQQKNL
LAAVEAQQQMLKLTIWGVK
>my test sequence for 532319
ELRLRYCAPAGFALLKCNDADYDGFKTNCSNVSVVHCTNLMNTTVTTGLLLNGSYSENRT
QIWQKHRTSNDSALILLNKHYNLTVTCKRPGNKTVLPVTIMAGLVFHSQKYNLRLRQAWC
HFPSNWKGAWKEVKEEIVNLPKERYRGTNDPKRIFFQRQWGDPETANLWFNCHGEFFYCK
MDWFLNYLNNLTVDADHNECKNTSGTKSGNKRAPGPCVQRTYVACHIRSVIIWLETISKK
LAAVEAQQQMLKLTIWGVK

BioSequence is expected to be represented in the standard IUB/IUPAC AminoAcid and NucleicAcid codes, with these exceptions: lower-case letters are accepted and are mapped into upper-case; a single hyphen or dash can be used to represent a gap of indeterminate length; and in AminoAcid sequences, U and * are acceptable letters (see below). Before submitting a request, any numerical digits in the query sequence should either be removed or replaced by appropriate letter codes (e.g., N for unknown NucleicAcid residue or X for unknown AminoAcid residue).

BioPython을 써서 FastaFormat 다루기

BioPython으로 FastaFormat을 다루는 요령은 다음과 같다.

입력할 때 - 주로 FastaFormat의 Parsing

   1 from Bio import Fasta, File
   2 from cStringIO import StringIO 
   3 #file = File.UndoHandle(StringIO(fastaStr)) # 만일 스트링으로 갖고있을경우
   4 file = open('file.fasta', 'r') 
   5 parser = Fasta.RecordParser() 
   6 iterator = Fasta.Iterator(file, parser) 
   7 while 1: 
   8     curRecord = iterator.next()  # 하나의 fasta file내에 여러개의 record를 반복적으로 접근 
   9     if curRecord is None: break 
  10     title = curRecord.title   # 레코드에서 타이틀 
  11     seq = curRecord.sequence  # 레코드에서 서열

출력할 때 - stdout으로 뿌려준다면

   1 from Bio import Fasta 
   2 title = '>This is test title'  # fasta file의 title 
   3 seq = 'ATGGGGGTGTGTGTGGGG' # 하나의 긴 문자열 
   4 fasta = Fasta.Record()   # fasta라는 인스턴스를 만듦 
   5 fasta.title = title              # 강제로 title속성에 값을 부여 
   6 fasta.sequence = seq    # 마찬가지 
   7 print fasta                     # 이 명령으로 60자리후의 '\n'입력까지 자동으로 된다. 
   8 
   9 # if you want to write on file
  10 wfile = open('쓰고자하는파일', 'w') 
  11 wfile.write(str(fasta))

RelationalDatabase에서 직접 만들기

-- Emit one FASTA record per result row: a header line of the form
--   >gi|<annot.gi>|sp|<annot.acc>|<sp.name> <annot.descr>
-- followed by a line break and the whole protein.seq on one unwrapped line.
SELECT CONCAT(">gi|", annot.gi, "|sp|", annot.acc, "|", sp.name, " ", annot.descr, "\n", protein.seq)
-- protein rows are matched to annot on prot_id, and the result to sp on acc.
FROM   protein INNER JOIN annot USING (prot_id) INNER JOIN sp USING (acc)
-- Keep only rows whose annot.current is 1 (presumably the current annotation -- confirm against the schema).
WHERE  annot.current = 1;
# -N drops the column-name header row. -r (--raw) is required as well: with
# redirected output the client escapes special characters, so without it the
# "\n" produced by CONCAT is written as a literal backslash-n and the header
# and sequence end up on the same line.
$ mysql seqdb -N -r < swissprot.sql > swissprot.fa

관련코드모음

HTML로 FastaFormat 꾸미기

DecoratorPattern 이용 : FastaDecorator.py

JuneKim씨 코드(2004-06-13) : 파이썬 커뮤니티에 정규식 중에 중간에 개행문자가 들어와도 되는 경우를 물으셨더군요. 다음과 같이 할 수도 있습니다.

   1 import re
   2 
   3 class Enclose:
   4     def __init__(self,d):
   5         self.d=[(v,self.fragmentable(k)) for k,v in d]
   6         self.p=re.compile("(?i)(%s)"%")|(".join([f for _,f in self.d]))
   7     def fragmentable(self,s): return '\s?'.join(list(s))
   8     def __call__(self, m):
   9         opener,closer=self.d[m.lastindex-1][0]
  10         return "%s%s%s"%(opener,m.group(),closer)
  11     def do(self, text):
  12         return self.p.sub(self, text)
  13 
  14 if __name__ == "__main__": 
  15     sequence = """\
  16 TCTTCTCCTCACCTCGCTCTCGCCGCCTGCTCGCCCCGNCCGCTTTGCTCGGCGCCCCAA
  17 AACACNCTTCCACCATGNGCCACCTCGGCGAGCCCTCCCACTTGAACAAAGGGGTGCTCG
  18 GCGCGTGTACNNATGGCCC\
  19 """
  20     expected="""TCTTCTCCTCACCTCGCTCTCGCCGCCTGCTCGCCCCGNCCGCTTTGCTCGGCG<b>CCCCAA
  21 AACACN</b>CTTCCACCATGNGCC<font color="red">ACCTCGGCGAGCC</font>CTCCCACTTGAACAAAGGGGTGC<i>TCG
  22 GCGCGTG</i>TACNNATGGCCC"""
  23 
  24     d=(('CCCCAAAACACN',('<b>','</b>')),
  25        ('TCGGCGCGTG',('<i>','</i>')),
  26        ('ACCTCGGCGAGCC',('<font color="red">','</font>')),
  27        )
  28 
  29     r=Enclose(d).do(sequence)
  30     assert r==expected

간단한 Iterator

   1 class FastaIterator:
   2     def __init__(self, ifile):
   3         self.ifile = ifile
   4         self.g = self.getGenerator()
   5     def getGenerator(self):
   6         lines = [self.ifile.next()]
   7         for line in self.ifile:
   8             if line.startswith('>'):
   9                 yield ''.join(lines)
  10                 lines = [line]
  11             else:
  12                 lines.append(line)
  13         else:
  14             yield ''.join(lines)
  15     def __iter__(self):
  16         return self.g

프로그래밍 교육용 예제

   1 #!/usr/bin/env python
   2 import unittest, sys
   3 from cStringIO import StringIO
   4 
   5 class DNA:
   6     CompMap = {'A':'T','T':'A','G':'C','C':'G'}
   7     def __init__(self):
   8         self.title = ''
   9         self.sequence = ''
  10 
  11     def getRevCompSeq(self):
  12         return ''.join(DNA.CompMap[nucleotide]
  13                 for nucleotide in reversed(self.sequence))
  14 
  15     def getFasta(self, cols=60):
  16         result = []
  17         result.append('>'+self.title)
  18         seq = self.sequence
  19         while seq:
  20             result.append(seq[:cols])
  21             seq = seq[cols:]
  22         return '\n'.join(result)+'\n'
  23 
  24     @staticmethod
  25     def parseFasta(aStr):
  26         dna=DNA()
  27         lines = aStr.splitlines()
  28         dna.title = lines[0][1:]
  29         dna.sequence = ''.join(lines[1:])
  30         return dna
  31 
  32 def FastaGenerator(aFile):
  33     lines =[aFile.next()]
  34     for line in aFile:
  35         if line.startswith('>'):
  36             yield ''.join(lines)
  37             lines = []
  38         lines.append(line)
  39     else:
  40         yield ''.join(lines)
  41 
  42 class DnaTest(unittest.TestCase):
  43     def setUp(self):
  44         self.dna = DNA()
  45         self.dna.title = 'test'
  46         self.dna.sequence = 'AGTC'
  47 
  48     def testRevComp(self):
  49         self.assertEquals('GACT',self.dna.getRevCompSeq())
  50 
  51     def testParsing(self):
  52         input='''\
  53 >test
  54 AGTC
  55 CCC'''
  56         self.dna = DNA.parseFasta(input)
  57         self.assertEquals('test',self.dna.title)
  58         self.assertEquals('AGTCCCC', self.dna.sequence)
  59 
  60     def testRepr(self):
  61         self.dna.sequence = 'AGTCCCC'
  62         expected='''\
  63 >test
  64 AGT
  65 CCC
  66 C
  67 '''
  68         self.assertEquals(expected, self.dna.getFasta(cols=3))
  69 
  70     def testFastaGenerator(self):
  71         input = StringIO("""\
  72 >test1
  73 AGTC
  74 CCC
  75 >test2
  76 GGGG
  77 """)
  78         g = FastaGenerator(input)
  79         expect1 = """\
  80 >test1
  81 AGTC
  82 CCC
  83 """
  84         self.assertEquals(expect1, g.next())
  85         expect2 = """\
  86 >test2
  87 GGGG
  88 """
  89         self.assertEquals(expect2, g.next())
  90         #self.assertRaise(StopIteration, g.next)
  91 
  92 def main():
  93     g = FastaGenerator(sys.stdin)
  94     for record in g:
  95         dna = DNA.parseFasta(record)
  96         dna.sequence = dna.getRevCompSeq()
  97         sys.stdout.write(dna.getFasta())
  98 
  99 if __name__=='__main__':
 100     #main()  # enable this call, and disable the next line, to run main() instead of the unit tests
 101     unittest.main()

각종 변환 프로그램(using WxPython) --> wiki/FastaConvertor