FASTA format.
A sequence in FastaFormat begins with a single-line description, followed by lines of sequence data. The description line is distinguished from the sequence data by a greater-than (">") symbol in the first column. It is recommended that all lines of text be shorter than 80 characters in length. An example sequence is:
>gi|532319|pir|TVFV2E|TVFV2E envelope protein
ELRLRYCAPAGFALLKCNDADYDGFKTNCSNVSVVHCTNLMNTTVTTGLLLNGSYSENRT
QIWQKHRTSNDSALILLNKHYNLTVTCKRPGNKTVLPVTIMAGLVFHSQKYNLRLRQAWC
HFPSNWKGAWKEVKEEIVNLPKERYRGTNDPKRIFFQRQWGDPETANLWFNCHGEFFYCK
MDWFLNYLNNLTVDADHNECKNTSGTKSGNKRAPGPCVQRTYVACHIRSVIIWLETISKK
TYAPPREGHLECTSTVTGMTVELNYIPKNRTNVTLSPQIESIWAAELDRYKLVEITPIGF
APTEVRRYTGGHERQKRVPFVXXXXXXXXXXXXXXXXXXXXXXVQSQHLLAGILQQQKNL
LAAVEAQQQMLKLTIWGVK
>my test sequence for 532319
ELRLRYCAPAGFALLKCNDADYDGFKTNCSNVSVVHCTNLMNTTVTTGLLLNGSYSENRT
QIWQKHRTSNDSALILLNKHYNLTVTCKRPGNKTVLPVTIMAGLVFHSQKYNLRLRQAWC
HFPSNWKGAWKEVKEEIVNLPKERYRGTNDPKRIFFQRQWGDPETANLWFNCHGEFFYCK
MDWFLNYLNNLTVDADHNECKNTSGTKSGNKRAPGPCVQRTYVACHIRSVIIWLETISKK
LAAVEAQQQMLKLTIWGVK
BioSequence is expected to be represented in the standard IUB/IUPAC AminoAcid and NucleicAcid codes, with these exceptions: lower-case letters are accepted and are mapped into upper-case; a single hyphen or dash can be used to represent a gap of indeterminate length; and in AminoAcid sequences, U and * are acceptable letters (see below). Before submitting a request, any numerical digits in the query sequence should either be removed or replaced by appropriate letter codes (e.g., N for unknown NucleicAcid residue or X for unknown AminoAcid residue).
BioPython을 써서 FastaFormat 다루기
BioPython으로 FastaFormat을 다루는 요령은 다음과 같다.
입력할 때 - 주로 FastaFormat의 Parsing
from Bio import Fasta, File
from cStringIO import StringIO

# Example: walk through every record of a FASTA file with the Bio.Fasta API.
#
# If the FASTA text is already held in a string, build the handle like this
# instead of opening a file:
# handle = File.UndoHandle(StringIO(fastaStr))
handle = open('file.fasta', 'r')
try:
    parser = Fasta.RecordParser()
    iterator = Fasta.Iterator(handle, parser)
    while 1:
        # One FASTA file may hold several records; fetch them one at a time.
        curRecord = iterator.next()
        if curRecord is None:
            # The loop relies on the iterator returning None when exhausted.
            break
        title = curRecord.title        # title of the record
        seq = curRecord.sequence       # sequence of the record
finally:
    # Release the file even if parsing fails part-way through.
    handle.close()
출력할 때 - stdout으로 출력한다면
from Bio import Fasta

# Example: build a FASTA record by hand, print it, and save it to a file.
# NOTE(review): the title below already starts with '>'; confirm whether
# Fasta.Record adds its own '>' when converted to text.
title = '>This is test title'   # title of the FASTA record
seq = 'ATGGGGGTGTGTGTGGGG'      # the sequence as one long string
fasta = Fasta.Record()          # create a record instance named fasta
fasta.title = title             # assign the title attribute directly
fasta.sequence = seq            # likewise for the sequence
print(fasta)                    # printing also inserts '\n' after every 60 characters

# if you want to write on file
wfile = open('쓰고자하는파일', 'w')
try:
    wfile.write(str(fasta))
finally:
    # Close explicitly so the record is flushed to disk.
    wfile.close()
RelationalDatabase에서 직접 만들기
SELECT CONCAT(">gi|", annot.gi, "|sp|", annot.acc, "|", sp.name, " ", annot.descr, "\n", protein.seq) FROM protein INNER JOIN annot USING (prot_id) INNER JOIN sp USING (acc) WHERE annot.current = 1;
$ mysql seqdb -N < swissprot.sql > swissprot.fa
관련코드모음
HTML로 FastaFormat 꾸미기
JuneKim씨 코드(2004-06-13) : 파이썬 커뮤니티에 정규식 중에 중간에 개행문자가 들어와도 되는 경우를 물으셨더군요. 다음과 같이 할 수도 있습니다.
import re


class Enclose:
    """Wrap every occurrence of given subsequences in opener/closer strings.

    A subsequence still matches when a single whitespace character (for
    example the line break of a wrapped sequence) sits between two of its
    letters. Matching ignores case.
    """

    def __init__(self, d):
        """Build the combined pattern.

        d -- iterable of (subsequence, (opener, closer)) pairs.
        """
        # Store (opener, closer) beside the regex fragment of its key, in the
        # same order as the capture groups of the combined pattern.
        self.d = [(v, self.fragmentable(k)) for k, v in d]
        # One capture group per key, so m.lastindex tells which key matched.
        self.p = re.compile("(?i)(%s)" % ")|(".join([f for _, f in self.d]))

    def fragmentable(self, s):
        """Return a regex fragment matching s with optional whitespace between letters."""
        # Escape each character so that regex metacharacters in a key are
        # matched literally and cannot add capture groups of their own.
        return r'\s?'.join([re.escape(ch) for ch in s])

    def __call__(self, m):
        """Return the matched text of m wrapped in its key's opener and closer."""
        opener, closer = self.d[m.lastindex - 1][0]
        return "%s%s%s" % (opener, m.group(), closer)

    def do(self, text):
        """Return text with every matched subsequence enclosed."""
        if not self.d:
            # With no keys the pattern would match the empty string everywhere
            # and there would be no opener/closer to look up.
            return text
        return self.p.sub(self, text)


if __name__ == "__main__":
    sequence = (
        "TCTTCTCCTCACCTCGCTCTCGCCGCCTGCTCGCCCCGNCCGCTTTGCTCGGCGCCCCAA\n"
        "AACACNCTTCCACCATGNGCCACCTCGGCGAGCCCTCCCACTTGAACAAAGGGGTGCTCG\n"
        "GCGCGTGTACNNATGGCCC"
    )
    expected = (
        'TCTTCTCCTCACCTCGCTCTCGCCGCCTGCTCGCCCCGNCCGCTTTGCTCGGCG<b>CCCCAA\n'
        'AACACN</b>CTTCCACCATGNGCC<font color="red">ACCTCGGCGAGCC</font>CTCCCACTTGAACAAAGGGGTGC<i>TCG\n'
        'GCGCGTG</i>TACNNATGGCCC'
    )

    d = (('CCCCAAAACACN', ('<b>', '</b>')),
         ('TCGGCGCGTG', ('<i>', '</i>')),
         ('ACCTCGGCGAGCC', ('<font color="red">', '</font>')),
         )

    r = Enclose(d).do(sequence)
    assert r == expected
간단한 Iterator
class FastaIterator:
    """Iterate over the records of a FASTA stream, one raw text block per record."""

    def __init__(self, ifile):
        """Wrap ifile, an iterable of text lines such as an open file."""
        self.ifile = ifile
        self.g = self.getGenerator()

    def getGenerator(self):
        """Yield each record as the concatenation of its original lines.

        A new record starts at every line beginning with '>'. Nothing is
        yielded for empty input.
        """
        lines = []
        for line in self.ifile:
            # A header line closes the record collected so far, if any.
            if line.startswith('>') and lines:
                yield ''.join(lines)
                lines = []
            lines.append(line)
        if lines:
            # Flush the final record, which has no following header.
            yield ''.join(lines)

    def __iter__(self):
        """Return the single underlying generator (the object is one-pass)."""
        return self.g
프로그래밍 교육용 예제
1 #!/usr/bin/env python
2 import unittest, sys
3 from cStringIO import StringIO
4
class DNA:
    """A titled DNA sequence with FASTA parsing and formatting helpers."""

    # Complement of every IUPAC nucleotide code; the gap symbol '-' maps to
    # itself so gapped sequences can be reverse-complemented too.
    CompMap = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G',
               'R': 'Y', 'Y': 'R', 'K': 'M', 'M': 'K',
               'S': 'S', 'W': 'W', 'B': 'V', 'V': 'B',
               'D': 'H', 'H': 'D', 'N': 'N', '-': '-'}

    def __init__(self):
        self.title = ''
        self.sequence = ''

    def getRevCompSeq(self):
        """Return the reverse complement of the sequence, in upper case.

        Lower-case letters are mapped to upper case before lookup.
        Raises KeyError for a symbol that is not in CompMap.
        """
        return ''.join([DNA.CompMap[nucleotide]
                        for nucleotide in reversed(self.sequence.upper())])

    def getFasta(self, cols=60):
        """Return the record as FASTA text with sequence lines of width cols.

        The result always ends with a newline.
        Raises ValueError if cols is not positive, since such a width
        cannot consume the sequence.
        """
        if cols <= 0:
            raise ValueError('cols must be a positive integer')
        result = ['>' + self.title]
        seq = self.sequence
        result.extend([seq[start:start + cols]
                       for start in range(0, len(seq), cols)])
        return '\n'.join(result) + '\n'

    @staticmethod
    def parseFasta(aStr):
        """Build a DNA object from the text of a single FASTA record.

        The first line, minus its first character, becomes the title; the
        remaining lines are stripped of surrounding whitespace and joined
        into the sequence. Empty input gives an empty DNA object.
        """
        dna = DNA()
        lines = aStr.splitlines()
        if lines:
            dna.title = lines[0][1:]
            dna.sequence = ''.join([line.strip() for line in lines[1:]])
        return dna
31
def FastaGenerator(aFile):
    """Yield each FASTA record of aFile as one string of its original lines.

    aFile is any iterable of text lines, such as an open file. A new record
    starts at every line beginning with '>'. Nothing is yielded for empty
    input.
    """
    lines = []
    for line in aFile:
        # A header line closes the record collected so far, if any.
        if line.startswith('>') and lines:
            yield ''.join(lines)
            lines = []
        lines.append(line)
    if lines:
        # Flush the final record, which has no following header.
        yield ''.join(lines)
41
class DnaTest(unittest.TestCase):
    """Unit tests for the DNA class and FastaGenerator."""

    def setUp(self):
        self.dna = DNA()
        self.dna.title = 'test'
        self.dna.sequence = 'AGTC'

    def testRevComp(self):
        self.assertEqual('GACT', self.dna.getRevCompSeq())

    def testParsing(self):
        # The last line deliberately has no trailing newline.
        text = '>test\nAGTC\nCCC'
        self.dna = DNA.parseFasta(text)
        self.assertEqual('test', self.dna.title)
        self.assertEqual('AGTCCCC', self.dna.sequence)

    def testRepr(self):
        self.dna.sequence = 'AGTCCCC'
        expected = '>test\nAGT\nCCC\nC\n'
        self.assertEqual(expected, self.dna.getFasta(cols=3))

    def testFastaGenerator(self):
        source = StringIO('>test1\nAGTC\nCCC\n>test2\nGGGG\n')
        expect1 = '>test1\nAGTC\nCCC\n'
        expect2 = '>test2\nGGGG\n'
        # Collecting into a list also verifies that the generator stops
        # after exactly two records.
        records = list(FastaGenerator(source))
        self.assertEqual([expect1, expect2], records)
91
def main():
    """Copy FASTA records from stdin to stdout, reverse-complementing each sequence."""
    records = FastaGenerator(sys.stdin)
    for text in records:
        entry = DNA.parseFasta(text)
        # Replace the sequence in place, then emit the record as FASTA text.
        entry.sequence = entry.getRevCompSeq()
        sys.stdout.write(entry.getFasta())


if __name__ == '__main__':
    # Running the file executes the unit tests; the filter entry point is
    # left disabled below.
    #main()
    unittest.main()
각종 변환 프로그램(using WxPython) --> wiki/FastaConvertor