[[FASTA]] format. A sequence in FastaFormat begins with a single-line description, followed by lines of sequence data. The description line is distinguished from the sequence data by a greater-than (">") symbol in the first column. It is recommended that all lines of text be shorter than 80 characters in length. An example sequence is:
{{{
>gi|532319|pir|TVFV2E|TVFV2E envelope protein
ELRLRYCAPAGFALLKCNDADYDGFKTNCSNVSVVHCTNLMNTTVTTGLLLNGSYSENRT
QIWQKHRTSNDSALILLNKHYNLTVTCKRPGNKTVLPVTIMAGLVFHSQKYNLRLRQAWC
HFPSNWKGAWKEVKEEIVNLPKERYRGTNDPKRIFFQRQWGDPETANLWFNCHGEFFYCK
MDWFLNYLNNLTVDADHNECKNTSGTKSGNKRAPGPCVQRTYVACHIRSVIIWLETISKK
TYAPPREGHLECTSTVTGMTVELNYIPKNRTNVTLSPQIESIWAAELDRYKLVEITPIGF
APTEVRRYTGGHERQKRVPFVXXXXXXXXXXXXXXXXXXXXXXVQSQHLLAGILQQQKNL
LAAVEAQQQMLKLTIWGVK
>my test sequence for 532319
ELRLRYCAPAGFALLKCNDADYDGFKTNCSNVSVVHCTNLMNTTVTTGLLLNGSYSENRT
QIWQKHRTSNDSALILLNKHYNLTVTCKRPGNKTVLPVTIMAGLVFHSQKYNLRLRQAWC
HFPSNWKGAWKEVKEEIVNLPKERYRGTNDPKRIFFQRQWGDPETANLWFNCHGEFFYCK
MDWFLNYLNNLTVDADHNECKNTSGTKSGNKRAPGPCVQRTYVACHIRSVIIWLETISKK
LAAVEAQQQMLKLTIWGVK
}}}
BioSequence is expected to be represented in the standard IUB/IUPAC AminoAcid and NucleicAcid codes, with these exceptions: lower-case letters are accepted and are mapped into upper-case; a single hyphen or dash can be used to represent a gap of indeterminate length; and in AminoAcid sequences, U and * are acceptable letters (see below). Before submitting a request, any numerical digits in the query sequence should either be removed or replaced by appropriate letter codes (e.g., N for unknown NucleicAcid residue or X for unknown AminoAcid residue).

== BioPython을 써서 FastaFormat다루기 ==
BioPython으로 FastaFormat을 다루는 요령은 다음과 같다.
=== 입력할때 - 주로 FastaFormat의 Parsing ===
{{{#!python
from Bio import Fasta, File
from cStringIO import StringIO

# If the FASTA text is already held in a string, wrap it instead of
# opening a file:
#handle = File.UndoHandle(StringIO(fastaStr))
handle = open('file.fasta', 'r')
try:
    parser = Fasta.RecordParser()
    iterator = Fasta.Iterator(handle, parser)
    while 1:
        # A single FASTA file may hold several records; fetch them one by one.
        curRecord = iterator.next()
        if curRecord is None:
            # The loop treats a None record as the end of the input.
            break
        title = curRecord.title      # title of the record
        seq = curRecord.sequence     # sequence of the record
finally:
    # Close the input even when parsing fails part-way through.
    handle.close()
}}}

=== 출력할때 - stdout으로 뿌려준다면 ===
{{{#!python
from Bio import Fasta

# NOTE(review): the record's string form may already prepend '>' to the
# title -- confirm that the leading '>' here does not end up doubled.
title = '>This is test title'  # title of the FASTA record
seq = 'ATGGGGGTGTGTGTGGGG'     # the sequence as one long string
fasta = Fasta.Record()         # create a record instance named fasta
fasta.title = title            # assign the title attribute directly
fasta.sequence = seq           # likewise for the sequence
print(fasta)                   # printing also takes care of the '\n' after every 60 columns

# if you want to write on file
wfile = open('쓰고자하는파일', 'w')
try:
    wfile.write(str(fasta))
finally:
    # Closing flushes the buffered record to disk.
    wfile.close()
}}}

== RelationalDatabase에서 직접만들기 ==
{{{
SELECT CONCAT(">gi|", annot.gi, "|sp|", annot.acc, "|", sp.name, " ", annot.descr, "\n", protein.seq)
FROM protein
INNER JOIN annot USING (prot_id)
INNER JOIN sp USING (acc)
WHERE annot.current = 1;

$ mysql seqdb -N < swissprot.sql > swissprot.fa
}}}

== 관련코드모음 ==
[[HTML]]로 FastaFormat꾸미기
 * DecoratorPattern 이용 : [[FastaDecorator.py]]
 * JuneKim씨 코드(2004-06-13) : 파이썬 커뮤니티에 정규식 중에 중간에 개행문자가 들어와도 되는 경우를 물으셨더군요. 다음과 같이 할 수도 있습니다.
{{{#!python
import re


class Enclose:
    """Wrap occurrences of given motifs in a text with opener/closer strings.

    Matching ignores case and tolerates one optional whitespace character
    (for example a line break) between consecutive motif characters, so a
    motif is still found when the sequence is wrapped across lines.
    """

    def __init__(self, d):
        """Prepare the combined search pattern.

        d -- iterable of (motif, (opener, closer)) pairs.
        """
        # Kept as ((opener, closer), fragment_regex) in the order given.
        self.d = [(v, self.fragmentable(k)) for k, v in d]
        if self.d:
            # One capturing group per motif, so m.lastindex tells __call__
            # which motif matched.
            self.p = re.compile("(%s)" % ")|(".join([f for _, f in self.d]),
                                re.IGNORECASE)
        else:
            # An empty alternation would match the empty string everywhere.
            self.p = None

    def fragmentable(self, s):
        """Return a regex for s that allows one whitespace char between characters."""
        # Escape each character so regex metacharacters in a motif are taken
        # literally and cannot add groups that would shift m.lastindex.
        return r'\s?'.join([re.escape(ch) for ch in s])

    def __call__(self, m):
        """Substitution callback: return the matched text wrapped in its pair."""
        opener, closer = self.d[m.lastindex - 1][0]
        return "%s%s%s" % (opener, m.group(), closer)

    def do(self, text):
        """Return text with every motif occurrence enclosed; text is not modified."""
        if self.p is None:
            return text
        return self.p.sub(self, text)


if __name__ == "__main__":
    sequence = """\
TCTTCTCCTCACCTCGCTCTCGCCGCCTGCTCGCCCCGNCCGCTTTGCTCGGCGCCCCAA
AACACNCTTCCACCATGNGCCACCTCGGCGAGCCCTCCCACTTGAACAAAGGGGTGCTCG
GCGCGTGTACNNATGGCCC\
"""
    expected = """TCTTCTCCTCACCTCGCTCTCGCCGCCTGCTCGCCCCGNCCGCTTTGCTCGGCGCCCCAA
AACACNCTTCCACCATGNGCCACCTCGGCGAGCCCTCCCACTTGAACAAAGGGGTGCTCG
GCGCGTGTACNNATGGCCC"""
    # NOTE(review): every opener/closer pair below is empty, so the expected
    # text equals the input; presumably the original markup tags were lost
    # when the page was rendered -- confirm and restore them.
    d = (('CCCCAAAACACN', ('', '')),
         ('TCGGCGCGTG', ('', '')),
         ('ACCTCGGCGAGCC', ('', '')),
         )
    r = Enclose(d).do(sequence)
    assert r == expected
}}}

간단한 [[Iterator]]
{{{#!python
class FastaIterator:
    """Iterate over the records of a FASTA stream, one record string at a time."""

    def __init__(self, ifile):
        """ifile -- any iterable of text lines (an open file, a list, ...)."""
        self.ifile = ifile
        self.g = self.getGenerator()

    def getGenerator(self):
        """Yield each record as its lines joined together, line ends included.

        A new record starts at every line beginning with '>'.  Empty input
        yields nothing.
        """
        lines = []
        for line in self.ifile:
            # A header closes the record collected so far; the first line
            # has nothing before it to close.
            if line.startswith('>') and lines:
                yield ''.join(lines)
                lines = []
            lines.append(line)
        if lines:
            yield ''.join(lines)

    def __iter__(self):
        """Return the single underlying generator (one pass only)."""
        return self.g
}}}

프로그래밍 교육용 예제
{{{#!python
#!/usr/bin/env python
import sys
import unittest

try:
    from cStringIO import StringIO  # Python 2
except ImportError:
    from io import StringIO  # Python 3


class DNA:
    """A titled DNA sequence with FASTA input/output and reverse complement."""

    # Complement of each supported base; the ambiguity code N maps to itself.
    CompMap = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N'}

    def __init__(self):
        self.title = ''
        self.sequence = ''

    def getRevCompSeq(self):
        """Return the reverse complement of the sequence.

        Raises KeyError for a character that is not a key of CompMap.
        """
        return ''.join(DNA.CompMap[nucleotide]
                       for nucleotide in reversed(self.sequence))

    def getFasta(self, cols=60):
        """Return the record as FASTA text with sequence rows of cols characters.

        The result always ends with a newline.  Raises ValueError when cols
        is smaller than 1.
        """
        if cols < 1:
            raise ValueError('cols must be at least 1')
        result = ['>' + self.title]
        result.extend(self.sequence[start:start + cols]
                      for start in range(0, len(self.sequence), cols))
        return '\n'.join(result) + '\n'

    @staticmethod
    def parseFasta(aStr):
        """Build a DNA object from the text of one FASTA record.

        The first line minus its first character becomes the title; all
        remaining lines are concatenated into the sequence.  Raises
        IndexError when aStr contains no lines.
        """
        dna = DNA()
        lines = aStr.splitlines()
        dna.title = lines[0][1:]
        dna.sequence = ''.join(lines[1:])
        return dna


def FastaGenerator(aFile):
    """Yield each FASTA record of aFile as one string, line ends included.

    aFile is any iterable of text lines.  A new record starts at every line
    beginning with '>'.  Empty input yields nothing.
    """
    lines = []
    for line in aFile:
        # A header closes the record collected so far; the first line has
        # nothing before it to close.
        if line.startswith('>') and lines:
            yield ''.join(lines)
            lines = []
        lines.append(line)
    if lines:
        yield ''.join(lines)


class DnaTest(unittest.TestCase):
    """Unit tests for DNA and FastaGenerator."""

    def setUp(self):
        self.dna = DNA()
        self.dna.title = 'test'
        self.dna.sequence = 'AGTC'

    def testRevComp(self):
        self.assertEqual('GACT', self.dna.getRevCompSeq())

    def testParsing(self):
        text = '''\
>test
AGTC
CCC'''
        self.dna = DNA.parseFasta(text)
        self.assertEqual('test', self.dna.title)
        self.assertEqual('AGTCCCC', self.dna.sequence)

    def testRepr(self):
        self.dna.sequence = 'AGTCCCC'
        expected = '''\
>test
AGT
CCC
C
'''
        self.assertEqual(expected, self.dna.getFasta(cols=3))

    def testFastaGenerator(self):
        stream = StringIO("""\
>test1
AGTC
CCC
>test2
GGGG
""")
        g = FastaGenerator(stream)
        expect1 = """\
>test1
AGTC
CCC
"""
        self.assertEqual(expect1, next(g))
        expect2 = """\
>test2
GGGG
"""
        self.assertEqual(expect2, next(g))
        # Both records are consumed, so the generator must be exhausted.
        self.assertRaises(StopIteration, next, g)


def main():
    """Read FASTA records from stdin and write their reverse complements to stdout."""
    for record in FastaGenerator(sys.stdin):
        dna = DNA.parseFasta(record)
        dna.sequence = dna.getRevCompSeq()
        sys.stdout.write(dna.getFasta())


if __name__ == '__main__':
    # Call main() here instead to use the script as a stdin-to-stdout filter.
    #main()
    unittest.main()
}}}

각종 변환 프로그램(using WxPython) --> YongsLib:wiki/FastaConvertor