[[FASTA]] format.

A sequence in FastaFormat begins with a single-line description, followed by lines of sequence data. The description line is distinguished from the sequence data by a greater-than (">") symbol in the first column. It is recommended that all lines of text be shorter than 80 characters in length. An example sequence is: 

{{{
>gi|532319|pir|TVFV2E|TVFV2E envelope protein
ELRLRYCAPAGFALLKCNDADYDGFKTNCSNVSVVHCTNLMNTTVTTGLLLNGSYSENRT
QIWQKHRTSNDSALILLNKHYNLTVTCKRPGNKTVLPVTIMAGLVFHSQKYNLRLRQAWC
HFPSNWKGAWKEVKEEIVNLPKERYRGTNDPKRIFFQRQWGDPETANLWFNCHGEFFYCK
MDWFLNYLNNLTVDADHNECKNTSGTKSGNKRAPGPCVQRTYVACHIRSVIIWLETISKK
TYAPPREGHLECTSTVTGMTVELNYIPKNRTNVTLSPQIESIWAAELDRYKLVEITPIGF
APTEVRRYTGGHERQKRVPFVXXXXXXXXXXXXXXXXXXXXXXVQSQHLLAGILQQQKNL
LAAVEAQQQMLKLTIWGVK
>my test sequence for 532319
ELRLRYCAPAGFALLKCNDADYDGFKTNCSNVSVVHCTNLMNTTVTTGLLLNGSYSENRT
QIWQKHRTSNDSALILLNKHYNLTVTCKRPGNKTVLPVTIMAGLVFHSQKYNLRLRQAWC
HFPSNWKGAWKEVKEEIVNLPKERYRGTNDPKRIFFQRQWGDPETANLWFNCHGEFFYCK
MDWFLNYLNNLTVDADHNECKNTSGTKSGNKRAPGPCVQRTYVACHIRSVIIWLETISKK
LAAVEAQQQMLKLTIWGVK
}}}

BioSequence is expected to be represented in the standard IUB/IUPAC AminoAcid and NucleicAcid codes, with these exceptions: lower-case letters are accepted and are mapped into upper-case; a single hyphen or dash can be used to represent a gap of indeterminate length; and in AminoAcid sequences, U and * are acceptable letters (see below). Before submitting a request, any numerical digits in the query sequence should either be removed or replaced by appropriate letter codes (e.g., N for unknown NucleicAcid residue or X for unknown AminoAcid residue). 

== BioPython을 써서 FastaFormat다루기 ==

BioPython으로 FastaFormat을 다루는 요령은 다음과 같다.

=== 입력할때 - 주로 FastaFormat의 Parsing ===
{{{#!python
# Example: read every record of a FASTA file through Bio.Fasta's
# RecordParser/Iterator pair and pull out each record's title and sequence.
from Bio import Fasta, File
from cStringIO import StringIO 
#file = File.UndoHandle(StringIO(fastaStr)) # use this instead when the FASTA data is held in a string
# NOTE(review): the handle opened here is never closed in this snippet.
file = open('file.fasta', 'r') 
parser = Fasta.RecordParser() 
iterator = Fasta.Iterator(file, parser) 
while 1: 
    curRecord = iterator.next()  # step through the records of a multi-record FASTA file one at a time
    if curRecord is None: break  # a None result is treated as the end of the input
    title = curRecord.title   # title of the record
    seq = curRecord.sequence  # sequence of the record
}}}

=== 출력할때 - stdout으로 뿌려준다면 ===
{{{#!python
# Example: build a Fasta.Record by hand, print it, and write it to a file.
from Bio import Fasta 
# NOTE(review): if Fasta.Record's string form prepends the '>' marker itself,
# the leading '>' in this title would come out doubled — confirm against Bio.Fasta.
title = '>This is test title'  # title line of the FASTA record
seq = 'ATGGGGGTGTGTGTGGGG' # the sequence as one long string
fasta = Fasta.Record()   # create a Record instance named fasta
fasta.title = title              # assign the title attribute directly
fasta.sequence = seq    # likewise for the sequence
print fasta                     # per the original author's note, printing also inserts a '\n' after every 60 characters

# if you want to write to a file
# (the file name below is a placeholder; the Korean text means "the file you want to write")
# NOTE(review): wfile is never closed in this snippet.
wfile = open('쓰고자하는파일', 'w') 
wfile.write(str(fasta))
}}}

== RelationalDatabase에서 직접만들기 ==
{{{
-- swissprot.sql: emit one FASTA record per current annotation.
-- The "\n" inside CONCAT separates the header line from the sequence.
SELECT CONCAT(">gi|", annot.gi, "|sp|", annot.acc, "|", sp.name, " ", annot.descr, "\n", protein.seq)
FROM   protein INNER JOIN annot USING (prot_id) INNER JOIN sp USING (acc)
WHERE  annot.current = 1;
# -N suppresses the column-name header line.
# -r (--raw) is required: in batch mode mysql otherwise escapes the embedded
# newline and writes a literal \n, which leaves header and sequence on one line.
$ mysql seqdb -N -r < swissprot.sql > swissprot.fa
}}}

== 관련코드모음 ==
[[HTML]]로 FastaFormat꾸미기 
 * DecoratorPattern 이용 : [[FastaDecorator.py]]
 * JuneKim씨 코드(2004-06-13) : 파이썬 커뮤니티에, 찾으려는 문자열 중간에 개행문자가 끼어 있어도 매칭되는 정규식을 물으셨더군요. 다음과 같이 할 수도 있습니다.
 {{{#!python
import re

class Enclose:
    """Wrap occurrences of given search strings in opener/closer markup.

    Matching is case-insensitive, and one optional whitespace character
    (for example a line break) is tolerated between any two adjacent
    characters of a search string, so a motif that is split across the
    lines of a wrapped sequence is still found.

    Attributes:
        d: list of ``((opener, closer), pattern_fragment)`` pairs, in the
           order the rules were given.
        p: compiled regex with one capturing group per rule.
    """
    def __init__(self, d):
        """Build the matcher.

        Args:
            d: iterable of ``(search_string, (opener, closer))`` pairs.
        """
        self.d = [(markup, self.fragmentable(key)) for key, markup in d]
        alternatives = ")|(".join([fragment for _, fragment in self.d])
        self.p = re.compile("(%s)" % alternatives, re.IGNORECASE)

    def fragmentable(self, s):
        """Return a regex fragment matching *s* with optional whitespace
        between its characters.

        Every character is escaped so that regex metacharacters in *s* are
        matched literally and cannot add groups, which would break the
        group-number lookup in ``__call__``.
        """
        return r'\s?'.join([re.escape(ch) for ch in s])

    def __call__(self, m):
        """Substitution callback: return the matched text wrapped in the
        opener/closer of the rule whose group matched."""
        # Each rule is exactly one top-level group, so lastindex identifies it.
        opener, closer = self.d[m.lastindex - 1][0]
        return "%s%s%s" % (opener, m.group(), closer)

    def do(self, text):
        """Return *text* with every rule match wrapped in its markup."""
        if not self.d:
            # No rules: the pattern would match the empty string everywhere
            # and there is no markup to look up, so leave the text untouched.
            return text
        return self.p.sub(self, text)

if __name__ == "__main__":
    # Self-check: three motifs are highlighted in a wrapped sequence, two of
    # which straddle a line break.
    markup_rules = (
        ('CCCCAAAACACN', ('<b>', '</b>')),
        ('TCGGCGCGTG', ('<i>', '</i>')),
        ('ACCTCGGCGAGCC', ('<font color="red">', '</font>')),
    )

    raw_sequence = """\
TCTTCTCCTCACCTCGCTCTCGCCGCCTGCTCGCCCCGNCCGCTTTGCTCGGCGCCCCAA
AACACNCTTCCACCATGNGCCACCTCGGCGAGCCCTCCCACTTGAACAAAGGGGTGCTCG
GCGCGTGTACNNATGGCCC\
"""
    wanted = """TCTTCTCCTCACCTCGCTCTCGCCGCCTGCTCGCCCCGNCCGCTTTGCTCGGCG<b>CCCCAA
AACACN</b>CTTCCACCATGNGCC<font color="red">ACCTCGGCGAGCC</font>CTCCCACTTGAACAAAGGGGTGC<i>TCG
GCGCGTG</i>TACNNATGGCCC"""

    highlighter = Enclose(markup_rules)
    decorated = highlighter.do(raw_sequence)
    assert decorated == wanted
}}}

간단한 [[Iterator]]
{{{#!python
class FastaIterator:
    """Iterate over a FASTA stream one record at a time.

    Each item is the raw text of one record: its lines (header line
    included, line endings preserved) joined together. A new record
    starts at every line beginning with '>'.

    The underlying generator is created once, so the object can be
    iterated only a single time.
    """
    def __init__(self, ifile):
        """ifile: any iterable of text lines, e.g. an open file."""
        self.ifile = ifile
        self.g = self.getGenerator()

    def getGenerator(self):
        """Yield the text of each record; yield nothing for empty input."""
        source = iter(self.ifile)
        # next() with a default works on Python 2.6+ and 3, and avoids
        # leaking StopIteration out of the generator on empty input.
        first = next(source, None)
        if first is None:
            return
        lines = [first]
        for line in source:
            if line.startswith('>'):
                yield ''.join(lines)
                lines = [line]
            else:
                lines.append(line)
        # Flush the final record, which has no following header line.
        yield ''.join(lines)

    def __iter__(self):
        return self.g
}}}

프로그래밍 교육용 예제
{{{#!python
#!/usr/bin/env python
import unittest, sys
from cStringIO import StringIO

class DNA:
    """A titled DNA sequence with FASTA parsing and formatting helpers.

    Attributes:
        title: header text without the leading '>'.
        sequence: the bases as one string without line breaks.
    """
    # Complement of each supported base (upper-case A/C/G/T only).
    CompMap = {'A':'T','T':'A','G':'C','C':'G'}

    def __init__(self):
        self.title = ''
        self.sequence = ''

    def getRevCompSeq(self):
        """Return the reverse complement of the sequence.

        Raises:
            KeyError: if the sequence holds a symbol that is not in CompMap.
        """
        return ''.join(DNA.CompMap[nucleotide]
                for nucleotide in reversed(self.sequence))

    def getFasta(self, cols=60):
        """Return the record as FASTA text.

        The header line is '>' + title; the sequence follows, wrapped at
        *cols* characters per line. The result always ends with a newline.

        Raises:
            ValueError: if *cols* is smaller than 1 (such a width cannot
                make progress through the sequence).
        """
        if cols < 1:
            raise ValueError('cols must be a positive integer, got %r' % (cols,))
        seq = self.sequence
        result = ['>' + self.title]
        # Slice by offset instead of repeatedly copying the remaining tail.
        result.extend(seq[start:start + cols]
                      for start in range(0, len(seq), cols))
        return '\n'.join(result) + '\n'

    @staticmethod
    def parseFasta(aStr):
        """Build a DNA object from the text of a single FASTA record.

        The first line is taken as the header and its first character is
        dropped; all remaining lines are concatenated into the sequence.

        Raises:
            IndexError: if *aStr* contains no lines.
        """
        dna = DNA()
        lines = aStr.splitlines()
        dna.title = lines[0][1:]
        dna.sequence = ''.join(lines[1:])
        return dna

def FastaGenerator(aFile):
    """Yield the raw text of each FASTA record found in *aFile*.

    Args:
        aFile: any iterable of text lines, e.g. an open file or sys.stdin.

    Yields:
        One string per record: the header line and the lines that follow
        it, joined together with their line endings preserved. A new
        record starts at every line beginning with '>'. Nothing is
        yielded for empty input.
    """
    source = iter(aFile)
    # next() with a default works on Python 2.6+ and 3, and avoids leaking
    # StopIteration out of the generator when the input is empty.
    first = next(source, None)
    if first is None:
        return
    lines = [first]
    for line in source:
        if line.startswith('>'):
            yield ''.join(lines)
            lines = []
        lines.append(line)
    # Flush the final record, which has no following header line.
    yield ''.join(lines)

class DnaTest(unittest.TestCase):
    """Unit tests for the DNA class and FastaGenerator."""

    def setUp(self):
        # Every test starts from a small record titled 'test'.
        self.dna = DNA()
        self.dna.title = 'test'
        self.dna.sequence = 'AGTC'

    def testRevComp(self):
        """The reverse complement of AGTC is GACT."""
        self.assertEqual('GACT', self.dna.getRevCompSeq())

    def testParsing(self):
        """parseFasta drops the '>' and joins the sequence lines."""
        text = '''\
>test
AGTC
CCC'''
        self.dna = DNA.parseFasta(text)
        self.assertEqual('test', self.dna.title)
        self.assertEqual('AGTCCCC', self.dna.sequence)

    def testRepr(self):
        """getFasta wraps the sequence at the requested column width."""
        self.dna.sequence = 'AGTCCCC'
        expected = '''\
>test
AGT
CCC
C
'''
        self.assertEqual(expected, self.dna.getFasta(cols=3))

    def testFastaGenerator(self):
        """FastaGenerator yields each record's text, then is exhausted."""
        handle = StringIO("""\
>test1
AGTC
CCC
>test2
GGGG
""")
        g = FastaGenerator(handle)
        expect1 = """\
>test1
AGTC
CCC
"""
        # The next() builtin works on Python 2.6+ and 3, unlike g.next().
        self.assertEqual(expect1, next(g))
        expect2 = """\
>test2
GGGG
"""
        self.assertEqual(expect2, next(g))
        # After the last record the generator must be exhausted.
        self.assertRaises(StopIteration, next, g)

def main():
    """Filter FASTA from stdin to stdout, reverse-complementing each record.

    Each record read from standard input is parsed, its sequence is
    replaced by its reverse complement, and the record is written back
    out in FASTA form with the default line width.
    """
    for fasta_text in FastaGenerator(sys.stdin):
        entry = DNA.parseFasta(fasta_text)
        flipped = entry.getRevCompSeq()
        entry.sequence = flipped
        sys.stdout.write(entry.getFasta())

if __name__=='__main__':
    # Running this file executes the unit tests. The stdin-to-stdout filter
    # main() is left disabled; uncomment the next line (and drop the
    # unittest call) to use it instead.
    #main()
    unittest.main()
}}}

각종 변환 프로그램(using WxPython) --> YongsLib:wiki/FastaConvertor