#format python """ExpressionProfileByEstClustering by -- ["yong27"] [[DateTime(2006-08-23T16:57:50Z)]]""" import sys, math, unittest class SummableDict(dict): def sum(self): return sum(self.values()) def __iadd__(self, sdict): for key in sdict: if key in self: self[key]+=sdict[key] else: self[key] = sdict[key] return self class Contig: def __init__(self, lines): self.name = lines[0] self.ests = lines[1:] self.libraries = SummableDict(PLM=0, BF=0, PAF=0, PFC1=0) for est in self.ests: swords = est.split('_') if swords[2] !="N": self.libraries[swords[1]] += 1 self.nAllEsts = self.libraries.sum() self.r = 0 def nEsts(self, library): return self.libraries.get(library) def withTotalEstsDict(self, total_ests): f = self.nAllEsts / float(total_ests.sum()) for library, nEsts in self.libraries.iteritems(): if nEsts != 0: self.r += nEsts * math.log(nEsts / (total_ests.get(library) * f)) def __str__(self): return '\t'.join(map(str, [self.name, self.nEsts('PLM'), self.nEsts('BF'), self.nEsts('PAF'), self.nEsts('PFC1'), self.nAllEsts, self.r, ])) class FastaIterator: def __init__(self, ifile): self.ifile = ifile self.g = self.getGenerator() def getGenerator(self): lines = [self.ifile.next().strip()] for line in self.ifile: if line.startswith('>'): yield Contig(lines) lines = [line.strip()] else: lines.append(line.strip()) else: yield Contig(lines) def __iter__(self): return self.g def next(self): return self.g.next() from cStringIO import StringIO class ContigStatTest(unittest.TestCase): def setUp(self): input = """\ >TOTAL_contig00001 susfleck_BF_8184_P21.ab1 susfleck_PFC1_6568_A14.ab1 susfleck_PLM_N_7376_F15.ab1 susfleck_PLM_7780_K07.ab1 susfleck_PLM_7781_K07.ab1 >TOTAL_contig00002 susfleck_BF_0021_0024_D23.ab1 susfleck_PAF_2932_E15.ab1 susfleck_BF_0073_0076_C12.ab """ self.fi = FastaIterator(StringIO(input)) def testCount(self): c1 = self.fi.next() self.assertEquals('>TOTAL_contig00001', c1.name) self.assertEquals(1, c1.nEsts('BF')) self.assertEquals(2, c1.nEsts('PLM')) self.assertEquals(4, c1.nAllEsts) c2 = self.fi.next() self.assertEquals('>TOTAL_contig00002', c2.name) self.assertEquals(2, c2.nEsts('BF')) self.assertEquals(0, c2.nEsts('PLM')) self.assertEquals(1, c2.nEsts('PAF')) self.assertEquals(3, c2.nAllEsts) def testSummableDict(self): sd1 = SummableDict(a=0, b=2, d=2) sd2 = SummableDict(a=1, c=4, d=3) sd1 += sd2 self.assertEquals(SummableDict(a=1, b=2, c=4, d=5), sd1) def main(aFile): contigs = [] total_ests = SummableDict(PLM=0, BF=0, PAF=0, PFC1=0) for contig in FastaIterator(aFile): total_ests += contig.libraries contigs.append(contig) for contig in contigs: contig.withTotalEstsDict(total_ests) contigs.sort(key=lambda x: x.r, reverse=True) print '\t'.join(['Contig name', 'PLM', 'BF','PAF','PFC1', 'Sum', 'R']) for contig in contigs: print contig if __name__=='__main__': #unittest.main() #main(file('total_contig_info.txt')) main(sys.stdin)