#!/usr/bin/env python import os, sys, string class Seq: def __init__(self): self.ID=0 self.data="" class DPSeq: def __init__(self): self.ID=0 self.data=0 # DloopExcluded flag variable # if non-zero, exclude d-loop DloopExcluded=0 # read entire file into a string file_string=open(sys.argv[1]).read() # split according to line-endings. Note I'd usually use \n but for the # sequences on mtDB they have \r's. lines=file_string.split("\r") NumSeq=int(lines[0].split()[0]) NumSites=int(lines[0].split()[1]) Sequences=[] #print "Number of total sites:",NumSites #Deal with first block for i in lines[1:NumSeq+1]: NewSeq=Seq() NewSeq.ID=i.split()[0] for j in i.split()[1:]: NewSeq.data=NewSeq.data+j Sequences.append(NewSeq) skipLines=0 #Deal with the rest of data for i in range(NumSeq+1,len(lines)): if len(lines[i])<2: skipLines=skipLines+1 # print "Skipping:",lines[i] continue #print (i-skipLines-1)%NumSeq for j in lines[i].split(): Sequences[(i-skipLines-1) % NumSeq].data=Sequences[(i-skipLines-1) % NumSeq].data+j #for i in Sequences: # print i.ID,i.data[150:160] #Transfer IDs to DPSeqs DPSeqs=[] for i in range(0,NumSeq): NewSeq=DPSeq() NewSeq.ID=Sequences[i].ID NewSeq.data=[0]*NumSites DPSeqs.append(NewSeq) #Identify segragating sites within a region and output to DPSeqs SegSites=[] if DloopExcluded: region=range(577,16023) # to exclude D loop else: region=range(0,len(Sequences[0].data)) #whole mtDNA for i in region: A=0 C=0 G=0 T=0 gap=0 first='' for j in range(0,NumSeq): if Sequences[j].data[i]=='A': if first=='': first='A' DPSeqs[j].data[i]=0 elif first=='A': DPSeqs[j].data[i]=0 else: DPSeqs[j].data[i]=1 A=1 if Sequences[j].data[i]=='C': if first=='': first='C' DPSeqs[j].data[i]=0 elif first=='C': DPSeqs[j].data[i]=0 else: DPSeqs[j].data[i]=1 C=1 if Sequences[j].data[i]=='G': if first=='': first='G' DPSeqs[j].data[i]=0 elif first=='G': DPSeqs[j].data[i]=0 else: DPSeqs[j].data[i]=1 G=1 if Sequences[j].data[i]=='T': if first=='': first='T' DPSeqs[j].data[i]=0 elif first=='T': DPSeqs[j].data[i]=0 else: DPSeqs[j].data[i]=1 T=1 if Sequences[j].data[i]=='-': gap=1 #This line will exclude sequences with gaps from being considered segregating sites if gap: sum=0 else: sum=A+C+G+T #Note this line limits us to taking only segregating sites that are biallelic if sum==2: SegSites.append(i) #print "Segregating Sites and corresponding original positions:", #for i in range(0,len(SegSites)): # print i,"(", SegSites[i],")", print print "Seqs:",NumSeq print "Markers:",len(SegSites) #Output the sequences in the format for Eric and I's MDBlocks program for i in DPSeqs: print i.ID, count=6 for j in SegSites: print i.data[j], count=count+1 if(count==60): print count=0 print