next up previous contents
Next: Predizione della struttura secondaria Up: Ancora allineamenti Previous: Allinemanto multiplo   Indice

Pseudo-BLAST

La prima cosa importante è quella di preprocessare la nostra sequenza query in modo da avere un dizionario che abbia per chiave ogni tripletta di residui contigui appartenenti alla nostra query e per valore l'elenco delle

def preprocessQuery(query,mat,residues):
    ''' preprocessQuery(query,mat,residues)
    '''
    simpos={}
    simwords={}
    for i in range(len(query)-2):
        word=query[i:i+3]
        simwords[word]=simwords.get(word,findSumWords(word,mat,residues))
        for k in simwords[word]:
            simpos[k]=simpos.get(k,[])
            simpos[k].append(i)
    return simpos
#------------------------#


class PSEUDOBLAST:
    ''' emulate ungapped blast '''
    def __init__(self,filedbase,filequery,fmatrix='blosum62.mat',wordth=11,numalign=25):
        ''' __init__(self,filedbase,filequery,fmatrix='blosum62.mat',wordth=11,numalign=25)
        '''
        import Seq
        self.db=Seq.readFasta(filedbase)
        self.queries=Seq.readFasta(filequery)
        self.alphabet,self.mat=getmat(fmatrix)
        self.wth=wordth
        self.numalign=numalign
        
    def findSumWords(self,word):
        ''' findSumWords(word) 
        '''
        score=0
        for i in range(len(word)):
            score+=self.mat[word[i],word[i]]
        if score < self.wth :
            return [word]
        similar=[]
        for r1 in self.alphabet:
            for r2 in self.alphabet: 
                for r3 in self.alphabet: 
                    if(self.mat[word[0],r1]+self.mat[word[1],r2]+self.mat[word[2],r3] >= self.wth):
                        similar.append(r1+r2+r3)
        return similar
    #------------------------#
    
    def preprocessQuery(self,query):
        ''' preprocessQuery(query)
        '''
        simpos={}
        simwords={}
        for i in range(len(query)-2):
            word=query[i:i+3]
            simwords[word]=simwords.get(word,self.findSumWords(word))
            for k in simwords[word]:
                simpos[k]=simpos.get(k,[])
                simpos[k].append(i)
        return simpos
    #------------------------#

    def extendHit(self,query,qpos,target,tpos):
        ''' extendHit(query,qpos,target,tpos)
        '''
        score=0
        for i in range(3):
             score+=self.mat[target[tpos+i],query[qpos+i]]
        maxscore=score
        left,right=0,2
        # extend on the right
        ilim=min(len(target)-tpos,len(query)-qpos)
        i=3
        while(i<ilim and score > 0):
            score+=self.mat[target[tpos+i],query[qpos+i]]
            if score > maxscore:
                maxscore,right=score,i
            i+=1 
        # extend on the left
        score=maxscore
        ilim=min(tpos,qpos)
        i=1
        while(i<ilim and score > 0):
            score+=self.mat[target[tpos-i],query[qpos-i]]
            if score > maxscore:
                maxscore,left=score,i
            i+=1 
        return maxscore,(qpos-left,qpos+right+1),(tpos-left,tpos+right+1)    
    #----------------------------#
    
    def isin(self,pos1,pos2,pairlist):
        ''' isin(self,pos1,pos2,pairlist) 
              returns 1 if pos1 and pos2 are inside a pair of a list
                      0 otherwise
        '''
        i=0
        inlist=0
        while i<len(pairlist) and not inlist:
            (b1,e1),(b2,e2)=pairlist[i]
            if (pos1 >= b1 and pos1 <=e1) and (pos2 >= b2 and pos2 <=e2):
                inlist=1
            i+=1
        return inlist
    #----------------------------#

    def scanTarget(self,target,tid,query,qid,qindex):
        ''' scanTarget(target,tid,query,qid,qindex)
        '''
        maxnum=self.numalign
        alignments=[]
        qpositions=[]
        for i in range(len(target)-2):
            word=target[i:i+3]
            tpos=i
            for qpos in qindex.get(word,[]):
                if not self.isin(qpos,i,qpositions):
                    align=self.extendHit(query,qpos,target,tpos)
                    qpositions.append(align[1:])
                    alignments.append((align[0],align[1:],tid,qid))
        alignments.sort()
        minidx=min(len(alignments),maxnum)
        alignments=alignments[-minidx:]
        alignments.reverse()
        return alignments
    #----------------------------#

    def run(self):
        ''' run(self)
        '''
        alph,mat=self.alphabet,self.mat
        
        for qid in self.queries.getNames():
            query=self.queries.seqs[qid]
            qidx=self.preprocessQuery(query)
            alignments=[]
            for tid in self.db.getNames():
                target=self.db.seqs[tid]
    #            print "process",qid,"against",tid
                alignments.extend(self.scanTarget(target,tid,query,qid,qidx))  
            alignments.sort()
            minidx=min(len(alignments),self.numalign)    
            alignments=alignments[-minidx:]
            alignments.reverse()
            print "\n\n\n\n\nthe best ",self.numalign,"alignments for query",qid
            for k in alignments:
                score,((bq,eq),(bt,et)),tname,qname=k
    #            print k
                print "score",score,"target",tname
                print "      ",bq+1," "*(eq-bq),eq
                print "query ",query[bq:eq]
                print "      ",bt+1," "*(et-bt),et
                print "target",self.db.seqs[tname][bt:et]      
                print "\n"
#------------------------------------------#
# end class #

def getmat(filename):
    ''' getmat(filename)
        return alphabet,mat 
           mat is a dictionary a,a => values 
    '''
    import sys
    import string
    try:
        lines=open(filename,'r').readlines()
    except:
        sys.stderr.write('cannot open file '+filename+'\n')
        sys.exit()
    
    mat={}
    alphabet=lines[0].split()
    for line in lines[1:]:
        v=line.split()
        for i in range(len(alphabet)):
            mat[(v[0],alphabet[i])]=string.atoi(v[i+1])
    return alphabet,mat    
#------------------------#


2004-11-02