La prima cosa importante è quella di preprocessare la nostra sequenza query in modo da avere un dizionario che abbia per chiave ogni tripletta di residui contigui appartenenti alla nostra query e per valore l'elenco delle
def preprocessQuery(query,mat,residues):
''' preprocessQuery(query,mat,residues)
'''
simpos={}
simwords={}
for i in range(len(query)-2):
word=query[i:i+3]
simwords[word]=simwords.get(word,findSumWords(word,mat,residues))
for k in simwords[word]:
simpos[k]=simpos.get(k,[])
simpos[k].append(i)
return simpos
#------------------------#
class PSEUDOBLAST:
''' emulate ungapped blast '''
def __init__(self,filedbase,filequery,fmatrix='blosum62.mat',wordth=11,numalign=25):
''' __init__(self,filedbase,filequery,fmatrix='blosum62.mat',wordth=11,numalign=25)
'''
import Seq
self.db=Seq.readFasta(filedbase)
self.queries=Seq.readFasta(filequery)
self.alphabet,self.mat=getmat(fmatrix)
self.wth=wordth
self.numalign=numalign
def findSumWords(self,word):
''' findSumWords(word)
'''
score=0
for i in range(len(word)):
score+=self.mat[word[i],word[i]]
if score < self.wth :
return [word]
similar=[]
for r1 in self.alphabet:
for r2 in self.alphabet:
for r3 in self.alphabet:
if(self.mat[word[0],r1]+self.mat[word[1],r2]+self.mat[word[2],r3] >= self.wth):
similar.append(r1+r2+r3)
return similar
#------------------------#
def preprocessQuery(self,query):
''' preprocessQuery(query)
'''
simpos={}
simwords={}
for i in range(len(query)-2):
word=query[i:i+3]
simwords[word]=simwords.get(word,self.findSumWords(word))
for k in simwords[word]:
simpos[k]=simpos.get(k,[])
simpos[k].append(i)
return simpos
#------------------------#
def extendHit(self,query,qpos,target,tpos):
''' extendHit(query,qpos,target,tpos)
'''
score=0
for i in range(3):
score+=self.mat[target[tpos+i],query[qpos+i]]
maxscore=score
left,right=0,2
# extend on the right
ilim=min(len(target)-tpos,len(query)-qpos)
i=3
while(i<ilim and score > 0):
score+=self.mat[target[tpos+i],query[qpos+i]]
if score > maxscore:
maxscore,right=score,i
i+=1
# extend on the left
score=maxscore
ilim=min(tpos,qpos)
i=1
while(i<ilim and score > 0):
score+=self.mat[target[tpos-i],query[qpos-i]]
if score > maxscore:
maxscore,left=score,i
i+=1
return maxscore,(qpos-left,qpos+right+1),(tpos-left,tpos+right+1)
#----------------------------#
def isin(self,pos1,pos2,pairlist):
''' isin(self,pos1,pos2,pairlist)
returns 1 if pos1 and pos2 are inside a pair of a list
0 otherwise
'''
i=0
inlist=0
while i<len(pairlist) and not inlist:
(b1,e1),(b2,e2)=pairlist[i]
if (pos1 >= b1 and pos1 <=e1) and (pos2 >= b2 and pos2 <=e2):
inlist=1
i+=1
return inlist
#----------------------------#
def scanTarget(self,target,tid,query,qid,qindex):
''' scanTarget(target,tid,query,qid,qindex)
'''
maxnum=self.numalign
alignments=[]
qpositions=[]
for i in range(len(target)-2):
word=target[i:i+3]
tpos=i
for qpos in qindex.get(word,[]):
if not self.isin(qpos,i,qpositions):
align=self.extendHit(query,qpos,target,tpos)
qpositions.append(align[1:])
alignments.append((align[0],align[1:],tid,qid))
alignments.sort()
minidx=min(len(alignments),maxnum)
alignments=alignments[-minidx:]
alignments.reverse()
return alignments
#----------------------------#
def run(self):
''' run(self)
'''
alph,mat=self.alphabet,self.mat
for qid in self.queries.getNames():
query=self.queries.seqs[qid]
qidx=self.preprocessQuery(query)
alignments=[]
for tid in self.db.getNames():
target=self.db.seqs[tid]
# print "process",qid,"against",tid
alignments.extend(self.scanTarget(target,tid,query,qid,qidx))
alignments.sort()
minidx=min(len(alignments),self.numalign)
alignments=alignments[-minidx:]
alignments.reverse()
print "\n\n\n\n\nthe best ",self.numalign,"alignments for query",qid
for k in alignments:
score,((bq,eq),(bt,et)),tname,qname=k
# print k
print "score",score,"target",tname
print " ",bq+1," "*(eq-bq),eq
print "query ",query[bq:eq]
print " ",bt+1," "*(et-bt),et
print "target",self.db.seqs[tname][bt:et]
print "\n"
#------------------------------------------#
# end class #
def getmat(filename):
''' getmat(filename)
return alphabet,mat
mat is a dictionary a,a => values
'''
import sys
import string
try:
lines=open(filename,'r').readlines()
except:
sys.stderr.write('cannot open file '+filename+'\n')
sys.exit()
mat={}
alphabet=lines[0].split()
for line in lines[1:]:
v=line.split()
for i in range(len(alphabet)):
mat[(v[0],alphabet[i])]=string.atoi(v[i+1])
return alphabet,mat
#------------------------#