#' Jointly call and refine a set of seed motifs provided by the findamotif function
#'
#' @param scorematset is a set of matrices, row-concatenated, giving pwms (log-scale) for the initialisation of the algorithm.
#' scorematset is of dimension `sum(dimvec)` rows by 4 columns. and the first `dimvec[1]` rows of this matrix gives the pwm for the first motifs,
#' the next `dimvec[2]` rows the second motif, and so on
#' @param dimvec gives the lengths of each of the initial motifs. If dimvec is of length n_motifs, motif k is of length `dimvec[k]`
#' @param seqs a vector of input sequences used for finding motifs within.
#' Lower case bases are ignored/masked - e.g. if repeats are an issue. In some cases it may be helpful NOT to mask repeats that may contain motif matches
#' @param maxwidth the length that elements of "seqs" will be trimmed to (around their centre). Run times depend roughly linearly on this parameter
#' @param alpha a vector of initial assumed probabilities each motif is present in a sequence
#' @param incprob can usually be left as default value
#' @param maxits the number of iterations (if no motif is found the algorithm could terminate early)
#' @param plen a parameter setting the geometric prior on how long each motif found should be.
#' plen=0.05 corresponds to a mean length of 20bp and is the default. Setting plen large penalises longer motifs more
#' @param ourprior a vector of length 10 probabilities giving the initial probability of a motif being found across different parts of the
#' sequence from start:end. If left unspecified the initial prior is set at uniform and the algorithm tries to learn where motifs are, e.g. if they are centrally enriched.
#' @param bg should be left at default value normally (technical parameter setting background model)
#' @param updatemot a flag - should the algorithm update (learn) the initial motifs (default is 1)
#' @param updatealpha a flag - should the algorithm update (learn) the initial motifs (default is 1)
#' @param updateprior a flag - should the algorithm update (learn) the prior on where the motifs occur within the DNA sequences(default is 1)
#' @param allowinf a flag - should infinite values be allowed in scoremat (not recommended, default is FALSE).
#' @param dt logical; should a data table of the results be returned
#' @param seed integer; seed for random number generation, set this for exactly reproducible results.
#' @param verbosity integer; How verbose should this function be? 0=silent, 3=everything.
#'
#' @details
#' Given a user-input set of initial PWMs and input sequences to identify motifs, run a Gibbs sampler to update these motifs, and output the results
#'
#' The user can also optionally supply priors on the fraction of sequences containing motifs, the likely length of motifs, and the positional distribution of motifs within the sequences.
#'
#' User-supplied information can either be updated (the default) by the algorithm, or fixed at the input values
#'
#' The program outputs a list of results, including information on inferred PWMs (i.e. motifs found), as well as a probabilistic output of which regions contain which motifs, and posterior distributions of the other parameters
#'
#' If you use this program, please cite Altemose et al. eLife 2017
#'
#' @return The code returns detailed output as a list, whose elements are as follows (access these using commands like outputlist$scoremat)\cr
#'
#' Details of input data given:
#' * seqs: the vector of input sequences used for finding motifs within
#' * trimmedseqs: the vector of input sequences used for finding motifs within, after trimming to shorten long input sequences
#'
#' Details of overall fitted model:
#' * scoremat: a matrix made up of matrices, row-concatenated, giving pwms (log-scale) for the identified motifs after iteration
#' * scorematdim: the lengths of each of the identified motifs. If scorematdim is of length n_motifs, motif k is of length `scorematdim[k]`
#' scoremat is of dimension `sum(scorematdim)` by 4 and the first `scorematdim[1]` rows of this matrix gives the pwm for the first motifs, the next `scorematdim[2]` rows the second motif, and so on
#' * prior: a vector of length 10 probabilities giving the inferred probability of a motif being found across different parts of the sequence from start to end.
#' * alpha: a vector of probabilities giving the inferred probability of each motif being found within a single input region
#' * bindmat:a version of scoremat accounting for the background sequence composition
#' * background: the inferred background model
#' * seed: random number generation seed used
#'
#' Details of output for given data
#' * regprobs: a matrix giving the probability of each motif occurring in each given input sequence
#' * regprob: a vector giving the overall probability of any motif occurring in each given input sequence
#' * bestpos: a matrix giving the best match for each motif in each given input sequence
#' * whichregs is a vector showing which input sequences had motifs identified in the final round of sampling of the Gibbs sampler
#' * whichpos: for motifs identified in regions described in whichreg, the start positions of motifs identified in the final round of sampling of the Gibbs sampler
#' * whichmot: for motifs identified in regions described in whichreg, the type (an integer in the range 1 to length(scorematdim)) of motifs identified in the final round of sampling of the Gibbs sampler
#' * whichstrand: for motifs identified in regions described in whichreg, the strand associated with motifs identified in the final round of sampling of the Gibbs sampler, relative to the input sequence
#'
#' @export
#' @import gtools data.table
getmotifs=function(scorematset,dimvec,seqs,maxwidth=800,alpha=0.5,incprob=0.99999,maxits=30,plen=0.05,updatemot=1,updatealpha=1,ourprior=NULL,updateprior=1,bg=-1,dt=T, allowinf=FALSE,seed=NULL,verbosity=1, stranded_prior=F, conv_t=0.05, conv_n=200){
starttime=proc.time()
its=0;
if(ncol(scorematset)!=4){
stop("scorematset should be a matrix with four columns one for each of A,C,G, and T")
}
if(nrow(scorematset)!=sum(dimvec)){
stop("The number of rows in scorematset does not match the dimvec parameter")
}
if(any(!is.finite(scorematset))==TRUE & allowinf==FALSE){
stop("Scorematset should probably not contain infinite values, try adding pseudocounts to the PFM when creating your PWM or change the allowinf parameter to TRUE")
}
### initialise vector of prior probabilities
alphas=matrix(nrow=0,ncol=length(dimvec))
if(length(alpha)!=length(dimvec)){
if(verbosity>=3) print("Initialising fractions as uniform")
alpha=rep(1/length(dimvec),length(dimvec))/2
}
if (is.null(seed)){
seed <- sample.int(2^20, 1)
}
if (!is.na(seed)){
set.seed(seed)
}
###need later
logfact=cumsum(log(1:(length(seqs)+4)))
logfact=c(0,logfact)
origseqs=seqs
######external regions get score -10000
####initialise so that need to construct background in the first iteration
qvec=vector(length=0)
#####for now, don't update background each iteration
####get sequences, pad to make same length
this=nchar(seqs)
statement=paste("Removing ",sum(this>maxwidth)," long regions with over ",maxwidth, " characters, of ", length(this)," regions in total (", sum(this>maxwidth)/length(this),"%).",sep="")
if(verbosity>=3) print(statement)
seqs=seqs[this<=maxwidth]
this=this[this<=maxwidth]
fullseqs=seqs
seqs = chartr("ACGTN","12345",seqs)
if(verbosity>=3) print("Padding with N's to equalize size")
temp=rep("5",maxwidth)
temp=paste(temp,collapse="")
maxwidth=max(this)
for(i in 1:length(seqs)) if(this[i]<max(this)) seqs[i]=paste(seqs[i],substring(temp,1,(max(this)-this[i])),sep="")
seqs_matrix <- matrix(unlist(strsplit(seqs,""), use.names = FALSE), ncol=maxwidth, byrow=TRUE)
mode(seqs_matrix) <- "numeric"
if(sum(is.na(seqs_matrix))){
warning("Sequences should only contain A,C,G,T or N", call. = FALSE)
return(NULL)
}
####begin iteration!
while(its<maxits){
if(verbosity>=3) print("Setting up")
###get start and end positions of each motif within the score matrix. note need to do this each time, as size of things can change
starts=c(1,cumsum(dimvec)+1)
starts=starts[1:length(dimvec)]
ends=cumsum(dimvec)
###score
bestpos=matrix(0,nrow=length(seqs),ncol=length(starts))
beststrand=bestpos
newmat=matrix(0,ncol=maxwidth,nrow=length(seqs))
scores=newmat
scores2=newmat
its=its+1
if(verbosity>=2){
print(paste("Iteration number ",its,sep=""))
}else if(verbosity>=1){ # at verbosity level 1, only iters and text motifs are printed, so put them on the same line
cat(paste0("Iteration: ",sprintf("%02d", its),", "))
}
if(its==1 | updatemot==1){ #note this is different from original
overallscores=matrix(0,nrow=length(seqs)*length(starts),ncol=maxwidth)
overallscores2=overallscores
background=overallscores
priormat=overallscores
}
posmat=overallscores
for(i in 1:maxwidth) posmat[,i]=i
newmat <- seqs_matrix
if(verbosity>=3) print("...done")
###scoremat now just prob of each base given binding
###we'll get probs of a sequence without binding in the matrix "background"
###get sets of probabilities
for(j in 1:length(starts)){
scoremat=scorematset[starts[j]:ends[j],]
if(verbosity>=3) print(scoremat)
scoremat=cbind(scoremat,rep(-10000,nrow(scoremat))) #nick removed pointless maxwidth=200 setting
compmat=scoremat[,c(4:1,5)]
compmat=compmat[nrow(compmat):1,]
if(verbosity>=3) print(paste("Beginning scoring for Motif",j))
if(updatemot==1 | its==1){
startrange=seq(1,maxwidth-nrow(scoremat)+1)
maxes=max(startrange)-1
for(i in 1:nrow(scoremat)){ #####sum log probabilities of each base according to pwm
newmat_t <- newmat[,i:(i+maxes)]
if(i==1){
scores = scoremat[i,newmat_t]
scores2 = compmat[i,newmat_t]
}else{
scores = scores + scoremat[i,newmat_t]
scores2 = scores2 + compmat[i,newmat_t]
}
}
rm(newmat_t)
scores <- matrix(scores, nrow=nrow(newmat))
scores2 <- matrix(scores2, nrow=nrow(newmat))
overall=scores
overall[scores2>scores]=scores2[scores2>scores]
if(verbosity>=3) print("Finding best positions")
for(i in 1:nrow(overall)){
bestpos[i,j]=sample(c(which(overall[i,]==max(overall[i,])),which(overall[i,]==max(overall[i,]))),1)
score1=scores[i,bestpos[i,j]]
score2=scores2[i,bestpos[i,j]]
beststrand[i,j]=1
if(score2>score1) beststrand[i,j]=0
}
if(verbosity>=3) print("Next motif")
overallscores[(nrow(newmat)*(j-1)+1):(nrow(newmat)*j),]=-1e9
overallscores[(nrow(newmat)*(j-1)+1):(nrow(newmat)*j),1:ncol(scores)]=scores
overallscores2[(nrow(newmat)*(j-1)+1):(nrow(newmat)*j),]=-1e9
overallscores2[(nrow(newmat)*(j-1)+1):(nrow(newmat)*j),1:ncol(scores)]=scores2
}
} # closes for(j in 1:length(starts)
if(updatemot!=1 & its!=1){
###after first iteration, if not changing motif then unsubtract background from last iteration
overallscores=overallscores+background
overallscores2=overallscores2+background
}
if(verbosity>=3) print("Background finding")
if(its==1){
###index each sequence by its triplet
index=newmat
index[,3:ncol(newmat)]=16*(newmat[,1:(ncol(newmat)-2)]-1)+4*(newmat[,2:(ncol(newmat)-1)]-1)+newmat[,3:ncol(newmat)]
index[newmat==5]=65
index[,2:ncol(newmat)][newmat[,1:(ncol(newmat)-1)]==5]=65
index[,3:ncol(newmat)][newmat[,1:(ncol(newmat)-2)]==5]=65
}
##tidy up the endpoints using 0.25
if(length(bg)==1){
if(length(qvec)==0){
if(verbosity>=3) print("Setting up background....")
bg=1:64*0
bg2=matrix(0,nrow=length(seqs),ncol=length(bg))
for(i in 1:length(seqs)){
if(!i%%1000){ if(verbosity>=3) print(i);if(verbosity>=3) print(bg);}
temp=substring(seqs[i],1,nchar(fullseqs[i]))
temp=substring(temp,1:nchar(temp),1:nchar(temp))
temp=as.numeric(temp)
newtemp=1:(length(temp)-2)
newtemp=16*(temp[1:(length(temp)-2)]-1)+4*(temp[2:(length(temp)-1)]-1)+(temp[3:(length(temp)-0)]-1)
newtemp[temp[1:(length(temp)-2)]==5]=-1
newtemp[temp[2:(length(temp)-1)]==5]=-1
newtemp[temp[3:(length(temp)-0)]==5]=-1
for(k in 1:64){
bg2[i,k]=bg2[i,k]+sum(newtemp==(k-1))
}
bg=bg+bg2[i,]
}
if(verbosity>=3) print("...done")
####allow either strand because under null don't expect this to matter
####just getting triplet probs
e=c("A","C","G","T")
for(k in 1:2) e=c(paste("A",e,sep=""),paste("C",e,sep=""),paste("G",e,sep=""),paste("T",e,sep=""))
d=c("T","G","C","A")
for(k in 1:2) d=c(paste(d,"T",sep=""),paste(d,"G",sep=""),paste(d,"C",sep=""),paste(d,"A",sep=""))
names(bg)=e
bg=bg+bg[d]
#### add one on for the prior, and to make robust to lack of data
bg=bg+1
bgstore=bg
qvec=as.vector(bg)/sum(bg)
qvecstore=qvec
}
}
#####now use qvec to get scores for all our sequences under the null
###first, get conditional probabilities for all triplets
####adjust below!!
letter1=floor(((1:64)-1)/16)+1
letter2=floor(((1:64)-(letter1-1)*16-1)/4)+1
letter3=floor(((1:64)-(letter1-1)*16-(letter2-1)*4-1))+1
newqvec=qvec*0
for(i in 1:64){
newqvec[i]=qvec[i]/sum(qvec[letter1==letter1[i] & letter2==letter2[i]])
}
#####add an extra term for missing bases
newqvec=c(newqvec,0.25)
newqvec=log(newqvec)
####now apply to the matrix storing our triplet lookup information
###take logs
###now add up to give the background equivalent to the length of the motif
background=background*0
background[is.na(background)]=0
if(verbosity>=3) print(dim(index))
if(verbosity>=3) print(dim(scores))
if(verbosity>=3) print("Putting in background information for the probability of each motif")
for(j in 1:length(starts)){
scoremat=scorematset[starts[j]:ends[j],]
cols = ncol(background) - nrow(scoremat) + 1
background_tmp <- background[(nrow(newmat)*(j-1)+1):(nrow(newmat)*j), 1:cols]
newqvec_index <- matrix(newqvec[as.double(index)], nrow=nrow(index))
for(i in 1:nrow(scoremat)){
background_tmp <- background_tmp + newqvec_index[,i:(i+cols-1)]
}
background[(nrow(newmat)*(j-1)+1):(nrow(newmat)*j),1:cols] <- background_tmp
}
rm(background_tmp)
if(verbosity>=3) print("Done")
if(verbosity>=3) print(range(overallscores))
if(verbosity>=3) print(range(overallscores2))
if(verbosity>=3) print(range(background))
overallscores=overallscores-background
overallscores2=overallscores2-background
# rm(background), required if updatemotif=1
if(verbosity>=3) print(range(overallscores))
if(verbosity>=3) print(range(overallscores2))
###assume a uniform prob. inside regions at first then infer
###priorprobs are just a set of densities
if(verbosity>=3) print("Applying prior matrix")
priormat=overallscores
priormat=scores*0
if(its==1){
if(is.null(ourprior)){
prior=rep(0.1,10)
prior2=prior
}
###otherwise, has vector of 10 probabilities
else{
prior=ourprior
prior=prior/sum(prior)
if(stranded_prior){
prior2=prior[10:1] # this assumes the priors are reversiable, which isn't always true
}else{
prior2=prior
}
}
}
if(verbosity>=3) print("...OK...")
##define priormat - note only have to correctly get "proportion" along
####first take stored "column positions"
endpos=1:nrow(overallscores)*0
for(j in 1:length(starts)){
scoremat=scorematset[starts[j]:ends[j],]
endpos[(nrow(newmat)*(j-1)+1):(nrow(newmat)*j)]=nchar(fullseqs[(1):(nrow(newmat))])-nrow(scoremat)+1
}
if(verbosity>=3) print("...OK3...")
priormat=posmat/endpos
priormat[priormat>1]=0
priormat=priormat-1/endpos
if(verbosity>=3) print("...OK4...")
priormat=floor(priormat*10)+1
priormat2 <- priormat
for(i in 1:10) priormat[priormat==i]=prior[i]
priormat[priormat<=0]=0
priormat[priormat>10]=0
priormat=priormat/rowSums(priormat)/2
for(i in 1:10) priormat2[priormat2==i]=prior2[i]
priormat2[priormat2<=0]=0
priormat2[priormat2>10]=0
priormat2=priormat2/rowSums(priormat2)/2
# priormat[priormat %in% 1:10] <- prior[match(priormat, 1:10, nomatch = 0)]
# priormat[priormat<0 | priormat>10] = 0
# priormat = priormat/rowSums(priormat)/2
scoremat=scorematset
if(verbosity>=3) print("Done")
if(verbosity>=3) print("Setting up sampling probabilities for motifs")
postforward=exp(overallscores)
postbackward=exp(overallscores2)
for(j in 1:length(starts)){
priormat[(nrow(newmat)*(j-1)+1):(nrow(newmat)*j),]=priormat[(nrow(newmat)*(j-1)+1):(nrow(newmat)*j),]*alpha[j];
priormat2[(nrow(newmat)*(j-1)+1):(nrow(newmat)*j),]=priormat2[(nrow(newmat)*(j-1)+1):(nrow(newmat)*j),]*alpha[j];
}
if(verbosity>=3) print("Calculating probabilities")
postforward=postforward*priormat
postbackward=postbackward*priormat2
for(j in 1:length(starts)){
temp=postforward[(nrow(newmat)*(j-1)+1):(nrow(newmat)*j),]
postforward[(nrow(newmat)*(j-1)+1):(nrow(newmat)*j),]=temp
temp=postbackward[(nrow(newmat)*(j-1)+1):(nrow(newmat)*j),]
postbackward[(nrow(newmat)*(j-1)+1):(nrow(newmat)*j),]=temp
}
if(verbosity>=3) print("Done")
if(verbosity>=3) print("Making sampling matrix")
samplemat=matrix(nrow=nrow(newmat),ncol=ncol(postforward)*length(starts)*2)
s=ncol(postforward)
for(j in 1:length(starts)){
samplemat[,(s*(j-1)*2+1):(s*(2*j-1))]=postforward[(nrow(newmat)*(j-1)+1):(nrow(newmat)*j),];
samplemat[,(s*(2*j-1)+1):(s*(2*j))]=postbackward[(nrow(newmat)*(j-1)+1):(nrow(newmat)*j),];
}
rm(postforward)
rm(postbackward)
samplemat=samplemat/(rowSums(samplemat)+(1-sum(alpha)))
###enables sampling
if(verbosity>=3) print("Done")
if(verbosity>=3) print("Sampling")
######sample
q=runif(nrow(samplemat))
#####need to record which motif if any
#####which strand
#####best position, best strand
#####start point
regprobs=matrix(0,nrow=length(q),ncol=length(starts))
for(j in 1:length(starts)){
regprobs[,j]=rowSums(samplemat[,(s*(j-1)*2+1):(s*(2*j))])
}
regprob=rowSums(regprobs)
mot=q*0
if(verbosity>=3) print("Picking positions within regions")
whichcol=mot*0
testmat=samplemat
for(j in 2:ncol(samplemat)){
testmat[,j]=testmat[,(j-1)]+testmat[,j]
}
for(j in ncol(samplemat):1){
whichcol[q<=testmat[,j]]=j
}
rm(testmat)
rm(samplemat)
mot[whichcol!=0]=1
whichmot=floor((whichcol-1)/2/ncol(overallscores))+1
whichstrand=(floor((whichcol-1)/ncol(overallscores))+1)%%2
whichpos=whichcol-(whichmot-1)*ncol(overallscores)*2-(1-whichstrand)*ncol(overallscores)
whichmot=whichmot[mot==1]
whichpos=whichpos[mot==1]
whichstrand=whichstrand[mot==1]
if(verbosity>=3) print("Done")
#print(table(whichpos))
#######get a prior on positions
######update prior using sampled positions
totals=1:length(starts)
for(i in 1:length(totals)) totals[i]=sum(whichmot==i)
totals=c(totals,sum(mot==0))
####make sure prior prob of motif is 1/2 in dirichlet
totals[length(totals)]=totals[length(totals)]+length(starts)-1
######need mcmc pack for rdirichlet(n,alpha)
alphanew=rdirichlet(1,alpha=1+totals)
##alphanew=rbeta(1,shape1=sum(mot==1)+1,shape2=sum(mot==0)+1)
######sample start pos
alphanew=alphanew[1:length(starts)]
if(verbosity>=3) print("Alpha values sampled:")
if(verbosity>=3) print(c(alphanew,sum(alphanew)))
whichregs=which(mot==1)
if(stranded_prior){
strand_corrected_pos <- c(whichpos[whichstrand==1],
nchar(fullseqs[whichregs][whichstrand==0])+1 - (whichpos[whichstrand==0] + dimvec[whichmot][whichstrand==0]-1))
strand_corrected_pos2 <- c(nchar(fullseqs[whichregs][whichstrand==1])+1 - (whichpos[whichstrand==1] + dimvec[whichmot][whichstrand==1]-1),
whichpos[whichstrand==0])
denom = c(nchar(fullseqs[whichregs][whichstrand==1]),nchar(fullseqs[whichregs][whichstrand==0])) -
c(dimvec[whichmot][whichstrand==1],dimvec[whichmot][whichstrand==0])
v=hist((strand_corrected_pos-1) / denom, breaks=seq(0,1,0.1),plot=FALSE)
v2=hist((strand_corrected_pos2-1) / denom, breaks=seq(0,1,0.1),plot=FALSE)
}else{
v=hist((whichpos-1)/(nchar(fullseqs[whichregs])-dimvec[whichmot]),breaks=seq(0,1,0.1),plot=FALSE)
}
if(updateprior==1){
prior=v$counts+5
prior=prior/sum(prior)
if(stranded_prior){
prior2=v2$counts+5
prior2=prior2/sum(prior2)
# Sometimes the priors of each strand are reversiable
# so could be possible (future todo) to skip some of the priormat calculations
#if(all(prior2!=prior[10:1])){
# print("Warning Priors not reversiable!")
# print(prior-prior2)
#}
}else{
prior2=prior
}
}
####for compatibility
strand=whichstrand
if(verbosity>=3) print("Sampling sequences")
###get sequences, to make a new motif
###background model is going to be based on the overall - inconsistent otherwise
###so we'll successively subtract the motif occurrences
bg=bgstore
qvec=qvecstore
ourcounts=matrix(nrow=65,ncol=0)
oursummary=matrix(nrow=5,ncol=0)
####new, stronger background model
newbackground=matrix(0,nrow=length(starts),ncol=64)
for(j in 1:length(starts)){
if(verbosity>=3) print(paste("Updating Motif",j,"counts"))
scoremat=scorematset[starts[j]:ends[j],]
tempregs=whichregs[whichmot==j]
newbackground[j,]=colSums(bg2[whichregs[whichmot==j],])
if(verbosity>=3) print(newbackground[j,])
tempstarts=whichpos[whichmot==j]
tempends=tempstarts+nrow(scoremat)-1
tempstrand=whichstrand[whichmot==j]
ourseqs=matrix(nrow=length(tempregs),ncol=nrow(scoremat)+50)
sampleseqs=seqs[tempregs]
v=substring(sampleseqs,tempstarts-25,tempends+25)
for(i in 1:length(v)) if(tempstarts[i]<=25){
v[i]=paste(c(rep("5",25-tempstarts[i]+1),v[i]),collapse="")
}
extension_lengths <- tempends+25-nchar(sampleseqs)
extension_lengths[extension_lengths<0] <- 0
v = paste0(v,strrep("5", extension_lengths))
######have subsequences
for(k in 1:(nrow(scoremat)+50)) ourseqs[,k]=as.double(substring(v,k,k))
ourseqs2=ourseqs[,ncol(ourseqs):1]
ourseqs2=5-ourseqs2
ourseqs2[ourseqs2==0]=5
ourseqs[tempstrand==0,]=ourseqs2[tempstrand==0,]
summary=matrix(nrow=4,ncol=ncol(ourseqs))
for(i in 1:4) summary[i,]=colSums(ourseqs==i)
oursummary=cbind(oursummary,rbind(summary,j))
####this gives us counts - now need to get likelihood under a background model
####enables us to sample a new motif in a relatively "principled" manner
if(verbosity>=3) print("Building background model...")
newtemp=16*(ourseqs[,1:(ncol(ourseqs)-2)]-1)+4*(ourseqs[,2:(ncol(ourseqs)-1)]-1)+(ourseqs[,3:(ncol(ourseqs)-0)]-1)
newtemp[ourseqs[,1:(ncol(ourseqs)-2)]==5]=-1
newtemp[ourseqs[,2:(ncol(ourseqs)-1)]==5]=-1
newtemp[ourseqs[,3:(ncol(ourseqs)-0)]==5]=-1
newtemp=newtemp+1
motcounts=matrix(ncol=ncol(newtemp),nrow=64)
for(i in 1:64) motcounts[i,]=colSums(newtemp==i)
if(verbosity>=3) print("...done")
bg=bg-rowSums(motcounts)
ourcounts=cbind(ourcounts,rbind(motcounts,j))
}
####fold over background - may need this to avoid negative values, given strand flipping
e=c("A","C","G","T")
for(i in 1:2) e=c(paste("A",e,sep=""),paste("C",e,sep=""),paste("G",e,sep=""),paste("T",e,sep=""))
d=c("T","G","C","A")
for(i in 1:2) d=c(paste(d,"T",sep=""),paste(d,"G",sep=""),paste(d,"C",sep=""),paste(d,"A",sep=""))
names(bg)=e
bg=bg+bg[d]
###add one for prior
bg=bg+1
colnames(newbackground)=e
newbackground=newbackground+newbackground[,d]
qvec=as.vector(bg)/sum(bg)
newqvec=newbackground/rowSums(newbackground)
letter1=floor(((1:64)-1)/16)+1
letter2=floor(((1:64)-(letter1-1)*16-1)/4)+1
letter3=floor(((1:64)-(letter1-1)*16-(letter2-1)*4-1))+1
predfrac=matrix(nrow=64,ncol=4)
for(i in 1:64){
predfrac[i,1]=qvec[letter1==letter1[i] & letter3==letter3[i] & letter2==1]
predfrac[i,2]=qvec[letter1==letter1[i] & letter3==letter3[i] & letter2==2]
predfrac[i,3]=qvec[letter1==letter1[i] & letter3==letter3[i] & letter2==3]
predfrac[i,4]=qvec[letter1==letter1[i] & letter3==letter3[i] & letter2==4]
}
predfrac=predfrac/rowSums(predfrac)
newpredfrac=array(0,dim=c(length(starts),64,4))
for(i in 1:64){
newpredfrac[,i,1]=newqvec[,letter1==letter1[i] & letter3==letter3[i] & letter2==1]
newpredfrac[,i,2]=newqvec[,letter1==letter1[i] & letter3==letter3[i] & letter2==2]
newpredfrac[,i,3]=newqvec[,letter1==letter1[i] & letter3==letter3[i] & letter2==3]
newpredfrac[,i,4]=newqvec[,letter1==letter1[i] & letter3==letter3[i] & letter2==4]
}
for(j in 1:length(starts)){
newpredfrac[j,,]=newpredfrac[j,,]/rowSums(newpredfrac[j,,])
}
###this gives the prediction we can now apply to each motif
newnewdim=vector(length=0)
newmatset=matrix(nrow=0,ncol=4)
bindmatset=matrix(nrow=0,ncol=4)
for(j in 1:length(starts)){
motcounts=ourcounts[1:64,ourcounts[65,]==j]
motcounts=t(motcounts)
summary=oursummary[1:4,oursummary[5,]==j]
expcounts=matrix(nrow=nrow(motcounts),ncol=4)
###add one for robustness
for(i in 1:4) expcounts[,i]=motcounts %*% newpredfrac[j,,i]+1
expcounts=expcounts/rowSums(expcounts)
summary=summary[,2:(ncol(summary)-1)]
expsummary=t(expcounts)
summary2=summary/expsummary*rowSums(expcounts)
noninclogprob=colSums(summary*t(log(expcounts)))
if(verbosity>=3) print("Calculating likelihood terms")
###have a uniform prior for bases included, four bases
########need likelihood for an included base
###uniform dirichlet prior leads to following posterior after integrating out frequencies
inclogprob=log(6)+logfact[summary[1,]+1]+logfact[summary[2,]+1]+logfact[summary[3,]+1]+
logfact[summary[4,]+1]-logfact[colSums(summary)+4]
increl=log(incprob*exp(inclogprob-noninclogprob)+(1-incprob))
increl[is.infinite(increl)]=(inclogprob-noninclogprob)[is.infinite(increl)]
newrel=c(0,cumsum((increl)))
#####get lhood for each possible value
lhood=matrix(0,nrow=ncol(summary),ncol=ncol(summary))
for(start in 1:ncol(summary)){
for(end in start:ncol(summary)){
lhood[start,end]=log(plen)*(end-start)+log(1-plen)+newrel[end+1]-newrel[start]
}
}
######these are relative log-probs
lhood2=exp(lhood-max(lhood[lhood!=0]))
lhood2[lhood==0]=0
lhood2=lhood2/sum(lhood2[lhood!=0])
motpos=sample(length(lhood2),1,prob=as.double(lhood2))
start=motpos %% nrow(lhood2)
if(start==0) start=nrow(lhood2)
end=(motpos-start)/nrow(lhood2)+1
#######have sampled new motif start and end positions
###sample new parameters - should be dirichlet but use expectations instead
temp=summary[,start:end]+1
temp=matrix(temp,nrow=4)
temp=t(t(temp)/colSums(temp))
temp=matrix(log(temp),nrow=4)
#######for binding, get rid of background
temp2=summary2[,start:end]+1
temp2=matrix(temp2,nrow=4)
##temp2=t(t(temp2)/colSums(temp2))
temp2=matrix(log(temp2),nrow=4)
newmat=matrix(nrow=end-start+1,ncol=4)
newmat[,1:4]=t(temp)
newmat2=t(temp2)
####make a new combined matrix for looking at...
newmatset=rbind(newmatset,newmat)
bindmatset=rbind(bindmatset,newmat2)
newnewdim=c(newnewdim,nrow(newmat))
}
###expected fractions for each base conditional on their neighbours
#####set new params
scorematsetold=scorematset
if(updatealpha==1) alpha=alphanew
if(updatemot==1){
scorematset=newmatset[,1:4, drop=FALSE]
dimvec=newnewdim
}else {
scorematset=scorematset[,1:4]
}
alphas=rbind(alphas,alpha)
# Print motif
if(verbosity>=1){
newstarts=c(1,cumsum(dimvec)+1)
newends=cumsum(dimvec)
motif_text <- pwm2text(scorematset[newstarts[1]:newends[1],,drop=FALSE])
if(length(dimvec)>1){
for(i in 2:length(dimvec)){
motif_text = paste(motif_text,"|",pwm2text(scorematset[newstarts[i]:newends[i],]))
}
}
cat(paste0("Prior:",spark_bar(prior/max(prior))," Alpha(s): ",paste(round(alpha,2),collapse=" "),", Motif(s): '",motif_text,"'\n"))
}
if(updatemot==1){
####remove motifs if not long enough
if(min(dimvec)<=3){
remo=which(dimvec<=3)
if(verbosity>=1) print("Some motifs have length <=3, removing")
if(verbosity>=1) print(paste("Motif Lengths:",dimvec))
if(sum(dimvec>3)==0){
warning("No motifs remaining!", call.=FALSE)
return(NULL)
}
newmat=matrix(nrow=0,ncol=4)
newmat2=matrix(nrow=0,ncol=4)
newstarts=c(1,cumsum(dimvec)+1)
newends=cumsum(dimvec)
for(i in 1:length(dimvec)) if(dimvec[i]>3){
newmat=rbind(newmat,scorematset[newstarts[i]:newends[i],])
newmat2=rbind(newmat2,bindmatset[newstarts[i]:newends[i],])
}
scorematset=newmat
bindmatset=newmat2
scorematset=newmat
####remove offending motif
if(verbosity>=3) print(dim(alphas))
if(verbosity>=3) print(length(dimvec))
alphas=alphas[,dimvec>3]
alpha=alpha[dimvec>3]
dimvec=dimvec[dimvec>3]
}
####remove motifs if too long
if(max(dimvec)>=maxwidth/2){
remo=which(dimvec>=maxwidth/2)
if(verbosity>=1) print("Some motifs have length >=maxwidth/2, removing")
if(verbosity>=1) print(paste("Motif Lengths:",dimvec))
if(sum(dimvec<=maxwidth/2)==0){
warning("No motifs remaining!", call.=FALSE)
return(NULL)
}
newmat=matrix(nrow=0,ncol=4)
newmat2=matrix(nrow=0,ncol=4)
newstarts=c(1,cumsum(dimvec)+1)
newends=cumsum(dimvec)
for(i in 1:length(dimvec)) if(dimvec[i]>=maxwidth/2){
newmat=rbind(newmat,scorematset[newstarts[i]:newends[i],])
newmat2=rbind(newmat2,bindmatset[newstarts[i]:newends[i],])
}
scorematset=newmat
bindmatset=newmat2
scorematset=newmat
####remove offending motif
if(verbosity>=3) print(dim(alphas))
if(verbosity>=3) print(length(dimvec))
alphas=alphas[,dimvec>3]
alpha=alpha[dimvec>3]
dimvec=dimvec[dimvec>3]
}
####remove motifs if not viable
if(min(length(fullseqs)*alpha)<=10){
if(verbosity>=1) print("Some motifs have <=10 expected copies, removing")
if(verbosity>=1) print(paste("Expected copies per motif:",round(length(fullseqs)*alpha,2)))
if(all(length(fullseqs)*alpha<=10)){
warning("No motifs remaining!", call.=FALSE)
return(NULL)
}
newmat=matrix(nrow=0,ncol=4)
newmat2=matrix(nrow=0,ncol=4)
newstarts=c(1,cumsum(dimvec)+1)
newends=cumsum(dimvec)
for(i in 1:length(dimvec)) if(length(fullseqs)*alpha[i]>10){
newmat=rbind(newmat,scorematset[newstarts[i]:newends[i],])
newmat2=rbind(newmat2,bindmatset[newstarts[i]:newends[i],])
}
scorematset=newmat
bindmatset=newmat2
dimvec=dimvec[length(fullseqs)*alpha>10]
####remove offending motif
alphas=alphas[,length(fullseqs)*alpha>10]
alpha=alpha[length(fullseqs)*alpha>10]
}
}
# determine if convergence critera has been met
if(nrow(alphas)>conv_n &
var(tail(alphas,conv_n)) < conv_t){
break
}
} #ends iteration while loop
if(verbosity>=1) print(proc.time() - starttime)
z2 <- list(seqs=origseqs,alphas=alphas,beststrand=beststrand, trimmedseqs=fullseqs,prior=prior,alpha=alpha,bindmat=bindmatset,scoremat=scorematset,scorematdim=dimvec,regprob=regprob,regprobs=regprobs,bestmatch=bestpos,whichregs=whichregs,whichpos=whichpos,background=qvec,whichmot=whichmot, whichstrand=strand,seed=as.integer(seed))
if(dt==T){
dt_all <- data.table(regprob = c(z2$regprobs),
sequence = rep(names(z2$seqs),ncol(z2$regprobs)),
whichmotif = rep(1:ncol(z2$regprobs),each=nrow(z2$regprobs)))
dt_regulated <- data.table(sequence = names(z2$seqs)[z2$whichregs],
whichpos = z2$whichpos,
whichmotif = z2$whichmot,
whichstrand = z2$whichstrand)
z2$dt <- merge(dt_all, dt_regulated, by=c("sequence","whichmotif"), all.x=T)
}
return(z2)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.