#' Detect local shape changes
#'
#' This function discovers outlying subjects whose RNA-seq have "local" abnormal shapes
#' and provides the most outlying window-direction for each outlier.
#'
#' @param miscGlobalobj object from \link{miscGlobal}
#' @param countData raw coverage matrix, dataI from \link{process_data}
#' @param residualData a residual matrix from PCA, a data matrix subtracted by a low-rank matrix.
#' @param exonset data annotation matrix, exonset from \link{process_data}
#' @param windowSize a window length. Default is 100.
#' @param siglev the significance level for a robust outlier detection. Default is 1e-5.
#' IF cutoff is specified, siglev is not used.
#' @param cutoff the cutoff value for outlying statistics.
#' If NULL, the cutoff value is computed based on the specified siglev.
#' @param reducedReturn
#'
#' @export
miscLocal = function(miscGlobalobj,countData,exonset,ADcutoff=3,windowSize=100,
siglev=1e-5,cutoff=NULL,reducedReturn=TRUE) {
n = ncol(miscGlobalobj$residualData); d = nrow(miscGlobalobj$residualData);
if (nrow(exonset)==1) {
exon.base=c(exonset[1,2]:exonset[1,3])
} else {
exon.base=c()
for (i in 1:(nrow(exonset)-1)) {
exon.base=c(exon.base,c(exonset[i,2]:exonset[i,3]))
}
}
out1 = miscGlobalobj$outliers;
residualData2 = miscGlobalobj$residualData[,which(! c(1:n) %in% out1)]
countData2 = countData[,which(! c(1:n) %in% out1)]
MOD=matrix(0,nrow=d,ncol=n)
NPS=matrix(0,nrow=n,ncol=n)
OS = rep(0,n);
if (miscGlobalobj$K==0) {
out2=out2.sort=c()
out2stat=rep(0,ncol(residualData2))
cutoff=0
residualData_out=NULL
} else {
residualData_out = detect_localout(residualData=residualData2,countData=countData2,exonset=exonset,
windowSize=windowSize,readconstr=readconstr,
siglev=siglev,cutoff=cutoff,reducedReturn=reducedReturn)
changeID = Relist.hy(1:n,out1);
out2 = changeID$new[residualData_out$outliers];
out2.sort = changeID$new[residualData_out$outliers.sort]
## Step 2 MOD
if (length(out1)>=1) {
MOD[,-out1] = residualData_out$MOD;
NPS[-out1,-out1] = residualData_out$NPS;
} else {
MOD = residualData_out$MOD;
NPS = NPS
}
cutoff = residualData_out$cutoff;
out2stat = residualData_out$onoff_stat;
OS[which(! c(1:n) %in% out1)] = out2stat;
}
return(list(outliers=out2,outliers.sort=out2.sort,
out2stat=out2stat,
cutoff=cutoff,ks.df=residualData_out$ks.df,ks.pval=residualData_out$ks.pval,
MOD=MOD,NPS=NPS,OS=OS,
out2res=residualData_out));
}
#' Detect local shape outliers with a residual matrix
#'
#' This function identifies local shape variants by mining locally on/off abnormalities.
#'
#'
#' @export
detect_localout = function(residualData,countData,exonset,
windowSize=100,ADcutoff=3,
siglev=NULL,cutoff=NULL,reducedReturn=FALSE) {
n2=ncol(residualData)
onoff_res=as.list(NULL)
readconstr=10 # the minimum reads count required to be considered for on/off local shape changes.
onoff_res[[1]]=get_offstat(residualData=residualData,countData=countData,exonset=exonset,ADcutoff=ADcutoff,
windowSize=windowSize,readconstr=readconstr)
onoff_res[[2]]=get_exon_offstat(residualData=residualData,countData=countData,exonset=exonset,ADcutoff=ADcutoff)
onoff_res[[3]]=get_exon_onstat(residualData=residualData,countData=countData,
exonset=exonset,readconstr=readconstr)
onoff_res[[4]]=get_intron_onstat(residualData=residualData,countData=countData,
exonset=exonset,readconstr=readconstr)
onoff_statmat=matrix(0,nrow=length(onoff_res),ncol=n2)
for (i in 1:length(onoff_res)) {
onoff_statmat[i,]=onoff_res[[i]]$stat
}
onoff_stat=apply(onoff_statmat,2,max)
onoff_stat_which=apply(onoff_statmat,2,which.max)
MOD = matrix(0,nrow=nrow(residualData),ncol=n2)
NPS = matrix(0,nrow=n2,ncol=n2)
for (j in 1:n2) {
MOD[,j]=onoff_res[[onoff_stat_which[j]]]$tMOD[,j]
NPS[,j]=onoff_res[[onoff_stat_which[j]]]$tNPS[,j]
}
ks.pval=ks.stat=rep(0,20)
for (df in 1:20) {
# hist(pchisq(onoff_stat^2,df=df,lower.tail=F),main=df)
temp.out=which(onoff_stat>sqrt(qchisq(siglev,df=df,lower.tail=F)))
ks.output=ks.test(onoff_stat[which(! c(1:n2) %in% temp.out)]^2,pchisq,df)
ks.pval[df]=ks.output$p.value
ks.stat[df]=ks.output$statistic
# cat(paste(df,"|",round(ks.pval[df],digits=5),"|",
# round(ks.stat[df],digits=3)),"\n")
}
df=which.min(ks.stat)
if (is.null(cutoff)) {
cutoff=sqrt(qchisq(siglev,df=df,lower.tail=F))
}
onoffout=c()
for (i in 1:length(onoff_res)) {
onoffout=unique(c(onoffout,which(onoff_res[[i]]$stat>cutoff)))
}
if (length(onoffout)>0) {
onoffout.sort=onoffout[order(onoff_stat[onoffout],decreasing=T)]
} else {
onoffout=NULL; onoffout.sort=NULL;
}
if (reducedReturn) {
onoff_res=NULL
}
return(list(outliers=onoffout,
outliers.sort=onoffout.sort,
onoff_stat=onoff_stat,
MOD=MOD,NPS=NPS,
cutoff=cutoff,ks.df=df,ks.pval=ks.pval[df],
onoff_res=onoff_res))
}
#' Quantifying abnormal deletions in narrow regions
#'
#' This function quantifies abnormality associated with
#' @export
get_offstat= function(residualData,countData,exonset,ADcutoff=3,
windowSize=100,readconstr=10) {
## Step 2 : detect "off" outliers
require(zoo)
# Find eligible regions
n2 = ncol(residualData)
medvec = apply(countData,1,median)
if (nrow(exonset)==1) {
exon.base=c(exonset[1,2]:exonset[1,3])
} else {
exon.base=c()
for (i in 1:(nrow(exonset)-1)) {
exon.base=c(exon.base,c(exonset[i,2]:exonset[i,3]))
}
}
std.overall=t(apply(residualData[exon.base,],1,pd.rate.hy))
mad.overall=apply(std.overall,2,mad)
mad.overall[which(mad.overall<1)]=1
residualData = sweep(residualData,2,mad.overall,"/");
medvec.case = apply(countData[exon.base,],2,median)
medvec.case01 = rep(0,n2); medvec.case01[which(medvec.case>10)]=1;
window_min = rollapply(medvec,width=windowSize,FUN=min) # min of med
# window_mean = rollapply(medvec,width=windowSize,FUN=mean)
window_idx0 = which(window_min>=readconstr)
window_idx1 = window_idx0[which(window_idx0 %in% exon.base)]
window_idx = window_idx1[which(window_min[window_idx1]>=quantile(window_min[window_idx1],probs=0.1))] # do we really need this?
# Find exons whose lengths are less than the pre-determined size.
smallexon_0 = which((exonset[1:(nrow(exonset)-1),3]-exonset[1:(nrow(exonset)-1),2]+1)<windowSize)
smallexon_min = c()
for (j in smallexon_0) {
smallexon_min = min(medvec[c(exonset[j,2]:exonset[j,3])])
}
smallexon = smallexon_0[which(smallexon_min>=readconstr)]
# Determine window size
winsize_vec = rep(0,length(window_min))
winsize_vec[window_idx] = windowSize;
winsize_vec[exonset[smallexon,2]] = exonset[smallexon,3]-exonset[smallexon,2]+1
# Final start_positions for calculating statistics
window_site = which(winsize_vec>0)
window_size = winsize_vec[window_site]
pdrate = matrix(0,nrow=length(window_site),ncol=ncol(residualData))
tMOD=matrix(0,nrow=nrow(countData),ncol=n2) # temp MOD
tNPS=matrix(0,nrow=n2,ncol=n2) # temp NPS
if (length(window_site)>0) {
for (i in which((window_site>100) & (window_site<(nrow(residualData)-100)))) {
carea = c((window_site[i]):(window_site[i]-1+window_size[i]));
# pdrate[i,] = pd.rate.hy(-apply(residualData[carea,],2,sum),qrsc=T);
pdrate[i,] = pd.rate.hy(-apply(residualData[carea,],2,sum),qrsc=F);
}
outdir = which(apply(pdrate,1,ADstatWins.hy)>=ADcutoff)
pdrate[outdir,]=0
pdrate2 = sweep(pdrate,2,medvec.case01,"*")
# inner.std=function(x) {
# m=mad(x)
# if (m>1) {
# return(x/m)
# } else {
# return(x)
# }
# }
# pdrate3=apply(pdrate2,2,inner.std)
pdrate3=pdrate2
off_stat = apply(pdrate3,2,max)
where_on = apply(pdrate3,2,which.max)
for (j in 1:n2) {
start_loc = window_site[where_on[j]]
end_loc = start_loc + window_size[where_on[j]] - 1
carea = c(start_loc:end_loc)
tMOD[carea,j] = -1/sqrt(length(carea))
tNPS[,j] = pdrate3[where_on[j],]
}
} else {
off_stat = rep(0,ncol(residualData))
}
return(list(stat=off_stat,tMOD=tMOD,tNPS=tNPS))
}
#'
#' @export
get_exon_offstat= function(residualData,countData,exonset,ADcutoff=3) {
n2 = ncol(residualData)
medvec = apply(countData,1,median)
nexon=nrow(exonset)
exon.comb=diag(1,nexon)
if (nexon>2) {
for (j in 2:(nexon-1)) {
temp=rep(0,nexon)
temp[c(1:j)]=1
exon.comb=cbind(exon.comb,temp,rev(temp))
}
}
colnames(exon.comb)=NULL
pdrate = matrix(0,nrow=ncol(exon.comb),ncol=n2)
for (i in 1:nrow(pdrate)) {
carea=c()
for (e in which(exon.comb[,i]==1)) {
carea=c(carea,c(exonset[e,2]:exonset[e,3]))
}
if (length(carea)>20) {
pdrate[i,] = pd.rate.hy(-apply(residualData[carea,],2,sum),qrsc=F);
}
}
outdir=which(apply(pdrate,1,ADstatWins.hy)>=ADcutoff)
pdrate[outdir,]=0
exonoff_stat = apply(pdrate,2,max)
where_on = apply(pdrate,2,which.max)
tMOD=matrix(0,nrow=nrow(countData),ncol=n2) # temp MOD
tNPS=matrix(0,nrow=n2,ncol=n2) # temp NPS
for (j in 1:n2) {
carea=c()
for (e in which(exon.comb[,where_on[j]]==1)) {
carea=c(carea,c(exonset[e,2]:exonset[e,3]))
}
if (length(carea)>20) {
tMOD[carea,j] = -1/sqrt(length(carea))
tNPS[,j] = pdrate[where_on[j],]
}
}
return(list(stat=exonoff_stat,tMOD=tMOD,tNPS=tNPS))
}
#'
#' @export
get_exon_onstat= function(residualData,countData,exonset,readconstr=10) {
n2 = ncol(residualData)
medvec = apply(countData,1,median)
nexon=nrow(exonset)
exon.comb=diag(1,nexon)
if (nexon>2) {
for (j in 2:(nexon-1)) {
temp=rep(0,nexon)
temp[c(1:j)]=1
exon.comb=cbind(exon.comb,temp,rev(temp))
}
}
colnames(exon.comb)=NULL
medianmat = matrix(0,nrow=ncol(exon.comb),ncol=n2)
medianmat1 = matrix(0,nrow=ncol(exon.comb),ncol=n2)
medianmat2 = matrix(0,nrow=ncol(exon.comb),ncol=n2)
pdrate1 = matrix(0,nrow=ncol(exon.comb),ncol=n2)
pdrate2 = matrix(0,nrow=ncol(exon.comb),ncol=n2)
for (i in 1:ncol(exon.comb)) {
carea1=c()
for (e in which(exon.comb[,i]==1)) {
carea1=c(carea1,c(exonset[e,2]:exonset[e,3]))
}
if (length(carea1)>20) {
pdrate1[i,] = pd.rate.hy(apply(residualData[carea1,],2,sum),qrsc=T);
medianmat[i,] = apply(countData[carea1,],2,FUN=function(x){(quantile(x,probs=0.75)>=readconstr)})
medianmat1[i,] = apply(countData[carea1,],2,FUN=function(x){fivenum(x)[3]})
}
# carea2=c()
# for (e in which(exon.comb[,i]==0)) {
# carea2=c(carea2,c(exonset[e,2]:exonset[e,3]))
# }
# pdrate2[i,] = pd.rate.hy(apply(residualData[carea2,],2,sum),qrsc=T);
# medianmat2[i,] = apply(countData[carea2,],2,FUN=function(x){fivenum(x)[3]})
}
exonon_stat = apply(pdrate1*medianmat,2,max)
where_on = apply(pdrate1*medianmat,2,which.max)
tMOD=matrix(0,nrow=nrow(countData),ncol=n2)
tNPS=matrix(0,nrow=n2,ncol=n2) # temp NPS
for (j in 1:n2) {
carea=c()
for (e in which(exon.comb[,where_on[j]]==1)) {
carea=c(carea,c(exonset[e,2]:exonset[e,3]))
}
if (length(carea)>20) {
tMOD[carea,j] = 1/sqrt(length(carea))
tNPS[,j] = pdrate1[where_on[j],]
}
}
return(list(stat=exonon_stat,tMOD=tMOD,tNPS=tNPS))
}
#'
#' @export
get_intron_onstat= function(residualData,countData,exonset,readconstr=10) {
intron_onstat_inner2=function(x){
lcarea=length(carea)
y=max(quantile(x[carea[6:ceiling(lcarea/3)]],probs=0.75),
quantile(x[carea[ceiling(lcarea/3):(2*ceiling(lcarea/3))]],probs=0.75),
quantile(x[carea[(2*ceiling(lcarea/3)):(lcarea-6)]],probs=0.75))
max_exonread = max(quantile(x[carea1],probs=0.5),quantile(x[carea2],probs=0.5));
if (max_exonread>readconstr) {
readconstr2=max(5,0.1*max_exonread)
} else {
readconstr2=readconstr
}
if (max_exonread>100) {
((y>=readconstr) & (y>=readconstr2))
} else {
((y>=readconstr) | (y>=readconstr2))
}
}
n2 = ncol(residualData)
nexon = nrow(exonset)
medianmat = matrix(0,nrow=(nexon-1),ncol=n2)
pdrate = matrix(0,nrow=(nexon-1),ncol=n2)
if (nexon > 1) {
for (i in 1:(nexon-1)) {
carea = c((exonset[i,3]+1):(exonset[(i+1),2]-1))
if (length(carea)>20) {
carea1 = c(exonset[i,2]:exonset[i,3])
carea2 = c(exonset[(i+1),2]:exonset[(i+1),3])
medianmat[i,] = apply(countData,2,intron_onstat_inner2)
pdrate[i,] = pd.rate.hy(apply(residualData[carea,],2,sum),qrsc=T);
}
}
}
intron_onstat = apply(pdrate*medianmat,2,max)
where_on = apply(pdrate*medianmat,2,which.max)
tMOD=matrix(0,nrow=nrow(countData),ncol=n2)
tNPS=matrix(0,nrow=n2,ncol=n2) # temp NPS
if (nexon > 1) {
for (j in 1:n2) {
carea=c((exonset[where_on[j],3]+1):(exonset[(where_on[j]+1),2]-1))
tMOD[carea,j] = 1/sqrt(length(carea))
tNPS[,j] = pdrate[where_on[j],]
}
}
return(list(stat=intron_onstat,tMOD=tMOD,tNPS=tNPS))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.