####################################################################
## Author: Gro Nilsen, Knut Liest?l and Ole Christian Lingj?rde.
## Maintainer: Gro Nilsen <gronilse@ifi.uio.no>
## License: Artistic 2.0
## Part of the copynumber package
## Reference: Nilsen and Liest?l et al. (2012), BMC Genomics
####################################################################
##Required by:
### none
##Requires:
### getMad
### getArms
### numericArms
### numericChrom
### pullOutContent
## Main function for multipcf-analysis to be called by the user
multipcf <- function(data,pos.unit="bp",arms=NULL,Y=NULL,gamma=40,normalize=TRUE,w=1,fast=TRUE,assembly="hg19",digits=4,return.est=FALSE,save.res=FALSE,file.names=NULL,verbose=TRUE){
#Check pos.unit input:
if(!pos.unit %in% c("bp","kbp","mbp")){
stop("'pos.unit' must be one of bp, kbp and mbp",call.=FALSE)
}
#Check assembly input:
if(!assembly %in% c("hg19","hg18","hg17","hg16","mm7","mm8","mm9","hg38","mm10")){
stop("assembly must be one of hg19, hg18, hg17 or hg16",call.=FALSE)
}
#Is data a file:
isfile.data <- class(data)=="character"
#Check data input:
if(!isfile.data){
#Input could come from winsorize and thus be a list; check and possibly retrieve data frame wins.data
data <- pullOutContent(data,what="wins.data")
stopifnot(ncol(data)>=4) #something is missing in input data
#Extract information from data:
chrom <- data[,1]
position <- data[,2]
nSample <- ncol(data)-2
sampleid <- colnames(data)[-c(1:2)]
}else{
#data is a datafile which contains data
f <- file(data,"r") #open file connection
head <- scan(f,nlines=1,what="character",quiet=TRUE,sep="\t") #Read header
if(length(head)<3){
stop("Data in file must have at least 3 columns",call.=FALSE)
}
sampleid <- head[-c(1:2)]
nSample <- length(sampleid)
#Read just the two first columns to get chrom and pos
chrom.pos <- read.table(file=data,sep="\t",header=TRUE,colClasses=c(rep(NA,2),rep("NULL",nSample)),as.is=TRUE)
chrom <- chrom.pos[,1]
position <- chrom.pos[,2]
}
#Make sure chrom is not factor:
if(is.factor(chrom)){
#If chrom is factor; convert to character
chrom <- as.character(chrom)
}
#Make sure chromosomes are numeric (replace X and Y by 23 and 24)
num.chrom <- numericChrom(chrom)
nProbe <- length(num.chrom)
#Make sure position is numeric:
if(!is.numeric(position)){
stop("input in data column 2 (posistions) must be numeric",call.=FALSE)
}
#Get character arms:
if(is.null(arms)){
arms <- getArms(num.chrom,position,pos.unit,get(assembly))
}else{
stopifnot(length(arms)==nProbe)
}
#Translate to numeric arms:
num.arms <- numericArms(num.chrom,arms)
#Unique arms:
arm.list <- unique(num.arms)
nArm <- length(arm.list)
#Check that w is same length as number of samples:
if(length(w)==1){
w <- rep(w,nSample)
}else if(length(w)!=nSample){
stop("'w' must be a single number or a vector of same length as the number of samples in 'data'",call.=FALSE)
}
#Check Y input:
if(!is.null(Y)){
stopifnot(class(Y)%in%c("matrix","data.frame","character"))
isfile.Y <- class(Y)=="character"
if(!isfile.Y){
ncol.Y <- ncol(Y)
nrow.Y <- nrow(Y)
}else{
f.y <- file(Y,"r")
ncol.Y <- length(scan(f.y,nlines=1,what="character",quiet=TRUE,sep="\t"))
nrow.Y <- nrow(read.table(file=Y,sep="\t",header=TRUE,colClasses=c(NA,rep("NULL",ncol.Y-1)),as.is=TRUE))
}
if(nrow.Y!=nProbe || ncol.Y!=nSample+2){
stop("Input Y does not represent the same number of probes and samples as found in input data",call.=FALSE)
}
}
#Create folders in working directory where results are saved:
#Initialize
seg.names <- c("chrom","arm","start.pos","end.pos","n.probes",sampleid)
mpcf.names <- c("chrom","pos",sampleid)
segments <- data.frame(matrix(nrow=0,ncol=nSample+5))
colnames(segments) <- seg.names
if(return.est){
mpcf.est <- matrix(nrow=0,ncol=nSample)
}
if(save.res){
if(is.null(file.names)){
#Create directory where results are to be saved
dir.res <- "multipcf_results"
if(!dir.res %in% dir()){
dir.create(dir.res)
}
file.names <- c(paste(dir.res,"/","estimates.txt",sep=""),paste(dir.res,"/","segments.txt",sep=""))
}else{
#Check that file.names is the correct length
if(length(file.names)<2){
stop("'file.names' must be of length 2", call.=FALSE)
}
}
}
#estimates must be returned from routines if return.est or save.res
yest <- any(return.est,save.res)
#If normalize=T, we will scale by the sample residual standard error. If the number of probes in the data set < 100K, the MAD sd-estimate is calculated using all obs for the sample:
sd <- rep(1,nSample) #to have a value to check in if-test later; not used otherwise. sd is only used if normalize=TRUE and then these values are replaced by MAD-sd
if(normalize && nProbe < 100000){
for(j in 1:nSample){
if(!isfile.data){
sample.data <- data[,j+2]
}else{
cc <- rep("NULL",nSample+2)
cc[j+2] <- "numeric"
#only read data for the j'th sample
sample.data <- read.table(file=data,sep="\t",header=TRUE,colClasses=cc)[,1]
}
sd[j] <- getMad(sample.data[!is.na(sample.data)],k=25) #Take out missing values before calculating mad
}
}
#Scale gamma according to the number of samples:
gamma <- gamma*nSample
#run multiPCF separately on each chromosomearm:
for(c in 1:nArm){
probe.c <- which(num.arms==arm.list[c])
pos.c <- position[probe.c]
nProbe.c <- length(probe.c)
#get data for this arm
if(!isfile.data){
arm.data <- data[probe.c,-c(1:2),drop=FALSE]
}else{
#Read data for this arm from file; since f is a opened connection, the reading will start on the next line which has not already been read
#two first columns skipped
arm.data <- read.table(f,nrows=nProbe.c,sep="\t",colClasses=c(rep("NULL",2),rep("numeric",nSample)))
}
#Check that there are no missing values:
if(any(is.na(arm.data))){
stop("multiPCF cannot be run because there are missing data values, see 'imputeMissing' for imputation of missing values")
}
#Make sure data is numeric:
if(any(!sapply(arm.data,is.numeric))){
stop("input in data columns 3 and onwards (copy numbers) must be numeric",call.=FALSE)
}
#If normalize=T and nProbe>=100K, we calculate the MAD sd-estimate for each sample using only obs in this arm
if(normalize && nProbe >= 100000){
sd <- apply(arm.data,2,getMad)
}
#Check sd; cannot normalize if sd=0 or if sd=NA:
if(any(sd==0) || any(is.na(sd))){
#not run multipcf, return mean for each sample:
m <- apply(arm.data,2,mean)
dim(m) <- c(length(m),1)
if(yest){
yhat <- sapply(m,rep,nrow(arm.data))
mpcf <- list(pcf=t(yhat),nIntervals=1,start0=1,length=nrow(arm.data),mean=m)
}else{
mpcf <- list(nIntervals=1,start0=1,length=nrow(arm.data),mean=m)
}
}else{
#normalize data data (sd=1 if normalize=FALSE)
arm.data <- sweep(arm.data,2,sd,"/")
#weight data (default weights is 1)
arm.data <- sweep(arm.data,2,w,"*")
#Run multipcf:
if(!fast || nrow(arm.data)<400){
mpcf <- doMultiPCF(as.matrix(t(arm.data)),gamma=gamma,yest=yest) #requires samples in rows, probes in columns
#note: returns samples in rows, estimates in columns.
}else{
mpcf <- selectFastMultiPcf(as.matrix(arm.data),gamma=gamma,L=15,yest=yest) #requires samples in columns, probes in rows
}
#"Unweight" estimates:
mpcf$mean <- sweep(mpcf$mean,1,w,"/")
if(yest){
mpcf$pcf <- sweep(mpcf$pcf,1,w,"/")
}
#"Un-normalize" estimates:
mpcf$mean <- sweep(mpcf$mean,1,sd,"*")
if(yest){
mpcf$pcf <- sweep(mpcf$pcf,1,sd,"*")
}
}
#Information about segments:
nSeg <- mpcf$nIntervals
start0 <- mpcf$start0
n.pos <- mpcf$length
seg.mean <- t(mpcf$mean) #get samples in columns
posStart <- pos.c[start0]
posEnd <- c(pos.c[start0-1],pos.c[nProbe.c])
#Chromosome number and character arm id:
chr <- unique(chrom[probe.c])
a <- unique(arms[probe.c])
chrid <- rep(chr,times=nSeg)
armid <- rep(a,times=nSeg)
#May use mean of input data or the observed data specified in Y:
if(!is.null(Y)){
#get Y for this arm
if(!isfile.Y){
arm.Y <- Y[probe.c,-c(1:2),drop=FALSE]
}else{
arm.Y <- read.table(f.y,nrows=length(probe.c),sep="\t",colClasses=c(rep("NULL",2),rep("numeric",nSample)))
}
#Make sure Y is numeric:
if(any(!sapply(arm.Y,is.numeric))){
stop("input in Y columns 3 and onwards (copy numbers) must be numeric",call.=FALSE)
}
#Use observed data to calculate segment mean (recommended)
seg.mean <- matrix(NA,nrow=nSeg,ncol=nSample)
for(s in 1:nSeg){
seg.Y <- as.matrix(arm.Y[start0[s]:(start0[s]+n.pos[s]-1),])
seg.mean[s,] <- apply(seg.Y,2,mean,na.rm=TRUE)
}
}
#Round
if(yest){
yhat <- round(mpcf$pcf,digits=digits)
}
seg.mean <- round(seg.mean,digits=digits)
#Data frame:
segments.c <- data.frame(chrid,armid,posStart,posEnd,n.pos,seg.mean,stringsAsFactors=FALSE)
colnames(segments.c) <- seg.names
#Should results be written to files or returned to user:
if(save.res){
if(c==1){
#open connection for writing to file
w1 <- file(file.names[1],"w")
w2 <- file(file.names[2],"w")
}
#Write segments to file for this arm
write.table(segments.c,file=w2,col.names=if(c==1) seg.names else FALSE,row.names=FALSE,quote=FALSE,sep="\t")
#Write estimated multiPCF-values file for this arm:
write.table(data.frame(chrom[probe.c],pos.c,t(yhat),stringsAsFactors=FALSE), file = w1,col.names=if(c==1) mpcf.names else FALSE,row.names=FALSE,quote=FALSE,sep="\t")
}
#Append results for this arm:
segments <- rbind(segments,segments.c)
if(return.est){
mpcf.est <- rbind(mpcf.est,t(yhat))
}
if(verbose){
cat(paste("multipcf finished for chromosome arm ",chr,a,sep=""),"\n")
}
}#endfor
#Close connections
if(isfile.data){
close(f)
}
if(!is.null(Y)){
if(isfile.Y){
close(f.y)
}
}
if(save.res){
cat(paste("multipcf-estimates were saved in file",file.names[1]),sep="\n")
close(w1)
cat(paste("segments were saved in file",file.names[2]),sep="\n")
close(w2)
}
#return results:
if(return.est){
mpcf.est <- data.frame(chrom,position,mpcf.est,stringsAsFactors=FALSE)
colnames(mpcf.est) <- mpcf.names
return(list(estimates=mpcf.est,segments=segments))
}else{
return(segments)
}
}#endfunction
#Run exact multipcf algorithm, to be called by multipcf (main function)
doMultiPCF <- function(y, gamma, yest) {
## y: input matrix of copy number estimates, samples in rows
## gamma: penalty for discontinuities
## yest: logical, should estimates be returned
N <- length(y)
nSamples <- nrow(y)
nProbes <- ncol(y)
## initialisations
if(yest){
yhat <- rep(0,N);
dim(yhat) <- c(nSamples,nProbes)
}
bestCost <- rep(0,nProbes)
bestSplit <- rep(0,nProbes+1)
bestAver <- rep(0,N)
dim(bestAver) <- c(nSamples,nProbes)
Sum <- rep(0,N)
dim(Sum) <- c(nSamples,nProbes)
Aver <- rep(0,N)
dim(Aver) <- c(nSamples,nProbes)
Nevner <- rep(0,N)
dim(Nevner) <- c(nProbes,nSamples)
Nevner <- t(Nevner+(nProbes:1))
eachCost <- rep(0,N)
dim(eachCost) <- c(nSamples,nProbes)
Cost <- rep(0,nProbes)
## Filling of first elements
y1<-y[ ,1]
Sum[ ,1]<-y1
Aver[ ,1]<-y1
bestCost[1]<-0
bestSplit[1]<-0
bestAver[ ,1]<-y1
helper <- rep(1,nSamples)
## Solving for gradually longer arrays. Sum accumulates
## values for errors for righthand plateau downward from n;
## this error added to gamma and the stored cost in bestCost
## give the total cost. Cost stores the total cost for breaks
## at any position below n, and which.min finds the position
## with lowest cost (best split). Aver is the average of the
## righthand plateau.
for (n in 2:nProbes) {
Sum[ ,1:n] <- Sum[ ,1:n]+y[,n]
Aver[ ,1:n] <- Sum[ ,1:n]/Nevner[ ,(nProbes-n+1):nProbes]
eachCost[ ,1:n] <- -(Sum[ ,1:n]*Aver[ ,1:n])
Cost[1:n] <- helper %*% eachCost[ ,1:n]
Cost[2:n] <- Cost[2:n]+bestCost[1:(n-1)]+gamma
Pos <- which.min(Cost[1:n])
bestCost[n] <- Cost[Pos]
bestAver[ ,n] <- Aver[ ,Pos]
bestSplit[n] <- Pos-1
}
## The final solution is found iteratively from the sequence
## of split positions stored in bestSplit and the averages
## for each plateau stored in bestAver
n <- nProbes
antInt <- 0
while (n > 0) {
if(yest){
yhat[ ,(bestSplit[n]+1):n] <- bestAver[ ,n]
}
n <- bestSplit[n]
antInt <- antInt+1
}
n <- nProbes
lengde <- rep(0,antInt)
start0 <- rep(0,antInt)
verdi <- rep(0,antInt*nSamples)
dim(verdi) <- c(nSamples,antInt)
oldSplit <- n
antall <- antInt
while (n > 0) {
start0[antall] <- bestSplit[n]+1
lengde[antall] <- oldSplit-bestSplit[n]
verdi[ ,antall] <- bestAver[ ,n]
n <- bestSplit[n]
oldSplit <- n
antall <- antall-1
}
if(yest){
return(list(pcf=yhat, length = lengde, start0 = start0, mean = verdi, nIntervals=antInt))
}else{
return(list(length = lengde, start0 = start0, mean = verdi, nIntervals=antInt))
}
}
## Choose fast multipcf version, called by multipcf (main function)
selectFastMultiPcf <- function(x,gamma,L,yest){
xLength <- nrow(x)
if (xLength< 1000) {
result<-runFastMultiPCF(x,gamma,L,0.15,0.15,yest)
} else {
if (xLength < 3000){
result<-runFastMultiPCF(x,gamma,L,0.12,0.10,yest)
} else {
if (xLength < 15000){
result<-runFastMultiPCF(x,gamma,L,0.12,0.05,yest)
} else {
result<-runMultiPcfSubset(x,gamma,L,0.12,0.05,yest)
}
}
}
return(result)
}
# Fast version 1, for moderately long sequences, called by selectFastMultiPcf
runFastMultiPCF <- function (x, gamma, L, frac1, frac2, yest) {
mark <- rep(0, nrow(x))
mark<-sawMarkM(x,L,frac1,frac2)
dense <- compactMulti(t(x), mark)
compPotts <- multiPCFcompact(dense$Nr, dense$Sum, gamma)
if (yest) {
potts <- expandMulti(nrow(x),ncol(x), compPotts$Lengde,compPotts$mean)
return(list(pcf = potts, length = compPotts$Lengde, start0 = compPotts$sta,
mean = compPotts$mean, nIntervals = compPotts$nIntervals))
} else {
return(list(length = compPotts$Lengde, start0 = compPotts$sta,
mean = compPotts$mean, nIntervals = compPotts$nIntervals))
}
}
# Fast version 2, for very long sequences, called by selectFastMultiPcf
runMultiPcfSubset <- function(x,gamma,L,frac1,frac2,yest){
SUBSIZE <- 5000 #length of subsets
antGen <- nrow(x)
mark <- sawMarkM(x,L,frac1,frac2)
markInit <- c(mark[1:(SUBSIZE-1)],TRUE)
compX <- compactMulti(t(x[1:SUBSIZE,]),markInit)
mark2 <- rep(FALSE,antGen)
mark2[1:SUBSIZE] <- markMultiPotts(compX$Nr,compX$Sum,gamma,SUBSIZE)
mark2[4*SUBSIZE/5] <- TRUE
start0 <- 4*SUBSIZE/5+1
while(start0 + SUBSIZE < antGen){
slutt <- start0+SUBSIZE-1
markSub <- c(mark2[1:(start0-1)],mark[start0:slutt])
markSub[slutt] <- TRUE
compX <- compactMulti(t(x[1:slutt,]),markSub)
mark2[1:slutt] <- markMultiPotts(compX$Nr,compX$Sum,gamma,slutt)
start0 <- start0+4*SUBSIZE/5
mark2[start0-1] <- TRUE
}
markSub <- c(mark2[1:(start0-1)],mark[start0:antGen])
compX <- compactMulti(t(x),markSub)
compPotts <- multiPCFcompact(compX$Nr,compX$Sum,gamma)
if (yest) {
potts <- expandMulti(nrow(x),ncol(x), compPotts$Lengde,compPotts$mean)
return(list(pcf = potts, length = compPotts$Lengde, start0 = compPotts$sta,
mean = compPotts$mean, nIntervals = compPotts$nIntervals))
} else {
return(list(length = compPotts$Lengde, start0 = compPotts$sta,
mean = compPotts$mean, nIntervals = compPotts$nIntervals))
}
}
# function that accumulates numbers of observations and sums between potential breakpoints
compactMulti <- function(y,mark){
antGen <- ncol(y)
antSample <- nrow(y)
antMark <- sum(mark)
ant <- rep(0,antMark)
sum <- rep(0,antMark*antSample)
dim(sum) <- c(antSample,antMark)
pos <- 1
oldPos <- 0
count <- 1
delSum <- rep(0,antSample)
while(pos <= antGen){
delSum <- 0
while (mark[pos] < 1){
delSum <- delSum + y[,pos]
pos <- pos+1
}
ant[count] <- pos-oldPos
sum[,count] <- delSum+y[,pos]
oldPos <- pos
pos <- pos+1
count <- count+1
}
list(Nr=ant,Sum=sum)
}
# main calculations for fast multipcf-versions
multiPCFcompact <- function(nr,sum,gamma) {
## nr,sum : numbers and sums for one analysis unit,
## typically one chromosomal arm. Samples assumed to be in rows.
## gamma: penalty for discontinuities
N <- length(nr)
nSamples <- nrow(sum)
## initialisations
yhat <- rep(0,N*nSamples);
dim(yhat) <- c(nSamples,N)
bestCost <- rep(0,N)
bestSplit <- rep(0,N+1)
bestAver <- rep(0,N*nSamples)
dim(bestAver) <- c(nSamples,N)
Sum <- rep(0,N*nSamples)
dim(Sum) <- c(nSamples,N)
Nevner <- rep(0,N*nSamples)
dim(Nevner) <- c(nSamples,N)
eachCost <- rep(0,N*nSamples)
dim(eachCost) <- c(nSamples,N)
Cost <- rep(0,N)
## Filling of first elements
Sum[ ,1]<-sum[,1]
Nevner[,1]<-nr[1]
bestSplit[1]<-0
bestAver[,1] <- sum[,1]/nr[1]
helper <- rep(1, nSamples)
bestCost[1]<-helper%*%(-Sum[,1]*bestAver[,1])
lengde <- rep(0,N)
## Solving for gradually longer arrays. Sum accumulates
## error values for righthand plateau downward from n;
## this error added to gamma and the stored cost in bestCost
## give the total cost. Cost stores the total cost for breaks
## at any position below n, and which.min finds the position
## with lowest cost (best split). Aver is the average of the
## righthand plateau.
for (n in 2:N) {
Sum[ ,1:n] <- Sum[ ,1:n]+sum[,n]
Nevner[,1:n] <- Nevner[,1:n]+nr[n]
eachCost[ ,1:n] <- -(Sum[ ,1:n]^2)/Nevner[ ,1:n]
Cost[1:n] <- helper %*% eachCost[, 1:n]
Cost[2:n] <- Cost[2:n]+bestCost[1:(n-1)]+gamma
Pos <- which.min(Cost[1:n])
cost <- Cost[Pos]
aver <- Sum[ ,Pos]/Nevner[,Pos]
bestCost[n] <- cost
bestAver[ ,n] <- aver
bestSplit[n] <- Pos-1
}
## The final solution is found iteratively from the sequence
## of split positions stored in bestSplit and the averages
## for each plateau stored in bestAver
n <- N
antInt <- 0
while (n > 0) {
yhat[ ,(bestSplit[n]+1):n] <- bestAver[ ,n]
antInt <- antInt+1
lengde[antInt] <- sum(nr[(bestSplit[n]+1):n])
n <- bestSplit[n]
}
lengdeRev <- lengde[antInt:1]
init <- rep(0,antInt)
init[1]<-1
if(antInt>=2){
for(k in 2:antInt){
init[k]<-init[k-1]+lengdeRev[k-1]
}
}
n <- N
verdi <- rep(0,antInt*nSamples)
dim(verdi) <- c(nSamples,antInt)
bestSplit[n+1] <- n
antall <- antInt
while (n > 0) {
verdi[ ,antall] <- bestAver[ ,n]
n <- bestSplit[n]
antall <- antall-1
}
list(Lengde = lengdeRev, sta = init, mean = verdi, nIntervals=antInt)
}
# helper function for fast version 2
markMultiPotts <- function(nr,sum,gamma,subsize) {
## nr,sum: numbers and sums for one analysis unit,
## typically one chromosomal arm. Samples assumed to be in rows.
## gamma: penalty for discontinuities
N <- length(nr)
nSamples <- nrow(sum)
markSub <- rep(FALSE,N)
bestCost <- rep(0,N)
bestSplit <- rep(0,N+1)
bestAver <- rep(0,N*nSamples)
dim(bestAver) <- c(nSamples,N)
Sum <- rep(0,N*nSamples)
dim(Sum) <- c(nSamples,N)
Nevner <- rep(0,N*nSamples)
dim(Nevner) <- c(nSamples,N)
eachCost <- rep(0,N*nSamples)
dim(eachCost) <- c(nSamples,N)
Cost <- rep(0,N)
## Filling of first elements
Sum[ ,1]<-sum[,1]
Nevner[,1]<-nr[1]
bestSplit[1]<-0
bestAver[,1] <- sum[,1]/nr[1]
helper <- rep(1, nSamples)
bestCost[1]<-helper%*%(-Sum[,1]*bestAver[,1])
lengde <- rep(0,N)
for (n in 2:N) {
Sum[ ,1:n] <- Sum[ ,1:n]+sum[,n]
Nevner[,1:n] <- Nevner[,1:n]+nr[n]
eachCost[ ,1:n] <- -(Sum[ ,1:n]^2)/Nevner[ ,1:n]
Cost[1:n] <- helper %*% eachCost[, 1:n]
Cost[2:n] <- Cost[2:n]+bestCost[1:(n-1)]+gamma
Pos <- which.min(Cost[1:n])
cost <- Cost[Pos]
aver <- Sum[ ,Pos]/Nevner[,Pos]
bestCost[n] <- cost
bestAver[ ,n] <- aver
bestSplit[n] <- Pos-1
markSub[Pos-1] <- TRUE
}
help<-findMarksMulti(markSub,nr,subsize)
return(help)
}
# Function which finds potential breakpoints
findMarksMulti <- function(markSub,Nr,subsize){
## markSub: marks in compressed scale
## NR: number of observations between potenstial breakpoints
mark <- rep(FALSE,subsize) ## marks in original scale
if(sum(markSub)<1) {
return(mark)
} else {
N<-length(markSub)
ant <- seq(1:N)
help <- ant[markSub]
lengdeHelp <- length(help)
help0 <- c(0,help[1:(lengdeHelp-1)])
lengde <- help-help0
start0<-1
oldStart<-1
startOrig<-1
for(i in 1:lengdeHelp){
start0 <- start0+lengde[i]
lengdeOrig <- sum(Nr[oldStart:(start0-1)])
startOrig <- startOrig+lengdeOrig
mark[startOrig-1]<-TRUE
oldStart<-start0
}
return(mark)
}
}
## expand compact solution
expandMulti <- function(nProbes,nSamples,lengthInt, mean){
##input: nr of probes, length of intervals,
## value in intervals; returns the expansion
Potts <- rep(0,nProbes*nSamples)
dim(Potts) <- c(nSamples,nProbes)
lengthCompArr <- length(lengthInt)
k <- 1
for(i in 1:lengthCompArr){
for(j in 1:lengthInt[i]){
Potts[,k] <- mean[,i]
k <- k+1
}
}
return(Potts)
}
## sawtooth-filter for multiPCF - marks potential breakpoints. Uses two
## sawtoothfilters, one lang (length L) and one short (fixed length 6)
sawMarkM <- function(x,L,frac1,frac2){
nrProbes <- nrow(x)
nrSample <- ncol(x)
mark <- rep(0,nrProbes)
sawValue <- rep(0,nrProbes)
filter <- rep(0,2*L)
sawValue2 <- rep(0,nrProbes)
filter2 <- rep(0,6)
for (k in 1:L) {
filter[k] <- k/L
filter[2*L+1-k] <- -k/L
}
for (k in 1:3) {
filter2[k] <- k/3
filter2[7-k] <- -k/3
}
for (l in 1:(nrProbes-2*L+1)){
for (m in 1:nrSample){
diff=crossprod(filter,x[l:(l+2*L-1),m])
sawValue[l+L-1]<-sawValue[l+L-1]+abs(diff)
}
}
limit <- quantile(sawValue,(1-frac1))
for (l in 1:(nrProbes-2*L)){
if (sawValue[l+L-1] > limit) {
mark[l+L-1] <- 1
}
}
for (l in (L-1):(nrProbes-L-2)){
for (m in 1:nrSample){
diff2=crossprod(filter2,x[l:(l+5),m])
sawValue2[l+2]<-sawValue2[l+2]+abs(diff2)
}
}
limit2 <- quantile(sawValue2,(1-frac2))
for (l in (L-1):(nrProbes-L-2)){
if (sawValue2[l+2] > limit2) {
mark[l+2] <- 1
}
}
for (l in 1:L){
mark[l] <- 1
mark[nrProbes+1-l]<- 1
}
return(mark)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.