####################################################################
## Author: Gro Nilsen, Knut Liestøl and Ole Christian Lingjærde.
## Maintainer: Gro Nilsen <gronilse@ifi.uio.no>
## License: Artistic 2.0
## Part of the copynumber package
## Reference: Nilsen and Liestøl et al. (2012), BMC Genomics
####################################################################
##Required by:
### none
##Requires:
### getMad
### getArms
### numericArms
### numericChrom
### pullOutContent
## Main function for multipcf-analysis to be called by the user
## Main function for multipcf analysis, to be called by the user.
## Fits a piecewise constant curve with breakpoints common to all samples.
##
## Args:
##   data: data frame/matrix with chromosomes in column 1, positions in
##         column 2 and one column of copy number values per sample, OR the
##         path (character) to a tab-separated file with header & same layout.
##   pos.unit: unit for positions; one of "bp", "kbp", "mbp".
##   arms: optional character vector of chromosome arms (one per probe);
##         looked up from the assembly when NULL.
##   Y: optional matrix/data frame/file of observed data (same layout and
##      dimension as data) used when computing segment means.
##   gamma: penalty for each discontinuity in the curve.
##   normalize: scale each sample by its MAD-based residual standard error?
##   w: weight(s) per sample; single number or vector of length = #samples.
##   fast: use the approximate algorithm for arms with >= 400 probes?
##   assembly: genome build used for arm lookup ("hg19", ..., "mm9").
##   digits: number of decimals used when rounding estimates.
##   return.est: also return probe-level estimates?
##   save.res: write estimates/segments to files?
##   file.names: length-2 vector of output file names (estimates, segments).
##   verbose: print progress per chromosome arm?
##
## Returns: data frame of segments, or list(estimates, segments) if return.est.
multipcf <- function(data,pos.unit="bp",arms=NULL,Y=NULL,gamma=40,normalize=TRUE,w=1,fast=TRUE,assembly="hg19",digits=4,return.est=FALSE,save.res=FALSE,file.names=NULL,verbose=TRUE){
  #Check pos.unit input:
  if(!pos.unit %in% c("bp","kbp","mbp")){
    stop("'pos.unit' must be one of bp, kbp and mbp",call.=FALSE)
  }
  #Check assembly input:
  if(!assembly %in% c("hg19","hg18","hg17","hg16","mm7","mm8","mm9")){
    stop("assembly must be one of hg19, hg18, hg17, hg16, mm7, mm8, or mm9",call.=FALSE)
  }
  #Is data a file? is.character() is safe for matrix input, unlike
  #class(data)=="character" which has length 2 for matrices in R >= 4.0:
  isfile.data <- is.character(data)
  #Check data input:
  if(!isfile.data){
    #Input could come from winsorize and thus be a list; check and possibly retrieve data frame wins.data
    data <- pullOutContent(data,what="wins.data")
    stopifnot(ncol(data)>=4) #chrom + pos + at least two samples required
    #Extract information from data:
    chrom <- data[,1]
    position <- data[,2]
    nSample <- ncol(data)-2
    sampleid <- colnames(data)[-c(1:2)]
  }else{
    #data is a datafile which contains data
    f <- file(data,"r") #open file connection
    head <- scan(f,nlines=1,what="character",quiet=TRUE,sep="\t") #Read header
    if(length(head)<3){
      stop("Data in file must have at least 3 columns",call.=FALSE)
    }
    sampleid <- head[-c(1:2)]
    nSample <- length(sampleid)
    #Read just the two first columns to get chrom and pos
    chrom.pos <- read.table(file=data,sep="\t",header=TRUE,colClasses=c(rep(NA,2),rep("NULL",nSample)),as.is=TRUE)
    chrom <- chrom.pos[,1]
    position <- chrom.pos[,2]
  }
  #Make sure chrom is not factor:
  if(is.factor(chrom)){
    #If chrom is factor; convert to character
    chrom <- as.character(chrom)
  }
  #Make sure chromosomes are numeric (replace X and Y by 23 and 24)
  num.chrom <- numericChrom(chrom)
  nProbe <- length(num.chrom)
  #Make sure position is numeric:
  if(!is.numeric(position)){
    stop("input in data column 2 (positions) must be numeric",call.=FALSE)
  }
  #Get character arms:
  if(is.null(arms)){
    arms <- getArms(num.chrom,position,pos.unit,get(assembly))
  }else{
    stopifnot(length(arms)==nProbe)
  }
  #Translate to numeric arms:
  num.arms <- numericArms(num.chrom,arms)
  #Unique arms:
  arm.list <- unique(num.arms)
  nArm <- length(arm.list)
  #Check that w is same length as number of samples:
  if(length(w)==1){
    w <- rep(w,nSample)
  }else if(length(w)!=nSample){
    stop("'w' must be a single number or a vector of same length as the number of samples in 'data'",call.=FALSE)
  }
  #Check Y input:
  if(!is.null(Y)){
    #inherits() is TRUE if Y matches any of the classes; robust for matrices
    #in R >= 4.0 where class(Y) returns c("matrix","array"):
    stopifnot(inherits(Y,c("matrix","data.frame","character")))
    isfile.Y <- is.character(Y)
    if(!isfile.Y){
      ncol.Y <- ncol(Y)
      nrow.Y <- nrow(Y)
    }else{
      f.y <- file(Y,"r")
      ncol.Y <- length(scan(f.y,nlines=1,what="character",quiet=TRUE,sep="\t"))
      nrow.Y <- nrow(read.table(file=Y,sep="\t",header=TRUE,colClasses=c(NA,rep("NULL",ncol.Y-1)),as.is=TRUE))
    }
    if(nrow.Y!=nProbe || ncol.Y!=nSample+2){
      stop("Input Y does not represent the same number of probes and samples as found in input data",call.=FALSE)
    }
  }
  #Initialize result containers:
  seg.names <- c("chrom","arm","start.pos","end.pos","n.probes",sampleid)
  mpcf.names <- c("chrom","pos",sampleid)
  segments <- data.frame(matrix(nrow=0,ncol=nSample+5))
  colnames(segments) <- seg.names
  if(return.est){
    mpcf.est <- matrix(nrow=0,ncol=nSample)
  }
  if(save.res){
    if(is.null(file.names)){
      #Create directory in working directory where results are to be saved:
      dir.res <- "multipcf_results"
      if(!dir.res %in% dir()){
        dir.create(dir.res)
      }
      file.names <- c(paste(dir.res,"/","estimates.txt",sep=""),paste(dir.res,"/","segments.txt",sep=""))
    }else{
      #Check that file.names is the correct length
      if(length(file.names)<2){
        stop("'file.names' must be of length 2", call.=FALSE)
      }
    }
  }
  #estimates must be returned from routines if return.est or save.res
  yest <- any(return.est,save.res)
  #If normalize=TRUE we scale by the sample residual standard error. If the
  #number of probes is < 100K, the MAD sd-estimate is calculated using all
  #observations for the sample; otherwise it is computed per arm further below.
  sd <- rep(1,nSample) #default 1 => no scaling; replaced by MAD-sd if normalize=TRUE
  if(normalize && nProbe < 100000){
    for(j in 1:nSample){
      if(!isfile.data){
        sample.data <- data[,j+2]
      }else{
        cc <- rep("NULL",nSample+2)
        cc[j+2] <- "numeric"
        #only read data for the j'th sample
        sample.data <- read.table(file=data,sep="\t",header=TRUE,colClasses=cc)[,1]
      }
      #Take out missing values before calculating mad:
      sample.noNAdata <- sample.data[!is.na(sample.data)]
      if(length(sample.noNAdata)==0){
        sd[j] <- 1
      }else{
        sd[j] <- getMad(sample.noNAdata,k=25)
      }
    }
  }
  #Note: gamma is deliberately NOT scaled by the number of samples. Doing so
  #would mean adding a sample gives fewer breakpoints, which is only right if
  #every breakpoint is expected to be present in *all* samples.
  #Run multiPCF separately on each chromosome arm:
  for(i in 1:nArm){
    probe.c <- which(num.arms==arm.list[i])
    pos.c <- position[probe.c]
    nProbe.c <- length(probe.c)
    #get data for this arm
    if(!isfile.data){
      arm.data <- data[probe.c,-c(1:2),drop=FALSE]
    }else{
      #Read data for this arm from file; since f is an open connection, the
      #reading continues on the first line not already read (header consumed above).
      #Two first columns skipped:
      arm.data <- read.table(f,nrows=nProbe.c,sep="\t",colClasses=c(rep("NULL",2),rep("numeric",nSample)))
    }
    #Make sure data is numeric:
    if(any(!sapply(arm.data,is.numeric))){
      stop("input in data columns 3 and onwards (copy numbers) must be numeric",call.=FALSE)
    }
    #If normalize=TRUE and nProbe>=100K, compute the MAD sd-estimate per sample
    #using only the observations in this arm:
    if(normalize && nProbe >= 100000){
      sd <- apply(arm.data,2,getMad)
    }
    #Check sd; cannot normalize if sd=0 or if sd=NA:
    if(any(sd==0) || any(is.na(sd))){
      #do not run multipcf; return one segment with the mean for each sample:
      m <- apply(arm.data,2,mean)
      dim(m) <- c(length(m),1)
      if(yest){
        yhat <- sapply(m,rep,nrow(arm.data))
        mpcf <- list(pcf=t(yhat),nIntervals=1,start0=1,length=nrow(arm.data),mean=m)
      }else{
        mpcf <- list(nIntervals=1,start0=1,length=nrow(arm.data),mean=m)
      }
    }else{
      #normalize data (sd=1 if normalize=FALSE)
      arm.data <- sweep(arm.data,2,sd,"/")
      #weight data (default weights is 1)
      arm.data <- sweep(arm.data,2,w,"*")
      #Swap initial-NA probes with the first non-NA probe for each sample, so
      #that the algorithms see a non-missing first value:
      for (s in 1:ncol(arm.data)) {
        snp <- 2
        while (is.na(arm.data[1,s]) && snp<nrow(arm.data)) {
          if(!is.na(arm.data[snp,s])) {
            arm.data[1,s] <- arm.data[snp,s]
            arm.data[snp,s] <- NA
          }
          snp <- snp+1
        }
      }
      #Run multipcf: exact algorithm for small arms, fast approximation otherwise
      if(!fast || nrow(arm.data)<400){
        mpcf <- doMultiPCF(as.matrix(t(arm.data)),gamma=gamma,yest=yest) #requires samples in rows, probes in columns
        #note: returns samples in rows, estimates in columns.
      }else{
        mpcf <- selectFastMultiPcf(as.matrix(arm.data),gamma=gamma,L=15,yest=yest) #requires samples in columns, probes in rows
      }
      #"Unweight" estimates:
      mpcf$mean <- sweep(mpcf$mean,1,w,"/")
      if(yest){
        mpcf$pcf <- sweep(mpcf$pcf,1,w,"/")
      }
      #"Un-normalize" estimates:
      mpcf$mean <- sweep(mpcf$mean,1,sd,"*")
      if(yest){
        mpcf$pcf <- sweep(mpcf$pcf,1,sd,"*")
      }
    }
    #Information about segments:
    nSeg <- mpcf$nIntervals
    start0 <- mpcf$start0
    n.pos <- mpcf$length
    seg.mean <- t(mpcf$mean) #get samples in columns
    posStart <- pos.c[start0]
    posEnd <- c(pos.c[start0-1],pos.c[nProbe.c])
    #Chromosome number and character arm id:
    chr <- unique(chrom[probe.c])
    a <- unique(arms[probe.c])
    chrid <- rep(chr,times=nSeg)
    armid <- rep(a,times=nSeg)
    #May use mean of input data or the observed data specified in Y:
    if(!is.null(Y)){
      #get Y for this arm
      if(!isfile.Y){
        arm.Y <- Y[probe.c,-c(1:2),drop=FALSE]
      }else{
        arm.Y <- read.table(f.y,nrows=length(probe.c),sep="\t",colClasses=c(rep("NULL",2),rep("numeric",nSample)))
      }
      #Make sure Y is numeric:
      if(any(!sapply(arm.Y,is.numeric))){
        stop("input in Y columns 3 and onwards (copy numbers) must be numeric",call.=FALSE)
      }
      #Use observed data to calculate segment mean (recommended):
      seg.mean <- matrix(NA,nrow=nSeg,ncol=nSample)
      for(s in 1:nSeg){
        seg.Y <- as.matrix(arm.Y[start0[s]:(start0[s]+n.pos[s]-1),])
        seg.mean[s,] <- apply(seg.Y,2,mean,na.rm=TRUE)
      }
    }
    #Round:
    if(yest){
      yhat <- round(mpcf$pcf,digits=digits)
    }
    seg.mean <- round(seg.mean,digits=digits)
    #Data frame:
    segments.c <- data.frame(chrid,armid,posStart,posEnd,n.pos,seg.mean,stringsAsFactors=FALSE)
    colnames(segments.c) <- seg.names
    #Should results be written to files or returned to user:
    if(save.res){
      if(i==1){
        #open connections for writing to file
        w1 <- file(file.names[1],"w")
        w2 <- file(file.names[2],"w")
      }
      #Write segments to file for this arm
      write.table(segments.c,file=w2,col.names=if(i==1) seg.names else FALSE,row.names=FALSE,quote=FALSE,sep="\t")
      #Write estimated multiPCF-values file for this arm:
      write.table(data.frame(chrom[probe.c],pos.c,t(yhat),stringsAsFactors=FALSE), file = w1,col.names=if(i==1) mpcf.names else FALSE,row.names=FALSE,quote=FALSE,sep="\t")
    }
    #Append results for this arm:
    segments <- rbind(segments,segments.c)
    if(return.est){
      mpcf.est <- rbind(mpcf.est,t(yhat))
    }
    if(verbose){
      cat(paste("multipcf finished for chromosome arm ",chr,a,sep=""),"\n")
    }
  }#endfor
  #Close connections
  if(isfile.data){
    close(f)
  }
  if(!is.null(Y)){
    if(isfile.Y){
      close(f.y)
    }
  }
  if(save.res){
    cat(paste("multipcf-estimates were saved in file",file.names[1]),sep="\n")
    close(w1)
    cat(paste("segments were saved in file",file.names[2]),sep="\n")
    close(w2)
  }
  #return results:
  if(return.est){
    mpcf.est <- data.frame(chrom,position,mpcf.est,stringsAsFactors=FALSE)
    colnames(mpcf.est) <- mpcf.names
    return(list(estimates=mpcf.est,segments=segments))
  }else{
    return(segments)
  }
}#endfunction
#Run exact multipcf algorithm, to be called by multipcf (main function)
doMultiPCF <- function(y, gamma, yest) {
## Exact dynamic-programming solution of the multi-sample PCF problem:
## fits a piecewise constant curve with shared breakpoints, penalizing each
## discontinuity by gamma. Missing values contribute neither to sums nor
## to the denominators used for segment averages.
## y: input matrix of copy number estimates, samples in rows
## gamma: penalty for discontinuities
## yest: logical, should estimates be returned
## Returns: list with (optionally) pcf = fitted values (samples in rows),
## length = #probes per interval, start0 = interval start indices,
## mean = interval means (samples in rows), nIntervals = #intervals.
N <- length(y)
## N is the total number of matrix cells (nSamples*nProbes); used for allocation
nSamples <- nrow(y)
nProbes <- ncol(y)
## initialisations
if(yest){
yhat <- rep(0,N);
dim(yhat) <- c(nSamples,nProbes)
}
bestCost <- rep(0,nProbes)
bestSplit <- rep(0,nProbes+1)
bestAver <- rep(0,N)
dim(bestAver) <- c(nSamples,nProbes)
Sum <- rep(0,N)
dim(Sum) <- c(nSamples,nProbes)
Aver <- rep(0,N)
dim(Aver) <- c(nSamples,nProbes)
#We create Nevner such that it counts the number of probes *for which we have data*, so that the averages are calculated correctly.
Nevner <- rep(0,N)
dim(Nevner) <- c(nSamples,nProbes)
Nevner[,1] <- 1*!is.na(y[,1])
eachCost <- rep(0,N)
dim(eachCost) <- c(nSamples,nProbes)
Cost <- rep(0,nProbes)
## Filling of first elements
## NAs are replaced by 0 so they drop out of the sums; Nevner keeps the
## true per-sample observation counts.
ynoNA <- y
ynoNA[is.na(ynoNA)] <- 0
y1<-ynoNA[ ,1]
Sum[ ,1]<-y1
Aver[ ,1]<-y1
bestCost[1]<-0
bestSplit[1]<-0
bestAver[ ,1]<-y1
## helper is used to sum costs over samples via matrix multiplication
helper <- rep(1,nSamples)
## Solving for gradually longer arrays. Sum accumulates
## values for errors for righthand plateau downward from n;
## this error added to gamma and the stored cost in bestCost
## give the total cost. Cost stores the total cost for breaks
## at any position below n, and which.min finds the position
## with lowest cost (best split). Aver is the average of the
## righthand plateau.
if (nProbes>1) {
for (n in 2:nProbes) {
Sum[ ,1:n] <- Sum[ ,1:n]+ynoNA[,n]
Nevner[,1:n] <- Nevner[,1:n]+!is.na(y[,n])
## Make sure we don't divide by zero by cheating on the ends a little:
divisor <- Nevner[ , 1:n]
divisor[divisor==0] <- 1
Aver[ ,1:n] <- Sum[ ,1:n]/divisor
## Per-sample negative "explained" cost of the righthand plateau:
eachCost[ ,1:n] <- -(Sum[ ,1:n]*Aver[ ,1:n])
Cost[1:n] <- helper %*% eachCost[ ,1:n]
Cost[2:n] <- Cost[2:n]+bestCost[1:(n-1)]+gamma
Pos <- which.min(Cost[1:n])
bestCost[n] <- Cost[Pos]
bestAver[ ,n] <- Aver[ ,Pos]
bestSplit[n] <- Pos-1
}
}
## The final solution is found iteratively from the sequence
## of split positions stored in bestSplit and the averages
## for each plateau stored in bestAver
## First pass: count the number of intervals (and optionally fill yhat).
n <- nProbes
antInt <- 0
while (n > 0) {
if(yest){
yhat[ ,(bestSplit[n]+1):n] <- bestAver[ ,n]
}
n <- bestSplit[n]
antInt <- antInt+1
}
## Second pass: record start index, length and mean of each interval,
## filling the result arrays from the back.
n <- nProbes
lengde <- rep(0,antInt)
start0 <- rep(0,antInt)
verdi <- rep(0,antInt*nSamples)
dim(verdi) <- c(nSamples,antInt)
oldSplit <- n
antall <- antInt
while (n > 0) {
start0[antall] <- bestSplit[n]+1
lengde[antall] <- oldSplit-bestSplit[n]
verdi[ ,antall] <- bestAver[ ,n]
n <- bestSplit[n]
oldSplit <- n
antall <- antall-1
}
if(yest){
return(list(pcf=yhat, length = lengde, start0 = start0, mean = verdi, nIntervals=antInt))
}else{
return(list(length = lengde, start0 = start0, mean = verdi, nIntervals=antInt))
}
}
## Choose fast multipcf version, called by multipcf (main function)
## Choose which fast multipcf routine to run based on the number of probes.
## x: data matrix, probes in rows and samples in columns
## gamma: discontinuity penalty; L: sawtooth filter half-length
## yest: logical, should probe-level estimates be returned
## Shorter sequences use progressively stricter mark fractions; very long
## sequences are handled by the subset-based variant.
selectFastMultiPcf <- function(x,gamma,L,yest){
  n <- nrow(x)
  if (n < 1000) {
    runFastMultiPCF(x, gamma, L, 0.15, 0.15, yest)
  } else if (n < 3000) {
    runFastMultiPCF(x, gamma, L, 0.12, 0.10, yest)
  } else if (n < 15000) {
    runFastMultiPCF(x, gamma, L, 0.12, 0.05, yest)
  } else {
    runMultiPcfSubset(x, gamma, L, 0.12, 0.05, yest)
  }
}
# Fast version 1, for moderately long sequences, called by selectFastMultiPcf
## Fast multipcf version 1, for moderately long sequences.
## x: data matrix, probes in rows and samples in columns
## gamma: discontinuity penalty; L: sawtooth filter half-length
## frac1/frac2: fractions of probes marked by the long/short sawtooth filters
## yest: logical, should probe-level estimates be returned
## Marks candidate breakpoints, compacts the data between marks, solves the
## compact problem exactly, and (optionally) expands the fit back to probes.
runFastMultiPCF <- function (x, gamma, L, frac1, frac2, yest) {
  #(removed dead initial "mark <- rep(0, nrow(x))": value was immediately overwritten)
  mark <- sawMarkM(x, L, frac1, frac2)
  dense <- compactMulti(t(x), mark)
  compPotts <- multiPCFcompact(dense$Nr, dense$Sum, dense$Nindex, gamma)
  if (yest) {
    potts <- expandMulti(nrow(x), ncol(x), compPotts$Lengde, compPotts$mean)
    return(list(pcf = potts, length = compPotts$Lengde, start0 = compPotts$sta,
                mean = compPotts$mean, nIntervals = compPotts$nIntervals))
  } else {
    return(list(length = compPotts$Lengde, start0 = compPotts$sta,
                mean = compPotts$mean, nIntervals = compPotts$nIntervals))
  }
}
# Fast version 2, for very long sequences, called by selectFastMultiPcf
runMultiPcfSubset <- function(x,gamma,L,frac1,frac2,yest){
## Fast multipcf version 2, for very long sequences. Processes the data in
## overlapping windows of SUBSIZE probes, advancing 4/5 of a window at a time;
## marks found in earlier windows are kept and refined in later ones, and a
## final full-length compact solve produces the result.
## x: data matrix, probes in rows, samples in columns
## gamma: discontinuity penalty; L: sawtooth filter half-length
## frac1/frac2: mark fractions for the long/short sawtooth filters
## yest: logical, should probe-level estimates be returned
SUBSIZE <- 5000 #length of subsets
antGen <- nrow(x)
mark <- sawMarkM(x,L,frac1,frac2)
## First window: force a mark at the window end so compaction terminates there
markInit <- c(mark[1:(SUBSIZE-1)],TRUE)
compX <- compactMulti(t(x[1:SUBSIZE,]),markInit)
mark2 <- rep(FALSE,antGen)
mark2[1:SUBSIZE] <- markMultiPotts(compX$Nr,compX$Sum,compX$Nindex,gamma,SUBSIZE)
## Force a mark at 4/5 of the window; subsequent windows start just after it
mark2[4*SUBSIZE/5] <- TRUE
start0 <- 4*SUBSIZE/5+1
while(start0 + SUBSIZE < antGen){
slutt <- start0+SUBSIZE-1
## Keep refined marks before start0, raw sawtooth marks within the window
markSub <- c(mark2[1:(start0-1)],mark[start0:slutt])
markSub[slutt] <- TRUE
compX <- compactMulti(t(x[1:slutt,]),markSub)
mark2[1:slutt] <- markMultiPotts(compX$Nr,compX$Sum,compX$Nindex,gamma,slutt)
start0 <- start0+4*SUBSIZE/5
mark2[start0-1] <- TRUE
}
## Final pass over the whole sequence with the accumulated marks
markSub <- c(mark2[1:(start0-1)],mark[start0:antGen])
compX <- compactMulti(t(x),markSub)
compPotts <- multiPCFcompact(compX$Nr,compX$Sum,compX$Nindex,gamma)
if (yest) {
potts <- expandMulti(nrow(x),ncol(x), compPotts$Lengde,compPotts$mean)
return(list(pcf = potts, length = compPotts$Lengde, start0 = compPotts$sta,
mean = compPotts$mean, nIntervals = compPotts$nIntervals))
} else {
return(list(length = compPotts$Lengde, start0 = compPotts$sta,
mean = compPotts$mean, nIntervals = compPotts$nIntervals))
}
}
# function that accumulates numbers of observations and sums between potential breakpoints
## Accumulate observation counts and sums between potential breakpoints.
## y: data matrix with samples in rows and probes in columns (may contain NA)
## mark: per-probe indicator (>=1 closes a segment); the last probe is
##       expected to be marked by the callers.
## Returns: list(Nr = per-sample non-NA counts per segment,
##               Sum = per-sample sums per segment (NAs counted as 0),
##               Nindex = number of probes collapsed into each segment).
compactMulti <- function(y,mark){
  yzero <- y
  yzero[is.na(yzero)] <- 0   # NAs contribute 0 to sums
  nProbes <- ncol(y)
  nSamples <- nrow(y)
  nSeg <- sum(mark)
  counts <- matrix(0, nrow = nSamples, ncol = nSeg)
  segSum <- matrix(0, nrow = nSamples, ncol = nSeg)
  segLen <- rep(1, nSeg)
  p <- 1
  seg <- 1
  while (p <= nProbes) {
    running <- 0
    obs <- 1 * !is.na(y[, p])
    # consume unmarked probes until the segment-closing mark is reached
    while (mark[p] < 1) {
      running <- running + yzero[, p]
      segLen[seg] <- segLen[seg] + 1
      p <- p + 1
      obs <- obs + !is.na(y[, p])
    }
    counts[, seg] <- obs
    segSum[, seg] <- running + yzero[, p]
    p <- p + 1
    seg <- seg + 1
  }
  list(Nr = counts, Sum = segSum, Nindex = segLen)
}
# main calculations for fast multipcf-versions
## Core dynamic program for the fast multipcf versions, run on compacted data
## (one column per potential segment between marked breakpoints).
## nr, sum: per-sample non-NA counts and sums for one analysis unit
##          (typically one chromosomal arm); samples in rows.
## nindex: number of original probes collapsed into each compact position.
## gamma: penalty for discontinuities.
## Returns: list(Lengde = #probes per interval, sta = interval start indices
##   (probe scale), mean = interval means with samples in rows,
##   nIntervals = number of intervals).
multiPCFcompact <- function(nr,sum,nindex,gamma) {
  N <- ncol(sum)        # number of compact positions
  nSamples <- nrow(sum)
  ## initialisations
  yhat <- rep(0,N*nSamples);
  dim(yhat) <- c(nSamples,N)
  bestCost <- rep(0,N)
  bestSplit <- rep(0,N+1)
  bestAver <- rep(0,N*nSamples)
  dim(bestAver) <- c(nSamples,N)
  Sum <- rep(0,N*nSamples)
  dim(Sum) <- c(nSamples,N)
  Nevner <- rep(0,N*nSamples)  # per-sample observation counts (denominators)
  dim(Nevner) <- c(nSamples,N)
  eachCost <- rep(0,N*nSamples)
  dim(eachCost) <- c(nSamples,N)
  Cost <- rep(0,N)
  ## Filling of first elements
  Sum[ ,1]<-sum[,1]
  Nevner[,1]<-nr[,1]
  bestSplit[1]<-0
  #Bug fix: use the full first column nr[,1] (per-sample counts), not the
  #scalar nr[1], which gave wrong averages/costs for >1 sample (markMultiPotts
  #already does this correctly):
  bestAver[,1] <- sum[,1]/nr[,1]
  helper <- rep(1, nSamples)  # sums costs over samples via matrix product
  bestCost[1]<-helper%*%(-Sum[,1]*bestAver[,1])
  lengde <- rep(0,N)
  ## Solving for gradually longer arrays. Sum accumulates
  ## error values for righthand plateau downward from n;
  ## this error added to gamma and the stored cost in bestCost
  ## give the total cost. Cost stores the total cost for breaks
  ## at any position below n, and which.min finds the position
  ## with lowest cost (best split). Aver is the average of the
  ## righthand plateau.
  if (N>1) {
    for (n in 2:N) {
      Sum[ ,1:n] <- Sum[ ,1:n]+sum[,n]
      Nevner[,1:n] <- Nevner[,1:n]+nr[,n]
      eachCost[ ,1:n] <- -(Sum[ ,1:n]^2)/Nevner[ ,1:n]
      eachCost[is.na(eachCost[,1:n])] <- 0  # 0/0 when no observations: no cost
      Cost[1:n] <- helper %*% eachCost[, 1:n]
      Cost[2:n] <- Cost[2:n]+bestCost[1:(n-1)]+gamma
      Pos <- which.min(Cost[1:n])
      cost <- Cost[Pos]
      aver <- Sum[ ,Pos]/Nevner[,Pos]
      bestCost[n] <- cost
      bestAver[ ,n] <- aver
      bestSplit[n] <- Pos-1
    }
  }
  ## The final solution is found iteratively from the sequence
  ## of split positions stored in bestSplit and the averages
  ## for each plateau stored in bestAver
  n <- N
  antInt <- 0
  while (n > 0) {
    yhat[ ,(bestSplit[n]+1):n] <- bestAver[ ,n]
    antInt <- antInt+1
    lengde[antInt] <- sum(nindex[(bestSplit[n]+1):n])  # length in probe scale
    n <- bestSplit[n]
  }
  lengdeRev <- lengde[antInt:1]
  ## Cumulative start positions in probe scale:
  init <- rep(0,antInt)
  init[1]<-1
  if(antInt>=2){
    for(k in 2:antInt){
      init[k]<-init[k-1]+lengdeRev[k-1]
    }
  }
  ## Collect interval means, filling from the back:
  n <- N
  verdi <- rep(0,antInt*nSamples)
  dim(verdi) <- c(nSamples,antInt)
  bestSplit[n+1] <- n
  antall <- antInt
  while (n > 0) {
    verdi[ ,antall] <- bestAver[ ,n]
    n <- bestSplit[n]
    antall <- antall-1
  }
  list(Lengde = lengdeRev, sta = init, mean = verdi, nIntervals=antInt)
}
# helper function for fast version 2
markMultiPotts <- function(nr,sum,nindex,gamma,subsize) {
## Helper for fast version 2: runs the same dynamic program as
## multiPCFcompact on compacted data, but instead of returning segments it
## records which compact positions ever become an optimal split, and maps
## those back to probe scale via findMarksMulti.
## nr,sum: numbers and sums for one analysis unit,
## typically one chromosomal arm. Samples assumed to be in rows.
## nindex: number of probes collapsed into each compact position.
## gamma: penalty for discontinuities
## subsize: length of the probe-scale mark vector to return
N <- ncol(nr)
nSamples <- nrow(sum)
markSub <- rep(FALSE,N)
bestCost <- rep(0,N)
bestSplit <- rep(0,N+1)
bestAver <- rep(0,N*nSamples)
dim(bestAver) <- c(nSamples,N)
Sum <- rep(0,N*nSamples)
dim(Sum) <- c(nSamples,N)
Nevner <- rep(0,N*nSamples)
dim(Nevner) <- c(nSamples,N)
eachCost <- rep(0,N*nSamples)
dim(eachCost) <- c(nSamples,N)
Cost <- rep(0,N)
## Filling of first elements
Sum[ ,1]<-sum[,1]
Nevner[,1]<-nr[,1]
bestSplit[1]<-0
bestAver[,1] <- sum[,1]/nr[,1]
helper <- rep(1, nSamples)
bestCost[1]<-helper%*%(-Sum[,1]*bestAver[,1])
lengde <- rep(0,N)
## DP over gradually longer prefixes; see multiPCFcompact for details.
if (N>1) {
for (n in 2:N) {
Sum[ ,1:n] <- Sum[ ,1:n]+sum[,n]
Nevner[,1:n] <- Nevner[,1:n]+nr[,n]
eachCost[ ,1:n] <- -(Sum[ ,1:n]^2)/Nevner[ ,1:n]
eachCost[is.na(eachCost[, 1:n])] <- 0
Cost[1:n] <- helper %*% eachCost[, 1:n]
Cost[2:n] <- Cost[2:n]+bestCost[1:(n-1)]+gamma
Pos <- which.min(Cost[1:n])
cost <- Cost[Pos]
## Guard: which.min returns integer(0) if all costs are NA
if (length(cost)==0) {
cost <- 0.0
}
aver <- Sum[ ,Pos]/Nevner[,Pos]
bestCost[n] <- cost
bestAver[ ,n] <- aver
bestSplit[n] <- Pos-1
## Record the split position as a potential breakpoint
markSub[Pos-1] <- TRUE
}
}
## Translate compact-scale marks back to probe scale:
help<-findMarksMulti(markSub,nindex,subsize)
return(help)
}
# Function which finds potential breakpoints
## Translate breakpoint marks from the compressed (compact) scale back to the
## original probe scale.
## markSub: logical marks in compressed scale
## Nr: number of probes collapsed into each compact position
## subsize: length of the probe-scale mark vector to return
## Returns a logical vector of length subsize with TRUE at probe-scale marks.
findMarksMulti <- function(markSub,Nr,subsize){
  mark <- rep(FALSE, subsize)   # marks in original scale
  if (sum(markSub) < 1) {
    return(mark)
  }
  compressedPos <- which(markSub)
  nMarks <- length(compressedPos)
  # distance (in compact positions) from the previous mark (0 before first)
  prevPos <- c(0, compressedPos[-nMarks])
  segLen <- compressedPos - prevPos
  from <- 1
  origPos <- 1
  for (i in seq_len(nMarks)) {
    upto <- from + segLen[i] - 1
    # advance by the number of original probes covered by this compact span
    origPos <- origPos + sum(Nr[from:upto])
    mark[origPos - 1] <- TRUE
    from <- upto + 1
  }
  return(mark)
}
## expand compact solution
## Expand a compact segmentation back to per-probe values.
## nProbes, nSamples: dimensions of the output matrix
## lengthInt: number of probes in each interval
## mean: matrix of interval levels, samples in rows, one column per interval
## Returns an nSamples x nProbes matrix repeating each interval's level
## across its probes.
expandMulti <- function(nProbes,nSamples,lengthInt, mean){
  expanded <- matrix(0, nrow = nSamples, ncol = nProbes)
  # interval index for each probe: interval i repeated lengthInt[i] times
  probeInterval <- rep(seq_along(lengthInt), times = lengthInt)
  expanded[, seq_along(probeInterval)] <- mean[, probeInterval]
  return(expanded)
}
## sawtooth-filter for multiPCF - marks potential breakpoints. Uses two
## sawtoothfilters, one long (length 2*L) and one short (fixed length 6)
sawMarkM <- function(x,L,frac1,frac2){
## Sawtooth filter for multiPCF - marks potential breakpoints. Uses two
## sawtooth filters: one long (length 2*L) and one short (fixed length 6).
## Probes whose absolute filter response (summed over samples) is in the top
## frac1 (long filter) or frac2 (short filter) are marked, as are the L
## outermost probes at each end.
## x: data matrix with probes in rows and samples in columns (may contain NA)
## Returns a 0/1 vector of length nrow(x).
nrProbes <- nrow(x)
nrSample <- ncol(x)
mark <- rep(0,nrProbes)
if (nrProbes < 2*L) {
#We don't have enough probes to do a whole sawtooth: fill with 1's and return.
mark <- rep(1,nrProbes)
return(mark)
}
sawValue <- rep(0,nrProbes)
filter <- rep(0,2*L)
sawValue2 <- rep(0,nrProbes)
filter2 <- rep(0,6)
## Build the antisymmetric sawtooth filters: ramp up before the candidate
## breakpoint, mirrored ramp down after it.
for (k in 1:L) {
filter[k] <- k/L
filter[2*L+1-k] <- -k/L
}
for (k in 1:3) {
filter2[k] <- k/3
filter2[7-k] <- -k/3
}
## index_* store, per probe and sample, how far back/forward one must reach
## to collect exactly L (long) or 3 (short) non-NA observations; 0 if this
## could not be achieved.
index_longneg <-rep(0,nrProbes*nrSample)
dim(index_longneg) <- c(nrProbes,nrSample)
index_longpos <-rep(0,nrProbes*nrSample)
dim(index_longpos) <- c(nrProbes,nrSample)
index_shortneg <-rep(0,nrProbes*nrSample)
dim(index_shortneg) <- c(nrProbes,nrSample)
index_shortpos <-rep(0,nrProbes*nrSample)
dim(index_shortpos) <- c(nrProbes,nrSample)
xnas <-rep(0,nrProbes*nrSample)
dim(xnas) <- c(nrProbes,nrSample)
xnas <- !is.na(x)*1
## For each sample, slide windows across the probes, expanding/contracting
## the window edges so each half contains the required number of non-NA values.
for (s in 1:nrSample) {
longneg <- 2
longpos <- L
shortneg <- 2
shortpos <- 3
for (p in 3:(nrProbes-3)) {
## Long filter, backward half: find span with exactly L non-NA values
sumlong <- sum(xnas[(p-longneg):p, s])
while(p>longneg && sumlong != L) {
if(sumlong<L) {
longneg <- longneg + 1
sumlong <- sum(xnas[(p-longneg):p, s])
}
else {
longneg <- longneg - 1
sumlong <- sum(xnas[(p-longneg):p, s])
}
}
if (sumlong==L) {
index_longneg[p, s] = longneg
}
## Long filter, forward half:
if(p+longpos >= nrProbes) {
longpos <- longpos-1
}
sumlong <- sum(xnas[(p+1):(p+longpos), s])
while(p+longpos < nrProbes && sumlong != L) {
if(sumlong<L) {
longpos <- longpos + 1
sumlong <- sum(xnas[(p+1):(p+longpos), s])
}
else {
longpos <- longpos - 1
sumlong <- sum(xnas[(p+1):(p+longpos), s])
}
}
if (sumlong == L) {
index_longpos[p, s] = longpos
}
## Short filter, backward half: find span with exactly 3 non-NA values
sumshort <- sum(xnas[(p-shortneg):p, s])
while(p>shortneg && sumshort != 3) {
if(sumshort<3) {
shortneg <- shortneg + 1
sumshort <- sum(xnas[(p-shortneg):p, s])
}
else {
shortneg <- shortneg - 1
sumshort <- sum(xnas[(p-shortneg):p, s])
}
}
if (sumshort==3) {
index_shortneg[p, s] = shortneg
}
## Short filter, forward half:
if(p+shortpos >= nrProbes) {
shortpos <- shortpos-1
}
sumshort <- sum(xnas[(p+1):(p+shortpos), s])
while(p+shortpos < nrProbes && sumshort != 3) {
if(sumshort<3) {
shortpos <- shortpos + 1
sumshort <- sum(xnas[(p+1):(p+shortpos), s])
}
else {
shortpos <- shortpos - 1
sumshort <- sum(xnas[(p+1):(p+shortpos), s])
}
}
if (sumshort == 3) {
index_shortpos[p, s] = shortpos
}
}
}
#For the long filter: accumulate the absolute filter response over samples,
#applying the filter to the non-NA values in each probe's window.
for(s in 1:nrSample) {
for (p in L:(nrProbes-L)) {
neg = index_longneg[p,s]
pos = index_longpos[p,s]
if (neg != 0 && pos != 0) {
start <- p-neg
stop <- p+pos
diff = crossprod(filter, x[start:stop,s][!is.na(x[start:stop,s])])
stopifnot(length(x[start:stop,s][!is.na(x[start:stop,s])]) == L*2)
sawValue[p]<-sawValue[p]+abs(diff)
}
}
}
#Mark probes whose long-filter response exceeds the (1-frac1) quantile:
limit <- quantile(sawValue,(1-frac1), na.rm=TRUE)
for (l in 1:(nrProbes-2*L)){
if (!is.na(sawValue[l+L-1]) && sawValue[l+L-1] > limit) {
mark[l+L-1] <- 1
}
}
#For the short filter: same scheme with window length 6.
for(s in 1:nrSample) {
for (p in 3:(nrProbes-3)) {
neg = index_shortneg[p,s]
pos = index_shortpos[p,s]
if (neg != 0 && pos != 0) {
start <- p-neg
stop <- p+pos
diff2 = crossprod(filter2, x[start:stop,s][!is.na(x[start:stop,s])])
stopifnot(length(x[start:stop,s][!is.na(x[start:stop,s])]) == 6)
sawValue2[p]<-sawValue2[p]+abs(diff2)
}
}
}
#Mark probes whose short-filter response exceeds the (1-frac2) quantile:
limit2 <- quantile(sawValue2,(1-frac2), na.rm=TRUE)
for (l in 1:(nrProbes-3)){
if (!is.na(sawValue2[l+2]) && sawValue2[l+2] > limit2) {
mark[l+2] <- 1
}
}
#Always mark the L outermost probes at each end:
for (l in 1:L){
mark[l] <- 1
mark[nrProbes+1-l]<- 1
}
return(mark)
}
## (removed non-R boilerplate text left over from a web page scrape; it would
## make this file unparseable)