R/multipcf.r

####################################################################
## Author: Gro Nilsen, Knut Liestøl and Ole Christian Lingjærde.
## Maintainer: Gro Nilsen <gronilse@ifi.uio.no>
## License: Artistic 2.0
## Part of the copynumber package
## Reference: Nilsen, Liestøl et al. (2012), BMC Genomics
####################################################################

##Required by:
### none


##Requires:
### getMad
### getArms
### numericArms
### numericChrom
### pullOutContent

## Main function for multipcf-analysis to be called by the user
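## Illustrative call (hypothetical input, not run): 'data' is expected to hold
## chromosome and position in the first two columns and one numeric column of
## copy number values per sample:
##   cn <- data.frame(chrom = rep(1, 100),
##                    pos = seq(1e5, 1e7, length.out = 100),
##                    s1 = rnorm(100), s2 = rnorm(100))
##   seg <- multipcf(cn, pos.unit = "bp", gamma = 40, verbose = FALSE)
##   head(seg)   # one row per joint segment, one mean column per sample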

multipcf <- function(data,pos.unit="bp",arms=NULL,Y=NULL,gamma=40,normalize=TRUE,w=1,fast=TRUE,assembly="hg19",digits=4,return.est=FALSE,save.res=FALSE,file.names=NULL,verbose=TRUE){
	
  #Check pos.unit input:
  if(!pos.unit %in% c("bp","kbp","mbp")){
    stop("'pos.unit' must be one of bp, kbp and mbp",call.=FALSE)
  }
  
  #Check assembly input:
  if(!assembly %in% c("hg19","hg18","hg17","hg16","mm7","mm8","mm9")){
    stop("assembly must be one of hg19, hg18, hg17, hg16, mm7, mm8, or mm9",call.=FALSE)
  }

  #Is data a file:
  isfile.data <- is.character(data)
  
  #Check data input:
  if(!isfile.data){
    #Input could come from winsorize and thus be a list; check and possibly retrieve data frame wins.data
    data <- pullOutContent(data,what="wins.data")
    stopifnot(ncol(data)>=4)  #data must contain at least chrom, pos and two sample columns
    #Extract information from data:
    chrom <- data[,1]
    position <- data[,2]
    nSample <- ncol(data)-2
    sampleid <- colnames(data)[-c(1:2)]
  }else{
    #data is the name of a file containing the data
    f <- file(data,"r")  #open file connection
    head <- scan(f,nlines=1,what="character",quiet=TRUE,sep="\t") #Read header
    if(length(head)<3){
      stop("Data in file must have at least 3 columns",call.=FALSE)
    }
    sampleid <- head[-c(1:2)]
    nSample <- length(sampleid)

    #Read just the first two columns to get chrom and pos
    chrom.pos <- read.table(file=data,sep="\t",header=TRUE,colClasses=c(rep(NA,2),rep("NULL",nSample)),as.is=TRUE)          
    chrom <- chrom.pos[,1]
    position <- chrom.pos[,2]
  }
  
  #Make sure chrom is not factor:
  if(is.factor(chrom)){
    #If chrom is factor; convert to character
    chrom <- as.character(chrom)
  }
  #Make sure chromosomes are numeric (replace X and Y by 23 and 24)
  num.chrom <- numericChrom(chrom)
  nProbe <- length(num.chrom)
  
  #Make sure position is numeric:
  if(!is.numeric(position)){
    stop("input in data column 2 (posistions) must be numeric",call.=FALSE)
  }
  
  #Get character arms:
	if(is.null(arms)){
    arms <- getArms(num.chrom,position,pos.unit,get(assembly))
	}else{
    stopifnot(length(arms)==nProbe)
	}
	#Translate to numeric arms:
	num.arms <- numericArms(num.chrom,arms)
	
	#Unique arms:
	arm.list <- unique(num.arms)
	nArm <- length(arm.list)

  #Check that w is same length as number of samples:
  if(length(w)==1){
    w <- rep(w,nSample)
  }else if(length(w)!=nSample){
    stop("'w' must be a single number or a vector of same length as the number of samples in 'data'",call.=FALSE)
  }
  
  #Check Y input:
  if(!is.null(Y)){
    stopifnot(is.matrix(Y) || is.data.frame(Y) || is.character(Y))
    isfile.Y <- is.character(Y)
    
    if(!isfile.Y){
      ncol.Y <- ncol(Y)
      nrow.Y <- nrow(Y)
    }else{
      f.y <- file(Y,"r")
      ncol.Y <- length(scan(f.y,nlines=1,what="character",quiet=TRUE,sep="\t"))
      nrow.Y <- nrow(read.table(file=Y,sep="\t",header=TRUE,colClasses=c(NA,rep("NULL",ncol.Y-1)),as.is=TRUE))
    }
    if(nrow.Y!=nProbe || ncol.Y!=nSample+2){
      stop("Input Y does not represent the same number of probes and samples as found in input data",call.=FALSE)
    }
  }

  #Initialize result containers; folders for saving results are created below if save.res=TRUE
  seg.names <- c("chrom","arm","start.pos","end.pos","n.probes",sampleid)
  mpcf.names <- c("chrom","pos",sampleid)
  segments <- data.frame(matrix(nrow=0,ncol=nSample+5))
	colnames(segments) <- seg.names
  if(return.est){
    mpcf.est <- matrix(nrow=0,ncol=nSample)
	}
	if(save.res){
    if(is.null(file.names)){
      #Create directory where results are to be saved
      dir.res <- "multipcf_results"
      if(!dir.res %in% dir()){
        dir.create(dir.res)
      }
      file.names <- c(paste(dir.res,"/","estimates.txt",sep=""),paste(dir.res,"/","segments.txt",sep=""))
      
    }else{
      #Check that file.names is the correct length
      if(length(file.names)<2){
        stop("'file.names' must be of length 2", call.=FALSE)
      }
    }  
  } 

  #estimates must be returned from routines if return.est or save.res
  yest <- any(return.est,save.res)

  #If normalize=TRUE, we scale each sample by its residual standard error. If the number of probes in the data set is < 100,000, the MAD sd-estimate is calculated using all observations for the sample:
  sd <- rep(1,nSample) #to have a value to check in if-test later; not used otherwise. sd is only used if normalize=TRUE and then these values are replaced by MAD-sd
  if(normalize && nProbe < 100000){
    for(j in 1:nSample){
      if(!isfile.data){
        sample.data <- data[,j+2]
      }else{
        cc <- rep("NULL",nSample+2)
        cc[j+2] <- "numeric"
        #only read data for the j'th sample
        sample.data <- read.table(file=data,sep="\t",header=TRUE,colClasses=cc)[,1]
      }
      sample.noNAdata <- sample.data[!is.na(sample.data)]
      if(length(sample.noNAdata)==0){
        sd[j] <- 1
      }else{
        sd[j] <- getMad(sample.noNAdata,k=25)   #Take out missing values before calculating mad
      }
    }
  }
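  ## Illustration (hypothetical, not run): the per-sample scale factor used
  ## above is the MAD-based residual sd from getMad, which could be inspected
  ## directly for a single sample column, e.g.
  ##   getMad(na.omit(cn[, 3]), k = 25)   # 'cn' as in the example at the top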
  
  #Scale gamma according to the number of samples?
  #No! If you do this, adding a wt sample gives you fewer breakpoints, which is
  #not appropriate. Scaling gamma by the number of samples should only be done
  #if each sample is an independent estimate of the same thing, i.e. if you
  #believe that any given breakpoint should be seen in *all* samples.
  #gamma <- gamma*nSample
  
  #run multiPCF separately on each chromosome arm:
  for(c in 1:nArm){
    probe.c <- which(num.arms==arm.list[c])
    pos.c <- position[probe.c]
    nProbe.c <- length(probe.c)

    #get data for this arm
    if(!isfile.data){
			arm.data <- data[probe.c,-c(1:2),drop=FALSE]
    }else{
      #Read data for this arm from file; since f is an open connection, reading starts at the first line that has not already been read
      #two first columns skipped
      arm.data <- read.table(f,nrows=nProbe.c,sep="\t",colClasses=c(rep("NULL",2),rep("numeric",nSample)))
    }
   
    #Check that there are no missing values:
#    if(any(is.na(arm.data))){
#	   stop("multiPCF cannot be run because there are missing data values, see 'imputeMissing' for imputation of missing values")
#    }
    
    #Make sure data is numeric:
    if(any(!sapply(arm.data,is.numeric))){
      stop("input in data columns 3 and onwards (copy numbers) must be numeric",call.=FALSE)
    }  
    
    #If normalize=TRUE and nProbe>=100,000, we calculate the MAD sd-estimate for each sample using only the observations in this arm
    if(normalize && nProbe >= 100000){
      sd <- apply(arm.data,2,getMad)   
    }
    
    #Check sd; cannot normalize if sd=0 or if sd=NA:
    if(any(sd==0) || any(is.na(sd))){
      #do not run multipcf; return the mean for each sample:
      m <- apply(arm.data,2,mean)
      dim(m) <- c(length(m),1)
      if(yest){
        yhat <- sapply(m,rep,nrow(arm.data))
        mpcf <- list(pcf=t(yhat),nIntervals=1,start0=1,length=nrow(arm.data),mean=m)
      }else{	
        mpcf <- list(nIntervals=1,start0=1,length=nrow(arm.data),mean=m)
      }	
      
    }else{
      #normalize data (sd=1 if normalize=FALSE)
      arm.data <- sweep(arm.data,2,sd,"/")  
   
      #weight data (default weight is 1)
      arm.data <- sweep(arm.data,2,w,"*")
      
      #Swap initial-NA SNPs with the first non-NA SNP for each sample
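      # e.g. a column starting c(NA, NA, 5, 7, ...) becomes c(5, NA, NA, 7, ...)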
      for (s in 1:ncol(arm.data)) {
        snp <- 2
        while (is.na(arm.data[1,s]) && snp<nrow(arm.data)) {
          if(!is.na(arm.data[snp,s])) {
            arm.data[1,s] = arm.data[snp,s]
            arm.data[snp,s] = NA
          }
          snp <- snp+1
        }
      }
      
      #Run multipcf:
      if(!fast || nrow(arm.data)<400){ 
        print("Running 'doMultiPCF'")
        mpcf <- doMultiPCF(as.matrix(t(arm.data)),gamma=gamma,yest=yest)   #requires samples in rows, probes in columns
        #note: returns samples in rows, estimates in columns.
      }else{  
        print("Running 'selectFastMultiPcf'")
        mpcf <- selectFastMultiPcf(as.matrix(arm.data),gamma=gamma,L=15,yest=yest)    #requires samples in columns, probes in rows
      }

      #"Unweight" estimates:
  		mpcf$mean <- sweep(mpcf$mean,1,w,"/")
      if(yest){      
        mpcf$pcf <- sweep(mpcf$pcf,1,w,"/")
  		}
  		#"Un-normalize" estimates:
  		mpcf$mean <- sweep(mpcf$mean,1,sd,"*")
  		if(yest){
        mpcf$pcf <- sweep(mpcf$pcf,1,sd,"*")
      }
  	}	
		
    #Information about segments:
    nSeg <- mpcf$nIntervals
		start0 <- mpcf$start0
		n.pos <- mpcf$length
		seg.mean <- t(mpcf$mean)  #get samples in columns
		posStart <- pos.c[start0]
		posEnd <- c(pos.c[start0-1],pos.c[nProbe.c])
		
    #Chromosome number and character arm id:
    chr <- unique(chrom[probe.c])
		a <- unique(arms[probe.c])
		chrid <- rep(chr,times=nSeg)
		armid <- rep(a,times=nSeg)
		
	  #May use mean of input data or the observed data specified in Y:
		if(!is.null(Y)){
      #get Y for this arm
      if(!isfile.Y){
        arm.Y <- Y[probe.c,-c(1:2),drop=FALSE]
      }else{
        arm.Y <- read.table(f.y,nrows=length(probe.c),sep="\t",colClasses=c(rep("NULL",2),rep("numeric",nSample)))
      }
       #Make sure Y is numeric:
      if(any(!sapply(arm.Y,is.numeric))){
        stop("input in Y columns 3 and onwards (copy numbers) must be numeric",call.=FALSE)
      }
    	#Use observed data to calculate segment mean (recommended)
			seg.mean <- matrix(NA,nrow=nSeg,ncol=nSample)
			for(s in 1:nSeg){
				seg.Y <- as.matrix(arm.Y[start0[s]:(start0[s]+n.pos[s]-1),])
				seg.mean[s,] <- apply(seg.Y,2,mean,na.rm=TRUE)
			}
		}
		
		#Round
		if(yest){
		  yhat <- round(mpcf$pcf,digits=digits)
    }
		seg.mean <- round(seg.mean,digits=digits)
		
    #Data frame:					
		segments.c <- data.frame(chrid,armid,posStart,posEnd,n.pos,seg.mean,stringsAsFactors=FALSE)
		colnames(segments.c) <- seg.names

    #Should results be written to files or returned to user:
    if(save.res){
      if(c==1){
        #open connection for writing to file
        w1 <- file(file.names[1],"w")
        w2 <- file(file.names[2],"w")
      }
      
      #Write segments to file for this arm
      write.table(segments.c,file=w2,col.names=if(c==1) seg.names else FALSE,row.names=FALSE,quote=FALSE,sep="\t")
			#Write estimated multiPCF-values file for this arm:
      write.table(data.frame(chrom[probe.c],pos.c,t(yhat),stringsAsFactors=FALSE), file = w1,col.names=if(c==1) mpcf.names else FALSE,row.names=FALSE,quote=FALSE,sep="\t")
      
    }
      
    #Append results for this arm:
    segments <- rbind(segments,segments.c)
    if(return.est){
      mpcf.est <- rbind(mpcf.est,t(yhat))
    } 

  	if(verbose){
      cat(paste("multipcf finished for chromosome arm ",chr,a,sep=""),"\n")
    }
    
  }#endfor
	
	#Close connections
	if(isfile.data){
    close(f)
  }
  if(!is.null(Y)){
    if(isfile.Y){
      close(f.y)
    }
  }
  
  if(save.res){
    cat(paste("multipcf-estimates were saved in file",file.names[1]),sep="\n")
    close(w1)
    cat(paste("segments were saved in file",file.names[2]),sep="\n")
    close(w2)
    
	}
	
  #return results:
	if(return.est){
	  mpcf.est <- data.frame(chrom,position,mpcf.est,stringsAsFactors=FALSE)
	  colnames(mpcf.est) <- mpcf.names
		return(list(estimates=mpcf.est,segments=segments))
	}else{
    return(segments) 
	}
	
		
	
}#endfunction



                                           
#Run exact multipcf algorithm, to be called by multipcf (main function)                                             
doMultiPCF <- function(y, gamma, yest) {
  
  ## y: input matrix of copy number estimates, samples in rows 
  ## gamma: penalty for discontinuities
 	## yest: logical, should estimates be returned
  N <- length(y)
	nSamples <- nrow(y)
	nProbes <- ncol(y)
	## initialisations
	if(yest){
    yhat <- rep(0,N);
    dim(yhat) <- c(nSamples,nProbes)
  }
 	bestCost <- rep(0,nProbes)
 	bestSplit <- rep(0,nProbes+1)
 	bestAver <- rep(0,N)
	dim(bestAver) <- c(nSamples,nProbes)
	Sum <- rep(0,N)
	dim(Sum) <- c(nSamples,nProbes)
	Aver <- rep(0,N)
	dim(Aver) <- c(nSamples,nProbes)
	#Nevner ('denominator') counts the number of probes *for which we have data*, so that the averages are calculated correctly.
	Nevner <- rep(0,N)
	dim(Nevner) <- c(nSamples,nProbes)
  Nevner[,1] <- 1*!is.na(y[,1])
	eachCost <- rep(0,N)
	dim(eachCost) <- c(nSamples,nProbes)
	Cost <- rep(0,nProbes)
	## Filling of first elements
	ynoNA <- y
	ynoNA[is.na(ynoNA)] <- 0
	y1<-ynoNA[ ,1]
	Sum[ ,1]<-y1
	Aver[ ,1]<-y1
	bestCost[1]<-0
	bestSplit[1]<-0
	bestAver[ ,1]<-y1
	helper <- rep(1,nSamples)
	## Solving for gradually longer arrays. Sum accumulates
	## error values for the righthand plateau downward from n;
	## this error added to gamma and the stored cost in bestCost 
	## give the total cost. Cost stores the total cost for breaks 
	## at any position below n, and which.min finds the position 
	## with lowest cost (best split). Aver is the average of the 
	## righthand plateau.
	if (nProbes>1) {
  	for (n in 2:nProbes) {
     		Sum[ ,1:n] <- Sum[ ,1:n]+ynoNA[,n]
     		Nevner[,1:n] <- Nevner[,1:n]+!is.na(y[,n])
     		## Make sure we don't divide by zero by cheating on the ends a little:
     		divisor <- Nevner[ , 1:n]
     		divisor[divisor==0] <- 1
     		Aver[ ,1:n] <- Sum[ ,1:n]/divisor
     		#print(Aver[,1:n])
     		eachCost[ ,1:n] <- -(Sum[ ,1:n]*Aver[ ,1:n])
        Cost[1:n] <- helper %*% eachCost[ ,1:n]
  		  Cost[2:n] <- Cost[2:n]+bestCost[1:(n-1)]+gamma
     		Pos <- which.min(Cost[1:n])
     		#print("First location")
     		bestCost[n] <- Cost[Pos]
     		bestAver[ ,n] <- Aver[ ,Pos]
     		bestSplit[n] <- Pos-1
  	}
	}
	## The final solution is found iteratively from the sequence   
	## of split positions stored in bestSplit and the averages 
	## for each plateau stored in bestAver
 	n <- nProbes
	antInt <- 0
 	while (n > 0) {
 	    if(yest){
        yhat[ ,(bestSplit[n]+1):n] <- bestAver[ ,n]
   		}
      n <- bestSplit[n]
		  antInt <- antInt+1
 	}
	n <- nProbes
	lengde <- rep(0,antInt)
	start0 <- rep(0,antInt)
	verdi <- rep(0,antInt*nSamples)
	dim(verdi) <- c(nSamples,antInt)
	oldSplit  <- n
	antall <- antInt
	while (n > 0) {
   		start0[antall] <- bestSplit[n]+1
		  lengde[antall] <- oldSplit-bestSplit[n]
		  verdi[ ,antall] <- bestAver[ ,n]
  		n <- bestSplit[n]
		  oldSplit <- n
		  antall <- antall-1
 	}
  if(yest){
    return(list(pcf=yhat, length = lengde, start0 = start0, mean = verdi, nIntervals=antInt))
  }else{
    return(list(length = lengde, start0 = start0, mean = verdi, nIntervals=antInt))
  }
}
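
## Illustration (hypothetical values, not run): two samples sharing a jump
## after probe 3; with a small penalty the exact algorithm should place a
## single joint breakpoint there:
##   y <- rbind(c(0.0,  0.0, 0.0, 1.0, 1.0, 1.0),
##              c(0.1, -0.1, 0.0, 0.9, 1.1, 1.0))
##   fit <- doMultiPCF(y, gamma = 1, yest = TRUE)
##   fit$nIntervals   # expected: 2
##   fit$start0       # expected: c(1, 4)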


## Choose fast multipcf version, called by multipcf (main function)                                                   
selectFastMultiPcf <- function(x,gamma,L,yest){
	xLength <- nrow(x)
	if (xLength< 1000) {
	  result<-runFastMultiPCF(x,gamma,L,0.15,0.15,yest)
	} else {
		if (xLength < 3000){
			result<-runFastMultiPCF(x,gamma,L,0.12,0.10,yest)
		} else {
			if (xLength < 15000){
				result<-runFastMultiPCF(x,gamma,L,0.12,0.05,yest)
			} else  {
				result<-runMultiPcfSubset(x,gamma,L,0.12,0.05,yest)
	 		}
		}
	}
	return(result)
}
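
## Illustration (hypothetical data, not run): with 500 probes the thresholds
## above select runFastMultiPCF with frac1 = frac2 = 0.15; with e.g. 20000
## probes runMultiPcfSubset would be chosen instead:
##   x <- matrix(rnorm(1000), nrow = 500, ncol = 2)
##   fit <- selectFastMultiPcf(x, gamma = 40, L = 15, yest = FALSE)
##   fit$nIntervals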


# Fast version 1, for moderately long sequences, called by selectFastMultiPcf
runFastMultiPCF <- function (x, gamma, L, frac1, frac2, yest) {   	
  mark <- rep(0, nrow(x))
	mark <- sawMarkM(x,L,frac1,frac2)
	dense <- compactMulti(t(x), mark)
	compPotts <- multiPCFcompact(dense$Nr, dense$Sum, dense$Nindex, gamma)
  if (yest) {
		potts <- expandMulti(nrow(x),ncol(x), compPotts$Lengde,compPotts$mean)
		return(list(pcf = potts, length = compPotts$Lengde, start0 = compPotts$sta, 
        		mean = compPotts$mean, nIntervals = compPotts$nIntervals))
	} else {
		return(list(length = compPotts$Lengde, start0 = compPotts$sta, 
        		mean = compPotts$mean, nIntervals = compPotts$nIntervals))		
	}
}

# Fast version 2, for very long sequences, called by selectFastMultiPcf
runMultiPcfSubset <- function(x,gamma,L,frac1,frac2,yest){
	SUBSIZE <- 5000   #length of subsets
	antGen <- nrow(x)
	mark <- sawMarkM(x,L,frac1,frac2)
	markInit <- c(mark[1:(SUBSIZE-1)],TRUE)
	compX <- compactMulti(t(x[1:SUBSIZE,]),markInit)
	mark2 <- rep(FALSE,antGen)
	mark2[1:SUBSIZE] <- markMultiPotts(compX$Nr,compX$Sum,compX$Nindex,gamma,SUBSIZE)
  mark2[4*SUBSIZE/5] <- TRUE
	start0 <- 4*SUBSIZE/5+1
	while(start0 + SUBSIZE < antGen){
		slutt <- start0+SUBSIZE-1
		markSub <- c(mark2[1:(start0-1)],mark[start0:slutt])
		markSub[slutt] <- TRUE
		compX <- compactMulti(t(x[1:slutt,]),markSub)
		mark2[1:slutt] <- markMultiPotts(compX$Nr,compX$Sum,compX$Nindex,gamma,slutt)
    start0 <- start0+4*SUBSIZE/5
		mark2[start0-1] <- TRUE
	}
	markSub <- c(mark2[1:(start0-1)],mark[start0:antGen])
	compX <- compactMulti(t(x),markSub)
	compPotts <- multiPCFcompact(compX$Nr,compX$Sum,compX$Nindex,gamma)
  if (yest) {
		potts <- expandMulti(nrow(x),ncol(x), compPotts$Lengde,compPotts$mean)
		return(list(pcf = potts, length = compPotts$Lengde, start0 = compPotts$sta, 
        		mean = compPotts$mean, nIntervals = compPotts$nIntervals))
	} else {
		return(list(length = compPotts$Lengde, start0 = compPotts$sta, 
        		mean = compPotts$mean, nIntervals = compPotts$nIntervals))		
	}

}

# function that accumulates numbers of observations and sums between potential breakpoints
compactMulti <- function(y,mark){
  #y[is.na(y)] <- 0
  ynoNA <- y
  ynoNA[is.na(ynoNA)] <- 0
	antGen <- ncol(y)
	antSample <- nrow(y)
	antMark <- sum(mark)
	ant <- rep(0,antMark*antSample)
	sum <- rep(0,antMark*antSample)
	nindex <- rep(1,antMark)
	dim(ant) <- c(antSample,antMark)
	dim(sum) <- c(antSample,antMark)
	pos <- 1
	count <- 1
	delSum <- rep(0,antSample)
	while(pos <= antGen){
		delSum <- 0
		posArray <- 1*!is.na(y[,pos])
		while (mark[pos] < 1){
			delSum <- delSum + ynoNA[,pos]
			nindex[count] <- nindex[count]+1
			pos <- pos+1
			posArray <- posArray + !is.na(y[,pos])
		}		
		ant[,count] <- posArray
		sum[,count] <- delSum+ynoNA[,pos]
		pos <- pos+1
		count <- count+1
	}
	list(Nr=ant,Sum=sum,Nindex=nindex)
}
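
## Illustration (hypothetical, not run): one sample, six probes, breakpoint
## marks after probes 2 and 6; the six columns are compressed into two bins:
##   compactMulti(matrix(1:6, nrow = 1), mark = c(0, 1, 0, 0, 0, 1))
##   # expected: Nr = (2, 4), Sum = (3, 18), Nindex = c(2, 4)
##   # (Nr and Sum returned as 1-row matrices)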


# main calculations for fast multipcf-versions
multiPCFcompact <- function(nr,sum,nindex,gamma) {
  ## nr,sum : numbers and sums for one analysis unit,
  ## typically one chromosomal arm. Samples assumed to be in rows. 
  ## gamma: penalty for discontinuities
  N <- ncol(sum)
	nSamples <- nrow(sum)
	## initialisations
	yhat <- rep(0,N*nSamples);
	dim(yhat) <- c(nSamples,N)
	bestCost <- rep(0,N)
	bestSplit <- rep(0,N+1)
	bestAver <- rep(0,N*nSamples)
	dim(bestAver) <- c(nSamples,N)
	Sum <- rep(0,N*nSamples)
	dim(Sum) <- c(nSamples,N)
	Nevner <- rep(0,N*nSamples)
	dim(Nevner) <- c(nSamples,N)
	eachCost <- rep(0,N*nSamples)
	dim(eachCost) <- c(nSamples,N)
	Cost <- rep(0,N)
	## Filling of first elements
	Sum[ ,1]<-sum[,1]
	Nevner[,1]<-nr[,1]
	bestSplit[1]<-0	
	bestAver[,1] <- sum[,1]/nr[,1]
	helper <- rep(1, nSamples)
	bestCost[1]<-helper%*%(-Sum[,1]*bestAver[,1])	
  lengde <- rep(0,N)

	## Solving for gradually longer arrays. Sum accumulates
	## error values for righthand plateau downward from n;   
	## this error added to gamma and the stored cost in bestCost 
	## give the total cost. Cost stores the total cost for breaks 
	## at any position below n, and which.min finds the position 
	## with lowest cost (best split). Aver is the average of the 
	## righthand plateau.
  if (N>1) {
    for (n in 2:N) {
      Sum[ ,1:n] <- Sum[ ,1:n]+sum[,n]
  		Nevner[,1:n] <- Nevner[,1:n]+nr[,n]
  		eachCost[ ,1:n] <- -(Sum[ ,1:n]^2)/Nevner[ ,1:n]
  		eachCost[is.na(eachCost[,1:n])] <- 0
      Cost[1:n] <- helper %*% eachCost[, 1:n]
  		Cost[2:n] <- Cost[2:n]+bestCost[1:(n-1)]+gamma
  		Pos <- which.min(Cost[1:n])
   		cost <- Cost[Pos]
   		aver <- Sum[ ,Pos]/Nevner[,Pos]
   		#print("Second location")
   		bestCost[n] <- cost
   		bestAver[ ,n] <- aver
   		bestSplit[n] <- Pos-1
   	}
  }
	## The final solution is found iteratively from the sequence   
	## of split positions stored in bestSplit and the averages 
	## for each plateau stored in bestAver

 	n <- N
	antInt <- 0
 	while (n > 0) {
		yhat[ ,(bestSplit[n]+1):n] <- bestAver[ ,n]
 		antInt <- antInt+1
		lengde[antInt] <- sum(nindex[(bestSplit[n]+1):n])
		n <- bestSplit[n]
 	}
	lengdeRev <- lengde[antInt:1]
	init <- rep(0,antInt)
	init[1]<-1
	if(antInt>=2){
		for(k in 2:antInt){
			init[k]<-init[k-1]+lengdeRev[k-1]
		}
	}

 	n <- N
	verdi <- rep(0,antInt*nSamples)
	dim(verdi) <- c(nSamples,antInt)
	bestSplit[n+1] <- n
	antall <- antInt
 	while (n > 0) {
 		verdi[ ,antall] <- bestAver[ ,n]
 		n <- bestSplit[n]
		antall <- antall-1
 	}

 	list(Lengde = lengdeRev, sta = init, mean = verdi, nIntervals=antInt)
}
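
## Illustration (hypothetical, not run), chaining with compactMulti above:
##   d <- compactMulti(matrix(c(0, 0, 0, 1, 1, 1), nrow = 1),
##                     mark = c(0, 0, 1, 0, 0, 1))
##   multiPCFcompact(d$Nr, d$Sum, d$Nindex, gamma = 1)
##   # expected: Lengde = c(3, 3), sta = c(1, 4), mean = (0, 1), nIntervals = 2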

# helper function for fast version 2
markMultiPotts <- function(nr,sum,nindex,gamma,subsize) {
  ## nr,sum: numbers and sums for one analysis unit,
  ##  typically one chromosomal arm. Samples assumed to be in rows. 
  ## gamma: penalty for discontinuities
  N <- ncol(nr)
	nSamples <- nrow(sum)
	markSub <- rep(FALSE,N)
	bestCost <- rep(0,N)
	bestSplit <- rep(0,N+1)
	bestAver <- rep(0,N*nSamples)
	dim(bestAver) <- c(nSamples,N)
	Sum <- rep(0,N*nSamples)
	dim(Sum) <- c(nSamples,N)
	Nevner <- rep(0,N*nSamples)
	dim(Nevner) <- c(nSamples,N)
	eachCost <- rep(0,N*nSamples)
	dim(eachCost) <- c(nSamples,N)
	Cost <- rep(0,N)
	
	## Filling of first elements
	Sum[ ,1]<-sum[,1]
	Nevner[,1]<-nr[,1]
	bestSplit[1]<-0	
	bestAver[,1] <- sum[,1]/nr[,1]
	helper <- rep(1, nSamples)
	bestCost[1]<-helper%*%(-Sum[,1]*bestAver[,1])	
	lengde <- rep(0,N)
	if (N>1) {
	  for (n in 2:N) {
      Sum[ ,1:n] <- Sum[ ,1:n]+sum[,n]
      Nevner[,1:n] <- Nevner[,1:n]+nr[,n]
      eachCost[ ,1:n] <- -(Sum[ ,1:n]^2)/Nevner[ ,1:n]
      eachCost[is.na(eachCost[, 1:n])] <- 0
      Cost[1:n] <- helper %*% eachCost[, 1:n]
  		Cost[2:n] <- Cost[2:n]+bestCost[1:(n-1)]+gamma
  		Pos <- which.min(Cost[1:n])
   		cost <- Cost[Pos]
   		if (length(cost)==0) {
   		  cost <- 0.0
   		}
   		aver <- Sum[ ,Pos]/Nevner[,Pos]
   		#print "Third location"
   		bestCost[n] <- cost
   		bestAver[ ,n] <- aver
   		bestSplit[n] <- Pos-1
  		markSub[Pos-1] <- TRUE
  
	  }
	}
	help<-findMarksMulti(markSub,nindex,subsize)
	return(help)

}


# Function which finds potential breakpoints
findMarksMulti <- function(markSub,Nr,subsize){
	## markSub: marks in compressed scale
	## Nr: number of observations between potential breakpoints
	mark <- rep(FALSE,subsize)  ## marks in original scale
	if(sum(markSub)<1) {
    return(mark)
  } else {	
		N<-length(markSub)
		ant <- seq(1:N)
		help <- ant[markSub]
		lengdeHelp <- length(help)
		help0 <- c(0,help[1:(lengdeHelp-1)])
		lengde <- help-help0
		start0<-1
		oldStart<-1
		startOrig<-1
		for(i in 1:lengdeHelp){
			start0 <- start0+lengde[i]
			lengdeOrig <- sum(Nr[oldStart:(start0-1)])
			startOrig <- startOrig+lengdeOrig
			mark[startOrig-1]<-TRUE
			oldStart<-start0
		}
		return(mark)
	}
	
}
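
## Illustration (hypothetical, not run): two marked bins in the compressed
## scale with per-bin probe counts Nr; marks land at the cumulative probe
## counts of the marked bins in the original scale:
##   which(findMarksMulti(markSub = c(FALSE, TRUE, FALSE, TRUE),
##                        Nr = c(2, 3, 1, 4), subsize = 10))
##   # expected: 5 10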


## expand compact solution
expandMulti <- function(nProbes,nSamples,lengthInt, mean){
  ##input: nr of probes, length of intervals, 
  ## value in intervals; returns the expansion 

	Potts <- rep(0,nProbes*nSamples)
	dim(Potts) <- c(nSamples,nProbes)
	lengthCompArr <- length(lengthInt)
	k <- 1
	for(i in 1:lengthCompArr){
		for(j in 1:lengthInt[i]){
			Potts[,k] <- mean[,i]
			k <- k+1
		}
	}
	return(Potts)	
}
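
## Illustration (hypothetical, not run): two intervals of lengths 2 and 4 with
## per-sample means 3 and 7 expand back to probe-level estimates:
##   expandMulti(nProbes = 6, nSamples = 1, lengthInt = c(2, 4),
##               mean = matrix(c(3, 7), nrow = 1))
##   # expected: 3 3 7 7 7 7 (as a 1 x 6 matrix)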

## sawtooth-filter for multiPCF - marks potential breakpoints. Uses two 
## sawtoothfilters, one long (length 2*L) and one short (fixed length 6)	
sawMarkM <- function(x,L,frac1,frac2){
  #print("Running the new version of sawMarkM")
  nrProbes <- nrow(x)
 	nrSample <- ncol(x)
  mark <- rep(0,nrProbes)
  if (nrProbes < 2*L) {
    #We don't have enough probes to do a whole sawtooth: fill with 1's and return.
    mark <- rep(1,nrProbes)
    return(mark)
  }
  sawValue <- rep(0,nrProbes)
  filter <- rep(0,2*L)
  sawValue2 <- rep(0,nrProbes)
  filter2 <- rep(0,6)
	for (k in 1:L) {
  		filter[k] <- k/L
  		filter[2*L+1-k] <- -k/L
	}

	for (k in 1:3) {
  		filter2[k] <- k/3
  		filter2[7-k] <- -k/3
	}
  
  index_longneg <-rep(0,nrProbes*nrSample)
  dim(index_longneg) <- c(nrProbes,nrSample)
  index_longpos <-rep(0,nrProbes*nrSample)
  dim(index_longpos) <- c(nrProbes,nrSample)
  index_shortneg <-rep(0,nrProbes*nrSample)
  dim(index_shortneg) <- c(nrProbes,nrSample)
  index_shortpos <-rep(0,nrProbes*nrSample)
  dim(index_shortpos) <- c(nrProbes,nrSample)
  
  #1 where observed, 0 where missing:
  xnas <- (!is.na(x)) * 1
  
  for (s in 1:nrSample) {
    longneg <- 2
    longpos <- L
    shortneg <- 2
    shortpos <- 3
    for (p in 3:(nrProbes-3)) {
      sumlong <- sum(xnas[(p-longneg):p, s])
      while(p>longneg && sumlong != L) {
        if(sumlong<L) {
          longneg <- longneg + 1
          sumlong <- sum(xnas[(p-longneg):p, s])
        }
        else {
          longneg <- longneg - 1
          sumlong <- sum(xnas[(p-longneg):p, s])
        }
      }
      if (sumlong==L) {
        index_longneg[p, s] = longneg
      }
      
      if(p+longpos >= nrProbes) {
        longpos <- longpos-1
      }
      sumlong <- sum(xnas[(p+1):(p+longpos), s])
      while(p+longpos < nrProbes && sumlong != L) {
        if(sumlong<L) {
          longpos <- longpos + 1
          sumlong <- sum(xnas[(p+1):(p+longpos), s])
        }
        else {
          longpos <- longpos - 1
          sumlong <- sum(xnas[(p+1):(p+longpos), s])
        }
      }
      if (sumlong == L) {
        index_longpos[p, s] = longpos
      }
      
      
      sumshort <- sum(xnas[(p-shortneg):p, s])
      while(p>shortneg && sumshort != 3) {
        if(sumshort<3) {
          shortneg <- shortneg + 1
          sumshort <- sum(xnas[(p-shortneg):p, s])
        }
        else {
          shortneg <- shortneg - 1
          sumshort <- sum(xnas[(p-shortneg):p, s])
        }
      }
      if (sumshort==3) {
        index_shortneg[p, s] = shortneg
      }

      if(p+shortpos >= nrProbes) {
        shortpos <- shortpos-1
      }
      sumshort <- sum(xnas[(p+1):(p+shortpos), s])
      while(p+shortpos < nrProbes && sumshort != 3) {
        if(sumshort<3) {
          shortpos <- shortpos + 1
          sumshort <- sum(xnas[(p+1):(p+shortpos), s])
        }
        else {
          shortpos <- shortpos - 1
          sumshort <- sum(xnas[(p+1):(p+shortpos), s])
        }
      }
      if (sumshort == 3) {
        index_shortpos[p, s] = shortpos
      }
    }
  }
  
  #For the long probe:
  for(s in 1:nrSample) {
    for (p in L:(nrProbes-L)) {
      neg = index_longneg[p,s]
      pos = index_longpos[p,s]
      if (neg != 0 && pos != 0) {
        start <- p-neg
        stop <- p+pos
        diff = crossprod(filter, x[start:stop,s][!is.na(x[start:stop,s])])
        stopifnot(length(x[start:stop,s][!is.na(x[start:stop,s])]) == L*2)
        sawValue[p]<-sawValue[p]+abs(diff)
      }
    }
  }

	limit <- quantile(sawValue,(1-frac1), na.rm=TRUE)
	for (l in 1:(nrProbes-2*L)){
	  if (!is.na(sawValue[l+L-1]) && sawValue[l+L-1] > limit) {
			mark[l+L-1] <- 1
		}
	}

	#For the short probe:
	for(s in 1:nrSample) {
	  for (p in 3:(nrProbes-3)) {
	    neg = index_shortneg[p,s]
	    pos = index_shortpos[p,s]
	    if (neg != 0 && pos != 0) {
	      start <- p-neg
	      stop <- p+pos
	      diff2 = crossprod(filter2, x[start:stop,s][!is.na(x[start:stop,s])])
	      stopifnot(length(x[start:stop,s][!is.na(x[start:stop,s])]) == 6)
	      sawValue2[p]<-sawValue2[p]+abs(diff2)
	    }
	  }
	}
	
	limit2 <- quantile(sawValue2,(1-frac2), na.rm=TRUE)
	for (l in 1:(nrProbes-3)){
	  if (!is.na(sawValue2[l+2]) && sawValue2[l+2] > limit2) {
	    mark[l+2] <- 1
	  }
	}
	
	for (l in 1:L){
		mark[l] <- 1
		mark[nrProbes+1-l]<- 1
	}

	return(mark)
}
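
## Illustration (hypothetical, not run): 40 probes of one sample with a level
## shift halfway; probes near the shift (plus the first and last L probes,
## which are always marked) should come back flagged as candidates:
##   set.seed(1)
##   x <- matrix(c(rnorm(20, 0), rnorm(20, 2)), ncol = 1)
##   which(sawMarkM(x, L = 5, frac1 = 0.15, frac2 = 0.15) == 1)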