Nothing
####################################################################
## Author: Gro Nilsen, Knut Liestřl and Ole Christian Lingjćrde.
## Maintainer: Gro Nilsen <gronilse@ifi.uio.no>
## License: Artistic 2.0
## Part of the copynumber package
## Reference: Nilsen and Liestřl et al. (2012), BMC Genomics
####################################################################
##Required by:
##Requires:
### findNN
### getMad
### pcf(not the main function, but the rest of the help functions found in the same document)
### handleMissing
### pullOutContent
## Main function for pcf-analysis to be called by the user
pcfPlain <- function(pos.data,kmin=5,gamma=40,normalize=TRUE,fast=TRUE,digits=4,return.est=FALSE,verbose=TRUE){
#Input could come from winsorize and thus be a list; check and possibly retrieve data frame wins.data
pos.data <- pullOutContent(pos.data,what="wins.data")
#Make sure all data columns are numeric:
if(any(!sapply(pos.data,is.numeric))){
stop("All input in pos.data must be numeric",call.=FALSE)
}
#Check data input:
stopifnot(ncol(pos.data)>=2) #something is missing in input data
#Extract information from pos.data:
position <- pos.data[,1]
cn.data <- as.matrix(pos.data[,-1])
nSample <- ncol(pos.data)-1
sampleid <- colnames(pos.data)[-1]
nProbe <- length(position)
#save user's gamma
gamma0 <- gamma
sd <- rep(1,nSample) #sd is only used if normalize=TRUE, and then these values are replaced by MAD-sd
#If number of probes in entire data set is less than 100K, the MAD sd-estimate is calculated using all obs for every sample
#Only required if normalize=T
if(nProbe<100000 && normalize){
#calculate MAD-sd for each sample:
for(j in 1:nSample){
sample.data <- pos.data[,j+1]
sd[j] <- getMad(sample.data[!is.na(sample.data)],k=25) #Take out missing values before calculating mad
}
}#endif
#Initialize
pcf.names <- c("pos",sampleid)
seg.names <- c("sampleID","start.pos","end.pos","n.probes","mean")
segments <- data.frame(matrix(nrow=0,ncol=5))
colnames(segments) <- seg.names
if(return.est){
pcf.est <- matrix(nrow=0,ncol=nSample)
}
#Run PCF separately for each sample:
for(i in 1:nSample){
if(return.est){
#Initialize:
yhat <- rep(NA,length(nProbe))
}
sample.data <- cn.data[,i]
#Remove probes with missing obs; Only run pcf on non-missing values
obs <- !is.na(sample.data)
obs.data <- sample.data[obs]
if(length(obs.data)>0){ ##Make sure there are observations for this sample! If not, estimates are left NA as well
#If number of probes in entire data set is >= 100K, the MAD sd-estimate is calculated using obs in this arm for this sample.
#Only required if normalize=T
if(nProbe>=100000 && normalize){
sd[i] <- getMad(obs.data,k=25)
}
#Scale gamma by variance if normalize is TRUE
use.gamma <- gamma0
if(normalize){
use.gamma <- gamma0*(sd[i])^2
}
#Must check that sd!=0 and sd!!=NA -> use.gamma=0/NA. If not, simply calculate mean of observations
if(use.gamma==0 || is.na(use.gamma)){
if(return.est){
res <- list(Lengde=length(obs.data),sta=1,mean=mean(obs.data),nIntervals=1,yhat=rep(mean(obs.data)))
}else{
res <- list(Lengde=length(obs.data),sta=1,mean=mean(obs.data),nIntervals=1)
}
}else{
# Compute piecewise constant fit
#run fast approximate PCF if fast=TRUE and number of probes>400, or exact PCF otherwise
if(!fast || length(obs.data)<400){
#Exact PCF:
res <- exactPcf(y=obs.data,kmin=kmin,gamma=use.gamma,yest=return.est)
}else{
#Run fast PCF:
res <- selectFastPcf(x=obs.data,kmin=kmin,gamma=use.gamma,yest=return.est)
}#endif
}#endif
#Retrieve segment info from results:
seg.start <- res$sta
seg.stop <- c(seg.start[-1]-1,length(obs.data))
seg.npos <- res$Lengde
seg.mean <- res$mean
nSeg <- res$nIntervals
if(return.est){
yhat[obs] <- res$yhat
}
#Find genomic positions for start and stop of each segment:
pos.start <- position[seg.start]
pos.stop <- position[seg.stop]
#Handle missing values:
if(any(!obs)){
#first find nearest non-missing neighbour for missing probes:
nn <- findNN(pos=position,obs=obs)
#Include probes with missing values in segments where their nearest neighbour probes are located
new.res <- handleMissing(nn=nn,pos=position,obs=obs,pos.start=pos.start,pos.stop=pos.stop,seg.npos=seg.npos)
pos.start <- new.res$pos.start
pos.stop <- new.res$pos.stop
seg.npos <- new.res$seg.npos
if(return.est){
yhat[!obs] <- yhat[nn]
}
}
}else{
warning(paste("pcf is not run for sample ",i," because all observations are missing. NA is returned.",sep=""),immediate.=TRUE,call.=FALSE)
seg.start <- 1
seg.stop <- nProbe
pos.start <- position[seg.start]
pos.stop <- position[seg.stop]
nSeg <- 1
seg.mean <- NA
seg.npos <- nProbe
}
#Round:
seg.mean <- round(seg.mean,digits=digits)
#Add results for this sample to results for other samples in data frame:
seg <- data.frame(rep(sampleid[i],nSeg),pos.start,pos.stop,seg.npos,seg.mean,stringsAsFactors=FALSE)
colnames(seg) <- seg.names
segments <- rbind(segments,seg)
if(return.est){
#Rounding:
yhat <- round(yhat,digits=digits)
pcf.est <- cbind(pcf.est,yhat)
}
}#endfor
if(verbose){
cat(paste("pcf finished for sample ",i,sep=""),"\n")
}
if(return.est){
pcf.est <- data.frame(position,pcf.est,stringsAsFactors=FALSE)
colnames(pcf.est) <- pcf.names
return(list(estimates=pcf.est,segments=segments))
}else{
return(segments)
}
}#endfunction
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.