R/estimate_nkr.r
In RCNA: Robust Copy Number Alteration Detection (RCNA)

Documented in estimate_nkr estimate_nkr.default estimate_nkr.RCNA_object

#' @include classes.r
#' @include RCNA.r

#' @description This generic function is used to run normal karyotype range estimation.
#'
#'
#' @title estimate_nkr: Estimate CNA "normal" karyotype ranges
#' @param obj A \linkS4class{RCNA_object} class object, or a data.frame object that contains all the necessary parameters.
#' @param ... Other arguments
#' @details This function can be run as a stand-alone or as part of \link{run_RCNA}
#' @return A \linkS4class{RCNA_analysis} class object that describes the input parameters and output files generated by this step of the workflow.
#' @seealso \linkS4class{RCNA_object}, \linkS4class{RCNA_analysis}, \link{run_RCNA}
#' @rdname estimate_nkr.default
#' @export estimate_nkr

estimate_nkr <- function(obj, ...) UseMethod("estimate_nkr")

#' Estimate the normal karyotype range for each feature
#'
#' This function estimates the normal karyotype range for each feature in the annotation file. It creates an .RData object for each feature, which is placed in the output directory under `/nkr`. This intermediate output is used in \link{estimate_feature_score}.
#'
#' @param df Path to the config file, or a `data.frame` object containing the valid parameters. Valid column names are `file.nkr.coverage`, `x.norm`, and `sample.names`. Additional columns will be ignored.
#' @param sample.names Character vector of sample names. Alternatively can be specified in `df`.
#' @param ano.file Location of the annotation file. This file must be in CSV format and contain the following information (with column headers as specified): "feature,chromosome,start,end".
#' @param out.dir Output directory for results. A subdirectory for results will be created under this + `/nkr/`.
#' @param ncpus Integer number of CPUs to use. Specifying more than one allows this function to be parallelized by feature.
#' @param file.ci.coverage Character vector listing the input coverage files. Must be the same length as `sample.names`. Alternatively can be specified in `df`.
#' @param nkr Numeric between 0 and 1 which specifies the coverage quantile that should be considered a "normal" karyotype range for each position. Lowering this value may increase sensitivity but also Type I error.
#' @param x.norm Whether or not to perform normalization for normal female/XX karyotype (default = FALSE). Can be specified for each sample separately via `df` column labeled `x.norm`.
#' @param norm.cov.matrix Character file path detailing the location of the normalized coverage matrix generated by this function. Re-using this file between runs can cut down on runtime significantly for large sample sizes. If the file doesn't exist yet it will be created at this location. If this file name ends in ".gz" then the output will be compressed using gzip.
#' @param verbose When TRUE increases level of detail for command output
#' @param obj An `RCNA_object` type created by \link{create_RCNA_object}. If specified this takes precedent over all over args.
#' @param ... Additional arguments
#' @examples
#' ## Run NKR estimation on example object
#' # See \link{example_obj} for more information on example
#' example_obj@ano.file <- system.file("examples" ,"annotations-example.csv", package = "RCNA")
#' \donttest{example_obj}
#' # Create output directory
#' dir.create(file.path("output", "nkr"), recursive = TRUE)
#' # Copy example GC-corrected coverage files
#' cov.corrected <- system.file("examples", "gc", package = "RCNA")
#' file.copy(from = cov.corrected, to = "output", recursive = TRUE)
#' # Run NKR estimation, append results
#' estimate_nkr_analysisObj <- estimate_nkr(example_obj)
#' example_obj@commands <- c(example_obj@commands, estimate_nkr_analysisObj)
#' \dontshow{system("rm -rf output")}
#' @return A \linkS4class{RCNA_analysis} class object that describes the input parameters and output files generated by this step of the workflow.
#' @rdname estimate_nkr.default
#' @import utils parallel doParallel
#' @details
#'
#' The `df` argument corresponds to the `nkrParams` matrix on \linkS4class{RCNA_object}. Valid column names are `sample.names`, `file.ci.coverage`, and `x.norm`. Additional columns will be ignored.
#' @export

estimate_nkr.default = function(obj = NULL, df = NULL, sample.names = NULL, ano.file, out.dir = NULL, ncpus = 1,
		file.ci.coverage = NULL, nkr = 0.9, x.norm = NULL, norm.cov.matrix = NULL,
		verbose = FALSE, ...){

  if(is.data.frame(obj) | is.character(obj)){
    df = obj
  } else if(!missing(obj)){
    return(estimate_nkr.RCNA_object(obj, verbose))
  } else if(missing(df)){
		if(verbose) message("No data.frame detected for estimate_nkr. Constructing run from parameters instead")
		if(!missing(sample.names) & !missing(ano.file)){
			tryCatch({ # Attempt to create an RCNA object
						new_RCNA_obj = create_RCNA_object(sample.names = sample.names, ano.file = ano.file, ncpu = ncpus, ...)
					},
					error = function(cond){
						message("RCNA Object could not be created due to error:")
					},
					warning = function(cond){
						message("Creating an RCNA object caused a warning:")
						message(conditionMessage(cond))
					})
		} else if(missing(sample.names)){
			stop("Missing both `df` argument and `sample.names` argument - please specify at least one.")
		}
	} else {
		if(missing(sample.names)){
			tryCatch({
						if(verbose) message("Pulling sample.names from data.frame")
						sample.names = df[["sample.names"]]
					},
					error = function(cond){
						message("Error while populating sample names from data.frame:")
					},
					warning = function(cond){
						message("Populating sample names produced a warning:")
						message(conditionMessage(cond))
					})
		}

		tryCatch({ # Attempt to create an RCNA object
					new_RCNA_obj = create_RCNA_object(sample.names = sample.names, ano.file = ano.file, ncpu = ncpus, nkrParams = df, ...)
				},
				error = function(cond){
					message("RCNA Object could not be created due to error:")
				},
				warning = function(cond){
					message("Creating an RCNA object caused a warning:")
					message(conditionMessage(cond))
				})
	}

	analysis.obj = estimate_nkr.RCNA_object(obj = new_RCNA_obj, verbose = verbose)

	new_RCNA_obj@commands = analysis.obj
	return(new_RCNA_obj)
}
#' Estimate the normal karyotype range for each feature
#'
#' This function estimates the normal karyotype range for each feature in the annotation file. It creates an .RData object for each feature, which is placed in the output directory under `/nkr`. This intermediate output is used in \link{estimate_feature_score}.
#'
#' @param obj A RCNA_object type object - parameters will be pulled from the object instead, specifically from the `nkrParams` slot.
#' @param verbose If set to TRUE will display more detail
#' @param ... Additional arguments (unused)
#' @importFrom data.table fread
#' @importFrom R.utils gzip
#' @import modeest parallel doParallel foreach
#' @method estimate_nkr RCNA_object
#' @details For more parameter information, see \link{estimate_nkr.default}.
#' @return A \linkS4class{RCNA_analysis} class object that describes the input parameters and output files generated by this step of the workflow.
#' @rdname estimate_nkr.default
#' @export

estimate_nkr.RCNA_object = function(obj, verbose = FALSE, ...){

  ano.file = obj@ano.file
  nkr.dir = file.path(obj@out.dir, "nkr")
  nkr = obj@nkr
  norm.cov.matrix = obj@norm.cov.matrix
  config.df = obj@nkrParams
  ncpus = obj@ncpu

  if(!dir.exists(nkr.dir)){
	  if(verbose) message("Creating NKR output directory at ", nkr.dir)
	  dir.create(nkr.dir, recursive = TRUE)
  }

  ano = read.csv(ano.file,sep=',',header=TRUE,stringsAsFactors=FALSE)

  ## calculate mode normalized (subject level), median centered and mode adjusted results
  if(is.null(norm.cov.matrix)){
	  warning("Normalized coverage matrix location not specified - using output dir (", nkr.dir, ")")
	  res.file = file.path(nkr.dir, 'normalized-coverage-matrix.csv.gz')
  } else if(dir.exists(norm.cov.matrix)){
	  res.file = file.path(norm.cov.matrix, 'normalized-coverage-matrix.csv.gz')
	  warning("norm.cov.matrix is a directory - creating coverage matrix at ", res.file)
  } else {
	  res.file = norm.cov.matrix
	  if(file.exists(res.file) & verbose) message(res.file, " passed as argument `norm.cov.matrix` already exists. Overwriting existing file.")
  }

  if (file.exists(res.file) & !dir.exists(res.file)) {
    if(verbose) message('Importing coverage matrix from previous run')
	  cov.ratios = data.table::fread(paste0(res.file),sep=',',header=TRUE,stringsAsFactors=FALSE)
  } else {
    if(verbose) message('Generating coverage matrix')
	  # Load first coverage file
	  cov.ex = read.csv(config.df$file.nkr.coverage[1], sep = "\t", stringsAsFactors = FALSE)
	  cov.ratios = as.data.frame(matrix(ncol=nrow(config.df),nrow=nrow(cov.ex)),stringsAsFactors=FALSE)
	  # Add feature to coverage file
	  cov.ex$feature = unlist(lapply(sapply(cov.ex$pos, function(x) ano[which(x > ano$start & x < ano$end), "feature"]),function(y){paste(y,collapse='_')}))
	  rownames(cov.ratios) = paste(cov.ex$feature, cov.ex$chr, cov.ex$pos, sep = "_")
	  colnames(cov.ratios) = config.df$sample.names
	  for (i in 1:nrow(config.df)) {
		  x.norm = config.df$x.norm[i]
		  cov = read.csv(config.df$file.nkr.coverage[i],sep='\t',header=TRUE,stringsAsFactors=FALSE)

		  ## impute 0 values to 1 to avoid -INF
		  cov$coverage[cov$coverage==0] = 1

		  # calculate mode normalized coverage ratio correcting for females on the X chromosome
		  female.x.norm = if(x.norm) {as.numeric(cov.ex$chrom=='chrX')} else {rep(0,nrow(cov))}
		  cov$log2.ratio = log2(cov$coverage/modeest::mlv(cov$coverage,method='shorth')) - female.x.norm
		  cov.ratios[,i] = cov$log2.ratio
	  }

	  ## center subjects using median across subjects
	  pos.medians = apply(cov.ratios,1,median)
	  cov.ratios = cov.ratios - pos.medians

	  ## correct subject bias by subtracting the subject's mode from the 0 value (acorss-subject median)
	  mode.adj = sapply(1:ncol(cov.ratios),function(x){modeest::mlv(cov.ratios[,x],method='shorth')})
	  cov.ratios = t(t(cov.ratios) - mode.adj)
	  if(grepl(".gz$", res.file, perl = TRUE)){
	    write.table(cov.ratios, gsub(".gz$", "", res.file, perl=TRUE),sep=',',quote=TRUE,row.names=TRUE)
	    R.utils::gzip(gsub(".gz$", "", res.file, perl=TRUE),overwrite=TRUE)
	  } else{
	    write.table(cov.ratios, res.file, sep = ',', quote=TRUE,row.names=TRUE)
	  }
  }

  config.df = config.df[,c("file.nkr.coverage", "x.norm", "sample.names")]

  do_estimate_nkr(sample.names = config.df$sample.names, file.nkr.coverage = config.df$file.nkr.coverage, x.norm = config.df$x.norm, ano.file = ano.file, res.dir = nkr.dir, nkr = nkr, ncpus = ncpus, cov.ratios = cov.ratios, verbose = verbose)
  if(verbose) message("NKR estimation succeeded!")
  output_analysis_obj = new("RCNA_analysis", call = "estimate_nkr", params = list("nkrParams" = config.df, "nkr" = nkr, "cov.ratios" = res.file), res.files = list("Output" = list.files(nkr.dir), "Normalized coverage ratios" = res.file))
  return(output_analysis_obj)
}

#' @import doParallel parallel foreach
#' @importFrom stats median
# Function to run each sample
do_estimate_nkr = function (ano.file, file.nkr.coverage, res.dir,  nkr = 0.95, x.norm, ncpus = 1, cov.ratios, sample.names, verbose = FALSE) {
	# Coerce args
	nkr = as.numeric(nkr)
	ano.file = as.character(ano.file)
	file.nkr.coverage = as.character(file.nkr.coverage)
	res.dir = as.character(res.dir)
	ncpus = as.numeric(ncpus)

	if(!dir.exists(res.dir)){
	  if(verbose) message("Creating ", res.dir)
		dir.create(res.dir)
	}

	## generate exon level feature model data frame (format based on example data in Gviz manual)
	ano = read.csv(ano.file,sep=',',header=TRUE,stringsAsFactors=FALSE)
  k <- 0
	## for each feature do
	# parallel implementation add ".options.multicore = opts" to foreach options
	cl=parallel::makeCluster(ncpus)
	registerDoParallel(cl)
	foreach (k=1:length(unique(ano$feature))) %dopar% {
		feature = unique(ano$feature)[k]

		chr = paste0('chr',ano$chromosome[ano$feature==feature])
		start = ano$start[ano$feature==feature]
		end = ano$end[ano$feature==feature]

		if (! file.exists(file.path(res.dir, paste0(feature,'.RData')))) {

			## limit all positions on feature to those actually sequenced
			cov.feature.pos = grep(paste0('^',feature,'_|_',feature,'_'),rownames(cov.ratios))
			cov.ratios.keep = cov.ratios[cov.feature.pos,]
			cov.ratios.keep = cov.ratios.keep[which(rowSums(cov.ratios.keep,na.rm=TRUE)!=0),]

			res = normal.range.by.pos(x=cov.ratios.keep, statistic='median', p=as.numeric(nkr), digits=6)
			res[['raw']] = cov.ratios.keep

			## write results
			outfile = file.path(res.dir, paste0(feature,'.RData'))
			saved.res = save(res,file=outfile,compress=TRUE)
		}
	}
	parallel::stopCluster(cl)
}


#' @import stats
## generate statistics as well as "p" data nkr interval
normal.range.by.pos = function(x, statistic='median', p=0.95, digits=6) {

	## generate holding variables
	vals.pos = c()
	lcis.pos = c()
	ucis.pos = c()

	## for each row in the matrix do
	for (i in 1:nrow(x)) {
		x.pos = stats::na.omit(as.numeric(x[i,]))

		## Get specified confidence intervals
		qnt.pos = stats::quantile(x.pos, (1-p) / 2*c(1,-1) + c(0,1),na.rm=TRUE)

		## get desired value based on statistic
		vals.pos[i] = round(do.call(statistic, list(x.pos)),digits)
		lcis.pos[i] = round(qnt.pos[1],digits)
		ucis.pos[i] = round(qnt.pos[2],digits)
	}

	## Return list of results
	return(list(val.pos = vals.pos, lci.pos = lcis.pos, uci.pos = ucis.pos))
}