ppGS: Parallel pipeline for Genomic Selection

utils::globalVariables (c("EN", "brewer.pal"))
#' Genomic selection for single phenotype and multiple models 
#'
#' Function that performs genomic selection by evaluating in parallel multiple models on a single phenotype.
#' Models are evaluated through cross-validation, then the best model is selected, 
#' and GEBVs are calculated using this model. 
#' Results are generated in both tables and graphics, and they include: 
#' GEBVs for target and training datasets, graphic comparison of predicted ability for each model 
#' (correlation between actual and predicted values), and comparison of execution times between models.
#'
#' @param paramsFile Filename of the parameter file (YAML Format).
#' @param outputDir  (Optional) Directory name where results will be saved. If empty, phenotype name is used. 
#'
#' @return TRUE if no errors or FALSE otherwise.
#'
#' @import utils
#' @import grDevices
#' @import graphics
#' @importFrom stats complete.cases
#' @import parallel
#' @import yaml
#' @importFrom RColorBrewer brewer.pal
#'
#' @export
gs_single <- function (paramsFile, outputDir="") {
	message (">>>>>>>>>>>>>>>>>>>> Running GS for parameters file: ", paramsFile, "<<<<<<<<<<<<<<<<<<<<<<<")
	#------------------------------------------------------
	# Single phenotype / Multiple models genomic selection
	#------------------------------------------------------

	# load input files with or without all markers
	out    = readGenoPhenoTrainingTargetData (paramsFile)
	data   = out$data
	params = out$params

	errorOutput = TRUE
	tryCatch ({
		trainingGeno  = data$trainingGeno
		trainingPheno = data$trainingPheno
		targetGeno    = data$targetGeno
		targetPheno   = data$targetPheno

		# Create and change to output dir
		if (outputDir=="")
			outputDir = data$phenoName
		createDir (outputDir)
		setwd (outputDir)

		listCVs = runCrossValidation (data, params)
		createOutputsCrossValidation (listCVs, params$phenoName)
		bestMethod = getBestGenomicPredictionMethod (listCVs, params$methods)

		predictGEBVsTraining (trainingGeno, trainingPheno, trainingGeno, params$methods, params$phenoName, params$nCores)
		predictGEBVsTarget (trainingGeno, trainingPheno, trainingGeno, trainingPheno, bestMethod, params$phenoName, "TRAINING")
		predictGEBVsTarget (trainingGeno, trainingPheno, targetGeno, targetPheno, bestMethod, params$phenoName, "TARGET")

		flush(stderr())
		#BglrMethods = c("BA", "BB", "BC", "BL","BRR","RKHS")
		#METHODS = c("GBLUP","EGBLUP","RR","LASSO","EN","BRR","BL","BA", "BB", "BC","RKHS","RF","SVM")
		#AllMethods = c("GBLUP","EGBLUP","RR","LASSO","EN","BRR","BL","BA", "BB", "BC","RKHS","RF","SVM","BRNN")
	}, error = function (e){
		message (sprintf ("Error: In trait %s ", outputDir), e)
		errorOutput <<- FALSE
	})
	return (errorOutput)
}

#-----------------------------------------------------------------------------------------------------------
# Compare predicted value with diffetent methods
#-----------------------------------------------------------------------------------------------------------
predictGEBVsTarget <- function (trainingGeno, trainingPheno, targetGeno, targetPheno,  bestMethod, TRAIT, PREFIX) {
	message (">>> Predicting GEBVs for best model using target data...")

	# Run predictions target
	params             = list ("gtrn"=trainingGeno, "ptrn"=trainingPheno, "gtrg"=targetGeno)
	predictedGEBVs     = predictGEBVsFunction (bestMethod, params)

	if (is.null (targetPheno)) 
		predictedGEBVsTable =  data.frame (predictedGEBVs, row.names = rownames (targetGeno))
	else {
		targetPheno         = as.data.frame (targetPheno)
		noNAs               = row.names (targetPheno [complete.cases (targetPheno),,drop=F])
		predicted           = data.frame (noNAs, PREDICTED=predictedGEBVs [noNAs], row.names=1)
		predictedGEBVsList  = list (predicted, ACTUAL=targetPheno [noNAs,])

		# Write outputs to table and histograms
		mainText            = sprintf ("%s GEBVs for '%s' trait, using '%s' model", PREFIX, TRAIT, bestMethod)
		outFile             = sprintf ("out-GEBVs-BestModel-%s", PREFIX)
		predictedGEBVsTable = as.data.frame (predictedGEBVsList)
		rownames (predictedGEBVsTable) = noNAs
		message (">>> Creating outputs for Actual vs. Predicted GEBVs...", TRAIT)
		graphicPredictedOutputs (predictedGEBVsList, mainText, outFile, predictedGEBVsTable)
	}
	colnames (predictedGEBVsTable)[1] = sprintf ("GEBVs(%s)", bestMethod)
	predictedGEBVsTable = cbind (SAMPLES=rownames (predictedGEBVsTable), predictedGEBVsTable)
	write.csv (predictedGEBVsTable, "out-GEBVs-BestModel-TARGET.csv", row.names=F,quote=F)
}

#-----------------------------------------------------------------------------------------------------------
# Compare predicted value with diffetent methods
#-----------------------------------------------------------------------------------------------------------
predictGEBVsTraining <- function (trainingGeno, trainingPheno, targetGeno, METHODS, TRAIT, NCORES) {
	message (">>> Predicting GEBVs for different models only using training data...")
	params = list ("gtrn"=trainingGeno, "ptrn"=trainingPheno, "gtrg"=targetGeno)

	# Run predictions in parallel
	predictedGEBVsList = mclapply (METHODS, predictGEBVsFunction, params, mc.cores=NCORES, mc.silent=T)

	# Write outputs to table and histograms
	outFile             = "out-GEBVs-AllModels-TRAINING"
	mainText            = sprintf ("Comparison of GEBVs from different models for '%s' trait", TRAIT)
	predictedGEBVsTable = as.data.frame (predictedGEBVsList, col.names=METHODS)
	message (">>> Creating outputs for Predicted GEBVs from different methods...")
	graphicPredictedOutputs (predictedGEBVsList, mainText, outFile, predictedGEBVsTable)
}

#----------------------------------------------------------
# Read geno/pheno trainit/target data
# Also, use selected or all markers, if "gwasResultsPath" is set or not.
#----------------------------------------------------------
readGenoPhenoTrainingTargetData <- function (paramsFile) {
	message (">>> Reding geno/pheno datasets...")
	# Load parameters file and make a list from methods parameter
	params         = yaml.load_file (paramsFile)
	params$methods = strsplit (params$methods, "[ ]+")[[1]]

	# Read arguments and load data
	trainingGeno     = read.csv (params$trainingGenoFile, check.names=F, row.names=1)
	trainingPhenoTbl = read.csv (params$trainingPhenoFile, check.names=F)
	targetGeno       = read.csv (params$targetGenoFile, check.names=F, row.names=1)
	phenoName        = params$phenoName
	# Check if target pheno was given
	if (is.null (params$targetPhenoFile)) 
	targetPheno = NULL
	else {
		targetPhenoTbl      = read.csv (params$targetPhenoFile, check.names=F)
	  	targetPheno         = targetPhenoTbl [,2]
	  	names (targetPheno) = targetPhenoTbl [,1]
	}

	# Convert phenotype table to vector
	trainingPheno         = trainingPhenoTbl [,2]
	names (trainingPheno) = trainingPhenoTbl [,1]
	trait                 = colnames (trainingPhenoTbl)[2]


	data = list (trainingGeno, trainingPheno, targetGeno, targetPheno, trait, phenoName)
	names (data) = c ("trainingGeno", "trainingPheno", "targetGeno", "targetPheno", "trait", "phenoName")

	return (list (params=params, data=data))
}
#-------------------------------------------------------------
# Wrapper for BWGS function "predict" that predics GEBVs
#-------------------------------------------------------------
predictGEBVsFunction <- function (method, params) {
	message (">>>>>>>>>>", method)
	predictedGEBVs  = bwgs.predict (predict.method=method,
	                                geno_train=params$gtrn,pheno_train=params$ptrn, geno_target=params$gtrg,
	                                MAXNA=0.2,MAF=0.05,geno.reduct.method="NULL",reduct.size="NULL",
	                                r2="NULL",pval="NULL",MAP="NULL",geno.impute.method="MNI")
	df = predictedGEBVs [,1]
	names (df) = rownames (params$gtrg)
	return (df)
}

#-------------------------------------------------------------
# Graphic histograms and scatterplots for GEBVx
#-------------------------------------------------------------
graphicPredictedOutputs <- function (predictedGEBVsList, mainText, outFilename, predictedGEBVsTable, PREFIX="") {
	## put histograms on the diagonal
	panel.hist <- function(x, ...) {
	  usr = par("usr"); on.exit(par(usr))
	  par(usr = c(usr[1:2], 0, 1.5) )
	  h      = hist(x, plot = FALSE)
	  breaks = h$breaks; nB <- length(breaks)
	  y      = h$counts; y <- y/max(y)
	  rect(breaks[-nB], 0, breaks[-1], y, col = "cyan", ...)
	}
	## put (absolute) correlations on the upper panels, with size proportional to the correlations.
	panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...) {
	  usr = par("usr"); on.exit(par(usr))
	  par(usr = c(0, 1, 0, 1))
	  r   = abs(cor(x, y))
	  #txt = format(c(r, 0.123456789), digits = digits)[1]
	  txt = format(r, digits = digits)
	  txt = paste0(prefix, txt)
	  if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
	  text(0.5, 0.5, txt, cex = cex.cor * r)
	}

	write.csv (predictedGEBVsTable, paste0 (outFilename, ".csv"), quote=F, row.names=T)
	pdf (paste0 (outFilename, ".pdf"), width=13, height=13)
	pairs(predictedGEBVsTable,lower.panel = panel.smooth,
	      upper.panel = panel.cor, diag.panel=panel.hist, main=mainText)
	dev.off()
}

#-------------------------------------------------------------
# Get Best Method
#-------------------------------------------------------------
getBestGenomicPredictionMethod <- function (listCVs, METHODS) {
	predictedAbilitiesMeans = mclapply (listCVs, function (cvObject ){return (cvObject$summary[[1]])}, mc.silent=T)
	bestMethod          = METHODS [which.max (predictedAbilitiesMeans)]
	return (bestMethod)
}
#-------------------------------------------------------------
# Prediction ability comparison of methods
# a.Execute prediction methods
#-------------------------------------------------------------
runCrossValidation <- function (data, params) {
	trainingGeno  = data$trainingGeno
	trainingPheno = data$trainingPheno
	METHODS       = params$methods
	nTimes        = params$nTimes
	nFolds        = params$nFolds
	NCORES        = params$nCores
	TRAIT         = params$phenoName

	message ("\t>>> Running cross validation using nTimes=",nTimes, " nFolds=", nFolds)
	message ("\t>>> Methods=", METHODS)

	crossValidationFunction <- function (method) {
	  message (">>> ", method)
	  cvOutput <- bwgs.cv (trainingGeno, trainingPheno, predict.method= method,
	                       geno.impute.method="mni", nFolds=nFolds, nTimes=nTimes )
	  return (cvOutput)
	}

	listCVs         = mclapply (METHODS, crossValidationFunction, mc.cores=NCORES, mc.silent=T)
	names (listCVs) = METHODS
	return (listCVs)
}

#-------------------------------------------------------------
# Create output tables and graphics from Cross Validation results
#-------------------------------------------------------------
createOutputsCrossValidation <-function (listCVs, TRAIT) {
	message ("\t>>> Get info from cross validation...")
	predictedAbilitiesList   = mclapply (listCVs, function (cvObject ){return (cvObject$cv)}, mc.silent=T)
	predAbilTable            = as.data.frame (predictedAbilitiesList)
	predictedTimesList       = mclapply (listCVs, function (cvObject ){return (cvObject$summary[[3]])}, mc.silent=T)

	predictedAbilitiesKFolds = mclapply (listCVs, function (cvObject ){return (cvObject$summary$KFoldsTable)}, mc.silent=T)
	predAbilKFoldsTable      = do.call ("rbind", predictedAbilitiesKFolds)

	predTimesTable           = as.data.frame (predictedTimesList)
	#predictedAbilitiesMeans = mclapply (listCVs, function (cvObject ){return (cvObject$summary[[1]])})
	#bestMethod          = METHODS [which.max (predictedAbilitiesMeans)]

	# Write outputs: tables and boxplots
	nModels   = ncol (predAbilTable)
	allColors = unlist (lapply (c ("Set1", "Set2", "Set3"), function (x) brewer.pal (n=8, x)))
	COLORS    = allColors [1:nModels]

	message ("\t>>> Outputs for predicted ability by methods...")
	outFilename = "out-CrossValidation"

	#write.csv (predAbilTable, paste0 (outFilename, "-means.csv"), quote=F, row.names=F)
	#pdf (paste0 (outFilename, "-means.pdf"), width=13, height=13)
	#mainText = paste0 ("Predictive ability by models\nTrait: ", TRAIT)
	#boxplot(predAbilTable, xlab="Prediction model",  ylab="Predictive ability", main=mainText, col=COLORS)
	#dev.off()

	message ("\t>>> Outputs for times for each method...")
	write.csv (predTimesTable, paste0 (outFilename, "-times.csv"), quote=F, row.names=F)
	pdf (paste0 (outFilename, "-times.pdf"), width=13, height=13)
	mainText = paste0 ("Average time by models\nTrait: ", TRAIT)
	barplot (unlist (predictedTimesList), main=mainText, ylab="Time (mins)", xlab="Methods", col=COLORS)
	dev.off()

	# Create kfolds table with PA for each fold and time
	message ("\t>>> Outputs for predicted ability by methods correlations by fold...")
	write.csv (predAbilKFoldsTable, paste0 (outFilename, "-kfolds.csv"), quote=F, row.names=F)
	pdf (paste0 (outFilename, "-kfolds.pdf"), width=13, height=13)
	mainText = paste0 ("Cross validation by models \nTrait: ", TRAIT)
	boxplot (Predictive_ability~Models, data=predAbilKFoldsTable, main=mainText, col=COLORS)
	dev.off()
}
#-----------------------------------------------------------------------------------------------------------
# c. Compare predicted value varying training size
#-----------------------------------------------------------------------------------------------------------
comparePredictionAbilitiesBySize <- function (listCVs, trainingGeno, trainingPheno, METHODS, nTimes, nFolds, TRAIT, NCORES, NCHUNKS) {
	message (">>> Predicting values by size")
	nSamples  = length (trainingPheno)
	N         = NCHUNKS   # number of chunks
	chunkSize = floor (nSamples / N)
	sizes     = seq (chunkSize, nSamples, chunkSize)
	sizes     = sizes [1:N]

	message ("\t>>> Get info from cross validation")
	predictedAbilitiesMeans = mclapply (listCVs, function (cvObject ){return (cvObject$summary[[1]])}, mc.silent=T)
	bestMethod          = METHODS [which.max (predictedAbilitiesMeans)]

	#--- Local Function -------------------------------------------------------------------
	cvFunSize <- function (sampleSize, method) {
	  cvOutput <-bwgs.cv (sample.pop.size=sampleSize,
	                      predict.method=method,
	                      geno=trainingGeno, pheno=trainingPheno, pop.reduct.method="RANDOM",
	                      geno.impute.method="mni", nFolds=nFolds, nTimes=nTimes )
	  return (cvOutput$cv)
	} #-------------------------------------------------------------------------------------

	listCVsSize = mclapply (sizes[1:(N-1)], cvFunSize, bestMethod, mc.cores=NCORES, mc.silent=T)
	listCVsSize = append (listCVsSize, list(listCVs[[bestMethod]]$cv))
	names (listCVsSize) = paste0("N=", sizes)

	message ("\t>>> Writing outputs for predicted value by size...")
	outFilename = "out-predicted-value-by-size-bestMethod"
	CompareSize = as.data.frame (listCVsSize, col.names=paste0("N=",as.character(sizes)),check.names=F)
	write.csv (CompareSize, paste0 (outFilename, ".csv"), quote=F, row.names=F)

	pdf (paste0 (outFilename, ".pdf"), width=13, height=13)
	mainText = paste0 ("Effect of training population size\n Trait: ", TRAIT, ", Method: ", bestMethod )
	boxplot(CompareSize,xlab="Training POP size",ylab="Predictive avility", main=mainText)
	dev.off()
}

#----------------------------------------------------------
# Create dir, if it exists the it is renamed old-XXX
#----------------------------------------------------------
createDir <- function (newDir) {
	checkOldDir <- function (newDir) {
	  name  = basename (newDir)
	  path  = dirname  (newDir)
	  if (dir.exists (newDir) == T) {
	    oldDir = sprintf ("%s/old-%s", path, name)
	    if (dir.exists (oldDir) == T)
	      checkOldDir (oldDir)
	    file.rename (newDir, oldDir)
	  }
	}
	checkOldDir (newDir)
	system (sprintf ("mkdir %s", newDir))
}