R/gs_plots.R

#!/usr/bin/Rscript

#' Plot for cross validation
#' Function that creates a comparison plot of predictive abilities for each phenotype and each model
#' @param predictionsKFoldsTable Table with information from cross validation: GEBVs for each fold and each repetition.
#' @return None
#' @import ggplot2
#' @import dplyr 

gs_outputs <- function (resultsDir="", kfoldsTable=""){
	if (resultsDir!="")
		kfoldsTable  = createSummaryTables (resultsDir)
	gs_plots (kfoldsTable)

}

#-------------------------------------------------------------
#--- Create output tables and boxplots for best model for each trait
#-------------------------------------------------------------
gs_plots <- function (predictionsKFoldsTable) {
	gs_plotsHCL (predictionsKFoldsTable, "Predictive_ability")
}

#-------------------------------------------------------------
#--- Create output tables and boxplots for best model for each trait
#-------------------------------------------------------------
gs_plotsHCL <- function (HCLTable, columnName, TITLE) {
	message (">>> Creating output plots and tables for best model ...")

	# Summary plot boxplot best models for traits
	meansTable = HCLTable %>% dplyr::group_by (Traits, Models) %>% 
		summarize (Means=mean(get(columnName)), .groups="keep") 

	# Get best model for each trait
	bestTable = meansTable %>% group_by (Traits) %>% slice_max (Means) %>% slice_head

	# Create table with correlations from best models
	getBestCorr <- function (trait, model) 
		return (dplyr::filter (HCLTable, Traits==trait, Models==model))
	bestCorrs = do.call (rbind, mapply (getBestCorr, bestTable$Traits, bestTable$Models, SIMPLIFY=F))

	traitColors = unlist (lapply(1:8, function(x) rep(x,3))) 
	nTraits     = length (unique (bestCorrs$Traits))
	traitColors = traitColors [1: nTraits] 
	outPlotname   = "out-CrossValidation-ModelsTraits.pdf"
	ggplot (bestCorrs, aes(x=HCL_component, y=get(columnName))) + ylim (0,1) +
		    geom_boxplot (alpha=0.3, fill=traitColors) + 
			theme(axis.text.x=element_text(angle=0, hjust=1)) + labs (title="Best GS models for color traits in tetraploid potato") + 
			stat_summary(geom='text', label=round (bestTable$Means,2) , fun=max, vjust = -1, size=3, col="blue") +
			stat_summary(geom='text', label=bestTable$Models, fun=min, vjust = 1.5, angle=0,size=3, col="blue") + 
			facet_wrap (~Prefix, ncol=8) 
	ggsave (outPlotname, width=11)
}
#-------------------------------------------------------------
#-------------------------------------------------------------
# Create summary tables from GS results of each phenotype
# Create a GEBVs table and kfolds table, the last is returned to be graphed
#-------------------------------------------------------------
createSummaryTables <- function (inputDir) {
	message (">>> Creating table from cross validations predictive abilities...")

	fileCrossValPredictions  = "out-CrossValidation-kfolds.csv"
	fileGEBVsPredictions     = "out-GEBVs-BestModel-TARGET.csv"

	predictionsKFoldsTable   = data.frame ()

#	phenoNames               = colnames (read.csv (phenotypeFile)[,-1])
	phenoNames               = list.files (inputDir)

	predictionsTableList     = list()
	for (pheno in phenoNames) {
		traitPrefix = strsplit (pheno, "[.]")[[1]][1]
		traitHCL    = strsplit (pheno, "[.]")[[1]][3]

		# Construct table of cross validation means, checking if CV data exists
		predictionsFilePath = sprintf ("%s/%s/%s", inputDir, pheno, fileCrossValPredictions)
		if (!file.exists (predictionsFilePath)) {
			message ("WARNING: file ", predictionsFilePath, " not found!")
			cvKFoldsTable  = as.data.frame (list (Models="",Predictive_ability=0))
		}else {
			cvKFoldsTable  = read.csv (predictionsFilePath)

			# Build GEBVs table for all phenotypes
			filePathGEBVsPredictions = sprintf ("%s/%s/%s", inputDir, pheno, fileGEBVsPredictions)
			predictionsTable         = read.csv (filePathGEBVsPredictions, row.names=1)
			names (predictionsTable) = c (pheno, sprintf ("%s.GEBV", pheno))
			predictionsTableList     = append (predictionsTableList, predictionsTable)
		}
		
		# Add prefix and component 
		dfModels = data.frame (Prefix=traitPrefix, HCL_component=traitHCL, Traits=pheno, cvKFoldsTable)
		predictionsKFoldsTable = rbind (predictionsKFoldsTable, dfModels)
	}

	# Write GEBVs file
	#outFile = gsub (".csv", "-ALL.csv", fileGEBVsPredictions)
	outFile = addLabel (fileGEBVsPredictions, "ALL")
	df      = data.frame (SAMPLES=rownames(predictionsTable), predictionsTableList)
	write.csv (df, outFile, row.names=F, quote=F)

	# Write Prediction Ability file
	outFile = addLabel (fileCrossValPredictions, "GEBVs")
	write.csv (predictionsKFoldsTable, outFile, quote=F, row.names=F)

	return (predictionsKFoldsTable)
}

#-------------------------------------------------------------
# Add label to filename and new extension (optional)
#-------------------------------------------------------------
addLabel <- function (filename, label, newExt=NULL)  {
	nameext = strsplit (filename, split="[.]")
	name    = nameext [[1]][1] 
	if (is.null (newExt))
		ext     = nameext [[1]][2] 
	else
		ext     = newExt
	newName = paste0 (nameext [[1]][1], "-", label, ".", ext )
	return (newName)
}
#-------------------------------------------------------------
#-------------------------------------------------------------
main <- function () {
	library (dplyr)
	library (ggplot2)
	args = commandArgs (trailingOnly=T)
	resultsDir = args[1]
	gs_outputs (resultsDir)
}
luisgarreta/ppGS documentation built on Jan. 17, 2022, 9:23 p.m.