utils::globalVariables(c("Traits", "Models", "y", "Predictive_ability", "SNP", "Means", "HCL_component"))
#' Genomic selection fro multiple phenotypes and multiple Models
#'
#' Function that runs genomic selection in parallel for multiple phenotypes and multiple models.
#'
#' @param configFile Filename of the parameter file (YAML Format). It includes:
#' trainingGenoFile: Filename of training genotype data,
#' trainingPhenoFile: Filename of training phenotype data,
#' trainingGenoFile: Filename of target genotype table,
#' targetPhenoFile: Filename of target phenotype table (opt.),
#' markersFile: Filename of selected markers to test from
#' genotype (opt.),
#' nMarkers: Number of selected markers to test,
#' methods: Methods or models to test in GS,
#' nTimes: Number of independent replicates for the cross-validation,
#' nFolds: Number of folds for the cross-validation,
#' nCores: Number of cores to use.
#'
#' @param outputDir The directory where the results will be saved.
#'
#' @return None
#'
#' @import parallel
#' @import yaml
#' @import ggplot2
#' @import dplyr
#'
#' @export
gs_multi <- function (configFile, outputDir) {
#-------------------------------------------------------------
# Multiple phenotype / Multiple models genomic selection
#-------------------------------------------------------------
params = loadConfigurationParameters (configFile)
phenosDir = "phenotypes"
# Read/Processing input files and set directories
init = initFilesAndWorkingDir (outputDir, params, configFile)
files = createInputFilesGS (init$data, init$files, params, phenosDir)
# Create parameters file for each phenotype to run GS
paramsPhenoFiles = createParamFileForEachPhenotype (files, params, phenosDir)
# Run in parallel GS
dummy=mclapply (paramsPhenoFiles, run_gs_single, init$outputDir, mc.cores=params$nCores)
message ("LOG Errors: ")
print (dummy)
# Organize result files
organizeOutputFiles (init$outputDir)
# Create summary plots and tables
message ("Creating predictionsKFoldsTable...")
resultsDir = "results"
predictionsKFoldsTable = createSummaryTables (resultsDir)
createSummaryPlot (predictionsKFoldsTable)
}
#-------------------------------------------------------------
# Wrapped function for gs_single
#-------------------------------------------------------------
run_gs_single <- function (configFile, outputDir) {
setwd (outputDir)
out = gs_single (configFile)
return (out)
}
#-------------------------------------------------------------
# Create parallelGS parameter file for each phenotype
#-------------------------------------------------------------
createParamFileForEachPhenotype <- function (files, params, phenosDir) {
paramsPhenoFiles = c()
for (i in 1:length(files$phenoNames)) {
phenoName = files$phenoNames [i]
trainingGeno = files$trainingGeno [i]
targetGeno = files$targetGeno [i]
trainingPheno = files$trainingPhenos [i]
targetPheno = files$targetPhenos [i]
configFile = sprintf ("%s/%s-params.yml", phenosDir, phenoName)
sink (configFile)
cat (sprintf ("trainingGenoFile : %s\n", trainingGeno))
cat (sprintf ("trainingPhenoFile : %s\n", trainingPheno))
cat (sprintf ("targetGenoFile : %s\n", targetGeno))
cat (sprintf ("targetPhenoFile : %s\n", targetPheno))
cat (sprintf ("phenoName : %s\n", phenoName))
cat (sprintf ("useGwasMarkers : %s\n", params$useGwasMarkers)) # Use only best markers from GWAS results
cat (sprintf ("gwasResultsPath : %s\n", params$gwasResultsPath))
cat (sprintf ("methods : %s\n", params$methods))
cat (sprintf ("nTimes : %s\n", params$nTimes))
cat (sprintf ("nFolds : %s\n", params$nFolds))
cat (sprintf ("nCores : %s\n", params$nCores))
sink ()
paramsPhenoFiles = c (configFile, paramsPhenoFiles)
}
return (paramsPhenoFiles)
}
#-------------------------------------------------------------
# Move files to directories: data, phenotypes, and results
#-------------------------------------------------------------
organizeOutputFiles <- function (outputDir) {
setwd (outputDir)
dir.create ("data")
for (f in list.files (pattern=".csv"))
file.rename (f, sprintf ("data/%s", f))
for (f in list.files (pattern=".yml"))
file.rename (f, sprintf ("data/%s", f))
for (f in list.files (pattern=".log"))
file.rename (f, sprintf ("data/%s", f))
dataFiles = setdiff (list.files (), c("data", "phenotypes"))
dir.create ("results")
for (f in dataFiles)
file.rename (f, sprintf ("results/%s", f))
}
#-------------------------------------------------------------
# Create output directory and copy into it genotype and phenotype data
#-------------------------------------------------------------
initFilesAndWorkingDir <- function (outputDir, params, configFile, createNewDir=T) {
data = c()
data$trainingGeno = read.csv (params$trainingGenoFile)
data$trainingPhenos = read.csv (params$trainingPhenoFile)
data$targetGeno = read.csv (params$targetGenoFile)
data$nMarkers = params$nMarkers
# Check if gwas markers was given
if (is.null(params$markersFile))
data$gwasMarkers = NULL
else
# Get N random markers, only for testing
if (params$markersFile=="RANDOM") {
randomMarkers = unlist (sample (colnames (data$trainingGeno[,-1]), data$nMarkers))
data$trainingGeno = cbind (SAMPLES=data$trainingGeno[,1], data$trainingGeno [,randomMarkers])
data$targetGeno = cbind (SAMPLES=data$targetGeno[,1], data$targetGeno [,randomMarkers])
}else
data$gwasMarkers = read.csv (params$markersFile)
# Check if target pheno was given
if (is.null (params$targetPhenoFile))
data$targetPhenos = NULL
else
data$targetPhenos = read.csv (params$targetPhenoFile)
files = c()
files$trainingGenoFile = addLabel (basename (params$trainingGenoFile), "TRAINING")
files$targetGenoFile = addLabel (basename (params$targetGenoFile), "TARGET")
files$trainingPhenoFile = addLabel (basename (params$trainingPhenoFile), "TRAINING")
# Create and set global working dir where all outputs will be saved
if (createNewDir)
createDir (outputDir)
file.copy (configFile, outputDir)
setwd (outputDir)
outputDir = getwd() # Full path
write.csv (data$trainingGeno, files$trainingGenoFile, quote=F, row.names=F)
write.csv (data$targetGeno, files$targetGenoFile, quote=F, row.names=F)
write.csv (data$trainingPhenos, files$trainingPhenoFile, quote=F, row.names=F)
if (!is.null (params$targetPhenoFile)) {
files$targetPhenoFile = addLabel (basename (params$targetPhenoFile), "TARGET")
write.csv (data$targetPhenos, files$targetPhenoFile, quote=F, row.names=F)
}
return (list (data=data, files=files, outputDir=outputDir))
}
#-------------------------------------------------------------
# Create genotype and phenotype files for each phenotype
#-------------------------------------------------------------
createInputFilesGS <- function (data, files, params, phenosDir) {
# Write one file by trait
phenoNames = colnames (data$trainingPhenos)[-1];#view (phenoNames)
trainingGenoFileList = c()
targetGenoFileList = c()
trainingPhenoFileList = c()
targetPhenoFileList = c()
createDir (phenosDir)
for (i in 1:length(phenoNames)) {
message (phenoNames [i])
phenoName = phenoNames [i]
# Get all or selected (GWAS) markers
genoFiles = getMarkersForGS (data$gwasMarkers, data$nMarkers, data$trainingGeno, data$targetGeno,
files$trainingGenoFile, files$targetGenoFile, phenoName, phenosDir)
trainingGenoFile = genoFiles$training
targetGenoFile = genoFiles$target
# Get single phenotype
trainingPheno = data$trainingPhenos [, c(1,1+i)] # ["Samples", "XXX"]
trainingPhenoFilename = sprintf ("%s/%s-trainingPheno.csv", phenosDir, phenoName)
write.csv (trainingPheno, trainingPhenoFilename, quote=F, row.names=F)
trainingGenoFileList = c (trainingGenoFileList, trainingGenoFile)
targetGenoFileList = c (targetGenoFileList, targetGenoFile)
trainingPhenoFileList = c (trainingPhenoFileList, trainingPhenoFilename)
targetPhenoFile = "NULL"
if (!is.null (params$targetPhenoFile)) {
targetPheno = data$targetPhenos [, c(1,1+i)] # ["Samples", "XXX"]
targetPhenoFile = sprintf ("%s/%s-targetPheno.csv", phenosDir, phenoName)
write.csv (targetPheno, targetPhenoFile, quote=F, row.names=F)
}
targetPhenoFileList = c(targetPhenoFileList, targetPhenoFile)
}
outList = list(trainingGenoFileList, targetGenoFileList, trainingPhenoFileList, files$targetGenoFile, targetPhenoFileList, phenoNames)
names (outList) = c("trainingGeno", "targetGeno", "trainingPhenos", "targetGeno", "targetPhenos", "phenoNames")
return (outList)
}
#-------------------------------------------------------------
# Use ALL or GWAS markers
# Return the genotype filename with ALL or GWAS markers
#-------------------------------------------------------------
getMarkersForGS <- function (gwasMarkers, nMarkers, trainingGeno, targetGeno, trainingGenoFile, targetGenoFile, phenoName, phenosDir) {
if (is.null (gwasMarkers)) {
message (">>> GS with ALL markers...")
}else {
message (">>> GS with GWAS markers...")
bestGwasMarkers = unique (gwasMarkers$SNP [gwasMarkers$COLORTRAIT==phenoName])
#bestGwasMarkers = getNScoredBestSNPs (gwasMarkers, phenoName, nMarkers)
allMarkers = colnames (trainingGeno);
commonMarkers = intersect (allMarkers, bestGwasMarkers)
trainingGeno = data.frame (SAMPLES=trainingGeno [,1], trainingGeno [,commonMarkers])
trainingGenoFile = sprintf ("%s/%s-trainingGeno.csv", phenosDir, phenoName)
write.csv (trainingGeno, trainingGenoFile, quote=F, row.names=F)
targetGeno = data.frame (SAMPLES=targetGeno [,1], targetGeno [,commonMarkers])
targetGenoFile = sprintf ("%s/%s-targetGeno.csv", phenosDir, phenoName)
write.csv (targetGeno, targetGenoFile, quote=F, row.names=F)
#message ("...END...");quit()
}
return (list (training=trainingGenoFile, target=targetGenoFile))
}
#-----------------------------------------------------------
# Sort best SNPs from multiple tools using a own score
#-----------------------------------------------------------
getNScoredBestSNPs <- function (gwasMarkers, phenoName, nMarkers) {
scoresTable = gwasMarkers [gwasMarkers[,1]==phenoName,]
# Add Count of SNPs between groups
dfCountSNPs = data.frame (add_count (scoresTable, SNP, sort=F, name="scoreShared"));
# Score GC
scoreSign = ifelse (scoresTable$SIGNIFICANCE, 1, 0)
scoreGC = 1 - abs (1-scoresTable$GC)
scoreSNPs = data.frame (add_count (scoresTable, SNP))$n
SCORE_SNP = 0.5*scoreGC + 0.3*scoreSNPs + 0.2*scoreSign
dataSNPsScores = select (data.frame (scoresTable, SCORE_SNP), SNP, SCORE_SNP, everything())
sortedGwasMarkers = dataSNPsScores [order (-SCORE_SNP),] %>% distinct (SNP, .keep_all=T)
nSNPsScores = nrow (sortedGwasMarkers)
if (nMarkers > nSNPsScores)
nMarkers = nSNPsScores
#bestGwasMarkers = sort (levels (scoresTable$SNP))
bestGwasMarkers = unlist (sortedGwasMarkers [1:nMarkers,1])
return (bestGwasMarkers)
}
#----------------------------------------------------------
# Read configuration parameters
#----------------------------------------------------------
loadConfigurationParameters <- function (configFile) {
params = yaml.load_file (configFile)
return (params)
}
#-------------------------------------------------------------
# Add label to filename and new extension (optional)
#-------------------------------------------------------------
addLabel <- function (filename, label, newExt=NULL) {
nameext = strsplit (filename, split="[.]")
name = nameext [[1]][1]
if (is.null (newExt))
ext = nameext [[1]][2]
else
ext = newExt
newName = paste0 (nameext [[1]][1], "-", label, ".", ext )
return (newName)
}
#----------------------------------------------------------
# Create dir, if it exists the it is renamed old-XXX
#----------------------------------------------------------
createDir <- function (newDir) {
checkOldDir <- function (newDir) {
name = basename (newDir)
path = dirname (newDir)
if (dir.exists (newDir) == T) {
oldDir = sprintf ("%s/old-%s", path, name)
if (dir.exists (oldDir) == T)
checkOldDir (oldDir)
file.rename (newDir, oldDir)
}
}
checkOldDir (newDir)
system (sprintf ("mkdir %s", newDir))
}
#-------------------------------------------------------------
# Create summary tables from GS results of each phenotype
# Create a GEBVs table and kfolds table, the last is returned to be graphed
#-------------------------------------------------------------
createSummaryTables <- function (inputDir) {
message (">>> Creating table from cross validations predictive abilities...")
fileCrossValPredictions = "out-CrossValidation-kfolds.csv"
fileGEBVsPredictions = "out-GEBVs-BestModel-TARGET.csv"
predictionsKFoldsTable = data.frame ()
# phenoNames = colnames (read.csv (phenotypeFile)[,-1])
phenoNames = list.files (inputDir)
predictionsTableList = list()
for (pheno in phenoNames) {
traitPrefix = strsplit (pheno, "[.]")[[1]][1]
traitHCL = strsplit (pheno, "[.]")[[1]][3]
# Construct table of cross validation means, checking if CV data exists
predictionsFilePath = sprintf ("%s/%s/%s", inputDir, pheno, fileCrossValPredictions)
if (!file.exists (predictionsFilePath)) {
message ("WARNING: file ", predictionsFilePath, " not found!")
cvKFoldsTable = as.data.frame (list (Models="",Predictive_ability=0))
}else {
cvKFoldsTable = read.csv (predictionsFilePath)
# Build GEBVs table for all phenotypes
filePathGEBVsPredictions = sprintf ("%s/%s/%s", inputDir, pheno, fileGEBVsPredictions)
predictionsTable = read.csv (filePathGEBVsPredictions, row.names=1)
names (predictionsTable) = c (pheno, sprintf ("%s.GEBV", pheno))
predictionsTableList = append (predictionsTableList, predictionsTable)
}
# Add prefix and component
dfModels = data.frame (Prefix=traitPrefix, HCL_component=traitHCL, Traits=pheno, cvKFoldsTable)
predictionsKFoldsTable = rbind (predictionsKFoldsTable, dfModels)
}
# Write GEBVs file
#outFile = gsub (".csv", "-ALL.csv", fileGEBVsPredictions)
outFile = addLabel (fileGEBVsPredictions, "ALL")
df = data.frame (SAMPLES=rownames(predictionsTable), predictionsTableList)
write.csv (df, outFile, row.names=F, quote=F)
# Write Prediction Ability file
outFile = addLabel (fileCrossValPredictions, "GEBVs")
write.csv (predictionsKFoldsTable, outFile, quote=F, row.names=F)
return (predictionsKFoldsTable)
}
#-------------------------------------------------------------
#--- Create output tables and boxplots for best model for each trait
#-------------------------------------------------------------
createSummaryPlot <- function (predictionsKFoldsTable) {
message (">>> Creating output plots and tables for best model ...")
# Summary plot boxplot best models for traits
meansTable = predictionsKFoldsTable %>% dplyr::group_by (Traits, Models) %>%
summarize (Means=mean(Predictive_ability), .groups="keep")
# Get best model for each trait
bestTable = meansTable %>% group_by (Traits) %>% slice_max (Means) %>% slice_head
# Create table with correlations from best models
getBestCorr <- function (trait, model)
return (dplyr::filter (predictionsKFoldsTable, Traits==trait, Models==model))
bestCorrs = do.call (rbind, mapply (getBestCorr, bestTable$Traits, bestTable$Models, SIMPLIFY=F))
traitColors = unlist (lapply(1:8, function(x) rep(x,3)))
nTraits = length (unique (bestCorrs$Traits))
traitColors = traitColors [1: nTraits]
outPlotname = "out-CrossValidation-ModelsTraits.pdf"
ggplot (bestCorrs, aes(x=HCL_component, y=Predictive_ability)) + ylim (0,1) +
geom_boxplot (alpha=0.3, fill=traitColors) +
theme(axis.text.x=element_text(angle=0, hjust=1)) + labs (title="Best GS models for color traits in tetraploid potato") +
stat_summary(geom='text', label=round (bestTable$Means,2) , fun=max, vjust = -1, size=3, col="blue") +
stat_summary(geom='text', label=bestTable$Models, fun=min, vjust = 1.5, angle=0,size=3, col="blue") +
facet_wrap (~Prefix, ncol=8)
ggsave (outPlotname, width=11)
}
#-------------------------------------------------------------
#-------------------------------------------------------------
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.