#' Run k-fold cross-validation
#'
#' Assess the accuracy of predicted previously unobserved genotypes
#' (individuals) based on the available training data. Runs k-fold
#' cross-validation for potentially multiple traits and optionally computing
#' prediction accuracy on user-specified selection index. Three models are
#' enabled: additive-only ("A"), additive-plus-dominance ("AD") and a
#' directional-dominance model that incorporates a genome-wide homozygosity
#' effect ("DirDom"). The union of all genotypes scored for all traits is broken
#' into k-folds a user specified number of times. Subsequently each train-test
#' pair is predicted for each trait and accuracies are computed.
#'
#' @param blups nested data.frame with list-column "TrainingData" containing
#' BLUPs. Each element of "TrainingData" list, is data.frame with de-regressed
#' BLUPs, BLUPs and weights (WT) for training and test.
#' @param modelType string, "A", "AD", "DirDom". modelType="A": additive-only,
#' GEBVS modelType="AD": the "classic" add-dom model, GEBVS+GEDDs = GETGVs
#' modelType="DirDom": the "genotypic" add-dom model with prop. homozygous fit
#' as a fixed-effect, to estimate a genome-wide inbreeding effect. obtains
#' add-dom effects, computes allele sub effects (\eqn{\alpha = a + d(q-p)})
#' incorporates into GEBV and GETGV. "DirDom" requires dosages
#' @param selInd logical, TRUE/FALSE, selection index accuracy estimates,
#' requires input weights via \code{SIwts}
#' @param SIwts required if \code{selInd=FALSE}, named vector of selection index
#' weights, names match the "Trait" variable in \code{blups}
#' @param grms list of GRMs where each element is named either A, D, or, AD.
#' Matrices supplied must match required by A, AD and ADE models. For ADE
#' grms=list(A=A,D=D)
#' @param dosages dosage matrix. required only for modelType=="DirDom". Assumes
#' SNPs coded 0, 1, 2. Nind rows x Nsnp cols, numeric matrix, with rownames
#' and colnames to indicate SNP/ind ID
#' @param nrepeats number of repeats
#' @param nfolds number of folds
#' @param ncores number of cores, parallelizes across repeat-folds
#' @param nBLASthreads number of cores for each worker to use for multi-thread
#' BLAS
#' @param gid string variable name used for genotype ID's/ in e.g. \code{blups}
#' (default="GID")
#' @param seed numeric, use seed to achieve reproducibile train-test folds.
#' @param ...
#'
#' @return Returns tidy results in a tibble with accuracy estimates for each rep-fold in a list-column "accuracyEstOut".
#'
#' @export
#' @family CrossVal
runCrossVal<-function(blups,
modelType,
selInd,SIwts = NULL,
grms,dosages=NULL,
nrepeats,nfolds,
ncores=1,nBLASthreads=NULL,
gid="GID",seed=NULL,...){
# SET-UP CROSS-VALIDATION TRAINING-TEST FOLDS
# same train-test folds across traits
gids<-blups %>%
unnest(TrainingData) %>%
distinct(!!sym(gid)) %>%
.[[gid]]
# whether or not user inputs a master seed
# generate and store in output
# seeds to make each replicate reproducible.
if(!is.null(seed)){
set.seed(seed);
seeds<-sample(1:1e6,replace = F,size = nrepeats)
} else {
seeds<-sample(1:1e6,replace = F,size = nrepeats) }
# Set-up replicated cross-validation folds
# splitting by clone (if clone in training dataset, it can't be in testing)
require(rsample)
cvsamples<-tibble(repeats=1:nrepeats,
seeds=seeds,
splits=map(seeds,function(seeds,...){
set.seed(seeds);
cvfolds<-vfold_cv(tibble(GID=gids),v = nfolds)
return(cvfolds)})) %>%
unnest(splits)
# FIT GENOMIC PREDICTION MODELS
## The fitModels() internal function
## Now runs _across_ traits and, if requested,
## computes the selection index accuracy
## runCrossVal() now parallelizes over repeat-folds using ncores
## Traits are handled in serial by each parallel worker
## nBLASthreads controls the number of additional cores each worker
## uses to speed matrix computations
## Internal function
## fits prediction model and calcs. accuracy for each train-test split
fitModels<-function(splits,gids,
modelType,
blups,selInd,SIwts,
gid="GID",
grms,dosages=NULL,
nBLASthreads){
# internal testing of fitModels() inputs - one rep-fold
# splits<-cvsamples$splits[[1]]
# rm(splits)
if(!is.null(nBLASthreads)) { RhpcBLASctl::blas_set_num_threads(nBLASthreads) }
# workers in plan(multisession) need this call internal to the function, it seems.
# subset kinship (and if modelType=="DirDom" also the dosages) matrices
### only lines with BLUPs for cross-validation
A<-grms[["A"]][gids,gids]
if(modelType %in% c("AD","DirDom")){ D<-grms[["D"]][gids,gids] }
if(modelType=="DirDom"){
snps<-dosages[gids,]
Mmat<-centerDosage(snps);
Dmat<-dose2domDevGenotypic(snps)
f<-getPropHom(snps);
p<-getAF(snps)
q<-1-p
rm(snps);
}
predictOneTrait<-possibly(function(TrainingData,splits,gid,
modelType,A,D=NULL,
Mmat=NULL,Dmat=NULL,f=NULL,p=NULL,q=NULL){
#TrainingData<-blups$TrainingData[[1]]
trainingdata<-TrainingData %>%
dplyr::rename(GID=!!sym(gid)) %>%
filter(GID %in% training(splits)[[gid]],
GID %in% rownames(A))
trainingdata[[paste0(gid,"a")]]<-factor(trainingdata[["GID"]],
levels=rownames(A))
if(modelType %in% c("AD")){
trainingdata[[paste0(gid,"d")]]<-trainingdata[[paste0(gid,"a")]] }
if(modelType %in% c("DirDom")){
trainingdata[[paste0(gid,"d_star")]]<-trainingdata[[paste0(gid,"a")]] }
# Set-up random model statements
randFormula<-paste0("~vs(",gid,"a,Gu=A)")
if(modelType %in% c("AD")){
randFormula<-paste0(randFormula,"+vs(",gid,"d,Gu=D)") }
if(modelType=="DirDom"){
randFormula<-paste0(randFormula,"+vs(",gid,"d_star,Gu=D)")
trainingdata %<>%
left_join(tibble(GID=names(f),f=as.numeric(f)))
}
# Fixed model statements
fixedFormula<-ifelse(modelType=="DirDom",
"drgBLUP ~1+f","drgBLUP ~1")
# Fit genomic prediction model
suppressMessages(require(sommer))
fit <- sommer::mmer(fixed = as.formula(fixedFormula),
random = as.formula(randFormula),
weights = WT,
data=trainingdata,
date.warning = F,
getPEV = FALSE)
# reduce memory footprint
rm(A); if(modelType %in% c("AD","DirDom")){ rm(D); gc() }
print(paste0("GBLUP model complete - one trait"))
if(modelType=="DirDom"){
# Backsolve SNP effects
# Compute allele sub effects
## Every model has an additive random term
ga<-as.matrix(fit$U[[paste0("u:",gid,"a")]]$drgBLUP,ncol=1)
# model DirDom is a different add-dom partition,
### add effects are not allele sub effects and gblups are not GEBV
addsnpeff<-backsolveSNPeff(Z=Mmat,g=ga)
### dom effects are called d*, gd_star or domstar
### because of the genome-wide homoz. term included in model
gd_star<-as.matrix(fit$U[[paste0("u:",gid,"d_star")]]$drgBLUP,ncol=1)
domstar_snpeff<-backsolveSNPeff(Z=Dmat,g=gd_star)
### b = the estimate (BLUE) for the genome-wide homoz. effect
b<-fit$Beta[fit$Beta$Effect=="f","Estimate"]
### calc. domsnpeff including the genome-wide homoz. effect
### divide the b effect up by number of SNPs and _subtract_ from domstar
domsnpeff<-domstar_snpeff-(b/length(domstar_snpeff))
### allele substitution effects using a+d(q-p) where d=d*-b/p
allelesubsnpeff<-addsnpeff+(domsnpeff*(q-p))
}
# Gather the GBLUPs
if(modelType %in% c("A","AD")){
gblups<-tibble(GID=as.character(names(fit$U[[paste0("u:",gid,"a")]]$drgBLUP)),
GEBV=as.numeric(fit$U[[paste0("u:",gid,"a")]]$drgBLUP)) }
if(modelType=="AD"){
gblups %<>% # compute GEDD (genomic-estimated dominance deviation)
dplyr::mutate(GEDD=as.numeric(fit$U[[paste0("u:",gid,"d")]]$drgBLUP),
# compute GETGV
GETGV=rowSums(.[,grepl("GE",colnames(.))])) }
if(modelType=="DirDom"){
# re-calc the GBLUP, GEdomval using dom. effects where d=d*-b/p
ge_domval<-Dmat%*%domsnpeff
# calc. the GEBV using allele sub. effects where alpha=a+d(p-q), and d=d*-b/p
gebv<-Mmat%*%allelesubsnpeff
# Tidy tibble of GBLUPs
gblups<-tibble(GID=as.character(names(fit$U[[paste0("u:",gid,"a")]]$drgBLUP)),
GEadd=as.numeric(fit$U[[paste0("u:",gid,"a")]]$drgBLUP),
GEdom_star=as.numeric(fit$U[[paste0("u:",gid,"d_star")]]$drgBLUP)) %>%
left_join(tibble(GID=rownames(ge_domval),GEdomval=as.numeric(ge_domval))) %>%
left_join(tibble(GID=rownames(gebv),GEBV=as.numeric(gebv))) %>%
# GETGV from GEadd + GEdomval
dplyr::mutate(GETGV=GEadd+GEdomval)
# free up the memory footprint
rm(ga,addsnpeff,gd_star,domstar_snpeff,b,domsnpeff,allelesubsnpeff,
ge_domval,gebv,Dmat,Mmat,fit); gc()
print(paste0("Backsolving SNP effects for DirDom model compete - one trait"))
}
# this is to remove conflicts with dplyr function select() downstream
detach("package:sommer",unload = T); detach("package:MASS",unload = T)
# Calculate accuracy for each trait
## Convert predicted gblups to a long-format
gblups %<>%
dplyr::select(GID,any_of(c("GEBV","GETGV"))) %>%
pivot_longer(any_of(c("GEBV","GETGV")),
names_to = "predOf",
values_to = "GBLUP")
## Grab the test set BLUPs as validation data
validationData<-TrainingData %>%
dplyr::rename(GID=!!sym(gid)) %>%
dplyr::select(GID,BLUP) %>%
filter(GID %in% testing(splits)[[gid]])
# Measure accuracy in test set
## cor(GEBV,BLUP)
## cor(GETGV,BLUP)
accuracy<-gblups %>%
left_join(validationData) %>%
nest(predVSobs=c(GID,GBLUP,BLUP)) %>%
dplyr::mutate(Accuracy=map_dbl(predVSobs,~cor(.$GBLUP,.$BLUP, use = 'complete.obs')))
return(accuracy)
},
otherwise = NA)
# Predict for one trait to each trait's training dataset
if(modelType=="A"){
predictions<-blups %>%
dplyr::mutate(modelOut=map(TrainingData,~predictOneTrait(TrainingData=.,
splits=splits,gid=gid,
modelType=modelType,
A=A))) }
if(modelType=="AD"){
predictions<-blups %>%
dplyr::mutate(modelOut=map(TrainingData,~predictOneTrait(TrainingData=.,
splits=splits,gid=gid,
modelType=modelType,
A=A,D=D))) }
if(modelType=="DirDom"){
predictions<-blups %>%
dplyr::mutate(modelOut=map(TrainingData,~predictOneTrait(TrainingData=.,
splits=splits,gid=gid,
modelType=modelType,
A=A,D=D,
Mmat=Mmat,Dmat=Dmat,
f=f,p=p,q=q))) }
rm(A); if(modelType=="AD"){ rm(D) };
if(modelType=="DirDom"){ rm(D,Mmat,Dmat,f,p,q) }
print(paste0("Genomic predictions done for all traits in one repeat-fold"))
predictions %<>%
select(-TrainingData) %>%
unnest(modelOut,keep_empty = F)
if(selInd){
# calc. SELIND and SELIND accuracy
gblups<-predictions %>%
select(-Accuracy) %>%
unnest(predVSobs) %>%
select(-BLUP) %>%
pivot_wider(values_from = "GBLUP",
names_from = "Trait")
if(all(names(SIwts) %in% colnames(gblups))){
gblups %<>%
dplyr::mutate(GBLUP=as.numeric((gblups %>%
select(names(SIwts)) %>%
as.matrix(.))%*%SIwts)) %>%
select(predOf,GID,GBLUP)
validationData<-blups %>%
unnest(TrainingData) %>%
select(Trait,GID,BLUP) %>%
pivot_wider(names_from = "Trait", values_from = "BLUP")
validationData %<>%
dplyr::mutate(BLUP=as.numeric((validationData %>%
select(names(SIwts)) %>%
as.matrix(.))%*%SIwts)) %>%
select(GID,BLUP)
predictions %<>%
bind_rows(gblups %>%
left_join(validationData) %>%
nest(predVSobs=c(GID,GBLUP,BLUP)) %>%
dplyr::mutate(Trait="SELIND") %>%
relocate(Trait,.before = 1) %>%
dplyr::mutate(Accuracy=map_dbl(predVSobs,~cor(.$GBLUP,.$BLUP,
use = 'na.or.complete'))))
}
}
predictions %<>%
dplyr::mutate(NcompleteTestPairs=map_dbl(predVSobs,function(predVSobs){
if(!is.null(predVSobs)){
out<-na.omit(.) %>% nrow(.) } else { out<-NA }
return(out) }))
return(predictions)
}
require(furrr); plan(multisession, workers = ncores)
options(future.globals.maxSize=+Inf); options(future.rng.onMisuse="ignore")
# quick test
#cvsamples %<>% slice(1:2)
cvsamples %<>%
dplyr::mutate(accuracyEstOut=future_map(splits,
~fitModels(splits=.,
modelType=modelType,
blups=blups,
selInd=selInd,SIwts=SIwts,
gid=gid,grms=grms,dosages=dosages,
nBLASthreads=nBLASthreads)))
plan(sequential)
return(cvsamples)
}
#' Run genomic predictions
#'
#' Run GBLUP model using \code{\link[sommer]{mmer}}, potentially on multiple
#' traits. Returns genomic BLUPs (GEBV and GETGV). If requested, returns
#' backsolved marker effects (equivalent to ridge regression / SNP-BLUP).
#' Three models are
#' enabled: additive-only ("A"), additive-plus-dominance ("AD") and a
#' directional-dominance model that incorporates a genome-wide homozygosity
#' effect ("DirDom"). Inbreeding effect is included in output GEBV/GETGV
#' predictions *after* backsolving SNP effects. If requested, returns
#' GEBV/GETGV computed for a selection index using \code{selInd=TRUE}
#' and supplying \code{SIwts}.
#'
#' @param modelType string, "A", "AD", "DirDom".
#' modelType="A": additive-only, GEBVS
#' modelType="AD": the "classic" add-dom model, GEBVS+GEDDs = GETGVs
#' modelType="DirDom":
#' the "genotypic" add-dom model with prop. homozygous
#' fit as a fixed-effect, to estimate a genome-wide inbreeding effect.
#' obtains add-dom effects, computes
#' allele sub effects (\eqn{\alpha = a + d(q-p)})
#' incorporates into GEBV and GETGV
#' @param selInd logical, TRUE/FALSE, selection index accuracy estimates,
#' requires input weights via \code{SIwts}
#' @param SIwts required if \code{selInd=FALSE}, named vector of selection
#' index weights, names match the "Trait" variable in \code{blups}
#' @param getMarkEffs T/F return marker effects, backsolved from GBLUP
#' @param returnPEV T/F return PEVs in GBLUP
#' @param blups nested data.frame with list-column "TrainingData" containing
#' BLUPs. Each element of "TrainingData" list, is data.frame with de-regressed
#' BLUPs, BLUPs and weights (WT) for training and test.
#' @param grms list of genomic relation matrices (GRMs, aka kinship matrices).
#' Any genotypes in the GRMs get predicted with, or without phenotypes.
#' Each element is named either A or D. Matrices supplied must match
#' required by A, AD and DirDom models. e.g. grms=list(A=A,D=D).
#' @param dosages dosage matrix. required only for modelType=="DirDom".
#' Also required if getMarkEffs==TRUE.
#' Assumes SNPs coded 0, 1, 2. Nind rows x Nsnp
#' cols, numeric matrix, with rownames and colnames to indicate SNP/ind ID
#' @param gid string variable name used for genotype ID's in e.g. \code{blups} (default="GID")
#' @param ncores number of cores
#' @param nBLASthreads number of cores for each worker to use for multi-thread BLAS
#'
#' @return tibble, one row, two list columns (basically a named two-element
#' list of lists): \code{gblups[[1]]} and \code{genomicPredOut[[1]]}.
#' code{gblups[[1]]}: tibble of predicted GEBV/GETGV, all traits and potentially
#' SELIND genomic BLUPs along the columns.
#'
#' \code{genomicPredOut[[1]]} is a tibble that contains
#' some combination of lists-columns:
#' \itemize{
#' \item gblups
#' \item varcomps,
#' \item fixeffs,
#' \item allelesubsnpeff,
#' \item addsnpeff,
#' \item domstar_snpeff,
#' \item domsnpeff
#' }
#'
#' @export
#' @family prediction_functions
runGenomicPredictions<-function(modelType,
selInd,SIwts = NULL,
getMarkEffs=FALSE,
returnPEV=FALSE,
blups,grms,dosages=NULL,gid="GID",
ncores=1,
nBLASthreads=NULL){
fitModel<-function(trainingdata,modelType,getMarkEffs,returnPEV,
gid="GID",grms,dosages,
nBLASthreads,...){
if(!is.null(nBLASthreads)) { RhpcBLASctl::blas_set_num_threads(nBLASthreads) }
# workers in plan(multisession) need this call internal to the function, it seems.
A<-grms[["A"]]
if(modelType %in% c("AD","DirDom")){ D<-grms[["D"]] }
trainingdata %<>%
dplyr::rename(gid=!!sym(gid)) %>%
filter(gid %in% rownames(A))
trainingdata[[paste0(gid,"a")]]<-factor(trainingdata[["gid"]],
levels=rownames(A))
if(modelType %in% c("AD")){
trainingdata[[paste0(gid,"d")]]<-trainingdata[[paste0(gid,"a")]] }
if(modelType %in% c("DirDom")){
trainingdata[[paste0(gid,"d_star")]]<-trainingdata[[paste0(gid,"a")]] }
# Set-up random model statements
randFormula<-paste0("~vs(",gid,"a,Gu=A)")
if(modelType %in% c("AD")){
randFormula<-paste0(randFormula,"+vs(",gid,"d,Gu=D)") }
if(modelType=="DirDom"){
randFormula<-paste0(randFormula,"+vs(",gid,"d_star,Gu=D)")
f<-getPropHom(dosages)
trainingdata %<>% dplyr::mutate(f=f[trainingdata$gid]) }
# Fixed model statements
fixedFormula<-ifelse(modelType=="DirDom",
"drgBLUP ~1+f","drgBLUP ~1")
# Fit genomic prediction model
suppressMessages(require(sommer))
fit <- sommer::mmer(fixed = as.formula(fixedFormula),
random = as.formula(randFormula),
weights = WT,
data=trainingdata,
date.warning = F,
getPEV = returnPEV)
# Shrink the memory footprint
rm(grms,A); if(modelType %in% c("DirDom","AD")){ rm(D) };
if(getMarkEffs==TRUE | modelType=="DirDom"){
# Backsolve SNP effects
# Compute allele sub effects
## Every model has an additive random term
ga<-as.matrix(fit$U[[paste0("u:",gid,"a")]]$drgBLUP,ncol=1)
M<-centerDosage(dosages)
if(modelType %in% c("A","AD")){
# models A and AD give add effects corresponding to allele sub effects
allelesubsnpeff<-backsolveSNPeff(Z=M,g=ga) }
if(modelType %in% c("AD")){
# model AD the dom effects are dominance deviation effects
gd<-as.matrix(fit$U[[paste0("u:",gid,"d")]]$drgBLUP,ncol=1)
domdevsnpeff<-backsolveSNPeff(Z=dose2domDev(dosages),g=gd) }
if(modelType %in% c("DirDom")){
# model DirDom is a different add-dom partition,
### add effects are not allele sub effects and gblups are not GEBV
addsnpeff<-backsolveSNPeff(Z=M,g=ga)
### dom effects are called d*, gd_star or domstar
### because of the genome-wide homoz. term included in model
gd_star<-as.matrix(fit$U[[paste0("u:",gid,"d_star")]]$drgBLUP,ncol=1)
domdevMat_genotypic<-dose2domDevGenotypic(dosages)
domstar_snpeff<-backsolveSNPeff(Z=domdevMat_genotypic,g=gd_star)
### b = the estimate (BLUE) for the genome-wide homoz. effect
b<-fit$Beta[fit$Beta$Effect=="f","Estimate"]
### calc. domsnpeff including the genome-wide homoz. effect
### divide the b effect up by number of SNPs and _subtract_ from domstar
domsnpeff<-domstar_snpeff-(b/length(domstar_snpeff))
### allele substitution effects using a+d(q-p) where d=d*-b/p
p<-getAF(dosages)
q<-1-p
allelesubsnpeff<-addsnpeff+(domsnpeff*(q-p))
} }
# Gather the GBLUPs
if(modelType %in% c("A","AD")){
gblups<-tibble(GID=as.character(names(fit$U[[paste0("u:",gid,"a")]]$drgBLUP)),
GEBV=as.numeric(fit$U[[paste0("u:",gid,"a")]]$drgBLUP))
if(returnPEV){
pev_bv<-diag((fit$PevU[[paste0("u:",gid,"a")]]$drgBLUP))
gblups %<>% left_join(tibble(GID=names(pev_bv),PEVbv=pev_bv))
}
}
if(modelType=="AD"){
gblups %<>% # compute GEDD (genomic-estimated dominance deviation)
dplyr::mutate(GEDD=as.numeric(fit$U[[paste0("u:",gid,"d")]]$drgBLUP),
# compute GETGV
GETGV=rowSums(.[,grepl("GE",colnames(.))]))
if(returnPEV){
pev_dd<-diag((fit$PevU[[paste0("u:",gid,"d")]]$drgBLUP))
gblups %<>% left_join(tibble(GID=names(pev_dd),PEVdd=pev_dd))
}
}
if(modelType=="DirDom"){
# re-calc the GBLUP, GEdomval using dom. effects where d=d*-b/p
ge_domval<-domdevMat_genotypic%*%domsnpeff
# calc. the GEBV using allele sub. effects where alpha=a+d(p-q), and d=d*-b/p
gebv<-M%*%allelesubsnpeff
# Tidy tibble of GBLUPs
gblups<-tibble(GID=as.character(names(fit$U[[paste0("u:",gid,"a")]]$drgBLUP)),
GEadd=as.numeric(fit$U[[paste0("u:",gid,"a")]]$drgBLUP),
GEdom_star=as.numeric(fit$U[[paste0("u:",gid,"d_star")]]$drgBLUP)) %>%
left_join(tibble(GID=rownames(ge_domval),GEdomval=as.numeric(ge_domval))) %>%
left_join(tibble(GID=rownames(gebv),GEBV=as.numeric(gebv))) %>%
# GETGV from GEadd + GEdomval
dplyr::mutate(GETGV=GEadd+GEdomval)
if(returnPEV){
pev_a<-diag((fit$PevU[[paste0("u:",gid,"a")]]$drgBLUP))
pev_dstar<-diag((fit$PevU[[paste0("u:",gid,"d_star")]]$drgBLUP))
gblups %<>%
left_join(tibble(GID=names(pev_a),PEVd_star=pev_a)) %>%
left_join(tibble(GID=names(pev_dstar),PEVd_star=pev_dstar))
}
}
# Extract variance components
varcomps<-summary(fit)$varcomp
# Exract fixed effects
# for modelType="DirDom", contains estimate of genome-wide homoz. effect
fixeffs<-summary(fit)$betas
# Shrink the memory footprint again
rm(fit); gc()
# Collect results
results<-tibble(gblups=list(gblups),
varcomps=list(varcomps),
fixeffs=list(fixeffs))
if(getMarkEffs==TRUE){
# Add snpeffects to output
results %<>% dplyr::mutate(allelesubsnpeff=list(allelesubsnpeff))
if(modelType=="AD"){ results %<>% dplyr::mutate(domdevsnpeff=list(domdevsnpeff)) }
if(modelType=="DirDom"){
results %<>% dplyr::mutate(addsnpeff=list(addsnpeff),
domstar_snpeff=list(domstar_snpeff),
domsnpeff=list(domsnpeff)) } }
# this is to remove conflicts with dplyr function select() downstream
detach("package:sommer",unload = T); detach("package:MASS",unload = T)
# return results
return(results)
}
require(furrr);
if(ncores>1){ plan(multisession, workers = ncores); }
options(future.globals.maxSize=+Inf); options(future.rng.onMisuse="ignore")
predictions<-blups %>%
dplyr::mutate(genomicPredOut=future_map(TrainingData,
~fitModel(trainingdata=.,
modelType=modelType,
getMarkEffs=getMarkEffs,
returnPEV=returnPEV,
gid=gid,
grms=grms,
dosages=dosages,
nBLASthreads=nBLASthreads)))
if(ncores>1){ plan(sequential) }
predictions %<>%
select(-TrainingData) %>%
unnest(genomicPredOut)
# tidy GBLUP output for e.g. breeders / selections
gblups<-predictions %>%
select(Trait,gblups) %>%
unnest(gblups) %>%
select(!!sym(gid),Trait,any_of(c("GEBV","GETGV"))) %>%
pivot_longer(any_of(c("GEBV","GETGV")),
values_to = "GBLUP",
names_to = "predOf") %>%
pivot_wider(names_from = "Trait",
values_from = "GBLUP")
if(selInd){
# calc. SELIND and add to tidy output
gblups %<>%
dplyr::mutate(SELIND=as.numeric((gblups %>%
select(names(SIwts)) %>%
as.matrix(.))%*%SIwts)) %>%
relocate(SELIND, .after = predOf)
}
predictions<-tibble(gblups=list(gblups),
genomicPredOut=list(predictions))
return(predictions)
}
#' Predict crosses
#'
#' Predict potentially for multiple traits, the means, variances and
#' trait-trait covariances in a set ofuser supplied crosses.l
#' If requested, computed the selection index means and variances.
#' Computes the usefulness criteria \eqn{UC_{parent}} and \eqn{UC_{variety}}
#' potentially with a user supplied standardized selection intensity value
#' \code{stdSelInt}. Output enables easy ranking of potential crosses.
#' This function takes the matrices of snpeffects output
#' (\code{genomicPredOut[[1]]}) from the \code{\link{runGenomicPredictions}}
#' function (when \code{getMarkEffs=TRUE}).
#' This is a wrapper function around \code{\link{predCrossVars} and
#' \link{predCrossMeans}}.
#'
#' @param modelType string, A, AD or DirDom. A and AD representing model with
# Additive-only, Add. plus Dominance, respectively. **NEW**:
# modelType="DirDom" includes a genome-wide homozygosity effect as in Xiang
# et al. 2016, uses a different dominance GRM and will probably be a little
# slower.#' @param stdSelInt
#' @param selInd logical, TRUE/FALSE, selection index accuracy estimates,
#' requires input weights via \code{SIwts}
#' @param CrossesToPredict data.frame or tibble, col/colnames: sireID, damID. sireID and damID must both be in the haploMat.
#' @param snpeffs the element \code{genomicPredOut[[1]]} of the output of
#' \code{\link{runGenomicPredictions}}.
#' @param dosages dosage matrix. required only for modelType=="DirDom".
#' Assumes SNPs coded 0, 1, 2. Nind rows x Nsnp
#' cols, numeric matrix, with rownames and colnames to indicate SNP/ind ID
#' @param haploMat matrix of phased haplotypes, 2 rows per sample, cols = loci, {0,1},
#' rownames assumed to contain GIDs with a suffix, separated by "_" to distinguish haplotypes.
#' Currently, the haplotypes must be distinguished by the mandatory suffixes "_HapA" and "_HapB".
#' @param recombFreqMat a square symmetric matrix with values = (1-2*c1), where c1=matrix of expected recomb. frequencies. The choice to do 1-2c1 outside the function was made for computation efficiency; every operation on a big matrix takes time.
#' @param ncores number of cores
#' @param nBLASthreads number of cores for each worker to use for multi-thread BLAS
#' @param predTheMeans default: TRUE, t/f whether to predict cross means
#' @param predTheVars default: TRUE, t/f whether to predict cross vars
#'
#' @return tibble, one row, two list columns (basically a named two-element
#' list of lists): \code{tidyPreds[[1]]} and \code{rawPreds[[1]]}.
#' code{tidyPreds[[1]]}: tidy output, fewer details. sireID, damID, Nsegsnps, predOf,Trait, predMean, predVar, predSD, predUsefulnesstibble of predicted GEBV/GETGV, all traits and potentially
#' SELIND genomic BLUPs along the columns. \code{rawPreds[[1]]}: more detailed output, list of 2 ("predMeans" tibble and "predVars" tibble).
#'
#' @export
#' @family prediction_functions
predictCrosses<-function(modelType,
stdSelInt = 2.67,
selInd,SIwts = NULL,
CrossesToPredict,
snpeffs,dosages,
haploMat,recombFreqMat,
ncores=1,nBLASthreads=NULL,
predTheMeans=TRUE,
predTheVars=TRUE){
## Format SNP effect matrices ~~~~~~~~~~~~~~~~~~~~~~~~
AlleleSubEffectList<-snpeffs$allelesubsnpeff %>%
`names<-`(.,snpeffs$Trait) %>%
map(.,~t(.))
if(modelType=="AD"){
# DomDevEffects for model "AD" to predict VarTGV = VarBV + VarDD
DomDevEffectList=snpeffs$domdevsnpeff %>%
`names<-`(.,snpeffs$Trait) %>%
map(.,~t(.)) }
if(modelType=="DirDom"){
# AddEffectList + DomEffectList --> VarTGV; AlleleSubEffectList --> VarBV;
AddEffectList<-snpeffs$addsnpeff %>%
`names<-`(.,snpeffs$Trait) %>%
map(.,~t(.))
DomEffectList<-snpeffs$domsnpeff %>%
`names<-`(.,snpeffs$Trait) %>%
map(.,~t(.)) }
# store raw mean and var preds
if(predTheVars){
## Predict cross variances ~~~~~~~~~~~~~~~~~~~~~~~~
print("Predicting cross variance parameters")
if(modelType=="A"){
predictedvars<-predCrossVars(CrossesToPredict=CrossesToPredict,
AddEffectList=AlleleSubEffectList,
modelType="A",
haploMat=haploMat,
recombFreqMat=recombFreqMat,
ncores=ncores,nBLASthreads=nBLASthreads) %>%
unnest(predVars) %>%
dplyr::mutate(predOf="VarBV") }
if(modelType=="AD"){
predictedvars<-predCrossVars(CrossesToPredict=CrossesToPredict,
AddEffectList=AlleleSubEffectList,
DomEffectList=DomDevEffectList,
modelType="AD",
haploMat=haploMat,
recombFreqMat=recombFreqMat,
ncores=ncores,nBLASthreads=nBLASthreads)
predictedvars %<>%
unnest(predVars) %>%
dplyr::mutate(predOf=ifelse(predOf=="VarA","VarBV","VarDD"))
}
if(modelType=="DirDom"){
predictedvarTGV<-predCrossVars(CrossesToPredict=CrossesToPredict,
AddEffectList=AddEffectList,
DomEffectList=DomEffectList,
modelType="AD", # no "DirDom" model in predCrossVars() nor is it needed
haploMat=haploMat,
recombFreqMat=recombFreqMat,
ncores=ncores,nBLASthreads=nBLASthreads)
predictedvarBV<-predCrossVars(CrossesToPredict=CrossesToPredict,
AddEffectList=AlleleSubEffectList,
DomEffectList=NULL,
modelType="A", # no "DirDom" model in predCrossVars() nor is it needed
haploMat=haploMat,
recombFreqMat=recombFreqMat,
ncores=ncores,nBLASthreads=nBLASthreads)
predictedvars<-predictedvarBV %>%
unnest(predVars) %>%
dplyr::mutate(predOf="VarBV") %>%
bind_rows(predictedvarTGV %>%
unnest(predVars)) }
}
if(predTheMeans){
## Predict cross means ~~~~~~~~~~~~~~~~~~~~~~~~
print("Predicting cross means")
### predict MeanBVs
predictedmeans<-predCrossMeans(AddEffectList=AlleleSubEffectList,
CrossesToPredict=CrossesToPredict,
doseMat=dosages,
ncores=ncores,
nBLASthreads=nBLASthreads,
predType="BV")
if(modelType=="AD"){
### DO NOT predict MeanTGV ~but~ duplicate MeanBV as MeanTGV prediction
### there IS predVarTGV for this model, output predUC-TGV (i.e. UC_variety)
predictedmeans %<>%
bind_rows(predictedmeans %>% dplyr::mutate(predOf="TGV")) }
if(modelType=="DirDom"){
### predict MeanTGVs
#### Prediction of MeanTGV is only available for the DirDom model
#### or a model with "genotypic" additive-dominance SNP effects
#### As implemented, modelType="AD" is the "classical" partition (BVs+ DomDevs)
predictedmeans %<>%
bind_rows(predCrossMeans(AddEffectList=AddEffectList,
DomEffectList=DomEffectList,
CrossesToPredict=CrossesToPredict,
doseMat=dosages,
ncores=ncores,
nBLASthreads=nBLASthreads,
predType="TGV"))
}
}
## SIMPLIFIED, TIDY, CROSS-WISE OUTPUT ~~~~~~~~~~~~~~~~~~~~~~~~
rawPreds<-list()
if(predTheMeans){ rawPreds[["predMeans"]]<-list(predictedmeans) }
if(predTheVars){ rawPreds[["predVars"]]<-list(predictedvars) }
if(predTheMeans){
## tidy pred. means ~~~~~~
predictedmeans %<>%
dplyr::mutate(predOf=gsub("Mean","",predOf),
Trait2=Trait) %>% # to match with variance pred. output
rename(Trait1=Trait) %>% # to match with variance pred. output
select(sireID,damID,predOf,Trait1,Trait2,predMean)
}
if(predTheVars){
## tidy pred. vars ~~~~~~
predictedvars %<>%
select(sireID,damID,Nsegsnps,predOf,Trait1,Trait2,predVar) %>%
dplyr::mutate(predOf=gsub("Var","",predOf))
if(modelType=="AD"){
predictedvars %<>%
filter(predOf=="BV") %>%
bind_rows(predictedvars %>%
pivot_wider(names_from = "predOf",
values_from = "predVar",
names_prefix = "predVar") %>%
dplyr::mutate(predVar=predVarBV+predVarDD,
predOf="TGV") %>%
select(-predVarBV,-predVarDD))
}
if(modelType=="DirDom"){
predictedvars %<>%
filter(predOf=="BV") %>%
bind_rows(predictedvars %>%
filter(predOf!="BV") %>%
pivot_wider(names_from = "predOf",
values_from = "predVar",
names_prefix = "predVar") %>%
dplyr::mutate(predVar=predVarA+predVarD,
predOf="TGV") %>%
select(-predVarA,-predVarD))
}
}
## SELECTION INDEX MEANS AND VARIANCES ~~~~~~~~~~~~~~~~~~~~~~~~
#### Compute and add to tidy output, if requested
if(selInd){
print("Computing SELECTION INDEX means and variances.")
if(predTheMeans){
traits<-unique(predictedmeans$Trait1)
## Compute Mean SELIND
predictedmeans %<>%
select(-Trait2) %>%
spread(Trait1,predMean) %>%
select(sireID,damID,predOf,all_of(traits)) %>%
dplyr::mutate(SELIND=as.numeric((predictedmeans %>%
select(-Trait2) %>%
spread(Trait1,predMean) %>%
select(all_of(names(SIwts))) %>%
as.matrix(.))%*%SIwts)) %>%
relocate(SELIND, .after = predOf) %>%
pivot_longer(cols = c(SELIND,all_of(traits)),
names_to = "Trait1",
values_to = "predMean") %>%
dplyr::mutate(Trait2=Trait1) %>%
select(sireID,damID,predOf,Trait1,Trait2,predMean)
}
if(predTheVars){
## Compute Var SELIND
require(furrr); plan(multisession, workers = ncores)
options(future.globals.maxSize=+Inf); options(future.rng.onMisuse="ignore")
predictedvars %<>%
nest(predVars=c(Trait1,Trait2,predVar)) %>%
## loop over each rep-fold-predOf-sireIDxdamID
dplyr::mutate(predVars=future_map(predVars,function(predVars,...){
gmat<-predVars %>%
dplyr::rename(T1=Trait1,T2=Trait2) %>%
tidyr::pivot_wider(names_from = "T2",
values_from = "predVar") %>%
column_to_rownames(var = "T1") %>%
as.matrix
gmat[lower.tri(gmat)]<-t(gmat)[lower.tri(gmat)]
gmat %<>% .[names(SIwts),names(SIwts)]
predSelIndVar<-SIwts%*%gmat%*%SIwts
## add sel index predictions to component trait
## var-covar predictions
predVars<-tibble(Trait1="SELIND",Trait2="SELIND",
predVar=as.numeric(predSelIndVar)) %>%
bind_rows(predVars)
return(predVars) })) %>%
unnest(predVars)
plan(sequential)
}
}
if(predTheMeans & predTheVars){
## USEFULNESS CRITERIA ~~~~~~~~~~~~~~~~~~~~~~~~
tidyPreds<-predictedvars %>%
filter(Trait1==Trait2) %>%
inner_join(predictedmeans,
by = c("sireID", "damID", "predOf", "Trait1", "Trait2")) %>%
rename(Trait=Trait1) %>%
select(sireID,damID,Nsegsnps,predOf,Trait,predMean,predVar) %>%
dplyr::mutate(predSD=sqrt(predVar),
predUsefulness=predMean+(stdSelInt*predSD)) }
if(!predTheMeans & predTheVars){
tidyPreds<-predictedvars %>%
filter(Trait1==Trait2) %>%
rename(Trait=Trait1) %>%
select(sireID,damID,Nsegsnps,predOf,Trait,predVar) %>%
dplyr::mutate(predSD=sqrt(predVar))
}
if(predTheMeans & !predTheVars){
tidyPreds<-predictedmeans %>%
rename(Trait=Trait1) %>%
select(sireID,damID,predOf,Trait,predMean)
}
predictions<-tibble(tidyPreds=list(tidyPreds),
rawPreds=list(rawPreds))
return(predictions)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.