GROAN.vignette.R
In GROAN: Genomic Regression Workbench

## ---- eval=TRUE, include=TRUE, echo=FALSE-------------------------------------
#just to be sure
library(GROAN)

## ---- eval=FALSE, include=TRUE, echo=TRUE, results='hide'---------------------
#  library(GROAN)
#  
#  #arrays of phenotypes
#  GROAN.KI$yield
#  GROAN.AI$yield
#  
#  #dataframes of SNP genotypes
#  GROAN.KI$SNPs
#  GROAN.AI$SNPs
#  
#  #dataframes of realized genotypic kinship
#  GROAN.KI$kinship
#  GROAN.AI$kinship

## ---- eval=TRUE, include=TRUE, echo=TRUE--------------------------------------
#creating a GROAN.NoisyDataset without any extra noise injected
nds.no_noise = createNoisyDataset(
  name = "PEA KI, no noise",
  genotypes = GROAN.KI$SNPs, 
  phenotypes = GROAN.KI$yield
)

#creating a GROAN.NoisyDataset adding noise sampled from a normal distribution
nds.normal_noise = createNoisyDataset(
  name = "PEA KI, normal noise",
  genotypes = GROAN.KI$SNPs, 
  phenotypes = GROAN.KI$yield,
  noiseInjector = noiseInjector.norm,
  mean = 0,
  sd = sd(GROAN.KI$yield) * 0.5
)

#creating a third dataset, this time with data from the AI lines
nds.no_noise.AI = createNoisyDataset(
  name = "PEA AI, no noise",
  genotypes = GROAN.AI$SNPs, 
  phenotypes = GROAN.AI$yield
)


## ----eval=TRUE, echo=TRUE, fig.align="center", fig.height=5, fig.width=5, include=TRUE----
#plotting the original phenotypes
plot(GROAN.KI$yield, pch=20, main = "True (black) vs. Noisy (red)", xlab = "Samples", ylab = "Phenotypes")
#plotting an instance of the phenotypes with noise injected 
points(getNoisyPhenotype(nds.normal_noise), col="red")

## ----eval=TRUE, include=TRUE, echo=TRUE, , results="hold"---------------------
#average correlation oscillates around 0.89
cor(GROAN.KI$yield, getNoisyPhenotype(nds.normal_noise))
cor(GROAN.KI$yield, getNoisyPhenotype(nds.normal_noise))
cor(GROAN.KI$yield, getNoisyPhenotype(nds.normal_noise))

## ----eval=TRUE, include=TRUE, echo=TRUE---------------------------------------
#no noise injector ==> the original phenotypes are returned
all(GROAN.KI$yield == getNoisyPhenotype(nds.no_noise))

## ----eval=TRUE, include=TRUE, echo=TRUE---------------------------------------
print(nds.no_noise)
print(nds.no_noise.AI)

## ---- eval=TRUE, include=TRUE, echo=TRUE--------------------------------------
#creating a Workbench with default values 
wb = createWorkbench()

## ---- eval=TRUE, include=TRUE, echo=TRUE--------------------------------------
#creating a GROAN.Workbench with default values explicitly assigned 
wb = createWorkbench(
  #parameters defining crossvalidation
  folds = 5, reps = 10, stratified = FALSE, 
  
  #parameters defining save-on-hard-disk policy
  outfolder = NULL, saveHyperParms = FALSE, saveExtraData = FALSE,
  
  #a regressor
  regressor = phenoRegressor.rrBLUP, regressor.name = "rrBLUP"
)

## ---- eval=TRUE, include=TRUE, echo=TRUE--------------------------------------
#adding a regressor to an existing Workbench
wb = addRegressor(
  #the Workbench to be updater
  wb,
  #the new regressor
  regressor = phenoRegressor.BGLR, regressor.name = "Bayesian Lasso",
  
  #regressor-specific parameters
  type = "BL"
)

## ---- eval=TRUE, include=TRUE, echo=TRUE--------------------------------------
print(wb)

## ---- eval=FALSE, include=TRUE, echo=TRUE-------------------------------------
#  #executing two GROAN test, same workbench, different datasets
#  res.no_noise     = GROAN.run(nds.no_noise, wb)
#  res.normal_noise = GROAN.run(nds.normal_noise, wb)
#  
#  #putting the results together for further analysis
#  res.total = rbind(res.no_noise, res.normal_noise)

## ---- eval=FALSE, include=TRUE, echo=TRUE-------------------------------------
#  #defaults is a boxplot of Pearson's correlations
#  p = plotResult(res.total)
#  print(p)

## ---- out.width = "600px", eval=TRUE, include=TRUE, echo=FALSE----------------
knitr::include_graphics("plot1.png")

## ---- eval=FALSE, include=TRUE, echo=TRUE-------------------------------------
#  #a barplot with 95% confidence interval of Pearson's correlations
#  p = plotResult(res.total, plot.type = "bar_conf95")
#  print(p)

## ---- out.width = "600px", eval=TRUE, include=TRUE, echo=FALSE----------------
knitr::include_graphics("plot2.png")

## ---- eval=FALSE, include=TRUE, echo=TRUE-------------------------------------
#  #a barplot of execution times per fold, in seconds
#  p = plotResult(res.total, plot.type = "bar", variable = "time")
#  print(p)

## ---- out.width = "600px", eval=TRUE, include=TRUE, echo=FALSE----------------
knitr::include_graphics("plot3.png")

## ---- eval=FALSE, include=TRUE, echo=TRUE-------------------------------------
#  #collating the two example datasets
#  nds.double = createNoisyDataset(
#    name = "KI and AI",
#    genotypes = rbind(GROAN.KI$SNPs, GROAN.AI$SNPs),
#    phenotypes = c(GROAN.KI$yield, GROAN.AI$yield),
#    strata = c(rep("KI", 103), rep("AI", ,105)) #we have 103 KI and 105 AI
#  )
#  
#  #the workbench will take into account strata
#  wb = createWorkbench(stratified = TRUE)
#  
#  #ready to go
#  res = GROAN.run(nds.double, wb)
#  plotResult(res, strata = "single", plot.type = "bar")

## ---- out.width = "600px", eval=TRUE, include=TRUE, echo=FALSE----------------
knitr::include_graphics("plot4.png")

## ---- eval=FALSE, include=TRUE, echo=TRUE-------------------------------------
#  #a new GROAN.Workbench with NO crossvalidation and only one repetition
#  wb = createWorkbench(
#    folds = NULL, reps = 1,
#    regressor.name = "rrBLUP", regressor = phenoRegressor.rrBLUP)
#  
#  #training on PEA.KI, testing on PEA.AI
#  res = GROAN.run(nds = nds.normal_noise, wb = wb, nds.test = nds.no_noise.AI)
#  
#  print(res[,c("dataset.train", "dataset.test", "pearson")])

## ---- eval=FALSE, include=TRUE, echo=TRUE-------------------------------------
#  #a new GROAN.Workbench with 5-fold crossvalidation and only one repetition
#  wb = createWorkbench(
#    folds = 5, reps = 1,
#    regressor.name = "rrBLUP", regressor = phenoRegressor.rrBLUP)
#  
#  #training on PEA.KI, testing on PEA.KI (crossvalidation) and PEA.AI
#  res = GROAN.run(nds = nds.normal_noise, wb = wb, nds.test = nds.no_noise.AI)
#  
#  print(res[,c("dataset.train", "dataset.test", "pearson")])

## ---- eval=FALSE, include=TRUE, echo=TRUE-------------------------------------
#  #training on PEA.KI, testing on PEA.KI (crossvalidation), PEA.AI, and PEA.KI again (overfitting!)
#  res = GROAN.run(
#    nds = nds.normal_noise, wb = wb,
#    nds.test = list(nds.no_noise.AI, nds.no_noise)
#    )
#  
#  print(res[,c("dataset.train", "dataset.test", "pearson")])

## ----eval=TRUE, echo=TRUE, fig.align="center", fig.height=5, fig.width=5, include=TRUE----
  #GROAN.KI has 103 samples, we'll use the first 50 samples for training
  #and the remaining will be predicted
  my.pheno = GROAN.KI$yield
  my.pheno[51:103] = NA

  #doing the predictions
  res = phenoRegressor.rrBLUP(phenotypes = my.pheno, genotypes = GROAN.KI$SNPs)
  
  #we just obtained a list with the following fields
  print(names(res))
  
  #visualize the predictions
  plot(
    x = GROAN.KI$yield[51:103], xlab = "Real values", 
    y = res$predictions[51:103], ylab = "Predicted values"
  )
  abline(a=0, b=1) #adding first quadrant bisector, for reference

## ---- eval=FALSE, include=TRUE, echo=TRUE-------------------------------------
#  #A noise injector adding a fixed bias to a random subset of about 50% of the data
#  my.noiseInjector.bias = function(phenotypes){
#    #an array of random booleans
#    labels = runif(length(phenotypes)) > 0.5
#  
#    #adding the bias (a fixed value of 5)
#    phenotypes[labels] = phenotypes[labels] + 5
#  
#    #returning the original phenotypes with added noise
#    return(phenotypes)
#  }

## ---- eval=FALSE, include=TRUE, echo=TRUE-------------------------------------
#  #A GROAN.NoisyDataSet that embeds the bias noise
#  nds.bias_noise = createNoisyDataset(
#    name = "PEA, bias noise",
#    genotypes = GROAN.KI$SNPs,
#    phenotypes = GROAN.KI$yield,
#    noiseInjector = my.noiseInjector.bias   #the function we defined above
#  )

## ---- eval=FALSE, include=TRUE, echo=TRUE-------------------------------------
#  #An improved version of the above function, this time the bias is not fixed
#  my.noiseInjector.bias2 = function(phenotypes, bias = 5){
#    #an array of random booleans
#    labels = runif(length(phenotypes)) > 0.5
#  
#    #adding the bias (from the function argument)
#    phenotypes[labels] = phenotypes[labels] + bias
#  
#    #returning the original phenotypes with added noise
#    return(phenotypes)
#  }

## ---- eval=FALSE, include=TRUE, echo=TRUE-------------------------------------
#  #A NoisyDataSet with bias noise function, using the second version of the function
#  nds.bias_noise2 = createNoisyDataset(
#    name = "PEA, bias noise, second function",
#    genotypes = GROAN.KI$SNPs,
#    phenotypes = GROAN.KI$yield,
#    noiseInjector = my.noiseInjector.bias2,   #the new version
#    bias = 20 #if omitted the default would be used
#  )

## ---- eval=FALSE, include=TRUE, echo=TRUE-------------------------------------
#  phenoRegressor.dummy = function (phenotypes, genotypes, covariances, extraCovariates){
#    #no operation is actually required
#    return(list(
#      predictions = runif(length(phenotypes)), #predictions
#      hyperparams = c(some = 1, params="a"),
#      extradata = "filler extra data for phenoRegressor.dummy"
#    ))
#  }