R/rsa.R

Defines functions rsa

Documented in rsa

#' Regression Stability Analysis
#'
#' Function to run the regression stability analysis (Yates and Cochran, 1938,
#' Finlay and Wilkinson, 1963).
#' @param trait The trait to analyze.
#' @param geno The genotypes.
#' @param env The environments.
#' @param rep The replications.
#' @param data The name of the data frame containing the data.
#' @param maxp Maximum allowed proportion of missing values to estimate, default is 10\%.
#' @author Raul Eyzaguirre.
#' @details The regression stability analysis is evaluated with a balanced data set.
#' If data is unbalanced, missing values are estimated up to an specified maximum proportion,
#' 10\% by default. For the ANOVA table, genotypes and environments are considered as fixed
#' factors while the blocks are considered as random and nested into the environments.
#' To run a regression stability analysis you need a set of genotypes evaluated in a set of
#' environments. At least 3 genotypes or environments are needed. In a regression stability
#' analysis for genotypes grown at several environments, for each genotype a simple linear
#' regression of individual yield (Y) on the mean yield of all genotypes for each environment
#' (X) is fitted. In a similar way, for each environment a simple linear regression of
#' individual yield (Y) on the mean yield of all environments for each genotype (X) is fitted.
#' In both cases the X values are centered on zero, so the intercepts of the models correspond to
#' the means of the genotypes or environments.
#' @return It returns the regression stability analysis decomposition of the GxE interaction
#' for genotypes and environments (Heterogeneity among regressions and deviation from regression),
#' the coefficient of variation, and the following regression stability measures for genotypes
#' and environments:
#' \itemize{
#' \item \code{a} the intercept. 
#' \item \code{b} the slope.
#' \item \code{se} the standard error for the slope.
#' \item \code{MSe} the mean square error.
#' \item \code{MSentry} the variance of the genotype means across environments and the environment
#' means across genotypes.
#' \item \code{MSinter} the variance of the genotype interaction effects across environments and the
#' environment interaction effects across genotypes.
#' }
#' @references
#' Finlay, K. W., and Wilkinson, G. N. (1963). The Analysis of Adaption in a Plant-Breeding Programme.
#' Aust. J. Agric. Res. 14: 742-754.
#'
#' Yates, F., and Cochran, W. G. (1938). The Analysis of Group Experiments.
#' J. Agric. Sci. 28: 556-580.
#' @examples
#' rsa("y", "geno", "env", "rep", met8x12)
#' @importFrom stats coef lm summary.lm
#' @export

rsa <- function(trait, geno, env, rep, data, maxp = 0.1) {
  
  # Everything as factor
  
  data[, geno] <- factor(data[, geno])
  data[, env] <- factor(data[, env])
  data[, rep] <- factor(data[, rep])

  # Error messages
  
  lc <- ck.2f(trait, geno, env, rep, data)
  
  if (lc$na == 2 & lc$nb == 2)
    stop("You need at least 3 genotypes or 3 environments for regression stability analysis.")

  # Compute ANOVA
  
  at <- suppressWarnings(aov.met(trait, geno, env, rep, data, maxp))

  # Check data and estimate missing values
  
  if (lc$c1 == 0 | lc$c2 == 0 | lc$c3 == 0 | lc$c4 == 0) {
    data[, trait] <- mve.met(trait, geno, env, rep, data, maxp, tol = 1e-06)[, 5]
    warning(paste("The data set is unbalanced, ",
                  format(lc$pmis * 100, digits = 3),
                  "% missing values estimated.", sep = ""))
  }
  
  # Some statistics

  int.mean <- tapply(data[, trait], list(data[, geno], data[, env]), mean, na.rm = TRUE)
  overall.mean <- mean(int.mean, na.rm = TRUE)
  env.mean <- apply(int.mean, 2, mean, na.rm = TRUE)
  geno.mean <- apply(int.mean, 1, mean, na.rm = TRUE)
  
  # Regression-stability for genotypes

  a <- NULL        # linear regression intercept
  b <- NULL        # linear regression slope
  se <- NULL       # slope standard error
  MSe <- NULL      # error mean square
  MSentry <- NULL  # variance of the means
  MSinter <- NULL  # variance of the interaction effects
  ssr <- NULL      # residual sum of squares

  for (i in 1:lc$na) {
    modelo <- lm(int.mean[i, ] ~ I(env.mean - overall.mean))
    a[i] <- coef(modelo)[1]
    b[i] <- coef(modelo)[2]
    se[i] <- summary.lm(modelo)$coefficients[2, 2]
    MSe[i] <- anova(modelo)[2, 3]
    MSentry[i] <- sum((int.mean[i, ] - geno.mean[i])^2) / (lc$nb - 1)
    MSinter[i] <- sum((int.mean[i, ] - geno.mean[i] - env.mean + overall.mean)^2) / (lc$nb - 1)
    ssr[i] <- anova(modelo)[2, 2] * lc$nr
  }
  stab.geno <- cbind(a, b, se, MSe, MSentry, MSinter)
  row.names(stab.geno) <- row.names(int.mean)
  
  if (lc$nb > 2) {
    drg.sc <- sum(ssr)
    hrg.sc <- at[4, 2] - drg.sc
    hrg.gl <- lc$na - 1
    drg.gl <- (lc$na - 1) * (lc$nb - 1) - hrg.gl
    drg.cm <- drg.sc / drg.gl
    hrg.cm <- hrg.sc / hrg.gl
    drg.f <- drg.cm / at[5, 3]
    hrg.f <- hrg.cm / drg.cm
    drg.p <- pf(drg.f, drg.gl, at[5, 1], lower.tail = FALSE)
    hrg.p <- pf(hrg.f, hrg.gl, drg.gl, lower.tail = FALSE)
  } else {
    drg.sc <- NA
    hrg.sc <- NA
    drg.gl <- NA
    hrg.gl <- NA
    drg.cm <- NA
    hrg.cm <- NA
    drg.f <- NA
    hrg.f <- NA
    drg.p <- NA
    hrg.p <- NA
  }

  # Regression-stability for environments

  a <- NULL        # linear regression intercept
  b <- NULL        # linear regression slope
  se <- NULL       # slope standard error
  MSe <- NULL      # error mean square
  MSentry <- NULL  # variance of the means
  MSinter <- NULL  # variance of the interaction effects
  ssr <- NULL      # residual sum of squares
  
  for (i in 1:lc$nb) {
    modelo <- lm(int.mean[, i] ~ I(geno.mean - overall.mean))
    a[i] <- coef(modelo)[1]
    b[i] <- coef(modelo)[2]
    se[i] <- summary.lm(modelo)$coefficients[2, 2]
    MSe[i] <- anova(modelo)[2, 3]
    MSentry[i] <- sum((int.mean[, i] - env.mean[i])^2) / (lc$na - 1)
    MSinter[i] <- sum((int.mean[, i] - env.mean[i] - geno.mean + overall.mean)^2) / (lc$na - 1)
    ssr[i] <- anova(modelo)[2, 2] * lc$nr
  }
  stab.env <- cbind(a, b, se, MSe, MSentry, MSinter)
  row.names(stab.env) <- colnames(int.mean)

  if (lc$na > 2) {
    dre.sc <- sum(ssr)
    hre.sc <- at[4, 2] - dre.sc
    hre.gl <- lc$nb - 1
    dre.gl <- (lc$na - 1) * (lc$nb - 1) - hre.gl
    dre.cm <- dre.sc / dre.gl
    hre.cm <- hre.sc / hre.gl
    dre.f <- dre.cm / at[5, 3]
    hre.f <- hre.cm / dre.cm
    dre.p <- pf(dre.f, dre.gl, at[5, 1], lower.tail = FALSE)
    hre.p <- pf(hre.f, hre.gl, dre.gl, lower.tail = FALSE)
  } else {
    dre.sc <- NA
    hre.sc <- NA
    dre.gl <- NA
    hre.gl <- NA
    dre.cm <- NA
    hre.cm <- NA
    dre.f <- NA
    hre.f <- NA
    dre.p <- NA
    hre.p <- NA
  }

  # ANOVA plus regression stability

  cv <- sqrt(at[5, 3]) / abs(overall.mean) * 100

  fileaux <- at[5, ]
  at[5, ] <- c(hrg.gl, hrg.sc, hrg.cm, hrg.f, hrg.p)
  at[6, ] <- c(drg.gl, drg.sc, drg.cm, drg.f, drg.p)
  at[7, ] <- c(hre.gl, hre.sc, hre.cm, hre.f, hre.p)
  at[8, ] <- c(dre.gl, dre.sc, dre.cm, dre.f, dre.p)
  at[9, ] <- fileaux
  row.names(at) <- c("G", "E", "R:E", "GxE", "- Het.Regr.G", "- Dev.Regr.G",
                     "- Het.Regr.E", "- Dev.Regr.E", "Residuals")

  # Return

  list(ANOVA = at, CV = cv, Stability_for_genotypes = stab.geno,
       Stability_for_environments = stab.env)
}
reyzaguirre/st4gi documentation built on June 19, 2018, 12:20 p.m.