R/outlierDetection.R

Defines functions FuncDetectOutlierPlantMaize outlierCriteria

Documented in FuncDetectOutlierPlantMaize outlierCriteria

#-------------------------------------------------------------------------------
# Program: outlierDetection.R
# Objective: functions for outliers detection according to biological meaning
# Author: I.Sanchez
# Creation: 05/09/2016
# Update: 04/09/2020
#-------------------------------------------------------------------------------

##' a function for several outlier criteria
##' @description this function calculates outlier criteria on plants for each parameter
##' @param datain input dataframe of parameters
##' @param typeD type of datain dataframe (1==wide, 2==long)
##' @param residin input dataframe of residuals
##' @param typeR type of residin dataframe (1==wide, 2==long)
##' @param trait character, trait of interest to model (example biovolume24, PH24 ...)
##' @param resRawName character, names of the raw residual in datain
##' @param resStdName character, names of the standardized residual in datain
##' @param threshold, numeric threshold for the normal quantile in raw criteria
##' @details This function needs in input a dataframe with residuals extracted from
##' a mixed linear model (for instance using asreml or nlme libraries) and an another dataframe
##' with the estimated parameters (biovolume, plantHeight, leafArea etc...). Several criteria
##' will be calculated using different types of residuals. The 2 input dataframe must contain
##' the following columns names: "experimentAlias","Line" and "Position".
##' \describe{
##' \item{raw  and quartile criteria}{use raw residuals}
##' \item{influence criterion}{uses standardized residuals}
##' }
##' The function must be executed for each parameter of interest: biovolume, plantHeight and phy.
##' Each criteria will be used according with some rules:
##' \describe{
##' \item{Small plant}{biovolume and phy}
##' \item{Big plants}{biovolume and plantHeight}
##' }
##' @return a dataframe with columns identifiying criteria used to detect outlier plants
##' with 1==plant OK - 0==plant KO to suppress
##' \describe{
##' \item{critraw}{raw criterion, critci: quartiles criterion}
##' \item{critinfl}{influence criterion with standardized residuals}
##' }
##'
##' @importFrom stats IQR qnorm quantile sd lm na.omit as.formula
##' @importFrom dplyr select filter rename mutate left_join
##'
##' @examples
##' # Not run
##' # dt1<-outlierCriteria()
##'
##' @export
outlierCriteria<-function(datain,typeD,residin,typeR,trait,
                          resRawName,resStdName,threshold){
  
  .Deprecated("FuncDetectOutlierPlantMaize")
  
  # Create a dataframe with raw data, fitted and residuals for each parameter
  datain<-as.data.frame(datain)
  if (typeD==1){ # wide format column==differents traits
    tmp1<-select(datain,Ref,Genosce,Line,Position,genotypeAlias,
                        experimentAlias,scenario,repetition,
                        potAlias,.data[[trait]])
  } else if (typeD==2){ # long format 1 column Trait, 1 column value
    tmp1<-filter(datain, Trait==trait)
    tmp1<-rename(tmp1,"Trait" = {{ trait }})
  }
  
  residin<-as.data.frame(residin)
  if (typeR==1){ # wide format column==differents traits
    tmp2<-residin
  } else if (typeR==2){ # long format 1 column Trait, 1 column value
    tmp2<-filter(residin, Trait==trait)
  }
  
  # merge datain and residin by experimentAlias, line and position (unique key)
  tmp<-left_join(tmp1,tmp2,by=c("experimentAlias","Line","Position"))
  
  # mean and sd of residuals
  tmp<-mutate(tmp,mean.res=mean(tmp[,resRawName],na.rm=TRUE),
                  sd.res=sd(tmp[,resRawName],na.rm=TRUE))
  #--- raw cleaning
  tmp<-mutate(tmp,lower.res=mean.res - sd.res*qnorm(threshold),
                  upper.res=mean.res + sd.res*qnorm(threshold))
  tmp<-mutate(tmp,lower.critraw=ifelse(tmp[,resRawName]-tmp[,"lower.res"]>0,yes=1,no=0),
                  upper.critraw=ifelse(tmp[,resRawName]-tmp[,"upper.res"]<0,yes=1,no=0))
  #--- Quantiles cleaning
  tmp<-mutate(tmp, Q1.res=quantile(tmp[,resRawName],probs=0.25,na.rm=TRUE)-1.5*IQR(tmp[,resRawName],na.rm=TRUE),
                  Q3.res=quantile(tmp[,resRawName],probs=0.75,na.rm=TRUE)+1.5*IQR(tmp[,resRawName],na.rm=TRUE))
  tmp<-mutate(tmp,lower.critci=ifelse(tmp[,resRawName]-tmp[,"Q1.res"]>0,yes=1,no=0),
                  upper.critci=ifelse(tmp[,resRawName]-tmp[,"Q3.res"]<0,yes=1,no=0))
  #--- influence with standardized residuals
  tmp<-mutate(tmp,lower.critinfl=ifelse(tmp[,resStdName]>=-2,yes=1,no=0),
                  upper.critinfl=ifelse(tmp[,resStdName]<=2,yes=1,no=0))
  # output
  return(tmp)
}

#------------------- tutorial version
#' FuncDetectOutlierPlantMaize
#' @description function to detect plant outliers in a temporal lattice experiment 
#'       on Maize which can be extended to others experiment types. 
#'  The criteria needs 3 phenotypes (ex for maize: the estimated biomass, 
#'  plant height and phyllocron)
#'  Please, take a look of the structure of the example dataset: plant4
#'  \describe{
#'    \item{plants are identified as "small outlier plant"}{ if for biomass AND phyllocron
#'              res_i < mu_{res} - qnorm(threshold) * sd_{res}}  
#'    \item{plants are identified as "big outlier plant"}{ if for biomass AND plant height
#'              res_i > mu_{res} + qnorm(threshold) * sd_{res} } 
#'  }
#' @param datain input dataframe, a spatio-temporal data.frame
#' @param dateBeforeTrt character, date just before treatment in the experiment
#' @param param1 character, name of a phenotypic variable in datain (ex: Biomass)
#' @param param2 character, name of a phenotypic variable in datain (ex: plant height)
#' @param param3 character, name of a phenotypic variable in datain (ex: phyllocron)
#' @param paramGeno character, name of the genotype variable in datain
#' @param paramCol character, name of the Line variable in the datain
#' @param paramRow character, name of the position variable in datain
#' @param threshold numeric,
#' @param nCol numeric, nunber of lines in the lattice platform (28 for phenoarch)
#' @param nRow numeric, nunber of columns in the lattice platform (60 for phenoarch)
#' @param genotype.as.random logical, If TRUE, the genotype is included as random effect 
#'                            in the model. The default is FALSE. (see the SpATS() help)
#' @param timeColumn character, name of the time points column in datain (ex: Time)
#' 
#' @details see SpATS() from the SpATS R library
#' The input dataset must contain the following columns:
#'  In the case of a plant experiment in phenoarch platform
#' \describe{
#'  \item{1}{the estimated biomass, numeric}
#'  \item{2}{the estimated plant height, numeric}
#'  \item{3}{the estimated phyllocron, numeric}
#'  \item{4}{the genotype id, character}
#'  \item{5}{the lines in the greenhouse or lattice, numeric}
#'  \item{6}{the columns in the greenhouse or lattice, numeric}
#'  }
#'  In other kind of lattice platform
#'  \describe{
#'  \item{1}{param1 a numeric phenotypic parameter}
#'  \item{2}{param2 a numeric phenotypic parameter}
#'  \item{3}{param3 a numeric phenotypic parameter}
#'  \item{4}{the genotype id, character}
#'  \item{5}{the lines in the platform or lattice, numeric}
#'  \item{6}{the columns in the platform or lattice, numeric}
#' }
#'
#' @return return a list of 4 elements
#' \describe{
#'  \item{outputDataframe}{ a data.frame with the used data set, the fitted values 
#'                        and residuals calculated by the model, 
#'                        the flag of outliers}
#'  \item{smallOutlier}{a data.frame of the detected "small" outliers}
#'  \item{bigOutlier}{a data.frame of the detected "big" outliers}
#'  \item{m1}{A list of the SpATS results for param1 (see the SpATS() help)}
#'  \item{m2}{A list of the SpATS results for param2 (see the SpATS() help)}
#'  \item{m3}{A list of the SpATS results for param3 (see the SpATS() help)}
#' }
#' 
#' @importFrom SpATS SpATS PSANOVA
#' @importFrom dplyr filter mutate %>%
#' 
#' @examples
#' \donttest{
#' library(ggplot2)
#' test<-FuncDetectOutlierPlantMaize(datain=PAdata,dateBeforeTrt="2017-04-27",
#'                 param1="Biomass_Estimated",param2="Height_Estimated",
#'                 param3="phyllocron",paramGeno="Genotype",
#'                 paramCol="Col",paramRow="Row",
#'                 threshold=0.95,nCol=28,nRow=60,genotype.as.random=FALSE,
#'                 timeColumn = "Time")
#' plot(test$m1)  
#' plot(test$m2)  
#' plot(test$m3) 
#' ggplot(data=test$outputDataframe,aes(x=fittedP1,y=devResP1)) + geom_point()
#' ggplot(data=test$outputDataframe,aes(x=fittedP2,y=devResP2)) + geom_point()
#' ggplot(data=test$outputDataframe,aes(x=fittedP3,y=devResP3)) + geom_point()
#' # a summary of the detected outlier
#' print(test$smallOutlier)
#' print(test$bigOutlier)
#' }
#' @export
FuncDetectOutlierPlantMaize <- function(datain,
                                        dateBeforeTrt,
                                        param1,
                                        param2,
                                        param3,
                                        paramGeno,
                                        paramCol, # paramLine
                                        paramRow, # paramPosition
                                        threshold,
                                        nCol, # nLine
                                        nRow, # nPosition
                                        genotype.as.random = FALSE,
                                        timeColumn) {
  
  #--- Sort before trt and add useful columns or variables
  tp <- filter(datain, get(timeColumn) == dateBeforeTrt)
  tp$R <- as.factor(tp[,paramCol])
  tp$C <- as.factor(tp[,paramRow])
  nsegL <- round(nCol/2)
  nsegC <- round(nRow/2)
  # I need to have these column names in the model
  colnames(tp)[which(colnames(tp)==paramCol)] <- "Line"
  colnames(tp)[which(colnames(tp)==paramRow)] <- "Position"
  
  #--- model SpATS
  m1 <- SpATS(response = param1, 
              spatial = ~ PSANOVA(Line, Position, nseg = c(nsegL,nsegC)),
              genotype = paramGeno, 
              random = ~ C + R, 
              genotype.as.random = genotype.as.random,
              data = tp, 
              control =  list(tolerance = 1e-03, monitoring = 0))
  
  m2 <- SpATS(response = param2, 
              spatial = ~ PSANOVA(Line,Position, nseg = c(nsegL,nsegC)),
              genotype = paramGeno, 
              random = ~ C + R, 
              genotype.as.random = genotype.as.random,
              data = tp, 
              control =  list(tolerance = 1e-03, monitoring = 0))
  
  m3 <- SpATS(response = param3, 
              spatial = ~ PSANOVA(Line,Position, nseg = c(nsegL,nsegC)),
              genotype = paramGeno, 
              random = ~ C + R, 
              genotype.as.random = genotype.as.random,
              data = tp, 
              control =  list(tolerance = 1e-03, monitoring = 0))
  
  #--- Retrieve fitted values and deviance residuals for each model
  # P1: param1
  # P2: param2
  # P3: param3
  devResP1 <- m1$residuals
  fittedP1 <- m1$fitted
  devResP2 <- m2$residuals
  fittedP2 <- m2$fitted
  devResP3 <- m3$residuals
  fittedP3 <- m3$fitted
  
  tpresu <- cbind.data.frame(tp,
                             devResP1,
                             devResP2,
                             devResP3,
                             fittedP1,
                             fittedP2,
                             fittedP3)
  
  #--- Calculate criteria for outliers
  tpresu <- mutate(tpresu,
                   mean.devP1 = mean(devResP1, na.rm=TRUE),
                   sd.devP1 = sd(devResP1, na.rm=TRUE),
                   mean.devP2 = mean(devResP2, na.rm = TRUE),           
                   sd.devP2 = sd(devResP2, na.rm = TRUE), 
                   mean.devP3 = mean(devResP3, na.rm = TRUE),         
                   sd.devP3 = sd(devResP3, na.rm = TRUE), 
                   
                   lower.devP1 = mean.devP1 - sd.devP1*qnorm(threshold), 
                   lower.devP2 = mean.devP2 - sd.devP2*qnorm(threshold), 
                   lower.devP3 = mean.devP3 - sd.devP3*qnorm(threshold), 
                   upper.devP1 = mean.devP1 + sd.devP1*qnorm(threshold), 
                   upper.devP2 = mean.devP2 + sd.devP2*qnorm(threshold), 
                   upper.devP3 = mean.devP3 + sd.devP3*qnorm(threshold), 
                   
                   lower.spatsP1 = ifelse(devResP1-lower.devP1 > 0, yes = 1, no = 0), 
                   lower.spatsP2 = ifelse(devResP2-lower.devP2 > 0, yes = 1, no = 0), 
                   lower.spatsP3 = ifelse(devResP3-lower.devP3 > 0, yes = 1, no = 0), 
                   upper.spatsP1 = ifelse(devResP1-upper.devP1 < 0, yes = 1, no = 0), 
                   upper.spatsP2 = ifelse(devResP2-upper.devP2 < 0, yes = 1, no = 0), 
                   upper.spatsP3 = ifelse(devResP3-upper.devP3 < 0, yes = 1, no = 0)
  )
  
  # small plants
  tpresu <- mutate(tpresu,
                   flagLowerSpats = ifelse(lower.spatsP1 + lower.spatsP3 == 0, 
                                           yes = 0,
                                           no = 1)
  )
  
  # big plants
  tpresu <- mutate(tpresu,
                   flagUpperSpats = ifelse(upper.spatsP1 + upper.spatsP2==0,
                                           yes = 0,
                                           no = 1)
  )
  
  #--- Detect outliers
  outlier1 <- tpresu %>% filter(flagLowerSpats == 0)
  outlier2 <- tpresu %>% filter(flagUpperSpats == 0)
  
  # return a list
  outlist <- list(tpresu, outlier1, outlier2, m1, m2, m3)
  names(outlist) <- c("outputDataframe", "smallOutlier", "bigOutlier", "m1", "m2", "m3")
  return(outlist)
}
sanchezi/openSilexStatR documentation built on Sept. 10, 2020, 1:03 p.m.