Objective

  library(dplyr)
  library(tidyr)
  library(phisStatR)
  library(ggplot2)

This vignette deals with the detection of outlier plants in a lattice experiment using a spatial model using splines (SpATS library) [3]. Here the following steps of this procedure developed for Maize experiment but easily adaptable to others species:

We have so a dataset with one row for each plant in the experiment containing the three phenotypes: biomass24, PH24 and Phy.

Import of data

In this vignette, we use a toy data set of the phisStatR library (anonymized real data set). the first one plant4 is extracted from plant1 and is used to detect outlier at a specific time point. The detected outlier will be checked on the plant1 dataset graphically.

  mydata<-plant4
  str(mydata)

SpATS library

Biomass

  spat.biomass<-fitSpATS(datain=mydata,trait="Biomass24",genotypeId="genotypeAlias",rowId="Line",colId="Position",
                 typeModel="anova",genotype.as.random=FALSE,nseg=c(14,30),verbose)
  plot(spat.biomass)

Plant height

  spat.ph<-fitSpATS(datain=mydata,trait="PH24",genotypeId="genotypeAlias",rowId="Line",colId="Position",
                 typeModel="anova",genotype.as.random=FALSE,nseg=c(14,30),verbose)
  plot(spat.ph)

Phyllocron

  spat.phy<-fitSpATS(datain=mydata,trait="Phy",genotypeId="genotypeAlias",rowId="Line",colId="Position",
                 typeModel="anova",genotype.as.random=FALSE,nseg=c(14,30),verbose)
  plot(spat.phy)

Data cleaning

The data cleaning procedure is to identify plants having too large or too small features, from deviance residuals obtained with SpATS mixed models for each parameter (biomass, plant height and phyllocron). 2 procedures are tested:

  # concatenate raw data and residuals calculated with the SpATS model
  devResBio<-spat.biomass$residuals
  devResPH<-spat.ph$residuals
  devResPhy<-spat.phy$residuals
  myglobal<-cbind.data.frame(mydata,devResBio,devResPH,devResPhy)
  threshold<-0.95

  myglobal<-mutate(myglobal,mean.devBio=mean(devResBio,na.rm=TRUE),
                          sd.devBio=sd(devResBio,na.rm=TRUE),
                          mean.devPH=mean(devResPH,na.rm=TRUE),
                          sd.devPH=sd(devResPH,na.rm=TRUE),
                          mean.devPhy=mean(devResPhy,na.rm=TRUE),
                          sd.devPhy=sd(devResPhy,na.rm=TRUE),
                          lower.devBio=mean.devBio - sd.devBio*qnorm(threshold),
                          lower.devPH=mean.devPH - sd.devPH*qnorm(threshold),
                          lower.devPhy=mean.devPhy - sd.devPhy*qnorm(threshold),
                          upper.devBio=mean.devBio + sd.devBio*qnorm(threshold),
                          upper.devPH=mean.devPH + sd.devPH*qnorm(threshold),
                          upper.devPhy=mean.devPhy + sd.devPhy*qnorm(threshold),
                          # yes=1 == OK
                          # no =0 == problem
                          lower.critraw.spatsBio=ifelse(devResBio - lower.devBio > 0,yes=1,no=0),
                          lower.critraw.spatsPH=ifelse(devResPH - lower.devPH > 0,yes=1,no=0),
                          lower.critraw.spatsPhy=ifelse(devResPhy - lower.devPhy > 0,yes=1,no=0),
                          upper.critraw.spatsBio=ifelse(devResBio - upper.devBio < 0,yes=1,no=0),
                          upper.critraw.spatsPH=ifelse(devResPH - upper.devPH < 0,yes=1,no=0),
                          upper.critraw.spatsPhy=ifelse(devResPhy - upper.devPhy < 0,yes=1,no=0),
                          Q1.devBio=quantile(devResBio,probs=0.25,na.rm=TRUE) - 
                                    1.5*(quantile(devResBio,probs=0.75,na.rm=TRUE) - 
                                         quantile(devResBio,probs=0.25,na.rm=TRUE)),
                          Q1.devPH=quantile(devResPH,probs=0.25,na.rm=TRUE) - 
                                    1.5*(quantile(devResPH,probs=0.75,na.rm=TRUE) - 
                                         quantile(devResPH,probs=0.25,na.rm=TRUE)),
                          Q1.devPhy=quantile(devResPhy,probs=0.25,na.rm=TRUE) - 
                                    1.5*(quantile(devResPhy,probs=0.75,na.rm=TRUE) - 
                                         quantile(devResPhy,probs=0.25,na.rm=TRUE)),
                          Q3.devBio=quantile(devResBio,probs=0.75,na.rm=TRUE) + 
                                    1.5*(quantile(devResBio,probs=0.75,na.rm=TRUE) - 
                                         quantile(devResBio,probs=0.25,na.rm=TRUE)),
                          Q3.devPH=quantile(devResPH,probs=0.75,na.rm=TRUE) + 
                                    1.5*(quantile(devResPH,probs=0.75,na.rm=TRUE) - 
                                         quantile(devResPH,probs=0.25,na.rm=TRUE)),
                          Q3.devPhy=quantile(devResPhy,probs=0.75,na.rm=TRUE) + 
                                    1.5*(quantile(devResPhy,probs=0.75,na.rm=TRUE) - 
                                         quantile(devResPhy,probs=0.25,na.rm=TRUE)),
                          lower.critci.spatsBio=ifelse(devResBio - Q1.devBio > 0,yes=1,no=0),
                          lower.critci.spatsPH=ifelse(devResPH - Q1.devPH > 0,yes=1,no=0),
                          lower.critci.spatsPhy=ifelse(devResPhy - Q1.devPhy > 0,yes=1,no=0),
                          upper.critci.spatsBio=ifelse(devResBio - Q3.devBio < 0,yes=1,no=0),
                          upper.critci.spatsPH=ifelse(devResPH - Q3.devPH < 0,yes=1,no=0),
                          upper.critci.spatsPhy=ifelse(devResPhy - Q3.devPhy < 0,yes=1,no=0)
                    )

  # small plants
  # yes=1 == OK
  # no =0 == problem
  myglobal<-mutate(myglobal,
                  flagLowerRawSpats=ifelse(lower.critraw.spatsBio + lower.critraw.spatsPhy==0,yes=0,no=1),
                  flagLowerCiSpats=ifelse(lower.critci.spatsBio + lower.critci.spatsPhy==0,yes=0,no=1)
                  )

  # big plants
  # yes=1 == OK
  # no =0 == problem
  myglobal<-mutate(myglobal,
                  flagUpperRawSpats=ifelse(upper.critraw.spatsBio + upper.critraw.spatsPH==0,yes=0,no=1),
                  flagUpperCiSpats=ifelse(upper.critci.spatsBio + upper.critci.spatsPH==0,yes=0,no=1)
                  )

The user can save the residuals and detected outliers in an output file.

  write.table(myglobal,"OutputFile_detectOutlierCurves_SpATS.csv",row.names=FALSE,sep="\t")

Summary of the cleaning procedure:

raw criteria

  cat("Lower raw outlier:")
  myglobal %>% select(Ref,genotypeAlias,repetition,Biomass24,PH24,Phy,flagLowerRawSpats) %>%
                filter(flagLowerRawSpats !=1)

  cat("Upper raw outlier:")
  myglobal %>% select(Ref,genotypeAlias,repetition,Biomass24,PH24,Phy,flagUpperRawSpats) %>%
                filter(flagUpperRawSpats !=1)

quantile criteria

  cat("Lower quantile outlier:")
  myglobal %>% select(Ref,genotypeAlias,repetition,Biomass24,PH24,Phy,flagLowerCiSpats) %>%
                filter(flagLowerCiSpats !=1)

  cat("Upper quantile outlier:")
  myglobal %>% select(Ref,genotypeAlias,repetition,Biomass24,PH24,Phy,flagUpperCiSpats) %>%
                filter(flagUpperCiSpats !=1)

raw criteria : Plots

The user can check the outlier curves detected by this procedure. We use plant1 the dataset containing all the phenotypes of the same toy experiment as plant4 but over thermal time.

  outlierId<-filter(myglobal,flagLowerRawSpats !=1 | flagUpperRawSpats !=1)  %>% 
             select(genotypeAlias,scenario,repetition,flagLowerRawSpats,flagUpperRawSpats) %>%
             unite("geno_sce",genotypeAlias,scenario,sep="_",remove=FALSE)

  mycurve<-plant1 %>%
           unite("geno_sce",genotypeAlias,scenario,sep="_",remove=FALSE) %>%
           left_join(outlierId,mycurve,by=c("geno_sce","repetition"))

  # flagLower ou upper == 0 problem
  # flagLower ou upper == 1 OK
  # flag == "lower" problem en lower
  # flag == "upper" problem en upper
  # flag == "OK" plant OK
  tmp<-filter(mycurve,geno_sce %in% outlierId[,"geno_sce"]) %>%
       mutate(flagLowerRawSpats=ifelse(is.na(flagLowerRawSpats),1,flagLowerRawSpats),
         flagUpperRawSpats=ifelse(is.na(flagUpperRawSpats),1,flagUpperRawSpats),
         flag=case_when(flagLowerRawSpats == 0 ~ "lower",
                        flagUpperRawSpats == 0 ~ "upper",
                        flagUpperRawSpats + flagLowerRawSpats ==2 ~ "OK"
         ))
  ggplot(data=tmp,aes(x=thermalTime,y=biovolume,color=as.factor(repetition))) + geom_line() +
    geom_point(aes(shape=flag,color=as.factor(repetition))) + scale_shape_manual(values = c(0,19,2)) +
    facet_wrap(~geno_sce) 
  ggplot(data=tmp,aes(x=thermalTime,y=plantHeight,color=as.factor(repetition))) + geom_line() +
    geom_point(aes(shape=flag,color=as.factor(repetition))) + scale_shape_manual(values = c(0,19,2)) +
    facet_wrap(~geno_sce) 

Session info

  sessionInfo()

References

  1. R Development Core Team (2015). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. ISBN 3-900051-07-0, URL http://www.R-project.org.
  2. Maria Xose Rodriguez-Alvarez, Martin P. Boer, Fred A. van Eeuwijk, Paul H.C. Eilers (2017). Correcting for spatial heterogeneity in plant breeding experiments with P-splines. Spatial Statistics URL https://doi.org/10.1016/j.spasta.2017.10.003
  3. Alvarez Prado, S., Sanchez, I., Cabrera Bosquet, L., Grau, A., Welcker, C., Tardieu, F., Hilgert, N. (2019). To clean or not to clean phenotypic datasets for outlier plants in genetic analyses?. Journal of Experimental Botany, 70 (15), 3693-3698. , DOI : 10.1093/jxb/erz191 https://prodinra.inra.fr/record/481355


sanchezi/phisStatR documentation built on Nov. 14, 2019, 7:10 p.m.