library(dplyr) library(tidyr) library(phisStatR) library(ggplot2)
This vignette deals with the detection of outlier plants in a lattice experiment using a spatial model using splines (SpATS library) [3]. Here the following steps of this procedure developed for Maize experiment but easily adaptable to others species:
We have so a dataset with one row for each plant in the experiment containing the three phenotypes: biomass24, PH24 and Phy.
On the residuals, we can detect outlier plant(s) with a combined physiological criterion applying the following rules:
raw procedure: at a threshold=0.95 (can be modified)
In this vignette, we use a toy data set of the phisStatR library (anonymized real data set). the first one plant4 is extracted from plant1 and is used to detect outlier at a specific time point. The detected outlier will be checked on the plant1 dataset graphically.
mydata<-plant4 str(mydata)
spat.biomass<-fitSpATS(datain=mydata,trait="Biomass24",genotypeId="genotypeAlias",rowId="Line",colId="Position", typeModel="anova",genotype.as.random=FALSE,nseg=c(14,30),verbose)
plot(spat.biomass)
spat.ph<-fitSpATS(datain=mydata,trait="PH24",genotypeId="genotypeAlias",rowId="Line",colId="Position", typeModel="anova",genotype.as.random=FALSE,nseg=c(14,30),verbose)
plot(spat.ph)
spat.phy<-fitSpATS(datain=mydata,trait="Phy",genotypeId="genotypeAlias",rowId="Line",colId="Position", typeModel="anova",genotype.as.random=FALSE,nseg=c(14,30),verbose)
plot(spat.phy)
The data cleaning procedure is to identify plants having too large or too small features, from deviance residuals obtained with SpATS mixed models for each parameter (biomass, plant height and phyllocron). 2 procedures are tested:
# concatenate raw data and residuals calculated with the SpATS model devResBio<-spat.biomass$residuals devResPH<-spat.ph$residuals devResPhy<-spat.phy$residuals myglobal<-cbind.data.frame(mydata,devResBio,devResPH,devResPhy) threshold<-0.95 myglobal<-mutate(myglobal,mean.devBio=mean(devResBio,na.rm=TRUE), sd.devBio=sd(devResBio,na.rm=TRUE), mean.devPH=mean(devResPH,na.rm=TRUE), sd.devPH=sd(devResPH,na.rm=TRUE), mean.devPhy=mean(devResPhy,na.rm=TRUE), sd.devPhy=sd(devResPhy,na.rm=TRUE), lower.devBio=mean.devBio - sd.devBio*qnorm(threshold), lower.devPH=mean.devPH - sd.devPH*qnorm(threshold), lower.devPhy=mean.devPhy - sd.devPhy*qnorm(threshold), upper.devBio=mean.devBio + sd.devBio*qnorm(threshold), upper.devPH=mean.devPH + sd.devPH*qnorm(threshold), upper.devPhy=mean.devPhy + sd.devPhy*qnorm(threshold), # yes=1 == OK # no =0 == problem lower.critraw.spatsBio=ifelse(devResBio - lower.devBio > 0,yes=1,no=0), lower.critraw.spatsPH=ifelse(devResPH - lower.devPH > 0,yes=1,no=0), lower.critraw.spatsPhy=ifelse(devResPhy - lower.devPhy > 0,yes=1,no=0), upper.critraw.spatsBio=ifelse(devResBio - upper.devBio < 0,yes=1,no=0), upper.critraw.spatsPH=ifelse(devResPH - upper.devPH < 0,yes=1,no=0), upper.critraw.spatsPhy=ifelse(devResPhy - upper.devPhy < 0,yes=1,no=0), Q1.devBio=quantile(devResBio,probs=0.25,na.rm=TRUE) - 1.5*(quantile(devResBio,probs=0.75,na.rm=TRUE) - quantile(devResBio,probs=0.25,na.rm=TRUE)), Q1.devPH=quantile(devResPH,probs=0.25,na.rm=TRUE) - 1.5*(quantile(devResPH,probs=0.75,na.rm=TRUE) - quantile(devResPH,probs=0.25,na.rm=TRUE)), Q1.devPhy=quantile(devResPhy,probs=0.25,na.rm=TRUE) - 1.5*(quantile(devResPhy,probs=0.75,na.rm=TRUE) - quantile(devResPhy,probs=0.25,na.rm=TRUE)), Q3.devBio=quantile(devResBio,probs=0.75,na.rm=TRUE) + 1.5*(quantile(devResBio,probs=0.75,na.rm=TRUE) - quantile(devResBio,probs=0.25,na.rm=TRUE)), Q3.devPH=quantile(devResPH,probs=0.75,na.rm=TRUE) + 1.5*(quantile(devResPH,probs=0.75,na.rm=TRUE) - quantile(devResPH,probs=0.25,na.rm=TRUE)), Q3.devPhy=quantile(devResPhy,probs=0.75,na.rm=TRUE) + 1.5*(quantile(devResPhy,probs=0.75,na.rm=TRUE) - quantile(devResPhy,probs=0.25,na.rm=TRUE)), lower.critci.spatsBio=ifelse(devResBio - Q1.devBio > 0,yes=1,no=0), lower.critci.spatsPH=ifelse(devResPH - Q1.devPH > 0,yes=1,no=0), lower.critci.spatsPhy=ifelse(devResPhy - Q1.devPhy > 0,yes=1,no=0), upper.critci.spatsBio=ifelse(devResBio - Q3.devBio < 0,yes=1,no=0), upper.critci.spatsPH=ifelse(devResPH - Q3.devPH < 0,yes=1,no=0), upper.critci.spatsPhy=ifelse(devResPhy - Q3.devPhy < 0,yes=1,no=0) ) # small plants # yes=1 == OK # no =0 == problem myglobal<-mutate(myglobal, flagLowerRawSpats=ifelse(lower.critraw.spatsBio + lower.critraw.spatsPhy==0,yes=0,no=1), flagLowerCiSpats=ifelse(lower.critci.spatsBio + lower.critci.spatsPhy==0,yes=0,no=1) ) # big plants # yes=1 == OK # no =0 == problem myglobal<-mutate(myglobal, flagUpperRawSpats=ifelse(upper.critraw.spatsBio + upper.critraw.spatsPH==0,yes=0,no=1), flagUpperCiSpats=ifelse(upper.critci.spatsBio + upper.critci.spatsPH==0,yes=0,no=1) )
The user can save the residuals and detected outliers in an output file.
write.table(myglobal,"OutputFile_detectOutlierCurves_SpATS.csv",row.names=FALSE,sep="\t")
Summary of the cleaning procedure:
r ifelse(length(table(myglobal[,"flagLowerRawSpats"]))==1,
0,table(myglobal[,"flagLowerRawSpats"])[[1]])
r ifelse(length(table(myglobal[,"flagUpperRawSpats"]))==1,
0,table(myglobal[,"flagUpperRawSpats"])[[1]])
r ifelse(length(table(myglobal[,"flagLowerCiSpats"]))==1,
0,table(myglobal[,"flagLowerCiSpats"])[[1]])
r ifelse(length(table(myglobal[,"flagUpperCiSpats"]))==1,
0,table(myglobal[,"flagUpperCiSpats"])[[1]])
cat("Lower raw outlier:") myglobal %>% select(Ref,genotypeAlias,repetition,Biomass24,PH24,Phy,flagLowerRawSpats) %>% filter(flagLowerRawSpats !=1) cat("Upper raw outlier:") myglobal %>% select(Ref,genotypeAlias,repetition,Biomass24,PH24,Phy,flagUpperRawSpats) %>% filter(flagUpperRawSpats !=1)
cat("Lower quantile outlier:") myglobal %>% select(Ref,genotypeAlias,repetition,Biomass24,PH24,Phy,flagLowerCiSpats) %>% filter(flagLowerCiSpats !=1) cat("Upper quantile outlier:") myglobal %>% select(Ref,genotypeAlias,repetition,Biomass24,PH24,Phy,flagUpperCiSpats) %>% filter(flagUpperCiSpats !=1)
The user can check the outlier curves detected by this procedure. We use plant1 the dataset containing all the phenotypes of the same toy experiment as plant4 but over thermal time.
outlierId<-filter(myglobal,flagLowerRawSpats !=1 | flagUpperRawSpats !=1) %>% select(genotypeAlias,scenario,repetition,flagLowerRawSpats,flagUpperRawSpats) %>% unite("geno_sce",genotypeAlias,scenario,sep="_",remove=FALSE) mycurve<-plant1 %>% unite("geno_sce",genotypeAlias,scenario,sep="_",remove=FALSE) %>% left_join(outlierId,mycurve,by=c("geno_sce","repetition")) # flagLower ou upper == 0 problem # flagLower ou upper == 1 OK # flag == "lower" problem en lower # flag == "upper" problem en upper # flag == "OK" plant OK tmp<-filter(mycurve,geno_sce %in% outlierId[,"geno_sce"]) %>% mutate(flagLowerRawSpats=ifelse(is.na(flagLowerRawSpats),1,flagLowerRawSpats), flagUpperRawSpats=ifelse(is.na(flagUpperRawSpats),1,flagUpperRawSpats), flag=case_when(flagLowerRawSpats == 0 ~ "lower", flagUpperRawSpats == 0 ~ "upper", flagUpperRawSpats + flagLowerRawSpats ==2 ~ "OK" ))
ggplot(data=tmp,aes(x=thermalTime,y=biovolume,color=as.factor(repetition))) + geom_line() + geom_point(aes(shape=flag,color=as.factor(repetition))) + scale_shape_manual(values = c(0,19,2)) + facet_wrap(~geno_sce)
ggplot(data=tmp,aes(x=thermalTime,y=plantHeight,color=as.factor(repetition))) + geom_line() + geom_point(aes(shape=flag,color=as.factor(repetition))) + scale_shape_manual(values = c(0,19,2)) + facet_wrap(~geno_sce)
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.