R/pointCleaning.R

Defines functions FuncDetectPointOutlierLocFit plotFlagPoint flagPointLocfit

Documented in flagPointLocfit FuncDetectPointOutlierLocFit plotFlagPoint

#-------------------------------------------------------------------------------
# Program: pointCleaning.R
# Objective: detection of outlier points for time courses using a
#            locfit smoothing and an interval of prediction
# Author: I.Sanchez
# Creation: 13/04/2018
# Update  : 04/09/2020
#-------------------------------------------------------------------------------

#' flagPointLocfit
#'
#' detection of outlier points in time courses
#' @param datain input dataframe of parameters
#' @param trait character, parameter of interest (ex: plantHeight)
#' @param xvar character, time variable (ex: thermalTime)
#' @param loopID character, ID on which to make the loop
#' @param locfit.h numeric, the constant component of the smoothing parameter
#' @param threshold numeric, threshold to detect on the prediction interval
#'
#' @return a list
#' \describe{
#' \item{1}{prediction and detection of outlier on observed data}
#' \item{2}{prediction on regular abscissa data}
#' \item{3}{time courses with not enough point to be evaluated}
#' }
#' @export
#'
#' @importFrom dplyr select bind_rows
#' 
#' @examples
#' \donttest{
#' # Example
#' }
flagPointLocfit<-function(datain,trait,xvar,loopID,locfit.h,threshold){
  
  .Deprecated("FuncDetectPointOutlierLocFit")
  
  # Initialisation
  #myfm<-list()
  tabpred<-NULL    # dataframe on observed data
  tabpredreg<-NULL    # dataframe on regular abscissa data
  tabNotEnoughPoint<-NULL # dataframe of observed data with not enough point by loopId

  for (i in levels(factor(datain[,loopID]))){
    ### select the data
    tmp<-datain[datain[,loopID]==i,]
    tmp<-na.omit(select(tmp,.data[[trait]],.data[[xvar]]))
    y<-tmp[,trait]
    x<-tmp[,xvar]
    # abscisse reguliere
    xreg<-seq(0,max(x,na.rm=TRUE),1)

    ### Choix du pas suivant le nombre de donnees, car certaines courbes peuvent avoir peu de points...
    if (length(x) <=4){
      next
    } else if (length(x) <= 7) {
      if (is.null(locfit.h)) mylocfit<-30
      else mylocfit<-locfit.h
    } else {
      if (is.null(locfit.h)) mylocfit<-15
      else mylocfit<-locfit.h
    }

    ### lissage et intervalles de prediction
    if (length(na.omit(y)) <=4){
      tabNotEnoughPoint<-bind_rows(tabNotEnoughPoint,
                                    select(datain[datain[,loopID]==i,],
                                            .data[[loopID]],.data[[xvar]],
                                            .data[[trait]]))
      names(tabNotEnoughPoint)<-c("Ref","x","y")
      next
    } else {
      # modele locfit
      myfm<-locfit(y ~ lp(x, h = mylocfit, deg = 2))
      #-- Recuperation des predictions pour l'intervalle sur vecteur regulier
      ypred<-predict(myfm,newdata=list(x=xreg),se.fit=TRUE)
      lwr<-ypred[[1]]-threshold*ypred[[2]]
      upr<-ypred[[1]]+threshold*ypred[[2]]
      #-- Recuperation des predictions pour l'intervalle sur x observe
      ypred2<-predict(myfm,newdata=list(x=x),se.fit=TRUE)
      lwr2<-ypred2[[1]]-threshold*ypred2[[2]]
      upr2<-ypred2[[1]]+threshold*ypred2[[2]]

      #-- concatenation des resus en dataframe
      tppred<-cbind.data.frame(rep(i,length(x)),x,y,ypred2[[1]],ypred2[[2]],lwr2,upr2)
      tppred<-mutate(tppred,outlier=if_else(y < upr2 & y > lwr2,0,1))
      tabpred<-rbind.data.frame(tabpred,tppred)

      tppredreg<-cbind.data.frame(rep(i,length(xreg)),xreg,ypred[[1]],ypred[[2]],lwr,upr)
      tabpredreg<-rbind.data.frame(tabpredreg,tppredreg)
    }
  }
  names(tabpred)<-c("Ref","x","y","ypred","ypred_se","lwr","upr","outlier")
  names(tabpredreg)<-c("Ref","xreg","ypred","ypred_se","lwr","upr")
  return(list(tabpred,tabpredreg,tabNotEnoughPoint))
}

#' plotFlagPoint
#'
#' plot detection of outlier points in time courses
#' @param smoothin the 1st element of a flagPointLocfit() list result. smoothin contains the prediction on observed data and the detected outlier
#' @param loopID character, name of the column of the time courses' ID
#' @param myselect character, a vector of ID
#'
#' @return a graph
#' @export
#'
#' @examples
#' \donttest{
#' # Example
#' }
plotFlagPoint<-function(smoothin,loopID,myselect){
  # filtering smoothin dataframe for some time courses
  tmp<-smoothin[smoothin[,loopID] %in% myselect,]
  g<-ggplot(data=tmp,aes(x=x,y=y)) +
    geom_point() +
    geom_line(aes(y=ypred,colour="red")) +
    geom_line(aes(y=upr,colour="green")) +
    geom_line(aes(y=lwr,colour="green")) +
    geom_point(data=subset(tmp, outlier == 1),colour="blue",shape=2,size=2) +
    facet_wrap(loopID)
  print(g)
}

#,nbCurveByPage,xlabel
#plot(x,y,main=i,xlab=xlabel,ylab=trait,cex.main=0.7)
# vecteur pour disposition des graphs dans page
# if (nbCurveByPage ==1) mypage<-c(1,1)
# else if (nbCurveByPage ==2) mypage<-c(2,1)
# else if (nbCurveByPage %in% c(3,4)) mypage<-c(2,2)
# else if (nbCurveByPage %in% c(5,6)) mypage<-c(2,2)
# else if (nbCurveByPage %in% c(7,8)) mypage<-c(4,2)
# else if (nbCurveByPage ==9) mypage<-c(3,3)
# par(mfrow=mypage)
# Graph locfit modele + IC
#plot(x,y,main=i,xlab=xlabel,pch=19,cex.main=0.7,cex=0.8)
#lines(xreg,ypred[[1]],col="red")
#lines(xreg,upr,col="green")
#lines(xreg,lwr,col="green")
#points(filter(tppred,outlier==1)[,"x"],filter(tppred,outlier==1)[,"y"],col="blue",pch=1,cex=1.5)

#------------- Tutorial version

#' FuncDetectPointOutlierLocFit
#' @description function to model each curve of a dataset using a local regression
#' @param datain input dataframe. This dataframe contains a set of time courses
#' @param myparam character, name of the variable to model in datain 
#'                 (for example, Biomass, PH or LA and so on)
#' @param mytime  character, name of the time variable in datain which must be numeric
#' @param myid    character, name of the id variable in datain
#' @param mylevel numeric, factor to calculate the confidence interval
#' @param mylocfit numeric, The constant component of the smoothing parameter. (see the locfit())
#' 
#' @details see locfit() help function from the locfit R library
#' The user can act on:
#' \describe{
#'  \item{mylocfit}{the constant of the smoothing parameter. Increase mylocfit to have a 
#'    very smooth curve}
#'  \item{mylevel}{the level to calculate the confidence interval. Increase mylevel to 
#'    exclude less outliers}
#' }
#' @details to produce the grahics of the prediction and detected outliers, please use
#'         plotDetectPointOutlierLocFit() function.
#' @return a data.frame with outlier containing the following columns:
#' \describe{
#'  \item{Ref}{the id variable}
#'  \item{mytime}{name of the time variable in datain}
#'  \item{myparam}{name of the modeled variable in datain}
#'  \item{ypred}{the locfit prediction}
#'  \item{sd_ypred}{standard deviation of the prediction}
#'  \item{lwr}{lower bound of the confidence interval}
#'  \item{upr}{upper bound of the confidence interval}
#'  \item{outlier}{flag of detected outlier (0 is outlier, 1 is not)}
#' }
#' @export
#'
#' @importFrom dplyr select_ bind_rows mutate if_else
#' 
#' @examples
#' \donttest{
#' data(plant1)
#' library(locfit)
#' selec<-c("manip1_1_1_WW","manip1_1_2_WW","manip1_1_3_WW",
#'          "manip1_1_4_WW","manip1_1_5_WW")
#' mydata<-plant1[plant1[,"Ref"] %in% selec,]
#' resu<-FuncDetectPointOutlierLocFit(datain=mydata,
#'                myparam="biovolume",mytime="thermalTime",
#'                myid="Ref",mylevel=5,mylocfit=70)
#' }
#-------------------------------------------------------------------
FuncDetectPointOutlierLocFit <-function(datain,
                                        myparam,
                                        mytime,
                                        myid,
                                        mylevel,
                                        mylocfit) {
  
  myfm<-list()     # initialize a list
  tabpred<-NULL    # initialize a dataframe
  
  for (i in levels(factor(datain[,myid]))) {
    tmp <- datain[datain[,myid]==i,] 
    tmp <- na.omit(tmp)
    y <- tmp[,myparam]
    x <- tmp[,mytime]
    xpred <- seq(0,max(x,na.rm=TRUE),1)
    
    # Choice of step according to the number of data
    if (length(x) <=6) {
      print("Not enough data (<=6 points): ",i)
      next
    } else {
      myfm <- locfit(y ~ lp(x, h = mylocfit, deg = 2))
      
      # Retrieving predictions for the interval for a fine graphic
      ypred <- predict(myfm,newdata=list(x=xpred),se.fit=TRUE)
      lwr <- ypred[[1]]-mylevel*ypred[[2]]
      upr <- ypred[[1]]+mylevel*ypred[[2]]
      
      # Retrieving predictions for the x input interval
      ypred2 <- predict(myfm,newdata=list(x=x),se.fit=TRUE)
      lwr2 <- ypred2[[1]] - mylevel*ypred2[[2]]
      upr2 <- ypred2[[1]] + mylevel*ypred2[[2]]
      
      # joining results into dataframe
      tppred <- cbind.data.frame(rep(i,length(x)),x,y,ypred2[[1]],ypred2[[2]],lwr2,upr2)
      tppred <- mutate(tppred,outlier=if_else(y < upr2 & y > lwr2,0,1))
      tabpred <- rbind.data.frame(tabpred,tppred) 
    }
  }
  
  names(tabpred)<-c(myid,mytime,myparam,"ypred","sd_ypred","lwr","upr","outlier")
  return(tabpred)
}

#---------------------------- End of file --------------------------
sanchezi/openSilexStatR documentation built on Sept. 10, 2020, 1:03 p.m.