R/outlier_detect.R

Defines functions outlier_detect

Documented in outlier_detect

#' @title Outlier detection and replacement
#' @description This function identifies outlier observations
#' in the trajectories, and allows users to replace the observations
#' or remove trajectories entirely.
#' @param traj [matrix (numeric)]: longitudinal data. Each row
#' represents an individual trajectory (of observations). The columns
#' show the observations at consecutive time points.
#' @param id_field [numeric or character] Whether the first column
#' of the \code{traj} is a unique (\code{id}) field.
#' Default: \code{FALSE}. If \code{TRUE} the function recognizes
#' the second column as the first time step.
#' @param method [integer (numeric)] indicating the method for
#' identifying the outlier. Options are: \code{'1'}: quantile method
#' (\code{default}), and \code{'2'}: manual method. The \code{manual}
#' method requires a user-defined value.
#' @param threshold [numeric] A cut-off value for outliers. If the
#' \code{method} parameter is set as \code{'1'}:quantile, the \code{threshold}
#' should be a numeric vector of probability between \code{[0,1]}, whilst if
#' the \code{method} is set as \code{'2'}: \code{manual}, the
#' \code{threshold} could be any numeric vector.
#' @param count [integer (numeric)] indicating the number of observations
#' (in a trajectory) that must exceed the \code{threshold} in order for the
#' trajectory to be considered an \code{outlier}. Default is \code{1}.
#' @param replace_with [integer (numeric)] indicating the technique to
#' use for calculating a replacement for an outlier observation. The remaining
#' observations on the row or the column in which the outlier observation is
#' located are used to calculate the replacement.
#' The replacement options are: \code{'1'}: Mean value of the column,
#' \code{'2'}: Mean value of the row and \code{'3'}: remove the row
#' (trajectory) completely from the data. Default value is the
#' \code{'1'} option.
#' @param verbose to suppress output messages (to the console).
#' Default: \code{TRUE}.
#' @usage outlier_detect(traj, id_field = FALSE, method = 1, threshold = 0.95,
#' count = 1, replace_with = 1, verbose=TRUE)
#' @details Given a matrix, this function identifies outliers that
#' exceed the threshold and replaces the outliers with an estimate
#' calculated using the other observations either the rows or the columns
#' in which the outlier observation is located. Option is also provided to
#' remove the trajectories (containing the outlier) from the data.
#' @examples
#'
#' data(traj)
#'
#' trajectry <- data_imputation(traj, id_field=TRUE, method = 1,
#'    replace_with = 1, verbose=FALSE)
#'
#' trajectry <- props(trajectry$CompleteData, id_field=TRUE)
#'
#' outp <- outlier_detect(trajectry, id_field = TRUE, method = 1,
#' threshold = 0.95, count = 1, replace_with = 1, verbose=TRUE)
#'
#' outp <- outlier_detect(trajectry, id_field = TRUE, method = 2, threshold = 15,
#'   count = 4, replace_with = 3, verbose=TRUE)
#'
#' @return A dataframe with outlier observations replaced or removed.
#' @importFrom utils flush.console
#' @export

outlier_detect <- function(traj, id_field = FALSE, method = 1, threshold = 0.95,
                          count = 1, replace_with = 1, verbose = TRUE){

  solution <- list()

  dat <- traj

  #back up data
  b_dat <- dat

  #remove the id field
  if(id_field ==  TRUE){
    dat  <- dat[,2:ncol(dat)]
    c_name <- colnames(traj)[1]
  }

  #matrix to track the outlier incidents [TRUE or FALSE]
  outlier_mat <- matrix(FALSE, nrow(dat), ncol(dat))

  #------------------------------------
  #if method: "quantile"
  if(method==1){
    #check if the value is in-between [0,1]
    if(threshold < 0 | threshold > 1){
      stop(paste("*--Terminated!!!--*, The 'threshold'",
      "value should be between 0 and 1", sep=" "))
    }
    #calculate the cut-off value based on the 'threshold'
    thres_ <- as.vector(round(quantile(as.vector(unlist(as.data.frame(dat))),
                                       threshold), digits=5))
    id_ <- which(dat > thres_)
    #update the outlier tracker
    outlier_mat[id_] <- "TRUE"
  }

  # method: "manual"
  #------------------------------------
  if(method==2){
    id_ <- which(dat > threshold)
    outlier_mat[id_] <- "TRUE"
    thres_ <- threshold
  }

  #check if an entire column is outliers...then terminate.
  c_out <- NULL
  for(n_ in seq_len(ncol(outlier_mat))){ #n_<-1
    c_out <- rbind(c_out, cbind(n_, length(which(outlier_mat[,n_]==TRUE))))
  }

  if(replace_with == 1){
    wc_out <- which(c_out[,2]==nrow(dat))
    if(length(wc_out)!=0){
      stop(paste("*--Function terminated!!!--* All observations on Column(s)",
      wc_out, "are outliers!!", sep=" "))
    }
  }

  #check if an entire row is ouliers...then terminate.
  r_out <- NULL
  for(m_ in seq_len(nrow(outlier_mat))){ #m_<-1
    r_out <- rbind(r_out, cbind(m_,length(which(outlier_mat[m_,]==TRUE))))
  }

  if(replace_with == 2){# check which trajectory has 100% outlier observations
    w_out <- which(r_out[,2]==ncol(dat))
    if(length(w_out)!=0){
      stop(paste("*--Function terminated!!!--* All observations on Row(s)",
      w_out,"are outliers!!", sep=" "))
    }
  }

  #--------------------------------------------------
  #identify the rows in where outliers are found
  list_traj <- NULL
  for(j in seq_len(nrow(outlier_mat))){ #j<-1
    w_ <- length(which(outlier_mat[j,]==TRUE))
    if(w_ >= count){ #checking the count
      list_traj <- rbind(list_traj, cbind(j,"TRUE"))
    }
  }

  #to replace the outlier observation,
  if(!is.null(list_traj)){

    #replace with mean of col
    if(replace_with == 1){
      for(k in seq_len(nrow(list_traj))){ #k<-1
      idd_ <-
        which(outlier_mat[as.numeric(as.character(list_traj[k,1])),]==TRUE)

        #loop through each column and remove the outlier in
        #them before calculating the value of the mean column value
        for(l_ in seq_len(length(idd_))){ #l_<-1
          dat[as.numeric(as.character(list_traj[k,1])),idd_[l_]] <-
          round(mean(dat[-which(outlier_mat[,idd_[l_]]==TRUE),idd_[l_]]),
                digits = 2)
        }
      }
    }

    #replace with mean of row
    if(replace_with == 2){
      for(k in seq_len(nrow(list_traj))){ #k<-1
        #use the non-outlier observation for the calculation
        idd_nonOutlier <-
          which(outlier_mat[as.numeric(as.character(list_traj[k,1])),]==FALSE)
        idd_Outlier <-
          which(outlier_mat[as.numeric(as.character(list_traj[k,1])),]==TRUE)
        dat[list_traj[k,1],idd_Outlier] <-
      round(mean(as.numeric(as.character(dat[list_traj[k,1],idd_nonOutlier]))),
                digits = 2)
      }
    }

    #to remove the outlier trajectory
    if(replace_with == 3){
      dat <- dat[-as.numeric(as.character(list_traj[,1])),]
    }

    dat_ <- dat

    if(id_field ==  TRUE & replace_with != 3){
      id <- data.frame(as.vector(b_dat[,1]))
      colnames(id) <- c_name
      b_dat <- cbind(id, dat)
      dat_  <- b_dat
    }

    if(id_field ==  TRUE & replace_with == 3){
      id <- data.frame(as.vector(b_dat[,1]))
      colnames(id) <- c_name
      #remove the oulier row from the column ids
      id <- id[-as.numeric(as.character(list_traj[,1]))]
      b_dat <-  b_dat[-as.numeric(as.character(list_traj[,1])),]
      dat_  <- b_dat
    }

    #if 'replace_with' is 1 or 2
    if(replace_with==1|replace_with==2){

      if(verbose==TRUE){
        flush.console()
        print(paste(nrow(list_traj),
          paste("trajectories were found to contain outlier",
            "observations and replaced accordingly!", sep=" "),
          sep=" "))
        print("Summary:")
      }

      for(u_ in seq_len(nrow(list_traj))){ #u_<-1
        if(verbose==TRUE){
          flush.console()
          print(paste("*--Outlier observation(s) was found in trajectory ",
          list_traj[u_,1]," --*", sep=""))
        }
      }
    }

    #if 'replace_with' is 3
    if(replace_with==3){
      if(verbose==TRUE){
        flush.console()
        print(paste(nrow(list_traj),
             "trajectory(s) identified as outliers and removed!", sep=" "))
              print("Details:")
      }

      for(u_ in seq_len(nrow(list_traj))){ #u_<-1
        if(verbose==TRUE){
          flush.console()
          print(paste("*----- trajectory ",
              list_traj[u_,1], "removed"), sep=" ")
        }
      }
    }
  }

  #for method 2, in which outlier may not be found
  if(is.null(list_traj)){
    dat_ <- b_dat
      if(verbose==TRUE){
        print("No outlier(s) found!")
      }
  }

  #indexes of traj where outliers are found
  outlier_obs_id <- as.numeric(as.character(list_traj[,1]))
  non_outlier_obs_id <-
    seq_len(nrow(dat))[!seq_len(nrow(dat)) %in% outlier_obs_id]
  threshold_estimated <- thres_
  solution <- list(Outlier_Observations=outlier_obs_id,
                   Non_Outlier_Observations=non_outlier_obs_id,
                   Threshold=threshold_estimated,
                   Outliers_Replaced = dat_)
  return(solution)

}

#outlier detected in traj index (if id_field is absent..)
#    #replace with mean of col

Try the akmedoids package in your browser

Any scripts or data that you put into this service are public.

akmedoids documentation built on April 13, 2021, 9:07 a.m.