#' @title Outlier detection and replacement
#' @description This function identifies outlier observations
#' in the trajectories, and allows users to replace the observations
#' or remove trajectories entirely.
#' @param traj [matrix (numeric)]: longitudinal data. Each row
#' represents an individual trajectory (of observations). The columns
#' show the observations at consecutive time points.
#' @param id_field [numeric or character] Whether the first column
#' of the \code{traj} is a unique (\code{id}) field.
#' Default: \code{FALSE}. If \code{TRUE} the function recognizes
#' the second column as the first time step.
#' @param method [integer (numeric)] indicating the method for
#' identifying the outlier. Options are: \code{'1'}: quantile method
#' (\code{default}), and \code{'2'}: manual method. The \code{manual}
#' method requires a user-defined value.
#' @param threshold [numeric] A cut-off value for outliers. If the
#' \code{method} parameter is set as \code{'1'}:quantile, the \code{threshold}
#' should be a numeric vector of probability between \code{[0,1]}, whilst if
#' the \code{method} is set as \code{'2'}: \code{manual}, the
#' \code{threshold} could be any numeric vector.
#' @param count [integer (numeric)] indicating the number of observations
#' (in a trajectory) that must exceed the \code{threshold} in order for the
#' trajectory to be considered an \code{outlier}. Default is \code{1}.
#' @param replace_with [integer (numeric)] indicating the technique to
#' use for calculating a replacement for an outlier observation. The remaining
#' observations on the row or the column in which the outlier observation is
#' located are used to calculate the replacement.
#' The replacement options are: \code{'1'}: Mean value of the column,
#' \code{'2'}: Mean value of the row and \code{'3'}: remove the row
#' (trajectory) completely from the data. Default value is the
#' \code{'1'} option.
#' @param verbose to suppress output messages (to the console).
#' Default: \code{TRUE}.
#' @usage outlier_detect(traj, id_field = FALSE, method = 1, threshold = 0.95,
#' count = 1, replace_with = 1, verbose=TRUE)
#' @details Given a matrix, this function identifies outliers that
#' exceed the threshold and replaces the outliers with an estimate
#' calculated using the other observations either the rows or the columns
#' in which the outlier observation is located. Option is also provided to
#' remove the trajectories (containing the outlier) from the data.
#' @examples
#'
#' data(traj)
#'
#' trajectry <- data_imputation(traj, id_field=TRUE, method = 1,
#' replace_with = 1, verbose=FALSE)
#'
#' trajectry <- props(trajectry$CompleteData, id_field=TRUE)
#'
#' outp <- outlier_detect(trajectry, id_field = TRUE, method = 1,
#' threshold = 0.95, count = 1, replace_with = 1, verbose=TRUE)
#'
#' outp <- outlier_detect(trajectry, id_field = TRUE, method = 2, threshold = 15,
#' count = 4, replace_with = 3, verbose=TRUE)
#'
#' @return A dataframe with outlier observations replaced or removed.
#' @importFrom utils flush.console
#' @export
outlier_detect <- function(traj, id_field = FALSE, method = 1, threshold = 0.95,
count = 1, replace_with = 1, verbose = TRUE){
solution <- list()
dat <- traj
#back up data
b_dat <- dat
#remove the id field
if(id_field == TRUE){
dat <- dat[,2:ncol(dat)]
c_name <- colnames(traj)[1]
}
#matrix to track the outlier incidents [TRUE or FALSE]
outlier_mat <- matrix(FALSE, nrow(dat), ncol(dat))
#------------------------------------
#if method: "quantile"
if(method==1){
#check if the value is in-between [0,1]
if(threshold < 0 | threshold > 1){
stop(paste("*--Terminated!!!--*, The 'threshold'",
"value should be between 0 and 1", sep=" "))
}
#calculate the cut-off value based on the 'threshold'
thres_ <- as.vector(round(quantile(as.vector(unlist(as.data.frame(dat))),
threshold), digits=5))
id_ <- which(dat > thres_)
#update the outlier tracker
outlier_mat[id_] <- "TRUE"
}
# method: "manual"
#------------------------------------
if(method==2){
id_ <- which(dat > threshold)
outlier_mat[id_] <- "TRUE"
thres_ <- threshold
}
#check if an entire column is outliers...then terminate.
c_out <- NULL
for(n_ in seq_len(ncol(outlier_mat))){ #n_<-1
c_out <- rbind(c_out, cbind(n_, length(which(outlier_mat[,n_]==TRUE))))
}
if(replace_with == 1){
wc_out <- which(c_out[,2]==nrow(dat))
if(length(wc_out)!=0){
stop(paste("*--Function terminated!!!--* All observations on Column(s)",
wc_out, "are outliers!!", sep=" "))
}
}
#check if an entire row is ouliers...then terminate.
r_out <- NULL
for(m_ in seq_len(nrow(outlier_mat))){ #m_<-1
r_out <- rbind(r_out, cbind(m_,length(which(outlier_mat[m_,]==TRUE))))
}
if(replace_with == 2){# check which trajectory has 100% outlier observations
w_out <- which(r_out[,2]==ncol(dat))
if(length(w_out)!=0){
stop(paste("*--Function terminated!!!--* All observations on Row(s)",
w_out,"are outliers!!", sep=" "))
}
}
#--------------------------------------------------
#identify the rows in where outliers are found
list_traj <- NULL
for(j in seq_len(nrow(outlier_mat))){ #j<-1
w_ <- length(which(outlier_mat[j,]==TRUE))
if(w_ >= count){ #checking the count
list_traj <- rbind(list_traj, cbind(j,"TRUE"))
}
}
#to replace the outlier observation,
if(!is.null(list_traj)){
#replace with mean of col
if(replace_with == 1){
for(k in seq_len(nrow(list_traj))){ #k<-1
idd_ <-
which(outlier_mat[as.numeric(as.character(list_traj[k,1])),]==TRUE)
#loop through each column and remove the outlier in
#them before calculating the value of the mean column value
for(l_ in seq_len(length(idd_))){ #l_<-1
dat[as.numeric(as.character(list_traj[k,1])),idd_[l_]] <-
round(mean(dat[-which(outlier_mat[,idd_[l_]]==TRUE),idd_[l_]]),
digits = 2)
}
}
}
#replace with mean of row
if(replace_with == 2){
for(k in seq_len(nrow(list_traj))){ #k<-1
#use the non-outlier observation for the calculation
idd_nonOutlier <-
which(outlier_mat[as.numeric(as.character(list_traj[k,1])),]==FALSE)
idd_Outlier <-
which(outlier_mat[as.numeric(as.character(list_traj[k,1])),]==TRUE)
dat[list_traj[k,1],idd_Outlier] <-
round(mean(as.numeric(as.character(dat[list_traj[k,1],idd_nonOutlier]))),
digits = 2)
}
}
#to remove the outlier trajectory
if(replace_with == 3){
dat <- dat[-as.numeric(as.character(list_traj[,1])),]
}
dat_ <- dat
if(id_field == TRUE & replace_with != 3){
id <- data.frame(as.vector(b_dat[,1]))
colnames(id) <- c_name
b_dat <- cbind(id, dat)
dat_ <- b_dat
}
if(id_field == TRUE & replace_with == 3){
id <- data.frame(as.vector(b_dat[,1]))
colnames(id) <- c_name
#remove the oulier row from the column ids
id <- id[-as.numeric(as.character(list_traj[,1]))]
b_dat <- b_dat[-as.numeric(as.character(list_traj[,1])),]
dat_ <- b_dat
}
#if 'replace_with' is 1 or 2
if(replace_with==1|replace_with==2){
if(verbose==TRUE){
flush.console()
print(paste(nrow(list_traj),
paste("trajectories were found to contain outlier",
"observations and replaced accordingly!", sep=" "),
sep=" "))
print("Summary:")
}
for(u_ in seq_len(nrow(list_traj))){ #u_<-1
if(verbose==TRUE){
flush.console()
print(paste("*--Outlier observation(s) was found in trajectory ",
list_traj[u_,1]," --*", sep=""))
}
}
}
#if 'replace_with' is 3
if(replace_with==3){
if(verbose==TRUE){
flush.console()
print(paste(nrow(list_traj),
"trajectory(s) identified as outliers and removed!", sep=" "))
print("Details:")
}
for(u_ in seq_len(nrow(list_traj))){ #u_<-1
if(verbose==TRUE){
flush.console()
print(paste("*----- trajectory ",
list_traj[u_,1], "removed"), sep=" ")
}
}
}
}
#for method 2, in which outlier may not be found
if(is.null(list_traj)){
dat_ <- b_dat
if(verbose==TRUE){
print("No outlier(s) found!")
}
}
#indexes of traj where outliers are found
outlier_obs_id <- as.numeric(as.character(list_traj[,1]))
non_outlier_obs_id <-
seq_len(nrow(dat))[!seq_len(nrow(dat)) %in% outlier_obs_id]
threshold_estimated <- thres_
solution <- list(Outlier_Observations=outlier_obs_id,
Non_Outlier_Observations=non_outlier_obs_id,
Threshold=threshold_estimated,
Outliers_Replaced = dat_)
return(solution)
}
#outlier detected in traj index (if id_field is absent..)
# #replace with mean of col
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.