SlicerMorphR: Import SlicerMorph data into R

Documented in extract.outliers

#' Extract outlier estimates by individual templates
#'
#' This function extracts outlier estimates based on the squared root Euclidean distance between each
#' estimate generated by a template to its corresponding MALPACA final output. It replace the outlier
#' landmark coordinates in the input 4D array by NA.
#'
#' This function also generates a pdf, each page of which is a boxplot of
#' distances between each estimate to the final output for one specimen, storing in a user specified directory.
#' Estimates generated by different templates are denoted by different colors.
#' The threshold, which is the red horizontal line, is by default 2*standard deviation + mean for the pooled distances
#' for that specimen. Points above the line represent the outlier estimates.
#' @param allEstimates The 4d array that contains all estimates of individual templates. One can use the read.malpca.estimates function in SlicerMorphR to generate it.
#' @param MALPACA_medians The 3d array that contains MALPACA median estimates.
#' @param outputPath The directory for storing the output pdf for boxplots that mark outlier estimates.
#' @param ZScore The index is used to determined how many standard deviations above the mean distances for one specimen\cr
#' as the threshold for determining outliers. The default value is 2, so the threshold is 2*SD + mean.
#' @return A list that contains median estimates, manual landmarks and estimates of individual templates \cr
#' The returned value contains: \cr
#' \itemize{
#'   \item $estimates_no_out = a 4d array with all outliers denoted as NA
#'   \item $outlier_info = A list that stores outlier landmarks and templates that generate them for each specimen. The length is the sample size. Each element is a shorter list, in which each element represents a template and outlier estimates generated by it.
#' }
#'
#' @examples
#' #Please refer to the help file of 'read.malpaca.estimates' in SlicerMorphR.
#' @export

extract.outliers <- function(allEstimates, MALPACA_medians, outputPath, ZScore = 2){
  #Extract template file names
  template_names <- dimnames(allEstimates)[[3]]
  target_names <- dimnames(allEstimates)[[4]]
  
  #Extract LM number, template number, and sample size
  LM_number <- dim(allEstimates)[1]
  template_number <- dim(allEstimates)[3]
  sample_size <- dim(allEstimates)[4]
  
  #Calculate centroid sizes for each target using the MALPACA median estimates
  Csizes <- NULL
  for (i in 1:sample_size) {
    centroid <- apply(MALPACA_medians[, , i], 2, mean)
    temp_LMs <- sweep(MALPACA_medians[, , i], 2, centroid)
    size <- sqrt(sum(rowSums(temp_LMs^2)))
    Csizes <- append(Csizes, size)
  }
  
  
  #The 4d array that stores LM estimates after setting outlier estimates as NA
  allEstimates_new <- allEstimates
  
  #Record the outlier estimates generated by each template for every specimen
  outlier_info <- vector("list", length = sample_size)
  names(outlier_info) <- target_names
  
  #Create a pdf for boxplots
  pdf(file = paste(outputPath,"Individual LM distances to the final output.pdf", sep = "/"))
  #First page: legends
  plot(NULL, xaxt='n',yaxt='n',bty='n',ylab='',xlab='', xlim = 0:1, ylim = 0:1)
  legend("topleft", legend = template_names, pch = 21,
         col = "black", pt.bg = rainbow(template_number), border = "black", cex=1, bty = 'n')
  
  for (k in 1:sample_size){
    #Create a matrix for store distances between each LM estimate to the final output of MALPACA
    LM_dists = matrix(nrow = template_number, ncol = LM_number)
    #Read individual LM
    for (i in 1:template_number){
      #For the kth specimen estimated by the ith template, calculate LM distances vs. its final output
      #Store distances in the ith column of LM_dists
      LM_dists[i, ] <- rmse(allEstimates[, , i, k], MALPACA_medians[, , k])$LM_distances
    }
    
    #Unfold the LM_dists matrix to the vector Dists_vector by column
    #Prepare a dataframe for boxplots
    Dists_vector <- c(LM_dists)
    
    out_template <- NULL #A vector storing indices of templates that generate outlier estimates for the kth specimen
    out_LM <- NULL #A vector store indices of outlier estimates that match with each template in out_template
    
    #Detect outliers and responsible template(s) for each LM for the kth specimen
    #Loop through the dists_LMs matrix
    #Threshold = ZScore*sd + mean based on pooled distances stored in Dists_vector
    for (i in 1:LM_number){
      for ( j in 1:template_number){
        if (LM_dists[j, i] > (mean(Dists_vector) + ZScore*sd(Dists_vector))){
          out_LM <- append(out_LM, i)
          out_template <- append(out_template, j)
          #If the ith LM estimate from the jth template for the kth specimen is an outlier, assign NA to its coordinates
          allEstimates_new[i, ,j, k] <- c(NA, NA, NA)
        }
      }
    }
    #out_LM and out_template have the same lengths, each element in out_template matches with an element in out_LM
    
    #Create a list outlier_info_kth for storing the outlier estimates generated by each template for the kth specimen
    #Each element is a vector that stores outlier LM indices created by one template
    outlier_info_kth <- vector("list", length = template_number)
    names(outlier_info_kth) <- template_names
    unique_templates <- sort(unique(out_template)) #Extract and sort unique indices of templates that generate outliers
    
    #if out_template == 0, no outlier exists; if out_template > 0, at least one template has an outlier for a specimen
    #Store outlier LM indices created by each template into the corresponding element in the "outlier_info_kth" list
    if (length(out_template) > 0){
      for (i in 1:length(unique_templates)){
        Indices <- which(out_template == unique_templates[i])
        #Assign indices of outlier LMs created by a template into the list
        outlier_info_kth[[unique_templates[i]]] <- out_LM[Indices]
      }
    }
    outlier_info[[k]] <- outlier_info_kth
    
    #Prepare a dataframe for boxplot of LM distances to the final output for the kth specimen
    groups <- rep(1:LM_number, each = template_number)
    boxplot_mat <- cbind(Dists_vector, groups)
    colnames(boxplot_mat) <- c("all_dists", "groups")
    boxplot_df <- data.frame(boxplot_mat)
    
    #Create boxplot for the kth spcimen
    par(mar = c(5, 4, 4, 4) + 0.3)  #Increased margin for additional axis
    boxplot(all_dists~groups, boxplot_df, xlab = "", ylab = "", cex.axis = 1,
            main = paste(target_names[k]), col = "white", medcol = "grey", boxcol = "grey", whiskcol = "grey", staplecol = "grey", outcol = "white")
    title(xlab="Landmarks", ylab="Landmark Errors (mm)", cex.lab=1.5)
    
    #Convert landmark distances for the kth specimen to % of the size
    dists_csize <- Dists_vector*100/Csizes[k]
    boxplot_mat_csize <- cbind(dists_csize, groups)
    colnames(boxplot_mat_csize) <- c("all_dists", "groups")
    boxplot_df_csize <- data.frame(boxplot_mat_csize)
    
    #Add another y axis as dists in % of csizes
    par(new=TRUE )
    boxplot(all_dists~groups, boxplot_df_csize, axes = FALSE, bty = 'n',xlab = "", ylab = "", col = "white", medcol = "grey", boxcol = "grey", whiskcol = "grey", staplecol = "grey", outcol = "white")
    axis(4)
    
    mtext("Landmark Errors (% of centroid size)", side = 4, line = 3, cex = 1.5)
    
    #Add each individual estimate as points on the boxplot; color each template's estimates by a unique color
    c1 = rainbow(template_number)
    point_counter <- 1
    for (i in 1:LM_number){
      LM_points <- NULL
      for (j in 1: template_number){
        LM_points <- append(LM_points, dists_csize[point_counter])
        point_counter = point_counter + 1
      }
      points(x = rep(i, template_number), y = LM_points, pch = 21, bg = c1, cex = 1)
    }
    #Add a horizontal line representing the threshold for the ith
    abline(h = (mean(dists_csize) + ZScore*sd(dists_csize)), col = "red")
  }
  
  dev.off()
  
  results = list()
  results$estimates_no_out <- allEstimates_new
  results$outlier_info <- outlier_info
  
  return(results)
}