#' Extract outlier estimates by individual templates
#'
#' This function extracts outlier estimates based on the squared root Euclidean distance between each
#' estimate generated by a template to its corresponding MALPACA final output. It replace the outlier
#' landmark coordinates in the input 4D array by NA.
#'
#' This function also generates a pdf, each page of which is a boxplot of
#' distances between each estimate to the final output for one specimen, storing in a user specified directory.
#' Estimates generated by different templates are denoted by different colors.
#' The threshold, which is the red horizontal line, is by default 2*standard deviation + mean for the pooled distances
#' for that specimen. Points above the line represent the outlier estimates.
#' @param allEstimates The 4d array that contains all estimates of individual templates. One can use the read.malpca.estimates function in SlicerMorphR to generate it.
#' @param MALPACA_medians The 3d array that contains MALPACA median estimates.
#' @param outputPath The directory for storing the output pdf for boxplots that mark outlier estimates.
#' @param ZScore The index is used to determined how many standard deviations above the mean distances for one specimen\cr
#' as the threshold for determining outliers. The default value is 2, so the threshold is 2*SD + mean.
#' @return A list that contains median estimates, manual landmarks and estimates of individual templates \cr
#' The returned value contains: \cr
#' \itemize{
#' \item $estimates_no_out = a 4d array with all outliers denoted as NA
#' \item $outlier_info = A list that stores outlier landmarks and templates that generate them for each specimen. The length is the sample size. Each element is a shorter list, in which each element represents a template and outlier estimates generated by it.
#' }
#'
#' @examples
#' #Please refer to the help file of 'read.malpaca.estimates' in SlicerMorphR.
#' @export
extract.outliers <- function(allEstimates, MALPACA_medians, outputPath, ZScore = 2){
#Extract template file names
template_names <- dimnames(allEstimates)[[3]]
target_names <- dimnames(allEstimates)[[4]]
#Extract LM number, template number, and sample size
LM_number <- dim(allEstimates)[1]
template_number <- dim(allEstimates)[3]
sample_size <- dim(allEstimates)[4]
#Calculate centroid sizes for each target using the MALPACA median estimates
Csizes <- NULL
for (i in 1:sample_size) {
centroid <- apply(MALPACA_medians[, , i], 2, mean)
temp_LMs <- sweep(MALPACA_medians[, , i], 2, centroid)
size <- sqrt(sum(rowSums(temp_LMs^2)))
Csizes <- append(Csizes, size)
}
#The 4d array that stores LM estimates after setting outlier estimates as NA
allEstimates_new <- allEstimates
#Record the outlier estimates generated by each template for every specimen
outlier_info <- vector("list", length = sample_size)
names(outlier_info) <- target_names
#Create a pdf for boxplots
pdf(file = paste(outputPath,"Individual LM distances to the final output.pdf", sep = "/"))
#First page: legends
plot(NULL, xaxt='n',yaxt='n',bty='n',ylab='',xlab='', xlim = 0:1, ylim = 0:1)
legend("topleft", legend = template_names, pch = 21,
col = "black", pt.bg = rainbow(template_number), border = "black", cex=1, bty = 'n')
for (k in 1:sample_size){
#Create a matrix for store distances between each LM estimate to the final output of MALPACA
LM_dists = matrix(nrow = template_number, ncol = LM_number)
#Read individual LM
for (i in 1:template_number){
#For the kth specimen estimated by the ith template, calculate LM distances vs. its final output
#Store distances in the ith column of LM_dists
LM_dists[i, ] <- rmse(allEstimates[, , i, k], MALPACA_medians[, , k])$LM_distances
}
#Unfold the LM_dists matrix to the vector Dists_vector by column
#Prepare a dataframe for boxplots
Dists_vector <- c(LM_dists)
out_template <- NULL #A vector storing indices of templates that generate outlier estimates for the kth specimen
out_LM <- NULL #A vector store indices of outlier estimates that match with each template in out_template
#Detect outliers and responsible template(s) for each LM for the kth specimen
#Loop through the dists_LMs matrix
#Threshold = ZScore*sd + mean based on pooled distances stored in Dists_vector
for (i in 1:LM_number){
for ( j in 1:template_number){
if (LM_dists[j, i] > (mean(Dists_vector) + ZScore*sd(Dists_vector))){
out_LM <- append(out_LM, i)
out_template <- append(out_template, j)
#If the ith LM estimate from the jth template for the kth specimen is an outlier, assign NA to its coordinates
allEstimates_new[i, ,j, k] <- c(NA, NA, NA)
}
}
}
#out_LM and out_template have the same lengths, each element in out_template matches with an element in out_LM
#Create a list outlier_info_kth for storing the outlier estimates generated by each template for the kth specimen
#Each element is a vector that stores outlier LM indices created by one template
outlier_info_kth <- vector("list", length = template_number)
names(outlier_info_kth) <- template_names
unique_templates <- sort(unique(out_template)) #Extract and sort unique indices of templates that generate outliers
#if out_template == 0, no outlier exists; if out_template > 0, at least one template has an outlier for a specimen
#Store outlier LM indices created by each template into the corresponding element in the "outlier_info_kth" list
if (length(out_template) > 0){
for (i in 1:length(unique_templates)){
Indices <- which(out_template == unique_templates[i])
#Assign indices of outlier LMs created by a template into the list
outlier_info_kth[[unique_templates[i]]] <- out_LM[Indices]
}
}
outlier_info[[k]] <- outlier_info_kth
#Prepare a dataframe for boxplot of LM distances to the final output for the kth specimen
groups <- rep(1:LM_number, each = template_number)
boxplot_mat <- cbind(Dists_vector, groups)
colnames(boxplot_mat) <- c("all_dists", "groups")
boxplot_df <- data.frame(boxplot_mat)
#Create boxplot for the kth spcimen
par(mar = c(5, 4, 4, 4) + 0.3) #Increased margin for additional axis
boxplot(all_dists~groups, boxplot_df, xlab = "", ylab = "", cex.axis = 1,
main = paste(target_names[k]), col = "white", medcol = "grey", boxcol = "grey", whiskcol = "grey", staplecol = "grey", outcol = "white")
title(xlab="Landmarks", ylab="Landmark Errors (mm)", cex.lab=1.5)
#Convert landmark distances for the kth specimen to % of the size
dists_csize <- Dists_vector*100/Csizes[k]
boxplot_mat_csize <- cbind(dists_csize, groups)
colnames(boxplot_mat_csize) <- c("all_dists", "groups")
boxplot_df_csize <- data.frame(boxplot_mat_csize)
#Add another y axis as dists in % of csizes
par(new=TRUE )
boxplot(all_dists~groups, boxplot_df_csize, axes = FALSE, bty = 'n',xlab = "", ylab = "", col = "white", medcol = "grey", boxcol = "grey", whiskcol = "grey", staplecol = "grey", outcol = "white")
axis(4)
mtext("Landmark Errors (% of centroid size)", side = 4, line = 3, cex = 1.5)
#Add each individual estimate as points on the boxplot; color each template's estimates by a unique color
c1 = rainbow(template_number)
point_counter <- 1
for (i in 1:LM_number){
LM_points <- NULL
for (j in 1: template_number){
LM_points <- append(LM_points, dists_csize[point_counter])
point_counter = point_counter + 1
}
points(x = rep(i, template_number), y = LM_points, pch = 21, bg = c1, cex = 1)
}
#Add a horizontal line representing the threshold for the ith
abline(h = (mean(dists_csize) + ZScore*sd(dists_csize)), col = "red")
}
dev.off()
results = list()
results$estimates_no_out <- allEstimates_new
results$outlier_info <- outlier_info
return(results)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.