#' Categorize cell lines by the level of similarity to k-nearest tumors
#'
#' @param num_tumors_for_comparison number of tumors used in a k-nearest
#' neighbor comparison (DEFAULT: 10)
#' @param dist_mat a matrix of pairwise weighted distances between all cell lines and tumors
#' @param cell_line_ids IDs/names of cell lines
#' @param tumor_ids IDs of tumors
#' @param trim_cell_line_names a boolean whether to trim the the cell lines;
#' this is optional and used for long cell line names in CCLE format (i.e. CELLLINE_TISSUE); (DEFAULT: FALSE)
#'
#' @return a list with the following items:
#' * mean_similarity_cell_line_to_k_nearest_tumors: the mean similarity of each cell line to the k-nearest tumors
#' * mean_similarity_tumor_to_k_nearest_tumors: the mean similarity of each tumor sample to the k-nearest tumors
#' * categorization: a 2-column data.frame with the cell line categorizations: Sample_ID and Category; Category values can be: "Great", "Good", "Moderately Good", "Poor", "Outliers"
#' @md
#'
#' @author Rileen Sinha (rileen@gmail.com), Augustin Luna (aluna@jimmy.harvard.edu)
#'
#' @examples
#' # Generated using: tumorcomparer::run_comparison()
#' comparison_result <- readRDS(system.file("test_output",
#' "ov_comparison_result.rds",
#' package="tumorcomparer"))
#'
#' categorization_list <- categorize_cell_lines(
#' num_tumors_for_comparison=length(comparison_result$tumor_ids)-1,
#' dist_mat=comparison_result$dist_mat,
#' cell_line_ids=comparison_result$cell_line_ids,
#' tumor_ids=comparison_result$tumor_ids,
#' trim_cell_line_names=FALSE)
#'
#' @concept tumorcomparer
#' @export
#'
#' @importFrom stats cor dist mad median sd
categorize_cell_lines <- function(num_tumors_for_comparison=10,
dist_mat,
cell_line_ids,
tumor_ids,
trim_cell_line_names=FALSE) {
#categorize_cell_lines <- function(fraction_of_tumors_for_comparison=0.1,
# dist_mat,
# composite_mat,
# cell_line_ids,
# tumor_ids,
# trim_cell_line_names=FALSE) {
num_cell_lines <- length(cell_line_ids)
num_tumors <- length(tumor_ids)
## Set K-nearest neighbors
k <- num_tumors_for_comparison
#colnames(dist_mat) <- colnames(composite_mat)
#rownames(dist_mat) <- colnames(composite_mat)
dist_tumors_only <- dist_mat[setdiff(colnames(dist_mat),cell_line_ids),setdiff(colnames(dist_mat),cell_line_ids)]
# Calculate the standard deviations for categorization
median_dist_tumor_to_k_nearest_tumors <- rep(NA, num_tumors)
mad_dist_tumor_to_k_nearest_tumors <- rep(NA, num_tumors)
mean_dist_tumor_to_k_nearest_tumors <- rep(NA, num_tumors)
sd_dist_tumor_to_k_nearest_tumors <- rep(NA, num_tumors)
for(i in 1:num_tumors){
# Sort the distances of this tumor to other tumors, and save distances to the k-nearest tumors (in variable tmp)
tmp <- sort(dist_tumors_only[i,-i])[1:k]
median_dist_tumor_to_k_nearest_tumors[i] <- median(tmp)
mad_dist_tumor_to_k_nearest_tumors[i] <- mad(tmp)
mean_dist_tumor_to_k_nearest_tumors[i] <- mean(tmp)
sd_dist_tumor_to_k_nearest_tumors[i] <- sd(tmp)
}
dist_cell_line_to_nearest_tumor <- rep(NA, num_cell_lines)
median_dist_cell_line_to_k_nearest_tumors <- rep(NA, num_cell_lines)
mad_dist_cell_line_to_k_nearest_tumors <- rep(NA, num_cell_lines)
mean_dist_cell_line_to_k_nearest_tumors <- rep(NA, num_cell_lines)
sd_dist_cell_line_to_k_nearest_tumors <- rep(NA, num_cell_lines)
dist_cell_line_to_nearest_tumor <- rep(NA, num_cell_lines)
for(i in 1:num_cell_lines) {
# Sort the distances of this cell line to tumors, and save distances to the k-nearest tumors (in variable tmp)
tmp <- sort(dist_mat[cell_line_ids[i], setdiff(colnames(dist_mat),cell_line_ids)])[1:k]
mad_dist_cell_line_to_k_nearest_tumors[i] <- mad(tmp)
median_dist_cell_line_to_k_nearest_tumors[i] <- median(tmp)
sd_dist_cell_line_to_k_nearest_tumors[i] <- sd(tmp)
mean_dist_cell_line_to_k_nearest_tumors[i] <- mean(tmp)
# Compute distance to nearest tumor
#dist_cell_line_to_nearest_tumor[i] <- min(dist_mat[cell_line_ids[i],setdiff(colnames(dist_mat),cell_line_ids)])
}
names(median_dist_cell_line_to_k_nearest_tumors) <- cell_line_ids
names(mean_dist_cell_line_to_k_nearest_tumors) <- cell_line_ids
names(mad_dist_cell_line_to_k_nearest_tumors) <- cell_line_ids
names(sd_dist_cell_line_to_k_nearest_tumors) <- cell_line_ids
names(dist_cell_line_to_nearest_tumor) <- cell_line_ids
#cutoff_high <- mean(mean_dist_tumor_to_k_nearest_tumors) + 3*sd(mean_dist_tumor_to_k_nearest_tumors)
#cutoff_low <- mean(mean_dist_tumor_to_k_nearest_tumors) + sd(mean_dist_tumor_to_k_nearest_tumors)
median_similarity_cell_line_to_k_nearest_tumors <- 1 - median_dist_cell_line_to_k_nearest_tumors
mean_similarity_cell_line_to_k_nearest_tumors <- 1 - mean_dist_cell_line_to_k_nearest_tumors
median_similarity_tumor_to_k_nearest_tumors <- 1 - median_dist_tumor_to_k_nearest_tumors
mean_similarity_tumor_to_k_nearest_tumors <- 1 - mean_dist_tumor_to_k_nearest_tumors
#cutoff_high_similarity <- mean(mean_similarity_tumor_to_k_nearest_tumors) - sd(mean_similarity_tumor_to_k_nearest_tumors)
#cutoff_low_similarity <- mean(mean_similarity_tumor_to_k_nearest_tumors) - 3*sd(mean_similarity_tumor_to_k_nearest_tumors)
if(trim_cell_line_names) {
cell_line_ids <- sapply(cell_line_ids, return_first_part)
}
# Categorize matches
great_matches <- cell_line_ids[which(mean_similarity_cell_line_to_k_nearest_tumors >= mean(mean_similarity_tumor_to_k_nearest_tumors))]
good_matches <- cell_line_ids[which((mean_similarity_cell_line_to_k_nearest_tumors >= mean(mean_similarity_tumor_to_k_nearest_tumors) - sd(mean_similarity_tumor_to_k_nearest_tumors)) & (mean_similarity_cell_line_to_k_nearest_tumors < mean(mean_similarity_tumor_to_k_nearest_tumors))) ]
moderately_good_matches <- cell_line_ids[which((mean_similarity_cell_line_to_k_nearest_tumors >= (mean(mean_similarity_tumor_to_k_nearest_tumors) - 2*sd(mean_similarity_tumor_to_k_nearest_tumors)) & (mean_similarity_cell_line_to_k_nearest_tumors < (mean(mean_similarity_tumor_to_k_nearest_tumors) - sd(mean_similarity_tumor_to_k_nearest_tumors))))) ]
poor_matches <- cell_line_ids[which((mean_similarity_cell_line_to_k_nearest_tumors >= (mean(mean_similarity_tumor_to_k_nearest_tumors) - 3*sd(mean_similarity_tumor_to_k_nearest_tumors)) & (mean_similarity_cell_line_to_k_nearest_tumors < (mean(mean_similarity_tumor_to_k_nearest_tumors) - 2*sd(mean_similarity_tumor_to_k_nearest_tumors))))) ]
outliers <- cell_line_ids[which(mean_similarity_cell_line_to_k_nearest_tumors < (mean(mean_similarity_tumor_to_k_nearest_tumors) - 3*sd(mean_similarity_tumor_to_k_nearest_tumors)))]
matches_samples <- c(great_matches, good_matches, moderately_good_matches, poor_matches, outliers)
matches_categories <- c(rep("Great", length(great_matches)),
rep("Good", length(good_matches)),
rep("Moderately Good", length(moderately_good_matches)),
rep("Poor", length(poor_matches)),
rep("Outlier", length(outliers))
)
# Create single data.frame
categorization <- data.frame(Sample_ID= matches_samples, Category=matches_categories, stringsAsFactors = FALSE)
# Create results
results <- list(
mean_similarity_cell_line_to_k_nearest_tumors=mean_similarity_cell_line_to_k_nearest_tumors,
mean_similarity_tumor_to_k_nearest_tumors=mean_similarity_tumor_to_k_nearest_tumors,
categorization=categorization
)
return(results)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.