R/stats_functions.R

Defines functions do_feature_selection test_radiomic_signature calc_cox_regression calc_concordance_index select_top_features calc_auc calc_kruskal_wallis calc_wilcox calc_differential_radiomics do_hierarchical_clustering

Documented in calc_concordance_index calc_cox_regression calc_differential_radiomics do_feature_selection do_hierarchical_clustering select_top_features test_radiomic_signature

# perform hierarchical clustering on rdr obj ---------------------------------------------------
#' Perform hierarchical clustering of the radiomic dataset
#'
#' @param rdr A RadAR object (class \code{\link{SummarizedExperiment}}).
#' @param which_data (character) Which data use for the computation of the correlation coefficients. It can be one of the following: "normal",
#' "scaled", "normalized".
#' @param method_dist_row (character) Which method use to computed distance matrix between features (rows). Print available
#' methods by \code{\link{print_distance_methods}}
#' @param method_dist_col (character) Which method use to computed distance matrix between samples (columns). Print available
#' methods by \code{\link{print_distance_methods}}
#' @param method_hcl_row (character) Which method use to generate hierarchical clustering for features (rows). Print available
#' methods by \code{\link{print_hcl_methods}}
#' @param method_hcl_col (character) Which method use to generate hierarchical clustering for samples (columns). Print available
#' methods by \code{\link{print_hcl_methods}}
#'
#' @return An updated rdr (a RadAR object)
#' @author Matteo Benelli (\email{matteo.benelli@uslcentro.toscana.it})
#' @export
#'
#' @examples
do_hierarchical_clustering <- function(rdr = NULL,
                                       which_data = "scaled",
                                       method_dist_row = "euclidean",
                                       method_dist_col = "correlation.pearson",
                                       method_hcl_row = "ward.D",
                                       method_hcl_col = "ward.D"

)
{
  methods_hcl <- c("ward.D",
                   "ward.D2",
                   "single",
                   "complete",
                   "average" ,
                   "mcquitty",
                   "median",
                   "centroid")
  methods_dist <- c("euclidean",
                    "maximum",
                    "manhattan",
                    "canberra",
                    "binary",
                    "minkowski",
                    "correlation.pearson",
                    "correlation.spearman",
                    "correlation.kendall"
  )

  assertthat::assert_that(length(rdr) > 0, msg = "[RadAR] Error: rdr object required")
  assertthat::assert_that(which_data %in% c("normal", "scaled", "normalized"), msg = "[RadAR] Error: Invalid data type")
  assertthat::assert_that(method_dist_row %in% methods_dist & length(method_dist_row) == 1, msg = "[RadAR] Error: Invalid method for calculating distance")
  assertthat::assert_that(method_dist_col %in% methods_dist & length(method_dist_col) == 1, msg = "[RadAR] Error: Invalid method for calculating distance")
  assertthat::assert_that(method_hcl_row %in% methods_hcl & length(method_hcl_row) == 1, msg = "[RadAR] Error: Invalid method for performing hierarcgical clustering")
  assertthat::assert_that(method_hcl_col %in% methods_hcl & length(method_hcl_col) == 1, msg = "[RadAR] Error: Invalid method for performing hierarcgical clustering")

  if (which_data == "normal") {
    data <- assays(rdr)$values
  }
  if (which_data == "scaled") {
    data <- assays(rdr)$scaled_values
    assertthat::assert_that(!is.null(data), msg = "[RadAR] Error: Feature values have not been yet scaled")
  }
  if (which_data == "normalized") {
    data <- assays(rdr)$norm_values
    assertthat::assert_that(!is.null(data), msg = "[RadAR] Error: Feature values have not been yet normalized")
  }

  ## calc distance
  dist0 <- unlist(strsplit(method_dist_col, ".", fixed = T))
  if (dist0[1] != "correlation") {
    dist_col <- dist(t(data), method = method_dist_col)
  } else  {
    dist_col <- as.dist(1 - cor(data, method = dist0[2], use = "na"))
  }
  dist0 <- unlist(strsplit(method_dist_row, ".", fixed = T))
  if (dist0[1] != "correlation") {
    dist_row <- dist(data, method = method_dist_row)
  } else  {
    dist_row <- as.dist(1 - cor(t(data), method = dist0[2], use = "na"))
  }
  hcl_col <- hclust(dist_col, method = method_hcl_col)
  hcl_row <- hclust(dist_row, method = method_hcl_row)
  hcl <- list (hcl_col = hcl_col,
               hcl_row = hcl_row)

  metadata(rdr)$hcl <- hcl
  metadata(rdr)$which_data_hcl <- which_data
  return(rdr)
}

# calc differential radiomics -------------------------------------------------------------------
#' Perform differential radiomic analysis
#'
#' This function implements two non-parametric statistical tests (Wilcoxon-Mann-Whitney and AUC)
#' to perform differential analysis of radiomic features. In case of Wilcoxon-Mann-Whitney p-values can be adjusted using
#' different methods.
#'
#'
#' @param rdr A RadAR object (class \code{\link{SummarizedExperiment}}).
#' @param conditions (numeric, character) Vector of labels for each sample in rdr. Should have the same
#' length of \code{ncol(rdr)}. Required.
#' @param which_data (character) Which data use for plot. It can be one of the following: "normal", "scaled", "normalized".
#' @param method (character) Which statistical methods use for differential analysis.
#' It can be "wilcox" (Wilcoxon-Mann-Whitney), "auc" (Area Under the Curve) or "kruskal-wallis".
#' @param adjust_pvalues_by (character) Which method use to correct p-values from wilcox test. Print available methods by
#' \code{\link{p.adjust.methods}}.
#' @param thr_pvalue (numeric) P-value threshold to identify statistically significant features for wilcox or kruskal-wallis.
#' It should be in the range (0, 1].
#' @param thr_auc (numeric) AUC threshold to identify statistically significant features.
#' It should be in the range (0.5,1].
#'
#' @return An updated rdr (a RadAR object)
#' @author Matteo Benelli (\email{matteo.benelli@uslcentro.toscana.it})
#' @export
#'
#' @examples
calc_differential_radiomics <- function(rdr = NULL,
                                        conditions = NULL,
                                        which_data = "scaled",
                                        method = "wilcox",
                                        adjust_pvalues_by = "BH",
                                        thr_pvalue = NA,
                                        thr_auc = NA)

{

  assertthat::assert_that(length(rdr) > 0,
                          msg = "[RadAR] Error: rdr object required")
  if (!is.na(thr_pvalue)) {
    assertthat::assert_that(thr_pvalue > 0,
                            msg = "[RadAR] Error: thr_pvalue should be positive")
  }
  if (!is.na(thr_auc)) {
    assertthat::assert_that(thr_auc > 0.5 & thr_auc <= 1,
                            msg = "[RadAR] Error: thr_auc should be in the range (0.5,1]")
  }
  assertthat::assert_that(length(conditions) == ncol(rdr),
                          msg = "[RadAR] Error: conditions need to be specified and should be a vector of length ncol(rdr)")
  assertthat::assert_that(which_data %in% c("normal", "scaled", "normalized"),
                          msg = "[RadAR] Error: Invalid data type")
  assertthat::assert_that(method %in% c("wilcox", "auc", "kruskal-wallis"),
                          msg = "[RadAR] Error: invalid method to perform differential statistics")

  if (which_data == "normal") {
    data <- assays(rdr)$values
  }
  if (which_data == "scaled") {
    data <- assays(rdr)$scaled_values
    assertthat::assert_that(!is.null(data), msg = "[RadAR] Error: feature values have not been yet scaled")
  }
  if (which_data == "normalized") {
    data <- assays(rdr)$norm_values
    assertthat::assert_that(!is.null(data), msg = "[RadAR] Error: feature values have not been yet normalized")
  }

  if (method %in% c("wilcox", "auc")) {

    assertthat::assert_that(length(unique(conditions)) == 2 |
                              (length(unique(conditions)) == 3 & any(is.na(unique(conditions)))),
                            msg = "[RadAR] Error: two conditions should be indicated (test vs ctrl)")

    unique_conditions <- unique(conditions)
    if (any(is.na(unique_conditions))) {
      unique_conditions <- unique_conditions[-which(is.na(unique_conditions))]
    }

    median_cond1 <- apply(data[, which(conditions == unique_conditions[1])], 1, median, na.rm = T)
    median_cond2 <- apply(data[, which(conditions == unique_conditions[2])], 1, median, na.rm = T)

    if (method == "wilcox") {
      assertthat::assert_that(adjust_pvalues_by %in% p.adjust.methods,
                              msg = "[RadAR] Invalid method for adjusting p-values")
      pvalues <- apply(data, 1, calc_wilcox, conditions)
      adjusted_pvalues <- p.adjust(pvalues, method = adjust_pvalues_by)
      rowData(rdr)$wilcox_test_pvalue <- adjusted_pvalues
      metadata(rdr)$which_data_wilcox <- which_data
      metadata(rdr)$conditions_wilcox <- conditions

      if (!is.na(thr_pvalue)) {
        ix_features <- which(rowData(rdr)$wilcox_test_pvalue <= thr_pvalue  )
        if (length(ix_features) == 0) {
          message("Warning: [RadAR] No statistically significant features found")
        }
        flag_condition <- rep("", nrow(rdr))
        flag_condition[which(median_cond1 > median_cond2 &
                               adjusted_pvalues <= thr_pvalue)] <- paste("Higher in", unique_conditions[1])
        flag_condition[which(median_cond2 > median_cond1 &
                               adjusted_pvalues <= thr_pvalue)] <- paste("Higher in", unique_conditions[2])
        rowData(rdr)$wilcox_test_description <- flag_condition
      }
    }

    if (method == "auc") {
      auc <- apply(data, 1, calc_auc, conditions)
      rowData(rdr)$auc_value <- auc
      metadata(rdr)$which_data_auc <- which_data
      metadata(rdr)$conditions_auc <- conditions
      if (!is.na(thr_auc)) {
        ix_features <- which(rowData(rdr)$auc_value >= thr_auc | rowData(rdr)$auc_value <= (1-thr_auc) )
        if (length(ix_features) == 0) {
          message("Warning: [RadAR] No statistically significant features found")
        }
        flag_condition <- rep("", nrow(rdr))
        flag_condition[which(auc >= thr_auc)] <- paste("Higher in", unique_conditions[1])
        flag_condition[which(auc <= (1-thr_auc) )] <- paste("Higher in", unique_conditions[2])
        rowData(rdr)$auc_description <- flag_condition
      }
    }
  }

  if (method == "kruskal-wallis") {
    assertthat::assert_that(length(unique(conditions)) > 2 |
                              (length(unique(conditions)) > 3 & any(is.na(unique(conditions)))),
                            msg = "[RadAR] Error: at least three conditions should be indicated (test vs ctrl)")

    assertthat::assert_that(adjust_pvalues_by %in% p.adjust.methods,
                            msg = "[RadAR] Invalid method for adjusting p-values")

    pvalues <- apply(data, 1, calc_kruskal_wallis, conditions)
    adjusted_pvalues <- p.adjust(pvalues, method = adjust_pvalues_by)
    rowData(rdr)$kruskal_wallis_test_pvalue <- adjusted_pvalues
    metadata(rdr)$which_data_kruskal_wallis <- which_data
    metadata(rdr)$conditions_kruskal_wallis <- conditions

    if (!is.na(thr_pvalue)) {
      ix_features <- which(rowData(rdr)$kruskal_wallis_test_pvalue <= thr_pvalue  )
      if (length(ix_features) == 0) {
        message("Warning: [RadAR] No statistically significant features found")
      }
    }
  }


  return (rdr)

}


calc_wilcox <- function(x, conditions) {
  y <- wilcox.test(x ~ conditions)
  return (y$p.value)
}

calc_kruskal_wallis <- function(x, conditions) {
  y <- kruskal.test(x ~ conditions)
  return (y$p.value)
}

calc_auc <- function(x, conditions) {
  y <- wilcox.test(x ~ conditions)
  return (y$statistic/prod(table(conditions)))
}

#' Select top radiomic features according to a given statistics
#'
#' This function create a summary table of most significant features based on previously computed statistics
#' (wilcox, AUC, concordance index or cox regression)
#'
#' @param rdr A RadAR object (class \code{\link{SummarizedExperiment}}).
#' @param which_statistics (character) Select top features based on one of the following pre-computed statistics:
#' "wilcox", "AUC", "concordance" or "cox".
#' @param thr_pvalue (numeric) P-value threshold to identify statistically significant features from wilcox.
#' It should be in the range (0, 1].
#' @param thr_auc (numeric) AUC threshold to identify statistically significant features from AUC
#' It should be in the range (0.5, 1].
#' @param thr_concordance (numeric) Threshold to identify statistically significant features from concordance analysis.
#' It should be in the range (0, .5].
#' @param thr_cox_zvalue (numeric) Z threshold (Wald statistics) to identify statistically significant features from cox regression analysis.
#' It should be in the range (0, inf).
#' @param write_to (character) If specified, filename to output top statistically significant features (tab-delimited).
#'
#' @return A table of class \code{\link{tibble}} reporting top features.
#' @author Matteo Benelli (\email{matteo.benelli@uslcentro.toscana.it})
#' @export
#'
#' @examples
select_top_features  <- function(rdr = NULL,
                                 which_statistics = "wilcox",
                                 thr_pvalue = 5e-2,
                                 thr_auc = .8,
                                 thr_concordance = 0.10,
                                 thr_cox_zvalue = 3,
                                 write_to = NULL
)

{

  assertthat::assert_that(length(rdr) > 0, msg = "[RadAR] Error: rdr object required")
  assertthat::assert_that(thr_pvalue > 0, msg = "[RadAR] Error: thr_pvalue_wilcox should be positive")
  assertthat::assert_that(thr_cox_zvalue > 0, msg = "[RadAR] Error: thr_cox_zvalue should be positive")
  assertthat::assert_that(thr_auc > 0.5 & thr_auc <= 1, msg = "[RadAR] Error: auc_thr should be in the range (0.5,1]")
  assertthat::assert_that(thr_concordance > 0 & thr_concordance < 0.5, msg = "[RadAR] Error: thr_concordance should be in the range (0, .5)")
  assertthat::assert_that(length(which_statistics) == 1, msg = "[RadAR] Error: Only one statistics should be indicated")
  assertthat::assert_that(which_statistics %in% c("wilcox", "auc", "concordance", "cox", "kruskal-wallis"),
                          msg = "[RadAR] Error: Invalid statistcs")

  if (which_statistics == "kruskal-wallis") {
    which_statistics <- "kruskal_wallis"
  }
  which_data_wilcox <- metadata(rdr)$which_data_wilcox
  which_data_auc <- metadata(rdr)$which_data_auc
  which_data_cox <- metadata(rdr)$which_data_cox
  which_data_concordance <- metadata(rdr)$which_data_concordance
  which_data_kruskal_wallis <- metadata(rdr)$which_data_kruskal_wallis

  conditions_wilcox <- metadata(rdr)$conditions_wilcox
  conditions_auc <- metadata(rdr)$conditions_auc
  conditions_kruskal_wallis <- metadata(rdr)$conditions_wilcox

  which_data <- get(paste0("which_data", "_", which_statistics))

  if (which_statistics == "wilcox") {
    assertthat::assert_that(any("wilcox_test_pvalue" %in% colnames(rowData(rdr))),
                            msg = "[RadAR] Error: Wilcox-Mann-Whitney test has not been yet computed")
    thr_pvalue_wilcox <- thr_pvalue
  }
  if (which_statistics == "auc") {
    assertthat::assert_that(any("auc_value" %in% colnames(rowData(rdr))),
                            msg = "[RadAR] Error: AUC test has not been yet computed")
  }
  if (which_statistics == "kruskal_wallis") {
    assertthat::assert_that(any("kruskal_wallis_test_pvalue" %in% colnames(rowData(rdr))),
                            msg = "[RadAR] Error: Kruskal-Wallis test has not been yet computed")

  }

  if (which_data == "normal") {
    data <- assays(rdr)$values
  }
  if (which_data == "scaled") {
    data <- assays(rdr)$scaled_values
    assertthat::assert_that(!is.null(data),
                            msg = "[RadAR] Error: Feature values have not been scaled yet")
  }
  if (which_data == "normalized") {
    data <- assays(rdr)$norm_values
    assertthat::assert_that(!is.null(data),
                            msg = "[RadAR] Error: Feature values have not been normalized yet")
  }

  if (which_statistics %in% c("wilcox", "auc")) {
    conditions <- get(paste0("conditions", "_", which_statistics))
    unique_conditions <- unique(conditions)
    if (any(is.na(unique_conditions))) {
      unique_conditions <- unique_conditions[-which(is.na(unique_conditions))]
    }
    median_cond1 <- apply(data[, which(conditions == unique_conditions[1])], 1, median, na.rm = T)
    median_cond2 <- apply(data[, which(conditions == unique_conditions[2])], 1, median, na.rm = T)
  }

  if (which_statistics == "wilcox") {

    ix_features <- which(rowData(rdr)$wilcox_test_pvalue <= thr_pvalue_wilcox)
    assertthat::assert_that(length(ix_features) > 0, msg = "[RadAR] Error: No statistically significant features found")
    flag_condition <- rep("", nrow(rdr))
    flag_condition[which(median_cond1 > median_cond2 &
                           rowData(rdr)$wilcox_test_pvalue <= thr_pvalue_wilcox)] <- paste("Higher in", unique_conditions[1])
    flag_condition[which(median_cond2 > median_cond1 &
                           rowData(rdr)$wilcox_test_pvalue <= thr_pvalue_wilcox)] <- paste("Higher in", unique_conditions[2])

    df_top_features <- data.frame(feature_name = rowData(rdr)$feature_name[ix_features],
                                  image_type = rowData(rdr)$image_type[ix_features],
                                  feature_type = rowData(rdr)$feature_type[ix_features],
                                  feature_description = rowData(rdr)$feature_description[ix_features],
                                  statistics = rowData(rdr)$wilcox_test_pvalue[ix_features],
                                  statistics_description = flag_condition[ix_features],
                                  median_cond1[ix_features],
                                  median_cond2[ix_features]
    )
    colnames(df_top_features)[c(7, 8)] <- c(paste0("median_in_", unique_conditions[1]),
                                            paste0("median_in_", unique_conditions[2]))
    rownames(df_top_features) <- rownames(rdr)[ix_features]
    df_top_features <- df_top_features[order(df_top_features$statistics), ]
  }

  if (which_statistics == "auc") {

    ix_features <- which(rowData(rdr)$auc_value >= thr_auc | rowData(rdr)$auc_value <= (1-thr_auc) )
    assertthat::assert_that(length(ix_features) > 0, msg = "[RadAR] Error: No statistically significant features found")

    flag_condition <- rep("", nrow(rdr))
    flag_condition[which(median_cond1 > median_cond2 &
                           abs(rowData(rdr)$auc_value-0.5) > (thr_auc-0.5))] <- paste("Higher in", unique_conditions[1])
    flag_condition[which(median_cond2 > median_cond1 &
                           abs(rowData(rdr)$auc_value-0.5) > (thr_auc-0.5))] <- paste("Higher in", unique_conditions[2])

    df_top_features <- data.frame(feature_name = rowData(rdr)$feature_name[ix_features],
                                  image_type = rowData(rdr)$image_type[ix_features],
                                  feature_type = rowData(rdr)$feature_type[ix_features],
                                  feature_description = rowData(rdr)$feature_description[ix_features],
                                  statistics = rowData(rdr)$auc_value[ix_features],
                                  statistics_description = flag_condition[ix_features],
                                  median_cond1[ix_features],
                                  median_cond2[ix_features]
    )
    colnames(df_top_features)[c(7, 8)] <- c(paste0("median_in_", unique_conditions[1]),
                                            paste0("median_in_", unique_conditions[2]))
    rownames(df_top_features) <- rownames(rdr)[ix_features]
    df_top_features <- df_top_features[order(abs(df_top_features$statistics-0.5), decreasing = T), ]
  }

  if (which_statistics == "kruskal_wallis") {

    ix_features <- which(rowData(rdr)$kruskal_wallis_test_pvalue <= thr_pvalue)
    assertthat::assert_that(length(ix_features) > 0,
                            msg = "[RadAR] Error: No statistically significant features found")

    df_top_features <- data.frame(feature_name = rowData(rdr)$feature_name[ix_features],
                                  image_type = rowData(rdr)$image_type[ix_features],
                                  feature_type = rowData(rdr)$feature_type[ix_features],
                                  feature_description = rowData(rdr)$feature_description[ix_features],
                                  statistics = rowData(rdr)$kruskal_wallis_test_pvalue[ix_features]
    )
    rownames(df_top_features) <- rownames(rdr)[ix_features]
    df_top_features <- df_top_features[order(df_top_features$statistics), ]
  }
  if (which_statistics == "concordance") {

    ix_features <- which(abs(rowData(rdr)$concordance_index - 0.5) >= thr_concordance  )
    assertthat::assert_that(length(ix_features) > 0, msg = "[RadAR] No statistically significant features found")

    flag_condition <- rep("", nrow(rdr))
    flag_condition[intersect(ix_features, which(rowData(rdr)$concordance_index - .5 < 0))] <- "worse increasing"
    flag_condition[intersect(ix_features, which(rowData(rdr)$concordance_index - .5 > 0))] <- "worse decreasing"

    df_top_features <- data.frame(feature_name = rowData(rdr)$feature_name[ix_features],
                                  image_type = rowData(rdr)$image_type[ix_features],
                                  feature_type = rowData(rdr)$feature_type[ix_features],
                                  feature_description = rowData(rdr)$feature_description[ix_features],
                                  statistics = rowData(rdr)$concordance_index[ix_features],
    )
    rownames(df_top_features) <- rownames(rdr)[ix_features]
    df_top_features <- df_top_features[order(abs(df_top_features$statistics-0.5), decreasing = T), ]
  }

  if (which_statistics == "cox") {

    ix_features <- which(abs(rowData(rdr)$cox_regression_statistics) >= thr_cox_zvalue)
    assertthat::assert_that(length(ix_features) > 0, msg = "[RadAR] No statistically significant features found")

    flag_condition <- rep("", nrow(rdr))
    flag_condition[intersect(ix_features, which(rowData(rdr)$cox_regression_statistics >= thr_cox_zvalue))] <- "worse increasing"
    flag_condition[intersect(ix_features, which(rowData(rdr)$cox_regression_statistics <= -thr_cox_zvalue))] <- "worse decreasing"
    rowData(rdr)$cox_regression_description <- flag_condition

    df_top_features <- data.frame(feature_name = rowData(rdr)$feature_name[ix_features],
                                  image_type = rowData(rdr)$image_type[ix_features],
                                  feature_type = rowData(rdr)$feature_type[ix_features],
                                  feature_description = rowData(rdr)$feature_description[ix_features],
                                  statistics = rowData(rdr)$cox_regression_statistics[ix_features],
                                  statistics_description = flag_condition[ix_features]
    )
    rownames(df_top_features) <- rownames(rdr)[ix_features]
    df_top_features <- df_top_features[order(abs(df_top_features$statistics), decreasing = T), ]
  }
  if (!is.null(write_to)) {
    write.table(df_top_features, file = write_to, quote = F, row.names = F, sep ="\t")
  }
  df_top_features <- tibble::as_tibble(df_top_features)
  return (df_top_features)

}

#' Compute concordance index for radiomic features
#'
#' This function calculates concordance index of radiomic features.
#' Response should be a survival object obtained by \code{\link{Surv}}
#'
#' @param rdr A RadAR object (class \code{\link{SummarizedExperiment}}).
#' @param surv_obj An object of class \code{\link{Surv}}.
#' @param thr_concordance (numeric) Threshold to identify statistically significant features from concordance analysis.
#' It should be in the range (0, .5].
#' @param which_data (character) Which data use to compute concordance index. It can be one of the following: "normal",
#' "scaled", "normalized".
#'
#' @return An updated rdr (a RadAR object)
#' @author Matteo Benelli (\email{matteo.benelli@uslcentro.toscana.it})
#' @export
#'
#' @examples
calc_concordance_index <- function(rdr = NULL,
                                   surv_obj = NULL,
                                   thr_concordance = NA,
                                   which_data = "scaled"
)
{

  assertthat::assert_that(length(rdr) > 0, msg = "[RadAR] Error: rdr object required")
  assertthat::assert_that(is.Surv(surv_obj) > 0, msg = "[RadAR] Error: surv_obj object required")
  assertthat::assert_that(which_data %in% c("normal", "scaled", "normalized"), msg = "[RadAR] Error: Invalid data type")

  if (!is.na(thr_concordance)) {
    assertthat::assert_that(thr_concordance > 0 & thr_concordance < 0.5, msg = "[RadAR] Error: thr_concordance should be in the range (0, .5)")
  }

  if (which_data == "normal") {
    data <- assays(rdr)$values
  }
  if (which_data == "scaled") {
    data <- assays(rdr)$scaled_values
    assertthat::assert_that(!is.null(data), msg = "[RadAR] Error: Feature values have not been yet scaled")
  }
  if (which_data == "normalized") {
    data <- assays(rdr)$norm_values
    assertthat::assert_that(!is.null(data), msg = "[RadAR] Error: Feature values have not been yet normalized")
  }

  concordance_index <- rep(NA, nrow(rdr))
  for (i in 1: nrow(rdr)) {
    concordance_index[i] <- concordance(surv_obj ~ data[i, ])$concordance
    #    concordance_index[i] <- concordance(surv_obj ~ data[i, ], data = data.frame(rdr_original@assays$data$values[i, ]))$concordance
  }
  rowData(rdr)$concordance_index <- concordance_index
  metadata(rdr)$which_data_concordance <- which_data

  if (!is.na(thr_concordance)) {
    ix_features <- which(abs(rowData(rdr)$concordance_index - .5) >= thr_concordance)
    if (length(ix_features) == 0) {
      message("Warning: [RadAR] No statistically significant features found")
    }
    flag_condition <- rep("", nrow(rdr))
    flag_condition[intersect(ix_features, which(rowData(rdr)$concordance_index - .5 < 0))] <- "worse increasing"
    flag_condition[intersect(ix_features, which(rowData(rdr)$concordance_index - .5 > 0))] <- "worse decreasing"
    rowData(rdr)$concordance_description <- flag_condition
  }
  return(rdr)

}


#' Fit a Cox regression model using radiomic features
#'
#' This function fits a Cox Proportional-Hazards Model using radiomic features as independent variables.
#' Response should be an object obtained by \code{\link{Surv}}
#'
#' @param rdr A RadAR object (class \code{\link{SummarizedExperiment}}).
#' @param surv_obj An object of class \code{\link{Surv}}.
#' @param thr_cox_zvalue (numeric) Z threshold (Wald statistics) to identify statistically significant features from cox regression analysis.
#' It should be in the range (0, inf).
#' @param which_data (character) Which data use to compute concordance index. It can be one of the following: "normal",
#' "scaled", "normalized".
#' @param multiple_regression (character). Cox regression can be "multiple" (surv ~ all features) or "single" (surv ~ each feature).
#'
#' @return An updated rdr (a RadAR object)
#' @author Matteo Benelli (\email{matteo.benelli@uslcentro.toscana.it})
#' @export
#'
#' @examples
calc_cox_regression <- function(rdr = NULL,
                                surv_obj = NULL,
                                thr_cox_zvalue = NA,
                                which_data = "scaled",
                                multiple_regression = F
)
{

  assertthat::assert_that(length(rdr) > 0, msg = "[RadAR] Error: rdr object required")
  assertthat::assert_that(is.Surv(surv_obj) > 0, msg = "[RadAR] Error: surv_obj object required")
  assertthat::assert_that(which_data %in% c("normal", "scaled", "normalized"), msg = "[RadAR] Error: Invalid data type")
  assertthat::assert_that(is.logical(multiple_regression), msg = "[RadAR] Error: multiple_regression should be logical")

  if (!is.na(thr_cox_zvalue)) {
    assertthat::assert_that(thr_cox_zvalue > 0, msg = "[RadAR] Error: thr_cox_zvalue should be positive")
  }

  if (which_data == "normal") {
    data <- assays(rdr)$values
  }
  if (which_data == "scaled") {
    data <- assays(rdr)$scaled_values
    assertthat::assert_that(!is.null(data), msg = "[RadAR] Error: Feature values have not been yet scaled")
  }
  if (which_data == "normalized") {
    data <- assays(rdr)$norm_values
    assertthat::assert_that(!is.null(data), msg = "[RadAR] Error: Feature values have not been yet normalized")
  }

  if (!multiple_regression) {
    cox_regression_statistics <- rep(NA, nrow(rdr))
    cox_hazard_ratio <- rep(NA, nrow(rdr))
    for (i in 1: nrow(rdr)) {
      tmp <- summary(coxph(surv_obj ~ data[i, ]))$coefficients
      cox_regression_statistics[i] <- tmp[1, 4]
      cox_hazard_ratio[i] <- tmp[1, 2]
    }
  }
  if (multiple_regression) {
    tmp <- summary(coxph(surv_obj ~ t(data)))$coefficients
    cox_regression_statistics <- tmp[, 4]
    cox_hazard_ratio <-  tmp[, 2]
  }

  rowData(rdr)$cox_regression_statistics <- cox_regression_statistics
  rowData(rdr)$cox_hazard_ratio <- cox_hazard_ratio
  metadata(rdr)$which_data_cox <- which_data
  metadata(rdr)$which_cox_regression <- ifelse(multiple_regression, "multiple", "single")

  if (!is.na(thr_cox_zvalue)) {
    ix_features <- which(abs(rowData(rdr)$cox_regression_statistics) >= thr_cox_zvalue)
    if (length(ix_features) == 0) {
      message("Warning: [RadAR] No statistically significant features found")
    }
    flag_condition <- rep("", nrow(rdr))
    flag_condition[intersect(ix_features, which(rowData(rdr)$cox_regression_statistics >= thr_cox_zvalue))] <- "worse increasing"
    flag_condition[intersect(ix_features, which(rowData(rdr)$cox_regression_statistics <= -thr_cox_zvalue))] <- "worse decreasing"
    rowData(rdr)$cox_regression_description <- flag_condition
  }
  return(rdr)
}

#' Test radiomic signature by Cox regression model
#'
#' @param rdr A RadAR object (class \code{\link{SummarizedExperiment}}).
#' @param surv_obj An object of class \code{\link{Surv}}.
#' @param signature (character) vector of radiomic features defining the signature.
#' @param signature_rdr  RadAR object (class \code{\link{SummarizedExperiment}}) obtained by
#' \code{\link{do_feature_selection}}. Required to test a model generated using
#'  glmnet-cox or glmnet-binom.
#' @param which_data (character) Which data use to compute concordance index. It can be one of the following: "normal",
#' "scaled", "normalized".
#'
#' @return In case of mRMR, hcl and pca methods, the function returns the results of the cox regression model, including the concordance index.
#' In case of glmnet-cox method, the function returns a list of two elements, including 1) the prediction of the glmnet model on the new data
#' and 2) the computation of concordance index.
#'
#' @author Matteo Benelli (\email{matteo.benelli@uslcentro.toscana.it})
#' @export
#'
#' @examples
test_radiomic_signature <- function(rdr = NULL,
                                    surv_obj = NULL,
                                    signature = NULL,
                                    signature_rdr = NULL,
                                    which_data = "scaled"
)
{

  assertthat::assert_that(length(rdr) > 0, msg = "[RadAR] Error: rdr object required")
  assertthat::assert_that(is.Surv(surv_obj) > 0, msg = "[RadAR] Error: surv_obj object required")
  assertthat::assert_that(length(signature) > 0 | length(signature_rdr) > 0, msg = "[RadAR] Error: signature object required")
  assertthat::assert_that(which_data %in% c("normal", "scaled", "normalized"), msg = "[RadAR] Error: Invalid data type")

  if (length(signature_rdr) > 0) {
    signature <- rownames(signature_rdr)
  }

  if (!all(signature %in% rownames(rdr))) {
    ix <- which(signature %in% rownames(rdr) == F)
    assertthat::assert_that(1<0, msg = paste0("[RadAR] Error:", toString(signature[ix], " not in rdr")))
  }

  if (which_data == "normal") {
    data <- assays(rdr)$values
  }
  if (which_data == "scaled") {
    data <- assays(rdr)$scaled_values
    assertthat::assert_that(!is.null(data), msg = "[RadAR] Error: Feature values have not been yet scaled")
  }
  if (which_data == "normalized") {
    data <- assays(rdr)$norm_values
    assertthat::assert_that(!is.null(data), msg = "[RadAR] Error: Feature values have not been yet normalized")
  }

  data_sign <- data[signature, ]

  if (length(signature_rdr) > 0) {
    if (length(metadata(signature_rdr)$glmnet_model) == 1) {
      res <- summary(coxph(surv_obj ~ t(data_sign)))
    } else {
      if (metadata(signature_rdr)$glmnet_model_lambda == "min") {
        my_s <- "lambda.min"
      } else {
        my_s <- "lambda.1se"
      }
      cvfit <- metadata(signature_rdr)$glmnet_model
      res <- list()
      res$pred <- predict(cvfit,
                          newx = t(data),
                          s = my_s,
                          type = "response")

      res$concordance <- Cindex(pred = res$pred,
                       y = surv_obj)

    }
  } else {
    res <- summary(coxph(surv_obj ~ t(data_sign)))
  }

  return(res)
}

#' Perform feature selection based on different methods
#'
#' This function implements different methods to perform feature selection of radiomic datasets.
#'
#' @param rdr A RadAR object (class \code{\link{SummarizedExperiment}}).
#' @param n_features (numeric) Number of features to be selected. Required.
#' @param select_by (character) Which criteria use to select informative radiomic features within clusters
#' of similar (i.e., redundant) features. It can be one of the following: "variability", "random", "concordance".
#' @param method (character) Which  method use to identify redundant features. It can be one of the following:
#' "mRMR" (minimum-redundancy-maximum-relevance),"hcl" (hierarchical clustering of correlation matrix), "pca" (K-means applied to Principal Component Analysis),
#' "glmnet-cox" (generalized linear model via penalized maximum likelihood (glmnet) fitting cox regression model),
#' "glmnet-binonial" (glmnet fitting binomial regression model),
#' Using mRMR, this function works as a wrapper to \code{\link{mRMR}} package.
#' Using glmnet-*, this function works as a wrapper to \code{\link{glmnet}} package.
#'
#' @param surv_obj An object of class \code{\link{Surv}}. Required if select_by is "concordance".
#' @param which_data (character) Which data use to compute concordance index. It can be one of the following: "normal",
#' "scaled", "normalized".
#' @param corr_measure (character) Which method use to calculate correlation. It can be one of the following:
#' "pearson", "kendall", "spearman".
#' @param min_features_per_group (numeric) Minimum number of features for each cluster.
#' @param thr_pca_cum_prop (numeric) Threshold to select number of components based on
#' cumulative proportion of explained variance criterion.
#' @param response (numeric) A response variable, required if any of mRMR or glmnet-binomial or
#' methods are used.
#' @param lambda (character) In glmnet, it controls the overall strength of the penalty.
#' Possible values are "min" or "1se" (1 standard deviation). For more details see \code{\link{glmnet}}
#' @param alpha (numeric) In glmnet, it controls elastic-net penalty.
#' Typical values are 0 (ridge) or 1 (lasso). For more details see \code{\link{glmnet}}.
#'
#' @return A list including two elements:
#' `rdr`: the updated (reduced) rdr (a RadAR object)
#' `signature`: the radiomic features included in the signature
#' @author Matteo Benelli (\email{matteo.benelli@uslcentro.toscana.it})
#' @export
#'
#' @examples
do_feature_selection <- function(rdr = NULL,
                                 n_features = NULL,
                                 select_by = NULL,
                                 method =  "hcl",
                                 surv_obj = NULL,
                                 which_data = "scaled",
                                 corr_measure = "pearson",
                                 min_features_per_group = 5,
                                 thr_pca_cum_prop = .8,
                                 response = NULL,
                                 lambda = "min",
                                 alpha = NULL
)
{
  set.seed(1)
  assertthat::assert_that(length(rdr) > 0, msg = "[RadAR] rdr object required")
  assertthat::assert_that(which_data %in% c("normal", "scaled", "normalized"), msg = "[RadAR] Invalid data type")
  assertthat::assert_that(corr_measure %in% c("pearson", "kendall", "spearman"), msg = "[RadAR] Invalid corr_measure setting")
  assertthat::assert_that(length(method) > 0 , msg = "[RadAR] method required")
  assertthat::assert_that(method %in% c("hcl", "pca", "mRMR",
                                        "glmnet-cox", "glmnet-binomial") ,
                          msg = "[RadAR] Invalid method")

  if (method %in% c("hcl", "pca", "mRMR")) {
    assertthat::assert_that(min_features_per_group > 0 & min_features_per_group < (nrow(rdr)-n_features) ,
                            msg = "[RadAR] min_features_per_group should be in the range [1, (total features - n_features)]")
    assertthat::assert_that(!is.null(n_features), msg = "[RadAR] n_features required for methods hcl, pca or mRMR")

  }

  if (method %in% c("mRMR", "glmnet-cox", "glmnet-binomial") == F) {
    assertthat::assert_that(length(select_by) > 0 , msg = "[RadAR] select_by required")
    assertthat::assert_that(select_by %in% c("variability", "random", "concordance"), msg = "[RadAR] Invalid method for feature prioritization")
  }
  if (method == "mRMR") {
    assertthat::assert_that(length(response) > 0, msg = "[RadAR] Error: For mRMR a response variable is required")
    assertthat::assert_that(length(response) == ncol(rdr), msg = "[RadAR] Error: Response variable should have same length of ncol(rdr)")
    assertthat::assert_that(length(unique(response)) == 2  |
                              (length(unique(response)) == 3 & any(is.na(unique(response)))),
                            msg = "[RadAR] Error: Response variable should have two states")
    select_by <- "none"
  }
  if (method %in% c("glmnet-cox", "glmnet-binomial")) {
    if (length(alpha) > 0) {
      assertthat::assert_that(alpha >= 0 & alpha <= 1,
                              msg = "[RadAR] Invalid value for alpha. It should be in the range [0, 1].")
    }
    assertthat::assert_that(lambda %in% c("min", "1se"),
                            msg = "[RadAR] Invalid value for lamba. It should min or 1se.")
    select_by <- "none"
  }
  if (select_by == "interpretability" ) {
    assertthat::assert_that(length(rowData(rdr)$interpretability_score) > 0, msg = "[RadAR] Error: Interpretability of features is not available")
  }
  if (select_by == "concordance") {
    assertthat::assert_that(length(surv_obj) > 0, msg = "[RadAR] Error: For concordance a surv_obj is required")
  }
  if (method == "glmnet-cox") {
    assertthat::assert_that(length(surv_obj) > 0, msg = "[RadAR] Error: For glmnet-cox a surv_obj is required")
  }
  if (method %in% c("glmnet-binomial")) {
    assertthat::assert_that(length(response) > 0, msg = "[RadAR] Error: For glmnet-binomial a response variable is required")
    assertthat::assert_that(length(response) == ncol(rdr), msg = "[RadAR] Error: Response variable should have same length of ncol(rdr)")
    assertthat::assert_that(length(unique(response)) == 2 |
                              (length(unique(response)) == 3 & any(is.na(unique(response)))),
                            msg = "[RadAR] Error: Response variable should have two states")
  }

  if (which_data == "normal") {
    data <- assays(rdr)$values
  }
  if (which_data == "scaled") {
    data <- assays(rdr)$scaled_values
    assertthat::assert_that(!is.null(data), msg = "[RadAR] Error: Feature values have not been yet scaled")
  }
  if (which_data == "normalized") {
    data <- assays(rdr)$norm_values
    assertthat::assert_that(!is.null(data), msg = "[RadAR] Error: Feature values have not been yet normalized")
  }

  if (method %in% c("mRMR", "glmnet-binomial")) {
    if(any(is.na(response))) {
      ixna <- which(is.na(response))
      response <- response[-ixna]
      data <- data[, -ixna]
    }
  }
  if (method == "glmnet-cox" | select_by == "concordance") {
    if(any(is.na(surv_obj[, 1]))) {
      ixna <- which(is.na(surv_obj[, 1]))
      surv_obj <- surv_obj[-ixna, ]
      data <- data[, -ixna]
    }
  }


  if (method == "hcl") {
    dist_row <- as.dist( 1- cor(t(data), use = "na", method = corr_measure))
    hcl <- hclust(dist_row, method = "ward.D")
    clusts <- cutree(hcl, k = n_features)
  }

  if (method == "pca") {
    prc <- prcomp(data)
    summ_prc <- summary(prc)
    n_components <- which(summ_prc$importance[3, ] > thr_pca_cum_prop)[1]-1
    kmeans_pca <- kmeans(prc$x[, 1:n_components], centers = n_features)
    clusts <- kmeans_pca$cluster
  }

  if (method %in% c("pca", "hcl")) {
    assertthat::assert_that(all(table(clusts) > min_features_per_group),
                            msg = "[RadAR] Error: one or more groups with too few features. Decrease min_features_per_group or n_features")

    if (select_by == "variability") {
      q1 <- apply(data, 1, quantile, .25, na.rm = T)
      q2 <- apply(data, 1, quantile, .5, na.rm = T)
      q3 <- apply(data, 1, quantile, .75, na.rm = T)
      variability <- abs((q3-q1)/q2)
      sel_features <- rep("", n_features)
      for (i in 1: n_features) {
        sel_features[i] <- names(which.max(variability[which(clusts == i)]))
      }
    }

    if (select_by == "random") {
      sel_features <- rep("", n_features)
      for (i in 1: n_features) {
        sel_features[i] <- names(sample(which(clusts == i), size = 1))
      }
    }

    if (select_by == "interpretability") {
      sel_features <- rep("", n_features)
      for (i in 1: n_features) {
        sel_features[i] <- names(which.max(rowData(rdr)$interpretability_score[which(clusts == i)]))
      }
    }
    if (select_by == "concordance") {
      concordance_index <- rep(NA, nrow(rdr))
      for (i in 1: nrow(rdr)) {
        concordance_index[i] <- concordance(surv_obj ~ data[i, ])$concordance
      }
      names(concordance_index) <- rownames(rdr)
      sel_features <- rep("", n_features)
      for (i in 1: n_features) {
        sel_features[i] <- names(which.max(abs(concordance_index[which(clusts == i)]-0.5) ))
      }
    }
  }
  if (method == "mRMR") {
    mrmr_in_data <- t(rbind(data, response))
    mrmr_data <- mRMRe::mRMR.data(data = data.frame(mrmr_in_data))
    mrmr_res <- mRMRe::mRMR.classic("mRMRe.Filter", data = mrmr_data,
                                    target_indices = ncol(mrmr_in_data),
                                    feature_count = n_features)
    ix_features <- as.numeric(unlist(solutions(mrmr_res)))
    sel_features <- rownames(rdr)[ix_features]
  }

  if (method == "glmnet-cox") {
    if (lambda == "min") {
      my_s <- "lambda.min"
    } else {
      my_s <- "lambda.1se"
    }
    if (length(alpha) > 0) {
      cvfit <- cv.glmnet(x = t(data),
                         y = surv_obj, family = "cox",
                         alpha = alpha)
    } else {
      cvfit <- cv.glmnet(x = t(data),
                         y = surv_obj,
                         family = "cox")
    }
    coef.min <- coef(cvfit, s = my_s)
    cf <- as.matrix(coef(cvfit, s = my_s))
    res <- cf[which(cf!=0), ]
    if (any(names(res) == "(Intercept)")) {
      res <- res[-which(names(res) == "(Intercept)")]
    }
    if (length(res) > 0) {
      message(paste("[RadAR]", "n=", length(res), "features selected with glmnet"))
    } else {
      assertthat::assert_that(1 < 0,
                              msg = "[RadAR] Error: No features selected with glmnet. Try changing lambda and/or alpha parameters.")
    }
    ix_features <- which(rownames(rdr) %in% names(res))
    sel_features <- names(res)
  }

  if (method == "glmnet-binomial") {
    if (lambda == "min") {
      my_s <- "lambda.min"
    } else {
      my_s <- "lambda.1se"
    }
    if (length(alpha) > 0) {
      cvfit <- cv.glmnet(x = t(data),
                         y = response,
                         family = "binomial",
                         alpha = alpha)
    } else {
      cvfit <- cv.glmnet(x = t(data),
                         y = response,
                         family = "binomial")
    }
    coef.min <- coef(cvfit, s = my_s)
    cf <- as.matrix(coef(cvfit, s = my_s))
    res <- cf[which(cf!=0), ]
    if (any(names(res) == "(Intercept)")) {
      res <- res[-which(names(res) == "(Intercept)")]
    }
    if (length(res) > 0) {
      message(paste("[RadAR]", "n=", length(res), "features selected with glmnet"))
    } else {
      assertthat::assert_that(1 < 0,
                              msg = "[RadAR] Error: No features selected with glmnet. Try changing lambda and/or alpha parameters.")
    }
    ix_features <- which(rownames(rdr) %in% names(res))
    sel_features <- names(res)
  }


  message(paste("[RadAR]", "Selected features are:", toString(sel_features)))
  rdr <- rdr[sel_features, ]
  metadata(rdr)$feature_selection_method <- method
  metadata(rdr)$feature_selection_selectby <- select_by
  metadata(rdr)$feature_selection_which_data <- which_data
  if (method %in% c("glmnet-cox", "glmnet-binomial")) {
    metadata(rdr)$glmnet_model <- cvfit
    metadata(rdr)$glmnet_model_lambda <- lambda
  } else {
    metadata(rdr)$glmnet_model <- F
    metadata(rdr)$glmnet_model_lambda <- NULL
  }
  out <- list()
  out$rdr <- rdr
  out$signature <- rownames(rdr)
  return(out)

}
cgplab/RadAR documentation built on Nov. 10, 2021, 1:32 a.m.