R/gene_selection.R

Defines functions gene_selection_surv cox_all_genes

Documented in cox_all_genes gene_selection_surv

#' @title Survival analysis based on gene expression levels.
#' @description It carries out univariate cox proportional hazard models for
#' the expression levels of each gene included in the provided dataset
#' (case_full_data) and their link with relapse-free or overall survival.
#' @param case_full_data Input matrix whose columns correspond to the patients
#' and rows to the genes, having selected only the columns belonging to disease
#' samples. The names of the rows must be the names of the genes.
#' @param survival_time Numeric vector that includes time to the event
#' information
#' @param survival_event Numeric vector that indicates if relapse or death
#' have been produced (0 and 1s).
#' @return A matrix with the results of the application of proportional
#' hazard models using the expression levels of each gene as covariate.
#' The \code{coef} column corresponds to the regression coefficient; the
#' \code{exp_coef} column corresponds to the value of e^coef  (which is
#' interpreted as the odds ratio); the \code{se_coef} column corresponds
#' to the standard error of each coefficient; the \code{Z} column corresponds
#' to the value of coef/se_coef (the higher the Z value, the higher the
#' significance of the variable) and the \code{Pr_z} column corresponds to
#' the p-value for each Z value.
#' @import survival
cox_all_genes <- function(case_full_data, survival_time, survival_event){
  message("Calculating the matrix of Zcox")
  pb <- utils::txtProgressBar(min = 0, max = nrow(case_full_data), style = 3)

  list_out <- list()
  for(i in 1:nrow(case_full_data)){
    utils::setTxtProgressBar(pb, i)

    temp <- summary(survival::coxph(survival::Surv(survival_time,survival_event)
                                    ~case_full_data[i,]))$coefficients[1,]
    list_out[[i]] <- temp
  }
  cox_all_matrix <- data.frame(do.call("rbind",list_out))
  colnames(cox_all_matrix) <-  c("coef","exp_coef","se_coef","z","Pr_z")
  rownames(cox_all_matrix) <- rownames(case_full_data)
  cox_all_matrix <- as.matrix(cox_all_matrix)
  return(cox_all_matrix)
}


#' @title Gene selection based on variability and the relationship
#' to survival.
#' @description
#' It selects genes for mapper based on the product of standard deviation
#' of the rows (genes) in the disease component matrix
#' plus one times the Z score obtained by fitting a cox proportional
#' hazard model to the level of each gene. For further information see
#' "Topology based data analysis identifies a subgroup of breast cancers
#' with a unique mutational profile and excellent survival"
#' @param case_disease_component Disease component matrix (output of the function
#' \code{generate_disease_component}) having selected only the columns
#' belonging to disease samples. The names of the rows must be the names of the genes.
#' @param cox_all_matrix Output from the \code{cox_all_genes} function. Data.frame with
#' information on the relationship between genes and survival.
#' @param gen_select_type Option. Select the "Abs" option, which means that the
#' genes with the highest absolute value are chosen, or the
#' "Top_Bot" option, which means that half of the selected
#' genes are those with the highest value (positive value, i.e.
#' worst survival prognosis) and the other half are those with the
#' lowest value (negative value, i.e. best prognosis).
#' @param num_gen_select Number of genes to be selected (those with the highest
#' product value).
#' @return Character vector with the names of the selected genes.
gene_selection_surv <- function(case_disease_component, cox_all_matrix, gen_select_type, num_gen_select){
  # Same operation to both methods
  probes_test <- apply(case_disease_component, 1,stats::sd)+1

  cox_vector <- cox_all_matrix[,"z"]
  cox_vector[cox_vector < 0] <- ifelse(!is.na(cox_vector[cox_vector < 0]),
                                       cox_vector[cox_vector < 0] - 1,
                                       cox_vector[cox_vector < 0])
  cox_vector[cox_vector >= 0] <- ifelse(!is.na(cox_vector[cox_vector >= 0]),
                                        cox_vector[cox_vector >= 0] + 1,
                                        cox_vector[cox_vector >= 0])

  if(gen_select_type == "abs"){
    probes_test <- probes_test * abs(cox_vector)
    genes_selected <- names(probes_test[order(probes_test,decreasing = T)])[1:num_gen_select]
  }else{
    probes_test <- probes_test * cox_vector
    if(num_gen_select %% 2 == 0){ num_gen_select <- num_gen_select/2}
    else{ num_gen_select <- (num_gen_select + 1)/2}

    genes_selected <- names(c(probes_test[order(probes_test,decreasing = T)][1:num_gen_select],
                              probes_test[order(probes_test,decreasing = F)][1:num_gen_select]))
  }
  return(genes_selected)
}

Try the GSSTDA package in your browser

Any scripts or data that you put into this service are public.

GSSTDA documentation built on June 22, 2024, 10:44 a.m.