R/check_arguments.R
In GSSTDA: Progression Analysis of Disease with Survival using Topological Data Analysis

Documented in check_arg_mapper check_filter_values check_full_data check_gene_selection check_vectors

#' @title check_full_data
#'
#' @description Checking the full_data introduces in the package
#' @param full_data Matrix with the columns of the input matrix
#' corresponding to the individuals belonging to the level.
#' @param na.rm \code{logical}. If \code{TRUE}, \code{NA} rows are omitted.
#' If \code{FALSE}, an error occurs in case of \code{NA} rows.
#' @return Return \code{full_data} without NAN's and as a matrix
check_full_data <- function(full_data, na.rm = TRUE){
  # If this function has been executed don't do nothing
  if(na.rm == "checked"){
    return(full_data)
  }

  #Read the data set
  yes_no <- readline(prompt="Are the columns of the data set the patient and the rows the genes?: yes/no ")

  if(yes_no == "no" | yes_no == "n"){
    #Transpose the data set. Columns = patient and rows = genes
    full_data <- t(full_data)
  }
  #Convert full_data to matrix type
  full_data <- as.matrix(full_data)

  #Omit NAN's values
  if (na.rm == TRUE){
    nrow_ini = nrow(full_data)
    # Remove rows (genes) with NA's values
    full_data <- full_data[rowSums(is.na(full_data))==0,]

    message(paste(nrow(full_data) - nrow_ini, " missing values and NaN's are omitted in the genes (rows)"))
  }
  return(full_data)
}


#' @title check_vectors
#' @description Checking the \code{survival_time}, \code{survival_event} and \code{case_tag} introduces in the \code{GSSTDA} object.
#'
#' @param full_data The genes of the full_data (maybe remove by na.rm = TRUE)
#' @param survival_time Time between disease diagnosis and event (if there was
#' no event until the end of follow-up).
#' @param survival_event \code{logical}. Whether or not the event has occurred.
#' @param case_tag The tag of the healthy sample (healthy or not).
#' @param control_tag Tag of the healthy sample.E.g. "T"
#' @param na.rm \code{logical}. If \code{TRUE}, \code{NA} rows are omitted.
#' If \code{FALSE}, an error occurs in case of \code{NA} rows.
#' @return control_tag Return the tag of the healthy sample.
check_vectors <- function(full_data, survival_time, survival_event, case_tag, control_tag, na.rm = TRUE){
  ncol_full_data <- ncol(full_data)
  # Check if the arguments are vectors; a valid type of data; and the vectors are the same dimension as a full_data
  if(!is.vector(survival_time) | !is.numeric(survival_time) | length(survival_time) != ncol_full_data){
    stop("survival_time must be a valid values vector and its length must be the same as the number of patients (columns) of the full_data.")
  }

  # Omit NAN's values in checking
  if(!is.vector(survival_event) | !(length(unique(stats::na.omit(survival_event))) == 2 & is.numeric(stats::na.omit(survival_event))) | length(survival_event) != ncol_full_data){
    stop("survival_event must be a valid values vector. Only two type of event (0 or 1). Also, its length must be the same as the number of patients (columns) of the full_data.")
  }

  #If exits NAN's values remove it and check if it contain only two cases and it has the same dimension (columns) as full_data
  if(!is.vector(case_tag)){
    stop("case_tag must be a valid values vector.")
  }
  if( any(is.na(case_tag))){
    without_nan_patient <- which(!is.na(case_tag))
    case_tag <- case_tag[without_nan_patient]
    full_data <- full_data[,without_nan_patient]
    survival_event <- survival_event[without_nan_patient]
    survival_time <- survival_time[without_nan_patient]
    ncol_full_data <- ncol(full_data)
    message("NAN's values in patient was removed in case_tag, full_data, survival_time and survival_event")
  }
  if(length(unique(case_tag)) != 2){
    stop("case_tag must has only two type of tags.")
  }
  if(length(case_tag) != ncol_full_data){
    stop("The length of case_tag must be the same as the number of patients (columns) of the full_data.")
  }

  control_tag_opt <- unique(case_tag)
  if(is.na(control_tag)){
    control_tag <- readline(prompt=paste("What is the tag of the healthy patient (value in the case_tag)? (", control_tag_opt[1], " or ", control_tag_opt[2], "): " , sep="") )
  }

  if(!(control_tag %in% control_tag_opt)){
    print(paste("The case tag is '", control_tag_opt[1], "' by default"))
    control_tag <- control_tag_opt[1]
  }
  return(list(control_tag, full_data, survival_event, survival_time, case_tag))
}

#' @title check_filter_values
#'
#' @description Checking the filter_values introduces in the \code{mapper} object.
#'
#' @param full_data Matrix with the columns of the input matrix
#' corresponding to the individuals belonging to the level. This matrix could be the
#' genes_disease_component.
#' @param filter_values Vector obtained after applying the filtering function
#' to the input matrix, i.e, a vector with the filtering function
#' values for each included sample.
#' @param na.rm \code{logical}. If \code{TRUE}, \code{NA} rows are omitted.
#' If \code{FALSE}, an error occurs in case of \code{NA} rows.
#'
#' @return \code{filter_value} and \code{full_data} without NAN's
check_filter_values <- function(full_data, filter_values, na.rm = TRUE){
  # Check if filter_values is a vector
  if(!is.vector(filter_values)){
    stop("filter_values must be a valid values vector")
  }

  #Check if the names of the filter_values are the same as the cols of full_data.
  if(!setequal(names(filter_values), colnames(full_data))){
    stop("The name of the filter_values must be the same as the patient name of the full_data (or genes_disease_component).")
  }

  #Omit NAN's values
  if (na.rm == TRUE){
    # Remove rows (subjects) and their filter values with NA's values
    filter_values <- filter_values[colnames(full_data)]
    # Remove filter values and respective rows with NA's values
    filter_values <- stats::na.omit(filter_values)
    full_data <- full_data[,names(filter_values)]
  }
  return(list(full_data, filter_values))
}

#' @title check_gene_selection
#'
#' @description Checking the arguments introduces in the gene selection process.
#'
#' @param num_genes Number of genes in the full_data
#' @param gen_select_type Type of gene selection to be used. Choose between "top_bot" (top-botton)
#' and "abs" (absolute)
#' @param percent_gen_select Percentage of genes to be selected
#'
#' @return num_gen_select Number of genes to be selected according to the percent_gen_select value
check_gene_selection <- function(num_genes, gen_select_type, percent_gen_select){
  #Convert text to lowercase
  gen_select_type <- tolower(gen_select_type)
  #Check gen_select_type
  gen <- c("top_bot","abs")
  if(!gen_select_type %in% gen){
    stop(paste("Invalid gene selection type selected. Choose one of the folowing: ", paste(gen, collapse = ", ")))
  }

  #Number of genes to be selected in gene_selection_surv function
  num_gen_select <- trunc((percent_gen_select/100) * num_genes)

  return(num_gen_select)
}

#' @title check_arg_mapper
#'
#' @description Checking the arguments introduces in the \code{mapper} object.
#'
#' @param full_data Matrix with the columns of the input matrix
#' corresponding to the individuals belonging to the level.
#' @param filter_values Vector obtained after applying the filtering function
#' to the input matrix, i.e, a vector with the filtering function
#' values for each included sample.
#' @param distance_type Type of distance to be used for clustering.
#' Choose between correlation ("correlation") and euclidean ("euclidean"). "correlation"
#' default option.
#' @param clustering_type Type of clustering method. Choose between
#' "hierarchical" and "PAM" (“partition around medoids”) options.
#' "hierarchical" default option.
#' @param linkage_type Linkage criteria used in hierarchical clustering.
#' Choose between "single" for single-linkage clustering, "complete" for
#' complete-linkage clustering or "average" for average linkage clustering
#' (or UPGMA). Only necessary for hierarchical clustering.
#' "single" default option.
#' @param optimal_clustering_mode Method for selection optimal number of
#' clusters. It is only necessary if the chosen type of algorithm is
#' hierarchical. In this case, choose between "standard" (the method used
#' in the original mapper article) or "silhouette". In the case of the PAM
#' algorithm, the method will always be "silhouette".
#' @param silhouette_threshold Minimum value of \eqn{\overline{s}}{s-bar} that a set of
#' clusters must have to be chosen as optimal. Within each interval of the
#' filter function, the average silhouette values \eqn{\overline{s}}{s-bar} are computed
#' for all possible partitions from $2$ to $n-1$, where $n$ is the number of
#' samples within a specific interval. The $n$ that produces the highest value
#' of \eqn{\overline{s}}{s-bar} and that exceeds a specific threshold is selected as the
#' optimum number of clusters. If no partition produces an \eqn{\overline{s}}{s-bar}
#' exceeding the chosen threshold, all samples are then assigned to a unique
#' cluster. The default value is $0.25$. The threshold of $0.25$ for
#' \eqn{\overline{s}}{s-bar} has been chosen based on standard practice, recognizing it
#' as a moderate value that reflects adequate separation and cohesion within
#' clusters.
#' @param na.rm \code{logical}. If \code{TRUE}, \code{NA} rows are omitted.
#' If \code{FALSE}, an error occurs in case of \code{NA} rows.
#'
#' @return \code{optimal_clustering_mode}
check_arg_mapper <- function(full_data, filter_values, distance_type, clustering_type, linkage_type,
                             optimal_clustering_mode = NA, silhouette_threshold = 0.25, na.rm = TRUE){
  #Check distance_type
  distances <- c("correlation","euclidean")
  if(!distance_type %in% distances){
    stop(paste("Invalid distance selected. Choose one of the folowing: ", paste(distances, collapse = ", ")))
  }

  #Check clustering_type
  clust_types <- c("hierarchical","PAM")
  if(!clustering_type %in% clust_types){
    stop(paste("Invalid clustering method selected. Choose one of the folowing: ", paste(clust_types,collapse = ", ")))
  }

  if(is.na(optimal_clustering_mode)){
    optimal_clustering_mode <- "silhouette"

    if(clustering_type == "hierarchical"){
      option <- readline(prompt="Choose one of the following optimal cluster number method: standard/silhouette: ")

      if(option != "standard"){
        optimal_clustering_mode <- "silhouette"
      }
      else{
        optimal_clustering_mode <- "standard"
      }
    }
  }else{
    #Check optimal_clustering_mode
    optimal_clustering <- c("silhouette","standard")
    if(!optimal_clustering_mode %in% optimal_clustering){
      stop(paste("Invalid optimal_clustering selected. Choose one of the folowing: ", paste(optimal_clustering, collapse = ", ")))
    }
  }
  message("The optimal clustering mode is '", optimal_clustering_mode, " '")


  # Check
  if (optimal_clustering_mode == "silhouette"){
    if(silhouette_threshold != 0.25){
      if(silhouette_threshold<0 || silhouette_threshold>1)
        stop(paste("Invalid silhouette_threshold value. Choose one between 0 and 1"))
    }
  }

  #Check linkage_type
  link_types <- c("single","average","complete")
  if(!linkage_type %in% link_types){
    stop(paste("Invalid linkage method selected. Choose one of the folowing: ", paste(link_types,collapse = ", ")))
  }

  # Check if filter_values == [] the filter_values is not calculated yet. So, we checked only the others args
  if(length(filter_values) != 0 & na.rm != "checked"){
    full_data_and_filter_values <- check_filter_values(full_data, filter_values)
    full_data <- full_data_and_filter_values[[1]]
    filter_values <- full_data_and_filter_values[[2]]
  }

  return(list(full_data, filter_values, optimal_clustering_mode))
}

Any scripts or data that you put into this service are public.

GSSTDA documentation built on June 22, 2024, 10:44 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

GSSTDA
Progression Analysis of Disease with Survival using Topological Data Analysis

R/check_arguments.R
In GSSTDA: Progression Analysis of Disease with Survival using Topological Data Analysis

Defines functions check_arg_mapper check_gene_selection check_filter_values check_vectors check_full_data

Documented in check_arg_mapper check_filter_values check_full_data check_gene_selection check_vectors

Try the GSSTDA package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

GSSTDA Progression Analysis of Disease with Survival using Topological Data Analysis

R/check_arguments.R In GSSTDA: Progression Analysis of Disease with Survival using Topological Data Analysis

Defines functions check_arg_mapper check_gene_selection check_filter_values check_vectors check_full_data

Documented in check_arg_mapper check_filter_values check_full_data check_gene_selection check_vectors

Try the GSSTDA package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

GSSTDA
Progression Analysis of Disease with Survival using Topological Data Analysis

R/check_arguments.R
In GSSTDA: Progression Analysis of Disease with Survival using Topological Data Analysis