R/internal.R

Defines functions .group_replicates_by_sample .dsr_to_tibble .filter_gff .categorize_diff_genes .piarwise_upset .fix_hm_colors .validate_parcutils_obj .get_all_named_expression_matrix .filter_df_by_genes

Documented in .categorize_diff_genes .dsr_to_tibble .filter_df_by_genes .filter_gff .fix_hm_colors .get_all_named_expression_matrix .group_replicates_by_sample .piarwise_upset .validate_parcutils_obj

#' Filter expression matrix (dataframe) by genes (first column).
#'
#' @param df a gene expression data frame
#' @param genes a character vector of genes to be filtered from df.
#'
#' @return a filtered dataframe.
#' @export
#' @keywords internal
#' @examples
#' \dontrun{
#'
#' # TO DO
#' }
#'
.filter_df_by_genes <- function(df, genes){

  column_gene_id <- df %>% colnames() %>%.[1]

  ## check which genes are not present in the df
  df_genes <- df %>%
    dplyr::pull(!!rlang::sym(column_gene_id))

  genes_not_present <- dplyr::setdiff(genes, df_genes)
  genes_present <- genes[genes %in% df_genes]

  if(length(genes_not_present) == length(genes) ){

    stop("No genes found.")

  } else if(length(genes_present) < length(genes)){

    cli::cli_alert_warning(text = paste0("Genes below are not present into x\n" ,
                                         paste0(genes_not_present , collapse = ",")))

  }

  # filter available genes
  df <- df %>%
    dplyr::slice(match(genes_present, !!rlang::sym(column_gene_id)))

}


#' Get named list of gene expression matrix for all samples in \code{x}.
#'
#' @param x an object of class parcutils.
#'
#' @return named list of gene expression matrix.
#' @export
#' @keywords internal
.get_all_named_expression_matrix <- function(x){

  .validate_parcutils_obj(x)

  all_sample_names <- x$norm_counts %>%
    purrr::flatten() %>% names()

  unique_index <- purrr::map_dbl(all_sample_names %>% unique() , ~ which(..1 == all_sample_names)[1])

  ret <- x$norm_counts %>% purrr::flatten() %>% magrittr::extract(unique_index)
  return(ret)

}


#' Validate if an object is of class 'parcutils'.
#'
#' @param x an object
#'
#' @return error
#' @export
#' @keywords internal
.validate_parcutils_obj <- function(x){
  stopifnot("x must be an object of class 'parcutils'. Usually x is derived by parcutils::run_deseq_analysis()." = is(x, "parcutils"))
}


#' Given a matrix generate a color scale for a heatmap.
#'
#' @param hm_matrix
#'
#' @return color for [ComplexHeatmap::Heatmap()].
#' @export
#' @keywords internal
.fix_hm_colors <- function(hm_matrix){

  stopifnot(is.data.frame(hm_matrix))

  col_choices <- c("min" = MetBrewer::met.brewer(name = "Austria")[2],
                   "max" = MetBrewer::met.brewer(name = "Austria")[1])

  max_val <-  ceiling(hm_matrix %>% dplyr::select_if(is.numeric) %>% max())
  min_val <-  floor(hm_matrix %>% dplyr::select_if(is.numeric) %>% min() )
  mid_val <- (max_val + min_val) / 2

  col = circlize::colorRamp2(c(min_val , mid_val, max_val), c(col_choices["min"], "white", col_choices["max"]))


}

#' Generate upset plots for DEG between a comparison.
#' @description This function called from [parcutils::plot_deg_upsets()].
#' @param x an abject of class "parcutils". This is an output of the function [parcutils::run_deseq_analysis()].
#' @param sample_comparison a character vector of length 2 denoting  sample comparisons between upset plot to be generated.
#' @param color_up a character string denoting a valid color code for bars in upset plot for up regulated genes.
#' @param color_down a character string denoting a valid color code for bars in upset plot for down regulated genes.
#' @param ... other arguments to be passed to [UpSetR::upset()].
#'
#' @return an object of named list where each element is a list of two - 1)  an upset plots  [UpSetR::upset()] and their intersections in form of tibble.
#' @export
#' @keywords internal
#' @examples
#' \dontrun{
#' }
.piarwise_upset <- function(x, sample_comparison , color_up = "#b30000", color_down = "#006d2c",... ){

  # validate sample_comparison

  stopifnot("sample_comparison must be a character vector of length 2." = is.character(sample_comparison) & length(sample_comparison) == 2)

  yy <- purrr::map(sample_comparison %>%
                     purrr::set_names(sample_comparison), ~parcutils::get_genes_by_regulation(x = x, sample_comparison = ..1,regulation = "both") ) %>%
    purrr::map(~split(names(..1), ..1))

  input_list_for_upset <- purrr::map(names(yy), ~ purrr::set_names(yy[[..1]], glue::glue("{..1}_{yy[[..1]] %>% names %>% tolower}"))) %>% purrr::flatten()


  # order list by up and down genes

  lst_order <- list(sample_comparison , c("up" ,"down")) %>%
    purrr::cross() %>%
    purrr::map_chr( paste0 , collapse ="_")

  input_list_for_upset <- input_list_for_upset[lst_order]

  # up set colors
  upset_colors <- c("up" = color_up, #MetBrewer::met.brewer(name = "Austria")[1],
                    "down" = color_down) #MetBrewer::met.brewer(name = "Austria")[3])



  # plot upset
  us_plot <- UpSetR::upset(UpSetR::fromList(input_list_for_upset) ,
                           order.by =  c("degree"),
                           #group.by = "sets",
                           nsets = length(input_list_for_upset),
                           nintersects = NA,
                           sets = names(input_list_for_upset) %>% rev(),
                           keep.order = TRUE,
                           text.scale = 1.5,
                           mb.ratio = c(0.6, 0.4),
                           sets.x.label = "No. of genes",
                           sets.bar.color = rep(upset_colors %>% rev(),each = 2),...)

  # get upset data.
  upset_intersects  <- parcutils::get_upset_intersects(input_list_for_upset, upset_plot = us_plot)

  ret <- list(upset_plot = us_plot,upset_intersects = upset_intersects)


  return(ret)


}



#' @title  Categorize diff genes in to `up` and `down` based on log2FC, p-value and p-adj values.
#' @description Usually fold change 1.5 (log2FC ~0.6) and statistical significance defined by p and/or p.adj values
#' are considered to mark gene as differential expressed. However, these cutoffs change based on data quality.
#' This function allows changing these cutoffs and mark genes differently expressed.
#' While using this package, most of the time user do not need to call this function explicitly as it is internally called by
#' [parcutils::run_deseq_analysis()].
#'
#' @param dsr_tibble  a dataframe obtained from DESeqResult object or an object of DESeqResult.
#' @param log2fc_cutoff,pval_cutoff,padj_cutoff a numeric value, default `(log2fc_cutoff = 1, pval_cutoff =  0.05, padj_cutoff = 0.01)`
#' denoting cutoffs. These criteria will be used to decide significance  `(NS, p-value, log2FC, p-value&log2FC)` and
#' type of regulation `(Up, Down & other)` of diff genes.
#' See details.
#' @param add_column_regul logical, default `TRUE`, indicating whether to add column  `add_column_regul` or not. Added column will contain the values
#' Up', 'Down' or 'other'.
#' @param regul_based_upon one of the numeric choices  1, 2, or 3.
#' ## if 1 ...
#'  - Up : log2fc >= log2fc_cutoff & pvalue <= pvalue_cutoff
#'  - Down : log2fc  <= (-1) * log2fc_cutoff & pvalue <= pvalue_cutoff
#'  - Other : remaining genes
#' ## if 2 ...
#'  - Up : log2fc >= log2fc_cutoff & padj <= padj_cutoff
#'  - Down : log2fc  <= (-1) * log2fc_cutoff & padj <= padj_cutoff
#'  - Other : remaining genes
#' ## if 3 ...
#'  - Up : log2fc >= log2fc_cutoff & pvalue <= pvalue_cutoff & padj <= padj_cutoff
#'  - Down : log2fc  <= (-1) * log2fc_cutoff pvalue <= pvalue_cutoff & padj <= padj_cutoff
#'  - Other : remaining genes
#' @return a data frame
#' @details // TO DO. --> explain columns 'signif' and 'type' in the returned data frame.
#' @keywords internal
.categorize_diff_genes <-  function(dsr_tibble,
                                   log2fc_cutoff =  1,
                                   pval_cutoff = 0.05,
                                   padj_cutoff = 0.01,
                                   add_column_regul = TRUE,
                                   regul_based_upon = 1
){

  # convert DESeqResult object to tibble

  if(any(is(dsr_tibble) %in% "DESeqResults")) {
    dsr_tibble %<>% .dsr_to_tibble()
  }

  # add column 'type' having one of the four values

  # 1. NS -> Not significant
  # 2. p-value -> only pvalue significant but gene is not diff expressed
  # 3. log2FC -> only diff expressed but not pvalue significant
  # 4. p-value&log2FC -> diff expressed and pvalue signficant

  pval_cutoff <- pval_cutoff
  log2fc_cutoff <- log2fc_cutoff

  dsr_tibble %<>% dplyr::mutate(signif = dplyr::case_when(dplyr::between(log2FoldChange , -log2fc_cutoff, log2fc_cutoff ) &  pvalue >= pval_cutoff ~ "NS",
                                                          dplyr::between(log2FoldChange , -log2fc_cutoff, log2fc_cutoff  ) &  pvalue <= pval_cutoff ~ "p-value",
                                                          (log2FoldChange <= -log2fc_cutoff | log2FoldChange >= log2fc_cutoff) & pvalue >= pval_cutoff ~ "log2FC",
                                                          (log2FoldChange <= -log2fc_cutoff | log2FoldChange >= log2fc_cutoff) & pvalue <= log2fc_cutoff ~ "p-value&log2FC",
                                                          TRUE ~ "other"))

  # add column 'regul' having one of the three values

  if(add_column_regul){

    # categorization into 'Up', 'Down' and 'Other' is done based on value of user selected value of regul_based_upon

    # if regul_based_upon == 1, filtering is done based on pvalue cutoff
    #   Up -> log2fc >= log2fc_cutoff & pvalue <= pvalue_cutoff
    #   Down -> log2fc  <= (-1) * log2fc_cutoff & pvalue <= pvalue_cutoff
    #   Other -> remaining genes

    if(regul_based_upon == 1){
      dsr_tibble %<>%
        dplyr::mutate(regul = dplyr::case_when(log2FoldChange >= log2fc_cutoff & pvalue <= pval_cutoff ~ "Up",
                                               log2FoldChange <= -1 * log2fc_cutoff & pvalue <= pval_cutoff ~ "Down",
                                               TRUE ~ "other"))

    }
    # if regul_based_upon == 2

    #   Up -> log2fc >= log2fc_cutoff & padj <= padj_cutoff
    #   Down -> log2fc  <= (-1) * log2fc_cutoff & padj <= padj_cutoff
    #   Other -> remaining genes

    else if(regul_based_upon == 2){
      dsr_tibble %<>%
        dplyr::mutate(regul = dplyr::case_when(log2FoldChange >= log2fc_cutoff & padj <= padj_cutoff ~ "Up",
                                               log2FoldChange <= -1 * log2fc_cutoff & padj <= padj_cutoff ~ "Down",
                                               TRUE ~ "other"))

    }

    # if regul_based_upon == 3

    #   Up -> log2fc >= log2fc_cutoff & pvalue <= pvalue_cutoff & padj <= padj_cutoff
    #   Down -> log2fc  <= (-1) * log2fc_cutoff pvalue <= pvalue_cutoff & padj <= padj_cutoff
    #   Other -> remaining genes

    else if(regul_based_upon == 3){
      dsr_tibble %<>%
        dplyr::mutate(regul = dplyr::case_when(log2FoldChange >= log2fc_cutoff &
                                                 pvalue <= pval_cutoff &
                                                 padj <= padj_cutoff   ~ "Up",
                                               log2FoldChange <= -1 * log2fc_cutoff &
                                                 pvalue <= pval_cutoff &
                                                 padj <= padj_cutoff ~ "Down",
                                               TRUE ~ "other"))

    } else {
      stop ("value of 'regul_based_upon' must be one of the 1,2, or 3.")
    }
  }

}



#' Filter rows from a GFF file.
#'
#' @param gtf_file  // TO DO
#' @param feature_type // TO DO
#' @param feature_biotype // TO DO
#' @param var_gene_id // TO DO
#' @param var_feature_type // TO DO
#' @param var_feature_biotype // TO DO
#' @param var_gene_name // TO DO, to be implement yet.
#'
#' @return // TO DO
#' @keywords internal
.filter_gff <- function(gtf_file ,

                       feature_type = "gene" ,
                       feature_biotype = "protein_coding",

                       var_gene_name = "gene_name",
                       var_gene_id = "gene_id",
                       var_feature_type = "type" ,
                       var_feature_biotype = "gene_biotype"){

  # gtf_file = gff
  # feature_type = feature_type
  # feature_bio_type = feature_bio_type

  column_feature_type_quo = rlang::enquo(column_feature_type)
  column_feature_biotype_quo = rlang::enquo(column_feature_biotype)
  column_gene_id_quo  = rlang::enquo(column_gene_id)

  cli::cli_alert_info(text = "Reading gft/gff file ...")
  gtf_annot <- rtracklayer::import(gtf_file) %>%
    as.data.frame() %>%
    tibble::as_tibble()
  cli::cli_alert_success(text = "Done.")

  # get genes
  gtf_annot_subset <- gtf_annot %>%
    dplyr::filter(type == feature_type) %>%
    dplyr::filter(gene_biotype == feature_bio_type) %>%
    dplyr::select(!!column_gene_id_quo, !!column_feature_type_quo, !!column_feature_biotype_quo)

  return(gtf_annot_subset)
}


#' Convert DESeq result object in to a tibble.
#'
#' @param x DEseq2 result object. Can be generated by [DESeq2::DESeqResults()].
#'
#' @return a data frame.
#' @export
#' @keywords internal
#'
.dsr_to_tibble <- function(x, .col_gene_id = "gene_id"){

  x %>% as.data.frame() %>%
    tibble::rownames_to_column(.col_gene_id) %>%
    tibble::as_tibble() #%>%
  #dplyr::mutate_if(is.numeric, ~ round(..1, 3)) ## round by 3 digits
}



#' Group replicates by samples.
#'
#' @param x an abject of class "parcutils". This is an output of the function [parcutils::run_deseq_analysis()].
#'
#' @return a tbl.
#' @export
#' @keywords internal
.group_replicates_by_sample <- function(x){

  .validate_parcutils_obj(x)

  sample_info <- purrr::map(.get_all_named_expression_matrix(x), ~ ..1 %>% colnames() %>% .[-1]) %>%
    tibble::tibble(groups = names(.) , samples = .) %>%
    tidyr::unnest(cols = "samples")

  return(sample_info)
}



#' Get list of all expressed genes from \code{x}
#'
#' @param x an object of class 'parcutils'
#'
#' @return a character vector.
#' @export
#' @keywords internal
.get_all_expressed_genes <- function(x){
  .validate_parcutils_obj(x)

  all_genes <- parcutils::get_genes_by_regulation(x = x,
                                                  regulation = "all",
                                                  sample_comparisons = x$de_comparisons[1])

  return(all_genes)
}



#' Remove items from the list if no enriched terms found.
#'
#' @param x a list containing output of GO enrichment. Each element of the list is an output of  [clusterProfiler::enrichGO()].
#'
#' @return a list of containing output of GO enrichment.
#' @export
#' @keywords internal
.keep_only_enriched_go <- function(x){

  safely_enrich <- purrr::safely(enrichplot::pairwise_termsim , otherwise = NULL)

  ## remove those which has no enriched term

  go_enrichment_result_only_enriched <- x %>%
    purrr::map( ~ safely_enrich(..1)) %>%
    purrr::transpose() %>%
    purrr::simplify_all()

  go_enrichment_result_only_enriched <-
    go_enrichment_result_only_enriched$result %>%
    purrr::discard(is.null)

}


#' Remove highly similar GO.
#'
#' @param x a list containing output of GO enrichment. Each element of the list is an output of  [clusterProfiler::enrichGO()].
#' @param similarity_cutoff a value between 0 and 1. Default 0.9.
#'
#' @return  a list containing output of GO enrichment.
#' @export
#' @keywords internal
.simplify_go <- function(x, similarity_cutoff = 0.9){

  go_enrichment_result_only_enriched <-
    x %>%
    purrr::map(~ clusterProfiler::simplify(..1, cutoff = similarity_cutoff))

  return(go_enrichment_result_only_enriched)
}


#' Generate a list of GO EMAP plot.
#'
#' @param x a list containing output of GO enrichment. Each element of the list is an output of  [clusterProfiler::enrichGO()].
#' @param n_terms number of terms to show.
#' @param color_terms_by
#'
#' @return a list of GO plots.
#' @export
#'
#' @keywords internal
.generate_go_emap_plot <- function(x, n_terms = 30 , color_terms_by = "p.adjust"){

  safely_plot <- purrr::safely(.f = function(x, y,...){
    ep <- enrichplot::emapplot(x,...)
    ep <-   ep + ggplot2::ggtitle(y)
    return(ep)
  } , otherwise = NULL)

  go_plots <- purrr::map(names(x) , ~
                           safely_plot(x = x[[..1]],
                                       y = ..1,
                                       showCategory = n_terms,
                                       color = color_terms_by,
                                       layout = "kk"))

  # simplify list
  go_plots <- go_plots %>%
    purrr::transpose() %>%
    purrr::simplify_all()

  # keep result
  go_plots <- go_plots$result

  # assign names
  names(go_plots) <- names(x)

  # remove element if NULL
  go_plots <- go_plots %>%
    purrr::discard(is.null)

  go_plots

}


#' Group replicates by sample - returns list
#' @rdname get_replicates_by_sample
#'
#' @keywords internal
#' @export
.get_replicates_by_sample_list <- function(x){

  # validate x
  .validate_parcutils_obj(x)

  y <- .group_replicates_by_sample(x)

  y_grpd <-
    y %>%
    dplyr::group_by(groups) %>%
    tidyr::nest() %>%
    dplyr::mutate(data = purrr::map(data, ~ ..1 %>% dplyr::pull(1)))

  y <-
    y_grpd %>%
    dplyr::pull(2)

  names(y) <- y_grpd %>% dplyr::pull(1)

  return(y)
}



#' Map metadata (GC, length and seq) to GRanges object.
#'
#' @param x an object of class GRanges
#' @param bs_genome_object an object of class BSgenome, default \code{BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38}
#'
#' @return a dataframe.
#' @export
#'
#' @keywords internal
.map_granges_metadata <- function(x, bs_genome_object = BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38){

  stopifnot("x must be the object of class GRanges" = is(x, "GRanges"))
  stopifnot("bs_genome_object must be an object of class BSgenome" = is(bs_genome_object, "BSgenome"))

  # check if seqlevels of two objects matches.

  not_found <- GenomeInfoDb::seqlevels(x)[!GenomeInfoDb::seqlevels(x) %in% GenomeInfoDb::seqlevels(bs_genome_object)]

  if(length(not_found)>0){
    cli::cli_warn("seqlevel{?s}  {.emph {cli::col_red({not_found})}} {?is/are} not present in the BSgenome.\nASE belongs to {?this/these} level{?s} will be discarded.")
  }

  x <- x %>% tibble::as_tibble() %>% dplyr::filter(!(seqnames %in% not_found)) %>% plyranges::as_granges()

  if(length(x) == 0){
    stop("No ASE have common seqlevels with bs_genome_object.")
  }

  x <- x %>%
    # add sequence for each intron
    dplyr::mutate(seq =  BSgenome::getSeq(bs_genome_object,x)) %>%

    # add GC for each intron
    dplyr::mutate(GC =  BSgenome::letterFrequency(seq, letters = "GC", as.prob = TRUE) %>%  as.numeric()) %>%

    # add length for each intron
    dplyr::mutate(length =  BSgenome::width(seq))

  return(x)
}


#' For each intron in the object irFinderSdata assign unique intron id
#'
#' @param x an object of class irFinderSdata
#'
#' @return an object of class irFinderSdata
#' @export
#' @keywords internal
#' @examples
#'
#' example_dir <- system.file("extdata" ,"ir_data" , package="parcutils")
#' example_files <- fs::dir_ls(example_dir, glob = "*-IR-nondir*.txt")
#' names(example_files) <- stringr::str_replace(example_files , pattern = ".*/", replacement = "") %>%
#' stringr::str_replace(pattern = "-IR-nondir.txt", replacement = "")
#' x <- read_irfinderS_output(files = example_files,  add_prefix_chr = FALSE)
#' .parcutils_assign_intron_identifier(x)
.parcutils_assign_intron_identifier  <- function(x){
  # assign intron id to each element in the x

  .validate_irfinders_object(x)
  x <- purrr::map(x  , ~ ..1 %>% dplyr::mutate(intron_id = stringr::str_c("intron", 1:dplyr::n(), sep = "_")))
  x <- .assign_class_irFinderSdata(x)
  return(x)
}

#' Check if the object belongs to class irFinderSdata.
#'
#' @param x an object to be tested for class 'irFinderSdata'
#'
#' @return error.
#' @export
#'
#' @keywords internal
.validate_irfinders_object <- function(x){
  stopifnot("x must be an object of class irFinderSdata" = is(x, "irFinderSdata") )
}


#' Assign class irFinderSdata
#' @description This function does all mandatory checks before it assigns class irFinderSdata
#' @param x a list or dataframe to which class irFinderSdata to assign.
#'
#' @return an object of class irFinderSdata
#' @export
#' @keywords internal
.assign_class_irFinderSdata <- function(x){

  # x can be a dataframe or list of dataframes
  # if dataframe it must have mandatory columns
  # if a list it must have mandatory columns in each dataframe and same number of rows in each dataframe

  if(is(x , "data.frame")){
    .check_mendate_columns_for_irFinderSdata(x)
    x <- list(x)
    class(x) <- c("irFinderSdata", class(x))
  }

  if(is(x , "list")){
    purrr::walk(x, ~ .check_mendate_columns_for_irFinderSdata(..1))

    # all elems of the list have same number of rows
    x_nrows <- purrr::map_int(x, ~..1 %>% nrow())

    if(!all(x_nrows==x_nrows[1])){
      stop("All elements in x must have same number of rows.")
    }

    class(x) <- c("irFinderSdata", class(x))
  }

  return(x)

}



#' Check mandatory columns in a dataframe
#' @description This function helps to create mandatory column in a dataframe for the object irFinderSdata.
#' @param x dataframe
#' @param mandat_columns a character vector denoting mandatory columns in x.
#'
#' @return TRUE or ERROR
#' @export
#' @keywords internal
.check_mendate_columns_for_irFinderSdata <- function(x,
                                                     mandat_columns = c("chr",
                                                                        "start",
                                                                        "end",
                                                                        "name",
                                                                        "null",
                                                                        "strand",
                                                                        "coverage",
                                                                        "introndepth",
                                                                        "spliceleft",
                                                                        "spliceright",
                                                                        "spliceexact",
                                                                        "irratio",
                                                                        "warnings")){

  stopifnot("x must be a dataframe" = is(x , "data.frame"))

  x_cols <- colnames(x)

  col_not_found <- mandat_columns[is.na(match(mandat_columns, x_cols))]
  if(length(col_not_found) > 0){
    stop(cli::format_error(c("x" ="Column{?s} {col_not_found} not found")))
  }

  return(TRUE)

}


#' Get intron annotations
#'
#' @param x an object of class irFinderSdata
#' @param intron_ids a character vector of intron ids for which
#' annotations to be retrieved.
#'
#' @return a dataframe with columns \code{chr, start, end, intron_id,
#' null, strand, gene_id, gene_symbol}
#' @export
#' @keywords internal
#' @examples
#' example_dir <- system.file("extdata" ,"ir_data" , package="parcutils")
#' example_files <- fs::dir_ls(example_dir, glob = "*-IR-nondir*.txt")
#' names(example_files) <- stringr::str_replace(example_files , pattern = ".*/", replacement = "") %>%
#' stringr::str_replace(pattern = "-IR-nondir.txt", replacement = "")
#' x <- read_irfinderS_output(files = example_files,  add_prefix_chr = FALSE)
#' .get_intron_annotations(x, intron_ids = paste("intron", sample(1:100, 10),sep ="_"))
.get_intron_annotations <- function(x, intron_ids){

  .validate_irfinders_object(x)

  stopifnot("'intron_ids' must be a character vector." = is.character(intron_ids))

  # first six columns of irfinders output
  dd <- x[[1]]  %>%
    dplyr::select(1:6 , intron_id)  %>%
    dplyr::filter( intron_id %in% intron_ids) %>%
    dplyr::mutate(gene_id = stringr::str_replace(name, pattern = "(.*)/(.*)/(.*)",replacement = "\\2")) %>%
    dplyr::mutate(gene_symbol = stringr::str_replace(name, pattern = "(.*)/(.*)/(.*)",replacement = "\\1")) %>%
    dplyr::mutate(name = intron_id) %>%
    dplyr::select(-intron_id) %>%
    dplyr::rename(intron_id = name) %>%
    dplyr::distinct()

  return(dd)
}



#' Prepare object of parcutils_ir from the object of parcutils
#'
#' @param x an object of class 'parcutils'.
#'
#' @return an object of class 'parcutils_ir'
#' @export
#' @keywords internal
.prepare_parcutils_ir <-  function(x){
  .validate_parcutils_obj(x)
  y <- x %>% tibble::tibble()
  id_column <- y$norm_counts[[1]][[1]] %>% colnames() %>% .[1]
  # fix 1st column names

  y <- y %>%
    dplyr::mutate(norm_counts = purrr::map(norm_counts, function(.x){
      purrr::map(.x, ~ ..1 %>% dplyr::rename( !!id_column := 1))})) %>%
    dplyr::mutate(dsr_tibble = purrr::map(dsr_tibble, function(.x){
      .x %>% dplyr::rename(!!id_column := 1)}))  %>%
    dplyr::mutate(dsr_tibble_deg = purrr::map(dsr_tibble_deg, function(.x){
      .x %>% dplyr::rename(!!id_column := 1)
    }))

  class(y) <- c("parcutils_ir",class(x))
  return(y)
}



#' Save DEG results in an excel file.
#'
#' @param x an object of class parcutils.
#' @param file a file path.
#'
#' @return output of \code{writexl::write_xlsx()}
#' @export
#' @keywords internal
.save_deg_results <- function(x, file){

  .validate_parcutils_obj(x)
  de_table <- x$dsr_tibble_deg
  ret <- writexl::write_xlsx(de_table,path =  file)
  return(ret)
}

#' Save DE ir results in an excel file.
#'
#' @param x an object of class parcutils_ir
#' @param file a file path.
#'
#' @return a file path.
#' @export
#' @keywords internal
.save_de_ir_results <- function(x , file ){

  .validate_parcutils_ir_object(x)

  oo <- purrr::map(x$de_comparisons , ~{
    de_table <- x$dsr_tibble_deg[[.x]]
    intron_annot_table <-  x$intron_annot[[.x]]

    # join de and annotation tables.
    intron_annot_table %>%
      dplyr::left_join( de_table ,by = "intron_id") %>%
      # add IGV search text
      dplyr::mutate(igv_search_text = stringr::str_c(chr, ":", start, "-",end))
  })
  names(oo) <- x$de_comparisons
  ret <- writexl::write_xlsx(oo,path =  file)
  return(ret)
}

#' Validate parcutils_ir object
#'
#' @param x
#'
#' @export
#' @keywords internal
.validate_parcutils_ir_object <- function(x) {
  stopifnot("x must be an object of class 'parcutils_ir'.
           Usually x is derived by parcutils::run_deseq_analysis_ir()." = is(x, "parcutils_ir"))
}


#' Prepare count data for the function [parcutils::run_deseq_analysis()]
#'
#' @param counts a character sting or dataframe.
#'
#' @return a dataframe.
#' @export
#' @keywords internal
.get_count_data  <- function(counts){
  if(!inherits(counts , what = c("character","tbl_df","tbl" ,"data.frame"))){
    cli::cli_abort("{.arg counts} must be an object of class {.cls character} or {.cls data.frame}")
  }

  if(inherits(counts , what = c("character"))){
    cli::cli_alert_info(" Reading count file ...")
    count_data <- readr::read_delim(counts,
                                    delim = delim,
                                    comment = comment_char,
                                    trim_ws = TRUE,
                                    col_names = TRUE,
                                    show_col_types = FALSE ,
                                    progress = TRUE)
    cli::cli_alert_success("Done.")
  } else {
    count_data <- counts
  }
  # remove duplicate rows if any
  count_data %<>% dplyr::distinct()

  return(count_data)
}


#' Prepare sample information for the function [parcutils::run_deseq_analysis()]
#'
#' @param sample_info a character string or data frame.
#'
#' @return a data frame
#' @export
#'
#' @keywords internal
.get_sample_information <- function(sample_info){

  # value for sample_info either from a class character, tbl_df, tbl or data.frame

  if(!inherits(sample_info, what = c("character","tbl_df","tbl" ,"data.frame"))){
    cli::cli_abort("{.arg sample_info} must be an object of class {.cls character} or {.cls data.frame}")

  }

  # if sample_info is a character string then read data from file.

  if(inherits(sample_info , what = c("character"))){
    cli::cli_alert_info(" Reading sample_info file ...")
    sample_info_data <- readr::read_delim(sample_info ,
                                          delim = "\t",
                                          trim_ws = TRUE,
                                          col_names = FALSE,
                                          col_types = c("cc"), # both column will be considered as character
                                          show_col_types = FALSE,
                                          progress = TRUE)

    cli::cli_alert_success("Done.")
  } else{
    sample_info_data <- sample_info
  }
  # if more than 2 columns present in sample_info keep only 2 and show warning.
  if(ncol(sample_info_data) != 2 ){
    if(ncol(sample_info_data) > 2 ){
      cli::cli_alert_warning("'sample_info' contains more than 2 columns. Only first 2 will be considered.")
      sample_info_data %<>%
        dplyr::select(1:2)
    } else if(ncol(sample_info_data) < 2 ) {
      cli::cli_abort("{.arg sample_info} must contains two columns deliminated by '\\t'")
    }
  } else{
    sample_info_data = sample_info_data
  }

  # remove duplicates
  sample_info_data %<>% dplyr::distinct()

  return(sample_info_data)

}



#' Convert NA to 0 from into count data for the function [parcutils::run_deseq_analysis()]
#'
#' @param count_data a dataframe
#'
#' @return a dataframe
#' @export
#'
#' @keywords internal
.count_data_convert_NA_to_0 <- function(count_data){
  count_data_select_NA <- count_data %>%
    TidyWrappers::tbl_keep_rows_NA_any()

  if(nrow(count_data_select_NA) >= 1) {
    cli::cli_alert_warning("Following gene(s) contains NA in one or more samples. Either remove them or Default NA will be converted to 0.")
    genes_NA <- count_data_select_NA %>%
      dplyr::pull(!!column_geneid_quo)
    cli::cli_alert(genes_NA)
    # replace NA with 0
    count_data %<>% dplyr::mutate_if(is.numeric , ~ ..1 %>% tidyr::replace_na(0))
  }
  return(count_data)
}



#' Filter count data using user thresholds for the function [parcutils::run_deseq_analysis()]
#'
#' @param count_data a data frame
#' @param column_geneid a character string
#' @param sample_info a data frame
#' @param min_counts a numeric value
#' @param min_replicates a numeric value
#'
#' @return a tibble.
#' @export
#'
#' @keywords internal
.filter_rows_from_count_data <- function(count_data, column_geneid, sample_info,
                                         min_counts,min_replicates){

  # For DESeq analysis genes having non zero count in at least one sample will be used.
  # In addition genes will be filtered based on user defined cutoffs -
  # 1. minimum counts to each gene
  # 2. minimum number of replicates with minimum counts

  # quote column names
  column_geneid_quo <- rlang::enquo(column_geneid)

  sample_info_colnames <- colnames(sample_info)

  genes_to_keep <- count_data %>%

    # convert data wide to long format.
    tidyr::gather(-!!column_geneid_quo, key = "sample", value = "counts") %>%

    # join sampleinfo table
    dplyr::left_join(sample_info, by = c("sample" = sample_info_colnames[1])) %>%

    # to identify genes which has counts more than cutoff across replicates group by sample groups followed by gene id.
    # total groups will be number of unique genes * number of sample groups.
    dplyr::group_by(!!rlang::sym(sample_info_colnames[2]), !!rlang::sym(column_geneid)) %>%

    dplyr::arrange(!!rlang::sym(column_geneid), !!rlang::sym(sample_info_colnames[2])) %>%

    # add column (count_above_threshold) containing counts of replicates having min counts >= min_counts
    dplyr::mutate(count_above_threshold = sum(counts >= min_counts)) %>%

    # filter genes count_above_thresholds >= min_replicates (e.g. 2);
    # meaning that for a given gene at least two replicates have count >= threshold (e.g., 10)
    dplyr::filter(count_above_threshold >= min_replicates) %>%
    dplyr::ungroup() %>%
    dplyr::pull(!!rlang::sym(column_geneid)) %>%
    unique()

  count_data_filter <- count_data %>%
    dplyr::filter(!!rlang::sym(column_geneid) %in% genes_to_keep)

  return(count_data_filter)
}


#' Check availability of sample comparisons in the object of parcutils
#'
#' @param x an object of class \code{parcutils}
#' @param sample_comparisons a character vector
#'
#' @return either TRUE or abort.
#' @export
#'
#' @keywords internal
.is_sample_comparsions_present_in_obj_parcutils <- function(x, sample_comparisons){

  .validate_parcutils_obj(x)
  not_present <- sample_comparisons[!(sample_comparisons %in% x$de_comparisons)]
  if(length(not_present) > 0 ){
    cli::cli_abort("{.arg sample_comparisons} - {.emph {cli::col_red({not_present})}} {?is/are} not present in {.arg x}")
  } else{
    return(TRUE)
  }
}


#' Check availability of genes in the object of parcutils
#'
#' @param x an object of class \code{parcutils}.
#' @param genes a character vector.
#'
#' @return either TRUE or abort.
#' @export
#'
#' @keywords internal
.is_genes_present_in_obj_parcutils <- function(x, genes){
  all_genes <- .get_all_expressed_genes(x) %>% names()

  # check if values from labels present in x
  not_present <- genes[!genes %in% all_genes]
  if(length(not_present) >0 ){
    cli::cli_abort("{.arg genes} - {.emph {cli::col_red({not_present})}} {?is/are} not present in {.arg x}")
  } else{
    return(TRUE)
  }

}


#' Get gene groups for fc scatter plot
#'
#' @param x an object of class parcutils
#' @param sample_comparisons a character vector
#'
#' @return a dataframe
#' @export
#'
#' @keywords internal
.get_gene_groups_for_fc_scatter_plot <- function(x, sample_comparisons,color_label){

  # prepare groups to color data points
  genes_by_regul <- parcutils::get_genes_by_regulation(x = x,
                                                       sample_comparisons =sample_comparisons,
                                                       regulation = "all",simplify = TRUE) %>%
    dplyr::bind_rows() %>%
    dplyr::group_by(.data$regul,.data$genes) %>%
    dplyr::add_count(name = "counts") %>%
    dplyr::ungroup()

  # assign groups -> both_up, both_down, both

  if(color_label == "both_up"){
    gene_groups <- genes_by_regul %>%
      dplyr::mutate(group = dplyr::case_when(counts == 2 & grepl("up", regul) ~
                                              stringr::str_c("both_",regul,sep = ""),
                                            TRUE ~ "other")) %>%
      dplyr::select(.data$genes, .data$group) %>% dplyr::distinct()

  } else if(color_label == "both_down"){
    gene_groups <- genes_by_regul %>%
      dplyr::mutate(group = dplyr::case_when(counts == 2 & grepl("down", regul) ~
                                              stringr::str_c("both_",regul,sep = ""),
                                            TRUE ~ "other")) %>%
      dplyr::select(.data$genes, .data$group) %>% dplyr::distinct()
  } else {
    gene_groups <- genes_by_regul %>%
      dplyr::mutate(group = dplyr::case_when(counts == 2 & grepl("up|down", regul) ~
                                              stringr::str_c("both_",regul,sep = ""),
                                            TRUE ~ "other")) %>%
      dplyr::select(.data$genes, .data$group) %>% dplyr::distinct()
  }

}


#' Perform k-means clustering
#'
#' @param data a dataframe. First column will be used as row identifier.
#' Therefore, values in the first column must be unique.
#' @param km an integer denoting number of clusters to return.
#'
#' @return a dataframe with two columns -1) 1st column from `data` and 2) clusters
#' @export
#'
#' @keywords internal
.perform_kemans_cluster <- function(data, km ){

  data_column_names <- data %>% colnames()
  data_id_col <- data_column_names[1]

  # convert data to matrix
  data_mat <- data %>%
    as.data.frame() %>%
    tibble::column_to_rownames(data_id_col) %>% as.matrix()
  set.seed(123)
  km_out <- kmeans(data_mat, centers = km )
  ret <- km_out$cluster %>%
    tibble::enframe() %>%
    dplyr::rename(!!rlang::sym(data_id_col) := name, clusters = value) %>%
    dplyr::mutate(clusters = stringr::str_c("clust_",clusters, sep = ""))
  return(ret)
}


#' Generate a gene expression line plot.
#' @description This function called internally from the function
#' `get_gene_expression_line_plot`.
#' @param line_plot_data_wide a dataframe containing data for line plot.
#' @param km a numeric, indicates number of clusters for k-means clustering.
#' @param facet_clusters a logical, denotes whether to facet clusters generated by k-means.
#' @param scale_log10 a logical, denotes whether to transform Y axis on log10 scale.
#' @param line_transparency a numeric, denotes value for line transparency.
#' @param show_average_line a logical, denotes whether to show average line.
#' @param average_line_color a character string, denotes color for average line.
#' @param average_line_size a numeric, denotes size for average line.
#' @param average_line_summary_method character string either \code{mean} or  \code{median},
#' denotes summary method for average line.
#'
#' @return a ggplot2.
#' @export
#'
#' @keywords internal
.generate_a_lineplot <- function(line_plot_data_wide,
                                                  km,
                                                  facet_clusters,
                                                  scale_log10,
                                                  line_transparency,
                                                  show_average_line,
                                                  average_line_color,
                                                  average_line_size,
                                                  average_line_summary_method
){

  gene_id_col <- line_plot_data_wide %>% colnames() %>%.[1]

  # long data
  line_plot_data_long <- line_plot_data_wide %>%
    tidyr::pivot_longer(-!!rlang::sym(gene_id_col), names_to = "samples", values_to = "values") %>%
    dplyr::group_by(samples) %>%
    dplyr::mutate(row_num =  1:dplyr::n())

  # perform clustering
  if(!is.null(km)){
    km_output <- .perform_kemans_cluster(data = line_plot_data_wide, km = km)
    line_plot_data_long <- line_plot_data_long %>%
      dplyr::left_join(km_output, by = gene_id_col)
  }

  # plot
  gp <- line_plot_data_long %>%
    ggplot2::ggplot(ggplot2::aes(x = samples, y = values)) +
    ggplot2::geom_point()

  # add layers
  if(is.null(km)){
    gp <- gp + ggplot2::geom_line(alpha = line_transparency, ggplot2::aes(group = row_num))
  } else {
    gp <- gp + ggplot2::geom_line(alpha = line_transparency, ggplot2::aes(group = row_num, col = clusters))
  }

  # modify theme and scale
  if(TRUE){
    gp <- gp +   ggplot2::theme(text = ggplot2::element_text(size = 12)) +
      ggplot2::theme_bw() +
      ggplot2::scale_y_continuous(labels = scales::label_number(scale_cut = scales::cut_long_scale()))
  }
  # scale log10
  if(scale_log10) {
    suppressMessages(
      gp <- gp +
        ggplot2::scale_y_log10(labels = scales::label_number(scale_cut = scales::cut_long_scale()))
    )
  }
  # change label
  if(TRUE){
    gp  <- gp + ggplot2::ylab("Normalised Gene Expression") +
      ggplot2::xlab("Samples")
  }

  # show average line
  if(show_average_line){
    if(!is.null(km)){
      avg_line_data <- line_plot_data_long %>%
        dplyr::group_by(samples, clusters) %>%
        dplyr::summarise_at(.vars = dplyr::vars(values),
                            .funs = average_line_summary_method)
      gp <- gp +
        ggplot2::geom_line(data = avg_line_data %>%
                             dplyr::mutate(row_num = dplyr::row_number()) ,
                           ggplot2::aes( group = row_num),
                           col = average_line_color ,
                           size = average_line_size)

    } else {
      avg_line_data <- line_plot_data_long  %>%
        dplyr::group_by(samples) %>%
        dplyr::summarise_at(.vars = dplyr::vars(values),
                            .funs = average_line_summary_method)

      gp <- gp +
        ggplot2::geom_line(data = avg_line_data %>%
                             dplyr::mutate(row_num = dplyr::row_number()) ,
                           ggplot2::aes( group = 1),
                           col = average_line_color ,
                           size = average_line_size)
    }

  }
  # facet
  if(facet_clusters){
    if(!is.null(km)){
      gp <- gp + ggplot2::facet_wrap(~clusters)
    }
  }
  return(gp)
}




#' Get the gene names stored in the parcutils object
#'
#' @param x an object of class parcutils
#' @param query_genes a character vector denoting genes for which corresponding gene names to be derived from the parcutils object.
#'
#' @return a dataframe containing two columns - 1) queried genes and 2) gene names from the object parcutils.
#' @export
#'
#' @examples
#'
#' res = .get_parcutils_object_example()
#' .get_parcutils_obj_gene_names(x = res, query_genes= c("LSS", "GOLGA8S", "FXYD5", "CDC34"))
#'
#' @keywords internal
.get_parcutils_obj_gene_names <- function(x, query_genes){


  lookup_table <- parcutils:::.get_all_expressed_genes(x = x) %>%
    names()

  names(query_genes) <- query_genes

  purrr::map(query_genes , ~which(stringr::str_detect(pattern = glue::glue(".*:{..1}$"),string = lookup_table))) %>%
    tibble::enframe(value = "index") %>%
    tidyr::unnest(keep_empty = T) %>%
    dplyr::mutate(full_name = lookup_table[index]) %>%
    dplyr::select(-index)


}


#' Obtain an example object of the class parcutils
#'
#' @return an object of class \code{parcutils}
#' @export
#'
#' @keywords internal
.get_parcutils_object_example <- function(){
  count_file <- system.file("extdata","toy_counts.txt" , package = "parcutils")
  count_data <- readr::read_delim(count_file, delim = "\t",show_col_types = FALSE)

  sample_info <- count_data %>% colnames() %>% .[-1]  %>%
    tibble::tibble(samples = . , groups = rep(c("control" ,"treatment1" , "treatment2"), each = 3) )


  res <- run_deseq_analysis(counts = count_data ,
                            sample_info = sample_info,
                            column_geneid = "gene_id" ,
                            cutoff_lfc = 1 ,
                            cutoff_pval = 0.05,
                            group_numerator = c("treatment1", "treatment2") ,
                            group_denominator = c("control"))

  return(res)
}
cparsania/parcutils documentation built on Oct. 27, 2024, 4:55 a.m.