niar: Nice Interactive Analysis with R

Documented in multiply_csv plot_rmats_summary read_rmats read_rmats_summary

#' Multiply comma-separated numbers in a character vector by a number, handling NAs

#' @description
#' This function takes a character vector containing comma-separated numbers and 
#' multiplies each number by 100. It handles missing values (NAs) by returning `NA` 
#' for elements that cannot be converted to numeric.

#' @param x A character vector containing comma-separated numbers.
#' @param k The number to multiply by. Default `100`.
#' 
#' @importFrom stringr str_split
#' 
#' @return A character vector with the same number of elements as `x`, 
#' where each element is the product of the original number and `k`, 
#' or NA if the original element could not be converted to numeric.
#' 
#' @examples
#' x <- c("0.1,0.2,NA,0.3", "0.4,0.5,NA,0.66")
#' multiply_csv(x)
#' # "10,20,NA,30" "40,50,NA,66"
#' y <- c("0.2,!!,0.9,NA", "*,NA,0.015")
#' multiply_csv(y)
#' # "20,NA,90,NA"  "NA,NA,1.5,NA"
multiply_csv <- function(x, k = 100) {
  mat <- str_split(string = x, pattern = ",", simplify = TRUE) 
  num_mat <- suppressWarnings(apply(mat, MARGIN = 2, FUN = as.numeric))
  mult_mat <- num_mat * k
  mult_vec <- apply(mult_mat, MARGIN = 1, FUN = paste, collapse = ",")
  return(mult_vec)
}

#' Read rMATS Output Files and Process Splicing Event Data
#'
#' @description
#' This function reads and processes output files generated by the rMATS (Replicate Multivariate Analysis of Transcript Splicing) software.
#' It imports the data for different types of splicing events, scales the PSI values, and returns a combined data frame with the specified columns (if needed).
#'
#' @details
#' The function processes five types of splicing events: Skipped Exons (SE), Alternative 5' Splice Sites (A5SS), Alternative 3' Splice Sites (A3SS), Mutually Exclusive Exons (MXE), and Retained Introns (RI).
#' It reads the corresponding files, scales the PSI values, and returns a tibble. 
#' The user can choose to return only the shared columns across all event types or data specific to a particular event type with `return`.
#' To understand the different between `JC` and `JCEC` please refer to the rMATS manual.
#'
#' @param outdir Character. Path/to/the/directory/containing/rMATS/output/files.
#' @param reads_counts_method Character. The method to use for reading counts. Either `JC` (default) for junction counts or `JCEC` for junction counts plus exon counts.
#' @param return Character. Specifies the type of data to return. Options are `SHARED_COLS_ONLY` (default), `SE`, `ALT53SS`, `MXE`, `RI`, and `ALL`.
#' @param max_lines Numeric. Maximum number of lines to read from each file. Default is `1e9`.
#' 
#' @return A data frame containing the processed splicing event data. The structure of the returned data frame depends on the value of the `return` parameter. If `return` is set to `ALL` a list of dataframes is returned.
#' 
#' @importFrom dplyr mutate select rename relocate bind_rows across all_of
#' @importFrom readr read_delim locale
#'
#' @export
#' @examples
#' # result <- read_rmats(outdir = "path/to/rmats/output", reads_counts_method = "JC", return = "SHARED_COLS_ONLY")
read_rmats <- function(outdir, reads_counts_method = c('JC', 'JCEC'), 
                       return = c('SHARED_COLS_ONLY', 'SE', 'ALT53SS', 'MXE', 'RI', 'ALL'),
                       max_lines = 1e9) {
  
  if ( missing(reads_counts_method) ) { reads_counts_method <- 'JC' }
  if ( missing(return) ) { return <- 'SHARED_COLS_ONLY' }
  
  events_type <- c('SE', 'A5SS', 'A3SS', 'MXE', 'RI')
  
  ### --------  Shared columns --------
  # ID: rMATS event id
  # GeneID: Gene id
  # geneSymbol: Gene name
  # chr: Chromosome
  # strand: Strand of the gene
  # IJC_SAMPLE_1: Inclusion counts for sample 1. Replicates are comma separated
  # SJC_SAMPLE_1: Skipping counts for sample 1. Replicates are comma separated
  # IJC_SAMPLE_2: Inclusion counts for sample 2. Replicates are comma separated
  # SJC_SAMPLE_2: Skipping counts for sample 2. Replicates are comma separated
  # IncFormLen: Length of inclusion form, used for normalization
  # SkipFormLen: Length of skipping form, used for normalization
  # PValue: Significance of splicing difference between the two sample groups. (Only available if the statistical model is on)
  # FDR: False Discovery Rate calculated from p-value. (Only available if statistical model is on)
  # IncLevel1: Inclusion level for sample 1. Replicates are comma separated. Calculated from normalized counts
  # IncLevel2: Inclusion level for sample 2. Replicates are comma separated. Calculated from normalized counts
  # IncLevelDifference: average(IncLevel1) - average(IncLevel2)
  
  shared_cols <- c('ID', 'GeneID', 'geneSymbol', 'chr', 'strand', 
                   'IJC_SAMPLE_1', 'SJC_SAMPLE_1', 'IJC_SAMPLE_2' , 'SJC_SAMPLE_2',
                   'IncFormLen', 'SkipFormLen', 'PValue', 'FDR', 
                   'IncLevel1', 'IncLevel2', 'IncLevelDifference')
  ### --------  Skipped Exons specific columns --------
  se_cols <- c('exonStart_0base', 'exonEnd', 'upstreamES', 'upstreamEE', 
               'downstreamES', 'downstreamEE', 'upstream_to_target_count',
               'target_to_downstream_count', 'target_count', 
               'upstream_to_downstream_count')
  ### --------  Alternative 5' & 3' splice sites specific columns --------
  alt53ss_cols <- c('longExonStart_0base', 'longExonEnd', 'shortES', 'shortEE',
                    'flankingES', 'flankingEE', 'across_short_boundary_count',
                    'long_to_flanking_count', 'exclusive_to_long_count',
                    'short_to_flanking_count')
  ### --------  Mutually Exclusive Exons specific columns --------
  mxe_cols <- c('X1stExonStart_0base', 'X1stExonEnd', 'X2ndExonStart_0base',
                'X2ndExonEnd', 'upstreamES', 'upstreamEE', 'downstreamES',
                'downstreamEE', 'upstream_to_first_count',
                'first_to_downstream_count', 'first_count', 
                'upstream_to_second_count','second_to_downstream_count',
                'second_count')
  
  ### --------  Retained Intron specific columns --------
  ri_cols <- c('riExonStart_0base', 'riExonEnd', 'upstreamES', 'upstreamEE',
               'downstreamES', 'downstreamEE', 'upstream_to_intron_count',
               'intron_to_downstream_count', 'intron_count',
               'upstream_to_downstream_count')
  ### --------  READ EVENTS --------
  events_df <- list()
  for (e in 1:length(events_type)) {
    # message('Importing ', events_type[e] )
    
    if ( reads_counts_method == 'JC' ) { 
      filename_JC <- paste0(events_type[e], '.MATS.JC.txt')
      in_path <- file.path(outdir, filename_JC)
      
    } else if (  reads_counts_method == 'JCEC' ) {
      filename_JCEC <- paste0(events_type[e], '.MATS.JCEC.txt')
      in_path <- file.path(outdir, filename_JCEC)
      
    } else {
      stop('The argument reads_counts_method must be either "JC" or "JCEC" !')
    }
    
    check_file(in_path)
    
    if (events_type[e] == 'SE') { event_cols <- c(shared_cols, se_cols) }
    else if (events_type[e] ==  'A5SS') { event_cols <- c(shared_cols, alt53ss_cols) }
    else if (events_type[e] ==  'A3SS') { event_cols <- c(shared_cols, alt53ss_cols) }
    else if (events_type[e] ==  'MXE') { event_cols <- c(shared_cols, mxe_cols) }
    else if (events_type[e] ==  'RI') { event_cols <- c(shared_cols, ri_cols) }
    else { stop('Event Type not known, please investigate...') }
    
    event_txt <- read_delim(file = in_path, delim = '\t', col_select = event_cols,
                            trim_ws = TRUE, n_max = max_lines, progress = FALSE,
                            name_repair = make.names, show_col_types = FALSE,
                            locale = locale(decimal_mark = '.', grouping_mark = ',') ) |>
      mutate(TYPE = events_type[e], .before = ID) |>
      mutate(TYPE = factor(TYPE, levels = c( "SE", "MXE", "A5SS", "A3SS", "RI") ) ) |>
      mutate(CLASS = case_when(TYPE %in% c("SE", "MXE") ~ 'Exon',
                               TYPE %in% c("A5SS", "A3SS") ~ 'Alt.SS',
                               TYPE == c("RI") ~ 'Intron') ) |>
      mutate(CLASS = factor(CLASS, levels = c( "Exon", "Intron", "Alt.SS") ), .after = TYPE ) |>
      rename(ensembl_gene_id = GeneID) |>
      rename(external_gene_name = geneSymbol) |>
      # scale PSIs from 0 to 100
      mutate( across( .cols = c(IncLevel1, IncLevel2), .fns = multiply_csv ) ) |>
      rename(PSI_1 = IncLevel1) |> rename(PSI_2 = IncLevel2) |>
      # scale dPSIs from -100 to 100
      rename(dPSI = IncLevelDifference) |>
      mutate(dPSI = dPSI * 100) |>
      # coerce all counts to character to make them compatible with
      # situations in which samples have reps or not and are comma separated values
      mutate( across( .cols = c(starts_with('IJC_'), starts_with('SJC_') ), .fns = as.character ) )
    
    events_df[[e]] <- event_txt
  }
  
  ### -------- PREPARE OUTPUT --------
  # fix renamed columns
  shared_cols[shared_cols == 'GeneID'] <- 'ensembl_gene_id'
  shared_cols[shared_cols == 'geneSymbol'] <- 'external_gene_name'
  shared_cols[shared_cols == 'IncLevel1'] <- 'PSI_1'
  shared_cols[shared_cols == 'IncLevel2'] <- 'PSI_2'
  shared_cols[shared_cols == 'IncLevelDifference'] <- 'dPSI'
  shared_cols <- c('TYPE', 'CLASS', shared_cols)

  if ( return == 'SHARED_COLS_ONLY' ) { res <- lapply(events_df, function(x) { select(x, all_of(shared_cols)) } ) |> bind_rows() }
  else if ( return == 'SE' ) { res <- events_df[[1]] }
  else if ( return == 'ALT53SS' ) { res <- rbind( events_df[[2]], events_df[[3]] ) }
  else if ( return == 'MXE' ) { res <- events_df[[4]] }
  else if ( return == 'RI' ) { res <- events_df[[5]] }
  else if ( return == 'ALL' ) { res <- events_df }
  
  return(res)
}

#' Import the rMATS-turbo summary files.
#'
#' @description
#' This function imports the numbers of total and significant AS events quantified with both JC and JCEC.
#'
#' @param outdir Character. Path/to/the/directory/containing/rMATS/output/files.
#'
#' @return A tibble
#' @importFrom tidyr pivot_longer
#' @importFrom dplyr mutate starts_with
#' @importFrom stringr str_extract str_remove
#' @export
#'
#' @seealso [plot_rmats_summary()]
#'
#' @examples
#' # read_rmats_summary(outdir = "path/to/rmats/output" ) |>
#' #          plot_rmats_summary()
read_rmats_summary <- function(outdir) {
  
  # If needed this are the columns
  # summary_cols <- c('EventType',
  #                   'EventTypeDescription',
  #                   'TotalEventsJC',
  #                   'TotalEventsJCEC',
  #                   'SignificantEventsJC',
  #                   'SigEventsJCSample1HigherInclusion',
  #                   'SigEventsJCSample2HigherInclusion',
  #                   'SignificantEventsJCEC',
  #                   'SigEventsJCECSample1HigherInclusion',
  #                   'SigEventsJCECSample2HigherInclusion')
  
  summary_path <- file.path(outdir, 'summary.txt')
  check_file(summary_path)
  smmry_nums <- read.delim(file = summary_path, header = T, sep = '\t', quote = '')
  
  smmry_nums |>
    pivot_longer(cols = !starts_with('Event'), names_to = 'Stat', values_to = 'Count') |>
    mutate(Method = str_extract(string = Stat, pattern = 'JC(?:EC)?'), .before = Count) |>
    mutate(Group = str_remove(string = Stat, pattern = 'JC(?:EC)?'), .before = Count)  |>
    mutate(Group = gsub(pattern = 'SigEventsSample1HigherInclusion', replacement = 'SigEvents_HigherS1', x = Group)) |>
    mutate(Group = gsub(pattern = 'SigEventsSample2HigherInclusion', replacement = 'SigEvents_HigherS2', x = Group)) |>       
    mutate(Group = factor(Group, levels = c( "TotalEvents", "SignificantEvents", "SigEvents_HigherS1", "SigEvents_HigherS2") ) ) |>
    mutate(EventType = factor(EventType, levels = c( "SE", "MXE", "A5SS", "A3SS", "RI") ) ) -> tidy_smmry
  
  return(tidy_smmry)  
}

#' Plot the number of AS events quantified with rMATS-turbo
#'
#' @param data A data.frame generated with `read_rmats_summary()`
#' @param legend_position The relative location of the legend in the plot. Passed to `theme(legend.position.inside)`
#'
#' @return A ggplot
#' @import ggplot2
#' @export
#'
#' @seealso [read_rmats_summary()]
#'
#' @examples
#' # read_rmats_summary(outdir = "path/to/rmats/output" ) |>
#' #          plot_rmats_summary()
plot_rmats_summary <- function(data, legend_position = c(0.8, 0.8)) {
  ggplot(data) +
    aes(x = Group, y = Count, group = Method, fill = Method) +
    facet_grid(~EventType) +
    geom_col(position = position_dodge(width = 0.75), width = 0.7, colour = 'black', linewidth = 0.2) +
    scale_y_continuous(expand = expansion(mult = c(0, 0.05), add = 0), n.breaks = 10) +
    scale_fill_manual(values = c('darkorchid1', 'forestgreen')) +
    theme_classic(base_family = 'Arial', base_size = 6) +
    theme(plot.background = element_blank(),
          panel.background = element_blank(),
          panel.border = element_rect(colour = 'black', fill = NA),
          panel.grid.major.y = element_line(colour = 'grey84', linewidth = 0.2),
          panel.spacing.x = unit(0.75, units = 'mm'),
          panel.spacing.y = unit(1, units = 'mm'),
          
          strip.background = element_blank(),
          strip.text = element_text(colour = 'black'),
          strip.text.x = element_text(margin = margin(b = -1)),
          strip.text.y = element_text(margin = margin(l = -1)),
          
          axis.line = element_line(linewidth = 0.5, colour = 0.5),
          axis.ticks = element_line(linewidth = 0.2, colour = 'black'),
          axis.ticks.x = element_blank(),
          axis.ticks.length = unit(1, 'mm'),
          axis.text = element_text(colour = 'black'),
          axis.text.x = element_text(angle = 45, hjust = 1, margin = margin(t = -2)),
          axis.text.y = element_text(margin = margin(r = 0)),
          axis.title = element_blank(),
          legend.title = element_blank(),
          legend.background = element_blank(),
          legend.position = 'inside',
          legend.position.inside = legend_position,
          legend.key.size = unit(1, 'mm'),
          legend.key = element_blank(), 
          legend.key.height = unit(1, 'mm'), 
          legend.key.width = unit(1, 'mm') )
}