#' Multiply comma-separated numbers in a character vector by a number, handling NAs
#' @description
#' This function takes a character vector containing comma-separated numbers and
#' multiplies each number by 100. It handles missing values (NAs) by returning `NA`
#' for elements that cannot be converted to numeric.
#' @param x A character vector containing comma-separated numbers.
#' @param k The number to multiply by. Default `100`.
#'
#' @importFrom stringr str_split
#'
#' @return A character vector with the same number of elements as `x`,
#' where each element is the product of the original number and `k`,
#' or NA if the original element could not be converted to numeric.
#'
#' @examples
#' x <- c("0.1,0.2,NA,0.3", "0.4,0.5,NA,0.66")
#' multiply_csv(x)
#' # "10,20,NA,30" "40,50,NA,66"
#' y <- c("0.2,!!,0.9,NA", "*,NA,0.015")
#' multiply_csv(y)
#' # "20,NA,90,NA" "NA,NA,1.5,NA"
multiply_csv <- function(x, k = 100) {
mat <- str_split(string = x, pattern = ",", simplify = TRUE)
num_mat <- suppressWarnings(apply(mat, MARGIN = 2, FUN = as.numeric))
mult_mat <- num_mat * k
mult_vec <- apply(mult_mat, MARGIN = 1, FUN = paste, collapse = ",")
return(mult_vec)
}
#' Read rMATS Output Files and Process Splicing Event Data
#'
#' @description
#' This function reads and processes output files generated by the rMATS (Replicate Multivariate Analysis of Transcript Splicing) software.
#' It imports the data for different types of splicing events, scales the PSI values, and returns a combined data frame with the specified columns (if needed).
#'
#' @details
#' The function processes five types of splicing events: Skipped Exons (SE), Alternative 5' Splice Sites (A5SS), Alternative 3' Splice Sites (A3SS), Mutually Exclusive Exons (MXE), and Retained Introns (RI).
#' It reads the corresponding files, scales the PSI values, and returns a tibble.
#' The user can choose to return only the shared columns across all event types or data specific to a particular event type with `return`.
#' To understand the different between `JC` and `JCEC` please refer to the rMATS manual.
#'
#' @param outdir Character. Path/to/the/directory/containing/rMATS/output/files.
#' @param reads_counts_method Character. The method to use for reading counts. Either `JC` (default) for junction counts or `JCEC` for junction counts plus exon counts.
#' @param return Character. Specifies the type of data to return. Options are `SHARED_COLS_ONLY` (default), `SE`, `ALT53SS`, `MXE`, `RI`, and `ALL`.
#' @param max_lines Numeric. Maximum number of lines to read from each file. Default is `1e9`.
#'
#' @return A data frame containing the processed splicing event data. The structure of the returned data frame depends on the value of the `return` parameter. If `return` is set to `ALL` a list of dataframes is returned.
#'
#' @importFrom dplyr mutate select rename relocate bind_rows across all_of
#' @importFrom readr read_delim locale
#'
#' @export
#' @examples
#' # result <- read_rmats(outdir = "path/to/rmats/output", reads_counts_method = "JC", return = "SHARED_COLS_ONLY")
read_rmats <- function(outdir, reads_counts_method = c('JC', 'JCEC'),
return = c('SHARED_COLS_ONLY', 'SE', 'ALT53SS', 'MXE', 'RI', 'ALL'),
max_lines = 1e9) {
if ( missing(reads_counts_method) ) { reads_counts_method <- 'JC' }
if ( missing(return) ) { return <- 'SHARED_COLS_ONLY' }
events_type <- c('SE', 'A5SS', 'A3SS', 'MXE', 'RI')
### -------- Shared columns --------
# ID: rMATS event id
# GeneID: Gene id
# geneSymbol: Gene name
# chr: Chromosome
# strand: Strand of the gene
# IJC_SAMPLE_1: Inclusion counts for sample 1. Replicates are comma separated
# SJC_SAMPLE_1: Skipping counts for sample 1. Replicates are comma separated
# IJC_SAMPLE_2: Inclusion counts for sample 2. Replicates are comma separated
# SJC_SAMPLE_2: Skipping counts for sample 2. Replicates are comma separated
# IncFormLen: Length of inclusion form, used for normalization
# SkipFormLen: Length of skipping form, used for normalization
# PValue: Significance of splicing difference between the two sample groups. (Only available if the statistical model is on)
# FDR: False Discovery Rate calculated from p-value. (Only available if statistical model is on)
# IncLevel1: Inclusion level for sample 1. Replicates are comma separated. Calculated from normalized counts
# IncLevel2: Inclusion level for sample 2. Replicates are comma separated. Calculated from normalized counts
# IncLevelDifference: average(IncLevel1) - average(IncLevel2)
shared_cols <- c('ID', 'GeneID', 'geneSymbol', 'chr', 'strand',
'IJC_SAMPLE_1', 'SJC_SAMPLE_1', 'IJC_SAMPLE_2' , 'SJC_SAMPLE_2',
'IncFormLen', 'SkipFormLen', 'PValue', 'FDR',
'IncLevel1', 'IncLevel2', 'IncLevelDifference')
### -------- Skipped Exons specific columns --------
se_cols <- c('exonStart_0base', 'exonEnd', 'upstreamES', 'upstreamEE',
'downstreamES', 'downstreamEE', 'upstream_to_target_count',
'target_to_downstream_count', 'target_count',
'upstream_to_downstream_count')
### -------- Alternative 5' & 3' splice sites specific columns --------
alt53ss_cols <- c('longExonStart_0base', 'longExonEnd', 'shortES', 'shortEE',
'flankingES', 'flankingEE', 'across_short_boundary_count',
'long_to_flanking_count', 'exclusive_to_long_count',
'short_to_flanking_count')
### -------- Mutually Exclusive Exons specific columns --------
mxe_cols <- c('X1stExonStart_0base', 'X1stExonEnd', 'X2ndExonStart_0base',
'X2ndExonEnd', 'upstreamES', 'upstreamEE', 'downstreamES',
'downstreamEE', 'upstream_to_first_count',
'first_to_downstream_count', 'first_count',
'upstream_to_second_count','second_to_downstream_count',
'second_count')
### -------- Retained Intron specific columns --------
ri_cols <- c('riExonStart_0base', 'riExonEnd', 'upstreamES', 'upstreamEE',
'downstreamES', 'downstreamEE', 'upstream_to_intron_count',
'intron_to_downstream_count', 'intron_count',
'upstream_to_downstream_count')
### -------- READ EVENTS --------
events_df <- list()
for (e in 1:length(events_type)) {
# message('Importing ', events_type[e] )
if ( reads_counts_method == 'JC' ) {
filename_JC <- paste0(events_type[e], '.MATS.JC.txt')
in_path <- file.path(outdir, filename_JC)
} else if ( reads_counts_method == 'JCEC' ) {
filename_JCEC <- paste0(events_type[e], '.MATS.JCEC.txt')
in_path <- file.path(outdir, filename_JCEC)
} else {
stop('The argument reads_counts_method must be either "JC" or "JCEC" !')
}
check_file(in_path)
if (events_type[e] == 'SE') { event_cols <- c(shared_cols, se_cols) }
else if (events_type[e] == 'A5SS') { event_cols <- c(shared_cols, alt53ss_cols) }
else if (events_type[e] == 'A3SS') { event_cols <- c(shared_cols, alt53ss_cols) }
else if (events_type[e] == 'MXE') { event_cols <- c(shared_cols, mxe_cols) }
else if (events_type[e] == 'RI') { event_cols <- c(shared_cols, ri_cols) }
else { stop('Event Type not known, please investigate...') }
event_txt <- read_delim(file = in_path, delim = '\t', col_select = event_cols,
trim_ws = TRUE, n_max = max_lines, progress = FALSE,
name_repair = make.names, show_col_types = FALSE,
locale = locale(decimal_mark = '.', grouping_mark = ',') ) |>
mutate(TYPE = events_type[e], .before = ID) |>
mutate(TYPE = factor(TYPE, levels = c( "SE", "MXE", "A5SS", "A3SS", "RI") ) ) |>
mutate(CLASS = case_when(TYPE %in% c("SE", "MXE") ~ 'Exon',
TYPE %in% c("A5SS", "A3SS") ~ 'Alt.SS',
TYPE == c("RI") ~ 'Intron') ) |>
mutate(CLASS = factor(CLASS, levels = c( "Exon", "Intron", "Alt.SS") ), .after = TYPE ) |>
rename(ensembl_gene_id = GeneID) |>
rename(external_gene_name = geneSymbol) |>
# scale PSIs from 0 to 100
mutate( across( .cols = c(IncLevel1, IncLevel2), .fns = multiply_csv ) ) |>
rename(PSI_1 = IncLevel1) |> rename(PSI_2 = IncLevel2) |>
# scale dPSIs from -100 to 100
rename(dPSI = IncLevelDifference) |>
mutate(dPSI = dPSI * 100) |>
# coerce all counts to character to make them compatible with
# situations in which samples have reps or not and are comma separated values
mutate( across( .cols = c(starts_with('IJC_'), starts_with('SJC_') ), .fns = as.character ) )
events_df[[e]] <- event_txt
}
### -------- PREPARE OUTPUT --------
# fix renamed columns
shared_cols[shared_cols == 'GeneID'] <- 'ensembl_gene_id'
shared_cols[shared_cols == 'geneSymbol'] <- 'external_gene_name'
shared_cols[shared_cols == 'IncLevel1'] <- 'PSI_1'
shared_cols[shared_cols == 'IncLevel2'] <- 'PSI_2'
shared_cols[shared_cols == 'IncLevelDifference'] <- 'dPSI'
shared_cols <- c('TYPE', 'CLASS', shared_cols)
if ( return == 'SHARED_COLS_ONLY' ) { res <- lapply(events_df, function(x) { select(x, all_of(shared_cols)) } ) |> bind_rows() }
else if ( return == 'SE' ) { res <- events_df[[1]] }
else if ( return == 'ALT53SS' ) { res <- rbind( events_df[[2]], events_df[[3]] ) }
else if ( return == 'MXE' ) { res <- events_df[[4]] }
else if ( return == 'RI' ) { res <- events_df[[5]] }
else if ( return == 'ALL' ) { res <- events_df }
return(res)
}
#' Import the rMATS-turbo summary files.
#'
#' @description
#' This function imports the numbers of total and significant AS events quantified with both JC and JCEC.
#'
#' @param outdir Character. Path/to/the/directory/containing/rMATS/output/files.
#'
#' @return A tibble
#' @importFrom tidyr pivot_longer
#' @importFrom dplyr mutate starts_with
#' @importFrom stringr str_extract str_remove
#' @export
#'
#' @seealso [plot_rmats_summary()]
#'
#' @examples
#' # read_rmats_summary(outdir = "path/to/rmats/output" ) |>
#' # plot_rmats_summary()
read_rmats_summary <- function(outdir) {
# If needed this are the columns
# summary_cols <- c('EventType',
# 'EventTypeDescription',
# 'TotalEventsJC',
# 'TotalEventsJCEC',
# 'SignificantEventsJC',
# 'SigEventsJCSample1HigherInclusion',
# 'SigEventsJCSample2HigherInclusion',
# 'SignificantEventsJCEC',
# 'SigEventsJCECSample1HigherInclusion',
# 'SigEventsJCECSample2HigherInclusion')
summary_path <- file.path(outdir, 'summary.txt')
check_file(summary_path)
smmry_nums <- read.delim(file = summary_path, header = T, sep = '\t', quote = '')
smmry_nums |>
pivot_longer(cols = !starts_with('Event'), names_to = 'Stat', values_to = 'Count') |>
mutate(Method = str_extract(string = Stat, pattern = 'JC(?:EC)?'), .before = Count) |>
mutate(Group = str_remove(string = Stat, pattern = 'JC(?:EC)?'), .before = Count) |>
mutate(Group = gsub(pattern = 'SigEventsSample1HigherInclusion', replacement = 'SigEvents_HigherS1', x = Group)) |>
mutate(Group = gsub(pattern = 'SigEventsSample2HigherInclusion', replacement = 'SigEvents_HigherS2', x = Group)) |>
mutate(Group = factor(Group, levels = c( "TotalEvents", "SignificantEvents", "SigEvents_HigherS1", "SigEvents_HigherS2") ) ) |>
mutate(EventType = factor(EventType, levels = c( "SE", "MXE", "A5SS", "A3SS", "RI") ) ) -> tidy_smmry
return(tidy_smmry)
}
#' Plot the number of AS events quantified with rMATS-turbo
#'
#' @param data A data.frame generated with `read_rmats_summary()`
#' @param legend_position The relative location of the legend in the plot. Passed to `theme(legend.position.inside)`
#'
#' @return A ggplot
#' @import ggplot2
#' @export
#'
#' @seealso [read_rmats_summary()]
#'
#' @examples
#' # read_rmats_summary(outdir = "path/to/rmats/output" ) |>
#' # plot_rmats_summary()
plot_rmats_summary <- function(data, legend_position = c(0.8, 0.8)) {
ggplot(data) +
aes(x = Group, y = Count, group = Method, fill = Method) +
facet_grid(~EventType) +
geom_col(position = position_dodge(width = 0.75), width = 0.7, colour = 'black', linewidth = 0.2) +
scale_y_continuous(expand = expansion(mult = c(0, 0.05), add = 0), n.breaks = 10) +
scale_fill_manual(values = c('darkorchid1', 'forestgreen')) +
theme_classic(base_family = 'Arial', base_size = 6) +
theme(plot.background = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(colour = 'black', fill = NA),
panel.grid.major.y = element_line(colour = 'grey84', linewidth = 0.2),
panel.spacing.x = unit(0.75, units = 'mm'),
panel.spacing.y = unit(1, units = 'mm'),
strip.background = element_blank(),
strip.text = element_text(colour = 'black'),
strip.text.x = element_text(margin = margin(b = -1)),
strip.text.y = element_text(margin = margin(l = -1)),
axis.line = element_line(linewidth = 0.5, colour = 0.5),
axis.ticks = element_line(linewidth = 0.2, colour = 'black'),
axis.ticks.x = element_blank(),
axis.ticks.length = unit(1, 'mm'),
axis.text = element_text(colour = 'black'),
axis.text.x = element_text(angle = 45, hjust = 1, margin = margin(t = -2)),
axis.text.y = element_text(margin = margin(r = 0)),
axis.title = element_blank(),
legend.title = element_blank(),
legend.background = element_blank(),
legend.position = 'inside',
legend.position.inside = legend_position,
legend.key.size = unit(1, 'mm'),
legend.key = element_blank(),
legend.key.height = unit(1, 'mm'),
legend.key.width = unit(1, 'mm') )
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.