#' Normalise junction counts for RNA-seq samples
#'
#' \code{normalise_junc} takes as input the junction co-ordinates and counts for
#' samples and normalises the counts for each sample using total number of
#' counts for junctions that share an start or end with that junction
#'
#' @param junc_coords gr. Junction co-ordinates in a GRanges format
#' @param raw_counts df. Detailing the counts for each junction for all
#' samples. Each column is a sample and each row corresponding to a junction.
#' @param use_strand lgl scalar. Whether to use the strand information when
#' clustering junctions.
#'
#' @return list. Raw and normalised counts including metdata detailing
#' co-ordinates and clusters.
#' @export
normalise_junc <- function(junc_coords, raw_counts, use_strand = T){
print(stringr::str_c(Sys.time(), (" - Clustering junctions...")))
junc_coords_sep_start_end <- .get_gr_for_start_end(junc_coords)
junc_start_hits <-
GenomicRanges::findOverlaps(query = junc_coords_sep_start_end$start,
subject = junc_coords_sep_start_end$start,
type = "equal",
ignore.strand = !(use_strand))
junc_end_hits <-
GenomicRanges::findOverlaps(query = junc_coords_sep_start_end$end,
subject = junc_coords_sep_start_end$end,
type = "equal",
ignore.strand = !(use_strand))
junc_start_end_hits_subject <-
c(S4Vectors::subjectHits(junc_start_hits),
S4Vectors::subjectHits(junc_end_hits))
junc_start_end_hits_query <-
c(S4Vectors::queryHits(junc_start_hits),
S4Vectors::queryHits(junc_end_hits))
# split the subject hits grouped by query hits
# each junc will be one element of the list and contain the indexes for the other junctions it shares
# either a start or an end position with
junc_clusters <-
S4Vectors::split(x = junc_start_end_hits_subject,
f = junc_start_end_hits_query) %>%
IntegerList() %>%
unique()
# add clusters and indexes to coords
junc_coords$index <- 1:length(junc_coords)
junc_coords$junc_clusters <- junc_clusters
stopifnot(length(junc_clusters) == nrow(raw_counts))
print(stringr::str_c(Sys.time(), (" - Normalising junc counts...")))
raw_counts_norm_mat <- matrix(ncol = ncol(raw_counts),
nrow = nrow(raw_counts))
junc_clusters_subjects <- junc_clusters %>% unlist()
junc_clusters_query <- junc_clusters %>% unlist() %>% names()
# for every column/sample
# generate a vector of the total sum of counts juncs with overlapping start or ends
# divide to normalise
for(i in 1:ncol(raw_counts)){
cluster_total_counts <-
raw_counts[[i]][junc_clusters_subjects] %>%
S4Vectors::split(f = junc_clusters_query) %>%
lapply(FUN = sum) %>%
unlist()
# need to reorder the names since they are converted to chr and then sorted
# otherwise error in order of the clusters
cluster_total_counts <- cluster_total_counts[names(cluster_total_counts) %>% as.integer() %>% order()]
stopifnot(identical(as.integer(names(cluster_total_counts)), 1:nrow(raw_counts)))
raw_counts_norm_mat[,i] <-
raw_counts[[i]]/cluster_total_counts
}
colnames(raw_counts_norm_mat) <- colnames(raw_counts)
raw_counts_norm_df <- raw_counts_norm_mat %>% as.data.frame() %>% dplyr::as_tibble()
junc_raw_norm_metadata_list <-
list(raw = raw_counts %>%
dplyr::mutate(index = row_number()) %>%
dplyr::select(index, everything()),
norm = raw_counts_norm_df %>%
dplyr::mutate(index = row_number()) %>%
dplyr::select(index, everything()),
metadata = junc_coords)
print(stringr::str_c(Sys.time(), (" - done!")))
return(junc_raw_norm_metadata_list)
}
#' Seperates gr by start and end
#'
#' \code{.get_gr_for_start_end} takes a gr object and generates 2, one
#' containing only the precise start and the other, the precise end
#'
#' @param gr gr. Any genomic ranges object.
#'
#' @return list. 2 grs, each with 1 range corresponding to every range in the
#' input. One containing precise start positions, the other precise ends
.get_gr_for_start_end <- function(gr){
gr_start <- gr
GenomicRanges::end(gr_start) <- GenomicRanges::start(gr_start)
gr_end <- gr
GenomicRanges::start(gr_end) <- GenomicRanges::end(gr_end)
gr_start_end_list <- list(start = gr_start, end = gr_end)
return(gr_start_end_list)
}
#' Tests whether the normalise_junc function has worked as expected
#'
#' \code{test_normalise_junc} takes in the raw and norm counts in a list
#' outputted from \code{normalise_junc} and tests whether all normalised values
#' are between 0-1. Then for \code{n_samples_to_test}, check
#'
#' @param raw_norm_counts_metadata list. Returned from \code{normalise_junc}.
#' @param n_samples_to_test int. Number of random samples to check
#' clustering/normalisation.
#'
#'
#' @return lgl. Should be TRUE if normalisation has performed accurately.
#' @export
test_normalise_junc <- function(raw_norm_counts_metadata, n_samples_to_test = 5, n_juncs_to_test = 20, junc_coords, raw_counts){
raw_norm_counts_metadata$norm[is.na(raw_norm_counts_metadata$norm)] <- 0
# any counts outside the range of 0-1?
norm_counts_0_to_1 <-
raw_norm_counts_metadata$norm %>%
dplyr::select(-index) %>%
lapply(FUN = function(x){ any(x < 0 | x > 1) }) %>%
unlist() %>%
any()
sample_indexes_to_test <- sample(x = 2:ncol(raw_norm_counts_metadata$norm %>% dplyr::select(-index)), n_samples_to_test)
sample_test_eval <- logical(length = n_samples_to_test)
for(i in seq_along(sample_indexes_to_test)){
sample_index_to_test <- sample_indexes_to_test[i]
junc_indexes_to_test <- sample(x = 1:nrow(raw_norm_counts_metadata$norm), n_juncs_to_test)
junc_test_eval <- logical(length = n_juncs_to_test)
for(j in seq_along(junc_indexes_to_test)){
junc_index_to_test <- junc_indexes_to_test[j]
# check if the start end clustering has gotten the right number of juncs
junc_to_test <- raw_norm_counts_metadata$metadata[junc_index_to_test]
pred_cluster <- which(as.vector(GenomicRanges::seqnames(junc_coords) == GenomicRanges::seqnames(junc_to_test) &
GenomicRanges::start(junc_coords) == GenomicRanges::start(junc_to_test) |
GenomicRanges::end(junc_coords) == GenomicRanges::end(junc_to_test)))
actual_cluster <- junc_to_test$junc_clusters %>% unlist() %>% unname()
# check if the cluster normalised count matches
cluster_total_raw_counts <- raw_norm_counts_metadata$raw[actual_cluster, sample_index_to_test] %>% unlist() %>% sum()
pred_norm_count <- raw_norm_counts_metadata$raw[[junc_index_to_test, sample_index_to_test]]/cluster_total_raw_counts
actual_norm_count <- raw_norm_counts_metadata$norm[[junc_index_to_test, sample_index_to_test]]
if(is.na(pred_norm_count)) pred_norm_count <- 0
junc_test_eval[j] <- (identical(sort(pred_cluster), sort(pred_cluster)) && pred_norm_count == actual_norm_count)
}
sample_test_eval[i] <- all(junc_test_eval)
}
# check ranges are in the same order and counts are
identical_ranges <- (identical(raw_norm_counts_metadata$metadata %>% GenomicRanges::ranges(), junc_coords %>% GenomicRanges::ranges()) &&
identical(GenomicRanges::seqnames(raw_norm_counts_metadata$metadata), GenomicRanges::seqnames(junc_coords)))
identical_raw_counts <- identical(raw_norm_counts_metadata$raw %>% dplyr::select(-index), raw_counts)
return(c(!norm_counts_0_to_1, all(sample_test_eval), identical_ranges, identical_raw_counts))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.