R/normalise_junc.R

Defines functions normalise_junc .get_gr_for_start_end test_normalise_junc

Documented in .get_gr_for_start_end normalise_junc test_normalise_junc

#' Normalise junction counts for RNA-seq samples
#'
#' \code{normalise_junc} takes as input the junction co-ordinates and counts for
#' samples and normalises the counts for each sample using total number of
#' counts for junctions that share an start or end with that junction
#'
#' @param junc_coords gr. Junction co-ordinates in a GRanges format
#' @param raw_counts df. Detailing the counts for each junction for all
#'   samples. Each column is a sample and each row corresponding to a junction.
#' @param use_strand lgl scalar. Whether to use the strand information when
#'   clustering junctions.
#'
#' @return list. Raw and normalised counts including metdata detailing
#'   co-ordinates and clusters.
#' @export
normalise_junc <- function(junc_coords, raw_counts, use_strand = T){

  print(stringr::str_c(Sys.time(), (" - Clustering junctions...")))

  junc_coords_sep_start_end <- .get_gr_for_start_end(junc_coords)

  junc_start_hits <-
    GenomicRanges::findOverlaps(query = junc_coords_sep_start_end$start,
                                subject = junc_coords_sep_start_end$start,
                                type = "equal",
                                ignore.strand = !(use_strand))

  junc_end_hits <-
    GenomicRanges::findOverlaps(query = junc_coords_sep_start_end$end,
                                subject = junc_coords_sep_start_end$end,
                                type = "equal",
                                ignore.strand = !(use_strand))

  junc_start_end_hits_subject <-
    c(S4Vectors::subjectHits(junc_start_hits),
      S4Vectors::subjectHits(junc_end_hits))

  junc_start_end_hits_query <-
    c(S4Vectors::queryHits(junc_start_hits),
      S4Vectors::queryHits(junc_end_hits))

  # split the subject hits grouped by query hits
  # each junc will be one element of the list and contain the indexes for the other junctions it shares
  # either a start or an end position with
  junc_clusters <-
    S4Vectors::split(x = junc_start_end_hits_subject,
                     f = junc_start_end_hits_query) %>%
    IntegerList() %>%
    unique()

  # add clusters and indexes to coords
  junc_coords$index <- 1:length(junc_coords)
  junc_coords$junc_clusters <- junc_clusters

  stopifnot(length(junc_clusters) == nrow(raw_counts))

  print(stringr::str_c(Sys.time(), (" - Normalising junc counts...")))

  raw_counts_norm_mat <- matrix(ncol = ncol(raw_counts),
                                 nrow = nrow(raw_counts))

  junc_clusters_subjects <- junc_clusters %>% unlist()
  junc_clusters_query <- junc_clusters %>% unlist() %>% names()

  # for every column/sample
  # generate a vector of the total sum of counts juncs with overlapping start or ends
  # divide to normalise
  for(i in 1:ncol(raw_counts)){

   cluster_total_counts <-
    raw_counts[[i]][junc_clusters_subjects] %>%
      S4Vectors::split(f = junc_clusters_query) %>%
      lapply(FUN = sum) %>%
      unlist()

   # need to reorder the names since they are converted to chr and then sorted
   # otherwise error in order of the clusters
   cluster_total_counts <- cluster_total_counts[names(cluster_total_counts) %>% as.integer() %>% order()]

   stopifnot(identical(as.integer(names(cluster_total_counts)), 1:nrow(raw_counts)))

   raw_counts_norm_mat[,i] <-
     raw_counts[[i]]/cluster_total_counts

  }

  colnames(raw_counts_norm_mat) <- colnames(raw_counts)
  raw_counts_norm_df <- raw_counts_norm_mat %>% as.data.frame() %>% dplyr::as_tibble()

  junc_raw_norm_metadata_list <-
    list(raw = raw_counts %>%
           dplyr::mutate(index = row_number()) %>%
           dplyr::select(index, everything()),
         norm = raw_counts_norm_df %>%
           dplyr::mutate(index = row_number()) %>%
           dplyr::select(index, everything()),
         metadata = junc_coords)

  print(stringr::str_c(Sys.time(), (" - done!")))

  return(junc_raw_norm_metadata_list)

}

#' Seperates gr by start and end
#'
#' \code{.get_gr_for_start_end} takes a gr object and generates 2, one
#' containing only the precise start and the other, the precise end
#'
#' @param gr gr. Any genomic ranges object.
#'
#' @return list. 2 grs, each with 1 range corresponding to every range in the
#'   input. One containing precise start positions, the other precise ends
.get_gr_for_start_end <- function(gr){

  gr_start <- gr
  GenomicRanges::end(gr_start) <- GenomicRanges::start(gr_start)
  gr_end <- gr
  GenomicRanges::start(gr_end) <- GenomicRanges::end(gr_end)

  gr_start_end_list <- list(start = gr_start, end = gr_end)

  return(gr_start_end_list)

}

#' Tests whether the normalise_junc function has worked as expected
#'
#' \code{test_normalise_junc} takes in the raw and norm counts in a list
#' outputted from \code{normalise_junc} and tests whether all normalised values
#' are between 0-1. Then for \code{n_samples_to_test}, check
#'
#' @param raw_norm_counts_metadata list. Returned from \code{normalise_junc}.
#' @param n_samples_to_test int. Number of random samples to check
#'   clustering/normalisation.
#'
#'
#' @return lgl. Should be TRUE if normalisation has performed accurately.
#' @export
test_normalise_junc <- function(raw_norm_counts_metadata, n_samples_to_test = 5, n_juncs_to_test = 20, junc_coords, raw_counts){

  raw_norm_counts_metadata$norm[is.na(raw_norm_counts_metadata$norm)] <- 0

  # any counts outside the range of 0-1?
  norm_counts_0_to_1 <-
    raw_norm_counts_metadata$norm %>%
    dplyr::select(-index) %>%
    lapply(FUN = function(x){ any(x < 0 | x > 1) }) %>%
    unlist() %>%
    any()

  sample_indexes_to_test <- sample(x = 2:ncol(raw_norm_counts_metadata$norm %>% dplyr::select(-index)), n_samples_to_test)

  sample_test_eval <- logical(length = n_samples_to_test)

  for(i in seq_along(sample_indexes_to_test)){

    sample_index_to_test <- sample_indexes_to_test[i]

    junc_indexes_to_test <- sample(x = 1:nrow(raw_norm_counts_metadata$norm), n_juncs_to_test)

    junc_test_eval <- logical(length = n_juncs_to_test)

    for(j in seq_along(junc_indexes_to_test)){

      junc_index_to_test <- junc_indexes_to_test[j]

      # check if the start end clustering has gotten the right number of juncs
      junc_to_test <- raw_norm_counts_metadata$metadata[junc_index_to_test]

      pred_cluster <- which(as.vector(GenomicRanges::seqnames(junc_coords) == GenomicRanges::seqnames(junc_to_test) &
                                        GenomicRanges::start(junc_coords) == GenomicRanges::start(junc_to_test) |
                                        GenomicRanges::end(junc_coords) == GenomicRanges::end(junc_to_test)))
      actual_cluster <- junc_to_test$junc_clusters %>% unlist() %>% unname()

      # check if the cluster normalised count matches
      cluster_total_raw_counts <- raw_norm_counts_metadata$raw[actual_cluster, sample_index_to_test] %>% unlist() %>% sum()
      pred_norm_count <- raw_norm_counts_metadata$raw[[junc_index_to_test, sample_index_to_test]]/cluster_total_raw_counts
      actual_norm_count <- raw_norm_counts_metadata$norm[[junc_index_to_test, sample_index_to_test]]

      if(is.na(pred_norm_count)) pred_norm_count <- 0

      junc_test_eval[j] <- (identical(sort(pred_cluster), sort(pred_cluster)) && pred_norm_count == actual_norm_count)

    }

    sample_test_eval[i] <- all(junc_test_eval)

  }

  # check ranges are in the same order and counts are
  identical_ranges <- (identical(raw_norm_counts_metadata$metadata %>% GenomicRanges::ranges(), junc_coords %>% GenomicRanges::ranges()) &&
                         identical(GenomicRanges::seqnames(raw_norm_counts_metadata$metadata), GenomicRanges::seqnames(junc_coords)))
  identical_raw_counts <- identical(raw_norm_counts_metadata$raw %>% dplyr::select(-index), raw_counts)

  return(c(!norm_counts_0_to_1, all(sample_test_eval), identical_ranges, identical_raw_counts))

}
dzhang32/RNAdiagnosR documentation built on Dec. 5, 2019, 2 a.m.