R/calculations.R

subtract_heatmap_background <- function(gr, bg){
  data_columns <- c("fu3","tu3","fu2","tu2","fu1","tu1",
                    "fd1","td1","fd2","td2","fd3","td3")
  gr[data_columns] <- gr[data_columns] - bg[data_columns]
  #results22[c("fu3","tu3","fu2","tu2","fu1","tu1","fd1","td1","fd2","td2","fd3","td3")] <- results22[,c("fu3","tu3","fu2","tu2","fu1","tu1","fd1","td1","fd2","td2","fd3","td3")] - results20[,c("fu3","tu3","fu2","tu2","fu1","tu1","fd1","td1","fd2","td2","fd3","td3")]
  return(gr)
}

calculate_heatmaps <- function(gr, length=NULL, strand=NULL){
  # Filter by length if necessary
  if(!is.null(length))
    gr <- gr[width(gr)==length]
  # Filter by strand if necessary
  if(!is.null(strand))
    gr <- gr[strand(gr)==strand]

  # Names of columns for positions at the 5' end
  five_positions <- c("fu3", "fu2", "fu1", "fd1", "fd2", "fd3")
  # Names of columns for positions at the 3' end
  three_positions <- c("tu3", "tu2", "tu1", "td1", "td2", "td3")

  # Get the nucleotide data for the 5' end
  five_raw <- as.data.table(mcols(gr)[,c("five", five_positions)])
  # Get the nucleotide data for the 3' end
  three_raw <- as.data.table(mcols(gr)[,c("three", three_positions)])

  # Get the total number of reads for each base at 5' end and rename columns
  five_n <- count(group_by(.data = five_raw,  five))
  names(five_n) <- c("X", "five_n")

  # Get the total number of reads for each base at 3' end and rename columns
  three_n <- count(group_by(.data = three_raw,  three))
  names(three_n) <- c("X", "three_n")

  # Create the results data frame with X and Y columns and the counts per base
  results <- data.table("X" = rep(c("A", "C", "G", "T"), each = 4), "Y" = rep(c("A", "C", "G", "T"), 4))
  results <- left_join(x = results, y = five_n, by = "X")
  results <- left_join(x = results, y = three_n, by = "X")

  # Loop through 5' positions, calculate percentages and add to results
  for(i in 1:6){
    # Count the bases per each 5' end, then set the names
    z <- count(group_by_at(.tbl = five_raw, .vars = vars(five, five_positions[i])))
    names(z) <- c("X", "Y", five_positions[i])
    # Merge the raw counts with the results
    results <- merge.data.frame(x = results, y = z, by = c("X", "Y"), all.x = TRUE)
    # Replace any NA values with 0
    results[is.na(results[five_positions[i]]), five_positions[i]] <- 0
    # Replace the raw counts with the ratio per 5' base
    results[five_positions[i]] <- results[five_positions[i]] / results$five_n
  }

  # Loop through 3' positions, calculate percentages and add to results
  for(i in 1:6){
    z <- count(group_by_at(.tbl = three_raw, .vars = vars(three, three_positions[i])))
    names(z) <- c("X", "Y", three_positions[i])
    results <- merge.data.frame(x = results, y = z, by = c("X", "Y"), all.x = TRUE)
    results[is.na(results[three_positions[i]]), three_positions[i]] <- 0
    results[three_positions[i]] <- results[three_positions[i]] / results$three_n
  }
  return(results)
}

count_overlaps_by_width <- function(gr, regions, overlap = "sense", normalized=FALSE){
  widths <- sort(base::unique(width(gr)))
  results <- as.data.frame(matrix(ncol=length(widths), nrow=length(regions)))
  names(results) <- widths
  original_region_strand <-as.character(strand(regions))
  if (overlap=="sense")
    ignore_strand <- FALSE
  if (overlap=="both")
    ignore_strand <- TRUE
  if (overlap=="antisense"){
    ignore_strand <- FALSE
    strand(regions) <- invert_vector(as.character(strand(regions)))
  }
  if (normalized==TRUE)
    interval_widths <- (end(regions) - start(regions))
  for (i in 1:length(widths)){
    results[,i] <- GenomicRanges::countOverlaps(subject = gr[width(gr)==widths[i]],
                                            query = regions,
                                            type = "any",
                                            ignore.strand=ignore_strand)
    if (normalized==TRUE)
      results[,i] <- results[,i] / (mcols(regions)$exon_bp/100)
  }
  return(cbind.data.frame("Gene_strand"=original_region_strand, results))
}

count_overlaps_by_width_and_base <- function(gr, regions, alignment_width, base_col, overlap = "sense", normalized=FALSE){
  gr <- gr[width(gr)==alignment_width]
  bases <- sort(base::unique(mcols(gr)[[base_col]]))
  results <- as.data.frame(matrix(ncol=length(bases), nrow=length(regions)))
  names(results) <- bases
  original_region_strand <-as.character(strand(regions))
  if (overlap=="sense")
    ignore_strand <- FALSE
  if (overlap=="both")
    ignore_strand <- TRUE
  if (overlap=="antisense"){
    ignore_strand <- FALSE
    strand(regions) <- invert_vector(as.character(strand(regions)))
  }
  if (normalized==TRUE)
    interval_widths <- (end(regions) - start(regions))
  for (i in 1:length(bases)){
    results[,i] <- GenomicRanges::countOverlaps(subject = gr[mcols(gr)[[base_col]]==bases[i]],
                                                query = regions,
                                                type = "any",
                                                ignore.strand=ignore_strand)
    if (normalized==TRUE)
      results[,i] <- results[,i] / interval_widths
  }
  return(cbind.data.frame("Gene_strand"=original_region_strand, results))
}


calculate_offsets <- function(gr, primary_length, overlap_type="sense", maximum_offset=10){
  # If the type is 'sense', the 5' ends of reads from the same strands are compared
  if(overlap_type=="sense"){
    strands <- list(c("+", "+"), c("-", "-"))
    ignore_strand = FALSE
  }
  # If the type is 'antisense', the 5' end of the primary set is compared to the 3' end of the secondary set
  if(overlap_type=="antisense"){
    strands <- list(c("+", "-"), c("-", "+"))
    ignore_strand = TRUE
  }
  # Make the 4 sets of GenomicRanges
  set1_1 <- granges(gr[width(gr)==primary_length & strand(gr)==strands[[1]][1]])
  set1_2 <- granges(gr[width(gr)!=primary_length & strand(gr)==strands[[1]][2]])
  set2_1 <- granges(gr[width(gr)==primary_length & strand(gr)==strands[[2]][1]])
  set2_2 <- granges(gr[width(gr)!=primary_length & strand(gr)==strands[[2]][2]])

  # Find the overlaps that are within the maximum offset
  set1 <- GenomicRanges::findOverlaps(query = set1_1, subject = set1_2, maxgap = maximum_offset, type = "start", select = "all", ignore.strand = ignore_strand)
  set2 <- GenomicRanges::findOverlaps(query = set2_1, subject = set2_2, maxgap = maximum_offset, type = "end", select = "all", ignore.strand = ignore_strand)

  # Use the 'hits' lists to make list of the GenomicRanges
  pos_query_grs <- set1_1[queryHits(set1)]
  pos_subject_grs <- set1_2[subjectHits(set1)]
  neg_query_grs <- set2_1[queryHits(set2)]
  neg_subject_grs <- set2_2[subjectHits(set2)]

  # Create a data frame with the values from each strand, and bind them into a single data frame
  total_results <- rbind.data.frame(data.frame("offsets" = start(pos_subject_grs) - start(pos_query_grs),
                                               "widths"= width(pos_subject_grs),
                                               #"chromosomes"= seqnames(pos_subject_grs),
                                               "strands"= strand(pos_query_grs)),
                                    data.frame("offsets" = end(neg_query_grs) - end(neg_subject_grs),
                                               "widths"= width(neg_subject_grs),
                                               #"chromosomes"= seqnames(neg_subject_grs),
                                               "strands"= strand(neg_query_grs)))

  # Count the unique rows and return a data table
  # res_1 <- data.table(dplyr::count(x = total_results, offsets, widths, chromosomes, strands, sort = TRUE))
  # res_2 <- data.table(dplyr::count(x = total_results, widths, chromosomes, strands, sort = TRUE))
  # results <- dplyr::left_join(x = res_1, y = res_2, by = c("widths", "chromosomes", "strands"), copy = TRUE, suffix = c(".x", ".y"))

  res_1 <- data.table(dplyr::count(x = total_results, offsets, widths, strands, sort = TRUE))
  res_2 <- data.table(dplyr::count(x = total_results, widths, strands, sort = TRUE))
  results <- dplyr::left_join(x = res_1, y = res_2, by = c("widths", "strands"), copy = TRUE, suffix = c(".x", ".y"))


  results["ratio"] <- results$n.x / results$n.y
  return(results)
}


find_overlaps <- function(A, B){
  a <- subsetByOverlaps(query = A, subject = B, maxgap = 10, minoverlap = 1, type = "start", invert = FALSE)
  return(start(A) - start(a))
}

find_minimum <- function(A, B){
  return(B[which.min(abs(B-A))]-A)
}

calculate_phasing <- function(gr, length=26, start_base=c("A|C|T|G")){
  gr <- sort.GenomicRanges(gr)
  filtered_gr <- gr[width(gr) %like% length & mcols(gr)$five %like% start_base]
  plus <- start(filtered_gr[strand(filtered_gr)=="+"])
  plus1 <- c(plus[1], plus)[1:length(plus)]
  plus2 <- c(plus[1], plus[1], plus)[1:length(plus)]
  plus3 <- c(plus[1], plus[1], plus[1], plus)[1:length(plus)]
  plus_first <- plus2 - plus3
  plus_first <- plus_first[plus_first >= 1 & plus_first <= 50]
  plus_second <- plus1 - plus3
  plus_second <- plus_second[plus_second >= 1 & plus_second <= 50]
  plus_third <- plus - plus3
  plus_third <- plus_third[plus_third >= 1 & plus_third <= 50]

  minus <- rev(end(filtered_gr[strand(filtered_gr)=="-"]))
  minus1 <- c(minus[1], minus)[1:length(minus)]
  minus2 <- c(minus[1], minus[1], minus)[1:length(minus)]
  minus3 <- c(minus[1], minus[1], minus[1], minus)[1:length(minus)]
  minus_first <- minus3 - minus2
  minus_first <- minus_first[minus_first >= 1 & minus_first <= 50]
  minus_second <- minus3 - minus1
  minus_second <- minus_second[minus_second >= 1 & minus_second <= 50]
  minus_third <- minus3 - minus
  minus_third <- minus_third[minus_third >= 1 & minus_third <= 50]

  return(list("plus_first"=plus_first,
              "plus_second"=plus_second,
              "plus_third"=plus_third,
              "minus_first"=minus_first,
              "minus_second"=minus_second,
              "minus_third"=minus_third))
}

calculate_exonic_bp_of_gene <- function(exons, genes){
  # Create a data frame with the start and end and gene ID columns
  df <- data.frame(start=start(exons),
                   end=end(exons),
                   ensembl_gene_id=mcols(exons)$ensembl_gene_id, stringsAsFactors = FALSE)

  # Aggregate the start and end position by gene ID, and Name the columns
  df_agg <- aggregate(x = list(df$start, df$end),
                      by=list(df$ensembl_gene_id, df$ensembl_gene_id),
                      FUN=c)
  names(df_agg) <- c("ensembl_gene_id", "ensembl_gene_id", "starts", "ends")

  # With the helper function sum_exons, sum up the total number of positions for each gene
  df_agg$exon_bp <- mapply(df_agg$starts,
                      df_agg$ends,
                      FUN = sum_exons,
                      USE.NAMES = FALSE)
  # Create a new set of metadata columns by joining the new counts with the old gene IDs
  new_mcols <- inner_join(x = as.data.frame(mcols(genes)),
                          y = df_agg[,c(2,5)],
                          by = c("ensembl_gene_id"))
  # Add the new columns to the gene intervals
  mcols(genes) <- new_mcols
  return(genes)
}

sum_exons <- function(starts, ends){
  # initialize the list of positions
  positions <- NULL
  # Loop through each start and end and add each position to the list
  for(i in 1:length(starts)){
    positions <- c(positions, starts[i]:ends[i])
  }
  # return the length of the list of unique positions
  return(length(unique(positions)))
}

#gr <- sort.GenomicRanges(two_mm[sample(x = 1:3499785, size = 1000000, replace = FALSE)])

#p <- ggplot(data = a, mapping = aes(x = offsets, by = as.factor(a$widths))) + geom_freqpoly(binwidth = 1)
#p <- ggplot2::ggplot() + geom_line(mapping = aes(x = results$offsets, y = results$n, group = results$widths, color = results$widths))

#ggplot(data=results3, aes(x=offsets, y=ratio)) + geom_line(aes(group = widths), color=results3$widths, inherit.aes = TRUE, show.legend = TRUE) + facet_grid(chromosomes~strands) + ggplot2::theme(legend.position = "right")

adjust_ggplot_units <- function(maxWidths, x1, x2){
  widths <- as.numeric(maxWidth2[[2]][[1]][[2]])
  y1 <- maxWidth[[2]][[1]][[1]]*(x1/x2)
  y2 <- maxWidth[[2]][[2]][[1]]
  new_units <- unit.pmax(unit(x = list(y1), units = list("cm")),
                         unit(x = list(y2), units = list("cm")))
  return(new_units)
}
alb202/PACER documentation built on May 16, 2019, 10:05 a.m.