R/add_repeat_maker_table.R

Defines functions add_repeat_masker_table

Documented in add_repeat_masker_table

#' Add repeat masker table annotation
#'
#' Adds RepeatMasker information to every peak annotation table in `samples`.
#' For each peak, the first RepeatMasker element that overlaps it by at least
#' 3 bp is looked up and three columns are appended to the peak table:
#' `repetitive_sequence` (`"yes"` when an overlapping repeat was found, `NA`
#' otherwise), `rep_name` and `rep_class` (taken from the matching row of the
#' repeat masker table, `NA` when there is no overlap).
#'
#' @param samples list of annotated peaks; each element must be convertible
#'   with `GRanges()` and combinable with `cbind()` (e.g. a data.frame with
#'   `seqnames`, `start` and `end` columns). The list may be named or unnamed.
#' @param table_folder folder where the repeat masker table is, default is
#'   'utils'. The file must be called `repeat_masker_table`, be tab separated
#'   with a header line, and hold seven columns in this order: chromosome,
#'   start, end, strand, repeat name, repeat class, repeat family.
#' @return `samples`, with the three repeat annotation columns appended to
#'   each element. Names and order of the list are preserved.
#' @keywords repeat masker table
#' @import data.table tidyverse GenomicRanges
#' @export
#' @examples
#' \dontrun{
#' add_repeat_masker_table(samples, table_folder)
#' }
add_repeat_masker_table <- function(samples, table_folder = "utils") {
  # Repeat masker annotation table downloaded from the UCSC Table Browser:
  # https://genome.ucsc.edu/cgi-bin/hgTables?hgsid=861624999_IsHRXOsBEeXzFUKTc2Yxnwu3wDiL&clade=mammal&org=Human&db=hg38&hgta_group=allTracks&hgta_track=rmsk&hgta_table=0&hgta_regionType=genome&position=chr12%3A54%2C594%2C719-54%2C595%2C565&hgta_outputType=selectedFields&hgta_outFileName=repeat_masker_table
  path <- file.path(table_folder, "repeat_masker_table")
  if (!file.exists(path)) {
    # Fail with the offending path instead of a low-level connection error.
    stop("repeat masker table not found: ", path, call. = FALSE)
  }
  repeat_masker_table <- read.table(file = path, sep = "\t", header = TRUE)
  # NOTE(review): read.table() keeps its default comment.char = "#"; if the
  # downloaded header line starts with "#", the first data row is consumed as
  # the header -- confirm against the actual file.
  names(repeat_masker_table) <- c(
    "seqnames", "start", "end", "strand",
    "rep_name", "rep_class", "rep_family"
  )

  # Converted once, reused for every sample.
  repeat_ranges <- GRanges(repeat_masker_table)

  # `samples[] <-` keeps the names/attributes of the input list and, unlike a
  # loop over names(samples), also annotates unnamed lists.
  samples[] <- lapply(samples, function(peaks) {
    # select = "first" returns one subject index per peak (NA when no repeat
    # overlaps it), so the result is already aligned with the rows of `peaks`.
    first_hit <- findOverlaps(
      GRanges(peaks), repeat_ranges,
      minoverlap = 3, select = "first"
    )

    # Always a character column, even when no peak (or every peak) overlaps.
    is_repeat <- rep(NA_character_, length(first_hit))
    is_repeat[!is.na(first_hit)] <- "yes"

    # Indexing with NA yields NA, which is exactly the "no overlap" value.
    # Building a fresh data.frame gives automatic row names, so cbind() keeps
    # the row names of `peaks` instead of the indices of the repeat table.
    repeat_info <- data.frame(
      repetitive_sequence = is_repeat,
      rep_name = repeat_masker_table$rep_name[first_hit],
      rep_class = repeat_masker_table$rep_class[first_hit],
      stringsAsFactors = FALSE
    )

    cbind(peaks, repeat_info)
  })

  samples
}
riccardo-trozzo/BlissR documentation built on Aug. 1, 2020, 12:23 a.m.