IRFinder: Vector integrated region analysis package (IRFinder)

Documented in annoByGene

#' @title Annotate the vector integrated site by genes and transcription start sites (TSS).
#' 
#' @description This function uses Gene and TSS database to search the feature of integrated regions. 
#'              User can get query sequence inserted in gene and distribution graphes from Gene and TSS coordinations.
#'              Plus, user can do random distribution analysis by this function.
#' 
#' @usage annoByGene(hits, randomSet = NULL, mapTool = 'blast', organism = 'hg19', interval = 5000, range = c(-20000, 20000),
#`        outpath = '~', dbPath = paste0(.libPaths()[1], '/IRFinder/extdata'))
#' 
#' @param hits a GR object. This object made from *makeInputs* function.
#' @param randomSet a string vector. Type path to load a random set. 
#'                  If this value is null, random distribution analysis is not executed.
#' @param mapTool a character. Function serve two types of file such as outputs from BLAST and BLAT.
#'                Default is 'blast'. If you want to use BLAT output, use 'blast'.
#' @param organism a single character. This function serves 3 versions of organisms such as hg19, hg38 (Human)
#'                 and galGal6 (Chicken). Default is 'hg19'.
#' @param interval an integer vector. This number means interval number for distribution analysis. Default is 5000.
#' @param range an integer array. It means the range for highlight region of this analysis. Default range is c(-20000, 20000).
#' @param outpath an string vector. Plots are saved in this path. Default value is R home directory.
#' @param dbPath a string vector. Directory path of database files.
#' 
#' @return Return a result list constituted by insertion table (Gene), distribution table (Gene/TSS) and GRobject of Gene and TSS data.
#'
#'
#' @export

annoByGene = function(hits, randomSet = NULL, mapTool = 'blast', organism = 'hg19', interval = 5000, range = c(-20000, 20000),
                      outpath = '~', dbPath = paste0(.libPaths()[1], '/IRFinder/extdata')){
  library(GenomicRanges); library(stringr); library(grDevices); library(regioneR)

  cat('---------- Annotation integrated sites : Genes / TSSs ----------\n')
  cat(paste0('Start time : ', date(), '\n'))

  if(length(which(c('hg19', 'hg38', 'galGal6') %in% organism)) == 0){
    return(cat("You can use hg19/hg38/galGal6 data only. ( Input : ", paste(organism, collapse = ','), ")\n",
               '---------- Annotation process is halted. ----------\nFinish time : ', date(), '\n'))
  } else {}

  cat('---------- Loading a gene table ----------\n')
  #### 01. Load a gene table
  tab_loc = paste0(dbPath, '/Ensembl_gene_', organism, '.txt')
  #tab_loc = system.file('extdata', paste0('Ensembl_gene_', organism, '.txt'), package = 'IRFinder')
  dataTable = read.delim(tab_loc, stringsAsFactors = FALSE, header = TRUE)
  dataTable = subset(dataTable, dataTable$hgnc_symbol != '') # Delete genes that are not have hugo symbol
  dataTable = subset(dataTable, str_detect(dataTable$chromosome_name, '_') == FALSE)
  dataTable$chromosome_name = paste0('chr', dataTable$chromosome_name)

  cat('Done.\n')
  cat('---------- Creating a GRanges object ----------\n')

  #### 02. Make GR object by a gene table
  gr_genes = regioneR::toGRanges(dataTable[,c(6,7,8,9,1,2,13,14)])
  gr_tss = regioneR::toGRanges(dataTable[,c(6,12,12,1,3,14)])

  #### 03. Make interval GR objects of a gene table
  ranges = seq(from = range[1], to = range[2], interval)
  gr_genes_dist = vector("list", length = (length(ranges)-1))
  for(x in 1:(length(ranges)-1)){
    tmp_start = gr_genes@ranges@start + ranges[x]
    tmp_end = gr_genes@ranges@start + ranges[x+1]-1
    gr_genes_dist[[x]] = GRanges(seqnames = as.character(gr_genes@seqnames),
                                 ranges = IRanges(start = tmp_start,
                                                  end = tmp_end),
                                 strand = '*')
  }
  gr_tss_dist = vector("list", length = (length(ranges)-1))
  for(x in 1:(length(ranges)-1)){
    tmp_start = gr_tss@ranges@start + ranges[x]
    tmp_end = gr_tss@ranges@start + ranges[x+1]-1
    gr_tss_dist[[x]] = GRanges(seqnames = as.character(gr_tss@seqnames),
                               ranges = IRanges(start = tmp_start,
                                                end = tmp_end),
                               strand = '*')
  }
  
  #### 04. Make random GR object
  if(!is.null(randomSet)){
    tmp = read.delim(file = randomSet, header = TRUE, stringsAsFactors = FALSE)
    gr_random = regioneR::toGRanges(tmp[,c(2,3,3)])
  } else {
    cat("[WARN] Random distribution analysis will not be executed.\n")
  }

  cat('Done.\n')
  cat('---------- Annotating integrated regions  ----------\n')

  #### 05. Make gene information
  inside_gene = as.data.frame(findOverlaps(hits, gr_genes, type = 'any',
                                           ignore.strand = TRUE),
                              stringsAsFactors = FALSE)
  a = as.data.frame(hits[inside_gene$queryHits,], stringsAsFactors = FALSE)
  b = dataTable[inside_gene$subjectHits,]
  inside_tab = cbind(a,b)
  
  if(!is.null(randomSet)){
    inside_gene_ran = as.data.frame(findOverlaps(gr_random, gr_genes, type = 'any', ignore.strand = TRUE), 
                                    stringsAsFactors = FALSE)
    a = as.data.frame(gr_random[inside_gene_ran$queryHits,], stringsAsFactors = FALSE)
    b = dataTable[inside_gene_ran$subjectHits,]
    inside_ran_tab = cbind(a,b)
    
    dist_gene_ran = vector("list", length = length(ranges)-1)
    dist_tss_ran = vector("list", length = length(ranges)-1) 
    dist_gene_ran_tab = data.frame(stringsAsFactors = FALSE)
    dist_tss_ran_tab = data.frame(stringsAsFactors = FALSE)
  } else {}

  dist_gene = vector("list", length = length(ranges)-1)
  dist_tss = vector("list", length = length(ranges)-1)
  dist_gene_tab = data.frame(stringsAsFactors = FALSE)
  dist_tss_tab = data.frame(stringsAsFactors = FALSE)
  
  for(x in 1:(length(ranges)-1)){
    dist_gene[[x]] = as.data.frame(findOverlaps(query = hits,
                                                subject = gr_genes_dist[[x]],
                                                type = "any", ignore.strand = TRUE),
                                   stringsAsFactors = FALSE)
    a = as.data.frame(hits[dist_gene[[x]]$queryHits,], stringsAsFactors = FALSE)
    b = dataTable[dist_gene[[x]]$subjectHits,]
    tmp1 = cbind(a,b)
    dist_tss[[x]] = as.data.frame(findOverlaps(query = hits,
                                               subject = gr_tss_dist[[x]],
                                               type = "any", ignore.strand = TRUE),
                                  stringsAsFactors = FALSE)
    a = as.data.frame(hits[dist_tss[[x]]$queryHits,], stringsAsFactors = FALSE)
    b = dataTable[dist_tss[[x]]$subjectHits,]
    tmp2 = cbind(a,b)
    dist_gene_tab = rbind(dist_gene_tab, tmp1)
    dist_tss_tab = rbind(dist_tss_tab, tmp2)
    
    if(!is.null(randomSet)){
      dist_gene_ran[[x]] = as.data.frame(findOverlaps(query = gr_random,
                                                      subject = gr_genes_dist[[x]],
                                                      type = "any", ignore.strand = TRUE),
                                         stringsAsFactors = FALSE)
      a = as.data.frame(gr_random[dist_gene_ran[[x]]$queryHits,], stringsAsFactors = FALSE)
      b = dataTable[dist_gene_ran[[x]]$subjectHits,]
      tmp1 = cbind(a,b)
      dist_tss_ran[[x]] = as.data.frame(findOverlaps(query = gr_random,
                                                     subject = gr_tss_dist[[x]],
                                                     type = "any", ignore.strand = TRUE),
                                        stringsAsFactors = FALSE)
      a = as.data.frame(gr_random[dist_tss_ran[[x]]$queryHits,], stringsAsFactors = FALSE)
      b = dataTable[dist_tss_ran[[x]]$subjectHits,]
      tmp2 = cbind(a,b)
      dist_gene_ran_tab = rbind(dist_gene_ran_tab, tmp1)
      dist_tss_ran_tab = rbind(dist_tss_ran_tab, tmp2)
    } else {}
  }

  #### 06. Count the number of hit in each range
  count_hit_genes = vector("list", length = length(ranges)-1)
  count_hit_tss = vector("list", length = length(ranges)-1)
  
  if(!is.null(randomSet)){
    count_hit_genes_ran = vector("list", length = length(ranges)-1)
    count_hit_tss_ran = vector("list", length = length(ranges)-1)
  } else {}

  for(x in 1:(length(ranges)-1)){
    count_hit_genes[[x]] = countOverlaps(query = hits, subject = gr_genes_dist[[x]], type = "any", ignore.strand = TRUE)
    count_hit_tss[[x]] = countOverlaps(query = hits, subject = gr_tss_dist[[x]], type = "any", ignore.strand = TRUE)
    if(!is.null(randomSet)){
      count_hit_genes_ran[[x]] = countOverlaps(query = gr_random, subject = gr_genes_dist[[x]], type = "any", ignore.strand = TRUE)
      count_hit_tss_ran[[x]] = countOverlaps(query = gr_random, subject = gr_tss_dist[[x]], type = "any", ignore.strand = TRUE)
    } else {}
  }

  hits_gene = as.numeric(apply(as.data.frame(count_hit_genes, stringsAsFactors = FALSE), MARGIN = 2, sum))
  hits_tss = as.numeric(apply(as.data.frame(count_hit_tss, stringsAsFactors = FALSE), MARGIN = 2, sum))
  
  if(!is.null(randomSet)){
    hits_gene_ran = as.numeric(apply(as.data.frame(count_hit_genes_ran, stringsAsFactors = FALSE), MARGIN = 2, sum))
    hits_tss_ran = as.numeric(apply(as.data.frame(count_hit_tss_ran, stringsAsFactors = FALSE), MARGIN = 2, sum))
  } else {}

  #### 07. Drawing histograms
  ymax_g = max(hits_gene); ymax_t = max(hits_tss)

  mid = vector("numeric", length = (length(ranges)-1))
  for(x in 1:(length(ranges)-1)){
    mid[x] = (ranges[x] + ranges[x+1])/2
  }
  
  freq_sites_gene = rep(x = mid, times = hits_gene)
  freq_sites_tss = rep(x = mid, times = hits_tss)
  
  if(!is.null(randomSet)){
    freq_sites_gene_ran = rep(x = mid, times = hits_gene_ran)
    freq_sites_tss_ran = rep(x = mid, times = hits_tss_ran)
  } else {}

  cat('Done.\n')
  cat('---------- Drawing histograms ----------\n')

  ## Gene
  png(filename = paste0(outpath, '/Distribution_gene_', organism, '.png'),
      width = 1200, height = 700)
  hist_frequency = hist(freq_sites_gene/1000, ylim = c(0, ceiling(ymax_g * 1.1)), breaks = ranges/1000,
                        ylab = '#Integration events', xlab = "kbs",
                        probability = FALSE, main = NULL, col = 'cornflowerblue',
                        axes = FALSE, cex.lab = 1.5)
  axis(side = 1, at = ranges/1000, cex.axis = 1.5)
  axis(side = 2, at = seq(0, ceiling(ymax_g * 1.1), 1), cex.axis = 1.5)
  text(0, ceiling(ymax_g * 1.1), labels = 'Gene', cex = 2, font = 2)
  arrows(0,0,0,ceiling(ymax_g * 1.1)*0.95, length = 0.15, code = 1, lwd = 3)
  arrows(min(ranges/1000), ceiling(ymax_g * 1.1)*0.95, range[1]/1000*0.025, ceiling(ymax_g * 1.1)*0.95, length = 0.1, code = 1)
  arrows(max(ranges/1000), ceiling(ymax_g * 1.1)*0.95, range[2]/1000*0.025, ceiling(ymax_g * 1.1)*0.95, length = 0.1, code = 1)
  text(range[2]/2000, ceiling(ymax_g * 1.1)*0.9, 'Upstream', cex = 1.5, col = 'black')
  text(-(range[2]/2000), ceiling(ymax_g * 1.1)*0.9, 'Downstream', cex = 1.5, col = 'black')
  dev.off()
  
  if(!is.null(randomSet)){
    count_sites_gene = plyr::count(freq_sites_gene)[,2]
    count_sites_gene_ran = plyr::count(freq_sites_gene_ran)[,2]
    
    count_data = rbind(count_sites_gene, count_sites_gene_ran)
    png(paste0(outpath, '/Random_distribution_gene_', organism, '.png'), width = 1200, height = 750)
    barplot(count_data, beside = TRUE, ylim = c(0, (max(count_data)+2)),
            main = "Random distribution (Gene)", xlab = 'Intervals (Kbs)', ylab = "#Integration events",
            col = c('Dodgerblue', 'skyblue'), names.arg = ranges[c(1:((length(ranges)-1)/2), ((length(ranges)+3)/2):length(ranges))])
    dev.off()
  } else {}

  ## TSS
  png(filename = paste0(outpath, '/Distribution_TSS_', organism, '.png'),
      width = 1200, height = 700)
  hist_frequency = hist(freq_sites_tss/1000, ylim = c(0, ceiling(ymax_t * 1.1)), breaks = ranges/1000,
                        ylab = '#Integration events', xlab = "kbs",
                        probability = FALSE, main = NULL, col = 'cornflowerblue',
                        axes = FALSE, cex.lab = 1.5)
  axis(side = 1, at = ranges/1000, cex.axis = 1.5)
  axis(side = 2, at = seq(0, ceiling(ymax_t * 1.1), 1), cex.axis = 1.5)
  text(0, ceiling(ymax_t * 1.1), labels = 'TSS', cex = 2, font = 2)
  arrows(0, 0, 0, ceiling(ymax_t * 1.1)*0.95, length = 0.15, code = 1, lwd = 3)
  arrows(min(ranges/1000), ceiling(ymax_t * 1.1)*0.95, range[1]/1000*0.025, ceiling(ymax_t * 1.1)*0.95, length = 0.1, code = 1)
  arrows(max(ranges/1000), ceiling(ymax_t * 1.1)*0.95, range[2]/1000*0.025, ceiling(ymax_t * 1.1)*0.95, length = 0.1, code = 1)
  text(range[2]/2000, ceiling(ymax_t * 1.1)*0.9, 'Upstream', cex = 1.5, col = 'black')
  text(-(range[2]/2000), ceiling(ymax_t * 1.1)*0.9, 'Downstream', cex = 1.5, col = 'black')
  dev.off()
  
  if(!is.null(randomSet)){
    count_sites_tss = plyr::count(freq_sites_tss)[,2]
    count_sites_tss_ran = plyr::count(freq_sites_tss_ran)[,2]
    
    count_data_tss = rbind(count_sites_tss, count_sites_tss_ran)
    png(paste0(outpath, '/Random_distribution_tss_', organism, '.png'), width = 1200, height = 750)
    barplot(count_data_tss, beside = TRUE, ylim = c(0, round(max(count_data_tss), -2)),
            main = "Random distribution (TSS)", xlab = 'Intervals (Kbs)', ylab = "#Integration events",
            col = c('Dodgerblue', 'skyblue'), names.arg = ranges[c(1:((length(ranges)-1)/2), ((length(ranges)+3)/2):length(ranges))])
    dev.off()
  } else {}

  if(!is.null(randomSet)){
    result_list = vector("list", length = 7)
    result_list[[1]] = inside_tab
    result_list[[2]] = dist_gene_tab
    result_list[[3]] = dist_gene_ran_tab
    result_list[[4]] = gr_genes
    result_list[[5]] = dist_tss_tab
    result_list[[6]] = dist_tss_ran_tab
    result_list[[7]] = gr_tss
    names(result_list) = c('Gene_inside_hits', 'Gene_exp_distribution', 'Gene_random_distribution', 'Gene_data',
                           'TSS_exp_distribution', 'TSS_random_distribution', 'TSS_data')
  } else {
    result_list = vector("list", length = 5)
    result_list[[1]] = inside_tab
    result_list[[2]] = dist_gene_tab
    result_list[[3]] = gr_genes
    result_list[[4]] = dist_tss_tab
    result_list[[5]] = gr_tss
    names(result_list) = c('Gene_inside_hits', 'Gene_exp_distribution', 'Gene_data', 'TSS_exp_distribution', 'TSS_data')
  }

  cat('---------- Annotation process is finished. ----------\n')
  cat(paste0('Finish time : ', date(), '\n'))

  return(result_list)
}