R/annoByRepeats.R

Defines functions annoByRepeat

Documented in annoByRepeat

#' @title Annotate the vector integrated site by repeat sequences.
#' 
#' @description By the repeat database, this function works for viral vector integration site annotation.
#'              User can get query sequences inserted in several types of repeats and a distribution graph by this function.
#' 
#' @usage annoByRepeat(hits, randomSet, mapTool = 'blast', organism = 'hg19', interval = 5000, range = c(-20000, 20000),
#`        outpath = '~', repClass = c('SINE', 'LINE', 'DNA', 'RNA'), dbPath = paste0(.libPaths()[1], '/VVIPS/extdata'))
#' 
#' @param hits a GR object. This object made from makeInputs function.
#' @param randomSet TRUE or FALSE. For random distribution, function makes random set, if user enter TRUE.
#'                  If this value is FALSE, random distribution analysis is not executed. Default is TRUE.
#' @param mapTool a character vector. Function serve two types of file such as outputs from BLAST and BLAT.
#'                Default is 'blast'. If you want to use BLAT output, use 'blast'.
#' @param organism a character vector. This function serves 3 versions of organisms such as hg19, hg38 (Human)
#'                 and galGal6 (Chicken). Default is 'hg19'.
#' @param interval an integer vector. This number means interval number for distribution analysis. Default is 5000.
#' @param randomSize an integer vector. A random set size. Default is 10000.
#' @param range an integer array. It means the range for highlight region of this analysis. Default range is c(-20000, 20000).
#' @param outpath an string vector. Plots are saved in this path. Default value is R home directory.
#' @param outFile a character vector. Attached ID to the result file name.
#' @param repClass a character vector or array. Users can select specific types of repeats such as SINE, LINE, DNA, RNA and Simple_repeat.
#' @param dbPath a string vector. Directory path of database files.
#' 
#' @return Return a result list constituted by insertion table, distribution table and GR object of repeat data.
#' 
#' 
#' @export

annoByRepeat = function(hits, randomSet = TRUE, mapTool = 'blast', organism = 'hg19', interval = 5000, randomSize = 10000,
                        range = c(-20000, 20000), outpath = '~', outFile, repClass = c('SINE', 'LINE', 'DNA', 'RNA'), 
                        dbPath = paste0(.libPaths()[1], '/VVIPS/extdata')){
  require(GenomicRanges); require(stringr); require(grDevices); require(plyr); require(ggplot2); require(writexl)
  
  cat('---------- Annotation integrated sites : Repeats ----------\n')
  cat(paste0('Start time : ', date(), '\n'))
  
  if(length(which(c('hg19', 'hg38', 'galGal6') %in% organism)) == 0){
    return(cat("You can use hg19/hg38/galGal6 data only. ( Input : ", paste(organism, collapse = ','), ")\n",
               '---------- Annotation process is halted. ----------\nFinish time : ', date(), '\n'))
  } else {}
  
  if(str_ends(dbPath, pattern = '/')){
    dbPath = word(dbPath, start = 1, end = nchar(dbPath), sep = '')
  } else {}
  
  if(str_ends(outpath, pattern = '/')){
    outpath = word(outpath, start = 1, end = nchar(outpath), sep = '')
  } else {}
  
  cat('---------- Loading a Repeat table ----------\n')
  #### 01. Load a gene table
  tab_gz = gzfile(description = paste0(dbPath, '/rmsk.txt.gz'), open = "r")
  suppressWarnings(dataTable <- read.delim(file = tab_gz, header = FALSE, stringsAsFactors = FALSE))
  dataTable = dataTable[,c(6,7,8,10,11,12,13,14,15)]
  colnames(dataTable) = c('genoName', 'genoStart', 'genoEnd', 'strand',
                          'repName', 'repClass', 'repFamily',
                          'repStart', 'repEnd')
  close(tab_gz)
  dataTable = subset(dataTable, str_detect(dataTable$genoName, '_') == FALSE)
  dataTable = subset(dataTable, str_detect(dataTable$repClass, '[?]') == FALSE)
  dataTable = subset(dataTable, dataTable$repClass != "Unknown")
  dataTable = dataTable[which(dataTable$repClass %in% repClass),]
  dataTable[,2] = dataTable[,2]+1;  dataTable[,8] = dataTable[,8]+1
  dataTable = dataTable[,c(1,2,3,4,5,6,7)]; names(dataTable) = c('chr', 'start', 'end', names(dataTable)[4:7])
  
  cat('Done.\n')
  
  cat('---------- Loading a microsatellite table ----------\n')
  #### 01-1. Load a gene table
  tab_gz2 = gzfile(description = paste0(dbPath, '/microsat.txt.gz'), open = "r")
  suppressWarnings(dataTable2 <- read.delim(file = tab_gz2, header = FALSE, stringsAsFactors = FALSE))
  dataTable2 = dataTable2[,c(2:5)]
  colnames(dataTable2) = c('chrom', 'chromStart', 'chromEnd', 'name')
  close(tab_gz2)
  
  dataTable2[,2] = dataTable2[,2] + 1
  dataTable2 = dataTable2[,c(1,2,3,4)]; names(dataTable2) = c('chr', 'start', 'end', 'name')
  
  cat('Done.\n')
  
  if(randomSet){
    cat('---------- Make a random set ----------\n')
    #### 01. Generate random set
    set.seed(123456)
    
    ch_size = readRDS(file = system.file("extdata", paste0(organism, '_chrom.rds'), package = "VVIPS"))
    ch_size_num = as.numeric(ch_size$length)
    ch_start = cumsum(ch_size_num) - ch_size_num + 1
    ch_end = cumsum(ch_size_num)
    ch_ratio = (ch_size_num / sum(ch_size_num))
    
    random_set = sample(ch_size$chrom, size = randomSize, replace = TRUE, prob = ch_ratio)
    count_ch = plyr::count(random_set); row.names(count_ch) = count_ch$x
    count_ch = count_ch[ch_size$chrom,]
    count_ch = cbind(count_ch, ch_size_num)
    
    ran_set = apply(count_ch, 1, function(x){sample(c(1:x[3]), size = x[2], replace = FALSE, prob = NULL)})
    
    chr_ran = rep(paste0('chr', count_ch$x), count_ch$freq)
    pos_ran = unlist(ran_set)
    
    ran_tab = data.frame('Random' = c(1:randomSize), 'Random_chr' = chr_ran, 'Random_pos' = pos_ran, stringsAsFactors = FALSE)
    
    write.table(ran_tab, file = paste0(outpath, '/', outFile, '_Random_set_', organism, '_gene.txt'),
                quote = FALSE, append = FALSE, sep = '\t', na = '', row.names = FALSE, col.names = TRUE)
    
    cat('Done.\n')
  } else {}
  
  cat('---------- Creating a GRanges object ----------\n')
  
  #### 02. Make GR object by a gene table
  gr_repeat = makeGRangesFromDataFrame(dataTable, keep.extra.columns = TRUE, ignore.strand = FALSE)
  gr_micro = makeGRangesFromDataFrame(dataTable2, keep.extra.columns = TRUE, ignore.strand = TRUE)
  
  #### 03. Make interval GR objects of a gene table
  ranges = seq(from = range[1], to = range[2], interval)
  gr_repeat_dist = vector("list", length = (length(ranges)-1))
  for(x in 1:(length(ranges)-1)){
    tmp_start = gr_repeat@ranges@start + ranges[x]
    tmp_end = gr_repeat@ranges@start + ranges[x+1]-1
    gr_repeat_dist[[x]] = GRanges(seqnames = as.character(gr_repeat@seqnames),
                                  ranges = IRanges(start = tmp_start, end = tmp_end),
                                  name = gr_repeat$repName,
                                  class = gr_repeat$repClass,
                                  dist_label = rep(paste0(ranges[x], ' <= X < ', ranges[x+1]), length(gr_repeat)),
                                  strand = '*')
  }
  
  gr_micro_dist = vector("list", length = (length(ranges)-1))
  for(x in 1:(length(ranges)-1)){
    tmp_start = gr_micro@ranges@start + ranges[x]
    tmp_end = gr_micro@ranges@start + ranges[x+1]-1
    gr_micro_dist[[x]] = GRanges(seqnames = as.character(gr_micro@seqnames),
                                 ranges = IRanges(start = tmp_start, end = tmp_end), 
                                 name = gr_micro$name,
                                 dist_label = rep(paste0(ranges[x], ' <= X < ', ranges[x+1]), length(gr_micro)),
                                 strand = '*')
  }
  
  #### 04. Make random GR object
  if(randomSet){
    tmp = ran_tab[,c(2,3,3)]
    names(tmp) = c('chr', 'start', 'end')
    gr_random = makeGRangesFromDataFrame(tmp)
  } else {}
  
  cat('Done.\n')
  cat('---------- Annotating integrated regions ----------\n')
  
  #### 05. Make gene information
  inside_repeat = as.data.frame(findOverlaps(hits, gr_repeat,
                                             type = 'any', ignore.strand = TRUE), stringsAsFactors = FALSE)
  a = as.data.frame(hits[inside_repeat$queryHits,], stringsAsFactors = FALSE)
  b = dataTable[inside_repeat$subjectHits,]
  inside_tab = cbind(a,b)[,c(6, 1:3, 12, 8:11, 13, 14)]
  names(inside_tab) = c('q_name', 'q_chr', 'q_start', 'q_end', 'repeat', 'chr', 'start', 'end', 
                        'strand', 'repClass', 'repFamily')
  
  inside_micro = as.data.frame(findOverlaps(hits, gr_micro, type = 'any', ignore.strand = TRUE),
                               stringsAsFactors = FALSE)
  a2 = as.data.frame(hits[inside_micro$queryHits,], stringsAsFactors = FALSE)
  b2 = dataTable2[inside_micro$subjectHits,]
  inside_tab2 = cbind(a2,b2)[,c(6, 1:3, 11, 8:10)]
  names(inside_tab2) = c('q_name', 'q_chr', 'q_start', 'q_end', 'microsatellite', 'chr', 'start', 'end')
  
  if(randomSet){
    # Repeat
    inside_repeat_ran = as.data.frame(findOverlaps(gr_random, gr_repeat, type = 'any', ignore.strand = TRUE), stringsAsFactors = FALSE)
    a = as.data.frame(gr_random[inside_repeat_ran$queryHits,], stringsAsFactors = FALSE)
    b = dataTable[inside_repeat_ran$subjectHits,]
    inside_repeat_ran_tab = cbind(a,b)[,c(1:3, 10, 6:9, 11, 12)]
    names(inside_repeat_ran_tab) = c('q_chr', 'q_start', 'q_end', 'repeat', 'chr', 'start', 'end', 
                                     'strand', 'repClass', 'repFamily')
    
    dist_repeat_ran = vector("list", length = length(ranges)-1)
    dist_repeat_ran_tab = data.frame(stringsAsFactors = FALSE)
    
    # Micro
    inside_micro_ran = as.data.frame(findOverlaps(gr_random, gr_micro, type = 'any', ignore.strand = TRUE), stringsAsFactors = FALSE)
    
    a2 = as.data.frame(gr_random[inside_micro_ran$queryHits,], stringsAsFactors = FALSE)
    b2 = dataTable2[inside_micro_ran$subjectHits,]
    inside_micro_ran_tab = cbind(a2,b2)[,c(1:3, 9, 6:8)]
    names(inside_micro_ran_tab) = c('q_chr', 'q_start', 'q_end', 'microsatellite', 'chr', 'start', 'end')
    
    dist_micro_ran = vector("list", length = length(ranges)-1)
    dist_micro_ran_tab = data.frame(stringsAsFactors = FALSE)
  } else {}
  
  dist_repeat = vector("list", length = length(ranges)-1)
  dist_repeat_tab = data.frame(stringsAsFactors = FALSE)
  dist_micro = vector("list", length = length(ranges)-1)
  dist_micro_tab = data.frame(stringsAsFactors = FALSE)
  
  for(x in 1:(length(ranges)-1)){
    dist_repeat[[x]] = as.data.frame(findOverlaps(query = hits, subject = gr_repeat_dist[[x]], type = "any", 
                                                  ignore.strand = TRUE), stringsAsFactors = FALSE)
    a = as.data.frame(hits[dist_repeat[[x]]$queryHits,], stringsAsFactors = FALSE)
    b = gr_repeat_dist[[x]][dist_repeat[[x]]$subjectHits,]
    tmp1 = cbind(a,b)
    
    dist_micro[[x]] = as.data.frame(findOverlaps(query = hits, subject = gr_micro_dist[[x]], type = "any", 
                                                 ignore.strand = TRUE), stringsAsFactors = FALSE)
    a2 = as.data.frame(hits[dist_micro[[x]]$queryHits,], stringsAsFactors = FALSE)
    b2 = gr_micro_dist[[x]][dist_micro[[x]]$subjectHits,]
    tmp2 = cbind(a2,b2)
    
    dist_repeat_tab = rbind(dist_repeat_tab, tmp1)
    dist_micro_tab = rbind(dist_micro_tab, tmp2)
    
    if(randomSet){
      ## repeat
      dist_repeat_ran[[x]] = as.data.frame(findOverlaps(query = gr_random, subject = gr_repeat_dist[[x]],
                                                        type = "any", ignore.strand = TRUE), stringsAsFactors = FALSE)
      a = as.data.frame(gr_random[dist_repeat_ran[[x]]$queryHits,], stringsAsFactors = FALSE)
      b = gr_repeat_dist[[x]][dist_repeat_ran[[x]]$subjectHits,]
      tmp1 = cbind(a,b)
      
      ## micro
      dist_micro_ran[[x]] = as.data.frame(findOverlaps(query = gr_random, subject = gr_micro_dist[[x]],
                                                       type = "any", ignore.strand = TRUE), stringsAsFactors = FALSE)
      a2 = as.data.frame(gr_random[dist_micro_ran[[x]]$queryHits,], stringsAsFactors = FALSE)
      b2 = gr_micro_dist[[x]][dist_micro_ran[[x]]$subjectHits,]
      tmp2 = cbind(a2,b2)
      
      dist_repeat_ran_tab = rbind(dist_repeat_ran_tab, tmp1)
      dist_micro_ran_tab = rbind(dist_micro_ran_tab, tmp2)
      
    } else {}
  }
  
  #### 06. Count the number of hit in each range
  count_hit_repeat = vector("list", length = length(ranges)-1)
  count_hit_micro = vector("list", length = length(ranges)-1)
  
  if(randomSet){
    count_hit_repeat_ran = vector("list", length = length(ranges)-1)
    count_hit_micro_ran = vector("list", length = length(ranges)-1)
  } else {}
  
  for(x in 1:(length(ranges)-1)){
    count_hit_repeat[[x]] = countOverlaps(query = hits, subject = gr_repeat_dist[[x]], type = "any", ignore.strand = TRUE)
    count_hit_micro[[x]] = countOverlaps(query = hits, subject = gr_micro_dist[[x]], type = "any", ignore.strand = TRUE)
    
    if(randomSet){
      count_hit_repeat_ran[[x]] = countOverlaps(query = gr_random, subject = gr_repeat_dist[[x]], type = "any", ignore.strand = TRUE)
      count_hit_micro_ran[[x]] = countOverlaps(query = gr_random, subject = gr_micro_dist[[x]], type = "any", ignore.strand = TRUE)
    } else {}
  }
  
  hits_repeat = as.numeric(apply(as.data.frame(count_hit_repeat, stringsAsFactors = FALSE), MARGIN = 2, sum))
  hits_micro = as.numeric(apply(as.data.frame(count_hit_micro, stringsAsFactors = FALSE), MARGIN = 2, sum))
  
  if(randomSet){
    hits_repeat_ran = as.numeric(apply(as.data.frame(count_hit_repeat_ran, stringsAsFactors = FALSE), MARGIN = 2, sum))
    hits_micro_ran = as.numeric(apply(as.data.frame(count_hit_micro_ran, stringsAsFactors = FALSE), MARGIN = 2, sum))
  } else {}
  
  #### 07. Drawing histograms
  ymax_r = max(hits_repeat); ymax_r2 = max(hits_micro)
  
  mid = vector("numeric", length = (length(ranges)-1))
  for(x in 1:(length(ranges)-1)){
    mid[x] = (ranges[x] + ranges[x+1])/2
  }
  
  freq_sites_repeat = rep(x = mid, times = hits_repeat)
  freq_sites_micro = rep(x = mid, times = hits_micro)
  
  if(randomSet){
    freq_sites_repeat_ran = rep(x = mid, times = hits_repeat_ran)
    freq_sites_micro_ran = rep(x = mid, times = hits_micro_ran)
  } else {}
  
  dist_repeat_tab = dist_repeat_tab[,c(6, 1:3, 13, 8:10, 15)]
  names(dist_repeat_tab) = c('q_name', 'q_chr', 'q_start', 'q_end',
                             'repeat', 'r_chr', 'r_start', 'r_end', 'range')
  dist_micro_tab = dist_micro_tab[,c(6, 1:3, 13, 8:10, 14)]
  names(dist_micro_tab) = c('q_name', 'q_chr', 'q_start', 'q_end',
                            'microsatellite', 'r_chr', 'r_start', 'r_end', 'range')
  
  cat('Done.\n')
  cat('---------- Drawing histograms ----------\n')
  
  count_sites_repeat = plyr::count(freq_sites_repeat)[,2]
  count_sites_micro = plyr::count(freq_sites_micro)[,2]
  
  
  if(randomSet){
    # Repeat
    count_sites_repeat_ran = plyr::count(freq_sites_repeat_ran)[,2]
    count_data = data.frame('Range' = factor(rep(ranges[ranges != 0]/1000, 2), levels = ranges[ranges != 0]/1000), 
                            'Group' = c(rep('Observed', length(count_sites_repeat)), rep('Random', length(count_sites_repeat_ran))),
                            'Count' = c(count_sites_repeat, count_sites_repeat_ran),
                            'Freq' = c(count_sites_repeat/sum(count_sites_repeat),
                                       count_sites_repeat_ran/sum(count_sites_repeat_ran)))
    
    png(paste0(outpath, '/', outFile, '_Random_distribution_repeat_', organism, '.png'), width = 1200, height = 750)
    r_plot = ggplot(count_data) + geom_bar(aes(x = Range, y = Freq, fill = Group), stat = "identity", position = "dodge") +
      lims(y = c(0, max(count_data$Freq))) + ggtitle(label = "Random distribution (Repeat)") +
      xlab('Intervals (Kbs)') + ylab("Ratio of Integration Events") + scale_fill_manual(values = c('Dodgerblue', 'skyblue')) +
      theme(panel.background = element_rect(fill="white", colour = "white"),
            panel.grid.major = element_line(size = 0.5, linetype = 'dotted', colour = 'black'),
            axis.line = element_line(colour = "darkgrey"), legend.title = element_blank(),
            legend.key.size = unit(0.7, "cm"), plot.title = element_text(hjust = 0.5, face = "bold", size = 15),
            legend.text = element_text(size = 13), axis.text.x = element_text(size = 12),
            axis.text.y = element_text(size = 12), axis.title.x = element_text(size = 13), axis.title.y = element_text(size = 13))
    print(r_plot)
    dev.off()
    
    # Micro
    count_sites_micro_ran = plyr::count(freq_sites_micro_ran)[,2]
    count_data = data.frame('Range' = factor(rep(ranges[ranges != 0]/1000, 2), levels = ranges[ranges != 0]/1000), 
                            'Group' = c(rep('Observed', length(count_sites_micro)), rep('Random', length(count_sites_micro_ran))),
                            'Count' = c(count_sites_micro, count_sites_micro_ran),
                            'Freq' = c(count_sites_micro/sum(count_sites_micro),
                                       count_sites_micro_ran/sum(count_sites_micro_ran)))
    
    png(paste0(outpath, '/', outFile, '_Random_distribution_microsatellite_', organism, '.png'), width = 1200, height = 750)
    m_plot = ggplot(count_data) + geom_bar(aes(x = Range, y = Freq, fill = Group), stat = "identity", position = "dodge") +
      lims(y = c(0, max(count_data$Freq))) + ggtitle(label = "Random distribution (Microsatellite)") +
      xlab('Intervals (Kbs)') + ylab("Ratio of Integration Events") + scale_fill_manual(values = c('Dodgerblue', 'skyblue')) +
      theme(panel.background = element_rect(fill="white", colour = "white"),
            panel.grid.major = element_line(size = 0.5, linetype = 'dotted', colour = 'black'),
            axis.line = element_line(colour = "darkgrey"), legend.title = element_blank(),
            legend.key.size = unit(0.7, "cm"), plot.title = element_text(hjust = 0.5, face = "bold", size = 15),
            legend.text = element_text(size = 13), axis.text.x = element_text(size = 12),
            axis.text.y = element_text(size = 12), axis.title.x = element_text(size = 13), axis.title.y = element_text(size = 13))
    print(m_plot)
    dev.off()
    
    result_list = vector("list", length = 8)
    result_list[[1]] = inside_tab
    result_list[[2]] = dist_repeat_tab
    result_list[[3]] = dist_repeat_ran_tab
    result_list[[4]] = gr_repeat
    result_list[[5]] = inside_tab2
    result_list[[6]] = dist_micro_tab
    result_list[[7]] = dist_micro_ran_tab
    result_list[[8]] = gr_micro
    names(result_list) = c('Repeat_inside_hits', "Repeat_exp_distribution", "Repeat_random_distribution", "Repeat_data",
                           "Microsatellite_inside_hits", "Microsatellite_exp_distribution", 
                           "Microsatellite_random_distribution", "Microsatellite_data")
    count_table = data.frame('Ranges' = paste0(ranges[1:length(ranges)-1], ' <= X < ', ranges[2:length(ranges)]),
                             'Repeat' = count_sites_repeat,
                             'Random_r' = count_sites_repeat_ran,
                             'Microsatellite' = count_sites_micro,
                             'Random_m' = count_sites_micro_ran,
                             stringsAsFactors = FALSE)
    
    ## Difference
    prop_list_r = suppressWarnings(apply(count_table[,c(2,3)], 1, function(t){prop.test(x = c(t[1], t[2]), 
                                                                                        n = c(sum(count_sites_repeat), sum(count_sites_repeat_ran)), 
                                                                                        correct = FALSE)}))
    prop_list_m = suppressWarnings(apply(count_table[,c(4,5)], 1, function(t){prop.test(x = c(t[1], t[2]), 
                                                                                        n = c(sum(count_sites_micro), sum(count_sites_micro_ran)), 
                                                                                        correct = FALSE)}))
    
    count_table = cbind(count_table[,c(1,2,3)], 'Repeat_p_value' = round(unlist(lapply(prop_list_r, function(t){t$p.value})),3),
                        count_table[,c(4,5)], 'Micro_p_value' = round(unlist(lapply(prop_list_m, function(t){t$p.value})), 3))
    
  } else {
    ## Repeat
    png(filename = paste0(outpath, '/', outFile, '_Distribution_Repeat_', organism, '.png'),
        width = 1200, height = 700)
    hist_freq = hist(freq_sites_repeat/1000, breaks = ranges/1000, plot = FALSE)
    hist_frequency = hist(freq_sites_repeat/1000, breaks = ranges/1000,
                          ylab = 'Density', xlab = "kbs", 
                          ylim = c(0, round(max(hist_freq$density),2)*1.5),
                          probability = TRUE, main = NULL, col = 'cornflowerblue',
                          axes = FALSE, cex.lab = 1.3)
    axis(side = 1, at = ranges/1000, cex.axis = 1.5)
    axis(side = 2, at = seq(0, round(max(hist_freq$density),2)*1.5, 0.01), cex.axis = 1.5)
    text(0, round(max(hist_freq$density),2)*1.5, labels = 'Repeat', cex = 1.7, font = 2)
    arrows(0,0,0,round(max(hist_freq$density),2)*1.5-0.005, length = 0.15, code = 1, lwd = 3)
    arrows(min(ranges/1000), round(max(hist_freq$density),2)*1.5-0.005, range[1]/1000*0.025, round(max(hist_freq$density),2)*1.5-0.005, length = 0.1, code = 1)
    arrows(max(ranges/1000), round(max(hist_freq$density),2)*1.5-0.005, range[2]/1000*0.025, round(max(hist_freq$density),2)*1.5-0.005, length = 0.1, code = 1)
    text(range[2]/2000, round(max(hist_freq$density),2)*1.5-0.01, 'Upstream', cex = 1.5, col = 'black')
    text(-(range[2]/2000), round(max(hist_freq$density),2)*1.5-0.01, 'Downstream', cex = 1.5, col = 'black')
    dev.off()
    
    ## micro
    png(filename = paste0(outpath, '/', outFile, '_Distribution_Microsatellite_', organism, '.png'),
        width = 1200, height = 700)
    hist_freq = hist(freq_sites_micro/1000, breaks = ranges/1000, plot = FALSE)
    hist_frequency = hist(freq_sites_micro/1000, breaks = ranges/1000,
                          ylab = 'Density', xlab = "kbs", 
                          ylim = c(0, round(max(hist_freq$density),2)*1.5),
                          probability = TRUE, main = NULL, col = 'cornflowerblue',
                          axes = FALSE, cex.lab = 1.3)
    axis(side = 1, at = ranges/1000, cex.axis = 1.5)
    axis(side = 2, at = seq(0, round(max(hist_freq$density),2)*1.5, 0.01), cex.axis = 1.5)
    text(0, round(max(hist_freq$density),2)*1.5, labels = 'MS', cex = 1.7, font = 2)
    arrows(0,0,0,round(max(hist_freq$density),2)*1.5-0.005, length = 0.15, code = 1, lwd = 3)
    arrows(min(ranges/1000), round(max(hist_freq$density),2)*1.5-0.005, range[1]/1000*0.025, round(max(hist_freq$density),2)*1.5-0.005, length = 0.1, code = 1)
    arrows(max(ranges/1000), round(max(hist_freq$density),2)*1.5-0.005, range[2]/1000*0.025, round(max(hist_freq$density),2)*1.5-0.005, length = 0.1, code = 1)
    text(range[2]/2000, round(max(hist_freq$density),2)*1.5-0.01, 'Upstream', cex = 1.5, col = 'black')
    text(-(range[2]/2000), round(max(hist_freq$density),2)*1.5-0.01, 'Downstream', cex = 1.5, col = 'black')
    dev.off()
    
    result_list = vector("list", length = 6)
    result_list[[1]] = inside_tab
    result_list[[2]] = dist_repeat_tab
    result_list[[3]] = gr_repeat
    result_list[[4]] = inside_tab2
    result_list[[5]] = dist_micro_tab
    result_list[[6]] = gr_micro
    names(result_list) = c('Repeat_inside_hits', "Repeat_exp_distribution", "Repeat_data",
                           "Microsatellite_inside_hits", "Microsatellite_exp_distribution", "Microsatellite_data")
    count_table = data.frame('Ranges' = paste0(ranges[1:length(ranges)-1], ' <= X < ',
                                               ranges[2:length(ranges)]),
                             'Repeat' = count_sites_repeat,
                             'Microsatellite' = count_sites_micro,
                             stringsAsFactors = FALSE)
  }
  
  ### Writing documents
  if(randomSet){
    if(!file.exists(paste0(outpath, '/', outFile, '_Distribution_results_repeat_', str_remove_all(Sys.Date(), '-'), '.xlsx'))){
      writexl::write_xlsx(list(inside_repeats = result_list[[1]],
                               inside_micro = result_list[[5]],
                               dist_freq = count_table,
                               dist_repeat = result_list[[2]],
                               dist_micro = result_list[[6]]),
                          path = paste0(outpath, '/', outFile, '_Distribution_results_repeat_', str_remove_all(Sys.Date(), '-'), '.xlsx'))
    } else {cat("[WARN] Result file can not be created because a file with the same name exists.\n")}
  } else {
    if(!file.exists(paste0(outpath, '/', outFile, '_Distribution_results_repeat_noRandom_', str_remove_all(Sys.Date(), '-'), '.xlsx'))){
      writexl::write_xlsx(list(inside_repeats = result_list[[1]],
                               inside_micro = result_list[[4]],
                               dist_freq = count_table,
                               dist_repeat = result_list[[2]],
                               dist_micro = result_list[[5]]),
                          path = paste0(outpath, '/', outFile, '_Distribution_results_repeat_noRandom_', str_remove_all(Sys.Date(), '-'), '.xlsx'))
    } else {cat("[WARN] Result file can not be created because a file with the same name exists.\n")}
  }
  
  cat('---------- Annotation process is finished. ----------\n')
  cat(paste0('Finish time : ', date(), '\n'))
  
  return(result_list)
}
bioinfo16/VVIPS documentation built on Nov. 4, 2019, 7:20 a.m.