#' @title Annotate the vector integrated site by repeat sequences.
#'
#' @description By the repeat database, this function works for viral vector integration site annotation.
#' User can get query sequences inserted in several types of repeats and a distribution graph by this function.
#'
#' @usage annoByRepeat(hits, randomSet, mapTool = 'blast', organism = 'hg19', interval = 5000, range = c(-20000, 20000),
#` outpath = '~', repClass = c('SINE', 'LINE', 'DNA', 'RNA'), dbPath = paste0(.libPaths()[1], '/IRFinder/extdata'))
#'
#' @param hits a GR object. This object made from makeInputs function.
#' @param randomSet a string vector. Type path to load a random set.
#' If this value is null, random distribution analysis is not executed.
#' @param mapTool a character vector. Function serve two types of file such as outputs from BLAST and BLAT.
#' Default is 'blast'. If you want to use BLAT output, use 'blast'.
#' @param organism a character vector. This function serves 3 versions of organisms such as hg19, hg38 (Human)
#' and galGal6 (Chicken). Default is 'hg19'.
#' @param interval an integer vector. This number means interval number for distribution analysis. Default is 5000.
#' @param range an integer array. It means the range for highlight region of this analysis. Default range is c(-20000, 20000).
#' @param outpath an string vector. Plots are saved in this path. Default value is R home directory.
#' @param repClass a character vector or array. Users can select specific types of repeats such as SINE, LINE, DNA, RNA and Simple_repeat.
#' @param dbPath a string vector. Directory path of database files.
#'
#' @return Return a result list constituted by insertion table, distribution table and GR object of repeat data.
#'
#'
#' @export
annoByRepeat = function(hits, randomSet = NULL, mapTool = 'blast', organism = 'hg19', interval = 5000, range = c(-20000, 20000),
outpath = '~', repClass = c('SINE', 'LINE', 'DNA', 'RNA'), dbPath = paste0(.libPaths()[1], '/IRFinder/extdata')){
library(GenomicRanges); library(stringr)
library(grDevices); library(regioneR)
cat('---------- Annotation integrated sites : Repeats ----------\n')
cat(paste0('Start time : ', date(), '\n'))
if(length(which(c('hg19', 'hg38', 'galGal6') %in% organism)) == 0){
return(cat("You can use hg19/hg38/galGal6 data only. ( Input : ", paste(organism, collapse = ','), ")\n",
'---------- Annotation process is halted. ----------\nFinish time : ', date(), '\n'))
} else {}
cat('---------- Loading a Repeat table ----------\n')
#### 01. Load a gene table
tab_loc = paste0(dbPath, '/rmsk.txt.gz')
##tab_loc = system.file('extdata', 'rmsk.txt.gz', package = 'IRFinder')
tab_gz = gzfile(description = tab_loc, open = "r")
suppressWarnings(dataTable <- read.delim(file = tab_gz, header = FALSE, stringsAsFactors = FALSE))
dataTable = dataTable[,c(6,7,8,10,11,12,13,14,15)]
colnames(dataTable) = c('genoName', 'genoStart', 'genoEnd', 'strand',
'repName', 'repClass', 'repFamily',
'repStart', 'repEnd')
close(tab_gz)
dataTable = subset(dataTable, str_detect(dataTable$genoName, '_') == FALSE)
dataTable = subset(dataTable, str_detect(dataTable$repClass, '[?]') == FALSE)
dataTable = subset(dataTable, dataTable$repClass != "Unknown")
dataTable = dataTable[which(dataTable$repClass %in% repClass),]
dataTable[,2] = dataTable[,2]+1
dataTable[,8] = dataTable[,8]+1
cat('Done.\n')
cat('---------- Loading a microsatellite table ----------\n')
#### 01-1. Load a gene table
tab_loc2 = paste0(dbPath, '/microsat.txt.gz')
##tab_loc = system.file('extdata', 'rmsk.txt.gz', package = 'IRFinder')
tab_gz2 = gzfile(description = tab_loc2, open = "r")
suppressWarnings(dataTable2 <- read.delim(file = tab_gz2, header = FALSE, stringsAsFactors = FALSE))
dataTable2 = dataTable2[,c(2:5)]
colnames(dataTable2) = c('chrom', 'chromStart', 'chromEnd', 'name')
close(tab_gz2)
dataTable2[,2] = dataTable2[,2] + 1
cat('Done.\n')
cat('---------- Creating a GRanges object ----------\n')
#### 02. Make GR object by a gene table
gr_repeat = regioneR::toGRanges(dataTable[,c(1,2,3,4,5,6,7)])
gr_micro = regioneR::toGRanges(dataTable2[,c(1,2,3,4)])
#### 03. Make interval GR objects of a gene table
ranges = seq(from = range[1], to = range[2], interval)
gr_repeat_dist = vector("list", length = (length(ranges)-1))
for(x in 1:(length(ranges)-1)){
tmp_start = gr_repeat@ranges@start + ranges[x]
tmp_end = gr_repeat@ranges@start + ranges[x+1]-1
gr_repeat_dist[[x]] = GRanges(seqnames = as.character(gr_repeat@seqnames),
ranges = IRanges(start = tmp_start, end = tmp_end), strand = '*')
}
gr_micro_dist = vector("list", length = (length(ranges)-1))
for(x in 1:(length(ranges)-1)){
tmp_start = gr_micro@ranges@start + ranges[x]
tmp_end = gr_micro@ranges@start + ranges[x+1]-1
gr_micro_dist[[x]] = GRanges(seqnames = as.character(gr_micro@seqnames),
ranges = IRanges(start = tmp_start, end = tmp_end), strand = '*')
}
if(!is.null(randomSet)){
#### 04. Make random GR object
tmp = read.delim(file = randomSet, header = TRUE, stringsAsFactors = FALSE)
gr_random = regioneR::toGRanges(tmp[,c(2,3,3)])
} else {}
cat('Done.\n')
cat('---------- Annotating integrated regions (Repeats) ----------\n')
#### 05. Make gene information
inside_repeat = as.data.frame(findOverlaps(hits, gr_repeat,
type = 'any', ignore.strand = TRUE), stringsAsFactors = FALSE)
a = as.data.frame(hits[inside_repeat$queryHits,], stringsAsFactors = FALSE)
b = dataTable[inside_repeat$subjectHits,]
inside_tab = cbind(a,b)
if(!is.null(randomSet)){
inside_repeat_ran = as.data.frame(findOverlaps(gr_random, gr_repeat,
type = 'any', ignore.strand = TRUE), stringsAsFactors = FALSE)
a = as.data.frame(gr_random[inside_repeat_ran$queryHits,], stringsAsFactors = FALSE)
b = dataTable[inside_repeat_ran$subjectHits,]
inside_repeat_ran_tab = cbind(a,b)
dist_repeat_ran = vector("list", length = length(ranges)-1)
dist_repeat_ran_tab = data.frame(stringsAsFactors = FALSE)
} else {}
dist_repeat = vector("list", length = length(ranges)-1)
dist_repeat_tab = data.frame(stringsAsFactors = FALSE)
for(x in 1:(length(ranges)-1)){
dist_repeat[[x]] = as.data.frame(findOverlaps(query = hits, subject = gr_repeat_dist[[x]], type = "any",
ignore.strand = TRUE), stringsAsFactors = FALSE)
a = as.data.frame(hits[dist_repeat[[x]]$queryHits,], stringsAsFactors = FALSE)
b = dataTable[dist_repeat[[x]]$subjectHits,]
tmp1 = cbind(a,b)
dist_repeat_tab = rbind(dist_repeat_tab, tmp1)
if(!is.null(randomSet)){
## random
dist_repeat_ran[[x]] = as.data.frame(findOverlaps(query = gr_random, subject = gr_repeat_dist[[x]],
type = "any", ignore.strand = TRUE), stringsAsFactors = FALSE)
a = as.data.frame(gr_random[dist_repeat_ran[[x]]$queryHits,], stringsAsFactors = FALSE)
b = dataTable[dist_repeat_ran[[x]]$subjectHits,]
tmp2 = cbind(a,b)
dist_repeat_ran_tab = rbind(dist_repeat_ran_tab, tmp2)
} else {}
}
#### 06. Count the number of hit in each range
count_hit_repeat = vector("list", length = length(ranges)-1)
if(!is.null(randomSet)){
count_hit_repeat_ran = vector("list", length = length(ranges)-1)
} else {}
for(x in 1:(length(ranges)-1)){
count_hit_repeat[[x]] = countOverlaps(query = hits, subject = gr_repeat_dist[[x]], type = "any", ignore.strand = TRUE)
if(!is.null(randomSet)){
count_hit_repeat_ran[[x]] = countOverlaps(query = gr_random, subject = gr_repeat_dist[[x]], type = "any", ignore.strand = TRUE)
} else {}
}
hits_repeat = as.numeric(apply(as.data.frame(count_hit_repeat, stringsAsFactors = FALSE), MARGIN = 2, sum))
if(!is.null(randomSet)){
hits_repeat_ran = as.numeric(apply(as.data.frame(count_hit_repeat_ran, stringsAsFactors = FALSE), MARGIN = 2, sum))
} else {}
cat('Done.\n')
cat('---------- Annotating integrated regions (Microsatlites) ----------\n')
#### 05. Make gene information
inside_micro = as.data.frame(findOverlaps(hits, gr_micro, type = 'any', ignore.strand = TRUE),
stringsAsFactors = FALSE)
a2 = as.data.frame(hits[inside_micro$queryHits,], stringsAsFactors = FALSE)
b2 = dataTable[inside_micro$subjectHits,]
inside_tab2 = cbind(a2,b2)
if(!is.null(randomSet)){
inside_micro_ran = as.data.frame(findOverlaps(gr_random, gr_micro,
type = 'any', ignore.strand = TRUE),
stringsAsFactors = FALSE)
a2 = as.data.frame(gr_random[inside_micro_ran$queryHits,], stringsAsFactors = FALSE)
b2 = dataTable[inside_micro_ran$subjectHits,]
inside_micro_ran_tab = cbind(a2,b2)
dist_micro_ran = vector("list", length = length(ranges)-1)
dist_micro_ran_tab = data.frame(stringsAsFactors = FALSE)
} else {}
dist_micro = vector("list", length = length(ranges)-1)
dist_micro_tab = data.frame(stringsAsFactors = FALSE)
for(x in 1:(length(ranges)-1)){
dist_micro[[x]] = as.data.frame(findOverlaps(query = hits, subject = gr_micro_dist[[x]], type = "any",
ignore.strand = TRUE), stringsAsFactors = FALSE)
a2 = as.data.frame(hits[dist_micro[[x]]$queryHits,], stringsAsFactors = FALSE)
b2 = dataTable2[dist_micro[[x]]$subjectHits,]
tmp1 = cbind(a2,b2)
dist_micro_tab = rbind(dist_micro_tab, tmp1)
if(!is.null(randomSet)){
## random
dist_micro_ran[[x]] = as.data.frame(findOverlaps(query = gr_random, subject = gr_micro_dist[[x]],
type = "any", ignore.strand = TRUE), stringsAsFactors = FALSE)
a2 = as.data.frame(gr_random[dist_micro_ran[[x]]$queryHits,], stringsAsFactors = FALSE)
b2 = dataTable2[dist_micro_ran[[x]]$subjectHits,]
tmp2 = cbind(a2,b2)
dist_micro_ran_tab = rbind(dist_micro_ran_tab, tmp2)
} else {}
}
#### 06. Count the number of hit in each range
count_hit_micro = vector("list", length = length(ranges)-1)
if(!is.null(randomSet)){
count_hit_micro_ran = vector("list", length = length(ranges)-1)
} else {}
for(x in 1:(length(ranges)-1)){
count_hit_micro[[x]] = countOverlaps(query = hits, subject = gr_micro_dist[[x]], type = "any",
ignore.strand = TRUE)
if(!is.null(randomSet)){
count_hit_micro_ran[[x]] = countOverlaps(query = gr_random, subject = gr_micro_dist[[x]], type = "any",
ignore.strand = TRUE)
} else {}
}
hits_micro = as.numeric(apply(as.data.frame(count_hit_micro, stringsAsFactors = FALSE), MARGIN = 2, sum))
if(!is.null(randomSet)){
hits_micro_ran = as.numeric(apply(as.data.frame(count_hit_micro_ran, stringsAsFactors = FALSE), MARGIN = 2, sum))
} else {}
cat('Done.\n')
cat('---------- Drawing histograms (Repeats) ----------\n')
#### 07. Drawing histograms
ymax_r = max(hits_repeat)
mid = vector("numeric", length = (length(ranges)-1))
for(x in 1:(length(ranges)-1)){
mid[x] = (ranges[x] + ranges[x+1])/2
}
freq_sites_repeat = rep(x = mid, times = hits_repeat)
if(!is.null(randomSet)){
freq_sites_repeat_ran = rep(x = mid, times = hits_repeat_ran)
} else {}
## Repeat
png(filename = paste0(outpath, '/Distribution_Repeat_', organism, '.png'),
width = 1200, height = 700)
hist_frequency = hist(freq_sites_repeat/1000, ylim = c(0, ceiling(ymax_r * 1.1)), breaks = ranges/1000,
ylab = '#Integration events', xlab = "kbs",
probability = FALSE, main = NULL, col = 'cornflowerblue',
axes = FALSE, cex.lab = 1.5)
axis(side = 1, at = ranges/1000, cex.axis = 1.5)
axis(side = 2, at = seq(0, ceiling(ymax_r * 2), 1), cex.axis = 1.5)
text(0, ceiling(ymax_r * 1.1)+1, labels = 'Repeat', cex = 2, font = 2)
arrows(0,0,0,(ceiling(ymax_r * 1.1) +1.5)*0.95, length = 0.15, code = 1, lwd = 3)
arrows(min(ranges/1000), (ceiling(ymax_r * 1.1) +1.5)*0.95, range[1]/1000*0.025, (ceiling(ymax_r * 1.1) +1.5)*0.95, length = 0.1, code = 1)
arrows(max(ranges/1000), (ceiling(ymax_r * 1.1) +1.5)*0.95, range[2]/1000*0.025, (ceiling(ymax_r * 1.1) +1.5)*0.95, length = 0.1, code = 1)
text(range[2]/2000, (ceiling(ymax_r * 1.1) +1.5)*0.93, 'Upstream', cex = 1.5, col = 'black')
text(-(range[2]/2000), (ceiling(ymax_r * 1.1) +1.5)*0.93, 'Downstream', cex = 1.5, col = 'black')
dev.off()
if(!is.null(randomSet)){
count_sites_repeat = plyr::count(freq_sites_repeat)[,2]
count_sites_repeat_ran = plyr::count(freq_sites_repeat_ran)[,2]
count_data = rbind(count_sites_repeat, count_sites_repeat_ran)
png(paste0(outpath, '/Random_distribution_Repeat_', organism, '.png'), width = 1200, height = 750)
barplot(count_data, beside = TRUE, ylim = c(0, (max(count_data)+2)),
main = "Random distribution (Repeat)", xlab = 'Intervals (Kbs)', ylab = "#Integration events",
col = c('Dodgerblue', 'skyblue'), names.arg = ranges[c(1:((length(ranges)-1)/2), ((length(ranges)+3)/2):length(ranges))])
dev.off()
} else {}
cat('Done.\n')
cat('---------- Drawing histograms (Microsatellites) ----------\n')
#### 07. Drawing histograms
ymax_r2 = max(hits_micro)
mid2 = vector("numeric", length = (length(ranges)-1))
for(x in 1:(length(ranges)-1)){
mid2[x] = (ranges[x] + ranges[x+1])/2
}
freq_sites_micro = rep(x = mid2, times = hits_micro)
if(!is.null(randomSet)){
freq_sites_micro_ran = rep(x = mid2, times = hits_micro_ran)
} else {}
## Repeat
png(filename = paste0(outpath, '/Distribution_Microsatellite_', organism, '.png'),
width = 1200, height = 700)
hist_frequency = hist(freq_sites_micro/1000, ylim = c(0, ceiling(ymax_r2 * 1.1)), breaks = ranges/1000,
ylab = '#Integration events', xlab = "kbs",
probability = FALSE, main = NULL, col = 'cornflowerblue',
axes = FALSE, cex.lab = 1.5)
axis(side = 1, at = ranges/1000, cex.axis = 1.5)
axis(side = 2, at = seq(0, ceiling(ymax_r2 * 1.1), 1), cex.axis = 1.5)
text(0, ceiling(ymax_r2 * 1.1), labels = 'Microsatellite', cex = 1.5, font = 2)
arrows(0,0,0,(ceiling(ymax_r2 * 1.1) + 0.5)*0.95, length = 0.15, code = 1, lwd = 3)
arrows(min(ranges/1000), (ceiling(ymax_r2 * 1.1) +0.5)*0.95, range[1]/1000*0.025, (ceiling(ymax_r2 * 1.1) +0.5)*0.95, length = 0.1, code = 1)
arrows(max(ranges/1000), (ceiling(ymax_r2 * 1.1) +0.5)*0.95, range[2]/1000*0.025, (ceiling(ymax_r2 * 1.1) +0.5)*0.95, length = 0.1, code = 1)
text(range[2]/2000, (ceiling(ymax_r2 * 1.1) +0.5)*0.93, 'Upstream', cex = 1.5, col = 'black')
text(-(range[2]/2000), (ceiling(ymax_r2 * 1.1) +0.5)*0.93, 'Downstream', cex = 1.5, col = 'black')
dev.off()
if(!is.null(randomSet)){
count_sites_micro = plyr::count(freq_sites_micro)[,2]
count_sites_micro_ran = plyr::count(freq_sites_micro_ran)[,2]
count_data = rbind(count_sites_micro, count_sites_micro_ran)
png(paste0(outpath, '/Random_distribution_microsatellite_', organism, '.png'), width = 1200, height = 750)
barplot(count_data, beside = TRUE, ylim = c(0, (max(count_data)+2)),
main = "Random distribution (Microsatellite)", xlab = 'Intervals (Kbs)', ylab = "#Integration events",
col = c('Dodgerblue', 'skyblue'), names.arg = ranges[c(1:((length(ranges)-1)/2), ((length(ranges)+3)/2):length(ranges))])
dev.off()
} else {}
if(!is.null(randomSet)){
result_list = vector("list", length = 8)
result_list[[1]] = inside_tab
result_list[[2]] = dist_repeat_tab
result_list[[3]] = dist_repeat_ran_tab
result_list[[4]] = gr_repeat
result_list[[5]] = inside_tab2
result_list[[6]] = dist_micro_tab
result_list[[7]] = dist_micro_ran_tab
result_list[[8]] = gr_micro
names(result_list) = c('Repeat_inside_hits', "Repeat_exp_distribution", "Repeat_random_distribution", "Repeat_data",
"Microsatellite_inside_hits", "Microsatellite_exp_distribution",
"Microsatellite_random_distribution", "Microsatellite_data")
} else {
result_list = vector("list", length = 6)
result_list[[1]] = inside_tab
result_list[[2]] = dist_repeat_tab
result_list[[3]] = gr_repeat
result_list[[4]] = inside_tab2
result_list[[5]] = dist_micro_tab
result_list[[6]] = gr_micro
names(result_list) = c('Repeat_inside_hits', "Repeat_exp_distribution", "Repeat_data",
"Microsatellite_inside_hits", "Microsatellite_exp_distribution", "Microsatellite_data")
}
cat('Done.\n')
cat('---------- Annotation process is finished. ----------\n')
cat(paste0('Finish time : ', date(), '\n'))
return(result_list)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.