#' @title Annotate the vector integrated site by genes and transcription start sites (TSS).
#'
#' @description This function uses Gene and TSS database to search the feature of integrated regions.
#' User can get query sequence inserted in gene and distribution graphes from Gene and TSS coordinations.
#' Plus, user can do random distribution analysis by this function.
#'
#' @usage annoByGene(hits, randomSet = NULL, mapTool = 'blast', organism = 'hg19', interval = 5000, range = c(-20000, 20000),
#` outpath = '~', dbPath = paste0(.libPaths()[1], '/VVIPS/extdata'))
#'
#' @param hits a GR object. This object made from *makeInputs* function.
#' @param randomSet TRUE or FALSE. For random distribution, function makes random set, if user enter TRUE.
#' If this value is FALSE, random distribution analysis is not executed. Default is TRUE.
#' @param mapTool a character. Function serve two types of file such as outputs from BLAST and BLAT.
#' Default is 'blast'. If you want to use BLAT output, use 'blast'.
#' @param organism a single character. This function serves 3 versions of organisms such as hg19, hg38 (Human)
#' and galGal6 (Chicken). Default is 'hg19'.
#' @param interval an integer vector. This number means interval number for distribution analysis. Default is 5000.
#' @param randomSize an integer vector. A random set size. Default is 10000.
#' @param range an integer array. It means the range for highlight region of this analysis. Default range is c(-20000, 20000).
#' @param outpath an string vector. Plots are saved in this path. Default value is R home directory.
#' @param outFile a character vector. Attached ID to the result file name.
#' @param dbPath a string vector. Directory path of database files.
#'
#' @return Return a result list is composed by insertion table (Gene), distribution table (Gene/TSS) and GRobject of Gene and TSS data.
#'
#'
#' @export
annoByGene = function(hits, randomSet = TRUE, mapTool = 'blast', organism = 'hg19', interval = 5000, randomSize = 10000,
range = c(-20000, 20000), outpath = '~', outFile,
dbPath = paste0(.libPaths()[1], '/VVIPS/extdata')){
require(GenomicRanges); require(stringr); require(grDevices); require(ggplot2); require(writexl); require(plyr)
cat('---------- Annotation integrated sites : Gene / TSS ----------\n')
cat(paste0('Start time : ', date(), '\n'))
if(length(which(c('hg19', 'hg38', 'galGal6') %in% organism)) == 0){
return(cat("You can use hg19/hg38/galGal6 data only. ( Input : ", paste(organism, collapse = ','), ")\n",
'---------- Annotation process is halted. ----------\nFinish time : ', date(), '\n'))
} else {}
if(str_ends(dbPath, pattern = '/')){
dbPath = word(dbPath, start = 1, end = nchar(dbPath), sep = '')
} else {}
if(str_ends(outpath, pattern = '/')){
outpath = word(outpath, start = 1, end = nchar(outpath), sep = '')
} else {}
cat('---------- Loading a gene table ----------\n')
#### 01. Load a gene table
dataTable = read.delim(paste0(dbPath, '/Ensembl_gene_', organism, '.txt'),
stringsAsFactors = FALSE, header = TRUE)[,-c(3:5,10:11)]
dataTable = subset(subset(dataTable, dataTable$hgnc_symbol != ''), str_detect(dataTable$chromosome_name, '_') == FALSE)
dataTable$chromosome_name = paste0('chr', dataTable$chromosome_name)
dataTable$strand[dataTable$strand == '1'] = '+'; dataTable$strand[dataTable$strand == '-1'] = '-';
g_tab = data.frame(unique(dataTable[,c(1,2,3,4,5,6,8,9)]), stringsAsFactors = FALSE)
names(g_tab) = c('symbol', 'ensembl_id', 'chr', 'start', 'end', 'strand', 'percentage_gc_content', 'description')
t_tab = data.frame(unique(dataTable[,c(1,2,3,7,7,8,9)]), stringsAsFactors = FALSE)
names(t_tab) = c('symbol', 'ensembl_id', 'chr', 'start', 'end', 'percentage_gc_content', 'description')
cat('Done.\n')
if(randomSet){
cat('---------- Make a random set ----------\n')
#### 01. Generate random set
set.seed(123456);
ch_size = readRDS(file = system.file("extdata", paste0(organism, '_chrom.rds'), package = "VVIPS"))
ch_size_num = as.numeric(ch_size$length)
ch_start = cumsum(ch_size_num) - ch_size_num + 1
ch_end = cumsum(ch_size_num)
ch_ratio = (ch_size_num / sum(ch_size_num))
random_set = sample(ch_size$chrom, size = randomSize, replace = TRUE, prob = ch_ratio)
count_ch = plyr::count(random_set); row.names(count_ch) = count_ch$x
count_ch = count_ch[ch_size$chrom,]
count_ch = cbind(count_ch, ch_size_num)
ran_set = apply(count_ch, 1, function(x){sample(c(1:x[3]), size = x[2], replace = FALSE, prob = NULL)})
chr_ran = rep(paste0('chr', count_ch$x), count_ch$freq)
pos_ran = unlist(ran_set)
ran_tab = data.frame('Random' = c(1:randomSize), 'Random_chr' = chr_ran, 'Random_pos' = pos_ran, stringsAsFactors = FALSE)
write.table(ran_tab, file = paste0(outpath, '/', outFile, '_Random_set_', organism, '_gene.txt'),
quote = FALSE, append = FALSE, sep = '\t', na = '', row.names = FALSE, col.names = TRUE)
cat('Done.\n')
} else {}
cat('---------- Creating a GRanges object ----------\n')
#### 02. Make GR object by a gene table
gr_genes = makeGRangesFromDataFrame(g_tab, keep.extra.columns = TRUE, ignore.strand = FALSE)
gr_tss = makeGRangesFromDataFrame(t_tab, keep.extra.columns = TRUE, ignore.strand = FALSE)
#### 03. Make interval GR objects of a gene table
ranges = seq(from = range[1], to = range[2], interval)
gr_genes_dist = vector("list", length = (length(ranges)-1))
gr_tss_dist = vector("list", length = (length(ranges)-1))
for(x in 1:(length(ranges)-1)){
# Gene
tmp_start = gr_genes@ranges@start + ranges[x]
tmp_end = gr_genes@ranges@start + ranges[x+1]-1
gr_genes_dist[[x]] = GRanges(seqnames = as.character(gr_genes@seqnames),
ranges = IRanges(start = tmp_start,
end = tmp_end),
symbol = gr_genes$symbol,
dist_label = rep(paste0(ranges[x], ' <= X < ', ranges[x+1]), length(gr_genes)),
strand = '*')
# TSS
tmp_start = gr_tss@ranges@start + ranges[x]
tmp_end = gr_tss@ranges@start + ranges[x+1]-1
gr_tss_dist[[x]] = GRanges(seqnames = as.character(gr_tss@seqnames),
ranges = IRanges(start = tmp_start,
end = tmp_end),
symbol = gr_tss$symbol,
dist_label = rep(paste0(ranges[x], ' <= X < ', ranges[x+1]), length(gr_tss)),
strand = '*')
}
#### 04. Make random GR object
if(randomSet){
tmp = ran_tab[,c(2,3,3)]
names(tmp) = c('chr', 'start', 'end')
gr_random = makeGRangesFromDataFrame(tmp)
} else {
cat("[WARN] Random distribution analysis will not be executed.\n")
}
cat('Done.\n')
cat('---------- Annotating integrated regions ----------\n')
#### 05. Make gene information
inside_gene = as.data.frame(findOverlaps(hits, gr_genes, type = 'any',
ignore.strand = TRUE),
stringsAsFactors = FALSE)
a = as.data.frame(hits[inside_gene$queryHits,], stringsAsFactors = FALSE)
b = g_tab[inside_gene$subjectHits,]
inside_tab = cbind(a,b)[,c(6,1,2,3,8:15)]
names(inside_tab) = c('q_name', 'q_chr', 'q_start', 'q_end', 'symbol', 'ensembl_id', 'chr', 'start', 'end',
'strand', 'percentage_gc_content', 'description')
if(randomSet){
inside_gene_ran = as.data.frame(findOverlaps(gr_random, gr_genes, type = 'any', ignore.strand = TRUE),
stringsAsFactors = FALSE)
a = as.data.frame(gr_random[inside_gene_ran$queryHits,], stringsAsFactors = FALSE)
b = g_tab[inside_gene_ran$subjectHits,]
inside_ran_tab = cbind(a,b)[,c(1,2,3,6:13)]
names(inside_ran_tab) = c('q_chr', 'q_start', 'q_end', 'symbol', 'ensembl_id', 'chr', 'start', 'end', 'strand',
'percentage_gc_content', 'description')
dist_gene_ran = vector("list", length = length(ranges)-1)
dist_tss_ran = vector("list", length = length(ranges)-1)
dist_gene_ran_tab = data.frame(stringsAsFactors = FALSE)
dist_tss_ran_tab = data.frame(stringsAsFactors = FALSE)
} else {}
dist_gene = vector("list", length = length(ranges)-1)
dist_tss = vector("list", length = length(ranges)-1)
dist_gene_tab = data.frame(stringsAsFactors = FALSE)
dist_tss_tab = data.frame(stringsAsFactors = FALSE)
for(x in 1:(length(ranges)-1)){
dist_gene[[x]] = as.data.frame(findOverlaps(query = hits,
subject = gr_genes_dist[[x]],
type = "any", ignore.strand = TRUE),
stringsAsFactors = FALSE)
a = as.data.frame(hits[dist_gene[[x]]$queryHits,], stringsAsFactors = FALSE)
b = as.data.frame(gr_genes_dist[[x]][dist_gene[[x]]$subjectHits,], stringsAsFactors = FALSE)
tmp1 = cbind(a,b)
dist_tss[[x]] = as.data.frame(findOverlaps(query = hits,
subject = gr_tss_dist[[x]],
type = "any", ignore.strand = TRUE),
stringsAsFactors = FALSE)
a = as.data.frame(hits[dist_tss[[x]]$queryHits,], stringsAsFactors = FALSE)
b = as.data.frame(gr_tss_dist[[x]][dist_tss[[x]]$subjectHits,], stringsAsFactors = FALSE)
tmp2 = cbind(a,b)
dist_gene_tab = rbind(dist_gene_tab, tmp1)
dist_tss_tab = rbind(dist_tss_tab, tmp2)
if(randomSet){
dist_gene_ran[[x]] = as.data.frame(findOverlaps(query = gr_random,
subject = gr_genes_dist[[x]],
type = "any", ignore.strand = TRUE),
stringsAsFactors = FALSE)
a = as.data.frame(gr_random[dist_gene_ran[[x]]$queryHits,], stringsAsFactors = FALSE)
b = as.data.frame(gr_genes_dist[[x]][dist_gene_ran[[x]]$subjectHits,], stringsAsFactors = FALSE)
tmp1 = cbind(a,b)
dist_tss_ran[[x]] = as.data.frame(findOverlaps(query = gr_random,
subject = gr_tss_dist[[x]],
type = "any", ignore.strand = TRUE),
stringsAsFactors = FALSE)
a = as.data.frame(gr_random[dist_tss_ran[[x]]$queryHits,], stringsAsFactors = FALSE)
b = as.data.frame(gr_tss_dist[[x]][dist_tss_ran[[x]]$subjectHits,], stringsAsFactors = FALSE)
tmp2 = cbind(a,b)
dist_gene_ran_tab = rbind(dist_gene_ran_tab, tmp1)
dist_tss_ran_tab = rbind(dist_tss_ran_tab, tmp2)
} else {}
}
#### 06. Count the number of hit in each range
count_hit_genes = vector("list", length = length(ranges)-1)
count_hit_tss = vector("list", length = length(ranges)-1)
if(randomSet){
count_hit_genes_ran = vector("list", length = length(ranges)-1)
count_hit_tss_ran = vector("list", length = length(ranges)-1)
} else {}
for(x in 1:(length(ranges)-1)){
count_hit_genes[[x]] = countOverlaps(query = hits, subject = gr_genes_dist[[x]], type = "any", ignore.strand = TRUE)
count_hit_tss[[x]] = countOverlaps(query = hits, subject = gr_tss_dist[[x]], type = "any", ignore.strand = TRUE)
if(randomSet){
count_hit_genes_ran[[x]] = countOverlaps(query = gr_random, subject = gr_genes_dist[[x]], type = "any", ignore.strand = TRUE)
count_hit_tss_ran[[x]] = countOverlaps(query = gr_random, subject = gr_tss_dist[[x]], type = "any", ignore.strand = TRUE)
} else {}
}
hits_gene = as.numeric(apply(as.data.frame(count_hit_genes, stringsAsFactors = FALSE), MARGIN = 2, sum))
hits_tss = as.numeric(apply(as.data.frame(count_hit_tss, stringsAsFactors = FALSE), MARGIN = 2, sum))
if(randomSet){
hits_gene_ran = as.numeric(apply(as.data.frame(count_hit_genes_ran, stringsAsFactors = FALSE), MARGIN = 2, sum))
hits_tss_ran = as.numeric(apply(as.data.frame(count_hit_tss_ran, stringsAsFactors = FALSE), MARGIN = 2, sum))
} else {}
#### 07. Drawing histograms
ymax_g = max(hits_gene); ymax_t = max(hits_tss)
mid = vector("numeric", length = (length(ranges)-1))
for(x in 1:(length(ranges)-1)){
mid[x] = (ranges[x] + ranges[x+1])/2
}
freq_sites_gene = rep(x = mid, times = hits_gene)
freq_sites_tss = rep(x = mid, times = hits_tss)
if(randomSet){
freq_sites_gene_ran = rep(x = mid, times = hits_gene_ran)
freq_sites_tss_ran = rep(x = mid, times = hits_tss_ran)
} else {}
dist_gene_tab = dist_gene_tab[,c(6, 1:3, 13, 8:10, 14)]
names(dist_gene_tab) = c('q_name', 'q_chr', 'q_start', 'q_end',
'gene', 'r_chr', 'r_start', 'r_end', 'range')
dist_tss_tab = dist_tss_tab[,c(6, 1:3, 13, 8:10, 14)]
names(dist_tss_tab) = c('q_name', 'q_chr', 'q_start', 'q_end',
'gene', 'r_chr', 'r_start', 'r_end', 'range')
cat('Done.\n')
cat('---------- Drawing histograms ----------\n')
count_sites_gene = plyr::count(freq_sites_gene)[,2]
count_sites_tss = plyr::count(freq_sites_tss)[,2]
if(randomSet){
# Gene
count_sites_gene_ran = plyr::count(freq_sites_gene_ran)[,2]
count_data = data.frame('Range' = factor(rep(ranges[ranges != 0]/1000, 2), levels = ranges[ranges != 0]/1000),
'Group' = c(rep('Observed', length(count_sites_gene)), rep('Random', length(count_sites_gene_ran))),
'Count' = c(count_sites_gene, count_sites_gene_ran),
'Freq' = c(count_sites_gene/sum(count_sites_gene),
count_sites_gene_ran/sum(count_sites_gene_ran)))
png(paste0(outpath, '/', outFile, '_Random_distribution_gene_', organism, '.png'), width = 1200, height = 750)
g_plot = ggplot(count_data) + geom_bar(aes(x = Range, y = Freq, fill = Group), stat = "identity", position = "dodge") +
lims(y = c(0, max(count_data$Freq))) + ggtitle(label = "Random distribution (Gene)") +
xlab('Intervals (Kbs)') + ylab("Ratio of Integration Events") + scale_fill_manual(values = c('Dodgerblue', 'skyblue')) +
theme(panel.background = element_rect(fill="white", colour = "white"),
panel.grid.major = element_line(size = 0.5, linetype = 'dotted', colour = 'black'),
axis.line = element_line(colour = "darkgrey"), legend.title = element_blank(),
legend.key.size = unit(0.7, "cm"), plot.title = element_text(hjust = 0.5, face = "bold", size = 15),
legend.text = element_text(size = 13), axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12), axis.title.x = element_text(size = 13), axis.title.y = element_text(size = 13))
print(g_plot)
dev.off()
# TSS
count_sites_tss_ran = plyr::count(freq_sites_tss_ran)[,2]
count_data_tss = data.frame('Range' = factor(rep(ranges[ranges != 0]/1000, 2), levels = ranges[ranges != 0]/1000),
'Group' = c(rep('Observed', length(count_sites_tss)), rep('Random', length(count_sites_tss_ran))),
'Count' = c(count_sites_tss, count_sites_tss_ran),
'Freq' = c(count_sites_tss/sum(count_sites_tss),
count_sites_tss_ran/sum(count_sites_tss_ran)))
png(paste0(outpath, '/', outFile, '_Random_distribution_tss_', organism, '.png'), width = 1200, height = 750)
t_plot = ggplot(count_data_tss) + geom_bar(aes(x = Range, y = Freq, fill = Group), stat = "identity", position = "dodge") +
lims(y = c(0, max(count_data_tss$Freq))) + ggtitle(label = "Random distribution (TSS)") +
xlab('Intervals (Kbs)') + ylab("Ratio of Integration Events") + scale_fill_manual(values = c('Dodgerblue', 'skyblue')) +
theme(panel.background = element_rect(fill="white", colour = "white"),
panel.grid.major = element_line(size = 0.5, linetype = 'dotted', colour = 'black'),
axis.line = element_line(colour = "darkgrey"), legend.title = element_blank(),
legend.key.size = unit(0.7, "cm"), plot.title = element_text(hjust = 0.5, face = "bold", size = 15),
legend.text = element_text(size = 13), axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12), axis.title.x = element_text(size = 13), axis.title.y = element_text(size = 13))
print(t_plot)
dev.off()
result_list = vector("list", length = 7)
result_list[[1]] = inside_tab
result_list[[2]] = dist_gene_tab
result_list[[3]] = dist_gene_ran_tab
result_list[[4]] = gr_genes
result_list[[5]] = dist_tss_tab
result_list[[6]] = dist_tss_ran_tab
result_list[[7]] = gr_tss
names(result_list) = c('Gene_inside_hits', 'Gene_exp_distribution', 'Gene_random_distribution', 'Gene_data',
'TSS_exp_distribution', 'TSS_random_distribution', 'TSS_data')
count_table = data.frame('Ranges' = paste0(ranges[1:length(ranges)-1], ' <= X < ', ranges[2:length(ranges)]),
'Gene' = count_sites_gene,
'Random_g' = count_sites_gene_ran,
'TSS' = count_sites_tss,
'Random_t' = count_sites_tss_ran,
stringsAsFactors = FALSE)
## Difference
prop_list_g = suppressWarnings(apply(count_table[,c(2,3)], 1, function(t){prop.test(x = c(t[1], t[2]), n = c(sum(count_sites_gene), sum(count_sites_gene_ran)), correct = FALSE)}))
prop_list_t = suppressWarnings(apply(count_table[,c(4,5)], 1, function(t){prop.test(x = c(t[1], t[2]), n = c(sum(count_sites_tss), sum(count_sites_tss_ran)), correct = FALSE)}))
count_table = cbind(count_table[,c(1,2,3)], 'Gene_p_value' = unlist(lapply(prop_list_g, function(t){t$p.value})),
count_table[,c(4,5)], 'TSS_p_value' = unlist(lapply(prop_list_t, function(t){t$p.value})))
} else {
## Gene
den = density(freq_sites_gene/1000)
png(filename = paste0(outpath, '/', outFile, '_Distribution_gene_', organism, '.png'),
width = 1200, height = 700)
hist_freq = hist(freq_sites_gene/1000, breaks = ranges/1000, plot = FALSE)
hist_frequency = hist(freq_sites_gene/1000, breaks = ranges/1000,
ylab = 'Density', xlab = "kbs",
ylim = c(0, round(max(hist_freq$density),2)*1.5),
probability = TRUE, main = NULL, col = 'cornflowerblue',
axes = FALSE, cex.lab = 1.3)
axis(side = 1, at = ranges/1000, cex.axis = 1.5)
axis(side = 2, at = seq(0, round(max(hist_freq$density),2)*1.5, 0.01), cex.axis = 1.5)
text(0, round(max(hist_freq$density),2)*1.5, labels = 'Gene', cex = 1.7, font = 2)
arrows(0,0,0,round(max(hist_freq$density),2)*1.5-0.005, length = 0.15, code = 1, lwd = 3)
arrows(min(ranges/1000), round(max(hist_freq$density),2)*1.5-0.005, range[1]/1000*0.025, round(max(hist_freq$density),2)*1.5-0.005, length = 0.1, code = 1)
arrows(max(ranges/1000), round(max(hist_freq$density),2)*1.5-0.005, range[2]/1000*0.025, round(max(hist_freq$density),2)*1.5-0.005, length = 0.1, code = 1)
text(range[2]/2000, round(max(hist_freq$density),2)*1.5-0.01, 'Upstream', cex = 1.5, col = 'black')
text(-(range[2]/2000), round(max(hist_freq$density),2)*1.5-0.01, 'Downstream', cex = 1.5, col = 'black')
dev.off()
## TSS
png(filename = paste0(outpath, '/', outFile, '_Distribution_TSS_', organism, '.png'),
width = 1200, height = 700)
hist_freq = hist(freq_sites_tss/1000, breaks = ranges/1000, plot = FALSE)
hist_frequency = hist(freq_sites_tss/1000, breaks = ranges/1000,
ylab = 'Density', xlab = "kbs",
ylim = c(0, round(max(hist_freq$density),2)*1.5),
probability = TRUE, main = NULL, col = 'cornflowerblue',
axes = FALSE, cex.lab = 1.3)
axis(side = 1, at = ranges/1000, cex.axis = 1.5)
axis(side = 2, at = seq(0, round(max(hist_freq$density),2)*1.5, 0.01), cex.axis = 1.5)
text(0, round(max(hist_freq$density),2)*1.5, labels = 'TSS', cex = 1.7, font = 2)
arrows(0,0,0,round(max(hist_freq$density),2)*1.5-0.005, length = 0.15, code = 1, lwd = 3)
arrows(min(ranges/1000), round(max(hist_freq$density),2)*1.5-0.005, range[1]/1000*0.025, round(max(hist_freq$density),2)*1.5-0.005, length = 0.1, code = 1)
arrows(max(ranges/1000), round(max(hist_freq$density),2)*1.5-0.005, range[2]/1000*0.025, round(max(hist_freq$density),2)*1.5-0.005, length = 0.1, code = 1)
text(range[2]/2000, round(max(hist_freq$density),2)*1.5-0.01, 'Upstream', cex = 1.5, col = 'black')
text(-(range[2]/2000), round(max(hist_freq$density),2)*1.5-0.01, 'Downstream', cex = 1.5, col = 'black')
dev.off()
result_list = vector("list", length = 5)
result_list[[1]] = inside_tab
result_list[[2]] = dist_gene_tab
result_list[[3]] = gr_genes
result_list[[4]] = dist_tss_tab
result_list[[5]] = gr_tss
names(result_list) = c('Gene_inside_hits', 'Gene_exp_distribution', 'Gene_data', 'TSS_exp_distribution',
'TSS_data')
count_table = data.frame('Ranges' = paste0(ranges[1:length(ranges)-1], ' <= X < ',
ranges[2:length(ranges)]),
'Gene' = count_sites_gene,
'TSS' = count_sites_tss,
stringsAsFactors = FALSE)
}
### Writing documents
if(randomSet){
if(!file.exists(paste0(outpath, '/', outFile, '_Distribution_results_gene_', str_remove_all(Sys.Date(), '-'), '.xlsx'))){
writexl::write_xlsx(list(inside_genes = result_list[[1]],
dist_freq = count_table,
dist_gene = result_list[[2]],
dist_tss = result_list[[5]]),
path = paste0(outpath, '/', outFile, '_Distribution_results_gene_', str_remove_all(Sys.Date(), '-'), '.xlsx'))
} else {cat("[WARN] Result file can not be created because a file with the same name exists.\n")}
} else {
if(!file.exists(paste0(outpath, '/', outFile, '_Distribution_results_gene_noRandom_', str_remove_all(Sys.Date(), '-'), '.xlsx'))){
writexl::write_xlsx(list(inside_genes = result_list[[1]],
dist_freq = count_table,
dist_gene = result_list[[2]],
dist_tss = result_list[[4]]),
path = paste0(outpath, '/', outFile, '_Distribution_results_gene_noRandom_', str_remove_all(Sys.Date(), '-'), '.xlsx'))
} else {cat("[WARN] Result file can not be created because a file with the same name exists.\n")}
}
cat('---------- Annotation process is finished. ----------\n')
cat(paste0('Finish time : ', date(), '\n'))
return(result_list)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.