#' @title Make an input table by BLAT/BLAST results
#'
#' @description Annotation funcions in this package need specific format of inputs. So, this function converts
#' raw data file to avaliable format of objects. In this part, user can use *BLAST* and *BLAT* output format only.
#' About this format, check the website of BLAST and UCSC genome browser.
#'
#' @usage makeInputs(inputPath, outputPath, mapTool = 'blast', identity = 90, vectorPos = 'front')
#'
#' @param inputPath a string vector. Location of a local alignment result file. When you use BLAT result file (*.psl),
#' do not include any header and comment.
#' @param outputPath a string vector. Location of tab-deliminated hit files.
#' @param mapTool a character vector. Function serves two types of file such as outputs from BLAST and BLAT.
#' Default is 'blast'. If you want to use file from BLAT, use 'blat'.
#' @param identity a numeric vector. Select more proper hits by identity score. Default value is 90.
#' @param vectorPos a character vector. Sets the position of the vector. Default value is 'front'. If the vector is located at the back, you can change it to 'behind'.
#'
#'
#' @return Return two types of inputs. One is data table format inputs and other is GenomicRange(GR) object type.
#'
#'
#' @export
makeInputs = function(inputPath, outputPath, mapTool = 'blast', identity = 90, vectorPos = 'front'){
library(stringr)
output_list = vector("list", length = 2)
cat('---------- Creating input objects for site annotation. ----------\n')
cat(paste0('Start time : ', date(), '\n'))
if(length(which(c('blat', 'blast') %in% mapTool)) == 0){
return(cat("[ERROR] You can use cetain data format files such as BLAST and BLAT. ( Input : ", paste(mapTool, collapse = ','), ")\n",
'---------- Creating objects is halted. ----------\nFinish time : ', date(), '\n'))
} else if(length(which(c('blat', 'blast') %in% mapTool)) == 2){
return(cat("[ERROR] You can use only one kind of data format. ( Input : ", paste(mapTool, collapse = ','), ")\n",
'---------- Creating objects is halted. ----------\nFinish time : ', date(), '\n'))
} else {}
cat('---------- Creating a table object ----------\n')
cat('Read an input table file.\n')
new_tab = data.frame(stringsAsFactors = FALSE); uniq = NULL; dups = NULL;
if(mapTool == 'blast'){
hitTable = read.delim(inputPath, header = FALSE, comment.char="#", stringsAsFactors = FALSE)
if(ncol(hitTable) != 12){
return(cat("[ERROR] Please check your input file or file format!\n",
'---------- Creating objects is halted. ----------\nFinish time : ', date(), '\n'))
} else {}
hitTable = subset(hitTable, str_detect(hitTable[,2], '_') == FALSE)
hitTable = hitTable[,-c(5,6,12)]
colnames(hitTable) = c('qname', 'sname', 'identity', 'align_length',
'qstart', 'qend', 'sstart', 'send', 'expect_value')
cat('Creating a table object.\n')
hitTable = hitTable[as.numeric(hitTable$identity) >= identity,]
if(length(unique(hitTable$qname)) != nrow(hitTable)){
qname_list = unique(hitTable$qname)
for(x in qname_list){
tmp = subset(hitTable, hitTable$qname == x)
new_row = tmp[which(tmp$identity == max(tmp$identity)),]
if(nrow(new_row) > 1){
new_row = new_row[which(new_row$expect_value == min(new_row$expect_value)),]
} else {}
new_tab = rbind(new_tab, new_row)
}
} else {new_tab = hitTable}
check = which(new_tab$sstart > new_tab$send)
strand = rep('+', nrow(new_tab))
strand[check] = '-'
new_tab = cbind(new_tab[,c(1:8)], strand, 'expect_value' = new_tab[,9])
tmp1 = new_tab$sstart[check]
tmp2 = new_tab$send[check]
new_tab$sstart[check] = tmp2
new_tab$send[check] = tmp1
dups = new_tab[which(duplicated(new_tab[,c(2,3,7,8,9)]) | duplicated(new_tab[,c(2,3,7,8,9)], fromLast = TRUE)),]
if(nrow(dups) != 0){
uniq = new_tab[-which(duplicated(new_tab[,c(2,3,7,8,9)]) | duplicated(new_tab[,c(2,3,7,8,9)], fromLast = TRUE)),]
} else {uniq = new_tab}
hitTable = uniq
output_list[[1]] = uniq
} else if(mapTool == 'blat'){
hitTable = read.delim(inputPath, header = FALSE, stringsAsFactors = FALSE)
if(ncol(hitTable) != 21){
return(cat("[ERROR] Please check your input file or file format!\n",
'---------- Creating a table is halted. ----------\nFinish time : ', date(), '\n'))
} else {}
# BLAT can find perfect sequence matches by more than 25 bases sequences.
hitTable = subset(hitTable, hitTable[,11] >= 25)
hitTable = subset(hitTable, str_detect(hitTable[,14], '_') == FALSE)
hitTable = hitTable[,c(1,9:17)]
colnames(hitTable) = c('match', 'strand', 'qname', 'qsize', 'qstart', 'qend',
'tname', 'tsize', 'tstart', 'tend')
cat('Creating a table object.\n')
hitTable = cbind(hitTable, 'identity' = round(as.numeric(hitTable$match) /
(abs(as.numeric(hitTable$qstart)-as.numeric(hitTable$qend))) * 100))
hitTable = hitTable[as.numeric(hitTable$identity) >= identity,]
hitTable = cbind(hitTable, 'cover' = round(as.numeric(hitTable$match)/as.numeric(hitTable$qsize)*100))
if(length(unique(hitTable$qname)) != nrow(hitTable)){
qname_list = unique(hitTable$qname)
for(x in qname_list){
tmp = subset(hitTable, hitTable$qname == x)
tmp_row = tmp[which(tmp$cover == max(tmp$cover)),]
if(nrow(tmp_row) != 1){
tmp_row = tmp_row[which(tmp_row$identity == max(tmp_row$identity)),]
} else {}
new_row = tmp_row
new_tab = rbind(new_tab, new_row)
}
} else {new_tab = hitTable}
dups = new_tab[which(duplicated(new_tab[,c(1,2,3,4,11,12)]) | duplicated(new_tab[,c(1,2,3,4,11,12)], fromLast = TRUE)),]
if(nrow(dups)!= 0){
uniq = new_tab[-which(duplicated(new_tab[,c(1,2,3,4,11,12)]) | duplicated(new_tab[,c(1,2,3,4,11,12)], fromLast = TRUE)),]
} else {uniq = new_tab}
hitTable = uniq
output_list[[1]] = uniq
} else {}
cat('---------- Writing hit files ----------\n')
write.table(uniq, file = paste0(outputPath, '/Used_hits_from_', mapTool, '.txt'),
append = FALSE, quote = FALSE, sep = '\t', na = '', row.names = FALSE,
col.names = TRUE)
write.table(dups, file = paste0(outputPath, '/Uncertain_hits_from_', mapTool, '.txt'),
append = FALSE, quote = FALSE, sep = '\t', na = '', row.names = FALSE,
col.names = TRUE)
cat('Done.\n')
cat('---------- Creating a GRanges object ----------\n')
#### 02, Make GR object of a input table
if(mapTool == 'blast'){
pos = vector("integer", length = nrow(hitTable))
if(vectorPos == 'front'){
pos[which(hitTable$strand == '+')] = as.numeric(hitTable$sstart)[which(hitTable$strand == '+')] - 1
pos[which(hitTable$strand == '-')] = as.numeric(hitTable$send)[which(hitTable$strand == '-')] + 1
} else if(vectorPos == 'behind'){
pos[which(hitTable$strand == '-')] = as.numeric(hitTable$sstart)[which(hitTable$strand == '-')] - 1
pos[which(hitTable$strand == '+')] = as.numeric(hitTable$send)[which(hitTable$strand == '+')] + 1
} else {
return(cat("[ERROR] Please check position of vector!\n",
'---------- Creating a table is halted. ----------\nFinish time : ', date(), '\n'))
}
gr_hits = GenomicRanges::GRanges(seqname = hitTable$sname,
ranges = IRanges::IRanges(start = pos, end = pos),
strand = '*', query = hitTable$qname,
identity = as.numeric(hitTable$identity))
} else if(mapTool == 'blat'){
pos = vector("integer", length = nrow(hitTable))
if(vectorPos == 'front'){
pos[which(hitTable$strand == '+')] = as.numeric(hitTable$tstart)[which(hitTable$strand == '+')] - 1
pos[which(hitTable$strand == '-')] = as.numeric(hitTable$tend)[which(hitTable$strand == '-')] + 1
} else if(vectorPos == 'behind'){
pos[which(hitTable$strand == '-')] = as.numeric(hitTable$tstart)[which(hitTable$strand == '-')] - 1
pos[which(hitTable$strand == '+')] = as.numeric(hitTable$tend)[which(hitTable$strand == '+')] + 1
} else {
return(cat("[ERROR] Please check position of vector!\n",
'---------- Creating a table is halted. ----------\nFinish time : ', date(), '\n'))
}
gr_hits = GenomicRanges::GRanges(seqname = hitTable$tname,
ranges = IRanges::IRanges(start = pos, end = pos),
strand = '*', query = hitTable$qname,
identity = as.numeric(hitTable$identity))
} else {}
output_list[[2]] = gr_hits
gr_hits_tab = as.data.frame(gr_hits)[,c(6,1,2)]
names(gr_hits_tab) = c('Query', 'Insertion_chr', 'Insertion_pos')
write.table(gr_hits_tab, file = paste0(outputPath, '/Integration_sites_', mapTool, '.txt'),
append = FALSE, quote = FALSE, sep = '\t', na = '', row.names = FALSE, col.names = TRUE)
cat('Done.\n')
cat('---------- Creating a table is finished. ----------\n')
cat(paste0('Finish time : ', date(), '\n'))
return(output_list)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.