R/makeInputs.R

Defines functions makeInputs

Documented in makeInputs

#' @title Make an input table by BLAT/BLAST results
#' 
#' @description Annotation funcions in this package need specific format of inputs. So, this function converts
#'              raw data file to avaliable format of objects. In this part, user can use *BLAST* and *BLAT* output format only.
#'              About this format, check the website of BLAST and UCSC genome browser.
#'              
#' @usage makeInputs(inputPath, outputPath, mapTool = 'blast', identity = 90, vectorPos = 'front')
#' 
#' @param inputPath a string vector. Location of a local alignment result file. When you use BLAT result file (*.psl),
#'                  do not include any header and comment.
#' @param outputPath a string vector. Location of tab-deliminated hit files.
#' @param mapTool a character vector. Function serves two types of file such as outputs from BLAST and BLAT.
#'                Default is 'blast'. If you want to use file from BLAT, use 'blat'.
#' @param identity a numeric vector. Select more proper hits by identity score. Default value is 90.
#' @param vectorPos a character vector. Sets the position of the vector. Default value is 'front'. If the vector is located at the back, you can change it to 'behind'. 
#' 
#' 
#' @return Return two types of inputs. One is data table format inputs and other is GenomicRange(GR) object type.
#' 
#' 
#' @export
 
makeInputs = function(inputPath, outputPath, mapTool = 'blast', identity = 90, vectorPos = 'front'){
  library(stringr)
  output_list = vector("list", length = 2)

  cat('---------- Creating input objects for site annotation.  ----------\n')
  cat(paste0('Start time : ', date(), '\n'))

  if(length(which(c('blat', 'blast') %in% mapTool)) == 0){
    return(cat("[ERROR] You can use cetain data format files such as BLAST and BLAT. ( Input : ", paste(mapTool, collapse = ','), ")\n",
               '---------- Creating objects is halted. ----------\nFinish time : ', date(), '\n'))
  } else if(length(which(c('blat', 'blast') %in% mapTool)) == 2){
    return(cat("[ERROR] You can use only one kind of data format. ( Input : ", paste(mapTool, collapse = ','), ")\n",
               '---------- Creating objects is halted. ----------\nFinish time : ', date(), '\n'))
  } else {}

  cat('---------- Creating a table object ----------\n')
  cat('Read an input table file.\n')
  
  new_tab = data.frame(stringsAsFactors = FALSE); uniq = NULL; dups = NULL;
  if(mapTool == 'blast'){
    hitTable = read.delim(inputPath, header = FALSE, comment.char="#", stringsAsFactors = FALSE)
    if(ncol(hitTable) != 12){
      return(cat("[ERROR] Please check your input file or file format!\n",
                 '---------- Creating objects is halted. ----------\nFinish time : ', date(), '\n'))
    } else {}
    hitTable = subset(hitTable, str_detect(hitTable[,2], '_') == FALSE)
    hitTable = hitTable[,-c(5,6,12)]
    colnames(hitTable) = c('qname', 'sname', 'identity', 'align_length',
                           'qstart', 'qend', 'sstart', 'send', 'expect_value')
    cat('Creating a table object.\n')

    hitTable = hitTable[as.numeric(hitTable$identity) >= identity,]
    if(length(unique(hitTable$qname)) != nrow(hitTable)){
      qname_list = unique(hitTable$qname)
      for(x in qname_list){
        tmp = subset(hitTable, hitTable$qname == x)
        new_row = tmp[which(tmp$identity == max(tmp$identity)),]
        if(nrow(new_row) > 1){
          new_row = new_row[which(new_row$expect_value == min(new_row$expect_value)),]
        } else {}
        new_tab = rbind(new_tab, new_row)
      }

    } else {new_tab = hitTable}

    
    check = which(new_tab$sstart > new_tab$send)
    strand = rep('+', nrow(new_tab))
    strand[check] = '-'
    new_tab = cbind(new_tab[,c(1:8)], strand, 'expect_value' = new_tab[,9])
    tmp1 = new_tab$sstart[check]
    tmp2 = new_tab$send[check]
    new_tab$sstart[check] = tmp2
    new_tab$send[check] = tmp1
    
    dups = new_tab[which(duplicated(new_tab[,c(2,3,7,8,9)]) | duplicated(new_tab[,c(2,3,7,8,9)], fromLast = TRUE)),]
    if(nrow(dups) != 0){
      uniq = new_tab[-which(duplicated(new_tab[,c(2,3,7,8,9)]) | duplicated(new_tab[,c(2,3,7,8,9)], fromLast = TRUE)),]
    } else {uniq = new_tab}
    hitTable = uniq
    output_list[[1]] = uniq
    
  } else if(mapTool == 'blat'){
    hitTable = read.delim(inputPath, header = FALSE, stringsAsFactors = FALSE)
    if(ncol(hitTable) != 21){
      return(cat("[ERROR] Please check your input file or file format!\n",
                 '---------- Creating a table is halted. ----------\nFinish time : ', date(), '\n'))
    } else {}
    # BLAT can find perfect sequence matches by more than 25 bases sequences.
    hitTable = subset(hitTable, hitTable[,11] >= 25)
    hitTable = subset(hitTable, str_detect(hitTable[,14], '_') == FALSE)
    hitTable = hitTable[,c(1,9:17)]
    colnames(hitTable) = c('match', 'strand', 'qname', 'qsize', 'qstart', 'qend',
                           'tname', 'tsize', 'tstart', 'tend')
    cat('Creating a table object.\n')
    
    hitTable = cbind(hitTable, 'identity' = round(as.numeric(hitTable$match) /
                                                    (abs(as.numeric(hitTable$qstart)-as.numeric(hitTable$qend))) * 100))
    hitTable = hitTable[as.numeric(hitTable$identity) >= identity,]
    hitTable = cbind(hitTable, 'cover' = round(as.numeric(hitTable$match)/as.numeric(hitTable$qsize)*100))
    if(length(unique(hitTable$qname)) != nrow(hitTable)){
      qname_list = unique(hitTable$qname)
      for(x in qname_list){
        tmp = subset(hitTable, hitTable$qname == x)
        tmp_row = tmp[which(tmp$cover == max(tmp$cover)),]
        if(nrow(tmp_row) != 1){
          tmp_row = tmp_row[which(tmp_row$identity == max(tmp_row$identity)),]
        } else {}
        new_row = tmp_row  
        new_tab = rbind(new_tab, new_row)
      }
    } else {new_tab = hitTable}
    
    dups = new_tab[which(duplicated(new_tab[,c(1,2,3,4,11,12)]) | duplicated(new_tab[,c(1,2,3,4,11,12)], fromLast = TRUE)),]
    if(nrow(dups)!= 0){
      uniq = new_tab[-which(duplicated(new_tab[,c(1,2,3,4,11,12)]) | duplicated(new_tab[,c(1,2,3,4,11,12)], fromLast = TRUE)),]
    } else {uniq = new_tab}
    hitTable = uniq
    output_list[[1]] = uniq
  } else {}
  
  cat('---------- Writing hit files ----------\n')
  write.table(uniq, file = paste0(outputPath, '/Used_hits_from_', mapTool, '.txt'),
              append = FALSE, quote = FALSE, sep = '\t', na = '', row.names = FALSE,
              col.names = TRUE)
  
  write.table(dups, file = paste0(outputPath, '/Uncertain_hits_from_', mapTool, '.txt'),
              append = FALSE, quote = FALSE, sep = '\t', na = '', row.names = FALSE,
              col.names = TRUE)
  cat('Done.\n')

  cat('---------- Creating a GRanges object ----------\n')
  #### 02, Make GR object of a input table
  if(mapTool == 'blast'){
    pos = vector("integer", length = nrow(hitTable))
    if(vectorPos == 'front'){
      pos[which(hitTable$strand == '+')] = as.numeric(hitTable$sstart)[which(hitTable$strand == '+')] - 1
      pos[which(hitTable$strand == '-')] = as.numeric(hitTable$send)[which(hitTable$strand == '-')] + 1
    } else if(vectorPos == 'behind'){
      pos[which(hitTable$strand == '-')] = as.numeric(hitTable$sstart)[which(hitTable$strand == '-')] - 1
      pos[which(hitTable$strand == '+')] = as.numeric(hitTable$send)[which(hitTable$strand == '+')] + 1
    } else {
      return(cat("[ERROR] Please check position of vector!\n",
                 '---------- Creating a table is halted. ----------\nFinish time : ', date(), '\n'))
    }
    gr_hits = GenomicRanges::GRanges(seqname = hitTable$sname,
                                     ranges = IRanges::IRanges(start = pos, end = pos),
                                     strand = '*', query = hitTable$qname,
                                     identity = as.numeric(hitTable$identity))
  } else if(mapTool == 'blat'){
    pos = vector("integer", length = nrow(hitTable))
    if(vectorPos == 'front'){
      pos[which(hitTable$strand == '+')] = as.numeric(hitTable$tstart)[which(hitTable$strand == '+')] - 1
      pos[which(hitTable$strand == '-')] = as.numeric(hitTable$tend)[which(hitTable$strand == '-')] + 1
    } else if(vectorPos == 'behind'){
      pos[which(hitTable$strand == '-')] = as.numeric(hitTable$tstart)[which(hitTable$strand == '-')] - 1
      pos[which(hitTable$strand == '+')] = as.numeric(hitTable$tend)[which(hitTable$strand == '+')] + 1
    } else {
      return(cat("[ERROR] Please check position of vector!\n",
                 '---------- Creating a table is halted. ----------\nFinish time : ', date(), '\n'))
    }
    gr_hits = GenomicRanges::GRanges(seqname = hitTable$tname,
                                     ranges = IRanges::IRanges(start = pos, end = pos),
                                     strand = '*', query = hitTable$qname,
                                     identity = as.numeric(hitTable$identity))
  } else {}

  output_list[[2]] = gr_hits
  gr_hits_tab = as.data.frame(gr_hits)[,c(6,1,2)]
  names(gr_hits_tab) = c('Query', 'Insertion_chr', 'Insertion_pos')
  write.table(gr_hits_tab, file = paste0(outputPath, '/Integration_sites_', mapTool, '.txt'),
              append = FALSE, quote = FALSE, sep = '\t', na = '', row.names = FALSE, col.names = TRUE)
  
  cat('Done.\n')
  cat('---------- Creating a table is finished. ----------\n')
  cat(paste0('Finish time : ', date(), '\n'))
  return(output_list)
}
bioinfo16/IRFinder documentation built on Aug. 19, 2019, 10:37 a.m.