# R/BlastTools.R
#
# Defines functions: clean.by.query, orthology.pairs, clean.by.sub, orthoBlast.batch, orthoBlast

#' orthoBlast
#'
#' Keep, for every distinct value of column 1 of a BLAST table, the single
#' row with the largest value in column 12 among the rows that pass the
#' coverage (column 13) and identity (column 3) cutoffs. Columns are addressed
#' by position, so the table must have at least 13 columns. Only column 1 is
#' de-duplicated; column 2 may still contain repeated values.
#'
#' @param raw.blast raw blast table obtained with the ReadBlastTables function
#' @param coverage numeric, rows whose column 13 is below this value are dropped
#' @param id.cutoff numeric, rows whose column 3 is below this value are dropped
#'
#' @import dplyr
#' @import magrittr
#' @return the filtered table with one row per distinct value of column 1
#' @export
#'
#' @examples
#' \dontrun{
#' orthoBlast(raw.blast = rawblast, coverage = 50, id.cutoff = 70)
#'
#' # if the tables are in a (named) list
#' lapply(raw.blast.list, orthoBlast, coverage = 70, id.cutoff = 70)
#' }
orthoBlast <- function(raw.blast, coverage, id.cutoff) {
  # Stage 1: apply both thresholds; rows where a comparison is NA are dropped.
  passing <- raw.blast %>%
    filter(.[[13]] >= coverage, .[[3]] >= id.cutoff)

  # Stage 2: sort so the highest column-12 value leads each column-1 group,
  # then keep that leading row. `.query` is a temporary grouping key that is
  # removed again before returning.
  passing %>%
    arrange(.[[1]], desc(.[[12]])) %>%
    group_by(.query = .[[1]]) %>%
    slice(1) %>%
    ungroup() %>%
    select(-.query)
}
#' orthoBlast.batch
#'
#' Base-R, list-wise variant of the best-hit filter: for every table in
#' `raw.blast`, drop rows below the coverage (column 13) and identity
#' (column 3) cutoffs, then keep one row per distinct value of column 1,
#' namely the one with the largest value in column 12.
#'
#' @param raw.blast list of blast tables (each with at least 13 columns)
#' @param coverage numeric, coverage cutoff applied to column 13
#' @param id.cutoff numeric, identity percent cutoff applied to column 3
#'
#' @return a list with one filtered table per element of `raw.blast`; names
#'   of the input list are preserved. Rows come out sorted by column 1 in
#'   decreasing order.
#' @noRd
orthoBlast.batch <- function(raw.blast, coverage, id.cutoff) {
  lapply(raw.blast, function(b) {
    # which() discards comparisons that evaluate to NA. Plain logical
    # indexing would turn each NA into an all-NA phantom row instead.
    keep <- which(b[[13]] >= coverage & b[[3]] >= id.cutoff)
    b <- b[keep, , drop = FALSE]

    # Decreasing sort puts the highest column-12 row first within each
    # column-1 value, so duplicated() below retains exactly that row.
    b <- b[order(b[[1]], b[[12]], decreasing = TRUE), , drop = FALSE]
    b[!duplicated(b[[1]]), , drop = FALSE]
  })
}

#' clean.by.sub
#'
#' Clean a blast table by subject: drop rows below the coverage (column 13)
#' and identity percent (column 3) cutoffs, then keep one row per distinct
#' value of column 2, namely the one with the largest value in column 12.
#' Rows whose coverage or identity is `NA` are dropped.
#'
#' @param df data frame to be subset (at least 13 columns, addressed by position)
#' @param cov coverage cutoff to apply
#' @param id percent identity cutoff to apply
#'
#' @return the subsetted data frame, sorted by column 2 in decreasing order
#' @export
#'
#' @examples
#' \dontrun{
#' clean.by.sub(df = blast.table, cov = 50, id = 70)
#' }
clean.by.sub <- function(df, cov, id) {
  # which() discards comparisons that evaluate to NA. Plain logical indexing
  # would turn each NA into an all-NA phantom row instead.
  keep <- which(df[[13]] >= cov & df[[3]] >= id)
  df <- df[keep, , drop = FALSE]

  # Decreasing sort puts the highest column-12 row first within each
  # column-2 value, so duplicated() below retains exactly that row.
  df <- df[order(df[[2]], df[[12]], decreasing = TRUE), , drop = FALSE]
  df[!duplicated(df[[2]]), , drop = FALSE]
}

#' orthology.pairs
#'
#' Filter a blast table by coverage (column 13) and identity percent
#' (column 3), keep for every distinct value of column 1 the row with the
#' largest value in column 12, and return only the first two columns of the
#' surviving rows. Only column 1 is de-duplicated; column 2 may still contain
#' repeated values.
#'
#' @param df data frame of the blast table (at least 13 columns, addressed by position)
#' @param id numeric, percent identity cutoff
#' @param cov numeric, coverage cutoff
#'
#' @import dplyr
#' @import magrittr
#'
#' @return data frame of two columns (columns 1 and 2 of `df`) with the
#'   retained pairs
#' @export
#'
#' @examples
#' \dontrun{
#' orthology.pairs(blast.table, id = 70, cov = 50)
#' }
orthology.pairs <- function(df, id, cov) {
  df %>%
    filter(.[[13]] >= cov) %>%
    filter(.[[3]] >= id) %>%
    # Sort so the highest column-12 row leads each column-1 group; distinct()
    # keeps the first row it sees for every key.
    arrange(.[[1]], desc(.[[12]])) %>%
    # Spell out TRUE: the shorthand `T` is an ordinary variable that can be
    # rebound, which would silently drop the subject column here.
    distinct(.[[1]], .keep_all = TRUE) %>%
    # The helper key column created by distinct() is appended at the end, so
    # positions 1 and 2 are still the original first two columns.
    select(c(1, 2))
}

#' clean.by.query
#'
#' Remove the duplicated query hits and apply a cutoff in identity percent
#' and coverage: rows below the coverage (column 13) or identity (column 3)
#' cutoff are dropped, then one row is kept per distinct value of column 1,
#' namely the one with the largest value in column 12. Rows whose coverage or
#' identity is `NA` are dropped.
#'
#' @param df data frame with the blast table (at least 13 columns, addressed by position)
#' @param id numeric, identity percent cutoff
#' @param cov numeric, coverage cutoff
#'
#' @return a data frame with the blast table cleaned, sorted by column 1 in
#'   decreasing order. An error is raised when `df` is `NULL`.
#' @export
#'
#' @examples
#' \dontrun{
#' clean.by.query(df = blast.table, id = 70, cov = 50)
#' }
clean.by.query <- function(df, id, cov) {
  # Validate before touching `df`, and say what went wrong.
  if (is.null(df)) {
    stop("`df` is NULL; a BLAST table (data frame) is required.",
         call. = FALSE)
  }

  # which() discards comparisons that evaluate to NA. Plain logical indexing
  # would turn each NA into an all-NA phantom row instead.
  keep <- which(df[[13]] >= cov & df[[3]] >= id)
  df <- df[keep, , drop = FALSE]

  # Decreasing sort puts the highest column-12 row first within each
  # column-1 value, so duplicated() below retains exactly that row.
  df <- df[order(df[[1]], df[[12]], decreasing = TRUE), , drop = FALSE]
  df[!duplicated(df[[1]]), , drop = FALSE]
}
# torresmanno/BioTools documentation built on Nov. 14, 2021, 9:22 p.m.