R/count.recombinations.per.base.R

Defines functions count.rec.events.per.base

Documented in count.rec.events.per.base

#' Count number of overlapping recombination events at each genomic position
#'
#' This function reads a GFF file or data frame containing recombination
#' events identified in the genome by Gubbins, and counts the frequency of
#' recombination events at each genomic position, i.e. the number of
#' overlapping recombination events. The data frame can be generated using
#' the "load.gubbins.GFF" function.
#'
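#' @param gubbins.gff.file Path to the input recombination file (Gubbins GFF or BRATNextGen tabular data), or a data frame of recombination events with "START" and "END" columns
#' @param recom.input.type Type of input recombination data, either "Gubbins" (GFF file, the default) or "BRATNextGen" (tabular data).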
#'
#' @return A data frame containing number of unique recombination events at genomic positions where recombination events were identified
#'
#' @examples
#' \dontrun{
#' # Read the recombination events in a Gubbins GFF file and count the
#' # number of recombination events overlapping each genomic position
#' #
#' # This function may take some minutes to finish depending on the number
#' # of recombination events identified and the genome size
#'
#' gubbins.gff<-system.file("extdata", "ST320.recombination_predictions.gff",
#' package = "RCandy",mustWork = TRUE)
#'
#' rec.freq<-count.rec.events.per.base(gubbins.gff)
#' }
#'
#' @export
#'
#' @import magrittr
#' @import dplyr
#'
#' @author Chrispin Chaguza, \email{Chrispin.Chaguza@@gmail.com}
#' @references \url{https://github.com/ChrispinChaguza/RCandy}
#'
#### Function to count the number of recombination events per base in the genome ####

count.rec.events.per.base<-function(gubbins.gff.file,recom.input.type="Gubbins"){
  # Count how many recombination events overlap each genomic position.
  #
  # Arguments:
  #   gubbins.gff.file  Either a single file path (character string), which is
  #                     read with load.gubbins.GFF(), or a data frame of
  #                     recombination events with START and END columns.
  #   recom.input.type  "Gubbins" (default) or "BRATNextGen"; forwarded to
  #                     load.gubbins.GFF() when a file path is given.
  #
  # Returns a tibble with one row per genomic position covered by at least one
  # event, sorted by position, with columns:
  #   FRQ  number of events covering the position
  #   POS  the genomic position
  #   GRP  constant 1
  # An input without any events gives a zero-row tibble with the same columns.
  #
  # Stops with an error when recom.input.type is not one of the two supported
  # values, when the input is neither a file path nor a data frame, or when
  # the START/END columns are missing. seq() additionally raises an error for
  # any event whose START is greater than its END or is missing.

  # The length check makes NULL or vector input fail with the intended message
  # instead of an "argument is of length zero" / "length > 1" condition error
  if( length(recom.input.type)!=1 || !recom.input.type %in% c("Gubbins","BRATNextGen") ){
    stop("Invalid recombination data specified. Choose from 'Gubbins' or 'BRATNextGen'")
  }

  if( is.character(gubbins.gff.file) && length(gubbins.gff.file)==1 ){
    # A file name was provided: read the recombination events from it.
    # The Gubbins call relies on the loader's own default input type.
    if( recom.input.type=="Gubbins" ){
      rec.events<-load.gubbins.GFF(gubbins.gff.file)
    }else{
      rec.events<-load.gubbins.GFF(gubbins.gff.file,recom.input.type="BRATNextGen")
    }
  }else if( inherits(gubbins.gff.file,"data.frame") ){
    # The recombination events were provided directly as a data frame/tibble
    rec.events<-gubbins.gff.file
  }else{
    # Exit the program, invalid recombination event data provided
    stop("Something is wrong with the Gubbins recombination file")
  }

  # Fail early with a clear message rather than deep inside seq()
  missing.cols<-setdiff(c("START","END"),names(rec.events))
  if( length(missing.cols)>0 ){
    stop("Recombination data is missing required column(s): ",
         paste(missing.cols,collapse=", "))
  }

  event.starts<-rec.events[["START"]]
  event.ends<-rec.events[["END"]]

  # No recombination events: nothing to count, return an empty result
  if( length(event.starts)==0 ){
    return(dplyr::tibble(FRQ=integer(0),POS=numeric(0),GRP=numeric(0)))
  }

  # Expand every event into the genomic positions it covers and pool them in
  # one vector (built in a single pass instead of rbind-ing inside a loop)
  covered.pos<-unlist(
    Map(function(from,to) seq(from,to,1),event.starts,event.ends),
    use.names=FALSE
  )

  # Count number of overlapping recombination events at each genomic position
  uniq.pos<-sort(unique(covered.pos))
  pos.freq<-tabulate(match(covered.pos,uniq.pos),nbins=length(uniq.pos))

  # Return a data frame containing genomic position and number of recombination events identified
  dplyr::tibble(FRQ=pos.freq,POS=uniq.pos,GRP=rep(1,length(uniq.pos)))
}
ChrispinChaguza/RCandy documentation built on June 23, 2022, 1:03 p.m.