R/count.recombinations.per.genome.R

Defines functions count.rec.events.per.genome

Documented in count.rec.events.per.genome

#' Count number of recombination events detected in each genome
#'
#' This function reads a GFF file or data frame containing
#' recombination events identified by Gubbins (or BRATNextGen), and counts
#' the number of recombination events recorded for each of the specified taxa (genomes). The
#' data frame can be generated using the "load.gubbins.GFF" function
#'
#' @param gubbins.gff.file Path to the input Gubbins GFF recombination file or data frame
#' @param recom.input.type Type of input recombination data, either "Gubbins" GFF or "BRATNextGen" tabular data.
#' @param taxon.names Vector containing taxon names.
#'
#' @return A table (as produced by \code{table}) containing the number of recombination events counted for each taxon in \code{taxon.names}
#'
#' @examples
#' \dontrun{
#' Read genome in GFF formatted file (generated using readseq) and plot
#' the genomic features
#'
#' This function may take some minutes to finish depending on the number
#' of recombination events identified and genome size
#'
#' gubbins.gff<-system.file("extdata", "ST320.recombination_predictions.gff",
#' package = "RCandy",mustWork = TRUE)
#'
#' # taxon.names: vector of the taxon names to report counts for
#' rec.freq<-count.rec.events.per.genome(gubbins.gff,taxon.names=taxon.names)
#' }
#'
#' @export
#'
#' @import magrittr
#' @import dplyr
#'
#' @author Chrispin Chaguza, \email{Chrispin.Chaguza@@gmail.com}
#' @references \url{https://github.com/ChrispinChaguza/RCandy}
#'
#### Function to count the number of recombination events per genome ####

# Count how many recombination events are recorded for each requested taxon.
#
# Arguments:
#   gubbins.gff.file  Either a character path to a recombination file (read with
#                     load.gubbins.GFF) or a data frame of already-loaded events.
#                     The data frame must carry a `gene` column; its entries are
#                     the labels that get tabulated and matched against
#                     `taxon.names` (presumably the taxa affected by each
#                     event -- confirm against load.gubbins.GFF).
#   recom.input.type  "Gubbins" (default) or "BRATNextGen"; only consulted when a
#                     file path is given.
#   taxon.names       Vector of taxon names to report counts for (required).
#
# Returns:
#   A one-dimensional table indexed by `taxon.names`. Names that never occur in
#   the `gene` column come back as NA (not 0), because the table is subset by name.
#
# Errors:
#   Stops on invalid `taxon.names`, an unknown `recom.input.type`, or an input
#   that is neither a character path nor a data frame.
count.rec.events.per.genome<-function(gubbins.gff.file,recom.input.type="Gubbins",taxon.names){

  # Check if valid taxon names are specified (scalar condition, so use ||)
  if( is.null(taxon.names) || !is.vector(taxon.names) ){
    stop("Invalid taxon names specified")
  }

  # Check type of recombination events; a non-scalar value is rejected here
  # instead of producing an obscure error from `if`
  if( length(recom.input.type)!=1 || !recom.input.type %in% c("Gubbins","BRATNextGen") ){
    stop("Invalid recombination data specified. Choose from 'Gubbins' or 'BRATNextGen'")
  }

  if( is.character(gubbins.gff.file) ){
    # A file path was given: load the events with the reader matching the input type
    if( recom.input.type=="Gubbins" ){
      rec.count.tmp<-load.gubbins.GFF(gubbins.gff.file)
    }else{
      rec.count.tmp<-load.gubbins.GFF(gubbins.gff.file,recom.input.type="BRATNextGen")
    }
  }else if( is.data.frame(gubbins.gff.file) ){
    # Events were supplied directly as a data frame (plain, tibble, rowwise, ...)
    rec.count.tmp<-gubbins.gff.file
  }else{
    # Exit the program, invalid recombination event data provided
    stop("Something is wrong with the Gubbins recombination file")
  }

  # Flatten the per-event `gene` entries into one vector in a single step.
  # This also handles an empty event set, where a 1:length() loop would fail.
  # The local name is kept as `temp.vals` because table() uses it as the
  # dimnames label of the returned object.
  temp.vals<-unlist(rec.count.tmp$gene,use.names=FALSE)

  # Tabulate how often each label occurs across all recombination events
  temp.vals.fr<-table(temp.vals)

  # Keep only the requested taxa, in the requested order
  temp.vals.fr[taxon.names]
}
ChrispinChaguza/RCandy documentation built on June 23, 2022, 1:03 p.m.