#' Read recombination events from the output GFF file from Gubbins
#'
#' This function reads a GFF format generated by Gubbins containing
#' genomic regions where recombination events were identified.
#'
#' @param gubbins.gff.file Path to the input Gubbins GFF recombination file or data frame
#' @param recom.input.type Type of input recombination data, either "Gubbins" GFF or "BRATNextGen" tabular data.
#' @return A data frame containing number of unique recombination events at genomic positions where recombination events were identified
#'
#' @examples
#' \dontrun{
#' Read genome in GFF formatted file (generated usign readseq) and plot the genomic features
#'
#' gubbins.gff<-system.file("extdata", "ST320.recombination_predictions.gff", package = "RCandy",
#' mustWork = TRUE)
#'
#' rec.events<-load.gubbins.rec.events.gff(gubbins.gff)
#' }
#'
#' @export
#'
#' @import magrittr
#' @import dplyr
#' @import utils
#'
#' @author Chrispin Chaguza, \email{Chrispin.Chaguza@@gmail.com}
#' @references \url{https://github.com/ChrispinChaguza/RCandy}
#'
# Read Gubbins recombination GFF file
load.gubbins.GFF<-function(gubbins.gff.file,recom.input.type="Gubbins"){
# Check if correct input recombination data type is specified
if( !recom.input.type %in% c("Gubbins","BRATNextGen") ){
stop("Invalid recombination data specified. Choose from 'Gubbins' or 'BRATNextGen'")
}
# Check if the input data was generated by Gubbins (GFF file) or BRATNextGen (tabular file)
if( recom.input.type=="Gubbins" ){
# Check if a valid GFF Gubbins recombination file is specified
if( !is.null(gubbins.gff.file) ){
# Check if the Gubbins recombination file name is a string or character
if( is.character(gubbins.gff.file) ){
# Check if the Gubbins recombination file exists in the file path
if( file.exists(gubbins.gff.file) ){
# Read the GFF Gubbins recombination file, skips the first two lines which contain comments
tree.rec.data1<-dplyr::as_tibble(read.table(gubbins.gff.file,header=FALSE,sep="\t",comment.char="?",skip=2,fill=T,row.names=NULL))
colnames(tree.rec.data1)<-c("SEQ","PROG","TYPE","START","END","XX","YY","ZZ","REC")
# Check if at least one recombination event was found in the Gubbins GFF file
if( length(tree.rec.data1$SEQ)>1 ){
# Extract the taxon names in the Gubbins GFF file
tree.rec.data.tmp<-tree.rec.data1 %>% dplyr::mutate(REC1=.data$REC) %>%
dplyr::group_by(.data$SEQ,.data$PROG,.data$TYPE,.data$START,.data$END,.data$XX,.data$YY,.data$ZZ,.data$REC) %>%
tidyr::nest() %>% dplyr::rowwise() %>%
dplyr::mutate(gene=list(setdiff(stringr::str_split(stringr::str_trim(gsub("taxa=","",gsub("taxa= ","",stringr::str_trim(stringr::str_split(regmatches(data[[1]],regexpr("(taxa=).*;",data[[1]])),";")[[1]],side="both"))),side="both")," ")[[1]],c(""," "))) )
# Return a data frame containg recombination events in appropriate format
return(tree.rec.data.tmp)
}else{
# Exit the program when no valid GFF Gubbins recombination file is found
stop("It appears that there are no recombination events in the file '",gubbins.gff.file,"'")
}
}else{
# Exit the program when no valid GFF Gubbins recombination file is found
stop("Cannot find the Gubbins file '",gubbins.gff.file,"' containing recombination events")
}
}else{
if( length(setdiff(class(gubbins.gff.file),c("tbl_df","tbl","data.frame","rowwise_df","grouped_df")))==0 ){
return(gubbins.gff.file)
}else{
# Exit the program when an invalid GFF Gubbins recombination file is specified
stop("Gubbins recombination events file name does not appear to be a character or string")
}
}
}else{
# Return nothing if no GFF Gubbins recombination file is provided
return(NULL)
}
}else{
tree.rec.data.tmp<-as_tibble(read.table(gubbins.gff.file,
fill=TRUE,sep="\t",comment.char="?",skip=2,header=FALSE)) %>%
dplyr::mutate(V2=.data$V1) %>% dplyr::group_by(.data$V1) %>% tidyr::nest() %>%
dplyr::ungroup() %>% dplyr::select(-.data$V1) %>% dplyr::rowwise() %>%
dplyr::mutate(rec.events=list(stringr::str_split(.data$data[[1]]," ")[[1]][!stringr::str_split(.data$data[[1]]," ")[[1]] %in% c("")])) %>%
dplyr::mutate( Start=.data$rec.events[1],
End=.data$rec.events[2],
Origin=.data$rec.events[3],
HomeCluster=.data$rec.events[4],
BAPSIndex=.data$rec.events[5],
StrainName=.data$rec.events[6]) %>%
dplyr::mutate(SEQ="SEQUENCE",
PROG="GUBBINS",
TYPE="CDS",
START=.data$Start,
END=.data$End,
XX=0.000,
YY=".",
ZZ=0) %>% dplyr::select(-.data$data,-.data$rec.events,-.data$Start,-.data$End,-.data$Origin,-.data$HomeCluster,-.data$BAPSIndex) %>%
dplyr::group_by(.data$SEQ,.data$PROG,.data$TYPE,.data$START,.data$END,.data$XX,.data$YY,.data$ZZ) %>% tidyr::nest() %>%
dplyr::mutate( REC=paste("taxa='",stringr::str_trim(paste0(.data$data[[1]]$StrainName,collapse="..."),side="both"),"';",sep=" ",collapse=" ")) %>%
dplyr::mutate(REC=gsub("\\.\\.\\."," ",gsub(" ","",.data$REC))) %>% dplyr::mutate(REC=list(.data$REC)) %>%
dplyr::mutate(gene=list(.data$data[[1]]$StrainName),data=.data$REC)
return(tree.rec.data.tmp)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.