#' Serotyping pipeline for WGS assemblies
#' February 5 2024, Walter Demczuk & Shelley Peterson
#'
#' Takes Organism, Sample Number, Locus, to query a contig.fasta file
#' @param Org_id Organism to query: GAS, PNEUMO or GONO
#' @param Test_id AMR, TOXINS, VIRULENCE, NGSTAR, use MASTER for 16S id
#' @param SampleNo Sample number associated with contig.fasta file
#' @param LocusID The locus to query, or enter "list" to use a list of alleles
#' @param curr_work_dir Start up directory from pipeline project to locate system file structure
#' @details How it works:
#'
#' Pneumococcus serotyping based on PneumoCaT and SeroBA libraries.
#' Copies contig assembly from Warehouse drive
#' BLAST's copied assembly vs. reference data fasta of whole CPS regions for each serotype
#' If CPS region found that requires snp based analysis, queries each reference gene for that serogroup vs. loci list
#' Interprets snps to serotype level
#' The algorithm first determines the serogroup by blasting CPS_reference
#' Each locus in the locus list (temp folder) associated with that serogroup will then be blasted
#' There are 4 stages of locus analysis:
#' 1)result = POS/NEG (presence/absence)
#' 2)pseudo = pseudogene (disrupted/intact)
#' 3)mutations = serotype determining amino acid substitutions as listed in locus_mutations
#' 4)allele = entire gene sequence match of conserved serotype determining genes as found in the allele_lookup folders
#'
#' The relevant result type is listed in the locus list table (temp folder)
#' As each result is evaluated for each relevant locus result, the serotype lookup table locus_lookup (temp folder)
#' is filtered
#' If a single row is left in the lookup table by the end of the sample, that is the serotype
#' Otherwise a Fail<serogroup> answer will be returned
#' Supporting files like blast outputs, results, extracted fasta sequences can be found
#' in the user's local output or temp folders
#'
#' @return A table frame containing the results of the query
#' @export
#-------------------------------------------------------------------------------
# For troubleshooting and debugging
#Org_id <- "PNEUMO" #GBS or PNEUMO
#Test_id <- "SERO"
#SampleNo <- "list" #Sample No or "list"
#LocusID <- "list"
#curr_work_dir <- "C:\\WADE\\"
#Blast_evalue <- "10e-50" #sets sensitivity of Blast gene match 10e-50 to 10e-150; use 10e-5 for primers
#-------------------------------------------------------------------------------
SEROTYPE_pipeline <- function(Org_id, SampleNo, LocusID, Test_id, curr_work_dir){
start_time <- Sys.time()
#-----------------------------------------------------------------------------
# get directory structure and remove previous output files
directorylist <- getdirectory(curr_work_dir, Org_id, Test_id)
reflist <- refdirectory(directorylist, Org_id, Test_id)
unlink(paste0(directorylist$output_dir, "LabWareUpload_", Org_id, "_SEROTYPE.csv"))
#-----------------------------------------------------------------------------
# Load CPS types info
LocusLkupDNA_CPS <- paste0(reflist$Lkup_Dir, "reference_CPS.fasta")
CPSlist <- paste0(directorylist$temp_dir, "CPS_types_for_blast.csv")
CPS_types <- read.csv(paste0(reflist$Ref_Dir, "CPS_types_for_blast.csv"),
header = TRUE, sep = ",", stringsAsFactors = FALSE)
CPS_types <- CPS_types$Serogroup
#-----------------------------------------------------------------------------
# Blast Evalues
blast_evalues.df <- as_tibble(read.csv(paste0(reflist$Ref_Dir, "blast_evalues.csv"),
header = TRUE, sep = ",", stringsAsFactors = FALSE))
Blast_evalue <- as.character(blast_evalues.df$contig[1])
evalue_allele <- as.character(blast_evalues.df$allele[1])
blast_wt_id_threshold <- as.numeric(blast_evalues.df$wt_id[1])
#-----------------------------------------------------------------------------
########################### Setup sample list table ##########################
if(SampleNo == "list")
{
SampleList.df <-as_tibble(read.csv(directorylist$SampleList, header = TRUE, sep = ",", stringsAsFactors = FALSE))
}else
{
SampleList.df <- tibble(SampleNo, Variable = NA)
}
NumSamples <- dim(SampleList.df)[1]
#Progress Bar
n_iter <- NumSamples
init <- numeric(n_iter)
end <- numeric(n_iter)
m <- 1L
for (m in 1L:NumSamples) #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< Start Sample Loop
{
init[m] <- Sys.time()
sampleinfo <- getsampleinfo(SampleList.df, directorylist, reflist, m)
SampleProfile <- ""
serogroup <- ""
serotype <- ""
OutputLocus.df <- tibble(SampleNo = as.character(sampleinfo$CurrSampleNo)) #all results
cat("\n\n", sampleinfo$CurrSampleNo, ": ", m, " of ", NumSamples, "\n", sep = "")
############################################################################
# Get serogroup CPS locus for sample first
############################################################################
if (sampleinfo$Allele[1] == "Sample_Err")
{
serogroup <- "Sample_Err"
serotype <- "Sample_Err"
SampleProfile <- "Sample_Err"
}else #..................................................................... Not Sample Error
{
#makeblastdb from contig file for later when blasting wildgenes vs. contig to extract genes
BlastFormatCommand <- paste0("makeblastdb -in ", reflist$DestFile, " -dbtype nucl")
try(system(BlastFormatCommand))
#use contig file to blast against the CPS data to see what the CPS group is
Blast_Out_File <- blastquery(directorylist, reflist, "reference_CPS", Blast_evalue)
info = file.info(Blast_Out_File)
if(info$size == 0) #no blast result from CPS locus lookup i.e. not pneumo, bad sequencing
{
serotype <- "unknown"
serogroup <- "unknown"
}else
{
df.blastout <- as_tibble(read.csv(Blast_Out_File, header = FALSE, sep = "\t", stringsAsFactors = FALSE))
names(df.blastout) <- c("SampleNo", "Allele", "Ident", "Align", "Mismatches", "Gaps", "SampleStart", "SampleEnd", "AlleleStart", "AlleleEnd", "eValue", "bit")
df.blastout <- arrange(df.blastout, dplyr::desc(bit))
serotype1 <- df.blastout$Allele[1]
SerotypeParts <- unlist(strsplit(serotype1, "_"))
SerotypeParts <- SerotypeParts[1]
serotype <- ifelse(substr(SerotypeParts, 1, 1) == "0", sub("^.", "", SerotypeParts), SerotypeParts)
serotype[serotype %in% c("15B", "15C")] <- "15BC"
if (Org_id == "PNEUMO")
{
serogroup <- as.character(substr(SerotypeParts, 1, 2))
if(serogroup == "40") {serogroup <- "07"}
if(serogroup == "44" || serogroup == "46"){serogroup <- "12"}
if(serogroup == "38") {serogroup <- "25"}
if(serogroup == "37") {serogroup <- "33"}
if(serogroup == "42") {serogroup <- "35"}
}
if (Org_id == "GBS")
{
serogroup <- "GBS"
}
outfile_sero <- paste0(directorylist$output_dir, "output_blastout_serogroup.csv")
write.csv(df.blastout, outfile_sero, quote = FALSE, row.names = F )
}
}#.......................................................................... close Not Sample Error loop
cat("CPS Blast Serogroup ", serogroup, " - Serotype ", serotype, " !\n")
if (serogroup == "unknown" | serogroup == "Sample_Err") # no CPS match or no contig
{
serotype <- paste("NT[", serogroup, "]", sep = "")
SampleProfile <- serogroup
}else # //////////////////////////////////////////////////////////////////// Serogroup Found
{
##########################################################################
# setup loci and get locuslist screen for serotypes requiring no SNP analysis
##########################################################################
if(serotype %in% CPS_types)
{
SampleProfile <- "CPS operon type match"
locus <- ""
head(df.blastout, n = 5L)
}else #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Not in CPS_types list
{
LocusList <- paste0(reflist$Ref_Dir, serogroup, "_loci.csv")
if(file.exists(LocusList))
{
LocusList.df <- read.csv(LocusList, header = TRUE, sep = ",", stringsAsFactors = FALSE)
SerotypeLookup.df <- read.csv(paste0(reflist$Ref_Dir, serogroup, "_loci_lookup.csv"),
header = TRUE, sep = ",", stringsAsFactors = FALSE,
check.names = FALSE)
if(LocusID != "list")
{
LocusList.df <- filter(LocusList.df, Locus_id == LocusID)
}
NumLoci <- dim(LocusList.df)[1]
}else
{
NumLoci <- 0
}
LocusMutationsFile <- paste0(reflist$Ref_Dir, serogroup, "_loci_mutations.csv")
if(file.exists(LocusMutationsFile))
{
Mutns <- "Yes"
LocusMutations.df <- read.csv(LocusMutationsFile, header = TRUE, sep = ",", stringsAsFactors = FALSE)
} else
{
Mutns <- "No"
}
OutputLocus.df <- tibble(SampleNo = as.character(sampleinfo$CurrSampleNo)) #all results
OutputLocusProfile.df <- OutputLocus.df #for serotype
p <- 1L
for(p in 1L:NumLoci) #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Locus Loop
{
ProfileSeparator <- ifelse(SampleProfile == "", "", ";")
CurrLocus <- as.character(LocusList.df$Locus_id[p])
locus <- gsub("GBS_", "", CurrLocus) #makes profile smaller for GBS LabWare output
locus_result_type <- as.character(LocusList.df$Result_type[p])
LocusLkupDNA <- paste0(reflist$Lkup_Dir, locus, ".fasta")
LocusLkupDNApresent <- ifelse(file.exists(LocusLkupDNA), TRUE, FALSE)
resultcol <- paste0(locus, "_result")
allelecol <- paste0(locus, "_allele")
pseudocol <- paste0(locus, "_pseudo")
mutationscol <- paste0(locus, "_mutations")
OutputLocus.df[1, resultcol] <- NA
OutputLocus.df[1, allelecol] <- NA
OutputLocus.df[1, mutationscol] <- NA
OutputLocus.df[1, pseudocol] <- NA
LocusFile <- paste0(reflist$WT_Dir, locus, ".fasta")
if(!file.exists(LocusFile))
{
sample_error.df <- tibble(Locus_ID = LocusID, Output = "Reference wildtype file not found.")
OutputLocus.df <- OutputLocus.df %>% replace(is.na(.), "Locus_Err")
return(sample_error.df)
}else #*************************************************************** WT gene found
{
BlastResult <- blasthits(reflist, directorylist, LocusFile, Blast_evalue)
OutputLocus.df[1,2] <- BlastResult #result
blastoutput <- readLines(paste0(directorylist$temp_dir, "blastout.txt"))
if(locus_result_type == "result")
{
SampleProfile <- paste0(SampleProfile, ProfileSeparator, locus, "[", BlastResult, "]")
cat(paste0(locus, "_result"), BlastResult, "\n", sep = "\t\t")
if(LocusID == "list")
{
SerotypeLookup.df <- SerotypeLookup.df %>% filter(.[[resultcol]] == BlastResult|
is.na(.[[resultcol]]))
}
}
#-------------------------------------------------------------------
#add missing genes to profile for mutations and pseudogenes
if((locus_result_type == "pseudo") & (BlastResult == "NEG"))
{
SampleProfile <- paste(SampleProfile, ProfileSeparator, locus, "[no pseudogene]", sep="")
cat(pseudocol, "no gene present", "\n", sep = "\t\t")
}
if((locus_result_type == "mutations") & (BlastResult == "NEG"))
{
SampleProfile <- paste(SampleProfile, ProfileSeparator, locus, "[no mutn gene]", sep="")
cat(mutationscol, "no gene present", "\n", sep = "\t\t")
}
if((locus_result_type == "allele") & (BlastResult == "NEG"))
{
SampleProfile <- paste(SampleProfile, ProfileSeparator, locus, "[no allele gene]", sep="")
cat(allelecol, "no gene present", "\n", sep = "\t\t")
}
#-------------------------------------------------------------------
DNASeqLine_str <- ""
WTDNASeqLine_str <- ""
blastdetails <- ""
if(BlastResult == "POS") #========================================== Positive BLAST Result Loop
{
# Parse out to get WT and query DNA sequences, match % etc.
blast_parsed <- parseblast(blastoutput, LocusID)
# Print both WT and query DNA sequences
if(LocusID != "list")
{
cat("\n\n>", CurrLocus, "(Wildtype)\n", blast_parsed$WTDNASeqLine_NoDash_str, "\n", sep ="")
cat("\n\n>", sampleinfo$CurrSampleNo, "_", CurrLocus , "\n", blast_parsed$DNASeqLine_NoDash_str, "\n", sep ="")
}
WTDNASeqLine <- DNAString(blast_parsed$WTDNASeqLine_str)
WTDNASeqLine_NoDash <- DNAString(blast_parsed$WTDNASeqLine_NoDash_str)
DNASeqLine <- DNAString(blast_parsed$DNASeqLine_str)
DNASeqLine_NoDash <- DNAString(blast_parsed$DNASeqLine_NoDash_str)
##################################################################
# Make Protein Sequence + Check if gene is intact
##################################################################
blastAA <- AAconvert(WTDNASeqLine_NoDash, DNASeqLine_NoDash)
if(locus_result_type == "pseudo")
{
#update locus output
WTend <- head(unlist(gregexpr('[*]', blastAA$WTAASeqLine_str[1])), n = 1)
WTend <- ifelse(is.na(WTend), "0", WTend)
queryend <- head(unlist(gregexpr('[*]', blastAA$AASeqLine_str[1])), n = 1)
queryend <- ifelse(is.na(queryend), 0, queryend)
diff <- ifelse(queryend == -1, 0, (WTend - queryend))
lengthdiff <- blast_parsed$WTlength - blast_parsed$IDlength
nonsense_mutation <- ifelse(diff > 6, "Disrupted",
ifelse((blast_parsed$WTlength - blast_parsed$IDlength > 25),
"Disrupted", "Intact"))
OutputLocus.df[pseudocol] <- nonsense_mutation
OutputLocusProfile.df[pseudocol] <- nonsense_mutation
SampleProfile <- paste0(SampleProfile, ProfileSeparator, locus, "[", nonsense_mutation, "]")
cat(pseudocol, nonsense_mutation, "\n", sep = "\t\t")
if(LocusID == "list")
{
SerotypeLookup.df <- SerotypeLookup.df %>% filter(.[[pseudocol]] == nonsense_mutation|
is.na(.[[pseudocol]]))
}
}
if(LocusID != "list")
{
cat("\n\n>", locus , "(Wildtype)\n", blastAA$WTAASeqLine_str, "\n", sep ="")
cat("\n\n>", sampleinfo$CurrSampleNo, "_", locus , "\n", blastAA$AASeqLine_str, "\n", sep ="")
}
##################################################################
# Mutations
##################################################################
motifs <- ""
mutationspresent <- ""
if(Mutns == "Yes")
{
motifs <- getmotifs(LocusMutations.df, Test_id, locus, LocusID, blastAA$AASeqLine_str)
}
#motifs <- ifelse(motifs == "?", "NA", motifs)
OutputLocus.df[mutationscol] <- motifs
if(locus_result_type == "mutations")
{
OutputLocusProfile.df[mutationscol] <- motifs #mutations
SampleProfile <- paste(SampleProfile, ProfileSeparator, locus, "[", motifs, "]", sep="")
cat(mutationscol, motifs, "\n", sep = "\t\t")
if(LocusID == "list")
{
SerotypeLookup.df <- SerotypeLookup.df %>% filter(.[[mutationscol]] == motifs|
is.na(.[[mutationscol]]))
}
}
##################################################################
# Lookup DNA alleles
##################################################################
# write DNASeqLine_NoDash_str to DestFile,
# BLAST vs. lookup table,
# parse out the allele numbers
sink(reflist$DestFile, split=FALSE, append = FALSE)
cat(">", sampleinfo$CurrSampleNo, "_", locus , "\n", blast_parsed$DNASeqLine_NoDash_str, sep ="")
sink()
if(locus_result_type == "allele")
{
#BLAST lookup table
Allele <- locusblast(locus, LocusList.df, sampleinfo, reflist, Test_id, Blast_evalue,
directorylist = directorylist)
#Allele <- ifelse(Allele == "?", NA, Allele)
OutputLocus.df[allelecol] <- Allele
OutputLocusProfile.df[allelecol] <- Allele #allele
SampleProfile <- paste0(SampleProfile, ProfileSeparator, locus, "[", Allele, "]")
cat(allelecol, Allele,"\n", sep = "\t\t")
if(LocusID == "list")
{
SerotypeLookup.df <- SerotypeLookup.df %>% filter(.[[allelecol]] == Allele|
is.na(.[[allelecol]]))
}
}
}#================================================================== End BLAST positive
}#******************************************************************** End WT gene found
}#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ End Locus Loop
# Interpret molecular characterization to get serotype
if(LocusID == "list")
{
NumResults <- dim(SerotypeLookup.df)[1]
if(NumResults == 0)
{
serotype <- paste0("NT[", serotype, "]")
}else
{
for(w in 1L:NumResults)
{
if(w == 1)
{
serotype <- SerotypeLookup.df$Serotype[w]
}else
{
serotype <- paste0(serotype, "/", SerotypeLookup.df$Serotype[w])
}
}
}
}
} #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ End Not in CPS Types List
} #///////////////////////////////////////////////////////////////////////// End Serogroup Found
if (m==1L)
{
SampleOutputProfile.df <- tibble(SampleNo = sampleinfo$CurrSampleNo, Serotype = serotype, Profile = SampleProfile)
}else
{
SampleOutputProfile2.df <- tibble(SampleNo = sampleinfo$CurrSampleNo, Serotype = serotype, Profile = SampleProfile )
SampleOutputProfile.df<- bind_rows(SampleOutputProfile.df, SampleOutputProfile2.df)
}
cat("\nProfile: ", SampleProfile, sep = "")
cat("\nSerotype: ", serotype, sep = "")
#Progress Bar
end[m] <- Sys.time()
time <- round(seconds_to_period(sum(end - init)), 0)
# Estimated remaining time based on the
# mean time that took to run the previous iterations
est <- NumSamples * (mean(end[end != 0] - init[init != 0])) - time
remaining <- round(seconds_to_period(est), 0)
cat(paste(" // Estimated time remaining:", remaining), "\n")
} #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< End Sample Loop
outfile <- paste0(directorylist$output_dir, "LabWareUpload_", Org_id, "_SEROTYPE.csv")
if (LocusID == "list")
{
write.csv(SampleOutputProfile.df, outfile, quote = FALSE, row.names = F)
}
if (NumSamples == 1L)
{
outfile_sero2 <- paste(directorylist$output_dir, "output_profile_SEROTYPE.csv", sep = "")
write.csv(OutputLocus.df, outfile_sero2, quote = FALSE, row.names = F )
}
dna_nf_file <- paste0(directorylist$output_dir, "output_dna_notfound.fasta")
if(file.exists(dna_nf_file))
{
nf_fasta <- read_fasta(dna_nf_file)
nf_fasta$sq <- as.character(nf_fasta$sq)
unique_nf <- distinct(nf_fasta, sq, .keep_all = TRUE)
unique_nf <- unique_nf[,c("name", "sq")]
unique_nf$name <- paste0(">", unique_nf$name)
unique_nf_file <- paste0(directorylist$output_dir, "output_dna_notfound_distinct.fasta")
write.table(unique_nf, file = unique_nf_file, row.names = FALSE, col.names = FALSE,
quote = FALSE, sep = "\n")
}
elapsed <- format(Sys.time() - start_time)
cat("\n\nDone!\n\n\n", "Elapsed time: ", elapsed,
"\n\n Serotyping Results are ready in output folder\n", sep = "")
return(SampleOutputProfile.df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.