#' @importFrom ShortRead FastqStreamer yield writeFastq
#' @importFrom Biostrings DNAStringSet width BStringSet
#' @importClassesFrom ShortRead FastqStreamer ShortReadQ SFastqQuality SRFilterResult
#' @importClassesFrom Biostrings DNAStringSet BStringSet
#' @importFrom IRanges narrow
############################################## Main functions for stage 0
Stage_0_Main_fun = function(SA_prefix, S0_fastq1, S0_fastq2, S0_LinkerA, S0_LinkerB,
S0_MinReadLength, S0_MaxReadLength, S0_LinkerOccurence, S0_image, S0_fastqStream,
S0_Totfastqreads, S0_AnalysisDir) {
# Take time:
Analysis.time.start = Sys.time()
#----------------
# create names to save the resulted fastq:
#----------------
if (!dir.exists(S0_AnalysisDir))
dir.create(S0_AnalysisDir)
FastqWriteList = Create_fastq_list_fun(S0_AnalysisDir = S0_AnalysisDir, SA_prefix = SA_prefix)
# image dir:
S0_image_dir = file.path(S0_AnalysisDir, paste(SA_prefix, "_stage_0_image.jpg",
sep = ""))
#----------------
# break and filter fastq files:
#----------------
cat("Filtering fastq files...\n")
Parse_fastqfiles_main_fun(S0_fastq1 = S0_fastq1, S0_fastq2 = S0_fastq2, S0_LinkerA = S0_LinkerA,
S0_LinkerB = S0_LinkerB, S0_MinReadLength = S0_MinReadLength, S0_MaxReadLength = S0_MaxReadLength,
S0_LinkerOccurence = S0_LinkerOccurence, S0_fastqStream = S0_fastqStream,
S0_Totfastqreads = S0_Totfastqreads, FastqWriteList = FastqWriteList, S0_image = S0_image,
SA_prefix = SA_prefix, S0_image_dir = S0_image_dir)
# print:
futile.logger::flog.info("=====================================", name = "SA_LogFile",
capture = FALSE)
futile.logger::flog.info("Stage 0 is done!", name = "SA_LogFile", capture = FALSE)
futile.logger::flog.info(paste("Analysis results for stage 0 are in:\n", S0_AnalysisDir),
name = "SA_LogFile", capture = FALSE)
# save time:
Analysis.time.end = Sys.time()
Total.Time = Analysis.time.end - Analysis.time.start
LogFile = paste("Total stage 0 time:", Total.Time, " ", units(Total.Time))
futile.logger::flog.info(LogFile, name = "SA_LogFile", capture = FALSE)
}
# done
#-------------
#-------------
# function for creating the fastq files for the ouput to be saved:
Create_fastq_list_fun = function(S0_AnalysisDir, SA_prefix) {
#-------------
# those with A/A or B/B linkers:(used in subsequent stages)
#-------------
fastq1_usable_dir = file.path(S0_AnalysisDir, paste(SA_prefix, "_usable_1.fastq.gz",
sep = ""))
if (file.exists(fastq1_usable_dir))
unlink(x = fastq1_usable_dir, recursive = TRUE, force = TRUE)
fastq2_usable_dir = file.path(S0_AnalysisDir, paste(SA_prefix, "_usable_2.fastq.gz",
sep = ""))
if (file.exists(fastq2_usable_dir))
unlink(x = fastq2_usable_dir, recursive = TRUE, force = TRUE)
#-------------
# those with A/B or B/A or one/two parts with no linkers
#-------------
fastq1_chimeric_dir = file.path(S0_AnalysisDir, paste(SA_prefix, "_chimeric_1.fastq.gz",
sep = ""))
if (file.exists(fastq1_chimeric_dir))
unlink(x = fastq1_chimeric_dir, recursive = TRUE, force = TRUE)
fastq2_chimeric_dir = file.path(S0_AnalysisDir, paste(SA_prefix, "_chimeric_2.fastq.gz",
sep = ""))
if (file.exists(fastq2_chimeric_dir))
unlink(x = fastq2_chimeric_dir, recursive = TRUE, force = TRUE)
#-------------
# those with lack of linkers in each read sequence. (always discarded,except if
# S0_KeepEmptyReads)
#-------------
fastq1_ambiguous_dir = file.path(S0_AnalysisDir, paste(SA_prefix, "_ambiguous_1.fastq.gz",
sep = ""))
if (file.exists(fastq1_ambiguous_dir))
unlink(x = fastq1_ambiguous_dir, recursive = TRUE, force = TRUE)
fastq2_ambiguous_dir = file.path(S0_AnalysisDir, paste(SA_prefix, "_ambiguous_2.fastq.gz",
sep = ""))
if (file.exists(fastq2_ambiguous_dir))
unlink(x = fastq2_ambiguous_dir, recursive = TRUE, force = TRUE)
#-------------
# add to list:
#-------------
FastqWriteList = list()
FastqWriteList[[1]] = list(object = NA, file = fastq1_usable_dir)
FastqWriteList[[2]] = list(object = NA, file = fastq2_usable_dir)
FastqWriteList[[3]] = list(object = NA, file = fastq1_chimeric_dir)
FastqWriteList[[4]] = list(object = NA, file = fastq2_chimeric_dir)
FastqWriteList[[5]] = list(object = NA, file = fastq1_ambiguous_dir)
FastqWriteList[[6]] = list(object = NA, file = fastq2_ambiguous_dir)
return(FastqWriteList)
}
# done
#-------------
#-------------
# function for cutting linkers from the yield fastq files
Parse_fastqfiles_main_fun = function(S0_fastq1, S0_fastq2, S0_LinkerA, S0_LinkerB,
S0_MinReadLength, S0_MaxReadLength, S0_LinkerOccurence, S0_fastqStream, S0_Totfastqreads,
FastqWriteList, S0_image, SA_prefix, S0_image_dir) {
#----------------
# Stream the fastq files:
#----------------
Streamfastq1 = ShortRead::FastqStreamer(con = S0_fastq1, n = S0_fastqStream)
Streamfastq2 = ShortRead::FastqStreamer(con = S0_fastq2, n = S0_fastqStream)
# the first of loop you need to create the files, then change to append
WrittingMode = "w"
# count yield/chimeric/usable/ambiguous/N reads:
TotfastqLinesRead = 0
Totchimeric = 0
Totusable = 0
Totambi = 0
TotNs = 0
# the following are for printing progress to the console:
options(scipen = 999, digits = 2) #for printing
cat1 = "||lines scanned:"
cat2 = "||usable reads:"
cat3 = "||chimeric reads:"
cat4 = "||ambiguous reads:"
cat5 = "||N reads:"
#----------------
# loop though the files:
#----------------
repeat {
#----------------
# read the fastq files:
#----------------
# read yield for fastq1
fastq1yield = ShortRead::yield(Streamfastq1)
# read yield for fastq2
fastq2yield = ShortRead::yield(Streamfastq2)
# break if empty, all is read
Curfastqyieldsize = length(fastq1yield)
if (Curfastqyieldsize == 0)
break
TotfastqLinesRead = TotfastqLinesRead + Curfastqyieldsize
#----------------
# check if fastq files have the same ids and remove /1,/2.
#----------------
SortedReads = Check_If_sorted_fun(fastq1yield = fastq1yield, fastq2yield = fastq2yield)
if (!SortedReads) {
stop("S0_fastq1 and S0_fastq2 files are not sorted by ID, or their IDs before /1 and /2 are not identical.",
call. = FALSE)
}
#----------------
# Find fastq linker and split (c++11):
#----------------
FilteringResults = FilterFastqYield_fun_Rcpp(Curfastqyieldsize, as.character(fastq1yield@sread),
Biostrings::width(fastq1yield@sread), as.character(fastq2yield@sread),
Biostrings::width(fastq2yield@sread), S0_LinkerA, S0_LinkerB, S0_LinkerOccurence,
S0_MinReadLength, S0_MaxReadLength)
#----------------
# check the whole Yield is NNs and skip:
#----------------
if (FilteringResults$TotNsYield == Curfastqyieldsize) {
TotNs = TotNs + FilteringResults$TotNsYield
# print:
cat(cat1, TotfastqLinesRead, "(", TotfastqLinesRead/S0_Totfastqreads *
100, "%)|| ", cat2, Totusable, "(", Totusable/TotfastqLinesRead *
100, "%)|| ", cat3, Totchimeric, "(", Totchimeric/TotfastqLinesRead *
100, "%)|| ", cat4, Totambi, "(", Totambi/TotfastqLinesRead * 100,
"%)|| ", cat5, TotNs, "(", TotNs/TotfastqLinesRead * 100, "%)||\r",
sep = "")
# go to next
next
}
#----------------
# narrow results
#----------------
fastq1yield@sread = IRanges::narrow(fastq1yield@sread, start = 1, end = FilteringResults$NarrowingPos[,
1])
fastq1yield@quality = IRanges::narrow(fastq1yield@quality, start = 1,
end = FilteringResults$NarrowingPos[, 1])
fastq2yield@sread = IRanges::narrow(fastq2yield@sread, start = 1, end = FilteringResults$NarrowingPos[,
2])
fastq2yield@quality = IRanges::narrow(fastq2yield@quality, start = 1,
end = FilteringResults$NarrowingPos[, 2])
#----------------
# add counts:
#----------------
Totusable = Totusable + FilteringResults$TotusableYield
Totchimeric = Totchimeric + FilteringResults$TotchimericYield
Totambi = Totambi + FilteringResults$TotambiYield
TotNs = TotNs + FilteringResults$TotNsYield
#----------------
# split chimeric, usable and ambiguous for the fastq files:
#----------------
# take Ids:
Usable_id = which(FilteringResults$FilterClasses == 1)
Chim_id = which(FilteringResults$FilterClasses == 2)
Amb_id = which(FilteringResults$FilterClasses == 0)
# split:
FastqWriteList[[1]]$object = fastq1yield[Usable_id]
FastqWriteList[[2]]$object = fastq2yield[Usable_id]
FastqWriteList[[3]]$object = fastq1yield[Chim_id]
FastqWriteList[[4]]$object = fastq2yield[Chim_id]
FastqWriteList[[5]]$object = fastq1yield[Amb_id]
FastqWriteList[[6]]$object = fastq2yield[Amb_id]
#----------------
# save data: fastq1yield_usable, fastq2yield_usable, fastq1yield_chimeric,
# fastq2yield_chimeric
#----------------
for (i in seq_len(6)) {
ShortRead::writeFastq(object = FastqWriteList[[i]]$object, file = FastqWriteList[[i]]$file,
mode = WrittingMode, compress = TRUE)
}
# change mode to append now:
WrittingMode = "a"
# print:
cat(cat1, TotfastqLinesRead, "(", TotfastqLinesRead/S0_Totfastqreads * 100,
"%)|| ", cat2, Totusable, "(", Totusable/TotfastqLinesRead * 100, "%)|| ",
cat3, Totchimeric, "(", Totchimeric/TotfastqLinesRead * 100, "%)|| ",
cat4, Totambi, "(", Totambi/TotfastqLinesRead * 100, "%)|| ", cat5, TotNs,
"(", TotNs/TotfastqLinesRead * 100, "%)||\r", sep = "")
}
# close connections:
close(Streamfastq1)
close(Streamfastq2)
#----------------
# write in log file:
#----------------
TotfastqLinesRead100 = TotfastqLinesRead/S0_Totfastqreads * 100
Totusable100 = Totusable/TotfastqLinesRead * 100
Totchimeric100 = Totchimeric/TotfastqLinesRead * 100
Totambi100 = Totambi/TotfastqLinesRead * 100
TotNs100 = TotNs/TotfastqLinesRead * 100
LogFile = list()
LogFile[[1]] = paste("Total lines processed:", TotfastqLinesRead, "(", TotfastqLinesRead100,
"%)")
LogFile[[2]] = paste("Total usable reads:", Totusable, "(", Totusable100, "%)")
LogFile[[3]] = paste("Total chimeric reads:", Totchimeric, "(", Totchimeric100,
"%)")
LogFile[[4]] = paste("Total ambiguous reads:", Totambi, "(", Totambi100, "%)")
LogFile[[5]] = paste("Total NNs reads:", TotNs, "(", TotNs100, "%)")
for (lf in seq_len(5)) futile.logger::flog.info(LogFile[[lf]], name = "SA_LogFile",
capture = FALSE)
#----------------
# plot image:
#----------------
if (S0_image) {
Get_image_S0_fun(Totusable100 = Totusable100, Totchimeric100 = Totchimeric100,
Totambi100 = Totambi100, TotNs100 = TotNs100, S0_image_dir = S0_image_dir)
}
}
# done
#-------------
#-------------
# function for checking if the fastq yields are sorted/have same id
Check_If_sorted_fun = function(fastq1yield, fastq2yield) {
# get widths of IDs:
fastq1_idwidth = Biostrings::width(fastq1yield@id)
fastq2_idwidth = Biostrings::width(fastq2yield@id)
# remove the /1 and /2 from the fastq files:
id1 = IRanges::narrow(fastq1yield@id, start = 1, end = fastq1_idwidth - 1)
id1 = as.character(id1)
id2 = IRanges::narrow(fastq2yield@id, start = 1, end = fastq2_idwidth - 1)
id2 = as.character(id2)
# return:
return(identical(id1, id2))
}
# done
#-------------
#-------------
# function for plotting for stage 0
Get_image_S0_fun = function(Totusable100, Totchimeric100, Totambi100, TotNs100, S0_image_dir) {
# Rcheck:
Value = NULL
Kind = NULL
#-------------
# create data:
#-------------
S0_imagedata = data.frame(Kind = c(paste("Usable (", round(Totusable100, digits = 1),
"%)", sep = ""), paste("Chimeric (", round(Totchimeric100, digits = 1), "%)",
sep = ""), paste("Ambiguous (", round(Totambi100, digits = 1), "%)", sep = ""),
paste("NNs (", round(TotNs100, digits = 1), "%)", sep = "")), Value = c(round(Totusable100),
round(Totchimeric100), round(Totambi100), round(TotNs100)))
#-------------
# plot the split:
#-------------
S0_image = ggplot2::ggplot(S0_imagedata, ggplot2::aes(x = "", y = Value, fill = factor(Kind))) +
ggplot2::geom_bar(width = 1, stat = "identity") + ggplot2::coord_polar(theta = "y") +
ggplot2::theme(axis.title = ggplot2::element_blank(), plot.title = ggplot2::element_text(size = 20,
color = "black"), legend.title = ggplot2::element_blank(), legend.text = ggplot2::element_text(size = 17),
axis.text = ggplot2::element_blank(), legend.position = "bottom", legend.direction = "vertical",
axis.ticks = ggplot2::element_blank()) + ggplot2::ggtitle("Pie chart for fastq files") +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) + ggplot2::scale_fill_brewer(palette = "Dark2")
# save:
ggplot2::ggsave(plot = S0_image, file = S0_image_dir, scale = 2)
}
# done
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.