R/Stage_0_FilteringLinkersFunctions.R

Defines functions Stage_0_Main_fun Create_fastq_list_fun Parse_fastqfiles_main_fun Check_If_sorted_fun Get_image_S0_fun

#' @importFrom ShortRead FastqStreamer yield writeFastq
#' @importFrom Biostrings DNAStringSet width BStringSet
#' @importClassesFrom ShortRead FastqStreamer ShortReadQ SFastqQuality SRFilterResult
#' @importClassesFrom Biostrings DNAStringSet BStringSet
#' @importFrom IRanges narrow
############################################## Main functions for stage 0
Stage_0_Main_fun = function(SA_prefix, S0_fastq1, S0_fastq2, S0_LinkerA, S0_LinkerB,
    S0_MinReadLength, S0_MaxReadLength, S0_LinkerOccurence, S0_image, S0_fastqStream,
    S0_Totfastqreads, S0_AnalysisDir) {
    # Take time:
    Analysis.time.start = Sys.time()
    #----------------
    # create names to save the resulted fastq:
    #----------------
    if (!dir.exists(S0_AnalysisDir))
        dir.create(S0_AnalysisDir)
    FastqWriteList = Create_fastq_list_fun(S0_AnalysisDir = S0_AnalysisDir, SA_prefix = SA_prefix)
    # image dir:
    S0_image_dir = file.path(S0_AnalysisDir, paste(SA_prefix, "_stage_0_image.jpg",
        sep = ""))
    #----------------
    # break and filter fastq files:
    #----------------
    cat("Filtering fastq files...\n")
    Parse_fastqfiles_main_fun(S0_fastq1 = S0_fastq1, S0_fastq2 = S0_fastq2, S0_LinkerA = S0_LinkerA,
        S0_LinkerB = S0_LinkerB, S0_MinReadLength = S0_MinReadLength, S0_MaxReadLength = S0_MaxReadLength,
        S0_LinkerOccurence = S0_LinkerOccurence, S0_fastqStream = S0_fastqStream,
        S0_Totfastqreads = S0_Totfastqreads, FastqWriteList = FastqWriteList, S0_image = S0_image,
        SA_prefix = SA_prefix, S0_image_dir = S0_image_dir)
    # print:
    futile.logger::flog.info("=====================================", name = "SA_LogFile",
        capture = FALSE)
    futile.logger::flog.info("Stage 0 is done!", name = "SA_LogFile", capture = FALSE)
    futile.logger::flog.info(paste("Analysis results for stage 0 are in:\n", S0_AnalysisDir),
        name = "SA_LogFile", capture = FALSE)
    # save time:
    Analysis.time.end = Sys.time()
    Total.Time = Analysis.time.end - Analysis.time.start
    LogFile = paste("Total stage 0 time:", Total.Time, " ", units(Total.Time))
    futile.logger::flog.info(LogFile, name = "SA_LogFile", capture = FALSE)
}
# done
#-------------
#-------------
# function for creating the fastq files for the ouput to be saved:
Create_fastq_list_fun = function(S0_AnalysisDir, SA_prefix) {
    #-------------
    # those with A/A or B/B linkers:(used in subsequent stages)
    #-------------
    fastq1_usable_dir = file.path(S0_AnalysisDir, paste(SA_prefix, "_usable_1.fastq.gz",
        sep = ""))
    if (file.exists(fastq1_usable_dir))
        unlink(x = fastq1_usable_dir, recursive = TRUE, force = TRUE)
    fastq2_usable_dir = file.path(S0_AnalysisDir, paste(SA_prefix, "_usable_2.fastq.gz",
        sep = ""))
    if (file.exists(fastq2_usable_dir))
        unlink(x = fastq2_usable_dir, recursive = TRUE, force = TRUE)
    #-------------
    # those with A/B or B/A or one/two parts with no linkers
    #-------------
    fastq1_chimeric_dir = file.path(S0_AnalysisDir, paste(SA_prefix, "_chimeric_1.fastq.gz",
        sep = ""))
    if (file.exists(fastq1_chimeric_dir))
        unlink(x = fastq1_chimeric_dir, recursive = TRUE, force = TRUE)
    fastq2_chimeric_dir = file.path(S0_AnalysisDir, paste(SA_prefix, "_chimeric_2.fastq.gz",
        sep = ""))
    if (file.exists(fastq2_chimeric_dir))
        unlink(x = fastq2_chimeric_dir, recursive = TRUE, force = TRUE)
    #-------------
    # those with lack of linkers in each read sequence. (always discarded,except if
    # S0_KeepEmptyReads)
    #-------------
    fastq1_ambiguous_dir = file.path(S0_AnalysisDir, paste(SA_prefix, "_ambiguous_1.fastq.gz",
        sep = ""))
    if (file.exists(fastq1_ambiguous_dir))
        unlink(x = fastq1_ambiguous_dir, recursive = TRUE, force = TRUE)
    fastq2_ambiguous_dir = file.path(S0_AnalysisDir, paste(SA_prefix, "_ambiguous_2.fastq.gz",
        sep = ""))
    if (file.exists(fastq2_ambiguous_dir))
        unlink(x = fastq2_ambiguous_dir, recursive = TRUE, force = TRUE)
    #-------------
    # add to list:
    #-------------
    FastqWriteList = list()
    FastqWriteList[[1]] = list(object = NA, file = fastq1_usable_dir)
    FastqWriteList[[2]] = list(object = NA, file = fastq2_usable_dir)
    FastqWriteList[[3]] = list(object = NA, file = fastq1_chimeric_dir)
    FastqWriteList[[4]] = list(object = NA, file = fastq2_chimeric_dir)
    FastqWriteList[[5]] = list(object = NA, file = fastq1_ambiguous_dir)
    FastqWriteList[[6]] = list(object = NA, file = fastq2_ambiguous_dir)
    return(FastqWriteList)
}
# done
#-------------
#-------------
# function for cutting linkers from the yield fastq files
Parse_fastqfiles_main_fun = function(S0_fastq1, S0_fastq2, S0_LinkerA, S0_LinkerB,
    S0_MinReadLength, S0_MaxReadLength, S0_LinkerOccurence, S0_fastqStream, S0_Totfastqreads,
    FastqWriteList, S0_image, SA_prefix, S0_image_dir) {
    #----------------
    # Stream the fastq files:
    #----------------
    Streamfastq1 = ShortRead::FastqStreamer(con = S0_fastq1, n = S0_fastqStream)
    Streamfastq2 = ShortRead::FastqStreamer(con = S0_fastq2, n = S0_fastqStream)
    # the first of loop you need to create the files, then change to append
    WrittingMode = "w"
    # count yield/chimeric/usable/ambiguous/N reads:
    TotfastqLinesRead = 0
    Totchimeric = 0
    Totusable = 0
    Totambi = 0
    TotNs = 0
    # the following are for printing progress to the console:
    options(scipen = 999, digits = 2)  #for printing
    cat1 = "||lines scanned:"
    cat2 = "||usable reads:"
    cat3 = "||chimeric reads:"
    cat4 = "||ambiguous reads:"
    cat5 = "||N reads:"
    #----------------
    # loop though the files:
    #----------------
    repeat {
        #----------------
        # read the fastq files:
        #----------------
        # read yield for fastq1
        fastq1yield = ShortRead::yield(Streamfastq1)
        # read yield for fastq2
        fastq2yield = ShortRead::yield(Streamfastq2)
        # break if empty, all is read
        Curfastqyieldsize = length(fastq1yield)
        if (Curfastqyieldsize == 0)
            break
        TotfastqLinesRead = TotfastqLinesRead + Curfastqyieldsize
        #----------------
        # check if fastq files have the same ids and remove /1,/2.
        #----------------
        SortedReads = Check_If_sorted_fun(fastq1yield = fastq1yield, fastq2yield = fastq2yield)
        if (!SortedReads) {
            stop("S0_fastq1 and S0_fastq2 files are not sorted by ID, or their IDs before /1 and /2 are not identical.",
                call. = FALSE)
        }
        #----------------
        # Find fastq linker and split (c++11):
        #----------------
        FilteringResults = FilterFastqYield_fun_Rcpp(Curfastqyieldsize, as.character(fastq1yield@sread),
            Biostrings::width(fastq1yield@sread), as.character(fastq2yield@sread),
            Biostrings::width(fastq2yield@sread), S0_LinkerA, S0_LinkerB, S0_LinkerOccurence,
            S0_MinReadLength, S0_MaxReadLength)
        #----------------
        # check the whole Yield is NNs and skip:
        #----------------
        if (FilteringResults$TotNsYield == Curfastqyieldsize) {
            TotNs = TotNs + FilteringResults$TotNsYield
            # print:
            cat(cat1, TotfastqLinesRead, "(", TotfastqLinesRead/S0_Totfastqreads *
                100, "%)|| ", cat2, Totusable, "(", Totusable/TotfastqLinesRead *
                100, "%)|| ", cat3, Totchimeric, "(", Totchimeric/TotfastqLinesRead *
                100, "%)|| ", cat4, Totambi, "(", Totambi/TotfastqLinesRead * 100,
                "%)|| ", cat5, TotNs, "(", TotNs/TotfastqLinesRead * 100, "%)||\r",
                sep = "")
            # go to next
            next
        }
        #----------------
        # narrow results
        #----------------
        fastq1yield@sread = IRanges::narrow(fastq1yield@sread, start = 1, end = FilteringResults$NarrowingPos[,
            1])
        fastq1yield@quality = IRanges::narrow(fastq1yield@quality, start = 1,
            end = FilteringResults$NarrowingPos[, 1])
        fastq2yield@sread = IRanges::narrow(fastq2yield@sread, start = 1, end = FilteringResults$NarrowingPos[,
            2])
        fastq2yield@quality = IRanges::narrow(fastq2yield@quality, start = 1,
            end = FilteringResults$NarrowingPos[, 2])
        #----------------
        # add counts:
        #----------------
        Totusable = Totusable + FilteringResults$TotusableYield
        Totchimeric = Totchimeric + FilteringResults$TotchimericYield
        Totambi = Totambi + FilteringResults$TotambiYield
        TotNs = TotNs + FilteringResults$TotNsYield
        #----------------
        # split chimeric, usable and ambiguous for the fastq files:
        #----------------
        # take Ids:
        Usable_id = which(FilteringResults$FilterClasses == 1)
        Chim_id = which(FilteringResults$FilterClasses == 2)
        Amb_id = which(FilteringResults$FilterClasses == 0)
        # split:
        FastqWriteList[[1]]$object = fastq1yield[Usable_id]
        FastqWriteList[[2]]$object = fastq2yield[Usable_id]
        FastqWriteList[[3]]$object = fastq1yield[Chim_id]
        FastqWriteList[[4]]$object = fastq2yield[Chim_id]
        FastqWriteList[[5]]$object = fastq1yield[Amb_id]
        FastqWriteList[[6]]$object = fastq2yield[Amb_id]
        #----------------
        # save data: fastq1yield_usable, fastq2yield_usable, fastq1yield_chimeric,
        # fastq2yield_chimeric
        #----------------
        for (i in seq_len(6)) {
            ShortRead::writeFastq(object = FastqWriteList[[i]]$object, file = FastqWriteList[[i]]$file,
                mode = WrittingMode, compress = TRUE)
        }
        # change mode to append now:
        WrittingMode = "a"
        # print:
        cat(cat1, TotfastqLinesRead, "(", TotfastqLinesRead/S0_Totfastqreads * 100,
            "%)|| ", cat2, Totusable, "(", Totusable/TotfastqLinesRead * 100, "%)|| ",
            cat3, Totchimeric, "(", Totchimeric/TotfastqLinesRead * 100, "%)|| ",
            cat4, Totambi, "(", Totambi/TotfastqLinesRead * 100, "%)|| ", cat5, TotNs,
            "(", TotNs/TotfastqLinesRead * 100, "%)||\r", sep = "")
    }
    # close connections:
    close(Streamfastq1)
    close(Streamfastq2)
    #----------------
    # write in log file:
    #----------------
    TotfastqLinesRead100 = TotfastqLinesRead/S0_Totfastqreads * 100
    Totusable100 = Totusable/TotfastqLinesRead * 100
    Totchimeric100 = Totchimeric/TotfastqLinesRead * 100
    Totambi100 = Totambi/TotfastqLinesRead * 100
    TotNs100 = TotNs/TotfastqLinesRead * 100
    LogFile = list()
    LogFile[[1]] = paste("Total lines processed:", TotfastqLinesRead, "(", TotfastqLinesRead100,
        "%)")
    LogFile[[2]] = paste("Total usable reads:", Totusable, "(", Totusable100, "%)")
    LogFile[[3]] = paste("Total chimeric reads:", Totchimeric, "(", Totchimeric100,
        "%)")
    LogFile[[4]] = paste("Total ambiguous reads:", Totambi, "(", Totambi100, "%)")
    LogFile[[5]] = paste("Total NNs reads:", TotNs, "(", TotNs100, "%)")
    for (lf in seq_len(5)) futile.logger::flog.info(LogFile[[lf]], name = "SA_LogFile",
        capture = FALSE)
    #----------------
    # plot image:
    #----------------
    if (S0_image) {
        Get_image_S0_fun(Totusable100 = Totusable100, Totchimeric100 = Totchimeric100,
            Totambi100 = Totambi100, TotNs100 = TotNs100, S0_image_dir = S0_image_dir)
    }
}
# done
#-------------
#-------------
# function for checking if the fastq yields are sorted/have same id
Check_If_sorted_fun = function(fastq1yield, fastq2yield) {
    # get widths of IDs:
    fastq1_idwidth = Biostrings::width(fastq1yield@id)
    fastq2_idwidth = Biostrings::width(fastq2yield@id)
    # remove the /1 and /2 from the fastq files:
    id1 = IRanges::narrow(fastq1yield@id, start = 1, end = fastq1_idwidth - 1)
    id1 = as.character(id1)
    id2 = IRanges::narrow(fastq2yield@id, start = 1, end = fastq2_idwidth - 1)
    id2 = as.character(id2)
    # return:
    return(identical(id1, id2))
}
# done
#-------------
#-------------
# function for plotting for stage 0
Get_image_S0_fun = function(Totusable100, Totchimeric100, Totambi100, TotNs100, S0_image_dir) {
    # Rcheck:
    Value = NULL
    Kind = NULL
    #-------------
    # create data:
    #-------------
    S0_imagedata = data.frame(Kind = c(paste("Usable (", round(Totusable100, digits = 1),
        "%)", sep = ""), paste("Chimeric (", round(Totchimeric100, digits = 1), "%)",
        sep = ""), paste("Ambiguous (", round(Totambi100, digits = 1), "%)", sep = ""),
        paste("NNs (", round(TotNs100, digits = 1), "%)", sep = "")), Value = c(round(Totusable100),
        round(Totchimeric100), round(Totambi100), round(TotNs100)))
    #-------------
    # plot the split:
    #-------------
    S0_image = ggplot2::ggplot(S0_imagedata, ggplot2::aes(x = "", y = Value, fill = factor(Kind))) +
        ggplot2::geom_bar(width = 1, stat = "identity") + ggplot2::coord_polar(theta = "y") +
        ggplot2::theme(axis.title = ggplot2::element_blank(), plot.title = ggplot2::element_text(size = 20,
            color = "black"), legend.title = ggplot2::element_blank(), legend.text = ggplot2::element_text(size = 17),
            axis.text = ggplot2::element_blank(), legend.position = "bottom", legend.direction = "vertical",
            axis.ticks = ggplot2::element_blank()) + ggplot2::ggtitle("Pie chart for fastq files") +
        ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) + ggplot2::scale_fill_brewer(palette = "Dark2")
    # save:
    ggplot2::ggsave(plot = S0_image, file = S0_image_dir, scale = 2)
}
# done
IoannisVardaxis/MACPET documentation built on Aug. 9, 2019, 12:11 p.m.