R/utilities.R

Defines functions .sysargsrunCommandline .checkOutArgs2 .makeBam .moduleload .tryRunC checkPkg showDF filterDEGs run_DESeq2 run_edgeR readComp returnRPKM alignStats symLink2bam preprocessReads clusterRun runCommandline writeTargetsout

Documented in alignStats clusterRun filterDEGs preprocessReads readComp returnRPKM runCommandline run_DESeq2 run_edgeR showDF symLink2bam writeTargetsout

###########################################################
## Additional utilities for SYSargs and SYSargs2 objects ##
###########################################################

######################################################
## Convenience write function for targetsout(args) ##
######################################################
writeTargetsout <- function(x, file = "default", silent = FALSE, overwrite = FALSE, step = NULL, new_col = NULL,
                            new_col_output_index = NULL, remove = FALSE, ...) {
    if (all(class(x) != "SYSargs" & class(x) != "SYSargs2")) stop("Argument 'x' needs to be assigned an object of class 'SYSargs' OR 'SYSargs2")
    ## SYSargs class
    if (class(x) == "SYSargs") {
        targets <- targetsout(x)
        software <- software(x)
        if (file == "default") {
            file <- paste("targets_", software, ".txt", sep = "")
            file <- gsub(" {1,}", "_", file)
        } else {
            file <- file
        }
        headerlines <- targetsheader(x)
        ## SYSargs2 class
    } else if (class(x) == "SYSargs2") {
        if (is.null(step)) {
            stop(
                "Argument 'step' needs to be assigned one of the following values: ",
                paste(names(x$clt), collapse = ", "), " OR the corresponding position"
            )
        }
        if (all(!is.null(step) & is.character(step) & !any(names(x$clt) %in% step))) {
            stop(
                "Argument 'step' can only be assigned one of the following values: ",
                paste(names(x$clt), collapse = ", "), " OR the corresponding position"
            )
        }
        if (all(!is.null(step) & is.numeric(step) & !any(seq_along(names(x$clt)) %in% step))) {
            stop(
                "Argument 'step' can only be assigned one of the following position: ",
                paste(seq_along(names(x$clt)), collapse = ", "), " OR the corresponding names"
            )
        }
        targets <- targets.as.df(targets(x))
        if (remove == TRUE) {
            targets <- targets[, -c(which(grepl("FileName", colnames(targets))))]
        }
        ## Adding the columns
        if ((!is.null(new_col) & is.null(new_col_output_index)) |
            (is.null(new_col) & !is.null(new_col_output_index)) |
            (is.null(new_col) & is.null(new_col_output_index))) {
            cat("One of 'new_col' and 'new_col_output_index' is null. It is using default column naming and adding all the output files expected, and each one will be written in a different column. \n")
            for (i in seq_len(length(output(x)[[1]][[step]]))) {
                pout <- sapply(names(output(x)), function(y) output(x)[[y]][[step]][[i]], simplify = FALSE)
                targets[[paste0(files(x)$steps, "_", i)]] <- as.character(pout)
            }
        } else if (!is.null(new_col) & !is.null(new_col_output_index)) {
            if (any(length(output(x)[[1]][[step]]) < new_col_output_index) | any(new_col_output_index < 1)) {
                stop("'new_col_output_index' argument needs to be equal or bigger than 1 and smaller than ", length(output(x)[[1]][[1]]), ", the maximum number of outputs files.")
            }
            if (length(new_col) != length(new_col_output_index)) {
                stop("'new_col' should have the same length as 'new_col_output_index'")
            }
            for (i in seq_along(new_col)) {
                pout <- sapply(names(output(x)), function(y) output(x)[[y]][[step]][[new_col_output_index[i]]], simplify = FALSE)
                targets[[as.character(new_col[i])]] <- as.character(pout)
            }
        }
        ## Workflow and Step Name
        software <- strsplit(basename(files(x)$cwl), split = "\\.")[[1]][1]
        if (is.character(step)) {
            step <- strsplit(step, split = "\\.")[[1]][1]
        } else {
            step <- strsplit(names(x$clt)[step], split = "\\.")[[1]][1]
        }
        if (file == "default") {
            file <- paste("targets_", step, ".txt", sep = "")
            file <- gsub(" {1,}", "_", file)
        } else {
            file <- file
        }
        headerlines <- targetsheader(x)[[1]]
        names <- c(new_col, colnames(targets[, -c(which(grepl(paste(new_col, collapse = "|"), colnames(targets))))]))
        targets <- cbind(targets[, new_col], targets[, -c(which(grepl(paste(new_col, collapse = "|"), colnames(targets))))])
        colnames(targets) <- names
    }
    if (file.exists(file) & overwrite == FALSE) stop("I am not allowed to overwrite files; please delete existing file: ", file, " or set 'overwrite=TRUE'")
    targetslines <- c(paste(colnames(targets), collapse = "\t"), apply(targets, 1, paste, collapse = "\t"))
    writeLines(c(headerlines, targetslines), file, ...)
    if (silent != TRUE) cat("\t", "Written content of 'targetsout(x)' to file:", file, "\n")
}
## Usage:
# writeTargetsout(x=args, file="default") ## SYSargs class
# writeTargetsout(x=WF, file="default", step=1, new_col = "FileName1", new_col_output_index = 1) ## SYSargs2 class

##############################################################################
## Function to run NGS aligners including sorting and indexing of BAM files ##
##############################################################################
runCommandline <- function(args, runid = "01",
                           make_bam = FALSE, del_sam = TRUE,
                           dir = TRUE, dir.name = NULL,
                           force = FALSE, input_targets = NULL, ...) {
    ## Validation for 'args'
    if (any(!inherits(args, "SYSargs") & !inherits(args, "SYSargs2"))) stop("Argument 'args' needs to be assigned an object of class 'SYSargs' OR 'SYSargs2'")
    ## Environment Modules section
    .moduleload(args)
    ## SYSargs class ##
    if (class(args) == "SYSargs") {
        .sysargsrunCommandline(args = args, runid = runid, make_bam = make_bam, del_sam = del_sam)
    } else if (class(args) == "SYSargs2") {
        ## SYSargs2 class ##
        ## Workflow Name (Workflow OR CommandLineTool class)
        if (length(wf(args)$steps) == 0) {
            cwl.wf <- gsub("[[:space:]]", "_", paste(baseCommand(args), collapse = "_"), perl = TRUE)
        } else {
            cwl.wf <- Biostrings::strsplit(basename(files(args)$cwl), split = "\\.")[[1]][1]
        }
        ## Check if "results" folders exists...
        if (!dir.exists(file.path(yamlinput(args)$results_path$path))) {
            stop("Please check our current directory ('getwd()').
           The PATH defined at `yamlinput(args)` was not found and it is required.")
        }
        logdir <- file.path(yamlinput(args)$results_path$path)
        if (is.null(dir.name)) {
            dir.name <- cwl.wf
        } else {
            dir.name <- dir.name
        }
        ## Check if expected files exists or not
        ## Important: this happen in the new location in the case of dir=TRUE
        return <- .checkOutArgs2(args,
            make_bam = make_bam, dir = dir, dir.name = dir.name,
            force = force, del_sam = del_sam
        )
        args.return <- return$args_complete
        completed <- return$completed
        args <- return$args
        ## Check if the command line is exported
        if (!baseCommand(args) == c("bash")) {
            cmd_test <- tryCMD(command = baseCommand(args), silent = TRUE)
            if (cmd_test == "error") {
                stop(
                    "\n", baseCommand(args), ": command not found. ",
                    "\n", "Please make sure to configure your PATH environment variable according to the software in use.",
                    call. = FALSE
                )
            }
        }
        ## Create log files
        file_log <- file.path(logdir, paste0(
            "submitargs", runid, "_",
            dir.name, "_log_",
            format(Sys.time(), "%b%d%Y_%H%Ms%S"),
            paste(sample(0:9, 4), collapse = "")
        ))
        ## Check input
        if (length(args$inputvars) >= 1) {
            inpVar <- args$inputvars
            check.inp <- colSums(sapply(inpVar, function(y) sapply(yamlinput(args), function(x) x["path"] %in% y)))
            df.targets <- targets.as.df(args$targets)[names(check.inp[check.inp > 0])]
            inp_targets2 <- FALSE
        } else {
            inp_targets2 <- TRUE
        }
        ## Targets input
        if (!is.null(input_targets)) {
            if (inherits(input_targets, c("numeric", "integer"))) {
                if (!all(input_targets %in% seq_along(cmdlist(args)))) {
                    stop("Please select the 'input_targets' number accordingly, options are: ", "\n",
                        "        ", paste0(seq_along(cmdlist(args)), collapse = ", "),
                        call. = FALSE
                    )
                }
            } else if (inherits(input_targets, "character")) {
                if (!all(input_targets %in% SampleName(args))) {
                    stop("Please select the 'input_targets' name accordingly, options are: ", "\n",
                        "        ", paste0(SampleName(args), collapse = ", "),
                        call. = FALSE
                    )
                }
                input_targets <- which(SampleName(args) %in% input_targets)
            }
            input_targets <- input_targets
        } else {
            input_targets <- seq_along(cmdlist(args))
        }
        ## Sample status and Time
        sample_status <- sapply(names(cmdlist(args)), function(x) {
            list(sapply(args$files$steps, function(x) list(NULL)))
        })
        sample_status <- sample_status[input_targets]
        time_status <- args$status$status.time
        ## Progress bar
        pb <- utils::txtProgressBar(min = 0, max = length(input_targets), style = 3)
        ## LOOP
        for (i in input_targets) {
            ii <- names(cmdlist(args)[i])
            utils::setTxtProgressBar(pb, i)
            cat("## ", names(cmdlist(args)[i]), "\n", file = file_log, fill = TRUE, append = TRUE)
            ## Time
            time_status[ii, ]$time_start <- as.POSIXct(Sys.time(), origin = "1970-01-01")
            for (j in seq_along(cmdlist(args)[[i]])) {
                ## Add_command file at log file
                cat(c(
                    paste0("Time: ", paste0(format(Sys.time(), "%b%d%Y_%H%Ms%S"))), "\n",
                    "### Code: ",
                    "```{r, eval=FALSE} ",
                    cmdlist(args)[[i]][[j]][[1]],
                    "```", "\n",
                    "### Stdout: ",
                    "```{r, eval=FALSE}"
                ), file = file_log, sep = "\n", append = TRUE)
                ## Run the commandline only for samples for which no output file is available OR force == TRUE
                if (all(force == FALSE && all(as.logical(completed[[i]][[j]])))) {
                    cat("The expected output file(s) already exist", "\n", file = file_log, fill = TRUE, append = TRUE)
                    ## close R chunk
                    cat("```", file = file_log, sep = "\n", append = TRUE)
                    sample_status[[ii]][[args$files$steps[j]]] <- "Success"
                    next()
                } else {
                    ## Create an object for executable
                    command <- gsub(" .*", "", as.character(cmdlist(args)[[i]][[j]]))
                    commandargs <- gsub("^.*? ", "", as.character(cmdlist(args)[[i]][[j]]))
                    ## Return an error if input file doesn't exist
                    if (all(inp_targets2) || all(inp_targets <- file.exists(as.character(df.targets[i, ])))) {
                        stdout <- .tryRunC(command, commandargs)
                    } else {
                        stdout <- list(
                            stdout = paste(paste0(as.character(df.targets[i, ])[!inp_targets], collapse = ", "), "\n are missing"),
                            warning = "", error = "We have an error because target(s) file doesn't exit"
                        )
                    }
                    cat(stdout$stdout, file = file_log, sep = "\n", append = TRUE)
                    ## report status
                    if (!is.null(stdout$error)) {
                        cat("## Error", file = file_log, sep = "\n", append = TRUE)
                        cat(stdout$error, file = file_log, sep = "\n", append = TRUE)
                        sample_status[[ii]][[args$files$steps[j]]] <- "Error"
                    } else if (!is.null(stdout$warning)) {
                        cat("## Warning", file = file_log, sep = "\n", append = TRUE)
                        cat(stdout$warning, file = file_log, sep = "\n", append = TRUE)
                        sample_status[[ii]][[args$files$steps[j]]] <- "Warning"
                    } else if (all(is.null(stdout$error) && is.null(stdout$warning))) {
                        sample_status[[ii]][[args$files$steps[j]]] <- "Success"
                    }
                    ## close R chunk
                    cat("```", file = file_log, sep = "\n", append = TRUE)
                }
                ## converting sam to bam using Rsamtools package
                .makeBam(output(args)[[i]][[j]], make_bam = make_bam, del_sam = del_sam)
            }
            time_status[ii, ]$time_end <- as.POSIXct(Sys.time(), origin = "1970-01-01")
        }
        args.return[["files"]][["log"]] <- file_log
        ## In the case of make_bam=TRUE
        if (make_bam == TRUE) {
            args <- .checkOutArgs2(args,
                make_bam = make_bam, dir = FALSE,
                force = force, dir.name = dir.name,
                del_sam = del_sam
            )$args_complete
        }
        ## Create recursive the subfolders
        if (dir == TRUE) {
            for (i in seq_along(names(cmdlist(args)))) {
                full_path <- file.path(logdir, dir.name)
                if (dir.exists(full_path) == FALSE) {
                    dir.create(full_path, recursive = TRUE)
                }
            }
            ## copy log files
            if (dir.exists(file.path(logdir, dir.name, "_logs")) == FALSE) {
                dir.create(file.path(logdir, dir.name, "_logs"), recursive = TRUE)
            }
            file.rename(from = file_log, to = file.path(logdir, dir.name, "_logs", basename(file_log)))
            args.return[["files"]][["log"]] <- file.path(logdir, dir.name, "_logs", basename(file_log))
            ## output FILES
            for (i in seq_along(output(args))) {
                if (length(output(args)[[i]]) > 1) {
                    for (j in seq_along(output(args)[[i]])) {
                        for (k in seq_along(output(args)[[i]][[j]])) {
                            if (file.exists(output(args)[[i]][[j]])) {
                                name <- Biostrings::strsplit(output(args)[[i]][[j]][[k]], split = "\\/")[[1]]
                                name <- name[length(name)]
                                file.rename(from = output(args)[[i]][[j]][[k]], to = file.path(logdir, dir.name, name))
                            } else if (!file.exists(output(args.return)[[i]][[j]][[k]])) {
                                dump <- "No such file or directory"
                            }
                        }
                    }
                } else if (length(output(args)[[i]]) == 1) {
                    for (j in seq_along(output(args)[[i]][[1]])) {
                        if (file.exists(output(args)[[i]][[1]][[j]])) {
                            name <- Biostrings::strsplit(output(args)[[i]][[1]][[j]], split = "\\/")[[1]]
                            name <- name[length(name)]
                            file.rename(from = output(args)[[i]][[1]][[j]], to = file.path(logdir, dir.name, name))
                        } else if (!file.exists(output(args)[[i]][[1]][[j]])) {
                            dump <- "No such file or directory"
                        }
                    }
                }
            }
        }
        ## Status and log.files
        sample_status <- lapply(sample_status, function(x) lapply(x, function(y) if (is.null(y)) y <- "Pending" else y))
        df.status <- data.frame(matrix(do.call("c", sample_status), nrow = length(sample_status), byrow = TRUE))
        colnames(df.status) <- files(args.return)$steps
        check <- check.output(args.return)[input_targets, ]
        df.status.f <- cbind(check, df.status)
        df.status.f[c(2:4)] <- sapply(df.status.f[c(2:4)], as.numeric)
        df.status.f[c(5:ncol(df.status.f))] <- sapply(df.status.f[c(5:ncol(df.status.f))], as.character)
        rownames(df.status.f) <- df.status.f$Targets
        ## updating object
        args.return[["status"]]$status.summary <- .statusSummary(df.status.f)
        args.return[["status"]]$status.completed[match(df.status.f$Targets, args.return[["status"]]$status.completed$Targets), ] <- df.status.f
        args.return[["status"]]$status.time <- time_status
        ## time
        args.return[["status"]]$status.time$time_start <- as.POSIXct(args.return[["status"]]$status.time$time_start, origin = "1970-01-01")
        args.return[["status"]]$status.time$time_end <- as.POSIXct(args.return[["status"]]$status.time$time_end, origin = "1970-01-01")
        # registering total time
        newTimeDF <- args.return[["status"]]$status.time[input_targets, ]
        args.return[["status"]]$total.time <- list(time_start = newTimeDF$time_start[1], time_end = newTimeDF$time_end[length(newTimeDF$time_end)])
        ## double check output file
        cat("\n")
        cat(crayon::blue("---- Summary ----"), "\n")
        print(df.status.f)
        close(pb)
        return(args.return)
    }
}

## Usage:
# WF <- runCommandline(WF, make_bam=TRUE) # creates the files in the ./results folder
# WF <- runCommandline(WF, dir=TRUE) # creates the files in the ./results/workflowName/Samplename folder
# WF <- runCommandline(WF, make_bam = FALSE, dir=TRUE) ## For hisat2-mapping.cwl template
# runid="01"; make_bam=FALSE; del_sam=FALSE; dir=TRUE; dir.name=NULL; force=FALSE
# runid="01"; make_bam=FALSE; del_sam=FALSE; dir=TRUE; dir.name=NULL; force=TRUE
# runid="01"; make_bam=FALSE; del_sam=FALSE; dir=TRUE; dir.name=NULL; force=TRUE

############################################################################################
## batchtools-based function to submit runCommandline jobs to queuing system of a cluster ##
############################################################################################
## The advantage of this function is that it should work with most queuing/scheduling systems such as SLURM, Troque, SGE, ...
clusterRun <- function(args,
                       FUN = runCommandline,
                       more.args = list(args = args, make_bam = TRUE),
                       conffile = ".batchtools.conf.R",
                       template = "batchtools.slurm.tmpl",
                       Njobs, runid = "01", resourceList) {
    checkPkg("batchtools", quietly = FALSE)
    ## Validity checks of inputs
    if (!any(inherits(args, c("SYSargs", "SYSargs2", "SYSargsList")))) {
        stop("Argument 'args' needs to be assigned an object of class 'SYSargs' OR 'SYSargs2'")
    }
    if (!inherits(FUN, "function")) {
        stop("Value assigned to 'FUN' argument is not an object of class function.")
    }
    if (!file.exists(conffile)) {
        stop(
            "Need to point under 'conffile' argument to proper config file. ",
            "See more information here: https://mllg.github.io/batchtools/reference/makeRegistry.html. ",
            "Note: in this file *.tmpl needs to point to a valid template file."
        )
    }
    if (!file.exists(template)) {
        stop(
            "Need to point under 'template' argument to proper template file. ",
            "Sample template files for different schedulers are available here: https://github.com/mllg/batchtools/blob/master/inst/templates/"
        )
    }
    if (!class(more.args) == "list") {
        stop("'more.args' needs to be object of class 'list'.")
    }
    if (any(!names(more.args) %in% names(as.list(formals(FUN))))) {
        stop(
            "The list of arguments assigned to 'more.args' can only be the ",
            "following arguments defined in the function 'FUN':",
            paste(names(as.list(formals(FUN))), collapse = ", ")
        )
    }
    if (grepl("slurm", template)) {
        if (!grepl("slurm", Sys.getenv("PATH"))) {
            try(suppressWarnings(modulecmd_path <- system("which modulecmd", intern = TRUE, ignore.stderr = TRUE)), silent = TRUE)
            ## "Environment Modules" is not available
            if (length(modulecmd_path) == 0) {
                stop("Please export Slurm to your PATH")
                ## "Environment Modules" is available and proceed the module load
            } else if (length(modulecmd_path) > 0) {
                module_out <- moduleload("slurm") # loads specified software from module system
            }
        }
    }
    if (inherits(args, c("SYSargs", "SYSargs2"))) {
        if (!identical(FUN, runCommandline)) stop("For 'SYargs' OR 'SYSargs2' class, please use `FUN=runCommandline` function")
        ## SYSargs class
        if (inherits(args, "SYSargs")) {
            path <- normalizePath(results(args))
            args.f <- seq(along = args)
            ## SYSargs2 class
        } else if (inherits(args, "SYSargs2")) {
            path <- normalizePath(args$yamlinput$results_path$path)
            args.f <- seq(along = cmdlist(args))
        }
        ## batchtools routines
        fct <- function(i, args, ...) FUN(args = args[i], ...)
        logdir1 <- paste0(path, "/submitargs", runid, "_btdb_", paste(sample(0:9, 4), collapse = ""))
        reg <- batchtools::makeRegistry(file.dir = logdir1, conf.file = conffile, packages = "systemPipeR")
        ids <- batchtools::batchMap(fun = fct, args.f, more.args = more.args, reg = reg)
        chunk <- batchtools::chunk(ids$job.id, n.chunks = Njobs, shuffle = FALSE)
        ids$chunk <- chunk
        done <- batchtools::submitJobs(ids = ids, reg = reg, resources = resourceList)
        return(reg)
    } else if (inherits(args, c("SYSargsList"))) {
        if (!identical(FUN, runWF)) stop("For 'SYargsList' class, please use `FUN=runWF` function")
        sal <- args
        fct_sal <- function(i, sal, ...) {
            sal <- runWF(sysargs = sal, ...)
            return(sal)
        }
        path <- normalizePath(sal$projectInfo$results)
        for (j in seq_along(sal)) {
            logdir1 <- paste0(path, "/submitargs", 01, "_btdb_", paste(sample(0:9, 4), collapse = ""))
            reg <- batchtools::makeRegistry(file.dir = logdir1, conf.file = conffile, packages = "systemPipeR")
            ids <- batchtools::batchMap(fun = fct_sal, 1, more.args = list(sal = sal, steps = j), reg = reg)
            chunk <- batchtools::chunk(ids$job.id, n.chunks = 1, shuffle = FALSE)
            ids$chunk <- chunk
            done <- batchtools::submitJobs(ids = ids, reg = reg, resources = resourceList)
            batchtools::waitForJobs()
            sal <- batchtools::loadResult(reg = reg, id = ids)
        }
        return(sal)
    }
}

## Usage:
# targets <- system.file("extdata", "targets.txt", package="systemPipeR")
# dir_path <- system.file("extdata/cwl", package="systemPipeR")
# WF <- loadWorkflow(targets=targets, wf_file="hisat2/hisat2-mapping-se.cwl",
#                    input_file="hisat2/hisat2-mapping-se.yml", dir_path=dir_path)
# WF <- renderWF(WF, inputvars=c(FileName="_FASTQ_PATH1_", SampleName="_SampleName_"))
# WF
# runCommandline(WF[1])
#
# resources <- list(walltime=120, ntasks=1, ncpus=4, memory=1024)
# reg <- clusterRun(WF[1], FUN = runCommandline,
#                   more.args = list(args = WF, make_bam = TRUE),
#                   conffile=".batchtools.conf.R",
#                   template="batchtools.slurm.tmpl",
#                   Njobs=9, runid="01", resourceList=resources)
# getStatus(reg=reg)
# waitForJobs(reg=reg)
#
# sal <- SPRproject(overwrite=TRUE)
# targetspath <- system.file("extdata/cwl/example/targets_example.txt", package="systemPipeR")
# appendStep(sal) <- SYSargsList(step_name = "echo", targets=targetspath, dir=TRUE, wf_file="example/workflow_example.cwl", input_file="example/example.yml", dir_path = system.file("extdata/cwl", package="systemPipeR"),inputvars = c(Message = "_STRING_", SampleName = "_SAMPLE_"))
#
# appendStep(sal) <- LineWise(code = {
#   hello <- lapply(getColumn(sal, step=1, 'outfiles'), function(x) yaml::read_yaml(x))
# },
# step_name = "R_getCol",
# dependency = "echo")
#
# args <- sal
# sal <- clusterRun(sal, FUN = runWF,
#                   more.args = list(),
#                   conffile=".batchtools.conf.R",
#                   template="batchtools.slurm.tmpl",
#                   Njobs=1, runid="01", resourceList=resources)

########################
## Read preprocessing ##
########################
preprocessReads <- function(args = NULL,
                            FileName1 = NULL, FileName2 = NULL,
                            outfile1 = NULL, outfile2 = NULL,
                            Fct, batchsize = 100000, overwrite = TRUE, ...) {
    ## Checking input either args or individual files
    if (all(is.null(args) && is.null(FileName1) && is.null(outfile1))) {
        stop("At least one argument must be provided, either 'args' or 'FileName1' and 'outfile1'.")
    }
    ## Checking the function argument
    if (class(Fct) != "character") stop("Argument 'Fct' needs to be of class character")
    if (!is.null(args)) {
        if (all(!inherits(args, "SYSargs") & !inherits(args, "SYSargs2"))) stop("Argument 'args' needs to be assigned an object of class 'SYSargs' OR 'SYSargs2")
        if (class(args) == "SYSargs") {
            colnames_args <- colnames(targetsout(args)) # SYSargs
            targets_in <- targetsin(args)
            if (!all(c("FileName1", "FileName2") %in% colnames_args)) {
                outpaths <- outpaths(args) # SYSargs
            } else {
                outpaths1 <- as.character(targetsout(args)$FileName1)
                outpaths2 <- as.character(targetsout(args)$FileName2)
            }
        } else if (class(args) == "SYSargs2") {
            colnames_args <- colnames(targets.as.df(args$targets)) # SYSargs2
            targets_in <- targets.as.df(args$targets)
            if (!all(c("FileName1", "FileName2") %in% colnames_args)) {
                outpaths <- subsetWF(args = args, slot = "output", subset = 1, index = 1)
            } else {
                outpaths1 <- subsetWF(args = args, slot = "output", subset = 1, index = 1)
                outpaths2 <- subsetWF(args = args, slot = "output", subset = 1, index = 2)
            }
        }
    } else if (!is.null(FileName1)) {
        if (!is.null(FileName2)) {
            colnames_args <- c("FileName1", "FileName2")
            targets_in <- data.frame(FileName1, FileName2)
            if (is.null(outfile1) | is.null(outfile2)) stop("'outfile1' and 'outfile2' are required.")
            outpaths1 <- outfile1
            outpaths2 <- outfile2
        } else {
            colnames_args <- c("FileName")
            targets_in <- data.frame(FileName1)
            if (is.null(outfile1)) stop("'outfile1' is required.")
            outpaths <- outfile1
        }
    }
    ## Run function in loop over all fastq files
    ## Single end fastq files
    if (!all(c("FileName1", "FileName2") %in% colnames_args)) {
        for (i in seq(along = outpaths)) {
            outfile <- outpaths[i]
            ## Delete existing fastq files with same names, since writeFastq will append to them
            if (overwrite == TRUE) {
                if (any(file.exists(outfile))) unlink(outfile)
            } else {
                if (any(file.exists(outfile))) stop("File ", outfile, " exists. Please delete file first or set overwrite=TRUE.")
            }
            ## Run preprocessor function with FastqStreamer
            counter <- 0
            f <- ShortRead::FastqStreamer(normalizePath(targets_in$FileName[i]), batchsize)
            while (length(fq <- ShortRead::yield(f))) {
                fqtrim <- eval(parse(text = Fct))
                ShortRead::writeFastq(fqtrim, outfile, mode = "a", ...)
                counter <- counter + length(fqtrim)
                cat(counter, "processed reads written to file:", outfile, "\n")
            }
            close(f)
        }
    }
    ## Paired end fastq files
    if (all(c("FileName1", "FileName2") %in% colnames_args)) {
        for (i in seq(along = outpaths1)) {
            p1 <- as.character(targets_in$FileName1[i])
            p2 <- as.character(targets_in$FileName2[i])
            p1out <- as.character(outpaths1[i])
            p2out <- as.character(outpaths2[i])
            ## Delete existing fastq files with same names, since writeFastq will append to them
            if (overwrite == TRUE) {
                if (any(file.exists(p1out))) unlink(p1out)
                if (any(file.exists(p2out))) unlink(p2out)
            } else {
                if (any(file.exists(p1out))) {
                    stop("File ", p1out, " exists. Please delete file first or set overwrite=TRUE.")
                }
                if (any(file.exists(p2out))) {
                    stop("File ", p2out, " exists. Please delete file first or set overwrite=TRUE.")
                }
            }
            ## Run preprocessor function with FastqStreamer
            counter1 <- 0
            counter2 <- 0
            f1 <- ShortRead::FastqStreamer(p1, batchsize)
            f2 <- ShortRead::FastqStreamer(p2, batchsize)
            while (length(fq1 <- ShortRead::yield(f1))) {
                fq2 <- ShortRead::yield(f2)
                if (length(fq1) != length(fq2)) {
                    stop("Paired end files cannot have different read numbers.")
                }
                ## Process p1
                fq <- fq1 # for simplicity in eval
                fq1trim <- eval(parse(text = Fct))
                ## Index for p1
                index1 <- as.character(ShortRead::id(fq1)) %in% as.character(ShortRead::id(fq1trim))
                names(index1) <- seq(along = index1)
                index1 <- names(index1[index1])
                ## Process p2
                fq <- fq2 # for simplicity in eval
                fq2trim <- eval(parse(text = Fct))
                ## Index for p1
                index2 <- as.character(ShortRead::id(fq2)) %in% as.character(ShortRead::id(fq2trim))
                names(index2) <- seq(along = index2)
                index2 <- names(index2[index2])
                ## Export to processed paired files
                indexpair1 <- index1 %in% index2
                ShortRead::writeFastq(fq1trim[indexpair1], p1out, mode = "a", ...)
                indexpair2 <- index2 %in% index1
                ShortRead::writeFastq(fq2trim[indexpair2], p2out, mode = "a", ...)
                counter1 <- counter1 + sum(indexpair1)
                cat(counter1, "processed reads written to file:", p1out, "\n")
                counter2 <- counter2 + sum(indexpair2)
                cat(counter2, "processed reads written to file:", p2out, "\n")
            }
            close(f1)
            close(f2)
        }
    }
}
## Usage:
# preprocessReads(args=args, Fct="trimLRPatterns(Rpattern='GCCCGGGTAA', subject=fq)", batchsize=100000, overwrite=TRUE, compress=TRUE)
# preprocessReads(FileName1 = "data/SRR446027_1.fastq.gz", FileName2 = NULL,
#                  outfile1 = "results/SRR446027_1.fastq_trim.gz", outfile2 = NULL,
#                  Fct="trimLRPatterns(Rpattern='GCCCGGGTAA', subject=fq)", batchsize=100000, overwrite=TRUE, compress=TRUE)
#
# preprocessReads(FileName1 = "data/SRR446027_1.fastq.gz", FileName2 = "data/SRR446027_2.fastq.gz",
#                  outfile1 = "results/SRR446027_1.fastq_trim.gz", outfile2 = "results/SRR446027_2.fastq_trim.gz",
#                  Fct="trimLRPatterns(Rpattern='GCCCGGGTAA', subject=fq)", batchsize=100000, overwrite=TRUE, compress=TRUE)

##################################################################
## Function to create sym links to bam files for viewing in IGV ##
##################################################################
symLink2bam <- function(sysargs, command = "ln -s", htmldir, ext = c(".bam", ".bai"), urlbase, urlfile) {
    if (!any(inherits(sysargs, "SYSargs"), inherits(sysargs, "SYSargs2"), inherits(sysargs, "character"))) stop("Argument 'sysargs' needs to be assigned an object of class 'SYSargs' OR 'SYSargs2 OR 'named character vector'")
    ## SYSargs class
    if (inherits(sysargs, "SYSargs")) {
        bampaths <- outpaths(sysargs)
        symname <- SampleName(sysargs)
        ## SYSargs2 class ##
    } else if (inherits(sysargs, "SYSargs2")) {
        bampaths <- normalizePath(subsetWF(args = sysargs, slot = "output", subset = 1, index = 1))
        symname <- names(targets(sysargs))
    } else if (inherits(sysargs, "character")) {
        if (is.null(names(sysargs))) stop("Please provide a named character vector, where the names elements should be the sampleID")
        bampaths <- sysargs
        symname <- names(sysargs)
    }
    ## Create URL file
    urls <- paste(urlbase, htmldir[2], symname, ext[1], "\t", symname, sep = "")
    writeLines(urls, urlfile)
    ## Check directory and Create corresponding sym links
    if (!dir.exists(paste(htmldir, collapse = ""))) {
        dir.create(paste(htmldir, collapse = ""), recursive = TRUE)
        message(paste(htmldir, collapse = ""), "directory was created successfully!")
    }
    symname <- rep(symname, each = 2)
    symname <- paste(symname, c(ext[1], paste(ext, collapse = "")), sep = "")
    bampaths2 <- as.vector(t(cbind(bampaths, paste(bampaths, ext[2], sep = ""))))
    symcommands <- paste(command, " ", bampaths2, " ", paste(htmldir, collapse = ""), symname, sep = "")
    for (i in symcommands) system(i)
    message("Symbolic links were created successfully!")
}
## Usage:
# symLink2bam(sysargs=args, command="ln -s", htmldir=c("~/.html/", "somedir/"), ext=c(".bam", ".bai"), urlbase="http://cluster.hpcc.ucr.edu/~tgirke/", urlfile="IGVurl.txt")

#####################
## Alignment Stats ##
#####################
alignStats <- function(args, fqpaths, pairEnd = TRUE,
                       output_index = 1,
                       subset = "FileName1") {
    ## SYSargs class
    if (class(args) == "SYSargs") {
        fqpaths <- infile1(args)
        bampaths <- outpaths(args)
        if (!nchar(infile2(args))[1] > 0) pairEnd <- FALSE
        # SYSargs2 class
    } else if (class(args) == "SYSargs2") {
        fqpaths <- infile1(args)
        output.all <- subsetWF(args, slot = "output", subset = 1, index = output_index)
        bampaths <- as.character()
        for (i in seq_along(output.all)) {
            for (j in seq_along(output.all[[i]])) {
                if (grepl(".sam$", output.all[[i]][[j]]) == TRUE & grepl(".bam$", output.all[[i]][[j]]) == FALSE) {
                    stop("Please provide files in BAM format; it can be checked as 'output(args)'. Also, check 'output_update' function, if the BAM files were previously generated.")
                } else if (grepl(".bam$", output.all[[i]][[j]]) == TRUE & grepl("sorted.bam$", output.all[[i]][[j]]) == FALSE) {
                    bampaths <- c(bampaths, output.all[[i]][[j]])
                }
            }
        }
        names(bampaths) <- names(output.all)
        if (!nchar(infile2(args))[1] > 0) pairEnd <- FALSE
    } else if (class(args) == "character") {
        if (missing(fqpaths)) stop("Please provide fqpaths argument! It must be a named character vector, where the names elements should be the sampleID")
        bampaths <- args
        fqpaths <- fqpaths
    }
    bamexists <- file.exists(bampaths)
    fqpaths <- fqpaths[bamexists]
    bampaths <- bampaths[bamexists]
    ## Obtain total read number from FASTQ files
    Nreads <- ShortRead::countLines(fqpaths) / 4
    names(Nreads) <- names(fqpaths)
    ## If reads are PE multiply by 2 as a rough approximation
    if (pairEnd) Nreads <- Nreads * 2
    ## Obtain total number of alignments from BAM files
    bfl <- BamFileList(bampaths, yieldSize = 50000, index = character())
    param <- ScanBamParam(flag = scanBamFlag(isUnmappedQuery = FALSE))
    Nalign <- Rsamtools::countBam(bfl, param = param)
    ## Obtain number of primary alignments from BAM files
    param <- Rsamtools::ScanBamParam(flag = Rsamtools::scanBamFlag(isSecondaryAlignment = FALSE, isUnmappedQuery = FALSE))
    Nalignprim <- Rsamtools::countBam(bfl, param = param)
    statsDF <- data.frame(
        FileName = names(Nreads),
        Nreads = Nreads,
        Nalign = Nalign$records,
        Perc_Aligned = Nalign$records / Nreads * 100,
        Nalign_Primary = Nalignprim$records,
        Perc_Aligned_Primary = Nalignprim$records / Nreads * 100
    )
    if (pairEnd) colnames(statsDF)[which(colnames(statsDF) == "Nreads")] <- "Nreads2x"
    return(statsDF)
}
## Usage:
# read_statsDF <- alignStats(args=args)

########################
## RPKM Normalization ##
########################
returnRPKM <- function(counts, ranges) {
    geneLengthsInKB <- sum(width(reduce(ranges))) / 1000 # Length of exon union per gene in kbp
    millionsMapped <- sum(counts) / 1e+06 # Factor for converting to million of mapped reads.
    rpm <- counts / millionsMapped # RPK: reads per kilobase of exon model.
    rpkm <- rpm / geneLengthsInKB # RPKM: reads per kilobase of exon model per million mapped reads.
    return(rpkm)
}
## Usage:
# countDFrpkm <- apply(countDF, 2, function(x) returnRPKM(counts=x, ranges=eByg))

###############################################
## Read Sample Comparisons from Targets File ##
###############################################
## Parses sample comparisons from <CMP> line(s) in targets.txt file or SYSars object.
## All possible comparisons can be specified with 'CMPset: ALL'.
readComp <- function(file, format = "vector", delim = "-") {
    if (!format %in% c("vector", "matrix")) stop("Argument format can only be assigned: vector or matrix!")
    if (inherits(file, "SYSargs")) {
        if (length(targetsheader(file)) == 0) stop("Input has no targets header lines.")
        comp <- targetsheader(file)
        ## SYSargs2 class
    } else if (inherits(file, "SYSargs2")) {
        if (length(targetsheader(file))[[1]] == 0) stop("Input has no targets header lines.")
        comp <- targetsheader(file)[[1]]
    } else {
        comp <- readLines(file)
    }
    comp <- comp[grepl("<CMP>", comp)]
    comp <- gsub("#.*<CMP>| {1,}", "", comp)
    comp <- gsub("\t", "", comp)
    comp <- gsub("^\"|\"$", "", comp) # Often required if Excel is used for editing targets file
    comp <- strsplit(comp, ":|,")
    names(comp) <- lapply(seq(along = comp), function(x) comp[[x]][1])
    comp <- sapply(names(comp), function(x) comp[[x]][-1], simplify = FALSE)
    ## Check whether all samples are present in Factor column of targets file
    checkvalues <- unique(unlist(strsplit(unlist(comp), "-")))
    checkvalues <- checkvalues[checkvalues != "ALL"]
    if (inherits(file, "SYSargs")) {
        all <- unique(as.character(targetsin(file)$Factor))
    } else if (inherits(file, "SYSargs2")) {
        all <- unique(as.character(targets.as.df(targets(file))$Factor))
    } else {
        all <- unique(as.character(read.delim(file, comment.char = "#")$Factor))
    }
    if (any(!checkvalues %in% all)) stop("The following samples are not present in Factor column of targets file: ", paste(checkvalues[!checkvalues %in% all], collapse = ", "))
    ## Generate outputs
    allindex <- sapply(names(comp), function(x) any(grepl("ALL", comp[[x]])))
    if (any(allindex)) for (i in which(allindex)) comp[[i]] <- combn(all, m = 2, FUN = paste, collapse = delim)
    if (format == "vector" & delim != "-") comp <- sapply(names(comp), function(x) gsub("-", delim, comp[[x]]), simplify = FALSE)
    if (format == "vector") {
        return(comp)
    }
    if (format == "matrix") {
        return(sapply(names(comp), function(x) do.call("rbind", Biostrings::strsplit(comp[[x]], "-")), simplify = FALSE))
    }
}
## Usage:
# targetspath <- system.file("extdata", "targets.txt", package="systemPipeR")
# cmp <- readComp(targetspath, format="vector", delim="-")
# cmp <- readComp(WF, format="vector", delim="-")

#######################################################################
## Run edgeR GLM with entire count matrix or subsetted by comparison ##
#######################################################################
## If independent=TRUE then countDF will be subsetted for each comparison
run_edgeR <- function(countDF, targets, cmp, independent = TRUE, paired = NULL, mdsplot = "") {
    pkg <- c("edgeR", "limma", "grDevices")
    checkPkg(pkg, quietly = FALSE)
    ## if(class(cmp) != "matrix" & length(cmp)==2) cmp <- t(as.matrix(cmp)) # If cmp is vector of length 2, convert it to matrix.
    ## fix for _R_CHECK_LENGTH_1_LOGIC2_ error: " --- failure: the condition has length > 1 ---"
    if (all(class(cmp) != "matrix" & length(cmp) == 2)) cmp <- t(as.matrix(cmp))
    samples <- as.character(targets$Factor)
    names(samples) <- paste(as.character(targets$SampleName), "", sep = "")
    countDF <- countDF[, names(samples)]
    countDF[is.na(countDF)] <- 0
    edgeDF <- data.frame(row.names = rownames(countDF))
    group <- as.character(samples)
    if (independent == TRUE) {
        loopv <- seq(along = cmp[, 1])
    } else {
        loopv <- 1
    }
    for (j in loopv) {
        ## Filtering and normalization
        y <- edgeR::DGEList(counts = countDF, group = group) # Constructs DGEList object
        if (independent == TRUE) {
            subset <- samples[samples %in% cmp[j, ]]
            y <- y[, names(subset)]
            y$samples$group <- factor(as.character(y$samples$group))
        }
        keep <- rowSums(edgeR::cpm(y) > 1) >= 2
        y <- y[keep, ]
        y <- edgeR::calcNormFactors(y)
        ## Design matrix
        if (length(paired) == 0) {
            design <- model.matrix(~ 0 + y$samples$group, data = y$samples)
            colnames(design) <- levels(y$samples$group)
        } else {
            if (length(paired) > 0 & independent == FALSE) stop("When providing values under 'paired' also set independent=TRUE")
            Subject <- factor(paired[samples %in% cmp[j, ]]) # corrected Jun 2014 (won't change results)
            Treat <- y$samples$group
            design <- model.matrix(~ Subject + Treat)
            levels(design) <- levels(y$samples$group)
        }
        ## Estimate dispersion
        y <- edgeR::estimateGLMCommonDisp(y, design, verbose = TRUE) # Estimates common dispersions
        y <- edgeR::estimateGLMTrendedDisp(y, design) # Estimates trended dispersions
        y <- edgeR::estimateGLMTagwiseDisp(y, design) # Estimates tagwise dispersions
        fit <- edgeR::glmFit(y, design) # Fits the negative binomial GLM for each tag and produces an object of class DGEGLM with some new components.
        ## Contrast matrix is optional but makes anlysis more transparent
        if (independent == TRUE) {
            mycomp <- paste(cmp[j, 1], cmp[j, 2], sep = "-")
        } else {
            mycomp <- paste(cmp[, 1], cmp[, 2], sep = "-")
        }
        if (length(paired) == 0) contrasts <- limma::makeContrasts(contrasts = mycomp, levels = design)
        for (i in seq(along = mycomp)) {
            if (length(paired) == 0) {
                lrt <- edgeR::glmLRT(fit, contrast = contrasts[, i]) # Takes DGEGLM object and carries out the likelihood ratio test.
            } else {
                lrt <- edgeR::glmLRT(fit) # No contrast matrix with paired design
            }
            deg <- as.data.frame(edgeR::topTags(lrt, n = length(rownames(y))))
            colnames(deg) <- paste(paste(mycomp[i], collapse = "_"), colnames(deg), sep = "_")
            edgeDF <- cbind(edgeDF, deg[rownames(edgeDF), ])
        }
        if (nchar(mdsplot) > 0) {
            grDevices::pdf(paste("./results/sample_MDS_", paste(unique(subset), collapse = "-"), ".pdf", sep = ""))
            limma::plotMDS(y)
            grDevices::dev.off()
        }
    }
    return(edgeDF)
}
## Usage:
# cmp <- readComp(file=targetspath, format="matrix", delim="-")
# edgeDF <- run_edgeR(countDF=countDF, targets=targets, cmp=cmp[[1]], independent=TRUE, mdsplot="")

####################################################################
## Run DESeq2 with entire count matrix or subsetted by comparison ##
####################################################################
## If independent=TRUE then countDF will be subsetted for each comparison
run_DESeq2 <- function(countDF, targets, cmp, independent = FALSE, lfcShrink = FALSE, type = "normal") {
    pkg <- c("DESeq2")
    checkPkg(pkg, quietly = FALSE)
    ## if(class(cmp) != "matrix" & length(cmp)==2) cmp <- t(as.matrix(cmp)) # If cmp is vector of length 2, convert it to matrix.
    ## fix for _R_CHECK_LENGTH_1_LOGIC2_ error: " --- failure: the condition has length > 1 ---"
    if (all(class(cmp) != "matrix" & length(cmp) == 2)) cmp <- t(as.matrix(cmp))
    samples <- as.character(targets$Factor)
    names(samples) <- paste(as.character(targets$SampleName), "", sep = "")
    countDF <- countDF[, names(samples)]
    countDF[is.na(countDF)] <- 0
    deseqDF <- data.frame(row.names = rownames(countDF))
    if (independent == TRUE) {
        loopv <- seq(along = cmp[, 1])
    } else {
        loopv <- 1
    }
    for (j in loopv) {
        if (independent == TRUE) {
            ## Create subsetted DESeqDataSet object
            subset <- samples[samples %in% cmp[j, ]]
            countDFsub <- countDF[, names(subset)]
            dds <- DESeq2::DESeqDataSetFromMatrix(countData = as.matrix(countDFsub), colData = data.frame(condition = subset), design = ~condition)
            mycmp <- cmp[j, , drop = FALSE]
        } else {
            ## Create full DESeqDataSet object
            dds <- DESeq2::DESeqDataSetFromMatrix(countData = as.matrix(countDF), colData = data.frame(condition = samples), design = ~condition)
            mycmp <- cmp
        }
        ## Estimate of (i) size factors, (ii) dispersion, (iii) negative binomial GLM fitting and (iv) Wald statistics
        dds <- DESeq2::DESeq(dds, quiet = TRUE)
        for (i in seq(along = mycmp[, 1])) {
            ## Extracts DEG results for specific contrasts from DESeqDataSet object
            res <- DESeq2::results(dds, contrast = c("condition", mycmp[i, ]))
            ## lfcShrink
            if (lfcShrink == FALSE) {
                res <- DESeq2::results(dds, contrast = c("condition", mycmp[i, ]))
            } else if (lfcShrink == TRUE) {
                suppressMessages(
                    res <- DESeq2::lfcShrink(dds, contrast = c("condition", mycmp[i, ]), type = type)
                )
            }
            ## Set NAs to reasonable values to avoid errors in downstream filtering steps
            res[is.na(res[, "padj"]), "padj"] <- 1
            res[is.na(res[, "log2FoldChange"]), "log2FoldChange"] <- 0
            deg <- as.data.frame(res)
            colnames(deg)[colnames(deg) %in% c("log2FoldChange", "padj")] <- c("logFC", "FDR")
            colnames(deg) <- paste(paste(mycmp[i, ], collapse = "-"), colnames(deg), sep = "_")
            deseqDF <- cbind(deseqDF, deg[rownames(deseqDF), ])
        }
    }
    return(deseqDF)
}
## Usage:
# cmp <- readComp(file=targetspath, format="matrix", delim="-")
# degseqDF <- run_DESeq2(countDF=countDF, targets=targets, cmp=cmp[[1]], independent=FALSE)

###########################################
## Filter DEGs by p-value and fold change ##
############################################
filterDEGs <- function(degDF, filter, plot = TRUE) {
    ## global functions or variables
    Comparisons <- Counts <- Type <- NULL
    pval <- degDF[, grep("_FDR$", colnames(degDF)), drop = FALSE]
    log2FC <- degDF[, grep("_logFC$", colnames(degDF)), drop = FALSE]
    ## DEGs that are up or down regulated
    pf <- pval <= filter["FDR"] / 100 & (log2FC >= log2(filter["Fold"]) | log2FC <= -log2(filter["Fold"]))
    colnames(pf) <- gsub("_FDR", "", colnames(pf))
    pf[is.na(pf)] <- FALSE
    DEGlistUPorDOWN <- sapply(colnames(pf), function(x) rownames(pf[pf[, x, drop = FALSE], , drop = FALSE]), simplify = FALSE)
    ## DEGs that are up regulated
    pf <- pval <= filter["FDR"] / 100 & log2FC >= log2(filter["Fold"])
    colnames(pf) <- gsub("_FDR", "", colnames(pf))
    pf[is.na(pf)] <- FALSE
    DEGlistUP <- sapply(colnames(pf), function(x) rownames(pf[pf[, x, drop = FALSE], , drop = FALSE]), simplify = FALSE)
    ## DEGs that are down regulated
    pf <- pval <= filter["FDR"] / 100 & log2FC <= -log2(filter["Fold"])
    colnames(pf) <- gsub("_FDR", "", colnames(pf))
    pf[is.na(pf)] <- FALSE
    DEGlistDOWN <- sapply(colnames(pf), function(x) rownames(pf[pf[, x, drop = FALSE], , drop = FALSE]), simplify = FALSE)
    df <- data.frame(Comparisons = names(DEGlistUPorDOWN), Counts_Up_or_Down = sapply(DEGlistUPorDOWN, length), Counts_Up = sapply(DEGlistUP, length), Counts_Down = sapply(DEGlistDOWN, length))
    resultlist <- list(UporDown = DEGlistUPorDOWN, Up = DEGlistUP, Down = DEGlistDOWN, Summary = df)
    if (plot == TRUE) {
        mytitle <- paste("DEG Counts (", names(filter)[1], ": ", filter[1], " & ", names(filter)[2], ": ", filter[2], "%)", sep = "")
        df_plot <- data.frame(Comparisons = rep(as.character(df$Comparisons), 2), Counts = c(df$Counts_Up, df$Counts_Down), Type = rep(c("Up", "Down"), each = length(df[, 1])))
        p <- ggplot2::ggplot(df_plot, ggplot2::aes(Comparisons, Counts, fill = Type)) +
            ggplot2::geom_bar(position = "stack", stat = "identity") +
            ggplot2::coord_flip() +
            ggplot2::theme(axis.text.y = ggplot2::element_text(angle = 0, hjust = 1)) +
            ggplot2::ggtitle(mytitle)
        print(p)
    }
    return(resultlist)
}
## Usage:
# DEG_list <- filterDEGs(degDF=edgeDF, filter=c(Fold=2, FDR=1))

#############
## showDF ##
#############
showDF <- function(data, ...) {
    checkPkg("DT", quietly = FALSE)
    DT::datatable(
        data,
        extensions = c("FixedColumns", "Scroller"),
        options = list(
            scrollX = TRUE,
            fixedColumns = TRUE,
            deferRender = TRUE,
            scrollY = 200,
            scroller = TRUE
        )
    )
}
## Usage:
# targetspath <- system.file("extdata", "targets.txt", package="systemPipeR")
# targets <- read.delim(targetspath, comment.char = "#")
# showDF(targets)

###############
## checkPkg ##
###############
# param pkg character vector of the names of packages
checkPkg <- function(pkg, quietly = TRUE) {
    if (!inherits(pkg, "character")) stop("Argument 'pkg' needs to be assigned an object of class 'character'")
    for (i in pkg) {
        if (!requireNamespace(i, quietly = quietly)) {
            stop("Package '", i, "' should be installed.", "\n", call. = FALSE)
        }
    }
}
# pkg <- c("AnnotationDbi", "GO.db")
# pkg <- c("annotate")
# checkPkg(pkg, quietly = FALSE)

#########################
## Internal Functions ##
########################

###############
## .tryRunC ##
###############
.tryRunC <- function(command, commandargs) {
    warning <- error <- NULL
    value <- withCallingHandlers(
        tryCatch(
            if (command %in% "bwa") {
                bwa_err <- tempfile()
                system2(command, args = commandargs, stdout = T, stderr = bwa_err)
                readLines(bwa_err)
            } else if (command %in% c("bash")) {
                system(paste(command, commandargs))
            } else if (isTRUE(grep("\\$", command) == 1)) {
                system(paste(command, commandargs))
            } else {
                system2(command, args = commandargs, stdout = TRUE, stderr = TRUE)
            },
            error = function(e) {
                error <<- conditionMessage(e)
                NULL
            }
        ),
        warning = function(w) {
            warning <<- append(warning, conditionMessage(w))
            invokeRestart("muffleWarning")
        }
    )
    list(stdout = value, warning = warning, error = error)
}

## Usage:
# .tryRunC("ls", "la")
# .tryRunC("ls", "-la")

###########################################################################
## .moduleload function: internal function to module load <modules> ##
###########################################################################
.moduleload <- function(args) {
    ## Environment Modules section
    if (any(nchar(gsub(" {1,}", "", modules(args))) > 0)) {
        ## Check if "Environment Modules" is in the PATH
        # try(suppressWarnings(modulecmd_path <- system("which modulecmd", intern = TRUE, ignore.stderr = TRUE)),
        #     silent = TRUE
        # )
        modulecmd_path <- suppressMessages(is.modules.avail())
        ## "Environment Modules" is not available
        if (length(modulecmd_path) == 0) {
            dump <- "do nothing"
            ## "Environment Modules" is available and proceed the module load
        } else if (length(modulecmd_path) > 0) {
            for (j in modules(args)) moduleload(j) # loads specified software from module system
        }
    }
}

###########################################################################
## .makeBam function: internal function to convert *.sam to *.bam file ##
###########################################################################
.makeBam <- function(output_args, make_bam = TRUE, del_sam = TRUE) {
    if (all(!is.logical(c(make_bam, del_sam)))) stop("Arguments needs to be assigned 'TRUE' and 'FALSE'")
    if (make_bam == TRUE) {
        sam_files <- grepl(".sam$", output_args)
        others_files <- grepl("vcf$|bcf$|xls$|bed$", output_args)
        completed.bam <- grepl(".bam$", output_args)
        if (any(sam_files)) {
            for (k in which(sam_files)) {
                if (file.exists(output_args[k])) {
                    Rsamtools::asBam(file = output_args[k], destination = gsub("\\.sam$", "", output_args[k]), overwrite = TRUE, indexDestination = TRUE)
                    if (del_sam == TRUE) {
                        unlink(output_args[k])
                    } else if (del_sam == FALSE) {
                        dump <- "do nothing"
                    }
                } else {
                    next()
                }
            }
        } else if (any(others_files)) {
            dump <- "do nothing"
        }
        if (any(completed.bam)) { # If output is unindexed *.bam file (e.g. Tophat2)
            for (k in which(completed.bam)) {
                Rsamtools::sortBam(file = output_args[k], destination = gsub("\\.bam$", "", output_args[k]))
                Rsamtools::indexBam(output_args[k])
            }
        }
    } else if (make_bam == FALSE) {
        dump <- "do nothing"
    }
}
## Usage:
# .makeBam(output(args)[[1]][[1]], make_bam=TRUE, del_sam=FALSE)

########################################################
## .checkOutArgs2 function: internal function to check
## if the expected output has been created  ##
########################################################
.checkOutArgs2 <- function(args, make_bam, dir = dir, dir.name, force, del_sam = TRUE) {
    args_complete <- args
    if (make_bam == TRUE) {
        ## Validation for Hisat2
        if (any(grepl("samtools", names(clt(args_complete))))) {
            stop("argument 'make_bam' should be 'FALSE' when using the workflow with 'SAMtools'", call. = FALSE)
        }
        if (any(grepl(".sam", args_complete$output[[1]][[1]]))) {
            args_complete <- output_update(args_complete, dir = dir, dir.name = dir.name, replace = TRUE, extension = c(".sam", ".bam"), make_bam = make_bam, del_sam = del_sam)
        }
    }
    if (dir == TRUE) {
        args_complete <- output_update(args_complete, dir = dir, dir.name = dir.name)
    }
    completed <- output(args_complete)
    for (i in seq_along(output(args_complete))) {
        for (j in seq_along(output(args_complete)[[i]])) {
            completed[[i]][[j]] <- file.exists(output(args_complete)[[i]][[j]])
            names(completed[[i]][[j]]) <- output(args_complete)[[i]][[j]]
        }
    }
    check <- check.output(args_complete, "list")
    if (force) check <- replace(check, check == TRUE, FALSE)
    for (i in seq_along(check)) {
        if (check[i] == FALSE) {
            args[["output"]][[i]] <- args$internal_outfiles[[i]]
        }
    }
    return <- list(args = args, args_complete = args_complete, completed = completed)
    return(return)
}
## Usage:
# return <- .checkOutArgs2(args, make_bam=TRUE)
# args <- return$args
# completed <- return$completed

##########################################################################################
## .sysargsrunCommandline function: Old version of runCommandline accepts SYSargs class ##
##########################################################################################
.sysargsrunCommandline <- function(args, runid = "01", make_bam = TRUE, del_sam = TRUE, ...) {
    message("This method is using the previus version of SYSargs workflow control modules.
          Please check the new version 'SYSargs2'.")
    commands <- sysargs(args)
    completed <- file.exists(outpaths(args))
    names(completed) <- outpaths(args)
    logdir <- results(args)
    for (i in seq(along = commands)) {
        ## Run alignmets only for samples for which no BAM file is available.
        if (as.logical(completed)[i]) {
            next()
        } else {
            ## Create soubmitargsID_command file
            cat(commands[i], file = paste(logdir, "submitargs", runid, sep = ""), sep = "\n", append = TRUE)
            ## Run executable
            command <- gsub(" .*", "", as.character(commands[i]))
            commandargs <- gsub("^.*? ", "", as.character(commands[i]))
            ## Execute system command; note: BWA needs special treatment in stderr handling since it writes
            ## some stderr messages to sam file if used with system2()
            if (software(args) %in% c("bwa aln", "bwa mem")) {
                stdout <- system2(command, args = commandargs, stdout = TRUE, stderr = FALSE)
            } else if (software(args) %in% c("bash_commands")) {
                stdout <- system(paste(command, commandargs))
            } else {
                stdout <- system2(command, args = commandargs, stdout = TRUE, stderr = TRUE)
            }
            ## Create submitargsID_stdout file
            cat(commands[i], file = paste(logdir, "submitargs", runid, "_log", sep = ""), sep = "\n", append = TRUE)
            cat(unlist(stdout), file = paste(logdir, "submitargs", runid, "_log", sep = ""), sep = "\n", append = TRUE)
            ## Conditional postprocessing of results
            if (make_bam == TRUE) {
                if (grepl(".sam$", outfile1(args)[i])) { # If output is *.sam file (e.g. Bowtie2)
                    Rsamtools::asBam(file = outfile1(args)[i], destination = gsub("\\.sam$", "", outfile1(args)[i]), overwrite = TRUE, indexDestination = TRUE)
                    if (del_sam == TRUE) {
                        unlink(outfile1(args)[i])
                    } else if (del_sam == FALSE) {
                        dump <- "do nothing"
                    }
                } else if (grepl("vcf$|bcf$|xls$|bed$", outpaths(args)[i])) {
                    dump <- "do nothing"
                } else { # If output is unindexed *.bam file (e.g. Tophat2)
                    Rsamtools::sortBam(file = names(completed[i]), destination = gsub("\\.bam$", "", names(completed[i])))
                    Rsamtools::indexBam(names(completed[i]))
                }
            }
        }
    }
    bamcompleted <- gsub("sam$", "bam$", file.exists(outpaths(args)))
    names(bamcompleted) <- SampleName(args)
    cat("Missing alignment results (bam files):", sum(!as.logical(bamcompleted)), "\n")
    cat("Existing alignment results (bam files):", sum(as.logical(bamcompleted)), "\n")
    return(bamcompleted)
}
## Usage:
# args <- systemArgs(sysma="param/hisat2.param", mytargets="targets.txt")
# sysargs(args)[1] # Command-line parameters for first FASTQ file
# system("hisat2-build ./data/tair10.fasta ./data/tair10.fasta")
# .sysargsrunCommandline(args=args)
tgirke/systemPipeR documentation built on March 27, 2024, 11:31 p.m.