R/preprocessing-createtablescores.R

Defines functions createtablescores .fulljoinfun .mergefiles

Documented in createtablescores

.mergefiles <- function(explist, tmpfold, verbose) {

    if (verbose) message("Merging files by experiment and direction")
    mergedfilelist <- mapply(function(currentfiles, currentname, tmpfold,
        verbose) {
            destfile <- file.path(tmpfold, paste0(currentname, ".tsv"))
            if (verbose) message("\t Combining all ", currentname,
                " files into ", destfile)
            if (isTRUE(all.equal(.Platform$OS.type, "windows"))) {
                cmdps <- paste0("Get-Content ",
                    paste(currentfiles, collapse = ", "),
                    "| Out-File -Encoding UTF8 ", destfile)
                system2("powershell.exe", args = c("-command", cmdps))
            } else {
                cmd <- paste0("cat ", paste(currentfiles, collapse = " "),
                " > ", destfile)
                system(cmd)
            }

            return(destfile)
        }, explist, names(explist), MoreArgs = list(tmpfold, verbose))

    return(mergedfilelist)
}

.fulljoinfun <- function(mergedfilelist, idxvec, showmemory, verbose) {

    colnamevec <- c("biotype", "chr", "coor1", "coor2", "transcript", "gene",
        "strand", "window", "id", "dataset", "score")
    colnamejoin <- colnamevec[-c(10, 11)] ## Remove dataset and score
    if (verbose) message("Reading files and joining to the final table")
    firstpath <- mergedfilelist[[idxvec[1]]]
    if (verbose) message("\t Reading and joining ", firstpath)
    finaltab <- utils::read.delim(firstpath, header = FALSE,
        sep = "\t", na.strings = "NA", dec = ".", col.names = colnamevec,
        stringsAsFactors = FALSE)

    for (idx in idxvec[-1]) {
        currentpath <- mergedfilelist[[idx]]
        if (verbose) message("\t Reading and joining ", currentpath)
        tab <- utils::read.delim(currentpath, header = FALSE, sep = "\t",
            na.strings = "NA", dec = ".", col.names = colnamevec,
            stringsAsFactors = FALSE)
        finaltab <- dplyr::full_join(finaltab, tab, by = colnamejoin)
        rm(tab)
        if (showmemory) print(gc()) else invisible(gc())
    }
    return(finaltab)
}

#' Create a Unified Table of Scores
#'
#' @description
#' This function processes and combines table scores of each bedgraph and each
#' chromosome stored in the temporary folder into a unified table.
#'
#' @usage
#' createtablescores(tmpfold, exptabpath, showmemory = FALSE, showtime = FALSE,
#'   savefinaltable = TRUE, finaltabpath = tempdir(), finaltabname = "anno.tsv",
#'  verbose)
#'
#' @param tmpfold A string specifying the temporary folder containing the
#'  score files created with the function 'blacklisthighmap'.
#' @param exptabpath Path to the experiment table file containing a table with
#'              columns named 'condition', 'replicate', 'strand', and 'path'.
#' @param showmemory Logical; if `TRUE`, memory usage is printed during
#'  processing. Default is `FALSE`.
#' @param showtime Logical; if `TRUE`, the execution time of the function is
#'  printed. Default is `FALSE`.
#' @param savefinaltable Logical; if `TRUE`, the resulting table is saved to
#'  disk. Default is `TRUE`.
#' @param finaltabpath A string specifying the directory where the final table
#'  should be saved. Default is \code{tempdir()}.
#' @param finaltabname A string specifying the name of the final table file.
#'  Default is `"anno.tsv"`.
#' @param verbose Logical; if `TRUE`, detailed messages are printed during
#'  execution.
#'
#' @return A data frame containing the unified table of scores.
#'
#' @details
#' This function first merges files belonging to the same experiment and
#' direction. These files are combined into a single table providing two columns
#' per experiment. The first gives the name of the experiment and the second the
#' scores. The resulting table also includes annotations for each transcript.
#'
#' @examples
#' \donttest{
#' exptabpath <- system.file("extdata", "exptab-preprocessing.csv", package="tepr")
#' gencodepath <- system.file("extdata", "gencode-chr13.gtf", package = "tepr")
#' maptrackpath <- system.file("extdata", "k50.umap.chr13.hg38.0.8.bed",
#' package = "tepr")
#' blacklistpath <- system.file("extdata", "hg38-blacklist-chr13.v2.bed",
#'     package = "tepr")
#' tmpfoldpath <- file.path(tempdir(), "tmptepr")
#' windsize <- 200
#' genomename <- "hg38"
#' chromtabtest <- rtracklayer::SeqinfoForUCSCGenome(genomename)
#' allchromvec <- GenomeInfoDb::seqnames(chromtabtest)
#' chromtabtest <- chromtabtest[allchromvec[which(allchromvec == "chr13")], ]
#' 
#' ## Copying bedgraphs to the current directory
#' expdfpre <- read.csv(exptabpath)
#' bgpathvec <- sapply(expdfpre$path, function(x) system.file("extdata", x,
#'     package = "tepr"))
#' expdfpre$path <- bgpathvec
#' write.csv(expdfpre, file = "exptab-preprocessing.csv", row.names = FALSE,
#'     quote = FALSE)
#' exptabpath <- "exptab-preprocessing.csv"
#'
#' ## Necessary result to call createtablescores
#' allannobed <- retrieveanno(exptabpath, gencodepath, verbose = FALSE)
#' allwindowsbed <- makewindows(allannobed, windsize, verbose = FALSE)
#' blacklisthighmap(maptrackpath, blacklistpath, exptabpath, nbcputrans = 1,
#'     allwindowsbed, windsize, genomename = genomename, chromtab = chromtabtest,
#'     verbose = FALSE)
#'
#' ## Calling the function to test
#' finaltabtest <- createtablescores(tmpfold = tmpfoldpath, exptabpath,
#'     savefinaltable = FALSE, verbose = FALSE)}
#'
#' @importFrom dplyr full_join
#' @importFrom utils read.csv read.delim write.table
#'
#' @seealso
#' [blacklisthighmap]
#'
#' @export

createtablescores <- function(tmpfold, exptabpath, showmemory = FALSE,
    showtime = FALSE, savefinaltable = TRUE, finaltabpath = tempdir(),
    finaltabname = "anno.tsv", verbose = TRUE) {

        if (showtime) start_time_fun <- Sys.time()
        if (verbose) message("\n ## Merging results of each bedgraph into a ",
            "single table ##\n")

        ## Reading the information about experiments
        if (verbose) message("Reading the information about experiments")
        exptab <- utils::read.csv(exptabpath, header = TRUE)

        ## Retrieving the file paths
        filevec <- list.files(tmpfold, full.names = TRUE)

        ## Splitting the files by experiment names
        expnamevec <- sapply(strsplit(basename(filevec), "-"), "[", 1)

        if (!isTRUE(all.equal(length(unique(table(expnamevec))), 1)))
            stop("\n\t Experiments have a different number of files. This ",
                "should not happen if you used blacklisthighmap with the same",
                "experiment table. Contact the developer.\n")

        explist <- split(filevec, factor(expnamevec))

        ## Merging files by experiment and direction
        mergedfilelist <- .mergefiles(explist, tmpfold, verbose)

        ## Retrieving the exp name in the right order from exptab
        orderedexpvec <- paste0(exptab$condition, exptab$replicate,
            exptab$direction)
        idxvec <- match(orderedexpvec, names(mergedfilelist))
        idxna <- which(is.na(idxvec))
        if (!isTRUE(all.equal(length(idxna), 0)))
            stop("\n\t The merged file names do not correspond to the exptab.",
                " This should not happen. Contact the developer.\n")

        ## Reading each merged file and combining it to the final table
        finaltab <- .fulljoinfun(mergedfilelist, idxvec, showmemory, verbose)

        ## Filling the experiment columns
        if (verbose) message("Filling the experiment columns")
        idxdatasetvec <- grep("dataset", colnames(finaltab))
        nbrows <- nrow(finaltab)
        for (idxdataset in idxdatasetvec) {
            expname <- as.character(na.omit(unique(finaltab[, idxdataset])))
            finaltab[, idxdataset] <- rep(expname, nbrows)
        }

        if (savefinaltable) {
            if (!file.exists(finaltabpath))
                dir.create(finaltabpath, recursive = TRUE)
            outfile <- file.path(finaltabpath, finaltabname)
            if (verbose) message("\n ## Saving the final table to ", outfile)
            utils::write.table(finaltab, file = outfile, sep = "\t",
                quote = FALSE, row.names = FALSE, col.names = FALSE,
                fileEncoding = "UTF8")
        }

        if (showtime) {
            end_time_fun <- Sys.time()
            timing <- end_time_fun - start_time_fun
            message("\t\t ## Final table created in: ",
                format(timing, digits = 2))
        }

        return(finaltab)
}

Try the tepr package in your browser

Any scripts or data that you put into this service are public.

tepr documentation built on June 8, 2025, 10:46 a.m.