Nothing
.coordandfilter <- function(str, transtable, nbrows) { # nolint
if (isTRUE(all.equal(str, "minus"))) {
coordvec <- seq(from = nbrows, to = 1, by = -1)
transtable <- cbind(transtable, coord = coordvec)
transtable <- transtable[order(coordvec), ]
transtable <- transtable %>% dplyr::select(!dplyr::matches("plus"))
} else {
coordvec <- seq(from = 1, to = nbrows, by = 1)
transtable <- cbind(transtable, coord = coordvec)
transtable <- transtable %>% dplyr::select(!dplyr::matches("minus"))
}
return(transtable)
}
.computeecdf <- function(transtable, expdf, rounding, nbrows) { # nolint
## Declaration to tackle CMD check
variable <- NULL
## Retrieving keyword plus or minus
str <- .extractstr(transtable)
## Create the coordinate column and select scores having the righ
## orientation
transtable <- .coordandfilter(str, transtable, nbrows)
colnamevec <- colnames(transtable)
colscorevec <- colnamevec[grep("_score", colnamevec)]
## Filling the NA of the score columns of the right strand with
## tidyr::fill in the downup direction
transtable <- transtable %>% tidyr::fill(tidyr::contains("score"),
.direction = "downup")
## Computing ecdf
suppressWarnings(dflong <- transtable %>%
tidyr::gather(key = "variable", value = "value", colscorevec))
dflong[, "value_round"] <- round(dflong$value * rounding)
ecdflist <- lapply(unique(dflong$variable), function(currentvar) {
dfsubset <- subset(dflong,
subset = variable == currentvar)
dfexpanded <- dfsubset[rep(seq_len(nrow(dfsubset)),
dfsubset$value_round), ]
funecdf <- stats::ecdf(dfexpanded[, "coord"])
dfsubset$Fx <- funecdf(dfsubset$coord)
return(dfsubset)
})
resecdf <- dplyr::bind_rows(ecdflist)
## Shrink the results back to the transtable keeping ecdf columns
res <- dplyr::select(tidyr::pivot_wider(resecdf,
names_from = "variable", values_from = c("value", "value_round",
"Fx")), -tidyselect::contains("value_round"))
## Removing strand from column names
colnames(res) <- gsub(paste0(".", str), "", colnames(res))
return(res)
}
#' Compute ECDF for Genes Based on Expression Data
#'
#' @description
#' This function calculates the empirical cumulative distribution function
#' (ECDF) for expressed genes across multiple transcripts. It processes the
#' expression data to filter out non-expressed transcripts, compute ECDF values
#' for each transcript, and combine the results into a unified data frame. The
#' function operates in parallel for speed optimization.
#'
#' @usage
#' genesECDF(allexprsdfs, expdf, nbcpu = 1, rounding = 10,
#' showtime = FALSE, verbose = TRUE)
#'
#' @param allexprsdfs A list of data frames where the first element is the main
#' expression data frame and the second element contains the names of the
#' expressed transcripts (see 'averageandfilterexprs').
#' @param expdf A data frame containing experiment data that should have
#' columns named 'condition', 'replicate', 'strand', and 'path'.
#' @param nbcpu An integer specifying the number of CPU cores to use for
#' parallel computation. Default is \code{1}.
#' @param rounding An integer specifying the rounding factor for computing ECDF.
#' Default is \code{10}.
#' @param showtime A logical value indicating if the duration of the function
#' processing should be indicated before ending. Defaults to
#' \code{FALSE}.
#' @param verbose A logical flag indicating whether to print progress messages.
#' Default is \code{TRUE}.
#'
#' @return A list containing two elements:
#' \item{concatdf}{A data frame with ECDF results for each transcript.}
#' \item{nbrows}{An integer indicating the number of rows in each transcript
#' table.}
#'
#' @details
#' The function performs several steps:
#' \enumerate{
#' \item Filters the main expression table to retain only the expressed
#' transcripts.
#' \item Splits the data by each transcript.
#' \item For each transcript, computes ECDF values for the score columns
#' while respecting the strand orientation ("plus" or "minus").
#' \item Combines the ECDF results into a final data frame.
#' }
#'
#' The function uses parallel processing to compute ECDF for each transcript
#' simultaneously, making it faster on systems with multiple CPU cores.
#'
#' @examples
#' exppath <- system.file("extdata", "exptab.csv", package="tepr")
#' transpath <- system.file("extdata", "cugusi_6.tsv", package="tepr")
#' expthres <- 0.1
#'
#' ## Calculating averageandfilterexprs and countNA to call genesECDF
#' expdf <- read.csv(exppath)
#' transdf <- read.delim(transpath, header = FALSE)
#' avfilttest <- averageandfilterexprs(expdf, transdf, expthres,
#' showtime = FALSE, verbose = FALSE)
#' countnatest <- countna(avfilttest, expdf, nbcpu = 1, verbose = FALSE)
#'
#' ## Testing genesECDF
#' resecdf <- genesECDF(avfilttest, expdf, verbose = FALSE)
#'
#' @importFrom parallel mclapply
#' @importFrom dplyr bind_rows
#' @importFrom tidyr fill pivot_wider
#' @importFrom tidyselect contains
#' @importFrom rlang .data
#' @importFrom magrittr %>%
#' @importFrom stats ecdf
#'
#' @seealso
#' [averageandfilterexprs]
#'
#' @export
genesECDF <- function(allexprsdfs, expdf, nbcpu = 1, rounding = 10, # nolint
showtime = FALSE, verbose = TRUE) {
if (showtime) start_time <- Sys.time()
if (verbose) message("\n\t ## Computing ecdf")
## Defining variables
maintable <- allexprsdfs[[1]]
exprstransnames <- allexprsdfs[[2]]
maincolnamevec <- colnames(maintable)
## Filtering the main table to keep only the expressed transcripts
if (verbose) message("\t Filtering to keep only the expressed transcripts") # nolint
idx <- match(maintable$transcript, exprstransnames)
idxnoexpr <- which(is.na(idx))
if (isTRUE(all.equal(length(idxnoexpr), 0))) {
if (verbose)
warning("All the transcripts are expressed", immediate. = TRUE) # nolint
} else {
maintable <- maintable[-idxnoexpr, ]
}
## Splitting the table by each transcript to perform transcript specific
## operations
if (verbose) message("\t Splitting the table by each transcript") # nolint
transdflist <- split(maintable, factor(maintable$transcript))
nbrows <- unique(sapply(transdflist, nrow)) ## all transcripts have the same number of windows, no need to calculate it each time # nolint
.checkunique(nbrows, "nbrows")
## Computing ecdf on each transcript
if (verbose) message("\t Computing ecdf on each transcript")
ecdflist <- parallel::mclapply(transdflist, function(transtable, expdf,
rounding, nbrows, maincolnamevec) {
res <- .computeecdf(transtable, expdf, rounding, nbrows)
return(res)
}, expdf, rounding, nbrows, maincolnamevec, mc.cores = nbcpu)
concatdf <- dplyr::bind_rows(ecdflist)
if (showtime) {
end_time <- Sys.time()
timing <- end_time - start_time
message("\t\t -- Analysis performed in: ", format(timing, digits = 2))
}
finalres <- list(concatdf, nbrows)
return(finalres)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.