#' @name summary_reads
#' @title Summarise the reads for indel and GC content and produce the read depth plot
#' @description Still in dev, but work nicely.
#' Summarise the reads for indel and GC content and produce the read depth plot
#'
#' @param fq.files (character, path) The path to the individual fastq file to check,
#' or the entire fq folder.
#' @param output.dir (path) Write the output in a specific directory.
#' Default: \code{output.dir = NULL}, uses the working directory.
#' @param read.depth.plot (logical) To get the interesting summaries on
#' the read content, the function is very close to similar to
#' \code{\link{read_depth_plot}}, so you can produce it for each sample with
#' minimal cost on time.
#' Default: \code{read.depth.plot = TRUE}.
#' @inheritParams read_depth_plot
#' @rdname summary_reads
#' @export
#' @return The function returns the read depth groups plot and the read stats overall
#' and by read depth groups.
#' @examples
#' \dontrun{
#' require(vroom)
#' sum <- summary_reads(fq.files = "my_fq_folder")
#' }
summary_reads <- function(
fq.files,
output.dir = NULL,
read.depth.plot = TRUE,
min.coverage.fig = 7L,
parallel.core = parallel::detectCores() - 1
) {
opt.change <- getOption("width")
options(width = 70)
cat("#######################################################################\n")
cat("####################### stackr::summary_reads #########################\n")
cat("#######################################################################\n")
timing <- proc.time()
if (!"vroom" %in% utils::installed.packages()[,"Package"]) {
rlang::abort('Please install vroom for this option:\n
install.packages("vroom")')
}
if (assertthat::is.string(fq.files) && assertthat::is.dir(fq.files)) {
fq.files <- stackr::list_sample_file(f = fq.files, full.path = TRUE, recursive = TRUE)
message("Analysing ", length(fq.files), " samples...")
}
if (length(fq.files) > 1) {
future::plan(future::multisession, workers = parallel.core)
p <- progressr::progressor(steps = length(fq.files))
res <- furrr::future_map(
.x = fq.files,
.f = reads_stats,
read.depth.plot = read.depth.plot,
min.coverage.fig = min.coverage.fig,
output.dir = output.dir,
parallel.core = parallel.core,
p = p,
verbose = FALSE,
.progress = FALSE
)
} else {
res <- reads_stats(
fq.files = fq.files,
read.depth.plot = read.depth.plot,
min.coverage.fig = min.coverage.fig,
output.dir = output.dir,
parallel.core = parallel.core,
verbose = TRUE
)
}
timing <- proc.time() - timing
options(width = opt.change)
message("\nComputation time: ", round(timing[[3]]), " sec")
cat("############################## completed ##############################\n")
return(res)
}#summary_reads
# Internal function required ---------------------------------------------------
#' @title reads_stats
#' @description Main function to summarise the reads
#' @rdname reads_stats
#' @export
#' @keywords internal
reads_stats <- function(
fq.files,
read.depth.plot = TRUE,
min.coverage.fig = 7L,
output.dir = NULL,
parallel.core = parallel::detectCores() - 1,
p,
verbose = TRUE
) {
p()
clean.names <- stackr::clean_fq_filename(basename(fq.files))
if (verbose) message("Sample name: ", clean.names)
read.stats <- vroom::vroom(
file = fq.files,
col_names = "READS",
col_types = "c",
delim = "\t",
num_threads = parallel.core,
progress = TRUE
) %>%
dplyr::mutate(
INFO = seq.int(from = 1L, to = n()),
SEQ = rep(1:4, n() / 4)
) %>%
dplyr::filter(SEQ == 2L)
total.sequences <- nrow(read.stats)
if (verbose) message("Number of reads: ", total.sequences)
# stats ----------------------------------------------------------------------
if (verbose) message("Calculating reads stats...")
read.stats %<>% dplyr::count(READS, name = "DEPTH")
# detect indel and / or low quality reads...----------------------------------
# Number of N
# GC-content/ratio (or guanine-cytosine content), proportion of nitrogenous bases in a DNA
# DNA with low GC-content is less stable than DNA with high GC-content
indel <- read.stats %>%
dplyr::mutate(
LENGTH = stringi::stri_length(str = READS),
N = stringi::stri_count_fixed(str = READS, pattern = "N"),
N_PROP = round(N / LENGTH, 2),
GC = stringi::stri_count_fixed(str = READS, pattern = "C") +
stringi::stri_count_fixed(str = READS, pattern = "G"),
GC_PROP = round(GC / LENGTH, 2)
)
indel.stats.by.depth.group <- stats_stackr(data = indel, x = "N", group.by = "DEPTH")
gc.ratio.by.depth.group <- stats_stackr(data = indel, x = "GC_PROP", group.by = "DEPTH")
stats.overall <- dplyr::bind_rows(
stats_stackr(data = indel, x = "N") %>% dplyr::mutate(GROUP = "INDEL", .before = 1L),
stats_stackr(data = indel, x = "GC_PROP") %>% dplyr::mutate(GROUP = "GC", .before = 1L)
) %>%
tibble::add_column(.data = ., INDIVIDUALS = clean.names, .before = 1L) %>%
tibble::add_column(.data = ., TOTAL_READS = total.sequences, .before = 2L)
if (!is.null(output.dir)) {
vroom::vroom_write(x = stats.overall, path = file.path(output.dir, paste0(clean.names, "_stats.overall.tsv")))
}
read.stats %<>% dplyr::count(DEPTH, name = "NUMBER_DISTINCT_READS")
depth.group.levels <- c("low coverage", "target", "high coverage", "distinct reads")
max.coverage.fig <- min(read.stats$DEPTH[read.stats$NUMBER_DISTINCT_READS == 1L]) - 1
read.stats %<>%
dplyr::mutate(
DEPTH_GROUP = dplyr::case_when(
NUMBER_DISTINCT_READS == 1L ~ "distinct reads",
DEPTH < min.coverage.fig ~ "low coverage",
DEPTH >= min.coverage.fig & DEPTH <= max.coverage.fig ~ "target",
DEPTH > max.coverage.fig ~ "high coverage"
),
DEPTH_GROUP = factor(x = DEPTH_GROUP, levels = depth.group.levels, ordered = TRUE)
)
distinct.sequences <- total.number.distinct.reads <- sum(read.stats$NUMBER_DISTINCT_READS)
# read_depth_plot ------------------------------------------------------------
if (read.depth.plot) {
color.tibble <- tibble::tibble(
DEPTH_GROUP = c("low coverage", "target", "high coverage", "distinct reads"),
LABELS = c("low coverage", paste0("target [", min.coverage.fig, " - ", max.coverage.fig, "]"), "high coverage > 1 reads", "high coverage, unique reads"),
GROUP_COLOR = c("red", "green", "yellow", "orange")
) %>%
dplyr::mutate(
DEPTH_GROUP = factor(x = DEPTH_GROUP, levels = depth.group.levels, ordered = TRUE)
)
hap.read.depth.group.stats <- read.stats %>%
dplyr::mutate(DISTINCT_READS_DEPTH = DEPTH * NUMBER_DISTINCT_READS) %>%
dplyr::group_by(DEPTH_GROUP) %>%
dplyr::summarise(
NUMBER_READS_PROP = round((sum(DISTINCT_READS_DEPTH) / total.sequences), 4),
.groups = "drop"
) %>%
dplyr::left_join(color.tibble, by = "DEPTH_GROUP") %>%
dplyr::mutate(
LABELS = stringi::stri_join(LABELS, " (", as.character(format(NUMBER_READS_PROP, scientific = FALSE)), ")")
)
base_breaks <- function(n = 10){
function(x) {
grDevices::axisTicks(log10(range(x, na.rm = TRUE)), log = TRUE, n = n)
}
}
read.depth.plot <- ggplot2::ggplot(
data = read.stats, ggplot2::aes(x = DEPTH, y = NUMBER_DISTINCT_READS)
) +
ggplot2::geom_point(ggplot2::aes(colour = DEPTH_GROUP)) +
ggplot2::labs(
title = paste0("Read Depth Groups for sample: ", clean.names),
subtitle = paste0("Total reads: ", total.sequences),
x = "Depth of sequencing (log10)",
y = "Number of distinct reads (log10)"
) +
ggplot2::annotation_logticks() +
ggplot2::scale_colour_manual(
name = "Read coverage groups",
labels = hap.read.depth.group.stats$LABELS,
values = hap.read.depth.group.stats$GROUP_COLOR
) +
ggplot2::scale_x_log10(breaks = c(1, 5, 10, 25, 50, 75, 100, 250, 500, 1000)) +
ggplot2::scale_y_log10(breaks = base_breaks()) +
ggplot2::theme(
axis.title = ggplot2::element_text(size = 16, face = "bold"),
legend.title = ggplot2::element_text(size = 16, face = "bold"),
legend.text = ggplot2::element_text(size = 16, face = "bold"),
legend.position = c(0.7,0.8)
)
if (!is.null(output.dir)) {
ggplot2::ggsave(
plot = read.depth.plot,
filename = file.path(output.dir, paste0(clean.names, "_hap_read_depth.png")),
width = 25,
height = 15,
dpi = 300,
units = "cm"
)
}
} else {
read.depth.plot <- NULL
}
# results ------------------------------------------------------------------
read.stats %<>%
dplyr::left_join(indel.stats.by.depth.group, by = "DEPTH") %>%
dplyr::left_join(gc.ratio.by.depth.group, by = "DEPTH", suffix = c("_INDEL", "_GC")) %>%
tibble::add_column(.data = ., INDIVIDUALS = clean.names, .before = 1L) %>%
tibble::add_column(.data = ., TOTAL_READS = total.sequences, .before = 2L)
if (!is.null(output.dir)) {
vroom::vroom_write(x = read.stats, path = file.path(output.dir,paste0(clean.names, "_stats.by.depth.groups.tsv")))
}
return(list(overall = stats.overall, by.read.depth.groups = read.stats, plot = read.depth.plot))
}# End reads_stats
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.