R/plots_comparison.R

Defines functions build.comparison.table comparison.stats.raw comparison.cvg plot_primer.comparison.box plot_primer.comparison.mismatches prepare.constraint.plot get_comparison_table

Documented in get_comparison_table

#' @rdname AnalysisStats
#' @name AnalysisStats
#' @return \code{get_comparison_table} returns a data frame summarizing
#' the properties of the provided primer data sets.
#' @export
#' @examples
#' 
#' # Summarize the properties of multiple primer sets
#' data(Comparison)
#' tab <- get_comparison_table(template.data[1:3], primer.data[1:3], "IGH")
get_comparison_table <- function(templates, primers, sample.name = NULL) {
    if (is(templates, "Templates") && is(primers, "Primers")) {
        templates <- list(templates)
        primers <- list(primers)
    } 
    if (!is(templates, "list") || !is(primers, "list")) {
        stop("'templates' and 'primers' should be lists.")
    }
    if (length(templates) != 0) {
        if (length(sample.name) == 0) {
            sample.name <- sapply(templates, function(x) unique(x$Run))
        } else if (length(sample.name) == 1) {
            sample.name <- rep(sample.name, length(templates))
        } else if (length(sample.name) != length(templates)) {
            stop("Incorrect number of sample names provided.")
        }
        if (length(templates) != length(primers)) {
            stop("Please ensure that you provide the same number of templates as primers.")
        }
    }
    template.classes <- sapply(templates, function(x) class(x))
    if (any(template.classes != "Templates")) {
        stop("Input templates should be a list with objects ",
             "of class 'Templates'.")

    }
    primer.classes <- sapply(primers, function(x) class(x))
    if (any(primer.classes != "Primers")) {
        stop("Input primers should be a list with objects ",
             "of class 'Primers'.")
    }
    all.results <- vector("list", length(primers))
    for (j in seq_along(primers)) {
        primer.df <- primers[[j]]
        lex.seq <- templates[[j]]
        set.name <- unique(primer.df$Run)
        mode.directionality <- get.analysis.mode(primer.df)
        set.stats <- primer.set.parameter.stats(primer.df, mode.directionality, lex.seq)
        if (length(set.stats) != 0) {
            cur.result <- data.frame(Primers = set.name, Templates = sample.name[j], stringsAsFactors = FALSE)
            set.stats <- cbind(cur.result, set.stats, row.names = NULL, stringsAsFactors = FALSE)
        }
        all.results[[j]] <- set.stats
    }
    sample.result <- do.call(rbind.fill, all.results)
    # sort sample result by cvg
    if ("Coverage" %in% colnames(sample.result)) {
        hits <- regexpr("[0-9\\.]+%", sample.result$Coverage)
        cvg <- regmatches(sample.result$Coverage, hits)
        cvg <- substr(cvg, 1, nchar(cvg) - 1)
        o <- order(as.numeric(cvg), decreasing = TRUE)
    }
    if (length(sample.result) != 0) {
        # nice labels for columns without underscores:
        colnames(sample.result) <- gsub("_", " ", colnames(sample.result))
    }
    return(sample.result)
}
#' Preparation of Comparison Plot for Evaluation.
#'
#' @param primer.data List with objects of class \code{Primers}. Each 
#' list entry corresponds to a single primer set.
#' @param constraint.settings List with settings for each constraint.
#' If \code{NULL} (the default), use the available evaluation results
#' in \code{primer.data}.
#' @param plot.p.vals Whether p-values from Fisher's exact test
#' should be annotated for every primer set.
#' @return Plot indicating the ratio of primers
#' fulfilling the constraints specified in \code{constraint.settings}
#' for each primer set in \code{primer.data}. 
#' @keywords internal
prepare.constraint.plot <- function(primer.data, 
        constraint.settings, plot.p.vals = FALSE) {
    if (length(primer.data) == 0) {
        return(NULL)
    }
    if (length(constraint.settings) == 0) {
        return(NULL)
    }
    # check type of primers
    primer.classes <- sapply(primer.data, function(x) class(x))
    if (any(primer.classes != "Primers")) {
        stop("Input primers should be lists with objects ",
             "of class 'Primers' ...")
    }
    # determine available columns of the primer sets
    eval.cols <- get.eval.cols(primer.data, constraint.settings)
    if (length(eval.cols) == 0) {
        # nothing to plot
        return(NULL)
    }
    if (length(constraint.settings) != 0) {
        # re-evaluate primer.data with the available input constraints
        eval.names <- gsub("^EVAL_", "", eval.cols)
        primer.data <- eval.comparison.primers(primer.data, constraint.settings[names(constraint.settings) %in% eval.names])
    } 
    # determine frequency of fulfilled constraints
    new.df <- NULL
    p.vals <- rep(NA, length(primer.data))
    for (i in seq_along(primer.data)) {
        primer.df <- asS3(primer.data[[i]])
        if (nrow(primer.df) != 0) {
            if (plot.p.vals) {
                # compute p-vals
                p.data <- primer_significance(primer.df, active.constraints = names(constraint.settings))
                if (length(p.data) != 0) {
                    p.val <- as.numeric(p.data)
                } else {
                    p.val <- NA
                }
                p.vals[i] <- p.val
            } else {
                p.val <- NA
            }
            eval.df <- primer.df[, eval.cols, drop = FALSE]
            if (nrow(eval.df) != 0) {
                df <- data.frame(t(apply(eval.df, 2, function(x) length(which(x))/nrow(eval.df))))
                df <- cbind(Run = unique(primer.df$Run), P_value = p.val, df)
                m.df <- melt(df, c("Run", "P_value"), variable.name = "Constraint", value.name = "Ratio")
            } else {
                m.df <- NULL
            }
            new.df <- rbind(new.df, m.df)
        }
    }
    if (length(new.df) != 0 && length(p.vals) != 0) {
        # adjust/assign p-vals for multiple hypothesis testing
        m <- match(new.df$P_value, p.vals) 
        p.vals <- p.adjust(p.vals, method = "bonf")
        new.df$P_value <- p.vals[m]
    }
    # re-name constraints
    if (length(new.df) != 0) {
        new.df$Constraint <- gsub("EVAL_", "", new.df$Constraint)
        levels <- unique(new.df$Constraint)[order(as.character(unique(new.df$Constraint)))]
        new.df$Constraint <- factor(new.df$Constraint, levels = levels)
        # order Runs by name
        new.df$Run <- factor(new.df$Run, levels = unique(new.df$Run)[order(as.character(unique(new.df$Run)))])
        if (plot.p.vals) {
            p.rep <- format(new.df$P_value, scientific = TRUE, digits = 2)
            new.df$Run <- paste(new.df$Run, " (p-val: ", p.rep, ")", sep = "")
        }
    }
    return(new.df)
}
#' Plot Primer Mismatches
#'
#' Plots primer mismatches for every set.
#'
#' @param primer.data List with primer data frames.
#' @param template.data List with template data frames.
#' @param allowed.mismatches Allowed mismatches.
#' @param highlight.set Primer sets to highlight in the plot.
#' @return Plot of mismatches for comparison.
#' @keywords internal
plot_primer.comparison.mismatches <- function(primer.data, template.data, allowed.mismatches,
        highlight.set = NULL) {
    con.cols <- list("Nbr_of_mismatches_fw", "Nbr_of_mismatches_rev")
    names(con.cols) <- unlist(con.cols)
    con.identifier <- "Number of mismatches"
    new.data <- primer.data
    for (i in seq_along(primer.data)) {
        primer.df <- primer.data[[i]]
        if (any(!"Nbr_of_mismatches_fw" %in% colnames(primer.df))) {
            return(NULL)
        }
        mm.fw <- strsplit(as.character(primer.df$Nbr_of_mismatches_fw), split = ",")
        mm.rev <- strsplit(as.character(primer.df$Nbr_of_mismatches_rev), split = ",")
        r.fw <- do.call(rbind, lapply(seq_along(mm.fw), function(x) primer.df[rep(x, 
            length(mm.fw[[x]])), ]))
        if (length(r.fw) != 0) {
            r.fw$Nbr_of_mismatches_fw <- as.numeric(unlist(mm.fw))
            r.fw$Nbr_of_mismatches_rev <- rep(NA, nrow(r.fw))
        }
        r.rev <- do.call(rbind, lapply(seq_along(mm.rev), function(x) primer.df[rep(x, 
            length(mm.rev[[x]])), ]))
        if (length(r.rev) != 0) {
            r.rev$Nbr_of_mismatches_rev <- as.numeric(unlist(mm.rev))
            r.rev$Nbr_of_mismatches_fw <- rep(NA, nrow(r.rev))
        }
        df <- rbind(r.fw, r.rev)
        if (length(df) != 0 && nrow(df) != 0) {
            new.data[[i]] <- df
        } else {
            # data frame was empty
            new.data[[i]] <- primer.df
        }
    }
    boundaries <- list("Nbr_of_mismatches_fw" = allowed.mismatches,
                        "Nbr_of_mismatches_rev" = allowed.mismatches)
    p <- plot_primer.comparison.box(new.data, con.identifier, con.cols, 
        boundaries, highlight.set = highlight.set)
    return(p)
}

#' Boxplot for Primer Comparison
#'
#' Constructs a box plot showing constraint values for each primer set.
#'
#' @param primer.data List with primer data frames.
#' @param con.identifier Identifier of constraint to be plotted.
#' @param con.cols Column names with the constraint values in the primer data frames.
#' @param boundaries List with constraint settings.
#' @param y.limits Limits for the extent of the y-axis.
#' @param show.points If \code{TRUE} (the default), individual data points
#' are visualized in the boxplot, otherwise they are not shown.
#' @param highlight.set The identifier of a primer set to highlight in the plot.
#' @param nfacets A numeric providing the number of facet columns to show.
#' By default \code{nfacets} is \code{NULL} such that the number of 
#' facet columns is chosen automatically.
#' @return A boxplot for primer comparison.
#' @keywords internal
plot_primer.comparison.box <- function(primer.data,
    con.identifier, con.cols, boundaries, y.limits = NULL, 
    show.points = TRUE, highlight.set = NULL, nfacets = NULL) {

    if (length(primer.data) == 0) {
        return(NULL)
    }
    if (length(con.cols) == 0) {
        # no constraints to be plotted according to 'con.cols'
        return(NULL)
    }
    #print("plot_primer.box: nbr facet cols is:")
    #print(nfacets)
    plot.df <- rbind.primer.data(primer.data)
    if (length(plot.df) == 0) {
        return(NULL)
    }
    if (any(unlist(lapply(con.cols, function(x) !x %in% colnames(plot.df))))) {
        return(NULL)
    }
    plot.df <- melt(plot.df, id.vars = c("ID", "Run", "Forward", "Reverse"), 
        measure.vars = unlist(con.cols),
        variable.name = "Constraint", 
        value.name = "Value")
    # check whether there's any variable available for plotting ...
    if (length(highlight.set) != 0) {
        # check whether highlight set is specified correctly
        m <- match(highlight.set, plot.df$Run)
        na.idx <- which(is.na(m))
        if (length(na.idx) != 0) {
            msg <- paste("Highlight set not available in data:",
                paste(highlight.set[na.idx], collapse = ","))
            warning(msg)
            highlight.set <- highlight.set[!is.na(m)]
        }
    }
    # Select only fw values for fw primers etc.
    fw.idx <- which(plot.df$Forward != "")
    rev.idx <- which(plot.df$Reverse != "")
    plot.df$Direction <- ifelse(grepl("_fw", plot.df$Constraint), "Forward", ifelse(grepl("_rev", plot.df$Constraint), "Reverse", "Overall"))
    plot.df.fw <- plot.df[plot.df$Direction == "Forward" & plot.df$ID %in% plot.df$ID[fw.idx],]
    plot.df.rev <- plot.df[plot.df$Direction == "Reverse" & plot.df$ID %in% plot.df$ID[rev.idx],]
    plot.df <- rbind(plot.df.fw, plot.df.rev, plot.df[plot.df$Direction == "Overall",])
    ########
    con.names <- unlist(lapply(plot.df$Constraint, function(x) names(con.cols)[sapply(seq_along(con.cols), function(y) any(x %in% con.cols[[y]]))]))
    con.levels <- unique(con.names)[order(unique(con.names))]
    constraint.names <- unlist(constraints_to_unit(con.levels, TRUE))
    plot.df$Constraint <- factor(con.names, levels = con.levels,
                                labels = sapply(constraint.names, deparse))
    #######
    #plot.df$Constraint <- gsub("_fw", "", plot.df$Constraint)
    #plot.df$Constraint <- gsub("_rev", "", plot.df$Constraint)
    title <- "Constraints"
    boundary.data <- NULL # store plot boundaries
    cons <- as.character(levels(plot.df$Constraint))
    #mod.names <- unlist(lapply(con.cols, function(x) gsub("_fw", "", x[1])))
    for (i in seq_along(cons)) {
        con <- cons[i] 
        con.name <- con.levels[i]
        bound <- boundaries[[con.name]]
        if (length(bound) == 0) {
            warning(paste("Constraint boundary for ", con.name, " is not available.", sep = ""))
            next
        }
        cur.bound <- data.frame(Constraint = rep(con, length(bound)),
                                Z = bound)
        boundary.data <- rbind(boundary.data, cur.bound)
    }
    plot.df$Run <- abbreviate(plot.df$Run, getOption("openPrimeR.plot_abbrev"))
    plot.df$Run <- factor(plot.df$Run, levels = unique(plot.df$Run)[order(as.character(unique(plot.df$Run)))])
    p <- ggplot(plot.df, aes(x = .data[["Run"]], y = .data[["Value"]], colour = .data[["Run"]])) + 
        theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
        ggtitle(title) + ylab("Value") +
        geom_boxplot(outlier.shape = NA) +
        guides(colour = "none") # remove legend: is redundant here
    if (length(con.identifier) > 1) {
        # multiple constraints are plotted
        p <- p + facet_wrap(as.formula("~Constraint"), 
                scales = "free_y", ncol = nfacets, 
                labeller = ggplot2::label_parsed)
    } else {
        # a single constraint is plotted
        p <- p + ylab(con.identifier[[1]])
    }
    if (show.points) {
        p <- p + geom_point(alpha = 0.25, position = position_jitter(width = 0.25, 
            height = 0))
    }
    # set colors:
    pal <- getOption("openPrimeR.plot_colors")["Run"] # the RColorBrewer palette to use
    colors <- colorRampPalette(brewer.pal(8, pal))(length(unique(plot.df$Run)))
    p <- p + scale_colour_manual(values = colors)
    if (length(boundary.data) != 0) {
        p <- p + geom_hline(data = boundary.data, aes(yintercept = .data[["Z"]]),
                    linewidth = 0.25, 
                    colour = "red", linetype = 2)
    }
    if (length(y.limits) != 0) {
        p <- p + coord_cartesian(ylim = y.limits)  # don't lose bars outside of limits
    }
    annotation.constraints <- c("cross_dimerization", "self_dimerization", "secondary_structure")
    if (length(con.cols) == 1 && names(con.cols)[1] %in% annotation.constraints) {
        # requires library grid
        if (names(con.cols)[1] %in% c("cross_dimerization", "self_dimerization")) {
            text.high <- "Dimerization\nunlikely"
            text.low <- "Dimerization\nlikely"
        } else if (names(con.cols)[1] == "secondary_structure") {
            text.high <- "Secondary structure\nunlikely"
            text.low <- "Secondary structure\nlikely"
        }
        label.col <- "grey20"
        text_high <- grid::textGrob(text.high, gp=grid::gpar(fontsize=13, fontface="bold", col = label.col ))
        text_low <- grid::textGrob(text.low, gp=grid::gpar(fontsize=13, fontface="bold", col = label.col))
        x.pos <- -1 # position on the x axis of annotation
        high.pos <- max(plot.df$Value, na.rm = TRUE) + 0.1 * min(plot.df$Value, na.rm=TRUE)
        low.pos <- min(plot.df$Value, na.rm = TRUE) - 0.1 * min(plot.df$Value, na.rm= TRUE)
        #######
        p <- p + theme(plot.margin = unit(c(1,1,1,6), "lines")) +
        annotation_custom(text_high, ymin = high.pos, ymax = high.pos, xmin=x.pos,xmax=x.pos) + 
        annotation_custom(text_low,ymin = low.pos, ymax = low.pos,xmin=x.pos,xmax=x.pos) +
        scale_x_discrete(expand = c(0.1,0.0)) +
        geom_segment(aes(x = x.pos, y = high.pos + 0.25 * min(plot.df$Value, na.rm = TRUE), xend = x.pos, 
                    yend = low.pos - 0.25 * min(plot.df$Value, na.rm = TRUE)), color = label.col, size = 1.25,
                    arrow = arrow(length = unit(0.03, "npc")))
    }
    if (length(highlight.set) != 0) {
        # highlight selected sets
        sel <- levels(plot.df$Run) %in% highlight.set
        p <- p + theme(
                axis.text.x = element_text(
                    face=ifelse(sel, "bold","plain"),
                    colour = ifelse(sel, 
                            "grey20", "grey30")))
                    #size = ifelse(sel, 
                            #x.labels.size + 1, x.labels.size)))
    }
    return(p)
}
#' Comparison Coverage Stats.
#'
#' Computes coverage stats for primer comparison.
#'
#' @param primer.data List with primer data frames.
#' @param template.data List with template data frames.
#' @return Coverage statistics for comparing primers.
#' @keywords internal
comparison.cvg <- function(primer.data, template.data) {
    info <- comparison.stats.raw(primer.data, template.data)
    if (length(info) != 0) {
        cvg <- paste(round(info$Coverage * 100, 2), "%", sep = "")
        cvg.string <- paste(info$Nbr_Primers, " primers cover ", 
                    info$Nbr_Covered, 
                    " of ", info$Nbr_Templates, " (", cvg, 
                    ") templates", sep = "")
        result <- data.frame(Run = info$Run, 
                    Coverage = round(info$Coverage, 3), 
                    Info = cvg.string, stringsAsFactors = FALSE)
        return(result)
    } else {
        return(NULL)
    }
}
#' Computation of Raw Stats for Primer Comparison
#' 
#' Computes raw stats for primer comparison.
#'
#' @param primer.data List with primer data frames.
#' @param template.data List with template data frames.
#' @return Raw statistics for primer comparison.
#' @keywords internal
comparison.stats.raw <- function(primer.data, template.data) {
    if (length(primer.data) == 0 || length(template.data) == 0 ||
        is.null(template.data[[1]])) {
        return(NULL)
    }
    run.names <- get.run.names(primer.data)
    N <- unlist(lapply(primer.data, function(x) nrow(x)))
    N.templates <- unlist(lapply(template.data, function(x) nrow(x)))
    cvg.info <-  cvg <- lapply(seq_along(primer.data), function(x) get_cvg_ratio(primer.data[[x]], 
        template.data[[x]]))
    nbr.covered <- unlist(lapply(cvg.info, function(x) attr(x, "no_covered")))
    cvg <- unlist(lapply(cvg.info, function(x) as.numeric(x)))
    o <- order(cvg, decreasing = TRUE)
    if (length(cvg) == length(run.names)) {
        # if we have enough data (primers available for each set)
        info <- data.frame(Run = run.names, Coverage = cvg, Nbr_Primers = N, Nbr_Templates = N.templates, 
            Nbr_Covered = nbr.covered, stringsAsFactors = FALSE)
        info <- info[o, ]
        return(info)
    } else {
        return(NULL)
    }
}
build.comparison.table <- function(primers, templates) {
    primers <- set.run.names(primers) # ensure that primer run names are unique
    primer.runs <- get.run.names(primers)
    template.runs <- get.run.names(templates)
    m <- which.max(c(length(primer.runs), length(template.runs)))
    if (m == 1) { # more primer sets than templates
        template.runs <- c(template.runs, rep(NA, length(primer.runs) - length(template.runs)))
    } else { # more templates than primer sets
        primer.runs <- c(primer.runs, rep(NA, length(template.runs) - length(primer.runs)))
    }
    out <- data.frame(PrimerSet = primer.runs, 
                      Templates = template.runs) 
    # numeric index is important for selection!
    rownames(out) <- seq_len(nrow(out))
    if (length(templates) != length(primers)) {
        templates <- replicate(length(primers), templates[1])
    }
    cvg.data <- comparison.cvg(primers, templates)
    my.ids <- seq_along(primers)
    if (length(cvg.data) != 0) {
        # match out to cvg.data -> sorted
        m <- match(as.character(out$PrimerSet), as.character(cvg.data$Run)) # assumes that IDs are unique!!!
        add.df <- cvg.data[m,]
        out <- cbind(out, Coverage = add.df$Coverage, Info = add.df$Info)
        # sort by highest coverage
        o <- order(out$Coverage, decreasing = TRUE)
        out <- out[o, ]
        out$Coverage <- paste0(round(out$Coverage * 100, 2), "%")
    }
    return(out)
}
matdoering/openPrimeR documentation built on Feb. 11, 2024, 9:22 p.m.