R/plotFastqcPCA.R
In ngsReports: Load FastqQC reports and other NGS related files

#' @title Draw a PCA plot for Fast QC modules
#'
#' @description Draw a PCA plot for Fast QC modules across multiple samples
#' \lifecycle{experimental}
#'
#' @details
#' This carries out PCA on all or a subset of FastQC modules and plots the
#' output using either ggplot or plotly. Clustering of the PCA can be carried
#' out using a hierarchical clustering approach. Current modules for PCA are
#' Per_base_sequence_quality, Per_sequence_quality_scores,
#' Per_sequence_GC_content, Per_base_sequence_content, and
#' Sequence_Length_Distribution.

#'
#' @param x Can be a \code{FastqcData}, \code{FastqcDataList} or file paths
#' @param module \code{character} vector containing
#'  the desired FastQC module (eg. c("Per_base_sequence_quality",
#'  "Per_base_sequence_content"))
#' @param usePlotly \code{logical}. Output as ggplot2 (default) or plotly
#' object.
#' @param labels An optional named vector of labels for the file names.
#' All filenames must be present in the names.
#' File extensions are dropped by default
#' @param cluster \code{logical} default \code{FALSE}. If \code{groups} argument
#' is not set fastqc data will be clustered using hierarchical clustering.
#' @param clusterType One of "color/colour" or "hulls". Default is "colours"
#' and will colour points based on cluster/group, "hulls" will draw a polygon
#' around each cluster.
#' @param groups Optional data.frame (or tibble) with columns \code{Filename}
#' and \code{Group}. Values in the Filename column should correspond to the
#' values returned by fqName(x). If not supplied and \code{cluster = TRUE},
#' clusters will be automatically generated using HCPC from FactoMiner
#' @param ... Used to pass additional attributes to theme() and between methods
#'
#' @return A standard ggplot2 object, or an interactive plotly object
#'
#' @docType methods
#'
#' @importFrom dplyr left_join
#' @importFrom dplyr group_by
#' @importFrom dplyr ungroup
#' @importFrom FactoMineR HCPC
#' @importFrom FactoMineR PCA
#' @importFrom grDevices chull
#' @importFrom reshape2 dcast
#' @importFrom stats kmeans
#' @importFrom stats prcomp
#' @import ggplot2
#' @import tibble
#'
#' @examples
#'
#' # Get the files included with the package
#' packageDir <- system.file("extdata", package = "ngsReports")
#' fl <- list.files(packageDir, pattern = "fastqc.zip", full.names = TRUE)
#'
#' # Load the FASTQC data as a FastqcDataList object
#' fdl <- FastqcDataList(fl)
#' plotFastqcPCA(fdl, module = "Per_sequence_quality_scores", cluster = TRUE)
#'
#'
#' @name plotFastqcPCA
#' @rdname plotFastqcPCA-methods
#' @export
setGeneric("plotFastqcPCA", function(
    x, module, usePlotly = FALSE, labels, cluster = FALSE,
    clusterType = "colour", groups = NULL, ...){
    standardGeneric("plotFastqcPCA")
}
)
#' @rdname plotFastqcPCA-methods
#' @export
setMethod("plotFastqcPCA", signature = "ANY", function(
    x, module, usePlotly = FALSE, labels, cluster = FALSE,
    clusterType = "colour", groups = NULL, ...){
    .errNotImp(x)
}
)
#' @rdname plotFastqcPCA-methods
#' @export
setMethod("plotFastqcPCA", signature = "character", function(
    x, module, usePlotly = FALSE, labels, cluster = FALSE,
    clusterType = "colour", groups = NULL, ...){
    x <- FastqcDataList(x)
    if (length(x) == 1) x <- x[[1]]
    plotFastqcPCA(
        x, module, usePlotly, labels, cluster, clusterType, groups, ...
    )
}
)
#' @rdname plotFastqcPCA-methods
#' @export
setMethod("plotFastqcPCA", signature = "FastqcDataList", function(
    x, module, usePlotly = FALSE, labels, cluster = FALSE,
    clusterType = "colour", groups = NULL, ...){

    stopifnot(!missing(module))
    module <- match.arg(
        module, c(
            "Per_base_sequence_quality",
            "Per_sequence_quality_scores",
            "Per_sequence_GC_content",
            "Per_base_sequence_content",
            "Sequence_Length_Distribution"
        )
    )

    ## Set dummy variables to avoid R CMD check notes
    Dim.1 <- Dim.2 <- Cluster <- c()

    ## Get any arguments for dotArgs that have been set manually
    dotArgs <- list(...)
    allowed <- names(formals(theme))
    keepArgs <- which(names(dotArgs) %in% allowed)
    userTheme <- c()
    if (length(keepArgs) > 0) userTheme <- do.call(theme, dotArgs[keepArgs])

    ## Check for any point size arguments
    sz <- 0.7
    if ("size" %in% names(dotArgs)) sz <- dotArgs[["size"]]

    pFun <- paste0(".generate", module, "PCA")
    args <- list(x = x)

    df <- do.call(pFun, args)

    pca <- PCA(df, scale.unit = TRUE, ncp = 2, graph = FALSE)
    variance <- round(pca$eig[,2][seq_len(2)], 2)

    data <- as.data.frame(pca$ind$coord)

    if (cluster) {

        clusterType <- match.arg(clusterType, c("colour", "color", "hulls"))
        clusterType <- stringr::str_replace(clusterType, "color", "colour")

        ### with factoMineR
        set.seed(1)
        cluster <- HCPC(
            pca, nb.clust = 0, consol = 0, min = 2, max = 10, graph = FALSE
        )
        cluster <- cluster$call$X
        nClust <- max(as.integer(as.character(cluster[["clust"]])))
        cluster <- cluster[c("Dim.1", "Dim.2", "clust")]
        colnames(cluster) <- c("Dim.1", "Dim.2", "Cluster")

        data <- rownames_to_column(cluster, "Filename")

        if (!is.null(groups)) {

            ## Cluster using the groups data.frame
            stopifnot(c("Filename", "Group") %in% colnames(groups))
            stopifnot(all(data$Filename %in% groups$Filename))
            data <- left_join(data, groups, by = "Filename")
            data <- dplyr::rename(data, "Cluster" = "Group")

        }

        #data <- left_join(clusterDF, scores, by = "Filename")
        data$PCAkey <- data$Filename
        labels <- .makeLabels(dplyr::distinct(data, Filename), labels, ...)
        data$Filename <- labels[data$Filename]
        clust <- c()
        data$Cluster <- as.character(data$Cluster)
        ## get convex edges

        PCA <- ggplot() +
            geom_hline(yintercept = 0, colour = "darkgrey") +
            geom_vline(xintercept = 0, colour = "darkgrey") +
            theme_bw() +
            theme(
                panel.background = element_blank()
            ) +
            labs(
                x = paste0("PC1 (", variance[1], "%)"),
                y = paste0("PC2 (", variance[2], "%)")
            )

        if (clusterType == "hulls") {

            hulls <- group_by(data, Cluster)
            hulls <- dplyr::slice(hulls, chull(Dim.1, Dim.2))
            hulls <- ungroup(hulls)
            hulls$Cluster <-
                factor(hulls$Cluster, levels = unique(hulls$Cluster))

            PCA <- PCA +
                geom_point(
                    data = data,
                    aes_string(x = "Dim.1", y = "Dim.2",group = "Filename"),
                    size = sz
                ) +
                geom_polygon(
                    data = hulls,
                    aes_string(x = "Dim.1", y = "Dim.2", fill = "Cluster"),
                    alpha = 0.4
                )

        }
        if (clusterType == "colour") {

            PCA <- PCA +
                geom_point(
                    data = data,
                    aes_string(
                        "Dim.1", "Dim.2", group = "Filename", colour = "Cluster"
                    ),
                    size = sz
                )

        }

        if (!is.null(userTheme)) PCA <- PCA + userTheme

        if (usePlotly) {
            PCA <- plotly::ggplotly(PCA)

            if (clusterType == "hulls") {
                s <- split(data, data$Cluster)
                ## tidy up the html output for the interactive plot
                PCA$x$data[seq_len(nClust) + 1] <- lapply(
                    seq_len(nClust),
                    function(j){
                        names <- s[[j]]$Filename
                        names <- paste(names, collapse = "<br>")
                        PCA$x$data[[j + 1]]$text <- names
                        ## add key
                        PCA$x$data[[j + 1]]
                    }
                )
            }
        }
    }
    else{
        data <- rownames_to_column(data, "Filename")

        PCA <- ggplot() +
            geom_hline(yintercept = 0, colour = "darkgrey") +
            geom_vline(xintercept = 0, colour = "darkgrey") +
            theme_bw() +
            theme(panel.background = element_blank()) +
            labs(
                x = paste0("PC1 (", variance[1], "%)"),
                y = paste0("PC2 (", variance[2], "%)")
            ) +
            geom_point(
                data = data,
                aes_string(x = "Dim.1", y = "Dim.2", group = "Filename")
            )


        if (!is.null(userTheme)) PCA <- PCA + userTheme

        if (usePlotly) {
            PCA <- plotly::ggplotly(PCA)
        }}

    PCA
}

)


.generatePer_base_sequence_qualityPCA <- function(x){

    df <- getModule(x, "Per_base_sequence_quality")
    df$Start <- as.integer(gsub("([0-9]*)-[0-9]*", "\\1", df$Base))

    ## Adjust the data for files with varying read lengths
    ## This will fill NA values with the previous values
    df <- lapply(
        split(df, f = df$Filename),
        function(y){
            Longest_sequence <-
                gsub(".*-([0-9]*)", "\\1", as.character(y$Base))
            Longest_sequence <- max(as.integer(Longest_sequence))
            dfFill <- data.frame(Start = seq_len(Longest_sequence))
            y <- dplyr::right_join(y, dfFill, by = "Start")
            na.locf(y)
        }
    )

    df <- dplyr::bind_rows(df)[c("Filename", "Start", "Mean")]
    df <- tidyr::spread(df, "Start", "Mean", fill = 0)
    df <- as.data.frame(df)
    df <- column_to_rownames(df, "Filename")
    df
}

.generatePer_sequence_quality_scoresPCA <- function(x){

    df <- getModule(x, "Per_sequence_quality_scores")
    df <- tidyr::spread(df, "Quality", "Count", fill = 0)
    df <- as.data.frame(df)
    df <- column_to_rownames(df, "Filename")
    df
}


.generatePer_sequence_GC_contentPCA <- function(x){

    df <- getModule(x, "Per_sequence_GC_content")
    df <- tidyr::spread(df, "GC_Content", "Count", fill = 0)
    df <- as.data.frame(df)
    df <- column_to_rownames(df, "Filename")
    df
}


.generatePer_base_sequence_contentPCA <- function(x){

    df <- getModule(x, "Per_base_sequence_content")
    df$Start <- as.integer(gsub("([0-9]*)-[0-9]*", "\\1", df$Base))

    ## Adjust the data for files with varying read lengths
    ## This will fill NA values with the previous values
    df <- lapply(split(df, f = df$Filename), function(y){
        Longest_sequence <-
            gsub(".*-([0-9]*)", "\\1", as.character(y$Base))
        Longest_sequence <- max(as.integer(Longest_sequence))
        dfFill <- data.frame(Start = seq_len(Longest_sequence))
        y <- dplyr::right_join(y, dfFill, by = "Start")
        na.locf(y)
    })
    df <- dplyr::bind_rows(df)[c("Filename", "Start", "G", "A", "T", "C")]

    perBaseList <- lapply(c("G", "A", "T", "C"), function(y){
        spreadDF <- df[c("Filename", "Start", y)]

        spreadDF <- tidyr::spread(spreadDF, "Start", y, fill = 0)

        colnames(spreadDF)[2:ncol(spreadDF)] <-
            paste0(colnames(spreadDF)[2:ncol(spreadDF)], ".", y)
        spreadDF
    })

    ## i and j are functional variables spawned from perBaseList in order to
    ## join each base by filename
    df <- Reduce(function(i,j) dplyr::left_join(i,j,by="Filename"), perBaseList)
    df <- column_to_rownames(df, "Filename")
}

.generateSequence_Length_DistributionPCA <- function(x){

    df <- getModule(x, "Sequence_Length_Distribution")
    df <- lapply(split(df, f = df$Filename), function(y){
        Longest_sequence <-
            gsub(".*-([0-9]*)", "\\1", as.character(y$Length))
        Longest_sequence <- max(as.integer(Longest_sequence))
        dfFill <- data.frame(Lower = seq_len(Longest_sequence))
        y <- dplyr::right_join(y, dfFill, by = "Lower")
        na.locf(y)
    })
    df <- dplyr::bind_rows(df)[c("Filename", "Lower", "Count")]
    df <- tidyr::spread(df, "Lower", "Count", fill = 0)
    df <- column_to_rownames(df, "Filename")
    df
}

Any scripts or data that you put into this service are public.

ngsReports documentation built on Nov. 23, 2020, 2:01 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

ngsReports
Load FastqQC reports and other NGS related files

R/plotFastqcPCA.R
In ngsReports: Load FastqQC reports and other NGS related files

Defines functions .generateSequence_Length_DistributionPCA .generatePer_base_sequence_contentPCA .generatePer_sequence_GC_contentPCA .generatePer_sequence_quality_scoresPCA .generatePer_base_sequence_qualityPCA

Try the ngsReports package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ngsReports Load FastqQC reports and other NGS related files

R/plotFastqcPCA.R In ngsReports: Load FastqQC reports and other NGS related files

Defines functions .generateSequence_Length_DistributionPCA .generatePer_base_sequence_contentPCA .generatePer_sequence_GC_contentPCA .generatePer_sequence_quality_scoresPCA .generatePer_base_sequence_qualityPCA

Try the ngsReports package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ngsReports
Load FastqQC reports and other NGS related files

R/plotFastqcPCA.R
In ngsReports: Load FastqQC reports and other NGS related files