MatrixQCvis: Shiny-based interactive data-quality exploration for omics data

Documented in batchCorrectionAssay createBoxplot createDfFeature cv cvFeaturePlot distSample distShiny driftPlot ECDF featurePlot hoeffDPlot hoeffDValues imputeAssay MAplot MAvalues normalizeAssay plotCV sumDistSample transformAssay

#' @name createBoxplot
#'
#' @title Create a boxplot of (count/intensity) values per sample
#'
#' @description
#' The function \code{create_boxplot} creates a boxplot per sample for the 
#' intensity/count values.
#' 
#' @details
#' Internal usage in \code{shinyQC}.
#'
#' @param se \code{SummarizedExperiment} containing the (count/intensity) values 
#' in the \code{assay} slot
#' @param orderCategory \code{character}, one of \code{colnames(colData(se))}
#' @param title \code{character} or \code{numeric} of \code{length(1)}
#' @param log \code{logical}, if \code{TRUE} (count/intensity) values are 
#' displayed as log values
#' @param violin \code{logical}, if \code{FALSE} a boxplot is created, if 
#' \code{TRUE} a violin plot is created
#'
#' @examples
#' ## create se
#' a <- matrix(seq_len(100), nrow = 10, ncol = 10, 
#'     dimnames = list(seq_len(10), paste("sample", seq_len(10))))
#' a[c(1, 5, 8), seq_len(5)] <- NA
#' set.seed(1)
#' a <- a + rnorm(100)
#' cD <- data.frame(name = colnames(a), type = c(rep("1", 5), rep("2", 5)))
#' rD <- data.frame(spectra = rownames(a))
#' se <- SummarizedExperiment::SummarizedExperiment(assay = a, 
#'     rowData = rD, colData = cD)
#' 
#' createBoxplot(se, orderCategory = "name", title = "", log = TRUE, 
#'     violin = FALSE)
#' 
#' @return \code{gg} object from \code{ggplot2}
#' 
#' @importFrom dplyr left_join
#' @importFrom tidyr pivot_longer 
#' @importFrom tibble tibble as_tibble
#' @importFrom SummarizedExperiment assay
#' @importFrom ggplot2 ggplot aes geom_boxplot geom_violin sym
#' @importFrom ggplot2 scale_x_discrete theme element_text ggtitle xlab
#' @importFrom ggplot2 theme_classic
#' 
#' @export
createBoxplot <- function(se, orderCategory = colnames(colData(se)), 
    title = "", log = TRUE, violin = FALSE) {

    ## match arguments for order
    orderCategory <- match.arg(orderCategory)

    ## access the assay slot
    a <- SummarizedExperiment::assay(se)
    
    ## take log values if log = TRUE
    if (log) 
        if (any(a == 0, na.rm = TRUE)) {
            a <- log(a + 1)
        } else {
            a <- log(a)
        }
    
    ## access the colData slot and add the rownames as a new column to cD
    ## (will add the column "x5at1t1g161asy")
    cD <- se@colData |> as.data.frame()
    if ("x5at1t1g161asy" %in% colnames(cD))
        stop("Column name `x5at1t1g161asy` must not be duplicated")
    cD[["x5at1t1g161asy"]] <- rownames(cD)
    cD <- cD[, c("x5at1t1g161asy", orderCategory)]

    
    if (!violin) {
        ## calculate the values for the box length (this will be 
        ## substantially faster than using geom_boxplot to do the job):
        ## 
        ## the upper whisker is located at the mininum of the maximum value or 
        ## Q_3 + 1.5 IQR ( min(max(x), Q_3 + 1.5 * IQR) )
        ## the lower whisker is located at the maximum of the smallest value or 
        ## Q_1 - 1.5 IQR ( max(min(x), Q_1 - 1.5 * IQR) ),
        ## where IQR = Q_3 - Q_1 is the box length.
        a_quantiles <- apply(a, 2, quantile, c(0, 0.25, 0.5, 0.75, 1), 
            na.rm = TRUE)
        a_iqr <- a_quantiles["75%", ] - a_quantiles["25%", ]
        a_upper_whisker <- apply(rbind(
            a_quantiles["100%", ], a_quantiles["75%", ] + 1.5 * a_iqr), 2, min)
        a_lower_whisker <- apply(rbind(
            a_quantiles["0%", ], a_quantiles["25%", ] - 1.5 * a_iqr), 2, max)
        
        ## combine all the information
        a_l <- tibble::tibble(name = colnames(a_quantiles), 
            ymin = as.numeric(a_lower_whisker),
            lower = as.numeric(a_quantiles["25%", ]), 
            middle = as.numeric(a_quantiles["50%", ]), 
            upper = as.numeric(a_quantiles["75%", ]), 
            ymax = as.numeric(a_upper_whisker))
        
        ## get a list of outliers for each column, these will be the values that 
        ## are located above the upper whisker or below the lower whisker
        a_out <- lapply(seq_len(ncol(a)), function(cols_i) {
            a_cols_i <- a[, cols_i]
            a_cols_i <- a_cols_i[a_cols_i > a_upper_whisker[cols_i] | 
                a_cols_i < a_lower_whisker[cols_i]]
            a_cols_i <- as.numeric(a_cols_i)
            if (length(a_cols_i) > 0)
                data.frame(name = colnames(a)[cols_i], value = a_cols_i)
            else 
                data.frame(name = NULL, value = NULL)
        })
        a_out <- do.call("rbind", a_out)
        a_out <- tibble::as_tibble(a_out)
        
    } else { ## if violin == TRUE
        ## pivot_longer will create the columns name (containing the colnames 
        ## of a) and value (containing the actual values)
        a <- tibble::as_tibble(a) 
        a_l <- tidyr::pivot_longer(data = a, cols = seq_len(ncol(a)))
    }

    ## add another column that gives the order of plotting (will be factor)
    ## order alphabetically: combine the levels of the orderCategory and add 
    ## the name (to secure that the levels are unique)
    ## add another column for the x_values
    a_l <- dplyr::left_join(x = a_l, y = cD, 
        by = c("name" = "x5at1t1g161asy"), copy = TRUE)
    a_o <- paste(a_l[[orderCategory]], a_l[["name"]])
    a_o_u <- unique(a_o)
    a_l[["x_ggplot_vals"]] <- factor(x = a_o, levels = sort(a_o_u))
    
    ## do the actual plotting
    if (!violin) { 
        
        g <- ggplot2::ggplot(a_l, 
                ggplot2::aes(x = !!ggplot2::sym("x_ggplot_vals"))) +
            ggplot2::geom_boxplot(ggplot2::aes(ymin = !!ggplot2::sym("ymin"), 
                lower = !!ggplot2::sym("lower"), 
                middle = !!ggplot2::sym("middle"), 
                upper = !!ggplot2::sym("upper"), 
                ymax = !!ggplot2::sym("ymax")), stat = "identity") 
        
        if (nrow(a_out) > 0) {
            ## add x-axis order to outliers data set
            a_out <- dplyr::left_join(x = a_out, y = cD, 
                by = c("name" = "x5at1t1g161asy"), copy = TRUE)
            a_out_o <- paste(a_out[[orderCategory]], a_out[["name"]])
            a_out[["x_ggplot_vals"]] <- factor(x = a_out_o, 
                levels = sort(a_o_u))
            g <- g + geom_point(
                ggplot2::aes(x = !!ggplot2::sym("x_ggplot_vals"), 
                    y = !!ggplot2::sym("value")), 
                data = a_out)    
        }
        
    } else { ## violin == TRUE
        g <- ggplot2::ggplot(a_l, 
            ggplot2::aes(y = !!ggplot2::sym("value"), 
                x = !!ggplot2::sym("x_ggplot_vals"))) +
            ggplot2::geom_violin()
    }
    
    g + ggplot2::scale_x_discrete(
            labels = unique(a_l[["name"]])[order(a_o_u)]) +
        ggplot2::theme_classic() +
        ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90)) + 
        ggplot2::ggtitle(title) + ggplot2::xlab("samples")   
}

#' @name driftPlot
#' 
#' @title Plot the trend line for aggregated values
#' 
#' @description 
#' The function \code{driftPlot} aggregates the (count/intensity) values from 
#' the \code{assay()} slot of a \code{SummarizedExperiment} by the \code{median} 
#' or \code{sum} of the (count/intensity) values. \code{driftPlot} then 
#' visualizes these aggregated values and adds a trend line (using either 
#' LOESS or a linear model) from (a subset of) the aggregated values. The 
#' subset is specified by the arguments \code{category} and \code{level}.
#' 
#' @details 
#' The x-values are sorted according to the \code{orderCategory} argument: The 
#' levels of the corresponding column in \code{colData(se)} are pasted with the 
#' sample names (in the column \code{name}) and factorized.
#' Internal usage in \code{shinyQC}.
#' 
#' @param se \code{SummarizedExperiment}
#' @param aggregation \code{character}, type of aggregation of (count/intensity) 
#' values
#' @param category \code{character}, column of \code{colData(se)}
#' @param orderCategory \code{character}, column of \code{colData(se)}
#' @param level \code{character}, from which samples should the LOESS curve be
#' calculated, either \code{"all"} or one of the levels of the selected columns
#' of \code{colData(se)} (\code{"category"}) 
#' @param method \code{character}, either \code{"loess"} or \code{"lm"}
#' 
#' @return \code{gg} object from \code{ggplot2}
#' 
#' @examples 
#' #' ## create se
#' set.seed(1)
#' a <- matrix(rnorm(1000), nrow = 10, ncol = 100, 
#'     dimnames = list(seq_len(10), paste("sample", seq_len(100))))
#' a[c(1, 5, 8), seq_len(5)] <- NA
#' cD <- data.frame(name = colnames(a), type = c(rep("1", 50), rep("2", 50)))
#' rD <- data.frame(spectra = rownames(a))
#' se <- SummarizedExperiment::SummarizedExperiment(assay = a, 
#'     rowData = rD, colData = cD)
#' 
#' driftPlot(se, aggregation = "sum", category = "type", 
#'     orderCategory = "type", level = "1", method = "loess")
#' 
#' @importFrom dplyr across left_join summarise starts_with
#' @importFrom tidyr pivot_longer 
#' @importFrom stats median
#' @importFrom SummarizedExperiment assay
#' @importFrom ggplot2 ggplot aes geom_point geom_smooth theme_classic sym
#' @importFrom ggplot2 scale_color_manual scale_x_discrete xlab ylab theme
#' @importFrom ggplot2 element_text
#' @importFrom plotly ggplotly style 
#' @importFrom rlang .data
#' 
#' @export
driftPlot <- function(se, aggregation = c("median", "sum"), 
        category = colnames(colData(se)), orderCategory = colnames(colData(se)), 
        level = c("all", unique(colData(se)[, category])),
        method = c("loess", "lm")) {

    aggregation <- match.arg(aggregation)
    category <- match.arg(category)
    category <- make.names(category)
    orderCategory <- match.arg(orderCategory)
    orderCategory <- make.names(orderCategory)
    level <- match.arg(level, 
        choices = c("all", unique(as.character(colData(se)[, category]))))
    method <- match.arg(method)
    
    ## access the assay slot
    a <- SummarizedExperiment::assay(se)
    
    ## access the colData slot and add the rownames as a new column to cD
    ## (will add the column "x5at1t1g161asy")
    cD <- se@colData |> as.data.frame()
    colnames(cD) <- make.names(colnames(cD))
    if ("x5at1t1g161asy" %in% colnames(cD))
        stop("Column name `x5at1t1g161asy` must not be duplicated")
    cD[["x5at1t1g161asy"]] <- rownames(cD)

    a_l <- a |>
        as.data.frame() |>
        tidyr::pivot_longer(cols = seq_len(ncol(a)))

    if (aggregation == "median") FUN <- median
    if (aggregation == "sum") FUN <- sum

    ## aggregate the values across the samples
    a_l <- dplyr::group_by(.data = a_l, .data$name)
    a_l <- dplyr::summarise(a_l, 
        dplyr::across(dplyr::starts_with("value"), FUN, na.rm = TRUE))

    ## join with cD
    tbl <- dplyr::left_join(a_l, cD, 
        by = c("name" = "x5at1t1g161asy"), copy = TRUE)
    tbl[["col_ggplot_points"]] <- "all"
    tbl[["name"]] <- factor(tbl[["name"]], sort(tbl[["name"]]))
    tbl_c <- tbl[[category]]

    ## add another column that gives the order of plotting (will be factor)
    ## order alphabetically: combine the levels of the orderCategory and add 
    ## the name (to secure that the levels are unique)
    ## add another column for the x_values
    tbl_o <- tbl[[orderCategory]]
    tbl_o_n <- paste(tbl_o, tbl[["name"]])
    tbl[["x_ggplot_vals"]] <- factor(tbl_o_n, sort(tbl_o_n))

    ## create a separate data.frame which only contains the level or "all"
    if (level == "all") {
        tbl_subset <- tbl
    } else {
        tbl_subset <- tbl[tbl_c == level, ]
    }
    tbl_subset[["col_ggplot_points"]] <- "subset"

    ## create another column which converts factors into numeric (needed for 
    ## geom_smooth)
    tbl_subset[["x_ggplot_vals_num"]] <- as.numeric(
        tbl_subset[["x_ggplot_vals"]])

    g <- ggplot2::ggplot(tbl,
            ggplot2::aes(x = !!ggplot2::sym("x_ggplot_vals"), 
                y = !!ggplot2::sym("value"), 
                col = !!ggplot2::sym("col_ggplot_points"))) + 
        suppressWarnings(
            ggplot2::geom_point(ggplot2::aes(text = !!ggplot2::sym("name")))) + 
        ggplot2::geom_point(data = tbl_subset, 
            ggplot2::aes(x = !!ggplot2::sym("x_ggplot_vals"), 
                y = !!ggplot2::sym("value"), 
                col = !!ggplot2::sym("col_ggplot_points"))) + 
        ggplot2::geom_smooth(data = tbl_subset, 
            ggplot2::aes(x = !!ggplot2::sym("x_ggplot_vals_num"), 
                y = !!ggplot2::sym("value")), 
            method = method) +
        ggplot2::theme_classic() + 
        ggplot2::scale_color_manual(values = c("#000000", "#2A6CEF")) +
        ggplot2::scale_x_discrete(
            labels = tbl[["name"]][order(tbl[["x_ggplot_vals"]])]) +
        ggplot2::xlab("samples") + 
        ggplot2::ylab(paste(aggregation, "of values")) +
        ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90), 
            legend.position = "none")
    g <- plotly::ggplotly(g, tooltip = c("text"))
    g |> plotly::style(hoveron = "fills", traces = 2)
}


#' @name cv 
#' 
#' @title Calculate coefficient of variation
#' 
#' @description 
#' The function \code{cv} calculates the coefficient of variation from columns  
#' of a matrix. The coefficients of variation are calculated according to the 
#' formula \code{sd(y) / mean(y) * 100} with \code{y} the column values, thus,
#' the function returns the coefficient of variation in percentage.
#' 
#' @details
#' The function returned a named \code{list} (the name is specified by the 
#' \code{name} argument) containing the coefficient of variation of the 
#' columns of \code{x}.
#' 
#' @param x \code{matrix}
#' @param name \code{character}, the name of the returned list
#' 
#' @return \code{list}
#' 
#' @examples
#' x <- matrix(seq_len(10), ncol = 2)
#' cv(x)
#' 
#' @importFrom stats sd
#' 
#' @export
cv <- function(x, name = "raw") {
    
    sd_v <- apply(x, 2, stats::sd, na.rm = TRUE)
    mean_v <- apply(x, 2, mean, na.rm = TRUE)
    cv_v <- sd_v / mean_v * 100
    
    ## create a named list and return the list
    cv_v <- list(cv_v)
    names(cv_v) <- name
    cv_v
}

#' @name plotCV
#' 
#' @title Plot CV values
#' 
#' @description 
#' The function \code{plotCV} displays the coefficient of variation values of 
#' set of values supplied in a \code{data.frame} object. The function will 
#' create a plot using the \code{ggplot2} package and will print the values 
#' in the different columns in different colors.
#' 
#' @details 
#' Internal usage in \code{shinyQC}.
#' 
#' @param df \code{data.frame} containing one or multiple columns containing 
#' the coefficients of variation
#' 
#' @examples
#' x1 <- matrix(seq_len(10), ncol = 2)
#' x2 <- matrix(seq(11, 20), ncol = 2)
#' x3 <- matrix(seq(21, 30), ncol = 2)
#' x4 <- matrix(seq(31, 40), ncol = 2)
#' 
#' ## calculate cv values
#' cv1 <- cv(x1, "x1")
#' cv2 <- cv(x2, "x2")
#' cv3 <- cv(x3, "x3")
#' cv4 <- cv(x4, "x4")
#' 
#' df <- data.frame(cv1, cv2, cv3, cv4)
#' plotCV(df)
#' 
#' @return \code{gg} object from \code{ggplot2}
#' 
#' @importFrom tidyr pivot_longer
#' @importFrom ggplot2 ggplot aes geom_point geom_line xlab ylab sym
#' @importFrom ggplot2 theme_bw theme element_text
#' @export
plotCV <- function(df) {
    
    ## add a sample column
    if (!is.data.frame(df)) stop("df is not a data.frame")
    df[["sample"]] <- rownames(df)
    df[["sample"]] <- factor(x = df[["sample"]], levels = df[["sample"]])
    df <- tidyr::pivot_longer(df, cols = seq_len((ncol(df) - 1)))
    
    ggplot2::ggplot(df, ggplot2::aes(x = !!ggplot2::sym("sample"), 
            y = !!ggplot2::sym("value"))) + 
        ggplot2::geom_point(ggplot2::aes(color = !!ggplot2::sym("name"))) + 
        ggplot2::geom_line(ggplot2::aes(group = !!ggplot2::sym("name"), 
            color = !!ggplot2::sym("name"))) + 
        ggplot2::xlab("sample") + 
        ggplot2::ylab("coefficient of variation (in %)") + 
        ggplot2::theme_bw() + 
        ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90))
}

#' @name ECDF
#'
#' @title Create ECDF plot of a sample against a reference
#'
#' @description
#' The function \code{ECDF} creates a plot of the empirical cumulative 
#' distribution function of a specified sample and an outgroup (reference). 
#' The reference is specified by the \code{group} argument. The row-wise 
#' (feature) mean values of the reference are calculated after excluding 
#' the specified \code{sample}.
#'
#' @details 
#' Internal use in \code{shinyQC}. 
#' 
#' The function \code{ECDF} uses the \code{ks.test} function from \code{stats} 
#' to perform a two-sample Kolmogorov-Smirnov test. The Kolmogorov-Smirnov 
#' test is run with the alternative \code{"two.sided"}
#' (null hypothesis is that the true distribution function of the 
#' \code{sample} is equal to the hypothesized distribution function of the 
#' \code{group}).
#' 
#' The \code{exact} argument in \code{ks.test} is set to \code{NULL}, meaning 
#' that an exact p-value is computed if the product of the sample sizes is 
#' less than 10000 of \code{sample} and \code{group}. Otherwise, asymptotic 
#' distributions are used whose approximations might be inaccurate in low 
#' sample sizes.
#' 
#' @param se \code{SummarizedExperiment} object
#' @param sample \code{character}, name of the sample to compare against the 
#' group
#' @param group \code{character}, either \code{"all"} or one of 
#' \code{colnames(colData(se))}
#' 
#' @examples
#' ## create se
#' set.seed(1)
#' a <- matrix(rnorm(1000), nrow = 100, ncol = 10, 
#'     dimnames = list(seq_len(100), paste("sample", seq_len(10))))
#' a[c(1, 5, 8), seq_len(5)] <- NA
#' cD <- data.frame(name = colnames(a), type = c(rep("1", 5), rep("2", 5)))
#' rD <- data.frame(spectra = rownames(a))
#' se <- SummarizedExperiment(assay = a, rowData = rD, colData = cD)
#' 
#' ECDF(se, sample = "sample 1", group = "all")
#' 
#' @importFrom SummarizedExperiment assay
#' @importFrom stats ks.test
#' @importFrom ggplot2 ggplot aes sym stat_ecdf theme_bw xlab ylab
#' @importFrom ggplot2 ggtitle theme element_blank
#' 
#' @return \code{gg} object from \code{ggplot2}
#' 
#' @export
ECDF <- function(se, sample = colnames(se), 
    group = c("all", colnames(colData(se)))) {

    ## match arguments
    sample <- match.arg(sample)
    group <- match.arg(group)
    group <- make.names(group)
    
    ## access the assay slot
    a <- SummarizedExperiment::assay(se)
    
    ## access the colData slot and add the rownames as a new column to cD
    ## (will add the column "x5at1t1g161asy")
    cD <- se@colData |> as.data.frame()
    colnames(cD) <- make.names(colnames(cD))
    if ("x5at1t1g161asy" %in% colnames(cD))
        stop("Column name `x5at1t1g161asy` must not be duplicated")
    cD[["x5at1t1g161asy"]] <- rownames(cD)
    
    df <- data.frame(value = a[, sample], type = sample)
    
    ## calculate sample-wise the values
    ## get indices 
    inds_l <- colnames(a) == sample
    inds_nl <- !inds_l
    
    ## truncate the indices based on the group (only use the group for the 
    ## comparison and the "outgroup")
    if (group != "all") {
        g <- cD[cD[, "x5at1t1g161asy"] == sample, group]
        inds_nl <- inds_nl & cD[, group] == g
    }
    
    rM <- rowMeans(a[, inds_nl], na.rm = TRUE)
    df_group <- data.frame(value = rM, type = "group")
    
    ## remove from df that contains the sample values and from df_group the 
    ## features that have in any of df/df_group NA values
    inds <- !(is.na(df_group$value) |  is.na(df$value))
    df <- df[inds, ]
    df_group <- df_group[inds, ]
    
    ## calculate KS statistic of data
    value_s <- df[["value"]]
    value_g <- df_group[["value"]]
    ks_test <- stats::ks.test(x = value_s, y = value_g, exact = NULL, 
        alternative = "two.sided")
    
    ## bind together
    df <- rbind(df, df_group)

    ggplot2::ggplot(df, 
        ggplot2::aes(x = !!ggplot2::sym("value"), 
            color = !!ggplot2::sym("type"), group = !!ggplot2::sym("type"))) + 
        ggplot2::stat_ecdf(linewidth = 1) + ggplot2::theme_bw() + 
        ggplot2::xlab(sample) + ggplot2::ylab("ECDF") +
        ggplot2::ggtitle(sprintf("K-S Test: D: %s, p-value: %s", 
            signif(ks_test$statistic, 3), 
            signif(ks_test$p.value, 3))) + 
        ggplot2::theme(legend.title = ggplot2::element_blank(), 
            legend.position ="top")
}

#' @name distShiny
#'
#' @title Create distance matrix from numerical matrix
#'
#' @description
#' The function \code{distShiny} takes as an input a numerical \code{matrix} or
#' \code{data.frame} and returns the distances between the rows and columns
#' based on the defined \code{method} (e.g. euclidean distance). 
#' 
#' @details 
#' Internal use in \code{shinyQC}.
#'
#' @param x \code{matrix} or \code{data.frame} with samples in columns and 
#' features in rows
#' @param method \code{character}, method for distance calculation
#'
#' @examples
#' x <- matrix(seq_len(100), nrow = 10, ncol = 10, 
#'         dimnames = list(seq_len(10), paste("sample", seq_len(10))))
#' distShiny(x = x)
#' 
#' @return \code{matrix}
#' 
#' @importFrom stats dist
#'
#' @export
distShiny <- function(x, method = "euclidean") {
    ## calculate the distance matrix
    d <- stats::dist(t(x), method = method, diag = TRUE)
    data.matrix(d)
}

#' @name distSample
#'
#' @title  Create a heatmap using distance information between samples
#' 
#' @description 
#' The function \code{distSample} creates a heatmap from a distance matrix 
#' created by the function \code{distShiny}. The heatmap is annotated by the 
#' column specified by the \code{label} column in \code{colData(se)}.
#' 
#' @details 
#' Internal use in \code{shinyQC}
#'
#' @param d \code{matrix} containing distances, obtained from \code{distShiny}
#' @param se \code{SummarizedExperiment}
#' @param label \code{character}, refers to a column in \code{colData(se)}
#' @param title \code{character}
#' @param ... further arguments passed to \code{ComplexHeatmap::Heatmap}
#'
#' @examples
#' ## create se
#' a <- matrix(seq_len(100), nrow = 10, ncol = 10,
#'             dimnames = list(seq_len(10), paste("sample", seq_len(10))))
#' a[c(1, 5, 8), seq_len(5)] <- NA
#' set.seed(1)
#' a <- a + rnorm(100)
#' a_i <- imputeAssay(a, method = "MinDet")
#' cD <- data.frame(name = colnames(a_i),
#'     type = c(rep("1", 5), rep("2", 5)))
#' rD <- data.frame(spectra = rownames(a_i))
#' se <- SummarizedExperiment::SummarizedExperiment(assay = a_i, rowData = rD,
#'     colData = cD)
#' 
#' dist <- distShiny(a_i)
#' distSample(dist, se, label = "type", title = "imputed", 
#'     show_row_names = TRUE)
#'
#' @return \code{Heatmap} object from \code{ComplexHeatmap}
#'
#' @importFrom ComplexHeatmap HeatmapAnnotation Heatmap
#' @importFrom grDevices hcl.colors
#' @importFrom stats setNames as.dist
#'
#' @export
distSample <- function(d, se, label = "name", title = "raw", ...) {
    
    ## create the annotation, use here the colData, get the column label in 
    ## colData
    val <- se@colData[[label]]
    val_u <- unique(val)
    df <- data.frame(x = val)
    colnames(df) <- label
    l <- list(stats::setNames(grDevices::hcl.colors(n = length(val_u)), val_u))
    names(l) <- label
    
    ## set default arguments for ComplexHeatmap::Heatmap
    top_annotation <- ComplexHeatmap::HeatmapAnnotation(df = df, col = l)
    show_column_names <- FALSE
    args_default <- list(matrix = d, 
        clustering_distance_rows = function(x) stats::as.dist(x),
        clustering_distance_columns = function(x) stats::as.dist(x),
        top_annotation = top_annotation,
        name = "distances", column_title = title,
        show_column_names = show_column_names)

    ## write the ellipsis to args
    args <- list(...)
    args_default[names(args)] <- args

    ## call the ComplexHeatmap::Heatmap function
    do.call(what = ComplexHeatmap::Heatmap, args = args_default)
}

#' @name sumDistSample
#'
#' @title Plot the sum of distances to other samples
#'
#' @description 
#' The function \code{sumDistSample} creates a plot showing the sum of distance 
#' of a sample to other samples. 
#'
#' @param d \code{matrix} containing distances, obtained from \code{distShiny}
#' @param title \code{character} specifying the title to be added to the plot
#' 
#' @examples
#' a <- matrix(seq_len(100), nrow = 10, ncol = 10, 
#'             dimnames = list(seq_len(10), paste("sample", seq_len(10))))
#' dist <- distShiny(a)
#' 
#' sumDistSample(dist, title = "raw")
#'
#' @return \code{gg} object from \code{ggplot2}
#'
#' @importFrom ggplot2 ggplot aes sym geom_point geom_segment ggtitle
#' @importFrom ggplot2 xlab ylab theme_classic theme element_blank
#' @importFrom tibble tibble
#' @importFrom plotly ggplotly
#'
#' @export
sumDistSample <- function(d, title = "raw") {
    d_sum <- rowSums(d) 
    tbl <- tibble::tibble(name = names(d_sum), distance = d_sum)
    g <- ggplot2::ggplot(tbl, ggplot2::aes(x = !!ggplot2::sym("distance"), 
            y = !!ggplot2::sym("name"))) + 
        ggplot2::geom_point(size = 0.5) + 
        ggplot2::geom_segment(ggplot2::aes(xend = 0, 
            yend = !!ggplot2::sym("name")), 
            linewidth = 0.1) + 
        ggplot2::ggtitle(title) +
        ggplot2::xlab("sum of distances") + ggplot2::ylab("") + 
        ggplot2::theme_classic()
    
    plotly::ggplotly(g, tooltip = c("x", "y"))
}

#' @name MAvalues
#'
#' @title Create values (M and A) for MA plot
#'
#' @description 
#' The function \code{MAvalues} will create MA values as input for the function 
#' \code{MAplot} and \code{hoeffDValues}. 
#' \code{M} and \code{A} are specified relative to specified samples which 
#' is determined by the \code{group} argument. In case of \code{group == "all"}, 
#' all samples (expect the specified one) are taken for the reference 
#' calculation. In case of \code{group != "all"} will use the samples belonging 
#' to the same group given in \code{colnames(colData(se))} expect the 
#' specified one. 
#'
#' @param se \code{SummarizedExperiment}
#' @param log2 \code{logical}, specifies if values are 
#' \code{log2}-transformed prior to calculating M and A values. 
#' If the values are already transformed, \code{log2} should be set to 
#' \code{FALSE}. If \code{log2 = TRUE} and if there are values in 
#' \code{assay(se)} that are 0, the \code{log2} values are calculated by
#' \code{log2(assay(se) + 1)}
#' @param group \code{character}, either \code{"all"} or one of 
#' \code{colnames(colData(se))}
#'
#' @return \code{tbl} with columns \code{Feature}, \code{name} (sample name), 
#' \code{A}, \code{M} and additional columns of \code{colData(se)}
#' 
#' @examples
#' ## create se
#' set.seed(1)
#' a <- matrix(rnorm(10000), nrow = 1000, ncol = 10, 
#'             dimnames = list(seq_len(1000), paste("sample", seq_len(10))))
#' a[c(1, 5, 8), seq_len(5)] <- NA
#' cD <- data.frame(name = colnames(a), type = c(rep("1", 5), rep("2", 5)))
#' rD <- data.frame(spectra = rownames(a))
#' se <- SummarizedExperiment(assay = a, rowData = rD, colData = cD)
#' 
#' MAvalues(se, log = FALSE, group = "all")
#' 
#' @importFrom dplyr left_join
#' @importFrom SummarizedExperiment assay colData
#' @importFrom tibble as_tibble add_column tibble rownames_to_column
#' @importFrom tidyr pivot_longer
#' 
#' @export
MAvalues <- function(se, log2 = TRUE, group = c("all", colnames(colData(se)))) {
    
    ## check arguments group
    group <- match.arg(group)
    
    ## access the assay slot
    a <- SummarizedExperiment::assay(se)
    
    ## access the colData slot and add the rownames as a new column to cD
    ## (will add the column "x5at1t1g161asy")
    cD <- se@colData |> as.data.frame()
    if ("x5at1t1g161asy" %in% colnames(cD))
        stop("Column name `x5at1t1g161asy` must not be duplicated")
    cD[["x5at1t1g161asy"]] <- rownames(cD)
    
    if (ncol(a) < 2)
        stop("MAplot needs more than one samples")
    
    ## take logarithm of assay values if the values are not transformed yet
    if (log2) {
        if (any(a == 0, na.rm = TRUE)) {
            a <- log2(a + 1)
        } else {
            a <- log2(a)   
        }
    }
    
    ## calculate sample-wise the M and A values (iterate through the columns)
    A <- M <- matrix(NA, nrow = nrow(a), ncol = ncol(a),
        dimnames = list(rownames(a), colnames(a)))
    
    ## calculate sample-wise the values (iterate through the columns)
    for (sample_i in colnames(a)) {
        
        ## get indices
        inds_nl <- colnames(a) != sample_i
        
        ## truncate the indices based on the group (only use the group for the 
        ## comparison and the "outgroup")
        if (group != "all") {
            g <- cD[cD[["x5at1t1g161asy"]] == sample_i, group]
            inds_nl <- inds_nl & cD[, group] == g
        }
        rM <- rowMeans(a[, inds_nl], na.rm = TRUE)
        M[, sample_i] <- a[, sample_i] - rM
        A[, sample_i] <- 0.5 * (a[, sample_i] + rM)
    }
    
    ## combine the data and add additional information from colData
    M <- M |> 
        tibble::as_tibble(M, rownames = "Feature") |>
        tidyr::pivot_longer(cols = 2:(ncol(a) + 1), values_to = "M")
    A <- A |>
        tibble::as_tibble(A, rownames = "Feature") |>
        tidyr::pivot_longer(cols = 2:(ncol(a) + 1), values_to = "A")
    
    ## combine A and M
    tibble::tibble(A, M = M$M) |>
        dplyr::left_join(y = cD, by = c("name" = "x5at1t1g161asy"))
}

#' @name hoeffDValues
#' 
#' @title Create values of Hoeffding's D statistics from M and A values
#' 
#' @description 
#' The function creates and returns Hoeffding's D statistics values 
#' from MA values. 
#' 
#' In case \code{sample_n} is set to a numerical value (e.g. 10000), a 
#' random subset containing \code{sample_n} is taken to calculate Hoeffding's D 
#' values to speed up the calculation. In case there are less features
#' than \code{sample_n}, all features are taken.
#'
#' @details 
#' The function uses the function \code{hoeffd} from the \code{Hmisc} package to 
#' calculate the values.
#'
#' @param tbl \code{tibble}, as obtained from the function \code{MAvalues}
#' @param name \code{character(1)}, name of the returned list
#' @param sample_n \code{numeric(1)}, number of features (subset) to be 
#' taken for calculation of Hoeffding's D values 
#' 
#' @examples
#' ## create se
#' a <- matrix(seq_len(100), nrow = 10, ncol = 10, 
#'             dimnames = list(seq_len(10), paste("sample", seq_len(10))))
#' a[c(1, 5, 8), seq_len(5)] <- NA
#' set.seed(1)
#' a <- a + rnorm(100)
#' cD <- data.frame(name = colnames(a), type = c(rep("1", 5), rep("2", 5)))
#' rD <- data.frame(spectra = rownames(a))
#' se <- SummarizedExperiment::SummarizedExperiment(assay = a, 
#'     rowData = rD, colData = cD)
#' 
#' tbl <- MAvalues(se)
#' hoeffDValues(tbl, "raw")
#' 
#' ## normalized values
#' se_n <- se
#' assay(se_n) <- normalizeAssay(a, "sum")
#' tbl_n <- MAvalues(se_n, group = "all")
#' hoeffDValues(tbl_n, "normalized")
#'
#' ## transformed values
#' se_t <- se
#' assay(se_t) <- transformAssay(a, "log")
#' tbl_t <- MAvalues(se_t, group = "all")
#' hoeffDValues(tbl_t, "transformed")
#'
#' @importFrom Hmisc hoeffd
#' @importFrom tidyr pivot_wider
#' @importFrom dplyr mutate group_by summarise pull
#'
#' @return named list with Hoeffding's D values per sample
#' 
#' @export
hoeffDValues <- function(tbl, name = "raw", sample_n = NULL) {
    
    ## for bigger datasets the calculaton of Hoeffding's values is not
    ## efficient, create a random subset of the features to calculate 
    ## Hoeffding's D values (e.g. 10000 features)
    ## in case there are less features than sample_n, all features are taken
    unique_features <- unique(tbl$Feature)
    if (!is.numeric(sample_n)) sample_n <- length(unique_features)
    min_features <- min(length(unique_features), sample_n)
    features <- sample(unique_features, size = min_features, replace = FALSE)
    tbl <- tbl[tbl$Feature %in% features, ]
         
    ## create a wide tibble with the samples as columns and features as rows 
    ## for A and M values
    A <- tidyr::pivot_wider(tbl, id_cols = "Feature", values_from = "A", 
        names_from = "name")
    M <- tidyr::pivot_wider(tbl, id_cols = "Feature", values_from = "M", 
        names_from = "name")
    
    ## create the D statistic between M and A values, 
    ## only return these comparisons (not A vs. A and M vs. M, i.e. index 
    ## by [2, 1])
    hd_l <- dplyr::mutate(tbl, name = factor(name, levels = unique(name))) |>
        dplyr::group_by(name) |>
        dplyr::summarise(hd_l = Hmisc::hoeffd(A, M)$D[2, 1]) 
    hd_l <- hd_l$hd_l
    
    ## assign the names (do not use the first column since it contains the 
    ## Feature names)
    names(hd_l) <- colnames(A)[-1]
    
    ## create a list containing the named numeric and return the named list of
    ## these values
    l <- list(hd_l)
    names(l) <- name
    
    l
}

#' @name hoeffDPlot
#' 
#' @title Create a plot from a list of Hoeffding's D values
#' 
#' @description 
#' The function \code{hoeffDPlot} creates via \code{ggplot} a violin plot per 
#' factor, a jitter plot of the data points and (optionally) connects the points
#' via lines. \code{hoeffDPlot} uses the \code{plotly} package to make the 
#' figure interactive.
#' 
#' @details 
#' The function \code{hoeffDPlot} will create the violin plot and jitter plot 
#' according to the specified order given by the colnames of \code{df}. 
#' \code{hoeffDPlot} will thus internally refactor the \code{colnames} of the 
#' supplied \code{data.frame} according to the order of the \code{colnames}. 
#' 
#' @param df \code{data.frame} containing one or multiple columns containing the 
#' Hoeffding's D statistics
#' @param lines \code{logical}, should points belonging to the same sample be 
#' connected
#'
#' @examples
#' ## create se
#' set.seed(1)
#' a <- matrix(rnorm(10000), nrow = 1000, ncol = 10, 
#'             dimnames = list(seq_len(1000), paste("sample", seq_len(10))))
#' a[c(1, 5, 8), seq_len(5)] <- NA
#' cD <- data.frame(name = colnames(a), type = c(rep("1", 5), rep("2", 5)))
#' rD <- data.frame(spectra = rownames(a))
#' se <- SummarizedExperiment::SummarizedExperiment(assay = a, 
#'     rowData = rD, colData = cD)
#' 
#' tbl <- MAvalues(se, log = FALSE, group = "all")
#' hd_r <- hoeffDValues(tbl, "raw")
#' 
#' ## normalized values
#' se_n <- se
#' assay(se_n) <- normalizeAssay(a, "sum")
#' tbl_n <- MAvalues(se_n, log = FALSE, group = "all")
#' hd_n <- hoeffDValues(tbl_n, "normalized")
#' 
#' df <- data.frame(raw = hd_r, normalized = hd_n)
#' hoeffDPlot(df, lines = TRUE)
#' hoeffDPlot(df, lines = FALSE)
#' 
#' @return \code{gg} object from \code{ggplot2}
#' 
#' @importFrom tidyr pivot_longer
#' @importFrom dplyr mutate
#' @importFrom ggplot2 ggplot geom_violin geom_point aes sym geom_line
#' @importFrom ggplot2 ylab theme_classic theme
#' @importFrom plotly ggplotly
#' 
#' @export
hoeffDPlot <- function(df, lines = TRUE) {
    
    ## set lines to FALSE if only one list is supplied
    if (ncol(df) == 1) lines <- FALSE
    
    ## obtain the supplied order
    cols <- colnames(df)
    
    ## add the sample names as a column and create a long format of the df
    df <- data.frame(sample = rownames(df), df)
    df <- tidyr::pivot_longer(df, cols = 2:ncol(df), 
        names_to = "processing_step")
    
    ## refactor the names according to the supplied order (cols)
    names_f <- factor(dplyr::pull(df,"processing_step"), cols)
    df <- dplyr::mutate(df, processing_step = names_f, x = as.numeric(names_f))

    df$x_jitter <- jitter(df$x)
    # ## do the actual plotting
    g <- ggplot2::ggplot(df) + 
        ggplot2::geom_violin(
            ggplot2::aes(x = !!ggplot2::sym("processing_step"), 
                y = !!ggplot2::sym("value")), 
            na.rm = TRUE) + 
        suppressWarnings(
            ggplot2::geom_point(
                ggplot2::aes(x = !!ggplot2::sym("x_jitter"), 
                    y = !!ggplot2::sym("value"), 
                    color = !!ggplot2::sym("processing_step"), 
                    text = !!ggplot2::sym("sample"))))
    if (lines) g <- g + ggplot2::geom_line(
        ggplot2::aes(x = !!ggplot2::sym("x_jitter"), 
            y = !!ggplot2::sym("value"), group = !!ggplot2::sym("sample")))
    g <- g + ggplot2::ylab("Hoeffding's D statistic") + 
        ggplot2::xlab("processing step") + ggplot2::theme_classic() +
        ggplot2::theme(legend.position = "none")
    plotly::ggplotly(g, tooltip = c("text", "y"))
}

#' @name MAplot
#'
#' @title Create a MA plot
#'
#' @description 
#' The function creates a 2D histogram of M and A values.
#' 
#' @details 
#' \code{MAplot} returns a 2D hex histogram instead of a classical scatterplot 
#' due to computational reasons and better visualization of overlaying points.
#' The argument \code{plot} specifies the sample (refering to 
#' \code{colData(se)$name}) to be plotted. If \code{plot = "all"}, MA values 
#' for all samples will be plotted (samples will be plotted in facets). 
#' If the number of features (\code{tbl$Features}) is below 1000, points will be 
#' plotted (via \code{geom_points}), otherwise hexagons will be plotted
#' (via \code{geom_hex}).
#'
#' @param tbl \code{tibble} containing the M and A values, as obtained from the 
#' \code{MAvalues} function
#' @param group \code{character}, one of \code{colnames(colData(se))} 
#' (\code{se} used in \code{MAvalues}) or \code{"all"}
#' @param plot \code{character}, one of \code{colData(se)$name} (\code{se} 
#' used in \code{MAvalues}) or \code{"all"}
#'
#' @return \code{gg} object from \code{ggplot2}
#'
#' @examples
#' ## create se
#' set.seed(1)
#' a <- matrix(rnorm(10000), nrow = 1000, ncol = 10,
#'             dimnames = list(seq_len(1000), paste("sample", seq_len(10))))
#' a[c(1, 5, 8), seq_len(5)] <- NA
#' cD <- data.frame(name = colnames(a), type = c(rep("1", 5), rep("2", 5)))
#' rD <- data.frame(spectra = rownames(a))
#' se <- SummarizedExperiment::SummarizedExperiment(assay = a,
#'     rowData = rD, colData = cD)
#'
#' tbl <- MAvalues(se, log = FALSE, group = "all")
#' MAplot(tbl, group = "all", plot = "all")
#'
#' @importFrom dplyr pull filter
#' @importFrom ggplot2 ggplot aes geom_hex geom_point facet_wrap sym
#' @importFrom ggplot2 theme coord_fixed xlim ylim theme_bw
#' @importFrom stats formula
#' 
#' @export
MAplot <- function(tbl, group = c("all", colnames(tbl)), 
    plot = c("all", unique(tbl[["name"]]))) {
    
    group <- match.arg(group)
    
    if (!all(plot %in% c("all", unique(tbl$name)))) {
        stop("plot not in 'all' or 'unique(tbl$name)'")
    }
    
    ## get the number of features (this will govern if points will be plotted
    ## or hexagons)
    n <- tbl[["Feature"]]
    n <- unique(n)
    n <- length(n)

    A <- tbl[["A"]]
    M <- tbl[["M"]]
    x_lim <- c(min(A, na.rm = TRUE), max(A, na.rm = TRUE))
    x_lim <- ifelse(is.infinite(x_lim), NA, x_lim)
    y_lim <- c(min(M, na.rm = TRUE), max(M, na.rm = TRUE))
    y_lim <- ifelse(is.infinite(y_lim), NA, y_lim)
    
    if (!("all" %in% plot)) {
        tbl <- tbl[tbl[["name"]] %in% plot, ]
    }
    
    ## create a formula depending on the group argument for facet_wrap 
    if (group == "all" | group == "name") {
        fm <- stats::formula(paste("~", quote(name))) 
    } else {
        fm <- stats::formula(paste("~", group, "+", quote(name)))
    }
    
    ## do the actual plotting
    g <- ggplot2::ggplot(tbl, 
        ggplot2::aes(x = !!ggplot2::sym("A"), y = !!ggplot2::sym("M")))
    if (n >= 1000) {
        g <- g + ggplot2::geom_hex()
    } else {
        g <- g + ggplot2::geom_point()
    }
    
    if (group != "name") {
        g <- g + ggplot2::facet_wrap(fm, scales = "free")
    } else {
        g <- g + ggplot2::coord_fixed()
    }
    
    if (!any(is.na(x_lim))) g <- g + ggplot2::xlim(x_lim)
    if (!any(is.na(y_lim))) g <- g + ggplot2::ylim(y_lim)
    g <- g + ggplot2::theme_bw()
    g + ggplot2::theme(aspect.ratio = 1) 
        #plot.margin = grid::unit(c(0,0,0,0), "mm"))
}

#' @name createDfFeature
#' 
#' @title Create data frame of (count/intensity) values for a selected feature
#' along data processing steps
#' 
#' @description 
#' The function \code{createDfFeature} takes as input a list of matrices and 
#' returns the row \code{Feature} of each matrix as a column of a 
#' \code{data.frame}. The function \code{createDfFeature} provides the input 
#' for the function \code{featurePlot}. 
#' 
#' @details 
#' Internal usage in \code{shinyQC}
#' 
#' @param l \code{list} containing matrices at different processing steps
#' @param feature \code{character}, element of \code{rownames} of the matrices 
#' in \code{l}
#' 
#' @return \code{data.frame}
#' 
#' @examples 
#' set.seed(1)
#' x1 <- matrix(rnorm(100), ncol = 10, nrow = 10, 
#'     dimnames = list(paste("feature", seq_len(10)), 
#'         paste("sample", seq_len(10))))
#' x2 <- x1 + 5
#' x3 <- x2 + 10
#' 
#' l <- list(x1 = x1, x2 = x2, x3 = x3)
#' createDfFeature(l, "feature 1")
#' 
#' @export
createDfFeature <- function(l, feature) {
    l_slice <- lapply(seq_along(l), function(i) as.numeric(l[[i]][feature, ]))
    names(l_slice) <- names(l)
    df <- data.frame(l_slice)
    rownames(df) <- colnames(l[[1]])
    df
}

#' @name featurePlot
#'
#' @title Create a plot of (count/intensity) values over the samples
#'
#' @description
#' The function \code{featurePlot} creates a plot of (count/intensity) values 
#' for different data processing steps (referring to columns in the 
#' \code{data.frame}) over the different samples (referring to rows in 
#' the \code{data.frame}).
#' 
#' @details
#' Internal usage in \code{shinyQC}.
#' 
#' @param df \code{data.frame}
#' 
#' @return \code{gg} object from \code{ggplot2}
#' 
#' @examples 
#' set.seed(1)
#' x1 <- matrix(rnorm(100), ncol = 10, nrow = 10, 
#'     dimnames = list(paste("feature", seq_len(10)), 
#'         paste("sample", seq_len(10))))
#' x2 <- x1 + 5
#' x3 <- x2 + 10
#' l <- list(x1 = x1, x2 = x2, x3 = x3)
#' df <- createDfFeature(l, "feature 1")
#' featurePlot(df)
#' 
#' @importFrom tidyr pivot_longer
#' @importFrom tibble tibble
#' @importFrom ggplot2 ggplot aes sym geom_point geom_line xlab ylab
#' @importFrom ggplot2 theme_bw theme element_text
#' 
#' @export
featurePlot <- function(df) {
    
    ## add a sample column
    if (!is.data.frame(df)) stop("df is not a data.frame")
    df[["sample"]] <- factor(x = rownames(df), levels = rownames(df))
    df <- tidyr::pivot_longer(df, cols = seq_len((ncol(df) - 1)))
    
    ggplot2::ggplot(df, 
            ggplot2::aes(x = !!ggplot2::sym("sample"), 
                y = !!ggplot2::sym("value"))) + 
        ggplot2::geom_point(ggplot2::aes(color = !!ggplot2::sym("name"))) + 
        ggplot2::geom_line(ggplot2::aes(group = !!ggplot2::sym("name"), 
            color = !!ggplot2::sym("name"))) + 
        ggplot2::xlab("sample") + ggplot2::ylab("value") + 
        ggplot2::theme_bw() + 
        ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90))
}

#' @name cvFeaturePlot
#' 
#' @title Plot of feature-wise coefficient of variation values
#' 
#' @description 
#' The function \code{cvFeaturePlot} returns a \code{plotly} plot of coefficient 
#' of variation values. It will create a violin plot and superseded points
#' of coefficient of variation values per list entry of \code{l}.
#' 
#' @details 
#' \code{lines = TRUE} will connect the points belonging to the same feature 
#' with a line. If there are less than two features, the violin plot will not be
#' plotted. The violin plots will be ordered according to the order in \code{l}
#'  
#' @param l \code{list} containing matrices
#' @param lines \code{logical}
#' 
#' @return \code{plotly}
#' 
#' @examples
#' x1 <- matrix(seq_len(100), ncol = 10, nrow = 10, 
#'     dimnames = list(paste("feature", seq_len(10)), 
#'         paste("sample", seq_len(10))))
#' x2 <- x1 + 5
#' x3 <- x2 + 10
#' l <- list(x1 = x1, x2 = x2, x3 = x3)
#' cvFeaturePlot(l, lines = FALSE)
#' 
#' @importFrom tidyr pivot_longer
#' @importFrom dplyr mutate
#' @importFrom ggplot2 ggplot geom_violin aes sym geom_point geom_line
#' @importFrom ggplot2 ylab xlab theme_classic theme
#' @importFrom plotly ggplotly
#' @export
cvFeaturePlot <- function(l, lines = FALSE) {
    
    names_l <- names(l)
    df <- lapply(seq_along(l), function(x) cv(t(l[[x]]), name = names_l[x])) |>
        as.data.frame()
    
    ## create df containing cv values
    df[["feature"]] <- rownames(l[[1]])
    df <- tidyr::pivot_longer(df, cols = seq_len((ncol(df) - 1)))
    df[["name"]] <- factor(df[["name"]], levels = names(l))
    df[["x_jitter"]] <- as.numeric(df$name) |>
        jitter()
    
    # ## do the actual plotting
    g <- ggplot2::ggplot(df) 
    if (nrow(df) > 2) g <- g + 
        ggplot2::geom_violin(
            ggplot2::aes(x = !!ggplot2::sym("name"), 
                y = !!ggplot2::sym("value")), 
            na.rm = TRUE)
    g <- g + suppressWarnings(
        ggplot2::geom_point(
            ggplot2::aes(x = !!ggplot2::sym("x_jitter"), 
                y = !!ggplot2::sym("value"), color = !!ggplot2::sym("name"),
                text = !!ggplot2::sym("feature"))))
    if (lines) g <- g + ggplot2::geom_line(
        ggplot2::aes(x = !!ggplot2::sym("x_jitter"), 
            y = !!ggplot2::sym("value"), group = !!ggplot2::sym("feature")))
    g <- g + ggplot2::ylab("coefficient of variation") + 
        ggplot2::xlab("processing step") +
        ggplot2::theme_classic() + ggplot2::theme(legend.position = "none") 
    plotly::ggplotly(g, tooltip = c("text", "y"))
}

## normalization allows general-purpose adjustment for differences among your 
## sample; data transformation and scaling are two different approaches to make 
## individual features more comparable. You can use one or combine them to 
## achieve better results. 
## Sample Normalization: sample-specific normalization (weight, volumne),
## normalization by sum/median/reference sample or feature/pooled sample 
## from group, quantile normalization
## data transformation: log transformation, cube root transformation
## data scaling: mean scaling (mean-centered only), auto scaling 
## (mean-centered and divided by sd), pareto scaling (mean-centered and divided
## by square root of sd), range scaling (mean-centered and diveded by range of 
## each variable)


#' @name normalizeAssay
#'
#' @title Normalize a data sets (reduce technical sample effects)
#'
#' @description
#' The function \code{normalizeAssay} performs normalization by sum of the 
#' (count/intensity) values per sample (\code{method = "sum"}), quantile 
#' division per sample (\code{method = "quantile division"}), 
#' or by quantile normalization (adjusting the value distributions that they 
#' become identical in statistical properties, \code{method = "quantile"}). 
#' The value for quantile division (e.g., the 75% quantile per sample) can be 
#' specified by the \code{probs} argument. Quantile normalization is 
#' performed by using the \code{normalizeQuantiles} function from \code{limma}.
#' 
#' For the methods \code{"sum"} and \code{"quantile division"}, normalization
#' will be done depending on the \code{multiplyByNormalizationValue} parameter.
#' If set to \code{TRUE}, normalization values (e.g. sum or quantile) will be
#' calculated per sample. In a next step, adjusted normalization values will 
#' be calculated for each sample in relation to the median normalization 
#' values across all samples. Finally, the values in \code{a} are 
#' multiplied by these adjusted normalization values.
#' If \code{multiplyByNormalizationValue} is set to \code{FALSE}, 
#' normalization values (e.g. sum or quantile) will be
#' calculated per sample. The values in \code{a} are sample-wise divided by
#' the normalization values. 
#' 
#' @details
#' Internal usage in \code{shinyQC}. If \code{method} is set to \code{"none"}, 
#' the object \code{x} is returned as is (pass-through).
#' 
#' If \code{probs} is NULL, \code{probs} is internally set to 0.75 if 
#' \code{method = "quantile division"}.
#' 
#' Depending on the values in \code{a}, if \code{multiplyByNormalizationValue}
#' is set to \code{TRUE} the returned normalized values will be in the same 
#' order of magnitude than the original values, while if \code{FALSE}, the 
#' returned values will be in a smaller order of magnitude.
#' 
#' @param a \code{matrix} with samples in columns and features in rows
#' @param method \code{character}, one of \code{"none"}, \code{"sum"}, 
#' \code{"quantile division"}, \code{"quantile"}
#' @param probs \code{numeric}, ranging between \code{[0, 1)}. \code{probs} 
#' is used as the divisor for quantile division in 
#' \code{method = "quantile division"}
#' @param multiplyByNormalizationValue \code{logical}, if TRUE, normalization 
#' values will be calculated and the values in \code{a} will be multiplied by 
#' the values The parameter is only relavant for \code{method = "sum"} and 
#' \code{method = "quantile division"}
#'
#' @examples
#' a <- matrix(seq_len(100), nrow = 10, ncol = 10, 
#'         dimnames = list(seq_len(10), paste("sample", seq_len(10))))
#' normalizeAssay(a, "sum")
#' 
#' @return \code{matrix}
#'
#' @importFrom limma normalizeQuantiles
#' @importFrom stats quantile
#'  
#' @export
normalizeAssay <- function(a, 
    method = c("none", "sum", "quantile division", "quantile"), probs = 0.75,
    multiplyByNormalizationValue = FALSE) {
    
    if (!is.matrix(a)) stop ("a is not a matrix")
    method <- match.arg(method)
    
    a_n <- a

    if (method == "sum") {
        contains_negative <- a < 0
        contains_negative[is.na(contains_negative)] <- FALSE
        if (any(contains_negative))
            print("'a' contains negative values. Normalization may lead to undesired results.")
        
        if (multiplyByNormalizationValue) {
            normFactors <- apply(a_n, 2, sum, na.rm = TRUE)
            adjustedNormFactors <- median(normFactors, na.rm = TRUE) / normFactors
            a_n <- apply(a_n, 1, function(rows_i) 
                rows_i * adjustedNormFactors)
            
            ## transpose the matrix if the a_n was transposed by apply
            if (!all(colnames(a) %in% colnames(a_n)))
                a_n <- t(a_n)
        } else {
            a_n <- apply(a_n, 2, 
                function(cols_i) cols_i / sum(cols_i, na.rm = TRUE))    
        }
    }
    if (method == "quantile division") {
        contains_negative <- a < 0
        contains_negative[is.na(contains_negative)] <- FALSE
        if (any(contains_negative))
            print("'a' contains negative values. Normalization may lead to undesired results.")
        if (is.null(probs)) 
            probs <- 0.75
        
        if (multiplyByNormalizationValue) {
            normFactors <- apply(a_n, 2, function(cols_i) 
                stats::quantile(cols_i, probs = probs, na.rm = TRUE))
            adjustedNormFactors <- median(normFactors, na.rm = TRUE) / normFactors
            a_n <- apply(a_n, 1, function(rows_i) 
                rows_i * adjustedNormFactors)
            
            ## transpose the matrix if the a_n was transposed by apply
            if (!all(colnames(a) %in% colnames(a_n)))
                a_n <- t(a_n)
        } else {
            a_n <- apply(a_n, 2,
                function(cols_i) cols_i / stats::quantile(cols_i, 
                    probs = probs, na.rm = TRUE)) 
        }
        
    }
    if (method == "quantile") {
        cols_nona <- apply(a_n, 2, function(cols_i) !all(is.na(cols_i)))
        cols_nona <- names(cols_nona[cols_nona])
        a_n[, cols_nona] <- limma::normalizeQuantiles(a_n[, cols_nona], 
            ties = TRUE)
    }
    
    rownames(a_n) <- rownames(a)
    colnames(a_n) <- colnames(a)
    a_n
}

#' @name batchCorrectionAssay
#'
#' @title Remove batch effects from (count/intensity) values of a 
#' \code{SummarizedExperiment}
#'
#' @description
#' The function \code{batchCorrectionAssay} removes the batch effect of 
#' (count/intensity) values of a \code{SummarizedExperiment}. 
#' It uses either the \code{removeBatchEffect} or \code{ComBat} functions 
#' or no batch effect correction method (pass-through, 
#' \code{none}).
#'
#' @details 
#' The column \code{batch} in \code{colData(se)} contains the information 
#' on the batch identity. For \code{method = "removeBatchEffect (limma)"},
#' \code{batch2} may indicate a second series of batches. 
#' Internal use in \code{shinyQC}.
#' 
#' If \code{batch} is NULL and \code{method} is set to 
#' \code{method = "removeBatchEffect (limma)"} or \code{method = "ComBat"},
#' no batch correction will be performed (equivalent to 
#' \code{method = "none"}).
#' 
#' The method \code{ComBat} will only perform batch correction on valid 
#' features: (1) more or equal than two observations (no \code{NA}) per level 
#' and per feature, (2) variance greater than 0 per feature, and (3) more than
#' two valid features as given by (1) and (2). For non-valid features, values
#' are taken from \code{assay(se)}.
#'
#' @param se \code{SummarizedExperiment}
#' @param method \code{character}, one of \code{"none"} or 
#' \code{"removeBatchEffect"}
#' @param batch \code{character}, \code{NULL} or one of 
#' \code{colnames(colData(se))}
#' @param batch2 \code{character}, \code{NULL} or one of
#' \code{colnames(colData(se))}
#' @param ... further arguments passed to \code{removeBatchEffect} or 
#' \code{ComBat} 
#'
#' @examples
#' ## create se
#' a <- matrix(seq_len(100), nrow = 10, ncol = 10, 
#'     dimnames = list(seq_len(10), paste("sample", seq_len(10))))
#' a[c(1, 5, 8), seq_len(5)] <- NA
#' set.seed(1)
#' a <- a + rnorm(100)
#' cD <- data.frame(name = colnames(a), 
#'     type = c(rep("1", 5), rep("2", 5)), batch = rep(c(1, 2), 5))
#' rD <- data.frame(spectra = rownames(a))
#' se <- SummarizedExperiment::SummarizedExperiment(assay = a, 
#'     rowData = rD, colData = cD)
#' 
#' ## method = "removeBatchEffect (limma)"
#' batchCorrectionAssay(se, method = "removeBatchEffect (limma)", 
#'     batch = "batch", batch2 = NULL)
#' 
#' ## method = "ComBat"
#' batchCorrectionAssay(se, method = "ComBat", 
#'     batch = "batch", batch2 = NULL)
#' 
#' @return \code{matrix}
#' 
#' @importFrom limma removeBatchEffect
#' @importFrom SummarizedExperiment assay
#' @importFrom sva ComBat
#' @importFrom stats var
#' 
#' @export
batchCorrectionAssay <- function(se, 
        method = c("none", "removeBatchEffect (limma)", "ComBat"), 
        batch = NULL, batch2 = NULL, ...) {
    
    method <- match.arg(method)
    ##batch <- match.arg(batch)
    ##batch2 <- match.arg(batch2)
    a <- SummarizedExperiment::assay(se)
    a_b <- as.matrix(a)
    
    if (method == "removeBatchEffect (limma)") {
        
        cD <- se@colData
        
        ## check, when batch and batch2 are not NULL, that they match
        ## with the colnames of colData, assign the values of colData
        if (!is.null(batch)) {
            batch <- match.arg(batch, choices = colnames(cD))
            batch <- cD[[batch]]
        }
        
        if (!is.null(batch2)) {
            batch2 <- match.arg(batch2, choices = colnames(cD))
            batch2 <- cD[[batch2]]
        }

        ## perform batch correction
        a_b <- limma::removeBatchEffect(a_b, batch = batch, batch2 = batch2, 
            ...)
    }
    
    if (method == "ComBat") {
        
        cD <- se@colData
        
        ## check, when batch is not NULL, that it matches with the colnames
        ## of colData, assign the values of colData
        if (!is.null(batch)) {
            batch <- match.arg(batch, choices = colnames(cD))
            batch <- cD[[batch]]
            
            ## require at least 2 observations per feature and level
            batch_u <- unique(batch)
            keep_rows <- rep(TRUE, nrow(se))
            for (i in batch_u) {
                sum_nas <- apply(a_b[, batch == i, drop = FALSE], 1, 
                    FUN = function(row) sum(!is.na(row)))
                keep_rows[sum_nas < 2] <- FALSE
            }
            
            ## now check the variance, set keep_rows to FALSE if variance == 0
            keep_rows[apply(a_b, 1, stats::var, na.rm = TRUE) == 0] <- FALSE
            
            ## perform batch correction for the valid features (if there are
            ## more or equal than 2), use default values
            if (sum(keep_rows) >= 2) {
                print(paste("Perform batch correction using", sum(keep_rows), 
                    "features."))
                a_b[keep_rows, ] <- sva::ComBat(a_b[keep_rows, ], batch = batch,
                    ...)
            } else {
                print("No batch correction performed.")
            }
        }
    }
    
    ## assign rownames and colnames
    rownames(a_b) <- rownames(a)
    colnames(a_b) <- colnames(a)
    
    ## return
    a_b
}

#' @name transformAssay
#'
#' @title Transform the (count/intensity) values of a \code{data.frame}, 
#' \code{tbl} or \code{matrix} 
#' 
#' @description
#' The function \code{transformAssay} transforms the (count/intensity) values 
#' of a  \code{matrix}. It uses either \code{log}, \code{log2}, \code{log10},  
#' variance stabilizing normalisation (\code{vsn}) or no transformation method 
#' (pass-through, \code{none}). The object
#' \code{x} has the samples in the columns and the features in the rows.
#'
#' @details 
#' Internal use in \code{shinyQC}.
#' 
#' @param a \code{matrix} with samples in columns and features in rows
#' @param method \code{character}, one of \code{"none"}, \code{"log"}, 
#' \code{"log2"}, \code{"log10"}, or \code{"vsn"}
#' @param .offset \code{numeric(1)}, offset to add when \code{method} set to
#' \code{"log"}, \code{"log2"}, or \code{"log10"} and 
#' \code{a} contains values of 0, default to 1 
#'
#' @examples
#' a <- matrix(seq_len(1000), nrow = 100, ncol = 10, 
#'         dimnames = list(seq_len(100), paste("sample", seq_len(10))))
#' transformAssay(a, "none")
#' transformAssay(a, "log")
#' transformAssay(a, "log2")
#' transformAssay(a, "vsn")
#'
#' @return \code{matrix}
#' 
#' @importFrom vsn vsn2
#' 
#' @export
transformAssay <- function(a, method = c("none", "log", "log2", "log10", "vsn"),
    .offset = 1) {
    
    if (!is.matrix(a)) stop("a is not a matrix")
    method <- match.arg(method)
    
    if (length(.offset) != 1)
        stop("'.offset' has to be of length 1")

    a_t <- a

    if (method == "log") {
        if (any(a_t == 0, na.rm = TRUE)) {
            a_t <- log(a_t + .offset)
        } else {
            a_t <- log(a_t)
        }
    }
    if (method == "log2") {
        if (any(a == 0, na.rm = TRUE)) {
            a_t <- log2(a_t + .offset)
        } else {
            a_t <- log2(a_t)
        }
    }
    if (method == "log10") {
        if (any(a == 0, na.rm = TRUE)) {
            a_t <- log10(a_t + .offset)
        } else {
            a_t <- log10(a_t)
        }
    }
    if (method == "vsn") {
        a_t <- vsn2(a_t)
        a_t <- a_t@hx
    }
    
    rownames(a_t) <- rownames(a) 
    colnames(a_t) <- colnames(a)
    a_t
}

#' @name imputeAssay
#' 
#' @title Impute missing values in a \code{matrix}
#' 
#' @description 
#' The function \code{impute} imputes missing values based on one of the 
#' following principles: Bayesian missing value imputation (\code{BPCA}), 
#' k-nearest neighbor averaging (\code{kNN}), Malimum likelihood-based 
#' imputation method using the EM algorithm (\code{MLE}), replacement by 
#' the smallest non-missing value
#' in the data (\code{Min}), replacement by the minimal value observed as
#' the q-th quantile (\code{MinDet}, default \code{q = 0.01}), and replacement
#' by random draws from a Gaussian distribution centred to a minimal value 
#' (\code{MinProb}).
#'
#' @details
#' \code{BPCA} wrapper for \code{pcaMethods::pca} with \code{methods = "bpca"}. 
#' \code{BPCA} is a missing at random (MAR) imputation method. 
#' 
#' \code{kNN} wrapper for \code{impute::impute.knn} with \code{k = 10}, 
#' \code{rowmax = 0.5}, \code{colmax = 0.5}, \code{maxp = 1500}. \code{kNN} 
#' is a MAR imputation method.
#' 
#' \code{MLE} wrapper for \code{imputeLCMD::impute.MAR} with 
#' \code{method = "MLE"}, 
#' \code{model.selector = 1}/\code{imputeLCMD::impute.wrapper.MLE}. 
#' \code{MLE} is a MAR imputation method.
#' 
#' \code{Min} imputes the missing values by the observed minimal value of 
#' \code{x}. \code{Min} is a missing not at random (MNAR) imputation method.
#' 
#' \code{MinDet} is a wrapper for \code{imputeLCMD::impute.MinDet} with 
#' \code{q = 0.01}. \code{MinDet} performs the imputation using a 
#' deterministic minimal value approach. The missing entries are
#' replaced with a minimal value, estimated from the \code{q}-th quantile 
#' from each sample. \code{MinDet} is a MNAR imputation method.
#' 
#' \code{MinProb} is a wrapper for \code{imputeLCMD::impute.MinProb} with 
#' \code{q = 0.01} and \code{tune.sigma = 1}. \code{MinProb} performs the 
#' imputation based on random draws from a Gaussion distribution with the 
#' mean set to the minimal value of a sample. \code{MinProb} is a 
#' MNAR imputation method.
#' 
#' \code{MinProb} does not impute values (not available within shiny 
#' application).  
#' 
#' @param a \code{matrix} with samples in columns and features in rows
#' @param method \code{character}, one of \code{"BPCA"}, \code{"kNN"}, 
#' \code{"MLE"}, \code{"Min"}, 
#' \code{"MinDet"}, \code{"MinProb"}, or \code{"none"}
#'
#' @examples
#' a <- matrix(seq_len(100), nrow = 10, ncol = 10, 
#'     dimnames = list(seq_len(10), paste("sample", seq_len(10))))
#' a[c(1, 5, 8), seq_len(5)] <- NA
#' 
#' imputeAssay(a, method = "kNN")
#' imputeAssay(a, method = "Min")
#' imputeAssay(a, method = "MinDet")
#' imputeAssay(a, method = "MinProb")
#' 
#' @return \code{matrix}
#' 
#' @importFrom imputeLCMD impute.MinDet impute.MinProb
#' @importFrom impute impute.knn
#' @importFrom pcaMethods pca completeObs
#' 
#' @export
imputeAssay <- function(a,
    method = c("BPCA", "kNN", "MLE", "Min", "MinDet", "MinProb", "none")) {

    if (!is.matrix(a)) stop("a is not a matrix")
    method <- match.arg(method)

    ## convert the data.frame into matrix and transpose, this will impute
    ## the per row (feature-wise)
    a_i <- as.matrix(a) |>
        t()

    if (method == "BPCA") {
        n_samp <- nrow(a_i)
        ## expects a matrix with features in columns, samples in rows
        res <- pcaMethods::pca(a_i, method = "bpca", nPcs = (n_samp - 1),
            verbose = FALSE, maxSteps = 200, center = TRUE, scale = "uv")
        a_i <- pcaMethods::completeObs(res)
    }

    if (method == "kNN") {
        ## expects a matrix with features in rows, samples in columns
        a_i <- impute::impute.knn(data = t(a_i))$data |>
            t()
    }

    if (method == "MLE") {
        ## expects a matrix with features in columns, samples in rows
        a_i <- imputeLCMD::impute.wrapper.MLE(dataSet.mvs = a_i)
    }
        
    if (method == "Min") {
        min_val <- min(a_i, na.rm = TRUE)
        a_i[is.na(a_i)] <- min_val
    }
        
    if (method == "MinDet") {
        ## expects a matrix with features in columns, samples in rows
        a_i <- imputeLCMD::impute.MinDet(dataSet.mvs = a_i, q = 0.01)
    }
        
    if (method == "MinProb") {
        ## expects a matrix with features in columns, samples in rows
        a_i <- imputeLCMD::impute.MinProb(dataSet.mvs = a_i, q = 0.01, 
            tune.sigma = 1)
    }

    t(a_i)
}
tnaake/MatrixQCvis documentation built on Nov. 4, 2024, 7:42 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
tnaake/MatrixQCvis
Shiny-based interactive data-quality exploration for omics data

R/tab_values.R
In tnaake/MatrixQCvis: Shiny-based interactive data-quality exploration for omics data

Defines functions imputeAssay transformAssay batchCorrectionAssay normalizeAssay cvFeaturePlot featurePlot createDfFeature MAplot hoeffDPlot hoeffDValues MAvalues sumDistSample distSample distShiny ECDF plotCV cv driftPlot createBoxplot

Documented in batchCorrectionAssay createBoxplot createDfFeature cv cvFeaturePlot distSample distShiny driftPlot ECDF featurePlot hoeffDPlot hoeffDValues imputeAssay MAplot MAvalues normalizeAssay plotCV sumDistSample transformAssay

R Package Documentation

Browse R Packages

We want your feedback!

tnaake/MatrixQCvis Shiny-based interactive data-quality exploration for omics data

R/tab_values.R In tnaake/MatrixQCvis: Shiny-based interactive data-quality exploration for omics data

Defines functions imputeAssay transformAssay batchCorrectionAssay normalizeAssay cvFeaturePlot featurePlot createDfFeature MAplot hoeffDPlot hoeffDValues MAvalues sumDistSample distSample distShiny ECDF plotCV cv driftPlot createBoxplot

Documented in batchCorrectionAssay createBoxplot createDfFeature cv cvFeaturePlot distSample distShiny driftPlot ECDF featurePlot hoeffDPlot hoeffDValues imputeAssay MAplot MAvalues normalizeAssay plotCV sumDistSample transformAssay

R Package Documentation

Browse R Packages

We want your feedback!

tnaake/MatrixQCvis
Shiny-based interactive data-quality exploration for omics data

R/tab_values.R
In tnaake/MatrixQCvis: Shiny-based interactive data-quality exploration for omics data