#' Statistics for basic information
#' @param expr_dt A expression profile
#' @param meta_dt A metadata file
#' @param output_dir A directory of the output file(s)
#' @importFrom psych corr.test
#' @importFrom reshape2 melt
#' @export
qc_info <- function(expr_dt, meta_dt) {
# Load data --------------------------------
m <- meta_dt[, colnames(meta_dt) %in% c("library", "sample")]
d <- expr_dt
s <- meta_dt$sample
# Replace zero by NA -----------------------
d[d == 0] <- NA
# Statistics: number of features -----------
uniq_pro <- unique(d[, 1])
stat_num <- length(uniq_pro)
# Statistics: missing percentage -----------
d_all_num <- nrow(d) * (ncol(d) - 1)
d_missing_num <- length(which(is.na(d)))
prop_missing <- d_missing_num * 100 / d_all_num
stat_missing <- round(prop_missing, 3)
# Check if replicates available ------------
samples <- table(s)
rep_samples <- samples[samples > 1]
rep_num <- length(rep_samples)
if (rep_num == 0) {
stop("No replicates are available.")
} else {
# Calculating: absolute correlation ------
d_mtx <- d[, 2:ncol(d)]
d_cortest <- corr.test(d_mtx, method = "pearson", adjust = "fdr")
d_pmtx <- d_cortest$p
d_cormtx <- d_cortest$r
d_cormtx[d_pmtx > 0.05] <- 0
d_cordf <- melt(d_cormtx)
d_cordf <- d_cordf[d_cordf$Var2 != d_cordf$Var1, ]
d_cordf <- merge(d_cordf, m, by.x = "Var1", by.y = "library")
d_cordf <- merge(d_cordf, m, by.x = "Var2", by.y = "library")
d_cordf <- d_cordf[d_cordf$sample.x == d_cordf$sample.y, ]
cor_value <- median(d_cordf$value)
stat_acor <- round(cor_value, 3)
}
# Calculating: CV --------------------------
d_long <- melt(d)
d_long <- na.omit(d_long)
if (length(d_long$value[d_long$value < 0])) {
d_long$value <- 2 ^ (d_long$value)
message("All values were squared to avoid negative values.")
}
d_long <- merge(d_long, m, by.x = "variable", by.y = "library")
colnames(d_long) <- c("library", "feature", "value", "sample")
d_cv <- aggregate(
value ~ feature + sample, data = d_long,
FUN = function(x) sd(x) / mean(x))
stat_cv <- round(median(d_cv$value, na.rm = T) * 100, 3)
# Output -----------------------------------
stat_all <- c(stat_num, stat_missing, stat_acor, stat_cv)
return(stat_all)
}
#' Calculating SNR value; Plotting a PCA panel
#' @param expr_dt A expression profile (at protein level)
#' @param meta_dt A metadata file
#' @param output_dir A directory of the output file(s)
#' @importFrom data.table data.table
#' @importFrom data.table setkey
#' @importFrom ggplot2 ggplot
#' @importFrom ggplot2 aes
#' @importFrom ggplot2 theme
#' @importFrom ggplot2 labs
#' @importFrom ggplot2 geom_point
#' @importFrom ggplot2 scale_color_manual
#' @importFrom ggplot2 scale_x_continuous
#' @importFrom ggplot2 scale_y_continuous
#' @importFrom ggplot2 guides
#' @importFrom ggplot2 guide_legend
#' @importFrom ggplot2 ggsave
#' @importFrom ggthemes theme_few
#' @export
qc_snr <- function(expr_dt, meta_dt, output_dir=NULL) {
# Load data --------------------------------------
expr_ncol <- ncol(expr_dt)
expr_df <- data.frame(expr_dt[, 2:expr_ncol], row.names = expr_dt[, 1])
# Replace NA by zero -----------------------------
expr_df[is.na(expr_df)] <- 0
# Label the grouping info ------------------------
ids <- colnames(expr_df)
group <- meta_dt$sample
ids_group_mat <- data.table(id = ids, group = group)
# PCA --------------------------------------------
expr_df_t <- t(expr_df)
pca_prcomp <- prcomp(expr_df_t, retx = T, scale. = T)
pcs <- as.data.frame(predict(pca_prcomp))
pcs$sample_id <- rownames(pcs)
pcs$sample <- meta_dt$sample
# Calculating: SNR -------------------------------
dt_perc_pcs <- data.table(PCX = 1:nrow(pcs),
Percent = summary(pca_prcomp)$importance[2, ],
AccumPercent = summary(pca_prcomp)$importance[3, ])
dt_dist <- data.table(id_a = rep(ids, each = length(ids)),
id_b = rep(ids, time = length(ids)))
dt_dist$group_a <- ids_group_mat[match(dt_dist$id_a, ids_group_mat$id)]$group
dt_dist$group_b <- ids_group_mat[match(dt_dist$id_b, ids_group_mat$id)]$group
dt_dist[, type:= ifelse(id_a == id_b,
"Same",
ifelse(group_a == group_b, "Intra", "Inter"))]
dt_dist[, dist:= (dt_perc_pcs[1]$Percent * (pcs[id_a, 1] - pcs[id_b, 1])^2 +
dt_perc_pcs[2]$Percent * (pcs[id_a, 2] - pcs[id_b, 2])^2)]
dt_dist_stats <- dt_dist[, .(avg_dist = mean(dist)), by = .(type)]
setkey(dt_dist_stats, type)
signoise <- dt_dist_stats["Inter"]$avg_dist / dt_dist_stats["Intra"]$avg_dist
signoise_db <- round(10 * log10(signoise), 3)
# Plot -------------------------------------------
colors_custom <- c("D5" = "#4CC3D9",
"D6" = "#7BC8A4",
"F7" = "#FFC65D",
"M8" = "#F16745")
text_custom_theme <- element_text(family = "Arial",
size = 16,
face = "plain",
color = "black",
hjust = 0.5)
scale_axis_x <- c(min(pcs$PC1), max(pcs$PC1))
scale_axis_y <- c(min(pcs$PC2), max(pcs$PC2))
pc1_prop <- summary(pca_prcomp)$importance[2, 1]
pc2_prop <- summary(pca_prcomp)$importance[2, 2]
text_axis_x <- sprintf("PC1(%.2f%%)", pc1_prop * 100)
text_axis_y <- sprintf("PC2(%.2f%%)", pc1_prop * 100)
limit_x <- c(1.1 * scale_axis_x[1], 1.1 * scale_axis_x[2])
limit_y <- c(1.1 * scale_axis_y[1], 1.1 * scale_axis_y[2])
p_title <- paste("SNR = ", signoise_db, sep = "")
p_subtitle <- paste("(Number of proteins = ", nrow(expr_dt), ")", sep = "")
p <- ggplot(pcs, aes(x = PC1, y = PC2)) +
geom_point(aes(color = sample), size = 8) +
theme_few() +
theme(plot.title = text_custom_theme,
plot.subtitle = text_custom_theme,
axis.title = text_custom_theme,
axis.text = text_custom_theme,
legend.title = text_custom_theme,
legend.text = element_text(size = 16, color = "gray40")) +
labs(x = text_axis_x,
y = text_axis_y,
title = p_title,
subtitle = p_subtitle) +
scale_color_manual(values = colors_custom) +
scale_x_continuous(limits = limit_x) +
scale_y_continuous(limits = limit_y) +
guides(colour = guide_legend(override.aes = list(size = 2))) +
guides(shape = guide_legend(override.aes = list(size = 3)))
pc_num <- ncol(pcs)
output <- data.table(pcs[, c((pc_num - 1):pc_num, 1:(pc_num - 2))])
# Save & Output ----------------------------------
if (!is.null(output_dir)) {
output_dir_final1 <- file.path(output_dir, "pca_plot.png")
ggsave(output_dir_final1, p, width = 6, height = 5.5)
output_dir_final2 <- file.path(output_dir, "pca_table.tsv")
write.table(output, output_dir_final2, sep = "\t", row.names = F)
}
return(list(plot = p, table = output, SNR = signoise_db))
}
#' Analysis: differential expression
#' @param expr A expression table file (at peptide level)
#' @param group The grouping info
#' @importFrom edgeR DGEList
#' @importFrom edgeR filterByExpr
#' @importFrom edgeR calcNormFactors
#' @importFrom limma voom
#' @importFrom limma lmFit
#' @importFrom limma eBayes
#' @importFrom limma topTable
#' @export
dep_analysis <- function(expr, group) {
dge <- DGEList(counts = expr)
design <- model.matrix(~ group)
keep <- filterByExpr(dge, design)
dge <- dge[keep, , keep.lib.sizes = FALSE]
dge <- calcNormFactors(dge)
v <- voom(dge, design, plot = F)
fit <- lmFit(v, design)
fit <- eBayes(fit)
result <- topTable(fit, coef = ncol(design), sort.by = "logFC", number = Inf)
result$Sequence <- rownames(result)
result$Sequence.Number <- nrow(result)
result$Sample1 <- levels(group)[1]
result$Sample2 <- levels(group)[2]
result$Sample.Pair <- paste(levels(group)[2], levels(group)[1], sep = "/")
return(result)
}
#' Calculating RC value; Plotting a scatterplot
#' @param expr_dt A expression table file (at peptide level)
#' @param meta_dt A metadata file
#' @param output_dir A directory of the output file(s)
#' @importFrom ggplot2 element_text
#' @importFrom ggplot2 ggplot
#' @importFrom ggplot2 aes
#' @importFrom ggplot2 theme
#' @importFrom ggplot2 labs
#' @importFrom ggplot2 coord_fixed
#' @importFrom ggplot2 geom_point
#' @importFrom ggplot2 scale_color_manual
#' @importFrom ggthemes theme_few
#' @importFrom data.table as.data.table
#' @importFrom ggplot2 ggsave
#' @export
qc_cor <- function(expr_dt, meta_dt, output_dir=NULL, show_sample_pairs=F) {
# Load data ------------------------------------------------------
ref_dt_dir <- file.path(
system.file(package = "protqc"), "data/reference_dataset.rds")
ref_dt <- readRDS(ref_dt_dir)
expr_ncol <- ncol(expr_dt)
expr_df <- data.frame(expr_dt[, 2:expr_ncol], row.names = expr_dt[, 1])
expr_matrix <- as.matrix(expr_df)
# Replace NA by zero ---------------------------------------------
expr_matrix[is.na(expr_matrix)] <- 0
# Check the grouping info ----------------------------------------
samples <- unique(meta_dt$sample)
if (length(samples) > 1) {
check_d6 <- "D6" %in% samples
if (!check_d6) {
stop("No D6 samples are available.")
} else {
samples <- c("D6", samples[!samples %in% "D6"])
pair_num <- length(samples[!samples %in% "D6"])
}
} else {
stop("No grouping info.")
}
# Analysis: Differential expression ------------------------------
result_final <- c()
for (j in 2:(pair_num + 1)) {
sample_pair <- paste(samples[j], "D6", sep = "/")
ref_tmp <- ref_dt[ref_dt$Sample.Pair %in% sample_pair, ]
col1 <- grep(samples[j], colnames(expr_matrix))
col2 <- grep("D6", colnames(expr_matrix))
e_tmp <- expr_matrix[, c(col1, col2)]
e_tmp <- e_tmp[apply(e_tmp, 1, function(x) length(which(x == 0)) < 3), ]
expr_grouped <- e_tmp[rownames(e_tmp) %in% ref_tmp$Sequence, ]
sample_pairs <- factor(x = rep(c(samples[j], "D6"), each = 3),
levels = c("D6", samples[j]),
ordered = T)
result_tmp <- dep_analysis(expr = expr_grouped, group = sample_pairs)
result_tmp <- result_tmp[result_tmp$adj.P.Val < 0.05, ]
result_final <- rbind(result_final, result_tmp)
}
# Calculating: RC -----------------------------------------------
result_final <- as.data.table(result_final)
result_trim <- result_final[, c(7, 11, 1)]
result_trim$name <- apply(result_trim, 1, function(x) paste(x[1], x[2]))
ref_dt$name <- apply(ref_dt, 1, function(x) paste(x[1], x[2]))
result_withref <- merge(result_trim, ref_dt, by = "name")
df_test <- data.frame(result_withref[, c(1, 2, 3, 4, 7)])
colnames(df_test) <- c("Name", "Sequence", "Sample.Pair",
"logFC.Test", "logFC.Reference")
cor_value <- cor(x = df_test$logFC.Test, y = df_test$logFC.Reference)
cor_value <- round(cor_value, 3)
# Plot ----------------------------------------------------------
text_custom_theme <- element_text(family = "Arial",
size = 16,
face = "plain",
color = "black",
hjust = 0.5)
scale_axis_r <- c(min(df_test$logFC.Reference),
max(df_test$logFC.Reference))
scale_axis_t <- c(min(df_test$logFC.Test),
max(df_test$logFC.Test))
limit <- max(abs(c(scale_axis_r, scale_axis_t)))
limit_axis <- c(- limit, limit)
plot_title <- paste("RC = ", cor_value, sep = "")
plot_subtitle <- paste("(Number of peptides = ", nrow(df_test), ")", sep = "")
p <- ggplot(df_test, aes(x = logFC.Reference, y = logFC.Test)) +
theme_few() +
theme(plot.title = text_custom_theme,
plot.subtitle = text_custom_theme,
axis.title = text_custom_theme,
axis.text = text_custom_theme,
legend.title = text_custom_theme,
legend.text = element_text(size = 16, color = "gray40")) +
labs(y = "log2FC (Test Dataset)",
x = "log2FC (Reference Datasets)",
title = plot_title,
subtitle = plot_subtitle) +
coord_fixed(xlim = limit_axis, ylim = limit_axis)
if (show_sample_pairs == T) {
colors_custom <- c("D5/D6" = "#4CC3D9",
"F7/D6" = "#FFC65D",
"M8/D6" = "#F16745")
p <- p + geom_point(aes(color = Sample.Pair), size = 2.5, alpha = .5) +
scale_color_manual(values = colors_custom)
}else {
p <- p + geom_point(color = "steelblue4", size = 2.5, alpha = .1)
}
# Save & Output -------------------------------------------------
if (!is.null(output_dir)) {
output_dir_final1 <- file.path(output_dir, "corr_plot.png")
output_dir_final2 <- file.path(output_dir, "deps_table.tsv")
output_dir_final3 <- file.path(output_dir, "corr_table.tsv")
ggsave(output_dir_final1, p, height = 5.5, width = 5.5)
write.table(result_final, output_dir_final2, sep = "\t", row.names = F)
write.table(df_test, output_dir_final3, sep = "\t", row.names = F)
}
output_list <- list(plot = p,
DEPs = result_final,
logfc = df_test,
COR = cor_value)
return(output_list)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.