#!/usr/bin/env Rscript
# Load Required Packages
for (lib in c(
"dplyr",
"pbapply",
"lme4",
"lmerTest",
"car",
"cplm",
"pscl",
"logging",
"ggrepel",
"gridExtra",
"future",
"cowplot",
"openxlsx",
"readxl"
)) {
if (!suppressPackageStartupMessages(require(lib, character.only = TRUE)))
stop(paste("Please install the R package: ", lib))
}
# This script contains functions to manipulate metbolite profiling data
numeric_dataframe <- function(input) {
input[, c(1:ncol(input))] <-
sapply(sapply(input[, c(1:ncol(input))], as.character), as.numeric)
return (input)
}
# convert2mxpformat <- function(data, sample_metadata=NA, feature_metdata=NA){
#
# if(!is.na(feature_metdata)){
# combined <- as.matrix(cbind(t(feature_metadata), data))
# }
# if(!is.na(sample_metadata)){
# empty_corner <- matrix("", nrow = dim(sample_metadata)[1], ncol = dim(feature_metadata)[2])
# ext_sample_metadata <- as.matrix(cbind(empty_corner,sample_metadata))
# ext_sample_metadata[,dim(feature_metadata)[2]] <- rownames(sample_metadata)
# combined2 <- as.matrix(rbind(ext_sample_metadata,combined))
# }
#
# }
find_indx <- function(df, word = 'Metabolite')
{
colmn <- which(df == word) %/% nrow(df) + 1
row <- which(df == word) %% nrow(df)
return(c(row, colmn))
}
#' @export
load_data <-
function(input,
type = 'known',
sheet = 1,
ID = 'Metabolite') {
if (is.character(input)) {
#df <- read.xls (input, sheet = sheet, header = FALSE)
df <-
readxl::read_excel(input, sheet = sheet, col_names = FALSE) #much faster
} else {
df <- input
}
df <- as.data.frame(df)
df[df == "n/a"] <- NA
df[df == "NA"] <- NA
df[df == ""] <- NA
# remove columns with all NAs
df <- Filter(function(x)
!all(is.na(x)), df)
# find the occutance of Metabolite word in the file
ord <- find_indx(df, word = 'Metabolite')
row <- ord[1]
col <- ord[2]
Compound_ord <- find_indx(df, word = 'Compound_ID')
HMDB_ord <- find_indx(df, word = 'HMDB_ID')
MZ_ord <- find_indx(df, word = 'MZ')
RT_ord <- find_indx(df, word = 'RT')
# seprate sample metadata
col_names <- NA
if (row > 1) {
sample_metadata <- df[1:(row - 1), col:dim(df)[2]]
meta_rownames <- sample_metadata[[1]]
sample_metadata <-
sample_metadata[1:dim(sample_metadata)[1], 2:dim(sample_metadata)[2]]
col_names <- sapply(df[row, (col + 1):dim(df)[2]], as.factor)
colnames(sample_metadata) <- col_names
rownames(sample_metadata) <- meta_rownames
} else{
col_names <- sapply(df[row, (col + 1):dim(df)[2]], as.factor)
sample_metadata <- NA
}
# seprate feature metadata
f_col_names <- NA
if (col > 1) {
feature_metadata <- df[row:dim(df)[1], 1:col]
f_meta_rownames <- feature_metadata[[Compound_ord[2]]]
feature_metadata <-
feature_metadata[2:dim(feature_metadata)[1], 1:dim(feature_metadata)[2]]
f_col_names <- sapply(df[row, 1:col], as.factor)
colnames(feature_metadata) <- f_col_names
rownames(feature_metadata) <-
f_meta_rownames[-1] # ro remove "Compound" from the names
} else{
feature_metadata <- NA
}
df <- as.data.frame(df)
#seprate known data or generate a ID for unknows in the case of using all
if (type == 'known') {
if (ID == 'Metabolite') {
df <- df[!is.na(df[, col]), ]
} else if (ID == 'HMDB_ID') {
df <- df[!is.na(df[, HMDB_ord[2]]), ]
df <- df[!grepl('\\*', df[, HMDB_ord[2]]),]
df <- df[df[, HMDB_ord[2]] != "", ]
}
} else if (type == 'all') {
if (ID == 'Compound_ID') {
name_col <- df[, Compound_ord[2]] #df[,col-1]
} else{
if (ID == 'Metabolite') {
name_col = col
} else if (ID == 'HMDB_ID') {
name_col = HMDB_ord[2]
}
df[, col] <- ifelse(is.na(df[, name_col]),
paste(
df[, Compound_ord[2]],
'MZ',
round(as.numeric(df[, MZ_ord[2]]), digits = 4),
'RT',
round(as.numeric(df[, RT_ord[2]]), digits = 2),
sep = '_'
),
as.character(df[, name_col]))
}
}
#remove redundant ion
df[is.na(df)] <- "NA"
if (!is.na(HMDB_ord[2])) {
df <- df[which(df[, HMDB_ord[2]] != "redundant ion"),]
df <- df[df[, HMDB_ord[2]] != "Internal Standard", ]
}
df <- df[df[, col] != "Metabolite", ]
# remove feature start with NH4_
df <- df[!startsWith(sapply(df[, col], as.character), "NH4_"),]
df[df == "NA"] <- NA
df <- df[!is.na(df[, col]), ]
if (ID == 'HMDB_ID') {
row_names <-
gsub('\\*', '', as.matrix(df[row:dim(df)[1], HMDB_ord[2]]))
} else if (ID == 'Metabolite') {
row_names <- unlist(sapply(df[row:dim(df)[1], col], as.factor))
} else if (ID == 'Compound_ID') {
row_names <-
unlist(sapply(df[row:dim(df)[1], Compound_ord[2]], as.factor))
}
data <- df[row:dim(df)[1], col:dim(df)[2]]
data <- data[,-1]
# check if row_names has unique values to be used for row names
dup_names <- row_names[duplicated(row_names)]
if (length(dup_names) > 0) {
print(sprintf("There are duplicated values: %s", dup_names))
}
rownames(data) <- row_names
colnames(data) <- col_names
data <- as.data.frame(data)
result <- list()
# put data in the right ordination rows(observation or samples) columns features
result$sample_metadata <- as.data.frame(t(sample_metadata))
result$feature_metadata <- feature_metadata
data <- as.data.frame(t(data))
# make sure data frame is numeric
result$data <- numeric_dataframe(data)
return (result)
}
stats_tabletest_2d <- function(df1, df2) {
results <-
setNames(data.frame(matrix(
ncol = dim(df1)[2], nrow = dim(df2)[2]
)), colnames(df2))
rownames(results) <- colnames(df1)
for (i in 1:dim(df1)[2]) {
for (j in 1:dim(df2)[2]) {
if (i == j)
results[i, j] <- t.test(df1[, i], df2[, j])
}
}
return(results)
}
cor_2d <-
function(df1,
df2,
method = 'spearman',
use = "na.or.complete") {
results <-
setNames(data.frame(matrix(
ncol = dim(df1)[2], nrow = dim(df2)[2]
)), colnames(df2))
rownames(results) <- colnames(df1)
for (i in 1:dim(df1)[2]) {
for (j in 1:dim(df2)[2]) {
results[i, j] <-
cor(df1[, i], df2[, j], method = method , use = use)
}
}
return(results)
}
cor_within_and_between_2df <- function(df1,
df2,
output_path,
metadata = NA,
fillby = NA,
method = 'spearman',
use = "pairwise.complete.obs",
all = F,
plot_cor_threshold = .65,
plot_min_points_threshold = 10) {
commom_rows <- intersect(rownames(df1), rownames(df2))
df1 <- df1[commom_rows,]
df2 <- df2[commom_rows,]
n_row = dim(df1)[2]
if (all) {
n_row = dim(df1)[2] * (dim(df1)[2] - 1) + dim(df2)[2] * (dim(df2)[2] -
1) + dim(df1)[2]
}
stats_table <-
setNames(
data.frame(matrix(ncol = 7, nrow = n_row)),
c(
"Feature1",
"Feature2",
"Correlation",
"P.Value",
"Between",
"Dataset",
"Characterization"
)
)
N <- 0
# within dataset 1
if (all) {
for (i in 1:dim(df1)[2]) {
for (j in 1:dim(df1)[2]) {
if (i == j)
next
N <- N + 1
stats_table [N, ] <- c(
colnames(df1)[i],
colnames(df1)[j],
cor(df1[, i], df1[, j], method = method , use =
use),
NA,
"Within",
'1'
)
}
}
# within dataset 2
for (i in 1:dim(df2)[2]) {
for (j in 1:dim(df2)[2]) {
if (i == j)
next
N <- N + 1
stats_table [N, ] <- c(
colnames(df2)[i],
colnames(df2)[j],
cor(df2[, i], df2[, j], method = method , use =
use),
NA,
"Within",
'2'
)
}
}
}
# between two datasets
for (i in 1:dim(df1)[2]) {
bothNArows <- which(is.na(df1[, i]) | is.na(df2[, i]))
n_not_NA <- dim(df1)[1] - length(bothNArows)
if (n_not_NA < plot_min_points_threshold)
next
N <- N + 1
Characterization <-
ifelse(startsWith(colnames(df1)[i], 'QI'), 'Unknown', 'Known')
cor_test <-
cor.test(df1[, i], df2[, i], method = method , use = use)
#cor(df1[,i], df2[,i], method = method , use=use)
stats_table [N, ] <- c(
colnames(df1)[i],
colnames(df2)[i],
cor_test$estimate,
cor_test$p.value,
"Between",
3,
Characterization
)
}
stats_table <- stats_table[!is.na(stats_table$Correlation),]
stats_table <-
stats_table[order(-abs(as.numeric(stats_table$Correlation))),]
# write the stats table
write.table(
stats_table,
paste(output_path, '/', 'correlation_between_features.txt', sep = ''),
sep = "\t",
eol = "\n",
quote = F,
col.names = NA,
row.names = T
)
# distribution of corrlations
plot_file <-
paste(output_path, "/correlation_plots", fillby, ".pdf", sep = "")
pdf(plot_file,
width = 2.65,
height = 2.5,
onefile = TRUE)
steps <- 0.05
ggp <-
ggplot(stats_table,
aes(
x = as.numeric(Correlation),
#fill = Characterization,
na.rm = TRUE
)) +
#geom_density(alpha = 0.35, size = 0.1) +
geom_density(
aes(y = ..count..*steps, fill = Characterization, colour = Characterization),
alpha = 0.25,
size = 0.3
) +
#facet_wrap(~Characterization)+
geom_histogram(
aes(y = ..count..),
breaks = seq(-1, 1.0, by = steps),
size = 0.1,
col = "black",
fill = "gold",
alpha = .25
) +
guides(fill = guide_legend(title = NULL)) + scale_colour_discrete(guide = 'none') +
scale_x_continuous(limits = c(min(as.numeric(
stats_table$Correlation
)), max(as.numeric(
stats_table$Correlation
)))) +
xlab("Spearman Correlation") + ylab("Count") +
theme_nature() +
theme(legend.justification = c(1, 1),
legend.position = c(1, 1))
stdout <- capture.output(print(ggp), type = "message")
if (length(stdout) > 0)
logging::logdebug(stdout)
stats_table_plots <-
stats_table[abs(as.numeric(stats_table$Correlation)) > .7,]
scatter_plots <-
vector(mode = "list", length = dim(stats_table_plots)[1])
names(scatter_plots) <- stats_table_plots$Feature1
logging::loginfo("Plotting data ...")
for (i in 1:dim(stats_table_plots)[1]) {
#i = 1
x_label <- as.character(stats_table_plots[i, 'Feature1'])
y_label <- as.character(stats_table_plots[i, 'Feature2'])
pval <- as.numeric(stats_table_plots[i, 'P.Value'])
coef_val <- as.numeric(stats_table_plots[i, 'Correlation'])
input_df <- cbind(df1[x_label], df2[y_label])
n_not_NA <- 0
colnames(input_df) <- c("x", "y")
fillby_one_color <- NA
if (!is.na(metadata) & !is.na(fillby)){
input_df[, fillby] <- metadata[rownames(input_df), fillby]
}else {
input_df[, fillby] <- NA
fillby_one_color <- 'darkolivegreen4'
}
bothNArows <- which(is.na(input_df$x) | is.na(input_df$y))
n_not_NA <- dim(input_df)[1] - length(bothNArows)
print(n_not_NA)
# if Metadata is continuous generate a scatter plot
# Continuous is defined as numerical with more than 2 values (to exclude binary data)
if (plot_min_points_threshold > 0)
{
if (n_not_NA < plot_min_points_threshold)
# for positive correlation
next
}else if (n_not_NA > plot_min_points_threshold) {
# for negative correlations
next
}
temp_plot <- NULL
logging::loginfo("Creating scatter plot for continuous data, %s vs %s",
x_label,
y_label)
temp_plot <- ggplot2::ggplot(data = input_df,
ggplot2::aes(as.numeric(as.character(x)), as.numeric(as.character(y)))) +
ggplot2::geom_point(
aes(fill = get(fillby)),#'darkolivegreen4',
#fill = fillby_one_color,#'darkolivegreen4',
color = 'black',
alpha = .5,
shape = 21,
size = 1,
stroke = 0.15
) +
ggplot2::scale_x_continuous(limits = c(min(input_df['x']), max(input_df['x']))) +
ggplot2::scale_y_continuous(limits = c(min(input_df['y']), max(input_df['y']))) +
ggplot2::stat_smooth(
method = "glm",
color = 'blue',
size = .5,
na.rm = T
) +
ggplot2::guides(alpha = 'none') + ggplot2::labs("") +
ggplot2::xlab(x_label) + ggplot2::ylab(y_label) + theme_nature() +
guides(fill = guide_legend(title = fillby))
ggplot2::annotate(
geom = "text",
x = Inf,
y = Inf,
hjust = 1,
vjust = 1,
label = sprintf(
"P-Value: %s\nCoefficient: %s",
formatC(pval, format = "e", digits = 3),
formatC(coef_val, format = "e", digits = 2)
) ,
color = "black",
size = 2,
fontface = "italic"
)
if (!is.na(fillby_one_color)){
temp_plot <- temp_plot + ggplot2::geom_point(fill = fillby_one_color)
}
scatter_plots[[x_label]] <- temp_plot
stdout <- capture.output(print(temp_plot), type = "message")
if (length(stdout) > 0)
logging::logdebug(stdout)
}
dev.off()
results <- list()
results$stats_table <- stats_table
results$dist_plot <- ggp
results$scatter_plots <- scatter_plots
return(results)
}
# transformation function for reverse log1p axis
revlog_trans <- function(base = exp(1)) {
trans <- function(x)
- log1p(x)
inv <- function(x)
expm1(-x)
scales::trans_new("revlog1p", trans, inv, domain = c(0, Inf))
}
volcano_plot <-
function(stats_table,
threshold = 0.05,
method = 'nominal') {
if (method == 'nominal')
stats_table$fdr <- stats_table$P.Value
stats_table$feature[stats_table$fdr >= threshold] <- NA
p <-
ggplot(stats_table, aes(
x = logFC,
y = -log10(P.Value),
label = feature
)) +
scale_fill_gradient(low = "lightgray", high = "navy") +
scale_color_gradient(low = "lightgray", high = "navy") +
#scale_y_continuous(trans = revlog_trans(), expand = c(0.005, 0.005)) +
#scale_y_log10()+
#expand_limits(y = c(0, 1)) +
stat_density_2d(aes(fill = ..level..),
geom = "polygon",
show.legend = FALSE) +
geom_point(
data = subset(stats_table, fdr <= threshold & logFC > 0.0),
fill = "green",
color = 'black',
alpha = .5,
shape = 21,
size = 2,
stroke = 0.05
) +
geom_point(
data = subset(stats_table, fdr <= threshold & logFC < 0.0),
fill = 'red',
color = 'black',
alpha = .5,
shape = 21,
size = 2,
stroke = 0.05
) +
geom_point(
data = subset(stats_table, fdr > threshold),
fill = 'gray',
color = "black",
alpha = 0.2,
shape = 21,
size = 2,
stroke = 0.05
) +
geom_vline(xintercept = 0, size = 0.1) +
geom_hline(yintercept = 0, size = 0.1) +
geom_hline(
yintercept = -log10(threshold),
linetype = "dashed",
size = 0.1,
color = "red"
) +
geom_vline(
xintercept = c(-1, 1),
linetype = "dashed",
size = 0.1,
color = "red"
) +
theme_linedraw() +
theme(panel.grid = element_blank()) +
xlab("Fold change (log2)") +
ylab("P-Value (-log10)") +
annotate(
"text",
x = min(stats_table$logFC) + .1,
y = min(-log2(stats_table$P.Value)) + .75,
label = "Significant up",
size = 3,
color = "black",
hjust = 0
) +
annotate(
"point",
x = min(stats_table$logFC),
y = min(-log2(stats_table$P.Value)) + .75,
color = "green"
) +
annotate(
"text",
x = min(stats_table$logFC) + .1,
y = min(-log2(stats_table$P.Value)) + .5,
label = "Significant down",
size = 3,
color = "black",
hjust = 0
) +
annotate(
"point",
x = min(stats_table$logFC),
y = min(-log2(stats_table$P.Value)) + .5,
color = "red"
) +
annotate(
"text",
x = min(stats_table$logFC),
y = min(-log2(stats_table$P.Value)) + 1.0,
label = paste(method, " threshold: ", threshold, sep = ""),
size = 3,
color = "black",
hjust = 0
) +
geom_text_repel(
size = 2.3622,
force = 3,
fontface = "italic",
box.padding = unit(.1, "lines"),
point.padding = unit(0.05, "lines")
)
return (p)
}
mean_test <-
function(data,
metadata,
meta ,
case = NA,
control = NA,
method = 'wilcox.test',
paired = F) {
stats_table <-
setNames(
data.frame(matrix(
ncol = 8, nrow = dim(data)[2]
)),
c(
"logFC",
"Carbon length",
"Double bond",
"statistic",
"P.Value",
"adj.P.Val",
"B",
"fdr"
)
)
rownames(stats_table) <- colnames(data)
commom_rows <- intersect(rownames(data), rownames(metadata))
# response variables must be numeric
data <- numeric_dataframe(data[commom_rows,])
metadata <- metadata[commom_rows, ]
if (!is.na(case)) {
case <- data[which(metadata[meta,] == case)]
}
if (!is.na(control)) {
control <- data[which(metadata[meta,] == control)]
}
if (paired) {
commom_rows2 <- intersect(rownames(case), rownames(control))
case <- case[commom_rows2,]
control <- control[commom_rows2,]
}
for (i in 1:dim(stats_table)[1]) {
if (all(is.na(case[, i])) || all(is.na(control[, i]))) {
print(i)
next
}
stats_table[i, 'logFC'] <-
log2(mean(case[, i], na.rm = TRUE) / mean(control[, i], na.rm = TRUE))
tryCatch({
if (method == 't.test') {
stats_table[i, 'P.Value'] <-
t.test(case[, i], control[, i], paired = paired)$p.value
stats_table[i, 'statistic'] <-
t.test(case[, i], control[, i], paired = paired)$statistic
} else{
stats_table[i, 'P.Value'] <-
wilcox.test(case[, i], control[, i], paired = paired)$p.value
stats_table[i, 'statistic'] <-
wilcox.test(case[, i], control[, i], paired = paired)$statistic
}
}, error = function(e) {
stats_table[i, 'P.Value'] <- NA
stats_table[i, 'statistic'] <- NA
})
stats_table[i, "Carbon length"] <-
as.numeric(unlist(strsplit(unlist(
strsplit(rownames(stats_table)[i], ":")
)[1], "C"))[2])
stats_table[i, "Double bond"] <-
as.numeric(unlist(strsplit(unlist(
strsplit(rownames(stats_table)[i], ":")
)[2], " "))[1])
stats_table[i, "Family"] <-
gsub("^.* ", "", rownames(stats_table)[i])
}
stats_table$fdr <-
p.adjust(
as.vector(stats_table$P.Value),
method = 'BH',
n = length(stats_table$P.Value)
)
stats_table$feature <- rownames(stats_table)
return (stats_table)
}
stats_2groups <-
function(case,
control,
test_type = 'wilcox.test',
paired = F) {
stats_table <-
setNames(
data.frame(matrix(
ncol = 8, nrow = dim(case)[2]
)),
c(
"logFC",
"Carbon length",
"Double bond",
"statistic",
"P.Value",
"adj.P.Val",
"B",
"fdr"
)
)
rownames(stats_table) <- colnames(case)
for (i in 1:dim(stats_table)[1]) {
if (all(is.na(case[, i])) || all(is.na(control[, i]))) {
print(i)
next
}
stats_table[i, 'logFC'] <-
log2(mean(case[, i], na.rm = TRUE)) - log2(mean(control[, i], na.rm = TRUE))
tryCatch({
if (test_type == 't.test') {
stats_table[i, 'P.Value'] <-
t.test(case[, i], control[, i], paired = paired)$p.value
stats_table[i, 'statistic'] <-
t.test(case[, i], control[, i], paired = paired)$statistic
} else{
stats_table[i, 'P.Value'] <-
wilcox.test(case[, i], control[, i], paired = paired)$p.value
stats_table[i, 'statistic'] <-
wilcox.test(case[, i], control[, i], paired = paired)$statistic
}
}, error = function(e) {
stats_table[i, 'P.Value'] <- NA
stats_table[i, 'statistic'] <- NA
})
stats_table[i, "Carbon length"] <-
as.numeric(unlist(strsplit(unlist(
strsplit(rownames(stats_table)[i], ":")
)[1], "C"))[2])
stats_table[i, "Double bond"] <-
as.numeric(unlist(strsplit(unlist(
strsplit(rownames(stats_table)[i], ":")
)[2], " "))[1])
stats_table[i, "Family"] <-
gsub("^.* ", "", rownames(stats_table)[i])
}
stats_table$fdr <-
p.adjust(
as.vector(stats_table$P.Value),
method = 'BH',
n = length(stats_table$P.Value)
)
stats_table$feature <- rownames(stats_table)
return (stats_table)
}
diff_bar_plot <-
function(stats_table,
threshold = 0.05,
method = 'nominal',
orderby = 'logFC') {
if (method == 'nominal') {
stats_table$fdr <- stats_table$P.Value
}
stats_table_sig <-
stats_table[which(stats_table$fdr <= threshold),]
p <- ggplot(stats_table_sig) +
geom_bar(
aes(
x = reorder(feature, get(orderby)),
y = logFC,
fill = logFC
),
show.legend = F,
stat = "identity",
position = "identity"
) +
xlab("Metabolites") + ylab("logFC") +
#guides(colour = guide_legend(show = FALSE) )+
guides(fill = guide_legend(title = "")) + theme(legend.justification =
c(0, 0),
legend.position = c(.3, .2)) +
#theme(legend.position="top", legend.direction="vertical")+
theme(plot.title = element_text(face = "bold", hjust = .9)) +
#labs(title = "")+
scale_fill_gradient2(
low = 'red',
mid = 'snow3',
high = 'darkgreen',
space = 'Lab'
) +
theme_nature() +
#theme(text = element_text(),axis.text.x = element_text(angle = 0, hjust = 1), axis.text.y = element_text(angle = 0, hjust = 1))+
#geom_text(aes(label=number_of_genome_ref), position=position_dodge(width=0.9), vjust=-0.25)+
#scale_x_discrete()+
#scale_y_continuous(expand = c(0)) +
coord_flip()
return (p)
}
check_anotation <- function(study, ref_db = NA) {
if (is.na(ref_db)) {
ref_db <-
read.table(
"/Users/rah/Documents/Metabolomics/example_data/HMDB/mastermapping.txt",
header = TRUE,
sep = "\t",
fill = FALSE,
comment.char = "" ,
check.names = FALSE
)
}
unkcharterized_hmdb <-
rownames(data)[!rownames(data) %in% ref_db$HMDBID]
unkcharterized_hmdb <-
as.data.frame(unkcharterized_hmdb, drop = F)
return (unkcharterized_hmdb)
}
variance.test <-
function(data, metadata, meta , method = 'levene') {
# find common samples (rows) between data and metdata if the are not the same
if (!(meta %in% colnames(data)) &&
!(meta %in% colnames(metadata))) {
print('meta does not exist in your data')
return (NA)
}
commom_rows <- intersect(rownames(data), rownames(metadata))
# response variables must be numeric
data <- numeric_dataframe(data[commom_rows,])
metadata2 <- as.data.frame(metadata[commom_rows, meta], drop = F)
# combine data nad metadata for test function
test_data <- cbind(metadata2, data)
# group factor defining groups.
test_data$meta <- as.factor(test_data$meta)
# set up results table
stats_table <-
setNames(data.frame(matrix(
ncol = 3, nrow = dim(data)[2]
)),
c("Df", "F value", "Pr(>F)"))
#
rownames(stats_table) <- colnames(data)
for (i in 1:dim(stats_table)[1]) {
if (all(is.na(data[, i]))) {
print(i)
next
}
tryCatch({
if (method == 'levene') {
# feature to be tested
feature <- rownames(stats_table)[i]
test_data[feature] <- lapply(test_data[feature], as.numeric)
# an alternative test
# #bf.test(cepgfr5t ~ race1c, data = data)
test_results <-
with(test_data, leveneTest(get(feature), meta))
stats_table[i, 'Df'] <- test_results$`Df`[1]
stats_table[i, 'F value'] <- test_results$`F value`[1]
stats_table[i, 'Pr(>F)'] <- test_results$`Pr(>F)`[1]
} else{
return(NA)
}
}, error = function(e) {
stats_table[i, 'Df'] <- NA
stats_table[i, 'F value'] <- NA
stats_table[i, 'Pr(>F)'] <- NA
})
}
desnity_pvalues_plot <-
ggplot(data = stats_table, aes(get('Pr(>F)'))) +
geom_histogram(
aes(y = ..density..),
breaks = seq(0.0, 1.0, by = 0.05),
col = "black",
fill = "gold",
alpha = .5
) +
#geom_density(col=2) +
labs(
title = sprintf('p-values of variantion in %s groups', meta),
x = sprintf('P-values using %s test', method),
y = 'Density'
)
result <- list()
result$stats_table <- stats_table
result$desnity_pvalues_plot <- desnity_pvalues_plot
return (result)
}
regrese_out <- function(features,
meta,
random = NA,
fixed = NA) {
random_effect <- factor(meta[, random])
fixed_effect <- meta[, fixed]
residuals_df <- NULL
for (i in 1:dim(features)[2]) {
#i<- 1
lmer_results <- lmer(features[, i] ~ fixed_effect +
(1 |
random_effect), na.action = na.exclude)
temp_residual <- resid(lmer_results)
residuals_df <- rbind(residuals_df, temp_residual)
}
row.names(residuals_df) <- colnames(features)
colnames(residuals_df) <- rownames(features)
return (as.data.frame(t(residuals_df)))
}
merge_cvs <- function(input_dir_path, outputfile) {
# Download from http://smpdb.ca/downloads
# extract the zip file to smpdb_metabolites
#input_dir_path = "~/Downloads/smpdb_metabolites"
#outputfile = '/Users/rah/Documents/Metabolomics/example_data/HMDB/merged_ref_db.csv'
library(dplyr)
library(readr)
df <- list.files(path = input_dir_path, full.names = TRUE) %>%
lapply(read_csv) %>%
bind_rows
write_csv(df, outputfile)
}
filter_by_mean <- function(data, top = .95, axis = 1) {
if (axis == 2) {
data <- as.data.frame(t(data))
}
step1.dat <-
data[rowMeans(data, na.rm = TRUE) >= quantile(rowMeans(data, na.rm = TRUE), 1.0 -
top), ]
if (axis == 2)
return (as.data.frame(t(step1.dat)))
else
return (as.data.frame(step1.dat))
}
filter_by_mean <- function(data, top = .95, axis = 1) {
if (axis == 2) {
data <- as.data.frame(t(data))
}
step1.dat <-
data[rowMeans(data, na.rm = TRUE) >= quantile(rowMeans(data, na.rm = TRUE), 1.0 -
top), ]
if (axis == 2)
return (as.data.frame(t(step1.dat)))
else
return (as.data.frame(step1.dat))
}
filter_by_prevelance <-
function(data,
prev = 2,
type = "number",
axis = 1) {
if (axis == 2) {
data <- as.data.frame(t(data))
}
data2 <- data
data2[data2 > 0] <- 1
#data[is.na(data)] <- 0
if (type == "number")
step1.dat <- data[rowSums(data2, na.rm = TRUE) >= prev, ]
else if (type == "perc")
step1.dat <-
data[rowMeans(data, na.rm = TRUE) >= quantile(rowMeans(data, na.rm = TRUE), 1.0 -
top), ]
else
# if (type == "percentile")
step1.dat <-
data[rowMeans(data, na.rm = TRUE) >= quantile(rowMeans(data, na.rm = TRUE), 1.0 -
top), ]
if (axis == 2)
return (as.data.frame(t(step1.dat)))
else
return (as.data.frame(step1.dat))
}
filter_by_variance <- function(data, top = .95, axis = 1) {
if (axis == 2) {
data <- as.data.frame(t(data))
}
step2.dat <- apply(data, 1, var, na.rm = TRUE)
step2.dat <-
data[step2.dat >= quantile(step2.dat, 1.0 - top, na.rm = TRUE), ]
if (axis == 2)
return (as.data.frame(t(step2.dat)))
else
return (as.data.frame(step2.dat))
}
ordplots <-
function(data,
metadata,
output,
outputname = NA,
method = 'pcoa',
index = 'bray/curtis') {
dir.create(file.path(output), showWarnings = FALSE)
commom_rows <- intersect(rownames(data), rownames(metadata))
metadata <- metadata[commom_rows,]
data <- data[commom_rows,]
fakepcl <- list(
meta = as.data.frame(metadata),
x = as.matrix(data),
ns = dim(data)[1],
nf = dim(data)[2]
)
if (method == 'tsne') {
ord <- pcl.tsne(fakepcl)
} else if (method == 'pcoa') {
ord <- pcl.pcoa(fakepcl, k = 3, index = index)#, index = 'euclidean'
} else{
print("The ordination is not implemeted in massPattern!")
}
if (is.na(outputname)) {
outputname <- method
}
ord_plots <- vector(mode = "list", length = dim(metadata)[2])
names(ord_plots) <- colnames(metadata)
ord_plot <- pcl.ordplot(fakepcl, ord, pcos = c(1, 2))
#logging::loginfo("Plotting data using %s ordination", method)
pdf(
paste(output, '/', outputname, '.pdf', sep = ''),
width = 4.5,
height = 3.25,
onefile = TRUE
)
for (temp_metadat in colnames(metadata)) {
#print(temp_metadat)
#print(is.numeric(fakepcl$meta[1, temp_metadat]))
#print(length(fakepcl$meta[[temp_metadat]]))
#print(fakepcl$meta[1, temp_metadat])
# get not NA values
not_na <- fakepcl$meta[temp_metadat]
not_na <- not_na[!is.na(not_na)]
l_unique <- length(unique(not_na))
if (check.numeric(not_na[1]) & l_unique > 2)
#tryCatch({
fakepcl$meta[temp_metadat] <-
sapply(fakepcl$meta[temp_metadat], as.numeric)
#}, error = function(e){
# next#fakepcl$meta[temp_metadat] <- lapply(fakepcl$meta[temp_metadat], as.factor)
#})
else if (l_unique > 100 || l_unique < 2) {
#print(temp_metadat)
#print(l_unique)
#print(not_na[1])
#print(check.numeric(not_na[1]))
next
}
ord_plot <-
pcl.ordplot(
fakepcl,
ord,
colour_title = temp_metadat,
colour = temp_metadat,
pcos = c(1, 2)
)
ord_plots[[temp_metadat]] <- ord_plot
stdout <- capture.output(print(ord_plot), type = "message")
if (length(stdout) > 0)
logging::logdebug(stdout)
#ggsave(filename=paste(output,'/', temp_metadat,'.pdf', sep=''), plot=ord_plot,
# width = 5, height = 4, units = "in", dpi = 300)
}
dev.off()
return(ord_plots)
}
check.numeric <- function(N) {
return(
grepl(
"[-]?[0-9]+[.]?[0-9]*|[-]?[0-9]+[L]?|[-]?[0-9]+[.]?[0-9]*[eE][0-9]+",
N
)
)
}
convert_maaslin_output2matrix <-
function(df,
simlarity_threshold = 0.0005,
q_threshold = 1.0,
first_n = NA,
cell_value = 'pval') {
metadata <- df$metadata
feature <- df$feature
value <- NA
if (!is.na(first_n) & first_n > 0 & first_n < dim(df)[1]) {
if (cell_value == 'coef') {
df <- df[order(-abs(df[cell_value])) ,]
} else{
df <- df[order(df[cell_value]),]
}
df <- df[1:first_n, ]
}
df$coef[abs(df$qval) > q_threshold |
abs(df$coef) < simlarity_threshold] <- 0
value <- -log(df$qval) * sign(df$coef)
value <- pmax(-20, pmin(20, value))
n <- length(unique(feature))
m <- length(unique(metadata))
if (n < 2) {
print(
'There is no enough features in associations to make a heatmap plot of associations.
Please look at association in text output file!'
)
return (NA)
}
if (m < 2) {
print(
'There is no enough metadata in associations to make a heatmap plot of associations.
Please look at association in text output file!'
)
return (NA)
}
a = matrix(0, nrow = n, ncol = m)
a <- as.data.frame(a)
rownames(a) <- unique(feature)
colnames(a) <- unique(metadata)
for (i in 1:dim(df)[1]) {
if (abs(a[as.character(feature[i]), as.character(metadata[i])]) > abs(value[i]))
next
a[as.character(feature[i]), as.character(metadata[i])] <-
value[i]
}
#if (data_type == "pathway"){
# colnames(a) <- sapply(colnames(a), shorter_pwy_names )
#}
#else{
# colnames(a) <- sapply(colnames(a), pcl.sub )
#}
#rownames(b) <- rownames(a)#all_metadata
#colnames(b) <- colnames(a)
b <- a
#for (i in 1:n)
# for (j in 1:m){
# b[rownames(a)[i], colnames(a)[j] ] <- a[rownames(a)[i], colnames(a)[j] ]
# }
#b <-b[needed_metadata,]
#rownames(b) <- needed_metadata_names
b <- b[, colSums(b != 0, na.rm = TRUE) > 0, drop = F]
b <- t(b)
b <- b[, colSums(b != 0, na.rm = TRUE) > 0, drop = F]
#b <- t(b)
if (length(colnames(b)) > 2) {
hc <- hclust(dist(t(b)), method = "single")
b <- b[, hc$order, drop = F]
}
return (as.data.frame(b))
}
combine_maaslin_heatmap <- function() {
#library(pheatmap)
output_file = "pheatmap.pdf"
cell_value = 'qval'
title = ""
if (cell_value == "pval") {
value <- -log(df$pval) * sign(df$coef)
value <- pmax(-20, pmin(20, value))
if (is.null(title))
title <- "Significant associations (-log(pval)*sign(coeff))"
} else if (cell_value == "qval") {
value <- -log(df$qval) * sign(df$coef)
value <- pmax(-20, pmin(20, value))
if (is.null(title))
title <- "Significant associations (-log(qval)*sign(coeff))"
} else if (cell_value == "coef") {
value <- df$coef
if (is.null(title))
title <- "Significant associations (coeff)"
}
cell_value = "Q.value"
data_label = 'Data'
metadata_label = 'Metadata'
border_color = "grey93"
color = colorRampPalette(c("blue", "whitesmoke", "red"))(51) #whitesmoke
# read MaAsLin output
gaps <- c(0, 0)
mat <- NA
site_colours <- set_eric_coloures()
bodysite_color_order = list(
"Body site" = c(
"Buccal mucosa" = site_colours$Buccal_mucosa,
"Supragingival plaque" = site_colours$Supragingival_plaque,
"Tongue dorsum" = site_colours$Tongue_dorsum,
"Stool" = site_colours$Stool,
"Anterior nares" = site_colours$Anterior_nares,
"Posterior fornix" = site_colours$Posterior_fornix
)
)
#print(bodysite_color_order)
df_1 <-
read.table(
'/Users/rah/Documents/Metabolomics/Projects/MESA/C8-pos_known/linear_model_output_exam1/significant_results.tsv',
header = TRUE,
sep = "\t",
fill = TRUE,
comment.char = "" ,
check.names = FALSE
)
mat <- convert_maaslin_output2matrix(df_1)
#mat <- mat[, colnames(mat)%in%names(bugs_to_show_assoc[["Anterior nares"]])]
print ("C8_pos")
#colnames(mat) <- names(bugs_to_show_assoc[["Anterior nares"]])
gaps[1] <- length(colnames(mat))
method_order = data.frame(ID = factor(rep(c("C8-pos"), each = length(colnames(
mat
)))))
df_2 <-
read.table(
'/Users/rah/Documents/Metabolomics/Projects/MESA/HILIC-pos_known/linear_model_output_exam1/significant_results.tsv',
header = TRUE,
sep = "\t",
fill = TRUE,
comment.char = "" ,
check.names = FALSE
)
mat2 <- convert_maaslin_output2matrix(df_2)
if (!is.na(mat2) && dim(mat2)[1] > 0) {
#mat2 <- mat2[, colnames(mat2)%in%names(bugs_to_show_assoc[["Buccal mucosa"]])]
print ("HILIC-pos")
#colnames(mat2) <- bugs_to_show_assoc[["Buccal mucosa"]]
#mat <- cbind(mat, mat2)
mat <- combine2df(mat, mat2)
gaps[2] <- length(colnames(mat))
method_order = rbind(method_order, data.frame(ID = factor(rep(
c("HILIC-pos"), each = length(colnames(mat2))
))))
}
df_3 <-
read.table(
'/Users/rah/Documents/Metabolomics/Projects/MESA/HILIC-neg_known/linear_model_output_exam1/significant_results.tsv',
header = TRUE,
sep = "\t",
fill = TRUE,
comment.char = "" ,
check.names = FALSE
)
mat3 <- convert_maaslin_output2matrix(df_3)
if (!is.na(mat3) && dim(mat3)[1] > 0) {
#mat3 <- mat3[, colnames(mat3)%in%names(bugs_to_show_assoc[["Supragingival plaque"]])]
print("HILIC-neg")
#print(length(colnames(mat3)))
#colnames(mat3) <- bugs_to_show_assoc[["Supragingival plaque"]]
#mat <- cbind(mat, mat3)
mat <- combine2df(mat, mat3)
gaps[3] <- length(colnames(mat))
method_order = rbind(method_order, data.frame(ID = factor(rep(
c("HILIC-neg"), each = length(colnames(mat3))
))))
}
#df_4 <- read.table( '/Users/rah/Documents/Metabolomics/Projects/MESA/C8-pos_known/linear_model_output_exam1/significant_results.tsv',
# header = TRUE, sep = "\t", fill = TRUE, comment.char = "" , check.names = FALSE)
#mat4 <- convert_maaslin_output2matrix(df_4)
#mat4 <- mat4[, colnames(mat4)%in%names(bugs_to_show_assoc[["C18-neg"]])]
#print(length(colnames(mat4)), names(bugs_to_show_assoc[["Tongue dorsum"]]))
#colnames(mat4) <- bugs_to_show_assoc[["C18-neg"]]
#mat <- cbind(mat, mat4)
#print("C18-neg")
#gaps[4] <- length(colnames(mat))
#method_order = rbind(method_order, data.frame(ID = factor(rep(c("Tongue dorsum"), each=length(colnames(mat4))))))
colnames(method_order)[1] <- "Profiling method"
#rownames(bodysite_color_order)[1] <- "Body site"
#rownames(bodysite_order) <- colnames(mat)
labels_col <- colnames(mat)
colnames(mat) <- rownames(method_order)
#bodysite_order <- as.matrix(bodysite_order)
#rownames(bodysite_order) <- colnames(mat)
#bodysite_order <- as.data.frame(bodysite_order)
#print (rownames(bodysite_order))
#pdf(file=output_file, height = length(colnames(b))/5+8, width = length(rownames(b))/5+5)
#pdf(file=output_file, height = 25, width = 10)
mylegend = TRUE
my_show_rownames = TRUE
#if (!grepl('nares', output_file)){
# mylegend = FALSE
# my_show_rownames = FALSE
#}
mat[is.na(mat)] <- 0
plot_result <-
pheatmap(
mat,
cellwidth = 5,
cellheight = 5,
# changed to 3
main = title,
annotation_col = method_order["Profiling method"],
#annotation_colors = bodysite_color_order["Profiling method"],
annotation_legend = T,
fontsize = 6,
kmeans_k = NA,
#border=TRUE,
labels_col = labels_col,
show_rownames = T,
show_colnames = T,
scale = "none",
#clustering_method = "complete",
cluster_rows = FALSE,
cluster_cols = F,
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
legend = TRUE,
border_color = border_color,
color = color,
treeheight_row = 0,
treeheight_col = 0,
gaps_col = gaps,
display_numbers = matrix(ifelse(mat > 0.0, "+", ifelse(mat < 0.0, "-", "")), nrow(mat))
)
dev.off()
ggsave(
filename = '~/Documents/Metabolomics/Projects/MESA/associations_overview.pdf',
plot = plot_result$gtable,
width = 12,
limitsize = FALSE,
height = 3,
units = "in",
dpi = 300
)
return (plot_result)
}
combine2df <- function(df1, df2) {
all_rows <- dplyr::union(rownames(df1), rownames(df2))
df1 <- df1[all_rows,]
df2 <- df2[all_rows,]
result <- cbind(df1, df2)
rownames(result) <- all_rows
return(result)
}
#' @export
combine_QI_TF <- function(QI_file, TF_file, output_name){
# read the Progenesis file
QI_data <- read.table(
QI_file,
header = F,
row.names = NULL,
sep = ",",
fill = FALSE,
comment.char = "" ,
check.names = FALSE
)
# name the columns
colnames(QI_data) <- lapply(QI_data[3,], as.character)
# find the sart of raw abundance data
data_indx <- find_indx(QI_data, word = 'Raw abundance')
# remove the first two columns
QI_data <- QI_data[4:nrow(QI_data), ]
# use raw abundance
QI_data <- QI_data[, c(1,3,5,data_indx[2]:ncol(QI_data))]
# re-order columns and remove columns with name ""
indx_last_sample <- grep("Accepted Compound ID", colnames(QI_data))
cols <- colnames(QI_data)
cols <- cols[1:indx_last_sample]
samples_cols <-
cols[!(cols %in% c(
"Compound",
"m/z",
"Retention time (min)",
"Accepted Compound ID",
""
))]
samples_cols <- samples_cols[order(samples_cols)]
cols_in_order <-
c(c("Compound", "m/z", "Retention time (min)", "Accepted Compound ID"),
samples_cols)
QI_data <- QI_data[, cols_in_order]
#####read first file for TR profiles #########################
#1) on the second tab, I calculate the average retention time
RT_profile_data <-
readxl::read_excel(
TF_file,
sheet = 2,
col_names = FALSE
)
colnames(RT_profile_data) <- sapply(RT_profile_data[1,], as.factor)
RT_profile_data_rownames <- sapply(RT_profile_data[, 1], as.factor)
RT_profile_data <- RT_profile_data[-1, -1]
rownames(RT_profile_data) <- RT_profile_data_rownames[-1]
RT_profile_data <- as.data.frame(RT_profile_data)
##### Calculate average RT
RT_profile_data <- massPattern:::numeric_dataframe(RT_profile_data)
RT_profile_data[RT_profile_data <= 0.0] <- NA
RT <- colMeans(x = RT_profile_data, na.rm = T)
RT_profile_data <- rbind(RT = RT, RT_profile_data)
#2) I add the RT to the first tab, then copy & transpose to a new tab
#####read first file for TR profiles #########################
Intensity_profile_data <-
readxl::read_excel(
TF_file,
sheet = 1,
col_names = FALSE
)
colnames(Intensity_profile_data) <-
sapply(Intensity_profile_data[1,], as.factor)
Intensity_profile_data_rownames <-
sapply(Intensity_profile_data[, 1], as.factor)
Intensity_profile_data <- Intensity_profile_data[-1, -1]
rownames(Intensity_profile_data) <-
Intensity_profile_data_rownames[-1]
Intensity_profile_data <- as.data.frame(Intensity_profile_data)
##### add the average RT
Intensity_profile_data <- massPattern:::numeric_dataframe(Intensity_profile_data)
Intensity_profile_data <- rbind(RT = RT, Intensity_profile_data)
# clean data
## remove "Standards" mm, Bpp, FFA, miniMM
clean_rows <- row.names(Intensity_profile_data)
clean_rows <- clean_rows[!grepl("*miniMM*|*FFA*|*Bpp*|*mm*|*Standards*", clean_rows)]
Intensity_profile_clean_data <- Intensity_profile_data[clean_rows,]
TF_data <- as.data.frame(t(Intensity_profile_clean_data))
TF_data[,c("Compound", "m/z", "Retention time (min)", "Accepted Compound ID")] <- NA
TF_data[, "Retention time (min)"] <- TF_data[, "RT"]
# rename TF metabolites if they are in QI data
row.names(TF_data) <- ifelse(row.names(TF_data) %in% row.names(QI_data),
paste(row.names(TF_data), "TF", sep = '_'), row.names(TF_data))
TF_data[, "Software"] <- "TF"
TF_data$`Accepted Compound ID` <- rownames(TF_data)
QI_data[, "Software"] <- "QI"
#rownames(QI_data) <- QI_data$Compound
combined <- rbind(TF_data[, colnames(QI_data)], QI_data)
# order the final column names
cols_in_order <-
c(c("Compound", 'Software', "m/z", "Retention time (min)", "Accepted Compound ID"),
samples_cols)
combined <- combined[, cols_in_order]
colnames(combined) <- c(c("Compound_ID", 'Software', "MZ", "RT", "Metabolite"),
samples_cols)
combined[combined == ""] <- NA
combined <- with(combined, combined[order(Metabolite, na.last = TRUE), ])
hs <- createStyle(textDecoration = "BOLD", fontColour = "#FFFFFF", fontSize = 12,
fontName = "Arial Narrow", fgFill = "#4F80BD")
options("openxlsx.borderColour" = "#4F80BD") ## set default border colour
write.xlsx(combined,
file = output_name,
colNames = TRUE)
#borders = "rows",
#headerStyle = hs)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.