Here is the analysis on the FDA Human data set particularly looking at gender differences.
Originally we had done two sections:
1. Differences: plotting overall features, and running SAM to find significant features one by one.
2. Modeling: building predictive models that separate Men and Women, as well as running PCA.
3. Differences in correlation networks.
The purpose of this document is to formally chronicle this analysis and to build it out further, particularly in the modeling section.
Here we are using thresholded features so we will read the signaling data and load the appropriate packages.
library(blackbuck) library(FDAlibrary) library(ggplot2) library(dplyr)
compiled_df <- read.csv("~/Documents/FDAlibrary/saved_data_structures/human_signaling_fold_asinh_0.2.csv") head(compiled_df)
Next we separate men and women, and group the data so we can have summary statistics (mean and sd) for each feature:
f_df <- dplyr::filter(compiled_df, Gender == "F") m_df <- dplyr::filter(compiled_df, Gender == "M") f_grouped <- group_by(f_df, Condition_Feature) f_sum <- dplyr::summarise(f_grouped, f_mean = mean(value), f_sd = sd(value)) m_grouped <- group_by(m_df, Condition_Feature) m_sum <- dplyr::summarise(m_grouped, m_mean = mean(value), m_sd = sd(value))
Now we can plot Men v. Women, means and SDs:
df_gender <- data.frame(m_sum, f_sum) ids <- strsplit(as.character(df_gender$Condition_Feature), "_") df_gender <- data.frame(df_gender, mean_diff = df_gender$m_mean - df_gender$f_mean, sd_diff = df_gender$m_sd - df_gender$f_sd, Condition = sapply(ids, "[[", 1), Cell_type = sapply(ids, "[[", 2), Protein = sapply(ids, "[[", 3)) p <- ggplot(df_gender, aes(m_mean, f_mean)) p + geom_point() + geom_text(x = 0.25, y = 2, label = paste("R = ", round(cor(df_gender$m_mean, df_gender$f_mean),digits = 3), sep = "")) + geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) + coord_fixed(ratio = 1) # ggsave(paste(plotDirectory, "men_v_women_mean.pdf", sep = "")) p <- ggplot(df_gender, aes(m_sd, f_sd)) p + geom_point() + geom_text(x = 0.25, y = 0.75, label = paste("R = ", round(cor(df_gender$m_sd, df_gender$f_sd),digits = 3), sep = "")) + geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) # ggsave(paste(plotDirectory, "men_v_women_sd.pdf", sep = "")) p + geom_point(aes(colour = Condition)) + geom_text(x = 0.25, y = 0.75, label = paste("R = ", round(cor(df_gender$m_sd, df_gender$f_sd),digits = 3), sep = "")) + geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) # ggsave(paste(plotDirectory, "men_v_women_sd_by_condition.pdf", sep = ""))
Load the SAM package.
library(samr) library(reshape2)
Tidy up the data for SAM analysis
df <- dplyr::select(compiled_df, Donor, Gender, Condition_Feature, value) wide_df <- dcast(df, Donor + Gender ~ Condition_Feature) x <- as.matrix(dplyr::select(wide_df, -Donor, -Gender)) y <- rep(NA, length(wide_df$Gender)) y[wide_df$Gender == "F"] <- 2 y[wide_df$Gender != "F"] <- 1
Run SAM and select the significant genes (estimated FDR = 0)
model <- SAM(t(x), y, resp.type = "Two class unpaired", genenames = colnames(x), nperms = 1000, fdr.output = 0.05)
# q val 0 up_genes <- model$siggenes.table$genes.up[model$siggenes.table$genes.up[,"q-value(%)"]==0, "Gene ID"] lo_genes <- model$siggenes.table$genes.lo[model$siggenes.table$genes.lo[,"q-value(%)"]==0, "Gene ID"] # Higher in women up_genes # Higher in men lo_genes # write.csv(up_genes, file = paste(plotDirectory, "SAM_features_women.csv", sep = "")) # write.csv(lo_genes, file = paste(plotDirectory, "SAM_features_men.csv", sep = ""))
Plot significant features as boxplots
df <- dplyr::filter(df, Gender != "x") df_up <- df[df[, "Condition_Feature"] %in% up_genes,] ggplot(df_up, aes(Condition_Feature, value)) + geom_boxplot(aes(colour = Gender)) + scale_color_manual(values=c("firebrick1", "dark blue")) + coord_flip() # ggsave(filename = paste(plotDirectory,"gender_sig_up.pdf", sep = "")) df_lo <- df[df[, "Condition_Feature"] %in% lo_genes,] ggplot(df_lo, aes(Condition_Feature, value)) + geom_boxplot(aes(colour = Gender)) + scale_color_manual(values=c("firebrick1", "dark blue")) + coord_flip() # ggsave(filename = paste(plotDirectory,"gender_sig_lo.pdf", sep = ""))
Next we can see where these fall on the original plot:
significance <- rep("Not significant", dim(df_gender)[1]) significance[df_gender$Condition_Feature %in% lo_genes] <- "Higher in Males" significance[df_gender$Condition_Feature %in% up_genes] <- "Higher in Females" df_gender_sig <- data.frame(df_gender, Significance = significance) p <- ggplot(df_gender_sig, aes(m_mean, f_mean)) p + geom_point(aes(colour = Significance)) + scale_colour_manual(values = c("Red", "Blue", "Black")) + geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) + coord_fixed(ratio = 1)
Load the additional demographics data:
library(glmnet) dataDirectory <- "~/Documents/FDAlibrary/saved_data_structures/" demographics <- read.csv(paste(dataDirectory, "demographics.csv", sep = "")) demographics <- demographics[(demographics$Donor %in% compiled_df$Donor),] dems <- dplyr::select(demographics, Donor, Age) data <- merge(wide_df, dems)
Creat a train and test set
set.seed(1)
train <- sample(1:85, 50, replace = FALSE) x <- dplyr::select(data, -Age, -Donor, -Gender) x <- as.matrix(x) y <- data$Gender x <- x[y!="x", ] y <- y[y!="x"] y <- droplevels(y)
First train the model and use cross validation to choose lambda
grid <- 10^seq(10,-2, length =100) lasso.mod <- glmnet(x[train,], y[train], family = "binomial", alpha=1, lambda =grid) cv.out <- cv.glmnet(x[train,], y[train], family = "binomial", alpha = 1) plot(cv.out) lam <- cv.out$lambda.1se
Predict test set
lasso.pred <- predict(lasso.mod, newx = x[-train,], type = "class", s = lam) pred <- as.factor(lasso.pred[, 1]) table(pred, y[-train]) # 60% correct classification pred_probs <- predict(lasso.mod, newx = x[-train,], type = "response", s = lam) truth <- y[-train] library(AUC) roc_data <- roc(pred_probs, truth) auc(roc_data) lasso_df <- data.frame(False_Positive_Rate = roc_data$fpr, True_Positive_Rate = roc_data$tpr, Model = rep("lasso"))
Finally, extract the coefficients:
lasso.coefs <- predict(lasso.mod, type = "coefficients", s = lam) lasso.coefs <- matrix(lasso.coefs, dimnames = list(rownames(lasso.coefs), "coef")) lasso.coefs[lasso.coefs != 0,]
We actually want to vary alpha to see what the best fit would be (comparing ridge, elastic net, and the lasso)
alpha_range <- seq(0, 1, 0.1) # creating fold IDs set.seed(1) fold_sets_1 <- sample(1:10, size = length(train), replace = TRUE) set.seed(2) fold_sets_2 <- sample(1:10, size = length(train), replace = TRUE) set.seed(3) fold_sets_3 <- sample(1:10, size = length(train), replace = TRUE) set.seed(1) fold_sets <- list(fold_sets_1, fold_sets_2, fold_sets_3)
We need to empirically get the range of lambdas for ridge (alpha = 0) because it wasn't covering the error range properly:
chosen_folds <- fold_sets[[1]] cv.out <- cv.glmnet(x[train,], y[train], family = "binomial", alpha = 0, foldid = chosen_folds) lambdas_0 <- cv.out$lambda cv.out <- cv.glmnet(x[train,], y[train], family = "binomial", alpha = 0.1, foldid = chosen_folds) lambdas_0.1 <- cv.out$lambda test_seq <- c(lambdas_0, lambdas_0.1)
Creating a function to cross-validate tuning both alpha and lambda
cv.glmnet.a <- function(a, chosen_folds){ if(a != 0){ cv.out <- cv.glmnet(x[train,], y[train], family = "binomial", alpha = a, foldid = chosen_folds) } else { cv.out <- cv.glmnet(x[train,], y[train], family = "binomial", alpha = a, lambda = test_seq, foldid = chosen_folds) } results_frame <- data.frame(alpha = rep(a, length(cv.out$cvm)), lambda = cv.out$lambda, error = cv.out$cvm) return(results_frame) }
Perform cross-validation over 3 different fold ID sets:
for (i in 1:3){ # chosen_folds <- fold_sets[[i]] error_list <- lapply(alpha_range, cv.glmnet.a, chosen_folds = fold_sets[[i]]) error_frame <- do.call(rbind, error_list) # plot x = lambda, y = error, factor = alpha p <- ggplot(error_frame, aes(x = log(lambda), y= error, group=alpha)) print(p + geom_line(aes(colour = alpha))) lowest_error <- error_frame[which.min(error_frame$error), ] print(lowest_error) }
lasso.mod <- glmnet(x[train,], y[train], family = "binomial", alpha = 0) lam <- 1 lasso.pred <- predict(lasso.mod, newx = x[-train,], type = "class", s = lam) pred <- as.factor(lasso.pred[, 1]) table(pred, y[-train]) # 68% classification correct
ROC curves
pred_probs <- predict(lasso.mod, newx = x[-train,], type = "response", s = lam) truth <- y[-train] library(AUC) roc_data <- roc(pred_probs, truth) auc(roc_data) ridge_df <- data.frame(False_Positive_Rate = roc_data$fpr, True_Positive_Rate = roc_data$tpr, Model = rep("ridge")) ggplot(ridge_df, aes(x = False_Positive_Rate, y = True_Positive_Rate)) + geom_line() + geom_abline(intercept = 0, slope = 1, linetype = 2) + coord_fixed(ratio = 1)
Ok here goes with module info, doing sparse-group lasso (SGL):
library(SGL)
df_moduled has what we need, we will need to read it in (add this in later)
df_moduled <- read.csv("~/Documents/FDAlibrary/shiny-modules/output/df_moduled.csv", row.names = 1)
df <- df_moduled %>% dplyr::select(Condition_Feature, Module) %>% unique()
We need the orders to match for the module assignments and our data:
data_names <- convert_names(colnames(x)) module_names <- as.character(df$Condition_Feature) identical(data_names, module_names)
Assuming the above says TRUE, we can use the index straight out of the box:
index <- df$Module
Ok let's just try it... (later: sort out weird order thing, and optimize over alphas)
y_numeric <- as.numeric(y) - 1 index <- df$Module cv.out.SGL <- cvSGL(data = list(x = x[train, ], y = y_numeric[train]), index = index, type = "logit") plot(cv.out.SGL) which.min(cv.out.SGL$lldiff) SGLfit <- SGL(data = list(x = x[train, ], y = y_numeric[train]), index = index, type = "logit") predictions <- predictSGL(SGLfit, newX = x[-train, ]) table(round(predictions[,20]), y_numeric[-train]) # 74% classification!
Do some ROC:
roc_data <- roc(predictions[,20], truth) auc(roc_data) SGL_df <- data.frame(False_Positive_Rate = roc_data$fpr, True_Positive_Rate = roc_data$tpr, Model = rep("SGL")) ROC_df <- rbind(ridge_df, SGL_df) ggplot(ROC_df, aes(x = False_Positive_Rate, y = True_Positive_Rate, group = Model)) + geom_line(aes(colour = Model)) + geom_abline(intercept = 0, slope = 1, linetype = 2) + coord_fixed(ratio = 1)
Ok these results are preliminary but let's look what we've got:
SGL_coeffs <- data.frame(df, SGL_coefficients = SGLfit$beta[,20])
We see that it excludes modules 2, 6, 9 entirely, then picks sparsely within it, using a total of 35 features. It helps to use the group info!
library(grplasso) # x[train, ], y[train, ], index = index, ## Use a multiplicative grid for the penalty parameter lambda, starting ## at the maximal lambda value lambda <- lambdamax(x[train,], y = y_numeric[train], index = index, model = LogReg()) * 0.5^(0:5) grp_fit <- grplasso(x = x[train,], y = y_numeric[train], index = index, lambda = lambda, model = LogReg()) # need to optimize for lambda (design own cv!! validation). For now.... grp_fit <- grplasso(x = x[train,], y = y_numeric[train], index = index, lambda = 1, model = LogReg()) pred.resp <- predict(grp_fit, newdata = x[-train,], type = "response") # 60% correct classification roc_data <- roc(pred.resp, truth) auc(roc_data) GL_df <- data.frame(False_Positive_Rate = roc_data$fpr, True_Positive_Rate = roc_data$tpr, Model = rep("GL"))
Putting all the ROC data together for a plot:
ROC_df <- rbind(ridge_df, SGL_df, lasso_df, GL_df) ggplot(ROC_df, aes(x = False_Positive_Rate, y = True_Positive_Rate, group = Model)) + geom_line(aes(colour = Model)) + geom_abline(intercept = 0, slope = 1, linetype = 2) + coord_fixed(ratio = 1)
Correlation networks:
m_df <- compiled_df %>% dplyr::filter(Gender == "M") %>% dplyr::select(Donor, Condition_Feature, value) %>% reshape2::dcast(., Donor ~ Condition_Feature) %>% dplyr::select(-Donor) f_df <- compiled_df %>% dplyr::filter(Gender == "F") %>% dplyr::select(Donor, Condition_Feature, value) %>% reshape2::dcast(., Donor ~ Condition_Feature) %>% dplyr::select(-Donor) all_df <- compiled_df %>% dplyr::select(Donor, Condition_Feature, value) %>% reshape2::dcast(., Donor ~ Condition_Feature) %>% dplyr::select(-Donor) all_cormat <- cor(all_df) dd <- as.dist((1-all_cormat)/2) hc <- hclust(dd) m_cormat <- cor(m_df) m_cormat <- m_cormat[hc$order, hc$order] cormat_melted <- reshape2::melt(m_cormat) m_cor_plot <- ggplot(data = cormat_melted, aes(Var1, Var2, fill = value)) + geom_tile(colour = "white")+ scale_fill_gradient2(low = "red", high = "blue", mid = "white", midpoint = 0, limits=c(-1, 1)) + theme(axis.text = element_blank()) + coord_fixed(ratio = 1) + ggtitle("Male correlations") print(m_cor_plot) f_cormat <- cor(f_df) f_cormat <- f_cormat[hc$order, hc$order] cormat_melted <- reshape2::melt(f_cormat) f_cor_plot <- ggplot(data = cormat_melted, aes(Var1, Var2, fill = value)) + geom_tile(colour = "white") + scale_fill_gradient2(low = "red", high = "blue", mid = "white", midpoint = 0, limits=c(-1, 1)) + theme(axis.text = element_blank()) + coord_fixed(ratio = 1)+ ggtitle("Female correlations") print(f_cor_plot)
Histograms:
hist_df <- data.frame( R_value = c(m_cormat_melted$value, f_cormat_melted$value), Gender = c(rep("Male", dim(m_cormat_melted)[1]), rep("Female", dim(f_cormat_melted)[1]))) ggplot(hist_df, aes(R_value, colour = Gender)) + geom_histogram()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.