Here is the analysis of the FDA Human data set, looking particularly at gender differences. Originally we had done three sections:
1. Differences: plotting overall features, and running SAM to find significant features one by one.
2. Modeling: building predictive models that separate men and women, as well as running PCA.
3. Differences in correlation networks.

The purpose of this document is to formally chronicle this analysis and to build it out further, particularly in the modeling section.

Feature Differences

Reading in and formatting the Data

Here we are using thresholded features, so we read in the signaling data and load the appropriate packages.

library(blackbuck)
library(FDAlibrary)
library(ggplot2)
library(dplyr)
compiled_df <- read.csv("~/Documents/FDAlibrary/saved_data_structures/human_signaling_fold_asinh_0.2.csv")
head(compiled_df)
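
A quick sanity check (a small sketch) that the long-format columns used throughout the rest of the analysis are present:

# these are the columns relied on below (Donor, Gender, Condition_Feature, value)
stopifnot(all(c("Donor", "Gender", "Condition_Feature", "value") %in% colnames(compiled_df)))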

Summary statistics

Next we separate men and women, and group the data so we can have summary statistics (mean and sd) for each feature:

f_df <- dplyr::filter(compiled_df, Gender == "F")
m_df <- dplyr::filter(compiled_df, Gender == "M")

f_grouped <- group_by(f_df, Condition_Feature)
f_sum <- dplyr::summarise(f_grouped, f_mean = mean(value), f_sd = sd(value))

m_grouped <- group_by(m_df, Condition_Feature)
m_sum <- dplyr::summarise(m_grouped, m_mean = mean(value), m_sd = sd(value))

Now we can plot men v. women, for both means and SDs:

df_gender <- data.frame(m_sum, f_sum)
ids <- strsplit(as.character(df_gender$Condition_Feature), "_")
df_gender <- data.frame(df_gender, 
                        mean_diff = df_gender$m_mean - df_gender$f_mean,
                        sd_diff = df_gender$m_sd - df_gender$f_sd,
                        Condition = sapply(ids, "[[", 1),
                        Cell_type = sapply(ids, "[[", 2),
                        Protein = sapply(ids, "[[", 3))

p <- ggplot(df_gender, aes(m_mean, f_mean))

p + geom_point() +
  geom_text(x = 0.25, y = 2, label = paste("R = ", round(cor(df_gender$m_mean, df_gender$f_mean),digits = 3), sep = "")) +
  geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) +
  coord_fixed(ratio = 1)
# ggsave(paste(plotDirectory, "men_v_women_mean.pdf", sep = ""))

p <- ggplot(df_gender, aes(m_sd, f_sd))
p + geom_point() + geom_text(x = 0.25, y = 0.75, label = paste("R = ", round(cor(df_gender$m_sd, df_gender$f_sd),digits = 3), sep = "")) + geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x)
# ggsave(paste(plotDirectory, "men_v_women_sd.pdf", sep = ""))

p + geom_point(aes(colour = Condition)) + geom_text(x = 0.25, y = 0.75, label = paste("R = ", round(cor(df_gender$m_sd, df_gender$f_sd),digits = 3), sep = "")) + geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x)
# ggsave(paste(plotDirectory, "men_v_women_sd_by_condition.pdf", sep = ""))

SAM analysis for Significant Feature Differences

Load the SAM package (and reshape2 for reshaping the data).

library(samr)
library(reshape2)

Tidy up the data for SAM analysis

df <- dplyr::select(compiled_df, Donor, Gender, Condition_Feature, value)
wide_df <- dcast(df, Donor + Gender ~ Condition_Feature)
x <- as.matrix(dplyr::select(wide_df, -Donor, -Gender))
y <- rep(NA, length(wide_df$Gender))
y[wide_df$Gender == "F"] <- 2
y[wide_df$Gender != "F"] <- 1

Run SAM and select the significant features (estimated FDR = 0)

model <- SAM(t(x), y, resp.type = "Two class unpaired", genenames = colnames(x), nperms = 1000, fdr.output = 0.05)
# q val 0
up_genes <- model$siggenes.table$genes.up[model$siggenes.table$genes.up[,"q-value(%)"]==0, "Gene ID"]
lo_genes <- model$siggenes.table$genes.lo[model$siggenes.table$genes.lo[,"q-value(%)"]==0, "Gene ID"]
# Higher in women
up_genes
# Higher in men
lo_genes
# write.csv(up_genes, file = paste(plotDirectory, "SAM_features_women.csv", sep = ""))
# write.csv(lo_genes, file = paste(plotDirectory, "SAM_features_men.csv", sep = ""))

Plot significant features as boxplots (first dropping donors whose gender is recorded as "x")

df <- dplyr::filter(df, Gender != "x")
df_up <- df[df[, "Condition_Feature"] %in% up_genes,]
ggplot(df_up, aes(Condition_Feature, value)) + geom_boxplot(aes(colour = Gender)) + scale_color_manual(values=c("firebrick1", "dark blue")) + coord_flip()
# ggsave(filename = paste(plotDirectory,"gender_sig_up.pdf", sep = ""))

df_lo <- df[df[, "Condition_Feature"] %in% lo_genes,]
ggplot(df_lo, aes(Condition_Feature, value)) + geom_boxplot(aes(colour = Gender)) + scale_color_manual(values=c("firebrick1", "dark blue")) + coord_flip()
# ggsave(filename = paste(plotDirectory,"gender_sig_lo.pdf", sep = ""))

Next we can see where these fall on the original plot:

significance <- rep("Not significant", dim(df_gender)[1])
significance[df_gender$Condition_Feature %in% lo_genes] <- "Higher in Males"
significance[df_gender$Condition_Feature %in% up_genes] <- "Higher in Females"

df_gender_sig <- data.frame(df_gender, Significance = significance)
p <- ggplot(df_gender_sig, aes(m_mean, f_mean))

p + geom_point(aes(colour = Significance)) + scale_colour_manual(values = c("Red", "Blue", "Black")) +
  geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) +
  coord_fixed(ratio = 1)

Modeling

Load the additional demographics data:

library(glmnet)
dataDirectory <- "~/Documents/FDAlibrary/saved_data_structures/"
demographics <- read.csv(paste(dataDirectory, "demographics.csv", sep = ""))
demographics <- demographics[(demographics$Donor %in% compiled_df$Donor),]
dems <- dplyr::select(demographics, Donor, Age)
data <- merge(wide_df, dems)

Create a train and test set

set.seed(1)
train <- sample(1:85, 50, replace = FALSE)
x <- dplyr::select(data, -Age, -Donor, -Gender)
x <- as.matrix(x)
y <- data$Gender
x <- x[y!="x", ]
y <- y[y!="x"]
y <- droplevels(y)
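
A quick look at the class balance in the training and held-out sets (a small sketch):

table(y[train])
table(y[-train])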

Lasso Model

First train the model and use cross-validation to choose lambda

grid <- 10^seq(10,-2, length =100)
lasso.mod <- glmnet(x[train,], y[train], family = "binomial", alpha=1, lambda =grid)
cv.out <- cv.glmnet(x[train,], y[train], family = "binomial", alpha = 1)
plot(cv.out)
lam <- cv.out$lambda.1se
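
Here lambda.1se is the largest lambda whose cross-validated error is within one standard error of the minimum; cv.glmnet also stores the error-minimising value, which could be used instead (a sketch; lam_min is just an illustrative name, not used below):

lam_min <- cv.out$lambda.min  # alternative penalty: the lambda that minimises CV error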

Predict on the test set

lasso.pred <- predict(lasso.mod, newx = x[-train,], type = "class", s = lam)
pred <- as.factor(lasso.pred[, 1])
table(pred, y[-train])
# 60% correct classification
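
The classification rate quoted above can be computed directly from the confusion matrix (a small sketch):

conf_mat <- table(pred, y[-train])
sum(diag(conf_mat)) / sum(conf_mat)  # fraction of test donors classified correctly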

pred_probs <- predict(lasso.mod, newx = x[-train,], type = "response", s = lam)
truth <- y[-train]

library(AUC)
roc_data <- roc(pred_probs, truth)
auc(roc_data)

lasso_df <- data.frame(False_Positive_Rate = roc_data$fpr, True_Positive_Rate = roc_data$tpr, Model = rep("lasso"))

Finally, extract the coefficients:

lasso.coefs <- predict(lasso.mod, type = "coefficients", s = lam)
lasso.coefs <- matrix(lasso.coefs, dimnames = list(rownames(lasso.coefs), "coef"))
lasso.coefs[lasso.coefs != 0,]

We actually want to vary alpha to see what the best fit would be (comparing ridge, elastic net, and the lasso)

alpha_range <- seq(0, 1, 0.1)
# creating fold IDs
set.seed(1)
fold_sets_1 <- sample(1:10, size = length(train), replace = TRUE)
set.seed(2)
fold_sets_2 <- sample(1:10, size = length(train), replace = TRUE)
set.seed(3)
fold_sets_3 <- sample(1:10, size = length(train), replace = TRUE)
set.seed(1)
fold_sets <- list(fold_sets_1, fold_sets_2, fold_sets_3)

We need to get the lambda range for ridge (alpha = 0) empirically, because the default sequence wasn't covering the error range properly:

chosen_folds <- fold_sets[[1]]
cv.out <- cv.glmnet(x[train,], y[train], family = "binomial", alpha = 0, foldid = chosen_folds)
lambdas_0 <- cv.out$lambda
cv.out <- cv.glmnet(x[train,], y[train], family = "binomial", alpha = 0.1, foldid = chosen_folds)
lambdas_0.1 <- cv.out$lambda
test_seq <- c(lambdas_0, lambdas_0.1)

Create a function to cross-validate, tuning both alpha and lambda:

cv.glmnet.a <- function(a, chosen_folds){
  if(a != 0){
    cv.out <- cv.glmnet(x[train,], y[train], family = "binomial", alpha = a, foldid = chosen_folds)
  }
  else {
    cv.out <- cv.glmnet(x[train,], y[train], family = "binomial", alpha = a, lambda = test_seq, foldid = chosen_folds)

  }
  results_frame <- data.frame(alpha = rep(a, length(cv.out$cvm)),
                              lambda = cv.out$lambda,
                              error = cv.out$cvm)
  return(results_frame)
}

Perform cross-validation over 3 different fold ID sets:

for (i in 1:3){
  # chosen_folds <- fold_sets[[i]]
  error_list <- lapply(alpha_range, cv.glmnet.a, chosen_folds = fold_sets[[i]])
  error_frame <- do.call(rbind, error_list)

  # plot x = lambda, y = error, factor = alpha

  p <- ggplot(error_frame, aes(x = log(lambda), y= error, group=alpha))
  print(p + geom_line(aes(colour = alpha)))
  lowest_error <- error_frame[which.min(error_frame$error), ]
  print(lowest_error)
}
lasso.mod <- glmnet(x[train,], y[train], family = "binomial", alpha = 0)
lam <- 1
lasso.pred <- predict(lasso.mod, newx = x[-train,], type = "class", s = lam)
pred <- as.factor(lasso.pred[, 1])
table(pred, y[-train])
# 68% classification correct

ROC curves

pred_probs <- predict(ridge.mod, newx = x[-train,], type = "response", s = lam)
truth <- y[-train]

library(AUC)
roc_data <- roc(pred_probs, truth)
auc(roc_data)

ridge_df <- data.frame(False_Positive_Rate = roc_data$fpr, True_Positive_Rate = roc_data$tpr, Model = rep("ridge"))
ggplot(ridge_df, aes(x = False_Positive_Rate, y = True_Positive_Rate)) + geom_line() + geom_abline(intercept = 0, slope = 1, linetype = 2) + coord_fixed(ratio = 1)

Now we add in the module information, using the sparse-group lasso (SGL):

library(SGL)

df_moduled has the module assignments we need; read it in (to be tidied up later):

df_moduled <- read.csv("~/Documents/FDAlibrary/shiny-modules/output/df_moduled.csv", row.names = 1)
df <- df_moduled %>% 
  dplyr::select(Condition_Feature, Module) %>%
  unique()

The module assignments need to be in the same order as the columns of our data:

data_names <- convert_names(colnames(x))
module_names <- as.character(df$Condition_Feature)
identical(data_names, module_names)

Assuming the check above returns TRUE, we can use the module index straight out of the box:

index <- df$Module
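
If the check is FALSE, one option (a sketch, assuming every data column has a module assignment) is to reorder the module vector to match the data columns:

# hypothetical reordering; only needed if the names are not already aligned
index <- df$Module[match(data_names, module_names)]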

Ok, let's try it (still to do: verify the ordering, and optimize over alpha):

y_numeric <- as.numeric(y) - 1
index <- df$Module
cv.out.SGL <- cvSGL(data = list(x = x[train, ], y = y_numeric[train]), index = index, type = "logit")
plot(cv.out.SGL)
which.min(cv.out.SGL$lldiff)
SGLfit <- SGL(data = list(x = x[train, ], y = y_numeric[train]), index = index, type = "logit")
predictions <- predictSGL(SGLfit, newX = x[-train, ])
table(round(predictions[,20]), y_numeric[-train])
# 74% classification!
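
The column index 20 above is hard-coded; here is a sketch that instead uses the lambda with the lowest cross-validated error (assuming the cvSGL and SGL fits use the same lambda sequence):

best_lam <- which.min(cv.out.SGL$lldiff)  # index of the best lambda from cross-validation
table(round(predictions[, best_lam]), y_numeric[-train])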

Do some ROC:

roc_data <- roc(predictions[,20], truth)
auc(roc_data)
SGL_df <- data.frame(False_Positive_Rate = roc_data$fpr, True_Positive_Rate = roc_data$tpr, Model = rep("SGL"))
ROC_df <- rbind(ridge_df, SGL_df)
ggplot(ROC_df, aes(x = False_Positive_Rate, y = True_Positive_Rate, group = Model)) + 
  geom_line(aes(colour = Model)) + 
  geom_abline(intercept = 0, slope = 1, linetype = 2) + coord_fixed(ratio = 1)

Ok, these results are preliminary, but let's look at what we've got:

SGL_coeffs <- data.frame(df, SGL_coefficients = SGLfit$beta[,20])

We see that it excludes modules 2, 6, and 9 entirely, then picks sparsely within the remaining modules, using a total of 35 features. It helps to use the group info!
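
A quick check of which modules are dropped entirely and how many features are selected (a sketch using the SGL_coeffs data frame above):

SGL_coeffs %>%
  dplyr::group_by(Module) %>%
  dplyr::summarise(n_selected = sum(SGL_coefficients != 0))  # modules with n_selected == 0 are excluded
sum(SGL_coeffs$SGL_coefficients != 0)  # total number of features used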

library(grplasso)
## Use a multiplicative grid for the penalty parameter lambda, starting
## at the maximal lambda value
lambda <- lambdamax(x[train, ], y = y_numeric[train], index = index,
                    model = LogReg()) * 0.5^(0:5)
grp_fit <- grplasso(x = x[train,], y = y_numeric[train], index = index, lambda = lambda, model = LogReg())
# TODO: tune lambda via cross-validation (grplasso has no built-in CV; see the sketch below). For now, fix lambda = 1:
grp_fit <- grplasso(x = x[train,], y = y_numeric[train], index = index, lambda = 1, model = LogReg())
pred.resp <- predict(grp_fit, newdata = x[-train,], type = "response")
table(round(pred.resp), y_numeric[-train])
# 60% correct classification

roc_data <- roc(pred.resp, truth)
auc(roc_data)
GL_df <- data.frame(False_Positive_Rate = roc_data$fpr, True_Positive_Rate = roc_data$tpr, Model = rep("GL"))
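
As noted above, grplasso has no built-in cross-validation; here is a minimal sketch of a manual 5-fold CV over the lambda path computed above (assumptions: misclassification rate as the error measure, folds drawn within the training set):

set.seed(1)
folds <- sample(rep(1:5, length.out = length(train)))  # assign each training donor to a fold
cv_err <- sapply(1:5, function(k) {
  fit_k <- grplasso(x = x[train, ][folds != k, ], y = y_numeric[train][folds != k],
                    index = index, lambda = lambda, model = LogReg())
  probs_k <- predict(fit_k, newdata = x[train, ][folds == k, ], type = "response")
  # misclassification rate for each lambda on the held-out fold
  colMeans((probs_k > 0.5) != y_numeric[train][folds == k])
})
rowMeans(cv_err)                     # mean CV error per lambda
lambda[which.min(rowMeans(cv_err))]  # lambda with the lowest average error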

Putting all the ROC data together for a plot:

ROC_df <- rbind(ridge_df, SGL_df, lasso_df, GL_df)
ggplot(ROC_df, aes(x = False_Positive_Rate, y = True_Positive_Rate, group = Model)) + 
  geom_line(aes(colour = Model)) + 
  geom_abline(intercept = 0, slope = 1, linetype = 2) + coord_fixed(ratio = 1)

Correlation networks: we compute a correlation matrix over all donors and use hierarchical clustering (on (1 - r)/2 distances) to fix a common feature ordering, then plot the male and female correlation matrices on that common ordering.

m_df <- compiled_df %>%
  dplyr::filter(Gender == "M") %>%
  dplyr::select(Donor, Condition_Feature, value) %>%
  reshape2::dcast(., Donor ~ Condition_Feature) %>%
  dplyr::select(-Donor)

f_df <- compiled_df %>%
  dplyr::filter(Gender == "F") %>%
  dplyr::select(Donor, Condition_Feature, value) %>%
  reshape2::dcast(., Donor ~ Condition_Feature) %>%
  dplyr::select(-Donor)

all_df <- compiled_df %>%
  dplyr::select(Donor, Condition_Feature, value) %>%
  reshape2::dcast(., Donor ~ Condition_Feature) %>%
  dplyr::select(-Donor)

all_cormat <- cor(all_df)
dd <- as.dist((1-all_cormat)/2)
hc <- hclust(dd)

m_cormat <- cor(m_df)
m_cormat <- m_cormat[hc$order, hc$order]

m_cormat_melted <- reshape2::melt(m_cormat)

m_cor_plot <- ggplot(data = m_cormat_melted, aes(Var1, Var2, fill = value)) + geom_tile(colour = "white") +
  scale_fill_gradient2(low = "red", high = "blue", mid = "white", midpoint = 0, limits = c(-1, 1)) + theme(axis.text = element_blank()) + coord_fixed(ratio = 1) + ggtitle("Male correlations")

print(m_cor_plot)

f_cormat <- cor(f_df)
f_cormat <- f_cormat[hc$order, hc$order]

f_cormat_melted <- reshape2::melt(f_cormat)

f_cor_plot <- ggplot(data = f_cormat_melted, aes(Var1, Var2, fill = value)) + geom_tile(colour = "white") +
  scale_fill_gradient2(low = "red", high = "blue", mid = "white", midpoint = 0, limits = c(-1, 1)) + theme(axis.text = element_blank()) + coord_fixed(ratio = 1) + ggtitle("Female correlations")

print(f_cor_plot)
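
To look at the differences directly, here is a sketch of an element-wise difference heatmap between the two correlation matrices (both already on the common ordering from hc):

diff_cormat <- m_cormat - f_cormat  # differences range from -2 to 2
diff_melted <- reshape2::melt(diff_cormat)
ggplot(diff_melted, aes(Var1, Var2, fill = value)) + geom_tile(colour = "white") +
  scale_fill_gradient2(low = "red", high = "blue", mid = "white", midpoint = 0, limits = c(-2, 2)) +
  theme(axis.text = element_blank()) + coord_fixed(ratio = 1) +
  ggtitle("Male - Female correlation difference")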

Histograms of the correlation values for each gender:

hist_df <- data.frame(
  R_value = c(m_cormat_melted$value, f_cormat_melted$value),
  Gender = c(rep("Male", dim(m_cormat_melted)[1]), rep("Female", dim(f_cormat_melted)[1])))

ggplot(hist_df, aes(R_value, colour = Gender)) + geom_histogram()

