#' Prepare dataset
#'
#' This function allows you to read in supplement data--sample info and met info
#' @param df main dataframe that contain sample metabolite measures
#' @param df_sample_info sample information
#' @param df_sample_name the variable name for sample name, vary for each dataset Default is sampleinfo.SAMPLE_NAME
#' @param type tumor or normal Default to tumor
#' @export
#' @examples
#'
#' data(RC12)
#' data(RC12_sampleinfo)
#' RC12_tumor <- prepare_data(df = RC12, df_sampleinfo = RC12_sampleinfo, df_sample_name = "SAMPLE_NAME", type = "tumor")
prepare_data <- function(df, df_sampleinfo, df_sample_name = "SAMPLE_NAME", type = "tumor"){
#metabolite data
names(df)[1]<- "metabolite"
#sample info subdata
df_sampleinfo <- df_sampleinfo
df_tumor <- df_sampleinfo %>%
filter(GROUP == "TUMOR") %>%
select(df_sample_name)
df_tumor_vector<- as.vector(df_tumor[,1])
#extract sample only are tumor
names.use <- names(df)[names(df) %in% c(df_tumor_vector, "metabolite")]
df_tumor<- df[, names.use]
#extract sample are normal
names.notuse <- names(df)[!(names(df) %in% df_tumor_vector)]
df_normal<- df[, names.notuse]
if (type == "tumor"){
return(df_tumor)
}else{
return(df_normal)
}
}
#' Find the subset of two datafraomes
#'
#' This function allows you to extract the metabolites are being measured in both dataset that you are interested in.
#' @param df1 dataframe 1
#' @param df1_sampleinfo sample information about dataframe1
#' @param df2 dataframe 2
#' @param df2_sampleinfo sample information about dataframe2
#' @param df1_sample_name the variable name for sample name, vary for each dataset Default is sampleinfo.SAMPLE_NAME
#' @param df2_sample_name the variable name for sample name, vary for each dataset Default is sampleinfo.SAMPLE_NAME
#' @export
#' @examples
#'
#' list <- subset_data(df1 = RC12, df1_sampleinfo = RC12_sampleinfo, df2 = RC18, df2_sampleinfo = RC18_sampleinfo, df1_sample_name = "SAMPLE_NAME", df2_sample_name = "SAMPLE_NAME")
#' RC12_tumor_match <- list[[1]]
#' RC18_tumor_match <- list[[2]]
#' RC12_normal_match <- list[[3]]
#' RC18_normal_match <- list[[4]]
subset_data <- function(df1, df1_sampleinfo, df2, df2_sampleinfo, df1_sample_name = "SAMPLE_NAME", df2_sample_name= "SAMPLE_NAME") {
#change the name of the first column and make it consistent for both dataframes
names(df1)[1]<- "metabolite"
names(df2)[1]<- "metabolite"
#join sampleinfo with main data
#only select tumor data
df1_tumor <- df1_sampleinfo%>%
select(df1_sample_name, GROUP)%>%
filter(GROUP=="TUMOR")%>%
select(df1_sample_name)
df1_tumor_vector<- as.vector(df1_tumor[,1])
names.use <- names(df1)[names(df1) %in% c(df1_tumor_vector, "metabolite")]
df1_tumor<- df1[, names.use]
names.notuse <- names(df1)[!(names(df1) %in% df1_tumor_vector)]
df1_normal<- df1[, names.notuse]
#df2
df2_tumor <- df2_sampleinfo%>%
select(df2_sample_name, GROUP)%>%
filter(GROUP=="TUMOR")%>%
select(df2_sample_name)
df2_tumor_vector<- as.vector(df2_tumor[,1])
names.use <- names(df2)[names(df2) %in% c(df2_tumor_vector, "metabolite")]
df2_tumor<- df2[, names.use]
names.notuse <- names(df2)[!(names(df2) %in% df2_tumor_vector)]
df2_normal<- df2[, names.notuse]
#df1 and df2 match/missing in each dataset
df1_id <- df1%>%
select(metabolite)%>%
mutate(df1 = T)
df2_id <- df2%>%
select(metabolite)%>%
mutate(df2 = T)
#id
match_id <- inner_join(df1_id, df2_id, by = "metabolite")%>%
select(-df1, -df2)%>%
as.data.frame()
match_id <- as.vector(match_id[ ,1])
df2_missing_id <- left_join(df1_id, df2_id, by = "metabolite")%>%
filter(is.na(df2))%>%
select(-df1, -df2)%>%
as.data.frame()
df2_missing_id <- as.vector(df2_missing_id [ ,1])
df1_missing_id <- left_join(df2_id, df1_id,by = "metabolite")%>%
filter(is.na(df1))%>%
select(-df1, -df2)%>%
as.data.frame()
df1_missing_id <-as.vector(df1_missing_id[,1])
#main dataset
#df1
#tumor
df1_tumor_match <- df1_tumor %>%
filter(metabolite %in% match_id)
df1_tumor_notmatch <- df1_tumor %>%
filter(! metabolite %in% match_id)
#normal
#main dataset
df1_normal_match <- df1_normal %>%
filter(metabolite %in% match_id)
df1_normal_notmatch <- df1_normal %>%
filter(! metabolite %in% match_id)
#df2
#tumor
df2_tumor_match <- df2_tumor %>%
filter(metabolite %in% match_id)
df2_tumor_notmatch <- df2_tumor %>%
filter(! metabolite %in% match_id)
#normal
#main dataset
df2_normal_match <- df2_normal %>%
filter(metabolite %in% match_id)
df2_normal_notmatch <- df2_normal %>%
filter(! metabolite %in% match_id)
list_data_subset <- list()
list_data_subset<- list("df1_tumor_match"= df1_tumor_match,
"df2_tumor_match"= df2_tumor_match,
"df1_normal_match" = df1_normal_match,
"df2_normal_match" = df2_normal_match)
return(list_data_subset)
}
#' Tidy data into correct format before analysis
#'
#' This function transpose the original dataset to each row as one sample, each column as one metabolite predictor
#' @param df dataframe
#' @param standardize standardizaton method Default to none. options are z-score, mean-subtraction and none
#' @export
#' @examples
#' list <- subset_data(df1 = RC12, df1_sampleinfo = RC12_sampleinfo, df2 = RC18, df2_sampleinfo = RC18_sampleinfo, df1_sample_name = "SAMPLE_NAME", df2_sample_name = "SAMPLE_NAME")
#' RC12_tumor_match <- list[[1]]
#' RC18_tumor_match <- list[[2]]
#' t_RC12_tumor <- df_tidy( RC12_tumor_match, standardize = "z")
#' t_RC18_tumor <- df_tidy( RC18_tumor_match, standardize = "z")
#function to tidy data (transform) also, can choose standardization methods. default is none
df_tidy <- function(df, standardize = "none"){
#store the name of the metabolites in a vector
m_name <- df%>%
select(metabolite)%>%
as.data.frame()
m_id <- as.vector(m_name[,1])
m_name <- c("metabolite", m_id)
#transpose dataframe -- row to column, column to row
t_df <- df%>%
t()%>%
as.data.frame()%>%
rownames_to_column("metabolites")
#change the column name of the transposed dataframe
names(t_df) <- m_name
t_df<- t_df[-1, ]
colnames(t_df)[colnames(t_df) == "metabolite"] <- "sample"
t_df <- t_df%>%
mutate_each_(funs((.) %>% as.character() %>% as.numeric), vars=m_id)
favstats(t_df$M38002)
if (standardize == "z"){
t_df_standardized <- t_df%>%
mutate_each_(funs(scale(.) %>% as.vector), vars=m_id)
print("Standardization method: z-score")
} else if (standardize == "mean_sbt"){
t_df_standardized <- t_df%>%
mutate_each_(funs(scale(., scale = F)), vars=m_id)
print("Standardization method: mean subtraction")
} else {
t_df_standardized <- t_df
print("No standardization")
}
return(t_df_standardized)
}
#' Get metabolite id in vector format
#' This function get the name of the metabolite of your dataset
#' @param df dataframe
#' @export
#' @examples
#' m_id_sort <- get_m_id_vector(RC12_tumor_match)
#'
get_m_id_vector <- function(df){
#store the name of the metabolites in a vector
m_name <- df%>%
select(metabolite)%>%
as.data.frame()
m_id <- as.vector(m_name[,1])
#sort the id of the metabolite
m_id_sort <- str_sort(m_id)
m_id_sort_df <- m_id_sort%>% #save in a dataframe
as.data.frame()
colnames(m_id_sort_df)[1] <- "metabolite" #rename the column name
m_id_sort_df$metabolite <-as.character(m_id_sort_df$metabolite) #change data type
return(m_id_sort)
}
#' Get metabolite id in dataframe format
#' This function get the name of the metabolite of your dataset
#' @param df matched dataframe or
#' @export
#' @examples
get_m_id_df <- function(df){
#store the name of the metabolites in a vector
m_name <- df%>%
select(metabolite)%>%
as.data.frame()
m_id <- as.vector(m_name[,1])
#sort the id of the metabolite
m_id_sort <- str_sort(m_id)
m_id_sort_df <- m_id_sort%>% #save in a dataframe
as.data.frame()
colnames(m_id_sort_df)[1] <- "metabolite" #rename the column name
m_id_sort_df$metabolite <-as.character(m_id_sort_df$metabolite) #change data type
return(m_id_sort_df)
}
#' LASSO model fit
#' This function train LASSO model on train set and test on test set and evaluate model fit
#' @param train training dataset
#' @param test test dataset
#' @param m_id_sort metabolite name vector
#' @param i response metabolite's index, used in for loop
#' @param eval model fit evaluation, options are r-squared and MSE. Default to r-squared
#' @param number of folds in lasso cv Default to 10 folds
#' @export
#' @examples
#' MSE <- c()
#' r2<- c()
#' for (i in (1:length(m_id_sort))){
#' set.seed(1)
#' MSE[i] <- LASSO_model2_fit(train, test, m_id_sort, i, eval = "mse")
#' r2[i] <- LASSO_model2_fit(train, test, m_id_sort, i, eval = "r2")
#' }
LASSO_model2_fit <- function(train, test, m_id_sort, i, eval = "r2", num_folds = 10) {
#identify one response variable
response <- m_id_sort[i]
#y
index <- grep(response, colnames(train)) #extract the index of the column
y <- train[index]
y <- unlist(y)
y <- as.vector(y)
y <- as.numeric(y)
#Training
train_df <- train%>%
select(-sample)
test_df <- test%>%
select(-sample)
x_train <- train_df%>%
select(-response)%>%
as.matrix()
class(x_train) <- "numeric"
x_test <- test_df %>%
select(-response)%>%
as.matrix()
class(x_test) <- "numeric"
y_train <- pull(train_df, response)
y_train <- as.vector(y_train)
y_train <- as.numeric(y_train)
y_test <- pull(test_df, response)
y_test <- as.vector(y_test)
y_test <- as.numeric(y_test)
set.seed(58)
cv.out = cv.glmnet(x_train, y_train, alpha = 1, type.measure='mse', nfolds = num_folds)
# Select lamda that minimizes training MSE
# Use best lambda to predict test data
bestlam = cv.out$lambda.min
# Fit lasso model on the test dataset
LASSO_fit <- glmnet(x_train, y_train, alpha = 1, lambda = bestlam)
lasso_pred = predict(LASSO_fit, newx = x_test) %>%
as.data.frame()
lasso_pred <- as.vector(lasso_pred$s0)
if(eval == "mse"){
fit <- mean((lasso_pred - y_test)^2) # Calculate test MSE
message(paste("sample", i,"output MSE")) }
if (eval == "r2"){
# SSE <- sum((lasso_pred - y_test)^2) #sum of squares residual
# SST <- sum((y_test - mean(y_test))^2) #total sum of squares
# fit <- 1-SSE/SST
# fit
r2= cor(y_test, lasso_pred, method='pearson')^2
# r2= cor.test(y_test, lasso_pred, method='pearson')$estimate^2
r2
fit <- r2
message(paste("sample", i,"output adjusted R squared"))
}
return(fit)
}
#' Get LASSO model coefficient
#' This function train LASSO model on train set and test on test set and evaluate model fit
#' @param df train set
#' @param m_id_sort metabolite id vector
#' @export
#' @examples
#' coef_list = list()
#' coef_list <- LASSO_coef(df, m_id_sort)
LASSO_coef <- function(df, m_id_sort){
for (i in (1:length(m_id_sort))){
#identify one response variable
response <- m_id_sort[i]
#y
index <- grep(response, colnames(df)) #extract the index of the column
y <- df[index]
y <- unlist(y)
y <- as.vector(y)
y <- as.numeric(y)
#Training
train_df <- df%>%
select(-sample)
x_train <- train_df%>%
select(-response)%>%
as.matrix()
class(x_train) <- "numeric"
y_train <- pull(train_df, response)
y_train <- as.vector(y_train)
y_train <- as.numeric(y_train)
set.seed(58)
cv.out = cv.glmnet(x_train, y_train, alpha = 1, type.measure='mse')
# Select lamda that minimizes training MSE
# Use best lambda to predict test data
bestlam = cv.out$lambda.min
# Fit lasso model on the test dataset
LASSO_fit <- glmnet(x_train, y_train, alpha = 1, lambda = bestlam)
#Display coefficients using lambda chosen by CV
lasso_coef = predict(LASSO_fit, type = "coefficients", s = bestlam)
lasso_coef<- as.matrix(lasso_coef)
lasso_coef <- as.data.frame(lasso_coef)
lasso_coef <- tibble::rownames_to_column(lasso_coef, "metabolite")%>%
filter(`1` != 0)
colnames(lasso_coef)[2] <- "coef"
coef_list[[i]] = lasso_coef
}
return(coef_list)
}
#' Get LASSO model predictor matrix
#' @param df train set
#' @param m_id_sort metabolite id vector
#' @param i indec of metabolite response
#' @export
#' @examples
LASSO_predictors <- function(df, m_id_sort, i) {
#identify one response variable
response <- m_id_sort[i]
#y
index <- grep(response, colnames(df)) #extract the index of the column
y <- df[index]
y <- unlist(y)
y <- as.vector(y)
y <- as.numeric(y)
#Training
train_df <- df%>%
select(-sample)
x_train <- train_df%>%
select(-response)%>%
as.matrix()
class(x_train) <- "numeric"
y_train <- pull(train_df, response)
y_train <- as.vector(y_train)
y_train <- as.numeric(y_train)
##########################
set.seed(58)
cv.out = cv.glmnet(x_train, y_train, alpha = 1, type.measure='mse')
# Select lamda that minimizes training MSE
# Use best lambda to predict test data
bestlam = cv.out$lambda.min
# Fit lasso model on the test dataset
LASSO_fit <- glmnet(x_train, y_train, alpha = 1, lambda = bestlam)
#Display coefficients using lambda chosen by CV
lasso_coef = predict(LASSO_fit, type = "coefficients", s = bestlam)
lasso_coef<- as.matrix(lasso_coef)
lasso_coef <- as.data.frame(lasso_coef)
lasso_coef <- tibble::rownames_to_column(lasso_coef, "metabolite")%>%
filter(`1` != 0)
metabolite_predictors <- lasso_coef$metabolite[-1]
return(metabolite_predictors)
}
#' Plot predicted versus actual
#' @param train
#' @param test
#' @param m_id_sort
#' @export
#' @examples
#' plot_list <- list()
#' plot_list <- plot_actual_vs_fit(train, test, m_id_sort)
plot_actual_vs_fit <- function(train, test, m_id_sort){
for (i in 1:length(m_id_sort)) {
#identify one response variable
response <- m_id_sort[i]
#name of the metabolte
name <- RC12_metinfo %>%
select(COMP_IDstr, name)%>%
filter(COMP_IDstr == response)
name <- name[1,2]
#y
index <- grep(response, colnames(train)) #extract the index of the column
y <- train[index]
y <- unlist(y)
y <- as.vector(y)
y <- as.numeric(y)
#Training
train_df <- train%>%
select(-sample)
test_df <- test%>%
select(-sample)
x_train <- train_df%>%
select(-response)%>%
as.matrix()
class(x_train) <- "numeric"
x_test <- test_df %>%
select(-response)%>%
as.matrix()
class(x_test) <- "numeric"
y_train <- pull(train_df, response)
y_train <- as.vector(y_train)
y_train <- as.numeric(y_train)
y_test <- pull(test_df, response)
y_test <- as.vector(y_test)
y_test <- as.numeric(y_test)
set.seed(58)
cv.out = cv.glmnet(x_train, y_train, alpha = 1, type.measure='mse')
# Select lamda that minimizes training MSE
# Use best lambda to predict test data
bestlam = cv.out$lambda.min
LASSO_fit <- glmnet(x_train, y_train, alpha = 1, lambda = bestlam)
lasso_pred <- predict(LASSO_fit, newx = x_test)%>%
as.data.frame()
data <- cbind(lasso_pred, y_test)%>%
as.data.frame()
colnames(data)[1]<-"predicted"
colnames(data)[2]<-"actual"
lasso_pred <- as.vector(lasso_pred$s0)
r2= round(cor(y_test, lasso_pred, method='pearson')^2, digits=2)
r= round(cor(y_test, lasso_pred, method='pearson'), digits=2)
min_xy <- min(min(data$predicted), min(data$actual))
max_xy <- max(max(data$predicted), max(data$actual))
plot <- ggplot(data,aes(actual, predicted))+
geom_smooth(method='lm')+
geom_point(color="black")+
ggtitle(paste( "Predicted vs. Actual "," (", name, ")"," #", i, " index", index, sep=""), subtitle = paste("R-squared =", r2, "; correlation =", r, sep=""))+
geom_abline(colour ="red") +
scale_x_continuous(limits = c(min_xy, max_xy))+
scale_y_continuous(limits = c(min_xy, max_xy))
plot_list[[i]] = plot
}
return(plot_list)
}
#' Combine all samples from 7 studies that match the measured metabolites
#' @param num_study
#' @param df1
#' @param df1_sampleinfo
#' @export
#' @examples
#' run function in our data
#' data_all_sample <- combine_all_sample(
#' num_study=7, df1 = RC12, df2 = RC18, df3 = BC_tang, df4 = BC_terunuma, df5=HCC, df6 = DLBCL, df7 = PaCa, df1_sampleinfo= RC12_sampleinfo, df2_sampleinfo = RC18_sampleinfo, df3_sampleinfo = BC_tang_sampleinfo, df4_sampleinfo = BC_terunuma_sampleinfo, df5_sampleinfo = HCC_sampleinfo, df6_sampleinfo = DLBCL_sampleinfo, df7_sampleinfo= PaCa_sampleinfo )
#function
combine_all_sample <- function(num_study = num_sutdy, study_name = study_name, standardize = "none", df1 = RC12, df2 = RC18, df3 = BC_tang, df4 = BC_terunuma, df5=HCC, df6 = DLBCL, df7 = PaCa,
df1_sampleinfo= RC12_sampleinfo, df2_sampleinfo = RC18_sampleinfo, df3_sampleinfo = BC_tang_sampleinfo, df4_sampleinfo = BC_terunuma_sampleinfo, df5_sampleinfo = HCC_sampleinfo, df6_sampleinfo = DLBCL_sampleinfo, df7_sampleinfo= PaCa_sampleinfo){
m_list <- list(df1, df2, df3, df4, df5, df6, df7)
m_measured_all <- Reduce(intersect, list(df1$X__1, df2$X__1, df3$X__1, df4$X__1, df5$X__1, df6$X__1, df7$X__1))
sampleinfo_list <- list(df1_sampleinfo, df2_sampleinfo, df3_sampleinfo, df4_sampleinfo, df5_sampleinfo, df6_sampleinfo, df7_sampleinfo)
data_all_sample <-c()
for (i in 1:num_study){
#prepare dataset
m_list[[i]]<- prepare_data(df = m_list[[i]], df_sampleinfo = sampleinfo_list[[i]], df_sample_name = "SAMPLE_NAME", type = "tumor")
#subset
m_list[[i]] <- m_list[[i]][m_list[[i]]$metabolite %in% m_measured_all, ]
#tidy
m_list[[i]] <- df_tidy(m_list[[i]], standardize = standardize)
#mutate a study name
m_list[[i]] <- m_list[[i]]%>%
mutate(study = study_name[i])
#combine data from all studies to one dataframe
data_all_sample <- rbind(data_all_sample, m_list[[i]])
}
return(data_all_sample)
}
### Construct confidence score about prediction
#this function generates a plot of Prediction Consistency across cancer types
PC_plot <- function(num_study_measure, model_fit_all){
penalize <- model_fit_all %>%
filter(num_metabolite == num_study_measure)%>%
group_by(metabolite)%>%
mutate(r2_mean = mean(r2_m2), r2_sd=sd(r2_m2))%>%
mutate(confidence_1 = r2_mean - 2 * r2_sd^2, diff_1 = r2_mean - confidence_1)
penalize <- penalize[order(-penalize$confidence_1), ]
m_order_vector <- as.vector(penalize$name)
m_order_vector_unique <- unique(m_order_vector)
penalize$name <- factor(penalize$name, levels = m_order_vector)
plot <- ggplot(data = penalize, aes(x = name, y = r2_m2)) +
geom_boxplot() +
aes(colour = confidence_1) +
theme(legend.position = "right") +
labs(title = "Prediction Consistency Across Cancer Types/Studies", subtitle = paste( "For metabolites that are measured in ", num_study_measure, " studies", sep="")) +
xlab("metabolites")+
ylab("r-squared")+
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 4))
return(plot)
}
# this function generates the vector of the metabolites that are measured in a certain number of studies.
PC_m <- function (num_study_measure, model_fit_all){
penalize <- model_fit_all %>%
filter(num_metabolite == num_study_measure)%>%
group_by(metabolite)%>%
mutate(r2_mean = mean(r2_m2), r2_sd=sd(r2_m2))%>%
mutate(confidence_1 = r2_mean - 2 * r2_sd^2, diff_1 = r2_mean - confidence_1)
penalize <- penalize[order(-penalize$confidence_1), ]
m_order_vector <- as.vector(penalize$name)
m_order_vector_unique <- unique(m_order_vector)
return(m_order_vector_unique)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.