mpred: Predict unmeasured metabolites

Documented in df_tidy prepare_data subset_data

#' Prepare dataset
#'
#' This function allows you to read in supplement data--sample info and met info
#' @param df main dataframe that contain sample metabolite measures
#' @param df_sample_info sample information
#' @param df_sample_name the variable name for sample name, vary for each dataset Default is sampleinfo.SAMPLE_NAME
#' @param type tumor or normal Default to tumor
#' @export
#' @examples 
#' 
#' data(RC12)
#' data(RC12_sampleinfo)
#' RC12_tumor <- prepare_data(df = RC12, df_sampleinfo = RC12_sampleinfo, df_sample_name = "SAMPLE_NAME", type = "tumor")

prepare_data <- function(df, df_sampleinfo, df_sample_name = "SAMPLE_NAME", type = "tumor"){
  #metabolite data
  names(df)[1]<-  "metabolite"
  #sample info subdata
  df_sampleinfo <- df_sampleinfo
  
  df_tumor <- df_sampleinfo %>%
    filter(GROUP == "TUMOR") %>%
    select(df_sample_name)
  df_tumor_vector<- as.vector(df_tumor[,1])
  
  #extract sample only are tumor
  names.use <- names(df)[names(df) %in% c(df_tumor_vector, "metabolite")]
  df_tumor<- df[, names.use]
  
  #extract sample are normal
  names.notuse <- names(df)[!(names(df) %in% df_tumor_vector)]
  df_normal<- df[, names.notuse]
  
  if (type == "tumor"){
    return(df_tumor)
  }else{
    return(df_normal)
  }
}


#' Find the subset of two datafraomes
#'
#' This function allows you to extract the metabolites are being measured in both dataset that you are interested in.  
#' @param df1 dataframe 1
#' @param df1_sampleinfo sample information about dataframe1
#' @param df2 dataframe 2
#' @param df2_sampleinfo sample information about dataframe2
#' @param df1_sample_name the variable name for sample name, vary for each dataset Default is sampleinfo.SAMPLE_NAME
#' @param df2_sample_name the variable name for sample name, vary for each dataset Default is sampleinfo.SAMPLE_NAME
#' @export
#' @examples
#' 
#' list <- subset_data(df1 = RC12, df1_sampleinfo = RC12_sampleinfo, df2 = RC18, df2_sampleinfo = RC18_sampleinfo, df1_sample_name = "SAMPLE_NAME",  df2_sample_name = "SAMPLE_NAME")
#' RC12_tumor_match <- list[[1]]
#' RC18_tumor_match <- list[[2]]
#' RC12_normal_match <- list[[3]]
#' RC18_normal_match <- list[[4]]
 
subset_data <- function(df1, df1_sampleinfo, df2, df2_sampleinfo, df1_sample_name = "SAMPLE_NAME", df2_sample_name= "SAMPLE_NAME") {
  #change the name of the first column and make it consistent for both dataframes
  names(df1)[1]<-  "metabolite"
  names(df2)[1]<-  "metabolite"
  
  #join sampleinfo with main data 
  #only select tumor data
  df1_tumor <- df1_sampleinfo%>%
    select(df1_sample_name, GROUP)%>%
    filter(GROUP=="TUMOR")%>%
    select(df1_sample_name)
  
  df1_tumor_vector<- as.vector(df1_tumor[,1])
  
  names.use <- names(df1)[names(df1) %in% c(df1_tumor_vector, "metabolite")]
  df1_tumor<- df1[, names.use]
  
  names.notuse <- names(df1)[!(names(df1) %in% df1_tumor_vector)]
  df1_normal<- df1[, names.notuse]
  
  
  #df2
  df2_tumor <- df2_sampleinfo%>%
    select(df2_sample_name, GROUP)%>%
    filter(GROUP=="TUMOR")%>%
    select(df2_sample_name)
  
  df2_tumor_vector<- as.vector(df2_tumor[,1])
  
  names.use <- names(df2)[names(df2) %in% c(df2_tumor_vector, "metabolite")]
  df2_tumor<- df2[, names.use]
  
  names.notuse <- names(df2)[!(names(df2) %in% df2_tumor_vector)]
  df2_normal<- df2[, names.notuse]
  
  
  #df1 and df2 match/missing in each dataset
  df1_id <- df1%>%
    select(metabolite)%>%
    mutate(df1 = T) 
  
  df2_id <- df2%>%
    select(metabolite)%>%
    mutate(df2 = T) 
  
  #id
  match_id  <- inner_join(df1_id, df2_id, by = "metabolite")%>%
    select(-df1, -df2)%>%
    as.data.frame()
  match_id <- as.vector(match_id[ ,1])
  
  df2_missing_id  <- left_join(df1_id, df2_id, by = "metabolite")%>%
    filter(is.na(df2))%>%
    select(-df1, -df2)%>%
    as.data.frame()
  df2_missing_id  <- as.vector(df2_missing_id [ ,1])
  
  
  df1_missing_id  <- left_join(df2_id, df1_id,by = "metabolite")%>%
    filter(is.na(df1))%>%
    select(-df1, -df2)%>%
    as.data.frame()
  df1_missing_id <-as.vector(df1_missing_id[,1])
  
  
  #main dataset 
  #df1
  #tumor
  df1_tumor_match <- df1_tumor %>%
    filter(metabolite %in% match_id)
  
  df1_tumor_notmatch <- df1_tumor %>%
    filter(! metabolite %in% match_id)
  
  #normal
  #main dataset 
  df1_normal_match <- df1_normal %>%
    filter(metabolite %in% match_id)
  
  df1_normal_notmatch <- df1_normal %>%
    filter(! metabolite %in% match_id)
  
  
  #df2
  #tumor
  df2_tumor_match <- df2_tumor %>%
    filter(metabolite %in% match_id)
  
  df2_tumor_notmatch <- df2_tumor %>%
    filter(! metabolite %in% match_id)
  
  #normal
  #main dataset 
  df2_normal_match <- df2_normal %>%
    filter(metabolite %in% match_id)
  
  df2_normal_notmatch <- df2_normal %>%
    filter(! metabolite %in% match_id)
  
  list_data_subset <- list()
  
  list_data_subset<- list("df1_tumor_match"= df1_tumor_match, 
                          "df2_tumor_match"= df2_tumor_match,
                          "df1_normal_match" = df1_normal_match,
                          "df2_normal_match" = df2_normal_match) 
  
  return(list_data_subset)
}


#' Tidy data into correct format before analysis
#'
#' This function transpose the original dataset to each row as one sample, each column as one metabolite predictor
#' @param df dataframe 
#' @param standardize standardizaton method Default to none. options are z-score, mean-subtraction and none
#' @export
#' @examples
#' list <- subset_data(df1 = RC12, df1_sampleinfo = RC12_sampleinfo, df2 = RC18, df2_sampleinfo = RC18_sampleinfo, df1_sample_name = "SAMPLE_NAME",  df2_sample_name = "SAMPLE_NAME")
#' RC12_tumor_match <- list[[1]]
#' RC18_tumor_match <- list[[2]]
#' t_RC12_tumor <- df_tidy( RC12_tumor_match, standardize = "z")
#' t_RC18_tumor <- df_tidy( RC18_tumor_match, standardize = "z")

#function to tidy data (transform) also, can choose standardization methods. default is none
df_tidy <- function(df, standardize = "none"){
  #store the name of the metabolites in a vector
  m_name <- df%>%
    select(metabolite)%>%
    as.data.frame()
  m_id <- as.vector(m_name[,1])
  m_name <- c("metabolite", m_id)
  
  #transpose dataframe -- row to column, column to row
  t_df <- df%>%
    t()%>%
    as.data.frame()%>%
    rownames_to_column("metabolites")
  
  #change the column name of the transposed dataframe
  names(t_df) <- m_name
  
  t_df<- t_df[-1, ]
  
  colnames(t_df)[colnames(t_df) == "metabolite"] <- "sample"
  
  t_df <- t_df%>%
    mutate_each_(funs((.) %>% as.character() %>% as.numeric), vars=m_id)
  
  favstats(t_df$M38002)
  
  if (standardize == "z"){
    t_df_standardized <- t_df%>%
      mutate_each_(funs(scale(.) %>% as.vector), vars=m_id)
    print("Standardization method: z-score")
  } else if (standardize == "mean_sbt"){
    t_df_standardized <- t_df%>%
      mutate_each_(funs(scale(., scale = F)), vars=m_id)
    print("Standardization method: mean subtraction")
  } else {
    t_df_standardized <- t_df
    print("No standardization")
  }
  
  return(t_df_standardized)
}



#' Get metabolite id in vector format
#' This function get the name of the metabolite of your dataset
#' @param df dataframe 
#' @export
#' @examples
#' m_id_sort <- get_m_id_vector(RC12_tumor_match)
#' 

get_m_id_vector <- function(df){
  #store the name of the metabolites in a vector
  m_name <- df%>%
    select(metabolite)%>%
    as.data.frame()
  m_id <- as.vector(m_name[,1])
  
  #sort the id of the metabolite
  m_id_sort <- str_sort(m_id)
  m_id_sort_df <- m_id_sort%>%   #save in a dataframe 
    as.data.frame() 
  colnames(m_id_sort_df)[1] <- "metabolite" #rename the column name
  m_id_sort_df$metabolite <-as.character(m_id_sort_df$metabolite) #change data type
  
  return(m_id_sort)
}


#' Get metabolite id in dataframe format
#' This function get the name of the metabolite of your dataset
#' @param df matched dataframe or 
#' @export
#' @examples
get_m_id_df <- function(df){
  #store the name of the metabolites in a vector
  m_name <- df%>%
    select(metabolite)%>%
    as.data.frame()
  m_id <- as.vector(m_name[,1])
  
  #sort the id of the metabolite
  m_id_sort <- str_sort(m_id)
  m_id_sort_df <- m_id_sort%>%   #save in a dataframe 
    as.data.frame() 
  colnames(m_id_sort_df)[1] <- "metabolite" #rename the column name
  m_id_sort_df$metabolite <-as.character(m_id_sort_df$metabolite) #change data type
  
  return(m_id_sort_df)
}


#' LASSO model fit
#' This function train LASSO model on train set and test on test set and evaluate model fit
#' @param train training dataset
#' @param test test dataset
#' @param m_id_sort metabolite name vector
#' @param i response metabolite's index, used in for loop
#' @param eval model fit evaluation, options are r-squared and MSE. Default to r-squared
#' @param number of folds in lasso cv Default to 10 folds
#' @export
#' @examples 
#' MSE <- c()
#' r2<- c()
#' for (i in (1:length(m_id_sort))){
#'  set.seed(1)
#'  MSE[i] <- LASSO_model2_fit(train, test, m_id_sort, i, eval = "mse")
#'  r2[i] <- LASSO_model2_fit(train, test, m_id_sort, i, eval = "r2")
#' }

LASSO_model2_fit <- function(train, test, m_id_sort, i, eval = "r2", num_folds = 10) {
  #identify one response variable
  response <- m_id_sort[i]
  
  #y
  index <- grep(response, colnames(train)) #extract the index of the column
  y <- train[index]
  y <- unlist(y)
  y <- as.vector(y)
  y <- as.numeric(y)
  
  #Training 
  train_df <- train%>%
    select(-sample)
  
  test_df <- test%>%
    select(-sample)
  
  x_train <- train_df%>%
    select(-response)%>%
    as.matrix()
  class(x_train) <- "numeric"
  
  x_test <- test_df %>%
    select(-response)%>%
    as.matrix()
  class(x_test) <- "numeric"
  
  
  y_train <- pull(train_df, response)
  y_train <- as.vector(y_train)
  y_train <- as.numeric(y_train)
  
  y_test <- pull(test_df, response)
  y_test <- as.vector(y_test)
  y_test <- as.numeric(y_test)
  
  set.seed(58)
  cv.out = cv.glmnet(x_train, y_train, alpha = 1, type.measure='mse', nfolds = num_folds)
  
  # Select lamda that minimizes training MSE 
  # Use best lambda to predict test data
  bestlam = cv.out$lambda.min
  
  # Fit lasso model on the test dataset 
  LASSO_fit <- glmnet(x_train, y_train, alpha = 1, lambda = bestlam)
  
  lasso_pred = predict(LASSO_fit, newx = x_test) %>%
    as.data.frame()
  lasso_pred <- as.vector(lasso_pred$s0)
  
  if(eval == "mse"){
    fit <- mean((lasso_pred - y_test)^2) # Calculate test MSE 
    message(paste("sample", i,"output MSE"))  }
  if (eval == "r2"){
    # SSE <- sum((lasso_pred - y_test)^2) #sum of squares residual
    #  SST <- sum((y_test - mean(y_test))^2)  #total sum of squares
    
    #  fit <- 1-SSE/SST
    #  fit
    r2= cor(y_test, lasso_pred, method='pearson')^2
    
    # r2= cor.test(y_test, lasso_pred, method='pearson')$estimate^2
    r2
    fit <- r2
    message(paste("sample", i,"output adjusted R squared"))
  }
  return(fit)
}

#' Get LASSO model coefficient
#' This function train LASSO model on train set and test on test set and evaluate model fit
#' @param df train set
#' @param m_id_sort metabolite id vector
#' @export
#' @examples 
#' coef_list = list()
#' coef_list <- LASSO_coef(df, m_id_sort)

LASSO_coef <- function(df, m_id_sort){
  for (i in (1:length(m_id_sort))){
    #identify one response variable
    response <- m_id_sort[i]
    
    #y
    index <- grep(response, colnames(df)) #extract the index of the column
    y <- df[index]
    y <- unlist(y)
    y <- as.vector(y)
    y <- as.numeric(y)
    
    #Training 
    train_df <- df%>%
      select(-sample)
    
    x_train <- train_df%>%
      select(-response)%>%
      as.matrix()
    class(x_train) <- "numeric"
    
    y_train <- pull(train_df, response)
    y_train <- as.vector(y_train)
    y_train <- as.numeric(y_train)
    
    set.seed(58)
    cv.out = cv.glmnet(x_train, y_train, alpha = 1, type.measure='mse')
    
    # Select lamda that minimizes training MSE 
    # Use best lambda to predict test data
    bestlam = cv.out$lambda.min
    
    # Fit lasso model on the test dataset 
    LASSO_fit <- glmnet(x_train, y_train, alpha = 1, lambda = bestlam)
    
    #Display coefficients using lambda chosen by CV
    lasso_coef = predict(LASSO_fit, type = "coefficients", s = bestlam)
    
    lasso_coef<-  as.matrix(lasso_coef)
    lasso_coef <- as.data.frame(lasso_coef)
    lasso_coef <- tibble::rownames_to_column(lasso_coef, "metabolite")%>%
      filter(`1` != 0)
    
    colnames(lasso_coef)[2] <- "coef"
    
    coef_list[[i]] = lasso_coef
  }
  return(coef_list)
}


#' Get LASSO model predictor matrix
#' @param df train set
#' @param m_id_sort metabolite id vector
#' @param i indec of metabolite response
#' @export
#' @examples 

LASSO_predictors <- function(df, m_id_sort, i) {
  #identify one response variable
  response <- m_id_sort[i]
  
  #y
  index <- grep(response, colnames(df)) #extract the index of the column
  y <- df[index]
  y <- unlist(y)
  y <- as.vector(y)
  y <- as.numeric(y)
  
  #Training 
  train_df <- df%>%
    select(-sample)

  x_train <- train_df%>%
    select(-response)%>%
    as.matrix()
  class(x_train) <- "numeric"
  
  y_train <- pull(train_df, response)
  y_train <- as.vector(y_train)
  y_train <- as.numeric(y_train)

  
  ##########################
  set.seed(58)
  cv.out = cv.glmnet(x_train, y_train, alpha = 1, type.measure='mse')
  
  # Select lamda that minimizes training MSE 
  # Use best lambda to predict test data
  bestlam = cv.out$lambda.min
  
  # Fit lasso model on the test dataset 
  LASSO_fit <- glmnet(x_train, y_train, alpha = 1, lambda = bestlam)
  
  
  #Display coefficients using lambda chosen by CV
  lasso_coef = predict(LASSO_fit, type = "coefficients", s = bestlam)
  
  lasso_coef<-  as.matrix(lasso_coef)
  lasso_coef <- as.data.frame(lasso_coef)
  lasso_coef <- tibble::rownames_to_column(lasso_coef, "metabolite")%>%
    filter(`1` != 0)
  metabolite_predictors <- lasso_coef$metabolite[-1]
  
  return(metabolite_predictors)
}


#' Plot predicted versus actual 
#' @param train
#' @param test
#' @param m_id_sort
#' @export
#' @examples 
#' plot_list <- list() 
#' plot_list <- plot_actual_vs_fit(train, test, m_id_sort) 

plot_actual_vs_fit <- function(train, test, m_id_sort){
  for (i in 1:length(m_id_sort)) {
    
    #identify one response variable
    response <- m_id_sort[i]
    
    #name of the metabolte
    name <- RC12_metinfo %>%
      select(COMP_IDstr, name)%>%
      filter(COMP_IDstr == response)
    name <- name[1,2]
    
    #y
    index <- grep(response, colnames(train)) #extract the index of the column
    y <- train[index]
    y <- unlist(y)
    y <- as.vector(y)
    y <- as.numeric(y)
    
    
    #Training 
    train_df <- train%>%
      select(-sample)
    test_df <- test%>%
      select(-sample)
    
    x_train <- train_df%>%
      select(-response)%>%
      as.matrix()
    class(x_train) <- "numeric"
    
    x_test <- test_df %>%
      select(-response)%>%
      as.matrix()
    class(x_test) <- "numeric"
    
    
    y_train <- pull(train_df, response)
    y_train <- as.vector(y_train)
    y_train <- as.numeric(y_train)
    
    y_test <- pull(test_df, response)
    y_test <- as.vector(y_test)
    y_test <- as.numeric(y_test)
    
    set.seed(58)
    cv.out = cv.glmnet(x_train, y_train, alpha = 1, type.measure='mse')
    
    # Select lamda that minimizes training MSE 
    # Use best lambda to predict test data
    bestlam = cv.out$lambda.min
    LASSO_fit <- glmnet(x_train, y_train, alpha = 1, lambda = bestlam)
    
    lasso_pred <- predict(LASSO_fit, newx = x_test)%>%
      as.data.frame()
    
    data <- cbind(lasso_pred, y_test)%>%
      as.data.frame()
    colnames(data)[1]<-"predicted"
    colnames(data)[2]<-"actual"
    
    lasso_pred <- as.vector(lasso_pred$s0)
    r2= round(cor(y_test, lasso_pred, method='pearson')^2, digits=2)
    r= round(cor(y_test, lasso_pred, method='pearson'), digits=2)
    
    min_xy <- min(min(data$predicted), min(data$actual))
    max_xy <- max(max(data$predicted), max(data$actual))
    
    plot <- ggplot(data,aes(actual, predicted))+
      geom_smooth(method='lm')+
      geom_point(color="black")+
      ggtitle(paste( "Predicted vs. Actual "," (", name, ")"," #", i, " index", index, sep=""), subtitle = paste("R-squared =", r2, "; correlation =", r, sep=""))+
      geom_abline(colour ="red") +
      scale_x_continuous(limits = c(min_xy, max_xy))+
      scale_y_continuous(limits = c(min_xy, max_xy))
    
    
    plot_list[[i]] = plot
  }
  return(plot_list)
}




#' Combine all samples from 7 studies that match the measured metabolites 
#' @param num_study
#' @param df1
#' @param df1_sampleinfo
#' @export
#' @examples 
#' run function in our data
#' data_all_sample  <- combine_all_sample(
#' num_study=7, df1 = RC12, df2 = RC18, df3 = BC_tang, df4 = BC_terunuma, df5=HCC, df6 = DLBCL, df7 = PaCa,  df1_sampleinfo= RC12_sampleinfo, df2_sampleinfo = RC18_sampleinfo, df3_sampleinfo = BC_tang_sampleinfo, df4_sampleinfo = BC_terunuma_sampleinfo, df5_sampleinfo = HCC_sampleinfo, df6_sampleinfo = DLBCL_sampleinfo, df7_sampleinfo= PaCa_sampleinfo )

#function
combine_all_sample <- function(num_study = num_sutdy, study_name = study_name, standardize = "none", df1 = RC12, df2 = RC18, df3 = BC_tang, df4 = BC_terunuma, df5=HCC, df6 = DLBCL, df7 = PaCa, 
                               df1_sampleinfo= RC12_sampleinfo, df2_sampleinfo = RC18_sampleinfo, df3_sampleinfo = BC_tang_sampleinfo, df4_sampleinfo = BC_terunuma_sampleinfo, df5_sampleinfo = HCC_sampleinfo, df6_sampleinfo = DLBCL_sampleinfo, df7_sampleinfo= PaCa_sampleinfo){
  
  m_list <- list(df1, df2, df3, df4, df5, df6, df7)
  m_measured_all <- Reduce(intersect, list(df1$X__1, df2$X__1, df3$X__1, df4$X__1, df5$X__1, df6$X__1, df7$X__1))
  
  sampleinfo_list <- list(df1_sampleinfo, df2_sampleinfo, df3_sampleinfo, df4_sampleinfo, df5_sampleinfo, df6_sampleinfo, df7_sampleinfo)
  
  data_all_sample <-c()
  for (i in 1:num_study){
    #prepare dataset 
    m_list[[i]]<- prepare_data(df = m_list[[i]], df_sampleinfo = sampleinfo_list[[i]], df_sample_name = "SAMPLE_NAME", type = "tumor") 
    #subset
    m_list[[i]] <-  m_list[[i]][m_list[[i]]$metabolite %in% m_measured_all, ] 
    #tidy
    m_list[[i]]  <- df_tidy(m_list[[i]], standardize = standardize)
    #mutate a study name 
    m_list[[i]] <- m_list[[i]]%>%
      mutate(study = study_name[i])
    #combine data from all studies to one dataframe
    data_all_sample <- rbind(data_all_sample, m_list[[i]])
  }
  return(data_all_sample)
}


### Construct confidence score about prediction
#this function generates a plot of Prediction Consistency across cancer types
PC_plot <- function(num_study_measure, model_fit_all){
  penalize <- model_fit_all %>%
    filter(num_metabolite == num_study_measure)%>%
    group_by(metabolite)%>%
    mutate(r2_mean = mean(r2_m2), r2_sd=sd(r2_m2))%>%
    mutate(confidence_1 = r2_mean - 2 * r2_sd^2, diff_1 = r2_mean - confidence_1)
  
  penalize <- penalize[order(-penalize$confidence_1), ]
  m_order_vector <- as.vector(penalize$name)
  m_order_vector_unique <- unique(m_order_vector)
  
  penalize$name <- factor(penalize$name, levels = m_order_vector)
  
  plot <- ggplot(data = penalize, aes(x = name, y = r2_m2)) + 
    geom_boxplot()  + 
    aes(colour = confidence_1) + 
    theme(legend.position = "right") + 
    labs(title = "Prediction Consistency Across Cancer Types/Studies", subtitle = paste( "For metabolites that are measured in ", num_study_measure, " studies", sep="")) + 
    xlab("metabolites")+
    ylab("r-squared")+
    theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 4))
  
  return(plot)
}

# this function generates the vector of the metabolites that are measured in a certain number of studies. 
PC_m <- function (num_study_measure, model_fit_all){
  penalize <- model_fit_all %>%
    filter(num_metabolite == num_study_measure)%>%
    group_by(metabolite)%>%
    mutate(r2_mean = mean(r2_m2), r2_sd=sd(r2_m2))%>%
    mutate(confidence_1 = r2_mean - 2 * r2_sd^2, diff_1 = r2_mean - confidence_1)
  
  penalize <- penalize[order(-penalize$confidence_1), ]
  m_order_vector <- as.vector(penalize$name)
  m_order_vector_unique <- unique(m_order_vector)
  return(m_order_vector_unique)
}