R/compute_associations.R

Defines functions compute_lm_associations compute_microbial_feature_proportions compute_microbial_feature_proportions_helper

suppressMessages(library(dplyr))
suppressMessages(library(purrr))
suppressMessages(library(tidyr))
suppressMessages(library(magrittr))
suppressMessages(library(broom))
suppressMessages(library(stringr))

# Workflow
# For each microbial feature (col in data type's tibble):
#   For each phenotype's modeldfs:
#     For each dataset:
#     1) add dataset_name to all tibbles
#     2) left_join matching ID's abundances using dataset_name and IDs
#     3) If proportion of non-zero values for abundances is at least 1,
#          mutate new col with tidied glm fit summary on model df that now has 1 feature tacked on
#     4) rename column as desired

# Performs above workflow on df outputted by cmd_metadata_metaphlan_v2.Rmd, and transposed, separated metaphlan df outputted
# by same Rmd file.


# For a single feature and single modeldf outputted by prune_metadata.R, computes the proportion of non-zero values per cohort.
compute_microbial_feature_proportions_helper <- function(model_ready_df, metaphlan_df, feature_num) {
  #browser()
  return(
    length(
      which((left_join(model_ready_df %>% mutate_if(is.factor, as.character),
                       metaphlan_df %>% mutate(dataset_name=str_replace(dataset_name, "gene_families_", "")) %>% select(c(1,2, (feature_num + 2))),
                       by = c("dataset_name", "sampleID")))[[length(model_ready_df) + 1]] > 0)
    ) / nrow(model_ready_df)
  )
}

compute_microbial_feature_proportions <- function(df, metaphlan_df){
  # adding dataset name to nested tibble
  df$model_selected_df <- map(c(seq(from = 1, to = nrow(df), by = 1)), function(x) as.data.frame(df[[x,2]]) %>% mutate(dataset_name = df[[x,1]]))
  for (j in seq_along(metaphlan_df %>% select(-1, -2))) {
    # adding 1 feature, fitting, and tidying summary
    new_feature_name <- colnames(metaphlan_df %>% select(-1, -2) %>% select(j))
    df %>% mutate(
      new_feature = map_dbl(df$model_selected_df, function(x)
        compute_microbial_feature_proportions_helper(x, metaphlan_df, j)
      )
    )
    colnames(df)[length(colnames(df))] <- paste0("feature_", toString(new_feature_name)) # last step to rename feature to desired variable input name
  }
  return(df)
}

# Takes model_ready_df (output of prune_metadata.R) and a transformed data_type file (outputted from pre_data_for_pipieline.R)
# and computes associations one-by-one and outputs in a tibble.
compute_lm_associations <- function(df, metaphlan_df,phenotype){
  # find fudge factor for logging
  fudge_factor <- min((metaphlan_df %>% select(-1, -2) %>% unlist())[which((metaphlan_df %>% select(-1, -2) %>% unlist()) > 0)])
  # clean dataset name folder if working with gene families
  #iterate through each cohort for each feature

  #This line just cleans the DF from dataset names that aren't in the metadata but are in the CAGS (we shouldn't have that issue)
  #df=df %>% filter(dataset_name %in% unique(gsub('gene_families_','',metaphlan_df$dataset_name)))
  df$model_selected_df <- map(c(seq(from = 1, to = nrow(df), by = 1)), function(x) data.frame(df[[x,2]]) %>% mutate(dataset_name = df[[x,1]]))
  for (j in seq_along(metaphlan_df %>% select(-1, -2))) {
    # adding 1 feature, fitting, and tidying summary
    new_feature_name <- colnames(metaphlan_df %>% select(-1, -2) %>% select(j))
    print(str_c("feature no.: ", new_feature_name)) # print feature no. currently working on
    df %<>% mutate(new_feature = map(df$model_selected_df, function(x)
      if (compute_microbial_feature_proportions_helper(x, metaphlan_df, j) > 0) { # prorportion must be non-zero or else no response to model
        regression_df=left_join(as.data.frame(x) %>% mutate_if(is.factor, as.character), metaphlan_df %>% mutate(dataset_name=str_replace(dataset_name, "gene_families_", "")) %>% select(c(1,2, (j + 2))),by = c("dataset_name", "sampleID")) %>% select(-dataset_name) %>% mutate_if(is.character, as.factor) %>% drop_na()
        ################averaging logic
        subnum=length(unique(regression_df$subjectID))
        samnum=length(unique(regression_df$sampleID))
        #check if the number of cases vs subjects differs and if so, average across samples, dropping columns that have multiple values for the same subject or, as a result of the averaging, contain only 1 value.
        if(subnum!=samnum){
          #get case subjects and remove ones that are pre disease (this can be changed if we want to compare health controls to pre disease individuals)
          cases=regression_df %>% filter(study_condition==1) %>% select(subjectID) %>% mutate_if(is.factor, as.character) %>% unique() %>% unlist %>% unname()
          regression_df_cases = regression_df %>% filter(subjectID %in% cases) %>% filter(study_condition != 0) %>% select(-new_feature_name)
          case_samples=regression_df_cases %>% select(sampleID) %>% mutate_if(is.factor,as.character) %>% unlist %>% unname
          #average
          regression_df_cases_numeric_averaged = regression_df_cases %>%group_by(subjectID) %>% select_if(is.numeric) %>% summarise_each(mean)
          regression_df_cases_character_averaged = regression_df_cases %>% select_if(negate(is.numeric)) %>% select(-sampleID) %>% distinct() %>% mutate_if(is.factor, as.character)
          subjects=unique(regression_df_cases_character_averaged$subjectID)
          toremove=list()
          for(s in subjects){
            regression_df_cases_character_averaged_sub = regression_df_cases_character_averaged %>% filter(subjectID==s)
            temp=apply(regression_df_cases_character_averaged_sub, 2, function(x) length(unique(x)))
            temp=names(temp[temp>1])
            if(length(temp)>0){
              toremove[s]=temp
            }
            toremove=unique(unname(unlist(toremove)))
            if(length(toremove)>0){
              regression_df_cases_character_averaged=regression_df_cases_character_averaged %>% select(-c(toremove)) %>% distinct()
            }
            cases_df = full_join(regression_df_cases_numeric_averaged,regression_df_cases_character_averaged,on=subjectID) %>% mutate_if(is.character,as.factor)

            #get control subjects
            controls=regression_df %>% filter(study_condition==0) %>% select(subjectID) %>% mutate_if(is.factor, as.character) %>% unique() %>% unlist %>% unname()
            regression_df_controls = regression_df %>% filter(subjectID %in% controls) %>% filter(study_condition != 1) %>% select(-new_feature_name)
            control_samples=regression_df_controls %>% select(sampleID) %>% mutate_if(is.factor,as.character) %>% unlist %>% unname
            regression_df_controls_numeric_averaged = regression_df_controls %>%group_by(subjectID) %>% select_if(is.numeric) %>% summarise_each(mean)

            regression_df_controls_character_averaged = regression_df_controls %>% select_if(negate(is.numeric)) %>% select(-sampleID) %>% distinct() %>% mutate_if(is.factor, as.character)
            subjects=unique(regression_df_controls_character_averaged$subjectID)
            toremove=list()
            for(s in subjects){
              regression_df_controls_character_averaged_sub = regression_df_controls_character_averaged %>% filter(subjectID==s)
              temp=apply(regression_df_controls_character_averaged_sub, 2, function(x) length(unique(x)))
              temp=names(temp[temp>1])
              if(length(temp)>0){
                toremove[s]=temp
              }
            }
            toremove=unique(unname(unlist(toremove)))
            if(length(toremove)>0){
              regression_df_controls_character_averaged=regression_df_controls_character_averaged %>% select(-c(toremove)) %>% distinct()
            }
            controls_df = full_join(regression_df_controls_numeric_averaged,regression_df_controls_character_averaged,on=subjectID) %>% mutate_if(is.character,as.factor)

            #average across microbial feature
            regression_df_mf = regression_df %>% filter(sampleID %in% case_samples | sampleID %in% control_samples) %>% select(subjectID,new_feature_name) %>%group_by(subjectID) %>% summarise_all(funs(mean=mean,sd=sd))
            #write.csv(regression_df_mf,paste('averaging_output/',phenotype,'_',datatype,'_',new_feature_name,'.csv',sep=''))
            regression_df_mf=regression_df_mf %>% select(subjectID,mean)
            colnames(regression_df_mf)[2]=new_feature_name

            #merge
            regression_df = bind_rows(cases_df,controls_df) %>% mutate_if(is.character,as.factor)
            regression_df=full_join(regression_df,regression_df_mf,on='subjectID')
          }
        }
        else{
          regression_df %<>% select(-sampleID)
        }

        regression_df %<>% select(-subjectID)
        #remove HLA variable from initial maximal models
        tryCatch({
          regression_df %<>% select(-HLA)

        },error=function(e)
        {
          regression_df=regression_df
        })
        #run regression
        regression_df=regression_df %>% drop_na %>% select_if(~ length(unique(.)) > 1)

        # Added to change column name study_condition of example data to "disease" so it matches iyr data
        regression_df <- regression_df %>%
          rename(
            disease = study_condition
          )
        return(tryCatch(tidy(stats::lm(as.formula(str_c("I(log10(`", new_feature_name, "` + ", toString(fudge_factor), ")) ~ disease")), # fit lm and get tidy outputs, excluding ID vars (sampleID and subjectID)
                                       data = regression_df))

                        ,
                        # NOTE: tidy() ends up removing and singularity fits (e.g. NA fits from a feature that, for the dataset's samples all have 0 relative abundance)
                        warning = function(w) w, # if warning or error, just return them instead of output
                        error = function(e) e
        )
        )
      } else {
        return(simpleError(str_c("No non-zero values for ", new_feature_name, " to be considered in ", x$dataset_name[[1]], "."))
        )
      }
    )
    )
    colnames(df)[length(colnames(df))] <- paste0("feature_", toString(new_feature_name)) # last step to rename feature to desired variable input name
  }
  return(df)
}

compute_assocations <- function(metadata, abundance_data, pheno, logger) {
  # Example Data
  #args <- c('1_prepared_metadata.rds', '2020_T2D_Data/abundance_data.rds', 'Karlsson_associations.rds','CRC')
  cmd_file <- as_tibble(metadata)
  datatype_file <- as_tibble(abundance_data)
  output <- compute_lm_associations(cmd_file, datatype_file,pheno)
  return(output)
}
elizabethcook21/voe documentation built on Sept. 8, 2020, 11:40 a.m.