R/gdsc_helpers.R

Defines functions clean_output bind_repression calc_exp GDSC_repress calc_gdsc_np

Documented in bind_repression calc_exp calc_gdsc_np clean_output GDSC_repress

#' function to calculate network potential for every gene/cell line combo in the GDSC
#'
#' @inheritParams compute_np
#'
#' @importFrom foreach %dopar%
#' @importFrom magrittr %>%
#'
#' @export

calc_gdsc_np <- function(cache = NULL, ncores = 1) {
  if(is.null(cache)) {
    stop("Please specify a filepath specifying where to store/retrieve data")
  }
  #just return the processed data if its already been done
  if(file.exists(paste0(cache, "/gdsc_np.Rda"))) {
    load(paste0(cache, "/gdsc_np.Rda"))
    message("using cached processed data")
    return(df_np)
  }
  #grab ppi to use for the scaffold
  g <- crosstalkr::prep_stringdb(cache = cache, min_score = 400)

  #clean gdsc
  df_list <- clean_gdsc(cache = cache)

  #grab expression data
  df_exp <- df_list[[3]]

  #get list of cell lines to iterate over
  cell_lines <- unique(df_exp$cell_line)

  #set up parallel execution
  cl <- parallel::makeCluster(ncores)
  doParallel::registerDoParallel(cl)
  out_list <-
    foreach::foreach(i = 1:length(cell_lines)) %dopar% {
      #isolate one cell line
      df_i <- df_exp %>%
        dplyr::filter(.data$cell_line == cell_lines[i])

      #add np to df_i for that cell line
      disruptr::calc_np_i(df_i, g = g)

    }
  #close out parallel execution
  parallel::stopCluster(cl)
  df_np = dplyr::bind_rows(out_list)

  save(df_np, file = paste0(cache, "/gdsc_np.Rda"))
  return(df_np)
}

#' function to do in-silico repression for a given gene on all cell lines in the GDSC
#'
#' @inheritParams calc_gdsc_np
#' @param v_rm which node should we remove
#'
#' @importFrom foreach %dopar%
#' @importFrom magrittr %>%
#'
#' @export


GDSC_repress <- function(cache = NULL, v_rm = "EGFR", ncores = 1) {

  if(is.null(cache)) {
    stop("Please specify a filepath specifying where to store/retrieve data")
  }

  if(file.exists(paste0(cache, "/", v_rm, "_gdsc_np.Rda"))) {
    load(paste0(cache, "/", v_rm, "_gdsc_np.Rda"))
    message("using cached processed data")
    return(df_np)
  }

  #get PPI
  g <- crosstalkr::prep_stringdb(cache = cache, min_score = 400)

  #get or compute network potential for gdsc
  df <- calc_gdsc_np(cache = cache, ncores = ncores)

  #compute the change in np for EGFR for each cell line.
  cell_lines <- unique(df$cell_line)

  #set up parallel computing
  cl <- parallel::makeCluster(ncores)
  doParallel::registerDoParallel(cl)
  out_list <-
    foreach::foreach(i = 1:length(cell_lines),
                     .packages = "disruptr") %dopar% {

                       #get named expression vector
                       exp <- calc_exp(df = df, cell_lines = cell_lines, i = i)

                       #calculate change in np when v_rm is removed
                       repression_vector <- node_repression(g = g, v_rm = v_rm,
                                                            state_function = calc_np_all,
                                                            exp = exp,
                                                            neighbors_only = TRUE)

                       #clean up and bind
                       bind_repression(df = df, cell_lines = cell_lines, i = i,
                                       v_rm = v_rm,
                                       repression_vector = repression_vector)
                     }
  parallel::stopCluster(cl)

  #bind the data frame back together
  df_np = dplyr::bind_rows(out_list)

  #save the output
  save(df_np, file = paste0(cache, "/", v_rm, "_gdsc_np.Rda"))

  #exit function
  return(df_np)

}

#' helper function to grab the named expression vector for a given cell line
#'
#' @param df tidy dataframe containing expression data for a number of cell lines
#' @param cell_lines unique vector of cell lines
#' @param i index value for `cell_lines
#'
#' @export

calc_exp <- function(df, cell_lines, i) {
  df_i <- dplyr::filter(df, .data$cell_line == cell_lines[i])
  exp <- df_i$log_expression
  names(exp) <- df_i$gene_symbols
  return(exp)
}

#' helper function to bind the repression vector to df_i in gdsc_repress
#'
#' @inheritParams calc_exp
#' @param v_rm which node should we remove?
#' @param repression_vector output of node_repression function in gdsc_repress
#'
#' @export

bind_repression <- function(df, cell_lines, i, repression_vector, v_rm) {
  df_i <- dplyr::filter(df, .data$cell_line == cell_lines[i])
  df_repression <- data.frame(gene_symbols = names(repression_vector),
                              np_diff = repression_vector)

  df_i <- dplyr::left_join(df_i, df_repression)
  df_i <- dplyr::mutate(df_i,
                        np_diff = ifelse(.data$gene_symbols == v_rm, abs(.data$np), .data$np_diff),
                        np_diff = ifelse(is.na(.data$np_diff), 0, .data$np_diff))
  return(df_i)

}

#' function to summarise output from `GDSC_repress`
#'
#' @param df data frame containing NP difference from in-silico repression of a given gene
#'
#' Computes the total network potential/ difference in network potential with repression of a given gene,
#' expression for the repressed gene, network potential for the repressed gene.
#'
#' @importFrom magrittr %>%
#'
#' @export

clean_output <- function(df) {
  df <- df %>%
    dplyr::filter(!is.na(.data$np),
                  !is.infinite(.data$np)) %>%
    dplyr::group_by(.data$cell_line) %>%
    dplyr::summarise(total_np = sum(.data$np),
                     diff_np = sum(.data$np_diff),
                     egfr_exp = sum(ifelse(
                       .data$gene_symbols == 'EGFR',
                       .data$log_expression,
                       0)),
                     egfr_np = sum(ifelse(
                       .data$gene_symbols == 'EGFR',
                       .data$np,
                       0))) %>%
    dplyr::mutate(diff_np_scaled = .data$diff_np/.data$total_np)
}
DavisWeaver/disruptr documentation built on Jan. 29, 2023, 1:04 a.m.