R/depmap.R

Defines functions deepPCA deepResult deepFilter deepInfo

Documented in deepFilter deepPCA deepResult

# Install Package: Ctrl + Shift + B
# devtools::document()

# Print a short overview of the three workflow functions (what each one takes
# and what it gives back) to the console. Called for its side effect only.
deepInfo <- function(){
  overview <- c(
    "",
    "  deepFilter |||takes: attribute specifics |||gives: cell line table       |",
    "  deepResult |||takes: deepFilter output   |||gives: data about cell lines |",
    "  deepPCA    |||takes: deepResults output  |||gives: PCA                   |",
    "      "
  )
  # joined with newlines this is exactly the multi-line banner, including the
  # leading blank line and the trailing indent without a final newline
  cat(paste(overview, collapse="\n"))
}

#' Filter the depmap annotation file for cell lines of interest.
#'
#' Text arguments are regular expressions matched with \code{grepl()}, so use
#' \code{^} and \code{$} to anchor the start and end of a term if need be.
#' An argument left as \code{NA} adds no filter. A vector with several terms
#' keeps the rows that match any of them. Numeric arguments are thresholds;
#' rows whose value is missing are dropped whenever the threshold is active.
#' The age filter is always active, so cell lines with unknown age are never
#' returned.
#'
#' @param DepMap_ID e.g. "ACH-000012"
#' @param stripped_cell_line_name e.g. "HCC827"
#' @param CCLE_Name e.g. "HCC827_LUNG"
#' @param alias alias
#' @param COSMIC_ID e.g. "1240146"
#' @param lineage lineage, e.g. "leukemia" or "lung"
#' @param lineage_subtype sublineage, e.g. "AML", "ALL", etc.
#' @param lineage_sub_subtype Subsublineage
#' @param lineage_molecular_subtype lineage molecular subtype
#' @param sex either "Male" or "Female"
#' @param source e.g. "Sanger", "DSMZ"
#' @param Achilles_n_replicates minimum number of replicates
#' @param max_cell_line_NNMD negative number, largest NNMD value to keep (inclusive)
#' @param min_cell_line_NNMD negative number, smallest NNMD value to keep (inclusive). The default \code{-Inf} disables this filter.
#' @param culture_type mostly "Adherent" or "Suspension"
#' @param culture_medium culture medium
#' @param cas9_activity minimum Cas9 activity, positive number
#' @param RRID e.g. "CVCL_0001"
#' @param sample_collection_site e.g. "lung"
#' @param primary_or_metastasis either "Primary" or "Metastasis"
#' @param disease disease of the patient, e.g. "Lung Cancer"
#' @param disease_subtype subdisease of the patient
#' @param ageMin integer, minimum patient age
#' @param ageMax integer, maximum patient age
#' @param Sanger_model_ID e.g. "SIDM01067"
#' @param additional_info additional information
#' @return A data.frame where each row is a human cell line, and each column is a parameter of information about the cell line (e.g. ID, tissue, etc.).
#' @examples
#' head(deepFilter())
#' a <- deepFilter(lineage_subtype="^AML$", ageMin=60)
deepFilter <- function(DepMap_ID=NA, stripped_cell_line_name=NA, CCLE_Name=NA, alias=NA, COSMIC_ID=NA,
                       lineage=NA, lineage_subtype=NA, lineage_sub_subtype=NA, lineage_molecular_subtype=NA,
                       sex=NA, source=NA, Achilles_n_replicates=NA,
                       max_cell_line_NNMD=NA, min_cell_line_NNMD= -Inf,
                       culture_type=NA, culture_medium=NA, cas9_activity=NA, RRID=NA, sample_collection_site=NA,
                       primary_or_metastasis=NA, disease=NA, disease_subtype=NA, ageMin=0, ageMax=200, Sanger_model_ID=NA, additional_info=NA){

  ## Column order that the positional lookup of this function has always used.
  ## NOTE(review): this order does not contain `age`; if `age` sits before
  ## Sanger_model_ID in dmMeta, a purely positional lookup would be off by one
  ## for the last two filters. Lookup by name (below) avoids that - confirm
  ## against the actual dmMeta layout.
  column_order <- c("DepMap_ID", "stripped_cell_line_name", "CCLE_Name", "alias", "COSMIC_ID",
                    "lineage", "lineage_subtype", "lineage_sub_subtype", "lineage_molecular_subtype",
                    "sex", "source", "Achilles_n_replicates", "cell_line_NNMD",
                    "culture_type", "culture_medium", "cas9_activity", "RRID", "sample_collection_site",
                    "primary_or_metastasis", "disease", "disease_subtype", "Sanger_model_ID", "additional_info")

  ## Use the column of that name when the table has one, otherwise fall back
  ## to the historical column position.
  locate <- function(tbl, column){
    if(column %in% names(tbl)) column else match(column, column_order)
  }

  ## Keep the rows where compare(value, bound) holds. A missing or non-finite
  ## bound (NA, NULL, +/-Inf) means "no filter" and returns tbl unchanged.
  keep_numeric <- function(tbl, column, bound, compare){
    bound <- suppressWarnings(as.numeric(bound))
    if(length(bound) != 1L || !is.finite(bound)) return(tbl)
    values <- suppressWarnings(as.numeric(as.character(tbl[[locate(tbl, column)]])))
    ## which() drops rows whose value is NA instead of producing NA rows
    tbl[which(compare(values, bound)), , drop=FALSE]
  }

  ## Regex filters. A list (not c()) keeps exactly one slot per argument even
  ## if an argument is NULL or holds several terms, so terms can never end up
  ## being applied to a neighbouring column.
  patterns <- list(DepMap_ID=DepMap_ID, stripped_cell_line_name=stripped_cell_line_name,
                   CCLE_Name=CCLE_Name, alias=alias, COSMIC_ID=COSMIC_ID,
                   lineage=lineage, lineage_subtype=lineage_subtype,
                   lineage_sub_subtype=lineage_sub_subtype,
                   lineage_molecular_subtype=lineage_molecular_subtype,
                   sex=sex, source=source,
                   culture_type=culture_type, culture_medium=culture_medium,
                   RRID=RRID, sample_collection_site=sample_collection_site,
                   primary_or_metastasis=primary_or_metastasis, disease=disease,
                   disease_subtype=disease_subtype, Sanger_model_ID=Sanger_model_ID,
                   additional_info=additional_info)

  ## start from dmMeta, keeping only rows whose age is known and within range
  output <- as.data.frame(dmMeta)
  age <- output[["age"]]
  output <- output[which(age >= ageMin & age <= ageMax), , drop=FALSE]

  ## numeric thresholds
  output <- keep_numeric(output, "Achilles_n_replicates", Achilles_n_replicates, `>=`)
  output <- keep_numeric(output, "cas9_activity", cas9_activity, `>=`)
  output <- keep_numeric(output, "cell_line_NNMD", max_cell_line_NNMD, `<=`)
  output <- keep_numeric(output, "cell_line_NNMD", min_cell_line_NNMD, `>=`)

  ## regex filters, one column at a time
  for(column in names(patterns)){
    terms <- patterns[[column]]
    terms <- terms[!is.na(terms)]
    if(length(terms) == 0L) next
    ## several terms for one column are alternatives
    keep <- grepl(paste(terms, collapse="|"), output[[locate(output, column)]])
    output <- output[keep, , drop=FALSE]
  }
  output

}

#' Get the results for a certain filter.
#'
#' @param cellLines Any table with a \code{DepMap_ID} column (ACH-000002 etc.) as its 1st column and cell line names in its 2nd column. Ideally generated by deepFilter().
#' @param dataset Any table with a \code{DepMap_ID} column followed by one column per feature. Inbuilt are dmExpr (default), dmDep, dmDrug and dmProt.
#' @return A data.frame where each row is a feature of the dataset (e.g. a gene) and each column is a cell line. If the feature names end in "(...)", the text in parentheses is moved into a leading \code{EntrezID} column and removed from the row names.
#' @examples
#' a <- deepFilter(lineage="leukemia", ageMin=60)
#' b <- deepResult(a, dmDep)
deepResult <- function(cellLines, dataset=dmExpr){

  ## convert to a plain data.frame so that `[` below behaves the same for
  ## data.frame, data.table and tibble input
  merged <- as.data.frame(merge(cellLines[,1:2], dataset, by="DepMap_ID"))

  ## columns 1 and 2 are the ID and the cell line name; everything else is
  ## data. drop=FALSE keeps the feature name when there is only one data column.
  output <- as.data.frame(t(merged[, -(1:2), drop=FALSE]))
  colnames(output) <- unlist(merged[,2])

  ## for dmExpr and dmDep (feature names like "GENE (1234)"), change rownames
  ## and add an EntrezID column
  if( grepl( "\\)$", colnames(dataset)[2] ) ){
    features <- rownames(output)
    output <- cbind(EntrezID=as.character(gsub("^.*\\(", "", gsub("\\)", "", features))), output)
    rownames(output) <- as.character(gsub(" \\(.*\\)$", "", features))
  }

  output

}

#' Generate a PCA plot from DepMap data
#'
#' @param deepResult A table with features (e.g. genes) in rows and one column per cell line, named by cell line name. A leading \code{EntrezID} column is ignored. Ideally generated by deepResult().
#' @param groupBy name of the metadata column used to colour the points. If several names are given, all of them are attached to the plot data and the last one is used for the colour. Leave as NA for an uncoloured plot.
#' @param size dot size
#' @param deepFilter table with metadata about cell lines; needs a \code{stripped_cell_line_name} column. Can be generated with deepFilter(), or leave as NA to use dmMeta.
#' @param labels logical, whether to label each point with its cell line name (uses ggrepel)
#' @return ggplot2 object. Legend title can be set with ggplot2::labs(color=...). Axis labels can be set with ggplot2::xlab(...) and ylab(...)
#' @examples
#' df1 <- deepFilter(disease="Leukemia")
#' df2 <- deepResult(df1)
#' pca1 <- deepPCA(df2, "lineage_subtype")
deepPCA <- function(deepResult, groupBy=NA, size=5, deepFilter=NA, labels=FALSE){

  # drop the annotation column so that only cell line columns remain
  if(identical(colnames(deepResult)[1], "EntrezID")){
    deepResult <- deepResult[, -1, drop=FALSE]
  }

  # metadata source: dmMeta unless a table was passed. Testing for a scalar NA
  # explicitly avoids calling if() on a whole table.
  use_default <- is.null(deepFilter) ||
    (is.atomic(deepFilter) && length(deepFilter) == 1L && is.na(deepFilter))
  meta <- as.data.frame(if(use_default) dmMeta else deepFilter)

  # column that colours the points (NULL = no colouring)
  groupBy <- groupBy[!is.na(groupBy)]
  colour_by <- if(length(groupBy) > 0L) groupBy[length(groupBy)] else NULL

  # perform PCA on data (features with missing values are removed first)
  pca1 <- prcomp( t(na.omit(deepResult)), center=TRUE )
  cells <- rownames(pca1$x)

  # look the metadata up by cell line name so that every PCA row gets its own
  # annotation regardless of the row order of the metadata table; cell lines
  # without metadata get NA
  meta <- meta[match(cells, meta$stripped_cell_line_name), groupBy, drop=FALSE]
  pca2 <- cbind( as.data.frame(pca1$x[,1:2]), cells=cells, meta)

  # get PC variance percentages of the first two components
  variance <- pca1$sdev^2
  percs <- 100*round(variance[1:2]/sum(variance), 3)

  # plot
  mapping <- if(is.null(colour_by)){
    ggplot2::aes(x=PC1, y=PC2)
  }else{
    ggplot2::aes(x=PC1, y=PC2, color=.data[[colour_by]])
  }
  plot1 <- ggplot2::ggplot( pca2, mapping ) +
    ggplot2::geom_point(size=size) +
    ggplot2::xlab(paste0("PC1 (", percs[1], " %)")) +
    ggplot2::ylab(paste0("PC2 (", percs[2], " %)"))
  if(!is.null(colour_by)){
    plot1 <- plot1 + ggplot2::labs(color=colour_by) # legend title
  }
  if(labels){
    plot1 <- plot1 +
      ggrepel::geom_label_repel(data = pca2, ggplot2::aes(label=cells), min.segment.length = 0.25, force = 6,
                                point.padding = 0.5, size=size, segment.size=0.5, nudge_x=-5, fill="white")
  }
  plot1
}
Solatar/setR documentation built on Dec. 5, 2020, 10:50 p.m.