R/exampleGDSC.R

#' @title Preprocessed data set to mimic a small pharmacogenomic example
#'
#' @description
#' Preprocessed data set to mimic a small pharmacogenetic example from the 
#' Genomics of Drug Sensitivity in Cancer (GDSC) database, with p=850 gene 
#' features as explanatory variables, s=7 drugs sensitivity data as response 
#' variables and data for n=498 cell lines. Gene features include p1=343 gene 
#' expression features (GEX), p2=426 by copy number variations (CNV) and p3=68 
#' mutated genes (MUT). Loading the data will load the associated blockList 
#' (and mrfG) objects needed to fit the model with BayesSUR(). The R code for 
#' generating the simulated data is given in the Examples paragraph.
#'
#' #importFrom plyr mapvalues
#' #importFrom data.table like
#'
#' @examples
#' # Load the GDSC sample dataset
#' data("exampleGDSC", package = "BayesSUR")
#' str(exampleGDSC)
#'
#' \dontrun{
#' # ===============
#' # This code below is to do preprocessing of GDSC data and obtain the complete dataset
#' # "exampleGDSC.rda" above. The user needs load the datasets from
#' # https://www.cancerrxgene.org release 5.
#' # But downloading and transforming the three used datasets below to *.csv files first.
#' # ===============
#'
#' requireNamespace("plyr", quietly = TRUE)
#' requireNamespace("data.table", quietly = TRUE)
#'
#'
#' features <- data.frame(read.csv("/gdsc_en_input_w5.csv", head = T))
#' names.fea <- strsplit(rownames(features), "")
#' features <- t(features)
#' p <- c(13321, 13747 - 13321, 13818 - 13747)
#' Cell.Line <- rownames(features)
#' features <- data.frame(Cell.Line, features)
#'
#' ic50_00 <- data.frame(read.csv("gdsc_drug_sensitivity_fitted_data_w5.csv", head = T))
#' ic50_0 <- ic50_00[, c(1, 4, 7)]
#' drug.id <- data.frame(read.csv("gdsc_tissue_output_w5.csv", head = T))[, c(1, 3)]
#' drug.id2 <- drug.id[!duplicated(drug.id$drug.id), ]
#' # delete drug.id=1066 since ID1066 and ID156 both correspond drug AZD6482,
#' # and no ID1066 in the "suppl.Data1" by Garnett et al. (2012)
#' drug.id2 <- drug.id2[drug.id2$drug.id != 1066, ]
#' drug.id2$drug.name <- as.character(drug.id2$drug.name)
#' drug.id2$drug.name <- substr(drug.id2$drug.name, 1, nchar(drug.id2$drug.name) - 6)
#' drug.id2$drug.name <- gsub(" ", "-", drug.id2$drug.name)
#'
#' ic50 <- ic50_0
#' # mapping the drug_id to drug names in drug sensitivity data set
#' ic50$drug_id <- plyr::mapvalues(ic50$drug_id, from = drug.id2[, 2], to = drug.id2[, 1])
#' colnames(ic50) <- c("Cell.Line", "compound", "IC50")
#'
#' # transform drug sensitivity overall cell lines to a data matrix
#' y0 <- reshape(ic50, v.names = "IC50", timevar = "compound", 
#'               idvar = "Cell.Line", direction = "wide")
#' y0$Cell.Line <- gsub("-", ".", y0$Cell.Line)
#'
#' # ===============
#' # select nonmissing pharmacological data
#' # ===============
#' y00 <- y0
#' m0 <- dim(y0)[2] - 1
#' eps <- 0.05
#' # r1.na is better to be not smaller than r2.na
#' r1.na <- 0.3
#' r2.na <- 0.2
#' k <- 1
#' while (sum(is.na(y0[, 2:(1 + m0)])) > 0) {
#'   r1.na <- r1.na - eps / k
#'   r2.na <- r1.na - eps / k
#'   k <- k + 1
#'   ## select drugs with <30% (decreasing with k) missing data overall cell lines
#'   na.y <- apply(y0[, 2:(1 + m0)], 2, function(xx) sum(is.na(xx)) / length(xx))
#'   while (sum(na.y < r1.na) < m0) {
#'     y0 <- y0[, -c(1 + which(na.y >= r1.na))]
#'     m0 <- sum(na.y < r1.na)
#'     na.y <- apply(y0[, 2:(1 + m0)], 2, function(xx) sum(is.na(xx)) / length(xx))
#'   }
#'
#'   ## select cell lines with treatment of at least 80% (increasing with k) drugs
#'   na.y0 <- apply(y0[, 2:(1 + m0)], 1, function(xx) sum(is.na(xx)) / length(xx))
#'   while (sum(na.y0 < r2.na) < (dim(y0)[1])) {
#'     y0 <- y0[na.y0 < r2.na, ]
#'     na.y0 <- apply(y0[, 2:(1 + m0)], 1, function(xx) sum(is.na(xx)) / length(xx))
#'   }
#'   num.na <- sum(is.na(y0[, 2:(1 + m0)]))
#'   message("#{NA}=", num.na, "\n", "r1.na =", r1.na, ", r2.na =", r2.na, "\n")
#' }
#'
#' # ===============
#' # combine drug sensitivity, tissues and molecular features
#' # ===============
#' yx <- merge(y0, features, by = "Cell.Line")
#' names.cell.line <- yx$Cell.Line
#' names.drug <- colnames(yx)[2:(dim(y0)[2])]
#' names.drug <- substr(names.drug, 6, nchar(names.drug))
#' # numbers of gene expression features, copy number festures and muatation features
#' p <- c(13321, 13747 - 13321, 13818 - 13747)
#' num.nonpen <- 13
#' yx <- data.matrix(yx[, -1])
#' y <- yx[, 1:(dim(y0)[2] - 1)]
#' x <- cbind(yx[, dim(y0)[2] - 1 + sum(p) + 1:num.nonpen], yx[, dim(y0)[2] - 1 + 1:sum(p)])
#'
#' # delete genes with only one mutated cell line
#' x <- x[, 
#'   -c(num.nonpen + p[1] + p[2] + which(colSums(x[, num.nonpen + p[1] + p[2] + 1:p[3]]) <= 1))]
#' p[3] <- ncol(x) - num.nonpen - p[1] - p[2]
#'
#' GDSC <- list(
#'   y = y, x = x, p = p, num.nonpen = num.nonpen, names.cell.line = names.cell.line,
#'   names.drug = names.drug
#' )
#'
#'
#' ## ================
#' ## ================
#' ## select a small set of drugs
#' ## ================
#' ## ================
#'
#' name_drugs <- c(
#'   "Methotrexate", "RDEA119", "PD-0325901", "CI-1040", "AZD6244", "Nilotinib",
#'   "Axitinib"
#' )
#'
#' # extract the drugs' pharmacological profiling and tissue dummy
#' YX0 <- cbind(GDSC$y[, colnames(GDSC$y) %in% paste("IC50.", name_drugs, sep = "")]
#' [, c(1, 3, 6, 4, 7, 2, 5)], GDSC$x[, 1:GDSC$num.nonpen])
#' colnames(YX0) <- c(name_drugs, colnames(GDSC$x)[1:GDSC$num.nonpen])
#' # extract the genetic information of CNV & MUT
#' X23 <- GDSC$x[, GDSC$num.nonpen + GDSC$p[1] + 1:(p[2] + p[3])]
#' colnames(X23)[1:p[2]] <- paste(substr(
#'   colnames(X23)[1:p[2]], 1,
#'   nchar(colnames(X23)[1:p[2]]) - 3
#' ), ".CNV", sep = "")
#'
#' # locate all genes with CNV or MUT information
#' name_genes_duplicate <- c(
#'   substr(colnames(X23)[1:p[2]], 1, nchar(colnames(X23)[1:p[2]]) - 4),
#'   substr(colnames(X23)[p[2] + 1:p[3]], 1, nchar(colnames(X23)[p[2] + 1:p[3]]) - 4)
#' )
#' name_genes <- name_genes_duplicate[!duplicated(name_genes_duplicate)]
#'
#' # select the GEX which have the common genes with CNV or MUT
#' X1 <- 
#'   GDSC$x[, GDSC$num.nonpen + which(colnames(GDSC$x)[GDSC$num.nonpen + 1:p[1]] %in% name_genes)]
#'
#' p[1] <- ncol(X1)
#' X1 <- log(X1)
#'
#' # summary the data information
#' exampleGDSC <- list(data = cbind(YX0, X1, X23))
#' exampleGDSC$blockList <- list(
#'   1:length(name_drugs), length(name_drugs) + 1:GDSC$num.nonpen,
#'   ncol(YX0) + 1:sum(p)
#' )
#'
#' # ========================
#' # construct the G matrix: edge potentials in the MRF prior
#' # ========================
#'
#' # edges between drugs: Group1 ("RDEA119","17-AAG","PD-0325901","CI-1040" and "AZD6244")
#' # indexed as (2:5)
#' # http://software.broadinstitute.org/gsea/msigdb/cards/KEGG_MAPK_SIGNALING_PATHWAY
#' pathway_genes <- read.table("MAPK_pathway.txt")[[1]]
#' Idx_Pathway1 <- which(c(colnames(X1), name_genes_duplicate) %in% pathway_genes)
#' Gmrf_Group1Pathway1 <- t(combn(rep(Idx_Pathway1, each = length(2:5)) +
#'   rep((2:5 - 1) * sum(p), times = length(Idx_Pathway1)), 2))
#'
#' # edges between drugs: Group2 ("Nilotinib","Axitinib") indexed as (6:7)
#' # delete gene ABL2
#' Idx_Pathway2 <- which(c(colnames(X1), name_genes_duplicate) %like% "BCR" |
#'   c(colnames(X1), name_genes_duplicate) %like% "ABL")[-c(3, 5)]
#' Gmrf_Group2Pathway2 <- t(combn(rep(Idx_Pathway2, each = length(6:7)) +
#'   rep((6:7 - 1) * sum(p), times = length(Idx_Pathway2)), 2))
#'
#' # edges between the common gene in different data sources
#' Gmrf_CommonGene <- NULL
#' list_CommonGene <- list(0)
#' k <- 1
#' for (i in 1:length(name_genes)) {
#'   Idx_CommonGene <- which(c(colnames(X1), name_genes_duplicate) == name_genes[i])
#'   if (length(Idx_CommonGene) > 1) {
#'     Gmrf_CommonGene <- rbind(Gmrf_CommonGene, 
#'     t(combn(rep(Idx_CommonGene, each = length(name_drugs))
#'     + rep((1:length(name_drugs) - 1) * sum(p), times = length(Idx_CommonGene)), 2)))
#'     k <- k + 1
#'   }
#' }
#' Gmrf_duplicate <- rbind(Gmrf_Group1Pathway1, Gmrf_Group2Pathway2, Gmrf_CommonGene)
#' Gmrf <- Gmrf_duplicate[!duplicated(Gmrf_duplicate), ]
#' exampleGDSC$mrfG <- Gmrf
#'
#' # create the target gene names of the two groups of drugs
#' targetGenes1 <- matrix(Idx_Pathway1, nrow = 1)
#' colnames(targetGenes1) <- colnames(exampleGDSC$data)[seq_along(targetGene$group1)]
#' targetGenes2 <- matrix(Idx_Pathway2, nrow = 1)
#' colnames(targetGenes2) <- colnames(exampleGDSC$data)[seq_along(targetGene$group2)]
#'
#' targetGene <- list(group1 = targetGenes1, group2 = targetGenes2)
#'
#' ## Write data file exampleGDSC.rda to the user's directory by save()
#' }
#'
"exampleGDSC"

Try the BayesSUR package in your browser

Any scripts or data that you put into this service are public.

BayesSUR documentation built on June 30, 2024, 9:06 a.m.