library(rcellminer)
library(stringr)
#--------------------------------------------------------------------------------------------------
# HELPER FUNCTIONS
#--------------------------------------------------------------------------------------------------
# Validate a CellMiner data table and index its rows by a key column.
#
# Arguments:
#   dataTab            Data frame whose leading columns hold feature annotation
#                      and whose remaining columns hold numeric data.
#   keyCol             Name of the column whose values become the row names.
#   featureDataColNums Column numbers of the feature annotation columns; every
#                      other column must be numeric.
#
# Returns: dataTab with whitespace trimmed from character feature columns and
# row names set to the key column values.
#
# Stops with an error if the key column is absent, if any non-feature column is
# not numeric, or if the key values contain NAs, empty strings, duplicates, or
# Excel date-mangled identifiers.
validateDataTab <- function(dataTab, keyCol = "Gene name", featureDataColNums = 1:9){
  stopifnot(keyCol %in% colnames(dataTab))

  # Remove any whitespace in feature data columns.
  for (j in featureDataColNums){
    if (is.character(dataTab[, j])){
      dataTab[, j] <- stringr::str_trim(dataTab[, j])
    }
  }

  # Make sure expected numeric data is numeric (symbols to be read in as NAs are
  # properly handled). drop = FALSE keeps a per-column check even when a single
  # data column remains.
  dataCols <- dataTab[, -featureDataColNums, drop = FALSE]
  stopifnot(all(vapply(dataCols, is.numeric, logical(1))))

  # Key values are checked after trimming, then used as row names.
  keyValues <- dataTab[, keyCol]
  stopifnot(all(!is.na(keyValues)))
  stopifnot(all(keyValues != ""))

  # No Excel conversion to dates. Excel turns identifiers such as MARCH1 or
  # SEPT2 into "1-Mar" / "2-Sep", so reject any "<day>-<month abbreviation>"
  # key rather than only the single literal "1-Mar".
  excelDatePattern <- paste0("^[0-9]{1,2}-(", paste(month.abb, collapse = "|"), ")$")
  stopifnot(!any(grepl(excelDatePattern, as.character(keyValues), ignore.case = TRUE)))

  stopifnot(all(!duplicated(keyValues)))
  rownames(dataTab) <- keyValues
  return(dataTab)
}
#--------------------------------------------------------------------------------------------------
# LOAD DATA: MRNA EXPRESSION.
#--------------------------------------------------------------------------------------------------
# http://discovery.nci.nih.gov/cellminerint/loadDownload.do
# Select: [RNA: 5 Platform Gene Transcript, select: Average z score]
# Processing of data file (RNA__5_Platform_Gene_Transcript_Average_z_scores.xls):
# --- Save as text file
# --- Delete first 10 rows to get to table.
# --- Clean up column names (delete superscripts for footnotes)
#----[z score data]------------------------------------------------------------
# Source table: 5-platform gene transcript z scores ("-" entries are read as NA).
filePath <- "inst/extdata/cellminer_2_0/RNA__5_Platform_Gene_Transcript_Average_z_scores.txt"
# The first nine columns are feature annotation; the remaining ones are data.
featureDataCols <- seq_len(9)
# Read the tab-delimited file and validate it; validateDataTab sets the row
# names from the "Gene name" column.
expTabOrig <- validateDataTab(
  read.table(
    file = filePath, sep = "\t", header = TRUE, quote = "", comment.char = "",
    na.strings = "-", check.names = FALSE, stringsAsFactors = FALSE
  ),
  keyCol = "Gene name",
  featureDataColNums = featureDataCols
)
# Data columns go into the ExpressionSet; annotation columns become feature data.
expData <- ExpressionSet(as.matrix(expTabOrig[, -featureDataCols]))
featureData(expData) <- new("AnnotatedDataFrame", data = expTabOrig[, featureDataCols])
###################################################################################################
# Set CellMiner NCI-60 Cell Line Names
# Reference cell line names: the column names of the z-score expression matrix.
# They must carry no leading or trailing whitespace.
cmNci60Names <- colnames(exprs(expData))
stopifnot(identical(cmNci60Names, stringr::str_trim(cmNci60Names)))

# One-off construction of the CellMiner 1.6 / 2.0 cell line name match table
# (kept for reference; the saved table is loaded below).
# cmNci60Names_1_6 <- colnames(rcellminer::getAllFeatureData(rcellminerData::molData)[["exp"]])
# CellMinerNci60LineTab <- data.frame(CellMiner_1_6 = cmNci60Names_1_6,
#                                     CellMiner_2_0 = cmNci60Names,
#                                     stringsAsFactors = FALSE)
# save(CellMinerNci60LineTab, file = "inst/extdata/CellMinerNci60LineTab.Rdata")

# Load the saved name table and confirm its 2.0 names match the data.
load(file = "inst/extdata/CellMinerNci60LineTab.Rdata")
stopifnot(identical(cmNci60Names, CellMinerNci60LineTab$CellMiner_2_0))
###################################################################################################
#----[average log2 intensity data]---------------------------------------------
# http://discovery.nci.nih.gov/cellminerint/loadDownload.do
# Select: [RNA: 5 Platform Gene Transcript, select: Averaged intensities]
# Processing of data file (RNA__5_Platform_Gene_Transcript_Averaged_intensities.xls):
# --- Save as text file
# --- Delete first 10 rows to get to table.
# --- Clean up column names (delete superscripts for footnotes)
# Source table: 5-platform averaged intensities ("-" entries are read as NA).
filePath <- "inst/extdata/cellminer_2_0/RNA__5_Platform_Gene_Transcript_Averaged_intensities.txt"
# The first nine columns are feature annotation; the remaining ones are data.
featureDataCols <- seq_len(9)
# Read and validate; rows are keyed by "Gene name".
xaiTabOrig <- validateDataTab(
  read.table(
    file = filePath, sep = "\t", header = TRUE, quote = "", comment.char = "",
    na.strings = "-", check.names = FALSE, stringsAsFactors = FALSE
  ),
  keyCol = "Gene name",
  featureDataColNums = featureDataCols
)
xaiData <- ExpressionSet(as.matrix(xaiTabOrig[, -featureDataCols]))
featureData(xaiData) <- new("AnnotatedDataFrame", data = xaiTabOrig[, featureDataCols])

# The data columns must be exactly the reference NCI-60 cell line names.
stopifnot(identical(colnames(exprs(xaiData)), cmNci60Names))
#--------------------------------------------------------------------------------------------------
# LOAD DATA: GENE COPY.
#--------------------------------------------------------------------------------------------------
# http://discovery.nci.nih.gov/cellminerint/loadDownload.do
# Select: [DNA: Combined aCGH, select: Gene summary]
# Processing of data file (DNA__Combined_aCGH_Gene_summary.xls):
# --- Save as text file
# --- Delete first 10 rows to get to table.
# --- Clean up column names (delete superscripts for footnotes)
# Source table: combined aCGH gene summary ("-" entries are read as NA).
filePath <- "inst/extdata/cellminer_2_0/DNA__Combined_aCGH_Gene_summary.txt"
# The first nine columns are feature annotation; the remaining ones are data.
featureDataCols <- seq_len(9)
# Read and validate; rows are keyed by "Probe id".
copTabOrig <- validateDataTab(
  read.table(
    file = filePath, sep = "\t", header = TRUE, quote = "", comment.char = "",
    na.strings = "-", check.names = FALSE, stringsAsFactors = FALSE
  ),
  keyCol = "Probe id",
  featureDataColNums = featureDataCols
)
copData <- ExpressionSet(as.matrix(copTabOrig[, -featureDataCols]))
featureData(copData) <- new("AnnotatedDataFrame", data = copTabOrig[, featureDataCols])

# The data columns must be exactly the reference NCI-60 cell line names.
stopifnot(identical(colnames(exprs(copData)), cmNci60Names))
#--------------------------------------------------------------------------------------------------
# LOAD DATA: GENE METHYLATION
#--------------------------------------------------------------------------------------------------
# http://discovery.nci.nih.gov/cellminerint/loadDownload.do
# Select: [DNA: Illumina 450K methylation, select: Gene average]
# Processing of data file (DNA__Illumina_450K_methylation_Gene_average.xls):
# --- Save as text file
# --- Delete first 10 rows to get to table.
# --- Clean up column names (delete superscripts for footnotes)
# Source table: Illumina 450K methylation gene average ("-" entries are read as NA).
filePath <- "inst/extdata/cellminer_2_0/DNA__Illumina_450K_methylation_Gene_average.txt"
# The first nine columns are feature annotation; the remaining ones are data.
featureDataCols <- seq_len(9)
# Read and validate; rows are keyed by "Probe id".
metTabOrig <- validateDataTab(
  read.table(
    file = filePath, sep = "\t", header = TRUE, quote = "", comment.char = "",
    na.strings = "-", check.names = FALSE, stringsAsFactors = FALSE
  ),
  keyCol = "Probe id",
  featureDataColNums = featureDataCols
)
metData <- ExpressionSet(as.matrix(metTabOrig[, -featureDataCols]))
featureData(metData) <- new("AnnotatedDataFrame", data = metTabOrig[, featureDataCols])

# The data columns must be exactly the reference NCI-60 cell line names.
stopifnot(identical(colnames(exprs(metData)), cmNci60Names))
#--------------------------------------------------------------------------------------------------
# LOAD DATA: EXOME/MUTATION.
#--------------------------------------------------------------------------------------------------
#----[gene level function altering mutations]-------------------------------------------------
# http://discovery.nci.nih.gov/cellminerint/loadDownload.do
# Select: [DNA: Exome Seq, select: protein function affecting]
# Processing of data file (DNA__Exome_Seq_Protein_function_affecting.xls):
# --- Save as text file
# --- Delete first 10 rows to get to table.
# --- Clean up column names (delete superscripts for footnotes)
# Source table: exome sequencing, protein function affecting ("-" is read as NA).
filePath <- "inst/extdata/cellminer_2_0/DNA__Exome_Seq_Protein_function_affecting.txt"
# The first nine columns are feature annotation; the remaining ones are data.
featureDataCols <- seq_len(9)
# Read and validate; rows are keyed by "Gene name".
mutTabOrig <- validateDataTab(
  read.table(
    file = filePath, sep = "\t", header = TRUE, quote = "", comment.char = "",
    na.strings = "-", check.names = FALSE, stringsAsFactors = FALSE
  ),
  keyCol = "Gene name",
  featureDataColNums = featureDataCols
)

# NAs indicate that not enough reads were available to determine variant allele
# percent conversion (see CellMiner spreadsheet footnotes); they are replaced
# by zeros, one cell line column at a time.
for (cLine in colnames(mutTabOrig[, -featureDataCols])) {
  naIndexSet <- which(is.na(mutTabOrig[[cLine]]))
  mutTabOrig[naIndexSet, cLine] <- 0
}

mutData <- ExpressionSet(as.matrix(mutTabOrig[, -featureDataCols]))
featureData(mutData) <- new("AnnotatedDataFrame", data = mutTabOrig[, featureDataCols])

# The data columns must be exactly the reference NCI-60 cell line names.
stopifnot(identical(colnames(exprs(mutData)), cmNci60Names))
#----[variant level exome sequencing data]--------------------------------------------------
# Source table: variant level exome sequencing data ("-" entries are read as NA).
filePath <- "inst/extdata/cellminer_2_0/DNA__Exome_Seq_none.txt"
# This table has eighteen leading feature annotation columns.
featureDataCols <- seq_len(18)
# Read and validate; rows are keyed by "Probe id".
exoTabOrig <- validateDataTab(
  read.table(
    file = filePath, sep = "\t", header = TRUE, quote = "", comment.char = "",
    na.strings = "-", check.names = FALSE, stringsAsFactors = FALSE
  ),
  keyCol = "Probe id",
  featureDataColNums = featureDataCols
)
exoData <- ExpressionSet(as.matrix(exoTabOrig[, -featureDataCols]))
featureData(exoData) <- new("AnnotatedDataFrame", data = exoTabOrig[, featureDataCols])

# The data columns must be exactly the reference NCI-60 cell line names.
stopifnot(identical(colnames(exprs(exoData)), cmNci60Names))
#--------------------------------------------------------------------------------------------------
# LOAD DATA: PROTEIN EXPRESSION (RPLA)
#--------------------------------------------------------------------------------------------------
# http://discovery.nci.nih.gov/cellminerint/loadDownload.do
# Select: [Protein: Lysate Array, select: log2].
# Source table: protein lysate array, log2 ("-" entries are read as NA).
filePath <- "inst/extdata/cellminer_2_0/Protein__Lysate_Array_log2.txt"
# The first nine columns are feature annotation; the remaining ones are data.
featureDataCols <- seq_len(9)
# Read and validate; rows are keyed by "Probe id".
proTabOrig <- validateDataTab(
  read.table(
    file = filePath, sep = "\t", header = TRUE, quote = "", comment.char = "",
    na.strings = "-", check.names = FALSE, stringsAsFactors = FALSE
  ),
  keyCol = "Probe id",
  featureDataColNums = featureDataCols
)
proData <- ExpressionSet(as.matrix(proTabOrig[, -featureDataCols]))
featureData(proData) <- new("AnnotatedDataFrame", data = proTabOrig[, featureDataCols])

# The data columns must be exactly the reference NCI-60 cell line names.
stopifnot(identical(colnames(exprs(proData)), cmNci60Names))
#--------------------------------------------------------------------------------------------------
# LOAD DATA: PROTEIN EXPRESSION (SWATH-MS)
#--------------------------------------------------------------------------------------------------
# filePath <- "inst/extdata/cellminer_2_0/"
# swaTabOrig <- read.table(file=filePath, header=TRUE, sep="\t", stringsAsFactors=FALSE,
# check.names = FALSE, comment.char="", quote="", na.strings="-")
# SWATH-MS data and annotation are taken from the nci60imsb molData object
# rather than from a CellMiner download file.
swaAnnot <- rcellminer::getFeatureAnnot(nci60imsb::molData)[["swathms_avg"]]
swaExpMat <- rcellminer::getAllFeatureData(nci60imsb::molData)[["swathms_avg"]]

# The matrix columns must use the CellMiner 1.6 names before they are renamed
# to the corresponding CellMiner 2.0 names.
stopifnot(identical(colnames(swaExpMat), CellMinerNci60LineTab$CellMiner_1_6))
colnames(swaExpMat) <- CellMinerNci60LineTab[["CellMiner_2_0"]]

# Annotation rows and matrix rows must line up before they are combined.
stopifnot(identical(rownames(swaAnnot), rownames(swaExpMat)))
swaData <- ExpressionSet(assayData = swaExpMat)
featureData(swaData) <- new("AnnotatedDataFrame", data = swaAnnot)

# The data columns must be exactly the reference NCI-60 cell line names.
stopifnot(identical(colnames(exprs(swaData)), cmNci60Names))
#--------------------------------------------------------------------------------------------------
# LOAD DATA: MICRORNA EXPRESSION.
#--------------------------------------------------------------------------------------------------
# http://discovery.nci.nih.gov/cellminerint/loadDownload.do
# Select: [RNA: Agilent Human microRNA (V2)].
# Source table: Agilent human microRNA (V2) ("-" entries are read as NA).
filePath <- "inst/extdata/cellminer_2_0/RNA__Agilent_Human_microRNA_(V2)_GeneSpringGX.txt"
# This table has eleven leading feature annotation columns.
featureDataCols <- seq_len(11)
# Read and validate; rows are keyed by "Probe id".
mirTabOrig <- validateDataTab(
  read.table(
    file = filePath, sep = "\t", header = TRUE, quote = "", comment.char = "",
    na.strings = "-", check.names = FALSE, stringsAsFactors = FALSE
  ),
  keyCol = "Probe id",
  featureDataColNums = featureDataCols
)
mirData <- ExpressionSet(as.matrix(mirTabOrig[, -featureDataCols]))
featureData(mirData) <- new("AnnotatedDataFrame", data = mirTabOrig[, featureDataCols])

# The data columns must be exactly the reference NCI-60 cell line names.
stopifnot(identical(colnames(exprs(mirData)), cmNci60Names))
#--------------------------------------------------------------------------------------------------
# LOAD DATA: CELL LINE METADATA.
#--------------------------------------------------------------------------------------------------
# Cell line metadata table; empty strings, "NA" and "?" are all read as NA.
filePath <- "inst/extdata/cellminer_2_0/CELLMINER_CELL_LINE_METADATA.txt"
mdaTabOrig <- read.table(
  file = filePath, sep = "\t", header = TRUE, quote = "", comment.char = "",
  na.strings = c("", "NA", "?"), check.names = FALSE, stringsAsFactors = FALSE
)

# If the file still uses the CellMiner 1.6 cell line names, switch to the 2.0
# names; the (possibly renamed) names then become the row names.
if (identical(mdaTabOrig$`Cell Line Name`, CellMinerNci60LineTab$CellMiner_1_6)) {
  mdaTabOrig$`Cell Line Name` <- CellMinerNci60LineTab$CellMiner_2_0
}
rownames(mdaTabOrig) <- mdaTabOrig$`Cell Line Name`

# Split the table: quantitative features go into an ExpressionSet, everything
# else is kept as per-sample information.
quantFeatures <- c("age", "Epithelial", "p53", "mdr", "doubling time")
mdaTabSampleInfo <- mdaTabOrig[, setdiff(colnames(mdaTabOrig), quantFeatures)]
mdaQuantTab <- mdaTabOrig[, quantFeatures]
colnames(mdaQuantTab) <- c("AGE", "IS_EPITHELIAL", "IS_P53_MUT", "MDR", "DOUBLING_TIME")

# Convert every quantitative feature to a numeric type.
mdaQuantTab[["AGE"]] <- as.integer(mdaQuantTab[["AGE"]])

# "yes"/"no" are recoded as 1/0 before the integer conversion.
mdaQuantTab[["IS_EPITHELIAL"]][str_trim(mdaQuantTab[["IS_EPITHELIAL"]]) == "yes"] <- 1
mdaQuantTab[["IS_EPITHELIAL"]][str_trim(mdaQuantTab[["IS_EPITHELIAL"]]) == "no"] <- 0
mdaQuantTab[["IS_EPITHELIAL"]] <- as.integer(mdaQuantTab[["IS_EPITHELIAL"]])

# "MT"/"WT" are recoded as 1/0 before the integer conversion.
mdaQuantTab[["IS_P53_MUT"]][str_trim(mdaQuantTab[["IS_P53_MUT"]]) == "MT"] <- 1
mdaQuantTab[["IS_P53_MUT"]][str_trim(mdaQuantTab[["IS_P53_MUT"]]) == "WT"] <- 0
mdaQuantTab[["IS_P53_MUT"]] <- as.integer(mdaQuantTab[["IS_P53_MUT"]])

mdaQuantTab[["MDR"]] <- as.numeric(mdaQuantTab[["MDR"]])
mdaQuantTab[["DOUBLING_TIME"]] <- as.numeric(mdaQuantTab[["DOUBLING_TIME"]])
stopifnot(all(c(lapply(mdaQuantTab, is.numeric), recursive = TRUE)))

# Features become rows and cell lines become columns.
mdaData <- ExpressionSet(t(mdaQuantTab))
stopifnot(is.numeric(exprs(mdaData)))

# Feature annotation: one row per metadata feature, with a footnote column.
mdaAnnot <- data.frame(
  Name = rownames(exprs(mdaData)),
  Footnote = NA,
  stringsAsFactors = FALSE
)
rownames(mdaAnnot) <- mdaAnnot$Name
mdaAnnot["AGE", "Footnote"] <- "Information from Stinson, et al., (Anticancer Res. 1992 Jul-Aug;12(4):1035-53), DTP, ATCC, and other sources."
mdaAnnot["IS_EPITHELIAL", "Footnote"] <- ""
mdaAnnot["IS_P53_MUT", "Footnote"] <- "p53 status as determined by yeast growth functional assay: PM O'Conner, et al. (Cancer Res. 1997 Oct 1;57(19):4285-300)."
mdaAnnot["MDR", "Footnote"] <- "MDR Function: from DTP site (Lee JS etal., Mol Pharmacol. 1994 Oct;46(4):627-38)."
mdaAnnot["DOUBLING_TIME", "Footnote"] <- "Doubling times described at NCI/DTP site."
featureData(mdaData) <- new("AnnotatedDataFrame", data = mdaAnnot)

# The data columns must be exactly the reference NCI-60 cell line names.
stopifnot(identical(colnames(exprs(mdaData)), cmNci60Names))
#--------------------------------------------------------------------------------------------------
# LOAD DATA: DRUG ACTIVITY.
#--------------------------------------------------------------------------------------------------
# activity data -------------------------------------------------------------------------
# Drug activity z scores; empty strings, "na" and "-" are all read as NA.
filePath <- "inst/extdata/cellminer_2_0/DTP_NCI60_ZSCORE.txt"
actTabOrig <- read.table(
  file = filePath, sep = "\t", header = TRUE, quote = "", comment.char = "",
  na.strings = c("", "na", "-"), check.names = FALSE, stringsAsFactors = FALSE
)

# Move columns 67 and 68 next to the six leading annotation columns so that all
# eight feature columns come first, then validate with rows keyed by "NSC #".
featureDataCols <- seq_len(8)
actTabOrig <- validateDataTab(
  actTabOrig[, c(1:6, 67:68, 7:66)],
  keyCol = "NSC #",
  featureDataColNums = featureDataCols
)

# Drug annotation table: renamed columns, NSC as character, counts and
# PubChem identifiers as integers.
drugInfoTab <- actTabOrig[, featureDataCols]
colnames(drugInfoTab) <- c(
  "NSC", "NAME", "FDA_STATUS", "MOA",
  "PUBCHEM_ID", "SMILES", "TOTAL_EXPS", "TOTAL_EXPS_AFTER_QC"
)
drugInfoTab[["NSC"]] <- as.character(drugInfoTab[["NSC"]])
drugInfoTab[c("PUBCHEM_ID", "TOTAL_EXPS", "TOTAL_EXPS_AFTER_QC")] <-
  lapply(drugInfoTab[c("PUBCHEM_ID", "TOTAL_EXPS", "TOTAL_EXPS_AFTER_QC")], as.integer)

# Activity matrix plus annotation; their rows must line up.
actData <- ExpressionSet(as.matrix(actTabOrig[, -featureDataCols]))
stopifnot(identical(rownames(exprs(actData)), rownames(drugInfoTab)))
featureData(actData) <- new("AnnotatedDataFrame", data = drugInfoTab)

# The data columns must be exactly the reference NCI-60 cell line names.
stopifnot(identical(colnames(exprs(actData)), cmNci60Names))
# repeat activity data ------------------------------------------------------------------
# Experiments that were used to compute the activity z scores. Each one is
# identified by "<NSC #>_<Experiment name>".
filePath <- "inst/extdata/cellminer_2_0/DTP_NCI60_EXPS_USED_FOR_ZSCORE_ACT.txt"
usedInZscoreAct <- read.table(file=filePath, header=TRUE, sep="\t", stringsAsFactors=FALSE,
                              check.names = FALSE, comment.char="", quote="",
                              na.strings=c("", "na", "-"))
usedInZscoreAct$NSC_EXP_NAME <- paste0(usedInZscoreAct[, "NSC #"], "_",
                                       usedInZscoreAct[, "Experiment name"])

# Raw (per experiment) activity data, keyed the same way.
filePath <- "inst/extdata/cellminer_2_0/DTP_NCI60_RAW.txt"
rawActTabOrig <- read.table(file=filePath, header=TRUE, sep="\t", stringsAsFactors=FALSE,
                            check.names = FALSE, comment.char="", quote="",
                            na.strings=c("", "na", "-"))
rawActTabOrig[, "NSC #"] <- as.character(rawActTabOrig[, "NSC #"])
rawActTabOrig$NSC_EXP_NAME <- paste0(rawActTabOrig[, "NSC #"], "_",
                                     rawActTabOrig[, "Experiment name"])
# Placeholder so the column exists for the reordering below; the real values
# are filled in after validation.
rawActTabOrig$used_in_zscore <- FALSE
# Put the two derived columns (70, 71) first so all feature columns lead.
rawActTabOrig <- rawActTabOrig[, c(70, 71, 1:9, 10:69)]
featureDataCols <- 1:11
rawActTabOrig <- validateDataTab(rawActTabOrig, keyCol = "NSC_EXP_NAME",
                                 featureDataColNums = featureDataCols)

# Flag the experiments that contributed to the z scores. A single vectorised
# membership test replaces a row-by-row loop that rebuilt the lookup and
# modified the data frame once per row.
rawActTabOrig$used_in_zscore <-
  rawActTabOrig[, "NSC_EXP_NAME"] %in% usedInZscoreAct$NSC_EXP_NAME

# Consistency check: for every NSC, the number of flagged experiments must
# equal its (single) "Total after quality control" value.
nscColumn <- rawActTabOrig[, "NSC #"]
stopifnot(!anyNA(nscColumn))
nscSet <- unique(nscColumn)
usedByNsc <- split(rawActTabOrig$used_in_zscore, nscColumn)
qcTotalByNsc <- split(rawActTabOrig[, "Total after quality control"], nscColumn)
countsMatch <- mapply(
  function(usedFlags, qcTotals) {
    expectedNumUsedInZscore <- unique(qcTotals)
    length(expectedNumUsedInZscore) == 1 &&
      !is.na(expectedNumUsedInZscore) &&
      sum(usedFlags) == expectedNumUsedInZscore
  },
  usedByNsc, qcTotalByNsc
)
if (!all(countsMatch)) {
  stop("Number of experiments used in z-score differs from ",
       "'Total after quality control' for NSC(s): ",
       paste(names(countsMatch)[!countsMatch], collapse = ", "),
       call. = FALSE)
}

# Per-experiment annotation and data matrix; their rows must line up.
drugRepeatInfoTab <- rawActTabOrig[, c("NSC_EXP_NAME", "NSC #", "Experiment name",
                                       "used_in_zscore")]
colnames(drugRepeatInfoTab) <- c("NSC_EXP_NAME", "nsc", "experiment", "used_in_zscore")
stopifnot(is.character(drugRepeatInfoTab$nsc))
repeatActData <- ExpressionSet(as.matrix(rawActTabOrig[, -featureDataCols]))
stopifnot(identical(rownames(exprs(repeatActData)), rownames(drugRepeatInfoTab)))
featureData(repeatActData) <- new("AnnotatedDataFrame", data=drugRepeatInfoTab)
# Column (NCI-60 cell line) consistency check.
stopifnot(identical(colnames(exprs(repeatActData)), cmNci60Names))
#--------------------------------------------------------------------------------------------------
# Make NCI-60 sample info (shared by molData and drugData objects to be constructed).
#--------------------------------------------------------------------------------------------------
# Tissue information per cell line; switch to CellMiner 2.0 names if the table
# still carries the 1.6 names, then require an exact match with the data.
cellLineInfo <- loadNciColorSet(returnDf = TRUE)
if (identical(cellLineInfo$abbrCellLines, CellMinerNci60LineTab$CellMiner_1_6)) {
  cellLineInfo$abbrCellLines <- CellMinerNci60LineTab$CellMiner_2_0
}
stopifnot(identical(cellLineInfo$abbrCellLines, cmNci60Names))

# Cell line to OncoTree mapping; empty strings and "NA" are read as NA.
cellLineOncoTreeTab <- read.table(
  file = "inst/extdata/CellLineToOncoTree.txt",
  sep = "\t", header = TRUE, quote = "", comment.char = "",
  na.strings = c("", "NA"), check.names = FALSE, stringsAsFactors = FALSE
)

# update "inst/extdata/CellLineToOncoTree.txt" if necessary -----------------------------
# When the first 60 names are still the CellMiner 1.6 names, rename them to the
# 2.0 names and write the table back to the same file.
if (identical(cellLineOncoTreeTab[1:60, "Name"], CellMinerNci60LineTab$CellMiner_1_6)) {
  cellLineOncoTreeTab[1:60, "Name"] <- CellMinerNci60LineTab$CellMiner_2_0
  write.table(
    cellLineOncoTreeTab,
    file = "inst/extdata/CellLineToOncoTree.txt",
    sep = "\t", quote = FALSE, na = "NA", row.names = FALSE, col.names = TRUE
  )
}
# ---------------------------------------------------------------------------------------

# Keep the NCI-60 rows only; both annotation tables must follow the reference
# cell line order before they are combined.
cellLineOncoTreeTab <- cellLineOncoTreeTab[(cellLineOncoTreeTab$DataSource == "NCI-60"), ]
stopifnot(identical(cellLineOncoTreeTab$Name, cmNci60Names))
stopifnot(identical(mdaTabSampleInfo$`Cell Line Name`, cmNci60Names))

# Sample information shared by the MolData and DrugData objects.
nci60Miame <- new(
  "MIAME",
  name = "CellMiner 2.0",
  lab = "NCI/DTB",
  samples = list(
    Name           = cmNci60Names,
    TissueType     = cellLineInfo$tissues,
    OncoTree1      = cellLineOncoTreeTab$OncoTree1,
    OncoTree2      = cellLineOncoTreeTab$OncoTree2,
    OncoTree3      = cellLineOncoTreeTab$OncoTree3,
    OncoTree4      = cellLineOncoTreeTab$OncoTree4,
    Gender         = mdaTabSampleInfo[["sex"]],
    PriorTreatment = mdaTabSampleInfo[["prior treatment"]],
    Histology      = mdaTabSampleInfo[["histology"]],
    Source         = mdaTabSampleInfo[["source"]],
    Ploidy         = mdaTabSampleInfo[["ploidy"]],
    Institute      = mdaTabSampleInfo[["Institute"]],
    Contributor    = mdaTabSampleInfo[["Contributor"]],
    Reference      = mdaTabSampleInfo[["Reference"]]
  )
)
#--------------------------------------------------------------------------------------------------
# Make NCI-60 MolData object.
#--------------------------------------------------------------------------------------------------
# Collect the per-data-type ExpressionSets under their short type codes, wrap
# them with the shared sample information, and save the result.
nci60ESetList <- list(
  exp = expData,
  xai = xaiData,
  cop = copData,
  met = metData,
  mir = mirData,
  mut = mutData,
  exo = exoData,
  pro = proData,
  swa = swaData,
  mda = mdaData
)
molData <- new("MolData", eSetList = nci60ESetList, sampleData = nci60Miame)
save(molData, file = "data/molData.RData")
#--------------------------------------------------------------------------------------------------
# Make NCI-60 DrugData object.
#--------------------------------------------------------------------------------------------------
# Combine the activity and repeat-activity ExpressionSets with the shared
# sample information, then save the result.
drugData <- new(
  "DrugData",
  act = actData,
  repeatAct = repeatActData,
  sampleData = nci60Miame
)
save(drugData, file = "data/drugData.RData")
#--------------------------------------------------------------------------------------------------
# UPDATE DRUG DATA TO EXCLUDE ERRONEOUSLY ADDED COMPOUNDS
#--------------------------------------------------------------------------------------------------
library(rcellminer)
library(stringr)

# NSC identifiers to exclude (first column of a header-less file), trimmed.
excludedNscSet <- stringr::str_trim(
  as.character(read.table("~/TMP/excludedNscSet.txt", header = FALSE)[, 1])
)

# load original data ----------------------------------------------------------
# Everything is taken from the drugData object shipped in rcellminerData.
nci60Miame <- rcellminerData::drugData@sampleData

actDataOrigMat <- exprs(getAct(rcellminerData::drugData))
actDataOrigAnnot <- rcellminer::getFeatureAnnot(rcellminerData::drugData)[["drug"]]
stopifnot(identical(rownames(actDataOrigMat), rownames(actDataOrigAnnot)))

repActDataOrigMat <- exprs(getRepeatAct(rcellminerData::drugData))
repActDataOrigAnnot <- rcellminer::getFeatureAnnot(rcellminerData::drugData)[["drugRepeat"]]
stopifnot(identical(rownames(repActDataOrigMat), rownames(repActDataOrigAnnot)))
# -----------------------------------------------------------------------------

# determine proper compound set -----------------------------------------------
# length(excludedNscSet)
# dim(actDataOrigMat)
# length(intersect(excludedNscSet, rownames(actDataOrigMat)))
# Retained compounds: every activity row whose NSC is not in the excluded set.
properNscSet <- setdiff(rownames(actDataOrigMat), excludedNscSet)
# -----------------------------------------------------------------------------

# make actData ----------------------------------------------------------------
# Subset matrix and annotation by the same NSC set so their rows stay aligned.
actDataMat <- actDataOrigMat[properNscSet, ]
actDataAnnot <- actDataOrigAnnot[properNscSet, ]
stopifnot(identical(rownames(actDataMat), rownames(actDataAnnot)))
####################################
actData <- ExpressionSet(assayData = actDataMat)
featureData(actData) <- new("AnnotatedDataFrame", data = actDataAnnot)
# -----------------------------------------------------------------------------

# make repeatActData ----------------------------------------------------------
# Keep the repeat experiments whose NSC belongs to the retained compound set.
stopifnot(is.character(repActDataOrigAnnot$nsc))
properNscIndexSet <- which(repActDataOrigAnnot$nsc %in% properNscSet)
repActDataMat <- repActDataOrigMat[properNscIndexSet, ]
repActDataAnnot <- repActDataOrigAnnot[properNscIndexSet, ]
stopifnot(identical(rownames(repActDataMat), rownames(repActDataAnnot)))
stopifnot(all(repActDataAnnot$nsc %in% rownames(exprs(actData))))
#############################################
repeatActData <- ExpressionSet(assayData = repActDataMat)
featureData(repeatActData) <- new("AnnotatedDataFrame", data = repActDataAnnot)
#------------------------------------------------------------------------------

# Rebuild and save the DrugData object with the reduced compound set.
drugData <- new(
  "DrugData",
  act = actData,
  repeatAct = repeatActData,
  sampleData = nci60Miame
)
save(drugData, file = "data/drugData.RData")
#--------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------
# UPDATE DRUG DATA TO EXCLUDE ERRONEOUSLY ADDED COMPOUNDS (2)
#--------------------------------------------------------------------------------------------------
library(rcellminer)
library(stringr)

# load and check updated NSC set ----------------------------------------------
# Corrected NSC set (first column of a file with a header row), trimmed; it
# must contain no duplicates.
properNscSet <- stringr::str_trim(
  as.character(
    read.table(
      "inst/extdata/cellminer_2_0/DTP_NCI60_CORRECTED_NSC_SET.txt",
      header = TRUE
    )[, 1]
  )
)
stopifnot(all(!duplicated(properNscSet)))

# The excluded NSC set (header-less file) must not overlap the corrected set.
excludedNscSet <- stringr::str_trim(
  as.character(read.table("~/TMP/excludedNscSet.txt", header = FALSE)[, 1])
)
stopifnot(length(intersect(properNscSet, excludedNscSet)) == 0)

# load original data ----------------------------------------------------------
# Everything is taken from the drugData object shipped in rcellminerData.
nci60Miame <- rcellminerData::drugData@sampleData

actDataOrigMat <- exprs(getAct(rcellminerData::drugData))
actDataOrigAnnot <- rcellminer::getFeatureAnnot(rcellminerData::drugData)[["drug"]]
stopifnot(identical(rownames(actDataOrigMat), rownames(actDataOrigAnnot)))
stopifnot(all(properNscSet %in% rownames(actDataOrigMat)))

repActDataOrigMat <- exprs(getRepeatAct(rcellminerData::drugData))
repActDataOrigAnnot <- rcellminer::getFeatureAnnot(rcellminerData::drugData)[["drugRepeat"]]
stopifnot(identical(rownames(repActDataOrigMat), rownames(repActDataOrigAnnot)))
# -----------------------------------------------------------------------------

# make actData ----------------------------------------------------------------
# Subset matrix and annotation by the same NSC set so their rows stay aligned.
actDataMat <- actDataOrigMat[properNscSet, ]
actDataAnnot <- actDataOrigAnnot[properNscSet, ]
stopifnot(identical(rownames(actDataMat), rownames(actDataAnnot)))
####################################
actData <- ExpressionSet(assayData = actDataMat)
featureData(actData) <- new("AnnotatedDataFrame", data = actDataAnnot)
# -----------------------------------------------------------------------------

# make repeatActData ----------------------------------------------------------
# Keep the repeat experiments whose NSC belongs to the corrected compound set.
stopifnot(is.character(repActDataOrigAnnot$nsc))
properNscIndexSet <- which(repActDataOrigAnnot$nsc %in% properNscSet)
repActDataMat <- repActDataOrigMat[properNscIndexSet, ]
repActDataAnnot <- repActDataOrigAnnot[properNscIndexSet, ]
stopifnot(identical(rownames(repActDataMat), rownames(repActDataAnnot)))
stopifnot(all(repActDataAnnot$nsc %in% rownames(exprs(actData))))
#############################################
repeatActData <- ExpressionSet(assayData = repActDataMat)
featureData(repeatActData) <- new("AnnotatedDataFrame", data = repActDataAnnot)
#------------------------------------------------------------------------------

# Rebuild and save the DrugData object with the corrected compound set.
drugData <- new(
  "DrugData",
  act = actData,
  repeatAct = repeatActData,
  sampleData = nci60Miame
)
save(drugData, file = "data/drugData.RData")
#--------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------
# library(rcellminer)
# library(stringr)
#
# cm20PubNscSet <- as.character(read.table(
# "inst/extdata/cellminer_2_0/DTP_NCI60_CORRECTED_NSC_SET.txt", header = FALSE)[, 1])
# cm20PubNscSet <- stringr::str_trim(cm20PubNscSet)
#
# excludedNscSet <- as.character(read.table("~/TMP/excludedNscSet.txt", header = FALSE)[, 1])
# excludedNscSet <- stringr::str_trim(excludedNscSet)
#
# tmp <- intersect(cm20PubNscSet, excludedNscSet)
#
# write.table(tmp, file="~/TMP/nonpub_nsc_in_cm20.txt", quote=FALSE, sep="\t",
# row.names=FALSE, col.names=TRUE, na="NA")
#--------------------------------------------------------------------------------------------------
# UPDATE LOG2 EXPRESSION DATA (EXCLUDE CNS:SF-539 CELL LINE)
#--------------------------------------------------------------------------------------------------
# log2 expression data values for this cell line are not consistent with the z-score
# (5-platform-summary) expression data; setting values to NA to avoid misinterpretation.
library(rcellminer)

# Start from the "xai" data and annotation of the shipped molData object.
xaiDat <- rcellminer::getAllFeatureData(rcellminerData::molData)[["xai"]]
xaiAnnot <- rcellminer::getFeatureAnnot(rcellminerData::molData)[["xai"]]

# Blank out the whole CNS:SF-539 column.
xaiDat[, "CNS:SF-539"] <- NA

# Rebuild the "xai" ExpressionSet with the unchanged annotation.
xaiESet <- ExpressionSet(assayData = xaiDat)
featureData(xaiESet) <- new("AnnotatedDataFrame", data = xaiAnnot)

# Swap it into the existing ExpressionSet list; all other entries and the
# sample data are reused as they are.
rcmESets <- rcellminerData::molData@eSetList
rcmESets[["xai"]] <- xaiESet
molData <- new(
  "MolData",
  eSetList = rcmESets,
  sampleData = rcellminerData::molData@sampleData
)
save(molData, file = "data/molData.RData")
#--------------------------------------------------------------------------------------------------
# UPDATING ONCOTREE TISSUE TYPE ANNOTATIONS (FROM MATCHED CELL LINE ANNOTATIONS IN OTHER SOURCES)
#--------------------------------------------------------------------------------------------------
# Updated annotations are read from an RDS file; only the "nci60" entry is used.
updatedLineAnnot <- readRDS("~/lmpNci/rcellminerUtils/inst/crossDbAnnotChecking/updatedLineAnnot.rds")
nci60AnnotUpdate <- updatedLineAnnot[["nci60"]]

# Current sample data; it must have the same columns and cell line names as
# the update before the two are combined.
nci60Annot <- rcellminer::getSampleData(rcellminerData::molData)
stopifnot(identical(colnames(nci60AnnotUpdate), colnames(nci60Annot)))
stopifnot(identical(nci60AnnotUpdate$Name, nci60Annot$Name))

# New sample information: the four OncoTree levels come from the update, every
# other field is carried over from the current annotation.
nci60Miame <- new(
  "MIAME",
  name = "CellMiner 2.0",
  lab = "NCI/DTB",
  samples = list(
    Name           = nci60Annot$Name,
    TissueType     = nci60Annot$TissueType,
    OncoTree1      = nci60AnnotUpdate$OncoTree1,
    OncoTree2      = nci60AnnotUpdate$OncoTree2,
    OncoTree3      = nci60AnnotUpdate$OncoTree3,
    OncoTree4      = nci60AnnotUpdate$OncoTree4,
    Gender         = nci60Annot$Gender,
    PriorTreatment = nci60Annot$PriorTreatment,
    Histology      = nci60Annot$Histology,
    Source         = nci60Annot$Source,
    Ploidy         = nci60Annot$Ploidy,
    Institute      = nci60Annot$Institute,
    Contributor    = nci60Annot$Contributor,
    Reference      = nci60Annot$Reference
  )
)

# Same ExpressionSets as before, new sample data.
molData <- new(
  "MolData",
  eSetList = rcellminerData::molData@eSetList,
  sampleData = nci60Miame
)
save(molData, file = "data/molData.RData")
#--------------------------------------------------------------------------------------------------
# CORRECTING MIS-SPECIFIED DRUG NAMES
#--------------------------------------------------------------------------------------------------
library(rcellminer)
rcmDrugData <- rcellminerData::drugData # rcellminer DrugData object
# Get activity data matrix and matched drug annotation data frame.
drugAct <- exprs(rcellminer::getAct(rcmDrugData))
drugAnnot <- rcellminer::getFeatureAnnot(rcmDrugData)[["drug"]]
stopifnot(identical(rownames(drugAct), rownames(drugAnnot)))
# ----[corrections]--------------------------------------------
# Corrected names, keyed by NSC (the row names of the annotation table).
nameCorrections <- c("614826" = "Bisacodyl")
# Assigning to a row or column name that does not exist in a data frame
# silently creates it, so require both to be present before correcting.
stopifnot("NAME" %in% colnames(drugAnnot))
stopifnot(all(names(nameCorrections) %in% rownames(drugAnnot)))
drugAnnot[names(nameCorrections), "NAME"] <- unname(nameCorrections)
# The corrections must leave the row set (and order) untouched.
stopifnot(identical(rownames(drugAct), rownames(drugAnnot)))
# -------------------------------------------------------------
# New ExpressionSet: unchanged activity matrix, corrected annotation.
correctedActEset <- ExpressionSet(drugAct)
featureData(correctedActEset) <- new("AnnotatedDataFrame", data=drugAnnot)
drugData <- new("DrugData", act = correctedActEset,
                repeatAct = rcmDrugData@repeatAct, # keep existing object
                sampleData = rcmDrugData@sampleData) # keep existing object
save(drugData, file = "data/drugData.RData")
#--------------------------------------------------------------------------------------------------
# UPDATE SAMPLE DATA FOR DRUG DATA OBJECT
#--------------------------------------------------------------------------------------------------
library(rcellminer)

# OncoTree annotations were updated for the MolData object sample data (see above),
# but (mistakenly) not for the drug data object sample data.
rcmDrugData <- rcellminerData::drugData # rcellminer DrugData object
rcmMolData <- rcellminerData::molData # rcellminer MolData object

# Both objects must describe the same cell lines with the same top-level
# OncoTree annotation.
stopifnot(identical(rcellminer::getSampleData(rcmMolData)[["Name"]],
                    rcellminer::getSampleData(rcmDrugData)[["Name"]]))
stopifnot(identical(rcellminer::getSampleData(rcmMolData)[["OncoTree1"]],
                    rcellminer::getSampleData(rcmDrugData)[["OncoTree1"]]))

# This check fails, because more detailed tissue type annotations were added to the
# molData object sample data, but not the drug data object's sample data.
# stopifnot(identical(rcellminer::getSampleData(rcmMolData)[["OncoTree2"]],
#                     rcellminer::getSampleData(rcmDrugData)[["OncoTree2"]]))

# Rebuild the DrugData object: activity data are reused unchanged, sample data
# are taken from the MolData object.
drugData <- new(
  "DrugData",
  act = rcmDrugData@act,              # keep existing object
  repeatAct = rcmDrugData@repeatAct,  # keep existing object
  sampleData = rcmMolData@sampleData  # take correct molData object version
)
save(drugData, file = "data/drugData.RData")
#--------------------------------------------------------------------------------------------------
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.