ThreeWayClassifier.R
In netDx: Network-based patient classifier

## ----eval=FALSE---------------------------------------------------------------
#  suppressWarnings(suppressMessages(require(netDx)))
#  suppressWarnings(suppressMessages(library(curatedTCGAData)))
#  
#  # fetch RNA, methylation and proteomic data for TCGA BRCA set
#  brca <- suppressMessages(
#     curatedTCGAData("BRCA",
#                 c("mRNAArray","RPPA*","Methylation_methyl27*"),
#  	dry.run=FALSE))
#  
#  # process input variables
#  # prepare clinical variable - stage
#  staget <- sub("[abcd]","",sub("t","",colData(brca)$pathology_T_stage))
#  staget <- suppressWarnings(as.integer(staget))
#  colData(brca)$STAGE <- staget
#  # exclude normal, HER2 (small num samples)
#  pam50 <- colData(brca)$PAM50.mRNA
#  idx <- union(which(pam50 %in% c("Normal-like","HER2-enriched")),
#  	which(is.na(staget)))
#  idx <- union(idx, which(is.na(pam50)))
#  pID <- colData(brca)$patientID
#  tokeep <- setdiff(pID, pID[idx])
#  brca <- brca[,tokeep,]
#  pam50 <- colData(brca)$PAM50.mRNA
#  colData(brca)$pam_mod <- pam50
#  
#  # remove duplicate names
#  smp <- sampleMap(brca)
#  for (nm in names(brca)) {
#  	samps <- smp[which(smp$assay==nm),]
#  	notdup <- samps[which(!duplicated(samps$primary)),"colname"]
#  	brca[[nm]] <- suppressMessages(brca[[nm]][,notdup])
#  }
#  
#  # colData must have ID and STATUS columns
#  pID <- colData(brca)$patientID
#  colData(brca)$ID <- pID
#  colData(brca)$STATUS <- gsub(" ","_",colData(brca)$pam_mod)
#  
#  # create grouping rules
#  groupList <- list()
#  # genes in mRNA data are grouped by pathways
#  pathList <- readPathways(fetchPathwayDefinitions("January",2018))
#  groupList[["BRCA_mRNAArray-20160128"]] <- pathList[1:3]
#  # clinical data is not grouped; each variable is its own feature
#  groupList[["clinical"]] <- list(
#        age="patient.age_at_initial_pathologic_diagnosis",
#  	   stage="STAGE"
#  )
#  # for methylation generate one feature containing all probes
#  # same for proteomics data
#  tmp <- list(rownames(experiments(brca)[[2]]));
#  names(tmp) <- names(brca)[2]
#  groupList[[names(brca)[2]]] <- tmp
#  
#  tmp <- list(rownames(experiments(brca)[[3]]));
#  names(tmp) <- names(brca)[3]
#  groupList[[names(brca)[3]]] <- tmp
#  
#  # create function to tell netDx how to build features
#  # (PSN) from your data
#  makeNets <- function(dataList, groupList, netDir,...) {
#  	netList <- c() # initialize before is.null() check
#  	# correlation-based similarity for mRNA, RPPA and methylation data
#  	# (Pearson correlation)
#  	for (nm in setdiff(names(groupList),"clinical")) {
#  	   # NOTE: the check for is.null() is important!
#  		if (!is.null(groupList[[nm]])) {
#  		netList <- makePSN_NamedMatrix(dataList[[nm]],
#  		             rownames(dataList[[nm]]),
#                     groupList[[nm]],netDir,verbose=FALSE,
#  		             writeProfiles=TRUE,...)
#  		}
#  	}
#  	
#  	# make clinical nets (normalized difference)
#  	netList2 <- c()
#  	if (!is.null(groupList[["clinical"]])) {
#  	netList2 <- makePSN_NamedMatrix(dataList$clinical,
#  		rownames(dataList$clinical),
#  		groupList[["clinical"]],netDir,
#  		simMetric="custom",customFunc=normDiff, # custom function
#  		writeProfiles=FALSE,
#  		sparsify=TRUE,verbose=TRUE,...)
#  	}
#  	netList <- c(unlist(netList),unlist(netList2))
#  	return(netList)
#  }
#  
#  # run predictor
#  set.seed(42) # make results reproducible
#  outDir <- paste(tempdir(),randAlphanumString(),"pred_output",sep=getFileSep())
#  # To see all messages, remove suppressMessages()
#  # and set logging="default".
#  # To keep all intermediate data, set keepAllData=TRUE
#  numSplits <- 2L
#  out <- suppressMessages(
#     buildPredictor(dataList=brca,groupList=groupList,
#        makeNetFunc=makeNets,
#        outDir=outDir, ## netDx requires absolute path
#        numSplits=numSplits, featScoreMax=2L, featSelCutoff=1L,
#  	   numCores=1L)
#  )
#  
#  # collect results for accuracy
#  st <- unique(colData(brca)$STATUS)
#  acc <- matrix(NA,ncol=length(st),nrow=numSplits)  # accuracy by class
#  colnames(acc) <- st
#  for (k in 1:numSplits) {
#  	pred <- out[[sprintf("Split%i",k)]][["predictions"]];
#  	tmp <- pred[,c("ID","STATUS","TT_STATUS","PRED_CLASS",
#  	                 sprintf("%s_SCORE",st))]
#  	for (m in 1:length(st)) {
#  	   tmp2 <- subset(tmp, STATUS==st[m])
#  	   acc[k,m] <- sum(tmp2$PRED==tmp2$STATUS)/nrow(tmp2)
#  	}
#  }
#  
#  # accuracy by class
#  print(round(acc*100,2))
#  
#  # confusion matrix
#  res <- out$Split1$predictions
#  print(table(res[,c("STATUS","PRED_CLASS")]))
#  
#  sessionInfo()

## ----eval=TRUE----------------------------------------------------------------
suppressWarnings(suppressMessages(require(netDx)))

## ----eval=TRUE----------------------------------------------------------------
suppressMessages(library(curatedTCGAData))

## ----eval=TRUE----------------------------------------------------------------
curatedTCGAData(diseaseCode="BRCA", assays="*",dry.run=TRUE)

## ----eval=TRUE----------------------------------------------------------------
brca <- suppressMessages(
   curatedTCGAData("BRCA",
               c("mRNAArray","RPPA*","Methylation_methyl27*"),
	dry.run=FALSE))

## ----eval=TRUE----------------------------------------------------------------
# prepare clinical variable - stage
staget <- sub("[abcd]","",sub("t","",colData(brca)$pathology_T_stage))
staget <- suppressWarnings(as.integer(staget))
colData(brca)$STAGE <- staget

# exclude normal, HER2 (small num samples)
pam50 <- colData(brca)$PAM50.mRNA
idx <- union(which(pam50 %in% c("Normal-like","HER2-enriched")), 
	which(is.na(staget)))
idx <- union(idx, which(is.na(pam50)))
pID <- colData(brca)$patientID
tokeep <- setdiff(pID, pID[idx])
brca <- brca[,tokeep,]

pam50 <- colData(brca)$PAM50.mRNA
colData(brca)$pam_mod <- pam50

# remove duplicate names
smp <- sampleMap(brca)
for (nm in names(brca)) {
	samps <- smp[which(smp$assay==nm),]
	notdup <- samps[which(!duplicated(samps$primary)),"colname"]
	brca[[nm]] <- suppressMessages(brca[[nm]][,notdup])
}

## ----eval=TRUE----------------------------------------------------------------
pID <- colData(brca)$patientID
colData(brca)$ID <- pID
colData(brca)$STATUS <- gsub(" ","_",colData(brca)$pam_mod)

## ----eval=TRUE----------------------------------------------------------------
groupList <- list()

# genes in mRNA data are grouped by pathways
pathList <- readPathways(fetchPathwayDefinitions("January",2018))
groupList[["BRCA_mRNAArray-20160128"]] <- pathList[1:3]
# clinical data is not grouped; each variable is its own feature
groupList[["clinical"]] <- list(
      age="patient.age_at_initial_pathologic_diagnosis",
	   stage="STAGE"
)
# for methylation generate one feature containing all probes
# same for proteomics data
tmp <- list(rownames(experiments(brca)[[2]]));
names(tmp) <- names(brca)[2]
groupList[[names(brca)[2]]] <- tmp

tmp <- list(rownames(experiments(brca)[[3]]));
names(tmp) <- names(brca)[3]
groupList[[names(brca)[3]]] <- tmp

## ----eval=FALSE---------------------------------------------------------------
#  makePSN_NamedMatrix(..., writeProfiles=TRUE,...)`

## ----eval=FALSE---------------------------------------------------------------
#     makePSN_NamedMatrix(,...,
#                         simMetric="custom", customFunc=normDiff,
#                         writeProfiles=FALSE)

## ----eval=TRUE----------------------------------------------------------------
makeNets <- function(dataList, groupList, netDir,...) {
	netList <- c() # initialize before is.null() check
	# correlation-based similarity for mRNA, RPPA and methylation data
	# (Pearson correlation)
	for (nm in setdiff(names(groupList),"clinical")) {
	   # NOTE: the check for is.null() is important!
		if (!is.null(groupList[[nm]])) {
		netList <- makePSN_NamedMatrix(dataList[[nm]],
		             rownames(dataList[[nm]]),
                   groupList[[nm]],netDir,verbose=FALSE,
		             writeProfiles=TRUE,...) 
		}
	}
	
	# make clinical nets (normalized difference)
	netList2 <- c()
	if (!is.null(groupList[["clinical"]])) {
	netList2 <- makePSN_NamedMatrix(dataList$clinical, 
		rownames(dataList$clinical),
		groupList[["clinical"]],netDir,
		simMetric="custom",customFunc=normDiff, # custom function
		writeProfiles=FALSE,
		sparsify=TRUE,verbose=TRUE,...)
	}
	netList <- c(unlist(netList),unlist(netList2))
	return(netList)
}


## ----eval=TRUE----------------------------------------------------------------
set.seed(42) # make results reproducible

# location for intermediate work
# set keepAllData to TRUE to not delete at the end of the 
# predictor run.
# This can be useful for debugging.
outDir <- paste(tempdir(),"pred_output",sep=getFileSep()) # use absolute path
numSplits <- 2L
out <- suppressMessages(
   buildPredictor(dataList=brca,groupList=groupList,
      makeNetFunc=makeNets,
      outDir=outDir, ## netDx requires absolute path
      numSplits=numSplits, featScoreMax=2L, featSelCutoff=1L,
	   numCores=1L)
)

## ----eval=TRUE----------------------------------------------------------------
summary(out)
summary(out$Split1)

## ----eval=TRUE----------------------------------------------------------------
# Average accuracy
st <- unique(colData(brca)$STATUS) 
acc <- matrix(NA,ncol=length(st),nrow=numSplits) 
colnames(acc) <- st 
for (k in 1:numSplits) { 
	pred <- out[[sprintf("Split%i",k)]][["predictions"]];
	tmp <- pred[,c("ID","STATUS","TT_STATUS","PRED_CLASS",
	                 sprintf("%s_SCORE",st))]
	for (m in 1:length(st)) {
	   tmp2 <- subset(tmp, STATUS==st[m])
	   acc[k,m] <- sum(tmp2$PRED==tmp2$STATUS)/nrow(tmp2)
	}
}
print(round(acc*100,2))

## ---- eval=TRUE---------------------------------------------------------------
res <- out$Split1$predictions
print(table(res[,c("STATUS","PRED_CLASS")]))

## ----eval=TRUE----------------------------------------------------------------
sessionInfo()