# mle.r

require(MolEndoMatch)
require(huge)    # provides huge() and huge.select(), used in Section 2.1
require(igraph)  # provides graph.adjacency() and graph utilities

# 1. DATA LOAD ---------------------------------------------------------------
# 1.1 Load the raw data into R. These are raw diagnostic values that still need to be fitted to a normal distribution.
#     Rows are patients/samples, columns are features/variables.
# data.raw <- read.table("/path/to/data/matrix.txt", sep="\t", header=TRUE)
data(testData)

# 1.2 If your raw data are already fitted to a normal distribution, skip this step (1.2).
isControlSample <- c(rep(0,nrow(testData)/2), rep(1,nrow(testData)/2))
data.zscore <- data.fitDataToNormal(testData,isControlSample)

# 1.3 Convert the z-scores to p-values, tail agnostic. Make sure rows are patients/samples, and columns are features/variables.
data.pvals <- apply(data.zscore, c(1,2), function(i) 2*pnorm(abs(i), lower.tail = FALSE))
data.pvals <- t(data.pvals)
dim(data.pvals)
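# Optional sanity check on the two-tailed conversion above: a z-score of 1.96
# should map to a p-value of roughly 0.05.
2*pnorm(abs(1.96), lower.tail = FALSE)  # ~0.05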






# 2. EXPERIMENT PREP ---------------------------------------------------------
# 2.1 Learn your background knowledge graph from your z-score data, using a Gaussian graphical model estimated with the graphical LASSO.
#     Features should be columns, patients should be rows.
inv.covmatt <- huge(t(data.zscore), method="glasso")
inv.covmat.select <- huge.select(inv.covmatt, criterion="stars")
inv.covmat <- as.matrix(inv.covmat.select$opt.icov)
diag(inv.covmat) <- 0
colnames(inv.covmat) <- rownames(data.zscore)
ig <- graph.adjacency(as.matrix(inv.covmat), mode="undirected", weighted=TRUE, add.colnames='name')
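# Optional: inspect the learned background graph before moving on. A graph with
# no edges (regularization too strict) or nearly all edges (too loose) suggests
# revisiting the huge.select() criterion above.
print(sprintf("Background graph: %d nodes, %d edges", vcount(ig), ecount(ig)))
summary(degree(ig))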


# 2.2 Set your global tuning parameter variables: p0, p1, thresholdDiff, thresholdDrawT, and kmax.
p0=0.1  # The share of probability divided uniformly among the variables in the background knowledge graph learned in 2.1.
p1=0.9  # The share of probability divided preferentially (by connectivity) among the variables in the background knowledge graph learned in 2.1.
thresholdDiff=0.01  # The probability threshold below which the probability diffusion algorithm stops recursing.
thresholdDrawT=10   # The number of MISSES the permutation generator tolerates when adaptively searching for variables in the encoded subset.
kmax=5  # The largest subset size you will encode against your background knowledge graph.
igraphObjectG <- vector(mode="list", length=length(V(ig)$name))  # Global placeholder for the probabilities that are assigned to each variable in the background graph
names(igraphObjectG) <- V(ig)$name
adjacency_matrix <- list(as.matrix(get.adjacency(ig, attr="weight")))  # Global copy of the background knowledge graph's adjacency matrix.
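# Optional sanity checks (a sketch; assumes p0 and p1 are complementary shares
# of the total probability mass, per the comments in 2.2).
stopifnot(abs(p0 + p1 - 1) < 1e-12)
# Under the uniform share, each variable starts with p0/length(igraphObjectG)
# probability before the connectivity-based (p1) share is distributed.
p0/length(igraphObjectG)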

# 2.3 It is strongly advised that you save your progress up to this point in a clearly named RData file, so you can resume from Section 3 at any time.
path <- paste(getwd(), "/projectFolder", sep="")
dir.create(path, showWarnings = FALSE)  # create the project folder if it does not already exist
save.image(sprintf("%s/experimentPrep.RData", path))




# 3. GET PERMUTATIONS ---------------------------------------------------------
# If data load (Section 1) and experiment prep (Section 2) are already saved, load environment here.
# Specify a path where patient-specific data will be stored.
# setwd("/path/to/projectDir")
setwd(paste(getwd(), "/projectFolder", sep=""))
load("experimentPrep.RData")
path <- paste(getwd(), "/patientData", sep="")
dir.create(path, showWarnings = FALSE)  # create the patient data folder if it does not already exist
# Takes about 2-3 minutes for test dataset
for (patient in 1:nrow(data.pvals)) {
  mle.getPatientBitstrings(ig, path, patient)
}
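# Optional speedup (a sketch, not part of MolEndoMatch): each patient's
# bitstrings are computed independently, so the loop above can be parallelized
# with the base 'parallel' package. Note mc.cores > 1 is unsupported on Windows;
# use the serial loop above there.
# library(parallel)
# invisible(mclapply(1:nrow(data.pvals),
#                    function(patient) mle.getPatientBitstrings(ig, path, patient),
#                    mc.cores = max(1, detectCores() - 1)))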



# 4. GET MINIMUM ENCODING LENGTH ----------------------------------------------
# You can inspect the encoding lengths for a given patient with this function; the mutual information metric in the next section uses it as well.
# Highly connected subsets have more 1's in the bitstring and larger d-scores (see the "d.entropy" column in the results object below).
patientNum=rownames(data.pvals)[10]
results <- mle.getEncodingLength(ig, path, data.pvals, patientNum)
results <- results[order(results[,"d.score"], decreasing = TRUE),]
sizeSubset = results[1,"subsetSize"]
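# Optional: eyeball the top-ranked encodings after the sort above; the best
# subset (largest d-score) is in the first row.
head(results)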



# 5. GET PATIENT SIMILARITY MATRIX --------------------------------------------
# Note: Each comparison takes about 2 seconds on average, and there are 50*50 pairwise comparisons in total,
#       so the expected execution time (run serially) is currently 75-90 minutes for the test dataset.
patientSimilarity <- matrix(0, nrow=nrow(data.pvals), ncol=nrow(data.pvals))
rownames(patientSimilarity) <- rownames(data.pvals)
colnames(patientSimilarity) <- rownames(data.pvals)
# Loop indices are named i/j here so they do not clobber the global tuning parameter p1 set in 2.2.
for (i in 1:nrow(patientSimilarity)) {
  pt1 <- rownames(data.pvals)[i]
  print(pt1)
  for (j in 1:ncol(patientSimilarity)) {
    pt2 <- rownames(data.pvals)[j]
    patientSimilarity[pt1,pt2] <- mle.getPatientSimilarity(ig, path, data.pvals, pt1, pt2)
  }
}
# Normalize scale by row so min is 0 and max is 1.
for (i in 1:nrow(patientSimilarity)) {
  tmp <- as.matrix(patientSimilarity[i,])
  patientSimilarity[i,] <- (tmp-min(tmp))/(max(tmp)-min(tmp))
}
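# Optional downstream sketch (an assumption, not part of MolEndoMatch): row-wise
# normalization breaks symmetry, so if you want to cluster patients, symmetrize
# first and convert similarity to distance.
# patientSimilaritySym <- (patientSimilarity + t(patientSimilarity))/2
# hc <- hclust(as.dist(1 - patientSimilaritySym), method = "average")
# plot(hc, main = "Patients clustered by encoding-based similarity")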
plot.ptSimilarity(patientSimilarity, path=getwd())  # getwd() is the project folder set in Section 3