eclust: Environment Based Clustering for Interpretable Predictive Models in High Dimensional Data

library(eclust)
library(magrittr)

parametersDf <- expand.grid(rho = c(0.90),
                            p = c(500),
                            # SNR = c(0.2,1,2),
                            SNR = 1,
                            n = c(100), # this is the total train + test sample size
                            # nActive = c(300), # must be even because its being split among two modules
                            #n0 = 200,
                            cluster_distance = c("tom","corr"),
                            Ecluster_distance = c("difftom", "diffcorr"),
                            rhoOther = 0.6,
                            betaMean = c(2),
                            alphaMean = c(1),
                            betaE = 3,
                            includeInteraction = FALSE,
                            includeStability = TRUE,
                            distanceMethod = "euclidean",
                            clustMethod = "hclust",
                            #cutMethod = "gap",
                            cutMethod = "dynamic",
                            agglomerationMethod = "average",
                            # agglomerationMethod = "ward.D",
                            K.max = 10, B = 10, stringsAsFactors = FALSE)

parametersDf <- transform(parametersDf, n0 = n/2, nActive = p*0.10)
parametersDf <- parametersDf[which(parametersDf$cluster_distance=="tom" & parametersDf$Ecluster_distance=="difftom" |
                                     parametersDf$cluster_distance=="corr" & parametersDf$Ecluster_distance=="diffcorr"),]
parameterIndex = 2
simulationParameters <- parametersDf[parameterIndex,, drop = F]

print(simulationParameters)

## ---- generate-data ----
message("generating data")

p <- simulationParameters[,"p"];
n <- simulationParameters[,"n"];
n0 <- simulationParameters[,"n0"];
SNR <- simulationParameters[,"SNR"]
n1 <- n - n0
cluster_distance <- simulationParameters[,"cluster_distance"]
Ecluster_distance <- simulationParameters[,"Ecluster_distance"]
rhoOther <- simulationParameters[,"rhoOther"];
betaMean <- simulationParameters[,"betaMean"];
betaE <- simulationParameters[,"betaE"]
alphaMean <- simulationParameters[,"alphaMean"];
rho <- simulationParameters[,"rho"];
nActive <- simulationParameters[,"nActive"];
includeInteraction <- simulationParameters[,"includeInteraction"]
includeStability <- simulationParameters[,"includeStability"]
distanceMethod <- simulationParameters[,"distanceMethod"]
clustMethod <- simulationParameters[,"clustMethod"]
cutMethod <- simulationParameters[,"cutMethod"]
agglomerationMethod <- simulationParameters[,"agglomerationMethod"]
K.max <- simulationParameters[,"K.max"]
B <- simulationParameters[,"B"]

# in this simulation its blocks 3 and 4 that are important
# leaveOut:  optional specification of modules that should be left out
# of the simulation, that is their genes will be simulated as unrelated
# ("grey"). This can be useful when simulating several sets, in some which a module
# is present while in others it is absent.
d0 <- s_modules(n = n0, p = p, rho = c(0,0), exposed = FALSE,
                modProportions = c(0.15,0.15,0.15,0.15,0.15,0.25),
                minCor = 0.01,
                maxCor = 1,
                corPower = 1,
                propNegativeCor = 0.3,
                backgroundNoise = 0.5,
                signed = FALSE,
                leaveOut = 1:4)

d1 <- s_modules(n = n1, p = p, rho = c(rho, rho), exposed = TRUE,
                modProportions = c(0.15,0.15,0.15,0.15,0.15,0.25),
                minCor = 0.4,
                maxCor = 1,
                corPower = 0.3,
                propNegativeCor = 0.3,
                backgroundNoise = 0.5,
                signed = FALSE)

# these should be the same. if they arent, its because I removed the red and
# green modules from the E=0 group
truemodule0 <- d0$setLabels
t0 <- table(truemodule0)
truemodule1 <- d1$setLabels
t1 <- table(truemodule1)
table(truemodule0,truemodule1)

# Convert labels to colors for plotting
moduleColors <- WGCNA::labels2colors(truemodule1)
table(moduleColors, truemodule1)

# does not contain E, needs to bind unexposed first
X <- rbind(d0$datExpr, d1$datExpr) %>%
  magrittr::set_colnames(paste0("Gene", 1:p)) %>%
  magrittr::set_rownames(paste0("Subject",1:n))

dim(X)


# pheatmap::pheatmap(cor(X))
# pheatmap::pheatmap(cor(d1$datExpr))
# pheatmap::pheatmap(cor(d0$datExpr))
# pheatmap(cor(d1$datExpr)-cor(d0$datExpr))
# pheatmap(WGCNA::TOMsimilarityFromExpr(X))
# pheatmap(WGCNA::TOMsimilarityFromExpr(d1$datExpr))
# pheatmap(WGCNA::TOMsimilarityFromExpr(d1$datExpr)-WGCNA::TOMsimilarityFromExpr(d0$datExpr))

# betaMainEffect <- vector("double", length = p)
# betaMainInteractions <- vector("double", length = p)
# # first assign random uniform to every gene in cluster 3 and 4,
# # then randomly remove so that thers only nActive left
# betaMainEffect[which(truemodule1 %in% 3:4)] <- runif(sum(truemodule1 %in% 3:4),
#                                                      betaMean - 0.1, betaMean + 0.1)
#
# # randomly set some coefficients to 0 so that there are only nActive non zero
# betaMainEffect <- replace(betaMainEffect,
#                           sample(which(truemodule1 %in% 3:4), sum(truemodule1 %in% 3:4) - nActive,
#                                  replace = FALSE), 0)
#
# betaMainInteractions[which(betaMainEffect!=0)] <- runif(nActive, alphaMean - 0.1, alphaMean + 0.1)
#
# beta <- c(betaMainEffect,
#           betaE,
#           betaMainInteractions)
# plot(beta)

betaMainEffect <- vector("double", length = p)

# betaMainInteractions <- vector("double", length = p)

# the first nActive/2 in the 3rd block are active
betaMainEffect[which(truemodule1 %in% 3)[1:(nActive/2)]] <- runif(
  nActive/2, betaMean - 0.1, betaMean + 0.1)

# the first nActive/2 in the 4th block are active
betaMainEffect[which(truemodule1 %in% 4)[1:(nActive/2)]] <- runif(
  nActive/2, betaMean+2 - 0.1, betaMean+2 + 0.1)

# betaMainInteractions[which(betaMainEffect!=0)] <- runif(nActive, alphaMean - 0.1, alphaMean + 0.1)

# must be in this order!!!! main effects, E, and then interactions... this order is being used
# by the generate_data function
beta <- c(betaMainEffect,
          betaE)#,betaMainInteractions)

plot(beta)

simulated_data <- s_generate_data(p = p,
                          X = X,
                          beta = beta,
                          include_interaction = includeInteraction,
                          cluster_distance = cluster_distance,
                          n = n,
                          n0 = n0,
                          eclust_distance = Ecluster_distance,
                          signal_to_noise_ratio = SNR,
                          distance_method = distanceMethod,
                          cluster_method = clustMethod,
                          cut_method = cutMethod,
                          agglomeration_method = agglomerationMethod,
                          K.max = K.max, B = B, nPC = 1)

# devtools::use_data(simulated_data)


simulated_data$similarity %>% dim
simulated_data$DT %>% dim
simulated_data$DT %>% str
pryr::object_size(simulated_data$DT)

# pryr::object_size(simulated_data$DT[,1:1002])
#
# pryr::object_size(simulated_data$DT)

simdata <- simulated_data$DT[,c(1,502,2:501)]
simdata[1:5,1:5]
simdata %>% dim
devtools::use_data(simdata, overwrite = TRUE)
devtools::check()

sahirbhatnagar/eclust documentation built on May 29, 2019, 12:58 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

sahirbhatnagar/eclust
Environment Based Clustering for Interpretable Predictive Models in High Dimensional Data

data-raw/simulated-data-processing.R
In sahirbhatnagar/eclust: Environment Based Clustering for Interpretable Predictive Models in High Dimensional Data

R Package Documentation

Browse R Packages

We want your feedback!

sahirbhatnagar/eclust Environment Based Clustering for Interpretable Predictive Models in High Dimensional Data

data-raw/simulated-data-processing.R In sahirbhatnagar/eclust: Environment Based Clustering for Interpretable Predictive Models in High Dimensional Data

R Package Documentation

Browse R Packages

We want your feedback!

sahirbhatnagar/eclust
Environment Based Clustering for Interpretable Predictive Models in High Dimensional Data

data-raw/simulated-data-processing.R
In sahirbhatnagar/eclust: Environment Based Clustering for Interpretable Predictive Models in High Dimensional Data