# SIMULATIONS/SIMULATION_plot_rep.R In emanuelealiverti/SOG:

#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Reproduce simulations for the paper.
# The seed to create the data is always the same, while changing the seed before train/test splitting allows to evaluate results over different splits, as suggested by an anonymous referee
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

rm(list = ls())

#+++++++++++++++++++++++++
# Takes roughly 10 minutes
#+++++++++++++++++++++++++
nsim = 50
out = array(0, dim = c(nsim,12,5))
seeds = 1:nsim
preds = list()
for(i in 1:nsim){
#this seed will be used only for the splitting
seed = seeds[i]
set.seed(2711)

#++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Scenario 1. Low rank structures, Y depend on X and Z.
#++++++++++++++++++++++++++++++++++++++++++++++++++++++
k.rid = 10
#n = 500
#p = 1000
n = 1000
p = 200

W = matrix(rnorm(p*k.rid), k.rid)
S = matrix(rnorm(n*k.rid), n)
z=rep(0:1, each=n/2)

lambda = rnorm(k.rid)
A = (S - lambda * z ) %*% W
X = scale(A)
beta = runif(k.rid,-5,5)

y = rnorm(n, mean = (S - lambda * z ) %*% beta )

set.seed(seed)
id.train = sample(1:n,.75*n)
id.test = setdiff(1:n, id.train)

require(sOG)
#Estimation
K=10
res_sfpl = sOG(X[id.train,], z=z[id.train], K=10,  t_val = 7,control = control_sOG(lim_step = 1,lim_max=5000))
res_fpl = OG(X=X[id.train,], z=z[id.train], K=10)
res_comp = svd(t(sva::ComBat(dat = t(X[id.train,]),batch = as.vector(z[id.train]))),nu=K,nv=K)
?sva::sva
res_sva = svd(t(sva::psva(dat = t(X[id.train,]),batch = as.factor(z[id.train]),n.sv=K)),nu=K,nv=K)
SVD = svd(X[id.train,], nu = K, nv = K)

#Linear models
y_train = y[id.train]
y_test = y[id.test]

m1 = lm(y_train ~ res_fpl$S) m2 = lm(y_train ~ res_sfpl$S)
m_comp = lm(y_train ~ res_comp$u) m_sva = lm(y_train ~ res_sva$u)
m_SVD = lm(y_train ~ SVD$u) #Test matrices res_sfpl_test = sOG(X[id.test,], z=z[id.test], K=10, t_val = 5,control = control_sOG(lim_step = 1,lim_max=5000)) res_fpl_test = OG(X=X[id.test,], z=z[id.test], K=10) res_comp_test = svd(t(sva::ComBat(dat = t(X[id.test,]),batch = as.vector(z[id.test]))),nu=K,nv=K) res_sva_test = svd(t(sva::psva(dat = t(X[id.test,]),batch = as.factor(z[id.test]),n.sv=K)),nu=K,nv=K) SVD_test = svd(X[id.test,], nu = K, nv = K) preds[[i]] = data.frame("COMBAT" = cbind(1,res_comp_test$u) %*% coef(m_comp),
"PSVA" = cbind(1, res_sva_test$u) %*% coef(m_sva), "OG" = cbind(1, res_fpl_test$S) %*% coef(m1),
"SOG" = cbind(1, res_sfpl_test$S) %*% coef(m2), "SVD" = cbind(1, SVD_test$u) %*% coef(m_SVD)
#"z" = z[id.test]
)

cat(i,'\n')
}

predsN = list()
for(i in 1:length(preds)){
seed = seeds[i]
set.seed(seed)
id.train = sample(1:n,.75*n)
id.test = setdiff(1:n, id.train)
tmp = preds[[i]]
tmp$z = z[id.test] predsN[[i]] = tmp } cor_res = matrix(unlist(lapply(predsN, function(x) cor(x)[6,-6])),nrow = nsim,byrow = T) colnames(cor_res) = colnames(predsN[[1]])[-6] boxplot(cor_res) df = reshape2::melt(cor_res) df$Var2 = factor(df\$Var2, labels = c("COMBAT", "PSVA", "OG", "SOG", "SVD"))
require(latex2exp)
require(ggplot2)
ggplot(df, aes(x = Var2, y = value)) + geom_boxplot(size = 1.5) + theme_bw(base_size = 18) +
#geom_jitter(aes(x = Var2, y = value), size = 2, alpha = .6, position=position_jitter(0.2)) +
#theme(panel.grid.major = element_blank(),
#panel.grid.minor = element_blank(),
#panel.background = element_blank()) +
scale_y_continuous(expand = c(0, 0.05)) +

#stat_summary(fun.y = mean, geom = "errorbar",
#aes(ymax = ..y.., ymin = ..y.., group = factor(Var2)),
#width = 0.75, linetype = "dashed", position = position_dodge()) +

xlab('')+
ylab(TeX('Correlation between \\hat{Y} and Z'))+
theme(strip.text = element_text(size=16),
#axis.title = element_text(size=18),
axis.text =  element_text(size=18),
axis.text.x =  element_text(angle = 0),
legend.position = 'none',
panel.spacing=unit(.5, "lines"),
panel.border = element_rect(color = "black", fill = NA, size = .5),
strip.background = element_rect(color = "black", size = .5),
) + guides(color = guide_legend(override.aes = list(size = 5)))

ggsave(file = "boxplot.pdf", width = 20, height = 4.5)

emanuelealiverti/SOG documentation built on Nov. 20, 2019, 12:45 a.m.