library(data.table)
library(magrittr)
library(purrr)
library(stringi)
library(car)
library(ewastools)
library(parallel)
samples = c("GSE77797","GSE99863","GSE42861","GSE85042")
samples = "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=" %s+% samples %s+% "&targ=gsm&form=text&view=brief"
samples %<>% map(readLines) %>% unlist
samples %<>% split(.,cumsum(. %like% "^\\^SAMPLE = GSM") )
# Extract GSM accessions
names(samples) = map(samples,1) %>% stri_match_first(regex="GSM\\d+")
# Parse meta data
imap(samples,function(s,acc){
s = strsplit(s,split=" = ",fixed=TRUE)
data.table(gsm=acc,variable=map_chr(s,1),value=map_chr(s,2))
}) -> samples
samples = rbindlist(samples)
# Keep only information on sample characteristics and supplementary files
samples = samples[variable %chin% c("!Sample_characteristics_ch1","!Sample_supplementary_file","!Sample_title","!Sample_series_id")]
samples[variable=="!Sample_title",variable:="name"]
samples[variable=="!Sample_series_id",variable:="gse"]
# Find the URLs pointing to the two .idat files
samples[variable=="!Sample_supplementary_file" & value %like% "_Red\\.idat",variable:="red"]
samples[variable=="!Sample_supplementary_file" & value %like% "_Grn\\.idat",variable:="grn"]
i = samples[variable == "!Sample_characteristics_ch1",which=TRUE]
ch = samples$value[i] %>% stri_split(fixed=": ")
samples$variable[i] = map_chr(ch,1)
samples$value [i] = map_chr(ch,2)
rm(ch)
samples[,variable:= recode(variable,"'b cell (%)'='B';'cd4+ t cell (%)'='CD4';'cd8+ t cell (%)'='CD8';'granulocyte (%)'='GR';'monocyte (%)'='MO';'natural killer cell (%)'='NK'")]
samples[,variable:= recode(variable,"'gender'='sex';'Sex'='sex'")]
# Reshape data.table from long to wide format
samples = dcast(samples, gsm ~ variable)
samples[,file:="benchmarks/" %s+% gsm]
samples = split(samples,by="gse")
# ---------------------------------------------------------------------------
estimateLC = function (meth,ref)
{
J = ncol(meth)
coefs = read.table(ref)
coefs = as.matrix(coefs)
n_celltypes = ncol(coefs)
markers = match(rownames(coefs), rownames(meth))
EST = sapply(1:J, function(j) {
tmp = meth[markers, j]
i = !is.na(tmp)
quadprog::solve.QP(t(coefs[i, ]) %*% coefs[i, ], t(coefs[i,
]) %*% tmp[i], diag(n_celltypes), rep(0, n_celltypes))$sol
})
EST = t(EST)
colnames(EST) = colnames(coefs)
EST = data.table(EST)
return(EST)
}
estimateLC2 = function (meth,ref)
{
J = ncol(meth)
coefs = read.table(ref)
coefs = as.matrix(coefs)
n_celltypes = ncol(coefs)
markers = match(rownames(coefs), rownames(meth))
EST = sapply(1:J, function(j) {
tmp = meth[markers, j]
i = !is.na(tmp)
quadprog::solve.QP(
t(coefs[i, ]) %*% coefs[i, ], t(coefs[i,]) %*% tmp[i],
cbind(rep(1,n_celltypes),diag(n_celltypes)),
c(1,rep(0, n_celltypes)),
meq=1
)$sol
})
EST = t(EST)
colnames(EST) = colnames(coefs)
EST = data.table(EST)
return(EST)
}
avg_r2 = function(ref){
LC = estimateLC(meth,ref=ref)
frml = paste(c("meth~1",names(LC)),collapse="+")
LC[,meth:=runif(.N)]
m = lm(frml,data=LC)
mm = model.matrix(m)
f = function(meth_i)
{
m[1:8] = lm.fit(mm,meth_i)
summary(m)$adj.r.squared
}
f = possibly(f,otherwise=NA)
rr = apply(meth,1,f)
mean(rr,na.rm=TRUE)
}
avg_r3 = function(ref){
LC = estimateLC(meth,ref=ref)
x = rowSums(as.matrix(LC))
for(c in names(LC)){ LC[[c]] = LC[[c]]/x}
frml = paste(c("meth~1",names(LC)),collapse="+")
LC[,meth:=runif(.N)]
m = lm(frml,data=LC)
mm = model.matrix(m)
f = function(meth_i)
{
m[1:8] = lm.fit(mm,meth_i)
summary(m)$adj.r.squared
}
f = possibly(f,otherwise=NA)
rr = apply(meth,1,f)
mean(rr,na.rm=TRUE)
}
training = list.files(path="../data",pattern="\\.txt$",full.names=TRUE)
training = setdiff(training,c("../data/blood_norm.txt","../data/hkcpgs.txt","../data/saliva.txt"))
# ---------------------------------------------------------------------------
# test set (GSE77797) https://doi.org/10.1186/s12859-016-0943-7
gse77797 = samples$GSE77797[,list(gsm,file)]
meth = gse77797 %$% file %>% read_idats %>% correct_dye_bias %>% dont_normalize
R2 = mclapply(training,avg_r2,mc.cores=length(training)) %>% unlist
# ---------------------------------------------------------------------------
# 689 Rheumatoid arthritis GSE42861
gse42861 = samples$GSE42861[,list(gsm,file)]
meth = gse42861 %$% file %>% read_idats %>% correct_dye_bias %>% dont_normalize
R3 = mclapply(training,avg_r2,mc.cores=length(training)) %>% unlist
# ---------------------------------------------------------------------------
# 251 2y olds from Gambia GSE99863
gse99863 = samples$GSE99863[,list(gsm,file)]
meth = gse99863 %$% file %>% read_idats %>% correct_dye_bias %>% dont_normalize
R4 = mclapply(training,avg_r2,mc.cores=length(training)) %>% unlist
# ---------------------------------------------------------------------------
# 71 cord blood samples (GSE85042)
gse85042 = samples$GSE85042[,list(gsm,file)]
meth = gse85042 %$% file %>% read_idats %>% correct_dye_bias %>% dont_normalize
R5 = mclapply(training,avg_r2,mc.cores=length(training)) %>% unlist
round(cbind(R2,R3,R4,R5),3)
# R2 R3 R4 R5
# "Bakulski.txt" 0.313 0.287 0.444 0.236
# "Bakulski+deGoede.txt" 0.291 0.283 0.452 0.229
# "Bakulski+Gervin.txt" 0.301 0.291 0.446 0.225
# "Bakulski+Lin.txt" 0.303 0.288 0.450 0.232
# "Bakulski+Mill.txt" 0.284 0.301 0.462 0.226
# "Bakulski+Reinius.txt" 0.324 0.298 0.457 0.234
# "Bakulski+Salas.txt" 0.298 0.298 0.487 0.233
# "deGoede.txt" 0.270 0.280 0.478 0.229
# "deGoede+Lin.txt" 0.278 0.281 0.471 0.229
# "deGoede+Mill.txt" 0.260 0.280 0.473 0.225
# "deGoede+Reinius.txt" 0.285 0.288 0.465 0.226
# "deGoede+Salas.txt" 0.286 0.273 0.505 0.232
# "Gervin.txt" 0.277 0.175 0.244 0.125
# "Gervin+deGoede.txt" 0.281 0.286 0.461 0.225
# "Gervin+Lin.txt" 0.284 0.162 0.245 0.128
# "Gervin+Mill.txt" 0.273 0.176 0.288 0.131
# "Gervin+Reinius.txt" 0.289 0.168 0.235 0.127
# "Gervin+Salas.txt" 0.287 0.167 0.249 0.130
# "Lin.txt" 0.278 0.159 0.255 0.129
# "Lin+Mill.txt" 0.274 0.173 0.284 0.137
# "Lin+Reinius.txt" 0.288 0.165 0.230 0.125
# "Lin+Salas.txt" 0.284 0.166 0.242 0.126
# "Mill.txt" 0.257 0.147 0.216 0.122
# "Reinius.txt" 0.283 0.210 0.307 0.122
# "Reinius+Mill.txt" 0.261 0.187 0.294 0.125
# "Reinius+Salas.txt" 0.285 0.159 0.245 0.122
# "Salas.txt" 0.278 0.165 0.245 0.104
# "Salas+Mill.txt" 0.268 0.165 0.289 0.146
# "Lolipop.txt" 0.270 0.108 0.314 0.156
bakulski = fread("saliva/clean_saliva_samples.csv")
bakulski = bakulski[cell_type=="Whole"]
meth = bakulski %$% file %>% read_idats %>% correct_dye_bias %>% dont_normalize
estimateLC(meth,"../data/saliva.txt")
estimateLC(meth,"../data/Bakulski.txt")
estimateLC2(meth,"../data/saliva.txt")
estimateLC2(meth,"../data/Bakulski.txt")
avg_r2("../data/saliva.txt")
# 0.2507195
avg_r3("../data/saliva.txt")
avg_r2("../data/bakulski.txt")
# 0.371119
avg_r3("../data/bakulski.txt")
## Comparison to EpiDISH
library(EpiDISH)
LC = epidish(beta.m=meth,ref.m=centEpiFibIC.m,method="RPC")$estF
LC %<>% data.table
LC = LC[,.(Epi,IC)]
frml = paste(c("meth~1",names(LC)),collapse="+")
LC[,meth:=runif(.N)]
m = lm(frml,data=LC)
mm = model.matrix(m)
f = function(meth_i)
{
m[1:8] = lm.fit(mm,meth_i)
summary(m)$adj.r.squared
}
f = possibly(f,otherwise=NA)
rr = apply(meth,1,f)
mean(rr,na.rm=TRUE)
# 0.2137797
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.