knitr::opts_chunk$set(echo = TRUE)

CUE train model

This rmarkdown show how we train each imputation model. Three approaches for selecting weights for CUE can be found in our paper. These training processes can be used for users who would like to generate a new set of imputation reference models, for example by providing 450K and EPIC array data for a tissue other than placenta or whole blood (which are the only tissues currently available with our pretrained models).

# generate HM450 as x and HM850 as y
n = 200              # number of sampels
x <- matrix(rbeta(n*50,1,1),ncol=n) # upperstream 25 probes and downstream 25 probes measured by HM450
y <- rbeta(n,1,1)    # the target HM850 probe

Meth<-data.frame(cbind(y,t(x)))
train=1:160
test=161:200

Train with PFR

source("R/refund_lib.R")
load("PTSD/Annotations.RData")
#load("Data/Annotations.RData")
p=dim(annotation.450K)[1]
X_450 = matrix(rbeta(n*p,1,1),ncol=n) 
rownames(X_450) <- rownames(annotation.450K)
X_450 = log2(X_450/(1-X_450))

Y=log2(y/(1-y))
X=log2(x/(1-x))

isl_group <- c("", "Island", "N_Shelf", "N_Shore", "S_Shore", "S_Shelf")

train.dens <- list()
train.funcs <- list()
test.dens <- list()
test.funcs <- list()

for (l in isl_group) {
    if (l == "") {
      j <- "NA"
    } else {
      j <- l
    }
    # max(X)==13.28757
    test.dens[[j]] <- X_450[annotation.450K[, "Relation_to_UCSC_CpG_Island"] == l,test]
    test.funcs[[j]] <- apply(test.dens[[j]], 2, function(x) {density(x,from=-13.38757,to=13.38757)$y})
    train.dens[[j]] <- X_450[annotation.450K[, "Relation_to_UCSC_CpG_Island"] == l,train] 
    train.funcs[[j]] <- apply(train.dens[[j]], 2, function(x) {density(x,from=-13.38757,to=13.38757)$y})   
}

## random select a target probe Y to impute:
target_probe = rownames(annotation.EPIC)[sample.int(dim(annotation.EPIC),1)]

j <- paste(annotation.EPIC[target_probe,"Relation_to_UCSC_CpG_Island"])
    if (j == "") {
      j <- "NA"
    }
fit = new_pfr(unlist(Y[train]),
                    t(X[,train]), 
                    t(train.funcs[[j]]))

X.test <- create_predictors(t(X[,test]), t(test.funcs[[j]]))
Y.test <- X.test %*% fit$coefs
pred_PFR_test <- 2 ^ Y.test / (2 ^ Y.test + 1)

Train with XGBoosting

library(xgboost)
bst <- xgboost(data = t(x[,train]),
                             label = y[train],
                             #ax.depth = 2, eta = 1,
                             thread = 1, nrounds = 10,
                             objective = "reg:linear",verbose = 0)

pred_XGB_test<-predict(bst,t(x[,test]))

Train with random forests

library(randomForest)
rf=randomForest(as.formula(paste("y~.")), data = Meth, subset=train ,mty=5,ntree=10)

pred_RF_test<-predict(rf,Meth[test,2:(dim(Meth)[2])])

Train with KNN

library("class")

knn=knn(train = Meth[train,2:dim(x)[1]],
                      test = Meth[test,2:dim(x)[1]],
                      cl = Meth[train,1],
                      k=50,
                      prob=T)

pred_KNN_test<-as.numeric(as.matrix(knn))

Train with logistic regression

y[y>0.5]=1;y[y<=0.5]=0;
Meth<-data.frame(cbind(y,t(x)))
logit=glm(as.formula(paste("y~.")),Meth[train,],family = binomial)
# Note for some probes: the alg might not converge. In this case, we recommend to abondon logistic and work with the rest 4 methods if this happens.
# glm.fit: algorithm did not convergeglm.fit: fitted probabilities numerically 0 or 1 occurred
pred_Logistic_test<-predict(logit,Meth[test,],type = "response")

CUE prediction

# equal weights
K=4
pred_CUE_equal_weighs <- 1/K * (pred_Logistic_test + pred_KNN_test + pred_RF_test +pred_PFR_test)
#pred_CUE <- 


GangLiTarheel/CUE documentation built on Oct. 11, 2021, 5:48 a.m.