Home

/

GitHub

/

bioinformatist/ICE

/

README.md

README.md
In bioinformatist/ICE: ICE: Impute CircRNA Expression

ICE

ICE (Impute CircRNA Expression) is designed for imputing circRNA expression using PCGs (protein-coding genes).

Currently bundled example data may be huge in size. One could try use proxy and pull it first.

remotes::install_github("bioinformatist/ICE")

`library()` package and load example data

library(ICE)

Currently data used for testing is too large to make a bundle. You should follow example dataset part to prepare them.

library(ICE)
library(ggplot2)
library(data.table)

cv.full.pearson <- list(
  ICE_cv_entire(train.pcg = mionco.pcg, train.circ = mionco.circ, cor.method = 'pearson', method = 'KNN'),
  ICE_cv_entire(train.pcg = mionco.pcg, train.circ = mionco.circ, cor.method = 'pearson', method = 'RF', ncores = 56),
  ICE_cv_entire(train.pcg = mionco.pcg, train.circ = mionco.circ, cor.method = 'pearson', method = 'EN', ncores = 56),
  ICE_cv_entire(train.pcg = mionco.pcg, train.circ = mionco.circ, cor.method = 'pearson', method = 'lasso', ncores = 56),
  ICE_cv_entire(train.pcg = mionco.pcg, train.circ = mionco.circ, cor.method = 'pearson', method = 'SVM', ncores = 56))
names(cv.full.pearson) <- c('KNN', 'RF', 'EN', 'lasso', 'SVM')
cv.full.pearson <- lapply(names(cv.full.pearson), function(x) data.table(cv.full.pearson[[x]], method = x))
cv.full.pearson <- do.call(rbind, cv.full.pearson)
ggplot() + geom_boxplot(data = cv.full.pearson, aes(x = method, y = CC, color = method))
ggplot() + geom_density(data = cv.full.pearson[method == 'RF'], aes(CC))
ggplot() + geom_density(data = cv.full.pearson[method == 'RF'], aes(CC)) + scale_x_continuous(breaks = seq(0.5, 0.8, 0.025), labels = seq(0.5, 0.8, 0.025), limits = c(0.4, 0.825))
all.CC <- rbind(cv.full.pearson[, .(CC, method)], data.table(DL, method = 'DL'))

require("reticulate")

source_python("pickle_reader.py")
DL <- read_pickle_file("cc_pancancer.pkl")
names(DL) <- 'CC'
all.CC <- rbind(cv.full.pearson[, .(CC, method)], data.table(DL, method = 'DL'))
ggplot() + geom_density(data = all.CC, aes(CC, color = method))

DL <- read_pickle_file("cc_pancancer_30L_77e_val_mse.pkl")
names(DL) <- 'CC'
all.CC <- rbind(all.CC, data.table(DL, method = 'DL_val_mse'))
ggplot() + geom_density(data = all.CC, aes(CC, color = method))

DL <- read_pickle_file("final.pkl")
names(DL) <- 'CC'
all.CC <- rbind(all.CC, data.table(DL, method = 'DL_final'))
ggplot() + geom_density(data = all.CC, aes(CC, color = method))

cv.full.40.pearson <- list(
  ICE_cv_entire(train.pcg = mionco.pcg, train.circ = mionco.circ, cor.method = 'pearson', cutoff = 40, method = 'KNN'),
  ICE_cv_entire(train.pcg = mionco.pcg, train.circ = mionco.circ, cor.method = 'pearson', cutoff = 40, method = 'RF', ncores = 56),
  ICE_cv_entire(train.pcg = mionco.pcg, train.circ = mionco.circ, cor.method = 'pearson', cutoff = 40, method = 'EN', ncores = 56),
  ICE_cv_entire(train.pcg = mionco.pcg, train.circ = mionco.circ, cor.method = 'pearson', cutoff = 40, method = 'lasso', ncores = 56),
  ICE_cv_entire(train.pcg = mionco.pcg, train.circ = mionco.circ, cor.method = 'pearson', cutoff = 40, method = 'SVM', ncores = 56))
names(cv.full.40.pearson) <- c('KNN', 'RF', 'EN', 'lasso', 'SVM')
cv.full.40.pearson <- lapply(names(cv.full.40.pearson), function(x) data.table(cv.full.40.pearson[[x]], method = x))
cv.full.40.pearson <- do.call(rbind, cv.full.40.pearson)
DL <- read_pickle_file("final_40.pkl")
names(DL) <- 'CC'
all.CC.40 <- rbind(cv.full.40.pearson[, .(CC, method)], data.table(DL, method = 'DL_40'))
ggplot() + geom_density(data = all.CC, aes(CC, color = method))

To perform further analysis on circRNAs, we use convert genome region (Chromosome:Start-End) as circRNA identifiers.

One pair of circRNA and PCGs matrices is from paper The Landscape of Circular RNA in Cancer.

circRNA expression matrix was downloaded from this website with the metadata and pre-processed by below code:

library(data.table)
mionco.circ <- fread('v0.1.release.txt')[, `:=`(symbol = NULL, release = NULL, ID = paste0(chr, ':', start, '-', end))]
mionco.circ <- dcast(mionco.circ, formula = sample ~ ID, value.var = 'reads', fun.aggregate = max)
mionco.circ <- as.matrix(mionco.circ, rownames = 'sample')

mionco.pcg <- fread('fpkm_matrix.csv', drop = 1)
mionco.pcg[, Row.names := tstrsplit(Row.names, '\\.')[[1]]]
mionco.pcg[, V2 := NULL]

library(biomaRt)
mart <- useMart(biomart="ensembl", dataset="hsapiens_gene_ensembl")
pcg.id <- getBM(attributes = 'ensembl_gene_id', filters = "biotype", values = "protein_coding", mart = mart)
setnames(mionco.pcg, 'Row.names', 'ensembl_gene_id')
mionco.pcg <- mionco.pcg[pcg.id, on = 'ensembl_gene_id', nomatch = NULL]
mionco.pcg <- as.matrix(mionco.pcg, rownames = 'ensembl_gene_id')
mionco.pcg <- t(mionco.pcg)

sample.used <- intersect(rownames(mionco.circ), rownames(mionco.pcg))
mionco.circ <- mionco.circ[sample.used, ]
mionco.pcg <- mionco.pcg[sample.used, ]

The CCLE circRNAs expression was downloaded from CircRic database, while the RNAseq gene expression as well as sample (cell line) annotation was downloaded from CCLE portal of The Broad Institute. As CCLE data is processed with hg19, we also need prepare a chain file for liftOver.

ccle.circ <- fread('circRNA_expression.csv')[, `:=`(ccl_id = NULL, cancer_type = NULL, circ_gene = NULL)]
anno <- fread('Cell_lines_annotations_20181226.txt')
anno <- anno[, .(CCLE_ID, Name)]
ccle.circ <- merge(ccle.circ, anno, by.x = 'ccl_name', by.y = 'Name')[, ccl_name := NULL]
splitted <- tstrsplit(ccle.circ[[1]], '_|\\|')
temp.19 <- GRanges(seqnames = paste0('chr', splitted[[1]]), ranges = IRanges(start = as.numeric(splitted[[2]]), end = as.numeric(splitted[[3]])), name = ccle.circ[['CCLE_ID']], count = ccle.circ[['number_bsReads']])
chain <- import.chain("hg19ToHg38.over.chain")
temp.38 <- liftOver(temp.19, chain)
temp.38 <- as.data.table(unlist(temp.38))
ccle.circ <- data.table(ID = paste0(temp.38[['seqnames']], ':', temp.38[['start']], '-', temp.38[['end']]), name = temp.38[['name']], count = temp.38[['count']])
ccle.circ <- dcast(ccle.circ, formula = name ~ ID, value.var = 'count', fun.aggregate = max)
ccle.circ <- as.matrix(ccle.circ, rownames = 'name')

ccle.pcg <- fread('CCLE_RNAseq_rsem_genes_tpm_20180929.txt')
ccle.pcg[, `:=`(transcript_ids = NULL, gene_id = tstrsplit(gene_id, '\\.')[[1]])]
mart <- useMart(biomart="ensembl", dataset="hsapiens_gene_ensembl")
pcg.id <- getBM(attributes = 'ensembl_gene_id', filters = "biotype", values = "protein_coding", mart = mart)
setnames(ccle.pcg, 'gene_id', 'ensembl_gene_id')
ccle.pcg <- ccle.pcg[pcg.id, on = 'ensembl_gene_id', nomatch = NULL]
ccle.pcg <- as.matrix(ccle.pcg, rownames = 'ensembl_gene_id')
ccle.pcg <- t(ccle.pcg)

sample.used <- intersect(rownames(ccle.circ), rownames(ccle.pcg))
ccle.circ <- ccle.circ[sample.used, ]
ccle.pcg <- ccle.pcg[sample.used, ]

Up to now, mionco.circ and mionco.pcg could be used as circRNA and PCG dataset for training or testing, and the ccle.circ and ccle.pcg ditto.

# Keep circRNAs expressed in at least 1% sample
# 非常惨QAQ
train.circ <- ICE:::filter_circ(mionco.circ, 1)
new.circ <- ICE:::filter_circ(ccle.circ, 1)
train.pcg <- ICE:::pre_process(mionco.pcg)
new.pcg <- ICE:::pre_process(ccle.pcg)
# 取完交集就剩一点点了
used.circ.name <- intersect(colnames(new.circ), colnames(train.circ))

fwrite(train.circ[, used.circ.name], 'train_circ.csv')
fwrite(new.circ[, used.circ.name], 'new_circ.csv')
fwrite(train.pcg, 'train_pcg.csv')
fwrite(new.pcg, 'new_pcg.csv')

sfInit(parallel = TRUE, cpu = 56)
cl <- sfGetCluster()
sfExport('train.pcg', 'train.circ', 'new.pcg')
sfExport('ICE', namespace = 'ICE')
pboptions(use_lb = TRUE)

predicted <- pblapply(used.circ.name, function(x) ICE(train.pcg, train.circ, new.pcg, x, method = 'RF', filter = FALSE), cl = cl)
names(predicted) <- used.circ.name
cd.cc <- unlist(lapply(used.circ.name, function(x) cor(predicted[[x]], ccle.circ[, x], method = 'spearman')))

meta <- fread('meta_update.csv')
meta[, count := .N, by = 'Analysis Cohort']
head(unique(meta, by = 'Analysis Cohort')[order(-count)])

fwrite(ICE:::filter_circ(mionco.circ), 'mionco_circ.csv')
fwrite(ICE:::pre_process(mionco.pcg), 'mionco_pcg.csv')

cohort_cor <- function(x){
  cohort.sample.used <- intersect(sample.used, meta[x, on = 'Analysis Cohort', ID])
  if(length(cohort.sample.used) == 0) return()
  circ.cohort <- mionco.circ[cohort.sample.used, ]
  pcg.cohort <- mionco.pcg[cohort.sample.used, ]

  cv <- list(
  ICE_cv_entire(train.pcg = pcg.cohort, train.circ = circ.cohort, cor.method = 'pearson', folds = 3, method = 'KNN'),
  ICE_cv_entire(train.pcg = pcg.cohort, train.circ = circ.cohort, cor.method = 'pearson', folds = 3, method = 'RF'))
  names(cv) <- c('KNN', 'RF')
  cv <- lapply(names(cv), function(x) data.table(cv[[x]], method = x))
  cv <- do.call(rbind, cv)
  (data.table(cv, cohort = x))
}

cv.full <- lapply(head(unique(meta, by = 'Analysis Cohort')[order(-count)])[['Analysis Cohort']], function(x) cohort_cor(x))

names(cv.full) <- head(unique(meta, by = 'Analysis Cohort')[order(-count)])[['Analysis Cohort']]

cv.full <- Filter(function(x) !is.null(x), cv.full)
cv.full <- do.call(rbind, cv.full)
ggplot() + geom_density(data = cv.full[method == 'RF' & cohort == 'PRAD'], aes(CC))
d_fun <- ecdf (cv.full[method == 'RF' & cohort == 'PRAD', CC])
ggplot(data = cv.full, aes(x = cohort, y = CC, fill = method)) + geom_boxplot() + geom_jitter(aes(fill = method), shape=16, position=position_jitterdodge(0.2))

library(data.table)
ip <- fread('rpkm_ip_norm_m.txt')
anno <- fread('fix.anno.txt')

library(mlr3)
library(data.table)
# install.packages("mlr3learners.randomforest", repos = "https://mlr3learners.github.io/mlr3learners.drat")
library(mlr3learners.randomforest)
library(paradox)
library(mlr3tuning)

train.circ <- scale(ICE:::filter_circ(mionco.circ))
train.pcg <- ICE:::pre_process(mionco.pcg)
train.pcg <- scale(ICE:::corf(train.pcg, train.circ, "chr7:23611170-23611553", 50))

task.circ <- TaskRegr$new(id = 'circ', backend = data.table(target = train.circ[, "chr7:23611170-23611553"], train.pcg), target = "target")

learner.circ <- lrn('regr.randomForest')
# learner.circ$param_set
tune.ps <- ParamSet$new(list(
  ParamInt$new('ntree', lower = 1, upper = 500),
  ParamInt$new('mtry', lower = 1, upper = 33),
  ParamInt$new('nodesize', lower = 1, upper = 50)
))

instance <- TuningInstance$new(
  task = task.circ,
  learner = learner.circ,
  resampling = rsmp('cv'),
  measures = msr('regr.mse'),
  param_set = tune.ps,
  terminator = term("stagnation", threshold = 1e-5)
)

tuner <- tnr("grid_search", resolution = 10)

result <- tuner$tune(instance)

learner.circ <- lrn('regr.svm')
# learner.circ$param_set
tune.ps <- ParamSet$new(list(
  ParamDbl$new('cost', lower = 0.0001, upper = 10000),
  ParamDbl$new('gamma', lower = 0.01, upper = 1),
  ParamFct$new('type', 'eps-regression'),
  ParamFct$new('kernel', 'radial')
))

instance <- TuningInstance$new(
  task = task.circ,
  learner = learner.circ,
  resampling = rsmp('cv'),
  measures = msr('regr.mse'),
  param_set = tune.ps,
  terminator = term("stagnation", threshold = 1e-5)
)

tuner <- tnr("grid_search", resolution = 10)

result <- tuner$tune(instance)

learner.circ <- lrn('regr.fnn')
# learner.circ$param_set
tune.ps <- ParamSet$new(list(
  ParamInt$new('k', lower = 1, upper = task.circ$nrow - 100)
))

instance <- TuningInstance$new(
  task = task.circ,
  learner = learner.circ,
  resampling = rsmp('cv'),
  measures = msr('regr.mse'),
  param_set = tune.ps,
  terminator = term("stagnation", threshold = 1e-5)
)

tuner <- tnr("grid_search", resolution = 10)

result <- tuner$tune(instance)

library(glmnet)
cv.lasso <- cv.glmnet(train.pcg, train.circ[, "chr7:23611170-23611553"], alpha = 1, family = 'gaussian')
cv.lasso$lambda.min

library(caret)
en.model <- suppressWarnings(train(
        train.pcg, train.circ[, "chr7:23611170-23611553"], method = 'glmnet',
        trControl = trainControl("cv", number = 10),
        tuneLength = 100))
en.model$bestTune

bioinformatist/ICE documentation built on July 5, 2020, 12:20 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

bioinformatist/ICE
ICE: Impute CircRNA Expression

README.md
In bioinformatist/ICE: ICE: Impute CircRNA Expression

ICE

Installation

Quick start

`library()` package and load example data

Performance

ML and DL on 871 circRNAs

ML and DL on 2672 circRNAs

Example dataset

Pan-cancer dataset

CCLE dataset

Cross-dataset validation

m6A

Tuning for static ML parameters

Tuning for random forest

Tuning for SVM

Tuning for KNN

Tuning for lasso regression

Tuning for Elastic Net

R Package Documentation

Browse R Packages

We want your feedback!

bioinformatist/ICE ICE: Impute CircRNA Expression

README.md In bioinformatist/ICE: ICE: Impute CircRNA Expression

ICE

Installation

Quick start

library() package and load example data

Performance

ML and DL on 871 circRNAs

ML and DL on 2672 circRNAs

Example dataset

Pan-cancer dataset

CCLE dataset

Cross-dataset validation

m6A

Tuning for static ML parameters

Tuning for random forest

Tuning for SVM

Tuning for KNN

Tuning for lasso regression

Tuning for Elastic Net

R Package Documentation

Browse R Packages

We want your feedback!

bioinformatist/ICE
ICE: Impute CircRNA Expression

README.md
In bioinformatist/ICE: ICE: Impute CircRNA Expression

`library()` package and load example data