### =========================================================================
### preciseTADhub is an ExperimentHub package that stores pre-trained random
### forest models that can be leveraged to predict TAD and/or chromatin loop
### boundaries
### -------------------------------------------------------------------------
###
# preciseTADhub contains 84 lists as RDS objects
# Each list contains 2 objects: 1) a train object from \code{caret} with RF
# model information, and 2) a data.frame of variable importance for each feature
# included in the model
# The file names are stuctured as follows: i_j_k_l.rds
# where i denotes the chromosome that was used as a holdout (i.e. for
# testing), meaning all other chromosomes were used for training;
# i = {CHR1, CHR2, ..., CHR21, CHR22}
#
# j denotes the cell line;
# j = {GM12878, K562}
#
# k denotes the resolution (size of genomic bins);
# k = {5kb, 10kb}
#
# l denotes the TAD/loop caller used to define ground truth boundaries;
# l = {Arrowhead, Peakachu}
# For example the file named "CHR1_GM12878_5kb_Arrowhead.rds" is a list whose
# first item is a RF model that was built on data for chromosomes 2-22
# (omitting CHR9; see https://doi.org/10.1101/2020.09.03.282186), binned using
# 5 kb bins, ground truth TAD boundaries were identified using the Arrowhead
# TAD caller at 5 kb on GM12878. All models included the same number of
# predictors including CTCF, RAD21, SMC3, and ZNF143. The second item in the
# list is a data.frame with variable importances for CTCF, RAD21, SMC3, and
# ZNF143.
# The pre-trained models set up users to apply them to predict their own
# boundaries on chromosomes that were heldout, per the framework in the
# preciseTAD paper (https://doi.org/10.1101/2020.09.03.282186).
# The following is an example script describing the steps involved in making
# one of the RDS objects stored in preciseTADhub:
# "CHR1_GM12878_5kb_Arrowhead.rds"
# This example follows from the preciseTAD R package (see the detailed vignette
# at https://bioconductor.org/packages/devel/bioc/html/preciseTAD.html)
library(preciseTAD)
# Read in ARROWHEAD-called TADs at 5kb
data(arrowhead_gm12878_5kb)
# Extract unique boundaries
bounds.GR <- extractBoundaries(domains.mat = arrowhead_gm12878_5kb,
filter = FALSE,
CHR = paste0("CHR", c(22)),
resolution = 5000)
# Read in GRangesList of 26 TFBS
data(tfbsList)
tfbsList_filt <- tfbsList[names(tfbsList) %in% c("Ctcf",
"Rad21",
"Smc3")]
# Create the binned data matrix for CHR2-22 (training)
# using 5 kb binning, distance-type predictors from 4 TFBS from
# the GM12878 cell line, and random under-sampling
tadData <- createTADdata(bounds.GR = bounds.GR,
resolution = 5000,
genomicElements.GR = tfbsList_filt,
featureType = "distance",
resampling = "rus",
trainCHR = paste0("CHR", c(22)),
predictCHR = paste0("CHR", c(22)))
# Perform random forest using TADrandomForest by tuning mtry over 10 values
# using 3-fold CV
CHR22_GM12878_5kb_Arrowhead <- TADrandomForest(trainData = tadData[[1]],
testData = tadData[[2]],
tuneParams = list(mtry = 2,
ntree = 500,
nodesize = 1),
cvFolds = 3,
cvMetric = "Accuracy",
verbose = TRUE,
model = TRUE,
importances = TRUE,
impMeasure = "MDA",
performances = FALSE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.