inst/doc/preciseTAD.R

## ----set-options, echo=FALSE, cache=FALSE---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
options(width = 400)
knitr::opts_chunk$set(warnings = FALSE, message = FALSE)

## ---- eval = FALSE--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#  # if (!requireNamespace("BiocManager", quietly=TRUE))
#  #     install.packages("BiocManager")
#  # BiocManager::install("preciseTAD")
#  devtools::install_github("dozmorovlab/preciseTAD")
#  # Alternatively
#  # devtools::install_github("stilianoudakis/preciseTAD")

## -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
library(knitr)
library(e1071)
library(preciseTAD)

## ---- eval=FALSE----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#  arrowhead -c 1 \ #chromosome to call TADs on
#    -r 5000 \ #HiC data resolution
#    ~/GSE63525_GM12878_insitu_primary.hic \ #location of the .HIC file
#    ~/arrowhead_output #location to store the output

## -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
data("arrowhead_gm12878_5kb")
head(arrowhead_gm12878_5kb)

## -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
bounds <- extractBoundaries(domains.mat = arrowhead_gm12878_5kb, preprocess = FALSE, CHR = c("CHR1", "CHR22"), resolution = 5000)
# View unique boundaries
bounds

## -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# path <- "./path/to/BEDfiles"
# tfbsList <- bedToGRangesList(filepath=path, bedList=NULL, bedNames=NULL, pattern = "*.bed", signal=4)

data("tfbsList")
names(tfbsList)
tfbsList

## -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
set.seed(123)
tadData <- createTADdata(bounds.GR = bounds,
                         resolution = 5000,
                         genomicElements.GR = tfbsList,
                         featureType = "distance",
                         resampling = "rus",
                         trainCHR = "CHR1",
                         predictCHR = "CHR22")

# View subset of training data
tadData[[1]][1:5,1:4]
# Check it is balanced
table(tadData[[1]]$y)

# View subset of testing data
tadData[[2]][1:5,1:4]

## ----results='hide'-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
set.seed(123)
rfe_res <- TADrfe(trainData = tadData[[1]],
                 tuneParams = list(ntree = 500, nodesize = 1),
                 cvFolds = 5,
                 cvMetric = "Accuracy",
                 verbose = TRUE)

## -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# View RFE performances
rfe_res[[1]]

# View the variable importance among top n features across each CV fold
head(rfe_res[[2]])

## -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# Restrict the data matrix to include only SMC3, RAD21, CTCF, and ZNF143
tfbsList_filt <- tfbsList[names(tfbsList) %in% c("Gm12878-Ctcf-Broad", 
                                            "Gm12878-Rad21-Haib",
                                            "Gm12878-Smc3-Sydh",
                                            "Gm12878-Znf143-Sydh")]

set.seed(123)
tadData <- createTADdata(bounds.GR   = bounds,
                         resolution  = 5000,
                         genomicElements.GR = tfbsList_filt,
                         featureType = "distance",
                         resampling  = "rus",
                         trainCHR    = "CHR1",
                         predictCHR  = "CHR22")

# Run RF
set.seed(123)
tadModel <- TADrandomForest(trainData  = tadData[[1]],
                            testData   = tadData[[2]],
                            tuneParams = list(mtry     = 2,
                                              ntree    = 500,
                                              nodesize = 1),
                            cvFolds      = 3,
                            cvMetric     = "Accuracy",
                            verbose      = FALSE,
                            model        = TRUE,
                            importances  = TRUE,
                            impMeasure   = "MDA",
                            performances = TRUE)

## -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# View model performances
performances <- tadModel[[3]]
performances$Performance <- round(performances$Performance, digits = 2)
rownames(performances) <- performances$Metric
kable(t(performances), caption = "List of model performances when validating an RF built on CHR1 on CHR22 test data.")

## -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
svmModel <- svm(y ~ ., data = tadData[[1]], kernel = "radial", cost = 1, gamma = 0.5)

svmPreds <- predict(svmModel, tadData[[2]][, -1], positive = "Yes")

# View confusion matrix
table(svmPreds, tadData[[2]][, 1])

## -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# Run preciseTAD
set.seed(123)
pt <- preciseTAD(genomicElements.GR = tfbsList_filt,
                featureType         = "distance",
                CHR                 = "CHR22",
                chromCoords         = list(25500000, 27500000),
                tadModel            = tadModel[[1]],
                threshold           = 1.0,
                verbose             = FALSE,
                parallel            = NULL,
                DBSCAN_params       = list(10000, 3),
                flank               = 5000)

# What is getting returned? PTBRs - precide TAD boundary regions, and PTBPs - points
names(pt)
# View the results
pt

## ---- eval=FALSE----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#  # Transform
#  pt_juice <- juicer_func(pt[[2]])

## ---- eval=FALSE----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#  filepath = "~/path/to/store/ptbps"
#  write.table(pt_juice,
#              file.path(filepath, "pt_juice.bed"),
#              quote = FALSE,
#              col.names = FALSE,
#              row.names = FALSE,
#              sep = "\t")

## ---- eval=FALSE----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#  # Read in RF model built on chr1-chr21
#  tadModel <- readRDS("~/Downloads/CHR22_GM12878_5kb_Peakachu.rds")

## ---- eval=FALSE----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#  # Establishing Hela-specific genomic annotations
#  # Reading in CTCF, RAD21, SMC3, and ZNF143 from repository and storing as list
#  ctcf <- as.data.frame(read_delim("http://raw.githubusercontent.com/stilianoudakis/preciseTAD_supplementary/master/data/bed/hela/ctcf.bed", delim = "\t", col_names = FALSE))
#  rad21 <- as.data.frame(read_delim("http://raw.githubusercontent.com/stilianoudakis/preciseTAD_supplementary/master/data/bed/hela/rad21.bed", delim = "\t", col_names = FALSE))
#  smc3 <- as.data.frame(read_delim("http://raw.githubusercontent.com/stilianoudakis/preciseTAD_supplementary/master/data/bed/hela/smc3.bed", delim = "\t", col_names = FALSE))
#  znf143 <- as.data.frame(read_delim("http://raw.githubusercontent.com/stilianoudakis/preciseTAD_supplementary/master/data/bed/hela/znf143.bed", delim = "\t", col_names = FALSE))
#  helaList <- list(ctcf, rad21, smc3, znf143)
#  
#  hela.GR <- bedToGRangesList(bedList=helaList, bedNames=c("Ctcf", "Rad21", "Smc3", "Znf143"), signal=5)

## ---- eval=FALSE----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#  # Run preciseTAD
#  set.seed(123)
#  pt <- preciseTAD(genomicElements.GR = hela.GR,
#                  featureType         = "distance",
#                  CHR                 = "CHR22",
#                  chromCoords         = list(25500000, 27500000),
#                  tadModel            = tadModel[[1]],
#                  threshold           = 1.0,
#                  verbose             = FALSE,
#                  parallel            = NULL,
#                  DBSCAN_params       = list(10000, 3),
#                  flank               = 5000)
#  
#  pt

Try the preciseTAD package in your browser

Any scripts or data that you put into this service are public.

preciseTAD documentation built on Nov. 8, 2020, 6:51 p.m.