npbin:

title: "NPBin" author: "Anthony Aylward" date: "2018-08-03" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Vignette Title} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8}

We will analyze a subset of one of the sample dataset for illustration purposes.

library(parallel)
library(npbin)
library(data.table)

minimum_coverage <- 5 # minimum total coverage allowed
n_cores <- detectCores() # the number of cores to be used, can ONLY be 1 if run on Windows.

dt <- ctcf
colnames(dt)
#>  [1] "chr"             "location"        "m"              
#>  [4] "xm"              "winning.chip"    "motif"          
#>  [7] "pval.mat.atSNP"  "pval.pat.atSNP"  "pval.rank.atSNP"
#> [10] "winnig.motif"    "potential_TP"    "potential_FP"

dt.ct <- data.table(dt)[m >= minimum_coverage, ]

for illustration purpose, keep only the 2000 points, remove this line will end up with analyzing the whole data set. It could be slow if only one core is used.

dt.ct <- dt.ct[1:2000, ]
dt.ct[, p_hat:=xm / m]
n <- nrow(dt.ct)

NPBin

n_breaks <- 11 # number of breaks
spline_order <- 4 # order of splines
breaks <- seq(0, 1, length.out = n_breaks)
pi_init <- initialize_weights(
  dt.ct,
  n_breaks = n_breaks,
  spline_order = spline_order,
  plot = TRUE
) # initialized the weights using the histogram of p_hat

plot of chunk shape_ctcf_initialize_weights

estimate the overall model

overall_model_estimate <- emBinBspl(
  dt.ct[, xm],
  dt.ct[, m],
  breaks = breaks,
  k = spline_order,
  pi.init = pi_init,
  ncores = n_cores,
  err.max = 1e-3,
  iter.max = 200
)
names(overall_model_estimate)
#> [1] "pi"          "post"        "bspl"        "dmtx"        "f"          
#> [6] "ll.all"      "err.all"     "convergence" "controls"

estimate the null model

null_model_estimate <- estNull(
  dt.ct[, xm],
  dt.ct[, m],
  overall_model_estimate,
  init = NULL,
  iter.max = 200,
  ncores = n_cores,
  ub = rep(log(1e4), 2),
  err.max = 1e-4
)
names(null_model_estimate)
#>  [1] "pi"               "post"             "bspl"            
#>  [4] "dmtx"             "f"                "ll.all"          
#>  [7] "err.all"          "convergence"      "controls"        
#> [10] "coef.null"        "pi0"              "f0"              
#> [13] "locfdr"           "convergence.null"

null_model_estimate[["coef.null"]]
#> $shape1
#> [1] 37.53662
#> 
#> $shape2
#> [1] 37.30897
#> 
#> $pi0
#> [1] 0.8983954

Calculate the overdispersion/correlation parameter

1 / sum(
  null_model_estimate[["coef.null"]][["shape1"]],
  null_model_estimate[["coef.null"]][["shape2"]],
  1
)
#> [1] 0.01318468

Plot the estimated null distribution

plot_estimated_null(
  dt.ct[, p_hat],
  shape1_shape2 = c(
    null_model_estimate[["coef.null"]][["shape1"]],
    null_model_estimate[["coef.null"]][["shape2"]]
  )
)

plot of chunk shape_ctcf_plot_null