# Unit tests
library(COCOA)
library(data.table)
library(GenomicRanges)
context("Unit tests for COCOA.")
# generate synthetic data
# coordinateDT: positions of cytosines
chr1 <- seq(from=1, to = 4000, by = 200)
chr2 <- seq(from=4001, to = 6400, by = 200)
start <- c(chr1, chr2)
coordinateDT <- data.table(chr=c(rep("chr1", length(chr1)),
rep("chr2", length(chr2))),
start = start)
# loading values
loadingMat <- matrix(data = rep(1, (length(start) * 2)), ncol = 2)
colnames(loadingMat) <- c("PC1", "PC3")
dataDT <- data.table(coordinateDT, loadingMat)
# arbitrarily make some != 1
dataDT$PC1[3] <- 0
dataDT$PC3[5] <- 2
# fake region data (for testing ATAC-seq)
regionCoordDT <- data.table(chr=c(rep("chr1",4), rep("chr2", 2)),
start=c(300, 650, 3000, 10000, 5075, 6150),
end=c(600, 1000, 4000, 11000, 5125, 6300))
regionDataDT <- data.table(PC1=c(1:6), PC2=-1:4)
# region sets that will be tested
regionSet1 <- data.table(chr = c("chr1", "chr1", "chr1", "chr2", "chr2"),
start = c(1, 500, 3100, 5100, 6000),
end = c(400, 700, 3400, 5150, 6450))
regionSet1 <- MIRA:::dtToGr(regionSet1)
regionSet2 <- data.table(chr = c("chr1", "chr1", "chr1", "chr2", "chr2"),
start = c(1000, 1500, 3700, 4000, 4300),
end = c(1400, 1700, 3800, 4100, 4700))
regionSet2 <- MIRA:::dtToGr(regionSet2)
# don't use built in package data for tests because it will take longer
# running the tests
# aggregateSignal(signal = loadingMat,
# coordinateDT = coordinates,
# regionSet = regionSet)
# signalOLMetrics
dataDT <- cbind(coordinateDT, as.data.frame(loadingMat))
# making test data for Wilcoxon rank sum test
chr3 <- seq(from=100, to = 700, by = 100)
coordinateDTW <- data.table(chr=rep("chr3", length(chr3)),
start = chr3, end = chr3)
regionSetW <- data.table(chr = c("chr3", "chr3"),
start = c(50, 250),
end = c(150, 450))
regionSetW <- COCOA:::dtToGr(regionSetW)
loadingMatW <- matrix(data = c(-2:4, seq(from=8, to=2, by=-1)), ncol = 2)
colnames(loadingMatW) <- c("PC2", "PC3")
loadingMatW[7, "PC3"] <- 10
dataDTW <- cbind(coordinateDTW, as.data.frame(loadingMatW))
test_that("aggregateSignal, scoring metrics, and aggregateSignalGRList", {
# # test wilcoxon rank sum scoring metric
rsWResults <- COCOA:::rsWilcox(dataDT = dataDTW, regionSet = regionSetW)
PC2W <- wilcox.test(x = c(-2, 0, 1), y=c(-1, 2:4))$p.value
PC3W <- wilcox.test(x = c(8, 6, 5), y=c(7, 4, 3, 10))$p.value
expect_equal(c(PC2W, PC3W, 3, 2, 2, mean(width(regionSetW))),
c(rsWResults$PC2, rsWResults$PC3, rsWResults$signalCoverage,
rsWResults$regionSetCoverage,
rsWResults$totalRegionNumber,
rsWResults$meanRegionSize))
# # same test for Wilcoxon but with aggregateSignal (absolute value
# # of loadings will be taken), "greater" alternate hypothesis is used in
# # aggregateSignal
# rsWResults <- COCOA:::aggregateSignal(signal = loadingMatW,
# signalCoord = coordinateDTW,
# regionSet = regionSetW,
# signalCol = c("PC2", "PC3"),
# scoringMetric = "rankSum")
# PC2W <- wilcox.test(x = c(2, 0, 1), y=c(1, 2:4), alternative = "greater")$p.value
# PC3W <- wilcox.test(x = c(8, 6, 5), y=c(7, 4, 3, 10), alternative = "greater")$p.value
# expect_equal(c(PC2W, PC3W, 3, 2, 2, mean(width(regionSetW))),
# c(rsWResults$PC2, rsWResults$PC3, rsWResults$signalCoverage,
# rsWResults$regionSetCoverage,
# rsWResults$totalRegionNumber,
# rsWResults$meanRegionSize))
# test regionMean scoring method, average first within region, then
# between regions
regionMeanRes <- COCOA:::aggregateSignal(signal = loadingMatW,
signalCoord = coordinateDTW,
regionSet = regionSetW,
signalCol = c("PC2", "PC3"),
scoringMetric = "regionMean")
PC2R <- mean(c(2, mean(c(0, 1))))
PC3R <- mean(c(8, mean(c(6, 5))))
expect_equal(c(PC2R, PC3R, 3, 2, 2, mean(width(regionSetW))),
c(regionMeanRes$PC2, regionMeanRes$PC3, regionMeanRes$signalCoverage,
regionMeanRes$regionSetCoverage,
regionMeanRes$totalRegionNumber,
regionMeanRes$meanRegionSize))
########### test simpleMean scoring method ################
# this is a mean of CpG loading values just like "raw" but instead
# of averaging within regions then averaging regions together, this
# method does the simple average of all CpGs within the region set
simpleMeanRes <- COCOA:::aggregateSignal(signal = loadingMatW,
signalCoord = coordinateDTW,
regionSet = regionSetW,
signalCol = c("PC2", "PC3"),
scoringMetric = "simpleMean")
PC2RC <- mean(c(2, 0, 1))
PC3RC <- mean(c(8, 6, 5))
expect_equal(c(PC2RC, PC3RC, 3, 2, 2, mean(width(regionSetW))),
c(simpleMeanRes$PC2, simpleMeanRes$PC3, simpleMeanRes$signalCoverage,
simpleMeanRes$regionSetCoverage,
simpleMeanRes$totalRegionNumber,
simpleMeanRes$meanRegionSize))
###########################################################
# test simpleMedian scoring method
simpleMedRes <- COCOA:::aggregateSignal(signal = loadingMatW,
signalCoord = coordinateDTW,
regionSet = regionSetW,
signalCol = c("PC2", "PC3"),
scoringMetric = "simpleMedian")
PC2RC <- median(c(2, 0, 1))
PC3RC <- median(c(8, 6, 5))
expect_equal(c(PC2RC, PC3RC, 3, 2, 2, mean(width(regionSetW))),
c(simpleMedRes$PC2, simpleMedRes$PC3, simpleMedRes$signalCoverage,
simpleMedRes$regionSetCoverage,
simpleMedRes$totalRegionNumber,
simpleMedRes$meanRegionSize))
##########################################################
# test regionMedian scoring method
# median first within region, then
# between regions
tmpCoordDTW = data.frame(chr=c(coordinateDTW$chr, "chr3"),
start = c(coordinateDTW$start, 350),
end = c(coordinateDTW$end, 350))
tmpLoadMatW = data.frame(PC2= c(loadingMatW[, "PC2"], 20),
PC3=c(loadingMatW[, "PC3"], 85))
regionMedRes <- COCOA:::aggregateSignal(signal = tmpLoadMatW,
signalCoord = tmpCoordDTW,
regionSet = regionSetW,
signalCol = c("PC2", "PC3"),
scoringMetric = "regionMedian")
PC2R <- median(c(2, median(c(0, 1, 20))))
PC3R <- median(c(8, median(c(6, 5, 85))))
expect_equal(c(PC2R, PC3R, 4, 2, 2, mean(width(regionSetW))),
c(regionMedRes$PC2, regionMedRes$PC3, regionMedRes$signalCoverage,
regionMedRes$regionSetCoverage,
regionMedRes$totalRegionNumber,
regionMedRes$meanRegionSize))
#########################################################
# # test mean difference scoring method
# PC2Num <- mean(c(2, 0, 1)) - mean(c(1, 2, 3, 4))
# PC3Num <- mean(c(8, 6, 5)) - mean(c(7, 4, 3, 10))
# # see doi: 10.1186/s13040-015-0059-z
# # pooled variance times normalization factor for size of each set
# PC2Denom <- sqrt((sd(c(2, 0, 1))^2 + sd(c(1, 2, 3, 4))^2) / 2) * sqrt(1/3 - 1/4)
# PC3Denom <- sqrt((sd(c(8, 6, 5))^2 + sd(c(7, 4, 3, 10))^2) / 2) * sqrt(1/3 - 1/4)
#
# PC2MD <- PC2Num / PC2Denom
# PC3MD <- PC3Num / PC3Denom
# mdRes <- COCOA:::aggregateSignal(signal = loadingMatW,
# signalCoord = coordinateDTW,
# regionSet = regionSetW,
# signalCol = c("PC2", "PC3"),
# scoringMetric = "meanDiff")
# expect_equal(c(PC2MD, PC3MD, 3, 2, 2, mean(width(regionSetW))),
# c(mdRes$PC2, mdRes$PC3, mdRes$signalCoverage,
# mdRes$regionSetCoverage,
# mdRes$totalRegionNumber,
# mdRes$meanRegionSize))
################## test aggregateSignalGRList with meanDiff test data
coordinateDTW2 <- copy(coordinateDTW)
coordinateDTW2$end <- coordinateDTW$end
# make sure it works if there is metadata in GRanges object
# extraCol <- rep(0, nrow(coordinateDTW2))
# signalCoordW2 <- COCOA:::dtToGr(coordinateDTW2)
# mcols(signalCoordW2) <- data.frame(extraCol)
# twoResults <- aggregateSignalGRList(signal = loadingMatW,
# signalCoord = signalCoordW2,
# GRList = GRangesList(regionSetW, regionSetW),
# signalCol = c("PC2", "PC3"),
# scoringMetric = "meanDiff")
# expect_equal(rep(c(PC2MD, PC3MD, 3, 2, 2, mean(width(regionSetW))), each=2),
# c(twoResults$PC2, twoResults$PC3, twoResults$signalCoverage,
# twoResults$regionSetCoverage,
# twoResults$totalRegionNumber,
# twoResults$meanRegionSize))
########## testing signalOLMetrics, used for meanDiff scoring method #######
olMetrics <- COCOA:::signalOLMetrics(dataDT=dataDTW,
dataGR = dtToGr(dataDTW),
regionSet = regionSetW,
metrics = c("mean", "sd"),
alsoNonOLMet = TRUE)
PC2Met <- data.table(t(c(mean(c(-2, 0, 1)), sd(c(-2, 0, 1)),
mean(c(-1, 2:4)), sd(c(-1, 2:4)), 3, 2, 2, 151)))
PC2Met <- cbind("PC2", PC2Met)
colnames(PC2Met) <- colnames(olMetrics)
expect_equal(olMetrics[1, ], PC2Met)
PC3Met <- data.table(t(c(mean(c(8, 6, 5)), sd(c(8, 6, 5)),
mean(c(7, 4, 3, 10)), sd(c(7, 4, 3, 10)), 3, 2, 2, 151)))
PC3Met <- cbind("PC3", PC3Met)
colnames(PC3Met) <- colnames(olMetrics)
expect_equal(olMetrics[2, ], PC3Met)
})
test_that("ATAC-seq scoring methods", {
# test "regionOLWeightedMean"
weightedAve <- regionOLWeightedMean(signalMat = regionDataDT, signalGR = COCOA:::dtToGr(regionCoordDT),
regionSet = regionSet1, calcCols = c("PC1", "PC2"))
# proportion overlap is first then PC
correctAve <- data.frame(PC1=((101/400)*1+(101/201)*1+(51/201)*2+1*3+(26/51)*5+(151/451)*6) /
((101/400)+(101/201)+(51/201)+1+(26/51)+(151/451)),
PC2=((101/400)*-1+(101/201)*-1+(51/201)*0+1*1+(26/51)*3+(151/451)*4) /
((101/400)+(101/201)+(51/201)+1+(26/51)+(151/451)),
signalCoverage=5, regionSetCoverage=5,
sumProportionOverlap=(101/400)+(101/201)+(51/201)+1+(26/51)+(151/451))
expect_equal(weightedAve, correctAve)
# test "regionOLMean"
signalAve <- regionOLMean(signalDT = regionDataDT,
signalGR = COCOA:::dtToGr(regionCoordDT),
regionSet = regionSet1,
calcCols = c("PC1", "PC2"))
# proportion overlap is first then PC
correctAve <- data.frame(PC1=(1+1+2+3+5+6)/6,
PC2=(-1+-1+0+1+3+4)/6,
signalCoverage=5,
regionSetCoverage=5)
expect_equal(signalAve, correctAve)
# test "weightedAvePerRegion"
avePerRegion <- weightedAvePerRegion(signalDT= regionDataDT,
signalCoord=COCOA:::dtToGr(regionCoordDT),
regionSet=regionSet1,
calcCols = c("PC1", "PC2"))
correctAve <- data.table(PC1 = c(1*1, (101/201*1 + 51/201*2)/ (101/201 + 51/201), 1*3, 1*5, 1*6),
PC2 = c(1*-1, (101/201*-1 + 51/201*0)/ (101/201 + 51/201), 1*1, 1*3, 1*4))
expect_equal(avePerRegion$PC1, correctAve$PC1)
expect_equal(avePerRegion$PC2, correctAve$PC2)
})
test_that("averagePerRegion", {
loadingMatABR <- loadingMat
loadingMatABR[1, "PC1"] <- 3
loadingMatABR[32, "PC1"] <- 2
abr <- COCOA:::averagePerRegion(signal = loadingMatABR, signalCoord = COCOA:::dtToGr(coordinateDT),
regionSet = regionSet1, signalCol = "PC1",
returnQuantile = FALSE)
expect_equal(abr$PC1, c(2, 1, 1, 1.5))
# test quantile
abrq <- COCOA:::averagePerRegion(signal = loadingMatABR, signalCoord = COCOA:::dtToGr(coordinateDT),
regionSet = regionSet1, signalCol = "PC1",
returnQuantile = TRUE)
# the mean values, abr$PC1, converted to quantiles
# converting properly to quantiles?
expect_equal(abrq$PC1, ecdf(loadingMatABR[, "PC1"])(abr$PC1))
})
test_that("getMetaRegionProfile", {
loadingMatP <- matrix(data = c(1:8, 10, 10, rep(1, 20)), nrow = 30)
colnames(loadingMatP) <- "PC1"
# two CpGs per bin of regionSetP (with 5 bins)
.start <- c(seq(from=50, to=950, by = 100),
seq(from=2050, to=2950, by = 100),
seq(from=4050, to=4950, by = 100))
coordinateDTP <- data.frame(chr = rep("chr3", nrow(loadingMatP)),
start = .start,
end = .start,
extraCol = rep(1, length(.start)))
regionSetP <- data.table(chr = rep("chr3", 3),
start = c(1, 2001, 4001),
end = c(1010, 3010, 5010))
regionSetP <- COCOA:::dtToGr(regionSetP)
# COCOA:::dtToGr(coordinateDTP)
binnedP <- lapply(GRangesList(regionSetP),
function(x) getMetaRegionProfile(signal = loadingMatP,
signalCoord = coordinateDTP,
regionSet = x,
signalCol = "PC1",
binNum = 5))
meanPerBin <- (c(seq(from=1.5, to=7.5, by=2), 10) + rep(1, 5) + rep(1, 5)) / 3
# BSBinAggregate averages the profile around the center
symmetricalBin <- (meanPerBin + rev(meanPerBin)) / 2
expect_equal(binnedP[[1]]$PC1, symmetricalBin)
# making sure that different input formats still work
#########
alterOut <- lapply(GRangesList(regionSetP),
function(x) getMetaRegionProfile(signal = data.frame(loadingMatP),
signalCoord = COCOA:::dtToGr(coordinateDTP),
regionSet = x,
signalCol = "PC1",
binNum = 5))
expect_equal(binnedP, alterOut)
##########################################################
# # test multiBase data like ATAC-seq
# coordinateDTP <- data.frame(chr = rep("chr3", nrow(loadingMatP)),
# start = .start,
# end = .start+1,
# extraCol = rep(1, length(.start)))
# # make 1 region overlap multiple bins ( % first bin, % second bin)
# # 150 to
# coordinateDTP$end[2] = 225
# meanPerBin <- (c(seq(from=1.5, to=7.5, by=2), 10) * (x, x, 1, 1, 1) +
# rep(1, 5) + rep(1, 5)) / 3
#
})
################### test function inputs
# make sure various input formats work
#### take out these high level tests in order to save time
test_that("Input formats", {
normalOut <- aggregateSignalGRList(signal = loadingMatW,
signalCoord = coordinateDTW,
GRList = GRangesList(regionSetW),
signalCol = "PC2",
scoringMetric = "regionMean")
alterOut <- aggregateSignalGRList(signal = data.frame(loadingMatW),
signalCoord = COCOA:::dtToGr(coordinateDTW),
GRList = regionSetW,
signalCol = "PC2",
scoringMetric = "regionMean")
expect_equal(normalOut, alterOut)
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.