library(testthat)
context("plpInterface")
indexFolder <- tempfile("indexFolder")
indexFolder2 <- tempfile("indexFolder")
# given plpData and population
n <- 100 + sample(100, 1)
populationBuild <- data.frame(
rowId = 1:n,
outcomeCount = round(runif(n))
)
populationPred <- data.frame(rowId = n + 1)
# person 11 is similar as person 1/3/5
covariatesBuild <- data.frame(
rowId = sample(populationBuild$rowId, size = 1000, replace = T),
# covariateId = sample(10, 1000, replace = T),
covariateId = c(
rep(1, 100), rep(2, 100), rep(3, 100), rep(4, 100), rep(5, 100),
rep(6, 100), rep(7, 100), rep(8, 100), rep(9, 100), rep(10, 100)
),
covariateValue = rep(1, 1000)
)
covariatesBuild <- unique(covariatesBuild)
covariatesPred <- data.frame(
rowId = rep(n + 1, 5),
covariateId = sample(10, 5, replace = T),
covariateValue = rep(1, 5)
)
covariatesPred <- unique(covariatesPred)
covariates <- rbind(covariatesBuild, covariatesPred)
covariateData <- Andromeda::andromeda(covariates = covariates)
plpData <- list(
cohorts = NULL,
outcomes = NULL,
covariateData = covariateData
)
covariateDataBuild <- Andromeda::andromeda(covariates = covariatesBuild)
plpDataBuild <- list(
cohorts = NULL,
outcomes = NULL,
covariateData = covariateDataBuild
)
# create matrix for similarity:
covariatesMat <- matrix(nrow = n + 1, ncol = 10, data = rep(0, 10 * (n + 1)))
for (i in 1:nrow(covariates)) {
covariatesMat[covariates$rowId[i], covariates$covariateId[i]] <- 1
}
# euclidean distance
distancEuclidean <- stats::dist(covariatesMat)
# manhattan distance
distancManhattan <- stats::dist(covariatesMat, method = "manhattan")
# cosine similarity distance
cosineSim <- function(covariatesMat) {
cos.sim <- function(ix) {
A <- covariatesMat[ix[1], ]
B <- covariatesMat[ix[2], ]
return(sum(A * B) / sqrt(sum(A^2) * sum(B^2)))
}
cmb <- expand.grid(i = 1:nrow(covariatesMat), j = 1:nrow(covariatesMat))
C <- matrix(apply(cmb, 1, cos.sim), nrow(covariatesMat), nrow(covariatesMat))
distance <- abs(1 - C)
distance[is.nan(distance)] <- 1
return(distance)
}
distanceCosine <- cosineSim(covariatesMat)
distance <- distanceCosine
# distance <- distancManhattan
test_that("buildKnnFromPlpData works when test patient not in plpData works", {
buildKnnFromPlpData(
plpData = plpDataBuild, # excludes the test person
population = populationBuild,
indexFolder = indexFolder,
overwrite = TRUE
)
expect_true(file.exists(indexFolder))
})
test_that("buildKnnFromPlpData when test patient in plpData works", {
buildKnnFromPlpData(
plpData = plpData, # includes the test person
population = populationBuild,
indexFolder = indexFolder2,
overwrite = TRUE
)
expect_true(file.exists(indexFolder2))
})
test_that("buildKnnFromPlpData unweighted predictions correct", {
# get the predicted risk for the test when k = 1
# find k near 1 based on ties
val <- max(as.matrix(distance)[n + 1, order(as.matrix(distance)[n + 1, 1:n])[1]])
k <- sum(as.matrix(distance)[n + 1, 1:n] <= val)
pred1 <- predictKnnUsingPlpData(
plpData = plpData,
population = populationPred,
indexFolder = indexFolder,
k = k,
weighted = F,
threads = 1
)
manualPred1 <- mean(populationBuild$outcomeCount[populationBuild$rowId %in% order(as.matrix(distance)[n + 1, 1:n])[1:k]])
expect_equal(pred1$value, manualPred1)
# find k near 10 based on ties
val <- max(as.matrix(distance)[n + 1, order(as.matrix(distance)[n + 1, 1:n])[1:10]])
k <- sum(as.matrix(distance)[n + 1, 1:n] <= val)
pred10 <- predictKnnUsingPlpData(
plpData = plpData,
population = populationPred,
indexFolder = indexFolder,
k = k,
weighted = F,
threads = 1
)
# get the predicted risk for the test when k = 1
manualPred10 <- mean(populationBuild$outcomeCount[populationBuild$rowId %in% order(as.matrix(distance)[n + 1, 1:n])[1:k]])
expect_equal(pred10$value, manualPred10)
})
test_that("buildKnnFromPlpData - test when patient has no covariates", {
populationNoCovs <- data.frame(rowId = n + 2)
k <- sum(rowSums(covariatesMat) <= min(rowSums(covariatesMat)))
pred <- predictKnnUsingPlpData(
plpData = plpData,
population = populationNoCovs,
indexFolder = indexFolder,
k = k,
weighted = F,
threads = 1
)
manualPred <- mean(populationBuild$outcomeCount[populationBuild$rowId %in% which(rowSums(covariatesMat) <= min(rowSums(covariatesMat)))])
# expect_equal(pred$value, manualPred)
expect_equal(pred$value, 0) # seems to be a bug where no covariates goes to 0 risk!
})
test_that("buildKnnFromPlpData - testing correct filtering", {
# result when knn trained without test patient in plpData and pop
predWithoutTestInData <- predictKnnUsingPlpData(
plpData = plpData,
population = populationPred,
indexFolder = indexFolder,
k = 3,
weighted = F,
threads = 1
)
# result when knn trained with test patient in plpData but not in pop
predWithTestInData <- predictKnnUsingPlpData(
plpData = plpData,
population = populationPred,
indexFolder = indexFolder2,
k = 3,
weighted = F,
threads = 1
)
# filtering appears to work if these are the same
expect_equal(predWithoutTestInData, predWithTestInData)
})
# Test cleanup
unlink(indexFolder)
unlink(indexFolder2)
Andromeda::close(covariateData)
Andromeda::close(covariateDataBuild)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.