BigKnn: Large Scale K-Nearest Neighbor Classifier using the Lucene Search Engine

library(testthat)

context("plpInterface")

indexFolder <- tempfile("indexFolder")
indexFolder2 <- tempfile("indexFolder")

# given plpData and population
n <- 100 + sample(100, 1)
populationBuild <- data.frame(
  rowId = 1:n,
  outcomeCount = round(runif(n))
)

populationPred <- data.frame(rowId = n + 1)

# person 11 is similar as person 1/3/5

covariatesBuild <- data.frame(
  rowId = sample(populationBuild$rowId, size = 1000, replace = T),
  # covariateId = sample(10, 1000, replace = T),
  covariateId = c(
    rep(1, 100), rep(2, 100), rep(3, 100), rep(4, 100), rep(5, 100),
    rep(6, 100), rep(7, 100), rep(8, 100), rep(9, 100), rep(10, 100)
  ),
  covariateValue = rep(1, 1000)
)
covariatesBuild <- unique(covariatesBuild)
covariatesPred <- data.frame(
  rowId = rep(n + 1, 5),
  covariateId = sample(10, 5, replace = T),
  covariateValue = rep(1, 5)
)
covariatesPred <- unique(covariatesPred)

covariates <- rbind(covariatesBuild, covariatesPred)

covariateData <- Andromeda::andromeda(covariates = covariates)

plpData <- list(
  cohorts = NULL,
  outcomes = NULL,
  covariateData = covariateData
)

covariateDataBuild <- Andromeda::andromeda(covariates = covariatesBuild)

plpDataBuild <- list(
  cohorts = NULL,
  outcomes = NULL,
  covariateData = covariateDataBuild
)



# create matrix for similarity:
covariatesMat <- matrix(nrow = n + 1, ncol = 10, data = rep(0, 10 * (n + 1)))
for (i in 1:nrow(covariates)) {
  covariatesMat[covariates$rowId[i], covariates$covariateId[i]] <- 1
}



# euclidean distance
distancEuclidean <- stats::dist(covariatesMat)

# manhattan distance
distancManhattan <- stats::dist(covariatesMat, method = "manhattan")

# cosine similarity distance
cosineSim <- function(covariatesMat) {
  cos.sim <- function(ix) {
    A <- covariatesMat[ix[1], ]
    B <- covariatesMat[ix[2], ]
    return(sum(A * B) / sqrt(sum(A^2) * sum(B^2)))
  }
  cmb <- expand.grid(i = 1:nrow(covariatesMat), j = 1:nrow(covariatesMat))
  C <- matrix(apply(cmb, 1, cos.sim), nrow(covariatesMat), nrow(covariatesMat))
  distance <- abs(1 - C)
  distance[is.nan(distance)] <- 1
  return(distance)
}
distanceCosine <- cosineSim(covariatesMat)


distance <- distanceCosine
# distance <- distancManhattan

test_that("buildKnnFromPlpData works when test patient not in plpData works", {
  buildKnnFromPlpData(
    plpData = plpDataBuild, # excludes the test person
    population = populationBuild,
    indexFolder = indexFolder,
    overwrite = TRUE
  )

  expect_true(file.exists(indexFolder))
})


test_that("buildKnnFromPlpData when test patient in plpData works", {
  buildKnnFromPlpData(
    plpData = plpData, # includes the test person
    population = populationBuild,
    indexFolder = indexFolder2,
    overwrite = TRUE
  )

  expect_true(file.exists(indexFolder2))
})



test_that("buildKnnFromPlpData unweighted predictions correct", {
  # get the predicted risk for the test when k = 1
  # find k near 1 based on ties
  val <- max(as.matrix(distance)[n + 1, order(as.matrix(distance)[n + 1, 1:n])[1]])
  k <- sum(as.matrix(distance)[n + 1, 1:n] <= val)

  pred1 <- predictKnnUsingPlpData(
    plpData = plpData,
    population = populationPred,
    indexFolder = indexFolder,
    k = k,
    weighted = F,
    threads = 1
  )

  manualPred1 <- mean(populationBuild$outcomeCount[populationBuild$rowId %in% order(as.matrix(distance)[n + 1, 1:n])[1:k]])

  expect_equal(pred1$value, manualPred1)

  # find k near 10 based on ties
  val <- max(as.matrix(distance)[n + 1, order(as.matrix(distance)[n + 1, 1:n])[1:10]])
  k <- sum(as.matrix(distance)[n + 1, 1:n] <= val)

  pred10 <- predictKnnUsingPlpData(
    plpData = plpData,
    population = populationPred,
    indexFolder = indexFolder,
    k = k,
    weighted = F,
    threads = 1
  )

  # get the predicted risk for the test when k = 1
  manualPred10 <- mean(populationBuild$outcomeCount[populationBuild$rowId %in% order(as.matrix(distance)[n + 1, 1:n])[1:k]])

  expect_equal(pred10$value, manualPred10)
})

test_that("buildKnnFromPlpData - test when patient has no covariates", {
  populationNoCovs <- data.frame(rowId = n + 2)

  k <- sum(rowSums(covariatesMat) <= min(rowSums(covariatesMat)))

  pred <- predictKnnUsingPlpData(
    plpData = plpData,
    population = populationNoCovs,
    indexFolder = indexFolder,
    k = k,
    weighted = F,
    threads = 1
  )

  manualPred <- mean(populationBuild$outcomeCount[populationBuild$rowId %in% which(rowSums(covariatesMat) <= min(rowSums(covariatesMat)))])

  # expect_equal(pred$value, manualPred)
  expect_equal(pred$value, 0) # seems to be a bug where no covariates goes to 0 risk!
})

test_that("buildKnnFromPlpData - testing correct filtering", {
  # result when knn trained without test patient in plpData and pop
  predWithoutTestInData <- predictKnnUsingPlpData(
    plpData = plpData,
    population = populationPred,
    indexFolder = indexFolder,
    k = 3,
    weighted = F,
    threads = 1
  )

  # result when knn trained with test patient in plpData but not in pop
  predWithTestInData <- predictKnnUsingPlpData(
    plpData = plpData,
    population = populationPred,
    indexFolder = indexFolder2,
    k = 3,
    weighted = F,
    threads = 1
  )

  # filtering appears to work if these are the same
  expect_equal(predWithoutTestInData, predWithTestInData)
})



# Test cleanup
unlink(indexFolder)
unlink(indexFolder2)
Andromeda::close(covariateData)
Andromeda::close(covariateDataBuild)

OHDSI/BigKnn documentation built on June 1, 2024, 12:13 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

OHDSI/BigKnn
Large Scale K-Nearest Neighbor Classifier using the Lucene Search Engine

tests/testthat/test-plpInterface.R
In OHDSI/BigKnn: Large Scale K-Nearest Neighbor Classifier using the Lucene Search Engine

R Package Documentation

Browse R Packages

We want your feedback!

OHDSI/BigKnn Large Scale K-Nearest Neighbor Classifier using the Lucene Search Engine

tests/testthat/test-plpInterface.R In OHDSI/BigKnn: Large Scale K-Nearest Neighbor Classifier using the Lucene Search Engine

R Package Documentation

Browse R Packages

We want your feedback!

OHDSI/BigKnn
Large Scale K-Nearest Neighbor Classifier using the Lucene Search Engine

tests/testthat/test-plpInterface.R
In OHDSI/BigKnn: Large Scale K-Nearest Neighbor Classifier using the Lucene Search Engine