# BigKnn: Large Scale K-Nearest Neighbor Classifier using the Lucene Search Engine

# Small example: index three rows with buildKnn() and then score the same rows
# with predictKnn().
library(BigKnn)
options(andromedaTempFolder = "c:/andromedaTemp")

# Covariates in long format: one row per (rowId, covariateId) combination.
covariates <- data.frame(rowId = c(1, 1, 1, 2, 2, 3),
                         covariateId = c(10, 11, 12, 10, 11, 12),
                         covariateValue = c(1, 1, 1, 1, 1, 1))
# The rowIds handed to predictKnn() as the cohorts table.
cohorts <- data.frame(rowId = c(1, 2, 3))

# One label per rowId; only rowId 1 has y = 1.
outcomes <- data.frame(rowId = c(1, 2, 3), y = c(1, 0, 0))

indexFolder <- "c:/temp/bigKnn"
# Remove any index left over from a previous run. recursive = TRUE is required:
# without it unlink() does not delete directories, so a stale index would
# silently stay in place.
unlink(indexFolder, recursive = TRUE)

# Bundle the three tables into a single Andromeda object.
covariateData <- Andromeda::andromeda(covariates = covariates,
                                      outcomes = outcomes,
                                      cohorts = cohorts)


# Build the index in indexFolder from the outcomes and covariates tables.
buildKnn(outcomes = covariateData$outcomes,
         covariates = covariateData$covariates,
         indexFolder = indexFolder)


# Predict for the cohort rows using the index that was just built.
# NOTE(review): k = 10 exceeds the three indexed rows -- presumably fine for a
# smoke test; confirm how predictKnn() handles k larger than the index size.
prediction <- predictKnn(cohorts = covariateData$cohorts,
                         covariates = covariateData$covariates,
                         indexFolder = indexFolder,
                         k = 10,
                         weighted = TRUE)


# Large example based on PatientLevelPrediction (PLP) data: load a stored
# plpData object, split it in two parts and save each part to its own folder.
library(PatientLevelPrediction)
library(BigKnn)
indexFolder <- "s:/temp/lucene"
options(fftempdir = "s:/temp")

# Fractions handed to splitData(); the first part is saved as the training
# set and the second as the test set.
trainFraction <- 0.75
testFraction <- 0.25

plpData <- loadPlpData("S:/Temp/PlpVignette/plpData")
parts <- splitData(plpData, c(trainFraction, testFraction))

trainPart <- parts[[1]]
testPart <- parts[[2]]
savePlpData(trainPart, "s:/temp/PlpVignette/plpData_train")
savePlpData(testPart, "s:/temp/PlpVignette/plpData_test")



# Training step of the large example: load the training split, derive the
# outcomes and covariates ffdf objects, and index them with buildKnn().
# Relies on indexFolder having been defined earlier in the script.
plpData <- loadPlpData("S:/Temp/PlpVignette/plpData_train")



# Build an outcomes object for all subjects (not just those that have an outcome):
outcomes <- plpData$outcomes
# Every row present in plpData$outcomes gets y = 1.
outcomes$y <- ff::ff(1, length = nrow(plpData$outcomes), vmode = "double")
# Left join onto the cohorts (all.x = TRUE): cohort rows without a matching
# outcome row end up with y = NA.
outcomes <- merge(plpData$cohorts, outcomes, by = c("rowId"), all.x = TRUE)
# Locate the rows where y is NA ...
idx <- ffbase::is.na.ff(outcomes$y)
# NOTE(review): the expression refers to `idx` by name; presumably ffwhich()
# evaluates it non-standardly, so keep the variable name and the expression in
# sync if this is ever renamed -- confirm against the ffbase documentation.
idx <- ffbase::ffwhich(idx, idx == TRUE)
# ... and overwrite those positions with 0.
outcomes$y <- ff::ffindexset(x = outcomes$y,
                             index = idx,
                             value = ff::ff(0, length = length(idx), vmode = "double"))

covariates <- plpData$covariates
rownames(covariates) <- NULL  # Needs to be NULL or the ordering of the ffdf will fail
# Reorder the covariates by rowId.
covariates <- covariates[ff::ffdforder(covariates[c("rowId")]), ]
# Write both ffdf objects to disk and read them back.
# NOTE(review): presumably load.ffdf() restores the objects under the names
# they were saved with (covariates, outcomes) -- confirm.
ffbase::save.ffdf(covariates, dir = "s:/temp/covariates")
ffbase::save.ffdf(outcomes, dir = "s:/temp/outcomes")

ffbase::load.ffdf(dir = "s:/temp/covariates")
ffbase::load.ffdf(dir = "s:/temp/outcomes")

# Build the index. Both checks are switched off; presumably safe because the
# covariates were sorted by rowId above -- TODO confirm what checkRowIds covers.
buildKnn(outcomes = outcomes,
         covariates = covariates,
         indexFolder = indexFolder,
         checkSorting = FALSE,
         checkRowIds = FALSE)


# Test step of the large example: load the test split, prepare outcomes and
# covariates the same way as for training, and predict against the index in
# indexFolder (which must have been defined and populated earlier).
plpData <- loadPlpData("S:/Temp/PlpVignette/plpData_test")

# Build an outcomes object for all subjects (not just those that have an outcome):
outcomes <- plpData$outcomes
# Every row present in plpData$outcomes gets y = 1.
outcomes$y <- ff::ff(1, length = nrow(plpData$outcomes), vmode = "double")
# Left join onto the cohorts (all.x = TRUE): cohort rows without a matching
# outcome row end up with y = NA.
outcomes <- merge(plpData$cohorts, outcomes, by = c("rowId"), all.x = TRUE)
# Locate the rows where y is NA ...
idx <- ffbase::is.na.ff(outcomes$y)
# NOTE(review): the expression refers to `idx` by name; presumably ffwhich()
# evaluates it non-standardly, so keep the variable name and the expression in
# sync if this is ever renamed -- confirm against the ffbase documentation.
idx <- ffbase::ffwhich(idx, idx == TRUE)
# ... and overwrite those positions with 0.
outcomes$y <- ff::ffindexset(x = outcomes$y,
                             index = idx,
                             value = ff::ff(0, length = length(idx), vmode = "double"))

covariates <- plpData$covariates
rownames(covariates) <- NULL  # Needs to be NULL or the ordering of the ffdf will fail
# Reorder the covariates by rowId.
covariates <- covariates[ff::ffdforder(covariates[c("rowId")]), ]

# Write both ffdf objects to disk and read them back.
# NOTE(review): presumably load.ffdf() restores the objects under the names
# they were saved with (covariates, outcomes) -- confirm.
ffbase::save.ffdf(covariates, dir = "s:/temp/covariates2")
ffbase::save.ffdf(outcomes, dir = "s:/temp/outcomes2")

ffbase::load.ffdf(dir = "s:/temp/covariates2")
ffbase::load.ffdf(dir = "s:/temp/outcomes2")

# Predict for the test covariates. The outcomes object built above is not
# passed to this call.
# NOTE(review): unlike the predictKnn() call in the small example, no cohorts
# argument is supplied here -- confirm this matches the installed BigKnn API.
prediction <- predictKnn(covariates = covariates,
                         indexFolder = indexFolder,
                         k = 1000,
                         weighted = TRUE,
                         checkSorting = FALSE)


# Example using the plpData interface: build the index straight from the
# training plpData object, then predict for and evaluate on the test plpData.
library(PatientLevelPrediction)
library(BigKnn)
indexFolder <- "s:/temp/lucene"
options(fftempdir = "s:/temp")

# Index the training data.
trainFolder <- "S:/Temp/PlpVignette/plpData_train"
plpData <- loadPlpData(trainFolder)
buildKnnFromPlpData(indexFolder = indexFolder, plpData = plpData)

# From here on plpData refers to the test data.
testFolder <- "S:/Temp/PlpVignette/plpData_test"
plpData <- loadPlpData(testFolder)

# plpData is the only unnamed argument, so it is matched positionally to the
# first formal not taken by the named arguments.
prediction <- predictKnnUsingPlpData(plpData,
                                     indexFolder = indexFolder,
                                     k = 1000,
                                     weighted = TRUE,
                                     threads = 10)

# Tag the predictions with a model type before evaluating them.
attr(prediction, "modelType") <- "logistic"
computeAuc(prediction, plpData)
plotCalibration(prediction, plpData)

# OHDSI/BigKnn documentation built on June 1, 2024, 12:13 p.m.
#
# rdrr.io home R language documentation Run R code online
#
# CRAN packages Bioconductor packages R-Forge packages GitHub packages
#
# Note that we can't provide technical support on individual packages. You should contact the package authors for that.
#
# OHDSI/BigKnn
# Large Scale K-Nearest Neighbor Classifier using the Lucene Search Engine
#
# extras/TestScripts.R
# In OHDSI/BigKnn: Large Scale K-Nearest Neighbor Classifier using the Lucene Search Engine
#
# R Package Documentation
#
# Browse R Packages
#
# We want your feedback!
#
# OHDSI/BigKnn Large Scale K-Nearest Neighbor Classifier using the Lucene Search Engine
#
# extras/TestScripts.R In OHDSI/BigKnn: Large Scale K-Nearest Neighbor Classifier using the Lucene Search Engine
#
# R Package Documentation
#
# Browse R Packages
#
# We want your feedback!
#
# OHDSI/BigKnn
# Large Scale K-Nearest Neighbor Classifier using the Lucene Search Engine
#
# extras/TestScripts.R
# In OHDSI/BigKnn: Large Scale K-Nearest Neighbor Classifier using the Lucene Search Engine