# Connection #
# Database connection settings (DatabaseConnector). All fields are left
# blank placeholders here — fill in real values before running.
# NOTE(review): createConnectionDetails() is called unqualified, so this
# assumes DatabaseConnector is attached (no library() call is visible in
# this script) — confirm, or qualify as DatabaseConnector::createConnectionDetails().
connectionDetails <- createConnectionDetails(
dbms = "",
server = "",
user ="",
password = "",
port = 0)
# Open the live connection used by the note-import step further down.
connection<-DatabaseConnector::connect(connectionDetails)
# Cohort #
# Identifiers for the CDM data and the cohort whose notes will be mined.
# All schema/table names are placeholders — fill in before running.
cdmDatabaseSchema <- ""
# NOTE(review): cohortDatabaseSchema is defined but never passed to
# importNotesFromCohort below — verify whether it is actually needed.
cohortDatabaseSchema <- ""
cohortTable <- ""
cohortId <- 1
# Column name used to identify rows (subjects) in the covariate output.
rowIdField<-"row_id"
# Triton CovariateSettings #
# Normalize raw note text before tokenization: lower-case everything and
# collapse newlines, digit runs, and underscore runs to single spaces.
#
# @param string Character vector of raw note text.
# @return Character vector of the same length, normalized.
custom_preprocessing <- function(string) {
  res <- tolower(string)          # everything to lower case
  # Replace real newline characters AND literal "\n" escape sequences
  # with a space. (Bug fix: the original pattern "\\\\n" is the regex
  # \\n, which only matches the two-character text "\n" and left true
  # newline characters untouched, contradicting the "remove new lines"
  # intent.)
  res <- gsub("(\n|\\\\n)", " ", res)
  res <- gsub("[0-9]+", " ", res) # replace every number with a space
  res <- gsub("_+", " ", res)     # replace underscores with a space
  res
}
# Split each input string into tokens, breaking on runs of Unicode
# separator (Z), punctuation (P), control (C), and symbol (S) characters.
#
# @param strings Character vector of preprocessed note text.
# @return A list with one character vector of tokens per input string.
custom_tokenizer <- function(strings) {
  stringi::stri_split_regex(strings, "[\\p{Z}\\p{P}\\p{C}\\p{S}]+")
}
# Triton covariate settings: build unigram text covariates from notes
# written in the window [-10, -1] days relative to cohort start, using the
# custom preprocessor and tokenizer defined above.
# NOTE(review): the frequency-threshold parameters below presumably follow
# text2vec/quanteda vocabulary-pruning semantics — confirm against the
# Triton documentation.
cs <- Triton::createTritonCovariateSettings(
  useTextData = TRUE,           # fixed: was `T`, which is reassignable
  startDay = -10,
  endDay = -1,
  preprocessor_function = custom_preprocessing,
  tokenizer_function = custom_tokenizer,
  stopwords = NULL,             # no stopword removal
  custom_pruning_regex = NULL,  # no extra token pruning
  ngrams = 1,                   # unigrams only
  term_count_min = 50,
  doc_count_min = 50,
  doc_proportion_max = 0.4,
  doc_proportion_min = 0.001,
  parallel = TRUE
)
# settings #
# Parallelization settings derived from the covariate settings above.
doPar <- cs$parallel
# Detect the core count once and reuse it (the original called
# detectCores() a second time for quanteda). detectCores() is documented
# to return NA when the count is unknown; fall back to 1 in that case so
# a valid thread count is always passed on.
parCores <- parallel::detectCores()
if (is.na(parCores)) {
  parCores <- 1
}
quanteda::quanteda_options(threads = parCores)
#### preprocess notes ####
# import notes #
# Pull note text for cohort `cohortId` via a Triton internal helper,
# restricted to the [startDay, endDay] window from the covariate settings.
# NOTE(review): cs$customWhere is read here but never set in the
# createTritonCovariateSettings() call above — presumably it defaults to
# NULL/empty; confirm. Also note cohortDatabaseSchema is not passed.
notes <- Triton:::importNotesFromCohort(connection,
cdmDatabaseSchema,
cohortTable,
cohortId,
rowIdField,
cs$startDay,
cs$endDay,
cs$customWhere)
# Preprocessing the notes #
# Presumably applies cs$preprocessor_function (custom_preprocessing above)
# to the note text, optionally in parallel over parCores workers — these
# are unexported Triton internals; verify against the package source.
notes <- Triton:::preprocessNotes(notes,cs,doPar,parCores)
# Tokenization of the notes #
# Presumably applies cs$tokenizer_function (custom_tokenizer above).
notes_tokens <- Triton:::tokenizeNotes(notes,cs,doPar,parCores)
# filter tokens
# Presumably applies the stopword / pruning settings from cs.
notes_tokens <- Triton:::filterTokens(notes_tokens,cs)
#### Word embeddings ####
# Robustness: make sure the model output directory exists before the
# training functions try to write their .rds files into it.
dir.create("output/textmodels", recursive = TRUE, showWarnings = FALSE)
## create word embeddings ##
# Train 50-dimensional GloVe vectors on the tokenized notes and save the
# model to disk (Triton internal helper).
t2v <- Triton:::trainGloVe(notes_tokens, d = 50,
                           filename = "output/textmodels/gloveModel.rds",
                           parCores = parCores, verbose = TRUE)
#### Topic Models ####
# create document-feature matrix from the tokens
notes_dfm <- Triton:::createDFM(notes_tokens)
# trim dfm — presumably applies the term/document frequency bounds from cs
notes_dfm_trimmed <- Triton:::trimDFM(notes_dfm, cs)
## create lsa topic model ##
# Fit a 10-topic LSA model on the (tf-idf weighted) trimmed dfm.
lsa <- Triton:::trainLSA(notes_dfm_trimmed, k = 10,
                        filename = "output/textmodels/lsaModel.rds",
                        tfidf = TRUE)
# Get per-document topic loadings from the fitted model
lsaPred <- as.data.frame(lsa$docs)
# predict on new data
# Bug fix: predictLSA was called unqualified, which fails unless the
# Triton package is attached (no library(Triton) appears in this script);
# qualify it like every other Triton call above.
lsaPredNew <- Triton:::predictLSA(lsa, notes_dfm_trimmed)
# End of example script. (Two lines of website-scraping footer text that
# made this file unparseable as R were removed here.)