##############################################################################################
############## Follows: model building and deployment functions ##############################
##############################################################################################
# Function : Build model
# Train a classifier of the requested type on a document-term matrix.
# 'modelType' is matched case-insensitively against "svm", "nb", "me", "dt";
# any other value emits a message and yields NULL.
buildModel <- function (modelType, dataMatrix, labels, buildParams) {
  trained <- NULL;
  kind <- tolower(modelType);
  if (kind == "svm") {
    trained <- e1071::svm(x=dataMatrix, y=labels, kernel="linear", probability=TRUE,
                          type="C-classification", scale=TRUE, tolerance=0.1, fitted=FALSE);
    #model <- gmum.r::SVM(as.matrix(dataMatrix), as.vector(labels), class.type="one.versus.one", core="libsvm", kernel=params$kernel, C=1);
  } else if (kind == "nb") {
    trained <- naiveBayes(x = dataMatrix, y=labels, buildParams, laplace=0.01);
  } else if (kind == "me") {
    trained <- maxent(dataMatrix, labels);
  } else if (kind == "dt") {
    trained <- rpart(labels ~ ., data=as.data.frame(dataMatrix), method="class",
                     parms=list(split="information"),
                     control=rpart.control(minsplit=4, cp=0.01) );
  } else {
    # default: unknown model type
    message ("Model type not recognized !");
  }
  return (trained);
}
# Deploy a ML model (#if thresH < 1 return all classes with thres>thresH, else return exactly thresH classes)
# Dispatches to the type-specific deploy* helper; unknown types emit a
# message and yield NULL.
deployModel <- function (modelType, model, dataMatrix, thresH, allPresLabels) {
  predicted <- NULL;
  kind <- tolower(modelType);
  if (kind == "svm") {
    predicted <- deploySvm (model, dataMatrix, thresH, allPresLabels);
  } else if (kind == "nb") {
    predicted <- deployNb (model, dataMatrix, thresH);
  } else if (kind == "me") {
    predicted <- deployME (model, dataMatrix, thresH);
  } else if (kind == "dt") {
    # rpart predicts on data.frames, hence the conversion
    predicted <- deployDT (model, as.data.frame(dataMatrix), thresH);
  } else {
    # default: unknown model type
    message ("Model type not recognized !");
  }
  return (predicted);
}
# Function : Deploy SVM model
# Predict class probabilities with an e1071 SVM and binarize them:
# - thresH < 1  : every class with probability >= thresH is flagged 1, others 0
# - thresH >= 1 : exactly the thresH most probable classes per document are flagged 1
# Labels from 'allPresLabels' that the model never saw are appended as all-zero
# columns so the result covers the full label set.
deploySvm <- function (model, dataMatrix, thresH, allPresLabels) {
  result <- predict(model, dataMatrix, probability=TRUE);
  res <- attr(result, "probabilities");
  # Case 1: return every label whose probability reaches the threshold
  if (thresH < 1) {
    res[res < thresH] <- 0;
    res[res >= thresH] <- 1;
  }
  # Case 2: return exactly thresH labels (1, 2, 3, .....)
  else {
    # Clamp so order(...)[...] never produces NA indices when thresH > #classes,
    # and use seq_len so an empty prediction matrix is handled safely.
    nKeep <- min(thresH, ncol(res));
    for (i in seq_len(nrow(res))) {
      top <- order(res[i, ], decreasing=TRUE)[seq_len(nKeep)];
      res[i, ] <- 0;
      res[i, top] <- 1;
    }
  }
  # Completeness: labels never predicted get all-zero columns — a MUST for SVM,
  # which only reports classes present at training time.
  newNames <- setdiff(allPresLabels, colnames(res));
  complMatrix <- matrix(0, ncol=length(newNames), nrow=nrow(res));
  rownames(complMatrix) <- rownames(res);
  colnames(complMatrix) <- newNames;
  res <- cbind(res, complMatrix);
  return (res);
}
# Function : Deploy NB model
# Predict raw class probabilities with a naive Bayes model and binarize them:
# - thresH < 1  : every class with probability >= thresH is flagged 1, others 0
# - thresH >= 1 : exactly the thresH most probable classes per document are flagged 1
deployNb <- function (model, dataMatrix, thresH) {
  res <- predict(model, dataMatrix, type="raw", laplace=0.01, eps=0.01 );
  # Case 1: return every label whose probability reaches the threshold
  if (thresH < 1) {
    res[res < thresH] <- 0;
    res[res >= thresH] <- 1;
  }
  # Case 2: return exactly thresH labels (1, 2, 3, .....)
  else {
    # Clamp so order(...)[...] never produces NA indices when thresH > #classes,
    # and use seq_len so an empty prediction matrix is handled safely.
    nKeep <- min(thresH, ncol(res));
    for (i in seq_len(nrow(res))) {
      top <- order(res[i, ], decreasing=TRUE)[seq_len(nKeep)];
      res[i, ] <- 0;
      res[i, top] <- 1;
    }
  }
  return (res);
}
# Function : Deploy MaxEnt model
# Predict per-class scores with a maxent model and binarize them:
# - thresH < 1  : every class with score >= thresH is flagged 1, others 0
# - thresH >= 1 : exactly the thresH top-scored classes per document are flagged 1
deployME <- function (model, dataMatrix, thresH) {
  res <- predict(model, dataMatrix);
  # Drop the first column (the predicted label), keeping only per-class scores.
  # drop=FALSE keeps a matrix even when a single score column remains.
  # NOTE(review): maxent may return scores as character strings — confirm the
  # numeric comparisons below behave as intended for this model type.
  res <- res[ , -1, drop=FALSE];
  # Case 1: return every label whose score reaches the threshold
  if (thresH < 1) {
    res[res < thresH] <- 0;
    res[res >= thresH] <- 1;
  }
  # Case 2: return exactly thresH labels (1, 2, 3, .....)
  else {
    # Clamp so order(...)[...] never produces NA indices when thresH > #classes,
    # and use seq_len so an empty prediction matrix is handled safely.
    nKeep <- min(thresH, ncol(res));
    for (i in seq_len(nrow(res))) {
      top <- order(res[i, ], decreasing=TRUE)[seq_len(nKeep)];
      res[i, ] <- 0;
      res[i, top] <- 1;
    }
  }
  return (res);
}
# Function : Deploy DT model
# Predict class probabilities with an rpart decision tree and binarize them:
# - thresH < 1  : every class with probability >= thresH is flagged 1, others 0
# - thresH >= 1 : exactly the thresH most probable classes per document are flagged 1
deployDT <- function (model, dataMatrix, thresH) {
  res <- predict(model, dataMatrix);
  # Case 1: return every label whose probability reaches the threshold
  if (thresH < 1) {
    res[res < thresH] <- 0;
    res[res >= thresH] <- 1;
  }
  # Case 2: return exactly thresH labels (1, 2, 3, .....)
  else {
    # Clamp so order(...)[...] never produces NA indices when thresH > #classes,
    # and use seq_len so an empty prediction matrix is handled safely.
    nKeep <- min(thresH, ncol(res));
    for (i in seq_len(nrow(res))) {
      top <- order(res[i, ], decreasing=TRUE)[seq_len(nKeep)];
      res[i, ] <- 0;
      res[i, top] <- 1;
    }
  }
  return (res);
}
#' Build Predictive Models from DataFrames
#'
#' Build a new code prediction model from data stored in dataframes. The function takes two mandatory 2-column dataframes: 'df.text' and 'df.code'. See below for details.
#'
#' @export
#' @param df.text a data frame with 2 columns: 'ID' (a unique line identifier), and 'TEXT' (free text).
#' @param df.code a data frame with 2 columns: 'ID' (a unique line identifier), and 'DIAG' (alpha-num codes).
#' @param modelType model type from \{"SVM", "NB"\} for Support Vector Machine, and Naive Bayes respectively.
#' @param multiLabMod a string argument to specify how to deal with multilabeled texts. Put "d" if you want to duplicate them (the same text is considered multiple times with one code at each), or any other value to ignore multilabeled texts.
#' @param minCodSiz minimum code size to be reached (numeric). If this argument is set to K, then the codes with less than K texts will be bootstrapped to reach this minimum.
#' @return an object of type "svm" or "nb" (depends on the specified 'modelType' argument.).
#' If you want to use the built model within 'mlcodage' Web service, you must add it to the 'data' folder of the package in the form of an RData object with extension ".model.RData', then re-compile the package.
#' @examples
#' df.text = data.frame(ID=c("T1", "T2"), TEXT=c("text numer one", "text number two"))
#' df.code = data.frame(ID=c("T1", "T2"), DIAG=c("CODE1", "CODE2"))
#' buildFromDF(df.text, df.code, modelType="SVM", multiLabMod="!d", minCodSiz=0)
buildFromDF <- function (df.text, df.code, modelType="SVM", multiLabMod="!d", minCodSiz=0) {
  # Check data format: the ID column (column 1) must be unique in both inputs.
  # (Checking rownames here was vacuous — data.frames force unique rownames —
  # while the documented contract is on the 'ID' column.)
  if ( length(unique(df.text[,1])) != length(df.text[,1]) || length(unique(df.code[,1])) != length(df.code[,1]) ) {
    return ("Error: 'df.text' and 'df.code' objects must have unique IDs !")
  }
  # NOTE(review): removed setwd("E:/Code/mlcodage") — a hard-coded developer
  # path that broke the function on any other machine; nothing below depends
  # on the working directory as far as this file shows.
  # for new data only: save the freq.-wise sorted list of codes (next time, you can restrict on N codes)
  allowedCodes <- sort(table(df.code[,2]), decreasing=TRUE);
  allowedCodes <- allowedCodes[allowedCodes > 0]; # discard non-present codes
  allowedCodes <- names(allowedCodes);
  message(c("#distinct codes = ", length(allowedCodes) ));
  # Build gold-standard matrix (ground truth): documents x codes indicator
  cat("Build gold-standard matrix..");
  goldStrd <- buildGoldStrd (df.text, df.code, allowedCodes);
  cat ("ok\n");
  gc();
  # identify 0-labeled and multi-labeled docs (label count per document)
  cat ("\nComputing stats/ID..");
  statsPerNda <- rowSums(goldStrd);
  cat ("ok\n");
  # Discard docs with #labels == 0 (if multiLabMod != "d" (duplicate), then discard multilabeled docs too)
  inds <- c();
  if (multiLabMod != "d") {
    cat ("Discarding docs. with '0' or '>1' labels");
    inds <- which(statsPerNda != 1 );
  } else {
    cat ("Discarding docs. with '0' labels, others are duplicated");
    inds <- which(statsPerNda == 0 );
  }
  if (length(inds) > 0) {
    message (paste(length(inds), "documents discarded."));
    df.text <- df.text [ -c(inds), ];
    goldStrd <- goldStrd [ -c(inds), ];
    statsPerNda <- statsPerNda[ -c(inds) ];
    message(paste(nrow(df.text), "documents kept."));
  }
  nbUniqDocs <- nrow(df.text);
  # After removing these IDs, some codes may be empty (no more docs): update stats.
  # NOTE(review): names() assumes goldStrd is a data.frame; if buildGoldStrd
  # returns a plain matrix this filter is a no-op — confirm.
  inds <- names(goldStrd)[ which(colSums(goldStrd)==0) ];
  goldStrd <- goldStrd[ , !(names(goldStrd) %in% inds) ];
  # compute stats/code (#docs per code)
  cat ("\nComputing stats/code..");
  statsPerCode <- colSums(goldStrd);
  cat ("ok\n");
  # Transform gold-standard matrix into a vector of codes (some algos need this);
  # which.max keeps the FIRST positive label of a multi-labeled document.
  truthV <- apply(goldStrd, 1, function(x) colnames(goldStrd)[which.max(x)]);
  cat ("Data parse..");
  dtmParams <- getDefaultDtmParams();
  dtm <- transform.matrix( DocumentTermMatrix(Corpus(VectorSource(df.text[,2])), control=dtmParams) , "sparse");
  cat ("ok\n");
  message(c("#tokens = ", ncol(dtm) ));
  # duplicate docs for multi-label lrn: each doc with >1 label is duplicated for each of its labels
  if (multiLabMod == "d") {
    cat ("multi-label doc. duplic...");
    newDtm <- duplicMultiLabel(dtm, goldStrd, statsPerNda);
    cat("ok\n");
    message("#new docs. created = ", length(newDtm$gsVec));
    dtm <- rbind(dtm, newDtm$mat);
    rownames(dtm) <- make.unique(rownames(dtm));
    goldStrd <- rbind(goldStrd, newDtm$gsMat);
    rownames(goldStrd) <- make.unique(rownames(goldStrd));
    truthV <- c(truthV, newDtm$gsVec);
    rm(newDtm);
  }
  # duplicate docs for rare labels: each code with '< minCodSiz' is bootstrapped by duplicating docs with same code
  if (minCodSiz > 1) {
    cat (paste0("rare-label doc. duplic. with ", minCodSiz, " docs. each..") );
    newDtm <- bootstrpRareLabel(dtm, goldStrd, truthV, statsPerCode, minCodSiz);
    cat("ok\n");
    message("#new docs. created = ", length(newDtm$gsVec));
    dtm <- rbind(dtm, newDtm$mat);
    rownames(dtm) <- make.unique(rownames(dtm));
    goldStrd <- rbind(goldStrd, newDtm$gsMat);
    rownames(goldStrd) <- make.unique(rownames(goldStrd));
    truthV <- c(truthV, newDtm$gsVec);
    statsPerCode[names(newDtm$statsBis)] <- newDtm$statsBis;
    rm(newDtm);
  }
  gc();
  nbDocsAll <- nrow(dtm);
  message("#all docs. = ", nbDocsAll);
  lrnLabels <- factor(truthV);
  message("ok");
  truthV <- factor(truthV);
  allLabels <- levels(truthV);
  nbLabelsAll <- length(allLabels);
  message("# distinct labels = ", nbLabelsAll);
  # transform matrices into the right format (sparse, dense...)
  # NOTE(review): the target format is always "nb" regardless of modelType —
  # confirm this is intended for SVM/ME/DT models too.
  lrnMtrx <- transform.matrix(dtm, "nb");
  # Build ML model
  cat ("\nModel build..");
  datB <- Sys.time();
  # NOTE(review): 'buildParams' is a global expected to be defined elsewhere
  # in the package — confirm it exists at run time.
  model <- buildModel (modelType, lrnMtrx, lrnLabels, buildParams[modelType]);
  datE <- Sys.time();
  # save dtmParams and dictionary (tokens) into model object
  dtmParams$dictionary <- colnames(dtm);
  dtmParams$stopwords <- NULL; # not necessary
  dtmParams$bounds <- NULL; # take all the words in the (future) test text
  model$dtmParams <- dtmParams;
  # Explicit difftime units: a bare 'datE-datB' picks units automatically,
  # so dividing by 60 was only correct when the units happened to be seconds.
  cat("[", round(as.numeric(difftime(datE, datB, units="secs"))/60, 2) ,"] min. ok\n");
  cat("To be used within 'mlcodage' package, the model must be saved into 'data' directory of the package. 'RData' file should contain one single object called 'model'.")
  return(model);
}
#' Generate dataframes from a set of JSON Files
#'
#' Generate 'df.text' and 'df.code' objects starting from a folder with a plenty of JSON files.
#' These objects can then be given to function 'buildFromDF' in order to build a predictive model.
#'
#' @export
#' @param jsFolder path to your local folder containing JSON files.
#' @param diagsPath path to your local RData file containing diagnostics (codes). The file must contain an object called "Diagnostics" with at least two columns ('NDA', and 'ACTE').
#' @param corpusName choose a name for your corpus. A name containing "mimic" switches loading to the PostgreSQL MIMIC database instead of JSON files.
#' @param serviceName choose a service name from the built-in services: \{'urologie', 'chirgen', 'chirplas', 'chirmaxfac'\}.
#' This argument is used to choose the right UH (Hospital Unit) as well as the right variables from the JSON.
#' If you don't want any restriction regarding UH and VAR, please set serviceName to 'gen'.
#' @return a list of two dataframes: 'texts' and 'codes' to be given separately to function 'buildFromDF'.
csvFromJSONs <- function (jsFolder, diagsPath, corpusName, serviceName) {
  # MiddleCare variables (different from a service to another)
  fields <- list();
  fields[["urologie"]] <- unique(c("VAR463", "VAR468", "VAR467", "VAR484", "VAR653")); # URO-CRO
  fields[["chirgen"]] <- unique(c("VAR91", "VAR90", "VAR539", "VAR5", "VAR6", "VAR275")); # CHIRGEN-CRO
  fields[["chirplas"]] <- unique(c("VAR8", "VAR59", "VAR60", "VAR61", "VAR62", "VAR646")); # CHIRPLAS-CRO
  fields[["chirmaxfac"]] <- unique(c("VAR7", "VAR18")); # MAXILLO-CRO
  fields[["gen"]] <- unique(unlist(fields)); # GENERIC
  fields <- fields[[serviceName]];
  if (length(fields) < 1)
    stop("Error: 'fields' is empty, please check your 'serviceName' variable !");
  # UH_EXEC of the concerned service
  uhs <- list();
  uhs[["urologie"]] <- unique(c("76819", "76176", "76173", "76175", "76191", "76192", "76505", "76536", "76834", "76129")); # URO-CRO
  uhs[["chirgen"]] <- unique(c("76824", "76267", "76153")); # CHIRGEN-CRO
  uhs[["chirplas"]] <- unique(c("76818", "76267", "76241", "76242", "76216", "76219", "76234")); # CHIRPLAS-CRO
  uhs[["chirmaxfac"]] <- unique(c("76816", "76267")); # MAXILLO-CRO
  uhs[["gen"]] <- unique(unlist(uhs)); # GENERIC
  uhs <- uhs[[serviceName]];
  if (length(uhs) < 1)
    stop("Error: 'uhs' is empty, please check your 'serviceName' variable !");
  # Load texts either: from PostgreSQL (mimic), or JSON files (all files must be in same folder)
  cat("Loading text from: ");
  if ( stringr::str_detect( tolower(corpusName), "mimic")) { # load from PostgreSQL
    cat ("PostgreSQL db");
    res <- get.ds.icd(NULL, NULL, 0, renewCon=TRUE);
    # NDA = concatenation of subject and admission ids (unique stay identifier)
    csv <- data.frame(NDA=paste0(res[["subject_id"]], res[["hadm_id"]]), TEXT=res[["text"]]);
    csv <- unique(csv[c("NDA", "TEXT")]);
  } else { # load from disk
    cat ("disk..");
    csv <- loadJsonFolder (jsFolder, fields, docType=NULL, 1);
  }
  cat ("ok\n");
  nbDocsAll <- nrow(csv);
  message (nbDocsAll, " retrieved documents.\n");
  # Load codes (labels)
  if ( stringr::str_detect( tolower(corpusName), "mimic")) { # from MIMIC data
    diags <- data.frame(NDA=paste0(res[["subject_id"]], res[["hadm_id"]]), DIAG=res[["code"]]);
    csv[["NDA"]] <- as.character(csv[["NDA"]]); # identifiers should be strings
    rm(res);
    gc();
  } else { # from a separate R object provided by Rémi : ONLY ONE LINE SHOULD BE UNCOMMENTED
    #diags <- loadDiags(diagsPath, serviceLabel, diagType, allowedCodes); # un-comment this for CRHs
    diags <- loadActes(diagsPath, uhs, NULL); # un-comment this for CROs
    # keep only NDAs that also have a text (stray trailing index comma removed)
    diags <- diags[diags$NDA %in% csv$NDA, ];
  }
  # NOTE(review): only the texts are renamed to 'ID'/'TEXT'; 'diags' keeps its
  # 'NDA'/'DIAG' columns — confirm buildFromDF accepts that shape for df.code.
  names(csv) <- c('ID', 'TEXT');
  return (list(texts=csv, codes=diags));
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.