Nothing
#' Train a classifier via synthetic observations using inverse-probability weights
#' @author Norbert Krautenbacher, Kevin Strauss, Maximilian Mandl, Christiane Fuchs
#' @description This method trains classifiers by correcting them for sample selection bias via stochastic
#' inverse-probability oversampling or parametric inverse-probability bagging (Krautenbacher et al 2017). Classifiers are trained from
#' differently resampled data whose observations are weighted by inverse-probability weights per stratum to correct for the bias in the original
#' sample. The so attained ensemble of predictors is aggregated by averaging.
#' @references Krautenbacher, N., Theis, F. J., & Fuchs, C. (2017). Correcting Classifiers for Sample Selection Bias in Two-Phase Case-Control Studies. Computational and mathematical methods in medicine, 2017.
#' @param ... see the parameter genSample() of package sambia.
#' @param learner a character indicating which classifier is used to train. Note: set learner='rangerTree'
#' if random forest should be applied as in Krautenbacher et al. (2017), i.e. the correction step is incorporated
#' in the inherent random forest resampling procedure.
#' @param list.train.learner a list of parameters specific to the classifier that will be trained. Note that the
#' parameter 'data' need not to be provided in this list since the training data which the model will learn
#' on is already attained by new sampled data produced by the method genSample().
#' @param list.predict.learner a list of parameters specifiying how to predict new data given the trained model.
#' @param n.bs number of bootstramp samples. This trained model is uniquely determined by parameters 'learner' and 'list.train.learner'.
#' @references Krautenbacher, N., Theis, F. J., & Fuchs, C. (2017). Correcting Classifiers for Sample Selection Bias in Two-Phase Case-Control Studies.
#' Computational and mathematical methods in medicine, 2017.
#' @examples
#' ## simulate data for a population
#' require(pROC)
#'
#' set.seed(1342334)
#' N = 100000
#' x1 <- rnorm(N, mean=0, sd=1)
#' x2 <- rt(N, df=25)
#' x3 <- x1 + rnorm(N, mean=0, sd=.6)
#' x4 <- x2 + rnorm(N, mean=0, sd=1.3)
#' x5 <- rbinom(N, 1, prob=.6)
#' x6 <- rnorm(N, 0, sd = 1) # noise not known as variable
#' x7 <- x1*x5 # interaction
#' x <- cbind(x1, x2, x3, x4, x5, x6, x7)
#'
#' ## stratum variable (covariate)
#' xs <- c(rep(1,0.1*N), rep(0,(1-0.1)*N))
#'
#' ## effects
#' beta <- c(-1, 0.2, 0.4, 0.4, 0.5, 0.5, 0.6)
#' beta0 <- -2
#'
#' ## generate binary outcome
#' linpred.slopes <- log(0.5)*xs + c(x %*% beta)
#' eta <- beta0 + linpred.slopes
#'
#' p <- 1/(1+exp(-eta)) # this is the probability P(Y=1|X), we want the binary outcome however:
#' y<-rbinom(n=N, size=1, prob=p) #
#'
#' population <- data.frame(y,xs,x)
#'
#' #### draw "given" data set for training
#' sel.prob <- rep(1,N)
#' sel.prob[population$xs == 1] <- 9
#' sel.prob[population$y == 1] <- 8
#' sel.prob[population$y == 1 & population$xs == 1] <- 150
#' ind <- sample(1:N, 200, prob = sel.prob)
#'
#' data = population[ind, ]
#'
#' ## calculate weights from original numbers for xs and y
#' w.matrix <- table(population$y, population$xs)/table(data$y, data$xs)
#' w <- rep(NA, nrow(data))
#' w[data$y==0 & data$xs ==0] <- w.matrix[1,1]
#' w[data$y==1 & data$xs ==0] <- w.matrix[2,1]
#' w[data$y==0 & data$xs ==1] <- w.matrix[1,2]
#' w[data$y==1 & data$xs ==1] <- w.matrix[2,2]
#'
#' ### draw a test data set
#' newdata = population[sample(1:N, size=200 ), ]
#'
#' n.bs = 10
#' ## glm
#' pred_glm <- sambia::synthIPbag(data = data, weights = w, type='parIP',
#' strata.variables = c('y', 'xs'), learner='glm',
#' list.train.learner = list(formula=formula(y~.),family="binomial"),
#' list.predict.learner = list(newdata=newdata, type="response"),
#' n.bs = n.bs)
#' roc(newdata$y, pred_glm, direction = "<")
#'
#' ## random forest
#' pred_rf <- sambia::synthIPbag(data = data, weights = w, type='parIP',
#' strata.variables = c('y','xs'), learner='rangerTree',
#' list.train.learner = list(formula=formula(as.factor(y)~.)),
#' list.predict.learner = list(data=newdata),
#' n.bs = n.bs)
#' roc(newdata$y, pred_rf, direction = "<")
#' @export
#' @import e1071 ranger pROC FNN
synthIPbag <- function(..., learner, list.train.learner, list.predict.learner, n.bs){
# get all data types of list.predict.learner
cats <- sapply(list.predict.learner,class)
# get name of data frame
#which(class.list == 'factor')
data_frame_name <- names(cats[match('data.frame',cats)])
# get newdata of list
newdata <- list.predict.learner[[data_frame_name]]
# create an empty matrix containing n.bs boostrap predictions
pred <- matrix(NA,nrow(newdata),n.bs)
for(i in 1:n.bs){
sample <- sambia::genSample(...)
sampled_data <- sample$data
list.train.learner$data <- sampled_data
if(learner == 'naiveBayes'){
fit <- do.call(learner, list.train.learner)
list.predict.learner$object=fit
pred[,i] <- do.call(predict, list.predict.learner)[,2]
}else if(learner == 'rangerTree'){
# modify list.train.learner
learner1 = 'ranger'
list.train.def.par <- c('write.forest','probability','num.trees','replace','sample.fraction')
list.train.def.val <- c(TRUE,TRUE,1,FALSE,1)
for(ind in 1:length(list.train.def.par)){
list.train.learner[[list.train.def.par[ind]]] <- list.train.def.val[ind]
}
# apply model
#list.train.learner$data[,'y'] <- as.factor(list.train.learner$data[,'y'])
fit <- do.call(learner1, list.train.learner)
list.predict.learner$object=fit
pred[,i] <- do.call(predict, list.predict.learner)$predictions[,2]
}else{
fit <- do.call(learner, list.train.learner)
list.predict.learner$object=fit
pred[,i] <- do.call(predict, list.predict.learner)
}
}
pred <- rowMeans(pred)
return(pred)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.