missing_percentage_cross_subjects = function(dat, var_name){
sub = unique(dat$subject_id)[1]
var_df = dat[c('subject_id',var_name)]
var_df$is_miss = ifelse(is.na(dat[var_name]),1,0)
return(var_df %>% group_by(subject_id) %>% summarize(miss_perc = mean(is_miss)))
}
label_na = function(dat, var_name, miss_df, n_round, seed, priority_miss_perc){
rownames(dat) = 1:nrow(dat)
dat = dat[!is.na(dat[var_name]),]
label_lists = vector("list", n_round)
set.seed(seed)
seeds_list = sample(1:1000, n_round, replace=FALSE)
if(is.na(priority_miss_perc)){
for(n in 1:n_round){
for(i in 1:length(unique(dat$subject_id))){
num_obs = nrow(dat %>% filter(subject_id == unique(dat$subject_id)[i]))
num_na = round(num_obs * miss_df[miss_df$subject_id == unique(dat$subject_id)[i],]$miss_perc,0)
set.seed(seeds_list[n])
label_lists[[n]] = append(label_lists[[n]], sample(rownames(dat[dat$subject_id == unique(dat$subject_id)[i],]), num_na, replace=FALSE))
}
}
}else{
for(n in 1:n_round){
for(i in 1:length(unique(dat$subject_id))){
num_obs = nrow(dat %>% filter(subject_id == unique(dat$subject_id)[i]))
num_na = round(num_obs * priority_miss_perc,0)
set.seed(seeds_list[n])
label_lists[[n]] = append(label_lists[[n]], sample(rownames(dat[dat$subject_id == unique(dat$subject_id)[i],]), num_na, replace=FALSE))
}
}
}
return(label_lists)
}
my_missForest = function(dat, var_name, seed){
dat = dat %>% select(-subject_id, -date)
dat$homework = as.factor(dat$homework)
dat$gender = as.factor(dat$gender)
dat$marital_status = as.factor(dat$marital_status)
dat$ethnicity = as.factor(dat$ethnicity)
dat$race = as.factor(dat$race)
dat$religion = as.factor(dat$religion)
dat$finres_1 = as.factor(dat$finres_1)
dat$finres_2 = as.factor(dat$finres_2)
dat$finres_3 = as.factor(dat$finres_3)
dat$finres_4 = as.factor(dat$finres_4)
dat$finres_5 = as.factor(dat$finres_5)
dat$pain = as.factor(dat$pain)
dat$phq8_q1_alacrity = as.factor(dat$phq8_q1_alacrity)
dat$phq8_q2_alacrity = as.factor(dat$phq8_q2_alacrity)
dat$stress = as.factor(dat$stress)
dat$arousal = as.factor(dat$arousal)
dat$valence = as.factor(dat$valence)
set.seed(seed)
df.imp <- missForest::missForest(dat, variablewise = TRUE) #51.6s * 7 = 6 min
dat = as.data.frame(df.imp$ximp)
return(dat)
}
#2. impute by moving average
seven_moving_everage = function(dat, var_name, seed){
set.seed(seed)
copy_row_name = rownames(dat)
sql_query = paste0('SELECT subject_id, date, ', var_name,', AVG(',var_name,') OVER(PARTITION BY subject_id ORDER BY date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS moving_average3 FROM dat ORDER BY subject_id, date')
sql_query2 =paste0('SELECT subject_id, date, ', var_name,', CASE WHEN moving_average3 IS NULL THEN (AVG(moving_average3) OVER(PARTITION BY subject_id ORDER BY date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW)) ELSE moving_average3 END AS moving_average2 FROM dat ORDER BY subject_id, date')
sql_query3 =paste0('SELECT subject_id,', var_name,', CASE WHEN moving_average2 IS NULL THEN (AVG(moving_average2) OVER(PARTITION BY subject_id)) ELSE moving_average2 END AS moving_average FROM dat ORDER BY subject_id, date')
dat = sqldf::sqldf(sql_query)
dat = sqldf::sqldf(sql_query2)
dat = sqldf::sqldf(sql_query3)
dat[which(is.na(dat$step_count)),][var_name] = dat[which(is.na(dat$step_count)),]['moving_average']
rownames(dat) = copy_row_name
return(dat)
}
imput_miss = function(dat, var_name, seed, label_list, imp_func){
func <- match.fun(imp_func)
n_round = length(label_list)
rownames(dat) = 1:nrow(dat)
dat = dat[!is.na(dat[var_name]),]
impute_lists = vector("list", n_round)
for(n in 1:length(label_list)){
dat2 = dat
dat2[rownames(dat2)[rownames(dat2) %in% label_list[[n]]],][var_name] = NA
dat2 = func(dat2, var_name, seed) #check if row index still exist
impute_lists[[n]] = dat2[rownames(dat2)[rownames(dat2) %in% label_list[[n]]],][var_name][1:length(label_list[[n]]),]
}
return(impute_lists)
}
compute_nmse = function(dat, var_name, impute_list, label_list){
n_round = length(label_list)
nmse_df_list = vector("list", n_round)
nmse_df <- data.frame(matrix(ncol = 3, nrow = 0))
colnames(nmse_df) <- c("subject_id", "nmse", 'round_index')
dat = dat %>% select(subject_id, var_name)
rownames(dat) = 1:nrow(dat)
dat = dat[!is.na(dat[var_name]),]
dat$round_index = NA
for(n in 1:length(label_list)){
dat2 = dat
dat2 = dat2[rownames(dat2)[rownames(dat2) %in% label_list[[n]]],]
dat2$impute_value = impute_list[[n]]
colnames(dat2)[2] = 'true_value'
nmse_df = rbind(nmse_df, dat2 %>% group_by(subject_id) %>% summarize(nmse = sqrt(mean((true_value - impute_value)^2)/var(true_value))) %>% mutate(round_index = n))
}
return(nmse_df)
}
#' Make plots for imputation result
#'
#' @param filter_df filtered passive data
#' @param active_df active data
#' @param baseline_df baseline data
#' @param var_name passive measurement variable name
#' @param priority_miss_perc Percentage of observations per subject to be labeled as missing;1)value between 0 and 1: same percentage for each subject;2)NA: automatically according to missing percentage of each subject
#' @param n_round number of times to randomly sample observations to be labeled
#' @param seed fix seed so that simulation results are reproduceable
#' @param imp_func the function of imputation method, available options now: #1) missForest :my_missForest; #2) moving average of 7 days: seven_moving_everage
#' @return a data frame with imputation quality measurement metrics
#' @export
impute_quality_check = function(filter_df, active_df, baseline_df, var_name, priority_miss_perc, n_round, seed, imp_func){
my_dat = left_join(filter_df, baseline_df, by = c("subject_id" = "subject_id") ) %>%
arrange(subject_id, date)
my_dat = left_join(my_dat, active_df, by = c("subject_id" = "subject_id", "date" = "date") ) %>%
arrange(subject_id, date)
miss_df = missing_percentage_cross_subjects(dat = my_dat,var_name = var_name)
label_lists = label_na(dat = my_dat, var_name = var_name, miss_df = miss_df, n_round = n_round, seed = seed, priority_miss_perc = 0.2)
imput_result = imput_miss(dat=my_dat, var_name=var_name, seed=seed, label_list=label_lists, imp_func = imp_func)
eg_compute_nmse = compute_nmse(dat = my_dat, var_name = var_name, impute_list = imput_result, label_list = label_lists)
return(eg_compute_nmse)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.