R/filter.missingness.R

Defines functions filter_nodata filter_missingness

#' @include ccTable.R
ccTable$methods(
    get_missingness = function() {
        miss_count <- function(tb_) { 
            cmplt <- function(vec) {
                length(which(vec!="NA"))/length(vec) * 100 
            }
            items_ <- names(tb_)[!names(tb_) %in% c("site", "episode_id", "time")]
            flags <- tb_[, cmplt(.SD[[items_[1]]]), .(site, episode_id)]
            setnames(flags, c('site', 'episode_id', items_[1]))
            flags
        }

        .self$dquality[['missingness']] <- .self$torigin[, 1, by=c("site", "episode_id")]
        .self$dquality[['missingness']][, V1:=NULL]
        setkey(.self$dquality[['missingness']], site, episode_id)

        for (i in names(.self$conf)) {
            missconf <- .self$conf[[i]][["missingness"]][["labels"]]
            if(!is.null(missconf)) {
                for (c in seq(missconf)) {
                    col_name <- names(missconf[c])
                    colr <- missconf[[c]]
                    tbq <- ccd_select_table(.self$record, items_opt=i, freq=colr)
                    setkey(tbq, episode_id, site)
                    oldnm <- names(.self$dquality[['missingness']])
                    .self$dquality[['missingness']] <- 
                        merge(.self$dquality[['missingness']], miss_count(tbq))
                    setnames(.self$dquality[['missingness']], c(oldnm, paste(i, col_name, sep=".")))
                }
            }
        }    
})


#' Data missing filter
#'
#' Deal with data when insufficient data points are supported. There are 
#' two key items to be set in the YAML configuration file. 
#' 1) labels -- time interval. 2) accept_2d -- the accept present ratio. 
#' So if we set the labels is 24, and accept_2d is 70. It means we accept 
#' all the missing rate that is lower than 30% every 24 data points.
#' @name ccTable_filter_missingness
#' @param recount logical value. Recount the missingness if TRUE. 
NULL
ccTable$methods(
    filter_missingness = function(recount=FALSE){
        "filter out the where missingness is too low."
        if (recount || is.null(.self$dquality[['missingness']]) ||
            nrow(.self$dquality[['missingness']]) == 0)
            .self$get_missingness()

        if (is.null(.self$tclean) || nrow(.self$tclean) == 0)
            .self$tclean <- .self$torigin

        thresholds <- 
            unlist(lapply(.self$conf, 
                          function(x) x[["missingness"]][["accept_2d"]]))

        select_index <- rep(TRUE, nrow(.self$dquality[['missingness']]))
        for (nt in names(thresholds))
            select_index <- 
                select_index & as.vector(.self$dquality[['missingness']][, nt, with=FALSE] > thresholds[nt])
        
        .self$dfilter$missingness <- list()
        .self$dfilter$missingness$episode <-
            data.table(.self$dquality$missingness[, c('site', 'episode_id'),
                       with=FALSE], select_index)
})

#' No data filter
#' 
#' Remove the episode when a particular field is not presented.
#' It need to be set up in the YAML configuration file. 
#' @name ccTable_filter_nodata
NULL
ccTable$methods(
    filter_nodata = function() {
        "Exclude episodes when no data is presented in certain fields"
        data <- .self$get.data.column("nodata")
        nodata <- function(x, ...) {
            !all(x %in% c("NA", NA))
        }
        .self$dfilter$nodata <- getfilter(data, nodata)
    }
)
CC-HIC/cleanEHR documentation built on Aug. 28, 2022, 10:33 a.m.