bacpac_val: Validator for the BACPAC Minimum Data Set

library(dplyr);


#' Maps years to booleans indicating whether the year is a leap year
#'
#' @param year_number - numerical representation of the year
#' @return T if year is a leap year, F otherwise
leap_year <- function(year_number){
    by_four <- year_number %% 4 == 0;
    by_one_hundred <- year_number %% 100 == 0;
    by_four_hundred <- year_number %% 400 == 0;
    ly <- by_four & !by_one_hundred
    ly[by_four_hundred] <- T;
    ly
}

basic_month_map <- c(31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31);

#' Return the length of the given month for a given year
#'
#' @param month_number - the month as a number between inclusive 1 and 12
#' @param year_number - the year
#' @return the length of the given month
month_len <- function(month_number, year_number){
    basic <- basic_month_map[month_number];
    basic[leap_year(year_number) & month_number == 2] <- 29;
    basic    
}

#' maps vectors of year month day to booleans based on whether the day
#' is valid for the given year and month.
valid_day <- function(year, month, day){
    month >=1 & month <= 12 & day >= 1 & day <= month_len(month, year);
}

#' Return the unparsed column name for "name"
#'
#' @param name (or names)
#' @return the unparsed name
unparsed_column_name <- function(name){
    paste("unparsed__",name,sep="");
}

#' Load a data frame but keep both a parsed and unparsed version of each column (unparsed columns are preceeded in their name by "unparsed__")
#'
#' @param filename - file to load
#' @return a data frame with twice the columns indicated in the file, half of which are unparsed duplicates.
#' @export
val_read_csv <- function(filename,col_types=NULL){
    parsed <- if(is.null(col_types)) { 
                  readr::read_csv(filename, guess_max=100000);
              } else {
                  readr::read_csv(filename, col_types=col_types);
              }
    unparsed <- readr::read_csv(filename, col_types = readr::cols(.default = "c"))
    names(unparsed) <- unparsed_column_name(names(unparsed));
    cbind(parsed, unparsed) %>% dplyr::mutate(index__=seq(nrow(parsed)));
}

#' Load a data frame from an xpt but keep both a parsed and unparsed
#' version of each column (unparsed columns are preceeded in their
#' name by "unparsed__")
#'
#' For xpt files this entails reading them in and then writing them out again to CSV
#' with the correct spec and then reading back in with val_read_csv.
#'
#' @param filename - file to load
#' @return a data frame with twice the columns indicated in the file, half of which are unparsed duplicates.
#' @export
val_read_xpt <- function(filename){
    data <- haven::read_xpt(filename) %>% as_tibble();
    spec <- spec(data);
    tmpfn <- tempfile();
    readr::write_csv(data, tmpfn);
    out <- val_read_csv(tmpfn, col_types=spec);
    file.remove(tmpfn);
    out
}

#' val_read_data: load either a csv or xpt file and save unparsed rows
#' 
#' @param filename - file to load (either csv or xpt extension)
#' @return a data frame with both parsed and unparsed columns
#' @export
val_read_data <- function(filename){
    strReverse <- function(x) sapply(lapply(strsplit(x, NULL), rev), paste, collapse="");
    ext <- strReverse(filename) %>% stringr::str_sub(1,4) %>% strReverse();
    out <- switch(ext,
           ".csv"=val_read_csv(filename),
           ".xpt"=val_read_xpt(filename));
    if(is.null(out)){
        stop(sprintf("Unrecognized file extension-like part of input file %s, %s. We support only xpt and csv files.", filename, ext));
    }
    out
}

#' Returns the codelist (from the specification) for a given column id.
#'
#' @param column - the column ID to get the codelist for.
#' @return a character array of code list values.
column_to_codelist <- function(column, specification=bt_specification){
    sf <- specification$Codelists %>% dplyr::filter(ID==column) %>% dplyr::arrange(Order);
    lst <- sf$Term;
    if(identical(length(lst),0)){
        stop(sprintf("Tried to get the codelist for %s but it was empty.", column))
    } else {
        lst;
    }
}

#' get_codelist - fetch a codelist by ID.  NB. This is an alias for
#' column_to_codelist. Not all codelists correspond to a column but
#' the logic for fetching the codelist is identical.
#' 
#' @param ID - the id the codelist
#' @return the codelist
get_codelist <- column_to_codelist;

#' Returns the codelist order (from the specification) for a given column id.
#'
#' @param column - the column ID to get the codelist for.
#' @return a character array of code list values.
column_to_codelist_order <- function(code){
    sf <- bt_specification$Codelists %>% dplyr::filter(ID==code) %>% dplyr::arrange(Order);
    lst <- sf$Order;
    if(identical(length(lst),0)){
        stop(sprintf("Tried to get the codelist for %s but it was empty.", code))
    } else {
        lst;
    }
}

collapse_commas <- function(s){
    paste(s,collapse=", ");
}

downcase_names <- function(o){
    names(o) <- tolower(names(o));
    o
}

dont_do <- function(block){
    NULL
}

read_json <- function(filename){
    jsonlite::fromJSON(filename) %>% as.data.frame();
}

has_column <- function(tbl,name){
    name %in% names(tbl);
}

column_missing <- function(tbl, name){
    !has_column(tbl, name);
}

#' return a string for each row which contains the column name/value
#' pairs for each column in cols.
summarize_column_values <- function(tbl, cols){
    tbl %>%
        dplyr::rowwise() %>%
        dplyr::mutate(sss=paste(sprintf("(column: %s: value %s)", dplyr::all_of(cols), dplyr::across(dplyr::all_of(cols), function(x)x)), collapse=", ")) %>%
        dplyr::ungroup() %>%
        `[[`("sss")
}

block <- block <- gtools::defmacro(bl,expr=(function()bl)());

`%not-in%` <- function(x,table){
    !(x %in% table);
}

last <- function(sq){
    sq[length(sq)];
}

file_extension <- function(fn){
    str_split(fn,"\\.") %>% last();
}


read_generic <- function(input_file_name){
    switch(file_extension(input_file_name),
           "csv" = read_csv(input_file_name),
           "xpt" = haven::read_xpt(input_file_name),
           "json"= )
}

to_map <- function(df, key_col, val_col=TRUE){
    o <- list();
    if(identical(val_col, TRUE)){
        nms <- names(df);
        nms <- nms[nms %not-in% key_col];
        to_map(df,key_col,nms);
    } else if (length(val_col)==1){
        for (i in seq(nrow(df))){
            o[[df[[key_col]][[i]]]] <- df[[val_col]][[i]];
        }
        o
    } else {
        for (i in seq(nrow(df))){
            target <- list();
            for(v in val_col){
                target[[v]] <- df[[v]][[i]];
            }
            o[[df[[key_col]][[i]]]] <- target;
        }
        o
    }
}

`%===%` <- identical;

`%!==%` <- function(a,b){
    !(a %===% b);
}

all_true <- function(s){
    s %===% rep(T,length(s));
}

all.equal_tf <- function(a,b){
    identical(T, all.equal(a,b));
}

# In order to find the codelist for a question response (QSSTRESC)
# we need to look up the QSTESTCD in the WhereClauses table
# and then from there take the ID field and look up
# the Codelist in the ValueLevel table.
#
# The Value column in the WhereClauses table contains multiple
# WhereClauses IDs so we expand them here. This makes the Comparator
# column redundant because IN reduces to EQ when the values are split
# like this.
expand_where_clauses <- function(where_clauses){
    where_clauses <- where_clauses;
    do.call(rbind,
            Map(function(df){
                values <- stringr::str_split(df$Value,",") %>%
                    unlist() %>% 
                    stringr::str_trim();
                lst <- list();
                for(n in names(df)){
                    lst[[n]] <- rep(df[[n]][[1]], length(values));
                }
                lst[["Value"]] <- values;
                do.call(tibble, lst);
            },
            split(where_clauses, where_clauses$ID)) %>% unname())
}

deep_index <- function(o,...){
    indexers <- list(...);
    output <- o;
    for(ii in indexers){
        output <- output[[ii]];
    }
    output
}

to_readable_string <- function(data){
    thing <- data;
    o <- capture.output(print(body(eval(substitute(function(){x},list(x=thing))))))
    o <- o[2:(length(o)-1)];
    stringr::str_trim(paste(o,collapse=" "));
}

create_data_source_file <- function(filename, lst){
    file.remove(filename);
    cat("# created automatically, don't edit",file=filename,sep="\n");
    for(n in names(lst)){
        cat(sprintf("%s <- %s;", n, to_readable_string(lst[[n]])), file=filename, sep="\n",append=TRUE);
        if(tibble::is_tibble(lst[[n]])){
            cat(sprintf("%s <- tibble::as_tibble(%s);", n, n), file=filename, sep="\n",append=TRUE);
        }
        cat("", file=filename, sep="\n",append=TRUE);
    }
    filename
}