R/extractQR.R

Defines functions extractFrequency extractStatQ extractOOS_test extractNormalityTests extractARIMA extract_QR

Documented in extract_QR

#' Extraction d'un bilan qualité
#'
#' Permet d'extraire un bilan qualité à partir du fichier CSV contenant la matrice des diagnostics.
#'
#' @param matrix_output_file fichier CSV contenant la matrice des diagnostics.
#' @param sep séparateur de caractères utilisé dans le fichier csv (par défaut \code{sep = ";"})
#' @param dec séparateur décimal utilisé dans le fichier csv (par défaut \code{dec = ","})
#'
#' @details La fonction permet d'extraire un bilan qualité à partir d'un fichier csv contenant l'ensemble des
#' diagnostics (généralement fichier \emph{demetra_m.csv}).
#'
#' Ce fichier peut être obtenu en lançant le cruncher (\code{\link{cruncher}} ou \code{\link{cruncher_and_param}}) avec
#' l'ensemble des paramètres de base pour les paramètres à exporter et l'option \code{csv_layout = "vtable"} (par défaut)
#' pour le format de sortie des fichiers csv (option de \code{\link{cruncher_and_param}} ou de \code{\link{create_param_file}}
#' lors de la création du
#' fichier de paramètres).
#'
#' Le résultat de cette fonction est un objet \code{\link{QR_matrix}} qui est une liste de trois paramètres :
#' * le paramètre \code{modalities} est un \code{data.frame} contenant un ensemble de variables sous forme catégorielle
#'   (Good, Uncertain, Bad, Severe).
#' * le paramètre \code{values} est un \code{data.frame} contenant les valeurs associées aux indicateurs présents dans
#'   \code{modalities} (i.e. : p-valeurs, statistiques, etc.) ainsi que des variables qui n'ont pas
#'   de modalité (fréquence de la série et modèle ARIMA).
#' * le paramètre \code{score_formula} est initié à \code{NULL} : il contiendra la formule utilisée pour
#'   calculer le score (si le calcul est fait).
#'
#' @encoding UTF-8
#' @return Un objet de type \code{\link{QR_matrix}}.
#' @examples \dontrun{
#' QR <- extract_QR()
#' print(QR)
#' # Pour extraire la matrice des modalités :
#' QR$modalities
#' # Ou :
#' QR[["modalities"]]
#' }
#' @keywords internal
#' @name fr-extract_QR
NULL
#> NULL


#' Extraction of a quality report
#'
#' To extract a quality report from the csv file containing the diagnostics matrix.
#'
#' @param matrix_output_file the csv file containing the diagnostics matrix.
#' @param sep the separator used in the csv file (by default, \code{sep = ";"})
#' @param dec the decimal separator used in the csv file (by default, \code{dec = ","})
#'
#' @details This function generates a quality report from a csv file containing diagnostics (usually from the file \emph{demetra_m.csv}).
#'
#' The \emph{demetra_m.csv} file can be generated by launching the cruncher (functions \code{\link{cruncher}} or \code{\link{cruncher_and_param}}) with the default
#' export parameters, having used the default option \code{csv_layout = "vtable"} to format the output tables of the functions
#' \code{\link{cruncher_and_param}} and \code{\link{create_param_file}} when creating the parameters file.
#'
#' This function returns a \code{\link{QR_matrix}} object, which is a list of 3 objects:
#' * \code{modalities}, a \code{data.frame} containing several indicators and their categorical quality (Good, Uncertain, Bad, Severe).
#' * \code{values}, a \code{data.frame} containing the same indicators and the values that lead to their quality category (i.e.: p-values, statistics, etc.)
#' as well as additional variables that don't have a modality/quality (series frequency and arima model).
#' * \code{score_formula} that will store the formula used to calculate the score (when relevant). Its initial value is \code{NULL}.
#'
#' @encoding UTF-8
#' @return a \code{\link{QR_matrix}} object.
#' @family QR_matrix functions
#' @examples \dontrun{
#' QR <- extract_QR()
#' print(QR)
#' # To extract the modalities matrix:
#' QR$modalities
#' # Or:
#' QR[["modalities"]]
#' }
#' @importFrom stats sd
#' @importFrom utils read.csv
#' @seealso [Traduction française][fr-extract_QR()]
#' @export
extract_QR <- function(matrix_output_file, sep = ";", dec = ",") {
    if (missing(matrix_output_file) || is.null(matrix_output_file)) {
        stop("Please call extract_QR() on a csv file containing at least one cruncher output matrix (demetra_m.csv for example)")
    }
    if (length(matrix_output_file) == 0) {
        stop("The chosen csv file is empty")
    }
    if (!file.exists(matrix_output_file) || length(grep("\\.csv$", matrix_output_file)) == 0) {
        stop("The chosen file desn't exist or isn't a csv file")
    }

    demetra_m <- read.csv(
        file = matrix_output_file,
        sep = sep, dec = dec, stringsAsFactors = FALSE,
        na.strings = c("NA", "?")
    )
    missing_variables <- which(is.na(match(
        c(
            "qs.test.on.sa", "f.test.on.sa..seasonal.dummies.",
            "qs.test.on.i", "f.test.on.i..seasonal.dummies.",
            "f.test.on.sa..td.", "f.test.on.i..td.", "independence", "normality",
            "skewness", "kurtosis"
        ),
        colnames(demetra_m)
    )))
    if (length(missing_variables) != 0) {
        stop(paste0(
            "The following variables are missing from the diagnostics matrix:\n",
            c(
                "qs.test.on.sa", "f.test.on.sa..seasonal.dummies.",
                "qs.test.on.i", "f.test.on.i..seasonal.dummies.",
                "f.test.on.sa..td.", "f.test.on.i..td.", "independence", "normality",
                "skewness", "kurtosis"
            )[missing_variables],
            "\nPlease re-compute the export."
        ))
    }

    demetra_m$series <- gsub(
        "(^ *)|(* $)", "",
        gsub("(^.* \\* )|(\\[frozen\\])", "", demetra_m[, 1])
    )
    demetra_m$frequency <- extractFrequency(demetra_m)

    demetra_m <- cbind(
        demetra_m,
        extractARIMA(demetra_m),
        extractStatQ(demetra_m),
        extractOOS_test(demetra_m),
        extractNormalityTests(demetra_m)
    )
    demetra_m$pct_outliers_value <- demetra_m[, match("number.of.outliers", colnames(demetra_m)) + 1] * 100
    demetra_m$pct_outliers_modality <- demetra_m$number.of.outliers
    demetra_m$m7_modality <- cut(demetra_m$m7 + 0, #+0 to force the variable type to be numeric in case of an NA
        breaks = c(0, 1, 2, Inf),
        labels = c("Good", "Bad", "Severe"), right = FALSE
    )

    colnames(demetra_m)[match(
        c(
            "qs.test.on.sa", "f.test.on.sa..seasonal.dummies.",
            "qs.test.on.i", "f.test.on.i..seasonal.dummies.",
            "f.test.on.sa..td.", "f.test.on.i..td.", "independence", "normality"
        ),
        colnames(demetra_m)
    ) + 1] <- paste0(c(
        "qs.test.on.sa", "f.test.on.sa..seasonal.dummies.",
        "qs.test.on.i", "f.test.on.i..seasonal.dummies.",
        "f.test.on.sa..td.", "f.test.on.i..td.", "independence", "normality"
    ), "_pvalue")
    modalities_variables <- c(
        "series", "qs.test.on.sa", "f.test.on.sa..seasonal.dummies.",
        "qs.test.on.i", "f.test.on.i..seasonal.dummies.",
        "f.test.on.sa..td.", "f.test.on.i..td.",
        "independence", "homoskedasticity_modality",
        "skewness_modality", "kurtosis_modality", "normality", "oos_mean_modality",
        "oos_mse_modality", "m7_modality", "q_modality", "q_m2_modality", "pct_outliers_modality"
    )

    values_variables <- c(
        "series", "qs.test.on.sa_pvalue", "f.test.on.sa..seasonal.dummies._pvalue",
        "qs.test.on.i_pvalue", "f.test.on.i..seasonal.dummies._pvalue",
        "f.test.on.sa..td._pvalue", "f.test.on.i..td._pvalue",
        "independence_pvalue", "homoskedasticity_pvalue",
        "skewness_pvalue", "kurtosis_pvalue",
        "normality_pvalue", "oos_mean_pvalue",
        "oos_mse_pvalue", "m7", "q_value", "q_m2_value", "pct_outliers_value",
        "frequency", "arima_model"
    )

    if (!all(
        modalities_variables %in% colnames(demetra_m),
        values_variables %in% colnames(demetra_m)
    )) {
        missing_variables <- unique(c(
            modalities_variables[!modalities_variables %in% colnames(demetra_m)],
            values_variables[!values_variables %in% colnames(demetra_m)]
        ))
        missing_variables <- paste(missing_variables, collapse = "\n")
        stop(paste0(
            "The following variables are missing from the diagnostics matrix:\n",
            missing_variables, "\nPlease re-compute the export."
        ))
    }

    names_QR_variables <- c(
        "series", "qs_residual_sa_on_sa", "f_residual_sa_on_sa",
        "qs_residual_sa_on_i", "f_residual_sa_on_i",
        "f_residual_td_on_sa", "f_residual_td_on_i",
        "residuals_independency", "residuals_homoskedasticity",
        "residuals_skewness", "residuals_kurtosis", "residuals_normality",
        "oos_mean", "oos_mse", "m7", "q", "q_m2", "pct_outliers"
    )
    QR_modalities <- demetra_m[, modalities_variables]
    QR_values <- demetra_m[, values_variables]
    rownames(QR_modalities) <- rownames(QR_values) <- NULL
    colnames(QR_values)[seq_along(names_QR_variables)] <- colnames(QR_modalities) <- names_QR_variables
    QR_modalities[, -1] <- lapply(QR_modalities[, -1], factor,
        levels = c("Good", "Uncertain", "Bad", "Severe"), ordered = TRUE
    )
    QR <- QR_matrix(modalities = QR_modalities, values = QR_values)
    QR
}

extractARIMA <- function(demetra_m) {
    q_possibles <- grep("(^q$)|(^q\\.\\d$)", colnames(demetra_m))
    bp_possibles <- grep("(^bp$)|(^bp\\.\\d$)", colnames(demetra_m))

    val_q <- demetra_m[, q_possibles]
    val_bq <- demetra_m[, bp_possibles]

    if (length(q_possibles) > 1) {
        integer_col <- which(sapply(val_q, is.integer))
        if (length(integer_col) == 0) {
            val_q <- rep(NA, nrow(val_q))
        } else {
            val_q <- val_q[, integer_col[1]]
        }
    }
    if (length(bp_possibles) > 1) {
        integer_col <- which(sapply(val_bq, is.integer))
        if (length(integer_col) == 0) {
            val_bq <- rep(NA, nrow(val_bq))
        } else {
            val_bq <- val_bq[, integer_col[1]]
        }
    }

    if (!all(
        is.integer(val_q) || all(is.na(val_q)),
        is.integer(val_bq) || all(is.na(val_q))
    )) {
        stop("Error in the extraction of the arima order q or bq")
    }
    arima <- data.frame(
        arima_p = demetra_m[, "p"], arima_d = demetra_m[, "d"], arima_q = val_q,
        arima_bp = val_bq, arima_bd = demetra_m[, "bd"], arima_bq = demetra_m[, "bq"],
        arima_model = demetra_m[, "arima"]
    )
    return(arima)
}
extractNormalityTests <- function(demetra_m) {
    tests_possibles <- grep("(^skewness$)|(^kurtosis$)|(^lb2$)", colnames(demetra_m))
    if (length(tests_possibles) != 3) {
        stop("At least one test is missing, among: skewness, kurtosis, lb2")
    }

    if (length(grep(
        "^X\\.(\\d){1,}$",
        colnames(demetra_m)[rep(tests_possibles, each = 2) + rep(1:2, 3)]
    )) != 6) {
        stop("Re-compute the cruncher export with the options: residuals.skewness:3, residuals.kurtosis:3 and residuals.lb2:3")
    }

    normality <- data.frame(
        skewness_pvalue = demetra_m[, tests_possibles[1] + 2],
        kurtosis_pvalue = demetra_m[, tests_possibles[2] + 2],
        homoskedasticity_pvalue = demetra_m[, tests_possibles[3] + 2]
    )
    normality$skewness_modality <- cut(normality$skewness_pvalue,
        breaks = c(0, .01, .1, 1),
        labels = c("Bad", "Uncertain", "Good"),
        right = FALSE
    )
    normality$kurtosis_modality <- cut(normality$kurtosis_pvalue,
        breaks = c(0, .01, .1, 1),
        labels = c("Bad", "Uncertain", "Good"),
        right = FALSE
    )
    normality$homoskedasticity_modality <- cut(normality$homoskedasticity_pvalue,
        breaks = c(0, .01, .1, 1),
        labels = c("Bad", "Uncertain", "Good"),
        right = FALSE
    )
    return(normality)
}
extractOOS_test <- function(demetra_m) {
    mean_possibles <- grep("(^mean$)|(^mean\\.\\d$)", colnames(demetra_m))
    col_mean <- mean_possibles
    if (length(mean_possibles) > 1) {
        col_mean_possibles <- demetra_m[, mean_possibles]
        character_cols <- which(sapply(col_mean_possibles, is.character))
        if (length(character_cols) == 0) {
            col_all_NA <- which(apply(is.na(col_mean_possibles), 2, all))
            if (length(col_all_NA) == 0) {
                stop("Error in the extraction of the out of sample diagnostics")
            } else {
                col_mean <- mean_possibles[col_all_NA[1]]
            }
        } else {
            col_mean <- mean_possibles[character_cols[1]]
        }
    }
    col_mse <- match("mse", colnames(demetra_m))[1]
    if (!all(
        is.character(demetra_m[, col_mean]) || all(is.na(demetra_m[, col_mean])),
        is.double(demetra_m[, col_mean + 1]) || all(is.na(demetra_m[, col_mean + 1])),
        is.character(demetra_m[, col_mse]) || all(is.na(demetra_m[, col_mse])),
        is.double(demetra_m[, col_mse + 1]) || all(is.na(demetra_m[, col_mse + 1]))
    )) {
        stop("Error in the extraction of the out of sample diagnostics")
    }

    stat_OOS <- data.frame(demetra_m[, col_mean + c(0, 1)], demetra_m[, col_mse + c(0, 1)],
        stringsAsFactors = FALSE
    )
    colnames(stat_OOS) <- c("oos_mean_modality", "oos_mean_pvalue", "oos_mse_modality", "oos_mse_pvalue")
    return(stat_OOS)
}
extractStatQ <- function(demetra_m) {
    q_possibles <- grep("(^q$)|(^q\\.\\d$)", colnames(demetra_m))
    q_m2_possibles <- grep("(^q\\.m2$)|(^q\\.m2\\.\\d$)", colnames(demetra_m))
    col_q <- q_possibles
    col_q_m2 <- q_m2_possibles
    if (length(q_possibles) > 1) {
        col_q_possibles <- demetra_m[, q_possibles]
        character_cols <- which(sapply(col_q_possibles, is.character))
        if (length(character_cols) == 0) {
            col_all_NA <- which(apply(is.na(col_q_possibles), 2, all))
            if (length(col_all_NA) == 0) {
                stop("Error in the extraction of the Q and Q-M2 stats")
            } else {
                col_q <- q_possibles[col_all_NA[1]]
            }
        } else {
            col_q <- q_possibles[character_cols[1]]
        }
    }
    if (length(q_m2_possibles) > 1) {
        col_q_m2_possibles <- demetra_m[, q_m2_possibles]
        character_cols <- which(sapply(col_q_m2_possibles, is.character))
        if (length(character_cols) == 0) {
            col_all_NA <- which(apply(is.na(col_q_m2_possibles), 2, all))
            if (length(col_all_NA) == 0) {
                stop("Error in the extraction of the Q and Q-M2 stats")
            } else {
                col_q_m2 <- q_m2_possibles[col_all_NA[1]]
            }
        } else {
            col_q_m2 <- q_m2_possibles[character_cols[1]]
        }
    }
    if (!all(
        is.character(demetra_m[, col_q]) || all(is.na(demetra_m[, col_q])),
        is.double(demetra_m[, col_q + 1]) || all(is.na(demetra_m[, col_q + 1])),
        is.character(demetra_m[, col_q_m2]) || all(is.na(demetra_m[, col_q_m2])),
        is.double(demetra_m[, col_q_m2 + 1])
    ) || all(is.na(demetra_m[, col_q_m2 + 1]))) {
        stop("Error in the extraction of the Q and Q-M2 stats")
    }

    stat_Q <- data.frame(demetra_m[, col_q + c(0, 1)], demetra_m[, col_q_m2 + c(0, 1)],
        stringsAsFactors = FALSE
    )
    colnames(stat_Q) <- c("q_modality", "q_value", "q_m2_modality", "q_m2_value")

    return(stat_Q)
}
extractFrequency <- function(demetra_m) {
    if (any(is.na(match(c("start", "end", "n"), colnames(demetra_m))))) {
        stop("Error in the extraction of the series frequency (missing either the start date, the end date or the number of observations)")
    }
    start <- as.Date(demetra_m$start, format = "%Y-%m-%d")
    end <- as.Date(demetra_m$end, format = "%Y-%m-%d")
    n <- demetra_m$n
    # i <- 3

    start <- data.frame(y = as.numeric(format(start, "%Y")), m = as.numeric(format(start, "%m")))
    end <- data.frame(y = as.numeric(format(end, "%Y")), m = as.numeric(format(end, "%m")))
    freq <- c(12, 6, 4, 3, 2)
    nobs_compute <- matrix(sapply(freq, function(x) {
        x * (end[, 1] - start[, 1]) + (end[, 2] - start[, 2]) / (12 / x)
    }), nrow = nrow(demetra_m))
    return(sapply(seq_len(nrow(nobs_compute)), function(i) {
        freq[which((nobs_compute[i, ] == n[i]) | (nobs_compute[i, ] + 1 == n[i]) | (nobs_compute[i, ] - 1 == n[i]))[1]]
    }))
}
AQLT/JDCruncheR documentation built on March 20, 2024, 2:32 p.m.