manuscriptsJX: manuscripts tools

Documented in my_tableone

#' @title create tableone quickly
#' @description create tableone quickly
#' @param df dataframe
#' @param col_num vecter, the numerical variable names
#' @param col_fac vecter, the factor variable
#' @param col_group vecter, the variable to group
#' @param colnames_label vecter, the label of the colnames
#'
#' @return dataframe
#'
#' @examples
# library(pins)
# # Register the board
# pins::board_register(board = "github",
#                      repo = "jixing475/database",
#                      branch = "master",
#                      token = Sys.getenv("GITHUB_PAT"))
# T1 <-
#   pins::pin_get(
#     name = "T1_pin",
#     board = "github"
#   )
#
# manuscriptsJX::my_tableone(df = T1,
#                            col_num = "age_at_initial_pathologic_diagnosis",
#                            col_fac = c("CASC8", "gender", "race", "number_of_lymphnodes_positive_by_he", "pathologic_stage", "pathologic_T", "pathologic_M", "pathologic_N"),
#                            col_group = "CASC8",
#                            colnames_label = c("L_sample", "L_CASC8","L_age_at_initial_pathologic_diagnosis", "L_gender", "L_race", "L_number_of_lymphnodes_positive_by_he", "L_pathologic_stage", "L_pathologic_T", "L_pathologic_M", "L_pathologic_N")
# )
#
#'
#' @rdname my_tableone
#' @export


my_tableone <- function(df,
                        col_num,
                        col_fac,
                        col_group,
                        colnames_label
) {
    library(tidyverse)
    library(tableone)
    # # 📌  read data edit at this ==============================
    # df <- T1
    #
    # # 📌  set variable edit at this ==============================
    # colnames(df)[map_lgl(df, is.numeric)] %>% clipr::write_clip()
    #
    # col_num <- c("age_at_initial_pathologic_diagnosis")
    #
    # colnames(df)[!map_lgl(df, is.numeric)] %>% clipr::write_clip()
    #
    # col_fac <- c("gender","race","pathologic_stage","pathologic_T","pathologic_M","pathologic_N")
    #
    library(friendlyeval)
    group <- col_group
    df <- mutate(df, group = !!treat_string_as_col(col_group))

    # 📌  create df_label ==============================
    df_label <- data.frame(var = colnames(df) %>%
                               LuckyStar::str_detect_remove("group"))
    df_label$label <- colnames_label

    # create tableone ==============================
    # strata
    table_one_group <-
        CreateTableOne(
            vars = c(col_num, col_fac),
            strata = group,# Group
            data = df,# filter group is NA
            factorVars = col_fac # category variable
        )
    # overall 一列
    table_one_overall <-
        CreateTableOne(
            vars = c(col_num, col_fac),
            data = df,
            factorVars = col_fac # category variable
        )
    # 分几组比较
    group_levels <- df[[group]] %>% as.factor() %>% levels()
    # 每个 level 下都有, 参数发个非参数法的统计量, 你要进行判别选择,对么个横向的变量选择合适的统计方式, mean or median
    idx <- c()
    for(i in seq_along(group_levels)){
        idx <- table_one_group$ContTable[[i]] %>% as.data.frame() %>%
            dplyr::pull(skew) %>%
            abs() %>%
            `>`(2) %>% # absolutely value is greater than 2
            which() %>%
            c(idx, .) %>%
            unique()
    }
    idx# skew > 2 所以 idx 是非参变量的索引
    non_normal <- rownames(table_one_group$ContTable[[1]])[idx]
    ## Just typing the object name will invoke the print.TableOne method
    ## Tests are by oneway.test/t.test for continuous, chisq.test for categorical
    ## Specifying nonnormal variables will show the variables appropriately,
    ## and show nonparametric test p-values (kruskal.test/wilcox.test).
    ## Specify variables in the exact argument to obtain the exact test
    ## (fisher.test) p-values. If a 2-level factor is specified in cramVars
    ## both levels are shown in one row.
    res_group <- tableone_list2df(table_one_group,
                                  #two_level = two_levels_var(),
                                  non_normal = non_normal) %>%
        mutate(Variable = if_else(Variable=="", "NA", Variable)) %>%
        mutate(Variable = na_x2na(Variable, "NA")) %>%
        tidyr::fill(., Variable)  %>%
        tidyr::unite(Variable, c("Variable", "level"), sep = " ")

    res_overall <- tableone_list2df(table_one_overall,
                                    #two_level = two_levels_var(),
                                    non_normal = non_normal) %>%
        mutate(Variable = if_else(Variable=="", "NA", Variable)) %>%
        mutate(Variable = na_x2na(Variable, "NA")) %>%
        tidyr::fill(., Variable)  %>%
        tidyr::unite(Variable, c("Variable", "level"), sep = " ")

    res <-
        left_join(res_group ,
                  res_overall ,
                  by = "Variable") %>%
        #bind_cols(res_group,  res_overall) %>%
        select(Variable, Overall, everything()) %>%
        select(-test)

    # 增加统计量 ==============================
    reg <- "(\\(mean \\(sd\\)\\))|(\\(median \\[IQR\\]\\))|(\\(%\\))"
    statistic_table <-
        res %>%
        filter(Variable!="n ") %>%
        fill(p, .direction = "down") %>%
        mutate(var = Variable) %>%
        separate(var, into = c("var", "type"), sep = reg) %>%
        mutate(type = str_extract(Variable, reg)) %>%
        select(Variable, var, type) %>%
        mutate(var = str_trim(var, side = "right"))

    output <- c()
    for (i in 1:nrow(statistic_table)) {
        # sequence
        statistic <- get_statistic(statistic_table$var[i],
                                   statistic_table$type[i],
                                   df = df)
        names(statistic) <- statistic_table$var[i]
        output <- c(output, statistic)
    }
    statistic_table$statistic <- output

    # 增加标签 ==============================
    statistic_table <-
        statistic_table %>%
        left_join(df_label, by ="var") %>%
        rowwise() %>%
        mutate(label = str_replace(Variable, var, label))

    # 合并

    final_result <-
        res %>%
        left_join(select(statistic_table, Variable, statistic, label)) %>%
        select(-Variable) %>%
        select(Variable = label, everything()) %>%
        col_back(p)

    final_result[1, 1] <- "n"

    reg <- "(\\(mean \\(sd\\)\\))|(\\(median \\[IQR\\]\\))|(\\(%\\))"

    res <-
    final_result %>%
        mutate(var = Variable) %>%
        separate(var, into = c("var", "type"), sep = reg) %>%
        mutate(type = str_extract(Variable, reg)) %>%
        mutate(Var = str_c(var, type)) %>%
        rowwise() %>%
        mutate(class = str_remove(Variable, var) %>%
                   str_remove(type) %>%
                   str_remove("\\(\\)")) %>%
        select(-var, -type, -Variable) %>%
        select(Var, class, everything())

    return(res)
}