R/cat_all.R
In reappraised: Statistical Tools for Assessing Publication Integrity of Groups of Trials

#' Compares observed and expected distribution of all categorical (binomial) variables
#'
#' Creates plots of observed to expected numbers and ratios for the binomial variables and/or compares reported and calculated p-values for the variables\cr
#' Reference: Bolland MJ, Gamble GD, Avenell A, Cooper DJ, Grey A. Distributions of baseline categorical variables were different from the expected distributions in randomized trials with integrity concerns. J Clin Epidemiol. 2023;154:117-124
#'
#'
#' Returns a list containing objects described below and (if verbose = TRUE) prints the flextable cat_all_diff_calc_rep_ft and/or graph cat_all_graph depending on options chosen
#'
#' @param df data frame generated from load_clean function
#' @param comp.pvals "yes" or "no" indicator whether reported and calculated p-values should be compared
#' @param fisher.sim "yes" or "no" indicator whether to allow fisher test to simulate p-values for >2*2 tables
#' @param fish.n.sims number of simulations to use in Fisher test, default 10,000
#' @param binom "yes" or "no" indicator whether observed to expected distributions of binomial variables should be calculated
#' @param two_levels "yes" or "no" indicator whether variables with more than 2 levels should be collapsed to 2 levels
#' @param del.disparate if yes, data in which the absolute difference between group sizes is >20% are deleted
#' @param excl.level "yes" or "no" indicator whether one level of a variable should be deleted. Deleted level is chosen randomly using seed parameter.
#' @param seed seed for random number generator, default 0 = current date and time. Specify seed to make repeatable.
#' @param title title name for plots (optional)
#' @param verbose TRUE or FALSE indicates whether progress bar and comments show and flextable or plot or both are printed
#'
#' @return list containing objects as described\cr
#'
#' if p-value comparison used:
#'\itemize{
#'   \item cat_all_pvals = data frame of data for comparison of reported and calculated p-values
#'   \item cat_all_diff_calc_rep_ft = flextable of comparison of reported and calculated p-values
#'   \item cat_all_diff_calc_rep_data = data frame used to make flextable
#'   \item cat_all_diff_thresh_ft = flextable of comparison of reported and calculated p-values when only threshold given
#'   \item cat_all_diff_thresh_data = data frame used to make flextable for p-value thresholds
#'   }
#'
#' if comparing categorical variables used
#' \itemize{
#'   \item cat_all_graph = plot of observed to expected numbers and differences between groups, top panels are the absolute numbers, bottom panels are the differences between trial arms in two arm studies
#'   \item cat_all_graph_pc = plot of observed to expected numbers expressed as percentages and differences between groups, top panels are the percentages, bottom panels are the differences between trial arms in two arm studies
#'   \item cat_all_data_abs = data frame of data for absolute numbers
#'   \item cat_all_data_df = data frame of data for difference between groups in two arm studies
#'   \item cat_all_dataset_abs = data frame of dataset used for all trials
#'   \item cat_all_dataset_df = data frame of dataset used for two arm trials
#'   \item cat_all_all_graphs list containing
#'      \itemize{
#'         \item abs = plot for absolute numbers only
#'         \item df = plot for difference between groups in two arm studies only
#'         \item pc = plot for percentages only
#'         \item all_pc = composite plot of percentages and absolute numbers
#'         \item individual_graphs list of 6 individual plots making up composite figures
#'     }
#'   }
#'
#' @import ggplot2
#' @importFrom magrittr %>%
#' @importFrom dplyr select mutate left_join group_by slice arrange count filter rename ungroup across summarise bind_cols
#' @importFrom tidyr spread gather unite separate
#' @importFrom ggpubr ggarrange
#' @importFrom data.table setorder as.data.table
#' @importFrom purrr map map_int
#' @importFrom vcd assocstats
#' @importFrom vcdExtra CMHtest
#' @importFrom epitools riskratio.wald
#' @importFrom flextable flextable font fontsize valign align add_header_row merge_h set_table_properties
#' @importFrom stats runif
#' @export cat_all_fn
#'
#'
#' @examples
#' # load example data
#' cat_all_data <- load_clean(import= "no", file.cat = "SI_cat_all", cat_all= "yes",
#' format.cat = "wide")$cat_all_data
#'
#' \donttest{
#' # run function comparing p-values only (takes only a few seconds)
#' cat_all_fn (comp.pvals = "yes")$cat_all_diff_calc_rep_ft
#'
#' # run function comparing distribution of binomial variables only
#'
#' # to speed example up limit to 12 2-arm trials with 20 variables
#' # (takes close to 5 secs)
#'
#' cat_all_data <- cat_all_data [1:41, c(1:8,10:11,13:15)]
#'
#' cat_all_fn (binom = "yes", two_levels = "yes", del.disparate = "yes",
#' excl.level = "yes", seed = 10)$cat_all_graph
#'
#'
#' # to import an excel spreadsheet (modify using local path,
#' # file and sheet name, range, and format):
#'
#' # get path for example files
#' path <- system.file("extdata", "reappraised_examples.xlsx", package = "reappraised",
#'                    mustWork = TRUE)
#' # delete file name from path
#' path <- sub("/[^/]+$", "", path)
#'
#' # load data
#' cat_all_data <- load_clean(import= "yes", cat_all = "yes", dir = path,
#'    file.name.cat = "reappraised_examples.xlsx", sheet.name.cat = "SI_cat_all",
#'    range.name.cat = "A:N", format.cat = "wide")$cat_all_data}
#'
#' @md


cat_all_fn <- function (df = cat_all_data, comp.pvals = "no", fisher.sim = "y", fish.n.sims =10000, binom = "no", two_levels= "no", del.disparate = "yes",
                        excl.level= "yes", seed = 0, title = "", verbose = TRUE) {

  #import dataset
  a <- df
  results <- list()
  results2 <- list()

  if (! "data_type" %in% colnames(a)) {a$data_type <- 0}
  if (is.na(a$data_type [1]) | a$data_type [1] != "cat_all") {
    stop_mb(paste0(deparse(substitute(df)), " is either incorrect data for this function or ",
               deparse(substitute(df)), "$data_type != 'cat_all'"))}

  if (isTRUE(verbose) | tolower(substr(verbose,1,1)) == "t") {vb <- "y"
  } else {vb <- "n"}

  if(vb == "y") {
    pb = txtProgressBar(min=0, max=100, style = 2)
    if (tolower(substr(comp.pvals,1,1)) == "y" & tolower(substr(binom,1,1)) == "y") {inc <- 33
    } else if(tolower(substr(comp.pvals,1,1)) == "y" | tolower(substr(binom,1,1)) == "n") {inc <- 50
    } else if(tolower(substr(comp.pvals,1,1)) == "n" | tolower(substr(binom,1,1)) == "y") {inc <- 50}
  }

  #compare- pvalues
  if (tolower(substr(comp.pvals,1,1)) == "y") {

    a <- a %>% dplyr::select (-data_type)

    if(!"p" %in% colnames(a)) {
      b1 <- "There were no reported p-values"
    } else {

      #calculate p-values
      #convert to wide format for level
      #matrix is levels = row & groups = column = n_levels_group
      #make sure variables are in right order by arrange and factor
      b <- a %>% dplyr::select(study, var, level, starts_with("n", ignore.case = TRUE)) %>%
        tidyr::gather (k, v, starts_with("n", ignore.case = TRUE)) %>%
        dplyr::mutate(k = paste0(gsub("[0-9]+","", k), "_", gsub("\\D", "", k))) %>%
        tidyr::separate(k, into = c("k1", "k2")) %>%
        dplyr::arrange(level, k2) %>%  tidyr::unite(k, k1, level, sep = "_") %>%
        tidyr::unite(k, k, k2, sep = "_") %>%
        dplyr::mutate(k = factor(k, levels = unique(k))) %>% tidyr::spread (k, v)

      b <- dplyr::left_join(b, a %>% dplyr::select(study, var, levels_no ,group) %>%
                              dplyr::group_by(study, var) %>% dplyr::slice (1), by = c("study", "var"))
      b <- b %>% dplyr::select(-starts_with("N", ignore.case = FALSE))

      #Pb1
      if(vb == "y") {setTxtProgressBar(pb, getTxtProgressBar (pb)+ inc)
        cat("\nCalculating p-values ...\n\n")}

      #do as loop
      nm <- which(substr(colnames(b),1,1) == "n")
      nm_p <- c("pc", "pcc", "pf", "plr", "pchm", "p.m", "p.f", "p.c", "pmdpsas")
      b [, nm_p] <- NA

      b$Chisquare.warn <- NA

      #calculate p by different methods
      #matrix is levels = row & groups = column = n_levels_group
      for (i in 1:nrow(b)) {
        x <- na.omit(as.numeric(b [i, nm]))
        x <- matrix(x, byrow = TRUE, nrow = b$levels_no [i])
        chi <- suppressWarnings(chisq.test(x, correct = FALSE))
        b$pc [i] <- chi$p.value
        if(length(chi$expected) == 4 & sum (chi$expected <5) >0 |
              length(chi$expected) > 4 & (sum(chi$expected <1) >0 | sum(chi$expected <5) / length(chi$expected) > 0.2)) {
          b$Chisquare.warn [i] <- "exp <5"}
        b$pcc [i] <- suppressWarnings(chisq.test(x, correct = TRUE)$p.value)
        if (tolower(substr(fisher.sim,1,1)) == "y") {
          b$pf [i] <-  suppressWarnings(fisher.test(x, workspace = 2e8, simulate.p.value = TRUE, B= fish.n.sims)$p.value)
        } else {b$pf [i] <-  suppressWarnings(fisher.test(x, workspace = 2e8)$p.value)}
        b$plr [i] = vcd::assocstats(x)$chisq_tests [1,3]
        b$pchm [i] = vcdExtra::CMHtest(x)$table [1,3]
        if (b$group [i] == 2 & b$levels_no [i] == 2) { #epitools only for 2*2
          pm <- list(suppressWarnings(epitools::riskratio.wald(x)$p.value [2,]))
          b$p.m [i] = pm [[1]] [1]
          b$p.f [i] = pm [[1]] [2]
          b$p.c [i] = pm [[1]] [3]
          #sas midpvalues only for 2*2 tables also
          d1 = x [[1]]; d2 = x [[2]]; d3= x[[3]]; d4 = x[[4]];
          d5 = d1 +d2 +d3 + d4
          #multiply loop
          num = c(seq(max(d1+d2,1),1,-1), seq(max(d3+d4,1),1,-1), seq(max(d1+d3,1),1,-1), seq(max(d2+d4,1),1,-1))
          denom = c(seq(max(d5,1),1,-1), seq(max(d1,1),1,-1), seq(max(d2,1),1,-1), seq(max(d3,1),1,-1) ,seq(max(d4,1),1,-1))
          len = length(num) -length (denom)
          if (len > 0) {denom <- c(denom, rep(1, len)) }
          if (len < 0) {num <- c(num, rep(1, abs(len))) }
          p <- 100000
          num <- sort(num, decreasing = TRUE)
          denom <- sort(denom, decreasing = TRUE)
          for (j in 1:length(denom)) {
            p <- p * num [j] / denom [j]
          }
          p <- p / 100000
          b$pmdpsas [i] <- round(b$p.f [i] - 0.5* p,9)
        }
      }

      #now merge original p/test back in
      if (!"stat" %in% colnames(a)) {a$stat <- NA}

      a$stat <- tolower(a$stat)

      a$stat1 <- a$stat #keep originals

      if (sum(! a$stat %in% c("chisq", "chisqc", "fisher", "midp", "lr", "mh", NA))>0) {
        if(vb == "y") {cat("\nValues for 'stat' are 'chisq', 'chisqc', 'fisher', 'midp', 'lr', 'mh'\nOther values have been removed\n\n")
        }
        a$stat [!a$stat %in% c("chisq", "chisqc", "fisher", "midp", "lr", "mh")] <- NA
      }

      if (sum(a$levels_no*a$group >4 & a$stat %in% "midp") >0) {
        if(vb == "y") {cat("\nMid-p test not available for variables with group >2 or levels >2\n'midp' for these cases have been removed\n\n")
        }
        a$stat [a$levels_no*a$group >4 & a$stat %in% "midp"] <- NA
      }

      if (!"p" %in% colnames(a)) {a$p <- NA}
      b <- dplyr::left_join(b,
                            a %>% dplyr::select(study, var, p, stat, stat1) %>% dplyr::arrange(study, var, p, stat) %>%
                              dplyr::group_by(study, var) %>% dplyr::slice(1),
                            by = c("study", "var"))

      #get pvalues into usuable form
      b$p_n <- suppressWarnings(as.numeric(b$p))
      b <- b %>% dplyr::mutate(p_g = suppressWarnings(ifelse(grepl(">", p), as.numeric(sub('.*>', '', p)), NA)),
                               p_l = suppressWarnings(ifelse(grepl("<", p), as.numeric(sub('.*<', '', p)), NA))) %>%
        dplyr::mutate(p_g = suppressWarnings(ifelse(grepl("ns", tolower(p)), 0.05, p_g)))
      b$p <- tolower(b$p)
      b$p <- ifelse(is.na(b$p_n),b$p, as.character(b$p_n))

      #add digits
      b$p_d <- ifelse(!is.na(b$p_n),
                      ifelse(abs(b$p_n - round(b$p_n)) > .Machine$double.eps^0.5,
                             nchar(sub('^\\d+\\.', '', sub('0+$', '', as.character(abs(b$p_n))))), 0), NA)

      #round all p-values to # digits
      b [, nm_p] <- purrr::map(b [,nm_p], ~ifelse(!is.na(b$p_n), round(.x, b$p_d), round(.x, max(b$p_d,3, na.rm= TRUE))), b$p_n, b$p_d)

      #compare p results
      nm_p <- c("pc", "pf", "pcc", "plr", "pchm", "p.m", "p.f", "p.c", "pmdpsas")
      names (nm_p) <- c("chisq", "fisher", "chisqc", "lr", "mh", "midp","na", "na2", "na3")

      for (i in 1:NROW(b)) {b$p_match [i] <- ifelse(!is.na(nm_p [b$stat [i]]), b [i, nm_p [b$stat [i]]], NA)}
      b$p_match2 <- ifelse(!is.na(b$stat) & b$stat == "midp", b$pmdpsas, NA)

      #thresholds
      b$m_t <- NA
      b$m_t_comment <- NA

      #quite complex logic so do in loop- works through options of test vs no test, and thresholds
      for (i in 1:NROW(b)) {
        if (is.na(b$p_g [i]) & is.na(b$p_l [i])) {b$m_t [i] <- NA; next} #if no thresholds no match

        if (is.na(b$stat [i])) { #no test result
          if (b$group [i] * b$levels_no [i] > 4) {
            if (!is.na(b$p_g [i])) {
              #find if p_calc > p_rep
              #no match then match, greater than
              if (sum (b [i, nm_p] > b$p_g [i], na.rm = TRUE) ==0) {
                b$m_t [i] <- "No match"; b$m_t_comment [i] <- "No reported test, >2*2"
              } else {
                b$m_t [i] <- "Match"; b$m_t_comment [i] <- paste0("No reported test, >2*2, test=",
                                                names(nm_p [which.max (b [i, nm_p] > b$p_g [i])]))
              }
          }

          if (!is.na(b$p_l [i])) {
            #lower than
            if (sum (b [i, nm_p] < b$p_l [i], na.rm = TRUE) ==0) {
              b$m_t [i] <- "No match"; b$m_t_comment [i] <- "No reported test, >2*2"
            } else {
              b$m_t [i] <- "Match"; b$m_t_comment [i] <- paste0("No reported test, >2*2, test=",
                                                                names(nm_p [which.max (b [i, nm_p] < b$p_l [i])]))
            }
          }
        } #end of >2*2

          if (b$group [i] * b$levels_no [i] <= 4) {
            if (!is.na(b$p_g [i])) {
              if (sum (b [i, nm_p] > b$p_g [i], na.rm = TRUE) ==0) {
                b$m_t [i] <- "No match"; b$m_t_comment [i] <- "No reported test, 2*2"
              } else {
                b$m_t [i] <- "Match"; b$m_t_comment [i] <- paste0("No reported test, 2*2, test=",
                                                                  names(nm_p [which.max (b [i, nm_p] > b$p_g [i])]))
              }
            }
            if (!is.na(b$p_l [i])) {
              if (sum (b [i, nm_p] < b$p_l [i], na.rm = TRUE) ==0) {
                b$m_t [i] <- "No match"; b$m_t_comment [i] <- "No reported test, 2*2"
              } else {
                b$m_t [i] <- "Match"; b$m_t_comment [i] <- paste0("No reported test, 2*2, test=",
                                                                  names(nm_p [which.max (b [i, nm_p] < b$p_l [i])]))
              }
            }
          }#end of 2*2
        } #end of no test result

        if (!is.na(b$stat [i])) {  #test result
          if (b$stat [i] == "midp") {
            if (!is.na(b$p_g [i])) {
              b$m_t [i] <- ifelse(b$p_match [i] > b$p_g [i] & b$p_match2 [i] > b$p_g [i], "Match", "No match")} #> and midp
            if (!is.na(b$p_l [i])) {
              b$m_t [i] <- ifelse(b$p_match [i] < b$p_l [i] & b$p_match2 [i] < b$p_l [i], "Match", "No match")} #< and >2*2
            b$m_t_comment [i] <- "Midp SAS and R"}

          if (b$stat [i] != "midp") {
            if (!is.na(b$p_g [i])) {
              b$m_t [i] <- ifelse(b$p_match [i] > b$p_g [i], "Match", "No match")} #> and other
            if (!is.na(b$p_l [i])) {
              b$m_t [i] <- ifelse(b$p_match [i] < b$p_l [i], "Match", "No match")} #< and 2*2
            b$m_t_comment [i] <- b$stat [i]}
        } #end of test result
      }
      b$m_t_comment <- ifelse(!is.na(b$m_t_comment),
                              paste0(toupper(substr(b$m_t_comment, 1, 1)), substr(b$m_t_comment, 2, nchar(b$m_t_comment))), NA)


      b$p_df_stat <- b$p_df <- NA
      nm_ph <- c("pc","pf","p.m","pmdpsas","pcc","plr","pchm","p.f","p.c")
      #difference between reported values
      for (i in 1:NROW(b)) {
        if (is.na(b$p_n [i])) {next}
        if (is.na(b$stat [i])) {#no stat result
          #are there any matches df =0 or lowest dfs
          b$p_df [i] <- ifelse(!is.na(b$p_n [i]), min(abs(b$p_n [i] - b [i, nm_p]), na.rm = TRUE), NA)
          b$p_df_stat [i]  <-  names(which(nm_p == nm_ph [which.max(b$p_n [i] == b [i, nm_ph] + b$p_df [i])]))
        }
        if (!is.na(b$stat[i])) {
          b$p_df_stat[i] <- b$stat[i]
          if (!b$stat [i] == "midp") {b$p_df[i] <- abs(b$p_n[i] - b[i, nm_p[b$stat[i]]])
          } else {
             if (abs(b$p_n [i] - b$p.m [i]) < abs(b$p_n [i] - b$pmdpsas [i])) {
               b$p_df_stat [i] <- "midp.epitools"
               b$p_df [i] <-abs(b$p_n [i] - b$p.m [i])
             } else {
               b$p_df_stat [i] <-  "midp.SAS"
               b$p_df [i] <- abs(b$p_n [i] - b$pmdpsas [i])
             }
          }
        }
      }
      b$p_df_stat <- ifelse(!is.na(b$p_df_stat),
                            paste0(toupper(substr(b$p_df_stat, 1, 1)), substr(b$p_df_stat, 2, nchar(b$p_df_stat))), NA)

      #change midp titles
      b$p_df_stat <- ifelse(b$p_df_stat %in% "Na3", "Midp.sas",
                            ifelse(b$p_df_stat %in% "Midp", "Midp.epitools", b$p_df_stat))

      #output
      #compare reported vs unreported p-values
      b$df.groupsp <- cut(b$p_n,c(-1e-99,1e-99,seq(0.1,1.0,0.1)))
      b$df.groupsdf <- cut(b$p_df,c(-1e-99,1e-99,seq(0.1,1,0.1)))
      nm <- data.frame(diff_in_p_value= c(0, paste(seq(0,.9,0.1),"-", seq(0.1,1,0.1), "")), name = levels(b$df.groupsdf),
                       stringsAsFactors = FALSE)

      diff <- dplyr::left_join(nm %>% dplyr::select(diff_in_p_value),
                               dplyr::left_join (b %>% dplyr::filter(!is.na(p_n)) %>% dplyr::count(df.groupsdf) %>%
                                                   dplyr::rename(name =df.groupsdf), nm, by = "name") %>%
                                 dplyr::select (-name),
                               by = "diff_in_p_value")
      d1 <- ncol(diff)
      if (sum(!is.na(b$stat))>0) {
        diff <- dplyr::left_join(diff,
                                 dplyr::left_join (b %>% dplyr::filter(!is.na(p_n)) %>%
                                                     dplyr::mutate(stat1 = ifelse(is.na(stat1), "No test reported", stat1)) %>%
                                                     dplyr::count(stat1, df.groupsdf) %>% dplyr::rename(name =df.groupsdf),
                                                   nm, by = "name") %>% dplyr::select (-name) %>%
                                   dplyr::mutate (stat1 = paste0(toupper(substr(stat1, 1, 1)), substr(stat1, 2, nchar(stat1)))) %>%
                                   tidyr::spread (stat1, n),
                                 by = "diff_in_p_value")}
      d2 <- ncol(diff) - d1
      diff <- dplyr::left_join(diff,
                               dplyr::left_join(
                                 dplyr::left_join (b %>% dplyr::filter(!is.na(p_n)) %>% dplyr::count(df.groupsp, df.groupsdf) %>%
                                                     dplyr::rename(name =df.groupsdf), nm, by = "name") %>%
                                   dplyr::select (-name) %>% dplyr::rename(name = df.groupsp),
                                 nm %>% dplyr::rename (bpv = diff_in_p_value), by = "name") %>% dplyr::select (-name) %>%
                                 tidyr::spread (bpv, n),
                               by = "diff_in_p_value")
      d3 <- ncol(diff) - d1 -d2
      diff[is.na(diff)] <- ""

      diff <- diff %>% dplyr::mutate(n = ifelse(n=="", "",
                                                paste0 (as.numeric(n), " (",
                                                        round(as.numeric(n)/sum(as.numeric(n), na.rm = TRUE)*100,0),")")))
      colnames (diff) [1:2] <- c("Difference between \nreported and\ncalculated p-values", "n (%)")

      f <- flextable::flextable(diff)
      f <- flextable::font(f, fontname = "Arial")
      f <- flextable::fontsize(f, size = 8)
      f <- flextable::valign(f, i = 1, j = 2:ncol(diff), valign = "bottom", part = "head")
      f <- flextable::align(f, j = 2:ncol(diff), align = "center", part = "all")
      h <- c(rep("", d1), rep ("Test",d2), rep ("Baseline p-value", d3))
      f <- flextable::add_header_row(f, values = h, top = TRUE)
      f <- flextable::merge_h(f, part = "header")
      f <- flextable::align(f, i = 1, align = "center", part = "header")
      f <- flextable::fontsize(f, part = "header", size = 8)
      f <- flextable::set_table_properties(f, width = 1, layout = "autofit")

      #compare thresholds
      if (sum (!is.na(b$m_t)) == 0) {diff1 <- f1 <- "There were no threshold p-values reported"
      } else {
        b$tp <- factor(ifelse(rowSums(!is.na(cbind(b$p_g,b$p_l)), na.rm = TRUE) >0, b$p, NA))
        diff1 <- dplyr::left_join(b %>% dplyr::filter(!is.na(tp)) %>% dplyr::count(tp),
                                  b %>% dplyr::filter(!is.na(tp)) %>% dplyr::count(tp, m_t) %>% tidyr::spread (m_t, n),
                                  by = "tp")
        diff1[is.na(diff1)] <- ""
        if (! "Match" %in% colnames(diff1)) {diff1$Match <- 0}
        if (! "No match" %in% colnames(diff1)) {diff1$`No match` <- 0}
        diff1 <- diff1 %>%
          dplyr::mutate(Match = ifelse(Match=="", "",
                                       paste0 (as.numeric(Match), " (", round(as.numeric(Match)/n*100,0),")")),
                        `No match` = ifelse(`No match`=="", "", paste0 (as.numeric(`No match`), " (",
                                                                        round(as.numeric(`No match`)/n*100,0),")")),
                        n = ifelse(n=="", "",
                                   paste0 (as.numeric(n), " (",
                                           round(as.numeric(n)/sum(as.numeric(n), na.rm = TRUE)*100,0),")")))
        colnames (diff1) [1:2] <- c("Threshold\np-value", "n (%)")

        f1 <- flextable::flextable(diff1)
        f1 <- flextable::font(f1, fontname = "Arial")
        f1 <- flextable::fontsize(f1, size = 8)
        f1 <- flextable::valign(f1, i = 1, j = 2:ncol(diff1), valign = "bottom", part = "head")
        f1 <- flextable::align(f1, j = 2:ncol(diff1), align = "center", part = "all")
        f1 <- flextable::add_header_row(f1, values = c("","", "Thresholds vs calculated p value", "Thresholds vs calculated p value"), top = TRUE)
        f1 <- flextable::merge_h(f1, part = "header")
        f1 <- flextable::align(f1, i = 1, align = "center", part = "header")
        f1 <- flextable::fontsize(f1, part = "header", size = 8)
        f1 <- flextable::set_table_properties(f1, width = 0.4, layout = "autofit")
      }

      #return data
      b$warn <- ifelse(b$p_df_stat %in% "Chisq", b$Chisquare.warn, NA)
      b1 <- b %>% dplyr::select(study,var, p, stat1, p_df, p_df_stat, warn, m_t, m_t_comment, pc, Chisquare.warn, pf, p.m, pcc, plr, pchm, pmdpsas, group, levels_no) %>%
        dplyr::ungroup () %>% as.data.frame ()
      colnames (b1) <-  c("study", "variable", "reported_p_value", "test_reported", "difference_reported_calculated_p_values",
                          "test_calculated", "warn.message", "reported_p_matches_threshold", "threshold_comment", "chisquare", "chisquare.warning","fisher",
                          "midp_r_epitools","chisquare_continuty", "likelihood_ratio", "mantel_haenszel", "midp_SAS_calculation",
                          "groups", "levels_of_variable")

      results <- list(cat_all_pvals = b1, cat_all_diff_calc_rep_ft =f, cat_all_diff_calc_rep_data = diff,
                      cat_all_diff_thresh_ft = f1, cat_all_diff_thresh_data = diff1)
      if (vb == "y") {print (f)}
    }

    if(all(b1 == "There were no reported p-values")) {results <- list(cat_all_pvals = b1, cat_all_diff_calc_rep_ft =b1,
                                                                      cat_all_diff_calc_rep_data = b1, cat_all_diff_thresh_ft = b1, cat_all_diff_thresh_data = b1)}
  }


  #binomial function # note this is essentially cat_fn but uses n and N not v and n
  if (tolower(substr(binom,1,1)) == "y") {

    #collapse to 2 levels and delete one level
    if (tolower(substr(two_levels,1,1)) == "n") {

      #choose a random level to remove
      if(seed ==0) {set.seed(Sys.time())
      } else {set.seed(seed)}

      df <- df %>% dplyr::mutate(r1 = runif(nrow(df))) %>% dplyr::group_by (study, var) %>% dplyr::arrange (study, var, r1) %>%
        dplyr::mutate (l1 = dplyr::row_number()) %>% dplyr::select(-r1) %>% dplyr::ungroup () %>% as.data.frame()

      if (tolower(substr(excl.level,1,1)) == "y") {
        a <- df %>% dplyr::filter (!l1 == 1) %>% dplyr::select (study,var, recode, group, starts_with("n")) #exclude 1 level
      } else {a <- df %>% dplyr::select (study,var, recode, group, starts_with("n"))} # don't exclude

    } else {

      a <- df %>% dplyr::select(-level, -levels_no)
      if ("p" %in% colnames(a)) {a <- a %>% dplyr::select(-p)}

      nm1 <- colnames(a) [substr(colnames(a),1,1) == "n"]
      nm2 <- colnames(a) [substr(colnames(a),1,1) == "N"]

      a <- dplyr::left_join (a %>% dplyr::group_by(study, var, recode) %>%
                               dplyr::summarise (dplyr::across(c((all_of(nm1))), ~ sum(.x, na.rm = FALSE)), .groups = "keep"),
                             a %>% dplyr::select(study, var, group, all_of(nm2)) %>% dplyr::group_by (study, var) %>% dplyr::slice(1),
                             by = c("study", "var")) %>%
        dplyr::select(study:recode, group, all_of(nm1), all_of(nm2)) %>% dplyr::ungroup() %>% as.data.frame()

      if (tolower(substr(excl.level,1,1)) == "y") {
        if(seed ==0) {set.seed(Sys.time())
        } else {set.seed(seed)}

        a <- a %>% dplyr::mutate(r1 = runif(nrow(a))) %>% dplyr::group_by (study, var) %>% dplyr::arrange (study, var, r1) %>%
          dplyr::mutate (recode1 = dplyr::row_number()) %>% dplyr::select(-r1) %>% dplyr::ungroup () %>% as.data.frame() %>%
          filter(recode1 ==1) %>% select(-recode1)
      }
    }

    #Pb1/2
    if(vb == "y") {setTxtProgressBar(pb, getTxtProgressBar (pb)+ inc)
      cat("\nCalculating data for raw numbers and making graphs ...\n\n")}

    g <- max(a$group)#largest number of groups
    nm.n <- colnames(a) [substr(colnames(a),1,1) == "n"]
    nm.N <- colnames(a) [substr(colnames(a),1,1) == "N"]

    #calculate N, n and p # p = prop of people with trait
    a$p <- rowSums(a[, nm.n], na.rm = TRUE) / rowSums(a [, nm.N], na.rm = TRUE)

    #get largest group and add 1 because there can be 0 people with trait
    n <- max(a[, nm.N], na.rm = TRUE)+1

    #starting row for each group
    rw_g <- list()
    for (i in 1:g) {rw_g [[i]] <-  (i-1)*n +1}

    #get expected p
    exp_p <- expect_p(a, n, g, nm.N, rw_g)

    #sum the expected probs
    sum_exp <- 0
    for (i in 1:g) {sum_exp <- sum_exp + exp_p [rw_g [[i]]: (rw_g [[i]]+n-1),]}

    #sum the observed
    obs <- a %>% dplyr::select(all_of(nm.n)) %>% tidyr::gather (k, v) %>% dplyr::group_by (v) %>% dplyr::count () %>%
      dplyr::filter (!is.na(v)) %>% dplyr::rename (obs = n) %>% dplyr::ungroup()

    b <- dplyr::left_join(data.frame(num= 0: (n-1), exp= rowSums(sum_exp)), obs, by = c("num" = "v")) %>% replace(is.na(.), 0)

    #if large number of groups then compress
    n_gp <- sum(!is.na(b$obs))
    for (sz in seq(5,ceiling(n_gp/5)*5,5)) {
      b_temp <- compress(b, "obs", "exp", size=sz)
      if (sz == 5) {
        b <- dplyr::bind_cols(b, b_temp)
        if (sum(!is.na(b_temp [ , "obsm__5"])) < 22) {break}
      }
      if (sum(!is.na(b_temp [ , paste0("obsm__",sz)])) < 22) {
        b <- dplyr::bind_cols(b, b_temp)
        break}
    }

    graph <- list()
    graph [1:2] <- cat_graph (gph= b,  xtitle = "Number with characteristic per group", ytitle = "Number of trial groups*variables",
                              size = sz, sfx = "", text = list(length(unique(a$study)), paste(sum(b$obs), "groups*variables")),
                              fn = "cat_all", ti = "Y", top = "Y", t= title)

    rr_g <- list()
    rr_g [1:2] <- rr_graph(rr = b, xtitle =  "Number with characteristic per group", ytitle = "Observed/Expected ratio",
                           size =sz, sfx= "", text = list(length(unique(a$study)), sum(b$obs)),
                           fn = "cat_all", ti = "Y", top = "yes", t=title)

    #percent
    colnames(exp_p) <- 1:ncol(exp_p)
    exp_pc <- dplyr::bind_cols(num = rep(0:(n-1),g), g= rep(1:g, each= n), exp_p)
    exn <- as.data.frame(t(a [, nm.N]))
    exn_exp <- dplyr::bind_cols(num = rep(0:(n-1),g), g= rep(1:g, each= n), as.data.frame(lapply(exn, rep, rep(n,g))))
    colnames(exn_exp) <- c("num","g", 1:ncol(exn))
    ex1 <- dplyr::left_join(exp_pc %>% tidyr::gather(k, v, -num, -g) %>% dplyr::mutate(k = as.numeric(k)),
                            exn_exp %>% tidyr::gather(k, v1, -num, -g) %>% dplyr::mutate(k = as.numeric(k)), by = c("num", "g", "k"))
    ex1 <- ex1 %>% dplyr::filter(!is.na(v1)) %>% dplyr:: mutate(num1 = round(num/v1*100,0)) %>% dplyr::filter(num1 <101) %>%
      dplyr::group_by(k,g, num1) %>% dplyr::summarise (v = sum(v), .groups= "drop") %>% tidyr::spread(k, v)
    ex2 <- dplyr::left_join(data.frame(num1 = rep(0:100, g), g= rep(1:g, each=101)), ex1, by = c("num1","g")) %>%
      replace(is.na(.), 0)

    sum_exp_pc <- 0
    for (i in 1:g) {sum_exp_pc <- sum_exp_pc + ex2 [ex2$g == i, 3:ncol(ex2)]}

    obs_pc <- dplyr::bind_cols(a %>% dplyr::select(all_of(nm.n)) %>% tidyr::gather (k, v), a %>%
                                 dplyr::select(all_of(nm.N)) %>% tidyr::gather (k, v1) %>% dplyr::select(v1)) %>%
      dplyr::mutate(v = round(v/v1*100,0)) %>% dplyr::group_by (v) %>% dplyr::count () %>% dplyr::filter(!is.na(v)) %>%
      dplyr::rename(obs = n) %>% dplyr::ungroup ()

    b_pc= dplyr::left_join(data.frame(num= 0:100, exp = rowSums(sum_exp_pc)), obs_pc, by = c("num"= "v")) %>%
      replace(is.na(.), 0)

    n_gp <- sum(!is.na(b_pc$obs))
    for (sz in seq(5,ceiling(n_gp/5)*5,5)) {
      b_temp <- compress(b_pc, "obs", "exp", size=sz)
      if (sz == 5) {
        b_pc <- dplyr::bind_cols(b_pc, b_temp)
        if (sum(!is.na(b_temp [ , "obsm__5"])) < 22) {break}
      }
      if (sum(!is.na(b_temp [ , paste0("obsm__",sz)])) < 22) {
        b_pc <- dplyr::bind_cols(b_pc, b_temp)
        break}
    }

    graph [5:6] <- cat_graph (gph= b_pc,  xtitle = "Percentage with characteristic per group", ytitle = "Number of trial groups*variables",
                              size = sz, sfx = "", text = list(length(unique(a$study)), paste(sum(b_pc$obs), "groups*variables")),
                              fn = "cat_all", ti = "Y", top = "Y", t= title)

    rr_g [5:6] <- rr_graph(rr = b_pc, xtitle =  "Percentage with characteristic per group", ytitle = "Observed/Expected ratio",
                           size =sz, sfx= "", text = list(length(unique(a$study)), sum(b_pc$obs)),
                           fn = "cat_all", ti = "Y", top = "yes", t=title)

    #Pb2/3
    if(vb == "y") {setTxtProgressBar(pb, getTxtProgressBar (pb)+ inc)
      cat("\nCalculating data for differences betweeen groups and making graphs ...\n")
      cat("\nThis usually takes the most time ...\n\n")}

    #repeat for 2 arm studies only;
    a.2 <- a %>% dplyr::filter (group == 2) %>% dplyr::select(study, var, all_of(nm.n) [1:2], all_of(nm.N) [1:2])
    exp_p2 <- exp_p [, a$group == 2]

    #delete disparate rows
    if (tolower(substr(del.disparate,1,1)) == "y") {
      dd_f <- with(a.2, !(N1/N2 < 0.8 | N1/N2 >1.2 | N2/N1 < 0.8 | N2/N1 > 1.2))
      a.2 <- a.2 %>% dplyr::filter(dd_f)
      exp_p2 <- exp_p2 [, dd_f, drop= FALSE]
      rm (dd_f)}

    #expected differences
    #calculate probability of group numbers differing by 0, 1, 2 etc# this usually takes the longest amount of time.
    exp_p3 <- expect_p_diff (a.2, exp_p2, n, nm.N)

    #count observed diffs
    a.2$obs = abs(a.2 [, nm.n [[1]]] - a.2[ , nm.n [[2]]])
    obs <- a.2 %>% dplyr::count(obs) %>% dplyr::rename (num= obs, obs= n)
    b_df <- dplyr::left_join(data.frame(num= 0: (n-1), exp= rowSums(exp_p3)), obs, by = "num") %>% replace(is.na(.), 0)

    n_gp_df <- sum(!is.na(b_df$obs))
    for (sz_df in seq(5,ceiling(n_gp_df/5)*5,5)) {
      b_df_temp <- compress(b_df, "obs", "exp", size=sz_df)
      if (sz_df == 5) {
        b_df <- dplyr::bind_cols (b_df, b_df_temp)
        if (sum(!is.na(b_df_temp [ , "obsm__5"])) < 22) {break}
      }
      if (sum(!is.na(b_df_temp [ , paste0("obsm__",sz_df)])) < 22) {
        b_df <- dplyr::bind_cols (b_df, b_df_temp)
        break}
    }

    graph [3:4] <-  cat_graph (gph= b_df,  xtitle = "Differences between trial groups",
                               ytitle = "Number of variables", size = sz_df, sfx = "",
                               text = list(length(unique(a.2$study)), paste(sum(b_df$obs), "variables")),
                               fn = "cat_all", ti = "n", top = "n", t= title)

    rr_g [3:4] <- rr_graph(rr = b_df ,xtitle = "Differences between trial groups",
                           ytitle = "Observed/Expected ratio", size =sz_df, sfx= "",
                           text = list(length(unique(a.2$study)), sum(b_df$obs)),
                           fn = "cat_all", ti = "n", top = "n", t= title)

    graphs <- ggpubr::ggarrange(graph [[2]], graph [[4]], ncol = 1, nrow= 3, align = "hv")
    rr_graphs <- ggpubr::ggarrange(rr_g [[2]], rr_g [[4]], ncol = 1, nrow= 2, align = "hv")

    both_graphs <- list ()
    both_graphs [[1]] <- ggarrange(graph [[2]], rr_g [[2]], graph [[4]], rr_g [[4]],
                                   ncol = 2, nrow= 3, align = "hv")
    both_graphs [[2]] <- ggarrange(graph [[1]], rr_g [[2]], ncol = 2, nrow= 1, align = "hv")
    both_graphs [[3]] <- ggarrange(graph [[3]], rr_g [[4]], ncol = 2, nrow= 1, align = "hv")
    both_graphs [[4]] <- ggarrange(graph [[5]], rr_g [[6]], ncol = 2, nrow= 1, align = "hv")
    both_graphs [[5]] <- ggarrange(graph [[1]], rr_g [[2]], graph [[5]], rr_g [[6]], ncol = 2, nrow= 2, align = "hv")
    both_graphs [[6]] <- ggarrange(graph [[5]], rr_g [[6]], graph [[4]], rr_g [[4]], ncol = 2, nrow= 2, align = "hv")


    results2 <- list(cat_all_graph= both_graphs [[1]], cat_all_graph_pc= both_graphs [[6]],
                     cat_all_data_abs = b, cat_all_data_df = b_df, cat_all_dataset_abs = a, cat_all_dataset_df = a.2,
                     cat_all_all_graphs =  list(abs = both_graphs [[2]], df = both_graphs [[3]], pc = both_graphs [[4]],
                                                all_pc = both_graphs [[5]], ind_graphs = c(graph[c(1,3,5)], rr_g [c(1,3,5)])))

   if (vb == "y") {print(both_graphs [[1]])}
  }


results3 <- c(results, results2)

if (vb == "y") {close(pb)}

return (results3)

}
Any scripts or data that you put into this service are public.
reappraised documentation built on Oct. 6, 2023, 9:08 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
reappraised
Statistical Tools for Assessing Publication Integrity of Groups of Trials

R/cat_all.R
In reappraised: Statistical Tools for Assessing Publication Integrity of Groups of Trials

Try the reappraised package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

reappraised Statistical Tools for Assessing Publication Integrity of Groups of Trials

R/cat_all.R In reappraised: Statistical Tools for Assessing Publication Integrity of Groups of Trials

Try the reappraised package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

reappraised
Statistical Tools for Assessing Publication Integrity of Groups of Trials

R/cat_all.R
In reappraised: Statistical Tools for Assessing Publication Integrity of Groups of Trials