# R/baseline_pvalues_cont.R

#' Compares observed and expected distribution of p-values for continuous variables
#'
#' Creates plots of calculated p-value distribution and AUC (area under curve)\cr
#'
#' Reference data are from (Carlisle 2017, Bolland 2020)\cr
#' Carlisle JB. Data fabrication and other reasons for non-random sampling in 5087 randomised, controlled trials in anaesthetic and general medical journals. Anaesthesia 2017;72:944-52.\cr
#' Bolland MJ, Gamble GD, Grey A, Avenell A. Empirically generated reference proportions for baseline p values from rounded summary statistics. Anaesthesia 2020;75:1685-1687.\cr
#' See also Bolland MJ, Gamble GD, Avenell A, Grey A, Lumley T. Baseline P value distributions in randomized trials were uniform for continuous but not categorical variables. J Clin Epidemiol 2019;112:67-76.\cr
#' and Bolland MJ, Gamble GD, Avenell A, Grey A. Rounding, but not randomization method, non-normality, or correlation, affected baseline P-value distributions in randomized trials. J Clin Epidemiol 2019;110:50-62.
#'
#'
#' Returns a list containing 4 objects and (if verbose = TRUE) prints the plot pval_cont_calculated_pvalues
#'
#' @param df data frame generated by the load_clean function
#' @param btsp number of bootstrap repeats used to generate the 95% confidence interval around the AUC
#' @param title optional title for plots
#' @param verbose TRUE or FALSE; indicates whether to show the progress bar and comments and to print the plot
#'
#' @return list containing 4 objects as described
#'\itemize{
#'   \item pval_cont_calculated_pvalues = plots of calculated p-value distribution and AUC
#'   \item pval_cont_reported_pvalues = plots of reported p-value distribution and AUC (if p-values were reported)
#'   \item pval_cont_ft_diff_calc_rep_p = flextable of the distribution of differences between calculated and reported p-values
#'   \item all_results = list containing
#'      \itemize{
#'         \item pval_cont_baseline_pvalues_data = data frame of all results used in calculations
#'         \item pval_cont_diff_calc_rep_p = data frame of differences between calculated and reported p-values
#'         \item pval_cont_reported_pvalues = plot of reported p-value distribution
#'         \item pval_cont_auc_reported_pvalues = AUC of reported p-values
#'         \item pval_cont_calculated_pvalues = plot of calculated p-value distribution
#'         \item pval_cont_auc_calculated_pvalues = AUC of calculated p-values
#'         }}
#'
#' @import ggplot2
#' @importFrom magrittr %>%
#' @importFrom utils getTxtProgressBar setTxtProgressBar txtProgressBar
#' @importFrom dplyr select group_by left_join mutate count filter bind_cols rename full_join
#' @importFrom tidyr gather spread separate
#' @importFrom broom glance
#' @importFrom stats lm
#' @importFrom boot boot boot.ci
#' @importFrom ggpubr ggarrange
#' @importFrom flextable font fontsize valign set_table_properties
#' @export pval_cont_fn
#'
#'
#' @examples
#' # load example data
#' pval_cont_data <- load_clean(import= "no", file.cont = "SI_pvals_cont", pval_cont= "yes",
#' format.cont = "wide")$pval_cont_data
#'
#' \donttest{
#' # run function (takes only a few seconds)
#' pval_cont_fn(btsp=100)$pval_cont_calculated_pvalues
#'
#' # to import an excel spreadsheet (modify using local path,
#' # file and sheet name, range, and format):
#'
#' # get path for example files
#' path <- system.file("extdata", "reappraised_examples.xlsx", package = "reappraised",
#'                     mustWork = TRUE)
#' # delete file name from path
#' path <- sub("/[^/]+$", "", path)
#'
#' # load data
#' pval_cont_data <- load_clean(import= "yes", pval_cont = "yes", dir = path,
#'      file.name.cont = "reappraised_examples.xlsx", sheet.name.cont = "SI_pvals_cont",
#'      range.name.cont = "A1:O51", format.cont = "wide")$pval_cont_data}
#'
#' @md

#function for baseline p-values and AUC ---------------------------------
pval_cont_fn <- function (df= pval_cont_data, btsp = 500, title = "", verbose = TRUE) {

  if (isTRUE(verbose) | tolower(substr(verbose,1,1)) == "t") {vb <- "y"
  } else {vb <- "n"}

  if (vb == "y") {pb = txtProgressBar(min=0, max=100, style = 2)}

  #convert to long format
  a <- df

  if (! "data_type" %in% colnames(a)) {a$data_type <- 0}
  if (is.na(a$data_type [1]) | a$data_type [1] != "pval_cont") {
    stop_mb(paste0(deparse(substitute(df)), " is either incorrect data for this function or ",
                       deparse(substitute(df)), "$data_type != 'pval_cont'"))}

  #Pb1
  if (vb == "y") {
  setTxtProgressBar(pb,10)
  cat("Preparing data ...")
  }

  #add separator
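#summary-statistic columns (group counts, means, SDs) get an underscore inserted,
#e.g. a column such as "n1" becomes "n_1", so tidyr::separate below can split the
#statistic letter from the group number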
  names <- colnames(a) [substr(colnames(a),1,1) %in% c("n","m") |
                          (substr(colnames(a),1,1) == "s" & colnames (a) != "study")]
  colnames (a) [colnames (a) %in% names] <- paste0(substr(names,1,1),"_", gsub("\\D", "", names))

  b <- a %>%
    dplyr::select(-c("p","p_num","data_type")) %>%
    tidyr::gather(v, value, -c("study","var")) %>%
    tidyr::separate(v,c("var1", "group")) %>%
    tidyr::spread(var1, value)

  #delete missing data
  b <- b [! (is.na(b$m) | is.na(b$s) | is.na(b$n)),]

  #now generate dataset for analysis
  #uses anova for summary data
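  #for each group with mean m, SD s and size n, build a pseudo-sample of n-1 identical
  #values xis = m + s/sqrt(n) plus one balancing value xns = n*m - (n-1)*xis;
  #this pseudo-sample has exactly the reported mean and SD, so a one-way ANOVA on it
  #matches the p-value computed directly from the summary statistics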
  xis <-  b$m + sqrt((b$s**2)/b$n)
  xns <-  b$n*b$m - (b$n-1)*xis;
  x1 <- seq(1,length(xis)*2,by=2)
  x2 <- seq(2,length(xis)*2,by=2)
  y <- f <- study <- var <- grp <- vector(mode="numeric", length = length(xis)*2)
  y[x1] <- xis
  y[x2] <- xns
  f[x1] <- b$n-1
  f[x2] <- 1
  study [x1] <- b$study
  study [x2] <- b$study
  var [x1] <- b$var
  var [x2] <- b$var
  grp [x1] <- b$group
  grp [x2] <- b$group
  z <- data.frame(y,f,study,var, grp)
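  #repeat each row f times so that xis appears n-1 times per group and xns once,
  #giving the full pseudo-sample for each study/variable/group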
  z.expand <- as.data.frame(lapply(z, rep, z$f))

  #Pb2
  if (vb == "y") {
  setTxtProgressBar(pb,25)
  cat("Calculating p-values ... this usually takes the longest time")
  }

  #lm for each variable
  fitted_models <- z.expand %>% dplyr::group_by(study,var) %>%
    dplyr::group_modify(~ broom::glance(lm(y ~ as.factor(grp), data = .)))

  #now divide by deciles and plot graph
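  #cut() assigns each calculated p-value to a decile interval ((0,0.1], ..., (0.9,1]);
  #table() tallies the counts per decile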
  fitted_models$p <- cut(fitted_models$p.value,c(seq(0,1,0.1)))
  p <- as.data.frame(table (fitted_models$p))

  #graph for calculated p-values
  labels <- list(c("Calculated from summary data"),
                 paste(length(unique(b$study)), "trials"),
                 paste(length(fitted_models$p.value), "variables"))

  p.g <- pval_graph (x = p, round = "y", l= labels, t=title)

  pval <- p.g [[2]]
  p.g <- p.g [[1]]

  # graph for original p
  if(sum(!is.na(a$p))==0) {p.orig.g <- "There are no reported p-values"
  } else {
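    #reported p-values given as "<x" have no exact numeric value; 0.0001 is used as a stand-in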

    a$p_num <- ifelse(substring(a$p,1,1) == "<", 0.0001, a$p_num)
    a$p1 <- cut(a$p_num,breaks =seq(0,1,0.1))

    p.orig <- as.data.frame(table (a$p1))

    labels.orig <- list(c("Reported p-values"),
                        paste(length(unique(a$study[!is.na(a$p_num)])), "trials"),
                        paste(sum(!is.na(a$p_num)), "variables"))
    p.orig.g <- pval_graph (x=p.orig, round= "n", l = labels.orig, t= title)

    pval.orig <- p.orig.g [[2]]
    p.orig.g <- p.orig.g [[1]]
  }

  #AUC functions
  #calculate AUC
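  #auc_calc builds the empirical CDF of the p-values (rank/N), prepends the origin (0,0),
  #and sums trapezoid areas (width = difference between consecutive p-values,
  #height = mean of the two consecutive CDF values); a uniform p-value distribution
  #gives an AUC of about 0.5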
  auc_calc <- function (x= auc, bt = "n") {
    auc <- x
    colnames(auc) <- "p" #p is x value
    auc$rank <- rank(auc$p, ties.method = "max")
    auc$y <- auc$rank/NROW(auc) # y is exp y value
    auc <- auc[order(auc$y),]
    auc <- rbind(c(0,0,0),auc)
    r1 <- 1:(NROW(auc)-1)
    r2 <- r1+1
    auc$h <- auc$w <- 0
    auc$w [2:(NROW(auc))] <- auc$p[r2] - auc$p[r1]
    auc$h [2:(NROW(auc))] <- (auc$y[r2] + auc$y[r1])/2
    auc$a <- auc$w *auc$h
    if (bt == "n") {
      auc$auc <- NA
      auc$auc [1] <- sum(auc$a)
      return (auc)
    } else {return (sum(auc$a))}
  }

  #helper function for boot
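  #boot calls this with the original vector and a set of resampled indices;
  #it returns the scalar AUC for that bootstrap replicate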
  auc_ci <- function(df, ind) {
    d <- data.frame(df [ind])
    sm <- auc_calc (x=d, bt = "y")
    return (sm)
  }

  auc <- data.frame(fitted_models$p.value)
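  #auc_calc() is called without arguments; its default x = auc picks up the data frame
  #created on the previous line from the enclosing function environment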
  auc <- auc_calc()

  #pb3
  if (vb == "y") {
  setTxtProgressBar(pb,75)
  cat("\nBootstrapping 95% CIs ...")
  cat("\nSpeed up by reducing value of btsp ....")
  cat("\nBut if btsp too small, warning message 'In norm. inter ...'\n")
  }

  R <- btsp # number of repeats
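  #resample the calculated p-values R times; the percentile bootstrap gives the 95% CI for the AUC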
  cis <- boot::boot(fitted_models$p.value, auc_ci , R)
  ci <- boot::boot.ci (cis, type="perc")
  lci <- ci$percent [4]; uci <- ci$percent [5]

  #prepare values for AUC graph

  labels <- append(labels, list(pval, paste("AUC", round(auc$auc [1],2), "95% CI", round (lci,2), "-",round(uci,2))))

  auc.g <- auc_grph(auc, labels, round = "y")

  #AUC original
  if(sum(!is.na(a$p))==0) {auc.orig.g <- "There are no reported p-values"
  } else {

    auc.orig <- data.frame(a$p_num [!is.na(a$p_num)])
    auc.orig <- auc_calc(auc.orig)

    cis <- boot::boot(a$p_num [!is.na(a$p_num)], auc_ci , R)
    ci <- boot::boot.ci (cis, type="perc")
    lci <- ci$percent [4]; uci <- ci$percent [5]

    labels.orig <- append(labels.orig, list(pval.orig,
                                            paste("AUC", round(auc.orig$auc [1],2), "95% CI", round (lci,2), "-", round(uci,2))))

    auc.orig.g <- auc_grph(auc.orig, labels.orig, round = "n")
  }

  #compare reported vs calculated p-values
  a <- dplyr::left_join (a, fitted_models[,c("study","var","p.value")], by = c("study", "var")) %>% as.data.frame()
  a$p.value <- round (a$p.value,3)

  if (sum (!is.na(a$p)) >0) {
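    #absolute differences between calculated and reported p-values, binned into
    #0.1-wide intervals and tabulated as counts and percentages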
    a$df <- round(abs(a$p.value- a$p_num),3)
    a$df.groups <- cut(a$df,seq(0,1,0.1))
    diff <- as.data.frame(table(a$df.groups))
    diff$prop <- diff$Freq/ sum(diff$Freq)
    diff1 <- diff
    diff[,1] <- paste(seq(0,.9,0.1)," - ", seq(0.1,1,0.1), "")
    diff[,2] <- paste(diff$Freq," (",round(diff$prop*100,0),")","")
    diff$prop <- NULL
    colnames (diff) <- c("Difference between \nreported and\n calculated p-values", "n (%)")
    colnames (diff1) [1] <- "difference_rep_calc_p-values"

    #make a flextable
    f <- flextable::flextable(diff)
    f <- flextable::font(f, fontname = "Arial")
    f <- flextable::fontsize(f, size = 8)
    f <- flextable::fontsize(f, part = "header", size = 8)
    f <- flextable::valign(f, i = 1, j = 2, valign = "bottom", part = "head")
    f <- flextable::set_table_properties(f, width = .3, layout = "autofit")

    #change names
    colnames (a) [colnames(a) == "df"] <- "difference_rep_calc_pvalues"
    a$df.groups <- NULL

    #reported pval graph
    rep_p <- ggpubr::ggarrange(p.orig.g, auc.orig.g, ncol = 1, nrow= 2, align = "hv")

  } else {
    diff1 <- rep_p <- f <- "There are no reported p-values"
  }

  #change names
  colnames (a) [colnames(a) == "p"] <- "reported_p-value"
  colnames (a) [colnames(a) == "p_num"] <- "reported_p-value_numeric"
  colnames (a) [colnames(a) == "p.value"] <- "p-value_calc_from_summary_stats"
  colnames (a) [colnames(a) == "var"] <- "variable"
  a$p1 <- NULL;

  calc_p <- ggpubr::ggarrange(p.g, auc.g, ncol = 1, nrow= 2, align = "hv")

  results <- list (pval_cont_calculated_pvalues = calc_p,
                   pval_cont_reported_pvalues = rep_p,
                   pval_cont_ft_diff_calc_rep_p = f,
                   all_results = list(pval_cont_baseline_pvalues_data = a, pval_cont_diff_calc_rep_p = diff1,
                                      pval_cont_reported_pvalues= p.orig.g, pval_cont_auc_reported_pvalues = auc.orig.g,
                                      pval_cont_calculated_pvalues = p.g, pval_cont_auc_calculated_pvalues= auc.g))

  if (vb == "y") {
    print(calc_p)
    close(pb)}

  return (results)

}
