learnSampling: Tools to help learn sampling

Documented in get_d_rep_samples get_d_rep_samples_from_population_data

#' Calculate a number of sample d-values (unbiased) based on a specified (infinite) population correlation.
#' @param pop.d Population d-value
#' @param cell.n Cell size for both cells for all samples. If you use two values (e.g., c(20, 40), these represent -3/+3 SD for variable sample sizes
#' @param number.of.samples Number of samples to obtain
#' @param number.of.decimals Number of decimals to report in returned data frame
#' @param alternative indicates type of alternative hypothesis (e.g., "two.sided") for t.test
#' @param seed.value random number seed value
#' @return Data frame with sample d-values
#' @examples
#' get_d_rep_samples(pop.d = .35, cell.n = 100)
#' @export
get_d_rep_samples <- function(pop.d = NA, cell.n = NA, number.of.samples = 10,
                              number.of.decimals = 3, alternative = "two.sided", seed.value = 1) {

     set.seed(seed.value)

     if (is.na(pop.d)) {return()}


     if (length(cell.n) == 1) {
          ns.for.cell1 = rep(cell.n, number.of.samples)
     } else {
          # if a range is specified in the cell.n then sample sizes are random based on that range
          # the range is -3 to +3 SD for sample size
          cell.n <- sort(cell.n)
          cell.min <- cell.n[1]
          cell.max <- cell.n[2]
          cell.sd <- (cell.max  - cell.min) / 6
          cell.mean <- mean(cell.min, cell.max)
          ns.for.cell1 = abs(round(rnorm(mean = cell.mean, sd = cell.sd, n = number.of.samples)))
     }


     dfs <- rep(NA,number.of.samples)
     ts <- rep(NA,number.of.samples)
     ps <- rep(NA,number.of.samples)
     ds <- rep(NA,number.of.samples)
     d2s <- rep(NA,number.of.samples)
     LLs <- rep(NA,number.of.samples)
     ULs <- rep(NA,number.of.samples)
     in_interval <- rep(NA,number.of.samples)

     for (i in 1:number.of.samples) {
          cell1.n <- ns.for.cell1[i]

          group1.data <- rnorm(cell1.n) + pop.d
          tout <- t.test(group1.data, alternative = alternative)

          dfs[i] <- round(tout$parameter, number.of.decimals)
          ts[i] <- round(tout$statistic, number.of.decimals)
          ps[i] <- round(tout$p.value, number.of.decimals)

          ciinfo <- MBESS::ci.sm(ncp = ts[i], N = cell1.n)
          ds[i] <- round(ciinfo$Standardized.Mean, number.of.decimals)

          in_interval[i] <- is_value_in_interval(pop.d, c(ciinfo$Lower.Conf.Limit.Standardized.Mean, ciinfo$Upper.Conf.Limit.Standardized.Mean))

          LLs[i] <- round(ciinfo$Lower.Conf.Limit.Standardized.Mean, number.of.decimals)
          ULs[i] <- round(ciinfo$Upper.Conf.Limit.Standardized.Mean, number.of.decimals)
     }
     xx<-1:number.of.samples
     sample.number <- xx
     data.out <- data.frame(sample.number, pop.d = pop.d, cell1.n = ns.for.cell1, d = ds, LL = LLs, UL = ULs, ci.captured.pop.d = in_interval, t = ts, df = dfs, p = ps)
     rownames(data.out) <- NULL

     return(data.out)
}



#' Calculate a number of sample correlations based on a specified population correlation
#' @param pop1 Population data using first column of data frame
#' @param pop2 Population data using first column of data frame
#' @param cell.n Sample size for all samples (range of sample size if two specified)
#' @param number.of.samples Number of samples to obtain
#' @param number.of.decimals Number of decimals to report in returned data frame
#' @param seed.value random number seed value
#' @return Data frame with sample correlations
#' @export
get_d_rep_samples_from_population_data <- function(pop1 = NULL, pop2 = NULL, cell.n,
                                                   number.of.samples = 10, number.of.decimals = 2) {

        set.seed(2)
        if (is.null(pop1)) {return()}

        if (length(cell.n) == 1) {
                ns.for.cell1 = rep(cell.n, number.of.samples)
                ns.for.cell2 = rep(cell.n, number.of.samples)
        }


        m1s <- rep(NA,number.of.samples)
        m2s <- rep(NA,number.of.samples)
        v1s <- rep(NA,number.of.samples)
        v2s <- rep(NA,number.of.samples)
        diffs <- rep(NA,number.of.samples)
        sps <- rep(NA,number.of.samples)
        ds <- rep(NA,number.of.samples)
        ds_unbiased <- rep(NA,number.of.samples)

        for (i in 1:number.of.samples) {
                cell1.n <- ns.for.cell1[i]
                cell2.n <- ns.for.cell2[i]

                group1.data <- sample(pop1, size = cell1.n)
                group2.data <- sample(pop2, size = cell2.n)
                #group1.data <- rnorm(cell1.n) + pop.d
                #group2.data <- rnorm(cell2.n)

                n1 <- cell1.n
                n2 <- cell2.n
                m1 <- mean(group1.data)
                m2 <- mean(group2.data)
                diff <- m1 - m2
                v1 <- var(group1.data)
                v2 <- var(group2.data)
                vp <- ((n1-1)*v1 + (n2-1)*v2) / (n1 + n2 - 2)
                sp <- sqrt(vp)
                d <- (m1 - m2)/sp
                cf <- 1 - 3 / (4 * (n1 + n2) -9)
                m1s[i] <- m1
                m2s[i] <- m2
                v1s[i] <- v1
                v2s[i] <- v2
                diffs[i] <- diff
                sps[i] <- sp

                ds[i] <-d
                ds_unbiased[i] <- d*cf
        }
        xx<-1:number.of.samples
        sample.number <- xx

        data.out <- data.frame(n_per_cell = ns.for.cell1,
                               mean1 = m1s,
                               var1_n_1= v1s,
                               mean2 = m2s,
                               var2_n_1 = v2s,
                               d = ds,
                               d_unbiased = ds_unbiased)

        data.out <- round(data.out,2)
        rownames(data.out) <- NULL

        data.out <- tibble::as_tibble(data.out)
        return(data.out)
}


# get_d_samples_from_population_data <- function(pop1 = NULL, pop2 = NULL, cell.n, number.of.samples = 10, number.of.decimals = 2, var.equal = TRUE, alternative = "two.sided") {
#         if (is.null(pop1)) {return()}
#
#         pop.d <- calc_pop_d(pop1,pop2)
#
#         if (length(cell.n) == 1) {
#                 ns.for.cell1 = rep(cell.n, number.of.samples)
#                 ns.for.cell2 = rep(cell.n, number.of.samples)
#         } else {
#                 cell.n <- sort(cell.n)
#                 cell.min <- cell.n[1]
#                 cell.max <- cell.n[2]
#                 cell.sd <- (cell.max  - cell.min) / 6
#                 cell.mean <- mean(cell.min, cell.max)
#                 ns.for.cell1 = abs(round(rnorm(mean = cell.mean, sd = cell.sd, n = number.of.samples)))
#                 ns.for.cell2 = ns.for.cell1
#         }
#
#
#         dfs <- rep(NA,number.of.samples)
#         ts <- rep(NA,number.of.samples)
#         ps <- rep(NA,number.of.samples)
#         ds <- rep(NA,number.of.samples)
#         d2s <- rep(NA,number.of.samples)
#         LLs <- rep(NA,number.of.samples)
#         ULs <- rep(NA,number.of.samples)
#         in_interval <- rep(NA,number.of.samples)
#
#         for (i in 1:number.of.samples) {
#                 cell1.n <- ns.for.cell1[i]
#                 cell2.n <- ns.for.cell2[i]
#
#                 group1.data <- dplyr::sample_n(pop1, size = cell1.n)[,1]
#                 group2.data <- dplyr::sample_n(pop2, size = cell2.n)[,1]
#                 #group1.data <- rnorm(cell1.n) + pop.d
#                 #group2.data <- rnorm(cell2.n)
#
#                 tout <- t.test(group1.data, group2.data, var.equal = var.equal, alternative = alternative)
#
#                 dfs[i] <- round(tout$parameter, number.of.decimals)
#                 ts[i] <- round(tout$statistic, number.of.decimals)
#                 ps[i] <- round(tout$p.value, number.of.decimals)
#
#                 ciinfo <- MBESS::ci.smd(ncp = ts[i], n.1 = cell1.n, n.2 = cell2.n)
#                 ds[i] <- round(ciinfo$smd, number.of.decimals)
#
#                 in_interval[i] <- is_value_in_interval(pop.d, c(ciinfo$Lower.Conf.Limit.smd, ciinfo$Upper.Conf.Limit.smd))
#
#                 LLs[i] <- round(ciinfo$Lower.Conf.Limit.smd, number.of.decimals)
#                 ULs[i] <- round(ciinfo$Upper.Conf.Limit.smd, number.of.decimals)
#         }
#         xx<-1:number.of.samples
#         sample.number <- xx
#         # data.out <- data.frame(sample.number,
#         #                        pop.d = pop.d,
#         #                        cell1.n = ns.for.cell1,
#         #                        cell2.n = ns.for.cell2,
#         #                        d = ds,
#         #                        LL = LLs,
#         #                        UL = ULs,
#         #                        ci.captured.pop.d = in_interval,
#         #                        t = ts,
#         #                        df = dfs,
#         #                        p = ps)
#         data.out <- data.frame(study = sample.number,
#                                pop.d = pop.d,
#                                cell1.n = ns.for.cell1,
#                                cell2.n = ns.for.cell2,
#                                d = ds)
#         rownames(data.out) <- NULL
#
#         return(data.out)
# }