bootstrap.dyads: Running bootstrapped dyad-ratios algorithms

Documented in bootstrapped.extraction

#' Bootstrap model for the Stimson dyad ratios algorithm.
#'
#' Runs bootstrap model for repeated estimation of variable loading scores in the dyad-ratios algorithm.
#'
#' @param data Define the data. Must be a dataframe or coercible object.
#' @param reps Define number of bootstrapped replications. Default is 500.
#' @param draw Define the proportion of items to be dropped in each bootstrap replication. Default is 0.3.
#' @param varname Define the variable name indicating the input series (as in extract function)
#' @param output Define the object name that extract function results are stores in.
#' @param print Logical. Define whether or not the function should display active count of replication progress in the console.
#'
#' @details This function runs the dyad-ratios bootstrap model, which takes the results of a single dyad-ratios estimation outcome and produces bootstrapped estimations of the variable loading scores.
#'
#' The bootstrap model removes a pre-defined proportion of random variables for each estimation, extracts the variable loading scores, and averages them across all trials to create bootstrapped-mean estimations of item validity.
#'
#' The model inherits all formula arguments from the extract function output. Note that the extract function must be as defined in this package code, not the original Extract.r.zip file hosted on Stimson's website.
#'
#' Assigning the output to an object creates a list of five items, including a dataframe called 'Full Results' which contrains the bootstrapped mean loading score, single-run estimated loading score, and the difference between the two for each variable input. Larger figures indicate bigger over-estimation of the loading score in the single-run estimation (a lower bootstrapped mean).
#'
#' It is strongly suggested (but not essential) for the speed of the bootstrap estimation that the options print and log in the extract function (to produce the results object) are set to FALSE.
#'
#' @export bootstrapped.extraction

bootstrapped.extraction <- function(data,reps=500,draw=0.3,varname,output,print=FALSE){

  ### Work out the levels and lengths of objects, build base objects for later filling, assign call to object

  set.seed(12345)

  d <- as.data.frame(data)

  d[,varname] <- as.factor(d[,varname])

  items <- levels(d[,varname])

  L <- (length(items)-round((length(items)*draw)))
  selections <- as.data.frame(matrix(nrow=L, ncol=1))

  formula <- output[["formula"]]

  ### Randomly select items and make a database with only randomly drawn items
  L2 <- length(levels(d[,varname]))
  loadings <- as.data.frame(levels(d[,varname]))
  names(loadings) <- "Variable"

  for(i in 1:reps){
    x <- sample(items, size = L)
    selections[i] <- x

  }

  ### Replace formula call to point at subsets - this is a crazy workaround, but I've spent hours trying to
  # figure out how to replace data call from extract and just cannot do it any other way than this (without introducing)
  # characters into the list

  # make sub_data and run a regression to get a formula with a data call to 'sub data'
  sub_data <- as.data.frame(cbind(c(1,2,3),c(3,5,4)))

  sum <- summary(lm(data=sub_data, V1 ~ V2))

  L1 <- as.list(sum$call)

  # get call from extract as a list
  formula_list <- as.list(output[["formula"]])

  # replace call to original data with 'sub data'
  formula_list[[2]] <- L1[[3]]

  # convert back to formula
  formula <- as.call(formula_list)


  # set counter for reps
  ticker <- 0


  ### Run the original Stimson code on each database, store loading scores, merge into loadings object

  for(i in 1:length(selections)){

    sub_data <- d[d[,varname] %in% selections[,i], ]

    sub_result <- eval(formula)

    results_new <- as.data.frame(cbind(sub_result[["varname"]], sub_result[["loadings1"]]))

    names(results_new) <- c("Variable", paste("LoadingScoreTrial", i, sep=""))

    loadings <- merge(loadings, results_new, by="Variable", all.x = TRUE)

    if(print==TRUE){

      ticker <- ticker+1
      cat(paste(ticker, "..."))

      if(ticker == reps){
        cat("Done!")
      }

    }

  }


  ### Calculate mean of scores for each item and rank

  ## change scores back to numeric
  for(i in 2:length(loadings)){

    loadings[,i] <- as.numeric(as.character(loadings[,i]))

  }

  ## find means
  loadings$mean <- rowMeans(loadings[,2:length(loadings)], na.rm=TRUE)

  loads2 <- as.matrix(loadings[,2:length(loadings)])
  loadings$sd <- matrixStats::rowSds(loads2, na.rm=TRUE)

  ## extract full results
  results_full <- as.data.frame(cbind(output[["varname"]], output[["loadings1"]]))

  ## change loadings back to numeric
  results_full$V2 <- as.numeric(as.character(results_full$V2))

  ## rename object
  names(results_full) <- c("Variable", "FullDataScore")

  ## combine objects
  loadings <- merge(loadings, results_full, by="Variable")

  ## Find distance from mean for each item
  loadings$Difference <- loadings$FullDataScore-loadings$mean

  ## Subset to summarisable object
  FullResults <- loadings[,c("Variable", "mean", "sd", "FullDataScore", "Difference")]

  names(FullResults) <- c("Variable", "Bootstrapped Mean", "Standard Deviation", "Single-Run Estimate", "Difference")

  FullResults <- FullResults[order(FullResults$`Bootstrapped Mean`, decreasing=TRUE),]

  ## Construct list object
  ListResults <- as.list("Bootstrap Results")

  ListResults[["Full Results"]] <- FullResults

  ListResults[["Input Data"]] <- d

  ListResults[["Variable Name"]]  <- varname

  ListResults[["Formula"]] <- formula

  ListResults[["Output"]] <- output


  return(ListResults)

}