R/transformation.R

Defines functions transformation_check

Documented in transformation_check

# This function first tests whether or not the data need to me be transformed
# using a Shapiro-Wilks Test. If this test has a significant p-value, then
# a Box COx transformation is conducted.

# This corresponds the top 2 boxes on the left of the flowchart. Note that is
# is only done for the basic model (vehicle and doses)

# Add in the linear model for shapiro wilk test, regress out the treatment effect
#' transformation_check
#' @export
transformation_check <- function(analysis_data) {
  # browser()
  analysis_data <- analysis_data %>%
    mutate(
      Response = as.numeric(Response),
      Baseline = as.numeric(Baseline)
    )
  # Compute the Subjects means to do Shapiro wilks test for only the basic model
  sub_mean <- analysis_data %>%
    select(-Time) %>%
    pivot_longer(cols = c(Response, Baseline), names_to = "Time", values_to = "Response") %>%
    mutate(Response = as.numeric(Response)) %>%
    filter(basic_model) %>%
    group_by(SubjectID, `Technical Replicate ID`, TreatmentNew) %>%
    summarize(sub_mean = mean(Response))
  model <- lm(data = sub_mean, formula = 0 + sub_mean ~ TreatmentNew)
  # Conduct the Shapiro-Wilk Test. The aim of this step is to avoid unnecessary
  # data transformations.
  shapiro_wilk <- shapiro.test(model$residuals)$p.value

  if (shapiro_wilk < 0.05) {
    # Subject means are not normally distributed, so we seek a transformation
    # that leads to normally distributed data

    # Dependent variable must be positive
    bc_data <- analysis_data %>%
      select(-Time) %>%
      pivot_longer(cols = c(Response, Baseline), names_to = "Time", values_to = "Response")
    parameter <- 0 # how much to shift the data
    if (min(bc_data$Response) <= 0) {
      # analysis_data$Response!=0 ensures that the data will be shifted
      # even if the minimum value is 0, and the 1.1 aims to make the shift relatively
      # small. In other words, the parameter won't be large compared to the scale of the
      # data
      parameter <- 1.1 * abs(min(as.numeric(bc_data$Response[bc_data$Response != 0])))
    }



    bc_data <- bc_data %>%
      mutate(
        Response = as.numeric(Response),
        Response = Response + parameter
      )

    # Coonduct the box cox transformation
    bc <- MASS::boxcox(Response ~ TreatmentNew * Time,
      data = bc_data,
      lambda = seq(-1, 2, 1 / 100)
    )
    # Put boxcox summary data in a data.frame. We are only considering common
    # transformations i.e. inverse (-1), inverse sqrt (-1/2), log (0), sqrt (1/2),
    # identity (1), and squared (2) transformation. We select the tranformation
    # that leads to the largest log liklihood value.
    bc_loglik <- data.frame(lambda = bc$x, log_lik = bc$y) %>%
      filter(lambda %in% c(-1, -1 / 2, 0, 1 / 2, 1, 2)) %>%
      arrange(log_lik) %>%
      tail(1)

    power <- bc_loglik$lambda
    message(paste("A power of", power, "was used to transform the data"))

    if (power == 0) {
      analysis_data <- analysis_data %>%
        mutate(
          Baseline_Transformed = log(as.numeric(Baseline)) + parameter,
          Response_Transformed = log(as.numeric(Response) + parameter)
        )
    } else if (power == 1) {
      analysis_data <- analysis_data %>%
        mutate(
          Baseline_Transformed = as.numeric(Baseline),
          Response_Transformed = as.numeric(Response)
        )
    } else {
      analysis_data <- analysis_data %>%
        mutate(
          Baseline_Transformed = (as.numeric(Baseline) + parameter)^power,
          Response_Transformed = (as.numeric(Response) + parameter)^power
        )
    }

    # Conduct Shapiro-Wilks Test on transformed data to see if the subjects
    # means are normally distributed on the transformed scale.Again, this is only,
    # with the basic model

    sub_mean2 <- analysis_data %>%
      select(-Time) %>%
      pivot_longer(cols = c(Response_Transformed, Baseline_Transformed), names_to = "Time", values_to = "Response_Transformed") %>%
      filter(basic_model) %>%
      group_by(SubjectID, `Technical Replicate ID`, TreatmentNew) %>%
      summarize(sub_mean = mean(Response_Transformed))

    model2 <- lm(data = sub_mean2, formula = 0 + sub_mean ~ TreatmentNew)
    shapiro_wilk2 <- shapiro.test(model2$residuals)$p.value
    if (shapiro_wilk2 < 0.05) {
      stop("Contact Statisician (Box Cox Tranformation used, but subject
     means are not normally distributed)")
    }
  } else {
    # The subject means were already normally distributed this there a transformation
    # is not needed
    message("No transformation required")
    # Regardless if a transformation was used, Response_Transformed will be on the
    # variable of interest for the rest of the analysis
    analysis_data <- analysis_data %>%
      mutate(
        Response_Transformed = Response,
        Baseline_Transformed = Baseline
      )
    power <- 1
  }

  if (!all(is.na(analysis_data$Baseline))) {
    analysis_data <- analysis_data %>%
      select(-`Technical Replicate ID`) %>%
      group_by(SubjectID, Time, Type, Treatment, TypeNew, TreatmentNew, basic_model) %>%
      summarise(across(c(Response_Transformed, Baseline_Transformed, Baseline, Response), mean)) %>%
      ungroup() %>%
      mutate(Response_Transformed_bc = Response_Transformed - Baseline_Transformed)
  } else {
    analysis_data <- analysis_data %>%
      select(-`Technical Replicate ID`) %>%
      group_by(Type, Treatment, SubjectID, Dose, TypeNew, TreatmentNew, basic_model, Time) %>%
      group_by(SubjectID, Time, Type, Treatment, TypeNew, TreatmentNew, basic_model) %>%
      summarise(across(c(Response_Transformed, Baseline_Transformed, Baseline, Response), mean)) %>%
      ungroup()
  }


  return(list(transformed = analysis_data, bc_transformation = power))
}
fdrennan/test documentation built on April 23, 2022, 12:37 a.m.