overfitting.R
In rmcorr: Repeated Measures Correlation

## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = TRUE, tidy = FALSE)
options(width = 80)
library(knitr)
library(rmarkdown)
library(rmcorr) 
library(ggplot2) 
library(dplyr)
library(patchwork)
library(esc)
library(psych)

## ----eval = FALSE-------------------------------------------------------------
#  install.packages("dplyr")
#  require(dplyr)
#  
#  install.packages("esc")
#  require(esc)
#  
#  install.packages("patchwork")
#  require(patchwork)

## ----echo = FALSE-------------------------------------------------------------
N.Marusich2016 <- length(unique(marusich2016_exp2$Pair))
k.Marusich2016 <- length(unique(marusich2016_exp2$SourceReliablity))
total.obs.Marusich2016 <- length(marusich2016_exp2$Pair)

## -----------------------------------------------------------------------------
overfit.plot <- 
    ggplot(data = marusich2016_exp2, aes(x = MARS, y = HVT_capture)) +
    geom_point(aes(colour = factor(Pair))) +
    geom_smooth(method= "lm", level = 0.95) +
    coord_cartesian(xlim = c(2,4), ylim=c(0,30)) + 
    ylab("Capture Time (seconds)") + 
    theme_minimal() +
    theme(legend.position="none") 

marusich2016_avg <- marusich2016_exp2 %>%
                    group_by(Pair) %>%
                    summarize(Mean_MARS = mean(MARS),
                              Mean_HVT_capture = mean(HVT_capture))

average.plot <- 
    ggplot(data = marusich2016_avg, 
           aes(x = Mean_MARS, y = Mean_HVT_capture)) +
    geom_smooth(fullrange = TRUE, method= "lm", level = 0.95) +
    coord_cartesian(xlim = c(2,4), ylim=c(0,30)) +
    geom_point(aes(colour = factor(Pair))) +
    xlab("Mean MARS") +
    ylab("Mean Capture Time (seconds)") +
    scale_colour_discrete(name = "Dyad") +
    theme_minimal()

overfit.cor <- cor.test(marusich2016_exp2$MARS, marusich2016_exp2$HVT_capture)

average.cor <- cor.test(marusich2016_avg$Mean_MARS, marusich2016_avg$Mean_HVT_capture)

df.s <- rbind(overfit.cor$parameter, average.cor$parameter)

r.s  <- rbind(round(rbind(overfit.cor$estimate, average.cor$estimate), digits = 2)) 

CI.s <- formatC(rbind(overfit.cor$conf.int, 
          average.cor$conf.int), digits = 2,
          format = 'f')

p.vals <- rbind(round(overfit.cor$p.value, digits = 3), 
                prettyNum(average.cor$p.value, digits = 2, 
                          drop0trailing = TRUE))

overfit.plot + average.plot 

## -----------------------------------------------------------------------------
marusich.rmc <- rmcorr(participant = Pair, 
                       measure1 = MARS, 
                       measure2 = HVT_capture, 
                       data = marusich2016_exp2)
rmc.plot <- 
    ggplot(data = marusich2016_exp2, 
           aes(x = MARS, y = HVT_capture, group = factor(Pair), color = factor(Pair))) +
    geom_point(aes(colour = factor(Pair))) +
    geom_line(aes(y = marusich.rmc$model$fitted.values)) +
    theme_minimal() +
    scale_colour_discrete(name = "Dyad") +
    ylab("Capture Time (seconds)")

rmc.r <- round(marusich.rmc$r, 2)
rmc.CI <- round(marusich.rmc$CI, 2)
rmc.p <- formatC(marusich.rmc$p, format = "fg", digits = 2)
                # prettyNum(average.cor$p.value, digits = 2,
                          # drop0trailing = TRUE))

rmc.plot

## -----------------------------------------------------------------------------
N.vals <- seq(5, 100, by = 1)
sd.Z <- 1/sqrt(N.vals-3)

sd.overfit <- data.frame(cbind(N.vals, sd.Z))

sd.cor <- sd.Z[N.Marusich2016-4] #Have to subtract 4 because N.vals starts at 5, not 1
sd.overf <- sd.Z[total.obs.Marusich2016-4]

#We could determine the inflated sample size for the overfit model using its degrees of freedom + 2
df.s + 2  == total.obs.Marusich2016

sd.overfit.plot <- 
    ggplot(data = sd.overfit, 
           aes(x = N.vals, y = sd.Z)) +
    coord_cartesian(xlim = c(0,100), ylim=c(0, 0.75)) +
    geom_point(alpha = 0.25) +
    geom_segment(x = 28, y =  sd.cor, xend = 84, 
                 yend =  sd.cor, linetype = 2) +
    geom_segment(x = 84, y =  sd.cor, xend = 84, 
                 yend =  sd.overf, colour = "red",
                 linetype = 2) +
    xlab("Analysis Sample Size") + 
    ylab(expression(paste("Fisher ", italic("Z"), " standard deviation"))) +
    annotate(geom = "point", x = 28, y = sd.cor, 
             colour = "black", size = 2) +
    annotate(geom = "point", x = 84, y = sd.overf, 
             colour = "red", size = 2) +
    theme_minimal()

sd.overfit.plot

## -----------------------------------------------------------------------------
#Calculate the expected correlation using the reported p-value and actual sample size 
#esc_t assumes p-value is two-tailed and uses a Fisher Z transformation
calc.z <- esc_t(p = as.numeric(p.vals[1]), 
                      totaln = N.Marusich2016, 
                      es.type = "r")$es


reported.r = as.numeric(r.s[1])

#Overfit?
fisherz2r(calc.z) != abs(reported.r)  
#Note calc.z will always be positive using esc_t() this way
#If the reported.r is not a Fisher Z value - either it should be transformed or the calc.z should be transformed

fisherz2r(calc.z) 
reported.r