inst/doc/overfitting.R

## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = TRUE, tidy = FALSE)
options(width = 80)
library(knitr)
library(rmarkdown)
library(rmcorr) 
library(ggplot2) 
library(dplyr)
library(patchwork)
library(esc)
library(psych)

## ----eval = FALSE-------------------------------------------------------------
#  install.packages("dplyr")
#  require(dplyr)
#  
#  install.packages("esc")
#  require(esc)
#  
#  install.packages("patchwork")
#  require(patchwork)

## ----echo = FALSE-------------------------------------------------------------
N.Marusich2016 <- length(unique(marusich2016_exp2$Pair))
k.Marusich2016 <- length(unique(marusich2016_exp2$SourceReliablity))
total.obs.Marusich2016 <- length(marusich2016_exp2$Pair)

## -----------------------------------------------------------------------------
overfit.plot <- 
    ggplot(data = marusich2016_exp2, aes(x = MARS, y = HVT_capture)) +
    geom_point(aes(colour = factor(Pair))) +
    geom_smooth(method= "lm", level = 0.95) +
    coord_cartesian(xlim = c(2,4), ylim=c(0,30)) + 
    ylab("Capture Time (seconds)") + 
    theme_minimal() +
    theme(legend.position="none") 

marusich2016_avg <- marusich2016_exp2 %>%
                    group_by(Pair) %>%
                    summarize(Mean_MARS = mean(MARS),
                              Mean_HVT_capture = mean(HVT_capture))

average.plot <- 
    ggplot(data = marusich2016_avg, 
           aes(x = Mean_MARS, y = Mean_HVT_capture)) +
    geom_smooth(fullrange = TRUE, method= "lm", level = 0.95) +
    coord_cartesian(xlim = c(2,4), ylim=c(0,30)) +
    geom_point(aes(colour = factor(Pair))) +
    xlab("Mean MARS") +
    ylab("Mean Capture Time (seconds)") +
    scale_colour_discrete(name = "Dyad") +
    theme_minimal()

overfit.cor <- cor.test(marusich2016_exp2$MARS, marusich2016_exp2$HVT_capture)

average.cor <- cor.test(marusich2016_avg$Mean_MARS, marusich2016_avg$Mean_HVT_capture)

df.s <- rbind(overfit.cor$parameter, average.cor$parameter)

r.s  <- rbind(round(rbind(overfit.cor$estimate, average.cor$estimate), digits = 2)) 

CI.s <- formatC(rbind(overfit.cor$conf.int, 
          average.cor$conf.int), digits = 2,
          format = 'f')

p.vals <- rbind(round(overfit.cor$p.value, digits = 3), 
                prettyNum(average.cor$p.value, digits = 2, 
                          drop0trailing = TRUE))

overfit.plot + average.plot 

## -----------------------------------------------------------------------------
marusich.rmc <- rmcorr(participant = Pair, 
                       measure1 = MARS, 
                       measure2 = HVT_capture, 
                       data = marusich2016_exp2)
rmc.plot <- 
    ggplot(data = marusich2016_exp2, 
           aes(x = MARS, y = HVT_capture, group = factor(Pair), color = factor(Pair))) +
    geom_point(aes(colour = factor(Pair))) +
    geom_line(aes(y = marusich.rmc$model$fitted.values)) +
    theme_minimal() +
    scale_colour_discrete(name = "Dyad") +
    ylab("Capture Time (seconds)")

rmc.r <- round(marusich.rmc$r, 2)
rmc.CI <- round(marusich.rmc$CI, 2)
rmc.p <- formatC(marusich.rmc$p, format = "fg", digits = 2)
                # prettyNum(average.cor$p.value, digits = 2,
                          # drop0trailing = TRUE))

rmc.plot

## -----------------------------------------------------------------------------
N.vals <- seq(5, 100, by = 1)
sd.Z <- 1/sqrt(N.vals-3)

sd.overfit <- data.frame(cbind(N.vals, sd.Z))

sd.cor <- sd.Z[N.Marusich2016-4] #Have to subtract 4 because N.vals starts at 5, not 1
sd.overf <- sd.Z[total.obs.Marusich2016-4]

#We could determine the inflated sample size for the overfit model using its degrees of freedom + 2
df.s + 2  == total.obs.Marusich2016

sd.overfit.plot <- 
    ggplot(data = sd.overfit, 
           aes(x = N.vals, y = sd.Z)) +
    coord_cartesian(xlim = c(0,100), ylim=c(0, 0.75)) +
    geom_point(alpha = 0.25) +
    geom_segment(x = 28, y =  sd.cor, xend = 84, 
                 yend =  sd.cor, linetype = 2) +
    geom_segment(x = 84, y =  sd.cor, xend = 84, 
                 yend =  sd.overf, colour = "red",
                 linetype = 2) +
    xlab("Analysis Sample Size") + 
    ylab(expression(paste("Fisher ", italic("Z"), " standard deviation"))) +
    annotate(geom = "point", x = 28, y = sd.cor, 
             colour = "black", size = 2) +
    annotate(geom = "point", x = 84, y = sd.overf, 
             colour = "red", size = 2) +
    theme_minimal()

sd.overfit.plot

## -----------------------------------------------------------------------------
#Calculate the expected correlation using the reported p-value and actual sample size 
#esc_t assumes p-value is two-tailed and uses a Fisher Z transformation
calc.z <- esc_t(p = as.numeric(p.vals[1]), 
                      totaln = N.Marusich2016, 
                      es.type = "r")$es


reported.r = as.numeric(r.s[1])

#Overfit?
fisherz2r(calc.z) != abs(reported.r)  
#Note calc.z will always be positive using esc_t() this way
#If the reported.r is not a Fisher Z value - either it should be transformed or the calc.z should be transformed

fisherz2r(calc.z) 
reported.r

Try the rmcorr package in your browser

Any scripts or data that you put into this service are public.

rmcorr documentation built on Sept. 11, 2024, 7:24 p.m.