dataAnalysis/RPP_figures_3.R

############################################################
#        RPP Manuscript Figure 1,2,3                       #
#                                                          #
# Created by Fred Hasselman on behalf of the RPP intiative #
############################################################

# All the custom functions that are required to recreate the Figures are available in a GitHub sourceable file: `C-3PR.R` 
# It is available here: https://github.com/FredHasselman/toolboxR

############################################################
# C-3PR: Evaluating Scientific *C*laims: *P*ost *P*ublication *P*eer *R*eview, *R*eplication and *R*eproduction.
#
# An R function library created in the contex of analysing and visualising data generated by several Open Science intiatives.
# 
# C-3PR Created by [Fred Hasselman](https://osf.io/ujgs6/) on behalf of the ManyLabs1, ManyLabs2, RPP and Curate Science projects
############################################################


# SETUP local R-------------------------------------------------------------------

# Use this code (from the devtools package) to source C-3PR directly from GitHub:
require(devtools)
source_url('https://raw.githubusercontent.com/FredHasselman/toolboxR/master/C-3PR.R')

# This will load and (if necessary) install libraries frequently used for data management and plotting
in.IT(c('ggplot2','RColorBrewer','lattice','gridExtra','plyr','dplyr','httr'))

# Read the data from the OSF storage
# Note: get.OSFfile() returns a list with the .csv data (df) and information (info) containing the URL download timestamp and original column and rownames (these names will be changed if dfCln=TRUE).  
RPPdata <- get.OSFfile(code='https://osf.io/fgjvw/',dfCln=T)$df

## If you dowloaded the csv file to your harddrive use this code:
  RPPdata<-read.csv('rpp_data.csv',stringsAsFactors=F )
  RPPdata<-df.Clean(RPPdata)
  RPPdata<-RPPdata$df

  
sortPvals <- sort(RPPdata$T.pval.recalc.O)
sum(sortPvals[1:(length(sortPvals)-1)] == sortPvals[2:length(sortPvals)])
which(sortPvals[1:(length(sortPvals)-1)] == sortPvals[2:length(sortPvals)])
  
# Select the completed replication studies
RPPdata <- dplyr::filter(RPPdata, !is.na(T.pval.USE.O), !is.na(T.pval.USE.R))

# We have 99 studies for which p-values and effect sizes could be calculated
nrow(RPPdata)
# We have 97 studies for which p-values of the original effect were significantly below .05
idOK <- complete.cases(RPPdata$T.r.O,RPPdata$T.r.R)
sum(idOK)

# Get ggplot2 themes predefined in C-3PR
mytheme <- gg.theme("clean")

# Get the Replication observed power
RPPdata$Power.Rn <- as.numeric(RPPdata$Power.R)

########################
# FIGURE 3
# EFFECT SIZE DENSITY PLOTS -------------------------------------------------------------
########################

# Setup some variables
RPPdata$oriSig <- "Not Significant"
# 3 studies claimed an effect at .05 < p < .06
RPPdata$oriSig[RPPdata$T.pval.USE.O<=.06] <- "Significant"
RPPdata$oriSig <- factor(RPPdata$oriSig)

RPPdata$repSig <- "Not Significant"
RPPdata$repSig[RPPdata$T.pval.USE.R<=.05] <- "Significant"
RPPdata$repSig <- factor(RPPdata$repSig)
RPPdata$repSig <- factor(RPPdata$repSig)

# ORIGINAL PLOT ------------------------
# The plotHolder() function from C-3PR creates a blank plot template that will hold the figures
blankPlot <- plotHolder()

# X margin density plot (note: gg.theme() from C-3PR can be used directly in a ggplot2() call)
xDense <- ggplot(RPPdata, aes(x=T.r.O, fill=oriSig)) + 
  geom_density(aes(y= ..count..),trim=F,alpha=.5) + 
  xlab("") + ylab("") + xlim(0,1) +
  gg.theme("noax") + 
  theme(legend.position = "none",plot.margin = unit(c(0,0,0,4), "lines"))

## Uncomment to save subplot
# ggsave("RPP_F3_xDense.png",plot=xDense)

# Y margin density plot (note: gg.theme() from C-3PR can be used directly in a ggplot2() call)
yDense <- ggplot(RPPdata, aes(x=T.r.R, fill=repSig)) + 
  geom_density(aes(y= ..count..),trim=F,alpha=.5) + 
  xlab("") + ylab("") + xlim(-.5,1) + 
  coord_flip() + 
  gg.theme("noax") + 
  theme(legend.position = "none", plot.margin = unit(c(0,0,3,0), "lines")) 

## Uncomment to save subplot
# ggsave("RPP_F3_yDense.png",plot=yDense)

# The main scatterplot (note: gg.theme() from C-3PR can be used directly in a ggplot2() call)
scatterP<-
  ggplot(RPPdata,aes(x=T.r.O,y=T.r.R)) +  
  geom_hline(aes(yintercept=0),linetype=2) +
  geom_abline(intercept=0,slope=1,color="Grey60")+
  geom_point(aes(size=Power.Rn,fill=repSig),color="Grey30",shape=21,alpha=.8) + 
  geom_rug(aes(color=oriSig),size=1,sides="b",alpha=.6) + 
  geom_rug(aes(color=repSig),size=1,sides="l",alpha=.6) + 
  scale_x_continuous(name="Original Effect Size",limits=c(0,1),breaks=c(0,.25,.5,.75,1)) + 
  scale_y_continuous(name="Replication Effect Size",limits=c(-.5,1),breaks=c(-.5,-.25,0,.25,.5,.75,1)) + 
  ggtitle("") + xlab("") + ylab("") + 
  scale_size_continuous(name="Replication Power",range=c(2,9)) + 
  scale_color_discrete(name="p-value") +
  scale_fill_discrete(name="p-value") +
  gg.theme("clean") + 
  theme(legend.position=c(.9,.6), plot.margin = unit(c(-2,-1.5,2,2), "lines")) 

## Uncomment to save subplot
# ggsave("RPP_F3_scatter.png",plot=scatterP)

# Yet another way to organise plots: grid.arrange() from the gridExtra package.
grid.arrange(xDense, blankPlot, scatterP, yDense, ncol=2, nrow=2, widths=c(4, 1.4), heights=c(1.4, 4))

# SAVE combined plots as PDF
pdf("RPP_Figure3_original.pdf",pagecentre=T, width=15,height=12 ,paper = "special")
grid.arrange(xDense, blankPlot, scatterP, yDense, ncol=2, nrow=2, widths=c(4, 1.4), heights=c(1.4, 4))
dev.off()

# STATISTIC BASED FIGURE 3 -------------------------------
source("t-correction.R")
source("F-correction.R")
source("chisq-correction.R")

# Performing conditional inference on test statistics
naIndex <- !is.na(RPPdata$T.r.O)
statistics <- RPPdata$T.Test.value.O
statType <- RPPdata$T.Test.Statistic.O
conditionalList <- lapply(1:length(statType), function(x) x)
i <- 1
while(i <= length(conditionalList)) {
  if((is.na(RPPdata$T.pval.recalc.O[i]) | 
     !naIndex[i]) & 
     i != 80) {
    i <- i + 1
    next
  }
  
  if(i == 80) {
    alpha.threshold <- 0.05
  } else if(RPPdata$T.pval.recalc.O[i] >= 0.06) {
    alpha.threshold <- 1
  } else if(RPPdata$T.pval.recalc.O[i] >= 0.05) {
    alpha.threshold <- 0.06
  } else {
    alpha.threshold <- 0.05
  }
  if(any(statType[i] %in% c("z", "t", "r"))) {
    if(statType[i] == "t") {
      df <- RPPdata$T.df2.O[i]
      t <- statistics[i]
      if(df > 90) df <- 100
    } else if(statType[i] == "z") {
      df <- Inf
      t <- statistics[i]
    } else if(statType[i] == "r") {
      df <- Inf
      r <- statistics[i]
      t <- 0.5 * log((1 + r) / (1 - r))
      t <- t / sqrt(RPPdata$T.N.O[i] - 3)^-1
    }
    conditionalList[[i]] <- t.selection.correction(t, df, alpha.threshold, 0.05)
  } else if(any(statType[i] %in% c("Chi", "Chi2"))) {
    df <- RPPdata$T.df1.O[i]
    chi <- statistics[i]
    conditionalList[[i]] <- chisq.selection.correction(chi, df, alpha.threshold, 0.05)
  } else if(statType[i] == "F") {
    df1 <- RPPdata$T.df1.O[i]
    df2 <- RPPdata$T.df2.O[i]
    Fval <- statistics[i]
    if(df1 == 1) {
      statType[i] <- "t"
      statistics[i] <- sqrt(statistics[i])
      next
    }
    conditionalList[[i]] <- Fdist.selection.correction(Fval, df1, df2, alpha.threshold, 0.05)
  }
  print(i)
  i <- i + 1
}

# Converting conditional test statistics to correlations
F2cor <- function(Fval, df1, df2) {
  sqrt((Fval * (df1 / df2)) / (((Fval * df1) / df2) + 1)) * sqrt(1 / df1)
}
correctedCor <- rep(NA, length(statistics))
lowerCI <- rep(NA, length(statistics))
upperCI <- rep(NA, length(statistics))
N <- RPPdata$T.N.O
for(i in 1:length(correctedCor)) {
  if(!naIndex[i]) next
  if(is.na(RPPdata$T.pval.recalc.O[i]) & i != 80) next
  
  conditional <- conditionalList[[i]][[3]]
  lower <- conditionalList[[i]][[5]][1]
  upper <- conditionalList[[i]][[5]][2]
  
  if(any(statType[i] %in% c("z", "r"))) {
    n <- N[i]
    z <- conditional * sqrt(n - 3)^-1
    correctedCor[i] <- z2cor(z, n)
    lowerCI[i] <- z2cor(lower, n)
    upperCI[i] <- z2cor(upper, n)
  }
  
  if(any(statType[i] %in% c("Chi", "Chi2"))) {
    n <- N[i]
    correctedCor[i] <- sqrt(conditional / n)
    lowerCI[i] <- sqrt(lower / n)
    upperCI[i] <- min(sqrt(upper / n), 1)
  }
  
  if(statType[[i]] == "t") {
    conditional <- conditional^2
    df1 <- 1
    df2 <- RPPdata$T.df2.O[i]
    signLower <- sign(lower)
    lower <- lower^2
    upper <- upper^2
    
    correctedCor[i] <- F2cor(conditional, df1, df2)
    lowerCI[i] <- F2cor(lower, df1, df2) * signLower
    upperCI[i] <- F2cor(upper, df1, df2)
  }
  
  if(statType[i] == "F") {
    df1 <- RPPdata$T.df1.O[i]
    df2 <- RPPdata$T.df2.O[i]

    correctedCor[i] <- F2cor(conditional, df1, df2)
    lowerCI[i] <- F2cor(lower, df1, df2) * signLower
    upperCI[i] <- F2cor(upper, df1, df2)
  }
}

# non-significant corrected equals original non-significant
originalPvals <- RPPdata$T.pval.recalc.O
originalPvals[is.na(originalPvals)] <- 1
correctedCor[originalPvals >= 0.06] <- RPPdata$T.r.O[originalPvals >= 0.06]

RPPdata$correctedScor <- correctedCor

# Recalculating power
correctedPower <- rep(NA, length(correctedCor))
for(i in 1:length(correctedCor)) {
  if(length(conditionalList[[i]]) == 1) next
  
  if(class(conditionalList[[i]]) == "t.correction") {
    power <- calculate.power(conditionalList[[i]], newDF = RPPdata$T.N.R[i] - 1)[[2]]
  } else if(class(conditionalList[[i]]) == "chisq.correction") {
    power <- calculate.power(conditionalList[[i]], 
                             originalN = RPPdata$T.N.O[i],
                             newN = RPPdata$T.N.R[i])[[2]]
  } else if(class(conditionalList[[i]]) == "Fval.correction") {
    power <- calculate.power(conditionalList[[i]],
                             newN = RPPdata$T.N.R[i])[[2]]
  }
  correctedPower[i] <- power
}

correctedPower[is.na(correctedPower)] <- RPPdata$Power.Rn[is.na(correctedPower)]

RPPdata$correctedPower <- correctedPower
RPPdata$lowerCI <- lowerCI
RPPdata$upperCI <- upperCI

xDense <- ggplot(RPPdata, aes(x=correctedScor, fill=oriSig)) + 
  geom_density(aes(y= ..count..),trim=F,alpha=.5) + 
  xlab("") + ylab("") + xlim(0,1) +
  gg.theme("noax") + 
  theme(legend.position = "none",plot.margin = unit(c(0,0,0,4), "lines"))

# Y margin density plot (note: gg.theme() from C-3PR can be used directly in a ggplot2() call)
yDense <- ggplot(RPPdata, aes(x=T.r.R, fill=repSig)) + 
  geom_density(aes(y= ..count..),trim=F,alpha=.5) + 
  xlab("") + ylab("") + xlim(-.5,1) + 
  coord_flip() + 
  gg.theme("noax") + 
  theme(legend.position = "none", plot.margin = unit(c(0,0,3,0), "lines")) 

# STAT_CORRECTION ORIGINAL
# The main scatterplot (note: gg.theme() from C-3PR can be used directly in a ggplot2() call)
xDense <- ggplot(RPPdata, aes(x=T.r.O, fill=oriSig)) + 
  geom_density(aes(y= ..count..),trim=F,alpha=.5) + 
  xlab("") + ylab("") + xlim(0,1) +
  gg.theme("noax") + 
  theme(legend.position = "none",plot.margin = unit(c(0,0,0,4), "lines"))

# Y margin density plot (note: gg.theme() from C-3PR can be used directly in a ggplot2() call)
yDense <- ggplot(RPPdata, aes(x=T.r.R, fill=repSig)) + 
  geom_density(aes(y= ..count..),trim=F,alpha=.5) + 
  xlab("") + ylab("") + xlim(-.5,1) + 
  coord_flip() + 
  gg.theme("noax") + 
  theme(legend.position = "none", plot.margin = unit(c(0,0,3,0), "lines")) 

notInCI <- with(RPPdata, T.r.R < lowerCI | T.r.R > upperCI)
RPPdata$notInCI <- notInCI

# CONFIDENCE INTERVAL TABLE ----------------------------
CIdata <- with(RPPdata, data.frame(Local_ID = Local.ID,
                         Lower_CI = lowerCI, 
                         original_study = T.r.O, 
                         conditional_estimate = correctedScor, 
                         replication_study = T.r.R, 
                         Upper_CI = upperCI,
                         Stat_Type = T.Test.Statistic.O,
                         DF1_original = T.df1.O,
                         DF2_original = T.df2.O,
                         original_pvalues = T.pval.recalc.O))
write.csv(CIdata, file = "estimates and confidence intervals.csv")

# ORIGINAL FIGURE WITH CONFIDENCE INTERFVALS ----------------------
forPlot <- RPPdata[RPPdata$T.pval.recalc.O <= 0.05, ]
excludeF <- which(forPlot$T.Test.Statistic.O != "F" & forPlot$T.df2.O >1)
forPlot <- forPlot[-excludeF,]
scatterP<-
  ggplot(forPlot, aes(x=T.r.O,y=T.r.R)) +  
  geom_hline(aes(yintercept=0),linetype=2) +
  geom_abline(intercept=0,slope=1,color="Grey60")+
  geom_point(aes(size=Power.Rn,fill=repSig),color="Grey30",shape=21,alpha=.8) + 
  geom_rug(aes(color=oriSig),size=1,sides="b",alpha=.6) + 
  geom_rug(aes(color=repSig),size=1,sides="l",alpha=.6) + 
  scale_x_continuous(name="Original Effect Size",limits=c(0,1),breaks=c(0,.25,.5,.75,1)) + 
  scale_y_continuous(name="Replication Effect Size",limits=c(-.5,1),breaks=c(-.5,-.25,0,.25,.5,.75,1)) + 
  ggtitle("") + xlab("") + ylab("") + 
  scale_size_continuous(name="Replication Power",range=c(2,9)) + 
  scale_color_discrete(name="p-value") +
  scale_fill_discrete(name="p-value") +
  gg.theme("clean") + 
  theme(legend.position=c(.915,.6), plot.margin = unit(c(-2,-1.5,2.5,2), "lines")) + 
  geom_segment(aes(xend = T.r.O,
                   y = lowerCI, yend = upperCI, col = repSig), linetype = 1, alpha = 0.8, size = 0.5) + 
  geom_curve(aes(x = T.r.O - 0.0075, xend = T.r.O + 0.0075,
                   y = lowerCI, yend = lowerCI, col = repSig), curvature = 0.5, linetype = 1, size = 1.2) +
  geom_curve(aes(x = T.r.O - 0.0075, xend = T.r.O + 0.0075,
                   y = upperCI, yend = upperCI, col = repSig), curvature = -0.5, linetype = 1, size = 1.2) + 
  geom_point(aes(y = correctedScor), col = "black", size = 5, shape = 19) +
  geom_point(data = subset(forPlot, notInCI), aes(size = Power.Rn), shape = 21, col = "black", fill = "black", alpha = 0.3)


# Yet another way to organise plots: grid.arrange() from the gridExtra package.
grid.arrange(xDense, blankPlot, scatterP, yDense, ncol=2, nrow=2, widths=c(4, 1.4), heights=c(1.4, 4))

# SAVE combined plots as PDF
pdf("RPP_Figure3_original_5_noF.pdf",pagecentre=T, width=15*1.15,height=12*1.15 ,paper = "special")
gridExtra::grid.arrange(xDense, blankPlot, scatterP, yDense, ncol=2, nrow=2, widths=c(4, 1), heights=c(1, 4))
dev.off()

# ORIGINAL - SIZE BY P-VALUES ------------------------
RPPdata$pvalue_quantile <- order(-RPPdata$T.pval.recalc.O)/length(RPPdata$T.pval.recalc.O)
forPlot <- RPPdata[RPPdata$T.pval.recalc.O <= 0.05, ]
excludeF <- which(forPlot$T.Test.Statistic.O != "F" & forPlot$T.df2.O >1)
forPlot <- forPlot[-excludeF,]
scatterP<-
  ggplot(forPlot, aes(x=T.r.O, y=T.r.R)) +  
  geom_hline(aes(yintercept=0),linetype=2) +
  geom_abline(intercept=0,slope=1,color="Grey60")+
  geom_point(aes(size=pvalue_quantile,fill=repSig),color="Grey30",shape=21,alpha=.8) + 
  geom_rug(aes(color=oriSig),size=1,sides="b",alpha=.6) + 
  geom_rug(aes(color=repSig),size=1,sides="l",alpha=.6) + 
  scale_x_continuous(name="Original Effect Size",limits=c(0,1),breaks=c(0,.25,.5,.75,1)) + 
  scale_y_continuous(name="Replication Effect Size",limits=c(-.5,1),breaks=c(-.5,-.25,0,.25,.5,.75,1)) + 
  ggtitle("") + xlab("") + ylab("") + 
  scale_size_continuous(name="p-value quantile",range=c(2,9)) + 
  scale_color_discrete(name="Replication p-value") +
  scale_fill_discrete(name="Replication p-value") +
  gg.theme("clean") + 
  theme(legend.position=c(.915,.6), plot.margin = unit(c(-2,-1.5,2.5,2), "lines")) + 
  geom_segment(aes(xend = T.r.O,
                   y = lowerCI, yend = upperCI, col = repSig), linetype = 1, alpha = 0.8, size = 0.5) + 
  geom_curve(aes(x = T.r.O - 0.0075, xend = T.r.O + 0.0075,
                 y = lowerCI, yend = lowerCI, col = repSig), curvature = 0.5, linetype = 1, size = 1.2) +
  geom_curve(aes(x = T.r.O - 0.0075, xend = T.r.O + 0.0075,
                 y = upperCI, yend = upperCI, col = repSig), curvature = -0.5, linetype = 1, size = 1.2) + 
  geom_point(aes(y = correctedScor), col = "black", size = 5, shape = 19) +
  geom_point(data = subset(forPlot, notInCI), aes(size = pvalue_quantile), shape = 21, col = "black", fill = "black", alpha = 0.3)

grid.arrange(xDense, blankPlot, scatterP, yDense, ncol=2, nrow=2, widths=c(4, 1.4), heights=c(1.4, 4))

pdf("RPP_Figure3_pvaluesize_noF.pdf",pagecentre=T, width=15*1.15,height=12*1.15 ,paper = "special")
gridExtra::grid.arrange(xDense, blankPlot, scatterP, yDense, ncol=2, nrow=2, widths=c(4, 1), heights=c(1, 4))
dev.off()


# STAT_CORRECTION Corrected
# The main scatterplot (note: gg.theme() from C-3PR can be used directly in a ggplot2() call)
xDense <- ggplot(RPPdata, aes(x=correctedScor, fill=oriSig)) + 
  geom_density(aes(y= ..count..),trim=F,alpha=.5) + 
  xlab("") + ylab("") + xlim(0,1) +
  gg.theme("noax") + 
  theme(legend.position = "none",plot.margin = unit(c(0,0,0,4), "lines"))

# Y margin density plot (note: gg.theme() from C-3PR can be used directly in a ggplot2() call)
yDense <- ggplot(RPPdata, aes(x=T.r.R, fill=repSig)) + 
  geom_density(aes(y= ..count..),trim=F,alpha=.5) + 
  xlab("") + ylab("") + xlim(-.5,1) + 
  coord_flip() + 
  gg.theme("noax") + 
  theme(legend.position = "none", plot.margin = unit(c(0,0,3,0), "lines"))

notInCI <- with(RPPdata, T.r.R < lowerCI | T.r.R > upperCI)

# FIGURE 3 WITH CONDITIONAL ESTIMATES --------------------------
forPlot <- RPPdata[RPPdata$T.pval.recalc.O <= 0.05, ]
excludeF <- which(forPlot$T.Test.Statistic.O != "F" & forPlot$T.df2.O >1)
forPlot <- forPlot[-excludeF,]
scatterP<-
  ggplot(RPPdata,aes(x=correctedScor, y=T.r.R)) +  
  geom_hline(aes(yintercept=0),linetype=2) +
  geom_abline(intercept=0,slope=1,color="Grey60")+
  geom_point(aes(size=correctedPower,fill=repSig),color="Grey30",shape=21,alpha=.8) + 
  geom_rug(aes(color=oriSig),size=1,sides="b",alpha=.6) + 
  geom_rug(aes(color=repSig),size=1,sides="l",alpha=.6) + 
  scale_x_continuous(name="Original Effect Size",limits=c(0,1),breaks=c(0,.25,.5,.75,1)) + 
  scale_y_continuous(name="Replication Effect Size",limits=c(-.5,1),breaks=c(-.5,-.25,0,.25,.5,.75,1)) + 
  ggtitle("") + xlab("") + ylab("") + 
  scale_size_continuous(name="Replication Power",range=c(0,10)) + 
  scale_color_discrete(name="p-value") +
  scale_fill_discrete(name="p-value") +
  gg.theme("clean") + 
  theme(legend.position=c(.915,.6), plot.margin = unit(c(-2,-1.5,2.5,2), "lines")) + 
  geom_segment(aes(xend = correctedScor,
                   y = lowerCI, yend = upperCI, col = repSig), linetype = 1, alpha = 0.8, size = 0.5) + 
  geom_curve(aes(x = correctedScor - 0.0075, xend = correctedScor + 0.0075,
                 y = lowerCI, yend = lowerCI, col = repSig), curvature = 0.5, linetype = 1, size = 1.2) +
  geom_curve(aes(x = correctedScor - 0.0075, xend = correctedScor + 0.0075,
                 y = upperCI, yend = upperCI, col = repSig), curvature = -0.5, linetype = 1, size = 1.2) + 
  geom_point(data = subset(RPPdata, notInCI), aes(size = correctedPower), shape = 21, col = "black", fill = "black", alpha = 0.3)


# Yet another way to organise plots: grid.arrange() from the gridExtra package.
gridExtra::grid.arrange(xDense, blankPlot, scatterP, yDense, ncol=2, nrow=2, widths=c(4, 1.4), heights=c(1.4, 4))

# SAVE combined plots as PDF
pdf("RPP_Figure3_stat_5_noF.pdf",pagecentre=T, width=15*1.15,height=12*1.15 ,paper = "special")
gridExtra::grid.arrange(xDense, blankPlot, scatterP, yDense, ncol=2, nrow=2, widths=c(4, 1), heights=c(1, 4))
dev.off()

ggplot(RPPdata) +
  geom_density(aes(x = correctedPower, fill = repSig), alpha = 0.5) 

ggplot(RPPdata) +
  geom_density(aes(x = Power.Rn, fill = repSig), alpha = 0.5)

plot(x = correctedPower, y = RPPdata$repSig == "Significant")
lines(lowess(x = correctedPower, y = RPPdata$repSig == "Significant"),type = "l")
abline(a = 0, b = 1)

# SMOOOOOTHING --------------------------------------------
forPlot <- RPPdata
forPlot <- forPlot[forPlot$T.pval.recalc.O <= 0.05, ]
forPlot <- forPlot[-excludeF, ]
ggplot(forPlot) +
  geom_smooth(aes(x = T.r.O, y = T.r.R), col = "blue") +
  geom_point(aes(x = T.r.O, y = T.r.R), col = "blue") +
  geom_smooth(aes(x = correctedScor, y = T.r.R), col = "red") +
  geom_point(aes(x = correctedScor, y = T.r.R), col = "red") + 
  geom_abline(intercept = 0, slope = 1) + 
  theme_bw() +
  xlab("Original Effect Size Estimate") + 
  ylab("Replicated Effect Size Estimate") + 
  geom_segment(aes(y = T.r.R, yend = T.r.R, x = T.r.O, xend = correctedScor),
               col = "black", linetype = 1, alpha = 0.5, arrow = arrow(length = unit(0.3, "cm")))
  
  
# Figure Plot by Mean Estimated Signal Size --------------------
ncp <- unlist(sapply(conditionalList, function(x) if(length(x) > 0) x[2]))
ncp[statType == "t"] <- ncp[statType == "t"]^2
conditional.ncp <- unlist(sapply(conditionalList, function(x) if(length(x) > 0) x[4]))^2
ncpCI <- matrix(nrow = length(ncp), ncol = 2)
for(i in 1:length(ncp)) {
  if(length(conditionalList[[i]]) > 1)
  ncpCI[i, ] <- conditionalList[[i]][[6]]
}
CIsignSwitch <- which(sign(ncpCI[, 1]) != sign(ncpCI[, 2]))
ncpCI <- pmax(ncpCI, min(ncpCI[ncpCI > 0], na.rm = TRUE) / 10)
ncpCI <- ncpCI ^ 2

# Computing replicated NCP
R.statType <- RPPdata$T.Test.Statistic.R
R.ncp <- ncp
for(i in 1:length(ncp)) {
  if(R.statType[i] == "t") {
    df <- RPPdata$T.df2.R[i]
    if(df > 90) df <- 90
    R.ncp[i] <- t.to.ncp(RPPdata$T.Test.value.R[i], df)^2
  } else if(R.statType[i] == "F") {
    R.ncp[i] <- with(RPPdata, Fval.to.ncp(T.Test.value.R[i], T.df1.R[i], T.df2.R[i]))
  } else {
    R.ncp[i] <- with(RPPdata, chisq.to.ncp(T.Test.value.R[i], T.df1.R[i]))
  }
}

# Transforming to log scale
ncp <- log(ncp)
R.ncp <- pmax(R.ncp, min(R.ncp[R.ncp != 0] / 10, na.rm = TRUE))
R.ncp <- log(R.ncp)
conditional.ncp <- log(conditional.ncp)
ncpCI <- log(ncpCI)
# Fixing replicated signs
signSwitch <- with(RPPdata, which(sign(T.r.O) != sign(T.r.R)))
R.ncp[signSwitch] <- -R.ncp[signSwitch]

forNCPplot <- data.frame(ncp = ncp, conditinoalNCP = conditional.ncp,
                         lowerCI = ncpCI[, 1], upperCI = ncpCI[, 2],
                         repNCP = R.ncp,
                         condPower = RPPdata$correctedPower,
                         repSig = RPPdata$repSig)

ggplot(forNCPplot) +
  geom_point(aes(x = conditional.ncp, y = R.ncp, fill = factor(repSig), size = correctedPower), color="Grey30",shape=21,alpha=.8) +
  theme_bw() + 
  geom_abline(intercept = 0, slope = 1)






# Same calculations but with 0.01 threshold ---------------------------
source("t-correction.R")
source("F-correction.R")
source("chisq-correction.R")

# Performing conditional inference on test statistics
naIndex <- !is.na(RPPdata$T.r.O)
statistics <- RPPdata$T.Test.value.O
statType <- RPPdata$T.Test.Statistic.O
conditionalList <- lapply(1:length(statType), function(x) x)
i <- 1
while(i <= length(conditionalList)) {
  if((is.na(RPPdata$T.pval.recalc.O[i]) | 
      !naIndex[i]) & 
     i != 80) {
    i <- i + 1
    next
  }
  
  if(i == 80) {
    alpha.threshold <- 0.05
  } else if(RPPdata$T.pval.recalc.O[i] >= 0.06) {
    alpha.threshold <- 1
  } else if(RPPdata$T.pval.recalc.O[i] >= 0.05) {
    alpha.threshold <- 0.06
  } else if(RPPdata$T.pval.recalc.O[i] >= 0.005) {
    alpha.threshold <- 0.05
  } else {
    alpha.threshold <- 0.005
  }
  
  if(any(statType[i] %in% c("z", "t", "r"))) {
    if(statType[i] == "t") {
      df <- RPPdata$T.df2.O[i]
      t <- statistics[i]
      if(df > 90) df <- 100
    } else if(statType[i] == "z") {
      df <- Inf
      t <- statistics[i]
    } else if(statType[i] == "r") {
      df <- Inf
      r <- statistics[i]
      t <- 0.5 * log((1 + r) / (1 - r))
      t <- t / sqrt(RPPdata$T.N.O[i] - 3)^-1
    }
    conditionalList[[i]] <- t.selection.correction(t, df, alpha.threshold, 0.05)
  } else if(any(statType[i] %in% c("Chi", "Chi2"))) {
    df <- RPPdata$T.df1.O[i]
    chi <- statistics[i]
    conditionalList[[i]] <- chisq.selection.correction(chi, df, alpha.threshold, 0.05)
  } else if(statType[i] == "F") {
    df1 <- RPPdata$T.df1.O[i]
    df2 <- RPPdata$T.df2.O[i]
    Fval <- statistics[i]
    if(df1 == 1) {
      statType[i] <- "t"
      statistics[i] <- sqrt(statistics[i])
      next
    }
    conditionalList[[i]] <- Fdist.selection.correction(Fval, df1, df2, alpha.threshold, 0.05)
  }
  print(i)
  i <- i + 1
}

# Converting conditional test statistics to correlations
F2cor <- function(Fval, df1, df2) {
  sqrt((Fval * (df1 / df2)) / (((Fval * df1) / df2) + 1)) * sqrt(1 / df1)
}
correctedCor <- rep(NA, length(statistics))
lowerCI <- rep(NA, length(statistics))
upperCI <- rep(NA, length(statistics))
N <- RPPdata$T.N.O
for(i in 1:length(correctedCor)) {
  if(!naIndex[i]) next
  if(is.na(RPPdata$T.pval.recalc.O[i]) & i != 80) next
  
  conditional <- conditionalList[[i]][[3]]
  lower <- conditionalList[[i]][[5]][1]
  upper <- conditionalList[[i]][[5]][2]
  
  if(any(statType[i] %in% c("z", "r"))) {
    n <- N[i]
    z <- conditional * sqrt(n - 3)^-1
    correctedCor[i] <- z2cor(z, n)
    lowerCI[i] <- z2cor(lower, n)
    upperCI[i] <- z2cor(upper, n)
  }
  
  if(any(statType[i] %in% c("Chi", "Chi2"))) {
    n <- N[i]
    correctedCor[i] <- sqrt(conditional / n)
    lowerCI[i] <- sqrt(lower / n)
    upperCI[i] <- min(sqrt(upper / n), 1)
  }
  
  if(statType[[i]] == "t") {
    conditional <- conditional^2
    df1 <- 1
    df2 <- RPPdata$T.df2.O[i]
    signLower <- sign(lower)
    lower <- lower^2
    upper <- upper^2
    
    correctedCor[i] <- F2cor(conditional, df1, df2)
    lowerCI[i] <- F2cor(lower, df1, df2) * signLower
    upperCI[i] <- F2cor(upper, df1, df2)
  }
  
  if(statType[i] == "F") {
    df1 <- RPPdata$T.df1.O[i]
    df2 <- RPPdata$T.df2.O[i]
    
    correctedCor[i] <- F2cor(conditional, df1, df2)
    lowerCI[i] <- F2cor(lower, df1, df2) * signLower
    upperCI[i] <- F2cor(upper, df1, df2)
  }
}

# non-significant corrected equals original non-significant
originalPvals <- RPPdata$T.pval.recalc.O
originalPvals[is.na(originalPvals)] <- 1
correctedCor[originalPvals >= 0.06] <- RPPdata$T.r.O[originalPvals >= 0.06]

RPPdata$correctedScor <- correctedCor

# Recalculating power
correctedPower <- rep(NA, length(correctedCor))
for(i in 1:length(correctedCor)) {
  if(length(conditionalList[[i]]) == 1) next
  
  if(class(conditionalList[[i]]) == "t.correction") {
    power <- calculate.power(conditionalList[[i]], newDF = RPPdata$T.N.R[i] - 1)[[2]]
  } else if(class(conditionalList[[i]]) == "chisq.correction") {
    power <- calculate.power(conditionalList[[i]], 
                             originalN = RPPdata$T.N.O[i],
                             newN = RPPdata$T.N.R[i])[[2]]
  } else if(class(conditionalList[[i]]) == "Fval.correction") {
    power <- calculate.power(conditionalList[[i]],
                             newN = RPPdata$T.N.R[i])[[2]]
  }
  correctedPower[i] <- power
}

correctedPower[is.na(correctedPower)] <- RPPdata$Power.Rn[is.na(correctedPower)]

RPPdata$correctedPower <- correctedPower
RPPdata$lowerCI <- lowerCI
RPPdata$upperCI <- upperCI

xDense <- ggplot(RPPdata, aes(x=correctedScor, fill=oriSig)) + 
  geom_density(aes(y= ..count..),trim=F,alpha=.5) + 
  xlab("") + ylab("") + xlim(0,1) +
  gg.theme("noax") + 
  theme(legend.position = "none",plot.margin = unit(c(0,0,0,4), "lines"))

# Y margin density plot (note: gg.theme() from C-3PR can be used directly in a ggplot2() call)
yDense <- ggplot(RPPdata, aes(x=T.r.R, fill=repSig)) + 
  geom_density(aes(y= ..count..),trim=F,alpha=.5) + 
  xlab("") + ylab("") + xlim(-.5,1) + 
  coord_flip() + 
  gg.theme("noax") + 
  theme(legend.position = "none", plot.margin = unit(c(0,0,3,0), "lines")) 

# STAT_CORRECTION ORIGINAL
notInCI <- with(RPPdata, T.r.R < lowerCI | T.r.R > upperCI)
RPPdata$notInCI <- notInCI
RPPdata$onePercentSig <- RPPdata$T.pval.recalc.O <= 0.005
forPlot <- RPPdata[RPPdata$onePercentSig, ]

# The main scatterplot (note: gg.theme() from C-3PR can be used directly in a ggplot2() call)
xDense <- ggplot(forPlot, aes(x=T.r.O, fill=oriSig)) + 
  geom_density(aes(y= ..count..),trim=F,alpha=.5) + 
  xlab("") + ylab("") + xlim(0,1) +
  gg.theme("noax") + 
  theme(legend.position = "none",plot.margin = unit(c(0,0,0,4), "lines"))

# Y margin density plot (note: gg.theme() from C-3PR can be used directly in a ggplot2() call)
yDense <- ggplot(forPlot, aes(x=T.r.R, fill=repSig)) + 
  geom_density(aes(y= ..count..),trim=F,alpha=.5) + 
  xlab("") + ylab("") + xlim(-.5,1) + 
  coord_flip() + 
  gg.theme("noax") + 
  theme(legend.position = "none", plot.margin = unit(c(0,0,3,0), "lines")) 


excludeF <- which(forPlot$T.Test.Statistic.O != "F" & forPlot$T.df2.O >1)
forPlot <- forPlot[-excludeF,]
scatterP<-
  ggplot(forPlot, aes(x=T.r.O,y=T.r.R)) +  
  geom_hline(aes(yintercept=0),linetype=2) +
  geom_abline(intercept=0,slope=1,color="Grey60")+
  geom_point(aes(size=Power.Rn,fill=repSig),color="Grey30",shape=21,alpha=.8) + 
  geom_rug(aes(color=oriSig),size=1,sides="b",alpha=.6) + 
  geom_rug(aes(color=repSig),size=1,sides="l",alpha=.6) + 
  scale_x_continuous(name="Original Effect Size",limits=c(0,1),breaks=c(0,.25,.5,.75,1)) + 
  scale_y_continuous(name="Replication Effect Size",limits=c(-.5,1),breaks=c(-.5,-.25,0,.25,.5,.75,1)) + 
  ggtitle("") + xlab("") + ylab("") + 
  scale_size_continuous(name="Replication Power",range=c(2,9)) + 
  scale_color_discrete(name="p-value") +
  scale_fill_discrete(name="p-value") +
  gg.theme("clean") + 
  theme(legend.position=c(.915,.6), plot.margin = unit(c(-2,-1.5,2.5,2), "lines")) + 
  geom_segment(aes(xend = T.r.O,
                   y = lowerCI, yend = upperCI, col = repSig), linetype = 1, alpha = 0.8, size = 0.5) + 
  geom_curve(aes(x = T.r.O - 0.0075, xend = T.r.O + 0.0075,
                 y = lowerCI, yend = lowerCI, col = repSig), curvature = 0.5, linetype = 1, size = 1.2) +
  geom_curve(aes(x = T.r.O - 0.0075, xend = T.r.O + 0.0075,
                 y = upperCI, yend = upperCI, col = repSig), curvature = -0.5, linetype = 1, size = 1.2) + 
  geom_point(aes(y = correctedScor), col = "black", size = 5, shape = 19) +
  geom_point(data = subset(forPlot, notInCI), aes(size = Power.Rn), shape = 21, col = "black", fill = "black", alpha = 0.3)


# Yet another way to organise plots: grid.arrange() from the gridExtra package.
grid.arrange(xDense, blankPlot, scatterP, yDense, ncol=2, nrow=2, widths=c(4, 1.4), heights=c(1.4, 4))

# SAVE combined plots as PDF
pdf("RPP_Figure3_original_verySig_005_noF.pdf",pagecentre=T, width=15*1.15,height=12*1.15 ,paper = "special")
gridExtra::grid.arrange(xDense, blankPlot, scatterP, yDense, ncol=2, nrow=2, widths=c(4, 1), heights=c(1, 4))
dev.off()

# STAT_CORRECTION Corrected
# The main scatterplot (note: gg.theme() from C-3PR can be used directly in a ggplot2() call)
xDense <- ggplot(RPPdata, aes(x=correctedScor, fill=oriSig)) + 
  geom_density(aes(y= ..count..),trim=F,alpha=.5) + 
  xlab("") + ylab("") + xlim(0,1) +
  gg.theme("noax") + 
  theme(legend.position = "none",plot.margin = unit(c(0,0,0,4), "lines"))

# Y margin density plot (note: gg.theme() from C-3PR can be used directly in a ggplot2() call)
yDense <- ggplot(RPPdata, aes(x=T.r.R, fill=repSig)) + 
  geom_density(aes(y= ..count..),trim=F,alpha=.5) + 
  xlab("") + ylab("") + xlim(-.5,1) + 
  coord_flip() + 
  gg.theme("noax") + 
  theme(legend.position = "none", plot.margin = unit(c(0,0,3,0), "lines"))

scatterP<-
  ggplot(RPPdata,aes(x=correctedScor, y=T.r.R)) +  
  geom_hline(aes(yintercept=0),linetype=2) +
  geom_abline(intercept=0,slope=1,color="Grey60")+
  geom_point(aes(size=correctedPower,fill=repSig),color="Grey30",shape=21,alpha=.8) + 
  geom_rug(aes(color=oriSig),size=1,sides="b",alpha=.6) + 
  geom_rug(aes(color=repSig),size=1,sides="l",alpha=.6) + 
  scale_x_continuous(name="Original Effect Size",limits=c(0,1),breaks=c(0,.25,.5,.75,1)) + 
  scale_y_continuous(name="Replication Effect Size",limits=c(-.5,1),breaks=c(-.5,-.25,0,.25,.5,.75,1)) + 
  ggtitle("") + xlab("") + ylab("") + 
  scale_size_continuous(name="Replication Power",range=c(0,10)) + 
  scale_color_discrete(name="p-value") +
  scale_fill_discrete(name="p-value") +
  gg.theme("clean") + 
  theme(legend.position=c(.915,.6), plot.margin = unit(c(-2,-1.5,2.5,2), "lines")) + 
  geom_segment(aes(xend = correctedScor,
                   y = lowerCI, yend = upperCI, col = repSig), linetype = 1, alpha = 1, size = 1.1) + 
  geom_curve(aes(x = correctedScor - 0.0075, xend = correctedScor + 0.0075,
                 y = lowerCI, yend = lowerCI, col = repSig), curvature = 0.5, linetype = 1) +
  geom_curve(aes(x = correctedScor - 0.0075, xend = correctedScor + 0.0075,
                 y = upperCI, yend = upperCI, col = repSig), curvature = - 0.5, linetype = 1) +
  geom_point(data = subset(RPPdata, notInCI), aes(size = correctedPower), shape = 21, col = "black", fill = "black", alpha = 0.3)


# Yet another way to organise plots: grid.arrange() from the gridExtra package.
gridExtra::grid.arrange(xDense, blankPlot, scatterP, yDense, ncol=2, nrow=2, widths=c(4, 1.4), heights=c(1.4, 4))

# SAVE combined plots as PDF
pdf("RPP_Figure3_stat_verySig_1.pdf",pagecentre=T, width=15*1.15,height=12*1.15 ,paper = "special")
gridExtra::grid.arrange(xDense, blankPlot, scatterP, yDense, ncol=2, nrow=2, widths=c(4, 1), heights=c(1, 4))
dev.off()
ammeir2/selectiveTweedy documentation built on May 24, 2019, 3:02 a.m.