# dataAnalysis/TweedyScience.R

# Convert reported test statistics to z-scores via their distribution functions.
# `type` gives the statistic family ("F", "t", "Chi2"/"Chi", "z" or "r");
# `df1`/`df2` hold the degrees of freedom, and for "r" `df1` holds the sample size.
convertToZ <- function(t, type, df1, df2) {
  z <- numeric(length(t))
  for(i in seq_along(t)) {
    if(type[i] == "F") {
      z[i] <- qnorm(pf(t[i], df1[i], df2[i]))
    } else if(type[i] == "t") {
      z[i] <- qnorm(pt(t[i], df1[i]))
    } else if(type[i] %in% c("Chi2", "Chi")) {
      z[i] <- qnorm(pchisq(t[i], df1[i]))
    } else if(type[i] == "z") {
      z[i] <- t[i]
    } else if(type[i] == "r") {
      # Fisher z-transformation of a correlation: atanh(r) * sqrt(n - 3)
      z[i] <- atanh(t[i]) * sqrt(df1[i] - 3)
    }
  }

  return(z)
}
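
# A minimal usage sketch (not part of the original analysis): convert a few toy
# statistics to z-scores; the values below are invented for illustration.
# For "r" the sample size goes in df1, matching how df1 is filled in further down.
convertToZ(t    = c(2.5, 4.0, 0.30),
           type = c("t", "F", "r"),
           df1  = c(30,  2,   50),
           df2  = c(NA,  50,  NA))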

# ############################################################
# #        RPP Manuscript Figure 1,2,3                       #
# #                                                          #
# # Created by Fred Hasselman on behalf of the RPP initiative #
# ############################################################
#
# # All the custom functions that are required to recreate the Figures are available in a GitHub sourceable file: `C-3PR.R`
# # It is available here: https://github.com/FredHasselman/toolboxR
#
# ############################################################
# # C-3PR: Evaluating Scientific *C*laims: *P*ost *P*ublication *P*eer *R*eview, *R*eplication and *R*eproduction.
# #
# # An R function library created in the context of analysing and visualising data generated by several Open Science initiatives.
# #
# # C-3PR Created by [Fred Hasselman](https://osf.io/ujgs6/) on behalf of the ManyLabs1, ManyLabs2, RPP and Curate Science projects
# ############################################################
#
#
# # SETUP local R-------------------------------------------------------------------
#
# # Use this code (from the devtools package) to source C-3PR directly from GitHub:
# require(devtools)
# source_url('https://raw.githubusercontent.com/FredHasselman/toolboxR/master/C-3PR.R')
#
# # This will load and (if necessary) install libraries frequently used for data management and plotting
# in.IT(c('ggplot2','RColorBrewer','lattice','gridExtra','plyr','dplyr','httr'))
#
# # Read the data from the OSF storage
# # Note: get.OSFfile() returns a list with the .csv data (df) and information (info) containing the URL download timestamp and original column and rownames (these names will be changed if dfCln=TRUE).
# RPPdata <- get.OSFfile(code='https://osf.io/fgjvw/',dfCln=T)$df
#
# ## If you downloaded the csv file to your hard drive, use this code:
# RPPdata <- read.csv('data/rpp_data.csv', stringsAsFactors = FALSE )
# RPPdata <- df.Clean(RPPdata)
# RPPdata <- RPPdata$df
#
# saveRDS(RPPdata, file = "data/rppclean.rds")
#

# Loading data ------------------------
# estimateTweedy() and its predict() method below come from the selectiveTweedy
# package (github.com/ammeir2/selectiveTweedy)
library(selectiveTweedy)
RPPdata <- readRDS(file = "dataAnalysis/rppclean.rds")

# Check for tied recalculated p-values among the original studies by comparing
# adjacent elements of the sorted vector
sortPvals <- sort(RPPdata$T.pval.recalc.O)
sum(sortPvals[1:(length(sortPvals) - 1)] == sortPvals[2:length(sortPvals)])
which(sortPvals[1:(length(sortPvals) - 1)] == sortPvals[2:length(sortPvals)])
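# Equivalent, more compact check (an added aside, not in the original script):
# in a sorted vector each adjacent equality corresponds to one duplicated value.
sum(duplicated(sortPvals))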

# Select the completed replication studies
RPPdata <- dplyr::filter(RPPdata, !is.na(T.pval.USE.O), !is.na(T.pval.USE.R))

# We have 99 studies for which p-values and effect sizes could be calculated
nrow(RPPdata)
# We have 97 studies with effect sizes available for both the original and the replication study
idOK <- complete.cases(RPPdata$T.r.O,RPPdata$T.r.R)
sum(idOK)

# Starting Analysis -------------------
# Inspect the reported test statistics for the original studies
RPPdata$Test.statistic.O
RPPdata$T.Test.value.O
stats <- RPPdata$T.Test.value.O
statType <- RPPdata$T.Test.Statistic.O
df1 <- RPPdata$T.df1.O
df2 <- RPPdata$T.df2.O
# For correlations the Fisher transform uses the sample size, so store N in df1
df1[statType == "r"] <- RPPdata$T.N.O[statType == "r"]

# Convert the recalculated two-sided p-values of the original studies to |z| scores;
# the 1e-15 floor keeps extreme p-values from mapping to infinite z-scores
originalZ <- qnorm(1 - pmax(RPPdata$T.pval.recalc.O, 10^-15) / 2)
# Replication z-scores, signed negative when the replication effect went in the opposite direction
repZ <- qnorm(1 - pmax(RPPdata$T.pval.recalc.R, 10^-15) / 2) * (1 - 2 * (RPPdata$Direction.R == "opposite"))
# Drop studies whose original z-score is missing
whichNA <- which(is.na(originalZ))
originalZ <- originalZ[-whichNA]
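
# A quick sanity check (an added aside, not in the original script): a two-sided
# p-value of 0.05 maps to |z| of about 1.96, and the 1e-15 floor above caps the
# most extreme studies at a z-score of roughly 8.
qnorm(1 - 0.05 / 2)    # ~1.96
qnorm(1 - 1e-15 / 2)   # ~8, the largest |z| the pmax() floor allows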

# Fit the Tweedie estimator from selectiveTweedy to the original z-scores
fit <- estimateTweedy(originalZ, k = 4, iterations = 2000,
                      keepEach = 50, nEstimates = 20,
                      maxsamp = length(originalZ) / 0.05,
                      lower = qnorm(0.025),
                      upper = qnorm(.975),
                      verbose = TRUE)

par(mfrow = c(1, 1), mar = rep(4, 4))
# Tweedie-adjusted z-scores for the original studies
adjusted <- predict(fit)
# Adjusted (black) and replication (blue) z-scores plotted against the original z-scores
plot(originalZ, adjusted)
points(originalZ, repZ[-whichNA], col = "blue")
abline(a = 0, b = 1)
# Adjusted z-scores against the replication z-scores
plot(repZ[-whichNA], adjusted)
abline(a = 0, b = 1)

# Bias of the naive and the Tweedie-adjusted z-scores relative to the replication z-scores
mean(originalZ - repZ[-whichNA], na.rm = TRUE)
mean(adjusted - repZ[-whichNA], na.rm = TRUE)

# Corresponding mean squared errors
mean((originalZ - repZ[-whichNA])^2, na.rm = TRUE)
mean((adjusted - repZ[-whichNA])^2, na.rm = TRUE)
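
# A small hedged helper (not part of the original script) bundling the two
# summaries above: signed bias and mean squared error of a set of estimates
# measured against the replication z-scores.
compareToReplication <- function(estimate, target) {
  c(bias = mean(estimate - target, na.rm = TRUE),
    mse  = mean((estimate - target)^2, na.rm = TRUE))
}
compareToReplication(originalZ, repZ[-whichNA])
compareToReplication(adjusted, repZ[-whichNA])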

# Densities of the original (black), replication (blue) and adjusted (red) z-scores
plot(density(originalZ))
lines(density(repZ[!is.na(repZ)]), col = "blue")
lines(density(adjusted[!is.na(adjusted)]), col = "red")

# Seriously now ------------------
# Recompute the original z-scores from the recalculated two-sided p-values and drop missing values
originalZ <- qnorm(1 - pmax(RPPdata$T.pval.recalc.O, 10^-15) / 2)
# repZ <- qnorm(1 - pmax(RPPdata$T.pval.recalc.R, 10^-15) / 2) * (1 - 2 * (RPPdata$Direction.R == "opposite"))
whichNA <- which(is.na(originalZ))
originalZ <- originalZ[-whichNA]
# switch <- (1 - 2 * rbinom(length(originalZ), 1, 0.5))
s <- length(originalZ)
# Mirror the z-scores around zero so the fitted mixture is symmetric;
# entries 1:s are the observed values, entries (s + 1):(2 * s) their negatives
originalZ <- c(originalZ, -originalZ)
# Signed replication z-scores: negative when the replication effect went in the opposite direction
repZ <- qnorm(1 - pmax(RPPdata$T.pval.recalc.R, 10^-15) / 2) * (1 - 2 * (RPPdata$Direction.R == "opposite"))
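
# A quick sanity check on the mirroring step (an added aside, not in the
# original script): the doubled vector has 2 * s entries and is symmetric
# around zero by construction.
stopifnot(length(originalZ) == 2 * s)
mean(originalZ)  # ~0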


results <- list()
slot <- 1
for(k in c(9)) {
  # Mean constraints for the k-component mixture: one component fixed at zero
  # ("0a") and the remaining components in symmetric +/- pairs ("-a"/"a",
  # "-b"/"b", ...). This reproduces the hand-written label sets for every odd
  # k from 3 to 19 (an expanded example for k = 5 follows the loop).
  meanc <- c("0a",
             paste0(rep(c("-", ""), (k - 1) / 2),
                    rep(letters[seq_len((k - 1) / 2)], each = 2)))
  fit <- estimateTweedy(originalZ, k = k, iterations = 400,
                        keepEach = 10, nEstimates = 20,
                        maxsamp = length(originalZ) / 0.05,
                        lower = qnorm(0.025),
                        upper = qnorm(.975),
                        verbose = TRUE,
                        lbound = 0,
                        sd.constr = rep(1.5, k),
                        mean.constr = meanc,
                        arbvar = TRUE,
                        ECM = TRUE)
  results[[slot]] <- fit
  slot <- slot + 1
}
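
# For reference (an added aside, not in the original script), the expression
# used above for the mean constraints expands like this for a small mixture,
# e.g. k = 5:
c("0a", paste0(rep(c("-", ""), 2), rep(letters[1:2], each = 2)))
# "0a" "-a" "a"  "-b" "b"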
# save(results, file = "dataAnalysis/results/science fit1.Robj")

forplot <- list()
slot <- 1
# Sign of the replication effect relative to the original (+1 same direction, -1 opposite)
opposite <- (1 - 2 * (RPPdata$Direction.R == "opposite"))
# Rescale the replication z-scores to the original sample sizes
sizeRatio <- sqrt(RPPdata$T.N.O / RPPdata$T.N.R)
sizeRatio <- sizeRatio[-whichNA]
opposite <- opposite[-whichNA]
for(i in seq_along(results)) {
  fit <- results[[i]]
  # Keep the first s predictions, i.e. those for the observed (non-mirrored) z-scores
  adjusted <- predict(fit)[1:s]
  bias <- mean(adjusted - abs(repZ[-whichNA]) * opposite * sizeRatio, na.rm = TRUE)
  bdat <- data.frame(estimate = adjusted,
                     method = paste("k = ", length(fit$fit[[1]]$mu),
                                    " B = ", round(bias, 2), sep = ""))
  forplot[[slot]] <- bdat
  slot <- slot + 1
}

# Bias of the naive (unadjusted) z-scores relative to the rescaled replication z-scores
bias <- mean(originalZ[1:s] - abs(repZ[-whichNA]) * opposite * sizeRatio, na.rm = TRUE)
forplot[[slot]] <- data.frame(estimate = originalZ[1:s],
                              method = paste("naive",
                                             " B = ", round(bias, 2), sep = ""))
forplot <- do.call("rbind", forplot)
# forplot$estimate <- abs(forplot$estimate)

forplot <- rbind(forplot,
                 data.frame(estimate = abs(repZ[-whichNA]) * opposite * sizeRatio,
                            method = "replication"))


library(ggplot2)
# Densities of the different estimates; the excluded label is the hard-coded
# method name ("k = 9 B = 0.07") generated by the loop above for that fit
ggplot(subset(forplot, method != "k = 9 B = 0.07")) +
  geom_density(aes(x = estimate, col = method, linetype = method,
                   fill = method), alpha = 0.1) +
  theme_bw()

# Use the fit kept by the loop above (only the k = 9 fit is stored here)
adjusted <- predict(results[[1]])[1:s]
# Set extreme negative predictions to zero
adjusted[adjusted < -10] <- 0
replication <- abs(repZ[-whichNA]) * opposite * sizeRatio
naive <- originalZ[1:s]
plot(naive, replication)
abline(v = 0, h = 0)
abline(a = 0, b = 1, col = "grey")
plot(adjusted, replication)
abline(v = 0, h = 0)
abline(a = 0, b = 1, col = "grey")
# Adjustment curve: adjusted z-scores as a function of the sorted naive z-scores
points(sort(naive), adjusted[order(naive)], col = "blue", type = "l", lwd = 2)

# Side-by-side: naive vs. replication and adjusted vs. replication z-scores
par(mfrow = c(1, 2))
plot(naive, replication)
abline(v = 0, h = 0)
abline(a = 0, b = 1, col = "grey")
plot(adjusted, replication)
abline(v = 0, h = 0)
abline(a = 0, b = 1, col = "grey")