R/functions.R

Defines functions Z_validity_test gen_YD_dist gen_dists_multi gen_boot_dist compute_stat_multi compute_PQgrids_multi boot_stats_multi boot_compute_stat_multi

Documented in Z_validity_test

boot_compute_stat_multi <- function(boot_dist, m, lambda, K, L0, L1, limit, xis, xiN){

  temp <- list("P_D0" = boot_dist[1:L0, ], "P_D1" = boot_dist[(L0 + 1):(L0 + L1), ]);

  Tw <- compute_stat_multi(temp, m, lambda, K, limit, xis, xiN);

  return(Tw)

}

boot_stats_multi <- function(dists, m, lambda, K, xis, xiN, B, limit){

  L0 <- dim(dists$P_D0)[1];
  L1 <- dim(dists$P_D1)[1];

  YD_dist <- gen_YD_dist(dists, lambda, K);

  boot_stats <- matrix(0,xiN,B);

  for (b in 1:B){
    boot_dist <- gen_boot_dist(YD_dist, m, K, L0, L1);
    boot_stats[, b] <- boot_compute_stat_multi(boot_dist, m, lambda, K, L0, L1, limit, xis, xiN);
  }

  return(boot_stats)

}

compute_PQgrids_multi <- function(Q_D0, P_D0, Q_D1, P_D1, limit){

  L1 <- length(P_D1);
  L0 <- length(P_D0);

  ### Constructing intervals for the test statistic with D=1 ###

  if (Q_D1[L1] == 0){
    # Case where there is no mass in the D=1, Z=0 distribution
    p_D1 <- P_D1 - c(0, P_D1[1:(L1 - 1)]);
    Q1 <- 0;
    P1 <- min(p_D1);
  }
  else{
    D1 <- cbind(P_D1, c(0, P_D1[1:(L1 - 1)]), Q_D1, c(0, Q_D1[1:(L1 - 1)]));
    D1 <- subset(D1, D1[, 3] > D1[ ,4]);

    if (dim(D1)[1] > limit){
      # Coarsen the grid in a way that will preserve the maximum
      # Check how many intervals there are where q_D1>p_D1 and p_D1>q_D1
      q_D1 <- D1[, 3] - D1[, 4];
      p_D1 <- D1[, 1] - D1[, 2];
      positive <- sum(q_D1 > p_D1);
      if(Q_D1[L1] < 0.5 & positive > 0 & positive < dim(D1)[1]){
        # Can exclude any point where the difference is negative
        D1 <- subset(D1, q_D1 > p_D1);
      }
      if (positive > 1){
        # Merge consecutive points with p_D1 = 0
        D1 <- cbind(D1, c(0, D1[2:dim(D1)[1], 1] ==  D1[1:(dim(D1)[1] - 1), 2]));
        ind <- D1[, 5] + c(D1[2:dim(D1)[1], 5], 0);

        D1 <- subset(D1, ind < 2);
        D1[1:(dim(D1)[1] - 1), 3] <- (1- D1[2:dim(D1)[1], 5]) * D1[1:(dim(D1)[1] - 1), 3] + D1[2:dim(D1)[1], 5] * D1[2:dim(D1)[1], 3];
        D1 <- subset(D1, D1[, 5] == 0);
      }
    }
    Q1 <- D1[, 3] - matrix(D1[, 4], nrow = dim(D1)[1], ncol = dim(D1)[1], byrow = T);
    Q1 <- Q1[lower.tri(Q1, T)];
    P1 <- D1[, 1] - matrix(D1[, 2], nrow = dim(D1)[1], ncol = dim(D1)[1], byrow = T);
    P1 <- P1[lower.tri(P1, T)];
  }

  ### Constructing intervals for the test statistic with D=0 ###

  if (P_D0[L0] == 0){
    # case where there is no mass in the D=0 Z=1 distribution
    q_D0 <- Q_D0 - c(0, Q_D0[1:(L0 - 1)]);
    P0 <- 0;
    Q0 <- min(q_D0);
  }
  else{
    D0 <- cbind(P_D0, c(0, P_D0[1:(L0 - 1)]), Q_D0, c(0, Q_D0[1:(L0 - 1)]));
    D0 <- subset(D0, D0[, 1] > D0[, 2])

    if (dim(D0)[1] > limit){
      # Coarsen the grid in a way that will preserve the maximum
      # Check how many intervals there are where q_D0>p_D0 and p_D0>q_D0
      p_D0 <- D0[, 1] - D0[, 2];
      q_D0 <- D0[, 3] - D0[, 4];
      positive <- sum(p_D0 > q_D0);

      if(P_D0[L0] < 0.5 & positive > 0 & positive < dim(D0)[1]){
        # Can exclude any point where the difference is negative
        D0 <- subset(D0, p_D0 > q_D0);
      }

      if (positive > 1){
        # Merge consecutive points with q_D0 = 0
        D0 <- cbind(D0, c(0, D0[2:dim(D0)[1], 3] ==  D0[1:(dim(D0)[1] - 1), 4]));
        ind <- D0[, 5] + c(D0[2:dim(D0)[1], 5], 0);

        D0 <- subset(D0, ind < 2);
        D0[1:(dim(D0)[1] - 1), 1] <- (1- D0[2:dim(D0)[1], 5]) * D0[1:(dim(D0)[1] - 1), 1] + D0[2:dim(D0)[1], 5] * D0[2:dim(D0)[1], 1];
        D0 <- subset(D0, D0[, 5] == 0);
      }
    }
    Q0 <- D0[, 3] - matrix(D0[, 4], nrow = dim(D0)[1], ncol = dim(D0)[1], byrow = T);
    Q0 <- Q0[lower.tri(Q0, T)];
    P0 <- D0[, 1] - matrix(D0[, 2], nrow = dim(D0)[1], ncol = dim(D0)[1], byrow = T);
    P0 <- P0[lower.tri(P0, T)];
  }

  return(list("Q1" = Q1,"P1" = P1, "Q0" = Q0, "P0" = P0));
}

compute_stat_multi <- function(dists, m, lambda, K, limit, xis, xiN){

  Tw <- matrix(0, nrow = K, ncol = xiN);

  for (k in 1:(K - 1)){

    PQ <- compute_PQgrids_multi(dists$P_D0[, k], dists$P_D0[, k + 1] , dists$P_D1[, k], dists$P_D1[, k + 1], limit)

    Q_P_D1 <- PQ$Q1 - PQ$P1;
    P_Q_D0 <- PQ$P0 - PQ$Q0;

    weight_D1 <- sqrt(lambda[k] * PQ$Q1 * (1 - PQ$Q1) + (1 - lambda[k]) * PQ$P1 * (1 - PQ$P1));
    weight_D0 <- sqrt(lambda[k] * PQ$Q0 * (1 - PQ$Q0) + (1 - lambda[k]) * PQ$P0 * (1 - PQ$P0));

    for (x in 1:xiN){

      xi <- xis[x];

      weight_D1 <- pmax(xi, weight_D1);
      weight_D0 <- pmax(xi, weight_D0);

      weightedQ_P_D1 <- Q_P_D1 / weight_D1;
      weightedP_Q_D0 <- P_Q_D0 / weight_D0;

      Tw[k, x] <- (as.numeric(m[k]) * as.numeric(m[k + 1]) / (m[k] + m[k + 1])) ^ (0.5) * max(weightedQ_P_D1, weightedP_Q_D0);
    }
  }

  Twfinal <- rep(0,xiN);
  for (x in 1:xiN){
    Twfinal[x] <- max(Tw[, x])
  }

  return(Twfinal)

}

gen_boot_dist <- function(YD_dist, m , K, L0, L1){

  boot_dist <- matrix(0, L0 + L1, K);

  for (k in 1:K){
    boot_data <- sample.int((L0 + L1), size = m[k], replace = TRUE, prob = YD_dist[, k]);
    boot_pdf <- tabulate(boot_data, nbins = (L0 + L1))/m[k];
    boot_D0 <- cumsum(boot_pdf[1:L0]);
    boot_D1 <- cumsum(boot_pdf[(L0 + 1):(L0 + L1)]);
    boot_dist[, k] <- c(boot_D0, boot_D1)
  }

  return(boot_dist);
}

gen_dists_multi <-function(data, m, Z_order, K){

  YZ_D0 <- subset(data, data$D == 0);
  YZ_D0 <- YZ_D0[order(YZ_D0[, 1]), ];
  YZ_D1 <- subset(data, data$D == 1);
  YZ_D1 <- YZ_D1[order(YZ_D1[, 1]), ];

  Y0_grid <- unique(YZ_D0$Y);
  Y1_grid <- unique(YZ_D1$Y);

  P_D0 <- matrix(0, nrow = length(Y0_grid), ncol = K);
  P_D1 <- matrix(0, nrow = length(Y1_grid), ncol = K);

  for (k in 1:K){
    Y_D0 <- subset(YZ_D0$Y, YZ_D0$Z == Z_order[k]);
    P_D0[, k] <- findInterval(Y0_grid, Y_D0) / m[k];
    Y_D1 <- subset(YZ_D1$Y, YZ_D1$Z == Z_order[k]);
    P_D1[, k] <- findInterval(Y1_grid, Y_D1) / m[k];
  }

  return(list("P_D0" = P_D0, "P_D1" = P_D1));
}

gen_YD_dist <- function(dists, lambda, K){

  if(K > 2){
    propscore <- dists$P_D1[dim(dists$P_D1)[1], ];
    propscore_comp <- (propscore[2:(K - 1)]-propscore[1:(K - 2)]) ^ 2 - (propscore[2:(K - 1)]-propscore[3:K]) ^ 2;
    propscore_comp <- c(1, propscore_comp > 0, 0);
  }
  else{
    propscore_comp <- c(1, 0);
  }

  dist_sum <- diag(lambda, nrow = K, ncol = (K - 1));
  dist_sum[2:K, ] <- dist_sum[2:K, ] + diag((1 - lambda), nrow = (K - 1), ncol = (K - 1));
  Ydist_D0 <- dists$P_D0 %*% dist_sum;
  Ydist_D1 <- dists$P_D1 %*% dist_sum;
  Ydist_D0D1 <- rbind(Ydist_D0, Ydist_D1);

  YD_dist <- matrix(0, dim(Ydist_D0D1)[1], K);
  for (k in 1:K){
    YD_dist[, k] <- Ydist_D0D1[, k - 1 + propscore_comp[k]];

  }

  return(YD_dist);
}

Z_validity_test <- function(Y, D, Z, Z_order, xis, B = 500, alpha = c(0.1, 0.05, 0.01)){

  data <- data.frame("Y" = Y, "D" = D, "Z" = Z);
  data <- na.omit(data);

  limit = (10 ^ 4)
  limit <- floor((-1 + sqrt(1 + (8 * limit))) / 2);

  xis <- sort(xis);
  xiN <- length(xis);
  K <- length(Z_order);

  m <- rep(0, K);
  for (k in 1:K){
    m[k] <- nrow(subset(data, data$Z == Z_order[k]));
  }
  lambda <- m[2:K] / (m[1:(K - 1)] + m[2:K]);

  dists <- gen_dists_multi(data, m, Z_order, K);

  Tw <- compute_stat_multi(dists, m, lambda, K, limit, xis, xiN);

  boot_stats <- boot_stats_multi(dists, m, lambda, K, xis, xiN, B, limit)

  pval <- rowSums(Tw <= boot_stats)/B;
  cvs <- apply(boot_stats, 1, quantile, probs = (1 - alpha));

  return(list("teststat" = Tw, "pvals" = pval, "cvs" = cvs));

}
CarrThomas/TestforInstrumentValidity documentation built on July 1, 2020, 12:05 a.m.