In bips-hb/DataTrainCausalLearning: Practicals for the Data Train Course "Causal learning" 2021 (V Didelez)

knitr::opts_chunk$set(echo = TRUE, 
                      fig.width = 6, 
                      message = F, 
                      warning = F, 
                      cache = T, 
                      eval = TRUE)

library(DataTrainCausalLearning)
library(tidyverse)
library(pcalg)

Descriptive statistics

data("tcgas")

dim(tcgas)

head(tcgas)

plot(tcgas, cex = 0.3)

tcgas %>% 
  gather() %>% 
  ggplot(aes(x = value)) +
  geom_histogram(col = "black", fill = "white") + 
  theme_light() +
  facet_wrap(~ key)

PC-algorithm

pcfit <- pc(
  suffStat = list(C = cor(tcgas), n = dim(tcgas)[1]), # correlation matrix and the number of observations
  indepTest = gaussCItest, # our choice of independence test
  alpha = 0.05, # our choice of "significance level"
  labels = colnames(tcgas), # variable names
  maj.rule = T, solve.confl = T, u2pd = "relaxed"
)

plot(pc(
  suffStat = list(C = cor(tcgas), n = dim(tcgas)[1]),
  indepTest = gaussCItest,
  alpha = 0.01,
  labels = colnames(tcgas),
  maj.rule = T, solve.confl = T, u2pd = "relaxed"
)@graph, main = "alpha = 1%")

plot(pcfit@graph, main = "alpha = 5%")

plot(pc(
  suffStat = list(C = cor(tcgas), n = dim(tcgas)[1]),
  indepTest = gaussCItest,
  alpha = 0.1,
  labels = colnames(tcgas),
  maj.rule = T, solve.confl = T, u2pd = "relaxed"
)@graph, main = "alpha = 10%")

FCI-algorithm

fcifit <- fci(
  suffStat = list(C = cor(tcgas), n = dim(tcgas)[1]),
  indepTest = gaussCItest,
  alpha = 0.05,
  labels = colnames(tcgas),
  maj.rule = T,
  selectionBias = F
)

plot(fcifit)

GES-algorithm

gesfit <- ges(new("GaussL0penObsScore", tcgas))

plot(gesfit$essgraph)

LiNGAM

lingamfit <- lingam(tcgas)

lingam2graph <- function(fit,data){

  amat <- t(fit$Bpruned!=0)

  colnames(amat) <- colnames(data)

  output <- as(amat, "graphNEL")

  return(output)
}

lingamgraph <- lingam2graph(fit = lingamfit, # fitted object
                            data = tcgas # data used to fit the object
                            )

plot(lingamgraph)

Kernel independence tests

library(kpcalg)

pcfit_kernel05 <- pc(suffStat = list(data = tcgas, ic.method = "dcc.perm"),
                     indepTest = kernelCItest,
                     alpha = 0.05,
                     labels = colnames(tcgas),
                     maj.rule = TRUE, solve.confl = TRUE, u2pd = "relaxed"
                     )

pcfit_kernel10 <- pc(suffStat = list(data = tcgas, ic.method = "dcc.perm"),
                     indepTest = kernelCItest,
                     alpha = 0.1,
                     labels = colnames(tcgas),
                     maj.rule = TRUE, solve.confl = TRUE, u2pd = "relaxed"
                     )

par(mfrow = c(1,2))

plot(pcfit_kernel05, main = "alpha = 5%")
plot(pcfit_kernel10, main = "alpha = 10%")

Generalised Covariance Measure

library(GeneralisedCovarianceMeasure)

use_gcm <- function(x, y, S, suffStat){

  a <- as.matrix(suffStat$data)[,x]
  b <- as.matrix(suffStat$data)[,y]
  c <- as.matrix(suffStat$data)[,S]

  test <- gcm.test(a, b, c, alpha = suffStat$alpha)
  p_value <- test[[1]]

  return(p_value)
}

pcfit_gcm05 <- pc(suffStat = list(data = tcgas, alpha = 0.05),
                  indepTest = use_gcm,
                  alpha = 0.05,
                  labels = colnames(tcgas),
                  maj.rule = TRUE, solve.confl = TRUE, u2pd = "relaxed"
                  )

pcfit_gcm10 <- pc(suffStat = list(data = tcgas, alpha = 0.1),
                  indepTest = use_gcm,
                  alpha = 0.1,
                  labels = colnames(tcgas),
                  maj.rule = TRUE, solve.confl = TRUE, u2pd = "relaxed"
                  )

par(mfrow = c(1,2))

plot(pcfit_gcm05, main = "alpha = 5%")
plot(pcfit_gcm10, main = "alpha = 10%")

Discrete data

data("tcgadisc")

disCItest_mod <- function(x, y, S, suffStat){
  if (is.data.frame(dm <- suffStat$dm)) 
        dm <- data.matrix(dm)
  else stopifnot(is.matrix(dm))
  nlev <- suffStat$nlev
  adaptDF <- suffStat$adaptDF
  gSquareDis(x = x, y = y, S = S, dm = dm, nlev = nlev, adaptDF = adaptDF,
             verbose = FALSE, n.min = -1)
}

pcfit_d05 <- pc(suffStat = list(dm = tcgadisc, adaptDF = F),
                indepTest = disCItest_mod,
                alpha = 0.05,
                labels = colnames(tcgadisc),
                maj.rule = T, solve.confl = T, u2pd = "relaxed"
                )

pcfit_d10 <- pc(suffStat = list(dm = tcgadisc, adaptDF = F),
                indepTest = disCItest_mod,
                alpha = 0.1,
                labels = colnames(tcgadisc),
                maj.rule = T, solve.confl = T, u2pd = "relaxed"
                )

par(mfrow = c(1,2))

plot(pcfit_d05@graph, main = "alpha = 5%")
plot(pcfit_d10@graph, main = "alpha = 10%")

Comparison

par(mfrow = c(1,2))
plot(pcfit@graph, main = "PC (left) and FCI (right)")
plot(fcifit)
par(mfrow = c(1,2))
plot(gesfit$essgraph, main = "GES")
plot(lingamgraph, main = "LiNGAM")
par(mfrow = c(1,2))
plot(pcfit_gcm10@graph, main = "GCM")
plot(pcfit_kernel10@graph, main = "Kernel")
par(mfrow = c(1,1))
plot(pcfit_d10@graph, main = "Discrete")

possAn(t(as(as(gesfit$essgraph, "graphNEL"), "Matrix")), # the graph has to be specified as a matrix
       x = 4, # indicates the position of the variables of interest
       type = "cpdag" # type of graph
       )

colnames(tcgas)[c(1,4,5,6,7)]

possDe(t(as(as(gesfit$essgraph, "graphNEL"), "Matrix")), 
       x = 4, 
       type = "cpdag" 
       )

colnames(tcgas)[c(2,3,4,8)]

IDA algorithm

summary(lm(CDK6 ~ HMGA2 + BAX + CDKN2A + CDKN1A + MDM2 + SERPINE1 + THBS1, data = tcgas))$coefficients

names(tcgas)

ida(x.pos = 4, # position of HMGA2 in the dataset 
    y.pos = 8, # position of CDK6 in the dataset
    cov(tcgas), # covariance matrix of the dataset
    graphEst = as(gesfit$essgraph,"graphNEL"), # estimated graph
    method = "local", # type of adjustment set
    type = "cpdag", # type of the estimated graph
    verbose = TRUE
      )

ida(x.pos = 4, # position of HMGA2 in the dataset 
    y.pos = 8, # position of CDK6 in the dataset
    cov(tcgas), # covariance matrix of the dataset
    graphEst = as(gesfit$essgraph,"graphNEL"), # estimated graph
    method = "local", # type of adjustment set
    type = "cpdag", # type of the estimated graph
    verbose = TRUE
      ) -> idaest1

summary(lm(CDK6 ~ HMGA2 + SERPINE1 + THBS1, data = tcgas))$coefficients

ida(x.pos = 4,
    y.pos = 8,
    cov(tcgas),
    graphEst = as(gesfit$essgraph,"graphNEL"),
    method = "optimal",
    type = "cpdag",
    verbose = TRUE
      )

summary(lm(CDK6 ~ HMGA2 + SERPINE1 + THBS1 + MDM2, data = tcgas))$coefficients

ida(x.pos = 5,
    y.pos = 8,
    cov(tcgas),
    graphEst = as(gesfit$essgraph,"graphNEL"),
    method = "local",
    type = "cpdag",
    verbose = T
      )