Nothing
##' Estimates the causal parents S of the target variable Y using
##' invariant causal prediction and fits a general model of the form
##' \cr Y = f(X^S) + N.
##'
##' The function can be applied to models of the form\cr Y_i =
##' f(X_i^S) + N_i\cr with iid noise N_i and f is from a specific
##' function class, which the regression procedure given by the
##' parameter \code{regression.fun} should be able to approximate.\cr
##'
##' The invariant prediction procedure is applied using the hypothesis
##' test specified by the \code{test} parameter to determine whether a
##' candidate model is invariant. For further details see the
##' references.
##' @title Non-linear Invariant Causal Prediction
##' @param X matrix of predictor variables. Each column corresponds to
##' one predictor variable.
##' @param Y vector of target variable, with length(Y)=nrow(X).
##' @param test string specifying the hypothesis test used to test for
##' invariance of a parent set S (i.e. the null hypothesis
##' H0_S). The following tests are available: "block.mean",
##' "block.variance", "block.decoupled", "smooth.mean",
##' "smooth.variance", "smooth.decoupled" and "hsic".
##' @param par.test parameters specifying hypothesis test. The
##' following parameters are available: \code{grid}, \code{complements},
##' \code{link}, \code{alpha} and \code{B}. The parameter \code{grid} is an increasing
##' vector of gridpoints used to construct enviornments for change
##' point based tests. If the parameter \code{complements} is 'TRUE' each
##' environment is compared against its complement if it is 'FALSE'
##' all environments are compared pairwise. The parameter \code{link}
##' specifies how to compare the pairwise test statistics, generally
##' this is either max or sum. The parameter \code{alpha} is a numeric
##' value in (0,1) indicting the significance level of the
##' hypothesis test. The parameter \code{B} is an integer and specifies
##' the number of Monte-Carlo samples used in the approximation of
##' the null distribution.
##' @param regression.fun regression function used to fit the function
##' f. This should be a function which takes the argument (X,Y) and
##' outputs the predicted values f(Y).
##' @param max.parents integer specifying the maximum size for
##' admissible parents. Reducing this below the number of predictor
##' variables saves computational time but means that the confidence
##' intervals lose their coverage property.
##' @param stopIfEmpty if ‘TRUE’, the procedure will stop computing
##' confidence intervals if the empty set has been accepted (and
##' hence no variable can have a signicificant causal
##' effect). Setting to‘TRUE’ will save computational time in these
##' cases, but means that the confidence intervals lose their
##' coverage properties for values different to 0.
##' @param silent If 'FALSE', the procedure will output progress
##' notifications consisting of the currently computed set S
##' together with the p-value resulting from the null hypothesis
##' H0_S
##'
##' @return object of class 'seqICPnl' consisting of the following
##' elements
##'
##' \item{parent.set}{vector of the estimated causal parents.}
##'
##' \item{test.results}{matrix containing results from each individual
##' hypothesis test H0_S as rows. The first column ind links the rows in the
##' \code{test.results} matrix to the position in the list of the variable
##' \code{S}.}
##'
##' \item{S}{list of all the sets that were
##' tested. The position within the list corresponds to the index in
##' the first column of the test.results matrix.}
##'
##' \item{p.values}{p-value for being not included in the set of true
##' causal parents. (If a p-value is smaller than alpha, the
##' corresponding variable is a member of parent.set.)}
##'
##' \item{stopIfEmpty}{a boolean value indicating whether computations
##' stop as soon as intersection of accepted sets is empty.}
##'
##' \item{modelReject}{a boolean value indicating if the whole model was
##' rejected (the p-value of the best fitting model is too low).}
##'
##' \item{alpha}{significance level at which the hypothesis tests were performed.}
##'
##' \item{n.var}{number of predictor variables.}
##'
##' @export
##'
##' @import stats utils
##'
##' @author Niklas Pfister and Jonas Peters
##'
##' @references
##' Pfister, N., P. Bühlmann and J. Peters (2017).
##' Invariant Causal Prediction for Sequential Data. ArXiv e-prints (1706.08058).
##'
##' Peters, J., P. Bühlmann, and N. Meinshausen (2016).
##' Causal inference using invariant prediction: identification and confidence intervals.
##' Journal of the Royal Statistical Society, Series B (with discussion) 78 (5), 947–1012.
##'
##' @seealso The function \code{\link{seqICPnl.s}} can be used to
##' perform the individual hypothesis tests
##' H0_S. \code{\link{seqICP}} and \code{\link{seqICP.s}} are the
##' corresponding functions for classical sequential invariant
##' causal prediction.
##'
##' @examples
##' set.seed(2)
##'
##' # environment 1
##' na <- 120
##' X1a <- 0.3*rnorm(na)
##' X3a <- X1a + 0.2*rnorm(na)
##' Ya <- 2*X1a^2 + 0.6*sin(X3a) + 0.1*rnorm(na)
##' X2a <- -0.5*Ya + 0.5*X3a + 0.1*rnorm(na)
##'
##' # environment 2
##' nb <- 80
##' X1b <- 2*rnorm(nb)
##' X3b <- rnorm(nb)
##' Yb <- 2*X1b^2 + 0.6*sin(X3b) + 0.1*rnorm(nb)
##' X2b <- -0.5*Yb + 0.8*rnorm(nb)
##'
##' # combine environments
##' X1 <- c(X1a,X1b)
##' X2 <- c(X2a,X2b)
##' X3 <- c(X3a,X3b)
##' Y <- c(Ya,Yb)
##' Xmatrix <- cbind(X1, X2, X3)
##'
##' # use GAM as regression function
##' GAM <- function(X,Y){
##' d <- ncol(X)
##' if(d>1){
##' formula <- "Y~1"
##' names <- c("Y")
##' for(i in 1:(d-1)){
##' formula <- paste(formula,"+s(X",toString(i),")",sep="")
##' names <- c(names,paste("X",toString(i),sep=""))
##' }
##' data <- data.frame(cbind(Y,X[,-1,drop=FALSE]))
##' colnames(data) <- names
##' fit <- fitted.values(mgcv::gam(as.formula(formula),data=data))
##' } else{
##' fit <- rep(mean(Y),nrow(X))
##' }
##' return(fit)
##' }
##'
##' # Y follows the same structural assignment in both environments
##' # a and b (cf. the lines Ya <- ... and Yb <- ...).
##' # The direct causes of Y are X1 and X3.
##' # A GAM model fit considers X1, X2 and X3 as significant.
##' # All these variables are helpful for the prediction of Y.
##' summary(mgcv::gam(Y~s(X1)+s(X2)+s(X3)))
##'
##' # apply seqICP to the same setting
##' seqICPnl.result <- seqICPnl(X = Xmatrix, Y, test="block.variance",
##' par.test = list(grid = seq(0, na + nb, (na + nb)/10), complements = FALSE, link = sum,
##' alpha = 0.05, B =100), regression.fun = GAM, max.parents = 4, stopIfEmpty=FALSE, silent=FALSE)
##' summary(seqICPnl.result)
##' # seqICPnl is able to infer that X1 and X3 are causes of Y
seqICPnl <- function(X, Y,
test="block.variance",
par.test=list(grid=c(0,round(nrow(X)/2),nrow(X)),
complements=FALSE,
link=sum,
alpha=0.05,
B=100),
regression.fun=function(X,Y) fitted.values(lm.fit(X,Y)),
max.parents=ncol(X),
stopIfEmpty=TRUE,
silent=TRUE)
{
# preprocess X and Y
X <- unname(as.matrix(X))
Y <- unname(as.matrix(Y))
# add defaults if missing in parameter lists
if(missing(max.parents)){
max.parents <- ncol(X)
}
if(missing(stopIfEmpty)){
stopIfEmpty <- TRUE
}
if(missing(silent)){
silent <- TRUE
}
if(!exists("grid",par.test)){
par.test$grid <- c(0,round(nrow(X)/2),nrow(X))
}
if(!exists("complements",par.test)){
par.test$complements <- FALSE
}
if(!exists("link",par.test)){
par.test$link <- sum
}
if(!exists("alpha",par.test)){
par.test$alpha <- 0.05
}
if(!exists("B",par.test)){
par.test$B <- 100
}
# read out parameters from par.test
alpha <- par.test$alpha
# number of variables
d <- ncol(X)
# number of observations
n <- nrow(X)
###
# Compute parent set
###
# initialize variables
parent.set <- 1:d
len <- 1
ind <- 1
S <- list(numeric(0))
# check wheter model is invariant over the empty set S={}
if(!silent){
message("Currently fitting set S = {}")
}
res <- seqICPnl.s(X, Y, S[[1]], test, par.test, regression.fun)
result <- data.frame(ind=1,p.value=res$p.value,test.stat=res$test.stat,crit.value=res$crit.value, p=res$p)
if(!silent){
message(paste("p-value:",toString(round(res$p.value,digits=2))))
}
# compute intersection of rejected sets
if(res$p.value>alpha){
parent.set <- numeric(0)
}
empty.int <- length(parent.set)==0
# iterate over all sets until intersection is empty or max.parents is reached
while(len<=max.parents & len<=d & !(empty.int&stopIfEmpty)){
S <- append(S,combn(d,len,simplify=FALSE))
while(ind<length(S) & !(empty.int&stopIfEmpty)){
ind <- ind+1
# use the candiate set S[[ind]]
if(!silent){
message(paste("Currently fitting set S = {",toString(S[[ind]]),"}",sep=""))
}
res <- seqICPnl.s(X, Y, S[[ind]], test, par.test, regression.fun)
result <- rbind(result,data.frame(ind,p.value=res$p.value,test.stat=res$test.stat,crit.value=res$crit.value, p=res$p))
if(!silent){
message(paste("p-value:",toString(round(res$p.value,digits=2))))
}
# compute intersection of rejected sets
if(res$p.value>alpha){
parent.set <- intersect(parent.set,S[[ind]])
}
empty.int <- length(parent.set)==0
}
len <- len+1
}
# check if all tests where rejected (model is rejected)
if(sum(result$p.value<=alpha)==length(result$p.value)){
modelReject <- TRUE
empty.int <- TRUE
parent.set <- numeric(0)
}
else{
modelReject <- FALSE
if(empty.int){
parent.set <- numeric(0)
}
}
###
# Compute p-values for all variables (only if stopIfEmpty==FALSE)
###
if(!stopIfEmpty|length(S[[length(S)]])==d){
p.value <- numeric(d)
j.vec <- vector("logical",length(S))
for(j in 1:d){
for(i in 1:length(S)){
j.vec[i] <- j %in% S[[i]]
}
pval1 <- result$p.value[j.vec]
pval2 <- result$p.value[!j.vec]
if(max(pval1)>alpha){
p.value[j] <- max(pval2)
}
else{
p.value[j] <- 1
}
}
}
else{
p.value="not computed since stopIfEmpty==TRUE"
}
icp.result <- list(parent.set=parent.set,
test.results=result,
S=S,
p.values=p.value,
stopIfEmpty=stopIfEmpty,
modelReject=modelReject,
alpha=alpha,
n.var=d)
class(icp.result) <- "seqICPnl"
return(icp.result)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.