R/abn-toolbox.R

Defines functions scoreContribution essentialGraph skewness simulateDag infoDag compareEG compareDag odds or expit logit

Documented in compareDag compareEG essentialGraph expit infoDag logit odds or scoreContribution simulateDag

###############################################################################
## abn-toolbox.R ---
## Authors : Gilles Kratzer, Reinhard Furrer
###############################################################################

##-------------------------------------------------------------------------
## External usefull functions to analayse ABN and BN
##-------------------------------------------------------------------------

# functions logit
logit <- function(x) {
    log(x/(1 - x))
}

# functions expit
expit <- function(x) {
    exp(x)/(1 + exp(x))
}

# function that compute an odds ratio from a table/matrix
or <- function(x) {

    x <- as.matrix(x)

    if (dim(x)[1] != 2 || dim(x)[2] != 2) {
        stop("The contengency table should be of 2-dimensionnal with a 2x2 formulation.")
    }

    if (x[1, 2] == 0) {
        x[1, 2] <- 0.5
    }
    if (x[2, 1] == 0) {
        x[2, 1] <- 0.5
    }

    out <- (x[1, 1] * x[2, 2])/(x[1, 2] * x[2, 1])
    return(out)
}

# Probability to odds
odds <- function(x) {
    (x/(1 - x))
}

##-------------------------------------------------------------------------
## External function that compares DAGs and computes many metrics A Comparison of Structural Distance Measures for Causal Bayesian Network Models
##-------------------------------------------------------------------------


compareDag <- function(ref, test, node.names = NULL, checkDAG = TRUE) {

    ## check ref dag
    if (checkDAG) {
        ref <- validate_abnDag(  ref, data.df=node.names, returnDAG=TRUE)
        test <- validate_abnDag( test, data.df=node.names, returnDAG=TRUE)
    }

    if (any(dim(ref) != dim(test))) {
        stop("The reference and test DAG have not the same size")
    }

    n <- dim(ref)[1]

    ## unit matrix
    ref[ref != 0] <- 1
    test[test != 0] <- 1

    diff.matrix <- ref - (0.5 * test)

    diff.matrix.tab <- table(factor(diff.matrix, levels = c(-0.5, 0, 0.5, 1)))

    if(sum(ref == 1)==0 | sum(test == 1)==0){
        warning("If the test or reference DAG is empty, some of the estimates are not defined.")
    }

    ## output
    out <- list()

    out[["TPR"]] <- (as.numeric(diff.matrix.tab[names(diff.matrix.tab) == 0.5]))/(sum(ref == 1))
    out[["FPR"]] <- (as.numeric(diff.matrix.tab[names(diff.matrix.tab) == -0.5]))/(sum(ref == 0))
    out[["Accuracy"]] <- (as.numeric(diff.matrix.tab[names(diff.matrix.tab) == 0.5]) + as.numeric(diff.matrix.tab[names(diff.matrix.tab) == 0]))/(dim(ref)[1]^2)
    out[["FDR"]] <- as.numeric(diff.matrix.tab[names(diff.matrix.tab) == 1])/(sum(test == 1))
    out[["G-measure"]] <- sqrt(as.numeric(diff.matrix.tab[names(diff.matrix.tab) == 0.5])/(sum(test == 1)) * (as.numeric(diff.matrix.tab[names(diff.matrix.tab) == 0.5]))/(sum(ref == 1)))
    out[["F1-score"]] <- (2/((1/as.numeric(diff.matrix.tab[names(diff.matrix.tab) == 0.5])/(sum(test == 1))) + (1/(as.numeric(diff.matrix.tab[names(diff.matrix.tab) == 0.5]))/(sum(ref == 1)))))
    out[["PPV"]] <- as.numeric(diff.matrix.tab[names(diff.matrix.tab) == 0.5])/(sum(test == 1))
    out[["FOR"]] <- as.numeric(diff.matrix.tab[names(diff.matrix.tab) == 1])/(sum(test == 1))

    # transforming "reverse arc" into -0.5
    for( i in 1:n){
        for( j in i:n){
            if( diff.matrix[i,j]!=0 & diff.matrix[j,i]!=0){
                diff.matrix[i,j] <- -(diff.matrix[i,j] + diff.matrix[j,i])
                diff.matrix[j,i] <- 0
            }
        }
    }
    diff.matrix.tab <- table(factor(diff.matrix, levels = c(-0.5, 0, 0.5, 1)))

    out[["Hamming-distance"]] <- sum(as.numeric(diff.matrix.tab[names(diff.matrix.tab) %in% c(-0.5, 1)]))

    return(out)
}

compareEG <- function(ref, test) compareDag(ref, test, checkDAG=FALSE)


##-------------------------------------------------------------------------
## External function that return information about a dag number of nodes number of arcs avergae size of the MB average size of the Neighborhood
##-------------------------------------------------------------------------

infoDag <- function(object, node.names = NULL) {

    if (inherits(object, "abnLearned")) {
      dag <- object$dag
    } else   if (inherits(object, "abnFit")) {
      dag <- object$abnDag$dag
    } else  if (is.matrix(object)) {
      dag <- abs(object)
      dag[dag != 0] <- 1
      #       diag(dag) <- 0  # RF: we want to test if proper dag!!
      dag <- check.valid.dag(dag = dag, is.ban.matrix = FALSE, group.var = NULL)
      ## naming
      if (is.null(colnames(dag))) {
          colnames(dag) <- rownames(dag) <- node.names
      }
    } else if (grepl("~", as.character(dag)[1], fixed = TRUE)) {
        dag <- formula.abn(f = dag, name = node.names)
        dag <- check.valid.dag(dag = dag, is.ban.matrix = FALSE, group.var = NULL)
    } else {
       stop("'object' must either be a matrix or a formula expression, or of class 'abnFit', 'abnLearned'.")
    }

    if (is.null(node.names)) {
        node.names <- colnames(dag)
    }

    ## ======================== test for conformity over! ========================

    out <- list()
    ## number of nodes
    n.nodes <- dim(dag)[1]

    ## number of arcs
    n.arcs <- sum(dag)

    ## =========================== average markov blanket size ===========================

    mb.size <- vector(mode = "numeric", length = length(node.names))

    for (i in 1:length(node.names)) {

        node <- node.names[i]
        # row children column parent

        ## Parent + Children
        mb.children <- list()
        mb.parent <- list()
        for (j in 1:length(dag[1, ])) {
            if (dag[node, j] != 0) {
                mb.children[j] <- names(dag[node, ])[j]
            }
            if (dag[j, node] != 0) {
                mb.parent[j] <- names(dag[, node])[j]
            }
        }
        # delete NULL element
        mb.children <- unlist(mb.children[!sapply(mb.children, is.null)])
        mb.parent <- unlist(mb.parent[!sapply(mb.parent, is.null)])

        ## Parent of children
        mb.parent.children <- list()
        for (node.children in mb.children) {
            for (k in 1:length(dag[1, ])) {
                if (dag[k, node.children] != 0) {
                  mb.parent.children[k] <- names(dag[, node.children])[k]
                }
            }
        }
        # delete NULL element
        mb.parent.children <- unlist(mb.parent.children[!sapply(mb.parent.children, is.null)])

        # add all list
        mb.node <- unlist(list(mb.children, mb.parent, mb.parent.children))

        # unique element
        mb.node <- unique(mb.node)

        # delete index node
        mb.node.wo <- NULL
        if (length(mb.node) != 0) {
            for (l in 1:length(mb.node)) {
                if (mb.node[c(l)] == node) {
                  mb.node.wo <- mb.node[-c(l)]
                }
            }
        }
        if (is.null(mb.node.wo)) {
            mb.node.wo <- mb.node
        }

        mb.size[i] <- length(mb.node.wo)
    }

    mb.average <- mean(mb.size)

    ## average Neighborhood

    nh.size <- vector(mode = "numeric", length = length(node.names))
    parent.size <- vector(mode = "numeric", length = length(node.names))
    children.size <- vector(mode = "numeric", length = length(node.names))

    for (i in 1:length(node.names)) {
        nh.size[i] <- sum(dag[i, ]) + sum(dag[, i])
        parent.size[i] <- sum(dag[i, ])
        children.size[i] <- sum(dag[, i])
    }
    nh.average <- mean(nh.size)
    parent.average <- mean(parent.size)
    children.average <- mean(children.size)

    ## output
    out[["n.nodes"]] <- n.nodes
    out[["n.arcs"]] <- n.arcs
    out[["mb.average"]] <- mb.average
    out[["nh.average"]] <- nh.average
    out[["parent.average"]] <- parent.average
    out[["children.average"]] <- children.average

    return(out)
}

##-------------------------------------------------------------------------
## External function that simulate a dag with arbitrary arcs density
##-------------------------------------------------------------------------

simulateDag <- function(node.name = NULL, data.dists = NULL, edge.density = 0.5) {

    ## test
    if (length(node.name) <= 2) {
        stop("No need for help to simulate a DAG of size 2")
    }
    if (edge.density > 1 | edge.density < 0) {
        stop("'edge.density' should be a real number in [0,1]")
    }

    if (is.null(data.dists)) {
        data.dists <- sample(x = c("gaussian", "binomial", "poisson"), size = length(node.name), replace = TRUE)
        names(data.dists) <- node.name
    }

    dag <- matrix(data = 0, nrow = length(node.name), ncol = length(node.name))


    for (j in 2:(length(node.name))) {
        dag[c(j:length(node.name)), (j - 1)] <- rbinom(n = (length(node.name) - j + 1), size = 1, prob = edge.density)
    }
    order.m <- sample(x = length(node.name), size = length(node.name), replace = FALSE)

    ## changing order
    dag <- dag[order.m, order.m]  #order.m

    ## naming
    colnames(dag) <- rownames(dag) <- node.name

    out <- createAbnDag(dag, data.dists=data.dists)
    ## structure
    return(out)
}

##-------------------------------------------------------------------------
## Function that computes skewness of a distribution
##-------------------------------------------------------------------------

skewness <- function(x) {
    n <- length(x)
    (sum((x - mean(x))^3)/n)/(sum((x - mean(x))^2)/n)^(3/2)
}

##-------------------------------------------------------------------------
## External function that computes essential graph of a dag Minimal PDAG: The only directed edges are those who participate in v-structure Completed PDAG: very directed edge corresponds to a
## compelled edge, and every undirected edge corresponds to a reversible edge
##-------------------------------------------------------------------------

essentialGraph <- function(dag, node.names = NULL, PDAG = "minimal") {

    ## dag transformation
    if (is.matrix(dag)) {
        # check.valid.dag(dag=dag,data.df=data.df,is.ban.matrix=FALSE,group.var=group.var) unit matrix
        dag[dag != 0] <- 1
        node.names <- colnames(dag)

    } else {
        if (grepl("~", as.character(dag)[1], fixed = TRUE)) {
            dag <- formula.abn(f = dag, name = node.names)
        } else {
            stop("Dag specification must either be a matrix or a formula expression")
        }
    }

    ## compute essential graph moral graph

    dim.dag <- dim(dag)[1]
    moral <- matrix(data = 0, nrow = dim.dag, ncol = dim.dag)

    for (i in 1:dim.dag) {
        for (j in 1:dim.dag) {
            if (dag[i, j] == 1) {
                moral[i, j] <- 1
                moral[j, i] <- 1
            }
        }
    }

    colnames(moral) <- rownames(moral) <- node.names

    ## essential arcs
    if (PDAG == "minimal") {
        for (i in 1:dim.dag) {
            if (sum(dag[i, ]) >= 2) {
                for (j in 1:dim.dag) {
                  if (dag[i, j] == 1) {
                    moral[j, i] <- 0
                  }
                }
            }
        }


        colnames(moral) <- rownames(moral) <- node.names
        return(moral)
    }

    if (PDAG == "completed") {
        for (i in 1:dim.dag) {
            if (sum(dag[i, ]) >= 2) {
                for (j in 1:dim.dag) {
                  if (dag[i, j] == 1) {
                    moral[j, i] <- 0
                  }
                  if (dag[j, i] == 1) {
                    moral[i, j] <- 0
                  }
                }
            }
        }

        colnames(moral) <- rownames(moral) <- node.names
        return(moral)
    }
}

##-------------------------------------------------------------------------
##Function that compute likelihood contribution of observation
##-------------------------------------------------------------------------

scoreContribution <- function(object = NULL,
                              dag = NULL,
                              data.df = NULL,
                              data.dists = NULL,
                              verbose = FALSE){

    ## method abnCache
    if (!is.null(object)){
        if (inherits(object, "abnLearned")) {
            dag <- object$dag
            data.df <- object$score.cache$data.df
            data.dists <- object$score.cache$data.dists
        }}

    ## transform factor into 0/1

    node.ordering <- names(data.dists)
    nobs <- dim(data.df)[1]

    ll <- matrix(data = 0,nrow = dim(data.df)[1],ncol = dim(data.df)[2])
    nb.param <- matrix(data = 0,nrow = dim(data.df)[1],ncol = dim(data.df)[2])
    nb.parents <- matrix(data = 0,nrow = dim(data.df)[1],ncol = dim(data.df)[2])
    colnames(ll) <- colnames(nb.param) <- colnames(nb.parents) <- node.ordering
    nb.parents <- rowSums(dag)/nobs
    for (node in node.ordering) {

        if(as.character(data.dists[node])=="binomial"){
            Y <- as.factor(data.matrix(data.df[, node]))
        } else {
            Y <- data.matrix(data.df[, node])
        }
        X <- data.matrix(cbind(rep(1,nobs),data.df[, as.logical(dag[node,])]))
        if(as.character(data.dists[node])=="gaussian"){
            nb.param[,node] <- (dim(X)[2] + 1)/nobs
        }else{
            nb.param[,node] <- dim(X)[2]/nobs
        }

        x <- glm(formula = Y ~ -1 + X,family = as.character(data.dists[node]))
        yhat <- predict.glm(x,newdata = x$data, type = "response")

        switch (as.character(data.dists[node]),
                "binomial" = {
                    ll[,node] <- dbinom(as.numeric(Y)-1, size = 1L, prob = yhat, log = TRUE)
                },
                "gaussian" = {
                    ll[,node] <- dnorm(Y, mean = yhat, sd = sigma(x),log = TRUE)
                },
                "poisson" = {
                    ll[,node] <- dpois(Y, lambda = yhat, log = TRUE)
                }
        )
        hv <- hatvalues(x)

    }

    aic <- - 2*ll + 2*nb.param
    bic <- - 2*ll + nb.param*(log(nobs))
    mdl <- bic + (1/nobs + nb.parents) * log(nobs)

    out <- list("mlik" = ll, "aic" = aic, "bic" = bic, "mdl" = mdl, "hatvalues" = hv)

    return(out)

}


##-------------------------------------------------------------------------
## Function to export abn coefficients to xls
##-------------------------------------------------------------------------



## EOF

Try the abn package in your browser

Any scripts or data that you put into this service are public.

abn documentation built on April 25, 2022, 9:06 a.m.