lgde: Implements the Locally Gaussian Density Estimator

Documented in condLocal multiLocal multiLocal.knn

#########################################
##                                     ##
##   MULTIVARIATE DENSITY ESTIMATION   ##
##     GLOBAL AND LOCAL BANDWIDTHS     ##
##        CONDITIONAL DENSITIES        ##
##                                     ##
#########################################

#' Estimate a multivariate density function using the LGDE.
#' @param data The data matrix, one row per observation.
#' @param bandwidths A matrix of bandwidths. Must be in the same format as is
#' produced by the HLocal-function.
#' @param gsize If grid is not provided, the density will be estimated at
#' gsize points going diagonally through the data.
#' @param grid The grid at which to estimate the density. Must be a matrix with
#' the same number of columns as the data matrix.
#' @return A list with 6 elements:
#'     $f.est are the density estimates at the grid points,
#'     $d is the dimension,
#'     $loc.cor are the local correlations,
#'     $h is the matrix of bandwidths that was used,
#'     $pairs is atwo-column matrix indicating all pairs of variables,
#'     $grid is the grid at which the density has been estimated.
#' @examples
#' data <- cbind(rt(100, df = 10), rt(100, df = 10))
#' multiLocal(data)

multiLocal <- function(data,
                       bandwidths = NULL,
                       gsize = 15,
                       grid = apply(data, 2, function(x) seq(quantile(x,.001),quantile(x,.999), length.out=gsize))) {
    
    # Sample size and number of variables
    n <- dim(data)[1]
    d <- dim(data)[2]

    # Check that the dimensions agree
    if(d != dim(grid)[2]) stop("Grid and data dimensions do not match!")

    # Transform the data and grid to standard normality
    transformed <- transLocal(data = data,
                              grid = grid,
                              return.normalizing.constants = TRUE)

    transformed.data <- transformed$transformed.data
    z.grid <- transformed$transformed.grid
    normalizing.constants <- transformed$normalizing.constants

    # Calculate the bandwidths if they have not been provided by the user
    if(!is.null(bandwidths)) {
        h <- bandwidths
    } else {
        h <- HLocal(transformed.data)
    }
            
    pairs <- h[,1:2]
    if(length(pairs) == 2) {pairs <- t(as.matrix(pairs))} 
    if(length(pairs) == 2) {n.pairs <- 1} else {n.pairs <- dim(pairs)[1]}

    
    # Function to calculate local correlation
    joint.densities = function(pair.number) {
        x <- transformed.data[,pairs[pair.number,]]
        bw <- h[pair.number, c('h1', 'h2')]; 

        rho <- biLocal(data = x,
                       grid = z.grid[,h[pair.number,1:2]],
                       h = bw)$par.est                                     
    }

    # Estmate the local correlations
    joints <- lapply(X = as.list(1:n.pairs), FUN = joint.densities)

    # Calculate the density estimates
    f.estimate = function(parameters) {
          x <- parameters[1:d]
          rho.vec <- parameters[(d+1):length(parameters)]
          sigma <- diag(d)
          for(j in 1:n.pairs) {
              var1 <- pairs[j,1]
              var2 <- pairs[j,2]
              rho <- rho.vec[j]
              sigma[var1, var2] <- rho
              sigma[var2, var1] <- sigma[var1, var2]
          }
         mvtnorm::dmvnorm(as.numeric(x), mean = rep(0, d), sigma = sigma)
     }
            
    if(length(grid[,1]) == 1) {
        parameter.matrix <- t(c(do.call(cbind, z.grid),
                                do.call(cbind, joints))) 
    } else {
        parameter.matrix <- cbind(z.grid, do.call(cbind, joints))
    }

    parameter.matrix <- split(parameter.matrix, row(parameter.matrix))
            
    f.est <- do.call(rbind, lapply(X = parameter.matrix, FUN = f.estimate))*apply(normalizing.constants, 1, prod)
            
    # Return results
    ret <- list(loc.cor = joints, f.est = f.est, h = h, dimension = d, pairs = pairs, grid = grid, z.grid = z.grid, data = data, transformed.data = transformed.data)
    return(ret)
        
}

#' Estimate a multivariate density function using the LGDE and kNN bandwidths.
#' @param data The data matrix, one row per observation.
#' @param k.mat A matrix of k. Must be in the same format as is
#' produced by the kLocal-function.
#' @param gsize If grid is not provided, the density will be estimated at
#' gsize points going diagonally through the data.
#' @param grid The grid at which to estimate the density. Must be a matrix with
#' the same number of columns as the data matrix.
#' @return A list with 6 elements:
#'     $f.est are the density estimates at the grid points,
#'     $d is the dimension,
#'     $loc.cor are the local correlations,
#'     $k.mat is the matrix of k's that was used,
#'     $pairs is a two-column matrix indicating all pairs of variables,
#'     $grid is the grid at which the density has been estimated,
#'     $test is a vector of k's to test for cross-validation.
#' @examples
#' data <- cbind(rt(100, df = 10), rt(100, df = 10))
#' multiLocal.knn(data)

multiLocal.knn <- function(data,
                           k.mat = NULL,
                           gsize = 15,
                           grid = apply(data, 2, function(x)
                               seq(quantile(x,.001),quantile(x,.999), length.out=gsize)),
                           test = seq(20, 100, by = 3)) {
    
    # Sample size and number of variables
    n <- dim(data)[1]
    d <- dim(data)[2]

    # Transform the data and grid to standard normality
    transformed <- transLocal(data = data,
                              grid = grid,
                              return.normalizing.constants = TRUE)

    transformed.data <- transformed$transformed.data
    z.grid <- transformed$transformed.grid
    normalizing.constants <- transformed$normalizing.constants

    # Calculate the bandwidths if they have not been provided by the user
    if(!is.null(k.mat)) {
        h <- k.mat
    } else {
        h <- kLocal(transformed.data, test = test)
    }
            
    pairs <- h[,1:2]
    if(length(pairs) == 2) {pairs <- t(as.matrix(pairs))} 
    if(length(pairs) == 2) {n.pairs <- 1} else {n.pairs <- dim(pairs)[1]}

    
    # Function to calculate local correlation
    joint.densities = function(pair.number) {
        x <- transformed.data[,pairs[pair.number,]]
        k <- h[pair.number, c('k')];
        bw <- knnBi(x,
                    z.grid[,pairs[pair.number,]],
                    k)

        rho <- biLocal.knn(data = x,
                           grid = z.grid[,h[pair.number,1:2]],
                           h = bw)$par.est                                     
    }

    # Estmate the local correlations
    joints <- lapply(X = as.list(1:n.pairs), FUN = joint.densities)

    # Calculate the density estimates
    f.estimate = function(parameters) {
          x <- parameters[1:d]
          rho.vec <- parameters[(d+1):length(parameters)]
          sigma <- diag(d)
          for(j in 1:n.pairs) {
              var1 <- pairs[j,1]
              var2 <- pairs[j,2]
              rho <- rho.vec[j]
              sigma[var1, var2] <- rho
              sigma[var2, var1] <- sigma[var1, var2]
          }
         mvtnorm::dmvnorm(as.numeric(x), mean = rep(0, d), sigma = sigma)
     }
            
    if(length(grid[,1]) == 1) {
        parameter.matrix <- t(c(do.call(cbind, z.grid),
                                do.call(cbind, joints))) 
    } else {
        parameter.matrix <- cbind(z.grid, do.call(cbind, joints))
    }

    parameter.matrix <- split(parameter.matrix, row(parameter.matrix))
            
    f.est <- do.call(rbind, lapply(X = parameter.matrix, FUN = f.estimate))*
                        apply(normalizing.constants, 1, prod)
            
    # Return results
    ret <- list(joints = joints,
                f.est = f.est,
                h = h,
                dimension = d,
                pairs = pairs,
                grid = grid,
                k.mat = h)
    return(ret)
        
}

#' Estimate conditional densities using the LGDE
#' @param data The data matrix, one row per observation.
#' @param cond A vector of values to be conditioned on.
#' @param bandwidths A matrix of bandwidths. Must be in the same format as is
#' produced by the HLocal-function.
#' @param gsize If grid is not provided, the density will be estimated at
#' gsize points going diagonally through the data.
#' @param grid The grid at which to estimate the density. Must be a matrix with
#' the same number of columns as the data matrix.
#' @return A list with 1 element:
#'     $f.est.cond is the estimated conditional density.
#' @examples
#' # Three dimensional example. The conditioning variables must be the last columns in the data matrix.
#' Estimating the density of X1|X2 = 0, X3 = 0:
#' data <- cbind(rt(100, df = 10), rt(100, df = 10), rt(100, df = 10))
#' condLocal(data, cond = c(0,0))

condLocal <- function(data,
                      cond,
                      bandwidths = NULL,
                      gsize = 15,
                      grid = apply(as.matrix(data[, 1:(dim(data)[2] - length(cond))]),
                                   2, function(x) seq(quantile(x,.001),quantile(x,.999),
                                                      length.out = gsize))) {

    # Check if the dimensions of the grid, data and conditioning vector match
    if(dim(data)[2] != (dim(grid)[2] + length(cond))) {
        stop('The data matrix, grid matrix and conditioning vector do not match.')
    }

    # Sample size and number of variables
    n <- dim(data)[1]
    d <- dim(data)[2]

    # The grid where we must estimate the joint density. On the x-scale
    x.grid <- cbind(grid, matrix(rep(cond, dim(grid)[1]), ncol = length(cond), byrow = TRUE))

    # Transform the data and grid to standard normality
    transformed <- transLocal(data = data,
                              grid = x.grid,
                              return.normalizing.constants = TRUE)
    transformed.data <- transformed$transformed.data
    z.grid <- transformed$transformed.grid
    normalizing.constants <- transformed$normalizing.constants

    # Calculate the bandwidths if they have not been provided by the user
    if(!is.null(bandwidths)) {
        h <- bandwidths
    } else {
        h <- HLocal(transformed.data)
    }
            
    pairs <- h[,1:2]
    if(length(pairs) == 2) {pairs <- t(as.matrix(pairs))} 
    if(length(pairs) == 2) {n.pairs <- 1} else {n.pairs <- dim(pairs)[1]}

    # Function to calculate local correlation
    joint.densities = function(pair.number) {
        x <- transformed.data[,pairs[pair.number,]]
        bw <- h[pair.number, c('h1', 'h2')]; 

        rho <- biLocal(data = x,
                       grid = z.grid[,h[pair.number,1:2]],
                       h = bw)$par.est                                     
    }

    # Estmate the local correlations
    joints <- lapply(X = as.list(1:n.pairs), FUN = joint.densities)

    # Collect the grid and the local correlations        
    if(length(grid[,1]) == 1) {
        parameter.matrix <- t(c(do.call(cbind, z.grid[,1:(d-length(cond))]),
                                do.call(cbind, joints))) 
    } else {
        parameter.matrix <- cbind(z.grid[,1:(d-length(cond))], do.call(cbind, joints))
    }

    # The first elements are the z-grid point, the rest are the local correlations there 
    parameter.matrix <- split(parameter.matrix, row(parameter.matrix))
            
    
    # Calculate the *conditional* density estimates
    f.estimate = function(parameters) {
         x <- parameters[1:(d-length(cond))]
         rho.vec <- parameters[(d-length(cond)+1):length(parameters)]
         sigma <- diag(d)
         for(j in 1:n.pairs) {
              var1 <- pairs[j,1]
              var2 <- pairs[j,2]
              rho <- rho.vec[j]
              sigma[var1, var2] <- rho
              sigma[var2, var1] <- sigma[var1, var2]
         }
         # Partition sigma
         S11 <- as.matrix(sigma[1:(d - length(cond)), 1:(d-length(cond))])
         S22 <- as.matrix(sigma[(d - length(cond) +1):d, (d-length(cond) + 1):d])
         S12 <- as.matrix(sigma[1:(d - length(cond)),  (d-length(cond) + 1):d])
         dim(S12) <- c(d - length(cond), length(cond))
         S21 <- as.matrix(sigma[(d-length(cond) + 1):d, 1:(d - length(cond))])
         dim(S21) <- rev(dim(S12))
         c.mean = S12%*%solve(S22)%*%as.matrix(z.grid[1,(d-length(cond)+1):d])
         c.sig = S11 - S12%*%solve(S22)%*%S21
         list( c.mean, c.sig, 
             mvtnorm::dmvnorm(as.numeric(x),
                              mean = c.mean,
                              sigma = c.sig)
             )
     }

    f.est.cond <- do.call(rbind,
                     lapply(lapply(X = parameter.matrix, FUN = f.estimate), "[[", 3))*
                         apply(as.matrix(normalizing.constants[,1:(d-length(cond))]), 1, prod)

    ret <- list(loc.cor = joints,
                pairs = pairs,
                x.grid = x.grid,
                z.grid = z.grid,
                c.mean = lapply(lapply(X = parameter.matrix, FUN = f.estimate), "[[", 1),
                c.sig = lapply(lapply(X = parameter.matrix, FUN = f.estimate), "[[", 2),
                f.est.cond = f.est.cond)
    return(ret)
}