R/utils.R

#' Function to find monadic weights at one time point
#' @param df a data frame containing the unit ID variable and time variables as
#' well as location' and dependent variables.
#' @param id_var a character string identifying the unit ID variable in
#' \code{df}.
#' @param time_var a character string identifying the time variable in
#' \code{df}.
#' @param location_var a character string identifying the location of the units
#' in \code{df}. This is used to create the weighting matrix. Note that the
#' function finds the relative distance between the units by subtracting their
#' 'location'.
#' @param y_var a character string identifying the dependent variable in
#' \code{df}. Note that an independent variable could also be supplied.
#' @param type_cont whether the locations variables are continuous.
#' @param method the distance measure to be used. Only relevant for numeric
#' location variables. This must be one of
#' \code{"euclidean"}, \code{"maximum"}, \code{"manhattan"}, \code{"canberra"},
#' \code{"binary"} or \code{"minkowski"}. Any unambiguous substring can be
#' given. See \code{\link{dist}} for details.
#' @param row_standard logical. Whether or not to row-standardize the adjacency
#' matrix, i.e. dividing each neighbor weight for a feature by the sum of all
#' neighbor weights for that feature.
#' @param return_matrix logical. Whether or not to return the adjacency matrix.
#' Could be useful for debugging.
#' @param weight_name character string providing a custom weighting variable
#' name.
#' @param morans_i character specifying whether to print the p-value of
#' Moran's I Autocorrelation Index to the console (\code{message}), return only
#' a table of p-values (\code{table}), or \code{none}.
#' @param ... arguments to pass to methods.
#'
#' @source Neumayer, Eric, and Thomas Plumper. "Making spatial analysis
#' operational: commands for generating spatial effect variables in monadic and
#' dyadic data." Stata Journal 10.4 (2010): 585-605.
#'  \url{http://eprints.lse.ac.uk/30750/1/Making\%20spatial\%20analysis\%20operational(lsero).pdf}.
#'
#' @importFrom stats dist
#' @importFrom dplyr %>% full_join select
#' @importFrom igraph graph_from_data_frame as_adjacency_matrix
#' @importFrom ape Moran.I
#'
#' @noRd
weights_at_t <- function(df, id_var, location_var, y_var, type_cont,
                         time_var,
                         method = 'euclidean',
                         row_standard = FALSE,
                         return_matrix = FALSE,
                         weight_name,
                         morans_i = 'message', ...)
{
    freq <- NULL

    if (missing(weight_name)) weight_name <- sprintf('sp_wght_%s_%s',
                                                     location_var, y_var)

    if (missing(type_cont)) stop(
        'type_cont must be specified as TRUE or FALSE.', call. = FALSE)

    if (any(duplicated(df[, id_var]))) stop('Duplicate observations found',
                                            call. = FALSE)

    # Find w_{ikt}
    if (type_cont) {
        t_matrix <- as.matrix(dist(df[, location_var], method = method, ...))
    }
    else if (!type_cont) {
        df$temp <- 1
        joined <- full_join(df, df, by = 'temp')
        joined <- joined[, c(paste0(id_var, '.y'), paste0(id_var, '.x'),
                             paste0(location_var, '.y'),
                             paste0(location_var, '.x'))]
        joined$weighting[joined[, 3] == joined[, 4]] <- 1
        joined$weighting[joined[, 3] != joined[, 4]] <- 0

        joined <- joined[, c(paste0(id_var, '.x'),
                             paste0(id_var, '.y'), 'weighting')]
        joined <- joined[joined[, 1] != joined[, 2], ]
        grph <- graph_from_data_frame(joined, directed = FALSE, vertices = NULL)
        t_matrix <- as_adjacency_matrix(grph, attr = 'weighting',
                                        sparse = FALSE)
    }

    if (row_standard) t_matrix <- t_matrix / rowSums(t_matrix)

    if (return_matrix) return(t_matrix)
    else {
        # Find y_{kt}
        dependent_y <- df[, c(id_var, y_var)]

        if (morans_i != 'none') {
            # Find and print Moran's I
            if (type_cont) {
                message("Finding Moran's I for matrix inverse.")
                # See https://stats.idre.ucla.edu/r/faq/how-can-i-calculate-morans-i-in-r/
                t_matrix_inv <- 1/t_matrix
                diag(t_matrix_inv) <- 0
            }
            else if (!type_cont & !row_standard)
                t_matrix_inv <- t_matrix == 1
            else
                t_matrix_inv <- t_matrix

            mi <- Moran.I(dependent_y[, 2], t_matrix_inv)

            time_value <- unique(df[, time_var])
            mor_i <- round(mi$observed, digits = 3)
            m_p_value <- format.pval(mi$p.value, digits = 3)
        }

        if (morans_i == 'table') {
            out <- data.frame(morans_i = mor_i, morans_i_p_value = m_p_value)
            return(out)
        }

        else if (morans_i != 'table') {
            if (morans_i == 'message')
                message(sprintf("%s: Moran's I p-value: %s", time_value,
                                m_p_value))

            matrix_product <- t_matrix * dependent_y[, 2]
            out <- colSums(matrix_product) %>% as.data.frame
            out[, id_var] <- df[, id_var]
            names(out) <- c(weight_name, id_var)

            if (!isTRUE(type_cont)) {
                # Find group averages
                counts <- table(df[, location_var]) %>% data.frame
                names(counts) <- c(location_var, 'freq')
                id_located <- merge(df, counts, by = location_var)
                id_located <- id_located[, c(id_var, location_var, 'freq')]
                out <- merge(out, id_located, by = id_var)
                out$freq <- out$freq - 1
                out$freq[out$freq < 1] <- 0
                out[, weight_name] <- out[, weight_name] / out$freq
                out <- out %>% select(-freq)
            }
            return(out)
        }
    }
}



#' Drop rows from a data frame with missing values on a given variable(s).
#' @source From the DataCombine package
#' @noRd

DropNA <- function(data, Var)
{
    # Find term number
    DataNames <- names(data)
    if (missing(Var)) {
        if (isTRUE(message)) {
            message('No Var specified. Dropping all NAs from the data frame.\n')
        }
        Var <- names(data)
    }
    TestExist <- Var %in% DataNames
    if (!all(TestExist)){
        stop("Variable(s) not found in the data frame.", call. = FALSE)
    }

    # Drop if NA
    if (length(Var) == 1){
        DataNoNA <- data[!is.na(data[, Var]), ]

        DataVar <- data[, Var]
        DataNA <- DataVar[is.na(DataVar)]
        TotalDropped <- length(DataNA)
    }
    else{
        RowNA <- apply(data[, Var], 1, function(x){any(is.na(x))})
        DataNoNA <- data[!RowNA, ]

        TotalDropped <- sum(RowNA)
    }

    if (TotalDropped > 0)
        message(paste(TotalDropped,
            "rows dropped from the data frame because of missing values.\n"))

    return(DataNoNA)
}

#' Lag a time-series cross-sectional variable
#'
#' @inheritParams monadic_spatial_weights
#'
#' @importFrom dplyr %>% arrange group_by mutate lag


lagger <- function(df, id_var, time_var, weight_name) {
    original_names <- c(id_var, time_var, weight_name)
    col_positions <- sapply(original_names, function(x) grep(x, names(df)))
    names(df)[col_positions] <- c('id_var', 'time_var', 'weight_name')

    df <- df %>% arrange(id_var, time_var) %>% group_by(id_var) %>%
            mutate(lag_wy = dplyr::lag(weight_name, order_by = id_var))

    names(df)[col_positions] <- original_names
    names(df)[ncol(df)] <- sprintf('lag_%s', weight_name)
    return(df)
}
christophergandrud/spatialWeights documentation built on May 13, 2019, 7:04 p.m.