#' A function to ensure data is in the correct dataframe format
#' @param eha_data A dataframe that includes one observation for each node at risk of experiencing the event during each at-risk time point in each cascade. Note: it is assumed that each node can experience the event at most once per cascade.
#' @return A data.frame if conversion is successful, otherwise an error with an informative message
isEHA <- function(eha_data){
tryCatch(
{
as.data.frame(eha_data)
},
error = function(cond){
stop("Event history data cannot be coerced to a data.frame: ", cond$message)
}
)
}
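# Illustrative sketch of the coercion contract (kept as comments so nothing
# runs on load):
# isEHA(matrix(0, 2, 2))  # coerces cleanly, returning a 2 x 2 data.frame
# isEHA(sum)              # a function cannot be coerced, so the informative
#                         # error above is thrown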
#' A function to create data for NEHA with discrete-time EHA data
#' @param eha_data A dataframe that includes one observation for each node at risk of experiencing the event during each at-risk time point in each cascade. Note: it is assumed that each node can experience the event at most once per cascade.
#' @param node A character string name of the variable that gives the node id. Node ids should be character type variables.
#' @param time A character string name of the variable that gives the time, in integers.
#' @param event A character string name of the variable that gives the binary 0/1 indicator of event occurrence.
#' @param cascade A character string name of the variable that gives the cascade id
#' @return A data frame in which, in addition to all of the variables in 'eha_data', there is one column for each directed dyad, named 'i_j', where 'i' and 'j' are node ids, in which the value indicates the number of time points before 'time' that 'i' experienced the event in the respective 'cascade'. If 'i' did not experience the cascade before 'time', the value is 0.
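#' @examples
#' \dontrun{
#' # A minimal illustrative cascade (hypothetical data): node "b" experiences
#' # the event one period after node "a", so the output gains a column "a_b"
#' # recording the elapsed time since a's event
#' eha <- data.frame(
#'   node = c("a", "b", "b"),
#'   time = c(1, 1, 2),
#'   event = c(1, 0, 1),
#'   cascade = 1,
#'   stringsAsFactors = FALSE
#' )
#' data_neha_discrete(eha, "node", "time", "event", "cascade")
#' }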
#' @export
data_neha_discrete <- function(eha_data, node, time, event, cascade){
# Ensure EHA data is in a data.frame object
eha_data <- isEHA(eha_data)
# find out if node ids look numeric
n1c <- substr(as.character(eha_data[,node]),1,1)
if(!all(is.element(tolower(n1c),letters))){
message("appending 'n_' to the beginning of all node ids since at least one seems numeric")
eha_data[,node] <- paste("n_",eha_data[,node],sep="")
}
# extract node ids
node <- eha_data[,node]
unique.nodes <- unique(node)
# make set of possible diffusion relationships
node.pairs <- combinat::combn(unique.nodes,2)
node.pairs <- sort(c(paste(node.pairs[1,],node.pairs[2,],sep="_"),paste(node.pairs[2,],node.pairs[1,],sep="_")))
# create the matrix in which to hold the diffusion tie variables
diff_var_mat <- matrix(0,length(node),length(node.pairs))
# define column labels of diffusion tie variables
colnames(diff_var_mat) <- node.pairs
# add indicator variables to the diffusion tie variables matrix
for(i in 1:nrow(eha_data)){
# find the adopting node
node_i <- node[i]
# find the cascade id
cascade_i <- eha_data[i,cascade]
# find the time
time_i <- eha_data[i,time]
# find previous events by other nodes in the same cascade
previous.idx <- which((eha_data[,time] < time_i) & (eha_data[,cascade] == cascade_i) & (eha_data[,event] == 1) & (node != node_i))
previous.events <- node[previous.idx]
# record the time elapsed since each previous event
if(length(previous.events) > 0){
times.since.events <- time_i - eha_data[previous.idx,time]
col.ids <- paste(previous.events,node_i,sep="_")
diff_var_mat[i,col.ids] <- times.since.events
}
}
# drop edge variables that are never active
edge_sum <- colSums(diff_var_mat)
diff_var_mat <- diff_var_mat[,which(edge_sum > 0),drop=FALSE]
# drop edges with exactly 0 correlation with the event indicator
cor_e_x <- cor(cbind(eha_data[,event],diff_var_mat))[1,-1]
diff_var_mat <- diff_var_mat[,which(cor_e_x != 0),drop=FALSE]
data.frame(eha_data,diff_var_mat,stringsAsFactors=FALSE)
}
#' A function to simulate a single cascade from the discrete-time logistic specification NEHA data generating process.
#' @param x A data frame of covariates to be used in simulating cascades, with (number of nodes) x (number of observed time points) rows and (number of covariates + 2) columns (the covariates plus the node and time variables). Note: each node must be at risk for the event until the maximum time. This function does not currently permit dyadic covariates that are themselves a function of past events (e.g., the number of geographically adjacent nodes that have experienced an event).
#' @param node A character string name of the variable in x that gives the node id
#' @param time A character string name of the variable in x that gives the time, in integers
#' @param beta A (ncol(x)-2)x1 matrix of regression coefficients with row names that match column names in x
#' @param gamma A (number of diffusion ties)x1 matrix with row names of the form "sending-node_receiving-node", such that sending and receiving node ids match elements of x[,node]. Elements should be non-negative numeric values.
#' @param a A numeric value that sets the exponential decay rate of sender influence on the log scale. The effect of a previous event experienced by a diffusion tie sender on the log odds of an event at time t is gamma*exp(-exp(a)*(t-source_time)), where source_time is the time the sender experienced the event.
#' @return A data frame in discrete-time EHA format for a single cascade, in which there is at most one event occurrence for each node, indicated by the 'event' variable.
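#' @examples
#' \dontrun{
#' # A minimal sketch (hypothetical inputs): 5 nodes observed for 10 periods,
#' # an intercept-only baseline hazard, and a single diffusion tie n1 -> n2
#' x <- data.frame(
#'   time = sort(rep(1:10, 5)),
#'   node = paste0("n", rep(1:5, 10)),
#'   intercept = 1,
#'   stringsAsFactors = FALSE
#' )
#' beta <- cbind(-3)
#' rownames(beta) <- "intercept"
#' gamma <- cbind(2)
#' rownames(gamma) <- "n1_n2"
#' simulate_neha_discrete(x, "node", "time", beta, gamma, a = -6)
#' }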
#' @export
simulate_neha_discrete <- function(x,node,time,beta,gamma,a=-8){
times <- sort(unique(x[,time]))
nodes <- sort(unique(x[,node]))
x_linear_predictor <- as.matrix(x[,row.names(beta)])%*%beta
event_times <- rep(NA,length(nodes))
sender_receiver <- do.call('rbind',strsplit(rownames(gamma),"_"))
source <- sender_receiver[,1]
follower <- sender_receiver[,2]
event <- rep(NA,nrow(x))
for(t in times){
nodes_at_risk <- nodes[is.na(event_times)]
for(n in nodes_at_risk){
xlp_nt <- x_linear_predictor[which((x[,time]==t) & (x[,node]==n))]
n_sources <- source[which(follower==n)]
source_effects <- gamma[which(follower==n),1]
source_times <- event_times[match(n_sources,nodes)]
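# each sender's influence decays exponentially in the time elapsed since its
# event: gamma * exp(-exp(a) * (t - source_time))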
time_effects <- source_effects*exp(-exp(a)*(t-source_times))
time_effects <- time_effects[which(!is.na(time_effects))]
linear_predictor <- xlp_nt
if(length(time_effects)>0){
linear_predictor <- linear_predictor + sum(time_effects)
}
pr_nt <- 1/(1+exp(-linear_predictor))
event_nt <- 1*(pr_nt > runif(1))
event[which((x[,time]==t) & (x[,node]==n))] <- 0
if(event_nt==1){
event_times[which(nodes==n)] <- t
event[which((x[,time]==t) & (x[,node]==n))] <- 1
}
}
}
x <- data.frame(x,event,stringsAsFactors=FALSE)
na.omit(x)
}
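# Internal helper: choose an initial decay parameter 'a' so that sender
# influence at the longest observed event lag is 90% of its value at the
# shortest observed lag, then replace each positive lag in 'edges' with the
# decayed weight exp(-exp(a)*lag).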
estimate_a <- function(dv,edges){
one_times <- c(unlist(edges[which(dv==1),]))
one_times <- one_times[which(one_times>0)]
min_time <- min(one_times)
max_time <- max(one_times)
# choose a so that influence at the longest lag is 90% of that at the shortest:
# exp(-exp(a)*max_time)/exp(-exp(a)*min_time) = 0.9
# -exp(a)*max_time + exp(a)*min_time = log(0.9)
# exp(a) = log(0.9)/(min_time - max_time)
a <- log(log(0.9)/(ifelse(min_time==max_time,-.5,min_time-max_time)))
for(i in 1:ncol(edges)){
edges[,i] <- (edges[,i] > 0 )*exp(-exp(a)*edges[,i])
}
list(a,edges)
}
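#' Internal helper: refine the decay parameter by a small multiplicative grid
#' search over candidate adjustments, scoring each by model log-likelihood,
#' and rescale the edge variables accordingly.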
#' @import boot foreach doParallel parallel doRNG
update_a <- function(data_with_aest,covariates,edges_subset,event,old_a,edge_vars,ncore=2){
multa <- c(1.1,1.05,0.95,0.9)
cvglm_a <- function(a,data_with_aest,covariates,edges_subset,event){
hol <- function(y,prob){
sum(log(prob^y*(1-prob)^(1-y)))
}
effect_names <- c(covariates,edges_subset)
data_with_aest[,edges_subset] <- data_with_aest[,edges_subset]^a
form <- as.formula(paste(event,"~",paste(effect_names,collapse="+"),sep=""))
glm.est <- glm(form,family=binomial,data=data_with_aest)
set.seed(9202011)
# score candidates by in-sample log-likelihood; a cross-validation
# alternative using 'hol' is retained below, commented out
# cv.glm(data_with_aest,glm.est,hol,10)$delta[2]
logLik(glm.est)
}
cl <- makeCluster(ncore)
registerDoParallel(cl)
a <- NULL
cv_scores <- foreach(a = multa,.packages=c("boot"),
.options.RNG=9202011) %dorng% {
cvglm_a(a,data_with_aest,covariates,edges_subset,event)
}
stopCluster(cl)
cv_scores <- c(unlist(cv_scores))
besta <- multa[which.max(cv_scores)]
data_with_aest[, edge_vars] <- data_with_aest[, edge_vars]^besta
# the edge variables were raised to the power 'besta', which multiplies the
# decay rate exp(a) by 'besta'; update 'a' accordingly on the log scale
new_a <- old_a + log(besta)
list(new_a,data_with_aest)
}
#' A function to estimate NEHA parameters using greedy edge addition
#' @import foreach doParallel parallel stats
#' @param eha_data A dataframe that includes one observation for each node at risk of experiencing the event during each at-risk time point in each cascade. Note: it is assumed that each node can experience the event at most once per cascade.
#' @param node A character string name of the variable that gives the node id
#' @param time A character string name of the variable that gives the time, in integers
#' @param event A character string name of the variable that gives the binary 0/1 indicator of event occurrence.
#' @param cascade A character string name of the variable that gives the cascade id
#' @param ncore integer indicating the number of cores to use in parallel computation (currently unused by this function).
#' @return A list with two elements: 'a_est', the initial estimate of the decay parameter, and 'data_for_neha', the data frame with edge variables transformed into decayed influence weights.
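#' @examples
#' \dontrun{
#' # Sketch with hypothetical event history data 'eha' in the format described
#' # above; returns the initial decay estimate and the decayed edge variables
#' prep <- neha_geta(eha, node = "node", time = "time",
#'                   event = "event", cascade = "cascade")
#' prep$a_est
#' }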
#' @export
neha_geta <- function(eha_data, node, time, event, cascade, ncore=2){
data_for_neha <- data_neha_discrete(eha_data, node, time, event, cascade)
edge_vars <- names(data_for_neha)[which(!is.element(names(data_for_neha),names(eha_data)))]
a.estimate <- estimate_a(data_for_neha[,event],data_for_neha[,edge_vars])
print("a estimation complete")
data_for_neha[,edge_vars] <- a.estimate[[2]]
# results
list(a_est = a.estimate[[1]], data_for_neha = data_for_neha)
}
#' A function to select edges and prepare data for NEHA estimation
#' @import doParallel foreach glmulti parallel
#' @param eha_data A dataframe that includes one observation for each node at risk of experiencing the event during each at-risk time point in each cascade. Note: it is assumed that each node can experience the event at most once per cascade.
#' @param node A character string name of the variable that gives the node id
#' @param time A character string name of the variable that gives the time, in integers
#' @param event A character string name of the variable that gives the binary 0/1 indicator of event occurrence.
#' @param cascade A character string name of the variable that gives the cascade id
#' @param covariates character vector of covariate names to include in the neha, excluding the intercept.
#' @param ncore an integer giving the number of cores to use in parallel computation.
#' @param negative logical, experimental; indicates whether to go through a phase of negative tie inference.
#' @return A list with five elements.
#' \itemize{
#' \item a_est - The estimated value of alpha, the edge effect decay parameter.
#' \item edges - A character vector giving the names of the edges inferred 'sender_receiver'.
#' \item data_for_neha - A dataframe that can be used to find the NEHA estimates using a function for logistic regression.
#' \item combined_formula - NEHA formula to use if you want a single gamma estimate.
#' \item separate_formula - NEHA formula to use if you want a separate gamma estimate for each edge.
#' }
#' @examples
#' library(neha)
#' \dontrun{
#' # Simulate data for NEHA
#' # basic data parameters
#' cascades <- 50
#' nodes <- 20
#' times <- 30
#' nties <- 25
#' # generate dataframe
#' time <- sort(rep(1:times,nodes))
#' node <- paste("n",as.character(rep(1:nodes,times)),sep="")
#' intercept <- rep(1,length(time))
#' covariate <- runif(length(time))-2
#' data_for_sim <- data.frame(
#' time, node, intercept, covariate, stringsAsFactors = FALSE
#' )
#'
#' # regression parameters
#' beta <- cbind(c(-2.5,.25))
#' rownames(beta) <- c("intercept","covariate")
#'
#' # generate network effects
#' possible_ties <- rbind(t(combn(1:nodes,2)),t(combn(1:nodes,2))[,c(2,1)])
#' possible_ties <- paste(
#' paste("n",possible_ties[,1],sep=""),
#' paste("n",possible_ties[,2],sep=""),
#' sep="_"
#' )
#' ties <- sample(possible_ties,nties)
#' gamma <- cbind(rep(1.5,length(ties)))
#' rownames(gamma) <- ties
#'
#' # initiate simulated data object
#' simulated_data <- NULL
#'
#' # generate the data one cascade at a time
#' for(c in 1:cascades) {
#' simulated_cascade <-
#' simulate_neha_discrete(
#' x = data_for_sim,
#' node = "node",
#' time = "time",
#' beta = beta,
#' gamma = gamma,
#' a = -6
#' )
#' simulated_cascade <-
#' data.frame(simulated_cascade,
#' cascade = c,
#' stringsAsFactors = F)
#' simulated_data <- rbind(simulated_data, simulated_cascade)
#' }
#'
#' # infer edges
#' neha_results <-
#' neha(
#' simulated_data,
#' node = "node",
#' time = "time",
#' event = "event",
#' cascade = "cascade",
#' covariates = "covariate",
#' ncore = 3
#' )
#'
#' # estimate NEHA logistic regression
#' neha_estimate <-
#' glm(neha_results$combined_formula,
#' data = neha_results$data_for_neha,
#' family = binomial)
#' summary(neha_estimate)
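#'
#' # or estimate a separate gamma for each inferred edge
#' neha_estimate_sep <-
#'   glm(neha_results$separate_formula,
#'       data = neha_results$data_for_neha,
#'       family = binomial)
#' summary(neha_estimate_sep)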
#' }
#'
#'
#' @export
neha <- function(eha_data,node,time,event,cascade,covariates,
ncore=2, negative=F){
# Ensure EHA data is in a data.frame object
eha_data <- isEHA(eha_data)
data_with_aest <- neha_geta(eha_data, node = node, time = time,
event = event, cascade = cascade, ncore = ncore)
a_est <- data_with_aest[[1]]
data_with_aest <- data_with_aest[[2]]
edge_vars <- names(data_with_aest)[which(!is.element(names(data_with_aest),
names(eha_data)))]
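# screen candidate edges: z-score each edge's correlation with its receiver's
# event history against the correlations of all edge variables evaluated on
# that receiver's rows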
within_r_corr <- NULL
for (i in 1:length(edge_vars)) {
ri <- strsplit(edge_vars[i], "_")[[1]][2]
xi <- data_with_aest[which(data_with_aest[, node] ==
ri), edge_vars[i]]
yi <- data_with_aest[which(data_with_aest[, node] ==
ri), event]
cor_ye <- suppressWarnings(cor(yi, data_with_aest[which(data_with_aest[,
node] == ri), edge_vars]))
cor_ye <- cor_ye[which(!is.na(cor_ye))]
within_r_corr <- c(within_r_corr, (suppressWarnings(cor(yi,
xi)) - mean(cor_ye))/sd(cor_ye))
}
names(within_r_corr) <- edge_vars
unodes <- unique(data_with_aest[, node])
edges_subset <- NULL
converged <- F
iteration <- 1
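# alternate between re-tuning the decay parameter (given the currently
# selected edges) and greedy per-receiver edge selection, until the selected
# edge set stabilizes across iterations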
while (!converged) {
old_edges <- edges_subset
effect_names <- c(covariates, edges_subset)
if (!is.null(edges_subset)) {
new_a <- update_a(data_with_aest = data_with_aest,
covariates = covariates, edges_subset = edges_subset,
event = event, old_a = a_est, edge_vars = edge_vars,
ncore = ncore)
a_est <- new_a[[1]]
data_with_aest <- new_a[[2]]
}
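# hold covariate effects fixed during the edge search by supplying them as a
# glm offset estimated from the pooled model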
off <- rep(0, nrow(data_with_aest))
if (length(covariates) > 0) {
full_estimate <- glm(data_with_aest[, event] ~ as.matrix(data_with_aest[,
effect_names]), family = binomial)
off_coef <- coef(full_estimate)
off_coef[is.na(off_coef)] <- 0
off <- as.matrix(data_with_aest[, covariates]) %*%
(off_coef[2:(length(covariates) +
1)])
}
data_with_aest$off <- off
cl <- makeCluster(ncore)
registerDoParallel(cl)
reciever <- NULL
edges_subset <- foreach(reciever = unodes, .packages = c("glmulti")) %dopar%
{
edger <- do.call("rbind", strsplit(names(within_r_corr),
"_"))[, 2]
corr_r <- within_r_corr[which(edger == reciever)]
if (length(corr_r) > 10)
corr_r <- corr_r[order(-corr_r)[1:10]]
screenedr <- names(corr_r)
if (length(screenedr) > 0) {
Xyr <- data_with_aest[, c(screenedr, event)]
names(Xyr)[ncol(Xyr)] <- "y"
offr <- off[which(data_with_aest[, node] ==
reciever)]
sdxy <- apply(Xyr, 2, sd)
Xyr <- Xyr[, which(sdxy > 0)]
increased <- T
est0 <- glm(Xyr[, "y"] ~ 1, family = binomial)
BICfit <- BIC(est0)
vars <- 1
res.best.logistic <- NULL
while (increased) {
res.best.logistic.v <- glmulti(y = "y", xr = names(Xyr)[-ncol(Xyr)],
data = Xyr, level = 1, method = "h", crit = "bic",
confsetsize = 1, plotty = F, report = F,
fitfunction = "glm", maxsize = vars, minsize = ifelse(vars ==
1, 0, min(c(vars, ncol(Xyr) - 1))), family = binomial,
offset = off)
BICvars <- BIC(attributes(res.best.logistic.v)$objects[[1]])
increased <- BICvars < BICfit
if (increased) {
res.best.logistic <- res.best.logistic.v
BICfit <- BICvars
vars <- vars + 1
}
}
best_vars <- names(coef(res.best.logistic))
best_edges <- best_vars[which(is.element(best_vars,
screenedr))]
best_edges
}
}
stopCluster(cl)
edges_subset <- unique(c(unlist(edges_subset)))
converged <- setequal(edges_subset, old_edges)
}
edges_inferred <- edges_subset
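# experimental second phase: repeat the per-receiver search among candidates
# whose screened correlations are negative, to infer inhibitory ties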
if (negative) {
edges_subset_pos <- edges_subset
edges_subset <- NULL
converged <- F
iteration <- 1
while (!converged) {
old_edges <- edges_subset
effect_names <- c(covariates, edges_subset, edges_subset_pos)
off <- rep(0, nrow(data_with_aest))
if (length(covariates) > 0) {
full_estimate <- glm(data_with_aest[, event] ~
as.matrix(data_with_aest[, effect_names]),
family = binomial)
off <- as.matrix(data_with_aest[, covariates]) %*%
(coef(full_estimate)[2:(length(covariates) +
1)])
}
data_with_aest$off <- off
cl <- makeCluster(ncore)
registerDoParallel(cl)
edges_subset <- foreach(reciever = unodes, .packages = c("glmulti")) %dopar%
{
edger <- do.call("rbind", strsplit(names(within_r_corr),
"_"))[, 2]
corr_r <- within_r_corr[which(edger == reciever)]
corr_r <- corr_r[which(corr_r < 0)]
if (length(corr_r) > 10)
corr_r <- corr_r[order(corr_r)[1:10]]
screenedr <- names(corr_r)
if (length(screenedr) > 0) {
Xyr <- data_with_aest[, c(screenedr, event)]
names(Xyr)[ncol(Xyr)] <- "y"
offr <- off[which(data_with_aest[, node] ==
reciever)]
sdxy <- apply(Xyr, 2, sd)
Xyr <- Xyr[, which(sdxy > 0)]
increased <- T
est0 <- glm(Xyr[, "y"] ~ 1, family = binomial)
BICfit <- BIC(est0)
vars <- 1
res.best.logistic <- NULL
while (increased) {
res.best.logistic.v <- glmulti(y = "y",
xr = names(Xyr)[-ncol(Xyr)], data = Xyr,
level = 1, method = "h", crit = "bic",
confsetsize = 1, plotty = F, report = F,
fitfunction = "glm", maxsize = vars,
minsize = ifelse(vars == 1, 0, vars),
family = binomial, offset = off)
BICvars <- BIC(attributes(res.best.logistic.v)$objects[[1]])
increased <- BICvars < BICfit
if (increased) {
res.best.logistic <- res.best.logistic.v
BICfit <- BICvars
vars <- vars + 1
}
}
best_vars <- names(coef(res.best.logistic))
best_edges <- best_vars[which(is.element(best_vars,
screenedr))]
best_edges
}
}
stopCluster(cl)
edges_subset <- unique(c(unlist(edges_subset)))
converged <- setequal(edges_subset, old_edges)
}
edges_inferred <- c(edges_subset_pos, edges_subset)
}
edge_cols <- data_with_aest[, edges_inferred, drop = FALSE]
edges_combined <- rowSums(edge_cols)
edge_dat <- data.frame(edge_cols, edges_combined)
data_for_neha <- data.frame(data_with_aest[, c(node, time,
event, cascade, covariates)], edge_dat)
combined_formula <- as.formula(paste(event, "~", paste(c(covariates,
"edges_combined"), collapse = "+"), sep = ""))
separate_formula <- as.formula(paste(event, "~", paste(c(covariates,
edges_inferred), collapse = "+"), sep = ""))
list(a_est = a_est, edges = edges_inferred, data_for_neha = data_for_neha,
combined_formula = combined_formula, separate_formula = separate_formula)
}