#' A Function to generate test-train splits using NH county data. This is meant for internal use with our own data format.
#'
#' @useDynLib ContentStructure
#' @importFrom Rcpp evalCpp
#' @param files A string vector containing the names of the well-formatted .Rdata files with data from the counties we wish to generate a test train split from.
#' @param seed The seed we wish to use to generate the random splits. This is set up so we can vary it and so we can reproduce our results.
#' @param held_out_proportion A number between zero and 1, defaults to 0.1
#' @return Returns a list of lists where each sub-list contains a test-train split of the data generated by the chosen seed.
#' @export
Generate_Heldout_Samples <- function(files,
seed,
held_out_proportion = 0.1){
set.seed(seed)
Sociomatrix_List <- vector(length = length(files), mode = "list")
names(Sociomatrix_List) = files
document_edge_matrix <- author_attributes <- NULL
vocabulary <- document_word_matrix <- NULL
# loop through each county
for(i in 1:length(files)){
# load in the data
cat("Currently processing county:",files[i],"\n")
load(files[i])
nact <- length(author_attributes[,1])
Observed <- matrix(0,nrow = nact, ncol = nact )
Heldout <- matrix(0,nrow = nact, ncol = nact )
samp_vec <- 1:length(document_edge_matrix[,1])
samp_size <- length(samp_vec)*held_out_proportion
hold_out <- sample(samp_vec,samp_size, replace = FALSE)
for(j in 1:length(document_edge_matrix[,1])){
send <- document_edge_matrix[j,1]
for(k in 2:length(document_edge_matrix[1,])){
if(length(j %in% hold_out) > 0){
Heldout[send,k-1] <- Heldout[send,k-1] + document_edge_matrix[j,k]
}else{
Observed[send,k-1] <- Observed[send,k-1] + document_edge_matrix[j,k]
}
}
}
Observed_Emails <- document_edge_matrix[-hold_out,]
Heldout_Emails <- document_edge_matrix[hold_out,]
Observed_Email_Words <- document_word_matrix[-hold_out,]
Heldout_Email_Words <- document_word_matrix[hold_out,]
#generate gender indicator vector
find_F <- function(str){
if(grepl("f",tolower(str))){
1
}else{
0
}
}
gend <- sapply(author_attributes$Gender,find_F)
toadd <- list(observed_sociomatrix = Observed,
heldout_sociomatrix = Heldout,
observed_emails = Observed_Emails,
heldout_emails = Heldout_Emails,
observed_email_words = Observed_Email_Words,
heldout_email_words = Heldout_Email_Words,
gender = gend,
department = author_attributes$Department,
author_attributes = author_attributes,
vocabulary = vocabulary)
Sociomatrix_List[[i]] <- toadd
}
return(Sociomatrix_List)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.