R/Generate_Heldout_Samples.R

#' A Function to generate test-train splits using NH county data. This is meant for internal use with our own data format. 
#' 
#' @useDynLib ContentStructure
#' @importFrom Rcpp evalCpp
#' @param files A string vector containing the names of the well-formatted .Rdata files with data from the counties we wish to generate a test train split from. 
#' @param seed The seed we wish to use to generate the random splits. This is set up so we can vary it and so we can reproduce our results.
#' @param held_out_proportion A number between zero and 1, defaults to 0.1
#' @return Returns a list of lists where each sub-list contains a test-train split of the data generated by the chosen seed. 
#' @export
Generate_Heldout_Samples <- function(files,
                                     seed,
                                     held_out_proportion = 0.1){
  set.seed(seed)
  Sociomatrix_List <- vector(length = length(files), mode = "list")
  names(Sociomatrix_List) = files
  document_edge_matrix <- author_attributes <-  NULL
  vocabulary <- document_word_matrix <- NULL
  
  # loop through each county
  for(i in 1:length(files)){
    
    # load in the data
    cat("Currently processing county:",files[i],"\n")
    load(files[i])
    
    nact <- length(author_attributes[,1])
    
    Observed <- matrix(0,nrow = nact, ncol = nact )
    Heldout <- matrix(0,nrow = nact, ncol = nact )
    
    samp_vec <- 1:length(document_edge_matrix[,1])
    samp_size <- length(samp_vec)*held_out_proportion
    hold_out <- sample(samp_vec,samp_size, replace = FALSE)
    
    for(j in 1:length(document_edge_matrix[,1])){
      send <- document_edge_matrix[j,1]

      for(k in 2:length(document_edge_matrix[1,])){
        if(length(j %in% hold_out) > 0){
          Heldout[send,k-1] <- Heldout[send,k-1] + document_edge_matrix[j,k]
        }else{
          Observed[send,k-1] <- Observed[send,k-1] + document_edge_matrix[j,k]
        }
      }
    }
    
    Observed_Emails <- document_edge_matrix[-hold_out,]
    Heldout_Emails <- document_edge_matrix[hold_out,]
    Observed_Email_Words <- document_word_matrix[-hold_out,]
    Heldout_Email_Words <- document_word_matrix[hold_out,]
    
    
    #generate gender indicator vector
    find_F <- function(str){
      if(grepl("f",tolower(str))){
        1
      }else{
        0
      }
    }
    gend <- sapply(author_attributes$Gender,find_F)
    toadd <- list(observed_sociomatrix = Observed, 
                  heldout_sociomatrix = Heldout,
                  observed_emails = Observed_Emails,
                  heldout_emails = Heldout_Emails,
                  observed_email_words = Observed_Email_Words,
                  heldout_email_words = Heldout_Email_Words,
                  gender = gend,
                  department = author_attributes$Department,
                  author_attributes = author_attributes,
                  vocabulary = vocabulary)
    
    Sociomatrix_List[[i]] <- toadd
  }
  return(Sociomatrix_List)
}
matthewjdenny/ContentStructure documentation built on May 21, 2019, 1:01 p.m.