# Source file: R/CensusSortR-package.r

# PVI.2012.2016 FUNCTION
# - Makes PVI scores usable for any enumerated scope.
# - Reads a PVI csv file and returns a data frame with one row per csv row.
#
# Arguments:
#   file   - path of the PVI csv file (default "PVI Scores.csv").
#   header - passed to read.csv(); the file should have a header, because the
#            columns Official.Names, X2012 and X2016 are looked up by name.
#
# Expected input (as used by the code below):
#   - columns 1 and 2 are returned as "State" and "County" (taken by position),
#   - Official.Names holds text whose digits, joined together, start with the
#     5-digit county FIPS code,
#   - X2012 / X2016 hold scores written like "R+5", "D+12" or "EVEN".
#
# Returns a data frame with the columns
#   State, County, FIPS, PVI.2012, PVI.2016, Raw2012, Raw2016,
#   Ternary2012, Ternary2016
# where Raw* is positive for R+, negative for D+ and 0 otherwise, and
# Ternary* is "R", "D" or "EVEN".

PVI.2012.2016 <- function(file = "PVI Scores.csv", header = TRUE){
  PVI <- read.csv(file = file, header = header, stringsAsFactors = FALSE)

  # The score and name columns are located by name, so extra or reordered
  # columns in the csv cannot shift the output columns.
  required_columns <- c("Official.Names", "X2012", "X2016")
  absent_columns <- setdiff(required_columns, names(PVI))
  if (length(absent_columns) > 0) {
    stop("The PVI file is missing the column(s): ",
         paste(absent_columns, collapse = ", "),
         ". It needs a header row naming Official.Names, X2012 and X2016.")
  }

  # Keep only the digits of Official.Names and use the first five as the FIPS
  # code. It stays a character vector so leading zeros survive.
  fips <- substr(gsub("[^[:digit:]]", "", PVI$Official.Names), 1, 5)

  # Split one score column into its signed numeric value and its party label.
  parse_pvi <- function(score) {
    score <- as.character(score)
    # Dropping whitespace together with the digits lets "R+ 5" read as "R+".
    symbol <- gsub("[[:digit:][:space:]]", "", score)
    magnitude <- as.numeric(gsub("[^[:digit:]]", "", score))
    list(
      # If R+ positive, else if D+ negative, else 0
      raw = ifelse(symbol == "R+",
                   magnitude,
                   ifelse(symbol == "D+", -1 * magnitude, as.numeric(0))),
      ternary = ifelse(symbol == "R+", "R",
                       ifelse(symbol == "D+", "D", "EVEN"))
    )
  }

  pvi_2012 <- parse_pvi(PVI$X2012)
  pvi_2016 <- parse_pvi(PVI$X2016)

  result <- data.frame(PVI[, 1:2],
                       fips,
                       PVI$X2012,
                       PVI$X2016,
                       pvi_2012$raw,
                       pvi_2016$raw,
                       pvi_2012$ternary,
                       pvi_2016$ternary,
                       stringsAsFactors = FALSE)

  colnames(result) <- c("State", "County", "FIPS", "PVI.2012", "PVI.2016",
                        "Raw2012", "Raw2016", "Ternary2012", "Ternary2016")

  result
}

# Data.Set.Guide function
# If you are not sure whether your census csv files all share the same layout,
# this finds the files that have the largest column count and builds a
# codebook from the first of them.
#
# Arguments:
#   DIR     - folder that is searched (recursively) for csv files.
#   pattern - regular expression the file names must match.
#
# Returns an unnamed list of two data frames:
#   [[1]] codebook: one row per column of the reference file, with "Detail"
#         (the content of that file's first data row) and "R Index" (the
#         column position). Row names are the csv column names.
#   [[2]] addresses: column valid_dataset_addresses listing the files whose
#         column count equals the largest column count found.
# Side effects: prints two lines describing the list and, in an interactive
# session only, opens both data frames with View().
Data.Set.Guide <- function(DIR = "C:\\Users\\M\\Desktop\\Elections Forecasting\\GEOGRAPHIC_MOBILITY_5YR_ACS_S0701", pattern="_5YR_S0701_with_ann.csv$" ){
  fils <- list.files(DIR, pattern, full.names = TRUE, recursive = TRUE)
  if (length(fils) == 0) {
    stop("No files matching '", pattern, "' were found in: ", DIR)
  }

  # Read every file once: record its column count and keep the first file
  # that reaches the largest column count as the codebook reference.
  column_counts <- integer(length(fils))
  reference <- NULL
  for (i in seq_along(fils)) {
    current <- read.csv(fils[i], header = TRUE)
    column_counts[i] <- ncol(current)
    # Strictly greater, so among equally wide files the earliest one is kept.
    if (is.null(reference) || column_counts[i] > ncol(reference)) {
      reference <- current
    }
  }

  # Only the files that are as wide as the widest one are reported; files that
  # were excluded do not leave NA entries behind.
  is_valid <- column_counts == max(column_counts)
  valid_dataset_addresses <- data.frame(
    valid_dataset_addresses = fils[is_valid],
    stringsAsFactors = FALSE
  )

  # Create a codebook for the dataset: transpose the first data row so each
  # csv column becomes one codebook row, then attach its column position.
  # t() is applied to the one-row data frame directly (not to as.vector() of
  # it), so it does not rely on as.vector() leaving a data frame unchanged.
  first_row <- t(reference[1, , drop = FALSE])
  County_Variable_Detail <- as.data.frame(
    cbind(first_row, seq_len(ncol(reference))),
    stringsAsFactors = FALSE
  )
  colnames(County_Variable_Detail) <- c("Detail","R Index")

  guide <- list(County_Variable_Detail, valid_dataset_addresses)
  print("OutPutObject[[1]] <- dataframe of codebook for all datasets included.")
  print("OutPutObject[[2]] <- addresses of included datasets.")
  # View() needs a data viewer, so it is skipped in scripts and batch runs.
  if (interactive()) {
    View(County_Variable_Detail)
    View(valid_dataset_addresses)
  }
  return(guide)
}

# FolderSearch.By.Id Function
# About: collects the same estimate / margin-of-error (MoE) columns from every
# matching csv file in a folder (for example one file per year) and lines the
# values up with the FIPS codes of your own dataset.
#
# Arguments:
#   targeted_fit            - data frame with a column named FIPS.
#   DIR                     - folder searched (recursively) for csv files.
#                             Must be supplied.
#   pattern                 - regular expression the file names must match;
#                             default is "_with_ann.csv$".
#   indices_of_interest     - column numbers of the estimates and their MoEs,
#                             in Estimate-then-MoE order.
#                             Right way: c(44,45,66,67); wrong way: c(44,66,45,67).
#   DataSet.Generic.Name    - text put in front of the year in the new column
#                             names (default "ACS").
#   Covariate.and.MoE.Names - names for the extracted columns, in the same
#                             Estimate-then-MoE order. Must be supplied.
#
# Example values:
#   indices_of_interest     = c(474,475,104,105)
#   DIR                     = "C:\\Users\\GEOGRAPHIC_MOBILITY_5YR_ACS_S0701"
#   DataSet.Generic.Name    = "ACS"
#   pattern                 = "_with_ann.csv$"
#   Covariate.and.MoE.Names = c("Median.Income","Median.Income.MoE","Median.Age","Median.Age.MoE")
#
# Details:
#   - Rows are matched with match(targeted_fit$FIPS, <column 2 of each file>).
#   - The first "_" followed by two digits in the file name (e.g. "_10") is
#     used as the year part of the new column names, giving names such as
#     "ACS.10.Median.Age".
#   - A file with fewer columns than the largest requested index is skipped
#     with a message.
#
# Returns a data frame with one row per row of targeted_fit and, for every
# used file, one column per requested estimate and MoE.
# If targeted_fit has no FIPS column, a hint is printed and returned instead.
# Stops with an error when the index or name vectors are not usable, when no
# file matches, or when no file has the requested columns.

FolderSearch.By.Id <- function(targeted_fit = PVI_dataset_with_FIPS_COLUMN_NAMED_FIPS, 
                               DIR = DIR,
                               pattern="_with_ann.csv$",
                               indices_of_interest = data_interests_index_vector,
                               DataSet.Generic.Name = "ACS",
                               Covariate.and.MoE.Names = Covariate.and.MoE.Names){
  # DO YOU HAVE A COLUMN CALLED FIPS?
  if (!("FIPS" %in% names(targeted_fit))) {
    return(print("Add the name FIPS to your column of FIPS. Or get FIPS to add to your dataset you want to attach census data based on  similar enumeration levels. "))
  }

  # DIR and Covariate.and.MoE.Names have defaults that refer to themselves,
  # so leaving them out cannot work; say so clearly.
  if (missing(DIR)) {
    stop("DIR must be supplied: the folder that holds the census csv files.")
  }
  if (missing(Covariate.and.MoE.Names)) {
    stop("Covariate.and.MoE.Names must be supplied: one name per index, in Estimate then MoE order.")
  }

  # Every estimate needs its MoE, so both vectors must have an even length.
  if (length(indices_of_interest) == 0 || length(indices_of_interest) %% 2 != 0) {
    stop("You're vector isn't even. So either you forgot an estimate or MoE. You have to include both for this function.\n           Add in order of Estimate then its MoE. ")
  }
  if (length(Covariate.and.MoE.Names) %% 2 != 0) {
    stop("You're vector isn't even. So either you forgot to name an estimate or MoE.")
  }
  if (!is.numeric(indices_of_interest) || anyNA(indices_of_interest) ||
      any(indices_of_interest < 1)) {
    stop("indices_of_interest must be positive column numbers.")
  }
  if (length(Covariate.and.MoE.Names) < length(indices_of_interest)) {
    stop("Covariate.and.MoE.Names needs one name for every entry of indices_of_interest.")
  }

  # Odd positions are estimates, even positions are their MoEs.
  estimate_index <- indices_of_interest[c(TRUE, FALSE)]
  moe_index <- indices_of_interest[c(FALSE, TRUE)]
  estimate_names <- Covariate.and.MoE.Names[c(TRUE, FALSE)]
  moe_names <- Covariate.and.MoE.Names[c(FALSE, TRUE)]

  fils <- list.files(DIR, pattern=pattern, full.names = TRUE, recursive = TRUE)
  if (length(fils) == 0) {
    stop("No files matching '", pattern, "' were found in: ", DIR)
  }

  # A file is usable when it reaches the furthest requested column and has
  # the second column that is used for the FIPS lookup.
  columns_needed <- max(indices_of_interest, 2)

  # One two-column data frame (estimate, MoE) per file and requested pair.
  extracted <- list()
  for (i in seq_along(fils)) {
    dataset <- read.csv(fils[i], header=TRUE)
    if (ncol(dataset) < columns_needed) {
      message(sprintf("furthest location of covariate of interest and MoE not found, so dataset %d, is ignored", i))
      next
    }

    # Find the year in the file name first, so an underscore followed by
    # digits in a folder name is not mistaken for it; fall back to the full
    # address when the file name has no such token.
    address_of_dataset <- basename(fils[i])
    year_token <- regmatches(address_of_dataset,
                             regexpr("_\\d{2}", address_of_dataset, perl=TRUE))
    if (length(year_token) == 0) {
      year_token <- regmatches(fils[i], regexpr("_\\d{2}", fils[i], perl=TRUE))
    }
    # "_10" becomes "ACS.10."
    column_prefix <- paste0(gsub("_", paste0(DataSet.Generic.Name, "."),
                                 year_token, fixed = TRUE), ".")

    # Row of this file for each FIPS code of targeted_fit (NA when absent).
    fips_rows <- match(targeted_fit$FIPS, dataset[, 2])

    for (j in seq_along(estimate_index)) {
      # The two columns are picked individually, so the estimate and its MoE
      # do not have to sit next to each other in the file.
      pair <- data.frame(dataset[[estimate_index[j]]][fips_rows],
                         dataset[[moe_index[j]]][fips_rows],
                         stringsAsFactors = FALSE)
      colnames(pair) <- c(paste0(column_prefix, estimate_names[j]),
                          paste0(column_prefix, moe_names[j]))
      extracted[[length(extracted) + 1L]] <- pair
    }
  }

  if (length(extracted) == 0) {
    stop("None of the files has the columns named in indices_of_interest.")
  }

  # Bind all extracted pairs side by side.
  do.call(data.frame, c(extracted, list(stringsAsFactors = FALSE)))
}
# melmaniwan/CensusSortR documentation built on May 12, 2019, 4:36 a.m.