# PVI.2012.2016 FUNCTION
# - Makes PVI Scores Usable for Any Enumerated Scope
# - Pass the path to "PVI Scores.csv" (the default) or to a replacement file with the same columns
# - The File should have a header
# - Returns a dataframe
PVI.2012.2016 <- function(file = "PVI Scores.csv", header = TRUE){
  # Read a PVI score table and return it as a data frame keyed by FIPS.
  #
  # Arguments:
  #   file   - path handed to read.csv().
  #   header - handed to read.csv(). The columns `Official.Names`, `X2012`
  #            and `X2016` are located by name, so the file needs a header
  #            row that read.csv() turns into exactly those names.
  #
  # Returns a data frame with the columns
  #   State, County          - the first two columns of the file, as read
  #   FIPS                   - first five digits found in Official.Names
  #   PVI.2012, PVI.2016     - the score strings as they appear in the file
  #   Raw2012, Raw2016       - signed numeric score: R+ positive,
  #                            D+ negative, anything else 0
  #   Ternary2012/2016       - "R", "D" or "EVEN"
  #
  # Stops with an error when a required column is missing.
  PVI <- read.csv(file = file, header = header)

  required_cols <- c("Official.Names", "X2012", "X2016")
  absent <- setdiff(required_cols, names(PVI))
  if (length(absent) > 0) {
    stop("PVI file is missing required column(s): ",
         paste(absent, collapse = ", "), call. = FALSE)
  }

  # Strip everything but digits from the official name and keep the first
  # five of them; this is the key matched against county-level census data.
  fips <- substr(gsub("[^[:digit:]]", "", PVI$Official.Names), 1, 5)

  # Split one score column ("R+12", "D+3", "EVEN") into its signed numeric
  # value and its party label. Whitespace is ignored when reading the party
  # symbol so that " R+5" or "R+ 5" is not mistaken for EVEN.
  parse_score <- function(score) {
    score <- as.character(score)
    symbol <- gsub("[[:digit:][:space:]]", "", score)
    magnitude <- as.numeric(gsub("[^[:digit:]]", "", score))
    leans_r <- symbol == "R+"
    leans_d <- symbol == "D+"
    list(
      raw = ifelse(leans_r, magnitude, ifelse(leans_d, -1 * magnitude, 0)),
      ternary = ifelse(leans_r, "R", ifelse(leans_d, "D", "EVEN"))
    )
  }
  score_2012 <- parse_score(PVI$X2012)
  score_2016 <- parse_score(PVI$X2016)

  # Derived columns are picked by name rather than by position, so extra
  # columns in the csv cannot shift the output.
  result <- data.frame(
    PVI[, 1:2],
    fips,
    PVI[c("X2012", "X2016")],
    score_2012$raw,
    score_2016$raw,
    score_2012$ternary,
    score_2016$ternary,
    stringsAsFactors = FALSE
  )
  colnames(result) <- c("State", "County", "FIPS", "PVI.2012", "PVI.2016",
                        "Raw2012", "Raw2016", "Ternary2012", "Ternary2016")
  result
}
# Data.Set.Guide function
# If you aren't sure whether your census datasets are uniform, this will at least find and process the ones that are.
# The datasets that share the same (largest) column count are the ones that get processed.
# Once you figure out which dates are uniform, move the already-used datasets out of the directory and repeat, or something like that.
# PATH
# Returns a list containing a vector of file addresses used
# A data frame of the Description of Variables with their respective column index
Data.Set.Guide <- function(DIR = "C:\\Users\\M\\Desktop\\Elections Forecasting\\GEOGRAPHIC_MOBILITY_5YR_ACS_S0701", pattern="_5YR_S0701_with_ann.csv$" ){
  # Build a codebook for a folder of census csv files that share a layout.
  #
  # Arguments:
  #   DIR     - directory searched recursively for csv files.
  #   pattern - regular expression the file names must match.
  #
  # Every matching file is read once to get its column count. The files
  # whose column count equals the largest one found are the "valid" set;
  # the first of them supplies the codebook.
  #
  # Returns an unnamed list:
  #   [[1]] data frame with `Detail` (the first data row of the reference
  #         file, i.e. the per-column description) and `R Index` (the
  #         integer column position); row names are the csv column names.
  #   [[2]] one-column data frame `valid_dataset_addresses` holding the
  #         paths of the valid files (no NA placeholders for skipped files).
  #
  # Stops when no file matches or the reference file has no data rows.
  # In an interactive session both data frames are also opened with View().
  fils <- list.files(DIR, pattern = pattern, full.names = TRUE, recursive = TRUE)
  if (length(fils) == 0) {
    stop("No files matching pattern '", pattern, "' were found under: ", DIR,
         call. = FALSE)
  }

  # Column count of every candidate file.
  col_counts <- vapply(
    fils,
    function(path) ncol(read.csv(path, header = TRUE)),
    integer(1),
    USE.NAMES = FALSE
  )
  is_valid <- col_counts == max(col_counts)

  valid_dataset_addresses <- data.frame(
    valid_dataset_addresses = fils[is_valid],
    stringsAsFactors = FALSE
  )

  # Only the first valid file is needed for the codebook, so it is the only
  # one read again.
  reference <- read.csv(fils[which(is_valid)[1]], header = TRUE)
  if (nrow(reference) < 1) {
    stop("Reference dataset has no rows to build a codebook from: ",
         fils[which(is_valid)[1]], call. = FALSE)
  }

  # First data row, one string per column, named by the csv column names.
  detail <- vapply(reference[1, , drop = FALSE], as.character, character(1))
  County_Variable_Detail <- data.frame(
    Detail = detail,
    `R Index` = seq_along(detail),
    check.names = FALSE,
    stringsAsFactors = FALSE
  )

  guide <- list(County_Variable_Detail, valid_dataset_addresses)
  print("OutPutObject[[1]] <- dataframe of codebook for all datasets included.")
  print("OutPutObject[[2]] <- addresses of included datasets.")
  # The data viewer is only opened when a user is present to see it.
  if (interactive()) {
    View(County_Variable_Detail)
    View(valid_dataset_addresses)
  }
  guide
}
# FolderSearch.By.Id Function
# About: This function will allow you to automate the collection of variables of interest...
# ... across multiple csv files. This is good for collecting the same variables from different ....
# ... csv files representing years.
# Parameters you need to plug in:
# Directory of files
# file with FIPS column named FIPS
# pattern default is "_with_ann.csv"
# indices_of_interest = what are column numbers of the estimates and their respective MoEs?
# Right Way: indices_of_interest <-c(44,45,66,67); Wrong Way: indices_of_interest <-c(44,66,45,67);
# Should be in function input
#data_interests_index_vector = c(474,475,104,105)
#DIR = "C:\\Users\\GEOGRAPHIC_MOBILITY_5YR_ACS_S0701"
#DataSet.Generic.Name = "ACS"
#pattern = "_with_ann.csv$"
#Covariate.and.MoE.Names = c("Median.Income","Median.Income.MoE","Median.Age","Median.Age.MoE")
#PVI_dataset_with_FIPS_COLUMN_NAMED_FIPS = PVI_df
FolderSearch.By.Id <- function(targeted_fit = PVI_dataset_with_FIPS_COLUMN_NAMED_FIPS,
                               DIR = DIR,
                               pattern="_with_ann.csv$",
                               indices_of_interest = data_interests_index_vector,
                               DataSet.Generic.Name = "ACS",
                               Covariate.and.MoE.Names = Covariate.and.MoE.Names){
  # Collect the same estimate / margin-of-error columns from every csv in a
  # folder and line them up against the FIPS codes of `targeted_fit`.
  #
  # Arguments:
  #   targeted_fit            - data frame with a column named FIPS.
  #   DIR                     - directory searched recursively for csv files.
  #   pattern                 - regular expression the file names must match.
  #   indices_of_interest     - column numbers as (estimate, MoE) pairs,
  #                             e.g. c(44, 45, 66, 67).
  #   DataSet.Generic.Name    - label used as the column-name prefix.
  #   Covariate.and.MoE.Names - output names, in the same pair order as
  #                             `indices_of_interest`.
  #
  # NOTE(review): the defaults `DIR = DIR` and
  # `Covariate.and.MoE.Names = Covariate.and.MoE.Names` refer to themselves,
  # so both arguments have to be supplied by the caller. They are kept as-is
  # to leave the signature untouched.
  #
  # Returns a data frame with one row per row of `targeted_fit` and, for
  # each usable dataset, one estimate and one MoE column per pair, named
  # "<DataSet.Generic.Name>.<YY>.<name>". <YY> is the first "_NN" token of
  # the file name (falling back to the full path). Rows whose FIPS is not in
  # the second column of a dataset are NA for that dataset.
  #
  # When `targeted_fit` has no FIPS column the function prints a hint and
  # returns that string invisibly instead of a data frame.
  if (!("FIPS" %in% names(targeted_fit))) {
    return(invisible(print("Add the name FIPS to your column of FIPS. Or get FIPS to add to your dataset you want to attach census data based on similar enumeration levels. ")))
  }

  # Both vectors must come in (estimate, MoE) pairs.
  if (length(indices_of_interest) %% 2 != 0) {
    stop("You're vector isn't even. So either you forgot an estimate or MoE. You have to include both for this function.\nAdd in order of Estimate then its MoE. ")
  }
  if (length(Covariate.and.MoE.Names) %% 2 != 0) {
    stop("You're vector isn't even. So either you forgot to name an estimate or MoE.")
  }
  if (length(indices_of_interest) == 0) {
    stop("indices_of_interest is empty; supply at least one estimate/MoE pair.",
         call. = FALSE)
  }
  if (length(Covariate.and.MoE.Names) < length(indices_of_interest)) {
    stop("Covariate.and.MoE.Names must supply one name per entry of indices_of_interest.",
         call. = FALSE)
  }

  # Odd positions are estimates, even positions are their MoEs.
  estimate_slot <- seq_along(indices_of_interest) %% 2 == 1
  Estimates_of_interest <- indices_of_interest[estimate_slot]
  MoE_of_interest <- indices_of_interest[!estimate_slot]
  name_slot <- seq_along(Covariate.and.MoE.Names) %% 2 == 1
  Covariate.Names <- Covariate.and.MoE.Names[name_slot]
  MoE.Names <- Covariate.and.MoE.Names[!name_slot]

  fils <- list.files(DIR, pattern = pattern, full.names = TRUE, recursive = TRUE)
  if (length(fils) == 0) {
    stop("No files matching pattern '", pattern, "' were found under: ", DIR,
         call. = FALSE)
  }

  # A dataset is usable only if it is wide enough to contain the furthest
  # requested column.
  furthest_index <- max(indices_of_interest)
  pieces <- list()
  for (i in seq_along(fils)) {
    address_of_dataset <- fils[i]
    dataset <- read.csv(address_of_dataset, header = TRUE)
    if (ncol(dataset) < furthest_index) {
      print(sprintf("furthest location of covariate of interest and MoE not found, so dataset %d, is ignored", i))
      next
    }

    # Year token: look in the file name first so that a directory name
    # containing "_NN" cannot take precedence; fall back to the full path.
    file_label <- basename(address_of_dataset)
    DataSetType_Year <- regmatches(file_label,
                                   regexpr("_\\d{2}", file_label, perl = TRUE))
    if (length(DataSetType_Year) == 0) {
      DataSetType_Year <- regmatches(address_of_dataset,
                                     regexpr("_\\d{2}", address_of_dataset, perl = TRUE))
    }
    # "_09" becomes "ACS.09."
    DataSetType_YearACS <- paste0(gsub('_', paste0(DataSet.Generic.Name,'\\.'), DataSetType_Year),".")

    # Row of this dataset for every FIPS in the target (NA when absent);
    # the second column of the dataset holds the id matched against FIPS.
    matched_rows <- match(targeted_fit$FIPS, dataset[, 2])

    for (j in seq_along(Estimates_of_interest)) {
      # Take exactly the estimate and its MoE, not the range between them.
      pair <- dataset[matched_rows,
                      c(Estimates_of_interest[j], MoE_of_interest[j]),
                      drop = FALSE]
      rownames(pair) <- NULL
      colnames(pair) <- c(paste0(DataSetType_YearACS, Covariate.Names[j]),
                          paste0(DataSetType_YearACS, MoE.Names[j]))
      pieces[[length(pieces) + 1]] <- pair
    }
  }

  if (length(pieces) == 0) {
    stop("None of the datasets contain column ", furthest_index,
         "; nothing could be extracted.", call. = FALSE)
  }

  # Bind all pair frames side by side, in dataset order then pair order.
  do.call(data.frame,
          c(pieces, list(check.names = TRUE, stringsAsFactors = FALSE)))
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.