#### Story: "Dear Mona, What’s The Most Common Name In America?"
#### Url: http://fivethirtyeight.com/features/whats-the-most-common-name-in-america/
#### Authors: Mona Chalabi (Mona.Chalabi@fivethirtyeight.com) and Andrew Flowers (andrew.flowers@fivethirtyeight.com)
require(babynames)
require(dplyr)
require(reshape2)
require(zoo)
require(datasets)
# Census population parameters
pop2000 <- 276059000 # year 2000 population
pop2013 <- 316128839 # year 2013 population
hispPopShare <- .171 # Hispanic share of overall population
foreignPopShare <- .127 # Foreign-born share of overall population
# Census growth rates by racial categories, 2000-2013
whiteGrowth <- 1.01155164
blackGrowth <- 1.138799977
aianGrowth <- 1.110695106
asianGrowth <- 1.553975166
twoRaceGrowth <- 1.817182595
hispGrowth <- 1.531490233
### Step 1: Actuarial Adjustments
# Create actuarial ajustments by birth year according to SSA probabilities
aging <- read.csv("aging-curve.csv") # taken from SSA: http://www.ssa.gov/oact/NOTES/as120/LifeTables_Tbl_7.html; 2013 is used as base year
agingSmooth <- data.frame(year=c(1900:2013))
agingSmooth$perctAliveMale <- aging[match(agingSmooth$year, aging$Decade),]$Male.1
agingSmooth$perctAliveFemale <- aging[match(agingSmooth$year, aging$Decade),]$Female.1
agingSmooth$perctAliveMale[1:111] <- na.approx(agingSmooth$perctAliveMale)
agingSmooth$perctAliveFemale[1:111] <- na.approx(agingSmooth$perctAliveFemale)
agingSmooth$perctAliveMale[112:114] <- 0.993480 # Assume probabilities are flat since 2010
agingSmooth$perctAliveFemale[112:114] <- 0.994490 # Assume probabilities are flat since 2010
# Estimate stock of first names of those currently living, using actuarial probablities
babyNames <- babynames %>% filter(year>=1900)
babyNames$perctAlive <- ifelse(babyNames$sex=="F", agingSmooth[match(babyNames$year, agingSmooth$year),]$perctAliveFemale, agingSmooth[match(babyNames$year, agingSmooth$year),]$perctAliveMale)
babyNames$nAlive <- babyNames$n * babyNames$perctAlive
stockNames <- babyNames %>% group_by(name) %>%
dplyr::summarize(totalAlive=sum(nAlive)) %>% ungroup() %>%
arrange(desc(totalAlive))
grep("Maria", stockNames$name)[1] # "Maria" is the 97th most common first name (unadjusted)
stockNames$perct2013 <- stockNames$totalAlive/pop2013
#### Step 2: Adjust first names to correct for undercounting of Hispanic names
# Why? Need to scale up Hispanic/Latino names because immigration not factored into SSA data set
# We can make inferences about Hispanic name undercounting by looking at names by state, which the SSA does provide.
######### ONLY RUN THIS ONCE, DATA SAVED IN all-states-babynames.csv #########
# Download state files from here: http://www.ssa.gov/oact/babynames/state/namesbystate.zip
# stateFiles <- grep("\\.TXT", list.files())
# allStates <- data.frame()
# for (i in stateFiles){
# stateData <- read.table(list.files()[i], sep=",")
# allStates <- rbind(allStates, stateData)
# }
# names(allStates) <- c("state", "sex", "year", "name", "n")
# write.csv(allStates, "all-states-babynames.csv")
######### ONLY RUN THIS ONCE, DATA SAVED IN all-states-babynames.csv #########
allStates <- read.csv("all-states-babynames.csv", stringsAsFactors=F) # Huge file, created from code above
allStates$perctAlive <- ifelse(allStates$sex=="F", agingSmooth[match(allStates$year, agingSmooth$year),]$perctAliveFemale, agingSmooth[match(babyNames$year, agingSmooth$year),]$perctAliveMale)
allStates$nAlive <- allStates$n * allStates$perctAlive
stateNames <- allStates %>% group_by(name, state) %>%
dplyr::summarize(totalAlive=sum(nAlive)) %>% ungroup() %>%
arrange(desc(totalAlive))
# Import state's hispanic population
statePop <- read.csv("state-pop.csv", stringsAsFactors=F)
statePop$totalPop <- gsub( ",", "", statePop$totalPop)
statePop$hispPop <- gsub( ",", "", statePop$hispPop)
statePop$hispPerct <- as.numeric(statePop$hispPop)/as.numeric(statePop$totalPop)
statePop$state <- state.abb[match(statePop$state, state.name)]
statePop$state[9] <- "DC" # Correct D.C. being coded as "NA"
# Add Hispanic population percentage and state population to each name/state combo
stateNames$hispPerct <- statePop[match(stateNames$state, statePop$state),]$hispPerct
stateNames$statePop <- statePop[match(stateNames$state, statePop$state),]$totalPop
# Calculate frequency of that name in that state, nation-wide, and difference between the two
stateNames$stateFreq <- as.numeric(stateNames$totalAlive) / as.numeric(stateNames$statePop)
stateNames$natFreq <- stockNames[match(stateNames$name, stockNames$name),]$perct2013
stateNames$diffFreq <- stateNames$stateFreq - stateNames$natFreq
# Group name/state combos into one national total, and take top 1000 names
natNamesByState <- stateNames %>% group_by(name) %>% summarize(n=sum(totalAlive)) %>% arrange(desc(n))
topNames <- natNamesByState[1:1000,]$name
# Regress each state's Hispanic population on the frequency each first name appears in the state.
coefs <- data.frame()
# Formula: stateFrequency ~ intercept + hispPerct, weighted by the square-root of the state's population
for(i in 1:length(topNames)){
name <- paste0("^", topNames[i], "$")
m <- lm(data=stateNames[grep(name, stateNames$name),], formula=stateFreq~hispPerct, weights=sqrt(as.numeric(statePop)))
coefs <- rbind(coefs, coef(m))
}
names(coefs) <- c("intercept", "hispPerct")
coefs$name <- topNames
coefs$correction <- (coefs$intercept + coefs$hispPerct*hispPopShare)/(coefs$intercept + coefs$hispPerct*foreignPopShare)
hispNameCorrex <- coefs %>% arrange(desc(correction))
# Cap Hispanic name correction on top and bottom
hispNameCorrex$correction <- ifelse(hispNameCorrex$correction > (hispPopShare/foreignPopShare), (hispPopShare/foreignPopShare), hispNameCorrex$correction)
hispNameCorrex$correction <- ifelse(hispNameCorrex$correction < (1-hispPopShare)/(1-foreignPopShare), (1-hispPopShare)/(1-foreignPopShare), hispNameCorrex$correction)
stockNames$correction <- hispNameCorrex[match(stockNames$name, hispNameCorrex$name),]$correction
stockNames$correction <- ifelse(is.na(stockNames$correction), 1.0, stockNames$correction)
stockNames$newTotal <- stockNames$total * stockNames$correction
stockNames$newPerct2013 <- stockNames$newTotal / pop2013
stockNames2 <- stockNames %>% arrange(desc(newPerct2013))
#### Step 3: Surnames analysis
# Adjust up surnames by their racial shares percent
surnames <- read.csv("surnames.csv", stringsAsFactors=F)
# Growth rates taken from 2000 to 2013; parameters are at top of script
surnames$hispCount2013 <- (as.numeric(surnames$count) * (as.numeric(surnames$pcthispanic)/100)) * hispGrowth
surnames$whiteCount2013 <- (as.numeric(surnames$count) * (as.numeric(surnames$pctwhite)/100)) * whiteGrowth
surnames$blackCount2013 <- (as.numeric(surnames$count) * (as.numeric(surnames$pctblack)/100)) * blackGrowth
surnames$asianCount2013 <- (as.numeric(surnames$count) * (as.numeric(surnames$pctapi)/100)) * asianGrowth
surnames$aianCount2013 <- (as.numeric(surnames$count) * (as.numeric(surnames$pctaian)/100)) * aianGrowth
surnames$twoRaceCount2013 <- (as.numeric(surnames$count) * (as.numeric(surnames$pct2prace)/100)) * twoRaceGrowth
surnames$Count2013 <- surnames$hispCount2013 + surnames$whiteCount2013 + surnames$blackCount2013 + surnames$asianCount2013 + surnames$aianCount2013 + surnames$twoRaceCount2013
surnames$perct2013 <- surnames$Count2013/pop2013
surnames <- surnames %>% arrange(desc(perct2013))
# Filter out top 100 surnames and first names
topFirstName <- stockNames2[1:100,] %>% select(name, newPerct2013)
topSurnames <- surnames[1:100,] %>% select(name, perct2013)
# Create empty matrix of top 100 surnames and first names
nameMatches <- data.frame(matrix(NA, nrow = 100, ncol = 100))
row.names(nameMatches) <- topFirstName$name
names(nameMatches) <- topSurnames$name
# Calculate "independent" probabilities
for (i in 1:100){
nameMatches[,i] <- topFirstName$newPerct2013 * topSurnames[i,]$perct2013
}
nameMatches2 <- nameMatches * pop2013 # Scale matrix by 2013 population
# Melt independent name name matches
longNames <- melt(nameMatches2, measure.vars=1:100)
longNames$firstName <- rep(topFirstName$name, 100)
longNames <- longNames %>% select(firstName, variable, value) %>% arrange(desc(value))
names(longNames) <- c("FirstName", "Surname", "EstimatedNumber")
# Read in Hartman's adjustments table, and melt it
# Source for the data in the 'adjustments.csv' is here: http://mypage.siu.edu/lhartman/johnsmith.html
adjustments <- read.csv("adjustments.csv")
adjustments2 <- melt(adjustments, measure.vars=2:21)
names(adjustments2) <- c("FirstName", "Surname", "Adjustment")
# Clean full names, attach estimated number (by pure independent odds)
longNames$cleanName <- paste(tolower(longNames$FirstName), tolower(longNames$Surname), sep=" ")
adjustments2$cleanName <- paste(tolower(adjustments2$FirstName), tolower(adjustments2$Surname), sep=" ")
adjustments2$Estimate <- longNames[match(adjustments2$cleanName, longNames$cleanName),]$EstimatedNumber
adjustedTotals <- adjustments2 %>% arrange(desc(Estimate))
adjustedTotals$finalEstimate <- adjustedTotals$Estimate * (1+ (adjustedTotals$Adjustment/100))
adjustedTotals <- adjustedTotals %>% arrange(desc(finalEstimate))
adjustedMatrix <- dcast(adjustedTotals, formula=FirstName~Surname)
longNames$finalEstimate <- adjustedTotals[match(longNames$cleanName, adjustedTotals$cleanName),]$finalEstimate
finalMatrix <- dcast(longNames, formula=FirstName~Surname)
tableNames <- row.names(nameMatches)[1:20]
finalMatrix2 <- finalMatrix[match(tableNames, finalMatrix$FirstName),]
advMatrixFinal <- finalMatrix2[,1:21]
#### Numbers to check in piece
sum(stockNames2$newPerct2013) # first names cover 80% of population
sum(stockNames2$newTotal) # first names cover 255 million people
grep("maria garcia", adjustments2$cleanName) # Maria Garcia (unadjusted) is the 354th most common name
grep("maria smith", adjustments2$cleanName) # Maria Smith (unadjusted) is the 74th most common name
surnames[grep("GEORGE", surnames$name)[1],]$perct2013 * stockNames2[grep("Kieron", stockNames2$name)[1],]$newPerct2013 * pop2013 # "Kieron George" odds
surnames[grep("LOTT", surnames$name)[1],]$count # Number of Americans with last name "Lott", should be 25118
#### Files to export
write.csv(topFirstName, "new-top-firstNames.csv")
write.csv(topSurnames, "new-top-surnames.csv")
write.csv(nameMatches2, "independent-name-combinations-by-pop.csv")
write.csv(advMatrixFinal, "adjusted-name-combinations-matrix.csv")
write.csv(adjustedTotals, "adjusted-name-combinations-list.csv")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.