Yleisimmät nimet 1900-luvulla

Lähde: Louhos-blogi.

Aloitetaan noutamalla ja esikäsittelemällä nimitilastot:

# Use the XML library to load the name data from Vaestorekisterikeskus
library(XML)
# Download one top-names page per index (the "vuosi" query parameter runs
# from 1 to 14) and collect the name tables into a list keyed by the label
# parsed from each page.
nametables <- list()
for (year in seq_len(14)) {
  theurl <- paste0(
    "http://verkkopalvelu.vrk.fi/Nimipalvelu/nimipalvelu_etunimitop.asp?vuosi=",
    year,
    "&L=1"
  )
  tables <- readHTMLTable(theurl)

  # The label is the third space-separated word of column V2 in the second
  # table on the page.
  year.val <- unlist(
    strsplit(as.character(as.vector(tables[[2]]$V2)), split = " ")
  )[3]

  # Rows 2-11 and columns 1-5 of the third table are kept; row 1 is skipped.
  nametables[[year]] <- tables[[3]][2:11, 1:5]
  names(nametables)[year] <- year.val
}

library(ggplot2)
library(gdata)

# Melt to a data frame
library(reshape)
# Long format: V1 and V4 are kept as id columns, V2 and V5 become the
# "variable"/"value" pair, and the list element name ends up in column L1.
temp <- melt(
  nametables,
  id.vars = c("V1", "V4"),
  measure.vars = c("V2", "V5")
)

# Counts to numeric. Each distinct L1 label is mapped, in order of first
# appearance, to a decade start year; the last two labels both map to 2010,
# which merges years 2010 and 2011.
temp$value <- as.numeric(as.vector(temp$value))
temp$year <- c(seq(1890, 2010, by = 10), 2010)[
  match(temp$L1, unique(temp$L1))
]

# Split by measure column: V2 rows pair with the names in column 1 (men),
# V5 rows with the names in column 2 (women).
men <- temp[which(temp$variable == "V2"), c(1, 4, 5, 6)]
women <- temp[which(temp$variable == "V5"), c(2, 4, 5, 6)]
names(women) <- c("Nimi", "Maara", "Vuosi.char", "Vuosi")
names(men) <- names(women)

# Total count per name over all years (columns Group.1 and x).
men.sum <- aggregate(men$Maara, by = list(men$Nimi), FUN = sum)
women.sum <- aggregate(women$Maara, by = list(women$Nimi), FUN = sum)

# Keep only the rows of the ten names with the largest totals, then drop
# the factor levels that are no longer used.
men.top10 <- men[
  men$Nimi %in% men.sum$Group.1[order(men.sum$x, decreasing = TRUE)][1:10],
]
men.top10$Nimi <- drop.levels(men.top10$Nimi)
women.top10 <- women[
  women$Nimi %in%
    women.sum$Group.1[order(women.sum$x, decreasing = TRUE)][1:10],
]
women.top10$Nimi <- drop.levels(women.top10$Nimi)

# Add explicit zero-count rows for names that are missing from some years
# (run for either men.top10 or women.top10).
#
# Every name that has no row for a given year gets one appended with
# Maara = 0 and Vuosi.char = NA, so that each (name, year) pair is present.
#
# Args:
#   dat: data frame with exactly the columns Nimi, Maara, Vuosi.char and
#        Vuosi. Nimi may be a factor (all of its levels are padded, unused
#        ones included) or a character vector (its distinct non-missing
#        values are padded).
# Returns:
#   dat with the zero-count rows appended after the original rows, year by
#   year in order of first appearance of each year in dat$Vuosi.
add.zeros <- function(dat) {
  # levels() is NULL for a character column, so fall back to the distinct
  # names; otherwise nothing would be padded for non-factor input.
  Names <- if (is.factor(dat$Nimi)) {
    levels(dat$Nimi)
  } else {
    unique(as.character(dat$Nimi[!is.na(dat$Nimi)]))
  }
  Years <- unique(dat$Vuosi)

  # One small data frame of missing names per year (zero rows when the year
  # is already complete).
  padding <- lapply(Years, function(y) {
    absent <- Names[!(Names %in% dat$Nimi[dat$Vuosi == y])]
    data.frame(
      Nimi = absent,
      Maara = rep(0, length(absent)),
      Vuosi.char = rep(NA, length(absent)),
      Vuosi = rep(y, length(absent)),
      stringsAsFactors = FALSE
    )
  })

  # Bind everything in a single call instead of growing dat inside a loop.
  do.call(rbind, c(list(dat), padding))
}

Visualisointi:

# Stacked area chart of the women's top-10 names; the data is padded with
# zero counts first so every name has a value for every year.
p <- ggplot(add.zeros(women.top10), aes(x = Vuosi, y = Maara, fill = Nimi)) +
  geom_area(position = "Stack", colour = "black") +
  ylab("Maara") +
  ggtitle("Tyttojen nimet")
print(p)

# Men's top-10 names drawn with geom_density instead, weighting each year
# by its count. Looks nice, but the y-axis isn't exactly intuitive...
p <- ggplot(
  men.top10,
  aes(x = Vuosi, weight = Maara, y = ..count.., fill = Nimi)
) +
  geom_density(position = "stack") +
  ylab("Maara") +
  ggtitle("Poikien nimet")
print(p)


ropengov/sorvi documentation built on Oct. 24, 2023, 7:24 p.m.