library(stringr)
library(plyr)
library(dplyr)
library(magrittr)
# Read the raw data file; as.is = TRUE keeps text columns as character
# vectors instead of converting them to factors.
dfhc <- read.csv(file = "rawdata/predict_hac_data_raw.csv", as.is = TRUE)
# Keep rows 1 through 406 only.
# NOTE(review): the row count is hard-coded -- confirm it still matches the
# raw file (a shorter file would yield all-NA padding rows here).
dfhc <- dfhc[seq_len(406), ]
# Show the column names for reference.
names(dfhc)
# Trial run of the cleaning functions on one free-text column: column 12
# (presumably the ages of males in the household -- verify against the
# column names), labelled with the values of column 1 so each entry can be
# traced back to its row.
ages <- setNames(dfhc[[12]], dfhc[[1]])
# Give empty responses an explicit placeholder; strsplit() would otherwise
# return a zero-length result for them.
ages <- replace(ages, ages == "", "blank")
# Normalize to lowercase before tokenizing.
ages_lower <- tolower(ages)
# Tokenize in one pass. The pattern splits on leading whitespace, on commas,
# on the word "and", and on trailing whitespace, swallowing the spaces around
# each separator. (An earlier two-step version trimmed first and then split
# on commas / "and"; this single pattern replaces it.)
# NOTE(review): `and?` also matches the standalone word "an", and an input
# that starts with whitespace produces an empty first token -- confirm both
# are acceptable.
ages_tokenized <- strsplit(
  ages_lower,
  split = "^\\s+|\\s*,\\s*|\\s*(\\band?\\b)\\s*|\\s+$"
)
# Tabulate every distinct token and list them from most to least frequent.
all_ages <- as.data.frame(table(unlist(ages_tokenized)))
all_ages %>% arrange(-Freq)
# Inspect the per-row token lists.
ages_tokenized
# Flatten the token lists into a long data frame: one row per token, with the
# name of the originating list element in PID and the raw token in ages_orig.
ages_df <- ldply(seq_along(ages_tokenized), function(idx) {
  data.frame(
    PID = names(ages_tokenized)[idx],
    ages_orig = ages_tokenized[[idx]],
    stringsAsFactors = FALSE
  )
})
# Standalone copy of the raw tokens.
x <- ages_df[["ages_orig"]]
#' Convert free-text age tokens to numeric ages in years
#'
#' All letters and spaces are stripped from each token and the remainder is
#' parsed as a number. Tokens containing the letter "m" (either case) are
#' treated as ages in months and divided by 12.
#'
#' @param x A character vector of age tokens; other atomic inputs are coerced
#'   with `as.character()`.
#' @return A numeric vector the same length as `x`, in years. Zero values,
#'   empty tokens, "-" and anything that cannot be parsed as a number become
#'   `NA` (unparseable leftovers still raise the usual `as.numeric()`
#'   coercion warning).
clean_ages <- function(x) {
  x <- as.character(x)
  # Flag month-valued tokens once. Case-insensitive so that upper-case input
  # such as "6 MONTHS" is not silently read as 6 years.
  is_months <- grepl("m", x, ignore.case = TRUE)
  # Strip all letters and spaces, leaving only the numeric part.
  digits <- gsub("[A-Za-z]| ", "", x)
  # Empty and "-" leftovers carry no age; blank them before parsing.
  digits[digits %in% c("", "-")] <- NA
  ages <- as.numeric(digits)
  # A zero age is treated as missing, however it was written ("0", "0.0").
  ages[!is.na(ages) & ages == 0] <- NA
  # Convert months to years numerically; going through a character vector
  # would round the result to 15 significant digits.
  ages[is_months] <- ages[is_months] / 12
  ages
}
# Add the cleaned numeric ages alongside the raw tokens.
ages_df[["ages_clean"]] <- clean_ages(ages_df[["ages_orig"]])
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.