# 01_92 Exploratory for data cleanup
# Imports -----------------------------------------------------------------
here::i_am("analysis/01_92_exploratory-for-data-cleaning.R")
library(here)
library(tidyverse)
# Compare pids in "complete only" vs "incomplete included" ----------------
# data_all <- readr::read_rds(here("analysis/data/derived_data/data-newnames.rds"))
data_completeonly <- readr::read_rds(here("analysis/data/derived_data/data-newnames-completeonly.rds"))
# inc_ids<- data_completeonly$pid
dictionary <- read_rds("analysis/data/derived_data/clean-question-list.rds")
# Find out why the "incompletes" were removed OR just remove them and move on for speed
# data_incomplete <- data_all |> filter(pid %in% inc_ids)
# They stopped before finishing. Remove them
# Remove bad cases --------------------------------------------------------
data_newnames <- read_rds("analysis/data/derived_data/data-newnames-completeonly.rds")
# find people who cheated the system
## what did people put for the free-answer questions -----------------------
data_newnames |> select(car_year) |> unique() |> View()
data_newnames |> filter(car_year == "mbnvnv") |> View()
data_newnames |>
mutate(car_make = str_to_lower(car_make)) |>
select(car_make) |>
unique()
# ggplot(aes(car_make)) + geom_bar()
ggplot(data_newnames, aes(car_make)) + geom_bar()
data_newnames |> select(car_make, car_model) |> View()
# check incomes against each other ----------------------------------------
# check age/year born against each other ----------------------------------
# check genders against each other ----------------------------------------
bad_gen <- data_newnames |> select(pid, gender, gender_survey_monk) |>
filter(!(gender %in% "Prefer not to say")) |>
filter(gender != gender_survey_monk) |>
pull(pid)
x <- data_newnames |>
filter(pid %in% bad_gen)
data_newnames
data_newnames |> filter(!is.na(gender_self_describe)) |> View()
# a person only put the number 3 in for every possible place, put disagree for every prompt (pid = 11598634657)
# select ppl who put the same answer for every `prompt_` -----
col_comb <- data_newnames %>% select(starts_with("prompt")) %>% colnames() %>% combn(2, simplify = FALSE)
# list of true/false: is this
prompt_cheat_ids <-
# map_dfr(prompt_comb, ~ list(pid = two_case[["pid"]], compair = two_case[.x[1]] == two_case[.x[2]])) %>%
map_dfr(col_comb, ~ list(pid = data_newnames[["pid"]], compair = data_newnames[.x[1]] == data_newnames[.x[2]])) %>%
# map_dfr(col_comb, ~ list(pid = data_newnames[["pid"]], compair = data_newnames[.x[1]] )) %>%
# map_dfr(col_comb, ~ list(pid = data_newnames[["pid"]], compair = str_detect(data_newnames[.x[1]], data_newnames[.x[2]]))) %>%
group_by(pid) %>%
summarise(compare_all = sum(compair)) %>%
mutate(allsame = compare_all == 15) %>%
filter(allsame) %>%
pull(pid)
data_short <- data_newnames |>
select(-collectorid, -startdate, -enddate, -custom_data_1, -collector_type_source, -device, -contains("_mode_"),
-region_survey_monk, -united_states_region_survey_monk)
prompt_cheat_data <- data_short %>% filter(pid %in% prompt_cheat_ids) %>% slice(19:length(.))
# Getting the string distances to attempt to find duplicates -----
# try with this list of makes/models from Hadley
# devtools::install_github("hadley/fueleconomy")
# library(tidystringdist)
#
# veh_makelist <- fueleconomy::common %>% pull(make) %>% unique()
#
# tidy_comb(data_newnames, veh_makelist, car_make)
#
# I could combine a bunch of columns into a single string first...
# library(tidystringdist)
library(stringdist)
# try with this list of makes/models from Hadley
# devtools::install_github("hadley/fueleconomy")
# library(tidystringdist)
#
veh_makelist <- fueleconomy::common %>% pull(make) %>% unique() %>% str_to_upper()
# code from https://amunategui.github.io/stringdist/
unq_make <- data_newnames %>%
select(car_make) %>%
mutate(car_make = str_to_upper(car_make) %>% str_trim(side = "both")) %>%
filter(!is.na(car_make)) %>%
pull(car_make) %>%
unique()
distmake <- stringdistmatrix(unq_make, unq_make, method = "jw")
# distmake <- stringdistmatrix(unq_make, veh_makelist, method = "jw")
rownames(distmake) <- unq_make
colnames(distmake) <- veh_makelist
hc <- hclust(as.dist(distmake))
plot(hc)
rect.hclust(hc,k = 50)
dfClust <- data.frame(unq_make, cutree(hc, k = 50))
names(dfClust) <- c('modelname','cluster')
print(paste('Average number of models per cluster:', mean(table(dfClust$cluster))))
# order the data by cluster size
t <- table(dfClust$cluster)
t <- cbind(t,t/length(dfClust$cluster))
t <- t[order(t[,2], decreasing=TRUE),]
p <- data.frame(factorName=rownames(t), binCount=t[,1], percentFound=t[,2])
dfClust <- merge(x=dfClust, y=p, by.x = 'cluster', by.y='factorName', all.x=T)
dfClust <- dfClust[rev(order(dfClust$binCount)),]
names(dfClust) <- c('cluster','modelname')
head(dfClust[c('cluster','modelname')],50)
# -----
veh_strdist <- data_newnames %>%
mutate(car_make = str_to_upper(car_make) %>% str_trim(side = "both")) %>%
tidy_comb_all(car_make) %>%
tidy_stringdist(method = "jw")
vh_dist <- as.dist(veh_strdist)
veh_repl <- veh_strdist %>%
# base the replacement on the number of occurrences
left_join(make_freq, by = c("V1" = "car_make")) %>%
rename(n_cases_v1 = n_cases) %>%
left_join(make_freq, by = c("V2" = "car_make")) %>%
rename(n_cases_v2 = n_cases) %>%
# remove honda/hyundai for now
filter((str_detect(V1, "^H") & str_detect(V2, "^H"))) %>%
#BEFORE using soundex, gotta take care of honda/hyundai
mutate(repl_v1 = if_else(
condition = jw > 0.16 & !soundex,
true = if_else(n_cases_v1 >= n_cases_v2,
true = V1,
false = V2),
false = V1
)) %>%
filter(!soundex)
mutate(soundex_repl = if_else(soundex == 0, V1, V2))
# maybe base the replacement on the number of occurrences
make_freq <- data_newnames %>%
select(car_make) %>%
mutate(car_make = car_make %>%
str_trim() %>%
str_to_upper()) %>%
group_by(car_make) %>%
summarize(n_cases = n())
# -----
data_newnames %>%
mutate(same = str_detect(prompt_drive_freedom, prompt_drive_relax)) %>%
mutate(if_else(
condition = str_detect(prompt_drive_freedom, prompt_drive_relax),
false = FALSE,
true =
))
select(pid, prompt_drive_freedom, prompt_drive_relax, same) %>%
filter(!same) %>%
# select(pid, starts_with("prompt"), same) %>%
View()
head(data_newnames$prompt_drive_enjoy)
head(data_newnames$prompt_drive_freedom)
str_detect(data_newnames$prompt_drive_enjoy, data_newnames$prompt_drive_freedom)
# To get duplicate cases, try selecting only character columns? -----
# Numeric columns contain numeric responses? -----
# check ZIPS and cities against each other --------------------------------
# Check ppl who put same prompt response for all --------------------------
# Build final dataset -----------------------------------------------------
data_rmcases <- data_newnames |>
# obvious ones
filter(car_make == "jhgfjhgfd")
# Write cleaned data ------------------------------------------------------
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.