# TEMP2.R

library(dplyr)
# Load data ---------------------------------------------------------------
# List the contents of each raw-data folder before reading from it, then
# load the two inputs (file names suggest an NWEA reading export and a
# student list).
list.files(path = "./raw-data/raw_nwea")
nwea <- load_csv("./raw-data/raw_nwea/NWEA Reading Fall SY 2013-14.csv")
list.files(path = "./raw-data")
eto <- load_csv("./raw-data/students_list.csv")


# Create custom id --------------------------------------------------------
# The matching variables must have the same names in both datasets, so the
# NWEA name columns are renamed to fname / lname first.
nwea <- rename(nwea, fname = StudentFirstName, lname = StudentLastName)

# NWEA side: my_id comes from create_id() on the name columns, raw_id is the
# plain concatenation of first and last name. Keep one row per distinct
# (my_id, raw_id) pair and add a constant marker column.
nwea$my_id <- create_id(nwea, var = c("fname", "lname"))
nwea$raw_id <- paste0(nwea$fname, nwea$lname)
xx <- unique(nwea[, c("my_id", "raw_id")])
xx$var_xx <- "var_xx"
rm(nwea)

# ETO side: same treatment, with its own constant marker column.
eto$my_id <- create_id(eto, var = c("fname", "lname"))
eto$raw_id <- paste0(eto$fname, eto$lname)
yy <- unique(eto[, c("my_id", "raw_id")])
yy$var_yy <- "var_yy"
rm(eto)

# Perfect match -----------------------------------------------------------
# First matching pass: keep the rows whose my_id appears in both lists.
# Both inputs carry a raw_id column, so the join returns them suffixed as
# raw_id.x (from xx) and raw_id.y (from yy).
matched <- inner_join(x = xx, y = yy, by = "my_id")
# Label the perfect matches. rep() sizes the label to the row count, so the
# assignment also works when there is no perfect match at all (a length-one
# value cannot be assigned to a zero-row base data.frame).
matched$match_status <- rep("perfect match", nrow(matched))


# Partial match -----------------------------------------------------------
# Rows of the first list whose my_id has no exact counterpart in the second.
# anti_join() returns only the columns of `x`, unsuffixed, so the column is
# still called `raw_id` here; the select() drops the var_xx marker.
todo <- anti_join(x = xx, y = yy, by = "my_id")
todo <- select(todo, my_id, raw_id)

# For every unmatched my_id, look for an approximate match among yy$my_id.
# vapply() always returns a character vector with one entry per row: the
# first approximate hit, or NA when there is none. (sapply() would return a
# list or a matrix as soon as one id has zero or several hits.)
todo$partials <- vapply(
  todo$my_id,
  FUN = function(id) {
    # agrep() needs a non-empty pattern; a missing or empty id is treated
    # as having no partial match.
    if (is.na(id) || !nzchar(id)) {
      return(NA_character_)
    }
    hits <- agrep(id, yy$my_id, max.distance = 0.1, value = TRUE)
    if (length(hits) > 0) hits[[1]] else NA_character_
  },
  FUN.VALUE = character(1),
  USE.NAMES = FALSE
)

# Bring the second list's columns in through the approximate key. Rows
# without a partial match are excluded first so that an NA key can never be
# joined to a missing my_id in yy.
partial_matched <- inner_join(
  x = todo[!is.na(todo$partials), ],
  y = yy,
  by = c("partials" = "my_id")
)

# Label these rows as partial matches (rep() keeps the assignment valid when
# there are zero rows) and drop the helper key.
partial_matched$match_status <- rep("Partial", nrow(partial_matched))
partial_matched["partials"] <- NULL

# Unmatched ---------------------------------------------------------------
# Rows that still have no match: their partials entry is NA.
unmatched <- todo[is.na(todo$partials), ]
# rep() sizes the label to the row count, so the assignment also works when
# every row was matched (a length-one value cannot be assigned to a zero-row
# base data.frame).
unmatched$match_status <- rep("unmatched", nrow(unmatched))
unmatched["partials"] <- NULL
# Use the same column name as the joined tables so bind_rows() lines the
# first-list raw id up in a single column.
colnames(unmatched)[colnames(unmatched) == "raw_id"] <- "raw_id.x"

# Stack perfect matches, partial matches and unmatched rows into one table;
# columns missing from a piece are filled with NA by bind_rows().
out <- dplyr::bind_rows(matched, partial_matched, unmatched)
# thelayc/laycUtils documentation built on May 31, 2019, 9:17 a.m.