#' Combine Similar Columns Into One
#'
#' Combine similar columns into one column within one spreadsheet.
#'
#' @author Yiying Wang, \email{wangy@@aetna.com}
#' @param mapping.file Name of the field name mapping dictionary.
#' @param toolkit Directory of the dictionary.
#' @param output.format xls or xlsx. Default xlsx.
#'
#' @return A list. `Output Results` shows the path where bind_output.xlsx is; `Dropped
#' columns` shows the variables dropped during the combination.
#' @export
#' @importFrom gWidgets2 gfile
#' @importFrom compiler cmpfun
#' @importFrom readr read_csv
#' @importFrom stringi stri_conv
#' @importFrom stringr str_detect
#' @importFrom openxlsx getSheetNames readWorkbook read.xlsx write.xlsx createStyle
#' @importFrom XLConnect readWorksheetFromFile
#'
#' @examples
#' \dontrun{
#' bindColumns()
#' }
bindColumns <- function(mapping.file=paste0(toolkit, 'BindFieldNames.xlsx'),
toolkit=getOption("toolkit.dir"),
output.format='xlsx'){
# ------------Env codes no longer needed---------------
# Sys.setlocale("LC_CTYPE", "Chs") # Very important!
# addRtoolsPath()
#------Field name dictionary-------
sources <- enc2native(getSheetNames(mapping.file))
ynAssignMap <- .tcltkSelectItem(sources, "Select the mapping source.")
if (!ynAssignMap[[1]]) {
stop("No source designated!")
}else {
#map.source <- iconv(ynAssignMap[[2]], "UTF-8", "CP936")
map.source <- enc2native(ynAssignMap[[2]])
map <- readWorkbook(mapping.file, map.source)
map <- map[order(map$InDb, map$InFile), ]
map[, 1] <- toupper(map[, 1])
map[, 2] <- toupper(map[, 2])
# read files--------------------
if (Sys.info()['sysname'] == "Windows"){
raw.file <- invisible(choose.files(
paste0(getOption("init.dir"), "*.*"), multi=FALSE,
caption="Select the raw data file ...",
filters=rbind(matrix(c("Excel files (*.xls?)", "*.xls?;*.xls",
"csv files (*.csv)", "*.csv"),
byrow=TRUE, nrow=2),
Filters["All",])))
}else{
raw.file <- gfile("Select the raw data file ...", type='open',
initial.dir=getOption("init.dir"),
filter=list('xls* files'=list(
patterns=c('*.xls?', '*.xls')),
'csv files'=list(patterns=c('.csv'))),
multi=FALSE)
raw.file <- stri_conv(raw.file, "CP936", "UTF-8")
}
if (!file.exists(raw.file)) stop("No file designated!")
if (str_detect(raw.file, "[Xx][Ll][Ss][XxMmBb]$")){
dta <- read.xlsx(raw.file, 1, detectDates=TRUE) # Sheet1
}else if (str_detect(raw.file, "[Xx][Ll][Ss]$")){
dta <- readWorksheetFromFile(raw.file, 1) # Sheet1
}
input.cols <- names(dta)
if (str_detect(raw.file, "csv$"))
dta <- read_csv(raw.file)
#----------bind columns, need to recode-----------------------
maps <- split(map, map$InDb)
bind.all <- unlist(lapply(maps, function(x){
all(x$BindAll) | !(any(x$BindAll))
}))
if (!(all(bind.all)))
stop(cat("BindAll not consistent:\n", names(bind.all)[!bind.all]))
origColNames <- data.frame(InFile=names(dta))
origColNames <- merge(origColNames, map, by="InFile",
all.x=TRUE, sort=FALSE)
origColNames$InDb[is.na(origColNames$InDb)] <-
origColNames$InFile[is.na(origColNames$InDb)]
origColNames <- c("FILE", "SHEET",
as.character(
origColNames$InDb[!duplicated(origColNames$InDb)]))
dta <- dta[,! names(dta) %in% map$InFile[is.na(map$InDb)]]
funBindList <- function(x, dat){
# loop call .funBindCols to bind columns by grouped columns
# Args:
# x: mapping df
# dat: data.frame
# Return:
# Binded df
if (any(names(dat) %in% c(x$InFile, x$InDb))){
.funBindCols(dat[,intersect(names(dat), c(x$InFile, x$InDb))],
bind.all=x$BindAll[1])
}
}
listbindVal <- lapply(maps, cmpfun(funBindList), dat=dta)
class.vars <- sapply(listbindVal, function(x) class(x))
listbindVal[class.vars=='NULL'] <- NULL
df <- as.data.frame(listbindVal, stringsAsFactors=FALSE)
dta <- cbind(dta[ ,!names(dta) %in% c(map$InDb, map$InFile)], df)
dta <- dta[, intersect(origColNames, names(dta))]
# for (i in seq_len(nrow(map))){
# if (is.na(map$InDb[i])){
# dta[,c(map$InFile[i])] <- NULL #drop
# }else if(map$InFile[i] != map$InDb[i]){
# if (map$InFile[i] %in% names(dta)){
# if (!map$InDb[i] %in% names(dta)){
# names(dta) <- sub(map$InFile[i],map$InDb[i],names(dta))
# }else{
# if (map$BindAll[i]){ #BindAll=T bind all contents
# dta[,map$InDb[i]] <- paste(dta[,map$InDb[i]],
# dta[,map$InFile[i]],sep=";")
# dta[,map$InDb[i]] <-
# sub("NA;|;NA$|^[[:space:]]*NA[[:space:]]*$","",
# dta[,map$InDb[i]])
# }else if (!map$BindAll[i]){ #BindAll=F bind vacancies
# if (!map$InDb[i] %in% names(dta)) dta[,map$InDb[i]] <- NA
# dta[is.na(dta[,map$InDb[i]]),map$InDb[i]] <-
# dta[is.na(dta[,map$InDb[i]]),map$InFile[i]]
# }
# dta[,map$InFile[i]] <- NULL
# }
# }
# }
# }
if (length(raw.file) > 1)
message("You selected multiple files, we will pick up the first one's directory.")
raw.path <- str_replace_all(raw.file[1], "\\\\", "/")
raw.path <- str_replace(raw.path, "^(.+/)[^/]+$","\\1")
if (! str_detect(raw.path, ".+\\\\$|.+/$")) raw.path <- paste0(raw.path, "/")
if (output.format == 'csv'){
write.csv(dta, paste0(raw.path, "bind_output.csv"), na="")
}else if (output.format=='xlsx'){
write.xlsx(dta, file=paste0(raw.path, "bind_output.xlsx"),
sheetName="Sheet1",
headerStyle=createStyle(fgFill="#E8E8E8",
fontName='Arial Narrow'),
withFilter=TRUE)
}
drop.vars <- data.frame(
idx=which(!input.cols %in% c("FILE", "SHEET", map$InDb, map$InFile)),
var=input.cols[! input.cols %in% c("FILE", "SHEET", map$InDb,
map$InFile)]
)
return(list("Output Result"=paste0("The cleaned dataset 'bind_output.",
output.format, "' is in the folder ",
raw.path),
"Dropped columns"=drop.vars))
}
}
#' @importFrom stringr str_detect str_replace_all
.funBindCols <- function(x, bind.all=FALSE){
# Base function to bind grouped columns (like cbind) into one
# Args:
# x: object to combine
# bind.all: logical. If TRUE, bind all contents; if FALSE, extract the first
# non-empty content
# Return:
# binded vector based on x
if (is.null(dim(x))){
v <- x
}else{
x <- as.data.frame(sapply(as.list(x), trimws), stringsAsFactors=FALSE)
x[is.na(x)] <- ""
v <- do.call('paste', c(x, sep="%&%"))
v <- str_replace_all(v, "^(%&%)+|NA(%&%)+|(%&%)+NA|(%&%)+$", "")
if (bind.all){
v <- str_replace_all(v, "(%&%)+", ";")
}else{
v <- str_replace_all(v, "^(.*?)%&%.*$", "\\1")
}
v[str_detect(v, "^$|^NA$")] <- NA
}
return(v)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.