R/clean_bind_row.R

#' Combine Rows Into One
#'
#' Combine rows into one which are actually sole record.
#'
#' @author Yiying Wang, \email{wangy@@aetna.com}
#' @param df You can assign a data.frame yourself. If null, you will manually
#' select a spreadsheet and extract the data.
#' @param output.format xlsx or xls. Default xlsx.
#'
#' @return Nothing
#' @export
#' @importFrom gWidgets2 gfile gwindow gvbox ggroup gframe gradio gbutton gaction
#' @importFrom gWidgets2 addHandlerChanged
#' @importFrom compiler cmpfun
#' @importFrom readr read_csv
#' @importFrom stringi stri_conv
#' @importFrom stringr str_detect
#' @importFrom XLConnect loadWorkbook createSheet writeWorksheet saveWorkbook readWorksheetFromFile getSheets
#' @importFrom openxlsx read.xlsx write.xlsx createStyle
#'
#' @examples
#' \dontrun{
#' bindRows()
#' }
bindRows <- function(df = NULL, output.format = 'xlsx'){
    # Sys.setlocale("LC_CTYPE","Chs")  # Very important!
    # sapply(c("dplyr","openxlsx","gWidgets2","gWidgets2RGtk2","compiler"),
    #       require,character.only=TRUE)
    # options(guiToolkit="RGtk2")
    # addRtoolsPath()
    #----------read files--------------------
    if (is.null(df)){
        if (Sys.info()['sysname'] == "Windows"){
            raw.file <- invisible(choose.files(
                paste0(getOption("init.dir"), "*.*"), multi=FALSE,
                caption="Select the raw data file...",
                filters=rbind(matrix(c("Excel files (*.xls?)", "*.xls?;*.xls",
                                       "csv files (*.csv)", "*.csv"), 
                                     byrow=TRUE, nrow=2),
                              Filters["All",])))
        }else{
            raw.file <- gfile("Select the raw data file...",type='open',
                          initial.dir=getOption("init.dir"),
                          filter=list('xls* files'=list(
                              patterns=c('*.xls?','*.xls')),
                              'csv files'=list(patterns=c('.csv'))),
                          multi=FALSE)
            raw.file <- stri_conv(raw.file, "CP936", "UTF-8")
        }
        
        if (!file.exists(raw.file)) stop("No file designated!")
        if (str_detect(raw.file, "[Xx][Ll][Ss]$|[Xx][Ll][Ss][XxMmBb]$")) {
            sheets <- enc2native(getSheets(loadWorkbook(raw.file)))
            dims <- as.data.frame(sapply(sheets, cmpfun(function(x) {
                d <- try(dim(readWorksheetFromFile(raw.file,x)),
                         silent=TRUE)
                if (is.null(d)) d <- c(0,0) else d
            })))
            dims <- sapply(names(dims), function(x) {
                paste(c(x, paste(dims[,x], collapse=" ")), collapse=" ")
                })
            funSelSht <- function(file=raw.file, sheets=sheets, dims=dims){
                window <- gwindow("Select the sheet", width=200, height=200)
                box <- gvbox(cont=window)
                addHandlerChanged(window, handler=function(...){
                    gtkMainQuit()
                })
                gg1 <- ggroup(cont=box)
                gg2 <- ggroup(cont=box, horizontal = TRUE)
                box1 <- gvbox(cont=gg1)
                frm1 <- gframe("Sheet Name (nRow nCol):", cont=box1)
                chkmap <- gradio(items=dims, selected=1, index=TRUE, cont=frm1)
                box21 <- gvbox(cont=gg2)
				box22 <- gvbox(cont=gg2)
                actOK <- gaction("  OK  ", "OK",
                                 handler=function(h,...){
                    dsheet <<- enc2native(svalue(chkmap))
                    output <<- TRUE
                    dispose(window)
                })
                buttonOK <- gbutton(action=actOK, cont=box21)
                actCancel <- gaction("Cancel", "Cancel",
                                     handler=function(h,...){
                    dsheet <<- NULL
                    output <<- FALSE
                    dispose(window)
                })
                buttonCancel <- gbutton(action=actCancel, cont=box22)
                gtkMain()
                return(list(output, dsheet))
            }
            sheet.sel <- funSelSht(raw.file, sheets, dims)
            if (!sheet.sel[[1]]) stop("You cancelled actions.")
            sheet.sel <- which(sheets==names(dims)[dims==sheet.sel[[2]]])
            dta <- readWorksheetFromFile(raw.file, sheet.sel)
        }
        if (str_detect(raw.file, "\\.[Cc][Ss][Vv]$"))
            dta <- read_csv(raw.file)
    }else{
        dta <- df
    }
    #ID var----------
    vars <- names(dta)
    var <- .funSelVar(vars)
    if (!var[[1]]) stop("You did not select any identifier!")
    else var <- var[[2]]
    #---------Combine rows-----------------
    dt <- split(dta, dta[,var])
    sn <- as.matrix(sapply(dt, function(x) nrow(x)))
    snNoBind <- row.names(sn)[sn[, 1]==1]
    snBind <- row.names(sn)[sn[, 1]>1]

    .funBindMode <- function(col.name, dat){
        d <- dat[!is.na(dat[, col.name]), col.name]
        if (length(d)==0){
            return(FALSE)
        }else{
            coerceNA <- try(as.numeric(d[! str_detect(enc2native(
                d, "[\u62D2\u5F03\u9634\u9633\\<\\>\\+\\-]"))]), silent=TRUE)
            if (length(! str_detect(d, enc2native("[\u62D2\u5F03\u9634\u9633\\<\\>\\+\\-]")))==0){
                p <- 0
            }else{
                p <- sum(is.na(coerceNA)) / length(! str_detect(
                    d, enc2native("[\u62D2\u5F03\u9634\u9633\\<\\>\\+\\-]")))
            }
            output <- (p>=0.5 || str_detect(tolower(col.name), enc2native("\u603B\u7ED3|\u5C0F\u7ED3?"))) &&
                ! str_detect(tolower(col.name), enc2native(paste0(
                    "name|gender|sex|company|\u59D3\u540D|\u5355\u4F4D|\u90E8\u95E8|\u6027\u522B|",
                    "department|\u8BC1$|\u53F7$|\u5EA6$|\u8054\u7CFB|\u4E2D\u5FC3$|\u7C7B\u578B$|provider|",
                    "^\u5C3F|\u5EA6$|\u7EA7^|\u91CF$|\u6570$|\u503C$"))
                    )
            return(output)
        }
    }
    bind.all <- sapply(vars, cmpfun(.funBindMode), dat=dta)

    funBindRows <- function(x, bind.all){
        if (is.null(dim(x))){
            v <- x
        }else{
            v <- do.call('paste', c(as.data.frame(t(x)), sep="%&%"))
            v <- str_replace_all(v, "^%&%|NA%&%|%&%NA|%&%$","")
            v[bind.all] <- str_replace_all(v[bind.all], "%&%", ";")
            v[!bind.all] <- str_replace_all(v[!bind.all], "^(.*?)%&%.*$", "\\1")
            v[str_detect(v, "^$|^NA$")] <- NA
        }
        return(v)
    }
    after.bind <- as.data.frame(t(sapply(dt[snBind], cmpfun(funBindRows),
                                         bind.all=bind.all)))
    if (ncol(after.bind)>0) {
        names(after.bind) <- vars
        output <- rbind(do.call('rbind', dt[snNoBind]), after.bind)
    }else{
        output <- dt[[snNoBind]]
    }

    raw.path <- str_replace_all(raw.file[1], "^(.+\\\\)[^\\]+\\.[Xx][Ll][Ss].{0,1}$",
                     "\\1")
    if (! str_detect(raw.path, ".+\\\\$")) raw.path <- paste0(raw.path,"\\\\")
    if (output.format=='csv'){
        write.csv(output,paste0(raw.path,"bind_rows.csv"), na="")
    }else if (output.format=='xlsx'){
        write.xlsx(output,file=paste0(raw.path,"bind_rows.xlsx"),
                     sheetName="Sheet1",
                     headerStyle=createStyle(
                         fgFill="#E8E8E8",
                         fontName='Arial Narrow')
                     )
    }
    return(paste0("The cleaned dataset 'bind_rows.", output.format,
                  "' is in the folder ", raw.path))
}
madlogos/aseshms documentation built on May 21, 2019, 11:03 a.m.