# R/stats.R

# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# stats
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#' print out summary statistics about a data frame or other object, alias of \code{\link[Hmisc]{describe}}
#' @description print out summary statistics about a data frame or other object, alias of \code{\link[Hmisc]{describe}}
#' @param x a data frame, a vector, or anything else that can be converted into a data frame
#' @export
ez.describe = function(x){
    if (!is.data.frame(x)) {x = data.frame(x)}
    # flush the console, otherwise large output may not get printed
    # show(x)
    print(Hmisc::describe(x))
    flush.console()
    # print(summary(x))
    # flush.console()
    # cat('--------------------------------------------------------------------------------------------\n')
    # str(x)
    # flush.console()
}
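
# A minimal usage sketch (commented out so nothing runs at package load; assumes Hmisc is installed):
# ez.describe(mtcars)      # summary stats for every column
# ez.describe(rnorm(100))  # a vector gets coerced to a data frame first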

#' round2 always rounds half up (R's native round/sprintf may round .055 -> .05); vectorized
#' @description round2 always rounds half up (R's native round/sprintf may round .055 -> .05); vectorized
#' @export
round2 = function(x,n) {
    # https://stackoverflow.com/a/62546554/2292993
    # R's native round/sprintf may round .055 -> .05
    posneg = sign(x)
    z = abs(x)*10^n
    z = z + 0.5 + sqrt(.Machine$double.eps)
    z = trunc(as.numeric(as.character(z)))
    z = z/10^n
    z*posneg
}
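
# A quick sketch of the half-up behavior (commented out; note that because the sign is
# stripped and reapplied, negative halves round away from zero):
# round2(0.055, 2)         # 0.06, whereas base round(0.055, 2) may give 0.05
# round2(c(2.5, -2.5), 0)  # 3 -3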

p.apa = function(pvalue,prefix=0,pe=F,decimals=3){
    if (is.na(pvalue)) {return(NA_character_)}
    if (prefix==2) {
        if (pvalue<.0001) {
            pvalue = if (pe) sprintf("p = %.*e", decimals, pvalue) else "p < .0001"
        } else if (pvalue<.001) {
            pvalue = if (pe) sprintf("p = %.*e", decimals, pvalue) else "p < .001"
        } else if (pvalue<.005 | (pvalue>=.045 & pvalue<.055)) {
            pvalue = sprintf( "p = %s", gsub("^(\\s*[+|-]?)0\\.", "\\1.", sprintf('%.3f',round2(pvalue,3))) )
        } else if (pvalue<.01) {
            if (decimals==2) pvalue = sprintf("p = .01")
            if (decimals==3) pvalue = sprintf( "p = %s", gsub("^(\\s*[+|-]?)0\\.", "\\1.", sprintf('%.3f',round2(pvalue,3))) )
        } else {
            if (decimals==2) pvalue = sprintf( "p = %s", gsub("^(\\s*[+|-]?)0\\.", "\\1.", sprintf('%.2f',round2(pvalue,2))) )
            if (decimals==3) pvalue = sprintf( "p = %s", gsub("^(\\s*[+|-]?)0\\.", "\\1.", sprintf('%.3f',round2(pvalue,3))) )
        }
    } else if (prefix==1) {
        if (pvalue<.0001) {
            pvalue = if (pe) sprintf("= %.*e", decimals, pvalue) else "< .0001"
        } else if (pvalue<.001) {
            pvalue = if (pe) sprintf("= %.*e", decimals, pvalue) else "< .001"
        } else if (pvalue<.005 | (pvalue>=.045 & pvalue<.055)) {
            pvalue = sprintf( "= %s", gsub("^(\\s*[+|-]?)0\\.", "\\1.", sprintf('%.3f',round2(pvalue,3))) )
        } else if (pvalue<.01) {
            if (decimals==2) pvalue = sprintf("= .01")
            if (decimals==3) pvalue = sprintf( "= %s", gsub("^(\\s*[+|-]?)0\\.", "\\1.", sprintf('%.3f',round2(pvalue,3))) )
        } else {
            if (decimals==2) pvalue = sprintf( "= %s", gsub("^(\\s*[+|-]?)0\\.", "\\1.", sprintf('%.2f',round2(pvalue,2))) )
            if (decimals==3) pvalue = sprintf( "= %s", gsub("^(\\s*[+|-]?)0\\.", "\\1.", sprintf('%.3f',round2(pvalue,3))) )
        }
    } else if (prefix==0){
        if (pvalue<.0001) {
            pvalue = if (pe) sprintf("%.*e", decimals, pvalue) else "< .0001"
        } else if (pvalue<.001) {
            pvalue = if (pe) sprintf("%.*e", decimals, pvalue) else "< .001"
        } else if (pvalue<.005 | (pvalue>=.045 & pvalue<.055)) {
            pvalue = sprintf( "%s", gsub("^(\\s*[+|-]?)0\\.", "\\1.", sprintf('%.3f',round2(pvalue,3))) )
        } else if (pvalue<.01) {
            if (decimals==2) pvalue = sprintf(".01")
            if (decimals==3) pvalue = sprintf( "%s", gsub("^(\\s*[+|-]?)0\\.", "\\1.", sprintf('%.3f',round2(pvalue,3))) )
        } else {
            if (decimals==2) pvalue = sprintf( "%s", gsub("^(\\s*[+|-]?)0\\.", "\\1.", sprintf('%.2f',round2(pvalue,2))) )
            if (decimals==3) pvalue = sprintf( "%s", gsub("^(\\s*[+|-]?)0\\.", "\\1.", sprintf('%.3f',round2(pvalue,3))) )
        }
    } else if (prefix==-1){
        if (pvalue<=.0001) {
            pvalue = '****'
        } else if (pvalue<=.001) {
            pvalue = '***'
        } else if (pvalue<=.01) {
            pvalue = '**'
        } else if (pvalue<=.05) {
            pvalue = '*'
        } else {
            pvalue = 'ns'
        }
    } else if (prefix==9){
        if (pvalue<.001) {
            if (decimals==2) pvalue = sprintf("%.2e", pvalue)
            if (decimals==3) pvalue = sprintf("%.3e", pvalue)
        } else {
            pvalue = sprintf( "%s", gsub("^(\\s*[+|-]?)0\\.", "\\1.", as.character(pvalue)) )
        }
    }
    return(pvalue)
}
#' format p value according to apa for report
#' @description format p value according to apa for report
#' @param pvalue numeric vector
#' @param prefix -1,0,1,2,9
#' @param pe affects only p < .0001 or p < .001. if TRUE, formats as scientific notation like 3.14e-04; otherwise < .0001 or < .001
#' @param decimals 3 or 2 decimal places
#' @return character vector prefix -1 (****,***,**,*,ns); 0 (< .0001, < .001, .003, .2); 1 (< .0001, < .001, = .003, = .02); 2 (p < .0001, < .001, p = .003, p = .02); 9 (.2e if <.001, raw value--ignore pe=T/F)
#' @export
ez.p.apa = Vectorize(p.apa)
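
# A usage sketch (commented out); the expected outputs follow the rules documented above:
# ez.p.apa(c(0.00005, 0.0456, 0.23), prefix=2)
# # "p < .0001" "p = .046" "p = .230"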

#' compare two vectors, two dataframes
#' @description compare two vectors, two dataframes
#' @param value T/F, print out setdiff results
#' @note nrow() for data frame, length() for vector. union/intersect remove duplication
#' @export
ez.compare = function(lh,rh,value=FALSE,...) {
    if (is.data.frame(lh)) {
        len=nrow
        cat(sprintf('comparing nrow for two data frames (set ops use uniques): %s\t%s\n\n',deparse(substitute(lh)),deparse(substitute(rh)) ))
    } else {
        len=length
        cat(sprintf('comparing length for two vectors (set ops use uniques): %s\t%s\n\n',deparse(substitute(lh)),deparse(substitute(rh)) ))
    }

    # https://github.com/tidyverse/dplyr/issues/3238
    # union/intersect remove duplication
    cat( sprintf('\t\t\t\tUnion: %4.0f\n',len(dplyr::union(lh,rh,...))) )
    cat( sprintf('\t\tLH: %4.0f /%4.0f\t\t\t\tRH: %4.0f /%4.0f\n',len(lh),len(unique(lh)),len(rh),len(unique(rh))) )
    cat( sprintf('\t\t\t\tInter: %4.0f\n',len(dplyr::intersect(lh,rh,...))) )
    cat('\n')
    cat( sprintf('\t\tLH>: %4.0f\t\t\t\t<RH: %4.0f\n',len(dplyr::setdiff(lh,rh,...)),len(dplyr::setdiff(rh,lh,...))) )
    cat('set equal?\n')
    cat(dplyr::setequal(lh,rh,...))
    cat('\n\n')

    if (value) {
        cat('LH>: \n')
        dplyr::setdiff(lh,rh,...) %>% ez.vv(print2scr=FALSE) %>% cat('\n')
        cat('RH>: \n')
        dplyr::setdiff(rh,lh,...) %>% ez.vv(print2scr=FALSE) %>% cat('\n')
    }
    return(invisible(NULL))
}
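
# A usage sketch (commented out); works for two vectors or two data frames,
# and the set operations are unique-based:
# ez.compare(c('a','b','c'), c('b','c','d'))              # counts only
# ez.compare(c('a','b','c'), c('b','c','d'), value=TRUE)  # also print the set diffs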

#' setdiff2(x,y)=setdiff(y,x)
#' @description setdiff2(x,y)=setdiff(y,x)
#' @export
setdiff2 = function(x, y, ...) {
    return(dplyr::setdiff(y,x,...))
}
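
# A one-line sketch (commented out): the argument order is simply flipped:
# dplyr::setdiff(2:5, 1:3)  # 4 5
# setdiff2(1:3, 2:5)        # 4 5, identical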

#' view the overview of a data frame or similar object (like spss variable view, but with much more information)
#' @description vc (c(vv)), vi (view everything print out), vv (view format vector), vx (view excel), View (built-in).
#' @param df a data frame
#' @param temp when file is NULL, the name prefix for the temp excel file. If prefix not provided through this param, auto generate one
#' @param id a single col name in string or number (eg, 'age' or 3) that serves as a (potentially unique) id; duplicated rows will be checked against all columns except this one. If NULL, rownames() will be used automatically
#' @param file a file name, if NULL, a temp generated, will save more detailed variable information to an excel file
#' @param width controls if too many factor levels to print, eg 300. NULL=unlimited
#' @note when file=NULL (which stores to a temp file), debug=NULL, use getOption('debug') which is TRUE if not set up
#' \cr when file=NULL, debug provided, overwrites getOption('debug')
#' \cr when file provided, any debug is ignored
#' \cr Bottom line: file > param debug > option debug
#' \cr Updated: as of Thu, Nov 30 2017, no longer a wrapper of \code{\link[sjPlot]{view_df}}; can make the html bigger by opening it in an internet browser
#' @return returns a list $row, $col, $dat (input data frame), $pth (file path)
#' @export
ez.vx = function(df, temp=NULL, id=NULL, file=NULL, width=300, incomparables=FALSE, debug=NULL, ...){
    # if temped and not debug, just jump out of the function to save time
    if (is.null(file)) {
        debugMode = if (is.null(getOption('debug'))) TRUE else getOption('debug')
        # 'debug' passed to the function overrides getOption('debug')
        doDebug = if (is.null(debug)) debugMode else debug
        if (!doDebug) {
            return(invisible(NULL))
        }
    }

    # if there are duplicated col names, the main code below would crash for obscure reasons
    # duplicated row names are fine
    if ( sum(ez.duplicated(colnames(df),vec=TRUE,incomparables=incomparables,dim=1))>0 ) {
        stop(sprintf('I cannot proceed. Duplicated col names found: %s\n', colnames(df)[which(ez.duplicated(colnames(df),vec=TRUE,incomparables=incomparables,dim=1))] %>% toString))
    }

    # id string to tell user which col used as id
    idString = if (is.null(id)) 'rowname' else id

    temped=F
    if(is.null(file)){
        temped=T
        if (is.null(temp)) temp = paste0('View_ID_',idString)
        file=tempfile(pattern = paste0(temp, '_',ez.moment(),'_'), tmpdir = tempdir(), fileext = ".xlsx")
        on.exit(unlink(file))
    }

    # row summary
    r.rowname=rownames(df) %>% ez.num()
    if (is.null(id)) {
        idname=rownames(df) %>% ez.num()
    } else {
        idname=df[,id]
    }

    r.duplicated.idname=ez.duplicated(idname,vec=TRUE,incomparables=incomparables,dim=1,vecgroup=TRUE)
    r.duplicated.idname[which(!r.duplicated.idname)]=NA
    # check duplicated row except the idname column
    if (is.null(id)) {
        r.duplicated.content=ez.duplicated(df,vec=TRUE,incomparables=incomparables,dim=1,vecgroup=TRUE)
    } else {
        # https://github.com/tidyverse/dplyr/issues/2184
        # to avoid the bug, in case variable name id is the same as one of the column names
        idididid=id
        r.duplicated.content=ez.duplicated(dplyr::select(df,-one_of(idididid)),vec=TRUE,incomparables=incomparables,dim=1,vecgroup=TRUE)
    }
    r.duplicated.content[which(!r.duplicated.content)]=NA

    tmpMatrix = is.na(df)
    r.ncol = rep(ncol(tmpMatrix), nrow(tmpMatrix))
    r.missing = rowSums(tmpMatrix,na.rm=TRUE)

    results0=data.frame(rowname=r.rowname,id=idname,duplicated_id=r.duplicated.idname,
                        duplicated_content_except_id=r.duplicated.content,ncol=r.ncol,missing=r.missing)
    results0=dplyr::mutate(results0,missing_rate=missing/ncol)
    results0=ez.rncol(results0,c('id'=paste0('id_',idString)))
    results0=dplyr::mutate(results0,nonmissing=ncol-missing,nonmissing_rate=nonmissing/ncol)



    # col summary
    results = ez.header(variable=character(),class=character(),is_all_numeric_like=logical(),n=numeric(),missing=numeric(),unique_including_na=numeric(),
                        counts_levels_view1=character(),counts_levels_view2=character(),
                        mean=numeric(),sd=numeric(),min=numeric(),max=numeric(),sum=numeric())
    vars=colnames(df)
    allFactorUniqueValues=character()
    allFactorCounts=integer()
    for (var in vars) {
        v.variable=var
        v.class=class(df[[var]]) %>% toString(width=width)  # could have multiple classes
        v.is_all_numeric_like = all(ez.is.numeric.like(df[[var]]))
        v.n=length(df[[var]])
        v.missing=sum(is.na(df[[var]]))
        v.unique=length(unique(df[[var]]))
        # count everything
        freqtable=suppressWarnings(dplyr::count_(df,var))
        vallbl=sjmisc_get_labels(df[[var]],include.values='n',attr.only=T,include.non.labelled=F)
        if (!is.null(vallbl)){
            # ez.2label trick here, do not use the results from sjmisc_get_labels
            vallbl = ez.2label(freqtable[[1]])  # would be in the same order to freqtable
            vallbl = paste("[",vallbl,"]",sep="")
        } else {
            vallbl = rep("", nrow(freqtable))
        }

        v.levels1 = paste("(", paste(freqtable[[1]],vallbl,sep="",collapse=", "),")", " : ", "(", paste(freqtable[[2]],sep="",collapse=", "), ")", sep="") %>% toString(width=width)
        v.levels2 = paste("(",freqtable[[1]],vallbl,": ",freqtable[[2]],")",sep="",collapse=", ") %>% toString(width=width)

        allFactorUniqueValues=unique(c(allFactorUniqueValues,unique(freqtable[[1]]) %>% as.character()))
        allFactorCounts=c(allFactorCounts,freqtable[[2]])
        # calculable
        is.date <- function(x) inherits(x, 'Date')
        # not all NA
        if ( is.numeric(df[[var]]) & !all(is.na(df[[var]])) ) {
            v.mean=mean(df[[var]],na.rm=TRUE)
            v.sd=sd(df[[var]],na.rm=TRUE)
            v.min=min(df[[var]],na.rm=TRUE)
            v.max=max(df[[var]],na.rm=TRUE)
            v.sum=sum(df[[var]],na.rm=TRUE)
        } else if ( is.date(df[[var]]) & !all(is.na(df[[var]])) ) {
            # converted to numeric, to convert back to date: ez.date(ori='R')
            v.mean=mean(df[[var]],na.rm=TRUE)
            v.sd=sd(df[[var]],na.rm=TRUE)
            v.min=min(df[[var]],na.rm=TRUE)
            v.max=max(df[[var]],na.rm=TRUE)
            # sum not defined for "Date" objects
            v.sum=NA
        } else {
            v.mean=v.sd=v.min=v.max=v.sum=NA
        }
        results = ez.append(results,list(v.variable,v.class,v.is_all_numeric_like,v.n,v.missing,v.unique,v.levels1,v.levels2,v.mean,v.sd,v.min,v.max,v.sum),print2scr=FALSE)
    }
    v.duplicated.varname=ez.duplicated(colnames(df),vec=TRUE,incomparables=incomparables,dim=1,vecgroup=TRUE)
    v.duplicated.varname[which(!v.duplicated.varname)]=NA
    v.duplicated.content=ez.duplicated(df,vec=TRUE,incomparables=incomparables,dim=2,vecgroup=TRUE)
    v.duplicated.content[which(!v.duplicated.content)]=NA
    v.duplicated=data.frame(duplicated_varname=v.duplicated.varname,duplicated_content=v.duplicated.content)
    results=dplyr::bind_cols(results,v.duplicated) %>% ez.move('duplicated_varname, duplicated_content before n')
    results=dplyr::mutate(results,missing_rate=missing/n)
    results=ez.rncol(results,c('n'='nrow'))
    # no factors in the data frame!
    allFactorUniqueValues = if (length(allFactorUniqueValues)==0) NA else allFactorUniqueValues %>% toString(width=width)
    allFactorCounts = if (length(allFactorCounts)==0) NA else range(allFactorCounts,na.rm =T) %>% toString(width=width)
    results=dplyr::add_row(results,variable='Total',counts_levels_view1=allFactorUniqueValues,
                          counts_levels_view2=allFactorCounts)
    results=results %>% ez.move('counts_levels_view1 counts_levels_view2 after variable')
    results=dplyr::mutate(results,nonmissing=nrow-missing,nonmissing_rate=nonmissing/nrow)
    results=results %>% ez.move('missing_rate nonmissing nonmissing_rate before unique_including_na')
    # labels=attr(df,'variable.labels') # only works for the foreign package: 'variable.labels'
    # in the case of 'variable.labels', it contains all variables' labels,
    # even if some of them are not in the current df due to variable selection, eg, select()
    # So only retrieve existing cols in the current df
    labels = ez.label.get(df,colnames(df))
    # if no labels at all, NULL; otherwise a string vector
    # labels' length should match #variables selected
    if ((!is.null(labels)) & nrow(results)==length(labels)+1) {results['varlbl']=c(labels,''); results=results %>% ez.move('varlbl before class')}

    # add col number in excel style
    results['excelcol']=c(ez.xlcolconv(1:(nrow(results)-1)),NA_character_)
    results %<>% ez.move('excelcol first')

    ez.savexlist(list('row'=results0,'col'=results,'dat'=df),file=file,withFilter = TRUE,rowNames = FALSE, colNames = TRUE)

    # give some time to open the file and then on.exit will delete it
    # although OS will be able to auto clean temp files later on
    # tempdir() is where it is
    if (temped) {
        debugMode = if (is.null(getOption('debug'))) TRUE else getOption('debug')
        # 'debug' passed to the function overrides getOption('debug')
        doOpen = if (is.null(debug)) debugMode else debug
        if (doOpen) {
            ez.open(file)
            ez.sleep(6)
        }
    }
    return(invisible(list('row'=results0,'col'=results,'dat'=df,'pth'=file)))
}
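
# A usage sketch (commented out; opening Excel is a side effect in debug mode).
# The returned list is invisible, so capture it if needed:
# res = ez.vx(mtcars)                        # temp xlsx, auto-deleted
# res = ez.vx(mtcars, file='overview.xlsx')  # keep the file; debug ignored
# res$col                                    # per-column summary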

#' view df in excel (saved to a temp file, auto delete the temp file)
#' @description view df in excel (saved to a temp file, auto delete the temp file)
#' @param temp when file is NULL, the name prefix for the temp excel file. If prefix not provided through this param, auto generate one
#' @param label T/F. call ez.2label()
#' @return returns nothing
#' @note when debug is provided, it overwrites getOption('debug'). A trick: assign View=ez.xl, then you can use the GUI to click.
#' @export
ez.xl = function(df,temp=NULL,debug=NULL,label=TRUE) {
    debugMode = if (is.null(getOption('debug'))) TRUE else getOption('debug')
    # 'debug' passed to the function overrides getOption('debug')
    doOpen = if (is.null(debug)) debugMode else debug
    if (doOpen) {
        if (is.null(temp)) temp = 'Data_'
        file=tempfile(pattern = paste0(temp, '_',ez.moment(),'_'), tmpdir = tempdir(), fileext = ".xlsx")
        on.exit(unlink(file))
        if (label) df = ez.2label(df)
        ez.savexlist(list("Sheet1"=df),file=file,withFilter = TRUE,rowNames = FALSE, colNames = TRUE)
        ez.open(file)
        ez.sleep(6)
    }

    return(invisible(NULL))
}
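
# A usage sketch (commented out; opens a temp xlsx that is deleted afterwards):
# ez.xl(mtcars)
# View = ez.xl  # the trick from the note above: GUI clicks now open Excel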

#' format a vector for easy manual copy/processing.
#' @description vc (c(vv)), vi (view everything print out), vv (view format vector), vx (view excel), View (built-in).
#' @param vec a vector
#' @param printn print first n and last n (useful for loooong vector). If 2n >= total length, print all. Inf=all
#' @param quote TRUE/FALSE, whether add a quote around each element (switch for string or number). NULL = auto (F for numeric, T otherwise)
#' @param order vector order for printing out, 'as','az','za'
#' @return returns the printout.
#' \cr By default in R when you type a variable name, you get [1] "rs171440fwd" "rs1800497fwd"
#' \cr now with this function you get 'rs171440fwd','rs1800497fwd','rs180043'
#' @seealso \code{\link{ez.print}} \code{\link{ez.pprint}}
#' @export
ez.vv = function(vec,printn=Inf,order='as',quote=NULL,print2scr=FALSE){
    if(is.null(quote)) {quote = if (is.numeric(vec)) FALSE else TRUE}

    if (2*printn >= length(vec)) {printn=NULL}

    if (order=='az') {vec=sort(vec,decreasing=F,na.last=T)}
    if (order=='za') {vec=sort(vec,decreasing=T,na.last=T)}

    if (quote) {
        if (is.null(printn)){
            printout=noquote(paste0("'",noquote(paste0(vec,collapse = "', '")),"'"))
        } else {
            header = noquote(paste0("'",noquote(paste0(vec[1:printn],collapse = "', '")),"'"))
            tailer = noquote(paste0("'",noquote(paste0(vec[(length(vec)-printn+1):length(vec)],collapse = "', '")),"'"))
            printout = noquote(paste(header,tailer,sep=",..................,"))
        }
    } else {
        if (is.null(printn)){
            printout=noquote(paste0(vec,collapse = ", "))
        } else {
            header = noquote(paste0(vec[1:printn],collapse = ", "))
            tailer = noquote(paste0(vec[(length(vec)-printn+1):length(vec)],collapse = ", "))
            printout = noquote(paste(header,tailer,sep=",..................,"))
        }
    }
    if (print2scr) {
        cat(sprintf("Total elements: %d\n",length(vec)))
    }
    return(printout)
}
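
# A usage sketch (commented out):
# ez.vv(c('rs171440fwd','rs1800497fwd'))  # 'rs171440fwd', 'rs1800497fwd'
# ez.vv(1:10, printn=3)                   # 1, 2, 3,..................,8, 9, 10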

#' format a vector for constructing a sprintf command.
#' @description vc (c(vv)), vi (view everything print out), vv (view format vector), vx (view excel), View (built-in).
#' @param vec a vector
#' @param printn print first n and last n (useful for loooong vector). If 2n >= total length, print all. Inf=all
#' @param quote TRUE/FALSE, whether add a quote around each element (switch for string or number). NULL = auto (F for numeric, T otherwise)
#' @param order vector order for printing out, 'as','az','za'
#' @return returns string
#' @examples
#' \dontrun{
#' # sprintf("c(%s)",ez.vv(vec,...)).
#' # ez.sprintf('model = "|model|", type = |ez.vc(type)|')
#' # use | to replace with brace for documentation purpose,
#' # because of weird error of roxygen "mismatched braces or quotes"
#' }
#' @export
ez.vc = function(vec,...){
    return(sprintf("c(%s)",ez.vv(vec,...)))
}

#' print sorted uniques of a df col or a vector (NA last) and other information
#' @description vc (c(vv)), vi (view everything print out), vv (view format vector), vx (view excel), View (built-in).
#' @param printn print first n and last n (useful for loooong vector). If 2n >= total length, print all. Inf=all
#' @param plot plot single vector (call generic/default \code{\link[graphics]{plot}})
#' @importFrom car basicPower bcPower bcnPower
#' @export
ez.vi = function(x,printn=35,plot=TRUE,...) {
    v = x
    if (is.data.frame(v) | is.matrix(v)) {
        if ( sum(ez.duplicated(colnames(v),vec=TRUE,dim=1))>0 ) {
            stop(sprintf('I cannot proceed. Duplicated col names found: %s\n', colnames(v)[which(ez.duplicated(colnames(v),vec=TRUE,dim=1))] %>% toString))
        }
        v.class = if (is.matrix(v)) 'matrix' else class(v) %>% toString()
        if (is.matrix(v)) {v = data.frame(v)}
        v.cols = colnames(v) %>% ez.vv(print2scr=F,printn=printn,order='as')
        v.cols2 = colnames(v) %>% ez.vv(print2scr=F,printn=printn,order='az')
        v.nrow = nrow(v)
        v.ncol = ncol(v)
        v.missing=ez.count(v,NA,dim=3)
        v.n.colNumsAllNAs = length(as.vector(which(colSums(is.na(v)) == nrow(v))))

        classes=character()
        for (col in colnames(v)) {
            classes=c(classes,class(v[[col]]))
        }
        freq = table(classes)
        v.classes = paste('#',freq %>% names,': ',freq,sep='',collapse = ', ')

        v.attrs = attributes(v)
        v.attrs[c('row.names','names','class')] <- NULL
        v.attrs = sapply(v.attrs,length)
        v.attrs = paste(names(v.attrs),v.attrs,sep = ' @ ',collapse = ', ')
        if (v.attrs == '') v.attrs = '@'

        first2rows = if (nrow(v)>=4) 1:2 else 1:nrow(v); last2rows = if (nrow(v)>=4) (nrow(v)-2+1):nrow(v) else NULL
        first3cols = if (ncol(v)>=6) 1:3 else 1:ncol(v); last3cols = if (ncol(v)>=6) (ncol(v)-3+1):ncol(v) else NULL
        # c(1:2,NULL) -> (1,2)
        rows = if (nrow(v)==0) integer(0) else c(first2rows,last2rows)
        cols = if (ncol(v)==0) integer(0) else c(first3cols,last3cols)
        print(v[rows,cols,drop=F])

        cat('\n#col names, az:\n')
        cat(v.cols2)
        cat('\n')

        cat('\n#col names, as:\n')
        cat(v.cols)
        cat('\n')

        cat(sprintf('\nDim: %d x %d\t(#EmptyCols: %d\t#NA: %d)\n%s\n', v.nrow, v.ncol, v.n.colNumsAllNAs, v.missing, v.classes))
        cat(sprintf('\n%-25s\n',v.class))
        cat(sprintf('attributes: %s\n',v.attrs))
    } else if (is.list(x)) {
        for (l in names(x)) {
            cat(sprintf('$%-25s\t%s\n',l,class(x[[l]]) %>% toString))
        }
        cat(sprintf('List of %d\n',length(x)))
    } else {
        v.elements = unique(v) %>% ez.vv(print2scr=F,printn=printn,order='as')
        v.elements2 = unique(v) %>% ez.vv(print2scr=F,printn=printn,order='az')
        v.class=class(v) %>% toString()
        if (!is.null(names(v))) v.class=paste0('Named ',v.class)
        v.n=length(v)
        v.missing=sum(is.na(v))
        v.unique=length(unique(v))

        v.attrs = attributes(v)
        v.attrs[c('levels','names','class')] <- NULL
        v.attrs = sapply(v.attrs,length)
        v.attrs = paste(names(v.attrs),v.attrs,sep = ' @ ',collapse = ', ')
        if (!is.null(ez.getlabel(v))) {v.attrs = paste(v.attrs,ez.getlabel(v),sep='\n',collapse = '')}
        if (v.attrs == '') v.attrs = '@'

        # calculable
        is.date <- function(x) inherits(x, 'Date')
        # not all NA
        if ( is.numeric(v) & !all(is.na(v)) ) {
            v.mean=mean(v,na.rm=TRUE)
            v.sd=sd(v,na.rm=TRUE)
            v.min=min(v,na.rm=TRUE)
            v.max=max(v,na.rm=TRUE)
            v.sum=sum(v,na.rm=TRUE)
        } else if ( is.date(v) & !all(is.na(v)) ) {
            # converted to numeric, to convert back to date: ez.date(ori='R')
            v.mean=mean(v,na.rm=TRUE)
            v.sd=sd(v,na.rm=TRUE)
            v.min=min(v,na.rm=TRUE)
            v.max=max(v,na.rm=TRUE)
            # sum not defined for "Date" objects
            v.sum=NA
        } else {
            v.mean=v.sd=v.min=v.max=v.sum=NA
        }

        # count everything, not just is.factor(v) | is.character(v) | is.logical(v)
        freqtable=suppressWarnings(dplyr::count_(data.frame(tmpvar=v),"tmpvar"))
        vallbl=sjmisc_get_labels(v,include.values='n',attr.only=T,include.non.labelled=F)
        if (!is.null(vallbl)){
            # ez.2label trick here, do not use the results from sjmisc_get_labels
            vallbl = ez.2label(freqtable[[1]])  # would be in the same order to freqtable
            vallbl = paste("[",vallbl,"]",sep="")
        } else {
            vallbl = rep("", nrow(freqtable))
        }
        v.levels = paste(freqtable[[1]],vallbl,": ",freqtable[[2]],sep="",collapse="\n")

        cat(sprintf('Counts/Levels (Incl NA): \n%s\n\n',v.levels %>% toString(width=printn*20)))
        cat(sprintf("Uniques (Incl NA, NA might be printed as 'NA', as): \n%s\n\n", v.elements))
        cat(sprintf("Uniques (Incl NA, NA might be printed as 'NA', az): \n%s\n\n", v.elements2))
        cat(sprintf('#Unique (Incl NA): %d\t#NA: %d (%.0f%%)\t#Non-NA: %d\t#Total: %d\n', v.unique, v.missing, v.missing*100/v.n, v.n-v.missing, v.n))
        if ( (is.numeric(v) | is.date(v)) & !all(is.na(v)) ) {
            cat(sprintf('M = %.2f\tSD = %.2f\tRange = (%.2f,%.2f)\tSum = %.2f\n', v.mean, v.sd, v.min, v.max, v.sum))
        }
        cat(sprintf('\n%-25s\n',v.class))
        cat(sprintf('attributes: %s\n',v.attrs))

        if (plot & is.numeric(v) & !all(is.na(v))) {
            opar = par(mfrow=c(3, 2), oma=c(0,0,0,0), mar = c(2,2,0.5,0.5))
            on.exit(par(opar))
            plot(v, type='b', pch=20, col='#56B4E9')
            abline(h = v.mean, col = "#E69F00", lty = 3, lwd = 2)
            plot(sort(v), type='b', pch=20, col='#56B4E9')
            abline(h = v.mean, col = "#E69F00", lty = 3, lwd = 2)
            hist(v, col='#56B4E9',main=NULL,xlab=NULL)
            abline(v = v.mean, col = "#E69F00", lty = 3, lwd = 2)
            boxplot(v,col='#56B4E9',horizontal=TRUE);abline(v=v.mean,lty=3,lwd=2,col='#E69F00')

            # boxcox
            tryCatch({
            family = ifelse(any((v <= 0) %in% TRUE), "bcnPower", "bcPower")
            bc = suppressWarnings(car::powerTransform(v ~ 1, family = family))
            sbc = suppressWarnings(summary(bc))
            lambda.raw = sbc$result[[1]]
            lambda = sbc$result[[2]]
            p.lambda = car::testTransform(bc,lambda=lambda)$pval
            gamma = sbc$result.gamma[[1]]
            if (is.null(gamma)) gamma = NA
            cat(sprintf('Box-Cox: lambda.raw = %f, lambda = %.2f, p.lambda = %f, gamma = %f, n = %d\n', lambda.raw, lambda, p.lambda, gamma, length(v)))
            if (family=='bcnPower') {
                vv = car::bcnPower(v, lambda=lambda, jacobian.adjusted = FALSE, gamma=gamma)
            } else if (family=='bcPower') {
                vv = car::bcPower(v, lambda=lambda, jacobian.adjusted = FALSE, gamma=NULL)
            }
            car::boxCox(v ~ 1, family = family,
            xlab = as.expression(substitute(lambda~"(raw)"~"="~lambda.raw.value*", "~lambda~"="~lambda.value*", "~italic(p)~"="~p.lambda.value*", "~gamma~"="~gamma.value*", "~italic(n)~"="~n.value,
                list(lambda.value=sprintf("%.2f",lambda),
                    p.lambda.value=sprintf("%f",p.lambda),
                    gamma.value=sprintf("%f",gamma),
                    lambda.raw.value=sprintf("%f",lambda.raw),
                    n.value=sprintf("%d",length(v))
                    ))))
            hist(vv, col='#56B4E9',main=NULL,xlab=NULL)
            abline(v = mean(vv,na.rm=T), col = "#E69F00", lty = 3, lwd = 2)
            }, error=function(e){
                ez.print('boxcox transformation could not be performed.')
            })
        }
    }
    return(invisible(NULL))
}
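
# A usage sketch (commented out; the numeric branch also draws diagnostic plots,
# including a Box-Cox panel via the car package):
# ez.vi(rnorm(200))  # vector: counts, uniques, M/SD/range, plots
# ez.vi(iris)        # data frame: head/tail preview, dims, classes, attributes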

#' generic plot a customized boxplot with jittered stripplot, violin, and mean
#' @description generic plot a customized boxplot with jittered stripplot, violin, and mean
#' @param df data frame in long format
#' @param cmd like "y", "y|x", "y|x z", "y|x z a" where y is continuous; x, z, a are discrete
#' @param violin plot violin or not
#' @param n.size font size of stat n, 0 to hide
#' @param m.size font size of stat M, 0 to hide
#' @param facet  one of 'cols', 'rows', 'wrap'
#' @return a ggplot object (+theme_apa() to get apa format plot)
#' @export
ez.mvi = function(df,cmd,violin=TRUE,colors=ez.palette('Zhu'),n.size=4.5,m.size=4.5,alpha=0.7,facet='cols',theme.apa=TRUE){
    colors = sprintf("c(%s)",ez.vv(colors))
    df.bak=df
    gghistory=sprintf('df=%s',deparse(substitute(df)))

    # https://stackoverflow.com/a/25215323/2292993
    # options(warn=1) sets the global warn option (options are always global, even when
    # changed inside a function) and returns the old value, saved to oldOpts;
    # on.exit restores the old value when the function exits
    oldOpts = options(warn=1)
    on.exit(options(oldOpts))
    # http://stackoverflow.com/a/25734388/2292993
    # Merge Multiple spaces to single space, and remove trailing/leading spaces
    # also see trimws()--remove trailing/leading spaces
    cmd = ez.trim(cmd)
    cmd = strsplit(cmd,'[~|]')[[1]]

    violin = ifelse(violin, 'geom_violin() +', '')
    # yy
    if (length(cmd)==1) {
        yy = cmd[1]
        df=ez.dropna(df,yy)
        # hide x axis label in this case
        # http://stackoverflow.com/a/15720769/2292993
        tt = sprintf('
                     fun_length <- function(x){return(data.frame(y=mean(x),label= paste0(length(x)," (n)")))}
                     pp = ggplot2::ggplot(df, aes(x=factor(""), y=%s)) +
                     stat_boxplot(geom = "errorbar", width = 0.5) +
                     %s geom_boxplot(outlier.shape=NA,alpha=alpha) +
                     geom_point(position=position_jitter(width=0.2, height=0), size=1) +
                     stat_summary(fun.y=mean, color="darkred", geom="point", shape=18, size=3) +
                     coord_flip() + theme(legend.position="none", axis.ticks.y=element_blank(), axis.text.y=element_blank()) +
                     xlab("") +
                     ggtitle(paste0("N (nrow) = ",nrow(df)))'
                     , yy, violin
        )
        tt = paste0(tt, sprintf(' + \nstat_summary(fun.data = fun_length, color="grey", geom="text",vjust=8,hjust=-0.1,size=%f)',n.size))
        tt = paste0(tt, sprintf(' + \nstat_summary(fun.y=mean, size=%f, color="darkred", geom="text",vjust=8,hjust=1,aes(label=sprintf("%%.2f (M)", ..y..)), alpha=1)',m.size))
        gghistory=paste(gghistory,
                 sprintf('df=ez.dropna(df,"%s")',yy),
                 tt,sep='\n')
    # yy|xx or yy|xx zz
    } else {
        yy = cmd[1]
        xx = gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", "", cmd[2], perl=TRUE)
        xx = strsplit(xx," ",fixed=TRUE)[[1]]
        # yy|xx
        if (length(xx)==1) {
            xx = xx[1]
            df=ez.dropna(df,c(yy,xx))
            df=ez.2factor(df,xx)

            pvalue=summary(aov(df[[yy]] ~ df[[xx]]))[[1]][["Pr(>F)"]][[1]]
            if (pvalue<.001) {
                pvalue = sprintf(", p = %.2e", pvalue)
            } else if (pvalue<.01) {
                pvalue = sprintf(", p = %.3f", pvalue)
            } else {
                pvalue = sprintf(", p = %.2f", pvalue)
            }

            tt = sprintf('
                         fun_length <- function(x){return(data.frame(y=mean(x),label= paste0(length(x)," (n)")))}
                         pp = ggplot2::ggplot(df, aes(x=%s, y=%s, fill=%s)) +
                         stat_boxplot(geom = "errorbar", width = 0.5) +
                         %s geom_boxplot(outlier.shape=NA,alpha=alpha) +
                         geom_point(position=position_jitter(width=0.2, height=0), size=1) +
                         stat_summary(fun.y=mean, color="royalblue", geom="point", shape=18, size=3) +
                         coord_flip() + theme(legend.position="none") +
                         ggtitle(paste0("N = ",nrow(df), "%s"))'
                         , xx, yy, xx, violin, pvalue
            )
            tt = paste0(tt, sprintf(' + \nstat_summary(fun.data = fun_length, color="grey", geom="text",vjust=8,hjust=-0.1,size=%f)',n.size))
            tt = paste0(tt, sprintf(' + \nstat_summary(fun.y=mean, size=%f, color="royalblue", geom="text",vjust=8,hjust=1,aes(label=sprintf("%%.2f (M)", ..y..)), alpha=1)',m.size))
            tt = paste0(tt, sprintf(' + \nscale_fill_manual(values=%s)',colors))
            gghistory=paste(gghistory,
                     sprintf('df=ez.dropna(df,c("%s","%s"))',yy,xx),
                     sprintf('df=ez.2factor(df,c("%s"))',xx),
                     tt,sep='\n')
        # yy|xx zz
        } else {
            if (length(xx)==2) {
                zz = xx[2]
                xx = xx[1]
                df=ez.dropna(df,c(yy,xx,zz))
                df=ez.2factor(df,c(xx,zz))

                tt = sprintf('
                             fun_length <- function(x){return(data.frame(y=mean(x),label= paste0(length(x)," (n)")))}
                             pp = ggplot2::ggplot(df, aes(x=%s, y=%s, fill=%s)) +
                             stat_boxplot(geom = "errorbar", width = 0.5) +
                             %s geom_boxplot(outlier.shape=NA,alpha=alpha) +
                             geom_point(position=position_jitter(width=0.2, height=0), size=1) +
                             stat_summary(fun.y=mean, color="royalblue", geom="point", shape=18, size=3) +
                             %s +
                             coord_flip() + theme(legend.position="none") +
                             ggtitle(paste0("N (nrow) = ",nrow(df)))'
                             , xx, yy, xx, violin, sprintf(ifelse(facet=="cols","facet_grid(.~%s)",ifelse(facet=="rows","facet_grid(%s~.)","facet_wrap(~%s)")),zz)
                )
                tt = paste0(tt, sprintf(' + \nstat_summary(fun.data = fun_length, color="grey", geom="text",vjust=8,hjust=-0.1,size=%f)',n.size))
                tt = paste0(tt, sprintf(' + \nstat_summary(fun.y=mean, size=%f, color="royalblue", geom="text",vjust=8,hjust=1,aes(label=sprintf("%%.2f (M)", ..y..)), alpha=1)',m.size))
                tt = paste0(tt, sprintf(' + \nscale_fill_manual(values=%s)',colors))
                gghistory=paste(gghistory,
                         sprintf('df=ez.dropna(df,c("%s","%s","%s"))',yy,xx,zz),
                         sprintf('df=ez.2factor(df,c("%s","%s"))',xx,zz),
                         tt,sep='\n')
            # yy|xx zz aa
            } else {
                aa = xx[3]
                zz = xx[2]
                xx = xx[1]
                df=ez.dropna(df,c(yy,xx,zz,aa))
                df=ez.2factor(df,c(xx,zz,aa))

                tt = sprintf('
                             fun_length <- function(x){return(data.frame(y=mean(x),label= paste0(length(x)," (n)")))}
                             pp = ggplot2::ggplot(df, aes(x=%s, y=%s, fill=%s)) +
                             stat_boxplot(geom = "errorbar", width = 0.5) +
                             %s geom_boxplot(outlier.shape=NA,alpha=alpha) +
                             geom_point(position=position_jitter(width=0.2, height=0), size=1) +
                             stat_summary(fun.y=mean, color="royalblue", geom="point", shape=18, size=3) +
                             facet_grid(%s~%s) +
                             coord_flip() + theme(legend.position="none") +
                             ggtitle(paste0("N (nrow) = ",nrow(df)))'
                             , xx, yy, xx, violin, zz, aa
                )
                tt = paste0(tt, sprintf(' + \nstat_summary(fun.data = fun_length, color="grey", geom="text",vjust=8,hjust=-0.1,size=%f)',n.size))

                tt = paste0(tt, sprintf(' + \nstat_summary(fun.y=mean, size=%f, color="royalblue", geom="text",vjust=8,hjust=1,aes(label=sprintf("%%.2f (M)", ..y..)), alpha=1)',m.size))
                tt = paste0(tt, sprintf(' + \nscale_fill_manual(values=%s)',colors))
                gghistory=paste(gghistory,
                         sprintf('df=ez.dropna(df,c("%s","%s","%s","%s"))',yy,xx,zz,aa),
                         sprintf('df=ez.2factor(df,c("%s","%s","%s"))',xx,zz,aa),
                         tt,sep='\n')
            }
        }
    }
    if (theme.apa) tt = paste0(tt,'+theme_apa()')
    eval(parse(text = tt))
    pp$gghistory=paste0(gghistory,'\nprint(pp)')
    pp$df=df.bak
    return(pp)
}
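
# A usage sketch (commented out), using the second cmd form ("y|x"):
# pp = ez.mvi(iris, 'Sepal.Length|Species')
# print(pp)
# cat(pp$gghistory)  # the ggplot commands used, for reproducibility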

#' univariate outlier cleanup
#' @description univariate outlier cleanup
#' @param x a data frame or a vector
#' @param col passed to \code{\link{ez.selcol}}. colwise processing
#' \cr        if x is a data frame, col specified, process that col only.
#' \cr        if x is a data frame, col unspecified (i.e., NULL default), process all cols
#' \cr        if x is not a data frame, col is ignored
#' \cr        could be multiple cols
#' @param method z score, mad, or IQR (John Tukey)
#' @param cutoff abs(x) > cutoff will be treated as outliers. Default/auto values (i.e., if NA):
#' \cr z: 3 (alternatively 1.96 = qnorm(0.025,lower.tail=F), within which 95% of values fall)
#' \cr mad: 2.5, which is the standard recommendation (5.2 is also used)
#' \cr iqr: 1.5
#' \cr if multiple values are specified, only the first one is used (an exception is hack=T, where method and cutoff must have the same length or be scalar)
#' @param hack call mapply to try all methods and cutoffs (same length or scalar, i.e., different methods with
#' corresponding cutoffs, or the same method with different cutoffs).
#' @param plot boxplot and hist before and after outlier processing.
#' @param fillout how to process outliers: fill with na, mean, median (columnwise for data frame), or
#' null --> remove outliers (only for a vector or a df with a single col specified; auto switches to na otherwise)
#' @return returns a new data frame or vector. If hack=T, returns nothing
#' @note univariate outlier approaches:
#' The Z-score method relies on the mean and standard deviation of a group of data to measure central
#' tendency and dispersion. This is troublesome, because the mean and standard deviation are highly
#' affected by outliers: they are not robust. In fact, the skewing that outliers bring is one of the
#' biggest reasons for finding and removing outliers from a dataset!
#' Another drawback of the Z-score method is that it behaves strangely in small datasets: in fact,
#' the Z-score method will never detect an outlier if the dataset has fewer than 12 items in it.
#' \cr
#' \cr
#' Median absolute deviation (modified z-score): the median and MAD are robust measures of central tendency and dispersion, respectively.
#' \cr
#' \cr
#' The advantage of the interquartile range method is that, like the modified Z-score method, it uses a robust measure of dispersion.
#' \cr
#' @examples
#' set.seed(1234)
#' x = rnorm(10)
#' iris %>% ez.outlier(1,fill='na',plot=T,hack=T,method=c('mad'),cutoff=c(1,3,2))
#' iris %>% ez.outlier(1,fill='null',plot=T,hack=T,method=c('z','mad','iqr'),cutoff=c(3,5,1.5))
#' iris %>% ez.outlier(1,fill='null',plot=T,hack=T,method=c('z','mad','iqr'),cutoff=NA)
#' @export
ez.outlier = function(x, col=NULL, method=c('z','mad','iqr'), cutoff=NA, fillout=c('null','na','mean','median'), hack=FALSE, plot=FALSE, na.rm=TRUE, print2scr=TRUE) {
    # https://datascienceplus.com/rscript/outlier.R
    # https://cran.r-project.org/web/packages/outliers/index.html
    # https://rpubs.com/hauselin/outliersDetect

    if (hack==T){
        # here for programming reasons, for mapply,
        # cutoff could not be NULL; use NA, because length(NULL)=0, but length(NA)=1
        mapply(ez.outlier,method=method,cutoff=cutoff,MoreArgs=list(x=x,col=col,hack=F,plot=plot,fillout=fillout,na.rm=na.rm,print2scr=print2scr),SIMPLIFY=F,USE.NAMES=F)
        ez.print('Hack done! No actual data returned.')
        return(invisible(NULL))
    }

    # fillout=todropna is an internal workaround for data frame with single col passed in
    method = match.arg(method); cutoff = cutoff[1]; fillout = fillout[1]

    if (!is.data.frame(x)) {
        x.bak.plot = x; x.replace.na = x; oldNAs = ez.count(x.replace.na,NA)
        if (fillout=='na' | fillout=='todropna') {
            replacement = NA
        } else if (fillout=='mean') {
            replacement = mean(x, na.rm=na.rm)
        } else if (fillout=='median') {
            replacement = median(x, na.rm=na.rm)
        } else if (fillout=='null') {
            replacement = NULL
        }

        if (method=='z'){
            if(is.na(cutoff)) cutoff = 3
            absz = abs((x - mean(x, na.rm=na.rm))/sd(x, na.rm=na.rm))
            if (!is.null(replacement)) {
                x[absz > cutoff] <- replacement
                } else {
                    # if nothing above cutoff, x is untouched
                    if (length(which(absz > cutoff)) > 0) {
                        x = x[-which(absz > cutoff)]
                    }
                }
            x.replace.na[absz > cutoff] <- NA
        } else if (method=='mad'){
            if(is.na(cutoff)) cutoff = 2.5
            absmad <- abs((x - median(x, na.rm=na.rm))/mad(x, na.rm=na.rm))
            if (!is.null(replacement)) {
                x[absmad > cutoff] <- replacement
                } else {
                    if (length(which(absmad > cutoff)) > 0) {
                        x = x[-which(absmad > cutoff)]
                    }
                }
            x.replace.na[absmad > cutoff] <- NA
        } else if (method=='iqr'){
            # https://stackoverflow.com/a/4788102/2292993
            if(is.na(cutoff)) cutoff = 1.5
            q1 <- quantile(x, 0.25, na.rm=na.rm)
            q3 <- quantile(x, 0.75, na.rm=na.rm)
            # alternatively iqr = q3-q1
            iqr = IQR(x, na.rm = na.rm)
            lower_bound = q1 - (iqr * cutoff)
            upper_bound = q3 + (iqr * cutoff)
            if (!is.null(replacement)) {
                x[(x > upper_bound) | (x < lower_bound)] <- replacement
                } else {
                    if (length(which((x > upper_bound) | (x < lower_bound))) > 0) {
                        x = x[-which((x > upper_bound) | (x < lower_bound))]
                    }
                }
            x.replace.na[(x.replace.na > upper_bound) | (x.replace.na < lower_bound)] <- NA
        }

        nOutliers = ez.count(x.replace.na,NA,col) - oldNAs
        if (print2scr) {
            if (!is.null(col)) {
                ez.pprint(sprintf('%-15s %5s(%.2f): %3d outliers found and %s.', toString(col), toupper(method), cutoff, nOutliers, ifelse((is.null(replacement)|fillout=='todropna'),'REMOVED','REPLACED')))
            } else {
                ez.pprint(sprintf('%5s(%.2f): %3d outliers found and %s.', toupper(method), cutoff, nOutliers, ifelse((is.null(replacement)|fillout=='todropna'),'REMOVED','REPLACED')))
            }
        }

        if (plot){
            # oma is margin for the whole
            # mar controls margin size for individual plot it goes c(bottom, left, top, right)
            opar = par(mfrow=c(2, 2), oma=c(0,0,1.5,0), mar = c(2,2,1.5,0.5))
            on.exit(par(opar))
            boxplot(x.bak.plot, main=sprintf("With outliers (n=%d)",length(x.bak.plot)))
            hist(x.bak.plot, main=sprintf("With outliers (n=%d)",length(x.bak.plot)), xlab=NULL, ylab=NULL)

            boxplot(x, main=sprintf("With outliers (n=%d)",length(x.bak.plot)-nOutliers))
            hist(x, main=sprintf("With outliers (n=%d)",length(x.bak.plot)-nOutliers), xlab=NULL, ylab=NULL)
            title(sprintf("%s Outlier Check: %s(%.2f)",toString(col), toupper(method), cutoff), outer=TRUE)
        }
    } else if (is.data.frame(x)) {
        col = ez.selcol(x,col)
        if (length(col)>1 & fillout=='null') {
            ez.pprint('I do not know how to remove univariate outliers in multiple cols. fillout: null --> na ...','red')
            fillout='na'
        } else if (fillout=='null') {
            fillout='todropna'
        }
        # trick to pass actual col name
        x[col] = lapply(1:length(col), function(j) {ez.outlier(x=x[col][[j]],col=col[j],method=method,cutoff=cutoff,plot=plot,hack=hack,fillout=fillout,na.rm=na.rm,print2scr=print2scr)})
        if (fillout=='todropna') x = ez.dropna(x,col,print2scr=F)
    } # end if
    return(invisible(x))
}

#' standard error of mean
#' @description standard error of mean
#' @param x a vector
#' @note
#' \cr NA will be omitted before calculation; the formula is sqrt(var(x,na.rm=TRUE)/length(na.omit(x))) (equivalent to sd(x,na.rm=TRUE)/sqrt(length(na.omit(x))))
#' \cr
#' \cr \code{\link[stats]{sd}}, standard deviation (sigma or sd, s) is simply the (positive) square root of the variance (sigma^2, or s^2), \code{\link[stats]{var}}. Both sd(), var() use denominator n - 1, which gives an unbiased estimator of the (co)variance for i.i.d. observations.
#' \cr
#' \cr se = sd/sqrt(n). see https://www.statsdirect.com/help/basic_descriptive_statistics/standard_deviation.htm
#' \cr
#' \cr I wrote ez.se; ez.sd is simply an alias of \code{\link[stats]{sd}}
#' \cr
#' \cr\cr For zscore: (x-mean(x,na.rm=T))/sd(x,na.rm=T), or use \code{\link{ez.scale}}(x,center=TRUE,scale=TRUE); to demean only: ez.scale(x,center=TRUE,scale=FALSE). (ez.scale() automatically ignores NA and returns it in place.)
#' \cr z-scores indeed have a mean of zero and a standard deviation of 1. Other than that, however, z-scores follow the exact same distribution as original scores. That is, standardizing scores doesn't make their distribution more or less "normal" in any way.
#' see https://www.spss-tutorials.com/z-scores-what-and-why/
#' @export
ez.se = function(x) {
    # http://stackoverflow.com/a/7220087/2292993
    sqrt(var(x,na.rm=TRUE)/length(na.omit(x)))
}
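
# A numeric sanity check (commented out): se = sd/sqrt(n), with NA omitted:
# x = c(1, 2, 3, NA)
# ez.se(x)                   # 0.5773503
# sd(x, na.rm=TRUE)/sqrt(3)  # same value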

#' @rdname ez.se
#' @export
ez.sd = stats::sd

#' z score, scale, NA ignored & returned in place.
#' @description z score, scale, NA ignored & returned in place.
#' @param x a data frame or a vector
#' @param col passed to \code{\link{ez.selcol}}
#' \cr        if x is a data frame, col specified, process that col only.
#' \cr        if x is a data frame, col unspecified (i.e., NULL default), process all cols
#' \cr        if x is not a data frame, col is ignored
#' \cr        could be multiple cols
#' @return returns a new data frame or vector
#' @note
#' similar to base::scale, but no special scale attributes
#' alias: \code{\link{ez.z}} and \code{\link{ez.scale}}
#' @seealso \code{\link{ez.se}} \code{\link{ez.sd}}
#' @export
ez.scale = function(x, col=NULL, center = TRUE, scale = TRUE) {
    if (!is.data.frame(x)) {
        if (!is.numeric(x) | is.factor(x) | is.character(x)){
            stop("input x must be numeric")
        }
        x = as.vector(base::scale(x,center=center,scale=scale))
    } else if (is.data.frame(x)) {
        col = ez.selcol(x,col)
        x[col] = lapply(x[col],ez.scale,col=NULL,center=center,scale=scale)
    } else {
        x = x
    } # end if
    return(x)
}

#' @rdname ez.scale
#' @export
ez.z = ez.scale
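
# A usage sketch (commented out); NAs stay in place:
# ez.scale(c(1, 2, NA, 4))        # z-scores, with NA kept at position 3
# ez.scale(iris, 'Sepal.Length')  # scale a single column of a data frame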

#' r correlation
#' @description r correlation
#' @param x df or matrix
#' @param col col included for calculation, ignored if x is not a df
#' @param type "pearson" or "spearman"
#' @param print2scr print attention message, if NA present in r matrix; no effect if there is no NA
#' @note underlying uses \code{\link[Hmisc]{rcorr}}, which uses pairwise deletion. Pairs with fewer than 2 non-missing values, or with too small variance, have their r values set to NA
#' \cr i.e., the returned r matrix may contain some NAs without warning
#' @return r matrix
#' @export
ez.r = function(x,col=NULL,type="pearson",print2scr=T) {
    if (is.data.frame(x) & !is.null(col)) {
        col=(ez.selcol(x,col))
        x=x[col]
    }

    # https://www.statmethods.net/stats/correlations.html
    # stats::cor(x, use, method)
    # x: Matrix or data frame
    # use:
    #   all.obs (assumes no missing data - missing data will produce an error),
    #   complete.obs (listwise deletion), and
    #   pairwise.complete.obs (pairwise deletion)
    # method:
    #   Options are pearson, spearman or kendall.
    # cor() does not produce tests of significance, although you can use the cor.test( ) function to test a single correlation coefficient.

    # rcorr uses pairwise deletion; however, you may still get warnings/errors/NAs when calculating r, p
    # because of too few samples, or too small variance (close to 0) after deletion
    result = Hmisc::rcorr(data.matrix(x),type=type)$r
    CorrNACounts = data.frame(result) %>% ez.count(val=NA,dim=1) %>% tibble::rownames_to_column(var='variable') %>% dplyr::arrange(desc(count))
    NAs = sum(CorrNACounts$count)

    if (NAs > 0) {
        if (print2scr) {ez.pprint(sprintf('Attention: %s NAs contained in the correlation matrix (see CorrNACounts in workspace).', NAs), color='red')}
        assign("CorrNACounts", CorrNACounts, envir = .GlobalEnv)
    }
    return(result)
}
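
# A usage sketch (commented out; pairwise deletion via Hmisc::rcorr):
# ez.r(mtcars, col=c('mpg','hp','wt'))               # pearson r matrix
# ez.r(mtcars[c('mpg','hp','wt')], type='spearman')  # spearman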

#' (z) residual
#' @description (z) residual
#' @param method 1=scale(resid), 2=resid (unstandardized), 3=stats::rstandard
#' @note according to jerry's test, \code{\link[stats]{rstandard}} and \code{\link[MASS]{stdres}} give the same results, both of which are slightly different from (but very close to, highly correlated with) method 1
#' \cr SPSS linear regression->save->residuals (menu)
#' \cr r z score of resid or residuals = SPSS z score of unstandardized residual
#' \cr r resid or residuals = SPSS unstandardized residual
#' \cr r stats::rstandard = MASS::stdres = SPSS studentized residual
#' \cr \href{https://stackoverflow.com/q/40062482/2292993}{https://stackoverflow.com/q/40062482/2292993}
#' @export
ez.zresid = function(model,method=1) {
    if (method==1) {result=as.vector(scale(resid(model),center=TRUE,scale=TRUE))}
    if (method==2) {result=resid(model)}
    if (method==3) {result=stats::rstandard(model)}
    return(result)
}
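
# A usage sketch (commented out), using a plain lm fit:
# m = lm(mpg ~ wt + hp, data=mtcars)
# ez.zresid(m)            # z score of residuals (method 1)
# ez.zresid(m, method=3)  # studentized; matches MASS::stdres / SPSS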

#' residualize variable, returning it changed in-place (i.e under its original column name)
#' @description residualize variable, returning it changed in-place (i.e under its original column name)
#' @param var  a single var or a vector of var names to regress out covar
#' @param covar covariates, one or more than one covar
#' @param model 'lm', 'lmrob' (MM-type Estimators, robustbase), 'lmRob' (automatically chooses, robust), 'rlm' (MASS)
#' @param scale  standardize (TRUE) or leave unstandardized (FALSE) the residual
#' @param ... additional param passed to the specified model
#' @return dataframe with var residualized in place (i.e under its original column name). If covar have NA, then that row for var will be NA.
#' @importFrom robust lmRob
#' @importFrom robustbase lmrob lmrob.control
#' @export
ez.zresidize = function(data,var,covar,model='lm',scale=TRUE,...){
    data = data.frame(data)
    vars = var
    for (var in vars){
        var = ez.selcol(data,var); covar = ez.selcol(data,covar)
        # borrowed codes from https://cran.r-project.org/web/packages/umx/index.html
        form = paste0(var, " ~ ", paste(covar, collapse = " + "))
        form = as.formula(form)
        set.seed(20190117)
        # na.exclude better than na.omit. extractor functions like residuals() or fitted() will pad their
        # output with NAs for the omitted cases with na.exclude, thus having an output of the same length as
        # the input variables.
        na.action = na.exclude
        if (model=='lm') m = stats::lm(form,data,na.action=na.action,...)
        # MM-type Estimators
        # for some reason, robustbase::lmrob.control() cannot take a seed with a single value, like seed=1313
        if (model=='lmrob') m = suppressWarnings(robustbase::lmrob(form,data,control=robustbase::lmrob.control(max.it=500,maxit.scale=500),na.action=na.action,...))
        # lmRob function automatically chooses an appropriate algorithm to compute a final robust estimate with high breakdown point and high efficiency
        if (model=='lmRob') m = suppressWarnings(robust::lmRob(form,data,control=robust::lmRob.control(seed=1313,mxr=500,mxf=500,mxs=500,trace=FALSE),na.action=na.action,...))
        # increased maxit from 20, because sometimes, rlm fails
        # suppress 'rlm' failed to converge in xx steps
        if (model=='rlm') m = suppressWarnings(MASS::rlm(form,data,maxit=500,na.action=na.action,...))
        # can only use resid, robust lms do not support stats::rstandard
        tmp <- resid(m)
        if (scale) tmp = as.vector(scale(tmp,center=T,scale=T))
        oldNAs = sum(is.na(data[[var]]))
        newNAs = sum(is.na(tmp))
        if(newNAs > oldNAs){
            ez.pprint(sprintf('%d cases of var %s lost due to missing covariates', newNAs - oldNAs, var))
        }
        data[[var]] = tmp
    } # end for
    return(data)
}
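
# A usage sketch (commented out): regress wt and hp out of mpg, keeping the
# residualized values under the original column name:
# mt = ez.zresidize(mtcars, var='mpg', covar=c('wt','hp'), model='lm')
# mt$mpg  # standardized residuals in place of raw mpg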

#' anova or ancova 1b, aov(y~x+covar), for many y and/or many x
#' @description anova or ancova 1b, aov(y~x+covar), for many y and/or many x
#' @param df a data frame. Internally goes through dropna --> ez.2value(y), ez.2factor(x), ez.2value(covar)
#' @param y compatible with \code{\link{ez.selcol}}; or y could be a cmd from \code{\link{ez.barplot}} where it is not necessary to specify x, covar. y|x or y|x+covar, y~x or y~x+covar. cmd format can only handle 1 y, 1 x (and covars)
#' @param x compatible with \code{\link{ez.selcol}}
#' @param covar NULL=no covar, compatible with \code{\link{ez.selcol}}
#' @param view call View(result)
#' @param pmethods c('bonferroni','fdr'), type p.adjust.methods for all methods. This correction applies for all possible tests that have been/could be done.
#' @param plot T/F, the black dash line is bonferroni p = 0.05 (again for tests only with a non-NA p values), the grey black dash is uncorrected p = 0.05
#' @param cols number of columns for multiplot. NULL=auto calculate
#' @param error whether show error message when error occurs (also result will have an empty row when error occurs)
#' @return an invisible data frame or list of data frame (if many y and many x)
#' \cr the means column in excel can be split into multiple columns using Data > Text to Columns
#' \cr dof: from F-statistic
#' @note In R, \code{\link[stats]{aov}} for 1- or multiple-way (use : * syntax to include interactions--not implemented in this function yet) between anova, it takes the output from \code{\link[stats]{lm}} and returns it to us in a way that is more in keeping with a traditional ANOVA approach.
#' \cr
#' \cr \code{\link[car]{Anova}} calculates type-II or type-III analysis-of-variance tables for model objects. Also, \code{\link[stats]{anova}} analyses the result of a model, producing type I (sequential) ANOVA table.
#' \cr
#' \cr Within and mixed anova use \code{\link[nlme]{lme}}, because Repeated-measures designs have dependent data, therefore dependent residuals, which can be handled by nlme::lme
#' \cr
#' \cr Eta squared measures the proportion of the total variance in a dependent variable Y that can be accounted for by knowledge of X (ie, the membership of different groups defined by an independent variable). For one-way anova, eta squared is the same as partial eta squared, and the same as R2 (these are true only for one-way anova, not the case for ancova or two-way anova). Eta squared is analogous to R2 in lm. Both are biased and have the weakness that each additional variable added will automatically increase the value of Eta squared or R2. Eta squared for factor1 = SSfactor1/SStotal, where SStotal = SSfactor1 + SSfactor2 + SSinteraction + SSerror
#' \cr
#' \cr Partial eta squared is a similar measure in which the effects of other independent variables and interactions are partialled out (ie, the proportion of variance that a variable explains that is not explained by other variables in the analysis). Partial Eta squared for factor1 = SSfactor1/(SSfactor1+SSerror)
#' \cr
#' \cr If covariates provided, adjusted means with SD, partial eta squared. Otherwise, raw mean SD, and (partial) eta squared. se=sd/sqrt(n)
#' \cr
#' \cr posthoc results are very close to spss output, but may have tiny differences (e.g., R .304 vs. SPSS .310)
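#' @examples
#' # a minimal sketch with iris (illustrative only):
#' # ez.anovas1b(iris, 'Sepal.Length', 'Species')                       # one-way anova
#' # ez.anovas1b(iris, 'Sepal.Length', 'Species', covar='Petal.Width')  # ancova, adjusted means
#' # manual partial eta squared from a type-III table, as in the note above:
#' # sm = car::Anova(aov(Sepal.Length ~ Species + Petal.Width, iris), type='III')
#' # ss = sm[['Sum Sq']]; ss[2]/(ss[2]+ss[length(ss)])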
#' @importFrom effects effect
#' @export
ez.anovas1b = function(df,y,x=NULL,covar=NULL,report=T,reportF=F,view=F,plot=F,cols=3,pmethods=c('bonferroni','fdr'),point.size=10,point.shape=16,lab.size=18,text.size=16,error=T,prefix=2,pe=F,...) {
    # yet another patch for cmd input
    if (grepl('[~|]',y[1],perl=TRUE)){
        # peel the onion
        if (grepl('+',y,fixed=TRUE)) {
            tmp = strsplit(ez.trim(y),"+",fixed=TRUE)[[1]]
            covar = ez.trim(tmp[2:length(tmp)])
            y = tmp[1]
        }

        tmp = strsplit(ez.trim(y),'[~|]',perl=TRUE)[[1]]
        y = ez.trim(tmp[1]); x = ez.trim(tmp[2])
    }

    y=ez.selcol(df,y); x=ez.selcol(df,x); if (!is.null(covar)) covar=ez.selcol(df,covar)
    bt = trellis.par.get("fontsize")$text ; bp = trellis.par.get("fontsize")$points
    text.size = text.size/bt ; title.size = (lab.size+2)/bt ; lab.size = lab.size/bt ; point.size = point.size/bp

    # patch to handle multiple y, multiple x
    if (length(y)>1 & length(x)>1) {
        xlist = list(); plist = list()
        for (xx in x) {
            # plot = F; no need for separate plotlist
            result = ez.anovas1b(df,y,xx,covar=covar,report=report,view=F,plot=F,pmethods=pmethods,error=error,...)
            xlist[[xx]] = result

            result.plot = result %>% ez.dropna('p',print2scr=F)
            if (plot & nrow(result.plot)>0) {
                bonferroniP = -log10(0.05/length(result.plot[['p']]))
                plist[[xx]] = lattice::xyplot(-log10(result.plot$p) ~ result.plot$petasq2,
                       xlab = list(expression(eta[p]^2), cex=lab.size, fontfamily=RMN),
                       ylab = list("-log10(p-Value)", cex=lab.size, fontfamily=RMN),
                       scales = list( x=list(cex=text.size, fontfamily=RMN), y=list(cex=text.size, fontfamily=RMN) ),
                       type = "p", pch=point.shape, cex=point.size,
                       main = list(xx, cex=title.size, fontfamily=RMN),
                       col = "#e69f00",
                       ylim=c(-0.5,max(c(bonferroniP,-log10(result.plot$p)))+0.5),
                       abline=list(h=c(bonferroniP,-log10(0.05)),lty=2,lwd=2,col=c('black','darkgrey'))
                )
            }
        }
        if (plot & length(plist)>0) {if (is.null(cols)) {cols=floor(sqrt(length(plist)))}; gridExtra::grid.arrange(grobs=plist,ncol=cols)}
        return(invisible(xlist))
    }

    getStats = function(y,x,covar,df,...){
        df = df[, c(y,x,covar), drop=F]
        df = ez.dropna(df,print2scr=F)
        df.bak = df
        df = ez.2value(df,y); df = ez.2factor(df,x)
        if (!is.null(covar)){
            for (vv in c(covar)){
                # nonfactor nlevels returns 0
                if (nlevels(df[[vv]])>2) ez.pprint(sprintf('col %s has >=3 factor levels, converting to number via ez.2value... dummy coding instead?', vv))
                df[[vv]]=ez.2value(df[[vv]])
            }
        }
        tryCatch({
        # aov is wrapper of lm, so also accept na.action
        na.action=na.exclude
        form = as.formula( paste0(y, " ~ ", paste(c(x,covar), collapse = " + ")) )
        # by default R use dummy coding (ie, contr.treatment), not orthogonal, not OK for type III
        # options('contrasts')
        # factory-default ones
        # options('contrasts' = c("contr.treatment", "contr.poly"))
        oldcons = options('contrasts' = c("contr.helmert", "contr.poly"))
        m = aov(form,df,na.action=na.action,...)
        sm = car::Anova(m,type="III")
        # restore contrasts
        options(oldcons)

        # have fun now
        p = sm[2,"Pr(>F)"]
        F = sm[2,'F value']
        dof = toString(sm[['Df']][c(2,length(sm[['Df']]))])
        MSE = sm[2,'Sum Sq']/sm[2,'Df']
        n = aggregate(df[[y]]~df[[x]],FUN=length)
        total = sum(n[[2]])
        counts = sprintf('%s, %d',n[[1]],n[[2]]) %>% paste0(collapse='; ')
        # no covar, no adjustment
        if (is.null(covar)) {
            avg = aggregate(df[[y]]~df[[x]],FUN=mean)
            dev = aggregate(df[[y]]~df[[x]],FUN=sd)
        } else {
            # adjusted mean, sd
            avg = data.frame(effects::effect(x,m))
            # sd=se*sqrt(n)
            dev = data.frame(avg[[1]],avg$se*sqrt(n[[2]]))
        }
        lvls = sprintf('%s',avg[[1]]) %>% paste0(collapse='\t')
        means = sprintf('%s\t%.2f',avg[[1]],avg[[2]]) %>% paste0(collapse='\t')
        # raw.adj.mean.sd = sprintf('%.2f (%.2f)',avg[[2]],dev[[2]]) %>% paste0(collapse='\t')
        raw.adj.mean.sd = sprintf('%.2f ± %.2f',avg[[2]],dev[[2]]) %>% paste0(collapse='\t')
        # page 523 in Discovering Stats using R 1st
        ss = sm[['Sum Sq']]
        petasq2 = ss[2]/(ss[2]+ss[length(ss)])
        ph = ez.esp('summary( multcomp::glht(m, linfct=multcomp::mcp("{x}" = "Tukey")) )')
        posthoc_tukey = paste0('(',names(ph$test$tstat),') ', ez.p.apa(ph$test$pvalues,prefix=prefix,pe=pe),'; ',collapse='')

        # df.bak with dropna, so na.rm needed
        uniques_drop_na.x=length(unique(df.bak[[x]])); nlevels.x=nlevels(df.bak[[x]])
        uniques_drop_na.y=length(unique(df.bak[[y]])); min.y=min(df.bak[[y]]); max.y=max(df.bak[[y]]); mean.y=mean(df.bak[[y]]); sd.y=sd(df.bak[[y]])

        return(list(y=y,x=x,covar=toString(covar),p=p,petasq2=petasq2,F=F,dof=dof,MSE=MSE,lvls=lvls,means=means,total=total,counts=counts,raw.adj.mean.sd=raw.adj.mean.sd,posthoc_tukey=posthoc_tukey,
            uniques_drop_na.x=uniques_drop_na.x,nlevels.x=nlevels.x,
            uniques_drop_na.y=uniques_drop_na.y,min.y=min.y,max.y=max.y,mean.y=mean.y,sd.y=sd.y
            ))
        }, error = function(e) {
            options(oldcons)  # make sure restore
            if (error) ez.pprint(sprintf('EZ Error: aov(%s ~ %s). NA returned.',y,paste(c(x,covar), collapse = " + ")),color='red')
            return(list(y=y,x=x,covar=toString(covar),p=NA_real_,petasq2=NA_real_,F=NA_real_,dof=NA_character_,MSE=NA_real_,lvls=NA_character_,means=NA_character_,total=NA,counts=NA_character_,raw.adj.mean.sd=NA_character_,posthoc_tukey=NA_character_,
                uniques_drop_na.x=NA,nlevels.x=NA,
                uniques_drop_na.y=NA,min.y=NA,max.y=NA,mean.y=NA,sd.y=NA
                ))
        }) # end tryCatch
    }

    result = mapply(getStats,y=y,x=x,MoreArgs=list(covar=covar, df=df,...),USE.NAMES=F)
    result = data.frame(t(result))
    result[] = lapply(result,unlist)

    for (method in pmethods) {
        # per the docs, the correction counts all possible tests (NA p values included in n)
        result[[method]]=stats::p.adjust(result[['p']],method=method)
    }
    ylbl = ez.label.get(df,result$y); xlbl = ez.label.get(df,result$x)
    if (is.null(ylbl)) {ylbl=''}; if (is.null(xlbl)) {xlbl=''}; result$ylbl=rep(ylbl,nrow(result)); result$xlbl=rep(xlbl,nrow(result))
    if (nrow(result)==0){result$orindex=integer(0)} else {result$orindex=1:nrow(result)}
    result = ez.move(result,'orindex first; ylbl after y; xlbl after x; raw.adj.mean.sd, posthoc_tukey last') %>% dplyr::arrange(p)

    if (view) {View(result)}

    result.report = result %>% ez.dropna('p',print2scr=F) %>% dplyr::arrange(orindex)
    if (report & nrow(result.report)>0) {
        # ez.pprint('>>>>>>')
        # ez.print(ifelse(is.null(covar), 'mean (sd), se=sd/sqrt(n)', 'adjusted mean (sd), se=sd/sqrt(n)'))
        for (i in 1:nrow(result.report)){
            Y = result.report$y[i]; X = paste(c(result.report$x[i],covar),collapse="+")
            ez.pprint(sprintf('levels: %s',result.report$lvls[i]),color='cyan')
            if (reportF) {
                ez.pprint(sprintf('aov(%s~%s): %s\t%.2f\t%s',Y,X,result.report$raw.adj.mean.sd[i],result.report$F[i],ez.p.apa(result.report$p[i],prefix=0,pe=pe)),color='cyan')
            } else {
                ez.pprint(sprintf('aov(%s~%s): %s\t%s',Y,X,result.report$raw.adj.mean.sd[i],ez.p.apa(result.report$p[i],prefix=0,pe=pe)),color='cyan')
            }
        }

        for (i in 1:nrow(result.report)){
            ez.pprint(sprintf('Tukey: %s', result.report$posthoc_tukey[i]),color='cyan')
        }

        for (i in 1:nrow(result.report)){
            ez.pprint(sprintf(' Data available for %d participants (%s).', result.report$total[i], result.report$counts[i]),color='cyan')
        }

        for (i in 1:nrow(result.report)){
            Y = result.report$y[i]; X = paste(c(result.report$x[i],covar),collapse="+")
            # if (result.report$F[i] < 1) {
            #     ez.pprint(sprintf('aov(%s~%s): F(%s) < 1',Y,X,result.report$dof[i]),color='cyan')
            # } else {
            #     ez.pprint(sprintf('aov(%s~%s): F(%s) = %.2f, MSE = %.2f, %s, %s = %.2f',Y,X,result.report$dof[i],result.report$F[i],result.report$MSE[i],ez.p.apa(result.report$p[i],prefix=prefix,pe=pe),ifelse(is.null(covar),'etasq2','partial etasq2'),result.report$petasq2[i]),color='cyan')
            # }
            # https://stackoverflow.com/a/13353595/2292993
            ez.pprint(sprintf('aov(%s~%s): F(%s) = %.2f, MSE = %.2f, %s, %s = %s',Y,X,result.report$dof[i],result.report$F[i],result.report$MSE[i],ez.p.apa(result.report$p[i],prefix=prefix,pe=pe),ifelse(is.null(covar),'etasq2','partial etasq2'),substring(sprintf("%.2f", result.report$petasq2[i]), 2)),color='cyan')
        }
        # ez.pprint('<<<<<<')
    }

    result.plot = result %>% ez.dropna('p',print2scr=F)
    if (plot & nrow(result.plot)>0) {
        bonferroniP = -log10(0.05/length(result.plot[['p']]))
        pp=lattice::xyplot(-log10(result.plot$p) ~ result.plot$petasq2,
               xlab = list(expression(eta[p]^2), cex=lab.size, fontfamily=RMN),
               ylab = list("-log10(p-Value)", cex=lab.size, fontfamily=RMN),
               scales = list( x=list(cex=text.size, fontfamily=RMN), y=list(cex=text.size, fontfamily=RMN) ),
               type = "p", pch=point.shape, cex=point.size,
               main = list(ifelse((length(y)>=1 & length(x)==1),x,y), cex=title.size, fontfamily=RMN),
               col = "#e69f00",
               ylim=c(-0.5,max(c(bonferroniP,-log10(result.plot$p)))+0.5),
               abline=list(h=c(bonferroniP,-log10(0.05)),lty=2,lwd=2,col=c('black','darkgrey'))
        )
        print(pp)
    }

    return(invisible(result))
}

#' lm(y~x+covar), for many y and/or many x
#' @description lm(y~x+covar), for many y and/or many x
#' @param df a data frame. Internally go through dropna --> ez.2value --> scale
#' \cr because of this, this function is not friendly to factors with 3 or more levels, although the default lm can handle factors nicely (yet complicatedly--eg, not easy to test significance)
#' \cr by design, ez.2value converts factors with 2 levels, such as sex; this generates the same results as default lm, see note and example below for more
#' \cr scale to get standardized beta as effect size for comparison across different variables
#' @param y compatible with \code{\link{ez.selcol}}, or y could be a cmd from \code{\link{ez.scatterplot}} where it is not necessary to specify x, covar, by: y~x+covar|{1,4}by. cmd format can only handle 1 y, 1 x
#' @param x compatible with \code{\link{ez.selcol}}
#' @param covar NULL=no covar, compatible with \code{\link{ez.selcol}}
#' @param by NULL. if specified with a factor col, do the model on all subjects, and groups in that col separately. In this case returns a list of result data frame
#' @param report print results (in APA format)
#' @param model vector c('lm', 'lmrob', 'lmRob', 'rlm'), robustbase::lmrob--MM-type Estimators; robust::lmRob--automatically chooses an appropriate algorithm. one or more, 'lm' will always be included internally, even if not specified
#' @param view call View(result)
#' @param pmethods c('bonferroni','fdr'), type p.adjust.methods for all methods. This correction is based on the total number of tests that successfully generate p values
#' @param plot T/F, the black dashed line is bonferroni p = 0.05 (again, only for tests with a non-NA p value), the grey dashed line is uncorrected p = 0.05
#' @param cols number of columns for multiplot. NULL=auto calculate
#' @param error whether show error message when error occurs (also result will have an empty row when error occurs)
#' @param ... additional parameters to the specified model. if more than one model specified, ... may not be OK for all models, because diff models have diff parameters
#' @return an invisible data frame or list of data frame (if many y and many x)
#' \cr beta: standardized beta coefficients (simple or multiple regression) are the estimates resulting from a regression analysis that have been standardized
#' \cr so that the variances of dependent and independent variables are 1.
#' \cr Therefore, standardized coefficients refer to how many standard deviations a dependent variable will change,
#' \cr per standard deviation increase in the predictor variable.
#' \cr For simple regression (1 y ~ 1 x), the value of the standardized coefficient (beta) equals the correlation coefficient (r) (beta=r).
#' \cr For multiple regression (with covar), the value of the standardized coefficient (beta) is close to semi-partial correlation
#' \cr According to jerry's testing, scaling (or not) x, y, or covar does not change p values for predictors, although the intercept would differ
#' \cr
#' \cr dof: from F-statistic
#' \cr residualization: say, y ~ x + a + b, first x ~ a + b is residualized and then y ~ x (ie, semi-partial). If no covar, y ~ x, although labelled residualized in result data frame, actually non-residualized, because nothing to residualize
#' \cr as far as semi-partial concerned, y ~ x + a + b, x ~ y + a + b are two different models. ppcor::spcor.test(iris[,1],iris[,2],iris[,c(3,4)]) != ppcor::spcor.test(iris[,2],iris[,1],iris[,c(3,4)])
#' \cr but for partial correlation, partial(y,x) is the same as partial(x,y), ppcor::pcor.test(iris[,1],iris[,2],iris[,c(3,4)]) = ppcor::pcor.test(iris[,2],iris[,1],iris[,c(3,4)])
#' \cr On a related note, y ~ x + a + b, generates the same results as y ~ b + a + x (the order does not matter).
#' \cr For coding purpose, I put x as x, and (a,b) as cov. stdbeta, p returned refer to x in MLR (multiple linear regression), values referring to a,b were discarded during the process.
#' \cr
#' \cr ++++++++++!!!important note!!!++++++++++
#' \cr "n"           | "n.lmrob"           | "n.lmRob"           | "n.rlm"
#' \cr "stdbeta"     | "stdbeta.lmrob"     | "stdbeta.lmRob"     | "stdbeta.rlm"
#' \cr "p"           | "p.lmrob"           | "p.lmRob"           | "p.rlm"
#' \cr "r2"          | "r2.lmrob"          | "r2.lmRob"          | "r2.rlm" (NA)
#' \cr "r.spartial"  | "r.spartial.lmrob"  | "r.spartial.lmRob"  | "r.spartial.rlm" (NA)
#' \cr "p.spartial"  | "p.spartial.lmrob"  | "p.spartial.lmRob"  | "p.spartial.rlm"
#' \cr "r.partial"   | "r.partial.lmrob"   | "r.partial.lmRob"   | "r.partial.rlm" (NA)
#' \cr "p.partial"   | "p.partial.lmrob"   | "p.partial.lmRob"   | "p.partial.rlm"
#' \cr
#' \cr Summary: The absolute value of the semipartial correlation of X with Y is always less than or equal to that of the partial correlation of X with Y.
#' \cr MLR p = ppcor::pcor.test ~=(>?) p.partial = (y.residual, x.residual)
#' \cr ppcor::spcor.test ~=(>?) p.spartial = (y, x.residual)
#' \cr ? might be larger, because after residualized, no covar, so dof is larger ?
#' \cr
#' \cr the stdbeta, p(.lm), p.lmrob etc in result data frame refer to stdbeta, p value for x in MLR, which are plotted when plot=T. the bestp is also selected based on this p value
#' \cr
#' \cr According to my own demo (see examples iris), MLR p is the same as p values from ppcor::pcor.test and SPSS Analyze->Correlate->Partial! also very close to the p value from manual calculation with correlation(y.residual, x.residual). partial r the same in all cases.
#' \cr
#' \cr the r.spartial, p.spartial refer to semi-partial correlation (r.spartial is the same as the ppcor::spcor.test result, p.spartial very close to the ppcor::spcor.test result).
#' \cr
#' \cr no column named r, r(.lm), r.lmrob etc in the result data frame
#' \cr r2.rlm or r.spartial.rlm are NA, but p values are available, because I do not know how to get them from rlm yet
#' \cr
#' @note To keep consistent with other R functions (eg, lm which converts numeric/non-numeric factor to values starting from 0), set start.at=0 in ez.2value(), then factor(1:2)->c(0,1), factor(c('girl','boy'))->c(1,0) # the level order is boy,girl
#' \cr in lm() the coding (0,1) vs.(1,2) does not affect slope, but changes intercept (but a coding from 1,2->1,3 would change slope--interval difference matters)
#' @examples
#' y=ez.zresidize(iris,'Sepal.Length','Petal.Width',scale=F)
#' x=ez.zresidize(iris,'Sepal.Width','Petal.Width',scale=F)
#' z=data.frame(Sepal.Length=y$Sepal.Length,Sepal.Width=x$Sepal.Width)
#' ez.lms(z,'Sepal.Length','Sepal.Width')
#' ppcor::pcor.test(iris$Sepal.Length,iris$Sepal.Width,iris$Petal.Width)
#' ppcor::spcor.test(iris$Sepal.Length,iris$Sepal.Width,iris$Petal.Width)
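#'
#' # a quick demo that for simple regression the standardized beta equals Pearson r:
#' coef(lm(scale(iris$Sepal.Length) ~ scale(iris$Petal.Length)))
#' cor(iris$Sepal.Length, iris$Petal.Length)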
#'
#' y = c(1,2,3,4,5,6)
#' x = c(2,4,15,20,25,36)
#' z=factor(c('m','m','m','f','f','f'))
#'
#' lm(y~x+z) %>% summary
#' lm(y~x+ez.2value(z)) %>% summary
#' lm(y~x+ez.2value(z,start.at=2)) %>% summary
#' # same beta, p but diff Intercept
#'
#'
#' y = c(1,2,3,4,5,6)
#' x = c(2,4,15,20,25,36)
#' f=factor(c('m','m','u','u','f','f'))
#' lm(y~x+f) %>% summary
#' lm(y~x+ez.2value(f)) %>% summary
#' lm(y~x+ez.2value(f,start.at=2)) %>% summary
#' # the first is diff from the rest, changes p value
#'
#'
#' y = c(1,2,3,4,5,6)
#' x = c(2,4,15,20,25,36)
#' z=c(0,0,0,1,1,1)
#' zz=c(-1,-1,-1,1,1,1)
#' zzz=c(-3,-3,-3,1,1,1)
#' lm(y~x+z) %>% summary
#' lm(y~x+scale(z)) %>% summary
#' lm(y~x+zz) %>% summary
#' lm(y~x+zzz) %>% summary()
#' # again, scale and different coding strategies only change the intercept and beta (that is the purpose!), but not p
#' @importFrom robust lmRob
#' @importFrom robustbase lmrob lmrob.control
#' @importFrom sfsmisc f.robftest
#' @export
ez.lms = function(df,y,x,covar=NULL,by=NULL,report=T,model=c('lm', 'lmrob', 'lmRob', 'rlm'),view=F,plot=F,pmethods=c('bonferroni','fdr'),cols=3,point.size=10,point.shape=16,lab.size=18,text.size=16,error=T,pe=F,...) {
    # yet another patch for cmd input
    if (grepl('~',y[1],fixed=TRUE)){
        # peel the onion
        if (grepl('|',y,fixed=TRUE)) {
            tmp = strsplit(ez.trim(y),'|',fixed=TRUE)[[1]]
            by = ez.trim(tmp[length(tmp)])
            y = tmp[1]
        }

        if (grepl('+',y,fixed=TRUE)) {
            tmp = strsplit(ez.trim(y),"+",fixed=TRUE)[[1]]
            covar = ez.trim(tmp[2:length(tmp)])
            y = tmp[1]
        }

        tmp = strsplit(ez.trim(y),"~",fixed=TRUE)[[1]]
        y = ez.trim(tmp[1]); x = ez.trim(tmp[2])
    }

    y=ez.selcol(df,y); x=ez.selcol(df,x); if (!is.null(covar)) covar=ez.selcol(df,covar); if (!is.null(by)) by=ez.selcol(df,by)

    # another patch to handle by groups
    if (!is.null(by)){
        # for all and for groups, assuming no factor level named ALL
        tmp = df; tmp[[by]] = factor('ALL'); df = base::rbind(tmp,df)
        ez.print( sprintf('%s ...', toString(levels(ez.factorelevel(df[[by]])))) )
        out = lapply(base::split(df, ez.factorelevel(df[[by]]), drop=TRUE), ez.lms, y=y,x=x,covar=covar,by=NULL,report=report,model=model,view=view,plot=plot,pmethods=pmethods,cols=cols,point.size=point.size,point.shape=point.shape,lab.size=lab.size,text.size=text.size,error=error, ...)
        return(invisible(out))
    }

    model = unique(c('lm',model)) # always include lm
    bt = trellis.par.get("fontsize")$text ; bp = trellis.par.get("fontsize")$points
    text.size = text.size/bt ; title.size = (lab.size+2)/bt ; lab.size = lab.size/bt ; point.size = point.size/bp

    # patch to handle multiple y, multiple x
    if (length(y)>1 & length(x)>1) {
        xlist = list(); plist = list()
        for (yy in y) {
            # plot = F; no need for separate plotlist
            result = ez.lms(df,yy,x,covar,report=report,model=model,by=by,view=F,plot=F,pmethods=pmethods,error=error,...)
            xlist[[yy]] = result

            result.plot = result %>% ez.dropna('p',print2scr=F)
            if (plot & nrow(result.plot)>0) {
                bonferroniP = -log10(0.05/length(result.plot[['p']]))
                plist[[yy]] = lattice::xyplot(-log10(result.plot$p) ~ result.plot$stdbeta,
                   xlab = list("Standardized Coefficient", cex=lab.size, fontfamily=RMN),
                   ylab = list("-log10(p-Value)", cex=lab.size, fontfamily=RMN),
                   scales = list( x=list(cex=text.size, fontfamily=RMN), y=list(cex=text.size, fontfamily=RMN) ),
                   type = "p", pch=point.shape, cex=point.size,
                   main = list(yy, cex=title.size, fontfamily=RMN),
                   col = "#e69f00",
                   ylim=c(-0.5,max(c(bonferroniP,-log10(result.plot$p)))+0.5),
                   abline=list(h=c(bonferroniP,-log10(0.05)),lty=2,lwd=2,col=c('black','darkgrey'))
                )
            }
        }
        if (plot & length(plist)>0) {if (is.null(cols)) {cols=floor(sqrt(length(plist)))}; gridExtra::grid.arrange(grobs=plist,ncol=cols)}
        return(invisible(xlist))
    }

    # processing lm to return something (like r n p)
    # @description processing lm to return something (like r n p)
    # @param y compatible with ez.selcol
    # @param x compatible with ez.selcol
    # @param covar optional, compatible with ez.selcol
    # @param df data frame
    # @param model vector c('lm', 'lmrob', 'lmRob', 'rlm')  robustbase::lmrob--MM-type Estimators; robust::lmRob--automatically chooses an appropriate algorithm
    # @param ... additional parameters to the specified model. if more than one model specified, ... may not be OK for all models, because diff models have diff parameters
    # @return a data frame
    # @export
    getStats = function(y,x,covar,df,model=c('lm', 'lmrob', 'lmRob', 'rlm'),...){
        # drop na before scale, although all lm models below deal with na.action
        # https://stats.stackexchange.com/a/11028/100493
        # default, if not specified, na.action from options('na.action')
        # otherwise scale results would be different
        # if covar NULL, c() auto gets rid of it
        df = df[, c(y,x,covar), drop=F]
        df = ez.dropna(df,print2scr=F)
        # also convert factor to value
        for (vv in c(y,x,covar)){
            # nonfactor nlevels returns 0
            if (nlevels(df[[vv]])>2) ez.pprint(sprintf('col %s has >=3 factor levels, converting to number via ez.2value... dummy coding instead?', vv))
            df[[vv]]=ez.2value(df[[vv]])
        }
        df.bak = df
        df[] = lapply(df, ez.scale)
        # for one model
        get.stats = function(y,x,covar,df,model,...){
            tryCatch(suppressWarnings({
            set.seed(20190117)
            # na.exclude better than na.omit. extractor functions like residuals() or fitted() will pad their
            # output with NAs for the omitted cases with na.exclude, thus having an output of the same length as
            # the input variables.
            na.action=na.exclude
            form = as.formula( paste0(y, " ~ ", paste(c(x,covar), collapse = " + ")) )
            if (model=='lm') m = stats::lm(form,df,na.action=na.action,...)
            # MM-type Estimators
            # for some reason, robustbase::lmrob.control() cannot take seed as a single value, like seed=1313
            if (model=='lmrob') m = suppressWarnings(robustbase::lmrob(form,df,control=robustbase::lmrob.control(max.it=500,maxit.scale=500),na.action=na.action,...))
            # lmRob function automatically chooses an appropriate algorithm to compute a final robust estimate
            # with high breakdown point and high efficiency
            if (model=='lmRob') m = suppressWarnings(robust::lmRob(form,df,control=robust::lmRob.control(seed=1313,mxr=500,mxf=500,mxs=500,trace=FALSE),na.action=na.action,...))
            # increased maxit from 20, because sometimes, rlm fails
            # suppress 'rlm' failed to converge in xx steps
            if (model=='rlm') m = suppressWarnings(MASS::rlm(form,df,maxit=500,na.action=na.action,...))
            # can only use resid, robust lms do not support stats::rstandard

            sm = summary(m)
            ## n = m$df.residual + 2 # (2 is X + 1), not considering covariates
            # better way to calculate n, considering missing values excluded
            n = nrow(m$model)
            # for rlm, r2 is NA
            r2 = sm$r.squared
            stdbeta = sm$coefficients[2,1]
            # I have seen people use the anova thing which might be for the whole model (kinda F test)
            # For individual predictor, use the p value from summary() --I think, jerry
            # coef(m) the same as sm$coefficients
            if (!'rlm' %in% attr(m,"class",exact=T)){
                p = sm$coefficients[2,4]
                dof = m$df.residual
            } else {
                # https://stats.stackexchange.com/a/205615/100493
                p = sfsmisc::f.robftest(m,var=x)$p.value  # var Either their names or their indices
                dof = sm$df[2]
            }

            # residualized
            if (is.null(covar)) {
                # for rlm, r returns NA
                r.spartial = sign(sm$coefficients[2,1])*sqrt(sm$r.squared)
                p.spartial = p
                r.partial = r.spartial
                p.partial = p.spartial
            } else {
                # this is semi-partial (residualize x) as would be done in multiple regression
                df.spartial = ez.zresidize(df, x, covar, model=model, scale=TRUE)
                tmp = get.stats(y=y, x=x, covar=NULL, df=df.spartial, model=model, ...)
                r.spartial = tmp$r.spartial
                p.spartial = tmp$p.spartial

                df.partial = ez.zresidize(df, c(y,x), covar, model=model, scale=TRUE)
                tmp = get.stats(y=y, x=x, covar=NULL, df=df.partial, model=model, ...)
                r.partial = tmp$r.partial
                p.partial = tmp$p.partial
            }

            # df.bak with dropna, but not scaled yet, so na.rm needed
            uniques_drop_na.x=length(unique(df.bak[[x]])); min.x=min(df.bak[[x]]); max.x=max(df.bak[[x]]); mean.x=mean(df.bak[[x]]); sd.x=sd(df.bak[[x]])
            uniques_drop_na.y=length(unique(df.bak[[y]])); min.y=min(df.bak[[y]]); max.y=max(df.bak[[y]]); mean.y=mean(df.bak[[y]]); sd.y=sd(df.bak[[y]])

            # toString(NULL) -> ''
            return(list(y=y, x=x, covar=toString(covar), n=n, dof=dof, r2=r2, stdbeta=stdbeta, p=p, r.spartial=r.spartial, p.spartial=p.spartial, r.partial=r.partial, p.partial=p.partial,
                uniques_drop_na.x=uniques_drop_na.x,min.x=min.x,max.x=max.x,mean.x=mean.x,sd.x=sd.x,
                uniques_drop_na.y=uniques_drop_na.y,min.y=min.y,max.y=max.y,mean.y=mean.y,sd.y=sd.y
                ))
            }), error=function(e){
                if (error) ez.pprint(sprintf('EZ Error: %s(%s ~ %s). NA returned.',model,y,paste(c(x,covar), collapse = " + ")),color='red')
                return(list(y=y, x=x, covar=toString(covar), n=NA, dof=NA, r2=NA, stdbeta=NA, p=NA, r.spartial=NA, p.spartial=NA, r.partial=NA, p.partial=NA,
                    uniques_drop_na.x=NA,min.x=NA,max.x=NA,mean.x=NA,sd.x=NA,
                    uniques_drop_na.y=NA,min.y=NA,max.y=NA,mean.y=NA,sd.y=NA
                    ))
            }) # end try catch
        }

        out = mapply(get.stats,model=model,MoreArgs=list(y=y, x=x, covar=covar, df=df,...))
        out = data.frame(t(out))
        out[] = lapply(out,unlist)
        out = tibble::rownames_to_column(out)
        out['bestp'] = out$rowname[which.min(out$p)]
        out = ez.2wide(out,'bestp','rowname',c('n', 'dof', 'r2', 'stdbeta', 'p', 'r.spartial', 'p.spartial', 'r.partial', 'p.partial'),sep='.')
        out = ez.clcolnames(out, '\\.lm$','')
        # UA=c('c','a','b'); UB=c('b','c','d'); # desired output: c('b','c','a')
        AmatchlikeB = function(UA,UB){return(c(UB[UB %in% UA], UA[!(UA %in% UB)]))}
        desiredOrd = c('orindex', 'y', 'ylbl', 'x', 'xlbl', 'covar', 'bestp', 'n', 'n.lmrob', 'n.lmRob', 'n.rlm',
                        'stdbeta', 'stdbeta.lmrob', 'stdbeta.lmRob', 'stdbeta.rlm', 'p', 'p.lmrob', 'p.lmRob', 'p.rlm',
                        'r2', 'r2.lmrob', 'r2.lmRob', 'r2.rlm', 'r.spartial', 'r.spartial.lmrob', 'r.spartial.lmRob',
                        'r.spartial.rlm', 'p.spartial', 'p.spartial.lmrob', 'p.spartial.lmRob', 'p.spartial.rlm',
                        'r.partial', 'r.partial.lmrob', 'r.partial.lmRob',
                        'r.partial.rlm', 'p.partial', 'p.partial.lmrob', 'p.partial.lmRob', 'p.partial.rlm',
                        'uniques_drop_na.x', 'min.x', 'max.x', 'mean.x', 'sd.x', 'uniques_drop_na.y', 'min.y', 'max.y',
                        'mean.y', 'sd.y', 'dof', 'dof.lmrob', 'dof.lmRob', 'dof.rlm', 'bonferroni', 'fdr')
        newOrd = AmatchlikeB(names(out), desiredOrd)
        out = out[,newOrd]
        return(out)
    }

    result = mapply(getStats,y=y,x=x,MoreArgs=list(covar=covar, df=df, model=model,...),USE.NAMES=F)
    result = data.frame(t(result))
    result[] = lapply(result,unlist)

    for (method in pmethods) {
        # only adjust for non-na p values
        ind = which(!is.na(result[['p']]))
        result[[method]][ind]=stats::p.adjust(result[['p']][ind],method=method)
    }

    ylbl = ez.label.get(df,result$y); xlbl = ez.label.get(df,result$x)
    if (is.null(ylbl)) {ylbl=''}; if (is.null(xlbl)) {xlbl=''}; result$ylbl=rep(ylbl,nrow(result)); result$xlbl=rep(xlbl,nrow(result))
    if (nrow(result)==0){result$orindex=integer(0)} else {result$orindex=1:nrow(result)}
    result = ez.move(result,'orindex first; ylbl after y; xlbl after x') %>% dplyr::arrange(p)

    if (view) {View(result)}

    # todo: format as APA
    result.report = result %>% ez.dropna('p',print2scr=F) %>% dplyr::arrange(orindex)
    if (report & nrow(result.report)>0) {
        # ez.pprint('>>>>>>')
        for (i in 1:nrow(result.report)){
            Y = result.report$y[i]; X = paste(c(result.report$x[i],covar),collapse="+")
            if (!is.null(ez.selcol(result.report,'starts_with("p.spartial.")'))){
                robustp = result.report[i,ez.selcol(result.report,'starts_with("p.partial.")')] %>% ez.p.apa(prefix=0,pe=pe) %>% toString()
                robustsp = result.report[i,ez.selcol(result.report,'starts_with("p.spartial.")')] %>% ez.p.apa(prefix=0,pe=pe) %>% toString()
                ez.pprint(sprintf('lm(%s~%s): n = %d, P r = %.2f, %s, %s; SP r = %.2f, %s, %s', Y,X,result.report$n[i],
                    result.report$r.partial[i],ez.p.apa(result.report$p.partial[i],prefix=2,pe=pe),robustp,
                    result.report$r.spartial[i],ez.p.apa(result.report$p.spartial[i],prefix=2,pe=pe),robustsp),color='cyan')
            } else {
                ez.pprint(sprintf('lm(%s~%s): n = %d, P r = %.2f, %s; SP r = %.2f, %s', Y,X,result.report$n[i],
                    result.report$r.partial[i],ez.p.apa(result.report$p.partial[i],prefix=2,pe=pe),
                    result.report$r.spartial[i],ez.p.apa(result.report$p.spartial[i],prefix=2,pe=pe)),color='cyan')
            }
        }
        # ez.pprint('<<<<<<')
    }

    result.plot = result %>% ez.dropna('p',print2scr=F)
    if (plot & nrow(result.plot)>0) {
        bonferroniP = -log10(0.05/length(result.plot[['p']]))
        pp=lattice::xyplot(-log10(result.plot$p) ~ result.plot$stdbeta,
               xlab = list("Standardized Coefficient", cex=lab.size, fontfamily=RMN),
               ylab = list("-log10(p-Value)", cex=lab.size, fontfamily=RMN),
               scales = list( x=list(cex=text.size, fontfamily=RMN), y=list(cex=text.size, fontfamily=RMN) ),
               type = "p", pch=point.shape, cex=point.size,
               main = list(ifelse((length(y)>=1 & length(x)==1),x,y), cex=title.size, fontfamily=RMN),
               col = "#e69f00",
               ylim=c(-0.5,max(c(bonferroniP,-log10(result.plot$p)))+0.5),
               abline=list(h=c(bonferroniP,-log10(0.05)),lty=2,lwd=2,col=c('black','darkgrey'))
        )
        print(pp)
    }

    return(invisible(result))
}
#' @rdname ez.lms
#' @export
ez.regressions=ez.lms

#' glm(y~x+covar,family=binomial), for many y and/or many x
#' @description glm(y~x+covar,family=binomial), for many y and/or many x
#' @param df a data frame. Internally go through dropna (no ez.2value, scale)
#' \cr glm can handle X factor by default (dummy coding), but not Y factor (you can't have factor/categorical response variables); see below
#' \cr I do not do standardization here, because
#' \cr While standardized coefficients in classic linear regression are well-defined,
#' \cr logistic regression, like other generalized linear models, present additional complexity as a result of
#' \cr the non-linear link function (logit), and non-normal error function (binomial).
#' \cr https://think-lab.github.io/d/205/
#' @param y compatible with \code{\link{ez.selcol}}, cannot be a factor; has to be 0/1
#' @param x compatible with \code{\link{ez.selcol}}, factors are auto dummy-coded
#' @param covar NULL=no covar, compatible with \code{\link{ez.selcol}}
#' @param report print results (in APA format)
#' @param view call View(result)
#' @param pmethods c('bonferroni','fdr'), type p.adjust.methods for all methods. This correction applies to all possible tests that have been/could be done.
#' @param plot T/F, the black dashed line is bonferroni p = 0.05 (again, only for tests with a non-NA p value), the grey dashed line is uncorrected p = 0.05
#' @param cols number of columns for multiplot. NULL=auto calculate
#' @param error whether show error message when error occurs
#' @return an invisible data frame or list of data frame (if many y and many x)
#' \cr odds_ratio: odds ratio = exp(b); a one-unit increase in x multiplies the odds of y being 1 (vs. 0) by OR
#' \cr
#' \cr dof: residual degrees of freedom
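#' @examples
#' # a minimal sketch with mtcars (illustrative only); y must be 0/1:
#' # ez.logistics(mtcars, 'am', 'mpg', covar='wt')
#' # the reported odds ratio is exp(b) for the x coefficient:
#' # exp(coef(glm(am ~ mpg + wt, mtcars, family=binomial))['mpg'])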
#' @export
ez.logistics = function(df,y,x,covar=NULL,report=T,view=F,plot=F,pmethods=c('bonferroni','fdr'),cols=3,point.size=10,point.shape=16,lab.size=18,text.size=16,error=T,pe=F,...) {
    y=ez.selcol(df,y); x=ez.selcol(df,x); if (!is.null(covar)) covar=ez.selcol(df,covar)
    bt = trellis.par.get("fontsize")$text ; bp = trellis.par.get("fontsize")$points
    text.size = text.size/bt ; title.size = (lab.size+2)/bt ; lab.size = lab.size/bt ; point.size = point.size/bp

    # patch to handle multiple y, multiple x
    if (length(y)>1 & length(x)>1) {
        xlist = list(); plist = list()
        for (yy in y) {
            # plot = F; no need for separate plotlist
            result = ez.logistics(df,yy,x,covar=covar,report=report,view=F,plot=F,pmethods=pmethods,error=error,...)
            xlist[[yy]] = result

            result.plot = result %>% ez.dropna('p',print2scr=F)
            if (plot & nrow(result.plot)>0) {
                bonferroniP = -log10(0.05/length(result.plot[['p']]))
                plist[[yy]] = lattice::xyplot(-log10(result.plot$p) ~ log2(result.plot$odds_ratio),
                   xlab = list("log2(Odds Ratio)", cex=lab.size, fontfamily=RMN),
                   ylab = list("-log10(p-Value)", cex=lab.size, fontfamily=RMN),
                   scales = list( x=list(cex=text.size, fontfamily=RMN), y=list(cex=text.size, fontfamily=RMN) ),
                   type = "p", pch=point.shape, cex=point.size,
                   main = list(yy, cex=title.size, fontfamily=RMN),
                   col = "#e69f00",
                   ylim=c(-0.5,max(c(bonferroniP,-log10(result.plot$p)))+0.5),
                   abline=list(h=c(bonferroniP,-log10(0.05)),lty=2,lwd=2,col=c('black','darkgrey'))
                )
            }
        }
        if (plot & length(plist)>0) {if (is.null(cols)) {cols=floor(sqrt(length(plist)))}; gridExtra::grid.arrange(grobs=plist,ncol=cols)}
        return(invisible(xlist))
    }

    getStats = function(y,x,covar,df,...){
        df = df[, c(y,x,covar), drop=F]
        df = ez.dropna(df,print2scr=F)
        df.bak = df
        tryCatch({
        set.seed(20190117)   # may not be needed
        na.action=na.exclude
        form = as.formula( paste0(y, " ~ ", paste(c(x,covar), collapse = " + ")) )
        m = glm(form,df,na.action=na.action,family=binomial(link="logit"),...)
        sm = summary(m)
        p = sm$coefficients[2,4]
        odds_ratio = exp(sm$coefficients[2,1])
        dof = sm$df[2]

        # df.bak with dropna, so na.rm needed
        # y is all 0/1, but these stats are still ok to compute
        uniques_drop_na.x=length(unique(df.bak[[x]])); min.x=min(df.bak[[x]]); max.x=max(df.bak[[x]]); mean.x=mean(df.bak[[x]]); sd.x=sd(df.bak[[x]])
        uniques_drop_na.y=length(unique(df.bak[[y]])); min.y=min(df.bak[[y]]); max.y=max(df.bak[[y]]); mean.y=mean(df.bak[[y]]); sd.y=sd(df.bak[[y]])

        return(list(y=y,x=x,covar=toString(covar),p=p,odds_ratio=odds_ratio,dof=dof,
            uniques_drop_na.x=uniques_drop_na.x,min.x=min.x,max.x=max.x,mean.x=mean.x,sd.x=sd.x,
            uniques_drop_na.y=uniques_drop_na.y,min.y=min.y,max.y=max.y,mean.y=mean.y,sd.y=sd.y
            ))
        }, error = function(e) {
            if (error) ez.pprint(sprintf('EZ Error: glm(%s ~ %s). NA returned.',y,paste(c(x,covar), collapse = " + ")),color='red')
            return(list(y=y,x=x,covar=toString(covar),p=NA,odds_ratio=NA,dof=NA,
                uniques_drop_na.x=NA,min.x=NA,max.x=NA,mean.x=NA,sd.x=NA,
                uniques_drop_na.y=NA,min.y=NA,max.y=NA,mean.y=NA,sd.y=NA
                ))
        }) # end try catch
    }

    result = mapply(getStats,y=y,x=x,MoreArgs=list(covar=covar, df=df, ...),USE.NAMES=F)
    result = data.frame(t(result))
    result[] = lapply(result,unlist)

    for (method in pmethods) {
        # per the docs, the correction counts all possible tests (NA p values included in n)
        result[[method]]=stats::p.adjust(result[['p']],method=method)
    }
    ylbl = ez.label.get(df,result$y); xlbl = ez.label.get(df,result$x)
    if (is.null(ylbl)) {ylbl=''}; if (is.null(xlbl)) {xlbl=''}; result$ylbl=rep(ylbl,nrow(result)); result$xlbl=rep(xlbl,nrow(result))
    if (nrow(result)==0){result$orindex=integer(0)} else {result$orindex=1:nrow(result)}
    result = ez.move(result,'orindex first; ylbl after y; xlbl after x') %>% dplyr::arrange(p)

    if (view) {View(result)}

    # todo: format as APA
    result.report = result %>% ez.dropna('p',print2scr=F) %>% dplyr::arrange(orindex)
    if (report & nrow(result.report)>0) {
        # ez.pprint('>>>>>>')
        for (i in 1:nrow(result.report)){
            Y = result.report$y[i]; X = paste(c(result.report$x[i],covar),collapse="+")
            ez.pprint(sprintf('glm(%s~%s): OR = %.2f, %s', Y,X,result.report$odds_ratio[i],ez.p.apa(result.report$p[i],prefix=2,pe=pe)),color='cyan')
        }
        # ez.pprint('<<<<<<')
    }

    result.plot = result %>% ez.dropna('p',print2scr=F)
    if (plot & nrow(result.plot)>0) {
        bonferroniP = -log10(0.05/length(result.plot[['p']]))
        pp=lattice::xyplot(-log10(result.plot$p) ~ log2(result.plot$odds_ratio),
               xlab = list("log2(Odds Ratio)", cex=lab.size, fontfamily=RMN),
               ylab = list("-log10(p-Value)", cex=lab.size, fontfamily=RMN),
               scales = list( x=list(cex=text.size, fontfamily=RMN), y=list(cex=text.size, fontfamily=RMN) ),
               type = "p", pch=point.shape, cex=point.size,
               main = list(ifelse((length(y)>=1 & length(x)==1),x,y), cex=title.size, fontfamily=RMN),
               col = "#e69f00",
               ylim=c(-0.5,max(c(bonferroniP,-log10(result.plot$p)))+0.5),
               abline=list(h=c(bonferroniP,-log10(0.05)),lty=2,lwd=2,col=c('black','darkgrey'))
        )
        print(pp)
    }

    return(invisible(result))
}

#' fisher.test(y,x), for many y and/or many x
#' @description fisher.test(y,x), for many y and/or many x
#' @param df a data frame, internally go through dropna --> ez.2factor
#' @param y compatible with \code{\link{ez.selcol}}
#' @param x compatible with \code{\link{ez.selcol}}
#' @param report print results (in APA format)
#' @param view call View(result)
#' @param pmethods c('bonferroni','fdr'), type p.adjust.methods for all methods. This correction applies to all possible tests that have been/could be done.
#' @param compare For posthoc, see more \code{\link{fisher.posthoc}}. If "row", treats the rows as the grouping variable. If "column", treats the columns as the grouping variable.
#' @param plot T/F, the black dashed line is bonferroni p = 0.05 (again, only for tests with a non-NA p value), the grey dashed line is uncorrected p = 0.05
#' @param cols number of columns for multiplot. NULL=auto calculate
#' @param width width for toString(counts,width=width)
#' @param error whether show error message when error occurs
#' @return an invisible data frame or list of data frame (if many y and many x)
#' @note odds ratio only exists for a 2x2 table, otherwise NA (arbitrarily assigned by jerry)
#' \cr also computes chisq.test; however, whenever any expected cell count (not the observed frequency) is less than 5, fisher.test is preferred.
#' \cr fisher.test() does not produce a test statistic, but SPSS does (termed as D(x), or FI(x), see p 151 of IBM SPSS Exact Tests)
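#' @examples
#' # a minimal sketch with mtcars (illustrative only):
#' # ez.fishers(mtcars, 'am', 'cyl')             # 2 x 3 table, so odds_ratio is NA
#' # fisher.test(table(mtcars$am, mtcars$cyl))   # the underlying test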
#' @export
ez.fishers = function(df,y,x,report=T,view=F,plot=F,pmethods=c('bonferroni','fdr'),compare='col',cols=3,lab.size=18,text.size=16,width=300,error=T,pe=F,...) {
    y=ez.selcol(df,y); x=ez.selcol(df,x)
    bt = trellis.par.get("fontsize")$text ; bp = trellis.par.get("fontsize")$points
    text.size = text.size/bt ; title.size = (lab.size+2)/bt ; lab.size = lab.size/bt

    # patch to handle multiple y, multiple x
    if (length(y)>1 & length(x)>1) {
        xlist = list(); plist = list()
        for (xx in x) {
            # plot = F; no need for separate plotlist
            result = ez.fishers(df,y,xx,report=report,view=view,plot=F,pmethods=pmethods,compare=compare,cols=cols,width=width,error=error,pe=pe,...)
            result = result %>% ez.dropna('p',print2scr=F)
            xlist[[xx]] = result

            result.plot = result %>% ez.dropna('p',print2scr=F)
            if (plot & nrow(result.plot)>0) {
                bonferroniP = -log10(0.05/length(result.plot[['p']]))
                plist[[xx]] = lattice::barchart(-log10(result.plot$p) ~ result.plot$y,
                   xlab = list("Variable", cex=lab.size, fontfamily=RMN),
                   ylab = list("-log10(p-Value)", cex=lab.size, fontfamily=RMN),
                   scales = list( x=list(cex=text.size, fontfamily=RMN), y=list(cex=text.size, fontfamily=RMN) ),
                   main = list(xx, cex=title.size, fontfamily=RMN),
                   col = "#e69f00",
                   ylim=c(-0.5,max(c(bonferroniP,-log10(result.plot$p)))+0.5),
                   panel=function(x,y,...){
                       panel.barchart(x,y,...)
                       panel.abline(h=bonferroniP,col.line="black",lty=2,lwd=2)
                       panel.abline(h=-log10(0.05),col.line="darkgrey",lty=2,lwd=2)}
                )
            }
        }
        if (plot & length(plist)>0) {if (is.null(cols)) {cols=floor(sqrt(length(plist)))}; gridExtra::grid.arrange(grobs=plist,ncol=cols)}
        return(invisible(xlist))
    }

    getStats = function(y,x,df,compare,...){
        df = df[, c(y,x), drop=F]
        df = ez.dropna(df,print2scr=F)
        df = ez.2factor(df)  # do ez.factorelevel also internally
        tryCatch({
        m = fisher.test(df[[y]],df[[x]],...) # by default, pairwise NA auto removed
        p = m$p.value
        # OR only exist for 2x2 table
        odds_ratio = if (is.null(m$estimate)) NA else m$estimate

        countTable = table(df[[y]],df[[x]])   # by default, pairwise NA auto removed
        rcategory = row.names(countTable); ccategory = colnames(countTable)
        counts1 = paste(ccategory, colSums(countTable), sep = ', ', collapse='; ')
        counts2 = paste0('(',toString(ccategory),')')
        for (i in 1:nrow(countTable)){
            counts2 = paste0(counts2, ' | ', rcategory[i], ': ', paste0(countTable[i,],collapse='\t'))
        }
        counts3 = paste0('(',toString(rcategory),')')
        for (i in 1:ncol(countTable)){
            counts3 = paste0(counts3, ' | ', ccategory[i], ': ', paste0(countTable[,i],collapse='/'))
        }
        counts = toString(paste0('(',counts1,').','\n', counts2,'\n',counts3),width=width)
        total = sum(countTable)

        mm = suppressWarnings(chisq.test(df[[y]],df[[x]],...))
        chisq = mm$statistic
        p.chisq = mm$p.value

        ph = fisher.posthoc(table(df[[y]],df[[x]]),
            compare = compare, fisher = TRUE, gtest = FALSE, chisq = TRUE,
            method = "fdr", correct = "none", cramer = FALSE, digits = 8)
        posthoc_fisher = paste0('(',ph$Comparison,') ', ez.p.apa(ph$p.adj.Fisher,prefix=2,pe=pe), ', ', ez.p.apa(ph$p.adj.Chisq,prefix=0,pe=pe), collapse='; ')

        out = list(y=y,x=x,p=p,p.chisq=p.chisq,chisq=chisq,odds_ratio=odds_ratio,counts=counts,total=total,posthoc_fisher=posthoc_fisher)
        return(out)
        }, error = function(e) {
            if (error) ez.pprint(sprintf('EZ Error: fisher.test(%s, %s). NA returned.',y,x),color='red')
            return(list(y=y,x=x,p=NA,p.chisq=NA,chisq=NA,odds_ratio=NA,counts=NA,total=NA,posthoc_fisher=NA))
        }) # end try catch
    }

    result = mapply(getStats,y=y,x=x,MoreArgs=list(df=df,compare=compare,...),USE.NAMES=F)
    result = data.frame(t(result))
    result[] = lapply(result,unlist)

    for (method in pmethods) {
        # per the docs, the correction counts all possible tests (NA p values included in n)
        result[[method]]=stats::p.adjust(result[['p']],method=method)
    }
    ylbl = ez.label.get(df,result$y); xlbl = ez.label.get(df,result$x)
    if (is.null(ylbl)) {ylbl=''}; if (is.null(xlbl)) {xlbl=''}; result$ylbl=rep(ylbl,nrow(result)); result$xlbl=rep(xlbl,nrow(result))
    if (nrow(result)==0){result$orindex=integer(0)} else {result$orindex=1:nrow(result)}
    result = ez.move(result,'orindex first; ylbl after y; xlbl after x') %>% dplyr::arrange(p)

    if (view) {View(result)}

    # todo: format as APA
    result.report = result %>% ez.dropna('p',print2scr=F) %>% dplyr::arrange(orindex)
    if (report & nrow(result.report)>0) {
        # ez.pprint('>>>>>>')
        for (i in 1:nrow(result.report)){
            # sprintf('%.2e',NA) OK
            ez.pprint(sprintf('fisher.test(%s,%s): OR = %.2e\t%s\t\tX2 = %.2f\t%s', result.report$y[i],result.report$x[i],result.report$odds_ratio[i],ez.p.apa(result.report$p[i],prefix=2,pe=pe), result.report$chisq[i], ez.p.apa(result.report$p.chisq[i],prefix=0,pe=pe)),color='cyan')
        }

        for (i in 1:nrow(result.report)){
            ez.pprint(sprintf('FDR (fisher,chisq): %s', result.report$posthoc_fisher[i]),color='cyan')
        }

        for (i in 1:nrow(result.report)){
            ez.pprint(sprintf(' Data available for %d participants %s', result.report$total[i], result.report$counts[i]),color='cyan')
        }
        # ez.pprint('<<<<<<')
    }

    result.plot = result %>% ez.dropna('p',print2scr=F)
    if (plot & nrow(result.plot)>0) {
        bonferroniP = -log10(0.05/length(result.plot[['p']]))
        if (length(y)>=1 & length(x)==1) {
            pp=lattice::barchart(-log10(result.plot$p) ~ result.plot$y,
               xlab = list("Variable", cex=lab.size, fontfamily=RMN),
               ylab = list("-log10(p-Value)", cex=lab.size, fontfamily=RMN),
               scales = list( x=list(cex=text.size, fontfamily=RMN), y=list(cex=text.size, fontfamily=RMN) ),
               main = list(x, cex=title.size, fontfamily=RMN),
               col = "#e69f00",
               ylim=c(-0.5,max(c(bonferroniP,-log10(result.plot$p)))+0.5),
               panel=function(x,y,...){
                   panel.barchart(x,y,...)
                   panel.abline(h=bonferroniP,col.line="black",lty=2,lwd=2)
                   panel.abline(h=-log10(0.05),col.line="darkgrey",lty=2,lwd=2)}
            )
        } else {
            pp=lattice::barchart(-log10(result.plot$p) ~ result.plot$x,
               xlab = list("Variable", cex=lab.size, fontfamily=RMN),
               ylab = list("-log10(p-Value)", cex=lab.size, fontfamily=RMN),
               scales = list( x=list(cex=text.size, fontfamily=RMN), y=list(cex=text.size, fontfamily=RMN) ),
               main = list(y, cex=title.size, fontfamily=RMN),
               col = "#e69f00",
               ylim=c(-0.5,max(c(bonferroniP,-log10(result.plot$p)))+0.5),
               panel=function(x,y,...){
                   panel.barchart(x,y,...)
                   panel.abline(h=bonferroniP,col.line="black",lty=2,lwd=2)
                   panel.abline(h=-log10(0.05),col.line="darkgrey",lty=2,lwd=2)}
            )
        }
        print(pp)
    }

    return(invisible(result))
}

#' @title Pairwise tests of independence for nominal data
#'
#' @description Conducts pairwise tests for a 2-dimensional matrix,
#'              in which at least one dimension has more than two
#'              levels, as a post-hoc test. Conducts Fisher exact, Chi-square,
#'              or G-test.
#'
#' @param x A two-way contingency table. At least one dimension should have
#'          more than two levels.
#' @param compare If \code{"row"}, treats the rows as the grouping variable.
#'                If \code{"column"}, treats the columns as the grouping variable.
#' @param fisher  If \code{"TRUE"}, conducts Fisher exact test.
#' @param gtest   If \code{"TRUE"}, conducts G-test.
#' @param chisq   If \code{"TRUE"}, conducts Chi-square test of association.
#' @param method  The method to adjust multiple p-values.
#'                See \code{stats::p.adjust}.
#' @param correct The correction method to pass to \code{DescTools::GTest}.
#' @param cramer If \code{"TRUE"}, includes an effect size, Cramer's V, in the
#'               output.
#'               --jerry: not supported here, because cramerV() from the rcompanion package is not included in this function
#' @param digits The number of significant digits in the output.
#' @param ... Additional arguments, passed to \code{stats::fisher.test},
#'            \code{DescTools::GTest}, or \code{stats::chisq.test}.
#'
#' @author Salvatore Mangiafico, \email{mangiafico@njaes.rutgers.edu}
#' @references \url{http://rcompanion.org/handbook/H_04.html}
#' @concept Chi-square G-test Fisher contingency table nominal
#' @return A data frame of comparisons, p-values, and adjusted p-values.
#'
#' @seealso \code{\link{pairwiseMcnemar}}, \code{\link{groupwiseCMH}},
#'           \code{\link{nominalSymmetryTest}},
#'           \code{\link{pairwiseNominalMatrix}}
#'
#' @examples
#' # posthoc for fisher, essentially fisher and then apply fdr correction
#' # pairwiseNominalIndependence from rcompanion
#' # nbk %>% ez.factorelevel(c('dx','race'))->tmp
#' # table(tmp$dx,tmp$race) %>% fisher.posthoc()
#' # note: table(tmp$race,tmp$dx) %>% fisher.posthoc() is different
#' # also similar function from RVAideMemoire::fisher.multcomp
#' # but this function can do all (fisher, chisq and g-test)
#'
#' ### Independence test for a 4 x 2 matrix
#' data(Anderson)
#' fisher.test(Anderson)
#' Anderson = Anderson[(c("Heimlich", "Bloom", "Dougal", "Cobblestone")),]
#' PT = fisher.posthoc(Anderson,
#'                                  fisher = TRUE,
#'                                  gtest  = FALSE,
#'                                  chisq  = FALSE,
#'                                  cramer = TRUE)
#' PT
#' cldList(comparison = PT$Comparison,
#'         p.value    = PT$p.adj.Fisher,
#'         threshold  = 0.05)
#'
#'
#' @importFrom DescTools GTest
#' @export
fisher.posthoc = function(x, compare="row",
                          fisher=TRUE, gtest=TRUE, chisq=TRUE,
                          method="fdr", correct="none", cramer=FALSE, digits=3, ...) {
    if (compare=="row") {n = nrow(x)}
    if (compare=="column" | compare=="col") {n = ncol(x)}
    N = n*(n-1)/2
    Z = data.frame(Comparison=rep("A", N), stringsAsFactors=FALSE)
    p.Fisher = rep(NA, N)
    p.Gtest = rep(NA, N)
    p.Chisq = rep(NA, N)
    Cramer = rep(NA, N)

    k = 0
    for (i in 1:(n-1)) {
        for (j in (i+1):n) {
            k = k + 1
            if (compare=="row") {
                Namea = as.character(rownames(x)[i])
                Nameb = as.character(rownames(x)[j])
                Dataz = matrix(c(x[i,],x[j,]), nrow=2, byrow=TRUE)
            }
            if (compare=="column" | compare=="col") {
                Namea = as.character(colnames(x)[i])
                Nameb = as.character(colnames(x)[j])
                Dataz = matrix(c(x[,i],x[,j]), ncol=2, byrow=FALSE)
            }
            Z$Comparison[k] = paste0(Namea, " : ", Nameb)
            if (fisher==TRUE) {
                p.Fisher[k] = signif(fisher.test(Dataz, ...)$p.value, digits=digits)
            }
            if (gtest==TRUE) {
                p.Gtest[k] = signif(DescTools::GTest(Dataz, correct=correct, ...)$p.value, digits=digits)
            }
            if (chisq==TRUE) {
                p.Chisq[k] = signif(suppressWarnings(chisq.test(Dataz, ...))$p.value, digits=digits)
            }
            if (cramer==TRUE) {
                # requires cramerV() from the rcompanion package, which is not imported here (see docs)
                Cramer[k] = cramerV(Dataz, digits=digits)
            }
        } # end j loop
    } # end i loop
    if (fisher==TRUE) {
        Z$p.Fisher = p.Fisher
        Z$p.adj.Fisher = signif(p.adjust(Z$p.Fisher, method=method), digits=digits)
    }
    if (gtest==TRUE) {
        Z$p.Gtest = p.Gtest
        Z$p.adj.Gtest = signif(p.adjust(Z$p.Gtest, method=method), digits=digits)
    }
    if (chisq==TRUE) {
        Z$p.Chisq = p.Chisq
        Z$p.adj.Chisq = signif(p.adjust(Z$p.Chisq, method=method), digits=digits)
    }
    if (cramer==TRUE) {
        Z$Cramer.V = Cramer
    }
    return(Z)
}

#' table xy
#' @description (df, x, y) or (x, y), \code{\link[gmodels]{CrossTable}},  auto convert/reset factor
#' \cr ez.table2 input could be one, two, or more variables/cols
#' @export
#' @examples
#' # dnn=c('row','col')
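#' # a minimal sketch of the two calling forms (illustrative only):
#' # ez.table(mtcars, 'am', 'cyl')     # (df, x, y)
#' # ez.table(mtcars$am, mtcars$cyl)   # (x, y)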
ez.table = function(df, x, y=NULL, prop.r=TRUE, prop.c=TRUE, prop.t=TRUE, digits=1, max.width = 1,
                       expected=FALSE, prop.chisq=FALSE, chisq = FALSE, fisher=FALSE, mcnemar=FALSE,
                       resid=FALSE, sresid=FALSE, asresid=FALSE,
                       missing.include=FALSE,
                       format="SPSS", dnn = NULL, ...) {
    # (df, x, y)
    if (!is.null(y)) {
        df = ez.2factor(df,c(x,y))
        if (is.null(dnn)) dnn=ez.eval(sprintf('c("%s","%s")',x,y))
        result = gmodels::CrossTable(df[[x]], df[[y]], digits=digits, max.width = max.width, expected=expected, prop.r=prop.r, prop.c=prop.c,
                           prop.t=prop.t, prop.chisq=prop.chisq, chisq=chisq, fisher=fisher, mcnemar=mcnemar,
                           resid=resid, sresid=sresid, asresid=asresid,
                           missing.include=missing.include,
                           format=format, dnn = dnn, ...)
    # (x, y)
    } else {
        if (is.null(dnn)) {
            dnn = c(deparse(substitute(df)),deparse(substitute(x)))
            dnn=gsub('^[\\w\\.]+\\$','',dnn,perl=T)
        }
        y=x; x=df
        x=ez.2factor(x); y=ez.2factor(y)
        result = gmodels::CrossTable(x, y, digits=digits, max.width = max.width, expected=expected, prop.r=prop.r, prop.c=prop.c,
                           prop.t=prop.t, prop.chisq=prop.chisq, chisq=chisq, fisher=fisher, mcnemar=mcnemar,
                           resid=resid, sresid=sresid, asresid=asresid,
                           missing.include=missing.include,
                           format=format, dnn = dnn, ...)
    }
    return(invisible(result))
}

#' freq table
#' @description freq table, df followed by col names (df, col1, col2, col3), or vector, factor (eg, x,y,z), auto convert/reset factor
#' \cr ez.table output is more beautiful, but must be two variables/cols
#' \cr result table can be further passed to prop.table(), addmargins(), addmargins(prop.table())
#' @export
#' @param x, ... df followed by col names (df, col1, col2, col3), or vector, factor (eg, x,y,z)
#' @param dnn c('col','row')
#' @param exclude values to use in the exclude argument of factor when interpreting non-factor objects. could be NULL to include NA
#' @param row.vars    a vector of integers giving the numbers of the variables, or a character vector giving the names of the variables to be used for the rows of the flat contingency table.
#' @param col.vars    a vector of integers giving the numbers of the variables, or a character vector giving the names of the variables to be used for the columns of the flat contingency table.
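#' @examples
#' # a sketch using built-in mtcars; column names are passed as quoted strings
#' ez.table2(mtcars, 'cyl', 'am')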
ez.table2 = function(x, ..., dnn=NULL, exclude = c(NA, NaN), row.vars = NULL,col.vars = NULL) {
    if (is.data.frame(x)) {
        # input = x[c(...)]  # when passing a data frame this way, dnn does not work
        dots=sapply(as.list(substitute(list(...)))[-1L], deparse)
        input = paste('ez.2factor(x$',dots,')',sep='',collapse=', ')
        if (is.null(dnn)) dnn=c(...)
        cmd = sprintf('ftable(%s, exclude=exclude, row.vars=row.vars, col.vars=col.vars, dnn=dnn)', input)
        theTable = ez.eval(cmd)
        cmd = sprintf('fisher.test(%s)',input)
    } else {
        dots=sapply(as.list(substitute(list(...)))[-1L], deparse)
        if (is.null(dnn)) {
            dnn=paste("'",c(deparse(substitute(x)),dots),"'",sep='',collapse = ', ')
            # print(dnn)
            # get rid of df$
            dnn=gsub('[\\w\\.]+\\$','',dnn,perl=T) %>% paste0('c(',.,')')
            dnn=ez.eval(dnn)
        }
        input = paste('ez.2factor(',c('x',dots),')',sep='',collapse = ', ')
        cmd = sprintf('ftable(%s, exclude=exclude, row.vars=row.vars, col.vars=col.vars, dnn=dnn)', input)
        # print(cmd)
        theTable = ez.eval(cmd)
    }

    print(theTable)

    # if (is.data.frame(x)) {
    #     if (length(list(...))>0) {
    #         fisher = ez.eval(cmd)
    #         cat(sprintf("\nFisher's Exact Test, two-sided p = %f\n",fisher$p.value))
    #     }
    # } else {
    #     if (length(list(...))>0) {
    #         fisher = fisher.test(x,...)
    #         cat(sprintf("\nFisher's Exact Test, two-sided p = %f\n",fisher$p.value))
    #     }
    # }

    return(invisible(theTable))
}

#' maximize np by checking recursively various variable combinations (PCA trick)
#' @description maximize np by checking recursively various variable combinations (PCA trick)
#' @param targetVar a single outcome var, will calculate its mean in each completed subsample
#' @param fixedVars vars that must remain (does not matter whether it includes targetVar or not)
#' @param point.size,point.shape,lab.size,text.size lattice plotting parameters for point size/shape and label/text sizes
#' @return returns a new data frame
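#' @examples
#' # a sketch, not run: df is a hypothetical data frame with scattered NAs;
#' # 'score', 'age', 'sex' are made-up column names
#' # counts = ez.maxnp(df, targetVar='score', fixedVars=c('age','sex'))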
#' @export
ez.maxnp = function(df,targetVar=NULL,fixedVars=NULL,point.size=10,point.shape=16,lab.size=18,text.size=16) {
    if (!is.null(targetVar)) {targetVar=ez.selcol(df,targetVar); df %<>% ez.move(sprintf('%s first',targetVar))}
    if (!is.null(fixedVars)) {fixedVars = ez.selcol(df,fixedVars); fixedVars = dplyr::setdiff(fixedVars,targetVar)}

    if ( (!is.null(targetVar)) | (!is.null(fixedVars)) ) {
        # c() works even if targetVar=NULL
        df %>% select(-one_of(c(targetVar,fixedVars))) -> df01
    } else {
        df -> df01
    }

    # df01: 0s and 1s without target, fix, completed vars, zeroVars for PCA
    ind.NAs = is.na(df01)
    lapply(df01, as.character) %>% data.frame(stringsAsFactors = F) -> df01
    df01[!ind.NAs] <- 1; df01[ind.NAs] <- 0
    lapply(df01, as.numeric) %>% data.frame() -> df01

    # get rid of constant 1, for scale and proper pca solution
    # according to my test, if a constant 1 goes into pca, you cannot scale, and the pca solution seems strange:
    # the constant var gets a loading of 0, in the middle of positive and negative loadings
    completedVars = ez.selcol(df01, which(colSums(df01) == nrow(df01)))
    zeroVars  = ez.selcol(df01, which(colSums(df01) == 0))
    if (!is.null(completedVars)) {df01 %<>% select(-one_of(completedVars))}
    if (!is.null(zeroVars)) {df01 %<>% select(-one_of(zeroVars))}

    ####************************************************************************************************
                                         ####*obsolete*####
    ####************************************************************************************************
    # no need to scale for pca; all vars share the same 0/1 scale (well, still not the same variance?)
    # in fact scale() errors if a column is all 1s or all 0s: scale = TRUE cannot be used if there are zero or constant (for center = TRUE) variables

    # # https://stat.ethz.ch/pipermail/r-help/2006-April/104616.html
    # # duplicated to increase sample size if necessary
    # # because 'princomp' can only be used with more samples than variables
    # df01 = df01[rep(seq(nrow(df01)), each = ceiling(ncol(df01)/nrow(df01))),,drop=F]
    # pcaObj = stats::princomp(base::scale(df01))
    # pcaLoadings <- with(pcaObj, unclass(loadings)) %>% data.frame()
    # pcaLoadings %>% tibble::rownames_to_column() %>% arrange(Comp.1) %>% .$rowname %>% ez.vv()
    ####************************************************************************************************
                                         ####*obsolete*####
    ####************************************************************************************************

    pcaObj = stats::prcomp(df01,scale=T)
    # high loading=more 1s, which means fewer missing values
    pcaLoadings = pcaObj$rotation %>% data.frame() %>% tibble::rownames_to_column() %>% arrange(desc(PC1))

    allvars = c(targetVar,fixedVars,completedVars,pcaLoadings$rowname,zeroVars)
    # sample#, variable#, mean(targetVar)
    counts = matrix(NA,nrow=length(allvars),ncol=3)
    # chop from the last var
    for (i in length(allvars):1) {
        vars = allvars[1:i]
        df %>% select(vars) %>% ez.dropna(print2scr = F) -> df3
        counts[i,1] = nrow(df3)
        counts[i,2] = ncol(df3)

        if (is.null(targetVar)) {
            counts[i,3] = NA
        } else if (is.nan( mean(df3[[targetVar]]) )) {
            # if df3 empty, mean=NaN
            counts[i,3] = NA
        } else {
            counts[i,3] = mean(df3[[targetVar]])
        }
    }
    counts = data.frame(counts)
    colnames(counts) = c('sampleNum','variableNum','targetMean')
    counts$orderedVar = allvars

    # the plot() will not return an object. plot directly, hard to capture to an object
    # graphics::plot(x = variableNum, y = sampleNum)
    p1=lattice::xyplot(counts$sampleNum ~ counts$variableNum,
                  xlab = list("Number of Variables Kept", cex=lab.size, fontfamily=RMN),
                  ylab = list("Sample Size Without Missing Values", cex=lab.size, fontfamily=RMN),
                  scales = list( x=list(cex=text.size, fontfamily=RMN), y=list(cex=text.size, fontfamily=RMN) ),
                  type = "p", pch=point.shape, cex=point.size,
                  col="#e69f00")
    p2=NULL
    if (!all(is.na(counts$targetMean))) {
        p2=lattice::xyplot(counts$targetMean ~ counts$variableNum,
                  xlab = list("Number of Variables Kept", cex=lab.size, fontfamily=RMN),
                  ylab = list(sprintf("Mean Value of %s",targetVar), cex=lab.size, fontfamily=RMN),
                  scales = list( x=list(cex=text.size, fontfamily=RMN), y=list(cex=text.size, fontfamily=RMN) ),
                  type = "p", pch=point.shape, cex=point.size,
                  col="#56b4e9")
    }
    multiplot(p1,p2,cols=2)

    return(invisible(counts))
}

dprime <- function(hit, fa, miss, cr, adjusted=TRUE) {

    n_hit = hit
    n_fa = fa
    n_miss = miss
    n_cr = cr

    # 1) correct ------------------------------------------------------
    correct <- (n_hit+n_cr)/(n_hit+n_miss+n_fa+n_cr)


    # 2) Parametric Indices ------------------------------------------------------
    if (adjusted == TRUE) {
        # Adjusted ratios: Advocates of the loglinear approach recommend using it regardless of whether or not extreme rates are obtained.
        hit_rate_adjusted <- (n_hit + 0.5) / ((n_hit + 0.5) + n_miss + 1)
        fa_rate_adjusted <- (n_fa + 0.5) / ((n_fa + 0.5) + n_cr + 1)

        # dprime
        dprime <- stats::qnorm(hit_rate_adjusted) - stats::qnorm(fa_rate_adjusted)

        # beta
        zhr <- stats::qnorm(hit_rate_adjusted)
        zfar <- stats::qnorm(fa_rate_adjusted)
        beta <- exp(-zhr * zhr / 2 + zfar * zfar / 2)

        # criterion c
        c <- -(stats::qnorm(hit_rate_adjusted) + stats::qnorm(fa_rate_adjusted)) / 2
        # normalized c prime
        cprime <- c/dprime

    } else {
        # Ratios
        n_targets <- n_hit + n_miss
        n_distractors <- n_fa + n_cr

        hit_rate <- n_hit / n_targets
        fa_rate <- n_fa / n_distractors

        # dprime
        dprime <- stats::qnorm(hit_rate) - stats::qnorm(fa_rate)

        # beta
        zhr <- stats::qnorm(hit_rate)
        zfar <- stats::qnorm(fa_rate)
        beta <- exp(-zhr * zhr / 2 + zfar * zfar / 2)

        # criterion c
        c <- -(stats::qnorm(hit_rate) + stats::qnorm(fa_rate)) / 2
        # normalized c prime
        cprime <- c/dprime
    }


    # 3) Non-Parametric Indices ------------------------------------------------------
    # Ratios
    n_targets <- n_hit + n_miss
    n_distractors <- n_fa + n_cr
    hit_rate <- n_hit / n_targets
    fa_rate <- n_fa / n_distractors

    # aprime
    a <- 1 / 2 + ((hit_rate - fa_rate) * (1 + hit_rate - fa_rate) / (4 * hit_rate * (1 - fa_rate)))
    b <- 1 / 2 - ((fa_rate - hit_rate) * (1 + fa_rate - hit_rate) / (4 * fa_rate * (1 - hit_rate)))

    a[fa_rate > hit_rate] <- b[fa_rate > hit_rate]
    a[fa_rate == hit_rate] <- .5
    aprime <- a

    # bppd
    bppd <- ((1 - hit_rate) * (1 - fa_rate) - hit_rate * fa_rate) / ((1 - hit_rate) * (1 - fa_rate) + hit_rate * fa_rate)


    # 4) return ------------------------------------------------------
    n_targets <- n_hit + n_miss
    n_distractors <- n_fa + n_cr
    hr <- n_hit / n_targets
    far <- n_fa / n_distractors

    return(list(dprime = dprime, beta = beta, c = c, cprime = cprime, hr = hr, far = far, correct = correct, aprime = aprime, bppd = bppd))
}

#' Computes Signal Detection Theory indices (percent of correct, d', beta, A', B''D, c, c')
#' @description Computes Signal Detection Theory indices (percent of correct, d', beta, A', B''D, c, c').
#' @param hit Number of hits.
#' @param fa Number of false alarms.
#' @param miss Number of misses.
#' @param cr Number of correct rejections.
#' @param adjusted Whether to use the Hautus (1995) adjustments for extreme values (hit rate of 1 or false alarm rate of 0). Note: this only affects dprime, beta, c, cprime; all other results have nothing to do with adjustment. This script adjusts both extreme and non-extreme values when adjusted = T. See the following notes and \href{https://stats.stackexchange.com/questions/134779}{stackexchange} for more.
#'
#' @return Returns a data frame:
#' (summary: aprime, bppd are nonparametric versions of dprime, beta; nonparametric measures are not subject to the same assumptions/extreme-value problems)
#' \itemize{
#'  \item{\strong{hr}: }{hit rate, calculated with raw data, the unadjusted one returned}
#'  \item{\strong{far}: }{false alarm rate, the unadjusted one returned}
#'  \item{\strong{correct}: }{correct = (Hits + CR)/(Hits+Misses+FA+CR) , but it cannot disentangle the two components, ie, discriminability and bias (The major contribution of SDT to psychology is the separation of these two). Intuitively, the best subject maximizes H (and thus minimizes the Miss rate) and minimizes F (and thus maximizes the Correct Rejection rate); hit rate H = hit / (hit+miss); false alarm rate F = FA / (FA + CR); targets = hit + miss; distractors= fa + cr.}
#'  \item{\strong{dprime (d')}: }{The sensitivity, discriminability index. Reflects the mean/peak distance (in standard deviation unit) between the signal and noise distributions (d' = z(H) - z(F), other sensitivity measures include a transform other than z, or even no transform at all). The larger the difference between H and F, the better the subject's sensitivity. A value of 0 indicates an inability to distinguish signals from noise, whereas larger values indicate a correspondingly greater ability to distinguish signals from noise. Negative d' values indicate *below chance* (F > H) performance due to response confusion (responding yes when intending to respond no, and vice versa), misunderstanding the task or other (unknown) factors. Though Z values can have any real value, normally distributed ones are between -2 and 2 about 95 percent of the time. SDT states that d' is unaffected by response bias (i.e., is a pure measure of sensitivity) if two assumptions are met regarding the decision variable: (1) The signal and noise distributions are both normal, and (2) the signal and noise distributions have the same standard deviation. We call these the d' assumptions. If either assumption is violated, d' will vary with response bias. Because of this, some researchers prefer to use nonparametric measures of sensitivity. The most popular is A'. }
#'  \item{\strong{beta}: }{The decision bias, response bias, bias/criterion. Use of this measure assumes that responses are based on a likelihood ratio. Suppose the decision variable achieves a value of x on a given trial. The numerator for the ratio is the likelihood of obtaining x on a signal trial (i.e., the height of the signal distribution at x), whereas the denominator for the ratio is the likelihood of obtaining x on a noise trial (i.e., the height of the noise distribution at x). Formula is exp(-(zH-zF) x (zH+zF)/2). When subjects favor neither the yes response nor the no response, beta=1. Values less than 1 signify a bias toward responding yes (liberal), whereas values of beta greater than 1 signify a bias toward the no (conservative) response. Because beta is based on a ratio, the natural logarithm of beta is often analyzed in place of beta itself. This script gives beta, not ln(beta).}
#'  \item{\strong{aprime (A')}: }{Non-parametric estimate of discriminability. A' typically ranges from .5, which indicates that signals cannot be distinguished from noise, to 1, which corresponds to perfect performance. Values less than .5 may arise from sampling error or response confusion; the minimum possible value is 0.}
#'  \item{\strong{bppd (B''D)}: }{Non-parametric estimate of bias. B''D ranges from -1 (extreme bias in favor of yes/liberal responses) to 1 (extreme bias in favor of no/conservative responses). A value of 0 signifies no response bias.}
#'  \item{\strong{c}: }{Another index of bias, criterion c. c is defined as the distance (the number/unit of standard deviations) between the criterion and the neutral/midpoint point between these two distributions, where neither response is favored (beta=1). If the criterion is located at this point, c has a value of 0. Negative values of c signify a bias toward responding yes (the criterion lies to the left of the neutral point), whereas positive values signify a bias toward the no response (the criterion lies to the right of the neutral point). One advantage of c is that it is unaffected by changes in d', whereas beta is not.}
#'  \item{\strong{cprime}: }{Normalized c, which is c/dprime}
#'  }
#'
#'
#' Note that for d', beta, c, cprime, adjustments for extreme values are made following the loglinear recommendations of Hautus (1995). These extreme values are particularly likely to arise when signals differ markedly from noise, few trials are presented (so that sampling error is large), or subjects adopt extremely liberal or conservative criteria (as might occur if, for example, the consequences of a false alarm are severe). Advocates of this loglinear approach recommend using it regardless of whether or not extreme rates are obtained. This script adjusts both extreme and non-extreme values when adjusted = T.
#' @examples
#' hit <- 9
#' fa <- 2
#' miss <- 1
#' cr <- 7
#'
#' indices <- ez.dprime(hit, fa, miss, cr)
#'
#'
#' df <- data.frame(Participant = c("A", "B", "C"),
#'     hit = c(1, 2, 5), miss = c(9, 8, 5),
#'     fa = c(6, 8, 1), cr = c(4, 2, 9))
#'
#' indices <- ez.dprime(hit=df$hit,
#'     fa=df$fa,
#'     miss=df$miss,
#'     cr=df$cr,
#'     adjusted=FALSE)
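#'
#' # hand check of the first row above (adjusted=FALSE): d' = z(H) - z(F),
#' # with H = 1/(1+9) = .1 and F = 6/(6+4) = .6
#' qnorm(0.1) - qnorm(0.6)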
#'
#'
#' @author Jerry modified from \href{https://dominiquemakowski.github.io/}{Dominique Makowski}. See Pallier (2002) for the algorithms, Stanislaw & Todorov (1999) for a good tutorial.
#'
#' @export
ez.dprime = function(hit, fa, miss, cr, adjusted=TRUE){
    if (length(hit)>1){
        # Vectorize accepts SIMPLIFY, but the vectorized function
        # still returns a list (dprime() itself returns a list),
        # so deal with the list here and convert it to a data frame
        myfun = Vectorize(dprime)
        result = myfun(hit, fa, miss, cr, adjusted)
        # return a data frame
        result = data.frame(t(result))
        # convert each column to numeric from list inherited from Vectorize
        result[] = lapply(result,unlist)
        result[] = lapply(result,ez.nan2na)
    } else {
        # return a data frame
        result = dprime(hit, fa, miss, cr, adjusted)
        result = data.frame(result)
    }
    return(result)
}

#' calculate effect size
#' @description calculate effect size
#' @param m1 mean
#' @param s1 standard deviation
#' @param n1 numbers/subjects/samples group 1
#' @param m2 mean
#' @param s2 standard deviation
#' @param n2 numbers/subjects/samples group 2
#' @return returns invisible
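#' @examples
#' # a sketch with made-up summary stats (mean, sd, n per group)
#' ez.es.t.independent.msn(m1=10, s1=2, n1=30, m2=8.5, s2=2.2, n2=32)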
#' @export
# internal note: in the case of ancova, use adjusted means??
ez.es.t.independent.msn = function(m1,s1,n1,m2,s2,n2) {
    # simply sd weighted by sample size
    s_pooled = sqrt( (((n1-1)*s1*s1)+((n2-1)*s2*s2)) / (n1+n2-2) )
    d = (m1-m2)/s_pooled

    output = sprintf("d = %0.2f", d)
    cat(output, "\n", sep = "")
    cat('d [0.20 0.50) = small, [0.50 0.80) = medium, [0.80 ) = large. The sign of d is arbitrary.\n')
    return(invisible(d))
}

#' calculate effect size
#' @description calculate effect size
#' @param t t statistic (equal variances assumed in SPSS)
#' @param n1 numbers/subjects/samples group 1
#' @param n2 numbers/subjects/samples group 2
#' @return returns invisible
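#' @examples
#' # a sketch with made-up values: t from an independent-samples test, n1 = 30, n2 = 32
#' ez.es.t.independent.tn(t=2.5, n1=30, n2=32)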
#' @export
# internal note: in the case of ancova, use adjusted means??
ez.es.t.independent.tn = function(t,n1,n2) {
    # this formula could be derived from t formula for independent t-test
    # equivalently sqrt((n1+n2)/(n1*n2))
    d = t*( sqrt((1/n1+1/n2)) )

    output = sprintf("d = %0.2f", d)
    cat(output, "\n", sep = "")
    cat('d [0.20 0.50) = small, [0.50 0.80) = medium, [0.80 ) = large. The sign of d is arbitrary.\n')
    return(invisible(d))
}

#' calculate effect size
#' @description calculate effect size
#' @param t t for paired samples t test, available in SPSS paired samples Test output table
#' @param n number of pairs, in SPSS paired samples Test output table, n=df+1
#' @param r correlation; if the correlation is unknown, fill in 0.
#' @return returns invisible
#' @note formula from Dunlap 1996: Meta-analysis of experiments with matched groups or repeated measures designs. And notes from Section 5 https://www.psychometrica.de/effect_size.html
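#' @examples
#' # a sketch with made-up values: paired t with 25 pairs and r = .5
#' ez.es.t.paired.tnr(t=3, n=25, r=0.5)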
#' @export
# internal note: in the case of ancova, use adjusted means??
ez.es.t.paired.tnr = function(t,n,r) {
    d = t*sqrt( 2.0*(1.0-r)/n )

    output = sprintf("d = %0.2f", d)
    cat(output, "\n", sep = "")
    cat('d [0.20 0.50) = small, [0.50 0.80) = medium, [0.80 ) = large. The sign of d is arbitrary.\n')
    return(invisible(d))
}

#' calculate effect size
#' @description calculate effect size
#' @param m12 the mean of differences that equals the difference of means (m1-m2), available in SPSS paired samples Test output table
#' @param s12 the standard deviation of the difference score, available in SPSS paired samples Test output table
#' @param r correlation; if the correlation is unknown, fill in 0.
#' @return returns invisible
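#' @examples
#' # a sketch with made-up values: mean/sd of the difference scores, r = .5
#' ez.es.t.paired.m12s12r(m12=1.5, s12=3, r=0.5)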
#' @export
# internal note: in the case of ancova, use adjusted means??
ez.es.t.paired.m12s12r = function(m12,s12,r) {
    # derive the following formulas, based on t = m12/se12 = m12/(s12/sqrt(n)), therefore
    # d = t*sqrt(2.0*(1.0-r)/n) = ( m12/(s12/sqrt(n)) ) * sqrt(2.0*(1.0-r)/n) = m12*sqrt(2*(1-r))/s12
    d = m12*sqrt(2*(1-r))/s12

    output = sprintf("d = %0.2f", d)
    cat(output, "\n", sep = "")
    cat('d [0.20 0.50) = small, [0.50 0.80) = medium, [0.80 ) = large. The sign of d is arbitrary.\n')
    return(invisible(d))
}

#' calculate effect size
#' @description calculate effect size
#' @param m1 mean
#' @param s1 standard deviation
#' @param m2 mean
#' @param s2 standard deviation
#' @param r correlation; if the correlation is unknown, fill in 0.
#' @return returns invisible
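#' @examples
#' # a sketch with made-up values; internally computes m12 = m1-m2 and
#' # s12 = sqrt(s1^2 + s2^2 - 2*r*s1*s2), then applies the same d formula
#' # as ez.es.t.paired.m12s12r
#' ez.es.t.paired.msr(m1=10, s1=2, m2=8.5, s2=2.2, r=0.5)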
#' @export
# internal note: in the case of ancova, use adjusted means??
ez.es.t.paired.msr = function(m1,s1,m2,s2,r) {
    s12 = sqrt( s1*s1 + s2*s2 - 2*r*s1*s2 )
    m12 = m1 - m2
    d = m12*sqrt(2*(1-r))/s12

    output = sprintf("d = %0.2f", d)
    cat(output, "\n", sep = "")
    cat('d [0.20 0.50) = small, [0.50 0.80) = medium, [0.80 ) = large. The sign of d is arbitrary.\n')
    return(invisible(d))
}

#' retrieve article citation numbers from pubmed
#' @description retrieve article citation numbers from pubmed
#' @param xmlFile EndNote library file, contains exported articles with correct titles
#' @param outFile an excel file to store the results, if not provided, same base name as xmlFile
#' @param index e.g., 1:10; used as titles[index] to choose a subset of titles in xmlFile to process, NULL=all
#' @return returns invisible, save an excel file with results
#' @note get citation numbers cited by available pubmed central papers
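#' @examples
#' # a sketch, not run (requires network access); 'library.xml' is a
#' # hypothetical EndNote XML export
#' # ez.citen('library.xml', index=1:5)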
#' @export
ez.citen = function(xmlFile,outFile=NULL,index=NULL){

    if (is.null(outFile)){
        outFile=gsub('xml$','xlsx',xmlFile,perl=TRUE)
    }
    # https://cran.r-project.org/web/packages/rentrez/vignettes/rentrez_tutorial.html
    # https://www.ncbi.nlm.nih.gov/books/NBK25501/
    # https://www.stat.berkeley.edu/~statcur/Workshop2/Presentations/XML.pdf

    bibs = XML::xmlParse(file = xmlFile)
    nodes = XML::getNodeSet(bibs,"//title")
    titles = XML::xmlSApply(nodes,XML::xmlValue)

    strcomp <- function(s1,s2) {
        s1 = tolower(gsub('\\W','',s1,perl=TRUE))
        s2 = tolower(gsub('\\W','',s2,perl=TRUE))
        return(s1==s2)
    }

    pubmedcites = function(title){
        # title = "Dissociating Normal Aging from Alzheimer's Disease: A View from Cognitive Neuroscience"
        rsearch <- rentrez::entrez_search(db="pubmed", retmax=1, term=title)

        if (rsearch$count==0) {
            # nothing found, give up
            result = list("sortpubdate" = "",
                          "pmcrefcount" = NA_integer_,
                          "sortfirstauthor" = "",
                          "lastauthor" = "",
                          "fulljournalname" = "",
                          "title" = "")
        } else {
            rsum <- rentrez::entrez_summary(db="pubmed", id=rsearch$ids)
            result <- rentrez::extract_from_esummary(rsum, c("sortpubdate", "pmcrefcount", "sortfirstauthor", "lastauthor", "fulljournalname", "title"))
            # if not same title, try title search according to previously returned parsed search term
            if (!strcomp(title,result$title)){
                term=gsub('\\[.*?\\]','[Title]',rsearch$QueryTranslation,perl=TRUE)
                rsearch <- rentrez::entrez_search(db="pubmed", retmax=1, term=term)
                # if fails, give up
                if (rsearch$count==0) {
                    result = list("sortpubdate" = "",
                                  "pmcrefcount" = NA_integer_,
                                  "sortfirstauthor" = "",
                                  "lastauthor" = "",
                                  "fulljournalname" = "",
                                  "title" = "")
                } else {
                    rsum <- rentrez::entrez_summary(db="pubmed", id=rsearch$ids)
                    result <- rentrez::extract_from_esummary(rsum, c("sortpubdate", "pmcrefcount", "sortfirstauthor", "lastauthor", "fulljournalname", "title"))
                }
            }

        }


        result$OriginalTitle = title
        result$RetrievedDate = ez.moment()

        if (strcomp(title,result$title)){
           result$TitleMatch = 1
        } else {
           result$TitleMatch = 0
        }

        return(result)
    }

    if (!is.null(index)) titles=titles[index]
    cites = lapply(titles, pubmedcites)
    results = as.data.frame( t(matrix(unlist(cites), nrow=length((cites[[1]])))) )
    names(results) = names(cites[[1]])
    results = ez.num(results, c('pmcrefcount','TitleMatch'),force=TRUE)
    results$ID = seq.int(nrow(results))
    results = dplyr::arrange(results,desc(pmcrefcount))

    ez.savex(results,file=outFile,withFilter=TRUE)
    return(invisible(results))
}

#' box-cox power transformation \href{https://drive.google.com/file/d/1tsXLgRSq_svd-hvA02XmXSJmPX6SCbvf/view}{GDoc Note}
#' @description box-cox power transformation \href{https://drive.google.com/file/d/1tsXLgRSq_svd-hvA02XmXSJmPX6SCbvf/view}{GDoc Note}
#' @param y a data frame or a vector
#' @param col passed to \code{\link{ez.selcol}}
#' \cr        if y is a data frame and col is specified, process that col only.
#' \cr        if y is a data frame and col is unspecified (i.e., NULL default), process all cols
#' \cr        if y is not a data frame, col is ignored
#' \cr        could be multiple cols
#' @param na.rm rm na from y; if not, NA stays as is. applicable only if y is a vector.
#' @param plot boxcox plot. applicable only when there is an actual transformation
#' @param print2scr print out transformation parameters
#' @param force T = transform regardless; F = transform only if p.lambda (for the rounded lambda) is less than .05.
#' @param method "boxcox" is \code{out = car::bcPower(y, lambda=lambda.in.use, jacobian.adjusted = FALSE, gamma=NULL)} for all positive, \code{out = car::bcnPower(y, lambda=lambda.in.use, jacobian.adjusted = FALSE, gamma=gamma)} for any non-positive (i.e., zero or negative).
#' \cr 
#' \cr The selection between \code{\link[car]{bcPower}} and \code{\link[car]{bcnPower}} is done automatically by this function.
#' \cr 
#' \cr Where \code{\link[car]{bcPower}} is: ((x+gamma)^(lambda)-1)/lambda if lambda not 0; log(x+gamma) if lambda 0. Here gamma NULL means 0.
#' \cr 
#' \cr \code{\link[car]{bcnPower}} is: ((0.5 * (x + sqrt(x^2 + gamma^2)))^lambda - 1)/lambda if lambda not 0; log(0.5 * (x + sqrt(x^2 + gamma^2))) if lambda 0. This bcnPower is Hawkins and Weisberg (2017). While allowing the transformed data to be interpreted similarly to Box-Cox
#' transformed data, it is much less biased than setting the parameter gamma to be non-zero in the Box-Cox family.
#' \cr
#' \cr 
#' "modified.tukey" \code{out = car::basicPower(y,lambda=lambda.in.use, gamma=NULL); if (lambda.in.use<0) out = -1*out}.
#' \cr 
#' \cr Where \code{\link[car]{basicPower}} is: x^lambda if lambda not 0; log(x) if lambda 0
#' \cr
#' \cr Because neither tukey nor modified tukey can handle zero or negative input, this function will automatically switch to \code{\link[car]{bcnPower}}
#' \cr Therefore, both the "modified.tukey" and "boxcox" methods here preserve the ordering.
#' @param precise 'rounded' uses the lambda rounded to one of c(0, 0.33, -0.33, 0.5, -0.5, 1, -1, 2, -2); 'raw' uses the raw/calculated lambda
#' @return returns transformed y, or original y if no transformation occurs.
#' @importFrom car basicPower bcPower bcnPower
#' @note Box and Cox (1964) \code{\link[car]{bcPower}} and modified tukey \code{\link[car]{basicPower}} can only deal with non-negative responses. Also consider applying z standardization to boxcox-transformed data.
#' \cr lambda is a tuning parameter that can be optimized in a way that the distribution of the transformed data has the largest similarity to a normal distribution. There are several proposals to optimize lambda.
#' \cr The Box-Cox-transformed values do not guarantee normality although the data should be less skewed and should have less extreme values than before transformation.
#' \cr Some research (Zwiener et al, 2014, PLOS ONE) pointed out that z standardization of covariates leads to better prediction performance independent of the underlying transformation used (e.g., raw, log, boxcox)
#' \cr see also \code{\link[MASS]{boxcox}}
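#' @examples
#' # a sketch on simulated, all-positive right-skewed data (so bcPower is used)
#' set.seed(1)
#' y2 = ez.boxcox(rexp(100), plot=FALSE)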
#' @export
ez.boxcox = function (y, col=NULL, na.rm = FALSE, plot = TRUE, print2scr = TRUE,
    force = TRUE, method = c('boxcox','modified.tukey'), precise = c('rounded','raw'), ...) {

    if (!is.data.frame(y)) {
        x = rep(1, length(y))
        if (na.rm && (any(is.na(y)) | any(is.na(x)))) {
            rmv <- is.na(y) | is.na(x)
            y <- y[!rmv]
            x <- x[!rmv]
        }
        if (!is.numeric(y) | is.factor(y) | is.character(y)) {
            stop("y must be numeric")
        }

        method = match.arg(method)
        precise = match.arg(precise)
        if (any((y <= 0) %in% TRUE)) {
            if (method=='modified.tukey') {
                ez.pprint('zero or negative value exists, switching method from modified tukey to boxcox bcnPower...')
                family = "bcnPower"
                method='boxcox'
            } else {
                family = "bcnPower"
                ez.pprint('zero or negative value exists, switching method from boxcox bcPower to boxcox bcnPower...')
            }
        }
        else {
            family = "bcPower"
        }
        # powerTransform: Finding Univariate or Multivariate Power Transformations
        # "bcPower" for the default for the Box-Cox power family
        # "bcnPower" for a two-parameter modification of the Box-Cox family that allows (a few) negative responses (Hawkins and Weisberg (2017))
        #        while allowing for the transformed data to be interpreted similarly to the interpretation of Box-Cox transformed values.
        #        essentially estimate/add a number to y to make it positive
        # "yjPower" family (Yeo and Johnson (2000)), another modification of the Box-Cox family that allows a few negative values.
        #       not use
        #       Because of the unusual constraints on the powers for positive and negative data, this transformation is not used very often, as results are difficult to interpret
        # If the "object" argument is of class "lm" or "lmerMod", the Box-Cox procedure is applied to the conditional distribution of the response given the predictors.
        bc = suppressWarnings(car::powerTransform(y ~ x, family = family))
        sbc = suppressWarnings(summary(bc))
        lambda.raw = sbc$result[[1]]
        # lambda, p.lambda are for rounded lambda
        lambda = sbc$result[[2]]
        p.lambda = ez.num(car::testTransform(bc,lambda=lambda)$pval,force=T)
        gamma = sbc$result.gamma[[1]]
        if (is.null(gamma)) gamma = NA

        if (force | p.lambda < .05){
            if (print2scr) cat(sprintf('Box-Cox%s: lambda.raw = %f, lambda = %.2f, p.lambda = %f, gamma = %f, n = %d\n', ifelse(is.null(col),'',sprintf(' (%s)',col)),lambda.raw, lambda, p.lambda, gamma, length(y)))

            if (precise=='raw') {
                lambda.in.use = lambda.raw
            } else {
                lambda.in.use = lambda
            }

            if (method=='boxcox') {
                if (family=='bcnPower') {
                    out = car::bcnPower(y, lambda=lambda.in.use, jacobian.adjusted = FALSE, gamma=gamma)
                    if (print2scr) cat(sprintf('Done! Box-Cox with negatives transformations (car::bcnPower): lambda %f, gamma = %f\n', lambda.in.use, gamma))
                } else if (family=='bcPower') {
                    out = car::bcPower(y, lambda=lambda.in.use, jacobian.adjusted = FALSE, gamma=NULL)
                    if (print2scr) cat(sprintf('Done! Box-Cox transformations (car::bcPower): lambda %f\n', lambda.in.use))
                }
            # modified tukey to keep ordering
            } else if (method=='modified.tukey') {
                out = car::basicPower(y,lambda=lambda.in.use, gamma=NULL)
                if (lambda.in.use<0) {
                    out = -1*out
                    if (print2scr) cat(sprintf('Done! Modified Tukey transformation (-1*car::basicPower), lambda %f\n', lambda.in.use))
                } else {
                    if (print2scr) cat(sprintf('Done! Tukey transformation (car::basicPower), lambda %f\n', lambda.in.use))
                }
            }

            if (plot) {
                old.par <- par(no.readonly = TRUE)
                graphics::layout(matrix(c(1,2,0,0,2,3), 3))
                opar = par(oma=c(0,0,0,0), mar = c(4,2,0.5,0.5))
                on.exit(par(opar))
                hist(y, col='#56B4E9',main=NULL,xlab=NULL)
                abline(v = mean(y,na.rm=T), col = "#E69F00", lty = 3, lwd = 2)
                car::boxCox(y ~ x, family = family,
                xlab = as.expression(substitute(variable*lambda~"(raw)"~"="~lambda.raw.value*", "~lambda~"="~lambda.value*", "~italic(p)~"="~p.lambda.value*", "~gamma~"="~gamma.value*", "~italic(n)~"="~n.value,
                    list(variable=ifelse(is.null(col),"",sprintf("%s: ",col)),
                        lambda.value=sprintf("%.2f",lambda),
                        p.lambda.value=sprintf("%f",p.lambda),
                        gamma.value=sprintf("%f",gamma),
                        lambda.raw.value=sprintf("%f",lambda.raw),
                        n.value=sprintf("%d",length(y))
                        ))))
                hist(out, col='#56B4E9',main=NULL,xlab=NULL)
                abline(v = mean(out,na.rm=T), col = "#E69F00", lty = 3, lwd = 2)
                par(old.par)
            }
        # no transformation
        } else {
            out = y
            if (print2scr) cat(sprintf('Done! No transformation actually applied\n'))
        }
    } else if (is.data.frame(y)) {
        col = ez.selcol(y,col)
        y[col] = lapply(1:length(col), function(j) {ez.boxcox(y=y[col][[j]],col=col[j],na.rm=F,plot=plot,print2scr=print2scr,force=force,method=method,precise=precise,...)})
        out = y
    }
    return(invisible(out))
}

#' calculate correlation matrix (for publication)
#' @description calculate correlation matrix (for publication)
#' @param df data frame
#' @param file if not NULL, save matrix to an excel file
#' @return returns invisible, save an excel file with results
#' @note *** p < .001, ** p < .01, * < .05
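#' @examples
#' # a sketch with a few mtcars columns; file=NULL skips saving to excel
#' ez.corrmatrix(mtcars[,c('mpg','hp','wt','qsec')], file=NULL)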
#' @export
ez.corrmatrix = function(df,file='CorrMatrix.xlsx'){
    # https://stackoverflow.com/a/56902023/2292993
    corstarsl <- function(d){ 
        x <- as.matrix(d) 
        rc <- Hmisc::rcorr(x)
        R <- rc$r
        p <- rc$P

        mystars <- ifelse(p < .001, "***", ifelse(p < .01, "** ", ifelse(p < .05, "* ", " ")))

        # prepend a dummy -1.11 column so format() pads all entries to a common width, then drop it
        R <- format(round2(cbind(rep(-1.11, ncol(x)), R), 2))[,-1]
        # remove leading 0, such that 0.34 -> .34
        R <- gsub("^(\\s*[+|-]?)0\\.", "\\1.", R)

        Rnew <- matrix(paste(R, mystars, sep=""), ncol=ncol(x)) 
        diag(Rnew) <- paste(diag(R), " ", sep="") 
        rownames(Rnew) <- colnames(x) 
        colnames(Rnew) <- paste(colnames(x), "", sep="") 

        Rnew <- as.matrix(Rnew)
        Rnew[upper.tri(Rnew, diag = TRUE)] <- ""
        diag(Rnew) <- "--"
        Rnew <- as.data.frame(Rnew) 

        # drop the last column (it contains only the diagonal placeholder)
        # note: 1:length(Rnew)-1 in the original parsed as 0:(length(Rnew)-1), which happened to work
        Rnew <- Rnew[1:(length(Rnew)-1)]
        return(Rnew) 
    }
    out=corstarsl(df)
    meansd <- function(v){
        m=sprintf('%.2f',round2(mean(v,na.rm=TRUE),2))
        s=sprintf('%.2f',round2(sd(v,na.rm=TRUE),2))
        return(ez.sprintf("{m} ({s})"))
    }
    out['Mean (SD)'] = sapply(df,meansd)
    if (!is.null(file)) ez.savex(out,file,rowNames=TRUE)
    return(invisible(out))
}