cleansing: Data cleaning and summary statistics

Documented in .combine_lowFreqLevels .comb_xy .conv_dtype .conv_na .count_na .countsFreq_chiSq .descrStats_x es_cdelta .num_groupComp .num_groupComp_pairwise .rm_na .split_multiresp

#' Set non-informative values to NA
#' @param x variable to be cleaned
#' @return variable with values such as "na" set to NA
#' @details Replaced strings: NA, nan, inf, "NA", "NAN", "null" with NA
.conv_na<-function(x){
  # remove leading and traling whitespaces
  x=gsub('^ | $', '',  x)
  # clean NA
  gsub('^na$|^null$|^nan$|^inf$|^null$|^NA$|^\\s*$|^$', NA, x, ignore.case = T)
}

#' Remove all na values
#' @param x variable to be cleaned
#' @return number of non-informative values
.rm_na<-function(x){
  x[which(!is.na(x))]
}

#' @title Counting the number of NA
#' @param x variable for summary statistic
#' @param y variable for stratification
#' @return df counting NA, abs, %
.count_na<-function(x, y){
  # x is variable to be summaryised
  # y is stratification
  # x,y are clean, ie no other variables
  n_x=length(x)
  n_y=length(y)

  if(n_x != n_y){stop('count_na -> vectors have different lengths')}

  idx_x=(is.na(x))
  idx_y=(is.na(y))

  x_na=length(which(idx_x))
  y_na=length(which(idx_y))
  xANDy_na=length(which(idx_x & idx_y))
  xORy_na=length(which(idx_x | idx_y))

  x_na=c(x_na, x_na/n_x*100)
  y_na=c(y_na, y_na/n_x*100)
  xANDy_na=c(xANDy_na, xANDy_na/n_x*100)
  xORy_na=c(xORy_na, xORy_na/n_x*100)

  sum_tbl=data.frame(n_x=c(n_x, 100), x_na, n_y=c(n_y, 100), y_na, xANDy_na, xORy_na)
  rownames(sum_tbl)=c('abs', 'perc')

  out=list(summary=sum_tbl, idx_xORy=which(!idx_x & !idx_y), idx_x=which(!idx_x), idx_y=which(!idx_y))

  return(out)
}

.count_na_x<-function(x){
  # x is variable to be summaryised
  # y is stratification
  # x,y are clean, ie no other variables
  n_x=length(x)
  idx_x=(is.na(x))
  x_na=length(which(idx_x))

  x_na=c(x_na, x_na/n_x*100)
  x_val=c(n_x-x_na[1], 100-x_na[2])

  sum_tbl=data.frame(x_n=c(n_x, 100), x_na, x_val)
  rownames(sum_tbl)=c('abs', 'perc')

  out=list(summary=sum_tbl, idx_x=which(!idx_x))

  return(out)
}

#' @title Variable class conversion
#' @param x variable to be converted
#' @param dtype desired variable class (numeric, string, factor)
#' @param levs levels in case of class factor
#' @return converted x
.conv_dtype<-function(x, dtype=c('num', 'str', 'fac', 'bool'), levs=NULL){


  .conv<-function(x, dtype){
    switch(dtype,
           'num'={as.numeric(x)},
           'fac'={
             if(!is.null(levs)) {factor(x, levels=levs)} else{
               as.factor(x)
             }
           },
           'str'=as.character(x),
           'bool'=as.character(x)
           )
  }


  if(is.list(x)){
    out=lapply(x, function(xx) {.conv(xx, dtype)})
  }else{
    out=.conv(x, dtype)
  }

  return(out)

}

#' @title Split multilevel variables
#' @param x Multilevel variable to be split, given as string
#' @param sep Symbol that separates levels within a string
#' @param return return list or vector (see Details)
#' @return converted x
#' @details return argument is list: each x element is list element, return argument is vector: list is vectorised to 1D array, length does not equal number of elements in x then.
.split_multiresp<-function(x, sep=', ', return=c('list', 'vector')){

  x_split=strsplit(x, sep) # this is a list
  x_split=lapply(x_split, gsub, pattern="^ | $", replacement="")

  if(return=='vector'){
    # create array of indeces to track responses from db rows/entries
    names(x_split)=paste0(as.character(1:length(x_split)), '.')
    return(unlist(x_split))
  }else{
    return(x_split)
  }
}



.split_multiresp_df<-function(x, sep=', '){

  x_split=strsplit(x, sep) # this is a list
  x_split=lapply(x_split, gsub, pattern="^ | $", replacement="")

  ids=unlist(sapply(seq(x_split), function(i){
    rep(i,  length(x_split[[i]]))
  }))
  out=data.frame(id=ids, val=unlist(x_split))

  return(out)
}

#' @title Merge low freq levels into a single level
#' @param x array, categorical variable to be processed
#' @param n num Percent frequency value of x level, for assigning new level
#' @param repl char, Name of new level
#' @param includeLevN logic, Level name indicates number of merges
#' @return x with low freq levels merged into new level
.combine_lowFreqLevels<-function(x, n=2, repl='Other', includeLevN=T){
  cs=table(x)
  idx=which(cs<(sum(cs)*(n/100)))

  if(length(idx)>0){
    lev_repl=names(idx)
    if(includeLevN) {
      n=length(idx)
      repl=paste0(repl, ' (', n, ' levels)')
    }

    lev_repl=gsub('(', '\\(', lev_repl, fixed=T)
    lev_repl=gsub(')', '\\)', lev_repl, fixed=T)
    x_repl=gsub(paste0('^', paste0(lev_repl, collapse='$|^'), '$'), repl, x, fixed = F)
    return(x_repl)
  }else{
    return(x)
  }

}

#' @title Combine uni and/or multilevel variables, rm NA's
#' @param x array, categorical or numeric variable to be merged with y
#' @param y array, categorical or numeric variable to be merged with x
#' @return list, 1: merged x and y, 2: indices of NA in x or y
.comb_xy<-function(x, y){
  if(is.atomic(x) & is.atomic(y)){
    out=data.frame(x=x, y=y, id=seq_along(along.with=x))
  }

  if(is.data.frame(x) & is.atomic(y)){
    out=data.frame(x=x$val, y=y[x$id], id=x$id)
    }

  if(is.data.frame(y) & is.atomic(x)){
    #browser()
    out=data.frame(x=x[y$id], y=y$val, id=y$id)}

  if(is.data.frame(y) & is.data.frame(x)){
    browser()
  }
  return(out)
}


#' @title Univariate stats for a categorical variable
#' @param ds data.frame, categorical variable and segmentation variable
#' @param x.multi logic, is x cleaned multi-level variable?
#' @param y.multi logic, is y cleaned multi-level variable?
#' @return list, 1: contingency table(x,y) and y, 2: Chi-squared p value
#' @importFrom stats chisq.test
.countsFreq_chiSq<-function(ds, x.multi, y.multi){
  if(y.multi & !x.multi){
    y_lev=unique(ds$y)
    ds_uni=unique(ds[, colnames(ds) %in% c('x', 'ids')])
    ds_uni$y=F
    ds_uni$y=factor(ds_uni$y, levels=c(T, F))
    ds_uni$x=factor(ds_uni$x, levels=unique(ds$x))
    ct=sapply(y_lev, function(lev, ds_u=ds_uni){
      idc=ds$ids[which(ds$y==lev)]
      ds_u$y[ds_u$ids %in% idc]=T
      table(x=ds_u$x, seg=ds_u$y)[,1]
    })
  }

  if(x.multi & !y.multi){
    x_lev=unique(ds$x)
    ds_uni=unique(ds[, colnames(ds) %in% c('y', 'ids')])
    ds_uni$x=F
    ds_uni$y=factor(ds_uni$y, levels=unique(ds$y))
    ds_uni$x=factor(ds_uni$x, levels=c(T, F))
    ct=sapply(x_lev, function(lev, ds_u=ds_uni){
      idc=ds$ids[which(ds$x==lev)]
      ds_u$x[ds_u$ids %in% idc]=T
      table(x=ds_u$x, seg=ds_u$y)[1,]
    })
  }

  if(!x.multi & !y.multi){
    ct=table(ds$x, ds$y)
  }

  if(any(ct<5 & ct!=0)) warning('Low number of counts, p value instable')
   cs=chisq.test(ct)
   return(list(ct, cs))
}

.ct_to_perc<-function(ct){
  if(is.null(nrow(ct))){ct/sum(ct)}else{
    ct=apply(ct, 2, function(x){
      x/sum(x)
    })
  }
  ct*100
}


#' @title Statistical group comparison for a numeric variable
#' @param x num array, variable for group comparison
#' @param y cat array, segementation variable defining groups
#' @return list, 1: p value for comparing all groups and y, 2: contingency table y
#' @details Pariwise group comparison is performed with fct .num_groupComp_pariwise
#' @importFrom stats kruskal.test
#' @importFrom utils combn
.num_groupComp<-function(x, y){
  ct=table(y)
  ctp=(ct/sum(ct))*100
  if(any(ct<5)) stop('low number of counts')
  if(any(ctp<15)) warning('unequal group sizes, p value instable')
  if(length(ct)==1) stop('no segmentation level')
    p=kruskal.test(x, factor(y))$p.value
    # could do pairwise effect sizes here
    return(list(p, ctp))
}


#' @title Pairwise statistical group comparison for a numeric variable
#' @param x num array, variable for group comparison
#' @param y cat array, segementation variable defining groups
#' @return matrix, row 1: p value, row 2: Cliff's delta effect size
#' @importFrom stats kruskal.test
.num_groupComp_pairwise<-function(x, y){
  pw_combn=combn(unique(as.character(y)), 2)
  ps=apply(pw_combn, 2, function(lev){
    idx=which(y %in% lev)
    g=factor(y[idx])
    val=x[idx]
    p=kruskal.test(val, g)$p.value
    dc=es_cdelta(ref=x[which(g==lev[1])], comp=x[which(g==lev[2])])
    c(p, dc)
  })
  ps=as.data.frame(ps)
  colnames(ps)=apply(pw_combn, 2, paste, collapse=' vs.')

  return(ps)
}

#' @title Pairwise effect size - Cliff's delta
#' @param ref num array, variable values for group 1 (reference group)
#' @param comp ref num array, variable values for group 2 (comparison group)
#' @return Cliff's delta value
es_cdelta <- function(ref, comp) {

  if (!is.numeric(ref) | !is.numeric(comp))
    stop("Input not numeric")

  ind_ref <- is.na(ref) | is.infinite(ref)
  ind_comp <- is.na(comp) | is.infinite(comp)

  if (any(ind_ref)) {
    ref <- ref[!ind_ref]
    message("Removing NA or infinite values from reference group.")
  }
  if (any(ind_comp)) {
    comp <- comp[!ind_comp]
    message("Removing NA or infinite values from comparator group.")
  }

  if (length(ref) < 5 | length(comp) < 5)
    stop("Low number of values (< 5)")

  top_counts <- vapply(ref, function(x, y = comp) {
    names(x) <- NULL
    names(y) <- NULL
    c(length(which(x > y)), length(which(x < y)))
  }, FUN.VALUE = c(2, length(ref)))
  out <- ((sum(top_counts[1, ]) - sum(top_counts[2, ]))/(length(ref) * length(comp))) *
    (-1)
  return(out)
}






#' @title Descriptive stats for a numeric variable startified by second variable
#' @param out list generated with .summary_var
#' @return array of quantiles (range, IQR point estimates, median), length and %total for each segmentation level
.descrStats_x<-function(out){

  x=out$x_na_rm$val
  if(out$x$dtype_inferred=='num'){
    # median range
    stats=paste0(round(median(x), 1),' (', round(min(x),1),'-', round(max(x),1), ')')
    lab='Median (Range)'
    out=list(stats, lab)

  }else{
    # level freq (perc)
    ct=table(x, rep('all', length(x)))
    ctp=.ct_to_perc(ct)
    if(nrow(ct)>1){

      out=.comb_tbls(ct, ctp, format = 'long', html = F)
      out=paste(rownames(out), out[,1], sep=': ')
      idx=1:(length(out)-1)
      out[idx]=out[idx][order(ct, decreasing = T)]
      out=paste(out, collapse='\n')
    }else{
      out=paste(paste(rownames(ct), ct[1], sep=': '), ' (100%)', sep='')
        }
  }

  return(out)
}


#' @importFrom plyr ddply
#' @importFrom stats quantile
.num_descrStats_xy<-function(x, y){
  ct=table(y)
  ctp=(ct/sum(ct))*100

  if(any(ct<5)) stop('low number of counts')
  if(any(ctp<15)) warning('unequal group sizes, p value instable')
  if(length(ct)==1) stop('no segmantation level')

  cd=data.frame(x, y)
  quants=ddply(cd, .(y), function(ds, ntot=nrow(cd)){
    c(quantile(ds$x, probs=c(0, 0.25, 0.5, 0.75, 1)), n=nrow(ds), n_perc=(nrow(ds)/ntot) *100)
  })

  return(quants)
  # could do pairwise effect sizes here
}



.num_descrStats_x<-function(x){

  n=length(x)
  quants=quantile(x, probs=c(0, 0.25, 0.5, 0.75, 1))
  return(quants)
  # could do pairwise effect sizes here
}





#  Create table of abs and rel frequencies (%) for categorical variables
#  list of two: 1: p values semgentation levels, 2: contigency table (abs, %)
#  table ready to export
# .to_tbl_cat<-function(pv){
#   nr=nrow(pv[[1]][[1]])
#   nc=ncol(pv[[1]][[1]])
#   ct_out=t(sapply(seq(nr), function(i){
#     paste0(pv[[1]][[1]][i,], ' (', round(pv[[2]][i,]), '%)')
#   }))
#
#   colnames(ct_out)=colnames(pv[[2]])
#
#   ct_out=data.frame(id=rownames(pv[[2]]), ct_out)
#   add=rbind(rep("", nc+1), rep("", nc+1), rep("", nc+1))
#   add[2,1]='p value'
#   add[3,1]=format.pval(pv[[1]][[2]]$p.value, digits = 2)
#   colnames(add)=colnames(ct_out)
#
#   return(rbind(ct_out, add))
#
#
# }


#' @title Create table of summary stats for continuous variables
#' @param pv percentage contingency value
#' @return table ready to export
.to_tbl_num<-function(pv){
  nr=nrow(pv[[1]][[1]])
  nc=ncol(pv[[1]][[1]])
  ct_out=t(sapply(seq(nr), function(i){
    paste0(pv[[1]][[1]][i,], ' (', round(pv[[2]][i,]), '%)')
  }))

  colnames(ct_out)=colnames(pv[[2]])

  ct_out=data.frame(id=rownames(pv[[2]]), ct_out)
  add=rbind(rep("", nc+1), rep("", nc+1), rep("", nc+1))
  add[2,1]='p value'
  add[3,1]=format.pval(pv[[1]][[2]]$p.value, digits = 2)
  colnames(add)=colnames(ct_out)

  rbind(ct_out, add)


}


#' @importFrom stats addmargins
#' @importFrom htmlTable addHtmlTableStyle htmlTable
.comb_tbls<-function(abs, ct, format, html=T){

  if(format=='long'){
    abs=addmargins(abs)
    ct=addmargins(ct)

    out=sapply(seq(ncol(abs)), function(i){
      pp=as.character(round(ct[,i]))
      pp[pp=='0' & abs[,i]>0]='<1'
      paste0(formatC(abs[,i], big.mark = ','),' (', pp, '%)')
    })


    if(is.null(nrow(out))) out=t(out)
    colnames(out)=colnames(abs)
    rownames(out)=rownames(abs)
    return(out)
  }
  if(format=='1row'){

    out=sapply(seq(nrow(abs)), function(i){
        pp=as.character(round(ct[i,]))
        pp[pp=='0' & abs[i,]>0]='<1'
        oo=paste0(formatC(abs[i,], big.mark = ','),' (', pp, '%)')
        oo
    })
    out=c(out)

    olong=matrix(out, nrow=1, ncol=length(out))
    olong=rbind(rep(colnames(ct), nrow(ct)), olong)


    if(html==T){
      tbl=olong
      # html table
      tbl[2,] %>%
        addHtmlTableStyle(align="c",
                          # col.columns = c(rep("none", 2),
                          #                 rep("#F5FBFF", 4)),
                          css.cgroup='background-color: #F5FBFF;') %>%

        htmlTable(
          header=olong[1,],
          cgroup=rownames(ct),
          n.cgroup=c(rep(ncol(ct), nrow(ct)))
        ) -> ex

    }else{
      ex=rbind(add[1:ncol(olong)], olong)
      rownames(ex)=NULL


    }


    return(ex)

  }


}



#' @export
.tbl_output<-function(res, format, html=T){

  if(format=='long'){
    n_tot=formatC(sum(res$counts[[1]])+sum(res$nas[1,]), big.mark = ',')
    out<-.comb_tbls(res$counts[[1]], res$perc, format)
    if(ncol(out)<3) {
      add=matrix("", nrow=nrow(out), ncol=ncol(out)-3)
      out=cbind(out, add)
    }
    out[,ncol(out)]=gsub(' \\(.*', '', out[,ncol(out)])
    add<-add1<-rep("", ncol(out))
    out<-rbind(out, " "=add)
    add1[1]=n_tot
    out=rbind(out, "n_total"=add1)
    add[1]<-c(format.pval(res$counts[[2]]$p.value, 2))
    out<-rbind(out, p.value=add)

    nas<-.comb_tbls(t(t(res$nas[1,])), t(t(res$nas[2,])), format)
    nas=c(paste(c('Var:', 'Seg:', 'Both:'), nas)[c(2,1,3)], rep('', ncol(out)-3))
    out=rbind(out, "nas"=nas)
    rownames(out)[nrow(out)]='NAs'


  }
  if(format=='1row'){

    if(html){
      out=.comb_tbls(res$counts[[1]], res$perc, format, html)
    }else{
      out=.comb_tbls(res$counts[[1]], res$perc, format)
    }
  }


  return(out)

}


#' @title infer variable data type
#' @param x variable of interest
#' @param gsub_pattern regex for str to remove (e.g., '<|>')
#' @param n_un_lim number of unique values in x to define numeric (see details)
#' @param n_un_perc_lim fraction of unique values in x to define numeric (see details)
#' @details dtype inferred is one of num, str, logic, x should be free of na or non-informative values (see fct .rm_na). Parameters n_un_lim and n_un_frac_lim define absolute and fraction of unique values, respectively, to define x as numeric. This is to exclude categorical values from being defined as numeric (e.g., x levels of low medium high).
#' @return data type as string
infer_dtype=function(x, gsub_pattern='<|>', n_un_lim=5, n_un_perc_lim=10){
  t_num=NA
  t_str=NA
  t_bool=NA

  x_le=length(x) # this is not adjusted for multi-responses
  n_un=length(unique(x))
  n_un_perc=n_un/x_le

  x_num_le=length(which(!is.na(grep('^-?[0-9].*$', x))))/x_le

  # if all numeric na in x_num and more than n_un_lim unique values
  t_num=ifelse({x_num_le==1}, T, F)
  # if x has two or less than two values (numeric or not)
  t_bool=ifelse({n_un<=2}, T, F)
  t_str=ifelse({x_num_le<1 | n_un<=n_un_lim}, T, F)

  dtype=c('num', 'str', 'bool')[which(c(t_num, t_str, t_bool))]

  # establish priorities
  dtype_taken=NA
  if(length(dtype)>1){
    if(any(grepl('str', dtype))){dtype_taken='str'}
    if(any(grepl('bool', dtype))){dtype_taken='bool'}
  }else{
    dtype_taken=dtype
  }

  out=list(dtypes=dtype,
           dtype_inferred=dtype_taken,
           x_le=x_le, # this is not adjusted for multiple responses
           n_un=n_un,
           n_un_perc=n_un_perc
  )
  return(out)
}



#' @title infer if variable is of type multi-response
#' @param split_cutoff cut-off in percent for minimal entries with multi-responses
#' @param x variable to be cleaned
#' @param n_un length unique values of x
#' @param sep separators
#' @details Multi-response variables defined as strings with where multiple values are separated with comma, semicolon or any other alpha-numeric value indicated in sep argument
#' @return list of possible separators
infer_level=function(x, split_cutoff=20, n_un, sep=c(',', ';')){

#browser()
  mat=sapply(sep, function(sep){
    #browser()
        un_split=length(unique(gsub('^ | $', '', unique(unlist(strsplit(x, sep, fixed = T))))))
        impro=100-(un_split/n_un*100)
        if(is.nan(impro)){impro=0}
        return(impro)
  })

  out=list(tested_sep=mat)

  if(any(mat>split_cutoff)){
    idx=which.max(mat)
    out$sep_inferred=names(mat[idx])
    out$multiresponse=T
  }else{
    out$multiresponse=F
  }
  return(out)
}


.tbl_output_x_num<-function(res, format, html=T){

  x_tot=formatC(sum(c(unlist(res$nas[1,]), res$summary$n)), big.mark = ',', format='d')
  if(format=='long'){
    out=res$summary[,-1]
    out$n_perc=round(as.numeric(out$n_perc), 1)
    out$n=formatC(as.numeric(out$n), big.mark = ',', format='d')
    add=add1=rep('', ncol(out))
    out=rbind(out, add)
    add[1]=x_tot
    out=rbind(out, add)
    add[1]=format.pval(res$stats[[1]], digits = 2)
    out=rbind(out, add)
    add[1]=round(res$stats[[1]], digits = 2)
    out=rbind(out, add)

    rownames(out)=c(as.character(res$summary[,1]),'','n', 'p value', 'Cd')

  }
  if(format=='1row'){
    x_sum=apply(res$summary[,-1], 1, function(x){
      # check if integer
      if(all(as.integer(x[1:5])==x[1:5])) {r=0 }else{ r=2}
      ra=round(c(x[1], x[5]), r)
      paste(round(x[3], r), ' (', ra[1], '-', ra[2], ')', sep='')

    })
    df=as.data.frame(t(x_sum))
    colnames(df)=res$summary[,1]

    out=df
    if(html){

      tbl=df
      rownames(tbl)=NULL
      # html table
      tbl %>%
        addHtmlTableStyle(align="c",
                          css.cell = "padding-left: .5em; padding-right: .4em;") %>%
        htmlTable(
          rnames=F,
          n.cgroup=c(rep(ncol(tbl)))
        ) -> out
    }else{
      out=df
    }
  }

  return(out)

}

.summary_var_xy<-function(x, y=NULL){

  if(is.null(y)){
    y=rep('All', length(x))
    y.multi=F
    y.dtype='fac'
  }


  x.ori=x
  y.ori=y

  # x_na=.count_na(x)
  # y_na=.count_na(y)

  x=.conv_na(x)
  y=.conv_na(y)

  nas=.count_na(x, y)
  x_na_rm=x[nas$idx_x]
  y_na_rm=y[nas$idx_y]
  xy_na_rm=x[nas$idx_xORy]

  le_xy=length(x_na_rm)
  un_le_x=length(unique(x_na_rm))
  un_le_y=length(unique(y_na_rm))

  ml_x=infer_level(x_na_rm, split_cutoff = 20,  n_un = un_le_x)
  ml_y=infer_level(y_na_rm, split_cutoff = 20,  n_un = un_le_y)

  if(ml_x$multiresponse){
    x_na_rm=.split_multiresp_df(x_na_rm, sep=ml_x$sep_inferred)

  }
  if(ml_y$multiresponse){
    y_na_rm=.split_multiresp_df(y_na_rm, sep=ml_y$sep_inferred)
  }

  # combine x and y
  xy_df=.comb_xy(x_na_rm, y_na_rm)


  x_dtype=infer_dtype(xy_df$x)
  y_dtype=infer_dtype(xy_df$y)

  out=list(

    nas=nas$summary,
    x=x_dtype,
    y=y_dtype,
    avg_resp=round(nrow(xy_df)/le_xy),
    le_id_noNA=length(unique(xy_df$id)),
    x_ml=c(ml_x$multiresponse, ml_x$sep_inferred),
    y_ml=c(ml_y$multiresponse, ml_y$sep_inferred)
  )
  return(out)
}

#' @importFrom stats median
.summary_var<-function(x){

  x.ori=x
  x=.conv_na(x)
  nas=.count_na_x(x)

  x_na_rm=x[nas$idx_x]
  ml_x=infer_level(x_na_rm, split_cutoff = 20,  n_un = length(unique(x_na_rm)))

  # different ways to combine ml variable
  # x is unilevel, y is multilevel: what is age distribution for visitors answering with level x
  # x is multilevel, y is unilevel: testing motivation responses for HIV test previously
  # x and y are multilevel:
  # the statistics should be done on individual level, since thge absolute number or responses does not reflect visitor numbers after separating multi-responses...owing to the fact that two individuals can provide different number of responses

  if(ml_x$multiresponse){
    x_na_rm=.split_multiresp_df(x_na_rm, sep=ml_x$sep_inferred)
  }else{
    x_na_rm=data.frame(id=seq_along(x_na_rm), val=x_na_rm, stringsAsFactors = F)
  }
  x_dtype=infer_dtype(x_na_rm$val)

  x_na_rm$val=.conv_dtype(x_na_rm$val, dtype = x_dtype$dtype_inferred)


  out=list(
    nas=nas$summary,
    x=x_dtype,
    avg_resp=round(x_dtype$x_le/nas$summary$x_val[1],1),
    le_id_noNA=nas$summary$x_val[1],
    x_ml=c(ml_x$multiresponse, ml_x$sep_inferred),
    x_na_rm=x_na_rm
  )

  dst=.descrStats_x(out)
  out$dst=dst

  return(out)
}

# create table from summary statistics from x, one column per variable, multiple rows
tbl_summary<-function(svar, var='x'){

  n=format(svar$nas$x_n[1], big.mark = ',')
  cmp_p=round(svar$nas$x_val[2], 1)
  complt_perc=paste0(ifelse({cmp_p==100 & (svar$nas$x_val[2]<100)}, '>99' ,cmp_p), '%')
  un_p=round(svar$x$n_un_perc, 1)*100
  un_perc=paste0(ifelse({un_p==0 & (svar$x$n_un_perc>0)}, '<1' ,un_p), '%')

  if(svar$x_ml[1]){
    dtype=paste('str ML ', "(sep=\"", svar$x_ml[2], "\")", sep="")
  }else{dtype=svar$x$dtype_inferred}

  out=matrix(c(n, dtype, complt_perc, un_perc, svar$avg_resp, svar$dst[[1]]), ncol=1)
  return(out)
}

#' @title split multilevel variable and create dummy
#' @description data canversion to numeric, dummy variables for multi-level categorical vars
#' @param x var to be split and converted to dummy representation
#' @param sep separator for strsplit function, if sep=NULL then splitting is ommited
#' @return matrix of dummy vars

.split_to_dummy<-function(x, sep=NULL){

  if(!is.null(sep)){
    x_split=strsplit(x, sep)

    x_split=lapply(x_split, function(x){
      gsub('^\\s|\\s$', '', x)
    })
    un=unique(unlist(x_split))
  }else{
    un=unique(x)
  }

  un=un[!is.na(un)]

  add=sapply(un, function(lev){
    out=rep(0, length(x))
    out[grep(lev, x, fixed = T)]=1
    out
  })

  # add rows with na
  idx=which(is.na(x))
  if(length(idx)>0) add[idx,]=NA

  return(add)

}
tkimhofer/cleansing documentation built on Oct. 25, 2022, 1:12 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
tkimhofer/cleansing
Data cleaning and summary statistics

R/fcts_cleanup.R
In tkimhofer/cleansing: Data cleaning and summary statistics

Defines functions .num_descrStats_xy .descrStats_x es_cdelta .num_groupComp_pairwise .num_groupComp .ct_to_perc .countsFreq_chiSq .comb_xy .combine_lowFreqLevels .split_multiresp_df .split_multiresp .conv_dtype .count_na_x .count_na .rm_na .conv_na

Documented in .combine_lowFreqLevels .comb_xy .conv_dtype .conv_na .count_na .countsFreq_chiSq .descrStats_x es_cdelta .num_groupComp .num_groupComp_pairwise .rm_na .split_multiresp

R Package Documentation

Browse R Packages

We want your feedback!

tkimhofer/cleansing Data cleaning and summary statistics

R/fcts_cleanup.R In tkimhofer/cleansing: Data cleaning and summary statistics

Defines functions .num_descrStats_xy .descrStats_x es_cdelta .num_groupComp_pairwise .num_groupComp .ct_to_perc .countsFreq_chiSq .comb_xy .combine_lowFreqLevels .split_multiresp_df .split_multiresp .conv_dtype .count_na_x .count_na .rm_na .conv_na

Documented in .combine_lowFreqLevels .comb_xy .conv_dtype .conv_na .count_na .countsFreq_chiSq .descrStats_x es_cdelta .num_groupComp .num_groupComp_pairwise .rm_na .split_multiresp

R Package Documentation

Browse R Packages

We want your feedback!

tkimhofer/cleansing
Data cleaning and summary statistics

R/fcts_cleanup.R
In tkimhofer/cleansing: Data cleaning and summary statistics