R/countLeadTrailSpaces.R

Defines functions doBasicCheck gridNonalphanum countNonalphanum countLeadTrailSpaces

Documented in countLeadTrailSpaces countNonalphanum doBasicCheck gridNonalphanum

#' @title Count entries with leading or trailing spaces
#' @description For every column of \code{df}, counts how many entries start
#'   or end with a space character.
#' @param df A data.frame (or list of vectors) whose columns are checked.
#' @return A named integer vector with one count per column of \code{df}; an
#'   empty integer vector if \code{df} has no columns.
#' @details Only the plain space character is matched (pattern
#'   \code{"^ | $"}). Missing values are never counted, because
#'   \code{grepl()} reports \code{FALSE} for \code{NA}.
#' @examples
#' countLeadTrailSpaces(data.frame(a = c(" x", "y ", "z"), b = c("p", "q", "r")))
#' @rdname countLeadTrailSpaces
#' @export
countLeadTrailSpaces = function(df) {
  # vapply() keeps the result an integer vector even for zero columns,
  # where sapply() would hand back an empty list.
  vapply(df, function(x) sum(grepl("^ | $", x)), integer(1))
}
#' @title Count entries containing punctuation or spaces
#' @description For every column of \code{df}, counts how many non-missing
#'   entries contain at least one punctuation character or a space.
#' @param df A data.frame (or list of vectors) whose columns are checked.
#' @return A named integer vector with one count per column of \code{df}; an
#'   empty integer vector if \code{df} has no columns.
#' @details Entries are matched against the pattern \code{"[[:punct:] ]"}.
#'   Missing values are removed before matching and therefore never counted.
#' @examples
#' countNonalphanum(data.frame(a = c("a b", "c!", "d"), b = c("x", "y", "z")))
#' @rdname countNonalphanum
#' @export
countNonalphanum = function(df) {
  # vapply() keeps the result an integer vector even for zero columns,
  # where sapply() would hand back an empty list.
  vapply(
    df,
    function(x) sum(grepl("[[:punct:] ]", stats::na.omit(x))),
    integer(1)
  )
}
#' @title Cell-wise grid for the punctuation/space check
#' @description Evaluates, for every cell of \code{df}, whether the cell does
#'   not match \code{"[[:punct:] ]"} and is missing at the same time.
#' @param df A data.frame (or list of vectors) whose columns are checked.
#' @return The result of \code{sapply()} over the columns of \code{df}:
#'   usually a logical matrix with one column per column of \code{df}.
#' @details NOTE(review): \code{grepl()} yields \code{FALSE} for \code{NA}, so
#'   as written the result is \code{TRUE} exactly for the missing cells.
#'   Confirm whether \code{!is.na()} (or \code{|}) was intended.
#' @examples
#' gridNonalphanum(data.frame(a = c("x", NA, "a b"), b = c(NA, "!", "z")))
#' @rdname gridNonalphanum
#' @export
gridNonalphanum = function(df) {
  # Per-column check: no punctuation/space match AND value is missing.
  flag_cells = function(column) {
    has_special = grepl("[[:punct:] ]", column)
    !has_special & is.na(column)
  }
  sapply(df, flag_cells)
}
#' @title Quick per-column overview of a data frame
#' @description Collects, for each column of \code{df}, the number and
#'   percentage of values and missing values, the number and percentage of
#'   entries with leading/trailing spaces and with punctuation or spaces, the
#'   column class, a minimum and maximum, and the first and last non-missing
#'   entry.
#' @param df A data.frame to summarise.
#' @return A data.frame with the columns \code{Colnames}, \code{Vals},
#'   \code{V.pr}, \code{NAs}, \code{NA.pr}, \code{lt_spaces}, \code{sp.pr},
#'   \code{non_alphanum}, \code{nona.pr}, \code{class}, \code{min},
#'   \code{max}, \code{firstentry} and \code{lastentry}. The \code{*.pr}
#'   columns are percentages of the row count, rounded to two digits.
#' @details For numeric columns \code{min}/\code{max} are the smallest and
#'   largest value; for all other columns they are the shortest and longest
#'   string length. The space and punctuation counts are only computed for
#'   non-numeric columns and are \code{NA} for numeric ones. The function
#'   relies on \code{showNA()} and \code{showClassDF()}; their results are
#'   presumably one entry per column, named by column (verify against those
#'   helpers).
#' @examples
#' \dontrun{
#' doBasicCheck(data.frame(id = 1:3, name = c(" a", "b!", NA)))
#' }
#' @seealso \code{\link{countLeadTrailSpaces}}, \code{\link{countNonalphanum}}
#' @rdname doBasicCheck
#' @export
#' @import stringr
doBasicCheck = function(df) {
  ## 181013 updated length example
  ## 14/01/14 length depends on class
  ### quick look at the data, still needs to be optimised

  # The count helpers may return an empty list when there are no columns to
  # inspect; turn that into an empty numeric vector so that the division and
  # the lookup by column name below keep working.
  ensure_vector = function(x) if (length(x) == 0L) numeric(0) else x

  message("berechne laenge und NAs...\n")

  # Number of rows is the denominator of every percentage. dim(df) would be
  # c(rows, columns) and get recycled across the per-column vectors.
  zeilen = nrow(df)
  NAs = showNA(df)$NAs
  NAs_proz = NAs/zeilen
  Vals = zeilen - NAs
  Vals_proz = Vals/zeilen

  message("Berechne klassen und laengen...\n")

  klassen = showClassDF(df)
  numerisch = vapply(df, is.numeric, logical(1))

  text_spalten = df[, !numerisch, drop = FALSE]
  zahl_spalten = df[, numerisch, drop = FALSE]

  message("Berechne minima maxima...\n")
  # Work column by column: this also copes with a single row or with no
  # non-numeric columns, where a sapply()/apply() pair has no matrix to use.
  maxl = vapply(
    text_spalten,
    function(xx) as.numeric(max(stringr::str_length(xx), na.rm = TRUE)),
    numeric(1)
  )
  minl = vapply(
    text_spalten,
    function(xx) as.numeric(min(stringr::str_length(xx), na.rm = TRUE)),
    numeric(1)
  )

  max_num = vapply(
    zahl_spalten,
    function(xx) max(as.numeric(xx), na.rm = TRUE),
    numeric(1)
  )
  min_num = vapply(
    zahl_spalten,
    function(xx) min(as.numeric(xx), na.rm = TRUE),
    numeric(1)
  )

  # Bring string lengths and numeric ranges into the column order of klassen.
  maxx = c(maxl, max_num)
  minn = c(minl, min_num)
  maxx = maxx[names(klassen)]
  minn = minn[names(klassen)]

  message("Berechne leading / trailing spaces...\n")
  lt_spaces = ensure_vector(countLeadTrailSpaces(text_spalten))
  lt_spaces_proz = lt_spaces/zeilen

  # Lookup by name yields NA for the numeric columns that were not checked.
  lt_spaces = lt_spaces[names(klassen)]
  lt_spaces_proz = lt_spaces_proz[names(klassen)]

  message("Berechne alphanum...\n")
  non_alphanum = ensure_vector(countNonalphanum(text_spalten))
  non_alphanum_proz = non_alphanum/zeilen

  non_alphanum = non_alphanum[names(klassen)]
  non_alphanum_proz = non_alphanum_proz[names(klassen)]

  # First/last non-missing entry per column. Columns are handled one at a
  # time because apply() would first coerce the whole data frame to a
  # character matrix, which pads numbers with blanks.
  vorhanden = lapply(df, function(xx) xx[!is.na(xx)])
  firstentry = vapply(
    vorhanden,
    function(xx) if (length(xx) == 0L) NA_character_ else format(xx[1L]),
    character(1)
  )
  lastentry = vapply(
    vorhanden,
    function(xx) if (length(xx) == 0L) NA_character_ else format(xx[length(xx)]),
    character(1)
  )

  res = data.frame(colname = names(klassen),Vals, Vals_proz, NAs, NAs_proz, lt_spaces, lt_spaces_proz, non_alphanum, non_alphanum_proz, klassen, minn, maxx, firstentry, lastentry)

  # Turn the proportions into percentages with two decimals.
  anheubschcat = c("Vals_proz", "NAs_proz", "lt_spaces_proz", "non_alphanum_proz")
  res[anheubschcat] = lapply(res[anheubschcat], function(x) round(100*x,2))
  names(res) = c("Colnames","Vals","V.pr","NAs", "NA.pr", "lt_spaces", "sp.pr", "non_alphanum", "nona.pr", "class", "min", "max", "firstentry", "lastentry")
  row.names(res) = NULL
  res
}
holgerman/toolboxH documentation built on June 25, 2022, 2:42 p.m.