R/my_funs.R

Defines functions uniqueWords clearText is.Date normalize squash zscore nth_max nth_min fitst last as.epoch coalesce fill_na drop_na min_col max_col Mode PRESS

Documented in clearText coalesce drop_na fill_na fitst is.Date last max_col min_col Mode normalize nth_max nth_min PRESS squash uniqueWords zscore

#' Remove duplicated words from a string
#' 
#' @param string  character vector
#' @param as_list if \code{TRUE} output is returned as a list (see \code{\link{strsplit}})
#' 
#' @references 
#' \url{http://stackoverflow.com/a/28033338/3986320}
#' 
#' @export

uniqueWords <- function(string, as_list = FALSE) {
  string <- lapply(strsplit(string, " "), unique)
  if (!as_list)
    string <- vapply(string, paste, character(1L), collapse = " ")
  string
}


#' Clean strings from any non-ASCII characters, extra spaces etc.
#' 
#' @param x             a character vector
#' @param ascii         if \code{TRUE} then string is converted to ASCII using
#'                      \code{\link{iconv}}
#' @param encoding      encoding parameter passed to \code{\link{iconv}}, if
#'                      \code{encoding = ""} is used, then \code{\link{iconv}}
#'                      uses the default system encoding
#' @param lowercase     if \code{TRUE} characters are transformed to lowercase
#' @param trim          parameter passed to \code{\link{trimws}}, if \code{trim = "none"}
#'                      then white spaces are not trimmed
#' @param single_space  if \code{TRUE} duplicated spaces are removed
#' @param alphanum      if \code{TRUE} non alphanumeric characters are replaced with
#'                      value provided in \code{other_char}
#' @param other_char    replacement value for non alphanumeric characters, space by default
#' 
#' @export

clearText <- function(x, ascii = TRUE, encoding = "", lowercase = TRUE,
                      trim = c("both", "left", "right", "none"),
                      single_space = TRUE, alphanum = FALSE, other_char = " ") {
  
  if (ascii)
    x <- iconv(x, from = encoding, to = "ASCII//TRANSLIT")
  
  if (lowercase)
    x <- tolower(x)

  if (alphanum)
    x <- gsub("[^[0-9a-zA-Z ]", other_char[1L], x)
  
  if (single_space)
    x <- gsub("(?<=[\\s])\\s*", "", x, perl=TRUE)
  
  trim <- match.arg(trim)
  if (trim != "none")
    x <- trimws(x, which = trim)
  
  return(x)
}


#' Check if object is of Date class
#' 
#' @param object
#' 
#' @export

is.Date <- function(x) "Date" %in% class(x) 

# 
# rep_along <- function(x, along.with) {
#   rep(x, length.out = length(along.with))
# }


#' Normalize, squash and standarize data
#' 
#' @param x      numeric vector
#' @param na.rm  if \code{TRUE} missing values are removed
#' @param FUN    function used for transforming \code{x} prior
#'               to squashing (by defauls \code{\link{identity}})
#' @param \dots  further arguments passed to \code{FUN}
#' 
#' @name normalize
#' @export

normalize <- function(x, na.rm = FALSE) {
  if (na.rm)
    xx <- x[!is.na(x)]
  else
    xx <- x
  (x - min(xx)) / diff(range(xx))
}

#' @rdname normalize
#' @export

squash <- function(x, FUN = identity, ..., na.rm = FALSE) {
  if (na.rm)
    xx <- x[!is.na(x)]
  else
    xx <- x
  FUN(x, ...) / sum(FUN(xx, ...))
}

#' @rdname normalize
#' @export

zscore <- function(x, na.rm = FALSE) {
  if (na.rm)
    xx <- x[!is.na(x)]
  else
    xx <- x
  (x - mean(xx)) / sd(xx)
}


#' Pick n greatest or smallest values from the vector
#' 
#' @param x  numeric vector
#' @param n  position of the values to return
#' 
#' @name nth_max
#' @export

nth_max <- function(x, n = 1L) {
  sort(x, decreasing = TRUE)[n]
}

#' @rdname nth_max
#' @export

nth_min <- function(x, n = 1L) {
  sort(x, decreasing = FALSE)[n]
}


#' Return first or last value
#' 
#' @param an object
#' 
#' @seealso \code{\link{head}}
#' 
#' @name first
#' @export

fitst <- function(x) head(x, n = 1L)

#' @name first
#' @export

last <- function(x) tail(x, n = 1L)


# space <- function(x, ...) {
#   format(x, ..., big.mark = " ", scientific = FALSE, trim = TRUE)
# }


as.epoch <- function(x) {
  as.numeric(as.POSIXct(x))
}


#' Take the first non-missing value rowwise
#' 
#' @param \dots list of vectors
#' 
#' @references 
#' \url{http://stackoverflow.com/q/19253820/3986320}
#' 
#' @export

coalesce <- function(...) {
  Reduce(function(x, y) {
    i <- which(is.na(x))[1L]
    x[i] <- y[i]
    x
  }, list(...))
}


#' Fill missing values with predefined constant
#' 
#' @param .data matrix or data.frame object
#' @param \dots column names if using restricted columns
#' @param fill  replacement value
#' 
#' @importFrom lazyeval lazy_dots
#' 
#' @export

fill_na <- function(.data, ..., fill = NA) {
  stopifnot(is.matrix(.data) || is.data.frame(.data))
  if (is.na(fill))
    return(.data)
  if (!missing(...)) {
    dots <- lazy_dots(...)
    .data[apply(.data[, dots], 1L, anyNA), dots] <- fill[1L]
  } else {
    .data[is.na(.data)] <- fill[1L]
  }
  .data
}


#' Drop rows with missing values
#' 
#' @param .data data.frame or matrix
#' @param \dots column names if using restricted columns
#' 
#' @references 
#' \url{http://stackoverflow.com/q/22353633/3986320}
#' 
#' @importFrom lazyeval lazy_dots
#' @importFrom dplyr select_ filter
#' @importFrom stats complete.cases
#' 
#' @export

drop_na <- function(.data, ...) {
  if (missing(...)){
    selected_rows <- complete.cases(.data)
  } else {
    selected_rows <- complete.cases(select_(.data, .dots = lazy_dots(...)))
  }
  filter(.data, selected_rows)
}


#' Rowwise finding the column with greatest value
#' 
#' @param .data data.frame or matrix
#' @param \dots column names if using restricted columns
#' 
#' @importFrom dplyr select_
#' @importFrom lazyeval lazy_dots
#' 
#' @name min_col
#' @export

min_col <- function(.data, ...) {
  if (!missing(...))
    data <- select_(.data, .dots = lazy_dots(...))
  colnames(.data)[apply(.data, 1L, which.min)]
}

#' @rdname min_col
#' @export

max_col <- function(.data, ...) {
  if (!missing(...))
    data <- select_(.data, .dots = lazy_dots(...))
  colnames(.data)[apply(.data, 1L, which.max)]
}




#' Estimate mode of discrete or continous data
#' 
#' @param x        numeric vector
#' @param discrete if \code{TRUE} estimated mode assuming that data is dicrete
#' @param \dots    additional parameters passed to \code{\link[stats]{density}}
#' 
#' @importFrom stats density
#' @export

Mode <- function(x, discrete = FALSE, ...) {
  if (!discrete) {
    dx <- density(x, ...)
    dx$x[which.max(dx$y)]
  } else {
    ux <- unique(x)
    tx <- tabulate(match(x, ux))
    ux[tx == max(tx)]
  }
} 


#' Report PRESS statistic for lm object
#' 
#' @param model \code{\link[stats]{lm}} object
#' 
#' @importFrom stats residuals lm.influence
#' @export

PRESS <- function(model) {
  if (!inherits(model, "lm") || inherits(model, "glm"))
    stop("model needs to be lm object")
  pr <- residuals(model)/(1 - lm.influence(model)$hat)
  sum(pr^2)
}
twolodzko/twextras documentation built on May 3, 2019, 1:52 p.m.