R/utils.R

Defines functions use.packages embr.timediff embr.newTime embr.difftime shift.column.up shift.column.down break.column.bycolumn.down break.column.bycolumn.up remove.equal.rows add.unique.id get.column.stats get.cross.column.stats meanNoOutliers sdNoOutliers

Documented in add.unique.id break.column.bycolumn.down break.column.bycolumn.up embr.difftime embr.newTime embr.timediff get.column.stats get.cross.column.stats remove.equal.rows shift.column.down shift.column.up use.packages

#
#' @title Find and Install Packages
#
#' @description
#' Install package if necessary and load library
#' @export

use.packages <- function(p)
{
  if (!is.element(p, installed.packages()[,1]))
    install.packages(p, dep = TRUE)
  require(p, character.only = TRUE)
}

#' Time Differences
#
#' Time difference in units
#
#' Function difftime calculates a difference of two date/time objects and returns an object of class "difftime" with an attribute indicating the units.
#' Time format used is always "\%Y-\%m-\%d \%H:\%M:\%S" for time or "\%Y-\%m-\%d" for date
#
#' @param time1 /date-time or date objects.
#' @param time2 /date-time or date objects.
#' @param units	character string. Units in which the results are desired. Can be abbreviated.
#' @export

embr.timediff <- function(time1, time2, format="%Y-%m-%d %H:%M:%S", units="secs")
{
  difftime(as.POSIXlt(time1, tz="UTC", format=format), as.POSIXlt(time2, tz="UTC", format=format), units=units)
}

#
#' Date/time Conversion Function
#
#' Formats a character string to date or time
#'
#' Time format used is always "\%Y-\%m-\%d \%H:\%M:\%S" for time or "\%Y-\%m-\%d" for date
#
#' @param time /date-time or date objects.
#' @export
embr.newTime <- function(time, timeFormat = "%Y-%m-%d %H:%M:%S")
{
  as.POSIXlt(time, tz="UTC", format= timeFormat)
}

#
#' Time Differences
#
#' Formats interval creation
#'
#' Time format used is always "\%Y-\%m-\%d \%H:\%M:\%S" for time or "\%Y-\%m-\%d" for date
#
#' @param date1 /date-time or date objects.
#' @param date2 /date-time or date objects.
#' @param timeUnits units
#' @param timeFormat format of both dates or times
#' @export
embr.difftime <- function(date1, date2, timeFormat="%Y-%m-%d %H:%M:%S", units="secs")
{
  #ifelse(is.na(date1) | is.na(date2), NA,
    difftime(as.POSIXlt(date1, tz="UTC", format=timeFormat), as.POSIXlt(date2, tz="UTC", format=timeFormat), units=units, tz="UTC")#)
}

#
#' Shift Column
#
#' Shifts a column upwards n cells
#'
#' Shifts a column of a dataframe (next cells)
#
#' @param column column of dataframe
#' @param n number of cells to shift
#' @export
shift.column.up <- function(column, n=1)
{
  c(as.character(tail(column, length(column) - n)), rep(NA, n))
}

#
#' Shift Column
#
#' Shifts a column downwards n cells
#'
#' Shifts a column of a dataframe (previous cells)
#
#' @param column column of dataframe
#' @param n number of cells to shift
#' @export
shift.column.down <- function(column, n=1)
{
  c(rep(NA, n), as.character(head(column, length(column) - n)))
}


#
#' Introduce value when Equal Values on Column and Previous Column
#
#' Value on column when another is equal
#'
#' Introduces a cell value on a column whenever there are two equal values on another column
#
#' @param column1 column to introduce the NAs
#' @param column2 column to distinguish cell values
#' @param value breaking cell value to add on column 1
#' @export
break.column.bycolumn.down <- function(column1, column2, breakValue=NA)
{
  nextColumn2 <- shift.column.down(column2)
  replace(column1, is.na(column2) | is.na(nextColumn2) | column2 != nextColumn2, breakValue)
}

#
#' Introduce value when Equal Values on Column and Next Column
#
#' Value on column when another is equal
#'
#' Introduces a cell value on a column whenever there are two equal values on another column
#
#' @param column1 column to introduce the NAs
#' @param column2 column to distinguish cell values
#' @param value breaking cell value to add on column 1
#' @export
break.column.bycolumn.up <- function(column1, column2, breakValue=NA)
{
  previousColumn2 <- shift.column.up(column2)
  replace(column1, is.na(column2) | is.na(previousColumn2) | column2 != previousColumn2, breakValue)
}
#
#' Remove Equal Rows
#
#' Remove rows that have subsequent equal columns
#
#' Remove rows that have the same value on a given column
#
#' @param dt dataframe
#' @param columnName column to distinguish rows
#' @export
remove.equal.rows <- function(dt, columnName)
{
  print('Function: remove.equal.rows')
  print(paste('Original number of rows:', nrow(dt)))
  column <- dt[[columnName]]
  nextColumn <- shift.column.up(column)
  dt <- dt[is.na(nextColumn) | is.na(column) | column != nextColumn,]
  print(paste('Afterward number of rows:', nrow(dt)))
  dt
}

#
#' Add unique id to dataframe
#
#' Add sequential id to dataframe based on the existing rows
#
#' Add unique id to end of dataframe according to nr of rows
#
#' @param df dataframe
#' @export
add.unique.id <- function(df)
{
  df$id<-seq.int(nrow(df))
  df
}

#
#' Statistics of Column (Values)
#
#' Return statistics of column (summary, stat.desc, psych::describe)
#
#' @export
get.column.stats <- function(df, column)
{
  summaryVal <- summary(column)
  stat.descVal <- stat.desc(column)
  describeVal <- psych::describe(column)
  fiveNumVal <- fivenum(column)

  stats <- c(summaryVal, stat.descVal, describeVal, fiveNumVal)

  statsNames <- c(names(stats)[1:33], "5.1", "5.2", "5.3", "5.4", "5.5")
  i = 1
  for(statsName in statsNames){
    df[, paste(statsName, ".all", sep="")] <- as.numeric(stats[i])
    i = i + 1
  }
  df
}

#
#' Statistics of Column (Values)
#
#' Add statistics of cross sectional column (summary, stat.desc, psych::describe)
#
#' @export
get.cross.column.stats <- function(df, crosscolumn, columnName)
{
  dummy <- c(1,2,3,4,5)
  summaryVal <- summary(dummy)
  stat.descVal <- stat.desc(dummy)
  describeVal <- psych::describe(dummy)
  fiveNumVal <- fivenum(dummy)

  stats <- c(summaryVal, stat.descVal, describeVal, fiveNumVal)
  statsNames <- c(names(stats)[1:33], "5.1", "5.2", "5.3", "5.4", "5.5")
  for(statsName in statsNames){
    df[, paste(statsName, sep="")] <- NA
  }

  for(columnID in unique(crosscolumn)){
    subColumn <- df[crosscolumn == columnID, c(columnName)]
    for(i in 1:length(subColumn)) {
      summaryVal <- summary(subColumn)
      stat.descVal <- stat.desc(subColumn)
      describeVal <- psych::describe(subColumn)
      fiveNumVal <- fivenum(subColumn)

      stats <- c(summaryVal, stat.descVal, describeVal, fiveNumVal)

      statsNames <- c(names(stats)[1:33], "5.1", "5.2", "5.3", "5.4", "5.5")
      i = 1
      for(statsName in statsNames){
        df[crosscolumn == columnID, paste(statsName, sep="")] <- as.numeric(stats[i])
        i = i + 1
      }
    }
  }
  df
}

#
#' Statistics of Mean
#
#' Mean removing NAs and outliers
#
#' @export
meanNoOutliers <- function(x)
{
  x <- x[ x <= boxplot(x, plot = FALSE)$stats[5,] ]
  mean(x, na.rm=TRUE)
}
#
#' Statistics of Sd
#
#' Standard deviation removing NAs and outliers
#
#' @export
sdNoOutliers <- function(x)
{
  x <- x[ x <= boxplot(x, plot = FALSE)$stats[5,] ]
  sd(x, na.rm=TRUE)
}
marcialbaptista/dataUtilities documentation built on May 21, 2019, 11:28 a.m.