Steps

  1. If no package dir exists go to E:/RProjects. Place this .rmd in that dir.
  2. Create the package (if not exists), use create_package(). This creates the NAMESPACE and the project.
  3. If the package exists run the Rproject from the project directory (E:/RProjects/mlhelper)
  4. Copy functions as R files into the R directory, together with docs. Also copy the data docs and package doc as R file into the R directory.
  5. Add dependencies as use_package()
  6. Add data as use_data()
  7. Create documentation (man) using document()
  8. Upload the new NAMESPACE, DESCRIPTION, man, R, data and this file to GitHub

IMPORTANT ISSUES: In the DESCRIPTION file I had to manually change the package name to mlhelper.

# Development tooling needed to create, document, check and build the package.
dev_packages <- c("devtools", "knitr", "roxygen2", "testthat", "rmarkdown", "usethis")

# dependencies to add
dep_packages <- c("dplyr", "ggplot2", "purrr", "moments", "gridExtra", "grid")

# Install whatever is missing, then attach every package. `require()` alone
# leaves a freshly installed package unattached, so the first run of this
# script would carry on without it; `library()` also fails loudly when the
# installation did not succeed.
for (pkg in c(dev_packages, dep_packages)) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}

CHAPTER 1 - Package management

# One-time setup: run only when the package skeleton does not exist yet.
# The package is created in a directory called "mlhelper", relative to the
# current working directory; afterwards the active project is reported.
path <- file.path("mlhelper")
usethis::create_package(path)
usethis::proj_get()
# Small example data set with missing values in Temp and Weather.
weather <- data.frame(
  Day = c(1, 2, 3, 4, 5, 6, 7),
  Temp = c(22, 19, 18, 24, NA, 21, 14),
  Weather = c("Snow", "Snow", "Sun", NA, "Rain", "Snow", "Rain")
)

#' Random Weather Data
#'
#' A dataset containing randomly generated weather data.
#'
#' @format A data frame of 7 rows and 3 columns
#' \describe{
#'  \item{Day}{Numeric values giving day of the week, 1 = Monday, 7 = Sunday}
#'  \item{Temp}{Numeric values giving temperature in degrees Celsius}
#'  \item{Weather}{Character values describing the weather on that day}
#' }
#' @source Randomly generated data
# Store the weather data set in the package's data directory
usethis::use_data(weather, overwrite = TRUE)

# Register every library as a dependency in the DESCRIPTION file
for (dependency in c("dplyr", "purrr", "moments", "ggplot2", "gridExtra", "grid")) {
  usethis::use_package(dependency)
}

# Generate the man pages and update NAMESPACE
#use_roxygen_md()
devtools::document()

# Check that the generated help page is available
utils::help("multiplot")

CHAPTER 1 :: package management

# List the R directory before any function file has been added
list.files("mlhelper/R")

# dump() can write each function into its own file in the R directory
#dump("numeric.summary", file = "mlhelper/R/numeric.summary.R")
#dump("df.numeric.summary", file = "mlhelper/R/df.numeric.summary.R")
#dump("preprocess.NAs", file = "mlhelper/R/preprocess.NAs.R")
#dump("rmse", file = "mlhelper/R/rmse.R")
#dump("plot.histogram", file = "mlhelper/R/plot.histogram.R")
#dump("plot.density", file = "mlhelper/R/plot.density.R")
#dump("do.plots", file = "mlhelper/R/do.plots.R")


# List the R directory again to confirm the files are in place
list.files("mlhelper/R")
## Save the following block as mlhelper.R in the package's R directory.
## It holds the package-level documentation.

#' mlhelper: Helper functions for EDA and ml
#'
#' Collection of helper functions for EDA and ml
#'
#' @docType package
#' @name mlhelper
"_PACKAGE"
# Run the package checks
devtools::check("mlhelper")

# Build the package
devtools::build("mlhelper")

# List the current directory to see what the build produced
list.files()

CHAPTER 2 :: functions

# Draw several plot objects on one page.
#
# Args:
#   ...      plot objects passed directly.
#   plotlist optional list of plot objects, appended to those given in `...`.
#   file     accepted for compatibility; the body never reads it.
#   cols     number of columns used to derive the layout when `layout` is NULL.
#   layout   optional matrix; the cells equal to i mark where plot i is drawn.
#
# Returns (invisibly) NULL when there is nothing to draw or when more than one
# plot is drawn; with exactly one plot, the value of print() for that plot.
multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {
  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)
  num_plots <- length(plots)

  # Nothing to draw. Without this guard `1:num_plots` would count down
  # (1, 0) and index a plot that does not exist.
  if (num_plots == 0) {
    return(invisible(NULL))
  }

  # If layout is NULL, derive it from `cols`:
  # ncol = requested columns, nrow = rows needed to hold every plot.
  if (is.null(layout)) {
    n_rows <- ceiling(num_plots / cols)
    layout <- matrix(seq_len(cols * n_rows), ncol = cols, nrow = n_rows)
  }

  if (num_plots == 1) {
    print(plots[[1]])
  } else {
    # Set up the page
    grid::grid.newpage()
    grid::pushViewport(
      grid::viewport(layout = grid::grid.layout(nrow(layout), ncol(layout)))
    )

    # Make each plot, in the correct location
    for (i in seq_len(num_plots)) {
      # Row/column positions of the layout cells that hold this plot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

      print(plots[[i]], vp = grid::viewport(layout.pos.row = matchidx$row,
                                            layout.pos.col = matchidx$col))
    }
  }
}
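#' Multiple plot function
#'
#' ggplot objects can be passed in directly (...), or as a list of ggplot objects (plotlist)
#'
#' @param ... ggplot objects
#' @param plotlist optional list of ggplot objects, appended to those passed in ...
#' @param file not used by the function body
#' @param cols number of columns; only used when layout is NULL
#' @param layout optional matrix giving the position of each plot; derived from cols when NULL
#' @import grid
#' @return This function draws the plots in the specified layout
#' @export
#' @examples
#' multiplot(p1, p2, cols=1)
#' @author Eva Szin Takacs, \email{szin.takacs.eva@gmail.com}
#' @seealso \code{\link[gridExtra]{grid.arrange}}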
# Summarise one numeric vector (e.g. df$column) as a data frame of statistics.
#
# Args:
#   x     numeric vector to summarise; anything else raises an error.
#   na.rm logical, forwarded to every statistic. Defaults to TRUE, matching
#         df.numeric.summary(), so the function can be called with `x` alone.
#
# Returns a data frame with columns MIN, Q1, MEAN, MEDIAN, Q3, IQR, SD, MAX,
# SKEWNESS and SKEW ("RIGHT" when MEAN > MEDIAN, otherwise "LEFT").
numeric.summary <- function(x, na.rm = TRUE) {

  # Include an error if x is not numeric
  if (!is.numeric(x)) {
    stop("Data must be numeric")
  }

  # Create data frame; MEAN, MEDIAN, SD and SKEWNESS are rounded to 2 decimals
  data.frame(
    MIN = min(x, na.rm = na.rm),
    Q1 = quantile(x, 0.25, na.rm = na.rm),
    MEAN = round(mean(x, na.rm = na.rm), 2),
    MEDIAN = round(median(x, na.rm = na.rm), 2),
    Q3 = quantile(x, 0.75, na.rm = na.rm),
    IQR = IQR(x, na.rm = na.rm),
    SD = round(sd(x, na.rm = na.rm), 2),
    MAX = max(x, na.rm = na.rm),
    SKEWNESS = round(moments::skewness(x, na.rm = na.rm), 2)
  ) %>%
    dplyr::mutate(SKEW = ifelse(MEAN > MEDIAN, "RIGHT", "LEFT"))
}
#' Numeric Summaries
#'
#' Summarises numeric data and returns a data frame containing the basic summary values.
#'
#' @param x a numeric vector containing the values to summarize.
#' @param na.rm A logical indicating whether missing values should be removed.
#' @import dplyr
#' @import moments
#' @return This function returns a one-row \code{data.frame} including columns:
#' \itemize{
#'  \item MIN
#'  \item Q1
#'  \item MEAN
#'  \item MEDIAN
#'  \item Q3
#'  \item IQR
#'  \item SD
#'  \item MAX
#'  \item SKEWNESS
#'  \item SKEW
#' }
#' @export
#' @examples
#' numeric.summary(iris$Sepal.Length)
#' numeric.summary(airquality$Wind, na.rm = FALSE)
#' @author Eva Szin Takacs, \email{szin.takacs.eva@gmail.com}
#' @seealso \code{\link[base]{summary}}
numeric.summary(iris$Sepal.Length, na.rm=TRUE)
# Summarise every numeric column of a data frame with numeric.summary().
#
# Args:
#   x     data frame; non-numeric columns are dropped before summarising.
#   na.rm logical, forwarded to numeric.summary() for every column.
#
# Returns the per-column summaries bound into one data frame, with the
# column name stored in `ID`.
df.numeric.summary <- function(x, na.rm = TRUE) {

  numeric_cols <- select_if(x, .predicate = is.numeric)

  # Forward the caller's na.rm instead of a hard-coded TRUE
  map_df(numeric_cols, .f = numeric.summary, na.rm = na.rm, .id = "ID")

}
#' Summary of Numeric Columns
#'
#' Generate specific summaries of numeric columns in a data frame
#' 
#' @param x A data frame. Non-numeric columns will be removed
#' @param na.rm A logical indicating whether missing values should be removed
#' @import purrr
#' @import dplyr
#' @return This function returns a \code{data.frame} with one row per numeric column, including columns:
#'  \itemize{
#'  \item ID
#'  \item MIN
#'  \item Q1
#'  \item MEAN
#'  \item MEDIAN
#'  \item Q3
#'  \item IQR
#'  \item SD
#'  \item MAX
#'  \item SKEWNESS
#'  \item SKEW
#' }
#' @export
#' @examples
#' df.numeric.summary(iris)
#' df.numeric.summary(airquality, na.rm = FALSE)
#' @author Eva Szin Takacs, \email{szin.takacs.eva@gmail.com}
#' @seealso \link[base]{summary}
df.numeric.summary(iris, na.rm = TRUE)
# Replace missing values in the numeric and factor columns of a data frame.
#
# Args:
#   df        data frame; only numeric and factor columns are touched.
#   numeric.v replacement for NAs in numeric columns (default 0).
#   factor.v  replacement for NAs in factor columns (default "None"); it is
#             added as a level of every factor column when not already present.
#
# Returns `df` with the NAs of those columns replaced.
preprocess.NAs <- function(df, numeric.v = 0, factor.v = "None") {

  # vapply() always yields a logical vector, even for a zero-column df
  is_num <- vapply(df, is.numeric, logical(1))
  is_fac <- vapply(df, is.factor, logical(1))

  # for numeric features replace missing values with numeric.v
  for (feature in names(df)[is_num]) {
    df[[feature]][is.na(df[[feature]])] <- numeric.v
  }

  # for factor features register factor.v as a level, then fill the NAs with it
  for (feature in names(df)[is_fac]) {
    missing <- is.na(df[[feature]])
    # union() avoids a duplicated-level error when factor.v is already a level
    levels(df[[feature]]) <- union(levels(df[[feature]]), factor.v)
    df[[feature]][missing] <- factor.v
  }

  df
}
#' Handling missing values in a df
#'
#' Replaces numeric and factor columns NAs with user defined values
#' 
#' @param df A data frame. numeric and factor columns will be considered
#' @param numeric.v Numeric value, NAs will be replaced with. Default is 0.
#' @param factor.v Factor value, NAs will be replaced with. Default is None.
#' @return This function returns the initial \code{data.frame} with NAs replaced
#'
#' @export
#' @examples
#' preprocess.NAs(weather, 0, "None")
#' @author Eva Szin Takacs, \email{szin.takacs.eva@gmail.com}
preprocess.NAs(weather, 0, "None")
# RMSE - Calculates the Root Mean Squared Error
#
# Params: actual     - actual values
#         predicted  - predicted values
# Return: the square root of the mean squared difference between
#         actual and predicted

rmse <- function(actual, predicted) {
  squared_errors <- (actual - predicted)^2
  sqrt(mean(squared_errors))
}
# Bar chart of the value counts of one feature of a data frame.
#
# Args:
#   df      data frame holding the feature.
#   feature column name (string) or column position of the feature.
#
# Returns the ggplot object.
#
# NOTE(review): the name follows the S3 pattern plot.<class> for class
# "histogram" - confirm it does not clash with the graphics method when exported.
plot.histogram <- function(df, feature) {
  # colnames(df)[feature] is NA for a character `feature`, so use the name
  # itself in that case and look the name up only for a position.
  x_label <- if (is.character(feature)) feature else colnames(df)[feature]

  data <- data.frame(feature = df[[feature]])

  ggplot(data = data, aes(x = factor(feature))) +
    stat_count() +
    labs(title = "", x = x_label, y = "") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 5),
          axis.title.x = element_text(size = 8))
}
#' Histogram of a factor feature in a df
#'
#' Plots a basic histogram for a factor feature in a given df
#' 
#' @param df A data frame. numeric and factor columns will be considered
#' @param feature Name (as a string) or position of the factor feature
#' @return Histogram of the feature
#'
#' @export
#' @examples
#' plot.histogram(weather, "Weather")
#' @author Eva Szin Takacs, \email{szin.takacs.eva@gmail.com}
#' 
plot.histogram(weather, "Weather")
# Density plot of one numeric feature of a data frame.
#
# Args:
#   df      data frame holding the feature.
#   feature column name (string) or column position of the feature.
#
# Returns the ggplot object; the x-axis title shows the feature name and its
# skewness (NAs removed, rounded to 2 decimals).
plot.density <- function(df, feature){
  # colnames(df)[feature] is NA for a character `feature`, so use the name
  # itself in that case and look the name up only for a position.
  feature_label <- if (is.character(feature)) feature else colnames(df)[feature]
  skew <- round(moments::skewness(df[[feature]], na.rm = TRUE), 2)

  data <- data.frame(feature = df[[feature]])

  ggplot(data = data) +
    geom_line(aes(x = feature), stat = 'density', size = 1, alpha = 1.0) +
    # scale_y_continuous(labels = scales::comma)+
    labs(title = "", x = paste0(feature_label, '\n', 'Skewness:', skew), y = "") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5),
          axis.title.x = element_text(size = 8))
}
#' Density plot of a numeric feature in a df
#'
#' Plots a density function of a numeric feature in a given df
#' 
#' @param df A data frame. Numeric columns will be considered
#' @param feature Name (as a string) or position of the numeric feature
#' @return Density plot of the numeric feature
#'
#' @export
#' @examples
#' plot.density(weather, "Temp")
#' @author Eva Szin Takacs, \email{szin.takacs.eva@gmail.com}
#' 
plot.density(iris, "Sepal.Length")
# Build one plot per entry of `nr` and arrange the plots in a grid.
#
# Args:
#   df    data frame handed to `funct` as `df`.
#   funct plotting function called as funct(df = , feature = ).
#   nr    the features to plot; each entry is passed to `funct` as `feature`.
#   ncols number of grid columns, passed to grid.arrange() as `ncol`.
#
# Returns whatever grid.arrange() returns.
do.plots <- function(df, funct, nr, ncols=3) {
  plots <- vector("list", length(nr))
  slot <- 0L
  for (feature_id in nr) {
    slot <- slot + 1L
    # assign via a one-element list so a NULL result keeps its slot
    plots[slot] <- list(funct(df = df, feature = feature_id))
  }
  do.call("grid.arrange", c(plots, ncol = ncols))
}
#' List of plots in a grid arrangement
#'
#' Grid of plots depending on the function (\code{plot.histogram} or \code{plot.density}) for a given df
#' 
#' @param df A data frame.
#' @param funct function name. Tested for(\code{plot.histogram} or \code{plot.density}).
#' @param nr A range number of plots.
#' @param ncols Number of columns of the grid arrangement.
#'
#' @return List of plots in a grid arrangement
#'
#' @export
#' @examples
#' do.plots(iris, funct=plot.density, nr=1:4, ncols=3)
#' @author Eva Szin Takacs, \email{szin.takacs.eva@gmail.com}
#' 
do.plots(iris, func=plot.histogram, nr=1:4, ncols=3)

CHAPTER 3 :: loading the package

# Load the locally built package and try one of its functions
library(mlhelper, lib.loc = "e:/RProjects")
mlhelper::plot.histogram(weather, "Weather")

# Alternatively install the package from GitHub and load it
devtools::install_github("szintakacseva/mlhelper")
library(mlhelper)

# test it
df.numeric.summary(iris)
preprocess.NAs(weather, 0, "None")

p3 <- ggplot2::ggplot(mtcars, ggplot2::aes(x = mpg)) + ggplot2::geom_histogram()
p4 <- ggplot2::ggplot(mtcars, ggplot2::aes(x = cyl)) + ggplot2::geom_histogram()
multiplot(p3, p4, cols = 1)


szintakacseva/mlhelper documentation built on June 4, 2019, 6:22 p.m.