##
## [helperfunc.r]
##
## author : Ed Goodwin
## project : helpeR
## createdate : 01.14.2015
##
## description:
## a set of useful functions that I can drop into various projects
##
## version: 0.01
## changelog:
##
require(sqldf)
require(dplyr)
as.numeric.com = function(y) {
## converts string with commas into single number
as.numeric(gsub(",","", y))
}
find.freq <- function(x) {
n <- length(x)
spec <- spec.ar(c(x),plot=FALSE)
if(max(spec$spec)>10) # Arbitrary threshold chosen by trial and error.
{
period <- round(1/spec$freq[which.max(spec$spec)])
if(period==Inf) # Find next local maximum
{
j <- which(diff(spec$spec)>0)
if(length(j)>0)
{
nextmax <- j[1] + which.max(spec$spec[j[1]:500])
period <- round(1/spec$freq[nextmax])
}
else
period <- 1
}
}
else
period <- 1
return(period)
}
## convert factor vector to numeric
factor2numeric = function(facvect) {
numeric = as.numeric(levels(facvect)[facvect])
numeric
}
## useful for using ggplot with forecast package (Holt-Winters, etc.)
#--Produces a data.frame with the Source Data+Training Data, Fitted Values+Forecast Values, forecast data Confidence Intervals
funggcast<-function(dn,fcast){
require(zoo) #needed for the 'as.yearmon()' function
en<-max(time(fcast$mean)) #extract the max date used in the forecast
#Extract Source and Training Data
ds<-as.data.frame(window(dn,end=en))
names(ds)<-'observed'
ds$date<-as.Date(time(window(dn,end=en)))
#Extract the Fitted Values (need to figure out how to grab confidence intervals)
dfit<-as.data.frame(fcast$fitted)
dfit$date<-as.Date(time(fcast$fitted))
names(dfit)[1]<-'fitted'
ds<-merge(ds,dfit,all.x=T) #Merge fitted values with source and training data
#Exract the Forecast values and confidence intervals
dfcastn<-as.data.frame(fcast)
dfcastn$date<-as.Date(as.yearmon(row.names(dfcastn)))
names(dfcastn)<-c('forecast','lo80','hi80','lo95','hi95','date')
pd<-merge(ds,dfcastn,all.x=T) #final data.frame for use in ggplot
return(pd)
}
## return the days of month in a year; need to improve to handle leap years
monthdays = function(year) {
rep(c(31,28,31,30,31,30,31,31,30,31,30,31),1)
monthdays[26 + (4*12)*(0:2)] <- 29 # leap years
}
## return number of days in month based on a date
numberOfDays <- function(date) {
m <- format(date, format="%m")
while (format(date, format="%m") == m) {
date <- date + 1
}
return(as.integer(format(date - 1, format="%d")))
}
## forecast accuracy functions
mape = function(y,yhat) {
mean(abs(y-yhat)/y)
}
smape = function(y, yhat) {
mean(200*abs(y-yhat)/(y+yhat))
}
# ## panel histogram for correlation matrix
# panel.hist = function(x){
# usr = par(usr)
# on.exit(par(usr))
# par(usr = c(usr[1:2], 0, 1.5))
# h = hist(x, plot=FALSE)
# breaks = h$breaks; nB = length(breaks)
# y = h$counts; y = y/max(y)
# rect(breaks[-1], y, col=“cyan”)
# }
## example: paris(credit[,-(4:5)], diag.panel=panel.hist)
## Correlation matrix with p-values. See http://goo.gl/nahmV for documentation of this function
cor.prob <- function (X, dfr = nrow(X) - 2) {
R <- cor(X, use="pairwise.complete.obs")
above <- row(R) < col(R)
r2 <- R[above]^2
Fstat <- r2 * dfr/(1 - r2)
R[above] <- 1 - pf(Fstat, 1, dfr)
R[row(R) == col(R)] <- NA
R
}
## Use this to dump the cor.prob output to a 4 column matrix
## with row/column indices, correlation, and p-value.
## See StackOverflow question: http://goo.gl/fCUcQ
flattenSquareMatrix <- function(m) {
if( (class(m) != "matrix") | (nrow(m) != ncol(m))) stop("Must be a square matrix.")
if(!identical(rownames(m), colnames(m))) stop("Row and column names must be equal.")
ut <- upper.tri(m)
data.frame(i = rownames(m)[row(m)[ut]],
j = rownames(m)[col(m)[ut]],
cor=t(m)[ut],
p=m[ut])
}
# # Multiple plot function
# #
# # ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# # - cols: Number of columns in layout
# # - layout: A matrix specifying the layout. If present, 'cols' is ignored.
# #
# # If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# # then plot 1 will go in the upper left, 2 will go in the upper right, and
# # 3 will go all the way across the bottom.
# #
# multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
# library(grid)
#
# # Make a list from the ... arguments and plotlist
# plots <- c(list(...), plotlist)
#
# numPlots = length(plots)
# p = 0
# # If layout is NULL, then use 'cols' to determine layout
# if (is.null(layout)) {
# # Make the panel
# # ncol: Number of columns of plots
# # nrow: Number of rows needed, calculated from # of cols
# layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
# ncol = cols, nrow = ceiling(numPlots/cols))
# }
#
# if (numPlots==1) {
# p = (plots[[1]])
#
# } else {
# # Set up the page
# grid.newpage()
# pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
#
# # Make each plot, in the correct location
# for (i in 1:numPlots) {
# # Get the i,j matrix positions of the regions that contain this subplot
# matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
#
# p = plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
# layout.pos.col = matchidx$col)
# }
# }
# return(p)
# }
## read small subset of the data for a file
## uses sqldf so that you can read large data files
readsubsetfile = function(fname, nlines) {
sqlq = paste("select * from ", fname, "order by random(*) limit ", nlines)
subfile = read.csv.sql(fname, sql = sqlq)
subfile
}
# ## number of weekdays between two dates
# Dates1 <- as.Date("2011-01-30") + rep(0, 10)
# Dates2 <- as.Date("2011-02-04") + seq(0, 9)
# Nweekdays(Dates1, Dates2)
Nweekdays <- Vectorize(function(a, b)
sum(!weekdays(seq(a, b, "days")) %in% c("Saturday", "Sunday")))
## create test/training sets from dataframes
testtrainsplit = function(df, trainsize) {
#Sample Indexes
indexes = sample(1:nrow(df), size=trainsize*nrow(df))
# Split data
test = df[indexes,]
train = df[-indexes,]
return(c(train, test))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.