# set the knitr options ... for everyone!
# if you unset this, then vignette build bonks. oh, joy.
#opts_knit$set(progress=TRUE)
opts_knit$set(eval.after='fig.cap')
# for a package vignette, you do want to echo.
# opts_chunk$set(echo=FALSE,warning=FALSE,message=FALSE)
opts_chunk$set(warning=FALSE,message=FALSE)
#opts_chunk$set(results="asis")
opts_chunk$set(cache=TRUE,cache.path="cache/")

#opts_chunk$set(fig.path="figure/",dev=c("pdf","cairo_ps"))
opts_chunk$set(fig.path="figure/",dev=c("png"))
opts_chunk$set(fig.width=7,fig.height=6,dpi=100,out.width='700px',out.height='600px')

# doing this means that png files are made of figures;
# the savings is small, and it looks like shit:
#opts_chunk$set(fig.path="figure/",dev=c("png","pdf","cairo_ps"))
#opts_chunk$set(fig.width=4,fig.height=4)
# for figures? this is sweave-specific?
#opts_knit$set(eps=TRUE)

# this would be for figures:
#opts_chunk$set(out.width='.8\\textwidth')
# for text wrapping:
options(width=160,digits=2)
opts_chunk$set(size="small")
opts_chunk$set(tidy=TRUE,tidy.opts=list(width.cutoff=50,keep.blank.line=TRUE))
library(ggplot2)
library(fromo)
library(dplyr)
library(moments)
library(microbenchmark)

fromo timings

To prevent performance regressions, compare them here, including benchmarks against other packages:

library(fromo)
library(RollingWindow)
library(roll)
library(RcppRoll)
library(microbenchmark)
library(moments)
library(RcppParallel)
# keep this constant for comparison
setThreadOptions(numThreads = 2)

print(Sys.info())
print(sessionInfo())

set.seed(12345)
x    <- rnorm(1e5)
xpos <- runif(length(x)) +1
xm   <- matrix(x,ncol=1)
xmps <- matrix(xpos,ncol=1)
w    <- runif(length(x))

dumbk <- function(x) { c(kurtosis(x) - 3.0,skewness(x),sd(x),mean(x),length(x)) }

checkit <- microbenchmark(sum(x),
                             mean(x),
                             sd(x),
               skewness(x),
               kurtosis(x),
               sd3(x), 
               skew4(x),
                             kurt5(x),
               dumbk(x))
print(checkit)
resdf <- checkit
gc()
# weights
slow_sd <- function(x,w) {
    n0 <- length(x)
    mu <- weighted.mean(x,w=w)
    sg <- sqrt(sum(w * (x-mu)^2)/(n0 - 1))
    c(sg,mu,n0)
}
checkit <- microbenchmark(cent_moments(x,max_order=4,wts=w,na_rm=TRUE,normalize_wts=FALSE),
                             sd3(x,wts=w), 
                             slow_sd(x,w))
print(checkit)
resdf <- rbind(resdf,checkit)
gc()
set.seed(12345)
x1 <- runif(1e4)
x2 <- runif(length(x1))

checkit <- microbenchmark(as.centsums(x1,1),
                             as.centsums(x1,2),
                             as.centsums(x1,3),
                             as.centsums(x1,4))
print(checkit)
resdf <- rbind(resdf,checkit)
gc()

# join them together
max_ord <- 6L
obj1 <- as.centsums(x1,max_ord)
obj2 <- as.centsums(x2,max_ord)
obj3 <- as.centsums(c(x1,x2),max_ord)

checkit <- microbenchmark(c(obj1,obj2),
                             obj3 %-% obj1)
print(checkit)
resdf <- rbind(resdf,checkit)
gc()

max_ord <- 6L
rs1 <- cent_sums(x1,max_ord)
rs2 <- cent_sums(x2,max_ord)
rs3 <- cent_sums(c(x1,x2),max_ord)

checkit <- microbenchmark(join_cent_sums(rs1,rs2),
                             unjoin_cent_sums(rs3,rs2),
                             unjoin_cent_sums(rs3,rs1))
print(checkit)
resdf <- rbind(resdf,checkit)
gc()
set.seed(54321)
x1 <- matrix(rnorm(100*4),ncol=4)
x2 <- matrix(rnorm(100*4),ncol=4)

max_ord <- 2L

# join them together
mobj1 <- as.centcosums(x1,max_ord)
mobj2 <- as.centcosums(x2,max_ord)
mobj3 <- as.centcosums(rbind(x1,x2),max_ord)
alt3 <- c(mobj1,mobj2)
# unjoin them, with this one weird operator:
alt2 <- mobj3 %-% mobj1
alt1 <- mobj3 %-% mobj2

checkit <- microbenchmark(as.centcosums(x1,max_ord),
                             mobj3 %-% mobj1)
print(checkit)
resdf <- rbind(resdf,checkit)
gc()
set.seed(4422)
x    <- rnorm(1e4)
xpos <- runif(length(x)) +1
xm   <- matrix(x,ncol=1)
xmps <- matrix(xpos,ncol=1)
w    <- runif(length(x))

dumb_zscore <- function(x,window) {
    altz <- sapply(seq_along(x),function(iii) { 
        rowi <- max(1,iii - window + 1)
        xrang <- x[rowi:iii]
        (x[iii] - mean(xrang)) / sd(xrang)
  },simplify=TRUE)
}

# run fun on each wins sized window...
silly_fun <- function(x,wins,fun,...) {
    xout <- rep(NA,length(x))
    for (iii in seq_along(x)) {
        xout[iii] <- fun(x[max(1,iii-wins+1):iii],...)
    }
    xout
}

wins <- 250

checkit <- microbenchmark(silly_fun(x,wins,sum,na.rm=FALSE),
                             silly_fun(x,wins,mean,na.rm=FALSE),
                             running_sum(x,wins),
                             running_mean(x,wins),
                             roll::roll_sum(xm,wins),
                             roll::roll_mean(xm,wins),
                             roll::roll_sd(xm,wins),
                             RollingWindow::RollingSum(x,wins,na_method='ignore'),
                             RollingWindow::RollingSum(x,wins),
                             RollingWindow::RollingMean(x,wins),
                             RollingWindow::RollingStd(x,wins),
                             RcppRoll::roll_sum(xm,n=wins,align='right',fill=NA),
                             RcppRoll::roll_mean(xm,n=wins,align='right',fill=NA),
                             RcppRoll::roll_sd(xm,n=wins,align='right',fill=NA),
                             running_sd(x,wins,na_rm=FALSE,restart_period=50000L),
                             running_sd(x,wins,na_rm=TRUE,restart_period=1000L),
                             running_sd3(x,wins),
                             running_skew(x,wins),
                             running_skew4(x,wins),
                             running_kurt(x,wins),
                             running_kurt5(x,wins),
                             running_tstat(x,wins),
                             running_zscored(x,wins),
                             running_sharpe(x,wins),
                             running_apx_median(x,wins),
                             running_centered(x,wins),
                             running_scaled(x,wins)
                             )
print(checkit)
resdf <- rbind(resdf,checkit)
gc()
library(readr)
FAKE_IT <- FALSE
if (FAKE_IT) {
    resdf <- readr::read_csv('timings.csv')
    #print(resdf)
} else {
    readr::write_csv(resdf,'timings.csv')
}

performance regressions?

Can we see them here? load all the timing data to check.

library(magrittr)
library(dplyr)
library(readr)
library(tidyr)
library(knitr)

mysernum <- as.character(packageVersion('fromo') )
allt <- data.frame(fname=dir('.','*.csv'),stringsAsFactors=FALSE) %>%
        dplyr::filter(grepl('^timings_\\d.+\\d+.csv$',fname)) %>%
        group_by(fname) %>%
            mutate(tims=list(readr::read_csv(fname))) %>%
        ungroup() %>%
        tidyr::unnest() %>%
        mutate(sernum=gsub('^timings_(.+).csv$','\\1',fname)) %>%
        dplyr::select(-fname) %>%
        rbind(resdf %>% dplyr::mutate(sernum=mysernum)) %>%
        group_by(sernum,expr) %>%
            summarize(meantime=mean(time,na.rm=TRUE)) %>%
        ungroup() %>%
        group_by(sernum) %>%
            mutate(sumx_time=median(ifelse(grepl('^sum\\(x\\)$',expr),meantime,NA),na.rm=TRUE)) %>%
        ungroup() %>%
        mutate(normalized=meantime / sumx_time) %>%
        arrange(sernum) %>%
        group_by(expr) %>%
            mutate(first_norm=first(normalized),
                         last_norm=last(normalized)) %>%
        ungroup() %>%
        mutate(relchange=normalized / first_norm,
                     last_status=last_norm / first_norm) %>%
        mutate(exgrp=ifelse(grepl('^(roll|RollingWindow|RcppRoll)::',expr),"brand_x",
                                                ifelse(grepl('^running_',expr),"running","summarizing")))

library(ggplot2)
ph <- allt %>%
        dplyr::filter(!grepl('brand_x',exgrp)) %>%
        ggplot(aes(sernum,normalized,group=expr,color=expr)) +
        geom_line() + geom_point() +
        scale_y_log10() + guides(colour=FALSE) + 
        theme(axis.text.x=element_text(angle = -90, hjust = 0)) + 
        facet_grid(exgrp ~ .,scales='free') +
        labs(x='release',
                 y='mean time taken, relative to sum(x)',
                 title='fromo microbenchmark timings, lasagna')
print(ph)

ph <- allt %>%
        dplyr::filter(!grepl('brand_x',exgrp)) %>%
        ggplot(aes(sernum,relchange,group=expr,color=expr)) +
        geom_line() + geom_point() +
        scale_y_log10() + guides(colour=FALSE) + 
        theme(axis.text.x=element_text(angle = -90, hjust = 0)) + 
        facet_grid(exgrp ~ .,scales='free') +
        labs(x='release',
                 y='normalized time taken, relative to first iteration',
                 title='fromo microbenchmark timings, spaghetti')
print(ph)

ph <- allt %>%
        dplyr::filter(!grepl('brand_x',exgrp)) %>%
        ggplot(aes(sernum,relchange)) +
        geom_boxplot(aes(group=sernum),alpha=0.7) + 
        stat_summary(aes(group="1",color='mean'),fun.y=mean,geom="line") + 
        scale_y_log10() + 
        theme(axis.text.x=element_text(angle = -90, hjust = 0)) + 
        facet_grid(exgrp ~ .,scales='free') +
        labs(x='release',
                 y='normalized time taken, relative to first iteration',
                 color='stat',
                 title='fromo microbenchmark timings, boxplots')
print(ph)

allt %>% 
    dplyr::filter(!grepl('brand_x',exgrp)) %>%
    select(expr,sernum,relchange,last_status) %>%
    tidyr::spread(key='sernum',value='relchange') %>%
    arrange(desc(last_status)) %>%
    select(-last_status) %>%
    head(n=50) %>%
    kable()

allt %>% 
    dplyr::filter(!grepl('brand_x',exgrp)) %>%
    select(-sernum,-relchange,-meantime,-sumx_time,-normalized) %>%
    distinct(expr,.keep_all=TRUE) %>%
    arrange(desc(last_status)) %>%
    head(n=20) %>%
    kable()


shabbychef/fromo documentation built on April 11, 2021, 11:03 p.m.