# Chunk defaults for this vignette: collapse source and output into a single
# block and print results without a comment prefix.
knitr::opts_chunk$set(collapse = TRUE, comment = "")
String Chunking

basic string processing functions

library(keysandstrings)

# chunk_string() on a single 12-character string with chunk_len = 3.
chunk_string("aaabbbcccddd", chunk_len = 3)

# split_key() on a vector of three 6-character keys, same chunk length.
split_key(
  c("aaabbb", "aaaccc", "bbbaaa"),
  chunk_len = 3
)
level coder

Replaces the factor levels in a data frame with codes that have a consistent number of characters.
This feature helps with grouped time series in the hts package, which requires keys of consistent length.

# Example frame: four columns of 8 rows that mix ordinary levels with NA and
# empty strings.
data <- data.frame(
  w = rep("wkey1", 8),
  x = rep(c("xkey1", "xkey2", "xkey3", NA), 2),
  y = c(rep(c("ykey1", "ykey2", "ykey3"), 2), "ykey1", ""),
  z = c(rep(c("zkey1", NA, ""), 2), "zkey1", "")
)

# Fit the coder on the frame and apply it back to the same frame.
lc <- level_coder(data)
x <- predict(lc, data, verbose = TRUE)
print(x)

# Feed the coded result back in with rev = TRUE
# (presumably restores the original levels -- confirm against level_coder docs).
predict(lc, x, rev = TRUE)

# Same frame, this time requesting the primary key, then passing that key
# back in flagged as a primary key.
key <- predict(lc, data, return_primkey = TRUE)
print(key)
predict(lc, key, is_primkey = TRUE)

# NOTE(review): z is created and tagged with a "primkey" attribute but is not
# passed to any call in this chunk; the predict() below uses key[8] instead.
# Confirm whether z was meant to be the input.
z <- structure("aaazzzzzzzzz", primkey = TRUE)
predict(lc, key[8], is_primkey = TRUE)
TS Features

Get lagged features from time series data. Works on a data frame or a time series object. Uses keys and a data column. Handles missing values by carrying the next value forward. This might be problematic for seasonal time series, so it's better to deal with NAs before running this.

# First two and last two rows of ts_features() applied to the built-in
# AirPassengers series.
head(ts_features(AirPassengers), n = 2)
tail(ts_features(AirPassengers), n = 2)




# Settings handed to ts_features(): which columns hold time, key and value,
# how many future/past steps to request, and the NA fill value.
params <- list(
  time_col = "date",
  key_col = "key",
  val_col = "val",
  n_future = 2,
  n_past = 2,
  na_fill = NA
)

# Two keys ("a" and "b"), each with dates 1..10.
data <- data.frame(
  date = rep(1:10, 2),
  key = rep(c("a", "b"), each = 10),
  val = c(1:10, 2:11)
)
ts_features(data, params)

# Back-fill demonstration: key "a" has no row for date 1 (9 rows, versus 10
# rows for key "b").
data <- data.frame(
  date = c(2:10, 1:10),
  key = rep(c("a", "b"), times = c(9, 10)),
  val = c(2:10, 1:10)
)
new_data <- ts_features(data, params)

# Show the result sorted by key, then by date.
new_data[order(new_data$key, new_data$date), ]

Key Scaler

Center and scale new data, and reverse the centering and scaling, with an optional primary key (creates a scale for each column and each set of keyed rows).

# 80-row numeric matrix: a key column (40 ones, 39 twos, one 3) plus three
# value columns. val1 ends in NA and val3 is constant.
data <- as.matrix(data.frame(
  key = rep(c(1, 2, 3), times = c(40, 39, 1)),
  val1 = c(1:79, NA),
  val2 = 1:80,
  val3 = rep(1, 80)
))
head(data)

# Fit the scaler using the "key" column as the grouping key.
ks <- keyscale(data, key_col = "key")

# scaling
x <- predict(ks, data)
head(x)

# unscaling
# NOTE(review): this call has the same form as the scaling call above; confirm
# how predict() is told to reverse the scaling.
head(predict(ks, x))
# case when there is a new key
# NOTE(review): this matrix (which contains key 4) is overwritten by the next
# assignment before anything reads it -- confirm whether it was meant to be
# passed to predict().
data <- as.matrix(data.frame(
  key = rep(c(1, 2, 4), times = c(2, 2, 1)),
  val1 = c(rep(1:2, 2), 55),
  val2 = c(rep(1:2, 2), 55),
  val3 = c(rep(1:2, 2), 55)
))

# 80-row matrix with keys 1, 2 and 3; val1 ends in NA and val3 is constant.
data <- as.matrix(data.frame(
  key = rep(c(1, 2, 3), times = c(40, 39, 1)),
  val1 = c(1:79, NA),
  val2 = 1:80,
  val3 = rep(1, 80)
))
head(data)

# Fit on the first two columns only (key and val1) ...
ks <- keyscale(data[, 1:2], key_col = "key")

# scaling
# ... then apply the scaler to the full four-column matrix.
head(predict(ks, data, verbose = FALSE))
Black list
# Column a holds nine "aa" values and one "b"; column b is "aa" in all ten rows.
data <- data.frame(a = c(rep("aa", 9), "b"), b = rep("aa", 10))

# Black list fitted over both columns with freq = 9, applied to the same data.
bl <- black_list(data, cols = c("a", "b"), freq = 9)
predict(bl, data, keep_rows = TRUE)

# Same settings with white_list = TRUE. Predict with wl (not bl) so the
# white-list object is actually exercised instead of repeating the output above.
wl <- black_list(data, cols = c("a", "b"), freq = 9, white_list = TRUE)
predict(wl, data, keep_rows = TRUE)
Sparse Encoder

Category Sparse Encoder

# 25-row frame: two letter columns that each cycle through NA plus four
# letters, a numeric column ending in NA, and a complete integer column.
data <- data.frame(
  x = rep(c(NA, letters[1:4]), 5),
  y = rep(c(NA, letters[5:8]), 5),
  z = c(1:24, NA),
  zz = 1:25
)

# Fit the encoder on the frame and apply it to the same frame.
lc <- sparseEncoder(data)
predict(lc, data)
Calendar Period Functions

Simple functions for manipulating calendar periods

# Three calendar periods written as yyyymm numbers.
x <- c(201209, 201212, 201301)

# Add 2 to each period.
cp_add(x, 2)

# Convert with cp_to_fp(), then convert that result back with rev = TRUE.
cp_to_fp(x)
cp_to_fp(cp_to_fp(x), rev = TRUE)

# Sequence starting at 201201 with second argument 6.
cp_seq(201201, 6)

# NOTE(review): x is reassigned here but not read again in this chunk.
x <- rep(cp_seq(200011, 250), 2)

# Time cp_to_calendar() on the same sequence, with default arguments and with
# par = FALSE.
system.time(cp_to_calendar(cp_seq(201201, 100)))
system.time(cp_to_calendar(cp_seq(201201, 100), par = FALSE))
Missing CP
# Periods 201201..201206 followed by 201307..201312, leaving a gap between
# the two runs.
data <- c(
  201201 + 0:5,
  201301 + 6:11
)
missing_cp(data)


mdavis29/keys_and_strings documentation built on May 24, 2019, 7:23 p.m.