knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
library(gpmodels)

Let's start by loading the mtsamples dataset, which is available in the clinspacy package (on GitHub).

library(dplyr)
library(tidytext)

mtsamples = clinspacy::dataset_mtsamples()

nrow(mtsamples)

mtsamples$transcription[[1]]

Let's split each note's transcription into individual characters (keeping spaces and punctuation), recording each character's position within its note.

dataset = data.frame(id = 1:nrow(mtsamples), # 1:500, 
                     variable = 'variable',
                     text = mtsamples$transcription, # [1:500], 
                     stringsAsFactors = FALSE)

dataset = 
  dataset %>% 
  unnest_character_shingles(char, text, n = 1, strip_non_alphanum = FALSE) %>% 
  group_by(id) %>% 
  mutate(sequence_num = row_number()) %>% 
  ungroup()

cat(dataset$char[1:100])
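
To see the resulting structure (one row per character, with its position stored in sequence_num), we can look at the first few rows:

head(dataset)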

Let's generate a time_frame, using just the first note to keep it small

char_frame = time_frame(fixed_data = dataset %>% distinct(id), 
                       temporal_data = dataset %>% filter(id == 1),
                       fixed_id = 'id',
                       temporal_id = 'id',
                       temporal_time = 'sequence_num',
                       temporal_variable = 'variable',
                       temporal_value = 'char',
                       step = 2,
                       max_length = 20,
                       output_folder = 'Z:/kdpsingh/gpm_char_lang', 
                       save_time_frame = FALSE)

Now let's generate a time_frame for all of the notes, writing the output to disk in chunks

char_frame = time_frame(fixed_data = dataset %>% distinct(id), 
                       temporal_data = dataset,
                       fixed_id = 'id',
                       temporal_id = 'id',
                       temporal_time = 'sequence_num',
                       temporal_variable = 'variable',
                       temporal_value = 'char',
                       step = 1,
                       output_folder = 'Z:/kdpsingh/gpm_char_lang/all_data_in_chunks',
                       create_folder = TRUE,
                       chunk_size = 16)
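
As a quick check that the chunked output was written, we can list the files in the output folder (assuming the path above is accessible):

list.files('Z:/kdpsingh/gpm_char_lang/all_data_in_chunks')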

Let's generate a dataset that predicts the next character from the preceding 50 characters. First, set up a parallel backend with the future package:

future::plan('multisession', workers = 6)

# Alternative predictor definitions, kept here for reference:
# model_predictors = char_frame %>%
#   gpm_add_predictors(variables = 'variable',
#                      lookback = 20,
#                      window = 1,
#                      stats = c(first = . %>% .[1]),
#                      output_file = FALSE)
# 
# 
# model_predictors = char_frame %>%
#   gpm_add_growing_predictors(variables = 'variable',
#                      stats = c(ngram = . %>% paste(collapse = '')),
#                      output_file = FALSE)

# Predictors: the preceding 50 characters, one per column
model_predictors = char_frame %>% 
  gpm_add_predictors(variables = 'variable', 
                     lookback = 50,
                     window = 1, 
                     stats = c(first = . %>% .[1]),
                     last_chunk_completed = 64)


# Outcome: the next character (lookahead of 1)
char_frame %>% 
  gpm_add_outcomes(variables = 'variable', 
                   lookahead = 1,
                   stats = c(first = . %>% .[1]))

# ngram variant of the predictors, kept for reference:
# model_predictors_ngram = char_frame %>%
#   gpm_add_predictors(variables = 'variable',
#                      lookback = 3,
#                      stats = c(ngram = . %>% paste(collapse = '')),
#                      output_file = FALSE)


# Outcome dataset returned in memory (output_file = FALSE) for combining below
model_outcome = char_frame %>% 
  gpm_add_outcomes(variables = 'variable', 
                   lookahead = 1,
                   stats = c(first = . %>% .[1]),
                   output_file = FALSE)

# ngram variant of the outcomes, kept for reference:
# model_outcome_ngram = char_frame %>% 
#   gpm_add_outcomes(variables = 'variable', 
#                    lookahead = 1,
#                    stats = c(ngram = . %>% paste(collapse = '')),
#                    output_file = FALSE)

Let's combine the time_frame, outcome, and predictor datasets into a single modeling dataset

model_data = gpm_combine(char_frame, 
                         model_outcome,
                         model_predictors)

knitr::kable(model_data, align = 'c')

# head(model_data)
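
If you want a quick sense of how this dataset could be used, here is a minimal sketch of a frequency-table baseline that guesses the next character from a single predictor column. The column names ('id', 'time', and an outcome column matched by 'outcome') are assumptions about gpm_combine()'s output, not the exact names it produces.

# A minimal baseline sketch, not part of the gpmodels API: predict the next
# character from a single predictor column using a frequency table.
# The column names below are assumptions about gpm_combine()'s output.
outcome_col   = grep('outcome', names(model_data), value = TRUE)[1]
predictor_col = setdiff(names(model_data), c('id', 'time', outcome_col))[1]

transition_counts = table(model_data[[predictor_col]], model_data[[outcome_col]])

# For each observed predictor character, take the most frequent next character
predicted_next = colnames(transition_counts)[apply(transition_counts, 1, which.max)]
names(predicted_next) = rownames(transition_counts)
head(predicted_next)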

The ngram variant can be combined in the same way once the ngram predictor and outcome datasets above are generated; it is kept commented out here for reference.

# model_data_ngram = gpm_combine(char_frame,
#                                model_outcome_ngram,
#                                model_predictors_ngram)
#
# knitr::kable(model_data_ngram, align = 'c')


# head(model_data_ngram)

