Nothing
collect_ngrams <- function(x, n, sep="\t") {
if (n == 1) return(x)
L <- length(x)
if (L < n) return(NULL)
ngrams <- x[1:(L - (n - 1))]
for (i in 1:(n-1)) {
ngrams <- paste(ngrams, x[i + 1:(L - (n - 1))], sep = sep)
}
ngrams
}
log_plus_one <- function(x) {
ifelse(x == 0, 0, 1 + log(x))
}
#' ngram feature extraction
#'
#' \code{seq2feature_ngram} extracts ngram features from response processes.
#'
#' Three types of ngram features can be extracted. \code{type = "binary"} gives
#' binary ngram features indicating whether an ngram appears in a response process.
#' \code{type = "freq"} gives ngram frequency features. Each feature is the count of
#' the corresponding ngram in a response process. \code{type = "weighted"} gives the
#' weighted ngram features proposed in He and von Davier (2015).
#'
#' @family feature extraction methods
#'
#' @param seqs an object of class \code{"\link{proc}"}
#' @param level an integer specifying the max length of ngrams
#' @param type a character string (\code{"binary"}, \code{"freq"}, or \code{"weighted"})
#' specifying the type of ngram features.
#' @param sep action seperator within ngram.
#' @return a matrix of ngram features
#' @references He Q., von Davier M. (2015). Identifying Feature Sequences from Process
#' Data in Problem-Solving Items with N-Grams. In: van der Ark L., Bolt D., Wang WC.,
#' Douglas J., Chow SM. (eds) \emph{Quantitative Psychology Research}. Springer
#' Proceedings in Mathematics & Statistics, vol 140. Springer, Cham.
#' @examples
#' seqs <- seq_gen(100)
#' theta <- seq2feature_ngram(seqs)
#' @export
seq2feature_ngram <- function(seqs, level = 2, type = "binary", sep="\t") {
if (!(type %in% c("binary", "freq", "weighted"))) stop("Undefined ngram feature type!\n")
if (class(seqs) != "proc") stop("seqs should be a proc object!\n")
level <- round(level)
theta <- numeric(0)
action_seqs <- seqs$action_seqs
n_seq <- length(action_seqs)
for (index_level in 1:level) {
lgram_seqs <- sapply(action_seqs, collect_ngrams, n=index_level, sep=sep)
lgram_vec <- unlist(lgram_seqs)
lgrams <- unique(lgram_vec)
n_lgram <- length(lgrams)
lgram_tf <- matrix(0, n_seq, n_lgram)
colnames(lgram_tf) <- lgrams
for (index_seq in 1:n_seq) {
lgrams_seq <- lgram_seqs[[index_seq]]
for (lgram in lgrams_seq) {
lgram_tf[index_seq, lgram] <- lgram_tf[index_seq, lgram] + 1
}
}
if (type == "freq") theta <- cbind(theta, lgram_tf)
else if (type == "binary") {
lgram_tf_binary <- (lgram_tf > 0) + 0
theta <- cbind(theta, lgram_tf_binary)
}
else {
lgram_sf <- numeric(n_lgram)
names(lgram_sf) <- lgrams
for (lgram in lgrams) {
lgram_sf[lgram] <- sum(sapply(lgram_seqs, function(x) lgram %in% x))
}
lgram_weight <- t(log(n_seq / lgram_sf) * t(log_plus_one(lgram_tf)))
theta <- cbind(theta, lgram_weight)
}
}
return(theta)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.