# #################################################
# FUNCTION: make.table.of.frequencies
# preparing a huge table with all the frequencies.
# Two arguments are required: (1) text data: either
# a tokenized corpus (list), or a single text (vector);
# (2) a vector containig the words (a reference word list)
# to rearrange particular frequency list in accordance
# to this vector. Optional argument: (3) absent.sensitive
# which is used to prevent building tables of non-existing
# words. When switched on, the variables that contain but O
# in all samples, will be excluded. However, in some cases
# is is important to keep all the variables regardless of
# their values. It is the case of comparing two corpora:
# even if a given word did not occur in the whole corpus A,
# it might have occured in the corpus B. In short: if
# you perform any analysis involving I and II set, switch
# this option to FALSE.
# #################################################
make.table.of.frequencies.optimized = function(corpus,
features,
absent.sensitive = TRUE,
relative = TRUE) {
# variable initialization
frequency.table = c()
# checking the format of input data (vector? list?); converting to a list
if(is.list(corpus) == FALSE) {
corpus = list(corpus)
}
# checking if there are any names attached to the texts
if(is.character(names(corpus)) == FALSE) {
# if not, some generic names will be assigned
names(corpus) = paste("sample",1:length(corpus),sep="_")
}
message(paste("processing ", length(corpus), " text samples"))
for(i in 1:length(corpus)) {
message(sprintf("%7s", i), appendLF = FALSE)
# loading the next sample (= next item) from the corpus
current.sample = corpus[[i]]
# preparing the frequency list of the current sample
if(relative == TRUE) {
# either relative frequencies...
raw.freqs = table(current.sample) * 100 / length(current.sample)
} else {
# ...or raw frequencies
raw.freqs = table(current.sample)
}
# adjusting the frequencies to the list of features passed as an argument
current.vector.of.freqs = raw.freqs[features]
# create a new vector (name assgned automatically) that will contain
# the frequencies of the current sample
assign(paste("vector_of_freqs_", i, sep=""), current.vector.of.freqs)
# a short message on screen
message("\b\b\b\b\b\b\b", appendLF = FALSE)
}
message(" ")
message("combining frequencies into a table...")
# combining frequencies into a table
frequency.table = sapply(mget(grep("vector_of_freqs_", ls(), value=T)), rbind)
frequency.table = t(frequency.table)
# adjusting names of the samples
rownames(frequency.table) = names(corpus)
colnames(frequency.table) = features
# all NA values will be set to 0
frequency.table[which(is.na(frequency.table))] = 0
#
# optionally, get rid of words that have not occurred in the corpus:
# if switched off (which makes sense when one compares two or more tables),
# then the non-existing words will be represented in the table,
# and they will be filled with 0
if(absent.sensitive == TRUE) {
# if any of the requested words was not found in the corpus, ...
if( length(features[colSums(frequency.table) == 0]) > 0 ) {
# the word in question will be listed
cat("The following words/features could not be found in the corpus:\n")
cat(features[colSums(frequency.table) == 0], "\n")
}
# filtering out the variables (words) that have not occurred
frequency.table = frequency.table[,(colSums(frequency.table) > 0)]
}
# complain if the frequency table is empty
if(length(frequency.table) == 0) {
stop("something must be wrong with the words/features you want to analyze")
}
# the result of the function
return(frequency.table)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.