# #################################################
# Function for sampling: cuts tokenized texts into samples
#
# Usage: make.samples(tokenized.text, sample.size = 10000,
#                     sampling = "no.sampling", sample.overlap = 0,
#                     number.of.samples = 1,
#                     sampling.with.replacement = FALSE)
#
# Required argument:
#   tokenized.text - a vector of tokens (a single text) or a list of such
#       vectors (a corpus); unnamed texts get the generic names
#       "paste_1", "paste_2", ...
#
# Optional arguments:
#   sample.size - number of tokens per sample (ignored by "no.sampling")
#   sampling - one of:
#       "no.sampling": each text is returned as a single sample that
#           inherits the name of the text;
#       "normal.sampling": each text is cut into consecutive samples of
#           sample.size tokens; tokens left at the end are dropped;
#       "random.sampling": number.of.samples samples of sample.size tokens
#           are drawn at random from each text
#   sample.overlap - number of tokens shared by two consecutive samples
#       ("normal.sampling" only); reset to 0 if not smaller than sample.size
#   number.of.samples - number of samples per text ("random.sampling" only)
#   sampling.with.replacement - passed to sample() as 'replace'
#       ("random.sampling" only)
#
# Value: a list of class "stylo.corpus" with one element per sample;
#   samples produced by "normal.sampling" and "random.sampling" are named
#   <text name>_<sample index>.
#
# The function stops with "Corpus error..." if any text has fewer than
# 10 tokens, or fewer than sample.size tokens when samples are excerpted.
# #################################################
make.samples <-
function(tokenized.text,
         sample.size = 10000,
         sampling = "no.sampling",
         sample.overlap = 0,
         number.of.samples = 1,
         sampling.with.replacement = FALSE) {
  # rejecting unknown sampling modes with an explicit message (otherwise
  # no samples would be produced and the function would fail obscurely)
  sampling.modes = c("no.sampling", "normal.sampling", "random.sampling")
  if(!is.character(sampling) || length(sampling) != 1 ||
     !(sampling %in% sampling.modes)) {
    stop("Unknown sampling mode; use one of: ",
         paste(sampling.modes, collapse = ", "))
  }
  # checking the format of input data (vector? list?); converting to a list
  if(is.list(tokenized.text) == FALSE) {
    tokenized.text = list(tokenized.text)
  }
  # checking if there are any names attached to the texts; if not, some
  # generic names will be assigned (an empty corpus is left untouched,
  # since paste() would return one name for zero texts)
  if(length(tokenized.text) > 0 &&
     is.character(names(tokenized.text)) == FALSE) {
    names(tokenized.text) = paste("paste", seq_along(tokenized.text), sep = "_")
  }
  # an overlap equal to (or bigger than) the sample size would make the
  # sampling window stand still (division by zero below), hence "<="
  if(sample.size <= sample.overlap) {
    message("------------------------------------------------")
    message("Sample overlap bigger that sample size?! NO WAY!")
    message("performing no sample overlap...")
    message("------------------------------------------------")
    sample.overlap = 0
  }
  # one slot per input text; each slot will hold a named list of samples
  samples.per.text = vector("list", length(tokenized.text))
  # iterating over subsequent texts of the input corpus
  for(i in seq_along(tokenized.text)) {
    current.text = tokenized.text[[i]]
    text.name = names(tokenized.text)[i]
    text.length = length(current.text)
    # sanity check for text length: abort if the current text is extremely
    # short or at least shorter than the specified sample size
    if(text.length < 10 ||
       (sampling != "no.sampling" && text.length < sample.size)) {
      warning("\n\n", head(current.text, 100), "...\t", "This text is too short!",
              "\n")
      stop("Corpus error...")
    }
    if(sampling == "normal.sampling") {
      # distance between the starting points of two consecutive samples
      step = sample.size - sample.overlap
      # a local counter: the 'number.of.samples' argument is not touched
      samples.in.text = floor((text.length - sample.overlap) / step)
      # the last sample ends at samples.in.text * step + sample.overlap
      words.dropped = text.length - (samples.in.text * step + sample.overlap)
      message(text.name)
      message(paste0("\t", "- text length (in words): ", text.length))
      message(paste0("\t", "- nr. of samples: ", samples.in.text))
      message(paste0("\t", "- nr. of words dropped at the end of the text: ",
                     words.dropped))
      start.indices = 1 + (seq_len(samples.in.text) - 1) * step
      samples.from.text = lapply(start.indices, function(start) {
        current.text[start:(start + sample.size - 1)]
      })
    } else if(sampling == "random.sampling") {
      # the text is randomized and a sample of a given length is excerpted
      message(text.name)
      message(paste0("\t", "- text length (in words): ", text.length))
      message(paste0("\t", "- nr. of random samples: ", number.of.samples))
      message(paste0("\t", "- sample length: ", sample.size))
      samples.from.text = lapply(seq_len(number.of.samples), function(k) {
        head(sample(current.text, replace = sampling.with.replacement),
             sample.size)
      })
    } else {
      # "no.sampling": the entire current text is used as a single sample
      # (regardless of its length)
      samples.from.text = list(current.text)
    }
    # naming the samples; skipped when no samples were produced, because
    # paste0() would still return one name for zero samples
    if(length(samples.from.text) > 0) {
      if(sampling == "no.sampling") {
        # inheriting the name of the text
        names(samples.from.text) = text.name
      } else {
        names(samples.from.text) =
          paste0(text.name, "_", seq_along(samples.from.text))
      }
    }
    samples.per.text[[i]] = samples.from.text
  }
  # flattening the per-text lists into one list of samples
  if(length(samples.per.text) > 0) {
    corpus.cut.into.samples = do.call(c, samples.per.text)
  } else {
    corpus.cut.into.samples = list()
  }
  class(corpus.cut.into.samples) = "stylo.corpus"
  return(corpus.cut.into.samples)
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.