# Functions to calculate relative frequencies using Laplace smoothing
#' Find the tokens of a word frequency structure that are absent from a set of tokens
#'
#'
#' Looks at the tokens stored in the first element of \code{wordFreq1} and
#' returns those that do not appear in \code{names2}. The order of the tokens
#' (and any duplicates) is kept exactly as in \code{wordFreq1[[1]]}.
#' @param wordFreq1, a word frequency data structure whose first element holds the tokens to check
#' @param names2, a vector of tokens to compare against, e.g. the tokens of another word frequency data structure (passed directly for efficiency)
#' @return the tokens of \code{wordFreq1[[1]]} that are not in \code{names2}, or \code{NULL} when there are none
#' @keywords find unique, unique
#' @export
findUnique <- function(wordFreq1, names2) {
  tokens <- wordFreq1[[1]]
  # Vectorised membership test: avoids the 1:length() pitfall on empty input
  # and the repeated copying caused by growing a vector inside a loop.
  absent <- tokens[!(tokens %in% names2)]
  # Callers historically received NULL when nothing was found; keep that.
  if (length(absent) == 0) {
    return(NULL)
  }
  absent
}
#' Function to join two lists of lists into one data frame
#'
#'
#' Function which takes two lists of lists, converts each of them to a data
#' frame with the columns \code{token} and \code{count}, and row-binds them
#' into one data frame (rows of \code{wordFreqOne} first).
#' @param wordFreqOne, the first list of lists to merge
#' @param wordFreqTwo, the second list of lists to merge
#' @return a data frame with the columns \code{token} and \code{count}
#' @keywords merge, list matrix, data frame
#' @export
#' @examples mergeWF(wordFreqA, uniqueFreq)
mergeWF <- function(wordFreqOne, wordFreqTwo) {
  # Both inputs must end up with identical column names so rbind() can
  # stack them.
  columnNames <- c("token", "count")
  tempOne <- data.frame(wordFreqOne, stringsAsFactors = FALSE)
  tempTwo <- data.frame(wordFreqTwo, stringsAsFactors = FALSE)
  colnames(tempOne) <- columnNames
  colnames(tempTwo) <- columnNames
  # No preallocation needed: rbind() builds the combined data frame itself.
  rbind(tempOne, tempTwo)
}
#' Function to calculate the relative frequencies, given two word frequency data frames
#'
#'
#' Function which, given two word frequency data frames (token in the first
#' column, count in the second), returns a data frame with each token of
#' \code{newWordFreqA} and the ratio of its frequency in A to its frequency
#' in B. A frequency is the token's count divided by the sum of all counts in
#' that data frame. The count of a token in B is looked up with
#' \code{getCount()}.
#' @param newWordFreqA, the target word frequency data frame
#' @param newWordFreqB, the reference word frequency data frame
#' @return a data frame with one row per row of \code{newWordFreqA} and the columns \code{X1} (the token) and \code{X2} (frequency in A divided by frequency in B)
#' @keywords merge, list matrix, data frame
#' @export
#' @examples relativeFrequencies(newWordFrequenciesA, newWordFrequenciesB)
relativeFrequencies <- function(newWordFreqA, newWordFreqB) {
  # The denominators are the same for every token, so compute them once.
  countA <- sum(newWordFreqA[, 2])
  countB <- sum(newWordFreqB[, 2])

  tokens <- newWordFreqA[, 1]
  tokenTotal <- nrow(newWordFreqA)

  # Size the result by the number of ROWS; length() of a data frame is its
  # number of columns, which previously left a stray NA row for 1-row input.
  ratios <- numeric(tokenTotal)
  for (i in seq_len(tokenTotal)) {
    # Frequency of the token in B
    freqB <- getCount(newWordFreqB, tokens[i]) / countB
    # Frequency of the token in A
    freqA <- newWordFreqA[i, 2] / countA
    ratios[i] <- freqA / freqB
  }

  # Column names X1/X2 match what callers received before.
  data.frame(X1 = tokens, X2 = ratios, stringsAsFactors = FALSE)
}
#' Function to find the relative frequencies of the combined tokens from document A and document B
#'
#'
#' Function which adds the smoothing factor to every count of both documents,
#' then finds the tokens that exist in one document but not in the other and
#' adds them to the document that lacks them with a count equal to the
#' smoothing factor. The relative frequencies are then calculated from the
#' two extended frequency tables with \code{relativeFrequencies()}.
#' @param wordFrequenciesA, a wordFrequency list, generated by this package
#' @param wordFrequenciesB, another word frequency list for reference
#' @param smoothingFactor, the number added to every count and used as the count of a token that is missing from a document
#' @return the data frame returned by \code{relativeFrequencies()} for the smoothed and extended frequencies of A and B
#' @keywords simplemaths, relative frequency
#' @export
#' @examples
#' \dontrun{
#' simpleMaths(wordFreqA, wordFreqB, 1000)
#' }
simpleMaths <- function(wordFrequenciesA, wordFrequenciesB, smoothingFactor) {
  # Smooths one document and appends the tokens that only occur in the other
  # document; returns a data frame with the columns "token" and "count".
  smoothAndExtend <- function(wordFreq, otherWordFreq) {
    # Tokens of the other document that this document does not contain
    missingTokens <- findUnique(otherWordFreq, as.character(wordFreq[[1]]))
    # Smooth the existing counts
    wordFreq[[2]] <- wordFreq[[2]] + smoothingFactor
    if (length(missingTokens) == 0) {
      # Nothing to add: just convert to the common data frame layout
      smoothed <- data.frame(wordFreq, stringsAsFactors = FALSE)
      colnames(smoothed) <- c("token", "count")
      return(smoothed)
    }
    # Missing tokens enter with a count equal to the smoothing factor
    missingFrequencies <- list(
      missingTokens,
      rep(smoothingFactor, length(missingTokens))
    )
    mergeWF(wordFreq, missingFrequencies)
  }

  newFrequenciesA <- smoothAndExtend(wordFrequenciesA, wordFrequenciesB)
  newFrequenciesB <- smoothAndExtend(wordFrequenciesB, wordFrequenciesA)
  relativeFrequencies(newFrequenciesA, newFrequenciesB)
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.