R/simpleMaths.R

# Functions to calculate relative token frequencies using Laplace smoothing
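#
# In outline (as implemented below): every count in a document is incremented
# by the smoothing factor, tokens present in only one document are added to
# the other with a count equal to the smoothing factor, and the relative
# frequency of a token t is then
#
#   relFreq(t) = (countA(t) / sum(countA)) / (countB(t) / sum(countB))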

#' Function which finds tokens in the first structure that do not exist in the second and returns them
#' 
#' 
#' Function which returns the tokens of one word frequency data structure that do not
#' appear among the token names of another word frequency data structure.  The caller
#' can then add these tokens with a frequency equal to the smoothing factor.
#' @param wordFreq1 a word frequency data structure whose first element is the vector of tokens to check
#' @param names2 a character vector of token names from another word frequency data structure (passed directly for efficiency)
#' @keywords find unique, unique
#' @export
findUnique <- function(wordFreq1, names2) {
  # Tokens are stored in the first element of the word frequency structure;
  # keep only those that do not appear among the other document's token names
  tokens <- wordFreq1[[1]]
  unique <- tokens[!(tokens %in% names2)]
  return(unique)
}
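
# A minimal usage sketch for findUnique(), assuming word frequencies are stored
# as a two-element list of tokens and counts; the toy data below is illustrative
# only and not part of the package:
#
# wordFreqB <- list(c("the", "cat", "sat"), c(10, 2, 1))
# namesA    <- c("the", "dog")
# findUnique(wordFreqB, namesA)   # returns c("cat", "sat")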

#' Function to join two lists of lists into one data frame
#' 
#' 
#' Function which takes two lists of lists, converts them to data frames
#' and binds them together into one data frame
#' @param wordFreqOne the first list of lists to merge
#' @param wordFreqTwo the second list of lists to merge
#' @keywords merge, list matrix, data frame
#' @export
#' @examples mergeWF(wordFreqA, uniqueFreq)
mergeWF <- function(wordFreqOne, wordFreqTwo) {
  # Convert each list of tokens and counts into a data frame
  tempOne <- data.frame(wordFreqOne, stringsAsFactors = FALSE)
  tempTwo <- data.frame(wordFreqTwo, stringsAsFactors = FALSE)
  colnames(tempOne) <- c("token", "count")
  colnames(tempTwo) <- c("token", "count")
  # Stack the two data frames into a single data frame
  new <- rbind(tempOne, tempTwo)
  return(new)
}
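
# A minimal usage sketch for mergeWF(); the inputs below are illustrative toy
# data, not output from the package:
#
# wordFreqA  <- list(c("the", "dog"), c(11, 3))
# uniqueFreq <- list(c("cat", "sat"), c(1, 1))
# mergeWF(wordFreqA, uniqueFreq)
# #   token count
# # 1   the    11
# # 2   dog     3
# # 3   cat     1
# # 4   sat     1
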
#' Function to calculate the relative frequencies, given two word frequency data frames 
#' 
#' 
#' Function which, given two word frequency data frames generated by
#' frequencies() and the mergeWF() function, returns a data frame
#' with the tokens and their relative frequencies
#' @param newWordFreqA the target word frequency data frame
#' @param newWordFreqB the reference word frequency data frame
#' @keywords merge, list matrix, data frame
#' @export
#' @examples relativeFrequencies(newWordFrequenciesA, newWordFrequenciesB)
relativeFrequencies <- function(newWordFreqA, newWordFreqB) {
  # Pre-allocate the result: one row per token in A, columns for token and ratio
  relativeFrequencies <- data.frame(matrix(nrow = nrow(newWordFreqA), ncol = 2))
  # Total counts for the denominators (constant, so computed once)
  countA <- sum(newWordFreqA[, 2])
  countB <- sum(newWordFreqB[, 2])
  for (i in 1:nrow(newWordFreqA)) {
    # Get the token
    temp <- newWordFreqA[i, 1]
    # Put the token in the new data frame
    relativeFrequencies[i, 1] <- temp
    # Frequency of token temp in B (getCount() is defined elsewhere in the package)
    keyCount <- getCount(newWordFreqB, temp)
    freqB <- keyCount / countB
    # Frequency of token temp in A
    freqA <- newWordFreqA[i, 2] / countA
    # Store the relative frequency
    relativeFrequencies[i, 2] <- freqA / freqB
  }
  return(relativeFrequencies)
}
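
# A minimal usage sketch for relativeFrequencies().  getCount() is defined
# elsewhere in this package; the stand-in below only illustrates the behavior
# assumed here (look up a token's count in a merged word frequency data frame)
# and is not the package's actual implementation:
#
# getCount <- function(wordFreq, token) {
#   wordFreq[wordFreq$token == token, "count"]
# }
#
# dfA <- data.frame(token = c("the", "dog", "cat"), count = c(11, 3, 1),
#                   stringsAsFactors = FALSE)
# dfB <- data.frame(token = c("the", "dog", "cat"), count = c(9, 1, 5),
#                   stringsAsFactors = FALSE)
# relativeFrequencies(dfA, dfB)   # column 2 holds freqA / freqB for each token
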
#' Function to find the relative frequencies of the combined tokens from document A and document B
#' 
#' 
#' Function which finds tokens that exist in document B but do not exist in document A
#' (and vice versa), adds them with a frequency equal to the smoothing factor to an updated
#' list of tokens and frequencies for each document, and then calculates the relative frequencies.
#' @param wordFrequenciesA a word frequency list, generated by this package
#' @param wordFrequenciesB another word frequency list, used as the reference
#' @param smoothingFactor the Laplace smoothing factor: added to every existing count and used as the count for tokens missing from a document
#' @keywords simplemaths, relative frequency
#' @export
#' @examples simpleMaths(wordFreqA, wordFreqB, 1000)
simpleMaths <- function(wordFrequenciesA, wordFrequenciesB, smoothingFactor) {
  # Make an easily searchable way of finding whether or not a token exists
  # in a document: a vector of row indexes named by the corresponding token
  rowNumbersA <- seq_along(wordFrequenciesA[[2]])
  names(rowNumbersA) <- wordFrequenciesA[[1]]
  rowNumbersB <- seq_along(wordFrequenciesB[[2]])
  names(rowNumbersB) <- wordFrequenciesB[[1]]
  # Find unique tokens which need to be added to A
  namesA <- names(rowNumbersA)
  unique <- findUnique(wordFrequenciesB, namesA)
  # smoothing A
  wordFrequenciesA[[2]] <- wordFrequenciesA[[2]] + smoothingFactor
  # Add the unique tokens with a frequency equal to the smoothing factor
  # to the existing word frequencies for A
  if (!(length(unique) == 0)) {
    uniqueFrequencies <- list(unique, rep(smoothingFactor, length(unique)))
    # Merge the two lists of lists
    newFrequenciesA <- mergeWF(wordFrequenciesA, uniqueFrequencies)
  } else {
    newFrequenciesA <- data.frame(wordFrequenciesA, stringsAsFactors = FALSE)
    colnames(newFrequenciesA) <- c("token", "count")
  }
  
  # find the unique elements from A that need to be added to B
  namesB <- names(rowNumbersB)
  unique <- findUnique(wordFrequenciesA, namesB)
  
  # Smooth
  wordFrequenciesB[[2]] <- wordFrequenciesB[[2]] + smoothingFactor
  
  # Turn the vector of unique tokens into a list of lists with
  # the frequencies set to the smoothing factor, then merge it with B
  if (!(length(unique) == 0)) {
    uniqueFrequencies <- list(unique, rep(smoothingFactor, length(unique)))
    # merge the unique tokens with B
    newFrequenciesB <- mergeWF(wordFrequenciesB, uniqueFrequencies)
  } else {
    newFrequenciesB <- data.frame(wordFrequenciesB, stringsAsFactors = FALSE)
    colnames(newFrequenciesB) <- c("token", "count")
  }
  return(relativeFrequencies(newFrequenciesA, newFrequenciesB))
}
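
# An end-to-end usage sketch, assuming word frequency lists are two-element
# lists of tokens and counts as produced elsewhere in this package, and that
# getCount() is available; the toy data and smoothing factor of 1 below are
# illustrative only:
#
# wordFreqA <- list(c("the", "dog"), c(10, 2))
# wordFreqB <- list(c("the", "cat"), c(8, 4))
# simpleMaths(wordFreqA, wordFreqB, 1)
# # returns a data frame with the tokens of the smoothed document A and their
# # relative frequencies (freqA / freqB) against the smoothed document B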