#' Calculating the Word Error Rate
#'
#' This function calculates the word error rate between a hypothesis and a reference corpus.
#' @param r The reference quanteda corpus
#' @param h The hypothesis quanteda corpus
#' @keywords word error rate
#' @export
#' @return Returns a dataframe containing the Word error rate, the number of
#' substitutions, deletions and insertions and the number of words in the
#' reference and hypothesis corpora for each text in the corpus.
#' @examples
#' hypothesis_data=data.frame(text="The meadoww very nice and the two sun shines bright",
#' name="doc1",stringsAsFactors = F)
#' hypothesis_corpus=quanteda::corpus(hypothesis_data,docid_field = "name", text_field = "text")
#' reference_data=data.frame(text="The meadow is very nice and the sun shines bright",
#' name="doc1",stringsAsFactors = F)
#' reference_corpus=quanteda::corpus(reference_data,docid_field = "name", text_field = "text")
#' wer(r=reference_corpus,h=hypothesis_corpus)
#' # One substitution ("meadoww" instead of "meadow"), one deletion ("is") and one insertion ("two")
#' # Overall, this means there are three mistakes for ten reference words, giving a Word error rate of 0.3
wer <- function (r, h)
{
if (length(as.character(r)) != length(as.character(h)))
stop("The refernce and hypothesis corpus should have the same length")
if (length(as.character(r)) == length(as.character(h))) {
data.store = data.frame(wer = rep(NA, length(as.character(r))),
sub = NA, ins = NA, del = NA, words.ref = NA, words.hyp = NA)
for (k in 1:length(as.character(r))) {
print(paste("Document", k, "of", length(as.character(r))))
sub.count = 0
ins.count = 0
del.count = 0
r$id = 1:length(as.character(r))
ref_text <- r %>%
corpus_subset(id == k) %>%
tokens() %>%
quanteda::tokens_lookup(quanteda.dictionaries::data_dictionary_us2uk,
exclusive = FALSE,
capkeys = FALSE) %>%
as.character() %>%
tolower()
h$id = 1:length(as.character(h))
hyp_text <- h %>%
corpus_subset(id == k) %>%
tokens() %>%
quanteda::tokens_lookup(quanteda.dictionaries::data_dictionary_us2uk,
exclusive = FALSE,
capkeys = FALSE) %>%
as.character() %>%
tolower()
if (as.character(h)[k] == "") {
data.store$wer[k] = 1
data.store$sub[k] = 0
data.store$del[k] = length(ref_text)
data.store$ins[k] = 0
data.store$words.ref[k] = length(ref_text)
data.store$words.hyp[k] = 0
}
if (as.character(h)[k] != "") {
d1 <- matrix(ncol = length(hyp_text) + 1, nrow = length(ref_text) +
1, 0)
d1[1, ] <- 0:length(hyp_text)
d1[, 1] <- 0:length(ref_text)
dtext = d1
for (i in 2:nrow(d1)) {
for (j in 2:ncol(d1)) {
if (ref_text[i - 1] == hyp_text[j - 1]) {
d1[i, j] <- d1[i - 1, j - 1]
dtext[i, j] = "CORRECT"
}
else {
sub <- d1[i - 1, j - 1] + 1
ins <- d1[i, j - 1] + 1
del <- d1[i - 1, j] + 1
d1[i, j] <- min(sub, ins, del)
if (which.min(c(sub, ins, del)) == 1) {
dtext[i, j] = "SUB"
}
if (which.min(c(sub, ins, del)) == 2) {
dtext[i, j] = "INS"
}
if (which.min(c(sub, ins, del)) == 3) {
dtext[i, j] = "DEL"
}
}
}
}
sequence = rep(NA, length(ref_text))
start.row = nrow(dtext)
start.col = ncol(dtext)
dtext[2:nrow(dtext), 1] = "DEL"
dtext[1, 2:ncol(dtext)] = "INS"
dtext[1, 1] = "CORRECT"
for (l in (length(sequence)):1) {
sequence[l] = dtext[start.row, start.col]
if (sequence[l] %in% c("CORRECT", "SUB")) {
start.row = start.row - 1
start.col = start.col - 1
}
if (sequence[l] == "DEL") {
start.row = start.row - 1
}
if (sequence[l] == "INS") {
start.col = start.col - 1
}
}
data.store$wer[k] = d1[length(ref_text) + 1,
length(hyp_text) + 1]/length(ref_text)
data.store$sub[k] = sum(sequence == "SUB", na.rm = T)
data.store$del[k] = sum(sequence == "DEL", na.rm = T)
data.store$ins[k] = sum(sequence == "INS", na.rm = T)
data.store$words.ref[k] = length(ref_text)
data.store$words.hyp[k] = length(hyp_text)
}
}
return(data.store)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.