context("Regression test for feature extraction")
FeatureExtractionOld <- function(labeled) {
labeled <- copy(labeled)
myslq_sw <- system.file("extdata", "mysql_sw_wo_code_words.txt",
package="NLoN", mustWork=TRUE)
myslq_sw <- read.csv(myslq_sw, stringsAsFactors=FALSE, header=FALSE)
#Not same encoding as in cubernets. not used.
labeled$length <- nchar(labeled$text)
labeled$ratio.caps <- plyr::ldply(stringr::str_match_all(labeled$text,"[A-Z]"),length)/labeled$length
labeled$ratio.specials <- plyr::ldply(stringr::str_match_all(labeled$text,"[^a-zA-Z\\d\\s]"),length)/labeled$length
labeled$ratio.numbers <- plyr::ldply(stringr::str_match_all(labeled$text,"[\\d]"),length)/labeled$length
labeled$length.words <- labeled$length /(plyr::ldply(stringr::str_match_all(labeled$text,"[\\s+]"),length) +1)
labeled$words <- plyr::ldply(stringr::str_match_all(labeled$text,"[\\s+]"),length) +1
#we count stopword twice with two different tokenizers.
labeled[,stopwords:= sapply(text, function(x) sum(tolower(tokenizers::tokenize_regex(x, pattern = "\\s+", simplify = TRUE)) %in% myslq_sw$V1))/ labeled$words]
labeled[,stopwords2:= sapply(text, function(x) sum(tokenizers::tokenize_words(x, simplify = TRUE) %in% myslq_sw$V1))/ labeled$words]
#If the line ends to emoticon this is not true :-) so we substract. R does not support lookahead in regex which would also solve this
labeled[,last.char.code:= sapply(text, function(x) max(c(sum(grep("\\)$|\\{$|;$", x)) - sum (grep (":-\\)$|;-\\)|:\\)$|;\\)$|:-\\($|:\\($", x)), 0)))]
labeled[,last.char.nl:= sapply(text, function(x) sum(grep("\\.$|\\!$|\\?$|:$|,$", x)))]
labeled$first.3.chars.letters <- plyr::ldply (stringr::str_match_all(substr(gsub("\\s", "", labeled$text), 1,3), "[a-zA-Z]"),length)
labeled$emoticons <- plyr::ldply(stringr::str_match_all(labeled$text,":-\\)|;-\\)|:\\)|;\\)|:-\\(|:\\("),length)
#Sanity chek should empty
#labeled[last.char.nl==1 & last.char.code==1]$text
#New features from Kubernets
#Starts with @ sign
labeled[,first.char.at:= sapply(text, function(x) sum(grep("^@", x)))]
labeled
}
test_that("FeatureExtraction gives same results as FeatureExtractionOld", {
skip_on_cran()
data(nlon.data)
res <- FeatureExtraction(nlon.data$text)
res.old <- FeatureExtractionOld(nlon.data)
expect_equal(res$ratio.caps, res.old$ratio.caps)
expect_equal(res$ratio.specials, res.old$ratio.specials)
expect_equal(res$ratio.numbers, res.old$ratio.numbers)
expect_equal(res$length.words, res.old$length.words)
expect_equal(res$stopwords, res.old$stopwords)
expect_equal(res$stopwords2, res$stopwords2)
expect_equal(res$last.char.code, res.old$last.char.code)
expect_equal(res$last.char.nl, res.old$last.char.nl)
expect_equal(res$first.3.chars.letters, res.old$first.3.chars.letters)
expect_equal(res$emoticons, res.old$emoticons)
expect_equal(res$first.char.at, res.old$first.char.at)
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.