#' Stemming Malay words
#'
#' Malaytextr function to stem Malay words
#' @usage stem_malay(word,
#' dictionary,
#' col_feature1,
#' col_dict1,
#' col_dict2,
#' Word)
#'
#' @details
#' `stem_malay()` first looks up each Malay word in a dictionary and, when it is found, returns the root word from the dictionary.
#' Words that are not found then have the "extra suffix" removed, as explained by Khan et al. (2017), followed by the "prefix" and lastly, the "suffix".
#'
#'@references
#'
#' Khan, Rehman Ullah, Fitri Suraya Mohamad, Muh Inam UlHaq, Shahren Ahmad Zadi Adruce, Philip Nuli Anding, Sajjad Nawaz Khan, and Abdulrazak Yahya Saleh Al-Hababi. 2017. "Malay Language Stemmer."
#'
#' @param word A data frame, or a character vector
#' @param dictionary A data frame with a column of words to be stemmed and a column of root words
#' @param col_feature1 Column that contains words to be stemmed from `word`
#' @param col_dict1 Column that will be used to match with `col_feature1` from `word`
#' @param col_dict2 Column that contains the root words from `dictionary`
#' @param Word Deprecated. Please use `word` instead
#'
#' @return Returns a data frame with the following properties:
#'
#' - `Col Word`: Renamed input from `word`
#' - `Root Word`: An additional column which contains the word(s) after being stemmed.
#'
#'
#' @examples
#'
#' #Specifying a character vector &
#' #use a dictionary from malaytextr package
#'
#' stem_malay(word = "banyaknya", dictionary = malayrootwords)
#'
#'
#'
#' #A data frame,
#' #Use a dictionary from malaytextr package,
#' #With a dataframe, you will need to specify the column to be stemmed
#'
#'x <- data.frame(text = c("banyaknya","sangat","terkedu", "pengetahuan"))
#'
#'stem_malay(word = x, dictionary = malayrootwords, col_feature1 = "text")
#'
#' @export
"stem_malay"
# Internal helper: wrap `x` in a data frame, name its first column `Col Word`
# and convert that column to lower case.
#
# x: the object to convert (passed straight to data.frame()).
# Returns a data frame whose first column is the lower-cased `Col Word`.
to_data.frame <- function(x) {
  # Local placeholder for the column name that is used under data masking
  `Col Word` <- NULL

  # Position 1 is the first column produced by data.frame()
  renamed <- dplyr::rename(data.frame(x), `Col Word` = 1)

  dplyr::mutate(renamed, `Col Word` = stringr::str_to_lower(`Col Word`))
}
stem_malay <- function(word, dictionary, col_feature1,
                       col_dict1 = "Col Word", col_dict2 = "Root Word",
                       Word) {
  # S3 generic: the method is selected from the class of the words to stem.
  #
  # The methods still accept the deprecated `Word` argument. When it is given
  # in place of `word`, dispatch on it explicitly instead of relying on how
  # UseMethod() picks an object when the first formal is missing (which can
  # otherwise depend on the order of the arguments in the call).
  if (missing(word) && !missing(Word)) {
    UseMethod("stem_malay", Word)
  }
  UseMethod("stem_malay")
}
#' @export
stem_malay.character <- function(word, dictionary, col_feature1,
                                 col_dict1 = "Col Word",
                                 col_dict2 = "Root Word", Word) {
  # Stem a character vector of Malay words.
  #
  # word:         character vector of words to stem.
  # dictionary:   data frame holding the words to match (`col_dict1`) and
  #               their root words (`col_dict2`).
  # col_feature1: not used by this method.
  # col_dict1:    name of the dictionary column matched against the words.
  # col_dict2:    name of the dictionary column holding the root words.
  # Word:         deprecated alias of `word`.
  #
  # Returns the joined data frame (`Col Word` plus the remaining dictionary
  # columns) with a `Root Word` column holding the stemmed word.

  # Local placeholders for the column names used under data masking
  `Root Word` <- NULL
  `Col Word` <- NULL

  if (!missing(Word)) {
    warning("argument Word is deprecated; please use word instead.",
            call. = FALSE)
    word <- Word
  }

  # Affix patterns, applied in this order to words absent from the dictionary
  extra_suffix <- "nya$"
  prefix <- "^(memper|diper|ber|bel|per|ter|mem|penye|peny|menye|meny|menge|penge|meng|peng|men|pen|me|pem|pe|be|ke|se|ter|te|di)"
  suffix <- "(kannya|nya|kan|an|i|kah|lah|pun|ita|man|wan|wati|ku|mu)$"

  # Character vector -> one-column data frame (`Col Word`, lower case)
  word <- to_data.frame(word)

  # Symbol of the root-word column in the dictionary
  col <- dplyr::sym(col_dict2)

  # Lower-case the dictionary so that matching is case-insensitive
  dictionary <- dplyr::mutate_all(dictionary, .funs = stringr::str_to_lower)

  # Attach the dictionary root word to every input word
  df_map <- dplyr::left_join(
    word, dictionary,
    by = c("Col Word" = rlang::as_name(col_dict1))
  )

  # Flag which words were found in the dictionary
  df_map <- dplyr::mutate(
    df_map,
    match = dplyr::if_else(is.na(!!col), "NO", "YES")
  )

  # Found: take the dictionary root word. Not found: start from the word
  # itself. The fallback is read from the joined `Col Word` column so that its
  # length always equals the number of joined rows, even when a word matches
  # several dictionary rows.
  df_map <- dplyr::mutate(
    df_map,
    `Root Word` = dplyr::if_else(match == "YES", !!col, `Col Word`)
  )

  # Not found: remove the extra suffix
  df_map <- dplyr::mutate(
    df_map,
    `Root Word` = dplyr::if_else(
      match == "NO" & stringr::str_detect(`Root Word`, extra_suffix),
      stringr::str_remove(`Root Word`, extra_suffix),
      `Root Word`
    )
  )

  # Then remove the prefix for words with more than 5 characters
  df_map <- dplyr::mutate(
    df_map,
    `Root Word` = dplyr::if_else(
      match == "NO" & stringr::str_detect(`Root Word`, prefix) &
        nchar(`Root Word`) > 5,
      stringr::str_remove(`Root Word`, prefix),
      `Root Word`
    )
  )

  # Then remove the suffix for words with more than 5 characters
  df_map <- dplyr::mutate(
    df_map,
    `Root Word` = dplyr::if_else(
      match == "NO" & stringr::str_detect(`Root Word`, suffix) &
        nchar(`Root Word`) > 5,
      stringr::str_remove(`Root Word`, suffix),
      `Root Word`
    )
  )

  # The helper flag is not part of the result
  df_map <- dplyr::select(df_map, -c(match))

  message("'Root Word' is now returned instead of 'root_word'")
  df_map
}
#' @export
stem_malay.data.frame <- function(word, dictionary, col_feature1,
                                  col_dict1 = "Col Word",
                                  col_dict2 = "Root Word", Word) {
  # Stem the Malay words held in one column of a data frame.
  #
  # word:         data frame containing the words to stem.
  # dictionary:   data frame holding the words to match (`col_dict1`) and
  #               their root words (`col_dict2`).
  # col_feature1: column of `word` to stem; it is renamed to `Col Word`. May be
  #               omitted only when `word` already has a `Col Word` column.
  # col_dict1:    name of the dictionary column matched against the words.
  # col_dict2:    name of the dictionary column holding the root words.
  # Word:         deprecated alias of `word`.
  #
  # Returns the joined data frame with a `Root Word` column holding the
  # stemmed word.

  # Local placeholders for the column names used under data masking
  `Root Word` <- NULL
  `Col Word` <- NULL

  if (!missing(Word)) {
    warning("argument Word is deprecated; please use word instead.",
            call. = FALSE)
    word <- Word
  }

  # Fail early, with a clear message, when the column to stem is unknown
  if (missing(col_feature1) && !"Col Word" %in% names(word)) {
    stop("`col_feature1` must be supplied when `word` has no `Col Word` column.",
         call. = FALSE)
  }

  # Affix patterns, applied in this order to words absent from the dictionary
  extra_suffix <- "nya$"
  prefix <- "^(memper|diper|ber|bel|per|ter|mem|penye|peny|menye|meny|menge|penge|meng|peng|men|pen|me|pem|pe|be|ke|se|ter|te|di)"
  suffix <- "(kannya|nya|kan|an|i|kah|lah|pun|ita|man|wan|wati|ku|mu)$"

  # Symbol of the root-word column in the dictionary
  col <- dplyr::sym(col_dict2)

  # Lower-case both tables so that matching is case-insensitive.
  # NOTE(review): this lower-cases (and so converts to character) every column
  # of `word`, not only the one being stemmed - confirm that this is intended.
  word <- dplyr::mutate_all(word, .funs = stringr::str_to_lower)
  dictionary <- dplyr::mutate_all(dictionary, .funs = stringr::str_to_lower)

  # Give the column to stem the name used as the join key
  if (!missing(col_feature1)) {
    word <- dplyr::rename(word, `Col Word` = {{ col_feature1 }})
  }

  # Attach the dictionary root word to every input word
  df_map <- dplyr::left_join(
    word, dictionary,
    by = c("Col Word" = rlang::as_name(col_dict1))
  )

  # Flag which words were found in the dictionary
  df_map <- dplyr::mutate(
    df_map,
    match = dplyr::if_else(is.na(!!col), "NO", "YES")
  )

  # Found: take the dictionary root word. Not found: start from the word itself
  df_map <- dplyr::mutate(
    df_map,
    `Root Word` = dplyr::if_else(match == "YES", !!col, `Col Word`)
  )

  # Not found: remove the extra suffix
  df_map <- dplyr::mutate(
    df_map,
    `Root Word` = dplyr::if_else(
      match == "NO" & stringr::str_detect(`Root Word`, extra_suffix),
      stringr::str_remove(`Root Word`, extra_suffix),
      `Root Word`
    )
  )

  # Then remove the prefix for words with more than 5 characters
  df_map <- dplyr::mutate(
    df_map,
    `Root Word` = dplyr::if_else(
      match == "NO" & stringr::str_detect(`Root Word`, prefix) &
        nchar(`Root Word`) > 5,
      stringr::str_remove(`Root Word`, prefix),
      `Root Word`
    )
  )

  # Then remove the suffix for words with more than 5 characters
  df_map <- dplyr::mutate(
    df_map,
    `Root Word` = dplyr::if_else(
      match == "NO" & stringr::str_detect(`Root Word`, suffix) &
        nchar(`Root Word`) > 5,
      stringr::str_remove(`Root Word`, suffix),
      `Root Word`
    )
  )

  # The helper flag is not part of the result
  df_map <- dplyr::select(df_map, -c(match))

  message("'Root Word' is now returned instead of 'root_word'")
  df_map
}
#' @export
#' @rdname stem_malay
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.