Nothing
#' @title Statistical assessment of
#' impacts of a specified theme from a DTD.
#' @description This function assesses the impacts of a theme
#' (or subject) on the overall opinion computed for a DTD
#' Different themes in a DTD can be identified by the keywords
#' used in the DTD. These keywords (or words) can be extracted by
#' any analytical means available to the users, e.g.
#' `word_imp` function. The keywords must be collated and
#' supplied this function through the `theme_keys` argument
#' (see below).
#' @param textdoc An \code{n} x \code{1} list (dataframe) of
#' individual text records, where \code{n} is the total
#' number of individual records.
#' @param theme_keys (a list) A one-column dataframe (of any
#' number of length) containing a list of keywords relating
#' to the theme or secondary subject to be investigated.
#' The keywords can also be defined as a vector of characters.
#' @param metric (an integer) Specify the metric to utilize
#' for the calculation of opinion score. Default: \code{1}.
#' See detailed documentation
#' in the \code{opi_score} function.
#' @param fun A user-defined function given that parameter
#' \code{metric} (above) is set equal to \code{5}.
#' See detailed documentation
#' in the \code{opi_score} function.
#' @param nsim (an integer) Number of replicas (ESD) to generate.
#' See detailed documentation in the \code{opi_sim} function.
#' Default: \code{99}.
#' @param alternative (a character) Default: \code{"two.sided"},
#' indicating a two-tailed test. A user can override
#' this default value by specifying \code{“less”} or \code{“greater”}
#' to run the analysis as one-tailed test when the observed score
#' is located at the lower or upper regions of the expectation
#' distribution, respectively. Note: for \code{metric=1},
#' the `alternative` parameter should be
#' set equal to \code{"two.sided"} because the opinion score is
#' bounded by both positive and negative values. For an opinion
#' score bounded by positive values, such as when
#' \code{metric = 2, 3 or 4}, the `alternative` parameter
#' should be set as "greater", and set as "less" otherwise.
#' If metric parameter is set equal to \code{5}, with a user-defined
#' opinion score function (i.e. `fun` not NULL ), the user is required
#' to determine the limits of the opinion scores, and set the
#' `alternative` argument appropriately.
#' @param quiet (TRUE or FALSE) To suppress processing
#' messages. Default: \code{TRUE}.
#' @usage opi_impact(textdoc, theme_keys=NULL, metric = 1,
#' fun = NULL, nsim = 99, alternative="two.sided",
#' quiet=TRUE)
#'
#' @examples
#'
#' # Application in marketing:
#'
#' #`data` -> 'reviews_dtd'
#' #`theme_keys` -> 'refreshment_theme'
#'
#' #RQ2a: "Do the refreshment outlets impact customers'
#' #opinion of the services at the Piccadilly train station?"
#'
#' ##execute function
#' output <- opi_impact(textdoc = reviews_dtd,
#' theme_keys=refreshment_theme, metric = 1,
#' fun = NULL, nsim = 99, alternative="two.sided",
#' quiet=TRUE)
#'
#' #To print results
#' print(output)
#'
#' #extracting the pvalue in order to answer RQ2a
#' output$pvalue
#'
#'
#' @details This function calculates the statistical
#' significance value (\code{p-value}) of an opinion score
#' by comparing the observed score (from the \code{opi_score}
#' function) with the expected scores (distribution) (from the
#' \code{opi_sim} function). The formula is given as
#' `p = (S.beat+1)/(S.total+1)`, where `S_total` is the total
#' number of replicas (`nsim`) specified, `S.beat` is number of replicas
#' in which their expected scores are than the observed score (See
#' further details in Adepeju and Jimoh, 2021).
#' @return Details of statistical significance of impacts
#' of a secondary subject `B` on the opinion concerning the
#' primary subject `A`.
#' @references (1) Adepeju, M. and Jimoh, F. (2021). An Analytical
#' Framework for Measuring Inequality in the Public Opinions on
#' Policing – Assessing the impacts of COVID-19 Pandemic using
#' Twitter Data. https://doi.org/10.31235/osf.io/c32qh
#' @importFrom tidytext unnest_tokens
#' @importFrom tibble tibble
#' @importFrom magrittr %>%
#' @importFrom stringr str_detect
#' @importFrom purrr map_chr
#' @importFrom magrittr %>%
#' @importFrom stringr str_c
#' @importFrom likert likert.options likert
#' likert.bar.plot
#' @importFrom dplyr filter mutate left_join arrange
#' select arrange if_else mutate_if
#' @export
opi_impact <- function(textdoc, theme_keys=NULL, metric = 1,
fun = NULL, nsim = 99, alternative="two.sided",
quiet=TRUE){ #tweets
keywords <- text <- ID <- sentiment<-flush.console <-
desc <- asterisk <- comb <- NULL
if(!is.null(theme_keys)){
theme_keys <- data.frame(theme_keys)
}
#output holder
output <- list()
#check if randomization is too small
if(nsim < 99){
stop("Number of simulation (nsim) is too small!!")
}
if(nsim > 9999){
stop(paste("Consider specifying a smaller",
"number of simulations (nsim)!!", sep=" "))
}
if(is.null(theme_keys)){
stop(" 'theme_keys' parameter cannot be 'NULL'!! ")
}
#check any contradiction in tail comparison
if(metric == 1 & (alternative %in% c("less", "greater"))){
stop(paste("When parameter `metric = 1`, argument",
" `alternative` must be set as 'two.sided'!! "))
}
if(metric %in% c(2:4) & alternative == "two.sided"){
stop(paste('When parameter `metric =` ', metric,
", argument `two.sided` must be set as 'less'!! ", sep=""))
}
#format keywords
theme_keys <- data.frame(as.character(theme_keys[,1]))
colnames(theme_keys) <- "keys"
theme_keys <-
as.character(theme_keys %>% map_chr(~ str_c(., collapse = "|")))
#format text records
textdoc <- data.frame(textdoc[,1])
colnames(textdoc) <- c("text")
#separate `textdoc` into two:
#textdoc_keypresent': contains any of the keywords
#textdoc_keyabsent': contains no keywords
textdoc_keypresent <- data.frame(textdoc) %>%
filter(str_detect(text, theme_keys, negate=FALSE)) %>%
mutate(keywords = "present")
if(nrow(textdoc_keypresent)==0){
stop(paste("The text record contains NONE of",
"the secondary keywords!! Operation terminated!!", sep=" "))
}
textdoc_keyabsent <- data.frame(textdoc) %>%
filter(stringr::str_detect(text, theme_keys, negate=TRUE))%>%
mutate(keywords = "absent")
#combine and retain the IDs of text records
#for later joining
textdoc_comb <- rbind(textdoc_keypresent, textdoc_keyabsent)
#create seq_ID
textdoc_comb$ID <- seq.int(nrow(textdoc_comb))
textonly <- data.frame(textdoc_comb$text) #head(textonly)
colnames(textonly) <- "text"
#drop the large text field
textdoc_comb <- textdoc_comb %>%
select(c(keywords, ID))
#compute the OSD and opinion scores
obj_both <- opi_score(textonly, metric = metric, fun = fun)
#extract the score
observed_score <- as.numeric(gsub("\\%", "", obj_both$opiscore))
#extract the OSD
OSD <- obj_both$OSD
#join OSD and the with the pre-processed
OSD_joined <- textdoc_comb %>%
left_join(OSD) %>%
#filter records that do not contain sentiment words
filter(!is.na(sentiment)) %>%
arrange(desc(keywords), sentiment)%>%
select(ID, sentiment, keywords)
#prepare data for Likert plot
#filter neutral
likert_osd <- OSD_joined %>%
#filter(sentiment != "neutral") %>%
arrange(keywords, desc(sentiment)) %>%
select(-c(ID)) %>%
mutate(class = if_else(sentiment == "neutral",
paste("neutral", "", sep=""),
paste(paste("key",keywords, sep="."), sentiment, sep="_")))%>%
select(class)
#hich(OSD_joined$sentiment == "neutral")
#unique(likert_osd$class)
#plot function here
#if(pplot == TRUE){
lik_p1 <- data.frame(likert_osd) %>% mutate_if(is.character,as.factor)
# Make list of ordered factor variables
out <- lapply(lik_p1, function(x) ordered(x,
levels = c("key.absent_positive", "key.absent_negative",
"neutral", "key.present_negative", "key.present_positive")))
# Combine into data.frame
res <- do.call( data.frame , out )
# Build plot
likert.options(legend="Classes")
p <- likert(res)
title <- "Percentage proportion of classes"
#plot(p, center=3, centered=FALSE) + ggtitle(title)
pp <- likert.bar.plot(p, legend="Classes")
#}
#terminate process if keyword fields
#does not include both 'present' and 'absent'
pres_abs <- unique(OSD_joined$keywords)
if(length(pres_abs) == 1){
stop(paste("The 'theme_keys' are either completely present",
"or absent in a sentiment class! The process terminated!!",
sep=" "))
}
#generate expected scores using `opi_sim` function
expected_scores <- opi_sim(osd_data = OSD_joined,
nsim=nsim,
metric = metric,
fun = fun,
quiet=quiet)
#}
# if(quiet == FALSE){
# #generate expected scores using `opi_sim` function
# expected_scores <- opi_sim(osd_data = OSD_joined,
# nsim=nsim,quiet=FALSE)
# }
#check if there is contradiction in
#the alternative argument
if(observed_score > 0 & alternative == "less"){
flush.console()
print(paste("Warning: 'Observed score' is positive!!",
"'two.sided' criterion is utilized!"))
alternative <- "two.sided"
}
if(observed_score <= 0 & alternative == "greater"){
flush.console()
print(paste("Warning: 'Observed score' is negative!!",
"'two.sided' criterion is utilized!"))
alternative <- "two.sided"
}
#first, check the alternative argument
#second, check the sign of the observed score.
#third, compute the p-value
if(alternative == "two.sided"){
if(observed_score <= 0){
S <- expected_scores[which(expected_scores <= observed_score)]
S <- length(S)
}
if(observed_score > 0){
S <- expected_scores[which(expected_scores > observed_score)]
S <- length(S)
}
p <- round((S + 1)/(nsim + 1), digits = nchar(nsim))
}
if(alternative == "less"){
S <- expected_scores[which(expected_scores <= observed_score)]
S <- length(S)
p <- round((S + 1)/(nsim + 1), digits = nchar(nsim))
}
if(alternative == "greater"){
S <- expected_scores[which(expected_scores > observed_score)]
S <- length(S)
p <- round((S + 1)/(nsim + 1), digits = nchar(nsim))
}
#preparing final result
S_beat <- S
#significance level
#first, create a table of significance
#nsim=99
ci <- c(95, 97.5, 99, 99.9, 99.99)
v <- c(nsim, ((nsim + 1) - ((nsim + 1) * (ci/100))))
#remove decimal locations
v <- v[v >= 1]
p_loc <- round(v/(nsim+1), digits = nchar(nsim+1))
p_loc <- p_loc[order(p_loc)]
#create list of asterix
aster <- NULL
for(i in 3:1){ #i=3
n <- rep("*", i)
aster <- rbind(aster, paste(n, collapse='' ))
}
aster <- rbind(aster, "'")
p_loc <- data.frame(cbind(p_loc, aster))
colnames(p_loc) <- c("p_loc", "asterisk")
v <- c(v, 0)
v <- v[order(v)]
#where does S falls
int <- findInterval(S_beat, v)
signif <- p_loc[int,2]
signif <- paste(signif, collapse='')
#finally merge col
p_loc <- p_loc %>%
mutate(comb=paste(p_loc, asterisk, sep="")) %>%
select(comb)
p_loc <- p_loc[, "comb"]
#collate all results
output$test <- "Test of significance (Randomization testing)"
output$criterion <- alternative
output$exp_summary <- summary(expected_scores)
output$p_table <- knitr::kable(data.frame(cbind(observed_score,
S_beat, nsim, pvalue=(S+1)/(nsim+1), signif)))
output$p_key <- rev(p_loc)
output$p_formula <- "(S_beat + 1)/(nsim + 1)"
output$plot <- pp
return(output)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.