Nothing
#' @title Words Distribution
#' @description This function examines whether the distribution
#' of word frequencies in a text document follows the Zipf distribution
#' (Zipf 1934). The Zipf's distribution is considered the ideal
#' distribution of a perfect natural language text.
#' @param textdoc \code{n} x \code{1} list (dataframe)
#' of individual text records, where \code{n} is the number
#' of individual records.
#' @usage word_distrib(textdoc)
#' @examples
#'
#' #Get an \code{n} x 1 text document
#' tweets_dat <- data.frame(text=tweets[,1])
#' plt = word_distrib(textdoc = tweets_dat)
#'
#' plt
#'
#' @details The Zipf's distribution is most easily observed by
#' plotting the data on a log-log graph, with the axes being
#' log(word rank order) and log(word frequency). For a perfect
#' natural language text, the relationship between the word rank
#' and the word frequency should have a negative slope with all points
#' falling on a straight line. Any deviation from the straight
#' line can be considered an imperfection attributable to the
#' texts within the document.
#' @return A list of word ranks and their respective
#' frequencies, and a plot showing the relationship between
#' the two variables.
#' @references Zipf G (1936). The Psychobiology of Language.
#' London: Routledge; 1936.
#' @importFrom tidytext unnest_tokens
#' @importFrom ggplot2 ggplot geom_line aes scale_color_manual
#' labs scale_x_log10 scale_y_log10 scale_shape_manual geom_abline
#' xlab ylab theme theme_light element_text element_rect geom_point
#' ggplot_build position_nudge geom_text
#' @importFrom tibble tibble tribble
#' @importFrom magrittr %>%
#' @importFrom dplyr summarise ungroup row_number
#' select count mutate filter rename
#' @importFrom cowplot get_legend plot_grid
#'
#' @export
word_distrib <- function(textdoc){
outp <- list()
#check that data is one-column
if(dim(textdoc)[2]!=1){
stop(paste("Dataframe must include only one column",
"containing the text records!!", sep=" "))
}
dat <- list(textdoc)
series <- tibble()
#tokenize document
tokenized <- tibble(text = as.character(unlist(dat)))%>%
unnest_tokens(word, text)%>% #tokenize
select(everything())
series <- tokenized
#compute term frequencies
doc_words <- series %>%
count(word, sort = TRUE) %>%
ungroup()
total_words <- doc_words %>%
summarise(total = sum(n))
doc_words <- cbind(doc_words, total_words)
#plot log(rank order) vs. log(word frequency)
freq_by_rank <- doc_words %>%
mutate(rank = row_number(),
`term_freq` = n / total)
#---
data <- word <- text <- everything <- summarise <- n <- mutate <-
row_number <- total <- aes <- `term_freq` <- geom_line <-
scale_x_log10 <- scale_y_log10 <- filter <- lm <- geom_abline <- xlab <-
ylab <- theme_light <-Legend <- scale_colour_brewer <- x <- y <-
position_nudge <- ggplot_build <- NULL
# ggplot(freq_by_rank, aes(rank, `term_freq`)) +
# geom_line() +
# scale_x_log10() +
# scale_y_log10()
#compare the distribution to a simple regression line
#examine the head, tail and mid section of the plot
lower_rank <- freq_by_rank %>%
filter(rank < 500)%>%
rename(`term_freq` = 5)
int_slope <- lm(log10(`term_freq`) ~ log10(rank), data = lower_rank)
#colors <- c("Sepal Width" = "blue", "Petal Length" = "red")
#create a fictitious data to borrow its legend
dat <- tribble(
~Legend, ~x, ~y,
"LogFreq_vs_LogRank", -1, -1,
"LogFreq_vs_LogRank", 1, 1,
"Idealized_Zipfs_Law", -1, 1,
"Idealized_Zipfs_Law", 1, -1
)
p <- ggplot(data=dat, aes(x, y, color=Legend)) +
geom_line(size = 1.6, alpha = 0.8) +
scale_color_manual(values = c(LogFreq_vs_LogRank = "red",
Idealized_Zipfs_Law = "gray40"))
#get legend
leg <- get_legend(p)
freq_by_rank <- freq_by_rank %>%
mutate(group=1)
#get index for labels
xx <-round(nrow(freq_by_rank)/10, digits=0) - 2
#create sequence
idx <- xx
for(i in 2:10){#i<-2
idx <- rbind(idx, idx[i-1] + xx)
#flush.console()
#print(i)
}
idx <- as.vector(idx)
#plot to first get x labels
get_x <- ggplot(freq_by_rank, aes(rank, term_freq,
color="Log(freq) vs. Log(r)")) +
geom_line(size = 1.6, alpha = 0.8, colour="red") +
geom_point(colour="red", alpha=0, size=1) +
geom_text(data=freq_by_rank[idx,],
aes(rank, term_freq,label=word))+
labs(title="Checking text document against Zipf's law")+
scale_x_log10() +
scale_y_log10()
get_x <- ggplot_build(get_x)$layout$panel_params[[1]]$x$get_labels()
get_x <- as.numeric(get_x[!is.na(get_x)])
#modify the last label
get_x <-c(get_x[1:(length(get_x)-1)],round((get_x[length(get_x)] +
get_x[(length(get_x)-1)])/2, digits=0))
lpt <- ggplot(freq_by_rank, aes(rank, term_freq,
color="Log(freq) vs. Log(r)")) +
geom_line(size = 1.6, alpha = 0.8, colour="red") +
geom_point(colour="red", alpha=0, size=1) +
geom_text(data=freq_by_rank[get_x,],
aes(rank, term_freq,label=word, colour="black"),
colour="black", position = position_nudge(y = 0.35))+
labs(title="Checking text document against Zipf's law")+
scale_x_log10() +
scale_y_log10() +
xlab("log(Rank)") +
ylab("log(Term frequency)")+
#theme(legend.position = "top")
geom_abline(intercept = as.numeric(int_slope$coefficients[1]),
slope = as.numeric(int_slope$coefficients[2]),
color = c("gray40"), linetype = 2, size=1.2) +
theme_light()+
scale_shape_manual(values = 1) +
labs(shape = "", linetype = "") +
theme(panel.border = element_rect(colour = "black", fill=NA),
aspect.ratio = 1, axis.text = element_text(colour = 1, size = 12))
final_lpt <- plot_grid(lpt, leg, ncol = 2, rel_heights = c(1, .1),
rel_widths = c(2,1), axis = "l")#,
#labels = c("AA","BB"),
#label_x = c(1.28, 0.495), label_y= c(0.53, 0.485))
outp$freqRank <- freq_by_rank
outp$plot <- final_lpt
return(outp)
}
#plt = word_distrib(textdoc = policing_otd)
#print(plt)
#Ideal Zipf's law
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.