# Required packages
library(stringr)  # str_match, used by read.displex
library(dplyr)    # data manipulation verbs and the %>% pipe
library(tidyr)    # pivot_longer, used by build.availability.levels

#' Zipf law
#' Weights the i-th word of w as k / (i + d)^a
zipf.law <- function(w, k=1,a=1,d=0) {
f = k/(seq_along(w)+d)^a
names(f) <- w
f
}
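# A minimal usage sketch (the word list is made up for illustration): with the
# defaults, the i-th word simply gets weight 1/i, so the three words below
# receive 1, 1/2 and 1/3, named after the words themselves.
zipf.law(c("table", "chair", "lamp"))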
#' Exponential law
#' Weights the i-th word of w as k / a^(i + d)
exponential.law <- function(w,a=2,k=1,d=0) {
f = k/a^(seq_along(w)+d)
names(f) <- w
f
}
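# Same sketch for the exponential law with its defaults (a=2, k=1, d=0): the
# i-th word gets 1/2^i, so the made-up list below yields 0.5, 0.25 and 0.125.
exponential.law(c("table", "chair", "lamp"))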
#' Additive reduce law
#' Combines the weights w of a word's occurrences as 1 - prod(1 - w)^k
additive.reduce.law <- function(w,k=1) {
1 - prod(1-w)^k
}
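# Sketch of the reduction step: given the weights a single word received in
# different response lists, the additive law returns their probabilistic sum,
# e.g. 1 - (1 - 0.5) * (1 - 0.25) = 0.625 for the made-up weights below.
additive.reduce.law(c(0.5, 0.25))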
#' Read displex file
#'
#' This function reads a file in the format expected from displex
#' @param file Name of the file containing the data
#' @keywords read, displex
read.displex <- function(file) {
f = readLines(file)
infos = character()
users = character()
centers = character()
words = list()
for (i in seq_along(f)) {
# Each line has the form: <info> <user> <center> <comma-separated word list>
a <- str_match(f[i], "^(\\d+) (\\d+) (\\d+) (.+)$")[-1]
infos[length(infos)+1] = a[1]
users[length(users)+1] = a[2]
centers[length(centers)+1] = a[3]
# Split the comma-separated word list and trim whitespace (trimws is vectorised)
w = trimws(strsplit(a[4], ",")[[1]])
words[[length(words)+1]] = w
}
d <- data.frame(infos=infos, users=users, centers=centers, words=I(words))
d
}
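# Self-contained sketch of read.displex on a tiny made-up file. Each line
# follows the format the regular expression above expects: three integer codes
# (info, user, center) followed by a comma-separated word list; the codes and
# words are invented for illustration only.
tmp <- tempfile()
writeLines(c("1 10 1 table, chair, lamp", "1 11 1 chair, table"), tmp)
example.data <- read.displex(tmp)
# example.data is a data.frame with one row per response list and the words
# stored as a list column (see example.data$words).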
# General function for obtaining the availability spectrum of an interest center
# data -> data.frame as provided by read.displex
# law -> law used to quantify each word's compatibility with the interest center
# reduce -> law used to reduce the scores of a word's occurrences within a center to a single number
# Returns a data.frame with the availability field added.
build.availability <- function(data,law=exponential.law, reduce=additive.reduce.law) {
words = character()
availability = numeric()
for (i in seq_along(data$words)) {
words = c(words, data$words[[i]])
availability = c(availability, law(data$words[[i]]))
}
# Repeat each center code once per word elicited for it
centers = rep(data$centers, lengths(data$words))
new.data <- data.frame(centers=centers, words=words, availability=availability) %>%
group_by(centers,words) %>%
summarise(availability=reduce(availability),freq.abs=length(words),.groups="drop")
total_by_center <- new.data %>%
group_by(centers) %>%
summarise(counttotal=sum(freq.abs), .groups="drop")
new.data <-
left_join(new.data, total_by_center, by="centers") %>%
mutate(freq.rel=freq.abs/counttotal) %>%
arrange(-freq.rel) %>%
group_by(centers) %>%
mutate(freq.abs.cum = cumsum(freq.abs), freq.rel.cum = cumsum(freq.rel)) %>%
ungroup() %>%
arrange(centers,-availability)
new.data$order <- unlist(tapply(new.data$words, new.data$centers,function(x) {seq_along(x)}))
new.data %>% select(centers,words,order,availability,freq.abs,freq.rel,freq.abs.cum,freq.rel.cum)
}
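# Sketch of the generic pipeline on the made-up data built above, using the
# default laws: each word is weighted by its position in every response list
# (exponential law) and the weights are combined per center/word pair
# (additive reduction), alongside absolute, relative and cumulative frequencies.
avail <- build.availability(example.data)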
# Obtain the availability following the López-Strassburger model
# Returns a data.frame with the availability field added.
build.lopezstrass.availability <- function(data) {
# Maximum list position reached (length of the longest response list)
n <- max(lengths(data$words))
# Total number of speakers
N <- length(unique(data$users))
build.availability(data = data,
law = function(w) {exponential.law(w,a=exp(2.3/(n-1)),d=-1)},
reduce = function(x) {sum(x) / N})
}
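# The same made-up data under the López-Strassburger parametrisation; the
# result has the same shape as `avail`, but availability is normalised by the
# number of speakers.
lopez.avail <- build.lopezstrass.availability(example.data)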
# Obtain the availability following the Ávila-Sánchez model
# Returns a data.frame with the availability field added.
build.avilasanchez.availability <- function(data, k=1) {
build.availability(data,
law=function(x) {zipf.law(x,k=k,d=1)},
reduce=additive.reduce.law)
}
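# And under the Ávila-Sánchez parametrisation (Zipf weights shifted by one
# position, reduced with the probabilistic sum), again on the toy data.
avila.avail <- build.avilasanchez.availability(example.data)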
# Sugeno integral of the fuzzy set d with respect to the fuzzy measure g
# Returns the value of the integral
fuzzy.expected.value <- function(d, g=function(x) {length(x)/length(d)}, h=function(x) {sqrt(x)}, level=0) {
# Modified measure: apply the transformation h to g(x) `level` times
ng <- function(x) {
res <- g(x)
for (i in seq(length.out=level)) {
res = h(res)
}
res
}
# Distinct membership values, in decreasing order, are the candidate alpha levels
levels <- sort(unique(d), decreasing=TRUE)
# Measure of each alpha cut, i.e. of the elements with membership >= alpha
gs <- sapply(levels,function(x) {ng(d[d >= x])})
res <- cbind(levels,gs)
res <- max(apply(res,1,min))
res
}
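# Sketch with a hand-made membership vector: with the default measure (the
# proportion of elements at or above each cut), the integral of
# c(0.9, 0.7, 0.5, 0.2) is 0.5. Raising `level` applies sqrt to the measure
# repeatedly, inflating it towards 1, so the cut can only increase with level
# (level=1 gives 0.7 here); this is what produces the nested centrality levels
# used below.
fuzzy.expected.value(c(0.9, 0.7, 0.5, 0.2))
fuzzy.expected.value(c(0.9, 0.7, 0.5, 0.2), level=1)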
# Label each word in each interest center in order to build levels of centrality
# data -> availability data.frame as returned by build.availability
# levels -> number of levels used to classify the vocabulary
# returns -> a new data.frame, similar to the input, but with labeled levels
classify.availability.levels <- function(data, levels=6) {
#new.data <- data %>% select(centers,words,order,availability)
new.data <- data
new.data$cutlevel = 0
new.data$level = 0
for (i in seq(length.out = levels)) {
# Cut for this level: Sugeno integral of each center's availabilities, with the
# measure relaxed once more at every level (each pass applies h, i.e. sqrt)
cuts <- tapply(new.data$availability,new.data$centers,function(x) {fuzzy.expected.value(x,level=i)})
cuts <- data.frame(cuts=cuts,centers=names(cuts))
new.data <- left_join(new.data, cuts, by="centers")
new.data$level[new.data$availability >= new.data$cuts] <- i
new.data$cutlevel[new.data$availability >= new.data$cuts] <- new.data$cuts[new.data$availability >= new.data$cuts]
new.data <- new.data %>% select(centers,words,order,availability,level,cutlevel,freq.abs,freq.rel,freq.abs.cum,freq.rel.cum)
}
new.data %>% arrange(-availability,centers)
}
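# Sketch on the toy availability table built earlier: each word is tagged with
# the highest level whose cut its availability reaches (0 means below every cut).
classified <- classify.availability.levels(avail, levels=6)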
# Reduce the classified data.frame to one row per center and level, with the word count and the list of words at that level
build.availability.levels <- function(data) {
mt <- tapply(data$words, list(data$centers,data$level),c,simplify=FALSE)
dmt <- data.frame(mt)
names(dmt) <- colnames(mt)
dmt$centers <- rownames(dmt)
dmt <- dmt %>% pivot_longer(-centers,names_to="level", values_to="words") %>% mutate(level=strtoi(level))
dmt$count <- lapply(dmt$words, function(x) {length(unlist(x))}) %>% unlist
dmt %>% select(centers,level,count,words) %>% arrange(centers,-level)
}
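# Final step of the sketch: collapse the classified toy data into one row per
# center and level, with the number of words and the word list at each level.
levels.summary <- build.availability.levels(classified)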