## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
## ----setup--------------------------------------------------------------------
library(LexFindR)
## ----installation, eval = FALSE-----------------------------------------------
# # Install LexFindR from CRAN
# install.packages("LexFindR")
#
# # Or the development version from GitHub:
# # install.packages("devtools")
# devtools::install_github("maglab-uconn/LexFindR")
## ----getting-started----------------------------------------------------------
library(LexFindR)
# Get the cohort indices of "ark" in a lexicon of "ark", "art", and "bab"
target <- "AA R K"
lexicon <- c("AA R K", "AA R T", "B AA B")
cohort <- get_cohorts(target, lexicon)
cohort
# To get forms rather than indices using base R
lexicon[cohort]
# To get forms rather than indices using the form option
get_cohorts(target, lexicon, form = TRUE)
# Get count using base R
length(cohort)
# Get count using the count option
get_cohorts(target, lexicon, count = TRUE)
# Frequency weighting
target_freq <- 50
lexicon_freq <- c(50, 274, 45)
# Get the summed log frequencies of competitors (FW)
get_fw(lexicon_freq)
# Get the frequency-weighted competitor probability (FWCP): the
# target's log frequency relative to the summed log frequencies
get_fwcp(target_freq, lexicon_freq)
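# As a sanity check, the same values can be computed by hand; this
# sketch assumes natural logs and the default pad of 0, with the
# competitor set (which here includes the target) in the denominator
sum(log(lexicon_freq))
log(target_freq) / sum(log(lexicon_freq))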
## ----comment-on-CMU-----------------------------------------------------------
# By default, the CMU Pronouncing Dictionary marks vowels with
# digits that indicate lexical stress
#
# If you do not strip those out, instances of the same vowel
# with different stress numbers will be treated as different
# symbols. This may be useful for some purposes (e.g., finding
# cohorts or neighbors with the same stress pattern).
#
# Here is a contrived example, where ARK will not be considered
# related to ART or BARK because of stress pattern differences
target <- "AA0 R K"
lexicon <- c("AA0 R K", "AA2 R T", "B AA3 R K")
get_cohorts(target, lexicon, form = TRUE)
get_neighbors(target, lexicon, form = TRUE)
# If this is not the behavior we want, we can strip lexical
# stress indicators using regular expressions
target <- gsub("\\d", "", target)
lexicon <- gsub("\\d", "", lexicon)
print(target)
print(lexicon)
get_cohorts(target, lexicon, form = TRUE)
get_neighbors(target, lexicon, form = TRUE)
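# gsub is vectorized, so the same stripping applies directly to a
# dataframe column; the lexicon_df name below is illustrative and
# assumes your pronunciations retain stress digits
# lexicon_df$Pronunciation <- gsub("\\d", "", lexicon_df$Pronunciation)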
## ----slex-cohort-example------------------------------------------------------
library(tidyverse)
glimpse(slex)
# define the list of target words to compute cohorts for; we will
# use *target_df* instead of modifying slex or lemmalex directly
target_df <- slex
# specify the reference lexicon; here it is actually the list
# of pronunciations from slex, as we want to find all cohorts
# for all words in our lexicon. It is not necessary to create
# a new dataframe, but because we find it useful for more
# complex tasks, we use this approach here
lexicon_df <- target_df
# this instruction will create a new column in our target_df
# dataframe, "cohort_idx", which will be the list of lexicon_df
# indices corresponding to each word's cohort set
target_df$cohort_idx <-
  lapply(
    # in each lapply instance, select the target pronunciation
    target_df$Pronunciation,
    # in each lapply instance, apply the get_cohorts function
    FUN = get_cohorts,
    # in each lapply instance, compare the current target
    # Pronunciation to each lexicon Pronunciation
    lexicon = lexicon_df$Pronunciation
  )
# let's look at the first few instances in each field...
glimpse(target_df)
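# from the index lists, competitor counts follow directly with base
# R's lengths(), mirroring the count option shown earlier
target_df$cohort_num <- lengths(target_df$cohort_idx)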
## ----slex-rhyme-tidyverse-----------------------------------------------------
slex_rhymes <- slex %>% mutate(
  rhyme_idx = lapply(Pronunciation, get_rhymes, lexicon = Pronunciation),
  rhyme_str = lapply(rhyme_idx, function(idx) {
    Item[idx]
  }),
  rhyme_count = lengths(rhyme_idx)
)
glimpse(slex_rhymes)
slex_rhymes <- slex_rhymes %>%
  rowwise() %>%
  mutate(
    rhyme_freq = list(slex$Frequency[rhyme_idx]),
    rhyme_fw = get_fw(rhyme_freq),
    rhyme_fwcp = get_fwcp(Frequency, rhyme_freq)
  ) %>%
  ungroup()
glimpse(slex_rhymes)
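# rowwise() is convenient but can be slow on large lexicons; the same
# columns can be computed without it via mapply, as in the extended
# example below (the *_fw2/*_fwcp2 names are just for illustration,
# and the default pad of 0 is assumed)
slex_rhymes$rhyme_fw2 <- mapply(get_fw,
  competitors_freq = slex_rhymes$rhyme_freq
)
slex_rhymes$rhyme_fwcp2 <- mapply(get_fwcp,
  target_freq = slex_rhymes$Frequency,
  competitors_freq = slex_rhymes$rhyme_freq
)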
## ----parallelize, cache = TRUE------------------------------------------------
library(future.apply)
library(tictoc)
# using two cores for the demo; otherwise, set `workers` to
# availableCores() to use all available cores
plan(multisession, workers = 2)
glimpse(lemmalex)
# the portion between tic and toc below takes ~66 secs with
# future_lapply on a 15-inch MacBook Pro 6-core i9; replacing
# future_lapply with plain lapply takes ~317 secs
tic("Finding rhymes")
slex_rhyme_lemmalex <- lemmalex %>% mutate(
  rhyme = future_lapply(Pronunciation, get_rhymes,
    lexicon = lemmalex$Pronunciation
  ),
  rhyme_str = lapply(rhyme, function(idx) {
    lemmalex$Item[idx]
  }),
  rhyme_len = lengths(rhyme)
)
toc()
glimpse(slex_rhyme_lemmalex)
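# when the parallel work is done, switching back to sequential
# execution releases the background worker sessions
plan(sequential)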
## ----extended-example---------------------------------------------------------
library(LexFindR)
library(tidyverse) # for glimpse
library(future.apply) # parallelization
library(tictoc) # timing utilities
# In this example, we define a dataframe source for target words
# (target_df) and another for the lexicon to compare the target
# words to (lexicon_df). Often, these will be the same, but we keep
# them separate here to make it easier for others to generalize from
# this example code.
# Code assumes you have at least 3 columns in target_df & lexicon_df:
# 1. Item -- a label of some sort, can be identical to Pronunciation
# 2. Pronunciation -- typically a phonological form
# 3. Frequency -- should be in occurrences per million, or some other
#                 raw form, as the functions below take the log of
#                 the frequency values. See the advice about padding
#                 in the main article text.
#
# Of course, you can name your fields as you like, and edit the
# field names below appropriately.
target_df <- slex
lexicon_df <- target_df
# Prepare for parallelizing
# 1. how many cores do we have?
# num_cores <- availableCores()
# using two cores for demo
num_cores <- 2
print(paste0("Using num_cores: ", num_cores))
# 2. now let future.apply figure out how to optimize parallel
# division of labor over cores
plan(multisession, workers = num_cores)
# the functions in this list all return lists of word indices; the
# uniqueness point function is not included because it returns a
# single value per word.
fun_list <- c(
  "cohorts", "neighbors",
  "rhymes", "homoforms",
  "target_embeds_in", "embeds_in_target",
  "nohorts", "cohortsP", "neighborsP",
  "target_embeds_inP", "embeds_in_targetP"
)
# we need to keep track of the P variants: because they exclude the
# target, we need to tell get_fwcp to add the target frequency back in
Ps <- c(
  "cohortsP", "neighborsP", "target_embeds_inP",
  "embeds_in_targetP"
)
# determine how much to pad based on minimum frequency
if (min(target_df$Frequency) == 0) {
  pad <- 2
} else if (min(target_df$Frequency) < 1) {
  pad <- 1
} else {
  pad <- 0
}
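# to see why padding matters: log(0) is -Inf and logs of frequencies
# below 1 are negative, either of which would distort summed-log
# weights; padding keeps every log finite and non-negative
log(0)
log(0.5)
log(0.5 + 1)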
# now let's loop through the functions
for (fun_name in fun_list) {
  # start the timer for this function
  tic(fun_name)
  # the P functions do not include the target in the denominator for
  # get_fwcp; if we want this to be a consistent ratio, we need to
  # add the target frequency to the denominator
  add_target <- FALSE
  if (fun_name %in% Ps) {
    add_target <- TRUE
  }
  # inform the user that we are starting the next function and confirm
  # whether we are adding the target frequency
  cat("Starting", fun_name, "-- add_target =", add_target, "\n")
  func <- paste0("get_", fun_name)
  # use *future_lapply* to do the competitor search, creating
  # a new column in *target_df* that will be this function's
  # name + _idx (e.g., cohorts_idx)
  target_df[[paste0(fun_name, "_idx")]] <-
    future_lapply(target_df$Pronunciation,
      FUN = get(func),
      lexicon = lexicon_df$Pronunciation
    )
  # list the competitor form labels in functionname_str
  target_df[[paste0(fun_name, "_str")]] <- lapply(
    target_df[[paste0(fun_name, "_idx")]],
    function(idx) {
      lexicon_df$Item[idx]
    }
  )
  # list the competitor frequencies in functionname_freq
  target_df[[paste0(fun_name, "_freq")]] <- lapply(
    target_df[[paste0(fun_name, "_idx")]],
    function(idx) {
      lexicon_df$Frequency[idx]
    }
  )
  # put the count of competitors in functionname_num
  target_df[[paste0(fun_name, "_num")]] <-
    lengths(target_df[[paste0(fun_name, "_idx")]])
  # put the FW in functionname_fwt
  target_df[[paste0(fun_name, "_fwt")]] <-
    mapply(get_fw,
      competitors_freq = target_df[[paste0(fun_name, "_freq")]],
      pad = pad
    )
  # put the FWCP in functionname_fwcp
  target_df[[paste0(fun_name, "_fwcp")]] <-
    mapply(get_fwcp,
      target_freq = target_df$Frequency,
      competitors_freq = target_df[[paste0(fun_name, "_freq")]],
      pad = pad, add_target = add_target
    )
  toc()
}
# Note that get_neighborsP excludes rhymes. If you do not want to
# track rhymes separately and want neighborsP to include all
# rhymes that are not cohorts, you can create new fields that
# combine them, as we do here, creating "Pr" versions
target_df$neighborsPr_num <- target_df$neighborsP_num + target_df$rhymes_num
target_df$neighborsPr_fwcp <- target_df$neighborsP_fwcp + target_df$rhymes_fwcp
target_df$neighborsPr_fwt <- target_df$neighborsP_fwt + target_df$rhymes_fwt
# Now let's streamline the dataframe; we'll select the num, fwt, and fwcp
# columns and put them in that order, while not keeping some of the other
# 'helper' columns we created
export_df <- target_df %>%
  select(Item | Pronunciation | Frequency |
    ends_with("_num") | ends_with("_fwt") | ends_with("_fwcp"))
glimpse(export_df)
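# to save the streamlined dataframe for later use (the file name
# here is just illustrative):
# write_csv(export_df, "lexfindr_competitors.csv")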