Nothing
## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
## ----eval=FALSE---------------------------------------------------------------
# install.packages ("soundcorrs")
## -----------------------------------------------------------------------------
library (soundcorrs)
## ----eval=FALSE---------------------------------------------------------------
# # change all a’s to e’s
# function (x,meta)
# gsub ("a", "e", x)
#
# # change a’s followed by j, to e’s – version 1
# function (x, meta)
# gsub ("aj", "ej", x)
#
# # change a’s followed by j, to e’s – version 2
# function (x, meta)
# gsub ("a(?=j)", "e", x, perl=T)
#
# # change a’s and ä’s to e’s
# function (x, meta)
# gsub ("[aä]", "e", x)
#
# # change a’s to e’s only in the northern dialect
# function (x, meta)
# if (meta!="northern") x else gsub("a","e",x)
#
# # change a in the last syllable to e
# function (x, meta) {
#
# # find all the vowels
# # this assumes that 'trans' is a transcription object,
# # in which "V" is defined as a wild-card for 'all vowels'
# # alternatively, a square bracket notation could be used here,
# # similarly to the example above
# syllables <- gregexpr (expandMeta(trans,"V"), x)
#
# # find how many syllables x has
# last <- length (syllables[[1]])
#
# # replace a in the last one
# if (regmatches(x,syllables)[[1]][last] == "a")
# regmatches(x,syllables)[[1]][last] <- "e"
#
# # return the changed string
# return (x)
#
# }
## -----------------------------------------------------------------------------
# establish the paths of the samples included in soundcorrs
path.trans.com <- system.file ("extdata", "trans-common.tsv", package="soundcorrs")
path.trans.ipa <- system.file ("extdata", "trans-ipa.tsv", package="soundcorrs")
# and load them
trans.com <- read.transcription (path.trans.com)
trans.ipa <- read.transcription (path.trans.ipa)
# transcription needs to be an object of class ‘transcription’
class (trans.com)
# a basic summary
trans.com
# ‘data’ is the original data frame
# ‘cols’ is a guide to column names in ‘data’
# ‘meta’ is a vector of characters which act as metacharacters
# ‘values’ is a named list of the values of graphemes, exploded into vectors
# ‘zero’ are the characters denoting the linguistic zero
str (trans.com, max.level=1)
## -----------------------------------------------------------------------------
# establish the paths of the two datasets
path.abc <- system.file ("extdata", "data-abc.tsv", package="soundcorrs")
path.cap <- system.file ("extdata", "data-capitals.tsv", package="soundcorrs")
path.ie <- system.file ("extdata", "data-ie.tsv", package="soundcorrs")
# read “capitals”
d.cap.ger <- read.soundcorrs (path.cap, "German", "ALIGNED.German", path.trans.com)
d.cap.pol <- read.soundcorrs (path.cap, "Polish", "ALIGNED.Polish", path.trans.com)
d.cap.spa <- read.soundcorrs (path.cap, "Spanish", "ALIGNED.Spanish", path.trans.com)
d.cap <- merge (d.cap.ger, d.cap.pol, d.cap.spa)
# read “ie”
d.ie.lat <- read.soundcorrs (path.ie, "Lat", "LATIN", path.trans.com)
d.ie.eng <- read.soundcorrs (path.ie, "Eng", "ENGLISH", path.trans.ipa)
d.ie <- merge (d.ie.lat, d.ie.eng)
# read “abc”
tmp <- long2wide (read.table(path.abc,header=T), skip=c("ID"))
d.abc.l1 <- soundcorrs (tmp, "L1", "ALIGNED.L1", trans.com)
d.abc.l2 <- soundcorrs (tmp, "L2", "ALIGNED.L2", trans.com)
d.abc <- merge (d.abc.l1, d.abc.l2)
# some basic summary
d.abc.l1
d.abc
# ‘cols’ are the names of important columns
# ‘data’ is the original data frame
# ‘names’ are the names of the languages
# ‘segms’ are words exploded into segments; ‘$z’ is a variant with linguistic zeros; ‘$nz’ without them
# ‘segpos’ is a lookup list to check which character belongs to which segment; ‘$z’ is a variant with linguistic zeros; ‘$nz’ without them
# ‘separators’ are the strings used as segment separators
# ‘trans’ are ‘transcription’ objects
# ‘words’ are words obtained by removing separators from the ‘col.aligned’ column; ‘$z’ is a variant with linguistic zeros; ‘$nz’ without them
str (d.abc, max.level=1)
## -----------------------------------------------------------------------------
# a simple sound change
sc.V2a <- soundchange ("V > a", "V>a", trans.com, "All vowels change into a.")
# basic summary
sc.V2a
# ‘name’ is the name of the sound change
# ‘desc’ is a brief description
# ‘fun’ is the sound change function
# ‘trans’ is the transcription used in the change function
str (sc.V2a, max.level=1)
# if need be, functions inside ‘soundchange’ objects can be applied directly
sc.V2a$fun ("ouroboros", NULL)
# a slightly more complex change
sc.VV2a <- soundchange ("V{2,} > a", "VV>a", trans.com, "Only diphthongs change into a.")
sc.VV2a$fun ("ouroboros", NULL)
# a slightly more complex change
sc.CV2Ca <- soundchange ("CV > \\1a", "CV>Ca", trans.com, "Only postconsonantal vowels change into a.")
sc.CV2Ca$fun ("ouroboros", NULL)
# a more complex sound change
sc.2ndV2a.fun <- function (x, meta) {
tmp <- gregexpr (expandMeta(trans.com,"V+"), x)
regmatches(x,tmp)[[1]][2] <- "a"
return (x)
}
sc.2ndV2a <- soundchange (sc.2ndV2a.fun, "2ndV>a", trans.com,
"Only the vowel in the second syllable changes into a.")
sc.2ndV2a$fun ("ouroboros", NULL)
## -----------------------------------------------------------------------------
# “ab” spans segments 1–2, while “a” only occupies segment 1
findExamples (d.abc, "ab", "a", distance.end=0)
findExamples (d.abc, "ab", "a", distance.end=1)
# linguistic zeros cannot be found if ‘zeros’ is set to ‘FALSE’
findExamples (d.abc, "-", "", zeros=T)
findExamples (d.abc, "-", "", zeros=F)
# both the usual and custom regular expressions are permissible
findExamples (d.abc, "a", "[ou]")
findExamples (d.abc, "a", "O")
# the output is actuall a list
str (findExamples(d.abc,"a","a"), max.level=1)
# ‘data’ is what is displayed on the screen
# ‘which’ is useful for subsetting
subset (d.abc, findExamples(d.abc,"a","O")$which)
# ‘which’ can also be used to find examples
# that exhibit more than one correspondence.
aaa <- findExamples (d.cap, "a", "a", "a", distance.start=0, distance.end=0)$which
bbb <- findExamples (d.cap, "b", "b", "b", distance.start=0, distance.end=0)$which
d.cap$data [aaa & bbb,]
# the ‘cols’ argument can be used to alter the printed output
findExamples (d.abc, "a", "O", cols=c("ORTHOGRAPHY.L1","ORTHOGRAPHY.L2"))
## -----------------------------------------------------------------------------
# and see what result this gives
allPairs (d.abc, cols=c("ORTHOGRAPHY.L1","ORTHOGRAPHY.L2"))
# a clearer result could be obtained by running
# allPairs (d.cap, cols=c("ORTHOGRAPHY.German","ORTHOGRAPHY.Polish"),
# file="~/Desktop/d.cap.html", formatter=formatter.html)
## -----------------------------------------------------------------------------
# load the new formatter function …
# source ("~/Desktop/myFormatter.R")
# … and use it instead of formatter.html()
# allPairs (d.cap, cols=c("ORTHOGRAPHY.German","ORTHOGRAPHY.Polish"),
# file="~/Desktop/d.cap.html", formatter=myFormatter)
# note that this time the output will not open in the web browser automatically
## ----warning=FALSE------------------------------------------------------------
# prepare a list of changes, in the order of application
# for the definitions, see subsection Sound changes in section Loading data
sc.list <- list (sc.VV2a, sc.2ndV2a, sc.CV2Ca)
# prepare the data and the expected results
# (warnings can be safely ignored in this case)
tmp.l1 <- soundcorrs(data.frame(SOURCE=c("ouroboros","jormungandr")), "L1", "SOURCE", trans.com)
tmp.l2 <- soundcorrs(data.frame(TARGET=c("arabaras","jarmangandr")), "L2", "TARGET", trans.com)
dataset <- merge (tmp.l1, tmp.l2)
# and apply the changes to our data
res <- applyChanges (dataset, sc.list, "SOURCE", "TARGET", NULL)
res
# see if they match the expectations
res$match
# see which change did not work as expected
# it was CV > Ca because our changes use the sample "common" transcription,
# and j does not count in it as a consonant (it's a semivowel)
res$tree
## -----------------------------------------------------------------------------
# a general overview of the dataset as a whole
summary (d.abc)
# words are the default ‘unit’
summary (d.abc, unit="o")
# in relative values …
rels <- summary (d.abc, count="r")
round (rels, 2)
# … relative to entire rows
apply (rels, 1, sum)
## -----------------------------------------------------------------------------
# a general look in the internal mode
coocc (d.abc)
# now with metadata
coocc (d.abc, "DIALECT.L2")
# in the internal mode,
# the relative values are with regard to segment-to-segment blocks
tab <- coocc (d.abc, count="r")
rows.a <- which (rownames(tab) %hasPrefix% "a")
cols.b <- which (colnames(tab) %hasPrefix% "b")
sum (tab [rows.a, cols.b])
# there are four different segments in L1
sum (tab)
# if two correspondences never co-occur, the relative value is 0/0
# which R represents as ‘NaN’, and prints as empty space
coocc (d.abc, count="r")
# in the external mode,
# the relative values are with regard to blocks of rows, and all columns
tab <- coocc (d.abc, "DIALECT.L2", count="r")
rows.a <- which (rownames(tab) %hasPrefix% "a")
sum (tab [rows.a, ])
## -----------------------------------------------------------------------------
# for a small dataset, the result is going to be small
str (allCooccs(d.abc), max.level=0)
# but it can grow quite quickly with a larger dataset
str (allCooccs(d.cap), max.level=0)
# the naming scheme
names (allCooccs(d.abc))
# and with ‘column’ not ‘NULL’
names (allCooccs(d.abc,column="DIALECT.L2"))
## -----------------------------------------------------------------------------
# prepare some random data
set.seed (27)
dataset <- data.frame (X=1:10, Y=1:10 + runif(10,-1,1))
# prepare models to be tested
models <- list (
"model A" = list( formula="Y~a+X", start=list(list(a=1)) ),
"model B" = list( formula="Y~a^X", start=list(list(a=-1),list(a=1)) ))
# normally, (-1)^X would produce an error with ‘nls()’
# fit the models to the dataset
fit <- multiFit (models, dataset)
# inspect the results
summary (fit)
## -----------------------------------------------------------------------------
# prepare the data
dataset <- coocc (d.abc)
# prepare the models to be tested
models <- list (
"model A" = list( formula="Y~a*(X+b)^2", start=list(list(a=1,b=1)) ),
"model B" = list( formula="Y~a*(X-b)^2", start=list(list(a=1,b=1)) ))
# vanilla nls() often requires fairly accurate starting estimates
# fit the models to the dataset
fit <- fitTable (models, dataset, 1, vec2df.hist)
# inspect the results
summary (fit, metric="sigma")
## -----------------------------------------------------------------------------
# using the default ‘|’ …
addSeparators (d.abc$data$ORTHOGRAPHY.L1)
# … or a full stop
addSeparators (d.abc$data$ORTHOGRAPHY.L1, ".")
## -----------------------------------------------------------------------------
# build a table for a slightly larger dataset
tab <- coocc (d.cap)
# let us focus on L1 a and o
rows <- which (rownames(tab) %hasPrefix% "a")
cols <- which (colnames(tab) %hasPrefix% "o")
binTable (tab, rows, cols)
# or on all a-like and o-like vowels
rows <- which (rownames(tab) %hasPrefix% "[aāäǟ]")
cols <- which (colnames(tab) %hasPrefix% "[oōöȫ]")
binTable (tab, rows, cols)
## -----------------------------------------------------------------------------
# let us search a column other than the one specified as ‘aligned’
orth <- d.abc$data [, "ORTHOGRAPHY.L2"]
# look for all VCC sequences
query <- expandMeta (d.abc$trans[[1]], "VCC")
orth [grep(query,orth)]
# look for all VCC words
query <- expandMeta (d.abc$trans[[1]], "^VCC$")
orth [grep(query,orth)]
# the same in the binary notation
query <- expandMeta (d.abc$trans[[1]], "^[+vow][+cons][+cons]$")
orth [grep(query,orth)]
## -----------------------------------------------------------------------------
# build a table for a slightly larger dataset
tab <- coocc (d.cap)
# it is quite difficult to read as a whole, so let us focus
# on a-like vowels in L1 and s-like consonants in L2
rows <- which (rownames(tab) %hasPrefix% "[aāäǟ]")
cols <- which (colnames(tab) %hasPrefix% "[sśš]")
tab [rows, cols]
# and now let us see what corresponds to a-like vowels in L1
# and s-like consonants in L2
rows <- which (rownames(tab) %hasSuffix% "[aāäǟ]")
cols <- which (colnames(tab) %hasSuffix% "[sśš]")
tab [rows, cols]
## -----------------------------------------------------------------------------
# let us prepare the tables
tabs <- allCooccs (d.abc, bin=F)
# and apply the chi-squared test to them
chisq <- lapplyTest (tabs)
chisq
# this is only an example on a tiny dataset, so let us be more forgiving
summary (chisq, p.value=0.3)
# let us see the problems with ‘a’
attr (chisq$a, "error")
attr (chisq$a, "warning")
# this warning often means that the data were insufficient
tabs$a
## -----------------------------------------------------------------------------
# load a transcription
tmp <- loadSampleDataset ("trans-common")
# it's the same that we've already loaded above
identical (tmp, trans.com)
## -----------------------------------------------------------------------------
# the “abc” dataset is in the long format
abc.long <- read.table (path.abc, header=T)
# the simplest conversion unnecessarily doubles the ID column
long2wide (abc.long)
# but this can be avoided with the ‘skip’ argument
abc.wide <- long2wide (abc.long, skip="ID")
## -----------------------------------------------------------------------------
# with n==1, ngrams() returns simply the frequencies of segments
ngrams (d.cap$data[,"ORTHOGRAPHY.Spanish"])
# counts can easily be turned into a data frame with ranks
tab <- ngrams (d.cap$data[,"ORTHOGRAPHY.Spanish"], n=2)
mtx <- as.matrix (sort(tab,decreasing=T))
head (data.frame (RANK=1:length(mtx), COUNT=mtx, FREQ=mtx/sum(mtx)))
## -----------------------------------------------------------------------------
# select only examples from L2’s northern dialect
subset (d.abc, DIALECT.L2=="north") $data
# select only capitals of countries where German is an official language
subset (d.cap, grepl("German",d.cap$data$OFFICIAL.LANGUAGE)) $data
# select only pairs in which L1 a : L2 a
subset (d.abc, findPairs(d.abc,"a","a")$which) $data
## -----------------------------------------------------------------------------
# let us use the converted “abc” dataset
abc.wide
# with the separator preserved
wide2long (abc.wide, c(".L1",".L2"))
# and with the separator removed
wide2long (abc.wide, c(".L1",".L2"), strip=1)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.