# R/split.R In cysouw/qlcMatrix: Utility Sparse Matrix Functions for Quantitative Language Comparison

```# ==================================================
# High-level functions
# to convert data into collection of sparse matrices
# ==================================================

# split data table with nominal data into matrices
# consider for future to do this via long-form ("reshape")

splitTable <- function(	data,
attributes = colnames(data),
observations = rownames(data),
name.binder = ":",
split = NULL
) {

# assuming a data table with
#	- variables ("A"ttributes) as columns and
#	- observations (O) as rows
#	- values (V) in the cells (different for each column)
# - optionally, multiple values can occur in each cell
#   separated by character indicated in "split"

# multiple values in one cell can be splitted
splitColumn <- function(column, split) {
parts <- strsplit(column, split)
all <- unique(unlist(parts))
matches <- sapply(parts, function(x){match(x, all)})
M <- sparseMatrix(i = unlist(matches),
j = rep.int(1:length(matches),
sapply(matches,length)
)
)
return(list( M = M,
rownames = all
))
}

# basic rewrite, the rest is cosmetic
if (!is.null(split)) {
tt <- apply(data,2,function(x){splitColumn(x, split = split)})
} else {
tt <- apply(data,2,ttMatrix)
}

OV <- tt[[1]]\$M
for (i in tt[-1]) {
OV <- rbind(OV,i\$M)
}
OV <- t(OV)

# the following approach is slow... strange...
# http://www.r-bloggers.com/the-rbinding-race-for-vs-do-call-vs-rbind-fill/
# OV <- t(do.call(rbind,sapply(tt,function(x){x[[1]]})))

cls <- ncol(data)

# index matrix of variables to values
nrValues <- sapply(tt,function(x){length(x\$rownames)})
AV <- ttMatrix(rep.int(1:cls,nrValues))\$M

# we need some variable names to make value names, but entity names might be NULL

if (is.null(attributes)) {
attributes <- paste("X", 1:cls, sep="")
}

# make value names
values <- unlist(sapply(1:cls, function(x) {
paste(attributes[x],tt[[x]]\$rownames,sep=name.binder)
}, simplify = FALSE
)
)

return(list(	attributes = attributes,
values = values,
observations = observations,

OV = OV,	# Observations x Values
AV = AV		# Variables ("Attributes") x Values
))
}

# make unigram and bigram matrices from a vector of strings

splitStrings <- function(	strings,
sep = "",
bigrams = TRUE,
boundary = TRUE,
bigram.binder = "",
gap.symbol = "\u2043",
left.boundary = "#",
right.boundary = "#",
simplify = FALSE
) {
if (bigrams) {
gap.length <- 1
originals <- strings
if (boundary) {
strings <- paste(left.boundary, strings, right.boundary, sep = sep)
}
} else {
gap.length <- 0
}

# SW: Segments x Strings ("Words")
tmp <- pwMatrix(strings, sep=sep, gap.length=gap.length, gap.symbol=gap.symbol)
SW <- tmp\$M
segments <- tmp\$rownames

# US: unigrams x segments
tmp <- ttMatrix(segments)
US <- tmp\$M
unigrams <- tmp\$rownames

# Bisymbols x Segments
if (bigrams) {

# remove gap character from US and symbols
if (length(strings) > 1) {
gap.char <- which(unigrams == gap.symbol)
unigrams <- unigrams[-gap.char]
US <- US[-gap.char,]
}

S <- bandSparse(n = dim(US)[2], k = -1)
tmp <- rKhatriRao(US %*% S, US, unigrams, unigrams, binder = bigram.binder)
BS <- tmp\$M
bisymbols <- tmp\$rownames

# remove boundary from US and symbols
if (boundary) {
boundary.char <- which(unigrams == left.boundary | unigrams == right.boundary)
unigrams <- unigrams[-boundary.char]
US <- US[-boundary.char,]
}

# various forms of output
if (simplify) {
result <- (BS*1) %*% (SW*1)
rownames(result) <- bisymbols
colnames(result) <- originals
return(result)
} else {
return(list(	segments = segments,
unigrams = unigrams,
bigrams = bisymbols,
SW = SW, # Segments x Words
US = US, # Unigrams x Segments
BS = BS # Bigrams x Segments
))
}
} else {
if (simplify) {
result <- (US*1) %*% (SW*1)
rownames(result) <- unigrams
colnames(result) <- strings
return(result)
}
return(list(	segments = segments,
unigrams = unigrams,
SW = SW, # Segments x Words
US = US # Unigrams x Segments
))
}
}

# =========================================================
# convenience function specifically made for parallel texts
# =========================================================

# Read texts from the parallel-text project
# the long version around "scan" is not really quicker as the shortcut using read.table

#	data <- scan(file,sep="\t",comment.char="#",what="character",quote="")
#	dim(data) <- c(2,length(data)/2)
#	result <- data[2,,drop = TRUE]
#	names(result) <- data[1,,drop = TRUE]
#	return(result)

, sep = "\t"
, quote = ""
, colClasses = "character"
, row.names = 1
, encoding = "UTF-8"
)))

}

# make matrices from parallel texts (bible and the like)
# takes three arguments:
# - the text (as vector of strings)
# - the IDs for the sentences as found in this text
# - the IDs for the sentences as found in all texts (important for the parallelism)

splitText <- function(	text,
globalSentenceID = NULL,
localSentenceID = names(text),
sep = " ",
simplify = FALSE,
lowercase = TRUE
) {

# make RunningWords x Verses, i.e. all words of the text in the order as they appear as rows, linked to the localSentenceIDs as columns.
tmp <- pwMatrix(text, sep = sep, gap.length = 0)
RS <- tmp\$M
runningWords <- tmp\$rownames

# make type/token matrix linking the different wordforms to the individual words
# Wordforms x RunningWords
tmp <- ttMatrix(runningWords)
WR <- tmp\$M
wordforms <- tmp\$rownames

# link the localSentenceIDs "S" to the globalSentenceIDs "U"
if (!is.null(globalSentenceID)) {
tmp <- jMatrix(localSentenceID, globalSentenceID)
US <- t(tmp\$M2) %*% tmp\$M1
RS <- RS %*% t(US)
}

# remove upper/lowercase distinction for better statistics
if (lowercase) {
tmp <- ttMatrix(tolower(wordforms))
wW <- tmp\$M
lower <- tmp\$rownames
}

# various versions of output

if (!lowercase) {
if (simplify) {
R <- (WR*1) %*% (RS*1)
rownames(R) <- wordforms
return(R)
} else {
return(
list(	runningWords = runningWords,
wordforms = wordforms,

RS = RS,	# Running words x global sentence ID ("Sentence")
WR = WR		# Wordforms x Running words
))
}
} else {
if (simplify) {
R <- (wW*1) %*% (WR*1) %*% (RS*1)
rownames(R) <- lower
return(R)
} else {
return(
list(	runningWords = runningWords,
wordforms = wordforms,
lowercase = lower,

RS = RS,	# Running words x global sentence ID ("Sentence")
WR = WR,	# Wordforms x Running words
wW = wW		# lowercased wordforms x uppercase Wordforms
))
}
}
}

# ========================================================
# convenience function specifically made for QLC-wordlists
# ========================================================

splitWordlist <- function(	data,
doculects = "DOCULECT",
concepts = "CONCEPT",
counterparts = "COUNTERPART",
splitstrings = TRUE,
sep =  "",
bigram.binder = "",
grapheme.binder = "_",
simplify = FALSE
) {

# just a placeholder, assuming this one does not occur in the data
binder <- "\u2295"

# make doculect+counterpart combinations
# this is needed because sometimes the same string is found in different languages
combined <- paste( data[,doculects], data[,counterparts], sep = binder)

# DL: Doculects x Lines of data
tmp <- ttMatrix(data[,doculects])
DL <- tmp\$M
doculects <- tmp\$rownames

# CL: Concepts x Lines of data
tmp <- ttMatrix(data[,concepts])
CL <- tmp\$M
concepts <- tmp\$rownames

# WL: Counterparts ("Words") x Lines of data
tmp <- ttMatrix(combined)
WL <- tmp\$M
words <- tmp\$rownames
# split counterparts again from doculects

DW <- DL %*% t(WL)
CW <- CL %*% t(WL)

if (splitstrings) {

# split strings
S <- splitStrings(words, sep = sep, bigram.binder = bigram.binder)

# return results
if (simplify) {

BW <- (S\$BS*1) %*% (S\$SW*1)

# only use column names once because of size
rownames(DW) <- doculects
rownames(CW) <- concepts
rownames(BW) <- S\$bigrams
colnames(BW) <- words

return(list(DW = DW, CW = CW, BW = BW))

} else {
# separate characters to languages
# and prepare full output (rather long!)

# link to segments to doculects
DS <- DW %*% t(S\$SW)

# Graphemes x Segments
tmp <- rKhatriRao(	DS, S\$US,
doculects, S\$unigrams,
binder = grapheme.binder
)
GS <- tmp\$M
graphemes <- tmp\$rownames

# another KhatriRao to turn bisymbols into
# language-specific Bigraphs x Segments
tmp <- rKhatriRao(	DS, S\$BS,
doculects, S\$bigrams,
binder = grapheme.binder
)
TS <- tmp\$M
digraphs <- tmp\$rownames

return(list(	doculects = doculects,
concepts = concepts,
words = words,

segments = S\$segments,
unigrams = S\$unigrams,
bigrams = S\$bigrams,
graphemes = graphemes,
digraphs = digraphs,

DW = DW, # Doculects x Words
CW = CW, # Concepts x Words

SW = S\$SW,	# Segments x Words
US = S\$US,	# Unigrams x Segments
BS = S\$BS,	# Bigrams x Segments
GS = GS, 	# Graphemes x Segments
TS = TS 	# Digraphs x Segments
))
}
} else {
# without splitString
# much quicker, but only returns the basic structure
if (simplify) {

rownames(DW) <- doculects
rownames(CW) <- concepts
colnames(CW) <- words

return(list(DW = DW, CW = CW))
} else {
return(list(	doculects = doculects,
concepts = concepts,
words = words,

DW = DW, # Doculects x Words
CW = CW  # Concepts x Words
))
}
}
}
```
cysouw/qlcMatrix documentation built on April 22, 2018, 4:59 a.m.