# set global chunk options
library(knitr); library(tm); library(qdap); #library(reports)
opts_chunk$set(cache=FALSE, tidy=FALSE, warning=FALSE)
opts_knit$set(upload.fun = image_uri, self.contained=TRUE)
#library(knitcitations); library(reports); library(qdapTools)
#bib <- read.bibtex(dir()[tools::file_ext(dir()) == "bib"][1])

# Helper files for the vignette.
# NOTE(review): HR, HR2, FT, CN, VS and HS are not defined in this chunk;
# presumably they come from these sourced files or an attached package -- confirm.
source("funs/utils_functions.txt")
source("funs/extra_functions.txt")

# Base URL that function names are appended to when building links.
# BU <- "http://trinker.github.io/qdap/"
BU <- "http://trinker.github.io/qdap_dev/" #switch before upload

# Build the URL "<base><fun>.html".
#   fun  : page/function name (character)
#   base : URL prefix; defaults to `BU`
LN <- function(fun, base=BU) paste0(base, fun, ".html")

# Hyperlink whose visible text is `fun` wrapped in <code> tags and whose
# target is the page for `fun2` (defaults to `fun`) under `base`.
FUN <- function(fun, fun2 = fun, base=BU) {
    HR2(LN(fun2, base), paste0("<code>", fun, "</code>"), copy2clip = FALSE)
}

# Same as BU/LN/FUN but rooted at the qdapDictionaries URL.
BU2 <- "http://trinker.github.io/qdapDictionaries/" #switch before upload
LN2 <- function(fun, base=BU2) paste0(base, fun, ".html")
FUN2 <- function(fun, base=BU2) {
    HR2(LN2(fun, base), paste0("<code>", fun, "</code>"), copy2clip = FALSE)
}

# Return an HTML anchor (opens in a new tab) showing a red "[YT]" tag that
# points at `URL`.
yt <- function(URL) {
    paste0("<a href=\"", URL, "\" target=\"_blank\" style=\"text-decoration: none\"><b><font size=\"5\" color=\"#B22222\">[YT]</font></b></a>\n")
}

#cite in text using `r citet(bib[1])`

# Print an <img> tag whose src is the data URI of the image at `path`.
#   add : extra attribute text inserted verbatim after "<img "
uri_embed <- function(path, add="") {
    uri <- knitr::image_uri(path)
    cat(paste0("<img ", add, " src=\"", uri, "\" />"))
}

# Vignette wrappers: forward all arguments to the wrapped helper with
# `copy2clip = FALSE`.
# NOTE: the default `text = text` refers to itself, so `text` must always be
# supplied by the caller.
FT_vign <- function(..., text=text) {FT(..., text=text, copy2clip = FALSE)}
HR_vign <- function(...) {HR(..., copy2clip = FALSE)}
HR2_vign <- function(...) {HR2(..., copy2clip = FALSE)}
CN_vign <- function(...) {CN(..., copy2clip = FALSE)}
# These two previously called themselves (unbounded recursion); they now wrap
# VS/HS in the same way the wrappers above wrap their helpers.
VS_vign <- function(...) {VS(..., copy2clip = FALSE)}
HS_vign <- function(...) {HS(..., copy2clip = FALSE)}
r format(Sys.time(), '%B %d, %Y')
qdap (Rinker, 2013) is an R package designed to assist in quantitative discourse analysis. The package stands as a bridge between qualitative transcripts of dialogue and statistical analysis and visualization. qdap was born out of a frustration with current discourse analysis programs. Packaged programs are a closed system, meaning the researcher using the method has little, if any, influence on the program applied to her data.
R already has thousands of excellent packages for statistics and visualization. qdap is designed to stand as a bridge between the qualitative discourse of a transcript and the computational power and freedom that R offers. As qdap returns the power to the researcher it will also allow the researcher to be more efficient and thus effective and productive in data analysis. The qdap package provides researchers with the tools to analyze data and more importantly is a dynamic system governed by the data, shaped by theory, and continuously refined by the field.
...if you can dream up an analysis then qdap and R can help get you there.
# Embed the qdap logo inline; the attributes are separated by spaces only
# (a comma between them is not valid HTML).
uri_embed("imgs/qdaplogo.png", "width=\"350\" height=\"250\" style=\"display:block; margin-left:auto; margin-right:auto;\"")
The following vignette is a loose chronological road map for utilizing the tools provided by qdap.
## library(qdap) ## dat <- data.frame( ## x = c("project", "import_export", "tools", "cleaning", "viewing", ## "reshaping", "word", "coding", "counts", "measures", "visualization", ## "id", "data", "dict", "install"), ## ## y = c("Starting a New Project", "Import/Export Discourse Data", ## "Generic qdap Tools", "Cleaning/Preparing the Data", "View the Data", ## "Reshaping the Data", "Extract/Analyze Words", "Qualitative Coding System", ## "Word Counts and Descriptive Statistics", "Word Measures and Scoring", ## "Visualizing Discourse Data", "ID Sentences", ## "Data Sets", "Dictionaries and Word Lists", "Installation Issues") ## ) ## ## FUN <- function(x, y) { ## cat("\n\n") ## m <- paste0("<div>", 1:length(x), ". `r HR(\"#", x, "\", \"", y, "\")` </div> ") ## cat(paste(m, collapse="\n")); cat("\n") ## cat("\n\n") ## n <- paste0("<h3 id=\"", x, "\">", y, "</h3>") ## cat(paste(n, collapse="\n")); cat("\n") ## } ## ## FUN(dat[, 1], dat[, 2]) ## ## path <- "C:/Users/trinker/GitHub/trinker.github.com/qdap_dev" ## # path <- "C:/Users/trinker/GitHub/trinker.github.com/qdap" ## URL <- "http://trinker.github.io/qdap_dev/" ## # url <- "http://trinker.github.io/qdap" ## ## inds <- readLines(file.path(path, "index.html")) ## h3s <- grep("<h3", inds) ## h2s <- grep("<h2", inds) ## ## inds <- inds[head(h3s, 1):(tail(h2s, 1) - 1)] ## inds <- inds[12: tail(grep("</ul>", inds), 1)] ## h3s <- grep("<h3", inds) ## dat2 <- data.frame(start = h3s + 4, end = c(tail(h3s, -1) - 1, length(inds))) ## ## inds <- substring(inds, 5) ## ## ## ## invisible(lapply(1:nrow(dat2), function(i) { ## rws <- inds[dat2[i, 1]:dat2[i, 2]] ## ## funs <- unlist(genXtract(rws, ".html\">", "</a>")) ## descripts <- unlist(genXtract(rws, "<br />", "</li>")) ## ## rws <- rws[grepl("<code>", rws)] ## rws <- paste0("<form action=\"", file.path(URL, paste0(funs, ".html")), " target=\"_blank\" \"> ## <input type=\"submit\" value=\"", funs, "\"> - ", descripts, "\n</form>", "\n") ## ## ## cat(paste0("============\nfun 
group", i, "\n============\n")) ## cat(paste0("The following functions will be utilized in this section (click to view more): \n\n")) ## cat(paste(rws, collapse = "\n")); cat("\n") ## }))
The function r FUN("new_project")
is designed to generate a project template of multiple nested directories that organize and guide the researcher through a qualitative study, from data collection to analysis and report/presentation generation. This workflow framework will enable the researcher to be better organized and more efficient in all stages of the research process. r FUN("new_project")
utilizes the r HR2("http://cran.r-project.org/web/packages/reports/reports.pdf", "reports package")
(Rinker, 2013b)
Please see the following links for PDF descriptions of the contents of the r FUN("new_project")
and the reports directory. r VS(2)
Project Workflow |
Report Workflow |
click here |
The r FUN("new_project")
template is designed to be utilized with r HR2("http://www.rstudio.com/ide/download/", "RStudio")
. Upon clicking the xxx.Rproj
file the template will be loaded into RStudio. The .Rprofile script will be sourced upon start up, allowing the user to automatically load packages, functions, etc. related to the project. The file extra_functions.R
is sourced, loading custom functions. Already included are two functions, email
and todo
, used to generate project member emails and track project tasks. This auto sourcing greatly enhances efficiency in workflow.
This subsection covers how to read in transcript data. Generally the researcher will have data stored as a .docx (Microsoft Word or Open/Libre Office) or .xlsx/.csv (spreadsheet format). It is of great importance that the researcher manually writes/parses their transcripts to avoid potential analysis problems later. All sentences should contain appropriate qdap punctuation (declarative = ., interrogative = ?, exclamatory = !, interrupted = | or r FUN("imperative")
= ., ?, !, |). Additionally, if a sentence contains an end mark/punctuation it should have accompanying text/dialogue. Two functions are useful for reading in data, r FUN("read.transcript")
and r FUN("dir_map")
. r FUN("read.transcript")
detects file type (.docx/.csv/.xlsx) and reads in a single transcript whereas r FUN("dir_map")
generates code that utilizes r FUN("read.transcript")
for each of the multiple transcripts in a single directory. Note that r FUN("read.transcript")
expects a two column formatted transcript (usually with person on the left and dialogue on the right).
Five arguments are of particular importance to read.transcript:
file |
The name of the file which the data are to be
read from. Each row of the table appears as one line of
the file. If it does not contain an absolute path, the
file name is relative to the current working directory,
|
col.names |
A character vector specifying the column names of the transcript columns. |
header |
logical. If |
sep |
The field separator character. Values on each
line of the file are separated by this character. The
default of |
skip |
Integer; the number of lines of the data file to skip before beginning to read data. |
Often transcripts contain extraneous material at the top and the argument r CN("skip = ?")
must be used to skip these extra lines. Some sort of unique separator must also be used to separate the person column from the text column. By default r CN('sep = ":"')
is assumed. If your transcripts do not contain a separator one must be inserted manually. Also note that the researcher may want to prepare the transcripts with brackets to denote non-spoken annotations as well as dialogue that is read rather than spoken. For more on bracket parsing see r HR("#bracket", "Bracket/General Chunk Extraction")
.
r FT(orange, 5, text="♦")
Reading In Data- read.transcript r FT(orange, 5, text="♦")
## Location of sample transcripts from the qdap package
(doc1 <- system.file("extdata/transcripts/trans1.docx", package = "qdap"))
(doc2 <- system.file("extdata/transcripts/trans2.docx", package = "qdap"))
(doc3 <- system.file("extdata/transcripts/trans3.docx", package = "qdap"))
(doc4 <- system.file("extdata/transcripts/trans4.xlsx", package = "qdap"))
dat1 <- read.transcript(doc1)
truncdf(dat1, 40)
## X1 X2
## 1 Researcher 2 October 7, 1892.
## 2 Teacher 4 Students it's time to learn. [Student di
## 3 Multiple Students Yes teacher we're ready to learn.
## 4 [Cross Talk 3 00]
## 5 Teacher 4 Let's read this terrific book together.
dat2 <- read.transcript(doc1, col.names = c("person", "dialogue"))
truncdf(dat2, 40)
## person dialogue
## 1 Researcher 2 October 7, 1892.
## 2 Teacher 4 Students it's time to learn. [Student di
## 3 Multiple Students Yes teacher we're ready to learn.
## 4 [Cross Talk 3 00]
## 5 Teacher 4 Let's read this terrific book together.
dat2b <- rm_row(dat2, "person", "[C") #remove bracket row
truncdf(dat2b, 40)
## person dialogue
## 1 Researcher 2 October 7, 1892.
## 2 Teacher 4 Students it's time to learn. [Student di
## 3 Multiple Students Yes teacher we're ready to learn.
## 4 Teacher 4 Let's read this terrific book together.
## Be aware of the need to `skip` non transcript lines
## Incorrect read; Needed to use `skip`
read.transcript(doc2)
Error in data.frame(X1 = speaker, X2 = pvalues, stringsAsFactors = FALSE) :
arguments imply differing number of rows: 7, 8
## Correct: Used `skip`
dat3 <- read.transcript(doc2, skip = 1)
truncdf(dat3, 40)
## X1 X2
## 1 Researcher 2 October 7, 1892.
## 2 Teacher 4 Students it's time to learn. [Student di
## 3 Multiple Students Yes teacher we're ready to learn.
## 4 [Cross Talk 3 00]
## 5 Teacher 4 Let's read this terrific book together.
## Be Aware of the `sep` Used
## Incorrect Read; Wrong `sep` Provided (used default `:`)
read.transcript(doc3, skip = 1)
##Dialogue and Person Columns Mixed Inappropriately
## X1
## 1 [Cross Talk 3
## X2
## 1 Teacher 4-Students it's time to learn. [Student discussion; unintelligible] Multiple Students-Yes teacher we're ready to learn. 00] Teacher 4-Let's read this terrific book together. It's called Moo Baa La La La and what was I going to ... Oh yes The story is by Sandra Boynton. A cow says Moo. A Sheep says Baa. Three singing pigs say LA LA LA! "No, no!" you say, that isn't right. The pigs say oink all day and night. Rhinoceroses snort and snuff. And little dogs go ruff ruff ruff! Some other dogs go bow wow wow! And cats and kittens say Meow! Quack! Says the duck. A horse says neigh. It's quiet now. What do you say?
## Correct `sep` Used
dat4 <- read.transcript(doc3, sep = "-", skip = 1)
truncdf(dat4, 40)
## X1 X2
## 1 Teacher 4 Students it's time to learn. [Student di
## 2 Multiple Students Yes teacher we're ready to learn. [Cross
## 3 Teacher 4 Let's read this terrific book together.
## Read In .xlsx Data
dat5 <- read.transcript(doc4)
truncdf(dat5, 40)
## V1 V2
## 1 Researcher 2: October 7, 1892.
## 2 Teacher 4: Students it's time to learn.
## 3
## 4 Multiple Students: Yes teacher we're ready to learn.
## 5
## 6 Teacher 4: Let's read this terrific book together.
## Reading In Text
trans <- "sam: Computer is fun. Not too fun.
greg: No it's not, it's dumb.
teacher: What should we do?
sam: You liar, it stinks!"
read.transcript(text=trans)
## V1 V2
## 1 sam Computer is fun. Not too fun.
## 2 greg No its not, its dumb.
## 3 teacher What should we do?
## 4 sam You liar, it stinks!
The r FUN("dir_map")
function enables the researcher to produce multiple lines of code, one line with r FUN("read.transcript")
for each file in a directory, which is then optionally copied to the clipboard for easy insertion into a script. Note that setting the argument r CN("use.path = FALSE")
may allow the code to be more portable in that a static path is not supplied to the r FUN("read.transcript")
scripts.
r FT(orange, 5, text="♦")
Reading In Data- dir_map r FT(orange, 5, text="♦")
(DIR <- system.file("extdata/transcripts", package = "qdap"))
dir_map(DIR)
...will produce...
dat1 <- read.transcript('~/extdata/transcripts/trans1.docx', col.names = c('person', 'dialogue'), skip = 0)
dat2 <- read.transcript('~/extdata/transcripts/trans2.docx', col.names = c('person', 'dialogue'), skip = 0)
dat3 <- read.transcript('~/extdata/transcripts/trans3.docx', col.names = c('person', 'dialogue'), skip = 0)
dat4 <- read.transcript('~/extdata/transcripts/trans4.xlsx', col.names = c('person', 'dialogue'), skip = 0)
The r CN("mcsv_x")
family of functions are utilized to read (r FUN("mcsv_r")
) and write (r FUN("mcsv_w")
) multiple csv files at once. r FUN("mcsv_w")
takes an arbitrary number of dataframes and outputs them to the supplied directory( r CN("dir = ?")
). An attempt will be made to output the dataframes from qdap functions that output lists of dataframes. Note that dataframes that contain columns that are lists must be condensed prior to writing with other R dataframe writing functions (e.g., write.csv
) using the r FUN("condense")
function. By default r FUN("mcsv_w")
attempts to utilize r FUN("condense")
.
The r FUN("mcsv_r")
function reads multiple files at once and then assigns the dataframes to identically named objects (minus the file extension) in the global environment. Additionally, all of the dataframes that are read in are also assigned to an inclusive list (named L1
by default).
r FT(orange, 5, text="♦")
Reading and Writing Multiple csvs r FT(orange, 5, text="♦")
## Make new minimal data sets mtcarsb <- mtcars[1:5, ]; CO2b <- CO2[1:5, ] ## Write multiple csvs and assign the directory path to `a` a <- mcsv_w(mtcarsb, CO2b, dir="foo") ## New data sets gone from .GlobalEnv rm("mtcarsb", "CO2b") ## View the files in `a` and assign to `nms` (nms <- dir(a)) ## Read in and notice the dataframes have been assigned in .GlobalEnv mcsv_r(file.path(a, nms)) mtcarsb; CO2b L1 ## The dataframe names and list of dataframe can be altered mcsv_r(file.path(a, nms), a.name = paste0("bot", 1:2), l.name = "bots_stink") bot1; bot2 bots_stink ## Clean up delete("foo")
r FT(orange, 5, text="♦")
Writing Lists of Dataframes to csvs r FT(orange, 5, text="♦")
## poldat and termco produce lists of dataframes poldat <- with(DATA, polarity(state, person)) term <- c("the ", "she", " wh") termdat <- with(raj.act.1, termco(dialogue, person, term)) ## View the lists of dataframes str(poldat); str(termdat) ## Write the lists of dataframes to csv mcsv_w(poldat, termdat, mtcars, CO2, dir="foo2") ## Clean up delete("foo2")
The nature of dialogue data makes it large and cumbersome to view in R. This section explores qdap tools designed for more comfortable viewing of R dialogue oriented text dataframes.
The _truncdf
family of functions (trunc + dataframe = r FUN("truncdf", "data_viewing")
) are designed to truncate the width of columns and number of rows in dataframes and lists of dataframes. The r CN("l")
and r CN("h")
in front of r CN("trunc")
stand for list and head and are extensions of r FUN("truncdf", "data_viewing")
. r FUN("qview", "data_viewing")
is a wrapper for r FUN("htruncdf", "data_viewing")
that also displays number of rows, columns, and the dataframe name.
r FT(orange, 5, text="♦")
Truncated Data Viewing r FT(orange, 5, text="♦")
truncdf(raj[1:10, ]) truncdf(raj[1:10, ], 40) htruncdf(raj) htruncdf(raj, 20) htruncdf(raj, ,20) ltruncdf(rajPOS, width = 4)
qview(raj)
## ========================================================================
## nrow = 840 ncol = 3 raj
## ========================================================================
## person dialogue act
## 1 Sampson Gregory, o 1
## 2 Gregory No, for th 1
## 3 Sampson I mean, an 1
## 4 Gregory Ay, while 1
## 5 Sampson I strike q 1
## 6 Gregory But thou a 1
## 7 Sampson A dog of t 1
## 8 Gregory To move is 1
## 9 Sampson A dog of t 1
## 10 Gregory That shows 1
qview(CO2)
## ========================================================================
## nrow = 84 ncol = 5 CO2
## ========================================================================
## Plant Type Treatment conc uptake
## 1 Qn1 Quebec nonchilled 95 16
## 2 Qn1 Quebec nonchilled 175 30.4
## 3 Qn1 Quebec nonchilled 250 34.8
## 4 Qn1 Quebec nonchilled 350 37.2
## 5 Qn1 Quebec nonchilled 500 35.3
## 6 Qn1 Quebec nonchilled 675 39.2
## 7 Qn1 Quebec nonchilled 1000 39.7
## 8 Qn2 Quebec nonchilled 95 13.6
## 9 Qn2 Quebec nonchilled 175 27.3
## 10 Qn2 Quebec nonchilled 250 37.1
Many qdap objects are lists that print as a single dataframe, though the rest of the objects in the list are available. The r FUN("lview", "data_viewing")
function unclasses the object and assigns "list".
lview(question_type(DATA.SPLIT$state, DATA.SPLIT$person))
By default text data (character vectors) are displayed as right justified in R. This can be difficult and unnatural to read, particularly as the length of the sentences increases. The r FUN("left_just")
function creates a more natural left justification of text. Note that r FUN("left_just")
inserts spaces to achieve the justification. This could interfere with analysis and therefore the output from r FUN("left_just")
should only be used for visualization purposes, not analysis.
r FT(orange, 5, text="♦")
Justified Data Viewing r FT(orange, 5, text="♦")
## The unnatural state of R text data DATA ## left just to the rescue left_just(DATA) ## Left just select column(s) left_just(DATA, c("sex", "state")) left_just(CO2[1:15,]) right_just(left_just(CO2[1:15,]))
A task of many analyses is to search a dataframe for a particular phrase and return those rows/observations that contain that term. The researcher may optionally choose to specify a particular column to search (r CN("column.name")
) or search the entire dataframe.
r FT(orange, 5, text="♦")
Search Dataframes r FT(orange, 5, text="♦")
(SampDF <- data.frame("islands"=names(islands)[1:32],mtcars, row.names=NULL)) Search(SampDF, "Cuba", "islands") Search(SampDF, "New", "islands") Search(SampDF, "Ho") Search(SampDF, "Ho", max.distance = 0) Search(SampDF, "Axel Heiberg") Search(SampDF, 19) #too much tolerance in max.distance Search(SampDF, 19, max.distance = 0) Search(SampDF, 19, "qsec", max.distance = 0)
This manual arranges functions into categories in the order a researcher is likely to use them. The Generic qdap Tools section does not fit this convention; however, because these tools may be used throughout all stages of analysis, it is important that the reader is familiar with them. It is important to note that after reading in transcript data the researcher will likely find that the next step is to parse the dataframe utilizing the techniques found in the r HR("#cleaning", "Cleaning/Preparing the Data")
section.
Often it can be tedious to supply quotes to character vectors when dealing with large vectors. r FUN("qcv")
replaces the typical r CN('c("A", "B", "C", "...")')
approach to creating character vectors. Instead the user supplies r CN("qcv(A, B, C, ...)")
. This format assumes single words separated by commas. If your data/string does not fit this approach the combined terms
and split
arguments can be utilized.
r FT(orange, 5, text="♦")
Quick Character Vector r FT(orange, 5, text="♦")
qcv(I, like, dogs) qcv(terms = "I like, big dogs", split = ",") qcv(I, like, dogs, space.wrap = TRUE) qcv(I, like, dogs, trailing = TRUE) qcv(I, like, dogs, leading = TRUE) qcv(terms = "mpg cyl disp hp drat wt qsec vs am gear carb")
Often the researcher who deals with text data will have the need to lookup values quickly and return an accompanying value. This is often called a dictionary, hash, or lookup. This can be used to find corresponding values or recode variables etc. The r FUN("lookup")
& r HR2("http://trinker.github.io/qdap_dev/lookup.html", "%l%")
functions provide a fast environment lookup for single usage. The r FUN("hash")
& r HR2("http://trinker.github.io/qdap_dev/hash.html", "hash_look")
/r HR2("http://trinker.github.io/qdap_dev/hash.html", "%hl%")
functions provide a fast environment lookup for multiple uses of the same hash table.
r FT(orange, 5, text="♦")
r FUN("lookup")
- Dictionary/Look Up Examples r FT(orange, 5, text="♦")
lookup(1:5, data.frame(1:4, 11:14)) lookup(LETTERS[1:5], data.frame(LETTERS[1:4], 11:14), missing = NULL) lookup(LETTERS[1:5], data.frame(LETTERS[1:5], 100:104))
## Fast with very large vectors
key <- data.frame(x=1:2, y=c("A", "B"))
set.seed(10)
## `replace = TRUE` spelled out: `T` is an ordinary variable that can be reassigned
big.vec <- sample(1:2, 3000000, replace = TRUE)
out <- lookup(big.vec, key)
out[1:20]
## [1] "B" "A" "A" "B" "A" "A" "A" "A" "B" "A" "B" "B" "A"
## [14] "B" "A" "A" "A" "A" "A" "B"
## Supply a named list of vectors to key.match codes <- list(A=c(1, 2, 4), B = c(3, 5), C = 7, D = c(6, 8:10) ) lookup(1:10, codes) #or 1:10 %l% codes
## Supply a single vector to key.match and key.assign lookup(mtcars$carb, sort(unique(mtcars$carb)), c('one', 'two', 'three', 'four', 'six', 'eight')) lookup(mtcars$carb, sort(unique(mtcars$carb)), seq(10, 60, by=10))
r FT(orange, 5, text="♦")
r FUN("hash")
/r FUN("hash_look")
- Dictionary/Look Up Examples r FT(orange, 5, text="♦")
## Create a fake data set of hash values (DF <- aggregate(mpg~as.character(carb), mtcars, mean)) ## Use `hash` to create a lookup environment hashTab <- hash(DF) ## Create a vector to lookup x <- sample(DF[, 1], 20, TRUE) ## Lookup x in the hash with `hash_look` or `%hl%` hash_look(x, hashTab) x %hl% hashTab
Researchers dealing with transcripts may have the need to convert between traditional Hours:Minutes:Seconds format and seconds. The r FUN("hms2sec")
and r FUN("sec2hms")
functions offer this type of time conversion.
r FT(orange, 5, text="♦")
Time Conversion Examples r FT(orange, 5, text="♦")
hms2sec(c("02:00:03", "04:03:01")) hms2sec(sec2hms(c(222, 1234, 55))) sec2hms(c(256, 3456, 56565))
r FUN("url_dl")
is a function used to provide qdap users with examples taken from the Internet. It is useful for most document downloads from the Internet.
r FT(orange, 5, text="♦")
url_dl Examples r FT(orange, 5, text="♦")
## Example 1 (download from dropbox)
# download transcript of the debate to working directory
url_dl(pres.deb1.docx, pres.deb2.docx, pres.deb3.docx)
# load multiple files with read transcript and assign to working directory
dat1 <- read.transcript("pres.deb1.docx", c("person", "dialogue"))
dat2 <- read.transcript("pres.deb2.docx", c("person", "dialogue"))
dat3 <- read.transcript("pres.deb3.docx", c("person", "dialogue"))
docs <- qcv(pres.deb1.docx, pres.deb2.docx, pres.deb3.docx)
dir() %in% docs
delete(docs) #remove the documents
dir() %in% docs
## Example 2 (quoted string urls)
url_dl("https://dl.dropboxusercontent.com/u/61803503/qdap.pdf",
    "http://www.cran.r-project.org/doc/manuals/R-intro.pdf")
## Clean up
## (file names are quoted: unquoted `R-intro.pdf` parses as `R - intro.pdf`)
delete(c("qdap.pdf", "R-intro.pdf"))
After reading in the data the researcher may want to remove all non-dialogue text from the transcript dataframe such as transcriber annotations. This can be accomplished with the r FUN("bracketX")
family of functions, which removes text found between two brackets (r CN("( )")
, r CN("{ }")
, r CN("[ ]")
, r CN("< >")
) or more generally using r FUN("genX")
and r FUN("genXtract")
to remove text between two character reference points.
If the bracketed text is useful to analysis it is recommended that the researcher assigns the un-bracketed text to a new column.
r FT(orange, 5, text="♦")
Extracting Chunks 1- bracketX/bracketXtract r FT(orange, 5, text="♦")
## A fake data set examp <- structure(list(person = structure(c(1L, 2L, 1L, 3L), .Label = c("bob", "greg", "sue"), class = "factor"), text = c("I love chicken [unintelligible]!", "Me too! (laughter) It's so good.[interrupting]", "Yep it's awesome {reading}.", "Agreed. {is so much fun}")), .Names = c("person", "text"), row.names = c(NA, -4L), class = "data.frame") examp bracketX(examp$text, "square") bracketX(examp$text, "curly") bracketX(examp$text, c("square", "round")) bracketX(examp$text) bracketXtract(examp$text, "square") bracketXtract(examp$text, "curly") bracketXtract(examp$text, c("square", "round")) bracketXtract(examp$text, c("square", "round"), merge = FALSE) bracketXtract(examp$text) bracketXtract(examp$text, with = TRUE)
Often a researcher will want to extract some text from the transcript and put it back together. One example is the reconstructing of material read from a book, poem, play or other text. This information is generally dispersed throughout the dialogue (within classroom/teaching procedures). If this text is denoted with a particular identifying bracket such as curly braces this text can be extracted and then pasted back together.
r FT(orange, 5, text="♦")
Extracting Chunks 2- Recombining Chunks r FT(orange, 5, text="♦")
paste2(bracketXtract(examp$text, "curly"), " ")
The researcher may need a more general extraction method that allows for any left/right boundaries to be specified. This is useful in that many qualitative transcription/coding programs have specific syntax for various dialogue markup for events that must be parsed from the data set. The r FUN("genX")
and r FUN("genXtract")
functions have such capabilities.
r FT(orange, 5, text="♦")
Extracting Chunks 3- genX/genXtract r FT(orange, 5, text="♦")
DATA$state ## Look at the difference in number 1 and 10 from above genX(DATA$state, c("is", "we"), c("too", "on")) ## A fake data set x <- c("Where is the /big dog#?", "I think he's @arunning@b with /little cat#.") x genXtract(x, c("/", "@a"), c("#", "@b")) ## A fake data set x2 <- c("Where is the L1big dogL2?", "I think he's 98running99 with L1little catL2.") x2 genXtract(x2, c("L1", 98), c("L2", 99))
After reading in data, removing non-dialogue (via r FUN("bracketX")
), and viewing it the researcher will want to find text rows that do not contain proper punctuation and/or that contain punctuation and no text. This is accomplished with the _truncdf
family of functions and r FUN("potential_NA")
functions as the researcher manually parses the original transcripts, makes alterations and re-reads the data back into qdap. This important procedure is not an automatic process, requiring that the researcher give attention to detail in comparing the R dataframe with the original transcript.
r FT(orange, 5, text="♦")
Identifying and Coding Missing Values r FT(orange, 5, text="♦")
## Create A Data Set With Punctuation and No Text
(DATA$state[c(3, 7, 10)] <- c(".", ".", NA))
DATA
potential_NA(DATA$state, 20)
potential_NA(DATA$state)
## Use To Selectively Replace Cells With Missing Values
DATA$state[potential_NA(DATA$state, 20)$row[-c(3)]] <- NA
DATA
## Reset DATA
DATA <- qdap::DATA
The researcher may wish to remove empty rows (using r FUN("rm_empty_row")
) and/or rows that contain certain markers (using r FUN("rm_row")
). Sometimes empty rows are read into the dataframe from the transcript. These rows should be completely removed from the data set rather than denoting with NA
. The r FUN("rm_empty_row")
removes completely empty rows (those rows with only 1 or more blank spaces) from the dataframe.
r FT(orange, 5, text="♦")
Remove Empty Rows r FT(orange, 5, text="♦")
(dat <- rbind.data.frame(DATA[, c(1, 4)], matrix(rep(" ", 4), ncol =2, dimnames=list(12:13, colnames(DATA)[c(1, 4)])))) rm_empty_row(dat)
Other times the researcher may wish to use r FUN("rm_row")
to remove rows from the dataframe/analysis based on transcription conventions or to remove demographic characteristics. For example, in the example below the transcript is read in with [Cross Talk 3. This is a transcription convention and we would want to parse these rows from the transcript. A second example shows the removal of people from the dataframe.
r FT(orange, 5, text="♦")
Remove Selected Rows r FT(orange, 5, text="♦")
## Read in transcript
dat2 <- read.transcript(system.file("extdata/transcripts/trans1.docx",
package = "qdap"))
truncdf(dat2, 40)
## X1 X2
## 1 Researcher 2 October 7, 1892.
## 2 Teacher 4 Students it's time to learn. [Student di
## 3 Multiple Students Yes teacher we're ready to learn.
## 4 [Cross Talk 3 00]
## 5 Teacher 4 Let's read this terrific book together.
## Use column names to remove rows
truncdf(rm_row(dat2, "X1", "[C"), 40)
## X1 X2
## 1 Researcher 2 October 7, 1892.
## 2 Teacher 4 Students it's time to learn. [Student di
## 3 Multiple Students Yes teacher we're ready to learn.
## 4 Teacher 4 Let's read this terrific book together.
## Use column numbers to remove rows
## (column 1 holds the "[Cross Talk 3" marker, so that is the column to search)
truncdf(rm_row(dat2, 1, "[C"), 40)
## X1 X2
## 1 Researcher 2 October 7, 1892.
## 2 Teacher 4 Students it's time to learn. [Student di
## 3 Multiple Students Yes teacher we're ready to learn.
## 4 Teacher 4 Let's read this terrific book together.
## Also remove people etc. from the analysis
rm_row(DATA, 1, c("sam", "greg"))
## person sex adult state code
## 1 teacher m 1 What should we do? K3
## 2 sally f 0 How can we be certain? K6
## 3 sally f 0 What are you talking about? K9
## 4 researcher f 1 Shall we move on? Good then. K10
An important step in the cleaning process is the removal of extra white spaces (use r FUN("Trim")
) and r HR2("http://stat.ethz.ch/R-manual/R-devel/library/base/html/Quotes.html", "escaped characters")
(use r FUN("clean")
). The r FUN("scrubber")
function wraps both r FUN("Trim")
and r FUN("clean")
and adds in the functionality of some of the r CN("replace_")
family of functions.
r FT(orange, 5, text="♦")
Remove Extra Spaces and Escaped Characters r FT(orange, 5, text="♦")
x1 <- "I go \r to the \tnext line" x1 clean(x1) x2 <- c(" talkstats.com ", " really? ", " yeah") x2 Trim(x2) x3 <- c("I like 456 dogs\t , don't you?\"") x3 scrubber(x3) scrubber(x3, TRUE)
The replacement family of functions replace various text elements within the transcripts with alphabetic versions that are more suited to analysis. These alterations may affect word counts and other alphabetic dependent forms of analysis.
The r FUN("replace_abbreviation")
replaces standard abbreviations that utilize periods with forms that do not rely on periods. This is necessary in that many sentence specific functions (e.g., r FUN("sentSplit")
and r FUN("word_stats")
) rely on period usage acting as sentence end marks. The researcher may augment the standard r FUN2("abbreviations")
dictionary from qdapDictionaries with field specific abbreviations.
r FT(orange, 5, text="♦")
Replace Abbreviations r FT(orange, 5, text="♦")
## Use the standard abbreviations dictionary
x <- c("Mr. Jones is here at 7:30 p.m.",
    "Check it out at www.github.com/trinker/qdap",
    "i.e. He's a sr. dr.; the best in 2012 A.D.",
    "the robot at t.s. is 10ft. 3in.")
x
replace_abbreviation(x)
## Augment the standard dictionary with replacement vectors
abv <- c("in.", "ft.", "t.s.")
repl <- c("inch", "feet", "talkstats")
replace_abbreviation(x, abv, repl)
## Augment the standard dictionary with a replacement dataframe
(KEY <- rbind(abbreviations, data.frame(abv = abv, rep = repl)))
replace_abbreviation(x, KEY)
The r FUN("replace_contraction")
replaces contractions with equivalent multi-word forms. This is useful for some word/sentence statistics. The researcher may augment the r FUN2("contractions")
dictionary supplied by qdapDictionaries, however, the word list is exhaustive.
r FT(orange, 5, text="♦")
Replace Contractions r FT(orange, 5, text="♦")
x <- c("Mr. Jones isn't going.", "Check it out what's going on.", "He's here but didn't go.", "the robot at t.s. wasn't nice", "he'd like it if i'd go away") x replace_contraction(x)
The r FUN("replace_number")
function utilizes the work of John Fox (2005) to turn numeric representations of numbers into their textual equivalents. This is useful for word statistics that require the text version of dialogue.
r FT(orange, 5, text="♦")
Replace Numbers-Numeral Representationr FT(orange, 5, text="♦")
x <- c("I like 346457 ice cream cones.", "They are 99 percent good") replace_number(x) ## Replace numbers that contain commas as well y <- c("I like 346,457 ice cream cones.", "They are 99 percent good") replace_number(y) ## Combine numbers as one word/string replace_number(x, FALSE)
The r FUN("replace_symbol")
converts ($) to "dollar", (%) to "percent", (#) to "number", (@) to "at", (&) to "and", (w/) to "with". Additional substitutions can be undertaken with the r FUN("multigsub")
function.
r FT(orange, 5, text="♦")
Replace Symbolsr FT(orange, 5, text="♦")
x <- c("I am @ Jon's & Jim's w/ Marry", "I owe $41 for food", "two is 10% of a #") x replace_symbol(x) replace_number(replace_symbol(x))
The r FUN("qprep")
function is a wrapper for several other replacement family functions that allows for speedier cleaning of the text. This approach, while speedy, reduces the flexibility and care that is undertaken by the researcher when the individual replacement functions are utilized. The function is intended for analysis that requires less care.
r FT(orange, 5, text="♦")
General Replacement (Quick Preparation)r FT(orange, 5, text="♦")
x <- "I like 60 (laughter) #d-bot and $6 @ the store w/o 8p.m." x qprep(x)
Many qdap functions break sentences up into words based on the spaces between words. Often the researcher will want to keep a group of words as a single unit. The r FUN("space_fill")
allows the researcher to replace spaces between selected phrases with ~~. By default ~~ is recognized by many qdap functions as a space separator.
r FT(orange, 5, text="♦")
Space Fill Examplesr FT(orange, 5, text="♦")
## Fake Data x <- c("I want to hear the Dr. Martin Luther King Jr. speech.", "I also want to go to the white House to see President Obama speak.") x ## Words to keep as a single unit keeps <- c("Dr. Martin Luther King Jr.", "The White House", "President Obama") text <- space_fill(x, keeps) text ## strip Example strip(text, lower=FALSE) ## bag_o_words Example bag_o_words(text, lower=FALSE) ## wfm Example wfm(text, c("greg", "bob")) ## trans_cloud Example obs <- strip(space_fill(keeps, keeps), lower=FALSE) trans_cloud(text, c("greg", "bob"), target.words=list(obs), caps.list=obs, cloud.colors=qcv(red, gray65), expand.target = FALSE, title.padj = .7, legend = c("space_filled", "other"), title.cex = 2, title.color = "blue", max.word.size = 3)
The researcher may have the need to make multiple substitutions in a text. An example of when this is needed is when a transcript is marked up with transcription coding convention specific to a particular transcription method. These codes, while useful in some contexts, may lead to inaccurate word statistics. The base R function r HR2("http://stat.ethz.ch/R-manual/R-devel/library/base/html/grep.html", "gsub")
makes a single replacement of these types of coding conventions. The r FUN("multigsub")
(alias r FUN("mgsub")
) takes a vector of patterns to search for as well as a vector of replacements. Note that the replacements occur sequentially rather than all at once. This means a previous (first in pattern string) sub could alter or be altered by a later sub. r FUN("mgsub")
is useful throughout multiple stages of the research process.
r FT(orange, 5, text="♦")
Multiple Substitutionsr FT(orange, 5, text="♦")
left_just(DATA[, c(1, 4)]) multigsub(c("it's", "I'm"), c("it is", "I am"), DATA$state) mgsub(c("it's", "I'm"), c("it is", "I am"), DATA$state) mgsub(c("it's", "I'm"), "SINGLE REPLACEMENT", DATA$state) mgsub("[[:punct:]]", "PUNC", DATA$state, fixed = FALSE) ## Iterative "I'm" converts to "I am" which converts to "ITERATIVE" mgsub(c("it's", "I'm", "I am"), c("it is", "I am", "ITERATIVE"), DATA$state)
A researcher may face a list of names and be uncertain about gender of the participants. The r FUN("name2sex")
function utilizes the gender package to predict gender from names based on Social Security Administration data, defaulting to the period from 1932-2012.
r FT(orange, 5, text="♦")
Name to Gender Predictionr FT(orange, 5, text="♦")
name2sex(qcv(mary, jenn, linda, JAME, GABRIEL, OLIVA, tyler, jamie, JAMES,
tyrone, cheryl, drew))
[1] F F F M M F M F M M F M
Levels: F M
During the initial cleaning stage of analysis the researcher may choose to create a stemmed version of the dialogue, that is words are reduced to their root words. The r FUN("stemmer")
family of functions allow the researcher to create stemmed text. The r FUN("stem2df")
function wraps r FUN("stemmer")
to quickly create a dataframe with the stemmed column added.
r FT(orange, 5, text="♦")
Stemmingr FT(orange, 5, text="♦")
## stem2df EXAMPLE: (stemdat <- stem2df(DATA, "state", "new")) with(stemdat, trans_cloud(new, sex, title.cex = 2.5, title.color = "blue", max.word.size = 5, title.padj = .7)) ## stemmer EXAMPLE: stemmer(DATA$state) ## stem_words EXAMPLE: stem_words(doggies, jumping, swims)
At times it is handy to be able to grab from the beginning or end of a string to a specific character. The r FUN("beg2char")
function allows you to grab from the beginning of a string to the nth occurrence of a character. The counterpart function, r FUN("char2end")
, grabs from the nth occurrence of a character to the end of a string. This behavior is useful if the transcript contains annotations at the beginning or end of a line that should be eliminated.
r FT(orange, 5, text="♦")
Grab From Character to Beginning/End of Stringr FT(orange, 5, text="♦")
x <- c("a_b_c_d", "1_2_3_4", "<_?_._:") beg2char(x, "_") beg2char(x, "_", 4) char2end(x, "_") char2end(x, "_", 2) char2end(x, "_", 3, include=TRUE) (x2 <- gsub("_", " ", x)) beg2char(x2, " ", 2) (x3 <- gsub("_", "\\^", x)) char2end(x3, "^", 2)
Often incomplete sentences have a different function than complete sentences. The researcher may want to denote incomplete sentences for consideration in later analysis. Traditionally, incomplete sentences are denoted with the following end marks (.., ..., .?, ..?, en & em). The r FUN("incomplete_replace")
can identify and replace the traditional end marks with a standard form r FT(blue, text="\"|\"")
.
r FT(orange, 5, text="♦")
Incomplete Sentence Identificationr FT(orange, 5, text="♦")
x <- c("the...", "I.?", "you.", "threw..", "we?") incomplete_replace(x) incomp(x) incomp(x, scan.mode = TRUE)
The r FUN("capitalizer")
function allows the researcher to specify words within a vector to be capitalized. By default r FT(blue, text="I")
, and contractions containing r FT(blue, text="I")
, are capitalized. Additional words can be specified through the r CN("caps.list")
argument. To capitalize words within strings the r FUN("mgsub")
can be used.
r FT(orange, 5, text="♦")
Word Capitalizationr FT(orange, 5, text="♦")
capitalizer(bag_o_words("i like it but i'm not certain"), "like") capitalizer(bag_o_words("i like it but i'm not certain"), "like", FALSE)
Many functions in the qdap package require that the dialogue is broken apart into individual sentences; failure to do so may invalidate many of the outputs from the analysis and will lead to warnings. After reading in and cleaning the data the next step should be to split the text variable into individual sentences. The r FUN("sentSplit")
function outputs a dataframe with the text variable split into individual sentences and repeats the demographic variables as necessary. Additionally, a turn of talk (r FT(red, text="tot column")
) variable is added that keeps track of the original turn of talk (row number) and the sentence number per turn of talk. The researcher may also want to create a second text column that has been stemmed for future analysis by setting r CN("stem.col = TRUE")
, though this is more time intensive.
r FT(orange, 5, text="♦")
r FUN("sentSplit")
Exampler FT(orange, 5, text="♦")
sentSplit(DATA, "state") sentSplit(DATA, "state", stem.col = TRUE) sentSplit(raj, "dialogue")[1:11, ]
r FT(orange, 5, text="♦")
r FUN("sentSplit")
- plot Methodr FT(orange, 5, text="♦")
plot(sentSplit(DATA, "state"), grouping.var = "person") plot(sentSplit(DATA, "state"), grouping.var = "sex")
r FT(orange, 5, text="♦")
r FUN("TOT", "sentSplit")
Example r FT(orange, 5, text="♦")
## Convert tot column with sub sentences to turns of talk dat <- sentSplit(DATA, "state") TOT(dat$tot)
Within dialogue (particularly classroom dialogue) several speakers may say the same speech at the same time. The transcripts may lump this speech together in the form of:
Person | Dialogue |
John, Josh & Imani `r HS(8)` | Yes Mrs. Smith. `r HS(8)` |
The r FUN("speakerSplit")
function attributes this text to each of the people as separate entries. The default behavior is to search for the person separators of sep = c("and", "&", ","), though other separators may be specified.
r FT(orange, 5, text="♦")
Break and Stretch if Multiple Persons per Cellr FT(orange, 5, text="♦")
## Create data set with multiple speakers per turn of talk DATA$person <- as.character(DATA$person) DATA$person[c(1, 4, 6)] <- c("greg, sally, & sam", "greg, sally", "sam and sally") speakerSplit(DATA) ## Change the separator DATA$person[c(1, 4, 6)] <- c("greg_sally_sam", "greg.sally", "sam; sally") speakerSplit(DATA, sep = c(".", "_", ";")) ## Reset DATA DATA <- qdap::DATA
The r FUN("sentCombine")
function is the opposite of the r FUN("sentSplit")
, combining sentences into a single turn of talk per grouping variable.
r FT(orange, 5, text="♦")
Sentence Combiningr FT(orange, 5, text="♦")
dat <- sentSplit(DATA, "state") ## Combine by person sentCombine(dat$state, dat$person) ## Combine by sex truncdf(sentCombine(dat$state, dat$sex), 65)
It is more efficient to maintain a dialogue dataframe (consisting of a column for people and a column for dialogue) and a separate demographics dataframe (a person column and demographic column(s)) and then merge the two during analysis. The r FUN("key_merge")
function is a wrapper for the r HR2("http://stat.ethz.ch/R-manual/R-devel/library/base/html/merge.html", "merge")
function from R's base install that merges the dialogue and demographics dataframe. r FUN("key_merge")
attempts to guess the person column and outputs a qdap friendly dataframe.
r FT(orange, 5, text="♦")
Merging Demographic Informationr FT(orange, 5, text="♦")
## A dialogue dataframe and a demographics dataframe ltruncdf(list(dialogue=raj, demographics=raj.demographics), 10, 50) ## Merge the two merged.raj <- key_merge(raj, raj.demographics) htruncdf(merged.raj, 10, 40)
Many functions in qdap utilize the r FUN("paste2")
function, which pastes multiple columns/lists of vectors. r FUN("paste2")
differs from base R's r HR2("http://stat.ethz.ch/R-manual/R-devel/library/base/html/paste.html", "paste")
function in that r FUN("paste2")
can paste unspecified columns or a list of vectors together. The r FUN("colpaste2df", "paste2")
function, a wrapper for r FUN("paste2")
, pastes multiple columns together and outputs an appropriately named dataframe. The r FUN("colsplit2df")
and r FUN("lcolsplit2df", "colsplit2df")
are useful because they can split the output from qdap functions that contain dataframes with pasted columns.
r FT(orange, 5, text="♦")
Using r FUN("paste2")
and r FUN("colSplit")
: Pasting & Splitting Vectors and Dataframesr FT(orange, 5, text="♦")
## Pasting a list of vectors paste2(rep(list(state.abb[1:8], month.abb[1:8]) , 2), sep = "|_|") ## Pasting a dataframe foo1 <- paste2(CO2[, 1:3]) head(foo1, 12) ## Splitting a pasted column bar1 <- colSplit(foo1) head(bar1, 10)
r FT(orange, 5, text="♦")
r FUN("colpaste2df")
& r FUN("colsplit2df")
: Splitting Columns in Dataframesr FT(orange, 5, text="♦")
## Create a dataset with a pasted column (dat <- colpaste2df(head(CO2), 1:3, keep.orig = FALSE)[, c(3, 1:2)]) ## Split column colsplit2df(dat) ## Specify names colsplit2df(dat, new.names = qcv(A, B, C)) ## Keep the original pasted column colsplit2df(dat, new.names = qcv(A, B, C), keep.orig = TRUE) ## Pasting columns and output a dataframe colpaste2df(head(mtcars)[, 1:5], qcv(mpg, cyl, disp), sep ="_", name.sep = "|") colpaste2df(head(CO2)[, -3], list(1:2, qcv("conc", "uptake")))
r FT(orange, 5, text="♦")
r FUN("lcolsplit2df")
: Splitting Columns in Lists of Dataframesr FT(orange, 5, text="♦")
## A list with dataframes that contain pasted columns x <- question_type(DATA.SPLIT$state, list(DATA.SPLIT$sex, DATA.SPLIT$adult)) ltruncdf(x[1:4]) z <- lcolsplit2df(x) ltruncdf(z[1:4])
Often a researcher will want to view the patterns of the discourse by grouping variables over time. This requires the data to have start and end times based on units (sentence, turn of talk, or word). The r FUN("gantt")
function provides the user with unit spans (start and end times) with the r FUN("gantt_rep")
extending this capability to repeated measures. The r FUN("gantt")
function has a basic plotting method to allow visualization of the unit span data, however, the r FUN("gantt_wrap")
function extends the r FUN("gantt")
and r FUN("gantt_rep")
functions to plot precise depictions (Gantt plots) of the unit span data. Note that if the researcher is only interested in plotting the data as a Gantt plot, the r FUN("gantt_plot")
function combines the r FUN("gantt")
/r FUN("gantt_rep")
functions with the r FUN("gantt_wrap")
function.
r FT(orange, 5, text="♦")
Unit Spansr FT(orange, 5, text="♦")
## Unit Span Dataframe dat <- gantt(mraja1$dialogue, mraja1$person) head(dat, 12) plot(dat) plot(dat, base = TRUE)
r FT(orange, 5, text="♦")
Repeated Measures Unit Spansr FT(orange, 5, text="♦")
## Repeated Measures Unit Span Dataframe dat2 <- with(rajSPLIT, gantt_rep(act, dialogue, list(fam.aff, sex))) head(dat2, 12) ## Plotting Repeated Measures Unit Span Dataframe plot(dat2) gantt_wrap(dat2, "fam.aff_sex", facet.vars = "act", title = "Repeated Measures Gantt Plot")
It is useful to convert data to an adjacency matrix for examining relationships between grouping variables in word usage. The r FUN("adjacency_matrix")
(aka: r FUN("adjmat")
) provide this capability, interacting with a r FUN("termco")
or r FUN("wfm", "Word_Frequency_Matrix")
object. In the first example below Sam and Greg share 4 words in common, whereas, the Teacher and Greg share no words. The adjacency matrix can be passed to a network graphing package such as the r HR2("http://igraph.sourceforge.net/", "igraph")
package for visualization of the data structure as seen in Example 3.
r FT(orange, 5, text="♦")
Adjacency Matrix: Example 1r FT(orange, 5, text="♦")
adjacency_matrix(wfm(DATA$state, DATA$person))
## Adjacency Matrix:
##
## greg researcher sally sam
## researcher 0
## sally 1 1
## sam 4 0 1
## teacher 0 1 2 0
##
##
## Summed occurrences:
##
## greg researcher sally sam teacher
## 18 6 10 11 4
r FT(orange, 5, text="♦")
Adjacency Matrix: Example 2r FT(orange, 5, text="♦")
words <- c(" education", " war ", " econom", " job", "governor ") (terms <- with(pres_debates2012, termco(dialogue, person, words))) adjmat(terms)
## Adjacency Matrix:
##
## OBAMA ROMNEY CROWLEY LEHRER QUESTION
## ROMNEY 5
## CROWLEY 2 2
## LEHRER 4 4 2
## QUESTION 4 4 2 4
## SCHIEFFER 2 2 1 1 1
##
##
## Summed occurrences:
##
## OBAMA ROMNEY CROWLEY LEHRER QUESTION SCHIEFFER
## 5 5 2 4 4 2
It is often useful to plot the adjacency matrix as a network. The r HR("http://cran.r-project.org/web/packages/igraph/index.html", "igraph package")
provides this functionality.
`r FT(orange, 5, text="♦")` Plotting an Adjacency Matrix: Example 1`r FT(orange, 5, text="♦")`
library(igraph) dat <- adjacency_matrix(wfm(DATA$state, DATA$person, stopword = Top25Words)) g <- graph.adjacency(dat$adjacency, weighted=TRUE, mode ="undirected") g <- simplify(g) V(g)$label <- V(g)$name V(g)$degree <- igraph::degree(g) set.seed(14) plot(g, layout=layout.auto(g))
The following example will visualize the presidential debates data as a network plot.
`r FT(orange, 5, text="♦")` Plotting an Adjacency Matrix: Example 2`r FT(orange, 5, text="♦")`
library(igraph) ## Subset the presidential debates data set subpres <- pres_debates2012[pres_debates2012$person %in% qcv(ROMNEY, OBAMA), ] ## Create a word frequency matrix dat <- with(subpres, wfm(dialogue, list(person, time), stopword = Top200Words)) ## Generate an adjacency matrix adjdat <- adjacency_matrix(dat) X <- adjdat$adjacency g <- graph.adjacency(X, weighted=TRUE, mode ="undirected") g <- simplify(g) V(g)$label <- V(g)$name V(g)$degree <- igraph::degree(g) plot(g, layout=layout.auto(g))
We can easily add information to the network plot utilizing the r FUN("Dissimilarity")
function to obtain weights and distance measures for use with the plot.
r FT(orange, 5, text="♦")
Plotting an Adjacency Matrix: Example 2br FT(orange, 5, text="♦")
edge.weight <- 15 #a maximizing thickness constant d <- as.matrix(Dissimilarity(dat)) d2 <- d[lower.tri(d)] z1 <- edge.weight*d2^2/max(d2) z2 <- c(round(d2, 3)) E(g)$width <- c(z1)[c(z1) != 0] E(g)$label <- c(z2)[c(z2) != 0] plot(g, layout=layout.auto(g)) plot(g, layout=layout.auto(g), edge.curved =TRUE)
r FT(orange, 5, text="♦")
Plotting an Adjacency Matrix: Try the plot interactively!r FT(orange, 5, text="♦")
tkplot(g)
This section overviews functions that can extract words and word lists from dialogue text. The subsections describing function use are in alphabetical order as there is no set chronology for use.
The r FUN("all_words")
breaks the dialogue into a bag of words and searches based on the criteria arguments r CN("begins.with")
and r CN("contains")
. The resulting word list can be useful for analysis or to pass to qdap functions that deal with r HR("#counts", "Word Counts and Descriptive Statistics")
.
r FT(orange, 5, text="♦")
r FUN("all_words")
r FT(orange, 5, text="♦")
## Words starting with `re` x1 <- all_words(raj$dialogue, begins.with="re") head(x1, 10) ## Words containing `conc` all_words(raj$dialogue, contains = "conc") ## All words ordered by frequency x2 <- all_words(raj$dialogue, alphabetical = FALSE) head(x2, 10)
The qdap package utilizes the following functions to turn text into a bag of words (word order is preserved):
`r HR("http://trinker.github.io/qdap_dev/bag_o_words.html", "bag_o_words")` | Reduces a text column to a single vector bag of words. |
`r HR("http://trinker.github.io/qdap_dev/bag_o_words.html", "breaker")` | Reduces a text column to a single vector bag of words and qdap recognized end marks. |
`r HR("http://trinker.github.io/qdap_dev/bag_o_words.html", "word.split")` | Reduces a text column to a list of vectors of bag of words and qdap recognized end marks (i.e., ".", "!", "?", "*", "-"). |
Bag of words can be useful for any number of reasons within the scope of analyzing discourse. Many other qdap functions employ or mention these three functions as seen in the following counts for the three word splitting functions.
library(acc.roxygen2) x <- search_repo(bag_o_words, breaker, word.split) print(xtable(x), type="html")
Function | bag_o_words | breaker | word.split | |
1 | all_words.R | 1 | - | - |
2 | automated_readability_index.R | - | - | 2 |
3 | bag_o_words.R | 10 | 6 | 3 |
4 | capitalizer.R | 3 | 1 | - |
5 | imperative.R | - | 3 | - |
6 | ngrams.R | 1 | - | - |
7 | polarity.R | 2 | - | - |
8 | rm_stopwords.R | 1 | 3 | - |
9 | textLISTER.R | - | - | 2 |
10 | trans_cloud.R | 1 | 1 | - |
11 | wfm.R | 1 | - | - |
r FT(orange, 5, text="♦")
Word Splitting Examplesr FT(orange, 5, text="♦")
bag_o_words("I'm going home!") bag_o_words("I'm going home!", apostrophe.remove = TRUE) bag_o_words(DATA$state) by(DATA$state, DATA$person, bag_o_words) lapply(DATA$state, bag_o_words) breaker(DATA$state) by(DATA$state, DATA$person, breaker) lapply(DATA$state, breaker) word_split(c(NA, DATA$state))
The r FUN("common")
function finds items that are common between n vectors
(i.e., subjects or grouping variables). This is useful for determining common language choices shared across participants in a conversation.
r FT(orange, 5, text="♦")
Words in Common Examplesr FT(orange, 5, text="♦")
## Create vectors of words a <- c("a", "cat", "dog", "the", "the") b <- c("corn", "a", "chicken", "the") d <- c("house", "feed", "a", "the", "chicken") ## Supply individual vectors common(a, b, d, overlap=2) common(a, b, d, overlap=3) ## Supply a list of vectors common(list(a, b, d)) ## Using to find common words between subjects common(word_list(DATA$state, DATA$person)$cwl, overlap = 2)
It is often useful and more efficient to start with a preset vector of words and eliminate or r FUN("exclude")
the words you do not wish to include. Examples could range from excluding an individual(s) from a column of participant names or excluding a few select word(s) from a pre-defined qdap word list. This is particularly useful for passing terms or stopwords to word counting functions like r FUN("termco")
or r FUN("trans_cloud")
.
r FT(orange, 5, text="♦")
r FUN("exclude")
Examplesr FT(orange, 5, text="♦")
exclude(1:10, 3, 4) exclude(Top25Words, qcv(the, of, and)) exclude(Top25Words, "the", "of", "an") #Using with `term_match` and `termco` MTCH.LST <- exclude(term_match(DATA$state, qcv(th, i)), qcv(truth, stinks)) termco(DATA$state, DATA$person, MTCH.LST)
Utilizing r HR2("http://en.wikipedia.org/wiki/N-gram", "ngrams")
can be useful for gaining a sense of what terms are used in conjunction with other terms. This is particularly useful in the analysis of dialogue when the combination of a particular vocabulary is meaningful. The r FUN("ngrams")
function provides a list of ngram related output that can be utilized in various analyses.
r FT(orange, 5, text="♦")
r FUN("ngrams")
Example note that the output is only partialr FT(orange, 5, text="♦")
out <- ngrams(DATA$state, DATA$person, 2) lapply(out[["all_n"]], function(x) sapply(x, paste, collapse = " "))
In analyzing discourse it may be helpful to remove certain words from the analysis as the words may not be meaningful or may overshadow the impact of other words. The r FUN("rm_stopwords")
function can be utilized to remove r HR2("http://nlp.stanford.edu/IR-book/html/htmledition/dropping-common-terms-stop-words-1.html", "stopwords")
from the dialogue before passing to further analysis. It should be noted that many functions have a stopwords argument that allows for the removal of the stopwords within the function environment rather than altering the text in the primary discourse dataframe. Careful researcher consideration must be given as to the functional impact of removing words from an analysis.
r FT(orange, 5, text="♦")
Stopword Removal Examplesr FT(orange, 5, text="♦")
## The data DATA$state rm_stopwords(DATA$state, Top200Words) rm_stopwords(DATA$state, Top200Words, strip = TRUE) rm_stopwords(DATA$state, Top200Words, separate = FALSE) rm_stopwords(DATA$state, Top200Words, unlist = TRUE, unique = TRUE)
It is often useful to remove capitalization and punctuation from the dialogue in order to standardize the text. R is case sensitive. By removing capital letters and extra punctuation with the r FUN("strip")
function the text is more comparable. In the following output we can see, through the r HR2("http://stat.ethz.ch/R-manual/R-devel/library/base/html/Comparison.html", "==")
comparison operator and r HR2("http://stat.ethz.ch/R-manual/R-devel/library/base/html/outer.html", "outer")
function that the use of r FUN("strip")
makes the different forms of r FT(blue, text="Dan")
comparable.
x <- c("Dan", "dan", "dan.", "DAN") y <- outer(x, x, "==") dimnames(y) <- list(x, x); y x <- strip(c("Dan", "dan", "dan.", "DAN")) y <- outer(x, x, "==") dimnames(y) <- list(x, x); y
As seen in the examples below, r FUN("strip")
comes with multiple arguments to adjust the flexibility of the degree of text standardization.
r FT(orange, 5, text="♦")
r FUN("strip")
Examplesr FT(orange, 5, text="♦")
## Demonstrating the standardization of ## The data DATA$state strip(DATA$state) strip(DATA$state, apostrophe.remove=FALSE) strip(DATA$state, char.keep = c("?", "."))
It is useful in discourse analysis to analyze vocabulary use. This may mean searching for words similar to your initial word list. The r FUN("synonyms")
(aka r FUN("syn")
) function generates synonyms from the r HR2("http://trinker.github.io/qdapDictionaries/", "qdapDictionaries'")
r HR2("http://trinker.github.io/qdapDictionaries/SYNONYM.html", "SYNONYM")
dictionary. These synonyms can be returned as a list or a vector that can then be passed to other qdap functions.
r FT(orange, 5, text="♦")
Synonyms Examplesr FT(orange, 5, text="♦")
synonyms(c("the", "cat", "teach")) syn(c("the", "cat", "teach"), return.list = FALSE) syn(c("the", "cat", "teach"), multiwords = FALSE)
r FT(orange, 5, text="♦")
Word Association Examplesr FT(orange, 5, text="♦")
ms <- c(" I ", "you") et <- c(" it", " tell", "tru") word_associate(DATA2$state, DATA2$person, match.string = ms, wordcloud = TRUE, proportional = TRUE, network.plot = TRUE, nw.label.proportional = TRUE, extra.terms = et, cloud.legend =c("A", "B", "C"), title.color = "blue", cloud.colors = c("red", "purple", "gray70"))
r FT(orange, 5, text="♦")
Word Difference Examplesr FT(orange, 5, text="♦")
out <- with(DATA, word_diff_list(text.var = state, grouping.var = list(sex, adult))) ltruncdf(unlist(out, recursive = FALSE), n=4)
r FT(orange, 5, text="♦")
r FUN("word_list")
Examplesr FT(orange, 5, text="♦")
with(DATA, word_list(state, person)) with(DATA, word_list(state, person, stopwords = Top25Words)) with(DATA, word_list(state, person, cap = FALSE, cap.list=c("do", "we")))
A major task in qualitative work is coding either time or words with selected coding structures. For example a researcher may code the teacher's dialogue as related to the resulting behavior of a student in a classroom as "high", "medium" or "low" engagement. The researcher may choose to apply the coding to:
The coding process in qdap starts with the decision of whether to code the dialogue and/or the time spans. After that the researcher may follow the sequential subsections in the r HR("#coding", "Qualitative Coding System")
section outlined in these steps:
If you choose the route of coding words qdap gives two approaches. Each has distinct benefits and disadvantages dependent upon the situation. If you choose the coding of time spans qdap provides one option.
If you choose the coding of words you may choose to code a csv file or to code the transcript directly (perhaps with markers or other forms of markup), record the ranges in a text list and then read in the data. Both approaches can result in the same data being read back into qdap. The csv approach may allow for extended capabilities (beyond the scope of this vignette) while the transcript/list approach is generally more efficient and takes the approach many qualitative researchers typically utilize in qualitative coding (it also has the added benefit of producing a hard copy).
The next three subsections will walk the reader through how to make a template, code in the template, and read the data back into R/qdap. Subsections 4-5 will cover reshaping and initial analysis after the data has been read in (this approach is generally the same for all three coded data types).
r HR("#wordcsv", "Coding Words - The .csv Approach")
- How to template, code, read in and reshape the datar HR("#wordtrans", "Coding Words - The Transcript/List Approach")
- How to template, code, read in and reshape the datar HR("#timespan", "Coding Time Spans")
- How to template, code, read in and reshape the datar HR("#reshape", "Transforming Codes")
r HR("#analysis", "Initial Coding Analysis")
Before getting started with subsections 1-3 the reader will want to know the naming scheme of the code matrix (r FT(red, text="cm_")
) functions used. The initial r FT(red, text="cm_")
is utilized for any code matrix family of functions. The functions containing r FT(red, text="cm_temp")
are template functions. The r FT(red, text="df")
, r FT(red, text="range")
, or r FT(red, text="time")
determine whether the csv (r FT(red, text="df")
), Transcript/List (r FT(red, text="range")
), or Time Span (r FT(red, text="time")
) approach is being utilized. r FT(red, text="cm_")
functions that bear r FT(red, text="2long")
transform a read in list to a usable long format.
The csv approach utilizes r FUN("cm_df.temp")
and r FUN("cm_2long")
functions. To utilize the csv template approach simply supply the dataframe, specify the text variable and provide a list of anticipated codes.
r FT(orange, 5, text="♦")
Coding Words (csv approach): The Template r FT(orange, 5, text="♦")
## Codes
codes <- qcv(dc, sf, wes, pol, rejk, lk, azx, mmm)
## The csv template
X <- cm_df.temp(DATA, text.var = "state", codes = codes, file = "DATA.csv")
qview(X)
========================================================================
nrow = 56 ncol = 14 X
========================================================================
person sex adult code text word.num dc sf wes pol rejk lk azx mmm
1 sam m 0 K1 Computer 1 0 0 0 0 0 0 0 0
2 sam m 0 K1 is 2 0 0 0 0 0 0 0 0
3 sam m 0 K1 fun. 3 0 0 0 0 0 0 0 0
4 sam m 0 K1 Not 4 0 0 0 0 0 0 0 0
5 sam m 0 K1 too 5 0 0 0 0 0 0 0 0
6 sam m 0 K1 fun. 6 0 0 0 0 0 0 0 0
7 greg m 0 K2 No 7 0 0 0 0 0 0 0 0
8 greg m 0 K2 it's 8 0 0 0 0 0 0 0 0
9 greg m 0 K2 not, 9 0 0 0 0 0 0 0 0
10 greg m 0 K2 it's 10 0 0 0 0 0 0 0 0
After coding the data (see the r HR2("http://www.youtube.com/watch?v=tH242SIESIs", "YouTube video")
) the data can be read back in with r HR2("http://stat.ethz.ch/R-manual/R-devel/library/utils/html/read.table.html", "read.csv")
.
r FT(orange, 5, text="♦")
Coding Words (csv approach): Read In and Reshape r FT(orange, 5, text="♦")
## Read in the data
dat <- read.csv("DATA.csv")
## Reshape to long format with word durations
cm_2long(dat)
code person sex adult code.1 text word.num start end variable
1 dc sam m 0 K1 Computer 1 0 1 dat
2 wes sam m 0 K1 Computer 1 0 1 dat
3 rejk sam m 0 K1 Computer 1 0 1 dat
4 mmm sam m 0 K1 Computer 1 0 1 dat
5 lk sam m 0 K1 is 2 1 2 dat
6 azx sam m 0 K1 is 2 1 2 dat
.
.
.
198 wes greg m 0 K11 already? 56 55 56 dat
199 rejk greg m 0 K11 already? 56 55 56 dat
200 lk greg m 0 K11 already? 56 55 56 dat
201 azx greg m 0 K11 already? 56 55 56 dat
202 mmm greg m 0 K11 already? 56 55 56 dat
The Transcript/List approach utilizes r FUN("cm_df.transcript")
, r FUN("cm_range.temp")
and r FUN("cm_2long")
functions. To use the transcript template simply supply the dataframe, specify the text variable and provide a list of anticipated codes.
r FT(orange, 5, text="♦")
Coding Words (Transcript/List approach): Transcript Template r FT(orange, 5, text="♦")
## Codes
codes <- qcv(AA, BB, CC)
## Transcript template
X <- cm_df.transcript(DATA$state, DATA$person, file="DATA.txt")
sam:
1 2 3 4 5 6
Computer is fun. Not too fun.
greg:
7 8 9 10 11
No it's not, it's dumb.
teacher:
12 13 14 15
What should we do?
sam:
16 17 18 19
You liar, it stinks!
r FT(orange, 5, text="♦")
Coding Words (Transcript/List approach): List Template 1r FT(orange, 5, text="♦")
### List template
cm_range.temp(codes, file = "foo1.txt")
list(
AA = qcv(terms=''),
BB = qcv(terms=''),
CC = qcv(terms='')
)
This list below contains demographic variables. If the researcher has demographic variables it is recommended to supply them at this point. The demographic variables will be generated with durations automatically.
r FT(orange, 5, text="♦")
Coding Words (Transcript/List approach): List Template 2r FT(orange, 5, text="♦")
### List template with demographic variables
with(DATA, cm_range.temp(codes = codes, text.var = state,
grouping.var = list(person, adult), file = "foo2.txt"))
list(
person_greg = qcv(terms='7:11, 20:24, 30:33, 49:56'),
person_researcher = qcv(terms='42:48'),
person_sally = qcv(terms='25:29, 37:41'),
person_sam = qcv(terms='1:6, 16:19, 34:36'),
person_teacher = qcv(terms='12:15'),
adult_0 = qcv(terms='1:11, 16:41, 49:56'),
adult_1 = qcv(terms='12:15, 42:48'),
AA = qcv(terms=''),
BB = qcv(terms=''),
CC = qcv(terms='')
)
After coding the data (see the r HR2("http://www.youtube.com/watch?v=cxcD-j0iI2U", "YouTube video")
) the data can be read back in with r HR2("http://stat.ethz.ch/R-manual/R-devel/library/base/html/source.html", "source")
. Be sure to assign the list to an object (e.g., dat <- list()
).
r FT(orange, 5, text="♦")
Coding Words (Transcript/List approach): Read in the datar FT(orange, 5, text="♦")
## Read it in
source("foo1.txt")
### View it
Time1
$AA
[1] "1"
$BB
[1] "1:2," "3:10," "19"
$CC
[1] "1:9," "100:150"
This format is not particularly useful. The data can be reshaped to long format with durations via r FUN("cm_2long")
:
r FT(orange, 5, text="♦")
Coding Words (Transcript/List approach): Long formatr FT(orange, 5, text="♦")
## Long format with durations
datL <- cm_2long(Time1)
datL
code start end variable
1 AA 0 1 Time1
2 BB 0 2 Time1
3 BB 2 10 Time1
4 BB 18 19 Time1
5 CC 0 9 Time1
6 CC 99 150 Time1
The Time Span approach utilizes the r FUN("cm_time.temp")
and r FUN("cm_2long")
functions. To generate the time span template, simply supply the list of anticipated codes and a start/end time.
r FT(orange, 5, text="♦")
Coding Time Spans: Time Span Template r FT(orange, 5, text="♦")
## Codes
## Time span template
X <- cm_time.temp(start = ":14", end = "7:40", file="timespans.txt")
X <- cm_time.temp(start = ":14", end = "7:40", file="timespans.doc")
[0] 14 15 16 ... 51 52 53 54 55 56 57 58 59
[1]0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ... 51 52 53 54 55 56 57 58 59
[2]0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ... 51 52 53 54 55 56 57 58 59
[3]0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ... 51 52 53 54 55 56 57 58 59
[4]0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ... 51 52 53 54 55 56 57 58 59
[5]0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ... 51 52 53 54 55 56 57 58 59
[6]0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ... 51 52 53 54 55 56 57 58 59
[7]0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ... 51 52 53
r FT(orange, 5, text="♦")
Coding Time Spans: List Template 1r FT(orange, 5, text="♦")
### List template
codes <- qcv(AA, BB, CC)
cm_time.temp(codes, file = "codelist.txt")
list(
transcript_time_span = qcv(terms="00:00 - 00:00"),
AA = qcv(terms=""),
BB = qcv(terms=""),
CC = qcv(terms="")
)
The list below contains demographic variables. If the researcher has demographic variables, it is recommended to supply them at this point.
r FT(orange, 5, text="♦")
Coding Time Spans: List Template 2r FT(orange, 5, text="♦")
### List template with demographic variables
with(DATA, cm_time.temp(codes, list(person, adult), file = "codelist.txt"))
list(
transcript_time_span = qcv(terms="00:00 - 00:00"),
person_sam = qcv(terms=""),
person_greg = qcv(terms=""),
person_teacher = qcv(terms=""),
person_sally = qcv(terms=""),
person_researcher = qcv(terms=""),
adult_0 = qcv(terms=""),
adult_1 = qcv(terms=""),
AA = qcv(terms=""),
BB = qcv(terms=""),
CC = qcv(terms="")
)
After coding the data (see the r HR2("http://www.youtube.com/watch?v=XC-RXeY63bM&feature=youtu.be", "YouTube video")
) the data can be read back in with r HR2("http://stat.ethz.ch/R-manual/R-devel/library/base/html/source.html", "source")
. Be sure to assign the list to an object (e.g., dat <- list()
).
r FT(orange, 5, text="♦")
Coding Time Spans: Read in the datar FT(orange, 5, text="♦")
## Read it in
source("codelist.txt")
### View it
Time1
$transcript_time_span
[1] "00:00" "-" "1:12:00"
$A
[1] "2.40:3.00," "5.01," "6.52:7.00," "9.00"
$B
[1] "2.40," "3.01:3.40," "5.01," "6.52:7.00," "9.00"
$C
[1] "2.40:4.00," "5.01," "6.52:7.00," "9.00," "13.00:17.01"
This format is not particularly useful. The data can be reshaped to long format with durations via r FUN("cm_2long")
:
r FT(orange, 5, text="♦")
Coding Time Spans: Long formatr FT(orange, 5, text="♦")
## Long format with durations
datL <- cm_2long(Time1, v.name = "time")
datL
code start end Start End variable
1 A 159 180 00:02:39 00:03:00 Time1
2 A 300 301 00:05:00 00:05:01 Time1
3 A 411 420 00:06:51 00:07:00 Time1
4 A 539 540 00:08:59 00:09:00 Time1
5 B 159 160 00:02:39 00:02:40 Time1
6 B 180 220 00:03:00 00:03:40 Time1
7 B 300 301 00:05:00 00:05:01 Time1
8 B 411 420 00:06:51 00:07:00 Time1
9 B 539 540 00:08:59 00:09:00 Time1
10 C 159 240 00:02:39 00:04:00 Time1
11 C 300 301 00:05:00 00:05:01 Time1
12 C 411 420 00:06:51 00:07:00 Time1
13 C 539 540 00:08:59 00:09:00 Time1
14 C 779 1021 00:12:59 00:17:01 Time1
The researcher may want to determine where codes do and do not overlap with one another. The r FT(red, text="cm_")
family of functions bearing (r FT(red, text="cm_code.")
) perform various transformative functions (Boolean search). r FUN("cm_code.combine")
will merge the spans (time or word) for given codes. r FUN("cm_code.exclude")
will provide spans that exclude the given codes. r FUN("cm_code.overlap")
will yield the spans where all of the given codes co-occur. r FUN("cm_code.transform")
is a wrapper for the previous three functions that produces one dataframe in a single call. Lastly, r FUN("cm_code.blank")
provides a more flexible framework that allows for the introduction of multiple logical operators between codes. Most tasks can be handled with the r FUN("cm_code.transform")
function.
For examples of each, click the links below:
1. r HR("#cm_code.combine", "cm_code.combine Examples")
2. r HR("#cm_code.exclude", "cm_code.exclude Examples")
3. r HR("#cm_code.overlap", "cm_code.overlap Examples")
4. r HR("#cm_code.transform", "cm_code.transform Examples")
5. r HR("#cm_code.blank", "cm_code.blank Examples")
For the sake of simplicity the uses of these functions will be demonstrated via a gantt plot for a visual comparison of the data sets.
The reader should note that all of the above functions utilize two helper functions (r FUN("cm_long2dummy")
and r FUN("cm_dummy2long")
) to stretch the spans into single units of measure (word or second), perform a calculation, and then condense back to spans. More advanced needs may require the explicit use of these functions, though they are beyond the scope of this vignette.
The following data sets will be utilized throughout the demonstrations of the r FT(red, text="cm_code.")
family of functions:
r FT(orange, 5, text="♦")
Common Data Sets - Word Approachr FT(orange, 5, text="♦")
foo <- list( AA = qcv(terms="1:10"), BB = qcv(terms="1:2, 3:10, 19"), CC = qcv(terms="1:3, 5:6") ) foo2 <- list( AA = qcv(terms="4:8"), BB = qcv(terms="1:4, 10:12"), CC = qcv(terms="1, 11, 15:20"), DD = qcv(terms="") )
## Single time, long word approach (x <- cm_2long(foo))
code start end variable
1 AA 0 10 foo
2 BB 0 2 foo
3 BB 2 10 foo
4 BB 18 19 foo
5 CC 0 3 foo
6 CC 4 6 foo
x <- structure(list(code = structure(c(1L, 2L, 2L, 2L, 3L, 3L), .Label = c("AA", "BB", "CC"), class = "factor"), start = c(0, 0, 2, 18, 0, 4), end = c(10, 2, 10, 19, 3, 6), variable = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "foo", class = "factor")), .Names = c("code", "start", "end", "variable"), row.names = c(NA, -6L), class = c("cmspans", "cmrange", "cmrange2long", "vname_variable", "data.frame"))
gantt_wrap(x, "code")
## Repeated measures, long word approach (z <- cm_2long(foo, foo2, v.name="time"))
code start end time
1 AA 0 10 foo
2 BB 0 2 foo
3 BB 2 10 foo
4 BB 18 19 foo
5 CC 0 3 foo
6 CC 4 6 foo
7 AA 3 8 foo2
8 BB 0 4 foo2
9 BB 9 12 foo2
10 CC 0 1 foo2
11 CC 10 11 foo2
12 CC 14 20 foo2
z <- structure(list(code = structure(c(1L, 2L, 2L, 2L, 3L, 3L, 1L, 2L, 2L, 3L, 3L, 3L), .Label = c("AA", "BB", "CC"), class = "factor"), start = c(0, 0, 2, 18, 0, 4, 3, 0, 9, 0, 10, 14), end = c(10, 2, 10, 19, 3, 6, 8, 4, 12, 1, 11, 20), time = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("foo", "foo2"), class = "factor")), .Names = c("code", "start", "end", "time"), row.names = c(NA, -12L), class = c("cmspans", "cmrange", "cmrange2long", "vname_time", "data.frame"))
gantt_wrap(z, "code", "time")
r FT(orange, 5, text="♦")
Common Data Sets - Time Span Approachr FT(orange, 5, text="♦")
bar1 <- list( transcript_time_span = qcv(00:00 - 1:12:00), A = qcv(terms = "2.40:3.00, 5.01, 6.02:7.00, 9.00"), B = qcv(terms = "2.40, 3.01:3.02, 5.01, 6.02:7.00, 9.00, 1.12.00:1.19.01"), C = qcv(terms = "2.40:3.00, 5.01, 6.02:7.00, 9.00, 16.25:17.01") ) bar2 <- list( transcript_time_span = qcv(00:00 - 1:12:00), A = qcv(terms = "2.40:3.00, 5.01, 6.02:7.00, 9.00"), B = qcv(terms = "2.40, 3.01:3.02, 5.01, 6.02:7.00, 9.00, 1.12.00:1.19.01"), C = qcv(terms = "2.40:3.00, 5.01, 6.02:7.00, 9.00, 17.01") )
## Single time, long time approach (dat <- cm_2long(bar1))
code start end Start End variable
1 A 159 180 00:02:39 00:03:00 bar1
2 A 300 301 00:05:00 00:05:01 bar1
3 A 361 420 00:06:01 00:07:00 bar1
4 A 539 540 00:08:59 00:09:00 bar1
5 B 159 160 00:02:39 00:02:40 bar1
6 B 180 182 00:03:00 00:03:02 bar1
7 B 300 301 00:05:00 00:05:01 bar1
8 B 361 420 00:06:01 00:07:00 bar1
9 B 539 540 00:08:59 00:09:00 bar1
10 B 4319 4741 01:11:59 01:19:01 bar1
11 C 159 180 00:02:39 00:03:00 bar1
12 C 300 301 00:05:00 00:05:01 bar1
13 C 361 420 00:06:01 00:07:00 bar1
14 C 539 540 00:08:59 00:09:00 bar1
15 C 984 1021 00:16:24 00:17:01 bar1
dat <- structure(list(code = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), .Label = c("A", "B", "C"), class = "factor"), start = c(159, 300, 361, 539, 159, 180, 300, 361, 539, 4319, 159, 300, 361, 539, 984), end = c(180, 301, 420, 540, 160, 182, 301, 420, 540, 4741, 180, 301, 420, 540, 1021), Start = structure(c(0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.00184027777777778, 0.00208333333333333, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.0499884259259259, 0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.0113888888888889), format = "h:m:s", class = "times"), End = structure(c(0.00208333333333333, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.00185185185185185, 0.00210648148148148, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.0548726851851852, 0.00208333333333333, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.0118171296296296), format = "h:m:s", class = "times"), variable = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "bar1", class = "factor")), .Names = c("code", "start", "end", "Start", "End", "variable"), row.names = c(NA, -15L), class = c("cmspans", "cmtime", "cmtime2long", "vname_variable", "data.frame", "spans_4320"))
gantt_wrap(dat, "code")
## Repeated measures, long time approach (dats <- cm_2long(bar1, bar2, v.name = "time"))
code start end Start End time
1 A 159 180 00:02:39 00:03:00 bar1
2 A 300 301 00:05:00 00:05:01 bar1
3 A 361 420 00:06:01 00:07:00 bar1
4 A 539 540 00:08:59 00:09:00 bar1
5 B 159 160 00:02:39 00:02:40 bar1
6 B 180 182 00:03:00 00:03:02 bar1
7 B 300 301 00:05:00 00:05:01 bar1
8 B 361 420 00:06:01 00:07:00 bar1
9 B 539 540 00:08:59 00:09:00 bar1
10 B 4319 4741 01:11:59 01:19:01 bar1
11 C 159 180 00:02:39 00:03:00 bar1
12 C 300 301 00:05:00 00:05:01 bar1
13 C 361 420 00:06:01 00:07:00 bar1
14 C 539 540 00:08:59 00:09:00 bar1
15 C 984 1021 00:16:24 00:17:01 bar1
16 A 159 180 00:02:39 00:03:00 bar2
17 A 300 301 00:05:00 00:05:01 bar2
18 A 361 420 00:06:01 00:07:00 bar2
19 A 539 540 00:08:59 00:09:00 bar2
20 B 159 160 00:02:39 00:02:40 bar2
21 B 180 182 00:03:00 00:03:02 bar2
22 B 300 301 00:05:00 00:05:01 bar2
23 B 361 420 00:06:01 00:07:00 bar2
24 B 539 540 00:08:59 00:09:00 bar2
25 B 4319 4741 01:11:59 01:19:01 bar2
26 C 159 180 00:02:39 00:03:00 bar2
27 C 300 301 00:05:00 00:05:01 bar2
28 C 361 420 00:06:01 00:07:00 bar2
29 C 539 540 00:08:59 00:09:00 bar2
30 C 1020 1021 00:17:00 00:17:01 bar2
dats <- structure(list(code = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), .Label = c("A", "B", "C"), class = "factor"), start = c(159, 300, 361, 539, 159, 180, 300, 361, 539, 4319, 159, 300, 361, 539, 984, 159, 300, 361, 539, 159, 180, 300, 361, 539, 4319, 159, 300, 361, 539, 1020), end = c(180, 301, 420, 540, 160, 182, 301, 420, 540, 4741, 180, 301, 420, 540, 1021, 180, 301, 420, 540, 160, 182, 301, 420, 540, 4741, 180, 301, 420, 540, 1021), Start = structure(c(0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.00184027777777778, 0.00208333333333333, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.0499884259259259, 0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.0113888888888889, 0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.00184027777777778, 0.00208333333333333, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.0499884259259259, 0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.0118055555555556), format = "h:m:s", class = "times"), End = structure(c(0.00208333333333333, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.00185185185185185, 0.00210648148148148, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.0548726851851852, 0.00208333333333333, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.0118171296296296, 0.00208333333333333, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.00185185185185185, 0.00210648148148148, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.0548726851851852, 0.00208333333333333, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.0118171296296296), format = "h:m:s", class = "times"), time = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("bar1", "bar2"), class = 
"factor")), .Names = c("code", "start", "end", "Start", "End", "time"), row.names = c(NA, -30L ), class = c("cmspans", "cmtime", "cmtime2long", "vname_time", "data.frame", "spans_4320||4320"))
gantt_wrap(dats, "code", "time")
r FUN("cm_code.combine")
provides all the spans (time/words) that are occupied by one or more of the combined codes. For example, if we utilized r FUN("cm_code.combine")
on code list X and Y the result would be any span where X or Y is located. This is the OR of the Boolean search. Note that combine.code.list
must be supplied as a list of named character vectors.
r FT(orange, 5, text="♦")
r FUN("cm_code.combine")
Single Time Word Exampler FT(orange, 5, text="♦")
(cc1 <- cm_code.combine(x, list(ALL=qcv(AA, BB, CC))))
code start end
1 AA 0 10
2 BB 0 10
3 BB 18 19
4 CC 0 3
5 CC 4 6
6 ALL 0 10
7 ALL 18 19
cc1 <- structure(list(code = structure(c(1L, 3L, 3L, 4L, 4L, 2L, 2L), .Label = c("AA", "ALL", "BB", "CC"), class = "factor"), start = c(0L, 0L, 18L, 0L, 4L, 0L, 18L), end = c(10L, 10L, 19L, 3L, 6L, 10L, 19L)), .Names = c("code", "start", "end"), row.names = c(NA, -7L), class = c("cmspans", "cmrange", "data.frame"))
gantt_wrap(cc1, "code")
r FT(orange, 5, text="♦")
r FUN("cm_code.combine")
Repeated Measures Word Exampler FT(orange, 5, text="♦")
combines <- list(AB=qcv(AA, BB), ABC=qcv(AA, BB, CC)) (cc2 <- cm_code.combine(z, combines, rm.var = "time"))
code start end time
1 AA 0 10 foo
2 BB 0 10 foo
3 BB 18 19 foo
4 CC 0 3 foo
5 CC 4 6 foo
6 AB 0 10 foo
7 AB 18 19 foo
8 ABC 0 10 foo
9 ABC 18 19 foo
10 AA 3 8 foo2
11 BB 0 4 foo2
12 BB 9 12 foo2
13 CC 0 1 foo2
14 CC 10 11 foo2
15 CC 14 20 foo2
16 AB 0 8 foo2
17 AB 9 12 foo2
18 ABC 0 8 foo2
19 ABC 9 12 foo2
20 ABC 14 20 foo2
cc2 <- structure(list(code = structure(c(1L, 4L, 4L, 5L, 5L, 2L, 2L, 3L, 3L, 1L, 4L, 4L, 5L, 5L, 5L, 2L, 2L, 3L, 3L, 3L), .Label = c("AA", "AB", "ABC", "BB", "CC"), class = "factor"), start = c(0L, 0L, 18L, 0L, 4L, 0L, 18L, 0L, 18L, 3L, 0L, 9L, 0L, 10L, 14L, 0L, 9L, 0L, 9L, 14L), end = c(10L, 10L, 19L, 3L, 6L, 10L, 19L, 10L, 19L, 8L, 4L, 12L, 1L, 11L, 20L, 8L, 12L, 8L, 12L, 20L), time = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("foo", "foo2"), class = "factor")), .Names = c("code", "start", "end", "time"), row.names = c(NA, -20L), class = c("cmspans", "vname_time", "data.frame"))
gantt_wrap(cc2, "code", "time")
r FT(orange, 5, text="♦")
r FUN("cm_code.combine")
Single Time Time Span Exampler FT(orange, 5, text="♦")
combines2 <- list(AB=qcv(A, B), BC=qcv(B, C), ABC=qcv(A, B, C)) (cc3 <- cm_code.combine(dat, combines2))
code start end Start End
1 A 159 180 00:02:39 00:03:00
2 A 300 301 00:05:00 00:05:01
3 A 361 420 00:06:01 00:07:00
4 A 539 540 00:08:59 00:09:00
5 B 159 160 00:02:39 00:02:40
6 B 180 182 00:03:00 00:03:02
7 B 300 301 00:05:00 00:05:01
8 B 361 420 00:06:01 00:07:00
9 B 539 540 00:08:59 00:09:00
10 B 4319 4741 01:11:59 01:19:01
11 C 159 180 00:02:39 00:03:00
12 C 300 301 00:05:00 00:05:01
13 C 361 420 00:06:01 00:07:00
14 C 539 540 00:08:59 00:09:00
15 C 984 1021 00:16:24 00:17:01
16 AB 159 182 00:02:39 00:03:02
17 AB 300 301 00:05:00 00:05:01
18 AB 361 420 00:06:01 00:07:00
19 AB 539 540 00:08:59 00:09:00
20 AB 4319 4741 01:11:59 01:19:01
21 BC 159 182 00:02:39 00:03:02
22 BC 300 301 00:05:00 00:05:01
23 BC 361 420 00:06:01 00:07:00
24 BC 539 540 00:08:59 00:09:00
25 BC 984 1021 00:16:24 00:17:01
26 BC 4319 4741 01:11:59 01:19:01
27 ABC 159 182 00:02:39 00:03:02
28 ABC 300 301 00:05:00 00:05:01
29 ABC 361 420 00:06:01 00:07:00
30 ABC 539 540 00:08:59 00:09:00
31 ABC 984 1021 00:16:24 00:17:01
32 ABC 4319 4741 01:11:59 01:19:01
cc3 <- structure(list(code = structure(c(1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 6L, 6L, 6L, 6L, 6L, 2L, 2L, 2L, 2L, 2L, 5L, 5L, 5L, 5L, 5L, 5L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("A", "AB", "ABC", "B", "BC", "C"), class = "factor"), start = c(159L, 300L, 361L, 539L, 159L, 180L, 300L, 361L, 539L, 4319L, 159L, 300L, 361L, 539L, 984L, 159L, 300L, 361L, 539L, 4319L, 159L, 300L, 361L, 539L, 984L, 4319L, 159L, 300L, 361L, 539L, 984L, 4319L), end = c(180L, 301L, 420L, 540L, 160L, 182L, 301L, 420L, 540L, 4741L, 180L, 301L, 420L, 540L, 1021L, 182L, 301L, 420L, 540L, 4741L, 182L, 301L, 420L, 540L, 1021L, 4741L, 182L, 301L, 420L, 540L, 1021L, 4741L), Start = structure(c(0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.00184027777777778, 0.00208333333333333, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.0499884259259259, 0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.0113888888888889, 0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.0499884259259259, 0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.0113888888888889, 0.0499884259259259, 0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.0113888888888889, 0.0499884259259259), format = "h:m:s", class = "times"), End = structure(c(0.00208333333333333, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.00185185185185185, 0.00210648148148148, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.0548726851851852, 0.00208333333333333, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.0118171296296296, 0.00210648148148148, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.0548726851851852, 0.00210648148148148, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.0118171296296296, 0.0548726851851852, 0.00210648148148148, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.0118171296296296, 0.0548726851851852 ), format = "h:m:s", 
class = "times")), .Names = c("code", "start", "end", "Start", "End"), row.names = c(NA, -32L), class = c("cmspans", "cmtime", "data.frame"))
gantt_wrap(cc3, "code")
r FUN("cm_code.exclude")
provides all the spans (time/words) that are occupied by one or more of the combined codes with the exclusion of another code. For example, if we utilized r FUN("cm_code.exclude")
on code list X and Y the result would be any span where X is located but Y is not. This is the NOT of the Boolean search. The last term supplied to exclude.code.list is the excluded term. All other terms are combined and the final code term is partitioned out. Note that exclude.code.list
must be supplied as a list of named character vectors.
r FT(orange, 5, text="♦")
r FUN("cm_code.exclude")
Single Time Word Exampler FT(orange, 5, text="♦")
(ce1 <- cm_code.exclude(x, list(BnoC=qcv(BB, CC))))
code start end
1 AA 0 10
2 BB 0 10
3 BB 18 19
4 CC 0 3
5 CC 4 6
6 BnoC 3 4
7 BnoC 6 10
8 BnoC 18 19
ce1 <- structure(list(code = structure(c(1L, 2L, 2L, 4L, 4L, 3L, 3L, 3L), .Label = c("AA", "BB", "BnoC", "CC"), class = "factor"), start = c(0L, 0L, 18L, 0L, 4L, 3L, 6L, 18L), end = c(10L, 10L, 19L, 3L, 6L, 4L, 10L, 19L)), .Names = c("code", "start", "end"), class = c("cmspans", "data.frame", "cmrange"), row.names = c(NA, 8L))
gantt_wrap(ce1, "code")
r FT(orange, 5, text="♦")
r FUN("cm_code.exclude")
Repeated Measures Word Exampler FT(orange, 5, text="♦")
exlist <- list(AnoB=qcv(AA, BB), ABnoC=qcv(AA, BB, CC)) (ce2 <- cm_code.exclude(z, exlist, rm.var = "time"))
code start end time
1 AA 0 10 foo
2 BB 0 10 foo
3 BB 18 19 foo
4 CC 0 3 foo
5 CC 4 6 foo
6 ABnoC 3 4 foo
7 ABnoC 6 10 foo
8 ABnoC 18 19 foo
9 AA 3 8 foo2
10 BB 0 4 foo2
11 BB 9 12 foo2
12 CC 0 1 foo2
13 CC 10 11 foo2
14 CC 14 20 foo2
15 AnoB 4 8 foo2
16 ABnoC 1 8 foo2
17 ABnoC 9 10 foo2
18 ABnoC 11 12 foo2
ce2 <- structure(list(code = structure(c(1L, 3L, 3L, 4L, 4L, 2L, 2L, 2L, 1L, 3L, 3L, 4L, 4L, 4L, 5L, 2L, 2L, 2L), .Label = c("AA", "ABnoC", "BB", "CC", "AnoB"), class = "factor"), start = c(0L, 0L, 18L, 0L, 4L, 3L, 6L, 18L, 3L, 0L, 9L, 0L, 10L, 14L, 4L, 1L, 9L, 11L), end = c(10L, 10L, 19L, 3L, 6L, 4L, 10L, 19L, 8L, 4L, 12L, 1L, 11L, 20L, 8L, 8L, 10L, 12L), time = c("foo", "foo", "foo", "foo", "foo", "foo", "foo", "foo", "foo2", "foo2", "foo2", "foo2", "foo2", "foo2", "foo2", "foo2", "foo2", "foo2")), .Names = c("code", "start", "end", "time"), row.names = c(NA, 18L), class = c("cmspans", "vname_time", "data.frame", "cmrange"))
gantt_wrap(ce2, "code", "time")
r FT(orange, 5, text="♦")
r FUN("cm_code.exclude")
Repeated Measures Time Span Exampler FT(orange, 5, text="♦")
exlist2 <- list(AnoB=qcv(A, B), BnoC=qcv(B, C), ABnoC=qcv(A, B, C)) (ce3 <- cm_code.exclude(dats, exlist2, "time"))
code start end Start End time
1 A 159 180 00:02:39 00:03:00 bar1
2 A 300 301 00:05:00 00:05:01 bar1
3 A 361 420 00:06:01 00:07:00 bar1
4 A 539 540 00:08:59 00:09:00 bar1
5 B 159 160 00:02:39 00:02:40 bar1
6 B 180 182 00:03:00 00:03:02 bar1
7 B 300 301 00:05:00 00:05:01 bar1
8 B 361 420 00:06:01 00:07:00 bar1
9 B 539 540 00:08:59 00:09:00 bar1
10 B 4319 4741 01:11:59 01:19:01 bar1
11 C 159 180 00:02:39 00:03:00 bar1
12 C 300 301 00:05:00 00:05:01 bar1
13 C 361 420 00:06:01 00:07:00 bar1
14 C 539 540 00:08:59 00:09:00 bar1
15 C 984 1021 00:16:24 00:17:01 bar1
16 AnoB 160 180 00:02:40 00:03:00 bar1
17 BnoC 180 182 00:03:00 00:03:02 bar1
18 BnoC 4319 4741 01:11:59 01:19:01 bar1
19 ABnoC 180 182 00:03:00 00:03:02 bar1
20 ABnoC 4319 4741 01:11:59 01:19:01 bar1
21 A 159 180 00:02:39 00:03:00 bar2
22 A 300 301 00:05:00 00:05:01 bar2
23 A 361 420 00:06:01 00:07:00 bar2
24 A 539 540 00:08:59 00:09:00 bar2
25 B 159 160 00:02:39 00:02:40 bar2
26 B 180 182 00:03:00 00:03:02 bar2
27 B 300 301 00:05:00 00:05:01 bar2
28 B 361 420 00:06:01 00:07:00 bar2
29 B 539 540 00:08:59 00:09:00 bar2
30 B 4319 4741 01:11:59 01:19:01 bar2
31 C 159 180 00:02:39 00:03:00 bar2
32 C 300 301 00:05:00 00:05:01 bar2
33 C 361 420 00:06:01 00:07:00 bar2
34 C 539 540 00:08:59 00:09:00 bar2
35 C 1020 1021 00:17:00 00:17:01 bar2
36 AnoB 160 180 00:02:40 00:03:00 bar2
37 BnoC 180 182 00:03:00 00:03:02 bar2
38 BnoC 4319 4741 01:11:59 01:19:01 bar2
39 ABnoC 180 182 00:03:00 00:03:02 bar2
40 ABnoC 4319 4741 01:11:59 01:19:01 bar2
ce3 <- structure(list(code = structure(c(1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 6L, 6L, 6L, 6L, 6L, 3L, 5L, 5L, 2L, 2L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 6L, 6L, 6L, 6L, 6L, 3L, 5L, 5L, 2L, 2L), .Label = c("A", "ABnoC", "AnoB", "B", "BnoC", "C"), class = "factor"), start = c(159L, 300L, 361L, 539L, 159L, 180L, 300L, 361L, 539L, 4319L, 159L, 300L, 361L, 539L, 984L, 160L, 180L, 4319L, 180L, 4319L, 159L, 300L, 361L, 539L, 159L, 180L, 300L, 361L, 539L, 4319L, 159L, 300L, 361L, 539L, 1020L, 160L, 180L, 4319L, 180L, 4319L), end = c(180L, 301L, 420L, 540L, 160L, 182L, 301L, 420L, 540L, 4741L, 180L, 301L, 420L, 540L, 1021L, 180L, 182L, 4741L, 182L, 4741L, 180L, 301L, 420L, 540L, 160L, 182L, 301L, 420L, 540L, 4741L, 180L, 301L, 420L, 540L, 1021L, 180L, 182L, 4741L, 182L, 4741L), Start = structure(c(0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.00184027777777778, 0.00208333333333333, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.0499884259259259, 0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.0113888888888889, 0.00185185185185185, 0.00208333333333333, 0.0499884259259259, 0.00208333333333333, 0.0499884259259259, 0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.00184027777777778, 0.00208333333333333, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.0499884259259259, 0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.0118055555555556, 0.00185185185185185, 0.00208333333333333, 0.0499884259259259, 0.00208333333333333, 0.0499884259259259 ), format = "h:m:s", class = "times"), End = structure(c(0.00208333333333333, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.00185185185185185, 0.00210648148148148, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.0548726851851852, 0.00208333333333333, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.0118171296296296, 0.00208333333333333, 
0.00210648148148148, 0.0548726851851852, 0.00210648148148148, 0.0548726851851852, 0.00208333333333333, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.00185185185185185, 0.00210648148148148, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.0548726851851852, 0.00208333333333333, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.0118171296296296, 0.00208333333333333, 0.00210648148148148, 0.0548726851851852, 0.00210648148148148, 0.0548726851851852 ), format = "h:m:s", class = "times"), time = c("bar1", "bar1", "bar1", "bar1", "bar1", "bar1", "bar1", "bar1", "bar1", "bar1", "bar1", "bar1", "bar1", "bar1", "bar1", "bar1", "bar1", "bar1", "bar1", "bar1", "bar2", "bar2", "bar2", "bar2", "bar2", "bar2", "bar2", "bar2", "bar2", "bar2", "bar2", "bar2", "bar2", "bar2", "bar2", "bar2", "bar2", "bar2", "bar2", "bar2")), .Names = c("code", "start", "end", "Start", "End", "time"), class = c("cmspans", "data.frame", "vname_time", "cmtime"), row.names = c(NA, 40L))
gantt_wrap(ce3, "code")
r FT(orange, 5, text="♦")
r FUN("cm_code.exclude")
Single Time Time Span Combined Exclude Exampler FT(orange, 5, text="♦")
(ce4.1 <- cm_code.combine(dat, list(AB = qcv(A, B)))) (ce4.2 <- cm_code.exclude(ce4.1, list(CnoAB = qcv(C, AB))))
code start end Start End
1 A 159 180 00:02:39 00:03:00
2 A 300 301 00:05:00 00:05:01
3 A 361 420 00:06:01 00:07:00
4 A 539 540 00:08:59 00:09:00
5 B 159 160 00:02:39 00:02:40
6 B 180 182 00:03:00 00:03:02
7 B 300 301 00:05:00 00:05:01
8 B 361 420 00:06:01 00:07:00
9 B 539 540 00:08:59 00:09:00
10 B 4319 4741 01:11:59 01:19:01
11 C 159 180 00:02:39 00:03:00
12 C 300 301 00:05:00 00:05:01
13 C 361 420 00:06:01 00:07:00
14 C 539 540 00:08:59 00:09:00
15 C 984 1021 00:16:24 00:17:01
16 AB 159 182 00:02:39 00:03:02
17 AB 300 301 00:05:00 00:05:01
18 AB 361 420 00:06:01 00:07:00
19 AB 539 540 00:08:59 00:09:00
20 AB 4319 4741 01:11:59 01:19:01
ce4.1 <- structure(list(code = structure(c(1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 2L), .Label = c("A", "AB", "B", "C"), class = "factor"), start = c(159L, 300L, 361L, 539L, 159L, 180L, 300L, 361L, 539L, 4319L, 159L, 300L, 361L, 539L, 984L, 159L, 300L, 361L, 539L, 4319L), end = c(180L, 301L, 420L, 540L, 160L, 182L, 301L, 420L, 540L, 4741L, 180L, 301L, 420L, 540L, 1021L, 182L, 301L, 420L, 540L, 4741L), Start = structure(c(0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.00184027777777778, 0.00208333333333333, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.0499884259259259, 0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.0113888888888889, 0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.0499884259259259), format = "h:m:s", class = "times"), End = structure(c(0.00208333333333333, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.00185185185185185, 0.00210648148148148, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.0548726851851852, 0.00208333333333333, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.0118171296296296, 0.00210648148148148, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.0548726851851852 ), format = "h:m:s", class = "times")), .Names = c("code", "start", "end", "Start", "End"), row.names = c(NA, -20L), class = c("cmspans", "cmtime", "data.frame"))
code start end Start End
1 A 159 180 00:02:39 00:03:00
2 A 300 301 00:05:00 00:05:01
3 A 361 420 00:06:01 00:07:00
4 A 539 540 00:08:59 00:09:00
5 B 159 160 00:02:39 00:02:40
6 B 180 182 00:03:00 00:03:02
7 B 300 301 00:05:00 00:05:01
8 B 361 420 00:06:01 00:07:00
9 B 539 540 00:08:59 00:09:00
10 B 4319 4741 01:11:59 01:19:01
11 C 159 180 00:02:39 00:03:00
12 C 300 301 00:05:00 00:05:01
13 C 361 420 00:06:01 00:07:00
14 C 539 540 00:08:59 00:09:00
15 C 984 1021 00:16:24 00:17:01
16 AB 159 182 00:02:39 00:03:02
17 AB 300 301 00:05:00 00:05:01
18 AB 361 420 00:06:01 00:07:00
19 AB 539 540 00:08:59 00:09:00
20 AB 4319 4741 01:11:59 01:19:01
21 CnoAB 984 1021 00:16:24 00:17:01
ce4.2 <- structure(list(code = structure(c(1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 2L, 5L), .Label = c("A", "AB", "B", "C", "CnoAB"), class = "factor"), start = c(159L, 300L, 361L, 539L, 159L, 180L, 300L, 361L, 539L, 4319L, 159L, 300L, 361L, 539L, 984L, 159L, 300L, 361L, 539L, 4319L, 984L), end = c(180L, 301L, 420L, 540L, 160L, 182L, 301L, 420L, 540L, 4741L, 180L, 301L, 420L, 540L, 1021L, 182L, 301L, 420L, 540L, 4741L, 1021L), Start = structure(c(0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.00184027777777778, 0.00208333333333333, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.0499884259259259, 0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.0113888888888889, 0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.0499884259259259, 0.0113888888888889), format = "h:m:s", class = "times"), End = structure(c(0.00208333333333333, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.00185185185185185, 0.00210648148148148, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.0548726851851852, 0.00208333333333333, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.0118171296296296, 0.00210648148148148, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.0548726851851852, 0.0118171296296296 ), format = "h:m:s", class = "times")), .Names = c("code", "start", "end", "Start", "End"), class = c("cmspans", "data.frame", "cmtime"), row.names = c(NA, 21L))
gantt_wrap(ce4.2, "code")
r FUN("cm_code.overlap")
provides all the spans (time/words) that are occupied by all of the given codes. For example, if we utilized r FUN("cm_code.overlap")
on code list X and Y the result would be any span where X and Y are both located. This is the AND of the Boolean search. Note that overlap.code.list
must be supplied as a list of named character vectors.
r FT(orange, 5, text="♦")
r FUN("cm_code.overlap")
Single Time Word Exampler FT(orange, 5, text="♦")
(co1 <- cm_code.overlap(x, list(BC=qcv(BB, CC))))
code start end
1 AA 0 10
2 BB 0 10
3 BB 18 19
4 CC 0 3
5 CC 4 6
6 BC 0 3
7 BC 4 6
co1 <- structure(list(code = structure(c(1L, 2L, 2L, 4L, 4L, 3L, 3L), .Label = c("AA", "BB", "BC", "CC"), class = "factor"), start = c(0L, 0L, 18L, 0L, 4L, 0L, 4L), end = c(10L, 10L, 19L, 3L, 6L, 3L, 6L)), .Names = c("code", "start", "end"), row.names = c(NA, -7L), class = c("cmspans", "cmrange", "data.frame"))
gantt_wrap(co1, "code")
r FT(orange, 5, text="♦")
r FUN("cm_code.overlap")
Repeated Measures Word Exampler FT(orange, 5, text="♦")
# Named list of code groupings: each name becomes a new code marking the
# spans where all of its listed codes overlap.
overlist <- list(
  AB = qcv(AA, BB),
  ABC = qcv(AA, BB, CC)
)

# Kept as a separate statement: fused onto the line above, R would try to
# call the result of `list(...)` as a function. The outer parentheses print
# the assigned result.
(co2 <- cm_code.overlap(z, overlist, rm.var = "time"))
code start end time
1 AA 0 10 foo
2 BB 0 10 foo
3 BB 18 19 foo
4 CC 0 3 foo
5 CC 4 6 foo
6 AB 0 10 foo
7 ABC 0 3 foo
8 ABC 4 6 foo
9 AA 3 8 foo2
10 BB 0 4 foo2
11 BB 9 12 foo2
12 CC 0 1 foo2
13 CC 10 11 foo2
14 CC 14 20 foo2
15 AB 3 4 foo2
co2 <- structure(list(code = structure(c(1L, 4L, 4L, 5L, 5L, 2L, 3L, 3L, 1L, 4L, 4L, 5L, 5L, 5L, 2L), .Label = c("AA", "AB", "ABC", "BB", "CC"), class = "factor"), start = c(0L, 0L, 18L, 0L, 4L, 0L, 0L, 4L, 3L, 0L, 9L, 0L, 10L, 14L, 3L), end = c(10L, 10L, 19L, 3L, 6L, 10L, 3L, 6L, 8L, 4L, 12L, 1L, 11L, 20L, 4L), time = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("foo", "foo2"), class = "factor")), .Names = c("code", "start", "end", "time"), row.names = c(NA, -15L), class = c("cmspans", "vname_time", "data.frame"))
gantt_wrap(co2, "code", "time")
r FT(orange, 5, text="♦")
r FUN("cm_code.overlap")
Repeated Measures Time Span Exampler FT(orange, 5, text="♦")
# Named list of code groupings: each name becomes a new code marking the
# spans where all of its listed codes overlap.
overlist2 <- list(
  AB = qcv(A, B),
  BC = qcv(B, C),
  ABC = qcv(A, B, C)
)

# Separate statement (fused onto the previous line it would be parsed as a
# call on the list). `rm.var` is named explicitly, matching the word-span
# repeated measures example; the outer parentheses print the result.
(co3 <- cm_code.overlap(dats, overlist2, rm.var = "time"))
code start end Start End time
1 A 159 180 00:02:39 00:03:00 bar1
2 A 300 301 00:05:00 00:05:01 bar1
3 A 361 420 00:06:01 00:07:00 bar1
4 A 539 540 00:08:59 00:09:00 bar1
5 B 159 160 00:02:39 00:02:40 bar1
6 B 180 182 00:03:00 00:03:02 bar1
7 B 300 301 00:05:00 00:05:01 bar1
8 B 361 420 00:06:01 00:07:00 bar1
9 B 539 540 00:08:59 00:09:00 bar1
10 B 4319 4741 01:11:59 01:19:01 bar1
11 C 159 180 00:02:39 00:03:00 bar1
12 C 300 301 00:05:00 00:05:01 bar1
13 C 361 420 00:06:01 00:07:00 bar1
14 C 539 540 00:08:59 00:09:00 bar1
15 C 984 1021 00:16:24 00:17:01 bar1
16 AB 159 160 00:02:39 00:02:40 bar1
17 AB 300 301 00:05:00 00:05:01 bar1
18 AB 361 420 00:06:01 00:07:00 bar1
19 AB 539 540 00:08:59 00:09:00 bar1
20 BC 159 160 00:02:39 00:02:40 bar1
21 BC 300 301 00:05:00 00:05:01 bar1
22 BC 361 420 00:06:01 00:07:00 bar1
23 BC 539 540 00:08:59 00:09:00 bar1
24 ABC 159 160 00:02:39 00:02:40 bar1
25 ABC 300 301 00:05:00 00:05:01 bar1
26 ABC 361 420 00:06:01 00:07:00 bar1
27 ABC 539 540 00:08:59 00:09:00 bar1
28 A 159 180 00:02:39 00:03:00 bar2
29 A 300 301 00:05:00 00:05:01 bar2
30 A 361 420 00:06:01 00:07:00 bar2
31 A 539 540 00:08:59 00:09:00 bar2
32 B 159 160 00:02:39 00:02:40 bar2
33 B 180 182 00:03:00 00:03:02 bar2
34 B 300 301 00:05:00 00:05:01 bar2
35 B 361 420 00:06:01 00:07:00 bar2
36 B 539 540 00:08:59 00:09:00 bar2
37 B 4319 4741 01:11:59 01:19:01 bar2
38 C 159 180 00:02:39 00:03:00 bar2
39 C 300 301 00:05:00 00:05:01 bar2
40 C 361 420 00:06:01 00:07:00 bar2
41 C 539 540 00:08:59 00:09:00 bar2
42 C 1020 1021 00:17:00 00:17:01 bar2
43 AB 159 160 00:02:39 00:02:40 bar2
44 AB 300 301 00:05:00 00:05:01 bar2
45 AB 361 420 00:06:01 00:07:00 bar2
46 AB 539 540 00:08:59 00:09:00 bar2
47 BC 159 160 00:02:39 00:02:40 bar2
48 BC 300 301 00:05:00 00:05:01 bar2
49 BC 361 420 00:06:01 00:07:00 bar2
50 BC 539 540 00:08:59 00:09:00 bar2
51 ABC 159 160 00:02:39 00:02:40 bar2
52 ABC 300 301 00:05:00 00:05:01 bar2
53 ABC 361 420 00:06:01 00:07:00 bar2
54 ABC 539 540 00:08:59 00:09:00 bar2
co3 <- structure(list(code = structure(c(1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 6L, 6L, 6L, 6L, 6L, 2L, 2L, 2L, 2L, 5L, 5L, 5L, 5L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 6L, 6L, 6L, 6L, 6L, 2L, 2L, 2L, 2L, 5L, 5L, 5L, 5L, 3L, 3L, 3L, 3L), .Label = c("A", "AB", "ABC", "B", "BC", "C"), class = "factor"), start = c(159L, 300L, 361L, 539L, 159L, 180L, 300L, 361L, 539L, 4319L, 159L, 300L, 361L, 539L, 984L, 159L, 300L, 361L, 539L, 159L, 300L, 361L, 539L, 159L, 300L, 361L, 539L, 159L, 300L, 361L, 539L, 159L, 180L, 300L, 361L, 539L, 4319L, 159L, 300L, 361L, 539L, 1020L, 159L, 300L, 361L, 539L, 159L, 300L, 361L, 539L, 159L, 300L, 361L, 539L ), end = c(180L, 301L, 420L, 540L, 160L, 182L, 301L, 420L, 540L, 4741L, 180L, 301L, 420L, 540L, 1021L, 160L, 301L, 420L, 540L, 160L, 301L, 420L, 540L, 160L, 301L, 420L, 540L, 180L, 301L, 420L, 540L, 160L, 182L, 301L, 420L, 540L, 4741L, 180L, 301L, 420L, 540L, 1021L, 160L, 301L, 420L, 540L, 160L, 301L, 420L, 540L, 160L, 301L, 420L, 540L), Start = structure(c(0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.00184027777777778, 0.00208333333333333, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.0499884259259259, 0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.0113888888888889, 0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.00184027777777778, 0.00208333333333333, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.0499884259259259, 0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.0118055555555556, 0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 
0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593), format = "h:m:s", class = "times"), End = structure(c(0.00208333333333333, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.00185185185185185, 0.00210648148148148, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.0548726851851852, 0.00208333333333333, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.0118171296296296, 0.00185185185185185, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.00185185185185185, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.00185185185185185, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.00208333333333333, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.00185185185185185, 0.00210648148148148, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.0548726851851852, 0.00208333333333333, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.0118171296296296, 0.00185185185185185, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.00185185185185185, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.00185185185185185, 0.0034837962962963, 0.00486111111111111, 0.00625), format = "h:m:s", class = "times"), time = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("bar1", "bar2"), class = "factor")), .Names = c("code", "start", "end", "Start", "End", "time"), row.names = c(NA, -54L), class = c("cmspans", "cmtime", "vname_time", "data.frame"))
gantt_wrap(co3, "code")
r FUN("cm_code.transform")
is merely a wrapper for r FUN("cm_code.combine")
, r FUN("cm_code.exclude")
, and r FUN("cm_code.overlap")
.
r FT(orange, 5, text="♦")
r FUN("cm_code.transform")
- Example 1r FT(orange, 5, text="♦")
# One call performs all three transformations on the same three codes:
# overlap (oABC), combine (ABC) and exclude (ABnoC).
ct1 <- cm_code.transform(
  x,
  overlap.code.list = list(oABC = qcv(AA, BB, CC)),
  combine.code.list = list(ABC = qcv(AA, BB, CC)),
  exclude.code.list = list(ABnoC = qcv(AA, BB, CC))
)

# Print the transformed spans (must be its own statement to parse).
ct1
code start end
1 AA 0 10
2 BB 0 10
3 BB 18 19
4 CC 0 3
5 CC 4 6
6 oABC 0 3
7 oABC 4 6
8 ABC 0 10
9 ABC 18 19
10 ABnoC 3 4
11 ABnoC 6 10
12 ABnoC 18 19
ct1 <- structure(list(code = structure(c(1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 6L), .Label = c("AA", "BB", "CC", "oABC", "ABC", "ABnoC"), class = "factor"), start = c(0L, 0L, 18L, 0L, 4L, 0L, 4L, 0L, 18L, 3L, 6L, 18L), end = c(10L, 10L, 19L, 3L, 6L, 3L, 6L, 10L, 19L, 4L, 10L, 19L)), .Names = c("code", "start", "end" ), row.names = c(NA, -12L), class = c("cmspans", "cmrange", "data.frame" ))
gantt_wrap(ct1, "code")
r FT(orange, 5, text="♦")
r FUN("cm_code.transform")
- Example 2r FT(orange, 5, text="♦")
# Repeated measures version: the same overlap/combine/exclude requests,
# applied within each level of the "time" variable.
ct2 <- cm_code.transform(
  z,
  overlap.code.list = list(oABC = qcv(AA, BB, CC)),
  combine.code.list = list(ABC = qcv(AA, BB, CC)),
  exclude.code.list = list(ABnoC = qcv(AA, BB, CC)),
  # Named instead of a trailing positional argument after named ones.
  rm.var = "time"
)

# Print the transformed spans (must be its own statement to parse).
ct2
code start end time
1 AA 0 10 foo
2 BB 0 10 foo
3 BB 18 19 foo
4 CC 0 3 foo
5 CC 4 6 foo
6 oABC 0 3 foo
7 oABC 4 6 foo
14 ABC 0 10 foo
15 ABC 18 19 foo
19 ABnoC 3 4 foo
20 ABnoC 6 10 foo
21 ABnoC 18 19 foo
8 AA 3 8 foo2
9 BB 0 4 foo2
10 BB 9 12 foo2
11 CC 0 1 foo2
12 CC 10 11 foo2
13 CC 14 20 foo2
16 ABC 0 8 foo2
17 ABC 9 12 foo2
18 ABC 14 20 foo2
22 ABnoC 1 8 foo2
23 ABnoC 9 10 foo2
24 ABnoC 11 12 foo2
ct2 <- structure(list(code = structure(c(1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 6L, 1L, 2L, 2L, 3L, 3L, 3L, 5L, 5L, 5L, 6L, 6L, 6L), .Label = c("AA", "BB", "CC", "oABC", "ABC", "ABnoC"), class = "factor"), start = c(0L, 0L, 18L, 0L, 4L, 0L, 4L, 0L, 18L, 3L, 6L, 18L, 3L, 0L, 9L, 0L, 10L, 14L, 0L, 9L, 14L, 1L, 9L, 11L), end = c(10L, 10L, 19L, 3L, 6L, 3L, 6L, 10L, 19L, 4L, 10L, 19L, 8L, 4L, 12L, 1L, 11L, 20L, 8L, 12L, 20L, 8L, 10L, 12L), time = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("foo", "foo2" ), class = "factor")), .Names = c("code", "start", "end", "time"), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 14L, 15L, 19L, 20L, 21L, 8L, 9L, 10L, 11L, 12L, 13L, 16L, 17L, 18L, 22L, 23L, 24L), class = c("cmspans", "cmrange", "data.frame", "vname_time" ))
gantt_wrap(ct2, "code")
r FT(orange, 5, text="♦")
r FUN("cm_code.transform")
- Example 3r FT(orange, 5, text="♦")
# Time span version: overlap (oABC), combine (ABC) and exclude (ABnoC)
# requested in a single call on codes A, B and C.
ct3 <- cm_code.transform(
  dat,
  overlap.code.list = list(oABC = qcv(A, B, C)),
  combine.code.list = list(ABC = qcv(A, B, C)),
  exclude.code.list = list(ABnoC = qcv(A, B, C))
)

# Print the transformed spans (must be its own statement to parse).
ct3
code start end Start End
1 A 159 180 00:02:39 00:03:00
2 A 300 301 00:05:00 00:05:01
3 A 361 420 00:06:01 00:07:00
4 A 539 540 00:08:59 00:09:00
5 B 159 160 00:02:39 00:02:40
6 B 180 182 00:03:00 00:03:02
7 B 300 301 00:05:00 00:05:01
8 B 361 420 00:06:01 00:07:00
9 B 539 540 00:08:59 00:09:00
10 B 4319 4741 01:11:59 01:19:01
11 C 159 180 00:02:39 00:03:00
12 C 300 301 00:05:00 00:05:01
13 C 361 420 00:06:01 00:07:00
14 C 539 540 00:08:59 00:09:00
15 C 984 1021 00:16:24 00:17:01
16 oABC 159 160 00:02:39 00:02:40
17 oABC 300 301 00:05:00 00:05:01
18 oABC 361 420 00:06:01 00:07:00
19 oABC 539 540 00:08:59 00:09:00
20 ABC 159 182 00:02:39 00:03:02
21 ABC 300 301 00:05:00 00:05:01
22 ABC 361 420 00:06:01 00:07:00
23 ABC 539 540 00:08:59 00:09:00
24 ABC 984 1021 00:16:24 00:17:01
25 ABC 4319 4741 01:11:59 01:19:01
26 ABnoC 180 182 00:03:00 00:03:02
27 ABnoC 4319 4741 01:11:59 01:19:01
ct3 <- structure(list(code = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L), .Label = c("A", "B", "C", "oABC", "ABC", "ABnoC" ), class = "factor"), start = c(159L, 300L, 361L, 539L, 159L, 180L, 300L, 361L, 539L, 4319L, 159L, 300L, 361L, 539L, 984L, 159L, 300L, 361L, 539L, 159L, 300L, 361L, 539L, 984L, 4319L, 180L, 4319L), end = c(180L, 301L, 420L, 540L, 160L, 182L, 301L, 420L, 540L, 4741L, 180L, 301L, 420L, 540L, 1021L, 160L, 301L, 420L, 540L, 182L, 301L, 420L, 540L, 1021L, 4741L, 182L, 4741L ), Start = structure(c(0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.00184027777777778, 0.00208333333333333, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.0499884259259259, 0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.0113888888888889, 0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.0113888888888889, 0.0499884259259259, 0.00208333333333333, 0.0499884259259259), format = "h:m:s", class = "times"), End = structure(c(0.00208333333333333, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.00185185185185185, 0.00210648148148148, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.0548726851851852, 0.00208333333333333, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.0118171296296296, 0.00185185185185185, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.00210648148148148, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.0118171296296296, 0.0548726851851852, 0.00210648148148148, 0.0548726851851852), format = "h:m:s", class = "times")), .Names = c("code", "start", "end", "Start", "End"), row.names = c(NA, -27L), class = c("cmspans", "cmtime", "data.frame"))
gantt_wrap(ct3, "code")
r FUN("cm_code.blank")
provides flexible Boolean comparisons between word/time spans. The overlap
argument takes a logical value, an integer, or a character string consisting of a binary operator coupled with an integer. It is important to understand how the function operates. The initial step calls r FUN("cm_long2dummy")
as seen below (stretching the spans to dummy coded columns), the comparison is conducted between columns, and then the columns are reverted back to spans via r FUN("cm_dummy2long")
. This first example illustrates the stretching to dummy and reverting back to spans.
r FT(orange, 5, text="♦")
Long to dummy and dummy to long r FT(orange, 5, text="♦")
# Stretch the spans into dummy coded columns, one column per code.
long2dummy <- cm_long2dummy(x, "variable")

# Show the round trip side by side: the original spans, the dummy coded
# matrix (first list element), and the spans rebuilt from the dummy coding.
list(
  original = x,
  long_2_dummy_format = long2dummy[[1]],
  dummy_back_2_long = cm_dummy2long(long2dummy, "variable")
)
$original
code start end variable
1 AA 0 10 foo
2 BB 0 2 foo
3 BB 2 10 foo
4 BB 18 19 foo
5 CC 0 3 foo
6 CC 4 6 foo
$long_2_dummy_format
AA BB CC
0 1 1 1
1 1 1 1
2 1 1 1
3 1 1 0
4 1 1 1
5 1 1 1
6 1 1 0
7 1 1 0
8 1 1 0
9 1 1 0
10 0 0 0
11 0 0 0
12 0 0 0
13 0 0 0
14 0 0 0
15 0 0 0
16 0 0 0
17 0 0 0
18 0 1 0
19 0 0 0
$dummy_back_2_long
code start end variable
1 AA 0 10 foo
2 BB 0 10 foo
3 BB 18 19 foo
4 CC 0 3 foo
5 CC 4 6 foo
long2dummy <- structure(list(foo = structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), .Dim = c(20L, 3L), .Dimnames = list( c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19"), c("AA", "BB", "CC")))), .Names = "foo", class = c("l2d_cmrange", "list"))
Now let's examine a few uses of r FUN("cm_code.blank")
. The first is to set overlap = TRUE
(the default behavior). This default behavior is identical to r FUN("cm_code.overlap")
as seen below.
r FT(orange, 5, text="♦")
r FUN("cm_code.blank")
- overlap = TRUE
r FT(orange, 5, text="♦")
(cb1 <- cm_code.blank(x, list(ABC=qcv(AA, BB, CC))))
code start end
1 AA 0 10
2 BB 0 10
3 BB 18 19
4 CC 0 3
5 CC 4 6
6 ABC 0 3
7 ABC 4 6
cb1 <- structure(list(code = structure(c(1L, 3L, 3L, 4L, 4L, 2L, 2L), .Label = c("AA", "ABC", "BB", "CC"), class = "factor"), start = c(0L, 0L, 18L, 0L, 4L, 0L, 4L), end = c(10L, 10L, 19L, 3L, 6L, 3L, 6L)), .Names = c("code", "start", "end"), row.names = c(NA, -7L), class = c("cmspans", "cmrange", "data.frame"))
gantt_wrap(cb1, "code")
Next we'll set overlap = FALSE
and see that it is identical to r FUN("cm_code.combine")
.
r FT(orange, 5, text="♦")
r FUN("cm_code.blank")
- overlap = FALSE
r FT(orange, 5, text="♦")
(cb2 <- cm_code.blank(x, list(ABC=qcv(AA, BB, CC)), overlap = FALSE))
code start end
1 AA 0 10
2 BB 0 10
3 BB 18 19
4 CC 0 3
5 CC 4 6
6 ABC 0 10
7 ABC 18 19
cb2 <- structure(list(code = structure(c(1L, 3L, 3L, 4L, 4L, 2L, 2L), .Label = c("AA", "ABC", "BB", "CC"), class = "factor"), start = c(0L, 0L, 18L, 0L, 4L, 0L, 18L), end = c(10L, 10L, 19L, 3L, 6L, 10L, 19L)), .Names = c("code", "start", "end"), row.names = c(NA, -7L), class = c("cmspans", "cmrange", "data.frame"))
gantt_wrap(cb2, "code")
By first combining all codes (see cb2
above) and then excluding the final code by setting
overlap = 1
the behavior of r FUN("cm_code.exclude")
can be mimicked.
r FT(orange, 5, text="♦")
r FUN("cm_code.blank")
- mimicking r FUN("cm_code.exclude")
r FT(orange, 5, text="♦")
## Using the output from `cb2` above.
# The comment sits on its own line so it no longer comments out the call.
# Comparing the combined code ABC with CC at overlap = 1 mimics
# cm_code.exclude; the outer parentheses print the assigned result.
(cb3 <- cm_code.blank(cb2, list(ABnoC = qcv(ABC, CC)), overlap = 1))
code start end
1 AA 0 10
2 BB 0 10
3 BB 18 19
4 CC 0 3
5 CC 4 6
6 ABC 0 10
7 ABC 18 19
8 ABnoC 3 4
9 ABnoC 6 10
10 ABnoC 18 19
cb3 <- structure(list(code = structure(c(1L, 4L, 4L, 5L, 5L, 2L, 2L, 3L, 3L, 3L), .Label = c("AA", "ABC", "ABnoC", "BB", "CC"), class = "factor"), start = c(0L, 0L, 18L, 0L, 4L, 0L, 18L, 3L, 6L, 18L), end = c(10L, 10L, 19L, 3L, 6L, 10L, 19L, 4L, 10L, 19L)), .Names = c("code", "start", "end"), row.names = c(NA, -10L), class = c("cmspans", "cmrange", "data.frame"))
gantt_wrap(cb3, "code")
Next we shall find when at least two codes overlap by setting overlap = ">1"
.
r FT(orange, 5, text="♦")
r FUN("cm_code.blank")
- At least 2 codes overlap r FT(orange, 5, text="♦")
# Code groupings to compare; each name becomes a new code in the output.
blanklist <- list(
  AB = qcv(AA, BB),
  ABC = qcv(AA, BB, CC)
)

# overlap = ">1" requests spans where at least two of the listed codes
# overlap. Separate statement so it is not parsed as a call on the list;
# the outer parentheses print the assigned result.
(cb4 <- cm_code.blank(z, blanklist, rm.var = "time", overlap = ">1"))
code start end time
1 AA 0 10 foo
2 BB 0 10 foo
3 BB 18 19 foo
4 CC 0 3 foo
5 CC 4 6 foo
6 AB 0 10 foo
7 ABC 0 10 foo
8 AA 3 8 foo2
9 BB 0 4 foo2
10 BB 9 12 foo2
11 CC 0 1 foo2
12 CC 10 11 foo2
13 CC 14 20 foo2
14 AB 3 4 foo2
15 ABC 0 1 foo2
16 ABC 3 4 foo2
17 ABC 10 11 foo2
cb4 <- structure(list(code = structure(c(1L, 4L, 4L, 5L, 5L, 2L, 3L, 1L, 4L, 4L, 5L, 5L, 5L, 2L, 3L, 3L, 3L), .Label = c("AA", "AB", "ABC", "BB", "CC"), class = "factor"), start = c(0L, 0L, 18L, 0L, 4L, 0L, 0L, 3L, 0L, 9L, 0L, 10L, 14L, 3L, 0L, 3L, 10L), end = c(10L, 10L, 19L, 3L, 6L, 10L, 10L, 8L, 4L, 12L, 1L, 11L, 20L, 4L, 1L, 4L, 11L), time = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("foo", "foo2" ), class = "factor")), .Names = c("code", "start", "end", "time" ), row.names = c(NA, -17L), class = c("cmspans", "vname_time", "data.frame"))
gantt_wrap(cb4, "code", "time")
Last, we will find spans where not one of the codes occurred by setting overlap = "==0"
.
r FT(orange, 5, text="♦")
r FUN("cm_code.blank")
- Spans where no code occurs r FT(orange, 5, text="♦")
# Code groupings to compare; each name becomes a new code in the output.
blanklist2 <- list(
  noAB = qcv(AA, BB),
  noABC = qcv(AA, BB, CC)
)

# overlap = "==0" requests spans where none of the listed codes occur.
# Separate statement so it is not parsed as a call on the list; the outer
# parentheses print the assigned result.
(cb5 <- cm_code.blank(z, blanklist2, rm.var = "time", overlap = "==0"))
code start end time
1 AA 0 10 foo
2 BB 0 10 foo
3 BB 18 19 foo
4 CC 0 3 foo
5 CC 4 6 foo
6 noAB 10 18 foo
7 noAB 19 20 foo
8 noABC 10 18 foo
9 noABC 19 20 foo
10 AA 3 8 foo2
11 BB 0 4 foo2
12 BB 9 12 foo2
13 CC 0 1 foo2
14 CC 10 11 foo2
15 CC 14 20 foo2
16 noAB 8 9 foo2
17 noAB 12 21 foo2
18 noABC 8 9 foo2
19 noABC 12 14 foo2
20 noABC 20 21 foo2
cb5 <- structure(list(code = structure(c(1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 1L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 5L, 5L, 5L), .Label = c("AA", "BB", "CC", "noAB", "noABC"), class = "factor"), start = c(0L, 0L, 18L, 0L, 4L, 10L, 19L, 10L, 19L, 3L, 0L, 9L, 0L, 10L, 14L, 8L, 12L, 8L, 12L, 20L), end = c(10L, 10L, 19L, 3L, 6L, 18L, 20L, 18L, 20L, 8L, 4L, 12L, 1L, 11L, 20L, 9L, 21L, 9L, 14L, 21L), time = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("foo", "foo2"), class = "factor")), .Names = c("code", "start", "end", "time"), row.names = c(NA, -20L), class = c("cmspans", "vname_time", "data.frame"))
gantt_wrap(cb5, "code", "time")
The r FT(red, text="cm_")
family of functions has three approaches to initial analysis of codes. The researcher may want to summarize, visualize or determine the proximity of codes to one another. The following functions accomplish these tasks:
r HR("#cmsum", "Summary")
r HR("#cmplot", "Plotting")
r HR("#cmdist", "Distance Measures")
Most of the r FT(red, text="cm_")
family of functions have a r FUN("summary", "summary.cmspans")
method that allows for summaries of codes by group. Note that these summaries can be wrapped with r FUN("plot", "plot.sum_cmspans")
to print a heat map of the table of summaries.
r FT(orange, 5, text="♦")
Example 1: Summarizing Transcript/List Approach r FT(orange, 5, text="♦")
## Two transcript lists
# Each element maps a code name to the word ranges it was assigned.
A <- list(
  person_greg = qcv(terms='7:11, 20:24, 30:33, 49:56'),
  person_researcher = qcv(terms='42:48'),
  person_sally = qcv(terms='25:29, 37:41'),
  person_sam = qcv(terms='1:6, 16:19, 34:36'),
  person_teacher = qcv(terms='12:15'),
  adult_0 = qcv(terms='1:11, 16:41, 49:56'),
  adult_1 = qcv(terms='12:15, 42:48'),
  AA = qcv(terms="1"),
  BB = qcv(terms="1:2, 3:10, 19"),
  CC = qcv(terms="1:9, 100:150")
)

B <- list(
  person_greg = qcv(terms='7:11, 20:24, 30:33, 49:56'),
  person_researcher = qcv(terms='42:48'),
  person_sally = qcv(terms='25:29, 37:41'),
  person_sam = qcv(terms='1:6, 16:19, 34:36'),
  person_teacher = qcv(terms='12:15'),
  adult_0 = qcv(terms='1:11, 16:41, 49:56'),
  adult_1 = qcv(terms='12:15, 42:48'),
  AA = qcv(terms="40"),
  BB = qcv(terms="50:90"),
  CC = qcv(terms="60:90, 100:120, 150"),
  DD = qcv(terms="")
)

## Long format for transcript/list approach
# v.name = "time" names the column recording which list (A or B) each
# span came from.
v <- cm_2long(A, B, v.name = "time")
head(v)
code start end time
1 person_greg 6 11 A
2 person_greg 19 24 A
3 person_greg 29 33 A
4 person_greg 48 56 A
5 person_researcher 41 48 A
6 person_sally 24 29 A
v <- structure(list(code = structure(c(1L, 1L, 1L, 1L, 2L, 3L, 3L, 4L, 4L, 4L, 5L, 6L, 6L, 6L, 7L, 7L, 8L, 9L, 9L, 9L, 10L, 10L, 1L, 1L, 1L, 1L, 2L, 3L, 3L, 4L, 4L, 4L, 5L, 6L, 6L, 6L, 7L, 7L, 8L, 9L, 10L, 10L, 10L), .Label = c("person_greg", "person_researcher", "person_sally", "person_sam", "person_teacher", "adult_0", "adult_1", "AA", "BB", "CC"), class = "factor"), start = c(6, 19, 29, 48, 41, 24, 36, 0, 15, 33, 11, 0, 15, 48, 11, 41, 0, 0, 2, 18, 0, 99, 6, 19, 29, 48, 41, 24, 36, 0, 15, 33, 11, 0, 15, 48, 11, 41, 39, 49, 59, 99, 149), end = c(11, 24, 33, 56, 48, 29, 41, 6, 19, 36, 15, 11, 41, 56, 15, 48, 1, 2, 10, 19, 9, 150, 11, 24, 33, 56, 48, 29, 41, 6, 19, 36, 15, 11, 41, 56, 15, 48, 40, 90, 90, 120, 150), time = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("A", "B"), class = "factor")), .Names = c("code", "start", "end", "time"), row.names = c(NA, -43L), class = c("cmspans", "cmrange", "cmrange2long", "vname_time", "data.frame"))
## Summary of the data and plotting the summary
# The call is on its own line so the comment above does not swallow it.
summary(v)
time code total percent_total n percent_n ave min max mean(sd)
1 a person_greg 22 12.0% 4 18.2% 5.5 4 8 5.5(1.7)
2 a person_researcher 7 3.8% 1 4.5% 7.0 7 7 7.0(0)
3 a person_sally 10 5.4% 2 9.1% 5.0 5 5 5.0(0)
4 a person_sam 13 7.1% 3 13.6% 4.3 3 6 4.3(1.5)
5 a person_teacher 4 2.2% 1 4.5% 4.0 4 4 4.0(0)
6 a adult_0 45 24.5% 3 13.6% 15.0 8 26 15.0(9.6)
7 a adult_1 11 6.0% 2 9.1% 5.5 4 7 5.5(2.1)
8 a AA 1 .5% 1 4.5% 1.0 1 1 1.0(0)
9 a BB 11 6.0% 3 13.6% 3.7 1 8 3.7(3.8)
10 a CC 60 32.6% 2 9.1% 30.0 9 51 30.0(29.7)
11 b person_greg 22 10.6% 4 19.0% 5.5 4 8 5.5(1.7)
12 b person_researcher 7 3.4% 1 4.8% 7.0 7 7 7.0(0)
13 b person_sally 10 4.8% 2 9.5% 5.0 5 5 5.0(0)
14 b person_sam 13 6.3% 3 14.3% 4.3 3 6 4.3(1.5)
15 b person_teacher 4 1.9% 1 4.8% 4.0 4 4 4.0(0)
16 b adult_0 45 21.7% 3 14.3% 15.0 8 26 15.0(9.6)
17 b adult_1 11 5.3% 2 9.5% 5.5 4 7 5.5(2.1)
18 b AA 1 .5% 1 4.8% 1.0 1 1 1.0(0)
19 b BB 41 19.8% 1 4.8% 41.0 41 41 41.0(0)
20 b CC 53 25.6% 3 14.3% 17.7 1 31 17.7(15.3)
============================
Unit of measure: words
# Heat map of the summary table for all codes.
plot(summary(v))

# Same heat map, faceted by the "time" variable.
plot(summary(v), facet.vars = "time")
r FT(orange, 5, text="♦")
Example 2: Summarizing Time Spans Approach r FT(orange, 5, text="♦")
## Single time list
# `transcript_time_span` gives the overall span of the transcript; the
# remaining elements map each code to its time ranges.
x <- list(
  transcript_time_span = qcv(00:00 - 1:12:00),
  A = qcv(terms = "2.40:3.00, 5.01, 6.02:7.00, 9.00"),
  B = qcv(terms = "2.40, 3.01:3.02, 5.01, 6.02:7.00, 9.00, 1.12.00:1.19.01"),
  C = qcv(terms = "2.40:3.00, 5.01, 6.02:7.00, 9.00, 17.01")
)

## Long format for time span approach
z <- cm_2long(x)
head(z)
z <- structure(list(code = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), .Label = c("A", "B", "C"), class = "factor"), start = c(159, 300, 361, 539, 159, 180, 300, 361, 539, 4319, 159, 300, 361, 539, 1020), end = c(180, 301, 420, 540, 160, 182, 301, 420, 540, 4741, 180, 301, 420, 540, 1021), Start = structure(c(0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.00184027777777778, 0.00208333333333333, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.0499884259259259, 0.00184027777777778, 0.00347222222222222, 0.00417824074074074, 0.00623842592592593, 0.0118055555555556), format = "h:m:s", class = "times"), End = structure(c(0.00208333333333333, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.00185185185185185, 0.00210648148148148, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.0548726851851852, 0.00208333333333333, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.0118171296296296), format = "h:m:s", class = "times"), variable = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "x", class = "factor")), .Names = c("code", "start", "end", "Start", "End", "variable"), row.names = c(NA, -15L), class = c("cmspans", "cmtime", "cmtime2long", "vname_variable", "data.frame", "spans_4320")) head(z)
## Summary of the data and plotting the summary
# The call is on its own line so the comment above does not swallow it.
summary(z)
code total percent_total n percent_n ave min max mean(sd)
1 A 01:22 12.6% 4 26.7% 20.5 1 59 20.5(27.3)
2 B 08:06 74.7% 6 40.0% 81.0 1 422 81.0(168.6)
3 C 01:23 12.7% 5 33.3% 16.6 1 59 16.6(25.2)
============================
Unit of measure: time
Columns measured in seconds unless in the form hh:mm:ss
plot(summary(z))
r FT(orange, 5, text="♦")
Trouble Shooting Summary: Suppress Measurement Units r FT(orange, 5, text="♦")
## suppress printing measurement units
# Wrapping the explicit print() in suppressMessages() drops the trailing
# "Unit of measure" lines from the printed summary.
suppressMessages(print(summary(z)))
code total percent_total n percent_n ave min max mean(sd)
1 A 01:22 12.6% 4 26.7% 20.5 1 59 20.5(27.3)
2 B 08:06 74.7% 6 40.0% 81.0 1 422 81.0(168.6)
3 C 01:23 12.7% 5 33.3% 16.6 1 59 16.6(25.2)
r FT(orange, 5, text="♦")
Trouble Shooting Summary: Print as Dataframe r FT(orange, 5, text="♦")
## remove print method
# Resetting the class to a plain data.frame makes `z` print with the
# default data.frame method.
class(z) <- "data.frame"
z
Like r FUN("summary", "summary.cmspans")
, most of the r FT(red, text="cm_")
family of functions have a r FUN("plot", "plot.cmspans")
method as well that allows a Gantt plot visualization of codes by group.
r FT(orange, 5, text="♦")
Gantt Plot of Transcript/List or Time Spans Data r FT(orange, 5, text="♦")
## Two transcript lists
# Each element maps a code name to the word ranges it was assigned.
A <- list(
  person_greg = qcv(terms='7:11, 20:24, 30:33, 49:56'),
  person_researcher = qcv(terms='42:48'),
  person_sally = qcv(terms='25:29, 37:41'),
  person_sam = qcv(terms='1:6, 16:19, 34:36'),
  person_teacher = qcv(terms='12:15'),
  adult_0 = qcv(terms='1:11, 16:41, 49:56'),
  adult_1 = qcv(terms='12:15, 42:48'),
  AA = qcv(terms="1"),
  BB = qcv(terms="1:2, 3:10, 19"),
  CC = qcv(terms="1:9, 100:150")
)

B <- list(
  person_greg = qcv(terms='7:11, 20:24, 30:33, 49:56'),
  person_researcher = qcv(terms='42:48'),
  person_sally = qcv(terms='25:29, 37:41'),
  person_sam = qcv(terms='1:6, 16:19, 34:36'),
  person_teacher = qcv(terms='12:15'),
  adult_0 = qcv(terms='1:11, 16:41, 49:56'),
  adult_1 = qcv(terms='12:15, 42:48'),
  AA = qcv(terms="40"),
  BB = qcv(terms="50:90"),
  CC = qcv(terms="60:90, 100:120, 150"),
  DD = qcv(terms="")
)

## Long format
x <- cm_2long(A, v.name = "time")     # single transcript
y <- cm_2long(A, B, v.name = "time")  # both transcripts (repeated measure)

## cm_code family
combs <- list(sam_n_sally = qcv(person_sam, person_sally))
# Combine on `y`, the two-transcript object built in this chunk, rather
# than on an object left over from an earlier example.
z <- cm_code.combine(y, combs, "time")
x <- structure(list(code = structure(c(1L, 1L, 1L, 1L, 2L, 3L, 3L, 4L, 4L, 4L, 5L, 6L, 6L, 6L, 7L, 7L, 8L, 9L, 9L, 9L, 10L, 10L), .Label = c("person_greg", "person_researcher", "person_sally", "person_sam", "person_teacher", "adult_0", "adult_1", "AA", "BB", "CC"), class = "factor"), start = c(6, 19, 29, 48, 41, 24, 36, 0, 15, 33, 11, 0, 15, 48, 11, 41, 0, 0, 2, 18, 0, 99), end = c(11, 24, 33, 56, 48, 29, 41, 6, 19, 36, 15, 11, 41, 56, 15, 48, 1, 2, 10, 19, 9, 150), time = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "A", class = "factor")), .Names = c("code", "start", "end", "time"), row.names = c(NA, -22L), class = c("cmspans", "cmrange", "cmrange2long", "vname_time", "data.frame")) y <- structure(list(code = structure(c(1L, 1L, 1L, 1L, 2L, 3L, 3L, 4L, 4L, 4L, 5L, 6L, 6L, 6L, 7L, 7L, 8L, 9L, 9L, 9L, 10L, 10L, 1L, 1L, 1L, 1L, 2L, 3L, 3L, 4L, 4L, 4L, 5L, 6L, 6L, 6L, 7L, 7L, 8L, 9L, 10L, 10L, 10L), .Label = c("person_greg", "person_researcher", "person_sally", "person_sam", "person_teacher", "adult_0", "adult_1", "AA", "BB", "CC"), class = "factor"), start = c(6, 19, 29, 48, 41, 24, 36, 0, 15, 33, 11, 0, 15, 48, 11, 41, 0, 0, 2, 18, 0, 99, 6, 19, 29, 48, 41, 24, 36, 0, 15, 33, 11, 0, 15, 48, 11, 41, 39, 49, 59, 99, 149), end = c(11, 24, 33, 56, 48, 29, 41, 6, 19, 36, 15, 11, 41, 56, 15, 48, 1, 2, 10, 19, 9, 150, 11, 24, 33, 56, 48, 29, 41, 6, 19, 36, 15, 11, 41, 56, 15, 48, 40, 90, 90, 120, 150), time = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("A", "B"), class = "factor")), .Names = c("code", "start", "end", "time"), row.names = c(NA, -43L), class = c("cmspans", "cmrange", "cmrange2long", "vname_time", "data.frame")) z <- structure(list(code = structure(c(6L, 6L, 6L, 6L, 7L, 8L, 8L, 9L, 9L, 9L, 10L, 2L, 2L, 2L, 3L, 3L, 1L, 4L, 4L, 5L, 5L, 11L, 11L, 
11L, 11L, 6L, 6L, 6L, 6L, 7L, 8L, 8L, 9L, 9L, 9L, 10L, 2L, 2L, 2L, 3L, 3L, 1L, 4L, 5L, 5L, 5L, 11L, 11L, 11L, 11L), .Label = c("AA", "adult_0", "adult_1", "BB", "CC", "person_greg", "person_researcher", "person_sally", "person_sam", "person_teacher", "sam_n_sally" ), class = "factor"), start = c(6L, 19L, 29L, 48L, 41L, 24L, 36L, 0L, 15L, 33L, 11L, 0L, 15L, 48L, 11L, 41L, 0L, 0L, 18L, 0L, 99L, 0L, 15L, 24L, 33L, 6L, 19L, 29L, 48L, 41L, 24L, 36L, 0L, 15L, 33L, 11L, 0L, 15L, 48L, 11L, 41L, 39L, 49L, 59L, 99L, 149L, 0L, 15L, 24L, 33L), end = c(11L, 24L, 33L, 56L, 48L, 29L, 41L, 6L, 19L, 36L, 15L, 11L, 41L, 56L, 15L, 48L, 1L, 10L, 19L, 9L, 150L, 6L, 19L, 29L, 41L, 11L, 24L, 33L, 56L, 48L, 29L, 41L, 6L, 19L, 36L, 15L, 11L, 41L, 56L, 15L, 48L, 40L, 90L, 90L, 120L, 150L, 6L, 19L, 29L, 41L), time = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("A", "B"), class = "factor")), .Names = c("code", "start", "end", "time"), row.names = c(NA, -50L), class = c("cmspans", "vname_time", "data.frame"))
# Gantt plots of the three objects; one call per line so each parses as
# its own statement.
plot(x, title = "Single")
plot(y, title = "Repeated Measure")
plot(z, title = "Combined Codes")
Often a researcher will want to know which codes are clustering closer to other codes (regardless of whether the codes represent word or time spans). r FUN("cm_distance")
allows the researcher to find the distances between codes and standardize the mean of the differences to allow for comparisons similar to a correlation. The matrix output from r FUN("cm_distance")
is arrived at by taking the means and standard deviations of the differences between codes and scaling them (without centering) and then multiplying the two together. This results in a standardized distance measure that is non-negative, with values closer to zero indicating codes that are found in closer proximity.
The researcher may also access the means, standard deviations and number of codes by indexing the list output for each transcript. This distance measure complements the Gantt plot.
Note that the argument causal = FALSE (the default) does not assume Code A comes before Code B whereas causal = TRUE assumes the first code precedes the second code. Generally, setting causal = FALSE will result in a larger mean of differences and accompanying standardized values. Also note that row names are the first code and column names are the second comparison code. The values for Code A compared to Code B will not be the same as Code B compared to Code A. This is because, unlike a true distance measure, r FUN("cm_distance")
's matrix is asymmetrical. r FUN("cm_distance")
computes the distance by taking each span (start and end) for Code A and comparing it to the nearest start or end for Code B. So for example there may be 6 Code A spans and thus six differences between A and B, whereas Code B may only have 3 spans and thus three differences between B and A. This fact alone will lead to differences in A compared to B versus B compared to A.
r FT(orange, 5, text="♦")
r FUN("cm_distance")
- Initial Data Setup r FT(orange, 5, text="♦")
## Coded time spans for the first transcript: one comma-separated string of
## spans per code (A, B, C), parsed into a character vector by `qcv`
x <- list(
    transcript_time_span = qcv(00:00 - 1:12:00),
    A = qcv(terms = "2.40:3.00, 6.32:7.00, 9.00, 10.00:11.00, 33.23:40.00, 59.56"),
    B = qcv(terms = "3.01:3.02, 5.01, 19.00, 1.12.00:1.19.01"),
    C = qcv(terms = "2.40:3.00, 5.01, 6.32:7.00, 9.00, 17.01, 38.09:40.00")
)

## Coded time spans for the second transcript
y <- list(
    transcript_time_span = qcv(00:00 - 1:12:00),
    A = qcv(terms = "2.40:3.00, 6.32:7.00, 9.00, 10.00:11.00, 23.44:25.00, 59.56"),
    ## "7.05:8.00" and "19.30" are two separate spans, so they are comma
    ## separated like every other span in these lists
    B = qcv(terms = "3.01:3.02, 5.01, 7.05:8.00, 19.30, 1.12.00:1.19.01"),
    C = qcv(terms = "2.40:3.00, 5.01, 6.32:7.30, 9.00, 17.01, 25.09:27.00")
)

## Long format
dat <- cm_2long(x, y)
dat <- structure(list(code = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("A", "B", "C"), class = "factor"), start = c(159, 391, 539, 599, 2002, 3595, 180, 300, 1139, 4319, 159, 300, 391, 539, 1020, 2288, 159, 391, 539, 599, 1423, 3595, 180, 300, 424, 1169, 4319, 159, 300, 391, 539, 1020, 1508), end = c(180, 420, 540, 660, 2400, 3596, 182, 301, 1140, 4741, 180, 301, 420, 540, 1021, 2400, 180, 420, 540, 660, 1500, 3596, 182, 301, 480, 1170, 4741, 180, 301, 450, 540, 1021, 1620), Start = structure(c(0.00184027777777778, 0.00452546296296296, 0.00623842592592593, 0.00693287037037037, 0.0231712962962963, 0.0416087962962963, 0.00208333333333333, 0.00347222222222222, 0.0131828703703704, 0.0499884259259259, 0.00184027777777778, 0.00347222222222222, 0.00452546296296296, 0.00623842592592593, 0.0118055555555556, 0.0264814814814815, 0.00184027777777778, 0.00452546296296296, 0.00623842592592593, 0.00693287037037037, 0.0164699074074074, 0.0416087962962963, 0.00208333333333333, 0.00347222222222222, 0.00490740740740741, 0.0135300925925926, 0.0499884259259259, 0.00184027777777778, 0.00347222222222222, 0.00452546296296296, 0.00623842592592593, 0.0118055555555556, 0.0174537037037037), format = "h:m:s", class = "times"), End = structure(c(0.00208333333333333, 0.00486111111111111, 0.00625, 0.00763888888888889, 0.0277777777777778, 0.0416203703703704, 0.00210648148148148, 0.0034837962962963, 0.0131944444444444, 0.0548726851851852, 0.00208333333333333, 0.0034837962962963, 0.00486111111111111, 0.00625, 0.0118171296296296, 0.0277777777777778, 0.00208333333333333, 0.00486111111111111, 0.00625, 0.00763888888888889, 0.0173611111111111, 0.0416203703703704, 0.00210648148148148, 0.0034837962962963, 0.00555555555555556, 0.0135416666666667, 0.0548726851851852, 0.00208333333333333, 0.0034837962962963, 0.00520833333333333, 0.00625, 0.0118171296296296, 0.01875), format = "h:m:s", class = 
"times"), variable = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L ), .Label = c("x", "y"), class = "factor")), .Names = c("code", "start", "end", "Start", "End", "variable"), row.names = c(NA, -33L), class = c("cmspans", "cmtime", "cmtime2long", "vname_variable", "data.frame", "spans_4320||4320"))
plot(dat, title="Plot of the Codes")
r FT(orange, 5, text="♦")
r FUN("cm_distance")
- Non-Causal Distance r FT(orange, 5, text="♦")
## a cm_distance output (out1 <- cm_distance(dat, time.var = "variable"))
x
standardized:
A B C
A 0.00 1.04 0.82
B 0.88 0.00 3.89
C 0.09 0.95 0.00
y
standardized:
A B C
A 0.00 0.38 1.97
B 0.47 0.00 4.94
C 0.08 0.09 0.00
## The elements available from the output names(out1)
[1] "x" "y"
## A list containing means, standard deviations and other ## descriptive statistics for the differences between codes out1$x
$mean
A B C
A 0.00 367.67 208.67
B 322.50 0.00 509.00
C 74.67 265.00 0.00
$sd
A B C
A 0.00 347.51 483.27
B 337.47 0.00 940.94
C 143.77 440.92 0.00
$n
A B C
A 6 6 6
B 4 4 4
C 6 6 6
$combined
A B C
A n=6 367.67(347.51)n=6 208.67(483.27)n=6
B 322.5(337.47)n=4 n=4 509(940.94)n=4
C 74.67(143.77)n=6 265(440.92)n=6 n=6
$standardized
A B C
A 0.00 1.04 0.82
B 0.88 0.00 3.89
C 0.09 0.95 0.00
r FT(orange, 5, text="♦")
r FUN("cm_distance")
- Causal Distance r FT(orange, 5, text="♦")
## a cm_distance output `causal = TRUE` cm_distance(dat, time.var = "variable", causal = TRUE)
x
standardized:
A B C
A 0.66 0.84 0.08
B 0.29 3.96 0.49
C 0.40 0.86 0.37
y
standardized:
A B C
A 1.11 1.63 0.08
B 0.03 2.95 0.04
C 0.70 1.27 0.11
A researcher often needs to quickly gather frequency counts for various words/word types. qdap offers multiple functions designed to efficiently generate descriptive word statistics by any combination of grouping variables. Many of the functions also offer proportional usage to more fairly compare between groups. Additionally, many functions also have plotting methods to better visualize the data that is transformed.
Often a researcher may want to get a general sense of how words are functioning for different grouping variables. The r FUN("word_stats")
function enables a quick picture of what is occurring within the data. The displayed (printed) output is a dataframe, however, the output from word_stats is actually a list. Use r FUN("?word_stats", "word_stats")
to learn more.
The displayed output is a wide dataframe, hence the abbreviated column names. The following column names and meanings will provide guidance in understanding the output:
r FT(orange, 5, text="♦")
r FUN("word_stats")
Example r FT(orange, 5, text="♦")
Note that the initial output is broken into three dataframe outputs because of the width of printed output from r FUN("word_stats")
being so large. The user will see that these three dataframes are actually one wide dataframe in the R output.
## Word statistics by person; the outer parentheses print the result as
## well as storing it
(desc_wrds <- with(mraja1spl, word_stats(dialogue, person, tot = tot)))

## Second run: the first run's output is supplied in place of the text
## variable, this time with `digits = 1`
desc_wrds2 <- with(
    mraja1spl,
    word_stats(desc_wrds, person, tot = tot, digits = 1)
)

## Show the wide `gts` table in three pieces, repeating column 1 in each
desc_wrds2$gts[, 1:9]
desc_wrds2$gts[, c(1, 10:19)]
desc_wrds2$gts[, c(1, 20:26)]

## The following shows all the available elements in the `word_stats` output
names(desc_wrds)
r FUN("word_stats")
has a plot method that plots the output as a heat map. This can be useful for finding high/low elements in the data set.
r FT(orange, 5, text="♦")
r FUN("word_stats")
Plot r FT(orange, 5, text="♦")
plot(desc_wrds)
plot(desc_wrds, label=TRUE, lab.digits = 1)
It takes considerable time to run r FUN("word_stats")
because it is calculating syllable counts. The user may re-use the object output from one run and pass this as the text variable (text.var
) in a subsequent run with different grouping variables (grouping.vars
) as long as the text variable has not changed. The example below demonstrates how to re-use the output from one r FUN("word_stats")
run in another run.
r FT(orange, 5, text="♦")
r FUN("word_stats")
Re-use r FT(orange, 5, text="♦")
with(mraja1spl, word_stats(desc_wrds, list(sex, fam.aff, died), tot = tot))
Many analyses with words involve a matrix based on the words. qdap uses a word frequency matrix (r FUN("wfm", "Word_Frequency_Matrix")
) or the less malleable dataframe version, word frequency dataframe (r FUN("wfdf", "Word_Frequency_Matrix")
). The r FUN("wfm", "Word_Frequency_Matrix")
is a count of word usages per grouping variable(s). This is a similar concept to the r HR("http://cran.r-project.org/web/packages/tm/index.html", "tm package's")
Term Document Matrix, though instead of documents we are interested in the grouping variable's usage of terms. r FUN("wfm", "Word_Frequency_Matrix")
is the general function that should be used, however, the r FUN("wfdf", "Word_Frequency_Matrix")
function does provide options for margin sums (row and column). Also note that the r FUN("wfm_expanded", "Word_Frequency_Matrix")
and r FUN("wfm_combine", "Word_Frequency_Matrix")
can expand or combine terms within a word frequency matrix.
r FT(orange, 5, text="♦")
r FUN("wfm", "Word_Frequency_Matrix")
Examples r FT(orange, 5, text="♦")
## By a single grouping variable with(DATA, wfm(state, person))[1:15, ] ## By two grouping variables with(DATA, wfm(state, list(sex, adult)))[1:15, ]
r FT(orange, 5, text="♦")
r FUN("wfm", "Word_Frequency_Matrix")
: Keep Two Word Phrase as a Single Term r FT(orange, 5, text="♦")
## insert double tilde ("~~") to keep phrases(e. g., first last name) space_keeps <- c(" fun", "I ") state2 <- space_fill(DATA$state, space_keeps, rm.extra = FALSE) with(DATA, wfm(state2, list(sex, adult)))[1:18, ]
At times it may be useful to view the correlation between word occurrences between turns of talk or other useful groupings. The user can utilize the output from r FUN("wfm", "Word_Frequency_Matrix")
to accomplish this.
`r FT(orange, 5, text="♦")` **`r FUN("wfm", "Word_Frequency_Matrix")`: Word Correlations** `r FT(orange, 5, text="♦")`
## Stored matrix read from disk
## NOTE(review): presumably a precomputed copy of the `dat` built below - confirm
dat <- readRDS("data/wfmcor.rds")

library(reports)

## One grouping level per act and (padded) turn of talk, e.g. "act|turn"
x <- factor(paste(rajSPLIT$act, pad(TOT(rajSPLIT$tot)), sep = "|"))
dat <- wfm(rajSPLIT$dialogue, x)

## Correlate selected words across the groupings (words become columns
## once the matrix is transposed)
cor(t(dat)[, c("romeo", "juliet")])
cor(t(dat)[, c("romeo", "banished")])
cor(t(dat)[, c("romeo", "juliet", "hate", "love")])

## Heat map of all word-by-word correlations for the small DATA set
dat2 <- wfm(DATA$state, id(DATA))
qheat(
    cor(t(dat2)),
    low = "yellow", high = "red", grid = "grey90",
    diag.na = TRUE, by.column = NULL
)
r FT(orange, 5, text="♦")
r FUN("wfdf", "Word_Frequency_Matrix")
Examples: Add Margins r FT(orange, 5, text="♦")
with(DATA, wfdf(state, person, margins = TRUE))[c(1:15, 41:42), ] with(DATA, wfdf(state, list(sex, adult), margins = TRUE))[c(1:15, 41:42), ]
r FT(orange, 5, text="♦")
r FUN("wfm_expanded", "Word_Frequency_Matrix")
: Expand the wfm r FT(orange, 5, text="♦")
## Start with a word frequency matrix z <- wfm(DATA$state, DATA$person) ## Note a single `you` z[30:41, ] ## Note that there are two `you`s in the expanded version wfm_expanded(z)[33:45, ]
r FT(orange, 5, text="♦")
r FUN("wfm_combine", "Word_Frequency_Matrix")
: Combine Terms in the wfm r FT(orange, 5, text="♦")
## Start with a word frequency matrix
x <- wfm(DATA$state, DATA$person)

## Named groups of terms to combine
WL <- list(
    random = c("the", "fun", "i"),
    yous = c("you", "your", "you're")
)

## Combine the terms and print the result
out <- wfm_combine(x, WL)
out

## Pass the combined version to Chi Squared Test
chisq.test(out)
r FT(orange, 5, text="♦")
r FUN("wfm", "Word_Frequency_Matrix")
: Correspondence Analysis Example r FT(orange, 5, text="♦")
library(ca)

## Grab Just the Candidates
dat <- pres_debates2012
dat <- dat[dat$person %in% qcv(ROMNEY, OBAMA), ]

## Stem the text
speech <- stemmer(dat$dialogue)

## With 25 words removed
mytable1 <- with(dat, wfm(speech, list(person, time), stopwords = Top25Words))

## CA on the table built above (the object is `mytable1`; no `mytable`
## is ever created)
fit <- ca(mytable1)
summary(fit)
plot(fit)
plot3d.ca(fit, labels=1)

## With 200 words removed
mytable2 <- with(dat, wfm(speech, list(person, time), stopwords = Top200Words))

## CA
fit2 <- ca(mytable2)
summary(fit2)
plot(fit2)
plot3d.ca(fit2, labels=1)
Some packages that could further the analysis of qdap expect a Document Term or Term Document Matrix. qdap's r FUN("wfm", "Word_Frequency_Matrix")
is similar to the r HR("http://cran.r-project.org/web/packages/tm/index.html", "tm package's")
r HR("http://www.inside-r.org/packages/cran/tm/docs/DocumentTermMatrix", "TermDocumentMatrix")
and r HR("http://www.inside-r.org/packages/cran/tm/docs/DocumentTermMatrix", "DocumentTermMatrix")
. qdap does not try to replicate the extensive work of the r HR("http://cran.r-project.org/web/packages/tm/index.html", "tm")
package, however, the r FUN("as.tdm")
and r FUN("as.dtm", "as.tdm")
do attempt to extend the work the researcher conducts in qdap to be utilized in other R packages. For a vignette describing qdap-tm compatibility use browseVignettes(package = "qdap")
or \r HR2("http://cran.r-project.org/web/packages/qdap/vignettes/tm_package_compatibility.pdf", "Click Here")
.
r FT(orange, 5, text="♦")
r FUN("as.tdm")
Use r FT(orange, 5, text="♦")
x <- wfm(DATA$state, DATA$person) ## Term Document Matrix as.tdm(x) ## Document Term Matrix as.dtm(x)
## ```r ## ## Run Latent Semantic Analysis ## library(lsa) ## lsa(as.tdm(x), dims=dimcalc_share()) ## ``` ## ## ## <pre><code>$tk ## [,1] [,2] ## about -0.021153126 0.072269368 ## already -0.169239530 -0.124825133 ## am -0.169239530 -0.124825133 ## are -0.021153126 0.072269368 ## be -0.021153126 0.072269368 ## can -0.021153126 0.072269368 ## certain -0.021153126 0.072269368 ## computer -0.090637878 0.215786300 ## distrust -0.090637878 0.215786300 ## do -0.001903917 0.014326564 ## dumb -0.169239530 -0.124825133 ## eat -0.169239530 -0.124825133 ## fun -0.181275756 0.431572601 ## good -0.001108363 0.009865681 ## how -0.021153126 0.072269368 ## hungry -0.169239530 -0.124825133 ## i -0.259877408 0.090961168 ## i'm -0.169239530 -0.124825133 ## is -0.259877408 0.090961168 ## it -0.090637878 0.215786300 ## it's -0.338479060 -0.249650265 ## let's -0.169239530 -0.124825133 ## liar -0.090637878 0.215786300 ## move -0.001108363 0.009865681 ## no -0.338479060 -0.249650265 ## not -0.259877408 0.090961168 ## on -0.001108363 0.009865681 ## shall -0.001108363 0.009865681 ## should -0.001903917 0.014326564 ## stinks -0.090637878 0.215786300 ## talking -0.021153126 0.072269368 ## telling -0.169239530 -0.124825133 ## the -0.169239530 -0.124825133 ## then -0.001108363 0.009865681 ## there -0.169239530 -0.124825133 ## too -0.090637878 0.215786300 ## truth -0.169239530 -0.124825133 ## way -0.169239530 -0.124825133 ## we -0.024165406 0.096461613 ## what -0.023057043 0.086595932 ## you -0.371668412 0.379016836 ## ## $dk ## [,1] [,2] ## greg -0.876176894 -0.47984657 ## researcher -0.005738152 0.03792516 ## sally -0.109512712 0.27781431 ## sam -0.469245067 0.82951496 ## teacher -0.009856846 0.05507346 ## ## $sk ## [1] 5.177141 3.844150 ## ## attr(,"class") ## [1] "LSAspace" ## </code></pre>
The r FUN("termco")
family of functions are some of the most useful qdap functions for quantitative discourse analysis. r FUN("termco")
searches for (and optionally groups) terms and outputs a raw count, percent, and combined (raw/percent) matrix of term counts by grouping variable. The r FUN("term_match", "termco")
, r FUN("all_words")
, r FUN("syn", "synonyms")
, r FUN("exclude")
, and r FUN("spaste")
are complementary functions that are useful in developing word lists to provide to the match.list.
The match.list acts to search for similarly grouped themes. For example c(" read ", " reads", " reading", " reader") may be a search for words associated with reading. It is good practice to name the vectors of words that are stored in the match.list . This is the general form for how to set up a match.list:
themes <- list( theme_1 = c(), theme_2 = c(), theme_n = c() )
It is important to understand how the match.list is handled by `r FUN("termco")`. The match.list is (optionally) case and character sensitive. Spacing is an important way to grab specific words and requires careful thought. For example using "read" will find the words "bread", "read", "reading", and "ready". If you want to search for just the word "read" supply a vector of c(" read ", " reads", " reading", " reader"). Notice the leading and trailing spaces. A space acts as a boundary whereas starting/ending with a nonspace allows for greedy matching that will find words that contain this term. A leading space, a trailing space, or both may be used to control how `r FUN("termco")` searches for the supplied terms. So the reader may ask why not supply one string spaced as " read"? Keep in mind that `r FUN("termco")` would also find the word "ready".
This section's examples will first view the complementary functions that augment the themes supplied to match.list and then the main r FUN("termco")
function will be explored.
r FUN("term_match", "termco")
looks through a text variable (usually the text found in the transcript) and finds/returns a vector of words containing a term(s).
r FT(orange, 5, text="♦")
r FUN("term_match", "termco")
and r FUN("exclude")
Examplesr FT(orange, 5, text="♦")
term_match(text.var = DATA$state, terms = qcv(the, trust), return.list = FALSE) term_match(DATA$state, "i", FALSE) exclude(term_match(DATA$state, "i", FALSE), talking, telling)
r FUN("all_words")
is similar to r FUN("term_match", "termco")
, however, the function looks at all the words found in a text variable (usually the transcript text) and returns words that begin with or contain the term(s). The output can be arranged alphabetically or by frequency. The output is a dataframe which helps the researcher to make decisions with regard to frequency of word use.
r FT(orange, 5, text="♦")
r FUN("all_words")
Examplesr FT(orange, 5, text="♦")
x1 <- all_words(raj$dialogue, begins.with="re") head(x1, 10) all_words(raj$dialogue, begins.with="q") all_words(raj$dialogue, contains="conc") x2 <- all_words(raj$dialogue) head(x2, 10) x3 <- all_words(raj$dialogue, alphabetical = FALSE) head(x3, 10)
The r FUN("synonyms")
(short hand: r FUN("syn", "synonyms")
) function finds words that are synonyms of a given set of terms and returns either a list or vector that can be passed to r FUN("termco")
's match.list.
r FT(orange, 5, text="♦")
r FUN("synonyms")
Examplesr FT(orange, 5, text="♦")
synonyms(c("the", "cat", "job", "environment", "read", "teach")) head(syn(c("the", "cat", "job", "environment", "read", "teach"), return.list = FALSE), 30) syn(c("the", "cat", "job", "environment", "read", "teach"), multiwords = FALSE)
r FT(orange, 5, text="♦")
r FUN("termco")
- Simple Exampler FT(orange, 5, text="♦")
## Make a small dialogue data set (printed as it is created)
(dat2 <- data.frame(
    dialogue=c("@bryan is bryan good @br", "indeed", "@ brian"),
    person=qcv(A, B, A)
))

## The word list to search for: two named groups plus one unnamed term
ml <- list(
    wrds=c("bryan", "indeed"),
    "@",
    bryan=c("bryan", "@ br", "@br")
)

## Search by person
with(dat2, termco(dialogue, person, match.list=ml))

## Search by person proportion output
with(dat2, termco(dialogue, person, match.list=ml, percent = FALSE))
`r FT(orange, 5, text="♦")` `r FUN("termco")` - Romeo and Juliet Act 1 Example`r FT(orange, 5, text="♦")`
## Word list to search for
## Note: In the last vector using "the" will actually
## include the other 3 versions
ml2 <- list(
    theme_1 = c(" the ", " a ", " an "),
    theme_2 = " I'",
    "good",
    the_words = c("the", " the ", " the", "the ")
)

(out <- with(raj.act.1, termco(dialogue, person, ml2)))

## Available elements in the termco output (use out$...)
names(out)

## Raw and proportion - useful for presenting in tables
out$rnp

## Raw - useful for performing calculations
out$raw

## Proportion - useful for performing calculations
out$prop
r FT(orange, 5, text="♦")
Using r FUN("termco")
with r FUN("term_match", "termco")
and r FUN("exclude")
r FT(orange, 5, text="♦")
## Example 1 termco(DATA$state, DATA$person, exclude(term_match(DATA$state, qcv(th), FALSE), "truth")) ## Example 2 MTCH.LST <- exclude(term_match(DATA$state, qcv(th, i)), qcv(truth, stinks)) termco(DATA$state, DATA$person, MTCH.LST)
r FT(orange, 5, text="♦")
Using r FUN("termco")
with r FUN("syn")
r FT(orange, 5, text="♦")
syns <- synonyms("doubt") syns[1]
termco(DATA$state, DATA$person, unlist(syns[1]))
termco(DATA$state, DATA$person, unlist(syns[1]))$rnp[, c(1:5, 9:10)]
synonyms("doubt", FALSE) termco(DATA$state, DATA$person, list(doubt = synonyms("doubt", FALSE)))
termco(DATA$state, DATA$person, syns)
termco(DATA[["state"]], DATA[["person"]], syns)$rnp[, c(1:4, 7:8)]
r FUN("termco")
also has a plot method that plots a heat map of the r FUN("termco")
output based on the percent usage by grouping variable. This allows for rapid visualizations of patterns and enables fast spotting of extreme values. Here are some plots from the r HR("#rajex", "Romeo and Juliet Act 1 Example")
above.
r FT(orange, 5, text="♦")
Using r FUN("termco")
Plottingr FT(orange, 5, text="♦")
plot(out)
plot(out, label = TRUE)
A researcher may be interested in classifying and investigating the types of questions used within dialogue.
r FUN("question_type")
provides question classification. The algorithm searches for the following interrogative words (and optionally, their negative contraction form as well):
## Interrogative words to display; a trailing `*` is the marker referred to
## in the explanation that follows this table
wrds <- c(
    "are*", "can*", "correct", "could", "did*", "do*", "does*", "had*",
    "has", "have*", "how", "is", "may", "might*", "must*", "ok", "right",
    "shall", "should", "was*", "were*", "what", "when", "where", "which",
    "who", "whom", "whose", "why", "will*", "would*", "implied do/does/did"
)

## Lay the words out in 5 columns; 3 empty strings fill the remaining cells
wrds2 <- data.frame(matrix(c(wrds, rep("", 3)), ncol = 5))

## Field widths: column 5 is sized from the longest word, columns 1-4 from
## the second-longest word
padding <- 3 + max(nchar(wrds))
padding2 <- 8 + sort(nchar(wrds), decreasing = TRUE)[2]

## Left-justify every cell to its field width
out <- apply(wrds2[, 1:4], 2, function(x) {
    sprintf(paste0("%-", padding2, "s"), x)
})
out2 <- apply(wrds2[, 5, drop=FALSE], 2, function(x) {
    sprintf(paste0("%-", padding, "s"), x)
})

## Print one table row per line
## NOTE(review): paste2 presumably joins the columns of each row - confirm
cat(paste(paste2(cbind(out, out2), sep ="", trim=FALSE), collapse = "\n"))
The interrogative word that is found first (with the exception of "ok", "right" and "correct") in the question determines the sentence type. "ok", "right" and "correct" sentence types are determined if the sentence is a question with no other interrogative words found and "ok", "right" or "correct" is the last word of the sentence. Those interrogative sentences beginning with the word "you", "wanna", or "want" are categorized as implying do/does/did question type, though the use of do/does is not explicit. Those sentences beginning with "you" followed by a select interrogative word (and/or their negative counterparts) above (marked with *) or 1-2 amplifier(s) followed by the select interrogative word are categorized by the select word rather than an implied do/does/did question type. A sentence that is marked "ok" overrides an implied do/does/did label. Those with undetermined sentence type are labeled unknown.
r FT(orange, 5, text="♦")
r FUN("question_type")
- Basic Exampler FT(orange, 5, text="♦")
## Basic Example (x <- question_type(DATA.SPLIT$state, DATA.SPLIT$person)) ## Available elements from output names(x) ## Table of counts useful for additional analysis x$count ## The raw output with question types truncdf(x$raw, 15)
r FUN("question_type")
also has a plot method that plots a heat map of the output. This allows for rapid visualizations of patterns and enables fast spotting of extreme values.
r FT(orange, 5, text="♦")
r FUN("question_type")
- Plotting Methodr FT(orange, 5, text="♦")
plot(x)
plot(x, label = TRUE, high = "red", low = "yellow", grid = NULL)
Negative forms of questions such as r FT(green, text = "Don't you want the robots to leave?")
are, by default, grouped with their equivalent positive r FT(green, text = "Do")
forms, such as r FT(green, text = "Do you want the robots to leave?")
. The researcher may choose to keep the two forms separate using the argument neg.cont = TRUE
r FT(orange, 5, text="♦")
r FUN("question_type")
- Include Negative Questionsr FT(orange, 5, text="♦")
## Create a Dataframe with Do and Don't: two extra rows for sam are
## appended to DATA.SPLIT
DATA.SPLIT2 <- rbind(
    DATA.SPLIT,
    c("sam", "1.1", "1", "m", "0", "K1", "Don't you think so?", "x"),
    c("sam", "1.1", "1", "m", "0", "K1", "Do you think so?", "x")
)

## Display only columns 1 and 7 of the enlarged data
DATA.SPLIT2[, c(1, 7)]

## Do and Don't Grouped Together
question_type(DATA.SPLIT2$state, DATA.SPLIT2$person)

## Do and Don't Grouped Separately
question_type(DATA.SPLIT2$state, DATA.SPLIT2$person, neg.cont = TRUE)
person tot.quest what don't do how shall implied_do/does/did
1 greg 1 0 0 0 0 0 1(100%)
2 researcher 1 0 0 0 0 1(100%) 0
3 sally 2 1(50%) 0 0 1(50%) 0 0
4 sam 2 0 1(50%) 1(50%) 0 0 0
5 teacher 1 1(100%) 0 0 0 0 0
It may be helpful to access the indices of the question types in the x[["inds"]] output or access x[["raw"]][, "n.row"] for use with the r FUN("trans_context")
function as seen below.
r FT(orange, 5, text="♦")
r FUN("question_type")
- Passing to r FUN("trans_context")
r FT(orange, 5, text="♦")
## The indices of all questions x <- question_type(DATA.SPLIT$state, DATA.SPLIT$person) (inds1 <- x[["inds"]])
with(DATA.SPLIT, trans_context(state, person, inds = inds1, n.before = 2))
===================================
Event 1: [lines 2-6]
sam: Computer is fun. Not too fun.
greg: No it's not, it's dumb.
** teacher: What should we do?
sam: You liar, it stinks!
greg: I am telling the truth!
===================================
Event 2: [lines 5-9]
sam: You liar, it stinks!
greg: I am telling the truth!
** sally: How can we be certain?
greg: There is no way.
sam: I distrust you.
===================================
Event 3: [lines 8-12]
greg: There is no way.
sam: I distrust you.
** sally: What are you talking about?
researcher: Shall we move on? Good then.
greg: I'm hungry. Let's eat. You already?
===================================
Event 4: [lines 9-13]
sam: I distrust you.
sally: What are you talking about?
** researcher: Shall we move on? Good then.
greg: I'm hungry. Let's eat. You already?
===================================
Event 5: [lines 13-15]
sally: What are you talking about?
researcher: Shall we move on? Good then.
** greg: I'm hungry. Let's eat. You already?
## Find what and how questions inds2 <- x[["raw"]][x[["raw"]]$q.type %in% c("what", "how"), "n.row"] with(DATA.SPLIT, trans_context(state, person, inds = inds2, n.before = 2))
===================================
Event 1: [lines 2-6]
sam: Computer is fun. Not too fun.
greg: No it's not, it's dumb.
** teacher: What should we do?
sam: You liar, it stinks!
greg: I am telling the truth!
===================================
Event 2: [lines 5-9]
sam: You liar, it stinks!
greg: I am telling the truth!
** sally: How can we be certain?
greg: There is no way.
sam: I distrust you.
===================================
Event 3: [lines 8-12]
greg: There is no way.
sam: I distrust you.
** sally: What are you talking about?
researcher: Shall we move on? Good then.
greg: I'm hungry. Let's eat. You already?
A researcher may have the need to view simple word or character counts for the sake of comparisons between grouping variables. r FUN("word_count")
(r FUN("wc", "word_count")
), r FUN("word_list")
, r FUN("character_count", "word_count")
, r FUN("character_table", "word_count")
(r FUN("char_table", "word_count")
) serve the purposes of counting words and characters with r FUN("word_list")
producing lists of word usage by grouping variable and r FUN("character_table", "word_count")
producing a count table of characters. The following examples demonstrate the uses of these functions.
r FT(orange, 5, text="♦")
r FUN("word_count")
Examplesr FT(orange, 5, text="♦")
## Word count for each row of the text variable
word_count(DATA$state)

## `wc` is a shortened version of `word_count`
wc(DATA$state)

## Retain the text
wc(DATA$state, names = TRUE)

## Setting `byrow=FALSE` gives a total for the text variable
word_count(DATA$state, byrow=FALSE, names = TRUE)

## identical to `byrow=FALSE` above
sum(word_count(DATA$state))

## By grouping variable
tapply(DATA$state, DATA$person, wc, byrow=FALSE)
r FT(orange, 5, text="♦")
r FUN("word_count")
Plotting Centered Word Countsr FT(orange, 5, text="♦")
library(ggplot2)
library(grid)

## Scale variable: word counts per turn of talk, scaled within each act
## (`scaled`) and centered only (`scaled2`)
raj2 <- raj
raj2$scaled <- unlist(tapply(wc(raj$dialogue), raj2$act, scale))
raj2$scaled2 <- unlist(
    tapply(wc(raj$dialogue), raj2$act, scale, scale = FALSE)
)

## Running turn index that restarts at 1 within every act
raj2$ID <- factor(unlist(tapply(raj2$act, raj2$act, seq_along)))

## Plot with ggplot2: one bar per turn, one facet row per act
ggplot(raj2, aes(x = ID, y = scaled, fill =person)) +
    geom_bar(stat="identity", position="identity") +
    facet_grid(act~.) +
    ylab("Standard Deviations") +
    xlab("Turns of Talk") +
    guides(fill = guide_legend(nrow = 5, byrow = TRUE)) +
    theme(
        legend.position="bottom",
        legend.key.size = unit(.35, "cm"),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank()
    ) +
    ggtitle("Standardized Word Counts\nPer Turn of Talk")
r FT(orange, 5, text="♦")
r FUN("character_count", "word_count")
Examplesr FT(orange, 5, text="♦")
character_count(DATA$state) ## Setting `byrow=FALSE` gives a total for the text variable character_count(DATA$state, byrow=FALSE) ## identical to `byrow=FALSE` above sum(character_count(DATA$state)) ## By grouping variable tapply(DATA$state, DATA$person, character_count, byrow=FALSE)
r FT(orange, 5, text="♦")
r FUN("character_table", "word_count")
Exampler FT(orange, 5, text="♦")
## Character table by person and the pieces it exposes
x <- character_table(DATA$state, DATA$person)
names(x)
counts(x)
proportions(x)[, 1:10]
scores(x)[, 1:7]

## Combine Columns: vowels, the consonants other than j, q, x and z, and
## columns 2-7 lumped together as "other"
vowels <- strsplit("aeiou", "")[[1]]
cons <- setdiff(letters, c(vowels, qcv(j, q, x, z)))
colcomb2class(x, list(vowels = vowels, consonants = cons, other = 2:7))
r FT(orange, 5, text="♦")
r FUN("character_table", "word_count")
Plot Methodr FT(orange, 5, text="♦")
plot(x)
plot(x, label = TRUE, high = "red", lab.digits = 1, zero.replace = "")
r FT(orange, 5, text="♦")
r FUN("character_table", "word_count")
Additional Plottingr FT(orange, 5, text="♦")
library(ggplot2)
library(reshape2)

## Character counts by sex and adult, melted to long form with the
## combined grouping column split back into its parts
dat <- char_table(DATA$state, list(DATA$sex, DATA$adult))
dat2 <- colsplit2df(melt(dat$raw), keep.orig = TRUE)

## Recode adult 0/1 as child/adult for the facet labels
dat2$adult2 <- lookup(
    as.numeric(as.character(dat2$adult)),
    c(0, 1),
    c("child", "adult")
)
head(dat2, 15)

## Points per character, joined by a black line, faceted by child/adult
ggplot(data = dat2, aes(y = variable, x = value, colour=sex)) +
    facet_grid(adult2~.) +
    geom_line(size=1, aes(group =variable), colour = "black") +
    geom_point()

## Bars per character, faceted by sex and child/adult with margins
ggplot(data = dat2, aes(x = variable, y = value)) +
    geom_bar(aes(fill = variable), stat = "identity") +
    facet_grid(sex ~ adult2, margins = TRUE) +
    theme(legend.position="none")
It is helpful to view the frequency distributions for a vector, matrix or dataframe. The r FUN("dist_tab")
function allows the researcher to quickly generate frequency distributions.
r FT(orange, 5, text="♦")
r FUN("dist_tab")
Examplesr FT(orange, 5, text="♦")
## Frequency distribution of a numeric vector
dist_tab(rnorm(10000), 10)

## Frequency distribution of a character vector; `replace = TRUE` is
## spelled out because `T` is an ordinary variable that can be reassigned
dist_tab(sample(c("red", "blue", "gray"), 100, replace = TRUE), right = FALSE)

## Frequency distributions for a data frame
dist_tab(CO2, 4)

## Frequency distributions for the first four columns of the `word_stats`
## group table
wdst <- with(mraja1spl, word_stats(dialogue, list(sex, fam.aff, died)))
dist_tab(wdst$gts[1:4], 5)
$`sex&fam.aff&died`
interval Freq cum.Freq percent cum.percent
1 f.cap.FALSE 1 1 9.09 9.09
2 f.cap.TRUE 1 2 9.09 18.18
3 f.mont.TRUE 1 3 9.09 27.27
4 m.cap.FALSE 1 4 9.09 36.36
5 m.cap.TRUE 1 5 9.09 45.45
6 m.escal.FALSE 1 6 9.09 54.55
7 m.escal.TRUE 1 7 9.09 63.64
8 m.mont.FALSE 1 8 9.09 72.73
9 m.mont.TRUE 1 9 9.09 81.82
10 m.none.FALSE 1 10 9.09 90.91
11 none.none.FALSE 1 11 9.09 100.00
$n.sent
interval Freq cum.Freq percent cum.percent
1 (3.85,34.7] 7 7 63.64 63.64
2 (34.7,65.6] 0 7 0.00 63.64
3 (65.6,96.4] 2 9 18.18 81.82
4 (96.4,127] 1 10 9.09 90.91
5 (127,158] 1 11 9.09 100.00
$n.words
interval Freq cum.Freq percent cum.percent
1 (14.4,336] 6 6 54.55 54.55
2 (336,658] 2 8 18.18 72.73
3 (658,981] 1 9 9.09 81.82
4 (981,1.3e+03] 1 10 9.09 90.91
5 (1.3e+03,1.62e+03] 1 11 9.09 100.00
$n.char
interval Freq cum.Freq percent cum.percent
1 (72.7,1.34e+03] 6 6 54.55 54.55
2 (1.34e+03,2.6e+03] 2 8 18.18 72.73
3 (2.6e+03,3.86e+03] 1 9 9.09 81.82
4 (3.86e+03,5.12e+03] 1 10 9.09 90.91
5 (5.12e+03,6.39e+03] 1 11 9.09 100.00
In some analyses of text the researcher may wish to gather information about parts of speech (POS). The function r FUN("pos")
and its grouping variable counterpart, r FUN("pos_by", "pos")
, can provide this functionality. The r FUN("pos")
functions are wrappers for POS related functions from the r HR("http://cran.r-project.org/web/packages/openNLP/index.html", "openNLP")
package. The r FUN("pos_tags", "pos")
function provides a quick reference to what the POS tags utilized by r HR("http://cran.r-project.org/web/packages/openNLP/index.html", "openNLP")
mean. For more information on the POS tags see the r HR("http://www.cis.upenn.edu/~treebank/", "Penn Treebank Project")
.
The following examples utilize the r FUN("pos_by", "pos")
function as the r FUN("pos")
function is used identically, except without specifying a grouping.var
. It is important to realize that POS tagging is a very slow process. The speed can be increased by using the parallel = TRUE argument. Additionally, the user can recycle the output from one run of r FUN("pos")
, r FUN("pos_by", "pos")
or r FUN("formality")
and use it interchangeably between the r FUN("pos_by", "pos")
and r FUN("formality")
functions. This reuses the POS tagging which is the time intensive part (and can be extracted via YOUR_OUTPUT_HERE[["POStagged"]] from any of the above objects).
r FT(orange, 5, text="♦")
r FUN("pos_tags", "pos")
- Interpreting POS Tagsr FT(orange, 5, text="♦")
pos_tags()
posbydat <- readRDS("data/posbydat.rds")
r FT(orange, 5, text="♦")
r FUN("pos_by", "pos")
- POS by Group(s)r FT(orange, 5, text="♦")
## POS tagging of `state`, grouped by adult and sex
posbydat <- with(DATA, pos_by(state, list(adult, sex)))
## Available elements
names(posbydat)
posbydat <- structure(list(text = c("computer is fun not too fun", "no it's not it's dumb", "what should we do", "you liar it stinks", "i am telling the truth", "how can we be certain", "there is no way", "i distrust you", "what are you talking about", "shall we move on good then", "i'm hungry let's eat you already" ), POStagged = structure(list(POStagged = structure(c(1L, 6L, 10L, 11L, 5L, 2L, 8L, 3L, 9L, 7L, 4L), .Label = c("computer/NN is/VBZ fun/NN not/RB too/RB fun/NN", "how/WRB can/MD we/PRP be/VB certain/JJ", "i/FW distrust/NN you/PRP", "i/NN 'm/VBP hungry/JJ let/VBD 's/PRP eat/VB you/PRP already/RB", "i/PRP am/VBP telling/VBG the/DT truth/NN", "no/DT it/PRP 's/VBZ not/RB it/PRP 's/VBZ dumb/JJ", "shall/MD we/PRP move/VB on/IN good/JJ then/RB", "there/EX is/VBZ no/DT way/NN", "what/WP are/VBP you/PRP talking/VBG about/IN", "what/WP should/MD we/PRP do/VB", "you/PRP liar/VBP it/PRP stinks/VB"), class = "factor"), POStags = list( c("NN", "VBZ", "NN", "RB", "RB", "NN"), c("DT", "PRP", "VBZ", "RB", "PRP", "VBZ", "JJ"), c("WP", "MD", "PRP", "VB"), c("PRP", "VBP", "PRP", "VB"), c("PRP", "VBP", "VBG", "DT", "NN"), c("WRB", "MD", "PRP", "VB", "JJ"), c("EX", "VBZ", "DT", "NN" ), c("FW", "NN", "PRP"), c("WP", "VBP", "PRP", "VBG", "IN" ), c("MD", "PRP", "VB", "IN", "JJ", "RB"), c("NN", "VBP", "JJ", "VBD", "PRP", "VB", "PRP", "RB")), word.count = c(6L, 7L, 4L, 4L, 5L, 5L, 4L, 3L, 5L, 6L, 8L)), .Names = c("POStagged", "POStags", "word.count"), row.names = c(NA, -11L), class = "data.frame"), POSprop = structure(list(wrd.cnt = c(6L, 7L, 4L, 4L, 5L, 5L, 4L, 3L, 5L, 6L, 8L), propDT = c(0, 14.2857142857143, 0, 0, 20, 0, 25, 0, 0, 0, 0), propEX = c(0, 0, 0, 0, 0, 0, 25, 0, 0, 0, 0), propFW = c(0, 0, 0, 0, 0, 0, 0, 33.3333333333333, 0, 0, 0), propIN = c(0, 0, 0, 0, 0, 0, 0, 0, 20, 16.6666666666667, 0), propJJ = c(0, 14.2857142857143, 0, 0, 0, 20, 0, 0, 0, 16.6666666666667, 12.5), propMD = c(0, 0, 25, 0, 0, 20, 0, 0, 0, 16.6666666666667, 0), propNN = c(50, 0, 0, 0, 20, 0, 25, 
33.3333333333333, 0, 0, 12.5), propPRP = c(0, 28.5714285714286, 25, 50, 20, 20, 0, 33.3333333333333, 20, 16.6666666666667, 25), propRB = c(33.3333333333333, 14.2857142857143, 0, 0, 0, 0, 0, 0, 0, 16.6666666666667, 12.5), propVB = c(0, 0, 25, 25, 0, 20, 0, 0, 0, 16.6666666666667, 12.5), propVBD = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12.5), propVBG = c(0, 0, 0, 0, 20, 0, 0, 0, 20, 0, 0), propVBP = c(0, 0, 0, 25, 20, 0, 0, 0, 20, 0, 12.5), propVBZ = c(16.6666666666667, 28.5714285714286, 0, 0, 0, 0, 25, 0, 0, 0, 0), propWP = c(0, 0, 25, 0, 0, 0, 0, 0, 20, 0, 0), propWRB = c(0, 0, 0, 0, 0, 20, 0, 0, 0, 0, 0)), .Names = c("wrd.cnt", "propDT", "propEX", "propFW", "propIN", "propJJ", "propMD", "propNN", "propPRP", "propRB", "propVB", "propVBD", "propVBG", "propVBP", "propVBZ", "propWP", "propWRB"), row.names = c(NA, -11L), class = "data.frame"), POSfreq = structure(list(wrd.cnt = c(6L, 7L, 4L, 4L, 5L, 5L, 4L, 3L, 5L, 6L, 8L), DT = c(0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L), EX = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L), FW = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L ), IN = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L), JJ = c(0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 1L), MD = c(0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L), NN = c(3L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 1L), PRP = c(0L, 2L, 1L, 2L, 1L, 1L, 0L, 1L, 1L, 1L, 2L), RB = c(2L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L), VB = c(0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 1L), VBD = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L), VBG = c(0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L), VBP = c(0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 1L), VBZ = c(1L, 2L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L), WP = c(0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L), WRB = c(0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L)), .Names = c("wrd.cnt", "DT", "EX", "FW", "IN", "JJ", "MD", "NN", "PRP", "RB", "VB", "VBD", "VBG", "VBP", "VBZ", "WP", "WRB"), row.names = c(NA, -11L), class = "data.frame"), POSrnp = structure(list(wrd.cnt = c(6L, 7L, 4L, 4L, 5L, 5L, 
4L, 3L, 5L, 6L, 8L), DT = c("0", "1(14.3%)", "0", "0", "1(20.0%)", "0", "1(25.0%)", "0", "0", "0", "0" ), EX = c("0", "0", "0", "0", "0", "0", "1(25.0%)", "0", "0", "0", "0"), FW = c("0", "0", "0", "0", "0", "0", "0", "1(33.3%)", "0", "0", "0"), IN = c("0", "0", "0", "0", "0", "0", "0", "0", "1(20.0%)", "1(16.7%)", "0"), JJ = c("0", "1(14.3%)", "0", "0", "0", "1(20.0%)", "0", "0", "0", "1(16.7%)", "1(12.5%)"), MD = c("0", "0", "1(25.0%)", "0", "0", "1(20.0%)", "0", "0", "0", "1(16.7%)", "0"), NN = c("3(50.0%)", "0", "0", "0", "1(20.0%)", "0", "1(25.0%)", "1(33.3%)", "0", "0", "1(12.5%)"), PRP = c("0", "2(28.6%)", "1(25.0%)", "2(50.0%)", "1(20.0%)", "1(20.0%)", "0", "1(33.3%)", "1(20.0%)", "1(16.7%)", "2(25.0%)"), RB = c("2(33.3%)", "1(14.3%)", "0", "0", "0", "0", "0", "0", "0", "1(16.7%)", "1(12.5%)"), VB = c("0", "0", "1(25.0%)", "1(25.0%)", "0", "1(20.0%)", "0", "0", "0", "1(16.7%)", "1(12.5%)"), VBD = c("0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "1(12.5%)"), VBG = c("0", "0", "0", "0", "1(20.0%)", "0", "0", "0", "1(20.0%)", "0", "0"), VBP = c("0", "0", "0", "1(25.0%)", "1(20.0%)", "0", "0", "0", "1(20.0%)", "0", "1(12.5%)"), VBZ = c("1(16.7%)", "2(28.6%)", "0", "0", "0", "0", "1(25.0%)", "0", "0", "0", "0"), WP = c("0", "0", "1(25.0%)", "0", "0", "0", "0", "0", "1(20.0%)", "0", "0" ), WRB = c("0", "0", "0", "0", "0", "1(20.0%)", "0", "0", "0", "0", "0")), .Names = c("wrd.cnt", "DT", "EX", "FW", "IN", "JJ", "MD", "NN", "PRP", "RB", "VB", "VBD", "VBG", "VBP", "VBZ", "WP", "WRB"), row.names = c(NA, -11L), class = "data.frame"), percent = TRUE, zero.replace = 0, pos.by.freq = structure(list( `adult&sex` = structure(1:4, .Label = c("0.f", "0.m", "1.f", "1.m"), class = "factor"), wrd.cnt = c(10, 37, 6, 4), DT = c(0, 3, 0, 0), EX = c(0, 1, 0, 0), FW = c(0, 1, 0, 0), IN = c(1, 0, 1, 0), JJ = c(1, 2, 1, 0), MD = c(1, 0, 1, 1), NN = c(0, 7, 0, 0), PRP = c(2, 8, 1, 1), RB = c(0, 4, 1, 0), VB = c(1, 2, 1, 1), VBD = c(0, 1, 0, 0), VBG = c(1, 1, 0, 0), VBP = c(1, 3, 0, 
0), VBZ = c(0, 4, 0, 0), WP = c(1, 0, 0, 1), WRB = c(1, 0, 0, 0)), .Names = c("adult&sex", "wrd.cnt", "DT", "EX", "FW", "IN", "JJ", "MD", "NN", "PRP", "RB", "VB", "VBD", "VBG", "VBP", "VBZ", "WP", "WRB"), row.names = c(NA, 4L), class = "data.frame"), pos.by.prop = structure(list( `adult&sex` = structure(1:4, .Label = c("0.f", "0.m", "1.f", "1.m"), class = "factor"), wrd.cnt = c(10, 37, 6, 4), DT = c(0, 8.10810810810811, 0, 0), EX = c(0, 2.7027027027027, 0, 0), FW = c(0, 2.7027027027027, 0, 0), IN = c(10, 0, 16.6666666666667, 0), JJ = c(10, 5.40540540540541, 16.6666666666667, 0), MD = c(10, 0, 16.6666666666667, 25), NN = c(0, 18.9189189189189, 0, 0), PRP = c(20, 21.6216216216216, 16.6666666666667, 25), RB = c(0, 10.8108108108108, 16.6666666666667, 0), VB = c(10, 5.40540540540541, 16.6666666666667, 25), VBD = c(0, 2.7027027027027, 0, 0), VBG = c(10, 2.7027027027027, 0, 0), VBP = c(10, 8.10810810810811, 0, 0), VBZ = c(0, 10.8108108108108, 0, 0), WP = c(10, 0, 0, 25), WRB = c(10, 0, 0, 0)), .Names = c("adult&sex", "wrd.cnt", "DT", "EX", "FW", "IN", "JJ", "MD", "NN", "PRP", "RB", "VB", "VBD", "VBG", "VBP", "VBZ", "WP", "WRB"), row.names = c(NA, 4L), class = "data.frame"), pos.by.rnp = structure(list(`adult&sex` = structure(1:4, .Label = c("0.f", "0.m", "1.f", "1.m"), class = "factor"), wrd.cnt = c(10, 37, 6, 4), DT = c("0", "3(8.1%)", "0", "0"), EX = c("0", "1(2.7%)", "0", "0"), FW = c("0", "1(2.7%)", "0", "0"), IN = c("1(10.0%)", "0", "1(16.7%)", "0"), JJ = c("1(10.0%)", "2(5.4%)", "1(16.7%)", "0"), MD = c("1(10.0%)", "0", "1(16.7%)", "1(25.0%)"), NN = c("0", "7(18.9%)", "0", "0"), PRP = c("2(20.0%)", "8(21.6%)", "1(16.7%)", "1(25.0%)"), RB = c("0", "4(10.8%)", "1(16.7%)", "0"), VB = c("1(10.0%)", "2(5.4%)", "1(16.7%)", "1(25.0%)"), VBD = c("0", "1(2.7%)", "0", "0"), VBG = c("1(10.0%)", "1(2.7%)", "0", "0"), VBP = c("1(10.0%)", "3(8.1%)", "0", "0"), VBZ = c("0", "4(10.8%)", "0", "0"), WP = c("1(10.0%)", "0", "0", "1(25.0%)"), WRB = c("1(10.0%)", "0", "0", "0")), .Names 
= c("adult&sex", "wrd.cnt", "DT", "EX", "FW", "IN", "JJ", "MD", "NN", "PRP", "RB", "VB", "VBD", "VBG", "VBP", "VBZ", "WP", "WRB"), row.names = c(NA, 4L), class = "data.frame")), .Names = c("text", "POStagged", "POSprop", "POSfreq", "POSrnp", "percent", "zero.replace", "pos.by.freq", "pos.by.prop", "pos.by.rnp"), class = "pos_by", grouping.var = structure(c(2L, 2L, 4L, 2L, 2L, 1L, 2L, 2L, 1L, 3L, 2L), .Label = c("0.f", "0.m", "1.f", "1.m"), class = "factor")) names(posbydat)
## Inspecting the truncated output
lview(posbydat)
$text
[1] "computer is fun not too fun" "no its not its dumb"
[3] "what should we do" "you liar it stinks"
[5] "i am telling the truth" "how can we be certain"
[7] "there is no way" "i distrust you"
[9] "what are you talking about" "shall we move on good then"
[11] "im hungry lets eat you already"
$POStagged
POStagged POStags word.count
1 computer/NN is/VBZ fun/NN ...RB fun/NN NN, VBZ, NN, RB, RB, NN 6
2 no/DT its/PRP$ not/RB its/PRP$ dumb/JJ DT, PRP$, RB, PRP$, JJ 5
3 what/WP should/MD we/PRP do/VB WP, MD, PRP, VB 4
4 you/PRP liar/VBP it/PRP stinks/VB PRP, VBP, PRP, VB 4
5 i/PRP am/VBP telling/VBG...DT truth/NN PRP, VBP, VBG, DT, NN 5
6 how/WRB can/MD we/PRP ...VB certain/JJ WRB, MD, PRP, VB, JJ 5
7 there/EX is/VBZ no/DT way/NN EX, VBZ, DT, NN 4
8 i/FW distrust/NN you/PRP FW, NN, PRP 3
9 what/WP are/VBP you/PR...VBG about/IN WP, VBP, PRP, VBG, IN 5
10 shall/MD we/PRP move/VB ...JJ then/RB MD, PRP, VB, IN, JJ, RB 6
11 im/PRP hungry/JJ let...PRP already/RB PRP, JJ, VBZ, VB, PRP, RB 6
$POSprop
wrd.cnt propDT propEX propFW propIN propJJ propMD ... propWRB
1 6 0 0 0.00000 0.00000 0.00000 0.00000 0
2 5 20 0 0.00000 0.00000 20.00000 0.00000 0
3 4 0 0 0.00000 0.00000 0.00000 25.00000 0
4 4 0 0 0.00000 0.00000 0.00000 0.00000 0
5 5 20 0 0.00000 0.00000 0.00000 0.00000 0
6 5 0 0 0.00000 0.00000 20.00000 20.00000 20
7 4 25 25 0.00000 0.00000 0.00000 0.00000 0
8 3 0 0 33.33333 0.00000 0.00000 0.00000 0
9 5 0 0 0.00000 20.00000 0.00000 0.00000 0
10 6 0 0 0.00000 16.66667 16.66667 16.66667 0
11 6 0 0 0.00000 0.00000 16.66667 0.00000 0
$POSfreq
wrd.cnt DT EX FW IN JJ MD NN PRP PRP$ RB VB VBG VBP VBZ WP WRB
1 6 0 0 0 0 0 0 3 0 0 2 0 0 0 1 0 0
2 5 1 0 0 0 1 0 0 0 2 1 0 0 0 0 0 0
3 4 0 0 0 0 0 1 0 1 0 0 1 0 0 0 1 0
4 4 0 0 0 0 0 0 0 2 0 0 1 0 1 0 0 0
5 5 1 0 0 0 0 0 1 1 0 0 0 1 1 0 0 0
6 5 0 0 0 0 1 1 0 1 0 0 1 0 0 0 0 1
7 4 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0
8 3 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0
9 5 0 0 0 1 0 0 0 1 0 0 0 1 1 0 1 0
10 6 0 0 0 1 1 1 0 1 0 1 1 0 0 0 0 0
11 6 0 0 0 0 1 0 0 2 0 1 1 0 0 1 0 0
$POSrnp
wrd.cnt DT EX FW IN JJ ... WRB
1 6 0 0 0 0 0 0
2 5 1(20.0%) 0 0 0 1(20.0%) 0
3 4 0 0 0 0 0 0
4 4 0 0 0 0 0 0
5 5 1(20.0%) 0 0 0 0 0
6 5 0 0 0 0 1(20.0%) 1(20.0%)
7 4 1(25.0%) 1(25.0%) 0 0 0 0
8 3 0 0 1(33.3%) 0 0 0
9 5 0 0 0 1(20.0%) 0 0
10 6 0 0 0 1(16.7%) 1(16.7%) 0
11 6 0 0 0 0 1(16.7%) 0
$percent
[1] TRUE
$zero.replace
[1] 0
$pos.by.freq
adult&sex wrd.cnt DT EX FW IN JJ MD NN PRP PRP$ RB VB VBG VBP VBZ WP WRB
1 0.f 10 0 0 0 1 1 1 0 2 0 0 1 1 1 0 1 1
2 0.m 33 3 1 1 0 2 0 6 6 2 4 2 1 2 3 0 0
3 1.f 6 0 0 0 1 1 1 0 1 0 1 1 0 0 0 0 0
4 1.m 4 0 0 0 0 0 1 0 1 0 0 1 0 0 0 1 0
$pos.by.prop
adult&sex wrd.cnt DT EX FW IN JJ ... WP
1 0.f 10 0.000000 0.000000 0.000000 10.00000 10.000000 10
2 0.m 33 9.090909 3.030303 3.030303 0.00000 6.060606 0
3 1.f 6 0.000000 0.000000 0.000000 16.66667 16.666667 0
4 1.m 4 0.000000 0.000000 0.000000 0.00000 0.000000 25
$pos.by.rnp
adult&sex wrd.cnt DT EX FW IN JJ ... WP
1 0.f 10 0 0 0 1(10.0%) 1(10.0%) 1(10.0%)
2 0.m 33 3(9.1%) 1(3.0%) 1(3.0%) 0 2(6.1%) 0
3 1.f 6 0 0 0 1(16.7%) 1(16.7%) 0
4 1.m 4 0 0 0 0 0 1(25.0%)
r FT(orange, 5, text="♦")
Plot Methodr FT(orange, 5, text="♦")
plot(posbydat, values = TRUE, digits = 2)
r FT(orange, 5, text="♦")
r FUN("pos_by", "pos")
- Recycling Saves Timer FT(orange, 5, text="♦")
## Recycle the earlier `pos_by` output (`posbydat`) as the first argument,
## this time with a different grouping (person and sex)
posbydat2 <- with(DATA, pos_by(posbydat, list(person, sex)))
## Time the same recycled call
system.time(with(DATA, pos_by(posbydat, list(person, sex))))
user system elapsed
0.07 0.00 0.07
## `pos_by` output Recycled for `formality`
with(DATA, formality(posbydat, list(person, sex)))
Syllable counts can be a useful source of information when examining associations with education level, age, SES, gender, etc. Several readability scores rely on syllable and polysyllable word counts. qdap defines a polysyllable word as a word with 3 or more syllables, though some in the linguistics/literacy fields may include two-syllable words. r FUN("syllable_count", "syllabication")
is the base function for r FUN("syllable_sum", "syllabication")
, r FUN("polysyllable_sum", "syllabication")
, and r FUN("combo_syllable_sum", "syllabication")
, though is generally not of direct use to the researcher conducting discourse analysis. r FUN("syllable_count", "syllabication")
uses a dictionary lookup method augmented with a syllable algorithm for words not found in the dictionary. Words not found in the dictionary are denoted with a NF in the in.dictionary column of the output.
Here is a list of qdap r FUN("syllabication", "syllabication")
functions and their descriptions:
syllable_count | Count the number of syllables in a single text string. |
syllable_sum | Count the number of syllables per row of text. |
polysyllable_sum | Count the number of polysyllables per row of text. |
combo_syllable_sum | Count the number of both syllables and polysyllables per row of text. |
r FT(orange, 5, text="♦")
r FUN("syllabication", "syllabication")
Examplesr FT(orange, 5, text="♦")
## Syllables in a single text string
syllable_count("Robots like Dason lie.")
## The text variable for reference
DATA$state
## Per-row syllable, polysyllable, and combined counts
syllable_sum(DATA$state)
polysyllable_sum(DATA$state)
combo_syllable_sum(DATA$state)
qdap offers a number of word statistics and scores applied by grouping variable. Some functions are original to qdap, while others are taken from academic papers. Complete references for statistics/scores based on others' work are provided in the r HR("http://cran.r-project.org/web/packages/qdap/qdap.pdf", "help manual")
where appropriate. It is assumed that the reader is familiar, or can become acquainted, with the theory and methods for qdap functions based on the work of others. For qdap functions that are original to qdap a more robust description of the use and theory is provided.
Readability scores were originally designed to measure the difficulty of text. Scores are generally based on the number of words, syllables, and polysyllables, and on word length. While these scores are not specifically designed for, or tested on, speech, they can be useful indicators of speech complexity. The examples below demonstrate the use of the following readability scores:
r HR("#ari", "Automated Readability Index")
r HR("#coleman", "Coleman Liau")
r HR("#smog", "SMOG")
r HR("#flesch", "Flesch Kincaid")
r HR("#fry", "Fry")
r HR("#linwr", "Linsear Write")
`r FT(orange, 5, text="♦")` **Automated Readability Index**`r FT(orange, 5, text="♦")`
wzxhzdk:215 sex&fam.aff word.count sentence.count character.count Aut._Read._Index
1 f.cap 9458 929 37474 2.3
2 f.mont 28 4 88 -3.1
3 m.cap 1204 133 4615 1.2
4 m.escal 3292 262 13406 4.0
5 m.mont 6356 555 26025 3.6
6 m.none 3233 250 13527 4.7
7 none.none 156 12 665 5.1
`r FT(orange, 5, text="♦")` **Coleman Liau**`r FT(orange, 5, text="♦")`
wzxhzdk:216 fam.aff&act word.count sentence.count character.count Coleman_Liau
1 cap.1 2636 272 10228 4.0
2 cap.2 2113 193 8223 4.4
3 cap.3 3540 339 14183 4.9
4 cap.4 2159 232 8620 4.5
5 cap.5 214 26 835 3.5
6 escal.1 748 36 3259 8.4
`r FT(orange, 5, text="♦")` **SMOG**`r FT(orange, 5, text="♦")`
wzxhzdk:217 person&act word.count sentence.count polysyllable.count SMOG
1 Benvolio.1 621 51 25 7.1
2 Capulet.1 736 72 35 7.1
3 Capulet.3 749 69 28 6.8
4 Capulet.4 569 73 25 6.5
5 Friar Laurence.2 699 42 36 8.4
6 Friar Laurence.3 675 61 32 7.3
7 Friar Laurence.4 656 42 25 7.5
8 Friar Laurence.5 696 54 32 7.5
9 Juliet.2 1289 113 48 6.9
10 Juliet.3 1722 152 64 6.8
11 Juliet.4 932 61 37 7.6
12 Lady Capulet.3 393 39 15 6.7
13 Mercutio.2 964 82 43 7.3
14 Mercutio.3 578 54 19 6.5
15 Nurse.1 599 59 20 6.5
16 Nurse.2 779 76 24 6.3
17 Nurse.3 579 68 14 5.7
18 Nurse.4 250 50 9 5.6
19 Romeo.1 1158 113 48 6.9
20 Romeo.2 1289 109 46 6.8
21 Romeo.3 969 87 48 7.4
22 Romeo.5 1216 103 52 7.2
`r FT(orange, 5, text="♦")` **Flesch Kincaid**`r FT(orange, 5, text="♦")`
wzxhzdk:218 sex&fam.aff word.count sentence.count syllable.count FK_grd.lvl FK_read.ease
1 f.cap 9458 929 11641 2.9 92.375
2 f.mont 28 4 30 -0.2 109.087
3 m.cap 1204 133 1452 2.2 95.621
4 m.escal 3292 262 4139 4.1 87.715
5 m.mont 6356 555 7965 3.7 89.195
6 m.none 3233 250 4097 4.4 86.500
7 none.none 156 12 195 4.2 87.890
Note that the Fry score is a graphical display, rather than text as the other readability scores are. This is in keeping with the original procedures outlined by Fry.
`r FT(orange, 5, text="♦")` **Fry**`r FT(orange, 5, text="♦")`
wzxhzdk:219 wzxhzdk:220`r FT(orange, 5, text="♦")` **Linsear Write**`r FT(orange, 5, text="♦")`
wzxhzdk:221 person sent.per.100 hard_easy_sum Linsear_Write
1 Balthasar 9.556 110 4.76
2 Benvolio 4.143 108 12.03
3 Capulet 11.469 115 4.01
4 Chorus 3.071 104 15.93
5 First Watchman 14.222 114 3.01
6 Friar Laurence 4.263 108 11.67
7 Gregory 11.000 100 3.55
8 Juliet 3.446 110 14.96
9 Lady Capulet 7.267 110 6.57
10 Mercutio 5.625 102 8.07
11 Montague 6.000 114 8.50
12 Nurse 12.098 102 3.22
13 Paris 9.091 110 5.05
14 Peter 10.357 110 4.31
15 Prince 10.842 110 4.07
16 Romeo 9.250 114 5.16
17 Sampson 9.421 107 4.68
18 Servant 9.667 104 4.38
19 Tybalt 9.591 112 4.84
`r FT(orange, 5, text="♦")` **`r FUN("Dissimilarity")` Examples**`r FT(orange, 5, text="♦")`
wzxhzdk:222 wzxhzdk:223 OBAMA.1 OBAMA.2 OBAMA.3 ROMNEY.1 ROMNEY.2
OBAMA.2 0.340
OBAMA.3 0.300 0.341
ROMNEY.1 0.340 0.287 0.258
ROMNEY.2 0.291 0.349 0.296 0.321
ROMNEY.3 0.264 0.297 0.329 0.290 0.338
`r FT(orange, 5, text="♦")` **`r FUN("Dissimilarity")` Clustering (Dendrogram)**`r FT(orange, 5, text="♦")`
wzxhzdk:224 wzxhzdk:225`r FT(orange, 5, text="♦")` **`r FUN("kullback_leibler")` Example** - *Compare to `r FUN("Dissimilarity")`*`r FT(orange, 5, text="♦")`
wzxhzdk:227 OBAMA.1 OBAMA.2 OBAMA.3 ROMNEY.1 ROMNEY.2 ROMNEY.3
OBAMA.1 0.000 0.237 0.221 0.195 0.250 0.264
OBAMA.2 0.104 0.000 0.161 0.148 0.142 0.223
OBAMA.3 0.119 0.152 0.000 0.142 0.180 0.168
ROMNEY.1 0.207 0.297 0.279 0.000 0.216 0.224
ROMNEY.2 0.194 0.195 0.262 0.116 0.000 0.234
ROMNEY.3 0.160 0.182 0.141 0.101 0.140 0.000
wzxhzdk:228
`r FT(orange, 5, text="♦")` **`r FUN("diversity")` Example**`r FT(orange, 5, text="♦")`
wzxhzdk:229`r FT(orange, 5, text="♦")` **`r FUN("diversity")` Plot Method**`r FT(orange, 5, text="♦")`
wzxhzdk:230`r FT(orange, 5, text="♦")` **`r FUN("formality")` Example**`r FT(orange, 5, text="♦")`
wzxhzdk:232 wzxhzdk:233`r FT(orange, 5, text="♦")` **`r FUN("formality")` Plot Method**`r FT(orange, 5, text="♦")`
wzxhzdk:234`r FT(orange, 5, text="♦")` **Recycling `r FUN("formality")`**`r FT(orange, 5, text="♦")`
wzxhzdk:235 wzxhzdk:236`r FT(orange, 5, text="♦")` **`r FUN("polarity")` Example**`r FT(orange, 5, text="♦")`
wzxhzdk:238 (poldat <- with(mraja1spl, polarity(dialogue, list(sex, fam.aff, died))))
POLARITY BY GROUP
=================
sex&fam.aff&died tot.sent tot.word ave.polarity sd.polarity sd.mean.polarity
1 f.cap.FALSE 158 1810 0.076 0.262 0.292
2 f.cap.TRUE 24 221 0.042 0.209 0.204
3 f.mont.TRUE 4 29 0.079 0.398 0.199
4 m.cap.FALSE 73 717 0.026 0.256 0.104
5 m.cap.TRUE 17 185 -0.160 0.313 -0.510
6 m.escal.FALSE 9 195 -0.153 0.313 -0.488
7 m.escal.TRUE 27 646 -0.069 0.256 -0.272
8 m.mont.FALSE 70 952 -0.044 0.384 -0.114
9 m.mont.TRUE 114 1273 -0.004 0.409 -0.009
10 m.none.FALSE 7 78 0.062 0.107 0.583
11 none.none.FALSE 5 18 -0.282 0.439 -0.642
wzxhzdk:239
`r FT(orange, 5, text="♦")` **`r FUN("polarity")`** - *Sentence Level Polarity Scores*`r FT(orange, 5, text="♦")`
wzxhzdk:240`r FT(orange, 5, text="♦")` **`r FUN("polarity")` Plot Method**`r FT(orange, 5, text="♦")`
wzxhzdk:241`r FT(orange, 5, text="♦")` **`r FUN("polarity")` Plot Group Polarity as Heat Map**`r FT(orange, 5, text="♦")`
wzxhzdk:242`r FT(orange, 5, text="♦")` **`r FUN("sentiment_frame")`** - *Specify Your Own Polarity Environment*`r FT(orange, 5, text="♦")`
wzxhzdk:243`r FT(orange, 5, text="♦")` **Polarity Over Time** `r FT(orange, 5, text="♦")`
## Polarity of the dialogue grouped by act
poldat4 <- with(rajSPLIT, polarity(dialogue, act, constrain = TRUE))
## Polarity column of the counts table with missing values dropped
polcount <- na.omit(counts(poldat4)$polarity)
## Number of retained polarity scores
len <- length(polcount)
## Running (cumulative) mean: element i is the mean of x[1:i]
cummean <- function(x) {
    running_total <- cumsum(x)
    position <- seq_along(x)
    running_total / position
}
## Cumulative mean polarity paired with a running time index;
## seq_len() stays empty when there are no scores (1:0 would yield c(1, 0))
cumpolarity <- data.frame(cum_mean = cummean(polcount), Time = seq_len(len))
## Calculate background rectangles
## `ends` is the last position of each run of identical `act` values and
## `starts` is the first position of each run
ends <- cumsum(rle(counts(poldat4)$act)$lengths)
starts <- c(1, head(ends + 1, -1))
## One rectangle per run; the five act labels are hard-coded
rects <- data.frame(xstart = starts, xend = ends + 1,
    Act = c("I", "II", "III", "IV", "V"))
library(ggplot2)
## Cumulative average polarity over the duration, drawn over shaded
## background rectangles that mark each act
ggplot() + theme_bw() +
    geom_rect(data = rects, aes(xmin = xstart, xmax = xend,
        ymin = -Inf, ymax = Inf, fill = Act), alpha = 0.17) +
    geom_smooth(data = cumpolarity, aes(y = cum_mean, x = Time)) +
    ## Horizontal reference line at the overall mean polarity;
    ## geom_hline() takes its position through `yintercept`
    geom_hline(yintercept = mean(polcount), color = "grey30", size = 1,
        alpha = .3, linetype = 2) +
    annotate("text", x = mean(ends[1:2]), y = mean(polcount), color = "grey30",
        label = "Average Polarity", vjust = .3, size = 3) +
    geom_line(data = cumpolarity, aes(y = cum_mean, x = Time), size = 1) +
    ylab("Cumulative Average Polarity") + xlab("Duration") +
    scale_x_continuous(expand = c(0, 0)) +
    ## Label each rectangle at its horizontal midpoint
    geom_text(data = rects, aes(x = (xstart + xend) / 2, y = -.04,
        label = paste("Act", Act)), size = 3) +
    ## Suppress the fill legend ("none" replaces the deprecated FALSE)
    guides(fill = "none") +
    scale_fill_brewer(palette = "Set1")
wzxhzdk:244
word_cor
function calculates correlations (based on the `r FUN("wfm")` function) for words nested within grouping variables (turn of talk is an obvious choice for a grouping variable). Running bootstrapping with a random sample can help the researcher determine if a co-occurrence of words is by chance. `r FUN("word_cor")` is even more flexible in that it can actually take a frequency matrix (e.g., the `r FUN("wfm_combine", "wfm")` function or the `r HR("#coding", "cm_")` family of functions).
`r FT(orange, 5, text="♦")` **`r FUN("word_cor")`** - *Single Words*`r FT(orange, 5, text="♦")`
library(reports)
## Grouping factor: act pasted to pad(TOT(tot)), separated by "|"
x <- with(rajSPLIT, factor(paste(act, pad(TOT(tot)), sep = "|")))
## Words correlated with "romeo", using r = 0.45
word_cor(rajSPLIT$dialogue, x, word = "romeo", r = 0.45)
$romeo
that tybalt
0.4540979 0.4831937
## Words correlated with "love", using r = 0.5
word_cor(rajSPLIT$dialogue, x, word = "love", r = 0.5)
$love
likewise
0.5013104
`r FT(orange, 5, text="♦")` **`r FUN("word_cor")`** - *Negative Correlation*`r FT(orange, 5, text="♦")`
## Negative value supplied for r
word_cor(rajSPLIT$dialogue, x, word = "you", r = -0.1)
## Grouping by person and act instead of `x`; r left at its default
with(rajSPLIT, word_cor(dialogue, list(person, act), word = "hate"))
$hate
eyesight knight prison smooth vex'd
0.7318131 0.7318131 0.7318131 0.7318131 0.7318131
`r FT(orange, 5, text="♦")` **`r FUN("word_cor")`** - *Multiple Words*`r FT(orange, 5, text="♦")`
## Several target words in one call
words <- c("hate", "i", "love", "ghost")
with(rajSPLIT, word_cor(dialogue, x, word = words, r = 0.5))
$hate
beasts beseeming bills bred
0.6251743 0.6251743 0.6251743 0.6251743
canker'd capulethold clubs coward
0.6251743 0.6251743 0.6251743 0.6251743
crutch disturb'd flourishes fountains
0.6251743 0.6251743 0.6251743 0.6251743
issuing mistemper'd neighbourstained partisans
0.6251743 0.6251743 0.6251743 0.6251743
pernicious profaners purple rebellious
0.6251743 0.6251743 0.6251743 0.6251743
streets subjects sword thrice
0.5027573 0.6251743 0.6164718 0.6251743
throw wield
0.6251743 0.6251743
$i
and have me my thee to
0.5150992 0.5573359 0.5329341 0.5134372 0.5101593 0.5533506
$love
likewise
0.5013104
$ghost
bone brains club dash drink keys kinsman's methinks
0.7056134 0.7056134 1.0000000 1.0000000 0.5749090 1.0000000 1.0000000 0.5749090
rage rapier's seeking spices spit
0.5749090 1.0000000 1.0000000 1.0000000 1.0000000
`r FT(orange, 5, text="♦")` **`r FUN("word_cor")`** - *Correlations Between Terms: Example 1*`r FT(orange, 5, text="♦")`
## With r = NULL a matrix of correlations between the words is returned
with(rajSPLIT, word_cor(dialogue, x, word = words, r = NULL))
hate i love ghost
hate 1.00000000 0.05142236 0.15871966 -0.01159382
i 0.05142236 1.00000000 0.36986172 0.01489943
love 0.15871966 0.36986172 1.00000000 -0.02847837
ghost -0.01159382 0.01489943 -0.02847837 1.00000000
`r FT(orange, 5, text="♦")` **`r FUN("word_cor")`** - *Correlations Between Terms: Example 2*`r FT(orange, 5, text="♦")`
## Debate data with a grouping factor built from time and pad(TOT(tot))
dat <- pres_debates2012
dat$TOT <- with(dat, factor(paste(time, pad(TOT(tot)), sep = "|")))
## Keep only the OBAMA and ROMNEY rows and drop unused person levels
dat <- dat[dat$person %in% c("OBAMA", "ROMNEY"), ]
dat$person <- factor(dat$person)
## One data frame per person-by-time combination
dat.split <- split(dat, list(dat$person, dat$time))
wrds <- c("america", "debt", "dollar", "people", "tax", "health")
## Between-word correlation matrix for every subset
lapply(dat.split, function(grp) {
    word_cor(grp[["dialogue"]], grp[["TOT"]], word = wrds, r = NULL)
})
$`OBAMA.time 1`
america dollar people tax health
america 1.000000000 -0.005979775 0.6117618 -0.005979775 0.13803797
dollar -0.005979775 1.000000000 0.1650493 -0.004219409 -0.01092353
people 0.611761819 0.165049280 1.0000000 0.165049280 0.50398555
tax -0.005979775 -0.004219409 0.1650493 1.000000000 0.20572642
health 0.138037974 -0.010923527 0.5039855 0.205726420 1.00000000
$`ROMNEY.time 1`
america dollar people tax health
america 1.00000000 0.07493271 0.2336551 0.07033784 0.14986684
dollar 0.07493271 1.00000000 0.5859944 0.11109650 0.33821359
people 0.23365513 0.58599441 1.0000000 0.20584588 0.61333714
tax 0.07033784 0.11109650 0.2058459 1.00000000 -0.01723713
health 0.14986684 0.33821359 0.6133371 -0.01723713 1.00000000
$`OBAMA.time 2`
america dollar people tax health
america 1.00000000 -0.01526328 0.41353310 0.07609871 0.25733977
dollar -0.01526328 1.00000000 0.11671525 0.51222872 -0.01220067
people 0.41353310 0.11671525 1.00000000 0.03761852 0.11285926
tax 0.07609871 0.51222872 0.03761852 1.00000000 0.03431397
health 0.25733977 -0.01220067 0.11285926 0.03431397 1.00000000
$`ROMNEY.time 2`
america debt dollar people tax
america 1.00000000 -0.018370290 0.07531545 0.59403781 0.291238391
debt -0.01837029 1.000000000 0.53340505 0.02329285 -0.009432552
dollar 0.07531545 0.533405053 1.00000000 0.33346752 0.600125943
people 0.59403781 0.023292854 0.33346752 1.00000000 0.516577197
tax 0.29123839 -0.009432552 0.60012594 0.51657720 1.000000000
health 0.06384509 -0.008308652 0.68299026 0.25536510 0.658231340
health
america 0.063845090
debt -0.008308652
dollar 0.682990261
people 0.255365102
tax 0.658231340
health 1.000000000
$`OBAMA.time 3`
america debt dollar people tax
america 1.00000000 -0.01224452 -0.02326653 0.1182189 -0.02326653
debt -0.01224452 1.00000000 0.37361771 0.1765301 0.75525297
dollar -0.02326653 0.37361771 1.00000000 0.1909401 0.70993297
people 0.11821887 0.17653013 0.19094008 1.0000000 0.19094008
tax -0.02326653 0.75525297 0.70993297 0.1909401 1.00000000
$`ROMNEY.time 3`
america debt dollar people
america 1.0000000 0.2130341 0.2675978 0.3007027
debt 0.2130341 1.0000000 0.8191341 0.4275521
dollar 0.2675978 0.8191341 1.0000000 0.4666635
people 0.3007027 0.4275521 0.4666635 1.0000000
`r FT(orange, 5, text="♦")` **`r FUN("word_cor")`** - *Matrix from `r FUN("wfm_combine", "wfm")`*`r FT(orange, 5, text="♦")`
## Named word lists to be combined into single columns
worlis <- list(
    pronouns = c("you", "it", "it's", "we", "i'm", "i"),
    negative = c("no", "dumb", "distrust", "not", "stinks"),
    literacy = c("computer", "talking", "telling")
)
## Word frequencies of `state` by row id, combined according to `worlis`
y <- wfdf(DATA$state, id(DATA, prefix = TRUE))
z <- wfm_combine(y, worlis)
## Correlations between the combined groups, passing the transposed matrix
word_cor(t(z), word = c(names(worlis), "else.words"), r = NULL)
pronouns negative literacy else.words
pronouns 1.0000000 0.2488822 -0.4407045 -0.5914760
negative 0.2488822 1.0000000 -0.2105380 -0.7146856
literacy -0.4407045 -0.2105380 1.0000000 0.2318694
else.words -0.5914760 -0.7146856 0.2318694 1.0000000
`r FT(orange, 5, text="♦")` `r FUN("dispersion_plot")` - Understand the Search`r FT(orange, 5, text="♦")`
## Compare what each search term matches with and without padding spaces
term_match(raj$dialogue, c(" love ", "love", " night ", "night"))
$` love `
[1] "love"
$love
[1] "love" "love's" "lovers" "loved" "lovely"
[6] "lovest" "lover" "beloved" "loves" "newbeloved"
[11] "glove" "lovesong" "lovedevouring" "loveperforming" "dearloved"
$` night `
[1] "night"
$night
[1] "night" "fortnight" "nights" "tonight" "night's" "knight"
[7] "nightingale" "nightly" "yesternight"
`r FT(orange, 5, text="♦")` **`r FUN("dispersion_plot")`** - *Example 1*`r FT(orange, 5, text="♦")`
wzxhzdk:246`r FT(orange, 5, text="♦")` **`r FUN("dispersion_plot")`** - *Example 2: Color Schemes*`r FT(orange, 5, text="♦")`
wzxhzdk:247 Using `r FUN("dispersion_plot")` with `r FUN("freq_terms")`'s [["rfswl"]][["all"]] can be a useful means of viewing the dispersion of high frequency words after stopword removal.`r FT(orange, 5, text="♦")` **`r FUN("dispersion_plot")`** - *Example 3: Using with `r FUN("freq_terms")`*`r FT(orange, 5, text="♦")`
## Frequent terms in the debate dialogue, passing Top200Words as stopwords
wrds <- freq_terms(pres_debates2012$dialogue, stopwords = Top200Words)
## Add leading/trailing spaces if desired
wrds2 <- spaste(wrds)
## Use `~~` to maintain spaces
## Prepend the multi-word term and drop elements 3 and 12 of the padded terms
wrds2 <- c(" governor~~romney ", wrds2[-c(3, 12)])
## Plot
## Dispersion of the terms in `wrds2`, with `time` passed as rm.vars
with(pres_debates2012 , dispersion_plot(dialogue, wrds2, rm.vars = time,
color="black", bg.color="white"))
wzxhzdk:248
list(
theme_1 = c("word1", "word2", "word3"),
theme_2 = c("word4", "word5"),
theme_3 = c("word6", "word7", "word8")
)
The cloud.colors argument takes a single color or a vector of colors 1 greater than the number of vectors of target.words. The order of cloud.colors corresponds to the order of target.words with the extra, final color being utilized for all words not matched to target.words.
`r FT(orange, 5, text="♦")` **`r FUN("trans_cloud")` Example 1**`r FT(orange, 5, text="♦")`
wzxhzdk:249`r FT(orange, 5, text="♦")` **`r FUN("trans_cloud")` Example 2** - *Polarity*`r FT(orange, 5, text="♦")`
wzxhzdk:250`r FT(orange, 5, text="♦")` **`r FUN("gradient_cloud")` Examples**`r FT(orange, 5, text="♦")`
wzxhzdk:251 wzxhzdk:252 wzxhzdk:253`r FT(orange, 5, text="♦")` **`r FUN("gantt_plot")`** - *Single Time/Single Grouping Variable*`r FT(orange, 5, text="♦")`
wzxhzdk:254`r FT(orange, 5, text="♦")` **`r FUN("gantt_plot")`** - *Single Time/Multiple Grouping Variable*`r FT(orange, 5, text="♦")`
wzxhzdk:255 Sometimes the location of the facets may not be ideal to show the data (i.e., you may want to reverse the x and y axes). By setting transform = TRUE the user can make this switch.`r FT(orange, 5, text="♦")` **`r FUN("gantt_plot")`** - *Transforming*`r FT(orange, 5, text="♦")`
wzxhzdk:256 Often the default colors are less useful in displaying the trends in a way that is most meaningful. Because `r FUN("gantt_plot")` is a wrapper for ggplot2 the color palettes can easily be extended to use with the output from `r FUN("gantt_plot")`.`r FT(orange, 5, text="♦")` **`r FUN("gantt_plot")`** - *Color Palette Examples*`r FT(orange, 5, text="♦")`
wzxhzdk:257 At times it may be useful to fill the bar colors by another grouping variable. The fill.var argument allows another coloring variable to be utilized.`r FT(orange, 5, text="♦")` **`r FUN("gantt_plot")`** - *Fill Variable Example 1*`r FT(orange, 5, text="♦")`
wzxhzdk:258`r FT(orange, 5, text="♦")` **`r FUN("gantt_plot")`** - *Fill Variable Example 2*`r FT(orange, 5, text="♦")`
wzxhzdk:259 Be wary though of using coloring to show what faceting would show better. Here is an example of faceting versus the color fill used in the `r HR("#examp1", "Fill Variable Example 1")` above.`r FT(orange, 5, text="♦")` **`r FUN("gantt_plot")`** - *Facet Instead of Fill Variable*`r FT(orange, 5, text="♦")`
wzxhzdk:260`r FT(orange, 5, text="♦")` **`r FUN("qheat")`** - *Basic Example*`r FT(orange, 5, text="♦")`
wzxhzdk:261`r FT(orange, 5, text="♦")` **`r FUN("qheat")`** - *Color Group Labels Example*`r FT(orange, 5, text="♦")`
wzxhzdk:262`r FT(orange, 5, text="♦")` **`r FUN("qheat")`** - *Order By Numeric Variable Examples*`r FT(orange, 5, text="♦")`
wzxhzdk:263`r FT(orange, 5, text="♦")` **`r FUN("qheat")`** - *Cell Labels Examples*`r FT(orange, 5, text="♦")`
wzxhzdk:264`r FT(orange, 5, text="♦")` **`r FUN("qheat")`** - *Custom Cell Labels Example*`r FT(orange, 5, text="♦")`
wzxhzdk:265`r FT(orange, 5, text="♦")` **`r FUN("qheat")`** - *Grid Examples*`r FT(orange, 5, text="♦")`
wzxhzdk:266`r FT(orange, 5, text="♦")` **`r FUN("qheat")`** - *Facet Examples*`r FT(orange, 5, text="♦")`
wzxhzdk:267`r FT(orange, 5, text="♦")` **`r FUN("qheat")`** - *Transposing Examples*`r FT(orange, 5, text="♦")`
wzxhzdk:268 When plotting a correlation/distance matrix set `r FT(green, courier_new, text="diag.na = TRUE")` to keep these extreme values from affecting the scaling.`r FT(orange, 5, text="♦")` **`r FUN("qheat")`** - *Correlation Matrix Examples*`r FT(orange, 5, text="♦")`
wzxhzdk:269`r FT(orange, 5, text="♦")` **`r FUN("rank_freq_mplot")`**`r FT(orange, 5, text="♦")`
wzxhzdk:270`r FT(orange, 5, text="♦")` **`r FUN("rank_freq_mplot")`** - *Using alpha*`r FT(orange, 5, text="♦")`
wzxhzdk:271 The `r FUN("rank_freq_plot", "rank_freq_mplot")` function plots more quickly but does not handle multiple groups and does not take text/grouping variables directly.`r FT(orange, 5, text="♦")` **`r FUN("rank_freq_plot", "rank_freq_mplot")` Example** `r FT(orange, 5, text="♦")`
wzxhzdk:272`r FT(orange, 5, text="♦")` **`r FUN("tot_plot")`** - *Examples* `r FT(orange, 5, text="♦")`
wzxhzdk:273 wzxhzdk:274`r FT(orange, 5, text="♦")` **`r FUN("tot_plot")`** - *Facet Variables* `r FT(orange, 5, text="♦")`
wzxhzdk:275 Because `r FUN("tot_plot")` is based on the `r HR("http://docs.ggplot2.org/current/", "ggplot2 package")` (Wickham, 2009) and `r FUN("tot_plot")` invisibly returns the ggplot2 object, the output (of the class "ggplot") can be altered in the same way that another ggplot2 object can be. In the following examples the color palette is altered.`r FT(orange, 5, text="♦")` **`r FUN("tot_plot")`** - *Alter Colors* `r FT(orange, 5, text="♦")`
wzxhzdk:276`r FT(orange, 5, text="♦")` **`r FUN("tot_plot")`** - *Add Mean +2/+3 sd* `r FT(orange, 5, text="♦")`
wzxhzdk:277`r FT(orange, 5, text="♦")` **`r FUN("word_network_plot")`** - *Between Turns of Talk: All Words* `r FT(orange, 5, text="♦")`
wzxhzdk:279`r FT(orange, 5, text="♦")` **`r FUN("word_network_plot")`** - *Between People* `r FT(orange, 5, text="♦")`
wzxhzdk:280`r FT(orange, 5, text="♦")` **`r FUN("word_network_plot")`** - *Between sex and adult* `r FT(orange, 5, text="♦")`
wzxhzdk:281`r FT(orange, 5, text="♦")` **`r FUN("word_network_plot")`** - *`log.labels`* `r FT(orange, 5, text="♦")`
wzxhzdk:282`r FT(orange, 5, text="♦")` `r FUN("end_inc")` Examples `r FT(orange, 5, text="♦")`
wzxhzdk:283`r FT(orange, 5, text="♦")` `r FUN("end_mark")` Example `r FT(orange, 5, text="♦")`
wzxhzdk:284`r FT(orange, 5, text="♦")` `r FUN("end_mark")` - Grab Sentence Types`r FT(orange, 5, text="♦")`
wzxhzdk:285`r FT(orange, 5, text="♦")` `r FUN("ID")` - Grab Sentence Types`r FT(orange, 5, text="♦")`
wzxhzdk:286`r FT(orange, 5, text="♦")` `r FUN("imperative")` - Imperative Data`r FT(orange, 5, text="♦")`
wzxhzdk:287 name statement
1 sue go get it|
2 greg I hate to read.
3 tyler Stop running!
4 phil I like it!
5 sue You are terrible!
6 greg Don't!
7 tyler Greg, go to the red, brick office.
8 phil Tyler go to the gym.
9 sue Alex don't run.
`r FT(orange, 5, text="♦")` `r FUN("imperative")` - Re-mark End Marks`r FT(orange, 5, text="♦")`
imperative(dat, "name", "statement", additional.names = c("Alex"))
name statement
1 sue go get it*|
2 greg I hate to read.
3 tyler Stop running*!
4 phil I like it!
5 sue You are terrible!
6 greg Don't*!
7 tyler Greg, go to the red, brick office*.
8 phil Tyler go to the gym*.
9 sue Alex don't run*.
`r FT(orange, 5, text="♦")` `r FUN("imperative")` - Handle Incomplete Sentences`r FT(orange, 5, text="♦")`
imperative(dat, "name", "statement", lock.incomplete = TRUE, "Alex")
name statement
1 sue go get it|
2 greg I hate to read.
3 tyler Stop running*!
4 phil I like it!
5 sue You are terrible!
6 greg Don't*!
7 tyler Greg, go to the red, brick office*.
8 phil Tyler go to the gym*.
9 sue Alex don't run*.
`r FT(orange, 5, text="♦")` `r FUN("imperative")` - Warning Report`r FT(orange, 5, text="♦")`
imperative(dat, "name", "statement", additional.names = "Alex", warning=TRUE)
name statement warnings
1 sue go get it*| -
2 greg I hate to read. read
3 tyler Stop running*! -
4 phil I like it! -
5 sue You are terrible! -
6 greg Don't*! -
7 tyler Greg, go to the red, brick office*. 2 commas
8 phil Tyler go to the gym*. -
9 sue Alex don't run*. AAVE
`r FT(orange, 5, text="♦")` `r FUN("as.tdm")` & `r FUN("as.dtm", "as.tdm")` - From Raw Text Example 1`r FT(orange, 5, text="♦")`
wzxhzdk:288`r FT(orange, 5, text="♦")` `r FUN("as.tdm")` & `r FUN("as.dtm", "as.tdm")` - From Raw Text Example 2`r FT(orange, 5, text="♦")`
wzxhzdk:289library(tm)
plot(pres, corThreshold = 0.8)
wzxhzdk:290
wzxhzdk:291
plot(pres2, corThreshold = 0.95)
wzxhzdk:292
`r FT(orange, 5, text="♦")` `r FUN("as.tdm")` & `r FUN("as.dtm", "as.tdm")` - From `r FUN("wfm")``r FT(orange, 5, text="♦")`
wzxhzdk:293plot(as.tdm(x))
wzxhzdk:294
apply_as_tm(a, tm:::plot.TermDocumentMatrix, corThreshold = 0.4)
wzxhzdk:297
wzxhzdk:298
If there is a discrepancy between the R and Java architectures you will have to download the appropriate version of Java compatible with the version of R you're using. For more see Tal Galili's blog post regarding rJava issues.
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.