Nothing
#' Search for tokens.
#'
#' Searches through transcripts for tokens matching the given pattern.
#'
#' @param labbcat.url URL to the LaBB-CAT instance
#' @param pattern An object representing the pattern to search for.
#'
#' This can be:
#' \itemize{
#' \item{A string, representing a search of the orthography layer - spaces are
#' taken to be word boundaries}
#' \item{A single named list, representing a one-column search - names are taken to be layer IDs}
#' \item{A list of named lists, representing a multi-column search - the outer list
#' represents the columns of the search matrix where each column 'immediately
#' follows' the previous, and the names of the inner lists are taken to be layer IDs}
#' \item{A named list fully replicating the structure of the search matrix in the
#' LaBB-CAT browser interface, with one element called ``columns'', containing a
#' named list for each column.
#'
#' Each element in the ``columns'' named list contains an element named ``layers'', whose
#' value is a named list for patterns to match on each layer, and optionally an
#' element named ``adj'', whose value is a number representing the maximum distance, in
#' tokens, between this column and the next column - if ``adj'' is not specified, the
#' value defaults to 1, so tokens are contiguous.
#'
#' Each element in the ``layers'' named list is named after the layer it matches, and the
#' value is a named list with the following possible elements:
#' \itemize{
#' \item{\emph{pattern} A regular expression to match against the label}
#' \item{\emph{min} An inclusive minimum numeric value for the label}
#' \item{\emph{max} An exclusive maximum numeric value for the label}
#' \item{\emph{not} TRUE to negate the match}
#' \item{\emph{anchorStart} TRUE to anchor to the start of the annotation on this layer
#' (i.e. the matching word token will be the first at/after the start of the matching
#' annotation on this layer)}
#' \item{\emph{anchorEnd} TRUE to anchor to the end of the annotation on this layer
#' (i.e. the matching word token will be the last before/at the end of the matching
#' annotation on this layer)}
#' \item{\emph{target} TRUE to make this layer the target of the search; the
#' results will contain one row for each match on the target layer}
#' }
#' }
#' }
#'
#' Examples of valid pattern objects include:
#' \preformatted{
#' ## the word 'the' followed immediately by a word starting with an orthographic vowel
#' pattern <- "the [aeiou]"
#'
#' ## a word spelt with "k" but pronounced "n" word initially
#' pattern <- list(orthography = "k.*", phonemes = "n.*")
#'
#' ## the word 'the' followed immediately by a word starting with an phonemic vowel
#' pattern <- list(
#' list(orthography = "the"),
#' list(phonemes = "[cCEFHiIPqQuUV0123456789~#\\$@].*"))
#'
#' ## the word 'the' followed immediately or with one intervening word by
#' ## a hapax legomenon (word with a frequency of 1) that doesn't start with a vowel
#' pattern <- list(columns = list(
#' list(layers = list(
#' orthography = list(pattern = "the")),
#' adj = 2),
#' list(layers = list(
#' phonemes = list(not = TRUE, pattern = "[cCEFHiIPqQuUV0123456789~#\\$@].*"),
#' frequency = list(max = "2")))))
#' }
#' @param participant.expression An optional participant query expression for identifying
#' participants to search the utterances of. This should be the output of
#' \link{expressionFromIds}, \link{expressionFromAttributeValue},
#' or \link{expressionFromAttributeValues}, or more than one concatentated together
#' and delimited by ' && '. If not supplied, utterances of all participants will be searched.
#' @param transcript.expression An optional transript query expression for identifying
#' transcripts to search in. This should be the output of \link{expressionFromIds},
#' \link{expressionFromTranscriptTypes}, \link{expressionFromAttributeValue},
#' or \link{expressionFromAttributeValues}, or more than one concatentated together
#' and delimited by ' && '. If not supplied, all transcripts will be searched.
#' @param main.participant TRUE to search only main-participant utterances, FALSE to
#' search all utterances.
#' @param aligned This parameter is deprecated and will be removed in future versions;
#' please use anchor.confidence.min=50 instead.
#' @param matches.per.transcript Optional maximum number of matches per transcript to
#' return. NULL means all matches.
#' @param words.context Number of words context to include in the `Before.Match' and
#' `After.Match' columns in the results.
#' @param max.matches The maximum number of matches to return, or null to return all.
#' @param overlap.threshold The percentage overlap with other utterances before
#' simultaneous speech is excluded, or null to include overlapping speech.
#' @param anchor.confidence.min The minimum confidence for alignments, e.g.
#' \itemize{
#' \item{\emph{0} -- return all alignments, regardless of confidence;}
#' \item{\emph{50} -- return only alignments that have been at least automatically aligned;}
#' \item{\emph{100} -- return only manually-set alignments.}
#' }
#' @param page.length In order to prevent timeouts when there are a large number of
#' matches or the network connection is slow, rather than retrieving matches in one
#' big request, they are retrieved using many smaller requests. This parameter
#' controls the number of results retrieved per request.
#' @param no.progress TRUE to supress visual progress bar. Otherwise, progress bar will be
#' shown when interactive().
#' @return A data frame identifying matches, containing the following columns:
#' \itemize{
#' \item{\emph{SearchName} A name based on the pattern -- the same for all rows}
#' \item{\emph{MatchId} A unique ID for the matching target token}
#' \item{\emph{Transcript} Name of the transcript in which the match was found}
#' \item{\emph{Participant} Name of the speaker}
#' \item{\emph{Corpus} The corpus of the transcript}
#' \item{\emph{Line} The start offset of the utterance/line}
#' \item{\emph{LineEnd} The end offset of the utterance/line}
#' \item{\emph{Before.Match} Transcript text immediately before the match}
#' \item{\emph{Text} Transcript text of the match}
#' \item{\emph{After.Match} Transcript text immediately after the match}
#' \item{\emph{Number} Row number}
#' \item{\emph{URL} URL of the first matching word token}
#' \item{\emph{Target.word} Text of the target word token}
#' \item{\emph{Target.word.start} Start offset of the target word token}
#' \item{\emph{Target.word.end} End offset of the target word token}
#' \item{\emph{Target.segment} Label of the target segment (only present if the segment
#' layer is included in the pattern)}
#' \item{\emph{Target.segment.start} Start offset of the target segment (only present if the
#' segment layer is included in the pattern)}
#' \item{\emph{Target.segment.end} End offset of the target segment (only present if the
#' segment layer is included in the pattern)}
#' }
#'
#' @seealso \code{\link{getFragments}}
#' @seealso \code{\link{getSoundFragments}}
#' @seealso \code{\link{getMatchLabels}}
#' @seealso \code{\link{getMatchAlignments}}
#' @seealso \code{\link{processWithPraat}}
#' @seealso \code{\link{getParticipantIds}}
#'
#' @examples
#' \dontrun{
#' ## define the LaBB-CAT URL
#' labbcat.url <- "https://labbcat.canterbury.ac.nz/demo/"
#'
#' ## the word 'the' followed immediately by a word starting with an orthographic vowel
#' theThenOrthVowel <- getMatches(labbcat.url, "the [aeiou]")
#'
#' ## a word spelt with "k" but pronounced "n" word initially
#' knWords <- getMatches(labbcat.url, list(orthography = "k.*", phonemes = "n.*"))
#'
#' ## the word 'the' followed immediately by a word starting with an phonemic vowel
#' theThenPhonVowel <- getMatches(
#' labbcat.url, list(
#' list(orthography = "the"),
#' list(phonemes = "[cCEFHiIPqQuUV0123456789~#\\$@].*")))
#'
#' ## the word 'the' followed immediately or with one intervening word by
#' ## a hapax legomenon (word with a frequency of 1) that doesn't start with a vowel
#' results <- getMatches(
#' labbcat.url, list(columns = list(
#' list(layers = list(
#' orthography = list(pattern = "the")),
#' adj = 2),
#' list(layers = list(
#' phonemes = list(not=TRUE, pattern = "[cCEFHiIPqQuUV0123456789~#\\$@].*"),
#' frequency = list(max = "2"))))),
#' overlap.threshold = 5)
#'
#' ## all tokens of the KIT vowel, from the interview or monologue
#' ## of the participants AP511_MikeThorpe and BR2044_OllyOhlson
#' results <- getMatches(labbcat.url, list(segment="I"),
#' participant.expression = expressionFromIds(c("AP511_MikeThorpe","BR2044_OllyOhlson")),
#' transcript.expression = expressionFromTranscriptTypes(c("interview","monologue")))
#'
#' ## all tokens of the KIT vowel for male speakers who speak English
#' results <- getMatches(labbcat.url, list(segment="I"),
#' participant.expression = paste(
#' expressionFromAttributeValue("participant_gender", "M"),
#' expressionFromAttributeValues("participant_languages_spoken", "en"),
#' sep=" && "))
#'
#' ## results$Text is the text that matched
#' ## results$MatchId can be used to access results using other functions
#' }
#'
#' @keywords search
#'
getMatches <- function(labbcat.url, pattern, participant.expression=NULL, transcript.expression=NULL, main.participant=TRUE, aligned=NULL, matches.per.transcript=NULL, words.context=0, max.matches=NULL, overlap.threshold=NULL, anchor.confidence.min=NULL, page.length=1000, no.progress=FALSE) {
## if they've explicitly passed a value for aligned
if (!is.null(aligned)) {
print("WARNING: the getMatches parameter 'aligned' is deprecated; please use anchor.confidence.min=50 instead")
} else { ## use previous default value
aligned = FALSE
## if they've just followed the advice above, but their LaBB-CAT version is old
## we need to make sure that it still behaves as it did before
if (!is.null(anchor.confidence.min)) {
if (anchor.confidence.min >= 50) {
aligned = TRUE ## (this has no effect on newer versions of LaBB-CAT)
}
}
}
## first normalize the pattern...
if (is.character(pattern) && length(pattern) == 1) { # it's a string
## assume it's an orthography search
tokens <- strsplit(pattern," ")
pattern <- list()
for (token in tokens[[1]]) {
pattern[[length(pattern)+1]] <- list(orthography=token)
} ## next token
} # it's a string
## if pattern isn't a list, convert it to one
if (!is.list(pattern)) pattern <- as.list(pattern)
## if pattern isn't a list with a "columns" element, wrap a list around it
if (is.null(pattern$columns)) pattern <- list(columns = pattern)
## if pattern$columns isn't a 'nameless' list (i.e. with numeric indices) wrap a
## nameless list around it
if (!is.null(names(pattern$columns))) pattern$columns <- list(pattern$columns)
## columns contain lists with no "layers" element, wrap a list around them
for (c in 1:length(pattern$columns)) {
if (is.null(pattern$columns[[c]]$layers)) {
pattern$columns[[c]] <- list(layers = pattern$columns[[c]])
}
} # next column
## convert layer=string to layer=list(pattern=string)
target.layer <- "word"
for (c in 1:length(pattern$columns)) { # for each column
for (l in names(pattern$columns[[c]]$layers)) { # for each layer in the column
# if the layer value isn't a list
if (!is.list(pattern$columns[[c]]$layers[[l]])) {
# wrap a list(pattern=...) around it
pattern$columns[[c]]$layers[[l]] <- list(pattern = pattern$columns[[c]]$layers[[l]])
} # value isn't a list
# if they're searching the segment layer, assume it's the target
if (l == "segment" && target.layer == "word") {
target.layer <- "segment"
}
# ... unless there's an explicitly selected target
if (!is.null(pattern$columns[[c]]$layers[[l]]$target)
&& pattern$columns[[c]]$layers[[l]]$target) {
target.layer <- l
}
} # next layer
} # next column
## start the search
searchJson <- jsonlite::toJSON(pattern, auto_unbox = TRUE)
parameters <- list(command="search", searchJson=searchJson,
words_context=words.context)
if (main.participant) {
parameters$only_main_speaker <- TRUE
}
if (aligned) {
parameters$only_aligned <- TRUE
}
if (!is.null(anchor.confidence.min)) {
parameters$offsetThreshold <- anchor.confidence.min
}
if (!is.null(matches.per.transcript)) {
parameters$matches_per_transcript <- as.list(matches.per.transcript)
}
if (!is.null(participant.expression)) {
if (length(participant.expression) > 1 # it's a list, not a string
|| !grepl("'", participant.expression)) { # or it is a string, but not an expression
## for backwards compatibility for when the 3rd parameter was participant.ids
## we convert a list of IDs to the appropriate participant expression
participant.expression <- expressionFromIds(participant.expression)
} # it's a list, not a string
parameters$participant_expression <- as.list(participant.expression)
}
if (!is.null(transcript.expression)) {
if (length(transcript.expression) > 1 # it's a list, not a string
|| !grepl("'", transcript.expression)) { # or it is a string, but not an expression
## for backwards compatibility for when the 4th parameter was trascript.types
## we convert a list of IDs to the appropriate transcript expression
transcript.expression <- expressionFromTranscriptTypes(transcript.expression)
} # it's a list, not a string
parameters$transcript_expression <- as.list(transcript.expression)
}
if (!is.null(overlap.threshold)) {
parameters$overlap_threshold <- overlap.threshold
}
resp <- http.get(labbcat.url, "api/search", parameters)
if (is.null(resp)) return()
deprecatedApi <- FALSE
if (httr::status_code(resp) == 404) { # server version prior to 20230511.1949
resp <- http.get(labbcat.url, "search", parameters) # use deprecated endpoint
if (is.null(resp)) return()
deprecatedApi <- TRUE
}
resp.content <- httr::content(resp, as="text", encoding="UTF-8")
if (httr::status_code(resp) != 200) { # 200 = OK
print(paste("ERROR: ", httr::http_status(resp)$message))
print(resp.content)
return()
}
resp.json <- jsonlite::fromJSON(resp.content)
for (error in resp.json$errors) print(error)
## we get a task ID back
threadId <- resp.json$model$threadId
pb <- NULL
if (interactive() && !no.progress) {
pb <- txtProgressBar(min = 0, max = 100, style = 3)
}
## monitor the task until it finishes
thread <- thread.get(labbcat.url, threadId)
if (is.null(thread)) {
return()
}
while (thread$running) {
if (!is.null(pb) && !is.null(thread$percentComplete)) {
setTxtProgressBar(pb, thread$percentComplete)
}
Sys.sleep(1)
thread <- thread.get(labbcat.url, threadId)
if (is.null(thread)) {
return()
}
} # poll until finished
if (!is.null(pb)) {
if (!is.null(thread$percentComplete)) {
setTxtProgressBar(pb, thread$percentComplete)
close(pb)
}
if (!is.null(thread$status)) {
if (thread$size > 0) {
cat(paste(thread$status, " - fetching data...", "\n", sep=""))
} else {
cat(paste(thread$status, "\n", sep=""))
}
}
}
## define the dataframe to return (which is, for now, empty)
allMatches <- data.frame(matrix(ncol = 15, nrow = 0))
if (target.layer != "word") {
allMatches <- data.frame(matrix(ncol = 18, nrow = 0))
}
if (thread$size > 0) { ## there were actually some matches
## ensure labbcat base URL has a trailing slash (for URL reconstruction)
if (!grepl("/$", labbcat.url)) labbcat.url <- paste(labbcat.url, "/", sep="")
## layers - "word", and "segment" if mentioned in the pattern
tokenLayers <- c("word")
if (target.layer != "word") tokenLayers <- c("word", target.layer)
## search results can be very large, and httr timeouts are short and merciless,
## so we break the results into chunks and retrieve them using lots of small
## requests instead of one big request
totalMatches <- min(thread$size, max.matches) ## (works even if max.matches == NULL)
matchesLeft <- totalMatches
pageNumber <- 0
pb <- NULL
if (interactive() && !no.progress) {
pb <- txtProgressBar(min = 0, max = matchesLeft, style = 3)
}
endpoint <- "api/results"
if (deprecatedApi) endpoint <- "resultsStream" # server version prior to 20230511.1949
while(matchesLeft > 0) { ## loop until we've got all the matches we want
resp <- http.get(labbcat.url,
endpoint,
list(threadId=threadId, words_context=words.context,
pageLength=page.length, pageNumber=pageNumber),
content.type="application/json")
if (is.null(resp)) break
if (httr::status_code(resp) != 200) { # 200 = OK
print(paste("ERROR: ", httr::http_status(resp)$message))
print(httr::content(resp, as="text", encoding="UTF-8"))
break
}
resp.content <- httr::content(resp, as="text", encoding="UTF-8")
resp.json <- jsonlite::fromJSON(resp.content)
matches <- resp.json$model$matches
## decrement the number of rows left to get
matchesLeft <- matchesLeft - nrow(matches)
## ensure we don't have too many rows (on the last page)
if (matchesLeft < 0) {
matches <- head(matches, page.length + matchesLeft)
}
matches <- cbind(resp.json$model$name, matches)
## extract number from MatchId
matches <- cbind(
matches, as.numeric(stringr::str_match(matches$MatchId, "prefix=0*([0-9]+)-")[,2]))
if (deprecatedApi) {
## reconstruct url
matches <- cbind(
matches, paste(
labbcat.url, "transcript?transcript=",
matches$Transcript, "#",
stringr::str_match(matches$MatchId, "\\[0\\]=([^;]*)(;.*|$)")[,2], sep=""))
}
## Ensure previous default anchor.confidence.min behaviour still works
## i.e. if they don't specify a value, then only autmatically aligned offsets
## are returned.
## If they really want all offsets regardless of confidence, they can call with
## anchor.confidence.min=0
if (is.null(anchor.confidence.min)) {
anchor.confidence.min = 50
}
## get the alignments
tokens <- getMatchAlignments(
labbcat.url, matches$MatchId, tokenLayers,
anchor.confidence.min=anchor.confidence.min, no.progress=T)
matches <- cbind(matches, tokens)
## add this chunk to the collection
allMatches <- rbind(allMatches, matches)
if (!is.null(pb)) {
setTxtProgressBar(pb, nrow(allMatches))
}
## next page
pageNumber <- pageNumber + 1
} ## loop
## finished with the progress bar
if (!is.null(pb)) {
setTxtProgressBar(pb, nrow(allMatches))
close(pb)
}
} ## there are matches
if (deprecatedApi) {
frameNames <- c(
"SearchName","MatchId","Transcript","Participant","Corpus","Line","LineEnd",
"Before.Match","Text","After.Match","Number","URL",
"Target.word","Target.word.start","Target.word.end")
} else {
frameNames <- c(
"SearchName","Transcript","Participant","Corpus","Line","LineEnd",
"MatchId","URL","Before.Match","Text","After.Match","Number",
"Target.word","Target.word.start","Target.word.end")
}
if (target.layer != "word") {
frameNames <- c(frameNames,
c(paste("Target.",target.layer,sep=""),
paste("Target.",target.layer,".start",sep=""),
paste("Target.",target.layer,".end",sep="")))
}
names(allMatches) = frameNames
## free the search thread so it's not using server resources
http.get(labbcat.url, "threads", list(threadId=threadId, command="release"))
return(allMatches)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.