R/getMatches.R

Defines functions getMatches

Documented in getMatches

#' Search for tokens.
#'
#' Searches through transcripts for tokens matching the given pattern.
#'
#' @param labbcat.url URL to the LaBB-CAT instance
#' @param pattern An object representing the pattern to search for.
#'
#' This can be:
#' \itemize{
#'  \item{A string, representing a search of the orthography layer - spaces are
#'        taken to be word boundaries}
#'  \item{A single named list, representing a one-column search - names are taken to be layer IDs}
#'  \item{A list of named lists, representing a multi-column search - the outer list
#'        represents the columns of the search matrix where each column 'immediately
#'        follows' the previous, and the names of the inner lists are taken to be layer IDs} 
#'  \item{A named list fully replicating the structure of the search matrix in the
#'        LaBB-CAT browser interface, with one element called ``columns'', containing a
#'        named list for each column.
#' 
#'        Each element in the ``columns'' named list contains an element named ``layers'', whose
#'     value is a named list for patterns to match on each layer, and optionally an
#'     element named ``adj'', whose value is a number representing the maximum distance, in
#'     tokens, between this column and the next column - if ``adj'' is not specified, the
#'     value defaults to 1, so tokens are contiguous.
#'
#'         Each element in the ``layers'' named list is named after the layer it matches, and the
#'     value is a named list with the following possible elements:
#'         \itemize{
#'          \item{\emph{pattern}  A regular expression to match against the label}
#'          \item{\emph{min}  An inclusive minimum numeric value for the label}
#'          \item{\emph{max}  An exclusive maximum numeric value for the label}
#'          \item{\emph{not}  TRUE to negate the match}
#'          \item{\emph{anchorStart}  TRUE to anchor to the start of the annotation on this layer
#'             (i.e. the matching word token will be the first at/after the start of the matching
#'             annotation on this layer)}
#'          \item{\emph{anchorEnd}  TRUE to anchor to the end of the annotation on this layer
#'             (i.e. the matching word token will be the last before/at the end of the matching
#'             annotation on this layer)}
#'          \item{\emph{target}  TRUE to make this layer the target of the search; the
#'             results will contain one row for each match on the target layer}
#'       }
#'  }
#' }
#'
#' Examples of valid pattern objects include:
#' \preformatted{
#' ## the word 'the' followed immediately by a word starting with an orthographic vowel
#' pattern <- "the [aeiou]"
#' 
#' ## a word spelt with "k" but pronounced "n" word initially
#' pattern <- list(orthography = "k.*", phonemes = "n.*")
#' 
#' ## the word 'the' followed immediately by a word starting with an phonemic vowel
#' pattern <- list(
#'     list(orthography = "the"),
#'     list(phonemes = "[cCEFHiIPqQuUV0123456789~#\\$@].*"))
#' 
#' ## the word 'the' followed immediately or with one intervening word by
#' ## a hapax legomenon (word with a frequency of 1) that doesn't start with a vowel
#' pattern <- list(columns = list(
#'     list(layers = list(
#'            orthography = list(pattern = "the")),
#'          adj = 2),
#'     list(layers = list(
#'            phonemes = list(not = TRUE, pattern = "[cCEFHiIPqQuUV0123456789~#\\$@].*"),
#'            frequency = list(max = "2")))))
#' }
#' @param participant.expression An optional participant query expression for identifying
#'     participants to search the utterances of. This should be the output of
#'     \link{expressionFromIds}, \link{expressionFromAttributeValue},
#'     or \link{expressionFromAttributeValues}, or more than one concatentated together
#'     and delimited by ' && '. If not supplied, utterances of all participants will be searched.
#' @param transcript.expression An optional transript query expression for identifying
#'     transcripts to search in. This should be the output of \link{expressionFromIds},
#'     \link{expressionFromTranscriptTypes}, \link{expressionFromAttributeValue},
#'     or \link{expressionFromAttributeValues}, or more than one concatentated together
#'     and delimited by ' && '. If not supplied, all transcripts will be searched.
#' @param main.participant TRUE to search only main-participant utterances, FALSE to
#'     search all utterances.
#' @param aligned This parameter is deprecated and will be removed in future versions;
#'     please use anchor.confidence.min=50 instead.
#' @param matches.per.transcript Optional maximum number of matches per transcript to
#'     return. NULL means all matches.
#' @param words.context Number of words context to include in the `Before.Match' and
#'     `After.Match' columns in the results.
#' @param max.matches The maximum number of matches to return, or null to return all.
#' @param overlap.threshold The percentage overlap with other utterances before
#'     simultaneous speech is excluded, or null to include overlapping speech.
#' @param anchor.confidence.min The minimum confidence for alignments, e.g.
#' \itemize{
#'  \item{\emph{0} -- return all alignments, regardless of confidence;}
#'  \item{\emph{50} -- return only alignments that have been at least automatically aligned;}
#'  \item{\emph{100} -- return only manually-set alignments.}
#' }
#' @param page.length In order to prevent timeouts when there are a large number of
#'     matches or the network connection is slow, rather than retrieving matches in one
#'     big request, they are retrieved using many smaller requests. This parameter
#'     controls the number of results retrieved per request.
#' @param no.progress TRUE to supress visual progress bar. Otherwise, progress bar will be
#'     shown when interactive().
#' @return A data frame identifying matches, containing the following columns:
#' \itemize{
#'  \item{\emph{SearchName} A name based on the pattern -- the same for all rows}
#'  \item{\emph{MatchId} A unique ID for the matching target token}
#'  \item{\emph{Transcript} Name of the transcript in which the match was found}
#'  \item{\emph{Participant} Name of the speaker}
#'  \item{\emph{Corpus} The corpus of the transcript}
#'  \item{\emph{Line} The start offset of the utterance/line}
#'  \item{\emph{LineEnd} The end offset of the utterance/line}
#'  \item{\emph{Before.Match} Transcript text immediately before the match}
#'  \item{\emph{Text} Transcript text of the match}
#'  \item{\emph{After.Match} Transcript text immediately after the match}
#'  \item{\emph{Number} Row number}
#'  \item{\emph{URL} URL of the first matching word token}
#'  \item{\emph{Target.word} Text of the target word token}
#'  \item{\emph{Target.word.start} Start offset of the target word token}
#'  \item{\emph{Target.word.end} End offset of the target word token}
#'  \item{\emph{Target.segment} Label of the target segment (only present if the segment
#'     layer is included in the pattern)}
#'  \item{\emph{Target.segment.start} Start offset of the target segment (only present if the
#'     segment layer is included in the pattern)}
#'  \item{\emph{Target.segment.end} End offset of the target segment (only present if the
#'     segment layer is included in the pattern)}
#' }
#' 
#' @seealso \code{\link{getFragments}}
#' @seealso \code{\link{getSoundFragments}}
#' @seealso \code{\link{getMatchLabels}}
#' @seealso \code{\link{getMatchAlignments}}
#' @seealso \code{\link{processWithPraat}}
#' @seealso \code{\link{getParticipantIds}}
#' 
#' @examples 
#' \dontrun{
#' ## define the LaBB-CAT URL
#' labbcat.url <- "https://labbcat.canterbury.ac.nz/demo/"
#'
#' ## the word 'the' followed immediately by a word starting with an orthographic vowel
#' theThenOrthVowel <- getMatches(labbcat.url, "the [aeiou]")
#'
#' ## a word spelt with "k" but pronounced "n" word initially
#' knWords <- getMatches(labbcat.url, list(orthography = "k.*", phonemes = "n.*"))
#'
#' ## the word 'the' followed immediately by a word starting with an phonemic vowel
#' theThenPhonVowel <- getMatches(
#'   labbcat.url, list(
#'     list(orthography = "the"),
#'     list(phonemes = "[cCEFHiIPqQuUV0123456789~#\\$@].*")))
#' 
#' ## the word 'the' followed immediately or with one intervening word by
#' ## a hapax legomenon (word with a frequency of 1) that doesn't start with a vowel
#' results <- getMatches(
#'   labbcat.url, list(columns = list(
#'     list(layers = list(
#'            orthography = list(pattern = "the")),
#'          adj = 2),
#'     list(layers = list(
#'            phonemes = list(not=TRUE, pattern = "[cCEFHiIPqQuUV0123456789~#\\$@].*"),
#'            frequency = list(max = "2"))))),
#'   overlap.threshold = 5)
#'
#' ## all tokens of the KIT vowel, from the interview or monologue
#' ## of the participants AP511_MikeThorpe and BR2044_OllyOhlson
#' results <- getMatches(labbcat.url, list(segment="I"),
#'   participant.expression = expressionFromIds(c("AP511_MikeThorpe","BR2044_OllyOhlson")),
#'   transcript.expression = expressionFromTranscriptTypes(c("interview","monologue")))
#' 
#' ## all tokens of the KIT vowel for male speakers who speak English
#' results <- getMatches(labbcat.url, list(segment="I"),
#'   participant.expression = paste(
#'     expressionFromAttributeValue("participant_gender", "M"),
#'     expressionFromAttributeValues("participant_languages_spoken", "en"),
#'     sep=" && "))
#'
#' ## results$Text is the text that matched
#' ## results$MatchId can be used to access results using other functions
#' }
#'
#' @keywords search
#' 
getMatches <- function(labbcat.url, pattern, participant.expression=NULL, transcript.expression=NULL, main.participant=TRUE, aligned=NULL, matches.per.transcript=NULL, words.context=0, max.matches=NULL, overlap.threshold=NULL, anchor.confidence.min=NULL, page.length=1000, no.progress=FALSE) {

    ## if they've explicitly passed a value for aligned
    if (!is.null(aligned)) {
        print("WARNING: the getMatches parameter 'aligned' is deprecated; please use anchor.confidence.min=50 instead")
    } else { ## use previous default value
        aligned = FALSE
        ## if they've just followed the advice above, but their LaBB-CAT version is old
        ## we need to make sure that it still behaves as it did before
        if (!is.null(anchor.confidence.min)) {
            if (anchor.confidence.min >= 50) {
                aligned = TRUE ## (this has no effect on newer versions of LaBB-CAT)
            }
        }
    }
    
    ## first normalize the pattern...
    if (is.character(pattern) && length(pattern) == 1) { # it's a string
        ## assume it's an orthography search
        tokens <- strsplit(pattern," ")
        pattern <- list()
        for (token in tokens[[1]]) {
            pattern[[length(pattern)+1]] <- list(orthography=token)
        } ## next token
    } # it's a string

    ## if pattern isn't a list, convert it to one
    if (!is.list(pattern)) pattern <- as.list(pattern)

    ## if pattern isn't a list with a "columns" element, wrap a list around it
    if (is.null(pattern$columns)) pattern <- list(columns = pattern)

    ## if pattern$columns isn't a 'nameless' list (i.e. with numeric indices) wrap a
    ## nameless list around it
    if (!is.null(names(pattern$columns))) pattern$columns <- list(pattern$columns)

    ## columns contain lists with no "layers" element, wrap a list around them
    for (c in 1:length(pattern$columns)) {
        if (is.null(pattern$columns[[c]]$layers)) {
            pattern$columns[[c]] <- list(layers = pattern$columns[[c]])
        }
    } # next column

    ## convert layer=string to layer=list(pattern=string)
    target.layer <- "word"
    for (c in 1:length(pattern$columns)) { # for each column
        for (l in names(pattern$columns[[c]]$layers)) { # for each layer in the column
            # if the layer value isn't a list
            if (!is.list(pattern$columns[[c]]$layers[[l]])) {
                # wrap a list(pattern=...) around it
                pattern$columns[[c]]$layers[[l]] <- list(pattern = pattern$columns[[c]]$layers[[l]])
            } # value isn't a list
            # if they're searching the segment layer, assume it's the target
            if (l == "segment" && target.layer == "word") {
                target.layer <- "segment"
            }
            # ... unless there's an explicitly selected target
            if (!is.null(pattern$columns[[c]]$layers[[l]]$target)
                && pattern$columns[[c]]$layers[[l]]$target) {
                target.layer <- l
            }
        } # next layer
    } # next column

    ## start the search
    searchJson <- jsonlite::toJSON(pattern, auto_unbox = TRUE)
    parameters <- list(command="search", searchJson=searchJson,
                       words_context=words.context)
    if (main.participant) {
        parameters$only_main_speaker <- TRUE
    }
    if (aligned) {
        parameters$only_aligned <- TRUE
    }
    if (!is.null(anchor.confidence.min)) {
        parameters$offsetThreshold <- anchor.confidence.min
    }
    if (!is.null(matches.per.transcript)) {
        parameters$matches_per_transcript <- as.list(matches.per.transcript)
    }
    if (!is.null(participant.expression)) {
        if (length(participant.expression) > 1        # it's a list, not a string
            || !grepl("'", participant.expression)) { # or it is a string, but not an expression
            ## for backwards compatibility for when the 3rd parameter was participant.ids
            ## we convert a list of IDs to the appropriate participant expression
            participant.expression <- expressionFromIds(participant.expression)
        } # it's a list, not a string
        parameters$participant_expression <- as.list(participant.expression)
    }
    if (!is.null(transcript.expression)) {
        if (length(transcript.expression) > 1        # it's a list, not a string
            || !grepl("'", transcript.expression)) { # or it is a string, but not an expression
            ## for backwards compatibility for when the 4th parameter was trascript.types
            ## we convert a list of IDs to the appropriate transcript expression
            transcript.expression <- expressionFromTranscriptTypes(transcript.expression)
        } # it's a list, not a string
        parameters$transcript_expression <- as.list(transcript.expression)
    }
    if (!is.null(overlap.threshold)) {
        parameters$overlap_threshold <- overlap.threshold
    }
    
    resp <- http.get(labbcat.url, "api/search", parameters)
    if (is.null(resp)) return()
    deprecatedApi <- FALSE
    if (httr::status_code(resp) == 404) { # server version prior to 20230511.1949
        resp <- http.get(labbcat.url, "search", parameters) # use deprecated endpoint
        if (is.null(resp)) return()
        deprecatedApi <- TRUE
    }
    resp.content <- httr::content(resp, as="text", encoding="UTF-8")
    if (httr::status_code(resp) != 200) { # 200 = OK
        print(paste("ERROR: ", httr::http_status(resp)$message))
        print(resp.content)
        return()
    }
    resp.json <- jsonlite::fromJSON(resp.content)
    for (error in resp.json$errors) print(error)

    ## we get a task ID back
    threadId <- resp.json$model$threadId

    pb <- NULL
    if (interactive() && !no.progress) {
        pb <- txtProgressBar(min = 0, max = 100, style = 3)        
    }

    ## monitor the task until it finishes
    thread <- thread.get(labbcat.url, threadId)
    if (is.null(thread)) {
        return()
    }
    while (thread$running) {
        if (!is.null(pb) && !is.null(thread$percentComplete)) {
            setTxtProgressBar(pb, thread$percentComplete)
        }
        Sys.sleep(1)
        thread <- thread.get(labbcat.url, threadId)
        if (is.null(thread)) {
            return()
        }
    } # poll until finished
    if (!is.null(pb)) {
        if (!is.null(thread$percentComplete)) {
            setTxtProgressBar(pb, thread$percentComplete)
            close(pb)
        }
        if (!is.null(thread$status)) {
            if (thread$size > 0) {
                cat(paste(thread$status, " - fetching data...", "\n", sep=""))
            } else {
                cat(paste(thread$status, "\n", sep=""))
            }
        }
    }

    ## define the dataframe to return (which is, for now, empty)
    allMatches <- data.frame(matrix(ncol = 15, nrow = 0))
    if (target.layer != "word") {
        allMatches <- data.frame(matrix(ncol = 18, nrow = 0))
    }
    if (thread$size > 0) { ## there were actually some matches
        
        ## ensure labbcat base URL has a trailing slash (for URL reconstruction)
        if (!grepl("/$", labbcat.url)) labbcat.url <- paste(labbcat.url, "/", sep="")

        ## layers - "word", and "segment" if mentioned in the pattern
        tokenLayers <- c("word")
        if (target.layer != "word") tokenLayers <- c("word", target.layer)
        
        ## search results can be very large, and httr timeouts are short and merciless,
        ## so we break the results into chunks and retrieve them using lots of small
        ## requests instead of one big request
        
        totalMatches <- min(thread$size, max.matches) ## (works even if max.matches == NULL)
        matchesLeft <- totalMatches
        pageNumber <- 0

        pb <- NULL
        if (interactive() && !no.progress) {
            pb <- txtProgressBar(min = 0, max = matchesLeft, style = 3)        
        }

        endpoint <- "api/results"
        if (deprecatedApi) endpoint <- "resultsStream" # server version prior to 20230511.1949
        while(matchesLeft > 0) { ## loop until we've got all the matches we want        
            resp <- http.get(labbcat.url,
                             endpoint,
                             list(threadId=threadId, words_context=words.context,
                                  pageLength=page.length, pageNumber=pageNumber),
                             content.type="application/json")
            if (is.null(resp)) break
            if (httr::status_code(resp) != 200) { # 200 = OK
                print(paste("ERROR: ", httr::http_status(resp)$message))
                print(httr::content(resp, as="text", encoding="UTF-8"))
                break
            }
        
            resp.content <- httr::content(resp, as="text", encoding="UTF-8")
            resp.json <- jsonlite::fromJSON(resp.content)
            matches <- resp.json$model$matches
            
            ## decrement the number of rows left to get
            matchesLeft <- matchesLeft - nrow(matches)
            ## ensure we don't have too many rows (on the last page)
            if (matchesLeft < 0) { 
                matches <- head(matches, page.length + matchesLeft)
            }

            matches <- cbind(resp.json$model$name, matches)
            ## extract number from MatchId
            matches <- cbind(
                matches, as.numeric(stringr::str_match(matches$MatchId, "prefix=0*([0-9]+)-")[,2]))
            if (deprecatedApi) {
                ## reconstruct url
                matches <- cbind(
                    matches, paste(
                                 labbcat.url, "transcript?transcript=",
                                 matches$Transcript, "#",
                                 stringr::str_match(matches$MatchId, "\\[0\\]=([^;]*)(;.*|$)")[,2], sep=""))
            }
            ## Ensure previous default anchor.confidence.min behaviour still works
            ## i.e. if they don't specify a value, then only autmatically aligned offsets
            ## are returned.
            ## If they really want all offsets regardless of confidence, they can call with
            ## anchor.confidence.min=0
            if (is.null(anchor.confidence.min)) {
                anchor.confidence.min = 50
            }
            ## get the alignments
            tokens <- getMatchAlignments(
                labbcat.url, matches$MatchId, tokenLayers,
                anchor.confidence.min=anchor.confidence.min, no.progress=T)
            matches <- cbind(matches, tokens)

            ## add this chunk to the collection
            allMatches <- rbind(allMatches, matches)

            if (!is.null(pb)) {
                setTxtProgressBar(pb, nrow(allMatches))
            }

            ## next page
            pageNumber <- pageNumber + 1
        } ## loop
        ## finished with the progress bar
        if (!is.null(pb)) {
            setTxtProgressBar(pb, nrow(allMatches))
            close(pb)
        }
    } ## there are matches

    if (deprecatedApi) {
        frameNames <- c(
            "SearchName","MatchId","Transcript","Participant","Corpus","Line","LineEnd",
            "Before.Match","Text","After.Match","Number","URL",
            "Target.word","Target.word.start","Target.word.end")
    } else {
        frameNames <- c(
            "SearchName","Transcript","Participant","Corpus","Line","LineEnd",
            "MatchId","URL","Before.Match","Text","After.Match","Number",
            "Target.word","Target.word.start","Target.word.end")
    }
    if (target.layer != "word") {
        frameNames <- c(frameNames,
                        c(paste("Target.",target.layer,sep=""),
                          paste("Target.",target.layer,".start",sep=""),
                          paste("Target.",target.layer,".end",sep="")))
    }
    names(allMatches) = frameNames
    
    ## free the search thread so it's not using server resources
    http.get(labbcat.url, "threads", list(threadId=threadId, command="release"))

    return(allMatches)
}

Try the nzilbb.labbcat package in your browser

Any scripts or data that you put into this service are public.

nzilbb.labbcat documentation built on July 26, 2023, 6:08 p.m.