R/lma_lspace.R
In lingmatch: Linguistic Matching and Accommodation

Documented in lma_lspace

#' Latent Semantic Space (Embeddings) Operations
#'
#' Map a document-term matrix onto a latent semantic space, extract terms from a
#' latent semantic space (if \code{dtm} is a character vector, or \code{map.space =} \code{FALSE}),
#' or perform a singular value decomposition of a document-term matrix (if \code{dtm} is a matrix
#' and \code{space} is missing).
#' @param dtm A matrix with terms as column names, or a character vector of terms to be extracted
#'   from a specified space. If this is of length 1 and \code{space} is missing, it will be treated
#'   as \code{space}.
#' @param space A matrix with terms as rownames. If missing, this will be the right singular vectors
#'   of a singular value decomposition of \code{dtm}. If a character, a file matching the character
#'   will be searched for in \code{dir} (e.g., \code{space = 'google'}). If a file is not found and
#'   the character matches one of the \href{https://osf.io/489he/wiki/home}{available spaces}, you
#'   will be given the option to download it, as handled by \code{\link{download.lspace}}.
#'   If \code{dtm} is missing, the entire space will be loaded and returned.
#' @param map.space Logical: if \code{FALSE}, the original vectors of \code{space} for terms
#'   found in \code{dtm} are returned. Otherwise \code{dtm} \code{\%*\%} \code{space} is returned,
#'   excluding uncommon columns of \code{dtm} and rows of \code{space}.
#' @param fill.missing Logical: if \code{TRUE} and terms are being extracted from a space, includes
#'   terms not found in the space as rows of 0s, such that the returned matrix will have a row
#'   for every requested term.
#' @param term.map A matrix with \code{space} as a column name, terms as row names, and indices of
#'   the terms in the given space as values, or a numeric vector of indices with terms as names, or
#'   a character vector of terms corresponding to rows of the space. This is used instead of reading
#'   in an "_terms.txt" file corresponding to a \code{space} entered as a character (the name of a
#'   space file).
#' @param dim.cutoff If a \code{space} is calculated, this will be used to decide on the number of
#'   dimensions to be retained: \code{cumsum(d) / sum(d) < dim.cutoff}, where \code{d} is a vector
#'   of singular values of \code{dtm} (i.e., \code{svd(dtm)$d}). The default is \code{.5}; lower
#'   cutoffs result in fewer dimensions.
#' @param keep.dim Logical: if \code{TRUE}, and a space is being calculated from the input, a matrix
#'   in the same dimensions as \code{dtm} is returned. Otherwise, a matrix with terms as rows and
#'   dimensions as columns is returned.
#' @param use.scan Logical: if \code{TRUE}, reads in the rows of \code{space} with \code{\link{scan}}.
#' @param dir Path to a folder containing spaces. \cr
#'   Set a session default with \code{options(lingmatch.lspace.dir = 'desired/path')}.
#' @note
#' A traditional latent semantic space is a selection of right singular vectors from the singular
#' value decomposition of a dtm (\code{svd(dtm)$v[, 1:k]}, where \code{k} is the selected number of
#' dimensions, decided here by \code{dim.cutoff}).
#'
#' Mapping a new dtm into a latent semantic space consists of multiplying common terms:
#' \code{dtm[, ct]} \code{\%*\% space[ct, ]}, where \code{ct} \code{=} \code{colnames(dtm)[colnames(dtm)}
#' \code{\%in\%} \code{rownames(space)]} -- the terms common between the dtm and the space. This
#' results in a matrix with documents as rows, and dimensions as columns, replacing terms.
#' @family Latent Semantic Space functions
#' @return A matrix or sparse matrix with either (a) a row per term and column per latent dimension (a latent
#' space, either calculated from the input, or retrieved when \code{map.space = FALSE}), (b) a row per document
#' and column per latent dimension (when a dtm is mapped to a space), or (c) a row per document and
#' column per term (when a space is calculated and \code{keep.dim = TRUE}).
#' @examples
#' text <- c(
#'   paste(
#'     "Hey, I like kittens. I think all kinds of cats really are just the",
#'     "best pet ever."
#'   ),
#'   paste(
#'     "Oh year? Well I really like cars. All the wheels and the turbos...",
#'     "I think that's the best ever."
#'   ),
#'   paste(
#'     "You know what? Poo on you. Cats, dogs, rabbits -- you know, living",
#'     "creatures... to think you'd care about anything else!"
#'   ),
#'   paste(
#'     "You can stick to your opinion. You can be wrong if you want. You know",
#'     "what life's about? Supercharging, diesel guzzling, exhaust spewing,",
#'     "piston moving ignitions."
#'   )
#' )
#'
#' dtm <- lma_dtm(text)
#'
#' # calculate a latent semantic space from the example text
#' lss <- lma_lspace(dtm)
#'
#' # show that document similarities between the truncated and full space are the same
#' spaces <- list(
#'   full = lma_lspace(dtm, keep.dim = TRUE),
#'   truncated = lma_lspace(dtm, lss)
#' )
#' sapply(spaces, lma_simets, metric = "cosine")
#'
#' \dontrun{
#'
#' # specify a directory containing spaces,
#' # or where you would like to download spaces
#' space_dir <- "~/Latent Semantic Spaces"
#'
#' # map to a pretrained space
#' ddm <- lma_lspace(dtm, "100k", dir = space_dir)
#'
#' # load the matching subset of the space
#' # without mapping
#' lss_100k_part <- lma_lspace(colnames(dtm), "100k", dir = space_dir)
#'
#' ## or
#' lss_100k_part <- lma_lspace(dtm, "100k", map.space = FALSE, dir = space_dir)
#'
#' # load the full space
#' lss_100k <- lma_lspace("100k", dir = space_dir)
#'
#' ## or
#' lss_100k <- lma_lspace(space = "100k", dir = space_dir)
#' }
#' @export

lma_lspace <- function(dtm = "", space, map.space = TRUE, fill.missing = FALSE, term.map = NULL,
                       dim.cutoff = .5, keep.dim = FALSE, use.scan = FALSE, dir = getOption("lingmatch.lspace.dir")) {
  if (ckd <- dir == "") dir <- "~/Latent Semantic Spaces"
  if ((is.character(dtm) || is.factor(dtm)) && missing(space)) {
    if (length(dtm) > 1 && any(grepl(" ", dtm, fixed = TRUE))) {
      dtm <- lma_dtm(dtm)
    } else if (length(dtm) == 1 && dtm != "") {
      if (missing(use.scan)) use.scan <- TRUE
      space <- dtm
      dtm <- ""
    }
  }
  if (is.data.frame(dtm)) dtm <- as.matrix(dtm)
  if (missing(space)) {
    nr <- nrow(dtm)
    if (is.null(nr)) stop("enter a matrix for dtm, or specify a space")
    s <- svd(dtm)
    s$v <- t(s$v)
    k <- cumsum(s$d) / sum(s$d)
    if (dim.cutoff > 1) dim.cutoff <- 1
    k <- if (length(k) == 1) 1 else seq_len(if (any(k < dim.cutoff)) which(k >= dim.cutoff)[1] else 1)
    if (keep.dim) {
      dtm[] <- s$u[, k] %*% (if (length(k) == 1) matrix(s$d[k]) else diag(s$d[k])) %*% s$v[k, ]
    } else {
      cn <- colnames(dtm)
      dtm <- t(s$v[k, , drop = FALSE])
      rownames(dtm) <- cn
    }
  } else {
    terms <- if (is.null(colnames(dtm))) {
      map.space <- FALSE
      dtm
    } else {
      colnames(dtm)
    }
    if (is.character(space)) {
      if (space == "default") space <- "100k_lsa"
      name <- gsub("^.*[/\\]|\\.[^/\\]*$", "", space)[1]
      spaces <- list.files(dir)
      ts <- grep(space, spaces, fixed = TRUE, value = TRUE)
      if (!length(ts)) {
        ts <- rownames(select.lspace(name)$selected)[1]
        if (!ckd && !is.na(ts) && grepl("^$|^[yt1]|^ent", readline(paste0(
          "would you like to download the ", ts, " space? (press Enter for yes): "
        )))) {
          download.lspace(ts, dir = dir)
          ts <- paste0(ts, ".dat")
        } else {
          stop("space (", space, ") not found in dir (", dir, ")",
            if (ckd) '\nspecify a directory (e.g., dir = "~") to locate or download; see ?download.lspace',
            call. = FALSE
          )
        }
      }
      space_path <- normalizePath(paste0(dir, "/", if (length(su <- grep("\\.dat$", ts))) {
        ts[su[[1]]]
      } else {
        use.scan <- TRUE
        ts[grep("[bgx]z[ip2]*$", ts)[[1]]]
      }), "/", FALSE)
      name <- sub("\\.[^.]*$", "", basename(space_path))
      if (name %in% colnames(term.map)) term.map <- term.map[term.map[, name] != 0, name]
      rex <- function(inds, f) {
        nc <- length(strsplit(readLines(f, 1), "\\s+")[[1]])
        l <- length(inds)
        all <- all(seq_len(l) == inds)
        r <- matrix(0, l, nc)
        i <- 1
        con <- file(f, "r")
        on.exit(close(con))
        while (i <= l) {
          if (all) {
            n <- l
          } else {
            n <- 1
            while (i + n < l && inds[i + n - 1] == inds[i + n] - 1) n <- n + 1
          }
          r[seq_len(n) + i - 1, ] <- matrix(scan(
            con,
            n = n * nc, quiet = TRUE, skip = (if (i == 1) {
              inds[i]
            } else {
              inds[i] - inds[i - 1]
            }) - 1, quote = "", comment.char = "", na.strings = ""
          ), n, nc, byrow = TRUE)
          i <- i + n
        }
        r
      }
      if (is.null(term.map)) {
        terms_file <- paste0(dir, "/", name, "_terms.txt")
        if (!file.exists(terms_file)) stop("terms file (", space, "_terms.txt) not found in dir (", dir, ").")
        space_terms <- readLines(terms_file)
        su <- if (length(terms) == 1 && terms == "") {
          terms <- space_terms
          !logical(length(space_terms))
        } else {
          space_terms %in% terms
        }
        if (sum(su) < length(terms)) {
          lsterms <- tolower(space_terms)
          su2 <- !duplicated(lsterms) & lsterms %in% terms[!terms %in% space_terms[su]]
          if (any(su2)) {
            space_terms[su2] <- lsterms[su2]
            su <- su | su2
          }
        }
        if (sum(su)) {
          space <- if (use.scan) rex(which(su), space_path) else t(extract_indices(which(su), space_path))
          rownames(space) <- space_terms[su]
          ts <- terms[terms %in% rownames(space)]
          space <- space[ts, , drop = FALSE]
        } else {
          stop("no matching terms in space ", space)
        }
      } else {
        if (is.character(term.map)) term.map <- structure(seq_along(term.map), names = term.map)
        su <- which(names(term.map) %in% terms)
        inds <- as.numeric(sort(if (length(terms) == 1 && terms == "") term.map else term.map[su]))
        if (length(inds)) {
          space <- if (use.scan) rex(inds, space_path) else t(extract_indices(inds, space_path))
          rownames(space) <- ts <- names(term.map)[inds]
        } else {
          stop("no matching terms in space ", space)
        }
      }
    } else {
      if (is.data.frame(space)) space <- as.matrix(space)
      name <- deparse(substitute(space))
      su <- terms %in% rownames(space)
      if (sum(su)) {
        ts <- terms[su]
        space <- space[ts, , drop = FALSE]
      } else if (sum(su <- terms %in% colnames(space))) {
        ts <- terms[su]
        space <- t(space[, ts, drop = FALSE])
      } else {
        stop("no matching terms in provided space")
      }
    }
    if (fill.missing) {
      su <- which(!terms %in% rownames(space))
      if (length(su)) {
        space <- rbind(space, matrix(0, length(su), ncol(space), dimnames = list(terms[su])))
        space <- space[terms, ]
      }
      ts <- rownames(space)
    }
    attr(space, "space") <- name
    if (map.space) {
      rep <- length(ts) / ncol(dtm)
      if (rep < .2) {
        warning(paste0(
          "only ", round(rep * 100, 2), "% of dtm terms appear in the provided space; ",
          "you might consider using a different source or cleaning/partial matching terms"
        ), call. = FALSE)
      }
      dtm <- dtm[, ts, drop = FALSE] %*% space
      attr(dtm, "space") <- name
    } else {
      return(space)
    }
  }
  dtm
}