R/read.dic.R
In lingmatch: Linguistic Matching and Accommodation

Documented in read.dic write.dic

#' Read/Write Dictionary Files
#'
#' Read in or write dictionary files in Comma-Separated Values (.csv; weighted) or
#' Linguistic Inquiry and Word Count (.dic; non-weighted) format.
#' @param path Path to a file, a name corresponding to a file in \code{getOption('lingmatch.dict.dir')}
#'   (or \code{'~/Dictionaries'}) or one of the dictionaries available at \href{https://osf.io/y6g5b}{osf.io/y6g5b},
#'   a matrix-like object to be categorized, or a list to be formatted.
#' @param cats A character vector of category names to be returned. All categories are returned by default.
#' @param type A character indicating whether and how terms should be altered. Unspecified or matching 'asis'
#'   leaves terms as they are. Other options change wildcards to regular expressions:
#'   \code{'pattern'} (\code{'^[poi]'}) replaces initial asterisks with \code{'\\\\b\\\\w*'},
#'   and terminal asterisks with \code{'\\\\w*\\\\b'}, to match terms within raw text;
#'   for anything else, terms are padded with \code{^} and \code{$}, then those bounding marks are removed
#'   when an asterisk is present, to match tokenized terms.
#' @param as.weighted Logical; if \code{TRUE}, prevents weighted dictionaries from being converted to
#'   unweighted versions, or converts unweighted dictionaries to a binary weighted version
#'   -- a data.frame with a "term" column of unique terms, and a column for each category.
#' @param dir Path to a folder containing dictionaries, or where you would like dictionaries to be downloaded;
#'   passed to \code{\link{select.dict}} and/or \code{\link{download.dict}}.
#' @param ... Passes arguments to \code{\link{readLines}}.
#' @param term.name,category.name Strings identifying column names in \code{path} containing terms and categories
#'   respectively.
#' @param raw Logical or a character. As logical, indicates if \code{path} should be treated
#'   as a raw dictionary (as might be read in from a .dic file). As a character, replaces \code{path}
#'   as if it were read in from a file.
#' @return \code{read.dic}: A \code{list} (unweighted) with an entry for each category containing
#'   character vectors of terms, or a \code{data.frame} (weighted) with columns for terms (first, "term") and
#'   weights (all subsequent, with category labels as names).
#' @family Dictionary functions
#' @importFrom utils read.table write.table
#' @export

read.dic <- function(path, cats = NULL, type = "asis", as.weighted = FALSE, dir = getOption("lingmatch.dict.dir"), ...,
                     term.name = "term", category.name = "category", raw = FALSE) {
  if (ckd <- dir == "") dir <- "~/Dictionaries"
  if (!is.logical(raw)) {
    path <- raw
    raw <- TRUE
  }
  if (missing(path)) path <- file.choose()
  if (!raw) {
    if (is.character(path) && !any(file.exists(path)) &&
      any(file.exists(normalizePath(paste0(dir, "/", path), "/", FALSE)))) {
      path <- normalizePath(paste0(dir, "/", path), "/", FALSE)
    }
    if (is.character(path) && !any(file.exists(path))) {
      tp <- select.dict(path, dir = if (ckd) "" else dir)
      if (nrow(tp$selected) && length(path) <= nrow(tp$info)) {
        if (any(tp$selected$downloaded == "")) {
          td <- rownames(tp$selected)[tp$selected$downloaded == ""]
          if (!ckd && grepl("^$|^[yt1]|^ent", readline(paste0(
            "would you like to download ", if (length(td) == 1) "this dictionary" else "these dictionaries", "?:\n",
            sub(",(?=[^,]+$)", if (length(td) == 2) " and" else ", and", paste0(td, collapse = ", "), perl = TRUE),
            "\n(press Enter for yes): "
          )))) {
            tp$selected[td, "downloaded"] <- unlist(download.dict(td, dir = dir), use.names = FALSE)
          }
        }
        path <- tp$selected[tp$selected[, "downloaded"] != "", "downloaded"]
        if (!length(path)) {
          stop(
            if (nrow(tp$selected) == 1) "dictionary" else "dictionaries", " (",
            paste(rownames(tp$selected), collapse = ", "), ") not found in dir (", dir, ")",
            if (ckd) '\nspecify a directory (e.g., dir = "~") to locate or download; see ?download.dict',
            call. = FALSE
          )
        }
      }
    }
    if (is.character(path) && length(path) > 1 && any(file.exists(path))) {
      dicts <- list()
      for (p in path) {
        if (file.exists(p)) {
          name <- gsub("^.*[/\\]+|\\.[^.]+$", "", p)
          dicts[[name]] <- read.dic(p)
        }
      }
      path <- if (length(dicts) == 1) dicts[[1]] else dicts
    }
  }
  if (!is.null(dim(path))) {
    if (anyNA(path)) path[is.na(path)] <- 0
    cols <- colnames(path)
    if (term.name %in% cols) {
      terms <- path[, term.name]
      cols <- cols[cols != term.name]
    } else if (!is.null(rownames(path)) && any(grepl("[a-z]", rownames(path), TRUE))) {
      terms <- rownames(path)
    } else {
      su <- which(vapply(cols, function(col) !is.numeric(path[, col]), TRUE))
      if (!length(su)) {
        if (!is.null(colnames(path))) {
          path <- data.frame(term = colnames(path), t(path), stringsAsFactors = FALSE)
          terms <- path$term
          cols <- colnames(path)[-1]
          if (missing(as.weighted)) as.weighted <- TRUE
        } else {
          stop("no non-numeric columns found in path")
        }
      } else {
        if (length(su) > 1) {
          ssu <- vapply(su, function(col) {
            if (!anyDuplicated(path[, col])) {
              1
            } else if (all(path[, col] == path[1, col])) 0 else 2
          }, 0)
          if (length(su) == ncol(path) && !any(ssu == 0)) {
            path <- data.frame(
              term = unlist(path[, su], use.names = FALSE),
              category = rep(colnames(path), each = nrow(path))
            )
            category.name <- cols <- "category"
            su <- 1
          } else {
            if (any(ssu == 2)) {
              cols <- colnames(path)[su[which(ssu == 2)]]
              if (length(cols) > 1 && length(su) != ncol(path)) cols <- cols[1]
            }
            su <- if (any(ssu == 1)) su[which(ssu == 1)[[1]]] else su[[1]]
          }
        }
        terms <- path[, su]
      }
    }
    if (category.name %in% colnames(path)) {
      su <- which(vapply(cols, function(col) is.numeric(path[, col]), TRUE))
      cols <- path[, category.name]
      if (length(su) == 1) {
        weights <- path[, names(su)]
        wl <- data.frame(term = terms)
        v <- numeric(nrow(path))
        for (cat in unique(cols)) {
          su <- cols == cat
          v[su] <- weights[su]
          wl[, cat] <- v
          v[] <- 0
        }
      } else {
        wl <- split(terms, cols)
      }
    } else {
      su <- vapply(cols, function(col) {
        if (is.numeric(path[, col])) {
          1
        } else if (anyDuplicated(path[, col])) {
          if (!all(path[, col] == path[1, col])) 2 else 3
        } else {
          0
        }
      }, 0)
      wl <- if (!1 %in% su && any(su == 2) && any(su != 2)) {
        cols <- cols[su == 2]
        wl <- lapply(cols, function(col) split(terms, path[, col]))
        names(wl) <- cols
        wl
      } else {
        if (!any(su == 1)) {
          if (any(su > 1)) {
            cols <- cols[su > 1]
            if (length(cols) == 1) {
              split(terms, path[, cols])
            } else {
              wl <- lapply(cols, function(col) split(terms, path[, col]))
              names(wl) <- cols
              unlist(wl, FALSE)
            }
          } else {
            stop("no numeric columns found in path")
          }
        } else if (as.weighted) {
          cbind(term = terms, path[, cols[su == 1], drop = FALSE])
        } else {
          cols <- cols[su == 1]
          if (length(cols) == 1) {
            weights <- path[, cols]
            if (any(weights < 0) && any(weights > 0)) {
              Filter(length, list(
                positive = terms[weights > 0],
                neutral = terms[weights == 0],
                negative = terms[weights < 0]
              ))
            } else if (anyDuplicated(weights)) split(terms, weights) else list(category = terms)
          } else {
            weights <- as.data.frame(path[, cols], stringsAsFactors = FALSE)
            if (any(weights > 0) && any(weights < 0)) {
              for (col in cols) {
                if (any(weights[, col] > 0)) weights[, paste0(col, ".positive")] <- as.integer(weights[, col] > 0)
                if (any(weights[, col] == 0)) weights[, paste0(col, ".neutral")] <- as.integer(weights[, col] == 0)
                if (any(weights[, col] < 0)) weights[, paste0(col, ".negative")] <- as.integer(weights[, col] < 0)
                weights <- weights[, colnames(weights) != col]
              }
              cols <- sort(colnames(weights))
            }
            lvs <- sort(unique(unlist(weights)))
            if (length(lvs) == 2 && all(lvs == c(0, 1))) {
              wl <- lapply(cols, function(col) terms[weights[, col] == 1])
              names(wl) <- cols
              wl <- Filter(length, wl)
            } else {
              wl <- split(terms, colnames(weights)[max.col(weights, "first")])
            }
            wl
          }
        }
      }
    }
  } else if (is.list(path)) {
    path <- Filter(length, path)
    if (all(vapply(path, function(d) is.character(d) || is.factor(d), TRUE))) {
      wl <- path
      if (is.null(names(wl))) names(wl) <- paste0("cat", seq_along(wl))
      if (!is.null(cats)) {
        wl <- wl[names(wl) %in% cats]
        if (!length(wl)) stop("no cats were found in path")
      }
    } else {
      if (is.null(names(path))) names(path) <- paste0("dict", seq_along(path))
      wl <- if (any(vapply(path, function(d) !is.null(dim(d)), TRUE))) {
        terms <- NULL
        cols <- NULL
        for (d in names(path)) {
          path[[d]] <- read.dic(path[[d]], as.weighted = as.weighted)
          if (as.weighted) {
            terms <- unique(c(terms, path[[d]]$term))
            cols <- c(cols, paste0(d, ".", colnames(path[[d]])[-1]))
          }
        }
        if (as.weighted) {
          wl <- as.data.frame(
            matrix(0, length(terms), length(cols), dimnames = list(terms, cols)),
            stringsAsFactors = FALSE
          )
          for (d in names(path)) {
            cols <- colnames(path[[d]])[-1] <- paste0(d, ".", colnames(path[[d]])[-1])
            su <- duplicated(path[[d]]$term)
            if (any(su)) {
              su <- path[[d]]$term %in% path[[d]]$term[su]
              td <- path[[d]][su, , drop = FALSE]
              for (term in unique(td$term)) wl[term, cols] <- colMeans(td[td$term == term, cols])
            }
            if (any(!su)) path[[d]] <- path[[d]][!su, ]
            rownames(path[[d]]) <- path[[d]]$term
            wl[path[[d]]$term, cols] <- path[[d]][, cols]
          }
          data.frame(term = rownames(wl), wl, stringsAsFactors = FALSE)
        } else {
          unlist(path, FALSE)
        }
      } else if (any(vapply(path, is.list, TRUE))) {
        unlist(path, FALSE)
      } else {
        if (any(vapply(path, function(d) is.null(names(d)), TRUE))) {
          if (all(vapply(path, length, 0) == length(path[[1]]))) {
            data.frame(term = names(path), do.call(rbind, path), stringsAsFactors = FALSE)
          } else {
            stop("failed to resolve path; as a list, entries should contain character or named numeric vectors")
          }
        } else {
          terms <- unique(unlist(lapply(path, names), use.names = FALSE))
          v <- structure(numeric(length(terms)), names = terms)
          data.frame(term = terms, vapply(path, function(d) {
            v[names(d)] <- d
            v
          }, numeric(length(terms))), stringsAsFactors = FALSE)
        }
      }
    }
  } else {
    if (raw || length(path) != 1) {
      if (length(path) == 1) path <- strsplit(path, "\n")[[1]]
      di <- path
    } else {
      di <- tryCatch(readLines(path, warn = FALSE, ...), error = function(e) NULL)
      if (is.null(di)) stop("assumed path (", path, ") is to a file, but failed to read it in", call. = FALSE)
    }
    lst <- grep("%", di, fixed = TRUE)
    if (length(lst) > 1 && !grepl(",", di[lst[1]], fixed = TRUE)) {
      if (length(lst) < 2) {
        stop(
          "could not identify the end of the header -- ",
          "this should be the second percent sign (%) following the last category name."
        )
      }
      lst <- lst[2]
      h <- grep("^\\d", gsub("^\\s+|\\s*%+\\s*|\\s+$", "", di[seq_len(lst)]), value = TRUE)
      ci <- character()
      for (cat in h) ci[sub("\\s.*$", "", cat)] <- sub("^[^\\s]+\\s+", "", cat, perl = TRUE)
      if (is.null(cats)) cats <- ci
      sep <- if (grepl("\t", di[lst + 1], fixed = TRUE)) "\t" else "\\s"
      cb <- paste0("(?:", sep, "+(?:", paste(names(ci), collapse = "|"), ")(?=", sep, "|$))*$")
      di <- di[-seq_len(lst - 1)]
      wl <- lapply(structure(names(ci), names = ci), function(cat) {
        unique(sub(cb, "", di[grep(paste0(sep, cat, cb), di, perl = TRUE)], perl = TRUE))
      })
      wl <- wl[cats[cats %in% names(wl)]]
    } else {
      if (missing(as.weighted) && length(path) == 1) as.weighted <- TRUE
      wl <- if (any(grepl("[\\s,]", di, perl = TRUE))) {
        di <- read.table(
          text = di, header = TRUE, sep = if (grepl("\t", di[[1]])) "\t" else ",",
          quote = '"', comment.char = "", stringsAsFactors = FALSE
        )
        if (!missing(as.weighted) || (!term.name %in% colnames(di) && !any(vapply(di, is.character, TRUE)) &&
          !any(grepl("[a-z]", rownames(di), TRUE)))) {
          di <- tryCatch(
            read.dic(di, cats = cats, type = type, as.weighted = as.weighted),
            error = function(e) e$message
          )
        }
        di
      } else {
        list(cat1 = di)
      }
      if (length(wl) == 1 && is.character(wl)) {
        stop("assuming path is to a comma separated values file, but failed to read it in:\n", wl)
      }
    }
  }
  if (!missing(type) && !grepl("^[Aa]", type)) wl <- to_regex(wl, grepl("^[poi]", type, TRUE))
  if (is.null(dim(wl)) && !is.null(cats)) {
    su <- names(wl) %in% cats
    if (any(su)) wl <- wl[su]
  }
  if (as.weighted && is.null(dim(wl))) {
    op <- data.frame(term = unique(unlist(wl)), stringsAsFactors = FALSE)
    for (cat in names(wl)) op[, cat] <- as.integer(op$term %in% wl[[cat]])
    op
  } else {
    if (is.data.frame(wl) && !as.weighted && !missing(as.weighted)) {
      read.dic(wl, cats = cats, as.weighted = FALSE)
    } else {
      wl
    }
  }
}

#' @rdname read.dic
#' @param dict A \code{list} with a named entry of terms for each category, or a \code{data.frame}
#'   with terms in one column, and categories or weights in the rest.
#' @param filename The name of the file to be saved.
#' @param save Logical: if \code{FALSE}, does not write a file.
#' @return \code{write.dic}: A version of the written dictionary -- a raw character vector for
#'   unweighted dictionaries, or a \code{data.frame} for weighted dictionaries.
#' @examples
#' # make a small murder related dictionary
#' dict <- list(
#'   kill = c("kill*", "murd*", "wound*", "die*"),
#'   death = c("death*", "dying", "die*", "kill*")
#' )
#'
#' # convert it to a weighted format
#' (dict_weighted <- read.dic(dict, as.weighted = TRUE))
#'
#' # categorize it back
#' read.dic(dict_weighted)
#'
#' # convert it to a string without writing to a file
#' cat(raw_dict <- write.dic(dict, save = FALSE))
#'
#' # parse it back in
#' read.dic(raw = raw_dict)
#'
#' \dontrun{
#'
#' # save it as a .dic file
#' write.dic(dict, "murder")
#'
#' # read it back in as a list
#' read.dic("murder.dic")
#'
#' # read in the Moral Foundations or LUSI dictionaries from urls
#' moral_dict <- read.dic("https://osf.io/download/whjt2")
#' lusi_dict <- read.dic("https://osf.io/download/29ayf")
#'
#' # save and read in a version of the General Inquirer dictionary
#' inquirer <- read.dic("inquirer", dir = "~/Dictionaries")
#' }
#' @export

write.dic <- function(dict, filename = NULL, type = "asis", as.weighted = FALSE, save = TRUE) {
  if (!is.list(dict) || is.data.frame(dict)) {
    if (save && (missing(as.weighted) || as.weighted)) {
      as.weighted <- TRUE
      o <- dict
    } else {
      dict <- read.dic(dict)
    }
  }
  if (is.null(dim(dict))) {
    terms <- unique(as.character(unlist(dict, use.names = FALSE)))
    terms <- terms[terms != ""]
    if (!missing(type) && !grepl("^[Aa]", type)) dict <- to_regex(dict, grepl("^[poi]", type, TRUE))
    if (as.weighted) {
      o <- data.frame(term = terms, stringsAsFactors = FALSE)
      for (cat in names(dict)) o[, cat] <- as.integer(o$term %in% dict[[cat]])
    } else {
      l <- length(dict)
      m <- as.data.frame(matrix("", length(terms) + l + 2, l + 1), stringsAsFactors = FALSE)
      m[, 1] <- c("%", seq_len(l), "%", terms)
      m[seq_len(l) + 1, 2] <- if (is.null(names(dict))) seq_len(l) else names(dict)
      for (i in seq_along(dict)) m[which(m[-seq_len(l + 2), 1] %in% dict[[i]]) + l + 2, i + 1] <- i
      o <- gsub("\t{2,}", "\t", paste(sub("\t+$", "", do.call(paste, c(m, sep = "\t"))), collapse = "\n"))
    }
  }
  if (save && is.character(filename)) {
    filename <- filename[[1]]
    if (!grepl("\\.[^.]+$", filename)) filename <- paste0(filename, if (as.weighted) ".csv" else ".dic")
    if (as.weighted) {
      write.table(o, filename, sep = ",", row.names = FALSE, qmethod = "double")
    } else {
      write(o, filename)
    }
    message("dictionary saved to ", filename)
  }
  invisible(o)
}