R/mclm.R

# -----------------------------------------------------------------------------
#
#    name: mclm.R  [short for: mastering corpus linguistics methods]
#    purpose: library of simple R functions in support of corpus linguistics
#    author: Dirk Speelman 
#    version: 2017-10-16
#
# -----------------------------------------------------------------------------

# ============================================================================
# yet to be put in proper location
# ============================================================================

find_xpath <- function(pattern,
                       x,
                       handlers = NULL,
                       trim = TRUE,
                       fun = NULL,
                       final_fun = NULL,
                       namespaces = NULL,
                       ...) {
   # ----------------------------------------------------------
   # - x can be any of the following:
   #      - a vector of filenames
   #      - a character vector of XML source
   #      - a list of parsed XML documents
   #  - handlers, trim
   #       passed on to XML::xmlParse();
   #       if x contains parsed XML documents,
   #       these arguments are ignored
   #  - ...
   #       passed on to XML::xpathApply() and sapply()
   # ----------------------------------------------------------
   if (!is.vector(x)) {
      x <- list(x)
   }
   res <- vector("list", length(x))
   for (i in seq_along(x)) {
      cur <- x[[i]]
      # -------------------------------------------------------
      # - if cur turns out to be XML source or a filename,
      #   which is tested with is.character(cur), we
      #   parse it with XML::xmlParse()
      # - otherwise, we assume cur is a parsed XML document;
      #   this could be tested with something along the
      #   lines of ("XMLInternalDocument" %in% class(cur)),
      #   but currently this default case is not tested,
      #   just assumed
      # -------------------------------------------------------
      if (is.character(cur)) { 
         cur <- XML::xmlParse(cur,
                              handlers = handlers,
                              trim = trim)
      }   
      # -------------------------------------------------------
      if (is.null(namespaces)) {
         namespaces <- XML::xmlNamespaceDefinitions(cur, simplify = TRUE)
      }
      res[[i]] <- XML::xpathApply(cur, pattern,
                                  namespaces = namespaces, ...)
   }
   res <- unlist(res)
   if (!is.null(fun)) {
     res <- sapply(res, fun, ...)
   }
   if (!is.null(final_fun)) {
     res <- do.call(final_fun, list(res))
   }    
   res
}
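
# Minimal usage sketch for find_xpath() (not run; "corpus.xml" and its <w>
# elements are hypothetical): extract the text of all <w> nodes from a file
# and lowercase the result.
#
#   words <- find_xpath("//w", "corpus.xml",
#                       fun = XML::xmlValue,
#                       final_fun = tolower)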

print.conc <- function(x, n = 6, ...) {
  cat("Concordance-based data frame (number of observations: ",
      nrow(x),
      ")\n",
      sep = "")
  if (n > 0) {
    print_kwic(x, n = n, ...)
  }
  if (n < nrow(x)) cat("...\n")
  cat("\nThis data frame has ")
  cat(ncol(x), ifelse(ncol(x) > 1, "columns:\n", "column:\n"))
  df_names <- data.frame(column = names(x))
  print(df_names)
  invisible(x)
}

print_kwic <- function(x, 
                       min_c_left = NA,
                       max_c_left = NA,
                       min_c_match = NA,
                       max_c_match = NA,
                       min_c_right = NA,
                       max_c_right = NA,
                       from = 1,
                       n = 30,
                       drop_tags = TRUE) {
  if (length(x$left) == 0 ||
      length(x$match) == 0 ||
      length(x$right) == 0) {
    stop("x must have appropriate values for 'left', 'match', and 'right'")
  }
  if (from < 1 || from > nrow(x)) {
    stop("the argument 'from' does not have an appropriate value for 'x'")
  } 
  if (n < 1) {
    stop("the argument 'n' does not have a appropriate value")
  } 
  df <- data.frame(left = as.character(x$left),
                   match = as.character(x$match),
                   right = as.character(x$right),
                   stringsAsFactors = FALSE)
  to <- min(nrow(df), from + n - 1)
  df <- df[from:to, ]
  if (drop_tags) {
    df$left <- cleanup_spaces(drop_tags(df$left))
    df$match <- cleanup_spaces(drop_tags(df$match))
    df$right <- cleanup_spaces(drop_tags(df$right))
  }
  largest_left <- max(nchar(df$left), 0, na.rm = TRUE)
  largest_match <- max(nchar(df$match), 0, na.rm = TRUE)
  largest_right <- max(nchar(df$right), 0, na.rm = TRUE)  
  # calculate width for id_col, left_col, match_col, and right_col
  c_avail <- max(0, getOption("width") - 7) # 7 for spaces between columns
                                            # with some margin for scroll bar
  # -- id col
  c_id_col <- max(nchar(as.character(from:(from + n - 1))))
  c_avail <- max(0, c_avail - c_id_col)
  # -- match col
  c_match_col <- trunc(c_avail * 0.3)
  if (! is.na(min_c_match)) c_match_col <- max(0, min_c_match, c_match_col)
  if (! is.na(max_c_match)) c_match_col <- max(0, min(c_match_col, max_c_match))
  c_match_col <- min(c_match_col, largest_match)
  # -- right col
  c_avail <- max(0, c_avail - c_match_col)
  c_right_col <- trunc(c_avail * 0.5)
  if (! is.na(min_c_right)) c_right_col <- max(0, min_c_right, c_right_col)
  if (! is.na(max_c_right)) c_right_col <- max(0, min(c_right_col, max_c_right))
  c_right_col <- min(c_right_col, largest_right)
  # -- left col
  c_left_col <- max(0, c_avail - c_right_col)
  if (! is.na(min_c_left)) c_left_col <- max(0, min_c_left, c_left_col)
  if (! is.na(max_c_left)) c_left_col <- max(0, min(c_left_col, max_c_left))
  # --  
  df$left <- stringr::str_trunc(df$left, c_left_col, side = "left")
  df$match <- stringr::str_trunc(df$match, c_match_col, side = "center")
  df$right <- stringr::str_trunc(df$right, c_right_col, side = "right")
  print(df)
  invisible(x)
}
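
# Minimal usage sketch for print_kwic() (not run; data are made up): any data
# frame with character columns 'left', 'match', and 'right' can be displayed
# as a KWIC concordance.
#
#   d <- data.frame(left  = c("this is a first", "and here a second"),
#                   match = c("example", "example"),
#                   right = c("of a KWIC display", "of the same"),
#                   stringsAsFactors = FALSE)
#   print_kwic(d, n = 2)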


# ============================================================================
# generics
# ============================================================================

details <- function(x, y) UseMethod("details")

details.default <- function(x, y) NULL

zoom_re <- function(x, pattern, perl = TRUE) UseMethod("zoom_re")

zoom_re.default <- function(x, pattern, perl = TRUE) x

unzoom <- function(x) UseMethod("unzoom")

unzoom.default <- function(x) x

# ============================================================================
# ad hoc functions used in section on slma()
# ============================================================================

adhoc_min <- function(x) {
  if (sum(!is.na(x)) == 0) {
    NA
  } else {
    min(x, na.rm = TRUE)
  }
}

adhoc_max <- function(x) {
  if (sum(!is.na(x)) == 0) {
    NA
  } else {
    max(x, na.rm = TRUE)
  }
}

adhoc_sum <- function(x) {
  if (sum(!is.na(x)) == 0) {
    NA
  } else {
    sum(x, na.rm = TRUE)
  }
}
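
# These helpers behave like min(), max() and sum() with na.rm = TRUE, except
# that an all-NA input yields NA rather than Inf/-Inf/0 (and avoids the
# warning that min() and max() would give). For instance (not run):
#
#   adhoc_min(c(NA, 3, 1))   # 1
#   adhoc_min(c(NA, NA))     # NA, whereas min(c(NA, NA), na.rm = TRUE)
#                            # warns and returns Inf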


# ============================================================================
# string manipulation functions
# ============================================================================

cleanup_spaces <- function(x,
                           remove_leading = TRUE,
                           remove_trailing = TRUE) {
  x <- gsub("\\s+", " ", x, perl = TRUE)
  if (remove_leading) {
    x <- gsub("^[ ]", "", x, perl = TRUE)
  }
  if (remove_trailing) {
    x <- gsub("[ ]$", "", x, perl = TRUE)
  }
  x
}

drop_tags <- function(x, half_tags_too = TRUE) {
   if (half_tags_too) {
      x <- gsub("^[^<]*>|<[^>]*>|<[^>]*$", "", x, perl = TRUE)
   } else {
      x <- gsub("<[^>]*>", "", x, perl = TRUE)
   }
   x
}
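
# Examples (not run; strings are made up): drop_tags() removes XML-style
# tags; with half_tags_too = TRUE it also removes tag fragments that were
# cut off at either end of the string, as happens in the context columns of
# a concordance.
#
#   drop_tags("s> some <b>tagged</b> text <")   # " some tagged text "
#   cleanup_spaces("  too   many   spaces  ")   # "too many spaces"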

# ============================================================================
# tokenization
# ============================================================================

tokenize <- function(x, # character vector or TextDocument
                     re_drop_line = NA,
                     line_glue = NA, # e.g. "\n" or ""
                     re_cut_area = NA,
                     re_token_splitter = "\\s+",
                     re_token_extractor = "[^\\s]+",
                     re_drop_token = NA,
                     re_token_transf_in = NA,
                     token_transf_out = NA,
                     token_to_lower = TRUE,
                     perl = TRUE) {
  if ("TextDocument" %in% class(x)) x <- as.character(x)
  # (further) split into lines -----------------------------------
  x <- unlist(strsplit(x, split = "\n"))
  # drop lines if needed -----------------------------------------
  if (!is.na(re_drop_line)) {
    x <- x[grep(re_drop_line, x, perl = perl, invert = TRUE)]
  }
  # paste lines in long line if needed ---------------------------
  if (!is.na(line_glue)) {
    x <- paste(x, collapse = line_glue)
  }
  # drop uninteresting regions if needed -------------------------
  if (!is.na(re_cut_area)) {
    x <- gsub(re_cut_area, "", x, perl = perl)
  }
  # identify tokens ----------------------------------------------
  if (!is.na(re_token_splitter)) {
    tokens <- unlist(strsplit(x, re_token_splitter, perl = perl))
  } else {
    m <- gregexpr(re_token_extractor, x, perl = perl)
    tokens <- unlist(regmatches(x, m))
  }
  # -- drop tokens if needed -------------------------------------
  if (!is.na(re_drop_token)) {
    tokens <- tokens[grep(re_drop_token, tokens, perl = perl, invert = TRUE)]
  }
  # transform tokens if needed -----------------------------------
  if (! is.na(re_token_transf_in)) {
    tokens <- gsub(re_token_transf_in, token_transf_out, tokens)
  }
  # tokens to lower if needed ------------------------------------
  if (token_to_lower) {
    tokens <- tolower(tokens)
  }
  class(tokens) <- "tokens"
  tokens
}
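
# Minimal usage sketch for tokenize() (not run): with re_token_splitter set
# to NA, tokens are found with re_token_extractor instead, here keeping only
# alphabetic strings.
#
#   tokenize("The cat sat on the mat.",
#            re_token_splitter = NA,
#            re_token_extractor = "[a-zA-Z']+")
#   # yields: "the" "cat" "sat" "on" "the" "mat"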

# =============================================================================
# the class "freqlist"
# =============================================================================

freqlist <- function(x,
                     re_drop_line = NA,
                     line_glue = NA, 
                     re_cut_area = NA,
                     re_token_splitter = "\\s+",
                     re_token_extractor = "[^\\s]+",
                     re_drop_token = NA,
                     re_token_transf_in = NA,
                     token_transf_out = NA,
                     token_to_lower = TRUE,
                     perl = TRUE,
                     blocksize = 300,
                     verbose = FALSE,
                     show_dots = FALSE,
                     dot_blocksize = 10,
                     file_encoding = "UTF-8",
                     as_text = FALSE) {
  if (as_text) {
    freqlist_char(x,
                  re_drop_line = re_drop_line,
                  line_glue = line_glue,
                  re_cut_area = re_cut_area,
                  re_token_splitter = re_token_splitter,
                  re_token_extractor = re_token_extractor,
                  re_drop_token = re_drop_token,
                  re_token_transf_in = re_token_transf_in,
                  token_transf_out = token_transf_out,
                  token_to_lower = token_to_lower,
                  perl = perl)
  } else {
    freqlist_corp(x,
                  re_drop_line = re_drop_line,
                  line_glue = line_glue,
                  re_cut_area = re_cut_area,
                  re_token_splitter = re_token_splitter,
                  re_token_extractor = re_token_extractor,
                  re_drop_token = re_drop_token,
                  re_token_transf_in = re_token_transf_in,
                  token_transf_out = token_transf_out,
                  token_to_lower = token_to_lower,
                  perl = perl,
                  blocksize = blocksize,
                  verbose = verbose,
                  show_dots = show_dots,
                  dot_blocksize = dot_blocksize,
                  file_encoding = file_encoding)
  }
}
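
# Minimal usage sketch for freqlist() (not run): with as_text = TRUE the
# input is treated as the corpus text itself; by default it is treated as a
# vector of corpus file names.
#
#   flist <- freqlist("the cat sat on the mat", as_text = TRUE)
#   print(flist, n = 3)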

freqlist_char <- function(x,
                          re_drop_line = NA,
                          line_glue = NA, 
                          re_cut_area = NA,
                          re_token_splitter = "\\s+",
                          re_token_extractor = "[^\\s]+",
                          re_drop_token = NA,
                          re_token_transf_in = NA,
                          token_transf_out = NA,
                          token_to_lower = TRUE,
                          perl = TRUE) {
  if ("tokens" %in% class(x)) {
    freqlist <- table(x)
  } else {
    freqlist <- table(tokenize(x,
                               re_drop_line = re_drop_line,
                               line_glue = line_glue,
                               re_cut_area = re_cut_area,
                               re_token_splitter = re_token_splitter,
                               re_token_extractor = re_token_extractor,
                               re_drop_token = re_drop_token,
                               re_token_transf_in = re_token_transf_in,
                               token_transf_out = token_transf_out,
                               token_to_lower = token_to_lower,
                               perl = perl))
  }  
  class(freqlist) <- "freqlist"
  freqlist
}


freqlist_corp <- function(x,
                          re_drop_line = NA,
                          line_glue = NA, # e.g. "\n" or ""
                          re_cut_area = NA,
                          re_token_splitter = "\\s+",
                          re_token_extractor = "[^\\s]+",
                          re_drop_token = NA,
                          re_token_transf_in = NA,
                          token_transf_out = NA,
                          token_to_lower = TRUE,
                          perl = TRUE,
                          blocksize = 300,
                          verbose = FALSE,
                          show_dots = FALSE,
                          dot_blocksize = 10,
                          file_encoding = "UTF-8") {
  first_pt <- proc.time(); new_pt <- first_pt
  globfreqlist <- vector()
  i = 1
  while (i <= length(x)) {
    j = 0
    blocktokens <- vector()
    while ((j < blocksize) && ((i + j) <= length(x))) {
      fname <- x[i + j]
      # -- read corpus file
      con <- file(fname, encoding = file_encoding)
      newlines <- readLines(con, warn = FALSE)
      close(con)    
      # -- drop lines if needed
      if (!is.na(re_drop_line)) {
        newlines <- newlines[grep(re_drop_line, newlines,
                                  perl = perl, invert = TRUE)]
      }
      # -- paste lines in long line if needed
      if (!is.na(line_glue)) {
        newlines <- paste(newlines, collapse = line_glue)
      }
      # -- drop uninteresting regions if needed
      if (!is.na(re_cut_area)) {
        newlines <- gsub(re_cut_area, "", newlines, perl = perl)
      }
      # -- identify tokens
      if (!is.na(re_token_splitter)) {
        newtokens <- unlist(strsplit(newlines,
                                     re_token_splitter,
                                     perl = perl))
      } else {
        m <- gregexpr(re_token_extractor,
                      newlines,
                      perl = perl)
        newtokens <- unlist(regmatches(newlines, m))
      }
      # -- drop tokens if needed
      if (!is.na(re_drop_token)) {
        newtokens <- newtokens[grep(re_drop_token,
                                    newtokens,
                                    perl = perl,
                                    invert = TRUE)]
      }
      # -- transform tokens if needed
      if (!is.na(re_token_transf_in)) {
        newtokens <- gsub(re_token_transf_in,
                          token_transf_out,
                          newtokens)
      }
      # -- tokens to lower if needed
      if (token_to_lower) {
        newtokens <- tolower(newtokens)
      }
      # ====================================================
      blocktokens <- append(blocktokens, newtokens)
      if (verbose && (((i + j) %% dot_blocksize) == 0)) { 
        cat("."); utils::flush.console() 
      }
      j <- j + 1
    }
    blockfreqlist <- freqlist_from_text(blocktokens)
    globfreqlist <- addfreqlists(globfreqlist, blockfreqlist)
    prev_pt <- new_pt; new_pt <- proc.time()
    if (verbose) {
      cat((i+j)-1,"(", new_pt[3]-first_pt[3], "|", 
           new_pt[3]-prev_pt[3], ")\n")
      utils::flush.console()
    }
    i <- i + j
  }
  if (verbose) cat("\n")
  class(globfreqlist) <- "freqlist"
  globfreqlist
}


as_freqlist <- function(x) {
  if (!"table" %in% class(x)) {
    stop("x must be of class \"table\"")
  }    
  class(x) <- c("freqlist", class(x))
  x
}

as.data.frame.freqlist <- function(x, ...) {
  if (!"freqlist" %in% class(x)) {
    stop("x must be of class \"freqlist\"")
  }    
  d <- data.frame(frequency = as.numeric(x),
                  row.names = names(x))
  d <- d[order(x, decreasing = TRUE), , drop = FALSE]
  d
}

as.data.frame.conc <- function(x, ...) {
  if (! "conc" %in% class(x)) {
    stop("x must be of class \"x\"")
  }    
  d <- x
  class(d) <- setdiff(class(d), "conc")
  d
}

as_conc <- function(x,
                    left = NA,
                    match = NA,
                    right = NA,
                    keep_original = FALSE,
                    ...) {
  if (! "data.frame" %in% class(x)) {
    stop("x must be of class \"data.frame\"")
  }
  d <- x
  class(d) <- c("conc", setdiff(class(d), "conc"))
  # ==
  if (! is.na(left) && left != "left") {
    if (is.null(d[[left]])) {
      stop(paste0("the object 'x' does not have a column ", left, "'"))
    }
    d[["left"]] <- as.character(d[[left]])
    if (! keep_original) {
      d[[left]] <- NULL 
    }    
  }
  # --
  if (is.na(left) || left == "left") {
    if (is.null(d[["left"]])) {
      stop("the object 'x' does not have a column 'left'")
    }
    if (! is.character(d[["left"]])) {
      d[["left"]] <- as.character(d[["left"]])
    }  
  } 
  # ==
  if (! is.na(match) && match != "match") {
    if (is.null(d[[match]])) {
      stop(paste0("the object 'x' does not have a column ", match, "'"))
    }
    d[["match"]] <- as.character(d[[match]])
    if (! keep_original) {
      d[[match]] <- NULL 
    }    
  }
  # --
  if (is.na(match) || match == "match") {
    if (is.null(d[["match"]])) {
      stop("the object 'x' does not have a column 'match'")
    }
    if (! is.character(d[["match"]])) {
      d[["match"]] <- as.character(d[["match"]])
    }  
  } 
  # ==
  if (! is.na(right) && right != "right") {
    if (is.null(d[[right]])) {
      stop(paste0("the object 'x' does not have a column ", right, "'"))
    }
    d[["right"]] <- as.character(d[[right]])
    if (! keep_original) {
      d[[right]] <- NULL 
    }    
  }
  # --
  if (is.na(right) || right == "right") {
    if (is.null(d[["right"]])) {
      stop("the object 'x' does not have a column 'right'")
    }
    if (! is.character(d[["right"]])) {
      d[["right"]] <- as.character(d[["right"]])
    }  
  } 
  # ==
  d
}
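
# Minimal usage sketch for as_conc() (not run; column names are made up):
# promote a plain data frame to class "conc", mapping existing columns onto
# the required 'left', 'match' and 'right' columns.
#
#   d <- data.frame(pre  = "a small",
#                   hit  = "test",
#                   post = "of as_conc",
#                   stringsAsFactors = FALSE)
#   d <- as_conc(d, left = "pre", match = "hit", right = "post")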


print.freqlist <- function(x, n = 6, ...) {
  n_types <- length(x)
  n_tokens <- sum(x)
  sorted_x <- sort(x, decreasing = TRUE)
  n <- min(n, n_types)
  cat("Frequency list (number of types: ",
      n_types,
      ", number of tokens: ",
      n_tokens,
      ")\n",
      sep = "")
  if (n > 0) {
    d <- data.frame(word = names(sorted_x)[1:n],
                    frequency = as.numeric(sorted_x)[1:n])
    print(d)
    if (n_types > n) cat("...\n")
  }
  invisible(x)
}

details.freqlist <- function(x, y) {
  if (!"freqlist" %in% class(x)) {
    stop("x must be of class \"freqlist\"")
  }
  if (!"character" %in% typeof(y)) {
    stop("y must be a character vector.")
  }
  result <- list(); class(result) <- "details.freqlist"
  result$item <- y
  result$freq <- as.numeric(x[y])
  result
}

print.details.freqlist <- function(x, ...) {
  if (!"details.freqlist" %in% class(x)) {
    stop("x must be of class \"details.freqlist\"")
  }
  cat("Word: ",
      x$item,
      "\nFrequency: ",
      x$freq,
      "\n",
      sep = "")  
  invisible(x)
}

zoom_re.freqlist <- function(x, pattern, perl = TRUE) {
  if (!"freqlist" %in% class(x)) {
    stop("x must be of class \"freqlist\"")
  }
  if (!"character" %in% typeof(pattern)) {
    stop("y must be a character vector containing a regular expression")
  }
  result <- list(
      zoom_pattern = pattern,
      zoom_selected = grep(pattern, names(x), perl = perl),
      full_list = x
    )
  class(result) <- "zoom_re.freqlist"
  result
}
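
# Minimal usage sketch for zoom_re() (not run): zoom in on the part of a
# frequency list whose types match a regular expression, here all types
# ending in "ly".
#
#   flist <- freqlist("clearly a truly odd example clearly", as_text = TRUE)
#   zoom_re(flist, "ly$")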

print.zoom_re.freqlist <- function(x, n = 6, ...) {
  if (!"zoom_re.freqlist" %in% class(x)) {
    stop("x must be of class \"zoom_re.freqlist\"")
  }
  n_sel_types <- length(x$zoom_selected)
  cat("Pattern: ",
      x$zoom_pattern,
      " (number of matches: ",
      n_sel_types,
      ")\n",
      sep = "")
  sel_x <- x$full_list[x$zoom_selected]
  sorted_sel_x <- sort(sel_x, decreasing = TRUE)
  n <- min(n, n_sel_types)
  if (n > 0) {
    d <- data.frame(word = names(sorted_sel_x)[1:n],
                    frequency = as.numeric(sorted_sel_x)[1:n])
    print(d)
    if (n_sel_types > n) cat("...\n")
  }
  invisible(x)
}

as.data.frame.zoom_re.freqlist <- function(x, ...) {
  if (!"zoom_re.freqlist" %in% class(x)) {
    stop("x must be of class \"zoom_re.freqlist\"")
  }
  if (length(x$zoom_selected) == 0) {
    NULL
  } else {
    sel_x <- x$full_list[x$zoom_selected]
    sorted_sel_x <- sort(sel_x, decreasing = TRUE)
    d <- data.frame(word = names(sorted_sel_x),
                    frequency = as.numeric(sorted_sel_x))
    d
  }
}


# ============================================================================
# slma()
# ============================================================================


as.data.frame.slma <- function(x, ...) {
  x$scores
}

strsplit_space_tokenizer <- function(x) {
    unlist(strsplit(as.character(x), "\\s+", perl = TRUE))
}    

slma <- function(corp_a, corp_b,
                 sig_cutoff = qchisq(.95, df = 1),
                 small_pos = 0.00001,
                 keep_intermediate = TRUE) {
  #
  # corp_a and corp_b are expected to inherit from tm's class Corpus
  #
  result <- list(intermediate = list()); class(result) <- "slma"  
  corp_ab <- c(corp_a, corp_b)
  tdm <- tm::TermDocumentMatrix(corp_ab,
    control = list(tokenize = strsplit_space_tokenizer,
                   wordLengths = c(1, Inf)))
  scores <- data.frame(row.names = rownames(tdm))  
  d_gsig <- data.frame(row.names = rownames(tdm))  
  d_logor <- data.frame(row.names = rownames(tdm))
  d_dir <- data.frame(row.names = rownames(tdm))
  n <- length(corp_a)
  m <- length(corp_b)
  for (i in 1:n) {
    for (j in (n+1):(n+m)) {
      colname <- paste0(i, "-", j)
      a <- as.vector(tdm[, i])
      b <- sum(a) - a
      c <- as.vector(tdm[, j])
      d <- sum(c) - c
      ascores <- assoc_abcd(a, b, c, d,
                            small_pos = small_pos)
      d_gsig[[colname]] <- ifelse(ascores$G > sig_cutoff, 1, NA) 
      d_logor[[colname]] <- log(ascores$OR)
      d_dir[[colname]] <- ascores$dir
    }  
  }
  scores$S_abs <- apply((d_dir * d_gsig), 1, sum, na.rm = TRUE)
  scores$S_nrm <- scores$S_abs / ncol(d_dir) 
  scores$S_att <- apply((d_dir == 1) * d_gsig, 1, sum, na.rm = TRUE)
  scores$S_rep <- apply((d_dir == -1) * d_gsig, 1, sum, na.rm = TRUE)
  scores$lor_min <- apply((d_logor * d_gsig), 1, adhoc_min) 
  scores$lor_max <- apply((d_logor * d_gsig), 1, adhoc_max) 
  scores$lor_sd <- apply((d_logor * d_gsig), 1, sd, na.rm = TRUE) 
  scores$S_lor <- apply((d_logor * d_gsig), 1, adhoc_sum) / ncol(d_dir)
  
  scores <- scores[order(scores$S_lor, decreasing = TRUE), ]
  # --                               
  result$scores <- scores
  result$sig_cutoff <- sig_cutoff
  result$small_pos <- small_pos
  if (keep_intermediate) {
    result$intermediate$corp_a <- corp_a
    result$intermediate$corp_b <- corp_b
    result$intermediate$tdm <- tdm
  }  
  result
}
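
# Minimal usage sketch for slma() (not run; assumes the tm package and two
# tm Corpus objects, e.g. built with tm::VCorpus(tm::DirSource(...))):
#
#   res <- slma(corp_a, corp_b)
#   print(res, n = 10)
#   details(res, "some_word")   # requires keep_intermediate = TRUE (default)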

print.slma <- function(x, n = 20, ...) {
  n <- min(n, nrow(x$scores))
  print(x$scores[1:n, ])
}


details.slma <- function(x, y) {
  if (!"slma" %in% class(x)) {
    stop("x must be of class \"slma\"")
  }
  if (length(x$intermediate) == 0) {
    stop("x was not created with keep_intermediate = TRUE.")    
  }
  if (!"character" %in% typeof(y)) {
    stop("y must be a character vector.")
  }
  result <- list(); class(result) <- "details.slma"
  result$summary <- x$scores[y, , drop = FALSE]
  freqs <- as.vector(x$intermediate$tdm[y, ])
  corp_sizes <- apply(x$intermediate$tdm, 2, function(x) sum(as.vector(x)))
  n <- length(x$intermediate$corp_a)
  m <- length(x$intermediate$corp_b)
  labs <- as.character(names(corp_sizes))
  idx <- 0
  res <- data.frame(comp = character(n * m),
                    a = numeric(n * m),
                    b = numeric(n * m),
                    c = numeric(n * m),
                    d = numeric(n * m),
                    G = numeric(n * m),
                    sig = numeric(n * m),
                    dir = numeric(n * m),                     
                    dir_sig = numeric(n * m),                     
                    #OR = numeric(n * m),
                    #OR_sig = numeric(n * m),
                    log_OR = numeric(n * m),
                    log_OR_sig = numeric(n * m),                    
                    stringsAsFactors = FALSE)
  for (i in 1:n) {
    for (j in (n+1):(n+m)) {
      idx <- idx + 1
      res[idx, "comp"] <- paste0(labs[i], "--", labs[j])
      res[idx, "a"]    <- freqs[i]
      res[idx, "b"] <- corp_sizes[i] - freqs[i]
      res[idx, "c"] <- freqs[j]
      res[idx, "d"] <- corp_sizes[j] - freqs[j]
      ascores <- assoc_abcd(res[idx, "a"],  res[idx, "b"],
                            res[idx, "c"],  res[idx, "d"],
                            small_pos = x$small_pos)
      res[idx, "G"] <- ascores$G
      res[idx, "sig"] <- ifelse(ascores$G > x$sig_cutoff, 1, 0)
      res[idx, "dir"] <- ascores$dir      
      res[idx, "dir_sig"]  <-  ifelse(ascores$G > x$sig_cutoff,
                                     ascores$dir,
                                     NA)      
      res[idx, "log_OR"]  <-  log(ascores$OR)
      res[idx, "log_OR_sig"]  <-  ifelse(ascores$G > x$sig_cutoff,
                                         log(ascores$OR),
                                         NA)
    }
  }
  row.names(res) <- res$comp
  res$comp <- NULL
  result$details <- res
  result$item <- y
  result$sig_cutoff <- x$sig_cutoff
  result$small_pos <- x$small_pos
  result    
}

print.details.slma <- function(x, ...) {
  if (!"details.slma" %in% class(x)) {
    stop("x must be of class \"details.slma\"")
  }  
  cat(paste0('SLMA details\n'))  
  cat(paste0('[item: "', x$item,'"]\n\n'))
  print(round(x$summary, 3))  
  cat(paste0('\n[cutoff for G: ', round(x$sig_cutoff, 3), ']\n'))
  cat(paste0('[small positive offset: ', x$small_pos, ']\n\n'))
  print(round(x$details, 3))      
}


# ============================================================================
# assoc_abcd()
# ============================================================================
assoc_abcd <- function(a, b, c, d,
                       measures = NULL,
                       with_variants = FALSE,
                       show_dots = FALSE,
                       p_fisher_2 = FALSE,
                       small_pos = 0.00001) {
  assoc_scores_abcd(a, b, c, d,
                    measures = measures,
                    with_variants = with_variants,
                    show_dots = show_dots,
                    p_fisher_2 = p_fisher_2,
                    small_pos = small_pos)
}    
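
# In assoc_abcd() the arguments a, b, c and d are the four cells of a 2x2
# contingency table, as in the calls made by slma() above:
#   a = frequency of the item in the target corpus
#   b = size of the target corpus minus a
#   c = frequency of the item in the reference corpus
#   d = size of the reference corpus minus c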



# ---
#  READ FROM CONSOLE
#

scan_re <- function() {
  result <- ""
  nlines <- 0
  while (grepl("\\S", newline <- readLines(n = 1), perl = TRUE)) {
    if (nchar(result) > 0) {
      result <- paste0(result, "\n")
    }
    result <- paste0(result, newline)
    nlines <- nlines + 1
  }
  if (nlines > 1) {
    result <- paste0(result, "\n")
  }
  result
}

scan_re2 <- function() {
  x <- scan(what = "character", sep = "\n")
  paste(x, collapse = "\n")
}


scan_txt <- function() {
  scan_re()
}

cat_re <- function(x,
                   format = c("plain", "R", "TeX"),
                   as_single_line = TRUE) {
  if (as_single_line) {
    x <- gsub("(?<![[\\\\])#[^\n]*[\r\n]*", "", x, perl = TRUE)
    x <- gsub("[\r\n]+", " ", x, perl = TRUE)
    x <- gsub("\\s+", " ", x, perl = TRUE)
    x <- gsub("(^[ ]|[ ]$)", "", x, perl = TRUE)
  }
  if (format[1] == "R") {
    x <- gsub("\\\\", "\\\\\\\\", x, perl = TRUE)
    x <- gsub("\"", "\\\\\"", x, perl = TRUE)
    x <- paste0("\"", x, "\"")
  }
  cat(x)
  cat("\n\n")
  invisible(x)
}
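
# Usage sketch (not run; interactive only): scan_re() reads a (possibly
# multi-line) regular expression from the console, stopping at the first
# blank line; cat_re() prints it back, optionally collapsed onto a single
# line and/or quoted as an R string literal.
#
#   r <- scan_re()            # type the regex, end with a blank line
#   cat_re(r, format = "R")   # print it as a double-quoted R string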


# ---
#  READ RAW CORPUS FILE
#

read.raw.corpfile.tokens <- function(file, fileEncoding="") {
  # -- helper function of read.raw.corpus.freqinfo()
  con <- file(file, encoding = fileEncoding)
  lines <- readLines(con, warn=FALSE)
  close(con)
  # collapse with a space, so that tokens cannot get glued together
  # across line breaks
  long.line <- paste(lines, collapse=" ")
  return(strsplit(long.line, "\\s+", perl=TRUE)[[1]])
}

make.freqlist <- function(tokens) {
  # -- helper function of read.raw.corpus.freqinfo()
  return(table(as.factor(tokens)))
}

merge.freqlists <- function(flist1, flist2) {
  # -- helper function of read.raw.corpus.freqinfo()
  names <- union(names(flist1), names(flist2))
  flist1 <- flist1[names]; names(flist1) <- names; flist1[is.na(flist1)] <- 0
  flist2 <- flist2[names]; names(flist2) <- names; flist2[is.na(flist2)] <- 0
  return(flist1 + flist2)
}

merge_freqlists <- function(x, y) {
  names <- union(names(x), names(y))
  x <- x[names]
  names(x) <- names
  x[is.na(x)] <- 0
  y <- y[names]
  names(y) <- names
  y[is.na(y)] <- 0
  x + y
}
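
# Example (not run): merge_freqlists() adds up two named frequency vectors,
# treating a type that is missing from one of the lists as having frequency
# zero there.
#
#   f1 <- c(the = 3, cat = 1)
#   f2 <- c(the = 2, mat = 1)
#   merge_freqlists(f1, f2)   # the = 5, cat = 1, mat = 1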


# ---
#  FREQUENCY LISTS, KEYWORDS AND COLLOCATION RETRIEVAL FUNCTIONS
#

raw.corpus.freqlist <- function(fnames, 
                                fileEncoding="", 
                                drop.tokens=NULL) {
  freqlist <- numeric(0)
  tokens <- character(0)
  for (i in 1:length(fnames)) {
     fname     <- fnames[i]
     newtokens <- read.raw.corpfile.tokens(fname, fileEncoding=fileEncoding)
     if (! is.null(drop.tokens)) {
       newtokens <- newtokens[- grep(drop.tokens, newtokens, perl=TRUE)]
     }
     tokens    <- append(tokens, newtokens)
     cat("."); utils::flush.console()
     if ((i %% 20) == 0) {
       freqlist <- merge.freqlists(freqlist, make.freqlist(tokens))
       tokens <- character(0)
     }
     if (((i %% 60) == 0) | (i == length(fnames))) {
       cat("\n"); utils::flush.console()
     }
  }
  freqlist <- merge.freqlists(freqlist, make.freqlist(tokens))
  tokens <- character(0)
  return(freqlist)
}

raw.corpus.surface.cooccur <- function(fnames, 
                                       freqlist,
                                       L=3, 
                                       R=3,
                                       boundary.marker="<boundary />",
                                       fileEncoding="", 
                                       drop.tokens=NULL) {
  # ---
  itemlen <- length(freqlist)
  itempos <- 1:itemlen; names(itempos) <- names(freqlist)
  # ---
  pair.names <- outer(names(freqlist), names(freqlist), paste, sep=" | ")
  dim(pair.names) <- NULL
  pair.freqs <- rep(0, length(pair.names))
  names(pair.freqs) <- pair.names
  # ---
  tokens <- character(0)
  for (i in 1:length(fnames)) {
     fname     <- fnames[i]
     tokens <- read.raw.corpfile.tokens(fname, fileEncoding=fileEncoding)
     if (! is.null(drop.tokens)) {
       tokens <- tokens[- grep(drop.tokens, tokens, perl=TRUE)]
     }
     cat("."); utils::flush.console()
     # ---
     for (j in 1:length(tokens)) { # loop over collocates
       colloc <- tokens[j]
       if (! is.na(freqlist[colloc])) {
         nodes <- character(0)
         # --- look for nodes to left of collocate ---
         k <- 1
         while ((k <= R) && ((j - k) > 0)) {  # R from persp. of colloc is L from persp. of node
           node <- tokens[j - k]
           if (is.na(node)) {
             k <- R+1
           } else if (node == boundary.marker) {
             k <- R+1
           } else {
             if (! is.na(freqlist[node])) {
               nodes <- append(nodes, node)             
             }
             k <- k+1
           }
         }  # end of while (k <= R) {  }
         # --- look for nodes to right of collocate ---
         k <- 1
         while (k <= L) {  # L from persp. of colloc is R from persp. of node
           node <- tokens[j + k]
           if (is.na(node)) {
             k <- L+1
           } else if (node == boundary.marker) {
             k <- L+1
           } else {
             if (! is.na(freqlist[node])) {
               nodes <- append(nodes, node)
             }
             k <- k+1
           }
         }  # end of while (k <= L) {  }
         if (length(nodes) > 0) {
           for (node in levels(as.factor(nodes))) {
               pos <- itempos[colloc] + (itempos[node]-1)*itemlen
               pair.freqs[pos] = pair.freqs[pos] + 1
           }
         }
       }  # end of: if (! is.na(freqlist[colloc]))  {  }
     }  # end of: for (j in 1:length(tokens)) {  }
     # ---
     if (((i %% 60) == 0) | (i == length(fnames))) {
       cat("\n"); utils::flush.console()
     }  # end of: if (((i %% 60) == 0) | (i == length(fnames))) {  }
  }  # end of: for (i in 1:length(fnames)) { }
  return(pair.freqs)
}

surf_cooc <- function(x, 
                      re_node,
                      w_left = 3, 
                      w_right = 3,
                      re_boundary = NA,
                      re_drop_line = NA,
                      line_glue = NA, 
                      re_cut_area = NA,
                      re_token_splitter = "\\s+",
                      re_token_extractor = "[^\\s]+",
                      re_drop_token = NA,
                      re_token_transf_in = NA,
                      token_transf_out = NA,
                      token_to_lower = TRUE,
                      perl = TRUE,
                      blocksize = 300,
                      verbose = FALSE,
                      dot_blocksize = 10,
                      file_encoding = "UTF-8") {
  first_pt <- proc.time(); new_pt <- first_pt
  retval <- list()
  globfreqlist1 <- vector()
  globfreqlist2 <- vector()
  i = 1
  while (i <= length(x)) {
    j = 0
    blocktokens <- vector()
    while ((j < blocksize) && ((i + j) <= length(x))) {
      file <- x[i + j]
      # -- read file
      con <- file(file, encoding = file_encoding)
      newlines <- readLines(con, warn = FALSE)
      close(con)    
      # -- drop lines if needed
      if (! is.na(re_drop_line)) {
        newlines <- newlines[grep(re_drop_line, newlines,
                                  perl = perl, invert = TRUE)]
      }
      # -- paste lines in long line if needed
      if (! is.na(line_glue)) {
        newlines <- paste(newlines, collapse = line_glue)
      }
      # -- drop uninteresting regions if needed
      if (! is.na(re_cut_area)) {
        newlines <- gsub(re_cut_area, "", newlines, perl = perl)
      }
      # -- identify tokens
      if (! is.na(re_token_splitter)) {
        newtokens <- unlist(strsplit(newlines,
                                     re_token_splitter,
                                     perl = perl))
      } else {
        m <- gregexpr(re_token_extractor, newlines, perl = perl)
        newtokens <- unlist(regmatches(newlines, m))
      }
      # -- drop tokens if needed
      if (! is.na(re_drop_token)) {
        newtokens <- newtokens[grep(re_drop_token, newtokens,
                                    perl = perl, invert = TRUE)]
      }
      # -- transform tokens if needed
      if (! is.na(re_token_transf_in)) {
        newtokens <- gsub(re_token_transf_in, token_transf_out,
                          newtokens)
      }
      # -- tokens to lower if needed
      if (token_to_lower) {
        newtokens <- tolower(newtokens)
      }
      # ====================================================
      blocktokens <- append(blocktokens, newtokens)
      if (verbose && (((i + j) %% dot_blocksize) == 0)) { 
        cat(".")
        utils::flush.console() 
      }
      j <- j + 1
    }
    match_pos <- grep(re_node, blocktokens, perl = perl)
    bound_pos <- vector()
    if (!is.na(re_boundary)) {
      bound_pos <- grep(re_boundary, blocktokens, perl = perl)
    }
    target_pos <- vector()
    new_pos <- match_pos
    if (w_left > 0) {
      for (k in 1:w_left) {
        new_pos <- new_pos - 1
        new_pos <- setdiff(new_pos, bound_pos)
        target_pos <- union(target_pos, new_pos)
      }
    }
    new_pos <- match_pos
    if (w_right > 0) {
      for (k in 1:w_right) {
        new_pos <- new_pos + 1
        new_pos <- setdiff(new_pos, bound_pos)
        target_pos <- union(target_pos, new_pos)
      }
    }
    target_pos <- target_pos[target_pos > 0]
    target_pos <- target_pos[target_pos <= length(blocktokens)]
    target_pos <- setdiff(target_pos, match_pos)
    
    ref_pos <- setdiff(1:length(blocktokens), target_pos)
    ref_pos <- setdiff(ref_pos, match_pos)
    ref_pos <- setdiff(ref_pos, bound_pos)
    
    t1 <- table(blocktokens[target_pos])
    blockfreqlist1 <- as.vector(t1)
    names(blockfreqlist1) <- names(t1)
    globfreqlist1 <- addfreqlists(globfreqlist1, blockfreqlist1)
    
    t2 <- table(blocktokens[ref_pos])
    blockfreqlist2 <- as.vector(t2)
    names(blockfreqlist2) <- names(t2)
    globfreqlist2 <- addfreqlists(globfreqlist2, blockfreqlist2)

    prev_pt <- new_pt; new_pt <- proc.time()
    if (verbose) {
      cat((i + j) - 1,"(", new_pt[3] - first_pt[3], "|", 
           new_pt[3] - prev_pt[3], ")\n")
      utils::flush.console()
    }
    i <- i + j
  }
  cooc_info(target_freqlist = globfreqlist1,
            ref_freqlist = globfreqlist2,
            target_n = sum(globfreqlist1),
            ref_n = sum(globfreqlist2))
}
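
# Minimal usage sketch for surf_cooc() (not run; file names are made up):
# surface co-occurrence frequencies for a node word in a window of three
# tokens on either side, collected over a vector of corpus files.
#
#   fnames <- list.files("corpus_dir", full.names = TRUE)
#   ci <- surf_cooc(fnames, re_node = "^government$",
#                   w_left = 3, w_right = 3)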

text_cooc <- function(x, 
                      re_node,
                      re_boundary = NA,
                      re_drop_line = NA,
                      line_glue = NA,
                      re_cut_area = NA,
                      re_token_splitter = "\\s+",
                      re_token_extractor = "[^\\s]+",
                      re_drop_token = NA,
                      re_token_transf_in = NA,
                      token_transf_out = NA,
                      token_to_lower = TRUE,
                      perl = TRUE,
                      blocksize = 300,
                      verbose = FALSE,
                      dot_blocksize = 10,
                      file_encoding = "UTF-8") {
  first_pt <- proc.time(); new_pt <- first_pt
  retval <- list()
  globfreqlist1 <- vector()
  corpsize1 <- 0
  globfreqlist2 <- vector()
  corpsize2 <- 0
  i = 1
  while (i <= length(x)) {
    j = 0
    blocktokens1 <- vector()
    blocktokens2 <- vector()
    while ((j < blocksize) && ((i + j) <= length(x))) {
      file <- x[i + j]
      # -- read file
      con <- file(file, encoding = file_encoding)
      newlines <- readLines(con, warn = FALSE)
      close(con)    
      # -- drop lines if needed
      if (! is.na(re_drop_line)) {
        newlines <- newlines[grep(re_drop_line, newlines,
                                  perl = perl, invert = TRUE)]
      }
      # -- paste lines in long line if needed
      if (! is.na(line_glue)) {
        newlines <- paste(newlines, collapse = line_glue)
      }
      # -- drop uninteresting regions if needed
      if (! is.na(re_cut_area)) {
        newlines <- gsub(re_cut_area, "", newlines, perl = perl)
      }
      # -- identify tokens
      if (! is.na(re_token_splitter)) {
        newtokens <- unlist(strsplit(newlines,
                                     re_token_splitter,
                                     perl = perl))
      } else {
        m <- gregexpr(re_token_extractor, newlines, perl = perl)
        newtokens <- unlist(regmatches(newlines, m))
      }
      # -- drop tokens if needed
      if (! is.na(re_drop_token)) {
        newtokens <- newtokens[grep(re_drop_token, newtokens,
                                    perl = perl, invert = TRUE)]
      }
      # -- transform tokens if needed
      if (! is.na(re_token_transf_in)) {
        newtokens <- gsub(re_token_transf_in,
                          token_transf_out,
                          newtokens)
      }
      # -- tokens to lower if needed
      if (token_to_lower) {
        newtokens <- tolower(newtokens)
      }
      # ====================================================
      if (! is.na(re_boundary)) {
        boundaries <- grep(re_boundary, newtokens, perl = perl)
      } else {
        boundaries <- numeric(0)
      }
      if (length(boundaries) > 0) {
        v <- newtokens[-boundaries]
        f <- rep(1:(length(boundaries) + 1),
                 c(boundaries, length(newtokens) + 1) -
                 c(1, boundaries + 1))
        txts <- split(v, f)
      } else {
        txts <- list('1' = newtokens)
      }
      for (item in txts) {
        hits <- grep(re_node, item, perl = perl)
        if (length(hits) > 0) {
          # count each type at most once per text
          blocktokens1 <- c(blocktokens1, names(table(item[-hits])))
          corpsize1 <- corpsize1 + 1
        } else {
          blocktokens2 <- c(blocktokens2, names(table(item)))
          corpsize2 <- corpsize2 + 1
        }
      }
      # ====================================================
      if (verbose && (((i + j) %% dot_blocksize) == 0)) { 
        cat(".")
        utils::flush.console() 
      }
      j <- j + 1
    }
    # --
    t1 <- table(blocktokens1)
    blockfreqlist1 <- as.vector(t1)
    names(blockfreqlist1) <- names(t1)
    globfreqlist1 <- addfreqlists(globfreqlist1, blockfreqlist1)

    t2 <- table(blocktokens2)
    blockfreqlist2 <- as.vector(t2)
    names(blockfreqlist2) <- names(t2)
    globfreqlist2 <- addfreqlists(globfreqlist2, blockfreqlist2)

    # --
    prev_pt <- new_pt; new_pt <- proc.time()
    if (verbose) {
      cat((i + j) - 1,"(", new_pt[3] - first_pt[3], "|", 
           new_pt[3] - prev_pt[3], ")\n")
      utils::flush.console()
    }
    i <- i + j
  }
  cooc_info(target_freqlist = globfreqlist1,
            ref_freqlist = globfreqlist2,
            target_n = corpsize1,
            ref_n = corpsize2)
}
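
# Minimal usage sketch for text_cooc() (not run; file names and boundary
# token are made up): text-level co-occurrence, counting for each type the
# number of texts in which it co-occurs with the node word; texts within a
# file can be separated by a boundary token matching re_boundary.
#
#   ci <- text_cooc(fnames, re_node = "^government$",
#                   re_boundary = "^<text_boundary>$")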


# to test: how can we make sure that punctuation marks are not counted: drop.token ?

raw.getsurfcooc.node <- function(
          files, 
          re.node,
          L=3, 
          R=3,
          re.boundary="<boundary />",
          re.split="[.'!?;;,\\s]+",
          re.ok.line=".*",
          re.drop.token=NA,
          to.lower=TRUE,
          perl=TRUE,
          blocksize=300,
          verbose=FALSE,
          dot.blocksize=10,
          fileEncoding="latin1") {
  first.pt <- proc.time(); new.pt <- first.pt
  retval <- list()
  globfreqlist1 <- vector()
  globfreqlist2 <- vector()
  i = 1
  while (i <= length(files)) {
    j = 0
    blocktokens <- vector()
    while ((j < blocksize) && ((i+j) <= length(files))) {
      file <- files[i+j]
      # --
      con <- file(file, encoding = fileEncoding)
      newlines <- readLines(con, warn=FALSE)
      close(con)    
      # --
      newsellines <- newlines[grep(re.ok.line, newlines, perl=perl)]
      newtokens <- unlist(strsplit(newsellines, re.split, perl=perl))
      if (to.lower) { newtokens <- tolower(newtokens) }
      if (! is.na(re.drop.token)) {
        newtokens <- newtokens[- grep(re.drop.token, newtokens, perl=perl)]
      }
      blocktokens <- append(blocktokens, newtokens)
      if (verbose && (((i+j)%%dot.blocksize) == 0)) { 
        cat("."); utils::flush.console() 
      }
      j <- j + 1
    }
    match.pos <- grep(re.node, blocktokens, perl=perl)
    bound.pos <- vector()
    if (! is.na(re.boundary)) {
      bound.pos <- grep(re.boundary, blocktokens, perl=perl)
    }
    target.pos <- vector()
    new.pos <- match.pos
    if (L > 0) {
      for (k in 1:L) {
        new.pos <- new.pos - 1
        new.pos <- setdiff(new.pos, bound.pos)
        target.pos <- union(target.pos, new.pos)
      }
    }
    new.pos <- match.pos
    if (R > 0) {
      for (k in 1:R) {
        new.pos <- new.pos + 1
        new.pos <- setdiff(new.pos, bound.pos)
        target.pos <- union(target.pos, new.pos)
      }
    }
    target.pos <- target.pos[target.pos > 0]
    target.pos <- target.pos[target.pos <= length(blocktokens)]
    target.pos <- setdiff(target.pos, match.pos)
    
    ref.pos <- setdiff(1:length(blocktokens), target.pos)
    ref.pos <- setdiff(ref.pos, match.pos)
    ref.pos <- setdiff(ref.pos, bound.pos)
    
    t1 <- table(blocktokens[target.pos])
    blockfreqlist1 <- as.vector(t1); names(blockfreqlist1) <- names(t1)
    globfreqlist1 <- addfreqlists(globfreqlist1, blockfreqlist1)
    
    t2 <- table(blocktokens[ref.pos])
    blockfreqlist2 <- as.vector(t2); names(blockfreqlist2) <- names(t2)
    globfreqlist2 <- addfreqlists(globfreqlist2, blockfreqlist2)

    prev.pt <- new.pt; new.pt <- proc.time()
    if (verbose) {
      cat((i+j)-1,"(", new.pt[3]-first.pt[3], "|", 
           new.pt[3]-prev.pt[3], ")\n")
      utils::flush.console()
    }
    i <- i+j
  }
  retval$targetfreqs <- globfreqlist1
  retval$reffreqs <- globfreqlist2
  return(retval)
}


raw.corpus.surface.cooccur.node.old <- function(
                                       fnames, 
                                       freqlist,
                                       L=3, 
                                       R=3,
                                       node.regexp,
                                       boundary.marker="<boundary />",
                                       fileEncoding="", 
                                       drop.tokens=NULL) {
  # ---
  itemlen <- length(freqlist)
  itempos <- 1:itemlen; names(itempos) <- names(freqlist)
  # ---
  item.freqs <- rep(0, length(freqlist))
  names(item.freqs) <- names(freqlist)
  # ---
  tokens <- character(0)
  for (i in 1:length(fnames)) {
     fname     <- fnames[i]
     tokens <- read.raw.corpfile.tokens(fname, fileEncoding=fileEncoding)
     if (! is.null(drop.tokens)) {
       tokens <- tokens[- grep(drop.tokens, tokens, perl=TRUE)]
     }
     node.pos <- rep(FALSE, length(tokens))
     node.pos[grep(node.regexp, tokens, perl=TRUE)] <- TRUE
     cat("."); utils::flush.console()
     # ---
     for (j in 1:length(tokens)) { # loop over collocates
       colloc <- tokens[j]
       if (! is.na(freqlist[colloc])) {
         node.found <- FALSE
         # --- look for nodes to left of collocate ---
         k <- 1
         while ((k <= R) && ((j - k) > 0)) {  # R from persp. of colloc is L from persp. of node
           node <- tokens[j - k]
           if (is.na(node)) {
             k <- R+1
           } else if (node == boundary.marker) {
             k <- R+1
           } else {
             if (node.pos[j-k]) {
               node.found <- TRUE             
             }
             k <- k+1
           }
         }  # end of while (k <= R) {  }
         # --- look for nodes to right of collocate ---
         k <- 1
         while (k <= L) {  # L from persp. of colloc is R from persp. of node
           node <- tokens[j + k]
           if (is.na(node)) {
             k <- L+1
           } else if (node == boundary.marker) {
             k <- L+1
           } else {
             if (node.pos[j+k]) {
               node.found <- TRUE
             }
             k <- k+1
           }
         }  # end of while (k <= L) {  }
         if (node.found) {
               pos <- itempos[colloc]
               item.freqs[pos] = item.freqs[pos] + 1
         }
       }  # end of: if (! is.na(freqlist[colloc]))  {  }
     }  # end of: for (j in 1:length(tokens)) {  }
     # ---
     if (((i %% 60) == 0) | (i == length(fnames))) {
       cat("\n"); utils::flush.console()
     }  # end of: if (((i %% 60) == 0) | (i == length(fnames))) {  }
  }  # end of: for (i in 1:length(fnames)) { }
  return(item.freqs)
}


surface.cooccur.stats <- function(cooccur.freq, freqlist, corpus.size,
                                  min.freq=3) {
  cooccur.freq <- cooccur.freq[cooccur.freq >= min.freq]
  d <- data.frame(fullname = names(cooccur.freq), stringsAsFactors=FALSE)
  d$colloc = gsub("^(.*) \\| .*$", 
                  "\\1", names(cooccur.freq), perl=TRUE)
  d$node <- gsub("^.* \\| (.*)$", 
                 "\\1", names(cooccur.freq), perl=TRUE)
  d$a <- cooccur.freq
  d$b <- freqlist[d$node] - d$a
  d$c <- freqlist[d$colloc] - d$a
  d$d <- corpus.size - d$a - d$b - d$c
  d$DICE <- calc.DICE(d$a, d$b, d$c, d$d); cat("."); utils::flush.console()
  d$OR <- calc.OR(d$a, d$b, d$c, d$d); cat("."); utils::flush.console()
  d$log.OR <- calc.log.OR(d$a, d$b, d$c, d$d); cat("."); utils::flush.console()
  d$PMI <- calc.PMI(d$a, d$b, d$c, d$d); cat("."); utils::flush.console()
  d$Chi2 <- calc.Chi2(d$a, d$b, d$c, d$d);  cat("."); utils::flush.console()
  d$Chi2.signed <- calc.Chi2.signed(d$a, d$b, d$c, d$d); cat("."); utils::flush.console()
  d$p.Chi2 <- calc.p.Chi2(d$a, d$b, d$c, d$d);  cat("."); utils::flush.console() # two-sided
  d$G2 <- calc.G2(d$a, d$b, d$c, d$d); cat("."); utils::flush.console()
  d$G2.signed <- calc.G2.signed(d$a, d$b, d$c, d$d);  cat("."); utils::flush.console()
  d$p.G2 <- calc.p.G2(d$a, d$b, d$c, d$d);  cat("."); utils::flush.console() # two-sided
  d$t <- calc.t(d$a, d$b, d$c, d$d);  cat("."); utils::flush.console()      
  d$p.t <- calc.p.t(d$a, d$b, d$c, d$d); cat("."); utils::flush.console()
                                         # use of this test not well motivated!!!
                                         # one-sided 
                                         # (only detects a too high)
  d$t.as.chisq1 <- p_to_chisq1(d$p.t); cat("."); utils::flush.console()
                                                   # bring p to scale of Chi2 and G2
  d$p.fisher <- calc.p.fisher(d$a, d$b, d$c, d$d); cat("."); utils::flush.console()
                                                   # one-sided 
                                                   # (only detects a too high)
  d$fisher.as.chisq1 <- p_to_chisq1(d$p.fisher); cat("."); utils::flush.console()
                                                   # bring p to scale of Chi2/G2
  cat("\n"); utils::flush.console()
  return(d)
}

# ---
# RUN CONCORDANCE IN R
#

conc_re <- function(pattern,
                    x,
                    c_left = 200,
                    c_right = 200,
                    perl = TRUE,
                    after_line = "\n",
                    file_encoding = "UTF-8",
                    as_text = FALSE) {
  n_texts <- length(x)
  list_source <- vector("list", n_texts)
  list_left <- vector("list", n_texts)
  list_hits <- vector("list", n_texts)
  list_right <- vector("list", n_texts)
  if (length(file_encoding) < n_texts) {
     file_encoding <- rep(file_encoding, length = n_texts)
  }   
  for (i in seq_along(x)) {
    if (as_text) {
      text <- x[i]
    } else {
      fname <- x[i]
      text <- read_txt(fname, paste_char = after_line,
                       file_encoding = file_encoding[i])
    }
    m <- gregexpr(pattern, text, perl = perl)[[1]]
    start <- as.numeric(m)
    stop <- start + attr(m, "match.length") - 1
    left <- vector("character", 0)
    hits <- vector("character", 0)
    right <- vector("character", 0)
    if (start[1] > 0) { # if there are matches
         left  <- substr(rep(text, length(start)), start - c_left, start - 1)
         hits  <- substr(rep(text, length(start)), start, stop)
         right <- substr(rep(text, length(start)), stop + 1, stop + c_right)
    }
    if (as_text) {
        list_source[[i]] <- rep("-", length(left))
    } else {
        list_source[[i]] <- rep(fname, length(left))        
    }
    list_left[[i]] <- left
    list_hits[[i]] <- hits
    list_right[[i]] <- right      
  }
  src <- unlist(list_source)
  lft <- unlist(list_left)
  hts <- unlist(list_hits)
  rgt <- unlist(list_right)
  if (length(src) > 0) {
     gid <- id <- 1:length(src)
     lft <- cleanup_spaces(lft, remove_leading = FALSE, remove_trailing = FALSE) 
     hts <- cleanup_spaces(hts, remove_leading = FALSE, remove_trailing = FALSE) 
     rgt <- cleanup_spaces(rgt, remove_leading = FALSE, remove_trailing = FALSE) 
  } else {
     gid <- id <- numeric(0)
  }
  df <- data.frame(glob_id = gid,
                   id = id,
                   source = src,
                   left = lft,
                   match = hts,
                   right = rgt,
                   stringsAsFactors = FALSE)
  class(df) <- c("conc", class(df))
  df
}
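
# Minimal usage sketch for conc_re() (not run): run a concordance for a
# regular expression directly on a string (as_text = TRUE); by default x is
# taken to be a vector of corpus file names.
#
#   d <- conc_re("cats?", "the cat chased the cats", as_text = TRUE)
#   print_kwic(d)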


# ---
#  FROM CONCORDANCES TO DATASETS
#

read_conc_antconc <- function(file,
                              version = c("3.4.3", "3.2.4"),
                              file_encoding = "") {
# ---
# - Assumes AntConc concordance results were saved with the default Concordance
#   Preferences. Especially important are:
#       Display Options
#           Hit number: YES
#           KWIC display: YES
#           File name: YES
#       Other options
#           Put tab spaces around hits in KWIC display: preferably YES (which
#           is not the default, but the default value NO is also OK)
# - Also, you must make sure to specify the correct file encoding.
  # --
  con <- file(file, encoding = file_encoding)
  lines <- readLines(con, warn = FALSE)
  close(con)
  # --
  d <- NA
  if (length(lines) > 0) {
    if (version[1] == "3.2.4") {
        # ----------------------------------------------------------
        # TAB is assumed to surround hit (but this is not checked)
        # ----------------------------------------------------------
        cells <- strsplit(lines, split = "\t", fixed = TRUE)
        d <- data.frame(x = 1:length(lines)); d$x <- NULL
        d$id <- unlist(lapply(cells, "[", 1))
        d$left <- unlist(lapply(cells, "[", 2))
        d$match <- unlist(lapply(cells, "[", 3))
        d$right <- unlist(lapply(cells, "[", 4))
        d$source <- unlist(lapply(cells, "[", 5))   
    } else if (version[1] == "3.4.3") {
        # ----------------------------------------------------------
        # tab is preferred to surround hit (but may not)
        # ----------------------------------------------------------
        cells <- strsplit(lines, split = "\t", fixed = TRUE)
        d <- data.frame(x = 1:length(lines)); d$x <- NULL
        d$id <- unlist(lapply(cells, "[", 1))
        d$left <- unlist(lapply(cells, "[", 2))
        col3 <- unlist(lapply(cells, "[", 3))
        max_n_col3 <- max(nchar(col3))
        if (max_n_col3 == 0) {
          # there are TABS around the hits in KWIC display
          d$match <- unlist(lapply(cells, "[", 4))
          d$right <- unlist(lapply(cells, "[", 5))
          d$source <- unlist(lapply(cells, "[", 6))
        } else {
          # there are no TABS around the hits in KWIC display;
          # we can only guess what the boundaries of the hit are
          d$match <- NA
          d$right <- unlist(lapply(cells, "[", 3))
          d$match <- gsub("^(\\s*[^ ]*).*$", "\\1", d$right, perl = TRUE)
          d$right <- substr(d$right, nchar(d$match) + 1, nchar(d$right))
          d$source <- unlist(lapply(cells, "[", 4))       
        }
    }
  }
  d
}
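
# usage sketch for read_conc_antconc() (hypothetical file name; kept in
# comments so that sourcing this file has no side effects):
#   d <- read_conc_antconc("antconc_results.txt",
#                          version = "3.4.3",
#                          file_encoding = "UTF-8")
#   head(d[, c("left", "match", "right")])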

read.conc.antconc.prev <- function(file, version=c("3.4.3", "3.2.4"),
  fileEncoding="") {
# ---
# - Assumes AntConc concordance results were saved with the default Concordance
#   Preferences. Especially important are:
#       Display Options
#           Hit number: YES
#           KWIC display: YES
#           File name: YES
#       Other options
#           Put tab spaces around hits in KWIC display: preferably YES (which
#           is not the default, but the default value NO is also OK)
# - Also, you must make sure to specify the correct file encoding.
# ---
# To be tested: do I need to use readChar, as I did in
#               read.conc.antconc.old(...), or is it safe to
#               use scan(...)
# ---
  lines <- scan(file, what="char", sep="\n", 
                fileEncoding=fileEncoding, blank.lines.skip = FALSE,
                quiet=TRUE)  
  d <- NA
  if (length(lines) > 0) {
    if (version[1] == "3.2.4") {
        # ---------------------------------------------------------- 
        # TAB is assumed to surround hit (but this is not checked) 
        # ---------------------------------------------------------- 
        cells <- strsplit(lines, split="\t", fixed=TRUE)
        d <- data.frame(x=1:length(lines)); d$x <- NULL
        d$Id <- unlist(lapply(cells, "[", 1))
        d$Left <- unlist(lapply(cells, "[", 2))
        d$Node <- unlist(lapply(cells, "[", 3))
        d$Right <- unlist(lapply(cells, "[", 4))
        d$Source <- unlist(lapply(cells, "[", 5))    
    } else if (version[1] == "3.4.3") {
        # ---------------------------------------------------------- 
        # tab is preferred to surround hit (but may not)
        # ---------------------------------------------------------- 
        cells <- strsplit(lines, split="\t", fixed=TRUE)
        d <- data.frame(x=1:length(lines)); d$x <- NULL
        d$Id <- unlist(lapply(cells, "[", 1))
        d$Left <- unlist(lapply(cells, "[", 2))
        col3 <- unlist(lapply(cells, "[", 3))
        max.n.col3 <- max(nchar(col3))
        if (max.n.col3 == 0) {
          # there are TABS around the hits in KWIC display
          d$Node <- unlist(lapply(cells, "[", 4))
          d$Right <- unlist(lapply(cells, "[", 5))
          d$Source <- unlist(lapply(cells, "[", 6))
        } else {
          # there are no TABS around the hits in KWIC display
          # we can only guess what the boundaries of the hit are
          d$Node <- NA
          d$Right <- unlist(lapply(cells, "[", 3))
          d$Node <- gsub("^(\\s*[^ ]*).*$", "\\1", d$Right, perl=TRUE)
          d$Right <- substr(d$Right, nchar(d$Node)+1, nchar(d$Right))
          d$Source <- unlist(lapply(cells, "[", 4))        
        }
    }
  }
  return(d)
}


read.conc.antconc.old <- function(file, window.size=50, fileEncoding="") {
# ---
# - Assumes AntConc concordance results were saved with the default Concordance
#   Preferences. Especially important are:
#       Display Options
#           Hit number: YES
#           KWIC display: YES
#           File name: YES
#       Other options
#           Put tab spaces around hits in KWIC display: preferably YES (which
#           is not the default, but the default value NO is also OK)
# - Also, the window.size argument must specify the CORRECT window size that
#   was used in AntConc when running the query.
# - Also, you must make sure to specify the correct file encoding.
# ---
  fileSize <- file.info(file)$size
  connection <- file(file, "rb")
  chars <- iconv(readChar(connection, fileSize, useBytes=TRUE),
                 from=fileEncoding)
  close(connection)
  chars <- gsub("\r", " ", chars)
  lines <- strsplit(chars, "\n")[[1]]
  Id <- gsub("^([^\t]*)\t.*$", "\\1", lines, perl=TRUE)
  lines <- substr(lines, nchar(Id)+2, nchar(lines))
  #KWIC <- gsub("^[^\t]*\t(.*)\t[^\t]*$", "\\1", lines, perl=TRUE)
  Source <- gsub("^.*\t([^\t]*)$", "\\1", lines, perl=TRUE)
  lines <- substr(lines, 1, nchar(lines)-nchar(Source)-1)
  Left <- substr(lines, 1, window.size)
  lines <- substr(lines, window.size+1, nchar(lines))
  Node <- rep("", length(lines))
  hasHitInTabs <- grep("^\t([^\t]*)\t.*$", lines, perl=TRUE)
  hitInTabs <- gsub("^\t([^\t]*)\t.*$", "\\1", lines, perl=TRUE)
  Node[hasHitInTabs] <- hitInTabs[hasHitInTabs]
  hasHitNoTabs <- grep("^([\\w]*).*$", lines, perl=TRUE)
  hitNoTabs <- gsub("^([\\w]*).*$", "\\1", lines, perl=TRUE)
  hasHitNoTabs <- setdiff(hasHitNoTabs, hasHitInTabs)
  Node[hasHitNoTabs] <- hitNoTabs[hasHitNoTabs]
  NodeSize <- nchar(Node)+1
  NodeSize[hasHitInTabs] <- NodeSize[hasHitInTabs] + 2 
  Right <- substr(lines, NodeSize, nchar(lines))
  Left <- gsub("\t", " ", Left, perl=TRUE)
  Node <- gsub("\t", " ", Node, perl=TRUE)
  Right <- gsub("\t", " ", Right, perl=TRUE)
  return(data.frame(Id=Id, Source=Source, Left=Left, Node=Node, Right=Right))
}

read.conc.antconc.veryold <- function(file, window.size=50, fileEncoding="") {
# ---
# - Assumes AntConc concordance results were saved with the default Concordance
#   Preferences. Especially important are:
#       Display Options
#           Hit number: YES
#           KWIC display: YES
#           File name: YES
#       Other options
#           Put tab spaces around hits in KWIC display: NO
# - Also, the window.size argument must specify the CORRECT window size that
#   was used in AntConc when running the query.
# - Also, you must make sure to specify the correct file encoding.
# ---
  lines <- scan(file, what="char", sep="\n", 
                fileEncoding=fileEncoding, blank.lines.skip = FALSE,
                quiet=TRUE)
  Id <- gsub("^([^\t]*)\t.*$", "\\1", lines, perl=TRUE)
  KWIC <- gsub("^[^\t]*\t(.*)\t[^\t]*$", "\\1", lines, perl=TRUE)
  Source <- gsub("^.*\t([^\t]*)$", "\\1", lines, perl=TRUE)
  possible.antconc.errors <- (nchar(KWIC) < window.size*2)
  KWIC[possible.antconc.errors] <- format(KWIC[possible.antconc.errors], 
    width=(window.size*2 + 20)) # mitigate occasional AntConc error
  Left <- substr(KWIC, 1, window.size)
  Node <- substr(KWIC, window.size+1, nchar(KWIC)-window.size)
  Right <- substr(KWIC, nchar(KWIC)-window.size+1,nchar(KWIC))
  Left <- gsub("\t", " ", Left, perl=TRUE)
  Node <- gsub("\t", " ", Node, perl=TRUE)
  Right <- gsub("\t", " ", Right, perl=TRUE)
  return(data.frame(Id=Id, Source=Source, Left=Left, Node=Node, Right=Right))
}

conc_to_dataset_antconc <- function(x,
                                    outfile,
                                    version=c("3.4.3", "3.2.4"),
                                    file_encoding="") {
  if (length(file_encoding) < length(x)) {
    file_encoding <- rep(file_encoding, length = length(x))
  }
  lines <- vector()
  for (i in 1:length(x)) {
      d <- read_conc_antconc(x[i],
                             version = version, 
                             file_encoding = file_encoding[i])
      if (! is.null(dim(d))) {
        lines <- append(lines,
                        paste(as.character(d$id),
                              as.character(d$source),
                              as.character(d$left),
                              as.character(d$match),
                              as.character(d$right), sep = "\t"))
      }
  }
  lines <- paste(1:length(lines),"\t", lines, sep = "")
  lines <- append("glob_id\tid\tsource\tleft\tmatch\tright", lines)
  y <- paste0(paste(lines, collapse = "\n"), "\n")
  con <- file(outfile, "wb")
  writeBin(charToRaw(y), con, endian = "little")
  close(con)
  invisible(x)
}

conc_to_dataset_corpuseye <- function(x, outfile) {
# ---
# - x names one or several files to which the concordance search
#   results from the corpuseye website (http://corp.hum.sdu.dk/)
#   were exported (by clicking on export all and then saving as *.txt)
# - such files always have encoding UTF-8
# - we also save the results with encoding UTF-8
# ---
  corpdata <- vector()
  for (infile in x) {
    con <- file(infile, encoding = "UTF-8")
    lines <- readLines(con, warn = FALSE)
    close(con)
    corpdata <- append(corpdata, lines)
  }
  corp <- paste(corpdata, collapse = " \n")
  corp <- gsub("([^\n]) \n([^\n])", "\\1 \\2", corp, perl = TRUE)
  corp <- gsub("\n \n", "\n", corp, perl = TRUE)
  corp <- gsub("\\*", "\t", corp, perl = TRUE)
  corp <- gsub("\\|", "", corp, perl = TRUE)
  corp <- unlist(strsplit(corp, "\n"))
  corp <- corp[grep("[^\\s]", corp, perl = TRUE)]
  corp <- paste("corpuseye", corp, sep = "\t")
  corp <- paste(1:length(corp), "\t", corp, sep = "")  # prepend the id column
  corp <- paste(1:length(corp), "\t", corp, sep = "")  # prepend the glob_id column
  # some more appending needs to be done here
  lines <- append("glob_id\tid\tsource\tleft\tmatch\tright", corp)
  y <- paste0(paste(lines, collapse = "\n"), "\n")
  con <- file(outfile, "wb")
  writeBin(charToRaw(y), con, endian = "little")
  close(con)
  invisible(x)
}

import_conc <- function(x,
                        file_encoding = "UTF-8",
                        source_type = c("corpuseye"),
                        ...) {
  if (! is.character(source_type)) {
    stop("the argument 'source_type' must be a character string")
  }
  source_type <- tolower(source_type)
  switch(source_type,
         corpuseye = import_conc_corpuseye(x, ...))
}  
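
# usage sketch for import_conc() (hypothetical file names):
#   conc <- import_conc(c("corpuseye_export_1.txt",
#                         "corpuseye_export_2.txt"),
#                       source_type = "corpuseye")
#   print(conc, n = 3)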

import_conc_corpuseye <- function(x, ...) {
  if (! is.character(x)) {
    stop("argument 'x' must be a character vector containing file names")
  } 
  corpdata <- character(0)
  for (in_file in x) {
    new_lines <- read_txt(in_file, encoding = "UTF-8") 
    corpdata <- append(corpdata, new_lines)
  }
  corp <- paste(corpdata, collapse = " \n")
  corp <- gsub("([^\n]) \n([^\n])", "\\1 \\2", corp, perl = TRUE)
  corp <- gsub("\n \n", "\n", corp, perl = TRUE)
  corp <- gsub("\\*", "\t", corp, perl = TRUE)
  corp <- gsub("\\|", "", corp, perl = TRUE)
  corp <- unlist(strsplit(corp, "\n"))
  corp <- corp[grep("[^\\s]", corp, perl = TRUE)]
  corp <- paste("corpuseye", corp, sep = "\t")
  corp <- paste(1:length(corp), "\t", corp, sep = "")  # prepend the id column
  corp <- paste(1:length(corp), "\t", corp, sep = "")  # prepend the glob_id column
  lines <- append("glob_id\tid\tsource\tleft\tmatch\tright", corp)
  # --  
  cols <- unlist(strsplit(lines[1], "\t"))
  lines <- lines[2:length(lines)]
  cells <- strsplit(lines, "\t")
  d <- data.frame(row.names = 1:length(lines))
  for (i in 1:length(cols)) {
    col_name <- cols[i]
    d[[col_name]] <- unlist(lapply(cells, "[", i))
    d[[col_name]] <- type.convert(d[[col_name]], as.is = TRUE)
  }
  class(d) <- c("conc", class(d))
  d
}

merge_conc <- function(..., show_warnings = TRUE) {
  if (show_warnings) { 
    df <- dplyr::bind_rows(...)
  } else {
    suppressWarnings(dplyr::bind_rows(...)) 
  } 
  df$glob_id <- 1:nrow(df)
  df
}

conc.to.dataset.corpuseye.prev <- function(infiles, outfile) {
# ---
# - infiles are one or several files to which the concordance search 
#   results from the corpuseye website (http://corp.hum.sdu.dk/)
#   were exported (by clicking on export all and then saving as *.txt)
# - such files always have encoding UTF-8
# - we also save the results with encoding UTF-8
# ---
  corpdata <- vector()
  for (infile in infiles) {
    lines <- scan(infile, what="char", sep="\n", 
                  fileEncoding="UTF-8", blank.lines.skip = FALSE,
                  quiet=TRUE)
    corpdata <- append(corpdata, lines)
  }
  corp <- paste(corpdata, collapse=" \n")
  corp <- gsub("([^\n]) \n([^\n])", "\\1 \\2", corp, perl=TRUE)
  corp <- gsub("\n \n", "\n", corp, perl=TRUE)
  corp <- gsub("\\*", "\t", corp, perl=TRUE)
  corp <- gsub("\\|", "", corp, perl=TRUE)
  corp <- unlist(strsplit(corp, "\n"))
  corp <- corp[grep("[^\\s]", corp, perl=TRUE)]
  corp <- paste("corpuseye", corp, sep="\t")
  corp <- paste(1:length(corp),"\t", corp, sep="")
  corp <- paste(1:length(corp),"\t", corp, sep="")
  # some more appending needs to be done here
  writeLines(iconv(append("GlobId\tId\tSource\tLeft\tNode\tRight",corp),
                   to="UTF-8", mark=TRUE), 
             outfile, useBytes=TRUE)
}

conc_to_dataset_bibleworks <- function(x, metafiles, outfile) {
  data <- vector()
  id <- vector()
  metadata <- vector()
  source <- vector()
  for (i in 1:length(x)) {
    con <- file(x[i], encoding = "UTF-8")
    data_lines <- readLines(con, warn = FALSE)
    close(con)    
    data_lines <- data_lines[nchar(data_lines) > 0]
    id_lines <- 1:length(data_lines)
    con <- file(metafiles[i], encoding = "UTF-8")
    metadata_lines <- readLines(con, warn = FALSE)
    close(con)    
    metadata_lines <- metadata_lines[nchar(metadata_lines) > 0]
    source_lines <- rep(x[i], length(data_lines))
    id <- append(id, id_lines)
    data <- append(data, data_lines)
    metadata <- append(metadata, metadata_lines)
    source <- append(source, source_lines)
  }
  d <- data.frame(glob_id = 1:length(data),
                  id = id,
                  source = source,
                  data = data,
                  metadata = metadata)
  d <- d[-nrow(d), ] # in a future update, this should be done more elegantly
  write_dataset(d, outfile)
  invisible(x)
}


conc.to.dataset.bibleworks.prev <- function(datafiles, metafiles, outfile) {
  data <- vector()
  id <- vector()
  metadata <- vector()
  source <- vector()
  for (i in 1:length(datafiles)) {
    data.lines <- scan(datafiles[i], what="char", sep="\n", 
                       fileEncoding="UTF-8", blank.lines.skip = TRUE,
                       quiet=TRUE)
    id.lines <- 1:length(data.lines)
    metadata.lines <- scan(metafiles[i], what="char", sep="\n", 
                       fileEncoding="UTF-8", blank.lines.skip = TRUE,
                       quiet=TRUE)
    source.lines <- rep(datafiles[i], length(data.lines))
    id <- append(id, id.lines)
    data <- append(data, data.lines)
    metadata <- append(metadata, metadata.lines)
    source <- append(source, source.lines)
  }
  d <- data.frame(GlobId=1:length(data),
                  Id=id,
                  Source=source,
                  Data=data,
                  Metadata=metadata)
  d <- d[-nrow(d),] # in a future update, this should be done more elegantly
  write_dataset(d, outfile)
}

conc.to.dataset.bibleworks2 <- function(datafiles, metafiles, outfile) {
  data <- vector()
  id <- vector()
  metadata <- vector()
  source <- vector()
  for (i in 1:length(datafiles)) {
    data.lines <- scan(datafiles[i], what="char", sep="\n", 
                       fileEncoding="UTF-8", blank.lines.skip = FALSE,
                       quiet=TRUE)
    data.lines <- paste(data.lines, collapse=" \n")
    data.lines <- gsub("([^\n]) \n([^\n])", "\\1 \\2", data.lines, perl=TRUE)
    data.lines <- gsub("^ \n ", "", data.lines, perl=TRUE)
    data.lines <- gsub("\n \n", "\n", data.lines, perl=TRUE)
    data.lines <- unlist(strsplit(data.lines, " \n "))
    id.lines <- 1:length(data.lines)
    metadata.lines <- scan(metafiles[i], what="char", sep="\n", 
                       fileEncoding="UTF-8", blank.lines.skip = FALSE,
                       quiet=TRUE)
    metadata.lines <- paste(metadata.lines, collapse=" \n")
    metadata.lines <- gsub("([^\n]) \n([^\n])", "\\1 \\2", metadata.lines, perl=TRUE)
    metadata.lines <- gsub("^ \n ", "", metadata.lines, perl=TRUE)
    metadata.lines <- gsub("\n \n", "\n", metadata.lines, perl=TRUE)
    metadata.lines <- unlist(strsplit(metadata.lines, " \n "))
    source.lines <- rep(datafiles[i], length(data.lines))
    id <- append(id, id.lines)
    data <- append(data, data.lines)
    metadata <- append(metadata, metadata.lines)
    source <- append(source, source.lines)
  }
  d <- data.frame(GlobId=1:length(data),
                  Id=id,
                  Source=source,
                  Data=data,
                  Metadata=metadata)
  write_dataset(d, outfile)
}

# --
# reads a text file into a character vector
# - if paste_char is NA, each input line is a separate item in
#   the character vector
# - if paste_char is "\n", then the character vector contains
#   a single item, in which all input lines are concatenated,
#   using "\n" as line terminator.
# --
read_txt <- function(file,
                     file_encoding = "UTF-8",
                     paste_char = NA,
                     ...) {
  con <- file(file, encoding = file_encoding)
  lines <- readLines(con, warn = FALSE)
  close(con)
  if (! is.na(paste_char)) {
    lines <- paste0(paste(lines, collapse = paste_char), paste_char)
  }
  lines
}
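
# usage sketch for read_txt(), illustrating the effect of paste_char
# (hypothetical file name):
#   lns <- read_txt("example.txt")                     # one item per line
#   txt <- read_txt("example.txt", paste_char = "\n")  # one single string,
#                                                      # lines joined by "\n"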

write_txt <- function(x,
                      file = "",
                      file_encoding = "UTF-8",
                      paste_char = "\n") {
# --
# writes a character vector to a text file, using paste_char
# as line terminator.
# - paste_char must be a single character string; each item of x
#   becomes a separate line, terminated by paste_char
# --
  if (! is.character(x) || length(x) == 0) {
    stop("argument 'x' must be a character vector of at least length one")
  }
  if (! is.character(paste_char) || length(paste_char) != 1) {
    stop("argument 'paste_char' must be a character vector of length one")
  }
  x <- paste0(paste(x, collapse = paste_char), paste_char)
  write_txt_utf8(x, file = file)
  invisible(x)
}
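
# usage sketch for write_txt() (hypothetical file name); a read_txt()
# round trip returns the original vector:
#   write_txt(c("first line", "second line"), "example_out.txt")
#   identical(read_txt("example_out.txt"),
#             c("first line", "second line"))   # TRUE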

write_txt_utf8 <- function(x,
                           file = "") {
  if (! is.character(x) || length(x) != 1) {
    stop("argument 'x' must be a character vector of length one")
  } 
  con <- file(file, "wb")
  writeBin(charToRaw(x), con, endian = "little")
  close(con)
  invisible(x)
}


# read a data frame (simpler, but potentially more robust than read.table)
# (header, quote and comment_char have not been implemented yet)
read_dataset <- function(file,
                         header = TRUE,
                         sep = "\t",
                         quote = "",
                         comment_char = "",
                         file_encoding="UTF-8",
                         stringsAsFactors = default.stringsAsFactors(),
                         ...) {
  con <- file(file, encoding = file_encoding)
  lines <- readLines(con, warn = FALSE)
  close(con)
  cols <- unlist(strsplit(lines[1], sep))
  lines <- lines[2:length(lines)]
  cells <- strsplit(lines, sep)
  d <- data.frame(row.names = 1:length(lines))
  for (i in 1:length(cols)) {
      col_name <- cols[i]
      d[[col_name]] <- unlist(lapply(cells, "[", i))
      d[[col_name]] <- type.convert(d[[col_name]],
                                    as.is = ! stringsAsFactors)
  }
  d
}

# read concordance from file
read_conc <- function(file,
                      header = TRUE,
                      sep = "\t",
                      quote = "",
                      comment_char = "",
                      file_encoding="UTF-8",
                      stringsAsFactors = default.stringsAsFactors(),
                      ...) {
  lines <- read_txt(file, file_encoding = file_encoding) 
  cols <- unlist(strsplit(lines[1], sep))
  lines <- lines[2:length(lines)]
  cells <- strsplit(lines, sep)
  d <- data.frame(row.names = 1:length(lines))
  for (i in 1:length(cols)) {
      col_name <- cols[i]
      d[[col_name]] <- unlist(lapply(cells, "[", i))
      if (col_name %in% c("source", "left", "match", "right")) {
        d[[col_name]] <- type.convert(d[[col_name]], as.is = TRUE)
      } else {
        d[[col_name]] <- type.convert(d[[col_name]],
                                      as.is = ! stringsAsFactors) 
      }   
  }
  class(d) <- c("conc", class(d))
  d
}



read.dataset.prev <- function(file, header=TRUE, sep="\t", 
                              quote="", comment.char="", fileEncoding="UTF-8", ...) {
# --
# reads a data frame from the file format that is most often used in mcl
#   (header, quote and comment.char have not been implemented yet)
# --
  lines <- scan(file, what="char", sep="\n", 
                fileEncoding=fileEncoding, blank.lines.skip = FALSE,
                quiet=TRUE)
  cols <- unlist(strsplit(lines[1], sep))
  lines <- lines[2:length(lines)]
  cells <- strsplit(lines, sep)
  d <- data.frame(x=1:length(lines)); d$x <- NULL
  for (i in 1:length(cols)) {
      d[[cols[i]]] <- unlist(lapply(cells, "[", i))
      d[[cols[i]]] <- type.convert(d[[cols[i]]])
  }
  return(d)
}

write_dataset <- function(x,
                          file = "",
                          sep = "\t",
                          file_encoding = "UTF-8") {
  if (nrow(x) > 0) {
    lines <- as.character(x[, 1])
    for (i in 2:ncol(x)) {
      lines <- paste(lines, as.character(x[, i]), sep = sep)
    }
    names <- paste(names(x), collapse = sep)
    x <- paste0(paste(append(names, lines), collapse = "\n"), "\n")
    con <- file(file, "wb")
    writeBin(charToRaw(x), con, endian = "little")
    close(con)
  }
  invisible(x)
}
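
# usage sketch for write_dataset() and read_dataset() (hypothetical
# file name):
#   d <- data.frame(word = c("the", "of"), freq = c(120, 75))
#   write_dataset(d, "freqs.tsv")
#   d2 <- read_dataset("freqs.tsv")  # same columns; types are re-inferred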


# write a concordance
write_conc <- function(x,
                       file = "",
                       sep = "\t",
                       file_encoding = "UTF-8") {
  if (nrow(x) > 0) {
    lines <- as.character(x[, 1])
    for (i in 2:ncol(x)) {
      lines <- paste(lines, as.character(x[, i]), sep = sep)
    }
    names <- paste(names(x), collapse = sep)
    x <- paste0(paste(append(names, lines), collapse = "\n"), "\n")
    con <- file(file, "wb")
    writeBin(charToRaw(x), con, endian = "little")
    close(con)
  }
  invisible(x)
}



write.dataset.prev <- function(x, file="", sep="\t", fileEncoding="UTF-8") {
# --
# writes a data frame to the file format that is most often used in mcl
# --
  if (nrow(x) > 0) {
    lines <- as.character(x[,1])
    for (i in 2:ncol(x)) {
      lines <- paste(lines, as.character(x[,i]), sep=sep)
    }
    names <- paste(names(x), collapse=sep)
    writeLines(iconv(append(names, lines), to=fileEncoding, mark=TRUE), 
               file, useBytes=TRUE)  
  }
}

# ---
#  FREQUENCY LISTS (represented as named numeric vectors)
#


freqlist_from_text <- function(x, all_freq_one = FALSE) { 
  t <- table(x)
  if (all_freq_one) {
    retval <- rep(1, length(t))
  } else {
    retval <- as.vector(t)
  }
  names(retval) <- names(t)
  class(retval) <- "freqlist"
  return(retval)
}
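
# usage sketch for freqlist_from_text(); the input is assumed to be an
# already tokenized character vector:
#   toks <- c("the", "cat", "sat", "on", "the", "mat")
#   fl <- freqlist_from_text(toks)
#   unclass(fl)  # named counts: cat 1, mat 1, on 1, sat 1, the 2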

addfreqlists <- function(x, y) {
  sum.names <- union(names(x), names(y))
  sum.x <- x[sum.names]; sum.x[is.na(sum.x)] <- 0
  sum.y <- y[sum.names]; sum.y[is.na(sum.y)] <- 0
  sum.freq <- sum.x + sum.y
  names(sum.freq) <- sum.names
  class(sum.freq) <- "freqlist"
  return(sum.freq)
}
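
# usage sketch for addfreqlists(); frequencies are summed over the union
# of the two name sets:
#   fl1 <- freqlist_from_text(c("a", "a", "b"))
#   fl2 <- freqlist_from_text(c("b", "c"))
#   unclass(addfreqlists(fl1, fl2))  # a 2, b 2, c 1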

# the following function may eventually be turned into
# sort.freqlist(), i.e. a function that can be called
# with sort(x, ...), with x a freqlist
sortfreqlist <- function(x, criterion=c("word", "freq"), decreasing=FALSE) {
  if (criterion[1] == "freq") {
    return(sort(x, decreasing=decreasing))
  } else {
    return(x[sort(names(x), decreasing=decreasing)])
  }
}

writefreqlist <- function(x, file="", sep="\t", fileEncoding="UTF-8") {
# --
# writes a frequency list to file 
# --
  lines <- c("word\tfreq", paste(names(x), x, sep="\t"))
  writeLines(iconv(lines, to=fileEncoding, mark=TRUE), file, useBytes=TRUE)
}

readfreqlist <- function(file, header=TRUE, sep="\t", 
                         quote="", comment.char="", 
                         fileEncoding="UTF-8", ...) {
# --
# reads a frequency list from file 
#   (quote and comment.char have not been implemented yet)
# --
  # --
  con <- file(file, encoding = fileEncoding)
  lines <- readLines(con, warn=FALSE)
  close(con)    
  # --
  #lines <- scan(file, what="char", sep="\n", 
  #              fileEncoding=fileEncoding,
  #              blank.lines.skip = FALSE,
  #              quiet=TRUE, ...)
  if (header) { lines <- lines[2:length(lines)] }
  cells <- strsplit(lines, sep)
  x <- as.numeric(unlist(lapply(cells, "[", 2)))
  names(x) <- unlist(lapply(cells, "[", 1))
  return(x)
}

# the dependency on raw.getfreqlist in this function
# needs to be eliminated

read.wordlist <- function(file, header=TRUE, sep="\t", 
                          quote="", comment.char="", 
                          fileEncoding="UTF-8", ...) {
# --
# reads a word list from file 
#   (quote and comment.char have not been implemented yet)
# --
  return(names(raw.getfreqlist(file, fileEncoding=fileEncoding)))
}



raw.getfreqlist <- function(files,
                            re.split="[.'!?;;,\\s]+",
                            re.ok.line=".*",
                            re.drop.token=NA,
                            to.lower=TRUE,
                            perl=TRUE,
                            blocksize=300,
                            verbose=FALSE,
                            dot.blocksize=10,
                            fileEncoding="latin1") {
  first.pt <- proc.time(); new.pt <- first.pt
  globfreqlist <- vector()
  i <- 1
  while (i <= length(files)) {
    j <- 0
    blocktokens <- vector()
    while ((j < blocksize) && ((i+j) <= length(files))) {
      file <- files[i+j]
      # --
      con <- file(file, encoding = fileEncoding)
      newlines <- readLines(con, warn=FALSE)
      close(con)    
      # --
      #newlines <- scan(file, what="char", sep="\n", 
      #                 fileEncoding=fileEncoding, 
      #                 blank.lines.skip=FALSE, quiet=TRUE)
      newsellines <- newlines[grep(re.ok.line, newlines, perl=perl)]
      newtokens <- unlist(strsplit(newsellines, re.split, perl=perl))
      if (to.lower) { newtokens <- tolower(newtokens) }
      if (! is.na(re.drop.token)) {
        newtokens <- newtokens[- grep(re.drop.token, newtokens, perl=perl)]
      }
      blocktokens <- append(blocktokens, newtokens)
      if (verbose && (((i+j)%%dot.blocksize) == 0)) { 
        cat("."); utils::flush.console() 
      }
      j <- j + 1
    }
    t <- table(blocktokens)
    blockfreqlist <- as.vector(t); names(blockfreqlist) <- names(t)
    globfreqlist <- addfreqlists(globfreqlist, blockfreqlist)
    prev.pt <- new.pt; new.pt <- proc.time()
    if (verbose) {
      cat((i+j)-1,"(", new.pt[3]-first.pt[3], "|", 
           new.pt[3]-prev.pt[3], ")\n")
      utils::flush.console()
    }
    i <- i+j
  }
  cat("\n")
  return(globfreqlist)
}

wpl.getfreqlist <- function(files, 
                            re.ok.line=".*",
                            re.token.match=".*",
                            re.token.replace="\\1",
                            re.drop.token=NA,
                            perl=TRUE,
                            blocksize=300,
                            verbose=FALSE,
                            dot.blocksize=10) {
  first.pt <- proc.time(); new.pt <- first.pt
  globfreqlist <- vector()
  i <- 1
  while (i <= length(files)) {
    j <- 0
    blocktokens <- vector()
    while ((j < blocksize) && ((i+j) <= length(files))) {
      file <- files[i+j]
      # --
      con <- file(file, encoding = "latin1")
      newlines <- readLines(con, warn=FALSE)
      close(con)    
      # --
      #newlines <- scan(file, what="char", sep="\n", 
      #                 fileEncoding="latin1", 
      #                 blank.lines.skip = FALSE, quiet=TRUE)
      newsellines <- newlines[grep(re.ok.line, newlines, perl=perl)]
      newtokens <- gsub(re.token.match, re.token.replace, 
                          newsellines, perl=TRUE)
      if (! is.na(re.drop.token)) {
        newtokens <- newtokens[- grep(re.drop.token, newtokens, perl=perl)]
      }
      blocktokens <- append(blocktokens, newtokens)
      if (verbose && (((i+j)%%dot.blocksize) == 0)) { 
        cat("."); utils::flush.console() 
      }
      j <- j + 1
    }
    t <- table(blocktokens)
    blockfreqlist <- as.vector(t); names(blockfreqlist) <- names(t)
    globfreqlist <- addfreqlists(globfreqlist, blockfreqlist)
    prev.pt <- new.pt; new.pt <- proc.time()
    if (verbose) {
      cat((i+j)-1,"(", new.pt[3]-first.pt[3], "|", 
           new.pt[3]-prev.pt[3], ")\n")
      utils::flush.console()
    }
    i <- i+j
  }
  cat("\n")
  return(globfreqlist)
}


show_dot <- function(show_dots = FALSE) {
  if (show_dots) {
    cat(".")
    utils::flush.console()
  }
  invisible(show_dots)
}

# ------------------------------------------------------------------------------
# keywords and collocation related functions
# ------------------------------------------------------------------------------
# All functions in this section assume the data stem from frequency tables of 
# the form:
#                           target item  other item
#         target context              a           b
#          other context              c           d 
#
# Moreover, all functions accept a, b, c and d as equally sized vectors of
# length 1 or higher. They take a[1], b[1], c[1] and d[1] to stem from
# frequency table 1, a[2], b[2], c[2] and d[2] to stem from frequency
# table 2, etc.
# ------------------------------------------------------------------------------

# function that returns association scores on the basis of
# the frequencies a, b, c and d

assoc_scores_abcd <- function(a, b, c, d,
                              measures = NULL,
                              with_variants = FALSE,
                              show_dots = FALSE,
                              p_fisher_2 = FALSE,
                              small_pos = 0.00001) {
  if (is.null(measures)) {
    measures <- c("exp_a", "DP_rows", "RR_rows",
                  "OR", "MS", "PMI", "Dice",
                  "G", "chi2", "t", "fisher")
  }
  a <- zero_plus(a, small_pos = small_pos)
  b <- zero_plus(b, small_pos = small_pos)
  c <- zero_plus(c, small_pos = small_pos)
  d <- zero_plus(d, small_pos = small_pos)
  m <- a + b; n <- c + d; k <- a + c; l <- b + d; N <- m + n
  ea <- (m * k)/N; eb <- (m * l)/N 
  ec <- (n * k)/N; ed <- (n * l)/N
  # =========================================================================
  retval <- data.frame(a = a, b = b, c = c, d = d) # TODO: also strip the zero_plus() adjustment from these output columns
  retval$dir <- ifelse(a/m < c/n, -1, 1) # direction
  # =========================================================================
  # -- exp_a --
  if (is.element("exp_a", measures) ||
      is.element("expected", measures) ||
      is.element("ALL", measures)) {
    retval$exp_a <- ea
    show_dot(show_dots)
  }
  # -- exp_b --
  if (is.element("exp_b", measures) ||
      is.element("expected", measures) ||
      is.element("ALL", measures)) {
    retval$exp_b <- eb
    show_dot(show_dots)
  }
  # -- exp_c --
  if (is.element("exp_c", measures) ||
      is.element("expected", measures) ||
      is.element("ALL", measures)) {
    retval$exp_c <- ec
    show_dot(show_dots)
  }
  # -- exp_d --
  if (is.element("exp_d", measures) ||
      is.element("expected", measures) ||
      is.element("ALL", measures)) {
    retval$exp_d <- ed
    show_dot(show_dots)
  }
  # =========================================================================  
  # -- DP_rows, difference of proportions (aka delta p) --
  if (is.element("DP_rows", measures) ||
      is.element("DP", measures) ||
      is.element("ALL", measures)) {
    retval$DP_rows <- (a / m) - (c / n)
    show_dot(show_dots)
  }
  # -- DP_cols, difference of proportions (aka delta p) --
  if (is.element("DP_cols", measures) ||
      is.element("DP", measures) ||
      is.element("ALL", measures)) {
    retval$DP_cols <- (a / k) - (b / l)
    show_dot(show_dots)
  }
  # -- perc_DIFF_rows, %DIFF --
  if (is.element("perc_DIFF_rows", measures) ||
      is.element("perc_DIFF", measures) ||
      is.element("ALL", measures)) {
    retval$perc_DIFF_rows <- 100 * (a / m - c / n) / (c / n) 
    show_dot(show_dots)
  }  
  # -- perc_DIFF_cols, %DIFF --
  if (is.element("perc_DIFF_cols", measures) ||
      is.element("perc_DIFF", measures) ||
      is.element("ALL", measures)) {
    retval$perc_DIFF_cols <- 100 * (a / k - b / l) / (b / l) 
    show_dot(show_dots)
  }
  # -- DC_rows, difference coefficient --
  if (is.element("DC_rows", measures) ||
      is.element("DC", measures) ||
      is.element("ALL", measures)) {
    retval$DC_rows <- (a/m - c/n) / (a / m + c / n) 
    show_dot(show_dots)
  }
  # -- DC_cols, difference coefficient --
  if (is.element("DC_cols", measures) ||
      is.element("DC", measures) ||
      is.element("ALL", measures)) {
    retval$DC_cols <- (a/k - b/l) / (a / k + b / l)
    show_dot(show_dots)
  }    
  # =========================================================================   
  # -- RR_rows, relative risk --
  if (is.element("RR_rows", measures) ||
      is.element("RR", measures) ||
      is.element("ALL", measures)) {
    retval$RR_rows <- (a/m) / (c/n)
    show_dot(show_dots)
  }
  # -- RR_cols, relative risk --
  if (is.element("RR_cols", measures) ||
      is.element("RR", measures) ||
      is.element("ALL", measures)) {
    retval$RR_cols <- (a/k) / (b/l)
    show_dot(show_dots)
  }
  # -- LR_rows, Hardie's Log Ratio (rows) --
  if (is.element("LR_rows", measures) ||
      is.element("LR", measures) ||
      is.element("ALL", measures)) {
    retval$LR_rows <- log2((a / m) / (c / n))
    show_dot(show_dots)
  }
  # -- LR_cols, Hardie's Log Ratio (cols) --
  if (is.element("LR_cols", measures) ||
      is.element("LR", measures) ||
      is.element("ALL", measures)) {
    retval$LR_cols <- log2((a / k) / (b / l))
    show_dot(show_dots)
  }
  # =========================================================================     
  # -- OR, odds ratio --
  if (is.element("OR", measures) ||
      is.element("ALL", measures)) {
    retval$OR <- (a / b) / (c / d) # also equals (a/c) / (b/d)
    show_dot(show_dots)
  }
  # -- log_OR --
  if (is.element("log_OR", measures) ||
      is.element("ALL", measures)) {
    retval$log_OR <- log((a / b) / (c / d))
    show_dot(show_dots)
  }
  # =========================================================================     
  # -- MS, minimum sensitivity --
  if (is.element("MS", measures) ||
      is.element("ALL", measures)) {
    retval$MS <- pmin(a / m, a / k)
    show_dot(show_dots)
  }
  # -- Jaccard --
  if (is.element("Jaccard", measures) ||
      is.element("ALL", measures)) {
    retval$Jaccard <- a / (m + k - a)
    show_dot(show_dots)
  }  
  # -- Dice --
  if (is.element("Dice", measures) ||
      is.element("ALL", measures)) {
    retval$Dice <- (2 * a) / (m + k)
    show_dot(show_dots)
  }
  # -- logDICE --
  if (is.element("logDice", measures) ||
      is.element("ALL", measures)) {
    retval$logDice <- 14 + log2((2 * a)/(m + k))
    show_dot(show_dots)
  }  
  # =========================================================================       
  # -- phi coefficient (equal to Cramer's V for 2 x 2 tables) --
  if (is.element("phi", measures) ||
      is.element("ALL", measures)) {
    retval$phi <- (a * d - b * c) / sqrt((a + b) * (c + d) * (a + c) * (b + d))
    show_dot(show_dots)
  }
  # -- Q (Yule's Q) --
  if (is.element("Q", measures) ||
      is.element("ALL", measures)) {
    retval$Q <- (a * d - b * c) / (a * d + b * c)
    show_dot(show_dots)
  }
  # =========================================================================       
  # -- mu --
  if (is.element("mu", measures) ||
      is.element("ALL", measures)) {
    retval$mu <- a / ea
    show_dot(show_dots)
  }    
  # -- PMI --
  if (is.element("PMI", measures) ||
      is.element("ALL", measures)) {
    retval$PMI <- log2((a / N) / ((k / N) * (m / N))) # i.e. the log2 of mu
    show_dot(show_dots)
  }
  # -- pos.PMI --
  if (is.element("pos_PMI", measures) ||
      is.element("ALL", measures)) {
    retval$pos_PMI <- log2((a / N) / ((k / N) * (m / N)))
    retval$pos_PMI[retval$pos_PMI < 0] <- 0 # remove negs
    show_dot(show_dots)
  }
  # -- PMI2 --
  if (is.element("PMI2", measures) ||
      is.element("ALL", measures)) {
    retval$PMI2 <- log2(((a^2) / N) / ((k / N) * (m / N))) 
    show_dot(show_dots)
  }
  # -- PMI3 --
  if (is.element("PMI3", measures) ||
      is.element("ALL", measures)) {
    retval$PMI3 <- log2(((a^3) / N) / ((k / N) * (m / N))) 
    show_dot(show_dots)
  }  
  # =========================================================================
  # -- chi2 (4-term) --
  if (is.element("chi2", measures) ||
      is.element("ALL", measures)) {
    retval$chi2 <- (a - ea)^2 / ea + (b - eb)^2 / eb + (c - ec)^2 / ec + (d - ed)^2 / ed
    if (with_variants) {
      retval$p_chi2 <- 1 - pchisq(retval$chi2, 1)
      retval$chi2_signed <- retval$chi2 * retval$dir
    }
    show_dot(show_dots)
  }
  # -- chi2 (4-term) with Yates correction --
  if (is.element("chi2_Y", measures) ||
      is.element("ALL", measures)) {
    retval$chi2_Y <- (abs(a - ea) - .5)^2 / ea + (abs(b - eb) - .5)^2 / eb +
                     (abs(c - ec) - .5)^2 / ec + (abs(d - ed) - .5)^2 / ed
    if (with_variants) {
      retval$p_chi2_Y <- 1 - pchisq(retval$chi2_Y, 1)
      retval$chi2_Y_signed <- retval$chi2_Y * retval$dir
    }
    show_dot(show_dots)
  }
  # -- chi2 (2-term) --
  if (is.element("chi2_2T", measures) ||
      is.element("ALL", measures)) {
    retval$chi2_2T <- (a - ea)^2 / ea + (c - ec)^2 / ec
    if (with_variants) {
      retval$p_chi2_2T <- 1 - pchisq(retval$chi2_2T, 1)
      retval$chi2_2T_signed <- retval$chi2_2T * retval$dir
    }
    show_dot(show_dots)
  }
  # -- chi2 (2-term) with Yates correction --
  if (is.element("chi2_2T_Y", measures) ||
      is.element("ALL", measures)) {
    retval$chi2_2T_Y <- (abs(a - ea) - .5)^2 / ea + (abs(c - ec) - .5)^2 / ec
    if (with_variants) {
      retval$p_chi2_2T_Y <- 1 - pchisq(retval$chi2_2T_Y, 1)
      retval$chi2_2T_Y_signed <- retval$chi2_2T_Y * retval$dir
    }
    show_dot(show_dots)
  }
  # =========================================================================
  # note: I call this G, but often in linguistics the name G2 is used
  # =========================================================================
  # -- G (4-term) --
  if (is.element("G", measures) ||
      is.element("ALL", measures)) {
    retval$G <- 2 * (a * log(a / ea) + b * log(b / eb) +
                     c * log(c / ec) + d * log(d / ed))
    if (with_variants) {
      retval$p_G <- 1 - pchisq(retval$G, 1)
      retval$G_signed <- retval$G * retval$dir
    }
    show_dot(show_dots)
  }
  # -- G (2-term) --
  if (is.element("G_2T", measures) ||
      is.element("ALL", measures)) {
    retval$G_2T <- 2 * (a * log(a / ea) + c * log(c / ec))
    if (with_variants) {
      retval$p_G_2T <- 1 - pchisq(retval$G_2T, 1)
      retval$G_2T_signed <- retval$G_2T * retval$dir
    }
    show_dot(show_dots)
  }
  # =========================================================================  
  # -- t --
  if (is.element("t", measures) ||
      is.element("ALL", measures)) {
    retval$t <- ((a / N - k / N * m / N) /
                 sqrt(((a / N) * (1 - a / N)) / N))
    if (with_variants) {
      retval$p_t_1 <- 1 - pt(retval$t, N - 1) # one-sided!
      retval$t_1_as_chisq1 <- p_to_chisq1(retval$p_t_1)
      retval$p_t_2 <- 2 * retval$p_t_1
      sel <- which(retval$t < 0)
      if (length(sel) > 0) {
      # the following line does too much calculation,
      # but attempts to avoid this led to errors
        retval$p_t_2[sel] <- (2 * pt(retval$t, N - 1))[sel]
      }
      retval$t_2_as_chisq1 <- p_to_chisq1(retval$p_t_2)
    }
    show_dot(show_dots)
  }
  # =========================================================================  
  # -- fisher (one-sided!) --
  if (is.element("fisher", measures) ||
      is.element("ALL", measures)) {
    retval$p_fisher_1 <- 1 - phyper(a - 1, m, n, k)
    if (with_variants) {
      retval$fisher_1_as_chisq1 <- p_to_chisq1(retval$p_fisher_1)
      retval$p_fisher_1r <- phyper(a, m, n, k)
      retval$fisher_1r_as_chisq1 <- p_to_chisq1(retval$p_fisher_1r)
    }
    if (p_fisher_2) {
      pf2 <- vector()
      for (i in 1:length(a)) {
        tab <- matrix(nrow = 2, byrow = TRUE,
                      c(round(a[i]), round(b[i]),
                        round(c[i]), round(d[i])))
        pf2[i] <- fisher.test(tab)$p.value
      }
      retval$p_fisher_2 <- pf2
      retval$fisher_2_as_chisq1 <- p_to_chisq1(retval$p_fisher_2)
    }
    show_dot(show_dots)
  }
  #
  retval
}
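
# worked toy example for assoc_scores_abcd(), with made-up frequencies:
# say the target item occurs a = 10 times among the 100 tokens of the
# target context (so b = 90) and c = 20 times among the 900 tokens of
# the reference context (so d = 880):
#   assoc_scores_abcd(a = 10, b = 90, c = 20, d = 880,
#                     measures = c("exp_a", "PMI", "G"))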

# constructor for an object of the class "cooc_info"
cooc_info <- function(target_freqlist,
                      ref_freqlist,
                      target_n = NA,
                      ref_n = NA) {
  retval <- list(target_freqlist = target_freqlist,
                 ref_freqlist = ref_freqlist,
                 target_n = target_n,
                 ref_n = ref_n)
  if (is.na(retval$target_n)) {
    retval$target_n <- sum(target_freqlist)
  }
  if (is.na(retval$ref_n)) {
    retval$ref_n <- sum(ref_freqlist)
  }
  class(retval) <- "cooc_info"
  retval
}


# function that returns association scores on the basis of
# a target frequency list and a reference frequency list;
# it internally calls assoc_scores_abcd()

# if x is of class cooc_info, then y is ignored;
# otherwise x and y are assumed to be the target frequency list
# and the reference frequency list respectively.

assoc_scores <- function(x, 
                         y = NULL, 
                         min_freq = 3,
                         measures = NULL,
                         with_variants = FALSE,
                         show_dots = FALSE,
                         p_fisher_2 = FALSE,
                         small_pos = 0.00001) {
  # ---------------------------------------------------------------
  # -- min_freq is minimum frequency in target_freqlist
  #    for inclusion in the output
  # ---------------------------------------------------------------
  if (!inherits(x, "cooc_info")) {
    x <- cooc_info(target_freqlist = x, ref_freqlist = y,
                   target_n = sum(x), ref_n = sum(y))
  }
  if (min_freq == 0) {
    union_names <- union(names(x$target_freqlist),
                         names(x$ref_freqlist))
    x$target_freqlist <- x$target_freqlist[union_names]
    x$target_freqlist[is.na(x$target_freqlist)] <- 0
  } else {
    x$target_freqlist <- x$target_freqlist[x$target_freqlist >= min_freq]
  }
  x$ref_freqlist <- x$ref_freqlist[names(x$target_freqlist)]
  x$ref_freqlist[is.na(x$ref_freqlist)] <- 0
  retval <- assoc_scores_abcd(
              a = x$target_freqlist,
              b = x$target_n - x$target_freqlist,
              c = x$ref_freqlist,
              d = x$ref_n - x$ref_freqlist,
              measures = measures,
              with_variants = with_variants,
              show_dots = show_dots,
              p_fisher_2 = p_fisher_2,
              small_pos = small_pos)
  rownames(retval) <- names(x$target_freqlist)
  retval
}
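
# usage sketch for assoc_scores() with two plain named frequency lists
# (made-up counts):
#   target <- c(apple = 10, pear = 7, plum = 3)
#   ref    <- c(apple = 4, pear = 12, kiwi = 6)
#   assoc_scores(target, ref, min_freq = 3)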


zero_plus <- function(x, small_pos = 0.00001) {
# auxiliary function that makes all values in numeric vector x strictly positive
# small_pos stands for 'small positive constant'
  x[x <= 0] <- small_pos
  x
}

p_to_chisq1 <- function(p) {
# returns the 'p right quantile' in the chi-square distribution with one df
  return(qchisq(1 - p, 1))
}

chisq1_to_p <- function(x) {
# returns the proportion of the chi-square distribution with one df
# that sits to the right of x
  1 - pchisq(x, 1)
}
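
# sanity check for the two helpers above: they are inverses of each other
#   chisq1_to_p(p_to_chisq1(0.05))  # 0.05
#   p_to_chisq1(chisq1_to_p(3.84))  # 3.84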

# -- All calc.X() functions below have in principle become
# -- redundant, because their functionality has been subsumed by
# -- assoc_scores_abcd();
# -- they are kept nonetheless, because:
#      (i)  I'm still keeping open the option to (redundantly) also
#           export these functions (next to assoc_scores_abcd()),
#           simply because their source code is easier to grasp
#           for readers of MCLM
#      (ii) they can still be useful for debugging assoc_scores_abcd()
# -- However, for the moment, these functions are not exported.

calc.exp.a <- function(a,b,c,d) {
  a <- zero_plus(a); b <- zero_plus(b); c <- zero_plus(c); d <- zero_plus(d)
  m <- a + b; n <- c + d; k <- a + c; l <- b + d; mn <- m + n
  ea <- (m * k)/mn
  return(ea)
}

calc.min.exp <- function(a,b,c,d) {
  a <- zero_plus(a); b <- zero_plus(b); c <- zero_plus(c); d <- zero_plus(d)
  m <- a + b; n <- c + d; k <- a + c; l <- b + d; mn <- m + n
  ea <- (m * k)/mn; eb <- (m * l)/mn 
  ec <- (n * k)/mn; ed <- (n * l)/mn
  return(apply(cbind(ea,eb,ec,ed), 1, min))
}

calc.G2 <- function(a,b,c,d) {
  a <- zero_plus(a); b <- zero_plus(b); c <- zero_plus(c); d <- zero_plus(d)
  m <- a + b; n <- c + d; k <- a + c; l <- b + d; mn <- m + n
  ea <- (m * k)/mn; eb <- (m * l)/mn 
  ec <- (n * k)/mn; ed <- (n * l)/mn
  return(2 * (a*log(a/ea) + b*log(b/eb) + c*log(c/ec) + d*log(d/ed)))
}

calc.G2.is.pos <- function(a,b,c,d) {
  a <- zero_plus(a); b <- zero_plus(b); c <- zero_plus(c); d <- zero_plus(d)
  m <- a + b; n <- c + d
  return(a/m > c/n)
}

calc.p.G2 <- function(a,b,c,d) {
  g2 <- calc.G2(a,b,c,d)
  return(1 - pchisq(g2, 1))
}

calc.G2.pos <- function(a,b,c,d) {
  a <- zero_plus(a); b <- zero_plus(b); c <- zero_plus(c); d <- zero_plus(d)
  m <- a + b; n <- c + d; k <- a + c; l <- b + d; mn <- m + n
  ea <- (m * k)/mn; eb <- (m * l)/mn 
  ec <- (n * k)/mn; ed <- (n * l)/mn
  result <- (2 * (a*log(a/ea) + b*log(b/eb) + c*log(c/ec) + d*log(d/ed)))
  result[!(a/m > c/n)] <- 0.0
  return(result)
}

calc.p.G2.pos <- function(a,b,c,d) {
  g2 <- calc.G2.pos(a,b,c,d)
  return(1 - pchisq(g2, 1))
}

calc.G2.neg <- function(a,b,c,d) {
  a <- zero_plus(a); b <- zero_plus(b); c <- zero_plus(c); d <- zero_plus(d)
  m <- a + b; n <- c + d; k <- a + c; l <- b + d; mn <- m + n
  ea <- (m * k)/mn; eb <- (m * l)/mn 
  ec <- (n * k)/mn; ed <- (n * l)/mn
  result <- (2 * (a*log(a/ea) + b*log(b/eb) + c*log(c/ec) + d*log(d/ed)))
  result[(a/m > c/n)] <- 0.0
  return(result)
}

calc.p.G2.neg <- function(a,b,c,d) {
  g2 <- calc.G2.neg(a,b,c,d)
  return(1 - pchisq(g2, 1))
}

calc.G2.signed <- function(a,b,c,d) {
  a <- zero_plus(a); b <- zero_plus(b); c <- zero_plus(c); d <- zero_plus(d)
  m <- a + b; n <- c + d; k <- a + c; l <- b + d; mn <- m + n
  ea <- (m * k)/mn; eb <- (m * l)/mn 
  ec <- (n * k)/mn; ed <- (n * l)/mn
  result <- (2 * (a*log(a/ea) + b*log(b/eb) + c*log(c/ec) + d*log(d/ed)))
  result[!(a/m > c/n)] <- - result[!(a/m > c/n)]
  return(result)
}

calc.Chi2 <- function(a,b,c,d) {
  a <- zero_plus(a); b <- zero_plus(b); c <- zero_plus(c); d <- zero_plus(d)
  m <- a + b; n <- c + d; k <- a + c; l <- b + d; mn <- m + n
  ea <- (m * k)/mn; eb <- (m * l)/mn 
  ec <- (n * k)/mn; ed <- (n * l)/mn
  return((a - ea)^2/ea + (b - eb)^2/eb + (c - ec)^2/ec + (d - ed)^2/ed)
}

calc.Chi2.signed <- function(a,b,c,d) {
  a <- zero_plus(a); b <- zero_plus(b); c <- zero_plus(c); d <- zero_plus(d)
  m <- a + b; n <- c + d; k <- a + c; l <- b + d; mn <- m + n
  ea <- (m * k)/mn; eb <- (m * l)/mn 
  ec <- (n * k)/mn; ed <- (n * l)/mn
  result <- ((a - ea)^2/ea + (b - eb)^2/eb + (c - ec)^2/ec + (d - ed)^2/ed)
  result[!(a/m > c/n)] <- - result[!(a/m > c/n)]
  return(result)  
}
calc.p.Chi2 <- function(a,b,c,d) {
  chi2 <- calc.Chi2(a,b,c,d)
  return(1 - pchisq(chi2, 1))
}

calc.PMI <- function(a,b,c,d) {
  a <- zero_plus(a); b <- zero_plus(b); c <- zero_plus(c); d <- zero_plus(d)
  m <- a + b; n <- c + d; k <- a + c; l <- b + d; mn <- m + n
  return(log2( (a/mn) / ( (k/mn) * (m/mn) ) ))
}

calc.MS <- function(a,b,c,d) {
  a <- zero_plus(a); b <- zero_plus(b); c <- zero_plus(c); d <- zero_plus(d)
  m <- a + b; n <- c + d; k <- a + c; l <- b + d; mn <- m + n
  return(pmin(a/m, a/k)) # pmin() keeps the function vectorized
}

calc.t <- function(a,b,c,d) {
  a <- zero_plus(a); b <- zero_plus(b); c <- zero_plus(c); d <- zero_plus(d)
  m <- a + b; n <- c + d; k <- a + c; l <- b + d; mn <- m + n
  t <- ( ( a/mn - k/mn * m/mn )   /
           sqrt( ( (a/mn) * (1 - a/mn) ) / mn ) )
  return(t)
}

calc.p.t <- function(a,b,c,d) { # 1-sided
  a <- zero_plus(a); b <- zero_plus(b); c <- zero_plus(c); d <- zero_plus(d)
  m <- a + b; n <- c + d; k <- a + c; l <- b + d; mn <- m + n
  t <- ( ( a/mn - k/mn * m/mn )   /
           sqrt( ( (a/mn) * (1 - a/mn) ) / mn ) )
  return(1 - pt(t, mn-1))
}

calc.p.t.two.sided <- function(a,b,c,d) {
  a <- zero_plus(a); b <- zero_plus(b); c <- zero_plus(c); d <- zero_plus(d)
  m <- a + b; n <- c + d; k <- a + c; l <- b + d; mn <- m + n
  t <- ( ( a/mn - k/mn * m/mn )   /
           sqrt( ( (a/mn) * (1 - a/mn) ) / mn ) )
  # take the tail that matches the sign of t (ifelse keeps this vectorized)
  p <- ifelse(t < 0, pt(t, mn-1), 1 - pt(t, mn-1))
  p <- 2*p             # two-sided
  return(p)
}


calc.DP <- function(a,b,c,d) {
  a <- zero_plus(a); b <- zero_plus(b); c <- zero_plus(c); d <- zero_plus(d)
  m <- a + b; n <- c + d
  return((a/m) - (c/n))
}

calc.RR <- function(a,b,c,d) {
  a <- zero_plus(a); b <- zero_plus(b); c <- zero_plus(c); d <- zero_plus(d)
  m <- a + b; n <- c + d
  return((a/m) / (c/n))
}

calc.OR <- function(a,b,c,d) {
  a <- zero_plus(a); b <- zero_plus(b); c <- zero_plus(c); d <- zero_plus(d)
  return((a/b) / (c/d))
}

calc.log.OR <- function(a,b,c,d) {
  a <- zero_plus(a); b <- zero_plus(b); c <- zero_plus(c); d <- zero_plus(d)
  return(log((a/b) / (c/d)))
}

calc.DICE <- function(a,b,c,d) {
  a <- zero_plus(a); b <- zero_plus(b); c <- zero_plus(c); d <- zero_plus(d)
  m <- a + b; k <- a + c
  return( (2*a)/(m+k) )
}

calc.p.fisher <- function(a,b,c,d) {
  # the use of zero_plus() doesn't seem to hinder, so we keep it
  # for the sake of conceptual consistency across the
  # calc.X() functions
  a <- zero_plus(a); b <- zero_plus(b); c <- zero_plus(c); d <- zero_plus(d)
  m <- a + b; n <- c + d; k <- a + c
  return (1 - phyper(a-1,m,n,k))
}

calc.p.fisher.two.sided <- function(a,b,c,d) {
  # no zero_plus() here, to avoid warning about non-integer values
  result <- vector()
  for (i in 1:length(a)) {
    m <- matrix(c(a[i],b[i],c[i],d[i]), nrow=2, byrow=TRUE)
    result[i] <- fisher.test(m)$p.value
  }
  return(result)
}


# ------------------------------------------------------------------------------
# CORRESPONDENCE ANALYSIS
# ------------------------------------------------------------------------------

ca.row.profiles <- function(tdat) {
# ------------------------------------------------------------------------------
# Calculates the row profiles matrix R for the two way contingency table tdat
# ------------------------------------------------------------------------------
  return(prop.table(as.matrix(tdat), 1))
}

ca.col.profiles <- function(tdat) {
# ------------------------------------------------------------------------------
# Calculates the col profiles matrix C for the two way contingency table tdat
# ------------------------------------------------------------------------------
  return(prop.table(as.matrix(tdat), 2))
}

ca.corresp.matrix <- function(tdat) {
# ------------------------------------------------------------------------------
# Calculates the correspondence matrix P for the two way contingency table tdat
# ------------------------------------------------------------------------------
  return(prop.table(as.matrix(tdat)))
}
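
# usage sketch for the three helpers above, with a toy table:
#   tdat <- matrix(c(10, 20, 30, 40), nrow = 2,
#                  dimnames = list(c("r1", "r2"), c("c1", "c2")))
#   ca.row.profiles(tdat)    # each row sums to 1
#   ca.col.profiles(tdat)    # each column sums to 1
#   ca.corresp.matrix(tdat)  # all cells jointly sum to 1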

ca.row.centroid <- function(tdat) {
# ------------------------------------------------------------------------------
# Calculates the row centroid for the two way contingency table tdat
# ------------------------------------------------------------------------------
  tdat <- as.matrix(tdat)
  data.P    <- tdat / sum(tdat)          # correspondence matrix
  data.c    <- apply(data.P, 2, sum)     # col masses (=average row profile)
  return(data.c)  
}

ca.col.centroid <- function(tdat) {
# ------------------------------------------------------------------------------
# Calculates the column centroid for the two way contingency table tdat
# ------------------------------------------------------------------------------
  tdat <- as.matrix(tdat)
  data.P    <- tdat / sum(tdat)          # correspondence matrix
  data.r    <- apply(data.P, 1, sum)     # row masses (=average col profile)
  return(data.r)
}

ca.plot.profiles <- function(tdat, side=TRUE, vertical=TRUE) {
# ------------------------------------------------------------------------------
# Plots row and columns profiles for the two way contingency table tdat
# ------------------------------------------------------------------------------
# side=TRUE means plotting the two plots side by side
# side=FALSE means plotting one above the other
# ------------------------------------------------------------------------------
  tdat <- as.matrix(tdat)
  dir <- c("v","h"); if (!vertical) { dir <- c("h","v") }
  if (side) { par(mfrow=c(1,2)) } else { par(mfrow=c(2,1)) }
  graphics::mosaicplot(tdat, col=TRUE, main="row profiles", dir=dir)
  graphics::mosaicplot(t(tdat), col=TRUE, main="column profiles", dir=dir)
  par(mfrow=c(1,1))
}

ca.row.masses <- function(tdat) {
# ------------------------------------------------------------------------------
# Calculates masses for the rows in the two way contingency table tdat
# ------------------------------------------------------------------------------
  tdat <- as.matrix(tdat)
  data.P    <- tdat / sum(tdat)          # correspondence matrix
  data.r    <- apply(data.P, 1, sum)     # row masses
  return(data.r)
}

ca.col.masses <- function(tdat) {
# ------------------------------------------------------------------------------
# Calculates masses for the columns in the two way contingency table tdat
# ------------------------------------------------------------------------------
  tdat <- as.matrix(tdat)
  data.P    <- tdat / sum(tdat)          # correspondence matrix
  data.c    <- apply(data.P, 2, sum)     # column masses
  return(data.c)
}

h.calc.chisq.dist <- function(x1, x2, m) {
# ------------------------------------------------------------------------------
# [auxiliary function used by row.chisq.dists() and col.chisq.dists()]
# calculates the chi-square distance between two profiles x1 and x2, using m
# as the masses by means of which to weight the components in x1 and x2 
# ------------------------------------------------------------------------------
# if tdat is a matrix with absolute frequencies, then row distances between 
# the profiles of row 1 and row 2 are computed as follows:
#   data.P   <- tdat / sum(tdat)           # correspondence matrix  
#   row.prof <- prop.table(data.P, 1)      # row profiles
#   data.c   <- apply(data.P, 2, sum)      # col masses 
#   h.calc.chisq.dist(row.prof[1,], row.prof[2,], data.c)
# ------------------------------------------------------------------------------
# if tdat is a matrix with absolute frequencies, then column distances between 
# the profiles of column 1 and column 2 are computed as follows:
#   data.P   <- tdat / sum(tdat)           # correspondence matrix  
#   col.prof <- prop.table(data.P, 2)      # column profiles
#   data.r   <- apply(data.P, 1, sum)      # row masses 
#   h.calc.chisq.dist(col.prof[,1], col.prof[,2], data.r)
# ------------------------------------------------------------------------------
  result <- 0
  for (i in 1:length(x1)) {
    result <- result + (1/m[i] * (x1[i] - x2[i])^2)
  }
  return(sqrt(result))
}

row.chisq.dists <- function(tdat) {
# ------------------------------------------------------------------------------
# Calculates all chi-square row distances between the rows in the 
# two way contingency table tdat
# ------------------------------------------------------------------------------
  tdat <- as.matrix(tdat)
  data.P    <- tdat / sum(tdat)          # correspondence matrix
  row.prof  <- prop.table(data.P, 1)     # row profiles
  data.c    <- apply(data.P, 2, sum)     # col masses 
  data.r    <- apply(data.P, 1, sum)     # row masses
  m <- outer(data.r, data.r) * 0         # initialize function output
  for (i in 1:nrow(data.P)) {
    for (j in 1:nrow(data.P)) {
      m[i,j] <- h.calc.chisq.dist(row.prof[i,], row.prof[j,], data.c)
    }
  }
  return(m)
}

col.chisq.dists <- function(tdat) {
# ------------------------------------------------------------------------------
# Calculates all chi-square column distances between the columns in the 
# two way contingency table tdat
# ------------------------------------------------------------------------------
  tdat <- as.matrix(tdat)
  data.P    <- tdat / sum(tdat)          # correspondence matrix
  col.prof <- prop.table(data.P, 2)      # col profiles  
  data.c    <- apply(data.P, 2, sum)     # col masses 
  data.r    <- apply(data.P, 1, sum)     # row masses
  m <- outer(data.c, data.c) * 0         # initialize function output 
  for (i in 1:ncol(data.P)) {
    for (j in 1:ncol(data.P)) {
      m[i,j] <- h.calc.chisq.dist(col.prof[,i], col.prof[,j], data.r)
    }
  }
  return(m)
}

inertia.contrib <- function(tdat) {
# ------------------------------------------------------------------------------
# Calculates the contributions in all cells of the two way contingency table
# tdat to the overall inertia.
# ------------------------------------------------------------------------------
# This overall inertia is equal to
#      chisq.test(tdat, correct=F)$statistic / sum(tdat)
# and the contributions to the overall inertia are equal to
#      (chisq.test(tdat, correct=F)$residuals^2)/sum(tdat)
# ------------------------------------------------------------------------------
  tdat <- as.matrix(tdat)
  data.P    <- tdat / sum(tdat)          # correspondence matrix
  data.r    <- apply(data.P, 1, sum)     # row masses (=average col profile)
  data.c    <- apply(data.P, 2, sum)     # col masses (=average row profile)
  exp.mass  <- outer(data.r, data.c)     # expected masses (given independence)
  result    <- outer(data.r, data.c) * 0 # initialize function output
  for (i in 1:nrow(tdat)) {
    for (j in 1:ncol(tdat)) {
       result[i,j] <- 
         ((data.P[i,j] - exp.mass[i,j])^2 / exp.mass[i,j])
    }
  }
  return(result)
}
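
# ------------------------------------------------------------------------------
# [illustration] A minimal check of the equalities quoted in the header of
# inertia.contrib(); the toy table and the name demo.inertia.contrib() are
# hypothetical.
# ------------------------------------------------------------------------------
demo.inertia.contrib <- function() {
  tdat <- matrix(c(30, 10, 20, 40, 10, 50), nrow = 2)
  ref  <- (chisq.test(tdat, correct = FALSE)$residuals^2) / sum(tdat)
  all.equal(as.vector(inertia.contrib(tdat)), as.vector(ref))  # TRUE
}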

row.inertias <- function(tdat) {
# ------------------------------------------------------------------------------
# Calculates all row inertias of the two-way contingency table tdat
# as m times the square of the row distance to the centroid of the rows 
# (i.e. to the average row profile), with m the mass of the row.
# ------------------------------------------------------------------------------
  tdat <- as.matrix(tdat)
  data.P    <- tdat / sum(tdat)          # correspondence matrix
  row.prof  <- prop.table(data.P, 1)     # row profiles  
  data.r    <- apply(data.P, 1, sum)     # row masses (=average col profile)
  data.c    <- apply(data.P, 2, sum)     # col masses (=average row profile)
  result    <- data.r * 0                # initialize the function output
  for (i in 1:nrow(tdat)) {
    result[i] <- 0
    for (j in 1:ncol(tdat)) {
      result[i] <- result[i] + ((row.prof[i,j] - data.c[j])^2 / data.c[j])
    }
    result[i] <- result[i] * data.r[i]   # multiply by row mass
  }
  return(result)
}

col.inertias <- function(tdat) {
# ------------------------------------------------------------------------------
# Calculates all column inertias of the two-way contingency table tdat
# as m times the square of the column distance to the centroid of the columns 
# (i.e. to the average column profile), with m the mass of the column.
# ------------------------------------------------------------------------------
  tdat <- as.matrix(tdat)
  data.P    <- tdat / sum(tdat)          # correspondence matrix
  col.prof  <- prop.table(data.P, 2)     # column profiles  
  data.r    <- apply(data.P, 1, sum)     # row masses (=average col profile)
  data.c    <- apply(data.P, 2, sum)     # col masses (=average row profile)
  result    <- data.c * 0                # initialize the function output
  for (j in 1:ncol(tdat)) {
    result[j] <- 0
    for (i in 1:nrow(tdat)) {
      result[j] <- result[j] + ((col.prof[i,j] - data.r[i])^2 / data.r[i])
    }
    result[j] <- result[j] * data.c[j]   # multiply by column mass
  }
  return(result)
}
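
# ------------------------------------------------------------------------------
# [illustration] The row inertias and the column inertias each sum to the
# overall inertia (the sum of all cell contributions); a minimal check with
# a hypothetical toy table and the hypothetical name demo.inertias().
# ------------------------------------------------------------------------------
demo.inertias <- function() {
  tdat <- matrix(c(30, 10, 20, 40, 10, 50), nrow = 2)
  c(rows  = sum(row.inertias(tdat)),
    cols  = sum(col.inertias(tdat)),
    total = sum(inertia.contrib(tdat)))  # all three values coincide
}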


row.inertias2 <- function(tdat) {
# ------------------------------------------------------------------------------
# Alternative implementation of row.inertias(): calculates all row inertias
# of the two-way contingency table tdat as m times the squared chi-square
# distance from the row profile to the centroid of the rows (i.e. to the
# average row profile), with m the mass of the row.
# ------------------------------------------------------------------------------
  tdat <- as.matrix(tdat)
  data.P    <- tdat / sum(tdat)          # correspondence matrix
  row.prof  <- prop.table(data.P, 1)     # row profiles  
  data.r    <- apply(data.P, 1, sum)     # row masses (=average col profile)
  data.c    <- apply(data.P, 2, sum)     # col masses (=average row profile)
  result    <- data.r * 0                # initialize the function output
  for (i in 1:nrow(tdat)) {
    row.dist.to.centroid <- h.calc.chisq.dist(row.prof[i,], data.c, data.c) 
    row.mass <- data.r[i]
    result[i] <- row.dist.to.centroid^2 * row.mass
  }
  return(result)
}

col.inertias2 <- function(tdat) {
# ------------------------------------------------------------------------------
# Alternative implementation of col.inertias(): calculates all column
# inertias of the two-way contingency table tdat as m times the squared
# chi-square distance from the column profile to the centroid of the columns
# (i.e. to the average column profile), with m the mass of the column.
# ------------------------------------------------------------------------------
  tdat <- as.matrix(tdat)
  data.P    <- tdat / sum(tdat)          # correspondence matrix
  col.prof  <- prop.table(data.P, 2)     # column profiles  
  data.r    <- apply(data.P, 1, sum)     # row masses (=average col profile)
  data.c    <- apply(data.P, 2, sum)     # col masses (=average row profile)
  result    <- data.c * 0                # initialize the function output
  for (j in 1:ncol(tdat)) {
    col.dist.to.centroid <- h.calc.chisq.dist(col.prof[,j], data.r, data.r) 
    col.mass <- data.c[j]
    result[j] <- col.dist.to.centroid^2 * col.mass
  }
  return(result)
}
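
# ------------------------------------------------------------------------------
# [illustration] row.inertias2() and col.inertias2() compute the same
# quantities as row.inertias() and col.inertias(); a minimal check with a
# hypothetical toy table and the hypothetical name demo.inertias2().
# ------------------------------------------------------------------------------
demo.inertias2 <- function() {
  tdat <- matrix(c(30, 10, 20, 40, 10, 50), nrow = 2)
  c(all.equal(row.inertias(tdat), row.inertias2(tdat)),   # TRUE
    all.equal(col.inertias(tdat), col.inertias2(tdat)))   # TRUE
}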

row.dists.centroid <- function(tdat) {
# ------------------------------------------------------------------------------
# Calculates for each row of the two-way contingency table tdat the
# chi-square distance between the row profile and the centroid of the rows
# (i.e. the average row profile).
# ------------------------------------------------------------------------------
  tdat <- as.matrix(tdat)
  data.P    <- tdat / sum(tdat)          # correspondence matrix
  row.prof  <- prop.table(data.P, 1)     # row profiles  
  data.r    <- apply(data.P, 1, sum)     # row masses (=average col profile)
  data.c    <- apply(data.P, 2, sum)     # col masses (=average row profile)
  result    <- data.r * 0                # initialize the function output
  for (i in 1:nrow(tdat)) {
    result[i] <- h.calc.chisq.dist(row.prof[i,], data.c, data.c) 
  }
  return(result)
}

col.dists.centroid <- function(tdat) {
# ------------------------------------------------------------------------------
# Calculates for each column of the two-way contingency table tdat the
# chi-square distance between the column profile and the centroid of the
# columns (i.e. the average column profile).
# ------------------------------------------------------------------------------
  tdat <- as.matrix(tdat)
  data.P    <- tdat / sum(tdat)          # correspondence matrix
  col.prof  <- prop.table(data.P, 2)     # column profiles  
  data.r    <- apply(data.P, 1, sum)     # row masses (=average col profile)
  data.c    <- apply(data.P, 2, sum)     # col masses (=average row profile)
  result    <- data.c * 0                # initialize the function output
  for (j in 1:ncol(tdat)) {
    result[j] <- h.calc.chisq.dist(col.prof[,j], data.r, data.r) 
  }
  return(result)
}
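
# ------------------------------------------------------------------------------
# [illustration] The inertia of a row equals its mass times its squared
# chi-square distance to the centroid; a minimal check with a hypothetical
# toy table and the hypothetical name demo.dists.centroid().
# ------------------------------------------------------------------------------
demo.dists.centroid <- function() {
  tdat   <- matrix(c(30, 10, 20, 40, 10, 50), nrow = 2)
  data.r <- rowSums(tdat) / sum(tdat)             # row masses
  all.equal(row.inertias(tdat),
            row.dists.centroid(tdat)^2 * data.r)  # TRUE
}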

fitted.row.dists <- function(tdat.ca) {
# ------------------------------------------------------------------------------
# Calculates all Euclidean row distances in the first two dimensions of the
# ca solution tdat.ca using the principal row coordinates
# ------------------------------------------------------------------------------
  princ.coord <- tdat.ca$rowcoord %*% diag(tdat.ca$sv)
  x           <- rep(0, length(tdat.ca$rownames))
  names(x)    <- tdat.ca$rownames
  result      <- outer(x, x)                        # initialize function output
  for (i in 1:nrow(princ.coord)) {
    for (j in 1:nrow(princ.coord)) {
      y <- 0
      for (k in 1:2) {
        y <- y + (princ.coord[i,k] - princ.coord[j,k])^2
      }
      result[i,j] <- sqrt(y)
    }
  }
  return(result)
}

fitted.col.dists <- function(tdat.ca) {
# ------------------------------------------------------------------------------
# Calculates all Euclidean column distances in the first two dimensions of the
# ca solution tdat.ca using the principal column coordinates
# ------------------------------------------------------------------------------
  princ.coord <- tdat.ca$colcoord %*% diag(tdat.ca$sv)
  x           <- rep(0, length(tdat.ca$colnames))
  names(x)    <- tdat.ca$colnames
  result      <- outer(x, x)                        # initialize function output
  for (i in 1:nrow(princ.coord)) {
    for (j in 1:nrow(princ.coord)) {
      y <- 0
      for (k in 1:2) {
        y <- y + (princ.coord[i,k] - princ.coord[j,k])^2
      }
      result[i,j] <- sqrt(y)
    }
  }
  return(result)
}
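
# ------------------------------------------------------------------------------
# [illustration] fitted.row.dists() expects a ca solution with $rowcoord,
# $sv and $rownames components, which matches the output of ca::ca() from
# the 'ca' package (an assumption, and the package must be installed).
# For a 3 x 4 table the solution has only two dimensions, so the fitted
# distances reproduce the chi-square row distances exactly. The toy table
# and the name demo.fitted.dists() are hypothetical.
# ------------------------------------------------------------------------------
demo.fitted.dists <- function() {
  tdat <- matrix(c(30, 10, 5, 20, 40, 15, 10, 50, 25, 5, 10, 30),
                 nrow = 3,
                 dimnames = list(paste0("r", 1:3), paste0("c", 1:4)))
  tdat.ca <- ca::ca(tdat)
  all.equal(unname(fitted.row.dists(tdat.ca)),
            unname(row.chisq.dists(tdat)))  # TRUE
}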

# -----------------------------------------------------------------------------
# "ca" related methods
# -----------------------------------------------------------------------------

row_pcoord <- function(x, ...) {
# ------------------------------------------------------------------------------
# Retrieves row principal coordinates for all dimensions of the
# ca solution object
# ------------------------------------------------------------------------------
  x$rowcoord %*% diag(x$sv)
}

col_pcoord <- function(x, ...) {
# ------------------------------------------------------------------------------
# Retrieves column principal coordinates for all dimensions of the
# ca solution object
# ------------------------------------------------------------------------------
  x$colcoord %*% diag(x$sv)
}

xlim4ca <- function(x, ...) {
# ------------------------------------------------------------------------------
# Assumes x is the output of a ca analysis.
# Returns the xlim value that should be used to reproduce a plot.ca(...)
# plot with the generic plot(...)
# ------------------------------------------------------------------------------
  r_pc <- row_pcoord(x, ...)
  c_pc <- col_pcoord(x, ...)
  range(r_pc[,1], c_pc[,1])
}

ylim4ca <- function(x, ...) {
# ------------------------------------------------------------------------------
# Assumes x is the output of a ca analysis.
# Returns the ylim value that should be used to reproduce a plot.ca(...)
# plot with the generic plot(...): the y range keeps the data's midpoint
# but is symmetrically widened (or narrowed) so that its span equals the
# span of xlim4ca(x).
# ------------------------------------------------------------------------------
  r_pc <- row_pcoord(x, ...)
  c_pc <- col_pcoord(x, ...)
  xlim <- range(r_pc[,1], c_pc[,1])
  ylim <- range(r_pc[,2], c_pc[,2])
  xr <- xlim[2] - xlim[1]
  yr <- ylim[2] - ylim[1]
  r_diff <- xr - yr
  c(ylim[1] - r_diff/2, ylim[2] + r_diff/2) 
}
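
# ------------------------------------------------------------------------------
# [illustration] A minimal sketch of reproducing a plot.ca(...)-style map
# with the generic plot(...), using row_pcoord(), col_pcoord(), xlim4ca()
# and ylim4ca(); requires the 'ca' package, and the toy table, plotting
# choices and the name demo_plot4ca() are hypothetical.
# ------------------------------------------------------------------------------
demo_plot4ca <- function() {
  tdat <- matrix(c(30, 10, 5, 20, 40, 15, 10, 50, 25, 5, 10, 30),
                 nrow = 3,
                 dimnames = list(paste0("r", 1:3), paste0("c", 1:4)))
  x    <- ca::ca(tdat)
  r_pc <- row_pcoord(x)
  c_pc <- col_pcoord(x)
  plot(r_pc[, 1], r_pc[, 2], pch = 16, col = "blue",
       xlim = xlim4ca(x), ylim = ylim4ca(x),
       xlab = "Dimension 1", ylab = "Dimension 2")
  points(c_pc[, 1], c_pc[, 2], pch = 17, col = "red")
  text(r_pc[, 1], r_pc[, 2], x$rownames, pos = 3)
  text(c_pc[, 1], c_pc[, 2], x$colnames, pos = 3)
}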

# -----------------------------------------------------------------------------
# distance measures
# -----------------------------------------------------------------------------

cosine_dist <- function(x) {
# ------------------------------------------------------------------------------
# Calculates all pairwise cosine distances (i.e. 1 - cosine similarity)
# between the rows of the matrix x
# ------------------------------------------------------------------------------
  1 - (x %*% t(x)) / sqrt(rowSums(x^2) %*% t(rowSums(x^2)))
}
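
# ------------------------------------------------------------------------------
# [illustration] A minimal sketch of cosine_dist(); the toy matrix and the
# name demo_cosine_dist() are hypothetical.
# ------------------------------------------------------------------------------
demo_cosine_dist <- function() {
  m <- matrix(c(1, 0, 1,
                2, 0, 2,
                0, 3, 1), nrow = 3, byrow = TRUE)
  cosine_dist(m)  # zero on the diagonal and for the parallel rows 1 and 2
}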