# R_temp/NEWduplicated.nex.R

#' Find duplicate or overlapping characters in nexus files
#' 
#' Uses fuzzy text matching, shared terms, comments referencing other datasets,
#' or trait correlations to identify potentially redundant characters in a
#' `nex` object, and returns a new `nex` object with the candidate duplicate
#' pairs and clusters attached for downstream filtering and merging
#' 
#' @param x (required) a nexus data object
#' @param opt method to use for finding duplicates ("fuzzy", "terms", "comments", or "traitcor")
#' @param method method-specific option (e.g., string distance measure for `opt = "fuzzy"`; "polycor" or "hamming" for `opt = "traitcor"`)
#' @param within whether to restrict the search to characters within the same "partition" or "dataset" ("none" compares all characters)
#' @param commasep whether to split character names at the first comma and compare the parts separately
#' @param weighting weighting vector for parts of a character (before comma, after comma, states)
#' @param K number of clusters to cut the term dendrogram into (K = 1 uses network community detection instead)
#' @param cluster igraph community detection algorithm to use
#' @param cores how many cores to use (for parallel processing in the traitcor option)
#' @param ... further arguments (currently unused)
#' 
#' @return an object of class \code{nex} for use in further \code{nexustools} functions
#' 
#' @examples \dontrun{
#' x <- read.nex(file='example/toy1.nex')
#' duplicated.nex(x, opt = 'fuzzy', method = 'jw')
#' duplicated.nex(x, opt = 'terms')
#' duplicated.nex(x, opt = 'traitcor', method = 'hamming', cores = 2)
#' }
#' 
#' @author Chad Eliason \email{celiason@@fieldmuseum.org}
#'
#' TODO write a lda.nex() function? separate the dup finding and dropping? maybe filter.nex()?
#' TODO add option to "nest" state labels in the network graph (so things like "size of distal end" wouldn't be matched across all characters, only those with state label as, say, "Humerus...")
#' TODO be able to plot "subclusters" (looking at all connections among characters, keeping only terms in common between the two)
#'
duplicated.nex <- function(x, opt=c("fuzzy", "terms", "comments", "traitcor"),
  method=NULL, within=c("none","partition","dataset"), commasep=FALSE, weighting=c(1,1,1),
  K=1, cluster = c("infomap", "fast_greedy", "walktrap", "label_prop", "leading_eigen",
    "louvain", "optimal", "spinglass"), cores=1, ...) {

  require(stringdist)
  require(stringr)
  require(tm)
  require(MASS)
  require(igraph)
  require(textmineR)
  require(parallel)
  require(polycor)
  require(psych)
  require(pbapply)
  require(pbmcapply)

  cutoff <- Inf  # I previously had this as an argument, but I think it's better in the printout function (that way all possible dups will be output along with their string distances for later subsetting)

  # setup
  opt <- match.arg(opt)
  within <- match.arg(within)
  cluster <- match.arg(cluster)
  # resolve the chosen igraph community detection function (e.g., cluster_infomap)
  clustfun <- get(paste0("cluster_", cluster))

  # extract information from nexus file
  oldcharnames <- x$charlabels
  oldstatenames <- x$statelabels

  ids <- 1:ncol(x$data)
  
  # restrict comparisons to characters within the same dataset (file):
  if (within == "dataset") {
    splits <- split(ids, x$file)
  }
  
  # option to look within character partitions
  if (within == "partition" & !is.null(x$charpartition) & length(unique(x$charpartition)) > 1) {
    splits <- split(ids, x$charpartition)
  }

  if (within == "none") {
    splits <- list(1:ncol(x$data))
  }

  # Calculate text distances based on overlapping terms
  # Two options here: igraph/network modularity (K = 1) or Ward clustering (K > 1)
  # Term weighting options: 'weightTf', 'weightTfIdf', 'weightBin', 'weightSMART'
  if (opt == "terms") {
    termlist <- cleantext(paste(oldcharnames, oldstatenames))
    # make document matrix, sort by term frequency
    if (K == 1) {
      # do for each chunk
      M <- lapply(seq_along(splits), function(i) {
        subterms <- termlist[splits[[i]]]
        dtm <- TermDocumentMatrix(subterms, control = list(wordLengths = c(2, Inf), weighting = function(x) weightTfIdf(x, normalize = TRUE), stemming=FALSE))
        m <- as.matrix(dtm)
        colnames(m) <- splits[[i]]
        g <- graph_from_incidence_matrix(m, weighted=TRUE)
        cl <- clustfun(g, E(g)$weight)
        grps <- communities(cl)  # get clusters
        list(g=g, cl=cl, grps=grps)
      })

      dtm <- TermDocumentMatrix(termlist, control = list(wordLengths = c(2, Inf), weighting = function(x) weightTfIdf(x, normalize = TRUE), stemming=FALSE))
      m <- as.matrix(dtm)
      g <- graph_from_incidence_matrix(m, weighted=TRUE)
      cl <- clustfun(g, E(g)$weight)
      grps <- communities(cl)  # get clusters
    }
    if (K > 1) {
      dtm <- TermDocumentMatrix(termlist, control = list(wordLengths = c(2, Inf), stemming=FALSE))
      m <- as.matrix(dtm)
      tf_mat <- TermDocFreq(t(m))
      # TF-IDF and cosine similarity
      tfidf <- t(t(m)[ , tf_mat$term ]) * tf_mat$idf
      tfidf <- t(tfidf)
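      # L2-normalize each character's TF-IDF vector so the cross-product below gives
      # a character-by-character cosine similarity matrix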
      csim <- tfidf / sqrt(rowSums(tfidf * tfidf))
      csim <- csim %*% t(csim)
      cdist <- as.dist(1 - csim)
      hc <- hclust(cdist, "ward.D")
      clust <- cutree(hc, K)
      grps <- split(names(clust), clust)
    }

    # make all possible combinations within groups/clusters
    # note: results of this per-split pass are only plotted, not returned
    lapply(seq_along(splits), function(i) {
      g <- M[[i]]$g
      verts <- which(grepl("\\d+", V(g)$name))
      sdist <- as.dist(distances(g, v=verts, to=verts, weights=E(g)$weight))
      chars <- as.character(splits[[i]])
      if (length(chars) < 2) {
        return(NA)
      }
      dups <- as.data.frame(t(combn(chars, m=2)))
      dups$stringdist <- as.numeric(sdist)
      dups <- dups[dups[,3]!=Inf, ]
      plot(g, edge.width=E(g)$weight*20)
      dups
    })

    verts <- which(grepl("\\d+", V(g)$name))
    sdist <- as.dist(distances(g, v=verts, to=verts, weights=E(g)$weight))
    dups <- t(combn(1:ncol(x$data), m=2))
    dups <- cbind(dups, stringdist=as.numeric(sdist))
    dups <- dups[dups[,3]!=Inf, ]
    # old way:
    # dups <- lapply(lapply(grps, str_extract, "\\d+"), na.omit)
    # dups <- lapply(dups, as.numeric)
    # keep <- sapply(dups, length) > 1  # only keep clusters with >1 character
    # dups <- dups[keep]
    # grps <- grps[keep]
    # dups <- t(do.call(cbind, lapply(dups, combn, m=2)))
    # dups <- cbind(dups, NA)
    # colnames(dups) <- c('char1','char2','stringdist')
    # dmat <- distances(g, v=verts, to=verts)
    # dups[,3] <- sapply(1:nrow(dups), function(i) {
    #   dmat[as.character(dups[i,1]), as.character(dups[i,2])]
    # })
  }

  # Calculate text distances using fuzzy string distance
  if (opt == "fuzzy") {
    if (is.null(method)) {
      method <- "jw"
    }
    if (commasep) {
      matches <- str_match(oldcharnames, "^((.*?),)?(.*)")
      part1 <- matches[, 3]
      part2 <- matches[, 4]
      id <- is.na(part1)
      part1[id] <- part2[id]
      part2[id] <- NA
      part3 <- oldstatenames
    } else {
      part1 <- oldcharnames
      part2 <- NA
      part3 <- oldstatenames
    }
    # Clean up character names
    part1 <- sapply(cleantext(part1), as.character)
    part2 <- sapply(cleantext(part2), as.character)
    part3 <- sapply(cleantext(part3), as.character)
    # individual text distance matrix
    sdist <- lapply(seq_along(splits), function(z) {
      nms <- splits[[z]]
      # compare like with like: names before the comma, names after the comma, and state labels
      sd1 <- as.matrix(stringdistmatrix(part1[splits[[z]]], part1[splits[[z]]], method=method))
      sd2 <- as.matrix(stringdistmatrix(part2[splits[[z]]], part2[splits[[z]]], method=method))
      sd3 <- as.matrix(stringdistmatrix(part3[splits[[z]]], part3[splits[[z]]], method=method))
      dimnames(sd1) <- dimnames(sd2) <- dimnames(sd3) <- list(nms, nms)
      list(sd1, sd2, sd3)
    })
    # calculate weighted string distances
    sdist_final <- lapply(seq_along(sdist), function(z) {
      d1 <- weighting[1] * sdist[[z]][[1]]
      d2 <- weighting[2] * sdist[[z]][[2]]
      d3 <- weighting[3] * sdist[[z]][[3]]
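      # element-wise weighted sum of the three part-wise distance matrices (NAs ignored), scaled by 1/3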
      res <- matrix(mapply(sum, d1, d2, d3, MoreArgs=list(na.rm=TRUE)), ncol=ncol(d1)) / 3
      diag(res) <- NA
      res[upper.tri(res)] <- NA
      dimnames(res) <- dimnames(d1)
      res
    })
    # setup duplicates matrix
    dups <- lapply(seq_along(sdist_final), function(z) {
      sset <- which(sdist_final[[z]] < cutoff, arr.ind=TRUE)
      data.frame(char1 = rownames(sdist_final[[z]])[sset[, 1]],
                 char2 = colnames(sdist_final[[z]])[sset[, 2]],
                 stringdist = sdist_final[[z]][which(sdist_final[[z]] < cutoff)])
    })
    dups <- do.call(rbind, dups)
    dups <- dups[order(dups$stringdist), ]
    dups$char1 <- as.numeric(as.character(dups$char1))
    dups$char2 <- as.numeric(as.character(dups$char2))
    dups <- subset(dups, stringdist < cutoff)
  }
  
  # Method to look at comments/reference to other datasets
  if (opt == "comments") {
    chars <- paste(oldcharnames, oldstatenames)
    # Search for author-year references (e.g., Eliason et al. 2015 character 13)
    m <- str_match_all(chars, "([A-Z][a-z]+\\s(?:et\\sal\\.\\s|(?:\\&|and)\\s[A-Z][a-z]+\\s)?)[\\(]?(\\d{4})[\\)]?.{1,5}[Cc]har(\\.|acter)?\\s(\\d+)")
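    # capture groups: 1 = author, 2 = year, 3 = optional '.'/'acter' suffix, 4 = referenced character number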
    names(m) <- seq_along(chars)
    # convert to matrix with only non-missing cases
    dups <- suppressWarnings(lapply(seq_along(m), function(z) {cbind(df_name = z, m[[z]])}))
    dups <- as.data.frame(do.call('rbind', dups))
    dups <- dups[, c(1:4, 6)]
    names(dups) <- c('charnum', 'text', 'author', 'year', 'charmatch')
    dups$id <- as.numeric(as.character(dups$charnum))
    dups$infile <- x$file[dups$id]
    dups$charnum <- x$charnum[dups$id]
    dups$matchfile <- paste0(tolower(str_extract(dups$author, "[A-Z][a-z]+")), "_", dups$year)
    dups <- dups[, c("infile", "charnum", "author", "year", "charmatch", "matchfile")]
    # match 1
    id1 <- sapply(1:nrow(dups), function(i) {
      res <- x$file == dups$infile[i] & x$charnum == dups$charnum[i]
      ifelse(any(res), which(res), NA)
      })
    # match 2
    id2 <- sapply(1:nrow(dups), function(i) {
      res <- x$file == dups$matchfile[i] & x$charnum == dups$charmatch[i]
      ifelse(any(res), which(res), NA)
      })
    dups <- cbind(id1, id2)
    dups <- subset(dups, complete.cases(dups))
    dups <- cbind(dups, stringdists=NA)
  }

  # Method looking at trait correlations
  if (opt == "traitcor") {
    mat <- x$data
    mat[mat == "?" | mat == "-"] <- NA  # convert '?' or '-' scorings to NA
    pairids <- combn(1:ncol(mat), m=2)  # get all pairs of characters
    method <- match.arg(method, c("polycor", "hamming"))  # defaults to "polycor" if method is NULL
    if (method == "polycor") {
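      # polychoric correlation: estimates the correlation of an assumed latent bivariate
      # normal variable underlying the two ordinal characters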
      dmat <- pbmclapply(1:ncol(pairids), function(i) {
        id1 <- pairids[1, i]
        id2 <- pairids[2, i]
        char1 <- mat[, id1]
        char2 <- mat[, id2]
        M <- cbind(char1, char2)
        M <- na.omit(M)
        M <- apply(M, 2, as.numeric)
        suppressWarnings(tryCatch(polychor(M[, 1], M[, 2]), error=function(e) NA))
      }, mc.cores = cores)
      cors <- unlist(dmat)
      weights <- rep(1, length(cors))
    }
    if (method == "hamming") {
      dmat <- pbmclapply(1:ncol(pairids), function(i) {
        id1 <- pairids[1, i]
        id2 <- pairids[2, i]
        char1 <- mat[, id1]
        char2 <- mat[, id2]
        M <- na.omit(cbind(char1,char2))
        wt <- nrow(M)
        M <- apply(M, 2, paste0, collapse="")
        sdist <- as.numeric(stringdistmatrix(M, method="hamming"))
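        # rescale: the normalized Hamming distance is the fraction of taxa scored differently;
        # values near 0 (identical codings) or 1 (mirror-image codings) both suggest redundancy,
        # so fold around 0.5 and rescale to [0, 1], where 1 means perfectly (anti)correlated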
        sdist <- sdist/str_length(M[1])  # standardize by number of characters
        sdist <- abs(sdist-0.5)
        sdist <- sdist/0.5
        c(sdist=sdist, weight=wt)
        }, mc.cores = cores)
      cors <- sapply(dmat, "[[", "sdist")
      weights <- sapply(dmat, "[[", "weight")  
    }
    # convert correlations to a distance-like score (0 = perfectly (anti)correlated)
    stringdists <- 1 - abs(cors)
    dups <- cbind(char1 = pairids[1, ], char2 = pairids[2, ], stringdist = stringdists, weight = weights)
  }

  res <- x  # resultant NEXUS file for outputting later

  # create final duplicates matrix
  if (nrow(dups) == 0) {
    warning("No duplicates found, try a different method")
    res$dups <- NULL
  } else {
    res$dups <- data.frame("char1" = dups[, 1], "charnum1" = x$charnum[dups[, 1]],
                           "char2" = dups[, 2], "charnum2" = x$charnum[dups[, 2]],
                           stringdist = dups[, 3])
    res$dups <- res$dups[order(res$dups$stringdist), ]
  }

  # get clusters
  if (opt == "terms") {
    res$clusters <- grps
  }
  
  if (opt == "fuzzy" & nrow(dups) != 0) {
    g <- graph_from_edgelist(t(apply(dups[,1:2,drop=F], 1, as.character)), directed=FALSE)
    # if (cutoff == Inf) {
      grps <- communities(components(g))
    # } else {
      # E(g)$weight <- 1/dups[,'stringdist']
      # cl <- clustfun(g, E(g)$weight)
      # grps <- communities(cl)
    # }    
    res$clusters <- grps
  }

  if (opt == "comments" & nrow(dups) != 0) {
    g <- graph_from_edgelist(t(apply(dups[,1:2,drop=F], 1, as.character)), directed=FALSE)
    cl <- clustfun(g)
    grps <- communities(cl)  # get clusters
    res$clusters <- grps
  }
  
  res$cutoff <- cutoff
  
  res

}


## sub-function to get groups of duplicated characters based on input map (char1 in column 1, char2 in column 2)
findgroups <- function(x) {
  g <- graph_from_data_frame(x, directed=FALSE)
  grps <- max_cliques(g)
  grps <- lapply(grps, function(cl) as.numeric(names(cl)))
  names(grps) <- paste0('dup', seq_along(grps))
  grps
}
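
# e.g., a hypothetical map of duplicate pairs (char1 in column 1, char2 in column 2):
# findgroups(data.frame(char1 = c(2, 156), char2 = c(4, 157)))
# returns two groups, one containing characters 2 and 4 and one containing 156 and 157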

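#' Re-threshold duplicate matches in a \code{nex} object
#' 
#' Takes the output of \code{duplicated.nex}, keeps only duplicate pairs with a
#' string distance below \code{cutoff}, and recomputes the clusters of linked
#' characters without re-running the matching step
#' 
#' @param x a \code{nex} object returned by \code{duplicated.nex}
#' @param cutoff string distance below which a pair of characters is treated as duplicated
#' 
#' @examples \dontrun{
#' # assumes `x` is a nex object already scored by duplicated.nex
#' dups <- duplicated.nex(x, opt = 'fuzzy', method = 'jw')
#' dups2 <- update.nex(dups, cutoff = 0.2)
#' }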
update.nex <- function(x, cutoff=Inf) {
  dups <- x$dups
  newdups <- dups[dups$stringdist < cutoff, ]
  if (nrow(newdups)==0) {
    stop("No duplicates below the set cutoff")
  }
  g <- graph_from_edgelist(t(apply(newdups[,c('char1','char2'),drop=F], 1, as.character)), directed=FALSE)
  grps <- communities(components(g))
  x$clusters <- grps
  x$cutoff <- cutoff
  return(x)
}