data-raw/reference.R

# Import Edinburgh Associative Thesaurus as XML
rs <- XML::xmlTreeParse("data-raw/eat-response-stimulus.xml")
rs.list <- XML::xmlToList(rs)

# Convert XML to a human-readable and human-usable list of vectors
# - Names of the list refer to word prompts
# - Names of the contained vector refer to associations
# - Value of vector element is strength
master <- vector("list", length(rs.list) - 1)
names(master) <- unlist(lapply(rs.list[-length(rs.list)], function(wa) wa$.attrs[1]))
for(word in 1:(length(rs.list)-1)){

  cat("(", word, "): Cleaning data for word ", rs.list[[word]]$.attrs[1], "\n", sep = "")

  numAssoc <- (length(rs.list[[word]]) - 1) / 3
  if(numAssoc < 1) break

  assocStr <- vector("numeric", numAssoc)
  assocNam <- vector("character", numAssoc)
  for(i in 1:numAssoc){

    rs.index <- 3 * (i - 1) + 1

    # Assign the value of the association
    assocStr[i] <- rs.list[[word]][rs.index + 1]

    # Name the association
    assocNam[i] <- rs.list[[word]][rs.index]
  }

  names(assocStr) <- assocNam
  master[[word]] <- assocStr
}

# Outline EAT reference table to use for AI
allprompts <- unique(names(master))
allreplies <- unique(unlist(lapply(master, function(wa) names(wa))))
cols <- length(allprompts)
rows <- length(allreplies)
reference <- matrix(0, nrow = rows, ncol = cols)
colnames(reference) <- allprompts
rownames(reference) <- allreplies

# Fill in EAT reference table
for(col in allprompts){

  cat ("Filling reference table for word:", col, "\n")

  for(row in names(master[[col]])){

    val <- unlist(master[[col]][row])
    if(!is.null(val)){

      reference[row, col] <- as.numeric(val)
    }
  }
}

# Include only prompts with many replies
backup <- reference
numreplies <- apply(reference, 2, function(col) sum(col > 0))
reference <- backup[, numreplies > 75]
devtools::use_data(reference, internal = FALSE)
devtools::use_data(reference, internal = TRUE)
tpq/codenames documentation built on May 31, 2019, 6:50 p.m.