```r
knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
library(tidyr)
library(BioCorpus)
library(corpusFreq)
```
The `makeContextVector()` function relies on the utility function `vec2words()`. This function takes in a character vector and then performs the following series of operations (a minimal sketch follows the list):
1. De-duplicate the elements in the vector
2. Remove NA elements
3. Substitute a single whitespace string for certain punctuation
4. Condense any multiple whitespaces into one
5. Split each element into a sub-list of character strings with `split = " "`, i.e. words
6. Collect all words into one list
7. Set all letters to lowercase
8. Remove digit-only words
9. For words with only alpha characters, break apart compound words
10. For words with mixed alphanumeric characters, keep only those matching the pattern "cd|il"
11. Remove words with fewer than 2 characters
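The following is a minimal, illustrative sketch of these steps, not the package's actual implementation; in particular, the punctuation set used in step 3 and the omission of compound-word splitting (step 9) are assumptions:

```r
# Illustrative sketch of vec2words()-style processing; not the real implementation
vec2wordsSketch <- function(vec){
  vec <- unique(vec)                                # 1. de-duplicate
  vec <- vec[ !is.na(vec) ]                         # 2. remove NA elements
  vec <- gsub("[^[:alnum:][:space:]+-]", " ", vec)  # 3. certain punctuation -> whitespace (assumed set)
  vec <- gsub("\\s+", " ", vec)                     # 4. condense multiple whitespaces
  words <- unlist(strsplit(vec, split = " "))       # 5-6. split into words and collect
  words <- tolower(words)                           # 7. lowercase
  words <- words[ !grepl("^[0-9]+$", words) ]       # 8. remove digit-only words
  # 9. compound-word splitting omitted for brevity
  mixed <- grepl("[a-z]", words) & grepl("[0-9]", words)
  words <- words[ !mixed | grepl("cd|il", words) ]  # 10. keep mixed words only if they match "cd|il"
  words[ nchar(words) >= 2 ]                        # 11. remove words with fewer than 2 characters
}
```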
This order of operations is similar to that implemented by the OpenRefine project (formerly Google Refine). The key decisions are described below:
1. De-duplication prior to splitting - By de-duplicating first, we bias the corpus towards words that are found across studies rather than within studies. Elements in the same study are often copy-pasted and therefore not unique, so any misspelling they contain would otherwise be propagated. We therefore mitigate possible copy-paste errors using `unique()` at this early stage (see the toy demonstration after this list).
2. Removing punctuation and digits - Although some punctuation appears in legitimate terms, such as "anti-hemoglobin", it was decided to preserve punctuation only in biologically-specific terms such as "CD4+" and "IL-2".
```r
# Create df with value, study accession, and tableName-colName as columns
ISqueries <- read.csv("ImmuneSpaceQueries.csv", stringsAsFactors = FALSE)
ISdfs <- mapply(getISdata,
                tableName = ISqueries$tableName,
                colName = ISqueries$columnName,
                SIMPLIFY = FALSE) # 'SIMPLIFY' defaults to TRUE and returns a vector. Argh.
df <- do.call(rbind, ISdfs)

# Helper for making frequency tables
makeFTs <- function(values, colNm){
  lapply(values, function(x){
    tmp <- df[ df[[colNm]] == x, ]
    corpusFreq::makeFreqTbl(tmp$value)
  })
}

# Word frequencies by tblCol
tcNms <- paste(ISqueries$tableName, ISqueries$columnName, sep = "-")
ISFreqsByTbl <- makeFTs(tcNms, "tblCol")
names(ISFreqsByTbl) <- tcNms
save(ISFreqsByTbl, file = "../data/ISFreqsByTbl.rda")

# Word frequencies by study
studies <- unique(df$study_accession)
ISFreqsBySdy <- makeFTs(studies, "study_accession")
names(ISFreqsBySdy) <- studies
save(ISFreqsBySdy, file = "../data/ISFreqsBySdy.rda")

# No subsetting - all words as one vector
ISFreqsAll <- corpusFreq::makeFreqTbl(df$value)
save(ISFreqsAll, file = "../data/ISFreqsAll.rda")
```
```r
allDf <- data.frame(ISFreqsAll, stringsAsFactors = FALSE)
colnames(allDf)[[1]] <- "word"

# Get a sense of the frequency distribution
print(quantile(allDf$Freq))

# Look at the most common words
allDf <- allDf[ order(-allDf$Freq), ]
common <- allDf[ allDf$Freq > 100, ]
hist(common$Freq)
DT::datatable(common[1:30, ])
```
To find the most characteristic words in each table-column and study, we use term frequency-inverse document frequency, or tf-idf.
```r
# Helper for tf-idf
makeTfIdf <- function(freqTbl){
  tLs <- lapply(names(freqTbl), function(x){
    tmp <- data.frame(freqTbl[[x]], stringsAsFactors = FALSE)
    colnames(tmp)[[1]] <- "word"
    tmp$varNm <- x
    tmp$varPerc <- tmp$Freq / sum(tmp$Freq)
    return(tmp)
  })
  tDf <- do.call(rbind, tLs)
  tDf$docFreq <- allDf$Freq[ match(tDf$word, allDf$word) ]
  tDf$docPerc <- tDf$docFreq / sum(tDf$docFreq)
  tDf$tf_idf <- tDf$varPerc / tDf$docPerc

  # Keep the top three words per grouping variable
  subDf <- tDf %>%
    group_by(varNm) %>%
    arrange(desc(tf_idf)) %>%
    slice(1:3)
  return(subDf)
}

topWordsByTbl <- makeTfIdf(ISFreqsByTbl)
topWordsBySdy <- makeTfIdf(ISFreqsBySdy)
DT::datatable(topWordsBySdy)
```
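Note that the score computed above is a frequency-ratio variant of tf-idf rather than the textbook logarithmic form. For a word $w$ within a grouping variable $d$ (a table-column or a study),

$$
\mathrm{tf\_idf}(w, d) = \frac{f_{w,d} \,/\, \sum_{w'} f_{w',d}}{F_w \,/\, N}
$$

where $f_{w,d}$ is the word's frequency within $d$ (giving `varPerc`), $F_w$ is its frequency in the full corpus, and $N$ is the normalizing total used in the code (`sum(tDf$docFreq)`). Because $N$ is constant across words, it rescales all scores uniformly and does not change the within-group ranking: larger values flag words over-represented in $d$ relative to the corpus.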
Correlation matrix of word proportions in each study, by study accession. Because Pearson correlation is invariant to rescaling each column, correlating the raw frequencies below gives the same matrix as correlating proportions.
```r
# Fill in words with zero frequency for each study and order alphabetically
allNms <- unique(unlist(sapply(ISFreqsBySdy, function(x){ names(x) })))
newAll <- lapply(ISFreqsBySdy, function(x){
  missing <- setdiff(allNms, names(x))
  new <- as.list(rep(0, length(missing)))
  names(new) <- missing
  all <- c(new, as.list(x))
  all[ order(names(all)) ]
})

# Fast list-to-data.frame; note that any NULLs introduced would be a problem!
corrMx <- structure(newAll,
                    row.names = c(NA, -length(newAll[[1]])),
                    class = "data.frame")
rownames(corrMx) <- names(newAll[[1]])
corrMx <- apply(corrMx, 2, as.numeric)

# Create correlation matrix and visualize as a heatmap
res <- cor(corrMx)
heatmap(res)
```
```r
# Hard to see differences, so look at the clusters in a dendrogram
distr <- dist(res)
hcr <- hclust(distr)
ddr <- as.dendrogram(hcr)
plot(ddr)

# Create 10 groups and see whether they correlate with the institution listed for each study
trees <- cutree(hcr, k = 10)
invTrees <- Biobase::reverseSplit(trees)
clusts <- rep(names(invTrees), lengths(invTrees))
treeClusts <- data.frame(clusters = clusts,
                         study_accession = unname(unlist(invTrees)))
sdyInsts <- getISdata("study_personnel", "organization")
sdyInsts <- unique(sdyInsts)
sdyInsts <- sdyInsts[, 1:2]
OrgsClusts <- merge(sdyInsts, treeClusts, by = "study_accession")
chiRes <- chisq.test(x = OrgsClusts$value, y = OrgsClusts$clusters)
```
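To inspect the result of the association test, we can simply print `chiRes` from the chunk above:

```r
# Print the chi-squared test of independence between the organization listed
# for a study and its dendrogram cluster; a small p-value would suggest that
# the clusters track the submitting institution
print(chiRes)
```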
Look at the suggestions from 3 different methods:
```r
x <- names(ISFreqsAll)

# hunspell::hunspell_suggest, based on an English dictionary
library(hunspell)
correct <- hunspell_check(x)
incorrect <- as.character(x[ !correct ])
suggs <- hunspell_suggest(incorrect)
# Take the first (most likely) suggestion, or NA when there is none
suggs_likely <- vapply(suggs,
                       function(s){ if (length(s) > 0) s[[1]] else NA_character_ },
                       character(1))
inspect_huns <- data.frame(orig = incorrect,
                           suggestions = suggs_likely,
                           stringsAsFactors = FALSE)

# refinr, based on Google's OpenRefine
library(refinr)
x_refin <- x %>%
  refinr::key_collision_merge() %>%
  refinr::n_gram_merge()
results <- tibble(original_values = x, suggestions = x_refin) %>%
  mutate(equal = original_values == suggestions)
inspect_refine <- results[ results$original_values %in% incorrect, ]
inspect_refine <- inspect_refine[ order(match(inspect_refine$original_values, incorrect)), ]

# Offer suggestions based on frequencies in a custom frequency table (dictionary)
bestGuess <- freqSugg(incorrect, ISFreqsAll)
inspect_sims <- data.frame(incorrect = incorrect,
                           suggestions = bestGuess,
                           stringsAsFactors = FALSE)

# Create df holding all suggestions ... without levels!
suggDf <- data.frame(incorrect = incorrect,
                     hunspell = inspect_huns$suggestions,
                     refinR = inspect_refine$suggestions,
                     bestGuess = inspect_sims$suggestions,
                     stringsAsFactors = FALSE)
DT::datatable(suggDf)
```
```r
# Using corpusFreq's interactive functions, we can correct the freqTbl and write
# out all our changes to a new rda file.

# 1. Many terms of fewer than 4 characters are abbreviations that are impossible to discern
tmp <- suggDf[ nchar(suggDf$incorrect) >= 4, ]

# 2. Of those, look only at rows where the freqTbl offers suggestions
tmp <- tmp[ tmp$incorrect != tmp$bestGuess, ] # this leaves ~550 words
tmp <- tmp[ tmp$incorrect != paste0(tmp$bestGuess, "s"), ] # remove plural-only changes
# Remove rows where the hunspell suggestion matches the original, ignoring case
caseMatch <- mapply(grepl, tmp$incorrect, tmp$hunspell,
                    MoreArgs = list(ignore.case = TRUE))
tmp <- tmp[ !caseMatch, ]

# 3. Use interactiveFindReplace() from corpusFreq to generate an R script with all
# the substitutions we want to make to names(ISFreqsAll). This work was already done
# and the file correctISFreqsAll.R was generated, so we source it now.
bw <- mapply(c, tmp$bestGuess, tmp$hunspell, SIMPLIFY = FALSE)
names(bw) <- tmp$incorrect
# corrWords <- corpusFreq::interactiveFindReplace(badWords = bw,
#                                                 input = suggDf$incorrect,
#                                                 outFile = "correctISFreqsAll.R")
source("correctISFreqsAll.R") # provides the custom function `spellCheckRes()`

# 4. Update the frequency table with the substitutions
ISFreqsAllEdit <- ISFreqsAll
isfae <- data.frame(word = names(ISFreqsAllEdit),
                    Freq = as.vector(ISFreqsAllEdit),
                    stringsAsFactors = FALSE)
isfae$word <- spellCheckRes(isfae$word)

# 5. Re-aggregate the frequencies since there are approx. 100 non-unique words now
ISFreqsAllEdit <- isfae %>%
  group_by(word) %>%
  summarize(Freq = sum(Freq))

# 6. Save the edited frequency table
save(ISFreqsAllEdit, file = "../data/ISFreqsAllEdit.rda")
```