```r
knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
library(tidyr)
library(BioCorpus)
library(corpusFreq)
```
The `makeContextVector()` function relies on the utility function `vec2words()`. This function takes in a character vector and then performs the following series of operations (a minimal sketch follows the list):
1. De-duplicate the elements in the vector
2. Remove NA elements
3. Substitute a single whitespace string for certain punctuation
4. Condense any multiple whitespaces into one
5. Split each element into a sub-list of character strings with `split = " "`, i.e. words
6. Collect all words into one list
7. Set all letters to lowercase
8. Remove digit-only words
9. For words with only alpha characters, break apart compound words
10. For words with mixed alphanumeric characters, keep only those matching the pattern "cd|il"
11. Remove words with fewer than 2 characters
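The following is a minimal, illustrative sketch of these steps, not the package's actual implementation; in particular, the punctuation set used in step 3 and the omission of compound-word splitting (step 9) are assumptions:

```r
# Illustrative sketch of vec2words()-style processing; not the real implementation
vec2wordsSketch <- function(vec){
  vec <- unique(vec)                                # 1. de-duplicate
  vec <- vec[ !is.na(vec) ]                         # 2. remove NA elements
  vec <- gsub("[^[:alnum:][:space:]+-]", " ", vec)  # 3. certain punctuation -> whitespace (assumed set)
  vec <- gsub("\\s+", " ", vec)                     # 4. condense multiple whitespaces
  words <- unlist(strsplit(vec, split = " "))       # 5-6. split into words and collect
  words <- tolower(words)                           # 7. lowercase
  words <- words[ !grepl("^[0-9]+$", words) ]       # 8. remove digit-only words
  # 9. compound-word splitting omitted for brevity
  mixed <- grepl("[a-z]", words) & grepl("[0-9]", words)
  words <- words[ !mixed | grepl("cd|il", words) ]  # 10. keep mixed words only if they match "cd|il"
  words[ nchar(words) >= 2 ]                        # 11. remove words with fewer than 2 characters
}
```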
This order of operations is similar to that implemented by the OpenRefine project (formerly Google Refine). The key decisions are described below:
1. De-duplication prior to splitting - By de-duplicating first, we bias the corpus towards words that are found across studies rather than within studies. Elements in the same study are often copy-pasted and therefore not unique, so any misspelling they contain would otherwise be propagated. We therefore mitigate possible copy-paste errors using `unique()` at this early stage (see the toy demonstration after this list).
2. Removing punctuation and digits - Although some punctuation appears in legitimate terms, such as "anti-hemoglobin", it was decided to preserve punctuation only in biologically-specific terms such as "CD4+" and "IL-2".
```r
# Create df with value, study accession, and tableName-colName as columns
ISqueries <- read.csv("ImmuneSpaceQueries.csv", stringsAsFactors = FALSE)
ISdfs <- mapply(getISdata,
                tableName = ISqueries$tableName,
                colName = ISqueries$columnName,
                SIMPLIFY = FALSE) # 'SIMPLIFY' defaults to TRUE and returns a vector. Argh.
df <- do.call(rbind, ISdfs)

# Helper for making frequency tables
makeFTs <- function(values, colNm){
  lapply(values, function(x){
    tmp <- df[ df[[colNm]] == x, ]
    corpusFreq::makeFreqTbl(tmp$value)
  })
}

# Word frequencies by tblCol
tcNms <- paste(ISqueries$tableName, ISqueries$columnName, sep = "-")
ISFreqsByTbl <- makeFTs(tcNms, "tblCol")
names(ISFreqsByTbl) <- tcNms
save(ISFreqsByTbl, file = "../data/ISFreqsByTbl.rda")

# Word frequencies by study
studies <- unique(df$study_accession)
ISFreqsBySdy <- makeFTs(studies, "study_accession")
names(ISFreqsBySdy) <- studies
save(ISFreqsBySdy, file = "../data/ISFreqsBySdy.rda")

# No subsetting - all words as one vector
ISFreqsAll <- corpusFreq::makeFreqTbl(df$value)
save(ISFreqsAll, file = "../data/ISFreqsAll.rda")
```
```r
allDf <- data.frame(ISFreqsAll, stringsAsFactors = FALSE)
colnames(allDf)[[1]] <- "word"

# Get a sense of the frequency distribution
print(quantile(allDf$Freq))

# Look at the most common words
allDf <- allDf[ order(-allDf$Freq), ]
common <- allDf[ allDf$Freq > 100, ]
hist(common$Freq)
DT::datatable(common[1:30, ])
```
To find the most characteristic words in each table-column and study, we use term frequency-inverse document frequency, or tf-idf.
```r
# Helper for tf-idf
makeTfIdf <- function(freqTbl){
  tLs <- lapply(names(freqTbl), function(x){
    tmp <- data.frame(freqTbl[[x]], stringsAsFactors = FALSE)
    colnames(tmp)[[1]] <- "word"
    tmp$varNm <- x
    tmp$varPerc <- tmp$Freq / sum(tmp$Freq)
    return(tmp)
  })
  tDf <- do.call(rbind, tLs)
  tDf$docFreq <- allDf$Freq[ match(tDf$word, allDf$word) ]
  tDf$docPerc <- tDf$docFreq / sum(tDf$docFreq)
  tDf$tf_idf <- tDf$varPerc / tDf$docPerc

  # Keep the top three words per grouping variable
  subDf <- tDf %>%
    group_by(varNm) %>%
    arrange(desc(tf_idf)) %>%
    slice(1:3)
  return(subDf)
}

topWordsByTbl <- makeTfIdf(ISFreqsByTbl)
topWordsBySdy <- makeTfIdf(ISFreqsBySdy)
DT::datatable(topWordsBySdy)
```
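Note that the score computed above is a frequency-ratio variant of tf-idf rather than the textbook logarithmic form. For a word $w$ within a grouping variable $d$ (a table-column or a study),

$$
\mathrm{tf\_idf}(w, d) = \frac{f_{w,d} \,/\, \sum_{w'} f_{w',d}}{F_w \,/\, N}
$$

where $f_{w,d}$ is the word's frequency within $d$ (giving `varPerc`), $F_w$ is its frequency in the full corpus, and $N$ is the normalizing total used in the code (`sum(tDf$docFreq)`). Because $N$ is constant across words, it rescales all scores uniformly and does not change the within-group ranking: larger values flag words over-represented in $d$ relative to the corpus.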
Correlation matrix of word proportions in each study, by study accession. Because Pearson correlation is invariant to rescaling each column, correlating the raw frequencies below gives the same matrix as correlating proportions.
```r
# Fill in words with zero frequency for each study and order alphabetically
allNms <- unique(unlist(sapply(ISFreqsBySdy, function(x){ names(x) })))
newAll <- lapply(ISFreqsBySdy, function(x){
  missing <- setdiff(allNms, names(x))
  new <- as.list(rep(0, length(missing)))
  names(new) <- missing
  all <- c(new, as.list(x))
  all[ order(names(all)) ]
})

# Fast list-to-data.frame; note that any NULLs introduced would be a problem!
corrMx <- structure(newAll,
                    row.names = c(NA, -length(newAll[[1]])),
                    class = "data.frame")
rownames(corrMx) <- names(newAll[[1]])
corrMx <- apply(corrMx, 2, as.numeric)

# Create correlation matrix and visualize as a heatmap
res <- cor(corrMx)
heatmap(res)
```
```r
# Hard to see differences, so look at the clusters in a dendrogram
distr <- dist(res)
hcr <- hclust(distr)
ddr <- as.dendrogram(hcr)
plot(ddr)

# Create 10 groups and see whether they correlate with the institution listed for each study
trees <- cutree(hcr, k = 10)
invTrees <- Biobase::reverseSplit(trees)
clusts <- rep(names(invTrees), lengths(invTrees))
treeClusts <- data.frame(clusters = clusts,
                         study_accession = unname(unlist(invTrees)))
sdyInsts <- getISdata("study_personnel", "organization")
sdyInsts <- unique(sdyInsts)
sdyInsts <- sdyInsts[, 1:2]
OrgsClusts <- merge(sdyInsts, treeClusts, by = "study_accession")
chiRes <- chisq.test(x = OrgsClusts$value, y = OrgsClusts$clusters)
```
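To inspect the result of the association test, we can simply print `chiRes` from the chunk above:

```r
# Print the chi-squared test of independence between the organization listed
# for a study and its dendrogram cluster; a small p-value would suggest that
# the clusters track the submitting institution
print(chiRes)
```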
Look at the suggestions from 3 different methods:
```r
x <- names(ISFreqsAll)

# hunspell::hunspell_suggest, based on an English dictionary
library(hunspell)
correct <- hunspell_check(x)
incorrect <- as.character(x[ !correct ])
suggs <- hunspell_suggest(incorrect)
# Take the first (most likely) suggestion, or NA when there is none
suggs_likely <- vapply(suggs,
                       function(s){ if (length(s) > 0) s[[1]] else NA_character_ },
                       character(1))
inspect_huns <- data.frame(orig = incorrect,
                           suggestions = suggs_likely,
                           stringsAsFactors = FALSE)

# refinr, based on Google's OpenRefine
library(refinr)
x_refin <- x %>%
  refinr::key_collision_merge() %>%
  refinr::n_gram_merge()
results <- tibble(original_values = x, suggestions = x_refin) %>%
  mutate(equal = original_values == suggestions)
inspect_refine <- results[ results$original_values %in% incorrect, ]
inspect_refine <- inspect_refine[ order(match(inspect_refine$original_values, incorrect)), ]

# Offer suggestions based on frequencies in a custom frequency table (dictionary)
bestGuess <- freqSugg(incorrect, ISFreqsAll)
inspect_sims <- data.frame(incorrect = incorrect,
                           suggestions = bestGuess,
                           stringsAsFactors = FALSE)

# Create df holding all suggestions ... without levels!
suggDf <- data.frame(incorrect = incorrect,
                     hunspell = inspect_huns$suggestions,
                     refinR = inspect_refine$suggestions,
                     bestGuess = inspect_sims$suggestions,
                     stringsAsFactors = FALSE)
DT::datatable(suggDf)
```
```r
# Using corpusFreq's interactive functions, we can correct the freqTbl and write
# out all our changes to a new rda file.

# 1. Many terms of fewer than 4 characters are abbreviations that are impossible to discern
tmp <- suggDf[ nchar(suggDf$incorrect) >= 4, ]

# 2. Of those, look only at rows where the freqTbl offers suggestions
tmp <- tmp[ tmp$incorrect != tmp$bestGuess, ] # this leaves ~550 words
tmp <- tmp[ tmp$incorrect != paste0(tmp$bestGuess, "s"), ] # remove plural-only changes
# Remove rows where the hunspell suggestion matches the original, ignoring case
caseMatch <- mapply(grepl, tmp$incorrect, tmp$hunspell,
                    MoreArgs = list(ignore.case = TRUE))
tmp <- tmp[ !caseMatch, ]

# 3. Use interactiveFindReplace() from corpusFreq to generate an R script with all
# the substitutions we want to make to names(ISFreqsAll). This work was already done
# and the file correctISFreqsAll.R was generated, so we source it now.
bw <- mapply(c, tmp$bestGuess, tmp$hunspell, SIMPLIFY = FALSE)
names(bw) <- tmp$incorrect
# corrWords <- corpusFreq::interactiveFindReplace(badWords = bw,
#                                                 input = suggDf$incorrect,
#                                                 outFile = "correctISFreqsAll.R")
source("correctISFreqsAll.R") # provides the custom function `spellCheckRes()`

# 4. Update the frequency table with the substitutions
ISFreqsAllEdit <- ISFreqsAll
isfae <- data.frame(word = names(ISFreqsAllEdit),
                    Freq = as.vector(ISFreqsAllEdit),
                    stringsAsFactors = FALSE)
isfae$word <- spellCheckRes(isfae$word)

# 5. Re-aggregate the frequencies since there are approx. 100 non-unique words now
ISFreqsAllEdit <- isfae %>%
  group_by(word) %>%
  summarize(Freq = sum(Freq))

# 6. Save the edited frequency table
save(ISFreqsAllEdit, file = "../data/ISFreqsAllEdit.rda")
```