# knitr setup: turn on chunk caching for this document.
library(knitr)
opts_chunk$set(cache = TRUE)

# Database access and data wrangling.
library(dbGaPdb)
library(RSQLite)
library(dplyr)
library(dbplyr)
library(stringr)

# Text mining and word-cloud plotting.
library(tm)
library(SnowballC)
library(wordcloud)
library(RColorBrewer)
# Fetch the dbGaPdb SQLite file into ~/Desktop; the value returned by
# pull_dbGaPdb_sqlite() is passed on below as the database file path.
dbGaPdb_file <- pull_dbGaPdb_sqlite(local_path = '~/Desktop')

# Open the database through DBI. dplyr::src_sqlite() is deprecated in favour
# of calling tbl() directly on a DBI connection; tbl(dbGaPdb, "<table>")
# works unchanged with this connection object. DBI is installed as a
# dependency of RSQLite, so the qualified call needs no extra library().
dbGaPdb <- DBI::dbConnect(RSQLite::SQLite(), dbGaPdb_file)

# If you've already downloaded the sqlite file
## dbGaPdb <- DBI::dbConnect(RSQLite::SQLite(), '~/PATH/your_dbGaP.sqlite')
# Build a character vector of disease labels, one per distinct
# (study accession, disease) pair.
#
# root_study_accession is reduced to the text before its first "." so that
# rows differing only in the suffix after that dot collapse into one entry.
# The prefix is extracted with a single vectorized str_extract() call
# instead of a per-row rowwise() + str_split(), which gives the same prefix
# without evaluating the expression once per row.
diseases_by_study <- tbl(dbGaPdb, "study_info") %>%
  select(root_study_accession, disease) %>%
  collect() %>%  # bring the two columns into R before using stringr
  mutate(study_accession2 = str_extract(root_study_accession, "^[^.]*")) %>%
  distinct(study_accession2, disease) %>%
  pull(disease)
# Turn the disease labels into a tm corpus and normalise the text.
# The transformations run in exactly this order:
#   1. lower-case everything
#   2. drop digits
#   3. drop common English stopwords
#   4. drop the domain-specific stopwords "diseases" / "disease"
#      (supply your own stopwords as a character vector here)
#   5. drop punctuation
#   6. collapse runs of whitespace
docs <- Corpus(VectorSource(diseases_by_study)) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeNumbers) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(removeWords, c("diseases", "disease")) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace)
# Count how often each term occurs across the whole corpus.
# Note: despite its name, `dtm` holds a term-document matrix (terms in rows),
# which is why the per-term totals are row sums.
dtm <- TermDocumentMatrix(docs)
m <- as.matrix(dtm)

# Per-term totals, most frequent first.
v <- m %>%
  rowSums() %>%
  sort(decreasing = TRUE)

# Two-column table (term, frequency) consumed by the word cloud below.
d <- data.frame(
  word = names(v),
  freq = v
)
# Draw the word cloud of disease terms.
# The seed is fixed immediately before plotting so the layout is the same
# on every run.
set.seed(1234)
with(
  d,
  wordcloud(
    words = word,
    freq = freq,
    scale = c(4, 1),
    min.freq = 1,
    max.words = 100,
    random.order = FALSE,
    rot.per = 0.35,
    colors = brewer.pal(8, "Dark2")
  )
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.