R-CMD-check Travis build status

text2df

An R package for working with text data. It is mostly a wrapper around the corpus, quanteda, and udpipe packages, and an attempt to provide a uniform framework across them.

# Install text2df from the jaytimm/text2df GitHub repository.
devtools::install_github(repo = "jaytimm/text2df")

Some data

library(dplyr)

# Search PubMed for "Psilocybin", restricted to the TIAB and MH fields.
pmids <- pubmedr::pmed_search_pubmed(
  search_term = "Psilocybin",
  fields = c("TIAB", "MH")
)

# Fetch the matching records, stack them into a single data frame, drop
# records without an abstract, and rename the id/abstract columns to
# `doc_id` / `text` (presumably the column names the tif2* functions
# expect -- confirm against the text2df documentation).
corpus <- pubmedr::pmed_get_records2(pmids = pmids$pmid) |>
  bind_rows() |>
  filter(!is.na(abstract)) |>
  rename(
    doc_id = pmid,
    text = abstract
  )

tif2sentence

# Run the sentence step of the pipeline on the corpus built above.
x0 <- corpus |>
  text2df::tif2sentence()

# Preview the first rows as a table. The native pipe is used here to match
# the other pipelines in this file.
x0 |> head() |> knitr::kable()

tif2token

# Sentence step followed by the token step.
x1 <- corpus |>
  text2df::tif2sentence() |>
  text2df::tif2token()

# Show the first three elements of the result.
x1[1:3]

token2mwe

library(pubmedr)
data("pmed_tbl_mesh")

# Build the multi-word expression list from the MeSH table: keep terms that
# contain a space (i.e. more than one word) and no comma, one row per
# distinct TermName.
mwe <- pmed_tbl_mesh |>
  filter(!grepl(",", TermName)) |>
  filter(grepl(" ", TermName)) |>
  distinct(TermName, .keep_all = TRUE)

# Peek at ten randomly chosen terms (output differs from run to run).
sample(mwe$TermName, size = 10)

# Sentence and token steps, then token2mwe() with the term list above.
x10 <- corpus |>
  text2df::tif2sentence() |>
  text2df::tif2token() |>
  text2df::token2mwe(mwe = mwe$TermName)

# Show the first three elements of the result.
x10[1:3]

token2df

# Full pipeline ending in token2df(): sentences -> tokens -> multi-word
# expressions -> data frame.
x2 <- corpus |>
  text2df::tif2sentence() |>
  text2df::tif2token() |>
  text2df::token2mwe(mwe = mwe$TermName) |>
  text2df::token2df()

# Preview the first rows as a table.
head(x2) |> knitr::kable()

token2annotation

# Directory holding the udpipe model file. This path is machine-specific:
# point it at wherever the model lives on your system.
locald <- "/home/jtimm/pCloudDrive/nlp/udpipe-model"

# Load the model through its full path instead of calling setwd(), so the
# session's working directory is left untouched.
udmodel <- udpipe::udpipe_load_model(
  file.path(locald, "english-ewt-ud-2.5-191206.udpipe")
)

# Same pipeline as before, ending in token2annotation() with the loaded
# udpipe model.
x3 <- corpus |>
  text2df::tif2sentence() |>
  text2df::tif2token() |>
  text2df::token2mwe(mwe = mwe$TermName) |>
  text2df::token2annotation(model = udmodel)

# Preview the first rows as a table. The native pipe is used here to match
# the other pipelines in this file.
x3 |> head() |> knitr::kable()


jaytimm/text2df documentation built on July 21, 2023, 1:58 a.m.