README.md

R-CMD-check and Travis build status

text2df

An R package for working with text data. It is mostly a wrapper around the `corpus`, `quanteda`, and `udpipe` packages, and an attempt to provide a uniform framework across them.

devtools::install_github("jaytimm/text2df")

Some data

library(dplyr)
pmids <- pubmedr::pmed_search_pubmed(search_term = 'Psilocybin', 
                                       fields = c('TIAB','MH'))
## [1] "Psilocybin[TIAB] OR Psilocybin[MH]: 1414 records"
corpus <- pubmedr::pmed_get_records2(pmids = pmids$pmid) |>
  bind_rows() |>
  filter(!is.na(abstract)) |>
  rename(doc_id = pmid, text = abstract)

tif2sentence

x0 <- corpus |>
  text2df::tif2sentence()

head(x0) %>% knitr::kable()

| doc_id | text |
|:---|:-------------------------------------------------------------------|
| 36129571.1 | Few treatments are available for patients with mood disorders or post-traumatic stress disorder (PTSD) who have already failed multiple interventions. |
| 36129571.2 | After several decades when research into psychedelics was effectively halted by federal legislation, the past several years have shown the re-emergence of thoughtful investigations studying the utility of compounds such as 3,4-methylenedioxymethamphetamine (MDMA) and psilocybin. |
| 36129571.3 | Several studies have coupled the safe administration of psychedelic compounds in a controlled environment after several hours of preparation of study participants and followed by multiple sessions to integrate the psychedelic experience. |
| 36129571.4 | The improvement participants experience appear related to the often profound perspective changes experienced and seem unlike the improvements seen in the currently available care paradigms. |
| 36129571.5 | Studies cited include treatment resistant depression, end of life despair, and PTSD. |
| 36129571.6 | Psychedelic psychotherapy, a unique remarriage of biological therapy and psychotherapy, has the potential to transform mental health care. |

tif2token

x1 <- corpus |>
  text2df::tif2sentence() |>
  text2df::tif2token()

x1[c(1:3)]
## $`36129571.1`
##  [1] "Few"            "treatments"     "are"            "available"     
##  [5] "for"            "patients"       "with"           "mood"          
##  [9] "disorders"      "or"             "post-traumatic" "stress"        
## [13] "disorder"       "("              "PTSD"           ")"             
## [17] "who"            "have"           "already"        "failed"        
## [21] "multiple"       "interventions"  "."             
## 
## $`36129571.2`
##  [1] "After"                             "several"                          
##  [3] "decades"                           "when"                             
##  [5] "research"                          "into"                             
##  [7] "psychedelics"                      "was"                              
##  [9] "effectively"                       "halted"                           
## [11] "by"                                "federal"                          
## [13] "legislation"                       ","                                
## [15] "the"                               "past"                             
## [17] "several"                           "years"                            
## [19] "have"                              "shown"                            
## [21] "the"                               "re-emergence"                     
## [23] "of"                                "thoughtful"                       
## [25] "investigations"                    "studying"                         
## [27] "the"                               "utility"                          
## [29] "of"                                "compounds"                        
## [31] "such"                              "as"                               
## [33] "3,4-methylenedioxymethamphetamine" "("                                
## [35] "MDMA"                              ")"                                
## [37] "and"                               "psilocybin"                       
## [39] "."                                
## 
## $`36129571.3`
##  [1] "Several"        "studies"        "have"           "coupled"       
##  [5] "the"            "safe"           "administration" "of"            
##  [9] "psychedelic"    "compounds"      "in"             "a"             
## [13] "controlled"     "environment"    "after"          "several"       
## [17] "hours"          "of"             "preparation"    "of"            
## [21] "study"          "participants"   "and"            "followed"      
## [25] "by"             "multiple"       "sessions"       "to"            
## [29] "integrate"      "the"            "psychedelic"    "experience"    
## [33] "."

token2mwe

library(pubmedr)
data("pmed_tbl_mesh")

mwe <- pmed_tbl_mesh |>
  filter(!grepl(',', TermName)) |>
  filter(grepl(' ', TermName)) |>
  distinct(TermName, .keep_all = T) 

sample(mwe$TermName, size = 10)
##  [1] "SOS Protein"                          
##  [2] "Todds Paralysis"                      
##  [3] "PCR 4099"                             
##  [4] "Hospital Morgues"                     
##  [5] "Eulenburg's Disease"                  
##  [6] "Non-Steroidal Anti-Inflammatory Agent"
##  [7] "Cyclin-Dependent Kinase Inhibitor 2C" 
##  [8] "Host Parasite Interactions"           
##  [9] "Peritoneal Fibrosing Syndrome"        
## [10] "Radiation Protective Effects"
x10 <- corpus |>
  text2df::tif2sentence() |>
  text2df::tif2token() |>
  text2df::token2mwe(mwe = mwe$TermName)

x10[c(1:3)]
## $`36129571.1`
##  [1] "Few"                            "treatments"                    
##  [3] "are"                            "available"                     
##  [5] "for"                            "patients"                      
##  [7] "with"                           "mood_disorders"                
##  [9] "or"                             "post-traumatic_stress_disorder"
## [11] "("                              "PTSD"                          
## [13] ")"                              "who"                           
## [15] "have"                           "already"                       
## [17] "failed"                         "multiple"                      
## [19] "interventions"                  "."                             
## 
## $`36129571.2`
##  [1] "After"                             "several"                          
##  [3] "decades"                           "when"                             
##  [5] "research"                          "into"                             
##  [7] "psychedelics"                      "was"                              
##  [9] "effectively"                       "halted"                           
## [11] "by"                                "federal"                          
## [13] "legislation"                       ","                                
## [15] "the"                               "past"                             
## [17] "several"                           "years"                            
## [19] "have"                              "shown"                            
## [21] "the"                               "re-emergence"                     
## [23] "of"                                "thoughtful"                       
## [25] "investigations"                    "studying"                         
## [27] "the"                               "utility"                          
## [29] "of"                                "compounds"                        
## [31] "such"                              "as"                               
## [33] "3,4-methylenedioxymethamphetamine" "("                                
## [35] "MDMA"                              ")"                                
## [37] "and"                               "psilocybin"                       
## [39] "."                                
## 
## $`36129571.3`
##  [1] "Several"                "studies"                "have"                  
##  [4] "coupled"                "the"                    "safe"                  
##  [7] "administration"         "of"                     "psychedelic"           
## [10] "compounds"              "in"                     "a"                     
## [13] "controlled_environment" "after"                  "several"               
## [16] "hours"                  "of"                     "preparation"           
## [19] "of"                     "study"                  "participants"          
## [22] "and"                    "followed"               "by"                    
## [25] "multiple"               "sessions"               "to"                    
## [28] "integrate"              "the"                    "psychedelic"           
## [31] "experience"             "."

token2df

x2 <- corpus |>
  text2df::tif2sentence() |>
  text2df::tif2token() |>
  text2df::token2mwe(mwe = mwe$TermName) |>
  text2df::token2df()

x2 |> head() |> knitr::kable()

| doc_id | token | sentence_id | term_id | token_id |
|:---------|:-----------|:------------|--------:|---------:|
| 36129571 | Few | 1 | 1 | 1 |
| 36129571 | treatments | 1 | 2 | 2 |
| 36129571 | are | 1 | 3 | 3 |
| 36129571 | available | 1 | 4 | 4 |
| 36129571 | for | 1 | 5 | 5 |
| 36129571 | patients | 1 | 6 | 6 |

token2annotation

setwd(locald)
udmodel <- udpipe::udpipe_load_model('english-ewt-ud-2.5-191206.udpipe')

x3 <- corpus |>
  text2df::tif2sentence() |>
  text2df::tif2token() |>
  text2df::token2mwe(mwe = mwe$TermName) |>
  text2df::token2annotation(model = udmodel)

x3 %>% head() %>% knitr::kable()

| doc_id | sentence_id | start | end | term_id | token_id | token | lemma | upos | xpos | feats |
|:-----|------:|---:|--:|----:|:-----|:------|:-----|:---|:---|:----------------------|
| 36129571 | 1 | 1 | 3 | 1 | 1 | Few | few | ADJ | JJ | Degree=Pos |
| 36129571 | 1 | 5 | 14 | 2 | 2 | treatments | treatment | NOUN | NNS | Number=Plur |
| 36129571 | 1 | 16 | 18 | 3 | 3 | are | be | AUX | VBP | Mood=Ind\|Tense=Pres\|VerbForm=Fin |
| 36129571 | 1 | 20 | 28 | 4 | 4 | available | available | ADJ | JJ | Degree=Pos |
| 36129571 | 1 | 30 | 32 | 5 | 5 | for | for | ADP | IN | NA |
| 36129571 | 1 | 34 | 41 | 6 | 6 | patients | patient | NOUN | NNS | Number=Plur |



jaytimm/text2df documentation built on July 21, 2023, 1:58 a.m.