In joe-devivo/pdfSpeciesScraper: Extraction of Species Names from Data Store PDF Documents

# Echo the source of every chunk in the rendered document.
knitr::opts_chunk$set(echo = TRUE)

# Project helper scripts (presumably where GetPdfFromDataStore() is defined;
# confirm against the files in functions/).
source("functions/FileProcessingFunctions.R")
source("functions/DataCleanup.R")

# Packages this document attaches.
packages <- c("RODBC", "tm", "quanteda", "readtext")

# Attach each package; if it cannot be attached, install it from CRAN and
# attach it afterwards.
package.check <- lapply(packages, FUN = function(pkg) {
  attached <- require(pkg, character.only = TRUE)
  if (!attached) {
    install.packages(
      pkg,
      dependencies = TRUE,
      repos = "http://cran.us.r-project.org"
    )
    library(pkg, character.only = TRUE)
  }
})

# Folder the PDFs are read from when the corpus is built.
doclocation <- "d:/texts"

# Saved R data from the sibling NPSpecies project; dicSpecies is the species
# dictionary that the corpus text is matched against.
load("../NPSpecies/data/dicSpecies.rda")
load("../NPSpecies/data/NPSpeciesJustSpecies.rda")

Download and Scrape Files

The following pulls data from the SQL database via a machine ODBC connection. We will be updating several processing tables that feed into this query as data are processed. The download is limited to files that have not been flagged as previously downloaded.

# Maximum number of files to download in one run.
chunksize <- 50

# Read the current processing status of every file from the SQL view.
ch <- odbcConnect("pdfScraper")
pdfFileProcessingStatus <- sqlFetch(
  ch,
  "dbo.vw_pdfFileProcessingStatus",
  as.is = c(TRUE),
  stringsAsFactors = FALSE
)
odbcClose(ch)

FilesNotDownloaded <- subset(pdfFileProcessingStatus, Downloaded != 1)

# Take at most `chunksize` IDs. Indexing 1:chunksize directly would run past
# the end of the pending list when fewer files remain, which requested an NA
# file ID and recorded NA rows as downloaded.
idsToDownload <- head(FilesNotDownloaded$DigitalFileID, chunksize)

# Initialize temp download status data frame.
tempDownloadStatus <- data.frame(
  DigitalFileID = integer(),
  Downloaded = integer()
)

for (i in seq_along(idsToDownload)) {
  GetPdfFromDataStore(idsToDownload[i])
  # Record the status only after the download call has returned, so the data
  # frame holds exactly the files fetched so far if a later download errors.
  tempDownloadStatus <- rbind(
    tempDownloadStatus,
    data.frame(DigitalFileID = idsToDownload[i], Downloaded = 1)
  )
  # Pause between requests.
  Sys.sleep(2)
}

# Update the download status table in the SQL database, then refresh the
# local copy of the processing status view.
ch <- odbcConnect("pdfScraper")
if (nrow(tempDownloadStatus) > 0) {
  # Nothing to append when no files were pending.
  sqlSave(
    ch,
    tempDownloadStatus,
    "dbo.tbl_DownloadStatus",
    append = TRUE,
    rownames = FALSE
  )
}
pdfFileProcessingStatus <- sqlFetch(
  ch,
  "dbo.vw_pdfFileProcessingStatus",
  as.is = c(TRUE),
  stringsAsFactors = FALSE
)
odbcClose(ch)

# Create a corpus from the downloaded files. Document variables are taken
# from the file names, split on "_".
docs <- readtext(
  paste0(doclocation, "/*.pdf"),
  docvarsfrom = "filenames",
  dvsep = "_"
)

docscorpus <- corpus(docs)

# Build the scrape status rows in one step: one row per document, using the
# first file-name-derived variable (docvar1) as the DigitalFileID. This
# replaces a `for (i in 1:nrow(docs))` loop, which iterates over c(1, 0) when
# `docs` is empty and grows the data frame with rbind() on every pass.
tempScrapeStatus <- data.frame(
  DigitalFileID = docs$docvar1,
  Scraped = rep(1, nrow(docs))
)

# Update the scrape status table in the SQL database, then refresh the local
# copy of the processing status view.
ch <- odbcConnect("pdfScraper")
if (nrow(tempScrapeStatus) > 0) {
  # Nothing to append when no documents were read.
  sqlSave(
    ch,
    tempScrapeStatus,
    "dbo.tbl_ScrapeStatus",
    append = TRUE,
    rownames = FALSE
  )
}
pdfFileProcessingStatus <- sqlFetch(
  ch,
  "dbo.vw_pdfFileProcessingStatus",
  as.is = c(TRUE),
  stringsAsFactors = FALSE
)
odbcClose(ch)

# Start the clock; the elapsed time is printed during cleanup.
start_time <- Sys.time()

# Analyze text from files in the corpus by comparing it to the species
# dictionary (fixed, multi-word phrase matching).
hits <- as.data.frame(
  kwic(docscorpus, phrase(dicSpecies), valuetype = "fixed", include_docvars = TRUE)
)
# The fifth column is renamed to SciName for the results table.
# NOTE(review): presumably this is the matched keyword column; the rename is
# positional, so confirm the column order for the installed quanteda version.
names(hits)[5] <- "SciName"

# Build the analysis status rows in one step: one row per document, using
# docvar1 as the DigitalFileID. This replaces a `for (i in 1:nrow(docs))`
# loop, which iterates over c(1, 0) when `docs` is empty and grows the data
# frame with rbind() on every pass.
tempAnalysisStatus <- data.frame(
  DigitalFileID = docs$docvar1,
  TaxonomyAnalyzed = rep(1, nrow(docs))
)

# Write the results and the analysis status to the SQL database, then refresh
# the local copy of the processing status view.
ch <- odbcConnect("pdfScraper")
if (nrow(hits) > 0) {
  # Nothing to append when the dictionary matched nothing.
  sqlSave(
    ch,
    hits,
    "dbo.tbl_TaxonomicScrapingResults",
    append = TRUE,
    rownames = FALSE
  )
}
if (nrow(tempAnalysisStatus) > 0) {
  sqlSave(
    ch,
    tempAnalysisStatus,
    "dbo.tbl_TaxonomyAnalysisStatus",
    append = TRUE,
    rownames = FALSE
  )
}
pdfFileProcessingStatus <- sqlFetch(
  ch,
  "dbo.vw_pdfFileProcessingStatus",
  as.is = c(TRUE),
  stringsAsFactors = FALSE
)
odbcClose(ch)

Cleaning up...

# Recompute the list of files still waiting to be downloaded from the
# refreshed processing status.
FilesNotDownloaded <- subset(pdfFileProcessingStatus, Downloaded != 1)

# Delete the files in the download directory. `doclocation` is used instead
# of repeating the literal path, so cleanup always targets the same folder
# the corpus was read from.
file.remove(list.files(doclocation, full.names = TRUE))

# Report the elapsed time since start_time and the number of files remaining.
print(Sys.time() - start_time)
print(paste(nrow(FilesNotDownloaded), "to go"))

joe-devivo/pdfSpeciesScraper documentation built on May 17, 2019, 2:15 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

joe-devivo/pdfSpeciesScraper
Extraction of Species Names from Data Store PDF Documents

In joe-devivo/pdfSpeciesScraper: Extraction of Species Names from Data Store PDF Documents

Download and Scrape Files

R Package Documentation

Browse R Packages

We want your feedback!

joe-devivo/pdfSpeciesScraper Extraction of Species Names from Data Store PDF Documents

In joe-devivo/pdfSpeciesScraper: Extraction of Species Names from Data Store PDF Documents

Download and Scrape Files

R Package Documentation

Browse R Packages

We want your feedback!

joe-devivo/pdfSpeciesScraper
Extraction of Species Names from Data Store PDF Documents