#' Converts bibliographic text files extracted from ProQuest into an analyzable data frame
#'
#' Reads every `.txt` file in `path`, splits each line into an attribute (the
#' text before the first colon) and a record (the text after it), keeps a fixed
#' set of bibliographic attributes and returns them in long format: one row per
#' attribute value of each article. Authors and keywords that share one line
#' (separated by semicolons) are expanded to one row each.
#'
#' @param path path for the folder containing text files to be wrangled
#' @param rmDuplicates logical value indicating whether the function should identify and remove duplicate article records (matched on lower-cased, trimmed title)
#' @param csv logical value indicating whether the final data frame generated by the function should be saved to the working directory as `proQuestBWR.csv`
#'
#' @return A data frame (dplyr table) with the columns `articleID`,
#'   `attributes` and `record`. The attribute labels produced by the function
#'   are `article`, `author`, `journal`, `pubYear`, `abstract`, `keyWord`,
#'   `location` and `pages`. The value is returned invisibly when `csv = TRUE`.
#'
#' @examples
#' \dontrun{
#' proQuestBWR.f(csv=TRUE, path="C:/Users/JohnDoe/Desktop/ProQuestTextFiles")
#' }
#' @note proQuestBWR.f will wrangle all text files present in the path folder.
#'
#' @export
proQuestBWR.f <- function(path, rmDuplicates=TRUE, csv = FALSE){

  # Expand rows whose record holds several ";"-separated values into one row
  # per value. Returns a plain data frame with the same three columns.
  splitRecords <- function(df) {
    if (nrow(df) == 0L) {
      # Nothing to split: return an empty frame with matching column types
      # instead of relying on zero-length recycling in paste()/rep()/unlist().
      return(data.frame(
        attributes = character(0),
        record = character(0),
        articleID = df$articleID[0],
        stringsAsFactors = FALSE))
    }
    # The trailing ";" guarantees at least one piece per row, even when the
    # record itself is an empty string.
    pieces <- strsplit(paste0(df$record, ";"), ";")
    n <- lengths(pieces)
    data.frame(
      attributes = rep(df$attributes, n),
      record = as.character(unlist(pieces)),
      articleID = rep(df$articleID, n),
      stringsAsFactors = FALSE)
  }

  #___________________________________________________________________________
  # 1. READ ProQuest txt files
  #---------------------------------------------------------------------------
  #
  # Check last line of txt file. Clean up might be necessary
  #___________________________________________________________________________
  # `pattern` is a regular expression: escape the dot and anchor the match so
  # that only names ending in ".txt" are read.
  txtFiles <- list.files(path, pattern = "\\.txt$", full.names = TRUE)
  if (length(txtFiles) == 0L) {
    stop("No .txt files found in: ", paste(path, collapse = ", "),
         call. = FALSE)
  }
  record <- as.character(unlist(lapply(txtFiles, readLines, warn = FALSE)))

  # Extract the attribute label (everything up to the first colon); lines
  # without a colon yield NA.
  attributes <- stringr::str_extract(record, "(.*?):")
  full.df <- data.frame(attributes = attributes, record = record,
                        stringsAsFactors = FALSE)

  # A line of underscores separates two articles, so the running count of
  # separator lines serves as the article identifier. It must be computed
  # before any rows are dropped.
  full.df$articleID <- cumsum(grepl("________", full.df$record))

  # Eliminate empty rows and rows without an attribute label
  full.df <- full.df[full.df$record != "" & !is.na(full.df$attributes), ]

  # Normalise the labels ("Publication title:" -> "Publicationtitle") and
  # strip the label from the record text
  full.df$attributes <- gsub(" ", "", full.df$attributes)
  full.df$attributes <- gsub(":", "", full.df$attributes)
  full.df$record <- gsub("^[^:]+:", "", full.df$record)

  # For "Publication details" keep only the text before the first " (".
  # This has to happen before the label is renamed below.
  isPubDetails <- full.df$attributes == "Publicationdetails"
  full.df$record[isPubDetails] <-
    gsub("\\s\\(.*", "", full.df$record[isPubDetails])

  # Harmonise labels that mean the same thing across databases
  relabel <- c(Title = "Article",
               Documentauthor = "Author",
               Publicationdetails = "Publicationtitle",
               Subject = "keyWord")
  hit <- full.df$attributes %in% names(relabel)
  full.df$attributes[hit] <- unname(relabel[full.df$attributes[hit]])

  attributeKeep <- c("Article", "Author", "Publicationtitle",
                     "Publicationyear", "Abstract", "keyWord", "Location",
                     "Pages")
  full.df <- full.df[full.df$attributes %in% attributeKeep, ]

  #_____________________________________________________________________________
  # 2. REMOVE DUPLICATE RECORDS
  #-----------------------------------------------------------------------------
  # Global match for duplicate article records based on the title. Duplicates
  # occur because multiple databases have overlapping indexing.
  #_____________________________________________________________________________
  if (rmDuplicates) {
    titles <- full.df[full.df$attributes == "Article",
                      c("articleID", "record")]
    # Titles show discrepancies in capitalization and surrounding white
    # space; compare on a lower-cased, trimmed key.
    titles$record <- trimws(tolower(titles$record))
    # Collapse repeated title rows of the same article first so that an
    # article is never flagged as a duplicate of itself.
    titles <- titles[!duplicated(titles), ]
    # The articleID must be used to drop a duplicate because the duplicated
    # article also owns rows for its other attributes.
    duplicatedID <- titles$articleID[duplicated(titles$record)]
    full.df <- full.df[!(full.df$articleID %in% duplicatedID), ]
  }

  #_____________________________________________________________________________
  # 3. AUTHOR FIELD FIX - AUTHORS IN SINGLE FIELD
  #-----------------------------------------------------------------------------
  #
  # Social Work Abstracts lists authors in a single cell, separated by a
  # semicolon, and on some occasions includes digits and email addresses.
  # Each author is moved to its own row to be consistent with PsychInfo and
  # SSA.
  #_____________________________________________________________________________
  isAuthor <- full.df$attributes == "Author"
  authors <- full.df[isAuthor, ]
  noAuthors <- full.df[!isAuthor, ]

  # Only records that contain a semicolon need to be split
  hasSemicolon <- grepl(";", authors$record, fixed = TRUE)
  authorsFixed <- splitRecords(authors[hasSemicolon, ])
  authorsFixed$record <- stringr::str_trim(authorsFixed$record, side = "both")
  # Eliminate author affiliation from author record (text following the
  # footnote marker "1")
  authorsFixed$record <- gsub("1\\s.+", "", authorsFixed$record)
  # Eliminate remaining footnote marker from author record
  authorsFixed$record <- gsub("1", "", authorsFixed$record)

  # Author rows of articles that were NOT split. Screening is by articleID,
  # so every original author row of a split article is replaced by the
  # split rows.
  fixedID <- unique(authorsFixed$articleID)
  authorsGood <- authors[!(authors$articleID %in% fixedID), ]
  # Strip author biographies
  authorsGood$record <- gsub("11\\s.+", "", authorsGood$record)

  full.df <- rbind(noAuthors, authorsGood, authorsFixed)
  # order() is stable, so rows keep their relative order within an article
  full.df <- full.df[order(full.df$articleID), ]

  #_____________________________________________________________________________
  # 4. KEYWORD FIELD FIX - KEYWORDS IN SINGLE FIELD
  #-----------------------------------------------------------------------------
  #
  # Keywords (subjects) are listed in a single cell, separated by a
  # semicolon. Each keyword is moved to its own row, stripped of "*" markers
  # and surrounding white space, and converted to lower case.
  #_____________________________________________________________________________
  isKeyword <- full.df$attributes == "keyWord"
  keywords <- splitRecords(full.df[isKeyword, ])
  keywords$record <- gsub("\\*", "", keywords$record)
  keywords$record <- stringr::str_trim(keywords$record, side = "both")
  keywords$record <- tolower(keywords$record)

  DF <- rbind(full.df[!isKeyword, ], keywords)
  DF <- DF[order(DF$articleID), ]

  #_____________________________________________________________________________
  # 5. Minor Cleaning
  #-----------------------------------------------------------------------------
  #
  # Meaningful variable names are assigned to the attributes that have been
  # cleaned and are appropriate for analysis.
  #_____________________________________________________________________________
  DF$attributes <- tolower(DF$attributes)
  finalLabel <- c(keyword = "keyWord",
                  publicationtitle = "journal",
                  publicationyear = "pubYear")
  hit <- DF$attributes %in% names(finalLabel)
  DF$attributes[hit] <- unname(finalLabel[DF$attributes[hit]])

  # Strip white-space
  DF$record <- stringr::str_trim(DF$record, side = "both")
  rownames(DF) <- NULL
  # Return a dplyr table, as before; as_tibble() replaces the deprecated
  # tbl_df().
  DF <- dplyr::as_tibble(DF[, c("articleID", "attributes", "record")])

  # write to CSV if necessary
  if (csv) {
    utils::write.csv(DF, "proQuestBWR.csv")
    message("The `proQuestBWR.csv` file can be found in your working directory: ", getwd())
    return(invisible(DF))
  }
  DF
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.