#' Converts bibliographic text files extracted from EbscoHost into analyzable data frame
#'
#' @param path path directory for the folder containing text files to be wrangled
#' @param rmDuplicates logical vector indicating whether the function should identify and remove duplicate article records
#' @param firstAbstractOnly locgical vector indicating whether to retain all available abstract fields for a given article record. See front end notation.
#' @param csv logical vector indicating whether the final data frame generated by the function should be saved to the working directory as a .csv file
#'
#' @examples ebscoBWR.f(csv=TRUE, path="C:/Users/JaneDoe/Desktop/EbscoHostTextFiles")
#' @note ebsoBWR.f will wrangle all text files present in the path folder.
#'
#' @export
ebscoBWR.f <- function(path, rmDuplicates = TRUE, firstAbstractOnly=TRUE, csv=FALSE){
#_______________________________________________________________________________
# MAIN TO-DO LIST
#-------------------------------------------------------------------------------
#
#
# 4. Include note on importance of naming files to give priority
#
#
#
# 8. Include a quality check at end of later sections to ensure TI and SO
# match and are not reduced in subsequent parsing.
#_______________________________________________________________________________
#_______________________________________________________________________________
# 1a. READ EBSCO txt files
#-------------------------------------------------------------------------------
#
#All files to be wrangled should be saved in a single folder and have a *.txt
#extension. The files must be processed from EbscoHost in the generic
#bibliographic format -- no other file structure will work.
#
#_______________________________________________________________________________
temp <- list.files(path, pattern = ".txt", full.names=TRUE)
readUTF <- function(x){
x <- readLines(x, encoding = "utf-8")
}
dat <- lapply(temp, readUTF)
attributes <- unlist(lapply(dat, function(x) stringi::stri_sub(x, 1,2)))
attributes.df <- data.frame(attributes)
#Extracts the character string in "dat" beginning with the fifth character(attribute codes and
#punctuation are contained in characters 1-4). These character strings are the record values.
record <- substring(unlist(dat), 5)
record.df <- data.frame(record)
DF <- cbind(attributes.df, record.df)
DF <- data.frame(lapply(DF, as.character), stringsAsFactors = FALSE)
#_______________________________________________________________________________
# 1b. Create article IDs
#-------------------------------------------------------------------------------
#
#Each article record contains a set of attributes related to the scholarly article including
#title, journal, author, author affilitation, year published,location in which the research
#was conducted, page length, etc. To be able to retrieve all available data for a given article
# record, the same articleID is assigned to all attributes of that record.
#_______________________________________________________________________________
#Most article records are divided by a blank row, however this is not always the case. The
#following code assesses for a blank row following an article record which always ends with
#the UR attribute field. If a blank row is not present following an article record, one will
#be added.
blank <- c("", "", "")
DF <- rbind(blank, DF)
DF.temp <- DF
DF.temp$tempID <- as.numeric(1:length(DF.temp$attributes))
x1 <- which(DF.temp$attributes == "UR")
x2 <- x1+1
DF.x2 <- DF.temp[x2, ]
duplicate <- filter(DF.x2, attributes != "")
ID <- as.numeric(duplicate$tempID)
ID.placement <- as.numeric(ID - 0.5)
blank.rows <- rep("", length(ID.placement))
blank.frame <- cbind(blank.rows, blank.rows)
blank.frame <- as.data.frame(cbind(blank.frame, as.numeric(ID.placement)))
colnames(blank.frame) <- c("attributes", "record", "tempID")
blank.frame$tempID <- as.character(blank.frame$tempID)
blank.frame$tempID <- as.numeric(blank.frame$tempID)
DF.temp <- rbind(DF.temp, blank.frame)
DF.temp <- DF.temp[order(DF.temp$tempID),]
DF <- select(DF.temp, -tempID)
##articleIDs assigned following insertion of blank row following UR entries
DF$articleID <- cumsum(DF$attributes =="")
#_______________________________________________________________________________
# 2a. LABEL SECOND TI FIELDS
#-------------------------------------------------------------------------------
#
# Some records contain multiple TI fields when there is a translated title or the article
# titlewas unintentially split between two fields. To get an accurate count of the number
# of unique titles, the extra TI fields must be re-labeled.
#______________________________________________________________________________
DF<-DF[order(DF$articleID, DF$attributes),]
tempID <- 1:length(DF$record)
DF.temp <- cbind(DF, tempID)
x1 <- which(DF.temp$attributes == "TI")
x2 <- x1+1
DF.x2 <- DF.temp[x2, ]
duplicate <- filter(DF.x2, attributes == "TI")
duplicate.ind <- duplicate$tempID
DF.holder <- mutate(DF.temp, attributes=ifelse(tempID%in%duplicate.ind, "secondTitle", attributes))
DF <- select(DF.holder, -tempID)
#_______________________________________________________________________________
# 2b. REMOVE MULTIPLE AB FIELDS
#-------------------------------------------------------------------------------
#
# SSA records contain two AB fields. These AB records are always serially
# serially ordered, so the extraction method in 2a is reapplied here to detect the presence of
#more than one AB attribute for a given article record. An if statement is built in to allow
#the user to retain mutliple AB fields for article records if they desire, but the default
#setting is that those abstract fields beyond the first be removed.
#_______________________________________________________________________________
if (firstAbstractOnly) {
tempID <- 1:length(DF$record)
DF.temp <- cbind(DF, tempID)
x1 <- which(DF.temp$attributes == "AB")
x2 <- x1+1
DF.x2 <- DF.temp[x2, ]
duplicate <- filter(DF.x2, attributes == "AB")
duplicate.ind <- duplicate$tempID
DF.holder <- DF.temp[!(DF.temp$tempID %in% duplicate.ind),]
DF <- select(DF.holder, -tempID)
}
#_______________________________________________________________________________
# 3. REMOVE DUPLICATE RECORDS
#-------------------------------------------------------------------------------
# In the prior section, second title (TI) fields were identified and re-labeled.
# These were non-matching titles because, for a given article, either one title is in
# English and the other is in a foreing language, or the subtitle was placed in a second TI
# field. In this section, the code is doing a global match for duplicate article records
# based on the title. Duplicates occur because multiple databases have overlapping indexing.
#_______________________________________________________________________________
if (rmDuplicates) {
#Select out all titles
DF.temp <- filter(DF, attributes == "TI")
#Journal titles show discrepancies in capitalization rules. Force all to
#lower to address this problem. Further testing should consider stripping
#white space.
DF.temp$record <- tolower(DF.temp$record)
DF.temp$record <- gsub("[[:punct:]]", "", DF.temp$record)
DF.temp$record <- gsub("[[:space:]]", "", DF.temp$record)
#Find duplicated records - duplicates are marked as true
DF.temp <- DF.temp[duplicated(DF.temp$record), ]
#Screen out duplicated records by articleID. The articleID must be used
#because the duplicate title contains other article attributes
DF.duplicated.ID <- DF.temp$articleID
DF <- DF[!(DF$articleID %in% DF.duplicated.ID), ]
}
#_______________________________________________________________________________
# 4. CLEAN JOURNAL NAMES AND MERGE JOURNAL NAME FIELDS (SO AND JN)
#-------------------------------------------------------------------------------
#
#Journals that have a Special Issue are not grouped together as the same
#title. These subtitles need to be removed. Some articles have both the JN
#and SO code. Thus, keeping both results in an excess count. A quality control
#check is to ensure the number of article titles is exactly equal to the
#unique number of journal title entries
#_______________________________________________________________________________
##Articles drawn from MEDLINE and CINAHL have the year embedded in the journal title string.
#The year needs to be extracted before journal title field is cleaned.
holder <- filter(DF, grepl(".*\\(.*\\)\\,\\s[[:digit:]]*\\-[[:digit:]]*\\.", record))
holder <- filter(holder, attributes == "SO")
holder <- mutate(holder, YR = gsub(".*\\(.*\\)\\,\\s[[:digit:]]*\\-[[:digit:]]*\\.", "", record))
holder <- mutate(holder, YR = gsub("\\;.*", "", YR))
holder <- mutate(holder, YR = gsub("[[:alpha:]]", "", YR))
holder <- mutate(holder, YR = gsub("[[:digit:]]{2}/[[:digit:]]{2}/", "", YR))
holder <- mutate(holder, YR = gsub("[[:punct:]]", "", YR))
holder <- mutate(holder, YR = gsub("[[:blank:]]", "", YR))
holder <- mutate(holder, YR = strtrim(YR, 4))
holder$YR[holder$YR == "90"] <- "1990"
holder$YR[holder$YR == "91"] <- "1991"
holder$YR[holder$YR == "92"] <- "1992"
holder$YR[holder$YR == "93"] <- "1993"
holder$YR[holder$YR == "94"] <- "1994"
holder$YR[holder$YR == "95"] <- "1995"
holder$YR[holder$YR == "96"] <- "1996"
holder$YR[holder$YR == "97"] <- "1997"
holder$YR[holder$YR == "98"] <- "1998"
holder$YR[holder$YR == "99"] <- "1999"
merge <- select(holder, attributes, record = YR, articleID)
merge <- mutate(merge, attributes = "YR")
DF <- rbind(DF, merge)
rm(holder, merge)
##Now remove special issue titles
DF$record[DF$attributes == "SO"] <- gsub(" Special Issue", "",
DF$record[DF$attributes == "SO"])
DF$record[DF$attributes == "SO"] <- gsub(":.*", "",
DF$record[DF$attributes == "SO"])
DF$record[DF$attributes == "JN"] <- gsub(" Special Issue", "",
DF$record[DF$attributes == "JN"])
DF$record[DF$attributes == "JN"] <- gsub(":.*", "",
DF$record[DF$attributes == "JN"])
#Create separate data files filtered by SO and JN, and a set of unique ID's
journal.unique.SO <- filter(DF, attributes == "SO")
journal.unique.JN <- filter(DF, attributes == "JN")
articleID.unique <- unique(DF$articleID)
#Which ID's overlap from JN to SO? This shows articles with both JN and SO
#fields.
JN.in.SO <- journal.unique.JN$articleID %in% journal.unique.SO$articleID
#Filter out the ID from the JN that overlap with SO
journal.unique.JN <- journal.unique.JN[!(JN.in.SO), ]
journal.unique.JN <- mutate(journal.unique.JN, attributes = "SO")
DF <- filter(DF, attributes != "JN")
DF <- rbind(DF, journal.unique.JN)
#_______________________________________________________________________________
# 5. COMBINE YEAR FIELDS
#-------------------------------------------------------------------------------
#
# Year fields must be combined because each database uses a separate code.
# psychInfo uses the YR field; SSA, PD; and SWA, PD. It is not possible to
# Simply rename PD to YR, because psycInfo also uses a PD field as another
# variable. Thus, the PD values that are specific to psycInfo need to be
# eliminated before it can be replaced by the year (PD) field from SSA.
#_______________________________________________________________________________
DF$attributes <- ifelse(DF$attributes == "PY", "YR", DF$attributes)
DF.temp <- DF
DF.temp <- filter(DF.temp, attributes == "PD")
# PsycINFO ocassionally marks an articles PubMed ID using the "PD" code. These are
#6-8 digits long, and so a regular expression is used to detect them and rename the attribute
DF.temp<- DF.temp %>%
mutate(attributes=ifelse(grepl("[0-9]{6,8}", record), "pubMedID", attributes))
#Eliminates "Bibiliography", "Graph" and "Table" from PD field
DF.temp$record<-sub("[BGT]?[a-z]{4,11}", "", DF.temp$record)
# Extract the first portion of the dates, up to the point with a 2 or 4
# digit year value. This also captures some letters and characters.
DF.temp$record <- stringr::str_extract(DF.temp$record,
"[$/A-Za-z0-9]+\\d{2,4}")
#Exclude the characters
DF.temp$record <- gsub("[/A-Za-z]", "", DF.temp$record)
#Add 19 to all the records with just two digits
DF.temp <- DF.temp[!is.na(DF.temp$record), ]
DF.flag <- mutate(DF.temp,
flag = ifelse(nchar(DF.temp$record) == 2, "1", "0" )) %>%
filter(flag == "1") %>%
group_by(record) %>%
summarize(N = n())
colnames(DF.flag)<-c("Year", "N")
DF.temp$record <- ifelse(as.numeric(DF.temp$record) < 100,
paste("19", DF.temp$record, sep=""), DF.temp$record)
DF.temp <- mutate(DF.temp, attributes = ifelse(attributes=="pubMedID", "pubMedID", "YR"))
DF <- rbind(DF, DF.temp)
DF <- arrange(DF, articleID)
##Add additional code to extract data from articles indexed in MEDLINE and CINAHL
#_______________________________________________________________________________
# 6a. AUTHOR FIELD FIX - DIGITS
#-------------------------------------------------------------------------------
#
# For articles pulled in from SSA, superscripts used to footnote affiliation get
# added to author's first name in the AU field
#_______________________________________________________________________________
DF.temp <- filter(DF, attributes == "AU")
DF.temp$record <- gsub("[[:digit:]]", "", DF.temp$record)
DF <- filter(DF, attributes != "AU")
DF <- rbind(DF, DF.temp)
DF <- arrange(DF, articleID)
#_______________________________________________________________________________
# 6b. AUTHOR FIELD FIX - AUTHORS IN SINGLE FIELD
#-------------------------------------------------------------------------------
#
# Social Work abstracts lists authors in a single cell, separated by a semi-
# colon, and then includes digits and email addresses in some occassions.
# The following text locates each author in the cell and places it into a new
# row to be consistent with PsychInfo and SSA.
#_______________________________________________________________________________
#Create a new temporary data frame that will be used to check for the presence
#of semi-colons in the author field
DF.temp <- filter(DF, attributes == "AU")
#Identify records with semi-colons in author names
semi.colons <- grepl("(;)", DF.temp$record)
#Select out those records with semi-colons in author names from temporary
#data frame
DF.temp <- DF.temp[semi.colons, ]
#The following will be run if semi-colons are present.
#First, add a semi colon to the end of every string
if (length(DF.temp$record >= 1)) {
DF.temp$record <- paste(DF.temp$record, ";", sep="")
semi.colon.split <- strsplit(DF.temp$record, ";")
split.df <- data.frame(
attributes = rep(DF.temp$attributes, lapply(semi.colon.split, length)),
record = unlist(semi.colon.split),
articleID = rep(DF.temp$articleID, lapply(semi.colon.split, length)))
#Trim whitespace on both sides
split.df$record <- stringr::str_trim(split.df$record, side = "both")
#The author field is problematic because it contains some email addresses.
#Some fields have been improperly split because they were split on a semi-colon
#that was in the middle of the filed.
#Create a pattern that eliminates possible emails
split.df$record <- ifelse(grepl("@", split.df$record) == TRUE, "",
split.df$record)
split.df$record <- ifelse(nchar(split.df$record) <= 3, "",
split.df$record)
split.df$record <- ifelse(grepl("\\.", split.df$record) == FALSE, "",
split.df$record)
split.df$record <- ifelse(grepl("&", split.df$record) == TRUE, "",
split.df$record)
split.df <- filter(split.df, record != "")
# Create a vector of all articleID's that were fixed
fixed.ID <- unique(split.df$articleID)
#Filter out all processed records from the fixed list
DF.authors <- filter(DF, attributes == "AU")
DF.authors.good <- DF.authors[!(DF.authors$articleID %in% fixed.ID),]
DF.authors.fixed <- split.df
DF.no.authors <- filter(DF, attributes != "AU")
#Bind the reduced DF with the fixed df
DF <- rbind(DF.no.authors, DF.authors.good, DF.authors.fixed)
DF <- arrange(DF, articleID)
}
#_______________________________________________________________________________
# 6c. E-mails
#-------------------------------------------------------------------------------
#
# After fixing the author fields in 6b, subsequent testing revealed a large
# number of email addresses that were still in the datafile and note excluded.
# This section is a patch for this issue. This code should be re-written
# and integrated with the prior section.
#_______________________________________________________________________________
DF.temp <- filter(DF, attributes == "AU")
sub.1 <- sub("^([^,]*,[^,]*),.*", "\\1", DF.temp$record)
sub.2 <- sub("[,\\.][a-zA-Z]{1,}@", "", sub.1)
sub.3 <- sub("@[a-zA-Z0-9.\\]{1,}", "", sub.2)
sub.4 <- sub("(\\s[a-z]{1,})$", "", sub.3)
sub.5 <- sub("(/&;#]+)", "", sub.4)
DF.temp$record <- sub.5
DF <- filter(DF, attributes != "AU")
DF <- rbind(DF, DF.temp)
DF <- arrange(DF, articleID)
#_______________________________________________________________________________
# 7. Minor Cleaning
#-------------------------------------------------------------------------------
#
# In this section, meaningful variable names are assigned to variables that have
# been cleaned and are appropriate for analysis. All other variables are
# excluded to prevent inappropriate analyses.
#_______________________________________________________________________________
# Exclude UR record from the data file
DF$attributes <- ifelse(DF$attributes == "KW", "KP", DF$attributes)
DF$attributes <- ifelse(DF$attributes == "AD", "AF", DF$attributes)
DF$attributes <- ifelse(DF$attributes == "TI", "article", DF$attributes)
DF$attributes <- ifelse(DF$attributes == "AU", "author", DF$attributes)
DF$attributes <- ifelse(DF$attributes == "SO", "journal", DF$attributes)
DF$attributes <- ifelse(DF$attributes == "YR", "pubYear", DF$attributes)
DF$attributes <- ifelse(DF$attributes == "AB", "abstract", DF$attributes)
DF$attributes <- ifelse(DF$attributes == "KP", "keyWord", DF$attributes)
DF$attributes <- ifelse(DF$attributes == "LO", "location", DF$attributes)
DF$attributes <- ifelse(DF$attributes == "S2", "journalSecondary",
DF$attributes)
DF$attributes <- ifelse(DF$attributes == "AF", "authorAff", DF$attributes)
DF$attributes <- ifelse(DF$attributes == "PG", "pages", DF$attributes)
variables.to.keep <- c("article", "author", "journal", "pubYear", "abstract",
"keyWord", "location", "journalSecondary", "authorAff",
"pages", "pubMedID", "secondTitle")
DF <- DF[DF$attributes %in% variables.to.keep, ]
# Strip white-space
DF$record <- stringr::str_trim(DF$record, side="both")
DF$record <- ifelse(DF$attributes == "keyWord", tolower(DF$record), DF$record)
# Remove rownames
rownames(DF) <- NULL
# Reorder variables
DF <- select(DF, articleID, attributes, record)
# write to CSV if necessary
if (csv) {
write.csv(DF, "ebscoBWR.csv")
message("The `ebscoBWR.csv` file can be found in your working directory: ", getwd())
return(invisible(DF))
}
DF
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.