
#' Converts bibliographic text files exported from EbscoHost into an analyzable data frame
#'
#' @param path path to the folder containing the text files to be wrangled
#' @param rmDuplicates logical; if TRUE (the default), identify and remove duplicate article records
#' @param firstAbstractOnly logical; if TRUE (the default), retain only the first abstract field for a given article record and drop any additional AB fields
#' @param csv logical; if TRUE, save the final data frame to the working directory as a .csv file
#'
#' @examples
#' \dontrun{
#' ebscoBWR.f(csv = TRUE, path = "C:/Users/JaneDoe/Desktop/EbscoHostTextFiles")
#' }
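#' \dontrun{
#' # A second, hypothetical call: keep duplicate records and all abstract
#' # fields, returning the data frame without writing a .csv
#' dat <- ebscoBWR.f(path = "~/ebsco_exports", rmDuplicates = FALSE,
#'                   firstAbstractOnly = FALSE)
#' }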
#' @note ebscoBWR.f will wrangle all text files present in the path folder.
#'
#' @importFrom dplyr filter select mutate arrange group_by summarize n %>%
#' @export
ebscoBWR.f <- function(path, rmDuplicates = TRUE, firstAbstractOnly=TRUE, csv=FALSE){

  #_______________________________________________________________________________
  #                           MAIN TO-DO LIST
  #-------------------------------------------------------------------------------
  #
  #  4. Include a note on the importance of naming files to give priority.
  #
  #  8. Include a quality check at the end of later sections to ensure TI and SO
  #     match and are not reduced in subsequent parsing.
  #_______________________________________________________________________________


  #_______________________________________________________________________________
  #                           1a. READ EBSCO txt files
  #-------------------------------------------------------------------------------
  #
  #All files to be wrangled should be saved in a single folder and have a *.txt
  #extension.  The files must be processed from EbscoHost in the generic
  #bibliographic format -- no other file structure will work.
  #
  #_______________________________________________________________________________

  temp <- list.files(path, pattern = "\\.txt$", full.names = TRUE)

  #Read each file, marking the input as UTF-8 (readLines expects "UTF-8")
  readUTF <- function(x) readLines(x, encoding = "UTF-8")

  dat <- lapply(temp, readUTF)

  attributes <- unlist(lapply(dat, function(x) stringi::stri_sub(x, 1,2)))

  attributes.df <- data.frame(attributes)

  #Extracts the character string in "dat" beginning with the fifth character (attribute codes and
  #punctuation are contained in characters 1-4). These character strings are the record values.

  record <- substring(unlist(dat), 5)

  record.df <- data.frame(record)

  DF <- cbind(attributes.df, record.df)

  DF <- data.frame(lapply(DF, as.character), stringsAsFactors = FALSE)
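
  #Illustration with a hypothetical raw line: "TI: An Article Title" splits
  #into attribute "TI" (characters 1-2) and record "An Article Title"
  #(characters 5 onward).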


  #_______________________________________________________________________________
  #                           1b. Create article IDs
  #-------------------------------------------------------------------------------
  #
  #Each article record contains a set of attributes related to the scholarly article, including
  #title, journal, author, author affiliation, year published, location in which the research
  #was conducted, page length, etc. To be able to retrieve all available data for a given article
  #record, the same articleID is assigned to all attributes of that record.
  #_______________________________________________________________________________

  #Most article records are divided by a blank row; however, this is not always the case. The
  #following code checks for a blank row following each article record, which always ends with
  #the UR attribute field.  If a blank row is not present following an article record, one will
  #be added.

  blank <- c("", "")
  DF <- rbind(blank, DF)

  DF.temp <- DF

  DF.temp$tempID <- seq_along(DF.temp$attributes)

  x1 <- which(DF.temp$attributes == "UR")
  x2 <- x1+1
  DF.x2 <- DF.temp[x2, ]
  duplicate <- filter(DF.x2, attributes != "")

  ID <- as.numeric(duplicate$tempID)
  ID.placement <- as.numeric(ID - 0.5)

  blank.rows <- rep("", length(ID.placement))

  blank.frame <- data.frame(attributes = blank.rows,
                            record     = blank.rows,
                            tempID     = ID.placement,
                            stringsAsFactors = FALSE)

  DF.temp <- rbind(DF.temp, blank.frame)

  DF.temp <- DF.temp[order(DF.temp$tempID),]

  DF <- select(DF.temp, -tempID)

  ##articleIDs assigned following insertion of blank row following UR entries
  DF$articleID <- cumsum(DF$attributes =="")
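
  #Illustration: cumsum() increments at every blank attribute row, so for
  #hypothetical attributes c("", "TI", "AU", "", "TI") the IDs are 1,1,1,2,2 --
  #every attribute row between two blank rows shares one articleID.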


  #_______________________________________________________________________________
  #                     2a. LABEL SECOND TI FIELDS
  #-------------------------------------------------------------------------------
  #
  # Some records contain multiple TI fields when there is a translated title or the article
  # title was unintentionally split between two fields. To get an accurate count of the number
  # of unique titles, the extra TI fields must be re-labeled.
  #______________________________________________________________________________

  DF<-DF[order(DF$articleID, DF$attributes),]

  tempID <- seq_along(DF$record)
  DF.temp <- cbind(DF, tempID)

  x1 <- which(DF.temp$attributes == "TI")
  x2 <- x1+1
  DF.x2 <- DF.temp[x2, ]
  duplicate <- filter(DF.x2, attributes == "TI")
  duplicate.ind <- duplicate$tempID

  DF.holder <- mutate(DF.temp, attributes=ifelse(tempID%in%duplicate.ind, "secondTitle", attributes))

  DF <- select(DF.holder, -tempID)


  #_______________________________________________________________________________
  #                     2b. REMOVE MULTIPLE AB FIELDS
  #-------------------------------------------------------------------------------
  #
  # SSA records contain two AB fields.  These AB fields are always serially
  # ordered, so the extraction method in 2a is reapplied here to detect the presence of
  # more than one AB attribute for a given article record.  An if statement allows
  # the user to retain multiple AB fields for article records if desired, but by
  # default abstract fields beyond the first are removed.
  #_______________________________________________________________________________

  if (firstAbstractOnly) {

    tempID <- seq_along(DF$record)
    DF.temp <- cbind(DF, tempID)

    x1 <- which(DF.temp$attributes == "AB")
    x2 <- x1+1
    DF.x2 <- DF.temp[x2, ]
    duplicate <- filter(DF.x2, attributes == "AB")
    duplicate.ind <- duplicate$tempID

    DF.holder <- DF.temp[!(DF.temp$tempID %in% duplicate.ind),]

    DF <- select(DF.holder, -tempID)
  }


  #_______________________________________________________________________________
  #                    3.  REMOVE DUPLICATE RECORDS
  #-------------------------------------------------------------------------------
  # In the prior section, second title (TI) fields were identified and re-labeled.
  # These were non-matching titles because, for a given article, either one title is in
  # English and the other is in a foreign language, or the subtitle was placed in a second TI
  # field. In this section, the code is doing a global match for duplicate article records
  # based on the title. Duplicates occur because multiple databases have overlapping indexing.
  #_______________________________________________________________________________

  if (rmDuplicates) {

    #Select out all titles
    DF.temp <- filter(DF, attributes == "TI")

    #Article titles show discrepancies in capitalization, punctuation, and
    #spacing across databases.  Normalize them before comparing: force lower
    #case, then strip punctuation and white space.
    DF.temp$record <- tolower(DF.temp$record)
    DF.temp$record <- gsub("[[:punct:]]", "", DF.temp$record)
    DF.temp$record <- gsub("[[:space:]]", "", DF.temp$record)
    

    #Find duplicated records; duplicated() marks second and later occurrences
    #TRUE, so the first occurrence of each title is kept
    DF.temp <- DF.temp[duplicated(DF.temp$record), ]

    #Screen out duplicated records by articleID. The articleID must be used
    #because the duplicated article record also carries its other attribute rows
    DF.duplicated.ID <- DF.temp$articleID
    DF <- DF[!(DF$articleID %in% DF.duplicated.ID), ]
  }


  #_______________________________________________________________________________
  #     4.  CLEAN JOURNAL NAMES AND MERGE JOURNAL NAME FIELDS (SO AND JN)
  #-------------------------------------------------------------------------------
  #
  #Journals that have a Special Issue are not grouped together as the same
  #title. These subtitles need to be removed.  Some articles have both the JN
  #and SO code.  Thus, keeping both results in an excess count. A quality control
  #check is to ensure the number of article titles is exactly equal to the
  #unique number of journal title entries
  #_______________________________________________________________________________

  ##Articles drawn from MEDLINE and CINAHL have the year embedded in the journal title string.
  #The year needs to be extracted before the journal title field is cleaned.
  
  
  holder <- filter(DF, grepl(".*\\(.*\\)\\,\\s[[:digit:]]*\\-[[:digit:]]*\\.", record))
  holder <- filter(holder, attributes == "SO")
  
  holder <- mutate(holder, YR = gsub(".*\\(.*\\)\\,\\s[[:digit:]]*\\-[[:digit:]]*\\.", "", record))
  holder <- mutate(holder, YR = gsub("\\;.*", "", YR))
  holder <- mutate(holder, YR = gsub("[[:alpha:]]", "", YR))
  holder <- mutate(holder, YR = gsub("[[:digit:]]{2}/[[:digit:]]{2}/", "", YR))
  holder <- mutate(holder, YR = gsub("[[:punct:]]", "", YR))
  holder <- mutate(holder, YR = gsub("[[:blank:]]", "", YR))
  holder <- mutate(holder, YR = strtrim(YR, 4))
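
  #Illustration with a hypothetical SO value:
  #  "Social Service Review, 70(3), 391-405. 1996; ..." -- removing the text up
  #  to the page range leaves " 1996; ...", keeping only the part before ";"
  #  and stripping letters, punctuation, and blanks leaves "1996".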
  
  holder$YR[holder$YR == "90"] <- "1990"
  holder$YR[holder$YR == "91"] <- "1991"
  holder$YR[holder$YR == "92"] <- "1992"
  holder$YR[holder$YR == "93"] <- "1993"
  holder$YR[holder$YR == "94"] <- "1994"
  holder$YR[holder$YR == "95"] <- "1995"
  holder$YR[holder$YR == "96"] <- "1996"
  holder$YR[holder$YR == "97"] <- "1997"
  holder$YR[holder$YR == "98"] <- "1998"
  holder$YR[holder$YR == "99"] <- "1999"
  
  merge <- select(holder, attributes, record = YR, articleID)
  merge <- mutate(merge, attributes = "YR")
  
  DF <- rbind(DF, merge)
  
  rm(holder, merge)

  ##Now remove special issue titles

  DF$record[DF$attributes == "SO"] <- gsub(" Special Issue", "",
                                           DF$record[DF$attributes == "SO"])

  DF$record[DF$attributes == "SO"] <- gsub(":.*", "",
                                           DF$record[DF$attributes == "SO"])

  DF$record[DF$attributes == "JN"] <- gsub(" Special Issue", "",
                                           DF$record[DF$attributes == "JN"])

  DF$record[DF$attributes == "JN"] <- gsub(":.*", "",
                                           DF$record[DF$attributes == "JN"])

  #Create separate data files filtered by SO and JN, and a set of unique ID's
  journal.unique.SO <- filter(DF, attributes == "SO")
  journal.unique.JN <- filter(DF, attributes == "JN")
  articleID.unique <- unique(DF$articleID)

  #Which ID's overlap from JN to SO? This shows articles with both JN and SO
  #fields.
  JN.in.SO <- journal.unique.JN$articleID %in% journal.unique.SO$articleID

  #Filter out the ID from the JN that overlap with SO
  journal.unique.JN <- journal.unique.JN[!(JN.in.SO), ]
  journal.unique.JN <- mutate(journal.unique.JN, attributes = "SO")
  DF <- filter(DF, attributes != "JN")
  DF <- rbind(DF, journal.unique.JN)
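
  #Illustration: a hypothetical article indexed only with JN "Child Welfare"
  #is relabeled SO here, so journal counts can draw on the single SO field.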


  #_______________________________________________________________________________
  #                        5. COMBINE YEAR FIELDS
  #-------------------------------------------------------------------------------
  #
  # Year fields must be combined because each database uses a separate code.
  # PsycINFO uses the YR field; SSA and SWA use PD.  It is not possible to
  # simply rename PD to YR, because PsycINFO also uses a PD field for another
  # variable.  Thus, the PD values that are specific to PsycINFO need to be
  # eliminated before PD can serve as the year field from SSA.
  #_______________________________________________________________________________

  DF$attributes <- ifelse(DF$attributes == "PY", "YR", DF$attributes)

  DF.temp <- DF
  DF.temp <- filter(DF.temp, attributes == "PD")

  # PsycINFO occasionally marks an article's PubMed ID using the "PD" code.  These are
  # 6-8 digits long, so a regular expression is used to detect them and rename the attribute
  DF.temp<- DF.temp %>%
    mutate(attributes=ifelse(grepl("[0-9]{6,8}", record), "pubMedID", attributes))

  #Eliminates "Bibiliography", "Graph" and "Table" from PD field
  DF.temp$record<-sub("[BGT]?[a-z]{4,11}", "", DF.temp$record)

  # Extract the first portion of the dates, up to the point with a 2 or 4
  # digit year value. This also captures some letters and characters.
  DF.temp$record <- stringr::str_extract(DF.temp$record,
                                         "[$/A-Za-z0-9]+\\d{2,4}")

  #Drop the remaining letters and slashes, leaving only digits
  DF.temp$record <- gsub("[/A-Za-z]", "", DF.temp$record)

  #Prefix "19" to all the records with just two digits
  DF.temp <- DF.temp[!is.na(DF.temp$record), ]

  #Diagnostic: tally the two-digit year values before they are expanded
  DF.flag <- mutate(DF.temp,
                    flag = ifelse(nchar(DF.temp$record) == 2, "1", "0" )) %>%
    filter(flag == "1") %>%
    group_by(record) %>%
    summarize(N = n())

  colnames(DF.flag) <- c("Year", "N")

  DF.temp$record <- ifelse(as.numeric(DF.temp$record) < 100,
                           paste("19", DF.temp$record, sep=""), DF.temp$record)
  DF.temp <- mutate(DF.temp, attributes = ifelse(attributes=="pubMedID", "pubMedID", "YR"))

  DF <- rbind(DF, DF.temp)
  DF <- arrange(DF, articleID)

  ##TO DO: add additional code to extract data from articles indexed in
  ##MEDLINE and CINAHL

  #_______________________________________________________________________________
  #                            6a. AUTHOR FIELD FIX - DIGITS
  #-------------------------------------------------------------------------------
  #
  # For articles pulled in from SSA, superscripts used to footnote affiliation get
  # appended to the author's first name in the AU field
  #_______________________________________________________________________________

  DF.temp <- filter(DF, attributes == "AU")
  DF.temp$record <- gsub("[[:digit:]]", "", DF.temp$record)
  DF <- filter(DF, attributes != "AU")

  DF <- rbind(DF, DF.temp)
  DF <- arrange(DF, articleID)


  #_______________________________________________________________________________
  #            6b. AUTHOR FIELD FIX - AUTHORS IN SINGLE FIELD
  #-------------------------------------------------------------------------------
  #
  # Social Work Abstracts lists authors in a single cell, separated by semi-
  # colons, and on some occasions includes digits and email addresses.
  # The following code locates each author in the cell and places it into a new
  # row to be consistent with PsycINFO and SSA.
  #_______________________________________________________________________________

  #Create a new temporary data frame that will be used to check for the presence
  #of semi-colons in the author field
  DF.temp <- filter(DF, attributes == "AU")

  #Identify records with semi-colons in author names
  semi.colons <- grepl(";", DF.temp$record)

  #Select out those records with semi-colons in author names from temporary
  #data frame
  DF.temp <- DF.temp[semi.colons, ]

  #The following will be run if semi-colons are present.
  #First, add a semi-colon to the end of every string
  if (nrow(DF.temp) >= 1) {
    DF.temp$record <- paste(DF.temp$record, ";", sep="")
    semi.colon.split <- strsplit(DF.temp$record, ";")

    split.df <- data.frame(
      attributes = rep(DF.temp$attributes, lengths(semi.colon.split)),
      record     = unlist(semi.colon.split),
      articleID  = rep(DF.temp$articleID, lengths(semi.colon.split)),
      stringsAsFactors = FALSE)
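
    #Illustration with a hypothetical AU value: "Smith, J. A.; Doe, B. C."
    #splits into two rows, one per author, both under the same articleID.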

    #Trim whitespace on both sides
    split.df$record <- stringr::str_trim(split.df$record, side = "both")

    #The author field is problematic because it contains some email addresses.
    #Some fields have been improperly split because they were split on a semi-colon
    #that was in the middle of the field.

    #Blank out author entries that look like emails or split fragments
    split.df$record <- ifelse(grepl("@", split.df$record), "",
                              split.df$record)
    split.df$record <- ifelse(nchar(split.df$record) <= 3, "",
                              split.df$record)
    split.df$record <- ifelse(!grepl("\\.", split.df$record), "",
                              split.df$record)
    split.df$record <- ifelse(grepl("&", split.df$record), "",
                              split.df$record)
    split.df <- filter(split.df, record != "")

    # Create a vector of all articleID's that were fixed
    fixed.ID <- unique(split.df$articleID)

    #Filter out all processed records from the fixed list
    DF.authors <- filter(DF, attributes == "AU")
    DF.authors.good <- DF.authors[!(DF.authors$articleID %in% fixed.ID),]
    DF.authors.fixed <- split.df
    DF.no.authors <- filter(DF, attributes != "AU")

    #Bind the reduced DF with the fixed df
    DF <- rbind(DF.no.authors, DF.authors.good, DF.authors.fixed)
    DF <- arrange(DF, articleID)
  }


  #_______________________________________________________________________________
  #                       6c. E-mails
  #-------------------------------------------------------------------------------
  #
  # After fixing the author fields in 6b, subsequent testing revealed a large
  # number of email addresses that were still in the data file and not excluded.
  # This section is a patch for that issue.  The code should be re-written
  # and integrated with the prior section.
  #_______________________________________________________________________________

  DF.temp <- filter(DF, attributes == "AU")
  sub.1 <- sub("^([^,]*,[^,]*),.*", "\\1", DF.temp$record)
  sub.2 <- sub("[,\\.][a-zA-Z]{1,}@", "", sub.1)
  sub.3 <- sub("@[a-zA-Z0-9.\\]{1,}", "", sub.2)
  sub.4 <- sub("(\\s[a-z]{1,})$", "", sub.3)
  sub.5 <- sub("(/&;#]+)", "", sub.4)
  DF.temp$record <- sub.5
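
  #Illustration with a hypothetical AU value: "Smith, John, jsmith@uni.edu"
  #is reduced to "Smith, John" by the chain above.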

  DF <- filter(DF, attributes != "AU")
  DF <- rbind(DF, DF.temp)
  DF <- arrange(DF, articleID)


  #_______________________________________________________________________________
  #                       7. Minor Cleaning
  #-------------------------------------------------------------------------------
  #
  # In this section, meaningful variable names are assigned to variables that have
  # been cleaned and are appropriate for analysis.  All other variables are
  # excluded to prevent inappropriate analyses.
  #_______________________________________________________________________________

  # Harmonize attribute codes across databases (KW/KP, AD/AF), then assign
  # meaningful variable names via a lookup vector
  rename.map <- c(KW = "keyWord",   KP = "keyWord",
                  AD = "authorAff", AF = "authorAff",
                  TI = "article",   AU = "author",
                  SO = "journal",   YR = "pubYear",
                  AB = "abstract",  LO = "location",
                  S2 = "journalSecondary", PG = "pages")
  DF$attributes <- ifelse(DF$attributes %in% names(rename.map),
                          unname(rename.map[DF$attributes]),
                          DF$attributes)

  variables.to.keep <- c("article", "author", "journal", "pubYear", "abstract",
                         "keyWord", "location", "journalSecondary", "authorAff",
                         "pages", "pubMedID", "secondTitle")

  DF <- DF[DF$attributes %in% variables.to.keep, ]

  # Strip white-space
  DF$record <- stringr::str_trim(DF$record, side="both")

  DF$record <- ifelse(DF$attributes == "keyWord", tolower(DF$record), DF$record)

  # Remove rownames
  rownames(DF) <- NULL

  # Reorder variables
  DF <- select(DF, articleID, attributes, record)
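
  #The result is a long-format data frame, one row per attribute value, e.g.
  #(illustrative): articleID = 1, attributes = "article", record = "<title>"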

  # Write the data frame to CSV if requested
  if (csv) {
    write.csv(DF, "ebscoBWR.csv", row.names = FALSE)
    message("The `ebscoBWR.csv` file can be found in your working directory: ", getwd())
    return(invisible(DF))
  }

  DF
}