# R/proquestBWR.R

#' Convert ProQuest bibliographic text exports into an analyzable data frame
#'
#' Reads every `.txt` file found in `path`, parses the `Field: value` lines of
#' each ProQuest record, keeps a fixed set of bibliographic fields, optionally
#' drops articles whose (lower-cased) title has already been seen, and splits
#' semicolon-separated author and subject fields into one row per value.
#'
#' @param path path for the folder containing text files to be wrangled
#' @param rmDuplicates logical vector indicating whether the function should identify and remove duplicate article records
#' @param csv logical vector indicating whether the final data frame generated by the function should be saved to the working directory as a .csv file
#'
#' @return A long-format table with the columns `articleID`, `attributes` and
#'   `record`. The `attributes` values are `article`, `author`, `journal`,
#'   `pubYear`, `abstract`, `keyWord`, `location` and `pages`. When
#'   `csv = TRUE` the table is also written to `proQuestBWR.csv` in the working
#'   directory and returned invisibly. A warning is raised when `path` contains
#'   no `.txt` files.
#'
#' @examples
#' \dontrun{
#' proQuestBWR.f(csv=TRUE, path="C:/Users/JohnDoe/Desktop/ProQuestTextFiles")
#' }
#' @note proQuestBWR.f will wrangle all text files present in the path folder.
#'
#' @export
proQuestBWR.f <- function(path, rmDuplicates=TRUE, csv = FALSE){

  # Split a multi-valued field into one row per value.  `df` must carry the
  # columns `attributes`, `record` and `articleID`; the result is a plain
  # data frame with the same three columns.
  splitRecords <- function(df, sep = ";") {
    # paste0() turns a zero-length input into a single ";" string, which would
    # produce columns of different lengths below, so handle "nothing to
    # split" explicitly.
    if (nrow(df) == 0) {
      return(data.frame(attributes = character(0),
                        record = character(0),
                        articleID = integer(0),
                        stringsAsFactors = FALSE))
    }

    # The trailing separator guarantees that an empty record still yields one
    # (empty) piece instead of disappearing from the output.
    pieces <- strsplit(paste0(df$record, sep), sep, fixed = TRUE)
    n.pieces <- lengths(pieces)

    data.frame(attributes = rep(df$attributes, n.pieces),
               record = as.character(unlist(pieces)),
               articleID = rep(df$articleID, n.pieces),
               stringsAsFactors = FALSE)
  }

  #___________________________________________________________________________
  #                           1. READ ProQuest txt files
  #---------------------------------------------------------------------------
  #
  # Check last line of txt file.  Clean up might be necessary
  #___________________________________________________________________________

  # `pattern` is a regular expression: escape the dot and anchor the match so
  # that only files whose name ends in ".txt" are read.
  txt.files <- list.files(path, pattern = "\\.txt$", full.names = TRUE)
  if (length(txt.files) == 0) {
    warning("No .txt files found in: ", path, call. = FALSE)
  }
  record <- as.character(unlist(lapply(txt.files, readLines, warn = FALSE)))

  # Everything up to (and including) the first colon is the field label
  attributes <- stringr::str_extract(record, "(.*?):")
  full.df <- data.frame(attributes = attributes, record = record,
                        stringsAsFactors = FALSE)

  # Records are separated by a line of underscores; number them before any
  # rows are dropped so that every line keeps the ID of its own record.
  full.df$articleID <- cumsum(grepl("________", full.df$record))

  full.df <- dplyr::as_tibble(full.df) #Convert to tibble for easier reading

  # Eliminate empty rows and rows without a field label
  full.df <- full.df[full.df$record != "" & !is.na(full.df$attributes), ]

  # "Publication title:" -> "Publicationtitle"
  full.df$attributes <- gsub("[ :]", "", full.df$attributes)

  # Drop the field label from the value
  full.df$record <- sub("^[^:]+:", "", full.df$record)

  # Publication details look like "Journal name (volume/date ...)": keep the
  # part before the first parenthesis only.
  is.details <- full.df$attributes == "Publicationdetails"
  full.df$record[is.details] <- gsub("\\s\\(.*", "", full.df$record[is.details])

  # Harmonise field labels across databases
  relabel <- c(Title = "Article",
               Documentauthor = "Author",
               Publicationdetails = "Publicationtitle",
               Subject = "keyWord")
  hit <- match(full.df$attributes, names(relabel))
  full.df$attributes[!is.na(hit)] <- unname(relabel[hit[!is.na(hit)]])

  attributeKeep <- c("Article", "Author", "Publicationtitle",
                     "Publicationyear", "Abstract", "keyWord", "Location", "Pages")
  full.df <- full.df[full.df$attributes %in% attributeKeep, ]


  #_______________________________________________________________________________
  #                    2.  REMOVE DUPLICATE RECORDS
  #-------------------------------------------------------------------------------
  # In this section, the code is doing a global match for duplicate article records
  # based on the title. Duplicates occur because multiple databases have overlapping indexing.
  #_______________________________________________________________________________

  if (rmDuplicates) {

    titles <- full.df[full.df$attributes == "Article", ]

    #Article titles show discrepancies in capitalization rules.  Force all to
    #lower to address this problem.  Further testing should consider stripping
    #white space.
    is.duplicate <- duplicated(tolower(titles$record))

    #Screen out duplicated records by articleID. The articleID must be used
    #because the other attributes of a duplicate article have to go as well.
    duplicated.ID <- titles$articleID[is.duplicate]
    full.df <- full.df[!(full.df$articleID %in% duplicated.ID), ]
  }


  #_______________________________________________________________________________
  #            3. AUTHOR FIELD FIX - AUTHORS IN SINGLE FIELD
  #-------------------------------------------------------------------------------
  #
  # Social Work abstracts lists authors in a single cell, separated by a semi-
  # colon, and then includes digits and email addresses on some occasions.
  # The following code locates each author in the cell and places it into a new
  # row to be consistent with PsychInfo and SSA.
  #_______________________________________________________________________________

  is.author <- full.df$attributes == "Author"
  DF.authors <- full.df[is.author, ]
  DF.no.authors <- full.df[!is.author, ]

  #Split the author records that hold several semicolon-separated names
  has.semi.colon <- grepl(";", DF.authors$record, fixed = TRUE)
  DF.authors.fixed <- splitRecords(DF.authors[has.semi.colon, ])

  #Trim whitespace on both sides
  DF.authors.fixed$record <- stringr::str_trim(DF.authors.fixed$record,
                                               side = "both")

  #Eliminate author affiliation from author record
  DF.authors.fixed$record <- gsub("1\\s.+", "", DF.authors.fixed$record)

  #Eliminate remaining number from author record
  DF.authors.fixed$record <- gsub("1", "", DF.authors.fixed$record)

  #Keep the untouched author rows only for articles that were not split.
  #NOTE(review): this filters by articleID, so an unsplit author row that
  #belongs to an article with a split author row is dropped - confirm intended.
  fixed.ID <- unique(DF.authors.fixed$articleID)
  DF.authors.good <- DF.authors[!(DF.authors$articleID %in% fixed.ID), ]

  #Strip author biographies
  DF.authors.good$record <- gsub("11\\s.+", "", DF.authors.good$record)

  #Bind the reduced DF with the fixed df; order() is stable, so rows keep
  #their relative order within an article.
  full.df <- rbind(DF.no.authors, DF.authors.good, DF.authors.fixed)
  full.df <- full.df[order(full.df$articleID), ]

  #_______________________________________________________________________________
  #           4. KEYWORD FIELD FIX - KEYWORDS IN SINGLE FIELD
  #-------------------------------------------------------------------------------
  #
  # Subject terms arrive in a single cell, separated by semicolons.  The
  # following code places each term into its own row, strips asterisks and
  # surrounding whitespace, and lower-cases the term.
  #_______________________________________________________________________________

  is.keyword <- full.df$attributes == "keyWord"
  DF.subject.fixed <- splitRecords(full.df[is.keyword, ])

  DF.subject.fixed$record <- gsub("\\*", "", DF.subject.fixed$record)
  DF.subject.fixed$record <- stringr::str_trim(DF.subject.fixed$record,
                                               side = "both")
  DF.subject.fixed$record <- tolower(DF.subject.fixed$record)

  #Bind the reduced DF with the fixed df
  DF <- rbind(full.df[!is.keyword, ], DF.subject.fixed)
  DF <- DF[order(DF$articleID), ]


  #_______________________________________________________________________________
  #                       5. Minor Cleaning
  #-------------------------------------------------------------------------------
  #
  # In this section, meaningful variable names are assigned to variables that have
  # been cleaned and are appropriate for analysis.  All other variables are
  # excluded to prevent inappropriate analyses.
  #_______________________________________________________________________________

  DF$attributes <- tolower(DF$attributes)
  final.label <- c(keyword = "keyWord",
                   publicationtitle = "journal",
                   publicationyear = "pubYear")
  hit <- match(DF$attributes, names(final.label))
  DF$attributes[!is.na(hit)] <- unname(final.label[hit[!is.na(hit)]])

  # Strip white-space
  DF$record <- stringr::str_trim(DF$record, side="both")

  rownames(DF) <- NULL
  DF <- DF[, c("articleID", "attributes", "record")]

  # write to CSV if necessary
  if (csv) {
    utils::write.csv(DF, "proQuestBWR.csv")
    message("The `proQuestBWR.csv` file can be found in your working directory: ", getwd())
    return(invisible(DF))
  }

  DF
}
# bryanvictor/BibWrangleR documentation built on May 13, 2019, 8:11 a.m.