transcriptr: Read Transcript PDF Files

Documented in read_jhu_transcript read_transcript

#' Read JHU Transcript
#'
#' @param file path to the filename
#' @param remove_withdrawn should remove withdrawn courses
#'
#' @return A \code{tibble} of the data
#' @export
#'
#' @importFrom pdftools pdf_text
#' @importFrom dplyr mutate select filter recode bind_cols bind_rows
#' @importFrom dplyr as_tibble tibble %>% first
#' @importFrom tidyr separate
#' @importFrom zoo na.locf
#' @examples
#' file = "~/Downloads/Transcript.pdf"
#' if (file.exists(file)) {
#'     res = read_jhu_transcript(file)
#' }
read_jhu_transcript = function(
  file,
  remove_withdrawn = TRUE) {
  pdf_file = path.expand(file)
  pdf_file = normalizePath(pdf_file)
  res = pdf_text(pdf_file)

  full_term_name = credits = blah = is_term = term = NULL
  rm(list = c("credits", "blah", "is_term", "term", "full_term_name"))

  grade = NULL
  rm(list = c("grade"))
  # dat = readPDF(
  #   control = list(text="-raw"),
  #   engine = "xpdf")(
  #     elem = list(uri = path.expand(pdf_file)),
  #     language="en", id="id1")
  # dat = c(as.character(dat))


  # split_up = strsplit(res, split = "\n")
  # split_up = unlist(split_up)
  # max(nchar(split_up, type = "width"))

  res = trimws(res)
  res = strsplit(res, split = "\n")

  # res = lapply(res, trimws)
  x = res[[1]]
  res = lapply(res, function(x) {
    start = grep(pattern = "^COURSE NUMBER", trimws(x))
    start = start[length(start)]
    end = grep("^[***]", trimws(x))[1]
    if (length(end) == 0) {
      end = length(x)
    }
    stopifnot(length(start) == 1 && length(end) == 1)
    ind = seq(start + 1, end - 1)
    l = list(data = x[ind], other = x[-ind])
  })


  other = lapply(res, function(x) {
    x$other
  })

  res = lapply(res, function(x) {
    x$data
  })

  width = 109
  # r = res[[2]]

  cutter = function(x){
    nc = nchar(x)
    x = c(substr(x, 1, min(width, nc)),
          ifelse(nc > width + 1, substr(x, width + 1, nc),
                 ""))
    trimws(x)
  }

  # widths = c(11, 40, 5, 10)
  # cs = cumsum(widths)
  # cutter2 = function(x){
  #   x = trimws(x)
  #   x = gsub("\\s+", " ", x)
  #   nc = nchar(x)
  #   x = c(
  #     substr(x, 1, min(cs[1], nc)),
  #     ifelse(nc >=  cs[2],
  #            substr(x, cs[1] + 1, min(nc, cs[2])),
  #            ""),
  #     ifelse(nc >= cs[3],
  #            substr(x, cs[2] + 1, min(nc, cs[3])),
  #            ""),
  #     ifelse(nc >= cs[4],
  #            substr(x, cs[3] + 1, max(nc, cs[4])),
  #            "")
  #     )
  #   trimws(x)
  # }
  #
  # ss2 = lapply(res, function(r) {
  #   r = sapply(r, cutter2)
  #   r = t(r)
  # })
  # ss2 = do.call(rbind, ss2)
  # rownames(ss2) = NULL

  ss = lapply(res, function(r) {
    r = sapply(r, cutter)
    r = t(r)
  })

  other = lapply(other, function(r) {
    r = sapply(r, cutter)
    r = t(r)
  })

  ss = do.call(rbind, ss)
  rownames(ss) = NULL



  other = do.call(rbind, other)
  rownames(other) = NULL

  other = c(other[,1], other[,2])
  other = trimws(other)
  other = other[ other != "" ]
  stud_name = grep("student\\s+name", tolower(other))[1]
  stud_name = other[stud_name + 1]
  stud_name = strsplit(stud_name, split = "   ")[[1]]
  stud_name = trimws(stud_name)
  stud_name = stud_name[ stud_name != "" ]
  stud_name = stud_name[1:2]
  names(stud_name) = c("student_name", "student_id")


  ss = c(ss[,1], ss[,2])
  ss = trimws(ss)

  ss = ss[ ss != "" ]


  rem_ind = grepl("^GPA CRS", ss)
  if (any(rem_ind)) {
    remove = ss[rem_ind]
    end_correct = grepl("CRS: \\d*[.]\\d*$", remove)
    if (!all(end_correct)) {
      warning("May have removed records")
    }
    ss = ss[!rem_ind]
  }



  advisor_ind = grep("Advisor History", ss)
  if (length(advisor_ind) > 0) {
    award = grep("Awarded", ss)
    if (length(award) > 0) {
      award = award[ award > advisor_ind]
      award = max(award)
    } else {
      # may be wrong
      award = length(ss)
    }
    advisor = ss[ seq(advisor_ind, award)]
    advisor = advisor[ !grepl(
      "^[*]+\\s*End.*Transcript\\s*[*]*[*]$",
      advisor)]
    # remove after award
    # ss = ss[ -seq(advisor_ind, award)]
    ss = ss[ -seq(advisor_ind, length(ss))]
  } else {
    advisor = NULL
  }

  ss = dplyr::tibble(x = ss)

  term_str = "^2\\d*-\\d* .*(Winter|Summer|Term)   "
  ss = ss %>%
    mutate(is_term = grepl(term_str, x),
           term = ifelse(is_term, x, NA_character_),
           term = zoo::na.locf(term, na.rm = FALSE)) %>%
    filter(!is_term) %>%
    select(-is_term)
  ss = ss[
    !trimws(ss$x) %in% c("Intersession", "Institute", "Overview"),]

  x = strsplit(ss$x, split = "   ")
  x = lapply(x, function(r) {
    r = r[ r != ""]
    r = trimws(r)
    r
  })
  lengths = sapply(x, length)
  if (!all(lengths == 4)) {
    warning("Not all data may be retrieved!")
    message(ss[lengths != 4,])
    x = x[ lengths == 4 ]
    ss = ss[ lengths == 4, ]
    lengths = sapply(x, length)
  }
  stopifnot(all(lengths == 4))

  x = do.call("rbind", x)
  rownames(x) = NULL
  colnames(x) = c("course_number", "course_title", "grade", "credits")


  x = as_tibble(x)
  ss = bind_cols(ss, x)
  ss$x = NULL
  ss = ss %>%
    mutate(credits = as.numeric(credits))
  term = strsplit(ss$term, split = "   ")
  term = sapply(term, function(r) {
    r = r[ r != ""]
    r = trimws(r)
    r
  })
  term = t(term)
  colnames(term) = c("term", "program")
  ss$term = NULL
  term = dplyr::as_tibble(term)
  ss = bind_cols(ss, term)

  grade = ss$grade
  grade = strsplit(grade, split = " ")
  grade = sapply(grade, dplyr::first)
  ss$grade = grade

  ss = ss %>%
    dplyr::rename(full_term_name = term) %>%
    separate(full_term_name, into = c("year", "term", "blah"), sep = " ",
             extra = "merge", fill = "right",
             remove = FALSE) %>%
    select(-blah) %>%
    mutate(term = recode(term,
                         "Summer" = "0.5",
                         "First" = "1",
                         "Second" = "2",
                         "Winter" = "2.5",
                         "Third" = "3",
                         "Fourth" = "4"),
           term = as.numeric(term))
  if (remove_withdrawn) {
    ss = ss %>%
      filter( !(grade %in% "W"))
  }

  if (!is.null(advisor)) {
    attr(ss, "advisor_info") = advisor
  }
  attr(ss, "student_info") = stud_name

  return(ss)

}


# read_jhu_transcript2 = function(file) {
#   pdf_file = path.expand(file)
#   pdf_file = normalizePath(pdf_file)
#
#   credits = blah = is_term = term = NULL
#   rm(list = c("credits", "blah", "is_term", "term"))
#   dat = tm::readPDF(
#     control = list(text="-raw"),
#     engine = "xpdf")(
#       elem = list(uri = path.expand(pdf_file)),
#       language = "en", id = "id1")
#   dat = c(as.character(dat))
#   dat = dat[ !grepl("^[**]", dat) ]
#   dat = dat[ !grepl("^GPA CRS", dat) ]
#
# }


#' @rdname read_jhu_transcript
#' @param type Type of Transcript
#' @param ... additional arguments to pass to reader function
#' @export
read_transcript = function(file, type = "jhu", ...) {
  type = match.arg(type)
  func = switch(
    type,
    "jhu" = read_jhu_transcript
  )
  func(file, ...)
}