R/table_trips.R

#' Read pdf pay statements and return a df of trips
#'
#' This function takes pdfs of pay statement that are downloaded from uber
#'   https://partners.uber.com/p3/money/statements/all
#'   The settings must correspond to: portrait mode (i.e. wide), paper size A4,
#'   margins none. It returns a table of the trip data.
#' @param pdf_list a list of the pdf files with the payment info (may be a single pdf)
#' @keywords uber accounting pdf
#' @import pdftools
#' @import stringr
#' @import magrittr
#' @import dplyr
#' @import plyr
#' @import lubridate
#' @export
#' @examples
#' stub.dir <- "/Users/UBER Income/pay statements/"
#' pay.files <- paste0( stub.dir, list.files( stub.dir) )
#' table_trips( pay.files )

table_trips <- function( pdf_list ) {

  trip_table <- data_frame()

  for(i in 1:length(pdf_list) ) {
    trip_table <- bind_rows( trip_table, .read_stub( pdf_list[i] ) )
  }

  # sometimes tips come in late, so add these to the original payments
  table_out <- trip_table %>%
    group_by( Date.Time, Trip_ID, Type ) %>%
    dplyr::summarise( Fare = sum(Fare),
                      Tips = sum(Tips),
                      Boost = sum(Boost),
                      Total = sum(Total) )

  return(table_out)
}


.read_stub <- function( pdf_stub ) {

  txt <- pdf_text( pdf_stub )

  txt.all <- data_frame( )  # initialize the df of text
  for( i in 1: length(txt) ) {
    txt.page <- unlist( str_split(txt[i], "\\n") )

    for( j in 1:length(txt.page ) ) {
      txt.all <- bind_rows(
        txt.all, data_frame( line.txt = as.character( txt.page[j] ) )
      )
    }
  }

  # find the start of the earnings data and remove all lines above it
  for(k in 1:nrow(txt.all) ) {
    row.data <- unlist( str_split( txt.all[k,], " ") )
    row.data <- row.data[! row.data == ""]
    if( length(row.data) == 0 ) {  # avoid breaking on empty rows
      next
    }
    if( row.data[1] == "Date/Time") {
      earnings.start <- k
      break
    }
  }

  e.data <- txt.all[earnings.start:nrow(txt.all),]

  data.names <- unlist( str_split( e.data[1,], " ") )
  # are there miscellaneous items? If so then make a table of those
  if( "Misc." %in% data.names ) {
    misc.data <- .table_misc( txt.all )
    misc.earn <- TRUE
  } else { misc.earn <- FALSE }

  data.names <- data.names[ ! data.names %in% c("", "ID", "Earnings", "Misc.")]
  data.names[1] <- "Date.Time"
  data.names[2] <- "Trip_ID"
  # note Boost is Earnings Boost

  # Will need these when not all data is populated
  data.names.5 <- data.names[ ! data.names %in% c("Tips", "Boost") ]
  data.names.6t <- data.names[ ! data.names == "Boost" ]
  data.names.6b <- data.names[ ! data.names == "Tips" ]

  if( ! "Tips" %in% c(data.names) ) {
    my.trips <- .table_historical( e.data, incl.tips = TRUE )
  } else {
    #----- loop through each line in e.data to get the trips --------------------
    # set the new_page token
    new_page <- FALSE

    # initialize the return df
    my.trips <- data_frame( )

    for( i in 1:nrow(e.data) ) {
      deliv.data <- unlist( str_split( e.data[i,], " ") )
      deliv.data <- deliv.data[ ! deliv.data %in% ""]

      # different lengths correspond to different objects
      # length = 0: empty line, skip
      if( length( deliv.data ) == 0) {
        next
      }

      # new pages start with var names and then totals, skip these using new_page token
      if( substring( deliv.data[1], 1, 4) == "Date" ) {
        new_page <- TRUE
        next
      }

      # the first line just contains totals, so skip this
      if( new_page == TRUE ) {
        new_page <- FALSE
        next
      }

      # length = 4 and made it this far, then it is a date, get this value
      if( length( deliv.data ) == 4 ) {
        deliv.date <- paste( deliv.data[1], deliv.data[2], deliv.data[3], sep = " ")
        next
      }

      # if you made it this far, then the line is a delivery of length 6,7,8
      deliv.data[1] <- paste( deliv.date, deliv.data[1], deliv.data[2], sep = " ")
      deliv.data <- deliv.data[ -2 ]

      # make into df first for binding rows
      deliv.df <- as.data.frame( t( deliv.data ) )

      # length = 5 (6-1) if no tip and no boost, need to add
      if( ncol( deliv.df ) == 5) {
        names( deliv.df ) <- data.names.5
        deliv.df %<>% mutate( Tips = 0,
                              Boost = 0) %>%
          mutate_at( vars(Fare, Total),
                     funs( as.numeric( as.character(.))) ) %>%
          mutate_at( vars(Date.Time, Trip_ID, Type),
                     funs( as.character(.)) ) %>%
          dplyr::select( data.names )  # make sure it is in the right order
      }

      # length = 6 (7-1) missing either tip or boost, need to add
      if( ncol( deliv.df) == 6) {
        # The item in question, it5, could either be a tip or boost. If it is a
        #   tip, then there will be more white space between it5 and Total than
        #   between Fare and it5. The opposite will be true if it is Boost
        # 11/22/17- patch locate. Values can show up in Trip ID, so get all and
        #  use the last one.
        fare.pos <- str_locate_all(e.data[i,], deliv.data[4]) %>% as.data.frame()
        it5.pos <- str_locate_all(e.data[i,], deliv.data[5]) %>% as.data.frame()
        total.pos <- str_locate_all(e.data[i,], deliv.data[6]) %>% as.data.frame()
        white.1 <- it5.pos[nrow(it5.pos),1] - fare.pos[nrow(fare.pos),2]
        white.2 <- total.pos[nrow(total.pos),1] - it5.pos[nrow(it5.pos),2]

        # this won't work if it is a tip being added to a trip from a previous
        #   week because the total and the tip will be equal, so total.pos will
        #   return the position of it5
        if( deliv.data[5] == deliv.data[6] ) {
          names( deliv.df ) <- data.names.6t
          deliv.df %<>% mutate( Boost = 0)
        } else{
          if( white.1 > white.2 ) {
            # its boost...
            names( deliv.df ) <- data.names.6b
            deliv.df %<>% mutate( Tips = 0)
          } else {
            names( deliv.df ) <- data.names.6t
            deliv.df %<>% mutate( Boost = 0)
          }
        }

        deliv.df %<>%
          mutate_at( vars(Fare, Boost, Total),
                     funs( as.numeric( as.character(.))) ) %>%
          mutate_at( vars(Date.Time, Trip_ID, Type),
                     funs( as.character(.)) ) %>%
          dplyr::select( data.names )  # make sure it is in the right order
      }

      # length = 7 (8-1)-- all data, name and mutate to numeric
      if( ncol( deliv.df) == 7) {
        names( deliv.df ) <- data.names
        deliv.df %<>%
          mutate_at( vars(Fare, Boost, Total, Tips),
                     funs( as.numeric( as.character(.))) ) %>%
          mutate_at( vars(Date.Time, Trip_ID, Type),
                     funs( as.character(.)) )
      }

      my.trips <- bind_rows( my.trips, deliv.df )
    }

    my.trips %<>%
      mutate( Date.Time = mdy_hm( Date.Time ) )
  }

  # add misc earnings if they are present
  if( misc.earn == TRUE ) {
    my.trips <- bind_rows( my.trips, misc.data )
  }

  return( my.trips )
}

# -----------------------------------------------------------------------------
# This makes the table for stubs from before tipping --------------------------
.table_historical <- function( txt_data, incl.tips = TRUE ) {

  data.names <- unlist( str_split( txt_data[1,], " ") )
  data.names <- data.names[ ! data.names %in% c("", "ID", "Earnings", "Misc.")]
  data.names[1] <- "Date.Time"
  data.names[2] <- "Trip_ID"
  data.names.5 <- data.names[ ! data.names == "Boost" ]

  # set the new_page token
  new_page <- FALSE

  # initialize the return df
  my.trips <- data_frame( )

  for( i in 1:nrow(txt_data) ) {
    deliv.data <- unlist( str_split( txt_data[i,], " ") )
    deliv.data <- deliv.data[ ! deliv.data %in% ""]

    # different lengths correspond to different objects
    # length = 0: empty line, skip
    if( length( deliv.data ) == 0) {
      next
    }

    # new pages start with var names and then totals, skip these using new_page token
    if( substring( deliv.data[1], 1, 4) == "Date" ) {
      new_page <- TRUE
      next
    }

    # the first line just contains totals, so skip this
    if( new_page == TRUE ) {
      new_page <- FALSE
      next
    }

    # length = 4 and made it this far, then it is a date, get this value
    if( length( deliv.data ) == 4 ) {
      deliv.date <- paste( deliv.data[1], deliv.data[2], deliv.data[3], sep = " ")
      next
    }

    # if you made it this far, then the line is a delivery of length 6,7,8
    deliv.data[1] <- paste( deliv.date, deliv.data[1], deliv.data[2], sep = " ")
    deliv.data <- deliv.data[ -2 ]

    # make into df first for binding rows
    deliv.df <- as.data.frame( t( deliv.data ) )

    # length = 5 (6-1) if no boost, need to add
    if( ncol( deliv.df ) == 5) {
      names( deliv.df ) <- data.names.5
      deliv.df %<>% mutate( Boost = 0) %>%
        mutate_at( vars(Fare, Total),
                   funs( as.numeric( as.character(.))) ) %>%
        mutate_at( vars(Date.Time, Trip_ID, Type),
                   funs( as.character(.)) ) %>%
        dplyr::select( data.names )  # make sure it is in the right order
    }

    # length = 6 (7-1)-- all data, name and mutate to numeric
    if( ncol( deliv.df) == 6) {
      names( deliv.df ) <- data.names
      deliv.df %<>%
        mutate_at( vars(Fare, Boost, Total ),
                   funs( as.numeric( as.character(.))) ) %>%
        mutate_at( vars(Date.Time, Trip_ID, Type),
                   funs( as.character(.)) )
    }

    my.trips <- bind_rows( my.trips, deliv.df )
  }

  my.trips %<>%
    mutate( Date.Time = mdy_hm( Date.Time ) )

  if( incl.tips == TRUE ) {
    my.trips %<>% mutate( Tips = 0) %>%
      select( Date.Time, Trip_ID, Type, Fare, Tips, Boost, Total)
  }
  return( my.trips )
}

# -----------------------------------------------------------------------------
# This makes a table for miscellaneous earnings -------------------------------
.table_misc <- function( txt_df ) {

  # ----- select the miscellaneous payments portion of the stub ----------
  # find the start of the miscellaneous items and get the pay date
  for(i in 1:nrow(txt_df) ) {
    row.data <- unlist( str_split( txt_df[i,], " ") )
    row.data <- row.data[! row.data == ""]
    if( row.data[1] == "Period") {
      misc.date <- str_c(row.data[4:length(row.data)], collapse = " ")
    }
    if( row.data[1] == "Miscellaneous") {
      misc.start <- i
      break
    }
  }

  # find the end of the misc items
  for(j in misc.start:nrow(txt_df) ) {
    row.data <- unlist( str_split( txt_df[j,], " ") )
    row.data <- row.data[! row.data == ""]
    if( length(row.data) == 0 ) {
      next
    }
    if( row.data[1] == "Date/Time") {
      misc.end <- j
      break
    }
  }

  misc.data <- txt_df[misc.start:(misc.end-1),]

  #----- loop through the data and make a row for each item ----------
  misc.df <- data_frame()  # init df
  run.over <- 0 # init the skip token

  for( k in 1:nrow(misc.data) ) {

    misc.item <- unlist( str_split( misc.data[k,], " ") )
    misc.item <- misc.item[! misc.item == ""]
    item.len <- length( misc.item )

    if( item.len == 0 ) { next }  # skip empty lines

    if( misc.item[1] == "Miscellaneous" ) { next } # skip new page headers

    if( k <= run.over ) { next }  # skip runover lines


    if( is.na( as.numeric(misc.item[item.len]) ) == FALSE ) {
      misc.desc <- str_c(misc.item[1: item.len-1], collapse = " ")
      misc.pay <- as.numeric( misc.item[item.len])
    } else {
      # if the description is too long, it will run on to the next line and the
      #   payment will be on its own line. This deals with that
      misc.desc <- str_c(misc.item, collapse = " ")

      for(n in (k+1):nrow(misc.data)) {
        misc.item <- unlist( str_split( misc.data[n,], " ") )
        misc.item <- misc.item[! misc.item == ""]
        item.len <- length( misc.item )
        # get the payment
        if( item.len == 1 & is.numeric( as.numeric(misc.item)) == TRUE) {
          misc.pay <- as.numeric( misc.item[1] )
        }
        # now I want to just get rid of run on text on extra lines
        #  so I need to find where this descriptions ends
        if( item.len > 1 & is.na( as.numeric(misc.item[item.len])) == FALSE ) {
          run.over <- n
          break
        }
        # if it didn't break then it is the last item in the data
        run.over <- n
      }
    }


    # format data and add it to the misc.df
    misc.df <- bind_rows(
      misc.df,
      data_frame( Date.Time = mdy_hm(misc.date),
                  Trip_ID = substring(misc.desc,1,36),
                  Type = "Misc.", Fare = 0, Tips = 0, Boost = 0,
                  Total = misc.pay ) )
  }

  return( misc.df )
}
SmarshMELLOW/ubeRpay documentation built on May 29, 2019, 2:33 p.m.