# R/Data_A.R

# Defines functions process_A

#' @import data.table
#' @importFrom utils str globalVariables
#' @importFrom lubridate myd

# Names given, in file order, to the 25 columns that process_A() reads from
# an acquisition file. The order must stay aligned with the column types
# declared inside process_A().
Acquisitions_Variables <- c(
  "LOAN_ID", "ORIG_CHN", "Seller.Name", "ORIG_RT", "ORIG_AMT",   # columns  1-5
  "ORIG_TRM", "ORIG_DTE", "FRST_DTE", "OLTV", "OCLTV",           # columns  6-10
  "NUM_BO", "DTI", "CSCORE_B", "FTHB_FLG", "PURPOSE",            # columns 11-15
  "PROP_TYP", "NUM_UNIT", "OCC_STAT", "STATE", "ZIP_3",          # columns 16-20
  "MI_PCT", "Product.Type", "CSCORE_C", "MI_TYPE",               # columns 21-24
  "RELOCATION_FLG"                                               # column  25
)

# Register the column names as known globals: process_A() refers to them
# unquoted inside data.table expressions, which static code checks would
# otherwise report as undefined variables. The version guard skips the call
# on R releases older than 2.15.1.
if (getRversion() >= "2.15.1") {
  utils::globalVariables(Acquisitions_Variables)
}

# Read one pipe-delimited acquisition file into a keyed data.table.
#
# Arguments:
#   acq_txt  Path of the text file to read. An error "<path> not found" is
#            raised when the file does not exist.
#   verbose  When TRUE, prints the file size, frequency tables of several
#            columns (taken before they are recoded) and finally str() of
#            the result.
#
# Returns:
#   A data.table keyed on LOAN_ID whose columns are named after
#   Acquisitions_Variables. ORIG_DTE and FRST_DTE are converted to IDate and
#   the coded columns to factors with fixed level sets; values outside a
#   level set become NA, as factor() does by default.
process_A <- function(acq_txt, verbose = FALSE) {
  if (!file.exists(acq_txt)) {
    stop(simpleError(paste(acq_txt, "not found")))
  }
  if (verbose) {
    cat(acq_txt, file.size(acq_txt), "bytes\n")
  }

  # Print the frequency table (NA included when present) of one column,
  # optionally preceded by a heading and ordered by descending count.
  # Silent unless verbose is TRUE.
  show_counts <- function(dt, column, heading = NULL, by_count = FALSE) {
    if (!verbose) {
      return(invisible(NULL))
    }
    if (!is.null(heading)) {
      cat(heading)
    }
    counts <- table(dt[[column]], useNA = "ifany")
    if (by_count) {
      counts <- sort(counts, decreasing = TRUE)
    }
    print(counts)
  }

  # Convert month/year text to IDate; truncated = 1 lets lubridate accept
  # values in which the trailing component of the "myd" order is absent.
  month_year_to_idate <- function(x) {
    as.IDate(lubridate::myd(x, truncated = 1))
  }

  # Column types in file order. The names only document which column each
  # type belongs to; they are dropped so fread() matches types by position.
  column_types <- unname(c(
    LOAN_ID        = "character",
    ORIG_CHN       = "character",
    Seller.Name    = "factor",
    ORIG_RT        = "numeric",
    ORIG_AMT       = "numeric",
    ORIG_TRM       = "integer",
    ORIG_DTE       = "character",
    FRST_DTE       = "character",
    OLTV           = "integer",
    OCLTV          = "integer",
    NUM_BO         = "integer",
    DTI            = "integer",
    CSCORE_B       = "integer",
    FTHB_FLG       = "character",
    PURPOSE        = "character",
    PROP_TYP       = "character",
    NUM_UNIT       = "integer",
    OCC_STAT       = "character",
    STATE          = "factor",
    ZIP_3          = "factor",
    MI_PCT         = "integer",
    Product.Type   = "factor",
    CSCORE_C       = "integer",
    MI_TYPE        = "integer",
    RELOCATION_FLG = "character"
  ))
  # Disabled check:
  # stopifnot(unique(count.fields(acq_txt,sep = '|'))==length(Acquisitions_Variables))

  Data_A <- fread(
    file = acq_txt,
    sep = "|",
    stringsAsFactors = FALSE,
    col.names = Acquisitions_Variables,
    colClasses = column_types,
    data.table = TRUE,
    key = "LOAN_ID"
  )
  # fread() was already asked for a data.table keyed on LOAN_ID; this call
  # re-asserts the same class and key.
  setDT(Data_A, key = "LOAN_ID")
  # Disabled check:
  # stopifnot(nrow(Data_A)==R.utils::countLines(acq_txt))

  # Both date columns arrive as text and are replaced in place.
  Data_A[, `:=`(
    ORIG_DTE = month_year_to_idate(ORIG_DTE),
    FRST_DTE = month_year_to_idate(FRST_DTE)
  )]
  show_counts(Data_A, "ORIG_DTE")
  # Disabled: Data_A[,'OrigYr':=factor(year(ORIG_DTE))]

  Data_A[, ORIG_CHN := factor(
    ORIG_CHN,
    levels = c("R", "B", "C"),
    ordered = FALSE
  )]
  Data_A[, Seller.Name := as.factor(Seller.Name)]

  # Disabled recoding of NUM_BO:
  # Data_A[,NUM_BO:=factor(
  #   NUM_BO,levels =  as.character(1:10),labels = c('1','2',rep('3+',8)), ordered = TRUE)]

  # The two flag columns share the same N/Y level set.
  yes_no <- c("N", "Y")
  Data_A[, `:=`(
    RELOCATION_FLG = factor(RELOCATION_FLG, levels = yes_no),
    FTHB_FLG = factor(FTHB_FLG, levels = yes_no, ordered = FALSE)
  )]

  # Each table below is printed from the raw values, before the column is
  # recoded, so codes outside the level set are still visible.
  show_counts(Data_A, "PURPOSE", heading = "Purpose:\n")
  Data_A[, PURPOSE := factor(
    PURPOSE,
    levels = c("P", "R", "C", "U"),
    ordered = FALSE
  )]

  show_counts(Data_A, "PROP_TYP", heading = "Property Type:\n", by_count = TRUE)
  Data_A[, PROP_TYP := factor(
    PROP_TYP,
    levels = c("SF", "CO", "CP", "MH", "PU"),
    ordered = FALSE
  )]

  show_counts(Data_A, "OCC_STAT", heading = "Occupancy Status:\n", by_count = TRUE)
  Data_A[, OCC_STAT := factor(
    OCC_STAT,
    levels = c("P", "S", "I"),
    ordered = FALSE
  )]

  # Product.Type is only reported; it stays the factor fread() produced.
  show_counts(Data_A, "Product.Type", heading = "Product Type:\n")

  show_counts(Data_A, "MI_TYPE", heading = "MI Type:\n", by_count = TRUE)
  Data_A[, MI_TYPE := factor(
    MI_TYPE,
    levels = 1:3,
    labels = c("Borrower_Paid", "Lender_Paid", "Investor_Paid")
  )]

  show_counts(Data_A, "NUM_UNIT", heading = "Num Unit:\n")
  Data_A[, NUM_UNIT := factor(NUM_UNIT, levels = 1:4)]

  # Disabled derived columns:
  # Data_A[, c('CSCORE_MN', 'ORIG_VAL'):= list(
  #   pmin(CSCORE_B,CSCORE_C, na.rm = TRUE),
  #   (ORIG_AMT/(OLTV/100)))]
  if (verbose) {
    utils::str(Data_A)
  }
  Data_A
}
# canarionyc/loanroll documentation built on Sept. 7, 2020, 4:50 a.m.