# R/clean_text.R

#' Clean text by normalising whitespace and stripping leading item markers
#'
#' First, tabs are turned into single spaces and runs of two or more spaces
#' are collapsed into one space. Afterwards an ordered list of regular
#' expressions is applied one after another and every match is deleted.
#' Nearly all of these expressions are anchored at the start of the string
#' and target enumeration or heading prefixes such as "(1) ", "1. ",
#' "VIII. ", "(a)", "chapter 3" or "Titel: ".
#'
#' The expressions are applied sequentially, so each one sees the result of
#' all earlier ones; their order is therefore significant.
#'
#' @param text text to be cleaned up
#' @return the cleaned text as returned by \code{gsub()}: a character vector
#'   with one element per element of \code{text}
clean_text <- function(text) {
  # Whitespace normalisation. These two steps substitute a single space
  # instead of deleting the match, otherwise neighbouring words would be
  # fused together ("foo\tbar" -> "foobar").
  text <- gsub(pattern = "\t", replacement = " ", x = text)    # 1 tabs to space
  text <- gsub(pattern = " {2,}", replacement = " ", x = text) # 2 runs of spaces to one space

  # Patterns whose matches are deleted, in order of application. Several
  # entries (e.g. "^( )") occur more than once: they strip a leading blank
  # that an earlier deletion may have exposed.
  prefix_regex <- c(
    # 3 del paragraph symbol and number at beginning, e.g. "\u00A781 ", "\u00A7 81"
    # NOTE(review): the optional letter after the optional blank can also
    # consume the first letter of the following word (e.g. "\u00A781 Text")
    # -- confirm whether this is intended.
    "^( ?\u00A7\\.? ?[[:digit:]]{1,3} ?[[:alpha:]]?\\.? ?)",
    "^(\\(?[[:digit:]]{1,3}\\) ?)", # 4 del digit enclosed in parentheses at beginning, e.g. "(1) ", "1) "
    "^( ?[[:alpha:]]{1,4}\\. )", # 5 del roman numbers or letter items, e.g. "VIII. "
    "^(\\(?[[:alpha:]]\\))", # 6 del letter with parentheses, e.g. "(a)", "a)"
    "^([[:digit:]]{1,3}[[:alpha:]]?\\.)", # 7 del digits (plus optional letter) followed by dot, e.g. "1.", "1A."
    "^( )", # 8 del leading blank
    "^(\\-)", # 9 del leading hyphen
    "^(\u2014\\(?[[:digit:]]?\\)?)", # 10 del leading em dash, optionally followed by "(1)"
    "^( )", # 11
    "^[[:digit:]]{1,3}\\.\u2014\\([[:digit:]]\\) ", # 12
    "^( ?\\(?[[:alpha:]]{1,4}\\) )", # 13
    "^(\\([[:digit:]]{1,3}\\) )", # 14
    "^([[:digit:]]{1,2},\u2014\\([[:digit:]]{1,2}\\) )", # 15
    "^([[:digit:]]\\. )", # 16
    "^([[:digit:]]{1,3}\\:\u2014\\([[:digit:]]{1,3}\\))", # 17
    "^(capitulo [[:alpha:]]{1,10}\\.?)", # 18
    "^(articulo [[:digit:]]{1,3}\\.?)", # 19
    "^(titulo [[:alpha:]]{1,10}\\.?)", # 20
    "^(seccion [[:digit:]]{1,10}\\.?)", # 21
    "^(seccion [[:alpha:]]{1,10}\\.?)", # 22
    # 23 del leading ordinal words "primera" ... "decima"
    "^(primera.?-? ?|segunda.?-? ?|tercera.?-? ?|cuarta.?-? ?|quinta.?-? ?|sexta.?-? ?|septima.?-? ?|octava.?-? ?|novena.?-? ?|decima.?-? ?)",
    "^([[:digit:]]{1,3} ?\u00A7.? ?)", # 24
    "^([[:digit:]]{1,3} Kap\\. )", # 25
    "^(section [[:digit:]]{1,3} ?\u2014?-? ?)", # 26
    "^(chapter ?[[:digit:]]{1,3} ?\u2014?-? ?)", # 27
    "^(Art[[:alpha:]]?{1,7}.? ?[[:digit:]]{1,3}[[:alpha:]]?{1,10}-?[[:alpha:]]?{1,10}[[:digit:]]?{1,3}.?\u00B0?)", # 28
    "^(CHAP[[:alpha:]]?{1,5}.? [[:digit:]]?{1,3}[[:alpha:]]?{1,10}.? ?)", # 29
    "^([[:digit:]]{1,3}\u00B0? )", # 30
    "^([[:digit:]]{1,3}\u00B0? )", # 31
    "^(er)$", # 32 del text that consists of "er" only
    "^([[:alpha:]]{1,5}\\. ?\u2014? ?)", # 33
    "^(.\\.)", # 34
    "^(PART [[:alpha:]]{1,10})", # 35
    "^(CAPO [[:alpha:]]{1,10}\\.? )", # 36
    "^([[:digit:]]{1,3}\u00B0\\) )", # 37
    "^([[:digit:]]{1,3}-[[:alpha:]]{1,4}\\.? ?)", # 38
    "^([[:alpha:]]{1,6} - )", # 39
    "^(Parte [[:alpha:]]{1,7} - )", # 40
    "^(titre [[:alpha:]]?{1,7}[[:digit:]]?{1,3}\\.? ?-?)", # 41
    "^(Titel [[:alpha:]]?{1,7}[[:digit:]]?{1,3}\\.? ?-?)", # 42
    "^(HOOFDSTUK [[:alnum:]]{1,7}.? ?)", # 43
    "^([[:digit:]]{1,3}\\,)", # 44
    "^(Paragraaf [[:digit:]]{1,3}[[:alpha:]]?\\.)", # 45
    "^([[:alpha:]]{1,10} Kapitel\\.?)", # 46
    "^([[:alpha:]]{1,10} Kapittel\\.?)", # 47
    "^(SEC..O [[:alpha:]]{1,7}\\.?)", # 48
    "^(Cap.tulo [[:alpha:]]{1,7}\\.?)", # 49
    "^(T.TULO [[:alpha:]]{1,7}\\.?)", # 50
    "^( )", # 51
    "^(\\.)", # 52
    "^( )", # 53
    "^([[:alpha:]]?IVIS.O [[:alpha:]]{1,7})", # 54
    "^([[:digit:]]. )", # 55
    "^(Kapitel\\:? [[:alpha:]]?{1,7})", # 56
    "^(Titel\\: )", # 57
    "^([[:digit:]]{1,2}[[:alpha:]])", # 58
    "^([[:digit:]]{1,2}.[[:digit:]]{1,2})", # 59
    "^(Title [[:alpha:]]?{1,7}[[:digit:]]?{1,3}\\.? ?-?)", # 60
    "^(section [[:alpha:]]?{1,7}[[:digit:]]?{1,3}\\.? ?-?)", # 61
    "^[[:digit:]]{1,3}\\.?-\\([[:digit:]]\\) ", # 62
    "^([[:digit:]]{1,2}- )", # 63
    "^(\\(?\\))" # 64
  )

  # Apply the deletions strictly in the order listed above.
  for (pattern in prefix_regex) {
    text <- gsub(pattern = pattern, replacement = "", x = text)
  }

  text
}
# petermeissner/idep documentation built on May 25, 2019, 1:53 a.m.