# Nothing
####################################
# R htm2txt package Ver 2.2.2 #
# by Sangchul Park #
####################################
#' Convert an HTML document to plain text by stripping all HTML tags
#'
#' @param htm A character vector containing an HTML document to be converted into plain text (other objects are coerced to character vectors).
#' @param list A character string that replaces "li" tags (i.e., the number or bullet of a list item). The default is a line break followed by a bullet character and a space.
#' @param pagebreak A character string that replaces "hr" tags (which mark a thematic change in the content or a page break).
#' @return A character vector containing the plain text converted from the HTML document.
#' @examples
#' text = htm2txt("<html><body>html texts</body></html>")
#' text = htm2txt(c("Hello<p>World", "Goodbye<br>Friends"))
#' text = htm2txt("<p>Menu:</p><ul><li>Coffee</li><li>Tea</li></ul>", list = "\n- ")
#' text = htm2txt("Page 1<hr>Page 2", pagebreak = "\n\n[NEW PAGE]\n\n")
#' @export
htm2txt <- function(htm, list = "\n• ", pagebreak = "\n\n----------\n\n") {
# gsubfun: replace every match of `pattern` in each element of `x` with the
# value FUN returns for the matched text (similar in spirit to
# gsubfn::gsubfn). Matching uses useBytes = TRUE; per the original author's
# note this is what keeps multi-byte (Unicode) text from being damaged.
#   x       - vector of strings to process (coerced to character).
#   pattern - regular expression handed to gregexpr().
#   FUN     - function called on each matched substring; it must return a
#             single value, which is coerced to character.
# Returns a character vector of the same length as `x` (character(0) for
# empty input). Elements without a match, including NA, are returned as is.
gsubfun <- function(x, pattern, FUN) {
  x <- as.character(x)
  # Locate the matches once and reuse them for both the matched pieces and
  # the text in between.
  hits <- gregexpr(pattern, x, useBytes = TRUE)
  matched <- regmatches(x, hits)
  unmatched <- regmatches(x, hits, invert = TRUE)
  vapply(seq_along(matched), function(i) {
    found <- matched[[i]]
    if (length(found) == 0) {
      return(x[i])
    }
    replaced <- vapply(
      found,
      function(m) as.character(FUN(m)),
      character(1),
      USE.NAMES = FALSE
    )
    # `unmatched[[i]]` has exactly one more piece than `found`, so padding the
    # replacements with "" interleaves them: gap, match, gap, ..., gap.
    paste0(unmatched[[i]], c(replaced, ""), collapse = "")
  }, character(1))
}
# htm2txt main code
# Flatten the input (lists included) into a plain vector; the gsub() calls
# below coerce anything that is not already character.
htm <- as.vector(unlist(htm))

# Remove elements whose content must not show up in the text (style, script,
# title) and HTML comments. Matched case-insensitively, like every other tag
# rule below, so that e.g. <SCRIPT>...</SCRIPT> bodies do not leak through.
drop_content <- paste0(
  "<style[^>]*>(.*?)</style[^>]*>|",
  "<script[^>]*>(.*?)</script[^>]*>|",
  "<title[^>]*>(.*?)</title[^>]*>|",
  "<!--(.*?)-->"
)
htm <- gsub(drop_content, "", htm, ignore.case = TRUE)

# Paragraph-like tags (and runs of them) become a blank line.
block_tags <- paste0(
  "</?p>|</?p [^>]*>|",
  "(</?(div|h1|h2|h3|h4|h5|h6|form|ul|ol|dir|dl|table|section|textarea|",
  "article|aside|details|blockquote)[^>]*>)+"
)
htm <- gsub(block_tags, "\n\n", htm, ignore.case = TRUE)

# Line-level tags (and runs of them) become a single line break.
htm <- gsub(
  "(</?(br|tr|dt|dd|button|label|option|summary|legend)[^>]*>)+",
  "\n", htm, ignore.case = TRUE
)

# Table cells are separated by a space, <q> becomes a double quote, <hr>
# becomes the caller-supplied page break.
htm <- gsub("<td[^>]*>", " ", htm, ignore.case = TRUE)
htm <- gsub('</?q>|</?q [^>]*>', '"', htm, ignore.case = TRUE)
htm <- gsub("<hr[^>]*>", pagebreak, htm, ignore.case = TRUE)

# <li> (plus any blanks/newlines right after it) becomes the caller-supplied
# list marker. Parentheses are deliberately NOT part of the bracket
# expression: inside [...] they are literals, and including them would eat a
# leading "(" or ")" of the item text.
htm <- gsub(
  "<li>[[:blank:]\n\t]*|<li [^>]*>[[:blank:]\n\t]*",
  list, htm, ignore.case = TRUE
)

# Strip all remaining tags. The first two passes handle tags that contain
# nested "<...>" pairs (two levels, then one level); the last pass removes
# ordinary tags. [A-Za-z] is spelled out because the range A-z also covers
# the punctuation characters between "Z" and "a".
htm <- gsub("<[/A-Za-z!]+[^<>]*<[^<>]*<[^<>]*>[^<>]*>[^<>]*>", "", htm)
htm <- gsub("<[/A-Za-z!]+[^<>]*<[^<>]*>[^<>]*>", "", htm)
htm <- gsub("<[/A-Za-z!]+[^<>]*>", "", htm)
entities = c('Á','Á','á','á','Ă','ă','∾','∿','∾̳','Â','Â','â','â','´','´','А','а','Æ','Æ','æ','æ','⁡','𝔄','𝔞','À','À','à','à','ℵ','ℵ','Α','α','Ā','ā','⨿','&','&','&','&','⩓','∧','⩕','⩜','⩘','⩚','∠','⦤','∠','∡','⦨','⦩','⦪','⦫','⦬','⦭','⦮','⦯','∟','⊾','⦝','∢','Å','⍼','Ą','ą','𝔸','𝕒','≈','⩯','⩰','≊','≋',''','⁡','≈','≊','Å','Å','å','å','𝒜','𝒶','≔','*','≈','≍','Ã','Ã','ã','ã','Ä','Ä','ä','ä','∳','⨑','≌','϶','‵','∽','⋍','∖','⫧','⊽','⌆','⌅','⌅','⎵','⎶','≌','Б','б','„','∵','∵','∵','⦰','϶','ℬ','ℬ','Β','β','ℶ','≬','𝔅','𝔟','⋂','◯','⋃','⨀','⨁','⨂','⨆','★','▽','△','⨄','⋁','⋀','⤍','⧫','▪','▴','▾','◂','▸','␣','▒','░','▓','█','=⃥','≡⃥','⫭','⌐','𝔹','𝕓','⊥','⊥','⋈','⧉','╗','╖','╕','┐','╔','╓','╒','┌','═','─','╦','╤','╥','┬','╩','╧','╨','┴','⊟','⊞','⊠','╝','╜','╛','┘','╚','╙','╘','└','║','│','╬','╫','╪','┼','╣','╢','╡','┤','╠','╟','╞','├','‵','˘','˘','¦','¦','ℬ','𝒷','⁏','∽','⋍','\','⧅','⟈','•','•','≎','⪮','≏','≎','≏','Ć','ć','⋒','∩','⩄','⩉','⩋','⩇','⩀','ⅅ','∩︀','⁁','ˇ','ℭ','⩍','Č','č','Ç','Ç','ç','ç','Ĉ','ĉ','∰','⩌','⩐','Ċ','ċ','¸','¸','¸','⦲','¢','¢','·','·','ℭ','𝔠','Ч','ч','✓','✓','Χ','χ','○','ˆ','≗','↺','↻','⊛','⊚','⊝','⊙','®','Ⓢ','⊖','⊕','⊗','⧃','≗','⨐','⫯','⧂','∲','”','’','♣','♣','∷',':','⩴','≔','≔',',','@','∁')
entities = append(entities, c('∘','∁','ℂ','≅','⩭','≡','∯','∮','∮','ℂ','𝕔','∐','∐','©','©','©','©','℗','∳','↵','⨯','✗','𝒞','𝒸','⫏','⫑','⫐','⫒','⋯','⤸','⤵','⋞','⋟','↶','⤽','⋓','∪','⩈','≍','⩆','⩊','⊍','⩅','∪︀','↷','⤼','⋞','⋟','⋎','⋏','¤','¤','↶','↷','⋎','⋏','∲','∱','⌭','‡','†','ℸ','↡','⇓','↓','‐','⫤','⊣','⤏','˝','Ď','ď','Д','д','ⅅ','ⅆ','‡','⇊','⤑','⩷','°','°','∇','Δ','δ','⦱','⥿','𝔇','𝔡','⥥','⇃','⇂','´','˙','˝','`','˜','⋄','⋄','⋄','♦','♦','¨','ⅆ','ϝ','⋲','÷','÷','÷','⋇','⋇','Ђ','ђ','⌞','⌍','$','𝔻','𝕕','¨','˙','⃜','≐','≑','≐','∸','∔','⊡','⌆','∯','¨','⇓','⇐','⇔','⫤','⟸','⟺','⟹','⇒','⊨','⇑','⇕','∥','↓','⇓','↓','⤓','⇵','̑','⇊','⇃','⇂','⥐','⥞','↽','⥖','⥟','⇁','⥗','⊤','↧','⤐','⌟','⌌','𝒟','𝒹','Ѕ','ѕ','⧶','Đ','đ','⋱','▿','▾','⇵','⥯','⦦','Џ','џ','⟿','É','É','é','é','⩮','Ě','ě','≖','Ê','Ê','ê','ê','≕','Э','э','⩷','Ė','≑','ė','ⅇ','≒','𝔈','𝔢','⪚','È','È','è','è','⪖','⪘','⪙','∈','⏧','ℓ','⪕','⪗','Ē','ē','∅','∅','◻','∅','▫',' ',' ',' ','Ŋ','ŋ',' ','Ę','ę','𝔼','𝕖','⋕','⧣','⩱','ε','Ε','ε','ϵ','≖','≕','≂','⪖','⪕','⩵','=','≂','≟','⇌','≡','⩸','⧥','⥱','≓','ℰ','ℯ','≐','⩳','≂','Η','η','Ð','Ð','ð','ð','Ë','Ë','ë','ë','€','!','∃','∃','ℰ','ⅇ','ⅇ','≒','Ф','ф','♀','ffi','ff','ffl','𝔉','𝔣','fi','◼','▪','fj','♭','fl','▱','ƒ','𝔽','𝕗','∀','∀','⋔','⫙','ℱ'))
entities = append(entities, c('⨍','½','½','⅓','¼','¼','⅕','⅙','⅛','⅔','⅖','¾','¾','⅗','⅜','⅘','⅚','⅝','⅞','⁄','⌢','ℱ','𝒻','ǵ','Γ','γ','Ϝ','ϝ','⪆','Ğ','ğ','Ģ','Ĝ','ĝ','Г','г','Ġ','ġ','≧','≥','⪌','⋛','≥','≧','⩾','⩾','⪩','⪀','⪂','⪄','⋛︀','⪔','𝔊','𝔤','⋙','≫','⋙','ℷ','Ѓ','ѓ','≷','⪥','⪒','⪤','⪊','⪊','≩','⪈','⪈','≩','⋧','𝔾','𝕘','`','≥','⋛','≧','⪢','≷','⩾','≳','𝒢','ℊ','≳','⪎','⪐','>','>','≫','>','>','⪧','⩺','⋗','⦕','⩼','⪆','⥸','⋗','⋛','⪌','≷','≳','≩︀','≩︀','ˇ',' ','½','ℋ','Ъ','ъ','⇔','↔','⥈','↭','^','ℏ','Ĥ','ĥ','♥','♥','…','⊹','ℌ','𝔥','ℋ','⤥','⤦','⇿','∻','↩','↪','ℍ','𝕙','―','─','ℋ','𝒽','ℏ','Ħ','ħ','≎','≏','⁃','‐','Í','Í','í','í','⁣','Î','Î','î','î','И','и','İ','Е','е','¡','¡','⇔','ℑ','𝔦','Ì','Ì','ì','ì','ⅈ','⨌','∭','⧜','℩','IJ','ij','ℑ','Ī','ī','ℑ','ⅈ','ℐ','ℑ','ı','⊷','Ƶ','⇒','∈','℅','∞','⧝','ı','∬','∫','⊺','ℤ','∫','⊺','⋂','⨗','⨼','⁣','⁢','Ё','ё','Į','į','𝕀','𝕚','Ι','ι','⨼','¿','¿','ℐ','𝒾','∈','⋵','⋹','⋴','⋳','∈','⁢','Ĩ','ĩ','І','і','Ï','Ï','ï','ï','Ĵ','ĵ','Й','й','𝔍','𝔧','ȷ','𝕁','𝕛','𝒥','𝒿','Ј','ј','Є','є','Κ','κ','ϰ','Ķ','ķ','К','к','𝔎','𝔨','ĸ','Х','х','Ќ','ќ','𝕂','𝕜','𝒦','𝓀','⇚','Ĺ','ĺ','⦴','ℒ','Λ','λ','⟪','⟨','⦑','⟨','⪅','ℒ','«','«','↞','⇐','←','⇤','⤟','⤝','↩','↫','⤹','⥳','↢','⪫','⤛','⤙','⪭','⪭︀','⤎','⤌','❲','{','[','⦋'))
entities = append(entities, c('⦏','⦍','Ľ','ľ','Ļ','ļ','⌈','{','Л','л','⤶','“','„','⥧','⥋','↲','≦','≤','⟨','←','⇐','←','⇤','⇆','↢','⌈','⟦','⥡','⇃','⥙','⌊','↽','↼','⇇','↔','⇔','↔','⇆','⇋','↭','⥎','⊣','↤','⥚','⋋','⊲','⧏','⊴','⥑','⥠','↿','⥘','↼','⥒','⪋','⋚','≤','≦','⩽','⩽','⪨','⩿','⪁','⪃','⋚︀','⪓','⪅','⋖','⋚','⪋','⋚','≦','≶','≶','⪡','≲','⩽','≲','⥼','⌊','𝔏','𝔩','≶','⪑','⥢','↽','↼','⥪','▄','Љ','љ','⋘','≪','⇇','⌞','⇚','⥫','◺','Ŀ','ŀ','⎰','⎰','⪉','⪉','≨','⪇','⪇','≨','⋦','⟬','⇽','⟦','⟵','⟸','⟵','⟷','⟺','⟷','⟼','⟶','⟹','⟶','↫','↬','⦅','𝕃','𝕝','⨭','⨴','∗','_','↙','↘','◊','◊','⧫','(','⦓','⇆','⌟','⇋','⥭','‎','⊿','‹','ℒ','𝓁','↰','↰','≲','⪍','⪏','[','‘','‚','Ł','ł','<','<','≪','<','<','⪦','⩹','⋖','⋋','⋉','⥶','⩻','◃','⊴','◂','⦖','⥊','⥦','≨︀','≨︀','¯','¯','♂','✠','✠','⤅','↦','↦','↧','↤','↥','▮','⨩','М','м','—','∺','∡',' ','ℳ','𝔐','𝔪','℧','µ','µ','∣','*','⫰','·','·','−','⊟','∸','⨪','∓','⫛','…','∓','⊧','𝕄','𝕞','∓','ℳ','𝓂','∾','Μ','μ','⊸','⊸','∇','Ń','ń','∠⃒','≉','⩰̸','≋̸','ʼn','≉','♮','♮','ℕ',' ',' ','≎̸','≏̸','⩃','Ň','ň','Ņ','ņ','≇','⩭̸','⩂','Н','н','–','≠','⤤','⇗','↗','↗','≐̸','​','​','​','​','≢','⤨','≂̸','≫','≪','
','∄','∄','𝔑','𝔫','≧̸','≱','≱','≧̸','⩾̸','⩾̸','⋙̸','≵','≫⃒','≯','≯','≫̸','⇎','↮','⫲','∋','⋼','⋺','∋','Њ','њ','⇍','↚','‥','≦̸','≰','⇍'))
entities = append(entities, c('↚','⇎','↮','≰','≦̸','⩽̸','⩽̸','≮','⋘̸','≴','≪⃒','≮','⋪','⋬','≪̸','∤','⁠',' ','ℕ','𝕟','⫬','¬','¬','≢','≭','∦','∉','≠','≂̸','∄','≯','≱','≧̸','≫̸','≹','⩾̸','≵','≎̸','≏̸','∉','⋵̸','⋹̸','∉','⋷','⋶','⋪','⧏̸','⋬','≮','≰','≸','≪̸','⩽̸','≴','⪢̸','⪡̸','∌','∌','⋾','⋽','⊀','⪯̸','⋠','∌','⋫','⧐̸','⋭','⊏̸','⋢','⊐̸','⋣','⊂⃒','⊈','⊁','⪰̸','⋡','≿̸','⊃⃒','⊉','≁','≄','≇','≉','∤','∦','∦','⫽⃥','∂̸','⨔','⊀','⋠','⪯̸','⊀','⪯̸','⇏','↛','⤳̸','↝̸','⇏','↛','⋫','⋭','⊁','⋡','⪰̸','𝒩','𝓃','∤','∦','≁','≄','≄','∤','∦','⋢','⋣','⊄','⫅̸','⊈','⊂⃒','⊈','⫅̸','⊁','⪰̸','⊅','⫆̸','⊉','⊃⃒','⊉','⫆̸','≹','Ñ','Ñ','ñ','ñ','≸','⋪','⋬','⋫','⋭','Ν','ν','#','№',' ','≍⃒','⊯','⊮','⊭','⊬','≥⃒','>⃒','⤄','⧞','⤂','≤⃒','<⃒','⊴⃒','⤃','⊵⃒','∼⃒','⤣','⇖','↖','↖','⤧','Ó','Ó','ó','ó','⊛','⊚','Ô','Ô','ô','ô','О','о','⊝','Ő','ő','⨸','⊙','⦼','Œ','œ','⦿','𝔒','𝔬','˛','Ò','Ò','ò','ò','⧁','⦵','Ω','∮','↺','⦾','⦻','‾','⧀','Ō','ō','Ω','ω','Ο','ο','⦶','⊖','𝕆','𝕠','⦷','“','‘','⦹','⊕','⩔','∨','↻','⩝','ℴ','ℴ','ª','ª','º','º','⊶','⩖','⩗','⩛','Ⓢ','𝒪','ℴ','Ø','Ø','ø','ø','⊘','Õ','Õ','õ','õ','⨷','⊗','⨶','Ö','Ö','ö','ö','⌽','‾','⏞','⎴','⏜','∥','¶','¶','∥','⫳','⫽','∂','∂','П','п','%','.','‰','⊥','‱','𝔓','𝔭','Φ','φ','ϕ','ℳ','☎','Π','π','⋔','ϖ','ℏ','ℎ','ℏ','+','⨣','⊞','⨢','∔','⨥','⩲','±','±','±','⨦','⨧','±','ℌ','⨕'))
entities = append(entities, c('ℙ','𝕡','£','£','⪻','≺','⪷','≼','⪳','⪯','≺','⪷','≼','≺','⪯','≼','≾','⪯','⪹','⪵','⋨','≾','″','′','ℙ','⪹','⪵','⋨','∏','∏','⌮','⌒','⌓','∝','∷','∝','∝','≾','⊰','𝒫','𝓅','Ψ','ψ',' ','𝔔','𝔮','⨌','ℚ','𝕢','⁗','𝒬','𝓆','ℍ','⨖','?','≟','"','"','⇛','∽̱','Ŕ','ŕ','√','⦳','⟫','⟩','⦒','⦥','⟩','»','»','↠','⇒','→','⥵','⇥','⤠','⤳','⤞','↪','↬','⥅','⥴','⤖','↣','↝','⤜','⤚','∶','ℚ','⤐','⤏','⤍','❳','}',']','⦌','⦎','⦐','Ř','ř','Ŗ','ŗ','⌉','}','Р','р','⤷','⥩','”','”','↳','ℜ','ℜ','ℛ','ℜ','ℝ','▭','®','®','®','®','∋','⇋','⥯','⥽','⌋','ℜ','𝔯','⥤','⇁','⇀','⥬','Ρ','ρ','ϱ','⟩','→','⇒','→','⇥','⇄','↣','⌉','⟧','⥝','⇂','⥕','⌋','⇁','⇀','⇄','⇌','⇉','↝','⊢','↦','⥛','⋌','⊳','⧐','⊵','⥏','⥜','↾','⥔','⇀','⥓','˚','≓','⇄','⇌','‏','⎱','⎱','⫮','⟭','⇾','⟧','⦆','ℝ','𝕣','⨮','⨵','⥰',')','⦔','⨒','⇉','⇛','›','ℛ','𝓇','↱','↱',']','’','’','⋌','⋊','▹','⊵','▸','⧎','⧴','⥨','℞','Ś','ś','‚','⪼','≻','⪸','Š','š','≽','⪴','⪰','Ş','ş','Ŝ','ŝ','⪺','⪶','⋩','⨓','≿','С','с','⋅','⊡','⩦','⤥','⇘','↘','↘','§','§',';','⤩','∖','∖','✶','𝔖','𝔰','⌢','♯','Щ','щ','Ш','ш','↓','←','∣','∥','→','↑','­','­','Σ','σ','ς','ς','∼','⩪','≃','≃','⪞','⪠','⪝','⪟','≆','⨤','⥲','←','∘','∖','⨳','⧤','∣','⌣','⪪','⪬','⪬︀','Ь','ь','/','⧄','⌿','𝕊','𝕤','♠','♠','∥','⊓','⊓︀','⊔','⊔︀','√','⊏'))
entities = append(entities, c('⊑','⊏','⊑','⊐','⊒','⊐','⊒','□','□','□','⊓','⊏','⊑','⊐','⊒','⊔','▪','▪','→','𝒮','𝓈','∖','⌣','⋆','⋆','☆','★','ϵ','ϕ','¯','⋐','⊂','⪽','⫅','⊆','⫃','⫁','⫋','⊊','⪿','⥹','⋐','⊂','⊆','⫅','⊆','⊊','⫋','⫇','⫕','⫓','≻','⪸','≽','≻','⪰','≽','≿','⪰','⪺','⪶','⋩','≿','∋','∑','∑','♪','⋑','⊃','¹','¹','²','²','³','³','⪾','⫘','⫆','⊇','⫄','⊃','⊇','⟉','⫗','⥻','⫂','⫌','⊋','⫀','⋑','⊃','⊇','⫆','⊋','⫌','⫈','⫔','⫖','⤦','⇙','↙','↙','⤪','ß','ß','	','⌖','Τ','τ','⎴','Ť','ť','Ţ','ţ','Т','т','⃛','⌕','𝔗','𝔱','∴','∴','∴','Θ','θ','ϑ','ϑ','≈','∼','  ',' ',' ','≈','∼','Þ','Þ','þ','þ','∼','˜','≃','≅','≈','×','×','⊠','⨱','⨰','∭','⤨','⊤','⌶','⫱','𝕋','𝕥','⫚','⤩','‴','™','™','▵','▿','◃','⊴','≜','▹','⊵','◬','≜','⨺','⃛','⨹','⧍','⨻','⏢','𝒯','𝓉','Ц','ц','Ћ','ћ','Ŧ','ŧ','≬','↞','↠','Ú','Ú','ú','ú','↟','⇑','↑','⥉','Ў','ў','Ŭ','ŭ','Û','Û','û','û','У','у','⇅','Ű','ű','⥮','⥾','𝔘','𝔲','Ù','Ù','ù','ù','⥣','↿','↾','▀','⌜','⌜','⌏','◸','Ū','ū','¨','¨','_','⏟','⎵','⏝','⋃','⊎','Ų','ų','𝕌','𝕦','↑','⇑','↑','⤒','⇅','↕','⇕','↕','⥮','↿','↾','⊎','↖','↗','ϒ','υ','ϒ','Υ','υ','⊥','↥','⇈','⌝','⌝','⌎','Ů','ů','◹','𝒰','𝓊','⋰','Ũ','ũ','▵','▴','⇈','Ü','Ü','ü','ü','⦧','⦜','ϵ','ϰ','∅','ϕ','ϖ','∝','⇕','↕','ϱ','ς','⊊︀','⫋︀','⊋︀','⫌︀','ϑ','⊲','⊳','⫫','⫨','⫩','В'))
entities = append(entities, c('в','⊫','⊩','⊨','⊢','⫦','⋁','∨','⊻','≚','⋮','‖','|','‖','|','∣','|','❘','≀',' ','𝔙','𝔳','⊲','⊂⃒','⊃⃒','𝕍','𝕧','∝','⊳','𝒱','𝓋','⫋︀','⊊︀','⫌︀','⊋︀','⊪','⦚','Ŵ','ŵ','⩟','⋀','∧','≙','℘','𝔚','𝔴','𝕎','𝕨','℘','≀','≀','𝒲','𝓌','⋂','◯','⋃','▽','𝔛','𝔵','⟺','⟷','Ξ','ξ','⟸','⟵','⟼','⋻','⨀','𝕏','𝕩','⨁','⨂','⟹','⟶','𝒳','𝓍','⨆','⨄','△','⋁','⋀','Ý','Ý','ý','ý','Я','я','Ŷ','ŷ','Ы','ы','¥','¥','𝔜','𝔶','Ї','ї','𝕐','𝕪','𝒴','𝓎','Ю','ю','Ÿ','ÿ','ÿ','Ź','ź','Ž','ž','З','з','Ż','ż','ℨ','​','Ζ','ζ','ℨ','𝔷','Ж','ж','⇝','ℤ','𝕫','𝒵','𝓏','‍','‌'))
unicodes = c('Á','Á','á','á','Ă','ă','∾','∿','∾ ̳','Â','Â','â','â','´','´','А','а','Æ','Æ','æ','æ','⁡','𝔄','𝔞','À','À','à','à','ℵ','ℵ','Α','α','Ā','ā','⨿','&','&','&','&','⩓','∧','⩕','⩜','⩘','⩚','∠','⦤','∠','∡','⦨','⦩','⦪','⦫','⦬','⦭','⦮','⦯','∟','⊾','⦝','∢','Å','⍼','Ą','ą','𝔸','𝕒','≈','⩯','⩰','≊','≋',''','⁡','≈','≊','Å','Å','å','å','𝒜','𝒶','≔','*','≈','≍','Ã','Ã','ã','ã','Ä','Ä','ä','ä','∳','⨑','≌','϶','‵','∽','⋍','∖','⫧','⊽','⌆','⌅','⌅','⎵','⎶','≌','Б','б','„','∵','∵','∵','⦰','϶','ℬ','ℬ','Β','β','ℶ','≬','𝔅','𝔟','⋂','◯','⋃','⨀','⨁','⨂','⨆','★','▽','△','⨄','⋁','⋀','⤍','⧫','▪','▴','▾','◂','▸','␣','▒','░','▓','█','= ⃥','≡ ⃥','⫭','⌐','𝔹','𝕓','⊥','⊥','⋈','⧉','╗','╖','╕','┐','╔','╓','╒','┌','═','─','╦','╤','╥','┬','╩','╧','╨','┴','⊟','⊞','⊠','╝','╜','╛','┘','╚','╙','╘','└','║','│','╬','╫','╪','┼','╣','╢','╡','┤','╠','╟','╞','├','‵','˘','˘','¦','¦','ℬ','𝒷','⁏','∽','⋍','\','⧅','⟈','•','•','≎','⪮','≏','≎','≏','Ć','ć','⋒','∩','⩄','⩉','⩋','⩇','⩀','ⅅ','∩ ︀','⁁','ˇ','ℭ','⩍','Č','č','Ç','Ç','ç','ç','Ĉ','ĉ','∰','⩌','⩐','Ċ','ċ','¸','¸','¸','⦲','¢','¢','·','·','ℭ','𝔠','Ч','ч','✓','✓','Χ','χ','○','ˆ','≗','↺','↻','⊛','⊚','⊝','⊙','®','Ⓢ','⊖','⊕','⊗','⧃','≗','⨐','⫯','⧂','∲','”','’','♣','♣','∷',':','⩴','≔','≔',',','@','∁')
unicodes = append(unicodes, c('∘','∁','ℂ','≅','⩭','≡','∯','∮','∮','ℂ','𝕔','∐','∐','©','©','©','©','℗','∳','↵','⨯','✗','𝒞','𝒸','⫏','⫑','⫐','⫒','⋯','⤸','⤵','⋞','⋟','↶','⤽','⋓','∪','⩈','≍','⩆','⩊','⊍','⩅','∪ ︀','↷','⤼','⋞','⋟','⋎','⋏','¤','¤','↶','↷','⋎','⋏','∲','∱','⌭','‡','†','ℸ','↡','⇓','↓','‐','⫤','⊣','⤏','˝','Ď','ď','Д','д','ⅅ','ⅆ','‡','⇊','⤑','⩷','°','°','∇','Δ','δ','⦱','⥿','𝔇','𝔡','⥥','⇃','⇂','´','˙','˝','`','˜','⋄','⋄','⋄','♦','♦','¨','ⅆ','ϝ','⋲','÷','÷','÷','⋇','⋇','Ђ','ђ','⌞','⌍','$','𝔻','𝕕','¨','˙','⃜','≐','≑','≐','∸','∔','⊡','⌆','∯','¨','⇓','⇐','⇔','⫤','⟸','⟺','⟹','⇒','⊨','⇑','⇕','∥','↓','⇓','↓','⤓','⇵','̑','⇊','⇃','⇂','⥐','⥞','↽','⥖','⥟','⇁','⥗','⊤','↧','⤐','⌟','⌌','𝒟','𝒹','Ѕ','ѕ','⧶','Đ','đ','⋱','▿','▾','⇵','⥯','⦦','Џ','џ','⟿','É','É','é','é','⩮','Ě','ě','≖','Ê','Ê','ê','ê','≕','Э','э','⩷','Ė','≑','ė','ⅇ','≒','𝔈','𝔢','⪚','È','È','è','è','⪖','⪘','⪙','∈','⏧','ℓ','⪕','⪗','Ē','ē','∅','∅','◻','∅','▫',' ',' ',' ','Ŋ','ŋ',' ','Ę','ę','𝔼','𝕖','⋕','⧣','⩱','ε','Ε','ε','ϵ','≖','≕','≂','⪖','⪕','⩵','=','≂','≟','⇌','≡','⩸','⧥','⥱','≓','ℰ','ℯ','≐','⩳','≂','Η','η','Ð','Ð','ð','ð','Ë','Ë','ë','ë','€','!','∃','∃','ℰ','ⅇ','ⅇ','≒','Ф','ф','♀','ffi','ff','ffl','𝔉','𝔣','fi','◼','▪','f j','♭','fl','▱','ƒ','𝔽','𝕗','∀','∀','⋔','⫙','ℱ'))
unicodes = append(unicodes, c('⨍','½','½','⅓','¼','¼','⅕','⅙','⅛','⅔','⅖','¾','¾','⅗','⅜','⅘','⅚','⅝','⅞','⁄','⌢','ℱ','𝒻','ǵ','Γ','γ','Ϝ','ϝ','⪆','Ğ','ğ','Ģ','Ĝ','ĝ','Г','г','Ġ','ġ','≧','≥','⪌','⋛','≥','≧','⩾','⩾','⪩','⪀','⪂','⪄','⋛ ︀','⪔','𝔊','𝔤','⋙','≫','⋙','ℷ','Ѓ','ѓ','≷','⪥','⪒','⪤','⪊','⪊','≩','⪈','⪈','≩','⋧','𝔾','𝕘','`','≥','⋛','≧','⪢','≷','⩾','≳','𝒢','ℊ','≳','⪎','⪐','>','>','≫','>','>','⪧','⩺','⋗','⦕','⩼','⪆','⥸','⋗','⋛','⪌','≷','≳','≩ ︀','≩ ︀','ˇ',' ','½','ℋ','Ъ','ъ','⇔','↔','⥈','↭','^','ℏ','Ĥ','ĥ','♥','♥','…','⊹','ℌ','𝔥','ℋ','⤥','⤦','⇿','∻','↩','↪','ℍ','𝕙','―','─','ℋ','𝒽','ℏ','Ħ','ħ','≎','≏','⁃','‐','Í','Í','í','í','⁣','Î','Î','î','î','И','и','İ','Е','е','¡','¡','⇔','ℑ','𝔦','Ì','Ì','ì','ì','ⅈ','⨌','∭','⧜','℩','IJ','ij','ℑ','Ī','ī','ℑ','ⅈ','ℐ','ℑ','ı','⊷','Ƶ','⇒','∈','℅','∞','⧝','ı','∬','∫','⊺','ℤ','∫','⊺','⋂','⨗','⨼','⁣','⁢','Ё','ё','Į','į','𝕀','𝕚','Ι','ι','⨼','¿','¿','ℐ','𝒾','∈','⋵','⋹','⋴','⋳','∈','⁢','Ĩ','ĩ','І','і','Ï','Ï','ï','ï','Ĵ','ĵ','Й','й','𝔍','𝔧','ȷ','𝕁','𝕛','𝒥','𝒿','Ј','ј','Є','є','Κ','κ','ϰ','Ķ','ķ','К','к','𝔎','𝔨','ĸ','Х','х','Ќ','ќ','𝕂','𝕜','𝒦','𝓀','⇚','Ĺ','ĺ','⦴','ℒ','Λ','λ','⟪','⟨','⦑','⟨','⪅','ℒ','«','«','↞','⇐','←','⇤','⤟','⤝','↩','↫','⤹','⥳','↢','⪫','⤛','⤙','⪭','⪭ ︀','⤎','⤌','❲','{','[','⦋'))
unicodes = append(unicodes, c('⦏','⦍','Ľ','ľ','Ļ','ļ','⌈','{','Л','л','⤶','“','„','⥧','⥋','↲','≦','≤','⟨','←','⇐','←','⇤','⇆','↢','⌈','⟦','⥡','⇃','⥙','⌊','↽','↼','⇇','↔','⇔','↔','⇆','⇋','↭','⥎','⊣','↤','⥚','⋋','⊲','⧏','⊴','⥑','⥠','↿','⥘','↼','⥒','⪋','⋚','≤','≦','⩽','⩽','⪨','⩿','⪁','⪃','⋚ ︀','⪓','⪅','⋖','⋚','⪋','⋚','≦','≶','≶','⪡','≲','⩽','≲','⥼','⌊','𝔏','𝔩','≶','⪑','⥢','↽','↼','⥪','▄','Љ','љ','⋘','≪','⇇','⌞','⇚','⥫','◺','Ŀ','ŀ','⎰','⎰','⪉','⪉','≨','⪇','⪇','≨','⋦','⟬','⇽','⟦','⟵','⟸','⟵','⟷','⟺','⟷','⟼','⟶','⟹','⟶','↫','↬','⦅','𝕃','𝕝','⨭','⨴','∗','_','↙','↘','◊','◊','⧫','(','⦓','⇆','⌟','⇋','⥭','‎','⊿','‹','ℒ','𝓁','↰','↰','≲','⪍','⪏','[','‘','‚','Ł','ł','<','<','≪','<','<','⪦','⩹','⋖','⋋','⋉','⥶','⩻','◃','⊴','◂','⦖','⥊','⥦','≨ ︀','≨ ︀','¯','¯','♂','✠','✠','⤅','↦','↦','↧','↤','↥','▮','⨩','М','м','—','∺','∡',' ','ℳ','𝔐','𝔪','℧','µ','µ','∣','*','⫰','·','·','−','⊟','∸','⨪','∓','⫛','…','∓','⊧','𝕄','𝕞','∓','ℳ','𝓂','∾','Μ','μ','⊸','⊸','∇','Ń','ń','∠ ⃒','≉','⩰ ̸','≋ ̸','ʼn','≉','♮','♮','ℕ',' ',' ','≎ ̸','≎ ̸','⩃','Ň','ň','Ņ','ņ','≇','⩭ ̸','⩂','Н','н','–','≠','⤤','⇗','↗','↗','≐ ̸','​','​','​','​','≢','⤨','≂ ̸','≫','≪',' ','∄','∄','𝔑','𝔫','≧ ̸','≱','≱','≧ ̸','⩾ ̸','⩾ ̸','⋙ ̸','≵','≫ ⃒','≯','≯','≫ ̸','⇎','↮','⫲','∋','⋼','⋺','∋','Њ','њ','⇍','↚','‥','≦ ̸','≰','⇍'))
unicodes = append(unicodes, c('↚','⇎','↮','≰','≦ ̸','⩽ ̸','⩽ ̸','≮','⋘ ̸','≴','≪ ⃒','≮','⋪','⋬','≪ ̸','∤','⁠',' ','ℕ','𝕟','⫬','¬','¬','≢','≭','∦','∉','≠','≂ ̸','∄','≯','≱','≧ ̸','≫ ̸','≹','⩾ ̸','≵','≎ ̸','≏ ̸','∉','⋵ ̸','⋹ ̸','∉','⋷','⋶','⋪','⧏ ̸','⋬','≮','≰','≸','≪ ̸','⩽ ̸','≴','⪢ ̸','⪡ ̸','∌','∌','⋾','⋽','⊀','⪯ ̸','⋠','∌','⋫','⧐ ̸','⋭','⊏ ̸','⋢','⊐ ̸','⋣','⊂ ⃒','⊈','⊁','⪰ ̸','⋡','≿ ̸','⊃ ⃒','⊉','≁','≄','≇','≉','∤','∦','∦','⫽ ⃥','∂ ̸','⨔','⊀','⋠','⪯ ̸','⊀','⪯ ̸','⇏','↛','⤳ ̸','↝ ̸','⇏','↛','⋫','⋭','⊁','⋡','⪰ ̸','𝒩','𝓃','∤','∦','≁','≄','≄','∤','∦','⋢','⋣','⊄','⫅ ̸','⊈','⊂ ⃒','⊈','⫅ ̸','⊁','⪰ ̸','⊅','⫆ ̸','⊉','⊃ ⃒','⊉','⫆ ̸','≹','Ñ','Ñ','ñ','ñ','≸','⋪','⋬','⋫','⋭','Ν','ν','#','№',' ','≍ ⃒','⊯','⊮','⊭','⊬','≥ ⃒','> ⃒','⤄','⧞','⤂','≤ ⃒','< ⃒','⊴ ⃒','⤃','⊵ ⃒','∼ ⃒','⤣','⇖','↖','↖','⤧','Ó','Ó','ó','ó','⊛','⊚','Ô','Ô','ô','ô','О','о','⊝','Ő','ő','⨸','⊙','⦼','Œ','œ','⦿','𝔒','𝔬','˛','Ò','Ò','ò','ò','⧁','⦵','Ω','∮','↺','⦾','⦻','‾','⧀','Ō','ō','Ω','ω','Ο','ο','⦶','⊖','𝕆','𝕠','⦷','“','‘','⦹','⊕','⩔','∨','↻','⩝','ℴ','ℴ','ª','ª','º','º','⊶','⩖','⩗','⩛','Ⓢ','𝒪','ℴ','Ø','Ø','ø','ø','⊘','Õ','Õ','õ','õ','⨷','⊗','⨶','Ö','Ö','ö','ö','⌽','‾','⏞','⎴','⏜','∥','¶','¶','∥','⫳','⫽','∂','∂','П','п','%','.','‰','⊥','‱','𝔓','𝔭','Φ','φ','ϕ','ℳ','☎','Π','π','⋔','ϖ','ℏ','ℎ','ℏ','+','⨣','⊞','⨢','∔','⨥','⩲','±','±','±','⨦','⨧','±','ℌ','⨕'))
unicodes = append(unicodes, c('ℙ','𝕡','£','£','⪻','≺','⪷','≼','⪳','⪯','≺','⪷','≼','≺','⪯','≼','≾','⪯','⪹','⪵','⋨','≾','″','′','ℙ','⪹','⪵','⋨','∏','∏','⌮','⌒','⌓','∝','∷','∝','∝','≾','⊰','𝒫','𝓅','Ψ','ψ',' ','𝔔','𝔮','⨌','ℚ','𝕢','⁗','𝒬','𝓆','ℍ','⨖','?','≟','"','"','⇛','∽ ̱','Ŕ','ŕ','√','⦳','⟫','⟩','⦒','⦥','⟩','»','»','↠','⇒','→','⥵','⇥','⤠','⤳','⤞','↪','↬','⥅','⥴','⤖','↣','↝','⤜','⤚','∶','ℚ','⤐','⤏','⤍','❳','}',']','⦌','⦎','⦐','Ř','ř','Ŗ','ŗ','⌉','}','Р','р','⤷','⥩','”','”','↳','ℜ','ℜ','ℛ','ℜ','ℝ','▭','®','®','®','®','∋','⇋','⥯','⥽','⌋','ℜ','𝔯','⥤','⇁','⇀','⥬','Ρ','ρ','ϱ','⟩','→','⇒','→','⇥','⇄','↣','⌉','⟧','⥝','⇂','⥕','⌋','⇁','⇀','⇄','⇌','⇉','↝','⊢','↦','⥛','⋌','⊳','⧐','⊵','⥏','⥜','↾','⥔','⇀','⥓','˚','≓','⇄','⇌','‏','⎱','⎱','⫮','⟭','⇾','⟧','⦆','ℝ','𝕣','⨮','⨵','⥰',')','⦔','⨒','⇉','⇛','›','ℛ','𝓇','↱','↱',']','’','’','⋌','⋊','▹','⊵','▸','⧎','⧴','⥨','℞','Ś','ś','‚','⪼','≻','⪸','Š','š','≽','⪴','⪰','Ş','ş','Ŝ','ŝ','⪺','⪶','⋩','⨓','≿','С','с','⋅','⊡','⩦','⤥','⇘','↘','↘','§','§',';','⤩','∖','∖','✶','𝔖','𝔰','⌢','♯','Щ','щ','Ш','ш','↓','←','∣','∥','→','↑','­','­','Σ','σ','ς','ς','∼','⩪','≃','≃','⪞','⪠','⪝','⪟','≆','⨤','⥲','←','∘','∖','⨳','⧤','∣','⌣','⪪','⪬','⪬ ︀','Ь','ь','/','⧄','⌿','𝕊','𝕤','♠','♠','∥','⊓','⊓ ︀','⊔','⊔ ︀','√','⊏'))
unicodes = append(unicodes, c('⊑','⊏','⊑','⊐','⊒','⊐','⊒','□','□','□','⊓','⊏','⊑','⊐','⊒','⊔','▪','▪','→','𝒮','𝓈','∖','⌣','⋆','⋆','☆','★','ϵ','ϕ','¯','⋐','⊂','⪽','⫅','⊆','⫃','⫁','⫋','⊊','⪿','⥹','⋐','⊂','⊆','⫅','⊆','⊊','⫋','⫇','⫕','⫓','≻','⪸','≽','≻','⪰','≽','≿','⪰','⪺','⪶','⋩','≿','∋','∑','∑','♪','⋑','⊃','¹','¹','²','²','³','³','⪾','⫘','⫆','⊇','⫄','⊃','⊇','⟉','⫗','⥻','⫂','⫌','⊋','⫀','⋑','⊃','⊇','⫆','⊋','⫌','⫈','⫔','⫖','⤦','⇙','↙','↙','⤪','ß','ß','	','⌖','Τ','τ','⎴','Ť','ť','Ţ','ţ','Т','т','⃛','⌕','𝔗','𝔱','∴','∴','∴','Θ','θ','ϑ','ϑ','≈','∼','   ',' ',' ','≈','∼','Þ','Þ','þ','þ','∼','˜','≃','≅','≈','×','×','⊠','⨱','⨰','∭','⤨','⊤','⌶','⫱','𝕋','𝕥','⫚','⤩','‴','™','™','▵','▿','◃','⊴','≜','▹','⊵','◬','≜','⨺','⃛','⨹','⧍','⨻','⏢','𝒯','𝓉','Ц','ц','Ћ','ћ','Ŧ','ŧ','≬','↞','↠','Ú','Ú','ú','ú','↟','⇑','↑','⥉','Ў','ў','Ŭ','ŭ','Û','Û','û','û','У','у','⇅','Ű','ű','⥮','⥾','𝔘','𝔲','Ù','Ù','ù','ù','⥣','↿','↾','▀','⌜','⌜','⌏','◸','Ū','ū','¨','¨','_','⏟','⎵','⏝','⋃','⊎','Ų','ų','𝕌','𝕦','↑','⇑','↑','⤒','⇅','↕','⇕','↕','⥮','↿','↾','⊎','↖','↗','ϒ','υ','ϒ','Υ','υ','⊥','↥','⇈','⌝','⌝','⌎','Ů','ů','◹','𝒰','𝓊','⋰','Ũ','ũ','▵','▴','⇈','Ü','Ü','ü','ü','⦧','⦜','ϵ','ϰ','∅','ϕ','ϖ','∝','⇕','↕','ϱ','ς','⊊  ','⫋  ','⊋  ','⫌  ','ϑ','⊲','⊳','⫫','⫨','⫩','В'))
unicodes = append(unicodes, c('в','⊫','⊩','⊨','⊢','⫦','⋁','∨','⊻','≚','⋮','‖','|','‖','|','∣','|','❘','≀',' ','𝔙','𝔳','⊲','⊂ ⃒','⊃ ⃒','𝕍','𝕧','∝','⊳','𝒱','𝓋','⫋ ︀','⊊ ︀','⫌ ︀','⊋ ︀','⊪','⦚','Ŵ','ŵ','⩟','⋀','∧','≙','℘','𝔚','𝔴','𝕎','𝕨','℘','≀','≀','𝒲','𝓌','⋂','◯','⋃','▽','𝔛','𝔵','⟺','⟷','Ξ','ξ','⟸','⟵','⟼','⋻','⨀','𝕏','𝕩','⨁','⨂','⟹','⟶','𝒳','𝓍','⨆','⨄','△','⋁','⋀','Ý','Ý','ý','ý','Я','я','Ŷ','ŷ','Ы','ы','¥','¥','𝔜','𝔶','Ї','ї','𝕐','𝕪','𝒴','𝓎','Ю','ю','Ÿ','ÿ','ÿ','Ź','ź','Ž','ž','З','з','Ż','ż','ℨ','​','Ζ','ζ','ℨ','𝔷','Ж','ж','⇝','ℤ','𝕫','𝒵','𝓏','‍','‌'))
# Build a lookup environment that maps entities[i] to unicodes[i] (the two
# parallel vectors defined above), then drop the vectors.
# NOTE(review): the keys are presumably entity names such as "&amp;" or
# "&amp"; the table as it appears in this copy of the file looks
# HTML-decoded - verify against the package source.
entity_map <- new.env()
for (i in seq_along(entities)) {
  assign(entities[i], unicodes[i], envir = entity_map)
}
remove(entities, unicodes)

# Replace named character references. Each candidate is looked up longest
# prefix first and shortened one character at a time (down to 3 characters),
# so a known name followed directly by other characters is still decoded and
# the trailing characters are kept. Unknown candidates are left unchanged.
htm <- gsubfun(htm, "&[A-Za-z]+[1-8]*;?", function(x) {
  len <- nchar(x)
  # Guard against matches shorter than 3 characters: `len:3` would count
  # upwards instead of being empty.
  if (len >= 3) {
    for (i in seq(len, 3)) {
      key <- substr(x, 1, i)
      if (exists(key, envir = entity_map, inherits = FALSE)) {
        value <- get(key, envir = entity_map, inherits = FALSE)
        return(paste0(value, substr(x, i + 1, len)))
      }
    }
  }
  x
})
# Rewrite hexadecimal references (&#xHH;) as decimal ones (&#DDD;) so that the
# next pass handles both forms.
htm <- gsubfun(htm, "&#[Xx][[:xdigit:]]+;", function(x) {
  code <- strtoi(substr(x, 4, nchar(x) - 1), base = 16L)
  # strtoi() gives NA when the value does not fit in an integer; keep such a
  # reference as it is instead of emitting "&#NA;".
  if (is.na(code)) x else paste0("&#", code, ";")
})

# Replace decimal references with the character they denote. References that
# do not denote a valid code point are left unchanged.
htm <- gsubfun(htm, "&#[0-9]+;", function(x) {
  # Parse as double first so that very long digit strings cannot overflow to
  # NA (which would make the comparisons below fail).
  code <- as.numeric(substr(x, 3, nchar(x) - 1))
  if (code > 0x10FFFF) {
    return(x)
  }
  i <- as.integer(code)
  # Codes 0-31: substituted through a fixed table (9-13, i.e. tab, LF, VT, FF
  # and CR, map to themselves; the rest map to printable symbols).
  if (i < 32) {
    i <- c(9688, 8968, 8969, 8970, 8971, 124, 45, 8729, 9688, 9, 10, 11, 12,
           13, 9836, 9728, 10186, 9664, 8597, 8252, 183, 8869, 8868, 8867,
           8593, 8866, 8594, 8592, 8970, 8596, 9650, 9660)[i + 1]
  }
  # Codes 127-160: substituted through a second table whose entries appear to
  # follow the Windows-1252 reading of these codes (128 -> euro sign, ...),
  # with 160 (no-break space) turned into a plain space.
  # 140 maps to 338 (capital OE ligature); 156 is the lowercase one (339).
  # NOTE(review): 130 maps to a plain comma (44) rather than U+201A; kept as
  # in the original - confirm this is intentional.
  if (i > 126 && i < 161) {
    i <- c(0, 8364, 129, 44, 402, 8222, 8230, 8224, 8225, 710, 8240, 352,
           8249, 338, 141, 381, 143, 144, 8216, 8217, 8220, 8221, 8226, 8211,
           8212, 732, 8482, 353, 8250, 339, 157, 382, 376, 32)[i - 126]
  }
  decoded <- intToUtf8(i)
  # intToUtf8() returns NA for invalid code points (e.g. surrogates); do not
  # let that turn into the literal text "NA".
  if (is.na(decoded)) x else decoded
})
# Normalise whitespace, one step per line:
# 1. blanks around a tab collapse into the tab itself
htm <- gsub("[[:blank:]]*\t[[:blank:]]*", "\t", htm)
# 2. runs of spaces collapse into one space
htm <- gsub(" {2,}", " ", htm)
# 3. leading and trailing whitespace of each element is trimmed
htm <- gsub("^[[:space:]]*|[[:space:]]*$", "", htm)
# 4. spaces/tabs before a line break and spaces after it are dropped
htm <- gsub("[ \t]*\n *", "\n", htm)
# 5. any whitespace around a blank line collapses into exactly one blank line
htm <- gsub("[[:space:]]*\n\n[[:space:]]*", "\n\n", htm)

# Final tag strip, run after character references have been decoded.
# [A-Za-z] is spelled out because the range A-z also covers the punctuation
# characters between "Z" and "a".
# NOTE(review): this also removes tag-like text that was escaped in the
# source (e.g. "&lt;b&gt;") - confirm this is intended.
htm <- gsub("<[/A-Za-z!]+[^<>]*>", "", htm)
return(htm)
}
#' Fetch a web page and return its content as plain text
#'
#' Reads all lines from \code{URL}, joins them with single spaces and passes
#' the result to \code{\link{htm2txt}}.
#'
#' @param URL A character string giving the URL of a web page.
#' @param encoding Encoding passed to \code{readLines} (e.g., "UTF-8", "latin1", "bytes", "unknown", etc.).
#' @param ... Other \code{\link{htm2txt}} arguments.
#' @return A character string containing the plain text converted from the HTML document at the URL.
#' @examples
#' text = gettxt("https://www.wikipedia.org/")
#' @export
gettxt <- function(URL, encoding = "UTF-8", ...) {
  page_lines <- readLines(URL, warn = FALSE, encoding = encoding)
  # Collapse the page into one string; line breaks become single spaces.
  page_html <- paste(page_lines, collapse = " ")
  htm2txt(page_html, ...)
}
#' Print the plain text of a web page to the console
#'
#' Retrieves the text with \code{\link{gettxt}} and writes it with \code{cat}.
#'
#' @param URL A character string giving the URL of a web page.
#' @param ... Other \code{\link{gettxt}} arguments.
#' @return None (invisible NULL).
#' @examples browse("https://www.wikipedia.org/")
#' @export
browse <- function(URL, ...) {
  page_text <- gettxt(URL, ...)
  cat(page_text)
}
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.