Nothing
#' letter.convert
#'
#' Converts and unifies most hexadecimal and some HTML-coded letters to Unicode characters. Performs CERMINE-specific error correction (inserting operators where these got lost during conversion).
#' @param x text string to process.
#' @param cermine Logical. If TRUE, CERMINE-specific error handling and letter conversion will be applied.
#' @param greek2text Logical. If TRUE, some Greek letters and special characters will be unified to a textual representation (important for extracting statistics).
#' @param warning Logical. If TRUE, prints a warning message if CERMINE-specific letter conversion was performed.
#' @return Character. Text with unified and corrected letter representation.
#' @export
#' @examples
#' x<-c("five &#x0003c; ten","five &lt; ten")
#' letter.convert(x)
# letter conversion
letter.convert<-function(x,cermine=FALSE,greek2text=FALSE,warning=TRUE){
# clean up white spaces
x<-gsub("^ *|(?<= ) | *$", "", x, perl = TRUE)
# right/left quotation mark
x<-gsub("\u2019","'",x)
x<-gsub("\u2018","'",x)
# unify "-"
x<-gsub("\u2012","-", x)
# if has hex letters convert to simplified letter
if(length(grep("&#",x))>0){
# prepare
# add 0 behind &#x if has only 4 characters between "&#x" and ";" and lowerize capture
x<-gsub("&#[Xx](....);","�\\L\\1;",x,perl=T)
# convert further upper to lower case letter in hex code
x<-gsub("&#[Xx](.....);","&#x\\L\\1;",x,perl=T)
# start conversion
## letters to unify
if(length(grep("&#",x))>0){
x<-gsub("&x0227;","\u227",x) # LATIN SMALL LETTER A WITH DOT ABOVE
x<-gsub("&x02a2;","\u2a2",x) # LATIN LETTER REVERSED GLOTTAL STOP WITH STROKE
x<-gsub("&x0215;","\u00fc",x) # LATIN SMALL LETTER U WITH DOUBLE GRAVE
x<-gsub("Ȑ","\u210",x) # LATIN CAPITAL LETTER R WITH DOUBLE GRAVE
x<-gsub("Ƞ","\u220",x) # LATIN CAPITAL LETTER N WITH LONG RIGHT LEGx<-gsub("","",x) # empty seperator
x<-gsub("⍺","\u03b1",x) # APL FUNCTIONAL SYMBOL ALPHA -> alpha
x<-gsub("~","\u007e",x) # FULLWIDTH TILDE
x<-gsub("𝝌","\u03c7",x) # bold italic small greek chi -> small greek chi
x<-gsub("𝜀","\u03b5",x) # MATHEMATICAL ITALIC SMALL EPSILON
x<-gsub("?","?",x) # question mark
x<-gsub("?","? ",x) # FULLWIDTH QUESTION MARK
x<-gsub("%","%",x) # FULLWIDTH PERCENT SIGN
x<-gsub("+","+",x) # high +
x<-gsub("±","+-",x) # plus minus
x<-gsub("+","+",x) # FULLWIDTH PLUS SIGN
x<-gsub("−","-",x) # minus
# quotes
x<-gsub("ʼ","'",x) # '
x<-gsub("'","'",x) # '
x<-gsub("“","'",x) # quotation start
x<-gsub("”","'",x) # quotation end
x<-gsub("´","'",x) # '
x<-gsub("„","'",x) # lower "
x<-gsub("’","'",x) # right single quotation mark
x<-gsub("‘","'",x) # left single quotation mark
x<-gsub("″","'",x) # double prime
x<-gsub("‴","'",x) # double prime end
x<-gsub("′","'",x) # prime
x<-gsub("ʹ","'",x) # MODIFIER LETTER PRIME
x<-gsub("ʻ","'",x) # MODIFIER LETTER TURNED COMMA
x<-gsub("´","'",x) # GREEK OXIA
x<-gsub("׳","'",x) # HEBREW PUNCTUATION GERESH
x<-gsub("΄","'",x) # Greek tonos
x<-gsub("ˈ","'",x) # MODIFIER LETTER VERTICAL LINE
x<-gsub("́","'",x) # COMBINING ACUTE ACCENT
x<-gsub("᾿","'",x) # GREEK PSILI
x<-gsub("‛","'",x) # SINGLE HIGH-REVERSED-9 QUOTATION MARK
x<-gsub("‵","'",x) # REVERSED PRIME
x<-gsub("̀","'",x) # COMBINING GRAVE ACCENT
x<-gsub("ꞌ","'",x) # LATIN SMALL LETTER SALTILLO
x<-gsub("᾽","'",x) # GREEK KORONIS
# spaces
x<-gsub(" "," ",x) # space
x<-gsub(" "," ",x) # space
x<-gsub(" "," ",x) # space
x<-gsub(" "," ",x) # em space
x<-gsub(" "," ",x) # space
x<-gsub(" "," ",x) # space
x<-gsub(" "," ",x) # space
x<-gsub(" "," ",x) # space
x<-gsub(" "," ",x) # space
x<-gsub(" "," ",x) # thin space
x<-gsub(" "," ",x) # non breaking space
x<-gsub("
"," ",x) # line seperator
x<-gsub("
"," ",x) # PARAGRAPH SEPARATOR
x<-gsub(" "," ",x) # no break space
x<-gsub(" "," ",x) # IDEOGRAPHIC SPACE
x<-gsub(" "," ",x) # MEDIUM MATHEMATICAL SPACE
x<-gsub("⁢"," ",x) # INVISIBLE TIMES
x<-gsub(" "," ",x) # hair space
x<-gsub("ᅟ"," ",x) # HANGUL CHOSEONG FILLER
x<-gsub("​","",x) # ZERO WIDTH SPACE
x<-gsub("‌","",x) # ZERO WIDTH NON-JOINER
x<-gsub("‬","",x) # POP DIRECTIONAL FORMATTING
x<-gsub("‍","",x) # ZERO WIDTH JOINER
x<-gsub("‌","",x) # ZERO WIDTH NON-JOINER
x<-gsub("‎","",x) # LEFT-TO-RIGHT MARK
x<-gsub(" ","",x) # RIGHT-TO-LEFT MARK
x<-gsub("⁡","",x) # FUNCTION APPLICATION
}
if(length(grep("&#",x))>0){
# hyphens
x<-gsub("‑","-",x) # non breaking hyphen
x<-gsub("–","-",x) # until/dash
x<-gsub("‐","-",x) # hyphen/dash
x<-gsub("­","-",x) # soft hyphen
x<-gsub("—","-",x) # mdash -
x<-gsub("‒","-",x) # FIGURE DASH
x<-gsub("♦","-",x) # bullet point like hyper reference
x<-gsub("―","-",x) #
x<-gsub("˙","-",x) # ˙
x<-gsub("─","-",x) # BOX DRAWINGS LIGHT HORIZONTA
x<-gsub("•"," - ",x) # bullet point
x<-gsub("◦"," - ",x) # white bullet
x<-gsub("▪"," - ",x) # bullet square
x<-gsub("∙"," - ",x) # BULLET OPERATOR (in categories)
x<-gsub("●"," - ",x) # BLACK CIRCLE (in categories)
x<-gsub("○"," - ",x) # circle
x<-gsub("·"," - ",x) # mitdot
x<-gsub("⋅"," - ",x) # sdot
x<-gsub("˗","-",x) # MODIFIER LETTER MINUS SIGN
x<-gsub("-","-",x) # FULLWIDTH HYPHEN-MINUS
x<-gsub("一","-",x) # CJK UNIFIED IDEOGRAPH-4E00
# special signs
x<-gsub("×","*",x) # multiplied by
x<-gsub("∗","*",x) # times
x<-gsub("☆","*",x) # white star
x<-gsub("★","*",x) # black star
x<-gsub("*","*",x) # asterix
x<-gsub("⋆","*",x) # STAR OPERATOR
x<-gsub(")",")",x) # )
x<-gsub("(","(",x) # (
x<-gsub("【",")",x) # LEFT BLACK LENTICULAR BRACKET
x<-gsub("】",")",x) # RIGHT BLACK LENTICULAR BRACKET
x<-gsub(")",") ",x) # FULLWIDTH RIGHT PARENTHESIS
x<-gsub("("," (",x) # FULLWIDTH LEFT PARENTHESIS
x<-gsub("̂","^",x) # COMBINING CIRCUMFLEX ACCENT
x<-gsub("˄","^",x) # MODIFIER LETTER UP ARROWHEAD
x<-gsub("ˆ","^",x) # circ
x<-gsub("fi","fi",x) # fi ligature
x<-gsub("ffl","ffl",x) # ffl ligature
x<-gsub("∼","~",x) # tilde
x<-gsub("˜","~",x) # tilde
x<-gsub("≡","=",x) # identical to
x<-gsub("≅","=~",x) # tilde full equal
x<-gsub("&","& ",x) # &
x<-gsub("#","# ",x) # # , num
x<-gsub("…","...",x) # ...
x<-gsub("→","->",x)# rightwards arrow
x<-gsub("▸","->",x)# rightwards arrow
x<-gsub("←","<-",x)# leftwards arrow
x<-gsub("꞉",":",x) # MODIFIER LETTER COLON
x<-gsub(":",":",x) # FULLWIDTH COLON
x<-gsub("∶",":",x) # ratio
x<-gsub("=","=",x) # equal sign
x<-gsub("<","<",x) # less than
x<-gsub("〈","<",x) # less than
x<-gsub("〈","<",x) # less than
x<-gsub("≪","<<",x) # much less than
x<-gsub("≤","<=",x) # less equal than
x<-gsub("⩽","<=",x) # less equal than
x<-gsub(">",">",x) # greater than
x<-gsub("〉",">",x) # greater than
x<-gsub("〉",">",x) # greater than
x<-gsub("⩾",">=",x) # greater equal than
x<-gsub("≧",">=",x) # greater equal than
x<-gsub("≫",">>",x) # much greater than
x<-gsub("≠","!=",x) # not greater
x<-gsub("≥","<",x) # not greater equal than
x<-gsub("<","<",x) # FULLWIDTH LESS-THAN SIGN
x<-gsub("=","=",x) # FULLWIDTH EQUALS SIGN
x<-gsub(">",">",x) # FULLWIDTH GREATER-THAN SIGN
x<-gsub("›",">",x) # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
x<-gsub("‹","<",x) # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
x<-gsub("►","->",x) # pointer right: ->
x<-gsub("«","<<",x) #
x<-gsub("»",">>",x) #
x<-gsub("⤒","->",x) # rightwards arrow ->
x<-gsub("˂","<",x) # MODIFIER LETTER LEFT ARROWHEAD
x<-gsub("∕","/",x) # divided by
x<-gsub("÷","/",x) # DIVISION SIGN
x<-gsub("⁄","/",x) # FRACTION SLASH
x<-gsub("/","/",x) # solidus /
x<-gsub("½"," 1/2",x) # one half
x<-gsub("¼"," 1/4",x) # one fourth
x<-gsub("¾"," 3/4",x) # Three quarter
}
if(length(grep("&#",x))>0){
x<-gsub("h","h",x) # LATIN SMALL LETTER H
x<-gsub("t","t",x) # LATIN SMALL LETTER T
x<-gsub("ⅹ","x",x) # SMALL ROMAN NUMERAL TEN
x<-gsub("&x0030;","0",x) # DIGIT zero
x<-gsub("&x0031;","1",x) # DIGIT one
x<-gsub("&x0032;","2",x) # DIGIT two
x<-gsub("&x0033;","3",x) # DIGIT three
x<-gsub("&x0034;","4",x) # DIGIT four
x<-gsub("&x0035;","5",x) # DIGIT five
x<-gsub("&x0036;","6",x) # DIGIT six
x<-gsub("&x0037;","7",x) # DIGIT seven
x<-gsub("&x0038;","8",x) # DIGIT eight
x<-gsub("&x0039;","9",x) # DIGIT nine
x<-gsub("&x0044;","D",x) # LATIN CAPITAL LETTER D
x<-gsub("С","C",x) # CYRILLIC CAPITAL LETTER ES
x<-gsub("І","I",x) # CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I
x<-gsub("с","c",x) # CYRILLIC SMALL LETTER ES
x<-gsub(",",", ",x) # FULLWIDTH COMMA
x<-gsub("،",",",x) # ARABIC COMMA
x<-gsub("А","A",x) # CYRILLIC CAPITAL LETTER A
x<-gsub("Q","Q",x) # LATIN CAPITAL LETTER Q
x<-gsub("F","F",x) # LATIN CAPITAL LETTER F
x<-gsub("о","o",x) # CYRILLIC SMALL LETTER O
x<-gsub("x","x",x) # LATIN SMALL LETTER X
x<-gsub("Ӏ","I",x) # CYRILLIC LETTER PALOCHKA
x<-gsub("D","D",x) # LATIN CAPITAL LETTER D
x<-gsub("Е","E",x) # CYRILLIC CAPITAL LETTER IE
x<-gsub("Η","H",x) # GREEK CAPITAL LETTER ETA
x<-gsub("Р","P",x) # CYRILLIC CAPITAL LETTER ER
x<-gsub("ℱ","F",x) # SCRIPT CAPITAL F
x<-gsub("♂"," MALE ",x) # Male sign
x<-gsub("♀"," FEMALE ",x) # Female sign
x<-gsub("S","'*S?",x) # superscript *S
x<-gsub("Α","A",x) # A
x<-gsub("ℓ","l",x) # Latin letter ℓ -> l
x<-gsub("Ι","I",x) # Capital I
x<-gsub("ℜ","R",x) # BLACK-LETTER CAPITAL R
x<-gsub("♯","#",x) # MUSIC SHARP SIGN
}
if(length(grep("&#",x))>0){
x<-gsub("I","I",x) # LATIN CAPITAL LETTER I
x<-gsub("V","V",x) # LATIN CAPITAL LETTER V
x<-gsub("X","X",x) # LATIN CAPITAL LETTER X
x<-gsub("ǀ","I",x) # LATIN LETTER DENTAL CLICK
x<-gsub("˃",">",x) # MODIFIER LETTER RIGHT ARROWHEAD
x<-gsub(";",";",x) # GREEK QUESTION MARK look alike semicolun
x<-gsub("а","a",x) # CYRILLIC SMALL LETTER A
x<-gsub("В","B",x) # CYRILLIC CAPITAL LETTER VE
x<-gsub("Н","H",x) # CYRILLIC CAPITAL LETTER EN
x<-gsub("і","i",x) # CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
x<-gsub("ւ","L",x) # ARMENIAN SMALL LETTER YIWN
x<-gsub("⅓","1/3",x) # VULGAR FRACTION ONE THIRD
x<-gsub("⅕","1/5",x) # VULGAR FRACTION ONE FIFTH
x<-gsub("⅛","1/8",x) # VULGAR FRACTION ONE EIGHTH
x<-gsub("s","s",x) # FULLWIDTH LATIN SMALL LETTER S
x<-gsub("ď","d'",x) # LATIN SMALL LETTER D WITH CARON
x<-gsub("ť","t'",x) # LATIN SMALL LETTER T WITH CARON
x<-gsub("у","y",x) # CYRILLIC SMALL LETTER U
x<-gsub("Ο","O",x) # GREEK CAPITAL LETTER OMICRON
x<-gsub("М","M",x) # CYRILLIC CAPITAL LETTER EM
x<-gsub("Т","T",x) # CYRILLIC CAPITAL LETTER TE
x<-gsub("Х","X",x) # CYRILLIC CAPITAL LETTER HA
x<-gsub("е","e",x) # CYRILLIC SMALL LETTER IE
x<-gsub("fl","fl",x) # LATIN SMALL LIGATURE FL
x<-gsub("Ľ","L'",x) # LATIN CAPITAL LETTER L WITH CARON
x<-gsub("ľ","l'",x) # LATIN SMALL LETTER L WITH CARON
x<-gsub("ơ","o",x) # LATIN SMALL LETTER O WITH HORN
x<-gsub("¹","^1",x) # SUPERSCRIPT ONE
x<-gsub("²","^2",x) # SUPERSCRIPT TWO
x<-gsub("³","^3",x) # SUPERSCRIPT THREE
x<-gsub("Ε","E",x) # GREEK CAPITAL LETTER EPSILON
x<-gsub("Ν","N",x) # GREEK CAPITAL LETTER NU
x<-gsub("Κ","K",x) # GREEK CAPITAL LETTER KAPPA
x<-gsub("Μ","M",x) # GREEK CAPITAL LETTER MU
x<-gsub("ο","o",x) # GREEK SMALL LETTER OMICRON
x<-gsub("О","O",x) # CYRILLIC CAPITAL LETTER O
}
## unify hex
if(length(grep("&#",x))>0){
x<-gsub("═","\u003d",x) # BOX DRAWINGS DOUBLE HORIZONTAL
x<-gsub("│","\u007c",x) # BOX DRAWINGS LIGHT VERTICAL
x<-gsub("ő","\u00F6 ",x) # small ö ő
}
## convert all other hexadecimals to unicode at once
if(length(grep("&#x[10]",x))>0){
i<-grep("&#x[10]",x)
x[i]<-unlist(lapply(x[i],udecode))
}
######################################################################
######################################################################
######################################################################
## OLD: manual conversion
if(length(grep("&#",x))>0){
x<-gsub("Ă","\u0102",x) # LATIN CAPITAL LETTER A WITH BREVE
x<-gsub("Á","\u00C1",x) # Á
x<-gsub("Á","\u00c1",x) # Á
x<-gsub("Ä","\u00c4",x) # Ä
x<-gsub("Æ","\u00c6",x) # AE Æ
x<-gsub("ę","\u0119",x) # e with little thingy: ę
x<-gsub("ş","\u015f",x) # s with little thingy: ş
x<-gsub("ą","\u0105",x) # a with little thingy: ą
x<-gsub("ä","\u00e4",x) # ä
x<-gsub("Ü","\u00dc",x) # ü
x<-gsub("ü","\u00fc",x) # ü
x<-gsub("ù","\u00f9",x) # ù
x<-gsub("ß","\u00df",x) # ß
x<-gsub("á","\u00e1",x) # á
x<-gsub("à","\u00e0",x) # à
x<-gsub("â","\u00e2",x) # a with hat
x<-gsub("ă","\u0103",x) # a with little u on top
x<-gsub("ā","\u0101",x) # a with bar
x<-gsub("å","\u00e5",x) # a with ° on top
x<-gsub("æ","\u00E6",x) # a with e connected: æ
x<-gsub("ż","\u017C",x) # z with dot on top
x<-gsub("ż","\u017C",x) # z with dot on top
x<-gsub("ś","\u015B",x) # s with ` on top`
x<-gsub("š","\u0161",x) # s with turned ^ on top: š
x<-gsub("č","\u010d",x) # c with turned ^ on top: č
x<-gsub("Ç","\u00C7",x) # C with little thingy: Ç
x<-gsub("Č","\u010C",x) # C with turned ^ on top
x<-gsub("ć","\u0107",x) # c with ` on top
x<-gsub("Ĉ","\u0108",x) # C with ^ on top
x<-gsub("ĉ","\u0109",x) # c with hat
x<-gsub("ç","\u00e7",x) # french c: ç
x<-gsub("Å","\u00c5",x) # Angstrom letter: capital A with °
x<-gsub("ã","\u00e3",x) # a with ~
x<-gsub("è","\u00e8",x) # è
x<-gsub("é","\u00e9",x) # é
x<-gsub("ī","\u011b",x) # e with turned ^ on top
x<-gsub("Ż","\u017b",x) # Z with dot on top
x<-gsub("Ó","\u00d3",x) # Ó O acute
x<-gsub("ė","\u0117",x) # e with point on top
x<-gsub("É","\u00C9",x) # É, e acute
x<-gsub("ê","\u00ea",x) # ê
x<-gsub("í","\u00ed",x) # í i with '
x<-gsub("ì","\u00ec",x) # ì i with `
x<-gsub("Í","\u00cd",x) # Í
x<-gsub("İ","\u0130",x) # I with a dot on top
x<-gsub("ř","\u0159",x) # r with turned ^on top ; ř
x<-gsub("ʁ","\u02B6",x) # LATIN LETTER SMALL CAPITAL INVERTED R
x<-gsub("ž","\u017E",x) # z with turned ^ on top: ž
x<-gsub("Ś","\u015A",x) # with ' on top
x<-gsub("Š","\u0160",x) # S with turned ^ on top: Š
x<-gsub("Ş","\u015E",x) # S with little thingy below: Ş
x<-gsub("Ṫ","\u1E6A",x) # LATIN CAPITAL LETTER T WITH DOT ABOVE
x<-gsub("ț","\u021B",x) # t with ' below
x<-gsub("ū","\u016B",x) # u with bar
x<-gsub("ș","\u0219",x) # s with 'below
x<-gsub("Ł","\u0141",x) # L with little / Ł
x<-gsub("ł","\u0141",x) # l with little / ł
x<-gsub("ï","\u00EF",x) # iuml - i with two dots
x<-gsub("ı","\u0131",x) # ı
x<-gsub("ğ","\u011F",x) # g with small u on top: ğ
x<-gsub("Ö","\u00D6",x) # Ö
x<-gsub("ö","\u00F6",x) # ö
x<-gsub("ø","\u00F8",x) # ø: danish ö
x<-gsub("œ","\u0153",x) # oe: œ
x<-gsub("Œ","\u0152",x) # OE: Œ
x<-gsub("Ø","\u00D8",x) # Ø: danish Ö
x<-gsub("ó","\u00F3",x) # ó
x<-gsub("õ","\u00F5",x) # o tilde on top
x<-gsub("ò","\u00F2",x) # ò
x<-gsub("ô","\u00F4",x) # ô
x<-gsub("ý","\u00FD",x) # ý
x<-gsub("Ŷ","\u0176",x) # Y hat
x<-gsub("Υ","\u03a5",x) # Y
x<-gsub("ź","\u017A",x) # z with ' on top
x<-gsub("ú","\u00FA",x) # u with ' on top
x<-gsub("ů","\u016F",x) # u with ° on top
x<-gsub("ñ","\u00F1",x) # n-je ñ
x<-gsub("単","\u00F1",x) # n-je ñ
x<-gsub("ń","\u0144",x) # n with ' on top
x<-gsub("ë","\u00EB",x) # e with two dots on top
# special signs
x<-gsub("®","\u00ae",x) # REG-sign
x<-gsub("©","\u00a9",x) # COPYRIGHT-sign
x<-gsub("™","\u2122",x) # TradeMark-sign
x<-gsub("€","\u20ac",x) # Euro-sign
x<-gsub("£","\u00a3",x) # Pound-sign
x<-gsub("¥","\u00a5",x) # Yen-sign
x<-gsub("$","\u0024",x) # Dollar-sign
x<-gsub("≈","\u2248",x) # approximate sign
x<-gsub("∞","\u221e",x) # infinite sign
x<-gsub("①","\u2460",x) # 1 in circle
x<-gsub("②","\u2461",x) # 2 in circle
x<-gsub("③","\u2462",x) # 3 in circle
x<-gsub("④","\u2463",x) # 4 in circle
x<-gsub("⑤","\u2464",x) # 5 in circle
x<-gsub("%","\u0025",x) # percent
x<-gsub("‰","\u2030",x) # per mille
x<-gsub("≍","\u003d",x) # EQUIVALENT TO
x<-gsub("|","\u007c",x) # vertical line
x<-gsub("к","\u043a",x) # kyrillik letter ka
x<-gsub("ι","\u03b9",x) # greek letter iota
x<-gsub("Ã","\u00c3",x) # A with ^
x<-gsub("î","\u00ee",x) # i with ^
x<-gsub("ō","\u014d",x) # o with bar
x<-gsub("Ô","\u00d4",x) # O with ^
x<-gsub("¿","\u00BF",x) # spanish start of question mark ¿
x<-gsub("α","\u03b1",x) # small greek alpha
x<-gsub("ɑ","\u0251",x) # small latin letter alpha
x<-gsub("∝","\u221d",x) # PROPORTIONAL TO
x<-gsub("β","\u03b2",x) # small greek beta
x<-gsub("Β","\u0392",x) # capital greek beta
x<-gsub("χ","\u03c7",x) # small greek chi
x<-gsub("Χ","\u03a7",x) # capital greek Chi
x<-gsub("φ","\u03c6",x) # small greek phi
x<-gsub("Φ","\u03a6",x) # capital greek Phi
x<-gsub("Ф","\u0424",x) # CYRILLIC CAPITAL LETTER EF -> looks like Phi
x<-gsub("ϕ","\u03d5",x) # capital greek Phi
x<-gsub("ɸ","\u0278",x) # capital greek Phi
x<-gsub("Ψ","\u03a8",x) # capital greek Psi
x<-gsub("ψ","\u03c8",x) # small greek Psi
x<-gsub("Γ","\u0393",x) # capital greek Gamma
x<-gsub("π","\u03c0",x) # small greek pi
x<-gsub("ί","\u03af",x) # GREEK SMALL LETTER IOTA WITH TONOS
x<-gsub("μ","\u03bc",x) # small greek mu
x<-gsub("µ","\u00b5",x) # micro/small greek mu
x<-gsub("Σ","\u03a3",x) # Sum/capital greek sigma
x<-gsub("∑","\u2211",x) # Sum/capital greek sigma
x<-gsub("ʊ","\u028a",x) # Latin upsilon
x<-gsub("Ʊ","\u01b1",x) # LATIN CAPITAL LETTER UPSILON
x<-gsub("υ","\u03c5",x) # GREEK SMALL LETTER UPSILON
x<-gsub("ɔ","\u0254",x) # LATIN SMALL LETTER OPEN O
x<-gsub("σ","\u03c3",x) # small greek sigma
x<-gsub("ς","\u03c2",x) # GREEK SMALL LETTER FINAL SIGMA
x<-gsub("ω","\u03c9",x) # small greek omega
x<-gsub("ώ","\u03ce",x) # GREEK SMALL LETTER OMEGA WITH TONOS
x<-gsub("λ","\u03bb",x) # small greek lamda
x<-gsub("δ","\u03b4",x) # small greek delta
x<-gsub("∆","\u2206",x) # capital greek delta (increment) triangle
x<-gsub("▵","\u25b5",x) # utri (triangle)
x<-gsub("△","\u25b3",x) # WHITE UP-POINTING TRIANGLE
x<-gsub("⊿","\u22bf",x) # italic triangle
x<-gsub("Δ","\u0394",x) # delta-sign
x<-gsub("ε","\u03b5",x) # small greek epsilon (epsi)
x<-gsub("ɛ","\u025b",x) # LATIN SMALL LETTER OPEN E
x<-gsub("έ","\u03ad",x) # GREEK SMALL LETTER EPSILON WITH TONOS
x<-gsub("κ","\u03ba",x) # small greek kappa
x<-gsub("ϰ","\u03f0",x) # cursive kappa -> used as Chi
x<-gsub("γ","\u03b3",x)# small greek gamma
x<-gsub("ξ","\u03be",x) # small greek Xi
x<-gsub("Ξ","\u039e",x) # GREEK CAPITAL LETTER XI
x<-gsub("Τ","\u03A4",x) # capital greek Tau
x<-gsub("τ","\u03c4",x) # small greek tau
x<-gsub("ʌ","\u028c",x) # LATIN SMALL LETTER TURNED V
x<-gsub("ρ","\u03c1",x) # small greek rho
x<-gsub("θ","\u03b8",x) # small greek theta
x<-gsub("ϑ","\u03d1",x) # small greek theta
x<-gsub("Θ","\u0398",x) # capital greek Theta
x<-gsub("𝜃","\ud703",x) # capital greek Theta
x<-gsub("ζ","\u03b6",x) # capital greek Zeta
x<-gsub("Ω","\u03a9",x) # capital greek Omega (Ohm)
x<-gsub("Ω","\u2126",x) # capital greek Omega (Ohm)
x<-gsub("ν","\u03bd",x) # GREEK SMALL LETTER NU
x<-gsub("Λ","\u039b",x) # CAPITAL GREEK Lambda
x<-gsub("η","\u03b7",x) # eta
x<-gsub("ŋ","\u014b",x) # engma for eta
x<-gsub("ɳ","\u0273",x) # eta
x<-gsub("Ƞ","\u0220",x) # eta
x<-gsub("ƞ","\u019e",x) # eta
x<-gsub("°","\u00B0",x) # degree sign
x<-gsub("℃","\u2103",x) # degree Celsius
x<-gsub("º","\u00ba",x) # MASCULINE ORDINAL INDICATOR
x<-gsub("∘","\u2218",x) # RING OPERATOR
x<-gsub("†","\u2020",x) # daggar (died)
x<-gsub("‡","\u2021",x) # double daggar (died)
x<-gsub("⃞","\u20de",x) # square
x<-gsub("□","\u25a1",x) # white square
x<-gsub("∥","\u2225",x) # parallel to
x<-gsub("∨","\u2228",x) # ∨
x<-gsub("√","\u221a",x) # square root
x<-gsub("¯","\u00af",x) # superscripted ¯
x<-gsub("∂","\u2202",x) # ∂
x<-gsub("⋯","\u22EF",x) # MIDLINE HORIZONTAL ELLIPSIS
x<-gsub("‖","\u2016",x) # Verbar
x<-gsub("ℋ","\u210b",x) # HilbertSpace
x<-gsub("∈","\u2208",x) # Element of
x<-gsub("∩","\u2229",x) # INTERSECTION
x<-gsub("ℝ","\u211d",x) # DOUBLE-STRUCK CAPITAL R
x<-gsub("∅","\u2205",x) # emptyset
x<-gsub("∏","\u220f",x) # Sum Product
x<-gsub("∫","\u222b",x) # Integral
x<-gsub("ʃ","\u0283",x) # LATIN SMALL LETTER ESH/Integral
x<-gsub("ĸ","\u0138",x) # ĸ
x<-gsub("ϵ","\u03f5",x) # ϵ (Element of)
x<-gsub("∣","\u2223",x) # ∣
x<-gsub("∉","\u2209",x) # not element of
x<-gsub("∧","\u2227",x) # ∧
x<-gsub("¬","\u00ac",x) # NOT-sign
x<-gsub("∀","\u2200",x) # A upside down: for all sign
x<-gsub("↔","\u2194",x) # Left right arrow
x<-gsub("↓","\u2193",x) # downwards arrow
x<-gsub("↑","\u2191",x) # upwards arrow
x<-gsub("ɜ","\u025c",x) # LATIN SMALL LETTER REVERSED OPEN E
x<-gsub("✓","\u2713",x) # Hook
x<-gsub("♰","\u2670",x) # WEST SYRIAC CROSS
x<-gsub("¶","\u00b6",x) # English name pilcrow or paragraph mark
x<-gsub("_","\u005f",x) # low line
x<-gsub("¢","\u00a2",x) # cent sign
x<-gsub("¨","\u00a8",x) # DIAERESIS
x<-gsub("Ē","\u0112",x) # LATIN CAPITAL LETTER E WITH MACRON
x<-gsub("ƒ","\u0192",x) # LATIN SMALL LETTER F WITH HOOK
x<-gsub("Ɣ","\u0194",x) # LATIN CAPITAL LETTER GAMMA
x<-gsub("Ʃ","\u01a9",x) # LATIN CAPITAL LETTER ESH
x<-gsub("ƿ","\u01bf",x) # LATIN LETTER WYNN
x<-gsub("ʂ","\u0282",x) # LATIN SMALL LETTER S WITH HOOK
x<-gsub("ˇ","\u02c7",x) # CARON
x<-gsub("̅","\u0305",x) # COMBINING OVERLINE
x<-gsub("̇","\u0307",x) # COMBINING DOT ABOVE
x<-gsub("Π","\u03a0",x) # GREEK CAPITAL LETTER PI
x<-gsub("Τ","\u03a4",x) # GREEK CAPITAL LETTER TAU
x<-gsub("ή","\u03ae",x) # GREEK SMALL LETTER ETA WITH TONOS
x<-gsub("ϐ","\u03d0",x) # GREEK BETA SYMBOL
x<-gsub("ϒ","\u03d2",x) # GREEK UPSILON WITH HOOK SYMBOL
x<-gsub("ϖ","\u03d6",x) # GREEK PI SYMBOL
x<-gsub("ᵨ","\u1d68",x) # GREEK SUBSCRIPT SMALL LETTER RHO
x<-gsub("ṗ","\u1e57",x) # LATIN SMALL LETTER P WITH DOT ABOVE
x<-gsub("ή","\u1f75",x) # GREEK SMALL LETTER ETA WITH OXIA
x<-gsub("‥","\u2025",x) # TWO DOT LEADER
x<-gsub("‾","\u203e",x) # OVERLINE
x<-gsub("ℕ","\u2115",x) # DOUBLE-STRUCK CAPITAL N
x<-gsub("ℙ","\u2119",x) # DOUBLE-STRUCK CAPITAL P
x<-gsub("ℤ","\u2124",x) # DOUBLE-STRUCK CAPITAL Z
x<-gsub("↦","\u21a6",x) # RIGHTWARDS ARROW FROM BAR
x<-gsub("⇒","\u21d2",x) # RIGHTWARDS DOUBLE ARROW
x<-gsub("⇔","\u21d4",x) # LEFT RIGHT DOUBLE ARROW
x<-gsub("∃","\u2203",x) # THERE EXISTS
x<-gsub("∇","\u2207",x) # NABLA
x<-gsub("∪","\u222a",x) # union set operator
x<-gsub("≃","\u2243",x) # ASYMPTOTICALLY EQUAL TO
x<-gsub("≐","\u2250",x) # APPROACHES THE LIMIT
x<-gsub("≔","\u2254",x) # COLON EQUALS
x<-gsub("≜","\u225c",x) # DELTA EQUAL TO
x<-gsub("≦","\u2266",x) # LESS-THAN OVER EQUAL TO
x<-gsub("≺","\u227a",x) # PRECEDES
x<-gsub("≻","\u227b",x) # SUCCEEDS
x<-gsub("⊂","\u2282",x) # SUBSET OF
x<-gsub("⊆","\u2286",x) # SUBSET OF OR EQUAL TO
x<-gsub("⊕","\u2295",x) # CIRCLED PLUS
x<-gsub("⊗","\u2297",x) # CIRCLED TIMES
x<-gsub("⊙","\u2299",x) # CIRCLED DOT OPERATOR
x<-gsub("⊤","\u22a4",x) # DOWN TACK
x<-gsub("⊥","\u22a5",x) # UP TACK
x<-gsub("⊳","\u22b3",x) # CONTAINS AS NORMAL SUBGROUP
x<-gsub("⊺","\u22ba",x) # INTERCALATE
x<-gsub("⋀","\u22c0",x) # N-ARY LOGICAL AND
x<-gsub("⋃","\u22c3",x) # N-ARY UNION
x<-gsub("⋮","\u22ee",x) # VERTICAL ELLIPSIS
x<-gsub("⋱","\u22f1",x) # DOWN RIGHT DIAGONAL ELLIPSIS
x<-gsub("⌋","\u230b",x) # RIGHT FLOOR
x<-gsub("⌝","\u231d",x) # TOP RIGHT CORNER
x<-gsub("⌢","\u2322",x) # FROWN
x<-gsub("▷","\u25b7",x) # WHITE RIGHT-POINTING TRIANGLE
x<-gsub("▹","\u25b9",x) # WHITE RIGHT-POINTING SMALL TRIANGLE
x<-gsub("♡","\u2661",x) # WHITE HEART SUIT
x<-gsub("⟩","\u27e9",x) # MATHEMATICAL RIGHT ANGLE BRACKET
x<-gsub("⟶","\u27f6",x) # LONG RIGHTWARDS ARROW
x<-gsub("ꙍ","\ua64d",x) # CYRILLIC SMALL LETTER BROAD OMEGA
x<-gsub("︷","\ufe37",x) # PRESENTATION FORM FOR VERTICAL LEFT CURLY BRACKET
x<-gsub("︸","\ufe38",x) # PRESENTATION FORM FOR VERTICAL RIGHT CURLY BRACKET
x<-gsub("§","\u00a7",x) # SECTION SIGN
x<-gsub("¸","\u00b8",x) # CEDILLA
x<-gsub("Â","\u00c2",x) # LATIN CAPITAL LETTER A WITH CIRCUMFLEX
x<-gsub("Î","\u00ce",x) # LATIN CAPITAL LETTER I WITH CIRCUMFLEX
x<-gsub("Ý","\u00dd",x) # LATIN CAPITAL LETTER Y WITH ACUTE
x<-gsub("Ā","\u0100",x) # LATIN CAPITAL LETTER A WITH MACRON
x<-gsub("ē","\u0113",x) # LATIN SMALL LETTER E WITH MACRON
x<-gsub("ě","\u011b",x) # LATIN SMALL LETTER E WITH CARON
x<-gsub("Ĥ","\u0124",x) # LATIN CAPITAL LETTER H WITH CIRCUMFLEX
x<-gsub("ŵ","\u0175",x) # LATIN SMALL LETTER W WITH CIRCUMFLEX
x<-gsub("Ɛ","\u0190",x) # LATIN CAPITAL LETTER OPEN E
x<-gsub("Ƙ","\u0198",x) # LATIN CAPITAL LETTER K WITH HOOK
x<-gsub("Ɵ","\u019f",x) # LATIN CAPITAL LETTER O WITH MIDDLE TILDE
x<-gsub("ǎ","\u01ce",x) # LATIN SMALL LETTER A WITH CARON
x<-gsub("Ȓ","\u0212",x) # LATIN CAPITAL LETTER R WITH INVERTED BREVE
x<-gsub("Ʌ","\u0245",x) # LATIN CAPITAL LETTER TURNED V
x<-gsub("ɕ","\u0255",x) # LATIN SMALL LETTER C WITH CURL
x<-gsub("ɡ","\u0261",x) # LATIN SMALL LETTER SCRIPT G
x<-gsub("ɣ","\u0263",x) # LATIN SMALL LETTER GAMMA
x<-gsub("ɤ","\u0264",x) # LATIN SMALL LETTER RAMS HORN
x<-gsub("ɾ","\u027e",x) # LATIN SMALL LETTER R WITH FISHHOOK
x<-gsub("ɿ","\u027f",x) # LATIN SMALL LETTER REVERSED R WITH FISHHOOK
x<-gsub("ʅ","\u0285",x) # LATIN SMALL LETTER SQUAT REVERSED ESH
x<-gsub("ʝ","\u029d",x) # LATIN SMALL LETTER J WITH CROSSED-TAIL
x<-gsub("ʤ","\u02a4",x) # LATIN SMALL LETTER DEZH DIGRAPH
x<-gsub("˘","\u02d8",x) # BREVE
x<-gsub("˚","\u02da",x) # RING ABOVE
x<-gsub("̃","\u0303",x) # COMBINING TILDE
x<-gsub("К","\u041a",x) # CYRILLIC CAPITAL LETTER KA
x<-gsub("П","\u041f",x) # CYRILLIC CAPITAL LETTER PE
x<-gsub("р","\u0440",x) # CYRILLIC SMALL LETTER ER look alike p
x<-gsub("х","\u0445",x) # CYRILLIC SMALL LETTER HA
x<-gsub("ᴂ","\u1d02",x) # LATIN SMALL LETTER TURNED AE
x<-gsub("ᴦ","\u1d26",x) # GREEK LETTER SMALL CAPITAL GAMMA
x<-gsub("ᴧ","\u1d27",x) # GREEK LETTER SMALL CAPITAL LAMDA
x<-gsub("ᵒ","\u1d52",x) # MODIFIER LETTER SMALL O
x<-gsub("ᵡ","\u1d61",x) # MODIFIER LETTER SMALL CHI
x<-gsub("ṽ","\u1e7d",x) # LATIN SMALL LETTER V WITH TILDE
x<-gsub("ẏ","\u1e8f",x) # LATIN SMALL LETTER Y WITH DOT ABOVE
x<-gsub("ẞ","\u1e9e",x) # LATIN CAPITAL LETTER SHARP S
x<-gsub("⁰","\u2070",x) # SUPERSCRIPT ZERO
x<-gsub("₀","\u2080",x) # SUBSCRIPT ZERO
x<-gsub("₩","\u20a9",x) # WON SIGN
x<-gsub("⃗","\u20d7",x) # COMBINING RIGHT ARROW ABOVE
x<-gsub("ℂ","\u2102",x) # DOUBLE-STRUCK CAPITAL C
x<-gsub("ℏ","\u210f",x) # PLANCK CONSTANT OVER TWO PI
x<-gsub("ℐ","\u2110",x) # SCRIPT CAPITAL I
x<-gsub("ℑ","\u2111",x) # BLACK-LETTER CAPITAL I
x<-gsub("ℒ","\u2112",x) # SCRIPT CAPITAL L
x<-gsub("℘","\u2118",x) # SCRIPT CAPITAL P
x<-gsub("ℛ","\u211b",x) # SCRIPT CAPITAL R
x<-gsub("℧","\u2127",x) # INVERTED OHM SIGN
x<-gsub("ℳ","\u2133",x) # SCRIPT CAPITAL M
x<-gsub("↘","\u2198",x) # SOUTH EAST ARROW
x<-gsub("∊","\u220a",x) # SMALL ELEMENT OF
x<-gsub("∓","\u2213",x) # MINUS-OR-PLUS SIGN
x<-gsub("⊲","\u22b2",x) # NORMAL SUBGROUP OF
x<-gsub("⋂","\u22c2",x) # N-ARY INTERSECTION
x<-gsub("⌀","\u2300",x) # DIAMETER SIGN
x<-gsub("⌉","\u2309",x) # RIGHT CEILING
x<-gsub("⌊","\u230a",x) # LEFT FLOOR
x<-gsub("⟨","\u27e8",x) # MATHEMATICAL LEFT ANGLE BRACKET
x<-gsub("⨁","\u2a01",x) # N-ARY CIRCLED PLUS OPERATOR
x<-gsub("⨉","\u2a09",x) # N-ARY TIMES OPERATOR
x<-gsub("。","\u3002",x) # IDEOGRAPHIC FULL STOP
x<-gsub("¡","\u00a1",x) # spanish INVERTED EXCLAMATION MARK
x<-gsub("Ê","\u00ca",x) # LATIN CAPITAL LETTER E WITH CIRCUMFLEX
x<-gsub("Ĺ","\u0139",x) # LATIN CAPITAL LETTER L WITH ACUTE
x<-gsub("ʔ","\u0294",x) # LATIN LETTER GLOTTAL STOP
x<-gsub("Ζ","\u0396",x) # GREEK CAPITAL LETTER ZETA
x<-gsub("Є","\u0404",x) # CYRILLIC CAPITAL LETTER UKRAINIAN IE
x<-gsub("ẇ","\u1e87",x) # LATIN SMALL LETTER W WITH DOT ABOVE
x<-gsub("₁","\u2081",x) # SUBSCRIPT ONE
x<-gsub("ℰ","\u2130",x) # SCRIPT CAPITAL E
x<-gsub("↗","\u2197",x) # NORTH EAST ARROW
x<-gsub("⇀","\u21c0",x) # RIGHTWARDS HARPOON WITH BARB UPWARDS
x<-gsub("⇐","\u21d0",x) # LEFTWARDS DOUBLE ARROW
x<-gsub("∄","\u2204",x) # THERE DOES NOT EXIST
x<-gsub("∋","\u220b",x) # CONTAINS AS MEMBER
x<-gsub("∍","\u220d",x) # SMALL CONTAINS AS MEMBER
x<-gsub("∎","\u220e",x) # end OF PROOF
x<-gsub("∠","\u2220",x) # ANGLE
x<-gsub("≝","\u225d",x) # EQUAL TO BY DEFINITION
x<-gsub("≲","\u2272",x) # LESS-THAN OR EQUIVALENT TO
x<-gsub("≳","\u2273",x) # GREATER-THAN OR EQUIVALENT TO
x<-gsub("⊀","\u2280",x) # DOES NOT PRECEDE
x<-gsub("⊃","\u2283",x) # SUPERSET OF
x<-gsub("⊓","\u2293",x) # SQUARE CAP
x<-gsub("⊔","\u2294",x) # SQUARE CUP
x<-gsub("⊖","\u2296",x) # CIRCLED MINUS
x<-gsub("⊢","\u22a2",x) # RIGHT TACK
x<-gsub("⊨","\u22a8",x) # TRUE
x<-gsub("⋄","\u22c4",x) # DIAMOND OPERATOR
x<-gsub("⌈","\u2308",x) # LEFT CEILING
x<-gsub("▽","\u25bd",x) # WHITE DOWN-POINTING TRIANGLE
x<-gsub("◇","\u25c7",x) # WHITE DIAMOND
x<-gsub("♪","\u266a",x) # EIGHTH NOTE
x<-gsub("⟵","\u27f5",x) # LONG LEFTWARDS ARROW
x<-gsub("⟺","\u27fa",x) # LONG LEFT RIGHT DOUBLE ARROW
x<-gsub("⩮","\u2a6e",x) # EQUALS WITH ASTERISK
x<-gsub("⩵","\u2a75",x) # TWO CONSECUTIVE EQUALS SIGNS
x<-gsub("⪡","\u2aa1",x) # DOUBLE NESTED LESS-THAN
x<-gsub("⫫","\u2aeb",x) # DOUBLE UP TACK
x<-gsub("、","\u3001",x) # IDEOGRAPHIC COMMA
x<-gsub("駝","\u99dd",x) # CJK UNIFIED IDEOGRAPH-99dd
x<-gsub("�","\ufffd",x) # CIRCLED QUESTION MARK
x<-gsub("ɦ","\u266",x) # LATIN SMALL LETTER H WITH HOOK
x<-gsub("▶","\u25b6",x) # Black right-pointing triangle
x<-gsub("⁎","\u204e",x) # LOW ASTERISK
x<-gsub("Å","\u212b",x) # ANGSTROM SIGN
x<-gsub("ª","\u00aa",x) # FEMININE ORDINAL INDICATOR
x<-gsub("Ú","\u00da",x) # LATIN CAPITAL LETTER U WITH ACUTE
x<-gsub("‚","\u201a",x) # SINGLE LOW-9 QUOTATION MARK
x<-gsub("Ⅰ","\u2160",x) # ROMAN NUMERAL ONE
x<-gsub("Ⅱ","\u2161",x) # ROMAN NUMERAL TWO
x<-gsub("Ⅲ","\u2162",x) # ROMAN NUMERAL Three
x<-gsub("Ⅳ","\u2163",x) # ROMAN NUMERAL FOUR
x<-gsub("ð","\u00f0",x) # LATIN SMALL LETTER ETH
x<-gsub("░","\u2591",x) # LIGHT SHADE
x<-gsub("
","\u000d",x) # CARRIAGE RETURN (CR)*
x<-gsub("‏","\u200f",x) # RIGHT-TO-LEFT MARK
x<-gsub("∽","\u223d",x) # REVERSED TILDE
x<-gsub("Ⓡ","\u24c7",x) # CIRCLED LATIN CAPITAL LETTER R
x<-gsub("ʟ","\u029f",x) # LATIN LETTER SMALL CAPITAL L
x<-gsub("̄","\u0304",x) # COMBINING MACRON
x<-gsub("Ž","\u017d",x) # LATIN CAPITAL LETTER Z WITH CARON
x<-gsub("₂","\u2082",x) # SUBSCRIPT TWO
x<-gsub("À","\u00c0",x) # LATIN CAPITAL LETTER A WITH GRAVE
x<-gsub("^","\u005e",x) # CIRCUMFLEX ACCENT
x<-gsub("û","\u00fb",x) # LATIN SMALL LETTER U WITH CIRCUMFLEX
x<-gsub("ˉ","\u02c9",x) # MODIFIER LETTER MACRON
x<-gsub("·","\u0387",x) # GREEK ANO TELEIA
x<-gsub("ǫ","\u01eb",x) # LATIN SMALL LETTER O WITH OGONEK
x<-gsub("Ș","\u0218",x) # LATIN CAPITAL LETTER S WITH COMMA BELOW
x<-gsub("∴","\u2234",x) # THEREFORE
x<-gsub("ά","\u03ac",x) # GREEK SMALL LETTER ALPHA WITH TONOS
x<-gsub("➔","\u2794",x) # HEAVY WIDE-HEADED RIGHTWARDS ARROW
x<-gsub("͘","\u0358",x) # COMBINING DOT ABOVE RIGHT
x<-gsub("ф","\u0444",x) # CYRILLIC SMALL LETTER EF
x<-gsub("ӓ","\u04d3",x) # CYRILLIC SMALL LETTER A WITH DIAERESIS
x<-gsub("⩒","\u2a52",x) # LOGICAL OR WITH DOT ABOVE
x<-gsub("ň","\u0148",x) # LATIN SMALL LETTER N WITH CARON
x<-gsub("Ƭ","\u01ac",x) # LATIN CAPITAL LETTER T WITH HOOK
x<-gsub("Ǻ","\u01fa",x) # LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE
x<-gsub("ᴅ","\u1d05",x) # LATIN LETTER SMALL CAPITAL D
x<-gsub("∖","\u2216",x) # SET MINUS
x<-gsub("Ⓒ","\u24b8",x) # CIRCLED LATIN CAPITAL LETTER C
x<-gsub("Ρ","\u03a1",x) # GREEK CAPITAL LETTER RHO
x<-gsub("Г","\u0413",x) # CYRILLIC CAPITAL LETTER GHE
x<-gsub("Ш","\u0428",x) # CYRILLIC CAPITAL LETTER SHA
x<-gsub("Ř","\u0158",x) # LATIN CAPITAL LETTER R WITH CARON
x<-gsub("ə","\u0259",x) # LATIN SMALL LETTER SCHWA
x<-gsub("˝","\u02dd",x) # DOUBLE ACUTE ACCENT
x<-gsub("┴","\u2534",x) # BOX DRAWINGS LIGHT UP AND HORIZONTAL
x<-gsub("𝒮","\ud4ae",x) # HANGUL SYLLABLE D4AE
x<-gsub("Ï","\u00cf",x) # LATIN CAPITAL LETTER I WITH DIAERESIS
x<-gsub("Đ","\u0110",x) # LATIN CAPITAL LETTER D WITH STROKE
x<-gsub("ġ","\u0121",x) # LATIN SMALL LETTER G WITH DOT ABOVE
x<-gsub("ţ","\u0163",x) # LATIN SMALL LETTER T WITH CEDILLA
x<-gsub("ƙ","\u0199",x) # LATIN SMALL LETTER K WITH HOOK
x<-gsub("ɹ","\u0279",x) # LATIN SMALL LETTER TURNED R
x<-gsub("ї","\u0457",x) # CYRILLIC SMALL LETTER YI
x<-gsub("қ","\u049b",x) # CYRILLIC SMALL LETTER KA WITH DESCENDER
x<-gsub("ҡ","\u04a1",x) # CYRILLIC SMALL LETTER BASHKIR KA
x<-gsub("Ұ","\u04b0",x) # CYRILLIC CAPITAL LETTER STRAIGHT U WITH STROKE:
x<-gsub("ӧ","\u04e7",x) # CYRILLIC SMALL LETTER O WITH DIAERESIS
x<-gsub("⇄","\u21c4",x) # RIGHTWARDS ARROW OVER LEFTWARDS ARROW
x<-gsub("⇌","\u21cc",x) # RIGHTWARDS HARPOON OVER LEFTWARDS HARPOON
x<-gsub("■","\u25a0",x) # BLACK SQUARE
x<-gsub("È","\u00c8",x) # LATIN CAPITAL LETTER E WITH GRAVE
x<-gsub("đ","\u0111",x) # LATIN SMALL LETTER D WITH STROKE
x<-gsub("ĥ","\u0125",x) # LATIN SMALL LETTER H WITH CIRCUMFLEX
x<-gsub("Ÿ","\u0178",x) # LATIN CAPITAL LETTER Y WITH DIAERESIS
x<-gsub("Ʈ","\u01ae",x) # LATIN CAPITAL LETTER T WITH RETROFLEX HOOK
x<-gsub("ƴ","\u01b4",x) # LATIN SMALL LETTER Y WITH HOOK
x<-gsub("ǒ","\u01d2",x) # LATIN SMALL LETTER O WITH CARON
x<-gsub("Ț","\u021a",x) # LATIN CAPITAL LETTER T WITH COMMA BELOW
x<-gsub("ˏ","\u02cf",x) # MODIFIER LETTER LOW ACUTE ACCENT
x<-gsub("ː","\u02d0",x) # MODIFIER LETTER TRIANGULAR COLON
x<-gsub("ϱ","\u03f1",x) # GREEK RHO SYMBOL
x<-gsub("ϴ","\u03f4",x) # GREEK CAPITAL THETA SYMBOL
x<-gsub("ـ","\u0640_",x) # ARABIC TATWEEL
x<-gsub("ᵧ","\u1d67",x) # GREEK SUBSCRIPT SMALL LETTER GAMMA
x<-gsub("Ṁ","\u1e40",x) # LATIN CAPITAL LETTER M WITH DOT ABOVE
x<-gsub("‟","\u201f",x) # DOUBLE HIGH-REVERSED-9 QUOTATION MARK
x<-gsub("⁺","\u207a",x) # SUPERSCRIPT PLUS SIGN
x<-gsub("₅","\u2085",x) # SUBSCRIPT FIVE
x<-gsub("₦","\u20a6",x) # NAIRA SIGN
x<-gsub("₵","\u20b5",x) # CEDI SIGN
x<-gsub("№","\u2116",x) # NUMERO SIGN
x<-gsub("℠","\u2120",x) # SERVICE MARK
x<-gsub("⇆","\u21c6",x) # LEFTWARDS ARROW OVER RIGHTWARDS ARROW
x<-gsub("≙","\u2259",x) # ESTIMATES
x<-gsub("⊟","\u229f",x) # SQUARED MINUS
x<-gsub("⍴","\u2374",x) # APL FUNCTIONAL SYMBOL RHO
x<-gsub("⎴","\u23b4",x) # TOP SQUARE BRACKET
x<-gsub("❖","\u2756",x) # BLACK DIAMOND MINUS WHITE X
x<-gsub("푃","\ud443",x) # HANGUL SYLLABLE D443
x<-gsub("ff","\ufb00",x) # LATIN SMALL LIGATURE FF
x<-gsub("𝑥","\ud465",x) # HANGUL SYLLABLE D465
x<-gsub("𝔉","\ud509",x) # HANGUL SYLLABLE D509
x<-gsub("𝕊","\ud54a",x) # HANGUL SYLLABLE D54A
x<-gsub("~","\u007e",x) # TILDE
x<-gsub("Ð","\u00d0",x) # LATIN CAPITAL LETTER ETH
x<-gsub("Ò","\u00d2",x) # LATIN CAPITAL LETTER O WITH GRAVE
x<-gsub("Þ","\u00de",x) # LATIN CAPITAL LETTER THORN
x<-gsub("Ė","\u0116",x) # LATIN CAPITAL LETTER E WITH DOT ABOVE
x<-gsub("Ĝ","\u011c",x) # LATIN CAPITAL LETTER G WITH CIRCUMFLEX
x<-gsub("ĩ","\u0129",x) # LATIN SMALL LETTER I WITH TILDE
x<-gsub("ũ","\u0169",x) # LATIN SMALL LETTER U WITH TILDE
x<-gsub("Ǝ","\u018e",x) # LATIN CAPITAL LETTER REVERSED E
x<-gsub("п","\u043f",x) # CYRILLIC SMALL LETTER PE
x<-gsub("т","\u0442",x) # CYRILLIC SMALL LETTER TE
x<-gsub("ћ","\u045b",x) # CYRILLIC SMALL LETTER TSHE
x<-gsub("Ӕ","\u04d4",x) # CYRILLIC CAPITAL LIGATURE A IE
x<-gsub("Ԏ","\u050e",x) # CYRILLIC CAPITAL LETTER KOMI TJE
x<-gsub("Փ","\u0553",x) # ARMENIAN CAPITAL LETTER PIWR
x<-gsub("״","\u05f4",x) # HEBREW PUNCTUATION GERSHAYIM
x<-gsub("٠","\u0660",x) # ARABIC-INDIC DIGIT ZERO
x<-gsub("ࣙ","\u08d9",x) # ARABIC SMALL LOW NOON WITH KASRA
x<-gsub("ः","\u0903",x) # DEVANAGARI SIGN VISARG
x<-gsub("ᗡ","\u15e1",x) # CANADIAN SYLLABICS CARRIER THA
x<-gsub("ᴀ","\u1d00",x) # LATIN LETTER SMALL CAPITAL A
x<-gsub("ᴪ","\u1d2a",x) # GREEK LETTER SMALL CAPITAL PSI
x<-gsub("ᵞ","\u1d5e",x) # MODIFIER LETTER SMALL GREEK GAMMA
x<-gsub("ᶲ","\u1db2",x) # MODIFIER LETTER SMALL PHI
x<-gsub("Ḗ","\u1e16",x) # LATIN CAPITAL LETTER E WITH MACRON AND ACUTE
x<-gsub("ḡ","\u1e21",x) # LATIN SMALL LETTER G WITH MACRON
x<-gsub("ḥ","\u1e25",x) # LATIN SMALL LETTER H WITH DOT BELOW
x<-gsub("ṃ","\u1e43",x) # LATIN SMALL LETTER M WITH DOT BELOW
x<-gsub("ṩ","\u1e69",x) # LATIN SMALL LETTER S WITH DOT BELOW AND DOT ABOVE
x<-gsub("ṫ","\u1e6b",x) # LATIN SMALL LETTER T WITH DOT ABOVE
x<-gsub("Ṽ","\u1e7c",x) # LATIN CAPITAL LETTER V WITH TILDE
x<-gsub("Ẋ","\u1e8a",x) # LATIN CAPITAL LETTER X WITH DOT ABOVE
x<-gsub("ẖ","\u1e96",x) # LATIN SMALL LETTER H WITH LINE BELOW
x<-gsub("ắ","\u1eaf",x) # LATIN SMALL LETTER A WITH BREVE AND ACUTE
x<-gsub("ế","\u1ebf",x) # LATIN SMALL LETTER E WITH CIRCUMFLEX AND ACUTE
x<-gsub("ị","\u1ecb",x) # LATIN SMALL LETTER I WITH DOT BELOW
x<-gsub("ộ","\u1ed9",x) # LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW
x<-gsub("ἄ","\u1f04",x) # GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA
x<-gsub("ἠ","\u1f20",x) # GREEK SMALL LETTER ETA WITH PSILI
x<-gsub("ῖ","\u1fd6",x) # GREEK SMALL LETTER IOTA WITH PERISPOMENI:
x<-gsub("Ώ","\u1ffb",x) # GREEK CAPITAL LETTER OMEGA WITH OXIA
x<-gsub("‪","\u202a",x) # LEFT-TO-RIGHT EMBEDDING
x<-gsub("‫","\u202b",x) # RIGHT-TO-LEFT EMBEDDING
x<-gsub("⁃","\u2043",x) # HYPHEN BULLET
x<-gsub("⁹","\u2079",x) # SUPERSCRIPT NINE
x<-gsub("⁻","\u207b",x) # SUPERSCRIPT MINUS
x<-gsub("₆","\u2086",x) # SUBSCRIPT SIX
x<-gsub("℞","\u211e",x) # PRESCRIPTION TAKE
x<-gsub("ℬ","\u212c",x) # SCRIPT CAPITAL B
x<-gsub("ℵ","\u2135",x) # ALEF SYMBOL
x<-gsub("Ⅵ","\u2165",x) # ROMAN NUMERAL SIX
x<-gsub("ⅳ","\u2173",x) # SMALL ROMAN NUMERAL FOUR
x<-gsub("↨","\u21a8",x) # UP DOWN ARROW WITH BASE
x<-gsub("↱","\u21b1",x) # UPWARDS ARROW WITH TIP RIGHTWARDS
x<-gsub("⇇","\u21c7",x) # LEFTWARDS PAIRED ARROWS
x<-gsub("⇋","\u21cb",x) # LEFTWARDS HARPOON OVER RIGHTWARDS HARPOON
x<-gsub("∛","\u221b",x) # CUBE ROOT
x<-gsub("∮","\u222e",x) # CONTOUR INTEGRAL
x<-gsub("≊","\u224a",x) # ALMOST EQUAL OR EQUAL TO
x<-gsub("≣","\u2263",x) # STRICTLY EQUIVALENT TO
x<-gsub("≹","\u2279",x) # NEITHER GREATER-THAN NOR LESS-THAN
x<-gsub("⊈","\u2288",x) # NEITHER A SUBSET OF NOR EQUAL TO
x<-gsub("⊊","\u228a",x) # SUBSET OF WITH NOT EQUAL TO
x<-gsub("⊎","\u228e",x) # MULTISET UNION
x<-gsub("⊚","\u229a",x) # CIRCLED RING OPERATOR
x<-gsub("⊞","\u229e",x) # SQUARED PLUS
x<-gsub("⊠","\u22a0",x) # SQUARED TIMES
x<-gsub("⋍","\u22cd",x) # REVERSED TILDE EQUALS
x<-gsub("⋙","\u22d9",x) # VERY MUCH GREATER-THAN
x<-gsub("⌜","\u231c",x) # TOP LEFT CORNER
x<-gsub("⍦","\u2366",x) # APL FUNCTIONAL SYMBOL DOWN SHOE STILE
x<-gsub("⎔","\u2394",x) # SOFTWARE-FUNCTION SYMBOL
x<-gsub("⎕","\u2395",x) # APL FUNCTIONAL SYMBOL QUAD
x<-gsub("Ⓕ","\u24bb",x) # CIRCLED LATIN CAPITAL LETTER F
x<-gsub("┤","\u2524",x) # BOX DRAWINGS LIGHT VERTICAL AND LEFT
x<-gsub("▓","\u2593",x) # DARK SHADE
x<-gsub("▬","\u25ac",x) # BLACK RECTANGLE
x<-gsub("▯","\u25af",x) # WHITE VERTICAL RECTANGLE
x<-gsub("◽","\u25fd",x) # WHITE MEDIUM SMALL SQUARE
x<-gsub("☛","\u261b",x) # BLACK RIGHT POINTING INDEX
x<-gsub("➁","\u2781",x) # DINGBAT CIRCLED SANS-SERIF DIGIT TWO
x<-gsub("➝","\u279d",x) # TRIANGLE-HEADED RIGHTWARDS ARROW
x<-gsub("⟂","\u27c2",x) # PERPENDICULAR
x<-gsub("⨯","\u2a2f",x) # VECTOR OR CROSS PRODUCT
x<-gsub("ⱪ","\u2c6a",x) # LATIN SMALL LETTER K WITH DESCENDER
x<-gsub("『","\u300e",x) # LEFT WHITE CORNER BRACKET
x<-gsub("た","\u305f",x) # HIRAGANA LETTER TA
x<-gsub("","\ue5fb",x) # Private Use Area E5FB
x<-gsub("ffi","\ufb03",x) # LATIN SMALL LIGATURE FFI
x<-gsub("・","\uff65",x) # HALFWIDTH KATAKANA MIDDLE DOT
x<-gsub("¤","\u00a4",x) # CURRENCY SIGN
x<-gsub("¦","\u00a6",x) # BROKEN BAR
x<-gsub("Ë","\u00cb",x) # LATIN CAPITAL LETTER E WITH DIAERESIS
x<-gsub("Ì","\u00cc",x) # LATIN CAPITAL LETTER I WITH GRAVE
x<-gsub("Ñ","\u00d1",x) # LATIN CAPITAL LETTER N WITH TILDE
x<-gsub("Õ","\u00d5",x) # LATIN CAPITAL LETTER O WITH TILDE
x<-gsub("Ù","\u00d9",x) # LATIN CAPITAL LETTER U WITH GRAVE
x<-gsub("þ","\u00fe",x) # LATIN SMALL LETTER THORN
x<-gsub("ÿ","\u00ff",x) # LATIN SMALL LETTER Y WITH DIAERESIS
x<-gsub("Ą","\u0104",x) # LATIN CAPITAL LETTER A WITH OGONEK
x<-gsub("Ć","\u0106",x) # LATIN CAPITAL LETTER C WITH ACUTE
x<-gsub("ċ","\u010b",x) # LATIN SMALL LETTER C WITH DOT ABOVE
x<-gsub("ĕ","\u0115",x) # LATIN SMALL LETTER E WITH BREVE
x<-gsub("Ę","\u0118",x) # LATIN CAPITAL LETTER E WITH OGONEK
x<-gsub("Ě","\u011a",x) # LATIN CAPITAL LETTER E WITH CARON
x<-gsub("ĝ","\u011d",x) # LATIN SMALL LETTER G WITH CIRCUMFLEX
x<-gsub("Ğ","\u011e",x) # LATIN CAPITAL LETTER G WITH BREVE
x<-gsub("Ģ","\u0122",x) # LATIN CAPITAL LETTER G WITH CEDILLA
x<-gsub("ģ","\u0123",x) # LATIN SMALL LETTER G WITH CEDILLA
x<-gsub("ħ","\u0127",x) # LATIN SMALL LETTER H WITH STROKE
x<-gsub("ĭ","\u012d",x) # LATIN SMALL LETTER I WITH BREVE
x<-gsub("Į","\u012e",x) # LATIN CAPITAL LETTER I WITH OGONEK
x<-gsub("Ķ","\u0136",x) # LATIN CAPITAL LETTER K WITH CEDILLA
x<-gsub("ķ","\u0137",x) # LATIN SMALL LETTER K WITH CEDILLA
x<-gsub("ĺ","\u013a",x) # LATIN SMALL LETTER L WITH ACUT
x<-gsub("Ļ","\u013b",x) # LATIN CAPITAL LETTER L WITH CEDILLA
x<-gsub("ļ","\u013c",x) # LATIN SMALL LETTER L WITH CEDILLA
x<-gsub("Ń","\u0143",x) # LATIN CAPITAL LETTER N WITH ACUTE
x<-gsub("ņ","\u0146",x) # LATIN SMALL LETTER N WITH CEDILLA
x<-gsub("Ň","\u0147",x) # LATIN CAPITAL LETTER N WITH CARON
x<-gsub("Ō","\u014c",x) # LATIN CAPITAL LETTER O WITH MACRON
x<-gsub("Ő","\u0150",x) # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE
x<-gsub("Ŕ","\u0154",x) # LATIN CAPITAL LETTER R WITH ACUTE
x<-gsub("ŕ","\u0155",x) # LATIN SMALL LETTER R WITH ACUT
x<-gsub("ŝ","\u015d",x) # LATIN SMALL LETTER S WITH CIRCUMFLEX
x<-gsub("Ţ","\u0162",x) # LATIN CAPITAL LETTER T WITH CEDILLA
x<-gsub("Ū","\u016a",x) # LATIN CAPITAL LETTER U WITH MACRON
x<-gsub("Ű","\u0170",x) # LATIN CAPITAL LETTER U WITH DOUBLE ACUTE
x<-gsub("ű","\u0171",x) # LATIN SMALL LETTER U WITH DOUBLE ACUTE
x<-gsub("Ź","\u0179",x) # LATIN CAPITAL LETTER Z WITH ACUTE
x<-gsub("Ɨ","\u0197",x) # LATIN CAPITAL LETTER I WITH STROKE
x<-gsub("ƚ","\u019a",x) # LATIN SMALL LETTER L WITH BAR
x<-gsub("Ơ","\u01a0",x) # LATIN CAPITAL LETTER O WITH HORN
x<-gsub("Ǎ","\u01cd",x) # LATIN CAPITAL LETTER A WITH CARON
x<-gsub("ǐ","\u01d0",x) # LATIN SMALL LETTER I WITH CARON
x<-gsub("ǔ","\u01d4",x) # LATIN SMALL LETTER U WITH CARON
x<-gsub("ǖ","\u01d6",x) # LATIN SMALL LETTER U WITH DIAERESIS AND MACRON
x<-gsub("ǚ","\u01da",x) # LATIN SMALL LETTER U WITH DIAERESIS AND CARON
x<-gsub("ǧ","\u01e7",x) # LATIN SMALL LETTER G WITH CARON
x<-gsub("ǹ","\u01f9",x) # LATIN SMALL LETTER N WITH GRAVE
x<-gsub("ǿ","\u01ff",x) # LATIN SMALL LETTER O WITH STROKE AND ACUTE
x<-gsub("ȃ","\u0203",x) # LATIN SMALL LETTER A WITH INVERTED BREVE
x<-gsub("ȅ","\u0205",x) # LATIN SMALL LETTER E WITH DOUBLE GRAVE
x<-gsub("ȇ","\u0207",x) # LATIN SMALL LETTER E WITH INVERTED BREVE
x<-gsub("ȋ","\u020b",x) # LATIN SMALL LETTER I WITH INVERTED BREVE
x<-gsub("ȍ","\u020d",x) # LATIN SMALL LETTER O WITH DOUBLE GRAVE
x<-gsub("ȏ","\u020f",x) # LATIN SMALL LETTER O WITH INVERTED BREVE
x<-gsub("ȓ","\u0213",x) # LATIN SMALL LETTER R WITH INVERTED BREVE
x<-gsub("Ȧ","\u0226",x) # LATIN CAPITAL LETTER A WITH DOT ABOVE
x<-gsub("ȧ","\u0227",x) # LATIN SMALL LETTER A WITH DOT ABOVE
x<-gsub("ȩ","\u0229",x) # LATIN SMALL LETTER E WITH CEDILLA
x<-gsub("ȶ","\u0236",x) # LATIN SMALL LETTER T WITH CURL
x<-gsub("˛","\u02db",x) # OGONEK
x<-gsub("̆","\u0306",x) # COMBINING BREVE
x<-gsub("̈","\u0308",x) # COMBINING DIAERESIS
x<-gsub("̗","\u0317",x) # COMBINING ACUTE ACCENT BELOW
x<-gsub("̧","\u0327",x) # COMBINING CEDILLA
x<-gsub("̨","\u0328",x) # COMBINING OGONEK
x<-gsub("ͣ","\u0363",x) # COMBINING LATIN SMALL LETTER A
x<-gsub("ϊ","\u03ca",x) # GREEK SMALL LETTER IOTA WITH DIALYTIKA
x<-gsub("ϋ","\u03cb",x) # GREEK SMALL LETTER UPSILON WITH DIALYTIKA
x<-gsub("ό","\u03cc",x) # GREEK SMALL LETTER OMICRON WITH TONOS
x<-gsub("ч","\u0447",x) # CYRILLIC SMALL LETTER CHE
x<-gsub("ѐ","\u0450",x) # CYRILLIC SMALL LETTER IE WITH GRAVE
x<-gsub("ё","\u0451",x) # CYRILLIC SMALL LETTER IO
x<-gsub("ҁ","\u0481",x) # CYRILLIC SMALL LETTER KOPPA
x<-gsub("Ҫ","\u04aa",x) # CYRILLIC CAPITAL LETTER ES WITH DESCENDER
x<-gsub("ҫ","\u04ab",x) # CYRILLIC SMALL LETTER ES WITH DESCENDER
x<-gsub("ӑ","\u04d1",x) # CYRILLIC SMALL LETTER A WITH BREVE
x<-gsub("Ӧ","\u04e6",x) # CYRILLIC CAPITAL LETTER O WITH DIAERESIS
x<-gsub("ࠢ","\u0822",x) # SAMARITAN VOWEL SIGN LONG A
x<-gsub("ᴌ","\u1d0c",x) # LATIN LETTER SMALL CAPITAL L WITH STROKE
x<-gsub("ᵃ","\u1d43",x) # MODIFIER LETTER SMALL A
x<-gsub("ᵇ","\u1d47",x) # MODIFIER LETTER SMALL B
x<-gsub("ᶜ","\u1d9c",x) # MODIFIER LETTER SMALL C
x<-gsub("ḿ","\u1e3f",x) # LATIN SMALL LETTER M WITH ACUTE
x<-gsub("ṅ","\u1e45",x) # LATIN SMALL LETTER N WITH DOT ABOVE
x<-gsub("Ṋ","\u1e4a",x) # LATIN CAPITAL LETTER N WITH CIRCUMFLEX BELOW
x<-gsub("ṡ","\u1e61",x) # LATIN SMALL LETTER S WITH DOT ABOVE
x<-gsub("ṣ","\u1e63",x) # LATIN SMALL LETTER S WITH DOT BELOW
x<-gsub("ẗ","\u1e97",x) # LATIN SMALL LETTER T WITH DIAERESIS
x<-gsub("ẙ","\u1e99",x) # LATIN SMALL LETTER Y WITH RING ABOVE
x<-gsub("ạ","\u1ea1",x) # LATIN SMALL LETTER A WITH DOT BELOW
x<-gsub("ả","\u1ea3",x) # LATIN SMALL LETTER A WITH HOOK ABOVE
x<-gsub("ấ","\u1ea5",x) # LATIN SMALL LETTER A WITH CIRCUMFLEX AND ACUTE
x<-gsub("ầ","\u1ea7",x) # LATIN SMALL LETTER A WITH CIRCUMFLEX AND GRAVE
x<-gsub("ẻ","\u1ebb",x) # LATIN SMALL LETTER E WITH HOOK ABOVE
x<-gsub("ẽ","\u1ebd",x) # LATIN SMALL LETTER E WITH TILDE
x<-gsub("ề","\u1ec1",x) # LATIN SMALL LETTER E WITH CIRCUMFLEX AND GRAVE
x<-gsub("ể","\u1ec3",x) # LATIN SMALL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE
x<-gsub("Ễ","\u1ec4",x) # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND TILDE
x<-gsub("ễ","\u1ec5",x) # LATIN SMALL LETTER E WITH CIRCUMFLEX AND TILDE
x<-gsub("ọ","\u1ecd",x) # LATIN SMALL LETTER O WITH DOT BELOW
x<-gsub("ố","\u1ed1",x) # LATIN SMALL LETTER O WITH CIRCUMFLEX AND ACUTE
x<-gsub("ồ","\u1ed3",x) # LATIN SMALL LETTER O WITH CIRCUMFLEX AND GRAVE
x<-gsub("ờ","\u1edd",x) # LATIN SMALL LETTER O WITH HORN AND GRAVE
x<-gsub("ợ","\u1ee3",x) # LATIN SMALL LETTER O WITH HORN AND DOT BELOW
x<-gsub("ữ","\u1eef",x) # LATIN SMALL LETTER U WITH HORN AND TILDE
x<-gsub("ỳ","\u1ef3",x) # LATIN SMALL LETTER Y WITH GRAVE
x<-gsub("ỷ","\u1ef7",x) # LATIN SMALL LETTER Y WITH HOOK ABOVE
x<-gsub("ỹ","\u1ef9",x) # LATIN SMALL LETTER Y WITH TILDE
x<-gsub("ί","\u1f77",x) # GREEK SMALL LETTER IOTA WITH OXIA:
x<-gsub("ό","\u1f79",x) # GREEK SMALL LETTER OMICRON WITH OXIA
x<-gsub("ῆ","\u1fc6",x) # GREEK SMALL LETTER ETA WITH PERISPOMENI
x<-gsub("≑","\u2251",x) # GEOMETRICALLY EQUAL TO
x<-gsub("≿","\u227f",x) # SUCCEEDS OR EQUIVALENT TO
x<-gsub("⊘","\u2298",x) # CIRCLED DIVISION SLASH
x<-gsub("☺","\u263a",x) # WHITE SMILING FACE
x<-gsub("Ď","\u010e",x) # LATIN CAPITAL LETTER D WITH CARON
}
}# End hexadecimal conversion
## unify some html letters
x<-gsub("&","& ",x) # &
x<-gsub("<","<",x) # less than
x<-gsub("≤","<=",x) # less equal
x<-gsub(">",">",x) # greater than
x<-gsub("≥",">=",x) # greater equal
x<-gsub("=|=","=",x) # equal sign
x<-gsub("&","& ",x) # &
x<-gsub("´","'",x) #
x<-gsub("´","'",x) #
x<-gsub("'","'",x) #
x<-gsub(";",";",x) # semi colon
x<-gsub(",",",",x) # comma
x<-gsub(""","'",x) #
x<-gsub("(","(",x) #
x<-gsub(")",")",x) # )
x<-gsub("]|]","]",x) #
x<-gsub("[|[","[",x) #
x<-gsub("²","^2",x) # superscript 2
x<-gsub("/","/",x) # /
x<-gsub("χ","\u03a7",x) # Chi
x<-gsub("β","\u03b2",x) # beta
x<-gsub("α","\u03b1",x) # alpha
## unify unicode special characters
x<-gsub("\u2264|\u2A7f|\u2a7d","<=",x) # less equal
x<-gsub("\u2A7e|\u2265",">=",x) # greater equal
x<-gsub("\u003d","=",x) # equal sign
# special spaces
x<-gsub("\u00A0","",x) # no break space
x<-gsub("\u200b","",x) # invisible space
x<-gsub("\u200a|\u200c|\u200d|\u200e|\u200f"," ",x)
x<-gsub("\u2000|\u2001|\u2002|\u2003|\u2004"," ",x)
x<-gsub("\u2005|\u2006|\u2007|\u2008|\u2009"," ",x)
# minus/dash -
x<-gsub("\u2013|\u2014|\u2015|\u2212|\u2010|\u2011","-",x)
x<-gsub("\u00B2","^2",x) # superscript 2
x<-gsub("\u201D","'",x) # right double quote
x<-gsub("\u201C","'",x) # left double quote
x<-gsub("\u2032","'",x) # prime
x<-gsub("\u2033","'",x) # double prime
x<-gsub("\u2035","'",x) # reversed prime
x<-gsub("\u2036","'",x) # reversed double prime
x<-gsub("\u00D7","*",x) # times sign: x
if(greek2text==TRUE){
# some foreign alphabet letters
# alpha
x<-gsub("\u03b1|\u0251|\u221d","alpha",x)
# beta
x<-gsub("\u03b2|\u03d0|\u1e9e","b",x)
x<-gsub("\u0392","Beta",x)
# r
x<-gsub("\u027e","r",x)
x<-gsub("\u211b","R",x)
# Zeta -> Z
x<-gsub("\u0396","Z",x)
x<-gsub("\u03b3","gamma",x)
# delta
x<-gsub("\u03b4|\u2206|\u25b5|\u25b3|\u22bf|\u0394","delta ",x)
# eta
x<-gsub("\u019e|\u03b7|\u014b|\u0273|\u0220","eta",x)
x<-gsub("\u03ae|\u1f75|\u220([^a-z0-9])","eta\\1", x)
# epsilon
x<-gsub("\u025b|\u03f5|\u03b5","epsilon",x)
#chi
x<-gsub("\u03a7|\u1d61","Chi",x) # capital greek Chi
x<-gsub("\u03c7|\u03f0","chi",x) # small greek Chi
# lok alike p
x<-gsub("\u0440","p",x) # CYRILLIC SMALL LETTER ER
# omega
x<-gsub("\u3c9","omega",x)
x<-gsub("\u03a9|\u2126","Omega",x) # greek captial leter Omega
# phi
x<-gsub("\u03c6","phi",x)
x<-gsub("\u03a6","Phi",x)
# rho
x<-gsub("\u03c1|\u2374","rho",x)
# tau
x<-gsub("\u03c4","tau",x)
# lambda, etc
x<-gsub("\u03bb","lambda",x)
x<-gsub("\u03bd","Ny",x) # GREEK CAPITAL LETTER NEUTRUM
x<-gsub("\u03b9","iota",x) # greek letter iota
x<-gsub("\u0399","Iota",x) # greek capital letter Iota
x<-gsub("\u041A","ka",x) # kyrillik ka
x<-gsub("\u029F","L",x) # Latin L
x<-gsub("\u00B2","^2",x) # superscript 2
x<-gsub("\u0445","x",x) # CYRILLIC SMALL LETTER HA
}
# clean up white spaces
x<-gsub("^ *|(?<= ) | *$", "", x, perl = TRUE)
# convert cermine specific letter captures
# unsolvable and very crappy: minus gets sometimes converted to "2"
if(cermine==TRUE){
check<-x
# clean up white spaces
x<-gsub("^ *|(?<= ) | *$", "", x, perl = TRUE)
# clean up cermines captures of degree of freedom as reference if input is cermine JATS
# for "[ <xref.*"
x<-gsub("([^a-zA-Z][FRrzZTt2])\\[ <xref.*?>([0-9,;\\. ]*)</xref> \\]","\\1(\\2)",x)
# for "( <xref.*"
x<-gsub("([^a-zA-Z][FRrzZTt2])\\( <xref.*?>([0-9,;\\. ]*)</xref> \\)","\\1(\\2)",x)
x<-gsub("(chi2)\\( <xref.*?>([0-9,;\\. ]*)</xref> \\)","\\1(\\2)",x)
x<-gsub("([^a-zA-Z][FRrzZTt2]) \\( <xref.*?>([0-9,;\\. ]*)</xref> \\)","\\1(\\2)",x)
# " 5 [\\.0-9]" for "="
x<-gsub(" 5 ([\\.0-9])","=\\1",x)
# " 5 -[\\.0-9]" for "=-"
x<-gsub(" 5 -([\\.0-9])","=-\\1",x)
# " 5 - [\\.0-9]" for "=-"
x<-gsub(" 5 -( [\\.0-9])","=-\\1",x)
# " 5 )[\\.0-9]" for "=-"
x<-gsub(" 5 \\)([\\.0-9])","=-\\1",x)
# p as =
x<-gsub("([0-9]\\)) p ([-\\.0-9])","\\1 = \\2",x)
# "¼" for =
x<-gsub("\u00BC","=",x)
# convert badly captured "= )" -> "= -"
x<-gsub("([<>=]) \\)([.0-9])","\\1 -\\2",x)
x<-gsub("([<>=])\\) ([.0-9])","\\1 -\\2",x)
x<-gsub("([<>=]) \\) ([.0-9])","\\1 -\\2",x)
x<-gsub("([<>=])\\)([.0-9])","\\1-\\2",x)
x<-gsub("(CI) \\)([.0-9])","\\1: -\\2",x)
# "p4" for "p>"
x<-gsub(" p4\\.|^p4\\."," p>.",x)
x<-gsub(" p40|^p40"," p>0",x)
# "po" for "p<"
x<-gsub(" po\\.|^po\\."," p<.",x)
x<-gsub(" po0|^po0"," p<0",x)
# unify partial eta2 and eta2
x<-gsub("g2[qp]|np2|etap2|\u02732p|partial g2","eta2(p)",x)
x<-gsub("etap 2|gp2|n2p","eta2(p)",x)
x<-gsub("\u03C42|eta2|\u02732|\u03C4\u00B2|eta\u00B2|g2","eta2",x)
# "b" as "<"
x<-gsub("([^a-zA-Z][a-zA-Z]) b ([0-9\\.])","\\1 < \\2",x )
x<-gsub("([^a-zA-Z][Ftr]s) b ([0-9\\.])","\\1 < \\2",x )
x<-gsub("^([a-zA-Z]) b ([0-9\\.])","\\1 < \\2",x)
x<-gsub("([^a-zA-Z][a-zA-Z]) b (-[0-9\\.])","\\1 < \\2",x)
x<-gsub("([^a-zA-Z][Ftr]s) b (-[0-9\\.])","\\1 < \\2",x )
x<-gsub("^([a-zA-Z]) b (-[0-9\\.])","\\1 < \\2",x )
# "p\" as "p <"
x<-gsub("([^a-z])p \\\\","\\1p <",x)
# convert "num 9 num" to num*num (for ANOVA)
while(length(grep("[2-9] 9 [2-9]",x)>0)) x<-gsub("([2-9]) 9 ([2-9])","\\1*\\2",x)
# chi2
x<-gsub("chi 2","chi2",x)
x<-gsub("chi\\^2|chi\u00B2","chi2",x)
x<-gsub("v\\^2\\(","chi2(",x)
x<-gsub("v2([a-zA-Z])","chi2\\1",x)
x<-gsub("v2[=]","chi2=",x)
x<-gsub("v2\\(","chi2(",x)
x<-gsub("all 2s\\(","all chi2s(",x)
x<-gsub("v2 [=]","chi2=",x)
x<-gsub("v2 \\(","chi2(",x)
x<-gsub("v2s \\(","chi2s(",x)
x<-gsub("v\\^2[=]","chi2=",x)
x<-gsub("v\\^2\\(","chi2(",x)
x<-gsub("v\\^2s\\(","chi2s(",x)
x<-gsub("v\\^2 [=]","chi2=",x)
x<-gsub("v\\^2 \\(","chi2(",x)
x<-gsub("v\\^2s \\(","chi2s(",x)
x<-gsub(" 2\\(([0-9Nnd][0-9f, =]*)"," chi2(\\1",x)
x<-gsub(" 2 \\(([0-9Nnd][0-9f, =]*)"," chi2(\\1",x)
x<-gsub(" v2 ="," chi2 =",x)
x<-gsub(" v2s ="," chi2s =",x)
x<-gsub("(v2) (\\([1-9])","chi2\\2",x)
x<-gsub("v2\\(","chi2(",x)
x<-gsub("v2s\\(","chi2s(",x)
x<-gsub("([^a-zA-Z])[Cc]2\\(","\\1chi2(",x)
x<-gsub("([^a-zA-Z])[Cc]2 \\(","\\1chi2(",x)
x<-gsub("[Cc]2s\\(","chi2s(",x)
x<-gsub("[Cc]\\^2\\(","chi2(",x)
x<-gsub("[Cc]2\\(","chi2(",x)
x<-gsub("[CcVv]2\\[","chi2[",x)
x<-gsub("([^a-zA-Z])[Ccv]2\\(","\\1chi2(",x)
x<-gsub("[Cc]\\^2s\\(","chi2s(",x)
x<-gsub("w2\\(","chi2(",x)
x<-gsub("w2s\\(","chi2s(",x)
x<-gsub("w\\^2\\(","chi2(",x)
x<-gsub("w\\^2s\\(","chi2s(",x)
x<-gsub("X \\.2 \\(","chi2(",x)
x<-gsub("([a-zA-Z])[Cc]2\\(","\\1chi2(",x)
x<-gsub("([a-zA-Z])v2\\(","\\1chi2(",x)
x<-gsub("([a-zA-Z])w2\\(","\\1chi2(",x)
x<-gsub("([^a-zA-Z])[wcCv]2=","\\1chi2=",x)
x<-gsub("([^a-zA-Z])[wcCv]2 =","\\1chi2 =",x)
# eta2
x<-gsub(" Z2([<=>])"," eta2\\1",x)
# remove 2 after )
x<-gsub("\\)2=",")=",x)
# correct "=xnum" -> "=-num"
x<-gsub("([<>=])x([0-9\\.-])","\\1-\\2",x)
x<-gsub("([<>=]) x([0-9\\.-])","\\1-\\2",x)
# remove space between - and num
x<-gsub("([\\)<=>]) - ([\\.0-9])","\\1 -\\2",x)
# correct F without () for df1 and df2
x<-gsub("F([0-9\\.]*,[0-9\\.]*)([=<> ])","F(\\1)\\2",x)
# correct ",[a-z]"
x<- gsub(",([a-zA-Z])",", \\1",x)
# insert "<=>" for not captured presigns in standardStats
# F values with dfs
x<-gsub("(F\\(.*?\\)) ([\\.0-9])","\\1<=>\\2",x)
x<-gsub("(F[1-9 a-z]\\(.*?\\)) ([\\.0-9])","\\1<=>\\2",x)
x<-gsub("(F[a-z]*?\\([1-9].*?\\)) ([\\.0-9])","\\1<=>\\2",x)
# F values without dfs
x<-gsub("([\\( ]F) ([0-9]\\.[0-9])","\\1<=>\\2",gsub("([\\( ]F) (\\.[0-9])","\\1<=>\\2",x))
x<-gsub("([\\( ]Fs) ([0-9]\\.[0-9])","\\1<=>\\2",gsub("([\\( ]Fs) (\\.[0-9])","\\1<=>\\2",x))
x<-gsub("([\\( ]Fs)([0-9]\\.[0-9])","\\1<=>\\2",gsub("([\\( ]Fs)(\\.[0-9])","\\1<=>\\2",x))
# t values
# set "t_df" to "t(df)"
x<-gsub("([^a-zA-Z]t)([0-9]*?) ([0-9\\.])","\\1(\\2) \\3",x)
x<-gsub("([\\( ]t) (\\(.*?\\)) ([\\0-9\\.])","\\1\\2<=>\\3",x)
x<-gsub("([\\( ]t) (\\(.*?\\)) (-[\\0-9\\.])","\\1\\2<=>\\3",x)
x<-gsub("([\\( ]t\\(.*?\\)) ([0-9\\.][0-9\\.])","\\1<=>\\2",x)
x<-gsub("([\\( ]t\\(.*?\\)) (-[0-9\\.][0-9\\.])","\\1<=>\\2",x)
x<-gsub("([\\( ]t[1-9 ]*?\\(.*?\\)) ([\\.0-9])","\\1<=>\\2",x)
x<-gsub("([\\( ]t[1-9 ]*?\\(.*?\\)) (-[\\.0-9])","\\1<=>\\2",x)
x<-gsub("([\\( ]t[a-z]\\(.*?\\)) ([\\.0-9])","\\1<=>\\2",x)
x<-gsub("([\\( ]t[a-z]\\(.*?\\)) (-[\\.0-9])","\\1<=>\\2",x)
x<-gsub("([\\( ]t) ([0-9\\.][0-9\\.])","\\1<=>\\2",x)
x<-gsub("([\\( ]t) (-[0-9\\.][0-9\\.])","\\1<=>\\2",x)
x<-gsub("([\\( ]t[1-9 ]*?\\(.*?\\)) (-[0-9]\\.)","\\1<=>\\2",x)
x<-gsub("([\\( ]t[1-9 ]*?\\(.*?\\))(-[0-9]\\.)","\\1<=>\\2",x)
x<-gsub("([\\( ]ts) ([0-9\\.])","\\1<=>\\2",x)
x<-gsub("([\\( ]ts) (-[0-9\\.])","\\1<=>\\2",x)
x<-gsub("^(t) (\\(.*?\\)) ([\\0-9\\.])","\\1\\2<=>\\3",x)
x<-gsub("^(t) (\\(.*?\\)) (-[\\0-9\\.])","\\1\\2<=>\\3",x)
x<-gsub("^(t) ([0-9\\.])","\\1<=>\\2",x)
x<-gsub("^(t) (-[0-9\\.])","\\1<=>\\2",x)
x<-gsub("^(t\\(.*?\\)) ([0-9\\.])","\\1<=>\\2",x)
x<-gsub("^(t\\(.*?\\)) (-[0-9\\.])","\\1<=>\\2",x)
# d values
x<-gsub("([\\( ]d) (0\\.[0-9])","\\1<=>\\2",gsub("([\\( ]d) (\\.[0-9])","\\1<=>\\2",x))
x<-gsub("([\\( ]d) (-0\\.[0-9])","\\1<=>\\2",gsub("([\\( ]d) (-\\.[0-9])","\\1<=>\\2",x))
# r values
x<-gsub("([\\( ]r) (0\\.[0-9])","\\1<=>\\2",gsub("([\\( ]r) (\\.[0-9])","\\1<=>\\2",x))
x<-gsub("([\\( ]r) (-0\\.[0-9])","\\1<=>\\2",gsub("([\\( ]r) (-\\.[0-9])","\\1<=>\\2",x))
x<-gsub("([\\( ]r\\([0-9]*?\\)) (-\\.[0-9])","\\1<=>\\2",x)
x<-gsub("([\\( ]r\\([0-9]*?\\)) (\\.[0-9])","\\1<=>\\2",x)
x<-gsub("([\\( ]r \\([0-9]*?\\)) (-\\.[0-9])","\\1<=>\\2",x)
x<-gsub("([\\( ]r \\([0-9]*?\\)) (\\.[0-9])","\\1<=>\\2",x)
x<-gsub("^(r\\([0-9]*?\\)) (-\\.[0-9])","\\1<=>\\2",x)
x<-gsub("^(r\\([0-9]*?\\)) (\\.[0-9])","\\1<=>\\2",x)
# R2 values
x<-gsub("([\\( ]r2) (0\\.[0-9])","\\1<=>\\2",gsub("([\\( ]r2) (\\.[0-9])","\\1<=>\\2",x))
x<-gsub("([\\( ]R2) (0\\.[0-9])","\\1<=>\\2",gsub("([\\( ]R2) (\\.[0-9])","\\1<=>\\2",x))
x<-gsub("[^a-zA-Z]r2","R2",x)
# chi2 values
x<-gsub("([\\( ]chi2) ([0-9]\\.[0-9])","\\1<=>\\2",gsub("([\\( ]chi2) (\\.[0-9])","\\1<=>\\2",x))
x<-gsub("([\\( ]chi) ([0-9]\\.[0-9])","\\1<=>\\2",gsub("([\\( ]chi) (\\.[0-9])","\\1<=>\\2",x))
x<-gsub("([\\( ]chi2\\([0-9 Nn,=df]*\\)) ([0-9\\.])","\\1=\\2",x)
x<-gsub("([\\( ]chi\\([0-9 Nn,=df]*\\)) ([0-9\\.])","\\1=\\2",x)
x<-gsub("([\\( ]chi2s\\([0-9 Nn,=df]*\\)) ([0-9\\.])","\\1=\\2",x)
# Q values
x<-gsub("([\\( ]Q) (\\([0-9]*?\\)) ([0-9\\.])","\\1\\2<=>\\3",x)
x<-gsub("([\\( ]Q\\([0-9]*?\\)) ([0-9\\.])","\\1<=>\\2",x)
x<-gsub("([\\( ]Q) ([0-9\\.])","\\1<=>\\2",x)
# Z values
x<-gsub("([\\( ][Zz]) (\\.[0-9])","\\1<=>\\2",x)
x<-gsub("([\\( ][Zz]) (-\\.[0-9])","\\1<=>\\2",x)
x<-gsub("([\\( ][Zz]) ([0-9]\\.[0-9])","\\1<=>\\2",x)
x<-gsub("([\\( ][Zz]) (-[0-9]\\.[0-9])","\\1<=>\\2",x)
#x<-gsub("([\\( ][Zz]\\(.*?\\)) ([0-9\\.])","\\1<=>\\2",x)
#x<-gsub("([\\( ][Zz]\\(.*?\\)) (-[0-9\\.])","\\1<=>\\2",x)
# p value
x<-gsub("([\\( ]p) (0\\.[0-9])","\\1<=>\\2",gsub("([\\( ]p) (\\.[0-9])","\\1<=>\\2",x))
x<-gsub("([\\( ]ps) (0\\.[0-9])","\\1<=>\\2",gsub("([\\( ]ps) (\\.[0-9])","\\1<=>\\2",x))
# eta2
x<-gsub("partial2","eta2<=>",x)
x<-gsub("([\\( ]eta2) ([0-9\\.])","\\1<=>\\2",gsub("([\\( ]eta2) (\\.[0-9])","\\1<=>\\2",x))
x<-gsub("([\\( ]eta) ([0-9\\.])","\\1<=>\\2",gsub("([\\( ]eta) (\\.[0-9])","\\1<=>\\2",x))
x<-gsub("([\\( ]p2) (0\\.[0-9])"," eta2<=>\\2",gsub("([\\( ]p2) (\\.[0-9])"," eta2<=>\\2",x))
# others
x<-gsub("([\\( ]OR) ([0-9\\.])","\\1<=>\\2",x)
x<-gsub("([\\( ]RR) ([0-9\\.])","\\1<=>\\2",x)
x<-gsub("([\\( ][Oo]dds[ \\-][Rr]atio) ([0-9\\.])","\\1<=>\\2",x)
x<-gsub("([\\( ][Oo]dds) ([0-9\\.])","\\1<=>\\2",x)
x<-gsub("([\\( ][sS][dD]) ([0-9\\.])","\\1<=>\\2",x)
x<-gsub("([\\( ]SE) ([0-9\\.])","\\1<=>\\2",x)
x<-gsub("([\\( ]MSE) ([0-9\\.])","\\1<=>\\2",x)
x<-gsub("([\\( ]RMSE) ([0-9\\.])","\\1<=>\\2",x)
x<-gsub("([\\( ]RMSEA) ([0-9\\.])","\\1<=>\\2",x)
x<-gsub("([\\( ][Rr]ange) ([0-9\\.])","\\1=\\2",x)
x<-gsub("([\\( ]W) ([0-9\\.])","\\1<=>\\2",x)
x<-gsub("([\\( ]D) ([0-9\\.])","\\1<=>\\2",x)
x<-gsub("([\\( ][bB]) ([0-9\\.])","\\1<=>\\2",x)
x<-gsub("([\\( ][bB]) (-[0-9\\.])","\\1<=>\\2",x)
x<-gsub("([\\( ]slope) ([0-9\\.])","\\1=\\2",x)
x<-gsub("([\\( ]slope) (-[0-9\\.])","\\1=\\2",x)
x<-gsub("([\\( ][M]) ([0-9\\.])","\\1<=>\\2",x)
x<-gsub("([\\( ][M]) (-[0-9\\.])","\\1<=>\\2",x)
x<-gsub("([\\( ]beta) ([0-9\\.])","\\1=\\2",x)
x<-gsub("([\\( ]beta) (-[0-9\\.])","\\1=\\2",x)
x<-gsub("([\\( ]df) ([1-9])","\\1=\\2",x)
x<-gsub("([\\( ][gkm]) ([\\.0-9])","\\1=\\2",x)
x<-gsub("(power of 1) (\\.[0-9])","\\1-beta = \\2",x)
x<-gsub("([ (\\[]power) (\\.[0-9])","\\1=\\2",x)
# add () for df to stats without (): t12 = 12.3
x<-gsub("F([0-9]*?, [1-9]*?) ([<=>]*? [0-9\\.])","F(\\1) \\2",x)
x<-gsub("F ([0-9]*?, [1-9]*?) ([<=>]*? [0-9\\.])","F(\\1) \\2",x)
x<-gsub("t([0-9]*?) ([<=>]*? [0-9\\.])","t(\\1) \\2",x)
x<-gsub("t ([0-9]*?) ([<=>]*? [0-9\\.])","t(\\1) \\2",x)
x<-gsub("t([0-9]*?) (-[<=>]*? [0-9\\.])","t(\\1) \\2",x)
x<-gsub("t ([0-9]*?) (-[<=>]*? [0-9\\.])","t(\\1) \\2",x)
x<-gsub("r ([0-9]*?) ([<=>]*? [0-9\\.])","t(\\1) \\2",x)
x<-gsub("r([0-9]*?) ([<=>]*? [0-9\\.])","r(\\1) \\2",x)
x<-gsub("r ([0-9]*?) ([<=>]*? -[0-9\\.])","r(\\1) \\2",x)
x<-gsub("r([0-9]*?) ([<=>]*? -[0-9\\.])","r(\\1) \\2",x)
# add "=" between "N [1-9]"
x<-gsub("([\\(\\[ ][Nn]) ([1-9])","\\1=\\2",x)
# correct badly captured "-" as 2 in r(df)<=>2.num
x<-gsub("(r\\([0-9]*?\\)[<=>]*?)2\\.","\\1-.",x)
x<-gsub("(r\\([0-9]*?\\) [<=>]*? )2\\.","\\1-.",x)
# correct badly captured "-" as 2 in r(df)<=>2num.num but not t(df)=2.num0
x<-gsub("(t\\([0-9]*?\\)[<=>]*?)2([0-9]\\.)","\\1-\\2",x)
x<-gsub("(t\\([0-9]*?\\) [<=>]*? )2([0-9]\\.)","\\1-\\2",x)
# correct badly captured "-" as 2 in z<=>2num.num but not z=2.num0
# x<-gsub("([^a-zA-Z][zZ][<=>]*?)2([0-9]\\.)","\\1-\\2",x)
# x<-gsub("([^a-zA-Z][zZ] [<=>]*? )2([0-9]\\.)","\\1-\\2",x)
if(warning==TRUE) if(sum(check==x)!=length(x)) warning("CERMINE specific letter conversion was used to correct for some conversion errors.\n '<=>' was inserted by letter.convert() to make statistics readable.\n Note: The minus sign is sometimes converted to '2' or not at all!")
}
#cleanup
x<-gsub("\\t","",x)
# white spaces
x<-gsub("^ *|(?<= ) | *$", "", x, perl = TRUE)
# unify -
#x<-gsub("—|–","-",x)
return(x)
}
## Decode Unicode escapes and hexadecimal character references.
##
## Every literal "\uXXXX" escape (a backslash, the letter "u" and exactly four
## hexadecimal digits) in `string` is replaced by the character at that code
## point. Hexadecimal character references written as "&#xXXXX;" or
## "&#x0XXXX;" are first rewritten to the "\uXXXX" form, so they are decoded
## by the same step.
##
## NOTE(review): in the copy of this file under review the entity-prefix
## literals were unreadable (one pattern was an empty string, which matches
## everything). They are assumed to have been "&#x0" and "&#x" -- confirm
## against the upstream package source.
##
## Differences from the previous implementation:
## * only hexadecimal digits are accepted after "\u", so text such as
##   "\usepackage" is left alone instead of being turned into a "|" marker;
## * no "|" <-> "@@" placeholder round trip is used, so a genuine "@@" in the
##   input is preserved;
## * a ";" is only consumed as part of a complete character reference, never
##   from unrelated text such as "number;";
## * an escape whose code point cannot be converted is kept verbatim.
##
## @param string Character vector to decode.
## @return A single character string: the decoded elements of `string`
##   pasted together without a separator (as before, vector input is
##   collapsed).
udecode <- function(string) {
  # Regex for a literal backslash, "u" and four hexadecimal digits.
  hex_escape <- "\\\\u[0-9a-fA-F]{4}"

  # Decode one "\uXXXX" escape. intToUtf8() yields NA for code points it
  # cannot represent; in that case the escape text is returned unchanged.
  decode_escape <- function(escape) {
    decoded <- intToUtf8(strtoi(substr(escape, 3L, 6L), 16L))
    if (is.na(decoded)) escape else decoded
  }

  # "&#x0XXXX;" / "&#xXXXX;" -> "\uXXXX" (optional single padding zero).
  string <- gsub("&#[xX]0?([0-9a-fA-F]{4});", "\\\\u\\1", string)

  # Replace each escape in place, element by element, so that no escape can
  # be formed across the boundary of two input elements.
  hits <- gregexpr(hex_escape, string)
  regmatches(string, hits) <- lapply(
    regmatches(string, hits),
    function(escapes) {
      vapply(escapes, decode_escape, character(1L), USE.NAMES = FALSE)
    }
  )

  paste(string, collapse = "")
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.