#' @title Latin to Cyrillic conversion function
#' @description This function helps to convert transliterated Cyrillic to original Cyrillic.
#' @param mdat character vector to be back-transliterated to Cyrillic.
#' @param tolanguage language the text needs to be converted to ("Russian" by default)
#' @param LAOR rules of tranliteration from transliterated Cyrillic to original Cyrillic (the rules are listed in the file "transliterationLAOR.csv").
#' @param OROR rules to correct transliterated original Cyrillic (the rules are listed in the file "transliterationOROR.csv").
#' @param EnglishDetection if set to TRUE, the script avoids transliteration of words found in the English vocabulary (file: english.txt). If set to FALSE, only user defined stop words are used (file: stopwordsfile.csv).
#' @param EnglishLength threshold is set to ignore EnglishDectection words below given threshold.
#' @param RussianCorrection if set to TRUE, the script attempts to match every back-transliterated word with the Russian vocabulary (files: russian.txt and russian_surnames.txt).
#' @param SensitivityThreshold is used only if RussianCorrection==TRUE. It determines algorithm's sensitivity to mismatches (numbers closer to 0 define higher sensitivity to mismatches). SensitivityThreshold is set to 0.1 by default.
#' @export
#' @importFrom utils read.csv
#' @importFrom utils read.table
#' @importFrom stringdist stringdist
#' @importFrom stringi stri_unescape_unicode
#' @import dplyr
#' @import devtools
#' @return Returns the vector of transliterated characters in Cyrillic.
#' @examples
#' library(HooverArchives)
#'
#' # conversion to Russian
#' dat<-c("Mezhdunarodnaia gazeta. Gl. redaktor: Iu. Zarechkin. Moscow, Russia. Semiweekly. 199?",
#' "DEN' UCHITELIA komissiia po obrazovaniiu ob''edineniia Iabloko",
#' "III-ii RIM vestnik Rossiiskogo patrioticheskogo dvizheniia. Redaktory: M. Artem'ev, V. Rugich. Moscow, Russia.")
#'
#' converteddata_ru <- fromLATtoCYR(dat, LAOR=TRUE, OROR=FALSE, EnglishDetection=TRUE)
#'
#'
#' # conversion to Ukrainian
#'dat<-read.csv(system.file("Ukraine_microform.csv", package="HooverArchives"),
#' sep=",", encoding = "UTF-8", stringsAsFactors = FALSE)
#'
#'converteddata_uk <- fromLATtoCYR(dat$FIELD.245, tolanguage="Ukrainian")
fromLATtoCYR<-function(mdat=NULL, tolanguage="Russian", LAOR=TRUE, OROR=FALSE, EnglishDetection=TRUE,
EnglishLength=NULL, RussianCorrection=FALSE, SensitivityThreshold = 0.1){
#Global settings
#Sys.setlocale("LC_ALL",locale = tolanguage)
if(tolanguage=="Russian"){
tryCatch(Sys.setlocale(category='LC_CTYPE', locale='ru_RU'),
warning = function(c) Sys.setlocale(category='LC_CTYPE', locale='Russian_Russia.1251'))
trvfilename="transliterationLARU.csv";
rrvfilename="transliterationRURU.csv"}
if(tolanguage=="Ukrainian"){tryCatch(Sys.setlocale(category='LC_CTYPE', locale='uk_UA'),
warning = function(c) Sys.setlocale(category='LC_CTYPE', locale='Ukrainian_Ukraine.1251'))
Sys.setlocale("LC_CTYPE",locale = "Ukrainian_Ukraine.1251");
trvfilename="transliterationLAUK.csv";
rrvfilename=NULL}
if(RussianCorrection){
dicR<-read.table(system.file("russian.txt", package="HooverArchives"),
header = FALSE, sep = "", dec = ".", stringsAsFactors=FALSE)
dicN<-read.table(system.file("russian_surnames.txt", package="HooverArchives"),
header = FALSE, sep = "", dec = ".", stringsAsFactors=FALSE)
dicRS<-unname(unlist(dicR),unlist(dicN))}
autodetected <- NULL
transOROR.vector <- NULL
transLAOR.vector <- NULL
#loading data
trv <- read.csv(system.file(trvfilename, package="HooverArchives"), encoding = "UTF-8", stringsAsFactors=FALSE);
trv <- trv[,-1]
trv <- data.frame(apply(trv, 2, function(y) gsub("\\_", " ", y)))
trv[] <- lapply(trv, as.character)
if(!is.null(rrvfilename)){
rrv <- read.csv(system.file(rrvfilename, package="HooverArchives"), encoding = "UTF-8", stringsAsFactors=FALSE)
rrv <- data.frame(apply(rrv, 2, function(y) gsub("\\_", " ", y)))
rrv <- rrv[,-1]
rrv[] <- lapply(rrv, as.character)
}
swf <- read.csv(system.file("stopwordsfile.csv", package="HooverArchives"), encoding = "UTF-8", stringsAsFactors=FALSE)
swf <- data.frame(apply(swf, 2, function(y) gsub("\\_", " ", y)))
stopwords <- as.character(swf$stopwords)
#Functions
stopwords_encoder <- function(stopwords){
stopwordsfixed <- strsplit(stopwords,"")
result <- unlist(lapply(1:length(stopwordsfixed), function(x){
paste(paste("@",stopwordsfixed[[x]], sep=""), collapse="")
}))
return(result)}
translit<-function(dat, string_vec, EnglishDetection, SensitivityThreshold, RussianCorrection){
english_lookup <- function(a){
splVn <- splVn2 <- splVn3 <- NULL
varSpl <- c(" ", "\\-", "\\:", "\\!", "\\+", "\\?", "\\#", "\\$", "\\%", "\\^", "\\&", "\\*", "\\\\", "\\/", "\\|",
"\\_", "\\,", "\\.", "\\;", "\\\"", "\\(", "\\)", "\\[", "\\]", "\\<", "\\>", "\\{", "\\}", "\\~")
varSpl1 <- paste(c(paste(varSpl, "+", sep=""), "\\'+$", sep=""), collapse="|")
varSpl1 <- substring(varSpl1,1,nchar(varSpl1)-1)
splV <- unlist(strsplit(a, varSpl1, perl=TRUE))
varSpl2 <- paste(c(paste("^", varSpl, "+", sep=""), "\\d+", "^\\'+", sep=""), collapse="|")
varSpl2 <- substring(varSpl2,1,nchar(varSpl2)-1)
splV <- gsub(varSpl2 , "", splV)
splV <- splV[splV!=""]
splV <- unique(splV);
if(any(grepl("(?<=s)\\'$", splV, perl=TRUE))){
splVn <- which(grepl("(?<=s)\\'$", splV, perl=TRUE))
splVo <- splV
splV[splVn] <- substring(splV[splVn], 1, nchar(splV[splVn])-2)}
if(any(grepl("(?<=\\')s$", splV, perl=TRUE))){
splVn2 <- which(grepl("(?<=\\')s$", splV, perl=TRUE))
splVo2 <- splV
splV[splVn2] <- substring(splV[splVn2], 1, nchar(splV[splVn2])-2)}
if(any(grepl("('+$)|(\"+$)", splV, perl=TRUE))){
splVn3 <- which(grepl("(\'+$)|(\"+$)", splV, perl=TRUE))
splVo3 <- splV
splV[splVn3] <- gsub("[[:punct:]]+$", "", splV[splVn3])
}
minInd <- sapply(splV, function(x, y = dicE){
xx <- tolower(x)
yy <- tolower(y)
yy <- yy[substring(yy,1,1)%in%substring(xx,1,1)]
if(xx%in%yy){return(TRUE)}
return(FALSE)})
if(!is.null(splVn)){
names(minInd)[splVn]<-splVo[splVn]
}
if(!is.null(splVn2)){
names(minInd)[splVn2]<-splVo2[splVn2]
}
if(!is.null(splVn3)){
names(minInd)[splVn3]<-splVo3[splVn3]
}
return(minInd)}
fuzzy_function <- function(a, SensitivityThreshold){
splV<-unlist(strsplit(a," |\\."))
splV<-gsub(a," |\\.", splV)
splV<-gsub("[[:punct:] ]+","", unlist(splV))
splV<-gsub("\\d+", "", splV)
splV<-splV[splV!=""]
splV<-unique(splV);
minInd <- sapply(splV, function(x, y = dicRS){
xx<-tolower(x)
yy<-tolower(y)
yy = yy[substring(yy,1,1)%in%substring(xx,1,1)]
if(xx%in%yy){return(x[xx%in%yy])}
if(nchar(xx)<=2){y<-yy[nchar(yy)<=2]}else{
yy<-yy[substr(yy,1,3)==substr(xx, 1,3)]
}
index <- stringdist(xx, yy,method='jw')
if(length(index)>0){
min_index <- min(index)
if(min_index <= SensitivityThreshold){
result<-yy[which.min(index)]
}else{result<-xx}
}else{result<-xx}
return(result)})
AllUpperCase <- function(s) {
m = regexpr("[[:upper:]]+", s)
res = regmatches(s, m) == s
if(length(res)==0) res=FALSE
return (res)
}
AllLowerCase <- function(s) {
m = regexpr("[[:lower:]]+", s)
res = regmatches(s, m) == s
if(length(res)==0) res=FALSE
return (res)
}
FirstUpperCase <- function(s) {
res = grepl("^[[:upper:]][[:lower:]]+", s)
if(length(res)==0) res=FALSE
return (res)
}
for(i in 1:length(minInd)){
tests <- c(AllUpperCase(enc2native(names(minInd)[i])),
AllLowerCase(enc2native(names(minInd)[i])),
FirstUpperCase(enc2native(names(minInd)[i])))
if(any(tests)){
if(tests[1]){minInd[i] <- toupper(enc2native(minInd[i]))}
if(tests[2]){minInd[i] <- tolower(enc2native(minInd[i]))}
if(tests[3]){minInd[i] <- gsub(paste('^',substr(enc2native(minInd[i]),1,1), sep=""),
toupper(substr(enc2native(minInd[i]),1,1)), enc2native(minInd[i]))}
a <- gsub(enc2native(names(minInd)[i]), enc2native(minInd[i]), enc2native(a))
}}
result <- a
return(result)
}
#Operations
colnames(dat) <- c("subV", "withV")
if(any(grepl("(?<=\\\\u)\\w{4}", dat$withV, perl=TRUE))){
dat$withV[grepl("(?<=\\\\u)\\w{4}", dat$withV, perl=TRUE)]<-
stri_unescape_unicode((dat$withV[grepl("(?<=\\\\u)\\w{4}", dat$withV, perl=TRUE)]))
}
if(any(is.na(dat$withV))){
dat$withV[is.na(dat$withV)]<-""
}
subV.s <- split(dat$subV, nchar(dat$subV))
subV.s[[length(subV.s)]]<-c(subV.s[[length(subV.s)]],"''", "'", "\u0361")
stringWithStopWords <- string_vec
stringWithStopWords <- gsub("\"", "''", stringWithStopWords)
if(EnglishDetection==TRUE){
autodetected <- english_lookup(stringWithStopWords)
autodetected <- tryCatch(names(autodetected)[autodetected], error = function(e) e)
if(inherits(autodetected, "error")){autodetected <-NULL}
if(length(autodetected) == 0){autodetected <- NULL}
stopwords <- unique(c(autodetected, stopwords))
}
stopWordsReplace <- stopwords_encoder(stopwords)
replace_mat<-data.frame(stopwords, stopWordsReplace)
replace_mat<-replace_mat[order(nchar(replace_mat[,1]), replace_mat[,1], decreasing=TRUE),]
for(i in 1:nrow(replace_mat)){
if(grepl("[[:punct:]]$", replace_mat[i,1])){
stringWithStopWords <- gsub(replace_mat[i,1], replace_mat[i,2], stringWithStopWords)
}else{
#stringWithStopWords <- gsub(paste0("\\b",stopwords[i], "\\b", sep=""), stopWordsReplace[i], stringWithStopWords, perl=TRUE)
stringWithStopWords <- gsub(paste0("(?<![[:alpha:]]')", replace_mat[i,1], sep=""), replace_mat[i,2], stringWithStopWords, perl=TRUE)
}
}
if(grepl("'''", stringWithStopWords)){
stringWithStopWords <- gsub("^''|((?<=[[:punct:][:space:]])+'')|''(?=[[:punct:][:space:]])|''$", "'@'@'", stringWithStopWords, perl=TRUE)
stringWithStopWords <- gsub("@''", "@'", stringWithStopWords)
stringWithStopWords <- gsub("^'@", "@", stringWithStopWords)
}else{
stringWithStopWords <-
gsub("^''|((?<=[[:punct:][:space:]])+'')|''(?=[[:punct:][:space:]])|''$", "@'@'", stringWithStopWords, perl=TRUE)
}
for(i in seq(length(subV.s),1,-1)){
char_vS <- unlist(subV.s[i])
char_vL <- toupper(char_vS)
char_vC <- tryCatch(gsub("(^|[[:space:]])([[:alpha:]])", "\\1\\U\\2",
enc2native(char_vS), perl = TRUE), error = function(e) e)
if(inherits(char_vC, "error")) {warning("ERROR"); char_vC <- NULL}
chars <- unique(c(char_vS, char_vC, char_vL))
chars <- chars[!grepl("\\\\U", chars)]
j <- 1
while (j<(length(chars)+1)) {
if(chars[j]%in%dat$subV){
charSubst <- dat$withV[dat$subV%in%chars[j]]
}else if(chars[j]%in%toupper(dat$subV)){
charSubst <- toupper(dat$withV)[toupper(dat$subV)%in%chars[j]]
}else if(grepl("[[:alpha:]]", chars[j])){
dat$subVS <- gsub("(^|[[:space:]])([[:alpha:]])", "\\1\\U\\2", enc2native(dat$subV), perl = TRUE)
dat$withVS <- gsub("(^|[[:space:]])([[:alpha:]])", "\\1\\U\\2", enc2native(dat$withV), perl = TRUE)
charSubst <- dat$withVS[dat$subVS%in%chars[j]]
}
regexprg<-paste("(?<!\\@)", chars[j], sep="")
if(regexprg=="(?<!\\@)''"){
if(grepl("(?<=[[:lower:]])''|((?<=\\s[[:upper:]])'')|((?<=^[[:upper:]])'')", string_vec, perl=TRUE)){
stringWithStopWords<-
gsub("(?<=[[:lower:]])''|((?<=\\s[[:upper:]])'')|((?<=^[[:upper:]])'')", charSubst[1], stringWithStopWords, perl=TRUE)}
if(grepl("(?<=[[:upper:]])''", stringWithStopWords, perl=TRUE)){
stringWithStopWords<-gsub("(?<=[[:upper:]])''", toupper(charSubst[1]), stringWithStopWords, perl=TRUE)}
}
if(regexprg=="(?<!\\@)'"){
if(grepl("((?<=[[:lower:]])')|((?<=\\s[[:upper:]])')|((?<=^[[:upper:]])')", stringWithStopWords, perl=TRUE)){
stringWithStopWords <-
gsub("((?<=[[:lower:]])')|((?<=\\s[[:upper:]])')|((?<=^[[:upper:]])')", charSubst[1], stringWithStopWords, perl=TRUE)}
if(grepl("(?<=[[:upper:]])'", string_vec, perl=TRUE)){
stringWithStopWords <- gsub("(?<=[[:upper:]])'",toupper(charSubst[1]), stringWithStopWords, perl=TRUE)}
}
if(regexprg!="(?<!\\@)'"|regexprg!="(?<!\\@)''"){
if(grepl("(?<=\\\\u)\\w{4}", regexprg, perl=TRUE)){
regexprg<-paste("(?<!\\@)",
"<U\\+",regmatches(regexprg, gregexpr("(?<=\\\\u)\\w{4}", regexprg, perl=TRUE)),">", sep="")
stringWithStopWords <- gsub(regexprg, charSubst[1], stringWithStopWords, perl=TRUE)
}else{
stringWithStopWords <- gsub(regexprg, charSubst[1], stringWithStopWords, perl=TRUE)
}
}
j = j + 1
}
}
if(RussianCorrection){
if(!is.na(stringWithStopWords)){
stringWithStopWords <- fuzzy_function(stringWithStopWords, SensitivityThreshold)
}
}
return(stringWithStopWords)}
if(EnglishDetection){
dicE <- read.table(system.file("english.txt", package="HooverArchives"),
header = FALSE, sep = "", dec = ".", stringsAsFactors=FALSE)[,1]
if(!is.null(EnglishLength)){
dicE<-dicE[nchar(dicE)>=EnglishLength]
}
}
if (is.character(mdat) & length(mdat)==1){
if (LAOR|(isFALSE(LAOR)&isFALSE(OROR))){
transLAOR.vector <- translit(trv, mdat, EnglishDetection, SensitivityThreshold, RussianCorrection)
}
if (LAOR & OROR){
transOROR.vector <- translit(rrv, transLAOR.vector, EnglishDetection, SensitivityThreshold, RussianCorrection)
}
if (isFALSE(LAOR) & OROR){
transOROR.vector <- translit(rrv, mdat, EnglishDetection, SensitivityThreshold, RussianCorrection)
}
}
if (is.character(mdat) & length(mdat) > 1){
if (LAOR|(isFALSE(LAOR)&isFALSE(OROR))){
transLAOR.list <- lapply(1:length(mdat), function(iter) {
translit(trv, mdat[iter], EnglishDetection, SensitivityThreshold, RussianCorrection)
})
transLAOR.vector <- unlist(transLAOR.list)
}
if (OROR & LAOR){
transOROR.list <- lapply(1:length(mdat), function(iter) {
translit(rrv, transLAOR.vector[iter], EnglishDetection, SensitivityThreshold, RussianCorrection)
})
transOROR.vector <- unlist(transOROR.list)
}
if (OROR & isFALSE(LAOR)){
transOROR.list <- lapply(1:length(mdat), function(iter) {
translit(rrv, mdat[iter], EnglishDetection, SensitivityThreshold, RussianCorrection)
})
transOROR.vector <- unlist(transOROR.list)
}
}
if(isFALSE(OROR) & isFALSE(LAOR)){
result <- transLAOR.vector
}else{
if(OROR){
result <- transOROR.vector
}else{
result <- transLAOR.vector
}
}
result<-gsub("@", "", result)
result<-gsub("''", "\"", result)
return(result)}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.