R/character.R

Defines functions unescape_regex escape_regex unescape_md escape_md rand_char str_first_appear str_similar_pair str_similar str_to_han str_symbol_to_han str_number_to_han str_alphabet_to_han str_remove_dakuten str_kana_to_zen str_either

Documented in escape_md escape_regex rand_char str_alphabet_to_han str_either str_first_appear str_kana_to_zen str_number_to_han str_remove_dakuten str_similar str_similar_pair str_symbol_to_han str_to_han unescape_md unescape_regex

#' Return y if x is NA or empty character, otherwise return x
#' @description Return y if x is NA, otherwise return x
#' @param x target value
#' @param y alternatively returned value
#' @return x or y, whose length is euqal to length_x
#' @export
str_either = function(x,y){
	ifelse(is.na(x) | x=="",y,x)
}

#' Replace hankaku-kana by zenkaku-kana
#' @description Replace hankaku-kana by zenkaku-kana.
#' @param x target character
#' @return replaced character
#' @export
str_kana_to_zen <- function(x){
	# character変換
	if (!is.character(x)){ x <- as.character(x) }

	# 濁点、半濁点文字の置換
	dh = c("\uff76\uff9e","\uff77\uff9e","\uff78\uff9e","\uff79\uff9e","\uff7a\uff9e","\uff7b\uff9e","\uff7c\uff9e","\uff7d\uff9e","\uff7e\uff9e","\uff7f\uff9e","\uff80\uff9e","\uff81\uff9e","\uff82\uff9e","\uff83\uff9e","\uff84\uff9e","\uff8a\uff9e","\uff8b\uff9e","\uff8c\uff9e","\uff8d\uff9e","\uff8e\uff9e","\uff8a\uff9f","\uff8b\uff9f","\uff8c\uff9f","\uff8d\uff9f","\uff8e\uff9f")
	dz = c("\u30ac","\u30ae","\u30b0","\u30b2","\u30b4","\u30b6","\u30b8","\u30ba","\u30bc","\u30be","\u30c0","\u30c2","\u30c5","\u30c7","\u30c9","\u30d0","\u30d3","\u30d6","\u30d9","\u30dc","\u30d1","\u30d4","\u30d7","\u30da","\u30dd")
#	dh <- c("ガ","ギ","グ","ゲ","ゴ","ザ","ジ","ズ","ゼ","ゾ","ダ","ヂ","ヅ","デ","ド","バ","ビ","ブ","ベ","ボ","パ","ピ","プ","ペ","ポ")
#	dz <- c("ガ","ギ","グ","ゲ","ゴ","ザ","ジ","ズ","ゼ","ゾ","ダ","ヂ","ヅ","デ","ド","バ","ビ","ブ","ベ","ボ","パ","ピ","プ","ペ","ポ")
	for( i in 1:length(dz) ){ x <- gsub(dh[i],dz[i],x) }

	# 1bite文字の置換
	x <- chartr("\uff71\uff72\uff73\uff74\uff75\uff76\uff77\uff78\uff79\uff7a\uff7b\uff7c\uff7d\uff7e\uff7f\uff80\uff81\uff82\uff83\uff84\uff85\uff86\uff87\uff88\uff89\uff8a\uff8b\uff8c\uff8d\uff8e\uff8f\uff90\uff91\uff92\uff93\uff94\uff95\uff96\uff97\uff98\uff99\uff9a\uff9b\uff9c\uff66\uff9d\uff61\uff62\uff63\uff64\uff65\uff66\uff67\uff68\uff69\uff6a\uff6b\uff6c\uff6d\uff6e\uff6f\uff70",
					"\u30a2\u30a4\u30a6\u30a8\u30aa\u30ab\u30ad\u30af\u30b1\u30b3\u30b5\u30b7\u30b9\u30bb\u30bd\u30bf\u30c1\u30c4\u30c6\u30c8\u30ca\u30cb\u30cc\u30cd\u30ce\u30cf\u30d2\u30d5\u30d8\u30db\u30de\u30df\u30e0\u30e1\u30e2\u30e4\u30e6\u30e8\u30e9\u30ea\u30eb\u30ec\u30ed\u30ef\u30f2\u30f3\u3002\u300c\u300d\u3001\u30fb\u30f2\u30a1\u30a3\u30a5\u30a7\u30a9\u30e3\u30e5\u30e7\u30c3\u30fc",
					x)
#	x <- chartr("アイウエオカキクケコサシスセソタチツテトナニヌネノハヒフヘホマミムメモヤユヨラリルレロワヲン。「」、・ヲァィゥェォャュョッー"
#					, "アイウエオカキクケコサシスセソタチツテトナニヌネノハヒフヘホマミムメモヤユヨラリルレロワヲン。「」、・ヲァィゥェォャュョッー"
#					, x)
	return(x)
}

#' Remove daku-on, han-daku-on
#' @description Remove daku-on, han-daku-on
#' @param x target character
#' @return replaced character
#' @export
str_remove_dakuten <- function(x){
	# character変換
	if (!is.character(x)){ x <- as.character(x) }

	x = stringr::str_remove_all(x,"[\u309b\u309c]")#゛゜

	# 濁点、半濁点文字の置換

	#	from <- c("ゔ","が","ぎ","ぐ","げ","ご","ざ","じ","ず","ぜ","ぞ","だ","ぢ","づ","で","ど","ば","び","ぶ","べ","ぼ","ぱ","ぴ","ぷ","ぺ","ぽ")
	from = c("\u3094","\u304c","\u304e","\u3050","\u3052","\u3054","\u3056","\u3058","\u305a","\u305c","\u305e","\u3060","\u3062","\u3065","\u3067","\u3069","\u3070","\u3073","\u3076","\u3079","\u307c","\u3071","\u3074","\u3077","\u307a","\u307d")
	#	to   <- c('"う","か","き","く","け","こ","さ","し","す","せ","そ","た","ち","つ","て","と","は","ひ","ふ","へ","ほ","は","ひ","ふ","へ","ほ"'
	to = c("\u3046","\u304b","\u304d","\u304f","\u3051","\u3053","\u3055","\u3057","\u3059","\u305b","\u305d","\u305f","\u3061","\u3064","\u3066","\u3068","\u306f","\u3072","\u3075","\u3078","\u307b","\u306f","\u3072","\u3075","\u3078","\u307b")

	for( i in 1:length(from) ){ x <- gsub(from[i],to[i],x) }

	#	from <- c("ヴ","ガ","ギ","グ","ゲ","ゴ","ザ","ジ","ズ","ゼ","ゾ","ダ","ヂ","ヅ","デ","ド","バ","ビ","ブ","ベ","ボ","パ","ピ","プ","ペ","ポ")
	from = c("\u30f4","\u30ac","\u30ae","\u30b0","\u30b2","\u30b4","\u30b6","\u30b8","\u30ba","\u30bc","\u30be","\u30c0","\u30c2","\u30c5","\u30c7","\u30c9","\u30d0","\u30d3","\u30d6","\u30d9","\u30dc","\u30d1","\u30d4","\u30d7","\u30da","\u30dd")
	#	to   <- c("ウ","カ","キ","ク","ケ","コ","サ","シ","ス","セ","ソ","タ","チ","ツ","テ","ト","ハ","ヒ","フ","ヘ","ホ","ハ","ヒ","フ","ヘ","ホ")
	to = c("\u30a6","\u30ab","\u30ad","\u30af","\u30b1","\u30b3","\u30b5","\u30b7","\u30b9","\u30bb","\u30bd","\u30bf","\u30c1","\u30c4","\u30c6","\u30c8","\u30cf","\u30d2","\u30d5","\u30d8","\u30db","\u30cf","\u30d2","\u30d5","\u30d8","\u30db")

	for( i in 1:length(from) ){ x <- gsub(from[i],to[i],x) }

	return(x)
}

#' Replace zenkaku-alphabet by hankaku-alphabet
#' @description Replace zenkaku-alphabet by hankaku-alphabet
#' @param x target character
#' @return replaced character
#' @export
str_alphabet_to_han = function(x){
	# character変換
	if (!is.character(x)){ x <- as.character(x) }

	# 1bite文字の置換
	# abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXZ
	x <- chartr("\uff41\uff42\uff43\uff44\uff45\uff46\uff47\uff48\uff49\uff4a\uff4b\uff4c\uff4d\uff4e\uff4f\uff50\uff51\uff52\uff53\uff54\uff55\uff56\uff57\uff58\uff59\uff5a\uff21\uff22\uff23\uff24\uff25\uff26\uff27\uff28\uff29\uff2a\uff2b\uff2c\uff2d\uff2e\uff2f\uff30\uff31\uff32\uff33\uff34\uff35\uff36\uff37\uff38\uff3a",
					"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ",
					x)

	return(x)
}

#' Replace zenkaku-number by hankaku-number
#' @description Replace zenkaku-number by hankaku-number
#' @param x target character
#' @return replaced character
#' @export
str_number_to_han = function(x){
	# character変換
	if (!is.character(x)){ x <- as.character(x) }

	# 1bite文字の置換
	# 0123456789
	x <- chartr("\uff10\uff11\uff12\uff13\uff14\uff15\uff16\uff17\uff18\uff19",
					"0123456789",
					x)

	return(x)
}

#' Replace zenkaku-symbol by hankaku-symbol
#' @description Replace zenkaku-symbol by hankaku-symbol
#' @param x target character
#' @return replaced character
#' @export
str_symbol_to_han = function(x){
	# character変換
	if (!is.character(x)){ x <- as.character(x) }

	# 1bite文字の置換
	# (){}[]<>“”!?,./@#$%^&*ー―~=_+|¥
	x <- chartr("\uff08\uff09\uff5b\uff5d\uff3b\uff3d\uff1c\uff1e\u201c\u201d\uff01\uff1f\uff0c\uff0e\uff0f\uff20\uff03\uff04\uff05\uff3e\uff06\uff0a",
					'(){}[]<>""!?,./@#$%^&*',
					x)
	# ‘’
	x <- chartr("\u2015\uff5e\uff1d\uff3f\uff0b\uff5c\uffe5\u2018",
					"-~=_+|\\'",
					x)
	x <- chartr("\u2019","'",x)
	return(x)
}

#' Replace zenkaku alphabet, number and symbols by hankaku
#' @description Replace zenkaku alphabet, number and symbols by hankaku
#' @param x target character
#' @return replaced character
#' @export
str_to_han = function(x){
	x %>%
		str_alphabet_to_han() %>%
		str_number_to_han() %>%
		str_symbol_to_han() %>%
		stringr::str_replace_all("\u3000"," ") %>%
		return()
}

#' Check similarity of given string with target.
#' @description Check similarity of given string with target.
#' @param string character for checking similarity
#' @param target character for compared target
#' @param similarity threshold similarity
#' @param only_sub count only substitute
#' @return logical: TRUE if string is enough similar with target.
#' @importFrom utils adist
#' @export
str_similar = function(string,target,similarity=3, only_sub = FALSE){
	if(only_sub){
		apply(attributes(adist(string,target,counts=TRUE))$count[,,"sub"]<=similarity,1,any)
	}else{
		apply(adist(string,target)<=similarity,1,any)
	}
}

#' Check similarity pair of given string with themselves.
#' @description Check similarity pair of given string with themselves.
#' @param string character for checking similarity
#' @param similarity threshold similarity
#' @param only_sub count only substitute
#' @return logical: TRUE if string is enough similar with target.
#' @importFrom utils adist
#' @export
str_similar_pair = function(string,similarity=3, only_sub = FALSE){
	if(only_sub){
		dist = attributes(adist(string,string,counts=TRUE))$count[,,"sub"]<=similarity & !diag(TRUE,length(string))
	}else{
		dist = adist(string,string)<=similarity & !diag(TRUE,length(string))
	}
	row = matrix(1:length(string),length(string),length(string))
	pair = cbind(t(row)[dist],row[dist])
	pair = pair[pair[,1]<pair[,2],]
	return(matrix(pair,ncol=2))
}

#' Wherher the given string appear first time in the given string vector
#' @description str can be uniqued by using string[str_first_appear(string)].
#' @param string target character vector.
#' @param similarity threshold similarity
#' @return logical: TRUE if the character element appear first time.
#' @importFrom utils adist
#' @export
str_first_appear = function(string,similarity=3){
	if(length(string)==0)return(logical(0))
	if(length(string)==1)return(TRUE)
	return(
		apply(apply((adist(string,string)<similarity),2,cumsum)>0,2,function(x){min((1:length(x))[x])})==1:length(string)
	)
}

#' Generate random character sequences
#' @description return n strings with length len.
#' @param n number of str
#' @param len length of each str
#' @param lower_case logical: include lower case alphabets
#' @param upper_case logical: include upper case alphabets
#' @param number logical: include numbers
#' @return sequences of string.
#' @export
rand_char = function(n,len,lower_case=TRUE,upper_case=TRUE,number=TRUE){
	cand = NULL
	if(number){
		cand = c(cand,"0","1","2","3","4","5","6","7","8","9")
	}
	if(lower_case){
		cand = c(cand,"a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z")
	}
	if(upper_case){
		cand = c(cand,"A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z")
	}
	if(n<=0)return(NULL)
	ans = NULL
	for(i in 1:n){
		ans = c(ans,paste(sample(cand,len),collapse=""))
	}
	return(ans)
}

#' Escape all markdown character.
#' @description Escape all markdown character.
#' @param str character vector
#' @return Returns a character vector.
#' @export
escape_md = function(str){
	str |>
		stringr::str_replace_all("\\[",'\\\\[') |>
		stringr::str_replace_all("\\]",'\\\\]') |>
		stringr::str_replace_all("\\(",'\\\\(') |>
		stringr::str_replace_all("\\)",'\\\\)') |>
		stringr::str_replace_all("\\{",'\\\\{') |>
		stringr::str_replace_all("\\}",'\\\\}') |>
		stringr::str_replace_all("\\*",'\\\\*') |>
		stringr::str_replace_all("\\+",'\\\\+') |>
		stringr::str_replace_all("\\-",'\\\\-') |>
		stringr::str_replace_all("\\.",'\\\\.') |>
		stringr::str_replace_all("`",'\\\\`') |>
		stringr::str_replace_all("_",'\\\\_') |>
		stringr::str_replace_all("#",'\\\\#') |>
		stringr::str_replace_all("!",'\\\\!')
}

#' Unescape all markdown character.
#' @description Unescape all markdown character.
#' @param str character vector
#' @return Returns a character vector.
#' @export
unescape_md = function(str){
	str |>
		stringr::str_replace_all("\\\\\\[",'[') |>
		stringr::str_replace_all("\\\\\\]",']') |>
		stringr::str_replace_all("\\\\\\(",'(') |>
		stringr::str_replace_all("\\\\\\)",')') |>
		stringr::str_replace_all("\\\\\\{",'{') |>
		stringr::str_replace_all("\\\\\\}",'}') |>
		stringr::str_replace_all("\\\\\\*",'*') |>
		stringr::str_replace_all("\\\\\\+",'+') |>
		stringr::str_replace_all("\\\\\\-",'-') |>
		stringr::str_replace_all("\\\\\\.",'.') |>
		stringr::str_replace_all("\\\\`",'`') |>
		stringr::str_replace_all("\\\\_",'_') |>
		stringr::str_replace_all("\\\\#",'#') |>
		stringr::str_replace_all("\\\\!",'!')
}

#' Escape all regex character.
#' @description Escape all regex character.
#' @param str character vector
#' @return Returns a character vector.
#' @export
escape_regex = function(str){
	str |>
		stringr::str_replace_all("\\[",'\\\\[') |>
		stringr::str_replace_all("\\]",'\\\\]') |>
		stringr::str_replace_all("\\(",'\\\\(') |>
		stringr::str_replace_all("\\)",'\\\\)') |>
		stringr::str_replace_all("\\{",'\\\\{') |>
		stringr::str_replace_all("\\}",'\\\\}') |>
		stringr::str_replace_all("\\*",'\\\\*') |>
		stringr::str_replace_all("\\+",'\\\\+') |>
		stringr::str_replace_all("\\-",'\\\\-') |>
		stringr::str_replace_all("\\.",'\\\\.') |>
		stringr::str_replace_all("\\^",'\\\\^') |>
		stringr::str_replace_all("\\$",'\\\\$') |>
		stringr::str_replace_all("\\?",'\\\\?') |>
		stringr::str_replace_all("\\|",'\\\\|')
}

#' Unescape all regex character.
#' @description Unescape all regex character.
#' @param str character vector
#' @return Returns a character vector.
#' @export
unescape_regex = function(str){
	str |>
		stringr::str_replace_all("\\\\\\[",'[') |>
		stringr::str_replace_all("\\\\\\]",']') |>
		stringr::str_replace_all("\\\\\\(",'(') |>
		stringr::str_replace_all("\\\\\\)",')') |>
		stringr::str_replace_all("\\\\\\{",'{') |>
		stringr::str_replace_all("\\\\\\}",'}') |>
		stringr::str_replace_all("\\\\\\*",'*') |>
		stringr::str_replace_all("\\\\\\+",'+') |>
		stringr::str_replace_all("\\\\\\-",'-') |>
		stringr::str_replace_all("\\\\\\.",'.') |>
		stringr::str_replace_all("\\\\\\^",'^') |>
		stringr::str_replace_all("\\\\\\$",'$') |>
		stringr::str_replace_all("\\\\\\?",'?') |>
		stringr::str_replace_all("\\\\\\|",'|')
}
hmito/hmRLib documentation built on March 13, 2024, 9:41 p.m.