data-raw/tr51.R

library(tidyverse)
library(glue)
library(assertthat)
library(stringi)

parse_emoji_data <- function(file = "data-raw/tr51/emoji-data.txt") {
  read_lines(file) %>%
    str_subset("^[0-9A-F]") %>%
    tibble(txt = .) %>%
    separate(txt, into = c("code", "type_field", "description"), sep = "[;#]", extra = "drop") %>%
    mutate_all(str_trim) %>%
    mutate(points = map(str_split(code, "[.]{2}"), strtoi, base = 16))
}

parse_emoji_sequence <- function(file) {
  read_lines(file) %>%
    str_subset("^[0-9A-F]") %>%
    tibble(txt = .) %>%
    separate(txt, into = c("code", "type_field", "description"), sep = "[;#]", extra = "drop") %>%
    mutate_all(str_trim) %>%

    mutate(
      points = map(str_split(code, " "), strtoi, base = 16),
      nrunes = map_int(points, length)
    ) %>%
    arrange(desc(nrunes), code)
}

code_point_range <- function(x) {
  if (length(x) == 1) {
    stri_enc_fromutf32(x)
  } else {
    unicode <- map(x, stri_enc_fromutf32)
    paste0("[", unicode[1], "-", unicode[2], "]")
  }
}


emoji_data <- parse_emoji_data()
data_modifier_base <- emoji_data %>%
  filter(type_field == "Emoji_Modifier_Base")

# emoji presentation

# emoji presentation not possibly affected by skin tone modifiers
data_presentation <- emoji_data %>%
  filter(type_field == "Emoji_Presentation") %>%
  anti_join(select(data_modifier_base, code), by = "code")

rx_presentation <- data_presentation %>%
  pull(points) %>%
  map_chr(code_point_range) %>%
  paste0(collapse = "|")

emoji_data_no_presentation <- anti_join(
  filter(emoji_data, type_field == "Emoji"),
  filter(emoji_data, type_field == "Emoji_Presentation"),
  by = "code"
) %>%
  anti_join(select(data_modifier_base, code), by = "code")

rx_emoji <- emoji_data_no_presentation %>%
  pull(points) %>%
  map_chr(code_point_range) %>%
  paste0(collapse = "|") %>%
  paste0("(?:", ., ")\UFE0F") %>%
  str_replace("[*]", "[*]")

emoji_data_picto <- anti_join(
  filter(emoji_data, type_field == "Extended_Pictographic"),
  filter(emoji_data, type_field == "Emoji"),
  by = "code"
)

rx_picto <- emoji_data_picto %>%
  pull(points) %>%
  map_chr(code_point_range) %>%
  paste0(collapse = "|") %>%
  paste0("(?:", ., ")\UFE0F")

#### emoji sequences
emoji_sequences <- parse_emoji_sequence("data-raw/tr51/emoji-sequences.txt")

rx_keycap <- emoji_sequences %>%
  filter(type_field == "Emoji_Keycap_Sequence") %>%
  pull(points) %>%
  map_int(1) %>%
  stri_enc_fromutf32() %>%
  paste0("[", ., "]\uFE0F\u20E3")

# this can probably be simplified as there are some common runes
rx_subregion_flag <- emoji_sequences %>%
  filter(type_field == "RGI_Emoji_Tag_Sequence") %>%
  pull(points) %>%
  stri_enc_fromutf32() %>%
  paste0(collapse = "|")

rx_flags <- paste0("[\U1F1E6-\U1F1FF]{2}|", rx_subregion_flag)

modifier_base <- data_modifier_base %>%
  pull(points) %>%
  map_chr(code_point_range) %>%
  paste0(collapse = "|") %>%
  paste0("(?:", ., ")")

rx_modifier <- emoji_data %>%
  filter(type_field == "Emoji_Modifier") %>%
  pull(points) %>%
  magrittr::extract2(1L) %>%
  map_chr(stri_enc_fromutf32) %>%
  {
    paste0("[", .[1], "-", .[2], "]")
  }

rx_modifier_sequence <- glue("{modifier_base}\UFE0F?{rx_modifier}?")

rx_sequences <- glue("(?#\n\n--------simple sequences with skin tone modifiers--------){rx_modifier_sequence}|(?#\n\n--------country flags--------){rx_flags}|(?#\n\n--------keycap sequences--------){rx_keycap}")

# dealing with kiss, family and couple separately
rx_adult <- "[\U1F468\U1F469]"
rx_kid <- "[\U1F466\U1F467]"
zwj <- "\u200D"

woman_sign <- "\u2640"
man_sign <- "\u2642"
rx_gender <- glue("[{woman_sign}{man_sign}]\uFE0F")

rx_couple_sequence <- glue("{rx_adult}\U200D\U2764\UFE0F\U200D{rx_adult}")
rx_kiss_sequence <- glue("{rx_adult}\U200D\U2764\UFE0F\U200D\U1F48B\U200D{rx_adult}")
rx_family_sequence <- glue("(?:{rx_adult}\u200D){{1,2}}(?:{rx_kid}\u200D?){{1,2}}")

# gendered roles ...
emoji_zwj_sequences <- parse_emoji_sequence("data-raw/tr51/emoji-zwj-sequences.txt")

# sequences that are coded in 3 runes
roles_zwj_3 <- emoji_zwj_sequences %>%
  filter(nrunes == 3, str_detect(description, "^woman")) %>%
  pull(points) %>%
  map_int(3L)
rx_zwj_3 <- glue("{rx_adult}{rx_modifier}?{zwj}[{roles}]",
  roles = stri_enc_fromutf32(roles_zwj_3)
)

# those coded in 4 runes - with the gender first
roles_zwj_4_1 <- emoji_zwj_sequences %>%
  filter(nrunes == 4, str_detect(description, "^woman"), !str_detect(description, ":"), str_detect(code, "^1F469")) %>%
  pull(points) %>%
  map_int(3)
rx_zwj_4_1 <- glue("{rx_adult}{rx_modifier}?{zwj}[{roles}]\uFE0F",
  roles = stri_enc_fromutf32(roles_zwj_4_1)
)

# those coded in 4 runes - with in 3rd position
roles_zwj_4_2 <- emoji_zwj_sequences %>%
  filter(nrunes == 4, str_detect(description, "^(blond-haired )?(mer|wo)m[ae]n"), !str_detect(description, ":"), !str_detect(code, "^1F469")) %>%
  pull(points) %>%
  map_int(1)
rx_zwj_4_2 <- glue("[{roles}]{rx_modifier}?{zwj}{rx_gender}",
  roles = stri_enc_fromutf32(roles_zwj_4_2)
)

rx_rainbow_flag <- filter(emoji_zwj_sequences, description == "rainbow flag") %>%
  pull(points) %>%
  stri_enc_fromutf32()

roles_zwj_5 <- emoji_zwj_sequences %>%
  filter(nrunes == 5, !str_detect(description, ":"), str_detect(description, "woman")) %>%
  pull(points) %>%
  map_int(1)
rx_zwj_5 <- glue("[{roles}](?:\uFE0F|{rx_modifier}){zwj}{rx_gender}",
  roles = stri_enc_fromutf32(roles_zwj_5)
)

rx_eye <- filter(emoji_zwj_sequences, description == "eye in speech bubble") %>%
  pull(points) %>%
  stri_enc_fromutf32()

rx_pirate_flag <- filter(emoji_zwj_sequences, description == "pirate flag") %>%
  pull(points) %>%
  stri_enc_fromutf32()

rx_zwj_seq <- glue("(?#\n\n--------couple with heart--------\n){rx_couple_sequence}|(?#\n\n--------kiss--------\n){rx_kiss_sequence}|(?#\n\n--------family--------\n){rx_family_sequence}|(?#\n\n--------gendered roles--------\n){rx_zwj_3}|{rx_zwj_4_1}|(?#\n\n--------gendered activities--------\n){rx_zwj_4_2}|{rx_zwj_5}|(?#\n\n--------other zwj sequences--------){rx_rainbow_flag}|{rx_eye}|{rx_pirate_flag}")

####### final rx
emoji_rx <- unclass(glue("{rx_zwj_seq}|{rx_sequences}|(?#\n\n--------Emoji_Presentation--------){rx_presentation}|(?#\n\n--------Emoji--------){rx_emoji}|(?#\n\n--------Extended_Pictographic--------){rx_picto}"))
cat(emoji_rx)

usethis::use_data(emoji_rx, overwrite = TRUE)
EmilHvitfeldt/emoji documentation built on Oct. 30, 2024, 12:27 p.m.