R/ChineseNames.R

Defines functions compute_name_index .onAttach

Documented in compute_name_index

#' ChineseNames: Chinese Name Database 1930-2008
#'
#' @description
#' A database of Chinese surnames and Chinese given names (1930-2008).
#' This database contains nationwide frequency statistics of
#' 1,806 Chinese surnames and 2,614 Chinese characters used in given names,
#' covering about 1.2 billion Han Chinese population
#' (96.8\% of the Han Chinese household-registered population
#' born from 1930 to 2008 and still alive in 2008).
#' This package also contains a function for computing multiple features of
#' Chinese surnames and Chinese given names for scientific research (e.g.,
#' name uniqueness, name gender, name valence, and name warmth/competence).
#'
#' @details
#' Details are described in
#' \url{https://github.com/psychbruce/ChineseNames}
#'
#' @note
#' This database does not contain any individual-level information
#' (so it does not leak personal privacy).
#' All data are at the name level or character level.
#' Extremely rare characters are not included.
#'
#' @source
#' This database was provided by Beijing Meiming Science and Technology Company (in collaboration) and
#' originally obtained from the National Citizen Identity Information Center (NCIIC) of China in 2008.
#'
#' @section Citation:
#' Bao, H.-W.-S. (2021). ChineseNames: Chinese Name Database 1930-2008.
#' R package version 1.0.0.
#' \url{https://CRAN.R-project.org/package=ChineseNames}
#'
#' @docType package
#' @name ChineseNames
NULL


.onAttach=function(libname, pkgname) {
  packageStartupMessage(
    "\nTo cite the `ChineseNames` package in publications, please use:\n\n",
    "Bao, H.-W.-S. (2021). ",
    "ChineseNames: Chinese Name Database 1930-2008. ",
    "R package version 1.0.0. ",
    "https://CRAN.R-project.org/package=ChineseNames\n")
}


#### Database ####


#' 1,806 Chinese surnames and nationwide frequency.
#'
#' @name familyname
#' @usage data(familyname)
#' @format A data frame with 7 variables:
#' \describe{
#'   \item{\code{surname}}{surname (in Chinese)}
#'   \item{\code{compound}}{0 = single surname, 1 = compound surname}
#'   \item{\code{initial}}{initial letter (a-z)}
#'   \item{\code{initial.rank}}{initial order (1-26)}
#'   \item{\code{n.1930_2008}}{total counts in the database}
#'   \item{\code{ppm.1930_2008}}{proportion in population (ppm = parts per million)}
#'   \item{\code{surname.uniqueness}}{surname uniqueness}
#' }
#' @details \url{https://github.com/psychbruce/ChineseNames}
NULL


#' 2,614 Chinese characters used in given names and nationwide frequency.
#'
#' @name givenname
#' @usage data(givenname)
#' @format A data frame with 25 variables:
#' \describe{
#'   \item{\code{character}}{character used in given names (in Chinese)}
#'   \item{\code{pinyin}}{pinyin (pronunciation)}
#'   \item{\code{bihua}}{number of strokes in a character}
#'   \item{\code{n.male}}{total counts in male}
#'   \item{\code{n.female}}{total counts in female}
#'   \item{\code{name.gender}}{difference in proportions of a character used by male vs. female}
#'   \item{\code{n.1930_1959}, \code{n.1960_1969}, \code{n.1970_1979}, \code{n.1980_1989}, \code{n.1990_1999}, \code{n.2000_2008}}{total counts in a birth cohort}
#'   \item{\code{ppm.1930_1959}, \code{ppm.1960_1969}, \code{ppm.1970_1979}, \code{ppm.1980_1989}, \code{ppm.1990_1999}, \code{ppm.2000_2008}}{proportion (parts per million) in a birth cohort}
#'   \item{\code{name.ppm}}{average ppm (parts per million) across all cohorts}
#'   \item{\code{name.uniqueness}}{name-character uniqueness (in naming practices)}
#'   \item{\code{corpus.ppm}}{proportion (parts per million) in contemporary Chinese corpus}
#'   \item{\code{corpus.uniqueness}}{character-corpus uniqueness (in contemporary Chinese corpus)}
#'   \item{\code{name.valence}}{name valence (positivity of character meaning) (based on subjective ratings from 16 raters, ICC = 0.921)}
#'   \item{\code{name.warmth}}{name warmth/morality (based on subjective ratings from 10 raters, ICC = 0.774)}
#'   \item{\code{name.competence}}{name competence/assertiveness (based on subjective ratings from 10 raters, ICC = 0.712)}
#' }
#' @details \url{https://github.com/psychbruce/ChineseNames}
NULL


#' Population statistics for the Chinese name database.
#' @name population
#' @usage data(population)
#' @details \url{https://github.com/psychbruce/ChineseNames}
NULL


#' Top 1,000 given names in 31 Chinese mainland provinces.
#' @name top1000name.prov
#' @usage data(top1000name.prov)
#' @details \url{https://github.com/psychbruce/ChineseNames}
NULL


#' Top 100 given names in 6 birth cohorts.
#' @name top100name.year
#' @usage data(top100name.year)
#' @details \url{https://github.com/psychbruce/ChineseNames}
NULL


#' Top 50 given-name characters in 6 birth cohorts.
#' @name top50char.year
#' @usage data(top50char.year)
#' @details \url{https://github.com/psychbruce/ChineseNames}
NULL


#### Functions ####


`%>%`=dplyr::`%>%`


#' Compute multiple features of surnames and given names.
#'
#' @description
#' Compute all available name features (indices) based on
#' \code{\link{familyname}} and \code{\link{givenname}}.
#' You can either input a data frame
#' with a variable of Chinese full names
#' (and a variable of birth years, if necessary)
#' or just input a vector of full names
#' (and a vector of birth years, if necessary).
#'
#' \itemize{
#'   \item Usage 1: Input a single value or a vector of \code{name} [and \code{birth}, if necessary].
#'   \item Usage 2: Input a data frame of \code{data}
#'   and the variable name of
#'   \code{var.fullname} (or \code{var.surname} and/or \code{var.givenname})
#'   [and \code{var.birthyear}, if necessary].
#' }
#'
#' \emph{Caution.} Name-character uniqueness (NU) of cases
#' with birth year >= 2010 would not be accurately computed
#' due to the year limitation of this name database.
#'
#' @param data Data frame.
#' @param var.fullname Variable name of Chinese full names (e.g., \code{"name"}).
#' @param var.surname Variable name of Chinese surnames (e.g., \code{"surname"}).
#' @param var.givenname Variable name of Chinese given names (e.g., \code{"givenname"}).
#' @param var.birthyear Variable name of birth year (e.g., \code{"birth"}).
#' @param name \strong{If no \code{data}}, you can just input a vector of full name(s).
#' @param birth \strong{If no \code{data}}, you can just input a vector of birth year(s).
#' @param index Which indices to compute?
#'
#' By default, it computes all available name indices:
#' \itemize{
#'   \item \code{NLen}: full-name length (2~4).
#'   \item \code{SNU}: surname uniqueness (1~6).
#'   \item \code{SNI}: surname initial (1~26).
#'   \item \code{NU}: name-character uniqueness (1~6).
#'   \item \code{CCU}: character-corpus uniqueness (1~6).
#'   \item \code{NG}: name gender (-1~1).
#'   \item \code{NV}: name valence (1~5).
#'   \item \code{NW}: name warmth (1~5).
#'   \item \code{NC}: name competence (1~5).
#' }
#'
#' For details, see \url{https://github.com/psychbruce/ChineseNames}
#' @param NU.approx Whether to \emph{approximately} compute name-character uniqueness (NU)
#' using \emph{the nearest two birth cohorts with relative weights}
#' (which would be more precise than just using a single birth cohort).
#' Default is \code{TRUE}.
#' @param digits Number of decimal places. Default is \code{4}.
#' @param return.namechar Whether to return separate name characters.
#' Default is \code{TRUE}.
#' @param return.all Whether to return all temporary variables
#' in the computation of the final variables.
#' Default is \code{FALSE}.
#'
#' @return
#' A new data frame (\code{data.table}) with name indices appended.
#' Full names are split into \code{name0}
#' (surnames, with compound surnames automatically detected),
#' \code{name1}, \code{name2}, and \code{name3} (given-name characters).
#'
#' @note For details and examples, see \url{https://github.com/psychbruce/ChineseNames}
#'
#' @examples
#' ## Prepare
#' sn=familyname$surname[1:12]
#' gn=c(top100name.year$name.all.1960[1:6],
#'      top100name.year$name.all.2000[1:6],
#'      top100name.year$name.all.1960[95:100],
#'      top100name.year$name.all.2000[95:100])
#' demodata=data.frame(name=paste0(sn, gn),
#'                     birth=c(1960:1965, 2000:2005,
#'                             1960:1965, 2000:2005))
#'
#' ## Compute
#' newdata=compute_name_index(demodata,
#'                            var.fullname="name",
#'                            var.birthyear="birth")
#' # use View(newdata) to see the results
#'
#' @import data.table
#' @importFrom bruceR Print MEAN LOOKUP
#' @export
compute_name_index=function(data=NULL,
                            var.fullname=NULL,
                            var.surname=NULL,
                            var.givenname=NULL,
                            var.birthyear=NULL,
                            name=NA, birth=NA,
                            index=c("NLen",
                                    "SNU", "SNI",
                                    "NU", "CCU", "NG",
                                    "NV", "NW", "NC"),
                            NU.approx=TRUE,
                            digits=4,
                            return.namechar=TRUE,
                            return.all=FALSE) {
  ## Prepare ##

  familyname=ChineseNames::familyname
  givenname=ChineseNames::givenname

  fuxing=familyname[familyname$compound==1, "surname"]
  ref0=givenname$name.ppm; names(ref0)=givenname$character
  ref1=givenname$ppm.1930_1959; names(ref1)=givenname$character
  ref2=givenname$ppm.1960_1969; names(ref2)=givenname$character
  ref3=givenname$ppm.1970_1979; names(ref3)=givenname$character
  ref4=givenname$ppm.1980_1989; names(ref4)=givenname$character
  ref5=givenname$ppm.1990_1999; names(ref5)=givenname$character
  ref6=givenname$ppm.2000_2008; names(ref6)=givenname$character

  compute_NU_char=function(char, year=NA, approx=TRUE) {
    raw=!approx
    if(is.na(char) | char=="")
      ppm="NA"
    else if(is.na(year) | year>=2010)
      ppm=ref0[char]  # overall
    else if(year<1930)
      ppm=ref1[char]  # 1930-1959
    else if(year<1960)
      ppm=ifelse(
        raw | year<1955,
        ref1[char],  # 1930-1959
        (ref1[char]*(1965-year) + ref2[char]*(year-1955))/10
      )
    else if(year<1970)
      ppm=ifelse(
        raw,
        ref2[char],  # 1960-1969
        ifelse(year<1965,
               (ref1[char]*(1965-year) + ref2[char]*(year-1955))/10,
               (ref2[char]*(1975-year) + ref3[char]*(year-1965))/10)
      )
    else if(year<1980)
      ppm=ifelse(
        raw,
        ref3[char],  # 1970-1979
        ifelse(year<1975,
               (ref2[char]*(1975-year) + ref3[char]*(year-1965))/10,
               (ref3[char]*(1985-year) + ref4[char]*(year-1975))/10)
      )
    else if(year<1990)
      ppm=ifelse(
        raw,
        ref4[char],  # 1980-1989
        ifelse(year<1985,
               (ref3[char]*(1985-year) + ref4[char]*(year-1975))/10,
               (ref4[char]*(1995-year) + ref5[char]*(year-1985))/10)
      )
    else if(year<2000)
      ppm=ifelse(
        raw,
        ref5[char],  # 1990-1999
        ifelse(year<1995,
               (ref4[char]*(1995-year) + ref5[char]*(year-1985))/10,
               (ref5[char]*(2005-year) + ref6[char]*(year-1995))/10)
      )
    else if(year<2010)
      ppm=ifelse(
        raw,
        ref6[char],  # 2000-2009 (2008)
        ifelse(year<2005,
               (ref5[char]*(2005-year) + ref6[char]*(year-1995))/10,
               ref6[char])
      )
    else
      ppm="NA"
    if(is.na(ppm)) ppm=0
    if(ppm=="NA") ppm=NA
    return(as.numeric( -log10((ppm+1)/10^6) ))
  }

  ## Debug ##

  `.`=NULL
  NLen=SNU=SNI=NU=CCU=NG=NV=NW=NC=NULL
  fx=sur.name=given.name=full.name=name0=name1=name2=name3=NULL

  ## Main ##

  if(is.na(name)==FALSE) {
    data=data.frame(name=name, birth=birth)
    var.fullname="name"
    var.birthyear="birth"
  }
  if(is.null(data))
    stop("Please input your data.")
  if(is.null(var.fullname) & is.null(var.surname) & is.null(var.givenname))
    stop("Please input variable(s) of full/family/given names.")

  data=as.data.frame(data)
  if(!is.null(var.fullname)) {
    d=data.table(full.name=data[[var.fullname]])
    d[,NLen:=nchar(full.name)]
    d[,sur.name:=substr(full.name, 1,
                        ifelse((substr(full.name, 1, 2) %in% fuxing) & NLen>2,
                               2, 1))]
    d[,given.name:=substr(full.name, nchar(sur.name)+1, NLen)]
  } else {
    if(!is.null(var.surname) & !is.null(var.givenname)) {
      d=data.table(sur.name=data[[var.surname]], given.name=data[[var.givenname]])
    } else {
      if(!is.null(var.surname))
        d=data.table(sur.name=data[[var.surname]], given.name="")
      if(!is.null(var.givenname))
        d=data.table(sur.name="", given.name=data[[var.givenname]])
    }
    d[,full.name:=paste0(sur.name, given.name)]
    d[,NLen:=nchar(full.name)]
  }
  d[,name0:=sur.name]
  d[,name1:=substr(given.name, 1, 1)]
  d[,name2:=substr(given.name, 2, 2)]
  d[,name3:=substr(given.name, 3, 3)]

  if(!is.null(var.birthyear))
    d=cbind(d, year=data[[var.birthyear]])
  else
    d=cbind(d, year=NA)

  d=d[,.(name0, name1, name2, name3, year, NLen)]

  log=(nrow(d)>=100000)

  if("SNU" %in% index) {
    d[,SNU:=LOOKUP(d, "name0", familyname, "surname", "surname.uniqueness", return="new.value") %>% round(digits)]
    if(log) Print("SNU computed.")
  }

  if("SNI" %in% index) {
    d[,SNI:=LOOKUP(d, "name0", familyname, "surname", "initial.rank", return="new.value")]
    if(log) Print("SNI computed.")
  }

  if("NU" %in% index) {
    d[,`:=`(
      nu1=mapply(compute_NU_char, name1, year, NU.approx),
      nu2=mapply(compute_NU_char, name2, year, NU.approx),
      nu3=mapply(compute_NU_char, name3, year, NU.approx)
    )]
    d[,NU:=MEAN(d, "nu", 1:3) %>% round(digits)]
    if(log) Print("NU computed.")
  }

  if("CCU" %in% index) {
    d[,`:=`(
      ccu1=LOOKUP(d, "name1", givenname, "character", "corpus.uniqueness", return="new.value"),
      ccu2=LOOKUP(d, "name2", givenname, "character", "corpus.uniqueness", return="new.value"),
      ccu3=LOOKUP(d, "name3", givenname, "character", "corpus.uniqueness", return="new.value")
    )]
    d[,CCU:=MEAN(d, "ccu", 1:3) %>% round(digits)]
    if(log) Print("CCU computed.")
  }

  if("NG" %in% index) {
    d[,`:=`(
      ng1=LOOKUP(d, "name1", givenname, "character", "name.gender", return="new.value"),
      ng2=LOOKUP(d, "name2", givenname, "character", "name.gender", return="new.value"),
      ng3=LOOKUP(d, "name3", givenname, "character", "name.gender", return="new.value")
    )]
    d[,NG:=MEAN(d, "ng", 1:3) %>% round(digits)]
    if(log) Print("NG computed.")
  }

  if("NV" %in% index) {
    d[,`:=`(
      nv1=LOOKUP(d, "name1", givenname, "character", "name.valence", return="new.value"),
      nv2=LOOKUP(d, "name2", givenname, "character", "name.valence", return="new.value"),
      nv3=LOOKUP(d, "name3", givenname, "character", "name.valence", return="new.value")
    )]
    d[,NV:=MEAN(d, "nv", 1:3) %>% round(digits)]
    if(log) Print("NV computed.")
  }

  if("NW" %in% index) {
    d[,`:=`(
      nw1=LOOKUP(d, "name1", givenname, "character", "name.warmth", return="new.value"),
      nw2=LOOKUP(d, "name2", givenname, "character", "name.warmth", return="new.value"),
      nw3=LOOKUP(d, "name3", givenname, "character", "name.warmth", return="new.value")
    )]
    d[,NW:=MEAN(d, "nw", 1:3) %>% round(digits)]
    if(log) Print("NW computed.")
  }

  if("NC" %in% index) {
    d[,`:=`(
      nc1=LOOKUP(d, "name1", givenname, "character", "name.competence", return="new.value"),
      nc2=LOOKUP(d, "name2", givenname, "character", "name.competence", return="new.value"),
      nc3=LOOKUP(d, "name3", givenname, "character", "name.competence", return="new.value")
    )]
    d[,NC:=MEAN(d, "nc", 1:3) %>% round(digits)]
    if(log) Print("NC computed.")
  }

  if(return.namechar)
    data=cbind(data, d[,.(name0, name1, name2, name3)])
  data.new=cbind(data, as.data.frame(d)[index]) %>% as.data.table()
  if(return.all)
    return(d)
  else
    return(data.new)
}

Try the ChineseNames package in your browser

Any scripts or data that you put into this service are public.

ChineseNames documentation built on April 1, 2021, 5:08 p.m.