#' CBN
#'
#' This package contains tools and experimental items necessary to
#' replicate Caliskan, Bryson, and Narayanan (2017). 'Semantics derived
#' automatically from language corpora contain human-like biases'
#'
#' @source A. Caliskan, J. J. Bryson, and A. Narayanan (2017) 'Semantics derived
#' automatically from language corpora contain human-like biases'
#' \emph{Science}. 356:6334 \url{http://doi.org/10.1126/science.aal4230}.
#'
#' @docType package
#' @name cbn
#' @useDynLib cbn
#' @importFrom Rcpp sourceCpp
NULL
#' Gender Proportions for Names in the US Population
#'
#' This data set contains for each name used in \emph{any} of the studies,
#' (not just those in WEFAT 2) and its the gender proportions in the US population.
#' It was generated by the \code{gender} package, which uses US Social Security
#' Administration data.
#'
#' The columns of the data set are \code{name}, the name,
#' \code{proportion_male} and \code{proportion_female}, gender (a best guess
#' from the proportions), and the years within which the SSA search was
#' performed. This data set can merged with several of the study item sets,
#' but is most useful for replicating the second WEFAT study, as shown in the
#' replication vignette.
#'
#' This data should typically be joined e.g. using \code{merge}, to other item
#' information using the columns 'name' and 'Word' (assuming that information
#' comes from \code{cbn_get_items}). The replication vignette has an example.
#'
"cbn_gender_name_stats"
#' Gender Proportions for Names in the US Population from 1990 Census
#'
#' This data set contains gender information \emph{only} for names used in
#' WEFAT 2. It is a slightly normalized version of the original Dataverse
#' materials.
#'
#' The columns are \code{name}, \code{gender.score} a numerical score derived
#' (somehow) using -1 to be female, 0 to mean unisex, and 1 to mean male,
#' and \code{percentage.in.population}, \code{percentage.in.male.population},
#' and \code{percentage.in.female.population}. Apparently these three are
#' some measure of the prevalence of the name in the US population and two
#' gender subpopulations.
#'
#' The original materials are a tab separated file located at
#' \code{system.file("extdata", "censusNames1990.tsv", package = "cbn")}.
#'
#' Presumably some Bayes theorem with the addition of the population gender
#' balance recreates the quantity of substantive interest: P(gender | name).
#' This has not been done.
#'
"cbn_gender_name_stats_census1990"
#' All Items Used in All Studies
#'
#' This data frame contains all the items used in all the studies.
#' It is the data source for \code{cbn_get_items}.
#' Most of the time you should probably use that.
#'
#' @source A. Caliskan, J. J. Bryson, and A. Narayanan (2017) 'Semantics derived
#' automatically from language corpora contain human-like biases'
#' \emph{Science}. 356:6334 \url{http://doi.org/10.1126/science.aal4230}.
"cbn_items"
#' Cosine Similarity for Every Pair of Study Items
#'
#' A matrix of cosine similarities between each item and every other one.
#' Uses \code{cbn_items}.
#'
"cbn_item_cosines"
#' Vectors for All Items Used in All Studies
#'
#' A 457 x 300 matrix of (row) vectors for all study items,
#' extracted from the 840B word Common Crawl data on Jun 30th, 2018.
#'
#' @source J. Pennington, R. Socher, and C. D. Manning (2014)
#' 'GloVe: Global vectors for word representation'
#' \url{https://nlp.stanford.edu/projects/glove/}.
"cbn_item_vectors"
#' Get the Items in a Study
#'
#' Returns a data frame containing the items from one of the studies
#' (WEAT1 through WEAT10 or WEFAT1 or WEFAT2) or a vector containing
#' all items from all studies if \code{type} == "all".
#'
#' @param type "all" (the default), "WEAT", or "WEFAT"
#' @param number study number (default: 1) Ignored if \code{type} = "all"
#' @return a data frame of items in columns or a vector of all items
#' @export
cbn_get_items <- function(type = c("all", "WEAT", "WEFAT"), number = 1){
study_type <- match.arg(type)
if (study_type == "all") {
its <- unique(cbn::cbn_items$Word)
} else {
sname <- paste0(study_type, number)
its <- cbn::cbn_items[cbn::cbn_items$Study == sname, ]
class(its) <- c("cbn_study", class(its))
}
its
}
#' Make items
#'
#' @param studyname Name of your study
#' @param words a vector of words
#' @param conditions a vector of condition labels (must be the same length as
#' \code{words})
#' @param roles An optional vector of role description labels (must be the same length as
#' \code{words}). Values are either \code{target} or \code{attribute}
#'
#' @return a set of items
#' @export
cbn_make_items <- function(studyname, words, conditions, roles = NULL){
if (length(words) != length(conditions))
stop("words and conditions must be the length")
df <- data.frame(Study = studyname,
Word = words,
Condition = conditions,
stringsAsFactors = TRUE)
if (!is.null(roles))
if (length(roles) == length(words))
df$Role = roles
else
stop("words and conditions and roles must be the same length")
if (length(unique(df$Word)) != length(df$Word))
stop("words must be unique")
class(df) <- c("cbn_study", class(df))
df
}
#' Summary Method for Study Items
#'
#' A summary method for study items extracted via \code{\link{cbn_get_items}}.
#'
#' @param object A set of study items
#' @param ... Ignored
#' @return Condition names, roles (target or attribute) and N for study items
#' @export
#' @importFrom stats aggregate
#'
#' @examples
#' its <- cbn_get_items("WEAT", 6)
#' summary(its)
#'
summary.cbn_study <- function(object, ...){
if (is.null(object$Role))
f <- Word ~ Condition
else
f <- Word ~ Condition + Role
s <- aggregate(f, data = object, FUN = length)
colnames(s)[colnames(s) == "Word"] <- "N"
cat(unique(object[,'Study']), "\n")
print.data.frame(s, row.names = FALSE)
invisible(s)
}
#' Get Vectors for Items in a Study
#'
#' Returns a matrix containing word vectors for the items used in one
#' of the studies (WEAT1 through WEAT10 or WEFAT1 or WEFAT2).
#' If \code{type} == "all" then vectors for all items used in any of the studies
#' is returned. Words are row names.
#'
#' @param type "all" (the default), "WEAT", or "WEFAT"
#' @param number study number (default: 1) Ignored if \code{type} = "all"
#' @return a matrix with word vectors as rows
#' @export
cbn_get_item_vectors <- function(type = c("all", "WEAT", "WEFAT"), number = 1){
study_type <- match.arg(type)
if (study_type == "all") {
vecs <- cbn::cbn_item_vectors
} else {
its <- cbn_get_items(type = type, number = number)
vecs <- cbn::cbn_item_vectors[its$Word, ]
}
vecs
}
#' Set the Location of the Vectors File
#'
#' This function adds the location of the file of vectors to the
#' current environment (as the value of \code{CBN_VECTORS_LOCATION}).
#' If \code{persist} is TRUE it also adds this key to
#' \code{~/.Renviron} so that it is retained across R sessions.
#'
#' To recover the current location, use
#' \code{\link{cbn_get_vectorfile_location}}.
#'
#' @param f path where you unzipped your vectors file
#' @param persist Whether to add this to your R startup file
#' @return Nothing
#' @seealso \code{\link{cbn_get_vectorfile_location}}
#' @export
cbn_set_vectorfile_location <- function(f, persist = FALSE){
f <- normalizePath(f)
if (file.exists(f)) {
Sys.setenv(CBN_VECTORS_LOCATION = f)
add_to_Renviron("CBN_VECTORS_LOCATION", f)
} else
message(f, "does not exist!")
}
add_to_Renviron <- function(key, value) {
renviron = "~/.Renviron"
if (file.exists(renviron)) {
lines <- readLines(renviron)
new_lines <- Filter(function(x) { !startsWith(x, key) }, lines)
new_lines[[length(new_lines) + 1]] <- paste0(key, "=", value)
writeLines(paste0(new_lines, collapse = "\n"), renviron)
} else {
line <- paste0(key, "=", value, "\n")
writeLines(line, renviron)
}
}
#' Get the Location of the Vectors File
#'
#' Returns the full path to the file of word vectors. If there is no
#' environment variable \code{CBN_VECTORS_LOCATION} in the current
#' environment it prompts to set a location with
#' \code{cbn_set_vectorfile_location}
#'
#' If you want prefer the location of your downloaded vectors to persist
#' across sessions, add
#' \code{CBN_VECTORS_LOCATION=/Users/me/Documents/myvectors.txt}
#' or similar to your \code{~/.Renviron} file (creating the file if necessary).
#'
#' @return a full path to the vectors file
#' @seealso \code{\link{cbn_set_vectorfile_location}}
#' @export
cbn_get_vectorfile_location <- function(){
cc_loc <- Sys.getenv("CBN_VECTORS_LOCATION")
if (is.null(cc_loc))
stop("Location unknown: use cbn_set_vectorfile_location to assign it")
else
cc_loc
}
#' Extract Word Vectors From Current Vector File
#'
#' This function provides a more convenient wrapper for \code{extract_words}.
#' It uses the current vector file, whose location can be found using
#' \code{\link{cbn_get_vectorfile_location}} and assigned with
#' \code{\link{cbn_set_vectorfile_location}}.
#'
#' @param words words to get vectors for
#' @param verbose whether to report on progress
#' @param report_every how often to check in to see if we should stop
#'
#' @return a matrix with word vectors as rows
#' @export
cbn_extract_word_vectors <- function(words, verbose = FALSE, report_every = 100000){
loc <- cbn_get_vectorfile_location() # stops if none is set
mat <- cbn:::extract_words(words, vectors_file = loc, verbose = verbose,
report_every = report_every)
mat
}
#' Calculates Cosine Similarity Between Matrix Rows
#'
#' This function calculates the cosine similarity matrix between all
#' rows of a matrix \code{x}. When \code{x} and \code{y} are vectors
#' it calculates the cosine similarity between them. When \code{x}
#' is a vector and \code{y} is a matrix it calculates the cosine
#' between \code{x} and each row of \code{y}.
#'
#' This code is taken directly from the \code{lsa} package but adjusted to
#' operate rowwise.
#'
#' @param x A vector or a matrix (e.g., a document-term matrix).
#' @param y A vector with compatible dimensions to x. If NULL, use all columns of \code{x}.
#' @source The original code is from the \code{cosine} function by
#' Fridolin Wild (f.wild@open.ac.uk) in the \code{lsa} package.
#' @return An \code{ncol(x)} by \code{ncol(x)} matrix of cosine similarities, a scalar
#' cosine similarity, or a vector of cosine simialrities of length \code{nrow(y)}.
#' @export
cbn_cosine <- function(x, y = NULL){
if (is.matrix(x) && is.null(y)) {
co = array(0, c(nrow(x), nrow(x)))
f = rownames(x)
dimnames(co) = list(f, f)
for (i in 2:nrow(x)) {
for (j in 1:(i - 1)) {
co[i, j] = cbn_cosine(x[i, ], x[j, ])
}
}
co = co + t(co)
diag(co) = 1
return(as.matrix(co))
} else if (is.vector(x) && is.vector(y)) {
return(crossprod(x, y) / sqrt(crossprod(x) * crossprod(y)))
} else if (is.vector(x) && is.matrix(y)) {
co = vector(mode = "numeric", length = nrow(y))
names(co) = rownames(y)
for (i in 1:nrow(y)) {
co[i] = cbn_cosine(x, y[i, ])
}
return(co)
} else {
stop("Either one matrix, a vector and a matrix, or two vectors needed as input")
}
}
.onUnload <- function(libpath) {
library.dynam.unload("cbn", libpath)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.