Nothing
#' Add random error flags to a data frame.
#'
#' \code{add_random_error} adds a column of error flags (between 0 and 1)
#' to a data frame based on the \code{prob}.
#'
#' @param dataset A data frame of the dataset.
#' @param error_name A string of the name and type of the error in the form of
#' 'error name_error type'. The error name should be one of the variable name
#' in the \code{dataset}, and the error type can be either: 'missing', 'insert',
#' 'variant', 'typo', 'pho', 'ocr', 'trans_date' or 'trans_char'.
#' @param prob A vector of two numerical probablities, where the first one is
#' the probablity of being 0 and the second one is the probablity of being 1.
#' @return A data frame of the \code{dataset} with an additional column of binary encoded
#' error.
#' @examples
#' adult_with_flag <- add_random_error(adult[1:100,], prob = c(0.97, 0.03), "age_missing")
#' adult_with_flag <- add_random_error(adult_with_flag, prob = c(0.65, 0.35), "education_typo")
#'
#' @export
add_random_error <- function(dataset, error_name, prob = c(0.95, 0.05))
{
tmp <- sample(c(0, 1), prob = prob, replace = TRUE, size = nrow(dataset))
tmp <- as.factor(tmp)
dataset <- cbind(dataset, tmp)
colnames(dataset)[length(dataset)] <- paste0(error_name, "_flag")
return(dataset)
}
#' Add two dependent error flags to a data frame.
#'
#' \code{add_dependent_error} adds two column of dependent error flags (between 0 and 1)
#' to a data frame.
#'
#' @param dataset A data frame of the dataset.
#' @param error_names A string of the variable names and type of the error in the form of
#' 'variable 1_variable 2_error type'. The error of variable 2 depends on the error of
#' varable 1. The error type can be either: 'missing', 'insert', 'variant', 'typo',
#' 'pho', 'ocr', 'trans_date' or 'trans_char'.
#' @param prior_probs A vector of two numerical probablities, where the first one is
#' the prior probablity of variable 1 being 0 (no error) and the second one is the prior probablity
#' of variable 1 being 1 (having error).
#' @param cond_probs A vector of four numerical probablities, where the first two probablities
#' are the probablities of variable 2 being 0 and 1 given variable 1 being 0, and the last
#' two are the probablities of variable 2 being 0 and 1 given variable 1 being 1.
#' @return A data frame of the \code{dataset} with two additional dependent column of binary
#' encoded error.
#' @examples
#' adult_with_flag <- add_dependent_error(adult[1:100,], "race_sex_typo")
#' adult_with_flag <- add_dependent_error(adult[1:100,], "age_sex_missing",
#' prior_probs = c(0.99, 0.01),
#' cond_probs = c(0.95, 0.05, 0.4, 0.6))
#'
#' @export
add_dependent_error <- function(dataset, error_names,
prior_probs = c(0.50, 0.50),
cond_probs = c(0.95, 0.05, 0.85, 0.15))
{
sample_cpt <- function(prior, cond){
il <- mapply(function(x,y){sample(c(0,1), length(x), prob = y, replace = TRUE)},
x=split(prior, prior),
y=split(cond, seq(nrow(cond))))
unsplit(il, prior)
}
namestring <- strsplit(error_names, split = "_")[[1]]
var1 <- sample(c(0,1), nrow(dataset), prob=prior_probs, replace = TRUE)
if (length(cond_probs) == 4){
cond_probs <- matrix(cond_probs, byrow=TRUE, nrow = 2)
var2 = sample_cpt(var1, cond_probs)
tmp = table(var1, var2)
# dimnames(tmp) = list(c(paste0(namestring[1],"_0"), paste0(namestring[1],"_1")),
# c(paste0(namestring[2],"_0"), paste0(namestring[2],"_1")))
# print("The conditional distribution of the error flags is:" )
# print(prop.table(tmp, margin = 1))
var1 <- as.factor(var1)
var2 <- as.factor(var2)
dataset <- cbind(dataset, var1, var2)
colnames(dataset)[length(dataset)-1] <- paste0(namestring[1], "_flag")
colnames(dataset)[length(dataset)] <- paste0(namestring[2], "_flag")
} else {
stop('please use appropirate cond_probs format')
}
return(dataset)
}
#' Add a synthetic but realistic variable to a dataset following some rules.
#'
#' \code{add_variable} adds a column of new variable to a dataset. This new
#' variable generated by some realistic rules. Several type of variables
#' are included:
#' \enumerate{
#' \item {nhsid}: each row is assigned with an identifical 10-digit
#' id that is randomly generated following the Modulus 11 Algorithm;
#' \item {dob}: if the \code{age_dependency} is TRUE and there is a variable called 'age'
#' in the \code{dataset}, the dob is generated based on the value of age and
#' \code{end_date}. If \code{age_dependency} is FALSE, the dob is randomly
#' generated between \code{start_date} and \code{end_date};
#' \item address: a random UK address sampled from 30,000 UK addresses, see \code{\link{gen_address}};
#' \item firstname: randomly sample a firstname from the selected database:
#' \itemize{
#' \item \code{country} If is 'uk' and \code{gender_dependency} and \code{age_dependency}
#' are both TRUE, the generated firstnames will automatically sample a firstname that based
#' on the gender and age of the indviduals within the \code{dataset}. The uk
#' firstname database was extracted from ONS containing firstnames and their frequencies
#' in England and Wales from 1996 to 2018.
#' \item If \code{country} is 'us' and \code{gender_dependency} and \code{race_dependency}
#' are both TRUE, the generated firstnames will automatically sample a firstname that based
#' on the gender and ethnicity of the indviduals within the \code{dataset}. The us
#' firstname database was extracted from \code{\link[randomNames:randomNamesData]{randomNamesData}}.
#' Current ethnicity codes are: 1 American Indian or Native Alaskan, 2 Asian or Pacific Islander,
#' 3 Black (not Hispanic), 4 Hispanic, 5 White (not Hispanic) and 6 Middle-Eastern, Arabic.
#' }
#' \item lastname: randomly sample a lastname from the selected database:
#' \itemize{
#' \item If \code{country} is 'uk', the generated lastnames will automatically sample
#' a lastname from a extracted lastname database. The lastname database was extracted
#' from ONS.
#' \item If \code{country} is 'us' and \code{race_dependency} is TRUE, the generated
#' lastnames will automatically sample a lastname that based on the indvidual's ethnicity.
#' The us lastname database was extracted from \code{\link[randomNames:randomNamesData]{randomNamesData}}.
#' }
#' }
#'
#' @param dataset A data frame of the dataset.
#' @param type A string of the type of variable we want to add: 'nhsid', 'dob',
#' 'address', 'firstname' or 'lastname'.
#' @param country A string variable with a default of 'uk'. It can be either
#' 'uk' or 'us'.
#' @param start_date A Date variable with a default of '1900-01-01'.
#' @param end_date A Date variable with a default of '2020-01-01'.
#' @param age_dependency A logical variable with a default of FALSE
#' @param gender_dependency A logical variable with a default of FALSE
#' @param race_dependency A logical variable with a default of FALSE.
#' @return A data frame of the \code{dataset} with a new generated variable.
#' @examples
#' tmp1 <- add_variable(adult[1:100,], "nhsid")
#' tmp2 <- add_variable(adult[1:100,], "dob", end_date = "2015-03-02", age_dependency = TRUE)
#' tmp3 <- add_variable(adult[1:100,], "address")
#' tmp4 <- add_variable(adult[1:100,], "firstname", country = "uk", age_dependency = TRUE,
#' gender_dependency = TRUE)
#' tmp5 <- add_variable(adult[1:100,], "lastname", country = "uk")
#' tmp6 <- add_variable(adult[1:100,], 'firstname', country = 'us', gender_dependency=TRUE,
#' race_dependency=TRUE)
#' tmp7 <- add_variable(adult[1:100,], 'lastname', country='us', race_dependency = TRUE)
#'
#' @export
add_variable <- function(dataset, type, country = "uk", start_date = "1900-01-01",
end_date = "2020-01-01", age_dependency = FALSE,
gender_dependency = FALSE, race_dependency = FALSE)
{
if (tolower(type) == "nhsid")
{
dataset[type] <- 1
for (i in 1:nrow(dataset))
{
tmp <- gen_nhsid()
while (any(dataset[, type] == tmp))
{
tmp <- gen_nhsid()
}
dataset[i, type] <- tmp
}
} else if (tolower(type) == "dob" || tolower(type) == "date of birth")
{
if (age_dependency)
{
end_date <- as.character(as.Date(end_date))
dataset[type] <- end_date
for (i in 1:nrow(dataset))
{
age <- dataset$age[i]
dataset[i, type] <- as.character(as.Date(end_date) - age * 365)
}
} else
{
start_date <- as.Date(start_date)
end_date <- as.Date(end_date)
tmp <- as.character(as.Date(sample.int(end_date - start_date, nrow(dataset), replace = TRUE),
origin = start_date))
dataset <- cbind(dataset, tmp)
colnames(dataset)[length(dataset)] <- type
}
dataset[, type] <- as.character(dataset[, type])
} else if (tolower(type) == "address")
{
address_uk <- sdglinkage::address_uk
cols <- colnames(address_uk)
dataset[cols] <- NA
randomindex <- sample(1:nrow(address_uk), nrow(dataset), replace = TRUE)
for (i in 1:length(randomindex))
{
dataset[i, cols] <- address_uk[randomindex[i], ]
}
# dataset[cols] <- lapply(dataset[cols], factor)
dataset[, "postcode"] <- as.character(dataset[, "postcode"])
dataset[, "country"] <- as.character(dataset[, "country"])
dataset[, "primary_care_trust"] <- as.character(dataset[, "primary_care_trust"])
dataset[, "longitude"] <- as.numeric(dataset[, "longitude"])
dataset[, "latitude"] <- as.numeric(dataset[, "latitude"])
} else if (tolower(type) == "forename" || tolower(type) == "firstname")
{
dataset[type] <- ""
if (gender_dependency)
{
if (any(tolower(colnames(dataset)) == "sex"))
{
sex_var_name <- colnames(dataset)[tolower(colnames(dataset)) ==
"sex"]
} else if (any(tolower(colnames(dataset)) == "gender"))
{
sex_var_name <- colnames(dataset)[tolower(colnames(dataset)) ==
"gender"]
} else
{
print("either sex or gender will be accepted")
}
if (tolower(country) == "us")
{
firstname <- sdglinkage::firstname_us
if (race_dependency)
{
if (any(tolower(colnames(dataset)) == "race"))
{
race_var_name <- colnames(dataset)[tolower(colnames(dataset)) ==
"race"]
} else if (any(tolower(colnames(dataset)) == "ethnicty"))
{
race_var_name <- colnames(dataset)[tolower(colnames(dataset)) ==
"ethnicty"]
} else
{
print("either race or ethnicty will be accepted")
}
for (i in 1:nrow(dataset))
{
if (grepl(substr(dataset[i, race_var_name], 1, 3), "White (not Hispanic)") ||
dataset[i, race_var_name] == 5)
{
race_value <- "White (not Hispanic)"
} else if (grepl(substr(dataset[i, race_var_name], 1, 3), "American Indian or Native Alaskan")
|| dataset[i, race_var_name] == 1)
{
race_value <- "American Indian or Native Alaskan"
} else if (grepl(substr(dataset[i, race_var_name], 1, 3), "Black (not Hispanic)")
|| dataset[i, race_var_name] == 3)
{
race_value <- "Black (not Hispanic)"
} else if (grepl(substr(dataset[i, race_var_name], 1, 3), "Asian or Pacific Islander")
|| dataset[i, race_var_name] == 2)
{
race_value <- "Asian or Pacific Islander"
} else if (grepl(substr(dataset[i, race_var_name], 1, 3), "Middle-Eastern, Arabic")
|| dataset[i, race_var_name] == 6)
{
race_value <- "Middle-Eastern, Arabic"
} else{
race_value <- "Hispanic"
}
dataset[i, type] <- as.character(sample(firstname[firstname$sex ==
tolower(dataset[i, sex_var_name]) & firstname$race ==
race_value, 1], size = 1, replace = TRUE, prob = firstname[firstname$sex ==
tolower(dataset[i, sex_var_name]) & firstname$race ==
race_value, 2]))
}
} else
{
for (i in 1:nrow(dataset))
{
dataset[i, type] <- as.character(sample(firstname[firstname$sex ==
tolower(dataset[i, sex_var_name]), 1], size = 1,
replace = TRUE, prob = firstname[firstname$sex ==
tolower(dataset[i, sex_var_name]), 2]))
}
}
} else
{
firstname <- sdglinkage::firstname_uk
if (age_dependency)
{
if (any(tolower(colnames(dataset)) == "age"))
{
age_var_name <- colnames(dataset)[tolower(colnames(dataset)) ==
"age"]
for (i in 1:nrow(dataset))
{
birthyear <- as.numeric(substr(as.Date(end_date) - as.numeric(dataset[i,
age_var_name]) * 365, 1, 4))
if (birthyear < 1996)
{
birthyear <- 1996
} else if (birthyear > 2018)
{
birthyear <- 2018
}
dataset[i, type] <- as.character(sample(firstname[firstname$sex ==
tolower(dataset[i, sex_var_name]) & firstname$birthyear ==
birthyear, 1], size = 1, replace = TRUE, prob = firstname[firstname$sex ==
tolower(dataset[i, sex_var_name]) & firstname$birthyear ==
birthyear, 2]))
}
} else if (any(tolower(colnames(dataset)) == "dob"))
{
age_var_name <- colnames(dataset)[tolower(colnames(dataset)) ==
"dob"]
for (i in 1:nrow(dataset))
{
birthyear <- as.numeric(substr(dataset[i, age_var_name],
1, 4))
if (birthyear < 1996)
{
birthyear <- 1996
} else if (birthyear > 2018)
{
birthyear <- 2018
}
dataset[i, type] <- as.character(sample(firstname[firstname$sex ==
tolower(dataset[i, sex_var_name]) & firstname$birthyear ==
birthyear, 1], size = 1, replace = TRUE, prob = firstname[firstname$sex ==
tolower(dataset[i, sex_var_name]) & firstname$birthyear ==
birthyear, 2]))
}
} else
{
print("either age or dob will be accepted")
}
} else
{
for (i in 1:nrow(dataset))
{
dataset[i, type] <- as.character(sample(firstname[firstname$sex ==
tolower(dataset[i, sex_var_name]), 1], size = 1,
replace = TRUE, prob = firstname[firstname$sex ==
tolower(dataset[i, sex_var_name]), 2]))
}
}
}
} else
{
if (tolower(country) == "us")
{
firstname <- sdglinkage::firstname_us
if (race_dependency)
{
if (any(tolower(colnames(dataset)) == "race"))
{
race_var_name <- colnames(dataset)[tolower(colnames(dataset)) ==
"race"]
} else if (any(tolower(colnames(dataset)) == "ethnicty"))
{
race_var_name <- colnames(dataset)[tolower(colnames(dataset)) ==
"ethnicty"]
} else
{
print("either race or ethnicty will be accepted")
}
for (i in 1:nrow(dataset))
{
if (grepl(substr(dataset[i, race_var_name], 1, 3), "White (not Hispanic)") ||
dataset[i, race_var_name] == 5)
{
race_value <- "White (not Hispanic)"
} else if (grepl(substr(dataset[i, race_var_name], 1, 3), "American Indian or Native Alaskan")
|| dataset[i, race_var_name] == 1)
{
race_value <- "American Indian or Native Alaskan"
} else if (grepl(substr(dataset[i, race_var_name], 1, 3), "Black (not Hispanic)")
|| dataset[i, race_var_name] == 3)
{
race_value <- "Black (not Hispanic)"
} else if (grepl(substr(dataset[i, race_var_name], 1, 3), "Asian or Pacific Islander")
|| dataset[i, race_var_name] == 2)
{
race_value <- "Asian or Pacific Islander"
} else if (grepl(substr(dataset[i, race_var_name], 1, 3), "Middle-Eastern, Arabic")
|| dataset[i, race_var_name] == 6)
{
race_value <- "Middle-Eastern, Arabic"
} else{
race_value <- "Hispanic"
}
dataset[i, type] <- as.character(sample(firstname[firstname$race ==
race_value, 1], size = 1, replace = TRUE, prob = firstname[firstname$race ==
race_value, 2]))
}
} else
{
for (i in 1:nrow(dataset))
{
dataset[i, type] <- as.character(sample(firstname[,
1], size = 1, replace = TRUE, prob = firstname[,
2]))
}
}
} else
{
firstname <- sdglinkage::firstname_uk
if (age_dependency)
{
if (any(tolower(colnames(dataset)) == "age"))
{
age_var_name <- colnames(dataset)[tolower(colnames(dataset)) ==
"age"]
for (i in 1:nrow(dataset))
{
birthyear <- as.numeric(substr(as.Date(end_date) - as.numeric(dataset[i,
age_var_name]) * 365, 1, 4))
if (birthyear < 1996)
{
birthyear <- 1996
} else if (birthyear > 2018)
{
birthyear <- 2018
}
dataset[i, type] <- as.character(sample(firstname[firstname$birthyear ==
birthyear, 1], size = 1, replace = TRUE, prob = firstname[firstname$birthyear ==
birthyear, 2]))
}
} else if (any(tolower(colnames(dataset)) == "dob"))
{
age_var_name <- colnames(dataset)[tolower(colnames(dataset)) ==
"dob"]
for (i in 1:nrow(dataset))
{
birthyear <- as.numeric(substr(dataset[i, age_var_name],
1, 4))
if (birthyear < 1996)
{
birthyear <- 1996
} else if (birthyear > 2018)
{
birthyear <- 2018
}
dataset[i, type] <- as.character(sample(firstname[firstname$birthyear ==
birthyear, 1], size = 1, replace = TRUE, prob = firstname[firstname$birthyear ==
birthyear, 2]))
}
} else
{
print("either age or dob will be accepted")
}
} else
{
for (i in 1:nrow(dataset))
{
dataset[i, type] <- as.character(sample(firstname[,
1], size = 1, replace = TRUE, prob = firstname[,
2]))
}
}
}
}
dataset[, type] <- as.character(dataset[, type])
} else if (tolower(type) == "surname" || tolower(type) == "lastname")
{
if (tolower(country) == "us")
{
lastname <- sdglinkage::lastname_us
if (race_dependency)
{
dataset[type] <- ""
if (any(tolower(colnames(dataset)) == "race"))
{
race_var_name <- colnames(dataset)[tolower(colnames(dataset)) ==
"race"]
} else if (any(tolower(colnames(dataset)) == "ethnicty"))
{
race_var_name <- colnames(dataset)[tolower(colnames(dataset)) ==
"ethnicty"]
} else
{
print("either race or ethnicty will be accepted")
}
for (i in 1:nrow(dataset))
{
if (grepl(substr(dataset[i, race_var_name], 1, 3), "White (not Hispanic)") ||
dataset[i, race_var_name] == 5)
{
race_value <- "White (not Hispanic)"
} else if (grepl(substr(dataset[i, race_var_name], 1, 3), "American Indian or Native Alaskan")
|| dataset[i, race_var_name] == 1)
{
race_value <- "American Indian or Native Alaskan"
} else if (grepl(substr(dataset[i, race_var_name], 1, 3), "Black (not Hispanic)")
|| dataset[i, race_var_name] == 3)
{
race_value <- "Black (not Hispanic)"
} else if (grepl(substr(dataset[i, race_var_name], 1, 3), "Asian or Pacific Islander")
|| dataset[i, race_var_name] == 2)
{
race_value <- "Asian or Pacific Islander"
} else if (grepl(substr(dataset[i, race_var_name], 1, 3), "Middle-Eastern, Arabic")
|| dataset[i, race_var_name] == 6)
{
race_value <- "Middle-Eastern, Arabic"
} else{
race_value <- "Hispanic"
}
dataset[i, type] <- as.character(sample(lastname[lastname$race ==
race_value, 1], size = 1, replace = TRUE, prob = lastname[lastname$race ==
race_value, 2]))
}
} else
{
tmp <- sample(lastname[, 1], nrow(dataset), replace = TRUE,
prob = lastname[, 2])
dataset <- cbind(dataset, tmp)
colnames(dataset)[length(dataset)] <- type
}
} else
{
lastname <- sdglinkage::lastname_uk
tmp <- sample(lastname[, 1], nrow(dataset), replace = TRUE,
prob = lastname[, 2])
dataset <- cbind(dataset, tmp)
colnames(dataset)[length(dataset)] <- type
}
dataset[, type] <- as.character(dataset[, type])
}
return(dataset)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.