# FamData.R - Definition for FamData R6 class and methods
#' Family data for modeling
#'
#' @description An R6 class that stores family data and a kinship matrix.
#'
#' @details Methods of this class can be used to fit regression models such as
#' polygenic mixed models. The class currently supports only family data with
#' one observation per subject and will check that this requirement is met.
#'
#' Note that the private `data` member of the instance (returned by the
#' [`FamData$get_data()`](#method-get_data) method) will always be sorted by
#' ascending `family_id` and ascending `indiv_id` within `family_id`
#' regardless of the sort order of the input data set. Also, pedigree columns
#' in the input data set will be renamed in the `data` member; this mapping
#' can be returned using the [`FamData$get_var_map()`](#method-get_var_map)
#' method.
#'
#' @export
FamData <- R6Class(
"FamData",
parent_env = getNamespace("FamModel"),
lock_class = TRUE,
# Public members ============================================================
public = list(
# Constructor =============================================================
#' @description Constructs a new instance of this class.
#'
#' @details All constructor arguments other than `data`
#' *must be named*. There are two possible constructor signatures:
#' ```
#' FamData$new(data, family_id, indiv_id, proband, sex,
#' maternal_id, paternal_id, mzgrp, dzgrp)
#'
#' FamData$new(data, family_id, indiv_id, proband, phi)
#' ```
#' @param data A `data.frame` or [`data.table`] containing all
#' phenotypes and genetic variables of interest.
#' @param ... The arguments required for a particular method signature;
#' see below for detailed descriptions.
#' @param family_id Character string containing the name of a numeric or
#' character column in `data` containing a family identifier.
#' @param indiv_id Character string containing the name of a numeric or
#' character column in `data` containing a individual identifier.
#' @param proband Character string containing the name of a numeric column
#' in `data` containing 1 for probands and 0 otherwise. `NA` is not
#' permitted.
#' @param sex Character string containing the name of a numeric column in
#' `data` containing the individual's sex (1 = "male", 2 = "female")
#' or `NA` if unknown.
#' @param maternal_id Character string containing the name of a column in
#' `data` of the same type as `indiv_id` containing the `indiv_id` of the
#' individual's mother or 0 (numeric), "" (character), or `NA` if a
#' founder. See documentation for [`kinship2::pedigree`].
#' @param paternal_id Character string containing the name of a column in
#' `data` of the same type as `indiv_id` containing the `indiv_id` of the
#' individual's father or or 0 (numeric), "" (character), or `NA` if a
#' founder. See documentation for [`kinship2::pedigree`].
#' @param mzgrp Character string containing the name of a numeric,
#' character, or factor column in `data` containing the same value for all
#' members of a monozygotic twin group. `NA` should be used for all
#' individuals who are not monozygotic twins.
#' @param dzgrp Character string containing the name of a numeric,
#' character, or factor column in `data` containing the same value for all
#' members of a dizygotic twin group. `NA` should be used for all
#' individuals who are not dizygotic twins.
#' @param phi A matrix coercible to a [`Matrix::dsCMatrix-class`] containing
#' the pairwise kinship coefficients for individuals. Note that
#' `phi[i, j]` must contain the kinship coefficient between the
#' individuals in `data[i, ]` and `data[j, ]`.
initialize = function(data, ...) {
private$data_name <- deparse(substitute(data))
args <- list(...)
# Pedigree data constructor ---------------------------------------------
if (setequal(
names(args),
c(
"family_id",
"indiv_id",
"maternal_id",
"paternal_id",
"sex",
"proband",
"mzgrp",
"dzgrp"
)
)) {
# Create variable name map
private$var_map <- c(
fmid = args[["family_id"]],
id = args[["indiv_id"]],
mid = args[["maternal_id"]],
pid = args[["paternal_id"]],
sex = args[["sex"]],
pr = args[["proband"]],
mzid = args[["mzgrp"]],
dzid = args[["dzgrp"]]
)
# Populate data member, rename to standard variable names for use
# within FamData object
private$data <- if (is.data.table(data)) {
copy(data)
} else {
data.table(data)
}
setnames(
private$data,
old = private$var_map,
new = names(private$var_map)
)
# Change all integer-type numeric columns to double (i.e., standard
# numeric)
is_int_col <- private$data[, sapply(.SD, is.integer)]
if (any(is_int_col)) {
int_cols <- names(is_int_col)[is_int_col]
private$data[,
(int_cols) := lapply(.SD, as.numeric),
.SDcols = int_cols
]
}
# Sort by fmid, then id
setkey(private$data, fmid, id)
# Make sure that there is only one row per individual within each fmid
if (anyDuplicated(private$data[, .(fmid, id)])) {
stop(
"Multiple rows found for the same individual within a fmid"
)
}
# Check variables
if (
!is.numeric(private$data[, fmid]) &
!is.character(private$data[, fmid])
) {
stop(
"family_id column contains improperly formatted data. ",
"See ?FamData."
)
}
if (
!is.numeric(private$data[, id]) &
!is.character(private$data[, id])
) {
stop(
"indiv_id column contains improperly formatted data. See ?FamData."
)
}
private$unique_id <- !anyDuplicated(private$data[, id])
if (class(private$data[, mid]) != class(private$data[, id])) {
stop(
"maternal_id column contains improperly formatted data. ",
"See ?FamData."
)
}
if (class(private$data[, pid]) != class(private$data[, id])) {
stop(
"paternal_id column contains improperly formatted data. ",
"See ?FamData."
)
}
bad_peds <- data.table(
with(
private$data,
kinship2::familycheck(
famid = fmid,
id = if (private$unique_id) id else paste(fmid, id, sep = "/"),
father.id = if (private$unique_id) {
pid
} else {
paste(fmid, pid, sep = "/")
},
mother.id = if (private$unique_id) {
mid
} else {
paste(fmid, mid, sep = "/")
}
)
)
)[unrelated > 0 | split != 1 | join != 0]
if (nrow(bad_peds) > 0) {
cat("Incorrectly specified pedigrees:\n")
print(bad_peds)
stop("Incorrectly specified pedigrees found. See above output.")
}
if (
!is.numeric(private$data[, pr]) |
!setequal(unique(private$data[, pr]), c(0, 1))
) {
stop(
"proband column contains improperly formatted data. See ?FamData."
)
}
if (
!is.numeric(private$data[, sex]) |
length(setdiff(unique(private$data[, sex]), c(1, 2, NA))) > 0
) {
stop(
"sex column contains improperly formatted data. See ?FamData."
)
}
# Create relation data.table for twins
rel <- NULL
if (private$data[, any(!is.na(mzid))]) {
rel <- private$data[
!is.na(mzid),
cbind(
expand.grid(
id1 = id,
id2 = id,
stringsAsFactors = FALSE
),
code = 1
),
keyby = .(fmid, mzid)
][id2 > id1][, mzid := NULL]
}
if (private$data[, any(!is.na(dzid))]) {
rel <- rbind(
rel,
private$data[
!is.na(dzid),
cbind(
expand.grid(
id1 = id,
id2 = id,
stringsAsFactors = FALSE
),
code = 2
),
keyby = .(fmid, dzid)
][id2 > id1][, dzid := NULL]
)
}
# Create pedigreeList
ped_args <- with(
private$data,
list(
id = id,
dadid = pid,
momid = mid,
sex = ifelse(is.na(sex), 3, sex),
famid = fmid
)
)
if (!is.null(rel)) {
setnames(rel, old = "fmid", new = "famid")
ped_args[["relation"]] <- rel
}
private$ped_list <-
do.call(kinship2::pedigree, ped_args)
# Calculate autosomal kinship matrix
private$phi <-
kinship2::kinship(private$ped_list, chrtype = "autosome")
# Check that order of individuals in kinship matrix is the same as in
# data member. Note that kinship2::kinship uses "id" unless it is not
# unique across families, in which case it uses "fmid/id"
if (
!(
identical(
private$data[, as.character(id)], rownames(private$phi)
) &&
identical(
private$data[, as.character(id)], colnames(private$phi)
)
) &&
!(
identical(
private$data[, paste(fmid, id, sep = "/")], rownames(private$phi)
) &&
identical(
private$data[, paste(fmid, id, sep = "/")], colnames(private$phi)
)
)
) {
stop("Subject order in phi matrix does not match data member")
}
# Store IDs of families with consanguinity, determined as those
# containing individuals with phi_ii > 0.5. This follows because phi_ii
# = 0.5(1 + f_i) and f_i > 0 iff the individual is inbred (i.e., parents
# are related)
private$consang <- private$data[
Matrix::diag(private$phi) > 0.5,
unique(fmid)
]
# Data set and kinship matrix constructor -----------------------------
} else if (
setequal(
names(args),
c("family_id", "indiv_id", "proband", "phi")
)
) {
# Create variable name map
private$var_map <- c(
fmid = args[["family_id"]],
id = args[["indiv_id"]],
pr = args[["proband"]]
)
# Populate data member and rename to standard variable names for use
# within FamData object. DO NOT SORT YET!
private$data <- data.table(data)
setnames(
private$data,
old = private$var_map,
new = names(private$var_map)
)
# Change all integer-type numeric columns to double (i.e., standard
# numeric)
is_int_col <- private$data[, sapply(.SD, is.integer)]
if (any(is_int_col)) {
int_cols <- names(is_int_col)[is_int_col]
private$data[,
(int_cols) := lapply(.SD, as.numeric),
.SDcols = int_cols
]
}
# Check variables
if (
!is.numeric(private$data[, fmid]) &
!is.character(private$data[, fmid])
) {
stop(
"family_id column contains improperly formatted data. ",
"See ?FamData."
)
}
if (
!is.numeric(private$data[, id]) &
!is.character(private$data[, id])
) {
stop(
"indiv_id column contains improperly formatted data. See ?FamData."
)
}
private$unique_id <- !anyDuplicated(private$data[, id])
if (
!is.numeric(private$data[, pr]) |
!setequal(unique(private$data[, pr]), c(0, 1))
) {
stop(
"proband column contains improperly formatted data. See ?FamData."
)
}
# Make sure that there is only one row per individual within each fmid
if (anyDuplicated(private$data[, .(fmid, id)])) {
stop(
"Multiple rows found for the same individual within a fmid"
)
}
# Cast kinship matrix argument to appropriate type
phi <- as(as(args[["phi"]], "symmetricMatrix"), "dsCMatrix")
# Get "id" or "fmid/id" in original sort order from data
# member. Mimicking behavior of kinship2::kinship, which uses "id"
# unless it is not unique across families
ids <- if (private$unique_id) {
private$data[, as.character(id)]
} else {
private$data[, paste(fmid, id, sep = "/")]
}
# Make "id" or "fmid/id" the row and column names of kinship matrix
dimnames(phi) <- list(ids, ids)
# Sort data member member by fmid, then id
setkey(private$data, fmid, id)
# Get "id" or "fmid/id" from data member in new sort order
sorted_ids <- if (private$unique_id) {
private$data[, as.character(id)]
} else {
private$data[, paste(fmid, id, sep = "/")]
}
# Permute rows and columns of kinship matrix to be in this order and
# assign to kinship matrix member
private$phi <- phi[sorted_ids, sorted_ids]
# Store IDs of families with consanguinity, determined as those
# containing individuals with phi_ii > 0.5. This follows because phi_ii
# = 0.5(1 + f_i) and f_i > 0 iff the individual is inbred (i.e., parents
# are related)
private$consang <- private$data[
Matrix::diag(private$phi) > 0.5,
unique(fmid)
]
# Uncrecognized constructor -------------------------------------------
} else {
stop("Unrecognized constructor signature. Please see documentation.")
}
},
# Accessors ===============================================================
#' @description Returns a copy of data member (to prevent accidental
#' modification of data member [`data.table`] by reference).
get_data = function() copy(private$data),
#' @description Returns name of input `data.frame` or [`data.table`].
get_data_name = function() private$data_name,
#' @description Returns vector containing family IDs in which consanguinity
#' was found (based on individual self-kinship coefficient greater than
#' 0.5).
get_consang = function() private$consang,
#' @description Returns kinship matrix member.
get_phi = function() private$phi,
#' @description Returns list mapping variable names in input data set in
#' `data` argument to those in data member.
get_var_map = function() private$var_map,
# Print method ============================================================
#' @description Prints information about contents of `FamData` objects
#' with nice formatting.
#'
print = function() {
cat("\n==========================\n")
cat("FamData OBJECT INFORMATION\n")
cat("==========================\n")
cat("Input data set: ", private$data_name, "\n", sep = "")
cat("Families: ", private$data[, uniqueN(fmid)], "\n", sep = "")
cat("Subjects: ", uniqueN(private$data[, .(fmid, id)]), "\n", sep = "")
cat(
"Kinship matrix: ",
if ("mid" %in% names(private$var_map)) {
"Calculated from data set"
} else {
"Supplied to constructor"
},
"\n", sep = ""
)
cat("Consanguinous families: ")
if (length(private$consang) == 0) {
cat("None\n")
} else {
cat("\n")
cat(private$consang, sep = "\n")
}
cat("Pedigree variable mapping:\n")
cat(
sprintf(" %-6s : %s", names(private$var_map), private$var_map),
sep = "\n"
)
cat(
"Unique subject ID: ", if (private$unique_id) "id" else "fmid + id",
"\n", sep = ""
)
cat("Available variables:\n")
cat(
strwrap(paste0(
setdiff(names(private$data), names(private$var_map)), collapse = ", "
)),
sep = "\n"
)
},
# Model fitting methods ===================================================
#' @description Fits a linear mixed model to ascertained families
#'
#' @details See `vignette("linear_mixed_models")` for background,
#' implementation details, and references.
#'
#' @param formula A [Formula::Formula()] object describing the model for a
#' phenotype. The formula is of the form `y ~ mean | group`, where `y`
#' (required) is the outcome, `mean` (optional) specifies the mean model
#' that includes an intercept by default, and `| group` (optional)
#' specifies a factor term that can be used assign families to homogeneous
#' groups with different variance components. Multiple variables can be
#' combined into a single factor using `:`.
#' @param ... Additional parameters to pass to the `control` list for
#' [optim()] with `method = "L-BGFS-B"`. Note that `parscale` and
#' `fnscale` cannot be modified.
#'
#' @return Returns a [`FamLMMFit`] object.
#'
lmm = function(formula, ...) {
if (!Formula::is.Formula(formula)) formula <- Formula::Formula(formula)
# Set Formula environment to base namespace to prevent using variables not
# explicitly included in data argument of model.frame
environment(formula) <- .BaseNamespaceEnv
formula_dims <- length(formula)
if (formula_dims[1] != 1) {
stop(
"A single response variable must be specified in the supplied ",
"formula"
)
} else if (formula_dims[2] > 2) {
stop(
"More than 2 RHS components cannot be specified in formula"
)
} else if (formula_dims[2] == 2) {
formula <- update(formula, . ~ . | 0 + .)
} else if (formula_dims[2] == 1) {
formula <- update(formula, . ~ . | 1)
}
mod_data <- private$make_model_mats_lmm(formula)
mod_data[["model"]] <- "sing_asc_lmm"
init_fits <- lm(
formula(formula, rhs = 1),
data = private$data[mod_data[["incl_ids"]], on = .(fmid, id)][pr == 0]
)
# Make sure name patterns reserved for variance components are not used in
# mean model
if (any(grepl("^(h2_a|sigma)(\\..+)?$", names(coef(init_fits))))) {
stop(
"Variables in mean model cannot have names matching the pattern ",
"'^(h2_a|sigma)(\\..+)?$'"
)
}
parameters <- list(
betas = coef(init_fits),
h2_a = rep(0, ncol(mod_data[["f_pops"]])),
sigma = rep(
summary(init_fits)[["sigma"]],
ncol(mod_data[["f_pops"]])
)
)
if (ncol(mod_data[["f_pops"]]) > 1) {
names(parameters[["h2_a"]]) <- names(parameters[["sigma"]]) <-
colnames(mod_data[["f_pops"]])
}
objfun <- TMB::MakeADFun(
mod_data,
parameters,
DLL = "FamModel",
method = NULL,
silent = TRUE
)
attr(objfun, "type") <- "LMM-SA"
optres <- lmm_optim(objfun, ...)
# Return new FamLMMFit object with results
FamLMMFit$new(self, formula, objfun, optres)
},
# Plotting methods ========================================================
#' @description For objects initialized with pedigree data, allows plotting
#' of individual pedigrees using [`kinship2::plot.pedigree`].
#' @param famid Argument matching the value of a single family in the
#' `family_id` column used in the constructor.
plot_pedigree = function(famid) {
if (!is.null(private$ped_list)) plot(private$ped_list[famid])
}
),
# Private members ===========================================================
private = list(
# Data members ============================================================
consang = NULL,
data = NULL,
data_name = NULL,
ped_list = NULL,
phi = NULL,
unique_id = NULL,
var_map = NULL,
# Utility methods =========================================================
make_model_mats_lmm = function(formula) {
if (private$data[pr == 1, .N, keyby = fmid][, max(N)] > 1) {
stop(
"Cannot have more than one proband per family under single ",
"ascertainment"
)
}
non_na <- which(apply(
model.frame(formula, private$data, na.action = na.pass),
1,
function(x) all(!is.na(x))
))
fam_w_proband <- private$data[non_na][pr == 1, fmid]
fam_w_rels <- private$data[
non_na,
.(n_rel = sum(pr == 0)),
keyby = fmid
][n_rel > 0, fmid]
incl <- sort(intersect(
non_na,
private$data[, which(fmid %in% intersect(fam_w_proband, fam_w_rels))]
))
mf <- model.frame(
formula,
private$data[incl],
na.action = na.fail,
drop.unused.levels = TRUE,
fmid = fmid,
id = id,
proband = pr
)
# Check that selection of observations has maintained original sort order
# of data member, which is ascending fmid, then ascending id
if (
!identical(
order(mf[["(fmid)"]], mf[["(id)"]]),
seq(1, nrow(mf))
)
) {
stop("Model frame not ordered by ascending fmid, then ascending id")
}
incl_ids <- mf[, c("(fmid)", "(id)")]
names(incl_ids) <- c("fmid", "id")
incl_ids_char <- with(mf, paste(`(fmid)`, `(id)`, sep = "/"))
incl_phi_ids <- if (private$unique_id) {
with(mf, as.character(`(id)`))
} else {
incl_ids_char
}
y <- as.numeric(model.response(mf))
names(y) <- incl_ids_char
X <- model.matrix(formula, mf, rhs = 1)
rownames(X) <- incl_ids_char
pops <- model.matrix(formula, mf, rhs = 2)
if (any(apply(pops, 1, sum) != 1)) {
stop(
"Grouping formula for variance parameters does not result in a ",
"mutually exclusive and exhaustive partition into groups."
)
}
f_sum <- data.table(
fmid = mf[["(fmid)"]],
proband = mf[["(proband)"]],
pops
)[,
.(pr_idx = .I[proband == 1], .N),
keyby = c("fmid", colnames(pops))
]
if (anyDuplicated(f_sum[, fmid])) {
stop(
"The following families have heterogeneous population IDs: ",
paste0(f_sum[duplicated(f_sum[, fmid]), fmid], collapse = ", "),
".\nAll members of a family included in the model must come from ",
"the same population."
)
}
f_sizes <- f_sum[, N]
f_pr_idxs <- f_sum[, pr_idx]
names(f_pr_idxs) <- names(f_sizes) <- f_sum[, fmid]
f_pops <- as.matrix(
f_sum[, colnames(pops), with = FALSE],
nrow = nrow(f_sum),
ncol = ncol(pops),
rownames = f_sum[, fmid],
colnames = colnames(pops)
)
list(
y = y,
X = X,
phi = private$phi[incl_phi_ids, incl_phi_ids],
f_sizes = f_sizes,
f_pr_idxs = f_pr_idxs,
f_pops = f_pops,
incl_ids = as.list(incl_ids)
)
}
)
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.