# xtdescribe.R
# ::rtemis::
# 2024 EDG
#' Describe longitudinal dataset
#'
#' This is a test to emulate the `xtdescribe` function in Stata.
#'
#' @param x data.frame with longitudinal data
#' @param ID_col Integer: The column position of the ID variable
#' @param time_col Integer: The column position of the time variable
#' @param n_patterns Integer: The number of patterns to display
#'
#' @author EDG
#' @export
xtdescribe <- function(
x,
ID_col = 1,
time_col = 2,
n_patterns = 9) {
id_name <- names(x)[ID_col]
time_name <- names(x)[time_col]
# Print vec_describe of ID, with n = number of unique IDs
id_us <- sort(unique(x[[id_name]]))
n_ids <- length(id_us)
time_us <- sort(unique(x[[time_name]]))
time_min <- min(time_us)
time_max <- max(time_us)
leftwidth <- max(nchar(id_name), nchar(time_name))
cat(
pad(id_name, leftwidth), ": ", vec_describe(id_us),
" n = ", length(id_us), "\n",
sep = ""
)
cat(
pad(time_name, leftwidth), ": ", vec_describe(time_us),
" T = ", length(time_us), "\n",
sep = ""
)
# Calculate delta for time variable
# ?is this minimum delta?
delta <- min(diff(time_us))
cat(
rep(" ", leftwidth), " Delta (", time_name, ") = ", delta, " unit\n",
sep = ""
)
span <- ((max(time_us) - min(time_us)) / delta) + 1
cat(
rep(" ", leftwidth), " Span (", time_name, ") = ", span, " periods\n",
sep = ""
)
# Does id * time have unique values?
id_time_unique <- length(unique(interaction(x[[id_name]], x[[time_name]]))) == nrow(x)
uid <- if (id_time_unique) {
"uniquely identifies"
} else {
"does not uniquely identify"
}
cat(
rep(" ", leftwidth), " (", id_name, "*", time_name, " ", uid, " each observation)\n",
sep = ""
)
# Distribution of T_i at min, 5%, 25%, 50%, 75%, 95%, max
cat(
"\nDistribution of T_i:",
"\tmin\t5%\t25%\t50%\t75%\t95%\tmax\n",
sep = ""
)
id_freq <- table(x[[id_name]])
id_freq_quant <- quantile(id_freq, c(0, 0.05, 0.25, 0.5, 0.75, 0.95, 1))
cat(
rep(" ", 20),
"\t",
paste(id_freq_quant, collapse = "\t"), "\n",
sep = ""
)
# Participation pattern by time
# Get N IDs per time point, calculate pct of total IDs and rank
id_time_freq <- table(x[[id_name]], x[[time_name]])
# Insert columns of 0s for missing time points
# id_time_freq <- cbind(id_time_freq, matrix(0, nrow = nrow(id_time_freq), ncol = span - ncol(id_time_freq)))
# Add column names for missing time points
missing_time_points <- setdiff(seq(time_min, time_max, by = delta), time_us)
missing <- matrix(0, nrow = nrow(id_time_freq), ncol = length(missing_time_points))
colnames(missing) <- missing_time_points
id_time_freq <- cbind(id_time_freq, missing)
# Re order columns by name
id_time_freq <- id_time_freq[, order(as.numeric(colnames(id_time_freq)))]
# Convert to pattern matrix by pasting all columns by row
# id_time_freq_char <- as.matrix(id_time_freq)
# id_time_freq_char[id_time_freq_char == 0] <- "."
id_time_freq_char <- matrix(as.character(id_time_freq), nrow = nrow(id_time_freq))
id_time_freq_char[id_time_freq_char == "0"] <- "."
id_time_freq_pattern <- apply(id_time_freq_char, 1, paste, collapse = "")
id_time_freq_pattern_freq <- table(id_time_freq_pattern)
id_time_freq_pattern_sorted <- sort(id_time_freq_pattern_freq, decreasing = TRUE)
# Make data.frame with Frequency, Percent, Cumulative Percent of top n_patterns and rest
pattern_summary <- data.frame(
`Freq.` = as.numeric(id_time_freq_pattern_sorted)[seq_len(n_patterns)]
)
pattern_summary$Percent <- round((pattern_summary$`Freq.` / n_ids) * 100, digits = 2)
pattern_summary$`Cum.` <- cumsum(pattern_summary$Percent)
pattern_summary$Pattern <- names(id_time_freq_pattern_sorted)[seq_len(n_patterns)]
# Add Freq, Percent, Cumulative Percent of rest
pattern_summary <- rbind(
pattern_summary,
data.frame(
`Freq.` = sum(id_time_freq_pattern_sorted[-seq_len(n_patterns)]),
Percent = round((sum(id_time_freq_pattern_sorted[-seq_len(n_patterns)]) / n_ids) * 100, digits = 2),
`Cum.` = "100.00",
Pattern = "(other patterns)"
)
)
# Missing pattern is X for time points with data and . for time points with no data
missing_pattern <- rep("X", ncol(id_time_freq))
missing_pattern[colSums(id_time_freq) == 0] <- "."
missing_pattern <- paste(missing_pattern, collapse = "")
# Add row with Total
pattern_summary <- rbind(
pattern_summary,
data.frame(
`Freq.` = n_ids,
Percent = "100.00",
`Cum.` = "",
Pattern = missing_pattern
)
)
print(pattern_summary, row.names = FALSE)
} # xtdescribe
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.