Nothing
#' Convert Two-Column Genetic Data to Long Format
#'
#' Takes a data frame consisting of metadata followed by paired columns of genetic data,
#' with each column in the pair representing a gene copy at a locus.
#' Returns a list of two data frames: one with genetic data condensed into one column,
#' and the other with two-column structure intact, but with cleaned allele names.
#'
#' @keywords internal
#'
#' @param D A data frame containing two-column genetic data, optionally preceded by metadata.
#' The header of the first genetic data column in each pair lists the locus name, the second is ignored.
#' \strong{Locus names must not have spaces in them!}
#' @param gen_start_col The index (number) of the column in which genetic data starts.
#' Columns must be only genetic data after genetic data starts.
#'
#' @return \code{tcf2long} returns a list of two data frames: in the first, "long", the rightmost
#' column is the genetic data. Two new columns, "locus" and "gene copy", duplicate the original
#' column name provided in the first of each pair, and designate copies "a" and "b", respectively.
#' Metadata is duplicated as necessary for each locus. The second, "clean_short", replicates the
#' input dataset, but with column names replaced by "(locus name) a" and "(locus name) b" in each pair.
#' In other words the locus name has an "a" or a "b" added to it \emph{after a space}.
#'
#'
#' @examples
#' ## Convert the alewife dataset for further processing
#' # the data frame passed into this function must have had
#' # character collections and repunits converted to factors
#' reference <- alewife
#' reference$repunit <- factor(reference$repunit, levels = unique(reference$repunit))
#' reference$collection <- factor(reference$collection, levels = unique(reference$collection))
#' ale_long <- tcf2long(reference, 17)
#' @export
tcf2long <- function(D, gen_start_col) {
n = ncol(D)
loc1 = seq(gen_start_col, n, by = 2)
loc2 = seq(gen_start_col + 1, n, by = 2)
dnam <- names(D)
# checking requirements for gather: even number of loci, no duplicated or empty names, no spaces in names
if( (n - gen_start_col) %% 2 == 0) stop("Odd number of locus columns. Check gen_start_col")
nt <- table(dnam[loc1])
nt <- nt[nt > 1]
if(length(nt)>0) stop("Duplicated locus names: ", paste(names(nt), collapse = ", "))
if(any(dnam=="") || any(is.na(dnam))) stop("Unnamed locus in genetic data")
ns <- stringr::str_detect(dnam[loc1], " " )
if(length(nt)>0) stop("Duplicated locus names: ", paste(names(nt), collapse = ", "))
if( sum(ns) > 0) stop("Locus names cannot contain spaces: ",paste("\"", dnam[loc1][ns], "\"", sep = "", collapse = ", "))
# here is a different way to do it that should be a little bit faster.
num_loci <- length(loc1)
num_short <- nrow(D)
long <- tibble::tibble(
sample_type = rep(D$sample_type, times = num_loci * 2),
repunit = rep(D$repunit, times = num_loci * 2),
collection = rep(D$collection, times = num_loci * 2),
indiv = rep(D$indiv, times = num_loci * 2),
locus = rep(dnam[loc1], each = num_short * 2),
gene_copy = rep(c(rep("a", num_short), rep("b", num_short)), times = num_loci),
allele = as.vector(as.matrix(D[, -(1:(gen_start_col - 1))]))
)
# renaming genetic column data
names(D)[loc2] <- paste(names(D)[loc1], "b", sep = " ")
names(D)[loc1] <- paste(names(D)[loc1], "a", sep = " ")
# long <- tidyr::gather_(data = D, key_col = "Locus", value_col = "allele", gather_cols = names(D)[gen_start_col:n]) %>%
# tidyr::separate(col = Locus, into = c("locus","gene_copy"), sep = " ")
list(long = long, clean_short = D)
}
#' Tabulate occurrences of all observed alleles in reference genetic data
#'
#' Takes the first output of \code{tcf2long}, along with two columns named "collection" and "sample_type",
#' and returns a data frame of allele counts for each locus within each reference population.
#' Alleles to be counted are identified from both reference and mixture populations.
#'
#' The "collection" column should be a key assigning samples to the desired groups,
#' e.g. collection site, run time, year.
#' The "sample_type" column must contain either "reference" or "mixture" for each sample.
#'
#' @keywords internal
#'
#' @param D A data frame containing, at minimum, a column of sample group identifiers named
#' "collection", a column designating each row as "reference" or "mixture", named "sample_type",
#' and (from tcf2long output) locus, gene copy, and observed alleles. If higher-level reporting
#' unit counts are desired, must have a column of reporting unit identifiers named "repunit"
#' @param pop_level a character vector expressing the population level for which allele counts
#' should be tabulated. Set to "collection" for collection/underlying sample group (default),
#' or "repunit" for reporting unit/overlying sample groups
#'
#' @return \code{reference_allele_counts} returns a long-format dataframe, with count data for
#' each collection, locus, and allele. Counts are only drawn from "reference" samples; alleles
#' unique to the "mixture" samples will still appear in the list, but will have 0s for all groups.
#'
#' @examples
#' ## count alleles in alewife reference populations
#' example(tcf2long) # gets variable ale_long
#' ale_rac <- reference_allele_counts(ale_long$long)
#'
#'@export
reference_allele_counts <- function(D, pop_level = "collection") {
# gathering the names of the gene loci and alleles across
# all reference and mixture collections
if(pop_level == "collection") {
if( any(names(D) == "collection") == F) stop("no 'collection' column given")
} else if(pop_level == "repunit") {
if( any(names(D) == "repunit") == F) stop("no 'repunit' column given")
} else stop("invalid selection for pop_level")
if( any(names(D) == "sample_type") == F) stop("no 'sample_type' column given")
if( any(is.na(D$collection)) ) stop("collection column may not contain NAs")
if( any(is.na(D$sample_type)) ) stop("Sample_type column may not contain NAs")
if(length(setdiff( unique(D$sample_type), c("reference","mixture"))) != 0) {
warning("sample_type should not hold values other than 'reference' and 'mixture';
other values ignored in allele counting")
}
D$collection <- factor(D$collection, levels = unique(D$collection))
D$repunit <- factor(D$repunit, levels = unique(D$repunit))
# generate data frame of alleles for each locus
loc_alle_names <- D %>%
dplyr::group_by(locus, allele) %>%
dplyr::tally() %>%
dplyr::ungroup() %>%
dplyr::filter(!is.na(allele)) %>%
dplyr::select(-n)
if(pop_level == "collection") {
# gathering the observed counts of each allele in each reference population
ref_col_alle <- D %>%
dplyr::filter(sample_type == "reference") %>%
dplyr::group_by(collection, locus, allele) %>%
dplyr::tally() %>%
dplyr::ungroup() %>%
dplyr::filter(!is.na(allele)) %>%
dplyr::rename(counts = n)
# creating combined table of all alleles & loci in each population,
# whether in reference samples or not
lefties <- lapply(unique(ref_col_alle$collection), function(x) cbind(collection = x, loc_alle_names, stringsAsFactors = FALSE)) %>%
dplyr::bind_rows()
# counts of each allele pair
pop_tab <- suppressMessages(dplyr::left_join(lefties, ref_col_alle)) %>%
dplyr::select(collection, dplyr::everything()) %>%
dplyr::arrange(collection, locus, allele)
pop_tab$counts[is.na(pop_tab$counts)] <- 0
pop_tab
} else {
# gathering the observed counts of each allele in each reference population
ref_col_alle <- D %>%
dplyr::filter(sample_type == "reference") %>%
dplyr::group_by(repunit, locus, allele) %>%
dplyr::tally() %>%
dplyr::ungroup() %>%
dplyr::filter(!is.na(allele)) %>%
dplyr::rename(counts = n)
# creating combined table of all alleles & loci in each population,
# whether in reference samples or not
lefties <- lapply(unique(ref_col_alle$repunit), function(x) cbind(repunit = x, loc_alle_names, stringsAsFactors = FALSE)) %>%
dplyr::bind_rows()
# counts of each allele paired
pop_tab <- suppressMessages(dplyr::left_join(lefties, ref_col_alle)) %>%
dplyr::select(repunit, dplyr::everything()) %>%
dplyr::arrange(repunit, locus, allele)
pop_tab$counts[is.na(pop_tab$counts)] <- 0
pop_tab
}
}
#' Convert data frame of allele frequencies to nested lists
#'
#' List-izes the output of \code{reference_allele_counts} into a usable format for \code{allelic_list}
#'
#' @keywords internal
#'
#' @param D the long-format dataframe of counts by collection, locus, and allele,
#' output by \code{reference_allele_counts}, to be made into a nested list
#'
#' @return \code{a_freq_list} returns a list named by loci, each element of which is a matrix
#' containing that locus's allele count data. Rows in the matrix mark alleles, and columns collections
#'
#' @examples
#'
#' # Generate a list of individual genotypes by allele from
#' # the alewife data's reference allele count tables
#' example(reference_allele_counts)
#' ale_ac <- a_freq_list(ale_rac)
#' @export
a_freq_list <- function(D, pop_level = "collection") {
if(pop_level == "collection"){
tmp <- D %>%
dplyr::arrange(locus,collection,allele)
split(tmp, f = tmp$locus) %>% # creates the list
lapply(function(x) { # names the rows by allele, and each column by collection
nalle <- length(unique(x$allele))
ret <- matrix(x$counts, nrow = nalle)
dimnames(ret) <- list(allele = unique(x$allele), pop = unique(x$collection))
ret
})
} else {
tmp <- D %>%
dplyr::arrange(locus,repunit,allele)
split(tmp, f = tmp$locus) %>% # creates the list
lapply(function(x) { # names the rows by allele, and each column by collection
nalle <- length(unique(x$allele))
ret <- matrix(x$counts, nrow = nalle)
dimnames(ret) <- list(allele = unique(x$allele), pop = unique(x$repunit))
ret
})
}
}
#' Create genotype lists for each locus
#'
#' Uses the allele counts from \code{a_freq_list} and the cleaned short-format output of
#' \code{tcf2long} to generate a nested list of individual genotypes for each locus
#'
#' @keywords internal
#'
#' @param cs a clean short genetic data matrix; the second element of the
#' output from \code{tcf2long}. Must have a column of individual identifiers, named "indiv"
#' @param ac allele counts from a_freq_list
#' @param samp_type choose which sample types of individuals to include in output:
#' "mixture", "both", or "reference"
#'
#' @return \code{allelic_list} returns a two-component nested list, with data stored as character
#' names of alleles ($chr) or as integer indices for the alleles ($int). Both forms contain lists
#' representing to loci, with two component vectors corresponding to gene copies a and b.
#'
#' @examples
#' example(a_freq_list)
#' ale_cs <- ale_long$clean_short
#' # Get the vectors of gene copies a and b for all loci in integer index form
#' ale_alle_list <- allelic_list(ale_cs, ale_ac)$int
#'
#' @export
allelic_list <- function(cs, ac, samp_type = "both") {
#Choosing a sample type causes the data set to be filtered by this type during allele list retrieval
if(samp_type != "both") {
cs <- cs %>%
dplyr::filter(sample_type == samp_type)
}
# first, pull the cleaned first alleles for each locus into a list of character vectors
all_a <- lapply(paste(names(ac), "a"), function(x) {
ret <- as.character(cs[[x]])
names(ret) <- cs$indiv
ret
})
names(all_a) <- names(ac)
# repeat with second gene copy
all_b <- lapply(paste(names(ac), "b"), function(x) {
ret <- as.character(cs[[x]])
names(ret) <- cs$indiv
ret
})
names(all_b) <- names(ac)
# combine into list of locus lists, each of 2 character vectors, a and b
ret <- lapply(names(ac), function(n) list(a = all_a[[n]], b = all_b[[n]]))
names(ret) <- names(ac)
#convert to factor, then to integers, while preserving sample IDs as names
iret <- lapply(names(ac), function(n) {
fa <- ret[[n]]$a
fb <- ret[[n]]$b
levs <- rownames(ac[[n]])
a <- as.integer(factor(fa, levels = levs))
b <- as.integer(factor(fb, levels = levs))
names(a) <- names(fa)
names(b) <- names(fb)
list(a = a, b = b)
})
names(iret) <- names(ac)
list(chr = ret, int = iret)
}
#' Collect essential data values before mixture proportion estimation
#'
#' Takes all relevant information created in previous steps of data conversion pipeline,
#' and combines into a single list which serves as input for further calculations
#'
#' Genotypes represented in \code{I_list} are converted into a single long vector,
#' ordered by locus, individual, and gene copy, with \code{NA} values represented as 0s.
#' Similarly, \code{AC_list} is unlisted to \code{AC}, ordered by locus, collection,
#' and allele. \code{DP} is a list of Dirichlet priors for likelihood calculations, created
#' by adding the values calculated from \code{alle_freq_prior} to each allele
#' \code{sum_AC} and \code{sum_DP} are the summed allele values for each locus
#' of their parent vectors, ordered by locus and collection.
#'
#' @keywords internal
#'
#' @param AC_list a list of allele count matrices; output from \code{a_freq_list}
#' @param I_list a list of genotype vectors; output from \code{allelic_list}
#' @param PO a vector of collection (population of origin) indices
#' for every individual in the sample, in order identical to \code{I_list}
#' @param coll_N a vector of the total number of individuals in each collection,
#' in order of appearance in the dataset
#' @param RU_vec a vector of collection indices, sorted by reporting unit
#' @param RU_starts a vector of indices, designating the first collection for each
#' reporting unit in RU_vec
#' @param alle_freq_prior a one-element named list specifying the prior to be used when
#' generating Dirichlet parameters for genotype likelihood calculations. The name of the
#' list item determines the type of prior used, with options \code{"const"}, \code{"scaled_const"},
#' and \code{"empirical"}. If \code{"const"}, the listed number will be taken as a constant
#' added to the count for each allele, locus, and collection.
#' If \code{"scaled_const"}, the listed number will be divided by the number of alleles at a locus,
#' then added to the allele counts. If \code{"empirical"}, the listed number will be multiplied
#' by the relative frequency of each allele across all populations, then added to the allele counts.
#'
#' @return \code{list_diploid_params} returns a list of the information necessary
#' for the calculation of genotype likelihoods in MCMC:
#'
#' \code{L}, \code{N}, and \code{C} represent the number of loci, individual genotypes,
#' and collections, respectively. \code{A} is a vector of the number of alleles at each
#' locus, and \code{CA} is the cumulative sum of \code{A}. \code{coll}, \code{coll_N},
#' \code{RU_vec}, and \code{RU_starts} are copied directly from input.
#'
#' \code{I}, \code{AC}, \code{sum_AC}, \code{DP}, and \code{sum_DP} are vectorized
#' versions of data previously represented as lists and matrices; indexing macros
#' use \code{L}, \code{N}, \code{C}, \code{A}, and \code{CA} to access these vectors
#' in later Rcpp-based calculations.
#'
#' @examples
#' example(allelic_list)
#' PO <- as.integer(factor(ale_long$clean_short$collection))
#' coll_N <- as.vector(table(PO))
#'
#' Colls_by_RU <- dplyr::count(ale_long$clean_short, repunit, collection) %>%
#' dplyr::filter(n > 0) %>%
#' dplyr::select(-n)
#' PC <- rep(0, length(unique((Colls_by_RU$repunit))))
#' for(i in 1:nrow(Colls_by_RU)) {
#' PC[Colls_by_RU$repunit[i]] <- PC[Colls_by_RU$repunit[i]] + 1
#' }
#' RU_starts <- c(0, cumsum(PC))
#' RU_vec <- as.integer(Colls_by_RU$collection)
#' param_list <- list_diploid_params(ale_ac, ale_alle_list, PO, coll_N, RU_vec, RU_starts)
#'
#' @export
list_diploid_params <- function(AC_list, I_list, PO, coll_N, RU_vec, RU_starts,
alle_freq_prior = list("const_scaled" = 1)) {
#check for a valid option in computing allele frequency priors
if (!(names(alle_freq_prior)[1] %in% c("const_scaled", "const", "empirical"))) stop("Choice ", names(alle_freq_prior)[1], " unknown for allele frequency prior.")
# get number of alleles at each locus
as <- lapply(AC_list, nrow)
loc_cycle <- names(AC_list)
if(names(alle_freq_prior[1]) == "const_scaled") {
DP_list <- lapply(loc_cycle, function(x) AC_list[[x]] + alle_freq_prior[[1]]/as[[x]])
}
if(names(alle_freq_prior[1]) == "const") {
DP_list <- lapply(loc_cycle, function(x) AC_list[[x]] + alle_freq_prior[[1]])
}
if(names(alle_freq_prior[1]) == "empirical") {
DP_list <- lapply(AC_list, function(x){
rel_freq <- rowSums(x)/sum(x)
x + (rel_freq * alle_freq_prior[[1]])
})
}
list(L = length(AC_list),
N = length(I_list[[1]]$a),
C = ncol(AC_list[[1]]),
A = unname(sapply(AC_list, nrow)),
CA = unname(c(0, cumsum(sapply(AC_list, nrow)))),
coll = PO,
coll_N = coll_N,
RU_vec = RU_vec,
RU_starts = RU_starts,
I = unname(unlist(lapply(I_list, function(l) {
x <- t(cbind(l$a, l$b))
x[is.na(x)] <- 0
x
}))),
AC = unname(unlist(AC_list)),
sum_AC = lapply(AC_list, colSums) %>%
unlist(),
DP = unlist(DP_list),
sum_DP = lapply(DP_list,colSums) %>%
unlist())
}
#' Generate MCMC parameter list from two-column genetic data & print summary
#'
#' This function is a wrapper for all steps to create the parameter list necessary
#' for genotype log-likelihood calculation from the starting two-column genetic data
#'
#' In order for all steps in conversion to be carried out successfully, the dataset
#' must have "repunit", "collection", "indiv", and "sample_type" columns preceding
#' two-column genetic data. If \code{summ == TRUE}, the function prints summary statistics
#' describing the structure of the dataset, as well as the presence of missing data,
#' enabling verification of proper data conversion.
#'
#' @param D A data frame containing two-column genetic data, preceded by metadata.
#' The header of the first genetic data column in each pair lists the locus name,
#' the second is ignored. \strong{Locus names must not have spaces in them!}
#' Required metadata includes a column of unique individual identifiers named "indiv",
#' a column named "collection" designating the sample groups, a column "repunit"
#' designating the reporting unit of origin of each fish, and a "sample_type" column
#' denoting each individual as a "reference" or "mixture" sample. \emph{No NAs should be
#' present in metadata}
#' @param gen_start_col The index (number) of the column in which genetic data starts.
#' Columns must be only genetic data after genetic data starts.
#' @param samp_type the sample groups to be include in the individual genotype list,
#' whose likelihoods will be used in MCMC. Options "reference", "mixture", and "both"
#' @param alle_freq_prior a one-element named list specifying the prior to be used when
#' generating Dirichlet parameters for genotype likelihood calculations. Valid methods
#' include \code{"const"}, \code{"scaled_const"}, and \code{"empirical"}. See
#' \code{?list_diploid_params} for method details.
#' @param summ logical indicating whether summary descriptions of the formatted data be provided
#' @param ploidies a named vector of ploidies (1 or 2) for each locus. The names must the the locus names.
#' @return \code{tcf2param_list} returns the output of \code{list_diploid_params},
#' after the original dataset is converted to a usable format and all relevant values
#' are extracted. See \code{?list_diploid_params} for details
#'
#' @keywords internal
#' @examples
#' # after adding support for haploid markers we need to pass
#' # in the ploidies vector. These markers are all diploid...
#' locnames <- names(alewife)[-(1:16)][c(TRUE, FALSE)]
#' ploidies <- rep(2, length(locnames))
#' names(ploidies) <- locnames
#' ale_par_list <- tcf2param_list(alewife, 17, ploidies = ploidies)
#'
#' @export
tcf2param_list <- function(D, gen_start_col, samp_type = "both",
alle_freq_prior = list("const_scaled" = 1), summ = T, ploidies){
# coerce collection and repunit to be factors. This is important since things get turned
# into integers
D$collection <- factor(D$collection, levels = unique(D$collection))
D$repunit <- factor(D$repunit, levels = unique(D$repunit))
cleaned <- tcf2long(D, gen_start_col)
AC_list <- reference_allele_counts(cleaned$long) %>%
a_freq_list()
I_list <- allelic_list(cleaned$clean_short, AC_list, samp_type = samp_type)$int
#PO <- as.integer(factor(cleaned$clean_short$collection))
if(samp_type == "both") {
PO <- as.integer(cleaned$clean_short$collection)
# PO <- dplyr::select(cleaned$clean_short, collection) %>%
# simplify2array() %>%
# factor(., levels = unique(D$collection)) %>%
# as.integer()
coll_N <- unname(table(cleaned$clean_short$collection))
# coll_N <- dplyr::count(cleaned$clean_short, collection) %>%
# dplyr::select(n)
# coll_N <- unname(unlist(coll_N))
} else {
PO <- dplyr::filter(cleaned$clean_short, sample_type == samp_type) %>%
dplyr::select(collection) %>%
simplify2array() %>%
factor(., levels = unique(D$collection)) %>%
as.integer()
coll_N <- dplyr::filter(cleaned$clean_short, sample_type == samp_type) %>%
dplyr::count(collection) %>%
dplyr::select(n)
coll_N <- unname(unlist(coll_N))
}
### ONLY designed for factorized repunit and collection
Colls_by_RU <- dplyr::count(cleaned$clean_short[, c("repunit", "collection")], repunit, collection) %>%
dplyr::filter(n > 0) %>%
dplyr::select(-n)
# Colls_by_RU <- dplyr::count(cleaned$clean_short, repunit, collection) %>%
# dplyr::select(-n)
PC <- rep(0, length(unique((Colls_by_RU$repunit))))
for(i in 1:nrow(Colls_by_RU)) {
PC[Colls_by_RU$repunit[i]] <- PC[Colls_by_RU$repunit[i]] + 1
}
RU_starts <- c(0, cumsum(PC))
names(RU_starts) <- as.character(unique(Colls_by_RU$repunit))
RU_vec <- as.integer(Colls_by_RU$collection)
names(RU_vec) <- as.character(Colls_by_RU$collection)
params <- list_diploid_params(AC_list, I_list, PO, coll_N, RU_vec, RU_starts, alle_freq_prior)
RU_list <- unique(cleaned$clean_short$repunit)
# here, the names of the indivs, collections, and repunits in the order in which they
# appear as integers in this data structure. We should have done this on day one, but
# we didn't which is why there are so many mysterious ways that have been used to get the
# names of the collections or repunits. At some point, we will have to go through
# and standardize all those different name accesses. But not now...
params$indiv_names <- D$indiv
params$collection_names <- levels(D$collection)
params$repunit_names <- levels(D$repunit)
params$locus_names <- names(AC_list)
params$ploidies <- as.integer(unname(ploidies[params$locus_names]))
# to compute the percent missing from the long vector I in params, is a little
# trickier when we have haploid markers in there, but not much. Before we just
# added everything up, but now we will focus only on the first gene copy in each
# pair, since we have enforced that that one will be missing if the locus is missing,
# whether it is haploid or not...s
mask <- rep(c(TRUE, FALSE), length.out = length(params$I))
percent.missing <- sum(params$I[mask] == 0)/sum(mask) * 100
if(summ == T){
cat(paste('Summary Statistics:',
paste(params$N, 'Individuals in Sample'),
paste(paste(params$L, 'Loci:'), paste(names(AC_list), collapse = ", ")),
paste(paste(paste(length(RU_list), 'Reporting Units:'),
paste(levels(RU_list), collapse = ", "))),
paste(paste(paste(params$C, 'Collections:'),
paste(colnames(AC_list[[1]]), collapse = ", "))),
paste(sprintf("%.2f", percent.missing), '% of allelic data identified as missing\n', sep = ""),
sep = '\n\n'))
}
params
}
#' Get the average within-RU assignment rate for each collection
#'
#' This function takes a matrix of scaled genotype likelihoods for a group of
#' individuals of known origin, and calculates the average rate at which individuals
#' in a particular collection are assigned to the correct reporting unit.
#'
#' The average rate of correct within-reporting unit assignment is proportional to
#' reporting-unit-level bias in the posterior probability for this collection;
#' if the correct assignment rate is high relative to other collections,
#' it will be upwardly biased, and vice versa. The inverse of this vector is used
#' to scale Dirichlet draws of \code{omega} during misassignment-scaled MCMC.
#' @param SL a scaled likelihood matrix; each column should sum to one, and represent
#' the probability of assignments to each collection (row) for a particular individual
#' @param coll a vector of the collection of origin indices of the individuals (length = \code{ncol(SL)})
#' @param RU_starts a vector delineating starting indices of different reporting units in RU_vec
#' @param RU_vec a vector of collection indices, organized by reporting unit
#'
#' @return \code{avg_coll2correctRU} returns a vector of length \code{nrow(SL)}, where
#' each element represents the average proportion of fish from the corresponding collection
#' which are correctly assigned to the proper collection, or misassigned to another
#' collection within the same reporting unit. This is distinct from the rate of
#' correct assignment at the collection level, which is too low and variable to serve
#' as a stable metric for \code{omega} scaling.
#' @examples
#' locnames <- names(alewife)[-(1:16)][c(TRUE, FALSE)]
#' ploidies <- rep(2, length(locnames))
#' names(ploidies) <- locnames
#'
#' params <- tcf2param_list(alewife, 17, ploidies = ploidies)
#' SL <- geno_logL(params) %>% exp() %>% apply(2, function(x) x/sum(x))
#' correct <- avg_coll2correctRU(SL, params$coll, params$RU_starts, params$RU_vec)
#' @export
#' @keywords internal
avg_coll2correctRU <- function(SL, coll, RU_starts, RU_vec) {
ru_SL <- lapply(1:(length(RU_starts)-1), function(ru){
apply(rbind(SL[RU_vec[(RU_starts[ru]+1):RU_starts[ru + 1]], ]), 2, sum)
}) %>%
do.call("rbind", .)
avg_SL <- lapply(1:nrow(SL), function(x) as.array(ru_SL[, (coll == x)])) %>%
lapply(function(x) apply(x,MARGIN = 1, FUN = mean)) %>%
simplify2array()
correct <- lapply(1:(length(RU_starts)-1), function(ru) {
ru_colls <- RU_vec[(RU_starts[ru]+1):RU_starts[ru+1]]
out <- avg_SL[ru, ru_colls]
}) %>%
unlist()
correct
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.