R/parent.assign.fun.R

Defines functions bar.plot.fun beta.min.ss.fun beta.fun X.mat.fun fkj.and.weight.fun ls.fun most.like.parents.duos.mismatches.fun duos.mismatches.fun mismatches.fun dkj.from.gkj.fun Dij.from.Gij.fun prelim.ml.discrete.geno.probs.fun dkj.from.snp.dat.pools.fun Dij.from.snp.dat.fun prelim.ml.discrete.assigned.genos.fun plot.lod.vs.LogL.range.fun most.like.parents.duo.fun parents.to.pools.lod.fun ml.fun adj.geno.prob.fun prelim.ml.quant.fun snp.error.fun tcj.fun nlj.fun rho.inv.fun lambda.ij.fun parent.combns.fun flj.from.snp.dat.fun flj.from.parent.Gij.fun Gij.fun phi.ij.fun genotypes.fun snp.param.pools.fun parent.assign.fun

Documented in parent.assign.fun snp.param.pools.fun

# June 2020
# Matthew Hamilton

#' parent.assign.fun
#' 
#' This function assigns parents to pooled samples using one of seven approaches:
#' \itemize{
#'  \item{'Least squares' method outlined in Henshall et al. 2014 (method = "Least_squares")} 
#'  \item{'Least squares minimum sum of squares' method whereby the sum of squares of all parental combinations are computed and 
#'   the combination with the minimum value identified.  Refer to Hamilton 2020 (method = "Least_squares", beta.min.ss = TRUE)} 
#'  \item{'Quantitative maximum likelihood' method whereby the approach using quantititive genotypes for parentatge assignment
#'  outlined in Henshall et al. 2014 is extended to pooled DNA samples.  Refer to Hamilton 2020 (method = "Quantitative")}
#'  \item{'Discrete maximum likelihood from genotype probabilities' method whereby discrete genotypes are derived from genotype probabilities and  
#'  used for parentatge assignment of pooled DNA samples by extending the discrete genotype maximum likelihood approach outlined in 
#'  Henshall et al. 2014 to pooled DNA samples.  Refer to Hamilton 2020 (method = "Discrete", discrete.method = "geno.probs").}
#'  \item{'Discrete maximum likelihood from genotype assignments' method whereby discrete genotypes are provided as input and  
#'  used for parentatge assignment of pooled DNA samples by extending the discrete genotype maximum likelihood approach outlined in 
#'  Henshall et al. 2014 to pooled DNA samples.  Refer to Hamilton 2020 (method = "Discrete", discrete.method = "assigned.genos")}
#'  \item{'Exclusion from genotype probabilities' method whereby discrete genotypes are derived from genotype probabilities and  
#'  used for parentatge assignment of pooled DNA samples by extending the exclusion approach outlined in 
#'  Henshall et al. 2014 to pooled DNA samples  Refer to Hamilton 2020 (method = "Exclusion", discrete.method = "geno.probs").}
#'  \item{'Exclusion from genotype assignments' method whereby discrete genotypes are provided as input and  
#'  used for parentatge assignment of pooled DNA samples by extending the exclusion approach outlined in 
#'  Henshall et al. 2014 to pooled DNA samples.  Refer to Hamilton 2020 (method = "Exclusion", discrete.method = "assigned.genos")}
#' }   
#' @param method is a vector of methods to be implemented (e.g. c("Quantitative", "Discrete", "Exclusion", "Least_squares"))
#' @param beta.min.ss is a logical variable appicable to least_squares method only (default = FALSE).
#' If TRUE, the sum of squares of all parental combinations are computed and the combination with the minimum value is identified.  
#' Refer to Hamilton 2020. 
#' @param discrete.method is a character variable applicable to the "Discrete" or "Exclusion" methods only
#' (default = "geno.probs").  It must equal either:
#' \itemize{
#'  \item{"geno.probs" in which case discrete genotypes for parents and pools are derived from genotype probabilities.}  
#'  \item{"assigned.genos" in which case discrete genotypes for parents and pools are obtained directly from the snp.dat.indiv and snp.dat.pools inputs.}
#'  }
#' @param threshold.indiv is a numeric variable between 0 and 1 inclusive applicable to the "Discrete" or "Exclusion" methods only
#' when discrete.method = "geno.probs" (default = NULL).  A discrete genotype is assigned to the the most likely genotype in 
#' the quantitative ordered genotype probability matrix Gij if it is greater than threshold.indiv (or
#' threshold.indiv / 2 for the two heterozygous genotypes).  Otherwise the genotype is deemed missing (refer to the left hand side of 
#' page 5 of Henshall et al. 2014)
#' @param threshold.pools is a numeric variable between 0 and 1 inclusive applicable to the "Discrete" or "Exclusion" methods only
#' when discrete.method = "geno.probs" (default = NULL).  Equivalent to threshold.indiv for pooled DNA samples.
#' @param snp.dat.indiv is a data frame with the following headings (class in parentheses):
#' \itemize{
#'  \item{'SAMPLE_ID' is the sample identifier.  Samples must be from diploid individuals (i.e. not pools) (integer).}
#'  \item{'SNP_ID' is the SNP identifier (character).} 
#'  \item{'INTENSITY_A' is the area/intensity for allele A (numeric).}
#'  \item{'INTENSITY_B' is the area/intensity for allele B (numeric).} 
#'  \item{'A_ALLELE' is the base represented by allele A (i.e. 'A', 'C', 'G' or 'T') (character).}
#'  \item{'B_ALLELE' is the base represented by allele B (i.e. 'A', 'C', 'G' or 'T') (character).}
#'  \item{'GENOTYPE' is the SNP genotype call (e.g. 'AT', 'TT').  NA if missing (character).}
#' } 
#' @param snp.dat.pools is a data frame with the following headings.  Note that all pooled DNA samples 
#' in this dataframe must be comprised of DNA from the same number of individuals (see n.in.pools) (class in parentheses):
#'  \itemize{
#'  \item{SAMPLE_ID is the pooled sample identifier (integer).}
#'  \item{SNP_ID is the SNP identifier (character).}
#'  \item{INTENSITY_A is the signal intensity for allele A. Not required if method does not include 'Quantitative' or 'Least_squares' and discrete.method  = "geno.probs" (numeric).}
#'  \item{INTENSITY_B is the signal intensity for allele B. Not required if method does not include 'Quantitative' or 'Least_squares' and discrete.method  = "geno.probs" (numeric).}
#'  \item{GENOTYPE is the assigned unordered genotype. Not required if discrete.method = "geno.probs" (character).}
#' } 
#' @param n.in.pools is an integer variable representing the number of individual that contributed DNA to each sample in snp.dat.pools  
#' @param min.intensity is a numeric variable (default = 0).  If the square root of the sum of INTENSITY_A squared and 
#' INTENSITY_B squared in snp.dat.indiv or snp.dat.pools is less than min.intensity then this record is excluded.
#' That is, observations that fall into an arc with a radius equal to min.intensity in the lower left of
#' signal intensity scatter plots are excluded.  
#' @param snp.error.assumed Must be one of (default = NULL):
#' \itemize{
#'  \item{NULL.  Note that if snp.error.assumed is NULL then snp.error.underlying must not be NULL.}
#'  \item{a numeric variable between 0 and 1, in which case the 'assumed error rate' (see Henshall et al 2014) is the same across all SNP.}
#'  \item{a data frame with columns SNP_ID and SNP_ERROR_TILDE (see Henshall et al 2014).}
#' } 
#' @param snp.error.underlying is not used if snp.error.assumed is not NULL (default = NULL). Must be either:
#' \itemize{
#'  \item{NULL.}
#'  \item{a numeric variable between 0 and 1 inclusive.  Used to comptute SNP_ERROR_TILDE from SNP_ERROR_HAT according
#'                      to the approach outlined on the left of page 5 of Henshall et al. 2014 using individual 
#'                      (i.e. not pooled) data only.  If snp.error.underlying = 0 then SNP_ERROR_TILDE = SNP_ERROR_HAT.}
#' } 
#' @param snp.param.indiv is the output of snp.param.indiv.fun (default = NULL).  That is, it is a data frame with the following 
#' headings (class in parentheses):
#' \itemize{
#'  \item{'SNP_ID' is the SNP identifier (character).} 
#'  \item{'N_AA' is the count of homozygous A (AA) genotypes (integer).}
#'  \item{'MEAN_P_AA' is the mean of allelic proportion for homozygous A genotypes (numeric).}
#'  \item{'SD_P_AA' is the standard deviation of allelic proportion for homozygous A genotypes (numeric).} 
#'  \item{'N_AB' is the count of heterozygous (AB) genotypes (integer).}
#'  \item{'MEAN_P_AB' is the mean of allelic proportion for heterozygous (AB) genotypes (numeric).}
#'  \item{'SD_P_AB' is the standard deviation of allelic proportion for heterozygous (AB) genotypes (numeric).}  
#'  \item{'N_BB' is the count of homozygous B (BB) genotypes (integer).}
#'  \item{'MEAN_P_BB' is the mean of allelic proportion for homozygous B genotypes (numeric).}
#'  \item{'SD_P_BB' is the standard deviation of allelic proportion for homozygous B genotypes (numeric).} 
#'  \item{'WELCH_A' is the welsh statistic for the interval between MEAN_P_AA and MEAN_P_AB (numeric).}
#'  \item{'WELCH_B' is the welsh statistic for the interval between MEAN_P_AB and MEAN_P_BB (numeric).}
#'  \item{'A_ALLELE_FREQ' is the A allele frequency computed from genotype counts (numeric).}
#'  \item{'B_ALLELE_FREQ' is the B allele frequency computed from genotype counts (numeric).}
#'  \item{'A_ALLELE' is the base represented by allele A (i.e. 'A', 'C', 'G' or 'T') (character).}
#'  \item{'B_ALLELE' is the base represented by allele B (i.e. 'A', 'C', 'G' or 'T') (character).}
#' } 
#' @param snp.param.pools is the output of snp.param.pools.fun.  That is, it is a data frame with the following headings (class in parentheses)
#' \itemize{
#'  \item{'SNP_ID' is the SNP identifier (character).} 
#'  \item{'MEAN_P_AAAA' is the mean of allelic proportion for homozygous A genotypes (numeric).}
#'  \item{'SD_P_AAAA' is the standard deviation of allelic proportion for homozygous A genotypes (numeric).} 
#'  \item{'MEAN_P_AAAB' is the mean of allelic proportion for unordered AAAB genotypes (numeric).}
#'  \item{'SD_P_AAAB' is the standard deviation of allelic proportion for unordered AAAB genotypes (numeric).} 
#'  \item{'MEAN_P_AABB' is the mean of allelic proportion for unordered AABB genotypes (numeric).}
#'  \item{'SD_P_AABB' is the standard deviation of allelic proportion for unordered AABB genotypes (numeric).}  
#'  \item{'MEAN_P_ABBB' is the mean of allelic proportion for unordered ABBB genotypes (numeric).}
#'  \item{'SD_P_ABBB' is the standard deviation of allelic proportion for unordered ABBB genotypes (numeric).}
#'  \item{'MEAN_P_BBBB' is the mean of allelic proportion for homozygous B genotypes (numeric).}
#'  \item{'SD_P_BBBB' is the standard deviation of allelic proportion for homozygous B genotypes (numeric).}   
#'  \item{'A_ALLELE' is the base represented by allele A (i.e. 'A', 'C', 'G' or 'T') (character).}
#'  \item{'B_ALLELE' is the base represented by allele B (i.e. 'A', 'C', 'G' or 'T') (character).}
#' } 
#' @param min.sd is a numberic variable defining a lower bound to be applied to estimates of the 
#' standard deviation of allelic proportion for genotypes in snp.param.indiv and snp.param.pools (default = 0)
#' @param fams is a data frame with the following headings (class in parentheses):
#' \itemize{
#'  \item{'FAMILY_ID' is the family identifier (integer).} 
#'  \item{'SIRE_ID' is the sire identifier (integer).} 
#'  \item{'DAM_ID' is the dam identifier (integer).} 
#' } 
#' @param fam.set.combns is a data frame with the following headings (class in parentheses). Note: if fam.set.combns = NULL
#' (see 'pooling by phenotype' example below), 
#' FAMILY_ID is taken from the 'fams' and duplicated n.in.pools times, FAM_SET_ID = 1 for the first duplication of 
#' FAMILY_IDs, 2 for the second etc and FAM_SET_COMBN_ID = 1 (default = NULL):
#' \itemize{
#'  \item{'FAM_SET_COMBN_ID' is the family set combination identifier (integer). A 'family set combination' is a combination of 'family sets'.  
#'  Each pooled sample must be associated with one only family set combination but a family set combination
#'  may be assoicated with multiple pooled samples using the fam.set.combns.by.pool input below.} 
#'  \item{'FAM_SET_ID' is the family set identifier (integer).  A 'family set' is a group of families of which one is known to be the true family 
#'  of one of the individuals in a pooled sample.  Within each 'family set combination' there must be a 'family set'
#'  for each individual in a pooled sample (i.e. if n.in.pools = 2 there must be two family sets in each family set combination)} 
#'  \item{'FAMILY_ID' is the family identifier (integer).} 
#' } 
#' @param fam.set.combns.by.pool is a data frame linking pooled samples with family set combinations.  
#' It has the following headings (class in parentheses).  Note: if fam.set.combns is NULL (see 'pooling by phenotype' example below),
#' fam.set.combns.by.pool is made NULL.
#' If fam.set.combns.by.pool = NULL, FAM_SET_COMBN_ID = 1 and SAMPLE_ID is taken from the 
#' 'snp.dat.pools' input (default = NULL):
#' \itemize{
#'  \item{SAMPLE_ID is the pooled sample identifier (integer).}
#'  \item{'FAM_SET_COMBN_ID' is the family set combination identifier (integer).} 
#' } 
#' @param skip.checks is a logical variable.  If FALSE parent.assign.fun data checks are not undertaken. 
#' @return  
#' \cr
#' \strong{Primary outputs}
#' @return \code{most.like.parents.quant}  
#' \cr \cr
#' Applicable when method = "Quantitative".  Identifies the most likely parental combination and delta LODs for 
#' individual parents.  Second most likely (alternative) parents are also presented.  Refer to Hamilton 2020.  Example fields for n.in.pools = 2:
#' \itemize{
#'  \item{SAMPLE_ID (integer).}
#'  \item{PARENT_COMBN_ID (integer).}
#'  \item{MISS_PARENT_SNP_DATA_PROP (numeric).}
#'  \item{MISS_POOL_SNP_DATA_PROP (numeric).}
#'  \item{NO_MISS_PARENT_OR_POOL_PROP (numeric).}
#'  \item{MIN_LOGL (numeric).}
#'  \item{MIN_LOGL_SNP (character).}
#'  \item{MAX_LOGL (numeric).}
#'  \item{MAX_LOGL_SNP (character).}
#'  \item{RANGE_5_TO_95_LOGL (numeric).}
#'  \item{LOD (numeric).}
#'  \item{FAM_SET_COMBN_ID (integer).}
#'  \item{PARENT_1 (integer).}
#'  \item{PARENT_2 (integer).}
#'  \item{PARENT_3 (integer).}
#'  \item{PARENT_4 (integer).}
#'  \item{FAM_COMBN_ID (integer).}
#'  \item{FAMILY_ID_1 (integer).}
#'  \item{FAMILY_ID_2 (integer).}
#'  \item{PARENT_1_DELTA_LOD (logical).}
#'  \item{PARENT_2_DELTA_LOD (logical).}
#'  \item{PARENT_3_DELTA_LOD (numeric).}
#'  \item{PARENT_4_DELTA_LOD (numeric).}
#'  \item{ALT_PARENT_1 (logical).}
#'  \item{ALT_PARENT_2 (logical).}
#'  \item{ALT_PARENT_3 (numeric).}
#'  \item{ALT_PARENT_4 (numeric).}
#'  \item{ALT_PARENT_COMBN_1 (logical).}
#'  \item{ALT_PARENT_COMBN_2 (logical).}
#'  \item{ALT_PARENT_COMBN_3 (integer).}
#'  \item{ALT_PARENT_COMBN_4 (integer).}
#'  \item{ALT_FAM_COMBN_1 (integer).}
#'  \item{ALT_FAM_COMBN_2 (integer).}
#'  \item{ALT_FAM_COMBN_3 (integer).}
#'  \item{ALT_FAM_COMBN_4 (integer).}
#' }
#' \cr
#' @return \code{most.like.parents.discrete}
#' \cr  \cr
#' Applicable when method = "Discrete".  Identifies the most likely parental 
#' combination and delta LODs for individual parents.  Second most likely (alternative) parents are also presented.  
#' Refer to Hamilton 2020.  Example fields for n.in.pools = 2:
#' \itemize{
#'  \item{SAMPLE_ID (integer).}
#'  \item{PARENT_COMBN_ID (integer).}
#'  \item{MISS_PARENT_SNP_DATA_PROP (numeric).}
#'  \item{MISS_POOL_SNP_DATA_PROP (numeric).}
#'  \item{NO_MISS_PARENT_OR_POOL_PROP (numeric).}
#'  \item{MIN_LOGL (numeric).}
#'  \item{MIN_LOGL_SNP (character).}
#'  \item{MAX_LOGL (numeric).}
#'  \item{MAX_LOGL_SNP (character).}
#'  \item{RANGE_5_TO_95_LOGL (numeric).}
#'  \item{LOD (numeric).}
#'  \item{FAM_SET_COMBN_ID (integer).}
#'  \item{PARENT_1 (integer).}
#'  \item{PARENT_2 (integer).}
#'  \item{PARENT_3 (integer).}
#'  \item{PARENT_4 (integer).}
#'  \item{FAM_COMBN_ID (integer).}
#'  \item{FAMILY_ID_1 (integer).}
#'  \item{FAMILY_ID_2 (integer).}
#'  \item{PARENT_1_DELTA_LOD (logical).}
#'  \item{PARENT_2_DELTA_LOD (logical).}
#'  \item{PARENT_3_DELTA_LOD (numeric).}
#'  \item{PARENT_4_DELTA_LOD (numeric).}
#'  \item{ALT_PARENT_1 (logical).}
#'  \item{ALT_PARENT_2 (logical).}
#'  \item{ALT_PARENT_3 (numeric).}
#'  \item{ALT_PARENT_4 (numeric).}
#'  \item{ALT_PARENT_COMBN_1 (logical).}
#'  \item{ALT_PARENT_COMBN_2 (logical).}
#'  \item{ALT_PARENT_COMBN_3 (integer).}
#'  \item{ALT_PARENT_COMBN_4 (integer).}
#'  \item{ALT_FAM_COMBN_1 (integer).}
#'  \item{ALT_FAM_COMBN_2 (integer).}
#'  \item{ALT_FAM_COMBN_3 (integer).}
#'  \item{ALT_FAM_COMBN_4 (integer).}
#' }
#' \cr
#' @return \code{most.like.parents.excl}  
#' \cr  \cr
#' Applicable when method = "Exclusion".  Identifies the most likely parental 
#' combination.  Refer to Hamilton 2020.  Second most likely (alternative) parental 
#' combination is also presented.  
#' Example fields for n.in.pools = 2:
#' \itemize{
#'  \item{SAMPLE_ID (integer).}
#'  \item{PARENT_COMBN_ID (integer).}
#'  \item{MISMATCHES (integer).}
#'  \item{SNP_COUNT (integer).}
#'  \item{MISMATCH_PROP (numeric).}
#'  \item{MISMATCH_PROP_SE (numeric).}
#'  \item{MISMATCH_PROP_Z (numeric).}
#'  \item{FAM_COMBN_ID (integer).}
#'  \item{FAMILY_ID_1 (integer).}
#'  \item{FAMILY_ID_2 (integer).}
#'  \item{PARENT_1 (integer).}
#'  \item{PARENT_2 (integer).}
#'  \item{PARENT_3 (integer).}
#'  \item{PARENT_4 (integer).}
#'  \item{ALT_PARENT_COMBN_ID (integer).}
#'  \item{ALT_FAM_COMBN_ID (integer).}
#'  \item{ALT_MISMATCHES (integer).}
#'  \item{ALT_SNP_COUNT (integer).}
#'  \item{ALT_MISMATCH_PROP (numeric).}
#'  \item{ALT_MISMATCH_PROP_SE (numeric).}
#'  \item{ALT_MISMATCH_PROP_Z (numeric).}
#' }
#' \cr
#' @return \code{most.like.parents.excl.non.dup}
#' \cr \cr
#'  Applicable when method = "Exclusion".  Identifies the most likely parental
#'  combination - simplified output with multiple combinations with the same number of mismatches (duplicated 
#'  SAMPLE_IDs) removed.  Refer to Hamilton 2020.  Example fields for n.in.pools = 2:
#' \itemize{
#'  \item{SAMPLE_ID (integer).}
#'  \item{MISMATCHES (integer).}
#'  \item{SNP_COUNT (integer).}
#'  \item{MISMATCH_PROP (numeric).}
#'  \item{MISMATCH_PROP_SE (numeric).}
#'  \item{MISMATCH_PROP_Z (numeric).}
#'  \item{FAMILY_ID_1 (integer).}
#'  \item{FAMILY_ID_2 (integer).}
#'  \item{PARENT_1 (integer).}
#'  \item{PARENT_2 (integer).}
#'  \item{PARENT_3 (integer).}
#'  \item{PARENT_4 (integer).}
#' }
#' \cr
#' @return \code{beta}
#' \cr \cr
#' Applicable when method = "Least_squares". Identifies the most likely parental combination:
#' \itemize{
#'  \item{SAMPLE_ID (integer).}
#'  \item{SIRE_ID (integer).}
#'  \item{DAM_ID (integer).}
#'  \item{FAMILY_ID (integer).}
#'  \item{BETA_STAR Refer to Henshall et al. 2014  (numeric).}
#'  \item{BETA_HAT Refer to Henshall et al. 2014 (numeric).}
#'  \item{BETA_HAT_CONSTRAINED  Constrained to have equal contributions from each FAMILY_SET_ID. Refer to Hamilton 2020}
#'  \item{BETA_MIN_SS Applicable when beta.min.ss = "TRUE". Constrained beta with minimum sum of squares is retained.  Beta constrained to have equal contributions from each FAMILY_SET_ID.  Refer to Hamilton 2020}
#' }
#' \cr
#' \strong{Primary plots}
#' @return \code{bar.png} 
#' \itemize{
#'  \item{Bar plot of BETA_HAT by FAMILY_ID.  Applicable to 'Least_squares' method only.  Output in a directory named 'Results' on the current working directory.} 
#' }
#' @return \code{discrete.png} 
#' \itemize{
#'  \item{Scatter plot of '5-95 percentile range of log-likelihood ratios' against 'Log odds (LOD) scores' for each 
#'  possible family combination.  Ideally there is an isolated point in the bottom right represtenting the 
#'  correct family combination.  Applicable to 'Discrete' method only.  Output in a directory named 
#'  'Results/lod.scatter' on the current working directory.} 
#' }
#' @return \code{quantitative.png} 
#' \itemize{
#'  \item{Scatter plot of '5-95 percentile range of log-likelihood ratios' against 'Log odds (LOD) scores' for each 
#'  possible family combination.  Ideally there is an isolated point in the bottom right represtenting the 
#'  correct family combination.  Applicable to 'Quantitative' method only.  Output in a directory named 
#'  'Results/lod.scatter' on the current working directory.} 
#' }
#' \cr
#' \strong{Intermediate outputs}
#' \cr
#' @return 'Dij' Applicable when method = "Discrete" or "Exclusion". Refer to Henshall et al. 2014
#' #:
# \itemize{
#  \item{SAMPLE_ID (integer).}
#  \item{SNP_ID (character).}
#  \item{AA_GENO_PROB (numeric).}
#  \item{AB_GENO_PROB (numeric).}
#  \item{BA_GENO_PROB (numeric).}
#  \item{BB_GENO_PROB (numeric).}
#  \item{A_TRANS_PROB (numeric).}
#  \item{B_TRANS_PROB (numeric).}
# }
#' @return 'dkj' Applicable when method = "Discrete" or "Exclusion". Refer to Hamilton 2020
#' #:
# \itemize{
#  \item{SAMPLE_ID (integer).}
#  \item{SNP_ID (character).}
#  \item{AAAA (numeric).}
#  \item{AAAB (numeric).}
#  \item{AABB (numeric).}
#  \item{ABBB (numeric).}
#  \item{BBBB (numeric).}
# }
#' @return 'dklj.adj' Applicable when method = "Discrete" or "Exclusion". Refer to Hamilton 2020 (dkj.star)
#' #:
# \itemize{
#  \item{SNP_ID (character).}
#  \item{SAMPLE_ID (integer).}
#  \item{AAAA (numeric).}
#  \item{AAAB (numeric).}
#  \item{AABB (numeric).}
#  \item{ABBB (numeric).}
#  \item{BBBB (numeric).}
#  \item{FAM_SET_COMBN_ID (integer).}
# }
#' @return 'fkj.and.weight' Applicable when method = "Least_squares". Refer to Henshall et al. 2014
#' #:
# \itemize{
#  \item{SNP_ID (character).}
#  \item{SAMPLE_ID (integer).}
#  \item{MEAN_P_AA (numeric).}
#  \item{MEAN_P_AB (numeric).}
#  \item{MEAN_P_BB (numeric).}
#  \item{ALLELIC_PROP_POOL (numeric).}
#  \item{FREQ_POOL (numeric).}
#  \item{FREQ_POOL_ERROR_WT (numeric).}
# }
#' @return 'Gij' Applicable when method = "Discrete" or ""Quantitative". Refer to Henshall et al. 2014
#' #:
# \itemize{
#  \item{SAMPLE_ID (integer).}
#  \item{SNP_ID (character).}
#  \item{AA_GENO_PROB (numeric).}
#  \item{AB_GENO_PROB (numeric).}
#  \item{BA_GENO_PROB (numeric).}
#  \item{BB_GENO_PROB (numeric).}
#  \item{A_TRANS_PROB (numeric).}
#  \item{B_TRANS_PROB (numeric).}
#  \item{ALLELIC_PROP_INDIV (numeric).}
# }
#' @return 'gkj' Applicable when method = "Discrete" or ""Quantitative". Refer to Hamilton 2020 
#' #:
# \itemize{
#  \item{SAMPLE_ID (integer).}
#  \item{SNP_ID (character).}
#  \item{AAAA (numeric).}
#  \item{AAAB (numeric).}
#  \item{AABB (numeric).}
#  \item{ABBB (numeric).}
#  \item{BBBB (numeric).}
# }
#' @return 'gklj.adj' Applicable when method = "Discrete" or ""Quantitative". Refer to Hamilton 2020 (gkj.star)
#' #:
# \itemize{
#  \item{SNP_ID (character).}
#  \item{SAMPLE_ID (integer).}
#  \item{AAAA (numeric).}
#  \item{AAAB (numeric).}
#  \item{AABB (numeric).}
#  \item{ABBB (numeric).}
#  \item{BBBB (numeric).}
#  \item{FAM_SET_COMBN_ID (integer).}
# }
#' @return 'flj.probs' Applicable when discrete.method = "geno.probs". Refer to Hamilton 2020 (fj)
#' #:
# \itemize{
#  \item{FAM_SET_COMBN_ID (integer).}
#  \item{SNP_ID (character).}
#  \item{A_TRANS_PROB (numeric).}
#  \item{B_TRANS_PROB (numeric).}
#  \item{SAMPLE_ID (integer).}
# }
#' @return 'flj.geno' Applicable when discrete.method = "assigned.genos". Refer to Hamilton 2020 (fj)
#' #:
# \itemize{
#  \item{FAM_SET_COMBN_ID (integer).}
#  \item{SNP_ID (character).}
#  \item{A_TRANS_PROB (numeric).}
#  \item{B_TRANS_PROB (numeric).}
#  \item{SAMPLE_ID (integer).}
# }
#' @return 'lambda.kj' Applicable when method = "Discrete" or ""Quantitative". Refer to Hamilton 2020. 
#' #Example output for n.in.pools = 2#:
# \itemize{
#  \item{SAMPLE_ID (integer).}
#  \item{SNP_ID (character).}
#  \item{AAAA_LAMBDA (numeric).}
#  \item{AAAB_LAMBDA (numeric).}
#  \item{AABB_LAMBDA (numeric).}
#  \item{ABBB_LAMBDA (numeric).}
#  \item{BBBB_LAMBDA (numeric).}
#  \item{ALLELIC_PROP_POOL (numeric).}
# }
#' @return 'lod.duos.discrete' Applicable when method = "Discrete". Refer to Hamilton 2020.
#' #:
# \itemize{
#  \item{SAMPLE_ID (integer).}
#  \item{PARENT_COMBN_ID (integer).}
#  \item{MISS_PARENT_SNP_DATA_PROP (numeric).}
#  \item{MISS_POOL_SNP_DATA_PROP (numeric).}
#  \item{NO_MISS_PARENT_OR_POOL_PROP (numeric).}
#  \item{MIN_LOGL (numeric).}
#  \item{MIN_LOGL_SNP (character).}
#  \item{MAX_LOGL (numeric).}
#  \item{MAX_LOGL_SNP (character).}
#  \item{RANGE_5_TO_95_LOGL (numeric).}
#  \item{LOD (numeric).}
#  \item{FAM_SET_COMBN_ID (integer).}
#  \item{PARENT_1 (integer).}
#  \item{PARENT_2 (integer).}
#  \item{PARENT_3 (integer).}
#  \item{PARENT_4 (integer).}
# }
#' @return 'lod.duos.quant' Applicable when method = "Quantitative". Refer to Hamilton 2020.
#' #:
# \itemize{
#  \item{SAMPLE_ID (integer).}
#  \item{PARENT_COMBN_ID (integer).}
#  \item{MISS_PARENT_SNP_DATA_PROP (numeric).}
#  \item{MISS_POOL_SNP_DATA_PROP (numeric).}
#  \item{NO_MISS_PARENT_OR_POOL_PROP (numeric).}
#  \item{MIN_LOGL (numeric).}
#  \item{MIN_LOGL_SNP (character).}
#  \item{MAX_LOGL (numeric).}
#  \item{MAX_LOGL_SNP (character).}
#  \item{RANGE_5_TO_95_LOGL (numeric).}
#  \item{LOD (numeric).}
#  \item{FAM_SET_COMBN_ID (integer).}
#  \item{PARENT_1 (integer).}
#  \item{PARENT_2 (integer).}
#  \item{PARENT_3 (integer).}
#  \item{PARENT_4 (integer).}
# }
#' @return 'logl.duos.discrete' Applicable when method = "Discrete". Refer to Hamilton 2020. Only outputted for the final SAMPLE_ID
#' #:
# \itemize{
#  \item{SNP_ID (character).}
#  \item{SAMPLE_ID (integer).}
#  \item{PARENT_COMBN_ID (integer).}
#  \item{LIKE_(numeric)ERATOR (numeric).}
#  \item{LIKE_DENOMINATOR (numeric).}
#  \item{LOG_LIKE_RATIO (numeric).}
#  \item{MISS_PARENT_SNP_DATA_PROP (numeric).}
#  \item{MISS_POOL_SNP_DATA_PROP (numeric).}
# }
#' @return 'logl.duos.quant' Applicable when method = "Quantitative". Refer to Hamilton 2020. Only outputted for the final SAMPLE_ID
#' #:
# \itemize{
#  \item{SNP_ID (character).}
#  \item{SAMPLE_ID (integer).}
#  \item{PARENT_COMBN_ID (integer).}
#  \item{LIKE_(numeric)ERATOR (numeric).}
#  \item{LIKE_DENOMINATOR (numeric).}
#  \item{LOG_LIKE_RATIO (numeric).}
#  \item{MISS_PARENT_SNP_DATA_PROP (numeric).}
#  \item{MISS_POOL_SNP_DATA_PROP (numeric).}
# }
#' @return 'mismatches' Applicable when method = "Exclusion". Refer to Hamilton 2020.
#' #:
# \itemize{
#  \item{SAMPLE_ID (integer).}
#  \item{PARENT_COMBN_ID (integer).}
#  \item{MISMATCHES (integer).}
#  \item{SNP_COUNT (integer).}
#  \item{MISMATCH_PROP (numeric).}
#  \item{MISMATCH_PROP_SE (numeric).}
#  \item{MISMATCH_PROP_Z (numeric).}
# }
#' @return 'mismatches.by.snp' Applicable when method = "Exclusion". Refer to Hamilton 2020. 
#' #Example output for n.in.pools = 2:
# \itemize{
#  \item{SNP_ID (character).}
#  \item{PARENT_COMBN_ID (integer).}
#  \item{AAAA_TRANS (numeric).}
#  \item{AAAB_TRANS (numeric).}
#  \item{AABB_TRANS (numeric).}
#  \item{ABBB_TRANS (numeric).}
#  \item{BBBB_TRANS (numeric).}
#  \item{FAM_SET_COMBN_ID (integer).}
#  \item{MISS_PARENT_COUNT (numeric).}
#  \item{MISS_PARENT (logical).}
#  \item{SAMPLE_ID (integer).}
#  \item{AAAA_POOLS (numeric).}
#  \item{AAAB_POOLS (numeric).}
#  \item{AABB_POOLS (numeric).}
#  \item{ABBB_POOLS (numeric).}
#  \item{BBBB_POOLS (numeric).}
#  \item{MISS_POOL (logical).}
#  \item{MISMATCHES (integer).}
# }
#' @return 'nlj.probs' Applicable when discrete.method = "geno.probs". Refer to Hamilton 2020 (nj). 
#' #Example output for n.in.pools = 2:
# \itemize{
#  \item{FAM_SET_COMBN_ID (integer).}
#  \item{SNP_ID (character).}
#  \item{AAAA (numeric).}
#  \item{AAAB (numeric).}
#  \item{AABB (numeric).}
#  \item{ABBB (numeric).}
#  \item{BBBB (numeric).}
# }
#' @return 'nlj.geno'  Applicable when discrete.method = "geno.probs". Refer to Hamilton 2020 (nj).
#' # Example output for n.in.pools = 2:
# \itemize{
#  \item{FAM_SET_COMBN_ID (integer).}
#  \item{SNP_ID (character).}
#  \item{AAAA (numeric).}
#  \item{AAAB (numeric).}
#  \item{AABB (numeric).}
#  \item{ABBB (numeric).}
#  \item{BBBB (numeric).}
# }
#' @return 'parent.combns':
# \itemize{
#  \item{FAM_COMBN_ID (integer).}
#  \item{FAMILY_ID_1 (integer).}
#  \item{FAMILY_ID_2 (integer).}
#  \item{PARENT_COMBN_ID (integer).}
#  \item{PARENT_1 (integer).}
#  \item{PARENT_2 (integer).}
#  \item{PARENT_3 (integer).}
#  \item{PARENT_4 (integer).}
# }
#' @return 'phi.ij' Applicable when method = "Discrete" or ""Quantitative". Refer to Henshall et al. 2014
#' #:
# \itemize{
#  \item{SAMPLE_ID (integer).}
#  \item{SNP_ID (character).}
#  \item{AA_PHI (numeric).}
#  \item{AB_PHI (numeric).}
#  \item{BA_PHI (numeric).}
#  \item{BB_PHI (numeric).}
#  \item{ALLELIC_PROP_INDIV (numeric).}
# }
#' @return 'snp.error.probs' Applicable when discrete.method = "geno.probs". Refer to Henshall et al. 2014
#' #:
# \itemize{
#  \item{SNP_ID Factor}
#  \item{SNP_ERROR_TILDE (numeric).}
# }
#' @return 'snp.error.geno'  Applicable when discrete.method = "assigned.genos". Refer to Henshall et al. 2014
#' #:
# \itemize{
#  \item{SNP_ID Factor}
#  \item{SNP_ERROR_TILDE (numeric).}
# }
#' @return 'tclj.adj.quant' Applicable when method = "Quantitative". Refer to Hamilton 2020 (tcj.star). 
#' #Example output for n.in.pools = 2:
# \itemize{
#  \item{SNP_ID (character).}
#  \item{PARENT_COMBN_ID (integer).}
#  \item{AAAA (numeric).}
#  \item{AAAB (numeric).}
#  \item{AABB (numeric).}
#  \item{ABBB (numeric).}
#  \item{BBBB (numeric).}
#  \item{FAM_SET_COMBN_ID (integer).}
# }
#' @return 'tclj.adj.discrete' Applicable when method = "Discrete". Refer to Hamilton 2020 (tcj.star). 
#' #Example output for n.in.pools = 2:
# \itemize{
#  \item{SNP_ID (character).}
#  \item{PARENT_COMBN_ID (integer).}
#  \item{AAAA (numeric).}
#  \item{AAAB (numeric).}
#  \item{AABB (numeric).}
#  \item{ABBB (numeric).}
#  \item{BBBB (numeric).}
#  \item{FAM_SET_COMBN_ID (integer).}
# }
#' @return 'tclj.discrete' Applicable when method = "Discrete". Refer to Hamilton 2020 (tcj). 
#' #Example output for n.in.pools = 2:
# \itemize{
#  \item{SNP_ID (character).}
#  \item{PARENT_COMBN_ID (integer).}
#  \item{AAAA (numeric).}
#  \item{AAAB (numeric).}
#  \item{AABB (numeric).}
#  \item{ABBB (numeric).}
#  \item{BBBB (numeric).}
#  \item{FAM_SET_COMBN_ID (integer).}
# }
#' @return 'tclj.ls' Applicable when method = "Least_squares". Refer to Hamilton 2020 (tcj).
#' 3:
# \itemize{
#  \item{SNP_ID (character).}
#  \item{PARENT_COMBN_ID (integer).}
#  \item{MISS_PARENT_COUNT (numeric).}
#  \item{FAM_SET_COMBN_ID (integer).}
#  \item{B_TRANS_PROB (numeric).}
# }
#' @return 'tclj.quant' Applicable when method = "Quantitative". Refer to Hamilton 2020 (tcj). 
#' #Example output for n.in.pools = 2::
# \itemize{
#  \item{SNP_ID (character).}
#  \item{PARENT_COMBN_ID (integer).}
#  \item{AAAA (numeric).}
#  \item{AAAB (numeric).}
#  \item{AABB (numeric).}
#  \item{ABBB (numeric).}
#  \item{BBBB (numeric).}
#  \item{FAM_SET_COMBN_ID (integer).}
# }
#' @return 'Xl.mat' List Applicable when method = "Quantitative". Refer to Henshall et al 2014 (X)
#' #:
# \itemize{
#  \item{ (numeric).}
# }

#' @examples
#' 
#' #' #Retrieve data for 'pooling by phenotype' example from Hamilton 2020
#' data(shrimp.snp.dat.indiv)
#' data(shrimp.snp.dat.pools)
#' data(shrimp.fams)
#' 
#' #Compute SNP parameters
#' shrimp.snp.param.indiv <- snp.param.indiv.fun(shrimp.snp.dat.indiv)
#' shrimp.snp.param.pools <- snp.param.pools.fun(shrimp.snp.param.indiv, n.in.pools = 2)
#'
#' #Assign parentage using the quantitative maximum likelihood method
#' parent.assign.fun(method= "Quantitative",
#'                   snp.dat.indiv = shrimp.snp.dat.indiv, 
#'                   snp.dat.pools = shrimp.snp.dat.pools,
#'                   n.in.pools = 2,
#'                   snp.error.assumed = 0.01,
#'                   snp.param.indiv = shrimp.snp.param.indiv,
#'                   snp.param.pools = shrimp.snp.param.pools,                  
#'                   fams = shrimp.fams)  
#' 
#' #Retrieve data for 'pooling for individual parentage assignment' example from Hamilton 2020
#' data(ab.snp.dat.indiv)
#' data(ab.snp.dat.pools)
#' data(ab.fams)
#' data(ab.fam.set.combns)
#' data(ab.fam.set.combns.by.pool)
#' 
#' #Compute SNP parameters
#' ab.snp.param.indiv <- snp.param.indiv.fun(ab.snp.dat.indiv)
#' ab.snp.param.pools <- snp.param.pools.fun(ab.snp.param.indiv, n.in.pools = 3)
#'
#' #Assign parentage using the quantitative maximum likelihood method
#' parent.assign.fun(method= "Quantitative",
#'                   snp.dat.indiv = ab.snp.dat.indiv, 
#'                   snp.dat.pools = ab.snp.dat.pools,
#'                   n.in.pools = 3,
#'                   snp.error.assumed = 0.01,
#'                   snp.param.indiv = ab.snp.param.indiv,
#'                   snp.param.pools = ab.snp.param.pools,                  
#'                   fams = ab.fams,
#'                   fam.set.combns = ab.fam.set.combns,
#'                   fam.set.combns.by.pool = ab.fam.set.combns.by.pool) 
#'                   
#' #Retrieve data for small worked example from Hamilton 2020
#' data(Ham.snp.dat.indiv)
#' data(Ham.snp.dat.pools)
#' data(Ham.fams)
#' data(Ham.fam.set.combns)
#' data(Ham.fam.set.combns.by.pool)
#' 
#' #Compute SNP parameters
#' Ham.snp.param.indiv <- snp.param.indiv.fun(Ham.snp.dat.indiv)
#' Ham.snp.param.pools <- snp.param.pools.fun(Ham.snp.param.indiv, n.in.pools = 2)
#' 
#' #Assign parentage using the least squares method
#' parent.assign.fun(method = "Least_squares",
#'                   beta.min.ss = TRUE, 
#'                   snp.dat.indiv = Ham.snp.dat.indiv, 
#'                   snp.dat.pools = Ham.snp.dat.pools,
#'                   n.in.pools = 2,
#'                   snp.error.assumed = 0.01,
#'                   snp.param.indiv = Ham.snp.param.indiv,
#'                   snp.param.pools = Ham.snp.param.pools,                  
#'                   fams = Ham.fams,
#'                   fam.set.combns = Ham.fam.set.combns,
#'                   fam.set.combns.by.pool = Ham.fam.set.combns.by.pool)
#'                   
#' #Assign parentage using the quantitative maximum likelihood method
#' parent.assign.fun(method= "Quantitative",
#'                   threshold.indiv = 0.98,         
#'                   threshold.pools = 0.98,         
#'                   snp.dat.indiv = Ham.snp.dat.indiv, 
#'                   snp.dat.pools = Ham.snp.dat.pools,
#'                   n.in.pools = 2,
#'                   snp.error.assumed = 0.01,
#'                   snp.param.indiv = Ham.snp.param.indiv,
#'                   snp.param.pools = Ham.snp.param.pools,                  
#'                   fams = Ham.fams,
#'                   fam.set.combns = Ham.fam.set.combns,
#'                   fam.set.combns.by.pool = Ham.fam.set.combns.by.pool)  
#'                   
#' #Assign parentage using the discrete maximum likelihood method 
#' #(discrete.method = "geno.probs")
#' parent.assign.fun(method= "Discrete",
#'                   discrete.method = "geno.probs",
#'                   threshold.indiv = 0.98,         
#'                   threshold.pools = 0.98,         
#'                   snp.dat.indiv = Ham.snp.dat.indiv, 
#'                   snp.dat.pools = Ham.snp.dat.pools,
#'                   n.in.pools = 2,
#'                   snp.error.assumed = 0.01,
#'                   snp.param.indiv = Ham.snp.param.indiv,
#'                   snp.param.pools = Ham.snp.param.pools,                  
#'                   fams = Ham.fams,
#'                   fam.set.combns = Ham.fam.set.combns,
#'                   fam.set.combns.by.pool = Ham.fam.set.combns.by.pool)  
#'                   
#' #Assign parentage using the discrete maximum likelihood method 
#' #(discrete.method = "assigned.genos")
#' parent.assign.fun(method= "Discrete",
#'                   discrete.method = "assigned.genos",
#'                   snp.dat.indiv = Ham.snp.dat.indiv, 
#'                   snp.dat.pools = Ham.snp.dat.pools,
#'                   n.in.pools = 2,
#'                   snp.error.assumed = 0.01,
#'                   fams = Ham.fams,
#'                   fam.set.combns = Ham.fam.set.combns,
#'                   fam.set.combns.by.pool = Ham.fam.set.combns.by.pool)    
#'   
#' #Assign parentage using the exclusion method 
#' #(discrete.method = "geno.probs")
#' parent.assign.fun(method= "Exclusion",
#'                   discrete.method = "geno.probs",
#'                   threshold.indiv = 0.98,         
#'                   threshold.pools = 0.98,         
#'                   snp.dat.indiv = Ham.snp.dat.indiv, 
#'                   snp.dat.pools = Ham.snp.dat.pools,
#'                   n.in.pools = 2,
#'                   snp.error.assumed = 0.01,
#'                   snp.param.indiv = Ham.snp.param.indiv,
#'                   snp.param.pools = Ham.snp.param.pools,                  
#'                   fams = Ham.fams,
#'                   fam.set.combns = Ham.fam.set.combns,
#'                   fam.set.combns.by.pool = Ham.fam.set.combns.by.pool)   
#'                                   
#' #Assign parentage using the exclusion method 
#' #(discrete.method = "assigned.genos")
#' parent.assign.fun(method= "Exclusion",
#'                   discrete.method = "assigned.genos",
#'                   snp.dat.indiv = Ham.snp.dat.indiv, 
#'                   snp.dat.pools = Ham.snp.dat.pools,
#'                   n.in.pools = 2,
#'                   fams = Ham.fams,
#'                   fam.set.combns = Ham.fam.set.combns,
#'                   fam.set.combns.by.pool = Ham.fam.set.combns.by.pool)   
#'  
#' #Assign parentage using multiple methods
#' #(discrete.method = "geno.probs")
#' parent.assign.fun(method = c("Least_squares", "Quantitative", "Discrete", "Exclusion"),
#'                   beta.min.ss = TRUE, 
#'                   discrete.method = "geno.probs",
#'                   threshold.indiv = 0.98,         
#'                   threshold.pools = 0.98,         
#'                   snp.dat.indiv = Ham.snp.dat.indiv, 
#'                   snp.dat.pools = Ham.snp.dat.pools,
#'                   n.in.pools = 2,
#'                   snp.error.assumed = 0.01,
#'                   snp.param.indiv = Ham.snp.param.indiv,
#'                   snp.param.pools = Ham.snp.param.pools,                  
#'                   fams = Ham.fams,
#'                   fam.set.combns = Ham.fam.set.combns,
#'                   fam.set.combns.by.pool = Ham.fam.set.combns.by.pool)  
#'                   
#' @references Henshall JM, Dierens, L Sellars MJ (2014) Quantitative analysis of low-density SNP data for parentage assignment and estimation of family contributions to pooled samples. Genetics Selection Evolution 46, 51. https://doi 10.1186/s12711-014-0051-y 
#' @references Hamilton MG (2020) Maximum likelihood parentage assignment using quantitative genotypes

#' @export

#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################

parent.assign.fun <- function(method, #c("Quantitative", "Discrete", "Exclusion", "Least_squares"),
                              beta.min.ss = FALSE, #Appicable to least_squares method.  If TRUE, beta constrained to integers according to n.in.pools and minimum sum of squares identified
                              discrete.method = "geno.probs", #"geno.probs" or "assigned.genos" 
                              threshold.indiv = NULL,         #Dij.from.Gij.fun.  Not used if discrete.method = "assigned.genos"
                              threshold.pools = NULL,         #dkj.from.gkj.fun.  Not used if discrete.method = "assigned.genos"
                              
                              #SNP data
                              snp.dat.indiv, 
                              snp.dat.pools,
                              n.in.pools,
                              min.intensity        = 0,    #pij.fun.  
                              snp.error.assumed    = NULL, #If not null then this error is applied to all SNP.
                              snp.error.underlying = NULL, #adj.geno.prob.fun.  Not required if snp.error.assumed is not NULL.
                              
                              #SNP parameters
                              snp.param.indiv = NULL,
                              snp.param.pools = NULL,
                              min.sd          = 0,         #phi.ij.fun   lambda.ij.fun  
                              
                              #Define pools
                              fams,
                              fam.set.combns = NULL,
                              fam.set.combns.by.pool = NULL,

                             skip.checks = FALSE
                             ) {

  print(Sys.time())
  print("Running parent.assign.fun")  
  
  # load required packages
  if("dplyr" %in% installed.packages()[, "Package"] == FALSE) {install.packages("dplyr", repos='https://cran.csiro.au/')} 
  library(dplyr) 
  if("mgcv" %in% installed.packages()[, "Package"] == FALSE) {install.packages("mgcv", repos='https://cran.csiro.au/')} 
  library(mgcv)
  if("RColorBrewer" %in% installed.packages()[, "Package"] == FALSE) {install.packages("RColorBrewer", repos='https://cran.csiro.au/')} 
  library(RColorBrewer)
  if("gplots" %in% installed.packages()[, "Package"] == FALSE) {install.packages("gplots", repos='https://cran.csiro.au/')} 
  library(gplots)
  if("ggplot2" %in% installed.packages()[, "Package"] == FALSE) {install.packages("ggplot2", repos='https://cran.csiro.au/')} 
  library(ggplot2)
  if("reshape2" %in% installed.packages()[, "Package"] == FALSE) {install.packages("reshape2", repos='https://cran.csiro.au/')} 
  library(reshape2)
  
  #Define plot parameters for bar.plot.fun - these were inputs in the parent.assign.fun but have been moved to simplify inputs
  file.name = ""              #Text. Name of thebar plot file.  Only relevant if method includes 'Least_squares'
  var = "BETA_HAT"            #Text.  Variable to plot "BETA_STAR" or "BETA_HAT".  Only relevant if method includes 'Least_squares'
  heading = "Estimated family contributions to pooled samples" #Text. Title of thebar plot .  Only relevant if method includes 'Least_squares'
  plot.to.heading.height = 20 # Number. Height of the title relative to the height of thebar plot.  Only relevant if method includes 'Least_squares' 
  font.size.heading = 3       #Number. Font size ofbar plot heading.  Only relevant if method includes 'Least_squares'
  font.size.y.axis = 2        #Number. Font size ofbar plot y axis labels.  Only relevant if method includes 'Least_squares'
  font.size.x.axis = 2        #Number. Font size ofbar plot x axis labels.  Only relevant if method includes 'Least_squares'

    #Start checks############################################################################
  
  if(!skip.checks) {
    
    # method checks
    
    if(sum(method %in% c("Quantitative", "Discrete", "Exclusion", "Least_squares")) != length(method)) {
      stop("method must be a vector containing the words \'Quantitative\', \'Discrete\', \'Exclusion\', and or \'Least_squares\'")
    }  
    
    # discrete.method checks
    
    if(!(discrete.method %in% c("geno.probs", "assigned.genos"))) {
      stop("discrete.method must be \'geno.probs\' or \'assigned.genos\'")
    }
    
    # snp.dat.indiv checks 
    
    #Check that required headings are present in snp.dat.indiv  
    
    if(discrete.method  == "geno.probs") {
      
      if(sum(c("SAMPLE_ID", "SNP_ID", "INTENSITY_A", "INTENSITY_B") %in% colnames(snp.dat.indiv)) != 4) {
        stop("snp.dat.indiv input must be a data frame containing the following headings: SAMPLE_ID, SNP_ID, INTENSITY_A, INTENSITY_B")
      }  
      
      snp.dat.indiv <- snp.dat.indiv[,c("SAMPLE_ID", "SNP_ID", "INTENSITY_A", "INTENSITY_B")]
      
      snp.dat.indiv$SAMPLE_ID   <- as.integer(snp.dat.indiv$SAMPLE_ID)
      snp.dat.indiv$SNP_ID      <- as.character(snp.dat.indiv$SNP_ID)  
      snp.dat.indiv$INTENSITY_A <- as.numeric(snp.dat.indiv$INTENSITY_A)  
      snp.dat.indiv$INTENSITY_B <- as.numeric(snp.dat.indiv$INTENSITY_B)  
      
      if(sum(!is.integer(snp.dat.indiv$SAMPLE_ID)) > 0) {
        stop("SAMPLE_ID in snp.dat.indiv must be an integer.  Also check for missing values.")
      }
      if(sum(!is.character(snp.dat.indiv$SNP_ID)) > 0) {
        stop("SNP_ID in snp.dat.indiv must be a character.  Check for missing values.")
      }
      if(sum(!is.numeric(snp.dat.indiv$INTENSITY_A)) > 0) {
        stop("INTENSITY_A in snp.dat.indiv must be numeric.  Also check for missing values.")
      }
      if(sum(!is.numeric(snp.dat.indiv$INTENSITY_B)) > 0) {
        stop("INTENSITY_B in snp.dat.indiv must be numeric.  Also check for missing values.")
      }
      
    } 
    
    if(discrete.method  == "assigned.genos" & ("Quantitative" %in% method | "Least_squares" %in% method)) {
      
      if(sum(c("SAMPLE_ID", "SNP_ID", "INTENSITY_A", "INTENSITY_B", "A_ALLELE", "B_ALLELE", "GENOTYPE") %in% colnames(snp.dat.indiv)) != 7) {
        stop("snp.dat.indiv input must be a data frame containing the following headings: SAMPLE_ID, SNP_ID, INTENSITY_A, INTENSITY_B, A_ALLELE, B_ALLELE, GENOTYPE")
      }
      
      snp.dat.indiv <- snp.dat.indiv[,c("SAMPLE_ID", "SNP_ID", "INTENSITY_A", "INTENSITY_B", "A_ALLELE", "B_ALLELE", "GENOTYPE")] 
      
      snp.dat.indiv$SAMPLE_ID   <- as.integer(snp.dat.indiv$SAMPLE_ID)
      snp.dat.indiv$SNP_ID      <- as.character(snp.dat.indiv$SNP_ID)  
      snp.dat.indiv$INTENSITY_A <- as.numeric(snp.dat.indiv$INTENSITY_A)  
      snp.dat.indiv$INTENSITY_B <- as.numeric(snp.dat.indiv$INTENSITY_B) 
      snp.dat.indiv$A_ALLELE <- as.character(snp.dat.indiv$A_ALLELE)
      snp.dat.indiv$B_ALLELE <- as.character(snp.dat.indiv$B_ALLELE)
      snp.dat.indiv$GENOTYPE      <- as.character(snp.dat.indiv$GENOTYPE) 
      
      if(sum(!is.integer(snp.dat.indiv$SAMPLE_ID)) > 0) {
        stop("SAMPLE_ID in snp.dat.indiv must be an integer.  Also check for missing values.")
      }
      if(sum(!is.character(snp.dat.indiv$SNP_ID)) > 0) {
        stop("SNP_ID in snp.dat.indiv must be a character.  Check for missing values.")
      }
      if(sum(!is.numeric(snp.dat.indiv$INTENSITY_A)) > 0) {
        stop("INTENSITY_A in snp.dat.indiv must be numeric.  Also check for missing values.")
      }
      if(sum(!is.numeric(snp.dat.indiv$INTENSITY_B)) > 0) {
        stop("INTENSITY_B in snp.dat.indiv must be numeric.  Also check for missing values.")
      }
      if(sum(!is.character(snp.dat.indiv$A_ALLELE)) > 0) {
        stop("A_ALLELE in snp.dat.indiv must be a character.  Check for missing values.")
      }
      if(sum(!is.character(snp.dat.indiv$B_ALLELE)) > 0) {
        stop("B_ALLELE in snp.dat.indiv must be a character.  Check for missing values.")
      }
      if(sum(!is.character(snp.dat.indiv$GENOTYPE)) > 0) {
        stop("GENOTYPE in snp.dat.indiv must be a character.  Check for missing values.")
      }
    }
    
    if(discrete.method  == "assigned.genos" & !("Quantitative" %in% method | "Least_squares" %in% method)) {
      
      if(sum(c("SAMPLE_ID", "SNP_ID", "A_ALLELE", "B_ALLELE", "GENOTYPE") %in% colnames(snp.dat.indiv)) != 5) {
        stop("snp.dat.indiv input must be a data frame containing the following headings: SAMPLE_ID, SNP_ID, A_ALLELE, B_ALLELE, GENOTYPE")
      }
      
      snp.dat.indiv <- snp.dat.indiv[,c("SAMPLE_ID", "SNP_ID", "A_ALLELE", "B_ALLELE", "GENOTYPE")] 
      
      snp.dat.indiv$SAMPLE_ID   <- as.integer(snp.dat.indiv$SAMPLE_ID)
      snp.dat.indiv$SNP_ID      <- as.character(snp.dat.indiv$SNP_ID) 
      snp.dat.indiv$A_ALLELE <- as.character(snp.dat.indiv$A_ALLELE)
      snp.dat.indiv$B_ALLELE <- as.character(snp.dat.indiv$B_ALLELE)
      snp.dat.indiv$GENOTYPE <- as.character(snp.dat.indiv$GENOTYPE)   
      
      if(sum(!is.integer(snp.dat.indiv$SAMPLE_ID)) > 0) {
        stop("SAMPLE_ID in snp.dat.indiv must be an integer.  Also check for missing values.")
      }
      if(sum(!is.character(snp.dat.indiv$SNP_ID)) > 0) {
        stop("SNP_ID in snp.dat.indiv must be a character.  Check for missing values.")
      }
      if(sum(!is.character(snp.dat.indiv$A_ALLELE)) > 0) {
        stop("A_ALLELE in snp.dat.indiv must be a character.  Check for missing values.")
      }
      if(sum(!is.character(snp.dat.indiv$B_ALLELE)) > 0) {
        stop("B_ALLELE in snp.dat.indiv must be a character.  Check for missing values.")
      }
      if(sum(!is.character(snp.dat.indiv$GENOTYPE)) > 0) {
        stop("GENOTYPE in snp.dat.indiv must be a character.  Check for missing values.")
      }
    }
    
    if(sum(is.na(snp.dat.indiv[,c("SAMPLE_ID", "SNP_ID")])) > 0) {
      stop("Check inputs in snp.dat.indiv  Are there missing SAMPLE_ID or SNP_ID?  Are columns of the correct class?")
    }
    
    #Check for duplicated records in snp.dat.pools
    indiv.snp <- paste(snp.dat.indiv$SAMPLE_ID,snp.dat.indiv$SNP_ID, sep=".")
    if(sum(duplicated(indiv.snp)) > 0) {
      stop("SAMPLE_ID and SNP_ID combinations are not unique in snp.dat.indiv  Delete duplicates or recode SAMPLE_ID.")
    }
    rm(indiv.snp)
    
    #order
    snp.dat.indiv <- snp.dat.indiv[order(snp.dat.indiv[,"SAMPLE_ID"]),]
    snp.dat.indiv <- snp.dat.indiv[order(snp.dat.indiv[,"SNP_ID"]),]
    
    # snp.dat.pools checks
    
    #Check that required headings are present in snp.dat.pools  
    
    if(discrete.method  == "geno.probs") {
      
      if(sum(c("SAMPLE_ID", "SNP_ID", "INTENSITY_A", "INTENSITY_B") %in% colnames(snp.dat.pools)) != 4) {
        stop("snp.dat.pools input must be a data frame containing the following headings: SAMPLE_ID, SNP_ID, INTENSITY_A, INTENSITY_B")
      }  
      
      snp.dat.pools <- snp.dat.pools[,c("SAMPLE_ID", "SNP_ID", "INTENSITY_A", "INTENSITY_B")]
      
      snp.dat.pools$SAMPLE_ID   <- as.integer(snp.dat.pools$SAMPLE_ID)
      snp.dat.pools$SNP_ID      <- as.character(snp.dat.pools$SNP_ID)  
      snp.dat.pools$INTENSITY_A <- as.numeric(snp.dat.pools$INTENSITY_A)  
      snp.dat.pools$INTENSITY_B <- as.numeric(snp.dat.pools$INTENSITY_B)  
      
      if(sum(!is.integer(snp.dat.pools$SAMPLE_ID)) > 0) {
        stop("SAMPLE_ID in snp.dat.pools must be an integer.  Also check for missing values.")
      }
      if(sum(!is.character(snp.dat.pools$SNP_ID)) > 0) {
        stop("SNP_ID in snp.dat.pools must be a character.  Check for missing values.")
      }
      if(sum(!is.numeric(snp.dat.pools$INTENSITY_A)) > 0) {
        stop("INTENSITY_A in snp.dat.pools must be numeric.  Also check for missing values.")
      }
      if(sum(!is.numeric(snp.dat.pools$INTENSITY_B)) > 0) {
        stop("INTENSITY_B in snp.dat.pools must be numeric.  Also check for missing values.")
      }
      
    } 
    
    if(discrete.method  == "assigned.genos" & ("Quantitative" %in% method | "Least_squares" %in% method)) {
      
      if(sum(c("SAMPLE_ID", "SNP_ID", "INTENSITY_A", "INTENSITY_B", "GENOTYPE") %in% colnames(snp.dat.pools)) != 5) {
        stop("snp.dat.pools input must be a data frame containing the following headings: SAMPLE_ID, SNP_ID, INTENSITY_A, INTENSITY_B, GENOTYPE")
      }
      
      snp.dat.pools <- snp.dat.pools[,c("SAMPLE_ID", "SNP_ID", "INTENSITY_A", "INTENSITY_B", "GENOTYPE")] 
      
      snp.dat.pools$SAMPLE_ID   <- as.integer(snp.dat.pools$SAMPLE_ID)
      snp.dat.pools$SNP_ID      <- as.character(snp.dat.pools$SNP_ID)  
      snp.dat.pools$INTENSITY_A <- as.numeric(snp.dat.pools$INTENSITY_A)  
      snp.dat.pools$INTENSITY_B <- as.numeric(snp.dat.pools$INTENSITY_B)  
      snp.dat.pools$GENOTYPE      <- as.character(snp.dat.pools$GENOTYPE) 
      
      if(sum(!is.integer(snp.dat.pools$SAMPLE_ID)) > 0) {
        stop("SAMPLE_ID in snp.dat.pools must be an integer.  Also check for missing values.")
      }
      if(sum(!is.character(snp.dat.pools$SNP_ID)) > 0) {
        stop("SNP_ID in snp.dat.pools must be a character.  Check for missing values.")
      }
      if(sum(!is.numeric(snp.dat.pools$INTENSITY_A)) > 0) {
        stop("INTENSITY_A in snp.dat.pools must be numeric.  Also check for missing values.")
      }
      if(sum(!is.numeric(snp.dat.pools$INTENSITY_B)) > 0) {
        stop("INTENSITY_B in snp.dat.pools must be numeric.  Also check for missing values.")
      }
      if(sum(!is.character(snp.dat.pools$GENOTYPE)) > 0) {
        stop("GENOTYPE in snp.dat.pools must be a character.  Check for missing values.")
      }
    }
    
    if(discrete.method  == "assigned.genos" & !("Quantitative" %in% method | "Least_squares" %in% method)) {
      
      if(sum(c("SAMPLE_ID", "SNP_ID", "GENOTYPE") %in% colnames(snp.dat.pools)) != 3) {
        stop("snp.dat.pools input must be a data frame containing the following headings: SAMPLE_ID, SNP_ID, GENOTYPE")
      }
      
      snp.dat.pools <- snp.dat.pools[,c("SAMPLE_ID", "SNP_ID", "GENOTYPE")] 
      
      snp.dat.pools$SAMPLE_ID   <- as.integer(snp.dat.pools$SAMPLE_ID)
      snp.dat.pools$SNP_ID      <- as.character(snp.dat.pools$SNP_ID)  
      snp.dat.pools$GENOTYPE <- as.character(snp.dat.pools$GENOTYPE)   
      
      if(sum(!is.integer(snp.dat.pools$SAMPLE_ID)) > 0) {
        stop("SAMPLE_ID in snp.dat.pools must be an integer.  Also check for missing values.")
      }
      if(sum(!is.character(snp.dat.pools$SNP_ID)) > 0) {
        stop("SNP_ID in snp.dat.pools must be a character.  Check for missing values.")
      }
      if(sum(!is.character(snp.dat.pools$GENOTYPE)) > 0) {
        stop("GENOTYPE in snp.dat.pools must be a character.  Check for missing values.")
      }
    }
    
    if(sum(is.na(snp.dat.pools[,c("SAMPLE_ID", "SNP_ID")])) > 0) {
      stop("Check inputs in snp.dat.pools.  Are there missing SAMPLE_ID or SNP_ID?  Are columns of the correct class?")
    }
    
    #Check for duplicated records in snp.dat.pools
    indiv.snp <- paste(snp.dat.pools$SAMPLE_ID,snp.dat.pools$SNP_ID, sep=".")
    if(sum(duplicated(indiv.snp)) > 0) {
      stop("SAMPLE_ID and SNP_ID combinations are not unique in snp.dat.pools.  Delete duplicates or recode SAMPLE_ID.")
    }
    rm(indiv.snp)
    
    #order
    snp.dat.pools <- snp.dat.pools[order(snp.dat.pools[,"SAMPLE_ID"]),]
    snp.dat.pools <- snp.dat.pools[order(snp.dat.pools[,"SNP_ID"]),]
    
    # snp.param.indiv checks
    
    #Check that required headings are present in snp.param.indiv  
    if("Least_squares" %in% method &
       sum(c("WELCH_A", "WELCH_B") %in% colnames(snp.param.indiv)) != 2) {
      stop("If using the least squares method, the snp.param.indiv input must contain the following headings: WELCH_A and WELCH_B")
    }
    
    if((sum(c("Quantitative", "Discrete", "Exclusion") %in% method) > 0 &
        discrete.method == "geno.probs") |
        "Quantitative" %in% method) {
    if(sum(c("SNP_ID", "MEAN_P_AA", "SD_P_AA", "MEAN_P_AB", "SD_P_AB", "MEAN_P_BB", "SD_P_BB", 
             "A_ALLELE", "B_ALLELE") %in% colnames(snp.param.indiv)) != 9) {  #"B_ALLELE_FREQ", 
      stop("snp.param.indiv input must contain the following headings: SNP_ID, MEAN_P_AA, SD_P_AA, MEAN_P_AB, SD_P_AB, MEAN_P_BB, SD_P_BB, A_ALLELE, and B_ALLELE") #B_ALLELE_FREQ,
    }
    
    snp.param.indiv$SNP_ID      <- as.character(snp.param.indiv$SNP_ID)  
    snp.param.indiv$MEAN_P_AA   <- as.numeric(snp.param.indiv$MEAN_P_AA)  
    snp.param.indiv$SD_P_AA     <- as.numeric(snp.param.indiv$SD_P_AA)  
    snp.param.indiv$MEAN_P_AB   <- as.numeric(snp.param.indiv$MEAN_P_AB)  
    snp.param.indiv$SD_P_AB     <- as.numeric(snp.param.indiv$SD_P_AB)  
    snp.param.indiv$MEAN_P_BB   <- as.numeric(snp.param.indiv$MEAN_P_BB)  
    snp.param.indiv$SD_P_BB     <- as.numeric(snp.param.indiv$SD_P_BB)  
    # snp.param.indiv$B_ALLELE_FREQ <- as.numeric(snp.param.indiv$B_ALLELE_FREQ)    
    snp.param.indiv$A_ALLELE    <- as.character(snp.param.indiv$A_ALLELE) 
    snp.param.indiv$B_ALLELE    <- as.character(snp.param.indiv$B_ALLELE) 
    
    if(sum(!is.character(snp.param.indiv$SNP_ID)) > 0) {
      stop("SNP_ID in snp.param.indiv must be a character.  Check for missing values.")
    }
    if(sum(!is.numeric(snp.param.indiv$MEAN_P_AA)) > 0) {
      stop("MEAN_P_AA in snp.param.indiv must be numeric.  Also check for missing values.")
    }
    if(sum(!is.numeric(snp.param.indiv$SD_P_AA)) > 0) {
      stop("SD_P_AA in snp.param.indiv must be numeric.  Also check for missing values.")
    }
    if(sum(!is.numeric(snp.param.indiv$MEAN_P_AB)) > 0) {
      stop("MEAN_P_AB in snp.param.indiv must be numeric.  Also check for missing values.")
    }
    if(sum(!is.numeric(snp.param.indiv$SD_P_AB)) > 0) {
      stop("SD_P_AB in snp.param.indiv must be numeric.  Also check for missing values.")
    }
    if(sum(!is.numeric(snp.param.indiv$MEAN_P_BB)) > 0) {
      stop("MEAN_P_BB in snp.param.indiv must be numeric.  Also check for missing values.")
    }
    if(sum(!is.numeric(snp.param.indiv$SD_P_BB)) > 0) {
      stop("SD_P_BB in snp.param.indiv must be numeric.  Also check for missing values.")
    }
    if(sum(!is.character(snp.param.indiv$A_ALLELE)) > 0) {
      stop("A_ALLELE in snp.param.indiv must be a character.  Check for missing values.")
    }
    if(sum(!is.character(snp.param.indiv$B_ALLELE)) > 0) {
      stop("B_ALLELE in snp.param.indiv must be a character.  Check for missing values.")
    }
    
    if(sum(is.na(snp.param.indiv)) > 0) {
      stop("Check inputs in snp.param.indiv  Are there missing values?  Are columns of the correct class?")
    }
    
    if("Least_squares" %in% method) {
      snp.param.indiv <- snp.param.indiv[,c("SNP_ID", "MEAN_P_AA", "SD_P_AA", "MEAN_P_AB", "SD_P_AB", "MEAN_P_BB", "SD_P_BB", 
                                "WELCH_A", "WELCH_B",  "A_ALLELE", "B_ALLELE")]#"B_ALLELE_FREQ",
    } else {
      snp.param.indiv <- snp.param.indiv[,c("SNP_ID", "MEAN_P_AA", "SD_P_AA", "MEAN_P_AB", "SD_P_AB", "MEAN_P_BB", "SD_P_BB", 
                                "A_ALLELE", "B_ALLELE")]#"B_ALLELE_FREQ", 
    }
    
    if(sum(snp.param.indiv[,c("SD_P_AA", "SD_P_AB", "SD_P_BB")] < 0) != 0) {
      stop("All values in the SD columns of snp.param.indiv must be greater than zero")
    }
    
    #Check that SNP are not duplicated
    if(sum(duplicated(snp.param.indiv[,"SNP_ID"])) > 0) {
      stop("SNP_IDs must not be duplicated in snp.param.indiv")
    }
    
    #Check that B_ALLELE_FREQ <=1 and >=0
    #  if(sum(snp.param.indiv[,"B_ALLELE_FREQ"] <=1 & snp.param.indiv[,"B_ALLELE_FREQ"] >= 0) != nrow(snp.param.indiv)) {
    #    stop("Data in the B_ALLELE_FREQ column must be between 0 and 1 inclusive")
    #  }       
    
    #Check that A_ALLELE and B_ALLELE data are one of A, C, G, or T
    if(sum(snp.param.indiv[,"A_ALLELE"] %in% c("A", "C", "G", "T") & 
           snp.param.indiv[,"B_ALLELE"] %in% c("A", "C", "G", "T")) !=  nrow(snp.param.indiv)) {
      stop("A_ALLELE and B_ALLELE data in snp.param.indiv must be \'A\', \'C\', \'G\', or \'T\'")
    }
    
    #Check that A_ALLELE is not the same as B_ALLELE
    if(sum(snp.param.indiv[,"A_ALLELE"] == snp.param.indiv[,"B_ALLELE"]) > 0) {
      stop("A_ALLELE and B_ALLELE must be different for each SNP in snp.param.indiv")
    }
    
    if("Least_squares" %in% method) {
      snp.param.indiv$WELCH_A <- as.numeric(snp.param.indiv$WELCH_A)  
      snp.param.indiv$WELCH_B <- as.numeric(snp.param.indiv$WELCH_B)  
      
      if(sum(!is.numeric(snp.param.indiv$WELCH_B)) > 0) {
        stop("WELCH_B in snp.param.indiv must be numeric.  Also check for missing values.")
      }
      if(sum(!is.numeric(snp.param.indiv$WELCH_B)) > 0) {
        stop("WELCH_B in snp.param.indiv must be numeric.  Also check for missing values.")
      }
    }
    
    #Check that there is only one nucleotide in column A_ALLELE for each SNP
    tmp.1 <-  unique(snp.param.indiv[,c("SNP_ID","A_ALLELE")])
    if(sum(unique(snp.param.indiv[,"SNP_ID"]) != tmp.1[,c("SNP_ID")]) > 0) {
      stop("There is more than one nucleotide in column A_ALLELE for at least one SNP in snp.param.indiv")
    }
    rm(tmp.1)
    
    #Check that there is only one nucleotide in column B_ALLELE for each SNP  
    tmp.1 <-  unique(snp.param.indiv[,c("SNP_ID","B_ALLELE")])
    if(sum(unique(snp.param.indiv[,"SNP_ID"]) != tmp.1[,c("SNP_ID")]) > 0) {
      stop("There is more than one nucleotide in column B_ALLELE for at least one SNP in snp.param.indiv")
    }
    rm(tmp.1)
    
    }
    
    #order
    snp.param.indiv <- snp.param.indiv[order(snp.param.indiv[,"SNP_ID"]),]
    
    # snp.param.pools checks
    
    if ((discrete.method == "geno.probs" | sum(c("Quantitative") %in% method) > 0) &
        is.null(snp.param.pools)) {
      stop("snp.param.pools must be specified in parent.assign.fun")
    }
    
    if ((sum(c("Quantitative", "Discrete", "Exclusion") %in% method) > 0 &
        discrete.method == "geno.probs") |
        "Quantitative" %in% method) {
      
      #Check that required headings are present in snp.param.pools  
      genotypes <- genotypes.fun(n.in.pools*2)
      
      if(sum(c("SNP_ID", paste("MEAN_P_", genotypes, sep=""), paste("SD_P_", genotypes, sep=""), "A_ALLELE", "B_ALLELE") %in% colnames(snp.param.pools)) != 
         (length(genotypes)*2 +3)) {
        stop(paste("snp.param.pools input (given the value of n.in.pools) must contain the following headings:", c("SNP_ID", 
                                                                                                                   paste("MEAN_P_", genotypes, sep=""), paste("SD_P_", genotypes, sep=""), "A_ALLELE", "B_ALLELE")))
      }
      
      snp.param.pools <- snp.param.pools[,c("SNP_ID", paste("MEAN_P_", genotypes, sep=""), paste("SD_P_", genotypes, sep=""), "A_ALLELE", "B_ALLELE")]
      
      snp.param.pools$SNP_ID   <- as.character(snp.param.pools$SNP_ID)  
      snp.param.pools$A_ALLELE <- as.character(snp.param.pools$A_ALLELE) 
      snp.param.pools$B_ALLELE <- as.character(snp.param.pools$B_ALLELE) 
      
      if(sum(!is.character(snp.param.pools$SNP_ID)) > 0) {
        stop("SNP_ID in snp.param.pools must be a character.  Check for missing values.")
      }
      if(sum(!is.character(snp.param.pools$A_ALLELE)) > 0) {
        stop("A_ALLELE in snp.param.pools must be a character.  Check for missing values.")
      }
      if(sum(!is.character(snp.param.pools$B_ALLELE)) > 0) {
        stop("B_ALLELE in snp.param.pools must be a character.  Check for missing values.")
      }
      
      for(genotype in genotypes) {
        tmp1 <- paste("MEAN_P_", genotype, sep="")
        tmp2 <- paste("SD_P_", genotype, sep="")
        snp.param.pools[,tmp1] <- as.numeric(snp.param.pools[,tmp1]) 
        snp.param.pools[,tmp2] <- as.numeric(snp.param.pools[,tmp2]) 
        rm(tmp1,tmp2)
      }
      
      if(sum(is.na(snp.param.pools)) > 0) {
        stop("Check inputs in snp.param.pools.  Are there missing values?  Are columns of the correct class?")
      }
      
      if(sum(snp.param.pools[,c(paste("SD_P_", genotypes, sep=""))] < 0) != 0) {
        stop("All values in the SD columns of snp.param.pools must be greater than zero")
      }
      
      #Check that SNP are not duplicated
      if(sum(duplicated(snp.param.pools[,"SNP_ID"])) > 0) {
        stop("SNP_IDs must no be duplicated in snp.param.pools")
      }
      
      #Check that A_ALLELE and B_ALLELE data are one of A, C, G, or T
      if(sum(snp.param.pools[,"A_ALLELE"] %in% c("A", "C", "G", "T") & 
             snp.param.pools[,"B_ALLELE"] %in% c("A", "C", "G", "T")) !=  nrow(snp.param.pools)) {
        stop("A_ALLELE and B_ALLELE data in snp.param.pools must be \'A\', \'C\', \'G\', or \'T\'")
      }
      
      #Check that A_ALLELE is not the same as B_ALLELE
      if(sum(snp.param.pools[,"A_ALLELE"] == snp.param.pools[,"B_ALLELE"]) > 0) {
        stop("A_ALLELE and B_ALLELE must be different for each SNP in snp.param.pools")
      }
      
      #Check SNP_ID and A_ALLELE/B_ALLELE combinations
      if(sum(unique(snp.param.pools[,c("SNP_ID", "A_ALLELE")]) == unique(snp.param.indiv[,c("SNP_ID", "A_ALLELE")])) 
         != (2*length(unique(snp.param.indiv$SNP_ID)))) {
        stop("SNP_ID and A_ALLELE combinations are different between snp.param.indiv and snp.param.pools")
      }
      
      #Check SNP_ID and A_ALLELE/B_ALLELE combinations
      if(sum(unique(snp.param.pools[,c("SNP_ID", "B_ALLELE")]) == unique(snp.param.indiv[,c("SNP_ID", "B_ALLELE")])) 
         != (2*length(unique(snp.param.indiv$SNP_ID)))) {
        stop("SNP_ID and B_ALLELE combinations are different between snp.param.indiv and snp.param.pools")
      }
      
      if(!nrow(unique(snp.param.pools[,c("SNP_ID", "A_ALLELE")])) == length(unique(snp.param.pools[,"SNP_ID"]))) {
        stop("Different A_ALLELEs listed for the same SNP_ID in snp.param.indiv and snp.param.pools")
      }
      
      if(!nrow(unique(snp.param.pools[,c("SNP_ID", "B_ALLELE")])) == length(unique(snp.param.pools[,"SNP_ID"]))) {
        stop("Different B_ALLELEs listed for the same SNP_ID in snp.param.indiv and snp.param.pools")
      }
      
    }
    
    #Check that there is only one nucleotide in column A_ALLELE for each SNP
    tmp.1 <-  unique(snp.param.pools[,c("SNP_ID","A_ALLELE")])
    if(sum(unique(snp.param.pools[,"SNP_ID"]) != tmp.1[,c("SNP_ID")]) > 0) {
      stop("There is more than one nucleotide in column A_ALLELE for at least one SNP in snp.param.pools")
    }
    rm(tmp.1)
    
    #Check that there is only one nucleotide in column B_ALLELE for each SNP  
    tmp.1 <-  unique(snp.param.pools[,c("SNP_ID","B_ALLELE")])
    if(sum(unique(snp.param.pools[,"SNP_ID"]) != tmp.1[,c("SNP_ID")]) > 0) {
      stop("There is more than one nucleotide in column B_ALLELE for at least one SNP in snp.param.pools")
    }
    rm(tmp.1)
    
    #order
    snp.param.pools <- snp.param.pools[order(snp.param.pools[,"SNP_ID"]),]
    
    #n.in.pools checks
    
    if(is.na(as.integer(n.in.pools))) {
      stop("n.in.pools must be an integer greater than zero")
    }
    
    n.in.pools <- as.integer(n.in.pools)
    
    if(n.in.pools < 1) {
      stop("n.in.pools must be an integer greater than zero")
    }
    
    # fams checks
    
    #Check that required headings are present in fams  
    if(sum(c("FAMILY_ID", "SIRE_ID", "DAM_ID") %in% colnames(fams)) != 3) {
      stop("fams input must be a data frame containing the following headings: FAMILY_ID, SIRE_ID, DAM_ID")
    }
    
    fams <- fams[,c("FAMILY_ID", "SIRE_ID", "DAM_ID")]
    
    fams$FAMILY_ID <- as.integer(fams$FAMILY_ID)
    fams$SIRE_ID   <- as.integer(fams$SIRE_ID)  
    fams$DAM_ID    <- as.integer(fams$DAM_ID)  
    
    if(sum(!is.integer(fams$FAMILY_ID)) > 0) {
      stop("FAMILY_ID in fams must be an integer.  Also check for missing values.")
    }
    if(sum(!is.integer(fams$SIRE_ID)) > 0) {
      stop("SIRE_ID in fams must be an integer.  Also check for missing values.")
    }
    if(sum(!is.integer(fams$DAM_ID)) > 0) {
      stop("DAM_ID in fams must be an integer.  Also check for missing values.")
    }
    
    if(sum(is.na(fams)) > 0) {
      stop("Check inputs in fams.  Are there missing values?  Are columns of the correct class?")
    }
    
    if(sum(fams < 1) > 0) {
      stop("FAMILY_ID, SIRE_ID and DAM_ID in fams must be integers and greater than 0")
    }
    
    if(!nrow(unique(fams[,c("SIRE_ID", "DAM_ID")])) == length(unique(fams[,c("FAMILY_ID")]))) {
      stop("There are multiple FAMILY_IDs with the same SIRE_ID and DAM_ID in fams")
    }
    
    if(sum(paste(fams[,"SIRE_ID"], fams[,"DAM_ID"], sep = "_") %in% 
           paste(fams[,"DAM_ID"], fams[,"SIRE_ID"], sep = "_")) > 0) {
      stop("There appear to be reciprocal families in fams.  These should be represented by only one row to avoid confusion.")
    }
    
    #Check that SIRE_ID and DAM_ID are present as SAMPLE_ID in snp.dat.indiv
    if(sum(!fams[,"SIRE_ID"] %in% snp.dat.indiv[,"SAMPLE_ID"]) > 0) {
      stop("All SIRE_IDs in fams must be present as SAMPLE_IDs in snp.dat.indiv")
    }
    
    #Check that SIRE_ID and DAM_ID are present as SAMPLE_ID in snp.dat.indiv
    if(sum(!fams[,"DAM_ID"] %in% snp.dat.indiv[,"SAMPLE_ID"]) > 0) {
      stop("All DAM_IDs in fams must be present as SAMPLE_IDs in snp.dat.indiv")
    }
    
    #order
    fams <- fams[order(fams[,"FAMILY_ID"]),]
    
    # fam.set.combns checks
    
    if((is.null(fam.set.combns.by.pool) & !is.null(fam.set.combns)) |
       (!is.null(fam.set.combns.by.pool) & is.null(fam.set.combns)) ) {
      stop("if either fam.set.combns.by.pool or fam.set.combns is NULL then both must be NULL")
    }
    
    if(!is.null(fam.set.combns)) {
    
    #Check that required headings are present in fam.set.combns  
    if(sum(c("FAM_SET_COMBN_ID", "FAM_SET_ID", "FAMILY_ID") %in% colnames(fam.set.combns)) != 3) {
      stop("fam.set.combns input must be a data frame containing the following headings: FAM_SET_COMBN_ID, FAM_SET_ID, FAMILY_ID")
    }
    
    fam.set.combns <- fam.set.combns[,c("FAM_SET_COMBN_ID", "FAM_SET_ID", "FAMILY_ID")]
    
    fam.set.combns$FAM_SET_COMBN_ID  <- as.integer(fam.set.combns$FAM_SET_COMBN_ID)
    fam.set.combns$FAM_SET_ID <- as.integer(fam.set.combns$FAM_SET_ID)  
    fam.set.combns$FAMILY_ID  <- as.integer(fam.set.combns$FAMILY_ID)  
    
    if(sum(!is.integer(fam.set.combns$FAM_SET_COMBN_ID)) > 0) {
      stop("FAM_SET_COMBN_ID in fam.set.combns must be an integer.  Also check for missing values.")
    }
    if(sum(!is.integer(fam.set.combns$FAM_SET_ID)) > 0) {
      stop("FAM_SET_ID in fam.set.combns must be an integer.  Also check for missing values.")
    }
    if(sum(!is.integer(fam.set.combns$FAMILY_ID)) > 0) {
      stop("FAMILY_ID in fam.set.combns must be an integer.  Also check for missing values.")
    }
    
    if(sum(is.na(fam.set.combns)) > 0) {
      stop("Check inputs in fam.set.combns.  Are there missing values?  Are columns of the correct class?")
    }
    
    if(sum(fam.set.combns[,c("FAM_SET_COMBN_ID", "FAM_SET_ID", "FAMILY_ID")] < 1) > 0) {
      stop("FAM_SET_COMBN_ID, FAM_SET_ID and FAMILY_ID in fam.set.combns must be integers and greater than 0")
    }
    
    tmp <- fam.set.combns
    tmp <- as.data.frame(table(tmp[,2:3]))
    tmp$FAM_SET_ID <- as.character(tmp$FAM_SET_ID)
    tmp$FAMILY_ID  <- as.character(tmp$FAMILY_ID)  
    tmp <- tmp[tmp[,"Freq"] != 0,]
    if(identical(!aggregate(tmp$Freq, by = list(tmp$FAM_SET_ID), na.rm=T, FUN = "mean")[2],
                 aggregate(tmp$Freq, by = list(tmp$FAM_SET_ID), na.rm=T, FUN = "max")[2])) {
      stop("FAM_SET_IDs present in multiple FAM_SET_COMBN_IDs do not allcontain the same FAMILY_IDs in fam.set.combns")
    }
    rm(tmp)
    
    #order
    fam.set.combns <- fam.set.combns[order(fam.set.combns[,"FAMILY_ID"]),]
    fam.set.combns <- fam.set.combns[order(fam.set.combns[,"FAM_SET_ID"]),]    
    fam.set.combns <- fam.set.combns[order(fam.set.combns[,"FAM_SET_COMBN_ID"]),]    
    }
    
    # fam.set.combns.by.pool checks
    
    if(!is.null(fam.set.combns.by.pool)) {
    
    #Check that required headings are present in fam.set.combns.by.pool  
    if(sum(c("SAMPLE_ID", "FAM_SET_COMBN_ID") %in% colnames(fam.set.combns.by.pool)) != 2) {
      stop("fam.set.combns.by.pool input must be a data frame containing the following headings: SAMPLE_ID, FAM_SET_COMBN_ID")
    }
    
    fam.set.combns.by.pool <- fam.set.combns.by.pool[,c("SAMPLE_ID", "FAM_SET_COMBN_ID")]
    
    fam.set.combns.by.pool$SAMPLE_ID <- as.integer(fam.set.combns.by.pool$SAMPLE_ID)      
    fam.set.combns.by.pool$FAM_SET_COMBN_ID  <- as.integer(fam.set.combns.by.pool$FAM_SET_COMBN_ID)
    
    if(sum(!is.integer(fam.set.combns.by.pool$SAMPLE_ID)) > 0) {
      stop("SAMPLE_ID in fam.set.combns.by.pool must be an integer.  Also check for missing values.")
    }
    
    if(sum(!is.integer(fam.set.combns.by.pool$FAM_SET_COMBN_ID)) > 0) {
      stop("FAM_SET_COMBN_ID in fam.set.combns.by.pool must be an integer.  Also check for missing values.")
    }
    
    if(sum(is.na(fam.set.combns.by.pool)) > 0) {
      stop("Check inputs in fam.set.combns.by.pool.  Are there missing values?  Are columns of the correct class?")
    }
    
    if(sum(fam.set.combns.by.pool[,c("SAMPLE_ID", "FAM_SET_COMBN_ID")] < 1) > 0) {
      stop("SAMPLE_ID and FAM_SET_COMBN_ID in fam.set.combns.by.pool must be integers and greater than 0")
    }
    
    fam.set.combns.by.pool <- fam.set.combns.by.pool[order(fam.set.combns.by.pool[,"SAMPLE_ID"]),]
    snp.dat.pools <- snp.dat.pools[order(snp.dat.pools[,"SAMPLE_ID"]),]
    if(!identical(as.character(unique(fam.set.combns.by.pool$SAMPLE_ID)), as.character(unique(snp.dat.pools$SAMPLE_ID)))) {
      stop("SAMPLE_ID in snp.dat.pools not the same as in fam.set.combns.by.pool")
    }
    
    fam.set.combns <- fam.set.combns[order(fam.set.combns[,"FAM_SET_COMBN_ID"]),]
    fam.set.combns.by.pool <- fam.set.combns.by.pool[order(fam.set.combns.by.pool[,"FAM_SET_COMBN_ID"]),]
    if(!identical(as.character(unique(fam.set.combns$FAM_SET_COMBN_ID)), as.character(unique(fam.set.combns.by.pool$FAM_SET_COMBN_ID)))) {
      stop("FAM_SET_COMBN_ID in fam.set.combns not the same as in fam.set.combns.by.pool")
    }
    
    #order
    fam.set.combns.by.pool <- fam.set.combns.by.pool[order(fam.set.combns.by.pool[,"SAMPLE_ID"]),]
    
    }
    
    # min.intensity checks
    
    if(is.na(as.numeric(min.intensity))) {
      stop("min.intensity must be a number")
    }
    
    min.intensity <- as.numeric(min.intensity)
    
    if(min.intensity < 0) {
      stop("min.intensity must be greater than or equal to zero")
    }
    
    # min.sd checks
    if(is.na(as.numeric(min.sd))) {
      stop("min.sd must be a number")
    }
    
    min.sd <- as.numeric(min.sd)
    
    if(min.sd < 0) {
      stop("min.sd must be greater than or equal to zero")
    }
    
    # snp.error.assumed checks
    
    if(!is.null(snp.error.assumed)) {
      
      if(is.data.frame(snp.error.assumed)) {
        #Check that required headings are present in fams  
        if(sum(c("SNP_ID", "SNP_ERROR_TILDE") %in% colnames(snp.error.assumed)) != 3) {
          stop("snp.error.assumed input must be a scalar or a data frame containing the following headings: SNP_ID, SNP_ERROR_TILDE")
        }
        
        snp.error.assumed <- snp.error.assumed[,c("SNP_ID", "SNP_ERROR_TILDE")]
        
        snp.error.assumed$SNP_ID <- as.character(snp.error.assumed$SNP_ID)
        snp.error.assumed$SNP_ERROR_TILDE   <- as.numeric(snp.error.assumed$SNP_ERROR_TILDE)  
        
        if(sum(is.na(snp.error.assumed)) > 0) {
          stop("Check inputs in snp.error.assumed  Are there missing values?  Are columns of the correct class?")
        }
        
        if(sum(snp.error.assumed$SNP_ERROR_TILDE < 0) > 0 |
           sum(snp.error.assumed$SNP_ERROR_TILDE > 1) > 0 ) {
          stop("SNP_ERROR_TILDE in snp.error.assumed must be a number between 0 and 1 inclusive")
        }
        
      } else {
        
        if(is.na(as.numeric(snp.error.assumed))) {
          stop("If snp.error.assumed is a scalar it must be between 0 and 1 inclusive")
        }
        
        snp.error.assumed <- as.numeric(snp.error.assumed)
        
        if(snp.error.assumed < 0 | snp.error.assumed > 1 ) {
          stop("If snp.error.assumed is a scalar it must be between 0 and 1 inclusive")
        }
        
      }
      
    }
    
    # snp.error.underlying checks
    
    if(sum(c("Quantitative", "Discrete", "Least_squares") %in% method) > 0) {
    
    if(is.null(snp.error.assumed)) {
      
      if(is.na(as.numeric(snp.error.underlying))) {
        stop("snp.error.underlying must be between 0 and 1 inclusive")
      }
      
      snp.error.underlying <- as.numeric(snp.error.underlying)
      
      if(snp.error.underlying. < 0 | snp.error.underlying > 1 ) {
        stop("snp.error.underlying must be between 0 and 1 inclusive")
      }
    }
    }
    
    # threshold.indiv checks
    
    if(sum(c("Discrete", "Exclusion") %in% method) > 0 &
       discrete.method == "geno.probs") {
      
      if(is.null(threshold.indiv)) {
        stop("threshold.indiv must be specified if method includes \'Discrete\' or \'Exclusion\' and discrete.method equals \'geno.probs\' ")
      }
      
      if(is.na(as.numeric(threshold.indiv))) {
        stop("threshold.indiv must be between 0 and 1 inclusive")
      }
      
      threshold.indiv <- as.numeric(threshold.indiv)
      
      if(threshold.indiv < 0 | threshold.indiv > 1 ) {
        stop("threshold.indiv must be between 0 and 1 inclusive")
      }
    }
    
    # threshold.pools checks
    
    if(sum(c("Discrete", "Exclusion") %in% method) > 0 &
       discrete.method == "geno.probs") {
      
      if(is.null(threshold.pools)) {
        stop("threshold.pools must be specified if method includes \'Discrete\' or \'Exclusion\'")
      }
      
      if(is.na(as.numeric(threshold.pools))) {
        stop("threshold.pools must be between 0 and 1 inclusive")
      }
      
      threshold.pools <- as.numeric(threshold.pools)
      
      if(threshold.pools < 0 | threshold.pools > 1 ) {
        stop("threshold.pools must be between 0 and 1 inclusive")
      }
    }
    
    
    if("Least_squares" %in% method) {
      
      # file.name checks 
      
      if(is.na(as.character(file.name))) {
        stop("file.name must be a string")
      }
      
      file.name <- as.character(file.name)
      
      # var checks   
      
      if(is.na(as.character(var))) {
        stop("var must be \'BETA_STAR\' or \'BETA_HAT\'")
      }
      
      var <- as.character(var)
      
      if(!var %in% c("BETA_STAR", "BETA_HAT")) {
        stop("var must be \'BETA_STAR\' or \'BETA_HAT\'")
      }
      
      # heading checks   
      
      if(is.na(as.character(heading))) {
        stop("heading must be a string")
      }
      
      heading <- as.character(heading)
      
      #plot.to.heading.height check 
      
      if(is.na(as.numeric(plot.to.heading.height))) {
        stop("plot.to.heading.height must be a number")
      }
      
      plot.to.heading.height <- as.numeric(plot.to.heading.height)
      
      if(plot.to.heading.height < 0) {
        stop("plot.to.heading.height must be greater than zero")
      }
      
      #font.size.heading check 
      
      if(is.na(as.numeric(font.size.heading))) {
        stop("font.size.heading must be a number")
      }
      
      font.size.heading <- as.numeric(font.size.heading)
      
      if(font.size.heading < 0) {
        stop("font.size.heading must be greater than zero")
      }
      
      #font.size.y.axis check 
      
      if(is.na(as.numeric(font.size.y.axis))) {
        stop("font.size.y.axis must be a number")
      }
      
      font.size.y.axis <- as.numeric(font.size.y.axis)
      
      if(font.size.y.axis < 0) {
        stop("font.size.y.axis must be greater than zero")
      }
      
      #font.size.y.axis check 
      
      if(is.na(as.numeric(font.size.x.axis))) {
        stop("font.size.x.axis must be a number")
      }
      
      font.size.x.axis <- as.numeric(font.size.x.axis)
      
      if(font.size.x.axis < 0) {
        stop("font.size.x.axis must be greater than zero")
      }
    }
    
    #Cross-data-frame checks
    
    #snp.dat.pools$SAMPLE_ID and snp.dat.indiv$SAMPLE_ID must all be different 
    if(sum(snp.dat.pools$SAMPLE_ID %in% snp.dat.indiv$SAMPLE_ID) > 0) {
      stop("SAMPLE_IDs in snp.dat.indiv must be different to SAMPLE_IDs in snp.dat.pools")
    }
    
    #    SNP_ID list must be the same in snp.dat.indiv, snp.dat.pools, snp.param.indiv, snp.param.pools (where relevant)
    tmp <- unique(snp.dat.indiv[order(snp.dat.indiv[,"SNP_ID"]),"SNP_ID"])
    if(!identical(tmp,unique(snp.dat.pools[order(snp.dat.pools[,"SNP_ID"]),"SNP_ID"]))) {
      stop("All SNP_IDs must be represented in both snp.dat.indiv and snp.dat.pools")
    }
    
    if ((sum(c("Discrete", "Exclusion") %in% method) > 0 &
         discrete.method == "geno.probs")|
        (sum(c("Quantitative", "Least_squares") %in% method)) > 0) {
          if(!identical(tmp,unique(snp.param.indiv$SNP_ID))) {
            stop("All SNP_IDs must be represented in each of snp.dat.indiv, snp.dat.pools, and snp.param.indiv")
          }
        }
        if ((sum(c("Discrete", "Exclusion") %in% method) > 0 &
             discrete.method == "geno.probs")|
            "Quantitative" %in% method) {
          if(!identical(tmp,unique(snp.param.pools$SNP_ID))) {
            stop("All SNP_IDs must be represented in each of snp.dat.indiv, snp.dat.pools, snp.param.indiv, snp.param.pools")
          }
        } 
    rm(tmp)
    
    #fams$SIRE_ID must be in snp.dat.indiv$SAMPLE_ID
    if(!sum(fams$SIRE_ID %in% snp.dat.indiv$SAMPLE_ID) == length(fams$SIRE_ID)) {
      stop ("Not all fams$SIRE_IDs are in snp.dat.indiv$SAMPLE_IDs")
    }
    
    #    fams$DAM_ID must be in snp.dat.indiv$SAMPLE_ID
    if(!sum(fams$DAM_ID %in% snp.dat.indiv$SAMPLE_ID) == length(fams$DAM_ID)) {
      stop ("Not all fams$DAM_IDs are in snp.dat.indiv$SAMPLE_IDs")
    }
    
    if(!is.null(fam.set.combns)){
    #fam.set.combns.by.pool$SAMPLE_ID must be in snp.dat.pools$SAMPLE_ID
    if(!sum(fam.set.combns.by.pool$SAMPLE_ID %in% snp.dat.pools$SAMPLE_ID) == length(fam.set.combns.by.pool$SAMPLE_ID)) {
      stop ("Not all fam.set.combns.by.pool$SAMPLE_IDs are in snp.dat.pools$SAMPLE_IDs")
    }
    
    #fam.set.combns$FAMILY_ID must be in fams$FAMILY_ID
    if(!sum(fam.set.combns$FAMILY_ID %in% fams$FAMILY_ID) == length(fam.set.combns$FAMILY_ID)) {
      stop ("Not all fam.set.combns$FAMILY_IDs are in fams$FAMILY_IDs")
    }
    }
    
    if(sum(c("Quantitative", "Discrete", "Least_squares") %in% method) > 0) {
      #snp.error.assumed and snp.error.underlying can't both be NULL
      if (is.null(snp.error.assumed) & is.null(snp.error.underlying)) {
        stop("Both snp.error.assumed and snp.error.underlying are NULL")
      }
    }
    
    #beta.min.ss
    if(!is.logical(beta.min.ss)) {
      stop("beta.min.ss must be either \'TRUE\' or \'FALSE\'")
    }
  }
  
  #End checks############################################################################
  
  # snp.dat.indiv <- snp.dat.indiv[,c("SNP_ID", "SAMPLE_ID", "GENOTYPE")]
  
  #Make outputs NULL
  phi.ij <- NULL
  Gij <- NULL
  flj.probs <- NULL
  flj.geno <- NULL
  parent.combns <- NULL
  lambda.kj <- NULL
  gkj <- NULL
  tclj.quant <- NULL
  tclj.discrete <- NULL
  tclj.ls  <- NULL             
  nlj.probs <- NULL
  nlj.geno <- NULL
  snp.error.probs <- NULL
  snp.error.geno <- NULL
  gklj.adj <- NULL
  dklj.adj <- NULL
  tclj.adj.quant <- NULL 
  tclj.adj.discrete <- NULL
  lod.duos.quant <- NULL
  lod.duos.discrete <- NULL
  logl.duos.quant <- NULL
  logl.duos.discrete <- NULL
  Dij  <- NULL                    
  dkj <- NULL                     
  mismatches  <- NULL             
  mismatches.by.snp <- NULL       
  # mismatches.by.snp.sample <- NULL
  # mismatch.snp.count <- NULL 
  Xl.mat <- NULL
  fkj.and.weight  <- NULL      
  beta     <- NULL            
  most.like.parents.quant <- NULL
  most.like.parents.discrete <- NULL
  most.like.parents.excl <- NULL 
  most.like.parents.excl.non.dup <- NULL 
  
  #if fam.set.combns and fam.set.combns.by.pool are NULL then generate data from 'fams' and  snp.dat.pools
  if(is.null(fam.set.combns) | is.null(fam.set.combns.by.pool)) {
    fam.set.combns <- data.frame(FAM_SET_COMBN_ID = 1,
      FAM_SET_ID = rep(1:n.in.pools, each = nrow(fams)),
      FAMILY_ID = rep(fams[,"FAMILY_ID"], n.in.pools))
    
    fam.set.combns.by.pool <- data.frame(SAMPLE_ID = unique(snp.dat.pools[,"SAMPLE_ID"]),
                                         FAM_SET_COMBN_ID = 1)
  }
  
  #if FAM_SET_COMBN_ID in fam.set.combns not in fam.set.combns.by.pool then remove
  fam.set.combns <- fam.set.combns[fam.set.combns[,"FAM_SET_COMBN_ID"] %in% fam.set.combns.by.pool[,"FAM_SET_COMBN_ID"],] 

  #if FAMILY_ID in fams not in fam.set.combns then remove
  fams <- fams[fams[,"FAMILY_ID"] %in% fam.set.combns[,"FAMILY_ID"],] 
  
  #if individuals are not parents of families in fam.set.combns then delete from snp.dat.indiv and fams
  fams    <- fams[fams[,"FAMILY_ID"] %in% unique(fam.set.combns[,"FAMILY_ID"]),]
  snp.dat.indiv <- snp.dat.indiv[snp.dat.indiv[,"SAMPLE_ID"] %in% unique(c(fams[,"SIRE_ID"], fams[,"DAM_ID"])),]
  
  #run universal preliminary functions
  if(discrete.method == "geno.probs" | "Quantitative" %in% method | "Least_squares" %in% method ) { 
    phi.ij <- phi.ij.fun(snp.dat.indiv = snp.dat.indiv, 
                         snp.param.indiv = snp.param.indiv, 
                         fams = fams,
                         min.sd = min.sd,
                         min.intensity = min.intensity
    ) 
    
    Gij <- Gij.fun(phi.ij = phi.ij)
    
    flj.from.snp.dat <- NULL
    flj.probs    <- flj.from.parent.Gij.fun(Gij = Gij, fam.set.combns = fam.set.combns, fams = fams) 
  }  
  
  if(discrete.method == "assigned.genos") {   
    
  #  snp.dat.indiv$SNP_ID <- as.character(snp.dat.indiv$SNP_ID)
  #  snp.param.indiv$SNP_ID <- as.character(snp.param.indiv$SNP_ID)
  #  snp.dat.indiv <- left_join(snp.dat.indiv, unique(snp.param.indiv[,c("SNP_ID", "A_ALLELE", "B_ALLELE")]), by = "SNP_ID")
    
    flj.geno <- flj.from.snp.dat.fun(snp.dat.indiv = snp.dat.indiv[,c("SAMPLE_ID", "SNP_ID", "A_ALLELE", "B_ALLELE", "GENOTYPE")], 
                                     fam.set.combns = fam.set.combns, 
                                     fams = fams)
  }  
  
  if (sum(c("Quantitative", "Discrete", "Exclusion") %in% method) > 0 ) {
    
    #get n.in.pools
  #  tmp <- colnames(snp.param.pools)[grep("MEAN_P_",colnames(snp.param.pools))][1]
  #  n.in.pools <- (nchar(tmp) - 7)/2
  #  rm(tmp)
    
    #get parent.combns
    tmp <- parent.combns.fun(fams = fams,
                             n.in.pools = n.in.pools, 
                             fam.set.combns = fam.set.combns)
    
    parent.combns         <- tmp$parent.combns
    parent.combns.by.fam.set.combn <- tmp$parent.combns.by.fam.set.combn
    
    rm(tmp)
    
  }       
  
  if((discrete.method == "geno.probs" & sum(c("Discrete", "Exclusion") %in% method) > 0 )|
     sum(c("Quantitative") %in% method) > 0 ) {          
    
    lambda.kj <- lambda.ij.fun(snp.dat.indiv = snp.dat.pools, 
                               snp.param.indiv = snp.param.pools, 
                               n.in.pools = n.in.pools,
                               min.sd = min.sd,
                               min.intensity = min.intensity
    )
    
    #get gkj
    rho.inv <- rho.inv.fun(n.in.pools)
    denominator <- rowSums(lambda.kj[,-c(1:2,ncol(lambda.kj))])
    
    gkj <- lambda.kj[,1:2]
    for(i in 1:nrow(rho.inv)) {
      gkj[,rho.inv[i,"GENOTYPE"]] <- rho.inv[i,"RHO_INV"] * lambda.kj[,2+i] / denominator
    }
    
    gkj <- gkj[order(gkj[,"SNP_ID"]),]
    gkj <- gkj[order(gkj[,"SAMPLE_ID"]),]
  }  
  
  if (sum(c("Quantitative", "Discrete", "Exclusion") %in% method) > 0 ) { 
    
    if(discrete.method == "geno.probs" | "Quantitative" %in% method) {  
      
      #compute nlj
      nlj.probs <- nlj.fun(flj = flj.probs,
                          n.in.pools = n.in.pools)  
      
      #get SNP error
      if(!is.null(snp.error.assumed)) {
        if(length(snp.error.assumed) == 1) {
          snp.error <- data.frame(SNP_ID = snp.param.indiv[,"SNP_ID"],
                                  SNP_ERROR_TILDE = snp.error.assumed)
        } else {
          snp.error <- snp.error.assumed
        }
      } else {
        snp.error.hat <- snp.error.fun(Gij = Gij,
                                       fams = fams, 
                                       parents.only = FALSE)
        snp.error <- snp.error.hat  
        snp.error[,"SNP_ERROR_TILDE"] <- snp.error[,"SNP_ERROR_HAT"]
        snp.error[snp.error[,"SNP_ERROR_TILDE"] < snp.error.underlying,"SNP_ERROR_TILDE"] <- snp.error.underlying
      }              
      snp.error.probs$SNP_ID <- as.character(snp.error.probs$SNP_ID)      
      snp.error.probs        <- snp.error[order(snp.error[,"SNP_ID"]),]
      
      rm(snp.error)
    }
    
    if("Discrete" %in% method & discrete.method == "assigned.genos") {   
      
      #compute nlj
      nlj.geno <- nlj.fun(flj = flj.geno,
                          n.in.pools = n.in.pools)  
      
      #get SNP error
      if(!is.null(snp.error.assumed)) {
        if(length(snp.error.assumed) == 1) {
          snp.error <- data.frame(SNP_ID = unique(snp.dat.indiv[,"SNP_ID"]),
                                  SNP_ERROR_TILDE = snp.error.assumed)
        } else {
          snp.error <- snp.error.assumed
        }
      } else {
        snp.error.hat <- snp.error.fun(Gij = Gij,
                                       fams = fams, 
                                       parents.only = FALSE)
        snp.error <- snp.error.hat  
        snp.error[,"SNP_ERROR_TILDE"] <- snp.error[,"SNP_ERROR_HAT"]
        snp.error[snp.error[,"SNP_ERROR_TILDE"] < snp.error.underlying,"SNP_ERROR_TILDE"] <- snp.error.underlying
      }              
      
      snp.error.geno <- snp.error[order(snp.error[,"SNP_ID"]),]
      rm(snp.error)
    }
  }
  
  if ("Quantitative" %in% method) {
    prelim.ml.quant <- prelim.ml.quant.fun(Gij = Gij,
                                           parent.combns.by.fam.set.combn = parent.combns.by.fam.set.combn,
                                           flj = flj.probs,
                                           gkj = gkj,
                                           snp.error = snp.error.probs,
                                           nlj = nlj.probs,
                                           fam.set.combns.by.pool =fam.set.combns.by.pool) 
    
    tclj.quant               = prelim.ml.quant$tclj
    miss.parent.count.quant = prelim.ml.quant$miss.parent.count
    gklj.adj     = prelim.ml.quant$gklj
    rm(prelim.ml.quant)
    
    ml <- ml.fun(g.d.klj.adj = gklj.adj,
                 tclj = tclj.quant,
                 snp.error = snp.error.probs,
                 nlj = nlj.probs,
                 miss.parent.count = miss.parent.count.quant,
                 parent.combns.by.fam.set.combn = parent.combns.by.fam.set.combn,
                 parent.combns = parent.combns,
                 meth = "quantitative")
    
    gklj.adj  <- ml$g.d.klj.adj
    tclj.adj.quant  <- ml$tclj.adj
    lod.duos.quant  <- ml$duos.lod
    logl.duos.quant <- ml$duos.logl
    most.like.parents.quant <- ml$most.like.parents
    rm(ml)
    
  }
  
  if (sum(c("Discrete", "Exclusion") %in% method) > 0 ) {
    
    if(discrete.method == "assigned.genos") {
      #     snp.dat.pools <- merge(snp.dat.pools, unique(snp.param.indiv[,c("SNP_ID", "A_ALLELE", "B_ALLELE")]), 
      #                             by = "SNP_ID", all.x = TRUE)
   #   snp.dat.pools$SNP_ID    <- as.character(snp.dat.pools$SNP_ID)
  #    snp.param.indiv$SNP_ID    <- as.character(snp.param.indiv$SNP_ID)
  #    snp.dat.pools <- left_join(snp.dat.pools, unique(snp.param.indiv[,c("SNP_ID", "A_ALLELE", "B_ALLELE")]), by = "SNP_ID")
      
      prelim.ml.discrete <- prelim.ml.discrete.assigned.genos.fun(method = method,
                                                                  snp.dat.indiv = snp.dat.indiv,
                                                                  snp.dat.pools = snp.dat.pools,
                                                                  parent.combns.by.fam.set.combn = parent.combns.by.fam.set.combn,
                                                                  flj = flj.geno,
                                                                  snp.error = snp.error.geno,
                                                                  nlj = nlj.geno,
                                                                  fam.set.combns.by.pool =fam.set.combns.by.pool)
    } 
    
    if(discrete.method == "geno.probs") {
      prelim.ml.discrete <- prelim.ml.discrete.geno.probs.fun(
        Gij = Gij,
        threshold.indiv = threshold.indiv,
        snp.dat.indiv = snp.dat.indiv,
        parent.combns.by.fam.set.combn = parent.combns.by.fam.set.combn,
        flj = flj.probs,
        threshold.pools = threshold.pools,
        gkj = gkj,
        lambda.kj = lambda.kj,
        snp.error = snp.error.probs,
        nlj = nlj.probs,
        fam.set.combns.by.pool =fam.set.combns.by.pool)
    }
    
    Dij <- prelim.ml.discrete$Dij
    tclj.discrete <- prelim.ml.discrete$tclj
    miss.parent.count.discrete <- prelim.ml.discrete$miss.parent.count
    threshold.pools <- prelim.ml.discrete$threshold.pools
    dkj <- prelim.ml.discrete$dkj
    dklj.adj <- prelim.ml.discrete$g.d.klj.adj
    rm(prelim.ml.discrete)
  }  
  
  if ("Discrete" %in% method) {
    
    if(discrete.method == "geno.probs") {
      
      ml <- ml.fun(g.d.klj.adj = dklj.adj,
                   tclj = tclj.discrete,
                   snp.error = snp.error.probs,
                   nlj = nlj.probs,
                   miss.parent.count = miss.parent.count.discrete,
                   parent.combns.by.fam.set.combn = parent.combns.by.fam.set.combn,
                   parent.combns = parent.combns,
                   meth = "discrete")
      
      dklj.adj <- ml$g.d.klj.adj
      tclj.adj.discrete <- ml$tclj.adj
      lod.duos.discrete <- ml$duos.lod
      logl.duos.discrete <- ml$duos.logl
      most.like.parents.discrete <- ml$most.like.parents
      rm(ml)
      
    }
    
    if(discrete.method == "assigned.genos") {
      ml <- ml.fun(g.d.klj.adj = dklj.adj,
                   tclj = tclj.discrete,
                   snp.error = snp.error.geno,
                   nlj = nlj.geno,
                   miss.parent.count = miss.parent.count.discrete,
                   parent.combns.by.fam.set.combn = parent.combns.by.fam.set.combn,
                   parent.combns = parent.combns,
                   meth = "discrete")
      
      dklj.adj <- ml$g.d.klj.adj
      tclj.adj.discrete <- ml$tclj.adj
      lod.duos.discrete <- ml$duos.lod
      logl.duos.discrete <- ml$duos.logl
      most.like.parents.discrete <- ml$most.like.parents
      rm(ml)
    }
  }  
  
  if ("Exclusion" %in% method) {
    exclusion <- mismatches.fun(dkj = dkj,
                                tclj = tclj.discrete,
                                miss.parent.count = miss.parent.count.discrete,
                                parent.combns = parent.combns,
                                fam.set.combns.by.pool =fam.set.combns.by.pool) 
    
    mismatches               <- exclusion$duos.mismatches
    mismatches.by.snp        <- exclusion$duos.mismatches.by.snp
    mismatches.by.snp.sample <- exclusion$mismatches.by.snp.sample
    mismatch.snp.count       <- exclusion$mismatch.snp.count
    most.like.parents.excl   <- exclusion$most.like.parents
    most.like.parents.excl.non.dup   <- exclusion$most.like.parents.non.dup
    rm(exclusion)
  }  
  
  if ("Least_squares" %in% method) {
    
    #min.intensity.pools and n.by.pool functionality removed for general parent assign function  
    
    min.intensity.pools <- min.intensity 
    #   if(!beta.min.ss) {
    #      n.by.pool <- NULL
    #    } else {
    #      n.by.pool = data.frame(SAMPLE_ID = unique(fam.set.combns.by.pool$SAMPLE_ID),
    #                             N_INDIV = rep(n.in.pools, length(unique(fam.set.combns.by.pool$SAMPLE_ID))))
    #    }

    least.sq      <- ls.fun(fams = fams,
                            fam.set.combns = fam.set.combns,
                            fam.set.combns.by.pool =fam.set.combns.by.pool,
                            Gij = Gij,
                            flj = flj.probs,
                            snp.dat.pools = snp.dat.pools,
                            snp.param.indiv = snp.param.indiv,
                            min.intensity = min.intensity,
                            beta.min.ss = beta.min.ss)
    
    tclj.ls       <- least.sq$tclj.ls
    fkj.and.weight <- least.sq$fkj.and.weight
    Xl.mat         <- least.sq$Xl.mat
    beta          <- least.sq$beta
    rm(least.sq)
    
    if(!exists("running.sim")) { #don't plot if running simulations    
      wd <- getwd()
      dir.create(file.path(wd, "Results"), showWarnings = FALSE)
      setwd(file.path(wd, "Results"))
      
      for(samp in unique(beta$SAMPLE_ID)) {
        
        tmp.beta <- beta[beta[,"SAMPLE_ID"] == samp,]
        tmp.beta$FAMILY_ID <- as.character(tmp.beta$FAMILY_ID )
        
        bar.plot.fun(beta = tmp.beta,
                     file.name = samp,
                     var = var,
                     heading = heading,
                     plot.to.heading.height = plot.to.heading.height,
                     font.size.heading = font.size.heading,
                     font.size.y.axis = font.size.y.axis,
                     font.size.x.axis = font.size.x.axis)
      }
      setwd(wd)
    }
  }
  
  print(Sys.time())
  
  return(list(most.like.parents.quant = most.like.parents.quant,
              most.like.parents.discrete = most.like.parents.discrete,
              most.like.parents.excl = most.like.parents.excl,
              most.like.parents.excl.non.dup = most.like.parents.excl.non.dup,
              beta = beta,
              Dij = Dij,
              dkj = dkj,
              dklj.adj = dklj.adj,
              fkj.and.weight = fkj.and.weight,
              Gij = Gij,
              gkj = gkj,
              gklj.adj = gklj.adj,
              flj.probs = flj.probs,
              flj.geno = flj.geno,
              lambda.kj = lambda.kj,
              lod.duos.discrete = lod.duos.discrete,
              lod.duos.quant = lod.duos.quant,
              logl.duos.discrete = logl.duos.discrete,
              logl.duos.quant = logl.duos.quant,
              mismatches = mismatches,
              mismatches.by.snp = mismatches.by.snp,
              #    mismatches.by.snp.sample = mismatches.by.snp.sample,
              #    mismatch.snp.count = mismatch.snp.count,
              nlj.probs = nlj.probs,
              nlj.geno = nlj.geno,
              parent.combns = parent.combns,
              phi.ij = phi.ij,
              snp.error.probs = snp.error.probs,
              snp.error.geno = snp.error.geno,
              tclj.adj.quant = tclj.adj.quant,
              tclj.adj.discrete = tclj.adj.discrete,
              tclj.discrete = tclj.discrete,
              tclj.ls = tclj.ls,
              tclj.quant = tclj.quant,
              Xl.mat = Xl.mat
  ))
}

#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################

#' @export

snp.param.pools.fun <- function(snp.param.indiv , #from snp.gen.param.fun
                                n.in.pools
) {
  
  #Estimates mean and standard deviation of allelic proportions for each possible genotype (combination of A and B alleles)
  #in DNA pools for each SNP 
  
  #Args##########################################
  
  # snp.param.indiv:   Data frame (output of snp.gen.param.fun) 
  #            1. SNP_ID    is the SNP identifier
  #            2. MEAN_P_AA   is the mean  of allelic proportion (homozygous allele A)
  #            3. SD_P_AA     is the standard deviation of allelic proporiton (homozygous allele A)
  #            4. MEAN_P_AB   is the mean  of allelic proportion (heterozygous)
  #            5. SD_P_AB     is the standard  deviation of allelic proporiton (heterozygous)
  #            6. MEAN_P_BB   is the mean of allelic proportion (homozygous allele B)
  #            7. SD_P_BB     is the standard deviation of allelic proporiton (homozygous allele B)
  #            8. A_ALLELE    is the base designated as the A allele
  #            9. B_ALLELE    is the base designated as the B allele
  
  #n.in.pools    Integer. Number of individuals in DNA pools. 
  
  #Returns##########################################
  
  # snp.param.pools  Data frame.  Number of columns depends on n.in.pools.  The following is for n.in.pools = 2.
  #            1. SNP_ID        is the SNP identifier
  #            2. MEAN_P_AAAA   is the mean of allelic proportion for individuals of genotype AAAA
  #            3. SD_P_AAAA     is the standard deviation of allelic proportion for individuals of genotype AAAA
  #            4. MEAN_P_AAAB  
  #            5. SD_P_AAAB 
  #            6. MEAN_P_AABB   
  #            7. SD_P_AABB 
  #            8. MEAN_P_ABBB   
  #            9. SD_P_ABBB 
  #            10. MEAN_P_BBBB   
  #            11. SD_P_BBBB 
  #            12. A_ALLELE    is the base designated as the A allele 
  #            13. B_ALLELE    is the base designated as the B allele
  
  if(sum(c("SNP_ID", "MEAN_P_AA", "SD_P_AA", "MEAN_P_AB", "SD_P_AB", "MEAN_P_BB", "SD_P_BB", "A_ALLELE", "B_ALLELE") %in% 
         colnames(snp.param.indiv)) != 9) {
    stop("snp.param.indiv input must be a data frame containing the following headings: SNP_ID, MEAN_P_AA, SD_P_AA, MEAN_P_AB, SD_P_AB, MEAN_P_BB, SD_P_BB, A_ALLELE, B_ALLELE")
  }
  
  #generate alleles data frame
  alleles.by.snp <- snp.param.indiv[,c("SNP_ID","A_ALLELE", "B_ALLELE")]
  
  #Generate list of genotypes
  genotypes <- genotypes.fun(n=(2*n.in.pools))
  
  #Generate means
  means <- matrix(NA, nrow = nrow(snp.param.indiv), ncol = length(genotypes) + 1)
  means <- as.data.frame(means)
  colnames(means) <- c("SNP_ID", paste("MEAN_P_",genotypes, sep=""))
  means[,"SNP_ID"] <- snp.param.indiv[,"SNP_ID"]
  
  means[,2]                   <- snp.param.indiv[,"MEAN_P_AA"] #homozygous A
  means[,(ncol(means)/2 + 1)] <- snp.param.indiv[,"MEAN_P_AB"] #50:50 A:B
  means[,ncol(means)]         <- snp.param.indiv[,"MEAN_P_BB"] #homozygous B
  
  for (column in 2:(2+2*n.in.pools)) {
    
    if(column > 2 & column < (ncol(means)/2 + 1)) {
      means[,column] <- (snp.param.indiv[,"MEAN_P_AB"] * (column - 2) + 
                           snp.param.indiv[,"MEAN_P_AA"]  * ((ncol(means)/2 + 1) - column)) /
        (column - 2 + ncol(means)/2 + 1 - column)           
    }
    
    if(column > (ncol(means)/2 + 1) & column < ncol(means)) {
      means[,column] <- (snp.param.indiv[,"MEAN_P_BB"] * (column - (ncol(means)/2 + 1) ) + 
                           snp.param.indiv[,"MEAN_P_AB"]  * (ncol(means) - column)) /
        (column - (ncol(means)/2 + 1) + ncol(means) - column)
    }
  }
  
  #Generate standard deviations
  sds <- matrix(NA, nrow = nrow(snp.param.indiv), ncol = length(genotypes) + 1)
  sds <- as.data.frame(sds)
  colnames(sds) <- c("SNP_ID", paste("SD_P_",genotypes, sep=""))
  sds[,"SNP_ID"] <- snp.param.indiv[,"SNP_ID"]
  
  sds[,2]                 <- snp.param.indiv[,"SD_P_AA"] #homozygous A
  sds[,(ncol(sds)/2 + 1)] <- snp.param.indiv[,"SD_P_AB"] #50:50 A:B
  sds[,ncol(sds)]         <- snp.param.indiv[,"SD_P_BB"] #homozygous B
  
  for (column in 2:(2+2*n.in.pools)) {
    
    if(column > 2 & column < (ncol(sds)/2 + 1)) {
      sds[,column] <- sqrt((snp.param.indiv[,"SD_P_AB"]^2 * (column - 2) + 
                              snp.param.indiv[,"SD_P_AA"]^2  * ((ncol(sds)/2 + 1) - column)) /
                             (column - 2 + ncol(sds)/2 + 1 - column))      
    }
    
    if(column > (ncol(sds)/2 + 1) & column < ncol(sds)) {
      sds[,column] <- sqrt((snp.param.indiv[,"SD_P_BB"]^2 * (column - (ncol(sds)/2 + 1) ) + 
                              snp.param.indiv[,"SD_P_AB"]^2  * (ncol(sds) - column)) /
                             (column - (ncol(sds)/2 + 1) + ncol(sds) - column))
    }
  }
  
  snp.param.pools <- cbind(means, sds[,-1])
  snp.param.pools <- snp.param.pools[,order(c("A",genotypes, genotypes))] #reorder
  snp.param.pools[,"SNP_ID"] <- as.character(snp.param.pools[,"SNP_ID"])
  
  snp.param.pools$SNP_ID <- as.character(snp.param.pools$SNP_ID)
  # snp.param.pools <- merge(snp.param.pools, map[,c("SNP_ID", "A_ALLELE", "B_ALLELE")], by = "SNP_ID", all.x = TRUE)
  snp.param.pools$SNP_ID <- as.character(snp.param.pools$SNP_ID)
  alleles.by.snp$SNP_ID             <- as.character(alleles.by.snp$SNP_ID)
  snp.param.pools <- left_join(snp.param.pools, alleles.by.snp[,c("SNP_ID", "A_ALLELE", "B_ALLELE")], by = "SNP_ID")
  snp.param.pools <- snp.param.pools[order(snp.param.pools$SNP_ID),]
  
  return(snp.param.pools)
  
}
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################

#' @export

genotypes.fun <- function(n) {
  genotypes <- matrix("X", nrow = 1+n, ncol = n)
  for (row in 1:(1+n)) {
    genotypes[row,] <- c(rep("A", n - row + 1), rep("B", (row - 1)))
  }
  genotypes <- apply(genotypes, 1, paste, collapse='')
  return(genotypes)
}

#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################

#' @export

phi.ij.fun <- function(snp.dat.indiv, 
                       snp.param.indiv, 
                       fams,
                       min.sd = 0,
                       min.intensity = 0
) {  
  
  #Returns the elements ("PHI.AA", "PHI.AB", "PHI.BA", "PHI.BB") of the matrix shown in the 
  #top left of page 4 of Henshall et al. 2014.  Note that it is assumed that PHI.AB = PHI.BA.
  #Also returns the allelic proportion (pij) (see Equation 1 of Henshall et al. 2014). 
  
  #Retains parents only
  
  #Required functions:
  # pij.fun
  
  #Args##########################################
  # snp.dat.indiv: Data frame.  NOTE: only data for relevant offspring, sires and dams should be 
  #          included (i.e. data for non-relevant individuals and any pools should not be included 
  #          in this data frame).
  #            1. SAMPLE_ID  is the individual identifier
  #            2. SNP_ID    is the SNP identifier
  #            3. INTENSITY_A    is the area/intensity for allele A
  #            4. INTENSITY_B    is the area/intensity for allele B
  
  # snp.param.indiv:   Data frame (output of snp.gen.param.fun) 
  #            1. SNP_ID    is the SNP identifier
  #            2. MEAN_P_AA   is the mean  of allelic proportion (homozygous allele A)
  #            3. SD_P_AA     is the standard deviation of allelic proporiton (homozygous allele A)
  #            4. MEAN_P_AB   is the mean  of allelic proportion (heterozygous)
  #            5. SD_P_AB     is the standard  deviation of allelic proporiton (heterozygous)
  #            6. MEAN_P_BB   is the mean of allelic proportion (homozygous allele B)
  #            7. SD_P_BB     is the standard deviation of allelic proporiton (homozygous allele B)
  
  # min.sd       Number. Standard deviation of allelic proportion  fixed to this value if less 
  #              than it.
  
  # min.intensity      Number used in pij.fun. If sqrt((snp.dat.indiv$INTENSITY_A)^2 +
  #              (snp.dat.indiv$INTENSITY_B)^2) less than this value
  #              then set allelic proportion to missing (see end of page 3 of Henshall et al 2014).
  #              Essentially removes observations that fall into the lower left of INTENSITY_A
  #              by INTENSITY_B scatter plot.
  
  #Returns##########################################
  # phi.ij:  Data frame 
  #            1.   SAMPLE_ID is the individual identifier
  #            2.   SNP_ID is the SNP identifier 
  #            3-6. AA_PHI, AB_PHI, BA_PHI, BB_PHI are the elements ("PHI.AA", "PHI.AB", "PHI.BA", 
  #                 "PHI.BB") of the matrix shown in the top left of page 4 of Henshall 
  #                 et al. 2014 
  #            7.   ALLELIC_PROP_INDIV is the allellic proportion (Equation 1 of Henshall 
  #                 et al. 2014) 
  
  print("Running phi.ij.fun")
  
  if(sum(c("FAMILY_ID", "SIRE_ID", "DAM_ID") %in% 
         colnames(fams)) != 3) {
    stop("fams input must be a data frame containing the following headings: FAMILY_ID, SIRE_ID, DAM_ID")
  }
  
  fams$FAMILY_ID    <- as.integer(fams$FAMILY_ID)
  fams$SIRE_ID   <- as.integer(fams$SIRE_ID)
  fams$DAM_ID    <- as.integer(fams$DAM_ID)
  fams <- fams[,c("FAMILY_ID", "SIRE_ID", "DAM_ID")]
  
  #If NA present as parents in fam then convert to 0
  fams[is.na(fams[,"SIRE_ID"]) ,"SIRE_ID"] <- "0"   
  fams[is.na(fams[,"DAM_ID"]) ,"DAM_ID"]   <- "0"  
  
  #identify parents
  parents <- unique(c(fams[,"SIRE_ID"],fams[,"DAM_ID"]))
  parents <- parents[parents != "0"]  
  
  #Check that all parents are in snp.dat.indiv.parents
  if(sum(!(parents %in% snp.dat.indiv[,"SAMPLE_ID"])) > 0) {
    stop("Not all parents in fams are present in snp.dat.indiv")
  }  
  
  #only retain parents in snp.dat.indiv
  snp.dat.indiv <- snp.dat.indiv[snp.dat.indiv[,"SAMPLE_ID"] %in% parents,]
  
  #Check that all headings are present in inputs  
  if(sum(c("SAMPLE_ID", "SNP_ID", "INTENSITY_A", "INTENSITY_B") %in% 
         colnames(snp.dat.indiv)) != 4) {
    stop("snp.dat.indiv input must be a data frame containing the following headings: SAMPLE_ID, SNP_ID, INTENSITY_A, INTENSITY_B")
  }
  
  if(sum(c("SNP_ID", "MEAN_P_AA", "SD_P_AA", "MEAN_P_AB", "SD_P_AB", "MEAN_P_BB", "SD_P_BB") %in% 
         colnames(snp.param.indiv)) != 7) {
    stop("snp.param.indiv input must be a data frame containing the following headings: SNP_ID, MEAN_P_AA, SD_P_AA, MEAN_P_AB, SD_P_AB, MEAN_P_BB, SD_P_BB")
  }
  
  #Name columns and assign class
  # colnames(snp.dat.indiv) <- c("SAMPLE_ID", "SNP_ID", "INTENSITY_A", "INTENSITY_B")
  snp.dat.indiv$SAMPLE_ID  <- as.integer(snp.dat.indiv$SAMPLE_ID)
  snp.dat.indiv$SNP_ID    <- as.character(snp.dat.indiv$SNP_ID)
  snp.dat.indiv$INTENSITY_A    <- as.numeric(snp.dat.indiv$INTENSITY_A)
  snp.dat.indiv$INTENSITY_B    <- as.numeric(snp.dat.indiv$INTENSITY_B)
  snp.dat.indiv <- snp.dat.indiv[,c("SAMPLE_ID", "SNP_ID", "INTENSITY_A", "INTENSITY_B")]
  
  #colnames(snp.param.indiv) <- c("SNP_ID", "MEAN_P_AA", "SD_P_AA", "MEAN_P_AB", "SD_P_AB", "MEAN_P_BB", "SD_P_BB")
  snp.param.indiv$SNP_ID    <- as.character(snp.param.indiv$SNP_ID)
  snp.param.indiv$MEAN_P_AA   <- as.numeric(snp.param.indiv$MEAN_P_AA)
  snp.param.indiv$SD_P_AA     <- as.numeric(snp.param.indiv$SD_P_AA)
  snp.param.indiv$MEAN_P_AB   <- as.numeric(snp.param.indiv$MEAN_P_AB)
  snp.param.indiv$SD_P_AB     <- as.numeric(snp.param.indiv$SD_P_AB)
  snp.param.indiv$MEAN_P_BB   <- as.numeric(snp.param.indiv$MEAN_P_BB)
  snp.param.indiv$SD_P_BB     <- as.numeric(snp.param.indiv$SD_P_BB)
  snp.param.indiv <- snp.param.indiv[,c("SNP_ID", "MEAN_P_AA", "SD_P_AA", "MEAN_P_AB", "SD_P_AB", "MEAN_P_BB", "SD_P_BB")]
  
  #Check for duplicated records in snp.dat.indiv
  indiv.snp <- paste(snp.dat.indiv$SAMPLE_ID,snp.dat.indiv$SNP_ID, sep=".")
  if(sum(duplicated(indiv.snp)) > 0) {
    stop("SAMPLE_ID and SNP_ID combinations are not unique in snp.dat.indiv.  Delete duplicates or recode SAMPLE_ID.")
  }
  rm(indiv.snp)
  
  # Check the list of SNPs the same in input files
  if(sum(unique(snp.dat.indiv[order(snp.dat.indiv[,"SNP_ID"]),"SNP_ID"]) != 
         unique(snp.param.indiv[order(snp.param.indiv[,"SNP_ID"]),"SNP_ID"]))>0) {
    stop("SNP identifiers do not match in snp.dat.indiv and snp.param.indiv")
  }
  
  #Get allelic proportion
  snp.allelic.prop <- pij.fun(snp.dat.indiv = snp.dat.indiv, min.intensity = min.intensity)
  print("Still running phi.ij.fun")
  
  #Rename columns
  colnames(snp.allelic.prop)[colnames(snp.allelic.prop) == "SAMPLE_ID"]    <- "SAMPLE_ID"
  colnames(snp.allelic.prop)[colnames(snp.allelic.prop) == "ALLELIC_PROP"] <- "ALLELIC_PROP_INDIV"  
  colnames(snp.allelic.prop)[colnames(snp.allelic.prop) == "INTENSITY"]    <- "INTENSITY_INDIV"  
  
  #merge data using cbind
  if(identical(snp.dat.indiv$SAMPLE_ID,snp.allelic.prop$SAMPLE_ID) &
     identical(snp.dat.indiv$SNP_ID,snp.allelic.prop$SNP_ID)) {
    snp.dat.indiv <- cbind(snp.dat.indiv, snp.allelic.prop[,!colnames(snp.allelic.prop) %in% c("SAMPLE_ID", "SNP_ID", "INTENSITY_A", "INTENSITY_B")])
  } else {
    stop("SAMPLE_ID and SNP_ID columns of pij.fun output do not match those of snp.dat.indiv.  Not sure why.")
  }
  
  # Standard deviation of allelic proporiton fixed to min.sd if less than min.sd or is NA
  snp.param.indiv[snp.param.indiv[,"SD_P_AA"] < min.sd & !is.na(snp.param.indiv[,"SD_P_AA"]), "SD_P_AA"] <- min.sd
  snp.param.indiv[snp.param.indiv[,"SD_P_AB"] < min.sd & !is.na(snp.param.indiv[,"SD_P_AB"]), "SD_P_AB"] <- min.sd
  snp.param.indiv[snp.param.indiv[,"SD_P_BB"] < min.sd & !is.na(snp.param.indiv[,"SD_P_BB"]), "SD_P_BB"] <- min.sd
  
  snp.param.indiv[is.na(snp.param.indiv[,"SD_P_AA"]), "SD_P_AA"] <- min.sd
  snp.param.indiv[is.na(snp.param.indiv[,"SD_P_AB"]), "SD_P_AB"] <- min.sd
  snp.param.indiv[is.na(snp.param.indiv[,"SD_P_BB"]), "SD_P_BB"] <- min.sd
  
  #Ensure all SD are > 0 if mean is not NA
  if(sum(
    (
      !is.na(snp.param.indiv[,"MEAN_P_AA"]) &
      !is.na(snp.param.indiv[,"MEAN_P_AB"]) &
      !is.na(snp.param.indiv[,"MEAN_P_BB"])
    ) & 
    
    (
      (snp.param.indiv[,"SD_P_AA"] <= 0 & !is.na(snp.param.indiv[,"SD_P_AA"])) |
      (snp.param.indiv[,"SD_P_AB"] <= 0 & !is.na(snp.param.indiv[,"SD_P_AB"])) |
      (snp.param.indiv[,"SD_P_BB"] <= 0 & !is.na(snp.param.indiv[,"SD_P_BB"])) |
      is.na(snp.param.indiv[,"SD_P_AA"]) |
      is.na(snp.param.indiv[,"SD_P_AB"]) |
      is.na(snp.param.indiv[,"SD_P_BB"])
    )
  ) > 0) {
    stop("At least one element of SD_P_AA, SD_P_AB or SD_P_BB equals zero or is NA where P.AA, P.AB and P.BB are not NA in snp.param.indiv.  Must specify min.sd that is greater than 0 or modify snp.param.indiv.")
  }
  
  #Merge data
  # phi.ij <- merge(snp.dat.indiv, snp.param.indiv, by = "SNP_ID", all.x = TRUE)
  snp.dat.indiv$SNP_ID    <- as.character(snp.dat.indiv$SNP_ID)
  snp.param.indiv$SNP_ID    <- as.character(snp.param.indiv$SNP_ID)
  phi.ij <- left_join(snp.dat.indiv, snp.param.indiv, by = "SNP_ID")
  
  #Calculate G matrices - see start of page 4 of Henshall et al 2014
  phi.ij$AA_PHI <- dnorm(phi.ij$ALLELIC_PROP_INDIV,phi.ij$MEAN_P_AA,phi.ij$SD_P_AA)
  phi.ij$AB_PHI <- dnorm(phi.ij$ALLELIC_PROP_INDIV,phi.ij$MEAN_P_AB,phi.ij$SD_P_AB) / 2
  phi.ij$BA_PHI <- phi.ij$AB_PHI
  phi.ij$BB_PHI <- dnorm(phi.ij$ALLELIC_PROP_INDIV,phi.ij$MEAN_P_BB,phi.ij$SD_P_BB)
  
  #retain only relevant columns
  phi.ij <- phi.ij[,c("SAMPLE_ID", "SNP_ID", "AA_PHI", "AB_PHI", "BA_PHI", "BB_PHI", "ALLELIC_PROP_INDIV")]
  return(phi.ij)
}

#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################

#' @export

Gij.fun <- function(phi.ij, round.Gij = NULL) {
  
  #Returns########################################## the elements of the Gij matrix ("AA_GENO_PROB", " AB_GENO_PROB", " BA_GENO_PROB", "BB_GENO_PROB") 
  #and Tij vector ("A_TRANS_PROB", "B_TRANS_PROB") shown in the top left of page 4 
  #of Henshall et al. 2014.  
  
  #Args##########################################
  # phi.ij:  Data frame (output of phi.ij.fun)
  #            1.   SAMPLE_ID is the individual identifier
  #            2.   SNP_ID is the SNP identifier 
  #            3-6. AA_PHI, AB_PHI, BA_PHI, BB_PHI are the elements ("PHI.AA", "PHI.AB", "PHI.BA", 
  #                 "PHI.BB") of the matrix shown in the top left of page 4 of Henshall 
  #                 et al. 2014 
  #            7.   ALLELIC_PROP_INDIV is the allellic proportion (Equation 1 of Henshall 
  #                 et al. 2014) 
  
  #round.Gij: Integer.  Generally NULL. Rounds GENO_PROB values to specified number of decimal points.
  #           NOTE: when the round.Gij is specified (i.e. not NULL) there is a high probability that Equation 3 will equal 0
  #           (which essentially suggests a mismatch) and will result in infinite likelihood ratios.
  
  #Returns##########################################
  # Gij:     Data frame 
  #              1.   SAMPLE_ID is the individual identifier 
  #              2.   SNP_ID is the SNP identifier
  #              3-6. AA_GENO_PROB, AB_GENO_PROB, BA_GENO_PROB and BB_GENO_PROB: elements of the Gij matrix (see the 
  #                   top left of page 4 of Henshall et al. 2014
  #              7-8. A_TRANS_PROB, B_TRANS_PROB are the probabilities of allele transmission 
  #                   for alleles A and B respectively computed from Gij (i.e. the elements of 
  #                   the transmission (Tij) vector, Equation 2 of Henshall et al. 2014).
  #              9.   ALLELIC_PROP_INDIV is the allellic proportion (Equation 1 of Henshall 
  #                   et al. 2014).  Retained from the input data frame.
  
  print("Running Gij.fun")
  
  #Check that all headings are present in inputs  
  if(sum(c("SAMPLE_ID", "SNP_ID", "AA_PHI", "AB_PHI", "BA_PHI", "BB_PHI", "ALLELIC_PROP_INDIV") %in% 
         colnames(phi.ij)) != 7) {
    stop("phi.ij input must be a data frame containing the following headings: SAMPLE_ID, SNP_ID, AA_PHI, AB_PHI, BA_PHI, BB_PHI, ALLELIC_PROP_INDIV")
  }
  
  #Change column names and checks
  phi.ij$SAMPLE_ID  <- as.integer(phi.ij$SAMPLE_ID)
  phi.ij$SNP_ID    <- as.character(phi.ij$SNP_ID)
  phi.ij$AA_PHI <- as.numeric(phi.ij$AA_PHI)
  phi.ij$AB_PHI <- as.numeric(phi.ij$AB_PHI)
  phi.ij$BA_PHI <- as.numeric(phi.ij$BA_PHI)
  phi.ij$BB_PHI <- as.numeric(phi.ij$BB_PHI)
  phi.ij$ALLELIC_PROP_INDIV <- as.numeric(phi.ij$ALLELIC_PROP_INDIV)
  phi.ij <- phi.ij[,c("SAMPLE_ID", "SNP_ID", "AA_PHI", "AB_PHI", "BA_PHI", "BB_PHI", "ALLELIC_PROP_INDIV")]
  
  Gij <- phi.ij
  
  #G matrix
  phi.sum         <- Gij$AA_PHI + Gij$AB_PHI + Gij$BA_PHI + Gij$BB_PHI
  Gij$AA_GENO_PROB <- Gij$AA_PHI / phi.sum 
  Gij$AB_GENO_PROB <- Gij$AB_PHI / phi.sum
  Gij$BA_GENO_PROB <- Gij$BA_PHI / phi.sum
  Gij$BB_GENO_PROB <- Gij$BB_PHI / phi.sum 
  rm(phi.sum)
  
  
  if(!is.null(round.Gij)) {  
    
    #Must sum to 1
    #https://www.r-bloggers.com/round-values-while-preserve-their-rounded-sum-in-r/
    round_preserve_sum <- function(x) {
      
      x_orig <- x
      x[is.na(x)] <- 0.25
      
      up <- 10 ^ round.Gij
      x <- x * up
      y <- floor(x)
      indices <- tail(order(x-y), round(sum(x)) - sum(y))
      if(length(indices) > 0) {
        y[indices] <- y[indices] + 1
      }
      z <- y / up
      
      z[is.na(x_orig)] <- NA
      z
    }
    
    #Apply function by row
    Gij[,c("AA_GENO_PROB", "AB_GENO_PROB", "BA_GENO_PROB", "BB_GENO_PROB")] <-  
      apply(as.matrix(Gij[,c("AA_GENO_PROB", "AB_GENO_PROB", "BA_GENO_PROB", "BB_GENO_PROB")]), 2, round_preserve_sum)
    
    #Get some odd results (i.e. don't sum exactly to one)
    G.sum         <- Gij$AA_GENO_PROB + Gij$AB_GENO_PROB + Gij$BA_GENO_PROB + Gij$BB_GENO_PROB 
    Gij$AA_GENO_PROB <- Gij$AA_GENO_PROB / G.sum 
    Gij$AB_GENO_PROB <- Gij$AB_GENO_PROB / G.sum
    Gij$BA_GENO_PROB <- Gij$BA_GENO_PROB / G.sum 
    Gij$BB_GENO_PROB <- Gij$BB_GENO_PROB / G.sum
    rm(G.sum)
    
    #Rerun - Apply function by row
    Gij[,c("AA_GENO_PROB", "AB_GENO_PROB", "BA_GENO_PROB", "BB_GENO_PROB")] <-  
      apply(as.matrix(Gij[,c("AA_GENO_PROB", "AB_GENO_PROB", "BA_GENO_PROB", "BB_GENO_PROB")]), 2, round_preserve_sum)
    
    #Get some odd results (i.e. don't sum exactly to one)
    G.sum         <- Gij$AA_GENO_PROB + Gij$AB_GENO_PROB + Gij$BA_GENO_PROB + Gij$BB_GENO_PROB 
    Gij$AA_GENO_PROB <- Gij$AA_GENO_PROB / G.sum 
    Gij$AB_GENO_PROB <- Gij$AB_GENO_PROB / G.sum
    Gij$BA_GENO_PROB <- Gij$BA_GENO_PROB / G.sum 
    Gij$BB_GENO_PROB <- Gij$BB_GENO_PROB / G.sum
    rm(G.sum)
    
  }
  
  #Equation 2 of Henshall et al. 2014 - T vector
  Gij$A_TRANS_PROB <- (Gij$AA_GENO_PROB * 2 + Gij$AB_GENO_PROB + Gij$BA_GENO_PROB)/2  #(sum row 1 + sum col 1) / 2
  Gij$B_TRANS_PROB <- (Gij$BB_GENO_PROB * 2 + Gij$AB_GENO_PROB + Gij$BA_GENO_PROB)/2  #(sum row 2 + sum col 2) / 2
  
  #Get some odd results (i.e. don't sum exactly to one)
  G.sum         <- Gij$A_TRANS_PROB + Gij$B_TRANS_PROB 
  Gij$A_TRANS_PROB <- Gij$A_TRANS_PROB / G.sum 
  Gij$B_TRANS_PROB <- Gij$B_TRANS_PROB / G.sum
  rm(G.sum)
  
  #retain only relevant columns
  Gij <- Gij[,c("SAMPLE_ID", "SNP_ID", "AA_GENO_PROB", "AB_GENO_PROB", "BA_GENO_PROB", "BB_GENO_PROB",
                "A_TRANS_PROB", "B_TRANS_PROB", "ALLELIC_PROP_INDIV")]
  return(Gij)
}

#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################

#' @export

flj.from.parent.Gij.fun <- function(Gij, fam.set.combns, fams) {
  
  #Generates allele frequencies by SNP from means of G matrices of parents. Used in early drafts of Henshall et al. 2014.
  
  #Args##########################################
  
  #Gij: Data frame.  Output of Gij.fun 
  #          1.   SAMPLE_ID is the individual identifier (assumed to be offspring if not 
  #               listed as a sire or dam in 'fams' input)
  #          2.   SNP_ID is the SNP identifier
  #          3-6. AA_GENO_PROB, AB_GENO_PROB, BA_GENO_PROB and BB_GENO_PROB: elements of the Gij  matrix (see the 
  #               top left of page 4 of Henshall et al. 2014
  #          7-8. A_TRANS_PROB, B_TRANS_PROB are the probabilities of allele transmission 
  #               for alleles A and B respectively computed from Gij (i.e. the elements of 
  #               the transmission (Tij) vector, Equation 2 of Henshall et al. 2014).
  
  # fam.set.combns:    Data frame.  NOTE: to define half sib families, the unknown parent must be NA or 0
  #                       1. FAM_SET_COMBN_ID (integer) itentifer of unique combination of FAM_SET_ID
  #                       2. FAM_SET_ID (integer) is the family aggregate identifier (integer > 0) 
  #                       3. FAMILY_ID (integer) is the family identifier  
  
  # fams:    Data frame.  NOTE: to define half sib families, the unknown parent must be NA or 0
  #          1. FAMILY_ID is the family identifier
  #          2. SIRE_ID is the sire identifier 
  #          3. DAM_ID is the dam identifier 
  
  #Returns##########################################
  
  # flj: Data frame.  Output of flj.from.parent.Gij.fun 
  #          1.   SNP_ID is the snp identifier
  #          2.     FAM_SET_COMBN_ID
  #          . AA_GENO_PROB_MEAN, AB_GENO_PROB_MEAN, BA_GENO_PROB_MEAN, BB_GENO_PROB_MEAN: means of parental
  #               G matrices
  #          . A_TRANS_PROB, B_TRANS_PROB are the probabilities of allele transmission 
  #               for alleles A and B respectively (i.e. the elements of the flj vector) computed from average parental G matrices 
  #               (i.e. from AA_GENO_PROB_MEAN, AB_GENO_PROB_MEAN, BA_GENO_PROB_MEAN, BB_GENO_PROB_MEAN).  Estimates of A and B allele 
  #               frequencies in the parental 'population'.
  #          . AA_GENO_PROB, AB_GENO_PROB, BA_GENO_PROB and BB_GENO_PROB: elements of the flj matrix (see the 
  #               bottom left of page 4 of Henshall et al. 2014
  #          . SAMPLE_ID is the individual identifier (all values are zero and represent unknown individuals)
  
  print("Running flj.from.parent.Gij.fun")
  
  #Check that all headings are present in inputs  
  if(sum(c("SAMPLE_ID", "SNP_ID", "A_TRANS_PROB", "B_TRANS_PROB") %in% 
         colnames(Gij)) != 4) {
    stop("Gij input must be a data frame containing the following headings: SAMPLE_ID, SNP_ID, A_TRANS_PROB, B_TRANS_PROB")
  }
  
  if(sum(c("FAM_SET_COMBN_ID", "FAM_SET_ID", "FAMILY_ID") %in% 
         colnames(fam.set.combns)) != 3) {
    stop("fam.set.combns input must be a data frame containing the following headings: FAM_SET_COMBN_ID FAM_SET_ID FAMILY_ID")
  }
  
  if(sum(c("FAMILY_ID", "SIRE_ID", "DAM_ID") %in% 
         colnames(fams)) != 3) {
    stop("fams input must be a data frame containing the following headings: FAMILY_ID, SIRE_ID, DAM_ID")
  }
  
  #Name columns and assign class
  #colnames(Gij)     <- c("SAMPLE_ID", "SNP_ID", "AA_GENO_PROB", "AB_GENO_PROB", "BA_GENO_PROB", "BB_GENO_PROB", 
  #                                 "A_TRANS_PROB", "B_TRANS_PROB")
  Gij$SAMPLE_ID     <- as.integer(Gij$SAMPLE_ID)
  Gij$SNP_ID       <- as.character(Gij$SNP_ID)
  Gij$A_TRANS_PROB <- as.numeric(Gij$A_TRANS_PROB)
  Gij$B_TRANS_PROB <- as.numeric(Gij$B_TRANS_PROB)
  Gij <- Gij[,c("SAMPLE_ID", "SNP_ID", "A_TRANS_PROB", "B_TRANS_PROB")]
  
  fam.set.combns$FAM_SET_COMBN_ID <- as.integer(fam.set.combns$FAM_SET_COMBN_ID)
  fam.set.combns$FAM_SET_ID       <- as.integer(fam.set.combns$FAM_SET_ID)
  fam.set.combns$FAMILY_ID        <- as.integer(fam.set.combns$FAMILY_ID)
  
  #colnames(fams) <- c("FAMILY_ID", "SIRE_ID", "DAM_ID")
  fams$FAMILY_ID    <- as.integer(fams$FAMILY_ID)
  fams$SIRE_ID   <- as.integer(fams$SIRE_ID)
  fams$DAM_ID    <- as.integer(fams$DAM_ID)
  fams <- fams[,c("FAMILY_ID", "SIRE_ID", "DAM_ID")]
  
  #Check that probabilities between 0 and 1
  if(
    sum(
      Gij$A_TRANS_PROB > 1 |
      Gij$B_TRANS_PROB > 1 |
      
      Gij$A_TRANS_PROB < 0 |
      Gij$B_TRANS_PROB < 0 
      , na.rm = T) != 0
  ) {
    stop("Probabilities in Gij must be between 0 and 1 inclusive")
  }
  
  #Check that "A_TRANS_PROB", "B_TRANS_PROB" sum to 1 in Gij
  if(
    sum(
      round((Gij$A_TRANS_PROB + Gij$B_TRANS_PROB),5) != 1.0, na.rm = T
    ) != 0
  ) {
    stop("A_TRANS_PROB + B_TRANS_PROB must equal 1 in all rows of Gij")
  }
  
  #Ensure that SAMPLE_ID column contains no 0s or NA
  if(
    sum(
      (is.na(Gij$SAMPLE_ID) |
       Gij$SAMPLE_ID == 0), na.rm = T
    ) != 0
  ) {
    stop("SAMPLE_ID in Gij cannot be NA or 0")
  } 
  
  #If NA present as parents in fam then convert to 0
  fams[is.na(fams[,"SIRE_ID"]) ,"SIRE_ID"] <- 0   
  fams[is.na(fams[,"DAM_ID"]) ,"DAM_ID"]   <- 0  
  
  flj.from.parent.G <- NULL
  
  #cycle through FAM_SET_COMBN_ID
  
  for(fam.set.combn in unique(fam.set.combns$FAM_SET_COMBN_ID)) {
    
    #identify families in fam.set.combn
    
    tmp.fams <- fams[fams[,"FAMILY_ID"] %in% fam.set.combns[fam.set.combns[,"FAM_SET_COMBN_ID"] == fam.set.combn,"FAMILY_ID"],]
    
    #get parent subset of Gij
    
    #identify parents
    parents <- data.frame(SAMPLE_ID = c(tmp.fams[,"SIRE_ID"],tmp.fams[,"DAM_ID"]))
    parents$SAMPLE_ID <- parents[parents$SAMPLE_ID != 0,]  
    parents$SAMPLE_ID <- as.integer(as.character(parents$SAMPLE_ID))
    
    #separate parents and offspring in Gij
    G.T.parents <- Gij[Gij[,"SAMPLE_ID"] %in% parents,]
    G.T.parents <- left_join(parents, Gij, by = "SAMPLE_ID")
    
    #get vector of allele probabilities of each SNP estimated from all parents
    a.mean.parents <- aggregate(G.T.parents$A_TRANS_PROB, by = list(G.T.parents$SNP_ID), 
                                na.rm=T, FUN = "mean")   
    colnames(a.mean.parents) <- c("SNP_ID", "A_TRANS_PROB")
    
    b.mean.parents <- aggregate(G.T.parents$B_TRANS_PROB, by = list(G.T.parents$SNP_ID), 
                                na.rm=T, FUN = "mean")   
    colnames(b.mean.parents) <- c("SNP_ID", "B_TRANS_PROB")
    
    #  tmp.flj.from.parent.G <- merge(a.mean.parents, b.mean.parents, by = "SNP_ID", all = TRUE)
    a.mean.parents$SNP_ID    <- as.character(a.mean.parents$SNP_ID)
    b.mean.parents$SNP_ID    <- as.character(b.mean.parents$SNP_ID)
    tmp.flj.from.parent.G <- inner_join(a.mean.parents, b.mean.parents, by = "SNP_ID")
    
    #Add rows for missing parents to G.T.parents  (see bottom left page 4 of Henshall et al. 2014)
    tmp.flj.from.parent.G$SAMPLE_ID <- as.integer(0)
    
    tmp.flj.from.parent.G$FAM_SET_COMBN_ID <- fam.set.combn
    
    flj.from.parent.G <- rbind(flj.from.parent.G,tmp.flj.from.parent.G)
  }
  
  rm(tmp.fams, tmp.flj.from.parent.G, fam.set.combn)
  flj.from.parent.G <- flj.from.parent.G[,c("FAM_SET_COMBN_ID", "SNP_ID", "A_TRANS_PROB", "B_TRANS_PROB", "SAMPLE_ID")]
  return(flj.from.parent.G)
}

#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################

#' @export

flj.from.snp.dat.fun <- function(snp.dat.indiv, fam.set.combns, fams) {
  
  #Generates SNP allele frequencies from genotype calls for parents only
  
  #Required functions:
  # pij.fun, snp.gen.param.fun
  
  #Args##########################################
  # snp.dat.indiv: Data frame
  #              SAMPLE_ID is the individual identifier
  #              SNP_ID   is the SNP identifier
  #              A_ALLELE  is the base represented by allele A
  #              B_ALLELE  is the base represented by allele B
  #              GENOTYPE is the SNP genotype call
  
  # fam.set.combns:    Data frame.  NOTE: to define half sib families, the unknown parent must be NA or 0
  #                       1. FAM_SET_COMBN_ID (integer) itentifer of unique combination of FAM_SET_ID
  #                       2. FAM_SET_ID (integer) is the family aggregate identifier (integer > 0) 
  #                       3. FAMILY_ID (integer) is the family identifier  
  
  # fams: Data frame.  NOTE: to define half sib families, the unknown parent must be NA or 0
  #          1. FAMILY_ID is the family identifier
  #          2. SIRE_ID is the sire identifier 
  #          3. DAM_ID is the dam identifier 
  
  #Returns##########################################
  #   Data frame. See "Estimation of SNP sepecific parameters" page 3 of Henshall et al 2014 
  #              SNP_ID        is the SNP identifier, 
  #              N_AA          is the count homozygotes for allele A
  #              N_AB          is the count heterozygotes
  #              N_BB          is the count homozygotes for allele B
  #              A_ALLELE_FREQ is the A allele frequency derived from counts
  #              B_ALLELE_FREQ is the B allele frequency derived from counts
  
  print("Running flj.from.snp.dat.fun")
  
  #Check that all headings are present in inputs  
  if(sum(c("SAMPLE_ID", "SNP_ID",  "A_ALLELE", "B_ALLELE", "GENOTYPE") %in% colnames(snp.dat.indiv)) != 5) {
    stop("snp.dat.indiv input for flj.from.snp.dat.fun must be a data frame containing the following headings: SAMPLE_ID, SNP_ID, A_ALLELE, B_ALLELE, GENOTYPE")
  }
  
  if(sum(c("FAM_SET_COMBN_ID", "FAM_SET_ID", "FAMILY_ID") %in% 
         colnames(fam.set.combns)) != 3) {
    stop("fam.set.combns input must be a data frame containing the following headings: FAM_SET_COMBN_ID FAM_SET_ID FAMILY_ID")
  }
  
  if(sum(c("FAMILY_ID", "SIRE_ID", "DAM_ID") %in% 
         colnames(fams)) != 3) {
    stop("fams input must be a data frame containing the following headings: FAMILY_ID, SIRE_ID, DAM_ID")
  }
  
  #Retain columns and assign class
  #colnames(snp.dat.indiv) <- c("SAMPLE_ID", "SNP_ID", "A_ALLELE", "B_ALLELE", "GENOTYPE")
  snp.dat.indiv$SAMPLE_ID  <- as.integer(snp.dat.indiv$SAMPLE_ID)
  snp.dat.indiv$SNP_ID    <- as.character(snp.dat.indiv$SNP_ID)
  snp.dat.indiv$A_ALLELE   <- as.character(snp.dat.indiv$A_ALLELE)
  snp.dat.indiv$B_ALLELE   <- as.character(snp.dat.indiv$B_ALLELE)
  snp.dat.indiv$GENOTYPE  <- as.character(snp.dat.indiv$GENOTYPE)
  snp.dat.indiv <- snp.dat.indiv[,c("SAMPLE_ID", "SNP_ID", "A_ALLELE", "B_ALLELE", "GENOTYPE")]
  
  fam.set.combns$FAM_SET_COMBN_ID <- as.integer(fam.set.combns$FAM_SET_COMBN_ID)
  fam.set.combns$FAM_SET_ID       <- as.integer(fam.set.combns$FAM_SET_ID)
  fam.set.combns$FAMILY_ID        <- as.integer(fam.set.combns$FAMILY_ID)
  
  #colnames(fams) <- c("FAMILY_ID", "SIRE_ID", "DAM_ID")
  fams$FAMILY_ID    <- as.integer(fams$FAMILY_ID)
  fams$SIRE_ID   <- as.integer(fams$SIRE_ID)
  fams$DAM_ID    <- as.integer(fams$DAM_ID)
  fams <- fams[,c("FAMILY_ID", "SIRE_ID", "DAM_ID")]
  
  #Check that there is only one nucleotide in column A_ALLELE for each SNP
  tmp.1 <-  unique(snp.dat.indiv[,c("SNP_ID","A_ALLELE")])
  if(sum(unique(snp.dat.indiv[,"SNP_ID"]) != tmp.1[,c("SNP_ID")]) > 0) {
    stop("There is more than one nucleotide in column A_ALLELE for at least one SNP")
  }
  rm(tmp.1)
  
  #Check that there is only one nucleotide in column B_ALLELE for each SNP  
  tmp.1 <-  unique(snp.dat.indiv[,c("SNP_ID","B_ALLELE")])
  if(sum(unique(snp.dat.indiv[,"SNP_ID"]) != tmp.1[,c("SNP_ID")]) > 0) {
    stop("There is more than one nucleotide in column B_ALLELE for at least one SNP")
  }
  rm(tmp.1)
  
  #If NA present as parents in fam then convert to 0
  fams[is.na(fams[,"SIRE_ID"]) ,"SIRE_ID"] <- "0"   
  fams[is.na(fams[,"DAM_ID"]) ,"DAM_ID"]   <- "0"  
  
  #Dummy data required to run snp.gen.param.fun
  snp.dat.indiv$INTENSITY_A    <- 10000*runif(nrow(snp.dat.indiv)) #random numbers
  snp.dat.indiv$INTENSITY_B    <- 10000*runif(nrow(snp.dat.indiv))  
  
  flj.from.snp.dat <- NULL
  
  for(fam.set.combn in unique(fam.set.combns$FAM_SET_COMBN_ID)) {
    #identify families in fam.set.combn
    tmp.fams <- fams[fams[,"FAMILY_ID"] %in% fam.set.combns[fam.set.combns[,"FAM_SET_COMBN_ID"] == fam.set.combn,"FAMILY_ID"],]
    
    #identify parents
    parents <- data.frame(SAMPLE_ID = c(tmp.fams[,"SIRE_ID"],tmp.fams[,"DAM_ID"]))
    parents$SAMPLE_ID <- parents[parents$SAMPLE_ID != 0,]  
    parents$SAMPLE_ID <- as.integer(as.character(parents$SAMPLE_ID))
    #generate dummy SAMPLE_ID to allow snp.gen.param.fun to run (doesn't allow duplicate SAMPLE_ID/SNP_ID combinations)
    parents$DUMMY_SAMPLE_ID <- 1:nrow(parents) 
    #Get allele frequencies from counts using snp.gen.param.fun
    
    #duplciate snp data if a parent represented more than once
    tmp.snp.dat.indiv <- left_join(parents, snp.dat.indiv, by = "SAMPLE_ID")
    tmp.snp.dat.indiv$SAMPLE_ID <- tmp.snp.dat.indiv$DUMMY_SAMPLE_ID
    
    tmp.flj.from.snp.dat <- snp.gen.param.fun(snp.dat.indiv = tmp.snp.dat.indiv)
    
    rm(tmp.snp.dat.indiv)
    tmp.flj.from.snp.dat$FAM_SET_COMBN_ID <- fam.set.combn
    tmp.flj.from.snp.dat$SAMPLE_ID <- 0
    
    flj.from.snp.dat <- rbind(flj.from.snp.dat,tmp.flj.from.snp.dat)
  }
  
  rm(tmp.fams, tmp.flj.from.snp.dat, fam.set.combn)
  
  flj.from.snp.dat$A_TRANS_PROB <- flj.from.snp.dat$A_ALLELE_FREQ
  flj.from.snp.dat$B_TRANS_PROB <- flj.from.snp.dat$B_ALLELE_FREQ
  flj.from.snp.dat <- flj.from.snp.dat[,c("FAM_SET_COMBN_ID", "SNP_ID", "A_TRANS_PROB", "B_TRANS_PROB", "SAMPLE_ID")]
  
  return(flj.from.snp.dat)
}


#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################

#' @export

parent.combns.fun <- function(fams, 
                              n.in.pools,
                              fam.set.combns) {
  
  #generate parent.combns and fam.combns.by.fam.set.combn
  if(!is.null(fam.set.combns)) {
    fam.set.agg.combn <- unique(fam.set.combns[,c("FAM_SET_COMBN_ID", "FAM_SET_ID")])
    fam.set.agg.combn <- fam.set.agg.combn[order(fam.set.agg.combn[,"FAM_SET_ID"]),]
    fam.combns.by.fam.set.combn <- as.data.frame(matrix(NA, ncol = 1+n.in.pools, nrow=0))
    colnames(fam.combns.by.fam.set.combn) <- c("FAM_SET_COMBN_ID", paste("FAMILY_ID_", 1:n.in.pools, sep=""))
    
    for(fam.set.combn in unique(fam.set.combns[,"FAM_SET_COMBN_ID"])) {
      
      #pooling for pedigree assignment
      #  if(nrow(fam.set.agg.combn) == 
      #     n.in.pools * length(unique(fam.set.agg.combn$FAM_SET_COMBN_ID))) { 
      if(nrow(fam.set.agg.combn) != 1) {  
        for(agg in fam.set.agg.combn[fam.set.agg.combn[,"FAM_SET_COMBN_ID"] == fam.set.combn,"FAM_SET_ID"]) {
          if(agg == min(fam.set.agg.combn[fam.set.agg.combn[,"FAM_SET_COMBN_ID"] == fam.set.combn,"FAM_SET_ID"])) {
            tmp <- fam.set.combns[fam.set.combns[,"FAM_SET_COMBN_ID"] == fam.set.combn &
                                    fam.set.combns[,"FAM_SET_ID"] == agg, 
                                  c("FAM_SET_COMBN_ID", "FAMILY_ID")]
          } else {
            tmp <- merge(tmp, fam.set.combns[fam.set.combns[,"FAM_SET_COMBN_ID"] == fam.set.combn &
                                               fam.set.combns[,"FAM_SET_ID"] == agg, 
                                             c("FAM_SET_COMBN_ID", "FAMILY_ID")],
                         by = "FAM_SET_COMBN_ID",
                         all = TRUE,
                         suffixes = agg:(agg+ncol(tmp)))
          } 
        }
      } 
      
      #pooling by phenotype
      if(nrow(fam.set.agg.combn) == 1) { 
        for(agg in unique(fam.set.agg.combn$FAM_SET_ID)){  # 1:n.in.pools) {
          if(agg == unique(fam.set.agg.combn$FAM_SET_ID)[1]) {
            tmp <- fam.set.combns[fam.set.combns[,"FAM_SET_COMBN_ID"] == fam.set.combn &
                                    fam.set.combns[,"FAM_SET_ID"] == agg, 
                                  c("FAM_SET_COMBN_ID", "FAMILY_ID")]
          } else {
            tmp <- merge(tmp, fam.set.combns[fam.set.combns[,"FAM_SET_COMBN_ID"] == fam.set.combn , 
                                             c("FAM_SET_COMBN_ID", "FAMILY_ID")],
                         by = "FAM_SET_COMBN_ID",
                         all = TRUE,
                         suffixes = agg:(agg+ncol(tmp)))
          } 
        }
      }
      
      colnames(tmp) <- c("FAM_SET_COMBN_ID", paste("FAMILY_ID_", 1:n.in.pools, sep=""))
      fam.combns.by.fam.set.combn <- rbind(fam.combns.by.fam.set.combn, tmp)  
      rm(tmp)
    }
    
    parent.combns <- unique(fam.combns.by.fam.set.combn[,-1])
    
    if(!is.vector(parent.combns)) {
      parent.combns[,"FAM_COMBN_ID"] <- 1:nrow(parent.combns) 
    } else {
      parent.combns <- data.frame(FAMILY_ID_1 = parent.combns,
                                  FAM_COMBN_ID = 1:length(parent.combns)  )
      parent.combns$FAMILY_ID_1 <- as.integer(parent.combns$FAMILY_ID_1)
    }
    fam.combns.by.fam.set.combn <- merge(fam.combns.by.fam.set.combn, parent.combns, by.x = 2:ncol(fam.combns.by.fam.set.combn), 
                                         by.y = 1:(ncol(parent.combns)-1), all.x=TRUE)
    parent.combns <- parent.combns[,c("FAM_COMBN_ID", paste("FAMILY_ID_", 1:n.in.pools, sep=""))]
    
    fam.combns.by.fam.set.combn <- fam.combns.by.fam.set.combn[,c("FAM_SET_COMBN_ID", "FAM_COMBN_ID", paste("FAMILY_ID_", 1:n.in.pools, sep=""))]
    fam.combns.by.fam.set.combn <- fam.combns.by.fam.set.combn[order(fam.combns.by.fam.set.combn[, "FAM_COMBN_ID"]),]
    fam.combns.by.fam.set.combn <- fam.combns.by.fam.set.combn[order(fam.combns.by.fam.set.combn[, "FAM_SET_COMBN_ID"]),]
  }
  
  #Generate parent combinations and append to parent.combns and fam.combns.by.fam.set.combn
  
  for(col in 2:ncol(parent.combns)) {
    parent.combns <- merge(parent.combns, fams, by.x = col, by.y = "FAMILY_ID", all.x = TRUE, suffixes = (c(col-2,col-1)))
  }
  parent.combns[,(col+1):ncol(parent.combns)] <- t(apply(parent.combns[,(col+1):ncol(parent.combns)], 1, sort))
  colnames(parent.combns) <- c(colnames(parent.combns)[1:col],paste("PARENT_", 1:(2*(col-1)), sep=""))
  parent.combns <- parent.combns[order(parent.combns[,"FAM_COMBN_ID"]),]
  parent.combns[,"PARENT_COMBN_ID"] <- NA
  parent.combns[!duplicated(parent.combns[,(col+1):ncol(parent.combns)]) ,"PARENT_COMBN_ID"] <- 
    1:sum(!duplicated(parent.combns[,(col+1):ncol(parent.combns)]))
  #get unique parent combinations then remerge
  tmp <- parent.combns[!is.na(parent.combns[,"PARENT_COMBN_ID"]),(col+1):ncol(parent.combns)]
  parent.combns <- parent.combns[,colnames(parent.combns) != "PARENT_COMBN_ID"]
  parent.combns <- merge(parent.combns,
                         tmp,
                         by.x = (col+1):ncol(parent.combns),
                         by.y = 1:(ncol(tmp)-1),
                         all.x = TRUE)
  
  parent.combns <- parent.combns[,c("FAM_COMBN_ID",
                                    paste("FAMILY_ID_", 1:n.in.pools, sep =""),
                                    "PARENT_COMBN_ID",
                                    paste("PARENT_", 1:(2*n.in.pools), sep =""))]
  
  # parent.combns.by.fam.set.combn <- merge(fam.combns.by.fam.set.combn[,c("FAM_SET_COMBN_ID", "FAM_COMBN_ID")], 
  #                                 parent.combns[,c("FAM_COMBN_ID", "PARENT_COMBN_ID", paste("PARENT_", 1:(2*n.in.pools), sep =""))], 
  #                                 by = "FAM_COMBN_ID",
  #                                 all.x = TRUE)
  
  fam.combns.by.fam.set.combn$FAM_COMBN_ID <- as.integer(fam.combns.by.fam.set.combn$FAM_COMBN_ID)
  parent.combns$FAM_COMBN_ID      <- as.integer(parent.combns$FAM_COMBN_ID)
  parent.combns.by.fam.set.combn           <- left_join(fam.combns.by.fam.set.combn[,c("FAM_SET_COMBN_ID", "FAM_COMBN_ID")], 
                                                        parent.combns[,c("FAM_COMBN_ID", "PARENT_COMBN_ID", 
                                                                         paste("PARENT_", 1:(2*n.in.pools), sep =""))],
                                                        by = "FAM_COMBN_ID")
  
  parent.combns.by.fam.set.combn <- parent.combns.by.fam.set.combn[,colnames(parent.combns.by.fam.set.combn) != "FAM_COMBN_ID"]
  
  parent.combns.by.fam.set.combn <- parent.combns.by.fam.set.combn[!duplicated(parent.combns.by.fam.set.combn[,c("FAM_SET_COMBN_ID", "PARENT_COMBN_ID")]),]
  
 parent.combns.by.fam.set.combn <- parent.combns.by.fam.set.combn[!duplicated(parent.combns.by.fam.set.combn[,c("FAM_SET_COMBN_ID", "PARENT_COMBN_ID")]),] #remove duplicate parent combinations within family set combinations
 parent.combns <- parent.combns[order(parent.combns$FAM_COMBN_ID),]
 parent.combns <- parent.combns[!duplicated(parent.combns[,"PARENT_COMBN_ID"]),] #remove duplicate parent combinations 
  
  return(list(parent.combns = parent.combns, parent.combns.by.fam.set.combn = parent.combns.by.fam.set.combn))
}


#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################

#' @export

lambda.ij.fun <- function(snp.dat.indiv, 
                          snp.param.indiv,
                          n.in.pools=1,
                          min.sd = 0,
                          min.intensity = 0
) {  
  
  #Returns the elements (e.g. "LAMBDA.AAAA", "LAMBDA.AAAB", "LAMBDA.AABB", "LAMBDA.BBBB") of the matrix shown in the 
  #top left of page 4 of Henshall et al. 2014 (modified to accomodate pools).  Note that it is assumed that 
  #LAMBDA.AAAB = LAMBDA.BAAA etc. Also returns the allelic proportion (pij) (see Equation 1 of Henshall et al. 2014). 
  
  #Required functions:
  # pij.fun
  
  # snp.dat.indiv: Data frame (pooled samples only) containing relevant fields from the Genotype.Intensity 
  #tab of the corresponding GenotypeIntensity.xls file outputted from Sequenom's Typer software 
  #(Sequenom 2006).  Equivalent outputs from other platforms could also be used.  
  #              1. SAMPLE_ID is the sample identifier, 
  #              2. SNP_ID   is the SNP identifier,
  #              3. INTENSITY_A   is the area/intensity for allele A, 
  #              4. INTENSITY_B   is the area/intensity for allele B, 
  
  # snp.param.indiv: Data frame. Output of snp.gen.param.fun.  Example below for n.in.pools = 2.
  #             SNP_ID	     SNP identifier
  #             MEAN_P_AAAA	 Mean of allelic proportion for genotype AAAA
  #             SD_P_AAAA	   Standard deviation of allelic proportion for genotype AAAA
  #             MEAN_P_AAAB	 Mean of allelic proportion for genotype AAAB
  #             SD_P_AAAB    Standard deviation of allelic proportion for genotype AAAB
  #             MEAN_P_AABB	 Mean of allelic proportion for genotype AABB
  #             SD_P_AABB	   Standard deviation of allelic proportion for genotype AABB
  #             MEAN_P_ABBB	 Mean of allelic proportion for genotype ABBB
  #             SD_P_ABBB    Standard deviation of allelic proportion for genotype ABBB
  #             MEAN_P_BBBB	 Mean of allelic proportion for genotype BBBB
  #             SD_P_BBBB	   Standard deviation of allelic proportion for genotype BBBB
  
  #n.in.pools
  # Integer. Number of individuals in each pooled sample.  NOTE computation time is roughly proportional to 2 n.in.pools (i.e. this function is only appropriate for small pool sizes).  For larger pool sizes use an alternative method (e.g. least squares).
  
  # min.sd       Number. Standard deviation of allelic proportion  fixed to this value if less 
  #              than it.
  
  # min.intensity      Number used in pij.fun. If sqrt((snp.dat.indiv$INTENSITY_A)^2 +
  #              (snp.dat.indiv$INTENSITY_B)^2) less than this value
  #              then set allelic proportion to missing (see end of page 3 of Henshall et al 2014).
  #              Essentially removes observations that fall into the lower left of INTENSITY_A
  #              by INTENSITY_B scatter plot.
  
  #Returns##########################################
  # lambda.pool.j:  Data frame 
  #            1.            SAMPLE_ID is the individual identifier
  #            2.            SNP_ID is the SNP identifier 
  #            3-            e.g. AAAA_LAMBDA, AAAB_LAMBDA, AABB_LAMBDA, ABBB_LAMBDA, BBBB_LAMBDA - modified from top left 
  #                          of page 4 of Henshall et al. 2014 
  #            Final column. ALLELIC_PROP_POOL is the allellic proportion (Equation 1 of Henshall et al. 2014) 
  
  print("Running lambda.ij.fun")
  
  #Check that all headings are present in inputs  
  if(sum(c("SAMPLE_ID", "SNP_ID", "INTENSITY_A", "INTENSITY_B") %in% 
         colnames(snp.dat.indiv)) != 4) {
    stop("snp.dat.indiv input must be a data frame containing the following headings: SAMPLE_ID, SNP_ID, INTENSITY_A, INTENSITY_B")
  }
  
  #Generate list of genotypes
  genotypes <- genotypes.fun(n=(2*n.in.pools))
  
  if(sum(c("SNP_ID", paste("MEAN_P_",genotypes, sep=""), paste("SD_P_",genotypes, sep="")) %in% 
         colnames(snp.param.indiv)) != 3+4*n.in.pools) {
    stop(paste("snp.param.indiv input must be a data frame containing", 3+4*n.in.pools, "columns given that n.in.pools equals", n.in.pools))
  }
  
  #Name columns and assign class
  snp.dat.indiv$SAMPLE_ID  <- as.integer(snp.dat.indiv$SAMPLE_ID)
  snp.dat.indiv$SNP_ID   <- as.character(snp.dat.indiv$SNP_ID)
  snp.dat.indiv$INTENSITY_A   <- as.numeric(snp.dat.indiv$INTENSITY_A)
  snp.dat.indiv$INTENSITY_B   <- as.numeric(snp.dat.indiv$INTENSITY_B)
  snp.dat.indiv          <- snp.dat.indiv[,c("SAMPLE_ID", "SNP_ID", "INTENSITY_A", "INTENSITY_B")]
  
  #colnames(snp.param.indiv) <- c("SNP_ID", "MEAN_P_AA", "SD_P_AA", "MEAN_P_AB", "SD_P_AB", "MEAN_P_BB", "SD_P_BB")
  #  snp.param.indiv$SNP_ID    <- as.character(snp.param.indiv$SNP_ID)
  #  snp.param.indiv$MEAN_P_AA   <- as.numeric(snp.param.indiv$MEAN_P_AA)
  #  snp.param.indiv$SD_P_AA     <- as.numeric(snp.param.indiv$SD_P_AA)
  #  snp.param.indiv$MEAN_P_AB   <- as.numeric(snp.param.indiv$MEAN_P_AB)
  #  snp.param.indiv$SD_P_AB     <- as.numeric(snp.param.indiv$SD_P_AB)
  #  snp.param.indiv$MEAN_P_BB   <- as.numeric(snp.param.indiv$MEAN_P_BB)
  #  snp.param.indiv$SD_P_BB     <- as.numeric(snp.param.indiv$SD_P_BB)
  #  snp.param.indiv <- snp.param.indiv[,c("SNP_ID", "MEAN_P_AA", "SD_P_AA", "MEAN_P_AB", "SD_P_AB", "MEAN_P_BB", "SD_P_BB")]
  
  #Check for duplicated records in snp.dat.indiv
  pool.snp <- paste(snp.dat.indiv$SAMPLE_ID,snp.dat.indiv$SNP_ID, sep=".")
  if(sum(duplicated(pool.snp)) > 0) {
    stop("SAMPLE_ID and SNP_ID combinations are not unique in snp.dat.indiv.  Delete duplicates or recode SAMPLE_ID.")
  }
  rm(pool.snp)
  
  # Check the list of SNPs the same in input files
  if(sum(unique(snp.dat.indiv[,"SNP_ID"]) != unique(snp.param.indiv[,"SNP_ID"]))>0) {
    stop("SNP identifiers do not match in snp.dat.indiv and snp.param.indiv")
  }
  
  #Get allelic proportion
  snp.allelic.prop <- pij.fun(snp.dat.indiv = snp.dat.indiv, min.intensity = min.intensity)
  print("Still running lambda.ij.fun")
  
  #Rename columns
  colnames(snp.allelic.prop)[colnames(snp.allelic.prop) == "SAMPLE_ID"]        <- "SAMPLE_ID"
  colnames(snp.allelic.prop)[colnames(snp.allelic.prop) == "ALLELIC_PROP"]     <- "ALLELIC_PROP_POOL"  
  colnames(snp.allelic.prop)[colnames(snp.allelic.prop) == "INTENSITY"]        <- "INTENSITY_POOL"  
  
  #merge data using cbind
  if(identical(snp.dat.indiv$SAMPLE_ID,snp.allelic.prop$SAMPLE_ID) &
     identical(snp.dat.indiv$SNP_ID,snp.allelic.prop$SNP_ID)) {
    snp.dat.indiv <- cbind(snp.dat.indiv, snp.allelic.prop[,!colnames(snp.allelic.prop) %in% c("SAMPLE_ID", "SNP_ID", "INTENSITY_A", "INTENSITY_B")])
  } else {
    stop("SAMPLE_ID and SNP_ID columns of pij.fun output do not match those of snp.dat.indiv.  Not sure why.")
  }
  
  for(colname in paste("SD_P_",genotypes, sep="")) {
    snp.param.indiv[snp.param.indiv[,colname] < min.sd & !is.na(snp.param.indiv[,colname]), colname] <- min.sd
    snp.param.indiv[is.na(snp.param.indiv[,colname]), colname] <- min.sd
  }
  
  for(genotype in genotypes) {
    mean.colname <- paste("MEAN_P_",genotype, sep="")
    sd.colname <- paste("SD_P_",genotype, sep="")
    
    #Ensure all SD are > 0 if mean is not NA
    if(sum(
      (
        !is.na(snp.param.indiv[,mean.colname]) &
        (snp.param.indiv[,sd.colname] <= 0 & !is.na(snp.param.indiv[,sd.colname]) |
         is.na(snp.param.indiv[,sd.colname])) 
      )
    ) > 0) {
      stop("At least one element of SD_P_ equals zero or is NA where MEAN_P_ are not NA in snp.param.indiv.  Must specify min.sd that is greater than 0 or modify snp.param.indiv.")
    }
  }
  rm(mean.colname, sd.colname)
  
  #Merge data
  # lambda.pools <- merge(snp.dat.indiv, snp.param.indiv, by = "SNP_ID", all.x = TRUE)
  snp.dat.indiv$SNP_ID    <- as.character(snp.dat.indiv$SNP_ID)
  snp.param.indiv$SNP_ID    <- as.character(snp.param.indiv$SNP_ID)
  lambda.pools <- left_join(snp.dat.indiv, snp.param.indiv, by = "SNP_ID")
  
  for(count in 1:length(genotypes)) {
    genotype     <- genotypes[count]
    lambda.colname  <- paste(genotype, "_LAMBDA",sep="")
    mean.colname <- paste("MEAN_P_",genotype, sep="")
    sd.colname   <- paste("SD_P_",genotype, sep="")
    
    lambda.pools[,lambda.colname] <- dnorm(lambda.pools$ALLELIC_PROP_POOL,lambda.pools[,mean.colname],lambda.pools[,sd.colname])
  }
  
  #retain only relevant columns
  lambda.pools <- lambda.pools[,c("SAMPLE_ID", "SNP_ID", paste(genotypes, "_LAMBDA",sep=""), "ALLELIC_PROP_POOL")]
  
  return(lambda.pools)
}

#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################

#' @export

rho.inv.fun <- function(n.in.pools) {
  
  print("Running rho.inv.fun")
  
  #Generate list of genotypes
  genotypes <- genotypes.fun(n=(2*n.in.pools))
  
  rho.inv <- data.frame(GENOTYPE = genotypes,
                        RHO_INV      = NA)
  rho.inv$GENOTYPE <- as.character(rho.inv$GENOTYPE)
  
  combns.fun <- function(n, m) {
    ind <- combn(seq_len(n), m)
    ind <- t(ind) + (seq_len(ncol(ind)) - 1) * n
    res <- rep(0, nrow(ind) * n)
    res[ind] <- 1
    matrix(res, ncol = n, nrow = nrow(ind), byrow = TRUE)
  }
  
  for(i in 1:length(genotypes)) {
    geno <- genotypes[i]
    ordered.geno.count <- nrow(combns.fun(2*n.in.pools,lengths(regmatches(geno, gregexpr("B", geno)))))
    rho.inv[i,"RHO_INV"] <- 1/ordered.geno.count
    rm(ordered.geno.count)
  }
  
  return(rho.inv)
}

#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################

#' @export

nlj.fun <- function(flj, n.in.pools) {
  print("Running nlj.fun")
  
  #Get nlj
  tmp.parent.combns <- data.frame(PARENT_COMBN_ID = 1,
                                  PARENT_1 = 1)
  
  for(i in 2:(2*n.in.pools)) {
    tmp.parent.combns[,paste("PARENT_", i, sep="")] <- 1
  }  
  
  nlj <- NULL
  for(fam.set.combn in unique(flj$FAM_SET_COMBN_ID)) {
    
    tmp.fj <- flj[flj[,"FAM_SET_COMBN_ID"] == fam.set.combn,]
    
    tmp.tij <- tmp.fj
    tmp.tij$SAMPLE_ID = 1
    
    tmp.nj <- tcj.fun(parent.combns = tmp.parent.combns,
                      tij = tmp.tij,
                      fj = tmp.fj) 
    tmp.nj <- tmp.nj[,!colnames(tmp.nj) %in% c("PARENT_COMBN_ID", "MISS_PARENT_COUNT")]
    tmp.nj$FAM_SET_COMBN_ID <- fam.set.combn
    
    nlj <- rbind(nlj,tmp.nj)
  }
  
  rm(tmp.fj, tmp.parent.combns, tmp.tij, tmp.nj)
  
  #reorder columns
  nlj <- nlj[,c(colnames(nlj)[ncol(nlj)], colnames(nlj)[-ncol(nlj)])]
  
  print("Still running nlj.fun")
  
  # }
  
  return(nlj)
}


#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################

#' @export

tcj.fun <- function(tij, parent.combns, fj = NULL) {
  
  #Args##########################################
  
  #tij: Data frame.  Output of Gij.fun,  Mij.fun or Dij.fun.
  #          1.   SAMPLE_ID is the individual identifier (assumed to be offspring if not 
  #               listed as a sire or dam in 'fams' input)
  #          2.   SNP_ID is the SNP identifier
  #          3-4. A_TRANS_PROB, B_TRANS_PROB are the probabilities of allele transmission 
  #               for alleles A and B respectively computed from Gij (or Mij) (i.e. the elements of 
  #               the transmission (Tij) vector, Equation 2 of Henshall et al. 2014).
  #          5.    OFFSPRING_MISS. Logical. True if genotype data are missing
  
  #NOTE: If a D.matrix is specified then fj must be NULL (fj = NULL)
  
  # parent.combns:   
  
  # fj: Data frame.  Output of fj.from.parent.Gij.fun or fj.from.snp.dat.indiv.fun
  #          1.   SNP_ID is the snp identifier
  #          2-3. A_TRANS_PROB, B_TRANS_PROB are the probabilities of allele transmission 
  #               for alleles A and B respectively computed from average parental Gij or parental genotype counts (i.e. the elements of 
  #               the fj vector).
  #          4. SAMPLE_ID is the individual identifier (all values are zero and represent unknown individuals)
  
  #Returns##########################################
  
  #tcj: Data frame.
  
  
  print("Running tcj.fun")
  
  #Check that all headings are present in inputs  
  if(sum(c("SAMPLE_ID", "SNP_ID", "A_TRANS_PROB", "B_TRANS_PROB") %in% 
         colnames(tij)) != 4) {
    stop("tij input must be a data frame containing the following headings: SAMPLE_ID, SNP_ID, A_TRANS_PROB, B_TRANS_PROB")
  }
  
  #Name columns and assign class
  #colnames(tij)     <- c("SAMPLE_ID", "SNP_ID", "A_TRANS_PROB", "B_TRANS_PROB")
  tij$SAMPLE_ID     <- as.integer(tij$SAMPLE_ID)
  tij$SNP_ID       <- as.character(tij$SNP_ID)
  tij$A_TRANS_PROB <- as.numeric(tij$A_TRANS_PROB)
  tij$B_TRANS_PROB <- as.numeric(tij$B_TRANS_PROB)
  tij <- tij[,c("SAMPLE_ID", "SNP_ID", "A_TRANS_PROB", "B_TRANS_PROB")]
  
  if(is.null(fj)) {
    fj <- data.frame(SNP_ID       = unique(tij$SNP_ID),
                     A_TRANS_PROB = 0.5,
                     B_TRANS_PROB = 0.5) #Missing parent data for exclusion method
  } else {
    fj$SAMPLE_ID     <- as.integer(fj$SAMPLE_ID)
    fj$SNP_ID       <- as.character(fj$SNP_ID)               
    fj$A_TRANS_PROB <- as.numeric(fj$A_TRANS_PROB)  
    fj$B_TRANS_PROB <- as.numeric(fj$B_TRANS_PROB)  
    fj <- fj[,c("SAMPLE_ID", "SNP_ID", "A_TRANS_PROB", "B_TRANS_PROB")]
    
  }
  
  #Name columns and assign class
  for(i in 1:ncol(parent.combns)) {parent.combns[,i] <- as.integer(parent.combns[,i])}
  parent.combns$PARENT_COMBN_ID <- as.integer(parent.combns$PARENT_COMBN_ID)
  
  #Check that probabilities between 0 and 1
  if(
    sum(
      tij$A_TRANS_PROB > 1 |
      tij$B_TRANS_PROB > 1 |
      
      tij$A_TRANS_PROB < 0 |
      tij$B_TRANS_PROB < 0 
      , na.rm = T) != 0
  ) {
    stop("Probabilities in tij must be between 0 and 1 inclusive")
  }
  
  #Check that "A_TRANS_PROB", "B_TRANS_PROB" sum to 1 in tij
  if(
    sum(
      round((tij$A_TRANS_PROB + tij$B_TRANS_PROB),5) != 1.0, na.rm = T
    ) != 0
  ) {
    stop("A_TRANS_PROB + B_TRANS_PROB must equal 1 in all rows of tij")
  }
  
  #Ensure that SAMPLE_ID column contains no 0s or NA
  if(
    sum(
      (is.na(tij$SAMPLE_ID) |
       tij$SAMPLE_ID == 0), na.rm = T
    ) != 0
  ) {
    stop("SAMPLE_ID in tij cannot be NA or 0")
  } 
  
  #remove family combinations from parent.combns
  parent.combns <- parent.combns[!duplicated(parent.combns[,"PARENT_COMBN_ID"]),]
  parent.combns <- parent.combns[,grepl("PARENT_", colnames(parent.combns))]
  
  #identify parents 
  parents <- unique(unlist(parent.combns[,colnames(parent.combns) != "PARENT_COMBN_ID"]))
  
  #separate parents and offspring in tij
  tij.parents <- tij[tij[,"SAMPLE_ID"] %in% parents,]
  
  #Include SAMPLE_ID = 0 (i.e. unknown parent)
  if(!is.null(fj)) {
    tij.parents      <- merge(tij.parents, fj, 
                              by = c("SAMPLE_ID", "SNP_ID", "A_TRANS_PROB", "B_TRANS_PROB"), 
                              all = TRUE)
  }
  
  #identify snp
  snps <- fj$SNP_ID
  
  #Transmission vector 
  tcj <- parent.combns[rep(seq_len(nrow(parent.combns)), length(snps)), ] 
  tcj[,"SNP_ID"] <- rep(snps, each = nrow(parent.combns))
  rownames(tcj) <- 1:nrow(tcj)
  gc()
  
  tcj$MISS_PARENT_COUNT <- 0
  tcj$MISS <- FALSE
  
  # tcj <- merge(tcj, fj[,c("SNP_ID", "A_TRANS_PROB", "B_TRANS_PROB")], by = "SNP_ID", all.x = TRUE)
  tcj <- left_join(tcj, fj[,c("SNP_ID", "A_TRANS_PROB", "B_TRANS_PROB")], by = "SNP_ID")
  
  colnames(tcj)[(ncol(tcj)-1):ncol(tcj)] <- c("A_TRANS_PROB_MISS", "B_TRANS_PROB_MISS")
  
  for(parent in 1:(ncol(parent.combns) - 1)) {
    
    print(paste("Parent",parent,"of",(ncol(parent.combns) - 1)))
    
    # tcj <- merge(tcj,
    #             tij.parents[,c("SAMPLE_ID", "SNP_ID", "A_TRANS_PROB", "B_TRANS_PROB")],
    #             by.x = c(paste("PARENT_", parent, sep=""), "SNP_ID"),
    #              by.y = c("SAMPLE_ID", "SNP_ID"),
    #              all.x = TRUE)
    
    tmp <- c("PARENT" = "SAMPLE_ID", "SNP_ID" = "SNP_ID")
    attributes(tmp)$names[1] <- paste("PARENT_", parent, sep="")
    tcj <- left_join(tcj, tij.parents[,c("SAMPLE_ID", "SNP_ID", "A_TRANS_PROB", "B_TRANS_PROB")], by = tmp)
    rm(tmp)
    
    tcj$MISS <- FALSE
    tcj[is.na(tcj[,"A_TRANS_PROB"]) | 
          is.na(tcj[,"B_TRANS_PROB"]) |
          tcj[,paste("PARENT_", parent, sep="")] == 0,"MISS"] <- TRUE
    
    tcj$MISS_PARENT_COUNT <- tcj$MISS_PARENT_COUNT + tcj$MISS
    
    #Replace missing data with mean allele probabilities in parents
    tcj[is.na(tcj[,"A_TRANS_PROB"]),"A_TRANS_PROB"] <-
      tcj[is.na(tcj[,"A_TRANS_PROB"]),"A_TRANS_PROB_MISS"]
    tcj[is.na(tcj[,"B_TRANS_PROB"]),"B_TRANS_PROB"] <-
      tcj[is.na(tcj[,"B_TRANS_PROB"]),"B_TRANS_PROB_MISS"]
    
    if(parent == 1) {
      colnames(tcj)[(ncol(tcj)-1):ncol(tcj)] <- c("A","B") #last 2 columns
    } else {
      tcj[,paste(genotypes.fun(n = (parent-1)),"A", sep="")] <- 
        tcj[,genotypes.fun(n = (parent-1))] * tcj[,"A_TRANS_PROB"]
      tcj[,paste(genotypes.fun(n = (parent-1)),"B", sep="")] <- 
        tcj[,genotypes.fun(n = (parent-1))] * tcj[,"B_TRANS_PROB"]
      
      #remove unwanted columns
      tcj <- tcj[,!colnames(tcj) %in% c("A_TRANS_PROB", "B_TRANS_PROB")]
      tcj <- tcj[,!colnames(tcj) %in% genotypes.fun(parent-1)]
      
      #convert ordered genotypes to unordered
      ordered.genos <- c(paste(genotypes.fun(n = (parent-1)),"A", sep=""),
                         paste(genotypes.fun(n = (parent-1)),"B", sep=""))
      unordered.genos <- genotypes.fun(n = parent)
      
      for (i in 1:length(ordered.genos)) {
        unordered.genos[i] <- paste(sort(unlist(strsplit(ordered.genos[i], ""))), collapse = "")
      }
      
      #partition tcj 
      tcj.ordered <- tcj[,ordered.genos]
      tcj <- tcj[,!colnames(tcj) %in% ordered.genos]
      
      #get retain unordered genotypes
      for(geno in unique(unordered.genos)) {
        if(sum(unordered.genos==geno) > 1) {
          tcj[,geno] <- rowSums(tcj.ordered[,unordered.genos==geno])
        } else {
          tcj[,geno] <- tcj.ordered[,unordered.genos==geno]
        }
      }
      rm(tcj.ordered)
    }
    
  }
  #reorder/remove columns
  tcj <- tcj[,c("SNP_ID", "PARENT_COMBN_ID", "MISS_PARENT_COUNT", genotypes.fun(parent))]
  
  #divide by rho as genotypes are unordered
  n.in.pools <- (ncol(parent.combns)-1)/2
  
  rho.inv <- rho.inv.fun(n.in.pools)
  for(geno in rho.inv$GENOTYPE) {
    tcj[,geno] <- tcj[,geno]*rho.inv[rho.inv[,"GENOTYPE"] == geno,"RHO_INV"]
  }
  
  tcj <- tcj[order(tcj[,"SNP_ID"]),]
  tcj <- tcj[order(tcj[,"PARENT_COMBN_ID"]),]
  
  return(tcj)
  
}

#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################

#' @export

snp.error.fun <- function(Gij, fams = NULL, parents.only = FALSE) {    
  
  #Computes the estimated SNP specific error rate (epsilon hat) from Gij matrices according to the method
  #outlined on the left hand side of  page 5 of Henshall et al. 2014 
  
  # Args: 
  
  # Gij:     Data frame.  Output from G.T.matrices.fun
  #              1.   SAMPLE_ID is the individual identifier
  #              2.   SNP_ID is the SNP identifier
  #              3-6. AA_GENO_PROB, AB_GENO_PROB, BA_GENO_PROB and BB_GENO_PROB: elements of the Gij matrix (see the 
  #                   top left of page 4 of Henshall et al. 2014
  #              7-8. A_TRANS_PROB, B_TRANS_PROB are the probabilities of allele transmission 
  #                   for alleles A and B respectively computed from Gij (i.e. the elements of 
  #                   the transmission (Tij) vector, Equation 2 of Henshall et al. 2014).
  
  # fams: Optional data frame - only required if parents.only=TRUE   NOTE: to define half sib families, the unknown parent must be NA or 0.  
  #          1. FAMILY_ID is the family identifier
  #          2. SIRE_ID is the sire identifier 
  #          3. DAM_ID is the dam identifier 
  
  # parents.only: Logical.  If TRUE only used data from parents.
  
  #Returns##########################################
  
  # snp.error.hat: Data frame.
  #              1.   SNP_ID is the SNP identifier  
  #              2.   SNP_ERROR_HAT is the SNP specific error rate (epsilon hat) from 
  #                   Gij matrix according to the method outlined on the left hand side of  
  #                   page 5 of Henshall et al. 2014  
  
  print("Running snp.error.fun")
  
  #Check that all headings are present in inputs  
  if(sum(c("SAMPLE_ID", "SNP_ID", "AA_GENO_PROB", "AB_GENO_PROB", "BA_GENO_PROB", 
           "BB_GENO_PROB", "A_TRANS_PROB", "B_TRANS_PROB") %in% 
         colnames(Gij)) != 8) {
    stop("Gij input must be a data frame containing the following headings: SAMPLE_ID, SNP_ID, AA_GENO_PROB, AB_GENO_PROB, BA_GENO_PROB, BB_GENO_PROB, A_TRANS_PROB, B_TRANS_PROB")
  }
  
  
  #Name columns and assign class
  #colnames(Gij)     <- c("SAMPLE_ID", "SNP_ID", "AA_GENO_PROB", "AB_GENO_PROB", "BA_GENO_PROB", "BB_GENO_PROB", 
  #                                 "A_TRANS_PROB", "B_TRANS_PROB")
  Gij$SAMPLE_ID     <- as.integer(Gij$SAMPLE_ID)
  Gij$SNP_ID       <- as.character(Gij$SNP_ID)
  Gij$AA_GENO_PROB <- as.numeric(Gij$AA_GENO_PROB)
  Gij$AB_GENO_PROB <- as.numeric(Gij$AB_GENO_PROB)
  Gij$BA_GENO_PROB <- as.numeric(Gij$BA_GENO_PROB)
  Gij$BB_GENO_PROB <- as.numeric(Gij$BB_GENO_PROB)
  Gij$A_TRANS_PROB <- as.numeric(Gij$A_TRANS_PROB)
  Gij$B_TRANS_PROB <- as.numeric(Gij$B_TRANS_PROB)
  Gij <- Gij[,c("SAMPLE_ID", "SNP_ID", "AA_GENO_PROB", "AB_GENO_PROB", "BA_GENO_PROB", 
                "BB_GENO_PROB", "A_TRANS_PROB", "B_TRANS_PROB")]
  
  #Check that probabilities between 0 and 1
  if(
    sum(
      Gij$AA_GENO_PROB > 1 |
      Gij$AB_GENO_PROB > 1 |
      Gij$BA_GENO_PROB > 1 |
      Gij$BB_GENO_PROB > 1 |
      Gij$A_TRANS_PROB > 1 |
      Gij$B_TRANS_PROB > 1 |
      
      Gij$AA_GENO_PROB < 0 |
      Gij$AB_GENO_PROB < 0 |
      Gij$BA_GENO_PROB < 0 |
      Gij$BB_GENO_PROB < 0 |
      Gij$A_TRANS_PROB < 0 |
      Gij$B_TRANS_PROB < 0 
      , na.rm = T) != 0
  ) {
    stop("Probabilities in Gij must be between 0 and 1 inclusive")
  }
  
  #Check that "A_TRANS_PROB", "B_TRANS_PROB" sum to 1 in Gij
  if(
    sum(
      round((Gij$A_TRANS_PROB + Gij$B_TRANS_PROB),5) != 1.0, na.rm = T
    ) != 0
  ) {
    stop("A_TRANS_PROB + B_TRANS_PROB must equal 1 in all rows of Gij")
  }
  
  #Check that "AA_GENO_PROB", "AB_GENO_PROB", "BA_GENO_PROB", "BB_GENO_PROB" sum to one
  if(
    sum(
      round((Gij$AA_GENO_PROB + Gij$AB_GENO_PROB +
             Gij$BA_GENO_PROB + Gij$BB_GENO_PROB),5) != 1, na.rm = T
    ) != 0
  ) {
    stop("AA_GENO_PROB + AB_GENO_PROB + BA_GENO_PROB + BB_GENO_PROB must equal 1 in all rows of Gij")
  }
  
  #Ensure that SAMPLE_ID column contains no 0s or NA
  if(
    sum(
      (is.na(Gij$SAMPLE_ID) |
       Gij$SAMPLE_ID == 0), na.rm = T
    ) != 0
  ) {
    stop("SAMPLE_ID in Gij cannot be NA or 0")
  } 
  
  #only retain parents in Gij if parents.only = TRUE
  
  if(parents.only == TRUE) {
    
    if(is.null(fams)) {
      stop("fams must be specified if parents.only = TRUE")
    }
    
    if(sum(c("FAMILY_ID", "SIRE_ID", "DAM_ID") %in% 
           colnames(fams)) != 3) {
      stop("fams input must be a data frame containing the following headings: FAMILY_ID, SIRE_ID, DAM_ID")
    }
    
    #colnames(fams) <- c("FAMILY_ID", "SIRE_ID", "DAM_ID")
    fams$FAMILY_ID    <- as.integer(fams$FAMILY_ID)
    fams$SIRE_ID   <- as.integer(fams$SIRE_ID)
    fams$DAM_ID    <- as.integer(fams$DAM_ID)
    fams <- fams[,c("FAMILY_ID", "SIRE_ID", "DAM_ID")]
    
    #If NA present as parents in fam then convert to 0
    fams[is.na(fams[,"SIRE_ID"]) ,"SIRE_ID"] <- "0"   
    fams[is.na(fams[,"DAM_ID"]) ,"DAM_ID"]   <- "0"  
    
    parents <- unique(c(fams[,"SIRE_ID"],fams[,"DAM_ID"]))
    parents <- parents[parents != "0"]  
    
    Gij <- Gij[Gij[,"SAMPLE_ID"] %in% parents,]
    
  }
  
  #Compute 1 - maximum of the unordered genotypes in Gij
  Gij$ab.BA_GENO_PROB   <- Gij$AB_GENO_PROB + Gij$BA_GENO_PROB
  Gij$EPSILON_GENO_PROB <- 1 - do.call(pmax, Gij[,c("AA_GENO_PROB", 
                                                    "ab.BA_GENO_PROB", 
                                                    "BB_GENO_PROB")])
  #Compute SNP error rates (mean of EPSILON_GENO_PROB for each SNP)
  snp.error <- aggregate(Gij$EPSILON_GENO_PROB, 
                         by = list(Gij$SNP_ID), 
                         na.rm=T, 
                         FUN = "mean")      
  colnames(snp.error) <- c("SNP_ID", "SNP_ERROR_HAT")
  
  return(snp.error)
}

#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################

#' @export

prelim.ml.quant.fun <- function(Gij,
                                parent.combns.by.fam.set.combn,
                                flj,
                                gkj,
                                snp.error,
                                nlj,
                                fam.set.combns.by.pool) {
  
  print("Running prelim.ml.quant.fun")
  
  tclj <- NULL
  for(fam.set.combn in unique(flj$FAM_SET_COMBN_ID)) {
    
    print(fam.set.combn)
    
    tmp.fj <- flj[flj[,"FAM_SET_COMBN_ID"] == fam.set.combn,]
    tmp.parent.combns <- parent.combns.by.fam.set.combn[parent.combns.by.fam.set.combn[,"FAM_SET_COMBN_ID"] == fam.set.combn,] 
    
    tmp.parents <- tmp.parent.combns[,!colnames(tmp.parent.combns) %in% c("FAM_SET_COMBN_ID", "PARENT_COMBN_ID")]
    tmp.parents <- unique(as.vector(as.matrix(tmp.parents)))
    
    tmp.tij <- Gij[Gij[,"SAMPLE_ID"] %in% tmp.parents, ]
    tmp.tclj <- tcj.fun(tij = tmp.tij, 
                        parent.combns = tmp.parent.combns, 
                        fj = tmp.fj)
    tmp.tclj$FAM_SET_COMBN_ID <- fam.set.combn
    tclj <- rbind(tclj, tmp.tclj)
  }
  rm(tmp.fj, tmp.parent.combns, tmp.parents, tmp.tij, tmp.tclj)
  
  miss.parent.count <- unique(tclj[,c("PARENT_COMBN_ID", "SNP_ID", "MISS_PARENT_COUNT")])
  tclj               <- tclj[,colnames(tclj) != "MISS_PARENT_COUNT"] 
  
  gklj   <- adj.geno.prob.fun(lhs = merge(gkj,fam.set.combns.by.pool, by = "SAMPLE_ID", all = TRUE),
                              snp.error = snp.error,
                              nlj = nlj)
  return(list(tclj = tclj,
              miss.parent.count = miss.parent.count,
              gklj = gklj))
}

#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################

#' @export

adj.geno.prob.fun <- function(lhs, snp.error, nlj) {
  
  #Computes the Mij matrix (see the top left of page 5 of Henshall et al. 2014)
  
  #Args########################################## 
  #lhs. Data frame.  
  #              1.   FAM_SET_COMBN_ID
  #              2.   SAMPLE_ID 
  #              3.   SNP_ID is the SNP identifier
  #              4-.  Unsorted genotypes: elements of the gkj or tclj matrices 
  
  #snp.error. Data frame. Output from snp.error.fun
  #              1.   SNP_ID is the SNP identifier
  #              2.   SNP_ERROR_TILDE is the assumed SNP specific error rate (see the 
  #                   middle left of page 5 of Henshall et al. 2014)
  
  #nlj.  Data frame.  Output of nlj.fun
  #              1.   SNP_ID is the SNP identifier
  #              2-   Unsorted genetypes
  
  #Returns##########################################
  
  # adj.geno.probs:     Data frame 
  #              1.   SAMPLE_ID or FAM_COMBN_ID 
  #              2.   SNP_ID is the SNP identifier
  #              3-.  Unsorted genotypes: elements of the gkj or tclj matrices 
  
  print("Running adj.geno.prob.fun")
  
  #Name columns and assign class
  
  # if(sum(c("SNP_ID", "AA_GENO_PROB", "AB_GENO_PROB", "BA_GENO_PROB", "BB_GENO_PROB") %in% colnames(fjfj)) != 5) {
  #    stop("fjfj input must be a data frame containing the following headings: SNP_ID, AA_GENO_PROB, AB_GENO_PROB, BA_GENO_PROB, BB_GENO_PROB")
  #  }
  
  #  fjfj$SNP_ID       <- as.character(fjfj$SNP_ID)
  #  fjfj$AA_GENO_PROB <- as.numeric(fjfj$AA_GENO_PROB)
  #  fjfj$AB_GENO_PROB <- as.numeric(fjfj$AB_GENO_PROB)
  #  fjfj$BA_GENO_PROB <- as.numeric(fjfj$BA_GENO_PROB)
  #  fjfj$BB_GENO_PROB <- as.numeric(fjfj$BB_GENO_PROB)
  #  fjfj <- fjfj[,c("SNP_ID", "AA_GENO_PROB", "AB_GENO_PROB", "BA_GENO_PROB", "BB_GENO_PROB")]

  if(sum(c("SNP_ID", "SNP_ERROR_TILDE") %in% colnames(snp.error)) != 2) {
    stop("snp.error input must be a data frame containing the following headings: SNP_ID, SNP_ERROR_TILDE")
  }
  
  snp.error$SNP_ID       <- as.character(snp.error$SNP_ID)
  snp.error$SNP_ERROR_TILDE <- as.numeric(snp.error$SNP_ERROR_TILDE)
  snp.error <- snp.error[,c("SNP_ID", "SNP_ERROR_TILDE")]
  
  #  if(sum(c("SAMPLE_ID", "SNP_ID", "AA_GENO_PROB", "AB_GENO_PROB", "BA_GENO_PROB", "BB_GENO_PROB") %in% colnames(lhs)) != 6) {
  #    stop("lhs input must be a data frame containing the following headings: SAMPLE_ID, SNP_ID, AA_GENO_PROB, AB_GENO_PROB, BA_GENO_PROB, BB_GENO_PROB")
  #  }
  
  #  lhs$SAMPLE_ID     <- as.integer(lhs$SAMPLE_ID)
  #  lhs$SNP_ID       <- as.character(lhs$SNP_ID)
  #  lhs$AA_GENO_PROB <- as.numeric(lhs$AA_GENO_PROB)
  #  lhs$AB_GENO_PROB <- as.numeric(lhs$AB_GENO_PROB)
  #  lhs$BA_GENO_PROB <- as.numeric(lhs$BA_GENO_PROB)
  #  lhs$BB_GENO_PROB <- as.numeric(lhs$BB_GENO_PROB)
  #  lhs <- lhs[,c("SAMPLE_ID", "SNP_ID", "AA_GENO_PROB", "AB_GENO_PROB", "BA_GENO_PROB", "BB_GENO_PROB")]
  nlj <- nlj[order(nlj$SNP_ID),]
  lhs <- lhs[order(lhs$SNP_ID),]
  snp.error <- snp.error[order(snp.error$SNP_ID),]
  
  #Check that SNP_IDs the same in all data frames
  if(!(identical(unique(nlj$SNP_ID), unique(lhs$SNP_ID)))) {
    stop("SNP_ID in nlj must match those in lhs")
  }
  
  if(!(identical(unique(snp.error$SNP_ID), unique(lhs$SNP_ID)))) {
    stop("SNP_ID in snp.error must match those in lhs")
  }
  
  #Check that SNP_IDs the same in all data frames
  if(!(identical(unique(nlj$SNP_ID), unique(snp.error$SNP_ID)))) {
    stop("SNP_ID in nlj must match those in snp.error")
  }
  
  #End checks  
  
  # remove from consideration snp that have a 0 in any column of nj 
  snp.remove <- nlj[rowSums(nlj[,!colnames(nlj) %in% c("FAM_SET_COMBN_ID", "SNP_ID")] == 0) > 0,c("FAM_SET_COMBN_ID", "SNP_ID")]
  
  if(length(snp.remove) > 0) {
    for(set in unique(snp.remove[,"FAM_SET_COMBN_ID"])) {
      
      nlj <- nlj[!nlj[,"SNP_ID"] %in% snp.remove[snp.remove[,"FAM_SET_COMBN_ID"] == set,"SNP_ID"] & 
                   nlj[,"FAM_SET_COMBN_ID"] == set,]
      lhs <- lhs[!lhs[,"SNP_ID"] %in% snp.remove[snp.remove[,"FAM_SET_COMBN_ID"] == set,"SNP_ID"] & 
                   lhs[,"FAM_SET_COMBN_ID"] == set,]
      warning(paste("Removed SNP",snp.remove[snp.remove[,"FAM_SET_COMBN_ID"] == set,"SNP_ID"], "from tcj.adj and gkj.adj of FAM_SET_COMBN_ID", set, "as zeros present in nlj)"))
    }
  }
  
  adj.geno.probs <- NULL
  for (fam.set.combn in unique(nlj$FAM_SET_COMBN_ID)) {
    
    tmp.lhs <- lhs[lhs[,"FAM_SET_COMBN_ID"] == fam.set.combn,]
    tmp.lhs <- tmp.lhs[,colnames(tmp.lhs) != "FAM_SET_COMBN_ID"]
    tmp.nlj <- nlj[nlj[,"FAM_SET_COMBN_ID"] == fam.set.combn,]   
    tmp.nlj <- tmp.nlj[,colnames(tmp.nlj) != "FAM_SET_COMBN_ID"]  
    
    #Replace missing values in tmp.lhs with those from tmp.nlj
    tmp.lhs <- merge(tmp.lhs, tmp.nlj, by = "SNP_ID", suffix = c("","y"), all.x = TRUE) 
    tmp.lhs[,3:(ncol(tmp.lhs)/2+1)][rowSums(is.na(tmp.lhs[,3:(ncol(tmp.lhs)/2+1)])) > 0,] <- 
      tmp.lhs[,(ncol(tmp.lhs)/2+2):ncol(tmp.lhs)][rowSums(is.na(tmp.lhs[,3:(ncol(tmp.lhs)/2+1)])) > 0,]
    
    #Get SNP error
    # tmp.lhs <- merge(tmp.lhs, snp.error, by = "SNP_ID", all.x = TRUE)
    tmp.lhs$SNP_ID       <- as.character(tmp.lhs$SNP_ID)
    snp.error$SNP_ID <- as.character(snp.error$SNP_ID)
    tmp.lhs <- left_join(tmp.lhs, snp.error, by = "SNP_ID")
    
    #Make adjustments to gkj and nj to account for SNP error 
    tmp.lhs[,3:((ncol(tmp.lhs)-1)/2+1)] <- tmp.lhs[,3:((ncol(tmp.lhs)-1)/2+1)]*(1-tmp.lhs[,ncol(tmp.lhs)])
    tmp.lhs[,((ncol(tmp.lhs)-1)/2+2):(ncol(tmp.lhs)-1)]   <- tmp.lhs[,((ncol(tmp.lhs)-1)/2+2):(ncol(tmp.lhs)-1)]*tmp.lhs[,ncol(tmp.lhs)] 
    tmp.lhs <- tmp.lhs[,-(ncol(tmp.lhs))] #remove error column
    tmp.lhs[,3:(ncol(tmp.lhs)/2+1)] <- tmp.lhs[,3:(ncol(tmp.lhs)/2+1)] + tmp.lhs[,(ncol(tmp.lhs)/2+2):ncol(tmp.lhs)]
    tmp.adj.geno.probs <- tmp.lhs[,1:(ncol(tmp.lhs)/2+1)]
    
    tmp.adj.geno.probs <- tmp.adj.geno.probs[order(tmp.adj.geno.probs[,"SNP_ID"]),]
    
    tmp.adj.geno.probs$FAM_SET_COMBN_ID <- fam.set.combn
    adj.geno.probs <- rbind(adj.geno.probs,tmp.adj.geno.probs)
  }
  
  return(adj.geno.probs)
}

#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################

#' @export

ml.fun <- function(g.d.klj.adj,
                   tclj,
                   snp.error,
                   nlj,
                   miss.parent.count,
                   parent.combns.by.fam.set.combn,
                   parent.combns,
                   meth) {
  
  print("Running ml.fun")
  
  wd <- getwd()
  
  dir.create(file.path(wd, "Results"), showWarnings = FALSE)
  setwd(file.path(wd, "Results"))
  
  g.d.klj.adj <- g.d.klj.adj[order(g.d.klj.adj[,"SAMPLE_ID"]),]
  
  tclj.adj <- adj.geno.prob.fun(lhs= tclj,
                                snp.error=snp.error,
                                nlj = nlj)
  
  tclj.adj <- tclj.adj[order(tclj.adj[,"PARENT_COMBN_ID"]),]
  
  duos <- parents.to.pools.lod.fun(g.d.klj.adj = g.d.klj.adj,
                                   tclj.adj = tclj.adj,  
                                   miss.parent.count = miss.parent.count,
                                   nlj = nlj,
                                   parent.combns.by.fam.set.combn = parent.combns.by.fam.set.combn)
  duos.lod  <- duos$duos.lod
  duos.logl <- duos$duos.logl
  rm(duos)
  
  # duos.lod <- merge(duos.lod, parent.combns.by.fam.set.combn, by = c("SAMPLE_ID", "PARENT_COMBN_ID"), all.x = TRUE)
  duos.lod <- left_join(duos.lod, unique(g.d.klj.adj[,c("SAMPLE_ID", "FAM_SET_COMBN_ID")]), by = "SAMPLE_ID")
  duos.lod <- left_join(duos.lod, parent.combns.by.fam.set.combn, by = c("FAM_SET_COMBN_ID", "PARENT_COMBN_ID"))
  
  duos.lod <- duos.lod[order(duos.lod$LOD, decreasing = TRUE),]
  duos.lod <- duos.lod[order(duos.lod$SAMPLE_ID),]
  
  most.like.parents <- most.like.parents.duo.fun(duos.lod = duos.lod,
                                                 parent.combns = parent.combns) 
  
  if(!exists("running.sim")) {
    try(plot.lod.vs.LogL.range.fun(lod = duos.lod[,c("SAMPLE_ID", "RANGE_5_TO_95_LOGL", "LOD")],
                                   meth = meth)
    )
  }
  setwd(wd)
  
  return(list(g.d.klj.adj = g.d.klj.adj,
              tclj.adj = tclj.adj,
              duos.lod = duos.lod,
              duos.logl = duos.logl,
              most.like.parents = most.like.parents))
}

#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################

#' @export

parents.to.pools.lod.fun <- function(g.d.klj.adj,
                                     tclj.adj,  
                                     miss.parent.count = NULL,
                                     nlj,
                                     parent.combns.by.fam.set.combn) {  
  
  # fjfj: Data frame.  Output of flj.from.parent.Gij.fun or flj.from.snp.dat.fun
  #          1.   SNP_ID is the snp identifier
  #          2-3. A_TRANS_PROB, B_TRANS_PROB are the probabilities of allele transmission 
  #               for alleles A and B respectively computed from average parental Gij or parental genotype counts (i.e. the elements of 
  #               the flj vector).
  #          4-7. AA_GENO_PROB, AB_GENO_PROB, BA_GENO_PROB and BB_GENO_PROB: elements of the fjfj matrix (see the 
  #               bottom left of page 4 of Henshall et al. 2014
  #          8. SAMPLE_ID is the individual identifier (all values are zero and represent unknown individuals)
  
  # miss.parent.count: Data frame
  #          PARENT_COMBN_ID     
  #          SNP_ID         
  #          MISS_PARENT_COUNT
  
  #Returns##########################################
  
  # duos.lod: Data frame.
  #          1. OFFSPRING_ID: Offspring identifier from individual identifier in Gij
  #          2. PARENT_COMBN_ID: Combinations of families 
  #          3. SNP_COUNT: The number of SNPS for which none of the sire, dam or offspring genotypes are missing
  #          4. MIN_LOGL: Minimum log-likelihood ratio of SNPs (that were not excluded) in duo.  
  #          5. MAX_LOGL: Maximum log-likelihood ratio of SNPs in duo.
  #          6. RANGE_5_TO_95_LOGL: 5-95 percentile range of log-likelihood ratio
  #          7. LOD: Summed log-likelihood ratios (log odds score) for the offspring, sire, dam combination.
  
  # duos.logl: Data frame.
  #          1. SNP_ID:	SNP identifier
  #          2. OFFSPRING_ID:	Offspring identifier
  #          3. PARENT_COMBN_ID: Combinations of families 
  #          4. LIKE_NUMERATOR:	The likelihood of the offspring, sire and dam duo for the given SNP_  See Equation 3 of Henshall et al. (2014).
  #          5. LIKE_DENOMINATOR:	"The likelihood under the null hypothesis that the offspring is unrelated to the sire and dam, which is constructed by treating both parents as missing" Henshall et al. (2014).
  #          6. LOG_LIKE_RATIO:	Log likelihood ratio of the offspring, sire and dam duo for the given SNP_
  
  print("Running parents.to.pools.lod.fun")
  
  # print("Still running fam.to.pools.lod.fun")
  
  #Loop through pools 
  
  #seed data 
  duos.lod <- NULL
  n.in.pools <- (ncol(parent.combns.by.fam.set.combn)-2)/2
  
  #identify pools 
  pools <- unique(g.d.klj.adj$SAMPLE_ID)
  
  #generatefam.set.combns.by.pool
  fam.set.combns.by.pool <- unique(g.d.klj.adj[,c("SAMPLE_ID", "FAM_SET_COMBN_ID")])
  g.d.pj.adj <- g.d.klj.adj[,colnames(g.d.klj.adj) != "FAM_SET_COMBN_ID"]
  rm(g.d.klj.adj)
  
  #remove duplicated parent combinations within FAM_SET_COMBN_ID
  parent.combns.by.fam.set.combn <- parent.combns.by.fam.set.combn[!duplicated(parent.combns.by.fam.set.combn[,c("FAM_SET_COMBN_ID","PARENT_COMBN_ID")]), ]
  
  #Add MISS_PARENT_SNP_DATA_PROP to miss.parent.count
  if(!is.null(miss.parent.count)) {
    miss.parent.count[,"MISS_PARENT_SNP_DATA_PROP"]  <- miss.parent.count[,"MISS_PARENT_COUNT"] / (n.in.pools * 2)
    miss.parent.count <- miss.parent.count[,c("PARENT_COMBN_ID", "SNP_ID", "MISS_PARENT_SNP_DATA_PROP")]
  }  
  
  for(i in 1:length(pools)) {
    
    pool <- pools[i] #pools identifier    
    print(paste("Computing duo LODs for sample", pool, "-", i, "of", length(pools)))
    
    fam.set.combn <-fam.set.combns.by.pool[fam.set.combns.by.pool[,"SAMPLE_ID"] == pool,"FAM_SET_COMBN_ID"]
    
    #only retain parent combinations for the fam.set.combn
    t.adj.cj <- tclj.adj[tclj.adj[,"PARENT_COMBN_ID"] %in% 
                           parent.combns.by.fam.set.combn[parent.combns.by.fam.set.combn[,"FAM_SET_COMBN_ID"] == fam.set.combn, "PARENT_COMBN_ID"],]
    t.adj.cj <- t.adj.cj[,colnames(t.adj.cj) != "FAM_SET_COMBN_ID"]
    
    nj <- nlj[nlj[,"FAM_SET_COMBN_ID"] == fam.set.combn,]
    nj <- nj[,colnames(nj) != "FAM_SET_COMBN_ID"]
    
    print("Generating log likelihood numerators")    
    
    # like.num <- merge(t.adj.cj,
    #                   g.d.pj.adj[g.d.pj.adj[,"SAMPLE_ID"] == pool,],
    #                   by = "SNP_ID",
    #                   suffixes = c("_TRANS","_GENO_PROB"),
    #                   all = TRUE
    # )    
    
    like.num <- inner_join(t.adj.cj, 
                           g.d.pj.adj[g.d.pj.adj[,"SAMPLE_ID"] == pool,], 
                           suffix = c("_TRANS","_GENO_PROB"),
                           by = "SNP_ID")
    like.num <- like.num[order(like.num$SNP_ID),]
    
    #Generate list of genotypes
    genotypes <- genotypes.fun(n=(2*n.in.pools))
    
    like.num[,"LIKE_NUMERATOR"] <- rowSums(like.num[,paste(genotypes, "_TRANS", sep="")] * 
                                             like.num[,paste(genotypes, "_GENO_PROB", sep="")])
    
    like.num <- like.num[,c("SNP_ID", "SAMPLE_ID", "PARENT_COMBN_ID", "LIKE_NUMERATOR")]
    
    if (nrow(like.num) > 0) {
      
      print("Generating log likelihood denominators")
      
      #Denominator in likelihood ratio   
      #  like.den <- merge(nj,
      #                    g.d.pj.adj[g.d.pj.adj[,"SAMPLE_ID"] == pool,],
      #                    by = "SNP_ID",
      #                    suffixes = c("_TRANS","_GENO_PROB"),
      #                    all = TRUE
      #  )
      
      like.den <- inner_join(nj, 
                             g.d.pj.adj[g.d.pj.adj[,"SAMPLE_ID"] == pool,], 
                             suffix = c("_TRANS","_GENO_PROB"),
                             by = "SNP_ID")
      
      #Equation 3 of Henshall et al. 2014
      like.den[,"LIKE_DENOMINATOR"] <- rowSums(like.den[,paste(genotypes, "_TRANS", sep="")] * 
                                                 like.den[,paste(genotypes, "_GENO_PROB", sep="")])
      
      like.den <- like.den[,c("SNP_ID", "LIKE_DENOMINATOR")]
      
      print("Generating log likelihood ratios")
      
      #Likelihood ratio
      # tmp.duos <- merge(like.num, like.den, by = "SNP_ID", all.x = TRUE)
      tmp.duos <- left_join(like.num, like.den, by = "SNP_ID")
      tmp.duos$LOG_LIKE_RATIO <- log(tmp.duos$LIKE_NUMERATOR) - log(tmp.duos$LIKE_DENOMINATOR)
      tmp.duos$PARENT_COMBN_ID  <- as.integer(tmp.duos$PARENT_COMBN_ID)
      
      #Add columns MISS_PARENT_SNP_DATA_PROP and MISS_POOL_SNP_DATA_PROP to tmp.duos
      
      if(is.null(miss.parent.count)) {
        tmp.duos[,"MISS_PARENT_SNP_DATA_PROP"] <- NA
      } else {
        #tmp.duos <- merge(tmp.duos, miss.parent.count, by = c("PARENT_COMBN_ID", "SNP_ID"), all.x = TRUE)
        tmp.duos$PARENT_COMBN_ID <- as.integer(tmp.duos$PARENT_COMBN_ID)
        tmp.duos <- left_join(tmp.duos, miss.parent.count,by = c("PARENT_COMBN_ID" , "SNP_ID"))
        tmp.duos <- tmp.duos[order(tmp.duos$PARENT_COMBN_ID),]
      }  
      
      miss.g.d.pj.adj <- g.d.pj.adj[,c("SNP_ID", "SAMPLE_ID")]
      miss.g.d.pj.adj[,"MISS_POOL_SNP_DATA_PROP"] <- as.numeric(is.na(g.d.pj.adj[,3]))
      # tmp.duos <- merge(tmp.duos, miss.g.d.pj.adj, by = c("SNP_ID", "SAMPLE_ID"), all.x = TRUE)
      tmp.duos <- left_join(tmp.duos, miss.g.d.pj.adj, by = c("SNP_ID" , "SAMPLE_ID"))
      tmp.duos <- tmp.duos[order(tmp.duos$SNP_ID),]
      rm(miss.g.d.pj.adj)
      
      #Retain all data in tmp.duos
      duos.logl.all <- tmp.duos
      
      duos.logl <- tmp.duos
      
      print("Generating LODs")
      
      #sum across all snp to get 'log odds score' (LOD)  
      tmp.duos <- aggregate(tmp.duos$LOG_LIKE_RATIO, by = list(tmp.duos$SAMPLE_ID,
                                                               tmp.duos$PARENT_COMBN_ID), 
                            na.rm=T, 
                            FUN = "sum")  
      colnames(tmp.duos) <- c("SAMPLE_ID", "PARENT_COMBN_ID", "LOD")
      #  tmp.duos$SAMPLE_ID <- pool
      tmp.duos <- tmp.duos[,c("SAMPLE_ID", "PARENT_COMBN_ID", "LOD")]
      
      
      #Get maximum LOG_LIKE_RATIO
      
      print("Determining maximum log likelihood ratios")
      max.logl <- suppressWarnings(aggregate(duos.logl$LOG_LIKE_RATIO, by = list(duos.logl$SAMPLE_ID,
                                                                                 duos.logl$PARENT_COMBN_ID), 
                                             na.rm=T, 
                                             FUN = "max"))
      colnames(max.logl) <- c("SAMPLE_ID","PARENT_COMBN_ID", "MAX_LOGL")
      
      #    max.logl <- merge(max.logl, 
      #                      duos.logl[, c("PARENT_COMBN_ID", "LOG_LIKE_RATIO", "SNP_ID")], 
      #                      by.x = c("PARENT_COMBN_ID", "MAX_LOGL"),
      #                      by.y = c("PARENT_COMBN_ID", "LOG_LIKE_RATIO"),
      #                      all.x = TRUE)
      
      tmp <- c("PARENT_COMBN_ID" = "PARENT_COMBN_ID", "MAX_LOGL" = "LOG_LIKE_RATIO")
      max.logl <- left_join(max.logl, 
                            duos.logl[, c("PARENT_COMBN_ID", "LOG_LIKE_RATIO", "SNP_ID")], 
                            by = tmp)
      rm(tmp)
      
      colnames(max.logl) <- c("SAMPLE_ID", "PARENT_COMBN_ID", "MAX_LOGL", "MAX_LOGL_SNP")
      #    max.logl$SAMPLE_ID   <- pool
      #max.logl           <- max.logl[,c("SAMPLE_ID", "PARENT_COMBN_ID", "MAX_LOGL", "MAX_LOGL_SNP")]
      
      #Get minimum LOG_LIKE_RATIO
      
      print("Determining minimum log likelihood ratios")
      
      min.logl <- suppressWarnings(aggregate(duos.logl$LOG_LIKE_RATIO, by = list(duos.logl$SAMPLE_ID,
                                                                                 duos.logl$PARENT_COMBN_ID), 
                                             na.rm=T, 
                                             FUN = "min")) 
      colnames(min.logl) <- c("SAMPLE_ID","PARENT_COMBN_ID", "MIN_LOGL")
      
      #  min.logl <- merge(min.logl, 
      #                    duos.logl[, c("PARENT_COMBN_ID", "LOG_LIKE_RATIO", "SNP_ID")], 
      #                    by.x = c("PARENT_COMBN_ID", "MIN_LOGL"),
      #                    by.y = c("PARENT_COMBN_ID", "LOG_LIKE_RATIO"),
      #                    all.x = TRUE)
      
      tmp <- c("PARENT_COMBN_ID" = "PARENT_COMBN_ID", "MIN_LOGL" = "LOG_LIKE_RATIO")
      min.logl <- left_join(min.logl, 
                            duos.logl[, c("PARENT_COMBN_ID", "LOG_LIKE_RATIO", "SNP_ID")], 
                            by = tmp)
      rm(tmp)
      
      colnames(min.logl) <- c("SAMPLE_ID","PARENT_COMBN_ID", "MIN_LOGL",  "MIN_LOGL_SNP")
      # min.logl$SAMPLE_ID <- pool
      #min.logl <- min.logl[,c("SAMPLE_ID", "PARENT_COMBN_ID", "MIN_LOGL", "MIN_LOGL_SNP")]
      
      #Get 5 to 95 quantile range LOG_LIKE_RATIO
      
      print("Determining 5 to 95 quantile range of log likelihood ratios")
      
      ipr.fun <- function(x) {
        quantile(x, c(0.95), na.rm = TRUE) - quantile(x, c(0.05), na.rm = TRUE) 
      }
      iqr.logl <- aggregate(duos.logl$LOG_LIKE_RATIO, by = list(duos.logl$SAMPLE_ID,
                                                                duos.logl$PARENT_COMBN_ID), 
                            #    na.rm=T, 
                            FUN = ipr.fun)  
      colnames(iqr.logl) <- c("SAMPLE_ID", "PARENT_COMBN_ID", "RANGE_5_TO_95_LOGL")
      #  iqr.logl$SAMPLE_ID <- pool
      #  iqr.logl <- iqr.logl[,c("SAMPLE_ID", "PARENT_COMBN_ID", "RANGE_5_TO_95_LOGL")]
      
      print("Determining missing log likelihood ratios")
      
      #geg columns MISS_PARENT_SNP_DATA_PROP and MISS_POOL_SNP_DATA_PROP and NO_MISS_PARENT_OR_POOL_PROP for merge with tmp.duos
      miss.logl <- aggregate(cbind(duos.logl[,c("MISS_PARENT_SNP_DATA_PROP", "MISS_POOL_SNP_DATA_PROP")],
                                   duos.logl[,"MISS_PARENT_SNP_DATA_PROP"] == 0 & duos.logl[,"MISS_POOL_SNP_DATA_PROP"] == 0), 
                             by = list(duos.logl[,c("PARENT_COMBN_ID")], duos.logl[,c("SAMPLE_ID")]),
                             FUN="mean")
      colnames(miss.logl) <- c("PARENT_COMBN_ID", "SAMPLE_ID", "MISS_PARENT_SNP_DATA_PROP", "MISS_POOL_SNP_DATA_PROP", 
                               "NO_MISS_PARENT_OR_POOL_PROP")
      
      #Merge
      tmp.duos <- merge(iqr.logl, tmp.duos, by = c("SAMPLE_ID", "PARENT_COMBN_ID"), all.y = TRUE)
      tmp.duos <- merge(max.logl, tmp.duos, by = c("SAMPLE_ID", "PARENT_COMBN_ID"), all.y = TRUE)
      tmp.duos <- merge(min.logl, tmp.duos, by = c("SAMPLE_ID", "PARENT_COMBN_ID"), all.y = TRUE)
      tmp.duos <- merge(miss.logl, tmp.duos, by = c("SAMPLE_ID", "PARENT_COMBN_ID"), all.y = TRUE)      
      
      duos.lod <- rbind(duos.lod,tmp.duos)
      #     rm(like.num, like.den, tmp.duos)
    } #END if (nrow(like.num) > 0) {
  } #END   for(i in 1:length(pools)) {
  
  return(list(duos.lod = duos.lod, duos.logl = duos.logl.all))
}


#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################

#' @export

most.like.parents.duo.fun <- function(duos.lod,
                                      parent.combns) {
  
  #Description: Identifies the family that each inidividual is most likely to belong to.
  
  #  Args:
  # duos.lod: Data frame.
  #          1. FAM_COMBN_ID: Family combination identifier from parent.combns.by.fam.set.combn
  #          2. SAMPLE_ID: Sample identifier
  
  #          ????. SNP_COUNT: The number of SNPS for which none of the sire, dam or offspring genotypes are missing
  
  #          3. MIN_LOGL: Minimum log-likelihood ratio of SNPs in duo
  #          4. MIN_LOGL_SNP: SNP associated with MIN_LOGL
  #          5. MAX_LOGL: Maximum log-likelihood ratio of SNPs in duo
  #          6. MAX_LOGL_SNP: SNP associated with MAX_LOGL
  #          7. RANGE_5_TO_95_LOGL: RANGE_5_TO_95_LOGL: 5-95 percentile range of log-likelihood ratio
  #          8. LOD: Summed log-likelihood ratios (log odds score) for the FAM_COMBN_ID and SAMPLE_ID duo.
  #          9. FAMILY_ID_1: Family 1 in FAM_COMBN_ID in parent.combns.by.fam.set.combn
  #          etc
  
  # parent.combns.by.fam.set.combn
  
  
  
  
  #  Returns:
  #parent.assign: Data frame.
  #          1. FAM_COMBN_ID: Family combination identifier from parent.combns.by.fam.set.combn with the highest LOD in combination with SAMPLE_ID
  #          2. SAMPLE_ID: Sample identifier
  
  #          ????. SNP_COUNT: The number of SNPS for which none of the sire, dam or offspring genotypes are missing
  
  #          3. MIN_LOGL: Minimum log-likelihood ratio of SNPs in duo
  #          4. MIN_LOGL_SNP: SNP associated with MIN_LOGL
  #          5. MAX_LOGL: Maximum log-likelihood ratio of SNPs in duo
  #          6. MAX_LOGL_SNP: SNP associated with MAX_LOGL
  #          7. RANGE_5_TO_95_LOGL: RANGE_5_TO_95_LOGL: 5-95 percentile range of log-likelihood ratio
  #          8. LOD: Summed log-likelihood ratios (log odds score) for the FAM_COMBN_ID and SAMPLE_ID duo.
  #          9. FAMILY_ID_1: Family 1 in FAM_COMBN_ID in parent.combns.by.fam.set.combn
  #          10 FAM_1_DELTA_LOD: Highest LOD of FAM_COMBN SAMPLE_ID if FAMILY_ID_1 excluded
  #          etc
  
  print("Running most.like.parents.duo.fun")
  
  #remove FAM_COMBN_ID containing duplicated PARENT_COMBN_ID
  tmp <- parent.combns[duplicated(parent.combns[,"PARENT_COMBN_ID"]),
                       "PARENT_COMBN_ID"]
  
  if(length(tmp) != 0) {
    parent.combns[parent.combns[,"PARENT_COMBN_ID"] %in% tmp,grep("FAM",colnames(parent.combns))] <- NA
  }
  parent.combns.unambiguous <- parent.combns
  #remove PARENT_IDs
  parent.combns.unambiguous <- parent.combns.unambiguous[,colnames(parent.combns.unambiguous)[c(grep("FAM", colnames(parent.combns.unambiguous)), 
                                                                                                grep("PARENT_COMBN_ID", colnames(parent.combns.unambiguous)))]
                                                         ]
  
  #Obtain most likely family (i.e. maximum LOD) for each offspring
  most.like           <- aggregate(duos.lod$LOD, by = list(duos.lod$SAMPLE_ID), na.rm=T, FUN = "max")      
  colnames(most.like) <- c("SAMPLE_ID", "LOD")
  
  # most.like <- merge(most.like, duos.lod, by = c("SAMPLE_ID", "LOD"), all.x = TRUE)
  most.like <- left_join(most.like, duos.lod, by = c("SAMPLE_ID", "LOD"))
  # most.like <- merge(most.like, parent.combns.unambiguous, by = c("PARENT_COMBN_ID"), all.x = TRUE)
  most.like <- left_join(most.like, parent.combns.unambiguous, by = c("PARENT_COMBN_ID"))
  most.like <- unique(most.like)
  
  #Remove most likely duo to obtain second most likely
  most.like.parents.list <- most.like[,c("SAMPLE_ID", "PARENT_COMBN_ID")]
  # most.like.parents.list <- merge(most.like.parents.list, 
  #                                  parent.combns[,colnames(parent.combns)[grep("PARENT", colnames(parent.combns))] ]  , 
  #                                  by = c("PARENT_COMBN_ID"), all.x = TRUE)
  most.like.parents.list <- left_join(most.like.parents.list, 
                                      parent.combns[,colnames(parent.combns)[grep("PARENT", colnames(parent.combns))] ], 
                                      by = c("PARENT_COMBN_ID"))
  #cycle for n.in.pools
  n.in.pools <- (ncol(parent.combns[,colnames(parent.combns)[grep("PARENT", colnames(parent.combns))] ] )-1)/2
  
  #identify samples with ambiguous parent allocation
  most.like.parents.list <- unique(most.like.parents.list)
  ambig.assign.list <- NULL
  ambig.assign.list <- unique(most.like.parents.list[duplicated(most.like.parents.list$SAMPLE_ID),"SAMPLE_ID"])
  
  #remove duplicate sample ids and then overwrite outputs for samples in ambig.assign.list at end
  most.like.parents.list <- most.like.parents.list[!duplicated(most.like.parents.list$SAMPLE_ID),]
  
  for(i in 1:(2*n.in.pools)) {
    #identify parent i for each sample
    
    tmp.remove           <- most.like.parents.list[,c("SAMPLE_ID",paste("PARENT_",i,sep = ""))]
    
    colnames(tmp.remove) <- c("SAMPLE_ID", "PARENT_REMOVE")
    tmp.remove[,"PARENT_REMOVE_COUNT"] <- rowSums(most.like.parents.list[,paste("PARENT_",1:(2*n.in.pools),sep = "")] == 
                                                    tmp.remove[,"PARENT_REMOVE"])
    tmp.remove <- unique(tmp.remove)
    
    # duos.lod.removed     <- merge(duos.lod, tmp.remove, by = "SAMPLE_ID", all.x = TRUE)
    duos.lod.removed <- left_join(duos.lod, tmp.remove, by = "SAMPLE_ID")
    
    duos.lod.removed[,"REMOVE"] <- 
      rowSums(duos.lod.removed[,paste("PARENT_",1:(2*n.in.pools),sep = "")] == 
                duos.lod.removed[,"PARENT_REMOVE"]) >= 
      duos.lod.removed[,"PARENT_REMOVE_COUNT"]
    
    duos.lod.removed <- duos.lod.removed[!duos.lod.removed$REMOVE,]
    
    if(nrow(duos.lod.removed) > 0) { 
      #Obtain second most likely family (i.e. maximum LOD) for each offspring
      second.most.like <-  aggregate(duos.lod.removed$LOD, 
                                     by = list(duos.lod.removed$SAMPLE_ID), 
                                     na.rm=T, FUN = "max") 
      colnames(second.most.like) <- c("SAMPLE_ID", paste("PARENT_",i, "_ALT_LOD",sep = ""))
      second.most.like <- second.most.like[!duplicated(second.most.like),]
      # second.most.like <- merge(tmp.remove, second.most.like, by = "SAMPLE_ID", all.x = TRUE) #in case not all samples in second.most.like
      second.most.like <- left_join(tmp.remove, second.most.like, by = "SAMPLE_ID")
      
      
      #Identify alternative parent
      alt.parents <- merge(second.most.like,
                           duos.lod.removed, 
                           by.x = c("SAMPLE_ID", paste("PARENT_",i, "_ALT_LOD",sep = "")),
                           by.y  = c("SAMPLE_ID", "LOD"), 
                           all.x = TRUE)[,c("SAMPLE_ID", "PARENT_COMBN_ID", paste("PARENT_",1:(2*n.in.pools),sep = ""))]
      colnames(alt.parents) <- c("SAMPLE_ID", "PARENT_COMBN_ID", paste("ALT_PARENT_",1:(2*n.in.pools),sep = ""))
      
      multi.alt.parents <-  aggregate(!is.na(alt.parents$SAMPLE_ID), 
                                      by = list(alt.parents$SAMPLE_ID), 
                                      na.rm=T, FUN = "sum") 
      alt.parents <- alt.parents[!duplicated(alt.parents[,"SAMPLE_ID"]),]
      alt.parent <- NULL
      
      for(r in 1:nrow(alt.parents)) {
        
        tmp.1 <- aggregate(t(!is.na(most.like.parents.list[r,3:(2*n.in.pools+2)])), 
                           by = list(as.integer(most.like.parents.list[r,3:(2*n.in.pools+2)])), 
                           na.rm=T, FUN = "sum") 
        
        tmp.2 <-  aggregate(t(!is.na(alt.parents[r,3:(2*n.in.pools+2)])), 
                            by = list(as.integer(alt.parents[r,3:(2*n.in.pools+2)])), 
                            na.rm=T, FUN = "sum") 
        if (nrow(tmp.2) > 0) {
          tmp.3 <- merge(tmp.1, tmp.2, by = 1, all = TRUE)
        } else {
          tmp.3 <- tmp.1
          tmp.3[,3] <- NA
        }
        
        tmp.3[is.na(tmp.3[,1]),1] <- 0
        tmp.3[is.na(tmp.3[,2]),2] <- 0
        tmp.3[is.na(tmp.3[,3]),3] <- 0
        
        tmp <- tmp.3[tmp.3[,3]-tmp.3[,2] == 1,1]
        if(length(tmp) != 1) {tmp <- NA} #more than one parent changed or no alternatives
        rm(tmp.1, tmp.2, tmp.3)
        # tmp <- setdiff(alt.parents[r,3:(2*n.in.pools+2)], most.like.parents.list[r,3:(2*n.in.pools+2)])
        #tmp <- c(alt.parents[r,"SAMPLE_ID"], tmp)
        alt.parent <- c(alt.parent,tmp)
      }
      
      alt.parent <-  unlist(alt.parent)
      alt.parent <-  cbind(multi.alt.parents, alt.parent)
      colnames(alt.parent) <- c("SAMPLE_ID", "COUNT", paste("ALT_PARENT_", i, sep=""))
      alt.parent[alt.parent[,"COUNT"] > 1,paste("ALT_PARENT_", i, sep="")] <- NA
      alt.parent <- alt.parent[,c("SAMPLE_ID", paste("ALT_PARENT_", i, sep=""))]
      
      # most.like <- merge(most.like, alt.parent, by = "SAMPLE_ID", all.x = TRUE)
      most.like <- left_join(most.like, alt.parent, by = "SAMPLE_ID")
      
      alt.combn <- alt.parents[,c("SAMPLE_ID", "PARENT_COMBN_ID")]
      colnames(alt.combn) <- c("SAMPLE_ID", paste("ALT_PARENT_COMBN_", i, sep=""))
      # most.like <- merge(most.like, alt.combn, by = "SAMPLE_ID", all.x = TRUE)
      most.like <- left_join(most.like, alt.combn, by = "SAMPLE_ID")
      
      rm(tmp, alt.parents,multi.alt.parents, alt.parent, alt.combn)
      
      #Get delta LOD
      second.most.like <- second.most.like[,c("SAMPLE_ID", paste("PARENT_",i, "_ALT_LOD",sep = ""))]
      #most.like <- merge(most.like, second.most.like, by = "SAMPLE_ID", all.x = TRUE)
      most.like <- left_join(most.like, second.most.like, by = "SAMPLE_ID")
      
      most.like[,paste("PARENT_",i, "_DELTA_LOD",sep = "")] <-
        most.like[,"LOD"] - most.like[,paste("PARENT_",i, "_ALT_LOD",sep = "")]
      most.like <- most.like[,colnames(most.like) !=  paste("PARENT_",i, "_ALT_LOD",sep = "")]
      
    } else {
      most.like[,paste("PARENT_",i, "_DELTA_LOD",sep = "")] <- NA
      most.like[,paste("ALT_PARENT_",i, sep = "")] <- NA
      most.like[,paste("ALT_PARENT_COMBN_", i, sep = "")] <- NA
    }
  }
  
  #column order
  
  #Alternate family ids
  parent.combns.unambiguous <- parent.combns.unambiguous[,c("PARENT_COMBN_ID", "FAM_COMBN_ID")]
  colnames(parent.combns.unambiguous) <- c("PARENT_COMBN_ID", "ALT_FAM_COMBN_ID")
  parent.combns.unambiguous <- parent.combns.unambiguous[!is.na(parent.combns.unambiguous[,"ALT_FAM_COMBN_ID"]),]
  parent.combns.unambiguous <- unique(parent.combns.unambiguous)
  
  for(i in 1:(2*n.in.pools)) {
    
    most.like <- merge(most.like, 
                       parent.combns.unambiguous, 
                       by.x = paste("ALT_PARENT_COMBN_",i, sep=""),
                       by.y = "PARENT_COMBN_ID",
                       all.x = TRUE)
    colnames(most.like)[colnames(most.like) == "ALT_FAM_COMBN_ID"] <-  paste("ALT_FAM_COMBN_",i, sep="")
  }
  
  most.like <- most.like[,c(colnames(duos.lod), 
                            "FAM_COMBN_ID",
                            paste("FAMILY_ID_",1:n.in.pools,sep = ""),
                            paste("PARENT_",1:(2*n.in.pools), "_DELTA_LOD",sep = ""),
                            paste("ALT_PARENT_",1:(2*n.in.pools),sep = ""),
                            paste("ALT_PARENT_COMBN_",1:(2*n.in.pools),sep = ""),
                            paste("ALT_FAM_COMBN_",1:(2*n.in.pools),sep = ""))
                         ]
  
  #Remove data for samples in ambig.assign.list
  most.like <- most.like[!most.like[,"SAMPLE_ID"] %in% ambig.assign.list,]
  #Add row of NA for samples in ambig.assign.list
  # most.like <- merge(most.like, data.frame(SAMPLE_ID = ambig.assign.list), by = "SAMPLE_ID", all = TRUE)
  tmp <- data.frame(SAMPLE_ID = ambig.assign.list)
  most.like <- left_join(most.like, tmp, by = "SAMPLE_ID")
  
  most.like <- most.like[order(most.like[,"SAMPLE_ID"]),]
  
  return(most.like)
}


#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################

#' @export

plot.lod.vs.LogL.range.fun <- function(lod, meth) {
  
  #generates scatterplot
  
  #Args##########################################
  
  # trios.lod: Data frame.  Output of trios.lod.fun
  #          1. OFFSPRING_ID: Offspring identifier from individual identifier in Gij
  #          9. RANGE_5_TO_95_LOGL: 5-95 percentile range of log-likelihood ratio
  #          10. LOD: Summed log-likelihood ratios (log odds score) for the offspring, sire, dam combination.
  
  #meth = current method
  
  #Returns##########################################
  
  # scatterplot:  Plot of LOD vs RANGE_5_TO_95_LOGL returned 
  #               in a directory named "lod.scatter" in the working directory. 
  
  print("Running plot.lod.vs.LogL.range.fun")
  
  #Check that all headings are present in inputs  
  #  if(sum(c("OFFSPRING_ID",	"SIRE_ID",	"DAM_ID",	"FAMILY_ID",	"SNP_COUNT",	"SNP_EXCLUDED",	"MIN_LOGL",	"MIN_LOGL_SNP",
  #           "MAX_LOGL",	"MAX_LOGL_SNP",	"RANGE_5_TO_95_LOGL",	"LOD") %in% colnames(trios.lod)) != 12) {
  #    stop("indiv.snp.dat.indiv input must be a data frame containing the following headings: OFFSPRING_ID, SIRE_ID, DAM_ID, FAMILY_ID, SNP_COUNT, SNP_EXCLUDED, MIN_LOGL, MIN_LOGL_SNP, MAX_LOGL, MAX_LOGL_SNP, RANGE_5_TO_95_LOGL, LOD")
  #  }
  
  #Name columns and assign class
  
  #  trios.lod$OFFSPRING_ID  <- as.character(trios.lod$OFFSPRING_ID)
  #  trios.lod$RANGE_5_TO_95_LOGL    <- as.numeric(trios.lod$RANGE_5_TO_95_LOGL)
  #  trios.lod$LOD    <- as.numeric(trios.lod$LOD)
  
  wd <- getwd()
  #Generate directory for plots if does not exist
  dir.create(file.path(wd, "lod.scatter"), showWarnings = FALSE)
  setwd(file.path(wd, "lod.scatter"))
  
  #Loop through offspring
  for(sample in unique(lod[,1])) {
    
    tmp.lod <- lod[lod[,1] == sample,]
    
    #Define plot variables and data
    tmp.lod$X.AXIS <- tmp.lod$LOD
    x.title <- "Log odds (LOD) score"
    tmp.lod$Y.AXIS <- tmp.lod$RANGE_5_TO_95_LOGL    
    y.title <- "5-95 percentile range of log-likelihood ratio"
    main.title <- sample
    
    png(filename = paste(sample, meth, ".png", sep=""))
    
    #Generate plot
    plot(tmp.lod$X.AXIS,tmp.lod$Y.AXIS,
         type="p",pch = 16, cex = 0.5,
         main= main.title,
         xlab = x.title,
         ylab = y.title)
    
    dev.off()
  }
  setwd(wd)
}

#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################

#' @export

prelim.ml.discrete.assigned.genos.fun <- function(method,
  snp.dat.indiv,
  snp.dat.pools,
  parent.combns.by.fam.set.combn,
  flj,
  snp.error,
  nlj,
  fam.set.combns.by.pool) {
  
  print("Running prelim.ml.discrete.assigned.genos.fun")
  
  Dij <- Dij.from.snp.dat.fun(snp.dat.indiv = snp.dat.indiv) 
  
  tclj <- NULL
  for(fam.set.combn in unique(flj$FAM_SET_COMBN_ID)) {
    
    tmp.fj <- flj[flj[,"FAM_SET_COMBN_ID"] == fam.set.combn,]
    tmp.parent.combns <- parent.combns.by.fam.set.combn[parent.combns.by.fam.set.combn[,"FAM_SET_COMBN_ID"] == fam.set.combn,] 
    
    tmp.parents <- tmp.parent.combns[,!colnames(tmp.parent.combns) %in% c("FAM_SET_COMBN_ID", "PARENT_COMBN_ID")]
    tmp.parents <- unique(as.vector(as.matrix(tmp.parents)))
    
    tmp.tij <- Dij[Dij[,"SAMPLE_ID"] %in% tmp.parents, ]
    tmp.tclj <- tcj.fun(tij = tmp.tij, 
                        parent.combns = tmp.parent.combns, 
                        fj = tmp.fj)
    tmp.tclj$FAM_SET_COMBN_ID <- fam.set.combn
    tclj <- rbind(tclj, tmp.tclj)
  }
  rm(tmp.fj, tmp.parent.combns, tmp.parents, tmp.tij, tmp.tclj)
  
  miss.parent.count <- tclj[,c("PARENT_COMBN_ID", "SNP_ID", "MISS_PARENT_COUNT")]
  tclj               <- tclj[,colnames(tclj) != "MISS_PARENT_COUNT"] 
  
  if(!"B_ALLELE" %in% colnames(snp.dat.pools)) {
  snp.dat.pools <- left_join(snp.dat.pools, 
                             unique(snp.dat.indiv[snp.dat.indiv[,"SNP_ID"] %in% snp.dat.pools[,"SNP_ID"],
                                                          c("SNP_ID", "A_ALLELE", "B_ALLELE")]), 
                             by = "SNP_ID")
  }
  
  dkj <- dkj.from.snp.dat.pools.fun(snp.dat.pools = snp.dat.pools) 
  
  if("Discrete" %in% method) {
  g.d.klj.adj <- adj.geno.prob.fun(lhs = merge(dkj,fam.set.combns.by.pool, by = "SAMPLE_ID", all = TRUE),
                                   snp.error=snp.error,
                                   nlj = nlj)
  } else {
    g.d.klj.adj <- NULL
  }
  
  return(list(  Dij = Dij, 
                tclj = tclj,
                miss.parent.count = miss.parent.count,
                dkj = dkj,
                g.d.klj.adj = g.d.klj.adj))
}

#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################

#' @export

Dij.from.snp.dat.fun <- function(snp.dat.indiv) {
  
  #Returns########################################## the Dij matrix for each individual and SNP according to the descrition on page 5 of Henshall et al. 2014.   
  #Discrete genotypes assigned (1 for homozygote, 0.5 for heterozygote)
  #to an element of Dij based on GENOTYPE in snp.dat.indiv.
  
  #Args##########################################
  # snp.dat.indiv: Data frame
  #              SAMPLE_ID is the individual identifier
  #              SNP_ID   is the SNP identifier
  #              A_ALLELE  is the base represented by allele A
  #              B_ALLELE  is the base represented by allele B
  #              GENOTYPE is the SNP genotype call
  
  #Returns##########################################
  # D.matrices:     Data frame 
  #              1.   SAMPLE_ID is the individual identifier 
  #              2.   SNP_ID is the SNP identifier
  #              3-6. AA_GENO_PROB, AB_GENO_PROB, BA_GENO_PROB and BB_GENO_PROB: elements of the Dij matrix (see the 
  #                   top left of page 5 of Henshall et al. 2014
  #              7-8. A_TRANS_PROB, B_TRANS_PROB are the probabilities of allele transmission 
  #                   for alleles A and B respectively computed from Dij (i.e. the elements of 
  #                   the transmission (Tij) vector, Equation 2 of Henshall et al. 2014).
  
  print("Running Dij.from.snp.dat.fun")
  
  #Check that all headings are present in inputs  
  if(sum(c("SAMPLE_ID", "SNP_ID","A_ALLELE", "B_ALLELE", "GENOTYPE") %in% colnames(snp.dat.indiv)) != 5) {
    stop("snp.dat.indiv input must be a data frame containing the following headings: SAMPLE_ID, SNP_ID, A_ALLELE, B_ALLELE, GENOTYPE")
  }
  
  #Name columns and assign class
  #colnames(snp.dat.indiv) <- c("SAMPLE_ID", "SNP_ID", "INTENSITY_A", "INTENSITY_B", "A_ALLELE", "B_ALLELE", "GENOTYPE")
  snp.dat.indiv$SAMPLE_ID  <- as.integer(snp.dat.indiv$SAMPLE_ID)
  snp.dat.indiv$SNP_ID    <- as.character(snp.dat.indiv$SNP_ID)
  snp.dat.indiv$A_ALLELE   <- as.character(snp.dat.indiv$A_ALLELE)
  snp.dat.indiv$B_ALLELE   <- as.character(snp.dat.indiv$B_ALLELE)
  snp.dat.indiv$GENOTYPE  <- as.character(snp.dat.indiv$GENOTYPE)
  snp.dat.indiv <- snp.dat.indiv[,c("SAMPLE_ID", "SNP_ID", "A_ALLELE", "B_ALLELE", "GENOTYPE")]
  
  #Check for duplicated records in snp.dat.indiv
  indiv.snp <- paste(snp.dat.indiv$SAMPLE_ID,snp.dat.indiv$SNP_ID, sep=".")
  if(sum(duplicated(indiv.snp)) > 0) {
    stop("SAMPLE_ID and SNP_ID combinations are not unique in snp.dat.indiv.  Delete duplicates or recode SAMPLE_ID.")
  }
  rm(indiv.snp)
  
  #Check that there is only one nucleotide in column A_ALLELE for each SNP
  tmp.1 <-  unique(snp.dat.indiv[,c("SNP_ID","A_ALLELE")])
  if(sum(unique(snp.dat.indiv[,"SNP_ID"]) != tmp.1[,c("SNP_ID")]) > 0) {
    stop("There is more than one nucleotide in column A_ALLELE for at least one SNP")
  }
  rm(tmp.1)
  
  #Check that there is only one nucleotide in column B_ALLELE for each SNP  
  tmp.1 <-  unique(snp.dat.indiv[,c("SNP_ID","B_ALLELE")])
  if(sum(unique(snp.dat.indiv[,"SNP_ID"]) != tmp.1[,c("SNP_ID")]) > 0) {
    stop("There is more than one nucleotide in column B_ALLELE for at least one SNP")
  }
  rm(tmp.1)
  
  #Change missing values
  snp.dat.indiv[is.na(snp.dat.indiv[,"GENOTYPE"]), "GENOTYPE"] <- "No.call"
  
  snp.dat.indiv$ALLELES <- NA
  #Generate column of allele genotypes AA, AB, BB
  snp.dat.indiv[snp.dat.indiv[,"GENOTYPE"] == 
            paste(snp.dat.indiv[,"A_ALLELE"], snp.dat.indiv[,"A_ALLELE"], sep = ""),"ALLELES"] <- "AA"
  snp.dat.indiv[snp.dat.indiv[,"GENOTYPE"] == 
            paste(snp.dat.indiv[,"B_ALLELE"], snp.dat.indiv[,"B_ALLELE"], sep = ""),"ALLELES"] <- "BB"
  snp.dat.indiv[snp.dat.indiv[,"GENOTYPE"] == 
            paste(snp.dat.indiv[,"A_ALLELE"], snp.dat.indiv[,"B_ALLELE"], sep = ""),"ALLELES"] <- "AB"
  snp.dat.indiv[snp.dat.indiv[,"GENOTYPE"] == 
            paste(snp.dat.indiv[,"B_ALLELE"], snp.dat.indiv[,"A_ALLELE"], sep = ""),"ALLELES"] <- "BA"
  
  #Change missing values back to NA
  snp.dat.indiv[snp.dat.indiv[,"GENOTYPE"] == "No.call", "GENOTYPE"] <- NA
  
  snp.dat.indiv[!is.na(snp.dat.indiv[,"ALLELES"]) , "AA_GENO_PROB"] <- 0
  snp.dat.indiv[!is.na(snp.dat.indiv[,"ALLELES"]) , "AB_GENO_PROB"] <- 0
  snp.dat.indiv[!is.na(snp.dat.indiv[,"ALLELES"]) , "BA_GENO_PROB"] <- 0
  snp.dat.indiv[!is.na(snp.dat.indiv[,"ALLELES"]) , "BB_GENO_PROB"] <- 0
  
  #Assign descrete genotypes.  Note that it is assumed that Gij and Dij are symetrical (i.e. AB = BA = 0.5). 
  # See Henshall et al. 2014 in the top left of page 5.
  
  snp.dat.indiv[snp.dat.indiv[,"ALLELES"] == "AA" & !is.na(snp.dat.indiv[,"ALLELES"]) , "AA_GENO_PROB"] <- 1
  
  snp.dat.indiv[snp.dat.indiv[,"ALLELES"] == "AB" & !is.na(snp.dat.indiv[,"ALLELES"]) , "AB_GENO_PROB"] <- 1/2
  snp.dat.indiv[snp.dat.indiv[,"ALLELES"] == "BA" & !is.na(snp.dat.indiv[,"ALLELES"]) , "AB_GENO_PROB"] <- 1/2
  
  snp.dat.indiv["BA_GENO_PROB"] <- snp.dat.indiv["AB_GENO_PROB"] 
  
  snp.dat.indiv[snp.dat.indiv[,"ALLELES"] == "BB" & !is.na(snp.dat.indiv[,"ALLELES"]) , "BB_GENO_PROB"] <- 1
  
  #Equation 2 of Henshall et al. 2014 - T vector
  snp.dat.indiv$A_TRANS_PROB <- (snp.dat.indiv$AA_GENO_PROB * 2 + snp.dat.indiv$AB_GENO_PROB + snp.dat.indiv$BA_GENO_PROB)/2  #(sum row 1 + sum col 1) / 2
  snp.dat.indiv$B_TRANS_PROB <- (snp.dat.indiv$BB_GENO_PROB * 2 + snp.dat.indiv$AB_GENO_PROB + snp.dat.indiv$BA_GENO_PROB)/2  #(sum row 2 + sum col 2) / 2
  
  #retain only relevant columns
  snp.dat.indiv <- snp.dat.indiv[,c("SAMPLE_ID", "SNP_ID", "AA_GENO_PROB", "AB_GENO_PROB", "BA_GENO_PROB", "BB_GENO_PROB", "A_TRANS_PROB", "B_TRANS_PROB")]
  
  return(snp.dat.indiv)
  
}


#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################

#' @export

dkj.from.snp.dat.pools.fun <- function(snp.dat.pools) {
  
  #Returns########################################## the Dij matrix for each individual and SNP according to the descrition on page 5 of Henshall et al. 2014.   
  #Discrete genotypes assigned (1 for homozygote, 0.5 for heterozygote)
  #to an element of Dij based on GENOTYPE in snp.dat.pools.
  
  #Args##########################################
  # snp.dat.pools: Data frame
  #              SAMPLE_ID is the individual identifier
  #              SNP_ID   is the SNP identifier
  #              A_ALLELE  is the base represented by allele A
  #              B_ALLELE  is the base represented by allele B
  #              GENOTYPE is the SNP genotype call
  
  #Returns##########################################
  # D.matrices:     Data frame 
  #              1.   SAMPLE_ID is the individual identifier 
  #              2.   SNP_ID is the SNP identifier
  
  print("Running dkj.from.snp.dat.pools.fun")
  
  n.in.pools <- max(nchar(snp.dat.pools[,"GENOTYPE"]), na.rm = TRUE)/2
  
  #Change missing values to string of zeros
  tmp.snp.dat.pools <- snp.dat.pools
  tmp.snp.dat.pools[is.na(tmp.snp.dat.pools[,"GENOTYPE"]), "GENOTYPE"] <- paste(as.character(rep(0,n.in.pools*2)),collapse = "")
  
  #count of B allele
  tmp <- t(matrix(unlist(strsplit(tmp.snp.dat.pools[,"GENOTYPE"],"")), 
                  ncol = nrow(tmp.snp.dat.pools)))
  count.b.allele <- rowSums(tmp == tmp.snp.dat.pools[,"B_ALLELE"], na.rm = TRUE)
  count.b.allele[tmp[,1] == 0] <- NA
  
  #generate lambda.kj - unsorted genotype probability vector
  geno.prob <- matrix(0, ncol = (n.in.pools*2+1), nrow = nrow(snp.dat.pools))
  for (i in 1:nrow(geno.prob)) {
    geno.prob[i,(count.b.allele[i]+1)] <- 1
  }
  
  geno.prob[rowSums(geno.prob) == 0,] <- NA
  
  colnames(geno.prob) <- paste(genotypes.fun(n.in.pools*2), "_LAMBDA", sep = "")
  
  lambda.kj <- cbind(snp.dat.pools[,c("SAMPLE_ID", "SNP_ID")], geno.prob)
  
  lambda.kj$ALLELIC_PROP_POOL <- count.b.allele/(n.in.pools*2)
  
  rho.inv <- rho.inv.fun(n.in.pools)
  
  print("Still running dkj.from.snp.dat.pools.fun")
  
  dkj <- lambda.kj
  colnames(dkj) <- gsub("_LAMBDA", "", colnames(dkj))
  dkj <- dkj[,-ncol(dkj)]
  
  for(geno in rho.inv[,"GENOTYPE"]) {
    dkj[,geno] <- dkj[,geno] * rho.inv[rho.inv[,"GENOTYPE"]==geno,"RHO_INV"]
  }
  
  return(dkj)
  
}

#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################

#' @export

prelim.ml.discrete.geno.probs.fun <- function(
  Gij,
  snp.dat.indiv,
  snp.dat.pools,
  threshold.indiv,
  threshold.pools,
  parent.combns.by.fam.set.combn,
  flj,
  gkj,
  lambda.kj,
  snp.error,
  nlj,
  fam.set.combns.by.pool) {
  
  Dij <- Dij.from.Gij.fun(Gij = Gij, threshold.indiv = threshold.indiv) 
  
  tclj <- NULL
  for(fam.set.combn in unique(flj$FAM_SET_COMBN_ID)) {
    
    tmp.fj <- flj[flj[,"FAM_SET_COMBN_ID"] == fam.set.combn,]
    tmp.parent.combns <- parent.combns.by.fam.set.combn[parent.combns.by.fam.set.combn[,"FAM_SET_COMBN_ID"] == fam.set.combn,] 
    
    tmp.parents <- tmp.parent.combns[,!colnames(tmp.parent.combns) %in% c("FAM_SET_COMBN_ID", "PARENT_COMBN_ID")]
    tmp.parents <- unique(as.vector(as.matrix(tmp.parents)))
    
    tmp.tij <- Dij[Dij[,"SAMPLE_ID"] %in% tmp.parents, ]
    tmp.tclj <- tcj.fun(tij = tmp.tij, 
                        parent.combns = tmp.parent.combns, 
                        fj = tmp.fj)
    tmp.tclj$FAM_SET_COMBN_ID <- fam.set.combn
    tclj <- rbind(tclj, tmp.tclj)
  }
  rm(tmp.fj, tmp.parent.combns, tmp.parents, tmp.tij, tmp.tclj)
  
  miss.parent.count <- tclj[,c("PARENT_COMBN_ID", "SNP_ID", "MISS_PARENT_COUNT")]
  tclj               <- tclj[,colnames(tclj) != "MISS_PARENT_COUNT"] 
  
  dkj <- dkj.from.gkj.fun(lambda.kj = lambda.kj, threshold.pools = threshold.pools) 
  
  g.d.klj.adj <- adj.geno.prob.fun(lhs= merge(dkj,fam.set.combns.by.pool, by = "SAMPLE_ID", all = TRUE),
                                   snp.error=snp.error,
                                   nlj = nlj)
  
  return(list(Dij = Dij,
              tclj = tclj,
              miss.parent.count = miss.parent.count,
              dkj = dkj,
              g.d.klj.adj = g.d.klj.adj))
}


#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################

#' @export

Dij.from.Gij.fun <- function(Gij, threshold.indiv) {
  
  #Description: Returns the Dij matrix for each individual and SNP according to the description on page 5 of 
  #Henshall et al. 2014 from the Gij matrices.  Discrete genotypes assigned (1 for homozygote, 0.5 for heterozygote)
  #to an element of Dij if the corresponding element of Gij greater than a threshold.indiv.
  
  #Args##########################################
  
  #Gij: Data frame.  Output of Gij.fun 
  #          1.   SAMPLE_ID is the individual identifier (assumed to be offspring if not 
  #               listed as a sire or dam in 'fams' input)
  #          2.   SNP_ID is the SNP identifier
  #          3-6. AA_GENO_PROB, AB_GENO_PROB, BA_GENO_PROB and BB_GENO_PROB: elements of the Gij  matrix (see the 
  #               top left of page 4 of Henshall et al. 2014
  
  #threshold.indiv: Number.  Genotype called if AA_GENO_PROB, AB_GENO_PROB + BA_GENO_PROB or BB_GENO_PROB greater than this value
  
  #Returns##########################################
  # D.matrices:     Data frame 
  #              1.   SAMPLE_ID is the individual identifier 
  #              2.   SNP_ID is the SNP identifier
  #              3-6. AA_GENO_PROB, AB_GENO_PROB, BA_GENO_PROB and BB_GENO_PROB: elements of the Dij matrix (see the 
  #                   top left of page 5 of Henshall et al. 2014
  #              7-8. A_TRANS_PROB, B_TRANS_PROB are the probabilities of allele transmission 
  #                   for alleles A and B respectively computed from Dij (i.e. the elements of 
  #                   the transmission (Tij) vector, Equation 2 of Henshall et al. 2014).
  
  print("Running Dij.from.Gij.fun")
  
  #Check that all headings are present in inputs  
  if(sum(c("SAMPLE_ID", "SNP_ID", "AA_GENO_PROB", "AB_GENO_PROB", "BA_GENO_PROB", 
           "BB_GENO_PROB") %in% 
         colnames(Gij)) != 6) {
    stop("Gij input must be a data frame containing the following headings: SAMPLE_ID, SNP_ID, AA_GENO_PROB, AB_GENO_PROB, BA_GENO_PROB, BB_GENO_PROB")
  }
  
  #Name columns and assign class
  #colnames(Gij)     <- c("SAMPLE_ID", "SNP_ID", "AA_GENO_PROB", "AB_GENO_PROB", "BA_GENO_PROB", "BB_GENO_PROB")
  Gij$SAMPLE_ID     <- as.integer(Gij$SAMPLE_ID)
  Gij$SNP_ID       <- as.character(Gij$SNP_ID)
  Gij$AA_GENO_PROB <- as.numeric(Gij$AA_GENO_PROB)
  Gij$AB_GENO_PROB <- as.numeric(Gij$AB_GENO_PROB)
  Gij$BA_GENO_PROB <- as.numeric(Gij$BA_GENO_PROB)
  Gij$BB_GENO_PROB <- as.numeric(Gij$BB_GENO_PROB)
  Gij <- Gij[,c("SAMPLE_ID", "SNP_ID", "AA_GENO_PROB", "AB_GENO_PROB", "BA_GENO_PROB", "BB_GENO_PROB")]
  
  #Check that probabilities between 0 and 1
  if(
    sum(
      Gij$AA_GENO_PROB > 1 |
      Gij$AB_GENO_PROB > 1 |
      Gij$BA_GENO_PROB > 1 |
      Gij$BB_GENO_PROB > 1 |
      
      Gij$AA_GENO_PROB < 0 |
      Gij$AB_GENO_PROB < 0 |
      Gij$BA_GENO_PROB < 0 |
      Gij$BB_GENO_PROB < 0 
      , na.rm = T) != 0
  ) {
    stop("Probabilities in Gij must be between 0 and 1 inclusive")
  }
  
  #Check that "AA_GENO_PROB", "AB_GENO_PROB", "BA_GENO_PROB", "BB_GENO_PROB" sum to one
  if(
    sum(
      round((Gij$AA_GENO_PROB + Gij$AB_GENO_PROB +
             Gij$BA_GENO_PROB + Gij$BB_GENO_PROB),5) != 1, na.rm = T
    ) != 0
  ) {
    stop("AA_GENO_PROB + AB_GENO_PROB + BA_GENO_PROB + BB_GENO_PROB must equal 1 in all rows of Gij")
  }
  
  #Ensure that SAMPLE_ID column contains no 0s or NA
  if(
    sum(
      (is.na(Gij$SAMPLE_ID) |
       Gij$SAMPLE_ID == 0), na.rm = T
    ) != 0
  ) {
    stop("SAMPLE_ID in Gij cannot be NA or 0")
  } 
  
  Dij <- Gij
  colnames(Dij) <- c("SAMPLE_ID", "SNP_ID", "AA_GENO_PROB_G", "AB_GENO_PROB_G", "BA_GENO_PROB_G", "BB_GENO_PROB_G")
  
  #only retain data for genotype with maximum probability
  Dij$ROW_MAX <- apply(Dij[, c("AA_GENO_PROB_G", "AB_GENO_PROB_G", "BA_GENO_PROB_G", "BB_GENO_PROB_G")], 1, max) 
  Dij[!Dij[,"AA_GENO_PROB_G"] == Dij$ROW_MAX & !is.na(Dij[,"AA_GENO_PROB_G"]),"AA_GENO_PROB_G"] <- 0
  Dij[!Dij[,"AB_GENO_PROB_G"] == Dij$ROW_MAX & !is.na(Dij[,"AB_GENO_PROB_G"]),"AB_GENO_PROB_G"] <- 0
  Dij[!Dij[,"BA_GENO_PROB_G"] == Dij$ROW_MAX & !is.na(Dij[,"BA_GENO_PROB_G"]),"BA_GENO_PROB_G"] <- 0
  Dij[!Dij[,"BB_GENO_PROB_G"] == Dij$ROW_MAX & !is.na(Dij[,"BB_GENO_PROB_G"]),"BB_GENO_PROB_G"] <- 0
  
  #new columns (NA where GENO_PROB_G is NA)
  Dij[,"AA_GENO_PROB"] <- Dij[,"AA_GENO_PROB_G"] * 0
  Dij[,"AB_GENO_PROB"] <- Dij[,"AB_GENO_PROB_G"] * 0
  Dij[,"BA_GENO_PROB"] <- Dij[,"BA_GENO_PROB_G"] * 0
  Dij[,"BB_GENO_PROB"] <- Dij[,"BB_GENO_PROB_G"] * 0
  
  #Discrete genotypes from GENO_PROB_G if above threshold.indiv.  
  
  #If Gij is symetrical.
  if(identical(Dij[, "AB_GENO_PROB_G"],Dij[, "BA_GENO_PROB_G"])) {
    Dij[Dij[,"AA_GENO_PROB_G"] > threshold.indiv & !is.na(Dij[,"AA_GENO_PROB_G"]), "AA_GENO_PROB"] <- 1
    Dij[(Dij[,"AB_GENO_PROB_G"] + Dij[,"AB_GENO_PROB_G"]) > threshold.indiv & 
          !is.na(Dij[,"AB_GENO_PROB_G"]) & 
          !is.na(Dij[,"BA_GENO_PROB_G"]), "AB_GENO_PROB"] <- 1/2
    Dij[, "BA_GENO_PROB"] <- Dij[, "AB_GENO_PROB"] 
    Dij[Dij[,"BB_GENO_PROB_G"] > threshold.indiv & !is.na(Dij[,"BB_GENO_PROB_G"]), "BB_GENO_PROB"] <- 1
  }
  
  #If Gij is not symetrical.
  if(!identical(Dij[, "AB_GENO_PROB_G"],Dij[, "BA_GENO_PROB_G"])) {
    Dij[Dij[,"AA_GENO_PROB_G"] > threshold.indiv & !is.na(Dij[,"AA_GENO_PROB_G"]), "AA_GENO_PROB"] <- 1
    Dij[Dij[,"AB_GENO_PROB_G"] > threshold.indiv & !is.na(Dij[,"AB_GENO_PROB_G"]), "AB_GENO_PROB"] <- 1
    Dij[Dij[,"BA_GENO_PROB_G"] > threshold.indiv & !is.na(Dij[,"BA_GENO_PROB_G"]), "BA_GENO_PROB"] <- 1
    Dij[Dij[,"BB_GENO_PROB_G"] > threshold.indiv & !is.na(Dij[,"BB_GENO_PROB_G"]), "BB_GENO_PROB"] <- 1
  }
  
  Dij <- Dij[,c("SAMPLE_ID", "SNP_ID", "AA_GENO_PROB", "AB_GENO_PROB", "BA_GENO_PROB", "BB_GENO_PROB")]
  
  #probabilities must sum to 1
  row.sum <- rowSums(Dij[,c("AA_GENO_PROB", "AB_GENO_PROB", "BA_GENO_PROB", "BB_GENO_PROB")])
  Dij[row.sum != 1 & !is.na(row.sum),"AA_GENO_PROB"] <- NA
  Dij[row.sum != 1 & !is.na(row.sum),"AB_GENO_PROB"] <- NA
  Dij[row.sum != 1 & !is.na(row.sum),"BA_GENO_PROB"] <- NA
  Dij[row.sum != 1 & !is.na(row.sum),"BB_GENO_PROB"] <- NA
  
  #Equation 2 of Henshall et al. 2014 - T vector
  Dij$A_TRANS_PROB <- (Dij$AA_GENO_PROB * 2 + Dij$AB_GENO_PROB + Dij$BA_GENO_PROB)/2  #(sum row 1 + sum col 1) / 2
  Dij$B_TRANS_PROB <- (Dij$BB_GENO_PROB * 2 + Dij$AB_GENO_PROB + Dij$BA_GENO_PROB)/2  #(sum row 2 + sum col 2) / 2
  
  return(Dij)
}

#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################

#' @export

dkj.from.gkj.fun <- function(lambda.kj, threshold.pools) {
  
  #Description: Returns the Dij matrix for each individual and SNP according to the description on page 5 of 
  #Henshall et al. 2014 from the Gij matrices.  Discrete genotypes assigned (1 for homozygote, 0.5 for heterozygote)
  #to an element of Dij if the corresponding element of Gij greater than a threshold.pools.
  
  #Args##########################################
  
  #Gij: Data frame.  Output of Gij.fun 
  #          1.   SAMPLE_ID is the individual identifier (assumed to be offspring if not 
  #               listed as a sire or dam in 'fams' input)
  #          2.   SNP_ID is the SNP identifier
  #          3- Genotype probabilities (see the 
  #               top left of page 4 of Henshall et al. 2014
  
  #threshold.pools: Number.  Genotype called if AA_GENO_PROB, AB_GENO_PROB + BA_GENO_PROB or BB_GENO_PROB greater than this value
  
  #Returns##########################################
  # D.matrices:     Data frame 
  #              1.   SAMPLE_ID is the individual identifier 
  #              2.   SNP_ID is the SNP identifier
  #              3. Genotype probabilities: elements of the dkj matrix 
  
  print("Running dkj.from.gkj.fun")
  
  
  #  tmp <- gkj
  
  #get rho.inv
  #for(geno in rho.inv[,"GENOTYPE"]) {
  #   tmp[,geno] <- tmp[,geno] / rho.inv[rho.inv[,"GENOTYPE"]==geno,"RHO_INV"]
  #  }
  
  tmp <- lambda.kj[,-ncol(lambda.kj)]
  for(i in 3:ncol(tmp)) {
    colnames(tmp)[i] <- gsub("_LAMBDA","",colnames(tmp)[i])
  }
  tmp[,3:ncol(tmp)] <- tmp[,3:ncol(tmp)]/rowSums(tmp[,3:ncol(tmp)])
  
  n.in.pools <- nchar(colnames(tmp)[ncol(tmp)])/2
  
  rho.inv <- rho.inv.fun(n.in.pools)
  
  #get row maximums
  row.max <-  do.call(pmax, tmp[,3:ncol(tmp)])
  
  dkj <- tmp
  
  #Assign genotype if probability is the maximum in the row and greater than the threshold.pools
  for(geno in rho.inv[,"GENOTYPE"]) {
    dkj[tmp[,geno] == row.max &
          !is.na(tmp[,geno]) &
          row.max >= threshold.pools, geno] <- 1
    dkj[tmp[,geno] != row.max &
          !is.na(tmp[,geno]) &
          row.max >= threshold.pools, geno] <- 0
    dkj[row.max < threshold.pools |
          is.na(row.max), geno] <- NA
  }
  
  #probabilities must sum to 1
  row.sum <- round(rowSums(dkj[,!colnames(dkj) %in% c("SAMPLE_ID", "SNP_ID")]),10)
  dkj[row.sum != 1 & !is.na(row.sum),!colnames(dkj) %in% c("SAMPLE_ID", "SNP_ID")] <- NA
  
  #multiply by rho.inv because genotypes are unordered in dkj
  for(geno in rho.inv[,"GENOTYPE"]) {
    dkj[,geno] <- dkj[,geno] * rho.inv[rho.inv[,"GENOTYPE"]==geno,"RHO_INV"]
  }
  
  return(dkj)
}

#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################

#' @export

mismatches.fun <- function(dkj,
                           tclj,
                           miss.parent.count,
                           parent.combns,
                           fam.set.combns.by.pool) {
  
  #Exclusion method
  tmp <- duos.mismatches.fun(dkj = dkj,
                             tclj = tclj,
                             miss.parent.count = miss.parent.count,
                             fam.set.combns.by.pool =fam.set.combns.by.pool)
  duos.mismatches          <- tmp$duos.mismatches
  duos.mismatches.by.snp   <- tmp$duos.mismatches.by.snp
  mismatches.by.snp.sample <- tmp$mismatches.by.snp.sample
  mismatch.snp.count       <- tmp$mismatch.snp.count
  rm(tmp)
  
  most.like.parents <- most.like.parents.duos.mismatches.fun(duos.mismatches = duos.mismatches,
                                                             parent.combns = parent.combns)
  
  #Generate list where only common parents and families to all equally most likely parental combinations are retained
  
  most.like.parents.non.dup <- unique(most.like.parents[,-grep("ALT_",colnames(most.like.parents))]) #remove ALT parent combinations
  most.like.parents.non.dup <- unique(most.like.parents.non.dup[, !colnames(most.like.parents.non.dup) %in% c("PARENT_COMBN_ID", "FAM_COMBN_ID")])
  
  dup.samples <- unique(most.like.parents.non.dup[duplicated(most.like.parents.non.dup[,
                                                                                       c("SAMPLE_ID", "MISMATCHES", "SNP_COUNT", "MISMATCH_PROP", "MISMATCH_PROP_SE", "MISMATCH_PROP_Z")]),"SAMPLE_ID"])
  dup.most.like.parents <- most.like.parents.non.dup[most.like.parents.non.dup[,"SAMPLE_ID"] %in% dup.samples,]
  non.dup.most.like.parents <- most.like.parents.non.dup[!most.like.parents.non.dup[,"SAMPLE_ID"] %in% dup.samples,]
  tmp.dup.most.like.parents <- NULL
  
  for(samp in dup.samples) {
    tmp <- dup.most.like.parents[dup.most.like.parents[,"SAMPLE_ID"] == samp,]
    tmp.n <- nrow(tmp)
    
    tmp.parents <- tmp[,grep("PARENT_",colnames(most.like.parents.non.dup))]
    tmp.colnames.parents <- colnames(tmp.parents)
    tmp.parents <- matrix(unlist(tmp.parents), ncol = 1)
    tmp.parents[is.na(tmp.parents)] <- 0 #replace NA with 0
    tmp.parents <- as.data.frame(table(tmp.parents))
    tmp.parents[,"tmp.parents"] <- as.integer(levels(tmp.parents[,"tmp.parents"] ))
    tmp.parents[,"Freq"] <- tmp.parents[,"Freq"]/tmp.n
    tmp.parents[,"Freq"] <- floor(tmp.parents[,"Freq"])
    tmp.parents <- tmp.parents[tmp.parents[,"Freq"] > 0,]
    tmp.parents <- rep(tmp.parents[,1], tmp.parents[,2]) #list of parents common to all most likely combinations
    tmp.parents <- c(tmp.parents,rep(NA,length(tmp.colnames.parents) - length(tmp.parents) ))
    tmp.parents <- as.data.frame(t(tmp.parents))
    tmp.parents[tmp.parents == 0] <- NA #replace 0 with NA
    colnames(tmp.parents) <- tmp.colnames.parents
    
    tmp.fams <- tmp[,grep("FAMILY_ID_",colnames(most.like.parents.non.dup))]
    tmp.colnames.fams <- colnames(tmp.fams)
    tmp.fams <- matrix(unlist(tmp.fams), ncol = 1)
    tmp.fams[is.na(tmp.fams)] <- 0 #replace NA with 0
    tmp.fams <- as.data.frame(table(tmp.fams))
    tmp.fams[,"tmp.fams"] <- as.integer(levels(tmp.fams[,"tmp.fams"] ))
    tmp.fams[,"Freq"] <- tmp.fams[,"Freq"]/tmp.n
    tmp.fams[,"Freq"] <- floor(tmp.fams[,"Freq"])
    tmp.fams <- tmp.fams[tmp.fams[,"Freq"] > 0,]
    tmp.fams <- rep(tmp.fams[,1], tmp.fams[,2]) #list of families common to all most likely combinations
    tmp.fams <- c(tmp.fams,rep(NA,length(tmp.colnames.fams) - length(tmp.fams) ))
    tmp.fams <- as.data.frame(t(tmp.fams))
    tmp.fams[tmp.fams == 0] <- NA #replace 0 with NA
    colnames(tmp.fams) <- tmp.colnames.fams
    if(length(tmp.fams) == 0) {
      tmp.fams <- t(as.data.frame(rep(NA, length(tmp.parents)/2)))
      colnames(tmp.fams) <- paste("FAMILY_ID_", 1:(length(tmp.parents)/2), sep = "")
    }
    
    tmp <-  cbind(tmp[1,c("SAMPLE_ID", "MISMATCHES", "SNP_COUNT", "MISMATCH_PROP", "MISMATCH_PROP_SE", "MISMATCH_PROP_Z")],
                  tmp.fams, 
                  tmp.parents)
    tmp.dup.most.like.parents <- rbind(tmp.dup.most.like.parents,tmp)
    
    rm(tmp.colnames.parents, tmp.colnames.fams, tmp, tmp.n, tmp.fams, tmp.parents)
    
  }
  
  most.like.parents.non.dup <- rbind(tmp.dup.most.like.parents, non.dup.most.like.parents)
  rm(dup.most.like.parents, non.dup.most.like.parents, tmp.dup.most.like.parents)
  most.like.parents.non.dup <- most.like.parents.non.dup[order(most.like.parents.non.dup$SAMPLE_ID),]
  
  return(list(duos.mismatches = duos.mismatches,
              duos.mismatches.by.snp = duos.mismatches.by.snp,
              mismatches.by.snp.sample = mismatches.by.snp.sample,
              mismatch.snp.count = mismatch.snp.count,
              most.like.parents = most.like.parents,
              most.like.parents.non.dup = most.like.parents.non.dup))
}


#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################

#' @export

duos.mismatches.fun <- function(dkj,
                                tclj,
                                miss.parent.count,
                                fam.set.combns.by.pool) {  
  
  #Computes count of mismatches by duo using the exclusion approach 
  
  print("Running duos.mismatches.fun")
  
  #identify missing SAMPLE_ID and SNP_ID combinations
  dkj$MISS_POOL <- rowSums(is.na(dkj[,3:ncol(dkj)])) != 0
  #  tclj <- merge(tclj, 
  #               miss.parent.count, 
  #               by = c("PARENT_COMBN_ID", "SNP_ID"), 
  #               all.x = TRUE)
  
  miss.parent.count$SNP_ID    <- as.character(miss.parent.count$SNP_ID)
  tclj$SNP_ID    <- as.character(tclj$SNP_ID)  
  
  #seed data frame
  duos.mismatches <- NULL
  mismatches.by.snp.sample <- NULL
  
  #Loop through pools 
  #Could merge Tfj with Dij but likely to run into memory issues
  
  pools <- unique(dkj$SAMPLE_ID)
  
  for(i in 1:length(pools)) {
    
    pool <- pools[i] #pools identifier    
    fam.set.combn <-fam.set.combns.by.pool[fam.set.combns.by.pool[,"SAMPLE_ID"] == pool,"FAM_SET_COMBN_ID"]
    tcj <- tclj[tclj[,"FAM_SET_COMBN_ID"] == fam.set.combn,]
    
    tcj <- left_join(tcj, miss.parent.count, by = c("PARENT_COMBN_ID", "SNP_ID"))
    tcj$MISS_PARENT <- tcj$MISS_PARENT_COUNT > 0
    
    
    print(paste("Computing duo mismatches for pool", pool, "-", i, "of", length(pools)))
    
    mismatches <- merge(tcj,
                        dkj[dkj[,"SAMPLE_ID"] == pool,],
                        by = "SNP_ID",
                        all = TRUE,
                        suffixes = c("_TRANS","_POOLS")
    )
    
    #snp.count
    snp.count <- aggregate(!mismatches$MISS_PARENT & !mismatches$MISS_POOL,
                           by = list(mismatches$SAMPLE_ID,
                                     mismatches$PARENT_COMBN_ID), 
                           na.rm=T, 
                           FUN = "sum")   
    colnames(snp.count) <- c("SAMPLE_ID", "PARENT_COMBN_ID", "SNP_COUNT")
    
    #Equation 3 of Henshall et al. 2014
    mismatches[,"MISMATCHES"] <- rowSums(mismatches[,grepl("TRANS", colnames(mismatches))] * 
                                           mismatches[,grepl("POOLS", colnames(mismatches))])
    mismatches[,"MISMATCHES"] <- as.integer(mismatches[,"MISMATCHES"] == 0)
    
    mismatches[mismatches[,"MISS_PARENT"] == TRUE, "MISMATCHES"] <- NA
    mismatches[mismatches[,"MISS_POOL"]   == TRUE, "MISMATCHES"] <- NA
    
    snp.mismatches <- aggregate(mismatches[,"MISMATCHES"], by = list(mismatches$SAMPLE_ID,
                                                                     mismatches$PARENT_COMBN_ID), 
                                na.rm=T, FUN = "sum")     
    colnames(snp.mismatches) <- c("SAMPLE_ID", "PARENT_COMBN_ID", "MISMATCHES")
    
    #   snp.mismatches <- merge(snp.mismatches,snp.count, by = c("SAMPLE_ID", "PARENT_COMBN_ID"), all.x = TRUE)
    snp.mismatches <- left_join(snp.mismatches, snp.count, by = c("SAMPLE_ID", "PARENT_COMBN_ID"))
    
    #Calculate proportion of SNPs that are mismatched
    snp.mismatches$MISMATCH_PROP <- snp.mismatches$MISMATCHES / snp.mismatches$SNP_COUNT
    snp.mismatches$MISMATCH_PROP_SE <- sqrt((snp.mismatches$MISMATCH_PROP*(1-snp.mismatches$MISMATCH_PROP)) / snp.mismatches$SNP_COUNT)
    
    #identify most likely to identify bad SNP
    
    #Obtain most likely combn (i.e. min MISMATCH_PROP) for each pool
    
    #if no SNP for which genotypes were assigned in pool
    if(max(snp.mismatches$SNP_COUNT) == 0) {
      mismatches.by.snp <- mismatches
      mismatches.by.snp <- mismatches.by.snp[mismatches.by.snp[,"SAMPLE_ID"] == pool,]
      mismatches.by.snp[,!colnames(mismatches.by.snp) %in% c("SNP_ID", "SAMPLE_ID")] <- NA
      mismatches.by.snp <- unique(mismatches.by.snp)
      
      duos.mismatches <- rbind(duos.mismatches,snp.mismatches)
      mismatches.by.snp.sample <- rbind(mismatches.by.snp.sample,mismatches.by.snp)
      print(paste("WARNING: Exclusion approach to pedigree assignment is not functioning appropriately as there are no SNP for which genotypes were assigned in pool", pool, ". Consider using changing discrete.method to equal \'assigned.genos\' or reducing the value of threshold.pools"))
    } else {
      
      most.like <-  min(snp.mismatches$MISMATCH_PROP, na.rm = TRUE) 
      most.like <- snp.mismatches[snp.mismatches[,"MISMATCH_PROP"] == most.like,]
      
      #if there are multiple combns then go with the one with the most snp
      if(nrow(most.like) > 1) {
        tmp.most.like <-  aggregate(most.like$SNP_COUNT, 
                                    by = list(most.like$SAMPLE_ID), na.rm=T, FUN = "max") 
        colnames(tmp.most.like) <- c("SAMPLE_ID", "SNP_COUNT")
        
        #   most.like <- merge(tmp.most.like, most.like, by = c("SAMPLE_ID", "SNP_COUNT"), all.x = TRUE)
        most.like <- left_join(tmp.most.like, most.like, by = c("SAMPLE_ID", "SNP_COUNT"))
        
        most.like <- most.like[,c("SAMPLE_ID",	"PARENT_COMBN_ID", "MISMATCHES",	"SNP_COUNT",	
                                  "MISMATCH_PROP",	"MISMATCH_PROP_SE")]
        #remove duplicates at random
        most.like <- most.like[order(runif(nrow(most.like))),]
        most.like <- most.like[most.like[,"PARENT_COMBN_ID"] == most.like[1,"PARENT_COMBN_ID"],]
      }
      
      #add mismatched SNP in most.like to total counts of bad snp
      
      mismatches.by.snp <- mismatches[mismatches[,"PARENT_COMBN_ID"] == most.like[,"PARENT_COMBN_ID"],]
    }
    duos.mismatches <- rbind(duos.mismatches,snp.mismatches)
    mismatches.by.snp.sample <- rbind(mismatches.by.snp.sample,mismatches.by.snp)
    
  } #END for(i in 1:length(pools)) {
  
  duos.mismatches$MISMATCH_PROP_Z <- duos.mismatches$MISMATCH_PROP / duos.mismatches$MISMATCH_PROP_SE 
  duos.mismatches[duos.mismatches[,"MISMATCH_PROP"] == 0 &
                    !is.na(duos.mismatches[,"MISMATCH_PROP"]),  
                  "MISMATCH_PROP_Z"] <- 0
  
  duos.mismatches <- duos.mismatches[order(duos.mismatches$MISMATCH_PROP),]
  duos.mismatches <- duos.mismatches[order(duos.mismatches$SAMPLE_ID),]
  
  #Count mismatches
  
  mismatch.snp.count <- aggregate(mismatches.by.snp.sample[,"MISMATCHES"] == 1, 
                                  by = list(mismatches.by.snp.sample$SNP_ID), 
                                  na.rm=T, FUN = "sum")   
  colnames(mismatch.snp.count) <- c("SNP_ID", "MISMATCH_COUNT")
  
  match.snp.count <- aggregate(mismatches.by.snp.sample[,"MISMATCHES"] == 0, 
                               by = list(mismatches.by.snp.sample$SNP_ID), 
                               na.rm=T, FUN = "sum")   
  colnames(match.snp.count) <- c("SNP_ID", "MATCH_COUNT")
  
  miss.snp.count <- aggregate(is.na(mismatches.by.snp.sample[,"MISMATCHES"]), 
                              by = list(mismatches.by.snp.sample$SNP_ID), 
                              na.rm=T, FUN = "sum")   
  colnames(miss.snp.count) <- c("SNP_ID", "MISSING_COUNT")
  
  #  mismatch.snp.count <- merge(mismatch.snp.count, match.snp.count, by = "SNP_ID", all = TRUE)  
  mismatch.snp.count$SNP_ID <- as.character(mismatch.snp.count$SNP_ID)
  match.snp.count$SNP_ID    <- as.character(match.snp.count$SNP_ID)
  mismatch.snp.count        <- inner_join(mismatch.snp.count, match.snp.count, by = "SNP_ID")
  
  #mismatch.snp.count <- merge(mismatch.snp.count, miss.snp.count, by = "SNP_ID", all = TRUE)  
  mismatch.snp.count$SNP_ID <- as.character(mismatch.snp.count$SNP_ID)
  miss.snp.count$SNP_ID    <- as.character(miss.snp.count$SNP_ID)
  mismatch.snp.count        <- inner_join(mismatch.snp.count, miss.snp.count, by = "SNP_ID")
  
  mismatch.snp.count <- mismatch.snp.count[order(mismatch.snp.count$MISSING_COUNT, decreasing = TRUE),]
  mismatch.snp.count <- mismatch.snp.count[order(mismatch.snp.count$MISMATCH_COUNT, decreasing = TRUE),]
  
  return(list(duos.mismatches = duos.mismatches, 
              duos.mismatches.by.snp = mismatches, 
              mismatches.by.snp.sample = mismatches.by.snp.sample,
              mismatch.snp.count = mismatch.snp.count))
}

#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################

#' @export
#' 
most.like.parents.duos.mismatches.fun <- function(duos.mismatches,
                                                  parent.combns) {
  
  #Description: Identifies the family that each inidividual is most likely to belong to.
  
  #  Args:
  
  #  Returns: data frame
  #          1.  SAMPLE_ID 
  #          2.  PARENT_COMBN_ID 
  #              FAM_COMBN_ID
  #          3.  MISS_PARENT_SNP_DATA_PROP 
  #          4.  MISS_POOL_SNP_DATA_PROP 
  #          5.  NO_MISS_PARENT_OR_POOL_PROP 
  #              PARENT_1 PARENT_2 PARENT_3 PARENT_4 ...
  #          6.  SNP_COUNT: Count of SNP  (only includes SNP for which genotypes for the sire, dam and pool were present)
  #          7.  MISMATCHES: Mismatched snp (only includes SNP for which genotypes for the sire, dam and pool were present)
  #          8.  MISMATCH_PROP: MISMATCHES / SNP_COUNT
  #          9.  MISMATCH_PROP_SE: Standard error of MISMATCH_PROP (where SE = sqrt [ p(1 - p) / n ] )
  #          10. MISMATCH_PROP_Z
  #              ALT_PARENT_COMBN_ID 
  #              ALT_FAM_COMBN_ID 
  #              ALT_PARENT_1 ALT_PARENT_2 ALT_PARENT_3 ALT_PARENT_4 ...
  #          11. ALT_SNP_COUNT: Count of SNP  (only includes SNP for which genotypes for the sire, dam and pool were present)
  #          12. ALT_MISMATCHES: Mismatched snp (only includes SNP for which genotypes for the sire, dam and pool were present)
  #          13. ALT_MISMATCH_PROP: MISMATCHES / SNP_COUNT
  #          14. ALT_MISMATCH_PROP_SE: Standard error of MISMATCH_PROP (where SE = sqrt [ p(1 - p) / n ] )
  #          15. ALT_MISMATCH_PROP_Z  
  
  print("Running most.like.parents.duos.mismatches.fun")
  
  #remove FAM_COMBN_ID containing duplicated FAM_COMBN_ID
  tmp <- parent.combns[duplicated(parent.combns[,"PARENT_COMBN_ID"]),
                       "PARENT_COMBN_ID"]
  
  if(length(tmp) != 0) {
    parent.combns[parent.combns[,"PARENT_COMBN_ID"] %in% tmp,grep("FAM",colnames(parent.combns))] <- NA
  }
  rm(tmp)
  parent.combns.unambiguous <- parent.combns
  
  #Obtain most likely combn (i.e. min MISMATCH_PROP) for each pool
  most.like           <-  aggregate(duos.mismatches$MISMATCH_PROP, 
                                    by = list(duos.mismatches$SAMPLE_ID), na.rm=T, FUN = "min")   
  colnames(most.like) <- c("SAMPLE_ID", "MISMATCH_PROP")
  
  #  most.like <-  merge(most.like, duos.mismatches, by = c("SAMPLE_ID", "MISMATCH_PROP"), all.x = TRUE)
  most.like <- left_join(most.like, duos.mismatches, by = c("SAMPLE_ID", "MISMATCH_PROP"))
  
  #if there are multiple families then go with the one with the most snp
  tmp.most.like <-  aggregate(most.like$SNP_COUNT, 
                              by = list(most.like$SAMPLE_ID), na.rm=T, FUN = "max") 
  colnames(tmp.most.like) <- c("SAMPLE_ID", "SNP_COUNT")
  
  # most.like <-  merge(tmp.most.like, most.like, by = c("SAMPLE_ID", "SNP_COUNT"), all.x = TRUE)
  most.like               <- left_join(tmp.most.like, most.like, by = c("SAMPLE_ID", "SNP_COUNT"))
  
  most.like <- most.like[,c("SAMPLE_ID",	"PARENT_COMBN_ID", "MISMATCHES",	"SNP_COUNT",	
                            "MISMATCH_PROP",	"MISMATCH_PROP_SE",	"MISMATCH_PROP_Z")]
  # most.like <-  merge(most.like, parent.combns.unambiguous, by = c("PARENT_COMBN_ID"), all.x = TRUE)
  most.like <- left_join(most.like, parent.combns.unambiguous, by = "PARENT_COMBN_ID")
  
  most.like <- unique(most.like)
  
  #Remove most likely trio to obtain second most likely
  remove <- most.like[,c("SAMPLE_ID", "PARENT_COMBN_ID")]
  remove$REMOVE <- TRUE
  # duos.mismatches.removed <- merge(duos.mismatches, remove, by = c("SAMPLE_ID", "PARENT_COMBN_ID"), all.x = TRUE)
  duos.mismatches.removed   <- left_join(duos.mismatches, remove, by = c("SAMPLE_ID", "PARENT_COMBN_ID"))
  duos.mismatches.removed   <- duos.mismatches.removed[is.na(duos.mismatches.removed$REMOVE),]
  
  #Obtain second most likely family (i.e. maximum LOD) for each pool
  second.most.like <-  aggregate(duos.mismatches.removed$MISMATCH_PROP, 
                                 by = list(duos.mismatches.removed$SAMPLE_ID), na.rm=T, FUN = "min")   
  colnames(second.most.like) <- c("SAMPLE_ID", "MISMATCH_PROP")
  
  #  second.most.like <-  merge(second.most.like, duos.mismatches.removed, 
  #                             by = c("SAMPLE_ID", "MISMATCH_PROP"), all.x = TRUE)
  second.most.like   <- left_join(second.most.like, duos.mismatches.removed, by = c("SAMPLE_ID", "MISMATCH_PROP"))
  
  tmp.second.most.like <-  aggregate(second.most.like$SNP_COUNT, 
                                     by = list(second.most.like$SAMPLE_ID), na.rm=T, FUN = "max") 
  colnames(tmp.second.most.like) <- c("SAMPLE_ID", "SNP_COUNT")
  
  # second.most.like <-  merge(tmp.second.most.like, second.most.like, by = c("SAMPLE_ID", "SNP_COUNT"), all.x = TRUE)
  second.most.like <- left_join(tmp.second.most.like, second.most.like, by = c("SAMPLE_ID", "SNP_COUNT"))
  
  #  second.most.like <- merge(second.most.like, 
  #                             parent.combns.unambiguous[,c("PARENT_COMBN_ID", "FAM_COMBN_ID")], 
  #                             by = c("PARENT_COMBN_ID"), all.x = TRUE)
  second.most.like <- left_join(second.most.like, 
                                parent.combns.unambiguous[,c("PARENT_COMBN_ID", "FAM_COMBN_ID")], 
                                by = "PARENT_COMBN_ID")
  
  second.most.like <- second.most.like[,c("SAMPLE_ID",	"PARENT_COMBN_ID",	"FAM_COMBN_ID", "MISMATCHES",	"SNP_COUNT",	
                                          "MISMATCH_PROP",	"MISMATCH_PROP_SE",	"MISMATCH_PROP_Z")]
  
  colnames(second.most.like) <- c("SAMPLE_ID",	"ALT_PARENT_COMBN_ID",	"ALT_FAM_COMBN_ID","ALT_MISMATCHES",	"ALT_SNP_COUNT",	
                                  "ALT_MISMATCH_PROP",	"ALT_MISMATCH_PROP_SE",	"ALT_MISMATCH_PROP_Z")
  second.most.like <- unique(second.most.like)
  
  #Merge most likely and second most likely 
  #  most.like <- merge(most.like, second.most.like, by = "SAMPLE_ID", all = TRUE)
  most.like <- inner_join(most.like, second.most.like, by = "SAMPLE_ID")
  
  return(most.like)
}

#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################

#' @export

ls.fun <- function(fams,
                   fam.set.combns,
                   fam.set.combns.by.pool,
                   Gij,
                   flj,
                   snp.dat.pools,
                   snp.param.indiv,
                   min.intensity,
                   beta.min.ss){
  
  #retain only required columns
  snp.dat.pools <- snp.dat.pools[,c("SNP_ID", "SAMPLE_ID", "INTENSITY_A", "INTENSITY_B")]
  
  #get parent.combns to generate tclj
  tmp.fam.set.combns <- fam.set.combns
  tmp.fam.set.combns$FAM_SET_ID=1 #Assume n.in.pools = 1 and there is only one family set to get appropriate 'parent.combns' (i.e. 1 row per family)
  tmp <- parent.combns.fun(fams = fams,
                           n.in.pools = 1, 
                           fam.set.combns = tmp.fam.set.combns)
  parent.combns.by.fam.set.combn <- tmp$parent.combns.by.fam.set.combn
  parent.combns <- tmp$parent.combns 
  rm(tmp.fam.set.combns)
  
  tclj <- NULL
  for(fam.set.combn in unique(flj$FAM_SET_COMBN_ID)) {
    
    tmp.fj <- flj[flj[,"FAM_SET_COMBN_ID"] == fam.set.combn,]
    tmp.parent.combns <- parent.combns.by.fam.set.combn[parent.combns.by.fam.set.combn[,"FAM_SET_COMBN_ID"] == fam.set.combn,] 
    
    tmp.parents <- tmp.parent.combns[,!colnames(tmp.parent.combns) %in% c("FAM_SET_COMBN_ID", "PARENT_COMBN_ID")]
    tmp.parents <- unique(as.vector(as.matrix(tmp.parents)))
    
    tmp.tij <- Gij[Gij[,"SAMPLE_ID"] %in% tmp.parents, ]
    tmp.tclj <- tcj.fun(tij = tmp.tij, 
                        parent.combns = tmp.parent.combns, 
                        fj = tmp.fj)
    tmp.tclj$FAM_SET_COMBN_ID <- fam.set.combn
    tclj <- rbind(tclj, tmp.tclj)
  }
  rm(tmp.fj, tmp.parent.combns, tmp.parents, tmp.tij, tmp.tclj)
  
  #Get B_TRANS_PROB
  tclj$B_TRANS_PROB <- tclj$AB + tclj$BB #note that tclj$AB was divided by two in tcj.fun
  tclj <- tclj[,c("SNP_ID", "PARENT_COMBN_ID", "MISS_PARENT_COUNT", "FAM_SET_COMBN_ID", "B_TRANS_PROB")]
  
  fkj.and.weight <- fkj.and.weight.fun(snp.dat.pools = snp.dat.pools, 
                                       snp.param.indiv = snp.param.indiv, 
                                       min.intensity = min.intensity)
   
  Xl.mat <- X.mat.fun(tclj.ls = tclj, parent.combns = parent.combns)
  
  beta <- NULL
  #Loop through samples to get beta 
  for(samp in unique(fam.set.combns.by.pool[,"SAMPLE_ID"])) {
    
    fam.set.combn <-fam.set.combns.by.pool[fam.set.combns.by.pool[,"SAMPLE_ID"] == samp, "FAM_SET_COMBN_ID"]
    
    X.mat <- Xl.mat[[fam.set.combn]]
    #fkj.and.weight with samples from fam.set.combn only
    tmp.fkj.and.weight <- fkj.and.weight[fkj.and.weight[,"SAMPLE_ID"] == samp,]
    
    if(!beta.min.ss) {
      tmp.beta <- beta.fun(X.mat = X.mat,
                           fkj.and.weight = tmp.fkj.and.weight,
                           fams = fams,
                           miss.x.and.y = 0.5, 
                           miss.w = 0)
      tmp.beta$BETA_MIN_SS <- NA
    } else {
      tmp.beta <- beta.min.ss.fun(X.mat = X.mat,
                                  fkj.and.weight = tmp.fkj.and.weight,
                                  fam.set.combns = fam.set.combns[fam.set.combns[,"FAM_SET_COMBN_ID"] == fam.set.combn,],
                                  fams = fams)
    }
    
    #get BETA_HAT_CONSTRAINED
    tmp.fam.set.combns <- fam.set.combns.by.pool[fam.set.combns.by.pool[,"SAMPLE_ID"] == samp, "FAM_SET_COMBN_ID"]
    tmp.fam.set.combns <- fam.set.combns[fam.set.combns[,"FAM_SET_COMBN_ID"] == tmp.fam.set.combns, c("FAM_SET_ID", "FAMILY_ID")]
    beta.hat.constrained <- NULL
    n.in.pool <- length(unique(tmp.fam.set.combns[,"FAM_SET_ID"]))
    tmp <- tmp.beta
    for(fam.set in unique(tmp.fam.set.combns[,"FAM_SET_ID"])) {
      tmp.beta.hat.constrained <- tmp[tmp[,"FAMILY_ID"] %in% 
                                        tmp.fam.set.combns[tmp.fam.set.combns[,"FAM_SET_ID"] == fam.set,"FAMILY_ID"],]
      tmp.beta.hat.constrained <- tmp.beta.hat.constrained[tmp.beta.hat.constrained[,"BETA_HAT"] == 
                                                             max(tmp.beta.hat.constrained[,"BETA_HAT"]),"FAMILY_ID"] #identify familiy with maximum beta
      
      tmp[tmp[,"FAMILY_ID"] == tmp.beta.hat.constrained,"BETA_HAT"] <- 
        tmp[tmp[,"FAMILY_ID"] ==  tmp.beta.hat.constrained,"BETA_HAT"] - 1/n.in.pool
      beta.hat.constrained <- c(beta.hat.constrained,tmp.beta.hat.constrained)
    }
    beta.hat.constrained <- data.frame(FAMILY_ID = beta.hat.constrained,
                                       BETA_HAT_CONSTRAINED = 1/n.in.pool)
    beta.hat.constrained <- aggregate(BETA_HAT_CONSTRAINED ~ FAMILY_ID, data = beta.hat.constrained, FUN = sum) #sum BETA_HAT_CONSTRAINED by family
    tmp.beta <- merge(tmp.beta, beta.hat.constrained, by = "FAMILY_ID", all.x = TRUE)
    tmp.beta[is.na(tmp.beta[,"BETA_HAT_CONSTRAINED"]),"BETA_HAT_CONSTRAINED"] <- 0
    rm(beta.hat.constrained, tmp.beta.hat.constrained)
    
    beta <- rbind(beta,tmp.beta)
    rm(tmp.fkj.and.weight, tmp.beta)
  }
  
  beta <- beta[,c("SAMPLE_ID", "SIRE_ID", "DAM_ID", "FAMILY_ID", "BETA_STAR",  "BETA_HAT", "BETA_HAT_CONSTRAINED", "BETA_MIN_SS")]
  
  return(list(tclj.ls = tclj,
              fkj.and.weight = fkj.and.weight,
              Xl.mat = Xl.mat,
              beta = beta
  ))
  
}

#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################

#' @export

fkj.and.weight.fun <- function(snp.dat.pools, 
                               snp.param.indiv, 
                               min.intensity = 0) {
  
  #Estimates B allele frequencies for pools.  According to the method
  #of Henshall et al. 2014.  Note that "Unlike
  #in the approach used to estimate B allele Frequencies in [24], where allelic
  #proprtions were scaled such that values p(SAMPLE_ID) - 00, 0.5 and 1.0 corrensponded 
  #to the means AA, AB and BB, respectively, we did not scale the p(SAMPLE_ID) values"
  #page 3 of Henshall et al. 2014
  
  #NOTE: Assumed ab.FREQ = BA.FREQ in genotype probability matrix (i.e. it is symetrical)
  
  #Required functions:
  # pij.fun
  
  #Args##########################################
  # snp.dat.pools: Data frame (pooled samples only) containing relevant fields from the Genotype.Intensity 
  #tab of the corresponding GenotypeIntensity.xls file outputted from Sequenom's Typer software 
  #(Sequenom 2006).  Equivalent outputs from other platforms could also be used.  
  #              1. SAMPLE_ID  is the pool sample identifier 
  #              2. SNP_ID  is the SNP identifier,
  #              3. INTENSITY_A   is the area/intensity for allele A, 
  #              4. INTENSITY_B   is the area/intensity for allele B
  
  # snp.param.indiv: Data frame (see "Estimation of SNP specific parameters, page 3 of Henshall et al. 2014).
  # Data frame containing relevant fields from the output of snp.gen.param.fun from individual animal data (e.g. parents).
  #              1. SNP_ID    is the SNP identifier, 
  #              2. MEAN_P_AA is the mean of allelic proportion (homozygous allele A), 
  #              3. MEAN_P_AB is the mean of allelic proportion (heterozygous), 
  #              4. MEAN_P_BB is the mean of allelic proportion (homozygous allele B), 
  #              5. WELCH_A   is the welch statistics for the intervals of mean of 
  #                           allelic proportion AA to AB (WELCH_A) 
  #              6. WELCH_B   is the welch statistics for the intervals of mean of 
  #                           allelic proportion AB to BB 
  
  # min.intensity      Number used in pij.fun. If sqrt((snp.dat.pools$INTENSITY_A)^2 +
  #              (snp.dat.pools$INTENSITY_B)^2) less than this value
  #              then set allelic proportion to missing (see end of page 3 of Henshall et al 2014).
  #              Essentially removes observations that fall into the lower left of INTENSITY_A
  #              by INTENSITY_B scatter plot.
  
  
  #Returns##########################################
  # fkj.and.weight:   Data frame
  #              1. SNP_ID    is the SNP identifier
  #              2. SAMPLE_ID   is the pool identifier 
  #              3. MEAN_P_AA is the mean of allelic proportion (homozygous allele A) from snp.param.indiv
  #              4. MEAN_P_AB is the mean of allelic proportion (heterozygous) from snp.param.indiv
  #              5. MEAN_P_BB is the mean of allelic proportion (homozygous allele B) from snp.param.indiv
  #              6. ALLELIC_PROP_POOL is the allelic proportion for pool (Equation 1 of Henshall et al. 2014)
  #              7. FREQ_POOL is the estimated frequency of the B allele of the SNP in the pool (y vector of page 6 of Henshall et al. 2014)
  #              8. FREQ_POOL_ERROR_WT  is the relevant Welch statistic squared (y vector of page 6 of Henshall et al. 2014)
  
  print("Running fkj.and.weight.fun")
  
  if(sum(c("SAMPLE_ID", "SNP_ID", "INTENSITY_A", "INTENSITY_B") %in% colnames(snp.dat.pools)) != 4) {
    stop("snp.dat.pools input must be a data frame containing the following headings: SAMPLE_ID, SNP_ID, INTENSITY_A, INTENSITY_B")
  }
  
  if(sum(c("SNP_ID", "MEAN_P_AA", "MEAN_P_AB", "MEAN_P_BB", "WELCH_A", "WELCH_B") %in% colnames(snp.param.indiv)) != 6) {
    stop("snp.param.indiv input must be a data frame containing the following headings: SNP_ID, MEAN_P_AA, MEAN_P_AB, MEAN_P_BB, WELCH_A, WELCH_B")
  }
  
  #Name columns
  snp.dat.pools$SAMPLE_ID   <- as.integer(snp.dat.pools$SAMPLE_ID)
  snp.dat.pools$SNP_ID    <- as.character(snp.dat.pools$SNP_ID)
  snp.dat.pools$INTENSITY_A    <- as.numeric(snp.dat.pools$INTENSITY_A)
  snp.dat.pools$INTENSITY_B    <- as.numeric(snp.dat.pools$INTENSITY_B)
  snp.dat.pools <- snp.dat.pools[,c("SAMPLE_ID", "SNP_ID", "INTENSITY_A", "INTENSITY_B")]
  
  snp.param.indiv$SNP_ID      <- as.character(snp.param.indiv$SNP_ID)
  snp.param.indiv$MEAN_P_AA     <- as.numeric(snp.param.indiv$MEAN_P_AA)
  snp.param.indiv$MEAN_P_AB     <- as.numeric(snp.param.indiv$MEAN_P_AB)
  snp.param.indiv$MEAN_P_BB     <- as.numeric(snp.param.indiv$MEAN_P_BB)
  snp.param.indiv$WELCH_A     <- as.numeric(snp.param.indiv$WELCH_A)
  snp.param.indiv$WELCH_B     <- as.numeric(snp.param.indiv$WELCH_B)
  snp.param.indiv <- snp.param.indiv[,c("SNP_ID", "MEAN_P_AA", "MEAN_P_AB", "MEAN_P_BB", "WELCH_A", "WELCH_B")]
  
  # Check the list of SNPs the same in input files
  if(sum(!unique(snp.dat.pools[,"SNP_ID"]) %in% unique(snp.param.indiv[,"SNP_ID"]))>0 &
     sum(!unique(snp.param.indiv[,"SNP_ID"]) %in% unique(snp.dat.pools[,"SNP_ID"]))>0) {
    print("SNP identifiers do not match in snp.dat.pools and snp.param.indiv")
    stop()
  }
  
  #Get allelic proportion
  snp.allelic.prop <- pij.fun(snp.dat.indiv = snp.dat.pools, min.intensity = min.intensity)
  print("Still running fkj.and.weight.fun")
  
  #Rename columns
  colnames(snp.allelic.prop)[colnames(snp.allelic.prop) == "SAMPLE_ID"]               <- "SAMPLE_ID"
  colnames(snp.allelic.prop)[colnames(snp.allelic.prop) == "ALLELIC_PROP"]     <- "ALLELIC_PROP_POOL"  
  colnames(snp.allelic.prop)[colnames(snp.allelic.prop) == "INTENSITY"]        <- "INTENSITY_POOL"  
  
  #Merge data using cbind
  if(identical(snp.dat.pools$SAMPLE_ID,snp.allelic.prop$SAMPLE_ID) &
     identical(snp.dat.pools$SNP_ID,snp.allelic.prop$SNP_ID)) {
    snp.dat.pools <- cbind(snp.dat.pools, snp.allelic.prop[,!colnames(snp.allelic.prop) %in% c("SAMPLE_ID", "SNP_ID", "INTENSITY_A", "INTENSITY_B")])
  } else {
    stop("SAMPLE_ID and SNP_ID columns of pij.fun output do not match those of snp.dat.pools.  Not sure why.")
  }
  
  #Merge data
  # fkj.and.weight <- merge(snp.dat.pools, snp.param.indiv, by = "SNP_ID", all.x = TRUE)
  snp.dat.pools$SNP_ID    <- as.character(snp.dat.pools$SNP_ID)
  snp.param.indiv$SNP_ID    <- as.character(snp.param.indiv$SNP_ID)
  fkj.and.weight <- left_join(snp.dat.pools, snp.param.indiv, by = "SNP_ID")
  
  #Compute allele frequencies (see rhs of page 5 of Henshall et al. 2014)
  fkj.and.weight$FREQ_POOL <- NA
  
  #ALLELIC_PROP_POOL <= MEAN_P_AA
  is.true <- fkj.and.weight[,"ALLELIC_PROP_POOL"] <= fkj.and.weight[,"MEAN_P_AA"]
  is.true[is.na(is.true)] <- FALSE
  
  fkj.and.weight[is.true,"FREQ_POOL"] <- 0
  fkj.and.weight[is.true,"FREQ_ERROR"]      <- fkj.and.weight[is.true,"WELCH_A"]^2 #WELCH_A squared
  
  #MEAN_P_AA < ALLELIC_PROP_POOL < MEAN_P_AB
  is.true <- fkj.and.weight[,"MEAN_P_AA"] < fkj.and.weight[,"ALLELIC_PROP_POOL"] &
    fkj.and.weight[,"ALLELIC_PROP_POOL"] < fkj.and.weight[,"MEAN_P_AB"] 
  is.true[is.na(is.true)] <- FALSE
  
  fkj.and.weight[is.true,"FREQ_POOL"] <- 0.5 *
    ((fkj.and.weight[is.true,"ALLELIC_PROP_POOL"] - fkj.and.weight[is.true,"MEAN_P_AA"]) / 
       (fkj.and.weight[is.true,"MEAN_P_AB"] - fkj.and.weight[is.true,"MEAN_P_AA"]))
  fkj.and.weight[is.true,"FREQ_ERROR"]      <- fkj.and.weight[is.true,"WELCH_A"]^2 #WELCH_A squared
  
  #MEAN_P_AB < ALLELIC_PROP_POOL < MEAN_P_BB
  is.true <- fkj.and.weight[,"MEAN_P_AB"] < fkj.and.weight[,"ALLELIC_PROP_POOL"] &
    fkj.and.weight[,"ALLELIC_PROP_POOL"] < fkj.and.weight[,"MEAN_P_BB"]
  is.true[is.na(is.true)] <- FALSE
  
  fkj.and.weight[is.true,"FREQ_POOL"] <- 0.5 + 0.5 *
    ((fkj.and.weight[is.true,"ALLELIC_PROP_POOL"] - fkj.and.weight[is.true,"MEAN_P_AB"]) / 
       (fkj.and.weight[is.true,"MEAN_P_BB"] - fkj.and.weight[is.true,"MEAN_P_AB"])) 
  fkj.and.weight[is.true,"FREQ_ERROR"]      <- fkj.and.weight[is.true,"WELCH_B"]^2 #WELCH_B squared
  
  #ALLELIC_PROP_POOL >= MEAN_P_BB
  is.true <- fkj.and.weight[,"ALLELIC_PROP_POOL"] >= fkj.and.weight[,"MEAN_P_BB"]
  is.true[is.na(is.true)] <- FALSE
  
  fkj.and.weight[is.true,"FREQ_POOL"] <- 1
  fkj.and.weight[is.true,"FREQ_ERROR"]      <- fkj.and.weight[is.true,"WELCH_B"]^2 #WELCH_B squared
  rm(is.true)
  
  #If ALLELIC_PROP_POOL, MEAN_P_AA, MEAN_P_AB or MEAN_P_BB = NA, then make FREQ_POOL = NA
  fkj.and.weight[is.na(fkj.and.weight[,"ALLELIC_PROP_POOL"]) |
                   is.na(fkj.and.weight[,"MEAN_P_AA"]) |
                   is.na(fkj.and.weight[,"MEAN_P_AB"]) |
                   is.na(fkj.and.weight[,"MEAN_P_BB"]),"FREQ_POOL"] <- NA
  
  #Retain relevant columns only
  fkj.and.weight <- fkj.and.weight[,c("SNP_ID", "SAMPLE_ID", "MEAN_P_AA", "MEAN_P_AB", "MEAN_P_BB",
                                      "ALLELIC_PROP_POOL", "FREQ_POOL", "FREQ_ERROR", "SAMPLE_ID")]
  colnames(fkj.and.weight)[colnames(fkj.and.weight) == "FREQ_ERROR"] <- "FREQ_POOL_ERROR_WT" 
  
  #Order by pool identifier
  fkj.and.weight <- fkj.and.weight[order(fkj.and.weight$SAMPLE_ID, decreasing = FALSE), ]   #help("order")
  
  #if FREQ_POOL is NA then make FREQ_POOL_ERROR_WT NA
  fkj.and.weight[is.na(fkj.and.weight[,"FREQ_POOL"]),"FREQ_POOL_ERROR_WT"] <- NA
  
  fkj.and.weight <- fkj.and.weight[,colnames(fkj.and.weight) != "SAMPLE_ID.1"]
  
  return(fkj.and.weight)
}

#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################

#' @export

X.mat.fun <- function(tclj.ls, parent.combns, miss.x.and.y = 0.5) {
  #Generate the X matrix, as described in Henshall et al. (2014)   
  # miss.x.and.y Number. Value to replace missing values in the X matrix and y vector (Henshall et al. 2014 page 6).  
  
  print("Running X.mat.fun")
  
 # if("reshape2" %in% installed.packages()[, "Package"] == FALSE) {install.packages("reshape2")} 
  library(reshape2)
  
  if(sum(c("SNP_ID", "PARENT_COMBN_ID", "MISS_PARENT_COUNT", "B_TRANS_PROB") %in% colnames(tclj.ls)) != 4) {
    stop("tclj.ls input must be a data frame containing the following headings: SNP_ID, PARENT_COMBN_ID, MISS_PARENT_COUNT, B_TRANS_PROB")
  }
  
  fam.set.combns <- unique(tclj.ls[,"FAM_SET_COMBN_ID"])
  
  X.mat <- NULL
  
  for(fam.set.combn in fam.set.combns) {
    
    tcj <- tclj.ls[tclj.ls[,"FAM_SET_COMBN_ID"] == fam.set.combn,]
    tcj <- tcj[,colnames(tcj) != "FAM_SET_COMBN_ID"]
    
    #  tcj <- merge(tcj, parent.combns, by = "PARENT_COMBN_ID", all.x = TRUE)
    tcj <- left_join(tcj, parent.combns, by = "PARENT_COMBN_ID")
    
    colnames(tcj)[colnames(tcj) == "FAMILY_ID_1"] <- "FAMILY_ID"
    
    #Get B_TRANS_PROB
    # tcj$B_TRANS_PROB <- tcj$AB/2  + tcj$BB
    
    #Replace missing values
    #If missing they must be missing in fjfj and snp.param.indiv 
    tcj[tcj$MISS_PARENT_COUNT == 2,"B_TRANS_PROB"] <- miss.x.and.y #if both sire and dam missing genotype data
    print(paste("WARNING:", sum(tcj$MISS_PARENT_COUNT == 2), "missing elements of the X matrix have been replaced with", miss.x.and.y))
    
    tcj$SAMPLE_ID <- tcj$FAMILY_ID
    # Generate X matrices (assume X matrix is the same for each pool)
    tmp.X.mat <- acast(tcj, SNP_ID ~ SAMPLE_ID , value.var = "B_TRANS_PROB")
    tmp.X.mat <- tmp.X.mat[order(rownames(tmp.X.mat), decreasing = FALSE), ] #order by SNP_ID 
    
    X.mat[[fam.set.combn]] <- tmp.X.mat
    
  }
  
  return(X.mat)
}

#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################

#' @export

beta.fun <- function(X.mat, 
                     fkj.and.weight, 
                     fams, 
                     miss.x.and.y = 0.5, 
                     miss.w = 0) {
  #Estimates family contributions to pools based on the method outlined in Henshall et al. 2014
  
  # fkj.and.weight    Data frame. From the output of fkj.and.weight.fun.
  #          1. SNP_ID           is the SNP identifier, 
  #          2. SAMPLE_ID          is the pool identifier
  #          3. FREQ_POOL        is the estimated B allele frequency of the the SNP in the pool (y vector of Henshall et al. 2014 page 6)
  #          4. FREQ_POOL_ERROR_WT is the weight attached to the estimated frequency (w vector of Henshall et al. 2014 page 6)
  
  # miss.x.and.y Number. Value to replace missing values in the X matrix and y vector (Henshall et al. 2014 page 6).  
  
  # miss.w     Number.   Replace missing values in the w vector (Henshall et al. 2014 page 6).  
  
  #Returns##########################################
  
  # beta: Data frame. 
  #          1. SAMPLE_ID   is the pool identifier 
  #          2. SIRE_ID   is the sire identifier, 
  #          3. DAM_ID    is the dam identifier, 
  #          4. FAMILY_ID    is the family identifier. 
  #          5. BETA_STAR is the estimated family contribution (proportion) to the pool as 
  #                       estimated using the pcls function of the mgcv package. Contributions 
  #                       may sum to a number greater than one (see BETA_HAT). See Henshall 
  #                       et al. {, 2014 #2945} page 6.
  #          6. BETA_HAT  is the estimated family contribution (proportion) to the pool 
  #                       where family contributions to each pool are adjusted to sum to one.  
  #                       Calculated as BETA_STAR / (sum of BETA_STAR within each pool).  
  #                       See Henshall et al. {, 2014 #2945} page 6.
  
  print("Running beta.fun")
  
  #Get requried packages
  
 # if("mgcv" %in% installed.packages()[, "Package"] == FALSE) {install.packages("mgcv")} 
  library(mgcv)
  
  fkj.and.weight$SNP_ID           <- as.character(fkj.and.weight$SNP_ID)
  fkj.and.weight$SAMPLE_ID          <- as.integer(fkj.and.weight$SAMPLE_ID)
  fkj.and.weight$FREQ_POOL        <- as.numeric(fkj.and.weight$FREQ_POOL) 
  fkj.and.weight$FREQ_POOL_ERROR_WT <- as.numeric(fkj.and.weight$FREQ_POOL_ERROR_WT) 
  fkj.and.weight <- fkj.and.weight[,c("SNP_ID", "SAMPLE_ID", "FREQ_POOL", "FREQ_POOL_ERROR_WT")]
  
  # Check the list of SNPs the same in input files
  
  if(sum(!unique(fkj.and.weight[,"SNP_ID"]) %in% rownames(X.mat))>0 & 
     sum(!rownames(X.mat) %in% unique(fkj.and.weight[,"SNP_ID"]))>0) {
    print("SNP identifiers do not match in fkj.and.weight and X.mat")
    stop()
  }
  
  # Check that there are more SNP than families
  
  if(nrow(X.mat) <= ncol(X.mat)) {
    print("There are more families than SNP.  Reduce the number of families or increase the number of SNP")
    stop()
  }
  
  #Loop through pools
  fkj.and.weight <- fkj.and.weight[order(fkj.and.weight[,"SNP_ID"], decreasing = FALSE), ]   #order by SNP_ID 
  colnames(fkj.and.weight)   <- c("SNP_ID", "SAMPLE_ID", "FREQ_POOL", "FREQ_POOL_ERROR_WT")  
  
  #Replace missing values
  print(paste("WARNING:", sum(is.na(fkj.and.weight[,"FREQ_POOL"])), "missing elements of the y vector have been replaced with", miss.x.and.y))
  fkj.and.weight[is.na(fkj.and.weight[,"FREQ_POOL"]),"FREQ_POOL"] <- miss.x.and.y
  
  print(paste("WARNING:", sum(is.na(fkj.and.weight[,"FREQ_POOL_ERROR_WT"])), "missing elements of the w vector have been replaced with", miss.w))
  fkj.and.weight[is.na(fkj.and.weight[,"FREQ_POOL_ERROR_WT"]),"FREQ_POOL_ERROR_WT"] <- miss.w
  
  #Empty matrices
  pools <- unique(fkj.and.weight$SAMPLE_ID)
  beta.star   <- matrix(NA, 
                        nrow = length(pools),
                        ncol = ncol(X.mat))
  rownames(beta.star) <- pools
  colnames(beta.star) <- colnames(X.mat)
  
  beta.hat <- beta.star
  
  for (pool in pools) {
    
    beta.star.tmp <- NA
    beta.hat.tmp  <- NA
    
    print(pool)
    
    #Get y and w vectors (see Henshall et al. (2014) pages 5 and 6)
    y <- fkj.and.weight[fkj.and.weight[,"SAMPLE_ID"] == pool, "FREQ_POOL"]
    w <- matrix(fkj.and.weight[fkj.and.weight[,"SAMPLE_ID"] == pool, "FREQ_POOL_ERROR_WT"],ncol = 1)
    
    # Estimate beta using pcls function of mgcv package
    
    # Inputs for pcls funtion. See help(pcls)
    M <- list( y   = y,
               w   = w,
               X   = X.mat,
               C   = matrix(0,0,0),
               p   = rep(1, ncol(X.mat)),
               off = array(0,0),
               S   = list(),
               sp  = array(0,0),
               Ain = diag(ncol(X.mat)),
               bin = rep(0, ncol(X.mat)) )
    
    
    try( beta.star.tmp     <- pcls(M) )
    beta.star.tmp <- round(beta.star.tmp,10)
    
    beta.hat.tmp      <- beta.star.tmp / sum(beta.star.tmp)
    
    beta.star[rownames(beta.star) == pool,] <- beta.star.tmp
    beta.hat[rownames(beta.hat) == pool,]   <- beta.hat.tmp
    
  }
  
  #Generate beta data frame
  
  #Rearrange beta.star
  tmp.beta.star <- NULL
  #Loop through columns
  for (c in 1:ncol(beta.star)) {
    tmp.1 <- data.frame(SAMPLE_ID = rownames(beta.star),
                        FAMILY_ID = colnames(beta.star)[c],
                        BETA_STAR = beta.star[,c])
    tmp.beta.star <- rbind(tmp.beta.star,tmp.1)
  }
  tmp.beta.star[,"SAMPLE_ID"] <- as.integer(as.character(tmp.beta.star[,"SAMPLE_ID"]))
  tmp.beta.star[,"FAMILY_ID"] <- as.integer(as.character(tmp.beta.star[,"FAMILY_ID"]))
  
  #Rearrange beta.hat
  tmp.beta.hat <- NULL
  #Loop through columns
  for (c in 1:ncol(beta.hat)) {
    tmp.1 <- data.frame(SAMPLE_ID = rownames(beta.hat),
                        FAMILY_ID = colnames(beta.hat)[c],
                        BETA_HAT = beta.hat[,c])
    tmp.beta.hat <- rbind(tmp.beta.hat,tmp.1)
  }
  tmp.beta.hat[,"SAMPLE_ID"] <- as.integer(as.character(tmp.beta.hat[,"SAMPLE_ID"]))
  tmp.beta.hat[,"FAMILY_ID"]  <- as.integer(as.character(tmp.beta.hat[,"FAMILY_ID"]))
  
  beta <- merge(tmp.beta.star, tmp.beta.hat, by = c("SAMPLE_ID", "FAMILY_ID"))
  
  #  beta <- merge(beta, fams, by = "FAMILY_ID", all.x = TRUE)
  beta <- left_join(beta, fams, by = "FAMILY_ID")
  
  beta <- beta[,c("SAMPLE_ID", "SIRE_ID", "DAM_ID", "FAMILY_ID", "BETA_STAR", "BETA_HAT")] 
  
  beta <- beta[order(beta$SAMPLE_ID, decreasing = FALSE), ] 
  
  return(beta)
}

#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################

#' @export

beta.min.ss.fun <- function(X.mat, fkj.and.weight, miss.x.and.y = 0.5, miss.w = 0, fam.set.combns, fams) {
  #Estimates family contributions to pools based on the method outlined in Henshall et al. 2014.  And then
  #estimates beta constrained to possible values given the known number of individuals in a pool an minimum sum of squares identified
  #and possible contributors.  That is elements of BETA_MIN_SS must be 0 or a multiple of 1 / (contributors to the pool)
  
  # fkj.and.weight    Data frame. From the output of fkj.and.weight.fun.
  #          1. SNP_ID           is the SNP identifier, 
  #          2. SAMPLE_ID          is the pool identifier
  #          3. FREQ_POOL        is the estimated B allele frequency of the the SNP in the pool (y vector of Henshall et al. 2014 page 6)
  #          4. FREQ_POOL_ERROR_WT is the weight attached to the estimated frequency (w vector of Henshall et al. 2014 page 6)
  
  # miss.x.and.y Number. Value to replace missing values in the X matrix and y vector (Henshall et al. 2014 page 6).  
  
  # miss.w     Number.   Replace missing values in the w vector (Henshall et al. 2014 page 6).  
  
  # n.by.pool  Data frame. See indiv.ped.file.fun. Generally NULL but if not:
  #          1. SAMPLE_ID  is the pool identifier and 
  #          2. N_INDIV is the number of individuals that contributed to the pool.    
  
  #Returns##########################################
  
  # beta: Data frame. 
  #          1. SAMPLE_ID   is the pool identifier 
  #          2. SIRE_ID   is the sire identifier, 
  #          3. DAM_ID    is the dam identifier, 
  #          4. FAMILY_ID    is the family identifier. 
  #          5. BETA_STAR is the estimated family contribution (proportion) to the pool as 
  #                       estimated using the pcls function of the mgcv package. Contributions 
  #                       may sum to a number greater than one (see BETA_HAT). See Henshall 
  #                       et al. {, 2014 #2945} page 6.
  #          6. BETA_HAT  is the estimated family contribution (proportion) to the pool 
  #                       where family contributions to each pool are adjusted to sum to one.  
  #                       Calculated as BETA_STAR / (sum of BETA_STAR within each pool).  
  #                       See Henshall et al. {, 2014 #2945} page 6.
  #          7. BETA_MIN_SS beta constrained to possible contbributions given n.by.pool and min sum of squares identified.  That is
  #                       elements of BETA_MIN_SS must be 0 or a multiple of 1 / (contributors to the pool)
  
  print("Running beta.min.ss.fun")
  
  #Get requried packages
  
 # if("mgcv" %in% installed.packages()[, "Package"] == FALSE) {install.packages("mgcv")} 
  library(mgcv)
  
  fkj.and.weight$SNP_ID             <- as.character(fkj.and.weight$SNP_ID)
  fkj.and.weight$SAMPLE_ID          <- as.integer(fkj.and.weight$SAMPLE_ID)
  fkj.and.weight$FREQ_POOL          <- as.numeric(fkj.and.weight$FREQ_POOL) 
  fkj.and.weight$FREQ_POOL_ERROR_WT <- as.numeric(fkj.and.weight$FREQ_POOL_ERROR_WT) 
  fkj.and.weight <- fkj.and.weight[,c("SNP_ID", "SAMPLE_ID", "FREQ_POOL", "FREQ_POOL_ERROR_WT")]
  
  # Check the list of SNPs the same in input files
  
  if(sum(!unique(fkj.and.weight[,"SNP_ID"]) %in% rownames(X.mat))>0 & 
     sum(!rownames(X.mat) %in% unique(fkj.and.weight[,"SNP_ID"]))>0) {
    print("SNP identifiers do not match in fkj.and.weight and X.mat")
    stop()
  }
  
  # Check that there are more SNP than families
  if(nrow(X.mat) <= ncol(X.mat)) {
    print("There are more families than SNP.  You need to reduce the number of families or increase the number of SNP to run Least_squares method")
    stop()
  }
  
  #Loop through pools
  fkj.and.weight <- fkj.and.weight[order(fkj.and.weight[,"SNP_ID"], decreasing = FALSE), ]   #order by SNP_ID 
  colnames(fkj.and.weight)   <- c("SNP_ID", "SAMPLE_ID", "FREQ_POOL", "FREQ_POOL_ERROR_WT")  
  
  #Replace missing values
  print(paste("WARNING:", sum(is.na(fkj.and.weight[,"FREQ_POOL"])), "missing elements of the y vector have been replaced with", miss.x.and.y))
  fkj.and.weight[is.na(fkj.and.weight[,"FREQ_POOL"]),"FREQ_POOL"] <- miss.x.and.y
  
  print(paste("WARNING:", sum(is.na(fkj.and.weight[,"FREQ_POOL_ERROR_WT"])), "missing elements of the w vector have been replaced with", miss.w))
  fkj.and.weight[is.na(fkj.and.weight[,"FREQ_POOL_ERROR_WT"]),"FREQ_POOL_ERROR_WT"] <- miss.w
  
  #Empty matrices
  pools <- unique(fkj.and.weight$SAMPLE_ID)
  beta.star   <- matrix(NA, 
                        nrow = length(pools),
                        ncol = ncol(X.mat))
  rownames(beta.star) <- pools
  colnames(beta.star) <- colnames(X.mat)
  
  beta.hat <- beta.star
  beta.contstrained <- beta.star
  beta.contstrained <- as.data.frame(beta.contstrained)
  
  for (pool in pools) {
    
    beta.star.tmp <- NA
    beta.hat.tmp  <- NA
    
    print(paste("Pool",pool))
    
    #Get y and w vectors (see Henshall et al. (2014) pages 5 and 6)
    y <- fkj.and.weight[fkj.and.weight[,"SAMPLE_ID"] == pool, "FREQ_POOL"]
    w <- matrix(fkj.and.weight[fkj.and.weight[,"SAMPLE_ID"] == pool, "FREQ_POOL_ERROR_WT"],ncol = 1)
    
    # Estimate beta using pcls function of mgcv package
    
    # Inputs for pcls funtion. See help(pcls)
    M <- list( y   = y,
               w   = w,
               X   = X.mat,
               C   = matrix(0,0,0),
               p   = rep(1, ncol(X.mat)),
               off = array(0,0),
               S   = list(),
               sp  = array(0,0),
               Ain = diag(ncol(X.mat)),
               bin = rep(0, ncol(X.mat)) )
    
    try( beta.star.tmp     <- pcls(M) )
    beta.star.tmp <- round(beta.star.tmp,10)
    
    beta.hat.tmp      <- beta.star.tmp / sum(beta.star.tmp)
    
    beta.star[rownames(beta.star) == pool,] <- beta.star.tmp
    beta.hat[rownames(beta.hat) == pool,]   <- beta.hat.tmp
    
    #compute beta.contstrained
    
    #Get number of contributors in the pool
    # n <- n.by.pool[n.by.pool[,"SAMPLE_ID"] == pool,"N_INDIV"]
    n.in.pool <- length(unique(fam.set.combns[,"FAM_SET_ID"]))
    
    #estimate integer contributions of families in pool
    #    beta.integer <- round(beta.hat.tmp*n)
    #    
    #get all possible combinations of beta constrained to beta.integer +/- tmp
    #    tmp <- max(2, round(0.15*n))
    #    beta.integer <- data.frame(MIN = beta.integer - 2,
    #                               EXPECTED = beta.integer,
    #                               MAX = beta.integer + 2)
    #    beta.integer[beta.integer[,] < 0] <- 0
    #    beta.integer[beta.integer[,] > n] <- n
    
    #    combinations <- NULL
    #    for (i in 1:nrow(beta.integer)) {
    #      if(i == 1) {
    #        combinations <- as.data.frame(beta.integer[i,"MIN"]:beta.integer[i,"MAX"])
    #      } else {
    #        combinations <- merge(combinations,(beta.integer[i,"MIN"]:beta.integer[i,"MAX"]))
    #        combinations <- combinations[rowSums(combinations) <= n,] #remove row if row sum is greater than n
    #        colnames(combinations) <- 1:ncol(combinations)
    #      }
    #    }
    
    #remove rows that don't sum to n
    #    combinations <- as.matrix(combinations[rowSums(combinations) == n,])
    #    combinations <- combinations / n #express as proportion
    
    combinations <- parent.combns.fun(fams, 
                                      n.in.pools = n.in.pool,
                                      fam.set.combns = fam.set.combns)$parent.combns
    combinations <- combinations[,1:(n.in.pool+1)]
    
    #compute ||(sqrt(w) * (X * beta - y))||^2
    
    combinations[,"SUM_OF_SQUARES"] <- NA
    for (j in 1:nrow(combinations)) {
      print(paste("Make beta.min.ss = FALSE if this is too slow.  Sum of squares", j, "of", nrow(combinations)))
      tmp.beta <- combinations[j,!colnames(combinations) %in% c("FAM_COMBN_ID", "SUM_OF_SQUARES")]
      tmp.beta <- colnames(X.mat) %in% tmp.beta
      tmp.beta <- tmp.beta / n.in.pool
      # tmp.beta <- as.matrix(combinations[j,-ncol(combinations)])
      combinations[j,"SUM_OF_SQUARES"] <- sum((sqrt(w) * (X.mat %*% tmp.beta - y))^2)
    }
    
    #identify minimum SUM_OF_SQUARES
    beta.contstrained.tmp <- combinations[combinations[,"SUM_OF_SQUARES"] == min(combinations[,"SUM_OF_SQUARES"]),-ncol(combinations)]
    beta.contstrained.tmp <- colSums(beta.contstrained.tmp) / nrow(beta.contstrained.tmp) #in case min SUM_OF_SQUARES is the same for multiple combinations
    
    beta.contstrained.tmp <- colnames(beta.contstrained) %in% beta.contstrained.tmp[-1] / n.in.pool
    
    beta.contstrained[rownames(beta.contstrained) == pool,]   <- as.vector(beta.contstrained.tmp)
  }
  
  #Generate beta data frame
  
  #Rearrange beta.star
  tmp.beta.star <- NULL
  #Loop through columns
  for (c in 1:ncol(beta.star)) {
    tmp.1 <- data.frame(SAMPLE_ID = rownames(beta.star),
                        FAMILY_ID = colnames(beta.star)[c],
                        BETA_STAR = beta.star[,c])
    tmp.beta.star <- rbind(tmp.beta.star,tmp.1)
  }
  tmp.beta.star[,"SAMPLE_ID"] <- as.integer(as.character(tmp.beta.star[,"SAMPLE_ID"]))
  tmp.beta.star[,"FAMILY_ID"] <- as.integer(as.character(tmp.beta.star[,"FAMILY_ID"]))
  
  #Rearrange beta.hat
  tmp.beta.hat <- NULL
  #Loop through columns
  for (c in 1:ncol(beta.hat)) {
    tmp.1 <- data.frame(SAMPLE_ID = rownames(beta.hat),
                        FAMILY_ID = colnames(beta.hat)[c],
                        BETA_HAT = beta.hat[,c])
    tmp.beta.hat <- rbind(tmp.beta.hat,tmp.1)
  }
  tmp.beta.hat[,"SAMPLE_ID"] <- as.integer(as.character(tmp.beta.hat[,"SAMPLE_ID"]))
  tmp.beta.hat[,"FAMILY_ID"]  <- as.integer(as.character(tmp.beta.hat[,"FAMILY_ID"]))
  
  
  #Rearrange beta.contstrained
  tmp.beta.contstrained <- NULL
  #Loop through columns
  for (c in 1:ncol(beta.contstrained)) {
    tmp.1 <- data.frame(SAMPLE_ID = rownames(beta.contstrained),
                        FAMILY_ID = colnames(beta.contstrained)[c],
                        BETA_MIN_SS = beta.contstrained[,c])
    tmp.beta.contstrained <- rbind(tmp.beta.contstrained,tmp.1)
  }
  tmp.beta.contstrained[,"SAMPLE_ID"] <- as.integer(as.character(tmp.beta.contstrained[,"SAMPLE_ID"]))
  tmp.beta.contstrained[,"FAMILY_ID"] <- as.integer(as.character(tmp.beta.contstrained[,"FAMILY_ID"]))
  
  
  beta <- merge(tmp.beta.star, tmp.beta.hat, by = c("SAMPLE_ID", "FAMILY_ID"))
  beta <- merge(beta, tmp.beta.contstrained, by = c("SAMPLE_ID", "FAMILY_ID"))
  
  #beta <- merge(beta, fams, by = "FAMILY_ID", all.x = TRUE)
  beta <- left_join(beta, fams, by = "FAMILY_ID")
  
  beta <- beta[,c("SAMPLE_ID", "SIRE_ID", "DAM_ID", "FAMILY_ID", "BETA_STAR", "BETA_HAT", "BETA_MIN_SS")] 
  
  beta <- beta[order(beta$SAMPLE_ID, decreasing = FALSE), ] 
  
  return(beta)
}

#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################

#' @export

bar.plot.fun <- function(beta,
                         file.name = "",
                         var = "BETA_HAT",
                         heading = "Estimated family contributions to pooled samples",
                         plot.to.heading.height = 20,
                         font.size.heading = 3,
                         font.size.y.axis = 2,
                         font.size.x.axis = 2 
) {
  
  #Generates abar plot of contributions of families to pools and dendrogram that groups pools
  
  #Args##########################################
  # beta: Data frame.  From output of beta.fun. 
  #          1. SAMPLE_ID   is the pool identifier 
  #          2. FAMILY_ID    is the family identifier. 
  #          3. BETA_STAR is the estimated family contribution (proportion) to the pool as 
  #                       estimated using the pcls function of the mgcv package. Contributions 
  #                       may sum to a number greater than one (see BETA_HAT). See Henshall 
  #                       et al. {, 2014 #2945} page 6.
  #          4. BETA_HAT  is the estimated family contribution (proportion) to the pool 
  #                       where family contributions to each pool are adjusted to sum to one.  
  #                       Calculated as BETA_STAR / (sum of BETA_STAR within each pool).  
  #                       See Henshall et al. {, 2014 #2945} page 6.
  
  # file.name:              Text. Name of thebar plot file
  # var:                    Text.  Variable to plot "BETA_STAR" or "BETA_HAT"
  # heading:                Text. Title of thebar plot 
  # plot.to.heading.height: Number. Height of the title relative to the height of thebar plot 
  # font.size.heading:      Number. Font size ofbar plot heading.
  # font.size.y.axis:       Number. Font size ofbar plot y axis labels
  # font.size.x.axis:       Number. Font size ofbar plot x axis labels
  
  # Returns:
  # heat.map.png saved to the working directory
  
  print("Running bar.plot.fun")
  
  #Load required packages
 # if("RColorBrewer" %in% installed.packages()[, "Package"] == FALSE) {install.packages("RColorBrewer")} 
  library(RColorBrewer)
  
 # if("gplots" %in% installed.packages()[, "Package"] == FALSE) {install.packages("gplots")} 
  library(gplots)
  
 # if("reshape2" %in% installed.packages()[, "Package"] == FALSE) {install.packages("reshape2")} 
  library(reshape2)  
  
 # if("ggplot2" %in% installed.packages()[, "Package"] == FALSE) {install.packages("ggplot2")} 
  library(ggplot2)
  
  #Name columns and assign class
  beta$SAMPLE_ID     <- as.integer(beta$SAMPLE_ID  )
  beta$FAMILY_ID      <- as.integer(beta$FAMILY_ID   )  
  beta$BETA_STAR    <- as.numeric(beta$BETA_STAR )
  beta$BETA_HAT    <- as.numeric(beta$BETA_HAT )
  
  if("BETA_MIN_SS" %in% colnames(beta)) {
    if(sum(is.na(beta[,"BETA_MIN_SS"])) != nrow(beta)) { #if BETA_MIN_SS all equal NA
      beta$BETA_MIN_SS    <- as.numeric(beta$BETA_MIN_SS)
      beta$MIN_SUM_SQ <- beta$BETA_MIN_SS != 0
    } else {
      beta <- beta[,c("SAMPLE_ID", "FAMILY_ID", "BETA_STAR", "BETA_HAT")]
      beta$MIN_SUM_SQ <- FALSE
    }
  } else {
    beta <- beta[,c("SAMPLE_ID", "FAMILY_ID", "BETA_STAR", "BETA_HAT")]
    beta$MIN_SUM_SQ <- FALSE
  }
  
  beta$VAR <- beta[,colnames(beta) == var]
  beta$FAMILY_ID <- as.character(beta$FAMILY_ID)
  
  # Bar graph, family on x-axis, color fill grouped by pool -- use position_dodge()
  barplot <- ggplot(data=beta, aes(x=FAMILY_ID, y=VAR, fill=MIN_SUM_SQ)) 
  #barplot <- barplot + ylab(var) 
  barplot <- barplot + labs(y = var)
  barplot <- barplot + geom_bar(stat="identity", position=position_dodge())
  barplot <- barplot + theme(axis.text.x=element_text(angle = 90, vjust = 0.5)) 
  ggsave(filename = paste(file.name,".bar.png",sep=""), plot = barplot)
  
}
mghamilton/SNPpools documentation built on Feb. 13, 2021, 12:52 a.m.