R/relatedness.R
In KinformR: Relationship-Informed Pedigree and Variant Scoring

Documented in add.fam.scores calc.rv.score score.fam subset.mat

#' Calculate a relatedness-weighted score for a given rare variant.
#'
#' These scores can be used to compare variants of interest within a family.
#'
#' For each encoded relationship, a relationship-informed weight is applied to their sharing
#' or not sharing of a variant.
#' The score for affected status is:
#'     (1 / coefficient.of.relatedness) * status.weight
#' For example, an affected cousin (encoded as a 3) would get a score of:
#'     (1/0.125) * affected weight
#'     8 * 1
#'     = 8 points in favour of the variant.
#' Whereas for unaffected individuals, scores decay the further a person is in
#' relation to the proband based on the formula:
#'     ((unaffected.max*2) * coefficient.of.relatedness ) * unaffected.weight
#' For example, with the default unaffected.max of 8. The sister that does not have a variant would get a score of
#'     ((8*2) 0.5) * unaffected.weight
#'     (16 * 0.5) * 0.5
#'     = 4 points for the variant.
#' If these were the only two relatives considered we could sum the points
#' and get a score in favour of the variant of
#'     8 + 4 = 12
#' If there is evidence against a variant, this is factored into the score as:
#'     total.score = evidence.for - evidence.against
#' For example, if there were also an affected sibling without the variant we would have the score against of:
#'  (1/0.5) * 1 = 2
#' The final score for the variant would then be
#'     for - against = total
#'     12 - 2 = 10
#' Giving a final score of 10 for the variant. Comparing values across variants can be used
#' to rank them based on pedigree-informed levels of variant sharing across affected
#' and unaffected individuals.
#'     Increasing the affected.weight relative to the unaffected.weight will make the scores
#'     give more weight to the correct/incorrect status of affected individuals. The default
#'     is 2:1 weight for affected relative to unaffected, which accounts for the fact that
#'     variants are likely to be incompletely penetrant and we therefore want to be more tolerant
#'     of unaffected individuals that have a variant rather than affected individuals that do not.
#'
#' Input:
#'
#' @param fam.list
#'         - A list with the names: 'A.c', 'A.i', 'U.c', 'U.i'
#'           respectively containing the affected correct, affected incorrect,
#'           unaffected correct and unaffected incorrect.
#'         - This can be generated with the function: score.variant.status
#'         - where each value in the dictionary is a list containing the reference and the
#'           reference's relatives as encoded based on their degree of relatedness to the
#'           reference (reference = 0, sibling/parent/offspring = 1, uncle/grandparent = 2,
#'           cousin = 3, etc.)
#' @param affected.weight A coefficient to multiply the calculated A.c and A.i relatedness values by.
#' @param unaffected.weight A coefficient to multiply the U.c and U.i relatedness values by.
#' @param unaffected.max This  param controls the score given to a first degree unaffected relatives
#'         scores decay from this specified maximum by half for each subsequent relationship degree.
#' @param max.err A heuristic cap of the number of incorrect assignments allowed when scoring. When the total number
#' of incorrect (sum of affected and unaffected) is exceeded,  the variant's score is set to 0, regardless of the number
#' of points for or against. This simplifies scoring and allows for fast filtering of poor quality variants. Default is 4.
#' @return A list with three components: score, score.for, score.against.
#'
#' @examples
#' relations <- list("A.c" = c(0, 1, 3, 1), "A.i" = c(3), "U.c" = c(1, 2), "U.i" = c(1))
#' rv.scores <- calc.rv.score(relations)
#' @export
calc.rv.score <- function(fam.list, affected.weight = 1, unaffected.weight = 0.5, unaffected.max = 8, max.err = 4) {
  relatedness <- list()

  for (i in 0:8) {
    relatedness[i + 1] <- 1 / (2^(i))
  }

  score.dict <- list()

  score.dict[["A.c"]]
  score.dict[["A.i"]]
  score.dict[["U.c"]]
  score.dict[["U.i"]]

  # TODO - change this to apply the different formula for the unaffecteds
  for (n in names(fam.list)) {
    scores <- c()
    if (n == "A.c" || n == "A.i") {
      for (x in fam.list[[n]]) {
        # for affected, importance score gets higher the further from reference individual you get.
        scores <- c(scores, 1 / relatedness[[x + 1]])
      }
    } else if (n == "U.c" || n == "U.i") {
      for (x in fam.list[[n]]) {
        # for unaffected, importance score gets lower the further from reference individual you get.
        scores <- c(scores, (unaffected.max * 2) * relatedness[[x + 1]])
      }
    }
    score.dict[[n]] <- scores
  }

  weighted.for <- sum(score.dict[["A.c"]] * affected.weight) +
    sum(score.dict[["U.c"]] * unaffected.weight)


  weighted.against <- sum(score.dict[["A.i"]] * affected.weight) +
    sum(score.dict[["U.i"]] * unaffected.weight)

  out.list <- list(
    "score" = weighted.for - weighted.against,
    "score.for" = weighted.for,
    "score.against" = weighted.against
  )


  n.incor <- length(score.dict[["A.i"]]) + length(score.dict[["U.i"]])
  if (n.incor > max.err) {
    out.list[["score"]] <- 0
  }
  return(
    out.list
  )
}



#' Take the matrix and subset out only the encoded individuals that are present in the status dataframe.
#' @param mat.df The full matrix file to subset
#' @param status.df The list of sampled individuals, matrix is subset to only these individuals.
#' @return A subset of the input matrix.
subset.mat <- function(mat.df, status.df) {
  sub.ids <- rownames(mat.df)[rownames(mat.df) %in% status.df$name]
  sub.mat <- mat.df[sub.ids, sub.ids]
  return(sub.mat)
}



#' Given a relationship matrix and status dataframe, score a family by applying the calc.rv.score
#' scoring system to every pairwise combination of individuals.
#'
#' By default all individuals are treated as the reference 'proband' and
#' the given variant's score  is calculated based on relationships to all other individuals.
#' e.g. for each row in the relationship matrix. calc.rv.score is run, with the row name indicating the
#' reference individual that the calculation is relative to.
#' Note that the relation.mat can include more individuals than are present within the status.df, but the matrix will
#' be subset to include only those individuals that have status information provided.
#'
#' There are several return options possible.
#'
#' - If affected.only is TRUE, the final scores will be reported for only rows where the reference
#' individual is affected (default = True).
#' - If return.means is TRUE, the average scores for the rows will be reported. (default = TRUE)
#' - If return.sums is True, the sum of the scores for all the rows will be reported. (default = False)
#' NOTE: if affected.only = True, the averages and sums are calculated using only the affected reference individuals.
#'
#' @param relation.mat A relationship matrix for the family.
#' @param status.df A dataframe with the encoded variant/disease status of each individual
#' @param affected.weight A coefficient to multiply the calculated A.c and A.i relatedness values by.
#' @param unaffected.weight A coefficient to multiply the U.c and U.i relatedness values by.
#' @param return.sums Boolean indicating if sum of family variant scores should be returned (default = FALSE).
#' @param return.means Boolean indicating if mean of all family variant scores should be returned (default = TRUE).
#' @param affected.only Boolean indicating if family score should be calculated using only affected individuals (default = TRUE).
#' @param max.err A heuristic cap of the number of incorrect assignments allowed when scoring. When the total number
#' of incorrect (sum of affected and unaffected) is exceeded,  the variant's score is set to 0, regardless of the number
#' of points for or against. This simplifies scoring and allows for fast filtering of poor quality variants. Default is 4.
#' @return A labelled vector with names: score, score.for, score.against
#' @examples
#' mat.name1 <- system.file("extdata/1234_ex2.mat", package = "KinformR")
#' tsv.name1 <- system.file("extdata/1234_ex2.tsv", package = "KinformR")
#' mat.df <- read.relation.mat(mat.name1)
#' ind.df <- read.indiv(tsv.name1)
#' ind.df.status <- score.variant.status(ind.df)
#' score.default <- score.fam(mat.df, ind.df.status)
#' @export
score.fam <- function(relation.mat, status.df, affected.weight = 1, unaffected.weight = 0.5,
                      return.sums = FALSE, return.means = TRUE,
                      affected.only = TRUE, max.err = 4) {
  # The family encoding matrix needs to be subset to include only the individuals in the status dataframe
  sub.relation.mat <- subset.mat(relation.mat, status.df)

  encoded.dat <- encode.rows(sub.relation.mat, status.df, drop.unrelated = TRUE)

  per.indv.scores <- lapply(encoded.dat, calc.rv.score,
    affected.weight = affected.weight, unaffected.weight = unaffected.weight,
    max.err = max.err
  )

  scores <- do.call(rbind.data.frame, per.indv.scores)

  if (affected.only) {
    affected.indiv <- status.df[status.df$status == "A", ]
    scores <- scores[row.names(scores) %in% affected.indiv$name, ]
  }

  if (return.sums) {
    return(colSums(scores))
  } else if (return.means) {
    return(colMeans(scores))
  }
  return(scores)
}


#' Sum all the given scores and return a single vector with cumulative "score", "for" and "against" vals.
#' For use in instances where one wishes to combine scores from multiple families.
#' @param score.vec A vector will all of the per family score outputs.
#' @return A vector with the summed scores of all inputs.
#' @examples
#' score.fam1 <- c("score" = 1.0, "score.for" = 2.0, "score.against" = 1.0)
#' score.fam2 <- c("score" = 1.0, "score.for" = 3.0, "score.against" = 2.0)
#' out <- add.fam.scores(c(score.fam1, score.fam2))
#' @export
add.fam.scores <- function(score.vec) {
  outvec <- tapply(score.vec, names(score.vec), sum)

  sorted.out <- c(
    "score" = outvec[["score"]],
    "score.for" = outvec[["score.for"]],
    "score.against" = outvec[["score.against"]]
  )
  return(sorted.out)
}