R/process_cnv_call.R

#' predict the inheritance state of a CNV call
#'
#' @param samples dataframe listing sample IDs, and file paths
#' @param probes dataframe of log-2-ratio (or adm3 score) values for all
#'        participants, for the exome probes that lie within the CNV region.
#' @param child_id ID of the proband
#' @param mom_id sample ID for the proband's mother
#' @param dad_id sample ID for the proband's father
#' @param cnv row information for CNV, include this if you want to plot the
#'        underlying data for the inheritance prediction
#' @export
#'
#' @return list of inheritance classification, the mother's p-value and the
#'        father's p-value.
#'
#' @examples
#' samples = read.table(header=TRUE, text="
#'        individual_id is_proband
#'        A             FALSE
#'        B             FALSE
#'        C             FALSE
#'        D             FALSE
#'        E             TRUE
#'        F             FALSE
#'        G             FALSE
#'        H             FALSE
#'        I             FALSE
#'        J             FALSE
#'        K             TRUE
#'        L             FALSE",
#'        colClasses=c("character", "logical"))
#' probes = read.table(header=TRUE, text="
#'        probe A B C D E F G H I J K
#'        p1    5 6 4 5 5 6 5 5 4 6 9
#'        p2    5 6 4 5 5 6 5 5 4 6 9
#'        p3    5 6 4 5 5 6 5 5 4 6 9
#'        p4    5 6 4 5 5 6 5 5 4 6 9")
#' child_id = "K"
#' mom_id = "J"
#' dad_id = "I"
#' process_cnv_call(samples, probes, child_id, mom_id, dad_id)
#'
#' # run another example, this time with a larger dataset
#' cohort_n = 500
#' sample_ids = paste("sample", 1:cohort_n, sep="_")
#' samples = data.frame("individual_id"=sample_ids,
#'     "is_proband"=c(rep(FALSE, length(sample_ids) - 1), TRUE))
#'
#' # define the population as having probes values cenetred around zero
#' probes = data.frame(matrix(rnorm(length(sample_ids) * 5), nrow=5))
#' names(probes) = sample_ids
#'
#' # set the child probe values to distant from the population values
#' probes[sample_ids[length(sample_ids)]] = rnorm(5, mean=10, sd=1)
#'
#' # define the sample IDs for the trio members
#' child_id = sample_ids[length(sample_ids)]
#' mom_id = sample_ids[1]
#' dad_id = sample_ids[2]
#'
#' process_cnv_call(samples, probes, child_id, mom_id, dad_id)
process_cnv_call <- function(samples, probes, child_id, mom_id, dad_id, cnv=NA) {
    
    # make sure all the parents are included in the probe dataset, so that we
    # can order the parental z scores and correlations correctly.
    samples$individual_id = as.character(samples$individual_id)
    parent_ids = unique(samples[!samples$is_proband, ]$individual_id)
    
    # drop out the parents for the current child, so we can be sure the population
    # parameters aren't biased by the child's parents.
    parent_ids = parent_ids[!(parent_ids %in% c(mom_id, dad_id))]
    missing_parents = parent_ids[!parent_ids %in% names(probes)]
    for (parent in missing_parents) {
        probes[[parent]] = NA
    }
    
    # find the Z scores for the population, and the family of interest
    z_scores = get_l2r_z_scores(probes[[mom_id]], probes[[dad_id]],
        probes[[child_id]], probes[, parent_ids])
    
    # predict the inheritance state of the childs CNV
    prediction = predict_inheritance(z_scores)
    
    if (!is.na(cnv)) { plot_cnv(z_scores, cnv) }
    
    return(prediction)
}
jeremymcrae/cifer documentation built on May 19, 2019, 5:08 a.m.