R/get_iossifov_nature_de_novos.R

#' get de novo data from the 2014 Iossifov et al. autism exome study in Nature
#'
#' De novo mutation data sourced from Supplementary table 2:
#' Iossifov et al. (2014) Nature 498:216-221
#' doi: 10.1038/nature13908
#'
#' @export
#'
#' @return data frame of de novos, including gene symbol, functional consequence
#'     (VEP format), chromosome, nucleotide position and SNV or INDEL type
iossifov_nature_de_novos <- function() {
    tmpdir = tempdir()
    path = tempfile(tmpdir=tmpdir)
    
    # obtain the dataframe of de novo variants
    download.file("http://www.nature.com/nature/journal/v515/n7526/extref/nature13908-s2.zip", path)
    unzip(path, files=c("nature13908-s2/Supplementary Table 1.xlsx",
        "nature13908-s2/Supplementary Table 2.xlsx"), exdir=tmpdir)
    variants = gdata::read.xls(file.path(tmpdir, "nature13908-s2", "Supplementary Table 2.xlsx"), stringsAsFactors=FALSE)
    families = gdata::read.xls(file.path(tmpdir, "nature13908-s2", "Supplementary Table 1.xlsx"), stringsAsFactors=FALSE)
    unlink(path)
    
    variants = fix_coordinates(variants, "location", "vcfVariant")
    
    # exclude the de novos from the unaffected sibs
    variants = variants[!grepl("^s", variants$inChild), ]
    
    # get the sex of the probands
    variants$sex = substr(variants$inChild, 2, 2)
    variants$sex[variants$sex == "M"] = "male"
    variants$sex[variants$sex == "F"] = "female"
    
    # NOTE: the variant with the most severe consequence might not necessarily
    # NOTE: be within the listed gene. I haven't accounted for this yet.
    vep = apply(variants, 1, get_vep_consequence, verbose=TRUE)
    variants$consequence = sapply(vep, "[", 1)
    variants$hgnc = sapply(vep, "[", 2)
    
    variants$person_id = variants$familyId
    variants$study_code = "iossifov_nature_2014"
    variants$publication_doi = "10.1038/nature13908"
    variants$study_phenotype = "autism"
    
    # identify the family IDs of the probands with normal IQ, as we shall
    # reclassify these probands as having autism with a normal IQ.
    normal_iq = families$familyId[families$probandVIQ >= 70 & families$probandNVIQ >= 70]
    normal_iq = normal_iq[!is.na(normal_iq)]
    variants$study_phenotype[variants$person_id %in% normal_iq] = "normal_iq_autism"
    
    variants = subset(variants, select=c("person_id", "sex", "chrom", "start_pos",
        "end_pos", "ref_allele", "alt_allele", "hgnc", "consequence",
        "study_code", "publication_doi", "study_phenotype"))
    
    return(variants)
}
jeremymcrae/publishedDeNovos documentation built on May 19, 2019, 5:08 a.m.