inst/scripts/create_ccvariants.R

# Create sqlite database with variants (SNPs/indels/SVs)
# in the 8 founder strains of the Collaborative Cross (CC)
######################################################################
# sources:
#   SNPs:
#     ftp://ftp-mouse.sanger.ac.uk/current_snps/mgp.v5.merged.snps_all.dbSNP142.vcf.gz
#     ftp://ftp-mouse.sanger.ac.uk/current_snps/mgp.v5.merged.snps_all.dbSNP142.vcf.gz.tbi
#   Indels:
#     ftp://ftp-mouse.sanger.ac.uk/current_indels/mgp.v5.merged.indels.dbSNP142.normed.vcf.gz
#     ftp://ftp-mouse.sanger.ac.uk/current_indels/mgp.v5.merged.indels.dbSNP142.normed.vcf.gz.tbi
#   SVs:
#     ftp://ftp-mouse.sanger.ac.uk/current_svs/28strains.REL-1410-SV.sdp.tab.gz
#     ftp://ftp-mouse.sanger.ac.uk/current_svs/28strains.REL-1410-SV.sdp.tab.gz.tbi
#
# Same files (except SVs), at JAX:
#   SNPs:
#     ftp://ftp.jax.org/SNPtools/variants/mgp.v5.merged.snps_all.dbSNP142.vcf.gz
#     ftp://ftp.jax.org/SNPtools/variants/mgp.v5.merged.snps_all.dbSNP142.vcf.gz.tbi
#   Indels:
#     ftp://ftp.jax.org/SNPtools/variants/mgp.v5.merged.indels.dbSNP142.normed.vcf.gz
#     ftp://ftp.jax.org/SNPtools/variants/mgp.v5.merged.indels.dbSNP142.normed.vcf.gz.tbi
######################################################################

##############################
### download files
##############################
site <- c("ftp://ftp.jax.org",
          "ftp://ftp.jax.org",
          "ftp://ftp-mouse.sanger.ac.uk")
subdir <- c("SNPtools/variants",
            "SNPtools/variants",
            "current_svs")
files <- c("mgp.v5.merged.snps_all.dbSNP142.vcf.gz",
           "mgp.v5.merged.indels.dbSNP142.normed.vcf.gz",
           "28strains.REL-1410-SV.sdp.tab.gz")
date_source <- c("2015-09-20",
                 "2015-09-20",
                 "2014-10-20")
genome_build <- rep("GRCm38/mm10", 3)

for(i in seq_along(files)) {
    file <- files[i]
    url <- paste0(site[i], "/", subdir[i], "/", file)
    tbi_file <- paste0(file, ".tbi")
    tbi_url <- paste0(site[i], "/", subdir[i], "/", tbi_file)

    if(!file.exists(file)) {
        cat(" -Downloading", file, "\n")
        download.file(url, file)
    }
    if(!file.exists(tbi_file)) {
        cat(" -Downloading", tbi_file, "\n")
        download.file(tbi_url, tbi_file)
    }
}

# function for re-formating snp/indel "consequence"
# Consequences: vectors of |-separated valued
#    2nd value is gene ID
#    3rd is transcript ID (ignored)
#    5th is consequence (potentially with multiple separated by &'s)
format_consequence <-
    function(csq_record)
{
        x <- strsplit(csq_record, "|", fixed=TRUE)
        genes <- sapply(x, "[", 2)
        csq <- sapply(x, "[", 5)
        csq <- unlist(lapply(seq_along(csq),
                             function(i) {
            if(genes[i] == "") {
                return(csq[i])
            } else {
                tmp <- unlist(strsplit(csq[i], "&", fixed=TRUE))
                return(paste(genes[i], tmp, sep=":"))
            }}))
        c(paste(unique(genes), collapse=","),
          paste(unique(csq), collapse=","))
}





##############################
### SNPs
##############################
chr <- c(1:19, "X", "Y", "MT")
cc_founders <- c("A/J", "C57BL/6J", "129S1/SvImJ", "NOD/ShiLtJ", "NZO/HlLtJ",
                 "CAST/EiJ", "PWK/PhJ", "WSB/EiJ")
strains <- sub("/", "_", cc_founders[-2])
n_strains <- length(strains)

library(VariantAnnotation)

library(RSQLite)
db_file <- "cc_variants.sqlite"
db <- dbConnect(SQLite(), dbname=db_file)

dbExecute(db, paste0("ATTACH '", db_file, "' AS NEW"))

cat(" -SNPs\n")
tabfile <- TabixFile(files[1], paste0(files[1], ".tbi"))

db_started <- FALSE
for(thechr in chr) {
    for(left in seq(0, 190, by=10)) {
        cat(thechr, left, "\n")

        # 10 Mbp range
        gr <- GRanges(seqnames=thechr, ranges=IRanges(start=left*1e6, end=(left+10)*1e6-1))

        # grab data
        param <- ScanVcfParam(geno = c("GT", "FI"), samples = strains,
                              which = gr)
        snps <- readVcf(tabfile, genome = "mm10", param = param)
        if(nrow(snps)==0) next

        # drop snps with any quality < 1
        fi <- geno(snps)$FI
        snps <- snps[rowSums(!is.na(fi) & fi==1) == n_strains]

        # drop snps that are all 0/0
        g <- geno(snps)$GT
        snps <- snps[rowSums(is.na(g)) == 0 & rowSums(g=="0/0") < n_strains]
        g <- geno(snps)$GT
        if(nrow(snps)==0) next

        # grab genotypes
        g <- geno(snps)$GT
        # add B6 genotypes (reference) and change column names
        g <- cbind(g[,1,drop=FALSE], C57BL_6J="0/0", g[,-1])
        colnames(g) <- cc_founders

        # alleles
        major <- as.character(ref(snps))
        minor <- CharacterList(alt(snps))
        alleles <- matrix("", nrow=nrow(snps), ncol=4)
        alleles[,1] <- major
        for(i in 2:4) {
            alleles[,i] <- sapply(minor, "[", i-1)
        }

        # numbers of each genotype
        rs <- sapply(c("0/0", "1/1", "2/2", "3/3"),
                     function(a) rowSums(g==a))

        # rows should all sum to 8
        stopifnot(all(rowSums(rs)==8))

        # find the most common allele and swap with major
        for(i in ncol(rs):2) {
            # this allele most common?
            wh <- (rs[,i] > rs[,1] & rowSums(rs <= rs[,i]) == ncol(rs))

            if(any(wh)) {
                # swap alleles
                tmp <- alleles[wh,i]
                alleles[wh,i] <- alleles[wh,1]
                alleles[wh,1] <- tmp

                # swap genotypes
                pat <- paste0(i-1, "/", i-1)
                gg <- g[wh,,drop=FALSE]
                gg[gg==pat] <- "x/x"
                gg[gg=="0/0"] <- pat
                gg[gg=="x/x"] <- "0/0"
                g[wh,] <- gg
            }
        }

        # version with A/C/G/T
        glet <- gnum <- matrix(nrow=nrow(g), ncol=ncol(g))
        dimnames(glet) <- dimnames(gnum) <- dimnames(g)
        for(i in 1:4) {
            pat <- paste0(i-1, "/", i-1)
            for(j in 1:ncol(g)) {
                wh <- (g[,j] == pat)
                glet[wh,j] <- alleles[wh,i]
                gnum[wh,j] <- i
            }
        }

        # NAs in alleles when not seen
        for(i in 1:4) {
            wh <- which(rowSums(gnum==i)==0)
            alleles[wh,i] <- NA
        }

        # alleles as a pattern A|C/G/T
        alleles_char <- paste(alleles[,1],
                              apply(alleles[,-1,drop=FALSE], 1,
                                    function(a) paste(a[!is.na(a)], collapse="/")),
                              sep="|")

        # gnum: turn it into consecutive numbers
        #   (if "2" is missing make 3 = 2)
        for(i in 3:2) {
            wh <- is.na(alleles[,i])
            if(any(wh)) {
                tmp <- gnum[wh,]
                tmp[tmp >= i] <- tmp[tmp >= i] - 1
                gnum[wh,] <- tmp
            }
        }

        # convert to 1/3
        gbin <- gnum
        gbin[gbin > 1] <- 3

        # first row = gene; 2nd row = consequence
        csq <- sapply(info(snps)$CSQ, format_consequence)

        # create full table of info
        snps <- data.frame(snp_id=rownames(g),
                           chr=as.vector(seqnames(snps)),
                           pos=start(snps),
                           alleles=alleles_char,
                           sdp=qtl2::calc_sdp(gbin),
                           ensembl_gene=csq[1,],
                           consequence=csq[2,],
                           gnum,
                           type="snp",
                           stringsAsFactors=FALSE)
        # make sure column names are what we want
        colnames(snps)[8:15] <- c(strains[1], "C57BL_6J", strains[-1])

        dbWriteTable(db, "variants", snps, row.names=FALSE, overwrite=!db_started,
                     append=db_started, field.types=NULL)
        db_started <- TRUE

    }
}

##############################
### indels
##############################
cat(" -InDels\n")
chr <- c(1:19, "X", "Y") # drop MT because not present in indel file
tabfile <- TabixFile(files[2], paste0(files[2], ".tbi"))

for(thechr in chr) {
    for(left in seq(0, 190, by=10)) {
        cat(thechr, left, "\n")

        # 10 Mbp range
        gr <- GRanges(seqnames=thechr, ranges=IRanges(start=left*1e6, end=(left+10)*1e6-1))

        # grab data
        param <- ScanVcfParam(geno = c("GT", "FI"), samples = strains,
                              which = gr)
        indels <- readVcf(tabfile, genome = "mm10", param = param)
        if(nrow(indels)==0) next

        # drop indels with any quality < 1
        fi <- geno(indels)$FI
        indels <- indels[rowSums(!is.na(fi) & fi==1) == n_strains]
        if(nrow(indels)==0) next

        # drop indels that are all 0/0
        g <- geno(indels)$GT
        indels <- indels[rowSums(is.na(g)) == 0 & rowSums(g=="0/0") < n_strains]
        g <- geno(indels)$GT
        if(nrow(indels)==0) next

        # add B6 genotypes (reference) and change column names
        g <- cbind(g[,1,drop=FALSE], C57BL_6J="0/0", g[,-1,drop=FALSE])
        colnames(g) <- cc_founders

        # alleles
        major <- as.character(ref(indels))
        minor <- CharacterList(alt(indels))
        max_minor <- max(sapply(minor, length))
        alleles <- matrix("", nrow=nrow(g), ncol=max_minor+1)
        alleles[,1] <- major
        for(i in 2:(max_minor+1))
            alleles[,i] <- sapply(minor, "[", i-1)

        # numbers of each genotype
        pat <- paste0(0:max_minor, "/", 0:max_minor)
        rs <- sapply(pat, function(a) rowSums(g==a))
        if(!is.matrix(rs)) {
            rs <- matrix(rs, nrow=1)
            dimnames(rs) <- list(rownames(g), pat)
        }

        # rows should all sum to 8
        stopifnot(all(rowSums(rs)==8))

        # find the most common allele and swap with major
        for(i in ncol(rs):2) {
            # this allele most common?
            wh <- (rs[,i] > rs[,1] & rowSums(rs <= rs[,i]) == ncol(rs))

            if(any(wh)) {
                # swap alleles
                tmp <- alleles[wh,i]
                alleles[wh,i] <- alleles[wh,1]
                alleles[wh,1] <- tmp

                # swap genotypes
                pat <- colnames(rs)[i]
                gg <- g[wh,,drop=FALSE]
                gg[gg==pat] <- "x/x"
                gg[gg=="0/0"] <- pat
                gg[gg=="x/x"] <- "0/0"
                g[wh,] <- gg
            }
        }

        # version with alleles
        glet <- gnum <- matrix(nrow=nrow(g), ncol=ncol(g))
        dimnames(glet) <- dimnames(gnum) <- dimnames(g)
        for(i in 1:ncol(rs)) {
            pat <- paste0(i-1, "/", i-1)
            for(j in 1:ncol(g)) {
                wh <- (g[,j] == pat)
                glet[wh,j] <- alleles[wh,i]
                gnum[wh,j] <- i
            }
        }

        # NAs in alleles when not seen
        for(i in 1:ncol(alleles)) {
            wh <- which(rowSums(gnum==i)==0)
            alleles[wh,i] <- NA
        }

        # alleles as a pattern A|C/G/T
        alleles_char <- paste(alleles[,1],
                              apply(alleles[,-1,drop=FALSE], 1,
                                    function(a) paste(a[!is.na(a)], collapse="/")),
                              sep="|")

        # gnum: turn it into consecutive numbers
        #   (if "2" is missing make 3 = 2)
        if(ncol(alleles) >= 3) {
            for(i in 3:2) {
                wh <- is.na(alleles[,i])
                if(any(wh)) {
                    tmp <- gnum[wh,]
                    tmp[tmp >= i] <- tmp[tmp >= i] - 1
                    gnum[wh,] <- tmp
                }
            }
        }

        # convert to 1/3
        gbin <- gnum
        gbin[gbin > 1] <- 3

        # first row = gene; 2nd row = consequence
        csq <- sapply(info(indels)$CSQ, format_consequence)

        # create full table of info
        indels <- data.frame(snp_id=rownames(g),
                             chr=thechr,
                             pos=start(indels),
                             alleles=alleles_char,
                             sdp=qtl2::calc_sdp(gbin),
                             ensembl_gene=csq[1,],
                             consequence=csq[2,],
                             gnum,
                             type="indel",
                             stringsAsFactors=FALSE)
        # make sure column names are what we want
        colnames(indels)[8:15] <- c(strains[1], "C57BL_6J", strains[-1])

        dbWriteTable(db, "variants", indels, row.names=FALSE, overwrite=FALSE,
                     append=TRUE, field.types=NULL)
    }
}


##############################
# structural variants (SVs)
##############################
cat(" -Stuctural variants\n")
tmpfile <- tempfile()
system(paste0("gunzip -c ", files[3], " > ", tmpfile))
svs <- data.table::fread(tmpfile, data.table=FALSE)
unlink(tmpfile)

# pull out genotypes
g <- svs[,colnames(svs) %in% strains]
g <- g[,strains]

# add B6 ref genotype
g <- cbind(g[,1,drop=FALSE], C57BL_6J="0", g[,-1,drop=FALSE], stringsAsFactors=FALSE)

# drop monomorphic ones
n_allele <- apply(g, 1, function(a) length(unique(a)))
svs <- svs[n_allele > 1,]
g <- g[n_allele > 1,]
g[g=="0"] <- "-"

# alleles
alleles <- apply(g, 1, function(a) {
    tab <- table(a)
    result <- names(sort(tab, decreasing=TRUE))
    if(length(result) < 8)
        result <- c(result, rep(NA, 8-length(result)))
    result })
alleles <- t(alleles)

# numeric version
gnum <- matrix(nrow=nrow(g), ncol=ncol(g))
dimnames(gnum) <- dimnames(g)
for(i in 1:ncol(alleles)) {
    for(j in 1:ncol(g)) {
        wh <- (!is.na(alleles[,i]) & g[,j] == alleles[,i])
        gnum[wh,j] <- i
    }
}

# alleles as a pattern A|C/G/T
alleles_char <- paste(alleles[,1],
                      apply(alleles[,-1,drop=FALSE], 1,
                            function(a) paste(a[!is.na(a)], collapse="/")),
                      sep="|")

# convert to 1/3
gbin <- gnum
gbin[gbin > 1] <- 3

# create full table of info
svs <- data.frame(snp_id=paste0("SV_", svs[,"#CHROM"], "_", svs[,"START"], "_", svs[,"END"]),
                  chr=svs[,"#CHROM"],
                  pos=round((svs[,"START"] + svs[,"END"])/2), # average of start and end
                  alleles=alleles_char,
                  sdp=qtl2::calc_sdp(gbin),
                  ensembl_gene=NA,
                  consequence=NA,
                  gnum,
                  type="SV",
                  stringsAsFactors=FALSE)
# make sure column names are what we want
colnames(svs)[8:15] <- c(strains[1], "C57BL_6J", strains[-1])

dbWriteTable(db, "variants", svs, row.names=FALSE, overwrite=FALSE,
             append=TRUE, field.types=NULL)


##############################
### Add description table
##############################

description <- data.frame(description=c("SNPs in Collaborative Cross founders",
                                        "Indels in Collaborative Cross founders",
                                        "SVs in Collaborative Cross founders"),
                          source=c("Mouse Genome Informatics (MGI), Jackson Lab",
                                   "Mouse Genome Informatics (MGI), Jackson Lab",
                                   "Sanger"),
                          url=paste0(site, "/", subdir, "/", files),
                          date_created=rep(as.character(Sys.Date()), 3),
                          date_source=date_source,
                          genome_build=genome_build,
                          stringsAsFactors=FALSE)
dbWriteTable(db, "description", description, append=TRUE)


##############################
### add index
##############################
dbExecute(db, "CREATE INDEX chr_pos ON variants(chr, pos)")
dbDisconnect(db)

Try the qtl2 package in your browser

Any scripts or data that you put into this service are public.

qtl2 documentation built on April 22, 2023, 1:10 a.m.