Nothing
# Create sqlite database with variants (SNPs/indels/SVs)
# in the 8 founder strains of the Collaborative Cross (CC)
######################################################################
# sources:
# SNPs:
# ftp://ftp-mouse.sanger.ac.uk/current_snps/mgp.v5.merged.snps_all.dbSNP142.vcf.gz
# ftp://ftp-mouse.sanger.ac.uk/current_snps/mgp.v5.merged.snps_all.dbSNP142.vcf.gz.tbi
# Indels:
# ftp://ftp-mouse.sanger.ac.uk/current_indels/mgp.v5.merged.indels.dbSNP142.normed.vcf.gz
# ftp://ftp-mouse.sanger.ac.uk/current_indels/mgp.v5.merged.indels.dbSNP142.normed.vcf.gz.tbi
# SVs:
# ftp://ftp-mouse.sanger.ac.uk/current_svs/28strains.REL-1410-SV.sdp.tab.gz
# ftp://ftp-mouse.sanger.ac.uk/current_svs/28strains.REL-1410-SV.sdp.tab.gz.tbi
#
# Same files (except SVs), at JAX:
# SNPs:
# ftp://ftp.jax.org/SNPtools/variants/mgp.v5.merged.snps_all.dbSNP142.vcf.gz
# ftp://ftp.jax.org/SNPtools/variants/mgp.v5.merged.snps_all.dbSNP142.vcf.gz.tbi
# Indels:
# ftp://ftp.jax.org/SNPtools/variants/mgp.v5.merged.indels.dbSNP142.normed.vcf.gz
# ftp://ftp.jax.org/SNPtools/variants/mgp.v5.merged.indels.dbSNP142.normed.vcf.gz.tbi
######################################################################
##############################
### download files
##############################
site <- c("ftp://ftp.jax.org",
"ftp://ftp.jax.org",
"ftp://ftp-mouse.sanger.ac.uk")
subdir <- c("SNPtools/variants",
"SNPtools/variants",
"current_svs")
files <- c("mgp.v5.merged.snps_all.dbSNP142.vcf.gz",
"mgp.v5.merged.indels.dbSNP142.normed.vcf.gz",
"28strains.REL-1410-SV.sdp.tab.gz")
date_source <- c("2015-09-20",
"2015-09-20",
"2014-10-20")
genome_build <- rep("GRCm38/mm10", 3)
for(i in seq_along(files)) {
file <- files[i]
url <- paste0(site[i], "/", subdir[i], "/", file)
tbi_file <- paste0(file, ".tbi")
tbi_url <- paste0(site[i], "/", subdir[i], "/", tbi_file)
if(!file.exists(file)) {
cat(" -Downloading", file, "\n")
download.file(url, file)
}
if(!file.exists(tbi_file)) {
cat(" -Downloading", tbi_file, "\n")
download.file(tbi_url, tbi_file)
}
}
# function for re-formating snp/indel "consequence"
# Consequences: vectors of |-separated valued
# 2nd value is gene ID
# 3rd is transcript ID (ignored)
# 5th is consequence (potentially with multiple separated by &'s)
format_consequence <-
function(csq_record)
{
x <- strsplit(csq_record, "|", fixed=TRUE)
genes <- sapply(x, "[", 2)
csq <- sapply(x, "[", 5)
csq <- unlist(lapply(seq_along(csq),
function(i) {
if(genes[i] == "") {
return(csq[i])
} else {
tmp <- unlist(strsplit(csq[i], "&", fixed=TRUE))
return(paste(genes[i], tmp, sep=":"))
}}))
c(paste(unique(genes), collapse=","),
paste(unique(csq), collapse=","))
}
##############################
### SNPs
##############################
chr <- c(1:19, "X", "Y", "MT")
cc_founders <- c("A/J", "C57BL/6J", "129S1/SvImJ", "NOD/ShiLtJ", "NZO/HlLtJ",
"CAST/EiJ", "PWK/PhJ", "WSB/EiJ")
strains <- sub("/", "_", cc_founders[-2])
n_strains <- length(strains)
library(VariantAnnotation)
library(RSQLite)
db_file <- "cc_variants.sqlite"
db <- dbConnect(SQLite(), dbname=db_file)
dbExecute(db, paste0("ATTACH '", db_file, "' AS NEW"))
cat(" -SNPs\n")
tabfile <- TabixFile(files[1], paste0(files[1], ".tbi"))
db_started <- FALSE
for(thechr in chr) {
for(left in seq(0, 190, by=10)) {
cat(thechr, left, "\n")
# 10 Mbp range
gr <- GRanges(seqnames=thechr, ranges=IRanges(start=left*1e6, end=(left+10)*1e6-1))
# grab data
param <- ScanVcfParam(geno = c("GT", "FI"), samples = strains,
which = gr)
snps <- readVcf(tabfile, genome = "mm10", param = param)
if(nrow(snps)==0) next
# drop snps with any quality < 1
fi <- geno(snps)$FI
snps <- snps[rowSums(!is.na(fi) & fi==1) == n_strains]
# drop snps that are all 0/0
g <- geno(snps)$GT
snps <- snps[rowSums(is.na(g)) == 0 & rowSums(g=="0/0") < n_strains]
g <- geno(snps)$GT
if(nrow(snps)==0) next
# grab genotypes
g <- geno(snps)$GT
# add B6 genotypes (reference) and change column names
g <- cbind(g[,1,drop=FALSE], C57BL_6J="0/0", g[,-1])
colnames(g) <- cc_founders
# alleles
major <- as.character(ref(snps))
minor <- CharacterList(alt(snps))
alleles <- matrix("", nrow=nrow(snps), ncol=4)
alleles[,1] <- major
for(i in 2:4) {
alleles[,i] <- sapply(minor, "[", i-1)
}
# numbers of each genotype
rs <- sapply(c("0/0", "1/1", "2/2", "3/3"),
function(a) rowSums(g==a))
# rows should all sum to 8
stopifnot(all(rowSums(rs)==8))
# find the most common allele and swap with major
for(i in ncol(rs):2) {
# this allele most common?
wh <- (rs[,i] > rs[,1] & rowSums(rs <= rs[,i]) == ncol(rs))
if(any(wh)) {
# swap alleles
tmp <- alleles[wh,i]
alleles[wh,i] <- alleles[wh,1]
alleles[wh,1] <- tmp
# swap genotypes
pat <- paste0(i-1, "/", i-1)
gg <- g[wh,,drop=FALSE]
gg[gg==pat] <- "x/x"
gg[gg=="0/0"] <- pat
gg[gg=="x/x"] <- "0/0"
g[wh,] <- gg
}
}
# version with A/C/G/T
glet <- gnum <- matrix(nrow=nrow(g), ncol=ncol(g))
dimnames(glet) <- dimnames(gnum) <- dimnames(g)
for(i in 1:4) {
pat <- paste0(i-1, "/", i-1)
for(j in 1:ncol(g)) {
wh <- (g[,j] == pat)
glet[wh,j] <- alleles[wh,i]
gnum[wh,j] <- i
}
}
# NAs in alleles when not seen
for(i in 1:4) {
wh <- which(rowSums(gnum==i)==0)
alleles[wh,i] <- NA
}
# alleles as a pattern A|C/G/T
alleles_char <- paste(alleles[,1],
apply(alleles[,-1,drop=FALSE], 1,
function(a) paste(a[!is.na(a)], collapse="/")),
sep="|")
# gnum: turn it into consecutive numbers
# (if "2" is missing make 3 = 2)
for(i in 3:2) {
wh <- is.na(alleles[,i])
if(any(wh)) {
tmp <- gnum[wh,]
tmp[tmp >= i] <- tmp[tmp >= i] - 1
gnum[wh,] <- tmp
}
}
# convert to 1/3
gbin <- gnum
gbin[gbin > 1] <- 3
# first row = gene; 2nd row = consequence
csq <- sapply(info(snps)$CSQ, format_consequence)
# create full table of info
snps <- data.frame(snp_id=rownames(g),
chr=as.vector(seqnames(snps)),
pos=start(snps),
alleles=alleles_char,
sdp=qtl2::calc_sdp(gbin),
ensembl_gene=csq[1,],
consequence=csq[2,],
gnum,
type="snp",
stringsAsFactors=FALSE)
# make sure column names are what we want
colnames(snps)[8:15] <- c(strains[1], "C57BL_6J", strains[-1])
dbWriteTable(db, "variants", snps, row.names=FALSE, overwrite=!db_started,
append=db_started, field.types=NULL)
db_started <- TRUE
}
}
##############################
### indels
##############################
cat(" -InDels\n")
chr <- c(1:19, "X", "Y") # drop MT because not present in indel file
tabfile <- TabixFile(files[2], paste0(files[2], ".tbi"))
for(thechr in chr) {
for(left in seq(0, 190, by=10)) {
cat(thechr, left, "\n")
# 10 Mbp range
gr <- GRanges(seqnames=thechr, ranges=IRanges(start=left*1e6, end=(left+10)*1e6-1))
# grab data
param <- ScanVcfParam(geno = c("GT", "FI"), samples = strains,
which = gr)
indels <- readVcf(tabfile, genome = "mm10", param = param)
if(nrow(indels)==0) next
# drop indels with any quality < 1
fi <- geno(indels)$FI
indels <- indels[rowSums(!is.na(fi) & fi==1) == n_strains]
if(nrow(indels)==0) next
# drop indels that are all 0/0
g <- geno(indels)$GT
indels <- indels[rowSums(is.na(g)) == 0 & rowSums(g=="0/0") < n_strains]
g <- geno(indels)$GT
if(nrow(indels)==0) next
# add B6 genotypes (reference) and change column names
g <- cbind(g[,1,drop=FALSE], C57BL_6J="0/0", g[,-1,drop=FALSE])
colnames(g) <- cc_founders
# alleles
major <- as.character(ref(indels))
minor <- CharacterList(alt(indels))
max_minor <- max(sapply(minor, length))
alleles <- matrix("", nrow=nrow(g), ncol=max_minor+1)
alleles[,1] <- major
for(i in 2:(max_minor+1))
alleles[,i] <- sapply(minor, "[", i-1)
# numbers of each genotype
pat <- paste0(0:max_minor, "/", 0:max_minor)
rs <- sapply(pat, function(a) rowSums(g==a))
if(!is.matrix(rs)) {
rs <- matrix(rs, nrow=1)
dimnames(rs) <- list(rownames(g), pat)
}
# rows should all sum to 8
stopifnot(all(rowSums(rs)==8))
# find the most common allele and swap with major
for(i in ncol(rs):2) {
# this allele most common?
wh <- (rs[,i] > rs[,1] & rowSums(rs <= rs[,i]) == ncol(rs))
if(any(wh)) {
# swap alleles
tmp <- alleles[wh,i]
alleles[wh,i] <- alleles[wh,1]
alleles[wh,1] <- tmp
# swap genotypes
pat <- colnames(rs)[i]
gg <- g[wh,,drop=FALSE]
gg[gg==pat] <- "x/x"
gg[gg=="0/0"] <- pat
gg[gg=="x/x"] <- "0/0"
g[wh,] <- gg
}
}
# version with alleles
glet <- gnum <- matrix(nrow=nrow(g), ncol=ncol(g))
dimnames(glet) <- dimnames(gnum) <- dimnames(g)
for(i in 1:ncol(rs)) {
pat <- paste0(i-1, "/", i-1)
for(j in 1:ncol(g)) {
wh <- (g[,j] == pat)
glet[wh,j] <- alleles[wh,i]
gnum[wh,j] <- i
}
}
# NAs in alleles when not seen
for(i in 1:ncol(alleles)) {
wh <- which(rowSums(gnum==i)==0)
alleles[wh,i] <- NA
}
# alleles as a pattern A|C/G/T
alleles_char <- paste(alleles[,1],
apply(alleles[,-1,drop=FALSE], 1,
function(a) paste(a[!is.na(a)], collapse="/")),
sep="|")
# gnum: turn it into consecutive numbers
# (if "2" is missing make 3 = 2)
if(ncol(alleles) >= 3) {
for(i in 3:2) {
wh <- is.na(alleles[,i])
if(any(wh)) {
tmp <- gnum[wh,]
tmp[tmp >= i] <- tmp[tmp >= i] - 1
gnum[wh,] <- tmp
}
}
}
# convert to 1/3
gbin <- gnum
gbin[gbin > 1] <- 3
# first row = gene; 2nd row = consequence
csq <- sapply(info(indels)$CSQ, format_consequence)
# create full table of info
indels <- data.frame(snp_id=rownames(g),
chr=thechr,
pos=start(indels),
alleles=alleles_char,
sdp=qtl2::calc_sdp(gbin),
ensembl_gene=csq[1,],
consequence=csq[2,],
gnum,
type="indel",
stringsAsFactors=FALSE)
# make sure column names are what we want
colnames(indels)[8:15] <- c(strains[1], "C57BL_6J", strains[-1])
dbWriteTable(db, "variants", indels, row.names=FALSE, overwrite=FALSE,
append=TRUE, field.types=NULL)
}
}
##############################
# structural variants (SVs)
##############################
cat(" -Stuctural variants\n")
tmpfile <- tempfile()
system(paste0("gunzip -c ", files[3], " > ", tmpfile))
svs <- data.table::fread(tmpfile, data.table=FALSE)
unlink(tmpfile)
# pull out genotypes
g <- svs[,colnames(svs) %in% strains]
g <- g[,strains]
# add B6 ref genotype
g <- cbind(g[,1,drop=FALSE], C57BL_6J="0", g[,-1,drop=FALSE], stringsAsFactors=FALSE)
# drop monomorphic ones
n_allele <- apply(g, 1, function(a) length(unique(a)))
svs <- svs[n_allele > 1,]
g <- g[n_allele > 1,]
g[g=="0"] <- "-"
# alleles
alleles <- apply(g, 1, function(a) {
tab <- table(a)
result <- names(sort(tab, decreasing=TRUE))
if(length(result) < 8)
result <- c(result, rep(NA, 8-length(result)))
result })
alleles <- t(alleles)
# numeric version
gnum <- matrix(nrow=nrow(g), ncol=ncol(g))
dimnames(gnum) <- dimnames(g)
for(i in 1:ncol(alleles)) {
for(j in 1:ncol(g)) {
wh <- (!is.na(alleles[,i]) & g[,j] == alleles[,i])
gnum[wh,j] <- i
}
}
# alleles as a pattern A|C/G/T
alleles_char <- paste(alleles[,1],
apply(alleles[,-1,drop=FALSE], 1,
function(a) paste(a[!is.na(a)], collapse="/")),
sep="|")
# convert to 1/3
gbin <- gnum
gbin[gbin > 1] <- 3
# create full table of info
svs <- data.frame(snp_id=paste0("SV_", svs[,"#CHROM"], "_", svs[,"START"], "_", svs[,"END"]),
chr=svs[,"#CHROM"],
pos=round((svs[,"START"] + svs[,"END"])/2), # average of start and end
alleles=alleles_char,
sdp=qtl2::calc_sdp(gbin),
ensembl_gene=NA,
consequence=NA,
gnum,
type="SV",
stringsAsFactors=FALSE)
# make sure column names are what we want
colnames(svs)[8:15] <- c(strains[1], "C57BL_6J", strains[-1])
dbWriteTable(db, "variants", svs, row.names=FALSE, overwrite=FALSE,
append=TRUE, field.types=NULL)
##############################
### Add description table
##############################
description <- data.frame(description=c("SNPs in Collaborative Cross founders",
"Indels in Collaborative Cross founders",
"SVs in Collaborative Cross founders"),
source=c("Mouse Genome Informatics (MGI), Jackson Lab",
"Mouse Genome Informatics (MGI), Jackson Lab",
"Sanger"),
url=paste0(site, "/", subdir, "/", files),
date_created=rep(as.character(Sys.Date()), 3),
date_source=date_source,
genome_build=genome_build,
stringsAsFactors=FALSE)
dbWriteTable(db, "description", description, append=TRUE)
##############################
### add index
##############################
dbExecute(db, "CREATE INDEX chr_pos ON variants(chr, pos)")
dbDisconnect(db)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.