Nothing
## ----echo=FALSE---------------------------------------------------------------------------------------------
options(width=110)
## -----------------------------------------------------------------------------------------------------------
library(SeqArray)
# open a SeqArray file in the package (1000 Genomes Phase1, chromosome 22)
file <- seqOpen(seqExampleFileName("KG_Phase1"))
seqSummary(file)
## -----------------------------------------------------------------------------------------------------------
af <- seqApply(file, "genotype", as.is="double", margin="by.variant",
FUN=function(x) { mean(x==0L, na.rm=TRUE) })
head(af)
## -----------------------------------------------------------------------------------------------------------
library(Rcpp)
cppFunction("
double CalcAlleleFreq(IntegerVector x)
{
int len=x.size(), n=0, n0=0;
for (int i=0; i < len; i++)
{
int g = x[i];
if (g != NA_INTEGER)
{
n++;
if (g == 0) n0++;
}
}
return double(n0) / n;
}")
## -----------------------------------------------------------------------------------------------------------
af <- seqApply(file, "genotype", as.is="double", margin="by.variant", FUN=CalcAlleleFreq)
head(af)
## -----------------------------------------------------------------------------------------------------------
af <- seqApply(file, "genotype", as.is="double",
margin="by.variant", FUN=function(x) { mean(x==0L, na.rm=TRUE) }, parallel=2)
head(af)
## -----------------------------------------------------------------------------------------------------------
# covariance variable with an initial value
s <- 0
seqBlockApply(file, "$dosage", function(x)
{
p <- 0.5 * colMeans(x, na.rm=TRUE) # allele frequencies (a vector)
g <- (t(x) - 2*p) / sqrt(p*(1-p)) # normalized by allele frequency
g[is.na(g)] <- 0 # correct missing values
s <<- s + crossprod(g) # update the cov matrix s in the parent environment
}, margin="by.variant", .progress=TRUE)
# scaled by the number of samples over the trace
s <- s * (nrow(s) / sum(diag(s)))
# eigen-decomposition
eig <- eigen(s, symmetric=TRUE)
## ----fig.width=5, fig.height=5, fig.align='center'----------------------------------------------------------
# figure
plot(eig$vectors[,1], eig$vectors[,2], xlab="PC 1", ylab="PC 2")
## -----------------------------------------------------------------------------------------------------------
# use 2 cores for demonstration
seqParallelSetup(2)
# numbers of distinct alleles per site
table(seqNumAllele(file))
# reference allele frequencies
summary(seqAlleleFreq(file, ref.allele=0L))
# close the cluster environment
seqParallelSetup(FALSE)
## ----message=FALSE------------------------------------------------------------------------------------------
library(VariantAnnotation)
## -----------------------------------------------------------------------------------------------------------
# select a region [10Mb, 30Mb] on chromosome 22
seqSetFilterChrom(file, 22, from.bp=10000000, to.bp=30000000)
vcf <- seqAsVCF(file, chr.prefix="chr")
vcf
## ----echo=FALSE---------------------------------------------------------------------------------------------
unlink("exons.vcf.gz", force=TRUE)
## ----message=FALSE------------------------------------------------------------------------------------------
library(Biobase)
## ----eval=FALSE---------------------------------------------------------------------------------------------
# library(SeqVarTools)
## -----------------------------------------------------------------------------------------------------------
data(KG_P1_SampData)
KG_P1_SampData
head(pData(KG_P1_SampData)) # show KG_P1_SampData
## ----eval=FALSE---------------------------------------------------------------------------------------------
# # link sample data to SeqArray file
# seqData <- SeqVarData(file, sample.data)
#
# # set sample and variant filters
# female <- sampleData(seqData)$sex == "female"
# seqSetFilter(seqData, sample.sel=female)
#
# # run linear regression
# res <- regression(seqData, outcome="phenotype", covar="age")
# head(res)
## -----------------------------------------------------------------------------------------------------------
library(SNPRelate)
set.seed(1000)
# may try different LD thresholds for sensitivity analysis
snpset <- snpgdsLDpruning(file, ld.threshold=0.2, maf=0.01)
names(snpset)
head(snpset$chr22) # variant.id
# get all selected variant id
snpset.id <- unlist(snpset)
## -----------------------------------------------------------------------------------------------------------
# Run PCA
pca <- snpgdsPCA(file, snp.id=snpset.id, num.thread=2)
# variance proportion (%)
pc.percent <- pca$varprop*100
head(round(pc.percent, 2))
## ----fig.width=5, fig.height=5, fig.align='center'----------------------------------------------------------
# plot the first 4 eigenvectors with character=20 and size=0.5
plot(pca, eig=1:4, pch=20, cex=0.5)
## -----------------------------------------------------------------------------------------------------------
pop.code <- factor(seqGetData(file, "sample.annotation/Population"))
head(pop.code)
popgroup <- list(
EastAsia = c("CHB", "JPT", "CHS", "CDX", "KHV", "CHD"),
European = c("CEU", "TSI", "GBR", "FIN", "IBS"),
African = c("ASW", "ACB", "YRI", "LWK", "GWD", "MSL", "ESN"),
SouthAmerica = c("MXL", "PUR", "CLM", "PEL"),
India = c("GIH", "PJL", "BEB", "STU", "ITU"))
colors <- sapply(levels(pop.code), function(x) {
for (i in 1:length(popgroup)) {
if (x %in% popgroup[[i]])
return(names(popgroup)[i])
}
NA
})
colors <- as.factor(colors)
legend.text <- sapply(levels(colors), function(x) paste(levels(pop.code)[colors==x], collapse=","))
legend.text
## ----fig.width=5, fig.height=5, fig.align='center'----------------------------------------------------------
# make a data.frame
tab <- data.frame(sample.id = pca$sample.id,
EV1 = pca$eigenvect[,1], # the first eigenvector
EV2 = pca$eigenvect[,2], # the second eigenvector
Population = pop.code, stringsAsFactors = FALSE)
head(tab)
# draw
plot(pca, pch=20, cex=0.75, main="1KG Phase 1, chromosome 22", col=colors[tab$Population])
legend("topright", legend=legend.text, col=1:length(legend.text), pch=19, cex=0.75)
## -----------------------------------------------------------------------------------------------------------
# YRI samples
sample.id <- seqGetData(file, "sample.id")
CEU.id <- sample.id[pop.code == "CEU"]
## ----fig.width=5, fig.height=5, fig.align='center'----------------------------------------------------------
# Estimate IBD coefficients
ibd <- snpgdsIBDMoM(file, sample.id=CEU.id, snp.id=snpset.id, num.thread=2)
# Make a data.frame
ibd.coeff <- snpgdsIBDSelection(ibd)
head(ibd.coeff)
plot(ibd.coeff$k0, ibd.coeff$k1, xlim=c(0,1), ylim=c(0,1), xlab="k0", ylab="k1", main="CEU samples (MoM)")
lines(c(0,1), c(1,0), col="red", lty=2)
## -----------------------------------------------------------------------------------------------------------
set.seed(1000)
ibs.hc <- snpgdsHCluster(snpgdsIBS(file, snp.id=snpset.id, num.thread=2))
## ----fig.width=10, fig.height=5, fig.align='center'---------------------------------------------------------
# Determine groups of individuals by population information
rv <- snpgdsCutTree(ibs.hc, samp.group=as.factor(colors[pop.code]))
plot(rv$dendrogram, leaflab="none", main="1KG Phase 1, chromosome 22",
edgePar=list(col=rgb(0.5,0.5,0.5,0.75), t.col="black"))
legend("bottomleft", legend=legend.text, col=1:length(legend.text), pch=19, cex=0.75, ncol=4)
## ----fig.width=10, fig.height=5, fig.align='center'---------------------------------------------------------
# sliding windows (window size: 500kb)
sw <- snpgdsSlidingWindow(file, winsize=500000, shift=100000,
FUN="snpgdsFst", as.is="numeric", population=pop.code)
plot(sw$chr22.pos/1000, sw$chr22.val, xlab="genome coordinate (kb)", ylab="population-average Fst",
main="1KG Phase 1, chromosome 22")
abline(h=mean(sw$chr22.val), lty=3, col="red", lwd=2)
## -----------------------------------------------------------------------------------------------------------
seqClose(file)
## -----------------------------------------------------------------------------------------------------------
sessionInfo()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.