inst/doc/DataCleaning.R

### R code from vignette source 'DataCleaning.Rnw'

###################################################
### code chunk number 1: DataCleaning.Rnw:124-153
###################################################
library(GWASTools)
library(GWASdata)
# Load the SNP annotation (simple data frame)
data(illumina_snp_annot)
# Create a SnpAnnotationDataFrame
snpAnnot <- SnpAnnotationDataFrame(illumina_snp_annot)
# names of columns
varLabels(snpAnnot)
# data
head(pData(snpAnnot))
# Add metadata to describe the columns
meta <- varMetadata(snpAnnot)
meta[c("snpID", "chromosome", "position", "rsID", "alleleA", "alleleB",
  "BeadSetID", "IntensityOnly", "tAA", "tAB", "tBB", "rAA", "rAB", "rBB"),
  "labelDescription"] <- c("unique integer ID for SNPs",
  paste("integer code for chromosome: 1:22=autosomes,",
   "23=X, 24=pseudoautosomal, 25=Y, 26=Mitochondrial, 27=Unknown"),
  "base pair position on chromosome (build 36)",
  "RS identifier",
  "alelleA", "alleleB",
  "BeadSet ID from Illumina",
  "1=no genotypes were attempted for this assay",
  "mean theta for AA cluster",
  "mean theta for AB cluster",
  "mean theta for BB cluster",
  "mean R for AA cluster",
  "mean R for AB cluster",
  "mean R for BB cluster")
varMetadata(snpAnnot) <- meta


###################################################
### code chunk number 2: DataCleaning.Rnw:159-168
###################################################
snpID <- snpAnnot$snpID
snpID <- getSnpID(snpAnnot)
chrom <- snpAnnot[["chromosome"]]
chrom <- getChromosome(snpAnnot)
table(chrom)
chrom <- getChromosome(snpAnnot, char=TRUE)
table(chrom)
position <- getPosition(snpAnnot)
rsID <- getVariable(snpAnnot, "rsID")


###################################################
### code chunk number 3: DataCleaning.Rnw:191-201
###################################################
tmp <- snpAnnot[,c("snpID", "chromosome", "position")]
snp <- getAnnotation(tmp)
snp$flag <- sample(c(TRUE, FALSE), nrow(snp), replace=TRUE)
pData(tmp) <- snp
meta <- getMetadata(tmp)
meta["flag", "labelDescription"] <- "flag"
varMetadata(tmp) <- meta
getVariableNames(tmp)
varLabels(tmp)[4] <- "FLAG"
rm(tmp)


###################################################
### code chunk number 4: DataCleaning.Rnw:229-256
###################################################
# Load the scan annotation (simple data frame)
data(illumina_scan_annot)
# Create a ScanAnnotationDataFrame
scanAnnot <- ScanAnnotationDataFrame(illumina_scan_annot)
# names of columns
varLabels(scanAnnot)
# data
head(pData(scanAnnot))
# Add metadata to describe the columns
meta <- varMetadata(scanAnnot)
meta[c("scanID", "subjectID", "family", "father", "mother",
  "CoriellID", "race", "sex", "status", "genoRunID", "plate",
  "batch", "file"), "labelDescription"] <-
   c("unique ID for scans",
  "subject identifier (may have multiple scans)",
  "family identifier",
  "father identifier as subjectID",
  "mother identifier as subjectID",
  "Coriell subject identifier",
  "HapMap population group",
  "sex coded as M=male and F=female",
  "simulated case/control status" ,
  "genotyping instance identifier",
  "plate containing samples processed together for genotyping chemistry",
  "simulated genotyping batch",
  "raw data file")
varMetadata(scanAnnot) <- meta


###################################################
### code chunk number 5: DataCleaning.Rnw:263-268
###################################################
scanID <- scanAnnot$scanID
scanID <- getScanID(scanAnnot)
sex <- scanAnnot[["sex"]]
sex <- getSex(scanAnnot)
subjectID <- getVariable(scanAnnot, "subjectID")


###################################################
### code chunk number 6: DataCleaning.Rnw:336-384
###################################################
# Define a path to the raw data files
path <- system.file("extdata", "illumina_raw_data", package="GWASdata")

geno.file <- "tmp.geno.gds"

# first 3 samples only
scan.annotation <- illumina_scan_annot[1:3, c("scanID", "genoRunID", "file")]
names(scan.annotation)[2] <- "scanName"

snp.annotation <- illumina_snp_annot[,c("snpID", "rsID", "chromosome", "position")]
# indicate which column of SNP annotation is referenced in data files
names(snp.annotation)[2] <-  "snpName"

col.nums <- as.integer(c(1,2,12,13))
names(col.nums) <- c("snp", "sample", "a1", "a2")

diag.geno.file <- "diag.geno.RData"
diag.geno <- createDataFile(path = path,
  filename = geno.file,
  file.type = "gds",
  variables = "genotype",
  snp.annotation = snp.annotation,
  scan.annotation = scan.annotation,
  sep.type = ",",
  skip.num = 11,
  col.total = 21,
  col.nums = col.nums,
  scan.name.in.file = 1,
  diagnostics.filename = diag.geno.file,
  verbose = FALSE)
# Look at the values included in the "diag.geno" object which holds
#   all output from the function call
names(diag.geno)
# `read.file' is a vector indicating whether (1) or not (0) each file
#   specified in the `files' argument was read successfully
table(diag.geno$read.file)
# `row.num' is a vector of the number of rows read from each file
table(diag.geno$row.num)
# `sample.match' is a vector indicating whether (1) or not (0)
#   the sample name inside the raw text file matches that in the
#   sample annotation data.frame
table(diag.geno$sample.match)
# `snp.chk' is a vector indicating whether (1) or not (0)
#   the raw text file has the expected set of SNP names
table(diag.geno$snp.chk)
# `chk' is a vector indicating whether (1) or not (0) all previous
#   checks were successful and the data were written to the data file
table(diag.geno$chk)


###################################################
### code chunk number 7: DataCleaning.Rnw:390-411
###################################################
check.geno.file <- "check.geno.RData"
check.geno <- checkGenotypeFile(path = path,
  filename = geno.file,
  file.type = "gds",
  snp.annotation = snp.annotation,
  scan.annotation = scan.annotation,
  sep.type = ",",
  skip.num = 11,
  col.total = 21,
  col.nums = col.nums,
  scan.name.in.file = 1,
  check.scan.index = 1:3,
  n.scans.loaded = 3,
  diagnostics.filename = check.geno.file,
  verbose = FALSE)
# Look at the values included in the "check.geno" object which holds
#   all output from the function call
names(check.geno)
# 'geno.chk' is a vector indicating whether (1) or not (0) the genotypes
#   match the text file
table(check.geno$geno.chk)


###################################################
### code chunk number 8: DataCleaning.Rnw:421-431
###################################################
(gds <- GdsGenotypeReader(geno.file))
nscan(gds)
nsnp(gds)
head(getScanID(gds))
head(getSnpID(gds))
head(getChromosome(gds))
head(getPosition(gds))
# genotypes for the first 3 samples and  the first 5 SNPs
getGenotype(gds, snp=c(1,5), scan=c(1,3))
close(gds)


###################################################
### code chunk number 9: DataCleaning.Rnw:459-478
###################################################
qxy.file <- "tmp.qxy.gds"

col.nums <- as.integer(c(1,2,5,16,17))
names(col.nums) <- c("snp", "sample", "quality", "X", "Y")

diag.qxy.file <- "diag.qxy.RData"
diag.qxy <- createDataFile(path = path,
  filename = qxy.file,
  file.type = "gds",
  variables = c("quality","X","Y"),
  snp.annotation = snp.annotation,
  scan.annotation = scan.annotation,
  sep.type = ",",
  skip.num = 11,
  col.total = 21,
  col.nums = col.nums,
  scan.name.in.file = 1,
  diagnostics.filename = diag.qxy.file,
  verbose = FALSE)


###################################################
### code chunk number 10: DataCleaning.Rnw:484-499
###################################################
check.qxy.file <- "check.qxy.RData"
check.qxy <- checkIntensityFile(path = path,
  filename = qxy.file,
  file.type = "gds",
  snp.annotation = snp.annotation,
  scan.annotation = scan.annotation,
  sep.type = ",",
  skip.num = 11,
  col.total = 21,
  col.nums = col.nums,
  scan.name.in.file = 1,
  check.scan.index = 1:3,
  n.scans.loaded = 3,
  diagnostics.filename = check.qxy.file,
  verbose = FALSE)


###################################################
### code chunk number 11: DataCleaning.Rnw:508-522
###################################################
(gds <- GdsIntensityReader(qxy.file))
nscan(gds)
nsnp(gds)
head(getScanID(gds))
head(getSnpID(gds))
head(getChromosome(gds))
head(getPosition(gds))
# quality score for the first 3 samples and  the first 5 SNPs
getQuality(gds, snp=c(1,5), scan=c(1,3))
# X intensity for the first 3 samples and  the first 5 SNPs
getX(gds, snp=c(1,5), scan=c(1,3))
# Y intensity for the first 3 samples and  the first 5 SNPs
getY(gds, snp=c(1,5), scan=c(1,3))
close(gds)


###################################################
### code chunk number 12: DataCleaning.Rnw:595-614
###################################################
bl.file <- "tmp.bl.gds"

col.nums <- as.integer(c(1,2,20,21))
names(col.nums) <- c("snp", "sample", "BAlleleFreq", "LogRRatio")

diag.bl.file <- "diag.bl.RData"
diag.bl <- createDataFile(path = path,
  filename = bl.file,
  file.type = "gds",
  variables = c("BAlleleFreq","LogRRatio"),
  snp.annotation = snp.annotation,
  scan.annotation = scan.annotation,
  sep.type = ",",
  skip.num = 11,
  col.total = 21,
  col.nums = col.nums,
  scan.name.in.file = 1,
  diagnostics.filename = diag.bl.file,
  verbose = FALSE)


###################################################
### code chunk number 13: DataCleaning.Rnw:621-625
###################################################
(gds <- GdsIntensityReader(bl.file))
getBAlleleFreq(gds, snp=c(1,5), scan=c(1,3))
getLogRRatio(gds, snp=c(1,5), scan=c(1,3))
close(gds)


###################################################
### code chunk number 14: DataCleaning.Rnw:629-632
###################################################
file.remove(geno.file, qxy.file, bl.file)
file.remove(diag.geno.file, diag.qxy.file, diag.bl.file)
file.remove(check.geno.file, check.qxy.file)


###################################################
### code chunk number 15: DataCleaning.Rnw:648-657
###################################################
genofile <- system.file("extdata", "illumina_geno.gds", package="GWASdata")
gds <- GdsGenotypeReader(genofile)
# only GDS file
genoData <- GenotypeData(gds)
# with scan annotation
genoData <- GenotypeData(gds, scanAnnot=scanAnnot)
# with scan and SNP annotation
genoData <- GenotypeData(gds, snpAnnot=snpAnnot, scanAnnot=scanAnnot)
genoData


###################################################
### code chunk number 16: DataCleaning.Rnw:664-683
###################################################
nsnp(genoData)
nscan(genoData)
# scan annotation
range(getScanID(genoData))
hasSex(genoData)
table(getSex(genoData))
hasScanVariable(genoData, "subjectID")
head(getScanVariable(genoData, "subjectID"))
getScanVariableNames(genoData)
# snp annotation
range(getSnpID(genoData))
table(getChromosome(genoData, char=TRUE))
head(getPosition(genoData))
hasSnpVariable(genoData, "rsID")
head(getSnpVariable(genoData, "rsID"))
getSnpVariableNames(genoData)
# genotypes
getGenotype(genoData, snp=c(1,5), scan=c(1,5))
close(genoData)


###################################################
### code chunk number 17: DataCleaning.Rnw:690-707
###################################################
# quality score, X and X intensity
qxyfile <- system.file("extdata", "illumina_qxy.gds", package="GWASdata")
gds <- GdsIntensityReader(qxyfile)
qxyData <- IntensityData(gds, snpAnnot=snpAnnot, scanAnnot=scanAnnot)
qxyData
getQuality(qxyData, snp=c(1,5), scan=c(1,5))
getX(qxyData, snp=c(1,5), scan=c(1,5))
getY(qxyData, snp=c(1,5), scan=c(1,5))
close(qxyData)
# BAF/LRR
blfile <- system.file("extdata", "illumina_bl.gds", package="GWASdata")
gds <- GdsIntensityReader(blfile)
blData <- IntensityData(gds, snpAnnot=snpAnnot, scanAnnot=scanAnnot)
blData
getBAlleleFreq(blData, snp=c(1,5), scan=c(1,5))
getLogRRatio(blData, snp=c(1,5), scan=c(1,5))
close(blData)


###################################################
### code chunk number 18: DataCleaning.Rnw:754-767
###################################################
# open the GDS file and create a GenotypeData object
gdsfile <- system.file("extdata", "illumina_geno.gds", package="GWASdata")
gds <- GdsGenotypeReader(gdsfile)
# sex is required for this function, so we need the scan annotation
genoData <-  GenotypeData(gds, scanAnnot=scanAnnot)
# Calculate the number of missing calls for each snp over all samples
#     for each sex separately
miss <- missingGenotypeBySnpSex(genoData)
# Examine the results
names(miss)
head(miss$missing.counts)
miss$scans.per.sex
head(miss$missing.fraction)


###################################################
### code chunk number 19: DataCleaning.Rnw:776-783
###################################################
# Make sure ordering matches snp annotation
allequal(snpAnnot$snpID, as.numeric(names(miss$missing.fraction)))
snpAnnot$missing.n1 <- miss$missing.fraction
varMetadata(snpAnnot)["missing.n1", "labelDescription"] <- paste(
  "fraction of genotype calls missing over all samples",
  "except that females are excluded for Y chr SNPs")
summary(snpAnnot$missing.n1)


###################################################
### code chunk number 20: DataCleaning.Rnw:790-793
###################################################
hist(snpAnnot$missing.n1, ylim=c(0,100),
     xlab="SNP missing call rate",
     main="Missing Call Rate for All Probes")


###################################################
### code chunk number 21: DataCleaning.Rnw:796-807
###################################################
# Find the number of SNPs with every call missing
length(snpAnnot$missing.n1[snpAnnot$missing.n1 == 1])
# Fraction of autosomal SNPs with missing call rate < 0.05
x <- snpAnnot$missing.n1[snpAnnot$chromosome < 23]
length(x[x < 0.05]) / length(x)
# Fraction of X chromosome SNPs with missing call rate < 0.05
x <- snpAnnot$missing.n1[snpAnnot$chromosome == 23]
length(x[x < 0.05]) / length(x)
# Fraction of Y chromosome SNPs with missing call rate < 0.05
x <- snpAnnot$missing.n1[snpAnnot$chromosome == 25]
length(x[x < 0.05]) / length(x)


###################################################
### code chunk number 22: DataCleaning.Rnw:825-839
###################################################
# Want to exclude all SNP probes with 100% missing call rate
# Check on how many SNPs to exclude
sum(snpAnnot$missing.n1 == 1)
# Create a variable that contains the IDs of these SNPs to exclude
snpexcl <- snpAnnot$snpID[snpAnnot$missing.n1 == 1]
length(snpexcl)
# Calculate the missing call rate per sample
miss <- missingGenotypeByScanChrom(genoData, snp.exclude=snpexcl)
names(miss)
head(miss$missing.counts)
head(miss$snps.per.chr)
# Check to make sure that the correct number of SNPs were excluded
sum(miss$snps.per.chr)
nrow(snpAnnot) - sum(miss$snps.per.chr)


###################################################
### code chunk number 23: DataCleaning.Rnw:846-855
###################################################
head(miss$missing.fraction)
# Check the ordering matches the sample annotation file
allequal(names(miss$missing.fraction), scanAnnot$scanID)
# Add the missing call rates vector to the sample annotation file
scanAnnot$missing.e1 <- miss$missing.fraction
varMetadata(scanAnnot)["missing.e1", "labelDescription"] <- paste(
  "fraction of genotype calls missing over all snps with missing.n1<1",
  "except that Y chr SNPs are excluded for females")
summary(scanAnnot$missing.e1)


###################################################
### code chunk number 24: DataCleaning.Rnw:867-870
###################################################
hist(scanAnnot$missing.e1,
     xlab="Fraction of missing calls over all probes",
     main="Histogram of Sample Missing Call Rate for all Samples")


###################################################
### code chunk number 25: DataCleaning.Rnw:873-879
###################################################
# Look at missing.e1 for males
summary(scanAnnot$missing.e1[scanAnnot$sex == "M"])
# Look at missing.e1 for females
summary(scanAnnot$missing.e1[scanAnnot$sex == "F"])
# Number of samples with missing call rate > 5%
sum(scanAnnot$missing.e1 > 0.05)


###################################################
### code chunk number 26: DataCleaning.Rnw:889-911
###################################################
auto <- colnames(miss$missing.counts) %in% 1:22
missa <- rowSums(miss$missing.counts[,auto]) / sum(miss$snps.per.chr[auto])
summary(missa)
missx <- miss$missing.counts[,"X"] / miss$snps.per.chr["X"]
summary(missx)
# check they match sample annotation file
allequal(names(missa), scanAnnot$scanID)
allequal(names(missx), scanAnnot$scanID)
# Add these separate sample missing call rates to the sample
# annotation
scanAnnot$miss.e1.auto <- missa
scanAnnot$miss.e1.xchr <- missx
# Order scanAnnot by missing.e1 so duplicate subjectIDs
# with a higher missing rate are marked as duplicates
scanAnnot <- scanAnnot[order(scanAnnot$subjectID, scanAnnot$missing.e1),]
scanAnnot$duplicated <- duplicated(scanAnnot$subjectID)
table(scanAnnot$duplicated, useNA="ifany")
# Put scanAnnot back in scanID order; this is very important!!
scanAnnot <- scanAnnot[order(scanAnnot$scanID),]
allequal(scanAnnot$scanID, sort(scanAnnot$scanID))
varMetadata(scanAnnot)["duplicated", "labelDescription"] <-
  "TRUE for duplicate scan with higher missing.e1"


###################################################
### code chunk number 27: DataCleaning.Rnw:928-938
###################################################
# Find the samples with missing.e1 > .05 and make a vector of
# scanID to exclude from the calculation
scan.exclude <- scanAnnot$scanID[scanAnnot$missing.e1 > 0.05]
# Call missingGenotypeBySnpSex and save the output
miss <- missingGenotypeBySnpSex(genoData, scan.exclude=scan.exclude)
snpAnnot$missing.n2 <- miss$missing.fraction
varMetadata(snpAnnot)["missing.n2", "labelDescription"] <- paste(
  "fraction of genotype calls missing over all samples with missing.e1<0.05",
  "except that females are excluded for Y chr SNPs")
summary(snpAnnot$missing.n2)


###################################################
### code chunk number 28: DataCleaning.Rnw:948-958
###################################################
# Create a vector of the SNPs to exclude.
snpexcl <- snpAnnot$snpID[snpAnnot$missing.n2 >= 0.05]
length(snpexcl)
miss <- missingGenotypeByScanChrom(genoData, snp.exclude=snpexcl)
# Add the missing call rates vector to the sample annotation file
scanAnnot$missing.e2 <- miss$missing.fraction
varMetadata(scanAnnot)["missing.e2", "labelDescription"] <- paste(
  "fraction of genotype calls missing over all snps with missing.n2<0.05",
  "except that Y chr SNPs are excluded for females")
summary(scanAnnot$missing.e2)


###################################################
### code chunk number 29: DataCleaning.Rnw:964-967
###################################################
hist(scanAnnot$missing.e2, xlab="Fraction of missing calls over all probes
     with missing call rate < 0.05",
     main="Histogram of Sample Missing Call Rate for all Samples")


###################################################
### code chunk number 30: DataCleaning.Rnw:980-984
###################################################
varLabels(scanAnnot)
# Check how many batches exist and how many samples are in each batch
length(unique(scanAnnot$batch))
table(scanAnnot$batch, useNA="ifany")


###################################################
### code chunk number 31: DataCleaning.Rnw:987-991
###################################################
# Plot the distribution of the number of samples per batch.
barplot(table(scanAnnot$batch),
        ylab="Number of Samples", xlab="Batch",
        main="Distribution of Samples per Batch")


###################################################
### code chunk number 32: DataCleaning.Rnw:994-1003
###################################################
# Examine the mean missing call rate per batch for all SNPs
batches <- unique(scanAnnot$batch)
bmiss <- rep(NA,length(batches)); names(bmiss) <- batches
bn <- rep(NA,length(batches)); names(bn) <- batches
for(i in 1:length(batches)) {
  x <- scanAnnot$missing.e1[is.element(scanAnnot$batch, batches[i])]
  bmiss[i] <- mean(x)
  bn[i] <- length(x)
}


###################################################
### code chunk number 33: DataCleaning.Rnw:1014-1016
###################################################
y <- lm(bmiss ~ bn)
anova(y)


###################################################
### code chunk number 34: DataCleaning.Rnw:1019-1023
###################################################
plot(bn, bmiss,
 xlab="Number of samples per batch", ylab="Mean missing call rate",
 main="Mean Missing Call Rate vs\nSamples per Batch")
abline(y$coefficients)


###################################################
### code chunk number 35: DataCleaning.Rnw:1044-1052
###################################################
res <- batchChisqTest(genoData, batchVar="batch", return.by.snp=TRUE)
close(genoData)
# chi-square values for each SNP
dim(res$chisq)
# genomic inflation factor
res$lambda
# average chi-square test statistic for each of the batches
res$mean.chisq


###################################################
### code chunk number 36: DataCleaning.Rnw:1057-1075
###################################################
x <- table(scanAnnot$race, useNA="ifany")
x
x[1] / sum(x)
x[2] / sum(x)
x <- table(scanAnnot$race, scanAnnot$batch)
x
# Run an approximate chi-square test to see if there are ethnic effects
chisq <- chisq.test(x)
chisq$p.value
# Calculate the fraction of samples in each batch that are CEU
batches <- unique(scanAnnot$batch)
eth <- rep(NA,length(batches)); names(eth) <- sort(batches)
for(i in 1:length(batches)){
  x <- scanAnnot$race[is.element(scanAnnot$batch, batches[i])]
  xl <- length(x[x == "CEU"])
  eth[i] <- xl / length(x)
}
allequal(names(eth), names(res$mean.chisq))


###################################################
### code chunk number 37: DataCleaning.Rnw:1078-1085
###################################################
# Plot the average Chi-Square test statistic against the
#     fraction of samples that are CEU
plot(eth, res$mean.chisq, xlab="Fraction of CEU Samples per Batch",
  ylab="Average Chi-square Test Statistic",
  main="Fraction of CEU Samples per Batch
  vs Average Chi-square Test Statistic")
abline(v=mean(eth), lty=2, col="red")


###################################################
### code chunk number 38: DataCleaning.Rnw:1116-1124
###################################################
qxyfile <- system.file("extdata", "illumina_qxy.gds", package="GWASdata")
qualGDS <- GdsIntensityReader(qxyfile)
qualData <- IntensityData(qualGDS, scanAnnot=scanAnnot)
genofile <- system.file("extdata", "illumina_geno.gds", package="GWASdata")
genoGDS <- GdsGenotypeReader(genofile)
genoData <- GenotypeData(genoGDS, scanAnnot=scanAnnot)
qual.results <- qualityScoreByScan(qualData, genoData)
close(qualData)


###################################################
### code chunk number 39: DataCleaning.Rnw:1131-1133
###################################################
hist(qual.results[,"median.quality"], main="Median Genotype Quality Scores
  of Samples", xlab="Median Quality")


###################################################
### code chunk number 40: DataCleaning.Rnw:1164-1171
###################################################
blfile <- system.file("extdata", "illumina_bl.gds", package="GWASdata")
blGDS <- GdsIntensityReader(blfile)
blData <- IntensityData(blGDS, scanAnnot=scanAnnot)
nbins <- rep(12, 3)
slidingBAF12 <- sdByScanChromWindow(blData, genoData, nbins=nbins)
names(slidingBAF12)
dim(slidingBAF12[["21"]])


###################################################
### code chunk number 41: DataCleaning.Rnw:1179-1182
###################################################
sds.chr <- meanSdByChromWindow(slidingBAF12, scanAnnot$sex)
sds.chr[["21"]]
sds.chr[["X"]]


###################################################
### code chunk number 42: DataCleaning.Rnw:1188-1192
###################################################
res12bin4sd <- findBAFvariance(sds.chr, slidingBAF12, scanAnnot$sex,
                               sd.threshold=4)
head(res12bin4sd)
table(res12bin4sd[, "chromosome"])


###################################################
### code chunk number 43: DataCleaning.Rnw:1199-1205
###################################################
scanID <- as.integer(res12bin4sd[, "scanID"])
chrom <- as.integer(res12bin4sd[, "chromosome"])
chrom[res12bin4sd[, "chromosome"] == "X"] <- 23
bincode <- paste("Bin", res12bin4sd[, "bin"], sep = " ")
chromIntensityPlot(blData, scanID, chrom, info=bincode, ideogram=FALSE)
close(blData)


###################################################
### code chunk number 44: DataCleaning.Rnw:1231-1235
###################################################
miss <- missingGenotypeByScanChrom(genoData)
miss.rate <- t(apply(miss$missing.counts, 1, function(x) {
  x / miss$snps.per.chr}))
miss.rate <- as.data.frame(miss.rate)


###################################################
### code chunk number 45: DataCleaning.Rnw:1239-1242
###################################################
cols <- names(miss.rate) %in% c(1:22, "X", "XY")
boxplot(miss.rate[,cols], main="Missingness by Chromosome",
  ylab="Proportion Missing", xlab="Chromosome")


###################################################
### code chunk number 46: DataCleaning.Rnw:1245-1248
###################################################
boxplot(miss.rate$X ~ scanAnnot$sex,
  main="X Chromosome Missingness by Sex",
  ylab="Proportion Missing")


###################################################
### code chunk number 47: DataCleaning.Rnw:1255-1267
###################################################
# Calculate heterozygosity by scan by chromosome
het.results <- hetByScanChrom(genoData)
close(genoData)
# Ensure heterozygosity results are ordered correctly
allequal(scanAnnot$scanID, rownames(het.results))
# Write autosomal and X chr heterozygosity to sample annot
scanAnnot$het.A <- het.results[,"A"]
scanAnnot$het.X <- het.results[,"X"]
varMetadata(scanAnnot)["het.A", "labelDescription"] <-
  "fraction of heterozygotes for autosomal SNPs"
varMetadata(scanAnnot)["het.X", "labelDescription"] <-
  "fraction of heterozygotes for X chromosome SNPs"


###################################################
### code chunk number 48: DataCleaning.Rnw:1275-1277
###################################################
boxplot(scanAnnot$het.A ~ scanAnnot$race,
  main="Autosomal Heterozygosity")


###################################################
### code chunk number 49: DataCleaning.Rnw:1280-1283
###################################################
female <- scanAnnot$sex == "F"
boxplot(scanAnnot$het.X[female] ~ scanAnnot$race[female],
  main="X Chromosome Heterozygosity in Females")


###################################################
### code chunk number 50: DataCleaning.Rnw:1320-1325
###################################################
qxyfile <- system.file("extdata", "illumina_qxy.gds", package="GWASdata")
intenGDS <- GdsIntensityReader(qxyfile)
inten.by.chrom <- meanIntensityByScanChrom(intenGDS)
close(intenGDS)
names(inten.by.chrom)


###################################################
### code chunk number 51: DataCleaning.Rnw:1333-1343
###################################################
mninten <- inten.by.chrom[[1]]  # mean intensities
dim(mninten)
# Check to be sure sample ordering is consistent
allequal(scanAnnot$scanID, rownames(mninten))
# Assign each sex a color
xcol <- rep(NA, nrow(scanAnnot))
xcol[scanAnnot$sex == "M"] <- "blue"
xcol[scanAnnot$sex == "F"] <- "red"
nx <- sum(snpAnnot$chromosome == 23)
ny <- sum(snpAnnot$chromosome == 25)


###################################################
### code chunk number 52: DataCleaning.Rnw:1353-1385
###################################################
#All intensities
x1 <-mninten[,"X"]; y1 <- mninten[,"Y"]
main1 <- "Mean X vs \nMean Y Chromosome Intensity"
#Het on X vs X intensity
x2 <- mninten[,"X"]; y2 <- scanAnnot$het.X
main2 <- "Mean X Chromosome Intensity vs
  Mean X Chromosome Heterozygosity"
# Het on X vs Y intensity
y3 <- mninten[,"Y"]; x3 <- scanAnnot$het.X
main3 <- "Mean X Chromosome Heterozygosity vs
  Mean Y Chromosome Intensity"
# X vs A het
x4 <- scanAnnot$het.A[scanAnnot$sex == "F"]
y4 <- scanAnnot$het.X[scanAnnot$sex == "F"]
main4 <- "Mean Autosomal Heterozygosity vs
  Mean X Chromosome Heterozygosity"
cols <- c("blue","red")
mf <- c("male", "female")
xintenlab <- paste("X intensity (n=", nx, ")", sep="")
yintenlab <- paste("Y intensity (n=", ny, ")", sep="")
pdf("DataCleaning-sex.pdf")
par(mfrow=c(2,2))
plot(x1, y1, xlab=xintenlab, ylab=yintenlab,
  main=main1, col=xcol, cex.main=0.8)
legend("topright",mf,col=cols,pch=c(1,1))
plot(x2, y2, col=xcol, xlab=xintenlab,
  ylab="X heterozygosity", main=main2, cex.main=0.8)
plot(x3, y3, col=xcol, ylab=yintenlab,
  xlab="X heterozygosity", main=main3, cex.main=0.8)
plot(x4,y4, col="red", xlab="Autosomal heterozygosity",
  ylab="X heterozygosity", main=main4, cex.main=0.8)
dev.off()


###################################################
### code chunk number 53: DataCleaning.Rnw:1412-1420
###################################################
library(SNPRelate)
gdsfile <- system.file("extdata", "illumina_geno.gds", package="GWASdata")
gdsobj <- snpgdsOpen(gdsfile)
ibdobj <- snpgdsIBDKING(gdsobj)
snpgdsClose(gdsobj)
names(ibdobj)
dim(ibdobj$kinship)
ibdobj$kinship[1:5,1:5]


###################################################
### code chunk number 54: DataCleaning.Rnw:1431-1436
###################################################
ped <- pData(scanAnnot)[,c("family", "subjectID", "father", "mother", "sex")]
dim(ped)
names(ped) <- c("family", "individ", "father", "mother", "sex")
ped[1:5, ]
(chk <- pedigreeCheck(ped))


###################################################
### code chunk number 55: DataCleaning.Rnw:1442-1445
###################################################
dups <- chk$duplicates
uni.ped <- pedigreeDeleteDuplicates(ped, dups)
(chk <- pedigreeCheck(uni.ped))


###################################################
### code chunk number 56: DataCleaning.Rnw:1450-1456
###################################################
ni <- chk$parent.no.individ.entry
parent <- data.frame(family=ni$family, individ=ni$parentID,
                     father=0, mother=0, sex="F",
                     stringsAsFactors=FALSE)
ped.complete <- rbind(uni.ped, parent)
(chk <- pedigreeCheck(ped.complete))


###################################################
### code chunk number 57: DataCleaning.Rnw:1464-1472
###################################################
ped.complete <- ped.complete[ped.complete$family != 58,]
subf <- chk$subfamilies.ident
table(subf$family)
subf.ids <- subf$individ[subf$subfamily == 2]
newfam <- ped.complete$individ %in% subf.ids
ped.complete$family[newfam] <- paste0(ped.complete$family[newfam], "-2")
table(ped.complete$family)
pedigreeCheck(ped.complete)


###################################################
### code chunk number 58: DataCleaning.Rnw:1482-1487
###################################################
rels <- pedigreePairwiseRelatedness(ped.complete)
length(rels$inbred.fam)
relprs <- rels$relativeprs
relprs[1:5,]
table(relprs$relation)


###################################################
### code chunk number 59: DataCleaning.Rnw:1497-1515
###################################################
samp <- pData(scanAnnot)[,c("scanID", "subjectID")]
samp <- samp[match(ibdobj$sample.id, samp$scanID),]
names(samp) <- c("scanID", "Individ")
ibd <- snpgdsIBDSelection(ibdobj, kinship.cutoff=1/32)
ibd <- merge(ibd, samp, by.x="ID1", by.y="scanID")
ibd <- merge(ibd, samp, by.x="ID2", by.y="scanID", suffixes=c("1","2"))
ibd$ii <- pasteSorted(ibd$Individ1, ibd$Individ2)

relprs$ii <- pasteSorted(relprs$Individ1, relprs$Individ2)
ibd <- merge(ibd, relprs[,c("ii","relation")], all.x=TRUE)
names(ibd)[names(ibd) == "relation"] <- "exp.rel"
ibd$exp.rel[ibd$Individ1 == ibd$Individ2] <- "Dup"
ibd$exp.rel[is.na(ibd$exp.rel)] <- "U"
table(ibd$exp.rel, useNA="ifany")

# assign observed relationships
ibd$obs.rel <- ibdAssignRelatednessKing(ibd$IBS0, ibd$kinship)
table(ibd$obs.rel, useNA="ifany")


###################################################
### code chunk number 60: DataCleaning.Rnw:1521-1533
###################################################
## thresholds for assigning relationships using kinship coefficients
## in table 1 of Manichaikul (2010)
cut.dup <- 1/(2^(3/2))
cut.deg1 <- 1/(2^(5/2))
cut.deg2 <- 1/(2^(7/2))
cut.deg3 <- 1/(2^(9/2))
cols <- c(Dup="magenta", PO="cyan", U="black")

plot(ibd$IBS0, ibd$kinship, col=cols[ibd$exp.rel],
     xlab="Fraction of IBS=0", ylab="Kinship coefficient")
abline(h=c(cut.deg1, cut.deg2, cut.deg3, cut.dup), lty=2, col="gray")
legend("topright", legend=names(cols), col=cols, pch=1)


###################################################
### code chunk number 61: DataCleaning.Rnw:1562-1585
###################################################
filt <- get(data(pcaSnpFilters.hg18))
chrom <- getChromosome(snpAnnot)
pos <- getPosition(snpAnnot)
snpID <- getSnpID(snpAnnot)
snp.filt <- rep(TRUE, length(snpID))
for (f in 1:nrow(filt)) {
  snp.filt[chrom == filt$chrom[f] & filt$start.base[f] < pos
           & pos < filt$end.base[f]] <- FALSE
}
snp.sel <- snpID[snp.filt]
length(snp.sel)

sample.sel <- scanAnnot$scanID[!scanAnnot$duplicated]
length(sample.sel)

gdsobj <- snpgdsOpen(gdsfile)
snpset <- snpgdsLDpruning(gdsobj, sample.id=sample.sel, snp.id=snp.sel,
                          autosome.only=TRUE, maf=0.05, missing.rate=0.05,
                          method="corr", slide.max.bp=10e6,
                          ld.threshold=sqrt(0.1))

snp.pruned <- unlist(snpset, use.names=FALSE)
length(snp.pruned)


###################################################
### code chunk number 62: DataCleaning.Rnw:1591-1595
###################################################
pca <- snpgdsPCA(gdsobj, sample.id=sample.sel, snp.id=snp.pruned)
names(pca)
length(pca$eigenval)
dim(pca$eigenvect)


###################################################
### code chunk number 63: DataCleaning.Rnw:1601-1609
###################################################
# Calculate the percentage of variance explained
# by each principal component.
pc.frac <- pca$eigenval/sum(pca$eigenval)
lbls <- paste("EV", 1:4, "\n", format(pc.frac[1:4], digits=2), sep="")
samp <- pData(scanAnnot)[match(pca$sample.id, scanAnnot$scanID),]
cols <- rep(NA, nrow(samp))
cols[samp$race == "CEU"] <- "blue"
cols[samp$race == "YRI"] <- "red"


###################################################
### code chunk number 64: DataCleaning.Rnw:1612-1614
###################################################
pairs(pca$eigenvect[,1:4], col=cols, labels=lbls,
  main = "CEU: blue, YRI: red")


###################################################
### code chunk number 65: DataCleaning.Rnw:1625-1638
###################################################
par.coord <- pca$eigenvect
rangel <- apply(par.coord, 2, function(x) range(x)[1])
rangeh <- apply(par.coord, 2, function(x) range(x)[2])
std.coord <- par.coord
for (i in 1:14)
  std.coord[,i] <- (par.coord[,i] - rangel[i])/(rangeh[i]-rangel[i])
plot(c(0,15), c(0,1), type = 'n', axes = FALSE, ylab = "", xlab = "",
 main = "Parallel Coordinates Plot
 CEU: blue, YRI: red")
for (j in 1:13)
  for (i in sample(1:nrow(std.coord)) )
    lines(c(j,j+1), std.coord[i,c(j,j+1)], col=cols[i], lwd=0.25)
axis(1, at = 1:14, labels = paste("PC",1:14, sep = "."))


###################################################
### code chunk number 66: DataCleaning.Rnw:1648-1659
###################################################
corr <- snpgdsPCACorr(pca, gdsobj, eig.which=1:4)
snpgdsClose(gdsobj)
snp <- snpAnnot[match(corr$snp.id, snpID),]
chrom <- getChromosome(snp, char=TRUE)
pdf("DataCleaning-corr.pdf")
par(mfrow=c(4,1))
for (i in 1:4) {
  snpCorrelationPlot(abs(corr$snpcorr[i,]), chrom,
                     main=paste("Eigenvector",i), ylim=c(0,1))
}
dev.off()


###################################################
### code chunk number 67: DataCleaning.Rnw:1680-1685
###################################################
princomp <- as.data.frame(pca$eigenvect)
samples.nodup <- pData(scanAnnot)[!scanAnnot$duplicated,]
princomp$scanID <- as.factor(samples.nodup$scanID)
princomp$case.ctrl.status <- as.factor(samples.nodup$status)
princomp$race <- as.factor(samples.nodup$race)


###################################################
### code chunk number 68: DataCleaning.Rnw:1691-1698
###################################################
pc.percent <- 100 * pca$eigenval[1:32]/sum(pca$eigenval)
pc.percent
lbls <- paste("EV", 1:3, "\n", format(pc.percent[1:3], digits=2), "%", sep="")
table(samples.nodup$status)
cols <- rep(NA, nrow(samples.nodup))
cols[samples.nodup$status == 1] <- "green"
cols[samples.nodup$status == 0] <- "magenta"


###################################################
### code chunk number 69: DataCleaning.Rnw:1708-1710
###################################################
pairs(pca$eigenvect[,1:3], col=cols, labels=lbls,
  main = "First Three EVs by Case-Control Status")


###################################################
### code chunk number 70: DataCleaning.Rnw:1713-1715
###################################################
boxplot(princomp[, 1] ~ princomp$case.ctrl.status,
  ylab = "PC1", main = "PC1 vs. Case-control Status")


###################################################
### code chunk number 71: DataCleaning.Rnw:1718-1720
###################################################
boxplot(princomp[, 2] ~ princomp$case.ctrl.status,
  ylab = "PC2", main = "PC2 vs. Case-control Status")


###################################################
### code chunk number 72: DataCleaning.Rnw:1723-1725
###################################################
boxplot(princomp[, 3] ~ princomp$case.ctrl.status,
  ylab = "PC3", main = "PC3 vs. Case-control Status")


###################################################
### code chunk number 73: DataCleaning.Rnw:1728-1737
###################################################
aov.p1 <- aov(princomp[,1] ~ princomp$race *
  princomp$case.ctrl.status, princomp)
summary(aov.p1)
aov.p2 <- aov(princomp[,2] ~ princomp$race *
  princomp$case.ctrl.status, princomp)
summary(aov.p2)
aov.p3 <- aov(princomp[,3] ~ princomp$race *
  princomp$case.ctrl.status, princomp)
summary(aov.p3)


###################################################
### code chunk number 74: DataCleaning.Rnw:1757-1759
###################################################
lm.all <- lm(scanAnnot$missing.e1 ~ scanAnnot$status)
summary(aov(lm.all))


###################################################
### code chunk number 75: DataCleaning.Rnw:1762-1764
###################################################
boxplot(scanAnnot$missing.e1 ~ scanAnnot$status, ylab =
  "Mean missing call rate", main="Mean missing call rate by case status")


###################################################
### code chunk number 76: DataCleaning.Rnw:1780-1787
###################################################
blfile <- system.file("extdata", "illumina_bl.gds", package="GWASdata")
blgds <- GdsIntensityReader(blfile)
blData <-  IntensityData(blgds, snpAnnot=snpAnnot, scanAnnot=scanAnnot)

genofile <- system.file("extdata", "illumina_geno.gds", package="GWASdata")
genogds <- GdsGenotypeReader(genofile)
genoData <-  GenotypeData(genogds, snpAnnot=snpAnnot, scanAnnot=scanAnnot)


###################################################
### code chunk number 77: DataCleaning.Rnw:1792-1795
###################################################
baf.sd <- sdByScanChromWindow(blData, genoData, var="BAlleleFreq")
med.baf.sd <- medianSdOverAutosomes(baf.sd)
low.qual.ids <- med.baf.sd$scanID[med.baf.sd$med.sd > 0.05]


###################################################
### code chunk number 78: DataCleaning.Rnw:1800-1817
###################################################
chrom <- getChromosome(snpAnnot, char=TRUE)
pos <- getPosition(snpAnnot)
hla.df <- get(data(HLA.hg18))
hla <- chrom == "6" & pos >= hla.df$start.base & pos <= hla.df$end.base
xtr.df <- get(data(pseudoautosomal.hg18))
xtr <- chrom == "X" & pos >= xtr.df["X.XTR", "start.base"] &
  pos <= xtr.df["X.XTR", "end.base"]
centromeres <- get(data(centromeres.hg18))
gap <- rep(FALSE, nrow(snpAnnot))
for (i in 1:nrow(centromeres)) {
  ingap <- chrom == centromeres$chrom[i] & pos > centromeres$left.base[i] &
    pos < centromeres$right.base[i]
  gap <- gap | ingap
}
ignore <- snpAnnot$missing.n1 == 1 #ignore includes intensity-only and failed snps
snp.exclude <- ignore | hla | xtr | gap
snp.ok <- snpAnnot$snpID[!snp.exclude]


###################################################
### code chunk number 79: DataCleaning.Rnw:1822-1827
###################################################
scan.ids <- scanAnnot$scanID[1:10]
chrom.ids <- 21:23
baf.seg <- anomSegmentBAF(blData, genoData, scan.ids=scan.ids,
  chrom.ids=chrom.ids, snp.ids=snp.ok, verbose=FALSE)
head(baf.seg)


###################################################
### code chunk number 80: DataCleaning.Rnw:1832-1838
###################################################
baf.anom <- anomFilterBAF(blData, genoData, segments=baf.seg,
  snp.ids=snp.ok, centromere=centromeres, low.qual.ids=low.qual.ids,
  verbose=FALSE)
names(baf.anom)
baf.filt <- baf.anom$filtered
head(baf.filt)


###################################################
### code chunk number 81: DataCleaning.Rnw:1850-1856
###################################################
loh.anom <- anomDetectLOH(blData, genoData, scan.ids=scan.ids,
  chrom.ids=chrom.ids, snp.ids=snp.ok, known.anoms=baf.filt,
  verbose=FALSE)
names(loh.anom)
loh.filt <- loh.anom$filtered
head(loh.filt)


###################################################
### code chunk number 82: DataCleaning.Rnw:1865-1879
###################################################
# create required data frame
baf.filt$method <- "BAF"
if (!is.null(loh.filt)) {
  loh.filt$method <- "LOH"
  cols <- intersect(names(baf.filt), names(loh.filt))
  anoms <- rbind(baf.filt[,cols], loh.filt[,cols])
} else {
  anoms <- baf.filt
}
anoms$anom.id <- 1:nrow(anoms)

stats <- anomSegStats(blData, genoData, snp.ids=snp.ok, anom=anoms,
                      centromere=centromeres)
names(stats)


###################################################
### code chunk number 83: DataCleaning.Rnw:1885-1888
###################################################
snp.not.ok <- snpAnnot$snpID[snp.exclude]
anomStatsPlot(blData, genoData, anom.stats=stats[1,],
  snp.ineligible=snp.not.ok, centromere=centromeres, cex.leg=1)


###################################################
### code chunk number 84: DataCleaning.Rnw:1898-1900
###################################################
lrr.sd <- sdByScanChromWindow(blData, var="LogRRatio", incl.hom=TRUE)
med.lrr.sd <- medianSdOverAutosomes(lrr.sd)


###################################################
### code chunk number 85: DataCleaning.Rnw:1906-1908
###################################################
baf.seg.info <- baf.anom$seg.info
loh.seg.info <- loh.anom$base.info[,c("scanID", "chromosome", "num.segs")]


###################################################
### code chunk number 86: DataCleaning.Rnw:1918-1923
###################################################
snpAnnot$eligible <- !snp.exclude
baf.low.qual <- anomIdentifyLowQuality(snpAnnot, med.baf.sd, baf.seg.info,
  sd.thresh=0.1, sng.seg.thresh=0.0008, auto.seg.thresh=0.0001)
loh.low.qual <- anomIdentifyLowQuality(snpAnnot, med.lrr.sd, loh.seg.info,
  sd.thresh=0.25, sng.seg.thresh=0.0048, auto.seg.thresh=0.0006)


###################################################
### code chunk number 87: DataCleaning.Rnw:1928-1930
###################################################
close(blData)
close(genoData)


###################################################
### code chunk number 88: DataCleaning.Rnw:1945-1957
###################################################
# anomalies to filter
anom.filt <- stats[,c("scanID", "chromosome", "left.base", "right.base")]
# whole.chrom column is required and can be used for sex chromosome
#   anomalies such as XXX
anom.filt$whole.chrom <- FALSE
# select unique subjects
subj <- scanAnnot$scanID[!scanAnnot$duplicated]
subj.filt.file <- "subj_filt.gds"
setMissingGenotypes(genofile, subj.filt.file, anom.filt,
                    file.type="gds", sample.include=subj, verbose=FALSE)
(gds <- GdsGenotypeReader(subj.filt.file))
close(gds)


###################################################
### code chunk number 89: DataCleaning.Rnw:1982-1984
###################################################
scan.excl <- scanAnnot$scanID[scanAnnot$missing.e1 >= 0.05]
length(scan.excl)


###################################################
### code chunk number 90: DataCleaning.Rnw:1995-2026
###################################################
snp.excl <- snpAnnot$snpID[snpAnnot$missing.n1 == 1]
length(snp.excl)
genofile <- system.file("extdata", "illumina_geno.gds", package="GWASdata")
genoGDS <- GdsGenotypeReader(genofile)
genoData <- GenotypeData(genoGDS, snpAnnot=snpAnnot, scanAnnot=scanAnnot)
dupdisc <- duplicateDiscordance(genoData, subjName.col="subjectID",
  scan.exclude=scan.excl, snp.exclude=snp.excl)
names(dupdisc)
head(dupdisc$discordance.by.snp)
length(dupdisc$discordance.by.subject)
dupdisc$discordance.by.subject[[2]]
# each entry is a 2x2 matrix, but only one value of each
# is important since these are all pairs
npair <- length(dupdisc$discordance.by.subject)
disc.subj <- rep(NA, npair)
subjID <- rep(NA, npair)
race <- rep(NA, npair)
for (i in 1:npair) {
  disc.subj[i] <- dupdisc$discordance.by.subject[[i]][1,2]
  subjID[i] <- names(dupdisc$discordance.by.subject)[i]
  race[i] <- scanAnnot$race[scanAnnot$subjectID == subjID[i]][1]
}
dat <- data.frame(subjID=subjID, disc=disc.subj, pop=race,
                  stringsAsFactors=FALSE)
summary(dat$disc)
# Assign colors for the duplicate samples based on population group.
dat$col <- NA
dat$col[dat$pop == "CEU"] <- "blue"
dat$col[dat$pop == "YRI"] <- "red"
dat <- dat[order(dat$disc),]
dat$rank <- 1:npair


###################################################
### code chunk number 91: DataCleaning.Rnw:2029-2034
###################################################
# Plot the sample discordance rates color coded by race.
plot(dat$disc, dat$rank, col=dat$col, ylab="rank",
  xlab="Discordance rate between duplicate samples",
  main="Duplicate Sample Discordance by Continental Ancestry")
legend("bottomright", unique(dat$pop), pch=rep(1,2), col=unique(dat$col))


###################################################
### code chunk number 92: DataCleaning.Rnw:2058-2059
###################################################
duplicateDiscordanceProbability(npair)


###################################################
### code chunk number 93: DataCleaning.Rnw:2080-2092
###################################################
men.list <- with(pData(scanAnnot), mendelList(family, subjectID,
  father, mother, sex, scanID))
res <- mendelListAsDataFrame(men.list)
head(res)
dim(res)
# Only want to use SNPs with missing.n1 < 0.05
snp.excl <- snpAnnot$snpID[snpAnnot$missing.n1 >= 0.05]
length(snp.excl)
mend <- mendelErr(genoData, men.list, snp.exclude=snp.excl)
names(mend)
head(mend$trios)
names(mend$snp)


###################################################
### code chunk number 94: DataCleaning.Rnw:2102-2105
###################################################
# Calculate the error rate
err <- mend$snp$error.cnt / mend$snp$check.cnt
table(err == 0, useNA="ifany")


###################################################
### code chunk number 95: DataCleaning.Rnw:2108-2110
###################################################
plot(err, rank(err), xlab="Error Rate (fraction)",
  ylab="rank", main="Mendelian Error Rate per SNP, ranked")


###################################################
### code chunk number 96: DataCleaning.Rnw:2118-2130
###################################################
fam <- mend$snp$error.cnt
n <- mend$snp$check.cnt
summary(fam)
# SNPs with errors
length(fam[n > 0 & fam > 0])
# SNPs for which more than one family has an error
length(fam[n > 0 & fam > 1])
# Get the SNPs with valid trios for error detection
val <- length(fam[n > 0])
noerr <- length(fam[n > 0 & fam == 0])
# Divide to get fraction with no errors
noerr / val


###################################################
### code chunk number 97: DataCleaning.Rnw:2137-2150
###################################################
snp.sel <- match(names(mend$snp$error.cnt), snpAnnot$snpID)
snpAnnot$mendel.err.count[snp.sel] <- mend$snp$error.cnt
snpAnnot$mendel.err.sampsize[snp.sel] <- mend$snp$check.cnt
allequal(snpAnnot$snpID, sort(snpAnnot$snpID))
# The high number of NA values is due to the filtering out of SNPs
#    before the Mendelian error rate calculation
sum(is.na(snpAnnot$mendel.err.count))
sum(is.na(snpAnnot$mendel.err.sampsize))
varMetadata(snpAnnot)["mendel.err.count", "labelDescription"] <-
  paste("number of Mendelian errors detected in trios averaged over",
        "multiple combinations of replicate genotyping instances")
varMetadata(snpAnnot)["mendel.err.sampsize", "labelDescription"] <-
  "number of opportunities to detect Mendelian error in trios"


###################################################
### code chunk number 98: DataCleaning.Rnw:2162-2179
###################################################
# Get a vector of SNPs to check
snp <- pData(snpAnnot)
snp$err.rate <- snp$mendel.err.count /
  snp$mendel.err.sampsize
snp <- snp[order(snp$err.rate, decreasing=TRUE),]
snp <- snp[1:9,]
xyfile <- system.file("extdata", "illumina_qxy.gds", package="GWASdata")
xyGDS <- GdsIntensityReader(xyfile)
xyData <- IntensityData(xyGDS, snpAnnot=snpAnnot, scanAnnot=scanAnnot)
pdf(file="DataCleaning-mendel.pdf")
par(mfrow = c(3,3))
mtxt <- paste("SNP", snp$rsID, "\nMendelian Error Rate",
   format(snp$err.rate, digits=5))
genoClusterPlot(xyData, genoData, snpID=snp$snpID, main.txt=mtxt,
                cex.main=0.9)
dev.off()
close(xyData)


###################################################
### code chunk number 99: DataCleaning.Rnw:2192-2209
###################################################
# Calculate the fraction of SNPs with an error for each trio
trios <- mend$trios
trios$Mend.err <- trios$Men.err.cnt/trios$Men.cnt
summary(trios$Mend.err)
# Start by pulling out the vectors needed from `trios'
tmp <- trios[, c("fam.id", "Mend.err")]; dim(tmp)
# Change fam.id to match the sample annotation column name
names(tmp) <- c("family", "Mend.err.rate.fam")
# Merge the variables into the sample annotation file
scanAnnot$mend.err.rate.fam <- NA
for (i in 1:nrow(tmp)) {
  ind <- which(is.element(scanAnnot$family, tmp$family[i]))
  scanAnnot$mend.err.rate.fam[ind] <- tmp$Mend.err.rate.fam[i]
}
head(scanAnnot$mend.err.rate.fam)
varMetadata(scanAnnot)["mend.err.rate.fam", "labelDescription"] <-
  "Mendelian error rate per family"


###################################################
### code chunk number 100: DataCleaning.Rnw:2217-2228
###################################################
# Get the families that have non-NA values for the family
#     Mendelian error rate
fams <- pData(scanAnnot)[!is.na(scanAnnot$mend.err.rate.fam) &
  !duplicated(scanAnnot$family), c("family",
  "mend.err.rate.fam", "race")]
dim(fams)
table(fams$race, useNA="ifany")
# Assign colors for the different ethnicities in these families
pcol <- rep(NA, nrow(fams))
pcol[fams$race == "CEU"] <- "blue"
pcol[fams$race == "YRI"] <- "red"


###################################################
### code chunk number 101: DataCleaning.Rnw:2231-2236
###################################################
plot(fams$mend.err.rate.fam*100, rank(fams$mend.err.rate.fam),
  main="Mendelian Error rate per Family, ranked",
  xlab="Mendelian error rate per family (percent)",
  ylab="rank", col=pcol)
legend("bottomright", c("CEU", "YRI"), pch=c(1,1), col=c("blue", "red"))


###################################################
### code chunk number 102: DataCleaning.Rnw:2254-2269
###################################################
head(pData(scanAnnot)[,c("father", "mother")])
nonfounders <- scanAnnot$father != 0 &
               scanAnnot$mother != 0
table(nonfounders)
scan.excl <- scanAnnot$scanID[scanAnnot$race != "CEU" |
   nonfounders | scanAnnot$duplicated]
length(scan.excl)

chr <- getChromosome(genoData)
auto <- range(which(chr %in% 1:22))
X <- range(which(chr == 23))
hwe <- exactHWE(genoData, scan.exclude=scan.excl, snpStart=auto[1], snpEnd=auto[2])
hweX <- exactHWE(genoData, scan.exclude=scan.excl, snpStart=X[1], snpEnd=X[2])
hwe <- rbind(hwe, hweX)
close(genoData)


###################################################
### code chunk number 103: DataCleaning.Rnw:2275-2285
###################################################
names(hwe)
dim(hwe)
# Check on sample sizes for autosomes and X chromosome
hwe$N <- hwe$nAA + hwe$nAB + hwe$nBB
summary(hwe$N[is.element(hwe$chr,1:22)])
summary(hwe$N[is.element(hwe$chr,23)])
hwe$pval[1:10]
sum(is.na(hwe$pval[hwe$chr == 23])) # X
hwe$MAF[1:10]
hwe[1:3, c("nAA", "nAB", "nBB")]


###################################################
### code chunk number 104: DataCleaning.Rnw:2292-2293
###################################################
summary(hwe$f)


###################################################
### code chunk number 105: DataCleaning.Rnw:2296-2298
###################################################
hist(hwe$f, main="Histogram of the Inbreeding Coefficient
  For CEU Samples", xlab="Inbreeding Coefficient")


###################################################
### code chunk number 106: DataCleaning.Rnw:2301-2304
###################################################
# Check the MAF of those SNPs with f=1
chkf <- hwe[!is.na(hwe$f) & hwe$f==1,]; dim(chkf)
summary(chkf$MAF)


###################################################
### code chunk number 107: DataCleaning.Rnw:2311-2322
###################################################
hwe.0 <- hwe[hwe$MAF > 0,]; dim(hwe.0)
# Only keep the autosomal SNPs for first plot
pval <- hwe.0$pval[is.element(hwe.0$chr, 1:22)]
length(pval)
pval <- pval[!is.na(pval)]
length(pval)
# X chromosome SNPs for plot 2
pval.x <- hwe.0$pval[is.element(hwe.0$chr, 23)]
length(pval.x)
pval.x <- pval.x[!is.na(pval.x)]
length(pval.x)


###################################################
### code chunk number 108: DataCleaning.Rnw:2324-2331
###################################################
pdf(file = "DataCleaning-hwe.pdf")
par(mfrow=c(2,2))
qqPlot(pval=pval, truncate = FALSE, main="Autosomes, all")
qqPlot(pval=pval, truncate = TRUE, main="Autosomes, truncated")
qqPlot(pval=pval.x, truncate = FALSE, main="X chromosome, all")
qqPlot(pval=pval.x, truncate = TRUE, main="X chromosome, truncated")
dev.off()


###################################################
### code chunk number 109: DataCleaning.Rnw:2337-2340
###################################################
plot(hwe.0$MAF, -log10(hwe.0$pval),
  xlab="Minor Allele Frequency", ylab="-log(p-value)",
  main="Minor Allele Frequency vs\nP-value")


###################################################
### code chunk number 110: DataCleaning.Rnw:2382-2398
###################################################
genoGDS <- GdsGenotypeReader(subj.filt.file)
subjAnnot <- scanAnnot[scanAnnot$scanID %in% getScanID(genoGDS),]
subjAnnot$sex <- as.factor(subjAnnot$sex)
subjAnnot$EV1 <- pca$eigenvect[match(subjAnnot$scanID, pca$sample.id), 1]

genoData <- GenotypeData(genoGDS, scanAnnot=subjAnnot)
chr <- getChromosome(genoData)
assoc.list <- lapply(unique(chr), function(x) {
    ## Y chromsome only includes males, cannot have sex as a covariate
    covar <- ifelse(x == 25, "EV1", c("sex", "EV1"))
    start <- which(chr == x)[1]
    assocRegression(genoData, outcome="status", covar=covar, model.type="logistic", 
                    snpStart=start, snpEnd=start+50)
})
assoc <- do.call(rbind, assoc.list)
close(genoData)


###################################################
### code chunk number 111: DataCleaning.Rnw:2412-2414
###################################################
qqPlot(pval=assoc$Wald.pval,
  truncate=TRUE, main="QQ Plot of Wald Test p-values")


###################################################
### code chunk number 112: DataCleaning.Rnw:2422-2425
###################################################
chrom <- getChromosome(snpAnnot, char=TRUE)
snp.sel <- getSnpID(snpAnnot) %in% assoc$snpID
manhattanPlot(assoc$Wald.pval, chromosome=chrom[snp.sel])


###################################################
### code chunk number 113: DataCleaning.Rnw:2436-2454
###################################################
# Identify SNPs with lowest p-values
snp <- pData(snpAnnot)[snp.sel, c("snpID", "rsID")]
snp$pval <- assoc$Wald.pval
snp <- snp[order(snp$pval),]
snp <- snp[1:9,]
xyfile <- system.file("extdata", "illumina_qxy.gds", package="GWASdata")
xyGDS <- GdsIntensityReader(xyfile)
xyData <- IntensityData(xyGDS, snpAnnot=snpAnnot, scanAnnot=scanAnnot)
genofile <- system.file("extdata", "illumina_geno.gds", package="GWASdata")
genoGDS <- GdsGenotypeReader(genofile)
genoData <- GenotypeData(genoGDS, snpAnnot=snpAnnot, scanAnnot=scanAnnot)
pdf(file="DataCleaning-cluster.pdf")
par(mfrow = c(3,3))
mtxt <- paste("SNP", snp$rsID, "\np =", format(snp$pval, digits=4))
genoClusterPlot(xyData, genoData, snpID=snp$snpID, main.txt=mtxt)
dev.off()
close(xyData)
close(genoData)


###################################################
### code chunk number 114: DataCleaning.Rnw:2461-2462
###################################################
unlink(subj.filt.file)

Try the GWASTools package in your browser

Any scripts or data that you put into this service are public.

GWASTools documentation built on Nov. 8, 2020, 7:49 p.m.