iCNV-vignette.R
In iCNV: Integrated Copy Number Variation detection

## ----CODEX_Normalization_1, message=FALSE,results='hide',warning=FALSE--------

### 2.2 code chunk number 1: bambedObj1
library(CODEX)
library(WES.1KG.WUGSC) # Load Toy data from the 1000 Genomes Project.
library(iCNV) 
# Assume your work dir set to demo
dir <- system.file("extdata", package = "WES.1KG.WUGSC") # 'PATH/TO/BAM'
bamFile <- list.files(dir, pattern = '*.bam$')
bamdir <- file.path(dir, bamFile)
sampname <- as.matrix(read.table(file.path(dir, "sampname")))
bedFile <- file.path(dir, "chr22_400_to_500.bed")
chr <- 22
bambedObj <- getbambed(bamdir = bamdir, bedFile = bedFile, 
                   sampname = sampname, projectname = "icnv.demo.", chr)
bamdir <- bambedObj$bamdir; sampname <- bambedObj$sampname
ref <- bambedObj$ref; projectname <- bambedObj$projectname; chr <- bambedObj$chr

### 2.2 code chunk number 2: coverageObj1
coverageObj <- getcoverage(bambedObj, mapqthres = 20)
Y <- coverageObj$Y; readlength <- coverageObj$readlength

### 2.2 code chunk number 3: gcmapp1
gc <- getgc(chr, ref)
mapp <- getmapp(chr, ref)

### 2.2 code chunk number 4: qcObj1
qcObj <- qc(Y, sampname, chr, ref, mapp, gc, cov_thresh = c(20, 4000), 
            length_thresh = c(20, 2000), mapp_thresh = 0.9, gc_thresh = c(20, 80))
Y_qc <- qcObj$Y_qc; sampname_qc <- qcObj$sampname_qc; gc_qc <- qcObj$gc_qc
mapp_qc <- qcObj$mapp_qc; ref_qc <- qcObj$ref_qc; qcmat <- qcObj$qcmat

### 2.2 code chunk number 5: normObj1
normObj <- normalize(Y_qc, gc_qc, K = 1:9)
Yhat <- normObj$Yhat; AIC <- normObj$AIC; BIC <- normObj$BIC
RSS <- normObj$RSS; K <- normObj$K
choiceofK(AIC, BIC, RSS, K, filename = paste0(projectname, chr, "_choiceofK.pdf"))
save(qcObj,normObj,sampname,file=paste0(projectname,'plrObj_', chr,".rda"))
# This saving step is necessary for large dataset, such that you don't need 
# to recompute everything again if you would like to view your choice of K 
# plot. You could just load this as in the next chunk of code.

## ----fig.height=2, fig.width=8, echo=FALSE, out.width = "200%", dpi=200-------
Kmax <- length(AIC)
par(mfrow = c(1, 3))
plot(K, RSS, type = "b", xlab = "Number of latent variables")
plot(K, AIC, type = "b", xlab = "Number of latent variables")
plot(K, BIC, type = "b", xlab = "Number of latent variables")

## ----CODEX_Normalization_2, message=FALSE-------------------------------------
load(paste0(projectname,'plrObj_',chr,".rda"))
optK <- K[which.max(BIC)] # by default or customize given curves

### 2.2 code chunk number 6: plr
Y_qc <- qcObj$Y_qc; sampname_qc <- qcObj$sampname_qc; gc_qc <- qcObj$gc_qc
mapp_qc <- qcObj$mapp_qc; ref_qc <- qcObj$ref_qc; qcmat <- qcObj$qcmat
Yhat <- normObj$Yhat; AIC <- normObj$AIC; BIC <- normObj$BIC
RSS <- normObj$RSS; K <- normObj$K
ref_qc <- qcObj$ref_qc # IRanges object for exon target
sampname_qc <- qcObj$sampname_qc # sample names
Y_norm <- normObj$Yhat[[optK]] # normalized read count under null (no CNV)
plr <- log(pmax(Y_qc,0.0001)/pmax(Y_norm,0.0001)) # log transformed z-scores
ngs_plr <- lapply(seq_len(ncol(plr)), function(i) plr[,i])
ngs_plr.pos <- lapply(seq_len(ncol(plr)),function(x){return(cbind(start(ref_qc),end(ref_qc)))})
save(sampname_qc,ngs_plr,ngs_plr.pos,file=paste0(projectname,'plrObj_',chr,'_',optK,'.rda'))

## ----message=FALSE,results='hide',warning=FALSE-------------------------------
### 2.3 code chunk number 1: bambaf_from_vcf
projectname <- "icnv.demo."
dir <- system.file("extdata", package="iCNV")
bambaf_from_vcf(dir,'bam_vcf.list',chr=22,projectname) 
# ignore chr argement if you want to convert all chromosome 1-22
load(paste0(projectname,'bambaf_22.rda'))

## ----results='hide'-----------------------------------------------------------
### 2.4 code chunk number 1: icnv_array_input
# PATH/TO/FOLDER where you stored the array data
dir <- system.file("extdata", package="iCNV")#PATH/TO/FOLDER where you stored the array data
chr <- 22
projectname <- "icnv.demo."
pattern <- paste0('*.csv.arrayicnv$')
get_array_input(dir,pattern,chr,projectname)
load(paste0(projectname,'array_lrrbaf_',chr,'.rda'))

## ----fig.height=4, fig.width=6, out.width = "200%", dpi=200-------------------
### 2.4 code chunk number 2: svd (optional)
library(iCNV)
projectname <- "icnv.demo."
load(paste0(projectname,'array_lrrbaf_',chr,'.rda'))
# standardize LRR
lrr.sd <- mapply(function(x){
  x=pmax(pmin(x,100),-100);(x-mean(x,na.rm=TRUE))/sd(x,na.rm = TRUE)
  },snp_lrr,SIMPLIFY = TRUE)
# You may want to substitute all the NAs in lrr by rowMeans
lrr.sd.dena <- apply(lrr.sd,2,function(x){x[is.na(x)]=mean(x,na.rm=TRUE);return(x)})

# lrr is LRR matrix, with row as sample and column as positions
lrr.svd <- svd (t(lrr.sd.dena))
save(lrr.svd,file=paste0(projectname,'array_lrrbaf_svd_',chr,'.rda'))

# Plot out the variance explained with K; You can store the plot in a pdf
# pdf(paste0(projectname,'array_lrrbaf_svd_',chr,'.pdf'),height=8,width=10)
plot(x=seq_len(10),y=(lrr.svd$d[1:10])^2/sum(lrr.svd$d^2),
     xlab='rank',ylab='D square',main='Variance explained',type='l')
# dev.off() # Close the plot connection

## ----results='hide',message=FALSE---------------------------------------------
### 2.5 code chunk number 1: iCNV_detection with genotype
str(ngs_plr) # List of n vector, each one is the PLR for an exon
str(ngs_plr.pos) # List of n matrix (p x 2), each one is the start and end location for an exon
str(ngs_baf) # List of n vector, each one is the variants BAF from .bam
str(ngs_baf.pos) # List of n vector, each one is the variants BAF position
str(snp_lrr) # List of n vector, each one is the normalized LRR for a SNP
str(snp_lrr.pos) # List of n vector, each one is a SNP position
str(snp_baf) # List of n vector, each one is the BAF for a SNP
str(snp_baf.pos) # List of n vector, each one is the SNP BAF position
projectname <- 'icnv.demo.'
icnv_res0 <- iCNV_detection(ngs_plr,snp_lrr,
                         ngs_baf,snp_baf,
                         ngs_plr.pos,snp_lrr.pos,
                         ngs_baf.pos,snp_baf.pos,
                         projname=projectname,CN=1,mu=c(-3,0,2),cap=TRUE,visual = 1)
icnv.output <- output_list(icnv_res0,sampname_qc,CN=1)
head(icnv.output)

## ----fig.width=8, fig.height=4,echo=FALSE, out.width = "200%", dpi=200--------
plotHMMscore(icnv_res0,title=projectname)

## -----------------------------------------------------------------------------
### 2.5 code chunk number 2: covert result to Genome Browser input
icnv.output <- output_list(icnv_res=icnv_res0,sampleid=sampname_qc, CN=0, min_size=10000)
gb_input <- icnv_output_to_gb(22,icnv.output)
write.table(gb_input,file=paste0(projectname,'icnv_res_gb_chr',chr,'.tab'),
            quote=FALSE,col.names=FALSE,row.names=FALSE)

## ----fig.width=6, fig.height=3, echo=F, out.width = "100%", dpi=75------------
plot(c(100, 190), c(300, 450), type= "n",xaxt='n',yaxt='n', ann=FALSE, xlab = "", ylab = "")
rect(100, 310, 140, 360, col = "#FF0000") # 
rect(100, 400, 140, 450, col = "#820000") # 
rect(150, 310, 190, 360, col = "#00FF00")
rect(150, 400, 190, 450, col = "#008200")
text(120,390,'Homo Deletion')
text(120,300,'Het Deletion')
text(170,390,'Homo Duplication')
text(170,300,'Het Duplication')

## ----echo=FALSE,fig.height=4, fig.width=8, out.width = "200%", dpi=200--------
plotHMMscore(icnv_res0,title=projectname,col='DGV')

## ----message=FALSE,results='hide',fig.keep='all',fig.height=3, fig.width=8, out.width = "200%", dpi=200----
### 2.5 code chunk number 3: plotindi
I <- 24 # ploting out the 24th individual
plotindi(ngs_plr,snp_lrr,
         ngs_baf,snp_baf,
         ngs_plr.pos,snp_lrr.pos,
         ngs_baf.pos,snp_baf.pos,
         icnv_res0,I)

## ----message=FALSE,results='hide',fig.keep='all',fig.height=4, fig.width=8, out.width = "200%", dpi=200----
### 2.6 code chunk number 1: NGS only iCNV_detection
str(ngs_plr) # List of n vector, each one is the PLR for an exon
str(ngs_plr.pos) # List of n matrix (p x 2), each one is the start and end location for an exon
str(ngs_baf) # List of n vector, each one is the variants BAF from .bam
str(ngs_baf.pos) # List of n vector, each one is the variants BAF position
projectname <- 'icnv.demo.ngs.'
icnv_res0_ngs <- iCNV_detection(ngs_plr=ngs_plr, ngs_baf = ngs_baf, 
                             ngs_plr.pos = ngs_plr.pos,ngs_baf.pos = ngs_baf.pos,
                             projname=projectname,CN=0,mu=c(-3,0,2),cap=TRUE,visual = 2)
icnv.output <- output_list(icnv_res0_ngs,sampname_qc,CN=0)
head(icnv.output)
plotHMMscore(icnv_res0_ngs,title=projectname)

## ----message=FALSE,results='hide',fig.keep='all',fig.height=4, fig.width=8, out.width = "200%", dpi=200----
### 2.6 code chunk number 2: array only iCNV_detection
str(snp_lrr) # List of n vector, each one is the normalized LRR for a SNP
str(snp_lrr.pos) # List of n vector, each one is a SNP position
str(snp_baf) # List of n vector, each one is the BAF for a SNP
str(snp_baf.pos) # List of n vector, each one is the SNP BAF position
projectname <- 'icnv.demo.snp'
icnv_res0_snp <- iCNV_detection(snp_lrr=snp_lrr, snp_baf = snp_baf, 
                             snp_lrr.pos = snp_lrr.pos,snp_baf.pos = snp_baf.pos,
                             projname=projectname,CN=1,mu=c(-3,0,2),cap=TRUE,visual = 2)
icnv.output <- output_list(icnv_res0_snp,sampname_qc,CN=1)
head(icnv.output)
plotHMMscore(icnv_res0_snp,title=projectname)

## -----------------------------------------------------------------------------
sessionInfo()