PrepareInputData.R
In CINdex: Chromosome Instability Index

## ---- results='hide', message=FALSE-------------------------------------------
#if (!requireNamespace("BiocManager", quietly=TRUE))
    #install.packages("BiocManager")

library(GenomicRanges)
library(AnnotationHub)
library(pd.genomewidesnp.6)
library(rtracklayer)
library(biovizBase) #needed for stain information
library(CINdex)
library(IRanges)

## ----seg----------------------------------------------------------------------
#Load example segment data into the workspace. 
data("grl.data")
#Examining the class of the object - GRangesList
class(grl.data) 

#Print first few rows
head(grl.data)

#The names of the list items in the GRangesList
names(grl.data)

# NOTE - The names of the list items in 'grl.data' must match the 
# sample names in the clinical data input 'clin.crc'

#Extracting segment information for the sample named "s4" shown as a GRanges object
grl.data[["s4"]]  
#You can see that each row in the GRanges object is a segment. The "value" columns shows the 
#copy number value for that segment.



## ----probeAnnotHg19, cache=TRUE-----------------------------------------------
#connect to the underlying SQLite database that is part of the pd.genomewidesnp.6 package
con <- db(pd.genomewidesnp.6)
# get the copy number probes
cnv <- dbGetQuery(con, "select man_fsetid, chrom, chrom_start, chrom_stop, 
                  strand from featureSetCNV;")
head(cnv, n =3) #print first few rows 
#get the SNP probes
snp <- dbGetQuery(con, "select man_fsetid, dbsnp_rs_id, chrom, physical_pos, 
                  strand from featureSet;")
head(snp, n=3)

## ---- cache=TRUE--------------------------------------------------------------
#function to convert Copy number data into GRanges object
convert.to.gr.cnv <- function(cnv2) {
    cnv2$chrom <- paste0("chr", cnv2$chrom)
    # subset out SNPs with missing location
    cnv2 <- cnv2[!(is.na(cnv2$chrom) | is.na(cnv2$chrom_start)),]
    # convert strand info to +,-,*
    cnv2$strand[is.na(cnv2$strand)] <- 2
    cnv2$strand <- cnv2$strand + 1
    cnv2$strand <- c("+", "-", "*")[cnv2$strand]
    #convert into GRanges object
    cnv2.gr <- GRanges(cnv2$chrom, IRanges(cnv2$chrom_start,cnv2$chrom_stop), 
                       cnv2$strand, ID = cnv2$man_fsetid)
    return(cnv2.gr)
}

#function to convert SNP data into GRanges object
convert.to.gr.snp <- function(snp2) {
    
    # make chromosomes the same as a chain file
    snp2$chrom <- paste0("chr", snp2$chrom)
    # subset out SNPs with missing location
    snp2 <- snp2[!(is.na(snp2$chrom) | is.na(snp2$physical_pos)),]
    # convert strand info to +,-,*
    snp2$strand[is.na(snp2$strand)] <- 2
    snp2$strand <- snp2$strand + 1
    snp2$strand <- c("+", "-", "*")[snp2$strand]
    snp2.gr <- GRanges(snp2$chrom, IRanges(snp2$physical_pos,snp2$physical_pos), 
                       snp2$strand, ID = snp2$man_fsetid, 
                       dbsnp = snp2$dbsnp_rs_id)
    return(snp2.gr)
}
# convert this copy number data from into a GRanges object
cnv.gr <- convert.to.gr.cnv(cnv2 = cnv) 
head(cnv.gr, n=3)

# convert this SNP data from into a GRanges object
snp.gr <- convert.to.gr.snp(snp2 = snp)
head(snp.gr, n=3)

## ---- cache=TRUE--------------------------------------------------------------
#subset only those probes that are in autosomes
snpgr.19.auto <- subset(snp.gr, seqnames(snp.gr) %in% c("chr1", 
                                                            "chr2", "chr3","chr4",
                                                            "chr5", "chr6", "chr7", 
                                                            "chr8", "chr9", "chr10", 
                                                            "chr11", "chr12", "chr13", 
                                                            "chr14", "chr15", "chr16",
                                                            "chr17", "chr18","chr19", 
                                                            "chr20", "chr21", "chr22"))

#subset only those probes that are in autosomes
cnvgr.19.auto <- subset(cnv.gr, seqnames(cnv.gr) %in% c("chr1", 
                                                            "chr2", "chr3","chr4",
                                                            "chr5", "chr6", "chr7", 
                                                            "chr8", "chr9", "chr10", 
                                                            "chr11", "chr12", "chr13", 
                                                            "chr14", "chr15", "chr16",
                                                            "chr17", "chr18","chr19", 
                                                            "chr20", "chr21", "chr22"))

#This gives a total of 1756096 probes (copy number and SNP) on this Affymetrix chip

## ----probeAnnotHg18, cache=TRUE-----------------------------------------------
con <- db(pd.genomewidesnp.6)
cnv2 <- dbGetQuery(con, "select man_fsetid, chrom, chrom_start, 
                   chrom_stop, strand from featureSetCNV;")
snp2 <- dbGetQuery(con, "select man_fsetid, dbsnp_rs_id, chrom, 
                   physical_pos, strand from featureSet;")

## ---- cache=TRUE--------------------------------------------------------------
# convert this copy number data into a GRanges object
cnv2gr <- convert.to.gr.cnv(cnv2 = cnv2) 
head(cnv2gr, n=3)

# convert this SNP data into a GRanges object
snp2gr <- convert.to.gr.snp(snp2 = snp2)
head(snp2gr, n=3)

## ---- eval=FALSE--------------------------------------------------------------
#  download.file("http://hgdownload.cse.ucsc.edu/gbdb/hg19/liftOver/hg19ToHg18.over.chain.gz",
#                "hg19ToHg18.over.chain.gz")
#  system("gzip -d hg19ToHg18.over.chain.gz") ## have to decompress

## ---- cache=TRUE--------------------------------------------------------------
## now use liftOver from rtracklayer, using the hg19ToHg18.over.chain from UCSC
chain <- import.chain("hg19ToHg18.over.chain")
snpgr.18 <- unlist(liftOver(snp2gr, chain))
head(snpgr.18, n=3)

cnvgr.18 <- unlist(liftOver(cnv2gr, chain))
head(cnvgr.18, n=3)

#subset only those probes that are in autosomes
snpgr.18.auto <- subset(snpgr.18, seqnames(snpgr.18) %in% c("chr1", 
                                                            "chr2", "chr3","chr4",
                                                            "chr5", "chr6", "chr7", 
                                                            "chr8", "chr9", "chr10", 
                                                            "chr11", "chr12", "chr13", 
                                                            "chr14", "chr15", "chr16",
                                                            "chr17", "chr18","chr19", 
                                                            "chr20", "chr21", "chr22"))

#subset only those probes that are in autosomes
cnvgr.18.auto <- subset(cnvgr.18, seqnames(cnvgr.18) %in% c("chr1", 
                                                            "chr2", "chr3","chr4",
                                                            "chr5", "chr6", "chr7", 
                                                            "chr8", "chr9", "chr10", 
                                                            "chr11", "chr12", "chr13", 
                                                            "chr14", "chr15", "chr16",
                                                            "chr17", "chr18","chr19", 
                                                            "chr20", "chr21", "chr22"))

#This gives us a total of 1756029 probes (about 1.7 million)

#Save these objects for future
#save(cnvgr.18.auto, file = "cnvgr.18.auto.RData")
#save(snpgr.18.auto, file = "snpgr.18.auto.RData")

## ----refGen, message=FALSE, warning=FALSE-------------------------------------
hg18.ucsctrack <- getIdeogram("hg18", cytoband = TRUE) 

head(hg18.ucsctrack, n=3)
#The user must ensure that the input object is a GRanges object

#Save this object for future use 
#save(hg18.ucsctrack, file = "hg18.ucsctrack.RData")

#NOTE - To get the reference file for hg19, use the code below
#hg19IdeogramCyto <- getIdeogram("hg19", cytoband = TRUE) 

## ----Clin---------------------------------------------------------------------
data("clin.crc")

# checking the class of the object
class(clin.crc)

# checking the structure
str(clin.crc)

#Let us examine the first five rows of this object
head(clin.crc,5)

# Look at sample names 
clin.crc[,1]

## ----results='hide', message=FALSE--------------------------------------------
##BiocManager::install("TxDb.Hsapiens.UCSC.hg18.knownGene")
##BiocManager::install("Homo.sapiens")
library(TxDb.Hsapiens.UCSC.hg18.knownGene)
library(Homo.sapiens)

## ----GeneAnn------------------------------------------------------------------
# We will continue to use hg18 gene annotations in this tutorial.
TxDb(Homo.sapiens) <- TxDb.Hsapiens.UCSC.hg18.knownGene
z <- select(Homo.sapiens, keys(Homo.sapiens, "ENTREZID"),
            c("CDSID","CDSCHROM","CDSSTRAND","CDSSTART","CDSEND","SYMBOL"), "ENTREZID")
z1 <- na.omit(object = z) #remove NA values

# extracting only the columns we want as a matrix
geneAnno <- cbind(z1$CDSCHROM, z1$CDSSTRAND,  z1$CDSSTART, z1$CDSEND, z1$SYMBOL)
colnames(geneAnno) <- c("chrom","strand", "cdsStart", "cdsEnd", "GeneName")

#So this gene annotation file looks like this
head(geneAnno, n=3)

# Examining the class and structure of this oject
class(geneAnno)
str(geneAnno)

#Save this object for future use 
#save(geneAnno, file = "geneAnno.RData")