````r cat("") cat("") cat("") cat("") cat("") cat("")

Chip
========================================================
Feature Selection was done with the ArrayPro Software of TECAN. 

Feature Selection Parameters were chosen the following: 

Every grid was inspected and (schlieren) were excluded from the data measurement. 
Throughout all measured data following parameters were used: 
* Raw Intensity: 
  Within thresholds (auto threshold)
* Background: 
  Local ring with offset 1 and width 2
* Net Intensity: Raw Intensity - Median background (negative ignored)
* Normalization: None
* Spot size was set to 20x20 pixel in the case of the secon chip

The rest of the data values was chosen as proposed by the Array Pro Software. 
For down stream analysis, the net intensity values (raw-medianBg) were used!
```r
library(googleVis)
op <- options(gvis.plot.tag='chart')
options(stringsAsFactors=FALSE)
library(ggplot2)
library(reshape2)
library(limma)
library("geneplotter")
library(rCharts)
library(ggplot2)
library(sqldf)
library(genefilter)
library("Biostrings")
library("RCurl")
library(arrayQualityMetrics)
library("vsn")
library("MmPalateMiRNA")
library("latticeExtra")
library(CustomMicroarrayPackage)
library(Biostrings)

````r

Reading in the Appswe model

setwd("/media/Data/PHDStudies/ADpaper/testing/Analysis/")

gprDir = file.path(getwd(), "..") #VARDEFINITION analysisPath = file.path( getwd() ) #VARDEFINITION

reading in gpr files

data(chipAnnotDF,package="CustomMicroarrayPackage")

connecting to annotation db

db <- dbConnect(SQLite(), dbname=system.file("data/ncrna.chip2.annotation.db",package="CustomMicroarrayPackage"))

dbListTables(db)

setwd(gprDir)

chipDataL <- readAndProcessChipDataFirstChip( currDir=gprDir)

RG = chipDataL$RG RG$genes$ID = ifelse( RG$genes$ID == "","noSig",RG$genes$ID ) RG$genes$ID = ifelse( RG$genes$ID == "unknown","noSig",RG$genes$ID ) RG$genes$ID = ifelse( RG$genes$ID == "Name","noSig",RG$genes$ID ) RG$genes = RG$genes[,-which(colnames(RG$genes) == "probe.type")]

one modification since the ID has changed in the database

RG$genes[which( RG$genes$ID == "900001_MM" ),]$ID = "900001_MM1"

modification because of old chip -> adding new unique ids to data

oidm = dbGetQuery(db,"SELECT * FROM oligoIDMapping;") RG$genes$Name = RG$genes$ID RG$genes$runIndex = 1:dim(RG$genes)[1] genesNew = merge(x=RG$genes, y=oidm, by.x="Name", by.y="oligoID", all.x=TRUE ) genesNew = genesNew[order(genesNew$runIndex),] RG$genes$ID = paste0("e",genesNew$oligoIDMappingTableID) RG$genes = RG$genes[,-dim(RG$genes)[2]] RG$genes[which( RG$genes$Name == "Cy3_Anchor-spot" ),]$ID = "Cy3_Anchor-spot" RG$genes[which( RG$genes$Name == "noSig" ),]$ID = "noSig"

RG$genes = addProbeTypeAndChipTypeToRGGenes(RG=RG,dbcon=db, anchorName="Cy3_Anchor-spot", noSignal="noSig") # processing the chip

Modify anything related to the RG file here

RG = extractNetIntensityValues(RG, deleteOther=TRUE)

RG$other = NULL

setwd(analysisPath)

Declaration of Analysis Parameters
-------------------------------------------------------------------
```r
#Variable Declarations for downstream analysis
normMethod = "quantile" #VARDEFINITION
ndups = 8 #VARDEFINITION
spacing = 1  #VARDEFINITION
minDetectLim = 0.15  #VARDEFINITION (minimal detection limit)
subsetByManualControl = FALSE  #VARDEFINITION (#)filtering option)
subsetByPMvsMM = TRUE  #VARDEFINITION (filtering option)
orderingVar = "Name"  #VARDEFINITION 
addInterestCand = c("900000","20285","20285_1","20285_2" )  #VARDEFINITION (additional candidates to be labeled)
design = rep(c(1, -1), length(RG$targets[, 1])/2) 
RG$targets
print(paste0( "<h2>Comparison is done by log(Cy5/Cy3), therefore the reference for the fold change is: <b>",RG$targets[1,2],"</b></h2>" ))

QualityControl Analysis before normalization

Print Tips of the chip with fitted Loess (Local Polynomial Regression Fitting) Curve

 plotPrintTipLoess( RG )

Box Plot and Density Plot showing the green and red channel signal distributions

 plotBoxDensityFromRG(RG)

MA Plot of each of the arrays on the Raw Data (M: $log(R)-log(G)$ versus A: $\frac{log2(R)+log2(G)}{2}$ values)

#creating a data.table for ggplot plotting
plotMAofArrays( MA.RG(RG) )

Array Quality Metrics generation before Normalization

arrayQualityMetrics(expressionset = MA.RG(RG), outdir = "AQM_beforeNormalization", force = TRUE, do.logtransform = FALSE)

The file can be accessed here:

cat("<a href=\"AQM_beforeNormalization/index.html\"> Quality Report before Normalization </a>")

Normalization on the raw data

MA = normalizeTwoColorArray(RG, method=normMethod)

Box Plot and Density Plot showing the green and red channel signal distributions after normalization

 plotBoxDensityFromRG(RG.MA(MA))

Array Quality Metrics generation after Normalization

arrayQualityMetrics(expressionset = MA, outdir = "AQM_afterNormalization", force = TRUE, do.logtransform = FALSE)

The file can be accessed here:

cat("<a href=\"AQM_afterNormalization/index.html\"> Quality Report after Normalization </a>")

Filtering of the data

MAnonFilt = MA #backup variable
MA = orderTwoColorArray(MA, by=orderingVar) #Ordering of MA by name 
MA$genes[MA$genes$Name == "900001",]$chipType = "SpikeInControl"#FAKING THE controls in case of First Chip 
MA = MA[MA$genes$probe.type == "MM" | MA$genes$probe.type == "PM",]
MA = filterSpotsBySpikeIn(MA=MA, normMethod=normMethod, subsetByManualControl = subsetByManualControl, subsetByPMvsMM = subsetByPMvsMM, ndups=ndups, spacing=spacing, writeLog=FALSE, design=design )

MA Plot of each of the arrays on the Normalized and Filtered Data (M: $log(R)-log(G)$ versus A: $\frac{log2(R)+log2(G)}{2}$ values)

#creating a data.table for ggplot plotting
plotMAofArrays( MA )

Differential Expression Analysis

MA = orderTwoColorArray(MA, by=orderingVar)
resL = performDiffExpAnalysis(MA, ndups = ndups, spacing=spacing )
tt = resL$ResultTable
#Quering the database of the chip annotation for extra information
additionalData = dbGetQuery(db, paste(
"SELECT *  
FROM oligoIDMapping oidm
JOIN oligo o ON oidm.oligo_oligoTableID = o.oligoTableID
JOIN secondary seco ON seco.secondaryTableID = o.secondary_secondaryTableID
LEFT OUTER JOIN contig c ON c.contigTableID = oidm.contig_contigTableID
LEFT OUTER JOIN annotationSummary ans ON ans.contigAnnotation_contig_contigTableID = c.contigTableID
JOIN geneAnnot ga ON ga.contigAnnotation_contig_contigTableID = c.contigTableID
LEFT OUTER JOIN predictionAdditional preda ON preda.oligo_oligoTableID = o.oligoTableID
WHERE o.oligoTableID IN( ",paste(  sub("e","",tt$ID) , collapse=","),");",sep=""))
#"))
#Adding this data to the Result Table
additionalData$coordinates = paste0(additionalData$contigChromosome, ":", additionalData$contigStart, "-", additionalData$contigEnd)
addData = additionalData[,c("oligoTableID","contigID","gc","length","microgram", "mp","ncRNABioType","ncRNAOrientation", "repeatName", 
            "bindingEfficiency", "contigChromosome","geneRegion", "coordinates", "predictionProgram", "disease", "rsnumber", "gene","geneName","geneTranscriptID", "sequence")]
addData$sequence = as.character( reverseComplement(DNAStringSet( addData$sequence ) ) )
addData$oligoTableID = paste0("e",addData$oligoTableID)
dat.tt = merge( tt, addData, by.x = "ID", by.y="oligoTableID", sort=FALSE )
dat.tt$logPrev = -log10(dat.tt$P.Value)
labelDF.tt = dat.tt[dat.tt$adj.P.Val < 0.05 ,]
if(dim(labelDF.tt)[1] == 0){labelDF.tt = dat.tt[1:20,]; message("There is no significant candidate in the data, therefore the top 20 are displayed")}
#Only in this case we want to display additional candidates: 
if( length(addInterestCand) != 0){
  labelDF.tt = rbind( labelDF.tt, dat.tt[ dat.tt$Name %in% addInterestCand, ] )
  labelDF.tt = unique(labelDF.tt)
}
dat.tt$ncRNABioTypeSub = ifelse( dat.tt$ncRNABioType %in% c("miRNA","snoRNA","tRNA","piRNA","lincRNA"), dat.tt$ncRNABioType, "other" )
dat.tt$Significance = 'p.value > 0.05 (BH)'
dat.tt[dat.tt$adj.P.Val < 0.05 ,]$Significance = 'p.value < 0.05 (BH)'

#writing csv file
fncsv = paste0("resultTable.csv")
write.table(dat.tt,file=fncsv, sep="\t", col.names=TRUE, row.names=FALSE)

The ResultTable can be accessed here:

cat(paste0("<a href=\"",fncsv,"\"> Differential Expression result (csv) </a>"))

MA Plots of the resulting data (M: $log(R)-log(G)$ versus A: $\frac{log2(R)+log2(G)}{2}$ values)

g1 = ggplot( dat.tt, aes(x=AveExpr, y=logFC), environment=environment() ) + geom_point(colour="lightgrey") + 
  labs(title="MAPlot (Averaged over all arrays)", x="Intensity (A)", y="log2 Fold Change (M)") +
  geom_text( data = labelDF.tt, aes( x= AveExpr, y=logFC, label=Name), show_guide = FALSE, size=2, color="red") +
  geom_hline( aes(yintercept=-minDetectLim, linetype="dashed"), show_guide = FALSE ) +
  geom_hline( aes(yintercept=minDetectLim, linetype="dashed"), show_guide = FALSE )
g1
g2 =  ggplot( dat.tt, aes(x=AveExpr, y=logFC, colour = ncRNABioTypeSub), environment=environment() ) + geom_point() + 
  labs(title="MAPlot (Averaged over all arrays)", x="Intensity (A)", y="log2 Fold Change (M)") +
  geom_text( data = labelDF.tt, aes( x= AveExpr, y=logFC, label=Name), show_guide = FALSE, size=2, color="black") +
  geom_hline( aes(yintercept=-minDetectLim, linetype="dashed"), show_guide = FALSE ) +
  geom_hline( aes(yintercept=minDetectLim, linetype="dashed"), show_guide = FALSE )
g2

Vulcano Plot of the resulting Data

g = ggplot( dat.tt, aes(x=logFC, y=logPrev, colour = Significance), environment=environment() ) + geom_point() + scale_color_manual(values=c( "red","lightgrey")) + 
  labs(title="Vulcano Plot (Averaged over all arrays)", x="log2 Fold Change (M)", y="-log10(p-value)") + 
  geom_text( data = labelDF.tt, aes( x= logFC, y=logPrev, label=Name), show_guide = FALSE, size=2, color="black") #-0.07
g

Table Showing the top candidates

````r tablegvis = labelDF.tt[,c("ID","Name","contigID","probe.type","chipType","logFC", "AveExpr","t","P.Value","adj.P.Val","B","ncRNABioType","ncRNAOrientation","geneName")] plot(createGVisTableFromDF(tablegvis))

### Fold Change by condition, of top five differentially expressed candidates
````r
#Creating an Expression set
#Creating an Expression set
minAnnot = unique(dat.tt[,c("Block", "Row","Column","ID","Name","probe.type","chipType","logFC", "AveExpr","ncRNABioTypeSub")]) #Since due to left out join, there could be some entries duplicated whereby only one attributes differe (disease for example is a one to many relation)
MAave = avedups( MA, ndups=ndups, spacing=spacing )
featAnnot = merge( MAave$genes,  minAnnot, by=colnames(MAave$genes), sort=FALSE)
exprsData = MAave$M
rownames(exprsData) = featAnnot$Name
rownames( MAave$targets ) = sub(".*-","", sub("\\.gpr","",rownames( MAave$targets )))
sampleType = paste( rownames( MAave$targets)  ,MAave$targets$Cy5, sep="_") #sample type always needs to be inspected

colnames(exprsData) = rownames(MAave$targets)
rownames(featAnnot) = featAnnot$Name

eSet = ExpressionSet(exprsData, phenoData= AnnotatedDataFrame(MAave$targets), featureData = AnnotatedDataFrame(featAnnot), annotation="customAnnot")
eSet$SampleType = as.factor(sampleType)
discr = as.factor(sub( ".*_","", eSet$SampleType))
rocs = rowpAUCs(eSet, discr,  p=0.2) #calculates the area under the curve

top5Cand = which(names(genefilter::area(rocs)) %in% unique(c(dat.tt[1:5,"Name"],addInterestCand)))

for( j in top5Cand  ){
  y = exprs(eSet[j,])
  ord=order(discr)
  featureName = featureNames(eSet[j,])
  par(mfrow=c(1,2))
  genefilter::plot(rocs[j], main=featureName)
  plot( y[ord], pch=c(1,16)[discr[ord]], col=c("black","red")[discr[ord]], main=featureName, ylab=expression(log[2]~intensity), xlab="samples" )
  text( x=1:24, y=y[ord]+0.01, labels=phenoData(eSet)$Sample[ord], cex = 0.4 )  
}

Interactive Plots (Javascript needs to be enabled)

Interactive MA Plot

````r dataToPlot = dat.tt colnames(dataToPlot)[ which(colnames(dataToPlot) == "P.Value") ] = "PValue" colnames(dataToPlot)[ which(colnames(dataToPlot) == "adj.P.Val") ] = "adjPValue"

dataToPlot = dataToPlot[,c("ID", "Name", "chipType", "logFC", "AveExpr", "PValue", "adjPValue", "ncRNABioType", "geneRegion", "coordinates", "predictionProgram" , "disease", "rsnumber", "gene", "geneName", "geneTranscriptID")] n2 = nPlot( logFC ~ AveExpr, data=dataToPlot, type='scatterChart', group='chipType' ) n2$chart(tooltipContent= "#! function(key, x, y, e){ return 'Name: ' + e.point.Name + '
' + 'PValue: ' + e.point.PValue + '
' + 'adjPValue: ' + e.point.adjPValue + '
' + 'ncRNABioType: ' + e.point.ncRNABioType + '
' + 'geneRegion: ' + e.point.geneRegion + '
' + 'geneName: ' + e.point.geneName + '
' + 'geneTranscriptID: ' + e.point.geneTranscriptID + '
' + 'gene: ' + e.point.gene + '
' + 'coordinates: ' + e.point.coordinates + '
' + 'predictionProgram: ' + e.point.predictionProgram + '
' + 'disease: ' + e.point.disease + '
' + 'rsnumber: ' + e.point.rsnumber + '
' + 'coordinates: ' + e.point.coordinates } !#") n2$print("chart1")

Additional Information: Size Distribution of non-coding RNA contigs on the microarray
----------------------------------------------------------------------

```r
sizeData = dbGetQuery(db, 
"SELECT * FROM oligoIDMapping oidm 
JOIN oligo o ON o.oligoTableID = oidm.oligo_oligoTableID
JOIN contig c ON c.contigTableID = oidm.contig_contigTableID
JOIN annotationSummary annots ON annots.contigAnnotation_contig_contigTableID = c.contigTableID" )

#binning the contigSize Data
#bins of 5 till 100 , then everything in one bin
#first remove all contigs < 18
sizeData = sizeData[sizeData$contigLength > 18,]
sizeData = sizeData[sizeData$category == "PM",]
binningContigLength = function( data, varToBin, binsFrom=18,binsTo=100,binsBy=5 ){

  bins = seq(binsFrom,binsTo, by=binsBy)
  lowerBin = bins
  upperBin = bins + 5
  upperBin[length(upperBin)] = max(sizeData$contigLength)+1

  contigBin = rep("", dim(data)[1])

  for(i in c(1:length(upperBin))  ){
    contigBin[which( data$contigLength < upperBin[i] & data$contigLength >= lowerBin[i])] = paste0(lowerBin[i],"-",upperBin[i]-1)
  }
  return(contigBin)
}

contigBins = binningContigLength(sizeData)
sizeData$contigBins = contigBins

contL = split(sizeData, f=as.factor(sizeData$contigID)  )
contigOligoCoverage = unlist(lapply( contL, function(x){return(dim(x)[1])} ))
rm(contL)
contigData = sizeData
contigData = contigData[!duplicated(contigData$contigID),]
contigData$contigOligoCoverage = contigOligoCoverage

contigData = contigData[,c("contigBins","ncRNABioType","contigOligoCoverage")]
contigData$contigOligoCoverage = as.character(contigData$contigOligoCoverage)
contigData[ !(contigData$ncRNABioType %in% c("miRNA","piRNA","snoRNA","tRNA","rRNA","lincRNA")),]$ncRNABioType = "other"

contigData[,3] = as.numeric(contigData[,3])
contigBins = melt( table(contigData[,1:2]) )

contigData_new = data.frame( "binBio"=paste0(contigData$contigBins,"_",contigData$ncRNABioType), "OligoCov"=contigData[,3] )

for( binbio in unique(contigData_new$binBio)  ){
    occ = which(contigData_new$binBio == binbio)
    summedVal = sum( contigData_new[occ,"OligoCov"] )
    contigData_new[occ,]$OligoCov = summedVal
}
contigData_new = contigData_new[!duplicated(contigData_new$binBio),]
oligoCoverageByBin = data.frame( "contigBins"=sub("_.*","",contigData_new$binBio), 
                                 "ncRNABioType"=sub(".*_","",contigData_new$binBio),
                                 "OligoCoverage"=contigData_new$OligoCov  )

contigBins= merge( contigBins, oligoCoverageByBin, by=c("contigBins","ncRNABioType"), all.x=TRUE )
contigBins$OligoCoverage = ifelse(is.na(contigBins$OligoCoverage), 0, contigBins$OligoCoverage)
colnames(contigBins) = c("contigBins", "ncRNABioType", "Freq", "OligoCoverage")
n1 = nPlot( Freq~contigBins, data = contigBins,group="ncRNABioType", type="multiBarChart" )

n1$chart(tooltipContent= "#! function(key, x, y, e){
  return '<b>Contig Bin: </b> ' + e.point.contigBins + '<br/>' + 
         '<b>ncRNA Biotype: </b> ' + e.point.ncRNABioType + '<br/>' + 
         '<b>Contigs in Bin: </b> ' + e.point.Freq + '<br/>' + 
         '<b>OligoCoverage: </b> ' + e.point.OligoCoverage
} !#")

n1$print("chart2")

````r save.image(file="session.rda")

Additional Information for reproducibility 
----------------------------------------------------------------------
### R package versions
````r
sessionInfo()

````r

analysisTemplate1.0 = readLines("/media/Rstick/ADpaper/workspace/CustomMicroarrayPackage/src/AnalysisTemplate1_0.Rmd")

save(analysisTemplate1.0, file="/media/Rstick/ADpaper/workspace/CustomMicroarrayPackage/data/analysisTemplate1.0.rda")

to complete: hsaPiRNAs

c("e590", "e649", "e604", "e473", "e599", "e977", "e712", "e511", "e749", "e653", "e279", "e388", "e290", "e110",

"e655", "e112", "e129", "e643", "e519", "e652", "e268", "e647", "e322", "e196", "e967", "e199", "e725", "e486",

"e139", "e266", "e934")

```



SimonSchafferer/custom-neuro-ncRNA-microarray documentation built on May 9, 2019, 1:45 p.m.