inst/doc/Linnorm_User_Manual.R

## ---- echo=TRUE---------------------------------------------------------------
library(Linnorm)

#Obtain data
data(Islam2011)

#Do not filter gene list:
Transformed <- Linnorm(Islam2011)

#Filter low count genes
FTransformed <- Linnorm(Islam2011, Filter=TRUE)

## ---- echo=TRUE---------------------------------------------------------------
#You can use this file with Excel.
#This file includes all genes.
write.table(Transformed, "Transformed.txt",
quote=FALSE, sep="\t", col.names=TRUE, row.names=TRUE)

#This file filtered low count genes.
write.table(FTransformed, "Transformed.txt",
quote=FALSE, sep="\t", col.names=TRUE, row.names=TRUE)

## ---- echo=TRUE---------------------------------------------------------------
library(Linnorm)
Normalized <- Linnorm.Norm(Islam2011)
#Important: depending on downstream analysis, please
#set output to be "XPM" or "Raw".
#Set to "XPM" if downstream tools will convert the 
#dataset into CPM or TPM.
#Set to "Raw" if input is raw counts and downstream 
#tools will work with raw counts.

## ---- echo=TRUE---------------------------------------------------------------
#You can use this file with Excel.
write.table(Normalized, "Normalized.txt",
quote=FALSE, sep="\t", col.names=TRUE, row.names=TRUE)

## ---- echo=TRUE---------------------------------------------------------------
library(Linnorm)
data(LIHC)

## ---- echo=TRUE---------------------------------------------------------------
#Now, we can calculate fold changes between
#sample set 1 and sample set 2.
#Index of sample set 1
set1 <- 1:5
#Index of sample set 2
set2 <- 6:10

#Normalization
LIHC2 <- Linnorm.Norm(LIHC,output="XPM")

#Optional: Only use genes with at least 50%
#of the values being non-zero
LIHC2 <- LIHC2[rowSums(LIHC2 != 0) >= ncol(LIHC2)/2,]

## ---- echo=TRUE---------------------------------------------------------------
#Put resulting data into a matrix
FCMatrix <- matrix(0, ncol=1, nrow=nrow(LIHC2))
rownames(FCMatrix) <- rownames(LIHC2)
colnames(FCMatrix) <- c("Log 2 Fold Change")
FCMatrix[,1] <- log((rowMeans(LIHC2[,set1])+1)/(rowMeans(LIHC2[,set2])+1),2)
#Now FCMatrix contains fold change results.

#If the optional filtering step is not done,
#users might need to remove infinite and zero values:
#Remove Infinite values.
FCMatrix <- FCMatrix[!is.infinite(FCMatrix[,1]),,drop=FALSE]
#Remove Zero values
FCMatrix <- FCMatrix[FCMatrix[,1] != 0,,drop=FALSE]

#Now FCMatrix contains fold change results.

## ---- echo=TRUE, fig.height=6, fig.width=6------------------------------------
Density <- density(FCMatrix[,1])
plot(Density$x,Density$y,type="n",xlab="Log 2 Fold Change of TPM+1",
ylab="Probability Density",)
lines(Density$x,Density$y, lwd=1.5, col="blue")
title("Probability Density of Fold Change.\nTCGA Partial LIHC data
Paired Tumor vs Adjacent Normal")
legend("topright",legend=paste("mean = ", 
round(mean(FCMatrix[,1]),2),
"\nStdev = ", round(sd(FCMatrix[,1]),2)))

## ---- echo=TRUE, fig.height=6, fig.width=6------------------------------------
#Normalization with BE_strength set to 1.
#This increases normalization strength.
LIHC2 <- Linnorm.Norm(LIHC,output="XPM",BE_strength=1)

#Optional: Only use genes with at least 50%
#of the values being non-zero
LIHC2 <- LIHC2[rowSums(LIHC2 != 0) >= ncol(LIHC2)/2,]

FCMatrix <- matrix(0, ncol=1, nrow=nrow(LIHC2))
rownames(FCMatrix) <- rownames(LIHC2)
colnames(FCMatrix) <- c("Log 2 Fold Change")
FCMatrix[,1] <- log((rowMeans(LIHC2[,set1])+1)/(rowMeans(LIHC2[,set2])+1),2)
#Now FCMatrix contains fold change results.


Density <- density(FCMatrix[,1])
plot(Density$x,Density$y,type="n",xlab="Log 2 Fold Change of TPM+1",
ylab="Probability Density",)
lines(Density$x,Density$y, lwd=1.5, col="blue")
title("Probability Density of Fold Change.\nTCGA Partial LIHC data
Paired Tumor vs Adjacent Normal")
legend("topright",legend=paste("mean = ", 
round(mean(FCMatrix[,1]),2),
"\nStdev = ", round(sd(FCMatrix[,1]),2)))

## ---- echo=TRUE---------------------------------------------------------------
library(Linnorm)
data(Islam2011)
#Obtain transformed data
Transformed <- Linnorm(Islam2011)

#Data Imputation
DataImputed <- Linnorm.DataImput(Transformed)

#Or, users can directly perform data imputation during transformation.
DataImputed <- Linnorm(Islam2011,DataImputation=TRUE)

## ---- echo=TRUE---------------------------------------------------------------
#You can use this file with Excel.
write.table(DataImputed, "DataImputed.txt",
quote=FALSE, sep="\t", col.names=TRUE, row.names=TRUE)

## ---- echo=TRUE---------------------------------------------------------------
library(Linnorm)
data(Islam2011)
#Obtain stable genes
StableGenes <- Linnorm.SGenes(Islam2011)

## ---- echo=TRUE---------------------------------------------------------------
#You can use this file with Excel.
write.table(StableGenes, "StableGenes.txt",
quote=FALSE, sep="\t", col.names=TRUE, row.names=TRUE)

## ---- echo=TRUE---------------------------------------------------------------
library(Linnorm)
data(LIHC)
datamatrix <- LIHC

## ---- echo=TRUE---------------------------------------------------------------
#5 samples for condition 1 and 5 samples for condition 2.
#You might need to edit this line
design <- c(rep(1,5),rep(2,5))
#These lines can be copied directly.
design <- model.matrix(~ 0+factor(design))
colnames(design) <- c("group1", "group2")
rownames(design) <- colnames(datamatrix)

## ---- echo=TRUE---------------------------------------------------------------
library(Linnorm)
#The Linnorm-limma pipeline only consists of one line.
DEG_Results <- Linnorm.limma(datamatrix,design)
#The DEG_Results matrix contains your DEG analysis results.

## ---- echo=TRUE---------------------------------------------------------------
library(Linnorm)
#Just add output="Both" into the argument list
BothResults <- Linnorm.limma(datamatrix,design,output="Both")

#To separate results into two matrices:
DEG_Results <- BothResults$DEResults
TransformedMatrix <- BothResults$Linnorm
#The DEG_Results matrix now contains DEG analysis results.
#The TransformedMatrix matrix now contains a Linnorm
#Transformed dataset.

## -----------------------------------------------------------------------------
write.table(DEG_Results, "DEG_Results.txt", quote=FALSE, sep="\t",
col.names=TRUE,row.names=TRUE)

## ---- echo=TRUE---------------------------------------------------------------
Genes10 <- DEG_Results[order(DEG_Results[,"adj.P.Val"]),][1:10,]
#Users can print the gene list by the following command:
#print(Genes10)

#logFC: log 2 fold change of the gene.
#XPM: If input is raw count or CPM, XPM is CPM.
#	If input is RPKM, FPKM or TPM, XPM is TPM.
#t: moderated t statistic.
#P.Value: p value.
#adj.P.Val: Adjusted p value. This is also called False Discovery Rate or q value.}
#B: log odds that the feature is differential.

#Note all columns are printed here

## ----kable1, echo=FALSE-------------------------------------------------------
library(knitr)
kable(Genes10[,c(1:5)], digits=4)

## ---- echo=TRUE---------------------------------------------------------------
SignificantGenes <- DEG_Results[DEG_Results[,5] <= 0.05,1]

## ---- echo=TRUE, fig.height=6, fig.width=6------------------------------------
plot(x=DEG_Results[,1], y=DEG_Results[,5], col=ifelse(DEG_Results[,1] %in%
SignificantGenes, "red", "green"),log="y", ylim = 
rev(range(DEG_Results[,5])), main="Volcano Plot", 
xlab="log2 Fold Change", ylab="q values", cex = 0.5)

## ---- echo=TRUE---------------------------------------------------------------
library(Linnorm)
data(Islam2011)
IslamData <- Islam2011[,1:92]

## ---- echo=TRUE---------------------------------------------------------------
#48 samples for condition 1 and 44 samples for condition 2.
#You might need to edit this line
design <- c(rep(1,48),rep(2,44))
#There lines can be copied directly.
design <- model.matrix(~ 0+factor(design))
colnames(design) <- c("group1", "group2")
rownames(design) <- colnames(IslamData)

## ---- echo=TRUE---------------------------------------------------------------
#Example 1: Filter low count genes.
#and genes with high technical noise.
scRNAseqResults <- Linnorm.limma(IslamData,design,Filter=TRUE)


#Example 2: Do not filter gene list.
scRNAseqResults <- Linnorm.limma(IslamData,design)

## ---- echo=TRUE---------------------------------------------------------------
data(Islam2011)

#Obtain embryonic stem cell data
datamatrix <- Islam2011[,1:48]

## ---- echo=TRUE---------------------------------------------------------------
#Setting plotNetwork to TRUE will create a figure file in your current directory.
#Setting it to FALSE will stop it from creating a figure file, but users can plot it
#manually later using the igraph object in the output.
#For this vignette, we will plot it manually in step 4.
results <- Linnorm.Cor(datamatrix, plotNetwork=FALSE,
#Edge color when correlation is positive
plot.Pos.cor.col="red",
#Edge color when correlation is negative
plot.Neg.cor.col="green")

## ---- echo=TRUE---------------------------------------------------------------
Genes10 <- results$Results[order(results$Results[,5]),][1:10,]
#Users can print the gene list by the following command:
#print(Genes10)

## ----kable2, echo=FALSE-------------------------------------------------------
library(knitr)
kable(Genes10, digits=4, row.names=FALSE)

## ---- echo=TRUE, fig.height=6, fig.width=6------------------------------------
library(igraph)
Thislayout <- layout_with_kk(results$igraph)
plot(results$igraph,
#Vertex size
vertex.size=2,
#Vertex color, based on clustering results
vertex.color=results$Cluster$Cluster,
#Vertex frame color
vertex.frame.color="transparent",
#Vertex label color (the gene names)
vertex.label.color="black",
#Vertex label size
vertex.label.cex=0.05,
#This is how much the edges should be curved.
edge.curved=0.1,
#Edge width
edge.width=0.05,
#Use the layout created previously
layout=Thislayout
)

## ---- echo=TRUE---------------------------------------------------------------
TheCluster <- which(results$Cluster[,1] == "Mmp2")

## ---- echo=TRUE---------------------------------------------------------------
#Index of the genes
ListOfGenes <- which(results$Cluster[,2] == TheCluster)

#Names of the genes
GeneNames <- results$Cluster[ListOfGenes,1]

#Users can write these genes into a file for further analysis.

## ---- echo=TRUE---------------------------------------------------------------
top100 <- results$Results[order(results$Results[,4],decreasing=FALSE)[1:100],1]

## ---- echo=TRUE---------------------------------------------------------------
Top100.Cor.Matrix <- results$Cor.Matrix[top100,top100]

## ---- echo=TRUE, fig.height=6, fig.width=6------------------------------------
library(RColorBrewer)
library(gplots)
heatmap.2(as.matrix(Top100.Cor.Matrix),
#Hierarchical clustering on both row and column
Rowv=TRUE, Colv=TRUE,
#Center white color at correlation 0
symbreaks=TRUE,
#Turn off level trace
trace="none",
#x and y axis labels
xlab = 'Genes', ylab = "Genes",
#Turn off density info
density.info='none',
#Control color
key.ylab=NA,
col=colorRampPalette(c("blue", "white", "yellow"))(n = 1000),
lmat=rbind(c(4, 3), c(2, 1)),
#Gene name font size
cexRow=0.3,
cexCol=0.3,
#Margins
margins = c(8, 8)
)

## ---- echo=TRUE---------------------------------------------------------------
data(Islam2011)


## ---- echo=TRUE---------------------------------------------------------------
#The first 48 columns are the embryonic stem cells.
results <- Linnorm.HVar(Islam2011[,1:48])

## ---- echo=TRUE---------------------------------------------------------------
resultsdata <- results$Results
Genes10 <- resultsdata[order(resultsdata[,"q.value"]),][1:10,3:5]
#Users can print the gene list by the following command:
#print(Genes10)

## ----kable3, echo=FALSE-------------------------------------------------------
library(knitr)
kable(Genes10, digits=4)

## ---- echo=TRUE, fig.height=6, fig.width=6------------------------------------
print(results$plot$plot)
#By default, this plot highlights genes/features with p value less than 0.05.

## ---- echo=TRUE---------------------------------------------------------------
library(Linnorm)
data(Islam2011)

## ---- echo=TRUE---------------------------------------------------------------
tSNE.results <- Linnorm.tSNE(Islam2011[,1:92])

## ---- echo=TRUE, fig.height=6, fig.width=6------------------------------------
#Here, we can see two clusters.
print(tSNE.results$plot$plot)

## ---- echo=TRUE---------------------------------------------------------------
#The first 48 samples belong to mouse embryonic stem cells.
Groups <- rep("ES_Cells",48)
#The next 44 samples are mouse embryonic fibroblasts.
Groups <- c(Groups, rep("EF_Cells",44))

## ---- echo=TRUE---------------------------------------------------------------
#Useful arguments:
#Group:
#allows user to provide each sample's information.
#num_center:
#how many clusters are supposed to be there.
#num_PC:
#how many principal components should be used in k-means
#clustering.

tSNE.results <- Linnorm.tSNE(Islam2011[,1:92],
 Group=Groups, num_center=2)

## ---- echo=TRUE, fig.height=6, fig.width=6------------------------------------
#Here, we can see two clusters.
print(tSNE.results$plot$plot)

## ---- echo=TRUE---------------------------------------------------------------
library(Linnorm)
data(Islam2011)

## ---- echo=TRUE---------------------------------------------------------------
PCA.results <- Linnorm.PCA(Islam2011[,1:92])

## ---- echo=TRUE, fig.height=6, fig.width=6------------------------------------
#Here, we can see multiple clusters.
print(PCA.results$plot$plot)

## ---- echo=TRUE---------------------------------------------------------------
#The first 48 samples belong to mouse embryonic stem cells.
Groups <- rep("ES_Cells",48)
#The next 44 samples are mouse embryonic fibroblasts.
Groups <- c(Groups, rep("EF_Cells",44))

## ---- echo=TRUE---------------------------------------------------------------
#Useful arguments:
#Group:
#allows user to provide each sample's information.
#num_center:
#how many clusters are supposed to be there.
#num_PC
#how many principal components should be used in k-means
#clustering.

PCA.results <- Linnorm.PCA(Islam2011[,1:92],
Group=Groups, num_center=2, num_PC=3)

## ---- echo=TRUE, fig.height=6, fig.width=6------------------------------------
#Here, we can see two clusters.
print(PCA.results$plot$plot)

## ---- echo=TRUE---------------------------------------------------------------
data(Islam2011)
Islam <- Islam2011[,1:92]

## ---- echo=TRUE---------------------------------------------------------------
#48 ESC, 44 EF, and 4 NegCtrl
Group <- c(rep("ESC",48),rep("EF",44))
colnames(Islam) <- paste(colnames(Islam),Group,sep="_")

## ---- echo=TRUE---------------------------------------------------------------
#Note that there are 3 known clusters.
HClust.Results <- Linnorm.HClust(Islam,Group=Group,
num_Clust=2, fontsize=1.5, Color = c("Red","Blue"), RectColor="Green")

## ---- echo=TRUE, fig.height=6, fig.width=6------------------------------------
print(HClust.Results$plot$plot)

## ---- echo=TRUE---------------------------------------------------------------
library(Linnorm)
data(SEQC)
SampleA <- SEQC

## ---- echo=TRUE---------------------------------------------------------------
#This will generate two sets of RNA-seq data with 5 replicates each. 
#It will have 20000 genes totally with 2000 genes being differentially 
#expressed. It has the Negative Binomial distribution.
SimulatedData <- RnaXSim(SampleA)

## ---- echo=TRUE---------------------------------------------------------------
#Index of differentially expressed genes.
DE_Index <- SimulatedData[[2]]

#Expression Matrix
ExpMatrix <- SimulatedData[[1]]

#Sample Set 1
Sample1 <- ExpMatrix[,1:3]

#Sample Set 2
Sample2 <- ExpMatrix[,4:6]

## ---- echo=TRUE---------------------------------------------------------------
data(SEQC)
SampleA <- SEQC

## ---- echo=TRUE---------------------------------------------------------------
library(Linnorm)
SimulatedData <- RnaXSim(SampleA,
distribution="Gamma", #Distribution in the simulated dataset.
#Put "NB" for Negative Binomial, "Gamma" for Gamma,
#"Poisson" for Poisson and "LogNorm" for Log Normal distribution.
NumRep=5, #Number of replicates in each sample set. 
#5 will generate 10 samples in total.
NumDiff = 1000, #Number of differentially expressed genes.
NumFea = 5000 #Total number of genes in the dataset
)

## ---- echo=TRUE---------------------------------------------------------------
#Index of differentially expressed genes.
DE_Index <- SimulatedData[[2]]

#Expression Matrix
ExpMatrix <- SimulatedData[[1]]

#Simulated Sample Set 1
Sample1 <- ExpMatrix[,1:3]

#Simulated Sample Set 2
Sample2 <- ExpMatrix[,4:6]

Try the Linnorm package in your browser

Any scripts or data that you put into this service are public.

Linnorm documentation built on Nov. 8, 2020, 6:48 p.m.