## Writing our own functions
## library(XLConnect)
#library(plotly)
#library(ReporteRs)
#library(svglite)
library(ggplot2)
library(mixOmics)
library(grid)
library(ggrepel)
library(gridExtra)
data.raw <- read.csv("./data/mx 107155 _lung cancer tissue_summer course_08-2015_submit.csv", header = F, stringsAsFactors = F)
data.raw <- data.raw[,-5] ## We get rid of the mass spec column.
head(data.raw)
data.raw[20,10] # inspecting the value of row number 20 and column number 10
data.raw[1:5,8:12] # inspecting the value of raw number 1 to 5 and column number 8 t 12
data.raw[,1] # first column – metabolite names
data.raw[1,] # first row – sample names
data.raw[10,] # class information
cpd.names <- data.raw[,1]
cpd.names[1:20]
#### Internal Standards
data.raw.is <- data.raw[grep("^z ",cpd.names),] # make a separate data frame for Internal standards.
grep("z ",cpd.names)#which rows contain starting with z
###
data.raw <- data.raw[-grep("^z ",cpd.names),] # get rid of the internal standards.
## Indexing information
grep("BinBase", data.raw[,1])
help(grep)
metstart.index <- (grep("BinBase",data.raw[,1])+1) # get the row from where we have metabolites are starting.
row.ind <- metstart.index:nrow(data.raw) # row numbers that has peak intensities.
col.ind <- grep("bbdsa|cbdsa|bddsa", data.raw[1,]) # column numbers that has peak intensities.
### Header Details
sample.header <- data.raw[1:(metstart.index-1),(col.ind[1]-1)]
metabolite.header <- data.raw [grep("BinBase",data.raw[,1]),1:(col.ind[1]-1)]
## Metabolites meta data
cpd.metadata <- data.raw[ metstart.index:nrow(data.raw) , 1:(col.ind[1]-1) ]
## Subject MetabData
subject.metadata <- data.raw[ 1:(metstart.index-1), (col.ind[1]):ncol(data.raw) ]
dim(subject.metadata)
head(cpd.metadata,10)
tail(cpd.metadata,10)
table(as.character(subject.metadata[10,])) ## Get the levels in the phenotype column.
## Peak Intensities
numeric.data <- data.raw[row.ind,col.ind] # make a dataframe for only the metabolites.
#Principal Component Analysis
pcamat <- do.call("cbind",lapply(numeric.data,as.numeric))#takes certain columns and stitch them together, as a matrix
pcamat[which(is.na(pcamat)==TRUE)] <- min(pcamat[which(is.na(pcamat)==FALSE)])#replace missing value with the minimum
pcamat[which(pcamat==0)] <- min(pcamat[-which(pcamat==0)])##IMPT, replacing 0 with the min value in the matrix that is not 0
which(is.na(pcamat)==TRUE)
pca.res <- pca(pcamat, ncomp=2,max.iter=100)
pca.res
pca.res$loadings
typeof(pca.res)
pc.scores <- pca.res$loadings[[1]]
# combine subject metadata and pc_scores in one data frame
subject.metadata.transpose <- as.data.frame(t(subject.metadata), stringsAsFactors = F) ## data frame need to be trasponsed so can be combined with PCA results.
#t is transpose
pcdf <- cbind(subject.metadata.transpose,pc.scores)
typeof(pcdf) #
str(pcdf)
is.matrix(pcdf) #
is.data.frame(pcdf) #
colnames(pcdf) <- c(sample.header,"PC1","PC2")
p1 <- ggplot(pcdf, aes(PC1,PC2)) +
scale_y_continuous(paste("PC2 - variance explained : ",signif(pca.res$explained_variance[2],digit=4),"(%) ",sep="")) +
scale_x_continuous(paste("PC1 - variance explained : ",signif(pca.res$explained_variance[1],digit=4),"(%) ",sep="")) +
theme_bw() +
labs(title = "Principal component analysis (PCA)") +
theme(
plot.title = element_text(face="bold", size=14),
axis.title.x = element_text(face="bold", size=12),
axis.title.y = element_text(face="bold", size=12, angle=90),
panel.grid.major = element_blank(), # switch off major gridlines
panel.grid.minor = element_blank(), # switch off minor gridlines
legend.position = c(0.1,0.1), # manually position the legend (numbers being from 0,0 at bottom left of whole plot to 1,1 at top right)
#top to bottom, left to right
legend.title = element_blank(), # switch off the legend title
legend.text = element_text(size=12),
legend.key.size = unit(1.5, "lines"),
legend.key = element_blank(), # switch off the rectangle around symbols in the legend
legend.spacing = unit(.05, "cm"),
axis.text.x = element_text(size=0,angle = 45, hjust = 1),
axis.text.y = element_text(size=0,angle = 45, hjust = 1)
)
p2 <- p1 + geom_point(aes(shape=phenotype, colour=phenotype ), size=3)#scattered plot
p3 <- p2 + stat_ellipse( aes(colour=phenotype ), linetype = 1 )
plot(p2)
plot(p3)
ggsave("pcaplot.png", width = 12, height = 8, dpi = 300)
ggsave("pcaplot.pdf", width = 12, height = 8, dpi = 300)
# t.test
groupA.ind <- which(subject.metadata[10,]=="non-malignant") ## non-cancer tissues, location by "which"
groupB.ind <- which(subject.metadata[10,]=="tumor") ## tumor tissues
groupc#if there are more than two groups
table(subject.metadata)###inspect how many groups there are -----?
## we need to create a function for running ttest.
runttest <- function(index) {
results <- t.test(pcamat[index,groupA.ind],pcamat[index,groupB.ind])
foldchange <- median(pcamat[index,groupA.ind])/median(pcamat[index,groupB.ind])
unlist(c(cpd.metadata[index,],results$p.value,foldchange))
}
index = 8
results <- t.test(pcamat[index,groupA.ind],pcamat[index,groupB.ind])
pcamat[index,groupA.ind]
pcamat[index,groupB.ind]
foldchange <- median(pcamat[index,groupA.ind])/median(pcamat[index,groupB.ind])
runttest(1) ## run the ttest for metabolite 1.
ttest.results <- do.call(rbind,lapply(1:nrow(pcamat), runttest)) ## run the ttest for all the compounds and combine that results into one dataframe.
1:nrow(pcamat)
lapply(1:nrow(pcamat), runttest)
head(ttest.results,10)
colnames(ttest.results) <- c(metabolite.header,"PValue","FoldChange")
head(ttest.results,10)
ttest.results <- as.data.frame(ttest.results, stringsAsFactors = F)
ttest.results$PValue <- as.numeric(ttest.results$PValue )
ttest.results$FoldChange <- as.numeric(ttest.results$FoldChange )
pvalue.adjusted <- p.adjust(ttest.results$PValue,method = "fdr") ## FDR adjustment of PValues #p.adjust.methods
# c("holm", "hochberg", "hommel", "bonferroni", "BH", "BY",
# "fdr", "none")
ttest.results <- cbind(ttest.results,PValue.Adjusted=pvalue.adjusted)
write.table(ttest.results, file="ttestresults.csv", col.names = T, row.names = F,quote = F, sep = "\t") ## export the results to an external file.
# Volcano plot
volcanodf <- data.frame(name=ttest.results$`BinBase name`, pvalue=ttest.results$PValue, foldchange = ttest.results$FoldChange)
volcanodf$Significant <- ifelse(volcanodf$pvalue < 0.005, "PValue < 0.005", "Not Sig")
table(volcanodf$Significant)##how many are signigicant
pv1 <- ggplot(volcanodf, aes(x = log10(foldchange), y = -log10(pvalue))) +
geom_point(aes(color = Significant)) +
scale_color_manual(values = c("red", "grey")) +
theme_bw(base_size = 16) +
geom_text_repel(
data = subset(volcanodf, pvalue < 0.005),
aes(label = name),
size = 5,
box.padding = unit(0.35, "lines"),
point.padding = unit(0.3, "lines")
)
plot(pv1)
ggsave("volcano-1.png", width = 12, height = 8, dpi = 600)
# box-whisker plots
data_bw <- cbind(as.data.frame(t(pcamat)),phenotype =pcdf$phenotype) ##can add extra columns for dif grouping options to draw box plot
data_bw
str(data_bw)
k = 4 # compound:valine
f1 = ggplot(data_bw, aes(phenotype, data_bw[[ colnames(data_bw) [ k ]]]) ) +
geom_boxplot(fill = "grey", colour = "#3366FF",outlier.shape = 4) +
geom_point(aes(color=phenotype), size=2, position = position_jitterdodge()) +
scale_y_continuous("Metabolite levels (normalized ion count)") +
scale_shape_manual(values=c(15,19,17,5)) +
scale_fill_manual(values=c("#CC0000", "#006600", "#669999", "#00CCCC","#660099", "#CC0066", "#FF9999", "#FF9900","black", "black", "black", "black", "black")) +
theme_bw() + # make the theme black-and-white rather than grey (do this before font changes, or it overrides them)
labs(title = paste(" Levels of ", cpd.metadata$V1[ k ],sep=" : ")) +
theme(
plot.title = element_text(face="bold", size=14), # use theme_get() to see available options
axis.title.x = element_text(face="bold", size=12),
axis.title.y = element_text(face="bold", size=12, angle=90),
panel.grid.major = element_blank(), # switch off major gridlines
panel.grid.minor = element_blank(), # switch off minor gridlines
legend.position = c(0.1,0.9), # manually position the legend (numbers being from 0,0 at bottom left of whole plot to 1,1 at top right)
legend.title = element_blank(), # switch off the legend title
legend.text = element_text(size=12),
legend.key.size = unit(1.5, "lines"),
legend.key = element_blank(), # switch off the rectangle around symbols in the legend
legend.margin = unit(.05, "cm"),
axis.text.x = element_text(size=12,angle = 45, hjust = 1)
)
plot(f1)
## let's make a loop to go over all the significant metabolites.IMPT!!
topcpds.index <- order(ttest.results$PValue) [1:10]
for (k in topcpds.index) {
f1 = ggplot(data_bw, aes(phenotype, data_bw[[ colnames(data_bw) [ k ]]]) ) +
geom_boxplot(fill = "grey", colour = "#3366FF",outlier.shape = 3) +
geom_point(aes(color=phenotype), size=2, position = position_jitterdodge()) +
scale_y_continuous("Metabolite levels (normalized ion count)") +
scale_shape_manual(values=c(15,19,17,5)) +
scale_fill_manual(values=c("#CC0000", "#006600", "#669999", "#00CCCC","#660099", "#CC0066", "#FF9999", "#FF9900","black", "black", "black", "black", "black")) +
theme_bw() + # make the theme black-and-white rather than grey (do this before font changes, or it overrides them)
labs(title = paste(" Levels of ", cpd.metadata$V1[ k ],sep=" : ")) +
theme(
plot.title = element_text(face="bold", size=14), # use theme_get() to see available options
axis.title.x = element_text(face="bold", size=12),
axis.title.y = element_text(face="bold", size=12, angle=90),
panel.grid.major = element_blank(), # switch off major gridlines
panel.grid.minor = element_blank(), # switch off minor gridlines
legend.position = c(0.1,0.9), # manually position the legend (numbers being from 0,0 at bottom left of whole plot to 1,1 at top right)
legend.title = element_blank(), # switch off the legend title
legend.text = element_text(size=12),
legend.key.size = unit(1.5, "lines"),
legend.key = element_blank(), # switch off the rectangle around symbols in the legend
legend.margin = unit(.05, "cm"),
axis.text.x = element_text(size=12,angle = 45, hjust = 1)
)
plotname <- paste0("bw_plots_",cpd.metadata$v1[k],".jpg")###name is the compound name accordingly
ggsave(plotname, width = 12, height = 8, dpi = 84)
}
cpd.metadata$V1[6]
## add multiple graphs on one page of pdf.
plot.box.whisker <- function(k) {
f1 = ggplot(data_bw, aes(phenotype, data_bw[[ colnames(data_bw) [ k ]]]) ) +
geom_boxplot(fill = "grey", colour = "#3366FF",outlier.shape = 3) +
geom_point(aes(color=phenotype), size=2, position = position_jitterdodge()) +
scale_y_continuous("Metabolite levels (normalized ion count)") +
scale_shape_manual(values=c(15,19,17,5)) +
scale_fill_manual(values=c("#CC0000", "#006600", "#669999", "#00CCCC","#660099", "#CC0066", "#FF9999", "#FF9900","black", "black", "black", "black", "black")) +
theme_bw() + # make the theme black-and-white rather than grey (do this before font changes, or it overrides them)
labs(title = paste(" Levels of ", cpd.metadata$V1[ k ],sep=" : ")) +
theme(
plot.title = element_text(face="bold", size=14), # use theme_get() to see available options
axis.title.x = element_text(face="bold", size=12),
axis.title.y = element_text(face="bold", size=12, angle=90),
panel.grid.major = element_blank(), # switch off major gridlines
panel.grid.minor = element_blank(), # switch off minor gridlines
legend.position = c(0.1,0.9), # manually position the legend (numbers being from 0,0 at bottom left of whole plot to 1,1 at top right)
legend.title = element_blank(), # switch off the legend title
legend.text = element_text(size=12),
legend.key.size = unit(1.5, "lines"),
legend.key = element_blank(), # switch off the rectangle around symbols in the legend
legend.margin = unit(.05, "cm"),
axis.text.x = element_text(size=12,angle = 45, hjust = 1)
)
f1
}
plotlist <- lapply(topcpds.index,plot.box.whisker) # we collect all the plots into a list object
ml <- marrangeGrob(plotlist, nrow=2,ncol=3) # then we arrange the list
ggsave("multipage.pdf", ml)
# write an if statement to get the names of all the internal standards
for (i in cpd.names) {
if( length(grep('^z ',i)>0) ){
print(i)
}
}
# getting the row number and name of int standards.
for (i in 1:length(cpd.names)) {
if( length(grep('^z ',cpd.names[i])>0) ){
print(c(i,cpd.names[i]))
}
}
# lets get all the samples ids that has a label “tumor”
data.raw[1, grep("tumor",data.raw[10,] ) ]
# lets get all the columns that has sample data,
data.raw[1,grep("bbdsa", data.raw[1,])]
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.