# Script for looking at MSPathfinder outputs: *.IcTda.tsv files
# Metadata for Lewy study added for customization
# And finally make MSnSet object
library(sqldf)
library(reshape2)
##Concatenate .tsv files and add filename as 'file'
files <- list.files(path = "../notinst/extdata/MSPathfinder/HumanDB_default/",
pattern = "*IcTda.tsv.xz",
full.names = TRUE)
dataTable <- NULL
for (f in files) {
dat <- read.csv(f, header=T, sep="\t", na.strings="", colClasses="character")
dat$file <- sub("_IcTda\\.tsv\\.xz","",basename(f))
dataTable <- rbind(dataTable, dat)
}
#Filter on QValue
dataTable_f <- subset(dataTable, QValue <= 0.01)
#bring in and attach metadata
metaData <- read.delim("../notinst/extdata/lewy_intact_meta.txt", header = T, as.is=T)
rownames(metaData) <- metaData$sample.id
# metaData <- read.delim("../inst/extdata/metaData.txt", header = T, as.is=T)
dataTable_f <- merge(dataTable_f, metaData, by.x = "file", by.y = "file")
#Create Proteoform identifier
proteoformTable <- sqldf("Select *, ProteinName || '_' || Composition
|| '_s' || Start || '-e' || End as Proteoform
From dataTable_f")
# #Get identification stats
# PrSMs <- sqldf("Select file, `sample.id`, runOrder, prepBatch, homogenizationBatch, count(file) as PrSMs
# From dataTable_f
# group by file")
#
# Proteoforms <- sqldf("Select count(DISTINCT Proteoform) as Proteoforms
# From proteoformTable
# group by file")
#
# Proteins <- sqldf("Select count(DISTINCT ProteinName) as Proteins
# From dataTable_f
# group by file")
#
# id_stats <- cbind(PrSMs, Proteoforms, Proteins)
# rownames(id_stats) <- id_stats$alias
# Proteoform Level crosstab for expression slot of MSnSet
#-------------------------------------------------------
for_xtab <- sqldf("Select `sample.id`, Proteoform, ProteinName, Modifications,
Count(Proteoform) As Spectral_Counts
From proteoformTable
Group By `sample.id`, Proteoform")
proteoform_count_xtab <- acast(for_xtab, Proteoform ~ `sample.id`,
value.var = "Spectral_Counts", fill=0)
# proteoform_count_xtab[is.na(proteoform_count_xtab)] <- 0
#-------------------------------------------------------
# feature data
#-------------------------------------------------------
proteinMap <- sqldf("Select ProteinName, Modifications, Proteoform
From for_xtab
Group by Proteoform")
rownames(proteinMap) <- proteinMap$Proteoform
#-------------------------------------------------------
library("MSnbase")
m <- MSnSet(exprs = proteoform_count_xtab,
fData = proteinMap[rownames(proteoform_count_xtab),],
pData = metaData[colnames(proteoform_count_xtab),])
stopifnot(validObject(m))
# # tweak pData
# a <- pData(m)$alias
# aa <- ifelse(grepl("Cs", a), "case",
# ifelse(grepl("Ct1", a), "control.1", "control.2"))
# pData(m)$subject.type <- factor(aa, levels=c("control.2", "control.1", "case"))
# am <- sub("(\\d+)C.*", "\\1", a)
# pData(m)$match.group <- sprintf("%02d", as.numeric(am))
# pData(m)$match.group <- as.factor(pData(m)$match.group)
# pData(m)$sample.id <- pData(m)$alias
# pData(m)$alias <- NULL
# tweak pData
pData(m)$subject.type <- factor(pData(m)$subject.type,
levels=c("control.2", "control.1", "case"))
pData(m)$match.group <- as.factor(sprintf("%02d", pData(m)$match.group))
pData(m)$Projid <- as.character(pData(m)$Projid)
pData(m)$homogenizationBatch <- as.factor(pData(m)$homogenizationBatch)
pData(m)$prepBatch <- as.factor(pData(m)$prepBatch)
# what is the min observation count?
plot(table(rowSums(exprs(m) > 0)))
save(m, file="../data/mspf_human_default.RData")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.