scripts/prepare_MSPathfinder_SYUA_UnkMod_SingleCleavage.R

# Script for looking at MSPathfinder outputs: *.IcTda.tsv files
# Metadata for Lewy study added for customization
# And finally make MSnSet object


library(sqldf)
library(reshape2)



##Concatenate .tsv files and add filename as 'file'
files <- list.files(path = "../inst/extdata/MSPathfinder/SYUA_UnkMods_SingleCleavage", 
                    pattern = "*IcTda.tsv", 
                    full.names = TRUE)
dataTable <- NULL
for (f in files) {
  dat <- read.csv(f, header=T, sep="\t", na.strings="", colClasses="character")
  dat$file <- sub("\\.tsv","",basename(f))
  dataTable <- rbind(dataTable, dat)
}


#Filter on QValue
dataTable_f <- subset(dataTable, QValue <= 0.01)

#bring in and attach metadata
metaData <- read.delim("../inst/extdata/metaData.txt", header = T, as.is=T)
dataTable_f <- merge(dataTable_f, metaData, by.x = "file", by.y = "file")

#Create Proteoform identifier
proteoformTable <- sqldf("Select *, ProteinName || '_' || Composition
                        || '_s' || Start || '-e' || End as Proteoform
                        From dataTable_f")


#Get identification stats
PrSMs <- sqldf("Select file, alias, runOrder, prepBatch, homogenizationBatch, count(file) as PrSMs 
              From dataTable_f
              group by file")

Proteoforms <- sqldf("Select count(DISTINCT Proteoform) as Proteoforms 
                From proteoformTable
                group by file")

Proteins <- sqldf("Select count(DISTINCT ProteinName) as Proteins 
                From dataTable_f
                group by file")

id_stats <- cbind(PrSMs, Proteoforms, Proteins)
rownames(id_stats) <- id_stats$alias


#Proteoform Level crosstab
#-------------------------------------------------------
for_xtab <- sqldf("Select alias, Proteoform, ProteinName, Modifications,
                  Count(Proteoform) As Spectral_Counts
                  From proteoformTable
                  Group By  alias, Proteoform"
)
proteoform_count_xtab <- acast(for_xtab, Proteoform ~ alias, 
                               value.var = "Spectral_Counts")
proteoform_count_xtab[is.na(proteoform_count_xtab)] <- 0
#-------------------------------------------------------


# feature data
#-------------------------------------------------------
proteinMap <- sqldf("Select ProteinName, Modifications, Proteoform
                    From for_xtab
                    Group by Proteoform")
rownames(proteinMap) <- proteinMap$Proteoform
#-------------------------------------------------------


library("MSnbase")
m <- MSnSet(exprs = proteoform_count_xtab, 
            fData = proteinMap[rownames(proteoform_count_xtab),],
            pData = id_stats[colnames(proteoform_count_xtab),])
stopifnot(validObject(m))


# need at least 10 out of 51 samples with some counts
m <- m[rowSums(exprs(m) > 0) > 10,]

# tweak pData
a <- pData(m)$alias
aa <- ifelse(grepl("Cs", a), "case",
             ifelse(grepl("Ct1", a), "control.1", "control.2"))
pData(m)$subject.type <- factor(aa, levels=c("control.2", "control.1", "case"))
am <- sub("(\\d+)C.*", "\\1", a)
pData(m)$match.group <- sprintf("%02d", as.numeric(am))
pData(m)$match.group <- as.factor(pData(m)$match.group)
pData(m)$sample.id <- pData(m)$alias
pData(m)$alias <- NULL

save(m, file="../data/mspf_syua_unkmod_single_cleav.RData")
vladpetyuk/LewyBodies.SN.TopDown documentation built on May 3, 2019, 6:15 p.m.