scripts/prepare_MSAlign_Human_default.R

# Script for looking at MSPathfinder outputs: *.IcTda.tsv files
# Metadata for Lewy study added for customization
# And finally make MSnSet object


library("sqldf")
library("reshape2")



##Concatenate .tsv files and add filename as 'file'
files <- list.files(path = "../notinst/extdata/MSAlign/HumanDB/", 
                    pattern = "*ResultTable.txt.xz", 
                    full.names = TRUE)
dataTable <- NULL
for (f in files) {
  dat <- read.delim(f, check.names=F, as.is=T)
  dat <- subset(dat, `E-value` <= 1e-5 & `E-value` > 0)
  dat$Data_file_name <- NULL
  dat$file <- sub("\\_MSAlign_ResultTable.txt.xz","",basename(f))
  dataTable <- rbind(dataTable, dat)
}

# create proteoform ID
dataTable$UniProtID <- sapply(strsplit(dataTable$Protein_name, ' '), '[[', 1)
# First_residue Last_residue
# round(Protein_mass)
dataTable$ProteoformID <- with(dataTable, paste(UniProtID, 
                                                First_residue, 
                                                Last_residue, 
                                                round(Precursor_mass), 
                                                sep='_'))

#bring in and attach metadata
metaData <- read.delim("../notinst/extdata/lewy_intact_meta.txt", header = T, as.is=T)
rownames(metaData) <- metaData$sample.id
dataTable_f <- merge(dataTable, metaData, by.x = "file", by.y = "file")


# #Create Proteoform identifier
# proteoformTable <- sqldf("Select *, ProteinName || '_' || Composition
#                         || '_s' || Start || '-e' || End as Proteoform
#                         From dataTable_f")
# 
# 
# #Get identification stats
# PrSMs <- sqldf("Select file, alias, runOrder, prepBatch, homogenizationBatch, count(file) as PrSMs 
#               From dataTable_f
#               group by file")
# 
# Proteoforms <- sqldf("Select count(DISTINCT Proteoform) as Proteoforms 
#                 From proteoformTable
#                 group by file")
# 
# Proteins <- sqldf("Select count(DISTINCT ProteinName) as Proteins 
#                 From dataTable_f
#                 group by file")
# 
# id_stats <- cbind(PrSMs, Proteoforms, Proteins)
# rownames(id_stats) <- id_stats$alias


#Proteoform Level crosstab
#-------------------------------------------------------
proteoform_count_xtab <- acast(dataTable_f, ProteoformID ~ sample.id)
# for_xtab <- sqldf("Select alias, ProteoformID,
#                   Count(ProteoformID) As Spectral_Counts
#                   From dataTable_f
#                   Group By  alias, ProteoformID"
# )
# proteoform_count_xtab <- acast(for_xtab, ProteoformID ~ alias, 
#                                value.var = "Spectral_Counts")
# proteoform_count_xtab[is.na(proteoform_count_xtab)] <- 0
#-------------------------------------------------------


# feature data
#-------------------------------------------------------
fData <- unique(dataTable_f[,c("ProteoformID","UniProtID","First_residue","Last_residue")])
rownames(fData) <- fData$ProteoformID
# proteinMap <- sqldf("Select ProteinName, Modifications, Proteoform
#                     From for_xtab
#                     Group by Proteoform")
# rownames(proteinMap) <- proteinMap$Proteoform
#-------------------------------------------------------


library("MSnbase")
m <- MSnSet(exprs = proteoform_count_xtab, 
            fData = fData[rownames(proteoform_count_xtab),],
            pData = metaData[colnames(proteoform_count_xtab),])
stopifnot(validObject(m))


# -- SKIPPED
# need at least 10 out of 51 samples with some counts
# m <- m[rowSums(exprs(m) > 0) > 10,]

# # tweak pData
# a <- pData(m)$alias
# aa <- ifelse(grepl("Cs", a), "case",
#              ifelse(grepl("Ct1", a), "control.1", "control.2"))
# pData(m)$subject.type <- factor(aa, levels=c("control.2", "control.1", "case"))
# am <- sub("(\\d+)C.*", "\\1", a)
# pData(m)$match.group <- sprintf("%02d", as.numeric(am))
# pData(m)$match.group <- as.factor(pData(m)$match.group)
# pData(m)$sample.id <- pData(m)$alias
# pData(m)$alias <- NULL

# tweak pData
pData(m)$subject.type <- factor(pData(m)$subject.type, 
                                levels=c("control.2", "control.1", "case"))
pData(m)$match.group <- as.factor(sprintf("%02d", pData(m)$match.group))
pData(m)$Projid <- as.character(pData(m)$Projid)
pData(m)$homogenizationBatch <- as.factor(pData(m)$homogenizationBatch)
pData(m)$prepBatch <- as.factor(pData(m)$prepBatch)


save(m, file="../data/msal_human_default.RData")
vladpetyuk/LewyBodies.SN.TopDown documentation built on May 3, 2019, 6:15 p.m.