# scripts/prepare_IQ_HumanDB.R

# Script for processing IQ outputs: Lewy_intact_*_TargetOutput.txt files.
# Lewy study metadata is attached along the way,
# and finally an MSnSet object is assembled.


## Read the pre-concatenated IQ results; the originating file name
## is carried along in the 'DataSet' column
fn <- "../notinst/extdata/IQ/HumanDB/Lewy_Intact_1268.txt.xz"
dat <- read.delim(fn, check.names=F, as.is=T)
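# For reference, a minimal sketch of how such a concatenated table could be
# assembled from the individual *_TargetOutput.txt files (the raw directory
# path below is hypothetical):
# raw_files <- list.files("../notinst/extdata/IQ/HumanDB/raw",
#                         pattern="_TargetOutput\\.txt$", full.names=TRUE)
# dat_list <- lapply(raw_files, read.delim, check.names=FALSE, as.is=TRUE)
# names(dat_list) <- basename(raw_files)
# dat <- do.call(rbind, Map(cbind, dat_list, DataSet=names(dat_list)))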


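# Exploratory checks of score distributions and per-target observation
# counts, kept commented out for reference: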
# hist(dat$`P-Score`)
# hist(dat$MedianPearson)
# hist(dat$MedianChargeCorrelation)
# hist(dat$MedianPpmError)
# x <- subset(dat, DataSet == 'Lewy_intact_01_TargetOutput.txt', 
#             select=c("P-Score","MedianPearson","MedianChargeCorrelation","MedianPpmError"))
# plot(x)
# 
# tgt <- unique(dat$Target)[1]
# View(subset(dat, DataSet == 'Lewy_intact_01_TargetOutput.txt' & Target == tgt))
# 
# 
# plot(table(table(unique(dat[,c("DataSet","Target")])$Target)))
# plot(table(unique(dat[,c("DataSet","Target")])$Target), type='p')
# 
# target.obs <- table(unique(dat[,c("DataSet","Target")])$Target)
# target.obs <- as.data.frame(target.obs)
# colnames(target.obs) <- c("Target","Target.Obs")
# 
# x <- merge(dat, target.obs)
# 
# x <- subset(x, DataSet == 'Lewy_intact_01_TargetOutput.txt', 
#             select=c("P-Score","MedianPearson","MedianChargeCorrelation","MedianPpmError","Target.Obs"))
# plot(x)
# 

# filtering criteria:
# 1) P-Score > 0.8
# 2) abs(MedianPpmError) < 3
# 3) optionally, after 1) and 2), a unique-observation-count filter
#    (> 10 observations per target) -- arguable, and skipped below

# filter based on P-Score and mass measurement error
dim(dat)
dat <- subset(dat, `P-Score` > 0.8 & abs(MedianPpmError) < 3)
dim(dat)
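# sanity-check sketch (not part of the original flow): how many rows survive
# the filter in each dataset; unusually sparse datasets would stand out here
# sort(table(dat$DataSet))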



# # filtering based on observation count -- SKIPPED
# target.obs <- table(unique(dat[,c("DataSet","Target")])$Target)
# target.obs <- as.data.frame(target.obs)
# colnames(target.obs) <- c("Target","Target.Obs")
# dat <- merge(dat, target.obs)
# dim(dat)
# dat <- subset(dat, Target.Obs > 25)
# dim(dat)



# tidy up dat: strip the file-name suffix and drop columns no longer needed
dat$DataSet <- sub("_TargetOutput.txt","",dat$DataSet)
dat$ScanRange <- NULL
dat$ElutionApex <- NULL
dat$ChargeSet <- NULL
dat$MedianPpmError <- NULL
dat$MedianChargeCorrelation <- NULL
dat$MedianPearson <- NULL
dat$`P-Score` <- NULL


# bring in and attach metadata
metaData <- read.delim("../notinst/extdata/lewy_intact_meta.txt", header = T, as.is=T)
rownames(metaData) <- metaData$sample.id
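# merge() below silently drops datasets that lack a metadata match, so a
# defensive check (an addition, assuming metaData$file holds the trimmed
# dataset names used in dat$DataSet) seems prudent:
stopifnot(all(unique(dat$DataSet) %in% metaData$file))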
# link each DataSet to its sample.id, so sample IDs can be used as
# sample names instead of dataset names
dat <- merge(dat, metaData[,c('file','sample.id')], by.x='DataSet', by.y='file')
dat$DataSet <- NULL
pData <- metaData



fData <- unique(dat[,c("Target","ProteinName","Mass")])
fData$Feature <- with(fData, paste(sub("_HUMAN","",ProteinName), round(Mass,2), sep='_')) # danger! copy-pasted below for dat
# if the uniqueness check fails here, find another way of encoding feature
# names such that they are unique
stopifnot(!any(duplicated(fData$Feature)))
# unique, so good to go as rownames
rownames(fData) <- fData$Feature
fData$Feature <- NULL

# apply the same feature-naming scheme to the full table
dat$Feature <- with(dat, paste(sub("_HUMAN","",ProteinName), round(Mass,2), sep='_')) # danger! copy-paste -- must match fData$Feature above
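# should the uniqueness check above ever fail, one fallback sketch
# (hypothetical, not exercised here) is to fold the Target identifier into
# the name, applied identically to fData and dat:
# makeFeature <- function(x)
#     with(x, paste(sub("_HUMAN","",ProteinName), round(Mass,2), Target, sep='_'))
# fData$Feature <- makeFeature(fData)
# dat$Feature <- makeFeature(dat)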


# expression slot of MSnSet object
library("reshape2")
exprshn <- acast(Feature ~ sample.id, data=dat, value.var="Intensity", 
                 fun.aggregate = sum, fill=as.numeric(NA))
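# note: fun.aggregate=sum silently adds up intensities whenever a Feature
# occurs more than once within a sample; if that is not intended, this
# check (a sketch) would catch such collisions:
# stopifnot(!any(duplicated(dat[,c("Feature","sample.id")])))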


library("MSnbase")
m <- MSnSet(exprshn,
            pData=pData[colnames(exprshn),],
            fData=fData[rownames(exprshn),])

stopifnot(validObject(m))



# tweak pData
pData(m)$subject.type <- factor(pData(m)$subject.type, 
                                levels=c("control.2", "control.1", "case"))
pData(m)$match.group <- as.factor(sprintf("%02d", pData(m)$match.group))
pData(m)$Projid <- as.character(pData(m)$Projid)
pData(m)$homogenizationBatch <- as.factor(pData(m)$homogenizationBatch)
pData(m)$prepBatch <- as.factor(pData(m)$prepBatch)


# -- SKIPPED
# # some data preprocessing
# exprs(m) <- log2(exprs(m))
# exprs(m) <- sweep(exprs(m), 1, rowMeans(exprs(m), na.rm=TRUE)) # zero center proteoforms

# how are the per-feature observation counts distributed?
plot(table(rowSums(!is.na(exprs(m)))))
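# if a minimum-observation cutoff is desired (see the arguable criterion 3
# above), features could be trimmed like so (a sketch; 10 is illustrative):
# m <- m[rowSums(!is.na(exprs(m))) >= 10, ]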

save(m, file="../data/iq_human_db.RData")