scripts/prepare_IQ_alphasyn.R

# Script for looking at IQ outputs: *.Lewy_intact_06_TargetOutput.txt files
# Metadata for Lewy study added for customization
# And finally make MSnSet object


##Concatenate .tsv files and add filename as 'file'
fn <- "../notinst/extdata/IQ/SYUA_HUMAN/Lewy_Intact_1267.txt.xz"
dat <- read.delim(fn, check.names=F, as.is=T)


# hist(dat$`P-Score`)
# hist(dat$MedianPearson)
# hist(dat$MedianChargeCorrelation)
# hist(dat$MedianPpmError)
# x <- subset(dat, DataSet == 'Lewy_intact_01_TargetOutput.txt', 
#             select=c("P-Score","MedianPearson","MedianChargeCorrelation","MedianPpmError"))
# plot(x)
# 
# tgt <- unique(dat$Target)[1]
# View(subset(dat, DataSet == 'Lewy_intact_01_TargetOutput.txt' & Target == tgt))
# 
# 
# plot(table(table(unique(dat[,c("DataSet","Target")])$Target)))
# plot(table(unique(dat[,c("DataSet","Target")])$Target), type='p')
# 
# target.obs <- table(unique(dat[,c("DataSet","Target")])$Target)
# target.obs <- as.data.frame(target.obs)
# colnames(target.obs) <- c("Target","Target.Obs")
# 
# x <- merge(dat, target.obs)
# 
# x <- subset(x, DataSet == 'Lewy_intact_01_TargetOutput.txt', 
#             select=c("P-Score","MedianPearson","MedianChargeCorrelation","MedianPpmError","Target.Obs"))
# plot(x)
# 

# filtering criteria:
# 1) P-Score > 0.75
# 2) abs(MedianPpmError) < 10
# 3) after filtering apply unique observation count filter > 10

dim(dat)
dat <- subset(dat, `P-Score` > 0.75 & abs(MedianPpmError) < 5)
dim(dat)


target.obs <- table(unique(dat[,c("DataSet","Target")])$Target)
target.obs <- as.data.frame(target.obs)
colnames(target.obs) <- c("Target","Target.Obs")
dat <- merge(dat, target.obs)
dim(dat)
dat <- subset(dat, Target.Obs > 25)
dim(dat)

# tweak the dat
dat$DataSet <- sub("_TargetOutput.txt","",dat$DataSet)
dat$ScanRange <- NULL
dat$ElutionApex <- NULL
dat$ChargeSet <- NULL
dat$MedianPpmError <- NULL
dat$MedianChargeCorrelation <- NULL
dat$MedianPearson <- NULL
dat$`P-Score` <- NULL


#bring in and attach metadata
metaData <- read.delim("../notinst/extdata/lewy_intact_meta.txt", 
                       header = T, as.is=T)
rownames(metaData) <- metaData$sample.id
# add subject id, so they can be used as sample names instead of dataset names
dat <- merge(dat, metaData[,c('file','sample.id')], by.x='DataSet', by.y='file')
dat$DataSet <- NULL
pData <- metaData
rownames(pData) <- pData$sample.id


fData <- unique(dat[,c("Target","ProteinName","Mass","Target.Obs")])
fData$Feature <- with(fData, paste(sub("_HUMAN","",ProteinName), round(Mass,2), sep='_'))
# if it stumbles here, then find out another way of encoding feature names
# such that they are unique
stopifnot(!any(duplicated(fData$Feature))) 
# is it good to go?
rownames(fData) <- fData$Feature

# apply the same scheme of feature naming
dat$Feature <- with(dat, paste(sub("_HUMAN","",ProteinName), round(Mass,2), sep='_'))


library("reshape2")
exprshn <- acast(Feature ~ sample.id, data=dat, value.var="Intensity", 
                 fun.aggregate = sum, fill=as.numeric(NA))

library("MSnbase")
m <- MSnSet(exprshn, 
                 pData=pData[colnames(exprshn),],
                 fData=fData[rownames(exprshn),])

stopifnot(validObject(m))


# tweak pData
pData(m)$match.group <- sprintf("%02d", as.numeric(pData(m)$match.group))
pData(m)$match.group <- as.factor(pData(m)$match.group)
pData(m)$homogenizationBatch <- as.factor(pData(m)$homogenizationBatch)
pData(m)$prepBatch <- as.factor(pData(m)$prepBatch)
pData(m)$subject.type <- as.factor(pData(m)$subject.type)



# some data preprocessing
exprs(m) <- log2(exprs(m))
exprs(m) <- sweep(exprs(m), 1, rowMeans(exprs(m), na.rm=TRUE))

save(m, file="../data/iq_alpha_syn.RData")
vladpetyuk/LewyBodies.SN.TopDown documentation built on May 3, 2019, 6:15 p.m.