analysis/00.DownloadPreprocess/05.stanleyCCohort.R

library(XML)
library(ogbox)
library(affy)
library(dplyr)
library(magrittr)
library(XLConnect)
# download stanley C cohort data

downloadData = FALSE
downloadGemma = TRUE

dir.create('data-raw/StanleyC/cels',recursive=TRUE, showWarnings = FALSE)

if(downloadData){
    # autogenerated password that was sent to me in plaintext form. They obviously dont care enough to be actually secure
    # so why should I. Replace with your 
    system('wget --user OganM --password rMa3JbF4 https://www.stanleygenomics.org/stanley/standard/study_zips/study2_raw.zip -O data-raw/StanleyC/stanleyC.zip')
    system('wget --user OganM --password rMa3JbF4 https://www.stanleygenomics.org/stanley/standard/studyPatients.jsp?study_id=2 -O data-raw/StanleyC/metadata.html')
    system('unzip data-raw/StanleyC/stanleyC.zip -d data-raw/StanleyC/cels')
}
if(downloadGemma){
    ogbox::getGemmaAnnot('GPL570','data-raw/GemmaAnnots/GPL96',annotType='noParents',overwrite=TRUE)
}
StanleyCMeta = readHTMLTable('data-raw/StanleyC/metadata.html')[[2]]

system('wget --user OganM --password rMa3JbF4 https://www.stanleygenomics.org/stanley/standard/studyQC.jsp?study_id=2 -O data-raw/StanleyC/qc.html')
StanleyCQC = readHTMLTable('data-raw/StanleyC/qc.html')[[1]]
StanleyCQC %<>% filter(Outlier != 1)
StanleyCMeta = StanleyCMeta[StanleyCMeta$Filename %in% StanleyCQC$Filename,]

cels= celFiles('data-raw/StanleyC/cels/',full.names=TRUE)

cels = cels[grepl(regexMerge(StanleyCMeta$Filename),cels)]
cels = cels[match(StanleyCMeta$Filename, basename(gsub('\\s.*$','',cels)))]
batches = cels%>% sapply(celfileDate) 

cels = cels[!batches %in% names(which(table(batches)==1))]
StanleyCMeta = StanleyCMeta[!batches %in% names(which(table(batches)==1)),]
batches = batches[!batches %in% names(which(table(batches)==1))]

affy = ReadAffy(filenames = cels)
affy = affy::rma(affy)
affy = gemmaAnnot(affy, 'data-raw/GemmaAnnots/GPL96')

# remove samples of wrong sex
genders = bioGender(affy)
cels = cels[genders == StanleyCMeta$Sex]
batches = batches[genders== StanleyCMeta$Sex]
StanleyCMeta = StanleyCMeta[genders == StanleyCMeta$Sex,]

# batch correction is not possible due to covariate=batch confound
# list[gene,exp] = affy %>% sepExpr()
data.frame(batches,StanleyCMeta$Profile) %>% table
# exp = ComBat(exp,batch =batches,mod = model.matrix(~Profile,StanleyCMeta))
# affy = cbind(gene,exp)


list[gene,exp] = sepExpr(affy)

affy %<>% mostVariable(threshold= exp %>% unlist %>% median,
                       threshFun=median)

list[gene,exp] = sepExpr(affy)

colnames(exp) = gsub('\\s.*$','',cn(exp))

rownames(exp) = gene$Gene.Symbol

StanleyCExp = exp

StanleyCExp = StanleyCExp[match(StanleyCMeta$Filename, cn(StanleyCExp))]

patients = StanleyCMeta$`Stanley ID` %>% unique




devtools::use_data(StanleyCExp,overwrite=TRUE)
devtools::use_data(StanleyCMeta,overwrite=TRUE)

#'KDM5D' %in% rn(StanleyCExp)


#StanleyCExp['KDM5D',] %>% unlist %>% plot(col= StanleyCMeta$Sex %>% toColor %$% cols)

# Uranova analysis data-------------------
download.file('http://sncid.stanleyresearch.org/RawData/1999_03%20URANOVA.zip',destfile = 'data-raw/StanleyC/UranovaRawData.zip')
unzip('data-raw/StanleyC/UranovaRawData.zip',exdir = 'data-raw/StanleyC/')
rawUranova = loadWorkbook('data-raw/StanleyC/1999_03 URANOVA.xls')
rawUranova = readWorksheet(rawUranova, sheet = 'data', header = FALSE)
rawUranova = rawUranova[-(1:5),]

rawUranova %<>% filter(Col1 %in% c("Code", "Area 9", "Layer VI"))
rawUranova$Col1 <- as.character(rawUranova$Col1)
rawUranova$Col1[seq(4, 40, 4)] <- "WM"
rawUranova$Col1[seq(3, 39, 4)] <- "GMse"
rawUranova$Col1[seq(2, 38, 4)] <- "GM"

rawUranova <- data.frame(StanleyID = rawUranova %>% filter(Col1 == "Code") %>% 
                             select(-Col1, -Col2) %>% as.list() %>%
                             unlist %>% 
                             as.character(),
                         GM = rawUranova %>% filter(Col1 == "GM") %>%
                             select(-Col1, -Col2) %>% 
                             as.list() %>% 
                             unlist %>% 
                             as.character %>%
                             as.numeric(),
                         GMse = rawUranova %>% filter(Col1 == "GMse") %>% 
                             select(-Col1, -Col2) %>%
                             as.list() %>%
                             unlist %>%
                             as.character %>% 
                             as.numeric(),
                         WM = rawUranova %>% filter(Col1 == "WM") %>% select(-Col1, -Col2) %>% as.list() %>% unlist %>% as.character %>% as.numeric())

use_data(rawUranova,overwrite = TRUE)
PavlidisLab/neuroExpressoAnalysis documentation built on Aug. 23, 2022, 7:42 a.m.