# inst/tests/step1-preprocess-raw-data.R

# Function
#   step 1: preprocess raw data and store them

# set work directory
# every "./inst/..." path used further down is relative to this directory,
# so the script has to be run with the project root as working directory
# (the commented-out line differs only in the letter case of "projects")
# setwd("~/projects/iProfile/")
setwd("~/Projects/iProfile/")

# load packages
suppressMessages(require(foreach))
suppressMessages(require(doMC))
suppressMessages(require(oligo))

# GSE IDs of the studies to process, one per line.
# Each ID is used below both as the name of the sub-directory holding the
# raw cel files and as the prefix of the series matrix file name.
GEO_IDs <- c(
    "GSE19188",
    "GSE10072",
    "GSE31210",
    "GSE7670",
    "GSE68465",
    "GSE27716",
    "GSE17475",
    "GSE43580",
    "GSE50081",
    "GSE37745",
    "GSE28571"
)

# register CPU cores for parallel computing; the number can be changed manually
# parallel::detectCores() may return NA when the number of cores cannot be
# determined, so fall back to a single core instead of registering NA
Ncores <- parallel::detectCores()
if (is.na(Ncores)) {
    Ncores <- 1L
}
registerDoMC(cores = Ncores)

# set up directories where raw cel files are (one sub-directory per GSE ID)
dir_list <- paste0("./inst/tests/iGCC/cel_files/", GEO_IDs)
# dir_list <- dir("./inst/tests/iGCC/cel_files", pattern = "^GSE[0-9]{4,5}$",
#                 full.names = TRUE)
# dir_list <- setdiff(dir_list, "./inst/tests/iGCC/cel_files/GSE72094")
Studynames <- basename(dir_list)

# preprocess every study directory and keep the result under the study name
# NOTE(review): affyPreprocess.oligo() is not defined in this script;
# presumably it is provided by the iProfile package -- confirm it is loaded
Study_Res  <- list()
# seq_along() rather than 1:length(): if dir_list is ever empty (e.g. when the
# dir() alternative above matches nothing) the loop is skipped instead of
# iterating over c(1, 0)
for (i in seq_along(dir_list)) {
    Study_Res[[Studynames[i]]] <- affyPreprocess.oligo(dir_list[i])
    cat("The process for", Studynames[i], "has finished!\n")
}

# directory holding the series matrix files; one
# "<study>_series_matrix.txt.gz" file per study name is read from it and the
# result is stored in Study_Mat under that study name
# NOTE(review): getGEO() is not defined in this script and no load of GEOquery
# is visible here -- presumably it comes from GEOquery; confirm it is attached
dir_matrix <- "./inst/tests/iGCC/matrixSeries/"
Study_Mat  <- list()
for (study_id in Studynames) {
    matrix_file <- paste0(dir_matrix, study_id, "_series_matrix.txt.gz")
    Study_Mat[[study_id]] <- getGEO(
        filename = matrix_file,
        destdir  = dir_matrix,
        AnnotGPL = TRUE
    )
    cat("The process for", study_id, "has finished!\n")
}

# Study_Mat_NoAnnot  <- list()
# for (i in Studynames){
#     fn <- paste0(dir_matrix, i, "_series_matrix.txt.gz")
#     Study_Mat_NoAnnot[[i]] <- getGEO(filename = fn,
#                              destdir = dir_matrix, AnnotGPL = FALSE)
#     cat("The process for", i, "has finished!\n" )
# }

# # something wrong, do it one by one
# Study_Mat[[Studynames[i]]] <-
#     getGEO(filename = paste0(dir_matrix, Studynames[i], "_series_matrix.txt.gz"),
#                          destdir = dir_matrix)
#
# # There is something wrong with GSE31210
# i = 3
# test <- getGEO(filename = paste0(dir_matrix, Studynames[i], "_series_matrix.txt.gz"),
#                destdir = dir_matrix, AnnotGPL = TRUE)
# GSE31210 <- getGEO(GEO="GSE31210")
# number of studies that were loaded; it becomes part of the output file name
n <- length(Study_Mat)
out_file <- paste0("./inst/tests/data/preprocessed_ExpressionSet_of_",
                   n, "_GEO_Studies.Rdata")
# make sure the output directory exists: save() cannot create it, and a
# failure here would throw away all of the preprocessing done above
dir.create(dirname(out_file), showWarnings = FALSE, recursive = TRUE)
# store both result lists (Study_Mat and Study_Res) in a single file
save(Study_Mat, Study_Res, file = out_file)
# save(Study_Mat, Study_Mat_NoAnnot, Study_Res,
#      file=paste0("./inst/tests/preprocessed_ExpressionSet_of_", n, "_GEO_Studies.Rdata"))
# ShixiangWang/iProfile documentation built on May 11, 2019, 6:25 p.m.