inst/tests/step2-combine-expression-and-PhenoData.R

# Function
#   step 2: combine preprocessed expression data with PhenoData which retrieve from Series Matrix files

# global setting
source(paste(system.file(pattern="tests",package="iProfile"),"step0-setup-global-setting.R", sep="/"))

# load preprocessed data
# load(paste(data_dir,"preprocessed_ExpressionSet_of_10_GEO_Studies.Rdata", sep="/"))
load(paste(data_dir,"preprocessed_ExpressionSet_of_11_GEO_Studies.Rdata", sep="/"))

# # check these data
# sapply(Study_Mat, function(x)dim(x))
# sapply(Study_Mat_NoAnnot, function(x)dim(x))
# sapply(Study_Res, function(x)dim(x))
#
# sapply(Study_Mat, function(x)fvarLabels(x))
# sapply(Study_Mat_NoAnnot, function(x)fvarLabels(x))
# sapply(Study_Res, function(x)fvarLabels(x))
#
# sapply(Study_Mat, function(x)varLabels(x))
# sapply(Study_Mat_NoAnnot, function(x)varLabels(x))
# sapply(Study_Res, function(x)varLabels(x))
#
# sapply(Study_Mat, function(x)annotation(x))
# They seem good
# Study_Mat_NoAnnot as backup of Study_Mat
# So, next we use Study_Mat and Study_Res.
# rm(Study_Mat_NoAnnot);gc()

#######################################################################################
######################## The next step processed by hand ##############################
#######################################################################################
# Due to different varLabels in different datasets, we have to process handly one by one
# There are two main targets for this step:
#   first, we only keep LUAD samples
#   second, we re-organize the phenoData
#       1. Study_ID, like GSEXXXXX
#       2. Sample_ID, like GSMXXXXX
#       3. Platform, like GPL96 or GPL570
#       4. Type, tell the tissue type, such as Normal, LUAD ..
#       5. Gender
#       6. Age
#       7. Tumor_stage
#       8. Family_history
#       9. Smoking_status
#       10. Metastasis_status
#       11. Tumor_feature
#       12. Survival_status
#       13. Survival_time
#       14. Preprocessed_method
#       15. Matched_CNV_ID

# grepl()
#

## test create_Exprset function
test <- create_Exprset(Mat = Study_Mat, Res = Study_Res, id = "GSE19188", rev_nams = c("ADC", "healthy"), keepSource = "characteristics_ch1.1", Gender = "characteristics_ch1.4", Survival_status = "characteristics_ch1.3", Survival_time = "characteristics_ch1.2")
test2 <- create_Exprset(Mat = Study_Mat, Res = Study_Res, id = "GSE10072", keepSource = "source_name_ch1", Gender = "characteristics_ch1", Age = "characteristics_ch1.1", Tumor_stage = "characteristics_ch1.3", Smoking_status = "characteristics_ch1.2")
## The function is work

# create a new list for storing final result
Study_Res_New <- list()

########## #1
id <- GEO_IDs[1]
# check what arguments we should assign
View(pData(Study_Mat[[id]]))
Study_Res_New[[id]] <- create_Exprset(Mat = Study_Mat, Res = Study_Res, id = id, rev_nams = c("ADC", "healthy"), keepSource = "characteristics_ch1.1", Gender = "characteristics_ch1.4", Survival_status = "characteristics_ch1.3", Survival_time = "characteristics_ch1.2")
View(pData(Study_Res_New[[id]]))
########## #1 finished

########## #2
id <- GEO_IDs[2]
# check what arguments we should assign
View(pData(Study_Mat[[id]]))
Study_Res_New[[id]] <- create_Exprset(Mat = Study_Mat, Res = Study_Res, id = id, keepSource = "source_name_ch1", Gender = "characteristics_ch1", Age = "characteristics_ch1.1", Tumor_stage = "characteristics_ch1.3", Smoking_status = "characteristics_ch1.2")
View(pData(Study_Res_New[[id]]))
######### #2 finished

######### #3
id <- GEO_IDs[3]
# check what arguments we should assign
View(pData(Study_Mat[[id]]))
# p_data <- pData(Study_Mat[[id]])
# write.table(p_data, file="./inst/tests/wrong_GSE31210_data.csv", sep = ",", quote = FALSE, row.names = FALSE)
# fix it , "," is not a good separator, some unuseful information discard by me
p_data <- read.table(file="./inst/tests/fix_GSE31210_data.csv", header = TRUE, sep=",", quote = "")
rownames(p_data) <- p_data$geo_accession
pData(Study_Mat[[id]]) <- p_data
View(pData(Study_Mat[[id]]))

Study_Res_New[[id]] <- create_Exprset(Mat = Study_Mat, Res = Study_Res, id = id, keepSource = "characteristics_ch1",Gender = "characteristics_ch1.2", Age = "characteristics_ch1.1", Tumor_stage = "characteristics_ch1.5", Smoking_status = "characteristics_ch1.3", Mutation_status = "characteristics_ch1.7", Survival_status = "characteristics_ch1.13", Survival_time = "characteristics_ch1.14", Tumor_feature = "characteristics_ch1.10")
View(pData(Study_Res_New[[id]]))
######### #3 finished

########## #4
id <- GEO_IDs[4]
# check what arguments we should assign
View(pData(Study_Mat[[id]]))
Study_Res_New[[id]] <- create_Exprset(Mat = Study_Mat, Res = Study_Res, id = id, keepSource = "characteristics_ch1", rev_nams = c("Adjacent normal part of adenocarcinoma", "Tumor part of adenocarcinoma", "Adenocarcinoma", "Normal lung tissues (Cat. 735020)", "Normal lung tissues (Cat. 636524)"), Gender = "characteristics_ch1.1")
View(pData(Study_Res_New[[id]]))
######### #4 finished

########## #5
id <- GEO_IDs[5]
# check what arguments we should assign
View(pData(Study_Mat[[id]]))
Study_Res_New[[id]] <- create_Exprset(Mat = Study_Mat, Res = Study_Res, id = id, keepSource = "characteristics_ch1", Gender = "characteristics_ch1.1", Age = "characteristics_ch1.2", Tumor_feature = "characteristics_ch1.7", Smoking_status = "characteristics_ch1.12", Survival_status = "characteristics_ch1.4", Survival_time = "characteristics_ch1.11")
View(pData(Study_Res_New[[id]]))
######### #5 finished

########## #6
id <- GEO_IDs[6]
# check what arguments we should assign
View(pData(Study_Mat[[id]]))
Study_Res_New[[id]] <- create_Exprset(Mat = Study_Mat, Res = Study_Res, id = id, keepSource = "characteristics_ch1", Gender = "characteristics_ch1.2", Age = "characteristics_ch1.3", Tumor_stage = "characteristics_ch1.4", Tumor_feature = "source_name_ch1")
View(pData(Study_Res_New[[id]]))
######### #6 finished

########## #7
id <- GEO_IDs[7]
# check what arguments we should assign
View(pData(Study_Mat[[id]]))
Study_Res_New[[id]] <- create_Exprset(Mat = Study_Mat, Res = Study_Res, id = id, keepSource = "source_name_ch1", Gender = "characteristics_ch1.4", Age = "characteristics_ch1.3", Tumor_stage = "characteristics_ch1.5", Tumor_feature = "characteristics_ch1.2")
View(pData(Study_Res_New[[id]]))
######### #7 finished

########## #8
id <- GEO_IDs[8]
# check what arguments we should assign
View(pData(Study_Mat[[id]]))
# p_data <- pData(Study_Mat[[id]])
# write.table(p_data, file="./inst/tests/wrong_GSE43580_data.tsv", sep = "\t", quote = FALSE, row.names = FALSE)
# fix it , "," is not a good separator, some unuseful information discard by me
p_data <- read.table(file="./inst/tests/fix_GSE43580_data.tsv", header = TRUE, sep="\t", quote = "")
rownames(p_data) <- p_data$geo_accession
pData(Study_Mat[[id]]) <- p_data
View(pData(Study_Mat[[id]]))
Study_Res_New[[id]] <- create_Exprset(Mat = Study_Mat, Res = Study_Res, id = id, keepSource = "source_name_ch1", rev_nams = c("\"Adenocarcinoma, stage 1\"", "\"Adenocarcinoma, stage 2\""), Gender = "characteristics_ch1", Age = "characteristics_ch1.1", Tumor_stage = "characteristics_ch1.14", Smoking_status = "characteristics_ch1.7", Tumor_feature = "characteristics_ch1.13")
View(pData(Study_Res_New[[id]]))
######### #8 finished

########## #9
id <- GEO_IDs[9]
# check what arguments we should assign
View(pData(Study_Mat[[id]]))
Study_Res_New[[id]] <- create_Exprset(Mat = Study_Mat, Res = Study_Res, id = id, keepSource = "characteristics_ch1.1", rev_nams = "adenocarcinoma", Gender = "characteristics_ch1", Age = "characteristics_ch1.6", Tumor_stage = "characteristics_ch1.5", Smoking_status = "characteristics_ch1.7", Tumor_feature = "characteristics_ch1.11", Survival_status = "characteristics_ch1.9", Survival_time = "characteristics_ch1.8")
pData(Study_Res_New[[id]])[,"Tumor_feature"] <- paste("recurrence", pData(Study_Res_New[[id]])[, "Tumor_feature"]
                                                      , sep=":")
View(pData(Study_Res_New[[id]]))
######### #9 finished

########## #10
id <- GEO_IDs[10]
# check what arguments we should assign
View(pData(Study_Mat[[id]]))
Study_Res_New[[id]] <- create_Exprset(Mat = Study_Mat, Res = Study_Res, id = id, keepSource = "characteristics_ch1.2", rev_nams = "adeno",Gender = "characteristics_ch1.6", Age = "characteristics_ch1.5", Tumor_stage = "characteristics_ch1.3", Tumor_feature = "characteristics_ch1.8", Survival_status = "characteristics_ch1", Survival_time = "characteristics_ch1.1")
pData(Study_Res_New[[id]])[,"Tumor_feature"] <- paste("recurrence", pData(Study_Res_New[[id]])[, "Tumor_feature"]
                                                      , sep=":")
pData(Study_Res_New[[id]])[,"Survival_status"] <- paste("dead", pData(Study_Res_New[[id]])[, "Survival_status"]
                                                      , sep=":")
View(pData(Study_Res_New[[id]]))
######### #10 finished

########## #11
id <- GEO_IDs[11]
# check what arguments we should assign
View(pData(Study_Mat[[id]]))
Study_Res_New[[id]] <- create_Exprset(Mat = Study_Mat, Res = Study_Res, id = id, keepSource = "characteristics_ch1.1", rev_nams = "adenocarcinoma", Tumor_feature = "characteristics_ch1.2")
pData(Study_Res_New[[id]])[,"Tumor_feature"] <- paste("survival", pData(Study_Res_New[[id]])[, "Tumor_feature"]
                                                      , sep=":")
View(pData(Study_Res_New[[id]]))
######### #11 finished



######### All 11 datasets finshed, we check their phenodata one more time
# rm(id);gc()
#
# id <- GEO_IDs[1]
# View(pData(Study_Res_New[[id]]))
#
# id <- GEO_IDs[2]
# View(pData(Study_Res_New[[id]]))
#
# id <- GEO_IDs[3]
# View(pData(Study_Res_New[[id]]))
# View(pData(Study_Mat[[id]]))
#
# id <- GEO_IDs[4]
# View(pData(Study_Res_New[[id]]))
# # View(pData(Study_Mat[[id]]))
#
# id <- GEO_IDs[5]
# View(pData(Study_Res_New[[id]]))
# # View(pData(Study_Mat[[id]]))
#
# id <- GEO_IDs[6]
# View(pData(Study_Res_New[[id]]))
#
# id <- GEO_IDs[7]
# View(pData(Study_Res_New[[id]]))
#
# id <- GEO_IDs[8]
# View(pData(Study_Res_New[[id]]))
#
# id <- GEO_IDs[9]
# View(pData(Study_Res_New[[id]]))
#
# id <- GEO_IDs[10]
# View(pData(Study_Res_New[[id]]))
#########

######## save data
save(Study_Mat, file=paste(data_dir,"fixed_ExpressionSet_of_11_GEO_Studies.RData", sep="/"))
save(Study_Res_New, file=paste(data_dir,"RMA_processed_ExpressionSet_of_11_GEO_Studies_luad.RData", sep="/"))

###### compare all the results using RMA and processed by other researchers
# hist(Study_Res_New[[1]])
# hist(Study_Mat[[1]])
#
# hist(Study_Res_New[[2]])
# hist(Study_Mat[[2]])
#
# hist(Study_Res_New[[3]])
# hist(Study_Mat[[3]])
#
# hist(Study_Res_New[[4]])
# hist(Study_Mat[[4]])
#
# hist(Study_Res_New[[5]])
# hist(Study_Mat[[5]])
#
# hist(Study_Res_New[[6]])
# hist(Study_Mat[[6]])
#
# hist(Study_Res_New[[7]])
# hist(Study_Mat[[7]])
#
# hist(Study_Res_New[[8]])
# hist(Study_Mat[[8]])
#
# hist(Study_Res_New[[9]])
# hist(Study_Mat[[9]])
#
# hist(Study_Res_New[[10]])
# hist(Study_Mat[[10]])

##########################################################################
# plot them
# install patchwork
# library(devtools)
# devtools::install_github("thomasp85/patchwork")
opar <- par(no.readonly = TRUE)
# par(mfrow=c(2,1))

result_dir <- paste(test_dir, "result0", sep="/")
#1
pdf(file = paste0(result_dir, "/comparePreprocessMethod_bet_RMAandGEO_of_", GEO_IDs[1],".pdf"), width = 5, height = 6)
par(fig=c(0,1,0.5,1))
hist(Study_Res_New[[1]], main="GSE19188")
par(fig=c(0,1,0,0.5), new=TRUE)
hist(Study_Mat[[1]])
dev.off()
#2
pdf(file = paste0(result_dir, "/comparePreprocessMethod_bet_RMAandGEO_of_", GEO_IDs[2],".pdf"), width = 5, height = 6)
par(fig=c(0,1,0.5,1))
hist(Study_Res_New[[2]], main="GSE10072")
par(fig=c(0,1,0,0.5), new=TRUE)
hist(Study_Mat[[2]])
dev.off()

#3
pdf(file = paste0(result_dir, "/comparePreprocessMethod_bet_RMAandGEO_of_", GEO_IDs[3],".pdf"), width = 5, height = 6)
par(fig=c(0,1,0.5,1))
hist(Study_Res_New[[3]], main="GSE31210")
par(fig=c(0,1,0,0.5), new=TRUE)
hist(Study_Mat[[3]], xlab="intensity")
dev.off()

#4
pdf(file = paste0(result_dir, "/comparePreprocessMethod_bet_RMAandGEO_of_", GEO_IDs[4],".pdf"), width = 5, height = 6)
par(fig=c(0,1,0.5,1))
hist(Study_Res_New[[4]], main="GSE7670")
par(fig=c(0,1,0,0.5), new=TRUE)
hist(Study_Mat[[4]], xlab="intensity")
dev.off()

#5
pdf(file = paste0(result_dir, "/comparePreprocessMethod_bet_RMAandGEO_of_", GEO_IDs[5],".pdf"), width = 5, height = 6)
par(fig=c(0,1,0.5,1))
hist(Study_Res_New[[5]], main="GSE68465")
par(fig=c(0,1,0,0.5), new=TRUE)
hist(Study_Mat[[5]], xlab="intensity")
dev.off()

#6
pdf(file = paste0(result_dir, "/comparePreprocessMethod_bet_RMAandGEO_of_", GEO_IDs[6],".pdf"), width = 5, height = 6)
par(fig=c(0,1,0.5,1))
hist(Study_Res_New[[6]], main="GSE27716")
par(fig=c(0,1,0,0.5), new=TRUE)
hist(Study_Mat[[6]])
dev.off()

#7
pdf(file = paste0(result_dir, "/comparePreprocessMethod_bet_RMAandGEO_of_", GEO_IDs[7],".pdf"), width = 5, height = 6)
par(fig=c(0,1,0.5,1))
hist(Study_Res_New[[7]], main="GSE17475")
par(fig=c(0,1,0,0.5), new=TRUE)
hist(Study_Mat[[7]], xlab="intensity")
dev.off()

#8
pdf(file = paste0(result_dir, "/comparePreprocessMethod_bet_RMAandGEO_of_", GEO_IDs[8],".pdf"), width = 5, height = 6)
par(fig=c(0,1,0.5,1))
hist(Study_Res_New[[8]], main="GSE43580")
par(fig=c(0,1,0,0.5), new=TRUE)
hist(Study_Mat[[8]], xlab="intensity")
dev.off()

#9
pdf(file = paste0(result_dir, "/comparePreprocessMethod_bet_RMAandGEO_of_", GEO_IDs[9],".pdf"), width = 5, height = 6)
par(fig=c(0,1,0.5,1))
hist(Study_Res_New[[9]], main="GSE50081")
par(fig=c(0,1,0,0.5), new=TRUE)
hist(Study_Mat[[9]])
dev.off()

#10
pdf(file = paste0(result_dir, "/comparePreprocessMethod_bet_RMAandGEO_of_", GEO_IDs[10],".pdf"), width = 5, height = 6)
par(fig=c(0,1,0.5,1))
hist(Study_Res_New[[10]], main="GSE37745")
par(fig=c(0,1,0,0.5), new=TRUE)
hist(Study_Mat[[10]])
dev.off()

#11
pdf(file = paste0(result_dir, "/comparePreprocessMethod_bet_RMAandGEO_of_", GEO_IDs[11],".pdf"), width = 5, height = 6)
par(fig=c(0,1,0.5,1))
hist(Study_Res_New[[10]], main=GEO_IDs[11])
par(fig=c(0,1,0,0.5), new=TRUE)
hist(Study_Mat[[11]])
dev.off()

par(opar)
###########################################################################



######### #1
# id <- GEO_IDs[1]
# id
# # judge the two dataset have same sort for samples
# all(sampleNames(Study_Mat[[id]]) == sampleNames(Study_Res[[id]]))
# View(pData(Study_Mat[[id]]))
# keepIDs <- c()
# rev_nams <- c("ADC", "healthy")
# ident_id <- "characteristics_ch1.1"
# keepIDs <- as.character(pData(Study_Mat[[id]])[which(apply.decode((pData(Study_Mat[[id]])[, ident_id]))%in%rev_nams), "geo_accession"])
#
# # keep LUAD and Normal samples
# res <- Study_Mat[[id]][ , as.character(Study_Mat[[id]]$geo_accession)%in%keepIDs]
# Study_Res_New[[id]] <- Study_Res[[id]][, sampleNames(Study_Res[[id]])%in%keepIDs]
# # make sure they have sample sort
# Study_Res_New[[id]] <- Study_Res_New[[id]][, sampleNames(res)]
#
# # construct a new pheno data
# p_data <- construct_PhenoData(Study_ID = id, Sample_ID = as.character(res$geo_accession), Platform = annotation(res), Type = apply.decode(pData(res)[, "characteristics_ch1.1"]), Gender = apply.decode(pData(res)[, "characteristics_ch1.4"]), Survival_status = apply.decode(pData(res)[, "characteristics_ch1.3"]), Survival_time = apply.decode(pData(res)[, "characteristics_ch1.2"]))
#
# pData(Study_Res_New[[id]]) <- p_data
#
# ### assign feature data
# f_data <- fData(res)
# colnames(f_data) <- make.names(colnames(f_data))
# # make data have some sort, and only keep id, gene symbol and gene id..
# f_data <- f_data[featureNames(Study_Res_New[[id]]), c("ID", "Gene.title", "Gene.symbol", "Gene.ID")]
# # make sure the names of probe are same
# all(rownames(f_data) == featureNames(Study_Res_New[[id]]))
# # assign
# fData(Study_Res_New[[id]]) <- f_data
#
# ## remove some variabes and do next one
# rm(f_data, p_data, res, rev_nams, keepIDs, ident_id); gc()
# ######## #1 finished


######### #2
# id <- GEO_IDs[2]
# id
# # judge the two dataset have same sort for samples
# all(sampleNames(Study_Mat[[id]]) == sampleNames(Study_Res[[id]]))
# View(pData(Study_Mat[[id]]))
# keepIDs <- c()
# # rev_nams <- c("ADC", "healthy") # This dataset is LUAD dataset
# ident_id <- "source_name_ch1"
# if (!exists(x="rev_nams")){
#     rev_nams <- names(table(pData(Study_Mat[[id]])[, ident_id]))
# }
#
# keepIDs <- as.character(pData(Study_Mat[[id]])[which(as.character(pData(Study_Mat[[id]])[, ident_id])%in%rev_nams), "geo_accession"])
# # keepIDs <- as.character(pData(Study_Mat[[id]])[which(apply.decode((pData(Study_Mat[[id]])[, ident_id]))%in%rev_nams), "geo_accession"])
#
# # keep LUAD and Normal samples
# res <- Study_Mat[[id]][ , as.character(Study_Mat[[id]]$geo_accession)%in%keepIDs]
# Study_Res_New[[id]] <- Study_Res[[id]][, sampleNames(Study_Res[[id]])%in%keepIDs]
# # make sure they have sample sort
# Study_Res_New[[id]] <- Study_Res_New[[id]][, sampleNames(res)]
#
# # construct a new pheno data
# p_data <- construct_PhenoData(Study_ID = id, Sample_ID = as.character(res$geo_accession), Platform = annotation(res), Type = as.character(pData(res)[, "source_name_ch1"]), Gender = apply.decode(pData(res)[, "characteristics_ch1"]), Age = apply.decode(pData(res)[, "characteristics_ch1.1"]), Tumor_stage = apply.decode(pData(res)[, "characteristics_ch1.3"]), Smoking_status = apply.decode(pData(res)[, "characteristics_ch1.2"]))
#
# pData(Study_Res_New[[id]]) <- p_data
#
# ### assign feature data
# f_data <- fData(res)
# colnames(f_data) <- make.names(colnames(f_data))
# # make data have some sort, and only keep id, gene symbol and gene id..
# f_data <- f_data[featureNames(Study_Res_New[[id]]), c("ID", "Gene.title", "Gene.symbol", "Gene.ID")]
# # make sure the names of probe are same
# all(rownames(f_data) == featureNames(Study_Res_New[[id]]))
# # assign
# fData(Study_Res_New[[id]]) <- f_data
#
# ## remove some variabes and do next one
# rm(f_data, p_data, res, rev_nams, keepIDs, ident_id); gc()
# ######## #2 finished
ShixiangWang/iProfile documentation built on May 11, 2019, 6:25 p.m.