# Function
# step 2: combine preprocessed expression data with PhenoData which retrieve from Series Matrix files
# global setting
source(paste(system.file(pattern="tests",package="iProfile"),"step0-setup-global-setting.R", sep="/"))
# load preprocessed data
# load(paste(data_dir,"preprocessed_ExpressionSet_of_10_GEO_Studies.Rdata", sep="/"))
load(paste(data_dir,"preprocessed_ExpressionSet_of_11_GEO_Studies.Rdata", sep="/"))
# # check these data
# sapply(Study_Mat, function(x)dim(x))
# sapply(Study_Mat_NoAnnot, function(x)dim(x))
# sapply(Study_Res, function(x)dim(x))
#
# sapply(Study_Mat, function(x)fvarLabels(x))
# sapply(Study_Mat_NoAnnot, function(x)fvarLabels(x))
# sapply(Study_Res, function(x)fvarLabels(x))
#
# sapply(Study_Mat, function(x)varLabels(x))
# sapply(Study_Mat_NoAnnot, function(x)varLabels(x))
# sapply(Study_Res, function(x)varLabels(x))
#
# sapply(Study_Mat, function(x)annotation(x))
# They seem good
# Study_Mat_NoAnnot as backup of Study_Mat
# So, next we use Study_Mat and Study_Res.
# rm(Study_Mat_NoAnnot);gc()
#######################################################################################
######################## The next step processed by hand ##############################
#######################################################################################
# Due to different varLabels in different datasets, we have to process handly one by one
# There are two main targets for this step:
# first, we only keep LUAD samples
# second, we re-organize the phenoData
# 1. Study_ID, like GSEXXXXX
# 2. Sample_ID, like GSMXXXXX
# 3. Platform, like GPL96 or GPL570
# 4. Type, tell the tissue type, such as Normal, LUAD ..
# 5. Gender
# 6. Age
# 7. Tumor_stage
# 8. Family_history
# 9. Smoking_status
# 10. Metastasis_status
# 11. Tumor_feature
# 12. Survival_status
# 13. Survival_time
# 14. Preprocessed_method
# 15. Matched_CNV_ID
# grepl()
#
## test create_Exprset function
test <- create_Exprset(Mat = Study_Mat, Res = Study_Res, id = "GSE19188", rev_nams = c("ADC", "healthy"), keepSource = "characteristics_ch1.1", Gender = "characteristics_ch1.4", Survival_status = "characteristics_ch1.3", Survival_time = "characteristics_ch1.2")
test2 <- create_Exprset(Mat = Study_Mat, Res = Study_Res, id = "GSE10072", keepSource = "source_name_ch1", Gender = "characteristics_ch1", Age = "characteristics_ch1.1", Tumor_stage = "characteristics_ch1.3", Smoking_status = "characteristics_ch1.2")
## The function is work
# create a new list for storing final result
Study_Res_New <- list()
########## #1
id <- GEO_IDs[1]
# check what arguments we should assign
View(pData(Study_Mat[[id]]))
Study_Res_New[[id]] <- create_Exprset(Mat = Study_Mat, Res = Study_Res, id = id, rev_nams = c("ADC", "healthy"), keepSource = "characteristics_ch1.1", Gender = "characteristics_ch1.4", Survival_status = "characteristics_ch1.3", Survival_time = "characteristics_ch1.2")
View(pData(Study_Res_New[[id]]))
########## #1 finished
########## #2
id <- GEO_IDs[2]
# check what arguments we should assign
View(pData(Study_Mat[[id]]))
Study_Res_New[[id]] <- create_Exprset(Mat = Study_Mat, Res = Study_Res, id = id, keepSource = "source_name_ch1", Gender = "characteristics_ch1", Age = "characteristics_ch1.1", Tumor_stage = "characteristics_ch1.3", Smoking_status = "characteristics_ch1.2")
View(pData(Study_Res_New[[id]]))
######### #2 finished
######### #3
id <- GEO_IDs[3]
# check what arguments we should assign
View(pData(Study_Mat[[id]]))
# p_data <- pData(Study_Mat[[id]])
# write.table(p_data, file="./inst/tests/wrong_GSE31210_data.csv", sep = ",", quote = FALSE, row.names = FALSE)
# fix it , "," is not a good separator, some unuseful information discard by me
p_data <- read.table(file="./inst/tests/fix_GSE31210_data.csv", header = TRUE, sep=",", quote = "")
rownames(p_data) <- p_data$geo_accession
pData(Study_Mat[[id]]) <- p_data
View(pData(Study_Mat[[id]]))
Study_Res_New[[id]] <- create_Exprset(Mat = Study_Mat, Res = Study_Res, id = id, keepSource = "characteristics_ch1",Gender = "characteristics_ch1.2", Age = "characteristics_ch1.1", Tumor_stage = "characteristics_ch1.5", Smoking_status = "characteristics_ch1.3", Mutation_status = "characteristics_ch1.7", Survival_status = "characteristics_ch1.13", Survival_time = "characteristics_ch1.14", Tumor_feature = "characteristics_ch1.10")
View(pData(Study_Res_New[[id]]))
######### #3 finished
########## #4
id <- GEO_IDs[4]
# check what arguments we should assign
View(pData(Study_Mat[[id]]))
Study_Res_New[[id]] <- create_Exprset(Mat = Study_Mat, Res = Study_Res, id = id, keepSource = "characteristics_ch1", rev_nams = c("Adjacent normal part of adenocarcinoma", "Tumor part of adenocarcinoma", "Adenocarcinoma", "Normal lung tissues (Cat. 735020)", "Normal lung tissues (Cat. 636524)"), Gender = "characteristics_ch1.1")
View(pData(Study_Res_New[[id]]))
######### #4 finished
########## #5
id <- GEO_IDs[5]
# check what arguments we should assign
View(pData(Study_Mat[[id]]))
Study_Res_New[[id]] <- create_Exprset(Mat = Study_Mat, Res = Study_Res, id = id, keepSource = "characteristics_ch1", Gender = "characteristics_ch1.1", Age = "characteristics_ch1.2", Tumor_feature = "characteristics_ch1.7", Smoking_status = "characteristics_ch1.12", Survival_status = "characteristics_ch1.4", Survival_time = "characteristics_ch1.11")
View(pData(Study_Res_New[[id]]))
######### #5 finished
########## #6
id <- GEO_IDs[6]
# check what arguments we should assign
View(pData(Study_Mat[[id]]))
Study_Res_New[[id]] <- create_Exprset(Mat = Study_Mat, Res = Study_Res, id = id, keepSource = "characteristics_ch1", Gender = "characteristics_ch1.2", Age = "characteristics_ch1.3", Tumor_stage = "characteristics_ch1.4", Tumor_feature = "source_name_ch1")
View(pData(Study_Res_New[[id]]))
######### #6 finished
########## #7
id <- GEO_IDs[7]
# check what arguments we should assign
View(pData(Study_Mat[[id]]))
Study_Res_New[[id]] <- create_Exprset(Mat = Study_Mat, Res = Study_Res, id = id, keepSource = "source_name_ch1", Gender = "characteristics_ch1.4", Age = "characteristics_ch1.3", Tumor_stage = "characteristics_ch1.5", Tumor_feature = "characteristics_ch1.2")
View(pData(Study_Res_New[[id]]))
######### #7 finished
########## #8
id <- GEO_IDs[8]
# check what arguments we should assign
View(pData(Study_Mat[[id]]))
# p_data <- pData(Study_Mat[[id]])
# write.table(p_data, file="./inst/tests/wrong_GSE43580_data.tsv", sep = "\t", quote = FALSE, row.names = FALSE)
# fix it , "," is not a good separator, some unuseful information discard by me
p_data <- read.table(file="./inst/tests/fix_GSE43580_data.tsv", header = TRUE, sep="\t", quote = "")
rownames(p_data) <- p_data$geo_accession
pData(Study_Mat[[id]]) <- p_data
View(pData(Study_Mat[[id]]))
Study_Res_New[[id]] <- create_Exprset(Mat = Study_Mat, Res = Study_Res, id = id, keepSource = "source_name_ch1", rev_nams = c("\"Adenocarcinoma, stage 1\"", "\"Adenocarcinoma, stage 2\""), Gender = "characteristics_ch1", Age = "characteristics_ch1.1", Tumor_stage = "characteristics_ch1.14", Smoking_status = "characteristics_ch1.7", Tumor_feature = "characteristics_ch1.13")
View(pData(Study_Res_New[[id]]))
######### #8 finished
########## #9
id <- GEO_IDs[9]
# check what arguments we should assign
View(pData(Study_Mat[[id]]))
Study_Res_New[[id]] <- create_Exprset(Mat = Study_Mat, Res = Study_Res, id = id, keepSource = "characteristics_ch1.1", rev_nams = "adenocarcinoma", Gender = "characteristics_ch1", Age = "characteristics_ch1.6", Tumor_stage = "characteristics_ch1.5", Smoking_status = "characteristics_ch1.7", Tumor_feature = "characteristics_ch1.11", Survival_status = "characteristics_ch1.9", Survival_time = "characteristics_ch1.8")
pData(Study_Res_New[[id]])[,"Tumor_feature"] <- paste("recurrence", pData(Study_Res_New[[id]])[, "Tumor_feature"]
, sep=":")
View(pData(Study_Res_New[[id]]))
######### #9 finished
########## #10
id <- GEO_IDs[10]
# check what arguments we should assign
View(pData(Study_Mat[[id]]))
Study_Res_New[[id]] <- create_Exprset(Mat = Study_Mat, Res = Study_Res, id = id, keepSource = "characteristics_ch1.2", rev_nams = "adeno",Gender = "characteristics_ch1.6", Age = "characteristics_ch1.5", Tumor_stage = "characteristics_ch1.3", Tumor_feature = "characteristics_ch1.8", Survival_status = "characteristics_ch1", Survival_time = "characteristics_ch1.1")
pData(Study_Res_New[[id]])[,"Tumor_feature"] <- paste("recurrence", pData(Study_Res_New[[id]])[, "Tumor_feature"]
, sep=":")
pData(Study_Res_New[[id]])[,"Survival_status"] <- paste("dead", pData(Study_Res_New[[id]])[, "Survival_status"]
, sep=":")
View(pData(Study_Res_New[[id]]))
######### #10 finished
########## #11
id <- GEO_IDs[11]
# check what arguments we should assign
View(pData(Study_Mat[[id]]))
Study_Res_New[[id]] <- create_Exprset(Mat = Study_Mat, Res = Study_Res, id = id, keepSource = "characteristics_ch1.1", rev_nams = "adenocarcinoma", Tumor_feature = "characteristics_ch1.2")
pData(Study_Res_New[[id]])[,"Tumor_feature"] <- paste("survival", pData(Study_Res_New[[id]])[, "Tumor_feature"]
, sep=":")
View(pData(Study_Res_New[[id]]))
######### #11 finished
######### All 11 datasets finshed, we check their phenodata one more time
# rm(id);gc()
#
# id <- GEO_IDs[1]
# View(pData(Study_Res_New[[id]]))
#
# id <- GEO_IDs[2]
# View(pData(Study_Res_New[[id]]))
#
# id <- GEO_IDs[3]
# View(pData(Study_Res_New[[id]]))
# View(pData(Study_Mat[[id]]))
#
# id <- GEO_IDs[4]
# View(pData(Study_Res_New[[id]]))
# # View(pData(Study_Mat[[id]]))
#
# id <- GEO_IDs[5]
# View(pData(Study_Res_New[[id]]))
# # View(pData(Study_Mat[[id]]))
#
# id <- GEO_IDs[6]
# View(pData(Study_Res_New[[id]]))
#
# id <- GEO_IDs[7]
# View(pData(Study_Res_New[[id]]))
#
# id <- GEO_IDs[8]
# View(pData(Study_Res_New[[id]]))
#
# id <- GEO_IDs[9]
# View(pData(Study_Res_New[[id]]))
#
# id <- GEO_IDs[10]
# View(pData(Study_Res_New[[id]]))
#########
######## save data
save(Study_Mat, file=paste(data_dir,"fixed_ExpressionSet_of_11_GEO_Studies.RData", sep="/"))
save(Study_Res_New, file=paste(data_dir,"RMA_processed_ExpressionSet_of_11_GEO_Studies_luad.RData", sep="/"))
###### compare all the results using RMA and processed by other researchers
# hist(Study_Res_New[[1]])
# hist(Study_Mat[[1]])
#
# hist(Study_Res_New[[2]])
# hist(Study_Mat[[2]])
#
# hist(Study_Res_New[[3]])
# hist(Study_Mat[[3]])
#
# hist(Study_Res_New[[4]])
# hist(Study_Mat[[4]])
#
# hist(Study_Res_New[[5]])
# hist(Study_Mat[[5]])
#
# hist(Study_Res_New[[6]])
# hist(Study_Mat[[6]])
#
# hist(Study_Res_New[[7]])
# hist(Study_Mat[[7]])
#
# hist(Study_Res_New[[8]])
# hist(Study_Mat[[8]])
#
# hist(Study_Res_New[[9]])
# hist(Study_Mat[[9]])
#
# hist(Study_Res_New[[10]])
# hist(Study_Mat[[10]])
##########################################################################
# plot them
# install patchwork
# library(devtools)
# devtools::install_github("thomasp85/patchwork")
opar <- par(no.readonly = TRUE)
# par(mfrow=c(2,1))
result_dir <- paste(test_dir, "result0", sep="/")
#1
pdf(file = paste0(result_dir, "/comparePreprocessMethod_bet_RMAandGEO_of_", GEO_IDs[1],".pdf"), width = 5, height = 6)
par(fig=c(0,1,0.5,1))
hist(Study_Res_New[[1]], main="GSE19188")
par(fig=c(0,1,0,0.5), new=TRUE)
hist(Study_Mat[[1]])
dev.off()
#2
pdf(file = paste0(result_dir, "/comparePreprocessMethod_bet_RMAandGEO_of_", GEO_IDs[2],".pdf"), width = 5, height = 6)
par(fig=c(0,1,0.5,1))
hist(Study_Res_New[[2]], main="GSE10072")
par(fig=c(0,1,0,0.5), new=TRUE)
hist(Study_Mat[[2]])
dev.off()
#3
pdf(file = paste0(result_dir, "/comparePreprocessMethod_bet_RMAandGEO_of_", GEO_IDs[3],".pdf"), width = 5, height = 6)
par(fig=c(0,1,0.5,1))
hist(Study_Res_New[[3]], main="GSE31210")
par(fig=c(0,1,0,0.5), new=TRUE)
hist(Study_Mat[[3]], xlab="intensity")
dev.off()
#4
pdf(file = paste0(result_dir, "/comparePreprocessMethod_bet_RMAandGEO_of_", GEO_IDs[4],".pdf"), width = 5, height = 6)
par(fig=c(0,1,0.5,1))
hist(Study_Res_New[[4]], main="GSE7670")
par(fig=c(0,1,0,0.5), new=TRUE)
hist(Study_Mat[[4]], xlab="intensity")
dev.off()
#5
pdf(file = paste0(result_dir, "/comparePreprocessMethod_bet_RMAandGEO_of_", GEO_IDs[5],".pdf"), width = 5, height = 6)
par(fig=c(0,1,0.5,1))
hist(Study_Res_New[[5]], main="GSE68465")
par(fig=c(0,1,0,0.5), new=TRUE)
hist(Study_Mat[[5]], xlab="intensity")
dev.off()
#6
pdf(file = paste0(result_dir, "/comparePreprocessMethod_bet_RMAandGEO_of_", GEO_IDs[6],".pdf"), width = 5, height = 6)
par(fig=c(0,1,0.5,1))
hist(Study_Res_New[[6]], main="GSE27716")
par(fig=c(0,1,0,0.5), new=TRUE)
hist(Study_Mat[[6]])
dev.off()
#7
pdf(file = paste0(result_dir, "/comparePreprocessMethod_bet_RMAandGEO_of_", GEO_IDs[7],".pdf"), width = 5, height = 6)
par(fig=c(0,1,0.5,1))
hist(Study_Res_New[[7]], main="GSE17475")
par(fig=c(0,1,0,0.5), new=TRUE)
hist(Study_Mat[[7]], xlab="intensity")
dev.off()
#8
pdf(file = paste0(result_dir, "/comparePreprocessMethod_bet_RMAandGEO_of_", GEO_IDs[8],".pdf"), width = 5, height = 6)
par(fig=c(0,1,0.5,1))
hist(Study_Res_New[[8]], main="GSE43580")
par(fig=c(0,1,0,0.5), new=TRUE)
hist(Study_Mat[[8]], xlab="intensity")
dev.off()
#9
pdf(file = paste0(result_dir, "/comparePreprocessMethod_bet_RMAandGEO_of_", GEO_IDs[9],".pdf"), width = 5, height = 6)
par(fig=c(0,1,0.5,1))
hist(Study_Res_New[[9]], main="GSE50081")
par(fig=c(0,1,0,0.5), new=TRUE)
hist(Study_Mat[[9]])
dev.off()
#10
pdf(file = paste0(result_dir, "/comparePreprocessMethod_bet_RMAandGEO_of_", GEO_IDs[10],".pdf"), width = 5, height = 6)
par(fig=c(0,1,0.5,1))
hist(Study_Res_New[[10]], main="GSE37745")
par(fig=c(0,1,0,0.5), new=TRUE)
hist(Study_Mat[[10]])
dev.off()
#11
pdf(file = paste0(result_dir, "/comparePreprocessMethod_bet_RMAandGEO_of_", GEO_IDs[11],".pdf"), width = 5, height = 6)
par(fig=c(0,1,0.5,1))
hist(Study_Res_New[[10]], main=GEO_IDs[11])
par(fig=c(0,1,0,0.5), new=TRUE)
hist(Study_Mat[[11]])
dev.off()
par(opar)
###########################################################################
######### #1
# id <- GEO_IDs[1]
# id
# # judge the two dataset have same sort for samples
# all(sampleNames(Study_Mat[[id]]) == sampleNames(Study_Res[[id]]))
# View(pData(Study_Mat[[id]]))
# keepIDs <- c()
# rev_nams <- c("ADC", "healthy")
# ident_id <- "characteristics_ch1.1"
# keepIDs <- as.character(pData(Study_Mat[[id]])[which(apply.decode((pData(Study_Mat[[id]])[, ident_id]))%in%rev_nams), "geo_accession"])
#
# # keep LUAD and Normal samples
# res <- Study_Mat[[id]][ , as.character(Study_Mat[[id]]$geo_accession)%in%keepIDs]
# Study_Res_New[[id]] <- Study_Res[[id]][, sampleNames(Study_Res[[id]])%in%keepIDs]
# # make sure they have sample sort
# Study_Res_New[[id]] <- Study_Res_New[[id]][, sampleNames(res)]
#
# # construct a new pheno data
# p_data <- construct_PhenoData(Study_ID = id, Sample_ID = as.character(res$geo_accession), Platform = annotation(res), Type = apply.decode(pData(res)[, "characteristics_ch1.1"]), Gender = apply.decode(pData(res)[, "characteristics_ch1.4"]), Survival_status = apply.decode(pData(res)[, "characteristics_ch1.3"]), Survival_time = apply.decode(pData(res)[, "characteristics_ch1.2"]))
#
# pData(Study_Res_New[[id]]) <- p_data
#
# ### assign feature data
# f_data <- fData(res)
# colnames(f_data) <- make.names(colnames(f_data))
# # make data have some sort, and only keep id, gene symbol and gene id..
# f_data <- f_data[featureNames(Study_Res_New[[id]]), c("ID", "Gene.title", "Gene.symbol", "Gene.ID")]
# # make sure the names of probe are same
# all(rownames(f_data) == featureNames(Study_Res_New[[id]]))
# # assign
# fData(Study_Res_New[[id]]) <- f_data
#
# ## remove some variabes and do next one
# rm(f_data, p_data, res, rev_nams, keepIDs, ident_id); gc()
# ######## #1 finished
######### #2
# id <- GEO_IDs[2]
# id
# # judge the two dataset have same sort for samples
# all(sampleNames(Study_Mat[[id]]) == sampleNames(Study_Res[[id]]))
# View(pData(Study_Mat[[id]]))
# keepIDs <- c()
# # rev_nams <- c("ADC", "healthy") # This dataset is LUAD dataset
# ident_id <- "source_name_ch1"
# if (!exists(x="rev_nams")){
# rev_nams <- names(table(pData(Study_Mat[[id]])[, ident_id]))
# }
#
# keepIDs <- as.character(pData(Study_Mat[[id]])[which(as.character(pData(Study_Mat[[id]])[, ident_id])%in%rev_nams), "geo_accession"])
# # keepIDs <- as.character(pData(Study_Mat[[id]])[which(apply.decode((pData(Study_Mat[[id]])[, ident_id]))%in%rev_nams), "geo_accession"])
#
# # keep LUAD and Normal samples
# res <- Study_Mat[[id]][ , as.character(Study_Mat[[id]]$geo_accession)%in%keepIDs]
# Study_Res_New[[id]] <- Study_Res[[id]][, sampleNames(Study_Res[[id]])%in%keepIDs]
# # make sure they have sample sort
# Study_Res_New[[id]] <- Study_Res_New[[id]][, sampleNames(res)]
#
# # construct a new pheno data
# p_data <- construct_PhenoData(Study_ID = id, Sample_ID = as.character(res$geo_accession), Platform = annotation(res), Type = as.character(pData(res)[, "source_name_ch1"]), Gender = apply.decode(pData(res)[, "characteristics_ch1"]), Age = apply.decode(pData(res)[, "characteristics_ch1.1"]), Tumor_stage = apply.decode(pData(res)[, "characteristics_ch1.3"]), Smoking_status = apply.decode(pData(res)[, "characteristics_ch1.2"]))
#
# pData(Study_Res_New[[id]]) <- p_data
#
# ### assign feature data
# f_data <- fData(res)
# colnames(f_data) <- make.names(colnames(f_data))
# # make data have some sort, and only keep id, gene symbol and gene id..
# f_data <- f_data[featureNames(Study_Res_New[[id]]), c("ID", "Gene.title", "Gene.symbol", "Gene.ID")]
# # make sure the names of probe are same
# all(rownames(f_data) == featureNames(Study_Res_New[[id]]))
# # assign
# fData(Study_Res_New[[id]]) <- f_data
#
# ## remove some variabes and do next one
# rm(f_data, p_data, res, rev_nams, keepIDs, ident_id); gc()
# ######## #2 finished
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.