iProfile: integrated cancer Profile using multiple datasets

# Function
#   step 3: integrate multiple datasets (ExpressionSet)

# global setting
source(paste(system.file(pattern="tests",package="iProfile"),"step0-setup-global-setting.R", sep="/"))
load("inst/tests/data/RMA_processed_ExpressionSet_of_11_GEO_Studies_luad.RData")

identifier <- "SYMBOL"
# identifier <- "ENTREZ_ID"

data(hugo) # load hugo gene symbol dataframe

# database id you want, most useful is gene symbols and entrez ids
if(identifier == "SYMBOL"){
    Features <- as.character(unique(hugo[, "symbol"]))
}else if(identifier == "ENTREZ_ID"){
    Features <- as.character(unique(hugo[, "entrez_id"]))
}

# rm(hugo);gc()

# merge pheno data from multiple datasets
# merged_pData <- Reduce(rbind, lapply(Study_Res_New, function(x)pData(x)))

# re-construct feature data by identifier
library(reshape2)
# Res <- lapply(Study_Res_New, ArrayBuildfData)
Res <- list()
for (i in 1:length(GEO_IDs)){
    Res[[GEO_IDs[i]]] <- ArrayBuildfData(exprz = Study_Res_New[[GEO_IDs[i]]], name = GEO_IDs[i], identifier = "SYMBOL")
}

################# Build a ExpressionSet
Eset <- ArrayMerge(Exprs_list = "Res", Features = Features)

pdf(paste0(test_dir, "/result0/histogram_of_11_datasets_processed_by_RMA_before_batch_removed.pdf"), width = 6, height = 4)
hist(Eset, main="Histogram of 11 datasets processed by RMA")
dev.off()

# tidy pData
p_Data <- pData(Eset)
write.csv(p_Data, file=paste0(test_dir, "/result0/11_datasets_processed_by_RMA_before_tidy.csv"))

str(p_Data)

# Type
table(p_Data$Type)
type_normal <- grep(pattern = "(normal)|(healthy)", x = names(table(p_Data$Type)), ignore.case = T, value = T)
type_luad   <- setdiff(names(table(p_Data$Type)), type_normal)
p_Data[p_Data$Type%in%type_luad, ]$Type <- "LUAD"
p_Data[p_Data$Type%in%type_normal, ]$Type <- "Normal"
table(p_Data$Type)

# Gender
table(p_Data$Gender)
type_woman <- grep(pattern = "f", x = names(table(p_Data$Gender)), ignore.case = T, value = T)
type_man   <- grep(pattern = "(^male)|(M$)", x = names(table(p_Data$Gender)), ignore.case = T, value = T)
p_Data[p_Data$Gender%in%type_woman, ]$Gender <- "F"
p_Data[p_Data$Gender%in%type_man, ]$Gender <- "M"
# p_Data[p_Data$Gender=="Not available", ]$Gender <- NA
table(p_Data$Gender)

# Tumor_stage
table(p_Data$Tumor_stage)
type_normal <- grep(pattern = "(normal)|(healthy)", x = names(table(p_Data$Type)), ignore.case = T, value = T)
type_luad   <- setdiff(names(table(p_Data$Type)), type_normal)
p_Data[p_Data$Type%in%type_luad, ]$Type <- "LUAD"
p_Data[p_Data$Type%in%type_normal, ]$Type <- "Normal"
table(p_Data$Type)

# Smoking_status
table(p_Data$Smoking_status)
p_Data[p_Data$Smoking_status%in%c("--", "Unable to determine", "Unknown", "NA"), ]$Smoking_status <- "Unknown"
p_Data[p_Data$Smoking_status%in%c("Current", "Current Smoker", "Current Use", "Currently smoking", "Ever-smoker"), ]$Smoking_status <- "Current Smoker"
p_Data[p_Data$Smoking_status%in%c("Ex-smoker", "Former Smoker", "Previous Use", "Smoked in the past"), ]$Smoking_status <- "Former Smoker"
p_Data[p_Data$Smoking_status%in%c("Never", "Never smoked", "Never Smoked", "Never Used", "Never-smoker"), ]$Smoking_status <- "Never Smoker"

# Mutation_status
table(p_Data$Mutation_status)

# Survival_status
table(p_Data$Survival_status)
p_Data[p_Data$Survival_status%in%c("alive", "Alive", "dead:no"), ]$Survival_status <- "Alive"
p_Data[p_Data$Survival_status%in%c("dead", "Dead", "dead:yes", "deceased"), ]$Survival_status <- "Dead"

# Survial_time
p_Data$Survival_time <- as.numeric(p_Data$Survival_time)
# plot(p_Data$Survival_time)
# some data using month, some using days, years, we need to change them to month as unit
# 1
View(subset(p_Data, Study_ID==GEO_IDs[1]))
# 2
View(subset(p_Data, Study_ID==GEO_IDs[2]))
# 3
View(subset(p_Data, Study_ID==GEO_IDs[3]))
p_Data[p_Data$Study_ID==GEO_IDs[3], ]$Survival_time <- p_Data[p_Data$Study_ID==GEO_IDs[3], ]$Survival_time/30
# 4
View(subset(p_Data, Study_ID==GEO_IDs[4]))
# 5
View(subset(p_Data, Study_ID==GEO_IDs[5]))
# 6
View(subset(p_Data, Study_ID==GEO_IDs[6]))
# 7
View(subset(p_Data, Study_ID==GEO_IDs[7]))
# 8
View(subset(p_Data, Study_ID==GEO_IDs[8]))
# 9
View(subset(p_Data, Study_ID==GEO_IDs[9]))
p_Data[p_Data$Study_ID==GEO_IDs[9], ]$Survival_time <- p_Data[p_Data$Study_ID==GEO_IDs[9], ]$Survival_time*12
# 10
View(subset(p_Data, Study_ID==GEO_IDs[10]))
p_Data[p_Data$Study_ID==GEO_IDs[10], ]$Survival_time <- p_Data[p_Data$Study_ID==GEO_IDs[10], ]$Survival_time/30
# 11
View(subset(p_Data, Study_ID==GEO_IDs[11]))

hist(p_Data$Survival_time)
########

write.csv(p_Data, file=paste0(test_dir, "/result0/11_datasets_processed_by_RMA_after_tidy.csv"))
# assign back
pData(Eset) <- p_Data
# save
save(Eset, file=paste(data_dir,"Merged_ExpressionSet_of_11_GEO_Studies_processedBy_RMA.RData", sep="/"))

ShixiangWang/iProfile documentation built on May 11, 2019, 6:25 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

ShixiangWang/iProfile
integrated cancer Profile using multiple datasets

inst/tests/step3-integrate-multiple-datasets.R
In ShixiangWang/iProfile: integrated cancer Profile using multiple datasets

R Package Documentation

Browse R Packages

We want your feedback!

ShixiangWang/iProfile integrated cancer Profile using multiple datasets

inst/tests/step3-integrate-multiple-datasets.R In ShixiangWang/iProfile: integrated cancer Profile using multiple datasets

R Package Documentation

Browse R Packages

We want your feedback!

ShixiangWang/iProfile
integrated cancer Profile using multiple datasets

inst/tests/step3-integrate-multiple-datasets.R
In ShixiangWang/iProfile: integrated cancer Profile using multiple datasets