script/05_DataprocLungCancer.R

library(GEOquery)
library(stringr)
library(affy)
library(hgu133plus2hsentrezgcdf)
library(biomaRt)
library(MatrixEQTL)
library(org.Hs.eg.db)
library(clusterProfiler)
library("BSgenome.Hsapiens.UCSC.hg19")
library(SparseSEMs)
library(igraph)
library(limma)
library(synbreed)      #### Beagle are used for missing genotype imputation
library(gage)
library(httr)
library(XML)

gse =
  getGEO(filename = "/media/xinchou/Storage/SMLfl/exp/GSE33356_family.soft.gz")
gse1 =
  GEOquery:::parseGSEMatrix(
    "/media/xinchou/Storage/SMLfl/exp/GSE33356-GPL570_series_matrix.txt.gz",
    destdir = "exp",
    AnnotGPL = FALSE,
    getGPL = F
  )
gse2 =
  GEOquery:::parseGSEMatrix(
    "/media/xinchou/Storage/SMLfl/exp/GSE33356-GPL6801_series_matrix.txt.gz",
    destdir = "exp",
    AnnotGPL = FALSE,
    getGPL = F
  )

platforms = lapply(GSMList(gse), function(x) {
  Meta(x)$platform_id
})

# step-2 Retrieve prob expression levels and log-transform them
exprlst = BiocGenerics::Filter(function(gsm) {
  Meta(gsm)$platform_id == 'GPL570'
},  GSMList(gse))

probes = Table(GPLList(gse)[[1]])$ID

probesExprmat = do.call('cbind', lapply(exprlst, function(x) {
  tab = Table(x)
  match = match(probes, tab$ID_REF)
  tab$VALUE[match]
}))

probesExprmat = apply(probesExprmat, 2, function(x) {
  as.numeric(as.character(x))
})
probesExprmat = log2(probesExprmat)
rownames(probesExprmat) = as.character(probes)
## dim(probesExprmat) ## [1] 54675   120     ## 54675 probes of 120 samples


## step-3 Retrieve SNP of corresponding patients
SNPlst = BiocGenerics::Filter(function(gsm) {
  Meta(gsm)$platform_id == 'GPL6801'
},  GSMList(gse))
SNPID  = as.character(Table(GPLList(gse)[[2]])$ID)
SNPlib = read.csv(
  "/media/xinchou/Storage/SMLfl/exp/GenomeWideSNP_6.na29.annot.csv",
  comment.char = "#",
  sep = ",",
  stringsAsFactors = F
)
### only keep SNP with location information
SNPlib = SNPlib[SNPlib[,3] != "---" & SNPlib[,4] != "---",]
### build snp's has table
SNPmap = SNPlib[, c(2, 3, 4)]
SNPmap = unique(SNPmap)
rownames(SNPmap) = SNPmap[,1]
SNPhash = SNPlib[, 2]
names(SNPhash) = SNPlib[, 1]
### filter SNP's hash table
SNPhash = SNPhash[!duplicated(SNPhash)]
SNPvarmat = do.call('cbind', lapply(SNPlst, function(x) {
  tab = Table(x)
  mid = match(SNPID, tab$ID_REF)
  as.character(tab$VALUE[mid])
}))
rownames(SNPvarmat) = SNPID
FilterSNP = intersect(SNPID, names(SNPhash))
SNPvarmat = SNPvarmat[FilterSNP, , drop = F]
snpnames = SNPhash[rownames(SNPvarmat)]
names(snpnames) = NULL
rownames(SNPvarmat) = snpnames  ## SNPID --> RS____ ID
### transform character of SNP to numeric
## SNPvarmat[SNPvarmat == "AA"] = 01
## SNPvarmat[SNPvarmat == "AB"] = 02
## SNPvarmat[SNPvarmat == "BB"] = 03
SNPvarmat[SNPvarmat == "NC"] = NA
### remove unchanged SNP and all Missing NA
### impute missing NA in SNP matrix
SNPvarmat = t(SNPvarmat)
SNPmap = SNPmap[colnames(SNPvarmat),c(2,3)]
colnames(SNPmap) = c("chr", "pos")
SNPmap[,2] = as.numeric(SNPmap[,2])
## dim(SNPvarmat) ## [1]    122 930002
PData2 = phenoData(gse2$eset)                     # SNP
SNPPheno = PData2@data[rownames(SNPvarmat), c(10, 11)]
SNPPheno[,1] = as.numeric(SNPPheno[,1])
SNPPheno[,2] = 2 - as.numeric(SNPPheno[,2])
colnames(SNPPheno) = c("Gender", "Status")
SNPData = create.gpData(pheno = SNPPheno, geno = SNPvarmat, map = SNPmap, map.unit = "bp")
SNPImputed = codeGeno(SNPData, impute=TRUE, impute.type="beagle", cores = 4)
SNPvarmat = t(SNPImputed$geno)

## map back to "AA", "AB", "BB"
ImputeData = SNPImputed$geno
RawData = SNPData$geno[,colnames(ImputeData)]
RawData[RawData == "AA"] = "0"
RawData[RawData == "AB"] = "1"
RawData[RawData == "BB"] = "2"
mode(ImputeData) = "character"
for (i in 1:ncol(RawData)) {
  genomap = na.omit(unique(cbind(ImputeData[,i], RawData[,i])))
  t = genomap[,2]
  names(t) = genomap[,1]
  ImputeData[,i] = t[ImputeData[,i]]
}
mode(ImputeData) = "numeric"
SNPvarmat = t(ImputeData)

## step-4 pair-data of lung cancer and normal data
PData1 = phenoData(gse1$eset)                     # GE
PData2 = phenoData(gse2$eset)                     # SNP
## samples have both Gene exprs and SNP vars
sampidGE = as.character(PData1@data$title)
sampidGE = str_sub(sampidGE, start = 13, end = -1)
sampidSNP = as.character(PData2@data$title)
sampidSNP = str_extract(sampidSNP, "[\\d]+[T|N]")
## 84 paired sample
sampleID = intersect(sampidGE, sampidSNP)
GEix = sapply(sampleID, function(id) {
  which(sampidGE == id)
})
SNPix = sapply(sampleID, function(id) {
  which(sampidSNP == id)
})
probesExprmat = probesExprmat[, GEix, drop = F]
SNPvarmat = SNPvarmat[, SNPix, drop = F]

## step-5 microarray-probe data to gene entrez id's expression
## by Brainarray CDF
## only read in 84 samples' CEL files
celfiles = paste0("/media/xinchou/Storage/SMLfl/exp/GSE33356/",
                  colnames(probesExprmat),
                  ".CEL.gz")
rawmarray = ReadAffy(filenames = celfiles)
rawmarray@cdfName = "HGU133Plus2_Hs_ENTREZG"   ## ENTREZ ID
geneExprmat = rma(rawmarray)
geneExprData = geneExprmat@assayData$exprs     ## Transform to 20414 genes
rownames(geneExprData) = sapply(str_split(rownames(geneExprData), "_"), `[`, 1)
colnames(geneExprData) = sapply(str_split(colnames(geneExprData), "\\."), `[`, 1)
geneExprData = geneExprData[,colnames(probesExprmat)]

## step-5.1 Filtering genes only protein-coding gene left. We removed anti0-sense RNA, noncoding RNA related gene
## because that these gene's functional mechanisms do not satisfiy the assumption of Gene regulatory network.
annotation2Entrez = read.table(
  "./data/geneAnnotation.txt",
  sep = "\t",
  quote = "\"",
  na.strings = "-",
  fill = TRUE,
  col.names = c("GeneID", "Symbol", "TypeOfGene"), stringsAsFactors = FALSE
)
gene4mRNA = annotation2Entrez$GeneID[annotation2Entrez$TypeOfGene == "protein-coding"]
gene4mRNA = as.character(gene4mRNA)
geneExprData = geneExprData[intersect(rownames(geneExprData), gene4mRNA), ]
## dim(geneExprData)  ## [1] 17013    84

## step-6 search eQTL w.r.t gene expression by
### MatrixEQTL detect eQTL
Libensembl = useDataset("hsapiens_gene_ensembl", mart = useMart("ensembl"))
GeneLocation = getBM(
  attributes = c(
    "ensembl_gene_id",
    "chromosome_name",
    "start_position",
    "end_position"
  ),
  mart = Libensembl
)
### Collect entrezID with ensembl
Libid = bitr(
  rownames(geneExprData),
  fromType = "ENTREZID",
  toType = "ENSEMBL",
  OrgDb = "org.Hs.eg.db"
)
### Remove 7.6% of input gene IDs are fail to map
ensembl2id = Libid$ENTREZID
names(ensembl2id) = Libid$ENSEMBL
GeneLocation = GeneLocation[GeneLocation$ensembl_gene_id %in% names(ensembl2id), , drop = F]
GeneLocation$entrezgene = ensembl2id[GeneLocation$ensembl_gene_id]
GeneLocation = GeneLocation[complete.cases(GeneLocation), , drop = F]
Chromosomes = sapply(GeneLocation$entrezgene, function(x) {
  org.Hs.egCHR[[x]]
})
GeneLocation$chromosome_name = sapply(Chromosomes, `[`, 1)
GeneLocation = unique(GeneLocation)

### Build object for MatrixEQTL
geneExprData1 = geneExprData[as.character(unique(GeneLocation$entrezgene)), , drop = F]
Exprmat = SlicedData$new()
colnames(geneExprData1) = NULL
Exprmat$initialize(geneExprData1)
#### SNP build
SNPmat = SlicedData$new()
SNPvarmat1 = SNPvarmat
colnames(SNPvarmat1) = NULL
SNPmat$initialize(SNPvarmat1)
GeneLoc = GeneLocation[as.character(GeneLocation$entrezgene) %in% rownames(geneExprData1), ]
GeneLoc = GeneLoc[, c(5, 2, 3, 4)]
colnames(GeneLoc) = c("geneid", "chr", "left", "right")
rownames(GeneLoc) = NULL
SNPLoc = data.frame(SNPlib[,c(2, 3, 4)])
SNPLoc[,3] = as.numeric(SNPLoc[,3])
SNPLoc = SNPLoc[complete.cases(SNPLoc), , drop = F]
SNPLoc = unique(SNPLoc)
colnames(SNPLoc) = c("snpid", "chr", "pos")
SNPLoc = SNPLoc[SNPLoc$snpid %in% rownames(SNPmat),]
Covariates = NULL
PData = PData1@data[colnames(probesExprmat), ]
Status = as.character(PData$characteristics_ch1)
Status[Status == "tissue: lung cancer"] = "tumor"
Status[Status != "tumor"] = "normal"
Ages = as.numeric(str_sub(
  as.character(PData$characteristics_ch1.2),
  start = 6,
  end = 7
))
Covariates = rbind(Covariates,
                   status = ifelse(Status == "normal", 0, 1),
                   age = Ages)
Covmat = SlicedData$new()
Covmat$initialize(Covariates)

### Prepare for MatrixEQTL
SNPInfo = SNPvarmat1[SNPLoc$snpid[SNPLoc$chr %in% c(as.character(seq(1,22)), "X", "Y", "MT")], ]
SNPS    = data.frame(id = rownames(SNPInfo), SNPInfo)
colnames(SNPS) = c("id", paste("Sample_", seq(1, ncol(SNPInfo)), sep=""))
write.table(SNPS, "./data/SNP.txt", quote = F, row.names = F, col.names = TRUE, sep = "\t")
GE  = data.frame(id = rownames(geneExprData1), geneExprData1)
colnames(GE) = c("id", paste("Sample_", seq(1, ncol(geneExprData)), sep=""))
write.table(GE, "./data/GE.txt", quote = F, row.names = F, col.names = TRUE, sep = "\t")
COV = data.frame(id = rownames(Covariates), Covariates)
colnames(COV) = c("id", paste("Sample_", seq(1, ncol(Covariates)), sep=""))
write.table(COV, "./data/Covariates.txt", quote = F, row.names = F, col.names = TRUE, sep = "\t")
##### Location information of SNP and Gene
saveRDS(SNPLoc, "./data/SNP.rds")
saveRDS(GeneLoc, "./data/GE.rds")

## Extract Significant eQTL from MatrixEQTL result
cis_eQTL = read.csv("./data/cis_eQTL_results_R.txt", sep = "\t", stringsAsFactors = F)
#--------------------------------------------------------------#
Normal = which(Status == "normal")
Tumor  = which(Status == "tumor")
### SNP with MAF > 0.05 are considered in HapMap, so filter eQTL with MAP (minor allele frequency) > 0.05
FilterByMAF = apply(SNPvarmat, 1, function(x) {
  MAF_N = min(c(sum(x[Normal] == 0), sum(x[Normal] == 1), sum(x[Normal] == 2))) / length(Normal)
  MAF_T = min(c(sum(x[Tumor] == 0), sum(x[Tumor] == 1), sum(x[Tumor] == 2))) / length(Tumor)
  (MAF_N > 0.05 & MAF_T > 0.05)
})
SNPFilterByMAF = names(which(FilterByMAF))
##CandidateGenes = union(HumanBaseRefGRN$G1, HumanBaseRefGRN$G2)
Significant_eQTLs = cis_eQTL[(cis_eQTL$FDR < 0.05 & cis_eQTL$SNP %in% SNPFilterByMAF), , drop = F]
## cis-eQTL mapping 585 genes with cis-eQTL
Gene2eQTL = split(Significant_eQTLs$SNP, Significant_eQTLs$gene)
## center function
center = function(X) {
  apply(X, 1, function(x) {
    x - mean(x)
  })
}
## filtered out eQTL, if two eQTL have the same pattern, remove one of them
eQTLRank = lapply(Gene2eQTL, function(g) {
  t = unique(SNPvarmat[g, Tumor, drop = F])
  n = unique(SNPvarmat[g, Normal, drop = F])
  min(qr(crossprod(center(t)))$rank, qr(crossprod(center(n)))$rank)
})
## only keep non-repeated eQTL
Gene2eQTL2 = Gene2eQTL
#for(i in 1:length(Gene2eQTL)) {
#  FDR = NULL
#  g = as.numeric(names(Gene2eQTL[i]))
#  for (s in Gene2eQTL[[i]]) {
#    FDR = c(FDR, Significant_eQTLs[Significant_eQTLs$SNP == s &
#                                     Significant_eQTLs$gene == g, 6])
#  }
#  Gene2eQTL[[i]] = Gene2eQTL[[i]][which.min(FDR)]
#}

for(i in 1:length(Gene2eQTL)) {
  if (eQTLRank[[i]] == length(Gene2eQTL[[i]])) {
    Gene2eQTL[[i]] = Gene2eQTL[[i]]
  } else if (eQTLRank[[i]] == 1 & length(Gene2eQTL[[i]]) > 1) {
    FDR = NULL
    g = as.numeric(names(Gene2eQTL[i]))
    for (s in Gene2eQTL[[i]]) {
      FDR = c(FDR, Significant_eQTLs[Significant_eQTLs$SNP == s & Significant_eQTLs$gene == g, 6])
    }
    Gene2eQTL[[i]] = Gene2eQTL[[i]][which.min(FDR)]
  } else if (eQTLRank[[i]] > 1 & length(Gene2eQTL[[i]]) > 1) {
    FDR = NULL
    g = as.numeric(names(Gene2eQTL[i]))
    for (s in Gene2eQTL[[i]]) {
      FDR = c(FDR, Significant_eQTLs[Significant_eQTLs$SNP == s & Significant_eQTLs$gene == g, 6])
    }
    index = sort.int(FDR, decreasing = F, index.return = T)$ix
    n = 0
    j = 1
    s = c(Gene2eQTL[[i]][index[j]])
    j = j + 1
    n = 1
    while (n < eQTLRank[[i]] & j < length(index)) {
      tmr = SNPvarmat[c(s, Gene2eQTL[[i]][index[j]]), Tumor, drop = F]
      nml = SNPvarmat[c(s, Gene2eQTL[[i]][index[j]]), Normal, drop = F]
      if (qr(crossprod(center(tmr)))$rank == n + 1 && qr(crossprod(center(nml)))$rank == n + 1) {
        s = c(s, Gene2eQTL[[i]][index[j]])
        n = n + 1
      }
      j = j + 1
    }
    Gene2eQTL[[i]] = s
  } else {
    Gene2eQTL[[i]] = NA
  }
}
Gene2eQTL = Gene2eQTL[!is.na(Gene2eQTL)]

## Gene feature selection, only keep those genes who are significant differentially expressed under tumor vs normal
## design = model.matrix(~ age + status, as.data.frame(t(Covariates)))
## fit0 = lmFit(GE[,-1], design)
## fite = eBayes(fit0)
## limres = topTable(fite, coef = 3, number = Inf)
## sigGene = limres[limres$adj.P.Val < 0.01, ]

## We only consider the differential GRN of genes who was detected differential expressed under tumor status
## Gene2eQTL0 = Gene2eQTL[intersect(rownames(sigGene), names(Gene2eQTL))]

### build input of FSSEM
seed = as.numeric(Sys.time())
N = sum(Status == "tumor")
Ng = length(Gene2eQTL)                                                          ## 1459 genes
Nk = sum(sapply(Gene2eQTL, length))                                             ## 2146 eQTLs
set.seed(seed)
Sk = list()
## build Sk for FSSEM
index = 0
CandidateEQTLs = NULL
for (i in 1:length(Gene2eQTL)) {
  Sk[[i]] = index + seq(1:length(Gene2eQTL[[i]]))
  index = max(Sk[[i]])
  CandidateEQTLs = c(CandidateEQTLs, Gene2eQTL[[i]])
}
CandidateGenes = names(Gene2eQTL)
Y = vector("list", 2)  ## Y[[1]] normal; Y[[2]] tumor
Y[[1]] = geneExprData[CandidateGenes, Status == "normal"]
Y[[2]] = geneExprData[CandidateGenes, Status == "tumor"]
rownames(Y[[1]]) = rownames(Y[[2]]) = NULL
X = vector("list", 2)
X[[1]] = SNPvarmat[CandidateEQTLs, Status == "normal"]
X[[2]] = SNPvarmat[CandidateEQTLs, Status == "tumor"]
rownames(X[[1]]) = rownames(X[[2]]) = NULL

data = list(
  Data = list(
    X = X, Y = Y, Sk = Sk
  ),
  Vars = list(
    Genes = CandidateGenes, eQTLs = CandidateEQTLs,
    n = N, p = Ng, k = Nk
  )
)
##saveRDS(data, "./data/luca0.10.rds")
##saveRDS(data, "./data/luca0.01.rds")
##saveRDS(data, "./data/luca0.03.rds")
##saveRDS(data, "./data/luca0.05.rds")
save.image("./script/data/beagle_filter.RData")

seed = as.numeric(Sys.time())
set.seed(seed)
data = readRDS("./script/data/luca0.01.rds")
gamma = cv.multiRegression(data$Data$X, data$Data$Y, data$Data$Sk, ngamma = 50, nfold = 5, data$Vars$n, data$Vars$p, data$Vars$k)
ifit  = multiRegression(data$Data$X, data$Data$Y, data$Data$Sk, gamma, data$Vars$n, data$Vars$p, data$Vars$k, trans = FALSE)
Xs    = data$Data$X
colnames(Xs[[1]]) = colnames(Xs[[2]]) = NULL
Ys    = data$Data$Y
colnames(Ys[[1]]) = colnames(Ys[[2]]) = NULL
Sk    = data$Data$Sk

cvfit = opt.multiFSSEMiPALM2(Xs = Xs, Ys = Ys, Bs = ifit$Bs, Fs = ifit$Fs, Sk = Sk,
                            sigma2 = ifit$sigma2, nlambda = 20, nrho = 20,
                            p = data$Vars$p, q = data$Vars$k, wt = T)


fit = multiFSSEMiPALM2(Xs = Xs, Ys = Ys, Bs = ifit$Bs, Fs = ifit$Fs, Sk = Sk,
                       sigma2 = ifit$sigma2, lambda = cvfit$lambda, rho = cvfit$rho,
                       Wl = inverseB(ifit$Bs), Wf = flinvB(ifit$Bs),
                       p = data$Vars$p, maxit = 1000, threshold = 1e-5, sparse = T,
                       verbose = T, trans = T, strict = T)

## saveRDS(fit, "./data/lucafitFSSEM0.05.rds")
## for 0.01 cutoff
## load("./data/LUNG_CANCER.RData")
## fit = readRDS("./data/lucafitFSSEM0.05.rds")
## for robust we only keep top 20% edges with absolute value >= 0.10
filterDiffNet =  function(fit, data, cutoff = 0.01) {
  Ci1 = rowMeans(qnorm(1 - cutoff, mean = 0, sd = sqrt(fit$sigma2)) / data$Data$Y[[1]])
  Ci2 = rowMeans(qnorm(1 - cutoff, mean = 0, sd = sqrt(fit$sigma2)) / data$Data$Y[[2]])
  BThreshold = quantile(c(abs(fit$Bs[[1]])[fit$Bs[[1]] != 0], abs(fit$Bs[[2]])[fit$Bs[[2]] != 0]), seq(0, 1, by = 0.1))[2 + 1]
  B1 = fit$Bs[[1]]
  B2 = fit$Bs[[2]]
  ## Threshold filter
  for(i in 1:data$Vars$p) {
    B1[,i] = B1[,i] * (abs(B1[,i]) >= max(BThreshold, Ci1[i]))
    B2[,i] = B2[,i] * (abs(B2[,i]) >= max(BThreshold, Ci2[i]))
  }
  DiffB = B2 - B1
  for(i in 1:data$Vars$p) {
    DiffB[,i] = DiffB[,i] * ((abs(DiffB[,i]) >= max(BThreshold, Ci1[i], Ci2[i])) & abs(DiffB[,i]) >= pmin(abs(B1[,i]), abs(B2[,i])))
  }
  list(B1 = B1, B2 = B2, DiffB = DiffB)
}

FilteredB = filterDiffNet(fit, data, cutoff = 0.01)
##Sigma2Threshold = max(max(qnorm(1 - 0.001, mean = 0, sd = sqrt(fit$sigma2)) / abs(data$Data$Y[[1]])), median(qnorm(1 - 0.001, mean = 0, sd = sqrt(fit$sigma2)) / abs(data$Data$Y[[2]])))
##BThreshold = quantile(c(abs(fit$Bs[[1]])[fit$Bs[[1]] != 0], abs(fit$Bs[[2]])[fit$Bs[[2]] != 0]), seq(0, 1, by = 0.1))[2 + 1]
##BThreshold = max(Sigma2Threshold, BThreshold)
##B1 = fit$Bs[[1]] * (abs(fit$Bs[[1]]) >= BThreshold)
##B2 = fit$Bs[[2]] * (abs(fit$Bs[[2]]) >= BThreshold)
B1 = FilteredB$B1
B2 = FilteredB$B2
DiffB = FilteredB$DiffB
adjNormalGRN = as.matrix(B1 != 0)
adjTumorGRN  = as.matrix(B2 != 0)
## adjDifferentialGRN  = as.matrix((abs(DiffB) >= BThreshold) & (abs(DiffB) >= pmin(abs(B1), abs(B2))))
adjDifferentialGRN = as.matrix(DiffB != 0)
NormalGRN = graph_from_adjacency_matrix(t(adjNormalGRN)) %>% set_vertex_attr("name", value = data$Vars$Genes)
TumorGRN = graph_from_adjacency_matrix(t(adjTumorGRN)) %>% set_vertex_attr("name", value = data$Vars$Genes)
DifferentialGRN  = graph_from_adjacency_matrix(t(adjDifferentialGRN)) %>% set_vertex_attr("name", value = data$Vars$Genes)
NormalGRN = delete.vertices(igraph::simplify(NormalGRN), degree(NormalGRN) == 0)
TumorGRN = delete.vertices(igraph::simplify(TumorGRN), degree(TumorGRN) == 0)
DifferentialGRN  = delete.vertices(igraph::simplify(DifferentialGRN), degree(DifferentialGRN) == 0)
DifferentialGRN1 = delete.vertices(igraph::simplify(DifferentialGRN), degree(DifferentialGRN) == 0)
adjDifferentialGRN1  = as.matrix((abs(DiffB) >= BThreshold) & (abs(DiffB) >= pmin(abs(B1), abs(B2)))) * DiffB
DifferentialGRN2  = graph_from_adjacency_matrix(t(adjDifferentialGRN1), weighted = T) %>% set_vertex_attr("name", value = data$Vars$Genes)
DifferentialGRN2 = delete.vertices(igraph::simplify(DifferentialGRN2), degree(DifferentialGRN2) == 0)

### Entrez ID to Gene Symbol
DiffGene = bitr(
  names(V(DifferentialGRN)),
  fromType = "ENTREZID",
  toType = "SYMBOL",
  OrgDb = "org.Hs.eg.db"
)
rownames(DiffGene) = DiffGene[,1]
V(DifferentialGRN)$name = DiffGene[V(DifferentialGRN)$name,2]
V(DifferentialGRN2)$name = DiffGene[V(DifferentialGRN2)$name,2]
EdgeList = as.data.frame(as_edgelist(DifferentialGRN2))
EdgeList$weight = E(DifferentialGRN2)$weight
colnames(EdgeList) = c("G1", "G2", "Type")
write.table(EdgeList, "./script/data/lungnet.txt", sep = "\t", quote = F, row.names = F, col.names = T)


### plot
GRNLayout = function(G = NULL, weight = FALSE) {
  qcut = sort(degree(G), decreasing = T)[10]
  V(G)$color = "lightblue"
  V(G)$frame.color = "darkblue"
  V(G)$color[which(degree(G) >= qcut)] = "red"
  V(G)$frame.color[which(degree(G) >= qcut)] = "darkred"
  V(G)$size = sqrt(degree(G)) * 1.2 + 0.1
  E(G)$color = rgb(0, 0, 0, alpha = .1)
  if(weight) {
    E(G)$color[E(G)$weight > 0] = rgb(0, 1, 0, alpha = .2)
  }
  Vnames = rep(NA, length(V(G)))
  Vnames[which(degree(G) >= qcut)] = names(which(degree(G) >= qcut))
  plot(
    G,
    vertex.label = Vnames,
    layout = layout.fruchterman.reingold,
    edge.arrow.size = 0.1,
    edge.curve = 0.1,
    vertex.label.font = 1,
    vertex.label.cex = 0.3,
    vertex.label.dist = 0,
    vertex.label.color = "black"
  )
}


GRNLayout(NormalGRN)
GRNLayout(TumorGRN)
GRNLayout(DifferentialGRN)
##GRNLayout(DifferentialGRN2, weight = T)

### GeneSet analysis
### GSEA
getGSEAdb = function(db = NULL) {
  f = readLines(db)
  lst = sapply(f, function(x)
    unlist(strsplit(x, "\t", fixed = TRUE)))
  names(lst) = sapply(lst, function(x)
    x[1])
  lst1 = lapply(lst, function(x)
    x[-(1:2)])
  lst2 = lapply(lst, function(x)
    x[2])
  list(gsea = lst1, url = lst2)
}

GSEAC2 =  getGSEAdb("./script/data/c2.all.v6.2.entrez.gmt")
EntrezGenes = unique(unlist(GSEAC2$gsea))
names(EntrezGenes) = NULL

testGSEA = function(geneset, diffgene, universe) {
  bgset   = setdiff(universe, diffgene)
  pvalue  = lapply(geneset, function(x) {
                              x1 = length(intersect(diffgene, x))
                              x2 = length(setdiff(diffgene, x))
                              y1 = length(intersect(bgset, x))
                              y2 = length(setdiff(bgset, x))
                              c(fisher.test(matrix(c(x1, x2, y1, y2), nrow = 2), alternative = "two.sided")$p.value,
                                ifelse(x1/x2 >= y1/y2, "+", "-"))
                            })
  pvalue
}

## filter Gene set have lung annotation
LUCA_related = lapply(GSEAC2$url, function(url) {
  url = as.character(url)
  mdb = GET(url)
  mdb = readHTMLTable(rawToChar(mdb$content), stringsAsFactors = F)
  i = which(mdb[[1]][,1] == "Full description or abstract")
  description = mdb[[1]][i, 2]
  pattern1 = str_detect(description, "lung relapse | lung cancer | lung tumor | LUCA | NSCLC | lung adenocarcinoma")
  pattern2 = all(str_detect(description, c("lung", "cancer | tumor | adenocarcinoma | carcinoma")))
  pattern1 | pattern2
})
GSEA4LUNG = GSEAC2$gsea[unlist(LUCA_related)]
GSEA4LUNG = GSEA4LUNG[sapply(GSEA4LUNG, function(x){length(intersect(x, data$Vars$Genes)) != 0})]
result = testGSEA(GSEA4LUNG, names(which(degree(DifferentialGRN1, mode = "all") >= 1)), data$Vars$Genes)

pvalue_gsea  = as.numeric(sapply(result, `[`, 1))
enrich_gsea  = sapply(result, `[`, 2)
enrich_idx   = which(pvalue_gsea < 0.05 & enrich_gsea == "+")
enrichedGS   = data.frame(gs = names(result)[enrich_idx],
                         #description = unlist(GSEAC2$url[names(result)[enrich_idx]]),
                         pval = pvalue_gsea[enrich_idx])
enrichedGS = enrichedGS[sort(enrichedGS$pval, decreasing = F, index = T)$ix, ]
rownames(enrichedGS) = NULL
enrichedGS

for(n in 1:nrow(enrichedGS)) {
  cat(as.character(enrichedGS[n,1]), " & ", toupper(format(signif(enrichedGS[n,2], 2), scientific = T)), " \\\\ \n")
  cat("\\hline \n")
}
Ivis4ml/SparseSEMs documentation built on May 23, 2019, 2:43 p.m.