R/app_server_interfun.R

Defines functions gmtToDataframe extrcactVariantType plotHeat corMatScreen corCal plotDot plotBar eKegg eGO symToEnt seriesToDiscrete transferID plotVolcano mafDiffer deaLimma deaEdgeR diyForest getCoxTable multipleCox singleCox surAnalysis sbToLine toSurStatus getMaf gsvaCal countsToTPM classifyCounts writeGmt readMatrix

#'  The application internal function
#'  
#'  @import GSVA
#'  @import GSVAdata
#'  @import maftools
#'  @import survival
#'  @import survminer
#'  @import edgeR
#'  @import limma
#'  @import clusterProfiler
#'  @import org.Hs.eg.db
#'  @import DOSE
#'  @import ggplot2
#'  @import ggpubr
#'  @import psych
#'  @import pheatmap
#'  @import data.table
#'  @import GSEABase
#'  @import ggcorrplot
data("interdata", package = 'ARMT')
geneLength <- interdata$gLen
geneSet <- interdata$gSet

interClinical <- interdata$clinical
allCancer <- levels(factor(interClinical$type)) 

#读csv文件
readMatrix <- function(csvPath, rowFlag = TRUE){
    if (is.null(csvPath))
      return(NULL)
    getread <- fread(csvPath$datapath, na.strings = c("", getOption("datatable.na.strings","NA")), check.name = FALSE, data.table = FALSE, stringsAsFactors = FALSE)
    if(rowFlag){
      rownames(getread) <- getread[,1]
      getread <- getread[,-1, drop = FALSE]
    }
    return(getread)
}
#写gmt文件
writeGmt <- function(gsCsv, gmtfile){
  gs_list <- tapply(gsCsv[,2],as.factor(gsCsv[,1]),function(x) x)
  i <- 1
  sink(gmtfile)
  for(gs in gs_list){
    cat(names(gs_list)[i])
    i<-i+1
    cat('\tNA\t')
    cat(paste(gs,collapse = '\t'))
    cat('\n')
  }
  sink()
}
#counts数据样本分离
classifyCounts <- function(allData){
  if(all(nchar(colnames(allData)[1]) >= 15, substr(colnames(allData)[1], 1, 4) == 'TCGA')){
      normData <- allData[,as.numeric(substr(colnames(allData), 14, 15)) > 10, drop = FALSE]
      tumorData <- allData[, !(colnames(allData) %in% colnames(normData)), drop = FALSE]
      colnames(normData) <- sapply(colnames(normData), function(x){substr(x, 1, 12)})
      normData <- normData[,!duplicated(colnames(normData)), drop = FALSE]
      colnames(tumorData) <- sapply(colnames(tumorData), function(x){substr(x, 1, 12)})
      tumorData <- tumorData[, !duplicated(colnames(tumorData)), drop = FALSE]
      return(list(nr = normData, tm = tumorData))
  }
  else{return(list(nr = allData, tm = allData))}

}
#TPM标准化
countsToTPM <- function(countsMatrix, transID = TRUE){
  p <- Progress$new(min = 1, max = 12)
  on.exit(p$close())
  p$set(message = 'TPM Normalizing', detail = 'This may take a while...', value = 1)
  rownames(countsMatrix) <- unlist(lapply(strsplit(rownames(countsMatrix), "[.]"), function(x) x[1]))
  p$set(value = 2)
  rownames(geneLength) <- unlist(lapply(strsplit(rownames(geneLength), "[.]"), function(x) x[1]))
  if(transID){
    p$set(value = 3)
    countsMatrix <- countsMatrix[rownames(countsMatrix) %in% rownames(geneLength), , drop = FALSE]
    p$set(value = 4)
    geneLength <- geneLength[match(rownames(countsMatrix), rownames(geneLength)), , drop = FALSE]
  }
  else{
    p$set(value = 3)
    countsMatrix <- countsMatrix[rownames(countsMatrix) %in% geneLength$Gene_name, , drop = FALSE]
    p$set(value = 3)
    geneLength <- geneLength[match(rownames(countsMatrix), geneLength$Gene_name),,drop = FALSE]
  }
  p$set(value = 5)
  tmp <- countsMatrix/geneLength$Gene_length 
  p$set(value = 6)
  tpm_counts <- as.data.frame(t(t(tmp)/colSums(tmp))*1000000)   #标准化
  p$set(value = 7)
  #id转换及去重
  if(transID){
    ids <- data.frame(symbol = geneLength$Gene_name, mean = apply(tpm_counts, 1, mean))
    p$set(value = 8)
    ids <- ids[order(ids$mean,decreasing = T), ] #ids$symbol按照ids$median中位数从大到小排列的顺序排序,将对应的行赋值为一个新的ids
    p$set(value = 9)
    ids <- ids[!duplicated(ids$symbol), ] #将symbol这一列取取出重复项,'!'为否,即取出不重复的项,去除重复的gene ,保留每个基因最大表达量结果s
    p$set(value = 10)
    tpm_counts <- tpm_counts[rownames(ids),] #新的ids取出ensembl名,将tpm_counts按照取出的这一列中的每一行组成一个新的
    p$set(value = 11)
    rownames(tpm_counts) <- ids$symbol
  }
  else{
    p$set(value = 8)
    p$set(value = 9)
    p$set(value = 10)
    p$set(value = 11)}
  #去除在所有样本中表达为零的基因
  p$set(value = 12)
  tpm_counts <- tpm_counts[rowSums(tpm_counts) >0,] 
  p$set(value = 13)
  return(tpm_counts)
}

#GSVA计算
gsvaCal <- function(mat, setlist, gmtFlag, gmtPath){
  if(gmtFlag){
    geneSet$new <- getGmt(gmtPath)
    setlist[length(setlist)+1]<-'new'
  }
  prolen <- length(setlist)+3
  withProgress(message = 'Calculation in GSVA',
               detail = 'This may take a while...', value = 0,
               expr = {
                   mat <- log(mat+1, 2)
                   incProgress(1/prolen)
                   data <- mat[-(1:nrow(mat)),]
                   incProgress(1/prolen)
                   mat <- as.matrix(mat)
                   incProgress(1/prolen)
                   for (gmt in setlist){
                     gset <- geneSet[[gmt]]
                     vgmt <- gsva(mat, gset,verbose=F,parallel.sz=1)
                     incProgress(1/prolen)
                     vgmt <-  as.data.frame(vgmt)
                     data <- rbind(data,vgmt)  }
                     data <- as.data.frame(t(data))
               })
  return(data)
}

#读maf文件并生成所需的突变信息矩阵
getMaf <- function(variantGeneList, mafPath){
  mafContent <- read.maf(mafPath, isTCGA = TRUE)
  maf_data <- mafContent@data
  mafResult <- sampleList <- mafContent@clinical.data
  for(variantGene in variantGeneList){
    maf_gene <- maf_data[which(maf_data$Hugo_Symbol == variantGene)]
    maf_abstract<-apply(sampleList, MARGIN = 1, function(x){ 
      if(x %in% maf_gene$Tumor_Sample_Barcode){st <- 'Variant' 
                                              ty <- paste(unique(maf_gene[which(maf_gene$Tumor_Sample_Barcode == x)]$VARIANT_CLASS), collapse = ',')}
      else{st <- 'None'
          ty <- 'None'}
      return(c(st, ty))
    })
    maf_abstract <- data.frame(t(as.data.frame(maf_abstract)))
    names(maf_abstract) <- c(paste(variantGene, 'Variant_Status', sep = '_'), paste(variantGene, 'Variant_Type', sep = '_'))
    mafResult <- cbind(mafResult,maf_abstract)
  }
  mafResult <- data.frame(mafResult, row.names = 1)
  return(mafResult)
}

#生存状态二值替换
toSurStatus <- function(dt, status ,event){
  dt[,status] = ifelse(tolower(dt[,status])==tolower(event), 1, 0)
  return(dt)
}

#列名符号换下划线
sbToLine <- function(data){
  names(data) <- gsub("[^[:alnum:]]", '_', names(data))
  return(data)
}

#生存分析
surAnalysis <- function(data, survivaltime, sta, t){
  if(is.numeric(data[,t])){data[,t] <- ifelse(data[,t] > median(data[,t]), 'High', 'Low')}
  data[,survivaltime]<-as.numeric(data[,survivaltime])
  data[,t] <- factor(as.character(data[,t]), order = TRUE)
  ort <- t
  t <- gsub("[^[:alnum:]]", '_', t)
  names(data)[names(data) == ort] <- t
  #s 和f要是全局变量
  s<<-Surv(data[,survivaltime],data[,sta])
  f<<-as.formula(paste0('s','~',t))
  fit<-survfit(f[0:3], data=data)
  names(fit$strata) <- sub(t, ort, names(fit$strata))
  suppressMessages(surplot <- ggsurvplot(fit = fit, data = data,  legend.title=t, palette=c("#F95006", "#33484C"),pval = TRUE,surv.median.line = "hv", legend = c(0.7, 0.8)))
  #remove(f, envir = parent.env(environment()))
  #remove(s, envir = parent.env(environment()))
  #suppressMessages(ggsave(paste0(t,'.pdf'),plot=print(surplot),path= outpath,width=8, height=8))
  return(surplot)
}

#单因素COX
singleCox <- function(data, survivaltime, sta, t){
  data[,survivaltime]<-as.numeric(data[,survivaltime])
  data <- data[!is.na(data[,t]), , drop = FALSE]
  ort <- t
  t <- gsub("[^[:alnum:]]", '_', t)
  names(data)[names(data) == ort] <- t
  s<-Surv(data[,survivaltime],data[,sta])
  f<-as.formula(paste0('s','~',t))
  result <- coxph(f,data=data)
  snum <- result$means
  result <- summary(result)
  rownames(result$coefficients) <- sub(t, ort, rownames(result$coefficients))
  rownames(result$conf.int) <- sub(t, ort, rownames(result$conf.int))
  result$sample_n <- round(snum*result$n)
  return(result)
}

#多因素COX
multipleCox <- function(data, survivaltime, sta, factable){
  data[,survivaltime]<-as.numeric(data[,survivaltime])
  factable<-as.vector(factable)
  orfactable <- factable
  for(i in 1:length(factable)){
    if(!is.numeric(data[, factable[[i]]])){
      data[, factable[[i]]] <- as.factor(data[,factable[[i]]])
    }
    factable[[i]] <- gsub("[^[:alnum:]]", '_', factable[[i]])
    names(data)[names(data) == orfactable[[i]]] <- factable[[i]]
  }
  s<-Surv(data[,survivaltime],data[,sta])
  multif<-as.formula(paste0('s','~',paste(factable, collapse ="+")))
  result<-coxph(multif,data=data)
  for(b in 1:length(factable)){
    names(result$coefficients)[b] <- sub(factable[[b]] ,orfactable[[b]], names(result$coefficients)[[b]])
  }
  snum <- result$means
  result <- summary(result)
  result$sample_n <- round(snum*result$n)
  return(result)
}

#Cox结果提炼
getCoxTable <- function(data){
  a<- as.data.frame(data$coefficients)
  b <- as.data.frame(data$conf.int)
  a = a[c(2,1,3,4, 5)]
  b = b[-1]
  result = merge(a, b, by = "row.names")
  row.names(result) <- result[,1]
  result <- result[,-1]
  result$sample_number <- data$sample_n
  names(result)[names(result) == 'exp(coef)'] <- 'HR'
  names(result)[names(result) == 'Pr(>|z|)'] <- 'P-Value'
  names(result)[names(result) == 'lower .95'] <- 'low95'
  names(result)[names(result) == 'upper .95'] <- 'up95'
  return(result)
}

#自制森林图
diyForest <- function(data){
  data$factor <- row.names(data)
  data$factor <- factor(data$factor, levels = data$factor) #保持画图的因子顺序
  data$up95 <- round(data$up95, 2)
  data$low95 <- round(data$low95, 2)
  data$HR <- round(data$HR, 2)
  data$`P-Value` <- round(data$`P-Value`, 3)
  unitX <- (max(data$up95) - min(data$low95))/5
  X_max <- ceiling(max(data$up95)/unitX)*unitX
  X_min <- floor(min(data$low95)/unitX)*unitX
  #标记定位
  data$X_low <- X_min - unitX/2 - unitX -unitX
  data$X_up <- X_min -unitX/2 - unitX
  data$X_Hr <- X_min - unitX/2
  data$X_P <- X_max + unitX/2
  data$Y <- c(1:nrow(data))
  #提取标记坐标和标记
  extractData <- function(xname, pname){
    extract <- data[c(xname,'Y',pname)]
    names(extract)[names(extract) == xname] = 'X'
    names(extract)[names(extract) == pname] = 'Label'
    newRow <- data.frame(extract$X[[1]], max(extract$Y)+1, pname)
    names(newRow)<-names(extract)
    extract <- rbind(extract, newRow)
  }
  #标记表
  anno <- rbind(extractData('X_low', 'low95'), extractData('X_up', 'up95'))
  anno <- rbind(anno, extractData('X_Hr', 'HR'))
  anno <- rbind(anno, extractData('X_P', 'P-Value'))
  forestResult<- ggplot(data, aes(HR, factor))+
    geom_point(aes(size = sample_number, col = factor)) +
    geom_errorbarh(aes(xmax = up95, xmin = low95), height = 0.4)+
    geom_vline(aes(xintercept = 1))+
    scale_x_continuous(limits = c(X_min-3*unitX, X_max+unitX), breaks = seq(X_min, X_max, (X_max-X_min)/5)) +
    xlab(' ')+ylab(" ") + theme_bw() + theme(legend.position ="top") +
    theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
          plot.background = element_rect(fill ='skyblue', color ='red'),
          axis.text=element_text(size=10, face = "bold"),legend.text=element_text(size=11))+
    geom_text(data=anno, aes(x=X, y=Y, label=Label))+
    scale_y_discrete(expand = c(0, 2))+
    scale_color_discrete(guide = FALSE)
  return(forestResult)
}

#edgeR
deaEdgeR <- function(df, condition, h, control, normalizeWay){
  pr <- Progress$new(min=1, max=15)
  on.exit(pr$close())
  condition <-  condition[condition[,1] %in% c(h, control), , drop = FALSE]
  pr$set(message = 'Calculation in EdgeR',detail = 'This may take a while...', value = 1)
  condition <- condition[rownames(condition) %in% colnames(df), , drop = FALSE]
  pr$set(value = 2)
  df <- df[,rownames(condition)]
  pr$set(value = 3)
  condition <- factor(condition[,1])
  pr$set(value = 4)
  condition <- relevel(condition, ref = control)
  pr$set(value = 5)
  genelist<-DGEList(counts=df, group = condition)
  pr$set(value = 6)
  keep<-rowSums(cpm(genelist)>1)>=2
  pr$set(value = 7)
  genelist.filted<-genelist[keep,keep.lib.sizes=FALSE]
  pr$set(value = 8)
  genelist.norm <- calcNormFactors(genelist.filted, method = normalizeWay)
  pr$set(value = 9)
  design <- model.matrix(~condition)
  pr$set(value = 10)
  colnames(design) <- levels(condition)
  pr$set(value = 11)
  genelist.Disp<-estimateDisp(genelist.norm, design,robust=TRUE)
  pr$set(value = 12)
  fit<-glmQLFit(genelist.Disp, design, robust=TRUE)
  pr$set(value = 13)
  res <- glmQLFTest(fit)
  pr$set(value = 14)
  result<-topTags(res,n=Inf)$table
  pr$set(value = 15)
  return(result)
}

#Limma
deaLimma <- function(df, condition, h, control){
  pr <- Progress$new(min=1, max=14)
  on.exit(pr$close())
  df <- t(df)
  pr$set(message = 'Calculation in Limma',detail = 'This may take a while...', value = 1)
  condition <- condition[condition[,1] %in% c(h, control), , drop = FALSE]
  pr$set(value = 2)
  condition <- condition[rownames(condition) %in% colnames(df), , drop = FALSE]
  pr$set(value = 3)
  df <- df[,rownames(condition)]
  pr$set(value = 4)
  group_list <- condition[,1]
  pr$set(value = 5)
  group_list <- relevel(factor(group_list), ref = control)
  pr$set(value = 6)
  design <- model.matrix(~0+group_list)
  pr$set(value = 7)
  colnames(design) <- levels(group_list)
  pr$set(value = 8)
  fit <- lmFit(df,design)
  pr$set(value = 9)
  conVector <- paste(h, control, sep = '-')
  pr$set(value = 10)
  contrastMatrix <- makeContrasts(contrasts = conVector, levels = design)
  pr$set(value = 11)
  fit <- contrasts.fit(fit, contrastMatrix)
  pr$set(value = 12)
  fit2 <- eBayes(fit)
  pr$set(value = 13)
  result <- topTable(fit2, coef=1, n=Inf)
  pr$set(value = 14)
  return(result)
}

#maf差异分析
mafDiffer <- function(mafIndata, condition, h, control){
  pr <- Progress$new(min=1, max=4)
  on.exit(pr$close())
  pr$set(message = 'Calculating difference in cohort:',detail = 'This may take a while...', value = 1)
  mafEx <- subsetMaf(mafIndata, tsb = row.names(condition[condition[,1] == h, ,drop = FALSE]))
  pr$set(value = 2)
  mafCt <- subsetMaf(mafIndata, tsb = row.names(condition[condition[,1] == control, ,drop = FALSE]))
  pr$set(value = 3)
  result <- mafCompare(mafEx, mafCt, 
                       m1Name = paste(colnames(condition), h, sep = ':'),
                       m2Name = paste(colnames(condition), control, sep = ':'))
  pr$set(value = 4)
  return(result)
}

#火山图
plotVolcano <- function(degData){
  staNum <- length(levels(as.factor(degData$Status)))
  if(staNum == 3){colo <- c('blue','black','red')}
  else{colo <- c('blue','red')}
  pic <- ggplot(data = degData, aes(x = logFC, y=-log10(FDR), color = Status)) +
                   geom_point(alpha = 0.5, size = 1.8) +
                   xlab("log2FC") + ylab("-log10(FDR)") +
                   scale_colour_manual(values = colo) +
                   theme_set(theme_set(theme_bw(base_size=20)))
  return(pic)
}

#id转换器
transferID <- function(emMatrix) {
  rownames(emMatrix) <- unlist(lapply(strsplit(rownames(emMatrix), "[.]"), function(x) x[1]))
  rownames(geneLength) <- unlist(lapply(strsplit(rownames(geneLength), "[.]"), function(x) x[1]))
  emMatrix <- emMatrix[rownames(emMatrix) %in% rownames(geneLength), , drop = FALSE]
  geneLength <- geneLength[match(rownames(emMatrix), rownames(geneLength)), , drop = FALSE]
  emMatrix[,'Symbol'] <- geneLength$Gene_name
  return(emMatrix)
}

#数据分箱
seriesToDiscrete <- function(series, cutoff){
  qu <- quantile(series, c(cutoff, 1-cutoff))
  result <- sapply(series, function(x){
    if(x <= qu[1]){return('Low')}
    else if(x >= qu[2]){return('High')}
    else{return('Median')}
  })
  return(result)
}

#富集分析换id
symToEnt <- function(data){
  if(!is.null(data)){
    DEG <- data$Symbol                                              #取差异分析文件
    degID <- bitr(DEG, fromType = "SYMBOL", toType = c( "ENTREZID" ), OrgDb = org.Hs.eg.db )   #id类型转换
    return(degID)
  }
}

#GO富集分析
eGO <- function(deg, pc = 0.1, qc = 0.2, onto){
  eg <- enrichGO(
    gene = deg[,1],
    OrgDb = 'org.Hs.eg.db',
    keyType = "SYMBOL",
    ont = onto,         
    pvalueCutoff = pc,
    pAdjustMethod = "BH",
    qvalueCutoff = qc,
    readable = FALSE, )
  ego <- as.data.frame(eg)
  return(list(ego, eg))
}

#KEGG富集分析
eKegg <- function(deg, pc = 0.1, qc = 0.2){
  eKEG<-enrichKEGG(
    gene = deg[,2],
    organism = "hsa",
    keyType = "kegg",
    pvalueCutoff = pc,
    pAdjustMethod = "BH",
    qvalueCutoff = qc,
    use_internal_data = FALSE)
  eKEGG <- as.data.frame(eKEG)
  return(list(eKEGG, eKEG))
}
#Bar图
plotBar <- function(data, eway, showNum = 5){
  if(is.null(data)){return(NULL)}
  else{
    if(eway == 'GO'){
      if('ONTOLOGY' %in% colnames(data@result)){
        pout <- barplot(data, split="ONTOLOGY",showCategory=showNum)+
              facet_grid(ONTOLOGY~., scale="free")
      }
      else{pout <- barplot(data, showCategory=showNum)}
    }
    else if(eway == 'KEGG'){
      pout <- barplot(data,showCategory=showNum)
    }
    return(pout)
  }
}

#Dot图
plotDot <- function(data, eway, showNum = 5){
  if(is.null(data)){return(NULL)}
  else{
    if(eway == 'GO'){
      if('ONTOLOGY' %in% colnames(data@result)){
        pout <- dotplot(data, split="ONTOLOGY",showCategory=showNum)+
          facet_grid(ONTOLOGY~., scale="free")
      }
      else{pout <- dotplot(data,showCategory=showNum)}
    }
    else if(eway == 'KEGG'){
      pout <- dotplot(data,showCategory=showNum)
    }
    return(pout)
  }
}

#相关系数
corCal <- function(data, x, y, way, group = 'No group', singleFac = 'All'){
  #基本求相关函数
  corCalculation <- function(dataIn){
    corResult <- corr.test(dataIn[x], dataIn[y], method = way, adjust = 'fdr')
    rN <- NULL
    for(i in y){
      rN <- c(rN, paste(x,i,sep = '-'))
    }
    re2 <- corResult$ci
    row.names(re2) <- rN
    re1 <- as.data.frame(corResult$r)
    return(list(Mat = re1, Ls = re2))
  }
  #不分组
  if(group == 'No group'){
    result <- list()
    corRe <- corCalculation(dataIn = data)
    result$mat <- corRe$Mat
    result$ls <- corRe$Ls
  }
  #分组
  else{
    groupList <- levels(factor(data[[group]]))
    result <- list()
    #多组单对多
    for(singleFac in  x){
      Mat<-data.frame()
      Ls<-data.frame()
      for(t in groupList){
        if(length(data[data[[group]] == t,1])<=3){next}
        oneResult <- corr.test(data[data[[group]] == t, singleFac, drop = FALSE],
                               data[data[[group]] == t, y, drop = FALSE],
                               method = way, adjust = 'fdr')
        row.names(oneResult$r)<-t
        Mat <- rbind(Mat, as.data.frame(oneResult$r))
        rN <- NULL
        for(i in y){
          rN <- c(rN, paste(t,i,sep = '-'))
        }
        oneLs <- oneResult$ci
        row.names(oneLs) <- rN
        Ls <- rbind(Ls, oneLs)
      }
      result[[singleFac]] <- list(mat = Mat, ls = Ls)
    }
    #多组比较
    multiMat <- list()
    multiLs <- data.frame()
    for(k in groupList){
      if(length(data[data[[group]] == k,1])<=3){next}
      oneResult <- corCalculation(data[data[[group]] == k,])
      multiMat[[k]] <- oneResult$Mat
      oneResult$Ls[[group]] <- k
      multiLs <- rbind(multiLs, oneResult$Ls)
    }
    result$All <- list(mat = multiMat, ls = multiLs)
  }
  return(result)
}

#相关系数矩阵筛选
corMatScreen <- function(data, scrData, rcut, pcut){
  scrData <- scrData[(scrData$p > pcut)|(abs(scrData$r) < rcut),,drop = FALSE]
  for(i in row.names(scrData)){
    a <- strsplit(i, split = '-')
    data[a[[1]][1],a[[1]][2]] <- NA 
  }
  return(data)
}

#热图
plotHeat <- function(data){
  if(is.null(data)){return(NULL)}
  pHeat <- ggcorrplot(data)
  return(pHeat)
}

#提取maf文件突变类型
extrcactVariantType <- function(mafObject){
  if(is.null(mafObject)){return(NULL)}
  typeList <- levels(as.factor(mafObject@data$VARIANT_CLASS))
  return(typeList)
}

#读gmt文件为dataframe
gmtToDataframe <- function(gmtPath){
  if(is.null(gmtPath)){return(NULL)}
  gmtObj <- GSEABase::getGmt(gmtPath)
  result <- data.frame()
  for (var in names(gmtObj)) {
      varData <- data.frame('Gene_Name' = gmtObj[[var]]@geneIds, 
                            'Geneset_Name' = var,
                            check.names = FALSE,
                            stringsAsFactors = FALSE)
      result <- rbind(result, varData)
  }
  return(result)
}
Dulab2020/ARMT documentation built on Nov. 8, 2022, 8:53 a.m.