R/cal.R

Defines functions perm.cal plot.perm summarize.fdr write.result write.summary rm.zero.col summarize.p weight.test hist.mirror2

# weighted test agains continuous response variables, such as gene expression


# drawback: the ylabel of the inversed plot is negative, can be edited in AI
library(ggplot2)
library(grid)
library(gridExtra)
library(gtable)
hist.mirror2 <- function(data.plot, label)
{
  myplot <- ggplot(data.plot, aes(ps, fill=status))+geom_histogram(data=subset(data.plot, status=="Yes"), aes(ps, y=..count..))+
    geom_histogram(data=subset(data.plot,status=="No"), aes(ps, y= -..count..))+
    scale_fill_hue(label)+
    labs(x="Propensity Score")
  print(myplot)
}


weight.test <- function(data,form, molecular.pri, is.continuous=TRUE, weight="MW", mirror.plot=FALSE,cancer, data.type, outdir=".", perm=FALSE, seed=seed)
{  #molecular.pri <- mutation.pri[,keep.index]
  #rownames(molecular.pri) <- gsub("\\.","\\-",rownames(molecular.pri))
  common <- intersect(rownames(data), rownames(molecular.pri))
  print(paste("Number of samples:", length(common)))

  molecular.common <- as.matrix( molecular.pri[match(common, rownames(molecular.pri)),])
  colnames(molecular.common) <- colnames(molecular.pri)
  clinical.common <- data[match(common, rownames(data)),]
  print(summary(factor(clinical.common$Z)))
  # write sample info to file
  #write.table(t(rbind(rownames(clinical.common),ifelse(clinical.common$Z==1, "high","low"))), file=paste(cancer,"_",data.type,"_samples.txt", sep=""),quote = F,row.names = F,sep="\t")

  print("----------------")
  if("age_at__diagnosis" %in% colnames(clinical.common)){
    age.cutoff <- 65
    print(paste("age <", age.cutoff,":", length(which(clinical.common$age_at_diagnosis< age.cutoff))))
    print(paste("age >=", age.cutoff,":", length(which(clinical.common$age_at_diagnosis>= age.cutoff))))
    print("----------------")

  }


  if(perm)
  {
    n <- nrow(clinical.common)
    set.seed(seed)
    perm <- sample(1:n,n)
    clinical.common$Z <- clinical.common$Z[perm]
  }

  print(paste("Weighting scheme:", weight))
  ##loading PSM
  #source("GIPW_function_omega.R")
  ans <- GIPW.std.omega(dat=clinical.common, form.ps=form, weight=weight,trt.est=F)

  # draw mirror plot

  if(mirror.plot)
  {
    ps <- ans$ps
    status <- ifelse(clinical.common$Z==1, "Yes","No")
    data.plot <- data.frame(ps, status )
    pdf(paste(outdir,"/",cancer,"_", data.type, ifelse(perm,paste("_perm_",seed, sep=""), ""),"_mirror_raw.pdf",sep=""))
    if(analysis=="omt")
    {
      label="OMT"
    }
    if(analysis=="gender")
    {
      label="Female"
    }
    if(analysis=="race")
    {
      label="Non-White"
    }
    if(analysis=="BMI_type")
    {
      label="high_BMI"
    }

    print(hist.mirror2(data.plot, label))
    dev.off()
  }
  wt <- ans$W
  #source("check_balance.R")
  index.tr <- which(clinical.common$Z==1)
  index.ctr <- which(clinical.common$Z==0)

  cutoff=0.1 # 10%
  for (i in 2:(ncol(clinical.common)-1))
  {
    print(paste("check", colnames(clinical.common)[i]))
    if(nrow(clinical.common[which(clinical.common[,i]==1),]) >=3){
      #print(summary(clinical.common[,i]))
      std.diff <- check.balance(index.tr, index.ctr, clinical.common[,i], wt, printout=TRUE)
      if(std.diff>cutoff)
      {
        print(paste("Fail: standardized difference= ", std.diff))
        print("=======================================")
        #  stop()
      }else{
        print("Pass!")
      }
    }else{
      print("too small sample size")
    }


  }


  molecular.pvalues <- c()
  molecular.coefs <- c()
  molecular.0 <- c()
  molecular.1 <- c()
  molecular.0.w <- c()
  molecular.1.w <- c()
  for (i in 1:ncol(molecular.common))
  {
    tmp <- sapply(split(as.numeric(molecular.common[,i]), clinical.common$Z), mean,na.rm=T)
    molecular.0 <- c(molecular.0, tmp[1])
    molecular.1 <- c(molecular.1, tmp[2])
    # get weighted mean
    molecular.0.w <- c(molecular.0.w, sum(as.numeric(molecular.common[index.ctr,i])*wt[index.ctr],na.rm=T)/sum(wt[index.ctr],na.rm=T))
    molecular.1.w <- c(molecular.1.w, sum(as.numeric(molecular.common[index.tr,i])*wt[index.tr],na.rm=T)/sum(wt[index.tr],na.rm=T))
    #print(i)
    if (i%%1000==0)
    {
      print(i)
    }

    if(is.continuous)
    {
      #pvalue <- try(summary(lm(clinical.common$Z~molecular.common[,i], weights=wt))$coef[2,4])
      pvalue <- try(summary(lm(molecular.common[,i]~clinical.common$Z, weights=wt))$coef[2,4])
      coef <- try(summary(lm(molecular.common[,i]~clinical.common$Z, weights=wt))$coef[2,1])

      if (class(pvalue)=="try-error")
      {
        pvalue <- NA
        coef <- NA
      }
    }else{

      #pvalue <- summary(glm(clinical.common$Z~molecular.common[,i], family=binomial, weights=wt))$coef[2,4]
      pvalue <- summary(glm(as.numeric(molecular.common[,i])~clinical.common$Z, family=binomial, weights=wt),na.rm=T)$coef[2,4]
      coef <- summary(glm(as.numeric(molecular.common[,i])~clinical.common$Z, family=binomial, weights=wt),na.rm=T)$coef[2,1]
      if (class(pvalue)=="try-error")
      {
        pvalue <- NA
        coef <- NA
      }
    }
    molecular.pvalues <- c(molecular.pvalues, pvalue)
    molecular.coefs <- c(molecular.coefs, coef)
  }
  molecular.fdr <- p.adjust(molecular.pvalues,"fdr")

  return(list(feature=colnames(molecular.common),coef=molecular.coefs, pvalue=molecular.pvalues,fdr=molecular.fdr , mean.0= molecular.0, mean.1= molecular.1, mean.0.w=molecular.0.w, mean.1.w=molecular.1.w))
}

summarize.p <- function(molecular.data, molecular.result, cutoff=0.05, print=FALSE )
{
  pvalue <- molecular.result$pvalue
  fdr <- molecular.result$fdr
  coef <- molecular.result$coef
  mean.0 <- molecular.result$mean.0
  mean.1 <- molecular.result$mean.1
  mean.0.w <- molecular.result$mean.0.w
  mean.1.w <- molecular.result$mean.1.w
  signif.index <- which(pvalue<cutoff)
  print(paste("Features with p value <", cutoff, "=", length(signif.index)))
  if(print)
  {
    print(cbind(colnames(molecular.data)[signif.index],signif(pvalue[signif.index],3), signif(fdr[signif.index],3), signif(coef[signif.index],3), signif(mean.0[signif.index],3), signif(mean.1[signif.index],3)))
  }
  return(list(feature.sig=colnames(molecular.data)[signif.index], pvalue.sig=pvalue[signif.index], fdr.sig= fdr[signif.index], n.sig=length(signif.index), coef.sig=coef[signif.index], mean0.sig=mean.0[signif.index], mean1.sig=mean.1[signif.index], mean0.sig.w=mean.0.w[signif.index], mean1.sig.w=mean.1.w[signif.index]))#, n.sig=length(which(molecular.result$pvalue<cutoff))) )

}


rm.zero.col <- function(molecular.data)
{
  ## remove non-coding genes
  genes.raw <- colnames(molecular.data)
  genes <- unlist(lapply(genes.raw, function(x) unlist(strsplit(x,"[|]"))[1]))
  ids <- as.numeric(unlist(lapply(genes.raw, function(x) unlist(strsplit(x,"[|]"))[2])))
  if(FALSE) # do not remove any gene
  {
    rm1 <- grep("?", genes, fixed=TRUE) # question mark
    molecular.data <- molecular.data[,-rm1]
    print(paste(length(rm1),"genes were removed."))
  }

  if(FALSE)
  {
    gene.data <- read.table("protein_coding_gene_annotation.tsv",header=TRUE, sep="\t", quote="")
    name.id <- paste(data$Gene, data$ID, sep="|")
    keep.index <- match(gene.data$ID, ids)
    if (length(which(is.na(keep.index)))>0)
    {
      stop("Some protein coding genes are not found. Double check!")
    }
    print(paste("After remove non-coding, total genes:", length(keep.index)))
    ##print(paste(length(rm),"columns were removed because of non-coding."))

    molecular.data <- molecular.data[,keep.index]
  }

  # remove non-variable genes, i.e., sd=0
  zero.col <- which(apply(molecular.data, 2, sd)==0)
  if (length(zero.col)>0)
  {
    molecular.data <- molecular.data[,-zero.col]
    print(paste(length(zero.col),"columns were removed because of no variation."))
  }

  return(molecular.data)
}

write.summary <- function(summary, cancer, analysis, type)
{
  file <- paste(cancer,"_",analysis,"_summary_",type,".txt", sep="")
  data <- data.frame(summary$feature.sig, summary$fdr.sig, summary$coef.sig, summary$mean0.sig, summary$mean1.sig, summary$mean0.sig.w, summary$mean1.sig.w)
  if (analysis=="myclusters")
  {
    colnames(data) <- c("feature","fdr","coef","mean_low_FPS", "mean_high_FPS", "mean_low_FPS_weighted", "mean_high_FPS_weighted")
  }
  if (analysis=="BMI_type")
  {
    colnames(data) <- c("feature","fdr","coef","mean_low_BMI", "mean_high_BMI", "mean_low_BMI_weighted", "mean_high_BMI_weighted")
  }
  if (analysis=="gender")
  {
    colnames(data) <- c("feature","fdr","coef","mean_MALE", "mean_FEMALE", "mean_MALE_weighted", "mean_FEMALE_weighted")
  }
  if(analysis=="race")
  {
    colnames(data) <- c("feature","fdr","coef","mean_nonWHITE", "mean_WHITE","mean_nonWHITE_weighted", "mean_WHITE_weighted")
  }
  if (analysis=="omt")
  {
    colnames(data) <- c("feature","fdr","coef","mean_nonOMT", "mean_OMT","mean_nonOMT_weighted", "mean_OMT_weighted")
  }
  write.table(data,file=file, quote=FALSE, sep="\t", col.names=TRUE, row.names=FALSE)
}

write.result <- function(result, cancer, analysis, type, fdr.cutoff=0.05)
{
  file <- paste(cancer,"_",analysis,"_result_",type,".txt", sep="")
  data <- data.frame(result$feature, result$pvalue, result$fdr, result$coef, result$mean.0, result$mean.1, result$mean.0.w, result$mean.1.w)
  if (analysis=="myclusters")
  {
    colnames(data) <- c("feature","pvalue","fdr","coef","mean_low_FPS", "mean_high_FPS", "mean_low_FPS_weighted", "mean_high_FPS_weighted")
  }

  if (analysis=="BMI_type")
  {
    colnames(data) <- c("feature","pvalue","fdr","coef","mean_low_BMI", "mean_high_BMI", "mean_low_BMI_weighted", "mean_high_BMI_weighted")
  }
  if (analysis=="gender")
  {
    colnames(data) <- c("feature","pvalue","fdr","coef","mean_MALE", "mean_FEMALE", "mean_MALE_weighted", "mean_FEMALE_weighted")
  }
  if(analysis=="race")
  {
    colnames(data) <- c("feature","fdr","coef","mean_nonWHITE", "mean_WHITE","mean_nonWHITE_weighted", "mean_WHITE_weighted")
  }
  if (analysis=="omt")
  {
    colnames(data) <- c("feature","fdr","coef","mean_nonOMT", "mean_OMT","mean_nonOMT_weighted", "mean_OMT_weighted")
  }
  write.table(data,file=file, quote=FALSE, sep="\t", col.names=TRUE, row.names=FALSE)
}

summarize.fdr <- function(molecular.data, molecular.result, cutoff=0.05, print=FALSE )
{
  pvalue <- molecular.result$pvalue
  fdr <- molecular.result$fdr
  coef <- molecular.result$coef
  mean.0 <- molecular.result$mean.0
  mean.1 <- molecular.result$mean.1
  mean.0.w <- molecular.result$mean.0.w
  mean.1.w <- molecular.result$mean.1.w
  signif.index <- which(fdr<cutoff)
  print(paste("Features with FDR <", cutoff, "=", length(signif.index)))
  if(print)
  {
    print(cbind(colnames(molecular.data)[signif.index],signif(pvalue[signif.index],3), signif(fdr[signif.index],3), signif(coef[signif.index],3), signif(mean.0[signif.index],3), signif(mean.1[signif.index],3)))
  }
  return(list(feature.sig=colnames(molecular.data)[signif.index], pvalue.sig=pvalue[signif.index], fdr.sig= fdr[signif.index], n.sig=length(signif.index), coef.sig=coef[signif.index], mean0.sig=mean.0[signif.index], mean1.sig=mean.1[signif.index], mean0.sig.w=mean.0.w[signif.index], mean1.sig.w=mean.1.w[signif.index]))#, n.sig=length(which(molecular.result$pvalue<cutoff))) )

}


plot.perm <- function(perm, n, cancer, analysis, type, cutoff)
{
  p=length(which(perm>= n))/length(perm)
  print(paste("Permutation P-value =", p))
  print(paste(type,": n.sig =", n))
  print(paste("Median perm n.sig =", median(perm)))
  file <- paste(cancer,"_",analysis,"_perm_",type,"_",cutoff,".pdf", sep="")
  pdf(file, width=3, height=3)
  par(mgp=c(2,1,0))
  if(type=="mut") {main="Mutation"}
  if(type=="cnv") {main="SCNA"}
  if(type=="methy") {main="Methy"}
  if(type=="mRNAseq") {main="mRNA"}
  if(type=="miRNA") {main="miRNA"}
  if(type=="rppa") {main="Protein"}
  if (n > max(perm))
  {
    myhist <- hist(perm, xlim=c(0, max(max(perm),n)), main="", xlab="# Genes" )

  }else{
    myhist <- hist(perm, main="", xlab="# Genes")
  }
  text(n, max(myhist$counts),paste("p-value =", p), pos=ifelse(n> 0.5* max(perm),2,4), col=ifelse(p<=0.05, "red","blue"), cex=1.1, xpd=T)
  abline(v=n, col=ifelse(p<=0.05, "red","blue"), lty=2, lwd=2)
  dev.off()
}

perm.cal <- function(cancer, analysis, type, molecular.pri, cutoff=0.05, seedV=1:100)
{
  print(paste("Calculate permutation for", cancer,":",type,":"))
  #  load(file=paste(cancer,"_", analysis,"_result.RData",sep=""))
  result <- get(paste(type,".result", sep=""))
  summary <- summarize.fdr(molecular.pri, result, cutoff=cutoff )
  n <- summary$n.sig
  perm <- c()
  print("For permutation:")
  for (seed in seedV)
  {
    ##print(seed)
    perm.result <- perm.summary <- c()
    if(type=="mut")
    {
      load(paste(scripts.dir, "/",cancer,"_",analysis,"/perm_sig_result_",seed,".RData", sep=""))
    }else if(type=="pre")
    {
      load(paste(scripts.dir, "/",cancer,"_",analysis,"/perm_sig_result_pre_",seed,".RData", sep=""))
    }else{

      load(paste(scripts.dir, "/",cancer,"_",analysis,"/perm_sig_result_",seed,".RData", sep=""))
    }
    perm.result <- get(paste("perm.",type,".result", sep=""))
    perm.summary <- summarize.fdr(molecular.pri, perm.result, cutoff=cutoff )
    perm <- c(perm, perm.summary$n.sig)
    do.call(rm, list(paste("perm.",type,".result", sep="")))

  }
  ##print (perm)
  ##print (n)
  plot.perm(perm, n, cancer, analysis, type, cutoff)
}
Yelab2020/FPSOmics documentation built on May 14, 2022, 8:59 p.m.