R/smoking_analysis.R

Defines functions smoking_analysis plot_qq plot_smoking_heatmap plot_smoking_analysis run_smoking_analysis prep_smoking_data process_meth_data

Documented in prep_smoking_data run_smoking_analysis smoking_analysis

process_meth_data <- function(X){
  # remove sites with missing values 
  X <- X[rowSums(is.na(X)) == 0,]
}

#' Download and organize Liu et al. and Hannum et al. Data
#'
#' @param data_path Directory for RData files that will store data
#' @param hannum_smk_status_path Path to hannum_smoking_status.txt
#'        that must be downloaded separately.
#' @return List of filenames for these RData files
prep_smoking_data <- function(data_path, hannum_smk_status_path){
  file_name1 <- paste(data_path,"liu.RData",sep="/")
  if (!file.exists(file_name1)){
    # Download the Liu et al. data 
    gse <- GEOquery::getGEO("GSE42861", destdir = data_path, GSEMatrix = TRUE)
    # smoking status; consider never-smokers [0], ex-smokers [1] and current smokers [2] (refer to occasional smokers as smokers)
    smk.liu <- Biobase::pData(gse[[1]])[["smoking status:ch1"]]
    # two samples have NA values for smoking; remove them from the analysis
    keep <- setdiff(1:length(smk.liu), which(smk.liu == "na"))
    smk.liu <- smk.liu[keep]
    smk.liu[which(smk.liu == "occasional")] <- "current"
    smk.liu <- 3-as.matrix(as.numeric(as.factor(smk.liu)))
    colnames(smk.liu) <- "smoking"
    # covariates
    cov.liu <- cbind(as.numeric(as.factor(Biobase::pData(gse[[1]])[["disease state:ch1"]])), as.numeric(as.factor(Biobase::pData(gse[[1]])[["gender:ch1"]])), as.numeric(as.factor(Biobase::pData(gse[[1]])[["age:ch1"]])))
    colnames(cov.liu) <- c("disease", "gender", "age")
    cov.liu <- cov.liu[keep,]
    # methylaion data
    X.liu <- process_meth_data(Biobase::exprs(gse[[1]])[,keep])
    rownames(smk.liu) <- colnames(X.liu)
    rownames(cov.liu) <- colnames(X.liu)
    
    # estimate cell-type proportions usign a reference-based approach 
    # methylation reference
    ref <- as.matrix(EpiDISH::centDHSbloodDMC.m[,c("Neutro","Eosino","CD4T","CD8T","Mono","B","NK")])
    W.liu <- EpiDISH::epidish(X.liu, ref)$estF
    
    liu <- list(X=X.liu,
                smk=smk.liu,
                cov=cov.liu,
                W=W.liu)
    gse.file <- paste0(data_path, "/GSE42861_series_matrix.txt.gz")
    other.gse.file <- paste0(data_path, "/GPL13534.soft")
    file.remove(gse.file)
    file.remove(other.gse.file)
    save(liu, file=file_name1)
    rm(gse, X.liu, smk.liu, cov.liu, W.liu, liu)

  }
  
  file_name2 <- paste(data_path,"hannum.RData",sep="/")
  if (!file.exists(file_name2)){
    # Download the Hannum et al. data 
    gse <- GEOquery::getGEO("GSE40279", destdir = data_path, GSEMatrix = TRUE)
    # smoking status; consider never-smokers [0], ex-smokers [1] and current smokers [2] (refer to occasional smokers as smokers)
    smk.hannum <- read.table(hannum_smk_status_path, header = T, row.names = 1, sep=",")
    # remove samples with NA values for smoking
    keep <- setdiff(1:nrow(smk.hannum), which(is.na(smk.hannum)))
    keep.ids <- rownames(smk.hannum)[keep]
    smk.hannum <- smk.hannum[keep.ids,,drop=F]
    smk.hannum$smk <- as.numeric(smk.hannum$smk)
    smk.hannum$smk[which(smk.hannum$smk == 2)] <- 0
    smk.hannum$smk[which(smk.hannum$smk == 1)] <- 2
    smk.hannum$smk[which(smk.hannum$smk == 3)] <- 1
    colnames(smk.hannum) <- "smoking"
    # covariates
    cov.hannum <- cbind(as.numeric(as.factor(Biobase::pData(gse[[1]])[["age (y):ch1"]])), as.numeric(as.factor(Biobase::pData(gse[[1]])[["gender:ch1"]])), as.numeric(as.factor(Biobase::pData(gse[[1]])[["plate:ch1"]])), as.numeric(as.factor(Biobase::pData(gse[[1]])[["ethnicity:ch1"]])))
    rownames(cov.hannum) <- Biobase::pData(gse[[1]])[["geo_accession"]]
    colnames(cov.hannum) <- c("age", "gender", "plate","ethnicity")
    cov.hannum <- cov.hannum[keep.ids,]
    # methylaion data
    X.hannum <- process_meth_data(Biobase::exprs(gse[[1]])[,keep.ids])
    rownames(smk.hannum) <- colnames(X.hannum)
    rownames(cov.hannum) <- colnames(X.hannum)
    
    # estimate cell-type proportions usign a reference-based approach 
    ref <- as.matrix(EpiDISH::centDHSbloodDMC.m[,c("Neutro","Eosino","CD4T","CD8T","Mono","B","NK")])
    W.hannum <- EpiDISH::epidish(X.hannum, ref)$estF
    
    hannum <- list(X=X.hannum,
                   smk=smk.hannum,
                   cov=cov.hannum,
                   W=W.hannum)
    gse.file <- paste0(data_path, "/GSE40279_series_matrix.txt.gz")
    other.gse.file <- paste0(data_path, "/GPL13534.soft")
    file.remove(gse.file)
    file.remove(other.gse.file)
    save(hannum, file=file_name2)
    rm(gse, X.hannum, smk.hannum, cov.hannum, W.hannum)
    
  }
  return(list(file_name1, file_name2))
}

#' Smoking analysis of methylation data
#'
#' Performs cell-type-specific analysis of blood methylation data in the context of smoking.
#' Provides p-values estimated by CellDMC and TCA while correcting for known covariates
#' as well unknown technical variation using PCs calculated from control probes. Combines cell types into myeloid and lymphoid compartments
#' for analysis.
#'
#' @param X Methylation values in matrix with sites as rows and samples as columns
#' @param smk Smoking status encoded as 0 (non-smoker), 1 (ex-smoker), or 2 (smoker) in
#'            a matrix with rownames as samples and 1 column entitled \code{smoking}
#' @param C1 Covariates in matrix format; for covariates that may affect methylation at the cell-type level. Rownames are samples and column names are
#'                   covariate names
#' @param C2 Covariates in matrix format; for covariates that may affect the methylation mixtures. Rownames are samples and column names are
#'                   covariate names
#' @param W Cell fractions in a matrix with cell types as column names and samples as
#'          row names. Expects cell types: "Neutro","Eosino","Mono","CD4T","CD8T","B","NK"
#' @param smoking_cpgs List of CpG names to analyze for associations
#' @param dataset.name Name of dataset for naming output variables.
#' @return A list of results. Each slot contains p-values output by each method
run_smoking_analysis <- function(X, smk, C1, C2, W, smoking_cpgs, dataset.name){
  # combine cell types into myeloid and lymphoid compartments
  W.2 <- cbind(rowSums(W[,c("Neutro","Eosino","Mono")]),
               rowSums(W[,c("CD4T","CD8T","B","NK")]))
  colnames(W.2) <- c("MYE","LYM")
  
  # calculate PCs from low variance probes , to be treated as control probes (as in Lenhe et al. 2015); here, since we don't work with IDAT files and therefore don't have actual control probes, we use sites with the least variation in the data as control probes (as in Rahmani et al. 2019).
  site.variances <- matrixStats::rowVars(X)
  names(site.variances) <- rownames(X)
  p <- 1000
  low.var.sites <- names(head(sort(site.variances), p))
  low.var.pca <- prcomp(t(X[low.var.sites,]), center=TRUE, scale=TRUE, rank=10)
  
  C2_ <- if (is.null(C2)) low.var.pca$x else cbind(C2, low.var.pca$x)
  
  # remove polymorphic or cross-reactive sites and non-autosomal sites
  nonspecific_probes <- read.table("https://raw.githubusercontent.com/cozygene/glint/master/parsers/assets/nonspecific_probes.txt")[,1]
  XY_probes <- read.table("https://raw.githubusercontent.com/cozygene/glint/master/parsers/assets/HumanMethylationSites_X_Y.txt")[,1]
  polymorphic_probes <- read.table("https://raw.githubusercontent.com/cozygene/glint/master/parsers/assets/polymorphic_cpgs.txt")[,1]
  
  # remove sites with very low variance that are unlikely to exhibit biological signal
  low_var_sites <- names(which(site.variances<0.0001))
  
  exclude <- union(nonspecific_probes,union(XY_probes,union(low_var_sites,polymorphic_probes)))

  X.final <- X[setdiff(rownames(X),exclude),]
  message(sprintf("%i sites in final data; genome-wide significance at -log10 scale: %f", nrow(X.final),-log10(0.05/nrow(X.final))))
  
  message("Run celldmc and tca and extract p-values for the 7 smoking cpgs...")
  celldmc.res <- CellDMC(X.final[smoking_cpgs,], smk, W.2, cov.mod = cbind(C1, C2_))
  celldmc.pvals <- list(celldmc.res$coe$MYE$p, celldmc.res$coe$LYM$p)
  tca.mdl <- tca(X = X.final[smoking_cpgs,], W = W.2, C1 = cbind(C1, smk), C2 = C2_)
  tca.pvals <- list(tca.mdl$gammas_hat_pvals[,"MYE.smk"], tca.mdl$gammas_hat_pvals[,"LYM.smk"])
  tca.pvals.joint <- list(tca.mdl$gammas_hat_pvals.joint[,"smk"])
  print(dim(X.final))
  message("Perform a full cell-type-specific EWAS in order to asses calibration of TCA and CellDMC...")
  tca.mdl.full <- tca(X = X.final, W = W.2, C1 = cbind(C1, smk), C2 = C2_)
  tca.pvals.full <- list(tca.mdl.full$gammas_hat_pvals[,"MYE.smk"], tca.mdl.full$gammas_hat_pvals[,"LYM.smk"])
  tca.pvals.joint.full <- list(tca.mdl.full$gammas_hat_pvals.joint[,"smk"])
  celldmc.res.full <- CellDMC(X.final, smk, W.2, cov.mod = cbind(C1, C2_))
  celldmc.pvals.full <- list(celldmc.res.full$coe$MYE$p, celldmc.res.full$coe$LYM$p)
  
  res.list <- list()
  
  res.list[[sprintf("celldmc.pvals.%s", dataset.name)]] <- celldmc.pvals
  res.list[[sprintf("tca.pvals.%s", dataset.name)]] <- tca.pvals
  res.list[[sprintf("tca.pvals.joint.%s", dataset.name)]] <- tca.pvals.joint
  
  res.list[[sprintf("celldmc.pvals.full.%s", dataset.name)]] <- celldmc.pvals.full
  res.list[[sprintf("tca.pvals.full.%s", dataset.name)]] <- tca.pvals.full
  res.list[[sprintf("tca.pvals.joint.full.%s", dataset.name)]] <- tca.pvals.joint.full
  
  return(res.list)
}

plot_smoking_analysis <- function(smoking.res, outfile){
  p.celldmc <- plot_smoking_heatmap(smoking.res$smoking_cpgs, smoking.res$celldmc.pvals.liu, smoking.res$celldmc.pvals.hannum, smoking.res$cell_types, "CellDMC")
  p.tca <- plot_smoking_heatmap(smoking.res$smoking_cpgs, smoking.res$tca.pvals.liu, smoking.res$tca.pvals.hannum, smoking.res$cell_types, "TCA")
  p.tca.joint <- plot_smoking_heatmap(smoking.res$smoking_cpgs, smoking.res$tca.pvals.joint.liu, smoking.res$tca.pvals.joint.hannum, c("Joint"), "TCA - joint test")
  p.qq <- plot_qq(smoking.res)
  p.final <- ggarrange(p.celldmc, p.tca, p.tca.joint, ncol = 3, nrow = 1, labels = c("a","b","c"))
  p.final <- ggarrange(p.final, p.qq, ncol=1,nrow=2, labels=c("", "d"), heights=c(3,4.5))
  ggsave(outfile, plot = p.final, width = 8.5, height = 10)
  return(p.final)
}


plot_smoking_heatmap <- function(cpgs, pvals.liu, pvals.hannum, cell_types, method_title){
  data <- data.frame(matrix(ncol = 4, nrow = 2*length(cell_types)*length(cpgs)))
  colnames(data) <- c("cpg","cell_type","neg_log_pval","dataset")
  for (i in 1:length(cell_types)){
    data[((i-1)*length(cpgs)+1):(i*length(cpgs)),] <- cbind(cpgs,
                                                            rep(c(cell_types[[i]])),
                                                            -log10(pvals.hannum[[i]]),
                                                            rep(c("Hannum et al."),
                                                                length(cpgs)))
  }
  for (i in 1:length(cell_types)){
    data[(length(cell_types)*length(cpgs)+(i-1)*length(cpgs)+1):(length(cell_types)*length(cpgs)+i*length(cpgs)),] <- cbind(cpgs,
                                                                                                                            rep(c(cell_types[[i]])),
                                                                                                                            -log10(pvals.liu[[i]]),
                                                                                                                            rep(c("Liu et al."),
                                                                                                                                length(cpgs)))
  }
  data$neg_log_pval <- as.numeric(data$neg_log_pval)
  data$cpg <- factor(data[,1],rev(data[1:length(cpgs),1]))
  axis.text.x <- if (length(cell_types)-1) element_text(colour = c("#009E73","#D55E00" )) else axis.text.x = NULL
  plt <- ggplot(data, aes(cell_type, cpg, fill= neg_log_pval)) +  geom_tile() + 
    scale_fill_gradient(low = "#808080", high = "#FFFF00", na.value = "#FFFF00", limits = c(0,15)) +
    geom_text(aes(label = round(data$neg_log_pval,1))) + 
    geom_hline(aes(yintercept=2.5), size = 1) +
    facet_wrap(~dataset) +
    ggtitle(method_title) +
    ylab(NULL) +
    xlab(NULL) +
    labs(fill=expression(-log[10](P))) +
    
    theme(axis.text.y = element_text(colour = c("#009E73","#009E73","#D55E00","#D55E00",
                                                "#D55E00","#D55E00","#D55E00")),
          axis.text.x = axis.text.x, plot.title = element_text(hjust = 0.5),
          strip.background = element_rect(fill="white"),
          panel.background = element_rect(fill = "white", colour = "white"),
          legend.position="bottom", strip.text = element_text(size = 10,face = "bold")) 
  return(plt)
}

plot_qq <- function(smoking.res, neglogp.thresh=10){
  liu.sites <- length(smoking.res$tca.pvals.joint.full.liu[[1]])
  hannum.sites <- length(smoking.res$tca.pvals.joint.full.hannum[[1]])
  obs.pvals <- c(-log10(sort(smoking.res$celldmc.pvals.full.liu[[1]])),
                 -log10(sort(smoking.res$tca.pvals.full.liu[[1]])),
                 -log10(sort(smoking.res$celldmc.pvals.full.liu[[2]])),
                 -log10(sort(smoking.res$tca.pvals.full.liu[[2]])),
                 -log10(sort(smoking.res$tca.pvals.joint.full.liu[[1]])),
                 -log10(sort(smoking.res$celldmc.pvals.full.hannum[[1]])),
                 -log10(sort(smoking.res$tca.pvals.full.hannum[[1]])),
                 -log10(sort(smoking.res$celldmc.pvals.full.hannum[[2]])),
                 -log10(sort(smoking.res$tca.pvals.full.hannum[[2]])),
                 -log10(sort(smoking.res$tca.pvals.joint.full.hannum[[1]])))
  exp.pvals <- c(rep(-log10((1:liu.sites)/liu.sites), 5),
                 rep(-log10((1:hannum.sites)/hannum.sites), 5))
  methods <- do.call(c, lapply(c(liu.sites, hannum.sites), function(m){
    c(rep("CellDMC", m),
      rep("TCA", m),
      rep("CellDMC", m),
      rep("TCA", m),
      rep("TCA", m))
  }))
  cell.types <- do.call(c, lapply(c(liu.sites, hannum.sites), function(m){
    c(rep("MYE",m*2),
      rep("LYM", m*2),
      rep("TCA joint test", m))
  }))
  data.sets <- c(rep("Liu et al.", liu.sites*5),
                 rep("Hannum et al.", hannum.sites*5))
  
  pval.df <- data.frame(pval.obs=obs.pvals,
                        pval.exp=exp.pvals,
                        Method=methods,
                        CellType=cell.types,
                        Dataset=data.sets,
                        stringsAsFactors = TRUE)
  # exclude sites with -log10(p) > thresh
  pval.df <- pval.df[pval.df$pval.obs <= neglogp.thresh & pval.df$pval.exp <= neglogp.thresh,]
  lim <- min(max(pval.df$pval.exp), max(pval.df$pval.obs))
  levels(pval.df$CellType) <- c("LYM", "MYE", "Joint")
  
  p <- ggplot(pval.df, aes(x = pval.exp, y=pval.obs, colour = Method)) + 
    facet_grid(Dataset ~ CellType) + 
    stat_binhex(geom = "point", bins=1000, size=1) +
    #geom_segment(aes(x = 0, xend = lim, y = 0, yend = lim), colour="black") +
    geom_abline() +
    xlab(expression(Expected~-log[10](P))) + ylab(expression(Observed~-log[10](P))) +
    theme_bw() +
    guides(fill="none") + 
    theme(legend.position="bottom", strip.background = element_blank(), panel.border = element_rect(colour = "black"))
  return(p)
}

#' Replicate smoking analysis of Liu et al. and Hannum et al. datasets
#'
#' @param hannum_smk_status_path Path to CSV containing GSM IDs as rownames and a column titled
#'                               \code{smk} that contains either \code{Never}, \code{Current Smoker},
#'                               or \code{Past Smoker} for each sample. 
#' @param data_dir Directory to save Liu and Hannum data for analysis
#' @param results_dir Directory to save results
#' @param plot_dir Directory to save plots
#' @param plot_type Extension for saving plot graphics, such as pdf or png
#' @param n.sites Number of sites to subset from each subset for quick execution (for testing)
#' @return A list of results. Slot \code{results} contains a list of p-values generated by each method.
#'         Slot \code{plot} contains plot of results.
#' @export
smoking_analysis <- function(hannum_smk_status_path, data_dir, results_dir, 
                             plot_dir, plot_type="tiff", n.sites=NULL){
  data.paths <- prep_smoking_data(data_dir, hannum_smk_status_path)
  load(data.paths[[1]]) # Load Liu
  load(data.paths[[2]]) # Load Hannum
  # Smoking cpgs from Su et al. that were considered by Jing et al.
  smoking_cpgs <- c("cg05575921","cg21566642","cg09935388","cg06126421",
                    "cg03636183","cg19859270","cg09099830")
  if (!is.null(n.sites)){
    set.seed(1000)
    liu.sites <- rownames(liu$X)[!(rownames(liu$X) %in% smoking_cpgs)]
    liu.sites <- c(smoking_cpgs, sample(liu.sites, n.sites))
    liu$X <- liu$X[liu.sites,]
    hannum.sites <- rownames(hannum$X)[!(rownames(hannum$X) %in% smoking_cpgs)]
    hannum.sites <- c(smoking_cpgs, sample(hannum.sites, n.sites))
    hannum$X <- hannum$X[hannum.sites,]
  }
  # Cell types will be combined into myeloid and lymphoid compartments
  cell.types <- c("MYE","LYM")
  
  # Liu et al. analysis
  liu.smoking.res <- run_smoking_analysis(X = liu$X,
                                          smk = liu$smk[,1], 
                                          C1 = liu$cov, 
                                          C2 = NULL,
                                          W = liu$W, 
                                          smoking_cpgs = smoking_cpgs, 
                                          dataset.name = "liu")
  save(liu.smoking.res, file=paste(results_dir,"liu.smoking.res.RData",sep="/") )
  
  # Hannum et al. analysis
  C2.hannum <- matrix(0,nrow(hannum$cov),8)
  for (i in 1:8) C2.hannum[,i] <- as.numeric(hannum$cov[,"plate"] == i)
  hannum.smoking.res <- run_smoking_analysis(X = hannum$X,
                                             smk = hannum$smk[,1], 
                                             C1 = hannum$cov[,c("age","gender")], 
                                             C2 = C2.hannum,
                                             W = hannum$W, 
                                             smoking_cpgs = smoking_cpgs, 
                                             dataset.name = "hannum")
  save(hannum.smoking.res, file=paste(results_dir,"hannum.smoking.res.RData",sep="/") )
  
  smoking.res <- c(liu.smoking.res,
                   hannum.smoking.res,
                   list("cell_types" = cell.types,
                        "smoking_cpgs" = smoking_cpgs))
  outfile <- paste(plot_dir, "/Figure3.", plot_type,
                   sep="")
  smoking.plot <- plot_smoking_analysis(smoking.res, outfile)
  return(list(smoking.res=smoking.res,
              smoking.plot=smoking.plot))
}
cozygene/CellTypeSpecificMethylationAnalysis documentation built on Jan. 28, 2022, 11:20 a.m.