inst/1_Aclust_data_import.R

########### Summary ------------------------------------------------------------------------------------------------
#
# Purpose: This file was used to obtain co-methylated clusters of CpGs using the Adjacent Site Clustering algorithm
#          in Sofer et al. (2011) (PMID:23990415)
# 
# Input: (1) betaVals_mat: a beta value matrix of all CpGs on the array
#        (2) cpgLocation_df: an annotation file that indicates locations of CpGs. 
#            This file has rows = cpg ids, columns = chromosome, location. An example file is at /data/cpgLocation_df.csv.
#
# Output: startEndCPG_df, which is beta value matrix for clusters of CpGs. 
#         This file has rows = cpg ids, columns = cluster number, chr, start of cluster, end of cluster, sample ids. 
#         An example file is at /data/startEndCpG_df.csv.
#
############# ---------------------------------------------------------------------------------------------------------


######  Library  ##############################################################
data.dir <- "inst/extdata/"

# devtools::install_github('tamartsi/Aclust', dependencies = TRUE)
library(Aclust)
library(DMRcate)
library(data.table)

library(DMRcompare)



######  1. Aclust - prepare annotation file  ##################################

# data downloaded from https://rforge.net/IMA/ , under '4 Annotation file'
# This loads 11 lists, from 5-19Mb, and the full annotation matrix (0.5Gb)
load(paste0(data.dir, "fullannotInd.rda"))
rm(EXON1Ind, GENEBODYInd, ISLANDInd, NSHELFInd, NSHOREInd, SSHELFInd,
   SSHOREInd, TSS1500Ind, TSS200Ind, UTR3Ind, UTR5Ind)

# Turn off "stringsAsFactors"
annot <- as.data.frame(fullannot, stringsAsFactors = FALSE)


###  CPG Locations  ###
vars <- c("ILMNID", "CHR", "MAPINFO")
cpg.location <- annot[vars]
names(cpg.location) <- c("ILMNID", "chr", "MAPINFO")
cpg.location$MAPINFO <- as.numeric(cpg.location$MAPINFO)
cpg.location$chr <- paste0("chr", cpg.location$chr)
cpg.location$ILMNID <- as.factor(cpg.location$ILMNID)

cpg.location <- cpg.location[substr(cpg.location$ILMNID, 1, 2) != "rs", ]
anyNA(cpg.location)

# cpgLocation_df <- cpg.location
# devtools::use_data(cpgLocation_df)
# saveRDS(cpg.location, paste0(data.dir, "cpg.location.RDS"))


###  Cluster by Chromosome  ###
vars <- c("ILMNID", "INFINIUM_DESIGN_TYPE", "CHR", "MAPINFO",
          "UCSC_REFGENE_NAME", "UCSC_REFGENE_GROUP", "UCSC_CPG_ISLANDS_NAME",
          "RELATION_TO_UCSC_CPG_ISLAND")
annots450k <- annot[vars]

# change colname 'MAPINFO' by name 'Coordinate_37'
colnames(annots450k) <- c("IlmnID", "Infinium_Design_Type", "CHR",
                          "Coordinate_37", "UCSC_RefGene_Name",
                          "UCSC_RefGene_Group", "UCSC_CpG_Islands_Name",
                          "Relation_to_UCSC_CpG_Island")

annots450k$Coordinate_37 <- as.numeric(annots450k$Coordinate_37)

# For Aclust, the locus must be sorted according to 'CHR' and
#   'Coordinate_37/MAPINFO' together in ascending order
annot_450k <- annots450k[order(annots450k$CHR, annots450k$Coordinate_37), ]
nrow(annot_450k) # 485577 cpgs
# Remove the cpgs (loci) that have no CHR (chromosome) name
annot_450k <- annot_450k[!is.na(annot_450k$CHR), ]
nrow(annot_450k) # 485512 cpgs

# make a list of annotation files for different chromosomes
chrome_annot_files <- lapply(1:22, function(chrom){

  out <- annot_450k[(annot_450k$CHR == chrom), ]
  subset(out, !duplicated(out[, 4]))

})

nrow(chrome_annot_files[[1]]) # 46857 cpgs

# Save our results and clean up the workspace.
# saveRDS(chrome_annot_files,
#         paste0(data.dir, "annotation-450K-Aclust-by-chromosome.rds"))
rm(fullannot, vars, annots450k, annot_450k)



######  2. Beta value files  ##################################################

beta_value1 <- read.csv(paste0(data.dir, "GSE41169meth_betaValue.csv"),
                        row.names = 1, header = TRUE)
nrow(beta_value1) # 476944 cpgs


###  Trim CPGs  ###
beta_value12 <- rmSNPandCH(as.matrix(beta_value1),
                           dist = 2, mafcut = 0.05, rmXY = TRUE)
nrow(beta_value12) # 419121 cpgs

# Keep only the probes for which (0.05 < betaval < 0.95) in at least one sample
betaLogi_mat <- as.matrix(beta_value12) > 0.05 & as.matrix(beta_value12) < 0.95
betaKeep_logi <- rowSums(betaLogi_mat) > 0
beta.value.all <- beta_value12[betaKeep_logi, ]
nrow(beta.value.all) # 356603 cpgs


###  Rename Columns  ###
# Because we will use this data for further simulation, we will artificially
#   create cases ("Tumor") and control ("Normal") columns
betaNames_char <- colnames(beta.value.all)
betaNames_char <- substr(betaNames_char, 7, 10)
betaNames_char <- paste0(betaNames_char,
                         c(rep("-Tumor", 7), rep("-Normal", 7)))
colnames(beta.value.all) <- betaNames_char

###  Save Beta Matrix  ###
# # Necessary for the data simulation step (2_simulatedata.R) and further steps
# betaVals_mat <- beta.value.all
# devtools::use_data(betaVals_mat)
# saveRDS(beta.value.all, paste0(data.dir, "beta.value.all.rds"))
rm(beta_value1, beta_value12, betaLogi_mat, betaKeep_logi, betaNames_char)



######  3. Assign cpgs to clusters  ###########################################

# This takes 12-13 minutes:
clusters_ls <- lapply(1:22, function(chrome){

  assign.to.clusters(betas = beta.value.all,
                     annot = as.data.table(chrome_annot_files[[chrome]]),
                     dist.thresh = 0.5, bp.merge = 200,
                     dist.type = "spearman", method = "complete")

})

results_ls <- unlist(clusters_ls, recursive = FALSE)
length(results_ls) # 284117

# select clusters with 5 cpgs or more
resultsTrim_ls <- results_ls[lengths(results_ls) >= 5]
length(resultsTrim_ls) # 3063

a <- Sys.time()
cluster_tab <- ConvertCPGList(cpgs_ls = resultsTrim_ls,
                              methylval_df = beta.value.all)
Sys.time() - a # 2.298284 min
rm(clusters_ls, results_ls, resultsTrim_ls)



######  4. Add ranges for clusters  ###########################################

clusterOrd_tab <- cluster_tab[order(cluster_tab$cluster, cluster_tab$probeID)]

###  merge with annotation to get position info  ###
annot.location <- annot[, c("CHR", "MAPINFO")]
annot.location$probeID <- row.names(annot.location)
annot.location <- annot.location[annot.location$probeID, ]
location_tab <- merge(clusterOrd_tab, annot.location, by = "probeID")
rm(cluster_tab, annot.location, clusterOrd_tab)


###  Start and End of Each Cluster  ###
location_tab$MAPINFO <- as.numeric(as.character(location_tab$MAPINFO))
# Order the location_tab row according to cluster id ('cluster') and 'MAPINFO'
#   together in ascending order
location_tab <-
  location_tab[order(location_tab$cluster, location_tab$MAPINFO), ]

# Start
start_df <- aggregate(MAPINFO ~ as.factor(cluster),
                   data = location_tab, function(x) min(x))
colnames(start_df) <- c("cluster", "start_position")
start_df$cluster <- as.numeric(as.character(start_df$cluster))

# End
end_df <- aggregate(MAPINFO ~ as.factor(cluster),
                 data = location_tab, function(x) max(x))
colnames(end_df) <- c("cluster", "end_position")
end_df$cluster <- as.numeric(as.character(end_df$cluster))


###  Merge with Methylation Values  ###
start_tab <- merge(location_tab, start_df, by = "cluster")
startEnd_tab <- merge(start_tab, end_df, by = "cluster")
rm(location_tab, start_df, end_df, start_tab)


###  Rename and Order Columns  ###
colnames(startEnd_tab)[1] <- "Clusternumber"
colnames(startEnd_tab)[2] <- "cpg"
startEnd_tab$coordinate_37 <- startEnd_tab$MAPINFO
startEnd_tab$chromosome <- paste0("chr", startEnd_tab$CHR)
startEnd_df <- as.data.frame(startEnd_tab)

impVars_df <- startEnd_df[c("Clusternumber", "cpg", "CHR", "MAPINFO",
                            "start_position", "end_position", "coordinate_37",
                            "chromosome")]
otherVars_df <- startEnd_df[setdiff(names(startEnd_df), colnames(impVars_df))]
startEndCPG_df <- cbind(impVars_df, otherVars_df)

# For the RunDMRcate() function to properly detect DMRs, the rownames of this
#   data frame must be equal to the CPGs for this data frame
rownames(startEndCPG_df) <- startEndCPG_df$cpg

# devtools::use_data(startEndCPG_df)
# write.csv(startEndCPG_df,
#           file = paste0(data.dir, "A-clust-results.csv"), row.names = FALSE)
rm(startEnd_tab, startEnd_df, impVars_df, otherVars_df)
gabrielodom/DMRcomparePaper documentation built on May 25, 2019, 2:52 a.m.