R/createTADdata.R

Defines functions createTADdata

Documented in createTADdata

#' Function to create a data matrix used for building a predictive model to
#' classify boundary regions from functional genomic elements
#'
#' @param bounds.GR a GRanges object with chromosomal coordinates of TAD
#' boundaries used to identify positive cases (can be obtained using
#' \code{\link{extractBoundaries}}). Required.
#' @param resolution Numeric, the width to bin the genome at, should match the
#' resolution that TADs were called at. Required.
#' @param genomicElements.GR a GRangesList object containing GRanges objects
#' for each ChIP-seq data to leverage in the random forest model (can be
#' obtained using the \code{\link{bedToGRangesList}}). Required.
#' @param featureType Character, controls how the feature space is constructed
#' (one of either "binary" (overlap yes/no), "oc" (overlap counts, the number
#' of overlaps), "op" (overlap percent, the percent of bin width covered by the
#' genomic annotation), or "distance" (log2-transformed distance from the center
#' of the nearest genomic annotation to the center of the bin); default is
#' "distance"). Required.
#' @param resampling Character, controls if and how the data should be
#' resampled to create balanced classes of boundary vs. nonboundary regions (one
#' of either "none" - no re-sampling, "ros" - Random Over-Sampling, "rus" -
#' Random Under-Sampling. Required.
#' @param trainCHR Character vector of chromosomes to use to build the binned
#' data matrix for training. Required.
#' @param predictCHR Character vector of chromosomes to use to build the binned
#' data matrix for testing. Default in NULL, indicating no test data is created.
#'  If trainCHR=predictCHR then a 7:3 split is created.
#' @param genome version of the human genome assembly. Used to filter out
#' bases overlapping centromeric regions. Accepted values - hg19 (default) or 
#' hg38.
#'
#' @return A list object containing two data.frames: 1) the training data, 2)
#' the test data (only if predictCHR is not NULL, otherwise it is NA). "y" is
#' an indicator whether the corresponding bin is a TAD boundary, and the
#' subsequent columns have the association measures between bins and the
#' genomic annotations
#' @export
#'
#' @import IRanges GenomicRanges rCGH
#'
#' @examples
#' # Create training data for CHR21 and testing data for CHR22 with
#' # 5 kb binning, oc-type predictors from 26 different transcription factor
#' # binding sites from the GM12878 cell line, and random under-sampling
#'
#' # Read in ARROWHEAD-called TADs at 5kb
#' data(arrowhead_gm12878_5kb)
#'
#' #Extract unique boundaries
#' bounds.GR <- extractBoundaries(domains.mat = arrowhead_gm12878_5kb,
#'                                filter = FALSE,
#'                                CHR = c("CHR21", "CHR22"),
#'                                resolution = 5000)
#'
#' # Read in GRangesList of 26 TFBS
#' data(tfbsList)
#'
#' tadData <- createTADdata(bounds.GR = bounds.GR,
#'                          resolution = 5000,
#'                          genomicElements.GR = tfbsList,
#'                          featureType = "oc",
#'                          resampling = "rus",
#'                          trainCHR = "CHR21",
#'                          predictCHR = "CHR22")
createTADdata <- function(bounds.GR, resolution, genomicElements.GR, featureType = "distance",
                          resampling, trainCHR, predictCHR = NULL, genome = "hg19") {

    resolution = as.integer(resolution)

    #ESTABLISHING CHROMOSOME-SPECIFIC SEQINFO#

    #LOADING CHROMOSOME LENGTHS#
    if (genome %in% c("hg19", "hg38")) {
        ifelse(genome == "hg19", centromeres <- rCGH::hg19, centromeres <- rCGH::hg38)
        centromeres$chrom <- paste0("CHR", centromeres$chrom)
    } else {
        print("Wrong genome version. Use hg19 or hg38")
        return(NULL)
    }
    
    #ESTABLISHING DATA MATRIX FOR MODELING#

    if (is.null(predictCHR)) {
        # you are not interested in performance therefore no holdout test data is created

        # building training set
        bounds.GR.flank <- flank(bounds.GR[which(as.character(seqnames(bounds.GR)) %in%
                                                     tolower(trainCHR))], width = (resolution/2), both = TRUE)

        data_mat_list <- list()
        outcome_list <- list()
        train_list <- list()

        for (i in seq_len(length(trainCHR))) {
            start <- (resolution/2)
            chrLength <- centromeres$length[centromeres$chrom %in% trainCHR][i]
            centromereStart <- as.integer(centromeres$centromerStart[centromeres$chrom==trainCHR[i]])
            centromereEnd <- as.integer(centromeres$centromerEnd[centromeres$chrom==trainCHR[i]])
            end <- chrLength - (chrLength %% resolution) + resolution/2

            data_mat_list[[i]] <- matrix(nrow=length(seq(start,end-1,resolution)[-which(seq(start,end-1,resolution) %in% c(centromereStart:centromereEnd))]),
                                         ncol=length(genomicElements.GR))

            dat_mat_gr <- GRanges(seqnames = tolower(trainCHR[i]),
                                  IRanges(start = seq(start,end-1,resolution)[-which(seq(start,end-1,resolution) %in% c(centromereStart:centromereEnd))],
                                          width=resolution))

            if (featureType == "distance") {
                # for use of distance type features
                dat_mat_gr <- GRanges(seqnames = tolower(trainCHR[i]), IRanges(start = (start(dat_mat_gr) +
                                                                                            end(dat_mat_gr))/2, width = 1))
                for (k in seq(length(genomicElements.GR))) {
                    d <- distance_func(dat_mat_gr, genomicElements.GR[[k]])
                    data_mat_list[[i]][, k] <- d
                }
                for (h in seq_len(ncol(data_mat_list[[i]]))) {
                    data_mat_list[[i]][, h] <- log(data_mat_list[[i]][, h] + 1, base = 2)
                }
            } else if (featureType == "binary") {
                for (k in seq_len(length(genomicElements.GR))) {
                    cb <- binary_func(dat_mat_gr, genomicElements.GR[[k]])
                    data_mat_list[[i]][, k] <- cb
                }
            } else if (featureType == "oc") {
                for (k in seq_len(length(genomicElements.GR))) {
                    co <- count_func(dat_mat_gr, genomicElements.GR[[k]])
                    data_mat_list[[i]][, k] <- co
                }
            } else if (featureType == "op") {
                for (k in seq_len(length(genomicElements.GR))) {
                    cp <- percent_func(dat_mat_gr, genomicElements.GR[[k]])
                    data_mat_list[[i]][, k] <- cp
                }
            } else {
                cs <- signal_func(dat_mat_gr, genomicElements.GR[[k]])
                data_mat_list[[i]][, k] <- cs
            }

            outcome_list[[i]] <- countOverlaps(GRanges(seqnames = tolower(trainCHR[i]),
                                                       IRanges(start = seq(start,end-1,resolution)[-which(seq(start,end-1,resolution) %in% c(centromereStart:centromereEnd))],
                                                               width = resolution)), bounds.GR.flank)
            outcome_list[[i]] <- ifelse(outcome_list[[i]] >= 1, 1, 0)

            train_list[[i]] <- cbind.data.frame(outcome_list[[i]], as.matrix(data_mat_list[[i]]))
            names(train_list[[i]]) <- c("y", names(genomicElements.GR))
            train_list[[i]]$y <- factor(train_list[[i]]$y)
            levels(train_list[[i]]$y) <- c("No", "Yes")

            if (resampling == "ros") {
                # assign sample indices
                sampids.train <- sample(x = which(train_list[[i]]$y == "Yes"), size = length(which(train_list[[i]]$y ==
                                                                                                       "No")), replace = TRUE)

                train_list[[i]] <- rbind.data.frame(train_list[[i]][which(train_list[[i]]$y ==
                                                                              "No"), ], train_list[[i]][sampids.train, ])

                # Randomly shuffle the data
                train_list[[i]] <- train_list[[i]][sample(nrow(train_list[[i]])),
                ]

            } else if (resampling == "rus") {
                sampids.train <- sample(x = which(train_list[[i]]$y == "No"), size = length(which(train_list[[i]]$y ==
                                                                                                      "Yes")), replace = FALSE)

                train_list[[i]] <- rbind.data.frame(train_list[[i]][which(train_list[[i]]$y ==
                                                                              "Yes"), ], train_list[[i]][sampids.train, ])

                # Randomly shuffle the data
                train_list[[i]] <- train_list[[i]][sample(nrow(train_list[[i]])),
                ]

            # } else if (resampling == "smote") {
            #     train_list[[i]] <- SMOTE(y ~ ., data = train_list[[i]], perc.over = 100,
            #                              perc.under = 200)
            # 
            #     # Randomly shuffle the data
            #     train_list[[i]] <- train_list[[i]][sample(nrow(train_list[[i]])),
            #     ]
            } else {
                train_list[[i]] = train_list[[i]]
            }


        }

        train_list <- do.call("rbind.data.frame", train_list)

    } else if (isTRUE(all.equal(sort(trainCHR), sort(predictCHR)))) {
        # you are training on the same chr(s) that you are predicting on so we build the
        # full datamatrix (including predictor type) and then split the data into
        # training and testing then perform resampling on training set

        bounds.GR.flank <- flank(bounds.GR, width = (resolution/2), both = TRUE)

        data_mat_list <- list()
        outcome_list <- list()
        full_list <- list()
        train_list <- list()
        test_list <- list()

        for (i in seq_len(length(trainCHR))) {
            start <- (resolution/2)
            chrLength <- centromeres$length[centromeres$chrom %in% trainCHR][i]
            centromereStart <- as.integer(centromeres$centromerStart[centromeres$chrom==trainCHR[i]])
            centromereEnd <- as.integer(centromeres$centromerEnd[centromeres$chrom==trainCHR[i]])
            end <- chrLength - (chrLength %% resolution) + resolution/2

            data_mat_list[[i]] <- matrix(nrow=length(seq(start,end-1,resolution)[-which(seq(start,end-1,resolution) %in% c(centromereStart:centromereEnd))]),
                                         ncol=length(genomicElements.GR))

            dat_mat_gr <- GRanges(seqnames = tolower(trainCHR[i]),
                                  IRanges(start = seq(start,end-1,resolution)[-which(seq(start,end-1,resolution) %in% c(centromereStart:centromereEnd))],
                                          width=resolution))

            if (featureType == "distance") {
                # for use of distance type features
                dat_mat_gr <- GRanges(seqnames = tolower(trainCHR[i]), IRanges(start = (start(dat_mat_gr) +
                                                                                            end(dat_mat_gr))/2, width = 1))
                for (k in seq_len(length(genomicElements.GR))) {
                    d <- distance_func(dat_mat_gr, genomicElements.GR[[k]])
                    data_mat_list[[i]][, k] <- d
                }
                for (h in seq_len(ncol(data_mat_list[[i]]))) {
                    data_mat_list[[i]][, h] <- log(data_mat_list[[i]][, h] + 1, base = 2)
                }
            } else if (featureType == "binary") {
                for (k in seq_len(length(genomicElements.GR))) {
                    cb <- binary_func(dat_mat_gr, genomicElements.GR[[k]])
                    data_mat_list[[i]][, k] <- cb
                }
            } else if (featureType == "oc") {
                for (k in seq_len(length(genomicElements.GR))) {
                    co <- count_func(dat_mat_gr, genomicElements.GR[[k]])
                    data_mat_list[[i]][, k] <- co
                }
            } else if (featureType == "op") {
                for (k in seq_len(length(genomicElements.GR))) {
                    cp <- percent_func(dat_mat_gr, genomicElements.GR[[k]])
                    data_mat_list[[i]][, k] <- cp
                }
            }

            outcome_list[[i]] <- countOverlaps(GRanges(seqnames = tolower(trainCHR[i]),
                                                       IRanges(start = seq(start,end-1,resolution)[-which(seq(start,end-1,resolution) %in% c(centromereStart:centromereEnd))],
                                                               width = resolution)), bounds.GR.flank)
            outcome_list[[i]] <- ifelse(outcome_list[[i]] >= 1, 1, 0)

            full_list[[i]] <- cbind.data.frame(outcome_list[[i]], as.matrix(data_mat_list[[i]]))

            names(full_list[[i]]) <- c("y", names(genomicElements.GR))
            full_list[[i]]$y <- factor(full_list[[i]]$y)
            levels(full_list[[i]]$y) <- c("No", "Yes")

            inTrainingSet <- sample(length(full_list[[i]]$y), floor(length(full_list[[i]]$y) *
                                                                        0.7))
            train_list[[i]] <- full_list[[i]][inTrainingSet, ]
            test_list[[i]] <- full_list[[i]][-inTrainingSet, ]

            if (resampling == "ros") {
                # assign sample indices
                sampids.train <- sample(x = which(train_list[[i]]$y == "Yes"), size = length(which(train_list[[i]]$y ==
                                                                                                       "No")), replace = TRUE)

                train_list[[i]] <- rbind.data.frame(train_list[[i]][which(train_list[[i]]$y ==
                                                                              "No"), ], train_list[[i]][sampids.train, ])

                # Randomly shuffle the data
                train_list[[i]] <- train_list[[i]][sample(nrow(train_list[[i]])),
                ]

            } else if (resampling == "rus") {
                sampids.train <- sample(x = which(train_list[[i]]$y == "No"), size = length(which(train_list[[i]]$y ==
                                                                                                      "Yes")), replace = FALSE)

                train_list[[i]] <- rbind.data.frame(train_list[[i]][which(train_list[[i]]$y ==
                                                                              "Yes"), ], train_list[[i]][sampids.train, ])

                # Randomly shuffle the data
                train_list[[i]] <- train_list[[i]][sample(nrow(train_list[[i]])),
                ]

            # } else if (resampling == "smote") {
            #     train_list[[i]] <- SMOTE(y ~ ., data = train_list[[i]], perc.over = 100,
            #                              perc.under = 200)
            # 
            #     # Randomly shuffle the data
            #     train_list[[i]] <- train_list[[i]][sample(nrow(train_list[[i]])),
            #     ]
            } else {
                train_list[[i]] = train_list[[i]]
            }

        }

        train_list <- do.call("rbind.data.frame", train_list)
        test_list <- do.call("rbind.data.frame", test_list)
    } else {
        # you are training on one set of chr(s) and are predicting on another set of
        # chr(s) first build training set on the trainCHR, implement resampling technique
        # then build testing set from the predictCHR, with no resampling technique

        # building training set
        bounds.GR.flank <- flank(bounds.GR[which(as.character(seqnames(bounds.GR)) %in%
                                                     tolower(trainCHR))], width = (resolution/2), both = TRUE)

        data_mat_list <- list()
        outcome_list <- list()
        train_list <- list()

        for (i in seq_len(length(trainCHR))) {
            start <- (resolution/2)
            chrLength <- centromeres$length[centromeres$chrom %in% trainCHR][i]
            centromereStart <- as.integer(centromeres$centromerStart[centromeres$chrom==trainCHR[i]])
            centromereEnd <- as.integer(centromeres$centromerEnd[centromeres$chrom==trainCHR[i]])
            end <- chrLength - (chrLength %% resolution) + resolution/2

            data_mat_list[[i]] <- matrix(nrow=length(seq(start,end-1,resolution)[-which(seq(start,end-1,resolution) %in% c(centromereStart:centromereEnd))]),
                                         ncol=length(genomicElements.GR))

            dat_mat_gr <- GRanges(seqnames = tolower(trainCHR[i]),
                                  IRanges(start = seq(start,end-1,resolution)[-which(seq(start,end-1,resolution) %in% c(centromereStart:centromereEnd))],
                                          width=resolution))

            if (featureType == "distance") {
                # for use of distance type features
                dat_mat_gr <- GRanges(seqnames = tolower(trainCHR[i]), IRanges(start = (start(dat_mat_gr) +
                                                                                            end(dat_mat_gr))/2, width = 1))
                for (k in seq_len(length(genomicElements.GR))) {
                    d <- distance_func(dat_mat_gr, genomicElements.GR[[k]])
                    data_mat_list[[i]][, k] <- d
                }
                for (h in seq_len(ncol(data_mat_list[[i]]))) {
                    data_mat_list[[i]][, h] <- log(data_mat_list[[i]][, h] + 1, base = 2)
                }
            } else if (featureType == "binary") {
                for (k in seq_len(length(genomicElements.GR))) {
                    cb <- binary_func(dat_mat_gr, genomicElements.GR[[k]])
                    data_mat_list[[i]][, k] <- cb
                }
            } else if (featureType == "oc") {
                for (k in seq_len(length(genomicElements.GR))) {
                    co <- count_func(dat_mat_gr, genomicElements.GR[[k]])
                    data_mat_list[[i]][, k] <- co
                }
            } else if (featureType == "op") {
                for (k in seq_len(length(genomicElements.GR))) {
                    cp <- percent_func(dat_mat_gr, genomicElements.GR[[k]])
                    data_mat_list[[i]][, k] <- cp
                }
            }

            outcome_list[[i]] <- countOverlaps(GRanges(seqnames = tolower(trainCHR[i]),
                                                       IRanges(start = seq(start,end-1,resolution)[-which(seq(start,end-1,resolution) %in% c(centromereStart:centromereEnd))],
                                                               width = resolution)), bounds.GR.flank)

            outcome_list[[i]] <- ifelse(outcome_list[[i]] >= 1, 1, 0)

            train_list[[i]] <- cbind.data.frame(outcome_list[[i]], as.matrix(data_mat_list[[i]]))
            names(train_list[[i]]) <- c("y", names(genomicElements.GR))
            train_list[[i]]$y <- factor(train_list[[i]]$y)
            levels(train_list[[i]]$y) <- c("No", "Yes")

            if (resampling == "ros") {
                # assign sample indices
                sampids.train <- sample(x = which(train_list[[i]]$y == "Yes"), size = length(which(train_list[[i]]$y ==
                                                                                                       "No")), replace = TRUE)

                train_list[[i]] <- rbind.data.frame(train_list[[i]][which(train_list[[i]]$y ==
                                                                              "No"), ], train_list[[i]][sampids.train, ])

                # Randomly shuffle the data
                train_list[[i]] <- train_list[[i]][sample(nrow(train_list[[i]])),
                ]

            } else if (resampling == "rus") {
                sampids.train <- sample(x = which(train_list[[i]]$y == "No"), size = length(which(train_list[[i]]$y ==
                                                                                                      "Yes")), replace = FALSE)

                train_list[[i]] <- rbind.data.frame(train_list[[i]][which(train_list[[i]]$y ==
                                                                              "Yes"), ], train_list[[i]][sampids.train, ])

                # Randomly shuffle the data
                train_list[[i]] <- train_list[[i]][sample(nrow(train_list[[i]])),
                ]

            # } else if (resampling == "smote") {
            #     train_list[[i]] <- SMOTE(y ~ ., data = train_list[[i]], perc.over = 100,
            #                              perc.under = 200)
            # 
            #     # Randomly shuffle the data
            #     train_list[[i]] <- train_list[[i]][sample(nrow(train_list[[i]])),
            #     ]
            } else {
                train_list[[i]] = train_list[[i]]
            }


        }

        train_list <- do.call("rbind.data.frame", train_list)

        # building testing set
        bounds.GR.flank.pred <- flank(bounds.GR[which(as.character(seqnames(bounds.GR)) %in%
                                                          tolower(predictCHR))], width = (resolution/2), both = TRUE)

        data_mat_list <- list()
        outcome_list <- list()
        test_list <- list()

        for (i in seq_len(length(predictCHR))) {
            start <- (resolution/2)
            chrLength <- centromeres$length[centromeres$chrom %in% predictCHR][i]
            centromereStart <- as.integer(centromeres$centromerStart[centromeres$chrom==predictCHR[i]])
            centromereEnd <- as.integer(centromeres$centromerEnd[centromeres$chrom==predictCHR[i]])
            end <- chrLength - (chrLength %% resolution) + resolution/2

            data_mat_list[[i]] <- matrix(nrow=length(seq(start,end-1,resolution)[-which(seq(start,end-1,resolution) %in% c(centromereStart:centromereEnd))]),
                                         ncol=length(genomicElements.GR))

            dat_mat_gr <- GRanges(seqnames = tolower(predictCHR[i]),
                                  IRanges(start = seq(start,end-1,resolution)[-which(seq(start,end-1,resolution) %in% c(centromereStart:centromereEnd))],
                                          width=resolution))

            if (featureType == "distance") {
                # for use of distance type features
                dat_mat_gr <- GRanges(seqnames = tolower(predictCHR[i]), IRanges(start = (start(dat_mat_gr) +
                                                                                              end(dat_mat_gr))/2, width = 1))
                for (k in seq_len(length(genomicElements.GR))) {
                    d <- distance_func(dat_mat_gr, genomicElements.GR[[k]])
                    data_mat_list[[i]][, k] <- d
                }
                for (h in seq_len(ncol(data_mat_list[[i]]))) {
                    data_mat_list[[i]][, h] <- log(data_mat_list[[i]][, h] + 1, base = 2)
                }
            } else if (featureType == "binary") {
                for (k in seq_len(length(genomicElements.GR))) {
                    cb <- binary_func(dat_mat_gr, genomicElements.GR[[k]])
                    data_mat_list[[i]][, k] <- cb
                }
            } else if (featureType == "oc") {
                for (k in seq_len(length(genomicElements.GR))) {
                    co <- count_func(dat_mat_gr, genomicElements.GR[[k]])
                    data_mat_list[[i]][, k] <- co
                }
            } else if (featureType == "op") {
                for (k in seq_len(length(genomicElements.GR))) {
                    cp <- percent_func(dat_mat_gr, genomicElements.GR[[k]])
                    data_mat_list[[i]][, k] <- cp
                }
            }

            outcome_list[[i]] <- countOverlaps(GRanges(seqnames = tolower(predictCHR[i]),
                                                       IRanges(start = seq(start,end-1,resolution)[-which(seq(start,end-1,resolution) %in% c(centromereStart:centromereEnd))],
                                                               width = resolution)), bounds.GR.flank.pred)
            outcome_list[[i]] <- ifelse(outcome_list[[i]] >= 1, 1, 0)

            test_list[[i]] <- cbind.data.frame(outcome_list[[i]], as.matrix(data_mat_list[[i]]))
            names(test_list[[i]]) <- c("y", names(genomicElements.GR))
            test_list[[i]]$y <- factor(test_list[[i]]$y)
            levels(test_list[[i]]$y) <- c("No", "Yes")

        }

        test_list <- do.call("rbind.data.frame", test_list)

    }

    if (is.null(predictCHR)) {
        TADdataList <- list(train_list, NA)
    } else {
        TADdataList <- list(train_list, test_list)
    }

    return(TADdataList)
}
stilianoudakis/preciseTAD documentation built on Sept. 23, 2021, 9:36 p.m.