inst/unitTests/test_makeTxDbFromGRanges.R

###

format_txdb_dump <- GenomicFeatures:::.format_txdb_dump

test_makeTxDbFromGRanges_empty <- function()
{
    ## The GFF3 way
    gr1 <- GRanges(type=character(0), ID=character(0))
    txdb1 <- makeTxDbFromGRanges(gr1)
    checkTrue(validObject(txdb1))
    checkTrue(all(elementNROWS(as.list(txdb1)) == 0L))
    checkIdentical(0L, length(seqinfo(txdb1)))
    checkIdentical(NA_character_, organism(txdb1))

    ## The GTF way
    gr2 <- GRanges(type=character(0),
                   gene_id=character(0), transcript_id=character(0))
    txdb2 <- makeTxDbFromGRanges(gr2)
    checkIdentical(as.list(txdb1), as.list(txdb2))
}

test_makeTxDbFromGRanges_no_exons_no_cds <- function()
{
    ## 2 transcripts linked to a gene + 2 orphan transcripts
    gene_ID    <- "gene1"
    gene_start <-     101
    gene_end   <-     160

    tx_ID      <- c( "tx1a", "tx2",  "tx1b", "tx3")
    tx_Parent  <- c("gene1",    NA, "gene1",    NA)
    tx_start   <- c(    151,   201,     101,     5)
    tx_end     <- c(    160,   250,     120,    35)

    gene_gr <- GRanges("chr1", IRanges(gene_start, gene_end), "+",
                       type="gene", ID=gene_ID, Parent=NA)
    tx_gr   <- GRanges("chr1", IRanges(tx_start, tx_end), "+",
                       type="mRNA", ID=tx_ID, Parent=tx_Parent)
    gr <- c(gene_gr, tx_gr)

    target_transcripts <- data.frame(
        tx_id=1:4,
        tx_name=c("tx3", "tx1b", "tx1a", "tx2"),
        tx_type="mRNA",
        tx_chrom="chr1",
        tx_strand="+",
        tx_start=c(5, 101, 151, 201),
        tx_end=c(35, 120, 160, 250)
    )
    target_splicings <- data.frame(
        tx_id=target_transcripts$tx_id,
        exon_rank=1,
        exon_id=1:4,
        exon_name=target_transcripts$tx_name,
        exon_chrom=target_transcripts$tx_chrom,
        exon_strand=target_transcripts$tx_strand,
        exon_start=target_transcripts$tx_start,
        exon_end=target_transcripts$tx_end
    )
    target_genes <- data.frame(
        tx_id=2:3,
        gene_id="gene1"
    )
    target_chrominfo <- data.frame(
        chrom="chr1",
        length=NA,
        is_circular=NA
    )
    target_txdb_dump <- format_txdb_dump(transcripts=target_transcripts,
                                         splicings=target_splicings,
                                         genes=target_genes,
                                         chrominfo=target_chrominfo)

    current_txdb <- makeTxDbFromGRanges(gr)
    checkIdentical(target_txdb_dump, as.list(current_txdb))

    ## make Parent a CharacterList instead of an atomic vector
    mcols(gr)$Parent <- CharacterList(character(0))
    mcols(gr)$Parent[c(2L, 4L)] <- "gene1"
    current_txdb <- makeTxDbFromGRanges(gr)
    checkIdentical(target_txdb_dump, as.list(current_txdb))

    ## 4 orphan transcripts
    gr4 <- gr[mcols(gr)$type != "gene"]  # remove the gene

    target_txdb_dump <- format_txdb_dump(transcripts=target_transcripts,
                                         splicings=target_splicings,
                                         chrominfo=target_chrominfo)

    current_txdb <- makeTxDbFromGRanges(gr4)
    checkIdentical(target_txdb_dump, as.list(current_txdb))

    ## 2 orphan transcripts
    gr2 <- gr[c(3L, 5L)]

    target_transcripts2 <- target_transcripts[c(1,4), ]
    target_transcripts2$tx_id <- 1:2
    target_splicings2 <- target_splicings[c(1,4), ]
    target_splicings2$tx_id <- 1:2
    target_splicings2$exon_id <- 1:2

    target_txdb_dump <- format_txdb_dump(transcripts=target_transcripts2,
                                         splicings=target_splicings2,
                                         chrominfo=target_chrominfo)

    current_txdb <- makeTxDbFromGRanges(gr2)
    checkIdentical(target_txdb_dump, as.list(current_txdb))

    ## 1 gene with no children
    gr3 <- gr[-c(2L, 4L)]

    current_txdb <- makeTxDbFromGRanges(gr3)
    checkIdentical(target_txdb_dump, as.list(current_txdb))

    ## 0 transcript
    gr0 <- gr[0]

    target_txdb_dump <- format_txdb_dump(chrominfo=target_chrominfo)

    current_txdb <- makeTxDbFromGRanges(gr0)
    checkIdentical(target_txdb_dump, as.list(current_txdb))   
}

test_makeTxDbFromGRanges_with_exons_and_cds <- function()
{
    ## 2 exons for tx1a, 1 exon for tx1b, 0 exon for tx2, 3 exons for tx3
    tx_ID       <- c( "tx1a", "tx1b", "tx2", "tx3")
    tx_start    <- c(    101,    145,   201,     5)
    tx_end      <- c(    160,    160,   250,    35)

    exon_Parent <- rep.int(tx_ID, c(2L, 1L, 0L, 3L))
    exon_start  <- c(101, 145, 145,  5, 18, 31)
    exon_end    <- c(134, 160, 160, 12, 28, 35)

    tx_gr   <- GRanges("chr1", IRanges(tx_start, tx_end), "+",
                       type="mRNA", ID=tx_ID, Parent=NA)
    exon_gr <- GRanges("chr1", IRanges(exon_start, exon_end), "+",
                       type="exon", ID=NA, Parent=exon_Parent)
    gr <- c(tx_gr, exon_gr)

    tx_oo <- c(4L, 1:3)
    target_transcripts <- data.frame(
        tx_id=1:4,
        tx_name=tx_ID[tx_oo],
        tx_type="mRNA",
        tx_chrom="chr1",
        tx_strand="+",
        tx_start=tx_start[tx_oo],
        tx_end=tx_end[tx_oo]
    )
    exon_oo <- c(4:6, 1:3)
    target_splicings <- data.frame(
        tx_id=c(rep(1:3, 3:1), 4),
        exon_rank=c(1:3, 1:2, 1, 1),
        exon_id=c(1:5, 5:6),
        exon_name=c(rep(NA, 6), "tx2"),
        exon_chrom="chr1",
        exon_strand="+",
        exon_start=c(exon_start[exon_oo], 201),
        exon_end=c(exon_end[exon_oo], 250)
    )
    target_chrominfo <- data.frame(
        chrom="chr1",
        length=NA,
        is_circular=NA
    )
    target_txdb_dump <- format_txdb_dump(transcripts=target_transcripts,
                                         splicings=target_splicings,
                                         chrominfo=target_chrominfo)

    current_txdb <- makeTxDbFromGRanges(gr)
    checkIdentical(target_txdb_dump, as.list(current_txdb))

    ## naming exons thru ID
    exon_ID <- paste0("ex", seq_along(exon_gr))
    mcols(exon_gr)$ID <- exon_ID
    gr <- c(tx_gr, exon_gr)

    target_splicings$exon_id <- 1:7
    target_splicings$exon_name <- c(exon_ID[exon_oo], "tx2")
    target_txdb_dump <- format_txdb_dump(transcripts=target_transcripts,
                                         splicings=target_splicings,
                                         chrominfo=target_chrominfo)

    current_txdb <- makeTxDbFromGRanges(gr)
    checkIdentical(target_txdb_dump, as.list(current_txdb))

    ## adding CDS
    cds_Parent  <- rep.int(tx_ID, c(2L, 1L, 0L, 2L))
    cds_start   <- c(122, 145, 149, 21, 31)
    cds_end     <- c(134, 152, 157, 28, 34)

    cds_gr  <- GRanges("chr1", IRanges(cds_start, cds_end), "+",
                       type="CDS", ID=NA, Parent=cds_Parent)
    gr <- c(gr, cds_gr)

    cds_oo <- c(NA, 4:5, 1:3)
    target_splicings <- cbind(
        target_splicings,
        cds_id=c(NA, 1:5, NA),
        cds_start=c(cds_start[cds_oo], NA),
        cds_end=c(cds_end[cds_oo], NA)
    )
    target_txdb_dump <- format_txdb_dump(transcripts=target_transcripts,
                                         splicings=target_splicings,
                                         chrominfo=target_chrominfo)

    current_txdb <- makeTxDbFromGRanges(gr)
    checkIdentical(target_txdb_dump, as.list(current_txdb))

    ## with exons not separated by introns
    exon_end[c(1, 5)] <- c(144, 36)
    end(exon_gr) <- exon_end
    gr <- c(tx_gr, exon_gr)

    tx_oo <- 1:3
    target_transcripts <- data.frame(
        tx_id=1:3,
        tx_name=tx_ID[tx_oo],
        tx_type="mRNA",
        tx_chrom="chr1",
        tx_strand="+",
        tx_start=tx_start[tx_oo],
        tx_end=tx_end[tx_oo]
    )
    exon_oo <- 1:3
    target_splicings <- data.frame(
        tx_id=c(1, 1, 2, 3),
        exon_rank=c(1, 2, 1, 1),
        exon_id=1:4,
        exon_name=c(exon_ID[exon_oo], "tx2"),
        exon_chrom="chr1",
        exon_strand="+",
        exon_start=c(exon_start[exon_oo], 201),
        exon_end=c(exon_end[exon_oo], 250)
    )

    target_txdb_dump <- format_txdb_dump(transcripts=target_transcripts,
                                         splicings=target_splicings,
                                         chrominfo=target_chrominfo)

    current_txdb <- suppressWarnings(makeTxDbFromGRanges(gr))
    checkIdentical(target_txdb_dump, as.list(current_txdb))
}

test_makeTxDbFromGRanges_with_multiple_part_transcripts <- function()
{
    ## 1 gene with 2 transcripts + 1 gene with 1 transcript
    gene_ID     <- c("gene1", "gene2")
    gene_start  <- c(    101,       5)
    gene_end    <- c(    160,      35)

    tx_ID       <- c( "tx1a",  "tx1a",  "tx1b",   "tx2",   "tx2",   "tx2")
    tx_Parent   <- c("gene1", "gene1", "gene1", "gene2", "gene2", "gene2")
    tx_start    <- c(    145,     101,     145,      31,       5,      18)
    tx_end      <- c(    160,     134,     160,      35,      12,      28)

    gene_gr <- GRanges("chr1", IRanges(gene_start, gene_end), "+",
                       type="gene", ID=gene_ID, Parent=NA)
    tx_gr   <- GRanges("chr1", IRanges(tx_start, tx_end), "+",
                       type="mRNA", ID=tx_ID, Parent=tx_Parent)
    gr <- c(gene_gr, tx_gr)

    target_transcripts <- data.frame(
        tx_id=1:3,
        tx_name=c("tx2", "tx1a", "tx1b"),
        tx_type="mRNA",
        tx_chrom="chr1",
        tx_strand="+",
        tx_start=c(5, 101, 145),
        tx_end=c(35, 160, 160)
    )
    target_splicings <- data.frame(
        tx_id=target_transcripts$tx_id,
        exon_rank=1,
        exon_id=1:3,
        exon_name=target_transcripts$tx_name,
        exon_chrom=target_transcripts$tx_chrom,
        exon_strand=target_transcripts$tx_strand,
        exon_start=target_transcripts$tx_start,
        exon_end=target_transcripts$tx_end
    )
    target_genes <- data.frame(
        tx_id=1:3,
        gene_id=rep.int(c("gene2", "gene1"), c(1, 2))
    )
    target_chrominfo <- data.frame(
        chrom="chr1",
        length=NA,
        is_circular=NA
    )
    target_txdb_dump <- format_txdb_dump(transcripts=target_transcripts,
                                         splicings=target_splicings,
                                         genes=target_genes,
                                         chrominfo=target_chrominfo)

    current_txdb <- suppressWarnings(makeTxDbFromGRanges(gr))
    checkIdentical(target_txdb_dump, as.list(current_txdb))

    ## adding 1 exon per transcript part
    exon_gr   <- GRanges("chr1", IRanges(tx_start, tx_end), "+",
                         type="exon", ID=NA, Parent=tx_ID)
    gr <- c(gr, exon_gr)

    exon_oo <- c(5:6, 4, 2, 1, 3)
    target_splicings <- data.frame(
        tx_id=rep.int(1:3, 3:1),
        exon_rank=c(1:3, 1:2, 1),
        exon_id=c(1:5, 5),
        exon_chrom="chr1",
        exon_strand="+",
        exon_start=tx_start[exon_oo],
        exon_end=tx_end[exon_oo]
    )
    target_txdb_dump <- format_txdb_dump(transcripts=target_transcripts,
                                         splicings=target_splicings,
                                         genes=target_genes,
                                         chrominfo=target_chrominfo)

    current_txdb <- suppressWarnings(makeTxDbFromGRanges(gr))
    checkIdentical(target_txdb_dump, as.list(current_txdb))
}

test_makeTxDbFromGRanges_with_exons_and_cds_as_gene_direct_children <-
    function()
{
    ## 1 gene with 2 transcripts + 1 gene with 3 exons as direct children +
    ## 1 gene with 2 CDS as direct children
    gene_ID     <- c("gene1", "gene2", "gene3")
    gene_start  <- c(    101,       5,     325)
    gene_end    <- c(    160,      35,     370)

    tx_ID       <- c( "tx1a",  "tx1b")
    tx_Parent   <- c("gene1", "gene1")
    tx_start    <- c(    101,     145)
    tx_end      <- c(    160,     160)

    exon_ID     <- c( "ex1",  "ex2",  "ex3",   "ex4",   "ex5",   "ex6")
    exon_Parent <- c("tx1a", "tx1a", "tx1b", "gene2", "gene2", "gene2")
    exon_start  <- c(   145,    101,    145,      31,       5,      18)
    exon_end    <- c(   160,    134,    160,      35,      12,      28)

    cds_Parent  <- c("gene3", "gene3")
    cds_start   <- c(    351,     325)
    cds_end     <- c(    370,     340)

    gene_gr <- GRanges("chr1", IRanges(gene_start, gene_end), "+",
                       type="gene", ID=gene_ID, Parent=NA)
    tx_gr   <- GRanges("chr1", IRanges(tx_start, tx_end), "+",
                       type="mRNA", ID=tx_ID, Parent=tx_Parent)
    exon_gr <- GRanges("chr1", IRanges(exon_start, exon_end), "+",
                       type="exon", ID=exon_ID, Parent=exon_Parent)
    cds_gr  <- GRanges("chr1", IRanges(cds_start, cds_end), "+",
                       type="CDS", ID=NA, Parent=cds_Parent)
    gr <- c(gene_gr, tx_gr, exon_gr, cds_gr)

    target_transcripts <- data.frame(
        tx_id=1:4,
        tx_name=c("gene2", "tx1a", "tx1b", "gene3"),
        tx_type=c("gene", "mRNA", "mRNA", "gene"),
        tx_chrom="chr1",
        tx_strand="+",
        tx_start=c(5, 101, 145, 325),
        tx_end=c(35, 160, 160, 370)
    )
    exon_oo <- c(5:6, 4, 2, 1, 3)
    cds_oo <- 2:1
    target_splicings <- data.frame(
        tx_id=rep.int(1:4, c(3:1, 2)),
        exon_rank=c(1:3, 1:2, 1, 1:2),
        exon_id=1:8,
        exon_name=c(exon_ID[exon_oo], NA, NA),
        exon_chrom="chr1",
        exon_strand="+",
        exon_start=c(exon_start[exon_oo], cds_start[cds_oo]),
        exon_end=c(exon_end[exon_oo], cds_end[cds_oo]),
        cds_id=c(rep.int(NA, length(exon_oo)), 1, 2),
        cds_start=c(rep.int(NA, length(exon_oo)), cds_start[cds_oo]),
        cds_end=c(rep.int(NA, length(exon_oo)), cds_end[cds_oo])
    )
    target_genes <- data.frame(
        tx_id=1:4,
        gene_id=rep.int(c("gene2", "gene1", "gene3"), c(1, 2, 1))
    )
    target_chrominfo <- data.frame(
        chrom="chr1",
        length=NA,
        is_circular=NA
    )
    target_txdb_dump <- format_txdb_dump(transcripts=target_transcripts,
                                         splicings=target_splicings,
                                         genes=target_genes,
                                         chrominfo=target_chrominfo)

    current_txdb <- makeTxDbFromGRanges(gr)
    checkIdentical(target_txdb_dump, as.list(current_txdb))

    ## Shuffling the input should not change anything.
    for (i in 1:5) {
        current_txdb <- makeTxDbFromGRanges(sample(gr))
        checkIdentical(target_txdb_dump, as.list(current_txdb))
    }
}
jmacdon/GenomicFeatures documentation built on Jan. 2, 2022, 7:40 a.m.