miRmine: miRmine dataset

Description Usage Format Source Examples

Description

The miRmine dataset, from Panwar et al. (2017), "miRmine: A Database of Human miRNA Expression".

Usage

data("miRmine")

Format

miRmine: a RangedSummarizedExperiment object.

For full details on how this dataset was produced, see the code in the Examples section below.

Source

Panwar et al. (2017), "miRmine: A Database of Human miRNA Expression".

Examples

## Not run: 
library(GenomicRanges)
library(rtracklayer)
library(SummarizedExperiment)
library(Biostrings)

# Resolve the raw input files shipped in the package's "extdata" directory.
ext.data <- system.file("extdata", package = "miRmine")
hsa.gff3.file <- file.path(ext.data, "hsa.gff3")
mature.fa.file <- file.path(ext.data, "mature.fa")
miRmine.info.file <- file.path(ext.data, "miRmine-info.txt")
miRmine.tissues.file <- file.path(ext.data, "miRmine-tissues.csv")
miRmine.cell.lines.file <- file.path(ext.data, "miRmine-cell-lines.csv")

# Import the miRBase annotation (GFF3) and tag it with the GRCh38 genome.
gffRangedData.all <- import.gff3(hsa.gff3.file, genome = "GRCh38")
gffRangedData.all$source <- "miRBase v21"

# Build a UniqueName for every feature. Features without a Derives_from
# value keep their Name; features that derive from another feature get
# "<Name>.<Name of the feature whose ID equals Derives_from>".
# The lookup is done once with match() instead of re-subsetting the whole
# GRanges object for every row.
gffRangedData.all$UniqueName <- gffRangedData.all$Name
has.precursor <- !is.na(gffRangedData.all$Derives_from)
precursor.idx <- match(
    gffRangedData.all$Derives_from[has.precursor],
    gffRangedData.all$ID
)
# Every Derives_from value must point at an ID present in the same file;
# fail loudly rather than emit a malformed name.
stopifnot(!anyNA(precursor.idx))
gffRangedData.all$UniqueName[has.precursor] <- paste(
    gffRangedData.all$Name[has.precursor],
    gffRangedData.all$Name[precursor.idx],
    sep = "."
)

# Keep only the features of type "miRNA" and order them by UniqueName.
gff <- gffRangedData.all[gffRangedData.all$type == "miRNA"]
gff <- sort(gff, by = ~UniqueName)

# Read the tissue expression table, key each row by
# "<Mature.miRNA.ID>.<Precursor.miRNA.ID>", and order the rows by that key.
tiss = read.csv(miRmine.tissues.file)
tiss$UniqueName =
    paste(tiss$Mature.miRNA.ID, tiss$Precursor.miRNA.ID, sep=".")
tiss = tiss[base::order(tiss$UniqueName), ]

# Keys present in the tissue table but absent from the annotation.
diff.names = setdiff(tiss$UniqueName, gff$UniqueName) # 7 rows differ

# Same preparation for the cell-line expression table.
cellines = read.csv(miRmine.cell.lines.file)
cellines$UniqueName =
    paste(cellines$Mature.miRNA.ID, cellines$Precursor.miRNA.ID, sep=".")
cellines = cellines[base::order(cellines$UniqueName), ]

# Printed for inspection only; the result is not stored.
setdiff(cellines$UniqueName, gff$UniqueName) # same 7 rows differ

# Count how often each key occurs in the tissue table and in the annotation.
tissue.mirnas.freq = base::sort(table(tiss$UniqueName))
gff.mirnas.freq = base::sort(table(gff$UniqueName))
# NOTE(review): setdiff() on two tables appears to compare the count values
# rather than the miRNA names -- confirm this is the intended check.
setdiff(tissue.mirnas.freq, gff.mirnas.freq) # additional 2 rows duplicated
tissue.mirnas.freq[tissue.mirnas.freq > 1] # shows which rows are different

# Print the row names of the tissue rows carrying the two duplicated keys.
base::rownames(
    tiss[(tiss$UniqueName %in%
        c('hsa-miR-3142.hsa-mir-3142','hsa-miR-4487.hsa-mir-4487')),])

# Drop rows 624 and 1213 from both tables, then drop every row whose key is
# missing from the annotation (diff.names).
# NOTE(review): negative numeric indices select by position in the re-ordered
# data frame, while rownames() above reports row labels -- confirm that
# positions 624 and 1213 are the intended duplicate rows, and that `cellines`
# holds them at the same positions as `tiss`.
tiss = tiss[-c(624, 1213),]
tiss = tiss[!(tiss$UniqueName %in% diff.names), ]
cellines = cellines[-c(624, 1213),]
cellines = cellines[!(cellines$UniqueName %in% diff.names), ]

# Assemble the expression matrix: one row per miRNA (named by the tissue
# table's UniqueName), tissue sample columns followed by cell-line sample
# columns. The identifier columns are excluded from both tables first.
id.columns <- c("UniqueName", "Mature.miRNA.ID", "Precursor.miRNA.ID")
mirnas.unique.names <- tiss$UniqueName
tiss.counts <- tiss[, !(names(tiss) %in% id.columns)]
cellines.counts <- cellines[, !(names(cellines) %in% id.columns)]
expression <- as.matrix(cbind(tiss.counts, cellines.counts))
rownames(expression) <- mirnas.unique.names

# add mirna sequences
library(Rsamtools)
fasta <- FaFile(mature.fa.file)
mirna.string.set <- scanFa(fasta)

# Keep only the first space-separated field of each FASTA header as the
# sequence name. Taking the first field per header (instead of every fifth
# element of the flattened split) does not depend on each header having
# exactly five fields.
newnames <- vapply(
    strsplit(names(mirna.string.set), " "),
    function(fields) fields[1],
    character(1),
    USE.NAMES = FALSE
)
names(mirna.string.set) <- newnames

# Look up one sequence per annotated miRNA by Name and label it with the
# miRNA's UniqueName. Building the list in a single pass yields exactly one
# entry per row of `gff` and avoids subsetting the GRanges object on every
# iteration.
dna.strings <- lapply(gff$Name, function(name) mirna.string.set[[name]])
names(dna.strings) <- gff$UniqueName
gff$mirna_seq <- dna.strings

# construct RSE
# Sample metadata is read from the tab-separated info file and used as
# colData; the sorted miRNA annotation supplies the row ranges.
meta <- read.delim(miRmine.info.file)
assay.list <- SimpleList(counts = expression)

miRmine <- SummarizedExperiment(
    assays = assay.list,
    rowData = NULL,
    rowRanges = gff,
    colData = meta
)

## End(Not run)

duxan/miRmine documentation built on May 6, 2019, 3:30 p.m.