Description Usage Format Source Examples
The miRmine dataset, from Panwar et al. (2017), "miRmine: A Database of Human miRNA Expression".
data("miRmine")
miRmine
A RangedSummarizedExperiment object.
For all the details on how this dataset was produced, see examples.
Panwar et al. (2017), "miRmine: A Database of Human miRNA Expression".
## Not run:
library(GenomicRanges)
library(rtracklayer)
library(SummarizedExperiment)
library(Biostrings)
# Locate the raw input files shipped in the package's "extdata" directory.
# mustWork = TRUE makes a missing package/directory fail here with a clear
# error instead of silently yielding "" and bogus paths such as "/hsa.gff3".
ext.data <- system.file("extdata", package = "miRmine", mustWork = TRUE)
hsa.gff3.file <- file.path(ext.data, "hsa.gff3")
mature.fa.file <- file.path(ext.data, "mature.fa")
miRmine.info.file <- file.path(ext.data, "miRmine-info.txt")
miRmine.tissues.file <- file.path(ext.data, "miRmine-tissues.csv")
miRmine.cell.lines.file <- file.path(ext.data, "miRmine-cell-lines.csv")
# Import the GFF3 annotation and tag every record with its source.
gffRangedData.all <- import.gff3(hsa.gff3.file, genome = "GRCh38")
gffRangedData.all$source <- "miRBase v21"

# Build UniqueName for every record:
#   * records with a Derives_from attribute get
#     "<Name>.<Name of the record whose ID equals Derives_from>";
#   * all other records keep their plain Name.
# The parent lookup is done once with match() instead of subsetting the
# GRanges and rescanning all IDs for every single record.
record.names <- as.character(gffRangedData.all$Name)
parent.ids <- as.character(gffRangedData.all$Derives_from)
has.parent <- !is.na(parent.ids)
parent.names <-
  record.names[match(parent.ids, as.character(gffRangedData.all$ID))]
# A Derives_from value without a matching ID contributes an empty suffix
# (i.e. "<Name>."), which is what the per-record lookup produced.
parent.names[is.na(parent.names)] <- ""
unique.names <- record.names
unique.names[has.parent] <-
  paste(record.names[has.parent], parent.names[has.parent], sep = ".")
gffRangedData.all$UniqueName <- unique.names

# Keep only the records whose type is "miRNA" and order them by UniqueName.
gff <- gffRangedData.all[gffRangedData.all$type == "miRNA"]
gff <- sort(gff, by = ~UniqueName)
# Tissue table: key each row by "<mature ID>.<precursor ID>", order the rows
# by that key, and record the keys that have no counterpart in the annotation.
tiss <- read.csv(miRmine.tissues.file)
tiss[["UniqueName"]] <-
  with(tiss, paste(Mature.miRNA.ID, Precursor.miRNA.ID, sep = "."))
tiss <- tiss[base::order(tiss[["UniqueName"]]), ]
diff.names <- setdiff(tiss[["UniqueName"]], gff$UniqueName) # 7 rows differ

# Cell-line table: same keying and ordering as the tissue table.
cellines <- read.csv(miRmine.cell.lines.file)
cellines[["UniqueName"]] <-
  with(cellines, paste(Mature.miRNA.ID, Precursor.miRNA.ID, sep = "."))
cellines <- cellines[base::order(cellines[["UniqueName"]]), ]
setdiff(cellines[["UniqueName"]], gff$UniqueName) # same 7 rows differ
# Diagnostics: tabulate how often each key occurs in the tissue table and in
# the annotation, then inspect the keys that occur more than once.
tissue.mirnas.freq <- base::sort(table(tiss$UniqueName))
gff.mirnas.freq <- base::sort(table(gff$UniqueName))
setdiff(tissue.mirnas.freq, gff.mirnas.freq) # additional 2 rows duplicated
tissue.mirnas.freq[tissue.mirnas.freq > 1] # shows which rows are different
is.duplicated.key <- tiss$UniqueName %in%
  c("hsa-miR-3142.hsa-mir-3142", "hsa-miR-4487.hsa-mir-4487")
base::rownames(tiss[is.duplicated.key, ])

# Drop one copy of each duplicated key, then every row whose key is missing
# from the annotation.
# NOTE(review): -c(624, 1213) removes rows by *position* in the current row
# order of each table, not by row name; this presumably matches the inspection
# above only while the ordering of the tables is reproduced exactly -- confirm.
tiss <- tiss[-c(624, 1213), ]
tiss <- tiss[is.na(match(tiss$UniqueName, diff.names)), ]
cellines <- cellines[-c(624, 1213), ]
cellines <- cellines[is.na(match(cellines$UniqueName, diff.names)), ]
# Assemble the expression matrix: all sample columns of the tissue table
# followed by all sample columns of the cell-line table, one row per key.
mirnas.unique.names <- tiss$UniqueName

# The two tables are bound column-wise, so their rows must describe the same
# keys in the same order; fail loudly rather than mislabel rows.
stopifnot(identical(tiss$UniqueName, cellines$UniqueName))

# Identifier columns are excluded with a logical mask. Unlike -which(...),
# the mask keeps every column (instead of none) if no identifier column is
# present, and drop = FALSE preserves the data-frame shape for one column.
id.columns <- c("UniqueName", "Mature.miRNA.ID", "Precursor.miRNA.ID")
tiss.counts <- tiss[, !(names(tiss) %in% id.columns), drop = FALSE]
cellines.counts <-
  cellines[, !(names(cellines) %in% id.columns), drop = FALSE]

expression <- as.matrix(cbind(tiss.counts, cellines.counts))
rownames(expression) <- mirnas.unique.names
# add mirna sequences
library(Rsamtools)
fasta <- FaFile(mature.fa.file)
mirna.string.set <- scanFa(fasta)

# Rename each sequence to the first space-separated token of its header.
# Taking the first token per header (rather than every fifth token of the
# flattened split) does not depend on how many words a header contains.
newnames <- strsplit(names(mirna.string.set), " ")
newnames <- vapply(newnames, function(tokens) tokens[1], character(1))
names(mirna.string.set) <- newnames

# One sequence per annotation record, looked up by Name and stored under the
# record's UniqueName; built in one pass instead of growing a list in a loop.
dna.strings <- lapply(gff$Name, function(name) mirna.string.set[[name]])
names(dna.strings) <- gff$UniqueName
gff$mirna_seq <- dna.strings
# construct RSE
# Sample metadata is read from a tab-separated file and used as colData; the
# expression matrix is the single "counts" assay and the annotation supplies
# the row ranges.
meta <- read.delim(miRmine.info.file)
miRmine <- SummarizedExperiment(
  assays = SimpleList(counts = expression),
  rowData = NULL,
  rowRanges = gff,
  colData = meta
)
## End(Not run)
|
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.