# This function combines a set of splicing junction files generated by STAR and write out a combined version of splicing junctions in a single file
# The combined version contains only junction sites with unique location/strand. The total number of mapped reads and the maximal overhang bases of all samples will be used in the combined file
# It also supports filtering of junction sites via:
## Whether the site has canonical splicing motifs
## Whether the site was previously annotated
## The total number of reads from all samples uniquely mapped to the sites
## The number of samples reported this site. Whether a sample reported the sites will be determined by:
### The number of reads uniquely mapped to the site
### The number of overhang bases
CombineStarSj<-function(fn.sj, output='./combined_SJ.out.tab', canonical.only=FALSE, unannotated.only=TRUE, min.unique=3, min.overhang=6, min.sample=2, min.unique.total=max(10, length(fn.sj)/2)) {
# fn.sj A vector of file names of STAR output of splicing junctions; with 9 columns and number rows equal to number of junctions
# output Path and name of output file
# canonical.only Include junctions with canonical motifs in the output only if TRUE
# unannotated.only Include previously unannotated junctions in the output only if TRUE
# min.sample Minimal number of samples within which the junction was detected
# min.unique Minimal number of reads uniquely mapped to the junction in each sample within which the junction was detected
# min.unique.total Total number of reads uniquely mapped to the junction in all samples
# min.overhang Minimal number of overhang bases of the samples within which the junction was detected
tbls<-lapply(fn.sj, function(f) {
t<-read.table(f, stringsAsFactors = FALSE);
rownames(t)<-paste(t[, 1], t[, 2], t[, 3], t[, 4]);
t;
});
# remove sites with non-canonical junction motifs
if (canonical.only) tbls<-lapply(tbls, function(t) t[t[, 5]>0, ]);
if (unannotated.only) tbls<-lapply(tbls, function(t) t[t[, 6]==0, ]);
# Unique IDs
ids.all<-lapply(tbls, rownames);
ids.all<-unlist(ids.all, use.names=FALSE);
ids<-sort(unique(ids.all));
# Canonical column
canon<-lapply(tbls, function(t) t[, 5]);
canon<-unlist(canon, use.names=FALSE);
names(canon)<-ids.all;
canon<-as.vector(canon[ids]);
# Annotated column
annot<-lapply(tbls, function(t) t[, 6]);
annot<-unlist(annot, use.names=FALSE);
names(annot)<-ids.all;
annot<-as.vector(annot[ids]);
############################################################
# an empty vector for place holding
ev<-rep(0, length(ids));
names(ev)<-ids;
# Number of uniquely mapped reads
nuni<-sapply(tbls, function(t) {
ev[rownames(t)]<-t[, 7];
ev;
});
ttl<-rowSums(nuni);
# Number of multiply mapped reads
nmul<-sapply(tbls, function(t) {
ev[rownames(t)]<-t[, 8];
ev;
});
ttl1<-rowSums(nmul);
# Hangover bases
ovrh<-sapply(tbls, function(t) {
ev[rownames(t)]<-t[, 9];
ev;
});
max.ovrh<-apply(ovrh, 1, max);
colnames(nuni)<-colnames(nmul)<-colnames(ovrh)<-names(fn.sj);
# Number of qualified samples
cnt<-rowSums(sapply(1:ncol(nuni), function(i) nuni[,i]>=min.unique & ovrh[,i]>=min.overhang));
flg<-ttl>=min.unique.total & cnt>=min.sample;
out<-cbind(canon[flg], annot[flg], ttl[flg], ttl1[flg], max.ovrh[flg]);
loc<-strsplit(rownames(out), ' ');
loc<-do.call('rbind', loc);
out<-data.frame(loc, out, stringsAsFactors = FALSE);
write.table(out, output, sep='\t', qu=FALSE, row.names=FALSE, col.names=FALSE);
list(summary=out, unique=nuni[rownames(out), , drop=FALSE], multiple=nmul[rownames(out), , drop=FALSE], overhang=ovrh[rownames(out), , drop=FALSE]);
}
# Summarize a set of SJ.out.tab files generated by STAR.
SummarizeStarSj<-function(sites, set.names=names(sites), compare.pairs=TRUE, canonical.only=FALSE, annotated.only=FALSE,
min.unique=0, max.unique=Inf, min.multiple=0, max.multiple=Inf, min.combined=0, max.combined=Inf, min.overhang=0, max.overhang=Inf) {
# sites Names of a set of SJ.out.tab files generated by STAR; or a set of GRanges objects parsed by the LoadStarSj function
# set.names Names given to the sets
# compare.pairs If TRUE, compare the sets next to each other
# canonical.only=FALSE, annotated.only=FALSE
# min.unique=0, max.unique=Inf
# min.multiple=0, max.multiple=Inf
# min.combined=0, max.combined=Inf
# min.overhang=0, max.overhang=Inf Filtering strategy
if (class(sites) == 'character') sites<-lapply(sites, LoadStarSj);
motif<-c('NONE', 'GTAG', 'CTAC', 'GCAG', 'CTGC', 'ATAC', 'GTAT');
# Filter junction sites
if (canonical.only) sites<-lapply(sites, function(s) s[s$motif != 'NONE']); # Junctions with canonical junction bases or not
if (annotated.only) sites<-lapply(sites, function(s) s[s$annotated == 1]); # Junctions in the known junction sites only
sites<-lapply(sites, function(s) s[s$n_unique>=min.unique & s$n_unique<=max.unique]);
sites<-lapply(sites, function(s) s[s$n_multiple>=min.multiple & s$n_multiple<=max.multiple]);
sites<-lapply(sites, function(s) s[(s$n_unique+s$n_multiple)>=min.combined & (s$n_unique+s$n_multiple)<=max.combined]);
sites<-lapply(sites, function(s) s[s$overhang>=min.overhang & s$overhang<=max.overhang]);
# Sumamry stats
stat<-cbind(
Total_Unique_Reads=sapply(sites, function(s) sum(s$n_unique)),
Total_Site=sapply(sites, length),
Motif=t(sapply(sites, function(s) table(s$motif)[motif])),
Annotated=sapply(sites, function(s) length(s[s$annotated==1]))
)
# Compare 2 sets next to each other
if (compare.pairs) {
comp<-lapply(1:(length(sites)-1), function(i) CompareStarSj(sites[[i]], sites[[i+1]]));
added<-c(NA, sapply(comp, function(c) length(c[[1]])));
removed<-c(NA, sapply(comp, function(c) length(c[[2]])));
corr<-sapply(comp, function(c) c[[3]]);
corr<-rbind(c(NA, NA, NA), t(corr));
colnames(corr)<-paste('Correlation', colnames(corr), sep='_');
stat<-cbind(stat, Added=added, Removed=removed, corr);
} else comp<-list();
rownames(stat)<-set.names;
list(summary=stat, comparison=comp);
}
# Compare 2 sets of splicing junctions
CompareStarSj<-function(gr1, gr2) {
# gr1, gr2 Two sets of splicing junctions as GRanges generated by the LoadStarSj function
ol<-as.matrix(findOverlaps(gr1, gr2, type='equal'));
g1<-gr1[unique(ol[,1])];
g2<-gr2[unique(ol[,2])];
n<-cbind(g1$n_unique, g2$n_unique, g1$n_multiple, g2$n_multiple);
logN<-log10(n+1);
corr<-c(Unique=cor(logN[,1], logN[,2]), Multiple=cor(logN[,3], logN[,4]), Combined=cor(logN[,1]+logN[,3], logN[,2]+logN[,4]));
if (nrow(ol) == 0) added<-gr2 else added<-gr2[-unique(ol[,2])];
if (nrow(ol) == 0) removed<-gr1 else removed<-gr1[-unique(ol[,1])];
list(Added=added, Removed=removed, Correlation=corr);
}
# Merge a set of splicing files generated by STAR into one file
MergeStarSj<-function(fns.in, fn.out) {
# fns.in Original STAR junction file names
# fn.out Output file name
grs<-lapply(fns.in, LoadStarSj)
dfs<-lapply(grs, function(g) as.data.frame(g, stringsAsFactors = FALSE));
ids<-unlist(lapply(dfs, rownames), use.names=FALSE);
chr<-unlist(lapply(dfs, function(d) d[, 1]), use.names=FALSE);
stt<-unlist(lapply(dfs, function(d) d[, 2]), use.names=FALSE);
end<-unlist(lapply(dfs, function(d) d[, 3]), use.names=FALSE);
str<-unlist(lapply(dfs, function(d) d[, 5]), use.names=FALSE);
mtf<-unlist(lapply(dfs, function(d) d[, 6]), use.names=FALSE);
ann<-unlist(lapply(dfs, function(d) d[, 7]), use.names=FALSE);
mp<-0:6;
names(mp)<-c('NONE', 'GTAG', 'CTAC', 'GCAG', 'CTGC', 'ATAC', 'GTAT');
mtf<-mp[mtf];
mp<-0:2;
names(mp)<-c('*', '+', '-');
str<-mp[str];
names(chr)<-names(stt)<-names(end)<-names(str)<-names(mtf)<-names(ann)<-ids;
id<-sort(unique(ids));
tbl<-data.frame(chr[id], stt[id], end[id], str[id], mtf[id], ann[id]);
mtx<-matrix(nr=length(id), nc=length(grs), dimnames=list(id, 1:length(grs)));
mtxs<-lapply(8:10, function(ind) {
for (i in 1:length(grs)) mtx[rownames(dfs[[i]]), i]<-dfs[[i]][, ind];
mtx;
});
uni<-rowSums(mtxs[[1]], na.rm=TRUE);
mul<-rowSums(mtxs[[2]], na.rm=TRUE);
ovh<-apply(mtxs[[3]], 1, max);
tbl<-cbind(tbl, uni, mul, ovh);
colnames(tbl)<-colnames(dfs[[1]])[-4];
write.table(tbl, fn.out, row.names=FALSE, col.names=FALSE, sep='\t');
tbl;
}
# Load a splicing function file generated by STAR and parse it into a GRanges object
LoadStarSj<-function(fn) {
# fn Name of the SJ.out.tab file created by STAR
cat('Loading junction sites:', fn, '\n');
library(GenomicRanges);
sj<-read.table(fn, sep='\t', stringsAsFactors=FALSE);
gr<-GRanges(sj[[1]], IRanges(sj[[2]], sj[[3]]), c('*', '+', '-')[sj[[4]]+1]);
gr$motif<-c('NONE', 'GTAG', 'CTAC', 'GCAG', 'CTGC', 'ATAC', 'GTAT')[sj[[5]]+1];
gr$annotated<-sj[[6]];
gr$n_unique<-sj[[7]];
gr$n_multiple<-sj[[8]];
gr$overhang<-sj[[9]];
chr<-as.vector(seqnames(gr));
chr<-sub('^chr', '', chr, ignore.case = TRUE);
chr[chr %in% as.character(1:9)]<-paste('0', chr[chr %in% as.character(1:9)], sep='');
stt<-start(gr);
end<-end(gr);
str<-as.vector(strand(gr));
str<-c('+'='f', '-'='r', '*'='n')[str];
names(gr)<-paste(chr, stt, end, str, sep='_');
gr<-gr[order(chr, stt)];
gr;
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.