R/AnalyzeJunction.r

Defines functions CombineStarSj SummarizeStarSj CompareStarSj MergeStarSj LoadStarSj

# This function combines a set of splicing junction files generated by STAR and write out a combined version of splicing junctions in a single file
# The combined version contains only junction sites with unique location/strand. The total number of mapped reads and the maximal overhang bases of all samples will be used in the combined file
# It also supports filtering of junction sites via:
## Whether the site has canonical splicing motifs
## Whether the site was previously annotated
## The total number of reads from all samples uniquely mapped to the sites
## The number of samples reported this site. Whether a sample reported the sites will be determined by:
### The number of reads uniquely mapped to the site
### The number of overhang bases

CombineStarSj<-function(fn.sj, output='./combined_SJ.out.tab', canonical.only=FALSE, unannotated.only=TRUE, min.unique=3, min.overhang=6, min.sample=2, min.unique.total=max(10, length(fn.sj)/2)) {
  # fn.sj             A vector of file names of STAR output of splicing junctions; with 9 columns and number rows equal to number of junctions
  # output            Path and name of output file
  # canonical.only    Include junctions with canonical motifs in the output only if TRUE
  # unannotated.only  Include previously unannotated junctions in the output only if TRUE 
  # min.sample        Minimal number of samples within which the junction was detected
  # min.unique        Minimal number of reads uniquely mapped to the junction in each sample within which the junction was detected
  # min.unique.total  Total number of reads uniquely mapped to the junction in all samples
  # min.overhang      Minimal number of overhang bases of the samples within which the junction was detected
  
  tbls<-lapply(fn.sj, function(f) {
    t<-read.table(f, stringsAsFactors = FALSE);
    rownames(t)<-paste(t[, 1], t[, 2], t[, 3], t[, 4]); 
    t; 
  });
  
  # remove sites with non-canonical junction motifs
  if (canonical.only) tbls<-lapply(tbls, function(t) t[t[, 5]>0, ]); 
  if (unannotated.only) tbls<-lapply(tbls, function(t) t[t[, 6]==0, ]);
  
  # Unique IDs
  ids.all<-lapply(tbls, rownames); 
  ids.all<-unlist(ids.all, use.names=FALSE); 
  ids<-sort(unique(ids.all)); 

  # Canonical column
  canon<-lapply(tbls, function(t) t[, 5]);
  canon<-unlist(canon, use.names=FALSE);
  names(canon)<-ids.all;
  canon<-as.vector(canon[ids]); 
  
  # Annotated column
  annot<-lapply(tbls, function(t) t[, 6]);
  annot<-unlist(annot, use.names=FALSE);
  names(annot)<-ids.all;
  annot<-as.vector(annot[ids]); 

  ############################################################
  # an empty vector for place holding
  ev<-rep(0, length(ids)); 
  names(ev)<-ids;
  
  # Number of uniquely mapped reads
  nuni<-sapply(tbls, function(t) {
    ev[rownames(t)]<-t[, 7];
    ev;
  }); 
  ttl<-rowSums(nuni);

  # Number of multiply mapped reads
  nmul<-sapply(tbls, function(t) {
    ev[rownames(t)]<-t[, 8];
    ev;
  }); 
  ttl1<-rowSums(nmul); 
  
  # Hangover bases
  ovrh<-sapply(tbls, function(t) {
    ev[rownames(t)]<-t[, 9];
    ev;
  }); 
  max.ovrh<-apply(ovrh, 1, max); 
  
  colnames(nuni)<-colnames(nmul)<-colnames(ovrh)<-names(fn.sj); 
  
  # Number of qualified samples
  cnt<-rowSums(sapply(1:ncol(nuni), function(i) nuni[,i]>=min.unique & ovrh[,i]>=min.overhang)); 
  
  flg<-ttl>=min.unique.total & cnt>=min.sample;
  
  out<-cbind(canon[flg], annot[flg], ttl[flg], ttl1[flg], max.ovrh[flg]);
  
  loc<-strsplit(rownames(out), ' ');
  loc<-do.call('rbind', loc); 
  
  out<-data.frame(loc, out, stringsAsFactors = FALSE); 
  
  write.table(out, output, sep='\t', qu=FALSE, row.names=FALSE, col.names=FALSE); 
  
  list(summary=out, unique=nuni[rownames(out), , drop=FALSE], multiple=nmul[rownames(out), , drop=FALSE], overhang=ovrh[rownames(out), , drop=FALSE]); 
}

# Summarize a set of SJ.out.tab files generated by STAR. 
SummarizeStarSj<-function(sites, set.names=names(sites), compare.pairs=TRUE, canonical.only=FALSE, annotated.only=FALSE,
                          min.unique=0, max.unique=Inf, min.multiple=0, max.multiple=Inf, min.combined=0, max.combined=Inf, min.overhang=0, max.overhang=Inf) {
  # sites           Names of a set of SJ.out.tab files generated by STAR; or a set of GRanges objects parsed by the LoadStarSj function
  # set.names       Names given to the sets
  # compare.pairs   If TRUE, compare the sets next to each other
  # canonical.only=FALSE, annotated.only=FALSE 
  # min.unique=0, max.unique=Inf
  # min.multiple=0, max.multiple=Inf 
  # min.combined=0, max.combined=Inf 
  # min.overhang=0, max.overhang=Inf       Filtering strategy
  
  if (class(sites) == 'character') sites<-lapply(sites, LoadStarSj);
  
  motif<-c('NONE', 'GTAG', 'CTAC', 'GCAG', 'CTGC', 'ATAC', 'GTAT');

  # Filter junction sites
  if (canonical.only) sites<-lapply(sites, function(s) s[s$motif != 'NONE']); # Junctions with canonical junction bases or not
  if (annotated.only) sites<-lapply(sites, function(s) s[s$annotated == 1]); # Junctions in the known junction sites only
  sites<-lapply(sites, function(s) s[s$n_unique>=min.unique & s$n_unique<=max.unique]);
  sites<-lapply(sites, function(s) s[s$n_multiple>=min.multiple & s$n_multiple<=max.multiple]);
  sites<-lapply(sites, function(s) s[(s$n_unique+s$n_multiple)>=min.combined & (s$n_unique+s$n_multiple)<=max.combined]);
  sites<-lapply(sites, function(s) s[s$overhang>=min.overhang & s$overhang<=max.overhang]);  

  # Sumamry stats
  stat<-cbind(
    Total_Unique_Reads=sapply(sites, function(s) sum(s$n_unique)),
    Total_Site=sapply(sites, length), 
    Motif=t(sapply(sites, function(s) table(s$motif)[motif])),
    Annotated=sapply(sites, function(s) length(s[s$annotated==1]))
  )

  # Compare 2 sets next to each other
  if (compare.pairs) {
    comp<-lapply(1:(length(sites)-1), function(i) CompareStarSj(sites[[i]], sites[[i+1]])); 
    added<-c(NA, sapply(comp, function(c) length(c[[1]]))); 
    removed<-c(NA, sapply(comp, function(c) length(c[[2]])));
    corr<-sapply(comp, function(c) c[[3]]); 
    corr<-rbind(c(NA, NA, NA), t(corr));
    colnames(corr)<-paste('Correlation', colnames(corr), sep='_');
    stat<-cbind(stat, Added=added, Removed=removed, corr);
  } else comp<-list();

  rownames(stat)<-set.names;
  
  list(summary=stat, comparison=comp);
}

# Compare 2 sets of splicing junctions 
CompareStarSj<-function(gr1, gr2) {
  # gr1, gr2    Two sets of splicing junctions as GRanges generated by the LoadStarSj function
  ol<-as.matrix(findOverlaps(gr1, gr2, type='equal'));
  
  g1<-gr1[unique(ol[,1])];
  g2<-gr2[unique(ol[,2])];
  
  n<-cbind(g1$n_unique, g2$n_unique, g1$n_multiple, g2$n_multiple);
  logN<-log10(n+1);
  
  corr<-c(Unique=cor(logN[,1], logN[,2]), Multiple=cor(logN[,3], logN[,4]), Combined=cor(logN[,1]+logN[,3], logN[,2]+logN[,4]));
  if (nrow(ol) == 0) added<-gr2 else added<-gr2[-unique(ol[,2])];
  if (nrow(ol) == 0) removed<-gr1 else removed<-gr1[-unique(ol[,1])];
  
  list(Added=added, Removed=removed, Correlation=corr);  
}

# Merge a set of splicing files generated by STAR into one file
MergeStarSj<-function(fns.in, fn.out) {
  # fns.in    Original STAR junction file names
  # fn.out    Output file name
  
  grs<-lapply(fns.in, LoadStarSj)
  
  dfs<-lapply(grs, function(g) as.data.frame(g, stringsAsFactors = FALSE));
  
  ids<-unlist(lapply(dfs, rownames), use.names=FALSE);
  chr<-unlist(lapply(dfs, function(d) d[, 1]), use.names=FALSE);
  stt<-unlist(lapply(dfs, function(d) d[, 2]), use.names=FALSE);
  end<-unlist(lapply(dfs, function(d) d[, 3]), use.names=FALSE);
  str<-unlist(lapply(dfs, function(d) d[, 5]), use.names=FALSE);
  mtf<-unlist(lapply(dfs, function(d) d[, 6]), use.names=FALSE);
  ann<-unlist(lapply(dfs, function(d) d[, 7]), use.names=FALSE);
  
  mp<-0:6;
  names(mp)<-c('NONE', 'GTAG', 'CTAC', 'GCAG', 'CTGC', 'ATAC', 'GTAT');
  mtf<-mp[mtf];
  
  mp<-0:2;
  names(mp)<-c('*', '+', '-');
  str<-mp[str];
  
  names(chr)<-names(stt)<-names(end)<-names(str)<-names(mtf)<-names(ann)<-ids;
  id<-sort(unique(ids));
  
  tbl<-data.frame(chr[id], stt[id], end[id], str[id], mtf[id], ann[id]);

  mtx<-matrix(nr=length(id), nc=length(grs), dimnames=list(id, 1:length(grs)));
  mtxs<-lapply(8:10, function(ind) {
    for (i in 1:length(grs)) mtx[rownames(dfs[[i]]), i]<-dfs[[i]][, ind];
    mtx;
  }); 
  
  uni<-rowSums(mtxs[[1]], na.rm=TRUE);
  mul<-rowSums(mtxs[[2]], na.rm=TRUE);
  ovh<-apply(mtxs[[3]], 1, max); 
  
  tbl<-cbind(tbl, uni, mul, ovh);
  colnames(tbl)<-colnames(dfs[[1]])[-4];
  
  write.table(tbl, fn.out, row.names=FALSE, col.names=FALSE, sep='\t');
  
  tbl;
}

# Load a splicing function file generated by STAR and parse it into a GRanges object
LoadStarSj<-function(fn) {
  # fn  Name of the SJ.out.tab file created by STAR
  cat('Loading junction sites:', fn, '\n');
  
  library(GenomicRanges);
  
  sj<-read.table(fn, sep='\t', stringsAsFactors=FALSE);
    
  gr<-GRanges(sj[[1]], IRanges(sj[[2]], sj[[3]]), c('*', '+', '-')[sj[[4]]+1]);
  
  gr$motif<-c('NONE', 'GTAG', 'CTAC', 'GCAG', 'CTGC', 'ATAC', 'GTAT')[sj[[5]]+1];
  gr$annotated<-sj[[6]];
  gr$n_unique<-sj[[7]];
  gr$n_multiple<-sj[[8]];
  gr$overhang<-sj[[9]];
  
  chr<-as.vector(seqnames(gr));
  chr<-sub('^chr', '', chr, ignore.case = TRUE);
  chr[chr %in% as.character(1:9)]<-paste('0', chr[chr %in% as.character(1:9)], sep='');
  stt<-start(gr);
  end<-end(gr); 
  str<-as.vector(strand(gr));
  str<-c('+'='f', '-'='r', '*'='n')[str];
  names(gr)<-paste(chr, stt, end, str, sep='_');
  gr<-gr[order(chr, stt)];
  
  gr;
}
zhezhangsh/Rnaseq documentation built on May 4, 2019, 11:20 p.m.