spp-package: ChIP-seq (Solexa) Processing Pipeline

Description Details Author(s) References Examples

Description

A set of routines for reading short sequence alignments, calculating tag density, estimates of statistically significant enrichment/depletion along the chromosome, identifying point binding positions (peaks), and characterizing saturation properties related to sequencing depth.

Details

Package: spp
Type: Package
Version: 1.11
Date: 2012-06-20
License: What license is it under?
LazyLoad: yes

See example below for typical processing sequence.y

Author(s)

Peter Kharchenko <peter.kharchenko@post.harvard.edu>

References

Kharchenko P., Tolstorukov M., Park P. "Design and analysis of ChIP-seq experiments for DNA-binding proteins." Nature Biotech. doi:10.1038/nbt.1508

Examples

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
## Not run: 
  # load the library
  library(spp);

  ## The following section shows how to initialize a cluster of 8 nodes for parallel processing
  ## To enable parallel processing, uncomment the next three lines, 
  #and comment out "cluster<-NULL";
  ## see "snow" package manual for details.
  #library(snow)
  #cluster <- makeCluster(2);
  #invisible(clusterCall(cluster,source,"routines.r"));
  cluster <- NULL;


  
  # read in tag alignments
  chip.data <- read.eland.tags("chip.eland.alignment");
  input.data <- read.eland.tags("input.eland.alignment");
  
  # get binding info from cross-correlation profile
  # srange gives the possible range for the size of the protected region;
  # srange should be higher than tag length; making the upper 
  #boundary too high will increase calculation time
  #
  # bin - bin tags within the specified number of basepairs to speed up calculation;
  # increasing bin size decreases the accuracy of the determined parameters
  binding.characteristics <- get.binding.characteristics(chip.data,
    srange=c(50,500),
    bin=5,
    cluster=cluster);


  # plot cross-correlation profile
  pdf(file="example.crosscorrelation.pdf",width=5,height=5)
  par(mar = c(3.5,3.5,1.0,0.5), mgp = c(2,0.65,0), cex = 0.8);
  plot(binding.characteristics$cross.correlation,
    type='l',
    xlab="strand shift",
    ylab="cross-correlation");
  abline(v=binding.characteristics$peak$x,lty=2,col=2)
  dev.off();
  
  # select informative tags based on the binding characteristics
  chip.data <- select.informative.tags(chip.data,binding.characteristics);
  input.data <- select.informative.tags(input.data,binding.characteristics);

  # restrict or remove positions with anomalous number of tags relative
  # to the local density
  chip.data <- remove.local.tag.anomalies(chip.data);
  input.data <- remove.local.tag.anomalies(input.data);


  # output smoothed tag density (subtracting re-scaled input) into a WIG file
  # note that the tags are shifted by half of the peak separation distance
  smoothed.density <- get.smoothed.tag.density(chip.data,
    control.tags=input.data,
    bandwidth=200,
    step=100,
    tag.shift=round(binding.characteristics$peak$x/2));

  writewig(smoothed.density,
    "example.density.wig","Example smoothed, background-subtracted tag density");

  rm(smoothed.density);

  # output conservative enrichment estimates
  # alpha specifies significance level at which confidence intervals will be estimated
  enrichment.estimates <- get.conservative.fold.enrichment.profile(chip.data,
    input.data,
    fws=2*binding.characteristics$whs,
    step=100,
    alpha=0.01);
  
  writewig(enrichment.estimates,
    "example.enrichment.estimates.wig",
    "Example conservative fold-enrichment/depletion estimates shown on log2 scale");
  rm(enrichment.estimates);


  # binding detection parameters
  # desired FDR. Alternatively, an E-value can be supplied to the 
  #method calls below instead of the fdr parameter
  fdr <- 1e-2; 
  # the binding.characteristics contains the optimized half-size for binding detection window
  detection.window.halfsize <- binding.characteristics$whs;
  
  # determine binding positions using wtd method
  bp <- find.binding.positions(signal.data=chip.data,
    control.data=input.data,
    fdr=fdr,
    method=tag.wtd,
    whs=detection.window.halfsize,
    cluster=cluster)

  # alternatively determined binding positions using lwcc 
  # method (note: this takes longer than wtd)
  # bp <- find.binding.positions(signal.data=chip.data,control.data=input.data,
  #fdr=fdr,method=tag.lwcc,whs=detection.window.halfsize,cluster=cluster)
  
  print(paste("detected",sum(unlist(lapply(bp$npl,function(d) length(d$x)))),"peaks"));
  
  # output detected binding positions
  output.binding.results(bp,"example.binding.positions.txt");


  # ------------------------------------------------------------------------------------------- 
  # the set of commands in the following section illustrates methods for saturation analysis
  # these are separated from the previous section, since they are highly CPU intensive
  # ------------------------------------------------------------------------------------------- 

  # determine MSER
  # note: this will take approximately 10-15x the amount of time the initial 
  # binding detection did
  # The saturation criteria here is 0.99 consistency in the set of binding 
  # positions when adding 1e5 tags.
  # To ensure convergence the number of subsampled chains (n.chains) should be higher (80)
  mser <- get.mser(chip.data,
    input.data,
    step.size=1e5,
    test.agreement=0.99,
    n.chains=8,
    cluster=cluster,
    fdr=fdr,
    method=tag.wtd,
    whs=detection.window.halfsize)
  
  print(paste("MSER at a current depth is",mser));
  
  # note: an MSER value of 1 or very near one implies that the set of
  # detected binding positions satisfies saturation criteria without
  # additional selection by fold-enrichment ratios. In other words, 
  # the dataset has reached saturation in a traditional sense (absolute saturation).

  # interpolate MSER dependency on tag count
  # note: this requires considerably more calculations than the previous 
  # steps (~ 3x more than the first MSER calculation)
  # Here we interpolate MSER dependency to determine a point at which MSER of 2 is reached
  # The interpolation will be based on the difference in MSER at the 
  # current depth, and a depth at 5e5 fewer tags (n.steps=6);
  # evaluation of the intermediate points is omitted here to 
  # speed up the calculation (excluded.steps parameter)
  # A total of 7 chains is used here to speed up calculation, 
  # whereas a higher number of chains (50) would give good convergence
  msers <- get.mser.interpolation(chip.data,
    input.data,
    step.size=1e5,
    test.agreement=0.99, 
    target.fold.enrichment=2,
    n.chains=7,
    n.steps=6,
    excluded.steps=c(2:4),
    cluster=cluster,
    fdr=fdr,
    method=tag.wtd,
    whs=detection.window.halfsize)

  print(paste("predicted sequencing depth =",
    round(unlist(lapply(msers,function(x) x$prediction))/1e6,5)," million tags"))
  

  # note: the interpolation will return NA prediction if the dataset 
  # has reached absolute saturation at the current depth.
  # note: use return.chains=T to also calculated random chains 
  # returned under msers$chains field) - these can be passed back as
  #       "get.mser.interpolation( ..., chains=msers$chains)" to calculate 
  #       predictions for another target.fold.enrichment value
  #        without having to recalculate the random chain predictions.

  ## stop cluster if it was initialized
  #stopCluster(cluster);  


  
## End(Not run)

hms-dbmi/spp documentation built on Sept. 20, 2020, 7 p.m.