Nothing
############
mergeSeg<-function(segs,snum,ch,cL,cR, base.mean,base.sd,sd.reg,sd.long,num.mark.thresh,long.num.mark.thresh,low.frac.used,
low.frac.used.num.mark,baf.raw,baf.dat,index,braw.base.med,small.thresh, dev.sim.thresh, LRR,snp.ids,intid ) {
#segs is data.frame of DNAcopy segments from sample snum and chromosome ch
# cL and cR: left and right indices of centromere for the given chrom ch
#############################
runTrue2<-function(ss){
if(class(ss)!="logical")stop("input needs to be logical")
r<-rle(ss)
vals<-r[[2]]
lens<-r[[1]]
nv<-length(vals)
endp<-cumsum(lens) #end positions of each run
stp<-c(1,endp[1:(nv-1)]+1)
wt<-which(vals & lens>=2)
if(length(wt)==0) return(NULL)
endt<-endp[wt]
stt<-stp[wt]
out<-list(stt,endt)
names(out)<-c("start","end") # start and end of runs of TRUE
return(out)
}
########################
if(!is.element(class(segs),"data.frame")) stop("segment data is not a data.frame")
if(dim(segs)[1]==0)stop("error: no segment info given to merge")
colm<-c("scanID","chrom","left","right","num.mark","seg.mean","sd.fac","sex")
if(!all(is.element(colm,names(segs)))){
stop("incorrect column variables for segs - must be c(scanID,chrom,left,right,num.mark,seg.mean,sd.fac,sex)")}
if(!all(segs$scanID==snum))stop("data.frame of segments needs to be from same sample")
if(!all(segs$chrom==ch))stop("data.frame of segments needs to be from same chromosome")
sx<-segs$sex[1]
an<-segs[order(segs$left),]
an$merge<-FALSE
nms<-names(an)
if(dim(an)[1]<2) return(an)
frac.used<-an$num.mark/(an$right-an$left+1)
# denom is total number of markers in between, including intensity only
# this would make it more likely for frac.used to be smaller
# would be an unusual anom and would probably not want to merge
s1<-an$sd.fac>=sd.reg
s2<-an$num.mark>long.num.mark.thresh & an$sd.fac>=sd.long
s<-s1|s2
s3<-frac.used>low.frac.used | an$num.mark >= low.frac.used.num.mark
ss<-s & s3 # T for ones above threshold and not low.frac or above threshold and more markers if is low frac
out<-runTrue2(ss)
if(is.null(out)) return(an)
endt<-out$end
stt<-out$start
######## determine type: gain (1), loss (2), neutral (3)
## break up sets if 'type' changes
selgood<-is.element(intid,snp.ids)
stt2<-NULL;endt2<-NULL
for(i in 1:length(stt)){
ind<-stt[i]:endt[i]
tmp<-an[ind,] #set of consecutive anoms
tmp$type<-2
for(jk in 1:nrow(tmp)){
seg<-tmp[jk,]
ledge<-seg$left;redge<-seg$right
int<- intid>=intid[ledge] & intid<=intid[redge] # $left and right are intid values
sel<-int&selgood
lrrseg<-LRR[sel]
medlrr<-median(lrrseg,na.rm=TRUE)
sdlrr<-sd(lrrseg,na.rm=TRUE)
if(medlrr>0.3*sdlrr) tmp$type[jk]<-3
if(medlrr< -0.3*sdlrr) tmp$type[jk]<-1
}
ts<-tmp$type
r<-rle(ts)
vals<-r[[2]]
lens<-r[[1]]
nv<-length(vals)
endp<-cumsum(lens) #end positions of each run
stp<-c(1,endp[1:(nv-1)]+1)
wt<-which(lens>=2)
if(length(wt)==0){endt2<-endt2;stt2<-stt2} else {
endt2<-c(endt2,ind[endp[wt]])
stt2<-c(stt2,ind[stp[wt]])
}
}
if(is.null(stt2)|is.null(endt2)) return(an) # after accounting for type, no consec anoms
stt<-stt2;endt<-endt2 #stt and endt record sets as we go along
## have used 'type' and no longer need it
########
## break sets up if cross centromere
if(!is.null(cL) & !is.null(cR)){
# skip acrocentric; cL and cR are indices of intid; left and right are indices of intid
bk<-NULL
for(i in 1:length(stt)){
ind<-stt[i]:endt[i]
tmp<-an[ind,] #set of consecutive anoms
nn<-nrow(tmp) # will be at least 2
J<-0
for(j in 1:(nn-1)){
if(tmp$right[j]<=cL & tmp$left[j+1]>=cR) {
J<-j; break
}
}
if(J!=0){bk<-c(i,J); break}
}
if(!is.null(bk)){
ii<-bk[1];J<-bk[2]
ind<-stt[ii]:endt[ii]
if(length(stt)==1){
stt2<-c(stt[ii],ind[J+1])
endt2<-c(ind[J],endt[ii])
}
if(length(stt)>1){
if(ii==1){
stt2<-c(stt[ii],ind[J+1],stt[(ii+1):length(stt)])
endt2<-c(ind[J],endt[ii:length(endt)])
} else {
if(ii==length(stt)){
stt2<-c(stt[1:ii],ind[J+1])
endt2<-c(endt[1:(ii-1)],ind[J],endt[ii:length(endt)])
} else {
stt2<-c(stt[1:ii],ind[J+1],stt[(ii+1):length(stt)])
endt2<-c(endt[1:(ii-1)],ind[J],endt[ii:length(endt)])
}
}
}
sdiff<-endt2-stt2+1
sel<-sdiff>=2
stt<-stt2[sel]; endt<-endt2[sel]
}
if(length(stt)==0) return(an)
}
del.merge<-NULL
merged.anoms<-NULL
for(i in 1:length(stt)){
cnt<-0
mrg<-list()
ind<-stt[i]:endt[i]
tmp<-an[ind,]
## compute baf.dev.med
tmp$baf.dev.med<-NA
for(jjj in 1:nrow(tmp)){
tmp2<-tmp[jjj,]
set1<-index>=tmp2$left & index<=tmp2$right # indices that are baf eligible
abf<-baf.raw[set1]
bf.dev<-abs(abf-braw.base.med)
tmp$baf.dev.med[jjj]<-median(bf.dev,na.rm=TRUE)
}
#create strings of T/F depending upon num.mark
sel<-tmp$num.mark>=num.mark.thresh
tmp$ok<-FALSE
tmp$ok[sel]<-TRUE
if(all(tmp$ok)) next # if all intervals meet num.mark threshold, keep separate
if(all(!tmp$ok)){ # none meet num.mark threshold
ss<-tmp$sd.fac>=small.thresh
out<-runTrue2(ss)
if(!is.null(out)){
st<-out$start; ed<-out$end
nk<-length(st)
for(ii in 1:nk){
cnt<-cnt+1
mrg[[cnt]]<-st[ii]:ed[ii] #indicates intervals to merge
}
}
### ADD CODE for indicating merging results for that set
if(length(mrg) !=0){
dt<-unlist(mrg)
del.merge<-c(del.merge,ind[dt])
tpmerge<-NULL
for(jj in 1:length(mrg)){
x<-mrg[[jj]]
a1<-ind[x[1]];a2<-ind[x[length(x)]]
set1<-a1:a2
new.left<-an$left[a1];new.right<-an$right[a2]
new.num.mark<-sum(an$num.mark[set1])
new.seg.mean<-sum(an$seg.mean[set1]*an$num.mark[set1])/new.num.mark
new.sdfac<-abs(new.seg.mean-base.mean)/base.sd
new<-data.frame(snum,ch,new.left,new.right,new.num.mark,new.seg.mean,new.sdfac,sx,TRUE,stringsAsFactors=FALSE)
names(new)<-c("scanID","chrom","left","right","num.mark","seg.mean","sd.fac","sex","merge")
merged.anoms<-rbinddt(merged.anoms,new)
}
}
next # go to next set - which will restart cnt
}
# find positions of the T's
ww<-which(tmp$ok)
# initial block of potential F's in front of first T
k1<-ww[1] #position of first T
if(k1==2){
if(tmp$sd.fac[1]>=small.thresh){
cnt<-cnt+1;mrg[[cnt]]<-c(1,2) #merge if only one F before first T and meets thresh
}
}
if(k1==3){
if(tmp$sd.fac[k1-1]>=small.thresh & tmp$sd.fac[k1-2]<small.thresh){
cnt<-cnt+1
mrg[[cnt]]<-c(k1-1,k1) # FFT with first F not high but second F is
} else {
if(tmp$sd.fac[k1-1]>=small.thresh & tmp$sd.fac[k1-2]>=small.thresh){
cnt<-cnt+1
mrg[[cnt]]<-c(k1-2,k1-1) # both F's high - merge them together
}
}
}
if(k1>3){
if(tmp$sd.fac[k1-1]>=small.thresh & tmp$sd.fac[k1-2]<small.thresh){
flag<-1;ix<-1:(k1-3)} else {flag<-0; ix<-1:(k1-1)
} # sets of F's to consider looking for consec sd.fac large
ss<-tmp$sd.fac[ix]>=small.thresh
out<-runTrue2(ss)
if(!is.null(out)){
st<-out$start; ed<-out$end
nk<-length(st)
for(ii in 1:nk){
cnt<-cnt+1
mrg[[cnt]]<-ix[st[ii]:ed[ii]]
}
}
if(flag==1){
cnt<-cnt+1; mrg[[cnt]]<-c(k1-1,k1) # add on merging the FT to the sets of preceding consec F's that get merged
}
}
# end initial block
if(length(ww)>1){
for(kk in 2:length(ww)){
ki<-ww[kk]
kip<-ww[kk-1]
df<-ki-kip
if(df==1) next #we have 2 T's consecutive
if(df==2){ # TFT configuration
mnbd<-min(tmp$baf.dev.med[ki],tmp$baf.dev.med[kip])
if(mnbd==0) relerr<-10 else relerr<-abs(tmp$baf.dev.med[ki]-tmp$baf.dev.med[kip])/mnbd
if(relerr<=dev.sim.thresh){#
cnt<-cnt+1
mrg[[cnt]]<-kip:ki
} else { #deciding which T interval to merge the F interval with
df1<-abs(tmp$baf.dev.med[kip]-tmp$baf.dev.med[kip+1])
df2<-abs(tmp$baf.dev.med[ki] - tmp$baf.dev.med[kip+1])
cnt<-cnt+1
if(df1<df2)mrg[[cnt]]<-kip:(kip+1) else mrg[[cnt]]<-(kip+1):ki
}
}
if(df==3){ # TFFT configuration
sm<-tmp$num.mark[kip+1]+tmp$num.mark[kip+2]
mnbd<-min(tmp$baf.dev.med[k1],tmp$baf.dev.med[kip])
if(mnbd==0) relerr<-10 else relerr<-abs(tmp$baf.dev.med[ki]-tmp$baf.dev.med[kip])/mnbd
if(sm>=num.mark.thresh){
cnt<-cnt+1; mrg[[cnt]]<-(kip+1):(kip+2)
} else {
if(relerr<=dev.sim.thresh){ # if the T's similar, merge them with the F's
cnt<-cnt+1
mrg[[cnt]]<-kip:ki
} else { # determine which T each F merges with, if any
ws1<-c(kip,kip+1); ws2<-c(kip+2,ki)
if(tmp$sd.fac[kip+1]>=small.thresh){cnt<-cnt+1;mrg[[cnt]]<-ws1}
if(tmp$sd.fac[kip+2]>=small.thresh){cnt<-cnt+1;mrg[[cnt]]<-ws2}
}
}
}
if(df>3){ # just look for runs of F's with enough evidence
ix<-(kip+1):(ki-1)
ss<-tmp$sd.fac[ix]>=small.thresh
out<-runTrue2(ss)
if(!is.null(out)){
st<-out$start; ed<-out$end
nk<-length(st)
for(ii in 1:nk){
cnt<-cnt+1
mrg[[cnt]]<-ix[st[ii]:ed[ii]]
}
}
}
} # end of loop for 'middle' runs
} # end if on length of ww
# block of potential F's after last T
kf<-ww[length(ww)]
kd<-nrow(tmp)- kf # number of F's after the last T
if(kd==1) {
if(tmp$sd.fac[kf+1]>=small.thresh){cnt<-cnt+1;mrg[[cnt]]<-c(kf,kf+1) }
}
if(kd==2){ # TFF
c1<-tmp$sd.fac[kf+1]>=small.thresh
c2<-tmp$sd.fac[kf+2]>=small.thresh
if(c1&c2){ cnt<-cnt+1; mrg[[cnt]]<-c(kf+1,kf+2)} # still possible for it to get eliminated later if sum of num.mark too small
if(c1&!c2) {cnt<-cnt+1; mrg[[cnt]]<-c(kf,kf+1) }
}
if(kd>2){
c1<-tmp$sd.fac[kf+1]>=small.thresh
c2<-tmp$sd.fac[kf+2]>=small.thresh
if(c1 & !c2){
flag<-1; ix<-(kf+3):nrow(tmp) } else {
flag<-0; ix<-(kf+1):nrow(tmp)
}
ss<-tmp$sd.fac[ix]>=small.thresh
out<-runTrue2(ss)
if(!is.null(out)){
st<-out$start; ed<-out$end
nk<-length(st)
for(ii in 1:nk){
cnt<-cnt+1
mrg[[cnt]]<-ix[st[ii]:ed[ii]]
}
}
if(flag==1){
cnt<-cnt+1; mrg[[cnt]]<-c(kf,kf+1) # add on initial TF merge
}
}
####
## for given set i, we now have mrg with sets of indices to merge - check/deal with overlaps
## although clearly baf.dev.med 'similarity' is not transitive, there is no good way to control the degree of transitivity
## is have string of overlaps, how to decide which pieces to merge is not easy
## so most cases, transitivity works well enough
## i.e. if sets overlap, combine them
msets<-length(mrg)
if(msets>1){
ovlap<-rep(FALSE,msets-1)
for(m in 1:(msets-1)){
if(length(intersect(mrg[[m]],mrg[[m+1]]))!=0) ovlap[m]<-TRUE
}
r<-rle(ovlap)
vals<-r[[2]]
lens<-r[[1]]
nv<-length(vals)
endp<-cumsum(lens) #end positions of each run
stp<-c(1,endp[1:(nv-1)]+1)
wT<-which(vals==TRUE)
if(length(wT) ==0){ mrg2<-mrg} else {
used<-NULL
cnt2<-0
mrg2<-list()
for(kl in 1:length(wT)){
cnt2<-cnt2+1
s1<-stp[wT[kl]];e1<-endp[wT[kl]]
cm<-s1:(e1+1)
a<-mrg[cm]
used<-c(used,cm)
mrg2[[cnt2]]<-unique(c(a,recursive=TRUE))
}
oth<-setdiff(1:length(mrg),used)
if(length(oth)!=0){
for(ij in 1:length(oth)){
sel<-oth[ij]
cnt2<-cnt2+1
mrg2[[cnt2]]<-mrg[[sel]]
}
}
}
} else mrg2<-mrg
### ADD CODE for indicating merging results for that set
if(length(mrg2) !=0){
dt<-unlist(mrg2)
del.merge<-c(del.merge,ind[dt])
tpmerge<-NULL
for(jj in 1:length(mrg2)){
x<-mrg2[[jj]]
a1<-ind[x[1]];a2<-ind[x[length(x)]]
set1<-a1:a2
new.left<-an$left[a1];new.right<-an$right[a2]
new.num.mark<-sum(an$num.mark[set1])
new.seg.mean<-sum(an$seg.mean[set1]*an$num.mark[set1])/new.num.mark
new.sdfac<-abs(new.seg.mean-base.mean)/base.sd
new<-data.frame(snum,ch,new.left,new.right,new.num.mark,new.seg.mean,new.sdfac,sx,TRUE,stringsAsFactors=FALSE)
names(new)<-c("scanID","chrom","left","right","num.mark","seg.mean","sd.fac","sex","merge")
merged.anoms<-rbinddt(merged.anoms,new)
}
}
} # end of loop on i (number of sets of consec anoms)
if(length(del.merge)!=0){
tmp<-an[-del.merge,names(merged.anoms)]
out<-rbinddt(tmp,merged.anoms)
out<-out[order(out$left),]
} else out<-an
return(out)
} #end function
#########
###### delHomoRuns ##############
## function to possibly narrow segments found containing homo del
# look for adjustment for selected anoms
# to identify homozygous deletions
# looking for run of lrr values < lrr.cut then narrow to this run
# (BAF DNAcopy tends to not segment these well - often occur in longer homozygous runs)
delHomoRuns<-function(anoms,sid,eligible,intid,LRR,run.size,inter.size,
low.frac.used,lrr.cut,ct.thresh,frac.thresh){
#run.size - min length of run
#inter.size - number of homozygotes allowed to "interrupt" run
#low.frac.used - fraction of markers used compared to number of markers in interval
#lrr.cut-look for runs of lrr values below lrr.cut
#ct.thresh - minimum number of lrr values below lrr.cut needed in order to process
#frac.thresh - process only if (# lrr values below lrr.cut)/(# eligible lrr in interval) > frac.thresh
#anoms: data.frame of anomalies scanID, chrom, left, right, num.mark, seg.mean, sd.fac, sex, merge
if(!is.element("data.frame",class(anoms))) stop("anoms needs to be a data.frame")
annames<-c("scanID","chrom","left","right","num.mark","seg.mean","sd.fac","sex","merge")
if(!all(is.element(annames,names(anoms)))) stop("anoms does not have required variable names")
if(length(unique(anoms$scanID))!=1) stop("anoms needs to be for one sample")
anoms.rev<-NULL
for(I in 1:dim(anoms)[1]){
an<-anoms[I,]
snum<-an$scanID; chr<-an$chrom
ledge<-an$left;redge<-an$right
sindex <- which(is.element(sid, snum))
if(length(sindex)==0) stop(paste("Sample ",snum, " does not exist",sep=""))
##want to look for runs only in the already identified anomaly
int<-intid>=intid[ledge] & intid<=intid[redge] #$left and $right are indices of intid
selgood<-is.element(intid,eligible)
index<-which(selgood&int)
lrr <- LRR[index]
wn<-!is.na(lrr)
lrr<-lrr[wn]
index<-index[wn]
whm<-lrr< lrr.cut
ct<-sum(whm)
frac<-ct/length(index)
an$low.ct<-ct
an$nmark.lrr<-length(index)
pct<-an$num.mark/(an$right-an$left+1)
w<-frac>frac.thresh | pct<low.frac.used
if(ct< ct.thresh |!w){
an$old.left<-an$left; an$old.right<-an$right
anoms.rev<-rbinddt(anoms.rev,an)
next
}
who<-!whm
lrr[whm]<-0
lrr[who]<-1
rgt<-NULL;lft<-NULL
w<-rle(as.vector(lrr))
#w<-rle( ) has w[[1]] = lengths of runs, w[[2]] corresponding values
vals<-w[[2]];lngs<-w[[1]]
r0<-vals
rlen<-length(r0)
if(rlen==1){ # keep original if only one value
an$old.left<-an$left; an$old.right<-an$right
anoms.rev<-rbinddt(anoms.rev,an)
next
}
##establish initial positions of alternating runs
endp<-cumsum(lngs)
in.pos<-c(1,endp[1:(rlen-1)]+1)
##merging intervals if separated by < inter.size no. of undesirable values
# assuming sum of lengths of desirable intervals on either side meets run.size criterion
tpos<-which(r0==1&lngs<=inter.size) #identify small runs of homos
smf<-which(r0==0 & lngs<run.size) #identify 'small' runs of hets
if(length(tpos)!=0){
if(tpos[1]==1) {
if(lngs[2]>=run.size){r0[1]<-0}
tpos<-tpos[-1]}
if(length(tpos)!=0){
if(tpos[length(tpos)]==rlen) {
tpos<-tpos[-length(tpos)]; if(lngs[rlen-1]>=run.size) {r0[rlen]<-0}}
if(length(tpos)!=0){
for(k in tpos){if((lngs[k-1]+lngs[k+1])>=run.size) {r0[k]<-0}
}
}} }
##want smaller runs of 0s to become runs of undesirable but not if they
#are part of a combined run
run2<-rle(r0)
vals2<-run2[[2]];lngs2<-run2[[1]]
w0<-which(vals2==0 & lngs2>1)
if(length(w0)==0){r0[smf]<-1} else {
index.set<-NULL
for(j in 1:length(w0)){ if(w0[j]==1){start<-1} else {start<-sum(lngs2[1:(w0[j]-1)])+1}
end<-sum(lngs2[1:w0[j]])
index.set<-c(index.set,start:end)}
smf.use<-setdiff(smf,intersect(smf,index.set))
r0[smf.use]<-1}
##after merging some of the initial runs, we get modified listing of r0 vals
## look for runs here; e.g. if run of two 0s that means that initially
#we had a run of desirable with small run of undesirable - rle length of 2 then indicates
#putting those two original runs together as one run of desirable
new.rle<-rle(r0)
nvals<-new.rle[[2]]
nlens<-new.rle[[1]] ##indicates how many of original runs to put together
if(length(nvals)==1){ #all now classified as undesirable or as desirable = no change
an$old.left<-an$left; an$old.right<-an$right
anoms.rev<-rbinddt(anoms.rev,an)
next
}
newt<-which(nvals==0) #newt could be empty if originally there were no long het/miss runs
if(length(newt)==0){
an$old.left<-an$left; an$old.right<-an$right
anoms.rev<-rbinddt(anoms.rev,an)
next
}
left<-NULL
right<-NULL
### if newt indicates runs of 0s, change initial and end positions of
#runs of desired accordingly
if(newt[1]==1){left<-c(left,1);right<-c(right,in.pos[nlens[1]+1]-1);newt<-newt[-1]}
for(k in newt){ind<-sum(nlens[1:(k-1)]);left<-c(left,in.pos[ind+1])
kl<-length(newt);kk<-newt[kl]
if((ind+1+nlens[kk])<=length(in.pos)){
right<-c(right,in.pos[ind+1+nlens[k]]-1)} else {right<-c(right,length(index))}}
##right and left positions are indices of lrr (= indices of index)
## if splits into more than one run, leave as the original
if(length(right)==0|length(left)==0|length(right)>1|length(left)>1){
an$old.left<-an$left; an$old.right<-an$right
anoms.rev<-rbinddt(anoms.rev,an)
next
}
## there is one adjusted interval found
an$old.left<-an$left;an$old.right<-an$right
an$left<-index[left];an$right<-index[right]
anoms.rev<-rbinddt(anoms.rev,an)
} #end loop on anomalies
return(anoms.rev)
}
###############################
anomFilterBAF<-function(intenData, genoData, segments, snp.ids,
centromere,low.qual.ids=NULL,
num.mark.thresh=15,long.num.mark.thresh=200,sd.reg=2,sd.long=1,
low.frac.used=.1,run.size=10,inter.size=2,low.frac.used.num.mark=30,
very.low.frac.used=.01,low.qual.frac.num.mark=150,
lrr.cut= -2,ct.thresh=10,frac.thresh=.1,verbose=TRUE,
small.thresh=2.5, dev.sim.thresh=0.1, centSpan.fac=1.25, centSpan.nmark=50){
##segments - data.frame to determine which are anomalous
# names of data.frame must include "scanID","chromosome","num.mark","left.index","right.index","seg.mean"
# assume have segmented each chromosome (at least all autosomes) for a given sample
##snp.ids: vector of eligible snp ids
##centromere: data.frame with centromere position info
## low.qual.ids: sample numbers determined to be messy for which segments are filtered
# based on num.mark and fraction used
## num.mark.thresh: minimum size of segment to consider for anomaly
## long.num.mark.thresh: min number of markers for "long" segment
# (significance threshold allowed to be lower)
## sd.reg: number of standard deviations from segment mean
# compared to a baseline mean for "normal" needed to declare segment anomalous
## sd.long: same meaning as sd. long but applied to "long" segments
## low.frac.used: fraction used to declare a segment with
# low number hets or missing compared with number of markers in interval
## run.size, inter.size: for possible determination of homozygous deletions
# (see description in delHomoRuns function above)
## low.frac.used.num.mark: used in final step of deleting "small" low frac.used segments
# which tend to be false positives (after determining homo deletions)
## very.low.frac.used: any segments with (num.mark)/(number of markers in interval) less than this are filtered out
## low.qual.frac.num.mark: num.mark threshold for messy samples
##lrr.cut-look for runs of lrr values below lrr.cut to adjust homo del endpts
##ct.thresh - minimum number of lrr values below lrr.cut needed in order to adjust
##frac.thresh - adjust only if (# lrr values below lrr.cut)/(# eligible lrr in interval) > frac.thresh
##small.thresh - sd.fac threshold used in making merge decisions involving small num.mark segments
##dev.sim.thresh - relative error threshold for determining similarity in BAF deviations; used in merge decisions
## centSpan.fac - thresholds increased by this factor when considering filtering of centromere pieces
## centSpan.nmark - minimum number of markers for centromere cross
#############################
# check that intenData has BAF
if (!hasBAlleleFreq(intenData)) stop("BAlleleFreq not found in intenData")
# check that dimensions of intenData and genoData are equal
intenSnpID <- getSnpID(intenData)
genoSnpID <- getSnpID(genoData)
if (!all(intenSnpID == genoSnpID)) stop("snp dimensions of intenData and genoData differ")
intenScanID <- getScanID(intenData)
genoScanID <- getScanID(genoData)
if (!all(intenScanID == genoScanID)) stop("scan dimensions of intenData and genoData differ")
# check that sex is present in annotation
if (hasSex(intenData)) {
sex <- getSex(intenData)
} else if (hasSex(genoData)) {
sex <- getSex(genoData)
} else stop("sex not found in intenData or genoData")
intid <- intenSnpID
if(!all(is.element(snp.ids,intid))) stop("eligible snps not contained in snp ids")
sid <- intenScanID
chrom <- getChromosome(intenData)
Pos <- getPosition(intenData)
if(!is.element(class(segments),"data.frame")) stop("data is not a data.frame")
chk<-is.element(c("scanID","chromosome","num.mark","left.index","right.index","seg.mean"),names(segments))
if(!all(chk)) stop("Error in names of columns of data")
if(!is.element(class(centromere),"data.frame")) stop("centromere info is not a data.frame")
cchk<-is.element(c("chrom","left.base","right.base"),names(centromere))
if(!all(cchk)) stop("Error in names of centromere data.frame")
centromere$chrom[is.element(centromere$chrom, "X")] <- XchromCode(intenData)
centromere$chrom[is.element(centromere$chrom, "Y")] <- YchromCode(intenData)
centromere$chrom[is.element(centromere$chrom, "XY")] <- XYchromCode(intenData)
centromere$chrom <- as.integer(centromere$chrom)
# internal functions require these names, so convert from package standard
names(segments)[names(segments) == "chromosome"] <- "chrom"
names(segments)[names(segments) == "left.index"] <- "left"
names(segments)[names(segments) == "right.index"] <- "right"
##delete any segments from male samples on chromosome X
male<-sid[is.element(sex,"M")]
wdel<-is.element(segments$scanID,male)&segments$chrom==XchromCode(intenData)
anoms<-segments[!wdel,]
anoms<-anoms[order(anoms$scanID,anoms$chrom),]
#find unsegmented chromosomes
smpchr<-paste(anoms$scanID,anoms$chrom)
dup<-which(duplicated(smpchr))
sng<-which(!is.element(smpchr,smpchr[dup]))
anoms.sngl<-anoms[sng,] #unsegmented chromosomes
######
anoms2<-NULL #raw with sd factor
anoms.fil<-NULL #filtered
normal.info<-NULL
seg.info<-NULL
samples<-unique(anoms$scanID)
NS<-length(samples)
for(i in 1:NS){
snum<-samples[i]
if(floor(i/10)*10-i==0 & verbose==TRUE){
message(paste("processing ",i,"th scanID out of ",NS,sep=""))
}
sindex <- which(is.element(sid, snum))
if(length(sindex)==0) stop(paste("Sample ",snum, " does not exist",sep=""))
GENO <- getGenotype(genoData, snp=c(1,-1), scan=c(sindex,1))
## compute baf metric
sel<-is.element(intid,snp.ids) & (GENO == 1 | is.na(GENO))
baf <- getBAlleleFreq(intenData, snp=c(1,-1), scan=c(sindex,1))
ws<-!is.na(baf)
INDEX<-which(sel&ws)
CHR<-chrom[sel&ws]
BAF<-baf[sel&ws]
index<-NULL
chr<-NULL
baf.dat<-NULL
baf.raw<-NULL
uuch<-unique(anoms$chrom)
uch<-uuch[uuch!=XYchromCode(intenData)]
for(ch in uch){
wc<-CHR==ch #T/F for indices of CHR which match indices of INDEX,BAF
bf<-BAF[wc]
ind<-INDEX[wc]
chrm<-CHR[wc]
med<-median(bf,na.rm=TRUE)
bf1<-1-bf
bfm<-abs(bf-med)
c<-cbind(bf,bf1,bfm)
met<-apply(c,1,min)
baf.metric<-sqrt(met)
index<-c(index,ind)
chr<-c(chr,chrm)
baf.dat<-c(baf.dat,baf.metric)
baf.raw<-c(baf.raw,bf)
} #end of chrom loop
an<-anoms[is.element(anoms$scanID,snum),]
an.sngl<-anoms.sngl[is.element(anoms.sngl$scanID,snum),]
sel.chr.all<-an.sngl$chrom
if(sum(duplicated(an.sngl$chrom))!=0) stop(paste("Error in singletons for Sample ",snum,sep=" "))
## treat X chromosome and XY differently
# not included in baseline but do need to be compared
wX<-which(sel.chr.all==XchromCode(intenData))
wps<-which(sel.chr.all==XYchromCode(intenData))
if(length(wX)==0 & length(wps)==0){ sel.chr<-sel.chr.all} else {sel.chr<-sel.chr.all[-union(wX,wps)]}
if(length(sel.chr)<2) {w.selec<-which(!is.element(chr,c(XchromCode(intenData),XYchromCode(intenData))))} else {
##compare each autosome seg.mean with baseline based on other autosomes
an.snglo<-an.sngl[order(an.sngl$seg.mean,decreasing=TRUE),]
an.snglo<-an.snglo[is.element(an.snglo$chrom,sel.chr),]
sel.chro<-an.snglo$chrom #decreasing order of seg.mean so work with largest mean sequentially
N<-length(sel.chro) #sel.chro are autosome chroms unsegmented
flag<-1
j<-1
while(flag==1){
w.selec<-is.element(chr,sel.chro[(j+1):N])
bbase<-baf.dat[w.selec]
base.mean<-mean(bbase,na.rm=TRUE)
base.sd<-sd(bbase,na.rm=TRUE)
mean.chk<-an.snglo$seg.mean[is.element(an.snglo$chrom,sel.chro[j])]
if(abs(mean.chk-base.mean)/base.sd >sd.long){
if((N-j)==1) {flag<-0;keep<-N}
j<-j+1
} else {keep<- j:N; flag<-0}
}#end while
w.selec<-is.element(chr,sel.chro[keep])
} #end of else - selection of unsegmented chroms to use
##baseline now based on autosome unsegmented not identified as whole chrom anoms
bbase<-baf.dat[w.selec]
base.mean<-mean(bbase,na.rm=TRUE)
base.sd<-sd(bbase,na.rm=TRUE)
sd.fac<-abs(an$seg.mean-base.mean)/base.sd
braw.base<-baf.raw[w.selec]
braw.base.med<-median(braw.base,na.rm=TRUE)
if(length(sel.chr)<2){chr.ct<-0} else {chr.ct<-length(sel.chro[keep])}
normi<-data.frame(snum,base.mean,base.sd,braw.base.med,chr.ct)
names(normi)<-c("scanID","base.mean","base.sd","base.baf.med","chr.ct")
normal.info<-rbinddt(normal.info,normi)
an$sd.fac<-sd.fac
an$sex<-sex[sindex]
anoms2<-rbinddt(anoms2,an)
##an is segment/sd.fac info for the given sample
## base.mean and base.sd are for the given sample
##anoms2 now contains segment info along with sd.fac info - accumulating over samples
########## Centromere spanning, Merging
an<-an[order(an$chrom,an$left),]
an.seg.info<-NULL
CHR<-unique(an$chrom)
CHR<-CHR[CHR!=XYchromCode(intenData)]
LRR <- getLogRRatio(intenData, snp=c(1,-1), scan=c(sindex,1))
an3.fil<-NULL
for(ch in CHR){
anch<-an[an$chrom==ch,]
if(dim(anch)[1]==0) next
nsegs<-dim(anch)[1]
tp<-data.frame(snum,ch,nsegs)
names(tp)<-c("scanID","chrom","num.segs")
an.seg.info<-rbinddt(an.seg.info,tp)
## cent-span code insert here
centL<-centromere$left.base[centromere$chrom==ch]
centR<-centromere$right.base[centromere$chrom==ch]
cleft<-chrom==ch & Pos<centL & is.element(intid,index) # baf eligible
cright<-chrom==ch & Pos>centR & is.element(intid,index)
if(sum(cleft) == 0) {cL<-NULL; cR<-NULL} else {
cLL<-max(intid[cleft])
cRR<-min(intid[cright])
cL<-which(intid==cLL)
cR<-which(intid==cRR) # left and right are indices of intid
w<-anch$left<=cL & anch$right>=cR
if(sum(w)!=0){ # segment spans centromere - will be at most one
ach<-anch[w,]
anrest<-anch[!w,]
# check if segment passes basic filter
s2<-ach$num.mark>=centSpan.nmark & ach$sd.fac>=sd.reg
s3<-ach$num.mark>long.num.mark.thresh& ach$sd.fac>=sd.long
s<-(s2|s3)
if(!s){anch<-anrest} else {
# if passes filter, split into two pieces, one on either side of centromere
tmp<-ach #
tmp$right<-cL
tmp2<-ach
tmp2$left<-cR
tmpmarkers<-intid>=intid[tmp$left] & intid<=intid[tmp$right]
tmplrr<-is.element(intid,snp.ids) & tmpmarkers
tmpbaf<-index>=tmp$left & index<=tmp$right
tmp2markers<-intid>=intid[tmp2$left] & intid<=intid[tmp2$right]
tmp2baf<-index>=tmp2$left & index<=tmp2$right
tmp2lrr<-is.element(intid,snp.ids) & tmp2markers
# recompute/compute basic features for each piece
tmp$num.mark<-sum(tmpbaf)
tmp2$num.mark<-sum(tmp2baf)
tmp$seg.mean<-mean(baf.dat[tmpbaf],na.rm=TRUE)
tmp2$seg.mean<-mean(baf.dat[tmp2baf],na.rm=TRUE)
tmp$sd.fac<-abs(tmp$seg.mean-base.mean)/base.sd
tmp2$sd.fac<-abs(tmp2$seg.mean-base.mean)/base.sd
tmp$frac.used<-tmp$num.mark/sum(tmplrr)
tmp2$frac.used<-tmp2$num.mark/sum(tmp2lrr)
# test each piece for passing (somewhat more stringent) filter
s1<-tmp$num.mark>=centSpan.fac*num.mark.thresh
if(tmp$num.mark< 3*centSpan.fac* num.mark.thresh){
s2<-tmp$sd.fac>=centSpan.fac*sd.reg & tmp$frac.used>= centSpan.fac* low.frac.used
} else {
s2<-tmp$sd.fac>=centSpan.fac*sd.reg
}
s3<-tmp$num.mark>long.num.mark.thresh& tmp$sd.fac>=sd.long
t1<-(s2|s3) & s1
s1<-tmp2$num.mark>=centSpan.fac*num.mark.thresh
if(tmp2$num.mark< 3* centSpan.fac*num.mark.thresh){
s2<-tmp2$sd.fac>=centSpan.fac*sd.reg & tmp$frac.used>= centSpan.fac*low.frac.used
} else {
s2<-tmp2$sd.fac>=centSpan.fac*sd.reg
}
s3<-tmp2$num.mark>long.num.mark.thresh& tmp2$sd.fac>=sd.long
t2<-(s2|s3) & s1
if(!t1 & !t2) anch<-anrest #delete both failed
if(t1 & !t2) anch<-rbinddt(anrest,tmp[,names(anch)]) # delete one fail, keep other OK
if(!t1 & t2) anch<-rbinddt(anrest,tmp2[,names(anch)])
if(t1&t2){ # check same type and similar baf.dev (width)
# gain, loss, neutral
tlrr<-LRR[tmplrr]
mtlrr<-median(tlrr,na.rm=TRUE); sdtlrr<-sd(tlrr,na.rm=TRUE)
tmp$type<-3
if(mtlrr> 0.3*sdtlrr) tmp$type<-1
if(mtlrr< -0.3*sdtlrr) tmp$type<-2
# gain, loss, neutral
t2lrr<-LRR[tmp2lrr]
mt2lrr<-median(t2lrr,na.rm=TRUE); sdt2lrr<-sd(t2lrr,na.rm=TRUE)
tmp2$type<-3
if(mt2lrr> 0.3*sdt2lrr) tmp2$type<-1
if(mt2lrr< -0.3*sdt2lrr) tmp2$type<-2
q<-tmp$type==tmp2$type
br<-baf.raw[tmpbaf]
tmp$baf.dev.med<-median(abs(br-braw.base.med),na.rm=TRUE)
br2<-baf.raw[tmp2baf]
tmp2$baf.dev.med<-median(abs(br2-braw.base.med),na.rm=TRUE)
mbd<-min(tmp$baf.dev.med,tmp2$baf.dev.med)
if(mbd==0) relerr<-1 else relerr<-abs(tmp$baf.dev.med-tmp2$baf.dev.med)/mbd
q2<-relerr<=dev.sim.thresh
if(!q | !q2) { # keep each piece separate
anch<-rbinddt(anrest,tmp[,names(anch)],tmp2[,names(anch)])
}
# note at this stage, anch hasn't changed
}
}
if(nrow(anch)==0) next
anch<-anch[order(anch$left),]
}
}
tmp2<-mergeSeg(anch,snum,ch,cL,cR,base.mean,base.sd,sd.reg,sd.long,num.mark.thresh,long.num.mark.thresh,low.frac.used,low.frac.used.num.mark,baf.raw,baf.dat,index,braw.base.med,small.thresh, dev.sim.thresh, LRR,snp.ids,intid ) ## merged for given samp/chrom
tst.rev<-delHomoRuns(tmp2,sid,snp.ids,intid,LRR,
run.size,inter.size,low.frac.used,lrr.cut,ct.thresh,frac.thresh)
diff.right<-tst.rev$old.right-tst.rev$right
diff.left<-tst.rev$old.left-tst.rev$left
d.keep<-NULL
tst.rev$homodel.adjust<-FALSE
for(k in 1:dim(tst.rev)[1]) {
if(diff.right[k]==0 & diff.left[k]==0) next
d.keep<-c(d.keep,k)
int2<-intid>=intid[tst.rev$left[k]] & intid<=intid[tst.rev$right[k]]
selgood<-is.element(intid,snp.ids)
whm<- GENO == 1 | is.na(GENO)
index2<-which(int2&selgood&whm)
mt<-is.element(index,index2)
seg.mn<-mean(baf.dat[mt],na.rm=TRUE)
sdf<-abs(seg.mn-base.mean)/base.sd
tst.rev$num.mark[k]<-length(index2)
tst.rev$sd.fac[k]<-sdf
tst.rev$seg.mean[k]<-seg.mn
}
if(length(d.keep)!=0) tst.rev$homodel.adjust[d.keep]<-TRUE
# want to include small homo dels originally found exactly (so not adjusted)
ct<-tst.rev$low.ct
nlrr<-tst.rev$nmark.lrr
ws<-which(ct>=ct.thresh & ct/nlrr>=.9)
#keeps homo dels found exactly (didn't need adjustment) that might be too small to pass further filters
d.keep<-union(d.keep,ws)
colm<-c("scanID","chrom","left","right","num.mark","seg.mean","sd.fac","sex","merge","homodel.adjust")
tst.rev<-tst.rev[,colm]
######## FILTER ######################
s1<-d.keep #will keep any homo dels found
s2<-tst.rev$num.mark>=num.mark.thresh& tst.rev$sd.fac>=sd.reg
s3<-tst.rev$num.mark>long.num.mark.thresh& tst.rev$sd.fac>=sd.long
s<-which(s2|s3)
filin<-union(s,d.keep)
fil<-tst.rev[filin,]
an3.fil<-rbinddt(an3.fil,fil) #becomes filtered anoms for current sample
}#end ch loop
anoms.fil<-rbinddt(anoms.fil,an3.fil)
seg.info<-rbinddt(seg.info,an.seg.info)
} #end of sample loop
anoms2<-anoms2[order(anoms2$scanID,anoms2$chrom,anoms2$left),] #raw annotated
anoms2$left.base <- getPosition(intenData, index=anoms2$left)
anoms2$right.base <- getPosition(intenData, index=anoms2$right)
if(!is.null(anoms.fil)){
anoms.fil<-anoms.fil[order(anoms.fil$scanID,anoms.fil$chrom,anoms.fil$left),]
# convert index to base position
anoms.fil$left.base <- getPosition(intenData, index=anoms.fil$left)
anoms.fil$right.base <- getPosition(intenData, index=anoms.fil$right)
}
## XY filter
tmp<-anoms2[anoms2$chrom==XYchromCode(intenData),]
if(dim(tmp)[1]!=0){
s1<-tmp$num.mark>=num.mark.thresh& tmp$sd.fac>=sd.reg
s2<-tmp$num.mark>long.num.mark.thresh& tmp$sd.fac>=sd.long
if(sum(s1|s2)!=0){
an.XY<-tmp[s1|s2,]
an.XY$merge<-NA
an.XY$homodel.adjust<-NA
an.XY$left.base<-getPosition(intenData, index=an.XY$left)
an.XY$right.base<-getPosition(intenData, index=an.XY$right)} else an.XY<-NULL
anoms.fil<-rbinddt(anoms.fil,an.XY)
an.seg.info<-NULL
s<-unique(tmp$scanID)
for(snum in s){
an<-tmp[tmp$scanID==snum,]
ns<-dim(an)[1]
tt<-data.frame(snum,XYchromCode(intenData),ns)
names(tt)<-c("scanID","chrom","num.segs")
an.seg.info<-rbinddt(an.seg.info,tt)
}
seg.info<-rbinddt(seg.info,an.seg.info)
}
## further filtering based on low.frac.used and/or messiness
if(!is.null(anoms.fil)& dim(anoms.fil)[1]!=0){
nf<-dim(anoms.fil)[1]
frac.used<-NULL
for(kk in 1:nf){
int<-intid>=intid[anoms.fil$left[kk]] & intid<=intid[anoms.fil$right[kk]]
selgood<-is.element(intid,snp.ids)
int.ok<-int&selgood
frac.used<-c(frac.used,anoms.fil$num.mark[kk]/sum(int.ok))
}
anoms.fil$frac.used<-frac.used
wd1<-anoms.fil$frac.used<=very.low.frac.used
wd2<-anoms.fil$frac.used<low.frac.used & anoms.fil$num.mark<low.frac.used.num.mark
if(!is.null(low.qual.ids)){
wd3<-anoms.fil$frac.used<low.frac.used & anoms.fil$num.mark<low.qual.frac.num.mark & is.element(anoms.fil$scanID,low.qual.ids)
}
wdel<-wd1 | wd2
if(!is.null(low.qual.ids)) wdel<-wdel|wd3
anoms.fil<-anoms.fil[!wdel,]
}
seg.info<-seg.info[order(seg.info$scanID,seg.info$chrom),]
out<-list(anoms2,anoms.fil,normal.info,seg.info)
names(out)<-c("raw","filtered","base.info","seg.info")
# convert back to package standard names
for (i in 1:length(out)) {
if ("chrom" %in% names(out[[i]]))
names(out[[i]])[names(out[[i]]) == "chrom"] <- "chromosome"
if ("left" %in% names(out[[i]]))
names(out[[i]])[names(out[[i]]) == "left"] <- "left.index"
if ("right" %in% names(out[[i]]))
names(out[[i]])[names(out[[i]]) == "right"] <- "right.index"
}
return(out)
} #end of function
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.