#' starfish_link
#'
#' This function loads a SV dataframe and generates "seed" CGR regions, "connected" CGR regions, and complex SVs.
#'
#'
#' @param sv_file a SV dataframe with 8 columns: "chrom1","pos1", "chrom2","pos2","svtype" (DEL,DUP,h2hINV,t2tINV,TRA),"strand1" (+/-) and "strand2" (+/-),"sample". Other svtypes like INV, INS, BND are not accepted
#' @param prefix the prefix for all intermediate files, default is none
#' @return a list of files: $interleave_tra_complex_sv contains complex SVs, $mergecall contains "seed" CGR regions, and $starfish_call contains "connected" CGR regions.
#' @export
starfish_link=function(sv_file,prefix=""){
chrlist=c(as.character(1:22),"X","Y")
svtotal=unique(sv_file)
########## no "chr" in chromosome name #######
svtotal$chrom1=gsub("Chr|chr","",svtotal$chrom1)
svtotal$chrom2=gsub("Chr|chr","",svtotal$chrom2)
###### pesudo cnv file #######
cnvtotal=as.data.frame(matrix(nrow=1,ncol=4))
colnames(cnvtotal)=c("chromosome","start","end","total_cn")
cnvtotal[1,]=c("X","0","1","0")
##############################
svtotal <-svtotal[(svtotal$chrom1 %in% chrlist & svtotal$chrom2 %in% chrlist),]
svtotal$end1=svtotal$pos1
svtotal$end2=svtotal$pos2
samplelist=unique(svtotal$sample)
ss6=data.frame()
ss3=data.frame()
sv=data.frame()
interleave_sv_total=data.frame()
for (i in c(1:length(samplelist))) {
############ 6 interleaved SVs ##############
SV =svtotal[svtotal$sample==samplelist[i],]
cnv=cnvtotal
SVSample <- ShatterSeeky::SVs(chrom1=as.character(SV$chrom1), pos1=as.numeric(SV$end1),chrom2=as.character(SV$chrom2), pos2=as.numeric(SV$end2),SVtype=as.character(SV$svtype),strand1=as.character(SV$strand1),strand2=as.character(SV$strand2))
CN_data=ShatterSeeky::CNVsegs(chrom=as.character(cnv$chromosome),start=as.numeric(cnv$start),end=as.numeric(cnv$end),total_cn=as.numeric(cnv$total_cn))
chromothripsis6 <- ShatterSeeky::shatterseek(SV.sample=SVSample,seg.sample=CN_data,min.Size=6)
chrss6=chromothripsis6@chromSummary
chrss6$sample=samplelist[i]
ss6=rbind(ss6,chrss6)
########## 3 interleaved SVs #################
chromothripsis3 <- ShatterSeeky::shatterseek(SV.sample=SVSample,seg.sample=CN_data,min.Size=3)
chrss3=chromothripsis3@chromSummary
chrss3$sample=samplelist[i]
ss3=rbind(ss3,chrss3)
########## interleaved sv #############
interleave_size=3
idx=which(sapply(chromothripsis3@detail$connComp,length)>=interleave_size)
if(length(idx)>0){
interleave_sv=chromothripsis3@detail$SV[as.numeric(unlist(chromothripsis3@detail$connComp[idx])) ,]
chrss_interleaved_intrasv=merge(SV,interleave_sv,by.x=c("chrom1","end1","chrom2","end2","strand1","strand2","svtype"),by.y=c("chrom1","pos1","chrom2","pos2","strand1","strand2","SVtype"))
interleave_sv_total=rbind(interleave_sv_total,chrss_interleaved_intrasv)
}
}
###### test for 6 interleaved SVs ########
shattercall6=ss6
ft=shattercall6$pval_fragment_joins
ent=shattercall6$chr_breakpoint_enrichment
ext=shattercall6$pval_exp_chr
exct=shattercall6$pval_exp_cluster
shattercall6$ft_fdr=p.adjust(ft, method = "fdr")
shattercall6$ent_fdr=p.adjust(ent, method = "fdr")
shattercall6$ext_fdr=p.adjust(ext, method = "fdr")
shattercall6$exct_fdr=p.adjust(exct, method = "fdr")
shattercall6$total_intra=shattercall6$number_DEL+shattercall6$number_DUP+shattercall6$number_h2hINV+shattercall6$number_t2tINV
if (length(samplelist)>50){
shattercall6$call6=ifelse(((shattercall6$total_intra>5)&(shattercall6$ft_fdr>0.2)&(shattercall6$ent_fdr<=0.2|shattercall6$exct_fdr<=0.2)),"chrss","no")
} else {
shattercall6$call6=ifelse(((shattercall6$total_intra>5)&(shattercall6$ft_fdr>0.05)&(shattercall6$ent_fdr<=0.2|shattercall6$exct_fdr<=0.2)),"chrss","no")
}
###### test for 3 interleaved SVs ########
shattercall3=ss3
ft=shattercall3$pval_fragment_joins
ent=shattercall3$chr_breakpoint_enrichment
ext=shattercall3$pval_exp_chr
exct=shattercall3$pval_exp_cluster
shattercall3$ft_fdr=p.adjust(ft, method = "fdr")
shattercall3$ent_fdr=p.adjust(ent, method = "fdr")
shattercall3$ext_fdr=p.adjust(ext, method = "fdr")
shattercall3$exct_fdr=p.adjust(exct, method = "fdr")
shattercall3$total_intra=shattercall3$number_DEL+shattercall3$number_DUP+shattercall3$number_h2hINV+shattercall3$number_t2tINV
if (length(samplelist)>50){
shattercall3$call3=ifelse(((shattercall3$total_intra>2)&(shattercall3$number_TRA>3)&(shattercall3$ft_fdr>0.2)),"chrss","no")
} else{
shattercall3$call3=ifelse(((shattercall3$total_intra>2)&(shattercall3$number_TRA>3)&(shattercall3$ft_fdr>0.05)),"chrss","no")
}
######## merge 3 and 6 interleaved SVs #####
shattercall6$call3= shattercall3$call3
shattercall3$call6=shattercall6$call6
keeps=c("chrom","start","end","sample","call3","call6")
shattercall6=shattercall6[keeps]
shattercall3=shattercall3[keeps]
mergecall=rbind(shattercall6[shattercall6$call6=="chrss"&shattercall6$call3=="no",],shattercall3[shattercall3$call3=="chrss",])
if(nrow(mergecall)==0){
print("No CGR region is identified!")
} else {
colnames(mergecall)=c("chr","start","end","sample","call3","call6")
######## call linking groups #######
chrss=mergecall
chrss$format_id=paste0(chrss$sample,"_",chrss$chr)
chrss$size=chrss$end-chrss$start
chrss$chrss_status="chrss"
chrss$chrss_chr=chrss$chr
svlist=svtotal
svlist$start1=svlist$end1-1
svlist$start2=svlist$end2-1
svlist$size=svlist$end2-svlist$end1
format_id=chrss$format_id
ss=data.frame()
chrsslink=function(chrss,formatid=format_id){
for (i in 1:length(chrss$chr)){
##### identify TRA in 10k ##########
inter1=max(chrss$start[i]-10000,0)
inter2=chrss$end[i]+10000
sv=svlist[svlist$sample==chrss$sample[i],]
svtra=sv[((sv$chrom1==chrss$chr[i]&sv$end1>=inter1&sv$end1<=inter2)|(sv$chrom2==chrss$chr[i]&sv$end2>=inter1&sv$end2<=inter2))&(sv$svtype=="TRA"),]
if (length(svtra$chrom1)>0){
svtra$chr=ifelse(svtra$chrom1==chrss$chr[i],svtra$chrom2,svtra$chrom1)
svtra$tra_pos1=ifelse(svtra$chrom1==chrss$chr[i],svtra$end2,svtra$end1)
svtra$chrss_chr=ifelse(svtra$chrom1==chrss$chr[i],svtra$chrom1,svtra$chrom2)
svtra$sample=chrss$sample[i]
svtra <- plyr::ddply(svtra, .(chr), transform, n = length(chr))
svtra=svtra[svtra$n>0,]
if (length(svtra$chrom1)>0){
svtramin=aggregate(svtra$tra_pos1, by = list(svtra$chr), min)
colnames(svtramin)=c("chr","start")
svtramax=aggregate(svtra$tra_pos1, by = list(svtra$chr), max)
colnames(svtramax)=c("chr","end")
svtra2=merge(svtramin,svtramax,by.x=("chr"),by.y=("chr"))
svtra=svtra[!duplicated(svtra[,c('chr')]),]
svtra3=merge(x = svtra2, y = svtra[ , c("chr","chrss_chr", "sample","n")], by = "chr", all.x=TRUE)
ss=rbind(ss,svtra3)
}
}
}
if(nrow(ss)>0){
ss$format_id=paste0(ss$sample,"_",ss$chr)
idlist=unique(ss$format_id)
# requires at least two chrss linked tra
linkss=ss[1,]
linkss2=data.frame()
for (id in idlist){
ss_id=ss[ss$format_id==id,]
if(sum(ss_id$n)>1){
# judge the chromosome is an original chrss or not
if(!(id %in% formatid)){
# caculate the region for the link region
linkss$chr=unique(ss_id$chr)
linkss$start=min(ss_id$start,ss_id$end)
linkss$end=max(ss_id$start,ss_id$end)
linkss$chrss_chr=paste(c(chrss[chrss$format_id==id,]$chrss_chr,paste(c(ss_id$chrss_chr),collapse="_",sep="_")),collapse="_",sep="_")
linkss$sample=unique(ss_id$sample)
linkss$format_id=unique(ss_id$format_id)
linkss$chrss_status="no"
linkss2=rbind(linkss2,linkss)
}
if((id %in% formatid)){
# caculate the region for the link region
linkss$chr=unique(ss_id$chr)
# assign start and end based on TRA position
linkss$start=min(ss_id$start,ss_id$end)
linkss$end=max(ss_id$start,ss_id$end)
linkss$chrss_chr=paste(c(chrss[chrss$format_id==id,]$chrss_chr,paste(c(ss_id$chrss_chr),collapse="_",sep="_")),collapse="_",sep="_")
linkss$sample=unique(ss_id$sample)
linkss$format_id=unique(ss_id$format_id)
linkss$chrss_status="chrss"
linkss2=rbind(linkss2,linkss)
}
}
}
# assign start and end based on chrss cluster
if(nrow(linkss2)>0){
linkss2=merge(linkss2,chrss[,c("format_id","start","end")],by.x=("format_id"),by.y=("format_id"),all.x=T)
linkss2$start=ifelse(!is.na(linkss2$start.y),ifelse(linkss2$start.x<linkss2$start.y,linkss2$start.x,linkss2$start.y),linkss2$start.x)
linkss2$end=ifelse(!is.na(linkss2$end.y),ifelse(linkss2$end.x>linkss2$end.y,linkss2$end.x,linkss2$end.y),linkss2$end.x)
keeps=c("chr","start","end","chrss_chr","sample","format_id","chrss_status")
linkss2=linkss2[keeps]
# merge with original chrss
chrss2=chrss[keeps]
linkss2=rbind(linkss2,chrss2)
idlist=unique(linkss2$format_id)
linkss=linkss2[1,]
linkss3=data.frame()
for (id in idlist){
ss_id=linkss2[linkss2$format_id==id,]
# caculate the region for the link region
linkss$chr=unique(ss_id$chr)
linkss$start=min(ss_id$start,ss_id$end)
linkss$end=max(ss_id$start,ss_id$end)
linkss$chrss_chr=paste(c(ss_id$chr,ss_id$chrss_chr),collapse="_",sep="_")
linkss$sample=unique(ss_id$sample)
linkss$format_id=unique(ss_id$format_id)
linkss$chrss_status=unique(linkss2[linkss2$format_id==id,]$chrss_status)
linkss3=rbind(linkss3,linkss)
}
} else {
linkss3=chrss[c("chr","start","end","chrss_chr","sample","format_id","chrss_status")]
}
} else {
linkss3=chrss[c("chr","start","end","chrss_chr","sample","format_id","chrss_status")]
}
return(linkss3)
}
chrsslink1=chrsslink(chrss)
chrsslink2=chrsslink(chrsslink1)
# iteratively search and update the chromothriptic "seed" list
round=0
while (!identical(chrsslink1,chrsslink2)){
chrsslink1=chrsslink(chrsslink2)
chrsslink2=chrsslink(chrsslink1)
chrsslink1$chrss_chr <- sapply(strsplit(chrsslink1$chrss_chr, "_", fixed = TRUE), function(x)
paste(unique(x), collapse = "_"))
chrsslink2$chrss_chr <- sapply(strsplit(chrsslink2$chrss_chr, "_", fixed = TRUE), function(x)
paste(unique(x), collapse = "_"))
round=round+1
print(paste0("Iteration ",round))
print("Starfish is connecting chromothriptic regions...")
}
# split and unique value in chrss_chr
chrsslink3=chrsslink1
chrsslink1$chr=gsub("X","23",chrsslink1$chr)
chrsslink1$chrss_chr=gsub("X","23",chrsslink1$chrss_chr)
chrsslink1$chr=gsub("Y","24",chrsslink1$chr)
chrsslink1$chrss_chr=gsub("Y","24",chrsslink1$chrss_chr)
chrsslink1$link_chromosome=0
chrsslink1$chrss_chr=strsplit(as.character(chrsslink1$chrss_chr), "_", fixed = TRUE)
for (j in 1:length(chrsslink1$sample)){
id=chrsslink1$sample[j]
chr=chrsslink1$chr[j]
chrssid=chrsslink1[chrsslink1$sample==id,]
# identify chrss_chr contains chr with grepl
link_chrss=as.character(sort(as.numeric(unique(unlist(c(chrssid[chrssid$chr==chr,]$chrss_chr))))))
# find intersect rows with link_chrss
link_tra=as.character(sort(as.numeric(unique(unlist(c(chrssid$chrss_chr[unlist(lapply(chrssid$chrss_chr, function(x,y=link_chrss) length(intersect(y, x))>0))]))))))
round=0
while (!identical(link_chrss,link_tra)){
link_chrss=link_tra=as.character(sort(as.numeric(unique(unlist(c(chrssid$chrss_chr[unlist(lapply(chrssid$chrss_chr, function(x,y=link_tra) length(intersect(y, x))>0))]))))))
link_tra=link_tra=as.character(sort(as.numeric(unique(unlist(c(chrssid$chrss_chr[unlist(lapply(chrssid$chrss_chr, function(x,y=link_chrss) length(intersect(y, x))>0))]))))))
round=round+1
}
link_chr=paste(link_chrss,collapse="_")
chrsslink1$link_chromosome[j]=link_chr
}
chrsslink1$chrss_chr <- sapply(chrsslink1$chrss_chr, function(x) paste(unique(x), collapse = "_"))
chrsslink1$chr=gsub("23","X",chrsslink1$chr)
chrsslink1$chr=gsub("24","Y",chrsslink1$chr)
chrsslink1$chrss_chr=gsub("23","X",chrsslink1$chrss_chr)
chrsslink1$link_chromosome=gsub("23","X",chrsslink1$link_chromosome)
chrsslink1$chrss_chr=gsub("24","Y",chrsslink1$chrss_chr)
chrsslink1$link_chromosome=gsub("24","Y",chrsslink1$link_chromosome)
chrsslink1$cluster_id=paste(chrsslink1$sample,chrsslink1$link_chromosome,sep="_")
chrsslink1$chrss_status=gsub("no","link",chrsslink1$chrss_status)
########## integrate TRA back to interleave SV cluster ##############
samplelist=unique(chrsslink1$sample)
ssmatrix=data.frame()
for(i in c(1:length(samplelist))){
svss=svtotal[svtotal$sample==samplelist[i],]
svss_inter=svss[svss$svtype=="TRA",]
svss_inter=svss_inter[c("chrom1","end1","chrom2","end2","svtype")]
svss_intra=interleave_sv_total[interleave_sv_total$sample==samplelist[i],]
svss_intra=svss_intra[c("chrom1","end1","chrom2","end2","svtype")]
svss=rbind(svss_inter,svss_intra)
if(nrow(svss)>0){
svlist3=chrsslink1[chrsslink1$sample==samplelist[i],]
svss$complex=0
svss$cluster_id=""
for (j in 1:length(svss$chrom1)){
t1=0
t2=0
clusterid1=""
clusterid2=""
for (k in 1:length(svlist3$chr)){
t1=ifelse(svss$chrom1[j]==svlist3$chr[k] & svss$end1[j]>=svlist3$start[k] & svss$end1[j]<=svlist3$end[k],1,0)+t1
cluster_new1=ifelse(svss$chrom1[j]==svlist3$chr[k] & svss$end1[j]>=svlist3$start[k] & svss$end1[j]<=svlist3$end[k],svlist3$cluster_id[k],"")
clusterid1=ifelse(cluster_new1!="",cluster_new1,clusterid1)
}
for (k in 1:length(svlist3$chr)){
t2=ifelse(svss$chrom2[j]==svlist3$chr[k] & svss$end2[j]>=svlist3$start[k] & svss$end2[j]<=svlist3$end[k],1,0)+t2
cluster_new2=ifelse(svss$chrom2[j]==svlist3$chr[k] & svss$end2[j]>=svlist3$start[k] & svss$end2[j]<=svlist3$end[k],svlist3$cluster_id[k],"")
clusterid2=ifelse(cluster_new2!="",cluster_new2,clusterid2)
}
svss$complex[j]=t1+t2
if(clusterid1==clusterid2&clusterid1!=""){
clusternew=clusterid1
} else if(clusterid1==""&clusterid2!=""){
clusternew=clusterid2
}else if(clusterid1!=""&clusterid2==""){
clusternew=clusterid1
} else if(clusterid1!=clusterid2&clusterid1!=""&clusterid2!=""){
clusternew=paste0(clusterid1,",",clusterid2)
} else if(clusterid1==""&clusterid2==""){
clusternew=","
}
svss$cluster_id[j]=clusternew
}
svss2=svss[svss$complex>1,]
if(nrow(svss2)>0){
svss2=svss2[c("chrom1","end1","chrom2","end2","svtype","complex","cluster_id")]
svss2$sample=samplelist[i]
ssmatrix=rbind(ssmatrix,svss2)
}
}
}
smatrix2=unique(ssmatrix)
##### remove different or none CGR SVs #########
smatrix2$comman <- lengths(regmatches(smatrix2$cluster_id, gregexpr(",", smatrix2$cluster_id)))
smatrix2=smatrix2[smatrix2$comman==0,]
smatrix2=smatrix2[c("chrom1","end1","chrom2","end2","svtype","complex","sample","cluster_id")]
colnames(smatrix2)=c("chrom1","pos1","chrom2","pos2","svtype","complex","sample","cluster_id")
mergecall$call3=gsub("chrss","CGR",mergecall$call3)
mergecall$call6=gsub("chrss","CGR",mergecall$call6)
colnames(chrsslink1)=c("chr","start","end","CGR_chr","sample","format_id","CGR_status","link_chromosome","cluster_id")
chrsslink1$CGR_status=gsub("chrss","CGR",chrsslink1$CGR_status)
chrsslink1=chrsslink1[c("chr","start","end","sample","CGR_status","link_chromosome","cluster_id")]
starfish_list=list("ss6"=ss6,"ss3"=ss3,"interleave_sv"=interleave_sv_total,"interleave_tra_complex_sv"=smatrix2,"shattercall6"=shattercall6,"shattercall3"=shattercall3,"mergecall"=mergecall,"starfish_call"=chrsslink1)
filename=ifelse(prefix=="","Connected_CGR_event.csv",paste0(prefix,"_connected_CGR_event.csv"))
write.csv(chrsslink1,filename,row.names = F)
sv_filename=ifelse(prefix=="","Connected_CGR_complex_SV.csv",paste0(prefix,"_connected_CGR_complex_SV.csv"))
write.csv(smatrix2,sv_filename,row.names = F)
return(starfish_list)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.