#' starfish_feature
#'
#' This function loads "connected" CGR regions and complex SVs reported by starfish_link, then combines CNV calls and sample gender to construct a CGR region vs. feature matrix for downstream clustering and classification.
#'
#' @param cgr "connected" CGR regions, which is the output of starfish_link_out$starfish_call
#' @param complex_sv complex SVs, which is the output from starfish_link_out$interleave_tra_complex_sv
#' @param cnv_file a CNV dataframe with 5 columns: "chromosome","start","end","total_cn", and "sample". "total_cn" should contain absolute copy numbers
#' @param gender_file a sample table with 2 columns: "sample" and "gender". Gender could be "Female, "female","F","f","Male","male","M", or "m". If the gender is unknown, any other characters could be given, such as "unknown", and the gender will be inferred by the CN baseline of chromosome X
#' @param prefix the prefix for all intermediate files, default is none
#' @param genome_v which genome assembly was used to call SV and CNV. It should be "hg19" or "hg38", default is "hg19"
#' @param cnv_factor the CN fluctuation beyond or below baseline to identify loss and gain fragments for samples with decimal CN, default is "auto", or users can provide a value between 0 and 1
#' @param arm_del_rm the logical value of removing arm level deletion or not, default is TRUE
#' @return a list of files: $cluster_feature is the CGR region vs. feature matrix,$cnv_baseline is the cnv file with baseline annotation
#' @export
starfish_feature=function(cgr,complex_sv,cnv_file,gender_file,prefix="",genome_v="hg19",cnv_factor="auto",arm_del_rm=TRUE){
print("Starfish is computing the feature matrix, please be patient...")
if(genome_v=="hg19"){
chrlength=hg19_chr_length
} else if(genome_v=="hg38"){
chrlength=hg38_chr_length
} else {
print("Wrong genome version!")
}
chrlist=c(as.character(1:22),"X","Y")
cgr$range_size=cgr$end-cgr$start+1
cgr$chr=as.character(cgr$chr)
chrlength$chrom=as.character(chrlength$chrom)
cgr=merge(cgr,unique(chrlength[c("chrom","chr_size")]),by.x=("chr"),by.y=("chrom"))
samplelist=unique(cgr$cluster_id)
complex_sv$end1=complex_sv$pos1
complex_sv$end2=complex_sv$pos2
complex_sv$start1=complex_sv$end1-1
complex_sv$start2=complex_sv$end2-1
complex_sv=complex_sv[complex_sv$sample %in% cgr$sample,]
complex_sv$size=complex_sv$end2-complex_sv$end1
###### format CNV file ######
cnv_total=cnv_file[!is.na(cnv_file$total_cn),]
cnv_total=cnv_total[c("chromosome","start","end","total_cn","sample")]
cnv_total$chromosome=as.character(cnv_total$chromosome)
cnv_total$chromosome=gsub("Chr|chr","",cnv_total$chromosome)
cnv_total$total_cn=round(cnv_total$total_cn,digits=2)
cnv_total$total_cn=round(cnv_total$total_cn,digits=1)
cnv_total=cnv_total[cnv_total$chromosome %in% chrlist,]
cnv_total=cnv_total[cnv_total$start!=cnv_total$end,]
cnv_total$size=cnv_total$end-cnv_total$start+1
cnv_total=cnv_total[with(cnv_total,order(sample, chromosome,start)),]
############## matrix of features ###############
cluster=cgr[c("cluster_id","sample")]
gender_file$gender=ifelse(gender_file$gender%in% c("female","f","F"),"Female",ifelse(gender_file$gender %in% c("male","m","M"),"Male",gender_file$gender))
if(nrow(cluster[!cluster$sample %in% gender_file$sample,])>0){
cluster_no_gender=cluster[!cluster$sample %in% gender_file$sample,]
print(paste0("There is no gender file for sample ",c(unique(cluster_no_gender$sample))))
}
cluster=merge(cluster,gender_file,by.x="sample",by.y="sample")
cluster=cluster[!duplicated(cluster[,c("cluster_id")]),]
cluster$brk_10mb <- 0
cluster$brk_1mb <- 0
cluster$sv_avg_size <- 0
cluster$CN_state <- 0
cluster$cnv_state_mean <- 0
cluster$max_CNV <- 0
cluster$del_size=0
cluster$Loss_size_percentage=0
cluster$Loss_chr_percentage=0
cluster$dup_size=0
cluster$Gain_size_percentage=0
cluster$Gain_chr_percentage=0
cluster$range_size_sum=0
cluster$range_size_mean=0
cluster$arm_deletion=0
cluster$telo_deletion_size=0
cluster$max_telo_loss_percentage=0
cluster$Telo_loss_ratio_mean=0
cluster$Loss_number_10Mb=0
cluster$Gain_number_10Mb=0
cluster$Chrss_size_ratio=0
cluster$telo_vs_chrss_del_mean_ratio=0
cluster$telo_vs_chrss_del_max_ratio=0
cluster$chrss_chr_number=0
cluster$Loss_size_mean_ratio=0
cluster$Gain_size_mean_ratio=0
cluster$Loss_number_length=0
cluster$Gain_number_length=0
cluster$Brk_dispersion_MAD <- 0
cluster$Brk_dispersion_MAD_mean_total=0
####### only calculate feature for samples with cnv #########
if(nrow(cluster[!cluster$sample %in% cnv_total$sample,])>0){
cluster_no=cluster[!cluster$sample %in% cnv_total$sample,]
print(paste0("There is no CNV file for sample ",c(unique(cluster_no$sample))))
}
cluster=cluster[cluster$sample %in% cnv_total$sample,]
row.names(cluster)=1:nrow(cluster)
cnvtotal=data.frame()
for (i in 1:nrow(cluster)) {
print(paste0(format(i/nrow(cluster)*100,digits=1),"% is done..."))
print(paste0("Event ",cluster$cluster_id[i]))
chrss_cluster=cgr[cgr$cluster_id==cluster$cluster_id[i],]
donor_sex=cluster$gender[i]
if(!donor_sex %in% c("Female","Male")){
print (paste0("There is no gender information for sample ",cluster$sample[i],", will infer the gender based on CN of chromosome X"))
}
chrn=length(unique(chrss_cluster$chr))
cluster$chrss_chr_number[i]=chrn
chrss_size_mean=mean(chrss_cluster$range_size)
chrss_size_sum=sum(chrss_cluster$range_size)
cluster$range_size_mean[i]=chrss_size_mean
cluster$range_size_sum[i]=chrss_size_sum
range_ratio=chrss_size_sum/sum(chrss_cluster$chr_size)
cluster$Chrss_size_ratio[i]=range_ratio
cnv=cnv_total[cnv_total$sample==cluster$sample[i],]
cnv=na.omit(cnv)
cnv$ovl=0
cnv$telo_loss=0
if(nrow(cnv)>0){
if(nrow(cnv)>1){
######### merge neighbor fragments with the same cnv ######
for (t in 1:(nrow(cnv)-1)){
if (cnv$chromosome[t]==cnv$chromosome[t+1]&cnv$total_cn[t]==cnv$total_cn[t+1]){
cnv$start[t+1]=cnv$start[t]
cnv$ovl[t]=1
}
}
}
cnv=cnv[cnv$ovl==0,]
cnv$size=cnv$end-cnv$start+1
###################################
##### determine background cnv and deletion #####
cnv_background=as.data.frame(cnv %>% dplyr::group_by(chromosome,total_cn) %>% dplyr::summarize(size=sum(size)))
# cnv_background=cnv_background[cnv_background$total_cn>1.5,]
cnv_background2=as.data.frame(cnv_background %>% dplyr::group_by(chromosome) %>% dplyr::top_n(n=1, wt = size))
colnames(cnv_background2)=c("chromosome","background_cnv","background_size")
cnv=merge(cnv,cnv_background2,by.x="chromosome",by.y="chromosome")
# ####### use the whole CGR baseline CNV as the baseline #######
# cnv_background=as.data.frame(cnv %>% dplyr::group_by(total_cn) %>% dplyr::summarize(size=sum(size)))
# cnv_background2=cnv_background[cnv_background$size==max(cnv_background$size),]
# cnv$background_cnv=cnv_background2$total_cn
# cnv$background_size=cnv_background2$size
###############################
########### determine cnv is decimal or integer #######
cnv_integer=sum(cnv$total_cn %%1)
cnv_integer=ifelse(cnv_integer>0,"decimal","integer")
donor_sex=ifelse(nrow(cnv[cnv$chromosome=="X",])>0,donor_sex,"Male")
cnv_raw=cnv
cnv=cnv[cnv$chromosome %in% chrss_cluster$chr,]
if(cnv_factor=="auto"){
if(cnv_integer=="integer"){
# if (donor_sex=="Male"){
# cnv$copy_loss=ifelse((cnv$total_cn<cnv$background_cnv|(cnv$chromosome!="X"&cnv$total_cn<2)|(cnv$chromosome=="X"&cnv$total_cn<1)),"deletion",ifelse((cnv$total_cn>cnv$background_cnv)&((cnv$chromosome=="X"&cnv$total_cn>1)|(cnv$chromosome!="X"&cnv$total_cn>2)) ,"gain","neutral"))
#
# } else {
# cnv$copy_loss=ifelse((cnv$total_cn<(cnv$background_cnv)|cnv$total_cn<2),"deletion",ifelse(cnv$total_cn>(cnv$background_cnv)&cnv$total_cn>2,"gain","neutral"))
#
# }
cnv_factor=0
loss_factor=1-cnv_factor
gain_factor=1+cnv_factor
#### infer gender based on CN of X ####
if (donor_sex!="Male"&donor_sex!="Female"){
donor_sex=ifelse((unique(cnv_raw[cnv_raw$chromosome=="X",]$background_cnv)<=1*gain_factor&unique(cnv_raw[cnv_raw$chromosome=="X",]$background_cnv)>=1*loss_factor),"Male","Female")
print(paste0("Sample gender is inferred as ",donor_sex,"!"))
}
if (donor_sex=="Male"){
cnv$copy_loss=ifelse((cnv$total_cn<(cnv$background_cnv*loss_factor)|(cnv$chromosome!="X"&cnv$chromosome!="Y"&cnv$total_cn<2*loss_factor)|(cnv$chromosome=="X"&cnv$total_cn<1*loss_factor)|(cnv$chromosome=="Y"&cnv$total_cn<1*loss_factor)),"deletion",ifelse((cnv$total_cn>cnv$background_cnv*gain_factor)&((cnv$chromosome=="X"&cnv$total_cn>1*gain_factor)|(cnv$chromosome!="X"&cnv$chromosome!="Y"&cnv$total_cn>2*gain_factor)|(cnv$chromosome=="Y"&cnv$total_cn>1*gain_factor)) ,"gain","neutral"))
} else {
cnv$copy_loss=ifelse((cnv$total_cn<(cnv$background_cnv*loss_factor)|cnv$total_cn<2*loss_factor),"deletion",ifelse(cnv$total_cn>(cnv$background_cnv*gain_factor)&cnv$total_cn>2*gain_factor,"gain","neutral"))
}
} else if(cnv_integer=="decimal") {
cnv_factor=0.15
loss_factor=1-cnv_factor
gain_factor=1+cnv_factor
if (donor_sex!="Male"&donor_sex!="Female"){
donor_sex=ifelse((unique(cnv_raw[cnv_raw$chromosome=="X",]$background_cnv)<=1*gain_factor&unique(cnv_raw[cnv_raw$chromosome=="X",]$background_cnv)>=1*loss_factor),"Male","Female")
print(paste0("Sample gender is inferred as ",donor_sex,"!"))
}
if (donor_sex=="Male"){
cnv$copy_loss=ifelse((cnv$total_cn<(cnv$background_cnv*loss_factor)|(cnv$chromosome!="X"&cnv$chromosome!="Y"&cnv$total_cn<2*loss_factor)|(cnv$chromosome=="X"&cnv$total_cn<1*loss_factor)|(cnv$chromosome=="Y"&cnv$total_cn<1*loss_factor)),"deletion",ifelse((cnv$total_cn>cnv$background_cnv*gain_factor)&((cnv$chromosome=="X"&cnv$total_cn>1*gain_factor)|(cnv$chromosome!="X"&cnv$chromosome!="Y"&cnv$total_cn>2*gain_factor)|(cnv$chromosome=="Y"&cnv$total_cn>1*gain_factor)) ,"gain","neutral"))
} else {
cnv$copy_loss=ifelse((cnv$total_cn<(cnv$background_cnv*loss_factor)|cnv$total_cn<2*loss_factor),"deletion",ifelse(cnv$total_cn>(cnv$background_cnv*gain_factor)&cnv$total_cn>2*gain_factor,"gain","neutral"))
}
}
} else if (as.numeric(cnv_factor)>=0&as.numeric(cnv_factor)<1) {
loss_factor=1-cnv_factor
gain_factor=1+cnv_factor
if (donor_sex!="Male"&donor_sex!="Female"){
donor_sex=ifelse((unique(cnv_raw[cnv_raw$chromosome=="X",]$background_cnv)<=1*gain_factor&unique(cnv_raw[cnv_raw$chromosome=="X",]$background_cnv)>=1*loss_factor),"Male","Female")
print(paste0("Sample gender is inferred as ",donor_sex,"!"))
}
if (donor_sex=="Male"){
cnv$copy_loss=ifelse((cnv$total_cn<(cnv$background_cnv*loss_factor)|(cnv$chromosome!="X"&cnv$chromosome!="Y"&cnv$total_cn<2*loss_factor)|(cnv$chromosome=="X"&cnv$total_cn<1*loss_factor)|(cnv$chromosome=="Y"&cnv$total_cn<1*loss_factor)),"deletion",ifelse((cnv$total_cn>cnv$background_cnv*gain_factor)&((cnv$chromosome=="X"&cnv$total_cn>1*gain_factor)|(cnv$chromosome!="X"&cnv$chromosome!="Y"&cnv$total_cn>2*gain_factor)|(cnv$chromosome=="Y"&cnv$total_cn>1*gain_factor)) ,"gain","neutral"))
} else {
cnv$copy_loss=ifelse((cnv$total_cn<(cnv$background_cnv*loss_factor)|cnv$total_cn<2*loss_factor),"deletion",ifelse(cnv$total_cn>(cnv$background_cnv*gain_factor)&cnv$total_cn>2*gain_factor,"gain","neutral"))
}
} else {
print("Value out of range! 'cnv_factor' should be larger or equal to 0 and less than 1")
}
######## merge same copy gain or loss fragment #########
cnv_ori=cnv
if (nrow(cnv)>1){
for (t in 1:(nrow(cnv)-1)){
if (cnv$chromosome[t]==cnv$chromosome[t+1]&cnv$copy_loss[t]==cnv$copy_loss[t+1]){
cnv$start[t+1]=cnv$start[t]
cnv$ovl[t]=1
}
}
}
cnv=cnv[cnv$ovl==0,]
#################################
cnv=merge(cnv,unique(chrlength[c("chrom","chr_size")]),by.x=("chromosome"),by.y=("chrom"))
cnvtotal=rbind(cnvtotal,cnv)
###### get overlapped cnv #####
# ###### trim 10bp from the both end of chrss range to allow accurate cnv overlap #####
# chrss_cluster$start=chrss_cluster$start-10
# chrss_cluster$end=chrss_cluster$end+10
#############
chrss_range=makeGRangesFromDataFrame(chrss_cluster[c("chr","start","end","range_size")],keep.extra.columns = T)
cnv_range=makeGRangesFromDataFrame(cnv,keep.extra.columns = T)
cnv_ori_range=makeGRangesFromDataFrame(cnv_ori,keep.extra.columns = T)
chr_range=makeGRangesFromDataFrame(chrlength,keep.extra.columns = T)
######## determine arm-level deletion if a fragment is larger than 95% of the arm #######
chr_ovl=as.data.frame(findOverlapPairs(cnv_range, chr_range,ignore.strand=TRUE))
chr_ovl$cnv_arm=chr_ovl$first.X.width/chr_ovl$second.arm_size
chr_ovl$del_arm=ifelse(chr_ovl$first.copy_loss=="deletion"&chr_ovl$cnv_arm>=0.95,"arm_deletion","no_arm_deletion")
arm_del_chr=as.character(chr_ovl[chr_ovl$del_arm=="arm_deletion",]$first.X.seqnames)
chr_name=unique(cnv$chromosome)
arm_deletion=vector()
for (c in 1:length(chr_name)){
chr_ovl_chr=chr_ovl[chr_ovl$first.X.seqnames==chr_name[c],]
#### determine by deletion proportion ####
arm_deletion_chr=ifelse(chr_ovl_chr$del_arm[1]=="arm_deletion"|chr_ovl_chr$del_arm[nrow(chr_ovl_chr)]=="arm_deletion",1,0)
##### determine by top deleted #####
arm_deletion=c(arm_deletion,arm_deletion_chr)
}
###### don't consider arm deletion if arm_del is not TRUE ######
if (arm_del_rm!=TRUE) {
arm_del_chr=0 }
######## define telomere loss size ###############
telo_deletion_size=vector()
telo_deletion_ratio=vector()
telo_loss=data.frame()
for (c in 1:length(chr_name)){
cnv_ovl_chr=cnv[cnv$chromosome==chr_name[c],]
telo_deletion_chr_size1=ifelse(((!cnv_ovl_chr$chromosome[1] %in% arm_del_chr) & cnv_ovl_chr$copy_loss[1]=="deletion"),cnv_ovl_chr$end[1]+1,0)
telo_deletion_chr_size2=ifelse(((!cnv_ovl_chr$chromosome[nrow(cnv_ovl_chr)] %in% arm_del_chr) &cnv_ovl_chr$copy_loss[nrow(cnv_ovl_chr)]=="deletion"),cnv_ovl_chr$chr_size[nrow(cnv_ovl_chr)]-cnv_ovl_chr$start[nrow(cnv_ovl_chr)]+1,0)
cnv_ovl_chr$telo_loss[1]=ifelse(((!cnv_ovl_chr$chromosome[1] %in% arm_del_chr) & cnv_ovl_chr$copy_loss[1]=="deletion"),1,0)
cnv_ovl_chr$telo_loss[nrow(cnv_ovl_chr)]=ifelse(((!cnv_ovl_chr$chromosome[nrow(cnv_ovl_chr)] %in% arm_del_chr) &cnv_ovl_chr$copy_loss[nrow(cnv_ovl_chr)]=="deletion"),1,0)
telo_loss=rbind(telo_loss,cnv_ovl_chr[cnv_ovl_chr$telo_loss==1,])
telo_deletion_chr_size=ifelse(nrow(cnv_ovl_chr)==1,telo_deletion_chr_size1,telo_deletion_chr_size1+telo_deletion_chr_size2)
telo_deletion_chr_ratio=telo_deletion_chr_size/(unique(cnv_ovl_chr$chr_size))
telo_deletion_size=c(telo_deletion_size,telo_deletion_chr_size)
telo_deletion_ratio=c(telo_deletion_ratio,telo_deletion_chr_ratio)
}
arm_deletion=max(arm_deletion)
telo_deletion_size_sum=sum(telo_deletion_size)
telo_deletion_size_mean=0
telo_deletion_size_max=0
if (telo_deletion_size_sum>0){
telo_deletion_size_t=telo_deletion_size[telo_deletion_size>0]
telo_deletion_size_mean=mean(telo_deletion_size_t)
telo_deletion_size_mean=ifelse(is.na(telo_deletion_size_mean),0,telo_deletion_size_mean)
telo_deletion_size_max=max(telo_deletion_size_t)
telo_deletion_size_max=ifelse(is.na(telo_deletion_size_max),0,telo_deletion_size_max)
}
telo_deletion_ratio_max=max(telo_deletion_ratio)
telo_deletion_ratio_mean=mean(telo_deletion_ratio)
cluster$arm_deletion[i]=arm_deletion
cluster$telo_deletion_size[i]=telo_deletion_size_sum
cluster$max_telo_loss_percentage[i]=telo_deletion_ratio_max
cluster$Telo_loss_ratio_mean[i]=telo_deletion_ratio_mean
######## determine ovelap cnv in the chrss region ######
olaps <- findOverlaps(cnv_ori_range, chrss_range)
cnv_ovl<- as.data.frame(pintersect(cnv_ori_range[queryHits(olaps)], chrss_range[subjectHits(olaps)]))
if(nrow(cnv_ovl)>0){
cnv_ovl$size=cnv_ovl$width
del_size_mean=mean(cnv_ovl[cnv_ovl$copy_loss=="deletion",]$size)
del_size_mean=ifelse(is.na(del_size_mean),0,del_size_mean)
del_mean_por=del_size_mean/sum(chrss_cluster$range_size)
del_por=sum(cnv_ovl[cnv_ovl$copy_loss=="deletion",]$size)/sum(chrss_cluster$range_size)
del_chr_por=sum(cnv_ovl[cnv_ovl$copy_loss=="deletion",]$size)/sum(chrss_cluster$chr_size)
del_number_10Mb=nrow(cnv_ovl[cnv_ovl$copy_loss=="deletion",])/(sum(chrss_cluster$range_size)/10000000)
del_number_length=nrow(cnv_ovl[cnv_ovl$copy_loss=="deletion",])/(sum(chrss_cluster$range_size))
dup_size_mean=mean(cnv_ovl[cnv_ovl$copy_loss=="gain",]$size)
dup_size_mean=ifelse(is.na(dup_size_mean),0,dup_size_mean)
dup_mean_por=dup_size_mean/sum(chrss_cluster$range_size)
dup_por=sum(cnv_ovl[cnv_ovl$copy_loss=="gain",]$size)/sum(chrss_cluster$range_size)
dup_chr_por=sum(cnv_ovl[cnv_ovl$copy_loss=="gain",]$size)/sum(chrss_cluster$chr_size)
dup_number_10Mb=nrow(cnv_ovl[cnv_ovl$copy_loss=="gain",])/(sum(chrss_cluster$range_size)/10000000)
dup_number_length=nrow(cnv_ovl[cnv_ovl$copy_loss=="gain",])/(sum(chrss_cluster$range_size))
###### cnv state per chr ####
chr_uni=unique(cnv_ovl$seqnames)
cnv_state=vector()
for (k in 1:length(chr_uni)){
cnv_uni=cnv_ovl[cnv_ovl$seqnames==chr_uni[k],]
cnv_state_uni=length(unique(cnv_uni$total_cn))
cnv_state=c(cnv_state,cnv_state_uni)
}
cnv_state_max=max(cnv_state)
cnv_state_mean=mean(cnv_state)
cnv_max=max(as.numeric(cnv_ovl$total_cn))
cluster$CN_state[i] <- cnv_state_max
cluster$cnv_state_mean[i] <- cnv_state_mean
cluster$max_CNV[i] <- cnv_max
cluster$del_size[i] <- del_size_mean
cluster$Loss_size_percentage[i]=del_por
cluster$Loss_chr_percentage[i]=del_chr_por
cluster$dup_size[i] <- dup_size_mean
cluster$Gain_size_percentage[i]=dup_por
cluster$Gain_chr_percentage[i]=dup_chr_por
cluster$Loss_number_10Mb[i]=del_number_10Mb
cluster$Gain_number_10Mb[i]=dup_number_10Mb
cluster$Loss_size_mean_ratio[i]=del_mean_por
cluster$Gain_size_mean_ratio[i]=dup_mean_por
cluster$Loss_number_length[i]=del_number_length
cluster$Gain_number_length[i]=dup_number_length
}
cluster$telo_vs_chrss_del_mean_ratio[i]=(telo_deletion_size_mean)/(cluster$del_size[i]+1)
cluster$telo_vs_chrss_del_max_ratio[i]=(telo_deletion_size_max)/(cluster$del_size[i]+1)
###################### sv ##############
chrss_chr=chrss_cluster$chr
sample=unique(chrss_cluster$sample)
sv=complex_sv[complex_sv$sample==sample&(complex_sv$chrom1 %in% chrss_chr|complex_sv$chrom2 %in% chrss_chr),]
if (nrow(sv)>1){
brk_n=nrow(sv[sv$chrom1 %in% chrss_chr,])+nrow(sv[sv$chrom2 %in% chrss_chr,])
brk10=brk_n/(sum(chrss_cluster$range_size)/10000000)
brk1=brk_n/(sum(chrss_cluster$range_size)/1000000)
### size ###
sv_avg=mean(sv[sv$svtype!="TRA",]$size)
sv_avg=ifelse(is.na(sv_avg),0,sv_avg)
sv$min_range=""
### get all break ####
all_dispersion=vector()
all_brk_dispersion=vector()
all_sd=vector()
all_brk_sd=vector()
all_interval_total=vector()
all_rd=vector()
for (p in 1:length(chrss_cluster$chr)){
sv_chr=chrss_cluster$chr[p]
all_brk=sort(c(sv[sv$chrom1==sv_chr,]$start1,sv[sv$chrom2==sv_chr,]$start2))
all_interval=diff(all_brk)
all_interval_total=c(all_interval_total,all_interval)
##### mean absolute deviation with normalize ########
dispersion_mean=mean(abs(as.numeric(all_interval-mean(all_interval))))/mean(as.numeric(all_interval))
dispersion_mean=ifelse(is.na(dispersion_mean),0,dispersion_mean)
all_dispersion=c(all_dispersion,dispersion_mean)
sd_mean=sd(all_interval)/mean(as.numeric(all_interval))
sd_mean=ifelse(is.na(sd_mean),0,sd_mean)
all_sd=c(all_sd,sd_mean)
brk_dispersion_mean=mean(abs(as.numeric(all_brk-mean(all_brk))))/mean(as.numeric(all_brk))
brk_dispersion_mean=ifelse(is.na(brk_dispersion_mean),0,brk_dispersion_mean)
all_brk_dispersion=c(all_brk_dispersion,brk_dispersion_mean)
}
dispersion_mean_total=mean(abs(as.numeric(all_interval_total-mean(all_interval_total))))/mean(as.numeric(all_interval_total))
dispersion_mean_total=ifelse(is.na(dispersion_mean_total),0,dispersion_mean_total)
all_dispersion_mean=mean(all_dispersion)
cluster$Brk_dispersion_MAD[i] <- all_dispersion_mean
cluster$brk_10mb[i] <- brk10
cluster$brk_1mb[i] <- brk1
cluster$sv_avg_size[i] <- sv_avg
cluster$Brk_dispersion_MAD_mean_total[i] <- dispersion_mean_total
}
}
}
print("CGR feature computing is done!")
cluster=cluster[c("sample","cluster_id","max_CNV","Loss_size_percentage","Gain_size_percentage","max_telo_loss_percentage","Brk_dispersion_MAD_mean_total")]
# chrss_cluster_feature=list("cluster_feature"=cluster,"cnv_baseline"=cnvtotal)
chrss_cluster_feature=list("cluster_feature"=cluster)
filename=ifelse(prefix=="","CGR_feature_matrix.csv",paste0(prefix,"_CGR_feature_matrix.csv"))
write.csv(cluster,filename,row.names = F)
return (chrss_cluster_feature)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.