##########################################################################
############################ Main Function ###############################
##########################################################################
#Top level processing script
#marker_file-Tab delmited table with counts of reads aligned to RADseq cut sites,
# first column is strain name, first row is marker chromosome,
# second row is marker (restriction site) position within chromosome
# Assumes markers are sorted by chromosome.
#chromosome_file - Tab delimited table with columns: strain name, chr (mt, 1-7, R,total reads)
# values for chromosome columns are the proportion of all reads in that strain aligned
# to that chromosome. Used to infer relative chromosome copy numbers
#non_diploid_file - Script assumes euploids are diploid, this file allows this to be overriden,
# tab delimted two columns: strain name and median chromosome copy number
#offset_file - Converts by-chromosome marker numbering to genome-wide numbering,
# two columns:chromosome name and offset to convert within-chromosome to genome-wide marker positioning
#coverage_output_dir - Output directory for coverage plots without allele ratios
#log - logfile path
#logfile - logfile name
#het_dir - Output directory for coverage plots with allele ratios
#data_dir - Output directory for data files
#p1_table - Table of counts of "A" alleles at het sites for all strains. First column=markernames (concatenated chromosome and position)
# second column=allele (i.e. base), remaining columns= strains, first row=header ("Allele", strain names)
#p2_table - Table of counts of "B" alleles at het sites for all strains. First column=markernames (concatenated chromosome and position)
# second column=allele (i.e. base), remaining columns= strains, first row=header ("Allele", strain names)
#seg_flag - Flag for whether segmental CNV should be activated for allele ratios
#segmental_file - Identifies segmental patterns, tab-delimited file, first column
# is strain, second is chromosome and third is left to right description of segment copy numbers
# e.g. 12 = copy number one for left end of chromosome and copy number 2 for right end. Maximum of 3 segments.
process_yseq=function(marker_file,chromosome_file,non_diploid_file,offset_file, coverage_output_dir, log, logfile,het_dir,data_dir,p1_table,p2_table,segmental_file,seg_flag){
#Input offsets file for converting from by-chromosome to whole-genome indexing
offsets=read.table(offset_file, sep="\t", stringsAsFactors=FALSE)
#Input counts as list.
#First element is a matrix of the marker counts with rowname being the strains, and no colnames (marker names) as these are held in the next element.
#Second element is vector of chromosome component of the marker positions
#Third element is vector of coverage marker positions within chromosomes (bp)
count_list=input_counts(marker_file)
#Ensure markers are sorted numerically within chromosomes
new_order=order(count_list[[2]],as.numeric(count_list[[3]])) #sort marker numerically within chromosomes
count_list[[1]]=count_list[[1]][,new_order]
count_list[[2]]=count_list[[2]][new_order]
count_list[[3]]=count_list[[3]][new_order]
#Visualize proportion of missing values by strain and column to assess cutoffs
#for filtering
assess_cutoffs(count_list)
#Do strain and marker trimming, trim_rows: edits only (rows) of counts matrix in count_list[[1]]
#trim_cols: trims columns in counts matrix in count_list[[1]] and marker names in count_list[[2]] and count_list[[3]]
count_list=trim_rows(count_list,0.05) #strain must have at least 5% of markers called
count_list=trim_cols(count_list,0.5) #marker must be called in at least 50% of (remaining) strains
#Specify which strains are to be used as the normalization set
#looking for name pattern match (identifies row numbers in the counts matrix)
euploid_strain_indices=c(grep("9318",rownames(count_list[[1]])),grep("AF7",rownames(count_list[[1]])))
#Remove any columns that have 0 counts in more than 20% of euploid_strains
count_list=trim_on_euploids(count_list,euploid_strain_indices,0.8)
#Normalize the counts in each strain to make the median strain value equal to 1 (ignoring 0 counts)
#Hold normalized ratios in counts list element 4
count_list=median_strain_norm(count_list)
#Convert to whole genome numbering
#Hold in counts list element 5
count_list=add_whole_genome_numbering(count_list,offsets)
#For each marker, find median ratio among euploids, hold as counts list element 6.
#This allows normalizing for marker-specific effects on coverage for all other strains.
count_list=add_euploid_medians(count_list,euploid_strain_indices)
#Assess noisiness (SD) of each marker after euploid normalization as counts list element 7
count_list=add_noise_assess(count_list,euploid_strain_indices)
#Plot normalized coverage by marker (with noisiness (SD) cutoff of 0.5)
plot_coverage(coverage_output_dir,count_list,0.5)
#Estimate chromosome copy numbers for each strain from proportion of aligned reads that are aligned to each chromosome
#Note assumes the median chromosome copy number for each strain is 2 (so euploids would be diploid),
#exceptions must be specified in the non_diploid_file
all_ploidy=normalize_chromosome_file(chromosome_file,non_diploid_file)
ploidy=t(all_ploidy[[5]])
#Assign chromosome names
rownames(ploidy)=c("Ca21chr1_C_albicans_SC5314","Ca21chr2_C_albicans_SC5314","Ca21chr3_C_albicans_SC5314","Ca21chr4_C_albicans_SC5314","Ca21chr5_C_albicans_SC5314","Ca21chr6_C_albicans_SC5314","Ca21chr7_C_albicans_SC5314", "Ca21chrR_C_albicans_SC5314")
#Input tables of allele counts (reads matching "A" allele and reads matching "B" allele at each SNP site) for each strain
allele_count_list=count_alleles(rownames(count_list[[1]]),p1_table,p2_table)
#Convert allele table marker names into a table with two columns, first is chromosome on which marker is found and second is position (bp) of marker
#Then sort table first on chromosome and then on marker position, sort allele count table in same way
marker_table=t(sapply(rownames(allele_count_list[[1]]),function(x){y=strsplit(x,"SC5314_")[[1]];y[1]=paste(y[1],"SC5314",sep="");y}))
new_order=order(marker_table[,1],as.numeric(marker_table[,2]))
allele_count_list[[1]]=allele_count_list[[1]][new_order,]
allele_count_list[[2]]=allele_count_list[[2]][new_order,]
marker_table=marker_table[new_order,]
#Calculate model probabilities based on observed allele counts
#This is done for all models (1A:0B, 3A:1B,2A:1B,1A:1B,1A:2B,1A:3B,0A:1B)
#Note that counts of zero will result in log p-values of zero under all models
model_prob_array=calc_model_probs(allele_count_list)
#Calculate the best model across all models
#and the best model based on 1,2,3 or 4 copies of chromosomes
best_model=calc_best_model(model_prob_array)
#Calculate the score of the best model across all possible models
#and the score of the best model based on 1, 2,3 or 4 copies of chromosomes
lod_scores=calc_best_score(model_prob_array)
#Expand ploidy table to match number of markers in model and score tables
#i.e. specify for each marker the copy number of the chromosome on which it is found.
#At this point assumes no segmental copy number variation, only whole chromosome.
#Segmentals can be specified later
ploidy_table=create_ploidy_table(best_model[,,1],ploidy)
#Method not designed for copy numbers better than 4, treat these all as a single class of copy number "5" (see below)
ploidy_table[ploidy_table>4]=5
#Override ploidy for segmentals based on user-supplied known segmental transition patterns
if (seg_flag==1){
ploidy_table=process_segmentals(segmental_file,count_list,ploidy,ploidy_table,marker_table)
}
#Extract best allele ratio model and corresponding lod score for each strain/marker constrained
#by copy number (identical for all markers on the same chromosome except for segmentals specified above)
#Note that the placeholder "5" copy number is treated as unconstrained - best of all models is used
final_model_and_score=extract_final_model_and_score(best_model,lod_scores,ploidy_table)
#Convert marker positions into whole-genome-positions
wgs_table=extract_chrom_and_wg_pos(final_model_and_score, offsets)
#Plot allele ratios along with coverage plots with whole-genome scaling
#with colors indicating allele ratio (copy number "5" treated differently from others)
plot_coverage_with_het(het_dir,count_list,0.5,wgs_table,final_model_and_score,ploidy_table)
#Output data
rownames(ploidy_table)=rownames(lod_scores)
save(allele_count_list,file=paste(data_dir,"/allele_count_list_r",sep=""))
save(model_prob_array,file=paste(data_dir,"/model_prob_array_r",sep=""))
save(best_model,file=paste(data_dir,"/best_model_r",sep=""))
save(lod_scores,file=paste(data_dir,"/lod_scores_r",sep=""))
save(ploidy_table,file=paste(data_dir,"/ploidy_table_r",sep=""))
save(final_model_and_score,file=paste(data_dir,"/final_model_and_score_r",sep=""))
#Output ploidy table as text file
write.table(ploidy_table,file=paste(data_dir,"/ploidy_table.txt",sep=""),sep="\t",quote=F)
#Output log file
write.table(log,file=logfile,col.names=F,row.names=F,quote=F)
}
##########################################################################
######################### Accessory Functions ############################
##########################################################################
#Replaces the ploidy calls in ploidy_table (one for each marker) for
#segmental chromosomes. Normally,ploidy calls are the same for all markers
#on a chromosome, for the segmentals these are replaced with a ploidy call
#for each segment, with edges assessed from marker coverage. The ordered
#set of segmental copy numbers are provided by the user (e.g. after manual
#examination of the coverage plots) in table format, with strain and
#chromosome as the first two columns. The third column is a number
#describing the (left to right) segmental copy numbers, e.g. "12" means
#two segments, with the left end of the chromosome having copy number 1
#and the right, copy number 2. The positions of the transitions between the
#segments are estimated by maximum likelihood based on the normalized coverage
#counts for the RADseq markers assuming normal distribution with means reflecting
#segment copy number
#Can deal with 3 segments at most.
#
#Returns edited ploidy_table
#
#Arguments: segmental chromosome file, counts list,chromosome ploidy (ploidy),
#allele marker ploidy table (ploidy_table), allele count list
process_segmentals=function(segmental_file,count_list,ploidy,ploidy_table,marker_table){
#Read table defining segmental transitions
seg_table=read.table(segmental_file, sep="\t",stringsAsFactors=F)
#Process segmental chromosomes one at a time, replacing corresponding ploidy
#in ploidy_table with new estimates
for (i in 1:length(seg_table[,1])){
#Extract current strain and chromosome and transition pattern
current_strain=seg_table[i,1]
current_chrom=seg_table[i,2]
transitions=as.numeric(strsplit(as.character(seg_table[i,3]),"")[[1]])
#If segmental strain not present in count list move on to next strain
if(!(current_strain %in% rownames(count_list[[4]]))){print (c("Missing strain ", current_strain));next}
#Identify the median coverage ratio for the current strain and
#the copy number that this reflects. Convert euploid-normlized
#coverage ratios (count_list[[4]] for this strain divided by
#euploid medians in count_list[[6]]) to ploidies by multiplying
#ratios (gobal median==1) by median ploidy for strain
#Take log2 of ratios as expect them to be log normal
count_ratios=count_list[[4]][current_strain,]/count_list[[6]]
median_ploidy=median(ploidy[,current_strain])
count_ratios=log(count_ratios*median_ploidy,2)
#Remove ratio 0 values (log -Inf) from ratios and marker chromosome and position IDs
filter=!(count_ratios== -Inf)
all_chroms=count_list[[2]][filter]
all_pos=count_list[[3]][filter]
count_ratios=count_ratios[filter]
#Extract just chromosome of current interest and extract number and position of markers
filter=all_chroms==current_chrom
current_counts=count_ratios[filter]
no_pos=length(current_counts) #number of markers
current_pos=all_pos[filter] #marker positions
#Expect count ratios to be approximately log normal around local copy number
#Create log p-value matrix based on the probability density of each observed
#normalized count ratio (pointwise ploidy estimate) under normal distributions with
#means given by each segmental ploidy. Count ratios already log transformed before calculation but
#log transform the segmental specified copy numbers (expected means of the normal distributions) here.
#Note uses an estimate of the the SD of all log ratios from this strain, using an 11 marker sliding window
#and taking the median value
#Returned matrix has marker positions as cols and rows as the ordered ploidies
#of the transition vector (top to bottom for segments left to right) with log p-values for each marker
#under each model. Later, combine these at different transition points to get global likelihood estimates
global_sd=median(sapply(1:(length(count_ratios)-10),function(x){y=count_ratios[x:(x+10)];sd(y)}))
p_mat=t(sapply(transitions,function(x){(dnorm(current_counts,log(x,2),global_sd,log=T))}))
#Find the transition marker index/indices between segments that maximize(s) the data likelihood
#for the normalized count ratios (pointwise plody estimates)
best_transition_points=c()
if (length(transitions)==2){
trans_score_vect=sapply(1:(no_pos-1),function(x){
first_end=x
second_start=x+1
sum(p_mat[1,1:first_end],p_mat[2,second_start:(no_pos)])
})
best_transition_points=which(trans_score_vect==max(trans_score_vect))
} else {
#For 2 transitions
trans_score_mat=matrix(nrow=no_pos,ncol=no_pos)
if (length(transitions)==3){
for (x in 1:(no_pos-2)){
first_end=x
second_start=x+1
for (y in x:(no_pos-1)){
second_end=y
third_start=y+1
trans_score_mat[x,y]=sum(p_mat[1,1:first_end],p_mat[2,second_start:second_end],p_mat[3,third_start:no_pos])
}
}
best_transition_points=as.numeric(which(trans_score_mat==max(trans_score_mat,na.rm=T),arr.ind=T)[1,])
}
}
#Add left end position (0) to transition points to make it same length as segment state vector (transitions)
#i.e. one transition means two segments (0-transition point, transition point-end)
best_transition_points=current_pos[best_transition_points]
best_transition_points=c(0,best_transition_points)
#Get vector of ploidies and of positions for all allele markers on this chromosome, for this strain
#Note that these are the allele markers NOT coverage markers and coverage ratios as have been using up to now
current_ploidies=ploidy_table[rownames(ploidy_table)==current_chrom,current_strain]
current_marker_pos=as.numeric(marker_table[,2])[marker_table[,1]==current_chrom]
#Replace marker ploidies based on what segment of the chromosome they fall in based on the transition positions
for (i in 1:length(transitions)){
current_ploidies[current_marker_pos>best_transition_points[i]]=transitions[i]
}
#Replace ploidies into full ploidy table for this chromosome and this strain
ploidy_table[rownames(ploidy_table)==current_chrom,current_strain]=current_ploidies
}
return (ploidy_table)
}
#For each strain, plot normalized coverage ratios and smoothed
#allele ratios.
#
#Arguments: output directory for images, counts list, threshold for
#removing noisy markers based on their SD in euploid strains, marker
#table (first column=chrom, second column= genome-wide position),
#final model and score list (two elements, first is best model given
#ploidy constraint for marker, second is LOD vs second best model),
#ploidy table, giving estimated copy number of each allele marker
#(from whole chromosome, or segmental copy number)
plot_coverage_with_het=function(outdir,counts, noisy_threshold,wgs_table,final_model_and_score,ploidy_table){
#Define colors for alternating chromosomes and for alleles based on best model (models 1:7 are: HomA, 3A:1B,2A:1B,1A:1B,1A:2B,1A:3B,HomB)
customcolors=rep(c("black", "grey70"), 20)
silly_cols=as.numeric(as.factor(counts[[2]]))
#Use different color schemes for the 7 allele ratios for each copy numbers. Note that many of these are placeholders
#as models imposible for that coverage (e.g. 3:1 ratio when copy number =3)
#also note that copy number "5" (any copy number >4) uses all potential models but colors them
#to indicate HomA, A>B, A=B, A**10 & counts[[7]]0){
png(out,width = 960, height = 480)
plot(as.numeric(counts[[5]])[filter],current_norm[filter],pch=19,cex=0.5,log="y",ylim=c(0.2,5), col=customcolors[silly_cols[filter]],main=rownames(counts[[1]])[i], ylab="Normalized Coverage",xlab="Genomic Position")
filter2=final_model_and_score[[1]][,strain]>0 & final_model_and_score[[2]][,strain]>log(2,10)
#Then add the median-smoothed (window=7) allele ratio call identified by color
allele_nums=runmed(as.numeric(final_model_and_score[[1]][filter2,strain]),7)
ploidy_nums=runmed(as.numeric(ploidy_table[filter2,strain]),7)
points(as.numeric(wgs_table[filter2,2]),jitter(rep(4,length(wgs_table[filter2,2])),amount=0.4),cex=0.6,col=allele_cols[cbind(allele_nums,ploidy_nums)])
dev.off()
}
}
}
#Takes an initial ploidy table giving copy number of each chromosome (rows=strains, cols=chromosomes)
#and a matrix whose rownames are strains and colnames are markers.
#
#Returns a matrix with the same
#dimensions as the input matrix, but giving the copy number of each marker in each strain set to equal
#the copy number of the appropriate chromosome in that strain
#
#Arguments: Template strain and marker matrix, initial ploidy table
create_ploidy_table=function(initial_table,ploidy){
#Initialize output matrix
ploidy_table=initial_table*0
#Extract chromosome vector from markernames of inital matrix
chroms=as.character(sapply(rownames(initial_table),function(x){
paste(strsplit(x,"SC5314_")[[1]][1],"SC5314",sep="")
}))
#Fill in marker copy number from corresponding chromosome copy number, default to 0 as error
ploidy_table=sapply(chroms,function(x){
sapply(colnames(initial_table),function(y){
cov=0
if((x %in% rownames(ploidy)) & (y %in% colnames(ploidy))){cov=ploidy[x,y]}
cov
})
})
return (t(ploidy_table))
}
#Extracts chromosome and marker position from concatenated unique markernames
#present as rownames in object provided as argument. Markernames are converted from
#chromosome-based to whole-genome-based using offsets table (first column is chrom name, second
#is offset to give whole genome position, for markers on that chromosome).
#
#Returns table with first column = chromosome and second column = whole genome marker position
#
#Arguments: Table with concatenated marker names as row names,
extract_chrom_and_wg_pos=function(final_model_and_score, offsets){
#Split concatenated marker name into chromosome and within-chromosome position as 2 columns
marker_info=t(sapply(rownames(final_model_and_score[[1]]),function(x){
y=strsplit(x,"SC5314_")[[1]]
y[1]=paste(y[1],"SC5314",sep="")
y
}))
#Renumber position using offsets so that is genome-wide
for (m in 1:length(offsets[,1])){
current_chrom=offsets[m,1]
current_offset=as.numeric(offsets[m,2])
marker_info[marker_info[,1]==current_chrom,2]=as.numeric(marker_info[marker_info[,1]==current_chrom,2])+current_offset
}
return(marker_info)
}
#Extracts best allele model and LOD vs second best model for each marker in each strain based on
#constraint of copy number (1-4) of the marker (previously derived from chromosome or segment copy number)
#Uses LOD scores and best model scores - held in arrays with rows=strains, cols=markers and 5 levels
#1=haploids/monosomes, 2=disomes/diploids, 3=trisomes/triploids, 4=tetrasomes/tetraploids, 5=unconstrained
#Fifth level is used for error ploidies (>4)
#
#Returns list with first element giving best model (1:7, HomA,3A:1B, 2A:1B, 1A:1B,1A:2B,1A:3B,HomB) for each marker
#given ploidy constraint and second element giving LOD best model vs second best model for each marker.
#Both elements are matrices with strains=rows, cols=markers
#
#Arguments:best model array (first dimension=strains, second=markers, third is ploidy (1:5), entries are best model constrained by marker copy number),
#LOD array - same structure as model array, but holds LOD score of best model vs second best model (constrained by marker copy number),
#ploidy table (rows=strains,cols=markers, values=copy number of that marker (i.e. chromosome))
extract_final_model_and_score=function(best_model,lod_scores,ploidy_table){
out_list=list()
#For each copy number constraint, populate the best model from the best model array for markers with that copy number
final_model=best_model[,,5] #For error ploidy (not 1-4) give best overall model
final_model[ploidy_table==1]=best_model[,,1][ploidy_table==1]
final_model[ploidy_table==2]=best_model[,,2][ploidy_table==2]
final_model[ploidy_table==3]=best_model[,,3][ploidy_table==3]
final_model[ploidy_table==4]=best_model[,,4][ploidy_table==4]
#For each copy number constraint, populate the LOD of the best model from the LOD array
final_score=lod_scores[,,5] #For error ploidy (not 1-4) give LOD of best overall model vs second best overall model
final_score[ploidy_table==1]=lod_scores[,,1][ploidy_table==1]
final_score[ploidy_table==2]=lod_scores[,,2][ploidy_table==2]
final_score[ploidy_table==3]=lod_scores[,,3][ploidy_table==3]
final_score[ploidy_table==4]=lod_scores[,,4][ploidy_table==4]
out_list[[1]]=final_model
out_list[[2]]=final_score
return (out_list)
}
#Calculates the LOD score of the best model versus the second best model (e.g. homA vs 1A:1B) for each marker in each strain under 5 copy number constraints:
#N=1 - compare HomA, HomB
#N=2 - compare HomA, 1A:1B, HomB
#N=3 - compare HomA, 2A:1B, 1A:2B, HomB
#N=4 - compare HomA, 3A:1B, 2A:2B (1A:1B), 1A:3B, HomB
#N="5" - Unconstrained - best model across all models -HomA, 3A:1B, 2A:1B, 1A:1B, 1A:2B,1A:3B, HomB
#
#Returns array with rows=strains, cols=markers and 5 levels, corresponding to the copy number models above in order 1:5.
#Each entry is the LOD best vs second best for that ploidy constraint.
#
#Arguments:model probability array (log probabilities from model probability array (rows=strains, cols=markers, 7 levels one for each model: HomA, 3A:1B, 2A:1B, 1A:1B, 1A:2B,1A:3B, HomB)).
#Note that these are binomial probabilities without the permutation term as permutation term is the same under all models and cancel when LOD calculated.
calc_best_score=function(model_prob_array){
out_array=array(data = 0, dim = c(length(model_prob_array[,1,1]),length(model_prob_array[1,,1]),5),dimnames=list(rownames(model_prob_array),colnames(model_prob_array),c(1:5)))
#Calculate LOD of best model vs second best model across all models (N="5")
out_array[,,5]=sapply(1:length(model_prob_array[1,,1]),function(x){
apply(model_prob_array[,x,],1,function(y){
z=y[order(as.numeric(y),decreasing=TRUE)]
z[1]-z[2]
})
})
#For only models consistent with N=1
out_array[,,1]=sapply(1:length(model_prob_array[1,,1]),function(x){
apply(model_prob_array[,x,c(1,7)],1,function(y){
z=y[order(as.numeric(y),decreasing=TRUE)]
z[1]-z[2]
})
})
#For N=2
out_array[,,2]=sapply(1:length(model_prob_array[1,,1]),function(x){
apply(model_prob_array[,x,c(1,4,7)],1,function(y){
z=y[order(as.numeric(y),decreasing=TRUE)]
z[1]-z[2]
})
})
#For N=3
out_array[,,3]=sapply(1:length(model_prob_array[1,,1]),function(x){
apply(model_prob_array[,x,c(1,3,5,7)],1,function(y){
z=y[order(as.numeric(y),decreasing=TRUE)]
z[1]-z[2]
})
})
#For N=4
out_array[,,4]=sapply(1:length(model_prob_array[1,,1]),function(x){
apply(model_prob_array[,x,c(1,2,4,6,7)],1,function(y){
z=y[order(as.numeric(y),decreasing=TRUE)]
z[1]-z[2]
})
})
return (out_array)
}
#Identifies the best model (e.g. homA or 1A:1B) for each marker in each strain under 5 copy number constraints:
#N=1 - compare HomA, HomB
#N=2 - compare HomA, 1A:1B, HomB
#N=3 - compare HomA, 2A:1B, 1A:2B, HomB
#N=4 - compare HomA, 3A:1B, 2A:2B (1A:1B), 1A:3B, HomB
#N="5" - Unconstrained - best model across all models -HomA, 3A:1B, 2A:1B, 1A:1B, 1A:2B,1A:3B, HomB
#
#Returns array with rows=strains, cols=markers and 5 levels, corresponding to the copy number models above in order 1:5.
#Each entry is the best for that ploidy constraint (1:7 = HomA, 3A:1B, 2A:1B, 1A:1B, 1A:2B,1A:3B, HomB).
#
#Arguments:model probability array (log probabilities from model probability array (rows=strains, cols=markers, 7 levels one for each model: HomA, 3A:1B, 2A:1B, 1A:1B, 1A:2B,1A:3B, HomB).
#Note that these are binomial probabilities without the permutation term as permutation term is the same under all models and cancel when LOD calculated.
calc_best_model=function(model_prob_array){
out_array=array(data = 0, dim = c(length(model_prob_array[,1,1]),length(model_prob_array[1,,1]),5),dimnames=list(rownames(model_prob_array),colnames(model_prob_array),c(1:5)))
#Find best model across all models
print ("Testing All Models")
out_array[,,5]=sapply(1:length(model_prob_array[1,,1]),function(x){
apply(model_prob_array[,x,],1,function(y){
c(1:7)[order(as.numeric(y),decreasing=TRUE)][1]
})
})
#For only models consistent with N=1
print ("Testing N=1")
out_array[,,1]=sapply(1:length(model_prob_array[1,,1]),function(x){
apply(model_prob_array[,x,c(1,7)],1,function(y){
c(1,7)[order(as.numeric(y),decreasing=TRUE)][1]
})
})
#For only models consistent with N=2
print ("Testing N=2")
out_array[,,2]=sapply(1:length(model_prob_array[1,,1]),function(x){
apply(model_prob_array[,x,c(1,4,7)],1,function(y){
c(1,4,7)[order(as.numeric(y),decreasing=TRUE)][1]
})
})
#For only models consistent with N=3
print ("Testing N=3")
out_array[,,3]=sapply(1:length(model_prob_array[1,,1]),function(x){
apply(model_prob_array[,x,c(1,3,5,7)],1,function(y){
c(1,3,5,7)[order(as.numeric(y),decreasing=TRUE)][1]
})
})
#For only models consistent with N=4
print ("Testing N=4")
out_array[,,4]=sapply(1:length(model_prob_array[1,,1]),function(x){
apply(model_prob_array[,x,c(1,2,4,6,7)],1,function(y){
c(1,2,4,6,7)[order(as.numeric(y),decreasing=TRUE)][1]
})
})
return (out_array)
}
#Calculates the probability of the observed a and b allele frequencies at each strain/marker
#under 7 different haplotype models:
#homozygous A (B error at 0.01), 3A:1B, 2A:1B,1A:1B (or 2A:2B), 1A:2B, 1A:3B, hom B (A error at 0.01)
#
#Returns array, rows=strains, cols=markers and 3rd dimension has 7 levels each corresponding to one of the models above in order 1:7.
#Each entry is the observed binomimal probability of the observed allele counts in that strain, for that marker, using the current model.
#NOTE THAT PERMUTATION COMPONENT OF PROBABILITY HAS BEEN DROPPED AS CANCEL OUT IN CALCULATING LOD
#SCORES BETWEEN MODELS AT LATER STEP
#
#Arguments: Allele count list (first element is A counts, second is B counts, counts in table, rows are markers, first column is allele name (A T G or C), rest are strains with values A or B allele counts at that marker in that strain)
calc_model_probs=function(allele_count_list){
#First get the allele counts (drop the vector of alleles from the first column)
a_counts=allele_count_list[[1]][,-1]
b_counts=allele_count_list[[2]][,-1]
#Create an array with 7 levels (one for each model) and each level a matrix with rows=strains and cols=markers
#each entry is the observed bionimal probability of the observed allele counts in that strain, for that marker
#, using the current model. Note that the permutation term has been removed from the p-value as these cancel
#out when calculating LOD scores between models
model_calcs=array(data = 0, dim = c(length(a_counts[,1]),length(a_counts[1,]),7),dimnames=list(rownames(a_counts),colnames(a_counts),c(1:7)))
model_calcs[,,1]=as.matrix(log(0.99,10)*a_counts+log(0.01,10)*b_counts)
model_calcs[,,2]=as.matrix(log(3/4,10)*a_counts+log(1/4,10)*b_counts)
model_calcs[,,3]=as.matrix(log(2/3,10)*a_counts+log(1/3,10)*b_counts)
model_calcs[,,4]=as.matrix(log(0.5,10)*a_counts+log(0.5,10)*b_counts)
model_calcs[,,5]=as.matrix(log(1/3,10)*a_counts+log(2/3,10)*b_counts)
model_calcs[,,6]=as.matrix(log(1/4,10)*a_counts+log(3/4,10)*b_counts)
model_calcs[,,7]=as.matrix(log(0.01,10)*a_counts+log(0.99,10)*b_counts)
return(model_calcs)
}
#Creates allele count list with element 1 being the counts of a alleles at each observed marker in all
#strains, and the 2nd element being the b count. Columns after first are strain names, rows are markers with markernames
#as row names. First column is the allele (a or b as appropriate, with col name "Allele", values will be A, T, G or C)
#
#Returns allele count list
#
#Arguments: strain names and "A" and "B" allele count tables
count_alleles=function(labels,p1_file,p2_file){
out_list=list()
marker_names=c() #Vector of all marker names across all strains (no duplicate markers)
#Input the allele call files
p1_table=read.table(p1_file, sep="\t", stringsAsFactors=FALSE,check.names=F,header=T)
p2_table=read.table(p2_file, sep="\t", stringsAsFactors=FALSE,check.names=F,header=T)
#Set markernames as rownames
rownames(p1_table)=p1_table[,1]
rownames(p2_table)=p2_table[,1]
p1_table=p1_table[,-1]
p2_table=p2_table[,-1]
reverse_filter=labels %in% colnames(p1_table)
p1_table=p1_table[,c("Allele",labels[reverse_filter])]
p2_table=p2_table[,c("Allele",labels[reverse_filter])]
#Remove any positions with only a single non-zero value
p1_good=apply(p1_table,1,function(x){length(which(x>0))>1})
p2_good=apply(p2_table,1,function(x){length(which(x>0))>1})
filter=p1_good & p2_good
p1_table=p1_table[filter,]
p2_table=p2_table[filter,]
#output the a and b allele count tables
out_list[[1]]=p1_table
out_list[[2]]=p2_table
return (out_list)
}
#Plots normalized coverage using counts list
#Markers are filtered based on their noisiness in the euploid control strains
#
#Arguments: output directory for images, counts list and threshold for filtering noisy markers
plot_coverage=function(outdir,counts, noisy_threshold){
#Set alternating grey and black colors for chromosomes
customcolors=rep(c("black", "grey70"), 20)
silly_cols=as.numeric(as.factor(counts[[2]]))
#Output results on a by-marker basis, using the noisy threshold to clean up the marker noise
#and only plotting points with at least a count of 10 in the current strain
for (i in 1:length(rownames(counts[[1]]))){
#Set output filename and normalize current strain counts (already strain median-normalized) using
#euploid strain medians (controls for systemic high- and low-coverage markers caused by fragment length effects etc)
out=paste(outdir,rownames(counts[[1]])[i],".png",sep="")
current_norm=counts[[4]][i,]/counts[[6]]
#No plottting if no markers pass filters
filter=counts[[1]][i,] >10 & counts[[7]]0){
png(out,width = 960, height = 480)
plot(current_norm[filter],pch=19,cex=0.5,log="y",ylim=c(0.1,10), col=customcolors[silly_cols[filter]],main=rownames(counts[[1]])[i], ylab="Normalized Coverage")
dev.off()
}
}
}
#Assess marker standard deviation in euploids , allowing highly noisy markers
#to be identified and filtered later. Measures SD of log transformed normalized
#coverage for each marker in the euploids.
#
#Returns vector of log SD values as counts list element 7.
#
#Arguments:counts list (uses element 4 - matrix of counts normalized for each strain and element 6 -
#median normalized ratio across all euploids) and euploid strain indices within counts list element 4
add_noise_assess=function(counts,euploid_strain_nos){
#Normalize euploid markers for marker-specific effects on coverage
temp_norm=t(apply(counts[[4]][euploid_strain_nos,],1,function(x){x/counts[[6]]}))
#Measure SD of log transformed values for each marker, ignoring 0 values
counts[[7]]=apply(temp_norm,2,function(x){sd(log(x[x>0]))})
hist(counts[[7]],xlab="SD of Normalized Coverage", main="")
return(counts)
}
#Add vector of euploid medians to counts list as element 6
#based on counts elements 4 (median normalized counts for each strain)
#
#Returns modified list.
#
#Arguments: counts list and euploid strain indices as arguments
add_euploid_medians=function(counts,euploid_strain_indices){
#Calculate median (ignoring 0 values) for each marker in euploids
counts[[6]]=apply(counts[[4]][euploid_strain_indices,],2,function(x){median(x[x>0])})
return(counts)
}
#Uses offset file to covert coverage marker positions within each chromosome
#to genome-wide positions.Takes offset filename and counts list and converts
#counts list element 3 (position) to genome-wide version as element 5
#
#Returns modified counts list with genome-wide marker positions as element 5
#
#Takes counts list and offset filename as arguments
add_whole_genome_numbering=function(counts,offsets){
#Get within-chromosome indexing vector
wg_pos_names=counts[[3]]
#Process chromosomes one at a time
for (m in 1:length(offsets[,1])){
#Get offset for current chromosome
current_chrom=offsets[m,1]
current_offset=as.numeric(offsets[m,2])
#Match chromosome (list element 2) to current chromosome and offset positions accordingly
wg_pos_names[counts[[2]]==current_chrom]=wg_pos_names[counts[[2]]==current_chrom]+current_offset
}
counts[[5]]=wg_pos_names
return(counts)
}
#Input counts file and convert to first element in a list, held as a matrix
#with rowname being the strains, and no colnames, each row is a strain and each column
#is a marker, values are counts for that strain/marker combo. Second element is a vector
#of the chromosome names for the markers. Third element is marker position within its chromosome (bp)
#Original counts file should have: first column = strain name, rest of columns
#are the marker counts, rows are strains. There may also be a final total counts column
#("Total_Reads"), which is removed. First row is chromosome, second row is base position on that chromosome.
#
#Returns counts list with 3 elements described above
#
#Arguments: counts filename as argument.
input_counts=function(file_name){
#Input data
counts=list()
counts[[1]]=read.table(file_name, sep="\t", stringsAsFactors=FALSE)
#Remove totals column if exists
last_col=length(counts[[1]][1,])
if(counts[[1]][1,last_col]=="Total_Reads"){counts[[1]]=counts[[1]][,-last_col]}
#Extract rownames
rownames(counts[[1]])=counts[[1]][,1]
counts[[1]]=counts[[1]][,-1]
#Get chrom and position
counts[[2]]=as.character(counts[[1]][1,])
counts[[3]]=as.numeric(counts[[1]][2,])
counts[[1]]=counts[[1]][c(-1,-2),]
#Convert NA to 0 and make into numeric matrix
temp=rownames(counts[[1]])
counts[[1]][is.na(counts[[1]])]=0
counts[[1]]=as.matrix(counts[[1]])
counts[[1]]=apply(counts[[1]],2,as.numeric)
rownames(counts[[1]])=temp
return(counts)
}
#Visualize proportion of 0 marker calls by strain and by marker position
#allows user to define cutoffs for trimming low-coverage strains and markers
#
#Arguments: Counts list (uses element 1 - raw counts per strain)
assess_cutoffs=function(counts){
#Examine distribution of missing data by row (strain) and by column (marker)
col_prop=apply(counts[[1]],2,function(x){x=as.numeric(x); sum(x>0)/length(x)})
row_prop=apply(counts[[1]],1,function(x){x=as.numeric(x); sum(x>0)/length(x)})
hist(col_prop,xlab="Proportion of Alleles Called",main="Markers")
hist(row_prop,xlab="Proportion of Alleles Called",main="Strains")
}
#Remove strains that have too much missing data using cutoff defined by user.
#
#Returns counts list with modified element 1, with strains failing cutoff removed
#
#Arguments: counts list and missing strain cutoff
trim_rows=function(counts,strain_cutoff){
#Examine distribution of missing data by strain
row_prop=apply(counts[[1]],1,function(x){x=as.numeric(x); sum(x>0)/length(x)})
good_strains=c(row_prop>strain_cutoff)
counts[[1]]=counts[[1]][good_strains,]
return(counts)
}
#Remove markers that have too much missing data
#Cutoff defined by user.
#
#Returns modified counts list with elements 1 (counts), 2 (marker chromosome) and 3 (marker bp position in chromosome)
#trimmed to remove markers with high missing data.
#
#Arguments: counts list and missing marker cutoff as arguments
trim_cols=function(counts,marker_cutoff){
#Examine distribution of missing data by strain and by column
col_prop=apply(counts[[1]],2,function(x){x=as.numeric(x); sum(x>0)/length(x)})
good_markers=c(col_prop>marker_cutoff)
counts[[1]]=counts[[1]][,good_markers]
counts[[2]]=counts[[2]][good_markers]
counts[[3]]=counts[[3]][good_markers]
return(counts)
}
#Remove markers that are absent in too many of the euploid control strains.
#Cutoff defined by user.
#
#Returns modified counts list with elements 1 (counts), 2 (marker chromosome) and 3 (marker bp position in chromosome)
#trimmed to remove markers with high missing data in euploids.
#
#Arguments: counts list, euploid strain indices (for euploid rows in counts[[1]]) and missing marker cutoff (proportion) as arguments
trim_on_euploids=function(counts,euploid_strain_indices,cutoff){
#Apply missing data marker cutoffs to euploid rows of counts[[1]]
good_cols=apply(counts[[1]][euploid_strain_indices,],2,function(x){x=as.numeric(x); sum(x>0)/length(x) >cutoff})
counts[[1]]=counts[[1]][,good_cols]
counts[[2]]=counts[[2]][good_cols]
counts[[3]]=counts[[3]][good_cols]
return(counts)
}
#Normalize coverage values for each strain to the median value for that strain.
#Opeartes on first element of counts list (counts matrix with rows=strains,cols=markers)
#
#Returns as element 4 in counts list
#
#Arguments:counts list
median_strain_norm=function(counts){
#Divide coverage values by median (of non-zero values)
counts[[4]]=t(apply(counts[[1]],1,function(x){x/median(x[x>0])}))
return(counts)
}
#Estimates copy number of each chromosome for each strain based on the proportion
#of all aligned reads that align to that chromosome and normalization of those
#values to a reference panel of euploid strains. Note will default to a median
#copy number of 2 for euploids i.e. will be called as diploids. Default behavior
#can be overridden by specifying median chromosome copy number for a strain
#in a sepearate ploidy_adjust_file
#
#Returns: list with following components:
#[[1]]=raw aligned read proportion for each strain
#[[2]]=proportions normalized to euploid controls
#[[3]]=chromosome median normalized to 1 for each strain
#[[4]]=ploidy_estimate-element 3 multiplied by factor that gives minimum RMS versus nearest whole number for each chromosome
#[[5]]=element 4 rounded to nearest whole number
#
#Arguments:chromosome_file (rows=strains, columns=proportion of reads aligned to each chromosome),
# ploidy_adjust_file (override for strains where median chromosome copy number !=2)
normalize_chromosome_file=function(chromosome_file, ploidy_adjust_file){
#List to hold output
out=list()
#Input proportion of reads on each chromosome to chromosome table
#and read in ploidy override table
raw_chrom=read.table(chromosome_file,stringsAsFactors=F,sep="\t")
ploidy_override=read.table(ploidy_adjust_file,stringsAsFactors=F,sep="\t")
#Extract strain names as rownames and keep chr1-R columns
ploidy_vect=as.numeric(ploidy_override[,2])
names(ploidy_vect)=ploidy_override[,1]
rownames(raw_chrom)=raw_chrom[,1]
raw_chrom=raw_chrom[,3:10]
#Identify control strains and calculate medians for each chromosome
control_nos=c(grep("9318",rownames(raw_chrom)),grep("AF7",rownames(raw_chrom)))
control_meds=apply(raw_chrom[control_nos,],2,median)
#Divide all chromosomes by control medians
first_norm_chrom=t(apply(raw_chrom,1,function(x){x/control_meds}))
#Normalize median chromosome of each strain to 1
second_norm_chrom=t(apply(first_norm_chrom,1,function(x){x/median(x)}))
#Create matrix of factors from 1.5 to 5.5
test_factors=c(15:55)/10
#Convert to ploidy estimate
ploidy_estimate=t(apply(second_norm_chrom,1,function(y){
#Create ploidy values for each level of the factor
ploidy_range=t(sapply(test_factors,function(x){x*y}))
#Calculate RMS vs nearest whole numbers, note that for euploids this will
#give 2 as the best result
round_ploidy_range=round(ploidy_range)
diff_ploidy=round_ploidy_range-ploidy_range
rms_ploidy=apply(diff_ploidy,1,function(x){mean(x**2)**0.5})
#Do not allow ploidy estimates higher than 5
too_high=apply(round_ploidy_range,1,function(x){any(x>5)})
rms_ploidy[too_high]=1000
final_ploidy=ploidy_range[which(rms_ploidy==min(rms_ploidy))[1],]
}))
#Extend limit factors to cover haploids (where specified in the override table)
test_factors=c((1:14)/10,test_factors)
#Carry out override
for (i in 1:length(ploidy_vect)){
#Get current strain name and the preset median chromosome ploidy
strain=names(ploidy_vect)[i]
real_ploidy=ploidy_vect[i]
#Only use range of factors around override preset so that when rounded median will have
#preset value
limit_factors=test_factors[test_factors>real_ploidy-0.5 & test_factors**