scripts/disruption_calculate_empirical_p_values.R

#'Calculate transcription factor disruption p values for the SNPs
#
#'@param score_changes_file file name containing the score changes between reference and alternate sequences around the input SNPs
#'@param bg_score_changes_dir directory cotaining the background score changes between the reference and alternate sequences aroung the 1000 Genome SNPs
#'@param output_filename the path to the file name to write the output

args<-commandArgs(TRUE);
score_changes_file <- args[1] ;
bg_score_changes_dir  <- args[2] ; 
output_filename <- args[3] ; 

#Extract TF name from the file name which is separated from motif name with @: AHR@M6139_1.02
getTFname <- function(filename){
  input <- as.character(filename)
  tfname <- strsplit(input, "@")[[1]][1]
  return(tfname)
}

#Read the input SNPs file into a data frame
input_snps = read.table(file=score_changes_file,head=F,sep="\t");
#Get the motifs
motifs = unique(input_snps[,1])

output <- data.frame()

print("Reading motifs")
#Read the motifs one by one
for(i in 1:length(motifs))
{
  #Get the motif
  motif = as.character(motifs[i])
  
  #Get the SNPs overlapping with TFBS for the motif
  snp_subset = input_snps[input_snps$V1==motif,]
  
  #Get the background scores for the motif
  bg_score_changes_file = paste0(bg_score_changes_dir, motif);
  if(! file.exists(bg_score_changes_file) )
  {
    next
  }

  #Read the background SNPs table
  bg_snps = read.table(file=bg_score_changes_file,head=F,sep="\t",fill = T);
  bg_snps = bg_snps[complete.cases(bg_snps),]
  bg_snps_count <- nrow(bg_snps)
  
  n <- nrow(snp_subset)
  snp_subset$p_values <- c()
  
  #For each SNP with a TFBS matching the motif, calculate the p value
  for(j in 1:n)
  {
    motif <- as.character(snp_subset[j,1])
    snp <- snp_subset[j,2]
    score_change_abs <- as.numeric(snp_subset[j,5])
    score_change_signed <- as.numeric(snp_subset[j,6])
    position <- as.numeric(snp_subset[j,7])
    
    count_larger <- sum(bg_snps$V3>score_change_abs)  
    
    p_value <- count_larger / bg_snps_count
    
    snp_subset[j,8] <- p_value
    
  }	 
  
  #Add a pseudonumner for 0 p-values to avoid infinite log values
  snp_subset$V8[snp_subset$V8 == 0] <- 0.0000000000001
  q_values <- p.adjust(snp_subset$V8, method = 'BH')
  q_values[q_values==0] <- 0.000000001
  snp_subset$V9 <- q_values
  output <- rbind(output, snp_subset)
  
}

output$V10 = -log10(output$V9)

tf_motif <- data.frame(do.call('rbind', strsplit(as.character(output$V1),'@',fixed=TRUE)))
output2 <- cbind(tf_motif, output)
output2$V1 <- NULL

#Prepare the output
print('Preparing the output')
colnames(output2) <- c('TF','motif', 'snp', 'ref', 'alt', 'score_change', 'score_change_abs', 'position', 'p', 'q', 'logq')

output3 <- output2[FALSE,]
snps <- unique(as.character(output2$snp))

#If the SNP disrupts multiple motifs, select the disruption with minimum p-value 
for(i in 1:length(snps))
{
  snp = snps[i]
  temp = output2[output2$snp == snp, ]
  min_p_index = which.min(temp$p)
  output3 = rbind(output3, temp[min_p_index,])
}
  

#Select the output fields
output4 <- output3[, c('snp', 'ref', 'alt', 'TF', 'motif', 'p', 'q', 'logq', 'position', 'score_change')]
colnames(output4) <- c('snp_id', 'ref', 'alt', 'TF', 'motif', 'disruption_p', 'disruption_q', 'log_disruption_q', 'position', 'score_change')

#Write the output file
write.table(output4, file=output_filename, row.names = F, col.names = T, quote=F, sep = "\t")
gaolong/arvin documentation built on May 28, 2019, 8:41 p.m.