R/qtl_postProcess.R

Defines functions getFDR calculatePairwiseConcordance estimateLeadVariantConcordance extractQTLsFromList addExpectedPvalue calculatePairR2 calculateR2FromLead addR2FromLead filterGeneR2 filterHitsR2 calculatePermutationPvalues calculatePairwisePi1 calculatePi1

Documented in addExpectedPvalue addR2FromLead calculatePairwisePi1 calculatePermutationPvalues calculatePi1 extractQTLsFromList filterHitsR2 getFDR

#' Calculate Pi1 replicability statistic between two sets of pvalues
#' 
#' Currently expects the following columns: gene_id, qvalue and p_beta.
#' 
#' @param table1 First table maximum p-values per feature.
#' @param table2 Second table of maximum p-values per feature.
#' @param qvalue_thresh qvalue threshold for table1.
#' @return None
#' @author Kaur Alasoo
#' @export 
calculatePi1 <- function(table1, table2, qvalue_thresh = 0.1, feature_id = "gene_id"){
  #Identify significant hits from first table
  table1_hits = dplyr::filter(table1, qvalue < qvalue_thresh)
  #Extract the same genes from the second table
  table2_hits = dplyr::semi_join(table2, table1_hits, by = feature_id)
  #Estimate the proportion of replicated qtls
  pi1 = 1 - qvalue::qvalue(table2_hits$p_beta)$pi0
  return(pi1)
}

#' Calculate all pairwise Pi1 statistics for a list p-value tables.
#' 
#' Currently expects the following columns: gene_id, qvalue and p_beta.
#' 
#' @param qtl_list List of p-value tables
#' @param qvalue_thresh qvalue threshold for table1.
#' @return None
#' @author Kaur Alasoo
#' @export 
calculatePairwisePi1 <- function(qtl_list, qvalue_thresh = 0.1, tidy = FALSE, feature_id = "gene_id"){
  sample_names = names(qtl_list)
  rep_matrix = matrix(1,length(sample_names),length(sample_names))
  colnames(rep_matrix) = sample_names
  rownames(rep_matrix) = sample_names
  
  #Iterate through all pairs of p-values
  for (sn1 in 1:length(sample_names)){
    for (sn2 in 1:length(sample_names)){
      if (sn1 != sn2){
        rep_matrix[sn1, sn2] = calculatePi1(qtl_list[[sn1]], qtl_list[[sn2]], qvalue_thresh, feature_id)
      }
    }
  }
  
  #If tidy then return data frme insted of matrix
  if(tidy == TRUE){
    res = as.data.frame(rep_matrix) %>% 
      dplyr::mutate(first = rownames(rep_matrix)) %>% 
      dplyr::select(first, everything()) %>% 
      tidyr::gather("second","pi1",2:(ncol(rep_matrix)+1)) %>% 
      dplyr::arrange(first, second)
    return(res)
  }
  return(rep_matrix)
}

#' Calculate emiprical permutation p-values
#'
#' For each element in the te vector of original p-values, count the number of permutation p-values that are smaller than that.
#' 
#' @param raw_pvalues Vector of raw p-values -log10 form the original data.
#' @param output_dir Vector of maximum -log10 pvalues from the permuted data.
#' @return None
#' @author Kaur Alasoo
#' @export 
calculatePermutationPvalues <- function(raw_pvalues, max_permutation_pvalues){
  
  #Count the number of permutations
  n_perm = length(max_permutation_pvalues)
  
  #Count the number of pvalues
  n_pvalues = length(raw_pvalues)
  
  #Construct matrix of permutation pvalues
  perm_mat = matrix(rep(sort(max_permutation_pvalues), n_pvalues), nrow = n_pvalues, byrow = TRUE)
  comparison = perm_mat - raw_pvalues
  comparison[comparison > 0] = 1
  comparison[comparison <= 0] = 0
  permutation_pvalues = (rowSums(comparison) + 1) / (n_perm + 1)
  
  return(permutation_pvalues)
}


#' Filter SNPs based on R2
#'
#' Identify genes that have two different lead SNPs and remove one of them if 
#' the R2 between the two SNPs is greater than R2_thresh.
#'
#' @param feature_snp_pairs Data frame with two columns: gene_id, snp_id.
#' @param genotypes Genotype matrix (SNPs in rows, samples in columns)
#' @param R2_thresh R2 threshold, default 0.8.
#'
#' @return Filtered feature_snp_pairs that does not cotain highly correlated SNPs for single feature.
#' @export
filterHitsR2 <- function(feature_snp_pairs, genotypes, R2_thresh = 0.8){
  
  #Filter genotypes matrix to only include the relevant variants
  genotypes = genotypes[unique(feature_snp_pairs$snp_id),]
  
  #Count SNPs per gene
  snp_count = dplyr::group_by(feature_snp_pairs, gene_id) %>% dplyr::summarise(snp_count = length(snp_id))
  single_snps = dplyr::semi_join(feature_snp_pairs, dplyr::filter(snp_count, snp_count == 1), by = "gene_id")
  multi_snps = dplyr::anti_join(feature_snp_pairs, dplyr::filter(snp_count, snp_count == 1), by = "gene_id")
  
  #Filter genes with multiple genes
  multi_snp_list = plyr::dlply(multi_snps, "gene_id")
  multi_snp_filtered_list = lapply(multi_snp_list, filterGeneR2, genotypes, R2_thresh)
  multi_snp_filtered = plyr::ldply(multi_snp_filtered_list, .id = NULL)
  result = rbind(single_snps, multi_snp_filtered)
  return(result)
}

#Helper function for filterHitsR2
filterGeneR2 <- function(gene_df, genotypes, r2_thresh){
  genotype_matrix = t(genotypes[gene_df$snp_id,])
  r2 = cor(genotype_matrix, use = "pairwise.complete.obs")^2
  r2 = r2 - diag(1, nrow(r2))
  r2[lower.tri(r2)] = 0
  gene_df[colSums(r2 > r2_thresh) == 0,]
}

#' Calculate R2 between the lead SNP and all other SNPs in the data frame and adds corresponding column.
#'
#' Assumes that lead SNP is in the first row of the gene_df data frame 
#' (data frame is sorted by p-value).
#'
#' @param gene_df Data frame of QTL snps (required column: gene_id)
#' @param genotypes Genotype matrix
#'
#' @return gene_df data frame with R2 column added.
#' @export
addR2FromLead <- function(gene_df, genotypes){
  
  assertthat::assert_that(!is.null(gene_df))
  assertthat::assert_that(assertthat::has_name(gene_df, "snp_id"))
  
  #Extract genotype matrix
  genotype_matrix = t(genotypes[gene_df$snp_id,])
  
  #If more than one SNP then calculate R2
  if( nrow(gene_df) > 1 ){ 
    #Calculate R2 between the first SNP in the genotype matrix and all other SNPs
    r2 = apply(genotype_matrix, 2, function(x, y){ cor(x,y,use = "pairwise.complete.obs") }, genotype_matrix[,1])^2
    gene_df = dplyr::mutate(gene_df, R2 = r2)
  } else { #If only one SNP then set R2 to 1
    gene_df = dplyr::mutate(gene_df, R2 = 1)
  }
  return(gene_df)
}

calculateR2FromLead <- function(snp_ids, genotypes){
  
  #Extract genotype matrix
  genotype_matrix = t(genotypes[snp_ids,])
  
  #If more than one SNP then calculate R2
  if( length(snp_ids) > 1 ){ 
    #Calculate R2 between the first SNP in the genotype matrix and all other SNPs
    r2 = apply(genotype_matrix, 2, function(x, y){ cor(x,y,use = "pairwise.complete.obs") }, genotype_matrix[,1])^2
  } else { #If only one SNP then set R2 to 1
    r2 = 1
  }
  return(r2)
}

calculatePairR2 <- function(snp_id1, snp_id2, genotypes){
  
  #Extract genotypes
  gt1 = genotypes[snp_id1,]
  gt2 = genotypes[snp_id2,]
  
  #Calcuate R2
  r2 = cor(gt1, gt2, use = "pairwise.complete.obs")^2
  return(r2)
}

#' Add expected p-value for Q-Q plots
#'
#' @param pvalue_df 
#'
#' @return pvalue_df with p_expected column
#' @export
addExpectedPvalue <- function(pvalue_df){
  dplyr::arrange(pvalue_df, p_eigen) %>%
    dplyr::mutate(p_expected = c(1:length(p_eigen))/length(p_eigen))
}

#' Extract all QTLs at a specific FDR level from a list of min pvalues by condition
#'
#' Multiple variants per gene are sorted by p-value
#'
#' @param min_pvalue_list List of QTLs per condition
#' @param fdr_cutoff 
#'
#' @return Data frame of QTLs
#' @export
extractQTLsFromList <- function(min_pvalue_list, fdr_cutoff = 0.1){
  min_hits = purrr::map(min_pvalue_list, ~dplyr::filter(.,p_fdr < fdr_cutoff))
  qtl_df = purrr::map_df(min_hits, identity, .id = "condition_name") %>% 
    dplyr::group_by(gene_id) %>% 
    dplyr::arrange(p_nominal) %>%
    dplyr::ungroup()
  return(qtl_df)
}

estimateLeadVariantConcordance <- function(table1, table2, filter_formula, genotype_matrix, feature_id = "gene_id"){
  #Identify significant hits from first table
  table1_hits = dplyr::filter_(table1, filter_formula) %>%
    dplyr::rename(snp_id1 = snp_id) %>%
    dplyr::select_(feature_id, "snp_id1")
  #Extract the same genes from the second table
  table2_hits = dplyr::semi_join(table2, table1_hits, by = feature_id) %>%
    dplyr::rename(snp_id2 = snp_id) %>%
    dplyr::select_(feature_id, "snp_id2")
  #Estimate the proportion of replicated qtls
  joint_hits = dplyr::left_join(table1_hits, table2_hits, by = feature_id) %>%
    dplyr::filter(!is.na(snp_id1), !is.na(snp_id2)) #Remove NAs

  #Calculate R2 between lead variants
  g1 = vcf_file$genotypes[joint_hits$snp_id1,]
  g2 = vcf_file$genotypes[joint_hits$snp_id2,]
  r2 = diag(cor(t(g1), t(g2), use = "pairwise.complete.obs"))^2
  result = dplyr::mutate(joint_hits, R2 = r2)
  
  return(result)
}

calculatePairwiseConcordance <- function(qtl_list, filter_formula, genotype_matrix, 
                                         tidy = FALSE, feature_id = "gene_id", R2_thresh = 0.8){
  sample_names = names(qtl_list)
  rep_matrix = matrix(1,length(sample_names),length(sample_names))
  colnames(rep_matrix) = sample_names
  rownames(rep_matrix) = sample_names
  
  #Iterate through all pairs of p-values
  for (sn1 in 1:length(sample_names)){
    for (sn2 in 1:length(sample_names)){
      if (sn1 != sn2){
        concordance = estimateLeadVariantConcordance(qtl_list[[sn1]], qtl_list[[sn2]], 
                                                              filter_formula, genotype_matrix, feature_id)
        rep_matrix[sn1, sn2] = length(which(concordance$R2 > R2_thresh))/nrow(concordance)
      }
    }
  }
  
  #If tidy then return data frme insted of matrix
  if(tidy == TRUE){
    res = as.data.frame(rep_matrix) %>% 
      dplyr::mutate(first = rownames(rep_matrix)) %>% 
      dplyr::select(first, everything()) %>% 
      tidyr::gather("second","concordance",2:(ncol(rep_matrix)+1)) %>% 
      dplyr::arrange(first, second)
    return(res)
  }
  return(rep_matrix)
}


#' Calculate FDR based on an empirical null distribution
#'
#' Author: Natsuhiko Kumasaka
#' 
#' @param p1 alt p-value vector
#' @param p0 null p-value vector
#' @param alpha desired FDR level
#' @param z ?
#' @param subset ?
#'
#' @return P-value cutoff corresponding to the desired FDR
#' @export
getFDR <- function(p1,p0,alpha=0.1,z=NULL,subset=NULL){
    if(is.null(z)){
      a=0
      for(itr in 1:10){
        #print(a)
        a=getFDR(p1,p0,alpha,rev(a+0:100/100^itr),subset)
      }
      a
    }else{
      if(!is.null(subset)){
        p1=p1[subset]
        p0=p0[subset]
      }
      p1=p1[!is.na(p1)]
      p0=p0[!is.na(p0)]
      x=NULL;
      for(i in z){
        x=c(x,sum(p0<i)/length(p0)/(sum(p1<i)/length(p1)))
      };
      #plot(z,x,log="xy");
      #abline(h=alpha)
      #invisible(x)
      max(c(0,z[x<alpha]),na.rm=T)
    }
  }
kauralasoo/seqUtils documentation built on Sept. 24, 2018, 1:11 a.m.