R/gnomad_scores.R

Defines functions gnomad_scores

Documented in gnomad_scores

#' GnomAD Scores
#'
#' Function gets input gene's gnomAD Synonymous Z-Score, Non-Synonymous Z-Score, and PLI Score
#' from data retrieved from [gnomAD](https://gnomad.broadinstitute.org/). If there is no gene name information in the input,
#' \code{gnomad_scores} finds the genes that related with input PDB chain ID by using a dataset retrieved from [Ensembl](https://www.ensembl.org/).
#' If there are multiple genes annotated for same PDB, \code{gnomad_scores} function gets the gene that has the maximum Synonymous Z-Score, Non-Synonymous Z-Score, and pLI Score.
#'
#' @param filtered_info_df input data.frame which contain only one PDB entries
#'
#' @return gnomAD scores added input data.frame
#'
#' @export
#'

gnomad_scores <- function(filtered_info_df){

  if(ncol(filtered_info_df) < 6){
    stop("Input data.frame should contain at least 6 columns; PDB_ID, Chain, Position, Orig_AA, Mut_AA and Gene_Name ... respectively.")
  }
  else{
    colnames(filtered_info_df)[1:6] <- c("PDB_ID", "Chain", "Position", "Orig_AA", "Mut_AA", "Gene_Name")
  }

  filtered_info_df$pdb_chain_info <- paste0(filtered_info_df$PDB_ID, ".", filtered_info_df$Chain)

  no_name <- c()
  no_gnomad <- c()
  for(i in 1:length(filtered_info_df$Gene_Name)){

    if(filtered_info_df$Gene_Name[i] == ""){

      chain_info <- filtered_info_df$pdb_chain_info[i]

      res <- pdb2gene[which(pdb2gene$PDB.Chain == chain_info),]

      if(length(unique(res$Gene)) == 0){

        no_name <- c(no_name, paste0(unique(filtered_info_df$pdb_chain_info), collapse = ", "))

        filtered_info_df$Gene_Name[i] <- "no_name"

        filtered_info_df$syn_z[i] <- NA

        filtered_info_df$mis_z[i] <- NA

        filtered_info_df$pLI[i] <- NA

      }
      else if(length(unique(res$Gene)) == 1){

        filtered_gnomad_data <- gnomad_data[gnomad_data$gene == unique(res$Gene),]

        if(nrow(filtered_gnomad_data) == 0){

          no_gnomad <- c(no_gnomad, unique(res$Gene))

          filtered_info_df$Gene_Name[i] <- unique(res$Gene)

          filtered_info_df$syn_z[i] <- NA

          filtered_info_df$mis_z[i] <- NA

          filtered_info_df$pLI[i] <- NA
        }
        else{

          idx <- table(sapply(2:4, function(x) which.max(filtered_gnomad_data[,x])))
          filtered_gnomad_data <- filtered_gnomad_data[as.numeric(names(idx)[which.max(idx)]),]

          filtered_info_df$Gene_Name[i] <- filtered_gnomad_data$gene

          filtered_info_df$syn_z[i] <- filtered_gnomad_data$syn_z

          filtered_info_df$mis_z[i] <- filtered_gnomad_data$mis_z

          filtered_info_df$pLI[i] <- filtered_gnomad_data$pLI
        }
      }
      else{

        filtered_gnomad_data <- gnomad_data[gnomad_data$gene %in% unique(res$Gene),]

        if(nrow(filtered_gnomad_data) == 0){

          no_gnomad <- c(no_gnomad, paste(unique(res$Gene), collapse = ", "))

          filtered_info_df$Gene_Name[i] <- paste(unique(res$Gene), collapse = ", ")

          filtered_info_df$syn_z[i] <- NA

          filtered_info_df$mis_z[i] <- NA

          filtered_info_df$pLI[i] <- NA
        }
        else{

          idx <- table(sapply(2:4, function(x) which.max(filtered_gnomad_data[,x])))
          filtered_gnomad_data <- filtered_gnomad_data[as.numeric(names(idx)[which.max(idx)]),]

          filtered_info_df$Gene_Name[i] <- filtered_gnomad_data$gene

          filtered_info_df$syn_z[i] <- filtered_gnomad_data$syn_z

          filtered_info_df$mis_z[i] <- filtered_gnomad_data$mis_z

          filtered_info_df$pLI[i] <- filtered_gnomad_data$pLI
        }
      }
    }

    # gene name included
    else{

      filtered_gnomad_data <- gnomad_data[gnomad_data$gene == filtered_info_df$Gene_Name[i],]

      if(nrow(filtered_gnomad_data) == 0){

        no_gnomad <- c(no_gnomad, filtered_info_df$Gene_Name[i])

        filtered_info_df$syn_z[i] <- NA

        filtered_info_df$mis_z[i] <- NA

        filtered_info_df$pLI[i] <- NA
      }

      else if(nrow(filtered_gnomad_data) == 1){

        filtered_info_df$syn_z[i] <- filtered_gnomad_data$syn_z

        filtered_info_df$mis_z[i] <- filtered_gnomad_data$mis_z

        filtered_info_df$pLI[i] <- filtered_gnomad_data$pLI

      }
      else{

        idx <- table(sapply(2:4, function(x) which.max(filtered_gnomad_data[,x])))
        filtered_gnomad_data <- filtered_gnomad_data[as.numeric(names(idx)[which.max(idx)]),]

        filtered_info_df$syn_z[i] <- filtered_gnomad_data$syn_z

        filtered_info_df$mis_z[i] <- filtered_gnomad_data$mis_z

        filtered_info_df$pLI[i] <- filtered_gnomad_data$pLI
      }
    }
  }

  if(length(unique(no_name)) > 0){
    message(crayon::white(paste0("\n", "Gene name(s) couldn't find for ", paste(unique(no_name), collapse = ", "), " (PDB.Chain), will be removed from the query", "\n")))
  }

  if(length(unique(no_gnomad)) > 0){
    message(crayon::white(paste0("\n", "gnomAD scores of the gene(s) ", paste(unique(no_gnomad), collapse = ", "), " couldn't find, will be removed from the query", "\n")))
  }

  filtered_info_df <- filtered_info_df[-(which(colnames(filtered_info_df) == "pdb_chain_info"))]

  message(crayon::white(paste0("GNOMAD Scores:", "\t\t\t", "DONE")))

  return(filtered_info_df)
}
berkgurdamar/predatoR documentation built on Jan. 10, 2023, 10:51 p.m.