R/Get_GWAS_matrix.R

Defines functions Get_GWAS_matrix

usethis::use_package("stringr")

Get_GWAS_matrix <- function(gt_GTonly_filt_expand,
                            fix_filt_expand,
                            indication,
                            only_SNPs = FALSE,
                            dir_results = getwd(),
                            results_name = name_by_time(),
                            do_save = TRUE,
                            filter_zeros = TRUE){
  if(is.null(results_name)){results_name = name_by_time()}
  if(do_save){
    if(only_SNPs){step_name = "Step1.4-GWAS_Matrix-only_SNPs"
    }else{step_name = "Step1.4-GWAS_Matrix"}
    results_directory <- create_directory(called_from = step_name,
                                          dir_results = dir_results,
                                          results_name = results_name)
  }
  if (only_SNPs){
    message("\nDeleting all INDELs from the data (only SNPs will remain\n")
    SNP_rows <- which(nchar(fix_filt_expand$ALT)==1 & nchar(fix_filt_expand$REF)==1)
    fix_filt_expand <- fix_filt_expand[SNP_rows, ]
    gt_GTonly_filt <- gtGTonly[SNP_rows, ]
    indication <- indication[SNP_rows]
  }

  #* keep unique names
  rownames(gt_GTonly_filt_expand) <- make.names(rownames(gt_GTonly_filt_expand), unique = TRUE)
  if(!require("stringr")){install.packages("stringr")}
  #* Create empty matrix of the same size with NAs

  indication_as_character <- as.character(round(indication))
  result_array <- stringr::str_count(gt_GTonly_filt_expand, indication_as_character)
  GWAS_mat <- matrix(result_array,
                     nrow = nrow(gt_GTonly_filt_expand),
                     ncol = ncol(gt_GTonly_filt_expand),
                     dimnames = list(rownames(gt_GTonly_filt_expand),
                                     colnames(gt_GTonly_filt_expand)))


  ### MIGHT BE UNNECESSARY: ##########
  #* creating the GWAS matrix: go cell by cell and check the list it contains.
  #* The cell in `GWAS_mat` get's a value by the number of instances of the `indication`
  #* in the matching cell of `gt_GTonly_expand`
  #*

  # if(!require("progress")){
  #   install.packages("progress")
  # }
  # library(progress)
  # #* progress creates a progress bar which allows to show a visual progress
  #
  # pb <- progress_bar$new(format = "(:spin) [:bar] :percent",
  #                        total = nrow(GWAS_mat),
  #                        complete = "=",   # Completion bar character
  #                        incomplete = "-", # Incomplete bar character
  #                        current = ">",    # Current bar character
  #                        clear = FALSE,    # If TRUE, clears the bar when finish
  #                        width = 100)      # Width of the progress bar
  #
  # for (snp in 1:nrow(GWAS_mat)){# rows
  #   for (strain in 1:ncol(GWAS_mat)){## columns
  #
  #     if(is.na(gt_GTonly_expand[snp,strain])){next} #skip NA (saves a lot of time)
  #     array(stringr::str_count(demo1[1:5,],as.character(round(indication))),dim=c(5,10))
  #     cell_value <-
  #       round(
  #         as.numeric(
  #           unlist(
  #
  #             strsplit(
  #               gt_GTonly_expand[snp,strain], "\\||/"
  #               #* \1 split each cell: "0/0" becomes a list:
  #               #*     [[1]]
  #               #*     [1] "0" "0"
  #               )#close strsplit
  #             #* \2 unlist the list of the cell to get:
  #             #* [1] "0" "0"
  #           )#close unlist
  #           #* \3 turn character list to numeric to get numeric array:
  #           #* [1] 0 0
  #         )#close as.numeric
  #         #* \4 round the array otherwise it doesn't work:
  #         #* [1] 0 0
  #       )#close round
  #
  #     #* cell_value contains the value of the cell ((3 3), (0 1), etc)
  #     #* so the info in cell_value came from the gt matrix
  #     #* we now compare the values in the cell_value to the `indication` values.
  #     #* The final value that will be introduced to the GWAS matrix is the number of appearances
  #     #* of the indication value of that row (`indication[snp]`) in the cell_value
  #
  #     GWAS_mat[snp, strain] <- sum(round(indication[snp]) == cell_value)
  #
  #     #* check the number of occurrences (instances) of the current indication in the current cell value.
  #     #* For homozygotes of the current indication we'll get 2.
  #     #* For hetero that contains it at least once, we'll get 1, and if it doesn't appear well get 0.
  #     #* If NA we'll get NA
  #
  #   } ##
  #   #* progress_bar that updates after strain:
  #   pb$tick()
  # } #

  ###############

  ##* Finish GWAS matrix:
  #* Omit any variants that turned out to be all 0 (snps that don't appear in the population at all)
  if(filter_zeros){
    relevant_rows <- apply(GWAS_mat, 1, function(x) !(all(x==0 |is.na(x))))
    GWAS_mat <- GWAS_mat[relevant_rows, ]
    fix_filt_expand <- fix_filt_expand[relevant_rows, ]
  }


  #* `rs` stands for 'reference SNP'
  rows <- sprintf("%s%0*d", "rs",1, 1:nrow(fix_filt_expand))
  rownames(fix_filt_expand) <- rows
  rownames(GWAS_mat) <- rows
  GWAS_mat <- t(GWAS_mat)

  if(!require("psych")){install.packages("psych", quiet = TRUE)}
  psych::headTail(GWAS_mat[,1:15])

  message(Sys.time(), " - Finished gwas matrix\n")

  if(do_save){
    message("Saving files to: ", results_directory,"\n")
    Save_as_RDS(list(GWAS_mat = GWAS_mat,
                     mapping_info = fix_filt_expand),
                directory = results_directory)

    return_list = list(directory = results_directory,
                       mapping_info = fix_filt_expand,
                       GWAS_mat = GWAS_mat)
  }else{return_list = list(mapping_info = fix_filt_expand,
                           GWAS_mat = GWAS_mat)}
  return(return_list)
}
TomerAntman/VCFtoGWAS documentation built on July 6, 2022, 4:51 a.m.