R/get_phenotypes.R

Defines functions get_phenotypes

Documented in get_phenotypes

#' @title get phenotpes
#' @description Returns phenotypes for specific sets of individials based
#' on their genotype and mutation.
#' @param data data input (df/dt)
#' @param reference string directory input
#' @param reference.suffix a string (e.g .txt)
#' @param col.id integer. Column number for ID.
#' @param col.variant integer. Column number for variant.
#' @param col.gene integer. Column number for gene
#' @param get.contrasts boolean. Should the data be contrasted? I.e. should the population
#' be returned with a contrast label (KO/Not KO)?
#' @author flassen
#' @return a list of
#' @export

get_phenotypes <- function(data, reference, reference.suffix = '.txt', col.id = 1, col.variant = 6, col.gene=11, get.contrasts = F){

  require(data.table)

  if (F){
    # debugging on server
    require(data.table)
    data = fread("ukb_mfi_homo_plof_wb_mrg_chr10.txt", header = F)
    col.id = 1
    col.variant = 6
    col.gene=11
    reference = "/well/lindgren/UKBIOBANK/stefania/Pheno/QCWB/lists"

  }

  ## handle KO file
  # df format columns:
  # 1 - id
  # 2 - id
  # 3 - chr
  # 4 - pos
  # 5 - snpid
  # 6 - rsid
  # 7 - info
  # 8 - VEP gene
  # 9 - VEP SO
  # 10 - VEP impact
  # 11 - vep gene
  # 12 - vep coding/non-coding

  print(head(data))
  cols = c(col.id, col.variant, col.gene)
  dt = setDT(data)
  dt = dt[,..cols]
  colnames(dt) = c('sample','rsid','gene')
  write(paste(nrow(dt),'rows in KO file..'),stdout())

  ## handle reference
  # ref name is plot name
  # ref colunmns:
  # 1 - id
  # 2 - pheno

  # has a dir been supplied, the
  # function should iterate and open every file
  if (dir.exists(reference)){

    # get files
    files = list.files(reference, pattern = reference.suffix, full.names = T)
    print('The following files were found:')
    print(basename(files))

    # return all files
    phenotypes = lapply(files, function(x){
      infile = fread(x, header = F)
      colnames(infile) = c('sample',tools::file_path_sans_ext(basename(x)))
      return(infile)
    })

    names(phenotypes) = tools::file_path_sans_ext(basename(files))

    # for each phenotype
    #   for each gene
    #     extract id from ref
    #     do some summary stat

    pheno = names(phenotypes)
    genes = unique(dt$gene)
    result_pheno = lapply(pheno, function(p){

      pheno_df = phenotypes[[p]]

      result_gene = lapply(genes, function(g){

        individual_ids = dt$sample[dt$gene == g]

        if (length(individual_ids) > 0 ){

          # get indivdual with the phenotype KO
          pheno_gene_df = pheno_df[pheno_df$sample %in% individual_ids,]
          colnames(pheno_gene_df) = c('sample','value')
          pheno_gene_df$phenotype = p
          pheno_gene_df$gene = g
          pheno_gene_df$knockout = 1
          pheno_gene_df$mus = mean(pheno_gene_df$value)

          # get indivdual with the phenotype NO KO
          #if (get.contrasts){
          #  pheno_gene_df_not = pheno_df[!pheno_df$sample %in% individual_ids]
          #  colnames(pheno_gene_df_not) = c('sample','value')
          #  pheno_gene_df_not$phenotype = p
          #  pheno_gene_df_not$gene = g
          #  pheno_gene_df_not$knockout = 0
          #  pheno_gene_df = rbind(pheno_gene_df, pheno_gene_df_not)
          #  pheno_gene_df$mus = abs(mean(pheno_gene_df$value[pheno_gene_df$knockout==1]) -
          #                            mean(pheno_gene_df$value[pheno_gene_df$knockout==0]))
          #
          #}

          # do KS test


          return(pheno_gene_df)
        }

      })

      # order by average deviation
      tmp = do.call(rbind,result_gene)
      #tmp$mus =
      return(tmp)

    })

  names(result_pheno) = names(phenotypes)
  return(result_pheno)

  } else {
    stop('reference must be a directory')
  }
}
frhl/our documentation built on Feb. 5, 2021, 7:30 p.m.