#' @title get phenotpes
#' @description Returns phenotypes for specific sets of individials based
#' on their genotype and mutation.
#' @param data data input (df/dt)
#' @param reference string directory input
#' @param reference.suffix a string (e.g .txt)
#' @param col.id integer. Column number for ID.
#' @param col.variant integer. Column number for variant.
#' @param col.gene integer. Column number for gene
#' @param get.contrasts boolean. Should the data be contrasted? I.e. should the population
#' be returned with a contrast label (KO/Not KO)?
#' @author flassen
#' @return a list of
#' @export
get_phenotypes <- function(data, reference, reference.suffix = '.txt', col.id = 1, col.variant = 6, col.gene=11, get.contrasts = F){
require(data.table)
if (F){
# debugging on server
require(data.table)
data = fread("ukb_mfi_homo_plof_wb_mrg_chr10.txt", header = F)
col.id = 1
col.variant = 6
col.gene=11
reference = "/well/lindgren/UKBIOBANK/stefania/Pheno/QCWB/lists"
}
## handle KO file
# df format columns:
# 1 - id
# 2 - id
# 3 - chr
# 4 - pos
# 5 - snpid
# 6 - rsid
# 7 - info
# 8 - VEP gene
# 9 - VEP SO
# 10 - VEP impact
# 11 - vep gene
# 12 - vep coding/non-coding
print(head(data))
cols = c(col.id, col.variant, col.gene)
dt = setDT(data)
dt = dt[,..cols]
colnames(dt) = c('sample','rsid','gene')
write(paste(nrow(dt),'rows in KO file..'),stdout())
## handle reference
# ref name is plot name
# ref colunmns:
# 1 - id
# 2 - pheno
# has a dir been supplied, the
# function should iterate and open every file
if (dir.exists(reference)){
# get files
files = list.files(reference, pattern = reference.suffix, full.names = T)
print('The following files were found:')
print(basename(files))
# return all files
phenotypes = lapply(files, function(x){
infile = fread(x, header = F)
colnames(infile) = c('sample',tools::file_path_sans_ext(basename(x)))
return(infile)
})
names(phenotypes) = tools::file_path_sans_ext(basename(files))
# for each phenotype
# for each gene
# extract id from ref
# do some summary stat
pheno = names(phenotypes)
genes = unique(dt$gene)
result_pheno = lapply(pheno, function(p){
pheno_df = phenotypes[[p]]
result_gene = lapply(genes, function(g){
individual_ids = dt$sample[dt$gene == g]
if (length(individual_ids) > 0 ){
# get indivdual with the phenotype KO
pheno_gene_df = pheno_df[pheno_df$sample %in% individual_ids,]
colnames(pheno_gene_df) = c('sample','value')
pheno_gene_df$phenotype = p
pheno_gene_df$gene = g
pheno_gene_df$knockout = 1
pheno_gene_df$mus = mean(pheno_gene_df$value)
# get indivdual with the phenotype NO KO
#if (get.contrasts){
# pheno_gene_df_not = pheno_df[!pheno_df$sample %in% individual_ids]
# colnames(pheno_gene_df_not) = c('sample','value')
# pheno_gene_df_not$phenotype = p
# pheno_gene_df_not$gene = g
# pheno_gene_df_not$knockout = 0
# pheno_gene_df = rbind(pheno_gene_df, pheno_gene_df_not)
# pheno_gene_df$mus = abs(mean(pheno_gene_df$value[pheno_gene_df$knockout==1]) -
# mean(pheno_gene_df$value[pheno_gene_df$knockout==0]))
#
#}
# do KS test
return(pheno_gene_df)
}
})
# order by average deviation
tmp = do.call(rbind,result_gene)
#tmp$mus =
return(tmp)
})
names(result_pheno) = names(phenotypes)
return(result_pheno)
} else {
stop('reference must be a directory')
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.