R/genotypes2alleles.R

#' Convert Genotypic data from Genotype format to Allele format
#'
#' A function that will return a data frame in allele per column format from a data frame in genotype per column format.
#'
#'
#'@param df Data frame that contains genotypic data and metadata
#'@param geno_cols Numeric column identifiers of loci, if NULL all columns will be assumed to contain genotypic data.
#'@param ncode the number of characters for a genotype. For example, A "12" heterozygote has an ncode of 2.
#'@author Zak Robinson, Contact: zachary.robinson(at)umontana.com
#'@return Data frame in Allele per column format
#'@export


genotypes2alleles<-function(df,geno_cols=NULL,ncode){

if(is.null(geno_cols)){
  geno_cols<-1:ncol(df)
}

meta_data_cols<-which(!(1:ncol(df) %in% geno_cols))
genotypes<-df[,geno_cols]

if(length(unique(nchar(genotypes[!(is.na(genotypes))])))>1){
  stop("Genotypes are different character lengths: Missing data encoded as something other than NA? ; Correct columns submitted ?")}


  foo <- function(x,n_code=ncode){
  ot<-strsplit(as.character(x),fixed = T,split = "")
  ot2<-lapply(ot,FUN = function(x){if(is.na(x[1])){return(rep(NA,2))}else{return(c(paste0(x[1:(n_code/2)],collapse = ""), paste0(x[((n_code/2)+1):n_code],collapse = "")))}})
  ot3<-unlist(ot2)
  return(ot3)
  }



  allel_mat<-as.data.frame(t(apply(genotypes,MARGIN = 1,FUN = foo)),stringsAsFactors = F)

  colnames(allel_mat)[seq(1,ncol(genotypes)*2,2)]<-paste0(colnames(genotypes),"_1")
  colnames(allel_mat)[seq(2,ncol(genotypes)*2,2)]<-paste0(colnames(genotypes),"_2")


  out_df<-as.data.frame(matrix(ncol=(ncol(allel_mat)+length(meta_data_cols)),nrow = nrow(allel_mat)),stringsAsFactors=FALSE)
  out_df[,min(geno_cols):((length(geno_cols)*2)+(min(geno_cols)-1))]<-allel_mat
  colnames(out_df)[min(geno_cols):((length(geno_cols)*2)+(min(geno_cols)-1))] <- colnames(allel_mat)


  for(i in meta_data_cols){
    if(i>max(geno_cols)){
      col_correct<-(i-length(geno_cols))+(length(geno_cols)*2)
      out_df[,col_correct]<-df[,i]
      colnames(out_df)[col_correct]<-colnames(df)[i]
    }
    else{
      out_df[,i]<-df[,i]
      colnames(out_df)[i]<-colnames(df)[i]

    }
  }


  return(out_df)


}
zakrobinson/RSibPurge documentation built on June 29, 2019, 3:19 a.m.