R/make_htrx.R

Defines functions make_snp make_htr make_htrx

Documented in make_htr make_htrx make_snp

#' @title Generate haplotype data
#' @description Generate the feature data, either the genotype data for single nucleotide polymorphisms (SNPs) (\code{make_snp}),
#' the feature data for Haplotype Trend Regression (HTR) (\code{make_htr}), or
#' the feature data for Haplotype Trend Regression with eXtra flexibility (HTRX) (\code{make_htrx}).
#' @name make_htrx
#' @param hap1 a data frame of the SNPs' genotype of the first genome. The genotype of a SNP for each individual is either 0 (reference allele) or 1 (alternative allele).
#' @param hap2 a data frame of the SNPs' genotype of the second genome.
#' The genotype of a SNP for each individual is either 0 (reference allele) or 1 (alternative allele).
#' By default, \code{hap2=hap1} representing haploid.
#' @param rareremove logical. Remove rare SNPs and haplotypes or not. By default, \code{rareremove=FALSE}.
#' @param rare_threshold a numeric number below which the haplotype or SNP is removed.
#' This only works when \code{rareremove=TRUE}. By default, \code{rare_threshold=0.001}.
#' @param fixedfeature a character consisted of the names of haplotypes.
#' This parameter can be \code{NULL} (by default) if all the haplotypes are used as variables.
#' @param max_int a positive integer which specifies the maximum number of SNPs that can interact.
#' If no value is given, interactions between all the SNPs will be considered.
#'
#' @details If there are n SNPs, there are \ifelse{html}{\out{2<sup>n</sup>}}{\eqn{2^n}} different haplotypes created by HTR,
#' and \ifelse{html}{\out{3<sup>n</sup>}}{\eqn{3^n}}-1 different haplotypes created by HTRX.
#'
#' When the data is haploid, please use the default setting \code{hap2=hap1}.
#' @return a data frame of the feature data (either for SNPs, HTR or HTRX).
#' @examples
#' ## create SNP data for both genomes (diploid data)
#' hap1=as.data.frame(matrix(0,nrow=100,ncol=4))
#' hap2=as.data.frame(matrix(0,nrow=100,ncol=4))
#' colnames(hap1)=colnames(hap2)=c('a','b','c','d')
#' p=runif(4,0.01,0.99)
#' for(j in 1:4){
#'   hap1[,j]=rbinom(100,1,p[j])
#'   hap2[,j]=rbinom(100,1,p[j])
#' }
#'
#' ## create the SNP data without removing rare SNPs
#' make_snp(hap1,hap2)
#'
#' ## create feature data for "HTR" removing haplotypes rarer than 0.5%
#' make_htr(hap1,hap2,rareremove=TRUE,0.005)
#'
#' ## create feature data for "HTRX"
#' ## retaining haplotypes with interaction across at most 3 SNPs
#' make_htrx(hap1,hap2,max_int=3)
#'
#' ## create feature data for feature "01XX" and "X101"
#' ## without removing haplotypes
#' make_htrx(hap1,hap2,fixedfeature=c("01XX","X101"))
#'
#' ## If the data is haploid instead of diploid
#' ## create feature data for "HTRX" without removing haplotypes
#' make_htrx(hap1,hap1)
NULL

#' @rdname make_htrx
#' @export
make_htrx<-function(hap1,hap2=hap1,rareremove=FALSE,rare_threshold=0.001,
                    fixedfeature=NULL,max_int=NULL){
  ## Make a HTRX feature matrix
  ## All the combinations of 0,1,X of SNPs
  ## Each element of combinations become 'factor'.
  n=n_total=dim(hap1)[1]
  nsnp=dim(hap1)[2]
  if(is.null(fixedfeature)){
    combinations=expand.grid(lapply(1:nsnp,function(x)c(0,1,'X')))
    combinations=combinations[-nrow(combinations),]

    #retain rows with the interaction between max_int SNPs.
    if(!is.null(max_int)){
      if(max_int>nsnp){
        max_int=nsnp
        warning('max_int is reduced to nsnp because max_int should not be bigger than nsnp')
      }
      retain_index <- vector()
      for(i in 1:nrow(combinations)){
        if(length(which(combinations[i,]!='X'))<=max_int) retain_index=c(retain_index,i);
      }
      combinations=combinations[retain_index,]
    }

    HTRX_matrix <- as.data.frame(matrix(0,nrow=n_total,ncol=nrow(combinations)))
    HTRX_matrix2 <- as.data.frame(matrix(0,nrow=n_total,ncol=nrow(combinations)))
    ##the genotype must be 0 for reference allele and 1 for alternative allele
    for(i in 1:nrow(combinations)){
      HTRX_matrix[Reduce(intersect,
                         lapply(1:nsnp,function(x)
                         {
                           if(combinations[i,x]=='X')return(1:n);
                           if(combinations[i,x]!='X')return(which(hap1[,x]==as.numeric(
                             levels(combinations[i,x])[combinations[i,x]])));
                         })
      ),i]=0.5
      HTRX_matrix2[Reduce(intersect,
                          lapply(1:nsnp,function(x)
                          {
                            if(combinations[i,x]=='X')return(1:n);
                            if(combinations[i,x]!='X')return(which(hap2[,x]==as.numeric(
                              levels(combinations[i,x])[combinations[i,x]])));
                          })
      ),i]=0.5
    }
  }else{
    combinations=as.data.frame(matrix(NA,nrow=length(fixedfeature),ncol=nsnp))
    for(i in 1:length(fixedfeature)){
      combinations[i,]=unlist(strsplit(fixedfeature[i],split=''))
    }

    HTRX_matrix <- as.data.frame(matrix(0,nrow=n_total,ncol=nrow(combinations)))
    HTRX_matrix2 <- as.data.frame(matrix(0,nrow=n_total,ncol=nrow(combinations)))

    for(i in 1:nrow(combinations)){
      HTRX_matrix[Reduce(intersect,
                         lapply(1:nsnp,function(x)
                         {
                           if(combinations[i,x]=='X')return(1:n);
                           if(combinations[i,x]!='X')return(which(hap1[,x]==as.numeric(
                             combinations[i,x])));
                         })
      ),i]=0.5
      HTRX_matrix2[Reduce(intersect,
                          lapply(1:nsnp,function(x)
                          {
                            if(combinations[i,x]=='X')return(1:n);
                            if(combinations[i,x]!='X')return(which(hap2[,x]==as.numeric(
                              combinations[i,x])));
                          })
      ),i]=0.5
    }

  }

  HTRX_matrix = HTRX_matrix + HTRX_matrix2
  colnames(HTRX_matrix)=Reduce(paste0,combinations)

  #remove haplotypes with frequency=100%
  missing <- vector()
  for(i in 1:nrow(combinations)){
    if(sum(HTRX_matrix[,i])==0||sum(HTRX_matrix[,i])==nrow(HTRX_matrix)){
      missing=c(missing,i)
    }
  }
  if(length(missing)!=0) HTRX_matrix=HTRX_matrix[,-missing];

  #remove rare haplotypes or not
  if(rareremove){
    rare <- vector()
    for(d in 1:ncol(HTRX_matrix)){
      if(sum(HTRX_matrix[,d])<n*rare_threshold){
        rare <- c(rare,d)
      }
    }
    if(length(rare)!=0){
      HTRX_matrix <- HTRX_matrix[,-rare]
    }
  }

  return(HTRX_matrix)
}

#' @rdname make_htrx
#' @export
make_htr<-function(hap1,hap2=hap1,rareremove=FALSE,rare_threshold=0.001){
  ## Make a HTR feature matrix
  n=n_total=dim(hap1)[1]
  nsnp=dim(hap1)[2]
  ## HTR
  combinations=expand.grid(lapply(1:nsnp,function(x)c(0,1)))
  HTR_matrix <- as.data.frame(matrix(0,nrow=n_total,ncol=nrow(combinations)))

  HTR_matrix2 <- as.data.frame(matrix(0,nrow=n_total,ncol=nrow(combinations)))

  for(i in 1:nrow(combinations)){
    ##the genotype must be 0 for reference allele and 1 for alternative allele
    HTR_matrix[Reduce(intersect,
                      lapply(1:nsnp,function(x)
                      {
                        which(hap1[,x]==combinations[i,x])
                      })),i]=0.5
    HTR_matrix2[Reduce(intersect,
                       lapply(1:nsnp,function(x)
                       {
                         which(hap2[,x]==combinations[i,x])
                       })),i]=0.5
  }
  HTR_matrix = HTR_matrix + HTR_matrix2
  colnames(HTR_matrix)=Reduce(paste0,combinations)

  #remove haplotypes with frequency=100%
  missing <- vector()
  for(i in 1:nrow(combinations)){
    if(sum(HTR_matrix[,i])==0||sum(HTR_matrix[,i])==nrow(HTR_matrix)){
      missing=c(missing,i)
    }
  }
  if(length(missing)!=0) HTR_matrix=HTR_matrix[,-missing];

  ##remove rare haplotypes or not
  if(rareremove){
    rare <- vector()
    for(d in 1:ncol(HTR_matrix)){
      if(sum(HTR_matrix[,d])<n*rare_threshold){
        rare <- c(rare,d)
      }
    }
    if(length(rare)!=0){
      HTR_matrix <- HTR_matrix[,-rare]
    }
  }

  return(HTR_matrix)
}

#' @rdname make_htrx
#' @export
make_snp<-function(hap1,hap2=hap1,rareremove=FALSE,rare_threshold=0.001){
  ## Make a SNP feature matrix
  ## Convenience function to use the same format for making a SNP matrix from two haplotype matrices
  n=dim(hap1)[1]
  SNP=hap1+hap2

  #remove rare SNPs or not
  if(rareremove){
    rare <- vector()
    for(d in 1:ncol(SNP)){
      if(sum(SNP[,d])<n*rare_threshold){
        rare <- c(rare,d)
      }
    }
    if(length(rare)!=0){
      SNP <- SNP[,-rare]
    }
  }

  return(SNP)
}

Try the HTRX package in your browser

Any scripts or data that you put into this service are public.

HTRX documentation built on May 29, 2024, 5:53 a.m.