rLCLAE: Implement Low Coverage Local Ancestry Estimation in R

Documented in getmode gettract

#library(data.table)

#' getmode
#'
#' @param v a input
#'
#' @return nothing
# @export
#'
# @examples
getmode <- function(v) {
  uniqv <- unique(v)
  uniqv[which.max(tabulate(match(v, uniqv)))]
}

utils::globalVariables(c(".", "chrom","loc_Dummy", "loc_Minus100", "loc_Plus100",
                         "nxt", "nxt_chrom", "nxt_state",
                         "snp"))

#' @title gettract
#' @description Generating tract from snp level ancestry call.
#' @param datadir input snp level ancestry call file
#' @param outputdir output tract file
#' @param indiv_name The unique identifier for the individual, which could be numeric index (e.g. 1) or character sample name (e.g. "sample_1")
#' @param chr the chromosome it is on
#' @param chrlength the length of the chromosome
#' @param value window size
#' @param mode_n the minimum percentage for a call to be made using majority rule
#' @param min_n the minimum number of SNPs within the window to make a call
#' @param exclude how much of the start/end of each chromosome to ignore
#'
#' @return An output file including tract and majority rule
#' @export
#' @import Rcpp
#' @import data.table
#' @importFrom graphics hist
#' @importFrom stats median
#' @importFrom utils read.delim write.table
# @examples
gettract <- function(datadir,outputdir,
                     indiv_name='50',
                     chr='chrX',chrlength='142711496',
                     value=35000,
                     mode_n=0.5,
                     min_n=20,
                     exclude=50000){

# This function could only run one chromosome at a time
filedir<-datadir
maj_dir<-paste(outputdir,indiv_name,'MajorityRule.txt',sep = '.')
tractdir<-paste(outputdir,indiv_name,'tracts.txt',sep = '.')

# Prepare a data column with snp, call, chrom, indiv_name
fread(filedir,showProgress = T,stringsAsFactors = F) -> indv_ #chr17 pos call
data.table(indv_,stringsAsFactors = F)->indv
# print(str(indv))
#indv$chrom <- 'chrX' #add a line full of chrX
indv$chrom<-chr
# print(head(indv))
colnames(indv) <- c('snp', 'call', 'chrom')
indv$snp<-as.numeric(as.character(indv$snp))
indv$call<-as.numeric(as.character(indv$call))
indv$indiv <- as.character(indiv_name)
# print(str(indv))
# print(unique(indv$call))
chroms<-cbind(chr,chrlength)
j=1

indv -> tmp
subset(tmp, !(tmp$call == -1)) -> tmp
tmp -> indv
# print(str(tmp))

if (nrow(tmp) > 0) {
  # index temporary file of chromosome calls and use setkeys to prepare the data for matching sites within $VALUE of each SNP
  # run match to get sets of sites within `value` of each SNP
  # get the mode of all calls, the total number of calls, and the number of calls which were the mode.
  tmp[,loc_Dummy := snp]; tmp[,.(snp, call)] -> tmp2
  tmp2[,loc_Plus100 := snp + value]; tmp2[,loc_Minus100 := snp - value]

  # print(str(tmp))
  # print(str(tmp2))

  setkey(tmp,snp,loc_Dummy); setkey(tmp2,loc_Minus100, loc_Plus100)
  #print(nrow(tmp))
  message(paste("Now doing matches for ", chroms[j,1], "....", sep=""))
  Matches <- foverlaps(tmp[,.(snp, loc_Dummy)], tmp2[,.(loc_Minus100,loc_Plus100,call)])
  Matches[,.(n = .N, mode = getmode(call), n_mode=sum(call==getmode(call))), by = .(snp)] -> i1
  #rm(Matches); gc(); rm(tmp2)
  print("done with matches!")
  i1$n_mode<-as.numeric(as.character(i1$n_mode))
  i1$n<-as.numeric(as.character(i1$n))
  i1$perc <- i1$n_mode/i1$n

  # print("head of i1")
  # print(head(i1))

  # remove sites where there is not a consensus call by majority rule (at least `mode_n` percent of calls with the same state) or enough nearby ancestry informative sites (`min_n` within `value`)
  subset(tmp, i1$perc >= mode_n & i1$n >= min_n) -> tmp; subset(i1, i1$perc >= mode_n & i1$n >= min_n) -> i1



  # Some format adjustment
  tmp$snp<-as.numeric(as.character(tmp$snp))
  i1$snp<-as.numeric(as.character(i1$snp))
  chroms[j,2]<-as.numeric(as.character(chroms[j,2]))
  if(nrow(i1)<=0){
    warning("No consensus call under mode",mode_n,"and min snp number",min_n)
  }
  else if (nrow(i1) > 0) {
    # merge majority rule calls for this chromosome to the growing file, removing the first and last X kb (set by "exclude").
    #if (j==1) {cbind(tmp,i1)[tmp$snp >= exclude & tmp$snp <= chroms[j,2]-exclude,-c(5:6)] -> maj_rule} else {cbind(tmp,i1) -> te; rbind(maj_rule, te[tmp$snp >= exclude & tmp$snp <= chroms[j,2]-exclude,-c(5:6)]) -> maj_rule}
    cbind(tmp,i1)[as.numeric(tmp$snp) >= exclude & as.numeric(tmp$snp) <= as.numeric(chroms[j,2])-exclude,-c(5:6)] -> maj_rule

    # progress to calling ancestry tracts
    cbind(tmp, i1) -> modes; #rm(tmp); rm(i1); if (j > 1) {rm(te)}

    modes$nxt <- c(as.numeric(modes$snp[-1]),max(modes$snp))
    modes$nxt_chrom <- c(as.character(modes$chrom[-1]),modes$chrom[nrow(modes)])
    modes$nxt_state <- c(modes$mode[-1],modes$call[nrow(modes)])

    # keep only sites where this site is different than the next one.
    # when this occurs, the first row is now the first break point
    subset(modes, !(modes$mode == modes$nxt_state & modes$chrom == modes$nxt_chrom)) -> blocks
    blocks[,.(chrom, nxt_chrom, snp, nxt, mode, nxt_state)] -> blocks

    # nxt_state is the one that defines that block
    # add the first block from position 1 to the first break pt
    first <- as.data.frame(t(matrix(c(blocks$chrom[1], blocks$chrom[1], 1, blocks$snp[1], modes$mode[1], blocks$mode[1]))))
    colnames(first) <- colnames(blocks); rbind(first, blocks) -> blocks; #rm(first)

    last <- as.data.frame(matrix(c(as.character(chroms[j,1]), as.character(chroms[j,1]), max(modes$snp), chroms[j,2], modes$nxt_state[nrow(modes)], modes$nxt_state[nrow(modes)]), ncol=6)); as.numeric(as.character(last$V3)) -> last$V3; as.numeric(as.character(last$V4)) -> last$V4; as.numeric(as.character(last$V5)) -> last$V5;as.numeric(as.character(last$V6)) -> last$V6;
    colnames(last) <- colnames(blocks); rbind(blocks, last) -> blocks; #rm(last)

    blocks$chrom <- blocks$nxt_chrom <- as.character(chroms[j,1]) # If you want the numbers, this should be `j` instead of `chroms$V1[j]`

    # write and read back in to reset formatting
    #write.table(blocks, "./tmp2.INDIV.txt", row.names=F, col.names=T, sep="\t")
    #read.delim("./tmp2.INDIV.txt") -> blocks

    blocks$brk <- (as.numeric(as.character(blocks$nxt)) + as.numeric(as.character(blocks$snp)))/2
    blocks$brk[1] <- 1  # start at the first bp of the chromosome
    modes$snp[1] -> blocks$brk[1] # assume the a single ancestry tract up until the first called SNP

    chroms[j,2]  -> blocks$brk[nrow(blocks)] # extend the last tract until the end of the chromosome
    blocks$nxt_brk <- c(as.numeric(as.character(blocks$brk[-1])),'end')
    blocks[-nrow(blocks),] -> blocks  # last line no longer matters, we've already extended from the last SNP to the end of the chromosome

    blocks$length <- as.numeric(blocks$nxt_brk) - as.numeric(blocks$brk)
    #blocks$length[1] <- NA; blocks$length[nrow(blocks)-1] <- NA



    # remove uncertainty because we really don't need this now that I'm not doing as much methods testing.
    #uncertainty of the break points to each side
    blocks$u_prev <- (as.numeric(blocks$nxt) - as.numeric(blocks$snp))/2
    blocks$u_prev[1] <- NA
    blocks$u_next <- c(as.numeric(blocks$u_prev[-1]),NA)
    ## blocks u_prev is the number of bases at the start of that tract which were inferred (i.e. before the first AIM with that ancestry call)
    ## blocks u_next is the same thing for the end of that tract

    # print("Head of blocks")
    # print(head(blocks))

    #Blocks starts at the first SNP and goes to the last one.

    t <- blocks[, c(1, 7, 8, 6, 9:11)]
    colnames(t) <- c("chrom", "start", "end", "state", "length", "prev_inferred", "after_inferred")
    t$name <- indiv_name
    #t$length2 <- as.numeric(t$end) - as.numeric(t$start)
    t$end <- as.numeric(as.character(t$end))
    t$start <- as.numeric(as.character(t$start))

    # print("Head of t")
    # print(head(t))
    # print(str(t))

    # fix the extremes of `t`, cropping the first and last `exclude`bp from the chromosome
    t <- subset(t, t$start <= (as.numeric(chroms[j,2])-exclude) & t$end >= exclude)
    t$end <- as.numeric(as.character(t$end))
    t$start <- as.numeric(as.character(t$start))
    t$start[t$start < exclude] <- exclude
    t$end[t$end > (as.numeric(chroms[j,2])-exclude)] <- (as.numeric(chroms[j,2])-exclude)
    t$length[c(1,nrow(t))] <- NA

    # print("Head of t")
    # print(head(t))
    # print(str(t))

    # merge with list of tracts per individual
    if (j==1) {t -> tracts} else {rbind(tracts,t) -> tracts}
    if(nrow(tracts) == 1 & length(unique(maj_rule$mode))==1){
      tracts$state<-maj_rule$mode[1]
      tracts$length<-tracts$end - tracts$start
      message("Only one tract, using the unique value.")
    }
    #rm(t); rm(modes)
    #rm(blocks)
  }
}
if(nrow(maj_rule)>0&&nrow(tracts>0)){
  write.table(maj_rule, maj_dir, row.names=F, col.names=T, quote=F, sep="\t")
  write.table(tracts, tractdir, row.names=F, col.names=T, quote=F, sep="\t")
  #save.image(file = "My_Object.RData")
  }
}