R/LDAS.R
In LDAandLDAS: Linkage Disequilibrium of Ancestry (LDA) and LDA Score (LDAS)

Documented in LDAS

#' @title LDA Score
#' @description Computation of the Linkage Disequilibrium of Ancestry Score (LDAS) of each single nucleotide polymorphism (SNP).
#' @param LDA_data a data frame of LDA between all pairs of SNPs that are within the 'window'.
#' SNPs should be in the decreasing order of physical position on a chromosome for both rows and columns.
#' This is the output from \code{\link{LDA}}.
#' @param map a data frame of the physical position and genetic distance of
#' all the SNPs contained in 'LDA_data'.
#' 'map' contains two columns.
#' The first column is the physical distance (unit: b) of SNPs in the decreasing order.
#' The second column is the genetic distance (unit: cM) of SNPs.
#' @param SNPidx a numeric vector denoting the LDAS of which SNPs
#' (located in which rows of the map) are computed.
#' All the SNPs' indices in the LDA_data should be included in SNPidx.
#' By default, SNPidx=NULL which specifies the LDAS of all the SNPs in the map will be computed.
#' @param window a positive number specifying the genetic distance that
#' the LDA score of each SNP is computed within. By default, window=5.
#' @param runparallel logical. Parallel programming or not (note: unavailable for Windows system).
#' @param mc.cores a positive number specifying the number of cores used for parallel programming.
#' By default, mc.cores=8.
#' @param verbose logical. Print the process of calculating the LDA score for the i-th SNP.
#'
#' @return a data frame of the LDA score and its upper and lower bound
#' at the physical position of each SNP.
#' @details LDA score is the total amount of genome in LDA with each SNP
#' (measured in recombination map distance).
#' A low LDA score is the signal of “recombinant favouring selection”.
#' @references Barrie, W., Yang, Y., Irving-Pease, E.K. et al. Elevated genetic risk for multiple sclerosis emerged in steppe pastoralist populations. Nature 625, 321–328 (2024).
#'
#' @examples
#' \donttest{
#' # visualize the painting data
#' # Painting data are the average probabilities of different populations
#' head(LDAandLDAS::example_painting_p1[1:5,],10)
#'
#' # combine the painting data for two ancestries as a list
#' # to make to input data for function 'LDA'.
#' paintings=list(LDAandLDAS::example_painting_p1,
#'           LDAandLDAS::example_painting_p2)
#'
#' # calculate the pairwise LDA of SNPs
#' LDA_result <- LDA(paintings)
#'
#' # map is the data containing two columns
#' # The first column is the physical position (unit: b) (decreasing order)
#' # The second column is the recombination distance (unit: cM) of the SNPs
#' head(LDAandLDAS::example_map,10)
#'
#' # calculate the LDA score for the SNPs
#' LDA_score <- LDAS(LDA_result,map=LDAandLDAS::example_map,window=10)
#'
#' #visualize the LDA scores
#' plot(x=LDA_score$SNP,y=LDA_score$LDAS)
#'
#' #' # if we only want to calculate the LDA of the 76th-85th SNP in the map
#' # based on the 31st-130th SNP, which aims at saving the memory
#' paintings2=list(LDAandLDAS::example_painting_p1[,31:130],
#'                 LDAandLDAS::example_painting_p2[,31:130])
#' # note that the 76th-85th SNP in the original dataset is only the
#' # (76-30)th-(85-30)th SNP in the new dataset (paintings2)
#' LDA_result2 <- LDA(paintings2,SNPidx=76:85-30)
#'
#' # calculate the LDA score for the SNPs
#' LDA_score2 <- LDAS(LDA_result2,SNPidx=76:85-30,
#'                    map=LDAandLDAS::example_map[31:130,],window=5)
#' }
#' @export

LDAS <- function(LDA_data,SNPidx=NULL,map,window=5,runparallel=FALSE,mc.cores=8,verbose=TRUE){
  n_snp <- nrow(map)

  if(is.null(SNPidx)) SNPidx=1:nrow(map)

  LDA_score <- vector()
  LDA_score_max <- vector()
  LDA_score_min <- vector()

  cal_ldas <- function(j){
    if(verbose) cat("Calculating LDA score of SNP",j,'\n');

    LDA_idx <- which(SNPidx==j)

    max_window=max(c(map[1,2]-map[j,2],map[j,2]-map[n_snp,2]))
    if(max_window<window) window=max_window

    window=window-1e-6 ## avoid genetic distance equals the window

    # the number of SNPs within 5cM window left and right to the SNP
    # Note: n_snps1 is left in our data but right in reality
    n_snps1<-length(which(map[1:j,2]<=(map[j,2]+window)))-1
    n_snps2<-length(which(map[j:n_snp,2]>=(map[j,2]-window)))-1

    # the genetic distance gap between every two SNPs
    if(j==1){
      snp_gd_gap <- abs(map[(j+1):(j+n_snps2),2]-
                          map[j:(j+n_snps2-1),2])
      LDA_use <- as.numeric(c(1,LDA_data[(j+1):(j+n_snps2),LDA_idx]))
    }else{
      if(j==n_snp){
        snp_gd_gap <- abs(map[(j-n_snps1):(j-1),2]-
                            map[(j-n_snps1+1):j,2])
        LDA_use <- as.numeric(c(LDA_data[(j-n_snps1):(j-1),LDA_idx],1))
      }else{
        snp_gd_gap <- c(abs(map[(j-n_snps1):(j-1),2]-
                              map[(j-n_snps1+1):j,2]),
                        abs(map[(j+1):(j+n_snps2),2]-
                              map[j:(j+n_snps2-1),2]))
        LDA_use <- as.numeric(c(LDA_data[(j-n_snps1):(j-1),LDA_idx],1,LDA_data[(j+1):(j+n_snps2),LDA_idx]))
      }
    }

    #LDA_use <- LDA_data[(j-n_snps1):(j+n_snps2),j] #use these LDA data

    # the average LDA of two SNPs with respect to the jth SNP
    lda_score_j <- sapply(1:(n_snps1+n_snps2),function(n)(LDA_use[n]+LDA_use[n+1])/2)
    lda_score_max_j <- sapply(1:(n_snps1+n_snps2),function(n)max(LDA_use[n],LDA_use[n+1]))
    lda_score_min_j <- sapply(1:(n_snps1+n_snps2),function(n)min(LDA_use[n],LDA_use[n+1]))

    #left end in reality but right end in the data
    if(map[j,2]<window+map[n_snp,2]){

      LDA_right=as.numeric(LDA_data[j-n_snps1-1,LDA_idx])
      gd_gap=map[j-n_snps1-1,2]-map[j-n_snps1,2]
      gd_to_end=window-map[j-n_snps1,2]+map[j,2]
      LDA_right_ave = (LDA_use[1]+gd_to_end/gd_gap*(LDA_right-LDA_use[1]))/2

      LDA_right_ave_max = max(LDA_right,LDA_use[1])
      LDA_right_ave_min = min(LDA_right,LDA_use[1])


      gd_left <- map[n_snp,2]
      n_snps3<-n_snps1-length(which(map[1:j,2]<(map[j,2]*2-gd_left)))+1
      gd_gap_add <- map[n_snp-n_snps1-n_snps2+n_snps3-1,2]-(map[j,2]*2-gd_left)
      lda_add <- (LDA_use[n_snps1+n_snps2+1]+LDA_use[n_snps3])/2
      lda_add_max <- max(LDA_use[n_snps1+n_snps2+1],LDA_use[n_snps3])
      lda_add_min <- 0

      if(n_snps3==1){
        snp_gd_gap <- c(gd_to_end,snp_gd_gap,gd_gap_add,gd_to_end)
        lda_score_j <- c(LDA_right_ave,lda_score_j,lda_add,LDA_right_ave)
        lda_score_max_j <- c(LDA_right_ave_max,lda_score_max_j,lda_add_max,LDA_right_ave_max)
        lda_score_min_j <- c(LDA_right_ave_min,lda_score_min_j,lda_add_min,LDA_right_ave_min)

      }else{
        if(n_snps3!=0){
          snp_gd_gap <- c(gd_to_end,snp_gd_gap,gd_gap_add,
                          snp_gd_gap[1:(n_snps3-1)],gd_to_end)
          lda_score_j <- c(LDA_right_ave,lda_score_j,lda_add,
                           lda_score_j[1:(n_snps3-1)],LDA_right_ave)
          lda_score_max_j <- c(LDA_right_ave_max,lda_score_max_j,lda_add_max,
                               lda_score_max_j[1:(n_snps3-1)],LDA_right_ave_max)
          lda_score_min_j <- c(LDA_right_ave_min,lda_score_min_j,lda_add_min,
                               lda_score_min_j[1:(n_snps3-1)],LDA_right_ave_min)
        }else
        {
          snp_gd_gap <- c(gd_to_end,snp_gd_gap)
          lda_score_j <- c(LDA_right_ave,lda_score_j)
          lda_score_max_j <- c(LDA_right_ave_max,lda_score_max_j)
          lda_score_min_j <- c(LDA_right_ave_min,lda_score_min_j)

          gd_to_end=gd_left+window-map[j,2]
          gd_gap=map[j-n_snps1-1,2]-map[j,2]-window+gd_to_end
          LDA_right_ave = (LDA_use[n_snps1+n_snps2+1]+
                             gd_to_end/gd_gap*(LDA_right-LDA_use[n_snps1+n_snps2+1]))/2

          LDA_right_ave_max = max(LDA_right,LDA_use[n_snps1+n_snps2+1])

          LDA_right_ave_min = min(LDA_right,LDA_use[n_snps1+n_snps2+1])

          snp_gd_gap <- c(snp_gd_gap,gd_to_end)
          lda_score_j <- c(lda_score_j,LDA_right_ave)
          lda_score_max_j <- c(lda_score_max_j,LDA_right_ave_max)
          lda_score_min_j <- c(lda_score_min_j,LDA_right_ave_min)
        }
      }

    }else if(map[j,2]>window+map[n_snp,2]){
      #right end
      if(map[j,2]>map[1,2]-window){

        #left in real, but right in our data (right is small pd)
        LDA_left=as.numeric(LDA_data[j+n_snps2+1,LDA_idx])
        gd_gap=map[j+n_snps2,2]-map[j+n_snps2+1,2]
        gd_to_end=map[j+n_snps2,2]-map[j,2]+window
        LDA_left_ave = (LDA_use[n_snps1+n_snps2+1]+
                          gd_to_end/gd_gap*(LDA_left-LDA_use[n_snps1+n_snps2+1]))/2
        LDA_left_ave_max = max(LDA_left,LDA_use[n_snps1+n_snps2+1])
        LDA_left_ave_min = min(LDA_left,LDA_use[n_snps1+n_snps2+1])

        gd_right <- map[1,2]
        #Assume A is the gd of the jth SNP, and B is the gd of the closest SNP to right end
        #Then n_snps3 is the number of SNPs within (A-5,A-(B-A))
        n_snps3<-n_snps2-length(which(map[j:n_snp,2]>(map[j,2]*2-gd_right)))+1
        gd_gap_add <- (map[j,2]*2-gd_right)-map[n_snps1+n_snps2-n_snps3+2,2]
        lda_add <- (LDA_use[1]+LDA_use[n_snps1+n_snps2-n_snps3+2])/2
        lda_add_max <- max(LDA_use[1],LDA_use[n_snps1+n_snps2-n_snps3+2])
        lda_add_min <- 0

        if(n_snps3==1){
          snp_gd_gap <- c(gd_to_end,gd_gap_add,snp_gd_gap,gd_to_end)
          lda_score_j <- c(LDA_left_ave,lda_add,lda_score_j,LDA_left_ave)
          lda_score_max_j <- c(LDA_left_ave_max,lda_add_max,lda_score_max_j,LDA_left_ave_max)
          lda_score_min_j <- c(LDA_left_ave_min,lda_add_min,lda_score_min_j,LDA_left_ave_min)
        }else{
          if(n_snps3!=0){
            snp_gd_gap <- c(gd_to_end,snp_gd_gap[(n_snps1+n_snps2-n_snps3+2):(n_snps1+n_snps2)],
                            gd_gap_add,snp_gd_gap,gd_to_end)
            lda_score_j <- c(LDA_left_ave,
                             lda_score_j[(n_snps1+n_snps2-n_snps3+2):(n_snps1+n_snps2)],
                             lda_add,lda_score_j,LDA_left_ave)
            lda_score_max_j <- c(LDA_left_ave_max,
                                 lda_score_max_j[(n_snps1+n_snps2-n_snps3+2):(n_snps1+n_snps2)],
                                 lda_add_max,lda_score_max_j,LDA_left_ave_max)
            lda_score_min_j <- c(LDA_left_ave_min,
                                 lda_score_min_j[(n_snps1+n_snps2-n_snps3+2):(n_snps1+n_snps2)],
                                 lda_add_min,lda_score_min_j,LDA_left_ave_min)
          }else
          {
            snp_gd_gap <- c(snp_gd_gap,gd_to_end)
            lda_score_j <- c(lda_score_j,LDA_left_ave)
            lda_score_max_j <- c(lda_score_max_j,LDA_left_ave_max)
            lda_score_min_j <- c(lda_score_min_j,LDA_left_ave_min)

            gd_to_end=window+map[j,2]-gd_right
            gd_gap=map[j,2]-window-map[j+n_snps1+1,2]+gd_to_end
            LDA_left_ave = (LDA_use[1]+gd_to_end/gd_gap*(LDA_left-LDA_use[1]))/2
            LDA_left_ave_max = max(LDA_use[1],LDA_left)
            LDA_left_ave_min = min(LDA_use[1],LDA_left)

            snp_gd_gap <- c(gd_to_end,snp_gd_gap)
            lda_score_j <- c(LDA_left_ave,lda_score_j)
            lda_score_max_j <- c(LDA_left_ave_max,lda_score_max_j)
            lda_score_min_j <- c(LDA_left_ave_min,lda_score_min_j)
          }
        }

      }else{
        LDA_right=as.numeric(LDA_data[j-n_snps1-1,LDA_idx])
        gd_gap_right=map[j-n_snps1-1,2]-map[j-n_snps1,2]
        gd_to_end_right=window-map[j-n_snps1,2]+map[j,2]
        LDA_right_ave = (LDA_use[1]+gd_to_end_right/gd_gap_right*(LDA_right-LDA_use[1]))/2
        LDA_right_ave_max = max(LDA_use[1],LDA_right)
        LDA_right_ave_min = min(LDA_use[1],LDA_right)


        LDA_left=as.numeric(LDA_data[j+n_snps2+1,LDA_idx])

        gd_gap_left=map[j+n_snps2,2]-map[j+n_snps2+1,2]
        gd_to_end_left=map[j+n_snps2,2]-map[j,2]+window
        LDA_left_ave = (LDA_use[n_snps1+n_snps2+1]+
                          gd_to_end_left/gd_gap_left*(LDA_left-LDA_use[n_snps1+n_snps2+1]))/2
        LDA_left_ave_max = max(LDA_use[n_snps1+n_snps2+1],LDA_left)
        LDA_left_ave_min = min(LDA_use[n_snps1+n_snps2+1],LDA_left)

        snp_gd_gap <- c(gd_to_end_right,snp_gd_gap,gd_to_end_left)
        lda_score_j <- c(LDA_right_ave,lda_score_j,LDA_left_ave)
        lda_score_max_j <- c(LDA_right_ave_max,lda_score_max_j,LDA_left_ave_max)
        lda_score_min_j <- c(LDA_right_ave_min,lda_score_min_j,LDA_left_ave_min)
      }
    }

    #if(length(lda_score_j)!=length(snp_gd_gap)){print(j)}
    # compute the LDA score
    LDA_score_temp=sum(lda_score_j*snp_gd_gap)
    LDA_score_max_temp=sum(lda_score_max_j*snp_gd_gap)
    LDA_score_min_temp=sum(lda_score_min_j*snp_gd_gap)
    return(c(LDA_score_temp,LDA_score_max_temp,LDA_score_min_temp))
  }


  if(runparallel){
    LDA_score <- cbind(map[SNPidx,],t(as.data.frame(parallel::mclapply(SNPidx,cal_ldas,mc.cores=mc.cores))))
  }else{
    LDA_score <- cbind(map[SNPidx,],t(as.data.frame(lapply(SNPidx,cal_ldas))))
  }


  colnames(LDA_score)[3]='LDAS'
  colnames(LDA_score)[4]='LDAS_max'
  colnames(LDA_score)[5]='LDAS_min'
  return(LDA_score)
}