R/kir.R

Defines functions KIR_haplotype_Tel KIR_haplotype_Cen KIR_present KIR3DL1_HLA_B_inhibiting KIR3DL1_3DS1_assignment KIR_det_GCN

Documented in KIR3DL1_3DS1_assignment KIR3DL1_HLA_B_inhibiting KIR_det_GCN KIR_haplotype_Cen KIR_haplotype_Tel KIR_present

#' Determine KIR Gene Copy Number
#'
#' @param kir_string String with KIR info with the first field only (three characters), example "004+003|006+010", "NEG", "POS"
#' @param return_numeric If set to TRUE, returns numeric GCN instead of character, sets "POS" to NA and 2|3 to 2.5 etc
#'
#' @return String vector with the gene copy number, with "x" if "POS", "0" if NEG, NA if NA or empty string. If \code{return_numeric = TRUE}
#' instead returns a numeric vector, where POS results are NA and "2|3" are 2.5
#'
#' @examples
#'
#' str(KIR_det_GCN("004+003|006+010"))
#' str(KIR_det_GCN("004+003|006+010", return_numeric = TRUE))
#' KIR_det_GCN("POS")
#' KIR_det_GCN("POS", return_numeric = TRUE)
#' 
#' dat <- data.frame(kircol = c("004+003", "001", "008+008+008", "POS", "NEG", "", NA, "001|001+002"))
#' KIR_det_GCN(dat$kircol)
#' KIR_det_GCN(dat$kircol, return_numeric = TRUE)
KIR_det_GCN = function(kir_string, return_numeric = FALSE){
  require(dplyr)

  kir_string <- as.character(kir_string)
  
  # Call this on each KIR string
  det_clean_GCN <- function(y) {
    if (is.na(y) | grepl("^$", y)) return(NA_character_)
    if (grepl("POS", y)) return("x")
    if (grepl("NEG", y)) return("0")
    
    split = unlist(strsplit(y,split = "[|]"))
    clean_gcn = nchar(split) - nchar(gsub("[+]","",split))+1
    clean_gcn = ifelse(min(clean_gcn) == max(clean_gcn),min(clean_gcn),paste(min(clean_gcn),max(clean_gcn),sep="|"))
    return(as.character(clean_gcn))
  }

  gcn <- unlist(lapply(X = kir_string, FUN = det_clean_GCN))
  

  if (return_numeric == TRUE) {
    # When running src/KIR_investigation.Rmd from 18-01 project error is when hitting gcn==1|2 statement no TRUE/FALSE value
    gcn_num <- function(y) {
      if(is.na(y)) return(NA_real_)
      if(y == "1|2") return(1.5)
      if(y == "2|3") return(2.5)
      if(y == "3|4") return(3.5)
      return(suppressWarnings(as.numeric(y)))
    }
    gcn <- unlist(lapply(X = gcn, FUN = gcn_num))
    return(gcn)
  } else {return(gcn)}
}


#' Classifies KIR 3DL1 & 3DS1 alleles into KIR3DL1-H, -L, -N
#' 
#' Uses the categorization from \code{\link{ASSIGN_KIR3DL1}}
#' 
#' @param KIR3DL1_string KIR3DL1 string vector with first fields only, i.e. "001/095+004". Can contain "NEG" and "POS".
#' @param KIR3DS1_string KIR3DS1 string vector in same format
#'
#' @return String vector with "KIR3DL1-H", "KIR3DL1-L", "KIR3DL1-N" or "unknown"
#' 
#' @seealso \code{\link{ASSIGN_KIR3DL1}}
#'
#' @examples
#'
#' dat <- data.frame(kir_3DL1 = c("001/095+004", "POS", "005"), kir_3DS1 = c("NEG", "NEG", "013/107"), stringsAsFactors = FALSE)
#' KIR3DL1_3DS1_assignment(dat$kir_3DL1, dat$kir_3DS1)
KIR3DL1_3DS1_assignment <- function(KIR3DL1_string, KIR3DS1_string) {
  
  # If passed as factor variables pass to character
  KIR3DL1_string <- as.character(KIR3DL1_string)
  KIR3DS1_string <- as.character(KIR3DS1_string)
  
  # This whole function was written by Henning for reading in LSL data in 1701 project. It's just inserted here pretty 
  # much as-is, with just minor tweaks to inputs and outputs (hence why the comments are in German).
  # Requires that data/ASSIGN_KIR3DL1.RData has been loaded, should have been done when loading package
  
  if (!exists("ASSIGN_KIR3DL1")) {
    load("data/ASSIGN_KIR3DL1.RData")
  }
  
  
  
  # Sub-functions -----------------------------------------------------------
  
  KIR_class = function(kir_glstring){
    kir_result = lapply(kir_glstring, Extract_GLstring_KIR)
    #kir_result <- Extract_GLstring_KIR(kir_glstring)
    #KIR_result_class(kir_result)
    out = lapply(kir_result, KIR_result_class)
    return(out)
  }
  
  
  ## Aufl?sen des GL-Strings
  Extract_GLstring_KIR = function(kir_glstring){
    # 1. nach | Phasings auftrennen,
    kir_result = {}
    kir_glstring = gsub("[*]","",kir_glstring) # "*"-Zeichen l?schen, wenn vorhanden
    x1 = unlist(strsplit(kir_glstring, split = "|", fixed = T))
    for (i in 1:length(x1)){
      # 2. nach + beiden Allelen auftrennen
      x2 = unlist(strsplit(x1[i], split = "+", fixed = T))
      for (j in 1:length(x2)){
        # 3. nach / verschiedenen Allele auftrennen
        x3 = unlist(strsplit(x2[j], split = "/", fixed = T))
        #paste(unique(substr(x3,start = 1, stop = 3)), collapse = "/")
        x3 = unique(substr(x3,start = 1, stop = 3))
        for (k in 1:length(x3)) kir_result = rbind(kir_result,c(i,j,x3[k]))
      }
    }
    kir_result = as.data.frame(kir_result)
    names(kir_result) = c("Phase","Allel","Digit")
    kir_result = merge(kir_result, ASSIGN_KIR3DL1, all.x = T)
    kir_result$Class[is.na(kir_result$Class)] = 0
    # kir_result$Class = sapply(kir_result$Digit,function(x){
    #   y = ASSIGN_KIR3DL1$Class[ASSIGN_KIR3DL1$Allele==x]
    #   if (length(y) == 0) y = 0
    #   # if (y == 0 & x !="NEW") print(x) # Ausgabe, wenn ein neues Allel gefunden wird
    #   return(y)
    # })
    return(kir_result)
  }
  
  
  
  ## Zuordnungstabelle f?r die KIR3DL1/KIR3DS1-Klassifizierung
  assignment_KIR3DL1 = data.frame(subtype = c("l","h","s","n"), 
                                  num = 1:4, 
                                  overall = c("KIR3DL1-L","KIR3DL1-H",rep("KIR3DL1-N",2)))
  
  KIR_result_class = function(kir_result){
    xx = subset(kir_result, select = - Digit)
    xx = xx[!duplicated(xx),] #dadurch werden mehrdeutige Allele, die aber diesselbe Subtyp-Klasse l,h,s oder n haben, zusammengefasst  
    result = plyr::ddply(xx,c("Phase","Allel"),function(z){
      c(subtype = paste(z$Class,collapse = "/"))
    })
    output = plyr::ddply(result,c("Phase"),function(z){
      paste(z$subtype,collapse = "+")
    })
    output = paste(output$V1,collapse = "|")  
    
    
    if (any(result$subtype == 0) | any(grepl("/",result$subtype))){
      if (any(result$subtype == 0)){
        class = "unclassified allel"
        overall = "unknown"
      }
      if (any(grepl("/",result$subtype))){
        class = "ambiguites"
        overall = "unknown"
      }
    } else {
      result$Phase = as.numeric(result$Phase)
      ## !!! Annahme: Bei 1 Allel wird das zweite zu "n" gesetzt
      for (i in 1:max(result$Phase)){
        if (length(result$Phase[result$Phase == i]) == 1){
          result = rbind(result,c(i,2,"n"))
        }
      }
      result = plyr::join(result,assignment_KIR3DL1,by = "subtype") 
      ## !!! Annahme: Bei mind. 3 Allelen werden die beiden st?rksten Inhibierungseffekte gesucht
      result_phase = plyr::ddply(result, "Phase", function(z){
        z = z[order(z$num)[1:2],]
        c(class_i = as.character(paste(z$subtype,collapse = "/")),
          overall_i = as.character(z$overall[1]))
      })
      class = ifelse(length(unique(result_phase$class_i))==1,result_phase$class_i[1],"phasing")
      overall = ifelse(length(unique(result_phase$overall_i))==1,result_phase$overall_i[1],"unknown") 
    }
    return(list(as.character(output),as.character(class),as.character(overall)))
  }
  
  # main call ---------------------------------------------------------------
  
  
  kir3dl1.3ds1 = ifelse(KIR3DS1_string == "NEG", KIR3DL1_string, paste(KIR3DL1_string,KIR3DS1_string, sep = "+"))
  kir3dl1.3ds1_GLstring = kir3dl1.3ds1
  
  
  
  ## Wenn kir3ds1 positiv ist oder ein neues Allel typisiert wurde, wird per se ein "s" klassifiziert
  kir3dl1.3ds1_GLstring <- unlist(lapply(kir3dl1.3ds1_GLstring, FUN = function (x) {
    if (grepl("NEW",x) | grepl("POS",x)){
    x2 = gsub("NEW","sss",x)
    x2 = gsub("POS","sss",x2)
    out = paste(x,gsub("NEW","sss",x2), sep = "+")
    return(out)
    } else {
    return(x)
  }
    }))
  
  kir3dl1.3ds1_result  = KIR_class(kir3dl1.3ds1_GLstring)
  
  #kir3dl1.3ds1_diff    = kir3dl1.3ds1_result[[1]]
  #kir3dl1.3ds1_class   = kir3dl1.3ds1_result[[2]]
  #kir3dl1.3ds1_overall = kir3dl1.3ds1_result[[3]]
  
  overall <- unlist(lapply(kir3dl1.3ds1_result, function(x) return(x[[3]])))
  
  return(overall)
  
}

#' Classifies KIR3DL1 and HLA-B into strong/weak/no inhibiting
#' 
#' 
#' @param KIRD3L1_assignment String vector, KIR3DL1 assignment in fromat "KIR3DL1-H", from \code{\link{KIR3DL1_3DS1_assignment}}
#' @param HLA_B_overall String vector with HLA-B Bw classification in format "Bw4 - 80T", from \code{\link{HLA_B_classification}}
#' @param levels Can be set to 2, 3 or 4, default is 2
#'
#' @return If levels = 2, string with "Strong inhibiting", "Weak inhibiting/noninhibiting" or "unknown". If levels = 3, 
#' split into "Weak inhibiting" and "non-inhibiting". If levels = 4, split "noninhibiting" into "Missing ligand" and "Educated, uninhibiting"
#' 
#'
#' @examples
#' dat <- data.frame(assignment_KIR3DL1 = c("KIR3DL1-L", "KIR3DL1-N", "unknown", "unknown"), HLA_B_group = c("Bw4 - 80T", "Bw4 - 80I", NA, "Bw4 - 80I"), stringsAsFactors = FALSE)
#' KIR3DL1_HLA_B_inhibiting(dat$assignment_KIR3DL1, dat$HLA_B_group)
#' KIR3DL1_HLA_B_inhibiting(dat$assignment_KIR3DL1, dat$HLA_B_group, levels = 3)
#' KIR3DL1_HLA_B_inhibiting(dat$assignment_KIR3DL1, dat$HLA_B_group, levels = 4)
KIR3DL1_HLA_B_inhibiting <- function(KIR3DL1_assignment, HLA_B_overall, levels = 2) {
  # If passed as factor columns
  KIR3DL1_assignment <- as.character(KIR3DL1_assignment)
  HLA_B_overall <- as.character(HLA_B_overall)
  
  # dplyr::case_when used instead of lots of ifelse statements. If a left hand side evaluates to TRUE, the right hand side
  # value is returned, otherwise it continues down the list. 
  
  # First constructs the most detailed case, when levels = 4. Then, if levels is set lower it merges groups together.
    out <- case_when(
      is.na(KIR3DL1_assignment) | is.na(HLA_B_overall) ~ NA_character_,
      KIR3DL1_assignment == "unknown" | HLA_B_overall == "unknown" ~ "unknown",
      KIR3DL1_assignment == "KIR3DL1-L" & HLA_B_overall == "Bw4 - 80T" ~ "Strong inhibiting",
      KIR3DL1_assignment == "KIR3DL1-H" & HLA_B_overall == "Bw4 - 80I" ~ "Strong inhibiting",
      KIR3DL1_assignment == "KIR3DL1-L" & HLA_B_overall == "Bw4 - 80I" ~ "Weak inhibiting",
      KIR3DL1_assignment == "KIR3DL1-H" & HLA_B_overall == "Bw4 - 80T" ~ "Weak inhibiting",
      KIR3DL1_assignment == "KIR3DL1-L" & HLA_B_overall == "Bw6" ~ "Missing ligand",
      KIR3DL1_assignment == "KIR3DL1-H" & HLA_B_overall == "Bw6" ~ "Missing ligand",
      KIR3DL1_assignment == "KIR3DL1-N" & HLA_B_overall == "Bw6" ~ "Missing ligand",
      KIR3DL1_assignment == "KIR3DL1-N" & HLA_B_overall == "Bw4 - 80I" ~ "Educated, Uninhibited",
      KIR3DL1_assignment == "KIR3DL1-N" & HLA_B_overall == "Bw4 - 80T" ~ "Educated, Uninhibited"
    )

    if (levels == 4) {return(out)}
    
    # group Missing Ligand and Educated, Uninhibited into "noninhibiting"
      out <- ifelse((out == "Missing ligand" | out == "Educated, Uninhibited"),
             "noninhibiting", out)
    
    if (levels == 3) {return(out)}
    
    # For levels = 2, also group weak and noninhibiting together
      out <- ifelse((out == "noninhibiting" | out == "Weak inhibiting"),
                    "Weak inhibiting/noninhibiting", out)
    return(out) # Returns here if levels = 2
}



#' KIR present/absent
#'
#' Determine whether the KIR is present, given a KIR string. Assumes a string with only "NEG" is absent, any other string not including
#' "NEG" is present, if missing or some combination together with "NEG" return NA - see examples.
#'
#' @param string KIR string vector
#'
#' @return logical vector, TRUE if the KIR is present, FALSE if it's not present, NA if passed an empty/NA string
#'
#' @examples
#' dat <- data.frame(kirstring = c("003/034+003/034", "NEG", "", NA, "003|NEG"), stringsAsFactors = FALSE)
#' KIR_present(dat$kirstring)
KIR_present <- function(string) {
  # if passed factor column
  string <- as.character(string)
  # Assumes anything other than an empty string or a "NEG" KIR string means the gene is present
  # Need to make sure inputs are cleaned
  out <- case_when(
    is.na(string) ~ NA,
    grepl("^$", string) ~ NA,
    string == "NA" ~ NA,
    grepl("^NEG$", string) ~ FALSE, # Only NEG
    grepl("NEG", string) ~ NA, # NEG with something else
    TRUE ~ TRUE) # Otherwise present
  return(out)
}




#' KIR Centromeric Haplotype
#' 
#' Assigns the Centrometic haplotype (A/A, A/B or B/B) given a set of KIRs
#'
#' @param df data.frame with KIR alleles string as columns
#' @param KIR_2DL3col string, name of 2DL3 column
#' @param KIR_2DS2col string, name of 2DS2 column
#' @param KIR_2DL2col string, name of 2DL2 column
#' @param present_cols Default is FALSE which assumes the KIR columns contains the allele string ("001", "NEG" etc), and the presence / absence is calculated inside the function using \code{\link{KIR_present}}. If set to TRUE, the KIR columns already contain TRUE/FALSE for presence/absence for all KIRs. 
#'
#' @return string vector with haplotype ("A/A", "A/B", "B/B"), NA if any input KIR is missing, and "unknown" otherwise.
#'
#' @seealso \code{\link{KIR_haplotype_Tel}}
#' @examples
#' dat <- data.frame(kir_2DL3 = c("001", "NEG", NA, "001|002+003"), 
#'    kir_2DS2 = c("001", "001", "001", "001"), 
#'    kir_2DL2 = c("NEG", "001", "NEG", "NEG"), 
#'    stringsAsFactors = FALSE)
#' KIR_haplotype_Cen(dat, "kir_2DL3", "kir_2DS2", "kir_2DL2")
#' 
#' dat_pres <- data.frame(kir_2DL3 = c(TRUE, FALSE, NA, TRUE), 
#'    kir_2DS2 = c(TRUE, TRUE, TRUE, TRUE), 
#'    kir_2DL2 = c(FALSE, TRUE, FALSE, FALSE), 
#'    stringsAsFactors = FALSE)
#' KIR_haplotype_Cen(dat_pres, "kir_2DL3", "kir_2DS2", "kir_2DL2", present_cols = TRUE)
KIR_haplotype_Cen <- function(df, KIR_2DL3col, KIR_2DS2col, KIR_2DL2col, present_cols = FALSE) {
  
  if (present_cols == FALSE) {
    if (any(class(df[[KIR_2DL3col]]) == "logical", 
            class(df[[KIR_2DS2col]]) == "logical",
            class(df[[KIR_2DL2col]]) == "logical")) {
      stop("KIR columns are not 'logical' type but present_cols was set to TRUE")
    }
    kir2DL3_present <- CTUtools::KIR_present(as.character(df[[KIR_2DL3col]]))
    kir2DS2_present <- CTUtools::KIR_present(as.character(df[[KIR_2DS2col]]))
    kir2DL2_present <- CTUtools::KIR_present(as.character(df[[KIR_2DL2col]]))
  }
  if (present_cols == TRUE) {
    if (any(class(df[[KIR_2DL3col]]) != "logical", 
            class(df[[KIR_2DS2col]]) != "logical",
            class(df[[KIR_2DL2col]]) != "logical")) {
      stop("KIR columns are not 'logical' type but present_cols was set to TRUE")
    }
    kir2DL3_present <- df[[KIR_2DL3col]]
    kir2DS2_present <- df[[KIR_2DS2col]]
    kir2DL2_present <- df[[KIR_2DL2col]]
  }
  
  out <- case_when(
    is.na(kir2DL3_present) | is.na(kir2DS2_present) | is.na(kir2DL2_present) ~ NA_character_,
    kir2DL3_present == "" | kir2DS2_present == "" | kir2DL2_present == "" ~ NA_character_,
    (kir2DL3_present == TRUE & kir2DS2_present == FALSE & kir2DL2_present == FALSE) ~ "A/A",
    (kir2DL3_present == TRUE & (kir2DS2_present == TRUE | kir2DL2_present == TRUE)) ~ "A/B",
    (kir2DL3_present == FALSE & (kir2DS2_present == TRUE | kir2DL2_present == TRUE)) ~ "B/B",
    TRUE ~ "unknown"
  )
  
  return(out)
  
}


#' KIR Telomeric Haplotype
#' 
#' Assigns the Telomeric haplotype (A/A, A/B or B/B) given a set of KIRs
#'
#' @param df data.frame with KIR alleles string as columns
#' @param KIR_3DL1col string, name of 3DL1 column
#' @param KIR_2DS4col string, name of 2DS4 column
#' @param KIR_3DS1col string, name of 3DS1 column
#' @param KIR_2DS1col string, name of 2DS1 column
#' @param present_cols Default is FALSE which assumes the KIR columns contains the allele string ("001", "NEG" etc), and the presence / absence is calculated inside the function using \code{\link{KIR_present}}. If set to TRUE, the KIR columns already contain TRUE/FALSE for presence/absence for all KIRs. 
#'
#' @return string vector with haplotype ("A/A", "A/B", "B/B"), NA if any input KIR is missing, and "unknown" otherwise.
#'
#' @seealso \code{\link{KIR_haplotype_Cen}}
#' @examples
#' dat <- data.frame(kir_3DL1 = c("001", "NEG", NA, "001|002+003"), 
#'    kir_2DS4 = c("001", "001", "001", "001"), 
#'    kir_3DS1 = c("NEG", "001", "NEG", "NEG"),
#'    kir_2DS1 = c("001", "001", "NEG", "NEG"),
#'    stringsAsFactors = FALSE)
#' KIR_haplotype_Tel(dat, "kir_3DL1", "kir_2DS4", "kir_3DS1", "kir_2DS1")
#'    
#' dat_pres <- data.frame(kir_3DL1 = c(TRUE, FALSE, NA, TRUE), 
#'     kir_2DS4 = c(TRUE, TRUE, TRUE, TRUE), 
#'     kir_3DS1 = c(FALSE, TRUE, FALSE, FALSE),
#'     kir_2DS1 = c(TRUE, TRUE, FALSE, FALSE),
#'     stringsAsFactors = FALSE)
#' KIR_haplotype_Tel(dat_pres, "kir_3DL1", "kir_2DS4", "kir_3DS1", "kir_2DS1", present_cols = TRUE)
KIR_haplotype_Tel <- function(df, KIR_3DL1col, KIR_2DS4col, KIR_3DS1col, KIR_2DS1col, present_cols = FALSE) {
  
  if (present_cols == FALSE) {
    if (any(class(df[[KIR_3DL1col]]) == "logical", 
            class(df[[KIR_2DS4col]]) == "logical",
            class(df[[KIR_3DS1col]]) == "logical",
            class(df[[KIR_2DS1col]]) == "logical")) {
      stop("KIR column are 'logical' type but present_cols was set to FALSE")
    }
    kir3DL1_present <- CTUtools::KIR_present(as.character(df[[KIR_3DL1col]]))
    kir2DS4_present <- CTUtools::KIR_present(as.character(df[[KIR_2DS4col]]))
    kir3DS1_present <- CTUtools::KIR_present(as.character(df[[KIR_3DS1col]]))
    kir2DS1_present <- CTUtools::KIR_present(as.character(df[[KIR_2DS1col]]))
  }
  if (present_cols == TRUE) {
    if (any(class(df[[KIR_3DL1col]]) != "logical", 
            class(df[[KIR_2DS4col]]) != "logical",
            class(df[[KIR_3DS1col]]) != "logical",
            class(df[[KIR_2DS1col]]) != "logical")) {
      stop("KIR columns are not 'logical' type but present_cols was set to TRUE")
    }
    kir3DL1_present <- df[[KIR_3DL1col]]
    kir2DS4_present <- df[[KIR_2DS4col]]
    kir3DS1_present <- df[[KIR_3DS1col]]
    kir2DS1_present <- df[[KIR_2DS1col]]
  }
  
  
  
  out <- case_when(
    is.na(kir3DL1_present) | is.na(kir2DS4_present) | is.na(kir3DS1_present) | is.na(kir2DS1_present) ~ NA_character_,
    kir3DL1_present == "" | kir2DS4_present == "" | kir3DS1_present == "" | kir2DS1_present == "" ~ NA_character_,
    (kir3DL1_present == TRUE & kir2DS4_present == TRUE) & (kir3DS1_present == FALSE & kir2DS1_present == FALSE) ~ "A/A",
    (kir3DL1_present == TRUE & kir2DS4_present == TRUE) & (kir3DS1_present == TRUE | kir2DS1_present == TRUE) ~ "A/B",
    (kir3DL1_present == FALSE | kir2DS4_present == FALSE) ~ "B/B",
    TRUE ~ "unknown"
  )
  
  return(out)
  
}
bosefalk/CTUtools documentation built on Feb. 4, 2022, 4:10 p.m.