R/mixcrToImmunarch.R

parse_mixcr <- function (.dataframe) {
  
  IMMCOL = new.env()
  
  IMMCOL$count  = "Clones"
  IMMCOL$prop   = "Proportion"
  IMMCOL$cdr3nt = "CDR3.nt"
  IMMCOL$cdr3aa = "CDR3.aa"
  IMMCOL$v      = "V.name"
  IMMCOL$d      = "D.name"
  IMMCOL$j      = "J.name"
  IMMCOL$ve     = "V.end"
  IMMCOL$ds     = "D.start"
  IMMCOL$de     = "D.end"
  IMMCOL$js     = "J.start"
  IMMCOL$vnj    = "VJ.ins"
  IMMCOL$vnd    = "VD.ins"
  IMMCOL$dnj    = "DJ.ins"
  IMMCOL$seq    = "Sequence"
  IMMCOL$order  = c(IMMCOL$count, IMMCOL$prop, IMMCOL$cdr3nt, IMMCOL$cdr3aa,
                    IMMCOL$v, IMMCOL$d, IMMCOL$j,
                    IMMCOL$ve, IMMCOL$ds, IMMCOL$de, IMMCOL$js,
                    IMMCOL$vnj, IMMCOL$vnd, IMMCOL$dnj, IMMCOL$seq)
  IMMCOL$type   = c("numeric", "numeric", "character", "character",
                    "character", "character", "character",
                    "integer", "integer", "integer", "integer",
                    "integer", "integer", "integer", "character")
  
                    
  
  require(stringr)
  require(immunarch)
  fix.alleles <- function (.data) {
    .data[[IMMCOL$v]] <- gsub("[*][[:digit:]]*", "", .data[[IMMCOL$v]])
    .data[[IMMCOL$d]] <- gsub("[*][[:digit:]]*", "", .data[[IMMCOL$d]])
    .data[[IMMCOL$j]] <- gsub("[*][[:digit:]]*", "", .data[[IMMCOL$j]])
    .data
  }
  
  
  .count <- 'clonecount'
  .sep = '\t'
  .vend <- "allvalignments"
  .jstart <- "alljalignments"
  .dalignments <- "alldalignments"
  .vd.insertions <- "VD.insertions"
  .dj.insertions <- "DJ.insertions"
  .total.insertions <- "Total.insertions"
  
  table.colnames <- tolower(colnames(.dataframe))
  table.colnames <- gsub(".", "", table.colnames, fixed = T)
  
  # Columns of different MiXCR formats
  # Clone count - Clonal sequence(s) - N. Seq. CDR3
  # cloneCount - clonalSequence - nSeqCDR3
  # cloneCount - targetSequences - nSeqImputedCDR3
  # cloneCount - targetSequences - nSeqCDR3
  if ("targetsequences" %in% table.colnames) {
    if ('nseqimputedcdr3' %in% table.colnames) {
      .nuc.seq <- 'nseqimputedcdr3'
    } else {
      .nuc.seq <- 'nseqcdr3'
    }
    
    .big.seq <- 'targetsequences'
  } else {
    .nuc.seq <- 'nseqcdr3'
    
    if ("clonalsequences" %in% table.colnames) {
      .big.seq <- 'clonalsequences'
    } else if ("clonalsequence" %in% table.colnames) {
      .big.seq <- 'clonalsequence'
    } else {
      .big.seq = NA
    }
  }
  
  if (!("allvalignments" %in% table.colnames)) {
    if ("allvalignment" %in% table.colnames) {
      .vend = "allvalignment"
    } else {
      .vend = NA
    }
  }
  if (!("alldalignments" %in% table.colnames)) {
    if ("alldalignment" %in% table.colnames) {
      .dalignments = "alldalignment"
    } else {
      .dalignments = NA
    }
  }
  if (!("alljalignments" %in% table.colnames)) {
    if ("alljalignment" %in% table.colnames) {
      .jstart = "alljalignment"
    } else {
      .jstart = NA
    }
  }
  
  if ("bestvhit" %in% table.colnames) {
    .vgenes <- 'bestvhit'
  } else if ('allvhits' %in% table.colnames) {
    .vgenes <- 'allvhits'
  } else if ('vhits' %in% table.colnames) {
    .vgenes <- 'vhits'
  } else if ('allvhitswithscore' %in% table.colnames) {
    .vgenes <- 'allvhitswithscore'
  } else {
    cat("Error: can't find a column with V genes\n")
  }
  
  if ("bestjhit" %in% table.colnames) {
    .jgenes <- 'bestjhit'
  } else if ('alljhits' %in% table.colnames) {
    .jgenes <- 'alljhits'
  } else if ('jhits' %in% table.colnames) {
    .jgenes <- 'jhits'
  } else if ('alljhitswithscore' %in% table.colnames) {
    .jgenes <- 'alljhitswithscore'
  } else {
    cat("Error: can't find a column with J genes\n")
  }
  
  if ("bestdhit" %in% table.colnames) {
    .dgenes <- 'bestdhit'
  } else if ('alldhits' %in% table.colnames) {
    .dgenes <- 'alldhits'
  } else if ('dhits' %in% table.colnames) {
    .dgenes <- 'dhits'
  } else if ('alldhitswithscore' %in% table.colnames) {
    .dgenes <- 'alldhitswithscore'
  } else {
    cat("Error: can't find a column with D genes\n")
  }
  
  df <- .dataframe
  
  #
  # return NULL if there is no clonotypes in the data frame
  #
  if (nrow(df) == 0) {
    return(NULL)
  }
  
  names(df) = make.names(names(df))
  names(df) <- tolower(gsub(".", "", names(df), fixed = T))
  names(df) <- str_replace_all(names(df), " ", "")
  
  # check for VJ or VDJ recombination
  # VJ / VDJ / Undeterm
  recomb_type = "Undeterm"
  if (sum(substr(head(df)[[.vgenes]], 1, 4) %in% c("TCRA", "TRAV", "TRGV", "IGKV", "IGLV"))) {
    recomb_type = "VJ"
  } else if (sum(substr(head(df)[[.vgenes]], 1, 4) %in% c("TCRB", "TRBV", "TRDV", "IGHV"))) {
    recomb_type = "VDJ"
  }
  
  if (!is.na(.vend) && !is.na(.jstart)) {
    .vd.insertions <- "VD.insertions"
    df$VD.insertions <- -1
    if (recomb_type == "VJ") {
      df$VD.insertions <- -1
    } else if (recomb_type == "VDJ") {
      logic <- sapply(strsplit(df[[.dalignments]], "|", T, F, T), length) >= 4 &
        sapply(strsplit(df[[.vend]], "|", T, F, T), length) >= 5
      df$VD.insertions[logic] <-
        as.numeric(sapply(strsplit(df[[.dalignments]][logic], "|", T, F, T), "[[", 4)) -
        as.numeric(sapply(strsplit(df[[.vend]][logic], "|", T, F, T), "[[", 5)) - 1
    }
    
    .dj.insertions <- "DJ.insertions"
    df$DJ.insertions <- -1
    if (recomb_type == "VJ") {
      df$DJ.insertions <- -1
    } else if (recomb_type == "VDJ") {
      logic <- sapply(strsplit(df[[.jstart]], "|", T, F, T), length) >= 4 &
        sapply(strsplit(df[[.dalignments]], "|", T, F, T), length) >= 5
      df$DJ.insertions[logic] <-
        as.numeric(sapply(strsplit(df[[.jstart]][logic], "|", T, F, T), "[[", 4)) -
        as.numeric(sapply(strsplit(df[[.dalignments]][logic], "|", T, F, T), "[[", 5)) - 1
    }
    
    # VJ.insertions
    logic <- (sapply(strsplit(df[[.vend]], "|", T, F, T), length) > 4) & (sapply(strsplit(df[[.jstart]], "|", T, F, T), length) >= 4)
    .total.insertions <- "Total.insertions"
    if (recomb_type == "VJ") {
      df$Total.insertions <- NA
      if (length(which(logic)) > 0) {
        df$Total.insertions[logic] <-
          as.numeric(sapply(strsplit(df[[.jstart]][logic], "|", T, F, T), "[[", 4)) - as.numeric(sapply(strsplit(df[[.vend]][logic], "|", T, F, T), "[[", 5)) - 1
      }
    } else if (recomb_type == "VDJ") {
      df$Total.insertions <- df[[.vd.insertions]] + df[[.dj.insertions]]
    } else {
      df$Total.insertions <- NA
    }
    df$Total.insertions[df$Total.insertions < 0] <- -1
    
    df$V.end <- -1
    df$J.start <- -1
    df[[.vend]] = gsub(";", "", df[[.vend]], fixed = T)
    logic = sapply(strsplit(df[[.vend]], "|", T, F, T), length) >= 5
    df$V.end[logic] <- sapply(strsplit(df[[.vend]][logic], "|", T, F, T), "[[", 5)
    logic = sapply(strsplit(df[[.jstart]], "|", T, F, T), length) >= 4
    df$J.start[logic] <- sapply(strsplit(df[[.jstart]][logic], "|", T, F, T), "[[", 4)
  } else {
    df$V.end <- -1
    df$J.start <- -1
    df$Total.insertions <- -1
    df$VD.insertions <- -1
    df$DJ.insertions <- -1
    
    .dj.insertions <- "DJ.insertions"
    .vd.insertions <- "VD.insertions"
  }
  
  .vend <- "V.end"
  .jstart <- "J.start"
  
  if (!is.na(.dalignments)) {
    logic <- sapply(str_split(df[[.dalignments]], "|"), length) >= 5
    df$D5.end <- -1
    df$D3.end <- -1
    df$D5.end[logic] <- sapply(str_split(df[[.dalignments]][logic], "|"), "[[", 4)
    df$D3.end[logic] <- sapply(str_split(df[[.dalignments]][logic], "|"), "[[", 5)
    .dalignments <- c('D5.end', 'D3.end')
  } else {
    df$D5.end <- -1
    df$D3.end <- -1
  }
  
  .dalignments <- c('D5.end', 'D3.end')
  
  if (!(.count %in% table.colnames)) {
    warn_msg = c("  [!] Warning: can't find a column with clonal counts. Setting all clonal counts to 1.")
    warn_msg = c(warn_msg, "\n      Did you apply repLoad to MiXCR file *_alignments.txt?")
    warn_msg = c(warn_msg, " If so please consider moving all *.clonotypes.*.txt MiXCR files to")
    warn_msg = c(warn_msg, " a separate folder and applying repLoad to it.")
    warn_msg = c(warn_msg, "\n      Note: The *_alignments.txt file IS NOT a repertoire file suitable for any analysis.\n")
    cat(warn_msg)
    
    df[[.count]] = 1
  }
  .freq = "Proportion"
  df$Proportion = as.numeric(df[[.count]]) / sum(as.numeric(df[[.count]]))
  
  .aa.seq = IMMCOL$cdr3aa
  df[[.aa.seq]] = bunch_translate(df[[.nuc.seq]])
  
  if (is.na(.big.seq)) {
    .big.seq = "BigSeq"
    df$BigSeq = df[[.nuc.seq]]
  }
  
  df <- df[, make.names(c(.count, .freq,
                          .nuc.seq, .aa.seq,
                          .vgenes, .dgenes, .jgenes,
                          .vend, .dalignments, .jstart,
                          .total.insertions, .vd.insertions, .dj.insertions, .big.seq))]
  
  colnames(df) <- IMMCOL$order
  
  df[[IMMCOL$v]] <- gsub("([*][[:digit:]]*)([(][[:digit:]]*[.]*[[:digit:]]*[)])", "", df[[IMMCOL$v]])
  df[[IMMCOL$v]] <- gsub(",", ", ", df[[IMMCOL$v]])
  df[[IMMCOL$v]] = str_replace_all(df[[IMMCOL$v]], '"', "")
  
  df[[IMMCOL$d]] <- gsub("([*][[:digit:]]*)([(][[:digit:]]*[.]*[[:digit:]]*[)])", "", df[[IMMCOL$d]])
  df[[IMMCOL$d]] <- gsub(",", ", ", df[[IMMCOL$d]])
  df[[IMMCOL$d]] = str_replace_all(df[[IMMCOL$d]], '"', "")
  
  df[[IMMCOL$j]] <- gsub("([*][[:digit:]]*)([(][[:digit:]]*[.]*[[:digit:]]*[)])", "", df[[IMMCOL$j]])
  df[[IMMCOL$j]] <- gsub(",", ", ", df[[IMMCOL$j]])
  df[[IMMCOL$j]] = str_replace_all(df[[IMMCOL$j]], '"', "")
  
  
  lol <- fix.alleles(df)
  
  logic = is.na(lol[[IMMCOL$cdr3aa]]) & !is.na(lol[[IMMCOL$cdr3nt]])
  if (any(logic)) {
    lol[[IMMCOL$cdr3aa]][logic] = bunch_translate(lol[[IMMCOL$cdr3nt]][logic])
  }
  
  logic = is.na(lol[[IMMCOL$cdr3aa]]) & is.na(lol[[IMMCOL$cdr3nt]])
  if (any(logic)) {
    warn_msg = c("  [!] Warning: found", sum(logic))
    warn_msg = c(warn_msg, "clonotypes with no nucleotide or amino acid CDR3 sequence.")
    warn_msg = c(warn_msg, "\n      Please check if your files are the final repertoire files, not the intermediate ones before filtering out bad clonotypes.\n")
    cat(warn_msg)
  }
  
  for (colname in c(IMMCOL$ve, IMMCOL$ds, IMMCOL$de, IMMCOL$js, IMMCOL$vnj, IMMCOL$vnd, IMMCOL$dnj)) {
    if (colname %in% colnames(lol)) {
      logic = is.na(lol[[colname]])
      lol[[colname]][logic] = -1
      
      logic = lol[[colname]] < 0
      lol[[colname]][logic] = NA
    }
  }
  
  for (col_i in 1:length(IMMCOL$order)) {
    colname = IMMCOL$order[col_i]
    if (colname %in% colnames(lol)) {
      if (!has_class(lol[[colname]], IMMCOL$type[col_i])) {
        lol[[colname]] = as(lol[[colname]], IMMCOL$type[col_i])
      }
    }
  }
  
  lol
  
}
Jabst/dsImmunarch documentation built on Oct. 30, 2019, 7:35 p.m.