R/fullRepertoire.R

#' Simulates full heavy chain antibody repertoires for either human or mice.
#' @param max.tree.num Integer value describing maximum number of trees allowed
#'  to generate the core sequences of the repertoire. Each of these trees is started
#'  by an independent VDJ recombination event.
#' @inheritParams singleLineage
#' @return Returns a nested list. output[[1]][[1]] is an array of the simulated sequences
#' output[[2]][[1]] is an array names corresponding to each sequence. For example, output[[2]][[1]][1]
#' is the name of the sequence corresponding to output[[1]][[1]][1]. The simulated tree of this is found in
#' output[[3]][[1]]. The length of the output list is determined by the number of sampling points
#' Thus if you have two sampling points, output[[4]][[1]] would be a character array holding the sequences
#' with output[[5]][[1]] as a character array holding the corresponding names. Then the sequences recovered
#' second sampling point would be stored at output[[6]][[1]], with the names at output[[7]][[1]]. This
#' nested list was designed for full antibody repertoire simulations, and thus, may seem unintuitive
#' for the single lineage function. The first sequence and name corresponds to the germline sequence
#' that served as the root of the tree. See vignette for comprehensive example
#' @export
#' @seealso singleLineage
#' @examples
# fullRepertoire(max.seq.num=51,max.timer=150,
#  SHM.method="naive",baseline.mut = 0.0008,
#  SHM.branch.prob = "identical", SHM.branch.param = 0.05,
#  SHM.nuc.prob = 15/350,species="mus",
#  VDJ.branch.prob = 0.1,proportion.sampled = 1,
#  sample.time = 50,max.tree.num=3, chain.type="heavy",vdj.model="naive",
#  vdj.insertion.mean=4, vdj.insertion.stdv=2)

fullRepertoire <- function(max.seq.num,
                          max.timer,
                          SHM.method,
                          baseline.mut,
                          SHM.branch.prob,
                          SHM.branch.param,
                          SHM.nuc.prob,
                          species,
                          VDJ.branch.prob,
                          proportion.sampled,
                          sample.time,
                          max.tree.num,
                          chain.type,
                          vdj.model,
                          vdj.insertion.mean,
                          vdj.insertion.stdv
                          ){
  tree_list <- c()
  tree_list[1:max.tree.num] <- "(0,1)L;"
  seq_list <- list()
  name_list <- list()

  seq_per_tree <- rep(1,max.tree.num)
  germline_vseq <- c()
  germline_dseq <- c()
  germline_jseq <- c()
  germline_seq <- c()
  germline_name <- c()
  #tree_list <- list()
  max.seq.num <- max.seq.num + 1
  output_list <- list()
  max_SHM <- max.seq.num - max.tree.num
  if(species=="mus" || species=="mouse" || species=="blc6" && chain.type=="heavy"){
    indV <- sample(x = 1:nrow(ighv_mus_df),size = 1,replace = FALSE)
    indD <- sample(x = 1:nrow(ighd_mus_df),size=1,replace=FALSE)
    indJ <- sample(x = 1:nrow(ighj_mus_df),size=1,replace=FALSE)
    germline_name[1] <- paste(as.character(ighv_mus_df[[1]][indV]),as.character(ighd_mus_df[[1]][indD]),as.character(ighj_mus_df[[1]][indJ]),sep="")
    germline_vseq[1] <- as.character(ighv_mus_df[[2]][indV])
    germline_dseq[1] <- as.character(ighd_mus_df[[2]][indD])
    germline_jseq[1] <- as.character(ighj_mus_df[[2]][indJ])
    germline_seq[1] <- paste(as.character(ighv_mus_df[[2]][indV]),as.character(ighd_mus_df[[2]][indD]),as.character(ighj_mus_df[[2]][indJ]),sep="")
    seq_list[[1]] <- as.character(.VDJ_RECOMBIN_FUNCTION(as.character(ighv_mus_df[[2]][indV]), as.character(ighd_mus_df[[2]][indD]),as.character(ighj_mus_df[[2]][indJ]),
                                                         method=vdj.model,
                                                         chain.type=chain.type,
                                                         species=species,
                                                         vdj.insertion.mean=vdj.insertion.mean,
                                                         vdj.insertion.stdv=vdj.insertion.stdv))
    name_list[[1]] <- paste(paste("S","1", sep=""),1,sep="_")
#     indV <- sample(x = 1:nrow(igkv_mus_df),size = 1,replace = FALSE)
#     indD <- sample(x = 1:nrow(ighd_mus_df),size=1,replace=FA)
#     indJ <- sample(x = 1:nrow(ighj_mus_df),size=1,replace=TRUE)
#     vseq <- as.character(ighv_mus_df$seq[indV])
#     dseq <- as.character(ighd_mus_df$seq[indD])
#     jseq <- as.character(ighj_mus_df$seq[indJ])
#     germline <- paste(as.character(ighv_mus_df[[2]][indV]),as.character(ighd_mus_df[[2]][indD]),as.character(ighj_mus_df[[2]][indJ]),sep="")
#     germline_v <- as.character(ighv_mus_df[[1]][indV])
#     germline_d <- as.character(ighd_mus_df[[1]][indD])
#     germline_j <- as.character(ighj_mus_df[[1]][indJ])
  }
  else if(species=="hum" || species== "human" && chain.type=="heavy"){
    indV <- sample(x = 1:nrow(ighv_hum_df),size = 1,replace = FALSE)
    indD <- sample(x = 1:nrow(ighd_hum_df),size=1,replace=FALSE)
    indJ <- sample(x = 1:nrow(ighj_hum_df),size=1,replace=FALSE)
    germline_name[1] <- paste(as.character(ighv_hum_df[[1]][indV]),as.character(ighd_hum_df[[1]][indD]),as.character(ighj_hum_df[[1]][indJ]),sep="")
    germline_vseq[1] <- as.character(ighv_hum_df[[2]][indV])
    germline_dseq[1] <- as.character(ighd_hum_df[[2]][indD])
    germline_jseq[1] <- as.character(ighj_hum_df[[2]][indJ])
    germline_seq[1] <- paste(as.character(ighv_hum_df[[2]][indV]),as.character(ighd_hum_df[[2]][indD]),as.character(ighj_hum_df[[2]][indJ]),sep="")
    seq_list[[1]] <- as.character(.VDJ_RECOMBIN_FUNCTION(as.character(ighv_hum_df[[2]][indV]), as.character(ighd_hum_df[[2]][indD]),as.character(ighj_hum_df[[2]][indJ]),
                                                         method=vdj.model,
                                                         chain.type=chain.type,
                                                         species=species,
                                                         vdj.insertion.mean=vdj.insertion.mean,
                                                         vdj.insertion.stdv=vdj.insertion.stdv))
    name_list[[1]] <- paste(paste("S","1", sep=""),1,sep="_")
  }

  ### light chains
  else if(species=="hum" || species== "human" && chain.type=="light"){
    indV <- sample(x = 1:nrow(rbind(iglv_hum_df,igkv_hum_df)),size = 1,replace = FALSE)
    #indD <- sample(x = 1:nrow(rbind(iglv_mus_df,igkv_mus_df)),size=1,replace=FALSE)
    indJ <- sample(x = 1:nrow(rbind(iglj_hum_df,igkj_hum_df)),size=1,replace=FALSE)
    germline_name[1] <- paste(as.character(rbind(iglv_hum_df,igkv_hum_df)[[1]][indV]),"",as.character(rbind(iglv_hum_df,igkv_hum_df)[[1]][indJ]),sep="")
    germline_vseq[1] <- as.character(rbind(iglv_hum_df,igkv_hum_df)[[2]][indV])
    germline_dseq[1] <- "" #as.character(ighd_hum_df[[2]][indD])
    germline_jseq[1] <- as.character(rbind(iglj_hum_df,igkj_hum_df)[[2]][indJ])
    germline_seq[1] <- paste(as.character(rbind(iglv_hum_df,igkv_hum_df)[[2]][indV]),"",as.character(rbind(iglv_hum_df,igkv_hum_df)[[2]][indJ]),sep="")
    seq_list[[1]] <- as.character(.VDJ_RECOMBIN_FUNCTION(as.character(rbind(iglv_mus_df,igkv_mus_df)[[2]][indV]), "",as.character(rbind(iglj_mus_df,igkj_mus_df)[[2]][indJ]),
                                                         method=vdj.model,
                                                         chain.type=chain.type,
                                                         species=species,
                                                         vdj.insertion.mean=vdj.insertion.mean,
                                                         vdj.insertion.stdv=vdj.insertion.stdv))
    name_list[[1]] <- paste(paste("S","1", sep=""),1,sep="_")
  }
  else if(species=="mus" || species=="mouse" || species=="blc6" && chain.type=="light"){
    indV <- sample(x = 1:nrow(rbind(iglv_mus_df,igkv_mus_df)),size = 1,replace = FALSE)
    #indD <- sample(x = 1:nrow(rbind(iglv_mus_df,igkv_mus_df)),size=1,replace=FALSE)
    indJ <- sample(x = 1:nrow(rbind(iglj_mus_df,igkj_mus_df)),size=1,replace=FALSE)
    germline_name[1] <- paste(as.character(rbind(iglv_mus_df,igkv_mus_df)[[1]][indV]),"",as.character(rbind(iglj_mus_df,igkj_mus_df)[[1]][indJ]),sep="")
    germline_vseq[1] <- as.character(rbind(iglv_mus_df,igkv_mus_df)[[2]][indV])
    germline_dseq[1] <- "" #as.character(rbind(iglv_mus_df,igkv_mus_df)[[2]][indD])
    germline_jseq[1] <- as.character(rbind(iglv_mus_df,igkv_mus_df)[[2]][indJ])
    germline_seq[1] <- paste(as.character(rbind(iglv_mus_df,igkv_mus_df)[[2]][indV]),"",as.character(rbind(iglj_mus_df,igkj_mus_df)[[2]][indJ]),sep="")
    seq_list[[1]] <- as.character(.VDJ_RECOMBIN_FUNCTION(as.character(rbind(iglv_mus_df,igkv_mus_df)[[2]][indV]),"",as.character(rbind(iglj_mus_df,igkj_mus_df)[[2]][indJ]),
                                                         method=vdj.model,
                                                         chain.type=chain.type,
                                                         species=species,
                                                         vdj.insertion.mean=vdj.insertion.mean,
                                                         vdj.insertion.stdv=vdj.insertion.stdv))
    name_list[[1]] <- paste(paste("S","1", sep=""),1,sep="_")
  }

  current_seq_count <- 1
  VDJ_count <- 1
  SHM_count <- 0
  next_node <- 2
  output_list <- list()
  if(class(sample.time)=="numeric" && length(sample.time)>0){
    for(i in seq(1,length(sample.time),2)){
      output_list[[i+3]] <- list()
      output_list[[i+4]] <- list()
    }
  }

  seq_type <- c()
  new_VDJ_prob <- VDJ.branch.prob
  if(SHM.branch.prob=="identical"){
    new_SHM_prob <- rep(SHM.branch.param,max.seq.num)
  }
  else if(SHM.branch.prob=="uniform"){
    new_SHM_prob <- stats::runif(n=max.seq.num,min=SHM.branch.param[1],max=SHM.branch.param[2])
  }
  else if(SHM.branch.prob=="exponential"|| SHM.branch.prob=="exp"){
    new_SHM_prob <- stats::rexp(n=max.seq.num,rate=SHM.branch.param)
  }
  else if(SHM.branch.prob=="lognorm"|| SHM.branch.prob=="lognormal"){
    new_SHM_prob <- stats::rlnorm(n=max.seq.num,meanlog = SHM.branch.param[1],sdlog = SHM.branch.param[2])
  }
  else if(SHM.branch.prob=="normal"|| SHM.branch.prob=="norm"){
    new_SHM_prob <- stats::rnorm(n=max.seq.num,mean=SHM.branch.param[1],sd=SHM.branch.param[2])
  }

  sample_seq_list <- list()
  sample_names_list <- list()
  output_tree_text <- "(0,1)L;"
  sample_index <- 1
  sample.time <- sort(sample.time,decreasing = FALSE)
  current_tree_num <- 1
  for(i in 1:max.timer){
    if(length(unlist(seq_list))>=max.seq.num) break
    if(i==sample.time[sample_index] && length(unlist(seq_list)) >= 1){
      for(z in 1:length(seq_list)){
        current_len <- length(seq_list[[z]])
        holding_size <- as.integer(proportion.sampled * current_len)
        holding_ind <- sample.int(n = current_len,
                                  size=holding_size,
                                  replace = FALSE,
                                  prob = rep(x = 1/current_len,current_len))
        output_list[[2*sample_index+2]][[z]] <- seq_list[[z]][holding_ind]
        output_list[[2*sample_index+3]][[z]] <- paste(name_list[[z]][holding_ind],sample.time[sample_index],sep="_")
      }
    if(length(sample.time) > sample_index) sample_index <- sample_index + 1
    }
    if(length(unlist(seq_list))>=2){
      for(u in 1:length(seq_list)){
        seq_list[[u]] <- .applyBaseLine(seq_list[[u]],baseline.mut)
      }
    }
    is_new_VDJ <- sample(x=c(0,1), replace=TRUE,size = 1, prob=c(VDJ.branch.prob,
                                                        1- VDJ.branch.prob))
    if(is_new_VDJ==0 && current_seq_count<max.seq.num && current_tree_num<max.tree.num){
      if(species=="mus" || species=="mouse" || species=="blc6" && chain.type=="heavy"){
        indV <- sample(x = 1:nrow(ighv_mus_df),size = 1,replace = FALSE)
        indD <- sample(x = 1:nrow(ighd_mus_df),size=1,replace=FALSE)
        indJ <- sample(x = 1:nrow(ighj_mus_df),size=1,replace=FALSE)
        germline_name[current_tree_num+1] <- paste(as.character(ighv_mus_df[[1]][indV]),as.character(ighd_mus_df[[1]][indD]),as.character(ighj_mus_df[[1]][indJ]),sep="")
        germline_seq[current_tree_num+1] <- paste(as.character(ighv_mus_df[[2]][indV]),as.character(ighd_mus_df[[2]][indD]),as.character(ighj_mus_df[[2]][indJ]),sep="")
        seq_list[[current_tree_num+1]] <- as.character(.VDJ_RECOMBIN_FUNCTION(as.character(ighv_mus_df[[2]][indV]), as.character(ighd_mus_df[[2]][indD]),as.character(ighj_mus_df[[2]][indJ]),
                                                                              method=vdj.model,
                                                                              chain.type=chain.type,
                                                                              species=species,
                                                                              vdj.insertion.mean=vdj.insertion.mean,
                                                                              vdj.insertion.stdv=vdj.insertion.stdv))
      }

      else if(species=="hum" || species== "human" && chain.type=="heavy"){
        indV <- sample(x = 1:nrow(ighv_hum_df),size = 1,replace = FALSE)
        indD <- sample(x = 1:nrow(ighd_hum_df),size=1,replace=FALSE)
        indJ <- sample(x = 1:nrow(ighj_hum_df),size=1,replace=FALSE)
        germline_name[current_tree_num+1] <- paste(as.character(ighv_hum_df[[1]][indV]),as.character(ighd_hum_df[[1]][indD]),as.character(ighj_hum_df[[1]][indJ]),sep="")
        germline_seq[current_tree_num+1] <- paste(as.character(ighv_hum_df[[2]][indV]),as.character(ighd_hum_df[[2]][indD]),as.character(ighj_hum_df[[2]][indJ]),sep="")
        seq_list[[current_tree_num+1]] <- as.character(.VDJ_RECOMBIN_FUNCTION(as.character(ighv_hum_df[[2]][indV]), as.character(ighd_hum_df[[2]][indD]),as.character(ighj_hum_df[[2]][indJ]),
                                                                              method=vdj.model,
                                                                              chain.type=chain.type,
                                                                              species=species,
                                                                              vdj.insertion.mean=vdj.insertion.mean,
                                                                              vdj.insertion.stdv=vdj.insertion.stdv))
      }
      else if(species=="mus" || species=="mouse" || species=="blc6" && chain.type=="light"){
        indV <- sample(x = 1:nrow(rbind(iglv_mus_df,igkv_mus_df)),size = 1,replace = FALSE)
        #indD <- sample(x = 1:nrow(blc6_d_df),size=1,replace=FALSE)
        indJ <- sample(x = 1:nrow(rbind(iglj_mus_df,igkj_mus_df)),size=1,replace=FALSE)
        germline_name[current_tree_num+1] <- paste(as.character(rbind(iglv_mus_df,igkv_mus_df)[[1]][indV]),"",as.character(rbind(iglj_mus_df,igkj_mus_df)[[1]][indJ]),sep="")
        germline_seq[current_tree_num+1] <- paste(as.character(rbind(iglv_mus_df,igkv_mus_df)[[2]][indV]),"",as.character(rbind(iglj_mus_df,igkj_mus_df)[[2]][indJ]),sep="")
        seq_list[[current_tree_num+1]] <- as.character(.VDJ_RECOMBIN_FUNCTION(as.character(rbind(iglv_mus_df,igkv_mus_df)[[2]][indV]), "",as.character(rbind(iglj_mus_df,igkj_mus_df)[[2]][indJ]),
                                                                              method=vdj.model,
                                                                              chain.type=chain.type,
                                                                              species=species,
                                                                              vdj.insertion.mean=vdj.insertion.mean,
                                                                              vdj.insertion.stdv=vdj.insertion.stdv))
      }
      else if(species=="hum" || species== "human" && chain.type=="light"){
        indV <- sample(x = 1:nrow(rbind(iglv_hum_df,igkv_hum_df)),size = 1,replace = FALSE)
        #indD <- #sample(x = 1:nrow(rbind(iglv_hum_df,igkv_hum_df)),size=1,replace=FALSE)
        indJ <- sample(x = 1:nrow(rbind(iglj_hum_df,igkj_hum_df)),size=1,replace=FALSE)
        germline_name[current_tree_num+1] <- paste(as.character(rbind(iglv_hum_df,igkv_hum_df)[[1]][indV]),"",as.character(rbind(iglj_hum_df,igkj_hum_df)[[1]][indJ]),sep="")
        germline_seq[current_tree_num+1] <- paste(as.character(rbind(iglv_hum_df,igkv_hum_df)[[2]][indV]),"",as.character(rbind(iglj_hum_df,igkj_hum_df)[[2]][indJ]),sep="")
        seq_list[[current_tree_num+1]] <- as.character(.VDJ_RECOMBIN_FUNCTION(as.character(rbind(iglv_hum_df,igkv_hum_df)[[2]][indV]),"",as.character(rbind(iglj_hum_df,igkj_hum_df)[[2]][indJ]),
                                                                              method=vdj.model,
                                                                              chain.type=chain.type,
                                                                              species=species,
                                                                              vdj.insertion.mean=vdj.insertion.mean,
                                                                              vdj.insertion.stdv=vdj.insertion.stdv))
      }
      name_list[[current_tree_num+1]] <- paste(paste("S",next_node, sep=""),i,sep="_")

      current_seq_count <- current_seq_count + 1
      current_tree_num <- current_tree_num + 1
      next_node <- next_node + 1
      seq_per_tree[current_tree_num+1] <- seq_per_tree[current_tree_num+1]
      tree_list[current_tree_num] <- gsub(pattern="1",replacement = as.character(next_node-1),x=tree_list[current_tree_num])
    }
    if(next_node>= max.seq.num) break
    if(current_seq_count>=1){
      for(o in 1:length(seq_list)){
       for(j in 1:length(seq_list[[o]])){
        is_new_SHM <- sample(x=c(0,1),size=1, replace=TRUE, prob=c(new_SHM_prob[next_node], 1- new_SHM_prob[next_node]))
        if (is_new_SHM==0 && next_node<max.seq.num && SHM_count<max_SHM ){
          holding_jsub <- gsub("_.*","",name_list[[o]][j])
          holding_jsub <- gsub("[^0-9]","",holding_jsub)
          seq_list[[o]][seq_per_tree[o]+1] <- .SHM_FUNCTION_SEQUENCE4(seq_list[[o]][j],
                                                          SHM.method,germline_vseq[o],germline_dseq[o],germline_jseq[o],SHM.nuc.prob)
          name_list[[o]][seq_per_tree[o]+1] <- paste(paste("S",next_node, sep=""),i,sep="_")
          tree_list[o] <- .branchingProcess3(tree_list[o],holding_jsub,next_node,"SHM")
          next_node <- next_node + 1
          current_seq_count <- current_seq_count + 1
          SHM_count <- SHM_count + 1
          seq_per_tree[o] <- seq_per_tree[o] + 1

        }
        holding_no_nas <- unlist(seq_list)
        if(length(holding_no_nas[is.na(holding_no_nas)==FALSE])>=max.seq.num) break
       }
      holding_no_nas <- unlist(seq_list)
      if(length(holding_no_nas[is.na(holding_no_nas)==FALSE])>=max.seq.num) break      }
    }
    holding_no_nas <- unlist(seq_list)
    if(length(holding_no_nas[is.na(holding_no_nas)==FALSE])>=max.seq.num) break
  }
  for(i1 in 1:length(seq_list)){
    output_list[[1]][[i1]] <- append(germline_seq[i1],seq_list[[i1]])
    output_list[[2]][[i1]] <- append(germline_name[i1],name_list[[i1]])
    temp.tree <- ape::read.tree(text=tree_list[i1])
    output_list[[3]][[i1]] <- temp.tree
    for(i2 in 2:length(output_list[[3]][[i1]]$tip.label)){
      output_list[[3]][[i1]]$tip.label[1] <- output_list[[2]][[i1]][1]
      output_list[[3]][[i1]]$tip.label[i2] <- output_list[[2]][[i1]][grep(output_list[[2]][[i1]],pattern=paste("S",output_list[[3]][[i1]]$tip.label[i2],"_",sep=""))]
    }
   }

  return(output_list)
}

Try the AbSim package in your browser

Any scripts or data that you put into this service are public.

AbSim documentation built on May 2, 2019, 5:08 a.m.