RepAn: Differential abundance Analysis for deep sequenced immune repertoire (Repseq) datasets

Documented in getDefined getHasStop getOutOfFrame getProductive getUnproductive loadBioconductorPacks loadPacks readSample readSampleV2

### ***************************************************************
### ******** Basic analysis for TCR repertoire data **************
### ***************************************************************


########################### External R packages ###########################

#' Attaches packages from CRAN after installing them if they don't exist already
#' 
#' @param package.list vector of packages
#' 
loadPacks <- function(package.list = c("ggplot2","seqRFLP","stringr","permute","cluster","data.table","NbClust","doParallel","dendextend","kmer","ape","Matrix","wordspace","dynamicTreeCut","e1071","fclust","randomForest","preprocessCore","gplots")){
  # other probably needed packages that have been removed for now: "Rclusterpp"
  new.packages <-package.list[!(package.list %in% installed.packages()[,"Package"])]
  if(length(new.packages) > 0) try(install.packages(new.packages))
  loadSuccess <- lapply(eval(package.list), require, character.only=TRUE,quietly=TRUE)
}

#' Attaches packages from Bioconductor after installing them if they don't exist already
#' 
#' @param package.list vector of Bioconductor packages
#' 
loadBioconductorPacks <- function(package.list = c("Biostrings","RankProd","preprocessCore","msa","ggseqlogo")){
  new.packages <-package.list[!(package.list %in% installed.packages()[,"Package"])]
  if(length(new.packages) > 0){
    if((R.Version()[["major"]] >= 4) | (R.Version()[["major"]] >= 3 & unlist(strsplit(R.Version()[["minor"]],"\\."))[1] >= 5)){
      if (!requireNamespace("BiocManager", quietly = TRUE))
        install.packages("BiocManager")
      BiocManager::install(new.packages)      
    }else{
      source("http://bioconductor.org/biocLite.R")
      biocLite(new.packages)
    }
  }
  loadSuccess <- lapply(eval(package.list), require, character.only=TRUE,quietly=TRUE)
}



#loadPacks()
#loadBioconductorPacks()


########################### FUNCTIONS #########################


## read data files ####

#' Read in Repertoire data (Repseq sample) in older Immunoseq format
#' 
#' @param samplename name of the immunoseq formatted repertoire sample (full address or in the working directory)
#' @param cloneStatus logical; reading only productive or only unproductive CDR3s from the sample; default is productive only
#' @param filterOutBelowSize1 logical; drop clonotypes with a size of 1, default is true
#' @return a repseq sample is returned
#' 

readSample <- function(samplename,cloneStatus="p",filterOutBelowSize1=T){
  
  # read sample name and change colnames to uppercase
  
  sam <- read.table(samplename, header=T, sep="\t",dec = ".",stringsAsFactors=F)
  #sam <- as.data.frame(fread(samplename, header=T, sep="\t",stringsAsFactors=F))
  
  
  colnames(sam) <- toupper(colnames(sam))
  sam <- sam[,which(colnames(sam)=="NUCLEOTIDE"):length(colnames(sam))] # remove unwanted columns that come with some data e.g container etc
  
  # Extract cdr3 sequences and put on one column
  CDR3NT = substring(sam$NUCLEOTIDE,sam$VINDEX+1,sam$VINDEX + sam$CDR3LENGTH)
  sam <- data.frame(sam,CDR3NT,stringsAsFactors=F)
  
  # if there is a column named copy, change it to COUNT
  
  if(sum(colnames(sam) == "COPY") == 1){
    
    colnames(sam)[which(colnames(sam) == "COPY")] <- "UNNORMALIZEDCOUNT"
    colnames(sam)[which(colnames(sam) == "NORMALIZEDCOPY")] <- "COUNT"
    colnames(sam)[which(colnames(sam) == "NORMALIZEDFREQUENCY")] <- "FREQUENCYCOUNT"
    
    
  }
  
  if(filterOutBelowSize1==T)
    sam <- sam[sam[,"COUNT"] > 1,]
  
  
  # filter sample for only productive or unproductive clonotypes
  
  if(cloneStatus == "p"){
    sam = getProductive(sam)
  }else if(cloneStatus == "unp"){
    sam = getUnproductive(sam)
  }
  
  return(sam)
  
}


#' Read in Repertoire data (Repseq sample) in the recent (version 3) Immunoseq format
#' 
#' @param samplename name of the immunoseq formatted repertoire sample (full address or in the working directory)
#' @param cloneStatus logical; reading only productive or only unproductive CDR3s from the sample; default is productive only
#' @param filterOutBelowSize1 logical; drop clonotypes with a size of 1, default is true
#' @return a repseq sample is returned
#' 

readSampleV2 <- function(samplename,cloneStatus="p",filterOutBelowSize1=T){
  
  # read sample name and change colnames to uppercase
  
  #sam <- read.table(samplename, header=T, sep="\t",dec = ".",stringsAsFactors=F)
  #sam <- as.data.frame(fread(samplename, header=T, sep="\t",stringsAsFactors=F,verbose=F,showProgress=F))
  
  sam <- as.data.frame(fread(samplename, header=T, sep="\t",dec = ".",stringsAsFactors=F))
  
  #sam <- read.table(samplename, header=T, sep="\t",dec = ".",stringsAsFactors=F)
  
  colnames(sam) <- toupper(colnames(sam))
  sam <- sam[,which(colnames(sam)=="NUCLEOTIDE"):length(colnames(sam))] # remove unwanted columns that come with some data e.g container etc
  
  # Extract cdr3 sequences and put on one column
  CDR3NT = substring(sam$NUCLEOTIDE,sam$VINDEX+1,sam$VINDEX + sam$CDR3LENGTH)
  sam <- data.frame(sam,CDR3NT,stringsAsFactors=F)
  
  # if there is a column named copy, change it to COUNT
  
  colnames(sam)[grep("TEMPLATES",colnames(sam))] <- "COUNT"
  colnames(sam)[grep("FREQUENCYCOUNT",colnames(sam))] <- "FREQUENCYCOUNT"
  
  
  if(filterOutBelowSize1==T)
    sam <- sam[sam[,"COUNT"] > 1,]
  
  
  #colnames(sam)[grep("AMINOACID",colnames(sam))] <- "AMINOACID"
  #colnames(sam)[grep("VGENENAME",colnames(sam))] <- "VGENENAME"
  #colnames(sam)[grep("SEQUENCESTATUS",colnames(sam))] <- "SEQUENCESTATUS"
  
  # filter sample for only productive or unproductive clonotypes
  
  if(cloneStatus == "p"){
    sam = getProductive(sam)
  }else if(cloneStatus == "unp"){
    sam = getUnproductive(sam)
  }
  
  return(sam)
  
}


#' Read in MiXCR format data and change to Immunoseq format
#' 
#' @param samplename name of the MiXCR formatted repertoire sample (full address or in the working directory)
#' @param cloneStatus logical; reading only productive or only unproductive CDR3s from the sample; default is productive only
#' @param filterOutBelowSize1 logical; drop clonotypes with a size of 1, default is true
#' @return a repseq sample is returned
#' 

readMiXCR <- function(samplename,cloneStatus="p",filterOutBelowSize1=T){
  
  # read the file fast, these are big files usually
  
  sam <- as.data.frame(data.table::fread(samplename, header=T, dec = ".",stringsAsFactors=F))
  
  # we drop clones with size 1 at this point since these data are huge, in addition Immunoseq min clone size is 2.
  
  if(filterOutBelowSize1==T)
    sam <- sam[sam[,"Clone count"] > 1,]
  
  
  sam$CDR3length <- sapply(sam[,"N. Seq. CDR3"],nchar)
  
  sam$vgenename <- sapply(sam[,"All V hits"],function(x) unlist(strsplit(unlist(strsplit(x,","))[1],"\\*"))[1])
  sam$vgenename[is.na(sam$vgenename)] <- "unresolved"
  
  sam$dgenename <- sapply(sam[,"All D hits"],function(x) unlist(strsplit(unlist(strsplit(x,","))[1],"\\*"))[1])
  sam$dgenename[is.na(sam$dgenename)] <- "unresolved"
  
  sam$jgenename <- sapply(sam[,"All J hits"],function(x) unlist(strsplit(unlist(strsplit(x,","))[1],"\\*"))[1])
  sam$jgenename[is.na(sam$jgenename)] <- "unresolved"
  
  # continue by figuring out the stop and out of frame clones, then reorder the columns, then name them using immunoseq names.Then return sample
  #sam[,"AA. Seq. CDR3"]
  
  sam$SEQUENCESTATUS <- "In"
  sam$SEQUENCESTATUS[grep("\\*",sam[,"AA. Seq. CDR3"])] <- "Stop"
  
  reOrderColumnsToImmunoseqFormat <- c("Clonal sequence(s)","AA. Seq. CDR3","Clone count",
                                       "Clone fraction","CDR3length","vgenename","dgenename","jgenename","SEQUENCESTATUS","N. Seq. CDR3") 
  
  sam <- sam[,reOrderColumnsToImmunoseqFormat]
  
  colnames(sam) <- c("NUCLEOTIDE","AMINOACID","COUNT","FREQUENCYCOUNT","CDR3LENGTH","VGENENAME","DGENENAME","JGENENAME","SEQUENCESTATUS","CDR3NT")
  colnames(sam) <- toupper(colnames(sam))
  
  # filter sample for only productive or unproductive clonotypes
  
  if(cloneStatus == "p"){
    sam = getProductive(sam)
  }else if(cloneStatus == "unp"){
    sam = getUnproductive(sam)
  }
  
  return(sam)
  
}


renameCopyTOCount <- function(samplename){
  # change some variable names
  for(rep in reps){
    sam <- get(rep)
    
    if(sum(colnames(sam) == "COPY") == 1){
      
      colnames(sam)[which(colnames(sam) == "COPY")] <- "UNNORMALIZEDCOUNT"
      colnames(sam)[which(colnames(sam) == "NORMALIZEDCOPY")] <- "COUNT"
      colnames(sam)[which(colnames(sam) == "NORMALIZEDFREQUENCY")] <- "FREQUENCYCOUNT"
      
      
    }
    
  }

}

extractCDR3 <- function(samplename){
  for(rep in reps){
    p <- get(rep)
    CDR3NT = substring(p$NUCLEOTIDE,p$VINDEX,p$VINDEX+p$CDR3LENGTH)
    p <- data.frame(p,CDR3NT)
    
  }
  
}





#### Important Data related functions : eg. extract productive clones only etc. ####

#' Get productive only CDR3s from a repertoire data (Repseq sample) in immunoseq format
#' 
#' @param rep name of the repertoire sample (which has already been read in with for example with read.table)
#' @return a subset of the repertoire (rep) containing only productive CDR3s is returned
#' 

getProductive <- function(rep) {
  p <- rep[rep$SEQUENCESTATUS=="Productive" | rep$SEQUENCESTATUS=="In",]
  return(p)
}

#' Get unproductive CDR3s from a repertoire data (Repseq sample) in immunoseq format
#' 
#' @param rep name of the repertoire sample (which has already been read in with for example with read.table)
#' @return a subset of the repertoire (rep) containing only unproductive CDR3s is returned
#' 

getUnproductive <- function(rep) {
  p <- rep[!(rep$SEQUENCESTATUS=="Productive" | rep$SEQUENCESTATUS=="In"),]
  return(p)
}

#' Get out of frame CDR3s from a repertoire data (Repseq sample) in immunoseq format
#' 
#' @param rep name of the repertoire sample (which has already been read in with for example with read.table)
#' @return a subset of the repertoire (rep) containing only out of frame CDR3s is returned
#' 
getOutOfFrame <- function(rep) {
  p <- rep[rep$SEQUENCESTATUS=="Out of frame" | rep$SEQUENCESTATUS=="Out",]
  return(p)
}

#' Get stop codon containing CDR3s from a repertoire data (Repseq sample) in immunoseq format
#' 
#' @param rep name of the repertoire sample (which has already been read in with for example with read.table)
#' @return a subset of the repertoire (rep) containing CDR3s with stop codon is returned
#' 
getHasStop <- function(rep) {
  p <- rep[rep$SEQUENCESTATUS=="Has stop" | rep$SEQUENCESTATUS=="Stop",]
  return(p)
}

#' Get CDR3s that have well defined V gene name from a repertoire data (Repseq sample) in immunoseq format
#' 
#' @param rep name of the repertoire sample (which has already been read in with for example with read.table)
#' @return a subset of the repertoire (rep) containing CDR3s with well defined v gene name is returned
#' 
getDefined <- function(rep) {
  p <- rep[rep$VGENENAME != "(undefined)" | rep$VGENENAME!= "unresolved",]
  return(p)
}



#### Getting all V and J genes in our data:  ####


getAllVgenes <- function(reps) 
{
  vgenes_GB=c()
  for(rep in reps){
    p <- get(rep)
    vgenes_GB=union(vgenes_GB,getUniqueVgenes(p))
  }
  
  return(sort(vgenes_GB))
}

getAllVfamilies <- function(reps) 
{
  vfam=c()
  for(rep in reps){
    p <- get(rep)
    vfam=union(vfam,getUniqueVfamilies(p))
  }
  
  return(sort(vfam))
}

getAllJgenes <- function(reps) 
{
  jgenes_GB=c()
  for(rep in reps){
    p <- get(rep)
    jgenes_GB=union(jgenes_GB,getUniqueJgenes(p))
  }
  
  return(sort(jgenes_GB))
}

getAllVJgenes <- function(reps) 
{
  jgenes=getAllJgenes(reps)
  vgenes=getAllVgenes(reps)
  ## All possible VJ combinations 
  vj <- expand.grid(vgenes,jgenes)
  colnames(vj) <- c("VGENENAME","JGENENAME")
  return(vj)
}


getUniqueVgenes <- function(rep) {
  p <- rep[rep$VGENENAME != "(undefined)" | rep$VGENENAME != "unresolved",]
  return(unique(p$VGENENAME)) # [-1] -1 inorder to drop the undefined Vgene type
}

getUniqueVfamilies <- function(rep) {
  p <- rep[rep$VFAMILYNAME != "(undefined)" | rep$VFAMILYNAME != "unresolved",]
  return(unique(p$VFAMILYNAME)) # [-1] -1 inorder to drop the undefined Vgene type
}

getUniqueVfamilies_PBMC <- function(rep) {
  p <- rep[rep$VFAMILYNAME != "(undefined)" | rep$VFAMILYNAME != "unresolved",]
  return(unique(p$VFAMILYNAME)) # [-1] -1 inorder to drop the undefined Vgene type
}

getUniqueJgenes <- function(rep) {
  p <- rep[rep$JGENENAME != "(undefined)" | rep$JGENENAME != "unresolved",]
  return(unique(p$JGENENAME))
}


#### V, J or VJ usage counts and proportions : 

# clone level
getVgeneCloneCountPMatrix<- function(reps,g){
  ### Returns a datamatrix whose first two colums show the V gene. The remaining columns show 
  ### The proportions of available clones using each v gene for each sample
  
  #reps = a vector of dataframe names. The data frames are TCR repertoire data. 
  
  m <- as.data.frame(g)
  colnames(m)  <- c("VGENENAME")
  for(rep in reps){
    print(rep)
    p <- get(rep)
    p <- getProductive(p)
    print (nrow(p))
    #p <- getDefined(p) 
    vfamAgg <- aggregate(AMINOACID~ VGENENAME ,data = p, FUN = length)
    vfamAgg[,2] <- vfamAgg[,2]/sum(vfamAgg[,2]) # get the proportion
    colnames(vfamAgg) <- c("VGENENAME",rep)  
    m <- merge(m,vfamAgg, by="VGENENAME",all.x=T) 
    
  }
  
 
  m[is.na(m)] <- 0
  
  rownames(m) <- m[,1]
  m <- m[,-1]
  
  return(m)
}

getVgeneCloneCountMatrix <- function(reps,g){
  ### Returns a datamatrix whose first two colums show the V gene. The remaining columns show 
  ### The number of available clones using each v gene for each sample
  
  #reps = a vector of dataframe names. The data frames are TCR repertoire data
  
  m <- as.data.frame(g)
  colnames(m)  <- c("VGENENAME")
  for(rep in reps){
    print(rep)
    p <- get(rep)
    p <- getProductive(p)
    print (nrow(p))
    #p <- getDefined(p) 
    vgeneAgg <- aggregate(AMINOACID~ VGENENAME ,data = p, FUN = length)
    colnames(vgeneAgg) <- c("VGENENAME",rep)  
    m <- merge(m,vgeneAgg, by="VGENENAME",all.x=T) 
    
  }
  m[is.na(m)] <- 0
  
  rownames(m) <- m[,1]
  m <- m[,-1]
  return(m)
}


getVgeneCloneCountPMatrixUnprod<- function(reps,g){
  ### Returns a datamatrix whose first two colums show the V gene. The remaining columns show 
  ### The proportions of available unproductive clones using each v gene for each sample
  
  #reps = a vector of dataframe names. The data frames are TCR repertoire data
  
  m <- as.data.frame(g)
  colnames(m)  <- c("VGENENAME")
  for(rep in reps){
    print(rep)
    p <- get(rep)
    p <- getUnproductive(p)
    print (nrow(p))
    #p <- getDefined(p) 
    vfamAgg <- aggregate(AMINOACID~ VGENENAME ,data = p, FUN = length)
    vfamAgg[,2] <- vfamAgg[,2]/sum(vfamAgg[,2]) # get the proportion
    colnames(vfamAgg) <- c("VGENENAME",paste(rep)) 
    m <- merge(m,vfamAgg, by="VGENENAME",all.x=T) 
    
  }
  m[is.na(m)] <- 0
  
  rownames(m) <- m[,1]
  m <- m[,-1]
  return(m)
}

getVgeneCloneCountMatrixUnprod <- function(reps,g){
  ### Returns a datamatrix whose first two colums show the V gene. The remaining columns show 
  ### The number of available unproductive clones using each v gene for each sample
  
  #reps = a vector of dataframe names. The data frames are TCR repertoire data
  
  m <- as.data.frame(g)
  colnames(m)  <- c("VGENENAME")
  for(rep in reps){
    print(rep)
    p <- get(rep)
    p <- getUnproductive(p)
    print (nrow(p))
    #p <- getDefined(p) 
    vgeneAgg <- aggregate(AMINOACID~ VGENENAME ,data = p, FUN = length)
    colnames(vgeneAgg) <- c("VGENENAME",rep)  
    m <- merge(m,vgeneAgg, by="VGENENAME",all.x=T) 
    
  }
  m[is.na(m)] <- 0
  rownames(m) <- m[,1]
  m <- m[,-1]
  return(m)
}


getVjCloneCountMatrix<- function(reps,vj){
  ### Returns a datamatrix whose first two colums show the Vj gene combinations. The remaining columns show 
  ### The number of available clones using each vj gene combination for each sample
  
  #reps = a vector of dataframe names. The data frames are TCR repertoire data
  
  m <- vj
  colnames(m)  <- c("VGENENAME","JGENENAME")
  for(rep in reps){
    print(rep)
    p <- get(rep)
    p <- getProductive(p)
    #p <- getDefined(p) 
    vjPairAgg <- aggregate(AMINOACID~ VGENENAME + JGENENAME   ,data = p, FUN = length)
    print(head(vjPairAgg))
    colnames(vjPairAgg) <- c("VGENENAME","JGENENAME",rep)  
    m <- merge(m,vjPairAgg, by=c("VGENENAME","JGENENAME"),all.x=T) 
    
  }
  m[is.na(m)] <- 0
  m[,1]<-paste(m[,1],m[,2])
  m<-m[,-2]
  
  rownames(m) <- m[,1]
  m <- m[,-1]
  return(m)
}

getVjCloneCountMatrixUnprod<- function(reps,vj){
  ### Returns a datamatrix whose first two colums show the Vj gene. The remaining columns show 
  ### The number of available unproductive clones using each vj gene for each sample
  
  #reps = a vector of dataframe names. The data frames are TCR repertoire data
  
  m <- vj
  colnames(m)  <- c("VGENENAME","JGENENAME")
  
  for(rep in reps){
    print(rep)
    p <- get(rep)
    p <- getUnproductive(p)
    #p <- getDefined(p) 
    vjPairAgg <- aggregate(AMINOACID~ VGENENAME + JGENENAME   ,data = p, FUN = length)
    colnames(vjPairAgg) <- c("VGENENAME","JGENENAME",rep)  
    m <- merge(m,vjPairAgg, by=c("VGENENAME","JGENENAME"),all.x=T) 
    
  }
  m[is.na(m)] <- 0
  m[,1]<-paste(m[,1],m[,2])
  m<-m[,-2]
  
  rownames(m) <- m[,1]
  m <- m[,-1]
  return(m)
}

getVjCloneCountPMatrix<- function(reps,vj){
  ### Returns a datamatrix whose first two colums show the Vj gene. The remaining columns show 
  ### The proportions of available clones using each vj gene for each sample
  
  #reps = a vector of dataframe names. The data frames are TCR repertoire data
  
  m <- vj
  colnames(m)  <- c("VGENENAME","JGENENAME")
  
  for(rep in reps){
    print(rep)
    p <- get(rep)
    p <- getProductive(p)
    #p <- getDefined(p) 
    vjPairAgg <- aggregate(AMINOACID~ VGENENAME + JGENENAME   ,data = p, FUN = length)
    vjPairAgg[,3] <- vjPairAgg[,3]/sum(vjPairAgg[,3]) # get the proportion
    colnames(vjPairAgg) <- c("VGENENAME","JGENENAME",rep)  
    m <- merge(m,vjPairAgg, by=c("VGENENAME","JGENENAME"),all.x=T) 
    
  }
  m[is.na(m)] <- 0
  m[,1]<-paste(m[,1],m[,2])
  m<-m[,-2]
  
  rownames(m) <- m[,1]
  m <- m[,-1]
  return(m)
}

getVjCloneCountPMatrixUnprod<- function(reps,vj){
  ### Returns a datamatrix whose first two colums show the Vj gene. The remaining columns show 
  ### The proportions of available unproductive clones using each vj gene for each sample
  
  #reps = a vector of dataframe names. The data frames are TCR repertoire data
  
  m <- vj
  colnames(m)  <- c("VGENENAME","JGENENAME")
  
  for(rep in reps){
    print(rep)
    p <- get(rep)
    p <- getUnproductive(p)
    #p <- getDefined(p) 
    vjPairAgg <- aggregate(AMINOACID~ VGENENAME + JGENENAME   ,data = p, FUN = length)
    vjPairAgg[,3] <- vjPairAgg[,3]/sum(vjPairAgg[,3]) # get the proportion
    colnames(vjPairAgg) <- c("VGENENAME","JGENENAME",rep)  
    m <- merge(m,vjPairAgg, by=c("VGENENAME","JGENENAME"),all.x=T) 
    
  }
  m[is.na(m)] <- 0
  m[,1]<-paste(m[,1],m[,2])
  m<-m[,-2]
  
  rownames(m) <- m[,1]
  m <- m[,-1]
  return(m)
}


# read level

getVgeneReadCountPMatrix<- function(reps,g){
  ### Returns a datamatrix whose first two colums show the V gene. The remaining columns show 
  ### The proportions of available clones using each v gene for each sample
  
  #reps = a vector of dataframe names. The data frames are TCR repertoire data. 
  
  m <- as.data.frame(g)
  colnames(m)  <- c("VGENENAME")
  for(rep in reps){
    print(rep)
    p <- get(rep)
    p <- getProductive(p)
    print (nrow(p))
    #p <- getDefined(p) 
    vfamAgg <- aggregate(COUNT~ VGENENAME ,data = p, FUN = sum)
    vfamAgg[,2] <- vfamAgg[,2]/sum(vfamAgg[,2]) # get the proportion
    colnames(vfamAgg) <- c("VGENENAME",rep)  
    m <- merge(m,vfamAgg, by="VGENENAME",all.x=T) 
    
  }
  m[is.na(m)] <- 0
  
  rownames(m) <- m[,1]
  m <- m[,-1]
  return(m)
}

getVgeneReadCountMatrix <- function(reps,g){
  ### Returns a datamatrix whose first two colums show the V gene. The remaining columns show 
  ### The number of available clones using each v gene for each sample
  
  #reps = a vector of dataframe names. The data frames are TCR repertoire data
  
  m <- as.data.frame(g)
  colnames(m)  <- c("VGENENAME")
  for(rep in reps){
    print(rep)
    p <- get(rep)
    p <- getProductive(p)
    print (nrow(p))
    #p <- getDefined(p) 
    vgeneAgg <- aggregate(COUNT~ VGENENAME ,data = p, FUN = sum)
    colnames(vgeneAgg) <- c("VGENENAME",rep)  
    m <- merge(m,vgeneAgg, by="VGENENAME",all.x=T) 
    
  }
  m[is.na(m)] <- 0
  rownames(m) <- m[,1]
  m <- m[,-1]
  return(m)
}

getVgeneReadCountPMatrixUnprod<- function(reps,g){
  ### Returns a datamatrix whose first two colums show the V gene. The remaining columns show 
  ### The proportions of available unproductive clones using each v gene for each sample
  
  #reps = a vector of dataframe names. The data frames are TCR repertoire data
  
  m <- as.data.frame(g)
  colnames(m)  <- c("VGENENAME")
  for(rep in reps){
    print(rep)
    p <- get(rep)
    p <- getUnproductive(p)
    print (nrow(p))
    #p <- getDefined(p) 
    vfamAgg <- aggregate(COUNT~ VGENENAME ,data = p, FUN = sum)
    vfamAgg[,2] <- vfamAgg[,2]/sum(vfamAgg[,2]) # get the proportion
    colnames(vfamAgg) <- c("VGENENAME",paste(rep)) 
    m <- merge(m,vfamAgg, by="VGENENAME",all.x=T) 
    
  }
  m[is.na(m)] <- 0
  rownames(m) <- m[,1]
  m <- m[,-1]
  return(m)
}

getVgeneReadCountMatrixUnprod <- function(reps,g){
  ### Returns a datamatrix whose first two colums show the V gene. The remaining columns show 
  ### The number of available unproductive clones using each v gene for each sample
  
  #reps = a vector of dataframe names. The data frames are TCR repertoire data
  
  m <- as.data.frame(g)
  colnames(m)  <- c("VGENENAME")
  for(rep in reps){
    print(rep)
    p <- get(rep)
    p <- getUnproductive(p)
    print (nrow(p))
    #p <- getDefined(p) 
    vgeneAgg <- aggregate(COUNT~ VGENENAME ,data = p, FUN = sum)
    colnames(vgeneAgg) <- c("VGENENAME",rep)  
    m <- merge(m,vgeneAgg, by="VGENENAME",all.x=T) 
    
  }
  m[is.na(m)] <- 0
  
  rownames(m) <- m[,1]
  m <- m[,-1]
  return(m)
}

getVjReadCountMatrix<- function(reps,vj){
  ### Returns a datamatrix whose first two colums show the vj gene. The remaining columns show 
  ### The number of available clones using each vj gene for each sample
  
  #reps = a vector of dataframe names. The data frames are TCR repertoire data
  
  m <- vj
  colnames(m)  <- c("VGENENAME","JGENENAME")
  
  for(rep in reps){
    print(rep)
    p <- get(rep)
    p <- getProductive(p)
    #p <- getDefined(p) 
    vjPairAgg <- aggregate(COUNT~ VGENENAME + JGENENAME   ,data = p, FUN = sum)
    colnames(vjPairAgg) <- c("VGENENAME","JGENENAME",rep)  
    m <- merge(m,vjPairAgg, by=c("VGENENAME","JGENENAME"),all.x=T) 
    
  }
  m[is.na(m)] <- 0
  m[,1]<-paste(m[,1],m[,2])
  m<-m[,-2]
  
  rownames(m) <- m[,1]
  m <- m[,-1]
  return(m)
}

getVjReadCountMatrixUnprod<- function(reps,vj){
  ### Returns a datamatrix whose first two colums show the Vj gene. The remaining columns show 
  ### The number of available unproductive clones using each vj gene for each sample
  
  #reps = a vector of dataframe names. The data frames are TCR repertoire data
  
  m <- vj
  colnames(m)  <- c("VGENENAME","JGENENAME")
  
  for(rep in reps){
    print(rep)
    p <- get(rep)
    p <- getUnproductive(p)
    #p <- getDefined(p) 
    vjPairAgg <- aggregate(COUNT~ VGENENAME + JGENENAME   ,data = p, FUN = sum)
    colnames(vjPairAgg) <- c("VGENENAME","JGENENAME",rep)  
    m <- merge(m,vjPairAgg, by=c("VGENENAME","JGENENAME"),all.x=T) 
    
  }
  m[is.na(m)] <- 0
  m[,1]<-paste(m[,1],m[,2])
  m<-m[,-2]
  
  rownames(m) <- m[,1]
  m <- m[,-1]
  return(m)
}

getVjReadCountPMatrix<- function(reps,vj){
  ### Returns a datamatrix whose first two colums show the Vj gene. The remaining columns show 
  ### The proportions of available clones using each vj gene for each sample
  
  #reps = a vector of dataframe names. The data frames are TCR repertoire data
  
  m <- vj
  colnames(m)  <- c("VGENENAME","JGENENAME")
  
  for(rep in reps){
    print(rep)
    p <- get(rep)
    p <- getProductive(p)
    #p <- getDefined(p) 
    vjPairAgg <- aggregate(COUNT~ VGENENAME + JGENENAME   ,data = p, FUN = sum)
    vjPairAgg[,3] <- vjPairAgg[,3]/sum(vjPairAgg[,3]) # get the proportion
    colnames(vjPairAgg) <- c("VGENENAME","JGENENAME",rep)  
    m <- merge(m,vjPairAgg, by=c("VGENENAME","JGENENAME"),all.x=T) 
    
  }
  m[is.na(m)] <- 0
  m[,1]<-paste(m[,1],m[,2])
  m<-m[,-2]
  
  rownames(m) <- m[,1]
  m <- m[,-1]
  return(m)
}

getVjReadCountPMatrixUnprod<- function(reps,vj){
  ### Returns a datamatrix whose first two colums show the Vj gene. The remaining columns show 
  ### The proportions of available unprod. clones using each vj gene for each sample
  
  #reps = a vector of dataframe names. The data frames are TCR repertoire data
  
  m <- vj
  colnames(m)  <- c("VGENENAME","JGENENAME")
  
  for(rep in reps){
    print(rep)
    p <- get(rep)
    p <- getUnproductive(p)
    #p <- getDefined(p) 
    vjPairAgg <- aggregate(COUNT~ VGENENAME + JGENENAME   ,data = p, FUN = sum)
    vjPairAgg[,3] <- vjPairAgg[,3]/sum(vjPairAgg[,3]) # get the proportion
    colnames(vjPairAgg) <- c("VGENENAME","JGENENAME",rep)  
    m <- merge(m,vjPairAgg, by=c("VGENENAME","JGENENAME"),all.x=T) 
    
  }
  m[is.na(m)] <- 0
  m[,1]<-paste(m[,1],m[,2])
  m<-m[,-2]
  
  rownames(m) <- m[,1]
  m <- m[,-1]
  return(m)
}



# General Counting function for V, J, VJ genes for each sequence status  ####

getAggregateCount <- function(reps,countfor="V",seqStatus="IN",lev="A",countsorpro="prop"){

  ### Returns a datamatrix whose first two colums show the features The remaining columns show 
  ### The number of available clones for each feature :
  
  #reps - a vector of dataframe names. The data frames are TCR repertoire data.
  # countfor - the features to be counted, default V for VgeneNames, vf for V family, J for J gene, VJ for vj genes
  # seqStatus - for which sequences is the count to be done. default In for productive, Out for out of frame, Stop for stop codon containing sequences
  # lev - at which level is the counting to be done . default A for Amino acid, N for nucleotide level
  # countsorpro - return counts raw counts or proportions
  
  # setting work parameters
  countfor = toupper(countfor)
  seqStatus= toupper(seqStatus)
  lev = toupper(lev)
  countsorpro= toupper(countsorpro)
  
  print(countfor)
  
  if(countfor=="V"){
    
    
    features= getAllVgenes(reps)
    
    m <- as.data.frame(features)
    colnames(m)  <- c("VGENENAME")
    
    
      for(rep in reps){
        print(rep)
        p <- get(rep)
        
        if(seqStatus=="IN"){
          p <- getProductive(p)
        }else if(seqStatus=="OUT"){
          p <- getOutOfFrame(p)
        }else if(seqStatus=="STOP"){
          p <- getHasStop(p)
        }
        
        print (nrow(p))
     
        if(lev=="N"){
        dAgg <- aggregate(NUCLEOTIDE~ VGENENAME ,data = p, FUN = length)
        }else {
        dAgg <- aggregate(AMINOACID~ VGENENAME ,data = p, FUN = length) 
        }
        
        if(countsorpro=="PROP"){
        dAgg[,2] <- dAgg[,2]/sum(dAgg[,2]) # get the proportion
        }
        colnames(dAgg) <- c("VGENENAME",rep)  
        m <- merge(m,dAgg, by="VGENENAME",all.x=T) 
        
      }
      m[is.na(m)] <- 0
      
      rownames(m) <- m[,1]
      m <- m[,-1]
      
     
     
  }
  else if(countfor=="J"){
    features= getAllJgenes(reps)
    
    m <- as.data.frame(features)
    colnames(m)  <- c("JGENENAME")
    
    for(rep in reps){
      print(rep)
      p <- get(rep)
      
      if(seqStatus=="IN"){
        p <- getProductive(p)
      }else if(seqStatus=="OUT"){
        p <- getOutOfFrame(p)
      }else if(seqStatus=="STOP"){
        p <- getHasStop(p)
      }
      
      print (nrow(p))
      
      if(lev=="N"){
        dAgg <- aggregate(NUCLEOTIDE~ JGENENAME ,data = p, FUN = length)
      }else {
        dAgg <- aggregate(AMINOACID~ JGENENAME ,data = p, FUN = length) 
      }
      
      if(countsorpro=="PROP"){
      dAgg[,2] <- dAgg[,2]/sum(dAgg[,2]) # get the proportion
      }
      colnames(dAgg) <- c("JGENENAME",rep)  
      m <- merge(m,dAgg, by="JGENENAME",all.x=T) 
      
    }
    m[is.na(m)] <- 0
    
    rownames(m) <- m[,1]
    m <- m[,-1]
    
    
    
  }
  
  else if(countfor=="VF"){
    features= getAllVfamilies(reps)
    
    m <- as.data.frame(features)
    colnames(m)  <- c("VFAMILYNAME")
    
    for(rep in reps){
      print(rep)
      p <- get(rep)
      
      if(seqStatus=="IN"){
        p <- getProductive(p)
      }else if(seqStatus=="OUT"){
        p <- getOutOfFrame(p)
      }else if(seqStatus=="STOP"){
        p <- getHasStop(p)
      }
      
      print (nrow(p))
      
      if(lev=="N"){
        dAgg <- aggregate(NUCLEOTIDE~ VFAMILYNAME ,data = p, FUN = length)
      }else {
        dAgg <- aggregate(AMINOACID~ VFAMILYNAME ,data = p, FUN = length) 
      }
      
      if(countsorpro=="PROP"){
      dAgg[,2] <- dAgg[,2]/sum(dAgg[,2]) # get the proportion
      }
      colnames(dAgg) <- c("VFAMILYNAME",rep)  
      m <- merge(m,dAgg, by="VFAMILYNAME",all.x=T) 
      
    }
    m[is.na(m)] <- 0
    
    rownames(m) <- m[,1]
    m <- m[,-1]
    
     
  }
  
  else if(countfor=="VJ"){
    features= getAllVJgenes(reps)
    
    m <- features
    colnames(m)  <- c("VGENENAME","JGENENAME")
    
    for(rep in reps){
      print(rep)
      p <- get(rep)
      
      if(seqStatus=="IN"){
        p <- getProductive(p)
      }else if(seqStatus=="OUT"){
        p <- getOutOfFrame(p)
      }else if(seqStatus=="STOP"){
        p <- getHasStop(p)
      }
      
      print (nrow(p))
      
      if(lev=="N"){
        dAgg <- aggregate(NUCLEOTIDE~ VGENENAME + JGENENAME ,data = p, FUN = length)
      }else {
        dAgg <- aggregate(AMINOACID~ VGENENAME + JGENENAME ,data = p, FUN = length) 
      }
      
      if(countsorpro=="PROP"){
      dAgg[,3] <- dAgg[,3]/sum(dAgg[,3]) # get the proportion
      }
      colnames(dAgg) <- c("VGENENAME","JGENENAME",rep)  
      m <- merge(m,dAgg, by=c("VGENENAME","JGENENAME"),all.x=T) 
      
    }
    m[is.na(m)] <- 0
    m[,1]<-paste(m[,1],m[,2])
    m<-m[,-2]
    
    rownames(m) <- m[,1]
    m <- m[,-1]
    
  }
  
  
  return(m)
 
}


# get overlapping clonotype counts for every pair of samples: using top 5% or 10% etc #### 

getAbundantClonotypesbyperc<- function(rep,topCutoff=10){
  
  # Return the top abundant clonotypes using the topcutoff , default is top 5%
  
  print(nrow(rep))
  y=rep$COUNT/sum(rep$COUNT)
  y=cumsum(y) #cumulitive read size proportion at each clone, ordered from biggest to smallest
  
  cuttingPoints=topCutoff/100
  y=cut(y,seq(0,1,by=cuttingPoints)) # Cut the cumulitive proportion values in y by 10% each
  
  
  # We can split the whole data frame using the 5% split factor
  
  rep_subgroups=split(rep,y)
  
  
  return(rep_subgroups[[1]])
  
  
}

getAbundantClonotypesbynumber<- function(rep,topCutoff=10){
  
  # Return the top abundant clonotypes using the topcutoff , default is top 10
  # it assumes rep is already sorted from biggest clone to smallest
  
  print(nrow(rep))
  
  return(rep[1:topCutoff,])
  
  
}


overLappingClonotypes<- function(reps,cutoff=20,cutoffby="perc"){
  ### Returns a datamatrix of top topnumber clonotopes from each sample plus: overlap count or Jaccard index, for prod and non productive sequences, N insertion size
  
  
  # setting work parameters
  cutoffby = toupper(cutoffby)
 
  
  m <- as.data.frame(expand.grid(reps,reps))
  colnames(m)  <- c("Sample1","Sample2")
  
  toadd= data.frame()
  
 
  
  for(r in 1:nrow(m)){
    
    overlapValuesProd=c()
    overlapValuesUnprod=c()
    overlapValuesProdJI=c()
    overlapValuesUnprodJI=c()
    
    overlapValuesProdJI_abundancebased=c()
    overlapValuesUnprodJI_abundancebased=c()
    
    NinsertionProd1=c()
    NinsertionUnprod1=c()
    
    NinsertionProd2=c()
    NinsertionUnprod2=c()
    
    
    print(r)
    sam1=m[r,1]
    sam2=m[r,2]
    
    #print(sam1)
    #print(sam2)
  
    sam1 <- get(as.character(sam1))
    sam2 <- get(as.character(sam2))
    
    
    sam1Prod <- getProductive(sam1)
    sam1Unprod <- getUnproductive(sam1)
    
    sam2Prod <- getProductive(sam2)
    sam2Unprod <- getUnproductive(sam2)
   
 
    if(cutoffby=="PERC"){
      topclones_sam1Prod = getAbundantClonotypesbyperc(sam1Prod,cutoff)
      topclones_sam1Unprod = getAbundantClonotypesbyperc(sam1Unprod,cutoff)
      
      topclones_sam2Prod = getAbundantClonotypesbyperc(sam2Prod,cutoff)
      topclones_sam2Unprod = getAbundantClonotypesbyperc(sam2Unprod,cutoff)
      
      # commons
      commonClonesProd=length(intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE))
      unionProd=length(union(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE))
      
      # commonality checked at the Nt level for non productive sequences, no AA given in the original data
      commonClonesUnprod=length(intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE))
      unionUnprod=length(union(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE))
      
      
      overlapValuesProd=c(overlapValuesProd,as.numeric(commonClonesProd))
      overlapValuesUnprod=c(overlapValuesUnprod,as.numeric(commonClonesUnprod))
      
      overlapValuesProdJI=c(overlapValuesProdJI,formatC(commonClonesProd/unionProd,digits= 8,format="f"))
      overlapValuesUnprodJI = c(overlapValuesUnprodJI,formatC(commonClonesUnprod/unionUnprod,digits= 8,format="f"))
      
      # Jaccard abundance-based similarity index (UV/(U+V-UV)). U is total relative abundance in of shared sample 1, U total relative of shared in sample 2
      
      commonClonesProd_U_data= topclones_sam1Prod[topclones_sam1Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonesProd_U = sum(commonClonesProd_U_data$COUNT)/sum(topclones_sam1Prod$COUNT)
      
      commonClonesProd_V_data= topclones_sam2Prod[topclones_sam2Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonesProd_V = sum(commonClonesProd_V_data$COUNT)/sum(topclones_sam2Prod$COUNT)
      
      uv=commonClonesProd_U*commonClonesProd_V
      uplusv=commonClonesProd_U+commonClonesProd_V
      
      AbundanceBasedJIProd = format(uv/(uplusv-uv),digits=8,format="f")
      
      overlapValuesProdJI_abundancebased=c(overlapValuesProdJI_abundancebased,as.numeric(AbundanceBasedJIProd))
      
        # for unproductive sequences
      
      commonClonesUnProd_U_data= topclones_sam1Unprod[topclones_sam1Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonesUnProd_U = sum(commonClonesUnProd_U_data$COUNT)/sum(topclones_sam1Unprod$COUNT)
      
      commonClonesUnProd_V_data= topclones_sam2Unprod[topclones_sam2Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonesUnProd_V = sum(commonClonesUnProd_V_data$COUNT)/sum(topclones_sam2Unprod$COUNT)
      
      uv_UnProd=commonClonesUnProd_U * commonClonesUnProd_V
      uplusv_UnProd=commonClonesUnProd_U + commonClonesUnProd_V
      
      AbundanceBasedJIUnProd = formatC(uv_UnProd/(uplusv_UnProd-uv_UnProd),digits=8,format="f")
      
      overlapValuesUnprodJI_abundancebased=c(overlapValuesUnprodJI_abundancebased,as.numeric(AbundanceBasedJIUnProd))
     
      
 
      # Ninsertions  in common sequences
      
      commonClonotypes1= topclones_sam1Prod[topclones_sam1Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonotypes2= topclones_sam2Prod[topclones_sam2Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      
      commonClonotypes1Unprod= topclones_sam1Unprod[topclones_sam1Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonotypes2Unprod= topclones_sam2Unprod[topclones_sam2Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      
      
      NinsertionsP1= mean(commonClonotypes1$N2INSERTION + commonClonotypes1$N1INSERTION)
      NinsertionsP2 = mean(commonClonotypes2$N2INSERTION + commonClonotypes2$N1INSERTION)
      
      NinsertionsUP1= mean(commonClonotypes1Unprod$N2INSERTION + commonClonotypes1Unprod$N1INSERTION)
      NinsertionsUP2 = mean(commonClonotypes2Unprod$N2INSERTION + commonClonotypes2Unprod$N1INSERTION)
      
      
      NinsertionProd1=c(NinsertionProd1,NinsertionsP1)
      NinsertionProd2= c(NinsertionProd2,NinsertionsP2)
      
      NinsertionUnprod1=c(NinsertionUnprod1,NinsertionsUP1)
      NinsertionUnprod2=c(NinsertionUnprod2,NinsertionsUP2)
      
      
    }else{
      topclones_sam1Prod = getAbundantClonotypesbynumber(sam1Prod,cutoff)
      topclones_sam1Unprod = getAbundantClonotypesbynumber(sam1Unprod,cutoff)
      
      topclones_sam2Prod = getAbundantClonotypesbynumber(sam2Prod,cutoff)
      topclones_sam2Unprod = getAbundantClonotypesbynumber(sam2Unprod,cutoff)
      
      
      # commons
      # commons
      commonClonesProd=length(intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE))
      unionProd=length(union(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE))
      
      # commonality checked at the Nt level for non productive sequences, no AA given in the original data
      commonClonesUnprod=length(intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE))
      unionUnprod=length(union(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE))
      
      
      overlapValuesProd=c(overlapValuesProd,as.numeric(commonClonesProd))
      overlapValuesUnprod=c(overlapValuesUnprod,as.numeric(commonClonesUnprod))
      
      overlapValuesProdJI=c(overlapValuesProdJI,formatC(commonClonesProd/unionProd,digits= 8,format="f"))
      overlapValuesUnprodJI = c(overlapValuesUnprodJI,formatC(commonClonesUnprod/unionUnprod,digits= 8,format="f"))
      
      
      # Jaccard abundance-based similarity index (UV/(U+V-UV)). U is total relative abundance in of shared sample 1, U total relative of shared in sample 2
      
      commonClonesProd_U_data= topclones_sam1Prod[topclones_sam1Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonesProd_U = sum(commonClonesProd_U_data$COUNT)/sum(topclones_sam1Prod$COUNT)
      
      commonClonesProd_V_data= topclones_sam2Prod[topclones_sam2Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonesProd_V = sum(commonClonesProd_V_data$COUNT)/sum(topclones_sam2Prod$COUNT)
      
      uv=commonClonesProd_U*commonClonesProd_V
      uplusv=commonClonesProd_U+commonClonesProd_V
      
      AbundanceBasedJIProd = format(uv/(uplusv-uv),digits=8,format="f")
      
      overlapValuesProdJI_abundancebased=c(overlapValuesProdJI_abundancebased,as.numeric(AbundanceBasedJIProd))
      
      # for unproductive sequences
      
      commonClonesUnProd_U_data= topclones_sam1Unprod[topclones_sam1Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonesUnProd_U = sum(commonClonesUnProd_U_data$COUNT)/sum(topclones_sam1Unprod$COUNT)
      
      commonClonesUnProd_V_data= topclones_sam2Unprod[topclones_sam2Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonesUnProd_V = sum(commonClonesUnProd_V_data$COUNT)/sum(topclones_sam2Unprod$COUNT)
      
      uv_UnProd=commonClonesUnProd_U * commonClonesUnProd_V
      uplusv_UnProd=commonClonesUnProd_U + commonClonesUnProd_V
      
      AbundanceBasedJIUnProd = formatC(uv_UnProd/(uplusv_UnProd-uv_UnProd),digits=8,format="f")
      
      overlapValuesUnprodJI_abundancebased=c(overlapValuesUnprodJI_abundancebased,as.numeric(AbundanceBasedJIUnProd))
      
      
      
      # Ninsertions  in common sequences
      
      commonClonotypes1= topclones_sam1Prod[topclones_sam1Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonotypes2= topclones_sam2Prod[topclones_sam2Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      
      commonClonotypes1Unprod= topclones_sam1Unprod[topclones_sam1Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonotypes2Unprod= topclones_sam2Unprod[topclones_sam2Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      
      
      NinsertionsP1= mean(commonClonotypes1$N2INSERTION + commonClonotypes1$N1INSERTION)
      NinsertionsP2 = mean(commonClonotypes2$N2INSERTION + commonClonotypes2$N1INSERTION)
      
      NinsertionsUP1= mean(commonClonotypes1Unprod$N2INSERTION + commonClonotypes1Unprod$N1INSERTION)
      NinsertionsUP2 = mean(commonClonotypes2Unprod$N2INSERTION + commonClonotypes2Unprod$N1INSERTION)
      
      
      NinsertionProd1=c(NinsertionProd1,NinsertionsP1)
      NinsertionProd2= c(NinsertionProd2,NinsertionsP2)
      
      NinsertionUnprod1=c(NinsertionUnprod1,NinsertionsUP1)
      NinsertionUnprod2=c(NinsertionUnprod2,NinsertionsUP2)
        
      }
    
    #totalNumberOfProdClonotypes= nrow(topclones_sam1Prod) + nrow(topclones_sam2Prod)
    #totalNumberOfUnprodClonotypes= nrow(topclones_sam1Unprod) + nrow(topclones_sam2Unprod)

    
    if(nrow(toadd) > 0){
    newrow=data.frame(overlapValuesProd,overlapValuesUnprod,overlapValuesProdJI,overlapValuesUnprodJI,overlapValuesProdJI_abundancebased,overlapValuesUnprodJI_abundancebased,NinsertionProd1,NinsertionProd2,NinsertionUnprod1,NinsertionUnprod2)
    colnames(newrow) <- c("OverlapCountProductive","OverlapCountUnProductive","OverlapProd_JI","OverlapUnProd_JI","AbundanceBased_JI","AbundanceBased_JI","NinsertionProdSample1","NinsertionProdSample2","NinsertionUnProdSample1","NinsertionUnProdSample2")
    
    toadd=rbind(toadd,newrow)
    }else{
      toadd=data.frame(overlapValuesProd,overlapValuesUnprod,overlapValuesProdJI,overlapValuesUnprodJI,overlapValuesProdJI_abundancebased,overlapValuesUnprodJI_abundancebased,NinsertionProd1,NinsertionProd2,NinsertionUnprod1,NinsertionUnprod2)
      colnames(toadd) <- c("OverlapCountProductive","OverlapCountUnProductive","OverlapProd_JI","OverlapUnProd_JI","AbundanceBased_JI","AbundanceBased_JI","NinsertionProdSample1","NinsertionProdSample2","NinsertionUnProdSample1","NinsertionUnProdSample2")  
    }
    
  }
   
  

    
  print(nrow(toadd))
  resultTable=data.frame(m,toadd)        
  
  write.table(resultTable,file="Pairwise Overlap Counts.txt",row.names = T,sep = "\t")
  return(resultTable)
}

overLappingClonotypes2<- function(reps,cutoff=20,cutoffby="perc"){
  ### Returns a datamatrix of top topnumber clonotopes from each sample plus: overlap count or Jaccard index, for prod and non productive sequences, N insertion size
  
  
  # setting work parameters
  cutoffby = toupper(cutoffby)
  
  
  m <- as.data.frame(expand.grid(reps,reps))
  colnames(m)  <- c("Sample1","Sample2")
  
  toadd= data.frame()
  
  
  
  for(r in 1:nrow(m)){
    
    overlapValuesProd=c()
    overlapValuesUnprod=c()
    overlapValuesProdJI=c()
    overlapValuesUnprodJI=c()
    
    overlapValuesProdJI_abundancebased=c()
    overlapValuesUnprodJI_abundancebased=c()
    
    NinsertionProd1=c()
    NinsertionUnprod1=c()
    
    NinsertionProd2=c()
    NinsertionUnprod2=c()
    
    
    print(r)
    sam1=m[r,1]
    sam2=m[r,2]
    
    #print(sam1)
    #print(sam2)
    
    sam1 <- get(as.character(sam1))
    sam2 <- get(as.character(sam2))
    
    
    sam1Prod <- getProductive(sam1)
    sam1Unprod <- getUnproductive(sam1)
    
    sam2Prod <- getProductive(sam2)
    sam2Unprod <- getUnproductive(sam2)
    
    
    if(cutoffby=="PERC"){
      topclones_sam1Prod = getAbundantClonotypesbyperc(sam1Prod,cutoff)
      topclones_sam1Unprod = getAbundantClonotypesbyperc(sam1Unprod,cutoff)
      
      topclones_sam2Prod = getAbundantClonotypesbyperc(sam2Prod,cutoff)
      topclones_sam2Unprod = getAbundantClonotypesbyperc(sam2Unprod,cutoff)
      
      # commons
      commonClonesProd=length(intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE)) 
      unionProd=length(union(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE)) 
      
      # commonality checked at the Nt level for non productive sequences, no AA given in the original data
      commonClonesUnprod=length(intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE))
      unionUnprod=length(union(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE))
      
      if(unionUnprod==0){
        unionUnprod = 1
      }
      
      if(unionProd==0){
        unionProd = 1
      }
      
      overlapValuesProd=c(overlapValuesProd,as.numeric(commonClonesProd))
      overlapValuesUnprod=c(overlapValuesUnprod,as.numeric(commonClonesUnprod))
      
      overlapValuesProdJI=c(overlapValuesProdJI,commonClonesProd/unionProd)
      overlapValuesUnprodJI = c(overlapValuesUnprodJI,commonClonesUnprod/unionUnprod)
      
      # Jaccard abundance-based similarity index (UV/(U+V-UV)). U is total relative abundance in of shared sample 1, U total relative of shared in sample 2
      
      commonClonesProd_U_data= topclones_sam1Prod[topclones_sam1Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonesProd_U = sum(commonClonesProd_U_data$COUNT)/sum(topclones_sam1Prod$COUNT)
      
      commonClonesProd_V_data= topclones_sam2Prod[topclones_sam2Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonesProd_V = sum(commonClonesProd_V_data$COUNT)/sum(topclones_sam2Prod$COUNT)
      
      uv=commonClonesProd_U*commonClonesProd_V
      uplusv=commonClonesProd_U+commonClonesProd_V
      
      AbundanceBasedJIProd = uv/(uplusv-uv)
      
      overlapValuesProdJI_abundancebased=c(overlapValuesProdJI_abundancebased,as.numeric(AbundanceBasedJIProd))
      
      # for unproductive sequences
      
      commonClonesUnProd_U_data= topclones_sam1Unprod[topclones_sam1Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonesUnProd_U = sum(commonClonesUnProd_U_data$COUNT)/sum(topclones_sam1Unprod$COUNT)
      
      commonClonesUnProd_V_data= topclones_sam2Unprod[topclones_sam2Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonesUnProd_V = sum(commonClonesUnProd_V_data$COUNT)/sum(topclones_sam2Unprod$COUNT)
      
      uv_UnProd=commonClonesUnProd_U * commonClonesUnProd_V
      uplusv_UnProd=commonClonesUnProd_U + commonClonesUnProd_V
      
      AbundanceBasedJIUnProd = uv_UnProd/(uplusv_UnProd-uv_UnProd)
      
      overlapValuesUnprodJI_abundancebased=c(overlapValuesUnprodJI_abundancebased,as.numeric(AbundanceBasedJIUnProd))
      
      
      
      # Ninsertions  in common sequences
      
      commonClonotypes1= topclones_sam1Prod[topclones_sam1Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonotypes2= topclones_sam2Prod[topclones_sam2Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      
      commonClonotypes1Unprod= topclones_sam1Unprod[topclones_sam1Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonotypes2Unprod= topclones_sam2Unprod[topclones_sam2Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      
      
      NinsertionsP1= mean(commonClonotypes1$N2INSERTION + commonClonotypes1$N1INSERTION)
      NinsertionsP2 = mean(commonClonotypes2$N2INSERTION + commonClonotypes2$N1INSERTION)
      
      NinsertionsUP1= mean(commonClonotypes1Unprod$N2INSERTION + commonClonotypes1Unprod$N1INSERTION)
      NinsertionsUP2 = mean(commonClonotypes2Unprod$N2INSERTION + commonClonotypes2Unprod$N1INSERTION)
      
      
      NinsertionProd1=c(NinsertionProd1,NinsertionsP1)
      NinsertionProd2= c(NinsertionProd2,NinsertionsP2)
      
      NinsertionUnprod1=c(NinsertionUnprod1,NinsertionsUP1)
      NinsertionUnprod2=c(NinsertionUnprod2,NinsertionsUP2)
      
      
    }else{
      topclones_sam1Prod = getAbundantClonotypesbynumber(sam1Prod,cutoff)
      topclones_sam1Unprod = getAbundantClonotypesbynumber(sam1Unprod,cutoff)
      
      topclones_sam2Prod = getAbundantClonotypesbynumber(sam2Prod,cutoff)
      topclones_sam2Unprod = getAbundantClonotypesbynumber(sam2Unprod,cutoff)
      
      
      # commons
      # commons
      commonClonesProd=length(intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE))
      unionProd=length(union(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE))
      
      # commonality checked at the Nt level for non productive sequences, no AA given in the original data
      commonClonesUnprod=length(intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE))
      unionUnprod=length(union(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE))
      
      if(unionUnprod==0){
        unionUnprod = 1
      }
      
      if(unionProd==0){
        unionProd = 1
      }
      
      overlapValuesProd=c(overlapValuesProd,as.numeric(commonClonesProd))
      overlapValuesUnprod=c(overlapValuesUnprod,as.numeric(commonClonesUnprod))
      
      overlapValuesProdJI=c(overlapValuesProdJI,commonClonesProd/unionProd)
      overlapValuesUnprodJI = c(overlapValuesUnprodJI,commonClonesUnprod/unionUnprod)
      
      
      # Jaccard abundance-based similarity index (UV/(U+V-UV)). U is total relative abundance in of shared sample 1, U total relative of shared in sample 2
      
      commonClonesProd_U_data= topclones_sam1Prod[topclones_sam1Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonesProd_U = sum(commonClonesProd_U_data$COUNT)/sum(topclones_sam1Prod$COUNT)
      
      commonClonesProd_V_data= topclones_sam2Prod[topclones_sam2Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonesProd_V = sum(commonClonesProd_V_data$COUNT)/sum(topclones_sam2Prod$COUNT)
      
      uv=commonClonesProd_U*commonClonesProd_V
      uplusv=commonClonesProd_U+commonClonesProd_V
      
      AbundanceBasedJIProd = uv/(uplusv-uv)
      
      overlapValuesProdJI_abundancebased=c(overlapValuesProdJI_abundancebased,as.numeric(AbundanceBasedJIProd))
      
      # for unproductive sequences
      
      commonClonesUnProd_U_data= topclones_sam1Unprod[topclones_sam1Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonesUnProd_U = sum(commonClonesUnProd_U_data$COUNT)/sum(topclones_sam1Unprod$COUNT)
      
      commonClonesUnProd_V_data= topclones_sam2Unprod[topclones_sam2Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonesUnProd_V = sum(commonClonesUnProd_V_data$COUNT)/sum(topclones_sam2Unprod$COUNT)
      
      uv_UnProd=commonClonesUnProd_U * commonClonesUnProd_V
      uplusv_UnProd=commonClonesUnProd_U + commonClonesUnProd_V
      
      AbundanceBasedJIUnProd = uv_UnProd/(uplusv_UnProd-uv_UnProd)
      
      overlapValuesUnprodJI_abundancebased=c(overlapValuesUnprodJI_abundancebased,as.numeric(AbundanceBasedJIUnProd))
      
      
      
      # Ninsertions  in common sequences
      
      commonClonotypes1= topclones_sam1Prod[topclones_sam1Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonotypes2= topclones_sam2Prod[topclones_sam2Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      
      commonClonotypes1Unprod= topclones_sam1Unprod[topclones_sam1Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonotypes2Unprod= topclones_sam2Unprod[topclones_sam2Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      
      
      NinsertionsP1= mean(commonClonotypes1$N2INSERTION + commonClonotypes1$N1INSERTION)
      NinsertionsP2 = mean(commonClonotypes2$N2INSERTION + commonClonotypes2$N1INSERTION)
      
      NinsertionsUP1= mean(commonClonotypes1Unprod$N2INSERTION + commonClonotypes1Unprod$N1INSERTION)
      NinsertionsUP2 = mean(commonClonotypes2Unprod$N2INSERTION + commonClonotypes2Unprod$N1INSERTION)
      
      
      NinsertionProd1=c(NinsertionProd1,NinsertionsP1)
      NinsertionProd2= c(NinsertionProd2,NinsertionsP2)
      
      NinsertionUnprod1=c(NinsertionUnprod1,NinsertionsUP1)
      NinsertionUnprod2=c(NinsertionUnprod2,NinsertionsUP2)
      
    }
    
    #totalNumberOfProdClonotypes= nrow(topclones_sam1Prod) + nrow(topclones_sam2Prod)
    #totalNumberOfUnprodClonotypes= nrow(topclones_sam1Unprod) + nrow(topclones_sam2Unprod)
    
    
    if(nrow(toadd) > 0){
      newrow=data.frame(overlapValuesProd,overlapValuesUnprod,overlapValuesProdJI,overlapValuesUnprodJI,overlapValuesProdJI_abundancebased,overlapValuesUnprodJI_abundancebased,NinsertionProd1,NinsertionProd2,NinsertionUnprod1,NinsertionUnprod2)
      colnames(newrow) <- c("OverlapCountProductive","OverlapCountUnProductive","OverlapProd_JI","OverlapUnProd_JI","AbundanceBased_JI","AbundanceBased_JI","NinsertionProdSample1","NinsertionProdSample2","NinsertionUnProdSample1","NinsertionUnProdSample2")
      
      toadd=rbind(toadd,newrow)
    }else{
      toadd=data.frame(overlapValuesProd,overlapValuesUnprod,overlapValuesProdJI,overlapValuesUnprodJI,overlapValuesProdJI_abundancebased,overlapValuesUnprodJI_abundancebased,NinsertionProd1,NinsertionProd2,NinsertionUnprod1,NinsertionUnprod2)
      colnames(toadd) <- c("OverlapCountProductive","OverlapCountUnProductive","OverlapProd_JI","OverlapUnProd_JI","AbundanceBased_JI","AbundanceBased_JI","NinsertionProdSample1","NinsertionProdSample2","NinsertionUnProdSample1","NinsertionUnProdSample2")  
    }
    
  }
  
  
  
  
  print(nrow(toadd))
  resultTable=data.frame(m,toadd)        
  
  # write pdf dendrogram and heatmaps
  
  # select the sample columns 1 and 2, and the JI column 5
  forProdselected = resultTable[,c(1,2,5)]
  forProdselected = forProdselected[order(forProdselected$Sample1),]
  
  l = length(unique(forProdselected$Sample1))
  mat = matrix(as.numeric(forProdselected[,3]),nrow=l,ncol=l,byrow=T)
  # replace NaN values with 0
  mat[is.nan(mat)] = 0
  colnames(mat) <- unique(forProdselected$Sample1)
  rownames(mat) <- unique(forProdselected$Sample1)
  
  
  mat = signif(mat,digits=2)
  hc = hclust(as.dist(log10(1-mat)))
  
  flname = paste("ProdJI_dendrogram.pdf")
  pdf(file=flname, useDingbats = FALSE)
  
  # very simple dendrogram
  plot(hc)
  
  dev.off()
  
  # for unprod
  # select the sample columns 1 and 2, and the JI column 5
  forUnprodselected = resultTable[,c(1,2,6)]
  forUnprodselected = forUnprodselected[order(forUnprodselected$Sample1),]
  
  l = length(unique(forUnprodselected$Sample1))
  mat = matrix(as.numeric(forUnprodselected[,3]),nrow=l,ncol=l,byrow=T)
  # replace NaN values with 0
  mat[is.nan(mat)] = 0
  colnames(mat) <- unique(forUnprodselected$Sample1)
  rownames(mat) <- unique(forUnprodselected$Sample1)
  
  
  mat = signif(mat,digits=2)
  hc = hclust(as.dist(log10(1-mat)))
  
  
  flname = paste("UnprodJI_dendrogram.pdf")
  pdf(file=flname, useDingbats = FALSE)
  
  # very simple dendrogram
  plot(hc)
  
  dev.off()
  
  write.table(resultTable,file="Pairwise Overlap Counts.txt",row.names = T,sep = "\t")
  return(resultTable)
}


# JI calculation for 'has stop' unproductive Nucleotides

overLappingClonotypesHasStop <- function(reps,cutoff=20,cutoffby="perc"){
  ### Returns a datamatrix of top topnumber clonotopes from each sample plus: overlap count or Jaccard index, for prod and non productive sequences, N insertion size
  
  ### Total read counts for the common clones of both samples are given for the productive/hasStop versions the JI analysis. This is implemented when the analysis is 
  ### done for by "perc". 
  
  
  # setting work parameters
  cutoffby = toupper(cutoffby)
  
  
  m <- as.data.frame(expand.grid(reps,reps))
  colnames(m)  <- c("Sample1","Sample2")
  
  toadd= data.frame()
  
  
  
  for(r in 1:nrow(m)){
    
    overlapValuesProd=c()
    overlapValuesUnprod=c()
    
    overlapValuesTotalSam1Prod=c()
    overlapValuesTotalSam2Prod=c()
    
    overlapValuesTotalSam1Unprod=c()
    overlapValuesTotalSam2Unprod=c()
    
    overlapValuesProdJI=c()
    overlapValuesUnprodJI=c()
    
    overlapValuesProdJI_abundancebased=c()
    overlapValuesUnprodJI_abundancebased=c()
    
    NinsertionProd1=c()
    NinsertionUnprod1=c()
    
    NinsertionProd2=c()
    NinsertionUnprod2=c()
    
    
    print(r)
    sam1=m[r,1]
    sam2=m[r,2]
    
    #print(sam1)
    #print(sam2)
    
    sam1 <- get(as.character(sam1))
    sam2 <- get(as.character(sam2))
    
    
    sam1Prod <- getProductive(sam1)
    sam1Unprod <- getHasStop(sam1)
    
    sam2Prod <- getProductive(sam2)
    sam2Unprod <- getHasStop(sam2)
    
    
    if(cutoffby=="PERC"){
      topclones_sam1Prod = getAbundantClonotypesbyperc(sam1Prod,cutoff)
      topclones_sam1Unprod = getAbundantClonotypesbyperc(sam1Unprod,cutoff)
      
      topclones_sam2Prod = getAbundantClonotypesbyperc(sam2Prod,cutoff)
      topclones_sam2Unprod = getAbundantClonotypesbyperc(sam2Unprod,cutoff)
      
      # commons
      commonClonesProd=length(intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE)) 
      unionProd=length(union(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE)) 
      
      # commonality checked at the Nt level for non productive sequences, no AA given in the original data
      commonClonesUnprod=length(intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE))
      unionUnprod=length(union(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE))
      
      
      
      
      if(unionUnprod==0){
        unionUnprod = 1
      }
      
      if(unionProd==0){
        unionProd = 1
      }
      
      overlapValuesProd=c(overlapValuesProd,as.numeric(commonClonesProd))
      overlapValuesUnprod=c(overlapValuesUnprod,as.numeric(commonClonesUnprod))
      
      
      
      overlapValuesProdJI=c(overlapValuesProdJI,commonClonesProd/unionProd)
      overlapValuesUnprodJI = c(overlapValuesUnprodJI,commonClonesUnprod/unionUnprod)
      
      # Jaccard abundance-based similarity index (UV/(U+V-UV)). U is total relative abundance in of shared sample 1, U total relative of shared in sample 2
      
      commonClonesProd_U_data= topclones_sam1Prod[topclones_sam1Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonesProd_U = sum(commonClonesProd_U_data$COUNT)/sum(topclones_sam1Prod$COUNT)
      
      commonClonesProd_U_abundance = sum(commonClonesProd_U_data$COUNT)
      
      commonClonesProd_V_data= topclones_sam2Prod[topclones_sam2Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonesProd_V = sum(commonClonesProd_V_data$COUNT)/sum(topclones_sam2Prod$COUNT)
      
      commonClonesProd_V_abundance = sum(commonClonesProd_V_data$COUNT)
      
      overlapValuesTotalSam1Prod=c(overlapValuesTotalSam1Prod,as.numeric(commonClonesProd_U_abundance))
      overlapValuesTotalSam2Prod=c(overlapValuesTotalSam2Prod,as.numeric(commonClonesProd_V_abundance))
      
      uv=commonClonesProd_U*commonClonesProd_V
      uplusv=commonClonesProd_U+commonClonesProd_V
      
      AbundanceBasedJIProd = uv/(uplusv-uv)
      
      overlapValuesProdJI_abundancebased=c(overlapValuesProdJI_abundancebased,as.numeric(AbundanceBasedJIProd))
      
      # for unproductive sequences
      
      commonClonesUnProd_U_data= topclones_sam1Unprod[topclones_sam1Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonesUnProd_U = sum(commonClonesUnProd_U_data$COUNT)/sum(topclones_sam1Unprod$COUNT)
      
      commonClonesUnProd_U_abundance = sum(commonClonesUnProd_U_data$COUNT)
      
      commonClonesUnProd_V_data= topclones_sam2Unprod[topclones_sam2Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonesUnProd_V = sum(commonClonesUnProd_V_data$COUNT)/sum(topclones_sam2Unprod$COUNT)
      
      commonClonesUnProd_V_abundance = sum(commonClonesUnProd_V_data$COUNT)
      
      
      overlapValuesTotalSam1Unprod=c(overlapValuesTotalSam1Unprod,as.numeric(commonClonesUnProd_U_abundance))
      overlapValuesTotalSam2Unprod=c(overlapValuesTotalSam2Unprod,as.numeric(commonClonesUnProd_V_abundance))
      
      
      uv_UnProd=commonClonesUnProd_U * commonClonesUnProd_V
      uplusv_UnProd=commonClonesUnProd_U + commonClonesUnProd_V
      
      AbundanceBasedJIUnProd = uv_UnProd/(uplusv_UnProd-uv_UnProd)
      
      overlapValuesUnprodJI_abundancebased=c(overlapValuesUnprodJI_abundancebased,as.numeric(AbundanceBasedJIUnProd))
      
      
      
      # Ninsertions  in common sequences
      
      commonClonotypes1= topclones_sam1Prod[topclones_sam1Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonotypes2= topclones_sam2Prod[topclones_sam2Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      
      commonClonotypes1Unprod= topclones_sam1Unprod[topclones_sam1Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonotypes2Unprod= topclones_sam2Unprod[topclones_sam2Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      
      
      NinsertionsP1= mean(commonClonotypes1$N2INSERTION + commonClonotypes1$N1INSERTION)
      NinsertionsP2 = mean(commonClonotypes2$N2INSERTION + commonClonotypes2$N1INSERTION)
      
      NinsertionsUP1= mean(commonClonotypes1Unprod$N2INSERTION + commonClonotypes1Unprod$N1INSERTION)
      NinsertionsUP2 = mean(commonClonotypes2Unprod$N2INSERTION + commonClonotypes2Unprod$N1INSERTION)
      
      
      NinsertionProd1=c(NinsertionProd1,NinsertionsP1)
      NinsertionProd2= c(NinsertionProd2,NinsertionsP2)
      
      NinsertionUnprod1=c(NinsertionUnprod1,NinsertionsUP1)
      NinsertionUnprod2=c(NinsertionUnprod2,NinsertionsUP2)
      
      
    }else{
      topclones_sam1Prod = getAbundantClonotypesbynumber(sam1Prod,cutoff)
      topclones_sam1Unprod = getAbundantClonotypesbynumber(sam1Unprod,cutoff)
      
      topclones_sam2Prod = getAbundantClonotypesbynumber(sam2Prod,cutoff)
      topclones_sam2Unprod = getAbundantClonotypesbynumber(sam2Unprod,cutoff)
      
      
      # commons
      # commons
      commonClonesProd=length(intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE))
      unionProd=length(union(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE))
      
      # commonality checked at the Nt level for non productive sequences, no AA given in the original data
      commonClonesUnprod=length(intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE))
      unionUnprod=length(union(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE))
      
      if(unionUnprod==0){
        unionUnprod = 1
      }
      
      if(unionProd==0){
        unionProd = 1
      }
      
      overlapValuesProd=c(overlapValuesProd,as.numeric(commonClonesProd))
      overlapValuesUnprod=c(overlapValuesUnprod,as.numeric(commonClonesUnprod))
      
      overlapValuesProdJI=c(overlapValuesProdJI,commonClonesProd/unionProd)
      overlapValuesUnprodJI = c(overlapValuesUnprodJI,commonClonesUnprod/unionUnprod)
      
      
      # Jaccard abundance-based similarity index (UV/(U+V-UV)). U is total relative abundance in of shared sample 1, U total relative of shared in sample 2
      
      commonClonesProd_U_data= topclones_sam1Prod[topclones_sam1Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonesProd_U = sum(commonClonesProd_U_data$COUNT)/sum(topclones_sam1Prod$COUNT)
      
      commonClonesProd_V_data= topclones_sam2Prod[topclones_sam2Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonesProd_V = sum(commonClonesProd_V_data$COUNT)/sum(topclones_sam2Prod$COUNT)
      
      uv=commonClonesProd_U*commonClonesProd_V
      uplusv=commonClonesProd_U+commonClonesProd_V
      
      AbundanceBasedJIProd = uv/(uplusv-uv)
      
      overlapValuesProdJI_abundancebased=c(overlapValuesProdJI_abundancebased,as.numeric(AbundanceBasedJIProd))
      
      # for unproductive sequences
      
      commonClonesUnProd_U_data= topclones_sam1Unprod[topclones_sam1Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonesUnProd_U = sum(commonClonesUnProd_U_data$COUNT)/sum(topclones_sam1Unprod$COUNT)
      
      commonClonesUnProd_V_data= topclones_sam2Unprod[topclones_sam2Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonesUnProd_V = sum(commonClonesUnProd_V_data$COUNT)/sum(topclones_sam2Unprod$COUNT)
      
      uv_UnProd=commonClonesUnProd_U * commonClonesUnProd_V
      uplusv_UnProd=commonClonesUnProd_U + commonClonesUnProd_V
      
      AbundanceBasedJIUnProd = uv_UnProd/(uplusv_UnProd-uv_UnProd)
      
      overlapValuesUnprodJI_abundancebased=c(overlapValuesUnprodJI_abundancebased,as.numeric(AbundanceBasedJIUnProd))
      
      
      
      # Ninsertions  in common sequences
      
      commonClonotypes1= topclones_sam1Prod[topclones_sam1Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonotypes2= topclones_sam2Prod[topclones_sam2Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      
      commonClonotypes1Unprod= topclones_sam1Unprod[topclones_sam1Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonotypes2Unprod= topclones_sam2Unprod[topclones_sam2Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      
      
      NinsertionsP1= mean(commonClonotypes1$N2INSERTION + commonClonotypes1$N1INSERTION)
      NinsertionsP2 = mean(commonClonotypes2$N2INSERTION + commonClonotypes2$N1INSERTION)
      
      NinsertionsUP1= mean(commonClonotypes1Unprod$N2INSERTION + commonClonotypes1Unprod$N1INSERTION)
      NinsertionsUP2 = mean(commonClonotypes2Unprod$N2INSERTION + commonClonotypes2Unprod$N1INSERTION)
      
      
      NinsertionProd1=c(NinsertionProd1,NinsertionsP1)
      NinsertionProd2= c(NinsertionProd2,NinsertionsP2)
      
      NinsertionUnprod1=c(NinsertionUnprod1,NinsertionsUP1)
      NinsertionUnprod2=c(NinsertionUnprod2,NinsertionsUP2)
      
    }
    
    #totalNumberOfProdClonotypes= nrow(topclones_sam1Prod) + nrow(topclones_sam2Prod)
    #totalNumberOfUnprodClonotypes= nrow(topclones_sam1Unprod) + nrow(topclones_sam2Unprod)
    
    
    if(nrow(toadd) > 0){
      newrow=data.frame(overlapValuesProd,overlapValuesTotalSam1Prod,overlapValuesTotalSam2Prod,overlapValuesUnprod,overlapValuesTotalSam1Unprod,overlapValuesTotalSam2Unprod,overlapValuesProdJI,overlapValuesUnprodJI,overlapValuesProdJI_abundancebased,overlapValuesUnprodJI_abundancebased,NinsertionProd1,NinsertionProd2,NinsertionUnprod1,NinsertionUnprod2)
      colnames(newrow) <- c("OverlapCountProductive","OverlapCountTotalProductiveSam1","OverlapCountTotalProductiveSam2","OverlapCountHasStop","OverlapCountTotalHasStopSam1","OverlapCountTotalHasStopSam2","OverlapProd_JI","OverlapHasStop_JI","AbundanceBasedProd_JI","AbundanceBasedHasStop_JI","NinsertionProdSample1","NinsertionProdSample2","NinsertionHasStopSample1","NinsertionHasStopSample2")
      
      toadd=rbind(toadd,newrow)
    }else{
      toadd=data.frame(overlapValuesProd,overlapValuesTotalSam1Prod,overlapValuesTotalSam2Prod,overlapValuesUnprod,overlapValuesTotalSam1Unprod,overlapValuesTotalSam2Unprod,overlapValuesProdJI,overlapValuesUnprodJI,overlapValuesProdJI_abundancebased,overlapValuesUnprodJI_abundancebased,NinsertionProd1,NinsertionProd2,NinsertionUnprod1,NinsertionUnprod2)
      colnames(toadd) <- c("OverlapCountProductive","OverlapCountTotalProductiveSam1","OverlapCountTotalProductiveSam2","OverlapCountHasStop","OverlapCountTotalHasStopSam1","OverlapCountTotalHasStopSam2","OverlapProd_JI","OverlapHasStop_JI","AbundanceBasedProd_JI","AbundanceBasedHasStop_JI","NinsertionProdSample1","NinsertionProdSample2","NinsertionHasStopSample1","NinsertionHasStopSample2")
    }
    
  }
  
  
  
  
  print(nrow(toadd))
  resultTable=data.frame(m,toadd)        
  
  
  write.table(resultTable,file="Pairwise Overlap Counts.txt",row.names = T,sep = "\t")
  return(resultTable)
  
  
}


# Amico acid based Jaccard index calculation

overLappingClonotypesAA<- function(reps,cutoff=20,cutoffby="perc"){
  ### Returns a datamatrix of top topnumber clonotopes (Amino acid level) from each sample plus: overlap count or Jaccard index, for prod and non productive sequences, N insertion size
  
  
  # setting work parameters
  cutoffby = toupper(cutoffby)
  
  
  m <- as.data.frame(expand.grid(reps,reps))
  colnames(m)  <- c("Sample1","Sample2")
  
  toadd= data.frame()
  
  
  
  for(r in 1:nrow(m)){
    
    overlapValuesProd=c()
    overlapValuesUnprod=c()
    overlapValuesProdJI=c()
    overlapValuesUnprodJI=c()
    
    overlapValuesProdJI_abundancebased=c()
    overlapValuesUnprodJI_abundancebased=c()
    
    NinsertionProd1=c()
    NinsertionUnprod1=c()
    
    NinsertionProd2=c()
    NinsertionUnprod2=c()
    
    
    print(r)
    sam1=m[r,1]
    sam2=m[r,2]
    
    #print(sam1)
    #print(sam2)
    
    sam1 <- get(as.character(sam1))
    sam2 <- get(as.character(sam2))
    
    
    sam1Prod <- getProductive(sam1)
    sam1Unprod <- getUnproductive(sam1)
    
    sam2Prod <- getProductive(sam2)
    sam2Unprod <- getUnproductive(sam2)
    
    
    if(cutoffby=="PERC"){
      topclones_sam1Prod = getAbundantClonotypesbyperc(sam1Prod,cutoff)
      topclones_sam1Unprod = getAbundantClonotypesbyperc(sam1Unprod,cutoff)
      
      topclones_sam2Prod = getAbundantClonotypesbyperc(sam2Prod,cutoff)
      topclones_sam2Unprod = getAbundantClonotypesbyperc(sam2Unprod,cutoff)
      
      # commons
      commonClonesProd=length(intersect(topclones_sam1Prod$AMINOACID,topclones_sam2Prod$AMINOACID)) 
      unionProd=length(union(topclones_sam1Prod$AMINOACID,topclones_sam2Prod$AMINOACID)) 
      
      # commonality checked at the Nt level for non productive sequences, no AA given in the original data
      commonClonesUnprod=length(intersect(topclones_sam1Unprod$AMINOACID,topclones_sam2Unprod$AMINOACID))
      unionUnprod=length(union(topclones_sam1Unprod$AMINOACID,topclones_sam2Unprod$AMINOACID))
      
      if(unionUnprod==0){
        unionUnprod = 1
      }
      
      if(unionProd==0){
        unionProd = 1
      }
      
      overlapValuesProd=c(overlapValuesProd,as.numeric(commonClonesProd))
      overlapValuesUnprod=c(overlapValuesUnprod,as.numeric(commonClonesUnprod))
      
      overlapValuesProdJI=c(overlapValuesProdJI,commonClonesProd/unionProd)
      overlapValuesUnprodJI = c(overlapValuesUnprodJI,commonClonesUnprod/unionUnprod)
      
      # Jaccard abundance-based similarity index (UV/(U+V-UV)). U is total relative abundance in of shared sample 1, U total relative of shared in sample 2
      
      commonClonesProd_U_data= topclones_sam1Prod[topclones_sam1Prod$AMINOACID %in% intersect(topclones_sam1Prod$AMINOACID,topclones_sam2Prod$AMINOACID),]
      commonClonesProd_U = sum(commonClonesProd_U_data$COUNT)/sum(topclones_sam1Prod$COUNT)
      
      commonClonesProd_V_data= topclones_sam2Prod[topclones_sam2Prod$AMINOACID %in% intersect(topclones_sam1Prod$AMINOACID,topclones_sam2Prod$AMINOACID),]
      commonClonesProd_V = sum(commonClonesProd_V_data$COUNT)/sum(topclones_sam2Prod$COUNT)
      
      uv=commonClonesProd_U*commonClonesProd_V
      uplusv=commonClonesProd_U+commonClonesProd_V
      
      AbundanceBasedJIProd = uv/(uplusv-uv)
      
      overlapValuesProdJI_abundancebased=c(overlapValuesProdJI_abundancebased,as.numeric(AbundanceBasedJIProd))
      
      # for unproductive sequences
      
      commonClonesUnProd_U_data= topclones_sam1Unprod[topclones_sam1Unprod$AMINOACID %in% intersect(topclones_sam1Unprod$AMINOACID,topclones_sam2Unprod$AMINOACID),]
      commonClonesUnProd_U = sum(commonClonesUnProd_U_data$COUNT)/sum(topclones_sam1Unprod$COUNT)
      
      commonClonesUnProd_V_data= topclones_sam2Unprod[topclones_sam2Unprod$AMINOACID %in% intersect(topclones_sam1Unprod$AMINOACID,topclones_sam2Unprod$AMINOACID),]
      commonClonesUnProd_V = sum(commonClonesUnProd_V_data$COUNT)/sum(topclones_sam2Unprod$COUNT)
      
      uv_UnProd=commonClonesUnProd_U * commonClonesUnProd_V
      uplusv_UnProd=commonClonesUnProd_U + commonClonesUnProd_V
      
      AbundanceBasedJIUnProd = uv_UnProd/(uplusv_UnProd-uv_UnProd)
      
      overlapValuesUnprodJI_abundancebased=c(overlapValuesUnprodJI_abundancebased,as.numeric(AbundanceBasedJIUnProd))
      
      
      
      # Ninsertions  in common sequences
      
      commonClonotypes1= topclones_sam1Prod[topclones_sam1Prod$AMINOACID %in% intersect(topclones_sam1Prod$AMINOACID,topclones_sam2Prod$AMINOACID),]
      commonClonotypes2= topclones_sam2Prod[topclones_sam2Prod$AMINOACID %in% intersect(topclones_sam1Prod$AMINOACID,topclones_sam2Prod$AMINOACID),]
      
      commonClonotypes1Unprod= topclones_sam1Unprod[topclones_sam1Unprod$AMINOACID %in% intersect(topclones_sam1Unprod$AMINOACID,topclones_sam2Unprod$AMINOACID),]
      commonClonotypes2Unprod= topclones_sam2Unprod[topclones_sam2Unprod$AMINOACID %in% intersect(topclones_sam1Unprod$AMINOACID,topclones_sam2Unprod$AMINOACID),]
      
      
      NinsertionsP1= mean(commonClonotypes1$N2INSERTION + commonClonotypes1$N1INSERTION)
      NinsertionsP2 = mean(commonClonotypes2$N2INSERTION + commonClonotypes2$N1INSERTION)
      
      NinsertionsUP1= mean(commonClonotypes1Unprod$N2INSERTION + commonClonotypes1Unprod$N1INSERTION)
      NinsertionsUP2 = mean(commonClonotypes2Unprod$N2INSERTION + commonClonotypes2Unprod$N1INSERTION)
      
      
      NinsertionProd1=c(NinsertionProd1,NinsertionsP1)
      NinsertionProd2= c(NinsertionProd2,NinsertionsP2)
      
      NinsertionUnprod1=c(NinsertionUnprod1,NinsertionsUP1)
      NinsertionUnprod2=c(NinsertionUnprod2,NinsertionsUP2)
      
      
    }else{
      topclones_sam1Prod = getAbundantClonotypesbynumber(sam1Prod,cutoff)
      topclones_sam1Unprod = getAbundantClonotypesbynumber(sam1Unprod,cutoff)
      
      topclones_sam2Prod = getAbundantClonotypesbynumber(sam2Prod,cutoff)
      topclones_sam2Unprod = getAbundantClonotypesbynumber(sam2Unprod,cutoff)
      
      
      # commons
      # commons
      commonClonesProd=length(intersect(topclones_sam1Prod$AMINOACID,topclones_sam2Prod$AMINOACID))
      unionProd=length(union(topclones_sam1Prod$AMINOACID,topclones_sam2Prod$AMINOACID))
      
      # commonality checked at the Nt level for non productive sequences, no AA given in the original data
      commonClonesUnprod=length(intersect(topclones_sam1Unprod$AMINOACID,topclones_sam2Unprod$AMINOACID))
      unionUnprod=length(union(topclones_sam1Unprod$AMINOACID,topclones_sam2Unprod$AMINOACID))
      
      if(unionUnprod==0){
        unionUnprod = 1
      }
      
      if(unionProd==0){
        unionProd = 1
      }
      
      overlapValuesProd=c(overlapValuesProd,as.numeric(commonClonesProd))
      overlapValuesUnprod=c(overlapValuesUnprod,as.numeric(commonClonesUnprod))
      
      overlapValuesProdJI=c(overlapValuesProdJI,commonClonesProd/unionProd)
      overlapValuesUnprodJI = c(overlapValuesUnprodJI,commonClonesUnprod/unionUnprod)
      
      
      # Jaccard abundance-based similarity index (UV/(U+V-UV)). U is total relative abundance in of shared sample 1, U total relative of shared in sample 2
      
      commonClonesProd_U_data= topclones_sam1Prod[topclones_sam1Prod$AMINOACID %in% intersect(topclones_sam1Prod$AMINOACID,topclones_sam2Prod$AMINOACID),]
      commonClonesProd_U = sum(commonClonesProd_U_data$COUNT)/sum(topclones_sam1Prod$COUNT)
      
      commonClonesProd_V_data= topclones_sam2Prod[topclones_sam2Prod$AMINOACID %in% intersect(topclones_sam1Prod$AMINOACID,topclones_sam2Prod$AMINOACID),]
      commonClonesProd_V = sum(commonClonesProd_V_data$COUNT)/sum(topclones_sam2Prod$COUNT)
      
      uv=commonClonesProd_U*commonClonesProd_V
      uplusv=commonClonesProd_U+commonClonesProd_V
      
      AbundanceBasedJIProd = uv/(uplusv-uv)
      
      overlapValuesProdJI_abundancebased=c(overlapValuesProdJI_abundancebased,as.numeric(AbundanceBasedJIProd))
      
      # for unproductive sequences
      
      commonClonesUnProd_U_data= topclones_sam1Unprod[topclones_sam1Unprod$AMINOACID %in% intersect(topclones_sam1Unprod$AMINOACID,topclones_sam2Unprod$AMINOACID),]
      commonClonesUnProd_U = sum(commonClonesUnProd_U_data$COUNT)/sum(topclones_sam1Unprod$COUNT)
      
      commonClonesUnProd_V_data= topclones_sam2Unprod[topclones_sam2Unprod$AMINOACID %in% intersect(topclones_sam1Unprod$AMINOACID,topclones_sam2Unprod$AMINOACID),]
      commonClonesUnProd_V = sum(commonClonesUnProd_V_data$COUNT)/sum(topclones_sam2Unprod$COUNT)
      
      uv_UnProd=commonClonesUnProd_U * commonClonesUnProd_V
      uplusv_UnProd=commonClonesUnProd_U + commonClonesUnProd_V
      
      AbundanceBasedJIUnProd = uv_UnProd/(uplusv_UnProd-uv_UnProd)
      
      overlapValuesUnprodJI_abundancebased=c(overlapValuesUnprodJI_abundancebased,as.numeric(AbundanceBasedJIUnProd))
      
      
      
      # Ninsertions  in common sequences
      
      commonClonotypes1= topclones_sam1Prod[topclones_sam1Prod$AMINOACID %in% intersect(topclones_sam1Prod$AMINOACID,topclones_sam2Prod$AMINOACID),]
      commonClonotypes2= topclones_sam2Prod[topclones_sam2Prod$AMINOACID %in% intersect(topclones_sam1Prod$AMINOACID,topclones_sam2Prod$AMINOACID),]
      
      commonClonotypes1Unprod= topclones_sam1Unprod[topclones_sam1Unprod$AMINOACID %in% intersect(topclones_sam1Unprod$AMINOACID,topclones_sam2Unprod$AMINOACID),]
      commonClonotypes2Unprod= topclones_sam2Unprod[topclones_sam2Unprod$AMINOACID %in% intersect(topclones_sam1Unprod$AMINOACID,topclones_sam2Unprod$AMINOACID),]
      
      
      NinsertionsP1= mean(commonClonotypes1$N2INSERTION + commonClonotypes1$N1INSERTION)
      NinsertionsP2 = mean(commonClonotypes2$N2INSERTION + commonClonotypes2$N1INSERTION)
      
      NinsertionsUP1= mean(commonClonotypes1Unprod$N2INSERTION + commonClonotypes1Unprod$N1INSERTION)
      NinsertionsUP2 = mean(commonClonotypes2Unprod$N2INSERTION + commonClonotypes2Unprod$N1INSERTION)
      
      
      NinsertionProd1=c(NinsertionProd1,NinsertionsP1)
      NinsertionProd2= c(NinsertionProd2,NinsertionsP2)
      
      NinsertionUnprod1=c(NinsertionUnprod1,NinsertionsUP1)
      NinsertionUnprod2=c(NinsertionUnprod2,NinsertionsUP2)
      
    }
    
    #totalNumberOfProdClonotypes= nrow(topclones_sam1Prod) + nrow(topclones_sam2Prod)
    #totalNumberOfUnprodClonotypes= nrow(topclones_sam1Unprod) + nrow(topclones_sam2Unprod)
    
    
    if(nrow(toadd) > 0){
      newrow=data.frame(overlapValuesProd,overlapValuesUnprod,overlapValuesProdJI,overlapValuesUnprodJI,overlapValuesProdJI_abundancebased,overlapValuesUnprodJI_abundancebased,NinsertionProd1,NinsertionProd2,NinsertionUnprod1,NinsertionUnprod2)
      colnames(newrow) <- c("OverlapCountProductive","OverlapCountUnProductive","OverlapProd_JI","OverlapUnProd_JI","AbundanceBased_JI","AbundanceBased_JI","NinsertionProdSample1","NinsertionProdSample2","NinsertionUnProdSample1","NinsertionUnProdSample2")
      
      toadd=rbind(toadd,newrow)
    }else{
      toadd=data.frame(overlapValuesProd,overlapValuesUnprod,overlapValuesProdJI,overlapValuesUnprodJI,overlapValuesProdJI_abundancebased,overlapValuesUnprodJI_abundancebased,NinsertionProd1,NinsertionProd2,NinsertionUnprod1,NinsertionUnprod2)
      colnames(toadd) <- c("OverlapCountProductive","OverlapCountUnProductive","OverlapProd_JI","OverlapUnProd_JI","AbundanceBased_JI","AbundanceBased_JI","NinsertionProdSample1","NinsertionProdSample2","NinsertionUnProdSample1","NinsertionUnProdSample2")  
    }
    
  }
  
  
  
  
  print(nrow(toadd))
  resultTable=data.frame(m,toadd)        
  
  # write pdf dendrogram and heatmaps
  
  # select the sample columns 1 and 2, and the JI column 5
  forProdselected = resultTable[,c(1,2,5)]
  forProdselected = forProdselected[order(forProdselected$Sample1),]
  
  l = length(unique(forProdselected$Sample1))
  mat = matrix(as.numeric(forProdselected[,3]),nrow=l,ncol=l,byrow=T)
  # replace NaN values with 0
  mat[is.nan(mat)] = 0
  colnames(mat) <- unique(forProdselected$Sample1)
  rownames(mat) <- unique(forProdselected$Sample1)
  
  
  mat = signif(mat,digits=2)
  hc = hclust(as.dist(log10(1-mat)))
  
  flname = paste("ProdJI_dendrogram.pdf")
  pdf(file=flname, useDingbats = FALSE)
  
  # very simple dendrogram
  plot(hc)
  
  dev.off()
  
  # for unprod
  # select the sample columns 1 and 2, and the JI column 5
  forUnprodselected = resultTable[,c(1,2,6)]
  forUnprodselected = forUnprodselected[order(forUnprodselected$Sample1),]
  
  l = length(unique(forUnprodselected$Sample1))
  mat = matrix(as.numeric(forUnprodselected[,3]),nrow=l,ncol=l,byrow=T)
  # replace NaN values with 0
  mat[is.nan(mat)] = 0
  colnames(mat) <- unique(forUnprodselected$Sample1)
  rownames(mat) <- unique(forUnprodselected$Sample1)
  
  
  mat = signif(mat,digits=2)
  hc = hclust(as.dist(log10(1-mat)))
  
  
  flname = paste("UnprodJI_dendrogram.pdf")
  pdf(file=flname, useDingbats = FALSE)
  
  # very simple dendrogram
  plot(hc)
  
  dev.off()
  
  write.table(resultTable,file="Pairwise AA Overlap Counts.txt",row.names = T,sep = "\t")
  return(resultTable)
}


# Overlap sequence calculation using various parameters

overLappingClonotypesByChoice <- function(reps,cutoff=100,cutoffby="perc",seqType="AA",unProdStatus="unprod"){
  ### Returns a datamatrix of top topnumber clonotopes from each sample plus: overlap count or Jaccard index, for prod and non productive sequences, N insertion size
  
  ### Total read counts for the common clones of both samples are given for the productive/ chosen unproductive types.
  ### cutoffby : Perc (percentage of total repertoire as indicated by cutoff) or noperc (select top clones as many as indicated by cutoff)
  ### seqType:  can be by Nt (nucleotide) or AA (AminoAcid)
  ### unProdStatus : which unproductive clones to consider Out or Stop or Unprod (union of Out and Stop)
  
  
  # setting work parameters
  #---------------------------------------------------------
  
  #reps <- samNames
  
  cutoffby = toupper(cutoffby)
  
  m <- as.data.frame(combinations(n = length(reps), r = 2, v = reps, repeats.allowed = TRUE))
  colnames(m)  <- c("Sample1","Sample2")
  
  
  
  toadd= data.frame()
  
  #change parameter values to uppercase to accept Nt, nt etc for example
  unProdStatus <- toupper(unProdStatus)
  seqType <- toupper(seqType)
  
  if(seqType=="AA"){
    seqType <- "AMINOACID"
  }else{
    seqType <- "NUCLEOTIDE"
  }
  
  #---------------------------------------------------------
  
  
  for(r in 1:nrow(m)){
    
    overlapValuesProd=c()
    overlapValuesUnprod=c()
    
    overlapValuesTotalSam1Prod=c()
    overlapValuesTotalSam2Prod=c()
    
    overlapValuesTotalSam1Unprod=c()
    overlapValuesTotalSam2Unprod=c()
    
    overlapValuesProdJI=c()
    overlapValuesUnprodJI=c()
    
    # the following two hold abundance based JI using : http://chao.stat.nthu.edu.tw/wordpress/paper/2006_Biometrics_62_P361.pdf
    
    overlapValuesProdJI_abundancebased=c()
    overlapValuesUnprodJI_abundancebased=c()
    
    # the following two hold abundance based JI calculation using (total size intersecting clones)/(total size of union clones)
    overlapValuesProdJI_abundancebased2=c()
    overlapValuesUnprodJI_abundancebased2=c()
    
    NinsertionProd1=c()
    NinsertionUnprod1=c()
    
    NinsertionProd2=c()
    NinsertionUnprod2=c()
    
    
    print(r)
    sam1=m[r,1]
    sam2=m[r,2]
    
    samplesCompared = paste(sam1,sam2,sep="_")
    #print(sam1)
    #print(sam2)
    
    sam1 <- get(as.character(sam1))
    sam2 <- get(as.character(sam2))
    
    #---------------------------------------------------------
    # productive and unproductive selection for each sample
    
    
    sam1Prod <- getProductive(sam1)
    
    if(unProdStatus=="STOP"){
      sam1Unprod <- getHasStop(sam1)
    }else if(unProdStatus=="OUT"){
      sam1Unprod <- getOutOfFrame(sam1)
    }else{
      sam1Unprod <- getUnproductive(sam1)
    }
    
    
    sam2Prod <- getProductive(sam2)
    
    if(unProdStatus=="STOP"){
      sam2Unprod <- getHasStop(sam2)
    }else if(unProdStatus=="OUT"){
      sam2Unprod <- getOutOfFrame(sam2)
    }else{
      sam2Unprod <- getUnproductive(sam2)
    }
    
    
    
    #---------------------------------------------------------
    # selection of sequence type index for each sample
    
    indSam1 <- which(colnames(sam1) == seqType)
    indSam2 <- which(colnames(sam2) == seqType)
    
    
    
    #---------------------------------------------------------
    # selecting top clonotypes for which to do the overlap counts, JI calculation etc. Either by percentage or actual number of top clonotypes
    
    if(cutoffby=="PERC"){
      topclones_sam1Prod = getAbundantClonotypesbyperc(sam1Prod,cutoff)
      topclones_sam1Unprod = getAbundantClonotypesbyperc(sam1Unprod,cutoff)
      
      topclones_sam2Prod = getAbundantClonotypesbyperc(sam2Prod,cutoff)
      topclones_sam2Unprod = getAbundantClonotypesbyperc(sam2Unprod,cutoff)
    }else{
      topclones_sam1Prod = getAbundantClonotypesbynumber(sam1Prod,cutoff)
      topclones_sam1Unprod = getAbundantClonotypesbynumber(sam1Unprod,cutoff)
      
      topclones_sam2Prod = getAbundantClonotypesbynumber(sam2Prod,cutoff)
      topclones_sam2Unprod = getAbundantClonotypesbynumber(sam2Unprod,cutoff)
    }
    
    #---------------------------------------------------------
    # Getting number of overlapping clonotypes, Jaccard index, average insertion size etc for the two samples being compared
    
    # commons for sequence type selected
    commonClonesProd=length(intersect(topclones_sam1Prod[,indSam1],topclones_sam2Prod[,indSam2])) 
    unionProd=length(union(topclones_sam1Prod[,indSam1],topclones_sam2Prod[,indSam2])) 
    
    # commonality checked at the Nt level for non productive sequences, no AA given in the original data
    commonClonesUnprod=length(intersect(topclones_sam1Unprod[,indSam1],topclones_sam2Unprod[,indSam2]))
    unionUnprod=length(union(topclones_sam1Unprod[,indSam1],topclones_sam2Unprod[,indSam2]))
    
    
    
    
    if(unionUnprod==0){
      unionUnprod = 1
    }
    
    if(unionProd==0){
      unionProd = 1
    }
    
    overlapValuesProd=c(overlapValuesProd,as.numeric(commonClonesProd))
    overlapValuesUnprod=c(overlapValuesUnprod,as.numeric(commonClonesUnprod))
    
    
    
    overlapValuesProdJI=c(overlapValuesProdJI,commonClonesProd/unionProd)
    overlapValuesUnprodJI = c(overlapValuesUnprodJI,commonClonesUnprod/unionUnprod)
    
    # Jaccard abundance-based similarity index (UV/(U+V-UV)). U is total relative abundance in of shared sample 1, U total relative of shared in sample 2
    # taken from : http://chao.stat.nthu.edu.tw/wordpress/paper/2006_Biometrics_62_P361.pdf
    
    
    commonClonesProd_U_data= topclones_sam1Prod[topclones_sam1Prod[,indSam1] %in% intersect(topclones_sam1Prod[,indSam1],topclones_sam2Prod[,indSam2]),]
    commonClonesProd_U = sum(commonClonesProd_U_data$COUNT)/sum(topclones_sam1Prod$COUNT)
    
    commonClonesProd_U_abundance = sum(commonClonesProd_U_data$COUNT)
    
    commonClonesProd_V_data= topclones_sam2Prod[topclones_sam2Prod[,indSam2] %in% intersect(topclones_sam1Prod[,indSam1],topclones_sam2Prod[,indSam2]),]
    commonClonesProd_V = sum(commonClonesProd_V_data$COUNT)/sum(topclones_sam2Prod$COUNT)
    
    commonClonesProd_V_abundance = sum(commonClonesProd_V_data$COUNT)
    
    overlapValuesTotalSam1Prod=c(overlapValuesTotalSam1Prod,as.numeric(commonClonesProd_U_abundance))
    overlapValuesTotalSam2Prod=c(overlapValuesTotalSam2Prod,as.numeric(commonClonesProd_V_abundance))
    
    uv=commonClonesProd_U*commonClonesProd_V
    uplusv=commonClonesProd_U+commonClonesProd_V
    
    AbundanceBasedJIProd = uv/(uplusv-uv)
    
    overlapValuesProdJI_abundancebased=c(overlapValuesProdJI_abundancebased,as.numeric(AbundanceBasedJIProd))
    
    # for abundance based JI type 2
    totalSam1Sam2Prod = sum(topclones_sam1Prod$COUNT) + sum(topclones_sam2Prod$COUNT)                                                                                        
    AbundanceBasedJIProd2 = (commonClonesProd_U_abundance + commonClonesProd_V_abundance) / totalSam1Sam2Prod
    overlapValuesProdJI_abundancebased2 = c(overlapValuesProdJI_abundancebased2,as.numeric(AbundanceBasedJIProd2))
    
    
    # for unproductive sequences
    
    commonClonesUnProd_U_data= topclones_sam1Unprod[topclones_sam1Unprod[,indSam1] %in% intersect(topclones_sam1Unprod[,indSam1],topclones_sam2Unprod[,indSam2]),]
    commonClonesUnProd_U = sum(commonClonesUnProd_U_data$COUNT)/sum(topclones_sam1Unprod$COUNT)
    
    commonClonesUnProd_U_abundance = sum(commonClonesUnProd_U_data$COUNT)
    
    commonClonesUnProd_V_data= topclones_sam2Unprod[topclones_sam2Unprod[,indSam2] %in% intersect(topclones_sam1Unprod[,indSam1],topclones_sam2Unprod[,indSam2]),]
    commonClonesUnProd_V = sum(commonClonesUnProd_V_data$COUNT)/sum(topclones_sam2Unprod$COUNT)
    
    commonClonesUnProd_V_abundance = sum(commonClonesUnProd_V_data$COUNT)
    
    
    overlapValuesTotalSam1Unprod=c(overlapValuesTotalSam1Unprod,as.numeric(commonClonesUnProd_U_abundance))
    overlapValuesTotalSam2Unprod=c(overlapValuesTotalSam2Unprod,as.numeric(commonClonesUnProd_V_abundance))
    
    
    uv_UnProd=commonClonesUnProd_U * commonClonesUnProd_V
    uplusv_UnProd=commonClonesUnProd_U + commonClonesUnProd_V
    
    AbundanceBasedJIUnProd = uv_UnProd/(uplusv_UnProd-uv_UnProd)
    
    overlapValuesUnprodJI_abundancebased=c(overlapValuesUnprodJI_abundancebased,as.numeric(AbundanceBasedJIUnProd))
    
    # for abundance based JI type 2
    totalSam1Sam2Unprod = sum(topclones_sam1Unprod$COUNT) + sum(topclones_sam2Unprod$COUNT)                                                                                          
    AbundanceBasedJIUnProd2 = (commonClonesUnProd_U_abundance + commonClonesUnProd_V_abundance) / totalSam1Sam2Unprod
    overlapValuesUnprodJI_abundancebased2 = c(overlapValuesUnprodJI_abundancebased2,as.numeric(AbundanceBasedJIUnProd2))
    
    
    # Ninsertions  in common sequences
    
    commonClonotypes1= topclones_sam1Prod[topclones_sam1Prod[,indSam1] %in% intersect(topclones_sam1Prod[,indSam1],topclones_sam2Prod[,indSam1]),]
    commonClonotypes2= topclones_sam2Prod[topclones_sam2Prod[,indSam2] %in% intersect(topclones_sam1Prod[,indSam1],topclones_sam2Prod[,indSam2]),]
    
    commonClonotypes1Unprod= topclones_sam1Unprod[topclones_sam1Unprod[,indSam1] %in% intersect(topclones_sam1Unprod[,indSam1],topclones_sam2Unprod[,indSam2]),]
    commonClonotypes2Unprod= topclones_sam2Unprod[topclones_sam2Unprod[,indSam2] %in% intersect(topclones_sam1Unprod[,indSam1],topclones_sam2Unprod[,indSam2]),]
    
    
    NinsertionsP1= mean(commonClonotypes1$N2INSERTION + commonClonotypes1$N1INSERTION)
    NinsertionsP2 = mean(commonClonotypes2$N2INSERTION + commonClonotypes2$N1INSERTION)
    
    NinsertionsUP1= mean(commonClonotypes1Unprod$N2INSERTION + commonClonotypes1Unprod$N1INSERTION)
    NinsertionsUP2 = mean(commonClonotypes2Unprod$N2INSERTION + commonClonotypes2Unprod$N1INSERTION)
    
    
    NinsertionProd1=c(NinsertionProd1,NinsertionsP1)
    NinsertionProd2= c(NinsertionProd2,NinsertionsP2)
    
    NinsertionUnprod1=c(NinsertionUnprod1,NinsertionsUP1)
    NinsertionUnprod2=c(NinsertionUnprod2,NinsertionsUP2)
    
    
    # write some common clonotypes of productive and unproductive repertoires
    # select columns to merge and write 
    
    selectedCols <- c("NUCLEOTIDE","AMINOACID","COUNT","VFAMILYNAME","VGENENAME","JGENENAME","CDR3LENGTH","SEQUENCESTATUS")
    selectedColsIdx <- which(colnames(commonClonotypes1) %in% selectedCols)
    
    # write for productive commons
    
    resultDir = paste("Pairwise_Overlap_results_","Productive_Unproductive(",unProdStatus,")",sep="")
    
    dir.create(file.path(resultDir),showWarnings = FALSE)
    
    
    commonsProdForWriting = head(merge(commonClonotypes1[,selectedColsIdx],commonClonotypes2[,selectedColsIdx],by="NUCLEOTIDE"),20)
    commonsUnProdForWriting = merge(commonClonotypes1Unprod[,selectedColsIdx],commonClonotypes2Unprod[,selectedColsIdx],by="NUCLEOTIDE")
    
    
    write.table(commonsProdForWriting,file=paste(resultDir,"/",samplesCompared,"_Common_productive_sequences_top20.txt",sep=""),row.names = F,sep = "\t")
    write.table(commonsUnProdForWriting,file=paste(resultDir,"/",samplesCompared,"_Common_Unproductive_sequences_all.txt",sep=""),row.names = F,sep = "\t")
    
    
    
    #totalNumberOfProdClonotypes= nrow(topclones_sam1Prod) + nrow(topclones_sam2Prod)
    #totalNumberOfUnprodClonotypes= nrow(topclones_sam1Unprod) + nrow(topclones_sam2Unprod)
    
    
    #unproductiveOverlapCountName = paste("OverlapCountUnProductive(",unProdStatus,")",sep="")
    
    if(nrow(toadd) > 0){
      newrow=data.frame(overlapValuesProd,overlapValuesTotalSam1Prod,overlapValuesTotalSam2Prod,overlapValuesUnprod,overlapValuesTotalSam1Unprod,overlapValuesTotalSam2Unprod,overlapValuesProdJI,overlapValuesUnprodJI,overlapValuesProdJI_abundancebased,overlapValuesUnprodJI_abundancebased,overlapValuesProdJI_abundancebased2,overlapValuesUnprodJI_abundancebased2,NinsertionProd1,NinsertionProd2,NinsertionUnprod1,NinsertionUnprod2)
      colnames(newrow) <- c("OverlapCountProductive","OverlapCountTotalProductiveSam1","OverlapCountTotalProductiveSam2","OverlapCountUnProductive","OverlapCountTotalUnProductiveSam1","OverlapCountTotalUnProductiveSam2","OverlapProd_JI","OverlapUnProductive_JI","AbundanceBasedProd_JI","AbundanceBasedUnProductive_JI","AbundanceBasedProd_JI2","AbundanceBasedUnProductive_JI2","NinsertionProdSample1","NinsertionProdSample2","NinsertionUnProductiveSample1","NinsertionUnProductiveSample2")
      
      toadd=rbind(toadd,newrow)
    }else{
      toadd=data.frame(overlapValuesProd,overlapValuesTotalSam1Prod,overlapValuesTotalSam2Prod,overlapValuesUnprod,overlapValuesTotalSam1Unprod,overlapValuesTotalSam2Unprod,overlapValuesProdJI,overlapValuesUnprodJI,overlapValuesProdJI_abundancebased,overlapValuesUnprodJI_abundancebased,overlapValuesProdJI_abundancebased2,overlapValuesUnprodJI_abundancebased2,NinsertionProd1,NinsertionProd2,NinsertionUnprod1,NinsertionUnprod2)
      colnames(toadd) <- c("OverlapCountProductive","OverlapCountTotalProductiveSam1","OverlapCountTotalProductiveSam2","OverlapCountUnProductive","OverlapCountTotalUnProductiveSam1","OverlapCountTotalUnProductiveSam2","OverlapProd_JI","OverlapUnProductive_JI","AbundanceBasedProd_JI","AbundanceBasedUnProductive_JI","AbundanceBasedProd_JI2","AbundanceBasedUnProductive_JI2","NinsertionProdSample1","NinsertionProdSample2","NinsertionUnProductiveSample1","NinsertionUnProductiveSample2")
    }
    
  }
  
  
  
  
  
  #head(toadd)
  #print(nrow(toadd))
  resultTable=data.frame(m,toadd)        
  
  outputFileName = paste(resultDir,"/","Pairwise_Overlap_Counts_",seqType,"_Productive_Unproductive(",unProdStatus,").txt",sep="")
  
  write.table(resultTable,file=outputFileName,row.names = T,sep = "\t")
  return(resultTable)
  
  
}




#### Making PCA for V or VJ usage ####

getPCAfordata <- function(dd,groups,scaling=T,oprefix="O"){
  # Assumes that the observations (samples) are on columns and variables on rows. We want to make the variables
  # on columns so it first transposes d.
  dd=t(dd)
  #head(d)
  
  pr<-prcomp(dd,scale=scaling)
  print(summary(pr)) # proportions of the total variance explained by each component
  
  #samples names
  sampleNames=rownames(dd)
  
  # get variance explained by each PC
  s=summary(pr)$importance[2,]
  
  # grouping of samples
  #groups=groups
  
  # Write and display PCA result
  
  flname = paste(oprefix,"PCA.pdf")
  pdf(file=flname, useDingbats = FALSE)
  
  tobedrawn=data.frame(pr$x,group=factor(groups),snames=rownames(dd))
  write.table(pr$x,file=paste(oprefix,"_PCArotatedData.txt"),row.names = T,col.names=T,sep = "\t")
  
  pcaP=ggplot(tobedrawn, aes(x=PC1, y=PC2,shape=group,colour=snames)) + xlab(paste("PC1 (",format(round(s[1]*100),nsmall=2),"%)")) + ylab(paste("PC2 (",format(round(s[2]*100),nsmall=2),"%)")) + geom_point(size=6) + scale_fill_brewer(palette="Set1") 
  pcaP = pcaP + theme_bw() + theme(panel.grid.minor = element_blank(),panel.grid.major = element_blank()) + theme(axis.line = element_line(colour = "black")) 
  
  
  
  
  print(pcaP)  
  #png(file=paste(oprefix,"PCA.png"),height=700, width=650)
  
  #Plot using ggplot, much more flexible
  #ggplot(as.data.frame(pr$x), aes(x=PC1, y=PC2)) + xlab(paste("PC1 (",format(round(s[1]*100),nsmall=2),"%)")) + ylab(paste("PC2 (",format(round(s[2]*100),nsmall=2),"%)")) + geom_point(size=6,aes(colour = sampleNames,shape=groups)) + scale_fill_brewer(palette="Set1") + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),panel.background = element_blank(), axis.line = element_line(colour = "black")) 
                                                                                                                                                                                                                                                                              
  
  dev.off()
  
  #ggsave(flname, plot = last_plot(), device = "pdf")
  
  #print(pcaP)
  
}

getPCAfordata1 <- function(dd,groups,oprefix="O"){
  # Assumes that the observations (samples) are on columns and variables on rows. We want to make the variables
  # on columns so it first transposes d.
  dd=t(dd)
  #head(d)
  
  pr<-prcomp(dd)
  print(summary(pr)) # proportions of the total variance explained by each component
  
  #samples names
  sampleNames=rownames(dd)
  
  # get variance explained by each PC
  s=summary(pr)$importance[2,]
  
  # grouping of samples
  #groups=groups
  
  # Write and display PCA result
  flname = paste(oprefix,"PCA.pdf")
  pdf(file=flname, useDingbats = FALSE)
  
  
  tobedrawn=data.frame(pr$x,group=factor(groups),snames=rownames(dd))
  write.table(pr$x,file=paste(oprefix,"_PCArotatedData.txt"),row.names = T,col.names=T,sep = "\t")
  
  pcaP=ggplot(tobedrawn, aes(x=PC1, y=PC2,shape=group,colour=snames)) + xlab(paste("PC1 (",format(round(s[1]*100),nsmall=2),"%)")) + ylab(paste("PC2 (",format(round(s[2]*100),nsmall=2),"%)")) + geom_point(size=6) + scale_fill_brewer(palette="Set1")
  pcaP = pcaP + theme_bw() + theme(panel.grid.minor = element_blank(),panel.grid.major = element_blank()) + theme(axis.line = element_line(colour = "black")) 
  
  
  #png(file=paste(oprefix,"PCA.png"),height=700, width=650)
  
  #Plot using ggplot, much more flexible
  #ggplot(as.data.frame(pr$x), aes(x=PC1, y=PC2)) + xlab(paste("PC1 (",format(round(s[1]*100),nsmall=2),"%)")) + ylab(paste("PC2 (",format(round(s[2]*100),nsmall=2),"%)")) + geom_point(size=6,aes(colour = sampleNames,shape=groups)) + scale_fill_brewer(palette="Set1") + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),panel.background = element_blank(), axis.line = element_line(colour = "black")) 
  print(pcaP)                                                                                                                                                                                                                                                                              
  
  dev.off()
  
  
  
  #print(pcaP)
  
}


getPCAfordata2 <- function(dd,groups,oprefix="O"){
  # Assumes that the observations (samples) are on columns and variables on rows. We want to make the variables
  # on columns so it first transposes d.
  dd=t(dd)
  #head(d)
  
  pr<-prcomp(dd)
  print(summary(pr)) # proportions of the total variance explained by each component
  
  #samples names
  sampleNames=rownames(dd)
  
  # get variance explained by each PC
  s=summary(pr)$importance[2,]
  
  # grouping of samples
  #groups=groups
  
  # Write and display PCA result
  flname = paste(oprefix,"PCA.pdf")
  pdf(file=flname, useDingbats = FALSE)
  
  tobedrawn=data.frame(pr$x,repType=factor(groups),snames=rownames(dd))
  write.table(pr$x,file=paste(oprefix,"_PCArotatedData.txt"),row.names = T,col.names=T,sep = "\t")
  
  #pcaP=ggplot(tobedrawn, aes(x=PC1, y=PC2,shape=repType,colour=repType,label=snames))
  # pcaP=ggplot(tobedrawn, aes(x=PC1, y=PC2,shape=repType,colour=repType,label=snames))
  pcaP=ggplot(tobedrawn, aes(x=PC1, y=PC2,label=snames)) + 
    xlab(paste("PC1 (",format(round(s[1]*100),nsmall=2),"%)")) + 
    ylab(paste("PC2 (",format(round(s[2]*100),nsmall=2),"%)")) + 
    scale_fill_brewer(palette="Set1")
  
  pcaP = pcaP + geom_text(size=3,check_overlap = TRUE,aes(colour = repType, fontface = "bold"),position=position_jitter())
  
  pcaP = pcaP + theme_bw() + theme(panel.grid.minor = element_blank(),panel.grid.major = element_blank()) + theme(axis.line = element_line(colour = "black")) 
  
  
  
  #png(file=paste(oprefix,"PCA.png"),height=700, width=650)
  
  #Plot using ggplot, much more flexible
  #ggplot(as.data.frame(pr$x), aes(x=PC1, y=PC2)) + xlab(paste("PC1 (",format(round(s[1]*100),nsmall=2),"%)")) + ylab(paste("PC2 (",format(round(s[2]*100),nsmall=2),"%)")) + geom_point(size=6,aes(colour = sampleNames,shape=groups)) + scale_fill_brewer(palette="Set1") + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),panel.background = element_blank(), axis.line = element_line(colour = "black")) 
  print(pcaP)                                                                                                                                                                                                                                                                              
  
  dev.off()
  
  
  
  #print(pcaP)
  
}

getPCAfordata3 <- function(dd,groups,oprefix="O"){
  # Assumes that the observations (samples) are on columns and variables on rows. We want to make the variables
  # on columns so it first transposes d.
  dd=t(dd)
  #head(d)
  
  pr<-prcomp(dd)
  print(summary(pr)) # proportions of the total variance explained by each component
  
  #samples names
  sampleNames=rownames(dd)
  
  # get variance explained by each PC
  s=summary(pr)$importance[2,]
  
  # grouping of samples
  #groups=groups
  
  # Write and display PCA result
  flname = paste(oprefix,"PCA.pdf")
  pdf(file=flname, useDingbats = FALSE)
  
  
  tobedrawn=data.frame(pr$x,repType=factor(groups),snames=rownames(dd))
  write.table(pr$x,file=paste(oprefix,"_PCArotatedData.txt"),row.names = T,col.names=T,sep = "\t")
  
  pcaP=ggplot(tobedrawn, aes(x=PC1, y=PC2,shape=repType,colour=repType)) + xlab(paste("PC1 (",format(round(s[1]*100),nsmall=2),"%)")) + ylab(paste("PC2 (",format(round(s[2]*100),nsmall=2),"%)")) + geom_point(size=6) + scale_fill_brewer(palette="Set1") 
  pcaP = pcaP + theme_bw() + theme(panel.grid.minor = element_blank(),panel.grid.major = element_blank()) + theme(axis.line = element_line(colour = "black")) 
  
  #png(file=paste(oprefix,"PCA.png"),height=700, width=650)
  
  #Plot using ggplot, much more flexible
  #ggplot(as.data.frame(pr$x), aes(x=PC1, y=PC2)) + xlab(paste("PC1 (",format(round(s[1]*100),nsmall=2),"%)")) + ylab(paste("PC2 (",format(round(s[2]*100),nsmall=2),"%)")) + geom_point(size=6,aes(colour = sampleNames,shape=groups)) + scale_fill_brewer(palette="Set1") + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),panel.background = element_blank(), axis.line = element_line(colour = "black")) 
  print(pcaP)                                                                                                                                                                                                                                                                              
  
  dev.off()
  
  
  
  #print(pcaP)
  
}


getPCAfordata4 <- function(dd,groups,scaling=T,oprefix="O"){
  # Assumes that the observations (samples) are on columns and variables on rows. We want to make the variables
  # on columns so it first transposes d.
  dd=t(dd)
  #head(d)
  
  pr<-prcomp(dd,scale=scaling)
  print(summary(pr)) # proportions of the total variance explained by each component
  
  #samples names
  sampleNames=rownames(dd)
  
  # get variance explained by each PC
  s=summary(pr)$importance[2,]
  
  # grouping of samples
  #groups=groups
  
  # Write and display PCA result
  
  flname = paste(oprefix,"PCA.pdf")
  pdf(file=flname, useDingbats = FALSE)
  
  tobedrawn=data.frame(pr$x,group=factor(groups),snames=rownames(dd))
  write.table(pr$x,file=paste(oprefix,"_PCArotatedData.txt"),row.names = T,col.names=T,sep = "\t")
  
  pcaP=ggplot(tobedrawn, aes(x=PC1, y=PC2,colour=group)) + xlab(paste("PC1 (",format(round(s[1]*100),nsmall=2),"%)")) + ylab(paste("PC2 (",format(round(s[2]*100),nsmall=2),"%)")) + geom_point(size=4) + scale_fill_brewer(palette="Set1") 
  pcaP = pcaP + theme_bw() + theme(panel.grid.minor = element_blank(),panel.grid.major = element_blank()) + theme(axis.line = element_line(colour = "black")) 
  
  
  
  
  print(pcaP)  
  #png(file=paste(oprefix,"PCA.png"),height=700, width=650)
  
  #Plot using ggplot, much more flexible
  #ggplot(as.data.frame(pr$x), aes(x=PC1, y=PC2)) + xlab(paste("PC1 (",format(round(s[1]*100),nsmall=2),"%)")) + ylab(paste("PC2 (",format(round(s[2]*100),nsmall=2),"%)")) + geom_point(size=6,aes(colour = sampleNames,shape=groups)) + scale_fill_brewer(palette="Set1") + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),panel.background = element_blank(), axis.line = element_line(colour = "black")) 
  
  
  dev.off()
  
  #ggsave(flname, plot = last_plot(), device = "pdf")
  
  #print(pcaP)
  
}


#### Fisher's exact test for V,J or VJ usage counts ####

compareUsageProportions<-function(freqTable,pcutoff=0.05){
  ### freqTable : an Feature frequency table between sample groups, table has headers, headers 2 and 3 have the counts
  
  sam=freqTable
  #hdnames= colnames(sam)
  
  #hlaAlleles <- as.data.frame(sam[,1],stringsAsFactors=FALSE )
  #colnames(hlaAlleles)  <- hdnames[1]
  
  #signficanceFound=T # a flag to check if significant comparison is found in each round
  #topsignificantAllele=F # a flag to hold the allele showing the most significant difference between sample groups in each round, will be removed during next round comparison
  
  resultTable <- sam
  pvals=c()
  ORs=c()
  CIs=c()
  
  print(paste("There are",ncol(sam),"samples in the data. Comparison is done for every possible pair of samples..."))
  
  print(paste("Number of features to compare : ",nrow(sam)))
  
  toploop=ncol(sam)-1
  inloop=ncol(sam)
  
  for (j in 1:toploop){
    
    for (k in j+1:inloop){
      #print(inloop)
      #print(j)
      #print(k)
      if (k > inloop){break}
      res=compareUsageInTwoSamples(sam[,c(j,k)])
      colnames(res) <- c(paste(colnames(sam[,c(j,k)])[1],colnames(sam[,c(j,k)])[2],"padjusted",sep="_"),paste(colnames(sam[,c(j,k)])[1],colnames(sam[,c(j,k)])[2],"OR",sep="_"),paste(colnames(sam[,c(j,k)])[1],colnames(sam[,c(j,k)])[2],"CI",sep="_"))
      resultTable <- cbind(resultTable,res) 
    }
    
  }
  
  print("Result written to file....")
  write.table(resultTable,file="Muke_Usage_fishers_comparison_result.txt",row.names = T,sep = "\t")
  return(resultTable)
  
  
}


compareUsageInTwoSamples<-function(freqTable,pcutoff=0.05){
  sam=freqTable
  pvals=c()
  ORs=c()
  CIs=c()
  
  print(head(sam))
  
  for(i in 1:nrow(sam)){
    
    current<-sam[i,]
    othersSum<-colSums(sam[-i,])
    #print(othersSum)
    
    dataForComparison=matrix(c(as.numeric(current),as.numeric(othersSum)),nrow = 2,byrow=T)
    #colnames(dataForComparison)<-c(hdnames[2], hdnames[3])
    #rownames(dataForComparison)<-c(as.character(sam[i,i]), "OtherAlleles")
    
    fresult=fisher.test(dataForComparison)
    pvals=c(pvals,fresult$p.value)
    ORs=c(ORs,as.numeric(fresult$estimate))
    CIs=c(CIs,paste(fresult$conf[1],fresult$conf[2],sep="-"))
    
    #pvals=p.adjust(pvals, "BH") # pvals are adjusted
    
    #cnames=c(paste(colnames(sam)[selectedCols],"pval",sep="") ,paste(colnames(sam)[selectedCols],"OR",sep=""),paste(colnames(sam)[selectedCols],"CI",sep=""))
    
    
  }
  
  pvals=p.adjust(pvals, "BH") # pvals are adjusted
  res = cbind(pvals,ORs,CIs)
  
  return(res)  
}


compareAbundanceInTwoSamplesForShared<-function(freqTable,pcutoff=0.05,sharedClonesOnly=T,minTotalAbundance=100){
  sam=freqTable
  pvals=c()
  ORs=c()
  CIs=c()
  
  print(head(sam))
  
  if(sharedClonesOnly==T){
    sharedClonesIndices = apply(sam,1,function(x) (sum(x>0) == 2 & sum(x) >= minTotalAbundance))
  }else{
    sharedClonesIndices = apply(sam,1,function(x) (sum(x) >= minTotalAbundance))
  }
  
  sharedCloneIndices = which(sharedClonesIndices)
  numOfShared = length(sharedCloneIndices)
  
    
  for(i in 1:numOfShared){
    
    current<-sam[sharedCloneIndices[i],]
    othersSum<-colSums(sam[-sharedCloneIndices[i],])
    #print(othersSum)
    
    dataForComparison=matrix(c(as.numeric(current),as.numeric(othersSum)),nrow = 2,byrow=T)
    #colnames(dataForComparison)<-c(hdnames[2], hdnames[3])
    #rownames(dataForComparison)<-c(as.character(sam[i,i]), "OtherAlleles")
    
    fresult=fisher.test(dataForComparison)
    pvals=c(pvals,fresult$p.value)
    ORs=c(ORs,as.numeric(fresult$estimate))
    CIs=c(CIs,paste(fresult$conf[1],fresult$conf[2],sep="-"))
    
    #pvals=p.adjust(pvals, "BH") # pvals are adjusted
    
    #cnames=c(paste(colnames(sam)[selectedCols],"pval",sep="") ,paste(colnames(sam)[selectedCols],"OR",sep=""),paste(colnames(sam)[selectedCols],"CI",sep=""))
    
    
  }
  
  pvals=p.adjust(pvals, "BH") # pvals are adjusted
  res = cbind(as.numeric(pvals),ORs,CIs)
  
  rownames(res) = rownames(sam[sharedCloneIndices,])
  
  return(res)  
}


compareUsageProportionsPaired<-function(freqTable,pcutoff=0.05,pairs){
  ### freqTable : an Feature frequency table between sample groups, table has headers, headers 2 and 3 have the counts
  
  sam=freqTable
  #hdnames= colnames(sam)
  
  #hlaAlleles <- as.data.frame(sam[,1],stringsAsFactors=FALSE )
  #colnames(hlaAlleles)  <- hdnames[1]
  
  #signficanceFound=T # a flag to check if significant comparison is found in each round
  #topsignificantAllele=F # a flag to hold the allele showing the most significant difference between sample groups in each round, will be removed during next round comparison
  
  resultTable <- sam
  pvals=c()
  ORs=c()
  CIs=c()
  
  
  
  print(paste("There are",ncol(sam),"samples in the data. Comparison is done for pair of samples..."))
  
  print(paste("Number of features to compare : ",nrow(sam)))
  
  rnds = length(pairs)/2
  for (k in 1:rnds){
      kpair = k + rnds
      res=compareUsageInTwoSamples(sam[,c(k,kpair)])
      colnames(res) <- c(paste(colnames(sam[,c(k,kpair)])[1],colnames(sam[,c(k,kpair)])[2],"padjusted",sep="_"),paste(colnames(sam[,c(k,kpair)])[1],colnames(sam[,c(k,kpair)])[2],"OR",sep="_"),paste(colnames(sam[,c(k,kpair)])[1],colnames(sam[,c(k,kpair)])[2],"CI",sep="_"))
      resultTable <- cbind(resultTable,res) 
    }
    
  
  
  print("Result written to file....")
  write.table(resultTable,file="Usage_fishers_comparison_result_forPairSamples.txt",row.names = T,sep = "\t")
  return(resultTable)
  
  
}


compareAbundanceInPairedSamples<-function(freqTable,pcutoff=0.01,pairs,writeResult=F,resultDir="matcheDA"){
  ### freqTable : an clone count table between sample groups, table has headers, headers 1 and 2 have the counts
  
  sam=freqTable
  
  resultlist <- list()
  pvals=c()
  ORs=c()
  CIs=c()
  
  
  
  print(paste("There are",ncol(sam),"samples in the data. Comparison is done for pair of samples..."))
  
  print(paste("Number of features to compare : ",nrow(sam)))
  
  
  # dir to write results to
    
  dir.create(file.path(resultDir),showWarnings = FALSE)
   
  
  # comparison for every pair of samples
  
  rnds = length(pairs)/2
  for (k in 1:rnds){
    kpair = k + rnds
    res=compareAbundanceInTwoSamplesForShared(sam[,c(k,kpair)],sharedClonesOnly=F)
    colnames(res) <- c(paste(colnames(sam[,c(k,kpair)])[1],colnames(sam[,c(k,kpair)])[2],"padjusted",sep="_"),paste(colnames(sam[,c(k,kpair)])[1],colnames(sam[,c(k,kpair)])[2],"OR",sep="_"),paste(colnames(sam[,c(k,kpair)])[1],colnames(sam[,c(k,kpair)])[2],"CI",sep="_"))
    
    print(str(res[,1]))
    
    
    siglist = as.numeric(res[,1]) < pcutoff
    
    resSig = res[siglist,]
    
    print(head(siglist))
    
    
    resultlist[[colnames(sam[,c(k,kpair)])[1]]] <- cbind(sam[rownames(resSig),c(k,kpair)],resSig)
    
    if(writeResult==T){
    print("Result written to file....")
    write.table(res,file=paste(resultDir,"/",colnames(sam[,c(k,kpair)])[1],"_",colnames(sam[,c(k,kpair)])[2],"_Abundance_fishers_comparison_result_forPairSamples.txt",sep=""),row.names = T,sep = "\t")
    }
   
    
      
  
  }
  
  return(resultlist)
  
  
}




#### composition based comparison of repertoires using 4-mer nt usage frequencies ####

getKmerNtFrequencies <- function(reps,kMer=4,asProb=F,vGenes=NULL,seqStatus="prod"){
  
  kmerFreqTable = NULL
  
  for(rep in reps){
    print(rep)
    p <- get(rep)
    
    if(seqStatus=="prod"){
      p <- getProductive(p)
    }else {
      p <- getUnproductive(p)
    }
    
  
    if(!is.null(vGenes)){    
      p <- p[p$VGENENAME %in% vGenes, ]
    }  
    
    print(nrow(p))
    #print(p[,c(8,22)])
    
    seqs <- unique(p$NUCLEOTIDE) # select unique AA clonotypes
    seqSet <- DNAStringSet(seqs)
    seq_mers <- oligonucleotideFrequency(seqSet,width=kMer,as.prob=asProb,simplify.as="collapsed")
    kmerFreqTable = rbind(kmerFreqTable,seq_mers)
    
  }
  
  
  rownames(kmerFreqTable) <- reps
  
  kmerFreqTable <- t(kmerFreqTable)
  
  print("Writing result to file....")
  
  write.table(kmerFreqTable,file="4merFrequencyTable.txt",row.names = T,col.names=T,sep = "\t")
  
  return(kmerFreqTable)
  
}

getKmerNtFrequenciesVJ <- function(reps,kMer=4,asProb=F,vjGenes=NULL,seqStatus="prod"){
  
  kmerFreqTable = NULL
  
  for(rep in reps){
    print(rep)
    p <- get(rep)
    
    if(seqStatus=="prod"){
      p <- getProductive(p)
    }else {
      p <- getUnproductive(p)
    }
    
    tempd = NULL
    if(!is.null(vjGenes)){
      
      res=strsplit(vjGenes," ")
      for(i in 1:length(res)){
        
        tempd <- rbind(tempd,p[(p$VGENENAME==res[[i]][1]) & (p$JGENENAME==res[[i]][2]),])
      }
    
      p <- tempd
      
    }  
   
  print(nrow(p))
  #print(p[,c(8,22)])
    
  seqs <- unique(p$NUCLEOTIDE) # select unique AA clonotypes
  seqSet <- DNAStringSet(seqs)
  seq_mers <- oligonucleotideFrequency(seqSet,width=kMer,as.prob=asProb,simplify.as="collapsed")
  kmerFreqTable = rbind(kmerFreqTable,seq_mers)
  
  }
  
 
  rownames(kmerFreqTable) <- reps
  
  kmerFreqTable <- t(kmerFreqTable)
  
  print("Writing result to file....")
  
  write.table(kmerFreqTable,file="4merFrequencyTable.txt",row.names = T,col.names=T,sep = "\t")
  
  return(kmerFreqTable)
  
}

getClonalKmerNtFrequencies <- function(reps,selectionType="random",nClones=100,kMer=4,asProb=F,vGenes=NULL,seqStatus="prod"){
  
  kmerFreqTable = NULL
  repnames = c()
  
  for(rep in reps){
    print(rep)
    p <- get(rep)
    
    if(seqStatus=="prod"){
      p <- getProductive(p)
    }else {
      p <- getUnproductive(p)
    }
    
    #select clones with given list of Vgenes
    
    if(!is.null(vGenes)){ 
      p <- p[p$VGENENAME %in% vGenes,]
    }  
    
    # then select 100 clonotypes
    
    if(selectionType=="random"){ 
    sampling.index = sample(1:nrow(p),nClones,replace=F)  
    }else{
      # just pick the top nClones
      sampling.index = 1:nClones  
    }
    
    p <- p[sampling.index,]
    
   
    
    print(nrow(p))
    
    seqs <- unique(p$NUCLEOTIDE) # select unique AA clonotypes
    seqSet <- DNAStringSet(seqs)
    seq_mers <- oligonucleotideFrequency(seqSet,width=kMer,as.prob=asProb)
    kmerFreqTable = rbind(kmerFreqTable,seq_mers)
    
    repnames = c(repnames,rep(rep,nClones))
  }
  
  rownames(kmerFreqTable) <- repnames  
  kmerFreqTable <- t(kmerFreqTable)
  
  print("Writing result to file....")
  
  write.table(kmerFreqTable,file="clonal4merFrequencyTable.txt",row.names = T,col.names=T,sep = "\t")
  
  return(list(freqTable = kmerFreqTable,sams = repnames))
  
}
dyohanne/RepAn documentation built on Feb. 3, 2023, 2:41 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
dyohanne/RepAn
Differential abundance Analysis for deep sequenced immune repertoire (Repseq) datasets

R/RepAnFns.R
In dyohanne/RepAn: Differential abundance Analysis for deep sequenced immune repertoire (Repseq) datasets

Documented in getDefined getHasStop getOutOfFrame getProductive getUnproductive loadBioconductorPacks loadPacks readSample readSampleV2

R Package Documentation

Browse R Packages

We want your feedback!

dyohanne/RepAn Differential abundance Analysis for deep sequenced immune repertoire (Repseq) datasets

R/RepAnFns.R In dyohanne/RepAn: Differential abundance Analysis for deep sequenced immune repertoire (Repseq) datasets

Documented in getDefined getHasStop getOutOfFrame getProductive getUnproductive loadBioconductorPacks loadPacks readSample readSampleV2

R Package Documentation

Browse R Packages

We want your feedback!

dyohanne/RepAn
Differential abundance Analysis for deep sequenced immune repertoire (Repseq) datasets

R/RepAnFns.R
In dyohanne/RepAn: Differential abundance Analysis for deep sequenced immune repertoire (Repseq) datasets