R/RepAnFns.R

Defines functions getClonalKmerNtFrequencies getKmerNtFrequenciesVJ getKmerNtFrequencies compareAbundanceInPairedSamples compareUsageProportionsPaired compareAbundanceInTwoSamplesForShared compareUsageInTwoSamples compareUsageProportions getPCAfordata4 getPCAfordata3 getPCAfordata2 getPCAfordata1 getPCAfordata overLappingClonotypesByChoice overLappingClonotypesAA overLappingClonotypesHasStop overLappingClonotypes2 overLappingClonotypes getAbundantClonotypesbynumber getAbundantClonotypesbyperc getAggregateCount getVjReadCountPMatrixUnprod getVjReadCountPMatrix getVjReadCountMatrixUnprod getVjReadCountMatrix getVgeneReadCountMatrixUnprod getVgeneReadCountPMatrixUnprod getVgeneReadCountMatrix getVgeneReadCountPMatrix getVjCloneCountPMatrixUnprod getVjCloneCountPMatrix getVjCloneCountMatrixUnprod getVjCloneCountMatrix getVgeneCloneCountMatrixUnprod getVgeneCloneCountPMatrixUnprod getVgeneCloneCountMatrix getVgeneCloneCountPMatrix getUniqueJgenes getUniqueVfamilies_PBMC getUniqueVfamilies getUniqueVgenes getAllVJgenes getAllJgenes getAllVfamilies getAllVgenes getDefined getHasStop getOutOfFrame getUnproductive getProductive extractCDR3 renameCopyTOCount readMiXCR readSampleV2 readSample loadBioconductorPacks loadPacks

Documented in getDefined getHasStop getOutOfFrame getProductive getUnproductive loadBioconductorPacks loadPacks readSample readSampleV2

### ***************************************************************
### ******** Basic analysis for TCR repertoire data **************
### ***************************************************************


########################### External R packages ###########################

#' Attaches packages from CRAN after installing them if they don't exist already
#' 
#' @param package.list vector of packages
#' 
loadPacks <- function(package.list = c("ggplot2","seqRFLP","stringr","permute","cluster","data.table","NbClust","doParallel","dendextend","kmer","ape","Matrix","wordspace","dynamicTreeCut","e1071","fclust","randomForest","preprocessCore","gplots")){
  # other probably needed packages that have been removed for now: "Rclusterpp"
  new.packages <-package.list[!(package.list %in% installed.packages()[,"Package"])]
  if(length(new.packages) > 0) try(install.packages(new.packages))
  loadSuccess <- lapply(eval(package.list), require, character.only=TRUE,quietly=TRUE)
}

#' Attaches packages from Bioconductor after installing them if they don't exist already
#' 
#' @param package.list vector of Bioconductor packages
#' 
loadBioconductorPacks <- function(package.list = c("Biostrings","RankProd","preprocessCore","msa","ggseqlogo")){
  new.packages <-package.list[!(package.list %in% installed.packages()[,"Package"])]
  if(length(new.packages) > 0){
    if((R.Version()[["major"]] >= 4) | (R.Version()[["major"]] >= 3 & unlist(strsplit(R.Version()[["minor"]],"\\."))[1] >= 5)){
      if (!requireNamespace("BiocManager", quietly = TRUE))
        install.packages("BiocManager")
      BiocManager::install(new.packages)      
    }else{
      source("http://bioconductor.org/biocLite.R")
      biocLite(new.packages)
    }
  }
  loadSuccess <- lapply(eval(package.list), require, character.only=TRUE,quietly=TRUE)
}



#loadPacks()
#loadBioconductorPacks()


########################### FUNCTIONS #########################


## read data files ####

#' Read in Repertoire data (Repseq sample) in older Immunoseq format
#' 
#' @param samplename name of the immunoseq formatted repertoire sample (full address or in the working directory)
#' @param cloneStatus logical; reading only productive or only unproductive CDR3s from the sample; default is productive only
#' @param filterOutBelowSize1 logical; drop clonotypes with a size of 1, default is true
#' @return a repseq sample is returned
#' 

readSample <- function(samplename,cloneStatus="p",filterOutBelowSize1=T){
  
  # read sample name and change colnames to uppercase
  
  sam <- read.table(samplename, header=T, sep="\t",dec = ".",stringsAsFactors=F)
  #sam <- as.data.frame(fread(samplename, header=T, sep="\t",stringsAsFactors=F))
  
  
  colnames(sam) <- toupper(colnames(sam))
  sam <- sam[,which(colnames(sam)=="NUCLEOTIDE"):length(colnames(sam))] # remove unwanted columns that come with some data e.g container etc
  
  # Extract cdr3 sequences and put on one column
  CDR3NT = substring(sam$NUCLEOTIDE,sam$VINDEX+1,sam$VINDEX + sam$CDR3LENGTH)
  sam <- data.frame(sam,CDR3NT,stringsAsFactors=F)
  
  # if there is a column named copy, change it to COUNT
  
  if(sum(colnames(sam) == "COPY") == 1){
    
    colnames(sam)[which(colnames(sam) == "COPY")] <- "UNNORMALIZEDCOUNT"
    colnames(sam)[which(colnames(sam) == "NORMALIZEDCOPY")] <- "COUNT"
    colnames(sam)[which(colnames(sam) == "NORMALIZEDFREQUENCY")] <- "FREQUENCYCOUNT"
    
    
  }
  
  if(filterOutBelowSize1==T)
    sam <- sam[sam[,"COUNT"] > 1,]
  
  
  # filter sample for only productive or unproductive clonotypes
  
  if(cloneStatus == "p"){
    sam = getProductive(sam)
  }else if(cloneStatus == "unp"){
    sam = getUnproductive(sam)
  }
  
  return(sam)
  
}


#' Read in Repertoire data (Repseq sample) in the recent (version 3) Immunoseq format
#' 
#' @param samplename name of the immunoseq formatted repertoire sample (full address or in the working directory)
#' @param cloneStatus logical; reading only productive or only unproductive CDR3s from the sample; default is productive only
#' @param filterOutBelowSize1 logical; drop clonotypes with a size of 1, default is true
#' @return a repseq sample is returned
#' 

readSampleV2 <- function(samplename,cloneStatus="p",filterOutBelowSize1=T){
  
  # read sample name and change colnames to uppercase
  
  #sam <- read.table(samplename, header=T, sep="\t",dec = ".",stringsAsFactors=F)
  #sam <- as.data.frame(fread(samplename, header=T, sep="\t",stringsAsFactors=F,verbose=F,showProgress=F))
  
  sam <- as.data.frame(fread(samplename, header=T, sep="\t",dec = ".",stringsAsFactors=F))
  
  #sam <- read.table(samplename, header=T, sep="\t",dec = ".",stringsAsFactors=F)
  
  colnames(sam) <- toupper(colnames(sam))
  sam <- sam[,which(colnames(sam)=="NUCLEOTIDE"):length(colnames(sam))] # remove unwanted columns that come with some data e.g container etc
  
  # Extract cdr3 sequences and put on one column
  CDR3NT = substring(sam$NUCLEOTIDE,sam$VINDEX+1,sam$VINDEX + sam$CDR3LENGTH)
  sam <- data.frame(sam,CDR3NT,stringsAsFactors=F)
  
  # if there is a column named copy, change it to COUNT
  
  colnames(sam)[grep("TEMPLATES",colnames(sam))] <- "COUNT"
  colnames(sam)[grep("FREQUENCYCOUNT",colnames(sam))] <- "FREQUENCYCOUNT"
  
  
  if(filterOutBelowSize1==T)
    sam <- sam[sam[,"COUNT"] > 1,]
  
  
  #colnames(sam)[grep("AMINOACID",colnames(sam))] <- "AMINOACID"
  #colnames(sam)[grep("VGENENAME",colnames(sam))] <- "VGENENAME"
  #colnames(sam)[grep("SEQUENCESTATUS",colnames(sam))] <- "SEQUENCESTATUS"
  
  # filter sample for only productive or unproductive clonotypes
  
  if(cloneStatus == "p"){
    sam = getProductive(sam)
  }else if(cloneStatus == "unp"){
    sam = getUnproductive(sam)
  }
  
  return(sam)
  
}


#' Read in MiXCR format data and change to Immunoseq format
#' 
#' @param samplename name of the MiXCR formatted repertoire sample (full address or in the working directory)
#' @param cloneStatus logical; reading only productive or only unproductive CDR3s from the sample; default is productive only
#' @param filterOutBelowSize1 logical; drop clonotypes with a size of 1, default is true
#' @return a repseq sample is returned
#' 

readMiXCR <- function(samplename,cloneStatus="p",filterOutBelowSize1=T){
  
  # read the file fast, these are big files usually
  
  sam <- as.data.frame(data.table::fread(samplename, header=T, dec = ".",stringsAsFactors=F))
  
  # we drop clones with size 1 at this point since these data are huge, in addition Immunoseq min clone size is 2.
  
  if(filterOutBelowSize1==T)
    sam <- sam[sam[,"Clone count"] > 1,]
  
  
  sam$CDR3length <- sapply(sam[,"N. Seq. CDR3"],nchar)
  
  sam$vgenename <- sapply(sam[,"All V hits"],function(x) unlist(strsplit(unlist(strsplit(x,","))[1],"\\*"))[1])
  sam$vgenename[is.na(sam$vgenename)] <- "unresolved"
  
  sam$dgenename <- sapply(sam[,"All D hits"],function(x) unlist(strsplit(unlist(strsplit(x,","))[1],"\\*"))[1])
  sam$dgenename[is.na(sam$dgenename)] <- "unresolved"
  
  sam$jgenename <- sapply(sam[,"All J hits"],function(x) unlist(strsplit(unlist(strsplit(x,","))[1],"\\*"))[1])
  sam$jgenename[is.na(sam$jgenename)] <- "unresolved"
  
  # continue by figuring out the stop and out of frame clones, then reorder the columns, then name them using immunoseq names.Then return sample
  #sam[,"AA. Seq. CDR3"]
  
  sam$SEQUENCESTATUS <- "In"
  sam$SEQUENCESTATUS[grep("\\*",sam[,"AA. Seq. CDR3"])] <- "Stop"
  
  reOrderColumnsToImmunoseqFormat <- c("Clonal sequence(s)","AA. Seq. CDR3","Clone count",
                                       "Clone fraction","CDR3length","vgenename","dgenename","jgenename","SEQUENCESTATUS","N. Seq. CDR3") 
  
  sam <- sam[,reOrderColumnsToImmunoseqFormat]
  
  colnames(sam) <- c("NUCLEOTIDE","AMINOACID","COUNT","FREQUENCYCOUNT","CDR3LENGTH","VGENENAME","DGENENAME","JGENENAME","SEQUENCESTATUS","CDR3NT")
  colnames(sam) <- toupper(colnames(sam))
  
  # filter sample for only productive or unproductive clonotypes
  
  if(cloneStatus == "p"){
    sam = getProductive(sam)
  }else if(cloneStatus == "unp"){
    sam = getUnproductive(sam)
  }
  
  return(sam)
  
}


renameCopyTOCount <- function(samplename){
  # change some variable names
  for(rep in reps){
    sam <- get(rep)
    
    if(sum(colnames(sam) == "COPY") == 1){
      
      colnames(sam)[which(colnames(sam) == "COPY")] <- "UNNORMALIZEDCOUNT"
      colnames(sam)[which(colnames(sam) == "NORMALIZEDCOPY")] <- "COUNT"
      colnames(sam)[which(colnames(sam) == "NORMALIZEDFREQUENCY")] <- "FREQUENCYCOUNT"
      
      
    }
    
  }

}

extractCDR3 <- function(samplename){
  for(rep in reps){
    p <- get(rep)
    CDR3NT = substring(p$NUCLEOTIDE,p$VINDEX,p$VINDEX+p$CDR3LENGTH)
    p <- data.frame(p,CDR3NT)
    
  }
  
}





#### Important Data related functions : eg. extract productive clones only etc. ####

#' Get productive only CDR3s from a repertoire data (Repseq sample) in immunoseq format
#' 
#' @param rep name of the repertoire sample (which has already been read in with for example with read.table)
#' @return a subset of the repertoire (rep) containing only productive CDR3s is returned
#' 

getProductive <- function(rep) {
  p <- rep[rep$SEQUENCESTATUS=="Productive" | rep$SEQUENCESTATUS=="In",]
  return(p)
}

#' Get unproductive CDR3s from a repertoire data (Repseq sample) in immunoseq format
#' 
#' @param rep name of the repertoire sample (which has already been read in with for example with read.table)
#' @return a subset of the repertoire (rep) containing only unproductive CDR3s is returned
#' 

getUnproductive <- function(rep) {
  p <- rep[!(rep$SEQUENCESTATUS=="Productive" | rep$SEQUENCESTATUS=="In"),]
  return(p)
}

#' Get out of frame CDR3s from a repertoire data (Repseq sample) in immunoseq format
#' 
#' @param rep name of the repertoire sample (which has already been read in with for example with read.table)
#' @return a subset of the repertoire (rep) containing only out of frame CDR3s is returned
#' 
getOutOfFrame <- function(rep) {
  p <- rep[rep$SEQUENCESTATUS=="Out of frame" | rep$SEQUENCESTATUS=="Out",]
  return(p)
}

#' Get stop codon containing CDR3s from a repertoire data (Repseq sample) in immunoseq format
#' 
#' @param rep name of the repertoire sample (which has already been read in with for example with read.table)
#' @return a subset of the repertoire (rep) containing CDR3s with stop codon is returned
#' 
getHasStop <- function(rep) {
  p <- rep[rep$SEQUENCESTATUS=="Has stop" | rep$SEQUENCESTATUS=="Stop",]
  return(p)
}

#' Get CDR3s that have well defined V gene name from a repertoire data (Repseq sample) in immunoseq format
#' 
#' @param rep name of the repertoire sample (which has already been read in with for example with read.table)
#' @return a subset of the repertoire (rep) containing CDR3s with well defined v gene name is returned
#' 
getDefined <- function(rep) {
  p <- rep[rep$VGENENAME != "(undefined)" | rep$VGENENAME!= "unresolved",]
  return(p)
}



#### Getting all V and J genes in our data:  ####


getAllVgenes <- function(reps) 
{
  vgenes_GB=c()
  for(rep in reps){
    p <- get(rep)
    vgenes_GB=union(vgenes_GB,getUniqueVgenes(p))
  }
  
  return(sort(vgenes_GB))
}

getAllVfamilies <- function(reps) 
{
  vfam=c()
  for(rep in reps){
    p <- get(rep)
    vfam=union(vfam,getUniqueVfamilies(p))
  }
  
  return(sort(vfam))
}

getAllJgenes <- function(reps) 
{
  jgenes_GB=c()
  for(rep in reps){
    p <- get(rep)
    jgenes_GB=union(jgenes_GB,getUniqueJgenes(p))
  }
  
  return(sort(jgenes_GB))
}

getAllVJgenes <- function(reps) 
{
  jgenes=getAllJgenes(reps)
  vgenes=getAllVgenes(reps)
  ## All possible VJ combinations 
  vj <- expand.grid(vgenes,jgenes)
  colnames(vj) <- c("VGENENAME","JGENENAME")
  return(vj)
}


getUniqueVgenes <- function(rep) {
  p <- rep[rep$VGENENAME != "(undefined)" | rep$VGENENAME != "unresolved",]
  return(unique(p$VGENENAME)) # [-1] -1 inorder to drop the undefined Vgene type
}

getUniqueVfamilies <- function(rep) {
  p <- rep[rep$VFAMILYNAME != "(undefined)" | rep$VFAMILYNAME != "unresolved",]
  return(unique(p$VFAMILYNAME)) # [-1] -1 inorder to drop the undefined Vgene type
}

getUniqueVfamilies_PBMC <- function(rep) {
  p <- rep[rep$VFAMILYNAME != "(undefined)" | rep$VFAMILYNAME != "unresolved",]
  return(unique(p$VFAMILYNAME)) # [-1] -1 inorder to drop the undefined Vgene type
}

getUniqueJgenes <- function(rep) {
  p <- rep[rep$JGENENAME != "(undefined)" | rep$JGENENAME != "unresolved",]
  return(unique(p$JGENENAME))
}


#### V, J or VJ usage counts and proportions : 

# clone level
getVgeneCloneCountPMatrix<- function(reps,g){
  ### Returns a datamatrix whose first two colums show the V gene. The remaining columns show 
  ### The proportions of available clones using each v gene for each sample
  
  #reps = a vector of dataframe names. The data frames are TCR repertoire data. 
  
  m <- as.data.frame(g)
  colnames(m)  <- c("VGENENAME")
  for(rep in reps){
    print(rep)
    p <- get(rep)
    p <- getProductive(p)
    print (nrow(p))
    #p <- getDefined(p) 
    vfamAgg <- aggregate(AMINOACID~ VGENENAME ,data = p, FUN = length)
    vfamAgg[,2] <- vfamAgg[,2]/sum(vfamAgg[,2]) # get the proportion
    colnames(vfamAgg) <- c("VGENENAME",rep)  
    m <- merge(m,vfamAgg, by="VGENENAME",all.x=T) 
    
  }
  
 
  m[is.na(m)] <- 0
  
  rownames(m) <- m[,1]
  m <- m[,-1]
  
  return(m)
}

getVgeneCloneCountMatrix <- function(reps,g){
  ### Returns a datamatrix whose first two colums show the V gene. The remaining columns show 
  ### The number of available clones using each v gene for each sample
  
  #reps = a vector of dataframe names. The data frames are TCR repertoire data
  
  m <- as.data.frame(g)
  colnames(m)  <- c("VGENENAME")
  for(rep in reps){
    print(rep)
    p <- get(rep)
    p <- getProductive(p)
    print (nrow(p))
    #p <- getDefined(p) 
    vgeneAgg <- aggregate(AMINOACID~ VGENENAME ,data = p, FUN = length)
    colnames(vgeneAgg) <- c("VGENENAME",rep)  
    m <- merge(m,vgeneAgg, by="VGENENAME",all.x=T) 
    
  }
  m[is.na(m)] <- 0
  
  rownames(m) <- m[,1]
  m <- m[,-1]
  return(m)
}


getVgeneCloneCountPMatrixUnprod<- function(reps,g){
  ### Returns a datamatrix whose first two colums show the V gene. The remaining columns show 
  ### The proportions of available unproductive clones using each v gene for each sample
  
  #reps = a vector of dataframe names. The data frames are TCR repertoire data
  
  m <- as.data.frame(g)
  colnames(m)  <- c("VGENENAME")
  for(rep in reps){
    print(rep)
    p <- get(rep)
    p <- getUnproductive(p)
    print (nrow(p))
    #p <- getDefined(p) 
    vfamAgg <- aggregate(AMINOACID~ VGENENAME ,data = p, FUN = length)
    vfamAgg[,2] <- vfamAgg[,2]/sum(vfamAgg[,2]) # get the proportion
    colnames(vfamAgg) <- c("VGENENAME",paste(rep)) 
    m <- merge(m,vfamAgg, by="VGENENAME",all.x=T) 
    
  }
  m[is.na(m)] <- 0
  
  rownames(m) <- m[,1]
  m <- m[,-1]
  return(m)
}

getVgeneCloneCountMatrixUnprod <- function(reps,g){
  ### Returns a datamatrix whose first two colums show the V gene. The remaining columns show 
  ### The number of available unproductive clones using each v gene for each sample
  
  #reps = a vector of dataframe names. The data frames are TCR repertoire data
  
  m <- as.data.frame(g)
  colnames(m)  <- c("VGENENAME")
  for(rep in reps){
    print(rep)
    p <- get(rep)
    p <- getUnproductive(p)
    print (nrow(p))
    #p <- getDefined(p) 
    vgeneAgg <- aggregate(AMINOACID~ VGENENAME ,data = p, FUN = length)
    colnames(vgeneAgg) <- c("VGENENAME",rep)  
    m <- merge(m,vgeneAgg, by="VGENENAME",all.x=T) 
    
  }
  m[is.na(m)] <- 0
  rownames(m) <- m[,1]
  m <- m[,-1]
  return(m)
}


getVjCloneCountMatrix<- function(reps,vj){
  ### Returns a datamatrix whose first two colums show the Vj gene combinations. The remaining columns show 
  ### The number of available clones using each vj gene combination for each sample
  
  #reps = a vector of dataframe names. The data frames are TCR repertoire data
  
  m <- vj
  colnames(m)  <- c("VGENENAME","JGENENAME")
  for(rep in reps){
    print(rep)
    p <- get(rep)
    p <- getProductive(p)
    #p <- getDefined(p) 
    vjPairAgg <- aggregate(AMINOACID~ VGENENAME + JGENENAME   ,data = p, FUN = length)
    print(head(vjPairAgg))
    colnames(vjPairAgg) <- c("VGENENAME","JGENENAME",rep)  
    m <- merge(m,vjPairAgg, by=c("VGENENAME","JGENENAME"),all.x=T) 
    
  }
  m[is.na(m)] <- 0
  m[,1]<-paste(m[,1],m[,2])
  m<-m[,-2]
  
  rownames(m) <- m[,1]
  m <- m[,-1]
  return(m)
}

getVjCloneCountMatrixUnprod<- function(reps,vj){
  ### Returns a datamatrix whose first two colums show the Vj gene. The remaining columns show 
  ### The number of available unproductive clones using each vj gene for each sample
  
  #reps = a vector of dataframe names. The data frames are TCR repertoire data
  
  m <- vj
  colnames(m)  <- c("VGENENAME","JGENENAME")
  
  for(rep in reps){
    print(rep)
    p <- get(rep)
    p <- getUnproductive(p)
    #p <- getDefined(p) 
    vjPairAgg <- aggregate(AMINOACID~ VGENENAME + JGENENAME   ,data = p, FUN = length)
    colnames(vjPairAgg) <- c("VGENENAME","JGENENAME",rep)  
    m <- merge(m,vjPairAgg, by=c("VGENENAME","JGENENAME"),all.x=T) 
    
  }
  m[is.na(m)] <- 0
  m[,1]<-paste(m[,1],m[,2])
  m<-m[,-2]
  
  rownames(m) <- m[,1]
  m <- m[,-1]
  return(m)
}

getVjCloneCountPMatrix<- function(reps,vj){
  ### Returns a datamatrix whose first two colums show the Vj gene. The remaining columns show 
  ### The proportions of available clones using each vj gene for each sample
  
  #reps = a vector of dataframe names. The data frames are TCR repertoire data
  
  m <- vj
  colnames(m)  <- c("VGENENAME","JGENENAME")
  
  for(rep in reps){
    print(rep)
    p <- get(rep)
    p <- getProductive(p)
    #p <- getDefined(p) 
    vjPairAgg <- aggregate(AMINOACID~ VGENENAME + JGENENAME   ,data = p, FUN = length)
    vjPairAgg[,3] <- vjPairAgg[,3]/sum(vjPairAgg[,3]) # get the proportion
    colnames(vjPairAgg) <- c("VGENENAME","JGENENAME",rep)  
    m <- merge(m,vjPairAgg, by=c("VGENENAME","JGENENAME"),all.x=T) 
    
  }
  m[is.na(m)] <- 0
  m[,1]<-paste(m[,1],m[,2])
  m<-m[,-2]
  
  rownames(m) <- m[,1]
  m <- m[,-1]
  return(m)
}

getVjCloneCountPMatrixUnprod<- function(reps,vj){
  ### Returns a datamatrix whose first two colums show the Vj gene. The remaining columns show 
  ### The proportions of available unproductive clones using each vj gene for each sample
  
  #reps = a vector of dataframe names. The data frames are TCR repertoire data
  
  m <- vj
  colnames(m)  <- c("VGENENAME","JGENENAME")
  
  for(rep in reps){
    print(rep)
    p <- get(rep)
    p <- getUnproductive(p)
    #p <- getDefined(p) 
    vjPairAgg <- aggregate(AMINOACID~ VGENENAME + JGENENAME   ,data = p, FUN = length)
    vjPairAgg[,3] <- vjPairAgg[,3]/sum(vjPairAgg[,3]) # get the proportion
    colnames(vjPairAgg) <- c("VGENENAME","JGENENAME",rep)  
    m <- merge(m,vjPairAgg, by=c("VGENENAME","JGENENAME"),all.x=T) 
    
  }
  m[is.na(m)] <- 0
  m[,1]<-paste(m[,1],m[,2])
  m<-m[,-2]
  
  rownames(m) <- m[,1]
  m <- m[,-1]
  return(m)
}


# read level

getVgeneReadCountPMatrix<- function(reps,g){
  ### Returns a datamatrix whose first two colums show the V gene. The remaining columns show 
  ### The proportions of available clones using each v gene for each sample
  
  #reps = a vector of dataframe names. The data frames are TCR repertoire data. 
  
  m <- as.data.frame(g)
  colnames(m)  <- c("VGENENAME")
  for(rep in reps){
    print(rep)
    p <- get(rep)
    p <- getProductive(p)
    print (nrow(p))
    #p <- getDefined(p) 
    vfamAgg <- aggregate(COUNT~ VGENENAME ,data = p, FUN = sum)
    vfamAgg[,2] <- vfamAgg[,2]/sum(vfamAgg[,2]) # get the proportion
    colnames(vfamAgg) <- c("VGENENAME",rep)  
    m <- merge(m,vfamAgg, by="VGENENAME",all.x=T) 
    
  }
  m[is.na(m)] <- 0
  
  rownames(m) <- m[,1]
  m <- m[,-1]
  return(m)
}

getVgeneReadCountMatrix <- function(reps,g){
  ### Returns a datamatrix whose first two colums show the V gene. The remaining columns show 
  ### The number of available clones using each v gene for each sample
  
  #reps = a vector of dataframe names. The data frames are TCR repertoire data
  
  m <- as.data.frame(g)
  colnames(m)  <- c("VGENENAME")
  for(rep in reps){
    print(rep)
    p <- get(rep)
    p <- getProductive(p)
    print (nrow(p))
    #p <- getDefined(p) 
    vgeneAgg <- aggregate(COUNT~ VGENENAME ,data = p, FUN = sum)
    colnames(vgeneAgg) <- c("VGENENAME",rep)  
    m <- merge(m,vgeneAgg, by="VGENENAME",all.x=T) 
    
  }
  m[is.na(m)] <- 0
  rownames(m) <- m[,1]
  m <- m[,-1]
  return(m)
}

getVgeneReadCountPMatrixUnprod<- function(reps,g){
  ### Returns a datamatrix whose first two colums show the V gene. The remaining columns show 
  ### The proportions of available unproductive clones using each v gene for each sample
  
  #reps = a vector of dataframe names. The data frames are TCR repertoire data
  
  m <- as.data.frame(g)
  colnames(m)  <- c("VGENENAME")
  for(rep in reps){
    print(rep)
    p <- get(rep)
    p <- getUnproductive(p)
    print (nrow(p))
    #p <- getDefined(p) 
    vfamAgg <- aggregate(COUNT~ VGENENAME ,data = p, FUN = sum)
    vfamAgg[,2] <- vfamAgg[,2]/sum(vfamAgg[,2]) # get the proportion
    colnames(vfamAgg) <- c("VGENENAME",paste(rep)) 
    m <- merge(m,vfamAgg, by="VGENENAME",all.x=T) 
    
  }
  m[is.na(m)] <- 0
  rownames(m) <- m[,1]
  m <- m[,-1]
  return(m)
}

getVgeneReadCountMatrixUnprod <- function(reps,g){
  ### Returns a datamatrix whose first two colums show the V gene. The remaining columns show 
  ### The number of available unproductive clones using each v gene for each sample
  
  #reps = a vector of dataframe names. The data frames are TCR repertoire data
  
  m <- as.data.frame(g)
  colnames(m)  <- c("VGENENAME")
  for(rep in reps){
    print(rep)
    p <- get(rep)
    p <- getUnproductive(p)
    print (nrow(p))
    #p <- getDefined(p) 
    vgeneAgg <- aggregate(COUNT~ VGENENAME ,data = p, FUN = sum)
    colnames(vgeneAgg) <- c("VGENENAME",rep)  
    m <- merge(m,vgeneAgg, by="VGENENAME",all.x=T) 
    
  }
  m[is.na(m)] <- 0
  
  rownames(m) <- m[,1]
  m <- m[,-1]
  return(m)
}

getVjReadCountMatrix<- function(reps,vj){
  ### Returns a datamatrix whose first two colums show the vj gene. The remaining columns show 
  ### The number of available clones using each vj gene for each sample
  
  #reps = a vector of dataframe names. The data frames are TCR repertoire data
  
  m <- vj
  colnames(m)  <- c("VGENENAME","JGENENAME")
  
  for(rep in reps){
    print(rep)
    p <- get(rep)
    p <- getProductive(p)
    #p <- getDefined(p) 
    vjPairAgg <- aggregate(COUNT~ VGENENAME + JGENENAME   ,data = p, FUN = sum)
    colnames(vjPairAgg) <- c("VGENENAME","JGENENAME",rep)  
    m <- merge(m,vjPairAgg, by=c("VGENENAME","JGENENAME"),all.x=T) 
    
  }
  m[is.na(m)] <- 0
  m[,1]<-paste(m[,1],m[,2])
  m<-m[,-2]
  
  rownames(m) <- m[,1]
  m <- m[,-1]
  return(m)
}

getVjReadCountMatrixUnprod<- function(reps,vj){
  ### Returns a datamatrix whose first two colums show the Vj gene. The remaining columns show 
  ### The number of available unproductive clones using each vj gene for each sample
  
  #reps = a vector of dataframe names. The data frames are TCR repertoire data
  
  m <- vj
  colnames(m)  <- c("VGENENAME","JGENENAME")
  
  for(rep in reps){
    print(rep)
    p <- get(rep)
    p <- getUnproductive(p)
    #p <- getDefined(p) 
    vjPairAgg <- aggregate(COUNT~ VGENENAME + JGENENAME   ,data = p, FUN = sum)
    colnames(vjPairAgg) <- c("VGENENAME","JGENENAME",rep)  
    m <- merge(m,vjPairAgg, by=c("VGENENAME","JGENENAME"),all.x=T) 
    
  }
  m[is.na(m)] <- 0
  m[,1]<-paste(m[,1],m[,2])
  m<-m[,-2]
  
  rownames(m) <- m[,1]
  m <- m[,-1]
  return(m)
}

getVjReadCountPMatrix<- function(reps,vj){
  ### Returns a datamatrix whose first two colums show the Vj gene. The remaining columns show 
  ### The proportions of available clones using each vj gene for each sample
  
  #reps = a vector of dataframe names. The data frames are TCR repertoire data
  
  m <- vj
  colnames(m)  <- c("VGENENAME","JGENENAME")
  
  for(rep in reps){
    print(rep)
    p <- get(rep)
    p <- getProductive(p)
    #p <- getDefined(p) 
    vjPairAgg <- aggregate(COUNT~ VGENENAME + JGENENAME   ,data = p, FUN = sum)
    vjPairAgg[,3] <- vjPairAgg[,3]/sum(vjPairAgg[,3]) # get the proportion
    colnames(vjPairAgg) <- c("VGENENAME","JGENENAME",rep)  
    m <- merge(m,vjPairAgg, by=c("VGENENAME","JGENENAME"),all.x=T) 
    
  }
  m[is.na(m)] <- 0
  m[,1]<-paste(m[,1],m[,2])
  m<-m[,-2]
  
  rownames(m) <- m[,1]
  m <- m[,-1]
  return(m)
}

getVjReadCountPMatrixUnprod<- function(reps,vj){
  ### Returns a datamatrix whose first two colums show the Vj gene. The remaining columns show 
  ### The proportions of available unprod. clones using each vj gene for each sample
  
  #reps = a vector of dataframe names. The data frames are TCR repertoire data
  
  m <- vj
  colnames(m)  <- c("VGENENAME","JGENENAME")
  
  for(rep in reps){
    print(rep)
    p <- get(rep)
    p <- getUnproductive(p)
    #p <- getDefined(p) 
    vjPairAgg <- aggregate(COUNT~ VGENENAME + JGENENAME   ,data = p, FUN = sum)
    vjPairAgg[,3] <- vjPairAgg[,3]/sum(vjPairAgg[,3]) # get the proportion
    colnames(vjPairAgg) <- c("VGENENAME","JGENENAME",rep)  
    m <- merge(m,vjPairAgg, by=c("VGENENAME","JGENENAME"),all.x=T) 
    
  }
  m[is.na(m)] <- 0
  m[,1]<-paste(m[,1],m[,2])
  m<-m[,-2]
  
  rownames(m) <- m[,1]
  m <- m[,-1]
  return(m)
}



# General Counting function for V, J, VJ genes for each sequence status  ####

getAggregateCount <- function(reps,countfor="V",seqStatus="IN",lev="A",countsorpro="prop"){

  ### Returns a datamatrix whose first two colums show the features The remaining columns show 
  ### The number of available clones for each feature :
  
  #reps - a vector of dataframe names. The data frames are TCR repertoire data.
  # countfor - the features to be counted, default V for VgeneNames, vf for V family, J for J gene, VJ for vj genes
  # seqStatus - for which sequences is the count to be done. default In for productive, Out for out of frame, Stop for stop codon containing sequences
  # lev - at which level is the counting to be done . default A for Amino acid, N for nucleotide level
  # countsorpro - return counts raw counts or proportions
  
  # setting work parameters
  countfor = toupper(countfor)
  seqStatus= toupper(seqStatus)
  lev = toupper(lev)
  countsorpro= toupper(countsorpro)
  
  print(countfor)
  
  if(countfor=="V"){
    
    
    features= getAllVgenes(reps)
    
    m <- as.data.frame(features)
    colnames(m)  <- c("VGENENAME")
    
    
      for(rep in reps){
        print(rep)
        p <- get(rep)
        
        if(seqStatus=="IN"){
          p <- getProductive(p)
        }else if(seqStatus=="OUT"){
          p <- getOutOfFrame(p)
        }else if(seqStatus=="STOP"){
          p <- getHasStop(p)
        }
        
        print (nrow(p))
     
        if(lev=="N"){
        dAgg <- aggregate(NUCLEOTIDE~ VGENENAME ,data = p, FUN = length)
        }else {
        dAgg <- aggregate(AMINOACID~ VGENENAME ,data = p, FUN = length) 
        }
        
        if(countsorpro=="PROP"){
        dAgg[,2] <- dAgg[,2]/sum(dAgg[,2]) # get the proportion
        }
        colnames(dAgg) <- c("VGENENAME",rep)  
        m <- merge(m,dAgg, by="VGENENAME",all.x=T) 
        
      }
      m[is.na(m)] <- 0
      
      rownames(m) <- m[,1]
      m <- m[,-1]
      
     
     
  }
  else if(countfor=="J"){
    features= getAllJgenes(reps)
    
    m <- as.data.frame(features)
    colnames(m)  <- c("JGENENAME")
    
    for(rep in reps){
      print(rep)
      p <- get(rep)
      
      if(seqStatus=="IN"){
        p <- getProductive(p)
      }else if(seqStatus=="OUT"){
        p <- getOutOfFrame(p)
      }else if(seqStatus=="STOP"){
        p <- getHasStop(p)
      }
      
      print (nrow(p))
      
      if(lev=="N"){
        dAgg <- aggregate(NUCLEOTIDE~ JGENENAME ,data = p, FUN = length)
      }else {
        dAgg <- aggregate(AMINOACID~ JGENENAME ,data = p, FUN = length) 
      }
      
      if(countsorpro=="PROP"){
      dAgg[,2] <- dAgg[,2]/sum(dAgg[,2]) # get the proportion
      }
      colnames(dAgg) <- c("JGENENAME",rep)  
      m <- merge(m,dAgg, by="JGENENAME",all.x=T) 
      
    }
    m[is.na(m)] <- 0
    
    rownames(m) <- m[,1]
    m <- m[,-1]
    
    
    
  }
  
  else if(countfor=="VF"){
    features= getAllVfamilies(reps)
    
    m <- as.data.frame(features)
    colnames(m)  <- c("VFAMILYNAME")
    
    for(rep in reps){
      print(rep)
      p <- get(rep)
      
      if(seqStatus=="IN"){
        p <- getProductive(p)
      }else if(seqStatus=="OUT"){
        p <- getOutOfFrame(p)
      }else if(seqStatus=="STOP"){
        p <- getHasStop(p)
      }
      
      print (nrow(p))
      
      if(lev=="N"){
        dAgg <- aggregate(NUCLEOTIDE~ VFAMILYNAME ,data = p, FUN = length)
      }else {
        dAgg <- aggregate(AMINOACID~ VFAMILYNAME ,data = p, FUN = length) 
      }
      
      if(countsorpro=="PROP"){
      dAgg[,2] <- dAgg[,2]/sum(dAgg[,2]) # get the proportion
      }
      colnames(dAgg) <- c("VFAMILYNAME",rep)  
      m <- merge(m,dAgg, by="VFAMILYNAME",all.x=T) 
      
    }
    m[is.na(m)] <- 0
    
    rownames(m) <- m[,1]
    m <- m[,-1]
    
     
  }
  
  else if(countfor=="VJ"){
    features= getAllVJgenes(reps)
    
    m <- features
    colnames(m)  <- c("VGENENAME","JGENENAME")
    
    for(rep in reps){
      print(rep)
      p <- get(rep)
      
      if(seqStatus=="IN"){
        p <- getProductive(p)
      }else if(seqStatus=="OUT"){
        p <- getOutOfFrame(p)
      }else if(seqStatus=="STOP"){
        p <- getHasStop(p)
      }
      
      print (nrow(p))
      
      if(lev=="N"){
        dAgg <- aggregate(NUCLEOTIDE~ VGENENAME + JGENENAME ,data = p, FUN = length)
      }else {
        dAgg <- aggregate(AMINOACID~ VGENENAME + JGENENAME ,data = p, FUN = length) 
      }
      
      if(countsorpro=="PROP"){
      dAgg[,3] <- dAgg[,3]/sum(dAgg[,3]) # get the proportion
      }
      colnames(dAgg) <- c("VGENENAME","JGENENAME",rep)  
      m <- merge(m,dAgg, by=c("VGENENAME","JGENENAME"),all.x=T) 
      
    }
    m[is.na(m)] <- 0
    m[,1]<-paste(m[,1],m[,2])
    m<-m[,-2]
    
    rownames(m) <- m[,1]
    m <- m[,-1]
    
  }
  
  
  return(m)
 
}


# get overlapping clonotype counts for every pair of samples: using top 5% or 10% etc #### 

getAbundantClonotypesbyperc<- function(rep,topCutoff=10){
  
  # Return the top abundant clonotypes using the topcutoff , default is top 5%
  
  print(nrow(rep))
  y=rep$COUNT/sum(rep$COUNT)
  y=cumsum(y) #cumulitive read size proportion at each clone, ordered from biggest to smallest
  
  cuttingPoints=topCutoff/100
  y=cut(y,seq(0,1,by=cuttingPoints)) # Cut the cumulitive proportion values in y by 10% each
  
  
  # We can split the whole data frame using the 5% split factor
  
  rep_subgroups=split(rep,y)
  
  
  return(rep_subgroups[[1]])
  
  
}

getAbundantClonotypesbynumber<- function(rep,topCutoff=10){
  
  # Return the top abundant clonotypes using the topcutoff , default is top 10
  # it assumes rep is already sorted from biggest clone to smallest
  
  print(nrow(rep))
  
  return(rep[1:topCutoff,])
  
  
}


overLappingClonotypes<- function(reps,cutoff=20,cutoffby="perc"){
  ### Returns a datamatrix of top topnumber clonotopes from each sample plus: overlap count or Jaccard index, for prod and non productive sequences, N insertion size
  
  
  # setting work parameters
  cutoffby = toupper(cutoffby)
 
  
  m <- as.data.frame(expand.grid(reps,reps))
  colnames(m)  <- c("Sample1","Sample2")
  
  toadd= data.frame()
  
 
  
  for(r in 1:nrow(m)){
    
    overlapValuesProd=c()
    overlapValuesUnprod=c()
    overlapValuesProdJI=c()
    overlapValuesUnprodJI=c()
    
    overlapValuesProdJI_abundancebased=c()
    overlapValuesUnprodJI_abundancebased=c()
    
    NinsertionProd1=c()
    NinsertionUnprod1=c()
    
    NinsertionProd2=c()
    NinsertionUnprod2=c()
    
    
    print(r)
    sam1=m[r,1]
    sam2=m[r,2]
    
    #print(sam1)
    #print(sam2)
  
    sam1 <- get(as.character(sam1))
    sam2 <- get(as.character(sam2))
    
    
    sam1Prod <- getProductive(sam1)
    sam1Unprod <- getUnproductive(sam1)
    
    sam2Prod <- getProductive(sam2)
    sam2Unprod <- getUnproductive(sam2)
   
 
    if(cutoffby=="PERC"){
      topclones_sam1Prod = getAbundantClonotypesbyperc(sam1Prod,cutoff)
      topclones_sam1Unprod = getAbundantClonotypesbyperc(sam1Unprod,cutoff)
      
      topclones_sam2Prod = getAbundantClonotypesbyperc(sam2Prod,cutoff)
      topclones_sam2Unprod = getAbundantClonotypesbyperc(sam2Unprod,cutoff)
      
      # commons
      commonClonesProd=length(intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE))
      unionProd=length(union(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE))
      
      # commonality checked at the Nt level for non productive sequences, no AA given in the original data
      commonClonesUnprod=length(intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE))
      unionUnprod=length(union(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE))
      
      
      overlapValuesProd=c(overlapValuesProd,as.numeric(commonClonesProd))
      overlapValuesUnprod=c(overlapValuesUnprod,as.numeric(commonClonesUnprod))
      
      overlapValuesProdJI=c(overlapValuesProdJI,formatC(commonClonesProd/unionProd,digits= 8,format="f"))
      overlapValuesUnprodJI = c(overlapValuesUnprodJI,formatC(commonClonesUnprod/unionUnprod,digits= 8,format="f"))
      
      # Jaccard abundance-based similarity index (UV/(U+V-UV)). U is total relative abundance in of shared sample 1, U total relative of shared in sample 2
      
      commonClonesProd_U_data= topclones_sam1Prod[topclones_sam1Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonesProd_U = sum(commonClonesProd_U_data$COUNT)/sum(topclones_sam1Prod$COUNT)
      
      commonClonesProd_V_data= topclones_sam2Prod[topclones_sam2Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonesProd_V = sum(commonClonesProd_V_data$COUNT)/sum(topclones_sam2Prod$COUNT)
      
      uv=commonClonesProd_U*commonClonesProd_V
      uplusv=commonClonesProd_U+commonClonesProd_V
      
      AbundanceBasedJIProd = format(uv/(uplusv-uv),digits=8,format="f")
      
      overlapValuesProdJI_abundancebased=c(overlapValuesProdJI_abundancebased,as.numeric(AbundanceBasedJIProd))
      
        # for unproductive sequences
      
      commonClonesUnProd_U_data= topclones_sam1Unprod[topclones_sam1Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonesUnProd_U = sum(commonClonesUnProd_U_data$COUNT)/sum(topclones_sam1Unprod$COUNT)
      
      commonClonesUnProd_V_data= topclones_sam2Unprod[topclones_sam2Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonesUnProd_V = sum(commonClonesUnProd_V_data$COUNT)/sum(topclones_sam2Unprod$COUNT)
      
      uv_UnProd=commonClonesUnProd_U * commonClonesUnProd_V
      uplusv_UnProd=commonClonesUnProd_U + commonClonesUnProd_V
      
      AbundanceBasedJIUnProd = formatC(uv_UnProd/(uplusv_UnProd-uv_UnProd),digits=8,format="f")
      
      overlapValuesUnprodJI_abundancebased=c(overlapValuesUnprodJI_abundancebased,as.numeric(AbundanceBasedJIUnProd))
     
      
 
      # Ninsertions  in common sequences
      
      commonClonotypes1= topclones_sam1Prod[topclones_sam1Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonotypes2= topclones_sam2Prod[topclones_sam2Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      
      commonClonotypes1Unprod= topclones_sam1Unprod[topclones_sam1Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonotypes2Unprod= topclones_sam2Unprod[topclones_sam2Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      
      
      NinsertionsP1= mean(commonClonotypes1$N2INSERTION + commonClonotypes1$N1INSERTION)
      NinsertionsP2 = mean(commonClonotypes2$N2INSERTION + commonClonotypes2$N1INSERTION)
      
      NinsertionsUP1= mean(commonClonotypes1Unprod$N2INSERTION + commonClonotypes1Unprod$N1INSERTION)
      NinsertionsUP2 = mean(commonClonotypes2Unprod$N2INSERTION + commonClonotypes2Unprod$N1INSERTION)
      
      
      NinsertionProd1=c(NinsertionProd1,NinsertionsP1)
      NinsertionProd2= c(NinsertionProd2,NinsertionsP2)
      
      NinsertionUnprod1=c(NinsertionUnprod1,NinsertionsUP1)
      NinsertionUnprod2=c(NinsertionUnprod2,NinsertionsUP2)
      
      
    }else{
      topclones_sam1Prod = getAbundantClonotypesbynumber(sam1Prod,cutoff)
      topclones_sam1Unprod = getAbundantClonotypesbynumber(sam1Unprod,cutoff)
      
      topclones_sam2Prod = getAbundantClonotypesbynumber(sam2Prod,cutoff)
      topclones_sam2Unprod = getAbundantClonotypesbynumber(sam2Unprod,cutoff)
      
      
      # commons
      # commons
      commonClonesProd=length(intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE))
      unionProd=length(union(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE))
      
      # commonality checked at the Nt level for non productive sequences, no AA given in the original data
      commonClonesUnprod=length(intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE))
      unionUnprod=length(union(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE))
      
      
      overlapValuesProd=c(overlapValuesProd,as.numeric(commonClonesProd))
      overlapValuesUnprod=c(overlapValuesUnprod,as.numeric(commonClonesUnprod))
      
      overlapValuesProdJI=c(overlapValuesProdJI,formatC(commonClonesProd/unionProd,digits= 8,format="f"))
      overlapValuesUnprodJI = c(overlapValuesUnprodJI,formatC(commonClonesUnprod/unionUnprod,digits= 8,format="f"))
      
      
      # Jaccard abundance-based similarity index (UV/(U+V-UV)). U is total relative abundance in of shared sample 1, U total relative of shared in sample 2
      
      commonClonesProd_U_data= topclones_sam1Prod[topclones_sam1Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonesProd_U = sum(commonClonesProd_U_data$COUNT)/sum(topclones_sam1Prod$COUNT)
      
      commonClonesProd_V_data= topclones_sam2Prod[topclones_sam2Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonesProd_V = sum(commonClonesProd_V_data$COUNT)/sum(topclones_sam2Prod$COUNT)
      
      uv=commonClonesProd_U*commonClonesProd_V
      uplusv=commonClonesProd_U+commonClonesProd_V
      
      AbundanceBasedJIProd = format(uv/(uplusv-uv),digits=8,format="f")
      
      overlapValuesProdJI_abundancebased=c(overlapValuesProdJI_abundancebased,as.numeric(AbundanceBasedJIProd))
      
      # for unproductive sequences
      
      commonClonesUnProd_U_data= topclones_sam1Unprod[topclones_sam1Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonesUnProd_U = sum(commonClonesUnProd_U_data$COUNT)/sum(topclones_sam1Unprod$COUNT)
      
      commonClonesUnProd_V_data= topclones_sam2Unprod[topclones_sam2Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonesUnProd_V = sum(commonClonesUnProd_V_data$COUNT)/sum(topclones_sam2Unprod$COUNT)
      
      uv_UnProd=commonClonesUnProd_U * commonClonesUnProd_V
      uplusv_UnProd=commonClonesUnProd_U + commonClonesUnProd_V
      
      AbundanceBasedJIUnProd = formatC(uv_UnProd/(uplusv_UnProd-uv_UnProd),digits=8,format="f")
      
      overlapValuesUnprodJI_abundancebased=c(overlapValuesUnprodJI_abundancebased,as.numeric(AbundanceBasedJIUnProd))
      
      
      
      # Ninsertions  in common sequences
      
      commonClonotypes1= topclones_sam1Prod[topclones_sam1Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonotypes2= topclones_sam2Prod[topclones_sam2Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      
      commonClonotypes1Unprod= topclones_sam1Unprod[topclones_sam1Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonotypes2Unprod= topclones_sam2Unprod[topclones_sam2Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      
      
      NinsertionsP1= mean(commonClonotypes1$N2INSERTION + commonClonotypes1$N1INSERTION)
      NinsertionsP2 = mean(commonClonotypes2$N2INSERTION + commonClonotypes2$N1INSERTION)
      
      NinsertionsUP1= mean(commonClonotypes1Unprod$N2INSERTION + commonClonotypes1Unprod$N1INSERTION)
      NinsertionsUP2 = mean(commonClonotypes2Unprod$N2INSERTION + commonClonotypes2Unprod$N1INSERTION)
      
      
      NinsertionProd1=c(NinsertionProd1,NinsertionsP1)
      NinsertionProd2= c(NinsertionProd2,NinsertionsP2)
      
      NinsertionUnprod1=c(NinsertionUnprod1,NinsertionsUP1)
      NinsertionUnprod2=c(NinsertionUnprod2,NinsertionsUP2)
        
      }
    
    #totalNumberOfProdClonotypes= nrow(topclones_sam1Prod) + nrow(topclones_sam2Prod)
    #totalNumberOfUnprodClonotypes= nrow(topclones_sam1Unprod) + nrow(topclones_sam2Unprod)

    
    if(nrow(toadd) > 0){
    newrow=data.frame(overlapValuesProd,overlapValuesUnprod,overlapValuesProdJI,overlapValuesUnprodJI,overlapValuesProdJI_abundancebased,overlapValuesUnprodJI_abundancebased,NinsertionProd1,NinsertionProd2,NinsertionUnprod1,NinsertionUnprod2)
    colnames(newrow) <- c("OverlapCountProductive","OverlapCountUnProductive","OverlapProd_JI","OverlapUnProd_JI","AbundanceBased_JI","AbundanceBased_JI","NinsertionProdSample1","NinsertionProdSample2","NinsertionUnProdSample1","NinsertionUnProdSample2")
    
    toadd=rbind(toadd,newrow)
    }else{
      toadd=data.frame(overlapValuesProd,overlapValuesUnprod,overlapValuesProdJI,overlapValuesUnprodJI,overlapValuesProdJI_abundancebased,overlapValuesUnprodJI_abundancebased,NinsertionProd1,NinsertionProd2,NinsertionUnprod1,NinsertionUnprod2)
      colnames(toadd) <- c("OverlapCountProductive","OverlapCountUnProductive","OverlapProd_JI","OverlapUnProd_JI","AbundanceBased_JI","AbundanceBased_JI","NinsertionProdSample1","NinsertionProdSample2","NinsertionUnProdSample1","NinsertionUnProdSample2")  
    }
    
  }
   
  

    
  print(nrow(toadd))
  resultTable=data.frame(m,toadd)        
  
  write.table(resultTable,file="Pairwise Overlap Counts.txt",row.names = T,sep = "\t")
  return(resultTable)
}

overLappingClonotypes2<- function(reps,cutoff=20,cutoffby="perc"){
  ### Returns a datamatrix of top topnumber clonotopes from each sample plus: overlap count or Jaccard index, for prod and non productive sequences, N insertion size
  
  
  # setting work parameters
  cutoffby = toupper(cutoffby)
  
  
  m <- as.data.frame(expand.grid(reps,reps))
  colnames(m)  <- c("Sample1","Sample2")
  
  toadd= data.frame()
  
  
  
  for(r in 1:nrow(m)){
    
    overlapValuesProd=c()
    overlapValuesUnprod=c()
    overlapValuesProdJI=c()
    overlapValuesUnprodJI=c()
    
    overlapValuesProdJI_abundancebased=c()
    overlapValuesUnprodJI_abundancebased=c()
    
    NinsertionProd1=c()
    NinsertionUnprod1=c()
    
    NinsertionProd2=c()
    NinsertionUnprod2=c()
    
    
    print(r)
    sam1=m[r,1]
    sam2=m[r,2]
    
    #print(sam1)
    #print(sam2)
    
    sam1 <- get(as.character(sam1))
    sam2 <- get(as.character(sam2))
    
    
    sam1Prod <- getProductive(sam1)
    sam1Unprod <- getUnproductive(sam1)
    
    sam2Prod <- getProductive(sam2)
    sam2Unprod <- getUnproductive(sam2)
    
    
    if(cutoffby=="PERC"){
      topclones_sam1Prod = getAbundantClonotypesbyperc(sam1Prod,cutoff)
      topclones_sam1Unprod = getAbundantClonotypesbyperc(sam1Unprod,cutoff)
      
      topclones_sam2Prod = getAbundantClonotypesbyperc(sam2Prod,cutoff)
      topclones_sam2Unprod = getAbundantClonotypesbyperc(sam2Unprod,cutoff)
      
      # commons
      commonClonesProd=length(intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE)) 
      unionProd=length(union(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE)) 
      
      # commonality checked at the Nt level for non productive sequences, no AA given in the original data
      commonClonesUnprod=length(intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE))
      unionUnprod=length(union(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE))
      
      if(unionUnprod==0){
        unionUnprod = 1
      }
      
      if(unionProd==0){
        unionProd = 1
      }
      
      overlapValuesProd=c(overlapValuesProd,as.numeric(commonClonesProd))
      overlapValuesUnprod=c(overlapValuesUnprod,as.numeric(commonClonesUnprod))
      
      overlapValuesProdJI=c(overlapValuesProdJI,commonClonesProd/unionProd)
      overlapValuesUnprodJI = c(overlapValuesUnprodJI,commonClonesUnprod/unionUnprod)
      
      # Jaccard abundance-based similarity index (UV/(U+V-UV)). U is total relative abundance in of shared sample 1, U total relative of shared in sample 2
      
      commonClonesProd_U_data= topclones_sam1Prod[topclones_sam1Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonesProd_U = sum(commonClonesProd_U_data$COUNT)/sum(topclones_sam1Prod$COUNT)
      
      commonClonesProd_V_data= topclones_sam2Prod[topclones_sam2Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonesProd_V = sum(commonClonesProd_V_data$COUNT)/sum(topclones_sam2Prod$COUNT)
      
      uv=commonClonesProd_U*commonClonesProd_V
      uplusv=commonClonesProd_U+commonClonesProd_V
      
      AbundanceBasedJIProd = uv/(uplusv-uv)
      
      overlapValuesProdJI_abundancebased=c(overlapValuesProdJI_abundancebased,as.numeric(AbundanceBasedJIProd))
      
      # for unproductive sequences
      
      commonClonesUnProd_U_data= topclones_sam1Unprod[topclones_sam1Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonesUnProd_U = sum(commonClonesUnProd_U_data$COUNT)/sum(topclones_sam1Unprod$COUNT)
      
      commonClonesUnProd_V_data= topclones_sam2Unprod[topclones_sam2Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonesUnProd_V = sum(commonClonesUnProd_V_data$COUNT)/sum(topclones_sam2Unprod$COUNT)
      
      uv_UnProd=commonClonesUnProd_U * commonClonesUnProd_V
      uplusv_UnProd=commonClonesUnProd_U + commonClonesUnProd_V
      
      AbundanceBasedJIUnProd = uv_UnProd/(uplusv_UnProd-uv_UnProd)
      
      overlapValuesUnprodJI_abundancebased=c(overlapValuesUnprodJI_abundancebased,as.numeric(AbundanceBasedJIUnProd))
      
      
      
      # Ninsertions  in common sequences
      
      commonClonotypes1= topclones_sam1Prod[topclones_sam1Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonotypes2= topclones_sam2Prod[topclones_sam2Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      
      commonClonotypes1Unprod= topclones_sam1Unprod[topclones_sam1Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonotypes2Unprod= topclones_sam2Unprod[topclones_sam2Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      
      
      NinsertionsP1= mean(commonClonotypes1$N2INSERTION + commonClonotypes1$N1INSERTION)
      NinsertionsP2 = mean(commonClonotypes2$N2INSERTION + commonClonotypes2$N1INSERTION)
      
      NinsertionsUP1= mean(commonClonotypes1Unprod$N2INSERTION + commonClonotypes1Unprod$N1INSERTION)
      NinsertionsUP2 = mean(commonClonotypes2Unprod$N2INSERTION + commonClonotypes2Unprod$N1INSERTION)
      
      
      NinsertionProd1=c(NinsertionProd1,NinsertionsP1)
      NinsertionProd2= c(NinsertionProd2,NinsertionsP2)
      
      NinsertionUnprod1=c(NinsertionUnprod1,NinsertionsUP1)
      NinsertionUnprod2=c(NinsertionUnprod2,NinsertionsUP2)
      
      
    }else{
      topclones_sam1Prod = getAbundantClonotypesbynumber(sam1Prod,cutoff)
      topclones_sam1Unprod = getAbundantClonotypesbynumber(sam1Unprod,cutoff)
      
      topclones_sam2Prod = getAbundantClonotypesbynumber(sam2Prod,cutoff)
      topclones_sam2Unprod = getAbundantClonotypesbynumber(sam2Unprod,cutoff)
      
      
      # commons
      # commons
      commonClonesProd=length(intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE))
      unionProd=length(union(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE))
      
      # commonality checked at the Nt level for non productive sequences, no AA given in the original data
      commonClonesUnprod=length(intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE))
      unionUnprod=length(union(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE))
      
      if(unionUnprod==0){
        unionUnprod = 1
      }
      
      if(unionProd==0){
        unionProd = 1
      }
      
      overlapValuesProd=c(overlapValuesProd,as.numeric(commonClonesProd))
      overlapValuesUnprod=c(overlapValuesUnprod,as.numeric(commonClonesUnprod))
      
      overlapValuesProdJI=c(overlapValuesProdJI,commonClonesProd/unionProd)
      overlapValuesUnprodJI = c(overlapValuesUnprodJI,commonClonesUnprod/unionUnprod)
      
      
      # Jaccard abundance-based similarity index (UV/(U+V-UV)). U is total relative abundance in of shared sample 1, U total relative of shared in sample 2
      
      commonClonesProd_U_data= topclones_sam1Prod[topclones_sam1Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonesProd_U = sum(commonClonesProd_U_data$COUNT)/sum(topclones_sam1Prod$COUNT)
      
      commonClonesProd_V_data= topclones_sam2Prod[topclones_sam2Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonesProd_V = sum(commonClonesProd_V_data$COUNT)/sum(topclones_sam2Prod$COUNT)
      
      uv=commonClonesProd_U*commonClonesProd_V
      uplusv=commonClonesProd_U+commonClonesProd_V
      
      AbundanceBasedJIProd = uv/(uplusv-uv)
      
      overlapValuesProdJI_abundancebased=c(overlapValuesProdJI_abundancebased,as.numeric(AbundanceBasedJIProd))
      
      # for unproductive sequences
      
      commonClonesUnProd_U_data= topclones_sam1Unprod[topclones_sam1Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonesUnProd_U = sum(commonClonesUnProd_U_data$COUNT)/sum(topclones_sam1Unprod$COUNT)
      
      commonClonesUnProd_V_data= topclones_sam2Unprod[topclones_sam2Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonesUnProd_V = sum(commonClonesUnProd_V_data$COUNT)/sum(topclones_sam2Unprod$COUNT)
      
      uv_UnProd=commonClonesUnProd_U * commonClonesUnProd_V
      uplusv_UnProd=commonClonesUnProd_U + commonClonesUnProd_V
      
      AbundanceBasedJIUnProd = uv_UnProd/(uplusv_UnProd-uv_UnProd)
      
      overlapValuesUnprodJI_abundancebased=c(overlapValuesUnprodJI_abundancebased,as.numeric(AbundanceBasedJIUnProd))
      
      
      
      # Ninsertions  in common sequences
      
      commonClonotypes1= topclones_sam1Prod[topclones_sam1Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonotypes2= topclones_sam2Prod[topclones_sam2Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      
      commonClonotypes1Unprod= topclones_sam1Unprod[topclones_sam1Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonotypes2Unprod= topclones_sam2Unprod[topclones_sam2Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      
      
      NinsertionsP1= mean(commonClonotypes1$N2INSERTION + commonClonotypes1$N1INSERTION)
      NinsertionsP2 = mean(commonClonotypes2$N2INSERTION + commonClonotypes2$N1INSERTION)
      
      NinsertionsUP1= mean(commonClonotypes1Unprod$N2INSERTION + commonClonotypes1Unprod$N1INSERTION)
      NinsertionsUP2 = mean(commonClonotypes2Unprod$N2INSERTION + commonClonotypes2Unprod$N1INSERTION)
      
      
      NinsertionProd1=c(NinsertionProd1,NinsertionsP1)
      NinsertionProd2= c(NinsertionProd2,NinsertionsP2)
      
      NinsertionUnprod1=c(NinsertionUnprod1,NinsertionsUP1)
      NinsertionUnprod2=c(NinsertionUnprod2,NinsertionsUP2)
      
    }
    
    #totalNumberOfProdClonotypes= nrow(topclones_sam1Prod) + nrow(topclones_sam2Prod)
    #totalNumberOfUnprodClonotypes= nrow(topclones_sam1Unprod) + nrow(topclones_sam2Unprod)
    
    
    if(nrow(toadd) > 0){
      newrow=data.frame(overlapValuesProd,overlapValuesUnprod,overlapValuesProdJI,overlapValuesUnprodJI,overlapValuesProdJI_abundancebased,overlapValuesUnprodJI_abundancebased,NinsertionProd1,NinsertionProd2,NinsertionUnprod1,NinsertionUnprod2)
      colnames(newrow) <- c("OverlapCountProductive","OverlapCountUnProductive","OverlapProd_JI","OverlapUnProd_JI","AbundanceBased_JI","AbundanceBased_JI","NinsertionProdSample1","NinsertionProdSample2","NinsertionUnProdSample1","NinsertionUnProdSample2")
      
      toadd=rbind(toadd,newrow)
    }else{
      toadd=data.frame(overlapValuesProd,overlapValuesUnprod,overlapValuesProdJI,overlapValuesUnprodJI,overlapValuesProdJI_abundancebased,overlapValuesUnprodJI_abundancebased,NinsertionProd1,NinsertionProd2,NinsertionUnprod1,NinsertionUnprod2)
      colnames(toadd) <- c("OverlapCountProductive","OverlapCountUnProductive","OverlapProd_JI","OverlapUnProd_JI","AbundanceBased_JI","AbundanceBased_JI","NinsertionProdSample1","NinsertionProdSample2","NinsertionUnProdSample1","NinsertionUnProdSample2")  
    }
    
  }
  
  
  
  
  print(nrow(toadd))
  resultTable=data.frame(m,toadd)        
  
  # write pdf dendrogram and heatmaps
  
  # select the sample columns 1 and 2, and the JI column 5
  forProdselected = resultTable[,c(1,2,5)]
  forProdselected = forProdselected[order(forProdselected$Sample1),]
  
  l = length(unique(forProdselected$Sample1))
  mat = matrix(as.numeric(forProdselected[,3]),nrow=l,ncol=l,byrow=T)
  # replace NaN values with 0
  mat[is.nan(mat)] = 0
  colnames(mat) <- unique(forProdselected$Sample1)
  rownames(mat) <- unique(forProdselected$Sample1)
  
  
  mat = signif(mat,digits=2)
  hc = hclust(as.dist(log10(1-mat)))
  
  flname = paste("ProdJI_dendrogram.pdf")
  pdf(file=flname, useDingbats = FALSE)
  
  # very simple dendrogram
  plot(hc)
  
  dev.off()
  
  # for unprod
  # select the sample columns 1 and 2, and the JI column 5
  forUnprodselected = resultTable[,c(1,2,6)]
  forUnprodselected = forUnprodselected[order(forUnprodselected$Sample1),]
  
  l = length(unique(forUnprodselected$Sample1))
  mat = matrix(as.numeric(forUnprodselected[,3]),nrow=l,ncol=l,byrow=T)
  # replace NaN values with 0
  mat[is.nan(mat)] = 0
  colnames(mat) <- unique(forUnprodselected$Sample1)
  rownames(mat) <- unique(forUnprodselected$Sample1)
  
  
  mat = signif(mat,digits=2)
  hc = hclust(as.dist(log10(1-mat)))
  
  
  flname = paste("UnprodJI_dendrogram.pdf")
  pdf(file=flname, useDingbats = FALSE)
  
  # very simple dendrogram
  plot(hc)
  
  dev.off()
  
  write.table(resultTable,file="Pairwise Overlap Counts.txt",row.names = T,sep = "\t")
  return(resultTable)
}


# JI calculation for 'has stop' unproductive Nucleotides

overLappingClonotypesHasStop <- function(reps,cutoff=20,cutoffby="perc"){
  ### Returns a datamatrix of top topnumber clonotopes from each sample plus: overlap count or Jaccard index, for prod and non productive sequences, N insertion size
  
  ### Total read counts for the common clones of both samples are given for the productive/hasStop versions the JI analysis. This is implemented when the analysis is 
  ### done for by "perc". 
  
  
  # setting work parameters
  cutoffby = toupper(cutoffby)
  
  
  m <- as.data.frame(expand.grid(reps,reps))
  colnames(m)  <- c("Sample1","Sample2")
  
  toadd= data.frame()
  
  
  
  for(r in 1:nrow(m)){
    
    overlapValuesProd=c()
    overlapValuesUnprod=c()
    
    overlapValuesTotalSam1Prod=c()
    overlapValuesTotalSam2Prod=c()
    
    overlapValuesTotalSam1Unprod=c()
    overlapValuesTotalSam2Unprod=c()
    
    overlapValuesProdJI=c()
    overlapValuesUnprodJI=c()
    
    overlapValuesProdJI_abundancebased=c()
    overlapValuesUnprodJI_abundancebased=c()
    
    NinsertionProd1=c()
    NinsertionUnprod1=c()
    
    NinsertionProd2=c()
    NinsertionUnprod2=c()
    
    
    print(r)
    sam1=m[r,1]
    sam2=m[r,2]
    
    #print(sam1)
    #print(sam2)
    
    sam1 <- get(as.character(sam1))
    sam2 <- get(as.character(sam2))
    
    
    sam1Prod <- getProductive(sam1)
    sam1Unprod <- getHasStop(sam1)
    
    sam2Prod <- getProductive(sam2)
    sam2Unprod <- getHasStop(sam2)
    
    
    if(cutoffby=="PERC"){
      topclones_sam1Prod = getAbundantClonotypesbyperc(sam1Prod,cutoff)
      topclones_sam1Unprod = getAbundantClonotypesbyperc(sam1Unprod,cutoff)
      
      topclones_sam2Prod = getAbundantClonotypesbyperc(sam2Prod,cutoff)
      topclones_sam2Unprod = getAbundantClonotypesbyperc(sam2Unprod,cutoff)
      
      # commons
      commonClonesProd=length(intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE)) 
      unionProd=length(union(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE)) 
      
      # commonality checked at the Nt level for non productive sequences, no AA given in the original data
      commonClonesUnprod=length(intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE))
      unionUnprod=length(union(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE))
      
      
      
      
      if(unionUnprod==0){
        unionUnprod = 1
      }
      
      if(unionProd==0){
        unionProd = 1
      }
      
      overlapValuesProd=c(overlapValuesProd,as.numeric(commonClonesProd))
      overlapValuesUnprod=c(overlapValuesUnprod,as.numeric(commonClonesUnprod))
      
      
      
      overlapValuesProdJI=c(overlapValuesProdJI,commonClonesProd/unionProd)
      overlapValuesUnprodJI = c(overlapValuesUnprodJI,commonClonesUnprod/unionUnprod)
      
      # Jaccard abundance-based similarity index (UV/(U+V-UV)). U is total relative abundance in of shared sample 1, U total relative of shared in sample 2
      
      commonClonesProd_U_data= topclones_sam1Prod[topclones_sam1Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonesProd_U = sum(commonClonesProd_U_data$COUNT)/sum(topclones_sam1Prod$COUNT)
      
      commonClonesProd_U_abundance = sum(commonClonesProd_U_data$COUNT)
      
      commonClonesProd_V_data= topclones_sam2Prod[topclones_sam2Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonesProd_V = sum(commonClonesProd_V_data$COUNT)/sum(topclones_sam2Prod$COUNT)
      
      commonClonesProd_V_abundance = sum(commonClonesProd_V_data$COUNT)
      
      overlapValuesTotalSam1Prod=c(overlapValuesTotalSam1Prod,as.numeric(commonClonesProd_U_abundance))
      overlapValuesTotalSam2Prod=c(overlapValuesTotalSam2Prod,as.numeric(commonClonesProd_V_abundance))
      
      uv=commonClonesProd_U*commonClonesProd_V
      uplusv=commonClonesProd_U+commonClonesProd_V
      
      AbundanceBasedJIProd = uv/(uplusv-uv)
      
      overlapValuesProdJI_abundancebased=c(overlapValuesProdJI_abundancebased,as.numeric(AbundanceBasedJIProd))
      
      # for unproductive sequences
      
      commonClonesUnProd_U_data= topclones_sam1Unprod[topclones_sam1Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonesUnProd_U = sum(commonClonesUnProd_U_data$COUNT)/sum(topclones_sam1Unprod$COUNT)
      
      commonClonesUnProd_U_abundance = sum(commonClonesUnProd_U_data$COUNT)
      
      commonClonesUnProd_V_data= topclones_sam2Unprod[topclones_sam2Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonesUnProd_V = sum(commonClonesUnProd_V_data$COUNT)/sum(topclones_sam2Unprod$COUNT)
      
      commonClonesUnProd_V_abundance = sum(commonClonesUnProd_V_data$COUNT)
      
      
      overlapValuesTotalSam1Unprod=c(overlapValuesTotalSam1Unprod,as.numeric(commonClonesUnProd_U_abundance))
      overlapValuesTotalSam2Unprod=c(overlapValuesTotalSam2Unprod,as.numeric(commonClonesUnProd_V_abundance))
      
      
      uv_UnProd=commonClonesUnProd_U * commonClonesUnProd_V
      uplusv_UnProd=commonClonesUnProd_U + commonClonesUnProd_V
      
      AbundanceBasedJIUnProd = uv_UnProd/(uplusv_UnProd-uv_UnProd)
      
      overlapValuesUnprodJI_abundancebased=c(overlapValuesUnprodJI_abundancebased,as.numeric(AbundanceBasedJIUnProd))
      
      
      
      # Ninsertions  in common sequences
      
      commonClonotypes1= topclones_sam1Prod[topclones_sam1Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonotypes2= topclones_sam2Prod[topclones_sam2Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      
      commonClonotypes1Unprod= topclones_sam1Unprod[topclones_sam1Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonotypes2Unprod= topclones_sam2Unprod[topclones_sam2Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      
      
      NinsertionsP1= mean(commonClonotypes1$N2INSERTION + commonClonotypes1$N1INSERTION)
      NinsertionsP2 = mean(commonClonotypes2$N2INSERTION + commonClonotypes2$N1INSERTION)
      
      NinsertionsUP1= mean(commonClonotypes1Unprod$N2INSERTION + commonClonotypes1Unprod$N1INSERTION)
      NinsertionsUP2 = mean(commonClonotypes2Unprod$N2INSERTION + commonClonotypes2Unprod$N1INSERTION)
      
      
      NinsertionProd1=c(NinsertionProd1,NinsertionsP1)
      NinsertionProd2= c(NinsertionProd2,NinsertionsP2)
      
      NinsertionUnprod1=c(NinsertionUnprod1,NinsertionsUP1)
      NinsertionUnprod2=c(NinsertionUnprod2,NinsertionsUP2)
      
      
    }else{
      topclones_sam1Prod = getAbundantClonotypesbynumber(sam1Prod,cutoff)
      topclones_sam1Unprod = getAbundantClonotypesbynumber(sam1Unprod,cutoff)
      
      topclones_sam2Prod = getAbundantClonotypesbynumber(sam2Prod,cutoff)
      topclones_sam2Unprod = getAbundantClonotypesbynumber(sam2Unprod,cutoff)
      
      
      # commons
      # commons
      commonClonesProd=length(intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE))
      unionProd=length(union(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE))
      
      # commonality checked at the Nt level for non productive sequences, no AA given in the original data
      commonClonesUnprod=length(intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE))
      unionUnprod=length(union(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE))
      
      if(unionUnprod==0){
        unionUnprod = 1
      }
      
      if(unionProd==0){
        unionProd = 1
      }
      
      overlapValuesProd=c(overlapValuesProd,as.numeric(commonClonesProd))
      overlapValuesUnprod=c(overlapValuesUnprod,as.numeric(commonClonesUnprod))
      
      overlapValuesProdJI=c(overlapValuesProdJI,commonClonesProd/unionProd)
      overlapValuesUnprodJI = c(overlapValuesUnprodJI,commonClonesUnprod/unionUnprod)
      
      
      # Jaccard abundance-based similarity index (UV/(U+V-UV)). U is total relative abundance in of shared sample 1, U total relative of shared in sample 2
      
      commonClonesProd_U_data= topclones_sam1Prod[topclones_sam1Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonesProd_U = sum(commonClonesProd_U_data$COUNT)/sum(topclones_sam1Prod$COUNT)
      
      commonClonesProd_V_data= topclones_sam2Prod[topclones_sam2Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonesProd_V = sum(commonClonesProd_V_data$COUNT)/sum(topclones_sam2Prod$COUNT)
      
      uv=commonClonesProd_U*commonClonesProd_V
      uplusv=commonClonesProd_U+commonClonesProd_V
      
      AbundanceBasedJIProd = uv/(uplusv-uv)
      
      overlapValuesProdJI_abundancebased=c(overlapValuesProdJI_abundancebased,as.numeric(AbundanceBasedJIProd))
      
      # for unproductive sequences
      
      commonClonesUnProd_U_data= topclones_sam1Unprod[topclones_sam1Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonesUnProd_U = sum(commonClonesUnProd_U_data$COUNT)/sum(topclones_sam1Unprod$COUNT)
      
      commonClonesUnProd_V_data= topclones_sam2Unprod[topclones_sam2Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonesUnProd_V = sum(commonClonesUnProd_V_data$COUNT)/sum(topclones_sam2Unprod$COUNT)
      
      uv_UnProd=commonClonesUnProd_U * commonClonesUnProd_V
      uplusv_UnProd=commonClonesUnProd_U + commonClonesUnProd_V
      
      AbundanceBasedJIUnProd = uv_UnProd/(uplusv_UnProd-uv_UnProd)
      
      overlapValuesUnprodJI_abundancebased=c(overlapValuesUnprodJI_abundancebased,as.numeric(AbundanceBasedJIUnProd))
      
      
      
      # Ninsertions  in common sequences
      
      commonClonotypes1= topclones_sam1Prod[topclones_sam1Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      commonClonotypes2= topclones_sam2Prod[topclones_sam2Prod$NUCLEOTIDE %in% intersect(topclones_sam1Prod$NUCLEOTIDE,topclones_sam2Prod$NUCLEOTIDE),]
      
      commonClonotypes1Unprod= topclones_sam1Unprod[topclones_sam1Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      commonClonotypes2Unprod= topclones_sam2Unprod[topclones_sam2Unprod$NUCLEOTIDE %in% intersect(topclones_sam1Unprod$NUCLEOTIDE,topclones_sam2Unprod$NUCLEOTIDE),]
      
      
      NinsertionsP1= mean(commonClonotypes1$N2INSERTION + commonClonotypes1$N1INSERTION)
      NinsertionsP2 = mean(commonClonotypes2$N2INSERTION + commonClonotypes2$N1INSERTION)
      
      NinsertionsUP1= mean(commonClonotypes1Unprod$N2INSERTION + commonClonotypes1Unprod$N1INSERTION)
      NinsertionsUP2 = mean(commonClonotypes2Unprod$N2INSERTION + commonClonotypes2Unprod$N1INSERTION)
      
      
      NinsertionProd1=c(NinsertionProd1,NinsertionsP1)
      NinsertionProd2= c(NinsertionProd2,NinsertionsP2)
      
      NinsertionUnprod1=c(NinsertionUnprod1,NinsertionsUP1)
      NinsertionUnprod2=c(NinsertionUnprod2,NinsertionsUP2)
      
    }
    
    #totalNumberOfProdClonotypes= nrow(topclones_sam1Prod) + nrow(topclones_sam2Prod)
    #totalNumberOfUnprodClonotypes= nrow(topclones_sam1Unprod) + nrow(topclones_sam2Unprod)
    
    
    if(nrow(toadd) > 0){
      newrow=data.frame(overlapValuesProd,overlapValuesTotalSam1Prod,overlapValuesTotalSam2Prod,overlapValuesUnprod,overlapValuesTotalSam1Unprod,overlapValuesTotalSam2Unprod,overlapValuesProdJI,overlapValuesUnprodJI,overlapValuesProdJI_abundancebased,overlapValuesUnprodJI_abundancebased,NinsertionProd1,NinsertionProd2,NinsertionUnprod1,NinsertionUnprod2)
      colnames(newrow) <- c("OverlapCountProductive","OverlapCountTotalProductiveSam1","OverlapCountTotalProductiveSam2","OverlapCountHasStop","OverlapCountTotalHasStopSam1","OverlapCountTotalHasStopSam2","OverlapProd_JI","OverlapHasStop_JI","AbundanceBasedProd_JI","AbundanceBasedHasStop_JI","NinsertionProdSample1","NinsertionProdSample2","NinsertionHasStopSample1","NinsertionHasStopSample2")
      
      toadd=rbind(toadd,newrow)
    }else{
      toadd=data.frame(overlapValuesProd,overlapValuesTotalSam1Prod,overlapValuesTotalSam2Prod,overlapValuesUnprod,overlapValuesTotalSam1Unprod,overlapValuesTotalSam2Unprod,overlapValuesProdJI,overlapValuesUnprodJI,overlapValuesProdJI_abundancebased,overlapValuesUnprodJI_abundancebased,NinsertionProd1,NinsertionProd2,NinsertionUnprod1,NinsertionUnprod2)
      colnames(toadd) <- c("OverlapCountProductive","OverlapCountTotalProductiveSam1","OverlapCountTotalProductiveSam2","OverlapCountHasStop","OverlapCountTotalHasStopSam1","OverlapCountTotalHasStopSam2","OverlapProd_JI","OverlapHasStop_JI","AbundanceBasedProd_JI","AbundanceBasedHasStop_JI","NinsertionProdSample1","NinsertionProdSample2","NinsertionHasStopSample1","NinsertionHasStopSample2")
    }
    
  }
  
  
  
  
  print(nrow(toadd))
  resultTable=data.frame(m,toadd)        
  
  
  write.table(resultTable,file="Pairwise Overlap Counts.txt",row.names = T,sep = "\t")
  return(resultTable)
  
  
}


# Amico acid based Jaccard index calculation

overLappingClonotypesAA<- function(reps,cutoff=20,cutoffby="perc"){
  ### Returns a datamatrix of top topnumber clonotopes (Amino acid level) from each sample plus: overlap count or Jaccard index, for prod and non productive sequences, N insertion size
  
  
  # setting work parameters
  cutoffby = toupper(cutoffby)
  
  
  m <- as.data.frame(expand.grid(reps,reps))
  colnames(m)  <- c("Sample1","Sample2")
  
  toadd= data.frame()
  
  
  
  for(r in 1:nrow(m)){
    
    overlapValuesProd=c()
    overlapValuesUnprod=c()
    overlapValuesProdJI=c()
    overlapValuesUnprodJI=c()
    
    overlapValuesProdJI_abundancebased=c()
    overlapValuesUnprodJI_abundancebased=c()
    
    NinsertionProd1=c()
    NinsertionUnprod1=c()
    
    NinsertionProd2=c()
    NinsertionUnprod2=c()
    
    
    print(r)
    sam1=m[r,1]
    sam2=m[r,2]
    
    #print(sam1)
    #print(sam2)
    
    sam1 <- get(as.character(sam1))
    sam2 <- get(as.character(sam2))
    
    
    sam1Prod <- getProductive(sam1)
    sam1Unprod <- getUnproductive(sam1)
    
    sam2Prod <- getProductive(sam2)
    sam2Unprod <- getUnproductive(sam2)
    
    
    if(cutoffby=="PERC"){
      topclones_sam1Prod = getAbundantClonotypesbyperc(sam1Prod,cutoff)
      topclones_sam1Unprod = getAbundantClonotypesbyperc(sam1Unprod,cutoff)
      
      topclones_sam2Prod = getAbundantClonotypesbyperc(sam2Prod,cutoff)
      topclones_sam2Unprod = getAbundantClonotypesbyperc(sam2Unprod,cutoff)
      
      # commons
      commonClonesProd=length(intersect(topclones_sam1Prod$AMINOACID,topclones_sam2Prod$AMINOACID)) 
      unionProd=length(union(topclones_sam1Prod$AMINOACID,topclones_sam2Prod$AMINOACID)) 
      
      # commonality checked at the Nt level for non productive sequences, no AA given in the original data
      commonClonesUnprod=length(intersect(topclones_sam1Unprod$AMINOACID,topclones_sam2Unprod$AMINOACID))
      unionUnprod=length(union(topclones_sam1Unprod$AMINOACID,topclones_sam2Unprod$AMINOACID))
      
      if(unionUnprod==0){
        unionUnprod = 1
      }
      
      if(unionProd==0){
        unionProd = 1
      }
      
      overlapValuesProd=c(overlapValuesProd,as.numeric(commonClonesProd))
      overlapValuesUnprod=c(overlapValuesUnprod,as.numeric(commonClonesUnprod))
      
      overlapValuesProdJI=c(overlapValuesProdJI,commonClonesProd/unionProd)
      overlapValuesUnprodJI = c(overlapValuesUnprodJI,commonClonesUnprod/unionUnprod)
      
      # Jaccard abundance-based similarity index (UV/(U+V-UV)). U is total relative abundance in of shared sample 1, U total relative of shared in sample 2
      
      commonClonesProd_U_data= topclones_sam1Prod[topclones_sam1Prod$AMINOACID %in% intersect(topclones_sam1Prod$AMINOACID,topclones_sam2Prod$AMINOACID),]
      commonClonesProd_U = sum(commonClonesProd_U_data$COUNT)/sum(topclones_sam1Prod$COUNT)
      
      commonClonesProd_V_data= topclones_sam2Prod[topclones_sam2Prod$AMINOACID %in% intersect(topclones_sam1Prod$AMINOACID,topclones_sam2Prod$AMINOACID),]
      commonClonesProd_V = sum(commonClonesProd_V_data$COUNT)/sum(topclones_sam2Prod$COUNT)
      
      uv=commonClonesProd_U*commonClonesProd_V
      uplusv=commonClonesProd_U+commonClonesProd_V
      
      AbundanceBasedJIProd = uv/(uplusv-uv)
      
      overlapValuesProdJI_abundancebased=c(overlapValuesProdJI_abundancebased,as.numeric(AbundanceBasedJIProd))
      
      # for unproductive sequences
      
      commonClonesUnProd_U_data= topclones_sam1Unprod[topclones_sam1Unprod$AMINOACID %in% intersect(topclones_sam1Unprod$AMINOACID,topclones_sam2Unprod$AMINOACID),]
      commonClonesUnProd_U = sum(commonClonesUnProd_U_data$COUNT)/sum(topclones_sam1Unprod$COUNT)
      
      commonClonesUnProd_V_data= topclones_sam2Unprod[topclones_sam2Unprod$AMINOACID %in% intersect(topclones_sam1Unprod$AMINOACID,topclones_sam2Unprod$AMINOACID),]
      commonClonesUnProd_V = sum(commonClonesUnProd_V_data$COUNT)/sum(topclones_sam2Unprod$COUNT)
      
      uv_UnProd=commonClonesUnProd_U * commonClonesUnProd_V
      uplusv_UnProd=commonClonesUnProd_U + commonClonesUnProd_V
      
      AbundanceBasedJIUnProd = uv_UnProd/(uplusv_UnProd-uv_UnProd)
      
      overlapValuesUnprodJI_abundancebased=c(overlapValuesUnprodJI_abundancebased,as.numeric(AbundanceBasedJIUnProd))
      
      
      
      # Ninsertions  in common sequences
      
      commonClonotypes1= topclones_sam1Prod[topclones_sam1Prod$AMINOACID %in% intersect(topclones_sam1Prod$AMINOACID,topclones_sam2Prod$AMINOACID),]
      commonClonotypes2= topclones_sam2Prod[topclones_sam2Prod$AMINOACID %in% intersect(topclones_sam1Prod$AMINOACID,topclones_sam2Prod$AMINOACID),]
      
      commonClonotypes1Unprod= topclones_sam1Unprod[topclones_sam1Unprod$AMINOACID %in% intersect(topclones_sam1Unprod$AMINOACID,topclones_sam2Unprod$AMINOACID),]
      commonClonotypes2Unprod= topclones_sam2Unprod[topclones_sam2Unprod$AMINOACID %in% intersect(topclones_sam1Unprod$AMINOACID,topclones_sam2Unprod$AMINOACID),]
      
      
      NinsertionsP1= mean(commonClonotypes1$N2INSERTION + commonClonotypes1$N1INSERTION)
      NinsertionsP2 = mean(commonClonotypes2$N2INSERTION + commonClonotypes2$N1INSERTION)
      
      NinsertionsUP1= mean(commonClonotypes1Unprod$N2INSERTION + commonClonotypes1Unprod$N1INSERTION)
      NinsertionsUP2 = mean(commonClonotypes2Unprod$N2INSERTION + commonClonotypes2Unprod$N1INSERTION)
      
      
      NinsertionProd1=c(NinsertionProd1,NinsertionsP1)
      NinsertionProd2= c(NinsertionProd2,NinsertionsP2)
      
      NinsertionUnprod1=c(NinsertionUnprod1,NinsertionsUP1)
      NinsertionUnprod2=c(NinsertionUnprod2,NinsertionsUP2)
      
      
    }else{
      topclones_sam1Prod = getAbundantClonotypesbynumber(sam1Prod,cutoff)
      topclones_sam1Unprod = getAbundantClonotypesbynumber(sam1Unprod,cutoff)
      
      topclones_sam2Prod = getAbundantClonotypesbynumber(sam2Prod,cutoff)
      topclones_sam2Unprod = getAbundantClonotypesbynumber(sam2Unprod,cutoff)
      
      
      # commons
      # commons
      commonClonesProd=length(intersect(topclones_sam1Prod$AMINOACID,topclones_sam2Prod$AMINOACID))
      unionProd=length(union(topclones_sam1Prod$AMINOACID,topclones_sam2Prod$AMINOACID))
      
      # commonality checked at the Nt level for non productive sequences, no AA given in the original data
      commonClonesUnprod=length(intersect(topclones_sam1Unprod$AMINOACID,topclones_sam2Unprod$AMINOACID))
      unionUnprod=length(union(topclones_sam1Unprod$AMINOACID,topclones_sam2Unprod$AMINOACID))
      
      if(unionUnprod==0){
        unionUnprod = 1
      }
      
      if(unionProd==0){
        unionProd = 1
      }
      
      overlapValuesProd=c(overlapValuesProd,as.numeric(commonClonesProd))
      overlapValuesUnprod=c(overlapValuesUnprod,as.numeric(commonClonesUnprod))
      
      overlapValuesProdJI=c(overlapValuesProdJI,commonClonesProd/unionProd)
      overlapValuesUnprodJI = c(overlapValuesUnprodJI,commonClonesUnprod/unionUnprod)
      
      
      # Jaccard abundance-based similarity index (UV/(U+V-UV)). U is total relative abundance in of shared sample 1, U total relative of shared in sample 2
      
      commonClonesProd_U_data= topclones_sam1Prod[topclones_sam1Prod$AMINOACID %in% intersect(topclones_sam1Prod$AMINOACID,topclones_sam2Prod$AMINOACID),]
      commonClonesProd_U = sum(commonClonesProd_U_data$COUNT)/sum(topclones_sam1Prod$COUNT)
      
      commonClonesProd_V_data= topclones_sam2Prod[topclones_sam2Prod$AMINOACID %in% intersect(topclones_sam1Prod$AMINOACID,topclones_sam2Prod$AMINOACID),]
      commonClonesProd_V = sum(commonClonesProd_V_data$COUNT)/sum(topclones_sam2Prod$COUNT)
      
      uv=commonClonesProd_U*commonClonesProd_V
      uplusv=commonClonesProd_U+commonClonesProd_V
      
      AbundanceBasedJIProd = uv/(uplusv-uv)
      
      overlapValuesProdJI_abundancebased=c(overlapValuesProdJI_abundancebased,as.numeric(AbundanceBasedJIProd))
      
      # for unproductive sequences
      
      commonClonesUnProd_U_data= topclones_sam1Unprod[topclones_sam1Unprod$AMINOACID %in% intersect(topclones_sam1Unprod$AMINOACID,topclones_sam2Unprod$AMINOACID),]
      commonClonesUnProd_U = sum(commonClonesUnProd_U_data$COUNT)/sum(topclones_sam1Unprod$COUNT)
      
      commonClonesUnProd_V_data= topclones_sam2Unprod[topclones_sam2Unprod$AMINOACID %in% intersect(topclones_sam1Unprod$AMINOACID,topclones_sam2Unprod$AMINOACID),]
      commonClonesUnProd_V = sum(commonClonesUnProd_V_data$COUNT)/sum(topclones_sam2Unprod$COUNT)
      
      uv_UnProd=commonClonesUnProd_U * commonClonesUnProd_V
      uplusv_UnProd=commonClonesUnProd_U + commonClonesUnProd_V
      
      AbundanceBasedJIUnProd = uv_UnProd/(uplusv_UnProd-uv_UnProd)
      
      overlapValuesUnprodJI_abundancebased=c(overlapValuesUnprodJI_abundancebased,as.numeric(AbundanceBasedJIUnProd))
      
      
      
      # Ninsertions  in common sequences
      
      commonClonotypes1= topclones_sam1Prod[topclones_sam1Prod$AMINOACID %in% intersect(topclones_sam1Prod$AMINOACID,topclones_sam2Prod$AMINOACID),]
      commonClonotypes2= topclones_sam2Prod[topclones_sam2Prod$AMINOACID %in% intersect(topclones_sam1Prod$AMINOACID,topclones_sam2Prod$AMINOACID),]
      
      commonClonotypes1Unprod= topclones_sam1Unprod[topclones_sam1Unprod$AMINOACID %in% intersect(topclones_sam1Unprod$AMINOACID,topclones_sam2Unprod$AMINOACID),]
      commonClonotypes2Unprod= topclones_sam2Unprod[topclones_sam2Unprod$AMINOACID %in% intersect(topclones_sam1Unprod$AMINOACID,topclones_sam2Unprod$AMINOACID),]
      
      
      NinsertionsP1= mean(commonClonotypes1$N2INSERTION + commonClonotypes1$N1INSERTION)
      NinsertionsP2 = mean(commonClonotypes2$N2INSERTION + commonClonotypes2$N1INSERTION)
      
      NinsertionsUP1= mean(commonClonotypes1Unprod$N2INSERTION + commonClonotypes1Unprod$N1INSERTION)
      NinsertionsUP2 = mean(commonClonotypes2Unprod$N2INSERTION + commonClonotypes2Unprod$N1INSERTION)
      
      
      NinsertionProd1=c(NinsertionProd1,NinsertionsP1)
      NinsertionProd2= c(NinsertionProd2,NinsertionsP2)
      
      NinsertionUnprod1=c(NinsertionUnprod1,NinsertionsUP1)
      NinsertionUnprod2=c(NinsertionUnprod2,NinsertionsUP2)
      
    }
    
    #totalNumberOfProdClonotypes= nrow(topclones_sam1Prod) + nrow(topclones_sam2Prod)
    #totalNumberOfUnprodClonotypes= nrow(topclones_sam1Unprod) + nrow(topclones_sam2Unprod)
    
    
    if(nrow(toadd) > 0){
      newrow=data.frame(overlapValuesProd,overlapValuesUnprod,overlapValuesProdJI,overlapValuesUnprodJI,overlapValuesProdJI_abundancebased,overlapValuesUnprodJI_abundancebased,NinsertionProd1,NinsertionProd2,NinsertionUnprod1,NinsertionUnprod2)
      colnames(newrow) <- c("OverlapCountProductive","OverlapCountUnProductive","OverlapProd_JI","OverlapUnProd_JI","AbundanceBased_JI","AbundanceBased_JI","NinsertionProdSample1","NinsertionProdSample2","NinsertionUnProdSample1","NinsertionUnProdSample2")
      
      toadd=rbind(toadd,newrow)
    }else{
      toadd=data.frame(overlapValuesProd,overlapValuesUnprod,overlapValuesProdJI,overlapValuesUnprodJI,overlapValuesProdJI_abundancebased,overlapValuesUnprodJI_abundancebased,NinsertionProd1,NinsertionProd2,NinsertionUnprod1,NinsertionUnprod2)
      colnames(toadd) <- c("OverlapCountProductive","OverlapCountUnProductive","OverlapProd_JI","OverlapUnProd_JI","AbundanceBased_JI","AbundanceBased_JI","NinsertionProdSample1","NinsertionProdSample2","NinsertionUnProdSample1","NinsertionUnProdSample2")  
    }
    
  }
  
  
  
  
  print(nrow(toadd))
  resultTable=data.frame(m,toadd)        
  
  # write pdf dendrogram and heatmaps
  
  # select the sample columns 1 and 2, and the JI column 5
  forProdselected = resultTable[,c(1,2,5)]
  forProdselected = forProdselected[order(forProdselected$Sample1),]
  
  l = length(unique(forProdselected$Sample1))
  mat = matrix(as.numeric(forProdselected[,3]),nrow=l,ncol=l,byrow=T)
  # replace NaN values with 0
  mat[is.nan(mat)] = 0
  colnames(mat) <- unique(forProdselected$Sample1)
  rownames(mat) <- unique(forProdselected$Sample1)
  
  
  mat = signif(mat,digits=2)
  hc = hclust(as.dist(log10(1-mat)))
  
  flname = paste("ProdJI_dendrogram.pdf")
  pdf(file=flname, useDingbats = FALSE)
  
  # very simple dendrogram
  plot(hc)
  
  dev.off()
  
  # for unprod
  # select the sample columns 1 and 2, and the JI column 5
  forUnprodselected = resultTable[,c(1,2,6)]
  forUnprodselected = forUnprodselected[order(forUnprodselected$Sample1),]
  
  l = length(unique(forUnprodselected$Sample1))
  mat = matrix(as.numeric(forUnprodselected[,3]),nrow=l,ncol=l,byrow=T)
  # replace NaN values with 0
  mat[is.nan(mat)] = 0
  colnames(mat) <- unique(forUnprodselected$Sample1)
  rownames(mat) <- unique(forUnprodselected$Sample1)
  
  
  mat = signif(mat,digits=2)
  hc = hclust(as.dist(log10(1-mat)))
  
  
  flname = paste("UnprodJI_dendrogram.pdf")
  pdf(file=flname, useDingbats = FALSE)
  
  # very simple dendrogram
  plot(hc)
  
  dev.off()
  
  write.table(resultTable,file="Pairwise AA Overlap Counts.txt",row.names = T,sep = "\t")
  return(resultTable)
}


# Overlap sequence calculation using various parameters

overLappingClonotypesByChoice <- function(reps,cutoff=100,cutoffby="perc",seqType="AA",unProdStatus="unprod"){
  ### Returns a datamatrix of top topnumber clonotopes from each sample plus: overlap count or Jaccard index, for prod and non productive sequences, N insertion size
  
  ### Total read counts for the common clones of both samples are given for the productive/ chosen unproductive types.
  ### cutoffby : Perc (percentage of total repertoire as indicated by cutoff) or noperc (select top clones as many as indicated by cutoff)
  ### seqType:  can be by Nt (nucleotide) or AA (AminoAcid)
  ### unProdStatus : which unproductive clones to consider Out or Stop or Unprod (union of Out and Stop)
  
  
  # setting work parameters
  #---------------------------------------------------------
  
  #reps <- samNames
  
  cutoffby = toupper(cutoffby)
  
  m <- as.data.frame(combinations(n = length(reps), r = 2, v = reps, repeats.allowed = TRUE))
  colnames(m)  <- c("Sample1","Sample2")
  
  
  
  toadd= data.frame()
  
  #change parameter values to uppercase to accept Nt, nt etc for example
  unProdStatus <- toupper(unProdStatus)
  seqType <- toupper(seqType)
  
  if(seqType=="AA"){
    seqType <- "AMINOACID"
  }else{
    seqType <- "NUCLEOTIDE"
  }
  
  #---------------------------------------------------------
  
  
  for(r in 1:nrow(m)){
    
    overlapValuesProd=c()
    overlapValuesUnprod=c()
    
    overlapValuesTotalSam1Prod=c()
    overlapValuesTotalSam2Prod=c()
    
    overlapValuesTotalSam1Unprod=c()
    overlapValuesTotalSam2Unprod=c()
    
    overlapValuesProdJI=c()
    overlapValuesUnprodJI=c()
    
    # the following two hold abundance based JI using : http://chao.stat.nthu.edu.tw/wordpress/paper/2006_Biometrics_62_P361.pdf
    
    overlapValuesProdJI_abundancebased=c()
    overlapValuesUnprodJI_abundancebased=c()
    
    # the following two hold abundance based JI calculation using (total size intersecting clones)/(total size of union clones)
    overlapValuesProdJI_abundancebased2=c()
    overlapValuesUnprodJI_abundancebased2=c()
    
    NinsertionProd1=c()
    NinsertionUnprod1=c()
    
    NinsertionProd2=c()
    NinsertionUnprod2=c()
    
    
    print(r)
    sam1=m[r,1]
    sam2=m[r,2]
    
    samplesCompared = paste(sam1,sam2,sep="_")
    #print(sam1)
    #print(sam2)
    
    sam1 <- get(as.character(sam1))
    sam2 <- get(as.character(sam2))
    
    #---------------------------------------------------------
    # productive and unproductive selection for each sample
    
    
    sam1Prod <- getProductive(sam1)
    
    if(unProdStatus=="STOP"){
      sam1Unprod <- getHasStop(sam1)
    }else if(unProdStatus=="OUT"){
      sam1Unprod <- getOutOfFrame(sam1)
    }else{
      sam1Unprod <- getUnproductive(sam1)
    }
    
    
    sam2Prod <- getProductive(sam2)
    
    if(unProdStatus=="STOP"){
      sam2Unprod <- getHasStop(sam2)
    }else if(unProdStatus=="OUT"){
      sam2Unprod <- getOutOfFrame(sam2)
    }else{
      sam2Unprod <- getUnproductive(sam2)
    }
    
    
    
    #---------------------------------------------------------
    # selection of sequence type index for each sample
    
    indSam1 <- which(colnames(sam1) == seqType)
    indSam2 <- which(colnames(sam2) == seqType)
    
    
    
    #---------------------------------------------------------
    # selecting top clonotypes for which to do the overlap counts, JI calculation etc. Either by percentage or actual number of top clonotypes
    
    if(cutoffby=="PERC"){
      topclones_sam1Prod = getAbundantClonotypesbyperc(sam1Prod,cutoff)
      topclones_sam1Unprod = getAbundantClonotypesbyperc(sam1Unprod,cutoff)
      
      topclones_sam2Prod = getAbundantClonotypesbyperc(sam2Prod,cutoff)
      topclones_sam2Unprod = getAbundantClonotypesbyperc(sam2Unprod,cutoff)
    }else{
      topclones_sam1Prod = getAbundantClonotypesbynumber(sam1Prod,cutoff)
      topclones_sam1Unprod = getAbundantClonotypesbynumber(sam1Unprod,cutoff)
      
      topclones_sam2Prod = getAbundantClonotypesbynumber(sam2Prod,cutoff)
      topclones_sam2Unprod = getAbundantClonotypesbynumber(sam2Unprod,cutoff)
    }
    
    #---------------------------------------------------------
    # Getting number of overlapping clonotypes, Jaccard index, average insertion size etc for the two samples being compared
    
    # commons for sequence type selected
    commonClonesProd=length(intersect(topclones_sam1Prod[,indSam1],topclones_sam2Prod[,indSam2])) 
    unionProd=length(union(topclones_sam1Prod[,indSam1],topclones_sam2Prod[,indSam2])) 
    
    # commonality checked at the Nt level for non productive sequences, no AA given in the original data
    commonClonesUnprod=length(intersect(topclones_sam1Unprod[,indSam1],topclones_sam2Unprod[,indSam2]))
    unionUnprod=length(union(topclones_sam1Unprod[,indSam1],topclones_sam2Unprod[,indSam2]))
    
    
    
    
    if(unionUnprod==0){
      unionUnprod = 1
    }
    
    if(unionProd==0){
      unionProd = 1
    }
    
    overlapValuesProd=c(overlapValuesProd,as.numeric(commonClonesProd))
    overlapValuesUnprod=c(overlapValuesUnprod,as.numeric(commonClonesUnprod))
    
    
    
    overlapValuesProdJI=c(overlapValuesProdJI,commonClonesProd/unionProd)
    overlapValuesUnprodJI = c(overlapValuesUnprodJI,commonClonesUnprod/unionUnprod)
    
    # Jaccard abundance-based similarity index (UV/(U+V-UV)). U is total relative abundance in of shared sample 1, U total relative of shared in sample 2
    # taken from : http://chao.stat.nthu.edu.tw/wordpress/paper/2006_Biometrics_62_P361.pdf
    
    
    commonClonesProd_U_data= topclones_sam1Prod[topclones_sam1Prod[,indSam1] %in% intersect(topclones_sam1Prod[,indSam1],topclones_sam2Prod[,indSam2]),]
    commonClonesProd_U = sum(commonClonesProd_U_data$COUNT)/sum(topclones_sam1Prod$COUNT)
    
    commonClonesProd_U_abundance = sum(commonClonesProd_U_data$COUNT)
    
    commonClonesProd_V_data= topclones_sam2Prod[topclones_sam2Prod[,indSam2] %in% intersect(topclones_sam1Prod[,indSam1],topclones_sam2Prod[,indSam2]),]
    commonClonesProd_V = sum(commonClonesProd_V_data$COUNT)/sum(topclones_sam2Prod$COUNT)
    
    commonClonesProd_V_abundance = sum(commonClonesProd_V_data$COUNT)
    
    overlapValuesTotalSam1Prod=c(overlapValuesTotalSam1Prod,as.numeric(commonClonesProd_U_abundance))
    overlapValuesTotalSam2Prod=c(overlapValuesTotalSam2Prod,as.numeric(commonClonesProd_V_abundance))
    
    uv=commonClonesProd_U*commonClonesProd_V
    uplusv=commonClonesProd_U+commonClonesProd_V
    
    AbundanceBasedJIProd = uv/(uplusv-uv)
    
    overlapValuesProdJI_abundancebased=c(overlapValuesProdJI_abundancebased,as.numeric(AbundanceBasedJIProd))
    
    # for abundance based JI type 2
    totalSam1Sam2Prod = sum(topclones_sam1Prod$COUNT) + sum(topclones_sam2Prod$COUNT)                                                                                        
    AbundanceBasedJIProd2 = (commonClonesProd_U_abundance + commonClonesProd_V_abundance) / totalSam1Sam2Prod
    overlapValuesProdJI_abundancebased2 = c(overlapValuesProdJI_abundancebased2,as.numeric(AbundanceBasedJIProd2))
    
    
    # for unproductive sequences
    
    commonClonesUnProd_U_data= topclones_sam1Unprod[topclones_sam1Unprod[,indSam1] %in% intersect(topclones_sam1Unprod[,indSam1],topclones_sam2Unprod[,indSam2]),]
    commonClonesUnProd_U = sum(commonClonesUnProd_U_data$COUNT)/sum(topclones_sam1Unprod$COUNT)
    
    commonClonesUnProd_U_abundance = sum(commonClonesUnProd_U_data$COUNT)
    
    commonClonesUnProd_V_data= topclones_sam2Unprod[topclones_sam2Unprod[,indSam2] %in% intersect(topclones_sam1Unprod[,indSam1],topclones_sam2Unprod[,indSam2]),]
    commonClonesUnProd_V = sum(commonClonesUnProd_V_data$COUNT)/sum(topclones_sam2Unprod$COUNT)
    
    commonClonesUnProd_V_abundance = sum(commonClonesUnProd_V_data$COUNT)
    
    
    overlapValuesTotalSam1Unprod=c(overlapValuesTotalSam1Unprod,as.numeric(commonClonesUnProd_U_abundance))
    overlapValuesTotalSam2Unprod=c(overlapValuesTotalSam2Unprod,as.numeric(commonClonesUnProd_V_abundance))
    
    
    uv_UnProd=commonClonesUnProd_U * commonClonesUnProd_V
    uplusv_UnProd=commonClonesUnProd_U + commonClonesUnProd_V
    
    AbundanceBasedJIUnProd = uv_UnProd/(uplusv_UnProd-uv_UnProd)
    
    overlapValuesUnprodJI_abundancebased=c(overlapValuesUnprodJI_abundancebased,as.numeric(AbundanceBasedJIUnProd))
    
    # for abundance based JI type 2
    totalSam1Sam2Unprod = sum(topclones_sam1Unprod$COUNT) + sum(topclones_sam2Unprod$COUNT)                                                                                          
    AbundanceBasedJIUnProd2 = (commonClonesUnProd_U_abundance + commonClonesUnProd_V_abundance) / totalSam1Sam2Unprod
    overlapValuesUnprodJI_abundancebased2 = c(overlapValuesUnprodJI_abundancebased2,as.numeric(AbundanceBasedJIUnProd2))
    
    
    # Ninsertions  in common sequences
    
    commonClonotypes1= topclones_sam1Prod[topclones_sam1Prod[,indSam1] %in% intersect(topclones_sam1Prod[,indSam1],topclones_sam2Prod[,indSam1]),]
    commonClonotypes2= topclones_sam2Prod[topclones_sam2Prod[,indSam2] %in% intersect(topclones_sam1Prod[,indSam1],topclones_sam2Prod[,indSam2]),]
    
    commonClonotypes1Unprod= topclones_sam1Unprod[topclones_sam1Unprod[,indSam1] %in% intersect(topclones_sam1Unprod[,indSam1],topclones_sam2Unprod[,indSam2]),]
    commonClonotypes2Unprod= topclones_sam2Unprod[topclones_sam2Unprod[,indSam2] %in% intersect(topclones_sam1Unprod[,indSam1],topclones_sam2Unprod[,indSam2]),]
    
    
    NinsertionsP1= mean(commonClonotypes1$N2INSERTION + commonClonotypes1$N1INSERTION)
    NinsertionsP2 = mean(commonClonotypes2$N2INSERTION + commonClonotypes2$N1INSERTION)
    
    NinsertionsUP1= mean(commonClonotypes1Unprod$N2INSERTION + commonClonotypes1Unprod$N1INSERTION)
    NinsertionsUP2 = mean(commonClonotypes2Unprod$N2INSERTION + commonClonotypes2Unprod$N1INSERTION)
    
    
    NinsertionProd1=c(NinsertionProd1,NinsertionsP1)
    NinsertionProd2= c(NinsertionProd2,NinsertionsP2)
    
    NinsertionUnprod1=c(NinsertionUnprod1,NinsertionsUP1)
    NinsertionUnprod2=c(NinsertionUnprod2,NinsertionsUP2)
    
    
    # write some common clonotypes of productive and unproductive repertoires
    # select columns to merge and write 
    
    selectedCols <- c("NUCLEOTIDE","AMINOACID","COUNT","VFAMILYNAME","VGENENAME","JGENENAME","CDR3LENGTH","SEQUENCESTATUS")
    selectedColsIdx <- which(colnames(commonClonotypes1) %in% selectedCols)
    
    # write for productive commons
    
    resultDir = paste("Pairwise_Overlap_results_","Productive_Unproductive(",unProdStatus,")",sep="")
    
    dir.create(file.path(resultDir),showWarnings = FALSE)
    
    
    commonsProdForWriting = head(merge(commonClonotypes1[,selectedColsIdx],commonClonotypes2[,selectedColsIdx],by="NUCLEOTIDE"),20)
    commonsUnProdForWriting = merge(commonClonotypes1Unprod[,selectedColsIdx],commonClonotypes2Unprod[,selectedColsIdx],by="NUCLEOTIDE")
    
    
    write.table(commonsProdForWriting,file=paste(resultDir,"/",samplesCompared,"_Common_productive_sequences_top20.txt",sep=""),row.names = F,sep = "\t")
    write.table(commonsUnProdForWriting,file=paste(resultDir,"/",samplesCompared,"_Common_Unproductive_sequences_all.txt",sep=""),row.names = F,sep = "\t")
    
    
    
    #totalNumberOfProdClonotypes= nrow(topclones_sam1Prod) + nrow(topclones_sam2Prod)
    #totalNumberOfUnprodClonotypes= nrow(topclones_sam1Unprod) + nrow(topclones_sam2Unprod)
    
    
    #unproductiveOverlapCountName = paste("OverlapCountUnProductive(",unProdStatus,")",sep="")
    
    if(nrow(toadd) > 0){
      newrow=data.frame(overlapValuesProd,overlapValuesTotalSam1Prod,overlapValuesTotalSam2Prod,overlapValuesUnprod,overlapValuesTotalSam1Unprod,overlapValuesTotalSam2Unprod,overlapValuesProdJI,overlapValuesUnprodJI,overlapValuesProdJI_abundancebased,overlapValuesUnprodJI_abundancebased,overlapValuesProdJI_abundancebased2,overlapValuesUnprodJI_abundancebased2,NinsertionProd1,NinsertionProd2,NinsertionUnprod1,NinsertionUnprod2)
      colnames(newrow) <- c("OverlapCountProductive","OverlapCountTotalProductiveSam1","OverlapCountTotalProductiveSam2","OverlapCountUnProductive","OverlapCountTotalUnProductiveSam1","OverlapCountTotalUnProductiveSam2","OverlapProd_JI","OverlapUnProductive_JI","AbundanceBasedProd_JI","AbundanceBasedUnProductive_JI","AbundanceBasedProd_JI2","AbundanceBasedUnProductive_JI2","NinsertionProdSample1","NinsertionProdSample2","NinsertionUnProductiveSample1","NinsertionUnProductiveSample2")
      
      toadd=rbind(toadd,newrow)
    }else{
      toadd=data.frame(overlapValuesProd,overlapValuesTotalSam1Prod,overlapValuesTotalSam2Prod,overlapValuesUnprod,overlapValuesTotalSam1Unprod,overlapValuesTotalSam2Unprod,overlapValuesProdJI,overlapValuesUnprodJI,overlapValuesProdJI_abundancebased,overlapValuesUnprodJI_abundancebased,overlapValuesProdJI_abundancebased2,overlapValuesUnprodJI_abundancebased2,NinsertionProd1,NinsertionProd2,NinsertionUnprod1,NinsertionUnprod2)
      colnames(toadd) <- c("OverlapCountProductive","OverlapCountTotalProductiveSam1","OverlapCountTotalProductiveSam2","OverlapCountUnProductive","OverlapCountTotalUnProductiveSam1","OverlapCountTotalUnProductiveSam2","OverlapProd_JI","OverlapUnProductive_JI","AbundanceBasedProd_JI","AbundanceBasedUnProductive_JI","AbundanceBasedProd_JI2","AbundanceBasedUnProductive_JI2","NinsertionProdSample1","NinsertionProdSample2","NinsertionUnProductiveSample1","NinsertionUnProductiveSample2")
    }
    
  }
  
  
  
  
  
  #head(toadd)
  #print(nrow(toadd))
  resultTable=data.frame(m,toadd)        
  
  outputFileName = paste(resultDir,"/","Pairwise_Overlap_Counts_",seqType,"_Productive_Unproductive(",unProdStatus,").txt",sep="")
  
  write.table(resultTable,file=outputFileName,row.names = T,sep = "\t")
  return(resultTable)
  
  
}




#### Making PCA for V or VJ usage ####

getPCAfordata <- function(dd,groups,scaling=T,oprefix="O"){
  # Assumes that the observations (samples) are on columns and variables on rows. We want to make the variables
  # on columns so it first transposes d.
  dd=t(dd)
  #head(d)
  
  pr<-prcomp(dd,scale=scaling)
  print(summary(pr)) # proportions of the total variance explained by each component
  
  #samples names
  sampleNames=rownames(dd)
  
  # get variance explained by each PC
  s=summary(pr)$importance[2,]
  
  # grouping of samples
  #groups=groups
  
  # Write and display PCA result
  
  flname = paste(oprefix,"PCA.pdf")
  pdf(file=flname, useDingbats = FALSE)
  
  tobedrawn=data.frame(pr$x,group=factor(groups),snames=rownames(dd))
  write.table(pr$x,file=paste(oprefix,"_PCArotatedData.txt"),row.names = T,col.names=T,sep = "\t")
  
  pcaP=ggplot(tobedrawn, aes(x=PC1, y=PC2,shape=group,colour=snames)) + xlab(paste("PC1 (",format(round(s[1]*100),nsmall=2),"%)")) + ylab(paste("PC2 (",format(round(s[2]*100),nsmall=2),"%)")) + geom_point(size=6) + scale_fill_brewer(palette="Set1") 
  pcaP = pcaP + theme_bw() + theme(panel.grid.minor = element_blank(),panel.grid.major = element_blank()) + theme(axis.line = element_line(colour = "black")) 
  
  
  
  
  print(pcaP)  
  #png(file=paste(oprefix,"PCA.png"),height=700, width=650)
  
  #Plot using ggplot, much more flexible
  #ggplot(as.data.frame(pr$x), aes(x=PC1, y=PC2)) + xlab(paste("PC1 (",format(round(s[1]*100),nsmall=2),"%)")) + ylab(paste("PC2 (",format(round(s[2]*100),nsmall=2),"%)")) + geom_point(size=6,aes(colour = sampleNames,shape=groups)) + scale_fill_brewer(palette="Set1") + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),panel.background = element_blank(), axis.line = element_line(colour = "black")) 
                                                                                                                                                                                                                                                                              
  
  dev.off()
  
  #ggsave(flname, plot = last_plot(), device = "pdf")
  
  #print(pcaP)
  
}

getPCAfordata1 <- function(dd,groups,oprefix="O"){
  # Assumes that the observations (samples) are on columns and variables on rows. We want to make the variables
  # on columns so it first transposes d.
  dd=t(dd)
  #head(d)
  
  pr<-prcomp(dd)
  print(summary(pr)) # proportions of the total variance explained by each component
  
  #samples names
  sampleNames=rownames(dd)
  
  # get variance explained by each PC
  s=summary(pr)$importance[2,]
  
  # grouping of samples
  #groups=groups
  
  # Write and display PCA result
  flname = paste(oprefix,"PCA.pdf")
  pdf(file=flname, useDingbats = FALSE)
  
  
  tobedrawn=data.frame(pr$x,group=factor(groups),snames=rownames(dd))
  write.table(pr$x,file=paste(oprefix,"_PCArotatedData.txt"),row.names = T,col.names=T,sep = "\t")
  
  pcaP=ggplot(tobedrawn, aes(x=PC1, y=PC2,shape=group,colour=snames)) + xlab(paste("PC1 (",format(round(s[1]*100),nsmall=2),"%)")) + ylab(paste("PC2 (",format(round(s[2]*100),nsmall=2),"%)")) + geom_point(size=6) + scale_fill_brewer(palette="Set1")
  pcaP = pcaP + theme_bw() + theme(panel.grid.minor = element_blank(),panel.grid.major = element_blank()) + theme(axis.line = element_line(colour = "black")) 
  
  
  #png(file=paste(oprefix,"PCA.png"),height=700, width=650)
  
  #Plot using ggplot, much more flexible
  #ggplot(as.data.frame(pr$x), aes(x=PC1, y=PC2)) + xlab(paste("PC1 (",format(round(s[1]*100),nsmall=2),"%)")) + ylab(paste("PC2 (",format(round(s[2]*100),nsmall=2),"%)")) + geom_point(size=6,aes(colour = sampleNames,shape=groups)) + scale_fill_brewer(palette="Set1") + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),panel.background = element_blank(), axis.line = element_line(colour = "black")) 
  print(pcaP)                                                                                                                                                                                                                                                                              
  
  dev.off()
  
  
  
  #print(pcaP)
  
}


getPCAfordata2 <- function(dd,groups,oprefix="O"){
  # Assumes that the observations (samples) are on columns and variables on rows. We want to make the variables
  # on columns so it first transposes d.
  dd=t(dd)
  #head(d)
  
  pr<-prcomp(dd)
  print(summary(pr)) # proportions of the total variance explained by each component
  
  #samples names
  sampleNames=rownames(dd)
  
  # get variance explained by each PC
  s=summary(pr)$importance[2,]
  
  # grouping of samples
  #groups=groups
  
  # Write and display PCA result
  flname = paste(oprefix,"PCA.pdf")
  pdf(file=flname, useDingbats = FALSE)
  
  tobedrawn=data.frame(pr$x,repType=factor(groups),snames=rownames(dd))
  write.table(pr$x,file=paste(oprefix,"_PCArotatedData.txt"),row.names = T,col.names=T,sep = "\t")
  
  #pcaP=ggplot(tobedrawn, aes(x=PC1, y=PC2,shape=repType,colour=repType,label=snames))
  # pcaP=ggplot(tobedrawn, aes(x=PC1, y=PC2,shape=repType,colour=repType,label=snames))
  pcaP=ggplot(tobedrawn, aes(x=PC1, y=PC2,label=snames)) + 
    xlab(paste("PC1 (",format(round(s[1]*100),nsmall=2),"%)")) + 
    ylab(paste("PC2 (",format(round(s[2]*100),nsmall=2),"%)")) + 
    scale_fill_brewer(palette="Set1")
  
  pcaP = pcaP + geom_text(size=3,check_overlap = TRUE,aes(colour = repType, fontface = "bold"),position=position_jitter())
  
  pcaP = pcaP + theme_bw() + theme(panel.grid.minor = element_blank(),panel.grid.major = element_blank()) + theme(axis.line = element_line(colour = "black")) 
  
  
  
  #png(file=paste(oprefix,"PCA.png"),height=700, width=650)
  
  #Plot using ggplot, much more flexible
  #ggplot(as.data.frame(pr$x), aes(x=PC1, y=PC2)) + xlab(paste("PC1 (",format(round(s[1]*100),nsmall=2),"%)")) + ylab(paste("PC2 (",format(round(s[2]*100),nsmall=2),"%)")) + geom_point(size=6,aes(colour = sampleNames,shape=groups)) + scale_fill_brewer(palette="Set1") + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),panel.background = element_blank(), axis.line = element_line(colour = "black")) 
  print(pcaP)                                                                                                                                                                                                                                                                              
  
  dev.off()
  
  
  
  #print(pcaP)
  
}

getPCAfordata3 <- function(dd,groups,oprefix="O"){
  # Assumes that the observations (samples) are on columns and variables on rows. We want to make the variables
  # on columns so it first transposes d.
  dd=t(dd)
  #head(d)
  
  pr<-prcomp(dd)
  print(summary(pr)) # proportions of the total variance explained by each component
  
  #samples names
  sampleNames=rownames(dd)
  
  # get variance explained by each PC
  s=summary(pr)$importance[2,]
  
  # grouping of samples
  #groups=groups
  
  # Write and display PCA result
  flname = paste(oprefix,"PCA.pdf")
  pdf(file=flname, useDingbats = FALSE)
  
  
  tobedrawn=data.frame(pr$x,repType=factor(groups),snames=rownames(dd))
  write.table(pr$x,file=paste(oprefix,"_PCArotatedData.txt"),row.names = T,col.names=T,sep = "\t")
  
  pcaP=ggplot(tobedrawn, aes(x=PC1, y=PC2,shape=repType,colour=repType)) + xlab(paste("PC1 (",format(round(s[1]*100),nsmall=2),"%)")) + ylab(paste("PC2 (",format(round(s[2]*100),nsmall=2),"%)")) + geom_point(size=6) + scale_fill_brewer(palette="Set1") 
  pcaP = pcaP + theme_bw() + theme(panel.grid.minor = element_blank(),panel.grid.major = element_blank()) + theme(axis.line = element_line(colour = "black")) 
  
  #png(file=paste(oprefix,"PCA.png"),height=700, width=650)
  
  #Plot using ggplot, much more flexible
  #ggplot(as.data.frame(pr$x), aes(x=PC1, y=PC2)) + xlab(paste("PC1 (",format(round(s[1]*100),nsmall=2),"%)")) + ylab(paste("PC2 (",format(round(s[2]*100),nsmall=2),"%)")) + geom_point(size=6,aes(colour = sampleNames,shape=groups)) + scale_fill_brewer(palette="Set1") + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),panel.background = element_blank(), axis.line = element_line(colour = "black")) 
  print(pcaP)                                                                                                                                                                                                                                                                              
  
  dev.off()
  
  
  
  #print(pcaP)
  
}


getPCAfordata4 <- function(dd,groups,scaling=T,oprefix="O"){
  # Assumes that the observations (samples) are on columns and variables on rows. We want to make the variables
  # on columns so it first transposes d.
  dd=t(dd)
  #head(d)
  
  pr<-prcomp(dd,scale=scaling)
  print(summary(pr)) # proportions of the total variance explained by each component
  
  #samples names
  sampleNames=rownames(dd)
  
  # get variance explained by each PC
  s=summary(pr)$importance[2,]
  
  # grouping of samples
  #groups=groups
  
  # Write and display PCA result
  
  flname = paste(oprefix,"PCA.pdf")
  pdf(file=flname, useDingbats = FALSE)
  
  tobedrawn=data.frame(pr$x,group=factor(groups),snames=rownames(dd))
  write.table(pr$x,file=paste(oprefix,"_PCArotatedData.txt"),row.names = T,col.names=T,sep = "\t")
  
  pcaP=ggplot(tobedrawn, aes(x=PC1, y=PC2,colour=group)) + xlab(paste("PC1 (",format(round(s[1]*100),nsmall=2),"%)")) + ylab(paste("PC2 (",format(round(s[2]*100),nsmall=2),"%)")) + geom_point(size=4) + scale_fill_brewer(palette="Set1") 
  pcaP = pcaP + theme_bw() + theme(panel.grid.minor = element_blank(),panel.grid.major = element_blank()) + theme(axis.line = element_line(colour = "black")) 
  
  
  
  
  print(pcaP)  
  #png(file=paste(oprefix,"PCA.png"),height=700, width=650)
  
  #Plot using ggplot, much more flexible
  #ggplot(as.data.frame(pr$x), aes(x=PC1, y=PC2)) + xlab(paste("PC1 (",format(round(s[1]*100),nsmall=2),"%)")) + ylab(paste("PC2 (",format(round(s[2]*100),nsmall=2),"%)")) + geom_point(size=6,aes(colour = sampleNames,shape=groups)) + scale_fill_brewer(palette="Set1") + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),panel.background = element_blank(), axis.line = element_line(colour = "black")) 
  
  
  dev.off()
  
  #ggsave(flname, plot = last_plot(), device = "pdf")
  
  #print(pcaP)
  
}


#### Fisher's exact test for V,J or VJ usage counts ####

compareUsageProportions<-function(freqTable,pcutoff=0.05){
  ### freqTable : an Feature frequency table between sample groups, table has headers, headers 2 and 3 have the counts
  
  sam=freqTable
  #hdnames= colnames(sam)
  
  #hlaAlleles <- as.data.frame(sam[,1],stringsAsFactors=FALSE )
  #colnames(hlaAlleles)  <- hdnames[1]
  
  #signficanceFound=T # a flag to check if significant comparison is found in each round
  #topsignificantAllele=F # a flag to hold the allele showing the most significant difference between sample groups in each round, will be removed during next round comparison
  
  resultTable <- sam
  pvals=c()
  ORs=c()
  CIs=c()
  
  print(paste("There are",ncol(sam),"samples in the data. Comparison is done for every possible pair of samples..."))
  
  print(paste("Number of features to compare : ",nrow(sam)))
  
  toploop=ncol(sam)-1
  inloop=ncol(sam)
  
  for (j in 1:toploop){
    
    for (k in j+1:inloop){
      #print(inloop)
      #print(j)
      #print(k)
      if (k > inloop){break}
      res=compareUsageInTwoSamples(sam[,c(j,k)])
      colnames(res) <- c(paste(colnames(sam[,c(j,k)])[1],colnames(sam[,c(j,k)])[2],"padjusted",sep="_"),paste(colnames(sam[,c(j,k)])[1],colnames(sam[,c(j,k)])[2],"OR",sep="_"),paste(colnames(sam[,c(j,k)])[1],colnames(sam[,c(j,k)])[2],"CI",sep="_"))
      resultTable <- cbind(resultTable,res) 
    }
    
  }
  
  print("Result written to file....")
  write.table(resultTable,file="Muke_Usage_fishers_comparison_result.txt",row.names = T,sep = "\t")
  return(resultTable)
  
  
}


compareUsageInTwoSamples<-function(freqTable,pcutoff=0.05){
  sam=freqTable
  pvals=c()
  ORs=c()
  CIs=c()
  
  print(head(sam))
  
  for(i in 1:nrow(sam)){
    
    current<-sam[i,]
    othersSum<-colSums(sam[-i,])
    #print(othersSum)
    
    dataForComparison=matrix(c(as.numeric(current),as.numeric(othersSum)),nrow = 2,byrow=T)
    #colnames(dataForComparison)<-c(hdnames[2], hdnames[3])
    #rownames(dataForComparison)<-c(as.character(sam[i,i]), "OtherAlleles")
    
    fresult=fisher.test(dataForComparison)
    pvals=c(pvals,fresult$p.value)
    ORs=c(ORs,as.numeric(fresult$estimate))
    CIs=c(CIs,paste(fresult$conf[1],fresult$conf[2],sep="-"))
    
    #pvals=p.adjust(pvals, "BH") # pvals are adjusted
    
    #cnames=c(paste(colnames(sam)[selectedCols],"pval",sep="") ,paste(colnames(sam)[selectedCols],"OR",sep=""),paste(colnames(sam)[selectedCols],"CI",sep=""))
    
    
  }
  
  pvals=p.adjust(pvals, "BH") # pvals are adjusted
  res = cbind(pvals,ORs,CIs)
  
  return(res)  
}


compareAbundanceInTwoSamplesForShared<-function(freqTable,pcutoff=0.05,sharedClonesOnly=T,minTotalAbundance=100){
  sam=freqTable
  pvals=c()
  ORs=c()
  CIs=c()
  
  print(head(sam))
  
  if(sharedClonesOnly==T){
    sharedClonesIndices = apply(sam,1,function(x) (sum(x>0) == 2 & sum(x) >= minTotalAbundance))
  }else{
    sharedClonesIndices = apply(sam,1,function(x) (sum(x) >= minTotalAbundance))
  }
  
  sharedCloneIndices = which(sharedClonesIndices)
  numOfShared = length(sharedCloneIndices)
  
    
  for(i in 1:numOfShared){
    
    current<-sam[sharedCloneIndices[i],]
    othersSum<-colSums(sam[-sharedCloneIndices[i],])
    #print(othersSum)
    
    dataForComparison=matrix(c(as.numeric(current),as.numeric(othersSum)),nrow = 2,byrow=T)
    #colnames(dataForComparison)<-c(hdnames[2], hdnames[3])
    #rownames(dataForComparison)<-c(as.character(sam[i,i]), "OtherAlleles")
    
    fresult=fisher.test(dataForComparison)
    pvals=c(pvals,fresult$p.value)
    ORs=c(ORs,as.numeric(fresult$estimate))
    CIs=c(CIs,paste(fresult$conf[1],fresult$conf[2],sep="-"))
    
    #pvals=p.adjust(pvals, "BH") # pvals are adjusted
    
    #cnames=c(paste(colnames(sam)[selectedCols],"pval",sep="") ,paste(colnames(sam)[selectedCols],"OR",sep=""),paste(colnames(sam)[selectedCols],"CI",sep=""))
    
    
  }
  
  pvals=p.adjust(pvals, "BH") # pvals are adjusted
  res = cbind(as.numeric(pvals),ORs,CIs)
  
  rownames(res) = rownames(sam[sharedCloneIndices,])
  
  return(res)  
}


compareUsageProportionsPaired<-function(freqTable,pcutoff=0.05,pairs){
  ### freqTable : an Feature frequency table between sample groups, table has headers, headers 2 and 3 have the counts
  
  sam=freqTable
  #hdnames= colnames(sam)
  
  #hlaAlleles <- as.data.frame(sam[,1],stringsAsFactors=FALSE )
  #colnames(hlaAlleles)  <- hdnames[1]
  
  #signficanceFound=T # a flag to check if significant comparison is found in each round
  #topsignificantAllele=F # a flag to hold the allele showing the most significant difference between sample groups in each round, will be removed during next round comparison
  
  resultTable <- sam
  pvals=c()
  ORs=c()
  CIs=c()
  
  
  
  print(paste("There are",ncol(sam),"samples in the data. Comparison is done for pair of samples..."))
  
  print(paste("Number of features to compare : ",nrow(sam)))
  
  rnds = length(pairs)/2
  for (k in 1:rnds){
      kpair = k + rnds
      res=compareUsageInTwoSamples(sam[,c(k,kpair)])
      colnames(res) <- c(paste(colnames(sam[,c(k,kpair)])[1],colnames(sam[,c(k,kpair)])[2],"padjusted",sep="_"),paste(colnames(sam[,c(k,kpair)])[1],colnames(sam[,c(k,kpair)])[2],"OR",sep="_"),paste(colnames(sam[,c(k,kpair)])[1],colnames(sam[,c(k,kpair)])[2],"CI",sep="_"))
      resultTable <- cbind(resultTable,res) 
    }
    
  
  
  print("Result written to file....")
  write.table(resultTable,file="Usage_fishers_comparison_result_forPairSamples.txt",row.names = T,sep = "\t")
  return(resultTable)
  
  
}


compareAbundanceInPairedSamples<-function(freqTable,pcutoff=0.01,pairs,writeResult=F,resultDir="matcheDA"){
  ### freqTable : an clone count table between sample groups, table has headers, headers 1 and 2 have the counts
  
  sam=freqTable
  
  resultlist <- list()
  pvals=c()
  ORs=c()
  CIs=c()
  
  
  
  print(paste("There are",ncol(sam),"samples in the data. Comparison is done for pair of samples..."))
  
  print(paste("Number of features to compare : ",nrow(sam)))
  
  
  # dir to write results to
    
  dir.create(file.path(resultDir),showWarnings = FALSE)
   
  
  # comparison for every pair of samples
  
  rnds = length(pairs)/2
  for (k in 1:rnds){
    kpair = k + rnds
    res=compareAbundanceInTwoSamplesForShared(sam[,c(k,kpair)],sharedClonesOnly=F)
    colnames(res) <- c(paste(colnames(sam[,c(k,kpair)])[1],colnames(sam[,c(k,kpair)])[2],"padjusted",sep="_"),paste(colnames(sam[,c(k,kpair)])[1],colnames(sam[,c(k,kpair)])[2],"OR",sep="_"),paste(colnames(sam[,c(k,kpair)])[1],colnames(sam[,c(k,kpair)])[2],"CI",sep="_"))
    
    print(str(res[,1]))
    
    
    siglist = as.numeric(res[,1]) < pcutoff
    
    resSig = res[siglist,]
    
    print(head(siglist))
    
    
    resultlist[[colnames(sam[,c(k,kpair)])[1]]] <- cbind(sam[rownames(resSig),c(k,kpair)],resSig)
    
    if(writeResult==T){
    print("Result written to file....")
    write.table(res,file=paste(resultDir,"/",colnames(sam[,c(k,kpair)])[1],"_",colnames(sam[,c(k,kpair)])[2],"_Abundance_fishers_comparison_result_forPairSamples.txt",sep=""),row.names = T,sep = "\t")
    }
   
    
      
  
  }
  
  return(resultlist)
  
  
}




#### composition based comparison of repertoires using 4-mer nt usage frequencies ####

getKmerNtFrequencies <- function(reps,kMer=4,asProb=F,vGenes=NULL,seqStatus="prod"){
  
  kmerFreqTable = NULL
  
  for(rep in reps){
    print(rep)
    p <- get(rep)
    
    if(seqStatus=="prod"){
      p <- getProductive(p)
    }else {
      p <- getUnproductive(p)
    }
    
  
    if(!is.null(vGenes)){    
      p <- p[p$VGENENAME %in% vGenes, ]
    }  
    
    print(nrow(p))
    #print(p[,c(8,22)])
    
    seqs <- unique(p$NUCLEOTIDE) # select unique AA clonotypes
    seqSet <- DNAStringSet(seqs)
    seq_mers <- oligonucleotideFrequency(seqSet,width=kMer,as.prob=asProb,simplify.as="collapsed")
    kmerFreqTable = rbind(kmerFreqTable,seq_mers)
    
  }
  
  
  rownames(kmerFreqTable) <- reps
  
  kmerFreqTable <- t(kmerFreqTable)
  
  print("Writing result to file....")
  
  write.table(kmerFreqTable,file="4merFrequencyTable.txt",row.names = T,col.names=T,sep = "\t")
  
  return(kmerFreqTable)
  
}

getKmerNtFrequenciesVJ <- function(reps,kMer=4,asProb=F,vjGenes=NULL,seqStatus="prod"){
  
  kmerFreqTable = NULL
  
  for(rep in reps){
    print(rep)
    p <- get(rep)
    
    if(seqStatus=="prod"){
      p <- getProductive(p)
    }else {
      p <- getUnproductive(p)
    }
    
    tempd = NULL
    if(!is.null(vjGenes)){
      
      res=strsplit(vjGenes," ")
      for(i in 1:length(res)){
        
        tempd <- rbind(tempd,p[(p$VGENENAME==res[[i]][1]) & (p$JGENENAME==res[[i]][2]),])
      }
    
      p <- tempd
      
    }  
   
  print(nrow(p))
  #print(p[,c(8,22)])
    
  seqs <- unique(p$NUCLEOTIDE) # select unique AA clonotypes
  seqSet <- DNAStringSet(seqs)
  seq_mers <- oligonucleotideFrequency(seqSet,width=kMer,as.prob=asProb,simplify.as="collapsed")
  kmerFreqTable = rbind(kmerFreqTable,seq_mers)
  
  }
  
 
  rownames(kmerFreqTable) <- reps
  
  kmerFreqTable <- t(kmerFreqTable)
  
  print("Writing result to file....")
  
  write.table(kmerFreqTable,file="4merFrequencyTable.txt",row.names = T,col.names=T,sep = "\t")
  
  return(kmerFreqTable)
  
}

getClonalKmerNtFrequencies <- function(reps,selectionType="random",nClones=100,kMer=4,asProb=F,vGenes=NULL,seqStatus="prod"){
  
  kmerFreqTable = NULL
  repnames = c()
  
  for(rep in reps){
    print(rep)
    p <- get(rep)
    
    if(seqStatus=="prod"){
      p <- getProductive(p)
    }else {
      p <- getUnproductive(p)
    }
    
    #select clones with given list of Vgenes
    
    if(!is.null(vGenes)){ 
      p <- p[p$VGENENAME %in% vGenes,]
    }  
    
    # then select 100 clonotypes
    
    if(selectionType=="random"){ 
    sampling.index = sample(1:nrow(p),nClones,replace=F)  
    }else{
      # just pick the top nClones
      sampling.index = 1:nClones  
    }
    
    p <- p[sampling.index,]
    
   
    
    print(nrow(p))
    
    seqs <- unique(p$NUCLEOTIDE) # select unique AA clonotypes
    seqSet <- DNAStringSet(seqs)
    seq_mers <- oligonucleotideFrequency(seqSet,width=kMer,as.prob=asProb)
    kmerFreqTable = rbind(kmerFreqTable,seq_mers)
    
    repnames = c(repnames,rep(rep,nClones))
  }
  
  rownames(kmerFreqTable) <- repnames  
  kmerFreqTable <- t(kmerFreqTable)
  
  print("Writing result to file....")
  
  write.table(kmerFreqTable,file="clonal4merFrequencyTable.txt",row.names = T,col.names=T,sep = "\t")
  
  return(list(freqTable = kmerFreqTable,sams = repnames))
  
}
dyohanne/RepAn documentation built on Feb. 3, 2023, 2:41 p.m.