igfuns: Analysis of Ig sequencing data

#' A sample of PBMC sequences from the NGS170804 project.
#'
#' This dataset contains the V-gene family, sample ID, patient status,
#' isotype label, number of mutations and germline divergence score.
#' Columns: V, mutations, div_germ, sample, status, label_subtype.
#'
#' @format A data frame with 4000 rows and 6 variables:
#' \describe{
#'   \item{V}{V-gene segment}
#'   \item{mutations}{number of mutations}
#'   \item{div_germ}{divergence from germline, in percentage}
#'   \item{sample}{sample code}
#'   \item{status}{status of the patient, early-treated (PHI) or late-treated (CHI)}
#'   \item{label_subtype}{isotype label, either IgA or IgG in this project}
#' }
"pbmc_div_mut"

#' A sample of MUCOSA sequences from the NGS170804 project.
#'
#' This dataset contains the V-gene family, sample ID, patient status,
#' isotype label, number of mutations and germline divergence score.
#' Columns: V, mutations, div_germ, sample, status, label_subtype.
#'
#' @format A data frame with 16000 rows and 6 variables:
#' \describe{
#'   \item{V}{V-gene segment}
#'   \item{mutations}{number of mutations}
#'   \item{div_germ}{divergence from germline, in percentage}
#'   \item{sample}{sample code}
#'   \item{status}{status of the patient: early-treated (PHI), late-treated (CHI), non-treated (NT) or healthy donor (HD)}
#'   \item{label_subtype}{isotype label, either IgA or IgG in this project}
#' }
"mucosa_div_mut"

#' Single-cell data.
#'
#' This dataset contains, for each named sequence, the V/D/J gene
#' assignments, the FR and CDR sequences (in amino acids and in nucleotides),
#' the number of mutations in the V-region, and the sample, compartment and
#' patient status it belongs to.
#'
#' @format A data frame with 623 rows and 35 variables:
#' \describe{
#'   \item{alnstart}{alignment start on the subject}
#'   \item{b_cell_subset}{b_cell_subset}
#'   \item{cdr1}{CDR1 sequence, in AA}
#'   \item{cdr1_seq}{CDR1 sequence, in nt}
#'   \item{cdr2}{CDR2 sequence, in AA}
#'   \item{cdr2_seq}{CDR2 sequence, in nt}
#'   \item{cdr3}{CDR3 sequence, in AA}
#'   \item{cdr3_seq}{CDR3 sequence, in nt}
#'   \item{chain_type}{type of the chain, either heavy or light}
#'   \item{compartment}{the compartment}
#'   \item{dh}{D-gene information, with subfamily}
#'   \item{dh2}{D-gene information, 2 digits precision (allele information)}
#'   \item{evalue}{BLAST e-value; less than 10e-3 is good}
#'   \item{fr1}{FR1 sequence, in AA}
#'   \item{fr1_seq}{FR1 sequence, in nt}
#'   \item{fr2}{FR2 sequence, in AA}
#'   \item{fr2_seq}{FR2 sequence, in nt}
#'   \item{fr3}{FR3 sequence, in AA}
#'   \item{fr3_seq}{FR3 sequence, in nt}
#'   \item{fr4}{FR4 sequence, in AA}
#'   \item{fr4_seq}{FR4 sequence, in nt}
#'   \item{inframe}{is the CDR3 stop IN-FRAME}
#'   \item{jh}{J-gene information, with subfamily}
#'   \item{jh2}{J-gene information, 2 digits precision (allele information)}
#'   \item{length}{CDR3 length, in nt}
#'   \item{mutations}{number of mutations in the V-region}
#'   \item{name}{sequence name}
#'   \item{sample}{sample code}
#'   \item{status}{status of the patient, early-treated (PHI) or late-treated (CHI)}
#'   \item{stop}{stop codon, boolean YES or NO}
#'   \item{strand}{DNA strand, either '+' or '-'}
#'   \item{vh}{V-gene information, with subfamily}
#'   \item{vh2}{V-gene information, 2 digits precision (allele information)}
#'   \item{wgxg}{mask stating whether the CDR3 is well defined or not, boolean YES or NO}
#'   \item{wgxg_2}{mask stating whether the CDR3 is well defined or not [this delimitation is not IN-FRAME], boolean YES or NO}
#' }
"sc_lb"

# Disabled \item entries for the "bregs" documentation, kept for reference.
# They live outside the roxygen block so that the `#'` lines below form one
# contiguous block.
#   \item{vh2}{V-gene information, 2 digits precision (allele information)}
#   \item{dh2}{D-gene information, 2 digits precision (allele information)}
#   \item{jh2}{J-gene information, 2 digits precision (allele information)}
#   \item{fr1_seq}{FR1 sequence, in nt}
#   \item{cdr1_seq}{CDR1 sequence, in nt}
#   \item{fr2_seq}{FR2 sequence, in nt}
#   \item{cdr2_seq}{CDR2 sequence, in nt}
#   \item{fr3_seq}{FR3 sequence, in nt}
#   \item{cdr3_seq}{CDR3 sequence, in nt}
#   \item{fr4_seq}{FR4 sequence, in nt}
#   \item{stop}{stop codon, boolean YES or NO}
#   \item{wgxg}{mask stating whether the CDR3 is well defined or not, boolean YES or NO}
#   \item{wgxg2}{mask stating whether the CDR3 is well defined or not [this delimitation is not IN-FRAME], boolean YES or NO}
#   \item{inframe}{is the CDR3 stop IN-FRAME}
#   \item{alnstart}{alignment start on the subject}

#' Sequencing data in cord blood from mothers.
#'
#' A dataset containing the molecular characterization information of
#' antibody sequences.
#'
#' @format A data frame with 801 rows and 28 variables:
#' \describe{
#'   \item{name}{sequence name}
#'   \item{chain_type}{type of the chain, either heavy or light}
#'   \item{vh}{V-gene information, with subfamily}
#'   \item{dh}{D-gene information, with subfamily}
#'   \item{jh}{J-gene information, with subfamily}
#'   \item{evalue}{BLAST e-value; less than 10e-3 is good}
#'   \item{strand}{DNA strand, either '+' or '-'}
#'   \item{fr1}{FR1 sequence, in AA}
#'   \item{cdr1}{CDR1 sequence, in AA}
#'   \item{fr2}{FR2 sequence, in AA}
#'   \item{cdr2}{CDR2 sequence, in AA}
#'   \item{fr3}{FR3 sequence, in AA}
#'   \item{cdr3}{CDR3 sequence, in AA}
#'   \item{fr4}{FR4 sequence, in AA}
#'   \item{pos_ch_cdr3}{number of positive charges in CDR3}
#'   \item{neg_ch_cdr3}{number of negative charges in CDR3}
#'   \item{gravy_cdr3}{hydrophobicity in CDR3}
#'   \item{pos_ch_vreg}{number of positive charges in V-Region}
#'   \item{neg_ch_vreg}{number of negative charges in V-Region}
#'   \item{gravy_vreg}{hydrophobicity in V-Region}
#'   \item{length}{CDR3 length, in nt}
#'   \item{length_v}{V-REGION length, in nt}
#'   \item{mutations}{number of mutations in the V-region}
#'   \item{b_cell_subset}{b_cell_subset}
#'   \item{sample}{the sample}
#'   \item{compartment}{the compartment}
#'   \item{v}{V gene, family only}
#'   \item{j}{J gene, family only}
#' }
#' @source \url{http://www.diamondse.info/}
"bregs"


#' Sequencing data sample from an NGS project.
#'
#' A dataset containing the molecular characterization information of
#' antibody sequences.
#'
#' @format A data frame with 421194 rows and 66 variables:
#' \describe{
#'   \item{cdr3}{CDR3 sequence, in AA}
#'   \item{cdr3_seq}{CDR3 sequence, in nt}
#'   \item{chain_type}{type of the chain, either heavy or light}
#'   \item{clstr_id}{id of the cd-hit group}
#'   \item{color}{color code for the clonal family}
#'   \item{compartment}{compartment: PBMC/MUCOSA}
#'   \item{dbtype}{database type: nt or aa}
#'   \item{depth}{depth of a read, based on cd-hit}
#'   \item{dh}{D-gene information, with subfamily}
#'   \item{dh2}{D-gene information, 2 digits precision (allele information)}
#'   \item{div_germ}{divergence from germline, in percentage}
#'   \item{fraction}{CDR3 similarity threshold}
#'   \item{fraction.1}{cd-hit similarity score}
#'   \item{FREQ_R_CDR1}{Replacement frequency in CDR1}
#'   \item{FREQ_R_CDR2}{Replacement frequency in CDR2}
#'   \item{FREQ_R_FR1}{Replacement frequency in FR1}
#'   \item{FREQ_R_FR2}{Replacement frequency in FR2}
#'   \item{FREQ_R_FR3}{Replacement frequency in FR3}
#'   \item{FREQ_R_VREG}{Replacement frequency in VREG}
#'   \item{FREQ_S_CDR1}{Silent frequency in CDR1}
#'   \item{FREQ_S_CDR2}{Silent frequency in CDR2}
#'   \item{FREQ_S_FR1}{Silent frequency in FR1}
#'   \item{FREQ_S_FR2}{Silent frequency in FR2}
#'   \item{FREQ_S_FR3}{Silent frequency in FR3}
#'   \item{FREQ_S_VREG}{Silent frequency in VREG}
#'   \item{gravy_cdr3}{hydrophobicity in CDR3}
#'   \item{id2}{clonotype family id}
#'   \item{jh}{J-gene information, with subfamily}
#'   \item{jh2}{J-gene information, 2 digits precision (allele information)}
#'   \item{label_subtype}{isotype label: IgA, IgG, IgM}
#'   \item{length}{CDR3 length, in nt}
#'   \item{mutations}{number of mutations in the V-region}
#'   \item{name}{sequence name}
#'   \item{ncl_mut_cdr1}{number of nt mutations in the CDR1}
#'   \item{ncl_mut_cdr2}{number of nt mutations in the CDR2}
#'   \item{ncl_mut_fr1}{number of nt mutations in the FR1}
#'   \item{ncl_mut_fr2}{number of nt mutations in the FR2}
#'   \item{ncl_mut_fr3}{number of nt mutations in the FR3}
#'   \item{neg_ch_cdr3}{negative charges in CDR3}
#'   \item{pos_ch_cdr3}{positive charges in CDR3}
#'   \item{R_CDR1}{Ratio R_CDR1}
#'   \item{R_CDR2}{Ratio R_CDR2}
#'   \item{R_FR1}{R_FR1}
#'   \item{R_FR2}{R_FR2}
#'   \item{R_FR3}{R_FR3}
#'   \item{R_VREG}{R_VREG}
#'   \item{RS_CDR1}{Ratio RS_CDR1}
#'   \item{RS_CDR2}{Ratio RS_CDR2}
#'   \item{RS_FR1}{RS_FR1}
#'   \item{RS_FR2}{RS_FR2}
#'   \item{RS_FR3}{RS_FR3}
#'   \item{RS_VREG}{RS_VREG}
#'   \item{run}{run id}
#'   \item{S_CDR1}{Ratio S_CDR1}
#'   \item{S_CDR2}{Ratio S_CDR2}
#'   \item{S_FR1}{S_FR1}
#'   \item{S_FR2}{S_FR2}
#'   \item{S_FR3}{S_FR3}
#'   \item{S_VREG}{S_VREG}
#'   \item{sample}{the sample}
#'   \item{status}{status of the patient, early-treated (PHI) or late-treated (CHI)}
#'   \item{subtype}{isotype found by BLAST}
#'   \item{V}{V-gene segment family}
#'   \item{vh}{V-gene information, with subfamily}
#'   \item{vh2}{V-gene information, 2 digits precision (allele information)}
#'   \item{X}{index created by R}
#' }
#' @source \url{http://www.diamondse.info/}
"orleans"