R/common.R

Defines functions .filter_na_zerovar .filter_overimputed ordermeta_mset imputation_metrics

Documented in imputation_metrics ordermeta_mset

## All ----

#### subsetting (MultiDataSet) ####

#' @rdname subsetting
#' @export
setMethod("subsetting", signature(x = "MultiDataSet"),
          function(x,
                   set.c = NULL,
                   genes.vc = "all",
                   sex.vc = "all",
                   tissues.vc = "all",
                   common_samples.l = FALSE,
                   na_thresh.n = 0.2,
                   var_thresh.n = 1e-5,
                   imputed_thresh.n = 0.2) {
            
            if (length(genes.vc) == 1 && genes.vc == "all")
              genes.vc <- c("WT", LATMX2::genes.vc())
            stopifnot(all(genes.vc %in% c("WT", LATMX2::genes.vc())))
            
            if (length(sex.vc) == 1 && sex.vc == "all")
              sex.vc <- LATMX2::sex.vc()
            stopifnot(all(sex.vc %in% LATMX2::sex.vc()))
            
            stopifnot(length(tissues.vc) == 1)
            
            if (tissues.vc != "all") {
              
              stopifnot(tissues.vc %in% LATMX2::tissues.vc())
              
              x <- x[, names(x)[names(x) %in% c("clinics",
                                                            grep(tissues.vc,
                                                                 LATMX2::sets.vc(),
                                                                 value = TRUE))]]
              
            }
            
            if (common_samples.l) {
              x <- MultiDataSet::commonSamples(x)
              ref.eset <- x[[names(x)[1]]]
            } else if ("clinics" %in% names(x)) {
              ref.eset <- x[["clinics"]]
            } else {
              sample_names.ls <- Biobase::sampleNames(x)
              all_samples.vc <- Reduce("union", sample_names.ls)
              ref_eset.i <- which.max(unlist(lapply(Biobase::sampleNames(x),
                                                    function(sample_names.vc)
                                                      sum(sample_names.vc %in% all_samples.vc))))[1]
              ref.eset <- x[[names(x)[[ref_eset.i]]]]
            }
            
            samples.vc <- Biobase::sampleNames(ref.eset)
            samples_sel.vl <- Biobase::pData(ref.eset)[, "gene"] %in% genes.vc &
              Biobase::pData(ref.eset)[, "sex"] %in% sex.vc
            
            sub.mset <- x[samples.vc[samples_sel.vl], ]
            
            sub_sets.vc <- names(sub.mset)
            
            for (set.c in sub_sets.vc) {
              
              eset <- sub.mset[[set.c]]
              
              eset <- subsetting(x = eset,
                                 set.c = set.c,
                                 genes.vc = genes.vc,
                                 sex.vc = sex.vc,
                                 tissues.vc = NULL,
                                 common_samples.l = NULL,
                                 na_thresh.n = na_thresh.n,
                                 var_thresh.n = var_thresh.n,
                                 imputed_thresh.n = imputed_thresh.n)
              
              sub.mset <- MultiDataSet::add_eset(sub.mset,
                                                 eset,
                                                 dataset.type = set.c,
                                                 GRanges = NA,
                                                 overwrite = TRUE,
                                                 warnings = FALSE)
              
            }
            
            sub.mset <- sub.mset[, sub_sets.vc]
            
            invisible(sub.mset)
            
          })


#### subsetting (ExpressionSet) ####

#' @rdname subsetting
#' @export
setMethod("subsetting", signature(x = "ExpressionSet"),
          function(x,
                   set.c,
                   genes.vc = "all",
                   sex.vc = "all",
                   tissues.vc = NULL,
                   common_samples.l = NULL,
                   na_thresh.n = 0.2,
                   var_thresh.n = 1e-5,
                   imputed_thresh.n = 0.2) {
            
            if (length(genes.vc) == 1 && genes.vc == "all")
              genes.vc <- LATMX2::wtgenes.vc()
            stopifnot(all(genes.vc %in% LATMX2::wtgenes.vc()))
            
            if (length(sex.vc) == 1 && sex.vc == "all")
              sex.vc <- LATMX2::sex.vc()
            stopifnot(all(sex.vc %in% LATMX2::sex.vc()))
            
            samples.vc <- Biobase::sampleNames(x)
            
            samples_sel.vl <- Biobase::pData(x)[, "gene"] %in% genes.vc &
              Biobase::pData(x)[, "sex"] %in% sex.vc
            
            x <- x[, samples.vc[samples_sel.vl]]
            
            filter.vi <- c(nas_zerovar = NA_integer_,
                           overimputed = NA_integer_)
            
            # NAs <= 20% and variance >= 1e-5
            na_zerovar_sel.vl <- ProMetIA:::.filter_na_zerovar(t(Biobase::exprs(x)),
                                                               na_thresh.n = na_thresh.n,
                                                               var_thresh.n = var_thresh.n)
            
            filter.vi["nas_zerovar"] <- sum(!na_zerovar_sel.vl)
            
            # proteomics: observations >= 80% in at least one condition
            if (grepl("proteomics", set.c)) {
              overimputed_sel.vl <- ProMetIA:::.filter_overimputed(eset = x,
                                                                   genes.vc = genes.vc,
                                                                   sex.vc = sex.vc,
                                                                   imputed_thresh.n = imputed_thresh.n)
            } else
              overimputed_sel.vl <- rep(TRUE, dim(x)["Features"])
            
            filter.vi["overimputed"] <- sum(!overimputed_sel.vl)
            
            # intersection of both conditions
            feat_sel.vl <- na_zerovar_sel.vl & overimputed_sel.vl
            
            stopifnot(length(feat_sel.vl) == dim(x)["Features"] &&
                        !any(is.na(feat_sel.vl)))
            
            x <- x[feat_sel.vl, ]
            
            if (sum(!feat_sel.vl))
              message("Nb of discard feat. in '", set.c, "': ",
                      paste(paste0(names(filter.vi), ": ", filter.vi), collapse = ", "))
            
            invisible(x)
            
          })


.filter_na_zerovar <- function(input.mn,
                               na_thresh.n,
                               var_thresh.n) {
  
  # removing variables with > 20% NA (including 100% NA in females)
  feat_na.vn <- apply(input.mn, 2, function(feat.vn)
    sum(is.na(feat.vn)) / length(feat.vn))
  feat_notna.vl <- feat_na.vn <= na_thresh.n
  # sum(feat_notna.vl)
  
  # removing variables with variance < 1e-5
  feat_var.vn <- apply(input.mn, 2, function(feat.vn)
    var(feat.vn, na.rm = TRUE))
  feat_notzerovar.vl <- !is.na(feat_var.vn) &
    (feat_var.vn >= var_thresh.n)
  # sum(feat_notzerovar.vl)
  
  feat_sel.vl <- feat_notna.vl & feat_notzerovar.vl
  
  stopifnot(length(feat_sel.vl) == ncol(input.mn) &&
              !any(is.na(feat_sel.vl)))
  
  feat_sel.vl
  
}


.filter_overimputed <- function(eset,
                                genes.vc,
                                sex.vc,
                                imputed_thresh.n) {
  
  fdata.df <- Biobase::fData(eset)
  
  imputed.vc <- paste0("imputed_", Biobase::sampleNames(eset))
  
  stopifnot(all(imputed.vc %in% colnames(fdata.df)))
  
  imputed.mi <- fdata.df[, imputed.vc]
  
  if (length(genes.vc) >= 2) {
    # general case: no restriction about sex
    # or e.g. LAT vs WT on males (or females) only
    
    factor.fc <- factor(Biobase::pData(eset)[, "gene"],
                        levels = LATMX2::wtgenes.vc()[LATMX2::wtgenes.vc() %in% genes.vc])
    
  } else if (length(genes.vc) == 1 && length(sex.vc) == 2) {
    # males vs females on LAT (or WT) only
    
    factor.fc <- factor(Biobase::pData(eset)[, "sex"],
                        levels = LATMX2::sex.vc())
    
  } else
    stop("The corresponding imputation metric for this combination of genotype(s) and gender(s) is not currently available.",
         call. = FALSE)
  
  imputed_factor.mi <- t(apply(imputed.mi, 1, function(var.vn) {
    tapply(var.vn, factor.fc, sum)
  }))
  imputed_prop.mn <- sweep(imputed_factor.mi, 2, table(factor.fc), "/")
  
  imputed.ml <- imputed_prop.mn >= imputed_thresh.n
  imputed.vn <- rowSums(imputed.ml, na.rm = TRUE)
  
  feat_sel.vl <- imputed.vn <= 1
  
  stopifnot(length(feat_sel.vl) == dim(eset)["Features"] &&
              !any(is.na(feat_sel.vl)))
  
  feat_sel.vl
  
}


#' Moving the 'supp_' columns at the end of the sampleMetadata and variableMetadata.
#' These columns originate from the preprocessing and are kept only as supplementary
#' information about the variables (e.g. quality metrics)
#' For the proteomics datasets, the 'imputed' columns of variableMetadata (to be
#' used to filter overimputed variables when comparing two conditions) are placed
#' just before the 'supp_' columns.
#' @export
ordermeta_mset <- function(mset,
                           imputed_prefix.c = "imputed",
                           supp_prefix.c = "supp_") {
  
  mset_names.vc <- names(mset)
  
  for (set.c in mset_names.vc) {
    
    eset <- mset[[set.c]]
    
    pdata.df <- Biobase::pData(eset)
    p_supp.vi <- grep(paste0("^", supp_prefix.c), colnames(pdata.df))
    pdata.df <- pdata.df[, c(setdiff(1:ncol(pdata.df), p_supp.vi), p_supp.vi)]
    Biobase::pData(eset) <- pdata.df
    
    fdata.df <- Biobase::fData(eset)
    f_supp.vi <- grep(paste0("^", supp_prefix.c), colnames(fdata.df))
    if (grepl("proteomics", set.c)) {
      f_impsupp.vi <- c(grep(paste0("^", imputed_prefix.c), colnames(fdata.df)),
                        f_supp.vi)
    } else
      f_impsupp.vi <- f_supp.vi
    fdata.df <- fdata.df[, c(setdiff(1:ncol(fdata.df), f_impsupp.vi), f_impsupp.vi)]
    Biobase::fData(eset) <- fdata.df
    
    stopifnot(methods::validObject(eset))
    
    mset <- MultiDataSet::add_eset(mset,
                                   eset,
                                   dataset.type = set.c,
                                   GRanges = NA,
                                   overwrite = TRUE,
                                   warnings = FALSE)
    
  }
  
  mset <- mset[, mset_names.vc]
  
  mset
  
}

## Proteomics ----

#' Computing the 'imputation' sample nb and percent (i.e. 'imputed by Prostar')
#' for each variable (for the variables with too high imputation metric in both
#' genotype conditions to be removed in the statistical analysis )
#' @export
imputation_metrics <- function(eset) {
  
  prot_pda.df <- Biobase::pData(eset)
  prot_fda.df <- Biobase::fData(eset)
  prot_samp.vc <- Biobase::sampleNames(eset)
  
  ## checking that the sample names are ordered by increasing ID
  prot_samp.vi <- as.integer(substr(prot_samp.vc, 2, 4))
  stopifnot(identical(prot_samp.vi, sort(prot_samp.vi)))
  
  ## getting imputation info
  value_origin.df <- prot_fda.df[, grep("OriginOfValueabundance",
                                        colnames(prot_fda.df), value = TRUE)]
  colnames(value_origin.df) <- gsub("_run90methode30K",
                                    "",
                                    gsub("_mgf", "",
                                         gsub("supp_OriginOfValueabundance_", "",
                                              colnames(value_origin.df))))
  
  ## re-ordering imputation info to match sample names
  if (Biobase::experimentData(eset)@title == "proteomics_liver") {
    file_to_sample.vc <- prot_pda.df[, "supp_sample name"]
    names(file_to_sample.vc) <- gsub("abundance_", "",
                                     gsub(".mgf", "",
                                          prot_pda.df[, "supp_Sample.name"], fixed = TRUE))
    colsel.vl <- colnames(value_origin.df) %in% names(file_to_sample.vc)
    value_origin.df <- value_origin.df[, colsel.vl]
    colnames(value_origin.df) <- file_to_sample.vc[colnames(value_origin.df)]
  }
  
  value_samp.vi <- as.integer(colnames(value_origin.df), 1, 3)
  value_origin.df <- value_origin.df[, order(value_samp.vi)]
  
  ## getting genotype factor
  gene.fc <- factor(prot_pda.df[, "gene"],
                    levels = c("WT", LATMX2::genes.vc()))
  
  ## getting sex factor
  sex.fc <- factor(prot_pda.df[, "sex"],
                   levels = LATMX2::sex.vc())
  
  ## WT, LAT and MX2 imputation metric
  imputed.mn <- t(apply(value_origin.df, 1, function(value.vc) {
    tapply(value.vc, gene.fc, function(x) sum(DAPAR::is.MV(x)))
  }))
  imputed.mn <- round(sweep(imputed.mn, 2, table(gene.fc), "/"), 2)
  colnames(imputed.mn) <- paste0("imputed_mfWLX_", colnames(imputed.mn))
  
  prot_fda.df <- cbind.data.frame(prot_fda.df,
                                  imputed.mn)
  
  if (Biobase::experimentData(eset)@title == "proteomics_liver") {
    
    ## LAT and WT imputation metric by sex
    ### M
    mWL_sel.vl <- gene.fc %in% c("WT", "LAT") & sex.fc == "M"
    mWL_gene.fc <- factor(gene.fc[mWL_sel.vl])
    mWL_value.df <- value_origin.df[, mWL_sel.vl]
    
    mWL_imputed.mn <- t(apply(mWL_value.df, 1, function(value.vc) {
      tapply(value.vc, mWL_gene.fc, function(x) sum(DAPAR::is.MV(x)))
    }))
    mWL_imputed.mn <- round(sweep(mWL_imputed.mn, 2,
                                  table(mWL_gene.fc), "/"), 2)
    colnames(mWL_imputed.mn) <- paste0("imputed_mWL_",
                                       colnames(mWL_imputed.mn))
    
    ### F
    fWL_sel.vl <- gene.fc %in% c("WT", "LAT") & sex.fc == "F"
    fWL_gene.fc <- factor(gene.fc[fWL_sel.vl])
    fWL_value.df <- value_origin.df[, fWL_sel.vl]
    
    fWL_imputed.mn <- t(apply(fWL_value.df, 1, function(value.vc) {
      tapply(value.vc, fWL_gene.fc, function(x) sum(DAPAR::is.MV(x)))
    }))
    fWL_imputed.mn <- round(sweep(fWL_imputed.mn, 2,
                                  table(fWL_gene.fc), "/"), 2)
    colnames(fWL_imputed.mn) <- paste0("imputed_fWL_",
                                       colnames(fWL_imputed.mn))
    
    ## M and F imputation by (LAT/WT)
    ### WT
    mfW_sel.vl <- gene.fc == "WT"
    mfW_sex.fc <- factor(sex.fc[mfW_sel.vl])
    mfW_value.df <- value_origin.df[, mfW_sel.vl]
    
    mfW_imputed.mn <- t(apply(mfW_value.df, 1, function(value.vc) {
      tapply(value.vc, mfW_sex.fc, function(x) sum(DAPAR::is.MV(x)))
    }))
    mfW_imputed.mn <- round(sweep(mfW_imputed.mn, 2,
                                  table(mfW_sex.fc), "/"), 2)
    colnames(mfW_imputed.mn) <- paste0("imputed_mfW_",
                                       colnames(mfW_imputed.mn)) 
    
    ### LAT
    mfL_sel.vl <- gene.fc == "LAT"
    mfL_sex.fc <- factor(sex.fc[mfL_sel.vl])
    mfL_value.df <- value_origin.df[, mfL_sel.vl]
    
    mfL_imputed.mn <- t(apply(mfL_value.df, 1, function(value.vc) {
      tapply(value.vc, mfL_sex.fc, function(x) sum(DAPAR::is.MV(x)))
    }))
    mfL_imputed.mn <- round(sweep(mfL_imputed.mn, 2,
                                  table(mfL_sex.fc), "/"), 2)
    colnames(mfL_imputed.mn) <- paste0("imputed_mfL_",
                                       colnames(mfL_imputed.mn))
    
    
    prot_fda.df <- cbind.data.frame(prot_fda.df,
                                    mWL_imputed.mn,
                                    fWL_imputed.mn,
                                    mfW_imputed.mn,
                                    mfL_imputed.mn)
    
  }
  
  Biobase::fData(eset) <- prot_fda.df
  
  eset
  
}
ProMetIS/ProMetIA documentation built on March 6, 2020, 2:11 a.m.