R/data_validation_functions.R

Defines functions sync_candidates_and_background

sync_candidates_and_background <- function(candidates,background) {
  if(!all.equal(key(candidates), c("chr","start","end"))){
    data.table::setkey(candidates,chr,start,end)
  }
  if(!all.equal(key(background), c("chr","start","end"))){
    data.table::setkey(background,chr,start,end)
  }
  # should be unique positions
  if(anyDuplicated(background)!=0) {
    stop("The background set contains duplicates. Please remove and try again")
  }
  if(anyDuplicated(candidates)!=0) {
    stop("The candidate set contains duplicates. Please remove and try again")
  }
  # candidate should be complete subset of background
  if(anyNA(background[candidates, on=.(chr,start,end), which=TRUE])){
    stop("The candidate set is not a subset of the background. Please fix and try again")
  }
  return(1)
}

check_feature_format <- function(dt) {
  # should be a 4 col data.table
  if ((!is.data.table(dt)) || dim(dt)[2] !=4 ){
    stop("The feature definitions should be a four column data.table. Please adjust and try again")
  }
  name.col <- names(dt)[4]
  if(dt[,uniqueN(get(name.col))] != dim(dt)[1]){
    stop(paste0("There are duplicated feature names in the column named \"",name.col,"\". Please correct and try again"), call. = F)
  }
  return(1)
}

check_annotation_format <- function(dt) {
  # should be a 3 col data.table
  if ((!is.data.table(dt)) || dim(dt)[2] != 3 ){
    stop("The annotation definitions should be a three column data.table. Please adjust and try again")
  }
  name.col <- names(dt)[3]
  if(dt[,uniqueN(get(name.col))] != dim(dt)[1]){
    stop(paste0("There are duplicated annotation names in the column named \"",name.col,"\". Please correct and try again"), call. = F)
  }
  return(1)
}


remove_low_coverage_from_feature_set <- function(feature_set, missing_features, coverage_threshold) {
  type <- names(feature_set)[4]
  if ( type !=  names(missing_features)[1] ) {
    stop(paste0("The feature_set and the missing_features feature type do not match (column 1). Was expecting ", type," but found ", names(missing_features)[1], " instead. Please adjust and try again"))
  }
  filtered_feature_set <- feature_set[!get(type) %in% missing_features[coverage < coverage_threshold,get(type)]][]
  return(filtered_feature_set[])
}

sync_feature_set_map_to_feature_set <- function(feature_set_map, clean_feature_set, min_n_set) {
  type <- names(clean_feature_set)[4]
  if ( type !=  names(feature_set_map)[2] ) {
    stop(paste0("The clean_feature_set and the feature_set_map feature type do not match (column 2). Was expecting ", type," but found ", names(feature_set_map)[2], " instead. Please adjust and try again"))
  }
  sync_feature_set_map <- feature_set_map[get(type) %in% clean_feature_set[,get(type)]]
  sync_feature_set_map <- sync_feature_set_map[id %in% sync_feature_set_map[,uniqueN(get(type)),by=id][V1 >= min_n_set]$id]
  return(sync_feature_set_map)
}
joshuamschmidt/multiPermr documentation built on Oct. 12, 2020, 11:42 a.m.