R/inputdata_data_linkage_finder.R

Defines functions scanTab returnItemData jaccard_dist check_link findLink

Documented in findLink

#This is a bunch of code that will crunch and sort the input data into useable metadata

#Function to Scan tables columns and identify what they are, and if they are special data types
scanTab<-function(tabIndex=NA,objData = NULL,objMeta=NULL,dataDict=NULL){
  #no table index has been provided, so get one yourself
  
  # if(!is.na(tabIndex)){
  #   if(objData[[tabIndex]]@type == "spatial"){
  #     browser()
  #   }
  # }
  
  if(is.na(tabIndex)){
    #check which objects are tables
    tabIndex<-which(objMeta$dataType == "table")
    #check which objects have tablular metadata
    metatabs<-which(sapply(objMeta$dataID, function(x,obj){
      x<-obj[[as.character(x)]]
      if(is.null(x@data$metadata) || is.na(x@data$metadata))
        return(FALSE)
      
      if(is.data.frame(x@data$metadata))
        return(TRUE)
    },obj = objData))
    
    tabIndex<-unique(c(tabIndex,metatabs))
  }
  
  #if there is no table
  if(length(tabIndex) == 0){
    warning("There are no table data objects")
    return(NA)
  }
  
  #if there are multiple tables
  if(length(tabIndex)>1){
    #call yo-self!
    #scanTab(3,objData,objMeta,dataDict)
    tmp<-lapply(tabIndex,function(x){scanTab(x,objData,objMeta,dataDict)})
    tmp<-dplyr::bind_rows(tmp) #bind the rows of that recursion!
    return(tmp)
  }
  

  #now let's get to business. Scan those columns!
  if(objData[[tabIndex]]@type == "table"){
    itemData<-objData[[tabIndex]]@data[[1]]
  }else{
    itemData<-objData[[tabIndex]]@data$metadata
  }
  
  itemName<-as.character(objMeta[tabIndex,"dataID"])
  itemEnvName<-as.character(objMeta[tabIndex,"dataEnvName"])
  
  #scan for categories:
  #then, wait for user input to decide whether to show one variable, or multiple variables
  tableInfo<-data.frame(tableSource = rep(itemName,ncol(itemData)),
                        variable = colnames(itemData),
                        class = sapply(itemData, class),
                        envName = rep(itemEnvName,ncol(itemData)),
                        stringsAsFactors = FALSE)
  
  #scan for special column types (spatial, genomic, dates)
  #universal data dictionary a standard, users can add other data dictionaries
  
  # #scan and apply to special table
  # tableInfo$specialClass<-apply(tableInfo,1,function(x,dict){
  #   #exact match so can have more variability here
  #   data_term <- trimws(x[["variable"]])
  #   
  #   idxDict<-which(dict$term %in% data_term)
  #   category <- NA
  #   
  #   #If there's no match...
  #   if(length(idxDict)<1)
  #     return(category)
  #   
  #   #If there's a match....
  #   if(length(idxDict)>1){
  #     tmp<-table(dict[idxDict,]$category)
  #     tmp<-tmp/sum(tmp)
  #     
  #     # if there are multiple matches and
  #     # if any category breaks 50% return that
  #     # note: conservative route is to return NA
  #     tmp<-sort(tmp,decreasing = TRUE)
  #     
  #     if(tmp[1]>0.5){
  #       category<-names(tmp)[1]
  #     }
  #   }else{
  #     category<-dict[idxDict,]$category
  #   }
  #   return(category)
  # },
  # dict=dataDict) #Note to self: I like passing then second variable rather than just hoping its correctly scoped.
  
  return(tableInfo)
}

#HELPER FUNCTION: STANDARDIZING DATA INPUT
returnItemData<-function(index,obj,meta,returnMeta=FALSE){
  itemData<-obj[[index]]@data
  itemName<-as.character(meta[index,"dataID"])
  itemType<-as.character(meta[index,"dataType"])
  
  #extract the data or metadata into a table / character vector
  dataOut<-switch(itemType,
                  "phyloTree" = if(!returnMeta) as.character(itemData[[1]]$tip.label) else itemData$metadata,
                  "dna" = if(!returnMeta) as.character(names(itemData[[1]])) else itemData$metadata,
                  "spatial" = if(!returnMeta) rownames(itemData$geometry) else itemData$metadata,
                  "table" = itemData[[1]])
  
  return(dataOut)
  #return(list(itemData = itemData,itemName=itemName,itemType = itemType,data=dataOut))
}

#HELPER FUNCTION: QUICK JACCARD DISTANCE
jaccard_dist<-function(x,y){
  #intersection
  nIntersect<-length(intersect(x,y))
  #union
  nUnion<-length(union(x,y))
  dist_metric<-(nUnion - nIntersect)/nUnion
  
  #return(1 - (nIntersect/total))
}


#amethod to check links between two charts
check_link<-function(item_one = NA,item_two=NA,item_one_name=NA,item_two_name=NA){
  compRes<-NULL
  
  if(is.null(ncol(item_one)) & is.null(ncol(item_two))){
    #two vectors
    class_one<-class(item_one)
    class_two<-class(item_two)
    
    item_one<-if(class_one=="factor") as.character(item_one) else item_one
    item_two<-if(class_two=="factor") as.character(item_two) else item_two
    
    if(class_one == class_two){
      if(class_one %in% c("factor","character")){
        if(dplyr::setequal(item_one,item_two)){
          return(0)
        }else{
          return(jaccard_dist(item_one,item_two))
        }
      }else{
        #don't match on numerical axes yet, it could be a co-incidence
        #mainly looking to match ids to some ids column
        compRes<-NULL
      }
    }
    
  }else if(is.null(ncol(item_one)) | is.null(ncol(item_two))){
    #one vector and one data frame
    df_item<-if(is.data.frame(item_one)) item_one else item_two
    vec_item<-if (!is.data.frame(item_one)) item_one else item_two
    
    #names of items
    df_item_name<-if(is.data.frame(item_one)) item_one_name else item_two_name
    vec_item_name<-if (!is.data.frame(item_one)) item_one_name else item_two_name
    
    #get similarity
    compRes<-apply(df_item,2,function(x,comp_item){
      check_link(x,comp_item)
    },comp_item = vec_item)
    
    
    compRes<-cbind(rep(as.character(vec_item_name),length(compRes)),
                   rep(as.character(df_item_name),length(compRes)),
                   names(compRes),
                   unname(compRes))
    
  }else{
    #comparing and linking two data frames
    compRes<-do.call("rbind",apply(item_one,2,function(x,comp_item,name_one,name_two){
      data.frame(check_link(item_one = x,item_two = comp_item,name_one,name_two))
    },comp_item = item_two,name_one = item_one_name, name_two = item_two_name))
    
    
    compRes[sapply(compRes,function(x){length(x)==0})] <- NULL
    
    
  }
  
  return(compRes)
}


#FUNCTION : FIND LINKS BETWEEN DIFFERENT DATA TYPES
#' Title
#'
#' @param allObj 
#' @param allObjMeta 
#' @param cutoff 
#'
#' @importFrom dplyr %>%
#' @return

findLink<-function(allObj=NA,allObjMeta = NA,cutoff=1){
  
  if(length(allObj) < 2)
    return(NULL)
  
  noQuantMeta<-dplyr::filter(allObjMeta,is.na(field_detail) | grepl("qual",field_detail))
  
  #list all pairwise combinations
  combos<-combn(1:length(allObj),m=2)
  
  #scan all pairwise data combinations to find links between data items
  varComp<-c()
  for(i in 1:ncol(combos)){
    combo<-combos[,i]
    
    #get standard information out
    item_one<-returnItemData(combo[1],allObj,noQuantMeta)
    item_two<-returnItemData(combo[2],allObj,noQuantMeta)
    
    #get names to be passed to other functions
    item_one_name = as.character(noQuantMeta[combo[1],"dataID"])
    item_two_name = as.character(noQuantMeta[combo[2],"dataID"])
    
    #check if non-tabular objects "exploded fields" better to link on that
    if(item_one_name %in% noQuantMeta$dataSource & as.character(noQuantMeta[combo[1],"dataType"])!="table"){
      returnItemData(combo[1],allObj,noQuantMeta,TRUE)
    }
    
    if(item_two_name %in% noQuantMeta$dataSource & as.character(noQuantMeta[combo[2],"dataType"])!="table"){
      item_two<-returnItemData(combo[2],allObj,noQuantMeta,TRUE)
    }
  
    #check whether there are any linkages between these data items
    
    links<-check_link(item_one,item_two,item_one_name,item_two_name)

  if(length(links) == 1){
    varComp<-rbind(varComp,c(item_one_name,item_two_name,paste(item_two_name,"gevitID",sep="_"),links))
  }else{
    links<-as.matrix(links) #easier to rbind this way
    varComp<-rbind(varComp,links)
  }
    
  }
  
  
  #get the comparing variable for the data
  compvar<-apply(cbind(rownames(varComp),varComp[,1]),1, function(x,allObjMeta){
    if(x[1]=="")
        return(paste(x[2],"gevitID",sep ="_"))
      
      return(gsub("\\.[0-9]+$","",x[1]))
    
  },allObjMeta) %>% unlist() %>%unname()
  
  
  varComp<-data.frame(item_one = varComp[,1],
                      item_one_var = compvar,
                      item_two = varComp[,2],
                      item_two_var = varComp[,3],
                      jaccard_dist = as.numeric(varComp[,4]),
                      stringsAsFactors = FALSE)
  
  return(varComp)
  #return(dplyr::filter(varComp,jaccard_dist<cutoff))

  
}
amcrisan/GEViTRec documentation built on Feb. 12, 2020, 8:27 p.m.