#' Relationship matching SQL query
#'
#' The query is constructed on 3 different levels (nested queries):
#' \itemize{
#' \item{Level 1: construct simple counts for co-occurence of values in both columns}
#' \item{Level 2: aggregate coocurences, drop the values}
#' \item{Level 3: calculate the statistics of co-occurences}
#' }
#'
#' The purpose of \emph{Level 1} query is to perform simple counting of distinct values in both columns resulting in the following table:
#' \itemize{
#' \item{\emph{eone} the value from combined list of non-NULL values from column1 and column2}
#' \item{\emph{aone.cnt} number of occurences of the \code{eone} in column1 or \code{NULL} in case of no occurences}
#' \item{\emph{atwo.cnt} number of occurences of the \code{eone} in column2 or \code{NULL} in case of no occurences}
#' }
#'
#' The purpose of \emph{Level 2} query is to perform simple processing of \emph{Level 1} query and dropping actual dictionary values,
#' which results in the following table (described only in part concerning column1 - the part concerning column2 is symmetrical) consisting of 8 columns:
#' \itemize{
#' \item{\emph{aone_cnt} number of occurences of the value in \emph{column1}, 0 or more}
#' \item{\emph{aone_cnt_ind} occurence indicator of the value in \emph{column1}, 0 or 1}
#' \item{\emph{aone_cnt_nullable} number of occurences of the value in \emph{column1} as provided by \emph{Level 1} query (exact copy of \code{aone.cnt})}
#' \item{\emph{aone_cnt_both_present} serving the purpose of a correlator of the counts from both columns:
#' it is \code{NULL} when atwo.cnt is NULL, otherwise it is aone.cnt}
#' }
#'
#' The purpose of \emph{Level 3} query is to aggregate results of \emph{Level 2} query into single row of the following structure:
#' \itemize{
#' \item{\code{distvals_cnt}: number of distinct non-NULL values in column1 and column2 (combined)}
#' \item{\code{f_distentities_frac}: fraction of (combined) distinct values in column1 (first)}
#' \item{\code{s_distentities_frac}: fraction of (combined) distinct values in column2 (second)}
#' \item{\code{distentities_match_frac}: fraction of matching values (1+ -- 1+ relationships)}
#' \item{\code{f_mincnt}, \code{f_avgcnt}, \code{f_maxcnt}, \code{f_sdevcnt}:
#' \code{min}, \code{mean}, \code{max} and \code{sd} of distinct value counts in column1 (first) }
#' \item{\code{s_mincnt}, \code{s_avgcnt}, \code{s_maxcnt}, \code{f_sdevcnt}:
#' \code{min}, \code{mean}, \code{max} and \code{sd} of distinct value counts in column2 (second) }
#' \item{\code{f_sumcnt}: number of non-NULL rows in column1}
#' \item{\code{s_sumcnt}: number of non-NULL rows in column2}
#' \item{\code{f_avgcnt_when_present}, \code{f_sdevcnt_when_present}:
#' \code{mean} and \code{sd} of distinct value counts in column1 (first) when they are present}
#' \item{\code{s_avgcnt_when_present}, \code{s_sdevcnt_when_present}:
#' \code{mean} and \code{sd} of distinct value counts in column2 (second) when they are present}
#' \item{\code{f_mincnt_both_present}, \code{f_avgcnt_both_present}, \code{f_maxcnt_both_present}, \code{f_sdevcnt_both_present}:
#' \code{min}, \code{mean}, \code{max} and \code{sd} of of distinct value counts in column1 (first) when there are matching rows in column2}
#' \item{\code{s_mincnt_both_present}, \code{s_avgcnt_both_present}, \code{s_maxcnt_both_present}, \code{s_sdevcnt_both_present}:
#' \code{min}, \code{mean}, \code{max} and \code{sd} of of distinct value counts in column2 (second) when there are matching rows in column1}
#' }
#' @export
sql.entity.relation.generic <- function(query1, query1.colname, query1.countname = "cnt", query2, query2.colname, query2.countname = "cnt"){
### TODO: pay attention to the restriction of NULL values (!!!)
## %%Q1_VALCOUNTER%%, %%Q1_VALNAME%%, %%Q1_QUERY%%
## %%Q2_VALCOUNTER%%, %%Q2_VALNAME%%, %%Q2_QUERY%%
# stringi::stri_replace_all_fixed(
# get("relationship_miner", pkg_globals),
# c(
# "%%Q1_QUERY%%", "%%Q1_VALNAME%%","%%Q1_VALCOUNTER%%",
# "%%Q2_QUERY%%", "%%Q2_VALNAME%%","%%Q2_VALCOUNTER%%"
# ),
# c(
# query1, query1.colname, query1.countname,
# query2, query2.colname, query2.countname
# ),
# vectorize_all = FALSE)
sql.fill.template(
"relationship_miner",
query.names = NULL,
param.list = list(
"%%Q1_QUERY%%" = query1,
"%%Q1_VALNAME%%" = query1.colname,
"%%Q1_VALCOUNTER%%" = query1.countname,
"%%Q2_QUERY%%" = query2,
"%%Q2_VALNAME%%" = query2.colname,
"%%Q2_VALCOUNTER%%" = query2.countname
)
)
}
#' SQL groupped counts
create.groupping.query <- function(groupping.col, subquery, na.rm = TRUE){
variable.name <- "eone"
count.name <- "cnt"
na.rm.query <- ""
if (na.rm){
na.rm.query <- paste0("where ", groupping.col, " is not null")
}
paste0("
select ", groupping.col, " as ", variable.name, "
count(*) as ", count.name, "
from
(", subquery, ")
", na.rm.query, "
group by ", groupping.col)
}
create.groupping.query <- function(groupping.col, tablename, schemaname = NA){
variable.name <- "eone"
count.name <- "cnt"
paste0("
select ", groupping.col, " as ", variable.name, "
count(*) as ", count.name, "
from ", schemaselector, tablename,
" where ", groupping.col, " is not null group by ", groupping.col)
}
#sql.entity.relation2("q1", "q1c", "q1cnt", "q2", "q2c", "q2cnt")
#' @export
sql.entity.relation.simple <- function(tab1.name, tab2.name, tab1.groupping.col, tab2.groupping.col = tab1.groupping.col, schemaname1 = NA, schemaname2 = schemaname1){
q1 <- sql.table.schemed(tab1.name, schemaname1)
q2 <- sql.table.schemed(tab2.name, schemaname2)
sql.entity.relation.generic(
query1 = q1,
query1.colname = tab1.groupping.col,
query1.countname = "cnt",
query2 = q2,
query2.colname = tab2.groupping.col,
query2.countname = "cnt")
}
experimental.db.infer.relation <- function(db.connection, colName, baseTable, relTables, schemaname=NA){
relTables.enf=setdiff(unlist(relTables),baseTable)
queryList=list()
#queryList[baseTable]=get.groupping.att.query(colName,baseTable,schemaname)
for(tabname in relTables.enf){
#queryList[tabname]=get.groupping.att.query(colName,tabname,schemaname)
queryList[tabname]=sql.entity.relation(colName,baseTable,tabname,schemaname)
}
mmm=query.load.execute(queryList,control.connection)
resultRel=NULL
for(tabname in relTables.enf){
z=data.frame(f_tablename=baseTable,s_tablename=tabname,groupping_col=colName,mmm[[tabname]],stringsAsFactors=FALSE)
resultRel=rbind(resultRel,z)
}
resultRel[order(resultRel$distentities_match_frac,decreasing=TRUE),]
}
#' @export
experimental.db.infer.relation.simple <- function(db.connection, base.att, related.att){
related.att.clean <- dplyr::setdiff(related.att, base.att)
### TODO: unique, just in case....
from.rel <- base.att %>% dplyr::mutate(
schemed.tab = sql.table.schemed(tab.name = tablename, schema.name = schemaname),
relpoint.id = internalid, relpoint.name = internalname)
to.rel <- related.att.clean %>% dplyr::mutate(
schemed.tab = sql.table.schemed(tab.name = tablename, schema.name = schemaname),
relpoint.id = internalid, relpoint.name = internalname)
##cross_join
cross.description <- NULL
cross.description.queries <- list()
for(from.rel.it in 1:nrow(from.rel)){
from.rel.elem <- from.rel[from.rel.it,]
for(to.rel.it in 1:nrow(to.rel)){
to.rel.elem <- to.rel[to.rel.it,]
join.name <- paste0("***",from.rel.elem$relpoint.id, "+ + +", to.rel.elem$relpoint.id, "***")
cross.description <- rbind(
cross.description,
data.frame(
from.rel.id = from.rel.elem$relpoint.id,
from.rel.name = from.rel.elem$relpoint.name,
to.rel.id = to.rel.elem$relpoint.id,
to.rel.name = to.rel.elem$relpoint.name,
join.name = join.name
)
)
cross.description.queries[[join.name]] <-
sql.entity.relation.generic(
query1 = paste0("select ", from.rel.elem$attname, " from ", from.rel.elem$schemed.tab),
query1.colname = from.rel.elem$attname,
query1.countname = "cnt",
query2 = paste0("select ", to.rel.elem$attname, " from ", to.rel.elem$schemed.tab),
query2.colname = to.rel.elem$attname,
query2.countname = "cnt")[[1]]
}
}
rel.results <- query.load.execute(cross.description.queries, db.connection)
## repacking as data.frame
rel.results.repack <- query.flatten.results(rel.results, "join.name")
rel.results.df <- rel.results.repack$results
res <- dplyr::inner_join(x = rel.results.df, y = cross.description, by = rel.results.repack$idfield) %>%
dplyr::select(-one_of(rel.results.repack$idfield))
## join & rename both from.rel and to.rel
res
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.