README.md

add2impala

DOI

The goal of add2impala is to parse statistical metrics and encoding in SQL using R. If you want to calculate KS statistic in R, free to try rawKS.

Installation

The development version from GitHub with:

# install.packages("devtools")
devtools::install_github("JiaxiangBU/add2impala")

Example

library(add2impala)
library(tidyverse)

KS

parse_ks_sql(verbose = 0, y_hat = "pred", y = "target", table = "opd.table")
#> select pred, target, 
#>     sum(y=1) over (order by y_hat rows between current row and unbounded following) as tp,
#>     sum(y=0) over (order by y_hat rows between current row and unbounded following) as fp,
#>     sum(y=1) over (order by y_hat rows between unbounded preceding and 1 preceding) as fn,
#>     sum(y=0) over (order by y_hat rows between unbounded preceding and 1 preceding) as tn
#> from opd.tableThe SQL text is on your clipboard.
parse_ks_sql(verbose = 1, y_hat = "pred", y = "target", table = "opd.table")
#> with a as (
#> select pred, target, 
#>     sum(y=1) over (order by y_hat rows between current row and unbounded following) as tp,
#>     sum(y=0) over (order by y_hat rows between current row and unbounded following) as fp,
#>     sum(y=1) over (order by y_hat rows between unbounded preceding and 1 preceding) as fn,
#>     sum(y=0) over (order by y_hat rows between unbounded preceding and 1 preceding) as tn
#> from opd.table
#> )
#> select pred, target, 
#>    tp/(tp+fn) as tpr,
#>    fp/(fp+tn) as fpr,
#>    tp/(tp+fn) - fp/(fp+tn) as ks
#> from aThe SQL text is on your clipboard.
parse_ks_sql(verbose = 2, y_hat = "pred", y = "target", table = "opd.table")
#> with a as (
#> select pred, target, 
#>     sum(y=1) over (order by y_hat rows between current row and unbounded following) as tp,
#>     sum(y=0) over (order by y_hat rows between current row and unbounded following) as fp,
#>     sum(y=1) over (order by y_hat rows between unbounded preceding and 1 preceding) as fn,
#>     sum(y=0) over (order by y_hat rows between unbounded preceding and 1 preceding) as tn
#> from opd.table
#> )
#> max(tp/(tp+fn) - fp/(fp+tn)) as ks
#> from aThe SQL text is on your clipboard.

The measurment explanation is here (Chinese).

Regular expression

inputs <- row.names(mtcars)
inputs %>% head
#> [1] "Mazda RX4"         "Mazda RX4 Wag"     "Datsun 710"       
#> [4] "Hornet 4 Drive"    "Hornet Sportabout" "Valiant"
# remotes::install_github("daranzolin/inferregex")
return_regex(inputs, verbose = "recommend")
#> # A tibble: 3 x 2
#> # Groups:   regex [3]
#>   regex                                  n
#>   <chr>                              <int>
#> 1 "^[A-Z][a-z]{3}\\s\\d{3}$"             3
#> 2 "^[A-Z][a-z]{3}\\s\\d{3}[A-Z]$"        2
#> 3 "^[A-Z][a-z]{3}\\s\\d{3}[A-Z]{2}$"     2
return_regex(inputs, verbose = "all")
#>                 string                                         regex
#> 1            Mazda RX4                 ^[A-Z][a-z]{4}\\s[A-Z]{2}\\d$
#> 2        Mazda RX4 Wag ^[A-Z][a-z]{4}\\s[A-Z]{2}\\d\\s[A-Z][a-z]{2}$
#> 3           Datsun 710                      ^[A-Z][a-z]{5}\\s\\d{3}$
#> 4       Hornet 4 Drive         ^[A-Z][a-z]{5}\\s\\d\\s[A-Z][a-z]{4}$
#> 5    Hornet Sportabout               ^[A-Z][a-z]{5}\\s[A-Z][a-z]{9}$
#> 6              Valiant                               ^[A-Z][a-z]{6}$
#> 7           Duster 360                      ^[A-Z][a-z]{5}\\s\\d{3}$
#> 8            Merc 240D                 ^[A-Z][a-z]{3}\\s\\d{3}[A-Z]$
#> 9             Merc 230                      ^[A-Z][a-z]{3}\\s\\d{3}$
#> 10            Merc 280                      ^[A-Z][a-z]{3}\\s\\d{3}$
#> 11           Merc 280C                 ^[A-Z][a-z]{3}\\s\\d{3}[A-Z]$
#> 12          Merc 450SE              ^[A-Z][a-z]{3}\\s\\d{3}[A-Z]{2}$
#> 13          Merc 450SL              ^[A-Z][a-z]{3}\\s\\d{3}[A-Z]{2}$
#> 14         Merc 450SLC              ^[A-Z][a-z]{3}\\s\\d{3}[A-Z]{3}$
#> 15  Cadillac Fleetwood               ^[A-Z][a-z]{7}\\s[A-Z][a-z]{8}$
#> 16 Lincoln Continental              ^[A-Z][a-z]{6}\\s[A-Z][a-z]{10}$
#> 17   Chrysler Imperial               ^[A-Z][a-z]{7}\\s[A-Z][a-z]{7}$
#> 18            Fiat 128                      ^[A-Z][a-z]{3}\\s\\d{3}$
#> 19         Honda Civic               ^[A-Z][a-z]{4}\\s[A-Z][a-z]{4}$
#> 20      Toyota Corolla               ^[A-Z][a-z]{5}\\s[A-Z][a-z]{6}$
#> 21       Toyota Corona               ^[A-Z][a-z]{5}\\s[A-Z][a-z]{5}$
#> 22    Dodge Challenger               ^[A-Z][a-z]{4}\\s[A-Z][a-z]{9}$
#> 23         AMC Javelin                    ^[A-Z]{3}\\s[A-Z][a-z]{6}$
#> 24          Camaro Z28                 ^[A-Z][a-z]{5}\\s[A-Z]\\d{2}$
#> 25    Pontiac Firebird               ^[A-Z][a-z]{6}\\s[A-Z][a-z]{7}$
#> 26           Fiat X1-9                ^[A-Z][a-z]{3}\\s[A-Z]\\d-\\d$
#> 27       Porsche 914-2                  ^[A-Z][a-z]{6}\\s\\d{3}-\\d$
#> 28        Lotus Europa               ^[A-Z][a-z]{4}\\s[A-Z][a-z]{5}$
#> 29      Ford Pantera L       ^[A-Z][a-z]{3}\\s[A-Z][a-z]{6}\\s[A-Z]$
#> 30        Ferrari Dino               ^[A-Z][a-z]{6}\\s[A-Z][a-z]{3}$
#> 31       Maserati Bora               ^[A-Z][a-z]{7}\\s[A-Z][a-z]{3}$
#> 32          Volvo 142E                 ^[A-Z][a-z]{4}\\s\\d{3}[A-Z]$
#>    nchars nlower nupper ndigits nwhite
#> 1       9      4      3       1      1
#> 2      13      6      4       1      2
#> 3      10      5      1       3      1
#> 4      14      9      2       1      2
#> 5      17     14      2       0      1
#> 6       7      6      1       0      0
#> 7      10      5      1       3      1
#> 8       9      3      2       3      1
#> 9       8      3      1       3      1
#> 10      8      3      1       3      1
#> 11      9      3      2       3      1
#> 12     10      3      3       3      1
#> 13     10      3      3       3      1
#> 14     11      3      4       3      1
#> 15     18     15      2       0      1
#> 16     19     16      2       0      1
#> 17     17     14      2       0      1
#> 18      8      3      1       3      1
#> 19     11      8      2       0      1
#> 20     14     11      2       0      1
#> 21     13     10      2       0      1
#> 22     16     13      2       0      1
#> 23     11      6      4       0      1
#> 24     10      5      2       2      1
#> 25     16     13      2       0      1
#> 26      9      3      2       2      1
#> 27     13      6      1       4      1
#> 28     12      9      2       0      1
#> 29     14      9      3       0      2
#> 30     12      9      2       0      1
#> 31     13     10      2       0      1
#> 32     10      4      2       3      1
return_regex(inputs, verbose = "lazy")
#> [1] "^[A-Z][a-z]{4}\\s[A-Z]{2}\\d$"

Rolling KS

rolling_ks(
    y_hat = "pred",
    y = "target",
    table = "opd.test_pred_table",
    time_variable = "inserttime"
)
#> with a as (
#>     with a as (
#>         with a as (
#>             select distinct cast(strleft(cast(inserttime as string),10) as timestamp) as obs_date
#>             from opd.test_pred_table
#>         )
#>         select a.obs_date, target, pred, inserttime
#>         from a
#>         left join opd.test_pred_table b
#>         on datediff(a.obs_date,inserttime) <= 7
#>     )
#>     select
#>         pred, target, inserttime, obs_date,
#>         sum(target=1) over (partition by obs_date order by obs_date, pred rows between current row and unbounded following) as tp,
#>         sum(target=0) over (partition by obs_date order by obs_date, pred rows between current row and unbounded following) as fp,
#>         sum(target=1) over (partition by obs_date order by obs_date, pred rows between unbounded preceding and 1 preceding) as fn,
#>         sum(target=0) over (partition by obs_date order by obs_date, pred rows between unbounded preceding and 1 preceding) as tn
#>     from a
#> )
#> select
#>     obs_date,
#>     max(tp/(tp+fn) - fp/(fp+tn)) as ks
#> from a
#> group by 1
#> order by obs_dateThe SQL text is on your clipboard.

cat2int

library(RODBC)
impala <- odbcConnect("Impala")
output_cat2int <- cat2int(feature = "cyl", table = "opd.sqlsave_test_ljx")
output_cat2int %>% cat
#> case when cyl = '4' then 4
#>  when cyl = '6' then 6
#>  when cyl = '8' then 8 end as cyl

paste_sql_result

可以方便把 SQL 反馈的结果,做成 markdown 格式,方便分享到 GitHub 或者 GitLab 上。

library(RODBC)
impala <- odbcConnect("Impala")
output_paste_sql_result1 <- paste_sql_result("describe opd.sqlsave_test_ljx")
output_paste_sql_result1 %>% cat
#> ```sql describe opd.sqlsave_test_ljx ```   |name |type   |comment | |:----|:------|:-------| |mpg  |double |NA      | |cyl  |double |NA      | |disp |double |NA      | |hp   |double |NA      | |drat |double |NA      | |wt   |double |NA      | |qsec |double |NA      | |vs   |double |NA      | |am   |double |NA      | |gear |double |NA      | |carb |double |NA      |
output_paste_sql_result2 <- paste_sql_result("select * from opd.sqlsave_test_ljx limit 100")
output_paste_sql_result2 %>% cat
#> ```sql select * from opd.sqlsave_test_ljx limit 100 ```   |  mpg| cyl|  disp|  hp| drat|    wt| |----:|---:|-----:|---:|----:|-----:| | 15.2|   8| 275.8| 180| 3.07| 3.780| | 18.1|   6| 225.0| 105| 2.76| 3.460| | 15.0|   8| 301.0| 335| 3.54| 3.570| | 27.3|   4|  79.0|  66| 4.08| 1.935| | 30.4|   4|  95.1| 113| 3.77| 1.513| | 21.4|   4| 121.0| 109| 4.11| 2.780|

run_impala

run_impala

**Code of Conduct**

Please note that the `add2impala` project is released with a [Contributor Code of Conduct](.github/CODE_OF_CONDUCT.md).By contributing to this project, you agree to abide by its terms.

**License**

MIT © [Jiaxiang Li](LICENSE.md)


JiaxiangBU/add2impala documentation built on June 11, 2020, 2:20 p.m.