extras/grouped_perf/grouped_performance_data_algebra.md

import pandas
from data_algebra.data_ops import *
d = pandas.read_csv('d.csv.gz')

d
               x                g
0       0.376972  level_000357975
1       0.301548  level_000668062
2      -1.098023  level_000593766
3      -1.130406  level_000203296
4      -2.796534  level_000064605
...          ...              ...
999995  0.154607  level_000029194
999996 -0.241628  level_000721132
999997  0.727351  level_000698435
999998 -1.705844  level_000237171
999999  0.428118  level_000125022

1000000 rows × 2 columns

# per-group (by g) row number and cumulative sum of x, ordered by x within group
ops = describe_table(d, table_name='d'). \
    extend({
        'rn': '_row_number()',
        'cs': 'x.cumsum()'
        },
        partition_by=['g'],
        order_by=['x']). \
    order_rows(['g', 'x'])

ops    
TableDescription(
 table_name='d',
 column_names=[
   'x', 'g']) .\
   extend({
    'rn': '_row_number()',
    'cs': 'x.cumsum()'},
   partition_by=['g'],
   order_by=['x']) .\
   order_rows(['g', 'x'])
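
For reference, the same grouped window calculation can be sketched directly in pandas. The snippet below is only an illustrative equivalent using the column names from the pipeline above; it is not how data_algebra implements the transform.

# illustrative pandas equivalent of the pipeline above (a sketch, not the
# data_algebra implementation): sort, then per-group row number and cumsum
d_check = d.sort_values(['g', 'x']).reset_index(drop=True)
d_check['rn'] = d_check.groupby('g').cumcount() + 1
d_check['cs'] = d_check.groupby('g')['x'].cumsum()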
res = ops.transform(d)

res
               x                g  rn        cs
0      -0.920397  level_000000002   1 -0.920397
1       0.537211  level_000000003   1  0.537211
2       0.734919  level_000000004   1  0.734919
3      -0.890755  level_000000005   1 -0.890755
4       1.702935  level_000000008   1  1.702935
...          ...              ...  ..       ...
999995  1.435739  level_000999990   4  1.569817
999996  0.262819  level_000999993   1  0.262819
999997  0.081815  level_000999995   1  0.081815
999998  1.553806  level_000999997   1  1.553806
999999 -0.669694  level_000999998   1 -0.669694

1000000 rows × 4 columns

expect = pandas.read_csv('res.csv.gz')
expect
               x                g  rn        cs
0      -0.920397  level_000000002   1 -0.920397
1       0.537211  level_000000003   1  0.537211
2       0.734919  level_000000004   1  0.734919
3      -0.890755  level_000000005   1 -0.890755
4       1.702935  level_000000008   1  1.702935
...          ...              ...  ..       ...
999995  1.435739  level_000999990   4  1.569817
999996  0.262819  level_000999993   1  0.262819
999997  0.081815  level_000999995   1  0.081815
999998  1.553806  level_000999997   1  1.553806
999999 -0.669694  level_000999998   1 -0.669694

1000000 rows × 4 columns
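
Instead of comparing the two displays by eye, the match can be checked programmatically. A minimal sketch using pandas' testing helper, assuming both frames share the column order and the row order fixed by order_rows above:

# confirm the transform reproduces the saved reference result
pandas.testing.assert_frame_equal(
    res.reset_index(drop=True),
    expect.reset_index(drop=True),
    check_exact=False)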


%time ops.transform(d)
CPU times: user 14 s, sys: 847 ms, total: 14.8 s
Wall time: 14.3 s
               x                g  rn        cs
0      -0.920397  level_000000002   1 -0.920397
1       0.537211  level_000000003   1  0.537211
2       0.734919  level_000000004   1  0.734919
3      -0.890755  level_000000005   1 -0.890755
4       1.702935  level_000000008   1  1.702935
...          ...              ...  ..       ...
999995  1.435739  level_000999990   4  1.569817
999996  0.262819  level_000999993   1  0.262819
999997  0.081815  level_000999995   1  0.081815
999998  1.553806  level_000999997   1  1.553806
999999 -0.669694  level_000999998   1 -0.669694

1000000 rows × 4 columns

import timeit 

def f():
    return ops.transform(d)

timeit.timeit(f, number=5)
73.305209192
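timeit.timeit reports the total for the five calls (so roughly 14.7 s per call here). If per-run numbers are wanted, timeit.repeat gives one timing per repeat; an optional refinement, not part of the original run:

# one wall-clock time per run, instead of a single 5-run total
timeit.repeat(f, number=1, repeat=5)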
# the same pipeline can also be translated to SQL for a database backend
import data_algebra.SQLite
dbmodel = data_algebra.SQLite.SQLiteModel()
print(ops.to_sql(dbmodel, pretty=True))
SELECT "x",
       "cs",
       "g",
       "rn"
FROM
  (SELECT "x",
          SUM("x") OVER (PARTITION BY "g"
                         ORDER BY "x") AS "cs",
                        "g",
                        ROW_NUMBER() OVER (PARTITION BY "g"
                                           ORDER BY "x") AS "rn"
   FROM "d") "extend_1"
ORDER BY "g",
         "x"
import sqlite3

# set up an in-memory SQLite database and adapt it for data_algebra use
conn = sqlite3.connect(':memory:')
dbmodel.prepare_connection(conn)

def f_db():
    # drop any copy of the table left over from a previous run
    try:
        dbmodel.read_query(conn, "DROP TABLE d")
    except Exception:
        pass
    # copy the data into the database and run the generated query
    dbmodel.insert_table(conn, d, 'd')
    return dbmodel.read_query(conn, ops.to_sql(dbmodel))
%time f_db()
CPU times: user 8.12 s, sys: 611 ms, total: 8.73 s
Wall time: 8.67 s
               x        cs                g  rn
0      -0.920397 -0.920397  level_000000002   1
1       0.537211  0.537211  level_000000003   1
2       0.734919  0.734919  level_000000004   1
3      -0.890755 -0.890755  level_000000005   1
4       1.702935  1.702935  level_000000008   1
...          ...       ...              ...  ..
999995  1.435739  1.569817  level_000999990   4
999996  0.262819  0.262819  level_000999993   1
999997  0.081815  0.081815  level_000999995   1
999998  1.553806  1.553806  level_000999997   1
999999 -0.669694 -0.669694  level_000999998   1

1000000 rows × 4 columns
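
The generated query is ordinary SQLite SQL, so it can also be read back through pandas' generic SQL interface. A sketch, assuming the table 'd' is still loaded in conn from the f_db() call above:

# run the generated SQL through pandas directly (illustrative only)
pandas.read_sql_query(ops.to_sql(dbmodel), conn)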

timeit.timeit(f_db, number=5)
47.82512383599999
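The timing above includes re-inserting the 1,000,000 row table on every call. Below is a sketch that times only the query, re-using the copy of 'd' left in the database by the last f_db() call, so the load cost is excluded:

# time just the SQL execution, with the table already loaded
def f_db_query_only():
    return dbmodel.read_query(conn, ops.to_sql(dbmodel))

timeit.timeit(f_db_query_only, number=5)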
# neaten up
conn.close()


