import pandas
from data_algebra.data_ops import *
d = pandas.read_csv('d.csv.gz')
d
1000000 rows × 2 columns
ops = describe_table(d, table_name='d'). \
extend({
'rn': '_row_number()',
'cs': 'x.cumsum()'
},
partition_by=['g'],
order_by=['x']). \
order_rows(['g', 'x'])
ops
TableDescription(
table_name='d',
column_names=[
'x', 'g']) .\
extend({
'rn': '_row_number()',
'cs': 'x.cumsum()'},
partition_by=['g'],
order_by=['x']) .\
order_rows(['g', 'x'])
res = ops.transform(d)
res
1000000 rows × 4 columns
expect = pandas.read_csv('res.csv.gz')
expect
1000000 rows × 4 columns
time(ops.transform(d))
CPU times: user 14 s, sys: 847 ms, total: 14.8 s
Wall time: 14.3 s
1000000 rows × 4 columns
import timeit
def f():
return ops.transform(d)
timeit.timeit(f, number=5)
73.305209192
import data_algebra.SQLite
dbmodel = data_algebra.SQLite.SQLiteModel()
print(ops.to_sql(dbmodel, pretty=True))
SELECT "x",
"cs",
"g",
"rn"
FROM
(SELECT "x",
SUM("x") OVER (PARTITION BY "g"
ORDER BY "x") AS "cs",
"g",
ROW_NUMBER() OVER (PARTITION BY "g"
ORDER BY "x") AS "rn"
FROM "d") "extend_1"
ORDER BY "g",
"x"
import sqlite3
conn = sqlite3.connect(':memory:')
dbmodel.prepare_connection(conn)
def f_db():
try:
dbmodel.read_query(conn, "DROP TABLE d")
except:
pass
dbmodel.insert_table(conn, d, 'd')
return dbmodel.read_query(conn, ops.to_sql(dbmodel))
time(f_db())
CPU times: user 8.12 s, sys: 611 ms, total: 8.73 s
Wall time: 8.67 s
1000000 rows × 4 columns
timeit.timeit(f_db, number=5)
47.82512383599999
# neaten up
conn.close()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.