library(arrayop) # optional, so we can leave out the `arrayop::` prefix

# First, load the user name and token from an authentication file,
# which is also compatible with iquery.
auth = yaml::yaml.load_file("~/.scidb_auth")

# Then connect to scidb. Params host, username and token are all single strings
# and can be provided from any source.
conn = db_connect(host = "localhost", username = auth[["user-name"]], token = auth[["user-password"]])
Once we have connected to SciDB with `db_connect`, we can save the connection instance as shown above, or call `arrayop::get_default_connection()` anywhere in code, which always returns the current db connection.
identical(get_default_connection(), conn) # the exact same thing!
print(conn) # prints `username@host [scidb version]` if the connection is successful
We use the term `array_op` to denote two types of arrays:

- regular SciDB arrays, e.g. `myNamespace.myArray`
- SciDB data frames

Semantically, both types of arrays are the same and support the same set of verbs. They also both have attributes and dimensions: regular arrays have at least one explicit dimension, while SciDB data frames have no explicit dimension but internally carry two hidden dimensions whose names start with a dollar sign `$`.
for(array_name in c("test_CO2", "test_CO2_empty", "test_CO2_empty_df", "test_CO2_stored")){
  try(conn$execute(arrayop::afl(array_name | remove)), silent = T)
}
# Upload a data frame to scidb
arr_uploaded = conn$upload_df(CO2, name = "test_CO2") # datasets::CO2

# Or just create a new array
arr_created = conn$create_array(
  name = "test_CO2_empty",
  schema_template = "<Plant:string,Type:string,Treatment:string,conc:double,uptake:double> [i]"
)

# Create a scidb data frame. Notice no dimension is provided
arr_created_df = conn$create_array(
  name = "test_CO2_empty_df",
  schema_template = "<Plant:string,Type:string,Treatment:string,conc:double,uptake:double>" # leave out any dimension
)

# Retrieve an existing array (the one we just uploaded)
arr_existing = conn$array("test_CO2")

# Compile a data.frame into a scidb 'build' operation with a build dimension
arr_compiled = conn$compile_df(head(CO2, 4))

# Compile to a scidb data frame without explicit dimensions
arr_compiled_df = conn$compile_df(head(CO2, 4), as_scidb_data_frame = T)

# Create a transient array from AFL
arr_from_afl = conn$afl_expr(arr_existing$filter(conc > 800)$to_afl())

# Store an AFL statement into a persistent array
arr_stored = conn$afl_expr(
  # afl_str = "filter(test_CO2, conc > 800)" # equivalent to the code below
  arr_existing$filter(conc > 800)$to_afl()
)$persist(
  save_array_name = "test_CO2_stored",
  .gc = F # prevent the array from being destroyed by R's garbage collector
)
Let's inspect these arrays!
arr_uploaded$to_afl()
arr_uploaded$to_schema_str()
arr_uploaded$is_persistent() # test if the afl is an array name, without checking in db
arr_uploaded$exists_persistent_array() # whether the persistent array exists in db (does check in db)
arr_uploaded$array_meta_data()
arr_uploaded$summarize_array()
arr_uploaded$cell_count()

str(arr_created)
str(arr_existing) # same as arr_uploaded

arr_compiled$to_afl()
arr_compiled$to_schema_str()
arr_compiled$is_persistent()

# SciDB data frames
arr_created_df$to_schema_str()
arr_compiled_df$to_schema_str()

arr_from_afl$to_afl()
arr_from_afl$cell_count()

str(arr_stored)
arr_stored$cell_count()
Call the `to_df` method on any `array_op` instance to get a data frame with all its attributes, or `to_df_all` to get all fields (both dimensions and attributes).
arr_stored$to_df()
arr_stored$to_df_all() # notice the dimension column [i]
Common array operations
We can create array_ops on the fly and download them as data frames. Each array operation also returns an `array_op` instance, so we can chain operations in a `dplyr`-like data flow. All the array operation verbs are named following `dplyr` conventions where appropriate, e.g. `filter`, `inner_join`, `mutate`, `group_by` and `summarize`.
# Comma-separated expressions are joined with logical AND
arr_existing$filter(conc > 80, uptake > 40)$to_df()
# Equivalent to above
# arr_existing$filter(conc > 80 & uptake > 40)$to_df()

arr_existing$filter(conc <= 100 | conc > 900)$to_df() # logical OR

# Filter on string fields.
# %like% takes a regex as the right operand; all other verbs treat strings literally.
arr_existing$filter(Plant %contains% "3",
                    Plant %starts_with% "Q",
                    Treatment %contains% "chill",
                    Treatment %like% "chill[a-z]{2}$")$to_df()
To use a variable in a filter expression, prepend `!!`, as shown below:
min_conc <- 80
arr_existing$filter(conc > !!min_conc, uptake > 40)$to_df()
As shown below, field names can be either quoted or unquoted, but a field expression must be a single string. Unlike R, SciDB uses single quotes for string literals; e.g. `'at_'` below is single-quoted. As in filter expressions, use `!!` to reference variables in memory (a sketch follows the example below).
# Modify existing fields
result = arr_stored$mutate(Type = 'at_' + Type, conc = conc/1000)
result$to_afl()
result$to_df()

# Add new fields
result = arr_stored$mutate(Plant_Type = Plant + '-' + Type,
                           feel = if(Treatment == 'chilled') 'cold' else 'ok')
# Same as above. Notice the `iif` operator is written explicitly here,
# equivalent to the native R `if` statement:
# result = arr_stored$mutate(Plant_Type = Plant + '-' + Type,
#                            feel = iif(Treatment == 'chilled', 'cold', 'ok'))
result$to_afl()
result$to_df()
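As noted above, `!!` also works in mutate expressions. A minimal sketch (the variable name is hypothetical; this assumes the same `!!` injection mechanism as in `filter`):

# inject the in-memory variable `scale_factor` into a mutate expression
scale_factor <- 1000
arr_stored$mutate(conc = conc / !!scale_factor)$to_df()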
The `mutate` operation keeps the original fields unless we modify them; `transmute` allows us to specify explicitly which fields to keep.
# strings are escaped for AFL
result = arr_stored$transmute(Plant,
                              Plant_Type = Plant + "'-'" + Type,
                              feel = iif(Treatment == 'chilled', 'c"ooo"ld', "OK"))
result$to_afl()
result$to_df()
`mutate` and `transmute` are useful when the mutated fields are defined by expressions. Another common use case is mutating fields of an existing array by another array, rather than by expressions.
dataArray = conn$array_from_df(data.frame(Plant = c("Qn1", "Qn2", "Qn3"),
                                          Treatment = "treated-mutated"))
result = arr_stored$mutate_by(dataArray, keys = "Plant", updated_fields = "Treatment")
result$to_afl()
result$to_df()
NOTE: the source arrays remain unchanged in the examples above; all mutate operations merely create mutated copies of the source data. To modify array content, we can `update` or `overwrite` the source array with the mutated copies.
dataArray = conn$array_from_df(data.frame(Plant = c("Qn1", "Qn2", "Qn3"),
                                          Treatment = "treated-mutated"))
mutated = arr_stored$mutate_by(dataArray, keys = "Plant", updated_fields = "Treatment")

# Persist data changes to an existing array
mutated$update(arr_stored)$execute()
arr_stored$to_df() # only the matched data cells are updated

mutated$overwrite(arr_stored)$execute()
arr_stored$to_df() # Caution: overwrites all array content!!!
Four joins are supported: `inner_join`, `left_join`, `right_join` and `full_join`.

First, let's populate two arrays.
leftDf = data.frame(lda = 1:5, db = 11:15, fa = LETTERS[1:5], lfb = 3.14 * 1:5)
rightDf = data.frame(rda = 3:10, db = 13:20, fa = LETTERS[3:10], rfb = 3.14 * 3:10)
leftDf
rightDf

# At least one input to 'cross_join' must have a specified chunk size,
# so we need to persist the arrays.
L = conn$array_from_df(leftDf, "<fa:string, lfb:double> [lda;db]", force_template_schema = T)$persist(.gc = F)
R = conn$array_from_df(rightDf, "<fa:string, rfb:double> [rda;db]", force_template_schema = T)$persist(.gc = F)
Then, start with a vanilla join where the join keys are inferred from the array fields. In this case, `fa` and `db` are join keys since they exist in both arrays.
L$inner_join(R)$to_df()
L$left_join(R)$to_df()
L$right_join(R)$to_df()
L$full_join(R)$to_df()
We can specify which fields to keep with `array_op$select`. By default, all fields are selected, except that the join keys exist only once in the result even if they are selected on both operands.
L$select("lfb", "lda")$ inner_join(R)$to_df() L$inner_join(R$select("rda", "rfb"))$to_df() L$select("lfb", "db")$inner_join(R$select("rfb"))$to_df()
We can specify the join keys and aliases explicitly. Array aliases are important to disambiguate overlapping fields.
# `db` as the join key; `fa` exists in both arrays and will be suffixed with the array aliases in the result
L$left_join(R, by = c('db'), left_alias = "_LL", right_alias = "_RR")$to_df()

# different join keys
L$right_join(R, by.x = "lda", by.y = "rda", left_alias = "_LL", right_alias = "_RR")$to_df()
By default, join is implemented by the SciDB `equi_join` operator, which works on array dimensions and/or attributes. But `equi_join` has one drawback: it materializes results, which can be undesirable in a complex nested join. We can choose a different join mode, `cross_join`, to avoid result materialization. Two constraints apply in `cross_join` mode: only `inner_join` is supported, and the join keys must be array dimensions (as in the example below).

L$inner_join(R,
             by.x = c('db', 'lda'), by.y = c('db', 'rda'),
             join_mode = 'cross_join',
             left_alias = "_LL", right_alias = "_RR")$
  to_df()
We can also pass settings of the underlying `equi_join` operator via the `settings` param:

result = L$inner_join(R, settings = list(
  algorithm = "'hash_replicate_right'", # string setting item
  hash_join_threshold = 1024            # numeric setting item
))
result$to_afl()
result$to_df()
In many SQL dialects, we can define an auto-incremented field: when new records are inserted, the field is incremented automatically. SciDB currently does not support this feature. Given a data source array_op (normally created from an R data frame or `conn$fread`) and a target template array into which we insert data, we can achieve this feature in two steps (see the sketch below):

1. generate the auto fields on the source with `set_auto_fields`, passing in the target array
2. insert the result into the target array
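A minimal sketch of the two steps, reusing the `source` and `target` names defined in the example that follows (the `change_schema` call, taken from the full example further down, conforms the result to the target's schema):

source$set_auto_fields(target)$ # step 1: generate the auto-incremented fields
  change_schema(target)$        # conform to the target schema
  update(target)$               # step 2: insert the result into the target
  execute()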
Let's see an example. First, create a target array with 3 attributes and 2 dimensions, and a source array with some data. Notice we set the source array's `build_dim_spec = 'z'` so that we can refer to it later.
AutoFieldsContent = data.frame(da = 1:10, db = 1:10, f_str = letters[1:10],
                               f_int32 = 1:10, f_int64 = 11:20)
target = conn$array_from_df(AutoFieldsContent,
                            template = "<f_str:string, f_int32:int32, f_int64:int64> [da=0:*:0:1000; db=0:*:0:1000]",
                            force_template_schema = T)$
  persist(.temp = T, .gc = F)
empty_target = conn$create_array(dbutils$random_array_name(), target, .temp = T)
source = conn$compile_df(data.frame(f_str = LETTERS[1:3]), template = target, build_dim_spec = "z")
source$set_auto_fields(target)$to_df()
# same as below
# source$set_auto_fields(target, source_auto_increment=c(z=0), target_auto_increment=c(da=0, db=0, f_int32=0))$to_df()

# target_auto_increment does not matter if the target array is not empty
source$set_auto_fields(target, source_auto_increment=c(z=0), target_auto_increment=c(da=1, db=2, f_int32=3))$to_df()

# target_auto_increment affects the initial field values if the target array is empty
source$set_auto_fields(empty_target, source_auto_increment=c(z=0), target_auto_increment=c(da=1, db=2, f_int32=3))$to_df()

# By default, both source and target auto increments start from 0.
# The only dimension in source is used as the source_auto_increment field;
# otherwise, we need to specify the source field.
# All the missing fields are assumed to be target_auto_increment fields unless provided otherwise.
source$set_auto_fields(empty_target)$to_df()
source$set_auto_fields(empty_target, target_auto_increment = c("da", "db"))$to_df()
In a less common case, we may need to populate a dimension field (the `anti_collision_field`) to avoid cell coordinate collisions.
AutoFieldsContent

conn$
  compile_df(data.frame(f_str = LETTERS[4:10], f_int32 = 4:10, da = c(1,2,1,2,1,4,5)),
             template = target)$
  set_auto_fields(target, anti_collision_field = 'db')$
  to_df() %>%
  dplyr::arrange(da, db)

conn$
  compile_df(data.frame(f_str = LETTERS[4:10], f_int32 = 4:10, da = c(1,2,1,2,1,4,5)),
             template = empty_target)$
  set_auto_fields(empty_target, anti_collision_field = 'db')$
  to_df() %>%
  dplyr::arrange(da, db)
Notice there are no duplicated `(da, db)` tuples, so the cells can be safely inserted into the target array without coordinate collisions.
We can combine the two types of auto fields in one run and update the target array.
conn$
  compile_df(data.frame(f_str = LETTERS[4:10], da = c(1,2,1,2,1,4,5)),
             template = target, build_dim_spec = "z")$
  set_auto_fields(target,
                  source_auto_increment = "z",
                  target_auto_increment = "f_int64",
                  anti_collision_field = 'db')$
  mutate(f_int32 = int32(null))$
  change_schema(target)$
  update(target)$
  execute()
target$to_df_all()
conn$
  compile_df(data.frame(f_str = LETTERS[4:10], da = c(1,2,1,2,1,4,5)),
             template = target, build_dim_spec = "z")$
  set_auto_fields(empty_target,
                  source_auto_increment = "z",
                  target_auto_increment = "f_int64",
                  anti_collision_field = 'db')$
  mutate(f_int32 = int32(null))$
  change_schema(empty_target)$
  update(empty_target)$
  execute()
empty_target$to_df_all()
# clean up
try({
  target$remove_array()
  empty_target$remove_array()
}, silent = T)
# Tip: call `dbutils$db_aggregates()$to_df()` to see the supported aggregate functions
arr_existing$
  group_by("Plant", "Type")$
  summarize(count = count(),
            uptake_sd = stdev(uptake),
            avg(conc), # unnamed fields are named by SciDB
            max(uptake))$
  to_df()

# An aggregation result is still an array_op, so we can carry on.
arr_existing$
  group_by("Plant", "Type")$
  summarize(count = count(),
            uptake_sd = stdev(uptake),
            avg(conc),
            max(uptake))$
  filter(uptake_sd > 8)$ # require the uptake standard deviation to be > 8
  to_df()
Sometimes we need to get stats on array fields for all cells without any group_by fields, e.g. find the max/min value of a field.
arr_existing$summarize(
  count(),
  max(conc),
  stdev(uptake),
  max(i) # this is a dimension
)$to_df()
Since we just modified `arr_stored` several times, let's check how many versions it has so far and then delete the version history.
arr_stored$list_versions()

# Check how many cells are in each version
for(version_id in 1:3){
  print(sprintf("Version-%d has %d cells", version_id, arr_stored$version(version_id)$cell_count()))
}
Remove all but the latest version.
arr_stored$remove_versions()
arr_stored$list_versions()
The `fread` method of a connection object takes one or more file paths as input, along with other params, and returns an `array_op` which can be further processed to modify existing arrays or join with other arrays.
If we need to read data from all file columns, use `fread` without a template. This behaves similarly to `data.table::fread`.
# File column names and types are inferred by peeking at the first few lines of the file.
conn$fread("/file/path", header = T, sep = "\t") # default sep is "\t"

# col.names are V1, V2, ...; types are inferred.
conn$fread("/file/path", header = F, nrow = 10) # infer column types using only up to the first 10 rows

# File column types are inferred, but names are provided explicitly. col.names can differ
# from the actual file column names but must match the number of columns.
conn$fread("/file/path", header = T, col.names = c("chrom", "pos", "ref", "alt"))
conn$fread("/file/path", header = F, col.names = c("chrom", "pos", "ref", "alt")) # same as above, but the first row is read as data
More commonly, we want to load data from a file into an existing array. In this case, we provide a template (either an `array_op` or a schema string); only the columns whose names match the template's fields are imported. NOTE: `template` field types supersede the actual file column types. If `header = T`, column names can still be inferred.
conn$fread("/file/path", template = templateArray, header = T) # template can also be a schema string # When header = F, we need to provide the correct col.names so they match the template fields. conn$fread("/file/path", template = "<ref:string, alt:string> [chrom;pos]", header = F, col.names = c("chrom", "pos", "extra_col1", "ref", "alt", "extra_col2")) # extra_cols ignored
In the examples above, `fread` generates AFL where fields are coerced to the template data types. If no template is provided, we can think of the data.frame read from the first `nrows` lines of the file as the template. But not all files are perfectly formatted; we often need to handle ill-formatted fields or missing data. The `auto_dcast` param, if set to TRUE, casts all numerical fields using `dcast(a1, double(null))`, where `double` can be any numerical type and `a1` is the matching field.
conn$fread("/file/path", template = myArray, header = T, auto_dcast = T)
For special field casting, we can mutate the imported fields on the fly with the `mutate_fields` param. `@` is replaced with the raw imported string fields, e.g. a0, a1, a2, ...
conn$fread("/file/path", template = myArray, header = T, mutate_fields = list('chrom' = "iif(@='X', 23, iif(@='Y', 24, dcast(@, int64(null))))")) # We can still dcast other numeric fields while casting `chrom` differently conn$fread("/file/path", template = myArray, header = T, auto_dcast = T, mutate_fields = list('chrom' = "iif(@='X', 23, iif(@='Y', 24, dcast(@, int64(null))))"))
Hopefully `arrayop` captures all your common SciDB use cases. In case something is missing, we can run raw AFL statements directly on a `ScidbConnection` object. To run AFL statements (as strings, i.e. length-1 R character vectors), use one of these `ScidbConnection` methods:
- `query` if we expect results (array attributes as R data frames)
- `query_all` if we need both array dimensions and attributes
- `execute` if we do not expect results

conn$query("list('operators')")
conn$query("op_scidbversion()")
Let's create an array `temp_array` in the `public` namespace for later examples.
conn$execute("create array public.temp_array <a:int32, b:string, c:bool> [z]") conn$query("show(temp_array)") # query(...) returns result attribtues only conn$query_all("show(temp_array)") # query_all(...) returns result dimesions and attributes
We can pipe multiple SciDB operators in the `arrayop::afl` function, which returns an AFL string. Depending on the number of operands:

- `afl(a | b)` is converted to `b(a)`
- `afl(a | b(c))` is converted to `b(a, c)`
conn$query(afl(
  "'operators'" | list | filter("library != 'scidb'") | limit(10)
))
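To see the conversion rules without querying the database, we can build and inspect the AFL string itself. A small sketch (the array and field names are hypothetical):

# afl() only builds an AFL string; nothing is executed yet
afl("myArray" | filter("x > 1") | limit(5))
# per the rules above, this yields: "limit(filter(myArray, x > 1), 5)"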
NOTE: removed arrays cannot be recovered. Always double check and back up if necessary before entering the danger zone.
try(conn$execute("remove(temp_array)"), silent = T) # manually remove for(x in c(arr_uploaded, arr_created, arr_created_df, arr_existing, arr_compiled, arr_stored, L, R )){ if(x$exists_persistent_array()) { # can only remove persistent arrays x$remove_array() } }