Skip to content

Commit

Permalink
v2-docs: Add docs for transformed data (#533)
Browse files Browse the repository at this point in the history
* transformed data cleanup and docs

* remove redundant grpc runtime implementations

* fmt

* fix tests

* clippy fixes

* python lint

* lint/types

* pandas first

* fix test
  • Loading branch information
jonmmease authored Nov 7, 2024
1 parent d330c57 commit 2e9c374
Show file tree
Hide file tree
Showing 30 changed files with 585 additions and 456 deletions.
6 changes: 4 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion docs/source/column_usage.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Get Column Usage
# Column Usage
VegaFusion provides a function for introspecting a Vega specification and determining which columns are referenced from each root dataset. A root dataset is one defined at the top-level of the spec that includes a `url` or `values` property. This is useful in contexts where it's more efficient to minimize the number of columns provided to the Vega specification. For example, the Python library uses this function to determine how to downsample the input DataFrame columns prior to converting to Arrow.

When VegaFusion cannot precisely determine which columns are referenced from each root dataset, this function returns `None` or `null` for the corresponding dataset.
Expand Down
1 change: 1 addition & 0 deletions docs/source/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,5 @@ If you've arrived here looking for information on how to scale Vega-Altair visua
:caption: Contents
column_usage
transformed_data
```
16 changes: 16 additions & 0 deletions docs/source/transformed_data.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Transformed Data

VegaFusion can be used to evaluate datasets in a Vega spec and return them as arrow tables or DataFrames. This is the foundation for Vega-Altair's [`chart.transformed_data`](https://altair-viz.github.io/user_guide/transform/index.html#accessing-transformed-data) method.

## Python

```{eval-rst}
.. automethod:: vegafusion.runtime.VegaFusionRuntime.pre_transform_datasets
```

**Example**: See [pre_transformed_data.py](https://github.com/vega/vegafusion/tree/v2/examples/python-examples/pre_transformed_data.py) for a complete example.

## Rust
The Rust API provides a slightly more general `pre_transform_values` method that can extract dataset or signal values.

See [pre_transform_data.rs](https://github.com/vega/vegafusion/tree/v2/examples/rust-examples/examples/pre_transform_data.rs) for a complete example of extracting dataset values as arrow tables.
5 changes: 2 additions & 3 deletions examples/python-examples/column_usage.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
import json
from typing import Any

from vegafusion import get_column_usage
import vegafusion as vf


def main():
spec = get_spec()
column_usage = get_column_usage(spec)
column_usage = vf.get_column_usage(spec)
print(json.dumps(column_usage, indent=2))

assert column_usage == {
Expand Down
170 changes: 170 additions & 0 deletions examples/python-examples/pre_transformed_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
import json
from typing import Any

import vegafusion as vf


def main():
    """Evaluate the ``counts`` dataset of the example spec and print it.

    The spec returned by ``get_spec()`` is handed to
    ``vf.runtime.pre_transform_datasets`` with ``"counts"`` as the only
    requested dataset and ``"polars"`` as the requested dataset format.
    The call is expected to produce no warnings and exactly one result.
    """
    datasets, warnings = vf.runtime.pre_transform_datasets(
        get_spec(),
        ["counts"],
        dataset_format="polars",
    )

    # Sanity checks: nothing was reported, and one dataset came back for
    # the single name that was requested.
    assert warnings == []
    assert len(datasets) == 1

    print(datasets[0])


def get_spec() -> dict[str, Any]:
    """Return the example Vega specification parsed into a dictionary.

    The spec is based on
    https://vega.github.io/editor/#/examples/vega/histogram-null-values
    and declares three datasets: ``table`` (sourced from
    ``data/movies.json`` and binned on ``IMDB Rating``), ``counts``
    (rows with a non-null rating, aggregated by bin) and ``nulls``
    (rows with a null rating, aggregated into a single group).
    """
    # The JSON document is kept inline so the example is self-contained.
    return json.loads(
        """
{
"$schema": "https://vega.github.io/schema/vega/v5.json",
"description": "A histogram of film ratings, modified to include null values.",
"width": 400,
"height": 200,
"padding": 5,
"autosize": {"type": "fit", "resize": true},
"signals": [
{
"name": "maxbins", "value": 10
},
{
"name": "binCount",
"update": "(bins.stop - bins.start) / bins.step"
},
{
"name": "nullGap", "value": 10
},
{
"name": "barStep",
"update": "(width - nullGap) / (1 + binCount)"
}
],
"data": [
{
"name": "table",
"url": "data/movies.json",
"transform": [
{
"type": "extent", "field": "IMDB Rating",
"signal": "extent"
},
{
"type": "bin", "signal": "bins",
"field": "IMDB Rating", "extent": {"signal": "extent"},
"maxbins": 10
}
]
},
{
"name": "counts",
"source": "table",
"transform": [
{
"type": "filter",
"expr": "datum['IMDB Rating'] != null"
},
{
"type": "aggregate",
"groupby": ["bin0", "bin1"]
}
]
},
{
"name": "nulls",
"source": "table",
"transform": [
{
"type": "filter",
"expr": "datum['IMDB Rating'] == null"
},
{
"type": "aggregate",
"groupby": []
}
]
}
],
"scales": [
{
"name": "yscale",
"type": "linear",
"range": "height",
"round": true, "nice": true,
"domain": {
"fields": [
{"data": "counts", "field": "count"},
{"data": "nulls", "field": "count"}
]
}
},
{
"name": "xscale",
"type": "linear",
"range": [{"signal": "barStep + nullGap"}, {"signal": "width"}],
"round": true,
"domain": {"signal": "[bins.start, bins.stop]"},
"bins": {"signal": "bins"}
},
{
"name": "xscale-null",
"type": "band",
"range": [0, {"signal": "barStep"}],
"round": true,
"domain": [null]
}
],
"axes": [
{"orient": "bottom", "scale": "xscale", "tickMinStep": 0.5},
{"orient": "bottom", "scale": "xscale-null"},
{"orient": "left", "scale": "yscale", "tickCount": 5, "offset": 5}
],
"marks": [
{
"type": "rect",
"from": {"data": "counts"},
"encode": {
"update": {
"x": {"scale": "xscale", "field": "bin0", "offset": 1},
"x2": {"scale": "xscale", "field": "bin1"},
"y": {"scale": "yscale", "field": "count"},
"y2": {"scale": "yscale", "value": 0},
"fill": {"value": "steelblue"}
},
"hover": {
"fill": {"value": "firebrick"}
}
}
},
{
"type": "rect",
"from": {"data": "nulls"},
"encode": {
"update": {
"x": {"scale": "xscale-null", "value": null, "offset": 1},
"x2": {"scale": "xscale-null", "band": 1},
"y": {"scale": "yscale", "field": "count"},
"y2": {"scale": "yscale", "value": 0},
"fill": {"value": "#aaa"}
},
"hover": {
"fill": {"value": "firebrick"}
}
}
}
]
}
"""
    )


# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
2 changes: 2 additions & 0 deletions examples/rust-examples/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,5 @@ edition = "2021"
[dev-dependencies]
serde_json = { workspace = true }
vegafusion-core = { path = "../../vegafusion-core" }
vegafusion-runtime = { path = "../../vegafusion-runtime" }
tokio = "1.41.1"
Loading

0 comments on commit 2e9c374

Please sign in to comment.