Skip to content

Commit

Permalink
ENH: export to geoarrow output (#53)
Browse files Browse the repository at this point in the history
Co-authored-by: Benoit Bovy <[email protected]>
  • Loading branch information
jorisvandenbossche and benbovy authored Nov 15, 2024
1 parent eb7d03f commit 1217af5
Show file tree
Hide file tree
Showing 3 changed files with 322 additions and 1 deletion.
1 change: 1 addition & 0 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ Input/Output
from_wkt
to_wkt
from_geoarrow
to_geoarrow

.. _api_measurement:

Expand Down
249 changes: 248 additions & 1 deletion src/geoarrow.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,190 @@ py::array_t<PyObjectGeography> from_geoarrow(py::object input,
return result;
}

/// \brief Object holding (and managing the memory) of an Arrow array (ArrowArray and ArraySchema
/// combo)
class ArrowArrayHolder {
public:
/// \brief Construct an invalid instance holding no resources
ArrowArrayHolder() {
array_.release = nullptr;
schema_.release = nullptr;
}

/// \brief Move and take ownership of data
ArrowArrayHolder(ArrowArray* array, ArrowSchema* schema) {
move(array, schema, &array_, &schema_);
}

/// \brief Move and take ownership of data wrapped by rhs
ArrowArrayHolder(ArrowArrayHolder&& rhs) : ArrowArrayHolder(rhs.array(), rhs.schema()) {}
ArrowArrayHolder& operator=(ArrowArrayHolder&& rhs) {
reset(rhs.array(), rhs.schema());
return *this;
}

// These objects are not copyable
ArrowArrayHolder(const ArrowArrayHolder& rhs) = delete;

/// \brief Get a pointer to the data owned by this object
ArrowArray* array() noexcept {
return &array_;
}
const ArrowArray* array() const noexcept {
return &array_;
}

ArrowSchema* schema() noexcept {
return &schema_;
}
const ArrowSchema* schema() const noexcept {
return &schema_;
}

void set_schema(ArrowSchema* schema_src) {
memcpy(&schema_, schema_src, sizeof(struct ArrowSchema));
schema_src->release = nullptr;
}

py::tuple return_capsules(py::args args, const py::kwargs& kwargs) {
if ((args.size() > 0) && (!args[0].is_none())) {
throw std::invalid_argument(
"Additional arguments (such as requested_schema) with a non-default value are not "
"supported");
}
if (kwargs) {
for (auto& item : kwargs) {
if (!item.second.is_none()) {
throw std::invalid_argument(
"Additional arguments (such as requested_schema) with a non-default value "
"are not supported");
}
}
}

ArrowArray* c_array = static_cast<ArrowArray*>(malloc(sizeof(ArrowArray)));
ArrowSchema* c_schema = static_cast<ArrowSchema*>(malloc(sizeof(ArrowSchema)));
move(&array_, &schema_, c_array, c_schema);

constexpr auto array_cleanup = [](void* ptr) noexcept {
auto array = static_cast<ArrowArray*>(ptr);
if (array->release != nullptr) {
array->release(array);
}
free(array);
};
py::capsule array_capsule{c_array, "arrow_array", array_cleanup};

constexpr auto schema_cleanup = [](void* ptr) noexcept {
auto schema = static_cast<ArrowSchema*>(ptr);
if (schema->release != nullptr) {
schema->release(schema);
}
free(schema);
};
py::capsule schema_capsule{c_schema, "arrow_schema", schema_cleanup};

return py::make_tuple(schema_capsule, array_capsule);
}

void move(ArrowArray* array_src,
ArrowSchema* schema_src,
ArrowArray* array_dst,
ArrowSchema* schema_dst) {
memcpy(array_dst, array_src, sizeof(struct ArrowArray));
array_src->release = nullptr;

memcpy(schema_dst, schema_src, sizeof(struct ArrowSchema));
schema_src->release = nullptr;
}

/// \brief Call data's release callback if valid
void reset() {
if (array_.release != nullptr) {
array_.release(&array_);
}

if (schema_.release != nullptr) {
schema_.release(&schema_);
}
}

/// \brief Call data's release callback if valid and move ownership of the data
/// pointed to by data
void reset(ArrowArray* array_src, ArrowSchema* schema_src) {
reset();
move(array_src, schema_src, &array_, &schema_);
}

~ArrowArrayHolder() {
reset();
}

protected:
ArrowArray array_;
ArrowSchema schema_;
};

ArrowArrayHolder to_geoarrow(py::array_t<PyObjectGeography> input,
py::object output_schema,
bool planar,
float tessellate_tolerance,
int precision) {
ArrowArrayHolder array = ArrowArrayHolder();

s2geog::geoarrow::Writer writer;
std::vector<std::unique_ptr<s2geog::Geography>> s2geog_vec;

s2geog::geoarrow::ExportOptions options;
options.set_precision(precision);
if (planar) {
auto tol = S1Angle::Radians(tessellate_tolerance / EARTH_RADIUS_METERS);
options.set_tessellate_tolerance(tol);
// TODO make this configurable
// options.set_projection(s2geog::geoarrow::mercator());
}

if (!output_schema.is(py::none())) {
if (!py::hasattr(output_schema, "__arrow_c_schema__")) {
throw std::invalid_argument(
"'output_schema' should be an Arrow-compatible schema object "
"(i.e. has an '__arrow_c_schema__' method)");
}
py::capsule schema_capsule = output_schema.attr("__arrow_c_schema__")();
ArrowSchema* schema = static_cast<ArrowSchema*>(schema_capsule);
try {
writer.Init(schema, options);
} catch (const std::exception& ex) {
// re-raise RuntimeError as ValueError
if (strlen(ex.what()) > 0) {
throw py::value_error(ex.what());
} else {
throw py::value_error("Error initializing writer. Did you pass a valid schema?");
}
}
array.set_schema(schema);
// TODO add support for specifying a geometry encoding
// } else if (geometry_encoding.equal(py::str("WKT"))) {
// writer.Init(s2geog::geoarrow::Writer::OutputType::kWKT, options);
// } else if (geometry_encoding.equal(py::str("WKB"))) {
// writer.Init(s2geog::geoarrow::Writer::OutputType::kWKB, options);
} else {
throw std::invalid_argument("'output_schema' should be specified");
}

for (int i = 0; i < input.size(); i++) {
writer.WriteGeography((*input.data(i)).as_geog_ptr()->geog());
}

writer.Finish(array.array());

return std::move(array);
}

void init_geoarrow(py::module& m) {
py::class_<ArrowArrayHolder>(m, "ArrowArrayHolder")
.def("__arrow_c_array__", &ArrowArrayHolder::return_capsules);

m.def("from_geoarrow",
&from_geoarrow,
py::arg("input"),
Expand Down Expand Up @@ -109,7 +292,8 @@ void init_geoarrow(py::module& m) {
If set to True, the edges of linestrings and polygons are assumed
to be linear on the plane. In that case, additional points will
be added to the line while creating the geography objects, to
ensure every point is within 100m of the original line.
ensure every point is within the `tessellate_tolerance` distance
(100m by default) of the original line.
By default (False), it is assumed that the edges are spherical
(i.e. represent the shortest path on the sphere between two points).
tessellate_tolerance : float, default 100.0
Expand All @@ -124,4 +308,67 @@ void init_geoarrow(py::module& m) {
binary type, if specifying this keyword with "WKT" or "WKB",
respectively.
)pbdoc");

m.def("to_geoarrow",
&to_geoarrow,
py::arg("input"),
py::pos_only(),
py::kw_only(),
py::arg("output_schema") = py::none(),
py::arg("planar") = false,
py::arg("tessellate_tolerance") = 100.0,
py::arg("precision") = 6,
R"pbdoc(
Convert an array of geographies to an Arrow array object with a GeoArrow
extension type.
See https://geoarrow.org/ for details on the GeoArrow specification.
Parameters
----------
input : array_like
An array of geography objects.
output_schema : Arrow schema, pyarrow.DataType, pyarrow.Field, default None
The geoarrow extension type to use for the output. This can indicate
one of the native geoarrow types (e.g. "point", "linestring", "polygon",
etc) or the serialized WKT or WKB options.
The type can be specified with any Arrow schema compatible object
(any object implementing the Arrow PyCapsule Protocol for schemas,
i.e. which has a ``__arrow_c_schema__`` method). For example, this
can be a ``pyarrow.DataType`` or ``pyarrow.Field``, and you can
use the ``geoarrow.pyarrow`` package to construct such geoarrow
extension types.
planar : bool, default False
If set to True, the edges of linestrings and polygons in the output
are assumed to be linear on the plane. In that case, additional
points will be added to the line while converting to the output
encoding, to ensure every point is within the `tessellate_tolerance`
distance (100m by default) of the original line on the sphere.
tessellate_tolerance : float, default 100.0
The maximum distance in meters that a point must be moved to
satisfy the planar edge constraint. This is only used if `planar`
is set to True.
precision : int, default 6
The number of decimal places to include in the output. Only used
when writing as WKT.
Returns
-------
ArrowArrayHolder
A generic Arrow array object with geograhies encoded to GeoArrow.
Examples
--------
>>> import spherely
>>> import geoarrow.pyarrow as ga
>>> arr = spherely.to_geoarrow(arr, output_schema=ga.point())
The returned object is a generic Arrow-compatible object that then
can be consumed by your Arrow library of choice. For example, using
``pyarrow``:
>>> import pyarrow as pa
>>> arr_pa = pa.array(arr)
)pbdoc");
}
73 changes: 73 additions & 0 deletions tests/test_geoarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,3 +109,76 @@ def test_from_geoarrow_invalid_encoding():
def test_from_geoarrow_no_arrow_object():
with pytest.raises(ValueError, match="input should be an Arrow-compatible array"):
spherely.from_geoarrow(np.array(["POINT (1 1)"], dtype=object))


def test_to_geoarrow():
arr = spherely.points([1, 2, 3], [1, 2, 3])
res = spherely.to_geoarrow(
arr, output_schema=ga.point().with_coord_type(ga.CoordType.INTERLEAVED)
)
assert isinstance(res, spherely.ArrowArrayHolder)
assert hasattr(res, "__arrow_c_array__")

arr_pa = pa.array(res)
coords = np.asarray(arr_pa.storage.values)
expected = np.array([1, 1, 2, 2, 3, 3], dtype="float64")
np.testing.assert_allclose(coords, expected)


def test_to_geoarrow_wkt():
arr = spherely.points([1, 2, 3], [1, 2, 3])
result = pa.array(spherely.to_geoarrow(arr, output_schema=ga.wkt()))
expected = pa.array(["POINT (1 1)", "POINT (2 2)", "POINT (3 3)"])
assert result.storage.equals(expected)


def test_to_geoarrow_wkb():
arr = spherely.points([1, 2, 3], [1, 2, 3])
result = pa.array(spherely.to_geoarrow(arr, output_schema=ga.wkb()))
# the conversion from lon/lat values to S2 points and back gives some floating
# point differences, and output to WKB does not do any rounding,
# therefore checking exact values here
expected = ga.as_wkb(
[
"POINT (0.9999999999999998 1)",
"POINT (2 1.9999999999999996)",
"POINT (3.0000000000000004 3.0000000000000004)",
]
)
assert result.equals(expected)


def test_wkt_roundtrip():
wkt = [
"POINT (30 10)",
"LINESTRING (30 10, 10 30, 40 40)",
"POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))",
"POLYGON ((35 10, 45 45, 15 40, 10 20, 35 10), (20 30, 35 35, 30 20, 20 30))",
"MULTIPOINT ((10 40), (40 30), (20 20), (30 10))",
"MULTILINESTRING ((10 10, 20 20, 10 40), (40 40, 30 30, 40 20, 30 10))",
"MULTIPOLYGON (((30 20, 45 40, 10 40, 30 20)), ((15 5, 40 10, 10 20, 5 10, 15 5)))",
"MULTIPOLYGON (((40 40, 20 45, 45 30, 40 40)), ((20 35, 10 30, 10 10, 30 5, 45 20, 20 35), (30 20, 20 15, 20 25, 30 20)))",
"GEOMETRYCOLLECTION (POINT (40 10), LINESTRING (10 10, 20 20, 10 40), POLYGON ((40 40, 20 45, 45 30, 40 40)))",
]

arr = spherely.from_geoarrow(ga.as_wkt(wkt))
result = pa.array(spherely.to_geoarrow(arr, output_schema=ga.wkt()))
np.testing.assert_array_equal(result, wkt)


def test_to_geoarrow_no_output_encoding():
arr = spherely.points([1, 2, 3], [1, 2, 3])

with pytest.raises(ValueError, match="'output_schema' should be specified"):
spherely.to_geoarrow(arr)


def test_to_geoarrow_invalid_output_schema():
arr = spherely.points([1, 2, 3], [1, 2, 3])
with pytest.raises(
ValueError, match="'output_schema' should be an Arrow-compatible schema"
):
spherely.to_geoarrow(arr, output_schema="WKT")

with pytest.raises(ValueError, match="Did you pass a valid schema"):
spherely.to_geoarrow(arr, output_schema=pa.schema([("test", pa.int64())]))

0 comments on commit 1217af5

Please sign in to comment.