diff --git a/docs/api.rst b/docs/api.rst index b5b7fa5..83270db 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -47,6 +47,7 @@ Input/Output from_wkt to_wkt from_geoarrow + to_geoarrow .. _api_measurement: diff --git a/src/geoarrow.cpp b/src/geoarrow.cpp index b434d21..1b7cc21 100644 --- a/src/geoarrow.cpp +++ b/src/geoarrow.cpp @@ -71,7 +71,190 @@ py::array_t from_geoarrow(py::object input, return result; } +/// \brief Object holding (and managing the memory) of an Arrow array (ArrowArray and ArraySchema +/// combo) +class ArrowArrayHolder { +public: + /// \brief Construct an invalid instance holding no resources + ArrowArrayHolder() { + array_.release = nullptr; + schema_.release = nullptr; + } + + /// \brief Move and take ownership of data + ArrowArrayHolder(ArrowArray* array, ArrowSchema* schema) { + move(array, schema, &array_, &schema_); + } + + /// \brief Move and take ownership of data wrapped by rhs + ArrowArrayHolder(ArrowArrayHolder&& rhs) : ArrowArrayHolder(rhs.array(), rhs.schema()) {} + ArrowArrayHolder& operator=(ArrowArrayHolder&& rhs) { + reset(rhs.array(), rhs.schema()); + return *this; + } + + // These objects are not copyable + ArrowArrayHolder(const ArrowArrayHolder& rhs) = delete; + + /// \brief Get a pointer to the data owned by this object + ArrowArray* array() noexcept { + return &array_; + } + const ArrowArray* array() const noexcept { + return &array_; + } + + ArrowSchema* schema() noexcept { + return &schema_; + } + const ArrowSchema* schema() const noexcept { + return &schema_; + } + + void set_schema(ArrowSchema* schema_src) { + memcpy(&schema_, schema_src, sizeof(struct ArrowSchema)); + schema_src->release = nullptr; + } + + py::tuple return_capsules(py::args args, const py::kwargs& kwargs) { + if ((args.size() > 0) && (!args[0].is_none())) { + throw std::invalid_argument( + "Additional arguments (such as requested_schema) with a non-default value are not " + "supported"); + } + if (kwargs) { + for (auto& item : kwargs) { + if (!item.second.is_none()) { + throw std::invalid_argument( + "Additional arguments (such as requested_schema) with a non-default value " + "are not supported"); + } + } + } + + ArrowArray* c_array = static_cast(malloc(sizeof(ArrowArray))); + ArrowSchema* c_schema = static_cast(malloc(sizeof(ArrowSchema))); + move(&array_, &schema_, c_array, c_schema); + + constexpr auto array_cleanup = [](void* ptr) noexcept { + auto array = static_cast(ptr); + if (array->release != nullptr) { + array->release(array); + } + free(array); + }; + py::capsule array_capsule{c_array, "arrow_array", array_cleanup}; + + constexpr auto schema_cleanup = [](void* ptr) noexcept { + auto schema = static_cast(ptr); + if (schema->release != nullptr) { + schema->release(schema); + } + free(schema); + }; + py::capsule schema_capsule{c_schema, "arrow_schema", schema_cleanup}; + + return py::make_tuple(schema_capsule, array_capsule); + } + + void move(ArrowArray* array_src, + ArrowSchema* schema_src, + ArrowArray* array_dst, + ArrowSchema* schema_dst) { + memcpy(array_dst, array_src, sizeof(struct ArrowArray)); + array_src->release = nullptr; + + memcpy(schema_dst, schema_src, sizeof(struct ArrowSchema)); + schema_src->release = nullptr; + } + + /// \brief Call data's release callback if valid + void reset() { + if (array_.release != nullptr) { + array_.release(&array_); + } + + if (schema_.release != nullptr) { + schema_.release(&schema_); + } + } + + /// \brief Call data's release callback if valid and move ownership of the data + /// pointed to by data + void reset(ArrowArray* array_src, ArrowSchema* schema_src) { + reset(); + move(array_src, schema_src, &array_, &schema_); + } + + ~ArrowArrayHolder() { + reset(); + } + +protected: + ArrowArray array_; + ArrowSchema schema_; +}; + +ArrowArrayHolder to_geoarrow(py::array_t input, + py::object output_schema, + bool planar, + float tessellate_tolerance, + int precision) { + ArrowArrayHolder array = ArrowArrayHolder(); + + s2geog::geoarrow::Writer writer; + std::vector> s2geog_vec; + + s2geog::geoarrow::ExportOptions options; + options.set_precision(precision); + if (planar) { + auto tol = S1Angle::Radians(tessellate_tolerance / EARTH_RADIUS_METERS); + options.set_tessellate_tolerance(tol); + // TODO make this configurable + // options.set_projection(s2geog::geoarrow::mercator()); + } + + if (!output_schema.is(py::none())) { + if (!py::hasattr(output_schema, "__arrow_c_schema__")) { + throw std::invalid_argument( + "'output_schema' should be an Arrow-compatible schema object " + "(i.e. has an '__arrow_c_schema__' method)"); + } + py::capsule schema_capsule = output_schema.attr("__arrow_c_schema__")(); + ArrowSchema* schema = static_cast(schema_capsule); + try { + writer.Init(schema, options); + } catch (const std::exception& ex) { + // re-raise RuntimeError as ValueError + if (strlen(ex.what()) > 0) { + throw py::value_error(ex.what()); + } else { + throw py::value_error("Error initializing writer. Did you pass a valid schema?"); + } + } + array.set_schema(schema); + // TODO add support for specifying a geometry encoding + // } else if (geometry_encoding.equal(py::str("WKT"))) { + // writer.Init(s2geog::geoarrow::Writer::OutputType::kWKT, options); + // } else if (geometry_encoding.equal(py::str("WKB"))) { + // writer.Init(s2geog::geoarrow::Writer::OutputType::kWKB, options); + } else { + throw std::invalid_argument("'output_schema' should be specified"); + } + + for (int i = 0; i < input.size(); i++) { + writer.WriteGeography((*input.data(i)).as_geog_ptr()->geog()); + } + + writer.Finish(array.array()); + + return std::move(array); +} + void init_geoarrow(py::module& m) { + py::class_(m, "ArrowArrayHolder") + .def("__arrow_c_array__", &ArrowArrayHolder::return_capsules); + m.def("from_geoarrow", &from_geoarrow, py::arg("input"), @@ -109,7 +292,8 @@ void init_geoarrow(py::module& m) { If set to True, the edges of linestrings and polygons are assumed to be linear on the plane. In that case, additional points will be added to the line while creating the geography objects, to - ensure every point is within 100m of the original line. + ensure every point is within the `tessellate_tolerance` distance + (100m by default) of the original line. By default (False), it is assumed that the edges are spherical (i.e. represent the shortest path on the sphere between two points). tessellate_tolerance : float, default 100.0 @@ -124,4 +308,67 @@ void init_geoarrow(py::module& m) { binary type, if specifying this keyword with "WKT" or "WKB", respectively. )pbdoc"); + + m.def("to_geoarrow", + &to_geoarrow, + py::arg("input"), + py::pos_only(), + py::kw_only(), + py::arg("output_schema") = py::none(), + py::arg("planar") = false, + py::arg("tessellate_tolerance") = 100.0, + py::arg("precision") = 6, + R"pbdoc( + Convert an array of geographies to an Arrow array object with a GeoArrow + extension type. + + See https://geoarrow.org/ for details on the GeoArrow specification. + + Parameters + ---------- + input : array_like + An array of geography objects. + output_schema : Arrow schema, pyarrow.DataType, pyarrow.Field, default None + The geoarrow extension type to use for the output. This can indicate + one of the native geoarrow types (e.g. "point", "linestring", "polygon", + etc) or the serialized WKT or WKB options. + The type can be specified with any Arrow schema compatible object + (any object implementing the Arrow PyCapsule Protocol for schemas, + i.e. which has a ``__arrow_c_schema__`` method). For example, this + can be a ``pyarrow.DataType`` or ``pyarrow.Field``, and you can + use the ``geoarrow.pyarrow`` package to construct such geoarrow + extension types. + planar : bool, default False + If set to True, the edges of linestrings and polygons in the output + are assumed to be linear on the plane. In that case, additional + points will be added to the line while converting to the output + encoding, to ensure every point is within the `tessellate_tolerance` + distance (100m by default) of the original line on the sphere. + tessellate_tolerance : float, default 100.0 + The maximum distance in meters that a point must be moved to + satisfy the planar edge constraint. This is only used if `planar` + is set to True. + precision : int, default 6 + The number of decimal places to include in the output. Only used + when writing as WKT. + + Returns + ------- + ArrowArrayHolder + A generic Arrow array object with geograhies encoded to GeoArrow. + + Examples + -------- + >>> import spherely + >>> import geoarrow.pyarrow as ga + >>> arr = spherely.to_geoarrow(arr, output_schema=ga.point()) + + The returned object is a generic Arrow-compatible object that then + can be consumed by your Arrow library of choice. For example, using + ``pyarrow``: + + >>> import pyarrow as pa + >>> arr_pa = pa.array(arr) + + )pbdoc"); } diff --git a/tests/test_geoarrow.py b/tests/test_geoarrow.py index cb0b28b..f3b8a39 100644 --- a/tests/test_geoarrow.py +++ b/tests/test_geoarrow.py @@ -109,3 +109,76 @@ def test_from_geoarrow_invalid_encoding(): def test_from_geoarrow_no_arrow_object(): with pytest.raises(ValueError, match="input should be an Arrow-compatible array"): spherely.from_geoarrow(np.array(["POINT (1 1)"], dtype=object)) + + +def test_to_geoarrow(): + arr = spherely.points([1, 2, 3], [1, 2, 3]) + res = spherely.to_geoarrow( + arr, output_schema=ga.point().with_coord_type(ga.CoordType.INTERLEAVED) + ) + assert isinstance(res, spherely.ArrowArrayHolder) + assert hasattr(res, "__arrow_c_array__") + + arr_pa = pa.array(res) + coords = np.asarray(arr_pa.storage.values) + expected = np.array([1, 1, 2, 2, 3, 3], dtype="float64") + np.testing.assert_allclose(coords, expected) + + +def test_to_geoarrow_wkt(): + arr = spherely.points([1, 2, 3], [1, 2, 3]) + result = pa.array(spherely.to_geoarrow(arr, output_schema=ga.wkt())) + expected = pa.array(["POINT (1 1)", "POINT (2 2)", "POINT (3 3)"]) + assert result.storage.equals(expected) + + +def test_to_geoarrow_wkb(): + arr = spherely.points([1, 2, 3], [1, 2, 3]) + result = pa.array(spherely.to_geoarrow(arr, output_schema=ga.wkb())) + # the conversion from lon/lat values to S2 points and back gives some floating + # point differences, and output to WKB does not do any rounding, + # therefore checking exact values here + expected = ga.as_wkb( + [ + "POINT (0.9999999999999998 1)", + "POINT (2 1.9999999999999996)", + "POINT (3.0000000000000004 3.0000000000000004)", + ] + ) + assert result.equals(expected) + + +def test_wkt_roundtrip(): + wkt = [ + "POINT (30 10)", + "LINESTRING (30 10, 10 30, 40 40)", + "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))", + "POLYGON ((35 10, 45 45, 15 40, 10 20, 35 10), (20 30, 35 35, 30 20, 20 30))", + "MULTIPOINT ((10 40), (40 30), (20 20), (30 10))", + "MULTILINESTRING ((10 10, 20 20, 10 40), (40 40, 30 30, 40 20, 30 10))", + "MULTIPOLYGON (((30 20, 45 40, 10 40, 30 20)), ((15 5, 40 10, 10 20, 5 10, 15 5)))", + "MULTIPOLYGON (((40 40, 20 45, 45 30, 40 40)), ((20 35, 10 30, 10 10, 30 5, 45 20, 20 35), (30 20, 20 15, 20 25, 30 20)))", + "GEOMETRYCOLLECTION (POINT (40 10), LINESTRING (10 10, 20 20, 10 40), POLYGON ((40 40, 20 45, 45 30, 40 40)))", + ] + + arr = spherely.from_geoarrow(ga.as_wkt(wkt)) + result = pa.array(spherely.to_geoarrow(arr, output_schema=ga.wkt())) + np.testing.assert_array_equal(result, wkt) + + +def test_to_geoarrow_no_output_encoding(): + arr = spherely.points([1, 2, 3], [1, 2, 3]) + + with pytest.raises(ValueError, match="'output_schema' should be specified"): + spherely.to_geoarrow(arr) + + +def test_to_geoarrow_invalid_output_schema(): + arr = spherely.points([1, 2, 3], [1, 2, 3]) + with pytest.raises( + ValueError, match="'output_schema' should be an Arrow-compatible schema" + ): + spherely.to_geoarrow(arr, output_schema="WKT") + + with pytest.raises(ValueError, match="Did you pass a valid schema"): + spherely.to_geoarrow(arr, output_schema=pa.schema([("test", pa.int64())]))