From b3f531cee934256c4b1f4fb929548d013b57e2e2 Mon Sep 17 00:00:00 2001 From: Thomas Heigl Date: Thu, 5 Dec 2024 10:33:51 +0100 Subject: [PATCH 01/13] feat(java): configurable buffer size limit (#1963) ## What does this PR do? This PR introduces a new configuration option `bufferSizeLimitBytes` that replaces the hard-coded default of 128kb. ## Related issues #1950 ## Does this PR introduce any user-facing change? The PR introduces a new configuration option `bufferSizeLimitBytes`. - [x] Does this PR introduce any public API change? - [ ] Does this PR introduce any binary protocol compatibility change? ## Discussion This PR solves my problem, but I'm not sure if it is the right way to move forward. This is quite a low-level configuration option, but a potentially very important one. Every user whose average payload size is >=128kb, will need to increase this value for maximum performance. Maybe the default limit should be increased to something less conservative like 1MB, so fewer users will need to adjust this setting? --- .../src/main/java/org/apache/fury/Fury.java | 5 ++--- .../java/org/apache/fury/config/Config.java | 8 +++++++ .../org/apache/fury/config/FuryBuilder.java | 12 +++++++++++ .../test/java/org/apache/fury/FuryTest.java | 21 +++++++++++++++++++ 4 files changed, 43 insertions(+), 3 deletions(-) diff --git a/java/fury-core/src/main/java/org/apache/fury/Fury.java b/java/fury-core/src/main/java/org/apache/fury/Fury.java index e10e989ab0..b1918358a2 100644 --- a/java/fury-core/src/main/java/org/apache/fury/Fury.java +++ b/java/fury-core/src/main/java/org/apache/fury/Fury.java @@ -98,7 +98,6 @@ public final class Fury implements BaseFury { private static final byte isOutOfBandFlag = 1 << 3; private static final boolean isLittleEndian = ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN; private static final byte BITMAP = isLittleEndian ? 
isLittleEndianFlag : 0; - private static final int BUFFER_SIZE_LIMIT = 128 * 1024; private static final short MAGIC_NUMBER = 0x62D4; private final Config config; @@ -330,8 +329,8 @@ public MemoryBuffer getBuffer() { public void resetBuffer() { MemoryBuffer buf = buffer; - if (buf != null && buf.size() > BUFFER_SIZE_LIMIT) { - buffer = MemoryBuffer.newHeapBuffer(BUFFER_SIZE_LIMIT); + if (buf != null && buf.size() > config.bufferSizeLimitBytes()) { + buffer = MemoryBuffer.newHeapBuffer(config.bufferSizeLimitBytes()); } } diff --git a/java/fury-core/src/main/java/org/apache/fury/config/Config.java b/java/fury-core/src/main/java/org/apache/fury/config/Config.java index 98301413b3..bc6afe87d8 100644 --- a/java/fury-core/src/main/java/org/apache/fury/config/Config.java +++ b/java/fury-core/src/main/java/org/apache/fury/config/Config.java @@ -61,6 +61,7 @@ public class Config implements Serializable { private transient int configHash; private final boolean deserializeNonexistentEnumValueAsNull; private final boolean serializeEnumByName; + private final int bufferSizeLimitBytes; public Config(FuryBuilder builder) { name = builder.name; @@ -95,6 +96,7 @@ public Config(FuryBuilder builder) { scalaOptimizationEnabled = builder.scalaOptimizationEnabled; deserializeNonexistentEnumValueAsNull = builder.deserializeNonexistentEnumValueAsNull; serializeEnumByName = builder.serializeEnumByName; + bufferSizeLimitBytes = builder.bufferSizeLimitBytes; } /** Returns the name for Fury serialization. 
*/ @@ -187,6 +189,10 @@ public LongEncoding longEncoding() { return longEncoding; } + public int bufferSizeLimitBytes() { + return bufferSizeLimitBytes; + } + public boolean requireClassRegistration() { return requireClassRegistration; } @@ -283,6 +289,7 @@ public boolean equals(Object o) { && compressString == config.compressString && compressInt == config.compressInt && compressLong == config.compressLong + && bufferSizeLimitBytes == config.bufferSizeLimitBytes && requireClassRegistration == config.requireClassRegistration && suppressClassRegistrationWarnings == config.suppressClassRegistrationWarnings && registerGuavaTypes == config.registerGuavaTypes @@ -317,6 +324,7 @@ public int hashCode() { compressInt, compressLong, longEncoding, + bufferSizeLimitBytes, requireClassRegistration, suppressClassRegistrationWarnings, registerGuavaTypes, diff --git a/java/fury-core/src/main/java/org/apache/fury/config/FuryBuilder.java b/java/fury-core/src/main/java/org/apache/fury/config/FuryBuilder.java index 8e183d66fb..06e954c5c8 100644 --- a/java/fury-core/src/main/java/org/apache/fury/config/FuryBuilder.java +++ b/java/fury-core/src/main/java/org/apache/fury/config/FuryBuilder.java @@ -83,6 +83,7 @@ public final class FuryBuilder { boolean suppressClassRegistrationWarnings = true; boolean deserializeNonexistentEnumValueAsNull = false; boolean serializeEnumByName = false; + int bufferSizeLimitBytes = 128 * 1024; MetaCompressor metaCompressor = new DeflaterMetaCompressor(); public FuryBuilder() {} @@ -184,6 +185,17 @@ public FuryBuilder withStringCompressed(boolean stringCompressed) { return this; } + /** + * Sets the limit for Fury's internal buffer. If the buffer size exceeds this limit, it will be + * reset to this limit after every serialization and deserialization. + * + *

The default is 128k. + */ + public FuryBuilder withBufferSizeLimitBytes(int bufferSizeLimitBytes) { + this.bufferSizeLimitBytes = bufferSizeLimitBytes; + return this; + } + /** * Set classloader for fury to load classes, this classloader can't up updated. Fury will cache * the class meta data, if classloader can be updated, there may be class meta collision if diff --git a/java/fury-core/src/test/java/org/apache/fury/FuryTest.java b/java/fury-core/src/test/java/org/apache/fury/FuryTest.java index 1e4b5a00ae..b5aaa0ffc7 100644 --- a/java/fury-core/src/test/java/org/apache/fury/FuryTest.java +++ b/java/fury-core/src/test/java/org/apache/fury/FuryTest.java @@ -615,4 +615,25 @@ public void testNullObjSerAndDe() { Object obj = fury.deserializeJavaObjectAndClass(bytes); assertNull(obj); } + + @Test + public void testResetBufferToSizeLimit() { + final int minBufferBytes = 64; + final int limitInBytes = 1024; + Fury fury = Fury.builder().withBufferSizeLimitBytes(limitInBytes).build(); + + final byte[] smallPayload = new byte[0]; + final byte[] serializedSmall = fury.serialize(smallPayload); + assertEquals(fury.getBuffer().size(), minBufferBytes); + + fury.deserialize(serializedSmall); + assertEquals(fury.getBuffer().size(), minBufferBytes); + + final byte[] largePayload = new byte[limitInBytes * 2]; + final byte[] serializedLarge = fury.serialize(largePayload); + assertEquals(fury.getBuffer().size(), limitInBytes); + + fury.deserialize(serializedLarge); + assertEquals(fury.getBuffer().size(), limitInBytes); + } } From 3865dcd0982c0ca9de04a8ba9635892b76288769 Mon Sep 17 00:00:00 2001 From: penguin_wwy <940375606@qq.com> Date: Sun, 8 Dec 2024 00:09:32 +0800 Subject: [PATCH 02/13] perf(python): Directly access the key-value pairs of a dict (#1970) ## What does this PR do? In Python, to implement a linear memory structure that stores key-value pairs, we can traverse them in the order of insertion like accessing an array. 
However, Cython does not provide a direct access interface, and these interfaces are internal in CPython, requiring compatibility work to use them correctly. Nevertheless, we can still use the `PyDict_Next` interface to replace the `items` method. Essentially, `items` uses `PyDict_Next` to append to a list. Doing so can reduce the copying overhead. ## Related issues ## Does this PR introduce any user-facing change? - [ ] Does this PR introduce any public API change? - [ ] Does this PR introduce any binary protocol compatibility change? ## Benchmark For large dict ``` [dict_item] 541 us +- 39 us -> [dict_next] 535 us +- 35 us: 1.00x faster [dict_item] 119.8 MiB +- 1344.0 KiB -> [dict_next] 118.8 MiB +- 1338.4 KiB: 1.01x faster ``` --- python/pyfury/_serialization.pyx | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/python/pyfury/_serialization.pyx b/python/pyfury/_serialization.pyx index ce1443c65f..74bd755b0a 100644 --- a/python/pyfury/_serialization.pyx +++ b/python/pyfury/_serialization.pyx @@ -44,6 +44,7 @@ from pyfury.util import is_little_endian from libc.stdint cimport * from libcpp.vector cimport vector from cpython cimport PyObject +from cpython.dict cimport PyDict_Next from cpython.ref cimport * from cpython.list cimport PyList_New, PyList_SET_ITEM from cpython.tuple cimport PyTuple_New, PyTuple_SET_ITEM @@ -2049,7 +2050,13 @@ cdef class MapSerializer(Serializer): buffer.write_varint32(len(value)) cdef ClassInfo key_classinfo cdef ClassInfo value_classinfo - for k, v in value.items(): + cdef int64_t key_addr, value_addr + cdef Py_ssize_t pos = 0 + while PyDict_Next(value, &pos, &key_addr, &value_addr) != 0: + k = int2obj(key_addr) + Py_INCREF(k) + v = int2obj(value_addr) + Py_INCREF(v) key_cls = type(k) if key_cls is str: buffer.write_int16(NOT_NULL_STRING_FLAG) @@ -2122,7 +2129,13 @@ cdef class MapSerializer(Serializer): cpdef inline xwrite(self, Buffer buffer, o): cdef dict value = o buffer.write_varint32(len(value)) - 
for k, v in value.items(): + cdef int64_t key_addr, value_addr + cdef Py_ssize_t pos = 0 + while PyDict_Next(value, &pos, &key_addr, &value_addr) != 0: + k = int2obj(key_addr) + Py_INCREF(k) + v = int2obj(value_addr) + Py_INCREF(v) self.fury.xserialize_ref( buffer, k, serializer=self.key_serializer ) From b43d521fa0eb29cc97fc60f4d7bfa4171c993c6a Mon Sep 17 00:00:00 2001 From: Shawn Yang Date: Tue, 10 Dec 2024 08:53:43 +0800 Subject: [PATCH 03/13] docs(java): add object mapping example and tests (#1974) ## What does this PR do? add object mapping example and tests ## Related issues #1973 ## Does this PR introduce any user-facing change? - [ ] Does this PR introduce any public API change? - [ ] Does this PR introduce any binary protocol compatibility change? ## Benchmark --- docs/guide/java_serialization_guide.md | 51 +++++++++++++++++++ .../test/java/org/apache/fury/FuryTest.java | 34 +++++++++++++ 2 files changed, 85 insertions(+) diff --git a/docs/guide/java_serialization_guide.md b/docs/guide/java_serialization_guide.md index cd9ac57cfd..5270390113 100644 --- a/docs/guide/java_serialization_guide.md +++ b/docs/guide/java_serialization_guide.md @@ -398,6 +398,57 @@ losing any information. If metadata sharing is not enabled, the new class data will be skipped and an `NonexistentSkipClass` stub object will be returned. +### Copying/Mapping object from one type to another type + +Fury supports mapping an object from one type to another type. +> Notes: +> +> 1. This mapping will execute a deep copy, all mapped fields are serialized into binary and +deserialized from that binary to map into another type. +> 2. All struct types must be registered with the same ID, otherwise Fury cannot map to the correct struct type. +> Be careful when you use `Fury#register(Class)`, because fury will allocate an auto-grown ID which might be +> inconsistent if you register classes in a different order across Fury instances. 
+ +```java +public class StructMappingExample { + static class Struct1 { + int f1; + String f2; + + public Struct1(int f1, String f2) { + this.f1 = f1; + this.f2 = f2; + } + } + + static class Struct2 { + int f1; + String f2; + double f3; + } + + static ThreadSafeFury fury1 = Fury.builder() + .withCompatibleMode(CompatibleMode.COMPATIBLE).buildThreadSafeFury(); + static ThreadSafeFury fury2 = Fury.builder() + .withCompatibleMode(CompatibleMode.COMPATIBLE).buildThreadSafeFury(); + + static { + fury1.register(Struct1.class); + fury2.register(Struct2.class); + } + + public static void main(String[] args) { + Struct1 struct1 = new Struct1(10, "abc"); + Struct2 struct2 = (Struct2) fury2.deserialize(fury1.serialize(struct1)); + Assert.assertEquals(struct2.f1, struct1.f1); + Assert.assertEquals(struct2.f2, struct1.f2); + struct1 = (Struct1) fury1.deserialize(fury2.serialize(struct2)); + Assert.assertEquals(struct1.f1, struct2.f1); + Assert.assertEquals(struct1.f2, struct2.f2); + } +} +``` + ## Migration ### JDK migration diff --git a/java/fury-core/src/test/java/org/apache/fury/FuryTest.java b/java/fury-core/src/test/java/org/apache/fury/FuryTest.java index b5aaa0ffc7..30e845f117 100644 --- a/java/fury-core/src/test/java/org/apache/fury/FuryTest.java +++ b/java/fury-core/src/test/java/org/apache/fury/FuryTest.java @@ -55,6 +55,7 @@ import org.apache.fury.annotation.Expose; import org.apache.fury.annotation.Ignore; import org.apache.fury.builder.Generated; +import org.apache.fury.config.CompatibleMode; import org.apache.fury.config.FuryBuilder; import org.apache.fury.config.Language; import org.apache.fury.exception.FuryException; @@ -636,4 +637,37 @@ public void testResetBufferToSizeLimit() { fury.deserialize(serializedLarge); assertEquals(fury.getBuffer().size(), limitInBytes); } + + static class Struct1 { + int f1; + String f2; + + public Struct1(int f1, String f2) { + this.f1 = f1; + this.f2 = f2; + } + } + + static class Struct2 { + int f1; + String f2; + double f3; + 
} + + @Test + public void testStructMapping() { + ThreadSafeFury fury1 = + Fury.builder().withCompatibleMode(CompatibleMode.COMPATIBLE).buildThreadSafeFury(); + ThreadSafeFury fury2 = + Fury.builder().withCompatibleMode(CompatibleMode.COMPATIBLE).buildThreadSafeFury(); + fury1.register(Struct1.class); + fury2.register(Struct2.class); + Struct1 struct1 = new Struct1(10, "abc"); + Struct2 struct2 = (Struct2) fury2.deserialize(fury1.serialize(struct1)); + Assert.assertEquals(struct2.f1, struct1.f1); + Assert.assertEquals(struct2.f2, struct1.f2); + struct1 = (Struct1) fury1.deserialize(fury2.serialize(struct2)); + Assert.assertEquals(struct1.f1, struct2.f1); + Assert.assertEquals(struct1.f2, struct2.f2); + } } From 68ca4bfd6ce4f6454a9c7736fbe154891cbc44a5 Mon Sep 17 00:00:00 2001 From: Shawn Yang Date: Fri, 13 Dec 2024 02:21:53 +0800 Subject: [PATCH 04/13] fix(c++): fix bazel install (#1979) ## What does this PR do? fix bazel install ## Related issues Closes #1978 ## Does this PR introduce any user-facing change? - [ ] Does this PR introduce any public API change? - [ ] Does this PR introduce any binary protocol compatibility change? 
## Benchmark --- ci/run_ci.py | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/ci/run_ci.py b/ci/run_ci.py index 30141ab63d..cad0c73aeb 100644 --- a/ci/run_ci.py +++ b/ci/run_ci.py @@ -17,6 +17,7 @@ import argparse +import shutil import subprocess import platform import urllib.request as ulib @@ -37,6 +38,11 @@ ) +def _bazel(cmd: str): + bazel_cmd = "bazel" if _is_windows() else "~/bin/bazel" + return _exec_cmd(f"{bazel_cmd} {cmd}") + + def _exec_cmd(cmd: str): logging.info(f"running command: {cmd}") try: @@ -81,10 +87,8 @@ def _cd_project_subdir(subdir): def _run_cpp(): _install_cpp_deps() # run test - query_result = _exec_cmd("bazel query //...") - _exec_cmd( - "bazel test {}".format(query_result.replace("\n", " ").replace("\r", " ")) - ) + query_result = _bazel("query //...") + _bazel("test {}".format(query_result.replace("\n", " ").replace("\r", " "))) def _run_rust(): @@ -132,11 +136,14 @@ def _install_bazel(): bazel_path = os.path.join(os.getcwd(), local_name) _exec_cmd(f'setx path "%PATH%;{bazel_path}"') else: + if shutil.which("bazel"): + os.remove(shutil.which("bazel")) _exec_cmd(f"./{local_name} --user") + _update_shell_profile() os.remove(local_name) # bazel install status check - _exec_cmd("bazel --version") + _bazel("--version") # default is byte psutil = importlib.import_module("psutil") @@ -146,6 +153,21 @@ def _install_bazel(): file.write(f"\nbuild --jobs={limit_jobs}") +def _update_shell_profile(): + home = os.path.expanduser("~") + profiles = [".bashrc", ".bash_profile", ".zshrc"] + path_export = 'export PATH="$PATH:$HOME/bin" # Add Bazel to PATH\n' + for profile in profiles: + profile_path = os.path.join(home, profile) + if os.path.exists(profile_path): + with open(profile_path, "a") as f: + f.write(path_export) + logging.info(f"Updated {profile} to include Bazel PATH.") + break + else: + logging.info("No shell profile found. 
Please add Bazel to PATH manually.") + + def _parse_args(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter From 9be8c958c7db0f8f3db017cfa734ea60ff36c3a0 Mon Sep 17 00:00:00 2001 From: Amit Prasad <13373751+AmitPr@users.noreply.github.com> Date: Thu, 12 Dec 2024 20:51:49 -0600 Subject: [PATCH 05/13] fix(java): Fix flakiness in ExpressionVisitorTest#testTraverseExpression (#1968) ## What does this PR do? `ExpressionVisitorTest#testTraverseExpression` relies on ordered traversal. However, the traversal depends on the order of `getDeclaredFields()` being deterministic (see below). The Java specification explicitly marks `getDeclaredFields` as a function that can return field names in any order. In the wild, this is most likely to manifest on different architectures with different JVM versions. https://github.com/apache/fury/blob/54b62fb6ab5d7e557131efe07c7402c885f6e7c4/java/fury-core/src/main/java/org/apache/fury/reflect/ReflectionUtils.java#L368-L381 Using [NonDex](https://github.com/TestingResearchIllinois/NonDex), we can replicate the flaky behavior with the following command: ``` mvn edu.illinois:nondex-maven-plugin:2.1.7:nondex -pl fury-core/ -Dcheckstyle.skip -Dmaven.javadoc.skip -Dtest=org.apache.fury.codegen.ExpressionVisitorTest ``` The fix, in this case, is to simply use HashSet equality instead of an ordered List equality. The above NonDex command should succeed after applying this patch. I do, however, note that there are other flaky tests (found by running NonDex on the entire `fury-core` project) that fail due to reliance on e.g. `PriorityQueue#toArray`, which is also explicitly marked to return nondeterministically ordered arrays. ## Does this PR introduce any user-facing change? 
No --------- Co-authored-by: Shawn Yang --- .../org/apache/fury/codegen/ExpressionVisitorTest.java | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/java/fury-core/src/test/java/org/apache/fury/codegen/ExpressionVisitorTest.java b/java/fury-core/src/test/java/org/apache/fury/codegen/ExpressionVisitorTest.java index 5404aba3a8..3d6e9ef4f6 100644 --- a/java/fury-core/src/test/java/org/apache/fury/codegen/ExpressionVisitorTest.java +++ b/java/fury-core/src/test/java/org/apache/fury/codegen/ExpressionVisitorTest.java @@ -26,7 +26,9 @@ import java.lang.reflect.Method; import java.util.ArrayList; import java.util.Arrays; +import java.util.HashSet; import java.util.List; +import java.util.Set; import org.apache.fury.codegen.Expression.Literal; import org.apache.fury.reflect.ReflectionUtils; import org.apache.fury.reflect.TypeRef; @@ -63,7 +65,11 @@ public void testTraverseExpression() throws InvocationTargetException, IllegalAc assertEquals(serializedLambda.getCapturedArgCount(), 1); ExpressionVisitor.ExprHolder exprHolder = (ExpressionVisitor.ExprHolder) (serializedLambda.getCapturedArg(0)); - assertEquals( - expressions, Arrays.asList(forLoop, start, end, step, e1, ref, exprHolder.get("e2"))); + + // Traversal relies on getDeclaredFields(), nondeterministic order. + Set expressionsSet = new HashSet<>(expressions); + Set expressionsSet2 = + new HashSet<>(Arrays.asList(forLoop, e1, ref, exprHolder.get("e2"), end, start, step)); + assertEquals(expressionsSet, expressionsSet2); } } From 862667e7ce04414371cce0ec867a2931d1bcada4 Mon Sep 17 00:00:00 2001 From: Shawn Yang Date: Tue, 17 Dec 2024 01:30:09 +0800 Subject: [PATCH 06/13] chore(python): drop py3.7 support (#1981) ## What does this PR do? This pr drop py3.7 support ## Related issues Closes #1982 ## Does this PR introduce any user-facing change? - [ ] Does this PR introduce any public API change? - [ ] Does this PR introduce any binary protocol compatibility change? 
## Benchmark --- .github/workflows/ci.yml | 2 +- .github/workflows/release.yaml | 2 +- python/pyfury/_fury.py | 6 +----- python/pyfury/_serialization.pyx | 5 +---- python/setup.py | 2 -- 5 files changed, 4 insertions(+), 13 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1574138ff7..72201cff97 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -218,7 +218,7 @@ jobs: runs-on: ubuntu-20.04 strategy: matrix: - python-version: [3.7, 3.12] + python-version: [3.8, 3.12] steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index d5beee49c0..109204e976 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -32,7 +32,7 @@ jobs: url: https://pypi.org/project/pyfury strategy: matrix: - python-version: [3.7, 3.8, 3.9, 3.10.12, 3.11, 3.12] + python-version: [3.8, 3.9, 3.10.12, 3.11, 3.12] steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} diff --git a/python/pyfury/_fury.py b/python/pyfury/_fury.py index 1304486467..cea736389d 100644 --- a/python/pyfury/_fury.py +++ b/python/pyfury/_fury.py @@ -21,7 +21,6 @@ import enum import logging import os -import sys import warnings from dataclasses import dataclass from typing import Dict, Tuple, TypeVar, Union, Iterable @@ -80,10 +79,7 @@ from cloudpickle import Pickler -if sys.version_info[:2] < (3, 8): # pragma: no cover - from pickle5 import Unpickler -else: - from pickle import Unpickler +from pickle import Unpickler logger = logging.getLogger(__name__) diff --git a/python/pyfury/_serialization.pyx b/python/pyfury/_serialization.pyx index 74bd755b0a..8e7ad3c47b 100644 --- a/python/pyfury/_serialization.pyx +++ b/python/pyfury/_serialization.pyx @@ -59,10 +59,7 @@ try: except ImportError: np = None -if sys.version_info[:2] < (3, 8): # pragma: no cover - import pickle5 as pickle # nosec # pylint: 
disable=import_pickle -else: - import pickle # nosec # pylint: disable=import_pickle +import pickle # nosec # pylint: disable=import_pickle cimport cython diff --git a/python/setup.py b/python/setup.py index 699ca89381..01be89bccf 100644 --- a/python/setup.py +++ b/python/setup.py @@ -159,8 +159,6 @@ def parse_version(): ], zip_safe=False, install_requires=[ - 'dataclasses; python_version<"3.7"', - 'pickle5; python_version<"3.8"', "cloudpickle", ], extras_require={ From 1515f94c0013276f6cbb2f6b704b525fca036c2d Mon Sep 17 00:00:00 2001 From: Zheng Feng Date: Fri, 20 Dec 2024 18:47:10 +0800 Subject: [PATCH 07/13] =?UTF-8?q?fix(java):=20only=20print=20warn=20messag?= =?UTF-8?q?e=20if=20scopedMetaShareEnabled=20is=20true=20=E2=80=A6=20(#198?= =?UTF-8?q?5)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …and not in CompatibleMode ## What does this PR do? ## Related issues - https://github.com/quarkiverse/quarkus-fury/issues/51 ## Does this PR introduce any user-facing change? - [ ] Does this PR introduce any public API change? - [ ] Does this PR introduce any binary protocol compatibility change? 
## Benchmark --- .../src/main/java/org/apache/fury/config/FuryBuilder.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/fury-core/src/main/java/org/apache/fury/config/FuryBuilder.java b/java/fury-core/src/main/java/org/apache/fury/config/FuryBuilder.java index 06e954c5c8..e139e09fa9 100644 --- a/java/fury-core/src/main/java/org/apache/fury/config/FuryBuilder.java +++ b/java/fury-core/src/main/java/org/apache/fury/config/FuryBuilder.java @@ -400,7 +400,7 @@ private void finish() { if (deserializeNonexistentClass == null) { deserializeNonexistentClass = false; } - if (scopedMetaShareEnabled != null) { + if (scopedMetaShareEnabled != null && scopedMetaShareEnabled) { LOG.warn("Scoped meta share is for CompatibleMode only, disable it for {}", compatibleMode); } scopedMetaShareEnabled = false; From 8d2d1240825cbfa32548ecb8afc978cea533ec23 Mon Sep 17 00:00:00 2001 From: PAN <46820719+pandalee99@users.noreply.github.com> Date: Mon, 23 Dec 2024 10:29:46 +0800 Subject: [PATCH 08/13] feat(python): Hardcoding metastring into passable parameters (#1987) ## What does this PR do? In the original MetaString, MetaStringEncoder used hard coding directly to solve the special char1/2 situation, but this was not the best choice. So it's passable, allowing MetaString to select the special char it passes. ## Related issues Close #1983 ## Does this PR introduce any user-facing change? - [ ] Does this PR introduce any public API change? - [ ] Does this PR introduce any binary protocol compatibility change? 
## Benchmark --- python/pyfury/meta/metastring.py | 108 +++++++++++++++++++++---- python/pyfury/tests/test_metastring.py | 51 +++++++----- 2 files changed, 122 insertions(+), 37 deletions(-) diff --git a/python/pyfury/meta/metastring.py b/python/pyfury/meta/metastring.py index 63232b5611..4ff065105b 100644 --- a/python/pyfury/meta/metastring.py +++ b/python/pyfury/meta/metastring.py @@ -48,12 +48,20 @@ class Encoding(Enum): class MetaString: def __init__( - self, original: str, encoding: Encoding, encoded_data: bytes, length: int + self, + original: str, + encoding: Encoding, + encoded_data: bytes, + length: int, + special_char1: str = ".", + special_char2: str = "|", ): self.original = original self.encoding = encoding self.encoded_data = encoded_data self.length = length + self.special_char1 = special_char1 + self.special_char2 = special_char2 if self.encoding != Encoding.UTF_8: self.strip_last_char = (encoded_data[0] & 0x80) != 0 else: @@ -65,6 +73,17 @@ class MetaStringDecoder: Decodes MetaString objects back into their original plain text form. """ + def __init__(self, special_char1: str, special_char2: str): + """ + Creates a MetaStringDecoder with specified special characters used for decoding. + + Args: + special_char1 (str): The first special character used for encoding. + special_char2 (str): The second special character used for encoding. + """ + self.special_char1 = special_char1 + self.special_char2 = special_char2 + def decode(self, encoded_data: bytes, encoding: Encoding) -> str: """ Decodes the encoded data using the specified encoding. @@ -203,9 +222,9 @@ def _decode_lower_upper_digit_special_char(self, char_value: int) -> str: elif 52 <= char_value <= 61: return chr(ord("0") + (char_value - 52)) elif char_value == 62: - return "." 
+ return self.special_char1 # Use special_char1 for the encoding elif char_value == 63: - return "_" + return self.special_char2 # Use special_char2 for the encoding else: raise ValueError( f"Invalid character value for LOWER_UPPER_DIGIT_SPECIAL: {char_value}" @@ -250,9 +269,16 @@ def _decode_rep_all_to_lower_special(self, data: bytes) -> str: class MetaStringEncoder: - """ - Encodes plain text strings into MetaString objects with specified encoding mechanisms. - """ + def __init__(self, special_char1: str, special_char2: str): + """ + Creates a MetaStringEncoder with specified special characters used for encoding. + + Args: + special_char1 (str): The first special character used in custom encoding. + special_char2 (str): The second special character used in custom encoding. + """ + self.special_char1 = special_char1 + self.special_char2 = special_char2 def encode(self, input_string: str) -> MetaString: """ @@ -270,7 +296,14 @@ def encode(self, input_string: str) -> MetaString: ), "Long meta string than _METASTRING_NUM_CHARS_LIMIT is not allowed." if not input_string: - return MetaString(input_string, Encoding.UTF_8, bytes(), 0) + return MetaString( + input_string, + Encoding.UTF_8, + bytes(), + 0, + self.special_char1, + self.special_char2, + ) encoding = self.compute_encoding(input_string) return self.encode_with_encoding(input_string, encoding) @@ -292,29 +325,67 @@ def encode_with_encoding(self, input_string: str, encoding: Encoding) -> MetaStr ), "Long meta string than _METASTRING_NUM_CHARS_LIMIT is not allowed." 
if not input_string: - return MetaString(input_string, Encoding.UTF_8, bytes(), 0) + return MetaString( + input_string, + Encoding.UTF_8, + bytes(), + 0, + self.special_char1, + self.special_char2, + ) length = len(input_string) if encoding == Encoding.LOWER_SPECIAL: encoded_data = self._encode_lower_special(input_string) - return MetaString(input_string, encoding, encoded_data, length * 5) + return MetaString( + input_string, + encoding, + encoded_data, + length * 5, + self.special_char1, + self.special_char2, + ) elif encoding == Encoding.LOWER_UPPER_DIGIT_SPECIAL: encoded_data = self._encode_lower_upper_digit_special(input_string) - return MetaString(input_string, encoding, encoded_data, length * 6) + return MetaString( + input_string, + encoding, + encoded_data, + length * 6, + self.special_char1, + self.special_char2, + ) elif encoding == Encoding.FIRST_TO_LOWER_SPECIAL: encoded_data = self._encode_first_to_lower_special(input_string) - return MetaString(input_string, encoding, encoded_data, length * 5) + return MetaString( + input_string, + encoding, + encoded_data, + length * 5, + self.special_char1, + self.special_char2, + ) elif encoding == Encoding.ALL_TO_LOWER_SPECIAL: chars = list(input_string) upper_count = sum(1 for c in chars if c.isupper()) encoded_data = self._encode_all_to_lower_special(chars) return MetaString( - input_string, encoding, encoded_data, (upper_count + length) * 5 + input_string, + encoding, + encoded_data, + (upper_count + length) * 5, + self.special_char1, + self.special_char2, ) else: encoded_data = bytes(input_string, "utf-8") return MetaString( - input_string, Encoding.UTF_8, encoded_data, len(encoded_data) * 8 + input_string, + Encoding.UTF_8, + encoded_data, + len(encoded_data) * 8, + self.special_char1, + self.special_char2, ) def compute_encoding(self, input_string: str) -> Encoding: @@ -363,7 +434,12 @@ def _compute_statistics(self, chars: List[str]) -> Statistics: upper_count = 0 for c in chars: if 
can_lower_upper_digit_special_encoded: - if not (c.islower() or c.isupper() or c.isdigit() or c in {".", "_"}): + if not ( + c.islower() + or c.isupper() + or c.isdigit() + or c in {self.special_char1, self.special_char2} + ): can_lower_upper_digit_special_encoded = False if can_lower_special_encoded: if not (c.islower() or c in {".", "_", "$", "|"}): @@ -500,9 +576,9 @@ def _char_to_value(self, c: str, bits_per_char: int) -> int: return 26 + (ord(c) - ord("A")) elif "0" <= c <= "9": return 52 + (ord(c) - ord("0")) - elif c == ".": + elif c == self.special_char1: return 62 - elif c == "_": + elif c == self.special_char2: return 63 else: raise ValueError( diff --git a/python/pyfury/tests/test_metastring.py b/python/pyfury/tests/test_metastring.py index 7dd98ff73e..95596edf21 100644 --- a/python/pyfury/tests/test_metastring.py +++ b/python/pyfury/tests/test_metastring.py @@ -24,8 +24,10 @@ def test_encode_metastring_lower_special(): - encoder = MetaStringEncoder() - decoder = MetaStringDecoder() + encoder = MetaStringEncoder(special_char1=".", special_char2="_") + decoder = MetaStringDecoder(special_char1=".", special_char2="_") + + # Test for encoding and decoding encoded = encoder._encode_lower_special("abc_def") assert len(encoded) == 5 assert len(encoder.encode("org.apache.fury.benchmark.data").encoded_data) == 19 @@ -41,10 +43,12 @@ def test_encode_metastring_lower_special(): def test_encode_metastring_lower_upper_digit_special(): - encoder = MetaStringEncoder() + encoder = MetaStringEncoder(special_char1=".", special_char2="_") + decoder = MetaStringDecoder(special_char1=".", special_char2="_") + + # Test for encoding and decoding encoded = encoder._encode_lower_upper_digit_special("ExampleInput123") assert len(encoded) == 12 - decoder = MetaStringDecoder() decoded = decoder.decode(encoded, Encoding.LOWER_UPPER_DIGIT_SPECIAL) assert decoded == "ExampleInput123" @@ -73,8 +77,9 @@ def create_string(length): def test_metastring(): + encoder = 
MetaStringEncoder(special_char1=".", special_char2="_") + decoder = MetaStringDecoder(special_char1=".", special_char2="_") - encoder = MetaStringEncoder() for i in range(1, 128): try: string = create_string(i) @@ -82,7 +87,6 @@ def test_metastring(): assert metastring.encoding != Encoding.UTF_8 assert metastring.original == string - decoder = MetaStringDecoder() new_string = decoder.decode(metastring.encoded_data, metastring.encoding) assert new_string == string except Exception as e: @@ -90,8 +94,9 @@ def test_metastring(): def test_encode_empty_string(): - encoder = MetaStringEncoder() - decoder = MetaStringDecoder() + encoder = MetaStringEncoder(special_char1=".", special_char2="_") + decoder = MetaStringDecoder(special_char1=".", special_char2="_") + for encoding in [ Encoding.LOWER_SPECIAL, Encoding.LOWER_UPPER_DIGIT_SPECIAL, @@ -106,7 +111,7 @@ def test_encode_empty_string(): def test_encode_characters_outside_of_lower_special(): - encoder = MetaStringEncoder() + encoder = MetaStringEncoder(special_char1=".", special_char2="_") test_string = "abcdefABCDEF1234!@#" metastring = encoder.encode(test_string) @@ -114,8 +119,9 @@ def test_encode_characters_outside_of_lower_special(): def test_all_to_upper_special_encoding(): - encoder = MetaStringEncoder() - decoder = MetaStringDecoder() + encoder = MetaStringEncoder(special_char1=".", special_char2="_") + decoder = MetaStringDecoder(special_char1=".", special_char2="_") + test_string = "ABC_DEF" metastring = encoder.encode(test_string) assert metastring.encoding == Encoding.LOWER_UPPER_DIGIT_SPECIAL @@ -124,8 +130,9 @@ def test_all_to_upper_special_encoding(): def test_first_to_lower_special_encoding(): - encoder = MetaStringEncoder() - decoder = MetaStringDecoder() + encoder = MetaStringEncoder(special_char1=".", special_char2="_") + decoder = MetaStringDecoder(special_char1=".", special_char2="_") + test_string = "Aabcdef" metastring = encoder.encode(test_string) assert metastring.encoding == 
Encoding.FIRST_TO_LOWER_SPECIAL @@ -134,8 +141,9 @@ def test_first_to_lower_special_encoding(): def test_utf8_encoding(): - encoder = MetaStringEncoder() - decoder = MetaStringDecoder() + encoder = MetaStringEncoder(special_char1=".", special_char2="_") + decoder = MetaStringDecoder(special_char1=".", special_char2="_") + test_string = "你好,世界" # Non-Latin characters metastring = encoder.encode(test_string) assert metastring.encoding == Encoding.UTF_8 @@ -144,7 +152,7 @@ def test_utf8_encoding(): def test_strip_last_char(): - encoder = MetaStringEncoder() + encoder = MetaStringEncoder(special_char1=".", special_char2="_") test_string = "abc" # encoded as 1|00000|00, 001|00010, exactly two bytes encoded_metastring = encoder.encode(test_string) @@ -156,8 +164,9 @@ def test_strip_last_char(): def test_empty_string(): - encoder = MetaStringEncoder() - decoder = MetaStringDecoder() + encoder = MetaStringEncoder(special_char1=".", special_char2="_") + decoder = MetaStringDecoder(special_char1=".", special_char2="_") + metastring = encoder.encode("") assert metastring.encoded_data == bytes() @@ -166,7 +175,7 @@ def test_empty_string(): def test_ascii_encoding(): - encoder = MetaStringEncoder() + encoder = MetaStringEncoder(special_char1=".", special_char2="_") test_string = "asciiOnly" encoded_metastring = encoder.encode(test_string) @@ -175,7 +184,7 @@ def test_ascii_encoding(): def test_non_ascii_encoding(): - encoder = MetaStringEncoder() + encoder = MetaStringEncoder(special_char1=".", special_char2="_") test_string = "こんにちは" # Non-ASCII string encoded_metastring = encoder.encode(test_string) @@ -183,7 +192,7 @@ def test_non_ascii_encoding(): def test_non_ascii_encoding_and_non_utf8(): - encoder = MetaStringEncoder() + encoder = MetaStringEncoder(special_char1=".", special_char2="_") non_ascii_string = "こんにちは" # Non-ASCII string From 98efd72c2c740167edafd23d9d9c8c157095b780 Mon Sep 17 00:00:00 2001 From: Shawn Yang Date: Mon, 30 Dec 2024 21:52:32 +0800 Subject: [PATCH 
09/13] feat(java/python): new xlang type system spec implementation (#1690) ## What does this PR do? This PR implements a new [type system](https://fury.apache.org/docs/specification/fury_xlang_serialization_spec/#type-systems) for xlang serialization between java and python. The changes includes: - Refine type system spec: added new types: - named_enum: an enum whose value will be serialized as the registered name. - struct: a morphic(final) type serialized by Fury Struct serializer. - polymorphic_struct: a type which is not morphic(not final). i.e. it don't have subclasses. Suppose we're deserializing `List`, we can save dynamic serializer dispatch if `SomeClass` is morphic(final). - compatible_struct: a morphic(final) type serialized by Fury compatible Struct serializer. - polymorphic_compatible_struct: a non-morphic(non-final) type serialized by Fury compatible Struct serializer. - named_struct: a `struct` whose type mapping will be encoded as a name. - named_polymorphic_struct: a `polymorphic_struct` whose type mapping will be encoded as a name. - named_compatible_struct: a `compatible_struct` whose type mapping will be encoded as a name. - named_polymorphic_compatible_struct: a `polymorphic_compatible_struct` whose type mapping will be encoded as a name. - ext: a type which will be serialized by a customized serializer. - polymorphic_ext: an `ext` type which is not morphic(not final). - named_ext: an `ext` type whose type mapping will be encoded as a name. - named_polymorphic_ext: an `polymorphic_ext` type whose type mapping will be encoded as a name. - Added a new XtypeResolver in java to resolve xlang types - Support register class mapping by id. Before this PR, we only support register class by name, which is more expensive at space/performance cost. - Support pass type into to resolve type ambiguation such as `ArrayList/Object[]` in java. 
Users can `serialize(List.of(1, 2, ,3))` and deserialize it into array by `deserialize(bytes, Integer[].class)` - Refactor pyfury serialization by moving type resolver into python code from cython, this will make debug more easy and reduce code duplciation, it also speed serialization performance. - golang xtype serialization test are disabled, it will be reenabled after new type system is implemented in golang ## Related issues ## Does this PR introduce any user-facing change? - [ ] Does this PR introduce any public API change? - [ ] Does this PR introduce any binary protocol compatibility change? ## Benchmark --- BUILD | 1 + README.md | 2 +- cpp/fury/type/BUILD | 15 + cpp/fury/type/type.h | 153 ++ cpp/fury/util/buffer.h | 24 + cpp/fury/util/buffer_test.cc | 10 + cpp/fury/util/logging.cc | 2 +- cpp/fury/util/status.h | 15 +- docs/guide/xlang_serialization_guide.md | 6 +- docs/guide/xlang_type_mapping.md | 82 +- .../specification/xlang_serialization_spec.md | 66 +- go/fury/fury_xlang_test.go | 3 + .../apache/fury/AbstractThreadSafeFury.java | 14 +- .../main/java/org/apache/fury/BaseFury.java | 41 +- .../src/main/java/org/apache/fury/Fury.java | 326 ++-- .../java/org/apache/fury/ThreadLocalFury.java | 5 + .../exception/ClassUnregisteredException.java | 31 + .../SerializerUnregisteredException.java} | 35 +- .../org/apache/fury/pool/ThreadPoolFury.java | 5 + .../org/apache/fury/resolver/ClassInfo.java | 39 +- .../apache/fury/resolver/ClassNameBytes.java | 44 + .../apache/fury/resolver/ClassResolver.java | 162 +- .../apache/fury/resolver/MetaStringBytes.java | 1 + .../fury/resolver/MetaStringResolver.java | 5 +- .../apache/fury/resolver/XtypeResolver.java | 494 ++++++ .../fury/serializer/ArraySerializers.java | 97 +- .../fury/serializer/BufferSerializers.java | 3 +- .../fury/serializer/EnumSerializer.java | 14 + .../serializer/FuryCopyableSerializer.java | 19 - .../fury/serializer/OptionalSerializers.java | 10 +- .../fury/serializer/PrimitiveSerializers.java | 104 +- 
.../apache/fury/serializer/Serializer.java | 20 +- .../apache/fury/serializer/Serializers.java | 53 +- .../fury/serializer/StringSerializer.java | 6 - .../fury/serializer/StructSerializer.java | 100 +- .../fury/serializer/TimeSerializers.java | 65 +- .../AbstractCollectionSerializer.java | 7 +- .../collection/AbstractMapSerializer.java | 50 +- .../collection/CollectionSerializers.java | 81 +- .../collection/FuryArrayAsListSerializer.java | 6 - .../GuavaCollectionSerializers.java | 57 +- .../ImmutableCollectionSerializers.java | 16 +- .../serializer/collection/MapSerializers.java | 55 +- .../collection/SubListSerializers.java | 6 +- .../collection/SynchronizedSerializers.java | 4 +- .../collection/UnmodifiableSerializers.java | 4 +- .../org/apache/fury/type/GenericType.java | 16 +- .../java/org/apache/fury/type/ScalaTypes.java | 8 + .../java/org/apache/fury/type/TypeUtils.java | 9 + .../main/java/org/apache/fury/type/Types.java | 198 +++ .../org/apache/fury/CrossLanguageTest.java | 209 ++- .../test/java/org/apache/fury/FuryTest.java | 15 +- .../java/org/apache/fury/FuryTestBase.java | 23 + .../fury/serializer/ArraySerializersTest.java | 48 +- .../fury/serializer/EnumSerializerTest.java | 6 + .../fury/serializer/SerializersTest.java | 13 +- .../apache/fury/format/type/ArrowType.java} | 8 +- .../apache/fury/format/type/DataTypes.java | 394 ++--- .../format/vectorized/ArrowSerializers.java | 9 +- .../vectorized/ArrowTableSerializer.java | 3 +- python/README.md | 10 + python/pyfury/__init__.py | 27 +- python/pyfury/_fury.py | 757 ++------- python/pyfury/_registry.py | 627 ++++++++ python/pyfury/_serialization.pyx | 1409 +++++------------ python/pyfury/_serializer.py | 393 +---- python/pyfury/_struct.py | 50 +- python/pyfury/_util.pxd | 12 +- python/pyfury/_util.pyx | 129 +- python/pyfury/error.py | 4 + python/pyfury/format/serializer.py | 7 - python/pyfury/includes/libserialization.pxd | 71 + python/pyfury/includes/libutil.pxd | 21 + python/pyfury/meta/metastring.py | 2 
+- python/pyfury/resolver.py | 2 +- python/pyfury/serializer.py | 394 +++-- python/pyfury/tests/test_buffer.py | 149 +- python/pyfury/tests/test_cross_language.py | 62 +- .../pyfury/tests/test_metastring_resolver.py | 36 + python/pyfury/tests/test_serializer.py | 69 +- python/pyfury/tests/test_struct.py | 18 +- python/pyfury/type.py | 228 +-- 82 files changed, 4105 insertions(+), 3689 deletions(-) create mode 100644 cpp/fury/type/BUILD create mode 100644 cpp/fury/type/type.h create mode 100644 java/fury-core/src/main/java/org/apache/fury/exception/ClassUnregisteredException.java rename java/fury-core/src/main/java/org/apache/fury/{serializer/OpaqueObjects.java => exception/SerializerUnregisteredException.java} (50%) create mode 100644 java/fury-core/src/main/java/org/apache/fury/resolver/ClassNameBytes.java create mode 100644 java/fury-core/src/main/java/org/apache/fury/resolver/XtypeResolver.java create mode 100644 java/fury-core/src/main/java/org/apache/fury/type/Types.java rename java/{fury-core/src/main/java/org/apache/fury/type/Type.java => fury-format/src/main/java/org/apache/fury/format/type/ArrowType.java} (97%) create mode 100644 python/pyfury/_registry.py create mode 100644 python/pyfury/includes/libserialization.pxd create mode 100644 python/pyfury/tests/test_metastring_resolver.py diff --git a/BUILD b/BUILD index 5fa5b66f0e..d5f1063377 100644 --- a/BUILD +++ b/BUILD @@ -62,6 +62,7 @@ pyx_library( ), deps = [ "//cpp/fury/util:fury_util", + "//cpp/fury/type:fury_type", "@com_google_absl//absl/container:flat_hash_map", ], ) diff --git a/README.md b/README.md index 29b8e11b9f..89bf7f0496 100644 --- a/README.md +++ b/README.md @@ -278,7 +278,7 @@ class SomeClass: f3: Dict[str, str] fury = pyfury.Fury(ref_tracking=True) -fury.register_class(SomeClass, type_tag="example.SomeClass") +fury.register_type(SomeClass, typename="example.SomeClass") obj = SomeClass() obj.f2 = {"k1": "v1", "k2": "v2"} obj.f1, obj.f3 = obj, obj.f2 diff --git a/cpp/fury/type/BUILD 
b/cpp/fury/type/BUILD new file mode 100644 index 0000000000..391e882d83 --- /dev/null +++ b/cpp/fury/type/BUILD @@ -0,0 +1,15 @@ +load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test") + +cc_library( + name = "fury_type", + srcs = glob(["*.cc"], exclude=["*test.cc"]), + hdrs = glob(["*.h"]), + copts = ["-mavx2"], # Enable AVX2 support + linkopts = ["-mavx2"], # Ensure linker also knows about AVX2 + strip_include_prefix = "/cpp", + alwayslink=True, + linkstatic=True, + deps = [ + ], + visibility = ["//visibility:public"], +) diff --git a/cpp/fury/type/type.h b/cpp/fury/type/type.h new file mode 100644 index 0000000000..d98c0d3f44 --- /dev/null +++ b/cpp/fury/type/type.h @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include // For fixed-width integer types + +namespace fury { +enum class TypeId : int32_t { + // a boolean value (true or false). + BOOL = 1, + // a 8-bit signed integer. + INT8 = 2, + // a 16-bit signed integer. + INT16 = 3, + // a 32-bit signed integer. + INT32 = 4, + // a 32-bit signed integer which use fury var_int32 encoding. + VAR_INT32 = 5, + // a 64-bit signed integer. + INT64 = 6, + // a 64-bit signed integer which use fury PVL encoding. 
+ VAR_INT64 = 7, + // a 64-bit signed integer which use fury SLI encoding. + SLI_INT64 = 8, + // a 16-bit floating point number. + FLOAT16 = 9, + // a 32-bit floating point number. + FLOAT32 = 10, + // a 64-bit floating point number including NaN and Infinity. + FLOAT64 = 11, + // a text string encoded using Latin1/UTF16/UTF-8 encoding. + STRING = 12, + // a data type consisting of a set of named values. Rust enum with + // non-predefined field values are not supported as an enum + ENUM = 13, + // an enum whose value will be serialized as the registered name. + NAMED_ENUM = 14, + // a morphic(final) type serialized by Fury Struct serializer. i.e. it doesn't + // have subclasses. Suppose we're + // deserializing `List`, we can save dynamic serializer dispatch + // since `SomeClass` is morphic(final). + STRUCT = 15, + // a type which is not morphic(not final). i.e. it has subclasses. Suppose + // we're deserializing + // `List`, we must dispatch serializer dynamically since + // `SomeClass` is polymorphic(non-final). + POLYMORPHIC_STRUCT = 16, + // a morphic(final) type serialized by Fury compatible Struct serializer. + COMPATIBLE_STRUCT = 17, + // a non-morphic(non-final) type serialized by Fury compatible Struct + // serializer. + POLYMORPHIC_COMPATIBLE_STRUCT = 18, + // a `struct` whose type mapping will be encoded as a name. + NAMED_STRUCT = 19, + // a `polymorphic_struct` whose type mapping will be encoded as a name. + NAMED_POLYMORPHIC_STRUCT = 20, + // a `compatible_struct` whose type mapping will be encoded as a name. + NAMED_COMPATIBLE_STRUCT = 21, + // a `polymorphic_compatible_struct` whose type mapping will be encoded as a + // name. + NAMED_POLYMORPHIC_COMPATIBLE_STRUCT = 22, + // a type which will be serialized by a customized serializer. + EXT = 23, + // an `ext` type which is not morphic(not final). + POLYMORPHIC_EXT = 24, + // an `ext` type whose type mapping will be encoded as a name. 
+ NAMED_EXT = 25, + // a `polymorphic_ext` type whose type mapping will be encoded as a name. + NAMED_POLYMORPHIC_EXT = 26, + // a sequence of objects. + LIST = 27, + // an unordered set of unique elements. + SET = 28, + // a map of key-value pairs. Mutable types such as + // `list/map/set/array/tensor/arrow` are not allowed as key of map. + MAP = 29, + // an absolute length of time, independent of any calendar/timezone, as a + // count of nanoseconds. + DURATION = 30, + // a point in time, independent of any calendar/timezone, as a count of + // nanoseconds. The count is relative + // to an epoch at UTC midnight on January 1, 1970. + TIMESTAMP = 31, + // a naive date without timezone. The count is days relative to an epoch at + // UTC midnight on Jan 1, 1970. + LOCAL_DATE = 32, + // exact decimal value represented as an integer value in two's complement. + DECIMAL = 33, + // a variable-length array of bytes. + BINARY = 34, + // a multidimensional array which every sub-array can have different sizes but + // all have same type. + // only allow numeric components. Other arrays will be taken as List. The + // implementation should support the + // interoperability between array and list. + ARRAY = 35, + // one dimensional bool array. + BOOL_ARRAY = 36, + // one dimensional int8 array. + INT8_ARRAY = 37, + // one dimensional int16 array. + INT16_ARRAY = 38, + // one dimensional int32 array. + INT32_ARRAY = 39, + // one dimensional int64 array. + INT64_ARRAY = 40, + // one dimensional half_float_16 array. + FLOAT16_ARRAY = 41, + // one dimensional float32 array. + FLOAT32_ARRAY = 42, + // one dimensional float64 array. + FLOAT64_ARRAY = 43, + // an arrow [record + // batch](https://arrow.apache.org/docs/cpp/tables.html#record-batches) + // object. + ARROW_RECORD_BATCH = 44, + // an arrow [table](https://arrow.apache.org/docs/cpp/tables.html#tables) + // object.
+ ARROW_TABLE = 45, + BOUND = 64 +}; + +inline bool IsNamespacedType(int32_t type_id) { + switch (static_cast(type_id)) { + case TypeId::NAMED_ENUM: + case TypeId::NAMED_STRUCT: + case TypeId::NAMED_POLYMORPHIC_STRUCT: + case TypeId::NAMED_COMPATIBLE_STRUCT: + case TypeId::NAMED_POLYMORPHIC_COMPATIBLE_STRUCT: + case TypeId::NAMED_EXT: + case TypeId::NAMED_POLYMORPHIC_EXT: + return true; + default: + return false; + } +} + +} // namespace fury diff --git a/cpp/fury/util/buffer.h b/cpp/fury/util/buffer.h index b522ba490d..74a1b5a07d 100644 --- a/cpp/fury/util/buffer.h +++ b/cpp/fury/util/buffer.h @@ -27,6 +27,7 @@ #include "fury/util/bit_util.h" #include "fury/util/logging.h" +#include "fury/util/status.h" namespace fury { @@ -133,6 +134,29 @@ class Buffer { inline double GetDouble(uint32_t offset) { return Get(offset); } + inline Status GetBytesAsInt64(uint32_t offset, uint32_t length, + int64_t *target) { + if (length == 0) { + *target = 0; + return Status::OK(); + } + if (size_ - (offset + 8) > 0) { + uint64_t mask = 0xffffffffffffffff; + uint64_t x = (mask >> (8 - length) * 8); + *target = GetInt64(offset) & x; + } else { + if (size_ - (offset + length) < 0) { + return Status::OutOfBound("buffer out of bound"); + } + int64_t result = 0; + for (size_t i = 0; i < length; i++) { + result = result | ((int64_t)(data_[offset + i])) << (i * 8); + } + *target = result; + } + return Status::OK(); + } + inline uint32_t PutVarUint32(uint32_t offset, int32_t value) { if (value >> 7 == 0) { data_[offset] = (int8_t)value; diff --git a/cpp/fury/util/buffer_test.cc b/cpp/fury/util/buffer_test.cc index 5799889f54..7d7bcec4c6 100644 --- a/cpp/fury/util/buffer_test.cc +++ b/cpp/fury/util/buffer_test.cc @@ -65,6 +65,16 @@ TEST(Buffer, TestVarUint) { } } +TEST(Buffer, TestGetBytesAsInt64) { + std::shared_ptr buffer; + AllocateBuffer(64, &buffer); + buffer->UnsafePut(0, 100); + int64_t result = -1; + EXPECT_TRUE(buffer->GetBytesAsInt64(0, 0, &result).ok()); + EXPECT_EQ(result, 0); + 
EXPECT_TRUE(buffer->GetBytesAsInt64(0, 1, &result).ok()); + EXPECT_EQ(result, 100); +} } // namespace fury int main(int argc, char **argv) { diff --git a/cpp/fury/util/logging.cc b/cpp/fury/util/logging.cc index 8c2a09c3bf..c52d5feb76 100644 --- a/cpp/fury/util/logging.cc +++ b/cpp/fury/util/logging.cc @@ -111,4 +111,4 @@ bool FuryLog::IsLevelEnabled(FuryLogLevel log_level) { return log_level >= fury_severity_threshold; } -} // namespace fury \ No newline at end of file +} // namespace fury diff --git a/cpp/fury/util/status.h b/cpp/fury/util/status.h index cee1f90049..6b3e768bc7 100644 --- a/cpp/fury/util/status.h +++ b/cpp/fury/util/status.h @@ -83,11 +83,12 @@ namespace fury { enum class StatusCode : char { OK = 0, OutOfMemory = 1, - KeyError = 2, - TypeError = 3, - Invalid = 4, - IOError = 5, - UnknownError = 6, + OutOfBound = 2, + KeyError = 3, + TypeError = 4, + Invalid = 5, + IOError = 6, + UnknownError = 7, }; class Status { @@ -123,6 +124,10 @@ class Status { return Status(StatusCode::OutOfMemory, msg); } + static Status OutOfBound(const std::string &msg) { + return Status(StatusCode::OutOfMemory, msg); + } + static Status KeyError(const std::string &msg) { return Status(StatusCode::KeyError, msg); } diff --git a/docs/guide/xlang_serialization_guide.md b/docs/guide/xlang_serialization_guide.md index 59d5f3f997..60fb72906f 100644 --- a/docs/guide/xlang_serialization_guide.md +++ b/docs/guide/xlang_serialization_guide.md @@ -225,8 +225,8 @@ class SomeClass2: if __name__ == "__main__": f = pyfury.Fury() - f.register_class(SomeClass1, type_tag="example.SomeClass1") - f.register_class(SomeClass2, type_tag="example.SomeClass2") + f.register_type(SomeClass1, typename="example.SomeClass1") + f.register_type(SomeClass2, typename="example.SomeClass2") obj1 = SomeClass1(f1=True, f2={-1: 2}) obj = SomeClass2( f1=obj1, @@ -444,7 +444,7 @@ class SomeClass: f3: Dict[str, str] fury = pyfury.Fury(ref_tracking=True) -fury.register_class(SomeClass, 
type_tag="example.SomeClass") +fury.register_type(SomeClass, typename="example.SomeClass") obj = SomeClass() obj.f2 = {"k1": "v1", "k2": "v2"} obj.f1, obj.f3 = obj, obj.f2 diff --git a/docs/guide/xlang_type_mapping.md b/docs/guide/xlang_type_mapping.md index 43c7e3c961..13a6d05c0c 100644 --- a/docs/guide/xlang_type_mapping.md +++ b/docs/guide/xlang_type_mapping.md @@ -12,41 +12,53 @@ Note: ## Type Mapping -| Fury Type | Fury Type ID | Java | Python | Javascript | C++ | Golang | Rust | -|--------------------|--------------|-----------------|----------------------|-----------------|--------------------------------|------------------|------------------| -| bool | 1 | bool/Boolean | bool | Boolean | bool | bool | bool | -| int8 | 2 | byte/Byte | int/pyfury.Int8 | Type.int8() | int8_t | int8 | i8 | -| int16 | 3 | short/Short | int/pyfury.Int16 | Type.int16() | int16_t | int16 | i6 | -| int32 | 4 | int/Integer | int/pyfury.Int32 | Type.int32() | int32_t | int32 | i32 | -| var_int32 | 5 | int/Integer | int/pyfury.VarInt32 | Type.varint32() | fury::varint32_t | fury.varint32 | fury::varint32 | -| int64 | 6 | long/Long | int/pyfury.Int64 | Type.int64() | int64_t | int64 | i64 | -| var_int64 | 7 | long/Long | int/pyfury.VarInt64 | Type.varint64() | fury::varint64_t | fury.varint64 | fury::varint64 | -| sli_int64 | 8 | long/Long | int/pyfury.SliInt64 | Type.sliint64() | fury::sliint64_t | fury.sliint64 | fury::sliint64 | -| float16 | 9 | float/Float | float/pyfury.Float16 | Type.float16() | fury::float16_t | fury.float16 | fury::f16 | -| float32 | 10 | float/Float | float/pyfury.Float32 | Type.float32() | float | float32 | f32 | -| float64 | 11 | double/Double | float/pyfury.Float64 | Type.float64() | double | float64 | f64 | -| string | 12 | String | str | String | string | string | String/str | -| enum | 13 | Enum subclasses | enum subclasses | / | enum | / | enum | -| list | 14 | List/Collection | list/tuple | array | vector | slice | Vec | -| set | 15 | Set | set | / | 
set | fury.Set | Set | -| map | 16 | Map | dict | Map | unordered_map | map | HashMap | -| duration | 17 | Duration | timedelta | Number | duration | Duration | Duration | -| timestamp | 18 | Instant | datetime | Number | std::chrono::nanoseconds | Time | DateTime | -| decimal | 19 | BigDecimal | Decimal | bigint | / | / | / | -| binary | 20 | byte[] | bytes | / | `uint8_t[n]/vector` | `[n]uint8/[]T` | `Vec` | -| array | 21 | array | np.ndarray | / | / | array/slice | Vec | -| bool_array | 22 | bool[] | ndarray(np.bool_) | / | `bool[n]` | `[n]bool/[]T` | `Vec` | -| int8_array | 23 | byte[] | ndarray(int8) | / | `int8_t[n]/vector` | `[n]int8/[]T` | `Vec` | -| int16_array | 24 | short[] | ndarray(int16) | / | `int16_t[n]/vector` | `[n]int16/[]T` | `Vec` | -| int32_array | 25 | int[] | ndarray(int32) | / | `int32_t[n]/vector` | `[n]int32/[]T` | `Vec` | -| int64_array | 26 | long[] | ndarray(int64) | / | `int64_t[n]/vector` | `[n]int64/[]T` | `Vec` | -| float16_array | 27 | float[] | ndarray(float16) | / | `fury::float16_t[n]/vector` | `[n]float16/[]T` | `Vec` | -| float32_array | 28 | float[] | ndarray(float32) | / | `float[n]/vector` | `[n]float32/[]T` | `Vec` | -| float64_array | 29 | double[] | ndarray(float64) | / | `double[n]/vector` | `[n]float64/[]T` | `Vec` | -| tensor | 30 | / | / | / | / | / | / | -| sparse tensor | 31 | / | / | / | / | / | / | -| arrow record batch | 32 | / | / | / | / | / | / | -| arrow table | 33 | / | / | / | / | / | / | +| Fury Type | Fury Type ID | Java | Python | Javascript | C++ | Golang | Rust | +|-------------------------------------|--------------|-----------------|-----------------------------------|-----------------|--------------------------------|------------------|------------------| +| bool | 1 | bool/Boolean | bool | Boolean | bool | bool | bool | +| int8 | 2 | byte/Byte | int/pyfury.Int8 | Type.int8() | int8_t | int8 | i8 | +| int16 | 3 | short/Short | int/pyfury.Int16 | Type.int16() | int16_t | int16 | i16 | +| int32 | 4 |
int/Integer | int/pyfury.Int32 | Type.int32() | int32_t | int32 | i32 | +| var_int32 | 5 | int/Integer | int/pyfury.VarInt32 | Type.varint32() | fury::varint32_t | fury.varint32 | fury::varint32 | +| int64 | 6 | long/Long | int/pyfury.Int64 | Type.int64() | int64_t | int64 | i64 | +| var_int64 | 7 | long/Long | int/pyfury.VarInt64 | Type.varint64() | fury::varint64_t | fury.varint64 | fury::varint64 | +| sli_int64 | 8 | long/Long | int/pyfury.SliInt64 | Type.sliint64() | fury::sliint64_t | fury.sliint64 | fury::sliint64 | +| float16 | 9 | float/Float | float/pyfury.Float16 | Type.float16() | fury::float16_t | fury.float16 | fury::f16 | +| float32 | 10 | float/Float | float/pyfury.Float32 | Type.float32() | float | float32 | f32 | +| float64 | 11 | double/Double | float/pyfury.Float64 | Type.float64() | double | float64 | f64 | +| string | 12 | String | str | String | string | string | String/str | +| enum | 13 | Enum subclasses | enum subclasses | / | enum | / | enum | +| named_enum | 14 | Enum subclasses | enum subclasses | / | enum | / | enum | +| struct | 15 | pojo/record | data class / type with type hints | object | struct/class | struct | struct | +| polymorphic_struct | 16 | pojo/record | data class / type with type hints | object | struct/class | struct | struct | +| compatible_struct | 17 | pojo/record | data class / type with type hints | object | struct/class | struct | struct | +| polymorphic_compatible_struct | 18 | pojo/record | data class / type with type hints | object | struct/class | struct | struct | +| named_struct | 19 | pojo/record | data class / type with type hints | object | struct/class | struct | struct | +| named_polymorphic_struct | 20 | pojo/record | data class / type with type hints | object | struct/class | struct | struct | +| named_compatible_struct | 21 | pojo/record | data class / type with type hints | object | struct/class | struct | struct | +| named_polymorphic_compatible_struct | 22 | pojo/record | data class / type with 
type hints | object | struct/class | struct | struct | +| ext | 23 | pojo/record | data class / type with type hints | object | struct/class | struct | struct | +| polymorphic_ext | 24 | pojo/record | data class / type with type hints | object | struct/class | struct | struct | +| named_ext | 25 | pojo/record | data class / type with type hints | object | struct/class | struct | struct | +| named_polymorphic_ext | 26 | pojo/record | data class / type with type hints | object | struct/class | struct | struct | +| list | 27 | List/Collection | list/tuple | array | vector | slice | Vec | +| set | 28 | Set | set | / | set | fury.Set | Set | +| map | 29 | Map | dict | Map | unordered_map | map | HashMap | +| duration | 30 | Duration | timedelta | Number | duration | Duration | Duration | +| timestamp | 31 | Instant | datetime | Number | std::chrono::nanoseconds | Time | DateTime | +| local_date | 32 | Date | datetime | Number | std::chrono::nanoseconds | Time | DateTime | +| decimal | 33 | BigDecimal | Decimal | bigint | / | / | / | +| binary | 34 | byte[] | bytes | / | `uint8_t[n]/vector` | `[n]uint8/[]T` | `Vec` | +| array | 35 | array | np.ndarray | / | / | array/slice | Vec | +| bool_array | 36 | bool[] | ndarray(np.bool_) | / | `bool[n]` | `[n]bool/[]T` | `Vec` | +| int8_array | 37 | byte[] | ndarray(int8) | / | `int8_t[n]/vector` | `[n]int8/[]T` | `Vec` | +| int16_array | 38 | short[] | ndarray(int16) | / | `int16_t[n]/vector` | `[n]int16/[]T` | `Vec` | +| int32_array | 39 | int[] | ndarray(int32) | / | `int32_t[n]/vector` | `[n]int32/[]T` | `Vec` | +| int64_array | 40 | long[] | ndarray(int64) | / | `int64_t[n]/vector` | `[n]int64/[]T` | `Vec` | +| float16_array | 41 | float[] | ndarray(float16) | / | `fury::float16_t[n]/vector` | `[n]float16/[]T` | `Vec` | +| float32_array | 42 | float[] | ndarray(float32) | / | `float[n]/vector` | `[n]float32/[]T` | `Vec` | +| float64_array | 43 | double[] | ndarray(float64) | / | `double[n]/vector` | `[n]float64/[]T` | `Vec` | 
+| arrow record batch | 44 | / | / | / | / | / | / | +| arrow table | 45 | / | / | / | / | / | / | ## Type info(not implemented currently) diff --git a/docs/specification/xlang_serialization_spec.md b/docs/specification/xlang_serialization_spec.md index 7fd991ceb9..c47710f5ab 100644 --- a/docs/specification/xlang_serialization_spec.md +++ b/docs/specification/xlang_serialization_spec.md @@ -39,28 +39,41 @@ also introduce more complexities compared to static serialization frameworks. So - string: a text string encoded using Latin1/UTF16/UTF-8 encoding. - enum: a data type consisting of a set of named values. Rust enum with non-predefined field values are not supported as an enum. +- named_enum: an enum whose value will be serialized as the registered name. +- struct: a morphic(final) type serialized by Fury Struct serializer. i.e. it doesn't have subclasses. Suppose we're + deserializing `List`, we can save dynamic serializer dispatch since `SomeClass` is morphic(final). +- polymorphic_struct: a type which is not morphic(not final). i.e. it has subclasses. Suppose we're deserializing + `List`, we must dispatch serializer dynamically since `SomeClass` is polymorphic(non-final). +- compatible_struct: a morphic(final) type serialized by Fury compatible Struct serializer. +- polymorphic_compatible_struct: a non-morphic(non-final) type serialized by Fury compatible Struct serializer. +- named_struct: a `struct` whose type mapping will be encoded as a name. +- named_polymorphic_struct: a `polymorphic_struct` whose type mapping will be encoded as a name. +- named_compatible_struct: a `compatible_struct` whose type mapping will be encoded as a name. +- named_polymorphic_compatible_struct: a `polymorphic_compatible_struct` whose type mapping will be encoded as a name. +- ext: a type which will be serialized by a customized serializer. +- polymorphic_ext: an `ext` type which is not morphic(not final). +- named_ext: an `ext` type whose type mapping will be encoded as a name.
+- named_polymorphic_ext: an `polymorphic_ext` type whose type mapping will be encoded as a name. - list: a sequence of objects. - set: an unordered set of unique elements. - map: a map of key-value pairs. Mutable types such as `list/map/set/array/tensor/arrow` are not allowed as key of map. -- time types: - - duration: an absolute length of time, independent of any calendar/timezone, as a count of nanoseconds. - - timestamp: a point in time, independent of any calendar/timezone, as a count of nanoseconds. The count is relative - to an epoch at UTC midnight on January 1, 1970. +- duration: an absolute length of time, independent of any calendar/timezone, as a count of nanoseconds. +- timestamp: a point in time, independent of any calendar/timezone, as a count of nanoseconds. The count is relative + to an epoch at UTC midnight on January 1, 1970. +- local_date: a naive date without timezone. The count is days relative to an epoch at UTC midnight on Jan 1, 1970. - decimal: exact decimal value represented as an integer value in two's complement. - binary: an variable-length array of bytes. -- array type: only allow numeric components. Other arrays will be taken as List. The implementation should support the +- array: only allow numeric components. Other arrays will be taken as List. The implementation should support the interoperability between array and list. - - array: multidimensional array which every sub-array can have different sizes but all have same type. - - bool_array: one dimensional int16 array. - - int8_array: one dimensional int8 array. - - int16_array: one dimensional int16 array. - - int32_array: one dimensional int32 array. - - int64_array: one dimensional int64 array. - - float16_array: one dimensional half_float_16 array. - - float32_array: one dimensional float32 array. - - float64_array: one dimensional float64 array. -- tensor: a multidimensional dense array of fixed-size values such as a NumPy ndarray. 
-- sparse tensor: a multidimensional array whose elements are almost all zeros. +- array: multidimensional array which every sub-array can have different sizes but all have same type. +- bool_array: one dimensional int16 array. +- int8_array: one dimensional int8 array. +- int16_array: one dimensional int16 array. +- int32_array: one dimensional int32 array. +- int64_array: one dimensional int64 array. +- float16_array: one dimensional half_float_16 array. +- float32_array: one dimensional float32 array. +- float64_array: one dimensional float64 array. - arrow record batch: an arrow [record batch](https://arrow.apache.org/docs/cpp/tables.html#record-batches) object. - arrow table: an arrow [table](https://arrow.apache.org/docs/cpp/tables.html#tables) object. @@ -68,6 +81,15 @@ Note: - Unsigned int/long are not added here, since not every language support those types. +### Polymorphisms + +For polymorphism, if one non-final class is registered, and only one subclass is registered, then we can take all +elements in List/Map have same type, thus reduce runtime check cost. + +Collection/Array polymorphism are not fully supported, since some languages such as golang have only one collection +type. If users want to get exactly the type he passed, he must pass that type when deserializing or annotate that type +to the field of struct. + ### Type disambiguation Due to differences between type systems of languages, those types can't be mapped one-to-one between languages. When @@ -117,8 +139,8 @@ Such information can be provided in other languages too: ### Type ID -All internal data types are expressed using an ID in range `-64~-1`. Users can use `0~32703` for representing their -types. At runtime, all type ids are added by `64`, and then encoded as an unsigned varint. +All internal data types are expressed using an ID in range `0~64`. Users can use `0~4096` for representing their +types. 
### Type mapping @@ -298,19 +320,17 @@ Meta header is a 64 bits number value encoded in little endian order. - type id: the registered id for the current type, which will be written as an unsigned varint. - field info: - header(8 - bits): `3 bits size + 2 bits field name encoding + polymorphism flag + nullability flag + ref tracking flag`. + bits): `4 bits size + 2 bits field name encoding + nullability flag + ref tracking flag`. Users can use annotation to provide those info. - 2 bits field name encoding: - encoding: `UTF8/ALL_TO_LOWER_SPECIAL/LOWER_UPPER_DIGIT_SPECIAL/TAG_ID` - If tag id is used, i.e. field name is written by an unsigned varint tag id. 2 bits encoding will be `11`. - size of field name: - - The `3 bits size: 0~7` will be used to indicate length `1~7`, the value `7` indicates to read more bytes, - the encoding will encode `size - 7` as a varint next. + - The `4 bits size: 0~14` will be used to indicate length `1~15`, the value `15` indicates to read more bytes, + the encoding will encode `size - 15` as a varint next. - If encoding is `TAG_ID`, then num_bytes of field name will be used to store tag id. - ref tracking: when set to 1, ref tracking will be enabled for this field. - nullability: when set to 1, this field can be null. - - polymorphism: when set to 1, the actual type of field will be the declared field type even the type if - not `final`. - field name: If tag id is set, tag id will be used instead. Otherwise meta string encoding `[length]` and data will be written instead. - type id: diff --git a/go/fury/fury_xlang_test.go b/go/fury/fury_xlang_test.go index b327ec343b..07b6b36d9b 100644 --- a/go/fury/fury_xlang_test.go +++ b/go/fury/fury_xlang_test.go @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. 
+//go:build skiptest +// +build skiptest + package fury_test import ( diff --git a/java/fury-core/src/main/java/org/apache/fury/AbstractThreadSafeFury.java b/java/fury-core/src/main/java/org/apache/fury/AbstractThreadSafeFury.java index 23a6aa407e..3e2aff67ff 100644 --- a/java/fury-core/src/main/java/org/apache/fury/AbstractThreadSafeFury.java +++ b/java/fury-core/src/main/java/org/apache/fury/AbstractThreadSafeFury.java @@ -38,15 +38,25 @@ public void register(Class cls, boolean createSerializer) { } @Override - public void register(Class cls, Short id) { + public void register(Class cls, int id) { registerCallback(fury -> fury.register(cls, id)); } @Override - public void register(Class cls, Short id, boolean createSerializer) { + public void register(Class cls, int id, boolean createSerializer) { registerCallback(fury -> fury.register(cls, id, createSerializer)); } + @Override + public void register(Class cls, String typeName) { + registerCallback(fury -> fury.register(cls, typeName)); + } + + @Override + public void register(Class cls, String namespace, String typeName) { + registerCallback(fury -> fury.register(cls, namespace, typeName)); + } + @Override public void registerSerializer(Class type, Class serializerClass) { registerCallback(fury -> fury.registerSerializer(type, serializerClass)); diff --git a/java/fury-core/src/main/java/org/apache/fury/BaseFury.java b/java/fury-core/src/main/java/org/apache/fury/BaseFury.java index 96a04be021..cb4b0b61e9 100644 --- a/java/fury-core/src/main/java/org/apache/fury/BaseFury.java +++ b/java/fury-core/src/main/java/org/apache/fury/BaseFury.java @@ -33,14 +33,21 @@ public interface BaseFury { /** - * register class. + * Register class and allocate an auto-grown ID for this class. Note that the registration order + * is important. If registration order is inconsistent, the allocated ID will be different, and + * the deserialization will fail. * * @param cls class to register.
*/ void register(Class cls); + /** register class with given id. */ + void register(Class cls, int id); + /** - * Register class. + * Register class and allocate an auto-grown ID for this class. Note that the registration order + * is important. If registration order is inconsistent, the allocated ID will be different, and + * the deserialization will fail. * * @param cls class to register. * @param createSerializer whether to create serializer, if true and codegen enabled, this will @@ -48,9 +55,6 @@ public interface BaseFury { */ void register(Class cls, boolean createSerializer); - /** register class with given id. */ - void register(Class cls, Short id); - /** * Register class with specified id. * @@ -59,10 +63,21 @@ public interface BaseFury { * @param createSerializer whether to create serializer, if true and codegen enabled, this will * generate the serializer code too. */ - void register(Class cls, Short id, boolean createSerializer); + void register(Class cls, int id, boolean createSerializer); + + /** register class with given type name which will be used for cross-language serialization. */ + void register(Class cls, String typeName); /** - * Register a Serializer. + * register class with given type namespace and name which will be used for cross-language + * serialization. + */ + void register(Class cls, String namespace, String typeName); + + /** + * Register a Serializer for a class, and allocate an auto-grown ID for this class if it's not + * registered yet. Note that the registration order is important. If registration order is + * inconsistent, the allocated ID will be different, and the deserialization will fail. * * @param type class needed to be serialized/deserialized. * @param serializerClass serializer class can be created with {@link Serializers#newSerializer}.
@@ -70,10 +85,18 @@ public interface BaseFury { */ void registerSerializer(Class type, Class serializerClass); + /** + * Register a Serializer for a class, and allocate an auto-grown ID for this class if it's not + * registered yet. Note that the registration order is important. If registration order is + * inconsistent, the allocated ID will be different, and the deserialization will fail. + */ void registerSerializer(Class type, Serializer serializer); /** - * Register a Serializer created by serializerCreator when fury created. + * Register a Serializer created by serializerCreator when fury is created, and allocate an + * auto-grown ID for this class if it's not registered yet. Note that the registration order is + * important. If registration order is inconsistent, the allocated ID will be different, and the + * deserialization will fail. * * @param type class needed to be serialized/deserialized. * @param serializerCreator serializer creator with param {@link Fury} @@ -107,6 +130,8 @@ public interface BaseFury { /** Deserialize obj from a byte array.
*/ Object deserialize(byte[] bytes); + T deserialize(byte[] bytes, Class type); + Object deserialize(byte[] bytes, Iterable outOfBandBuffers); /** diff --git a/java/fury-core/src/main/java/org/apache/fury/Fury.java b/java/fury-core/src/main/java/org/apache/fury/Fury.java index b1918358a2..bb5b102a8a 100644 --- a/java/fury-core/src/main/java/org/apache/fury/Fury.java +++ b/java/fury-core/src/main/java/org/apache/fury/Fury.java @@ -27,7 +27,6 @@ import java.util.Arrays; import java.util.HashMap; import java.util.Iterator; -import java.util.List; import java.util.function.Consumer; import java.util.function.Function; import javax.annotation.concurrent.NotThreadSafe; @@ -53,10 +52,10 @@ import org.apache.fury.resolver.NoRefResolver; import org.apache.fury.resolver.RefResolver; import org.apache.fury.resolver.SerializationContext; +import org.apache.fury.resolver.XtypeResolver; import org.apache.fury.serializer.ArraySerializers; import org.apache.fury.serializer.BufferCallback; import org.apache.fury.serializer.BufferObject; -import org.apache.fury.serializer.OpaqueObjects; import org.apache.fury.serializer.PrimitiveSerializers.LongSerializer; import org.apache.fury.serializer.Serializer; import org.apache.fury.serializer.SerializerFactory; @@ -64,7 +63,7 @@ import org.apache.fury.serializer.collection.CollectionSerializers.ArrayListSerializer; import org.apache.fury.serializer.collection.MapSerializers.HashMapSerializer; import org.apache.fury.type.Generics; -import org.apache.fury.type.Type; +import org.apache.fury.type.Types; import org.apache.fury.util.ExceptionUtils; import org.apache.fury.util.Preconditions; import org.apache.fury.util.StringUtils; @@ -90,8 +89,7 @@ public final class Fury implements BaseFury { public static final byte NOT_NULL_VALUE_FLAG = -1; // this flag indicates that the object is a referencable and first write. 
public static final byte REF_VALUE_FLAG = 0; - public static final byte NOT_SUPPORT_CROSS_LANGUAGE = 0; - public static final short FURY_TYPE_TAG_ID = Type.FURY_TYPE_TAG.getId(); + public static final byte NOT_SUPPORT_XLANG = 0; private static final byte isNilFlag = 1; private static final byte isLittleEndianFlag = 1 << 1; private static final byte isCrossLanguageFlag = 1 << 2; @@ -105,12 +103,12 @@ public final class Fury implements BaseFury { private final boolean shareMeta; private final RefResolver refResolver; private final ClassResolver classResolver; + private final XtypeResolver xtypeResolver; private final MetaStringResolver metaStringResolver; private final SerializationContext serializationContext; private final ClassLoader classLoader; private final JITContext jitContext; private MemoryBuffer buffer; - private final List nativeObjects; private final StringSerializer stringSerializer; private final ArrayListSerializer arrayListSerializer; private final HashMapSerializer hashMapSerializer; @@ -144,13 +142,17 @@ public Fury(FuryBuilder builder, ClassLoader classLoader) { this.refResolver = new NoRefResolver(); } jitContext = new JITContext(this); + generics = new Generics(this); metaStringResolver = new MetaStringResolver(); classResolver = new ClassResolver(this); classResolver.initialize(); + if (language != Language.JAVA) { + xtypeResolver = new XtypeResolver(this); + } else { + xtypeResolver = null; + } serializationContext = new SerializationContext(config); this.classLoader = classLoader; - nativeObjects = new ArrayList<>(); - generics = new Generics(this); stringSerializer = new StringSerializer(this); arrayListSerializer = new ArrayListSerializer(this); hashMapSerializer = new HashMapSerializer(this); @@ -161,42 +163,82 @@ public Fury(FuryBuilder builder, ClassLoader classLoader) { @Override public void register(Class cls) { - classResolver.register(cls); + if (language == Language.JAVA) { + classResolver.register(cls); + } else { + 
xtypeResolver.register(cls); + } } @Override - public void register(Class cls, boolean createSerializer) { - classResolver.register(cls, createSerializer); + public void register(Class cls, int id) { + if (language == Language.JAVA) { + classResolver.register(cls, id); + } else { + xtypeResolver.register(cls, id); + } } @Override - public void register(Class cls, Short id) { - classResolver.register(cls, id); + public void register(Class cls, boolean createSerializer) { + if (language == Language.JAVA) { + classResolver.register(cls, createSerializer); + } else { + xtypeResolver.register(cls); + } } @Override - public void register(Class cls, Short id, boolean createSerializer) { - classResolver.register(cls, id, createSerializer); + public void register(Class cls, int id, boolean createSerializer) { + if (language == Language.JAVA) { + classResolver.register(cls, id, createSerializer); + } else { + xtypeResolver.register(cls, id); + } } /** register class with given type tag which will be used for cross-language serialization. 
*/ - public void register(Class cls, String typeTag) { - classResolver.register(cls, typeTag); + public void register(Class cls, String typeName) { + Preconditions.checkArgument(language != Language.JAVA); + int idx = typeName.lastIndexOf('.'); + String namespace = ""; + if (idx > 0) { + namespace = typeName.substring(0, idx); + typeName = typeName.substring(idx + 1); + } + register(cls, namespace, typeName); + } + + public void register(Class cls, String namespace, String typeName) { + Preconditions.checkArgument(language != Language.JAVA); + xtypeResolver.register(cls, namespace, typeName); } @Override public void registerSerializer(Class type, Class serializerClass) { - classResolver.registerSerializer(type, serializerClass); + if (language == Language.JAVA) { + classResolver.registerSerializer(type, serializerClass); + } else { + xtypeResolver.registerSerializer(type, serializerClass); + } } @Override public void registerSerializer(Class type, Serializer serializer) { - classResolver.registerSerializer(type, serializer); + if (language == Language.JAVA) { + classResolver.registerSerializer(type, serializer); + } else { + xtypeResolver.registerSerializer(type, serializer); + } } @Override public void registerSerializer(Class type, Function> serializerCreator) { - classResolver.registerSerializer(type, serializerCreator.apply(this)); + if (language == Language.JAVA) { + classResolver.registerSerializer(type, serializerCreator.apply(this)); + } else { + xtypeResolver.registerSerializer(type, serializerCreator.apply(this)); + } } @Override @@ -208,6 +250,15 @@ public SerializerFactory getSerializerFactory() { return classResolver.getSerializerFactory(); } + public Serializer getSerializer(Class cls) { + Preconditions.checkNotNull(cls); + if (language == Language.JAVA) { + return classResolver.getSerializer(cls); + } else { + return xtypeResolver.getClassInfo(cls).getSerializer(); + } + } + @Override public MemoryBuffer serialize(Object obj, long address, int size) 
{ MemoryBuffer buffer = MemoryUtils.buffer(address, size); @@ -268,7 +319,7 @@ public MemoryBuffer serialize(MemoryBuffer buffer, Object obj, BufferCallback ca write(buffer, obj); } else { buffer.writeByte((byte) Language.JAVA.ordinal()); - xserializeInternal(buffer, obj); + xwriteRef(buffer, obj); } return buffer; } catch (StackOverflowError t) { @@ -336,8 +387,9 @@ public void resetBuffer() { private void write(MemoryBuffer buffer, Object obj) { int startOffset = buffer.writerIndex(); + boolean shareMeta = config.isMetaShareEnabled(); if (shareMeta) { - buffer.writeInt32(-1); // preserve 4-byte for nativeObjects start offsets. + buffer.writeInt32(-1); // preserve 4-byte for meta start offsets. } // reduce caller stack if (!refResolver.writeRefOrNull(buffer, obj)) { @@ -352,22 +404,6 @@ private void write(MemoryBuffer buffer, Object obj) { } } - private void xserializeInternal(MemoryBuffer buffer, Object obj) { - int startOffset = buffer.writerIndex(); - buffer.writeInt32(-1); // preserve 4-byte for nativeObjects start offsets. - buffer.writeInt32(-1); // preserve 4-byte for nativeObjects size - xwriteRef(buffer, obj); - buffer.putInt32(startOffset, buffer.writerIndex()); - buffer.putInt32(startOffset + 4, nativeObjects.size()); - refResolver.resetWrite(); - // fury write opaque object classname which cause later write of classname only write an id. - classResolver.resetWrite(); - metaStringResolver.resetWrite(); - for (Object nativeObject : nativeObjects) { - writeRef(buffer, nativeObject); - } - } - /** Serialize a nullable referencable object to buffer. 
*/ public void writeRef(MemoryBuffer buffer, Object obj) { if (!refResolver.writeRefOrNull(buffer, obj)) { @@ -475,72 +511,63 @@ public void writeNonRef(MemoryBuffer buffer, Object obj, ClassInfo classInfo) { depth--; } - public void writeNonRef(MemoryBuffer buffer, T obj, Serializer serializer) { - depth++; - serializer.write(buffer, obj); - depth--; - } - public void xwriteRef(MemoryBuffer buffer, Object obj) { if (!refResolver.writeRefOrNull(buffer, obj)) { - xwriteNonRef(buffer, obj, null); + ClassInfo classInfo = xtypeResolver.writeClassInfo(buffer, obj); + switch (classInfo.getXtypeId()) { + case Types.BOOL: + buffer.writeBoolean((Boolean) obj); + break; + case Types.INT8: + buffer.writeByte((Byte) obj); + break; + case Types.INT16: + buffer.writeInt16((Short) obj); + break; + case Types.INT32: + case Types.VAR_INT32: + // TODO(chaokunyang) support other encoding + buffer.writeVarInt32((Integer) obj); + break; + case Types.INT64: + case Types.VAR_INT64: + // TODO(chaokunyang) support other encoding + case Types.SLI_INT64: + // TODO(chaokunyang) support varint encoding + buffer.writeVarInt64((Long) obj); + break; + case Types.FLOAT32: + buffer.writeFloat32((Float) obj); + break; + case Types.FLOAT64: + buffer.writeFloat64((Double) obj); + break; + // TODO(add fastpath for other types) + default: + depth++; + classInfo.getSerializer().xwrite(buffer, obj); + depth--; + } } } public void xwriteRef(MemoryBuffer buffer, T obj, Serializer serializer) { if (serializer.needToWriteRef()) { if (!refResolver.writeRefOrNull(buffer, obj)) { - xwriteNonRef(buffer, obj, serializer); + depth++; + serializer.xwrite(buffer, obj); + depth--; } } else { if (obj == null) { buffer.writeByte(Fury.NULL_FLAG); } else { buffer.writeByte(Fury.NOT_NULL_VALUE_FLAG); - xwriteNonRef(buffer, obj, serializer); - } - } - } - - public void xwriteRefByNullableSerializer( - MemoryBuffer buffer, T obj, Serializer serializer) { - if (serializer == null) { - xwriteRef(buffer, obj); - } else { - 
xwriteRef(buffer, obj, serializer); - } - } - - public void xwriteNonRef(MemoryBuffer buffer, T obj, Serializer serializer) { - depth++; - @SuppressWarnings("unchecked") - Class cls = (Class) obj.getClass(); - if (serializer == null) { - serializer = classResolver.getSerializer(cls); - } - short typeId = serializer.getXtypeId(); - buffer.writeInt16(typeId); - if (typeId != NOT_SUPPORT_CROSS_LANGUAGE) { - if (typeId == FURY_TYPE_TAG_ID) { - classResolver.xwriteTypeTag(buffer, cls); - } - if (typeId < NOT_SUPPORT_CROSS_LANGUAGE) { - classResolver.xwriteClass(buffer, cls); + depth++; + serializer.xwrite(buffer, obj); + depth--; } - serializer.xwrite(buffer, obj); - } else { - // Write classname so it can be used for debugging which object doesn't support - // cross-language. - // TODO add a config to disable this to reduce space cost. - classResolver.xwriteClass(buffer, cls); - // serializer may increase reference id multi times internally, thus peer cross-language later - // fields/objects deserialization will use wrong reference id since we skip opaque objects - // deserialization. - // So we stash native objects and serialize all those object at the last. - buffer.writeVarUint32(nativeObjects.size()); - nativeObjects.add(obj); } - depth--; } /** Write not null data to buffer. 
*/ @@ -713,6 +740,17 @@ public Object deserialize(byte[] bytes) { return deserialize(MemoryUtils.wrap(bytes), null); } + @SuppressWarnings("unchecked") + @Override + public T deserialize(byte[] bytes, Class type) { + generics.pushGenericType(classResolver.buildGenericType(type)); + try { + return (T) deserialize(MemoryUtils.wrap(bytes), null); + } finally { + generics.popGenericType(); + } + } + @Override public Object deserialize(byte[] bytes, Iterable outOfBandBuffers) { return deserialize(MemoryUtils.wrap(bytes), outOfBandBuffers); @@ -784,7 +822,7 @@ public Object deserialize(MemoryBuffer buffer, Iterable outOfBandB } Object obj; if (isTargetXLang) { - obj = xdeserializeInternal(buffer); + obj = xreadRef(buffer); } else { if (shareMeta) { readClassDefs(buffer); @@ -829,28 +867,6 @@ public Object deserialize(FuryReadableChannel channel, Iterable ou return deserialize(buf, outOfBandBuffers); } - private Object xdeserializeInternal(MemoryBuffer buffer) { - Object obj; - int nativeObjectsStartOffset = buffer.readInt32(); - int nativeObjectsSize = buffer.readInt32(); - int endReaderIndex = nativeObjectsStartOffset; - if (peerLanguage == Language.JAVA) { - int readerIndex = buffer.readerIndex(); - buffer.readerIndex(nativeObjectsStartOffset); - for (int i = 0; i < nativeObjectsSize; i++) { - nativeObjects.add(readRef(buffer)); - } - endReaderIndex = buffer.readerIndex(); - buffer.readerIndex(readerIndex); - refResolver.resetRead(); - classResolver.resetRead(); - metaStringResolver.resetRead(); - } - obj = xreadRef(buffer); - buffer.readerIndex(endReaderIndex); - return obj; - } - /** Deserialize nullable referencable object from buffer. 
*/ public Object readRef(MemoryBuffer buffer) { RefResolver refResolver = this.refResolver; @@ -965,7 +981,7 @@ public Object xreadRef(MemoryBuffer buffer) { RefResolver refResolver = this.refResolver; int nextReadRefId = refResolver.tryPreserveRefId(buffer); if (nextReadRefId >= NOT_NULL_VALUE_FLAG) { - Object o = xreadNonRef(buffer, null); + Object o = xreadNonRef(buffer, xtypeResolver.readClassInfo(buffer)); refResolver.setReadObject(nextReadRefId, o); return o; } else { @@ -994,52 +1010,41 @@ public Object xreadRef(MemoryBuffer buffer, Serializer serializer) { } } - public Object xreadRefByNullableSerializer(MemoryBuffer buffer, Serializer serializer) { - if (serializer == null) { - return xreadRef(buffer); - } else { - return xreadRef(buffer, serializer); - } - } - public Object xreadNonRef(MemoryBuffer buffer, Serializer serializer) { depth++; - short typeId = buffer.readInt16(); - ClassResolver classResolver = this.classResolver; - if (typeId != NOT_SUPPORT_CROSS_LANGUAGE) { - Class cls = null; - if (typeId == FURY_TYPE_TAG_ID) { - cls = classResolver.readClassByTypeTag(buffer); - } - if (typeId < NOT_SUPPORT_CROSS_LANGUAGE) { - if (peerLanguage != Language.JAVA) { - classResolver.xreadClassName(buffer); - cls = classResolver.getClassByTypeId((short) -typeId); - } else { - cls = classResolver.xreadClass(buffer); - } - } else { - if (typeId != FURY_TYPE_TAG_ID) { - cls = classResolver.getClassByTypeId(typeId); - } - } - Preconditions.checkNotNull(cls); - if (serializer == null) { - serializer = classResolver.getSerializer(cls); - } - // TODO check serializer consistent with `classResolver.getSerializer(cls)` when serializer - // not null; - Object o = serializer.xread(buffer); - depth--; - return o; - } else { - String className = classResolver.xreadClassName(buffer); - int ordinal = buffer.readVarUint32(); - if (peerLanguage != Language.JAVA) { - return OpaqueObjects.of(peerLanguage, className, ordinal); - } else { - return nativeObjects.get(ordinal); - } + 
Object o = serializer.xread(buffer); + depth--; + return o; + } + + public Object xreadNonRef(MemoryBuffer buffer, ClassInfo classInfo) { + assert classInfo != null; + switch (classInfo.getXtypeId()) { + case Types.BOOL: + return buffer.readBoolean(); + case Types.INT8: + return buffer.readByte(); + case Types.INT16: + return buffer.readInt16(); + case Types.INT32: + case Types.VAR_INT32: + // TODO(chaokunyang) support other encoding + return buffer.readVarInt32(); + case Types.INT64: + case Types.VAR_INT64: + // TODO(chaokunyang) support other encoding + case Types.SLI_INT64: + return buffer.readVarInt64(); + case Types.FLOAT32: + return buffer.readFloat32(); + case Types.FLOAT64: + return buffer.readFloat64(); + // TODO(add fastpath for other types) + default: + depth++; + Object o = classInfo.getSerializer().xread(buffer); + depth--; + return o; } } @@ -1062,7 +1067,7 @@ public void serializeJavaObject(MemoryBuffer buffer, Object obj) { } if (config.isMetaShareEnabled()) { int startOffset = buffer.writerIndex(); - buffer.writeInt32(-1); // preserve 4-byte for nativeObjects start offsets. + buffer.writeInt32(-1); // preserve 4-byte for meta start offsets. 
if (!refResolver.writeRefOrNull(buffer, obj)) { ClassInfo classInfo = classResolver.getOrUpdateClassInfo(obj.getClass()); writeData(buffer, classInfo, obj); @@ -1458,7 +1463,6 @@ public void reset() { classResolver.reset(); metaStringResolver.reset(); serializationContext.reset(); - nativeObjects.clear(); peerOutOfBandEnabled = false; bufferCallback = null; depth = 0; @@ -1470,7 +1474,6 @@ public void resetWrite() { classResolver.resetWrite(); metaStringResolver.resetWrite(); serializationContext.resetWrite(); - nativeObjects.clear(); bufferCallback = null; depth = 0; } @@ -1480,7 +1483,6 @@ public void resetRead() { classResolver.resetRead(); metaStringResolver.resetRead(); serializationContext.resetRead(); - nativeObjects.clear(); peerOutOfBandEnabled = false; depth = 0; classDefEndOffset = -1; @@ -1527,6 +1529,10 @@ public ClassResolver getClassResolver() { return classResolver; } + public XtypeResolver getXtypeResolver() { + return xtypeResolver; + } + public MetaStringResolver getMetaStringResolver() { return metaStringResolver; } diff --git a/java/fury-core/src/main/java/org/apache/fury/ThreadLocalFury.java b/java/fury-core/src/main/java/org/apache/fury/ThreadLocalFury.java index 032a250608..650e962e9c 100644 --- a/java/fury-core/src/main/java/org/apache/fury/ThreadLocalFury.java +++ b/java/fury-core/src/main/java/org/apache/fury/ThreadLocalFury.java @@ -137,6 +137,11 @@ public Object deserialize(byte[] bytes) { return bindingThreadLocal.get().get().deserialize(bytes); } + @Override + public T deserialize(byte[] bytes, Class type) { + return bindingThreadLocal.get().get().deserialize(bytes, type); + } + @Override public Object deserialize(byte[] bytes, Iterable outOfBandBuffers) { return bindingThreadLocal.get().get().deserialize(bytes, outOfBandBuffers); diff --git a/java/fury-core/src/main/java/org/apache/fury/exception/ClassUnregisteredException.java b/java/fury-core/src/main/java/org/apache/fury/exception/ClassUnregisteredException.java new file mode 
100644 index 0000000000..ca8874f5fa --- /dev/null +++ b/java/fury-core/src/main/java/org/apache/fury/exception/ClassUnregisteredException.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.fury.exception; + +public class ClassUnregisteredException extends FuryException { + + public ClassUnregisteredException(Class cls) { + this(cls.getName()); + } + + public ClassUnregisteredException(String qualifiedName) { + super(String.format("Class %s is not registered", qualifiedName)); + } +} diff --git a/java/fury-core/src/main/java/org/apache/fury/serializer/OpaqueObjects.java b/java/fury-core/src/main/java/org/apache/fury/exception/SerializerUnregisteredException.java similarity index 50% rename from java/fury-core/src/main/java/org/apache/fury/serializer/OpaqueObjects.java rename to java/fury-core/src/main/java/org/apache/fury/exception/SerializerUnregisteredException.java index 5d2b00571a..967867bcbf 100644 --- a/java/fury-core/src/main/java/org/apache/fury/serializer/OpaqueObjects.java +++ b/java/fury-core/src/main/java/org/apache/fury/exception/SerializerUnregisteredException.java @@ -17,38 +17,15 @@ * under the License. 
*/ -package org.apache.fury.serializer; +package org.apache.fury.exception; -import org.apache.fury.config.Language; +public class SerializerUnregisteredException extends FuryException { -/** Stub objects for unsupported cross-language serializing type. */ -public class OpaqueObjects { - - public static OpaqueObject of(Language language, String className, int ordinal) { - return new OpaqueObject(language, className, ordinal); + public SerializerUnregisteredException(Class cls) { + this(cls.getName()); } - public static class OpaqueObject { - private final Language language; - private final String className; - private final int ordinal; - - public OpaqueObject(Language language, String className, int ordinal) { - this.language = language; - this.className = className; - this.ordinal = ordinal; - } - - public Language language() { - return language; - } - - public String className() { - return className; - } - - public int ordinal() { - return ordinal; - } + public SerializerUnregisteredException(String qualifiedName) { + super(String.format("Class %s is not registered with a serializer", qualifiedName)); } } diff --git a/java/fury-core/src/main/java/org/apache/fury/pool/ThreadPoolFury.java b/java/fury-core/src/main/java/org/apache/fury/pool/ThreadPoolFury.java index ff701e0c06..40abe79bc0 100644 --- a/java/fury-core/src/main/java/org/apache/fury/pool/ThreadPoolFury.java +++ b/java/fury-core/src/main/java/org/apache/fury/pool/ThreadPoolFury.java @@ -138,6 +138,11 @@ public Object deserialize(byte[] bytes) { return execute(fury -> fury.deserialize(bytes)); } + @Override + public T deserialize(byte[] bytes, Class type) { + return execute(fury -> fury.deserialize(bytes, type)); + } + @Override public Object deserialize(byte[] bytes, Iterable outOfBandBuffers) { return execute(fury -> fury.deserialize(bytes, outOfBandBuffers)); diff --git a/java/fury-core/src/main/java/org/apache/fury/resolver/ClassInfo.java 
b/java/fury-core/src/main/java/org/apache/fury/resolver/ClassInfo.java index 57efd230f2..3ca0c85587 100644 --- a/java/fury-core/src/main/java/org/apache/fury/resolver/ClassInfo.java +++ b/java/fury-core/src/main/java/org/apache/fury/resolver/ClassInfo.java @@ -20,7 +20,8 @@ package org.apache.fury.resolver; import static org.apache.fury.meta.Encoders.GENERIC_ENCODER; -import static org.apache.fury.meta.Encoders.PACKAGE_ENCODER; +import static org.apache.fury.meta.Encoders.PACKAGE_DECODER; +import static org.apache.fury.meta.Encoders.TYPE_NAME_DECODER; import org.apache.fury.collection.Tuple2; import org.apache.fury.config.Language; @@ -42,7 +43,7 @@ public class ClassInfo { final MetaStringBytes packageNameBytes; final MetaStringBytes classNameBytes; final boolean isDynamicGeneratedClass; - final MetaStringBytes typeTagBytes; + int xtypeId; Serializer serializer; // use primitive to avoid boxing // class id must be less than Integer.MAX_VALUE/2 since we use bit 0 as class id flag. @@ -56,15 +57,15 @@ public class ClassInfo { MetaStringBytes packageNameBytes, MetaStringBytes classNameBytes, boolean isDynamicGeneratedClass, - MetaStringBytes typeTagBytes, Serializer serializer, - short classId) { + short classId, + short xtypeId) { this.cls = cls; this.fullClassNameBytes = fullClassNameBytes; this.packageNameBytes = packageNameBytes; this.classNameBytes = classNameBytes; this.isDynamicGeneratedClass = isDynamicGeneratedClass; - this.typeTagBytes = typeTagBytes; + this.xtypeId = xtypeId; this.serializer = serializer; this.classId = classId; if (cls != null && classId == ClassResolver.NO_CLASS_ID) { @@ -75,9 +76,9 @@ public class ClassInfo { ClassInfo( ClassResolver classResolver, Class cls, - String tag, Serializer serializer, - short classId) { + short classId, + short xtypeId) { this.cls = cls; this.serializer = serializer; needToWriteClassDef = serializer != null && classResolver.needToWriteClassDef(serializer); @@ -104,13 +105,7 @@ public class ClassInfo { 
this.packageNameBytes = null; this.classNameBytes = null; } - if (tag != null) { - this.typeTagBytes = - metaStringResolver.getOrCreateMetaStringBytes( - PACKAGE_ENCODER.encode(tag, Encoding.UTF_8)); - } else { - this.typeTagBytes = null; - } + this.xtypeId = xtypeId; this.classId = classId; if (cls != null) { boolean isLambda = Functions.isLambda(cls); @@ -135,12 +130,8 @@ public short getClassId() { return classId; } - public MetaStringBytes getPackageNameBytes() { - return packageNameBytes; - } - - public MetaStringBytes getClassNameBytes() { - return classNameBytes; + public int getXtypeId() { + return xtypeId; } @SuppressWarnings("unchecked") @@ -153,6 +144,14 @@ void setSerializer(ClassResolver resolver, Serializer serializer) { needToWriteClassDef = serializer != null && resolver.needToWriteClassDef(serializer); } + public String decodeNamespace() { + return packageNameBytes.decode(PACKAGE_DECODER); + } + + public String decodeTypeName() { + return classNameBytes.decode(TYPE_NAME_DECODER); + } + @Override public String toString() { return "ClassInfo{" diff --git a/java/fury-core/src/main/java/org/apache/fury/resolver/ClassNameBytes.java b/java/fury-core/src/main/java/org/apache/fury/resolver/ClassNameBytes.java new file mode 100644 index 0000000000..ab2c01dd8a --- /dev/null +++ b/java/fury-core/src/main/java/org/apache/fury/resolver/ClassNameBytes.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.fury.resolver; + +class ClassNameBytes { + private final long packageHash; + private final long classNameHash; + + ClassNameBytes(long packageHash, long classNameHash) { + this.packageHash = packageHash; + this.classNameHash = classNameHash; + } + + @Override + public boolean equals(Object o) { + // ClassNameBytes is used internally, skip + ClassNameBytes that = (ClassNameBytes) o; + return packageHash == that.packageHash && classNameHash == that.classNameHash; + } + + @Override + public int hashCode() { + int result = 31 + (int) (packageHash ^ (packageHash >>> 32)); + result = result * 31 + (int) (classNameHash ^ (classNameHash >>> 32)); + return result; + } +} diff --git a/java/fury-core/src/main/java/org/apache/fury/resolver/ClassResolver.java b/java/fury-core/src/main/java/org/apache/fury/resolver/ClassResolver.java index dbaf4bb403..9b73734e0b 100644 --- a/java/fury-core/src/main/java/org/apache/fury/resolver/ClassResolver.java +++ b/java/fury-core/src/main/java/org/apache/fury/resolver/ClassResolver.java @@ -19,6 +19,7 @@ package org.apache.fury.resolver; +import static org.apache.fury.Fury.NOT_SUPPORT_XLANG; import static org.apache.fury.meta.ClassDef.SIZE_TWO_BYTES_FLAG; import static org.apache.fury.meta.Encoders.PACKAGE_DECODER; import static org.apache.fury.meta.Encoders.PACKAGE_ENCODER; @@ -205,14 +206,13 @@ public class ClassResolver { public static final short CLASS_CLASS_ID = (short) (PRIMITIVE_DOUBLE_ARRAY_CLASS_ID + 6); public static final short EMPTY_OBJECT_ID = (short) (PRIMITIVE_DOUBLE_ARRAY_CLASS_ID + 
7); // use a lower load factor to minimize hash collision - private static final float loadFactor = 0.25f; private static final float furyMapLoadFactor = 0.25f; private static final int estimatedNumRegistered = 150; private static final String SET_META__CONTEXT_MSG = "Meta context must be set before serialization, " + "please set meta context by SerializationContext.setMetaContext"; - private static final ClassInfo NIL_CLASS_INFO = - new ClassInfo(null, null, null, null, false, null, null, ClassResolver.NO_CLASS_ID); + static final ClassInfo NIL_CLASS_INFO = + new ClassInfo(null, null, null, null, false, null, NO_CLASS_ID, NOT_SUPPORT_XLANG); private final Fury fury; private ClassInfo[] registeredId2ClassInfo = new ClassInfo[] {}; @@ -221,13 +221,9 @@ public class ClassResolver { private final IdentityMap, ClassInfo> classInfoMap = new IdentityMap<>(estimatedNumRegistered, furyMapLoadFactor); private ClassInfo classInfoCache; - private final ObjectMap> classNameBytes2Class = - new ObjectMap<>(16, furyMapLoadFactor); // Every deserialization for unregistered class will query it, performance is important. 
private final ObjectMap compositeClassNameBytes2ClassInfo = new ObjectMap<>(16, furyMapLoadFactor); - private final HashMap> typeIdToClassXLangMap = new HashMap<>(8, loadFactor); - private final HashMap> typeTagToClassXLangMap = new HashMap<>(8, loadFactor); private final MetaStringResolver metaStringResolver; private final boolean metaContextShareEnabled; private final Map, ClassDef> classDefMap = new HashMap<>(); @@ -256,7 +252,8 @@ private static class ExtRegistry { descriptorsCache = new ConcurrentHashMap<>(); private ClassChecker classChecker = (classResolver, className) -> true; private GenericType objectGenericType; - private Map, CodeGenerator> codeGeneratorMap = new HashMap<>(); + private final IdentityMap genericTypes = new IdentityMap<>(); + private final Map, CodeGenerator> codeGeneratorMap = new HashMap<>(); } public ClassResolver(Fury fury) { @@ -436,8 +433,7 @@ public void register(Class cls, String typeTag) { + "Fury#register(Class) or Fury.register(Class, Short)"); } register(cls); - Preconditions.checkArgument(!typeTagToClassXLangMap.containsKey(typeTag)); - addSerializer(cls, new StructSerializer<>(fury, cls, typeTag)); + addSerializer(cls, new StructSerializer<>(fury, cls)); } /** @@ -476,7 +472,7 @@ public void register(Class cls, int classId) { if (classInfo != null) { classInfo.classId = id; } else { - classInfo = new ClassInfo(this, cls, null, null, id); + classInfo = new ClassInfo(this, cls, null, id, NOT_SUPPORT_XLANG); // make `extRegistry.registeredClassIdMap` and `classInfoMap` share same classInfo // instances. 
classInfoMap.put(cls, classInfo); @@ -491,7 +487,7 @@ public void register(String className, int classId) { register(loadClass(className, false, 0, false), classId); } - public void register(Class cls, Short id, boolean createSerializer) { + public void register(Class cls, int id, boolean createSerializer) { register(cls, id); if (createSerializer) { createSerializerAhead(cls); @@ -729,18 +725,6 @@ public void clearSerializer(Class cls) { /** Ass serializer for specified class. */ private void addSerializer(Class type, Serializer serializer) { Preconditions.checkNotNull(serializer); - String typeTag = null; - short typeId = serializer.getXtypeId(); - if (typeId != Fury.NOT_SUPPORT_CROSS_LANGUAGE) { - if (typeId > Fury.NOT_SUPPORT_CROSS_LANGUAGE) { - typeIdToClassXLangMap.put(typeId, type); - } - if (typeId == Fury.FURY_TYPE_TAG_ID) { - typeTag = serializer.getCrossLanguageTypeTag(); - typeTagToClassXLangMap.put(typeTag, type); - } - } - // 1. Try to get ClassInfo from `registeredId2ClassInfo` and // `classInfoMap` or create a new `ClassInfo`. ClassInfo classInfo; @@ -758,8 +742,8 @@ private void addSerializer(Class type, Serializer serializer) { classInfo = classInfoMap.get(type); } - if (classInfo == null || typeTag != null || classId != classInfo.classId) { - classInfo = new ClassInfo(this, type, typeTag, null, classId); + if (classInfo == null || classId != classInfo.classId) { + classInfo = new ClassInfo(this, type, null, classId, (short) 0); classInfoMap.put(type, classInfo); if (registered) { registeredId2ClassInfo[classId] = classInfo; @@ -968,6 +952,21 @@ public boolean isCollection(Class cls) { } } + public boolean isSet(Class cls) { + if (Set.class.isAssignableFrom(cls)) { + return true; + } + if (fury.getConfig().isScalaOptimizationEnabled()) { + // Scala map is scala iterable too. 
+ if (ScalaTypes.getScalaMapType().isAssignableFrom(cls)) { + return false; + } + return ScalaTypes.getScalaSetType().isAssignableFrom(cls); + } else { + return false; + } + } + public boolean isMap(Class cls) { return Map.class.isAssignableFrom(cls) || (fury.getConfig().isScalaOptimizationEnabled() @@ -1150,6 +1149,10 @@ public ClassInfo getClassInfo(Class cls, boolean createClassInfoIfNotFound) { } } + void setClassInfo(Class cls, ClassInfo classInfo) { + classInfoMap.put(cls, classInfo); + } + @Internal public ClassInfo getOrUpdateClassInfo(Class cls) { ClassInfo classInfo = classInfoCache; @@ -1435,7 +1438,7 @@ private ClassInfo getMetaSharedClassInfo(ClassDef classDef, Class clz) { Class cls = clz; Short classId = extRegistry.registeredClassIdMap.get(cls); ClassInfo classInfo = - new ClassInfo(this, cls, null, null, classId == null ? NO_CLASS_ID : classId); + new ClassInfo(this, cls, null, classId == null ? NO_CLASS_ID : classId, NOT_SUPPORT_XLANG); if (NonexistentClass.class.isAssignableFrom(TypeUtils.getComponentIfArray(cls))) { if (cls == NonexistentMetaShared.class) { classInfo.setSerializer(this, new NonexistentClassSerializer(fury, classDef)); @@ -1603,7 +1606,9 @@ public void writeClassInternal(MemoryBuffer buffer, Class cls) { Short classId = extRegistry.registeredClassIdMap.get(cls); // Don't create serializer in case the object for class is non-serializable, // Or class is abstract or interface. - classInfo = new ClassInfo(this, cls, null, null, classId == null ? NO_CLASS_ID : classId); + classInfo = + new ClassInfo( + this, cls, null, classId == null ? 
NO_CLASS_ID : classId, NOT_SUPPORT_XLANG); classInfoMap.put(cls, classInfo); } writeClassInternal(buffer, classInfo); @@ -1745,7 +1750,7 @@ private ClassInfo readClassInfoFromBytes( return classInfo; } - private ClassInfo loadBytesToClassInfo( + ClassInfo loadBytesToClassInfo( MetaStringBytes packageBytes, MetaStringBytes simpleClassNameBytes) { ClassNameBytes classNameBytes = new ClassNameBytes(packageBytes.hashCode, simpleClassNameBytes.hashCode); @@ -1775,8 +1780,8 @@ private ClassInfo populateBytesToClassInfo( simpleClassNameBytes, false, null, - null, - NO_CLASS_ID); + NO_CLASS_ID, + NOT_SUPPORT_XLANG); if (NonexistentClass.class.isAssignableFrom(TypeUtils.getComponentIfArray(cls))) { classInfo.serializer = NonexistentClassSerializers.getSerializer(fury, classSpec.entireClassName, cls); @@ -1791,39 +1796,10 @@ private ClassInfo populateBytesToClassInfo( return classInfo; } - public void xwriteClass(MemoryBuffer buffer, Class cls) { - metaStringResolver.writeMetaStringBytes(buffer, getOrUpdateClassInfo(cls).fullClassNameBytes); - } - - public void xwriteTypeTag(MemoryBuffer buffer, Class cls) { - metaStringResolver.writeMetaStringBytes(buffer, getOrUpdateClassInfo(cls).typeTagBytes); - } - - public Class xreadClass(MemoryBuffer buffer) { - MetaStringBytes byteString = metaStringResolver.readMetaStringBytes(buffer); - Class cls = classNameBytes2Class.get(byteString); - if (cls == null) { - Preconditions.checkNotNull(byteString); - String className = byteString.decode(Encoders.GENERIC_DECODER); - cls = loadClass(className); - classNameBytes2Class.put(byteString, cls); - } - currentReadClass = cls; - return cls; - } - - public String xreadClassName(MemoryBuffer buffer) { - return metaStringResolver.readMetaString(buffer); - } - public Class getCurrentReadClass() { return currentReadClass; } - private Class loadClass(String className) { - return loadClass(className, false, 0); - } - private Class loadClass(ClassSpec classSpec) { return 
loadClass(classSpec.entireClassName, classSpec.isEnum, classSpec.dimension); } @@ -1864,39 +1840,6 @@ public void resetRead() {} public void resetWrite() {} - public Class getClassByTypeId(short typeId) { - return typeIdToClassXLangMap.get(typeId); - } - - public Class readClassByTypeTag(MemoryBuffer buffer) { - String tag = metaStringResolver.readMetaString(buffer); - return typeTagToClassXLangMap.get(tag); - } - - private static class ClassNameBytes { - private final long packageHash; - private final long classNameHash; - - private ClassNameBytes(long packageHash, long classNameHash) { - this.packageHash = packageHash; - this.classNameHash = classNameHash; - } - - @Override - public boolean equals(Object o) { - // ClassNameBytes is used internally, skip - ClassNameBytes that = (ClassNameBytes) o; - return packageHash == that.packageHash && classNameHash == that.classNameHash; - } - - @Override - public int hashCode() { - int result = 31 + (int) (packageHash ^ (packageHash >>> 32)); - result = result * 31 + (int) (classNameHash ^ (classNameHash >>> 32)); - return result; - } - } - public GenericType buildGenericType(TypeRef typeRef) { return GenericType.build( typeRef.getType(), @@ -1910,15 +1853,26 @@ public GenericType buildGenericType(TypeRef typeRef) { } public GenericType buildGenericType(Type type) { - return GenericType.build( - type, - t -> { - if (t.getClass() == Class.class) { - return isMonomorphic((Class) t); - } else { - return isMonomorphic(getRawType(t)); - } - }); + GenericType genericType = extRegistry.genericTypes.get(type); + if (genericType != null) { + return genericType; + } + return populateGenericType(type); + } + + private GenericType populateGenericType(Type type) { + GenericType genericType = + GenericType.build( + type, + t -> { + if (t.getClass() == Class.class) { + return isMonomorphic((Class) t); + } else { + return isMonomorphic(getRawType(t)); + } + }); + extRegistry.genericTypes.put(type, genericType); + return genericType; } 
public GenericType getObjectGenericType() { @@ -1926,12 +1880,12 @@ public GenericType getObjectGenericType() { } public ClassInfo newClassInfo(Class cls, Serializer serializer, short classId) { - return new ClassInfo(this, cls, null, serializer, classId); + return new ClassInfo(this, cls, serializer, classId, NOT_SUPPORT_XLANG); } // Invoked by fury JIT. public ClassInfo nilClassInfo() { - return new ClassInfo(this, null, null, null, NO_CLASS_ID); + return new ClassInfo(this, null, null, NO_CLASS_ID, NOT_SUPPORT_XLANG); } public ClassInfoHolder nilClassInfoHolder() { diff --git a/java/fury-core/src/main/java/org/apache/fury/resolver/MetaStringBytes.java b/java/fury-core/src/main/java/org/apache/fury/resolver/MetaStringBytes.java index 1612bcaf00..94458c94e3 100644 --- a/java/fury-core/src/main/java/org/apache/fury/resolver/MetaStringBytes.java +++ b/java/fury-core/src/main/java/org/apache/fury/resolver/MetaStringBytes.java @@ -65,6 +65,7 @@ static MetaStringBytes of(MetaString metaString) { byte[] bytes = metaString.getBytes(); // Set seed to ensure hash is deterministic. long hashCode = MurmurHash3.murmurhash3_x64_128(bytes, 0, bytes.length, 47)[0]; + hashCode = Math.abs(hashCode); if (hashCode == 0) { // Ensure hashcode is not 0, so we can do some optimization to avoid boxing. hashCode += 256; // last byte is reserved for header. diff --git a/java/fury-core/src/main/java/org/apache/fury/resolver/MetaStringResolver.java b/java/fury-core/src/main/java/org/apache/fury/resolver/MetaStringResolver.java index af24ce8e37..1a6a3ee4e0 100644 --- a/java/fury-core/src/main/java/org/apache/fury/resolver/MetaStringResolver.java +++ b/java/fury-core/src/main/java/org/apache/fury/resolver/MetaStringResolver.java @@ -82,6 +82,7 @@ public void writeMetaStringBytesWithFlag(MemoryBuffer buffer, MetaStringBytes by } dynamicWrittenMetaString[id] = byteString; int length = byteString.bytes.length; + // last bit `1` indicates class is written by name instead of registered id. 
buffer.writeVarUint32Small7(length << 2 | 0b1); if (length > SMALL_STRING_THRESHOLD) { buffer.writeInt64(byteString.hashCode); @@ -90,6 +91,7 @@ public void writeMetaStringBytesWithFlag(MemoryBuffer buffer, MetaStringBytes by } buffer.writeBytes(byteString.bytes); } else { + // last bit `1` indicates class is written by name instead of registered id. buffer.writeVarUint32Small7(((id + 1) << 2) | 0b11); } } @@ -260,7 +262,8 @@ private MetaStringBytes createSmallMetaStringBytes(int len, byte encoding, long LittleEndian.putInt64(data, 0, v1); LittleEndian.putInt64(data, 8, v2); long hashCode = MurmurHash3.murmurhash3_x64_128(data, 0, len, 47)[0]; - hashCode = ((hashCode) & 0xffffffffffffff00L) | encoding; + hashCode = Math.abs(hashCode); + hashCode = (hashCode & 0xffffffffffffff00L) | encoding; MetaStringBytes metaStringBytes = new MetaStringBytes(Arrays.copyOf(data, len), hashCode); longLongMap.put(v1, v2, metaStringBytes); return metaStringBytes; diff --git a/java/fury-core/src/main/java/org/apache/fury/resolver/XtypeResolver.java b/java/fury-core/src/main/java/org/apache/fury/resolver/XtypeResolver.java new file mode 100644 index 0000000000..873d7d0e90 --- /dev/null +++ b/java/fury-core/src/main/java/org/apache/fury/resolver/XtypeResolver.java @@ -0,0 +1,494 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.fury.resolver; + +import static org.apache.fury.Fury.NOT_SUPPORT_XLANG; +import static org.apache.fury.meta.Encoders.GENERIC_ENCODER; +import static org.apache.fury.meta.Encoders.PACKAGE_DECODER; +import static org.apache.fury.meta.Encoders.PACKAGE_ENCODER; +import static org.apache.fury.meta.Encoders.TYPE_NAME_DECODER; +import static org.apache.fury.resolver.ClassResolver.NO_CLASS_ID; +import static org.apache.fury.type.TypeUtils.qualifiedName; + +import java.math.BigDecimal; +import java.math.BigInteger; +import java.sql.Timestamp; +import java.time.Duration; +import java.time.Instant; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.util.ArrayList; +import java.util.Date; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.Set; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; +import org.apache.fury.Fury; +import org.apache.fury.collection.IdentityMap; +import org.apache.fury.collection.LongMap; +import org.apache.fury.collection.ObjectMap; +import org.apache.fury.config.Config; +import org.apache.fury.exception.ClassUnregisteredException; +import org.apache.fury.exception.SerializerUnregisteredException; +import org.apache.fury.logging.Logger; +import org.apache.fury.logging.LoggerFactory; +import org.apache.fury.memory.MemoryBuffer; +import org.apache.fury.memory.Platform; +import org.apache.fury.meta.Encoders; +import org.apache.fury.meta.MetaString; +import org.apache.fury.reflect.ReflectionUtils; +import org.apache.fury.serializer.EnumSerializer; +import org.apache.fury.serializer.NonexistentClass; +import org.apache.fury.serializer.NonexistentClassSerializers; +import org.apache.fury.serializer.Serializer; +import 
org.apache.fury.serializer.Serializers; +import org.apache.fury.serializer.StructSerializer; +import org.apache.fury.serializer.collection.CollectionSerializer; +import org.apache.fury.serializer.collection.MapSerializer; +import org.apache.fury.type.GenericType; +import org.apache.fury.type.Generics; +import org.apache.fury.type.TypeUtils; +import org.apache.fury.type.Types; +import org.apache.fury.util.Preconditions; + +@SuppressWarnings({"unchecked", "rawtypes"}) +// TODO(chaokunyang) Abstract type resolver for java/xlang type resolution. +public class XtypeResolver { + private static final Logger LOG = LoggerFactory.getLogger(XtypeResolver.class); + + private static final float loadFactor = 0.5f; + // Most systems won't have so many types for serialization. + private static final int MAX_TYPE_ID = 4096; + + private final Config config; + private final Fury fury; + private final ClassResolver classResolver; + private final ClassInfoHolder classInfoCache = new ClassInfoHolder(ClassResolver.NIL_CLASS_INFO); + private final MetaStringResolver metaStringResolver; + // IdentityMap has better lookup performance, when loadFactor is 0.05f, performance is better + private final IdentityMap, ClassInfo> classInfoMap = new IdentityMap<>(64, loadFactor); + // Every deserialization for unregistered class will query it, performance is important. + private final ObjectMap compositeClassNameBytes2ClassInfo = + new ObjectMap<>(16, loadFactor); + private final ObjectMap qualifiedType2ClassInfo = + new ObjectMap<>(16, loadFactor); + private int xtypeIdGenerator = 64; + + // Use ClassInfo[] or LongMap? + // ClassInfo[] is faster, but we can't have bigger type id. 
+ private final LongMap xtypeIdToClassMap = new LongMap<>(8, loadFactor); + private final Set registeredTypeIds = new HashSet<>(); + private final Generics generics; + + public XtypeResolver(Fury fury) { + this.config = fury.getConfig(); + this.fury = fury; + this.classResolver = fury.getClassResolver(); + this.generics = fury.getGenerics(); + this.metaStringResolver = fury.getMetaStringResolver(); + registerDefaultTypes(); + } + + public void register(Class type) { + while (registeredTypeIds.contains(xtypeIdGenerator)) { + xtypeIdGenerator++; + } + register(type, xtypeIdGenerator++); + } + + public void register(Class type, int typeId) { + // ClassInfo[] has length of max type id. If the type id is too big, Fury will waste many + // memory. + // We can relax this limit in the future. + Preconditions.checkArgument(typeId < MAX_TYPE_ID, "Too big type id %s", typeId); + ClassInfo classInfo = classInfoMap.get(type); + Serializer serializer = null; + if (classInfo != null) { + serializer = classInfo.serializer; + if (classInfo.xtypeId != 0) { + throw new IllegalArgumentException( + String.format("Type %s has been registered with id %s", type, classInfo.xtypeId)); + } + String prevNamespace = classInfo.decodeNamespace(); + String prevTypeName = classInfo.decodeTypeName(); + if (!type.getSimpleName().equals(prevTypeName)) { + throw new IllegalArgumentException( + String.format( + "Type %s has been registered with namespace %s type %s", + type, prevNamespace, prevTypeName)); + } + } + int xtypeId = typeId; + if (type.isEnum()) { + xtypeId = (xtypeId << 8) + Types.ENUM; + } else { + if (serializer != null) { + if (serializer instanceof StructSerializer) { + xtypeId = (xtypeId << 8) + Types.STRUCT; + } else { + xtypeId = (xtypeId << 8) + Types.EXT; + } + } + } + register( + type, + serializer, + ReflectionUtils.getPackage(type), + ReflectionUtils.getClassNameWithoutPackage(type), + xtypeId); + } + + public void register(Class type, String namespace, String typeName) { + 
Preconditions.checkArgument( + !typeName.contains("."), + "Typename %s should not contains `.`, please put it into namespace", + typeName); + ClassInfo classInfo = classInfoMap.get(type); + Serializer serializer = null; + if (classInfo != null) { + serializer = classInfo.serializer; + if (classInfo.classNameBytes != null) { + String prevNamespace = classInfo.decodeNamespace(); + String prevTypeName = classInfo.decodeTypeName(); + if (!namespace.equals(prevNamespace) || typeName.equals(prevTypeName)) { + throw new IllegalArgumentException( + String.format( + "Type %s has been registered with namespace %s type %s", + type, prevNamespace, prevTypeName)); + } + } + } + short xtypeId; + if (serializer != null) { + if (serializer instanceof StructSerializer) { + xtypeId = Types.NAMED_STRUCT; + } else if (serializer instanceof EnumSerializer) { + xtypeId = Types.NAMED_ENUM; + } else { + xtypeId = Types.NAMED_EXT; + } + } else { + if (type.isEnum()) { + xtypeId = Types.NAMED_ENUM; + } else { + xtypeId = Types.NAMED_STRUCT; + } + } + register(type, serializer, namespace, typeName, xtypeId); + } + + private void register( + Class type, Serializer serializer, String namespace, String typeName, int xtypeId) { + ClassInfo classInfo = newClassInfo(type, serializer, namespace, typeName, (short) xtypeId); + qualifiedType2ClassInfo.put(qualifiedName(namespace, typeName), classInfo); + if (serializer == null) { + if (type.isEnum()) { + classInfo.serializer = new EnumSerializer(fury, (Class) type); + } else { + classInfo.serializer = new StructSerializer(fury, type); + } + } + classInfoMap.put(type, classInfo); + registeredTypeIds.add(xtypeId); + xtypeIdToClassMap.put(xtypeId, classInfo); + } + + private ClassInfo newClassInfo(Class type, Serializer serializer, short xtypeId) { + return newClassInfo( + type, + serializer, + ReflectionUtils.getPackage(type), + ReflectionUtils.getClassNameWithoutPackage(type), + xtypeId); + } + + private ClassInfo newClassInfo( + Class type, Serializer 
serializer, String namespace, String typeName, short xtypeId) { + MetaStringBytes fullClassNameBytes = + metaStringResolver.getOrCreateMetaStringBytes( + GENERIC_ENCODER.encode(type.getName(), MetaString.Encoding.UTF_8)); + MetaStringBytes nsBytes = + metaStringResolver.getOrCreateMetaStringBytes(Encoders.encodePackage(namespace)); + MetaStringBytes classNameBytes = + metaStringResolver.getOrCreateMetaStringBytes(Encoders.encodeTypeName(typeName)); + return new ClassInfo( + type, fullClassNameBytes, nsBytes, classNameBytes, false, serializer, NO_CLASS_ID, xtypeId); + } + + public void registerSerializer(Class type, Class serializerClass) { + ClassInfo classInfo = checkClassRegistration(type); + classInfo.serializer = Serializers.newSerializer(fury, type, serializerClass); + } + + public void registerSerializer(Class type, Serializer serializer) { + ClassInfo classInfo = checkClassRegistration(type); + classInfo.serializer = serializer; + } + + private ClassInfo checkClassRegistration(Class type) { + ClassInfo classInfo = classInfoMap.get(type); + Preconditions.checkArgument( + classInfo != null + && (classInfo.xtypeId != 0 || !type.getSimpleName().equals(classInfo.decodeTypeName())), + "Type %s should be registered with id or namespace+typename before register serializer", + type); + return classInfo; + } + + public ClassInfo getClassInfo(Class cls) { + ClassInfo classInfo = classInfoMap.get(cls); + if (classInfo == null) { + classInfo = buildClassInfo(cls); + } + return classInfo; + } + + public ClassInfo getClassInfo(Class cls, ClassInfoHolder classInfoHolder) { + ClassInfo classInfo = classInfoHolder.classInfo; + if (classInfo.getCls() != cls) { + classInfo = classInfoMap.get(cls); + if (classInfo == null) { + classInfo = buildClassInfo(cls); + } + classInfoHolder.classInfo = classInfo; + } + assert classInfo.serializer != null; + return classInfo; + } + + private ClassInfo buildClassInfo(Class cls) { + Serializer serializer; + int xtypeId; + if 
(classResolver.isSet(cls)) { + serializer = new CollectionSerializer(fury, cls); + xtypeId = Types.SET; + } else if (classResolver.isCollection(cls)) { + serializer = new CollectionSerializer(fury, cls); + xtypeId = Types.LIST; + } else if (cls.isArray() && !TypeUtils.getArrayComponent(cls).isPrimitive()) { + serializer = classResolver.getSerializer(cls); + xtypeId = Types.LIST; + } else if (classResolver.isMap(cls)) { + serializer = new MapSerializer(fury, cls); + xtypeId = Types.MAP; + } else { + Class enclosingClass = (Class) cls.getEnclosingClass(); + if (enclosingClass != null && enclosingClass.isEnum()) { + serializer = new EnumSerializer(fury, (Class) cls); + xtypeId = getClassInfo(enclosingClass).xtypeId; + } else { + throw new ClassUnregisteredException(cls); + } + } + ClassInfo info = newClassInfo(cls, serializer, (short) xtypeId); + classInfoMap.put(cls, info); + return info; + } + + private void registerDefaultTypes() { + registerDefaultTypes(Types.BOOL, Boolean.class, boolean.class, AtomicBoolean.class); + registerDefaultTypes(Types.INT8, Byte.class, byte.class); + registerDefaultTypes(Types.INT16, Short.class, short.class); + registerDefaultTypes(Types.INT32, Integer.class, int.class, AtomicInteger.class); + registerDefaultTypes(Types.INT64, Long.class, long.class, AtomicLong.class); + registerDefaultTypes(Types.FLOAT32, Float.class, float.class); + registerDefaultTypes(Types.FLOAT64, Double.class, double.class); + registerDefaultTypes(Types.STRING, String.class, StringBuilder.class, StringBuffer.class); + registerDefaultTypes(Types.DURATION, Duration.class); + registerDefaultTypes( + Types.TIMESTAMP, + Instant.class, + Date.class, + java.sql.Date.class, + Timestamp.class, + LocalDateTime.class); + registerDefaultTypes(Types.DECIMAL, BigDecimal.class, BigInteger.class); + registerDefaultTypes( + Types.BINARY, + byte[].class, + Platform.HEAP_BYTE_BUFFER_CLASS, + Platform.DIRECT_BYTE_BUFFER_CLASS); + registerDefaultTypes(Types.BOOL_ARRAY, 
boolean[].class); + registerDefaultTypes(Types.INT16_ARRAY, short[].class); + registerDefaultTypes(Types.INT32_ARRAY, int[].class); + registerDefaultTypes(Types.INT64_ARRAY, long[].class); + registerDefaultTypes(Types.FLOAT32_ARRAY, float[].class); + registerDefaultTypes(Types.FLOAT64_ARRAY, double[].class); + registerDefaultTypes(Types.LIST, ArrayList.class, Object[].class); + registerDefaultTypes(Types.SET, HashSet.class, LinkedHashSet.class); + registerDefaultTypes(Types.MAP, HashMap.class, LinkedHashMap.class); + registerDefaultTypes(Types.LOCAL_DATE, LocalDate.class); + } + + private void registerDefaultTypes(int xtypeId, Class defaultType, Class... otherTypes) { + ClassInfo classInfo = + newClassInfo(defaultType, classResolver.getSerializer(defaultType), (short) xtypeId); + classInfoMap.put(defaultType, classInfo); + xtypeIdToClassMap.put(xtypeId, classInfo); + for (Class otherType : otherTypes) { + classInfo = newClassInfo(otherType, classResolver.getSerializer(otherType), (short) xtypeId); + classInfoMap.put(otherType, classInfo); + } + } + + public ClassInfo writeClassInfo(MemoryBuffer buffer, Object obj) { + ClassInfo classInfo = getClassInfo(obj.getClass(), classInfoCache); + int xtypeId = classInfo.getXtypeId(); + byte internalTypeId = (byte) xtypeId; + buffer.writeVarUint32Small7(xtypeId); + switch (internalTypeId) { + case Types.NAMED_ENUM: + case Types.NAMED_STRUCT: + case Types.NAMED_COMPATIBLE_STRUCT: + case Types.NAMED_POLYMORPHIC_STRUCT: + case Types.NAMED_POLYMORPHIC_COMPATIBLE_STRUCT: + case Types.NAMED_EXT: + case Types.NAMED_POLYMORPHIC_EXT: + assert classInfo.packageNameBytes != null; + metaStringResolver.writeMetaStringBytes(buffer, classInfo.packageNameBytes); + assert classInfo.classNameBytes != null; + metaStringResolver.writeMetaStringBytes(buffer, classInfo.classNameBytes); + break; + default: + break; + } + return classInfo; + } + + public ClassInfo readClassInfo(MemoryBuffer buffer) { + long xtypeId = buffer.readVarUint32Small14(); + 
byte internalTypeId = (byte) xtypeId; + switch (internalTypeId) { + case Types.NAMED_ENUM: + case Types.NAMED_STRUCT: + case Types.NAMED_COMPATIBLE_STRUCT: + case Types.NAMED_POLYMORPHIC_STRUCT: + case Types.NAMED_POLYMORPHIC_COMPATIBLE_STRUCT: + case Types.NAMED_EXT: + case Types.NAMED_POLYMORPHIC_EXT: + MetaStringBytes packageBytes = metaStringResolver.readMetaStringBytes(buffer); + MetaStringBytes simpleClassNameBytes = metaStringResolver.readMetaStringBytes(buffer); + return loadBytesToClassInfo(internalTypeId, packageBytes, simpleClassNameBytes); + case Types.LIST: + return getListClassInfo(); + case Types.TIMESTAMP: + return getGenericClassInfo(); + default: + return xtypeIdToClassMap.get(xtypeId); + } + } + + private ClassInfo getListClassInfo() { + fury.incDepth(1); + GenericType genericType = generics.nextGenericType(); + fury.incDepth(-1); + if (genericType != null) { + Class cls = genericType.getCls(); + if (cls.isArray()) { + return classResolver.getClassInfo(cls); + } + } + return xtypeIdToClassMap.get(Types.LIST); + } + + private ClassInfo getGenericClassInfo() { + fury.incDepth(1); + GenericType genericType = generics.nextGenericType(); + fury.incDepth(-1); + if (genericType != null) { + return classResolver.getClassInfo(genericType.getCls()); + } + return xtypeIdToClassMap.get(Types.TIMESTAMP); + } + + private ClassInfo loadBytesToClassInfo( + int internalTypeId, MetaStringBytes packageBytes, MetaStringBytes simpleClassNameBytes) { + ClassNameBytes classNameBytes = + new ClassNameBytes(packageBytes.hashCode, simpleClassNameBytes.hashCode); + ClassInfo classInfo = compositeClassNameBytes2ClassInfo.get(classNameBytes); + if (classInfo == null) { + classInfo = + populateBytesToClassInfo( + internalTypeId, classNameBytes, packageBytes, simpleClassNameBytes); + } + return classInfo; + } + + private ClassInfo populateBytesToClassInfo( + int typeId, + ClassNameBytes classNameBytes, + MetaStringBytes packageBytes, + MetaStringBytes simpleClassNameBytes) { + 
String namespace = packageBytes.decode(PACKAGE_DECODER); + String typeName = simpleClassNameBytes.decode(TYPE_NAME_DECODER); + String qualifiedName = qualifiedName(namespace, typeName); + ClassInfo classInfo = qualifiedType2ClassInfo.get(qualifiedName); + if (classInfo == null) { + String msg = String.format("Class %s not registered", qualifiedName); + Class type = null; + if (config.deserializeNonexistentClass()) { + LOG.warn(msg); + switch (typeId) { + case Types.NAMED_ENUM: + case Types.NAMED_STRUCT: + case Types.NAMED_COMPATIBLE_STRUCT: + case Types.NAMED_POLYMORPHIC_STRUCT: + case Types.NAMED_POLYMORPHIC_COMPATIBLE_STRUCT: + type = + NonexistentClass.getNonexistentClass( + qualifiedName, isEnum(typeId), 0, config.isMetaShareEnabled()); + break; + case Types.NAMED_EXT: + throw new SerializerUnregisteredException(qualifiedName); + default: + break; + } + } else { + throw new ClassUnregisteredException(qualifiedName); + } + MetaStringBytes fullClassNameBytes = + metaStringResolver.getOrCreateMetaStringBytes( + PACKAGE_ENCODER.encode(qualifiedName, MetaString.Encoding.UTF_8)); + classInfo = + new ClassInfo( + type, + fullClassNameBytes, + packageBytes, + simpleClassNameBytes, + false, + null, + NO_CLASS_ID, + NOT_SUPPORT_XLANG); + if (NonexistentClass.class.isAssignableFrom(TypeUtils.getComponentIfArray(type))) { + classInfo.serializer = NonexistentClassSerializers.getSerializer(fury, qualifiedName, type); + } + } + compositeClassNameBytes2ClassInfo.put(classNameBytes, classInfo); + return classInfo; + } + + private boolean isEnum(int internalTypeId) { + return internalTypeId == Types.ENUM || internalTypeId == Types.NAMED_ENUM; + } +} diff --git a/java/fury-core/src/main/java/org/apache/fury/serializer/ArraySerializers.java b/java/fury-core/src/main/java/org/apache/fury/serializer/ArraySerializers.java index c38b2bae96..1d091894a1 100644 --- a/java/fury-core/src/main/java/org/apache/fury/serializer/ArraySerializers.java +++ 
b/java/fury-core/src/main/java/org/apache/fury/serializer/ArraySerializers.java @@ -32,8 +32,9 @@ import org.apache.fury.resolver.RefResolver; import org.apache.fury.serializer.collection.CollectionFlags; import org.apache.fury.serializer.collection.FuryArrayAsListSerializer; -import org.apache.fury.type.Type; +import org.apache.fury.type.GenericType; import org.apache.fury.type.TypeUtils; +import org.apache.fury.type.Types; import org.apache.fury.util.Preconditions; /** Serializers for array types. */ @@ -46,6 +47,7 @@ public static final class ObjectArraySerializer extends Serializer { private final Serializer componentTypeSerializer; private final ClassInfoHolder classInfoHolder; private final int[] stubDims; + private final GenericType componentGenericType; public ObjectArraySerializer(Fury fury, Class cls) { super(fury, cls); @@ -63,6 +65,7 @@ public ObjectArraySerializer(Fury fury, Class cls) { } this.innerType = (Class) innerType; Class componentType = cls.getComponentType(); + componentGenericType = fury.getClassResolver().buildGenericType(componentType); if (fury.getClassResolver().isMonomorphic(componentType)) { this.componentTypeSerializer = fury.getClassResolver().getSerializer(componentType); } else { @@ -73,11 +76,6 @@ public ObjectArraySerializer(Fury fury, Class cls) { classInfoHolder = fury.getClassResolver().nilClassInfoHolder(); } - @Override - public short getXtypeId() { - return (short) -Type.LIST.getId(); - } - @Override public void write(MemoryBuffer buffer, T[] arr) { int len = arr.length; @@ -187,9 +185,12 @@ public T[] read(MemoryBuffer buffer) { public T[] xread(MemoryBuffer buffer) { int numElements = buffer.readVarUint32Small7(); Object[] value = newArray(numElements); + fury.getGenerics().pushGenericType(componentGenericType); for (int i = 0; i < numElements; i++) { - value[i] = fury.xreadRef(buffer); + Object x = fury.xreadRef(buffer); + value[i] = x; } + fury.getGenerics().popGenericType(); return (T[]) value; } @@ -249,7 +250,7 @@ 
public abstract static class PrimitiveArraySerializer protected final int elemSize; public PrimitiveArraySerializer(Fury fury, Class cls) { - super(fury, cls, (short) primitiveInfo.get(TypeUtils.getArrayComponentInfo(cls).f0)[2]); + super(fury, cls); Class innerType = TypeUtils.getArrayComponentInfo(cls).f0; this.offset = primitiveInfo.get(innerType)[0]; this.elemSize = primitiveInfo.get(innerType)[1]; @@ -386,11 +387,6 @@ public char[] read(MemoryBuffer buffer) { } } - @Override - public short getXtypeId() { - return Fury.NOT_SUPPORT_CROSS_LANGUAGE; - } - @Override public void xwrite(MemoryBuffer buffer, char[] value) { throw new UnsupportedOperationException(); @@ -620,11 +616,6 @@ public StringArraySerializer(Fury fury) { list = new FuryArrayAsListSerializer.ArrayAsList(0); } - @Override - public short getXtypeId() { - return (short) -Type.FURY_STRING_ARRAY.getId(); - } - @Override public void write(MemoryBuffer buffer, String[] value) { int len = value.length; @@ -690,7 +681,7 @@ public void xwrite(MemoryBuffer buffer, String[] value) { buffer.writeVarUint32Small7(len); for (String elem : value) { if (elem != null) { - buffer.writeByte(Fury.REF_VALUE_FLAG); + buffer.writeByte(Fury.NOT_NULL_VALUE_FLAG); stringSerializer.writeUTF8String(buffer, elem); } else { buffer.writeByte(Fury.NULL_FLAG); @@ -703,7 +694,7 @@ public String[] xread(MemoryBuffer buffer) { int numElements = buffer.readVarUint32Small7(); String[] value = new String[numElements]; for (int i = 0; i < numElements; i++) { - if (buffer.readByte() == Fury.REF_VALUE_FLAG) { + if (buffer.readByte() >= Fury.NOT_NULL_VALUE_FLAG) { value[i] = stringSerializer.readUTF8String(buffer); } else { value[i] = null; @@ -714,26 +705,29 @@ public String[] xread(MemoryBuffer buffer) { } public static void registerDefaultSerializers(Fury fury) { - fury.registerSerializer(Object[].class, new ObjectArraySerializer<>(fury, Object[].class)); - fury.registerSerializer(Class[].class, new ObjectArraySerializer<>(fury, 
Class[].class)); - fury.registerSerializer(byte[].class, new ByteArraySerializer(fury)); - fury.registerSerializer(Byte[].class, new ObjectArraySerializer<>(fury, Byte[].class)); - fury.registerSerializer(char[].class, new CharArraySerializer(fury)); - fury.registerSerializer( + ClassResolver resolver = fury.getClassResolver(); + resolver.registerSerializer(Object[].class, new ObjectArraySerializer<>(fury, Object[].class)); + resolver.registerSerializer(Class[].class, new ObjectArraySerializer<>(fury, Class[].class)); + resolver.registerSerializer(byte[].class, new ByteArraySerializer(fury)); + resolver.registerSerializer(Byte[].class, new ObjectArraySerializer<>(fury, Byte[].class)); + resolver.registerSerializer(char[].class, new CharArraySerializer(fury)); + resolver.registerSerializer( Character[].class, new ObjectArraySerializer<>(fury, Character[].class)); - fury.registerSerializer(short[].class, new ShortArraySerializer(fury)); - fury.registerSerializer(Short[].class, new ObjectArraySerializer<>(fury, Short[].class)); - fury.registerSerializer(int[].class, new IntArraySerializer(fury)); - fury.registerSerializer(Integer[].class, new ObjectArraySerializer<>(fury, Integer[].class)); - fury.registerSerializer(long[].class, new LongArraySerializer(fury)); - fury.registerSerializer(Long[].class, new ObjectArraySerializer<>(fury, Long[].class)); - fury.registerSerializer(float[].class, new FloatArraySerializer(fury)); - fury.registerSerializer(Float[].class, new ObjectArraySerializer<>(fury, Float[].class)); - fury.registerSerializer(double[].class, new DoubleArraySerializer(fury)); - fury.registerSerializer(Double[].class, new ObjectArraySerializer<>(fury, Double[].class)); - fury.registerSerializer(boolean[].class, new BooleanArraySerializer(fury)); - fury.registerSerializer(Boolean[].class, new ObjectArraySerializer<>(fury, Boolean[].class)); - fury.registerSerializer(String[].class, new StringArraySerializer(fury)); + resolver.registerSerializer(short[].class, 
new ShortArraySerializer(fury)); + resolver.registerSerializer(Short[].class, new ObjectArraySerializer<>(fury, Short[].class)); + resolver.registerSerializer(int[].class, new IntArraySerializer(fury)); + resolver.registerSerializer( + Integer[].class, new ObjectArraySerializer<>(fury, Integer[].class)); + resolver.registerSerializer(long[].class, new LongArraySerializer(fury)); + resolver.registerSerializer(Long[].class, new ObjectArraySerializer<>(fury, Long[].class)); + resolver.registerSerializer(float[].class, new FloatArraySerializer(fury)); + resolver.registerSerializer(Float[].class, new ObjectArraySerializer<>(fury, Float[].class)); + resolver.registerSerializer(double[].class, new DoubleArraySerializer(fury)); + resolver.registerSerializer(Double[].class, new ObjectArraySerializer<>(fury, Double[].class)); + resolver.registerSerializer(boolean[].class, new BooleanArraySerializer(fury)); + resolver.registerSerializer( + Boolean[].class, new ObjectArraySerializer<>(fury, Boolean[].class)); + resolver.registerSerializer(String[].class, new StringArraySerializer(fury)); } // ########################## utils ########################## @@ -757,25 +751,16 @@ public static PrimitiveArrayBufferObject byteArrayBufferObject(byte[] array) { static { primitiveInfo.put( - boolean.class, - new int[] {Platform.BOOLEAN_ARRAY_OFFSET, 1, Type.FURY_PRIMITIVE_BOOL_ARRAY.getId()}); - primitiveInfo.put(byte.class, new int[] {Platform.BYTE_ARRAY_OFFSET, 1, Type.BINARY.getId()}); - primitiveInfo.put( - char.class, new int[] {Platform.CHAR_ARRAY_OFFSET, 2, Fury.NOT_SUPPORT_CROSS_LANGUAGE}); - primitiveInfo.put( - short.class, - new int[] {Platform.SHORT_ARRAY_OFFSET, 2, Type.FURY_PRIMITIVE_SHORT_ARRAY.getId()}); - primitiveInfo.put( - int.class, new int[] {Platform.INT_ARRAY_OFFSET, 4, Type.FURY_PRIMITIVE_INT_ARRAY.getId()}); - primitiveInfo.put( - long.class, - new int[] {Platform.LONG_ARRAY_OFFSET, 8, Type.FURY_PRIMITIVE_LONG_ARRAY.getId()}); + boolean.class, new int[] 
{Platform.BOOLEAN_ARRAY_OFFSET, 1, Types.BOOL_ARRAY}); + primitiveInfo.put(byte.class, new int[] {Platform.BYTE_ARRAY_OFFSET, 1, Types.BINARY}); primitiveInfo.put( - float.class, - new int[] {Platform.FLOAT_ARRAY_OFFSET, 4, Type.FURY_PRIMITIVE_FLOAT_ARRAY.getId()}); + char.class, new int[] {Platform.CHAR_ARRAY_OFFSET, 2, Fury.NOT_SUPPORT_XLANG}); + primitiveInfo.put(short.class, new int[] {Platform.SHORT_ARRAY_OFFSET, 2, Types.INT16_ARRAY}); + primitiveInfo.put(int.class, new int[] {Platform.INT_ARRAY_OFFSET, 4, Types.INT32_ARRAY}); + primitiveInfo.put(long.class, new int[] {Platform.LONG_ARRAY_OFFSET, 8, Types.INT64_ARRAY}); + primitiveInfo.put(float.class, new int[] {Platform.FLOAT_ARRAY_OFFSET, 4, Types.FLOAT32_ARRAY}); primitiveInfo.put( - double.class, - new int[] {Platform.DOUBLE_ARRAY_OFFSET, 8, Type.FURY_PRIMITIVE_DOUBLE_ARRAY.getId()}); + double.class, new int[] {Platform.DOUBLE_ARRAY_OFFSET, 8, Types.FLOAT64_ARRAY}); } public abstract static class AbstractedNonexistentArrayClassSerializer extends Serializer { diff --git a/java/fury-core/src/main/java/org/apache/fury/serializer/BufferSerializers.java b/java/fury-core/src/main/java/org/apache/fury/serializer/BufferSerializers.java index 17ca6c3612..6d35e18a23 100644 --- a/java/fury-core/src/main/java/org/apache/fury/serializer/BufferSerializers.java +++ b/java/fury-core/src/main/java/org/apache/fury/serializer/BufferSerializers.java @@ -23,7 +23,6 @@ import java.nio.ByteOrder; import org.apache.fury.Fury; import org.apache.fury.memory.MemoryBuffer; -import org.apache.fury.type.Type; /** Serializers for buffer related classes. 
*/ public class BufferSerializers { @@ -36,7 +35,7 @@ public static final class ByteBufferSerializer extends Serializers.CrossLanguageCompatibleSerializer { public ByteBufferSerializer(Fury fury, Class cls) { - super(fury, cls, Type.FURY_BUFFER.getId()); + super(fury, cls); } @Override diff --git a/java/fury-core/src/main/java/org/apache/fury/serializer/EnumSerializer.java b/java/fury-core/src/main/java/org/apache/fury/serializer/EnumSerializer.java index 7a0f78dc13..a2f9a420ba 100644 --- a/java/fury-core/src/main/java/org/apache/fury/serializer/EnumSerializer.java +++ b/java/fury-core/src/main/java/org/apache/fury/serializer/EnumSerializer.java @@ -100,6 +100,20 @@ public Enum read(MemoryBuffer buffer) { } } + @Override + public void xwrite(MemoryBuffer buffer, Enum value) { + buffer.writeVarUint32Small7(value.ordinal()); + } + + @Override + public Enum xread(MemoryBuffer buffer) { + int value = buffer.readVarUint32Small7(); + if (value >= enumConstants.length) { + return handleNonexistentEnumValue(value); + } + return enumConstants[value]; + } + private Enum handleNonexistentEnumValue(int value) { if (fury.getConfig().deserializeNonexistentEnumValueAsNull()) { return null; diff --git a/java/fury-core/src/main/java/org/apache/fury/serializer/FuryCopyableSerializer.java b/java/fury-core/src/main/java/org/apache/fury/serializer/FuryCopyableSerializer.java index a17c35be91..a720b44be0 100644 --- a/java/fury-core/src/main/java/org/apache/fury/serializer/FuryCopyableSerializer.java +++ b/java/fury-core/src/main/java/org/apache/fury/serializer/FuryCopyableSerializer.java @@ -48,25 +48,6 @@ public T read(MemoryBuffer buffer) { return serializer.read(buffer); } - /** - * Returns {@link Fury#NOT_SUPPORT_CROSS_LANGUAGE} if the serializer doesn't support - * cross-language serialization. Return a number in range (0, 32767) if the serializer support - * cross-language serialization and native serialization data is the same with cross-language - * serialization. 
Return a negative short in range [-32768, 0) if the serializer support - * cross-language serialization and native serialization data is not the same with cross-language - * serialization. - */ - @Override - public short getXtypeId() { - return serializer.getXtypeId(); - } - - /** Returns a type tag used for setup type mapping between languages. */ - @Override - public String getCrossLanguageTypeTag() { - return serializer.getCrossLanguageTypeTag(); - } - @Override public void xwrite(MemoryBuffer buffer, T value) { serializer.xwrite(buffer, value); diff --git a/java/fury-core/src/main/java/org/apache/fury/serializer/OptionalSerializers.java b/java/fury-core/src/main/java/org/apache/fury/serializer/OptionalSerializers.java index 1fa48db8fd..30052288ef 100644 --- a/java/fury-core/src/main/java/org/apache/fury/serializer/OptionalSerializers.java +++ b/java/fury-core/src/main/java/org/apache/fury/serializer/OptionalSerializers.java @@ -25,6 +25,7 @@ import java.util.OptionalLong; import org.apache.fury.Fury; import org.apache.fury.memory.MemoryBuffer; +import org.apache.fury.resolver.ClassResolver; /** * Serializers for {@link Optional}, {@link OptionalInt}, {@link OptionalLong} and {@link @@ -130,9 +131,10 @@ public OptionalDouble read(MemoryBuffer buffer) { } public static void registerDefaultSerializers(Fury fury) { - fury.registerSerializer(Optional.class, new OptionalSerializer(fury)); - fury.registerSerializer(OptionalInt.class, new OptionalIntSerializer(fury)); - fury.registerSerializer(OptionalLong.class, new OptionalLongSerializer(fury)); - fury.registerSerializer(OptionalDouble.class, new OptionalDoubleSerializer(fury)); + ClassResolver resolver = fury.getClassResolver(); + resolver.registerSerializer(Optional.class, new OptionalSerializer(fury)); + resolver.registerSerializer(OptionalInt.class, new OptionalIntSerializer(fury)); + resolver.registerSerializer(OptionalLong.class, new OptionalLongSerializer(fury)); + 
resolver.registerSerializer(OptionalDouble.class, new OptionalDoubleSerializer(fury)); } } diff --git a/java/fury-core/src/main/java/org/apache/fury/serializer/PrimitiveSerializers.java b/java/fury-core/src/main/java/org/apache/fury/serializer/PrimitiveSerializers.java index 1b386b8795..71954f19de 100644 --- a/java/fury-core/src/main/java/org/apache/fury/serializer/PrimitiveSerializers.java +++ b/java/fury-core/src/main/java/org/apache/fury/serializer/PrimitiveSerializers.java @@ -27,7 +27,7 @@ import org.apache.fury.config.LongEncoding; import org.apache.fury.memory.MemoryBuffer; import org.apache.fury.memory.Platform; -import org.apache.fury.type.Type; +import org.apache.fury.resolver.ClassResolver; import org.apache.fury.util.Preconditions; /** Serializers for java primitive types. */ @@ -36,12 +36,7 @@ public class PrimitiveSerializers { public static final class BooleanSerializer extends Serializers.CrossLanguageCompatibleSerializer { public BooleanSerializer(Fury fury, Class cls) { - super( - fury, - (Class) cls, - Type.BOOL.getId(), - !(cls.isPrimitive() || fury.isBasicTypesRefIgnored()), - true); + super(fury, (Class) cls, !(cls.isPrimitive() || fury.isBasicTypesRefIgnored()), true); } @Override @@ -58,12 +53,7 @@ public Boolean read(MemoryBuffer buffer) { public static final class ByteSerializer extends Serializers.CrossLanguageCompatibleSerializer { public ByteSerializer(Fury fury, Class cls) { - super( - fury, - (Class) cls, - Type.INT8.getId(), - !(cls.isPrimitive() || fury.isBasicTypesRefIgnored()), - true); + super(fury, (Class) cls, !(cls.isPrimitive() || fury.isBasicTypesRefIgnored()), true); } @Override @@ -82,11 +72,6 @@ public Uint8Serializer(Fury fury) { super(fury, Integer.class); } - @Override - public short getXtypeId() { - return Type.UINT8.getId(); - } - @Override public void xwrite(MemoryBuffer buffer, Integer value) { Preconditions.checkArgument(value >= 0 && value <= 255); @@ -105,11 +90,6 @@ public Uint16Serializer(Fury fury) { 
super(fury, Integer.class); } - @Override - public short getXtypeId() { - return Type.UINT16.getId(); - } - @Override public void xwrite(MemoryBuffer buffer, Integer value) { Preconditions.checkArgument(value >= 0 && value <= 65535); @@ -142,12 +122,7 @@ public Character read(MemoryBuffer buffer) { public static final class ShortSerializer extends Serializers.CrossLanguageCompatibleSerializer { public ShortSerializer(Fury fury, Class cls) { - super( - fury, - (Class) cls, - Type.INT16.getId(), - !(cls.isPrimitive() || fury.isBasicTypesRefIgnored()), - true); + super(fury, (Class) cls, !(cls.isPrimitive() || fury.isBasicTypesRefIgnored()), true); } @Override @@ -166,12 +141,7 @@ public static final class IntSerializer private final boolean compressNumber; public IntSerializer(Fury fury, Class cls) { - super( - fury, - (Class) cls, - Type.INT32.getId(), - !(cls.isPrimitive() || fury.isBasicTypesRefIgnored()), - true); + super(fury, (Class) cls, !(cls.isPrimitive() || fury.isBasicTypesRefIgnored()), true); compressNumber = fury.compressInt(); } @@ -196,12 +166,12 @@ public Integer read(MemoryBuffer buffer) { @Override public void xwrite(MemoryBuffer buffer, Integer value) { // TODO support varint in cross-language serialization - buffer.writeInt32(value); + buffer.writeVarInt32(value); } @Override public Integer xread(MemoryBuffer buffer) { - return buffer.readInt32(); + return buffer.readVarInt32(); } } @@ -210,12 +180,7 @@ public static final class LongSerializer private final LongEncoding longEncoding; public LongSerializer(Fury fury, Class cls) { - super( - fury, - (Class) cls, - Type.INT64.getId(), - !(cls.isPrimitive() || fury.isBasicTypesRefIgnored()), - true); + super(fury, (Class) cls, !(cls.isPrimitive() || fury.isBasicTypesRefIgnored()), true); longEncoding = fury.longEncoding(); } @@ -282,25 +247,20 @@ public static String readLongFunc(LongEncoding longEncoding) { @Override public void xwrite(MemoryBuffer buffer, Long value) { - // TODO support var long in 
cross-language serialization - buffer.writeInt64(value); + // TODO(chaokunyang) support var long in cross-language serialization + buffer.writeVarInt64(value); } @Override public Long xread(MemoryBuffer buffer) { - return buffer.readInt64(); + return buffer.readVarInt64(); } } public static final class FloatSerializer extends Serializers.CrossLanguageCompatibleSerializer { public FloatSerializer(Fury fury, Class cls) { - super( - fury, - (Class) cls, - Type.FLOAT.getId(), - !(cls.isPrimitive() || fury.isBasicTypesRefIgnored()), - true); + super(fury, (Class) cls, !(cls.isPrimitive() || fury.isBasicTypesRefIgnored()), true); } @Override @@ -317,12 +277,7 @@ public Float read(MemoryBuffer buffer) { public static final class DoubleSerializer extends Serializers.CrossLanguageCompatibleSerializer { public DoubleSerializer(Fury fury, Class cls) { - super( - fury, - (Class) cls, - Type.DOUBLE.getId(), - !(cls.isPrimitive() || fury.isBasicTypesRefIgnored()), - true); + super(fury, (Class) cls, !(cls.isPrimitive() || fury.isBasicTypesRefIgnored()), true); } @Override @@ -338,21 +293,22 @@ public Double read(MemoryBuffer buffer) { public static void registerDefaultSerializers(Fury fury) { // primitive types will be boxed. 
- fury.registerSerializer(boolean.class, new BooleanSerializer(fury, boolean.class)); - fury.registerSerializer(byte.class, new ByteSerializer(fury, byte.class)); - fury.registerSerializer(short.class, new ShortSerializer(fury, short.class)); - fury.registerSerializer(char.class, new CharSerializer(fury, char.class)); - fury.registerSerializer(int.class, new IntSerializer(fury, int.class)); - fury.registerSerializer(long.class, new LongSerializer(fury, long.class)); - fury.registerSerializer(float.class, new FloatSerializer(fury, float.class)); - fury.registerSerializer(double.class, new DoubleSerializer(fury, double.class)); - fury.registerSerializer(Boolean.class, new BooleanSerializer(fury, Boolean.class)); - fury.registerSerializer(Byte.class, new ByteSerializer(fury, Byte.class)); - fury.registerSerializer(Short.class, new ShortSerializer(fury, Short.class)); - fury.registerSerializer(Character.class, new CharSerializer(fury, Character.class)); - fury.registerSerializer(Integer.class, new IntSerializer(fury, Integer.class)); - fury.registerSerializer(Long.class, new LongSerializer(fury, Long.class)); - fury.registerSerializer(Float.class, new FloatSerializer(fury, Float.class)); - fury.registerSerializer(Double.class, new DoubleSerializer(fury, Double.class)); + ClassResolver resolver = fury.getClassResolver(); + resolver.registerSerializer(boolean.class, new BooleanSerializer(fury, boolean.class)); + resolver.registerSerializer(byte.class, new ByteSerializer(fury, byte.class)); + resolver.registerSerializer(short.class, new ShortSerializer(fury, short.class)); + resolver.registerSerializer(char.class, new CharSerializer(fury, char.class)); + resolver.registerSerializer(int.class, new IntSerializer(fury, int.class)); + resolver.registerSerializer(long.class, new LongSerializer(fury, long.class)); + resolver.registerSerializer(float.class, new FloatSerializer(fury, float.class)); + resolver.registerSerializer(double.class, new DoubleSerializer(fury, 
double.class)); + resolver.registerSerializer(Boolean.class, new BooleanSerializer(fury, Boolean.class)); + resolver.registerSerializer(Byte.class, new ByteSerializer(fury, Byte.class)); + resolver.registerSerializer(Short.class, new ShortSerializer(fury, Short.class)); + resolver.registerSerializer(Character.class, new CharSerializer(fury, Character.class)); + resolver.registerSerializer(Integer.class, new IntSerializer(fury, Integer.class)); + resolver.registerSerializer(Long.class, new LongSerializer(fury, Long.class)); + resolver.registerSerializer(Float.class, new FloatSerializer(fury, Float.class)); + resolver.registerSerializer(Double.class, new DoubleSerializer(fury, Double.class)); } } diff --git a/java/fury-core/src/main/java/org/apache/fury/serializer/Serializer.java b/java/fury-core/src/main/java/org/apache/fury/serializer/Serializer.java index 819908920b..720120acf1 100644 --- a/java/fury-core/src/main/java/org/apache/fury/serializer/Serializer.java +++ b/java/fury-core/src/main/java/org/apache/fury/serializer/Serializer.java @@ -27,8 +27,7 @@ /** * Serialize/deserializer objects into binary. Note that this class is designed as an abstract class - * instead of interface to reduce virtual method call cost of {@link #needToWriteRef}/{@link - * #getXtypeId}. + * instead of interface to reduce virtual method call cost of {@link #needToWriteRef}. * * @param type of objects being serializing/deserializing */ @@ -63,23 +62,6 @@ public T read(MemoryBuffer buffer) { throw new UnsupportedOperationException(); } - /** - * Returns {@link Fury#NOT_SUPPORT_CROSS_LANGUAGE} if the serializer doesn't support - * cross-language serialization. Return a number in range (0, 32767) if the serializer support - * cross-language serialization and native serialization data is the same with cross-language - * serialization. 
Return a negative short in range [-32768, 0) if the serializer support - * cross-language serialization and native serialization data is not the same with cross-language - * serialization. - */ - public short getXtypeId() { - return Fury.NOT_SUPPORT_CROSS_LANGUAGE; - } - - /** Returns a type tag used for setup type mapping between languages. */ - public String getCrossLanguageTypeTag() { - throw new UnsupportedOperationException(); - } - public void xwrite(MemoryBuffer buffer, T value) { throw new UnsupportedOperationException(); } diff --git a/java/fury-core/src/main/java/org/apache/fury/serializer/Serializers.java b/java/fury-core/src/main/java/org/apache/fury/serializer/Serializers.java index 61e7574ef5..e5b01225c1 100644 --- a/java/fury-core/src/main/java/org/apache/fury/serializer/Serializers.java +++ b/java/fury-core/src/main/java/org/apache/fury/serializer/Serializers.java @@ -48,7 +48,6 @@ import org.apache.fury.memory.Platform; import org.apache.fury.reflect.ReflectionUtils; import org.apache.fury.resolver.ClassResolver; -import org.apache.fury.type.Type; import org.apache.fury.util.ExceptionUtils; import org.apache.fury.util.GraalvmSupport; import org.apache.fury.util.StringUtils; @@ -178,28 +177,18 @@ public static Object readPrimitiveValue(Fury fury, MemoryBuffer buffer, short cl } public abstract static class CrossLanguageCompatibleSerializer extends Serializer { - private final short typeId; - public CrossLanguageCompatibleSerializer(Fury fury, Class cls, short typeId) { + public CrossLanguageCompatibleSerializer(Fury fury, Class cls) { super(fury, cls); - this.typeId = typeId; } public CrossLanguageCompatibleSerializer( - Fury fury, Class cls, short typeId, boolean needToWriteRef) { - super(fury, cls, needToWriteRef); - this.typeId = typeId; - } - - public CrossLanguageCompatibleSerializer( - Fury fury, Class cls, short typeId, boolean needToWriteRef, boolean immutable) { + Fury fury, Class cls, boolean needToWriteRef, boolean immutable) { 
super(fury, cls, needToWriteRef, immutable); - this.typeId = typeId; } - @Override - public short getXtypeId() { - return typeId; + public CrossLanguageCompatibleSerializer(Fury fury, Class cls, boolean needToWriteRef) { + super(fury, cls, needToWriteRef); } @Override @@ -242,11 +231,6 @@ public void xwrite(MemoryBuffer buffer, T value) { stringSerializer.writeUTF8String(buffer, value.toString()); } - @Override - public short getXtypeId() { - return (short) -Type.STRING.getId(); - } - @Override public void write(MemoryBuffer buffer, T value) { if (GET_CODER != null) { @@ -577,19 +561,20 @@ public Object read(MemoryBuffer buffer) { } public static void registerDefaultSerializers(Fury fury) { - fury.registerSerializer(Class.class, new ClassSerializer(fury)); - fury.registerSerializer(StringBuilder.class, new StringBuilderSerializer(fury)); - fury.registerSerializer(StringBuffer.class, new StringBufferSerializer(fury)); - fury.registerSerializer(BigInteger.class, new BigIntegerSerializer(fury)); - fury.registerSerializer(BigDecimal.class, new BigDecimalSerializer(fury)); - fury.registerSerializer(AtomicBoolean.class, new AtomicBooleanSerializer(fury)); - fury.registerSerializer(AtomicInteger.class, new AtomicIntegerSerializer(fury)); - fury.registerSerializer(AtomicLong.class, new AtomicLongSerializer(fury)); - fury.registerSerializer(AtomicReference.class, new AtomicReferenceSerializer(fury)); - fury.registerSerializer(Currency.class, new CurrencySerializer(fury)); - fury.registerSerializer(URI.class, new URISerializer(fury)); - fury.registerSerializer(Pattern.class, new RegexSerializer(fury)); - fury.registerSerializer(UUID.class, new UUIDSerializer(fury)); - fury.registerSerializer(Object.class, new EmptyObjectSerializer(fury)); + ClassResolver resolver = fury.getClassResolver(); + resolver.registerSerializer(Class.class, new ClassSerializer(fury)); + resolver.registerSerializer(StringBuilder.class, new StringBuilderSerializer(fury)); + 
resolver.registerSerializer(StringBuffer.class, new StringBufferSerializer(fury)); + resolver.registerSerializer(BigInteger.class, new BigIntegerSerializer(fury)); + resolver.registerSerializer(BigDecimal.class, new BigDecimalSerializer(fury)); + resolver.registerSerializer(AtomicBoolean.class, new AtomicBooleanSerializer(fury)); + resolver.registerSerializer(AtomicInteger.class, new AtomicIntegerSerializer(fury)); + resolver.registerSerializer(AtomicLong.class, new AtomicLongSerializer(fury)); + resolver.registerSerializer(AtomicReference.class, new AtomicReferenceSerializer(fury)); + resolver.registerSerializer(Currency.class, new CurrencySerializer(fury)); + resolver.registerSerializer(URI.class, new URISerializer(fury)); + resolver.registerSerializer(Pattern.class, new RegexSerializer(fury)); + resolver.registerSerializer(UUID.class, new UUIDSerializer(fury)); + resolver.registerSerializer(Object.class, new EmptyObjectSerializer(fury)); } } diff --git a/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java b/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java index a3379660ef..99c34c260f 100644 --- a/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java +++ b/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java @@ -41,7 +41,6 @@ import org.apache.fury.memory.MemoryBuffer; import org.apache.fury.memory.Platform; import org.apache.fury.reflect.ReflectionUtils; -import org.apache.fury.type.Type; import org.apache.fury.util.MathUtils; import org.apache.fury.util.Preconditions; import org.apache.fury.util.StringEncodingUtils; @@ -115,11 +114,6 @@ public StringSerializer(Fury fury) { compressString = fury.compressString(); } - @Override - public short getXtypeId() { - return Type.STRING.getId(); - } - @Override public void write(MemoryBuffer buffer, String value) { writeJavaString(buffer, value); diff --git 
a/java/fury-core/src/main/java/org/apache/fury/serializer/StructSerializer.java b/java/fury-core/src/main/java/org/apache/fury/serializer/StructSerializer.java index 5e1399bce5..8a289c2c9a 100644 --- a/java/fury-core/src/main/java/org/apache/fury/serializer/StructSerializer.java +++ b/java/fury-core/src/main/java/org/apache/fury/serializer/StructSerializer.java @@ -20,10 +20,11 @@ package org.apache.fury.serializer; import java.lang.reflect.Constructor; -import java.lang.reflect.Field; import java.lang.reflect.InvocationTargetException; +import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; +import java.util.HashMap; import java.util.IdentityHashMap; import java.util.List; import java.util.Map; @@ -35,24 +36,29 @@ import org.apache.fury.memory.MemoryBuffer; import org.apache.fury.memory.Platform; import org.apache.fury.reflect.FieldAccessor; +import org.apache.fury.reflect.ReflectionUtils; import org.apache.fury.reflect.TypeRef; +import org.apache.fury.resolver.ClassInfo; +import org.apache.fury.resolver.ClassResolver; +import org.apache.fury.serializer.collection.CollectionSerializer; +import org.apache.fury.serializer.collection.MapSerializer; import org.apache.fury.type.Descriptor; import org.apache.fury.type.GenericType; import org.apache.fury.type.Generics; -import org.apache.fury.type.Type; import org.apache.fury.type.TypeUtils; +import org.apache.fury.type.Types; import org.apache.fury.util.ExceptionUtils; import org.apache.fury.util.Preconditions; +import org.apache.fury.util.StringUtils; /** * A serializer used for cross-language serialization for custom objects. * *

TODO(chaokunyang) support generics optimization for {@code SomeClass}. */ -@SuppressWarnings({"unchecked", "rawtypes", "UnstableApiUsage"}) +@SuppressWarnings({"unchecked", "rawtypes"}) public class StructSerializer extends Serializer { private static final Logger LOG = LoggerFactory.getLogger(StructSerializer.class); - private final String typeTag; private final Constructor constructor; private final FieldAccessor[] fieldAccessors; private GenericType[] fieldGenerics; @@ -60,9 +66,8 @@ public class StructSerializer extends Serializer { private final IdentityHashMap genericTypesCache; private int typeHash; - public StructSerializer(Fury fury, Class cls, String typeTag) { + public StructSerializer(Fury fury, Class cls) { super(fury, cls); - this.typeTag = typeTag; if (fury.getLanguage() == Language.JAVA) { LOG.warn("Type of class {} shouldn't be serialized using cross-language serializer", cls); } @@ -78,21 +83,50 @@ public StructSerializer(Fury fury, Class cls, String typeTag) { this.constructor = ctr; fieldAccessors = Descriptor.getFields(cls).stream() - .sorted(Comparator.comparing(Field::getName)) + .sorted(Comparator.comparing(f -> StringUtils.lowerCamelToLowerUnderscore(f.getName()))) .map(FieldAccessor::createAccessor) .toArray(FieldAccessor[]::new); - fieldGenerics = buildFieldGenerics(TypeRef.of(cls), fieldAccessors); + fieldGenerics = buildFieldGenerics(fury, TypeRef.of(cls), fieldAccessors); genericTypesCache = new IdentityHashMap<>(); genericTypesCache.put(null, fieldGenerics); } - private static GenericType[] buildFieldGenerics( - TypeRef type, FieldAccessor[] fieldAccessors) { + private GenericType[] buildFieldGenerics( + Fury fury, TypeRef type, FieldAccessor[] fieldAccessors) { return Arrays.stream(fieldAccessors) - .map(fieldAccessor -> GenericType.build(type, fieldAccessor.getField().getGenericType())) + .map(fieldAccessor -> getGenericType(fury, type, fieldAccessor)) .toArray(GenericType[]::new); } + private static GenericType getGenericType( + 
Fury fury, TypeRef type, FieldAccessor fieldAccessor) { + GenericType t = GenericType.build(type, fieldAccessor.getField().getGenericType()); + ClassResolver resolver = fury.getClassResolver(); + Class cls = t.getCls(); + if (resolver.isMonomorphic(cls)) { + t.setSerializer(fury.getXtypeResolver().getClassInfo(cls).getSerializer()); + return t; + } + // We have one type id for map, there is no map polymorphic support. + // If one want to deserialize map data into different map type, he should + // use the concrete map subclass type to declare the field type or use some hints + // such as field annotation. + if (resolver.isMap(cls)) { + t.setSerializer( + ReflectionUtils.isAbstract(cls) + ? new MapSerializer(fury, HashMap.class) + : resolver.getSerializer(cls)); + } else if (resolver.isCollection(cls)) { + t.setSerializer( + ReflectionUtils.isAbstract(cls) + ? new CollectionSerializer(fury, ArrayList.class) + : resolver.getSerializer(cls)); + } else if (cls.isArray()) { + t.setSerializer(new ArraySerializers.ObjectArraySerializer(fury, cls)); + } + return t; + } + @Override public void write(MemoryBuffer buffer, T value) { xwrite(buffer, value); @@ -103,16 +137,6 @@ public T read(MemoryBuffer buffer) { return xread(buffer); } - @Override - public short getXtypeId() { - return Fury.FURY_TYPE_TAG_ID; - } - - @Override - public String getCrossLanguageTypeTag() { - return typeTag; - } - @Override public void xwrite(MemoryBuffer buffer, T value) { // TODO(chaokunyang) support fields back and forward compatible. 
@@ -128,11 +152,11 @@ public void xwrite(MemoryBuffer buffer, T value) { for (int i = 0; i < fieldAccessors.length; i++) { FieldAccessor fieldAccessor = fieldAccessors[i]; GenericType fieldGeneric = fieldGenerics[i]; - Serializer serializer = fieldGeneric.getSerializerOrNull(fury.getClassResolver()); boolean hasGenerics = fieldGeneric.hasGenericParameters(); if (hasGenerics) { generics.pushGenericType(fieldGeneric); } + Serializer serializer = fieldGeneric.getSerializer(); if (serializer != null) { fury.xwriteRef(buffer, fieldAccessor.get(value), serializer); } else { @@ -152,7 +176,7 @@ private GenericType[] getGenericTypes(Generics generics) { this.genericType = genericType; fieldGenerics = genericTypesCache.get(genericType); if (fieldGenerics == null) { - fieldGenerics = buildFieldGenerics(genericType.getTypeRef(), fieldAccessors); + fieldGenerics = buildFieldGenerics(fury, genericType.getTypeRef(), fieldAccessors); genericTypesCache.put(genericType, fieldGenerics); } this.fieldGenerics = fieldGenerics; @@ -181,12 +205,17 @@ public T xread(MemoryBuffer buffer) { for (int i = 0; i < fieldAccessors.length; i++) { FieldAccessor fieldAccessor = fieldAccessors[i]; GenericType fieldGeneric = fieldGenerics[i]; - Serializer serializer = fieldGeneric.getSerializerOrNull(fury.getClassResolver()); boolean hasGenerics = fieldGeneric.hasGenericParameters(); if (hasGenerics) { generics.pushGenericType(fieldGeneric); } - Object fieldValue = fury.xreadRefByNullableSerializer(buffer, serializer); + Object fieldValue; + Serializer serializer = fieldGeneric.getSerializer(); + if (serializer == null) { + fieldValue = fury.xreadRef(buffer); + } else { + fieldValue = fury.xreadRef(buffer, serializer); + } fieldAccessor.set(obj, fieldValue); if (hasGenerics) { generics.popGenericType(); @@ -219,23 +248,22 @@ int computeFieldHash(int hash, GenericType fieldGeneric) { int id; if (fieldGeneric.getTypeRef().isSubtypeOf(List.class)) { // TODO(chaokunyang) add list element type into schema 
hash - id = Type.LIST.getId(); + id = Types.LIST; } else if (fieldGeneric.getTypeRef().isSubtypeOf(Map.class)) { // TODO(chaokunyang) add map key&value type into schema hash - id = Type.MAP.getId(); + id = Types.MAP; } else { try { - Serializer serializer = fury.getClassResolver().getSerializer(fieldGeneric.getCls()); - short xtypeId = serializer.getXtypeId(); - if (xtypeId == Fury.NOT_SUPPORT_CROSS_LANGUAGE) { - return hash; - } - id = Math.abs(xtypeId); - if (id == Type.FURY_TYPE_TAG.getId()) { - id = TypeUtils.computeStringHash(serializer.getCrossLanguageTypeTag()); + ClassInfo classInfo = fury.getXtypeResolver().getClassInfo(fieldGeneric.getCls()); + int xtypeId = classInfo.getXtypeId(); + if (Types.isStructType((byte) xtypeId)) { + id = + TypeUtils.computeStringHash(classInfo.decodeNamespace() + classInfo.decodeTypeName()); + } else { + id = Math.abs(xtypeId); } } catch (Exception e) { - return hash; + id = 0; } } long newHash = ((long) hash) * 31 + id; diff --git a/java/fury-core/src/main/java/org/apache/fury/serializer/TimeSerializers.java b/java/fury-core/src/main/java/org/apache/fury/serializer/TimeSerializers.java index cdd4129eea..2d98fc1de0 100644 --- a/java/fury-core/src/main/java/org/apache/fury/serializer/TimeSerializers.java +++ b/java/fury-core/src/main/java/org/apache/fury/serializer/TimeSerializers.java @@ -41,7 +41,7 @@ import java.util.TimeZone; import org.apache.fury.Fury; import org.apache.fury.memory.MemoryBuffer; -import org.apache.fury.type.Type; +import org.apache.fury.resolver.ClassResolver; import org.apache.fury.util.DateTimeUtils; /** Serializers for all time related types. 
*/ @@ -76,6 +76,16 @@ public T read(MemoryBuffer buffer) { return newInstance(buffer.readInt64()); } + @Override + public void xwrite(MemoryBuffer buffer, T value) { + buffer.writeInt64(value.getTime()); + } + + @Override + public T xread(MemoryBuffer buffer) { + return newInstance(buffer.readInt64()); + } + protected abstract T newInstance(long time); } @@ -126,17 +136,14 @@ protected Time newInstance(long time) { } public static final class TimestampSerializer extends TimeSerializer { - private final short typeId; public TimestampSerializer(Fury fury) { // conflict with instant super(fury, Timestamp.class); - typeId = (short) -Type.TIMESTAMP.getId(); } public TimestampSerializer(Fury fury, boolean needToWriteRef) { super(fury, Timestamp.class, needToWriteRef); - typeId = (short) -Type.TIMESTAMP.getId(); } @Override @@ -149,11 +156,6 @@ public Timestamp xread(MemoryBuffer buffer) { return DateTimeUtils.toJavaTimestamp(buffer.readInt64()); } - @Override - public short getXtypeId() { - return typeId; - } - @Override public void write(MemoryBuffer buffer, Timestamp value) { long time = value.getTime() - (value.getNanos() / 1_000_000); @@ -178,11 +180,6 @@ public LocalDateSerializer(Fury fury, boolean needToWriteRef) { super(fury, LocalDate.class, needToWriteRef); } - @Override - public short getXtypeId() { - return Type.DATE32.getId(); - } - @Override public void xwrite(MemoryBuffer buffer, LocalDate value) { // TODO use java encoding to support larger range. 
@@ -227,11 +224,6 @@ public InstantSerializer(Fury fury, boolean needToWriteRef) { super(fury, Instant.class, needToWriteRef); } - @Override - public short getXtypeId() { - return Type.TIMESTAMP.getId(); - } - @Override public void xwrite(MemoryBuffer buffer, Instant value) { // FIXME JDK17 may have higher precision than millisecond @@ -639,22 +631,23 @@ public OffsetDateTime read(MemoryBuffer buffer) { } public static void registerDefaultSerializers(Fury fury) { - fury.registerSerializer(Date.class, new DateSerializer(fury)); - fury.registerSerializer(java.sql.Date.class, new SqlDateSerializer(fury)); - fury.registerSerializer(Time.class, new SqlTimeSerializer(fury)); - fury.registerSerializer(Timestamp.class, new TimestampSerializer(fury)); - fury.registerSerializer(LocalDate.class, new LocalDateSerializer(fury)); - fury.registerSerializer(LocalTime.class, new LocalTimeSerializer(fury)); - fury.registerSerializer(LocalDateTime.class, new LocalDateTimeSerializer(fury)); - fury.registerSerializer(Instant.class, new InstantSerializer(fury)); - fury.registerSerializer(Duration.class, new DurationSerializer(fury)); - fury.registerSerializer(ZoneOffset.class, new ZoneOffsetSerializer(fury)); - fury.registerSerializer(ZonedDateTime.class, new ZonedDateTimeSerializer(fury)); - fury.registerSerializer(Year.class, new YearSerializer(fury)); - fury.registerSerializer(YearMonth.class, new YearMonthSerializer(fury)); - fury.registerSerializer(MonthDay.class, new MonthDaySerializer(fury)); - fury.registerSerializer(Period.class, new PeriodSerializer(fury)); - fury.registerSerializer(OffsetTime.class, new OffsetTimeSerializer(fury)); - fury.registerSerializer(OffsetDateTime.class, new OffsetDateTimeSerializer(fury)); + ClassResolver resolver = fury.getClassResolver(); + resolver.registerSerializer(Date.class, new DateSerializer(fury)); + resolver.registerSerializer(java.sql.Date.class, new SqlDateSerializer(fury)); + resolver.registerSerializer(Time.class, new 
SqlTimeSerializer(fury)); + resolver.registerSerializer(Timestamp.class, new TimestampSerializer(fury)); + resolver.registerSerializer(LocalDate.class, new LocalDateSerializer(fury)); + resolver.registerSerializer(LocalTime.class, new LocalTimeSerializer(fury)); + resolver.registerSerializer(LocalDateTime.class, new LocalDateTimeSerializer(fury)); + resolver.registerSerializer(Instant.class, new InstantSerializer(fury)); + resolver.registerSerializer(Duration.class, new DurationSerializer(fury)); + resolver.registerSerializer(ZoneOffset.class, new ZoneOffsetSerializer(fury)); + resolver.registerSerializer(ZonedDateTime.class, new ZonedDateTimeSerializer(fury)); + resolver.registerSerializer(Year.class, new YearSerializer(fury)); + resolver.registerSerializer(YearMonth.class, new YearMonthSerializer(fury)); + resolver.registerSerializer(MonthDay.class, new MonthDaySerializer(fury)); + resolver.registerSerializer(Period.class, new PeriodSerializer(fury)); + resolver.registerSerializer(OffsetTime.class, new OffsetTimeSerializer(fury)); + resolver.registerSerializer(OffsetDateTime.class, new OffsetDateTimeSerializer(fury)); } } diff --git a/java/fury-core/src/main/java/org/apache/fury/serializer/collection/AbstractCollectionSerializer.java b/java/fury-core/src/main/java/org/apache/fury/serializer/collection/AbstractCollectionSerializer.java index 04a68fc395..495c89935d 100644 --- a/java/fury-core/src/main/java/org/apache/fury/serializer/collection/AbstractCollectionSerializer.java +++ b/java/fury-core/src/main/java/org/apache/fury/serializer/collection/AbstractCollectionSerializer.java @@ -744,7 +744,12 @@ public void xreadElements( if (elemGenericType.isMonomorphic()) { Serializer elemSerializer = elemGenericType.getSerializer(fury.getClassResolver()); for (int i = 0; i < numElements; i++) { - Object elem = fury.xreadRefByNullableSerializer(buffer, elemSerializer); + Object elem; + if (elemSerializer == null) { + elem = fury.xreadRef(buffer); + } else { + elem = 
fury.xreadRef(buffer, elemSerializer); + } collection.add(elem); } } else { diff --git a/java/fury-core/src/main/java/org/apache/fury/serializer/collection/AbstractMapSerializer.java b/java/fury-core/src/main/java/org/apache/fury/serializer/collection/AbstractMapSerializer.java index 6461e8080b..0a925dfe89 100644 --- a/java/fury-core/src/main/java/org/apache/fury/serializer/collection/AbstractMapSerializer.java +++ b/java/fury-core/src/main/java/org/apache/fury/serializer/collection/AbstractMapSerializer.java @@ -361,38 +361,47 @@ public static void xwriteElements(Fury fury, MemoryBuffer buffer, Map value) { if (!keyGenericType.hasGenericParameters() && !valueGenericType.hasGenericParameters()) { for (Object object : value.entrySet()) { Map.Entry entry = (Map.Entry) object; - fury.xwriteRefByNullableSerializer(buffer, entry.getKey(), keySerializer); - fury.xwriteRefByNullableSerializer(buffer, entry.getValue(), valueSerializer); + xwriteRefByNullableSerializer(fury, buffer, entry.getKey(), keySerializer); + xwriteRefByNullableSerializer(fury, buffer, entry.getValue(), valueSerializer); } } else if (valueGenericType.hasGenericParameters()) { for (Object object : value.entrySet()) { Map.Entry entry = (Map.Entry) object; - fury.xwriteRefByNullableSerializer(buffer, entry.getKey(), keySerializer); + xwriteRefByNullableSerializer(fury, buffer, entry.getKey(), keySerializer); generics.pushGenericType(valueGenericType); - fury.xwriteRefByNullableSerializer(buffer, entry.getValue(), valueSerializer); + xwriteRefByNullableSerializer(fury, buffer, entry.getValue(), valueSerializer); generics.popGenericType(); } } else if (keyGenericType.hasGenericParameters()) { for (Object object : value.entrySet()) { Map.Entry entry = (Map.Entry) object; generics.pushGenericType(keyGenericType); - fury.xwriteRefByNullableSerializer(buffer, entry.getKey(), keySerializer); + xwriteRefByNullableSerializer(fury, buffer, entry.getKey(), keySerializer); generics.popGenericType(); - 
fury.xwriteRefByNullableSerializer(buffer, entry.getValue(), valueSerializer); + xwriteRefByNullableSerializer(fury, buffer, entry.getValue(), valueSerializer); } } else { for (Object object : value.entrySet()) { Map.Entry entry = (Map.Entry) object; generics.pushGenericType(keyGenericType); - fury.xwriteRefByNullableSerializer(buffer, entry.getKey(), keySerializer); + xwriteRefByNullableSerializer(fury, buffer, entry.getKey(), keySerializer); generics.pushGenericType(valueGenericType); - fury.xwriteRefByNullableSerializer(buffer, entry.getValue(), valueSerializer); + xwriteRefByNullableSerializer(fury, buffer, entry.getValue(), valueSerializer); } } generics.popGenericType(); } } + public static void xwriteRefByNullableSerializer( + Fury fury, MemoryBuffer buffer, T obj, Serializer serializer) { + if (serializer == null) { + fury.xwriteRef(buffer, obj); + } else { + fury.xwriteRef(buffer, obj, serializer); + } + } + private Tuple2 getKVGenericType(GenericType genericType) { Tuple2 genericTypes = partialGenericKVTypeMap.get(genericType); if (genericTypes == null) { @@ -671,33 +680,33 @@ public static void xreadElements(Fury fury, MemoryBuffer buffer, Map map, int si Serializer valueSerializer = valueGenericType.getSerializer(fury.getClassResolver()); if (!keyGenericType.hasGenericParameters() && !valueGenericType.hasGenericParameters()) { for (int i = 0; i < size; i++) { - Object key = fury.xreadRefByNullableSerializer(buffer, keySerializer); - Object value = fury.xreadRefByNullableSerializer(buffer, valueSerializer); + Object key = xreadRefByNullableSerializer(fury, buffer, keySerializer); + Object value = xreadRefByNullableSerializer(fury, buffer, valueSerializer); map.put(key, value); } } else if (valueGenericType.hasGenericParameters()) { for (int i = 0; i < size; i++) { - Object key = fury.xreadRefByNullableSerializer(buffer, keySerializer); + Object key = xreadRefByNullableSerializer(fury, buffer, keySerializer); generics.pushGenericType(valueGenericType); - 
Object value = fury.xreadRefByNullableSerializer(buffer, valueSerializer); + Object value = xreadRefByNullableSerializer(fury, buffer, valueSerializer); generics.popGenericType(); map.put(key, value); } } else if (keyGenericType.hasGenericParameters()) { for (int i = 0; i < size; i++) { generics.pushGenericType(keyGenericType); - Object key = fury.xreadRefByNullableSerializer(buffer, keySerializer); + Object key = xreadRefByNullableSerializer(fury, buffer, keySerializer); generics.popGenericType(); - Object value = fury.xreadRefByNullableSerializer(buffer, valueSerializer); + Object value = xreadRefByNullableSerializer(fury, buffer, valueSerializer); map.put(key, value); } } else { for (int i = 0; i < size; i++) { // FIXME(chaokunyang) nested generics may be get by mistake. generics.pushGenericType(keyGenericType); - Object key = fury.xreadRefByNullableSerializer(buffer, keySerializer); + Object key = xreadRefByNullableSerializer(fury, buffer, keySerializer); generics.pushGenericType(valueGenericType); - Object value = fury.xreadRefByNullableSerializer(buffer, valueSerializer); + Object value = xreadRefByNullableSerializer(fury, buffer, valueSerializer); map.put(key, value); } } @@ -705,6 +714,15 @@ public static void xreadElements(Fury fury, MemoryBuffer buffer, Map map, int si } } + public static Object xreadRefByNullableSerializer( + Fury fury, MemoryBuffer buffer, Serializer serializer) { + if (serializer == null) { + return fury.xreadRef(buffer); + } else { + return fury.xreadRef(buffer, serializer); + } + } + /** * Hook for java serialization codegen, read/write key/value by entrySet. 
* diff --git a/java/fury-core/src/main/java/org/apache/fury/serializer/collection/CollectionSerializers.java b/java/fury-core/src/main/java/org/apache/fury/serializer/collection/CollectionSerializers.java index a647a0ddf6..a8f78a1d69 100644 --- a/java/fury-core/src/main/java/org/apache/fury/serializer/collection/CollectionSerializers.java +++ b/java/fury-core/src/main/java/org/apache/fury/serializer/collection/CollectionSerializers.java @@ -58,7 +58,6 @@ import org.apache.fury.serializer.ReplaceResolveSerializer; import org.apache.fury.serializer.Serializer; import org.apache.fury.serializer.Serializers; -import org.apache.fury.type.Type; import org.apache.fury.util.Preconditions; import org.apache.fury.util.unsafe._JDKAccess; @@ -74,11 +73,6 @@ public ArrayListSerializer(Fury fury) { super(fury, ArrayList.class, true); } - @Override - public short getXtypeId() { - return Type.LIST.getId(); - } - @Override public ArrayList newCollection(MemoryBuffer buffer) { int numElements = buffer.readVarUint32Small7(); @@ -117,11 +111,6 @@ public List copy(List originCollection) { return newCollection; } - @Override - public short getXtypeId() { - return (short) -Type.LIST.getId(); - } - @Override public void write(MemoryBuffer buffer, List value) { try { @@ -162,11 +151,6 @@ public HashSetSerializer(Fury fury) { super(fury, HashSet.class, true); } - @Override - public short getXtypeId() { - return Type.FURY_SET.getId(); - } - @Override public HashSet newCollection(MemoryBuffer buffer) { int numElements = buffer.readVarUint32Small7(); @@ -182,11 +166,6 @@ public LinkedHashSetSerializer(Fury fury) { super(fury, LinkedHashSet.class, true); } - @Override - public short getXtypeId() { - return Type.FURY_SET.getId(); - } - @Override public LinkedHashSet newCollection(MemoryBuffer buffer) { int numElements = buffer.readVarUint32Small7(); @@ -269,11 +248,6 @@ public EmptyListSerializer(Fury fury, Class> cls) { @Override public void write(MemoryBuffer buffer, List value) {} - @Override 
- public short getXtypeId() { - return (short) -Type.LIST.getId(); - } - @Override public void xwrite(MemoryBuffer buffer, List value) { // write length @@ -322,11 +296,6 @@ public EmptySetSerializer(Fury fury, Class> cls) { @Override public void write(MemoryBuffer buffer, Set value) {} - @Override - public short getXtypeId() { - return (short) -Type.FURY_SET.getId(); - } - @Override public void xwrite(MemoryBuffer buffer, Set value) { // write length @@ -377,11 +346,6 @@ public void write(MemoryBuffer buffer, List value) { fury.writeRef(buffer, value.get(0)); } - @Override - public short getXtypeId() { - return (short) -Type.LIST.getId(); - } - @Override public void xwrite(MemoryBuffer buffer, List value) { buffer.writeVarUint32Small7(1); @@ -416,11 +380,6 @@ public void write(MemoryBuffer buffer, Set value) { fury.writeRef(buffer, value.iterator().next()); } - @Override - public short getXtypeId() { - return (short) -Type.FURY_SET.getId(); - } - @Override public void xwrite(MemoryBuffer buffer, Set value) { buffer.writeVarUint32Small7(1); @@ -807,46 +766,48 @@ public T copy(T value) { // TODO Support ArraySubListSerializer, SubListSerializer public static void registerDefaultSerializers(Fury fury) { - fury.registerSerializer(ArrayList.class, new ArrayListSerializer(fury)); + ClassResolver resolver = fury.getClassResolver(); + resolver.registerSerializer(ArrayList.class, new ArrayListSerializer(fury)); Class arrayAsListClass = Arrays.asList(1, 2).getClass(); - fury.registerSerializer(arrayAsListClass, new ArraysAsListSerializer(fury, arrayAsListClass)); - fury.registerSerializer( + resolver.registerSerializer( + arrayAsListClass, new ArraysAsListSerializer(fury, arrayAsListClass)); + resolver.registerSerializer( LinkedList.class, new CollectionSerializer(fury, LinkedList.class, true)); - fury.registerSerializer(HashSet.class, new HashSetSerializer(fury)); - fury.registerSerializer(LinkedHashSet.class, new LinkedHashSetSerializer(fury)); - 
fury.registerSerializer(TreeSet.class, new SortedSetSerializer<>(fury, TreeSet.class)); - fury.registerSerializer( + resolver.registerSerializer(HashSet.class, new HashSetSerializer(fury)); + resolver.registerSerializer(LinkedHashSet.class, new LinkedHashSetSerializer(fury)); + resolver.registerSerializer(TreeSet.class, new SortedSetSerializer<>(fury, TreeSet.class)); + resolver.registerSerializer( Collections.EMPTY_LIST.getClass(), new EmptyListSerializer(fury, (Class>) Collections.EMPTY_LIST.getClass())); - fury.registerSerializer( + resolver.registerSerializer( Collections.emptySortedSet().getClass(), new EmptySortedSetSerializer( fury, (Class>) Collections.emptySortedSet().getClass())); - fury.registerSerializer( + resolver.registerSerializer( Collections.EMPTY_SET.getClass(), new EmptySetSerializer(fury, (Class>) Collections.EMPTY_SET.getClass())); - fury.registerSerializer( + resolver.registerSerializer( Collections.singletonList(null).getClass(), new CollectionsSingletonListSerializer( fury, (Class>) Collections.singletonList(null).getClass())); - fury.registerSerializer( + resolver.registerSerializer( Collections.singleton(null).getClass(), new CollectionsSingletonSetSerializer( fury, (Class>) Collections.singleton(null).getClass())); - fury.registerSerializer( + resolver.registerSerializer( ConcurrentSkipListSet.class, new ConcurrentSkipListSetSerializer(fury, ConcurrentSkipListSet.class)); - fury.registerSerializer(Vector.class, new VectorSerializer(fury, Vector.class)); - fury.registerSerializer(ArrayDeque.class, new ArrayDequeSerializer(fury, ArrayDeque.class)); - fury.registerSerializer(BitSet.class, new BitSetSerializer(fury, BitSet.class)); - fury.registerSerializer( + resolver.registerSerializer(Vector.class, new VectorSerializer(fury, Vector.class)); + resolver.registerSerializer(ArrayDeque.class, new ArrayDequeSerializer(fury, ArrayDeque.class)); + resolver.registerSerializer(BitSet.class, new BitSetSerializer(fury, BitSet.class)); + 
resolver.registerSerializer( PriorityQueue.class, new PriorityQueueSerializer(fury, PriorityQueue.class)); - fury.registerSerializer( + resolver.registerSerializer( CopyOnWriteArrayList.class, new CopyOnWriteArrayListSerializer(fury, CopyOnWriteArrayList.class)); final Class setFromMapClass = Collections.newSetFromMap(new HashMap<>()).getClass(); - fury.registerSerializer(setFromMapClass, new SetFromMapSerializer(fury, setFromMapClass)); - fury.registerSerializer( + resolver.registerSerializer(setFromMapClass, new SetFromMapSerializer(fury, setFromMapClass)); + resolver.registerSerializer( ConcurrentHashMap.KeySetView.class, new ConcurrentHashMapKeySetViewSerializer(fury, ConcurrentHashMap.KeySetView.class)); } diff --git a/java/fury-core/src/main/java/org/apache/fury/serializer/collection/FuryArrayAsListSerializer.java b/java/fury-core/src/main/java/org/apache/fury/serializer/collection/FuryArrayAsListSerializer.java index 057acb7bcc..09268c4e15 100644 --- a/java/fury-core/src/main/java/org/apache/fury/serializer/collection/FuryArrayAsListSerializer.java +++ b/java/fury-core/src/main/java/org/apache/fury/serializer/collection/FuryArrayAsListSerializer.java @@ -26,7 +26,6 @@ import org.apache.fury.Fury; import org.apache.fury.annotation.Internal; import org.apache.fury.memory.MemoryBuffer; -import org.apache.fury.type.Type; /** Serializer for {@link ArrayAsList}. Helper for serialization of other classes. 
*/ @Internal @@ -37,11 +36,6 @@ public FuryArrayAsListSerializer(Fury fury) { super(fury, ArrayAsList.class, true); } - @Override - public short getXtypeId() { - return (short) -Type.LIST.getId(); - } - public Collection newCollection(MemoryBuffer buffer) { int numElements = buffer.readVarUint32Small7(); setNumElements(numElements); diff --git a/java/fury-core/src/main/java/org/apache/fury/serializer/collection/GuavaCollectionSerializers.java b/java/fury-core/src/main/java/org/apache/fury/serializer/collection/GuavaCollectionSerializers.java index fe94b43556..44740cc935 100644 --- a/java/fury-core/src/main/java/org/apache/fury/serializer/collection/GuavaCollectionSerializers.java +++ b/java/fury-core/src/main/java/org/apache/fury/serializer/collection/GuavaCollectionSerializers.java @@ -39,7 +39,7 @@ import org.apache.fury.Fury; import org.apache.fury.memory.MemoryBuffer; import org.apache.fury.memory.Platform; -import org.apache.fury.type.Type; +import org.apache.fury.resolver.ClassResolver; import org.apache.fury.util.unsafe._JDKAccess; /** Serializers for common guava types. 
*/ @@ -84,11 +84,6 @@ public T onCollectionRead(Collection collection) { return (T) list; } - @Override - public short getXtypeId() { - return (short) -Type.LIST.getId(); - } - public T xnewInstance(Collection collection) { return (T) ImmutableList.copyOf(collection); } @@ -148,11 +143,6 @@ public T copy(T originCollection) { return (T) function.apply(elements); } - @Override - public short getXtypeId() { - return (short) -Type.LIST.getId(); - } - @Override protected T xnewInstance(Collection collection) { return (T) ImmutableList.copyOf(collection); @@ -179,11 +169,6 @@ public T onCollectionRead(Collection collection) { return (T) ImmutableSet.copyOf(elements); } - @Override - public short getXtypeId() { - return (short) -Type.FURY_SET.getId(); - } - @Override protected T xnewInstance(Collection collection) { return (T) ImmutableSet.copyOf(collection); @@ -271,11 +256,6 @@ public T onMapRead(Map map) { return (T) builder.build(); } - @Override - public short getXtypeId() { - return (short) -Type.MAP.getId(); - } - @Override public T xread(MemoryBuffer buffer) { int size = buffer.readVarUint32Small7(); @@ -426,65 +406,66 @@ public static void registerDefaultSerializers(Fury fury) { // inconsistent if peers load different version of guava. // For example: guava 20 return ImmutableBiMap for ImmutableMap.of(), but guava 27 return // ImmutableMap. 
+ ClassResolver resolver = fury.getClassResolver(); Class cls = loadClass(pkg + ".RegularImmutableBiMap", ImmutableBiMap.of("k1", 1, "k2", 4).getClass()); - fury.registerSerializer(cls, new ImmutableBiMapSerializer(fury, cls)); + resolver.registerSerializer(cls, new ImmutableBiMapSerializer(fury, cls)); cls = loadClass(pkg + ".SingletonImmutableBiMap", ImmutableBiMap.of(1, 2).getClass()); - fury.registerSerializer(cls, new ImmutableBiMapSerializer(fury, cls)); + resolver.registerSerializer(cls, new ImmutableBiMapSerializer(fury, cls)); cls = loadClass(pkg + ".RegularImmutableMap", ImmutableMap.of("k1", 1, "k2", 2).getClass()); - fury.registerSerializer(cls, new ImmutableMapSerializer(fury, cls)); + resolver.registerSerializer(cls, new ImmutableMapSerializer(fury, cls)); cls = loadClass(pkg + ".RegularImmutableList", ImmutableList.of().getClass()); - fury.registerSerializer(cls, new RegularImmutableListSerializer(fury, cls)); + resolver.registerSerializer(cls, new RegularImmutableListSerializer(fury, cls)); cls = loadClass(pkg + ".SingletonImmutableList", ImmutableList.of(1).getClass()); - fury.registerSerializer(cls, new ImmutableListSerializer(fury, cls)); + resolver.registerSerializer(cls, new ImmutableListSerializer(fury, cls)); cls = loadClass(pkg + ".RegularImmutableSet", ImmutableSet.of(1, 2).getClass()); - fury.registerSerializer(cls, new ImmutableSetSerializer(fury, cls)); + resolver.registerSerializer(cls, new ImmutableSetSerializer(fury, cls)); cls = loadClass(pkg + ".SingletonImmutableSet", ImmutableSet.of(1).getClass()); - fury.registerSerializer(cls, new ImmutableSetSerializer(fury, cls)); + resolver.registerSerializer(cls, new ImmutableSetSerializer(fury, cls)); // sorted set/map doesn't support xlang. 
cls = loadClass(pkg + ".RegularImmutableSortedSet", ImmutableSortedSet.of(1, 2).getClass()); - fury.registerSerializer(cls, new ImmutableSortedSetSerializer<>(fury, cls)); + resolver.registerSerializer(cls, new ImmutableSortedSetSerializer<>(fury, cls)); cls = loadClass(pkg + ".ImmutableSortedMap", ImmutableSortedMap.of(1, 2).getClass()); - fury.registerSerializer(cls, new ImmutableSortedMapSerializer<>(fury, cls)); + resolver.registerSerializer(cls, new ImmutableSortedMapSerializer<>(fury, cls)); // Guava version before 19.0, of() return // EmptyImmutableSet/EmptyImmutableBiMap/EmptyImmutableSortedMap/EmptyImmutableSortedSet // we register if class exist or register empty to deserialize. if (checkClassExist(pkg + ".EmptyImmutableSet")) { cls = loadClass(pkg + ".EmptyImmutableSet", ImmutableSet.of().getClass()); - fury.registerSerializer(cls, new ImmutableSetSerializer(fury, cls)); + resolver.registerSerializer(cls, new ImmutableSetSerializer(fury, cls)); } else { class GuavaEmptySet {} cls = GuavaEmptySet.class; - fury.registerSerializer(cls, new ImmutableSetSerializer(fury, cls)); + resolver.registerSerializer(cls, new ImmutableSetSerializer(fury, cls)); } if (checkClassExist(pkg + ".EmptyImmutableBiMap")) { cls = loadClass(pkg + ".EmptyImmutableBiMap", ImmutableBiMap.of().getClass()); - fury.registerSerializer(cls, new ImmutableMapSerializer(fury, cls)); + resolver.registerSerializer(cls, new ImmutableMapSerializer(fury, cls)); } else { class GuavaEmptyBiMap {} cls = GuavaEmptyBiMap.class; - fury.registerSerializer(cls, new ImmutableMapSerializer(fury, cls)); + resolver.registerSerializer(cls, new ImmutableMapSerializer(fury, cls)); } if (checkClassExist(pkg + ".EmptyImmutableSortedSet")) { cls = loadClass(pkg + ".EmptyImmutableSortedSet", ImmutableSortedSet.of().getClass()); - fury.registerSerializer(cls, new ImmutableSortedSetSerializer(fury, cls)); + resolver.registerSerializer(cls, new ImmutableSortedSetSerializer(fury, cls)); } else { class 
GuavaEmptySortedSet {} cls = GuavaEmptySortedSet.class; - fury.registerSerializer(cls, new ImmutableSortedSetSerializer(fury, cls)); + resolver.registerSerializer(cls, new ImmutableSortedSetSerializer(fury, cls)); } if (checkClassExist(pkg + ".EmptyImmutableSortedMap")) { cls = loadClass(pkg + ".EmptyImmutableSortedMap", ImmutableSortedMap.of().getClass()); - fury.registerSerializer(cls, new ImmutableSortedMapSerializer(fury, cls)); + resolver.registerSerializer(cls, new ImmutableSortedMapSerializer(fury, cls)); } else { class GuavaEmptySortedMap {} cls = GuavaEmptySortedMap.class; - fury.registerSerializer(cls, new ImmutableSortedMapSerializer(fury, cls)); + resolver.registerSerializer(cls, new ImmutableSortedMapSerializer(fury, cls)); } } diff --git a/java/fury-core/src/main/java/org/apache/fury/serializer/collection/ImmutableCollectionSerializers.java b/java/fury-core/src/main/java/org/apache/fury/serializer/collection/ImmutableCollectionSerializers.java index ae31d90bbf..d79f52b97a 100644 --- a/java/fury-core/src/main/java/org/apache/fury/serializer/collection/ImmutableCollectionSerializers.java +++ b/java/fury-core/src/main/java/org/apache/fury/serializer/collection/ImmutableCollectionSerializers.java @@ -32,6 +32,7 @@ import org.apache.fury.Fury; import org.apache.fury.memory.MemoryBuffer; import org.apache.fury.memory.Platform; +import org.apache.fury.resolver.ClassResolver; import org.apache.fury.util.unsafe._JDKAccess; /** Serializers for jdk9+ java.util.ImmutableCollections. 
*/ @@ -259,12 +260,13 @@ public Map onMapRead(Map map) { } public static void registerSerializers(Fury fury) { - fury.registerSerializer(List12, new ImmutableListSerializer(fury, List12)); - fury.registerSerializer(ListN, new ImmutableListSerializer(fury, ListN)); - fury.registerSerializer(SubList, new ImmutableListSerializer(fury, SubList)); - fury.registerSerializer(Set12, new ImmutableSetSerializer(fury, Set12)); - fury.registerSerializer(SetN, new ImmutableSetSerializer(fury, SetN)); - fury.registerSerializer(Map1, new ImmutableMapSerializer(fury, Map1)); - fury.registerSerializer(MapN, new ImmutableMapSerializer(fury, MapN)); + ClassResolver resolver = fury.getClassResolver(); + resolver.registerSerializer(List12, new ImmutableListSerializer(fury, List12)); + resolver.registerSerializer(ListN, new ImmutableListSerializer(fury, ListN)); + resolver.registerSerializer(SubList, new ImmutableListSerializer(fury, SubList)); + resolver.registerSerializer(Set12, new ImmutableSetSerializer(fury, Set12)); + resolver.registerSerializer(SetN, new ImmutableSetSerializer(fury, SetN)); + resolver.registerSerializer(Map1, new ImmutableMapSerializer(fury, Map1)); + resolver.registerSerializer(MapN, new ImmutableMapSerializer(fury, MapN)); } } diff --git a/java/fury-core/src/main/java/org/apache/fury/serializer/collection/MapSerializers.java b/java/fury-core/src/main/java/org/apache/fury/serializer/collection/MapSerializers.java index a9f6a6263a..2ca878705f 100644 --- a/java/fury-core/src/main/java/org/apache/fury/serializer/collection/MapSerializers.java +++ b/java/fury-core/src/main/java/org/apache/fury/serializer/collection/MapSerializers.java @@ -43,7 +43,6 @@ import org.apache.fury.serializer.Serializer; import org.apache.fury.serializer.Serializers; import org.apache.fury.serializer.StringSerializer; -import org.apache.fury.type.Type; import org.apache.fury.util.Preconditions; /** @@ -58,11 +57,6 @@ public HashMapSerializer(Fury fury) { super(fury, HashMap.class, true); } 
- @Override - public short getXtypeId() { - return Type.MAP.getId(); - } - @Override public HashMap newMap(MemoryBuffer buffer) { int numElements = buffer.readVarUint32Small7(); @@ -83,11 +77,6 @@ public LinkedHashMapSerializer(Fury fury) { super(fury, LinkedHashMap.class, true); } - @Override - public short getXtypeId() { - return Type.MAP.getId(); - } - @Override public LinkedHashMap newMap(MemoryBuffer buffer) { int numElements = buffer.readVarUint32Small7(); @@ -108,11 +97,6 @@ public LazyMapSerializer(Fury fury) { super(fury, LazyMap.class, true); } - @Override - public short getXtypeId() { - return Type.MAP.getId(); - } - @Override public LazyMap newMap(MemoryBuffer buffer) { int numElements = buffer.readVarUint32Small7(); @@ -189,11 +173,6 @@ public EmptyMapSerializer(Fury fury, Class> cls) { @Override public void write(MemoryBuffer buffer, Map value) {} - @Override - public short getXtypeId() { - return (short) -Type.MAP.getId(); - } - @Override public void xwrite(MemoryBuffer buffer, Map value) { // write length @@ -246,11 +225,6 @@ public void write(MemoryBuffer buffer, Map value) { fury.writeRef(buffer, entry.getValue()); } - @Override - public short getXtypeId() { - return (short) -Type.MAP.getId(); - } - @Override public void xwrite(MemoryBuffer buffer, Map value) { buffer.writeVarUint32Small7(1); @@ -294,11 +268,6 @@ public ConcurrentHashMap newMap(MemoryBuffer buffer) { public Map newMap(Map map) { return new ConcurrentHashMap(map.size()); } - - @Override - public short getXtypeId() { - return Fury.NOT_SUPPORT_CROSS_LANGUAGE; - } } public static final class ConcurrentSkipListMapSerializer @@ -323,11 +292,6 @@ public Map newMap(Map originMap) { Comparator comparator = fury.copyObject(((ConcurrentSkipListMap) originMap).comparator()); return new ConcurrentSkipListMap(comparator); } - - @Override - public short getXtypeId() { - return Fury.NOT_SUPPORT_CROSS_LANGUAGE; - } } public static class EnumMapSerializer extends MapSerializer { @@ -517,27 +481,28 
@@ public T copy(T value) { // TODO(chaokunyang) support ConcurrentSkipListMap.SubMap mo efficiently. public static void registerDefaultSerializers(Fury fury) { - fury.registerSerializer(HashMap.class, new HashMapSerializer(fury)); + ClassResolver resolver = fury.getClassResolver(); + resolver.registerSerializer(HashMap.class, new HashMapSerializer(fury)); fury.getClassResolver() .registerSerializer(LinkedHashMap.class, new LinkedHashMapSerializer(fury)); - fury.registerSerializer(TreeMap.class, new SortedMapSerializer<>(fury, TreeMap.class)); - fury.registerSerializer( + resolver.registerSerializer(TreeMap.class, new SortedMapSerializer<>(fury, TreeMap.class)); + resolver.registerSerializer( Collections.EMPTY_MAP.getClass(), new EmptyMapSerializer(fury, (Class>) Collections.EMPTY_MAP.getClass())); - fury.registerSerializer( + resolver.registerSerializer( Collections.emptySortedMap().getClass(), new EmptySortedMapSerializer( fury, (Class>) Collections.emptySortedMap().getClass())); - fury.registerSerializer( + resolver.registerSerializer( Collections.singletonMap(null, null).getClass(), new SingletonMapSerializer( fury, (Class>) Collections.singletonMap(null, null).getClass())); - fury.registerSerializer( + resolver.registerSerializer( ConcurrentHashMap.class, new ConcurrentHashMapSerializer(fury, ConcurrentHashMap.class)); - fury.registerSerializer( + resolver.registerSerializer( ConcurrentSkipListMap.class, new ConcurrentSkipListMapSerializer(fury, ConcurrentSkipListMap.class)); - fury.registerSerializer(EnumMap.class, new EnumMapSerializer(fury)); - fury.registerSerializer(LazyMap.class, new LazyMapSerializer(fury)); + resolver.registerSerializer(EnumMap.class, new EnumMapSerializer(fury)); + resolver.registerSerializer(LazyMap.class, new LazyMapSerializer(fury)); } } diff --git a/java/fury-core/src/main/java/org/apache/fury/serializer/collection/SubListSerializers.java b/java/fury-core/src/main/java/org/apache/fury/serializer/collection/SubListSerializers.java 
index 60353e7866..0f3fa73da2 100644 --- a/java/fury-core/src/main/java/org/apache/fury/serializer/collection/SubListSerializers.java +++ b/java/fury-core/src/main/java/org/apache/fury/serializer/collection/SubListSerializers.java @@ -28,6 +28,7 @@ import org.apache.fury.logging.LoggerFactory; import org.apache.fury.memory.MemoryBuffer; import org.apache.fury.reflect.ReflectionUtils; +import org.apache.fury.resolver.ClassResolver; import org.apache.fury.serializer.ObjectSerializer; @SuppressWarnings({"rawtypes", "unchecked"}) @@ -74,14 +75,15 @@ class ImmutableSubListStub implements Stub {} } public static void registerSerializers(Fury fury, boolean preserveView) { + ClassResolver classResolver = fury.getClassResolver(); for (Class cls : new Class[] { SubListClass, RandomAccessSubListClass, ArrayListSubListClass, ImmutableSubListClass }) { if (fury.trackingRef() && preserveView && fury.getConfig().getLanguage() == Language.JAVA) { - fury.registerSerializer(cls, new SubListViewSerializer(fury, cls)); + classResolver.registerSerializer(cls, new SubListViewSerializer(fury, cls)); } else { - fury.registerSerializer(cls, new SubListSerializer(fury, (Class) cls)); + classResolver.registerSerializer(cls, new SubListSerializer(fury, (Class) cls)); } } } diff --git a/java/fury-core/src/main/java/org/apache/fury/serializer/collection/SynchronizedSerializers.java b/java/fury-core/src/main/java/org/apache/fury/serializer/collection/SynchronizedSerializers.java index c7e7c87357..9af57399f0 100644 --- a/java/fury-core/src/main/java/org/apache/fury/serializer/collection/SynchronizedSerializers.java +++ b/java/fury-core/src/main/java/org/apache/fury/serializer/collection/SynchronizedSerializers.java @@ -40,6 +40,7 @@ import org.apache.fury.logging.LoggerFactory; import org.apache.fury.memory.MemoryBuffer; import org.apache.fury.memory.Platform; +import org.apache.fury.resolver.ClassResolver; import org.apache.fury.serializer.Serializer; import org.apache.fury.util.ExceptionUtils; 
@@ -211,8 +212,9 @@ static Tuple2, Function>[] synchronizedFactories() { */ public static void registerSerializers(Fury fury) { try { + ClassResolver resolver = fury.getClassResolver(); for (Tuple2, Function> factory : synchronizedFactories()) { - fury.registerSerializer(factory.f0, createSerializer(fury, factory)); + resolver.registerSerializer(factory.f0, createSerializer(fury, factory)); } } catch (Throwable e) { ExceptionUtils.ignore(e); diff --git a/java/fury-core/src/main/java/org/apache/fury/serializer/collection/UnmodifiableSerializers.java b/java/fury-core/src/main/java/org/apache/fury/serializer/collection/UnmodifiableSerializers.java index 84c71ae4c9..15abf6740b 100644 --- a/java/fury-core/src/main/java/org/apache/fury/serializer/collection/UnmodifiableSerializers.java +++ b/java/fury-core/src/main/java/org/apache/fury/serializer/collection/UnmodifiableSerializers.java @@ -39,6 +39,7 @@ import org.apache.fury.logging.LoggerFactory; import org.apache.fury.memory.MemoryBuffer; import org.apache.fury.memory.Platform; +import org.apache.fury.resolver.ClassResolver; import org.apache.fury.serializer.Serializer; import org.apache.fury.util.ExceptionUtils; import org.apache.fury.util.Preconditions; @@ -209,8 +210,9 @@ static Tuple2, Function>[] unmodifiableFactories() { */ public static void registerSerializers(Fury fury) { try { + ClassResolver resolver = fury.getClassResolver(); for (Tuple2, Function> factory : unmodifiableFactories()) { - fury.registerSerializer(factory.f0, createSerializer(fury, factory)); + resolver.registerSerializer(factory.f0, createSerializer(fury, factory)); } } catch (Throwable e) { ExceptionUtils.ignore(e); diff --git a/java/fury-core/src/main/java/org/apache/fury/type/GenericType.java b/java/fury-core/src/main/java/org/apache/fury/type/GenericType.java index 86202fea73..47f441f12e 100644 --- a/java/fury-core/src/main/java/org/apache/fury/type/GenericType.java +++ b/java/fury-core/src/main/java/org/apache/fury/type/GenericType.java 
@@ -177,6 +177,10 @@ public GenericType getTypeParameter1() { return typeParameter1; } + public void setSerializer(Serializer serializer) { + this.serializer = serializer; + } + public Serializer getSerializer(ClassResolver classResolver) { Serializer serializer = this.serializer; if (serializer == null) { @@ -186,16 +190,12 @@ public Serializer getSerializer(ClassResolver classResolver) { return serializer; } - public boolean isMonomorphic() { - return isMonomorphic; + public Serializer getSerializer() { + return serializer; } - public Serializer getSerializerOrNull(ClassResolver classResolver) { - if (isMonomorphic) { - return getSerializer(classResolver); - } else { - return null; - } + public boolean isMonomorphic() { + return isMonomorphic; } public boolean trackingRef(ClassResolver classResolver) { diff --git a/java/fury-core/src/main/java/org/apache/fury/type/ScalaTypes.java b/java/fury-core/src/main/java/org/apache/fury/type/ScalaTypes.java index 31d439067f..3615f0b75b 100644 --- a/java/fury-core/src/main/java/org/apache/fury/type/ScalaTypes.java +++ b/java/fury-core/src/main/java/org/apache/fury/type/ScalaTypes.java @@ -30,6 +30,7 @@ public class ScalaTypes { private static volatile Class SCALA_MAP_TYPE; private static volatile Class SCALA_SEQ_TYPE; + private static volatile Class SCALA_SET_TYPE; private static volatile Class SCALA_ITERABLE_TYPE; private static volatile java.lang.reflect.Type SCALA_ITERATOR_RETURN_TYPE; private static volatile java.lang.reflect.Type SCALA_NEXT_RETURN_TYPE; @@ -50,6 +51,13 @@ public static Class getScalaSeqType() { return SCALA_SEQ_TYPE; } + public static Class getScalaSetType() { + if (SCALA_SET_TYPE == null) { + SCALA_SET_TYPE = ReflectionUtils.loadClass("scala.collection.Set"); + } + return SCALA_SET_TYPE; + } + public static Class getScalaIterableType() { if (SCALA_ITERABLE_TYPE == null) { SCALA_ITERABLE_TYPE = ReflectionUtils.loadClass("scala.collection.Iterable"); diff --git 
a/java/fury-core/src/main/java/org/apache/fury/type/TypeUtils.java b/java/fury-core/src/main/java/org/apache/fury/type/TypeUtils.java index 73861e5c24..b7462554b5 100644 --- a/java/fury-core/src/main/java/org/apache/fury/type/TypeUtils.java +++ b/java/fury-core/src/main/java/org/apache/fury/type/TypeUtils.java @@ -51,6 +51,7 @@ import org.apache.fury.reflect.TypeParameter; import org.apache.fury.reflect.TypeRef; import org.apache.fury.util.Preconditions; +import org.apache.fury.util.StringUtils; /** Type utils for common type inference and extraction. */ @SuppressWarnings({"UnstableApiUsage", "unchecked"}) @@ -734,4 +735,12 @@ public static boolean isEnumArray(Class clz) { } return getArrayComponent(clz).isEnum(); } + + public static String qualifiedName(String pkg, String className) { + if (StringUtils.isBlank(pkg)) { + return className; + } else { + return pkg + "." + className; + } + } } diff --git a/java/fury-core/src/main/java/org/apache/fury/type/Types.java b/java/fury-core/src/main/java/org/apache/fury/type/Types.java new file mode 100644 index 0000000000..87272a3af7 --- /dev/null +++ b/java/fury-core/src/main/java/org/apache/fury/type/Types.java @@ -0,0 +1,198 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.fury.type; + +public class Types { + + /** bool: a boolean value (true or false). */ + public static final int BOOL = 1; + + /** int8: an 8-bit signed integer. */ + public static final int INT8 = 2; + + /** int16: a 16-bit signed integer. */ + public static final int INT16 = 3; + + /** int32: a 32-bit signed integer. */ + public static final int INT32 = 4; + + /** var_int32: a 32-bit signed integer which uses fury var_int32 encoding. */ + public static final int VAR_INT32 = 5; + + /** int64: a 64-bit signed integer. */ + public static final int INT64 = 6; + + /** var_int64: a 64-bit signed integer which uses fury PVL encoding. */ + public static final int VAR_INT64 = 7; + + /** sli_int64: a 64-bit signed integer which uses fury SLI encoding. */ + public static final int SLI_INT64 = 8; + + /** float16: a 16-bit floating point number. */ + public static final int FLOAT16 = 9; + + /** float32: a 32-bit floating point number. */ + public static final int FLOAT32 = 10; + + /** float64: a 64-bit floating point number including NaN and Infinity. */ + public static final int FLOAT64 = 11; + + /** string: a text string encoded using Latin1/UTF16/UTF-8 encoding. */ + public static final int STRING = 12; + + /** + * enum: a data type consisting of a set of named values. Rust enum with non-predefined field + * values are not supported as an enum. + */ + public static final int ENUM = 13; + + /** named_enum: an enum whose value will be serialized as the registered name. */ + public static final int NAMED_ENUM = 14; + + /** + * a morphic(final) type serialized by Fury Struct serializer. i.e. it doesn't have subclasses. + * Suppose we're deserializing {@code List<SomeClass>}, we can save dynamic serializer dispatch + * since `SomeClass` is morphic(final). + */ + public static final int STRUCT = 15; + + /** + * a type which is polymorphic(not final). i.e.
it has subclasses. Suppose we're deserializing + * {@code List<SomeClass>}, we must dispatch serializer dynamically since `SomeClass` is + * polymorphic(non-final). + */ + public static final int POLYMORPHIC_STRUCT = 16; + + /** a morphic(final) type serialized by Fury compatible Struct serializer. */ + public static final int COMPATIBLE_STRUCT = 17; + + /** a non-morphic(non-final) type serialized by Fury compatible Struct serializer. */ + public static final int POLYMORPHIC_COMPATIBLE_STRUCT = 18; + + /** a `struct` whose type mapping will be encoded as a name. */ + public static final int NAMED_STRUCT = 19; + + /** a `polymorphic_struct` whose type mapping will be encoded as a name. */ + public static final int NAMED_POLYMORPHIC_STRUCT = 20; + + /** a `compatible_struct` whose type mapping will be encoded as a name. */ + public static final int NAMED_COMPATIBLE_STRUCT = 21; + + /** a `polymorphic_compatible_struct` whose type mapping will be encoded as a name. */ + public static final int NAMED_POLYMORPHIC_COMPATIBLE_STRUCT = 22; + + /** a type which will be serialized by a customized serializer. */ + public static final int EXT = 23; + + /** an `ext` type which is not morphic(not final). */ + public static final int POLYMORPHIC_EXT = 24; + + /** an `ext` type whose type mapping will be encoded as a name. */ + public static final int NAMED_EXT = 25; + + /** a `polymorphic_ext` type whose type mapping will be encoded as a name. */ + public static final int NAMED_POLYMORPHIC_EXT = 26; + + /** a sequence of objects. */ + public static final int LIST = 27; + + /** an unordered set of unique elements. */ + public static final int SET = 28; + + /** + * a map of key-value pairs. Mutable types such as `list/map/set/array/tensor/arrow` are not + * allowed as keys of a map. + */ + public static final int MAP = 29; + + /** + * an absolute length of time, independent of any calendar/timezone, as a count of nanoseconds.
+ */ + public static final int DURATION = 30; + + /** + * timestamp: a point in time, independent of any calendar/timezone, as a count of nanoseconds. + * The count is relative to an epoch at UTC midnight on January 1, 1970. + */ + public static final int TIMESTAMP = 31; + + /** + * a naive date without timezone. The count is days relative to an epoch at UTC midnight on Jan 1, + * 1970. + */ + public static final int LOCAL_DATE = 32; + + /** exact decimal value represented as an integer value in two's complement. */ + public static final int DECIMAL = 33; + + /** a variable-length array of bytes. */ + public static final int BINARY = 34; + + /** + * a multidimensional array which every sub-array can have different sizes but all have same type. + * only allow numeric components. Other arrays will be taken as List. The implementation should + * support the interoperability between array and list. + */ + public static final int ARRAY = 35; + + /** one dimensional bool array. */ + public static final int BOOL_ARRAY = 36; + + /** one dimensional int8 array. */ + public static final int INT8_ARRAY = 37; + + /** one dimensional int16 array. */ + public static final int INT16_ARRAY = 38; + + /** one dimensional int32 array. */ + public static final int INT32_ARRAY = 39; + + /** one dimensional int64 array. */ + public static final int INT64_ARRAY = 40; + + /** one dimensional half_float_16 array. */ + public static final int FLOAT16_ARRAY = 41; + + /** one dimensional float32 array. */ + public static final int FLOAT32_ARRAY = 42; + + /** one dimensional float64 array. */ + public static final int FLOAT64_ARRAY = 43; + + /** + * an (arrow record + * batch) object. + */ + public static final int ARROW_RECORD_BATCH = 44; + + /** an (arrow table) object.
*/ + public static final int ARROW_TABLE = 45; + + public static boolean isStructType(int value) { + return value == STRUCT + || value == POLYMORPHIC_STRUCT + || value == COMPATIBLE_STRUCT + || value == POLYMORPHIC_COMPATIBLE_STRUCT + || value == NAMED_STRUCT + || value == NAMED_POLYMORPHIC_STRUCT + || value == NAMED_COMPATIBLE_STRUCT + || value == NAMED_POLYMORPHIC_COMPATIBLE_STRUCT; + } +} diff --git a/java/fury-core/src/test/java/org/apache/fury/CrossLanguageTest.java b/java/fury-core/src/test/java/org/apache/fury/CrossLanguageTest.java index 8a85c678df..7ab2033afe 100644 --- a/java/fury-core/src/test/java/org/apache/fury/CrossLanguageTest.java +++ b/java/fury-core/src/test/java/org/apache/fury/CrossLanguageTest.java @@ -19,21 +19,26 @@ package org.apache.fury; +import static org.testng.Assert.assertEquals; + import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.hash.Hashing; import java.io.IOException; +import java.lang.reflect.Method; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.StandardOpenOption; +import java.sql.Timestamp; import java.time.Instant; import java.time.LocalDate; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; +import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashMap; @@ -41,6 +46,7 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.TreeMap; import java.util.TreeSet; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; @@ -49,20 +55,25 @@ import java.util.stream.Collectors; import java.util.stream.IntStream; import lombok.Data; +import org.apache.fury.config.FuryBuilder; import org.apache.fury.config.Language; import org.apache.fury.logging.Logger; import org.apache.fury.logging.LoggerFactory; 
import org.apache.fury.memory.MemoryBuffer; import org.apache.fury.memory.MemoryUtils; +import org.apache.fury.serializer.ArraySerializersTest; import org.apache.fury.serializer.BufferObject; +import org.apache.fury.serializer.EnumSerializerTest; import org.apache.fury.serializer.Serializer; +import org.apache.fury.serializer.StructSerializer; +import org.apache.fury.util.DateTimeUtils; import org.apache.fury.util.MurmurHash3; import org.testng.Assert; import org.testng.annotations.Test; /** Tests in this class need fury python installed. */ @Test -public class CrossLanguageTest { +public class CrossLanguageTest extends FuryTestBase { private static final Logger LOG = LoggerFactory.getLogger(CrossLanguageTest.class); private static final String PYTHON_MODULE = "pyfury.tests.test_cross_language"; private static final String PYTHON_EXECUTABLE = "python"; @@ -321,37 +332,35 @@ public void testCrossLanguagePreserveTypes() { .withRefTracking(true) .requireClassRegistration(false) .build(); - Assert.assertEquals( - new String[] {"str", "str"}, (Object[]) serDe(fury, new String[] {"str", "str"})); - Assert.assertEquals(new Object[] {"str", 1}, (Object[]) serDe(fury, new Object[] {"str", 1})); + Assert.assertEquals(new String[] {"str", "str"}, serDeTyped(fury, new String[] {"str", "str"})); + Assert.assertEquals(new Object[] {"str", 1}, serDeTyped(fury, new Object[] {"str", 1})); Assert.assertTrue( Arrays.deepEquals( - new Integer[][] {{1, 2}, {1, 2}}, - (Integer[][]) serDe(fury, new Integer[][] {{1, 2}, {1, 2}}))); + new Integer[][] {{1, 2}, {1, 2}}, serDeTyped(fury, new Integer[][] {{1, 2}, {1, 2}}))); - Assert.assertEquals(Arrays.asList(1, 2), serDe(fury, Arrays.asList(1, 2))); + Assert.assertEquals(Arrays.asList(1, 2), xserDe(fury, Arrays.asList(1, 2))); List arrayList = Arrays.asList("str", "str"); - Assert.assertEquals(arrayList, serDe(fury, arrayList)); - Assert.assertEquals(new LinkedList<>(arrayList), serDe(fury, new LinkedList<>(arrayList))); - 
Assert.assertEquals(new HashSet<>(arrayList), serDe(fury, new HashSet<>(arrayList))); + Assert.assertEquals(arrayList, xserDe(fury, arrayList)); + Assert.assertEquals(new LinkedList<>(arrayList), xserDe(fury, new LinkedList<>(arrayList))); + Assert.assertEquals(new HashSet<>(arrayList), xserDe(fury, new HashSet<>(arrayList))); TreeSet treeSet = new TreeSet<>(Comparator.naturalOrder()); treeSet.add("str1"); treeSet.add("str2"); - Assert.assertEquals(treeSet, serDe(fury, treeSet)); + Assert.assertEquals(treeSet, xserDe(fury, treeSet)); HashMap hashMap = new HashMap<>(); hashMap.put("k1", 1); hashMap.put("k2", 2); - Assert.assertEquals(hashMap, serDe(fury, hashMap)); - Assert.assertEquals(new LinkedHashMap<>(hashMap), serDe(fury, new LinkedHashMap<>(hashMap))); - Assert.assertEquals(Collections.EMPTY_LIST, serDe(fury, Collections.EMPTY_LIST)); - Assert.assertEquals(Collections.EMPTY_SET, serDe(fury, Collections.EMPTY_SET)); - Assert.assertEquals(Collections.EMPTY_MAP, serDe(fury, Collections.EMPTY_MAP)); + Assert.assertEquals(hashMap, xserDe(fury, hashMap)); + Assert.assertEquals(new LinkedHashMap<>(hashMap), xserDe(fury, new LinkedHashMap<>(hashMap))); + Assert.assertEquals(Collections.EMPTY_LIST, xserDe(fury, Collections.EMPTY_LIST)); + Assert.assertEquals(Collections.EMPTY_SET, xserDe(fury, Collections.EMPTY_SET)); + Assert.assertEquals(Collections.EMPTY_MAP, xserDe(fury, Collections.EMPTY_MAP)); Assert.assertEquals( - Collections.singletonList("str"), serDe(fury, Collections.singletonList("str"))); - Assert.assertEquals(Collections.singleton("str"), serDe(fury, Collections.singleton("str"))); + Collections.singletonList("str"), xserDe(fury, Collections.singletonList("str"))); + Assert.assertEquals(Collections.singleton("str"), xserDe(fury, Collections.singleton("str"))); Assert.assertEquals( - Collections.singletonMap("k", 1), serDe(fury, Collections.singletonMap("k", 1))); + Collections.singletonMap("k", 1), xserDe(fury, Collections.singletonMap("k", 1))); } 
@SuppressWarnings("unchecked") @@ -389,7 +398,7 @@ private void assertStringEquals(Object actual, Object expected, boolean useToStr } } - private Object serDe(Fury fury, Object obj) { + private Object xserDe(Fury fury, Object obj) { byte[] bytes = fury.serialize(obj); return fury.deserialize(bytes); } @@ -460,6 +469,24 @@ public static class ComplexObject2 { Map f2; } + @Test + public void testStructHash() throws Exception { + Fury fury = + Fury.builder() + .withLanguage(Language.XLANG) + .withRefTracking(true) + .requireClassRegistration(false) + .build(); + fury.register(ComplexObject1.class, "test.ComplexObject1"); + StructSerializer serializer = (StructSerializer) fury.getSerializer(ComplexObject1.class); + Method method = StructSerializer.class.getDeclaredMethod("computeStructHash"); + method.setAccessible(true); + Integer hash = (Integer) method.invoke(serializer); + MemoryBuffer buffer = MemoryBuffer.newHeapBuffer(4); + buffer.writeInt32(hash); + roundBytes("test_struct_hash", buffer.getBytes(0, 4)); + } + @Test public void testSerializeSimpleStruct() throws Exception { Fury fury = @@ -519,59 +546,6 @@ private void structRoundBack(Fury fury, Object obj, String testName) throws IOEx Assert.assertEquals(fury.deserialize(Files.readAllBytes(dataFile)), obj); } - @Test - public void testSerializeOpaqueObjectSimple() { - Fury fury = - Fury.builder() - .withLanguage(Language.XLANG) - .withRefTracking(true) - .requireClassRegistration(false) - .build(); - fury.register(ComplexObject2.class, "test.ComplexObject2"); - ComplexObject2 obj = new ComplexObject2(); - obj.f1 = Foo.create(); - byte[] serialized = fury.serialize(obj); - Assert.assertEquals(fury.deserialize(serialized), obj); - } - - @Test - public void testSerializeOpaqueObject() throws Exception { - Fury fury = - Fury.builder() - .withLanguage(Language.XLANG) - .withRefTracking(true) - .requireClassRegistration(false) - .build(); - fury.register(ComplexObject1.class, "test.ComplexObject1"); - // don't 
register ComplexObject2/Foo to make them serialize as opaque blobs. - ComplexObject1 obj = new ComplexObject1(); - obj.f1 = new ComplexObject2(); - ((ComplexObject2) obj.f1).f1 = true; - ((ComplexObject2) obj.f1).f2 = new HashMap<>(ImmutableMap.of((byte) -1, 2)); - obj.f2 = "abc"; - obj.f3 = Arrays.asList(obj.f1, Foo.create()); - byte[] serialized = fury.serialize(obj); - Assert.assertEquals(fury.deserialize(serialized), obj); - - Path dataFile = Files.createTempFile("test_serialize_opaque_object", "data"); - ImmutableList command = - ImmutableList.of( - PYTHON_EXECUTABLE, - "-m", - PYTHON_MODULE, - "test_serialize_opaque_object", - dataFile.toAbsolutePath().toString()); - Files.write(dataFile, serialized); - Assert.assertTrue(executeCommand(command, 30)); - // TODO support OpaqueObject itself serialization - // ComplexObject1 newObj = (ComplexObject1) fury.deserialize(Files.readAllBytes(dataFile)); - // assertEquals(newObj.f1.getClass(), OpaqueObjects.OpaqueObject.class); - // assertEquals(newObj.f2, obj.f2); - // assertNotNull(newObj.f3); - // assertTrue(newObj.f3.get(0) instanceof OpaqueObjects.OpaqueObject); - // assertTrue(newObj.f3.get(1) instanceof OpaqueObjects.OpaqueObject); - } - private static class ComplexObject1Serializer extends Serializer { public ComplexObject1Serializer(Fury fury, Class cls) { @@ -588,16 +562,6 @@ public ComplexObject1 read(MemoryBuffer buffer) { return xread(buffer); } - @Override - public short getXtypeId() { - return Fury.FURY_TYPE_TAG_ID; - } - - @Override - public String getCrossLanguageTypeTag() { - return "test.ComplexObject1"; - } - @Override public void xwrite(MemoryBuffer buffer, ComplexObject1 value) { fury.xwriteRef(buffer, value.f1); @@ -625,6 +589,7 @@ public void testRegisterSerializer() throws Exception { .withRefTracking(true) .requireClassRegistration(false) .build(); + fury.register(ComplexObject1.class, "test.ComplexObject1"); fury.registerSerializer(ComplexObject1.class, ComplexObject1Serializer.class); 
ComplexObject1 obj = new ComplexObject1(); obj.f1 = true; @@ -645,6 +610,19 @@ public void testRegisterSerializer() throws Exception { Assert.assertEquals(fury.deserialize(Files.readAllBytes(dataFile)), obj); } + private byte[] roundBytes(String testName, byte[] bytes) throws IOException { + Path dataFile = Paths.get(testName); + System.out.println(dataFile.toAbsolutePath()); + Files.deleteIfExists(dataFile); + Files.write(dataFile, bytes); + dataFile.toFile().deleteOnExit(); + ImmutableList command = + ImmutableList.of( + PYTHON_EXECUTABLE, "-m", PYTHON_MODULE, testName, dataFile.toAbsolutePath().toString()); + Assert.assertTrue(executeCommand(command, 30)); + return Files.readAllBytes(dataFile); + } + @Test public void testOutOfBandBuffer() throws Exception { Fury fury = @@ -734,6 +712,69 @@ public void testStructArrayField() { a.a = "123"; ArrayStruct struct = new ArrayStruct(); struct.f1 = new ArrayField[] {a}; - Assert.assertEquals(serDe(fury, struct), struct); + Assert.assertEquals(xserDe(fury, struct), struct); + } + + @Test(dataProvider = "referenceTrackingConfig") + public void basicTest(boolean referenceTracking) { + FuryBuilder builder = + Fury.builder() + .withLanguage(Language.XLANG) + .withRefTracking(referenceTracking) + .requireClassRegistration(false); + Fury fury1 = builder.build(); + Fury fury2 = builder.build(); + assertEquals("str", serDe(fury1, fury2, "str")); + assertEquals("str", serDeObject(fury1, fury2, new StringBuilder("str")).toString()); + assertEquals("str", serDeObject(fury1, fury2, new StringBuffer("str")).toString()); + fury1.register(EnumSerializerTest.EnumFoo.class); + fury2.register(EnumSerializerTest.EnumFoo.class); + fury1.register(EnumSerializerTest.EnumSubClass.class); + fury2.register(EnumSerializerTest.EnumSubClass.class); + assertEquals(EnumSerializerTest.EnumFoo.A, serDe(fury1, fury2, EnumSerializerTest.EnumFoo.A)); + assertEquals(EnumSerializerTest.EnumFoo.B, serDe(fury1, fury2, EnumSerializerTest.EnumFoo.B)); + 
assertEquals( + EnumSerializerTest.EnumSubClass.A, serDe(fury1, fury2, EnumSerializerTest.EnumSubClass.A)); + assertEquals( + EnumSerializerTest.EnumSubClass.B, serDe(fury1, fury2, EnumSerializerTest.EnumSubClass.B)); + java.sql.Date sqlDate = new java.sql.Date(System.currentTimeMillis()); + assertEquals(sqlDate, serDeTyped(fury1, fury2, sqlDate)); + LocalDate localDate = LocalDate.now(); + assertEquals(localDate, serDeTyped(fury1, fury2, localDate)); + Date utilDate = new Date(); + assertEquals(utilDate, serDeTyped(fury1, fury2, utilDate)); + Timestamp timestamp = new Timestamp(System.currentTimeMillis()); + assertEquals(timestamp, serDeTyped(fury1, fury2, timestamp)); + Instant instant = DateTimeUtils.truncateInstantToMicros(Instant.now()); + assertEquals(instant, serDeTyped(fury1, fury2, instant)); + + ArraySerializersTest.testPrimitiveArray(fury1, fury2); + + assertEquals(Arrays.asList(1, 2), serDe(fury1, fury2, Arrays.asList(1, 2))); + List arrayList = Arrays.asList("str", "str"); + assertEquals(arrayList, serDe(fury1, fury2, arrayList)); + assertEquals(new LinkedList<>(arrayList), serDe(fury1, fury2, new LinkedList<>(arrayList))); + assertEquals(new HashSet<>(arrayList), serDe(fury1, fury2, new HashSet<>(arrayList))); + TreeSet treeSet = new TreeSet<>(Comparator.naturalOrder()); + treeSet.add("str1"); + treeSet.add("str2"); + assertEquals(treeSet, serDe(fury1, fury2, treeSet)); + + HashMap hashMap = new HashMap<>(); + hashMap.put("k1", 1); + hashMap.put("k2", 2); + assertEquals(hashMap, serDe(fury1, fury2, hashMap)); + assertEquals(new LinkedHashMap<>(hashMap), serDe(fury1, fury2, new LinkedHashMap<>(hashMap))); + TreeMap treeMap = new TreeMap<>(Comparator.naturalOrder()); + treeMap.putAll(hashMap); + assertEquals(treeMap, serDe(fury1, fury2, treeMap)); + assertEquals(Collections.EMPTY_LIST, serDe(fury1, fury2, Collections.EMPTY_LIST)); + assertEquals(Collections.EMPTY_SET, serDe(fury1, fury2, Collections.EMPTY_SET)); + assertEquals(Collections.EMPTY_MAP, 
serDe(fury1, fury2, Collections.EMPTY_MAP)); + assertEquals( + Collections.singletonList("str"), serDe(fury1, fury2, Collections.singletonList("str"))); + assertEquals(Collections.singleton("str"), serDe(fury1, fury2, Collections.singleton("str"))); + assertEquals( + Collections.singletonMap("k", 1), serDe(fury1, fury2, Collections.singletonMap("k", 1))); } } diff --git a/java/fury-core/src/test/java/org/apache/fury/FuryTest.java b/java/fury-core/src/test/java/org/apache/fury/FuryTest.java index 30e845f117..127529961b 100644 --- a/java/fury-core/src/test/java/org/apache/fury/FuryTest.java +++ b/java/fury-core/src/test/java/org/apache/fury/FuryTest.java @@ -105,13 +105,10 @@ public void primitivesTest(boolean referenceTracking, Language language) { assertEquals(Double.MAX_VALUE, serDe(fury1, fury2, Double.MAX_VALUE)); } - @Test(dataProvider = "crossLanguageReferenceTrackingConfig") - public void basicTest(boolean referenceTracking, Language language) { + @Test(dataProvider = "referenceTrackingConfig") + public void basicTest(boolean referenceTracking) { FuryBuilder builder = - Fury.builder() - .withLanguage(language) - .withRefTracking(referenceTracking) - .requireClassRegistration(false); + Fury.builder().withRefTracking(referenceTracking).requireClassRegistration(false); Fury fury1 = builder.build(); Fury fury2 = builder.build(); assertEquals("str", serDe(fury1, fury2, "str")); @@ -193,6 +190,12 @@ public void assertSerializationToBuffer(Fury fury1, Fury fury2, MemoryBuffer buf assertEquals(Short.MAX_VALUE, serDeCheckIndex(fury1, fury2, buffer, Short.MAX_VALUE)); assertEquals("str", serDeCheckIndex(fury1, fury2, buffer, "str")); assertEquals("str", serDeCheckIndex(fury1, fury2, buffer, new StringBuilder("str")).toString()); + if (fury1.getLanguage() != Language.JAVA) { + fury1.register(EnumSerializerTest.EnumFoo.class); + fury2.register(EnumSerializerTest.EnumFoo.class); + fury1.register(EnumSerializerTest.EnumSubClass.class); + 
fury2.register(EnumSerializerTest.EnumSubClass.class); + } assertEquals( EnumSerializerTest.EnumFoo.A, serDeCheckIndex(fury1, fury2, buffer, EnumSerializerTest.EnumFoo.A)); diff --git a/java/fury-core/src/test/java/org/apache/fury/FuryTestBase.java b/java/fury-core/src/test/java/org/apache/fury/FuryTestBase.java index 3242fa908d..a33c5b9528 100644 --- a/java/fury-core/src/test/java/org/apache/fury/FuryTestBase.java +++ b/java/fury-core/src/test/java/org/apache/fury/FuryTestBase.java @@ -247,11 +247,22 @@ public static T serDeCheckSerializer(Fury fury, Object obj, String classRege return (T) fury.deserialize(bytes); } + public static Object serDeObject(Fury fury1, Fury fury2, Object obj) { + byte[] bytes = fury1.serialize(obj); + return fury2.deserialize(bytes); + } + public static T serDe(Fury fury1, Fury fury2, T obj) { byte[] bytes = fury1.serialize(obj); return (T) fury2.deserialize(bytes); } + public static T serDeCheckTyped(Fury fury1, Fury fury2, T obj) { + T o = serDeTyped(fury1, fury2, obj); + Assert.assertEquals(o, obj); + return o; + } + public static T serDeCheck(Fury fury1, Fury fury2, T obj) { T o = serDe(fury1, fury2, obj); Assert.assertEquals(o, obj); @@ -286,6 +297,18 @@ public static Object serDeCheckIndex(Fury fury1, Fury fury2, MemoryBuffer buffer return newObj; } + @SuppressWarnings("unchecked") + public static T serDeTyped(Fury fury, T obj) { + byte[] bytes = fury.serialize(obj); + return (T) fury.deserialize(bytes, obj.getClass()); + } + + @SuppressWarnings("unchecked") + public static T serDeTyped(Fury fury1, Fury fury2, T obj) { + byte[] bytes = fury1.serialize(obj); + return (T) fury2.deserialize(bytes, obj.getClass()); + } + public static void copyCheck(Fury fury, Object obj) { Object copy = fury.copy(obj); Assert.assertEquals(obj, copy); diff --git a/java/fury-core/src/test/java/org/apache/fury/serializer/ArraySerializersTest.java b/java/fury-core/src/test/java/org/apache/fury/serializer/ArraySerializersTest.java index 
b4db601dcf..6eb438b41e 100644 --- a/java/fury-core/src/test/java/org/apache/fury/serializer/ArraySerializersTest.java +++ b/java/fury-core/src/test/java/org/apache/fury/serializer/ArraySerializersTest.java @@ -52,26 +52,30 @@ public void testObjectArraySerialization(boolean referenceTracking, Language lan .requireClassRegistration(false); Fury fury1 = builder.build(); Fury fury2 = builder.build(); - serDeCheck(fury1, fury2, new Object[] {false, true}); - serDeCheck(fury1, fury2, new Object[] {(byte) 1, (byte) 1}); - serDeCheck(fury1, fury2, new Object[] {(short) 1, (short) 1}); - serDeCheck(fury1, fury2, new Object[] {(char) 1, (char) 1}); - serDeCheck(fury1, fury2, new Object[] {1, 1}); - serDeCheck(fury1, fury2, new Object[] {(float) 1.0, (float) 1.1}); - serDeCheck(fury1, fury2, new Object[] {1.0, 1.1}); - serDeCheck(fury1, fury2, new Object[] {1L, 2L}); - serDeCheck(fury1, fury2, new Boolean[] {false, true}); - serDeCheck(fury1, fury2, new Byte[] {(byte) 1, (byte) 1}); - serDeCheck(fury1, fury2, new Short[] {(short) 1, (short) 1}); - serDeCheck(fury1, fury2, new Character[] {(char) 1, (char) 1}); - serDeCheck(fury1, fury2, new Integer[] {1, 1}); - serDeCheck(fury1, fury2, new Float[] {(float) 1.0, (float) 1.1}); - serDeCheck(fury1, fury2, new Double[] {1.0, 1.1}); - serDeCheck(fury1, fury2, new Long[] {1L, 2L}); - serDeCheck( + serDeCheckTyped(fury1, fury2, new Object[] {false, true}); + serDeCheckTyped(fury1, fury2, new Object[] {(byte) 1, (byte) 1}); + serDeCheckTyped(fury1, fury2, new Object[] {(short) 1, (short) 1}); + if (language == Language.JAVA) { + serDeCheckTyped(fury1, fury2, new Object[] {(char) 1, (char) 1}); + } + serDeCheckTyped(fury1, fury2, new Object[] {1, 1}); + serDeCheckTyped(fury1, fury2, new Object[] {(float) 1.0, (float) 1.1}); + serDeCheckTyped(fury1, fury2, new Object[] {1.0, 1.1}); + serDeCheckTyped(fury1, fury2, new Object[] {1L, 2L}); + serDeCheckTyped(fury1, fury2, new Boolean[] {false, true}); + serDeCheckTyped(fury1, fury2, new 
Byte[] {(byte) 1, (byte) 1}); + serDeCheckTyped(fury1, fury2, new Short[] {(short) 1, (short) 1}); + if (language == Language.JAVA) { + serDeCheckTyped(fury1, fury2, new Character[] {(char) 1, (char) 1}); + } + serDeCheckTyped(fury1, fury2, new Integer[] {1, 1}); + serDeCheckTyped(fury1, fury2, new Float[] {(float) 1.0, (float) 1.1}); + serDeCheckTyped(fury1, fury2, new Double[] {1.0, 1.1}); + serDeCheckTyped(fury1, fury2, new Long[] {1L, 2L}); + serDeCheckTyped( fury1, fury2, new Object[] {false, true, (byte) 1, (byte) 1, (float) 1.0, (float) 1.1}); - serDeCheck(fury1, fury2, new String[] {"str", "str"}); - serDeCheck(fury1, fury2, new Object[] {"str", 1}); + serDeCheckTyped(fury1, fury2, new String[] {"str", "str"}); + serDeCheckTyped(fury1, fury2, new Object[] {"str", 1}); } @Test(dataProvider = "furyCopyConfig") @@ -106,15 +110,15 @@ public void testMultiArraySerialization(boolean referenceTracking, Language lang .requireClassRegistration(false); Fury fury1 = builder.build(); Fury fury2 = builder.build(); - serDeCheck(fury1, fury2, new Object[][] {{false, true}, {false, true}}); - serDeCheck( + serDeCheckTyped(fury1, fury2, new Object[][] {{false, true}, {false, true}}); + serDeCheckTyped( fury1, fury2, new Object[][] { {false, true, (byte) 1, (byte) 1, (float) 1.0, (float) 1.1}, {false, true, (byte) 1, (byte) 1, (float) 1.0, (float) 1.1} }); - serDeCheck(fury1, fury2, new Integer[][] {{1, 2}, {1, 2}}); + serDeCheckTyped(fury1, fury2, new Integer[][] {{1, 2}, {1, 2}}); } @Test(dataProvider = "furyCopyConfig") diff --git a/java/fury-core/src/test/java/org/apache/fury/serializer/EnumSerializerTest.java b/java/fury-core/src/test/java/org/apache/fury/serializer/EnumSerializerTest.java index bec2323535..7a5c350d49 100644 --- a/java/fury-core/src/test/java/org/apache/fury/serializer/EnumSerializerTest.java +++ b/java/fury-core/src/test/java/org/apache/fury/serializer/EnumSerializerTest.java @@ -64,6 +64,12 @@ public void testEnumSerialization(boolean 
referenceTracking, Language language) .requireClassRegistration(false); Fury fury1 = builder.build(); Fury fury2 = builder.build(); + if (fury1.getLanguage() != Language.JAVA) { + fury1.register(EnumSerializerTest.EnumFoo.class); + fury2.register(EnumSerializerTest.EnumFoo.class); + fury1.register(EnumSerializerTest.EnumSubClass.class); + fury2.register(EnumSerializerTest.EnumSubClass.class); + } assertEquals(EnumFoo.A, serDe(fury1, fury2, EnumFoo.A)); assertEquals(EnumFoo.B, serDe(fury1, fury2, EnumFoo.B)); assertEquals(EnumSubClass.A, serDe(fury1, fury2, EnumSubClass.A)); diff --git a/java/fury-core/src/test/java/org/apache/fury/serializer/SerializersTest.java b/java/fury-core/src/test/java/org/apache/fury/serializer/SerializersTest.java index fd72cb8f84..a2c5bd609f 100644 --- a/java/fury-core/src/test/java/org/apache/fury/serializer/SerializersTest.java +++ b/java/fury-core/src/test/java/org/apache/fury/serializer/SerializersTest.java @@ -57,17 +57,14 @@ public void testStringBuilder(boolean referenceTracking, Language language) { Fury fury1 = builder.build(); Fury fury2 = builder.build(); assertEquals("str", serDe(fury1, fury2, "str")); - assertEquals("str", serDe(fury1, fury2, new StringBuilder("str")).toString()); - assertEquals("str", serDe(fury1, fury2, new StringBuffer("str")).toString()); + assertEquals("str", serDeObject(fury1, fury2, new StringBuilder("str")).toString()); + assertEquals("str", serDeObject(fury1, fury2, new StringBuffer("str")).toString()); } - @Test(dataProvider = "crossLanguageReferenceTrackingConfig") - public void testBigInt(boolean referenceTracking, Language language) { + @Test(dataProvider = "referenceTrackingConfig") + public void testBigInt(boolean referenceTracking) { FuryBuilder builder = - Fury.builder() - .withLanguage(language) - .withRefTracking(referenceTracking) - .requireClassRegistration(false); + Fury.builder().withRefTracking(referenceTracking).requireClassRegistration(false); Fury fury1 = builder.build(); Fury fury2 
= builder.build(); assertEquals(BigInteger.valueOf(100), serDe(fury1, fury2, BigInteger.valueOf(100))); diff --git a/java/fury-core/src/main/java/org/apache/fury/type/Type.java b/java/fury-format/src/main/java/org/apache/fury/format/type/ArrowType.java similarity index 97% rename from java/fury-core/src/main/java/org/apache/fury/type/Type.java rename to java/fury-format/src/main/java/org/apache/fury/format/type/ArrowType.java index 24e17579e7..cc41567277 100644 --- a/java/fury-core/src/main/java/org/apache/fury/type/Type.java +++ b/java/fury-format/src/main/java/org/apache/fury/format/type/ArrowType.java @@ -17,12 +17,12 @@ * under the License. */ -package org.apache.fury.type; +package org.apache.fury.format.type; import org.apache.fury.util.Preconditions; /** Keep in sync with Type::type in arrow/type_fwd.h */ -public enum Type { +public enum ArrowType { /// A NULL type having no physical storage NA, // NA = 0 @@ -164,12 +164,12 @@ public enum Type { private short id; - Type() { + ArrowType() { Preconditions.checkArgument(ordinal() < Short.MAX_VALUE); this.id = (short) ordinal(); } - Type(int id) { + ArrowType(int id) { Preconditions.checkArgument(id < Short.MAX_VALUE && id >= 0); this.id = (short) id; } diff --git a/java/fury-format/src/main/java/org/apache/fury/format/type/DataTypes.java b/java/fury-format/src/main/java/org/apache/fury/format/type/DataTypes.java index 3bb83c4085..450c83718f 100644 --- a/java/fury-format/src/main/java/org/apache/fury/format/type/DataTypes.java +++ b/java/fury-format/src/main/java/org/apache/fury/format/type/DataTypes.java @@ -36,259 +36,268 @@ import org.apache.arrow.vector.types.DateUnit; import org.apache.arrow.vector.types.FloatingPointPrecision; import org.apache.arrow.vector.types.TimeUnit; -import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.types.pojo.Schema; import 
org.apache.fury.exception.FuryException; import org.apache.fury.io.MemoryBufferOutputStream; import org.apache.fury.memory.MemoryBuffer; -import org.apache.fury.type.Type; import org.apache.fury.util.DecimalUtils; import org.apache.fury.util.Preconditions; /** Arrow data type utils. */ public class DataTypes { - public static Field PRIMITIVE_BOOLEAN_ARRAY_FIELD = primitiveArrayField(ArrowType.Bool.INSTANCE); + public static Field PRIMITIVE_BOOLEAN_ARRAY_FIELD = + primitiveArrayField(org.apache.arrow.vector.types.pojo.ArrowType.Bool.INSTANCE); public static Field PRIMITIVE_BYTE_ARRAY_FIELD = primitiveArrayField(intType(8)); public static Field PRIMITIVE_SHORT_ARRAY_FIELD = primitiveArrayField(intType(16)); public static Field PRIMITIVE_INT_ARRAY_FIELD = primitiveArrayField(intType(32)); public static Field PRIMITIVE_LONG_ARRAY_FIELD = primitiveArrayField(intType(64)); public static Field PRIMITIVE_FLOAT_ARRAY_FIELD = - primitiveArrayField(new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)); + primitiveArrayField( + new org.apache.arrow.vector.types.pojo.ArrowType.FloatingPoint( + FloatingPointPrecision.SINGLE)); public static Field PRIMITIVE_DOUBLE_ARRAY_FIELD = - primitiveArrayField(new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)); + primitiveArrayField( + new org.apache.arrow.vector.types.pojo.ArrowType.FloatingPoint( + FloatingPointPrecision.DOUBLE)); // Array item field default name public static final String ARRAY_ITEM_NAME = "item"; - private static final ArrowType.ArrowTypeVisitor typeWidthVisitor = - new DefaultTypeVisitor() { + private static final org.apache.arrow.vector.types.pojo.ArrowType.ArrowTypeVisitor + typeWidthVisitor = + new DefaultTypeVisitor() { - @Override - public Integer visit(ArrowType.Struct type) { - return -1; - } + @Override + public Integer visit(org.apache.arrow.vector.types.pojo.ArrowType.Struct type) { + return -1; + } - @Override - public Integer visit(ArrowType.List type) { - return -1; - } + @Override + public 
Integer visit(org.apache.arrow.vector.types.pojo.ArrowType.List type) { + return -1; + } - @Override - public Integer visit(ArrowType.Map type) { - return -1; - } + @Override + public Integer visit(org.apache.arrow.vector.types.pojo.ArrowType.Map type) { + return -1; + } - @Override - public Integer visit(ArrowType.Bool type) { - return 1; - } + @Override + public Integer visit(org.apache.arrow.vector.types.pojo.ArrowType.Bool type) { + return 1; + } - @Override - public Integer visit(ArrowType.Int type) { - return type.getBitWidth() / 8; - } + @Override + public Integer visit(org.apache.arrow.vector.types.pojo.ArrowType.Int type) { + return type.getBitWidth() / 8; + } - @Override - public Integer visit(ArrowType.FloatingPoint type) { - switch (type.getPrecision()) { - case SINGLE: - return 4; - case DOUBLE: - return 8; - default: - return unsupported(type); - } - } + @Override + public Integer visit(org.apache.arrow.vector.types.pojo.ArrowType.FloatingPoint type) { + switch (type.getPrecision()) { + case SINGLE: + return 4; + case DOUBLE: + return 8; + default: + return unsupported(type); + } + } - @Override - public Integer visit(ArrowType.Date type) { - return 4; - } + @Override + public Integer visit(org.apache.arrow.vector.types.pojo.ArrowType.Date type) { + return 4; + } - @Override - public Integer visit(ArrowType.Timestamp type) { - return 8; - } + @Override + public Integer visit(org.apache.arrow.vector.types.pojo.ArrowType.Timestamp type) { + return 8; + } - @Override - public Integer visit(ArrowType.Binary type) { - return -1; - } + @Override + public Integer visit(org.apache.arrow.vector.types.pojo.ArrowType.Binary type) { + return -1; + } - @Override - public Integer visit(ArrowType.Decimal type) { - return -1; - } + @Override + public Integer visit(org.apache.arrow.vector.types.pojo.ArrowType.Decimal type) { + return -1; + } - @Override - public Integer visit(ArrowType.Utf8 type) { - return -1; - } - }; + @Override + public Integer 
visit(org.apache.arrow.vector.types.pojo.ArrowType.Utf8 type) { + return -1; + } + }; - private static final ArrowType.ArrowTypeVisitor typeIdVisitor = - new DefaultTypeVisitor() { + private static final org.apache.arrow.vector.types.pojo.ArrowType.ArrowTypeVisitor + typeIdVisitor = + new DefaultTypeVisitor() { - @Override - public Type visit(ArrowType.Bool type) { - return Type.BOOL; - } - - @Override - public Type visit(ArrowType.Int type) { - if (type.getIsSigned()) { - int byteWidth = type.getBitWidth() / 8; - switch (byteWidth) { - case 1: - return Type.INT8; - case 2: - return Type.INT16; - case 4: - return Type.INT32; - case 8: - return Type.INT64; - default: - return unsupported(type); + @Override + public ArrowType visit(org.apache.arrow.vector.types.pojo.ArrowType.Bool type) { + return ArrowType.BOOL; } - } - return unsupported(type); - } - @Override - public Type visit(ArrowType.FloatingPoint type) { - switch (type.getPrecision()) { - case SINGLE: - return Type.FLOAT; - case DOUBLE: - return Type.DOUBLE; - default: + @Override + public ArrowType visit(org.apache.arrow.vector.types.pojo.ArrowType.Int type) { + if (type.getIsSigned()) { + int byteWidth = type.getBitWidth() / 8; + switch (byteWidth) { + case 1: + return ArrowType.INT8; + case 2: + return ArrowType.INT16; + case 4: + return ArrowType.INT32; + case 8: + return ArrowType.INT64; + default: + return unsupported(type); + } + } return unsupported(type); - } - } + } - @Override - public Type visit(ArrowType.Date type) { - switch (type.getUnit()) { - case DAY: - return Type.DATE32; - case MILLISECOND: - return Type.DATE64; - default: - return unsupported(type); - } - } + @Override + public ArrowType visit( + org.apache.arrow.vector.types.pojo.ArrowType.FloatingPoint type) { + switch (type.getPrecision()) { + case SINGLE: + return ArrowType.FLOAT; + case DOUBLE: + return ArrowType.DOUBLE; + default: + return unsupported(type); + } + } - @Override - public Type visit(ArrowType.Timestamp type) { - 
return Type.TIMESTAMP; - } + @Override + public ArrowType visit(org.apache.arrow.vector.types.pojo.ArrowType.Date type) { + switch (type.getUnit()) { + case DAY: + return ArrowType.DATE32; + case MILLISECOND: + return ArrowType.DATE64; + default: + return unsupported(type); + } + } - @Override - public Type visit(ArrowType.Binary type) { - return Type.BINARY; - } + @Override + public ArrowType visit(org.apache.arrow.vector.types.pojo.ArrowType.Timestamp type) { + return ArrowType.TIMESTAMP; + } - @Override - public Type visit(ArrowType.Decimal type) { - return Type.DECIMAL; - } + @Override + public ArrowType visit(org.apache.arrow.vector.types.pojo.ArrowType.Binary type) { + return ArrowType.BINARY; + } - @Override - public Type visit(ArrowType.Utf8 type) { - return Type.STRING; - } + @Override + public ArrowType visit(org.apache.arrow.vector.types.pojo.ArrowType.Decimal type) { + return ArrowType.DECIMAL; + } - @Override - public Type visit(ArrowType.Struct type) { - return Type.STRUCT; - } + @Override + public ArrowType visit(org.apache.arrow.vector.types.pojo.ArrowType.Utf8 type) { + return ArrowType.STRING; + } - @Override - public Type visit(ArrowType.List type) { - return Type.LIST; - } + @Override + public ArrowType visit(org.apache.arrow.vector.types.pojo.ArrowType.Struct type) { + return ArrowType.STRUCT; + } - @Override - public Type visit(ArrowType.Map type) { - return Type.MAP; - } - }; + @Override + public ArrowType visit(org.apache.arrow.vector.types.pojo.ArrowType.List type) { + return ArrowType.LIST; + } - public static int getTypeWidth(ArrowType type) { + @Override + public ArrowType visit(org.apache.arrow.vector.types.pojo.ArrowType.Map type) { + return ArrowType.MAP; + } + }; + + public static int getTypeWidth(org.apache.arrow.vector.types.pojo.ArrowType type) { return type.accept(typeWidthVisitor); } - public static Type getTypeId(ArrowType type) { + public static ArrowType getTypeId(org.apache.arrow.vector.types.pojo.ArrowType type) { return 
type.accept(typeIdVisitor); } - public static short getTypeIdValue(ArrowType type) { + public static short getTypeIdValue(org.apache.arrow.vector.types.pojo.ArrowType type) { return type.accept(typeIdVisitor).getId(); } - public static ArrowType.Bool bool() { - return ArrowType.Bool.INSTANCE; + public static org.apache.arrow.vector.types.pojo.ArrowType.Bool bool() { + return org.apache.arrow.vector.types.pojo.ArrowType.Bool.INSTANCE; } - public static ArrowType.Int intType(int bitWidth) { - return new ArrowType.Int(bitWidth, true); + public static org.apache.arrow.vector.types.pojo.ArrowType.Int intType(int bitWidth) { + return new org.apache.arrow.vector.types.pojo.ArrowType.Int(bitWidth, true); } - public static ArrowType.Int int8() { + public static org.apache.arrow.vector.types.pojo.ArrowType.Int int8() { return intType(8); } - public static ArrowType.Int int16() { + public static org.apache.arrow.vector.types.pojo.ArrowType.Int int16() { return intType(16); } - public static ArrowType.Int int32() { + public static org.apache.arrow.vector.types.pojo.ArrowType.Int int32() { return intType(32); } - public static ArrowType.Int int64() { + public static org.apache.arrow.vector.types.pojo.ArrowType.Int int64() { return intType(64); } - public static ArrowType.FloatingPoint float32() { - return new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE); + public static org.apache.arrow.vector.types.pojo.ArrowType.FloatingPoint float32() { + return new org.apache.arrow.vector.types.pojo.ArrowType.FloatingPoint( + FloatingPointPrecision.SINGLE); } - public static ArrowType.FloatingPoint float64() { - return new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE); + public static org.apache.arrow.vector.types.pojo.ArrowType.FloatingPoint float64() { + return new org.apache.arrow.vector.types.pojo.ArrowType.FloatingPoint( + FloatingPointPrecision.DOUBLE); } - public static ArrowType.Date date32() { - return new ArrowType.Date(DateUnit.DAY); + public static 
org.apache.arrow.vector.types.pojo.ArrowType.Date date32() { + return new org.apache.arrow.vector.types.pojo.ArrowType.Date(DateUnit.DAY); } - public static ArrowType.Date date64() { - return new ArrowType.Date(DateUnit.MILLISECOND); + public static org.apache.arrow.vector.types.pojo.ArrowType.Date date64() { + return new org.apache.arrow.vector.types.pojo.ArrowType.Date(DateUnit.MILLISECOND); } - public static ArrowType.Timestamp timestamp() { - return new ArrowType.Timestamp(TimeUnit.MICROSECOND, null); + public static org.apache.arrow.vector.types.pojo.ArrowType.Timestamp timestamp() { + return new org.apache.arrow.vector.types.pojo.ArrowType.Timestamp(TimeUnit.MICROSECOND, null); } - public static ArrowType.Binary binary() { - return ArrowType.Binary.INSTANCE; + public static org.apache.arrow.vector.types.pojo.ArrowType.Binary binary() { + return org.apache.arrow.vector.types.pojo.ArrowType.Binary.INSTANCE; } - public static ArrowType.Utf8 utf8() { - return ArrowType.Utf8.INSTANCE; + public static org.apache.arrow.vector.types.pojo.ArrowType.Utf8 utf8() { + return org.apache.arrow.vector.types.pojo.ArrowType.Utf8.INSTANCE; } - public static ArrowType.Decimal decimal() { + public static org.apache.arrow.vector.types.pojo.ArrowType.Decimal decimal() { return decimal(DecimalUtils.MAX_PRECISION, DecimalUtils.MAX_SCALE); } - public static ArrowType.Decimal decimal(int precision, int scale) { - return new ArrowType.Decimal(precision, scale); + public static org.apache.arrow.vector.types.pojo.ArrowType.Decimal decimal( + int precision, int scale) { + return new org.apache.arrow.vector.types.pojo.ArrowType.Decimal(precision, scale); } - public static ArrowType.Decimal bigintDecimal() { + public static org.apache.arrow.vector.types.pojo.ArrowType.Decimal bigintDecimal() { return decimal(DecimalUtils.MAX_PRECISION, 0); } @@ -297,7 +306,11 @@ public static Field field(String name, FieldType fieldType) { return field(name, fieldType, Collections.emptyList()); } - public 
static Field field(String name, boolean nullable, ArrowType type, Field... children) { + public static Field field( + String name, + boolean nullable, + org.apache.arrow.vector.types.pojo.ArrowType type, + Field... children) { return field(name, new FieldType(nullable, type, null), children); } @@ -305,11 +318,16 @@ public static Field field(String name, FieldType fieldType, Field... children) { return field(name, fieldType, Arrays.asList(children)); } - public static Field field(String name, boolean nullable, ArrowType type, List children) { + public static Field field( + String name, + boolean nullable, + org.apache.arrow.vector.types.pojo.ArrowType type, + List children) { return field(name, new FieldType(nullable, type, null), children); } - public static Field field(String name, ArrowType type, Field... children) { + public static Field field( + String name, org.apache.arrow.vector.types.pojo.ArrowType type, Field... children) { return field(name, true, type, children); } @@ -317,34 +335,36 @@ public static Field field(String name, FieldType fieldType, List children return new Field(name, fieldType, children); } - public static Field notNullField(String name, ArrowType type, Field... children) { + public static Field notNullField( + String name, org.apache.arrow.vector.types.pojo.ArrowType type, Field... 
children) { return field(name, false, type, children); } - public static FieldType notNullFieldType(ArrowType type) { + public static FieldType notNullFieldType(org.apache.arrow.vector.types.pojo.ArrowType type) { return new FieldType(false, type, null); } /* ========================= array field utils ========================= */ - public static Field primitiveArrayField(ArrowType type) { + public static Field primitiveArrayField(org.apache.arrow.vector.types.pojo.ArrowType type) { return primitiveArrayField("", type); } - public static Field primitiveArrayField(String name, ArrowType type) { + public static Field primitiveArrayField( + String name, org.apache.arrow.vector.types.pojo.ArrowType type) { return field( name, - FieldType.nullable(ArrowType.List.INSTANCE), + FieldType.nullable(org.apache.arrow.vector.types.pojo.ArrowType.List.INSTANCE), Collections.singletonList(field(ARRAY_ITEM_NAME, false, type))); } - public static Field arrayField(ArrowType type) { + public static Field arrayField(org.apache.arrow.vector.types.pojo.ArrowType type) { return arrayField("", type); } - public static Field arrayField(String name, ArrowType type) { + public static Field arrayField(String name, org.apache.arrow.vector.types.pojo.ArrowType type) { return field( name, - FieldType.nullable(ArrowType.List.INSTANCE), + FieldType.nullable(org.apache.arrow.vector.types.pojo.ArrowType.List.INSTANCE), Collections.singletonList(field(ARRAY_ITEM_NAME, true, type))); } @@ -355,7 +375,7 @@ public static Field arrayField(FieldType valueType) { public static Field arrayField(String name, FieldType valueType) { return field( name, - FieldType.nullable(ArrowType.List.INSTANCE), + FieldType.nullable(org.apache.arrow.vector.types.pojo.ArrowType.List.INSTANCE), Collections.singletonList(field(ARRAY_ITEM_NAME, valueType))); } @@ -365,7 +385,9 @@ public static Field arrayField(Field valueField) { public static Field arrayField(String name, Field valueField) { return field( - name, 
FieldType.nullable(ArrowType.List.INSTANCE), Collections.singletonList(valueField)); + name, + FieldType.nullable(org.apache.arrow.vector.types.pojo.ArrowType.List.INSTANCE), + Collections.singletonList(valueField)); } public static Field arrayElementField(Field field) { @@ -373,11 +395,16 @@ public static Field arrayElementField(Field field) { } /* ========================= map field utils start ========================= */ - public static Field mapField(ArrowType keyType, ArrowType itemType) { + public static Field mapField( + org.apache.arrow.vector.types.pojo.ArrowType keyType, + org.apache.arrow.vector.types.pojo.ArrowType itemType) { return mapField("", keyType, itemType); } - public static Field mapField(String name, ArrowType keyType, ArrowType itemType) { + public static Field mapField( + String name, + org.apache.arrow.vector.types.pojo.ArrowType keyType, + org.apache.arrow.vector.types.pojo.ArrowType itemType) { return mapField( name, field(MapVector.KEY_NAME, false, keyType), @@ -392,7 +419,8 @@ public static Field mapField(String name, Field keyField, Field itemField) { Preconditions.checkArgument(!keyField.isNullable(), "Map's keys must be non-nullable"); // Map's key-item pairs must be non-nullable structs Field valueField = structField(false, keyField, itemField); - return field(name, true, new ArrowType.Map(false), valueField); + return field( + name, true, new org.apache.arrow.vector.types.pojo.ArrowType.Map(false), valueField); } public static Field keyFieldForMap(Field mapField) { @@ -425,11 +453,13 @@ public static Field structField(boolean nullable, Field... fields) { } public static Field structField(String name, boolean nullable, Field... 
fields) { - return field(name, nullable, ArrowType.Struct.INSTANCE, fields); + return field( + name, nullable, org.apache.arrow.vector.types.pojo.ArrowType.Struct.INSTANCE, fields); } public static Field structField(String name, boolean nullable, List fields) { - return field(name, nullable, ArrowType.Struct.INSTANCE, fields); + return field( + name, nullable, org.apache.arrow.vector.types.pojo.ArrowType.Struct.INSTANCE, fields); } /* ========================= struct field utils end ========================= */ @@ -475,7 +505,7 @@ public static long computeSchemaHash(Schema schema) { } private static long computeHash(long hash, Field field) { - Type typeID = getTypeId(field.getType()); + ArrowType typeID = getTypeId(field.getType()); while (true) { try { hash = Math.addExact(Math.multiplyExact(hash, 31), (long) typeID.getId()); diff --git a/java/fury-format/src/main/java/org/apache/fury/format/vectorized/ArrowSerializers.java b/java/fury-format/src/main/java/org/apache/fury/format/vectorized/ArrowSerializers.java index efc117a695..a03c29a06f 100644 --- a/java/fury-format/src/main/java/org/apache/fury/format/vectorized/ArrowSerializers.java +++ b/java/fury-format/src/main/java/org/apache/fury/format/vectorized/ArrowSerializers.java @@ -31,6 +31,7 @@ import org.apache.arrow.vector.ipc.message.IpcOption; import org.apache.arrow.vector.ipc.message.MessageSerializer; import org.apache.fury.Fury; +import org.apache.fury.config.Language; import org.apache.fury.io.MemoryBufferReadableChannel; import org.apache.fury.io.MemoryBufferWritableChannel; import org.apache.fury.io.MockWritableChannel; @@ -39,7 +40,7 @@ import org.apache.fury.memory.Platform; import org.apache.fury.serializer.BufferObject; import org.apache.fury.serializer.Serializers.CrossLanguageCompatibleSerializer; -import org.apache.fury.type.Type; +import org.apache.fury.type.Types; /** Serializers for apache arrow. 
*/ public class ArrowSerializers { @@ -57,7 +58,7 @@ public VectorSchemaRootSerializer(Fury fury) { } public VectorSchemaRootSerializer(Fury fury, BufferAllocator allocator) { - super(fury, VectorSchemaRoot.class, Type.FURY_ARROW_RECORD_BATCH.getId()); + super(fury, VectorSchemaRoot.class); this.allocator = allocator; } @@ -166,6 +167,10 @@ public MemoryBuffer toBuffer() { } public static void registerSerializers(Fury fury) { + if (fury.getLanguage() != Language.JAVA) { + fury.register(ArrowTable.class, Types.ARROW_TABLE); + fury.register(VectorSchemaRoot.class, Types.ARROW_RECORD_BATCH); + } fury.registerSerializer(ArrowTable.class, new ArrowTableSerializer(fury)); fury.registerSerializer(VectorSchemaRoot.class, new VectorSchemaRootSerializer(fury)); } diff --git a/java/fury-format/src/main/java/org/apache/fury/format/vectorized/ArrowTableSerializer.java b/java/fury-format/src/main/java/org/apache/fury/format/vectorized/ArrowTableSerializer.java index 451e5e07cd..25bda0e1c8 100644 --- a/java/fury-format/src/main/java/org/apache/fury/format/vectorized/ArrowTableSerializer.java +++ b/java/fury-format/src/main/java/org/apache/fury/format/vectorized/ArrowTableSerializer.java @@ -32,7 +32,6 @@ import org.apache.fury.memory.MemoryBuffer; import org.apache.fury.memory.Platform; import org.apache.fury.serializer.Serializers; -import org.apache.fury.type.Type; /** Serializers for {@link ArrowTable}. 
*/ public class ArrowTableSerializer @@ -46,7 +45,7 @@ public ArrowTableSerializer(Fury fury) { } public ArrowTableSerializer(Fury fury, BufferAllocator allocator) { - super(fury, ArrowTable.class, Type.FURY_ARROW_TABLE.getId()); + super(fury, ArrowTable.class); this.allocator = allocator; } diff --git a/python/README.md b/python/README.md index 73704ab108..d06a9a8c77 100644 --- a/python/README.md +++ b/python/README.md @@ -49,3 +49,13 @@ FURY_DEBUG=true python setup.py build_ext --inplace # For linux cygdb build ``` + +## Debug with lldb + +```bash +lldb +(lldb) target create -- python +(lldb) settings set -- target.run-args "-c" "from pyfury.tests.test_serializer import test_enum; test_enum()" +(lldb) run +(lldb) bt +``` diff --git a/python/pyfury/__init__.py b/python/pyfury/__init__.py index c4396421f8..fcf9e80585 100644 --- a/python/pyfury/__init__.py +++ b/python/pyfury/__init__.py @@ -19,7 +19,6 @@ from pyfury._fury import ( # noqa: F401 # pylint: disable=unused-import Fury, Language, - OpaqueObject, ) try: @@ -27,29 +26,19 @@ except ImportError: ENABLE_FURY_CYTHON_SERIALIZATION = False +from pyfury._registry import ClassInfo + if ENABLE_FURY_CYTHON_SERIALIZATION: - from pyfury._serialization import ( # noqa: F401,F811 - Fury, - Language, - ClassInfo, - OpaqueObject, - ComplexObjectSerializer, - ) -else: - from pyfury._fury import ( # noqa: F401,F403,F811 # pylint: disable=unused-import - Fury, - Language, - ClassInfo, - OpaqueObject, - ) - from pyfury._struct import ( # noqa: F401,F403,F811 # pylint: disable=unused-import - ComplexObjectSerializer, - ) + from pyfury._serialization import Fury, ClassInfo # noqa: F401,F811 + +from pyfury._struct import ( # noqa: F401,F403,F811 # pylint: disable=unused-import + ComplexObjectSerializer, +) from pyfury.serializer import * # noqa: F401,F403 # pylint: disable=unused-import from pyfury.type import ( # noqa: F401 # pylint: disable=unused-import record_class_factory, get_qualified_classname, - FuryType, + TypeId, 
Int8Type, Int16Type, Int32Type, diff --git a/python/pyfury/_fury.py b/python/pyfury/_fury.py index cea736389d..21ca687b90 100644 --- a/python/pyfury/_fury.py +++ b/python/pyfury/_fury.py @@ -15,61 +15,20 @@ # specific language governing permissions and limitations # under the License. -import array -import dataclasses -import datetime import enum import logging import os import warnings -from dataclasses import dataclass -from typing import Dict, Tuple, TypeVar, Union, Iterable - -from pyfury.lib import mmh3 +from abc import ABC, abstractmethod +from typing import Union, Iterable, TypeVar from pyfury.buffer import Buffer -from pyfury.meta.metastring import Encoding from pyfury.resolver import ( MapRefResolver, NoRefResolver, NULL_FLAG, NOT_NULL_VALUE_FLAG, ) -from pyfury._serializer import ( - Serializer, - SerializationContext, - NOT_SUPPORT_CROSS_LANGUAGE, - BufferObject, - PickleSerializer, - Numpy1DArraySerializer, - PyArraySerializer, - PYINT_CLASS_ID, - PYFLOAT_CLASS_ID, - PYBOOL_CLASS_ID, - STRING_CLASS_ID, - PICKLE_CLASS_ID, - NOT_NULL_STRING_FLAG, - NOT_NULL_PYINT_FLAG, - NOT_NULL_PYBOOL_FLAG, - NO_CLASS_ID, - NoneSerializer, - _PickleStub, - PickleStrongCacheStub, - PICKLE_STRONG_CACHE_CLASS_ID, - PICKLE_CACHE_CLASS_ID, - PickleCacheStub, - SMALL_STRING_THRESHOLD, -) -from pyfury.type import ( - FuryType, - Int8Type, - Int16Type, - Int32Type, - Int64Type, - Float32Type, - Float64Type, - load_class, -) from pyfury.util import is_little_endian, set_bit, get_bit, clear_bit try: @@ -84,517 +43,26 @@ logger = logging.getLogger(__name__) -DEFAULT_DYNAMIC_WRITE_STRING_ID = -1 - - MAGIC_NUMBER = 0x62D4 - - -class MetaStringBytes: - __slots__ = ( - "data", - "length", - "hashcode", - "dynamic_write_string_id", - ) - - def __init__(self, data, hashcode=None): - self.data = data - self.length = len(data) - if hashcode is None: - hashcode = (mmh3.hash_buffer(data, 47)[0] >> 8) << 8 - self.hashcode = hashcode - self.dynamic_write_string_id = 
DEFAULT_DYNAMIC_WRITE_STRING_ID - - def __eq__(self, other): - return type(other) is MetaStringBytes and other.hashcode == self.hashcode - - def __hash__(self): - return self.hashcode - - -class ClassInfo: - __slots__ = ( - "cls", - "class_id", - "serializer", - "class_name_bytes", - "type_tag_bytes", - ) - - def __init__( - self, - cls: type = None, - class_id: int = NO_CLASS_ID, - serializer: Serializer = None, - class_name_bytes: bytes = None, - type_tag_bytes: bytes = None, - ): - self.cls = cls - self.class_id = class_id - self.serializer = serializer - self.class_name_bytes = MetaStringBytes(class_name_bytes) - self.type_tag_bytes = ( - MetaStringBytes(type_tag_bytes) if type_tag_bytes else None - ) - - def __repr__(self): - return ( - f"ClassInfo(cls={self.cls}, class_id={self.class_id}, " - f"serializer={self.serializer})" - ) - - -class ClassResolver: - __slots__ = ( - "fury", - "_type_id_to_class", - "_type_id_to_serializer", - "_type_id_and_cls_to_serializer", - "_type_tag_to_class_x_lang_map", - "_enum_str_to_str", - "_class_id_counter", - "_used_classes_id", - "_classes_info", - "_registered_id2_class_info", - "_hash_to_enum_string", - "_enum_str_to_class", - "_hash_to_classinfo", - "_dynamic_id_to_classinfo_list", - "_dynamic_id_to_enum_str_list", - "_serializer", - "_dynamic_write_string_id", - "_dynamic_written_enum_string", - ) - - _type_id_to_class: Dict[int, type] - _type_id_to_serializer: Dict[int, Serializer] - _type_id_and_cls_to_serializer: Dict[Tuple[int, type], Serializer] - _classes_info: Dict[type, "ClassInfo"] - - def __init__(self, fury): - self.fury = fury - self._type_id_to_class = dict() - self._type_id_to_serializer = dict() - self._type_id_and_cls_to_serializer = dict() - self._type_tag_to_class_x_lang_map = dict() - self._class_id_counter = PICKLE_CACHE_CLASS_ID + 1 - self._used_classes_id = set() - - self._classes_info = dict() - self._registered_id2_class_info = [] - - self._enum_str_to_str = dict() - self._enum_str_to_class = 
dict() - self._hash_to_enum_string = dict() - self._hash_to_classinfo = dict() - self._dynamic_id_to_classinfo_list = list() - self._dynamic_id_to_enum_str_list = list() - - self._serializer = None - self._dynamic_write_string_id = 0 - self._dynamic_written_enum_string = [] - - def initialize(self): - self.register_class(int, class_id=PYINT_CLASS_ID) - self.register_class(float, class_id=PYFLOAT_CLASS_ID) - self.register_class(bool, class_id=PYBOOL_CLASS_ID) - self.register_class(str, class_id=STRING_CLASS_ID) - self.register_class(_PickleStub, class_id=PICKLE_CLASS_ID) - self.register_class( - PickleStrongCacheStub, class_id=PICKLE_STRONG_CACHE_CLASS_ID - ) - self.register_class(PickleCacheStub, class_id=PICKLE_CACHE_CLASS_ID) - self._add_default_serializers() - - # `Union[type, TypeVar]` is not supported in py3.6 - def register_serializer(self, cls, serializer): - assert isinstance(cls, (type, TypeVar)), cls - type_id = serializer.get_xtype_id() - if type_id != NOT_SUPPORT_CROSS_LANGUAGE: - self._add_x_lang_serializer(cls, serializer=serializer) - else: - self.register_class(cls) - self._classes_info[cls].serializer = serializer - - # `Union[type, TypeVar]` is not supported in py3.6 - def register_class(self, cls, *, class_id: int = None, type_tag: str = None): - """Register class with given type id or tag, if tag is not None, it will be used for - cross-language serialization.""" - if type_tag is not None: - assert class_id is None, ( - f"Type tag {type_tag} has been set already, " - f"set class id at the same time is not allowed." 
- ) - from pyfury._struct import ComplexObjectSerializer - - self.register_serializer( - cls, ComplexObjectSerializer(self.fury, cls, type_tag) - ) - return - classinfo = self._classes_info.get(cls) - if classinfo is None: - if isinstance(cls, TypeVar): - class_name_bytes = (cls.__module__ + "#" + cls.__name__).encode("utf-8") - else: - class_name_bytes = (cls.__module__ + "#" + cls.__qualname__).encode( - "utf-8" - ) - class_id = class_id if class_id is not None else self._next_class_id() - assert class_id not in self._used_classes_id, ( - self._used_classes_id, - self._classes_info, - ) - classinfo = ClassInfo( - cls=cls, class_name_bytes=class_name_bytes, class_id=class_id - ) - self._classes_info[cls] = classinfo - if len(self._registered_id2_class_info) <= class_id: - self._registered_id2_class_info.extend( - [None] * (class_id - len(self._registered_id2_class_info) + 1) - ) - self._registered_id2_class_info[class_id] = classinfo - else: - if classinfo.class_id == NO_CLASS_ID: - class_id = class_id if class_id is not None else self._next_class_id() - assert class_id not in self._used_classes_id, ( - self._used_classes_id, - self._classes_info, - ) - classinfo.class_id = class_id - if len(self._registered_id2_class_info) <= class_id: - self._registered_id2_class_info.extend( - [None] * (class_id - len(self._registered_id2_class_info) + 1) - ) - self._registered_id2_class_info[class_id] = classinfo - else: - if class_id is not None and classinfo.class_id != class_id: - raise ValueError( - f"Inconsistent class id {class_id} vs {classinfo.class_id} " - f"for class {cls}" - ) - - def _next_class_id(self): - class_id = self._class_id_counter = self._class_id_counter + 1 - while class_id in self._used_classes_id: - class_id = self._class_id_counter = self._class_id_counter + 1 - return class_id - - def _add_serializer(self, cls: type, serializer=None, serializer_cls=None): - if serializer_cls: - serializer = serializer_cls(self.fury, cls) - 
self.register_serializer(cls, serializer) - - def _add_x_lang_serializer(self, cls: type, serializer=None, serializer_cls=None): - if serializer_cls: - serializer = serializer_cls(self.fury, cls) - type_id = serializer.get_xtype_id() - from pyfury._serializer import NOT_SUPPORT_CROSS_LANGUAGE - - assert type_id != NOT_SUPPORT_CROSS_LANGUAGE - self._type_id_and_cls_to_serializer[(type_id, cls)] = serializer - self.register_class(cls) - classinfo = self._classes_info[cls] - classinfo.serializer = serializer - if type_id == FuryType.FURY_TYPE_TAG.value: - type_tag = serializer.get_xtype_tag() - assert type(type_tag) is str - assert type_tag not in self._type_tag_to_class_x_lang_map - classinfo.type_tag_bytes = MetaStringBytes(type_tag.encode("utf-8")) - self._type_tag_to_class_x_lang_map[type_tag] = cls - else: - self._type_id_to_serializer[type_id] = serializer - if type_id > NOT_SUPPORT_CROSS_LANGUAGE: - self._type_id_to_class[type_id] = cls - - def _add_default_serializers(self): - import pyfury.serializer as serializers - from pyfury._serializer import PyArraySerializer, Numpy1DArraySerializer - - self._add_x_lang_serializer(int, serializer_cls=serializers.ByteSerializer) - self._add_x_lang_serializer(int, serializer_cls=serializers.Int16Serializer) - self._add_x_lang_serializer(int, serializer_cls=serializers.Int32Serializer) - self._add_x_lang_serializer(int, serializer_cls=serializers.Int64Serializer) - self._add_x_lang_serializer(float, serializer_cls=serializers.FloatSerializer) - self._add_x_lang_serializer(float, serializer_cls=serializers.DoubleSerializer) - self._add_serializer(type(None), serializer_cls=NoneSerializer) - self._add_serializer(bool, serializer_cls=serializers.BooleanSerializer) - self._add_serializer(Int8Type, serializer_cls=serializers.ByteSerializer) - self._add_serializer(Int16Type, serializer_cls=serializers.Int16Serializer) - self._add_serializer(Int32Type, serializer_cls=serializers.Int32Serializer) - self._add_serializer(Int64Type, 
serializer_cls=serializers.Int64Serializer) - self._add_serializer(Float32Type, serializer_cls=serializers.FloatSerializer) - self._add_serializer(Float64Type, serializer_cls=serializers.DoubleSerializer) - self._add_serializer(str, serializer_cls=serializers.StringSerializer) - self._add_serializer(datetime.date, serializer_cls=serializers.DateSerializer) - self._add_serializer( - datetime.datetime, serializer_cls=serializers.TimestampSerializer - ) - self._add_serializer(bytes, serializer_cls=serializers.BytesSerializer) - self._add_serializer(list, serializer_cls=serializers.ListSerializer) - self._add_serializer(tuple, serializer_cls=serializers.TupleSerializer) - self._add_serializer(dict, serializer_cls=serializers.MapSerializer) - self._add_serializer(set, serializer_cls=serializers.SetSerializer) - self._add_serializer(enum.Enum, serializer_cls=serializers.EnumSerializer) - self._add_serializer(slice, serializer_cls=serializers.SliceSerializer) - from pyfury import PickleCacheSerializer, PickleStrongCacheSerializer - - self._add_serializer( - PickleStrongCacheStub, serializer=PickleStrongCacheSerializer(self.fury) - ) - self._add_serializer( - PickleCacheStub, serializer=PickleCacheSerializer(self.fury) - ) - try: - import pyarrow as pa - from pyfury.format.serializer import ( - ArrowRecordBatchSerializer, - ArrowTableSerializer, - ) - - self._add_serializer( - pa.RecordBatch, serializer_cls=ArrowRecordBatchSerializer - ) - self._add_serializer(pa.Table, serializer_cls=ArrowTableSerializer) - except Exception: - pass - for typecode in PyArraySerializer.typecode_dict.keys(): - self._add_serializer( - array.array, - serializer=PyArraySerializer(self.fury, array.array, typecode), - ) - self._add_serializer( - PyArraySerializer.typecodearray_type[typecode], - serializer=PyArraySerializer(self.fury, array.array, typecode), - ) - if np: - for dtype in Numpy1DArraySerializer.dtypes_dict.keys(): - self._add_serializer( - np.ndarray, - 
serializer=Numpy1DArraySerializer(self.fury, array.array, dtype), - ) - - def get_serializer(self, cls: type = None, type_id: int = None, obj=None): - """ - Returns - ------- - Returns or create serializer for the provided class - """ - assert cls is not None or type_id is not None or obj is not None - if obj is not None: - cls = type(obj) - if cls is int and 2**63 - 1 >= obj >= -(2**63): - type_id = FuryType.INT64.value - elif cls is float: - type_id = FuryType.DOUBLE.value - elif cls is array.array: - info = PyArraySerializer.typecode_dict.get(obj.typecode) - if info is not None: - type_id = info[1] - elif np and cls is np.ndarray and obj.ndim == 1: - info = Numpy1DArraySerializer.dtypes_dict.get(obj.dtype) - if info: - type_id = info[2] - if type_id is not None: - if cls is not None: - serializer_ = self._type_id_and_cls_to_serializer[(type_id, cls)] - else: - serializer_ = self._type_id_to_serializer[type_id] - else: - class_info = self._classes_info.get(cls) - if class_info is not None: - serializer_ = class_info.serializer - else: - self._add_serializer(cls, serializer=self.get_or_create_serializer(cls)) - serializer_ = self._classes_info.get(cls).serializer - self._serializer = serializer_ - return serializer_ - - def get_or_create_serializer(self, cls): - return self.get_or_create_classinfo(cls).serializer - - def get_or_create_classinfo(self, cls): - class_info = self._classes_info.get(cls) - if class_info is not None: - if class_info.serializer is not None: - return class_info - else: - class_info.serializer = self._create_serializer(cls) - return class_info - else: - serializer = self._create_serializer(cls) - class_id = ( - NO_CLASS_ID - if type(serializer) is not PickleSerializer - else PICKLE_CLASS_ID - ) - class_name_bytes = (cls.__module__ + "#" + cls.__qualname__).encode("utf-8") - class_info = ClassInfo( - cls=cls, - class_name_bytes=class_name_bytes, - serializer=serializer, - class_id=class_id, - ) - self._classes_info[cls] = class_info - return 
class_info - - def _create_serializer(self, cls): - mro = cls.__mro__ - classinfo_ = self._classes_info.get(cls) - for clz in mro: - class_info = self._classes_info.get(clz) - if ( - class_info - and class_info.serializer - and class_info.serializer.support_subclass() - ): - if classinfo_ is None or classinfo_.class_id == NO_CLASS_ID: - logger.info("Class %s not registered", cls) - serializer = type(class_info.serializer)(self.fury, cls) - break - else: - if dataclasses.is_dataclass(cls): - if classinfo_ is None or classinfo_.class_id == NO_CLASS_ID: - logger.info("Class %s not registered", cls) - logger.info("Class %s not registered", cls) - from pyfury import DataClassSerializer - - serializer = DataClassSerializer(self.fury, cls) - else: - serializer = PickleSerializer(self.fury, cls) - return serializer - - def write_classinfo(self, buffer: Buffer, classinfo: ClassInfo): - class_id = classinfo.class_id - if class_id != NO_CLASS_ID: - buffer.write_varint32(class_id << 1) - return - buffer.write_varint32(1) - self.write_enum_string_bytes(buffer, classinfo.class_name_bytes) - - def read_classinfo(self, buffer): - header = buffer.read_varint32() - if header & 0b1 == 0: - class_id = header >> 1 - classinfo = self._registered_id2_class_info[class_id] - if classinfo.serializer is None: - classinfo.serializer = self._create_serializer(classinfo.cls) - return classinfo - meta_str_header = buffer.read_varint32() - length = meta_str_header >> 1 - if meta_str_header & 0b1 != 0: - return self._dynamic_id_to_classinfo_list[length - 1] - class_name_bytes_hash = buffer.read_int64() - reader_index = buffer.reader_index - buffer.check_bound(reader_index, length) - buffer.reader_index = reader_index + length - classinfo = self._hash_to_classinfo.get(class_name_bytes_hash) - if classinfo is None: - classname_bytes = buffer.get_bytes(reader_index, length) - full_class_name = classname_bytes.decode(encoding="utf-8") - cls = load_class(full_class_name) - classinfo = 
self.get_or_create_classinfo(cls) - self._hash_to_classinfo[class_name_bytes_hash] = classinfo - self._dynamic_id_to_classinfo_list.append(classinfo) - return classinfo - - def write_enum_string_bytes( - self, buffer: Buffer, enum_string_bytes: MetaStringBytes - ): - dynamic_write_string_id = enum_string_bytes.dynamic_write_string_id - if dynamic_write_string_id == DEFAULT_DYNAMIC_WRITE_STRING_ID: - dynamic_write_string_id = self._dynamic_write_string_id - enum_string_bytes.dynamic_write_string_id = dynamic_write_string_id - self._dynamic_write_string_id += 1 - self._dynamic_written_enum_string.append(enum_string_bytes) - buffer.write_varint32(enum_string_bytes.length << 1) - if enum_string_bytes.length <= SMALL_STRING_THRESHOLD: - # TODO(chaokunyang) support meta string encoding - buffer.write_int8(Encoding.UTF_8.value) - else: - buffer.write_int64(enum_string_bytes.hashcode) - buffer.write_bytes(enum_string_bytes.data) - else: - buffer.write_varint32(((dynamic_write_string_id + 1) << 1) | 1) - - def read_enum_string_bytes(self, buffer: Buffer) -> MetaStringBytes: - header = buffer.read_varint32() - length = header >> 1 - if header & 0b1 != 0: - return self._dynamic_id_to_enum_str_list[length - 1] - if length <= SMALL_STRING_THRESHOLD: - buffer.read_int8() - if length <= 8: - v1 = buffer.read_bytes_as_int64(length) - v2 = 0 - else: - v1 = buffer.read_int64() - v2 = buffer.read_bytes_as_int64(length - 8) - hashcode = v1 * 31 + v2 - enum_str = self._hash_to_enum_string.get(hashcode) - if enum_str is None: - str_bytes = buffer.get_bytes(buffer.reader_index - length, length) - enum_str = MetaStringBytes(str_bytes, hashcode=hashcode) - self._hash_to_enum_string[hashcode] = enum_str - else: - hashcode = buffer.read_int64() - reader_index = buffer.reader_index - buffer.check_bound(reader_index, length) - buffer.reader_index = reader_index + length - enum_str = self._hash_to_enum_string.get(hashcode) - if enum_str is None: - str_bytes = buffer.get_bytes(reader_index, 
length) - enum_str = MetaStringBytes(str_bytes, hashcode=hashcode) - self._hash_to_enum_string[hashcode] = enum_str - self._dynamic_id_to_enum_str_list.append(enum_str) - return enum_str - - def xwrite_class(self, buffer, cls): - class_name_bytes = self._classes_info[cls].class_name_bytes - self.write_enum_string_bytes(buffer, class_name_bytes) - - def xwrite_type_tag(self, buffer, cls): - type_tag_bytes = self._classes_info[cls].type_tag_bytes - self.write_enum_string_bytes(buffer, type_tag_bytes) - - def read_class_by_type_tag(self, buffer): - tag = self.xread_classname(buffer) - return self._type_tag_to_class_x_lang_map[tag] - - def xread_class(self, buffer): - class_name_bytes = self.read_enum_string_bytes(buffer) - cls = self._enum_str_to_class.get(class_name_bytes) - if cls is None: - full_class_name = class_name_bytes.data.decode(encoding="utf-8") - cls = load_class(full_class_name) - self._enum_str_to_class[class_name_bytes] = cls - return cls - - def xread_classname(self, buffer) -> str: - str_bytes = self.read_enum_string_bytes(buffer) - str_ = self._enum_str_to_str.get(str_bytes) - if str_ is None: - str_ = str_bytes.data.decode(encoding="utf-8") - self._enum_str_to_str[str_bytes] = str_ - return str_ - - def get_class_by_type_id(self, type_id: int): - return self._type_id_to_class[type_id] - - def reset(self): - self.reset_write() - self.reset_read() - - def reset_read(self): - self._dynamic_id_to_classinfo_list.clear() - self._dynamic_id_to_enum_str_list.clear() - - def reset_write(self): - if self._dynamic_write_string_id != 0: - self._dynamic_write_string_id = 0 - for enum_str in self._dynamic_written_enum_string: - enum_str.dynamic_write_string_id = DEFAULT_DYNAMIC_WRITE_STRING_ID - self._dynamic_written_enum_string.clear() +DEFAULT_DYNAMIC_WRITE_STRING_ID = -1 +DYNAMIC_TYPE_ID = -1 +USE_CLASSNAME = 0 +USE_CLASS_ID = 1 +# preserve 0 as flag for class id not set in ClassInfo` +NO_CLASS_ID = 0 +PYINT_CLASS_ID = 1 +PYFLOAT_CLASS_ID = 2 +PYBOOL_CLASS_ID 
= 3 +STRING_CLASS_ID = 4 +PICKLE_CLASS_ID = 5 +PICKLE_STRONG_CACHE_CLASS_ID = 6 +PICKLE_CACHE_CLASS_ID = 7 +# `NOT_NULL_VALUE_FLAG` + `CLASS_ID << 1` in little-endian order +NOT_NULL_PYINT_FLAG = NOT_NULL_VALUE_FLAG & 0b11111111 | (PYINT_CLASS_ID << 9) +NOT_NULL_PYFLOAT_FLAG = NOT_NULL_VALUE_FLAG & 0b11111111 | (PYFLOAT_CLASS_ID << 9) +NOT_NULL_PYBOOL_FLAG = NOT_NULL_VALUE_FLAG & 0b11111111 | (PYBOOL_CLASS_ID << 9) +NOT_NULL_STRING_FLAG = NOT_NULL_VALUE_FLAG & 0b11111111 | (STRING_CLASS_ID << 9) +SMALL_STRING_THRESHOLD = 16 class Language(enum.Enum): @@ -607,11 +75,24 @@ class Language(enum.Enum): RUST = 6 -@dataclass -class OpaqueObject: - language: Language - classname: str - ordinal: int +class BufferObject(ABC): + """ + Fury binary representation of an object. + Note: This class is used for zero-copy out-of-band serialization and shouldn't + be used for any other cases. + """ + + @abstractmethod + def total_bytes(self) -> int: + """total size for serialized bytes of an object""" + + @abstractmethod + def write_to(self, buffer: "Buffer"): + """Write serialized object to a buffer.""" + + @abstractmethod + def to_buffer(self) -> "Buffer": + """Write serialized data as Buffer.""" class Fury: @@ -627,10 +108,10 @@ class Fury: "unpickler", "_buffer_callback", "_buffers", + "metastring_resolver", "_unsupported_callback", "_unsupported_objects", "_peer_language", - "_native_objects", ) serialization_context: "SerializationContext" @@ -659,6 +140,10 @@ def __init__( self.ref_resolver = MapRefResolver() else: self.ref_resolver = NoRefResolver() + from pyfury._serialization import MetaStringResolver + from pyfury._registry import ClassResolver + + self.metastring_resolver = MetaStringResolver() self.class_resolver = ClassResolver(self) self.class_resolver.initialize() self.serialization_context = SerializationContext() @@ -679,14 +164,27 @@ def __init__( self._unsupported_callback = None self._unsupported_objects = None self._peer_language = None - self._native_objects = 
[] def register_serializer(self, cls: type, serializer): self.class_resolver.register_serializer(cls, serializer) # `Union[type, TypeVar]` is not supported in py3.6 - def register_class(self, cls, *, class_id: int = None, type_tag: str = None): - self.class_resolver.register_class(cls, class_id=class_id, type_tag=type_tag) + def register_type( + self, + cls: Union[type, TypeVar], + *, + type_id: int = None, + namespace: str = None, + typename: str = None, + serializer=None, + ): + return self.class_resolver.register_type( + cls, + type_id=type_id, + namespace=namespace, + typename=typename, + serializer=serializer, + ) def serialize( self, @@ -750,18 +248,7 @@ def _serialize( if self.language == Language.PYTHON: self.serialize_ref(buffer, obj) else: - start_offset = buffer.writer_index - buffer.write_int32(-1) # preserve 4-byte for nativeObjects start offsets. - buffer.write_int32(-1) # preserve 4-byte for nativeObjects size self.xserialize_ref(buffer, obj) - buffer.put_int32(start_offset, buffer.writer_index) - buffer.put_int32(start_offset + 4, len(self._native_objects)) - self.ref_resolver.reset_write() - # fury write opaque object classname which cause later write of classname - # only write an id. 
- self.class_resolver.reset_write() - for native_object in self._native_objects: - self.serialize_ref(buffer, native_object) self.reset_write() if buffer is not self.buffer: return buffer @@ -785,26 +272,26 @@ def serialize_ref(self, buffer, obj, classinfo=None): if self.ref_resolver.write_ref_or_null(buffer, obj): return if classinfo is None: - classinfo = self.class_resolver.get_or_create_classinfo(cls) + classinfo = self.class_resolver.get_classinfo(cls) self.class_resolver.write_classinfo(buffer, classinfo) classinfo.serializer.write(buffer, obj) def serialize_nonref(self, buffer, obj): cls = type(obj) if cls is str: - buffer.write_varint32(STRING_CLASS_ID << 1) + buffer.write_varuint32(STRING_CLASS_ID << 1) buffer.write_string(obj) return elif cls is int: - buffer.write_varint32(PYINT_CLASS_ID << 1) + buffer.write_varuint32(PYINT_CLASS_ID << 1) buffer.write_varint64(obj) return elif cls is bool: - buffer.write_varint32(PYBOOL_CLASS_ID << 1) + buffer.write_varuint32(PYBOOL_CLASS_ID << 1) buffer.write_bool(obj) return else: - classinfo = self.class_resolver.get_or_create_classinfo(cls) + classinfo = self.class_resolver.get_classinfo(cls) self.class_resolver.write_classinfo(buffer, classinfo) classinfo.serializer.write(buffer, obj) @@ -820,27 +307,13 @@ def xserialize_ref(self, buffer, obj, serializer=None): self.xserialize_nonref(buffer, obj, serializer=serializer) def xserialize_nonref(self, buffer, obj, serializer=None): - cls = type(obj) - serializer = serializer or self.class_resolver.get_serializer(obj=obj) - type_id = serializer.get_xtype_id() - buffer.write_int16(type_id) - if type_id != NOT_SUPPORT_CROSS_LANGUAGE: - if type_id == FuryType.FURY_TYPE_TAG.value: - self.class_resolver.xwrite_type_tag(buffer, cls) - if type_id < NOT_SUPPORT_CROSS_LANGUAGE: - self.class_resolver.xwrite_class(buffer, cls) + if serializer is not None: serializer.xwrite(buffer, obj) - else: - # Write classname so it can be used for debugging which object doesn't - # support 
cross-language. - # TODO add a config to disable this to reduce space cost. - self.class_resolver.xwrite_class(buffer, cls) - # serializer may increase reference id multi times internally, thus peer - # cross-language later fields/objects deserialization will use wrong - # reference id since we skip opaque objects deserialization. - # So we stash native objects and serialize all those object at the last. - buffer.write_varint32(len(self._native_objects)) - self._native_objects.append(obj) + return + cls = type(obj) + classinfo = self.class_resolver.get_classinfo(cls) + self.class_resolver.write_typeinfo(buffer, classinfo) + classinfo.serializer.xwrite(buffer, obj) def deserialize( self, @@ -900,16 +373,6 @@ def _deserialize( "produced with buffer_callback null." ) if is_target_x_lang: - native_objects_start_offset = buffer.read_int32() - native_objects_size = buffer.read_int32() - if self._peer_language == Language.PYTHON: - native_objects_buffer = buffer.slice(native_objects_start_offset) - for i in range(native_objects_size): - self._native_objects.append( - self.deserialize_ref(native_objects_buffer) - ) - self.ref_resolver.reset_read() - self.class_resolver.reset_read() obj = self.xdeserialize_ref(buffer) else: obj = self.deserialize_ref(buffer) @@ -935,12 +398,11 @@ def deserialize_nonref(self, buffer): def xdeserialize_ref(self, buffer, serializer=None): if serializer is None or serializer.need_to_write_ref: ref_resolver = self.ref_resolver - red_id = ref_resolver.try_preserve_ref_id(buffer) - + ref_id = ref_resolver.try_preserve_ref_id(buffer) # indicates that the object is first read. 
- if red_id >= NOT_NULL_VALUE_FLAG: + if ref_id >= NOT_NULL_VALUE_FLAG: o = self.xdeserialize_nonref(buffer, serializer=serializer) - ref_resolver.set_read_object(red_id, o) + ref_resolver.set_read_object(ref_id, o) return o else: return ref_resolver.get_read_object() @@ -950,45 +412,16 @@ def xdeserialize_ref(self, buffer, serializer=None): return self.xdeserialize_nonref(buffer, serializer=serializer) def xdeserialize_nonref(self, buffer, serializer=None): - type_id = buffer.read_int16() - cls = None - if type_id != NOT_SUPPORT_CROSS_LANGUAGE: - if type_id == FuryType.FURY_TYPE_TAG.value: - cls = self.class_resolver.read_class_by_type_tag(buffer) - if type_id < NOT_SUPPORT_CROSS_LANGUAGE: - if self._peer_language is not Language.PYTHON: - self.class_resolver.read_enum_string_bytes(buffer) - cls = self.class_resolver.get_class_by_type_id(-type_id) - serializer = serializer or self.class_resolver.get_serializer( - type_id=-type_id - ) - else: - cls = self.class_resolver.xread_class(buffer) - serializer = serializer or self.class_resolver.get_serializer( - cls=cls, type_id=type_id - ) - else: - if type_id != FuryType.FURY_TYPE_TAG.value: - cls = self.class_resolver.get_class_by_type_id(type_id) - serializer = serializer or self.class_resolver.get_serializer( - cls=cls, type_id=type_id - ) - assert cls is not None - return serializer.xread(buffer) - else: - class_name = self.class_resolver.xread_classname(buffer) - ordinal = buffer.read_varint32() - if self._peer_language != Language.PYTHON: - return OpaqueObject(self._peer_language, class_name, ordinal) - else: - return self._native_objects[ordinal] + if serializer is None: + serializer = self.class_resolver.read_typeinfo(buffer).serializer + return serializer.xread(buffer) def write_buffer_object(self, buffer, buffer_object: BufferObject): if self._buffer_callback is None or self._buffer_callback(buffer_object): buffer.write_bool(True) size = buffer_object.total_bytes() # writer length. 
- buffer.write_varint32(size) + buffer.write_varuint32(size) writer_index = buffer.writer_index buffer.ensure(writer_index + size) buf = buffer.slice(buffer.writer_index, size) @@ -1027,7 +460,7 @@ def write_ref_pyobject(self, buffer, value, classinfo=None): if self.ref_resolver.write_ref_or_null(buffer, value): return if classinfo is None: - classinfo = self.class_resolver.get_or_create_classinfo(type(value)) + classinfo = self.class_resolver.get_classinfo(type(value)) self.class_resolver.write_classinfo(buffer, classinfo) classinfo.serializer.write(buffer, value) @@ -1038,7 +471,6 @@ def reset_write(self): self.ref_resolver.reset_write() self.class_resolver.reset_write() self.serialization_context.reset() - self._native_objects.clear() self.pickler.clear_memo() self._buffer_callback = None self._unsupported_callback = None @@ -1047,7 +479,6 @@ def reset_read(self): self.ref_resolver.reset_read() self.class_resolver.reset_read() self.serialization_context.reset() - self._native_objects.clear() self.unpickler = None self._buffers = None self._unsupported_objects = None @@ -1057,6 +488,36 @@ def reset(self): self.reset_read() +class SerializationContext: + """ + A context is used to add some context-related information, so that the + serializers can setup relation between serializing different objects. + The context will be reset after finished serializing/deserializing the + object tree. 
+ """ + + __slots__ = ("objects",) + + def __init__(self): + self.objects = dict() + + def add(self, key, obj): + self.objects[id(key)] = obj + + def __contains__(self, key): + return id(key) in self.objects + + def __getitem__(self, key): + return self.objects[id(key)] + + def get(self, key): + return self.objects.get(id(key)) + + def reset(self): + if len(self.objects) > 0: + self.objects.clear() + + _ENABLE_CLASS_REGISTRATION_FORCIBLY = os.getenv( "ENABLE_CLASS_REGISTRATION_FORCIBLY", "0" ) in { diff --git a/python/pyfury/_registry.py b/python/pyfury/_registry.py new file mode 100644 index 0000000000..aa313b09d4 --- /dev/null +++ b/python/pyfury/_registry.py @@ -0,0 +1,627 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import array +import dataclasses +import datetime +import enum +import functools +import logging +from typing import TypeVar, Union +from enum import Enum + +from pyfury._serialization import ENABLE_FURY_CYTHON_SERIALIZATION +from pyfury import Language +from pyfury.error import TypeUnregisteredError + +from pyfury.serializer import ( + Serializer, + Numpy1DArraySerializer, + NDArraySerializer, + PyArraySerializer, + DynamicPyArraySerializer, + _PickleStub, + PickleStrongCacheStub, + PickleCacheStub, + NoneSerializer, + BooleanSerializer, + ByteSerializer, + Int16Serializer, + Int32Serializer, + Int64Serializer, + DynamicIntSerializer, + FloatSerializer, + DoubleSerializer, + DynamicFloatSerializer, + StringSerializer, + DateSerializer, + TimestampSerializer, + BytesSerializer, + ListSerializer, + TupleSerializer, + MapSerializer, + SetSerializer, + EnumSerializer, + SliceSerializer, + PickleCacheSerializer, + PickleStrongCacheSerializer, + PickleSerializer, +) +from pyfury._struct import ComplexObjectSerializer +from pyfury.buffer import Buffer +from pyfury.meta.metastring import MetaStringEncoder, MetaStringDecoder +from pyfury.type import ( + TypeId, + Int8Type, + Int16Type, + Int32Type, + Int64Type, + Float32Type, + Float64Type, + load_class, +) +from pyfury._fury import ( + DYNAMIC_TYPE_ID, + # preserve 0 as flag for class id not set in ClassInfo` + NO_CLASS_ID, + PYINT_CLASS_ID, + PYFLOAT_CLASS_ID, + PYBOOL_CLASS_ID, + STRING_CLASS_ID, + PICKLE_CLASS_ID, + PICKLE_STRONG_CACHE_CLASS_ID, + PICKLE_CACHE_CLASS_ID, +) + +try: + import numpy as np +except ImportError: + np = None + +logger = logging.getLogger(__name__) + + +if ENABLE_FURY_CYTHON_SERIALIZATION: + from pyfury._serialization import ClassInfo +else: + + class ClassInfo: + __slots__ = ( + "cls", + "type_id", + "serializer", + "namespace_bytes", + "typename_bytes", + "dynamic_type", + ) + + def __init__( + self, + cls: type = None, + type_id: int = NO_CLASS_ID, + serializer: Serializer = None, + 
namespace_bytes=None, + typename_bytes=None, + dynamic_type: bool = False, + ): + self.cls = cls + self.type_id = type_id + self.serializer = serializer + self.namespace_bytes = namespace_bytes + self.typename_bytes = typename_bytes + self.dynamic_type = dynamic_type + + def __repr__(self): + return ( + f"ClassInfo(cls={self.cls}, type_id={self.type_id}, " + f"serializer={self.serializer})" + ) + + +class ClassResolver: + __slots__ = ( + "fury", + "_metastr_to_str", + "_type_id_counter", + "_classes_info", + "_hash_to_metastring", + "_metastr_to_class", + "_hash_to_classinfo", + "_dynamic_id_to_classinfo_list", + "_dynamic_id_to_metastr_list", + "_dynamic_write_string_id", + "_dynamic_written_metastr", + "_ns_type_to_classinfo", + "_named_type_to_classinfo", + "namespace_encoder", + "namespace_decoder", + "typename_encoder", + "typename_decoder", + "require_registration", + "metastring_resolver", + "language", + "_type_id_to_classinfo", + ) + + def __init__(self, fury): + self.fury = fury + self.metastring_resolver = fury.metastring_resolver + self.language = fury.language + self.require_registration = fury.require_class_registration + self._metastr_to_str = dict() + self._metastr_to_class = dict() + self._hash_to_metastring = dict() + self._hash_to_classinfo = dict() + self._dynamic_written_metastr = [] + self._type_id_to_classinfo = dict() + self._type_id_counter = PICKLE_CACHE_CLASS_ID + 1 + self._dynamic_write_string_id = 0 + # hold objects to avoid gc, since `flat_hash_map/vector` doesn't + # hold python reference. 
+ self._classes_info = dict() + self._ns_type_to_classinfo = dict() + self._named_type_to_classinfo = dict() + self.namespace_encoder = MetaStringEncoder(".", "_") + self.namespace_decoder = MetaStringDecoder(".", "_") + self.typename_encoder = MetaStringEncoder("$", "_") + self.typename_decoder = MetaStringDecoder("$", "_") + + def initialize(self): + if self.fury.language == Language.PYTHON: + self._initialize_py() + else: + self._initialize_xlang() + + def _initialize_py(self): + register = functools.partial(self._register_type, internal=True) + register(int, type_id=PYINT_CLASS_ID, serializer=Int64Serializer) + register(float, type_id=PYFLOAT_CLASS_ID, serializer=DoubleSerializer) + register(bool, type_id=PYBOOL_CLASS_ID, serializer=BooleanSerializer) + register(str, type_id=STRING_CLASS_ID, serializer=StringSerializer) + register(_PickleStub, type_id=PICKLE_CLASS_ID, serializer=PickleSerializer) + register( + PickleStrongCacheStub, + type_id=PICKLE_STRONG_CACHE_CLASS_ID, + serializer=PickleStrongCacheSerializer(self.fury), + ) + register( + PickleCacheStub, + type_id=PICKLE_CACHE_CLASS_ID, + serializer=PickleCacheSerializer(self.fury), + ) + register(type(None), serializer=NoneSerializer) + register(Int8Type, serializer=ByteSerializer) + register(Int16Type, serializer=Int16Serializer) + register(Int32Type, serializer=Int32Serializer) + register(Int64Type, serializer=Int64Serializer) + register(Float32Type, serializer=FloatSerializer) + register(Float64Type, serializer=DoubleSerializer) + register(datetime.date, serializer=DateSerializer) + register(datetime.datetime, serializer=TimestampSerializer) + register(bytes, serializer=BytesSerializer) + register(list, serializer=ListSerializer) + register(tuple, serializer=TupleSerializer) + register(dict, serializer=MapSerializer) + register(set, serializer=SetSerializer) + register(enum.Enum, serializer=EnumSerializer) + register(slice, serializer=SliceSerializer) + try: + import pyarrow as pa + from 
pyfury.format.serializer import ( + ArrowRecordBatchSerializer, + ArrowTableSerializer, + ) + + register(pa.RecordBatch, serializer=ArrowRecordBatchSerializer) + register(pa.Table, serializer=ArrowTableSerializer) + except Exception: + pass + for size, ftype, type_id in PyArraySerializer.typecode_dict.values(): + register(ftype, serializer=PyArraySerializer(self.fury, ftype, type_id)) + register(array.array, serializer=DynamicPyArraySerializer) + if np: + register(np.ndarray, serializer=NDArraySerializer) + + def _initialize_xlang(self): + register = functools.partial(self._register_type, internal=True) + register(bool, type_id=TypeId.BOOL, serializer=BooleanSerializer) + register(Int8Type, type_id=TypeId.INT8, serializer=ByteSerializer) + register(Int16Type, type_id=TypeId.INT16, serializer=Int16Serializer) + register(Int32Type, type_id=TypeId.INT32, serializer=Int32Serializer) + register(Int64Type, type_id=TypeId.INT64, serializer=Int64Serializer) + register(int, type_id=DYNAMIC_TYPE_ID, serializer=DynamicIntSerializer) + register( + Float32Type, + type_id=TypeId.FLOAT32, + serializer=FloatSerializer, + ) + register( + Float64Type, + type_id=TypeId.FLOAT64, + serializer=DoubleSerializer, + ) + register(float, type_id=DYNAMIC_TYPE_ID, serializer=DynamicFloatSerializer) + register(str, type_id=TypeId.STRING, serializer=StringSerializer) + # TODO(chaokunyang) DURATION DECIMAL + register( + datetime.datetime, type_id=TypeId.TIMESTAMP, serializer=TimestampSerializer + ) + register(datetime.date, type_id=TypeId.LOCAL_DATE, serializer=DateSerializer) + register(bytes, type_id=TypeId.BINARY, serializer=BytesSerializer) + for itemsize, ftype, typeid in PyArraySerializer.typecode_dict.values(): + register( + ftype, + type_id=typeid, + serializer=PyArraySerializer(self.fury, ftype, typeid), + ) + register( + array.array, type_id=DYNAMIC_TYPE_ID, serializer=DynamicPyArraySerializer + ) + if np: + # overwrite pyarray with same type id. 
+ # if pyarray are needed, one must annotate that value with XXXArrayType + # as a field of a struct. + for dtype, ( + itemsize, + format, + ftype, + typeid, + ) in Numpy1DArraySerializer.dtypes_dict.items(): + register( + ftype, + type_id=typeid, + serializer=Numpy1DArraySerializer(self.fury, ftype, dtype), + ) + register(np.ndarray, type_id=DYNAMIC_TYPE_ID, serializer=NDArraySerializer) + register(list, type_id=TypeId.LIST, serializer=ListSerializer) + register(set, type_id=TypeId.SET, serializer=SetSerializer) + register(dict, type_id=TypeId.MAP, serializer=MapSerializer) + try: + import pyarrow as pa + from pyfury.format.serializer import ( + ArrowRecordBatchSerializer, + ArrowTableSerializer, + ) + + register( + pa.RecordBatch, + type_id=TypeId.ARROW_RECORD_BATCH, + serializer=ArrowRecordBatchSerializer, + ) + register( + pa.Table, type_id=TypeId.ARROW_TABLE, serializer=ArrowTableSerializer + ) + except Exception: + pass + + def register_type( + self, + cls: Union[type, TypeVar], + *, + type_id: int = None, + namespace: str = None, + typename: str = None, + serializer=None, + ): + return self._register_type( + cls, + type_id=type_id, + namespace=namespace, + typename=typename, + serializer=serializer, + ) + + def _register_type( + self, + cls: Union[type, TypeVar], + *, + type_id: int = None, + namespace: str = None, + typename: str = None, + serializer=None, + internal=False, + ): + """Register class with given type id or typename. 
If typename is not None, it will be used for + cross-language serialization.""" + if serializer is not None and not isinstance(serializer, Serializer): + try: + serializer = serializer(self.fury, cls) + except BaseException: + try: + serializer = serializer(self.fury) + except BaseException: + serializer = serializer() + n_params = len({typename, type_id, None}) - 1 + if n_params == 0 and typename is None: + type_id = self._next_type_id() + if n_params == 2: + raise TypeError( + f"type name {typename} and id {type_id} should not be set at the same time" + ) + if type_id not in {0, None}: + # multiple class can have same tpe id + if type_id in self._type_id_to_classinfo and cls in self._classes_info: + raise TypeError(f"{cls} registered already") + elif cls in self._classes_info: + raise TypeError(f"{cls} registered already") + register_type = ( + self._register_xtype + if self.fury.language == Language.XLANG + else self._register_pytype + ) + return register_type( + cls, + type_id=type_id, + namespace=namespace, + typename=typename, + serializer=serializer, + internal=internal, + ) + + def _register_xtype( + self, + cls: Union[type, TypeVar], + *, + type_id: int = None, + namespace: str = None, + typename: str = None, + serializer=None, + internal=False, + ): + if serializer is None: + if issubclass(cls, enum.Enum): + serializer = EnumSerializer(self.fury, cls) + type_id = ( + TypeId.NAMED_ENUM + if type_id is None + else ((type_id << 8) + TypeId.ENUM) + ) + else: + serializer = ComplexObjectSerializer(self.fury, cls) + type_id = ( + TypeId.NAMED_STRUCT + if type_id is None + else ((type_id << 8) + TypeId.STRUCT) + ) + elif not internal: + type_id = ( + TypeId.NAMED_EXT if type_id is None else ((type_id << 8) + TypeId.EXT) + ) + return self.__register_type( + cls, + type_id=type_id, + serializer=serializer, + namespace=namespace, + typename=typename, + internal=internal, + ) + + def _register_pytype( + self, + cls: Union[type, TypeVar], + *, + type_id: int = None, 
+ namespace: str = None, + typename: str = None, + serializer: Serializer = None, + internal: bool = False, + ): + return self.__register_type( + cls, + type_id=type_id, + namespace=namespace, + typename=typename, + serializer=serializer, + internal=internal, + ) + + def __register_type( + self, + cls: Union[type, TypeVar], + *, + type_id: int = None, + namespace: str = None, + typename: str = None, + serializer: Serializer = None, + internal: bool = False, + ): + dynamic_type = type_id < 0 + if not internal and serializer is None: + serializer = self._create_serializer(cls) + if typename is None: + classinfo = ClassInfo(cls, type_id, serializer, None, None, dynamic_type) + else: + if namespace is None: + splits = typename.rsplit(".", 1) + if len(splits) == 2: + namespace, typename = splits + ns_metastr = self.namespace_encoder.encode(namespace or "") + ns_meta_bytes = self.metastring_resolver.get_metastr_bytes(ns_metastr) + type_metastr = self.typename_encoder.encode(typename) + type_meta_bytes = self.metastring_resolver.get_metastr_bytes(type_metastr) + classinfo = ClassInfo( + cls, type_id, serializer, ns_meta_bytes, type_meta_bytes, dynamic_type + ) + self._named_type_to_classinfo[(namespace, typename)] = classinfo + self._ns_type_to_classinfo[(ns_meta_bytes, type_meta_bytes)] = classinfo + self._classes_info[cls] = classinfo + if type_id > 0 and ( + self.language == Language.PYTHON or not TypeId.is_namespaced_type(type_id) + ): + if type_id not in self._type_id_to_classinfo or not internal: + self._type_id_to_classinfo[type_id] = classinfo + self._classes_info[cls] = classinfo + return classinfo + + def _next_type_id(self): + type_id = self._type_id_counter = self._type_id_counter + 1 + while type_id in self._type_id_to_classinfo: + type_id = self._type_id_counter = self._type_id_counter + 1 + return type_id + + def register_serializer(self, cls: Union[type, TypeVar], serializer): + assert isinstance(cls, (type, TypeVar)), cls + if cls not in 
self._classes_info: + raise TypeUnregisteredError(f"{cls} not registered") + classinfo = self._classes_info[cls] + if self.fury.language == Language.PYTHON: + classinfo.serializer = serializer + return + type_id = prev_type_id = classinfo.type_id + self._type_id_to_classinfo.pop(prev_type_id) + if classinfo.serializer is not serializer: + if classinfo.typename_bytes is not None: + type_id = classinfo.type_id & 0xFFFFFF00 | TypeId.NAMED_EXT + else: + type_id = classinfo.type_id & 0xFFFFFF00 | TypeId.EXT + self._type_id_to_classinfo[type_id] = classinfo + + def get_serializer(self, cls: type): + """ + Returns + ------- + Returns or create serializer for the provided class + """ + return self.get_classinfo(cls).serializer + + def get_classinfo(self, cls, create=True): + class_info = self._classes_info.get(cls) + if class_info is not None: + if class_info.serializer is None: + class_info.serializer = self._create_serializer(cls) + return class_info + elif not create: + return None + if self.language != Language.PYTHON or ( + self.require_registration and not issubclass(cls, Enum) + ): + raise TypeUnregisteredError(f"{cls} not registered") + logger.info("Class %s not registered", cls) + serializer = self._create_serializer(cls) + type_id = ( + NO_CLASS_ID if type(serializer) is not PickleSerializer else PICKLE_CLASS_ID + ) + return self.__register_type( + cls, + type_id=type_id, + namespace=cls.__module__, + typename=cls.__qualname__, + serializer=serializer, + ) + + def _create_serializer(self, cls): + for clz in cls.__mro__: + class_info = self._classes_info.get(clz) + if ( + class_info + and class_info.serializer + and class_info.serializer.support_subclass() + ): + serializer = type(class_info.serializer)(self.fury, cls) + break + else: + if dataclasses.is_dataclass(cls): + from pyfury import DataClassSerializer + + serializer = DataClassSerializer(self.fury, cls) + elif issubclass(cls, enum.Enum): + serializer = EnumSerializer(self.fury, cls) + else: + serializer = 
PickleSerializer(self.fury, cls) + return serializer + + def write_classinfo(self, buffer: Buffer, classinfo): + if classinfo.dynamic_type: + return + type_id = classinfo.type_id + if type_id != NO_CLASS_ID: + buffer.write_varuint32(type_id << 1) + return + buffer.write_varuint32(1) + self.metastring_resolver.write_meta_string_bytes( + buffer, classinfo.namespace_bytes + ) + self.metastring_resolver.write_meta_string_bytes( + buffer, classinfo.typename_bytes + ) + + def read_classinfo(self, buffer): + header = buffer.read_varuint32() + if header & 0b1 == 0: + type_id = header >> 1 + classinfo = self._type_id_to_classinfo[type_id] + if classinfo.serializer is None: + classinfo.serializer = self._create_serializer(classinfo.cls) + return classinfo + ns_metabytes = self.metastring_resolver.read_meta_string_bytes(buffer) + type_metabytes = self.metastring_resolver.read_meta_string_bytes(buffer) + return self._load_metabytes_to_classinfo(ns_metabytes, type_metabytes) + + def _load_metabytes_to_classinfo(self, ns_metabytes, type_metabytes): + typeinfo = self._ns_type_to_classinfo.get((ns_metabytes, type_metabytes)) + if typeinfo is not None: + return typeinfo + ns = ns_metabytes.decode(self.namespace_decoder) + typename = type_metabytes.decode(self.typename_decoder) + # the hash computed between languages may be different. 
+ typeinfo = self._named_type_to_classinfo.get((ns, typename)) + if typeinfo is not None: + self._ns_type_to_classinfo[(ns_metabytes, type_metabytes)] = typeinfo + return typeinfo + cls = load_class(ns + "#" + typename) + classinfo = self.get_classinfo(cls) + self._ns_type_to_classinfo[(ns_metabytes, type_metabytes)] = classinfo + return classinfo + + def write_typeinfo(self, buffer, classinfo): + type_id = classinfo.type_id + internal_type_id = type_id & 0xFF + buffer.write_varuint32(type_id) + if TypeId.is_namespaced_type(internal_type_id): + self.metastring_resolver.write_meta_string_bytes( + buffer, classinfo.namespace_bytes + ) + self.metastring_resolver.write_meta_string_bytes( + buffer, classinfo.typename_bytes + ) + + def read_typeinfo(self, buffer): + type_id = buffer.read_varuint32() + internal_type_id = type_id & 0xFF + if TypeId.is_namespaced_type(internal_type_id): + ns_metabytes = self.metastring_resolver.read_meta_string_bytes(buffer) + type_metabytes = self.metastring_resolver.read_meta_string_bytes(buffer) + typeinfo = self._ns_type_to_classinfo.get((ns_metabytes, type_metabytes)) + if typeinfo is None: + ns = ns_metabytes.decode(self.namespace_decoder) + typename = type_metabytes.decode(self.typename_decoder) + # TODO(chaokunyang) generate a dynamic class and serializer + # when meta share is enabled. + name = ns + "." 
+ typename if ns else typename + raise TypeUnregisteredError(f"{name} not registered") + return typeinfo + else: + return self._type_id_to_classinfo[type_id] + + def reset(self): + pass + + def reset_read(self): + pass + + def reset_write(self): + pass diff --git a/python/pyfury/_serialization.pyx b/python/pyfury/_serialization.pyx index 8e7ad3c47b..86abdf0210 100644 --- a/python/pyfury/_serialization.pyx +++ b/python/pyfury/_serialization.pyx @@ -19,28 +19,23 @@ # cython: embedsignature = True # cython: language_level = 3 # cython: annotate = True -import array -import dataclasses import datetime -import enum import logging import os -import sys import warnings -from typing import TypeVar, Union, Iterable, get_type_hints +from typing import TypeVar, Union, Iterable from pyfury._util import get_bit, set_bit, clear_bit -from pyfury._fury import Language, OpaqueObject +from pyfury._fury import Language from pyfury._fury import _PicklerStub, _UnpicklerStub, Pickler, Unpickler from pyfury._fury import _ENABLE_CLASS_REGISTRATION_FORCIBLY -from pyfury.error import ClassNotCompatibleError from pyfury.lib import mmh3 from pyfury.meta.metastring import Encoding -from pyfury.type import is_primitive_type, FuryType, Int8Type, Int16Type, Int32Type, \ - Int64Type, Float32Type, Float64Type, Int16ArrayType, Int32ArrayType, \ - Int64ArrayType, Float32ArrayType, Float64ArrayType, infer_field, load_class +from pyfury.type import is_primitive_type from pyfury.util import is_little_endian +from pyfury.includes.libserialization cimport TypeId, IsNamespacedType +from libc.stdint cimport int8_t, int16_t, int32_t, int64_t, uint64_t from libc.stdint cimport * from libcpp.vector cimport vector from cpython cimport PyObject @@ -59,8 +54,6 @@ try: except ImportError: np = None -import pickle # nosec # pylint: disable=import_pickle - cimport cython logger = logging.getLogger(__name__) @@ -91,8 +84,8 @@ cdef int8_t REF_VALUE_FLAG = 0 cdef class MapRefResolver: cdef flat_hash_map[uint64_t, 
int32_t] written_objects_id # id(obj) -> ref_id # Hold object to avoid tmp object gc when serialize nested fields/objects. - cdef vector[PyObject*] written_objects - cdef vector[PyObject*] read_objects + cdef vector[PyObject *] written_objects + cdef vector[PyObject *] read_objects cdef vector[int32_t] read_ref_ids cdef object read_object cdef c_bool ref_tracking @@ -116,21 +109,21 @@ cdef class MapRefResolver: if obj is None: buffer.write_int8(NULL_FLAG) return True - cdef uint64_t object_id = obj + cdef uint64_t object_id = obj cdef int32_t next_id cdef flat_hash_map[uint64_t, int32_t].iterator it = \ self.written_objects_id.find(object_id) if it == self.written_objects_id.end(): next_id = self.written_objects_id.size() self.written_objects_id[object_id] = next_id - self.written_objects.push_back(obj) + self.written_objects.push_back( obj) Py_INCREF(obj) buffer.write_int8(REF_VALUE_FLAG) return False else: # The obj has been written previously. buffer.write_int8(REF_FLAG) - buffer.write_varint32(deref(it).second) + buffer.write_varuint32( deref(it).second) return True cpdef inline int8_t read_ref_or_null(self, Buffer buffer): @@ -140,8 +133,8 @@ cdef class MapRefResolver: cdef int32_t ref_id if head_flag == REF_FLAG: # read reference id and get object from reference resolver - ref_id = buffer.read_varint32() - self.read_object = (self.read_objects[ref_id]) + ref_id = buffer.read_varuint32() + self.read_object = (self.read_objects[ref_id]) return REF_FLAG else: self.read_object = None @@ -163,8 +156,8 @@ cdef class MapRefResolver: head_flag = buffer.read_int8() if head_flag == REF_FLAG: # read reference id and get object from reference resolver - ref_id = buffer.read_varint32() - self.read_object = (self.read_objects[ref_id]) + ref_id = buffer.read_varuint32() + self.read_object = (self.read_objects[ref_id]) # `head_flag` except `REF_FLAG` can be used as stub reference id because # we use `refId >= NOT_NULL_VALUE_FLAG` to read data. 
return head_flag @@ -182,7 +175,7 @@ cdef class MapRefResolver: cdef c_bool need_inc = self.read_objects[ref_id] == NULL if need_inc: Py_INCREF(obj) - self.read_objects[ref_id] = obj + self.read_objects[ref_id] = obj cpdef inline get_read_object(self, id_=None): if not self.ref_tracking: @@ -190,7 +183,7 @@ cdef class MapRefResolver: if id_ is None: return self.read_object cdef int32_t ref_id = id_ - return (self.read_objects[ref_id]) + return (self.read_objects[ref_id]) cpdef inline set_read_object(self, int32_t ref_id, obj): if not self.ref_tracking: @@ -199,7 +192,7 @@ cdef class MapRefResolver: need_inc = self.read_objects[ref_id] == NULL if need_inc: Py_INCREF(obj) - self.read_objects[ref_id] = obj + self.read_objects[ref_id] = obj cpdef inline reset(self): self.reset_write() @@ -220,8 +213,6 @@ cdef class MapRefResolver: self.read_ref_ids.clear() self.read_object = None - -cdef int8_t NOT_SUPPORT_CROSS_LANGUAGE = 0 cdef int8_t USE_CLASSNAME = 0 cdef int8_t USE_CLASS_ID = 1 # preserve 0 as flag for class id not set in ClassInfo` @@ -247,601 +238,356 @@ cdef int32_t NOT_NULL_STRING_FLAG = NOT_NULL_VALUE_FLAG & 0b11111111 | \ cdef int32_t SMALL_STRING_THRESHOLD = 16 -cdef class BufferObject: - """ - Fury binary representation of an object. - Note: This class is used for zero-copy out-of-band serialization and shouldn't be - used for any other cases. 
- """ - - cpdef int32_t total_bytes(self): - """total size for serialized bytes of an object""" - raise NotImplementedError - - cpdef write_to(self, Buffer buffer): - """Write serialized object to a buffer.""" - raise NotImplementedError - - cpdef Buffer to_buffer(self): - """Write serialized data as Buffer.""" - raise NotImplementedError - - @cython.final -cdef class BytesBufferObject(BufferObject): - cdef public bytes binary - - def __init__(self, bytes binary): - self.binary = binary - - cpdef inline int32_t total_bytes(self): - return len(self.binary) - - cpdef inline write_to(self, Buffer buffer): - buffer.write_bytes(self.binary) - - cpdef inline Buffer to_buffer(self): - return Buffer(self.binary) - +cdef class MetaStringBytes: + cdef public bytes data + cdef int16_t length + cdef public int8_t encoding + cdef public int64_t hashcode + cdef public int16_t dynamic_write_string_id -@cython.final -cdef class _PickleStub: - pass + def __init__(self, data, hashcode): + self.data = data + self.length = len(data) + self.hashcode = hashcode + self.encoding = hashcode & 0xff + self.dynamic_write_string_id = DEFAULT_DYNAMIC_WRITE_STRING_ID + def __eq__(self, other): + return type(other) is MetaStringBytes and other.hashcode == self.hashcode -@cython.final -cdef class PickleStrongCacheStub: - pass + def __hash__(self): + return self.hashcode + def decode(self, decoder): + return decoder.decode(self.data, Encoding(self.encoding)) -@cython.final -cdef class PickleCacheStub: - pass + def __repr__(self): + return f"MetaStringBytes(data={self.data}, hashcode={self.hashcode})" @cython.final -cdef class ClassResolver: +cdef class MetaStringResolver: cdef: - readonly Fury fury - dict _type_id_to_class # Dict[int, type] - dict _type_id_to_serializer # Dict[int, Serializer] - dict _type_id_and_cls_to_serializer # Dict[Tuple[int, type], Serializer] - dict _type_tag_to_class_x_lang_map - int16_t _class_id_counter - set _used_classes_id - - public list _registered_id2_class_info - 
vector[PyObject*] _c_registered_id2_class_info - - # cls -> ClassInfo - flat_hash_map[uint64_t, PyObject*] _c_classes_info - # hash -> ClassInfo - flat_hash_map[int64_t, PyObject*] _c_hash_to_classinfo - # hash -> MetaStringBytes - flat_hash_map[int64_t, PyObject*] _c_hash_to_enum_string_bytes - flat_hash_map[pair[int64_t, int64_t], PyObject*] _c_hash_to_small_metastring_bytes - # classname MetaStringBytes address -> class - flat_hash_map[uint64_t, PyObject*] _c_str_bytes_to_class - # classname MetaStringBytes address -> str - flat_hash_map[uint64_t, PyObject*] _c_enum_str_to_str - int16_t dynamic_write_string_id - vector[PyObject*] _c_dynamic_written_enum_string - vector[PyObject*] _c_dynamic_id_to_enum_string_vec - vector[PyObject*] _c_dynamic_id_to_classinfo_vec - Serializer _serializer - - # hold objects to avoid gc, since flat_hash_map/vector doesn't - # hold python reference. - public dict _classes_info # Dict[type, "ClassInfo"] - set _class_set - set _classname_set + vector[PyObject *] _c_dynamic_written_enum_string + vector[PyObject *] _c_dynamic_id_to_enum_string_vec + # hash -> MetaStringBytes + flat_hash_map[int64_t, PyObject *] _c_hash_to_metastr_bytes + flat_hash_map[pair[int64_t, int64_t], PyObject *] _c_hash_to_small_metastring_bytes set _enum_str_set + dict _metastr_to_metastr_bytes - def __init__(self, fury): - self.fury = fury - self._type_id_to_class = dict() - self._type_id_to_serializer = dict() - self._type_id_and_cls_to_serializer = dict() - self._type_tag_to_class_x_lang_map = dict() - self._class_id_counter = PICKLE_CACHE_CLASS_ID + 1 - self._used_classes_id = set() - self._registered_id2_class_info = list() - - self.dynamic_write_string_id = 0 - self._serializer = None - - self._classes_info = dict() - self._class_set = set() - self._classname_set = set() + def __init__(self): self._enum_str_set = set() - - def initialize(self): - self.register_class(int, class_id=PYINT_CLASS_ID) - self.register_class(float, class_id=PYFLOAT_CLASS_ID) - 
self.register_class(bool, class_id=PYBOOL_CLASS_ID) - self.register_class(str, class_id=STRING_CLASS_ID) - self.register_class(_PickleStub, class_id=PICKLE_CLASS_ID) - self.register_class(PickleStrongCacheStub, class_id=PICKLE_STRONG_CACHE_CLASS_ID) - self.register_class(PickleCacheStub, class_id=PICKLE_CACHE_CLASS_ID) - self._add_default_serializers() - - def register_serializer(self, cls: Union[type, TypeVar], serializer): - assert isinstance(cls, (type, TypeVar)), cls - type_id = serializer.get_xtype_id() - if type_id != NOT_SUPPORT_CROSS_LANGUAGE: - self._add_x_lang_serializer(cls, serializer=serializer) - else: - self.register_class(cls) - self._classes_info[cls].serializer = serializer - - def register_class(self, cls: Union[type, TypeVar], *, class_id: int = None, type_tag: str = None): - """Register class with given type id or tag, if tag is not None, it will be used for - cross-language serialization.""" - if type_tag is not None: - assert class_id is None, (f"Type tag {type_tag} has been set already, " - f"set class id at the same time is not allowed.") - self.register_serializer( - cls, ComplexObjectSerializer(self.fury, cls, type_tag) - ) - return - classinfo = self._classes_info.get(cls) - if classinfo is None: - if isinstance(cls, TypeVar): - class_name_bytes = (cls.__module__ + "#" + cls.__name__).encode("utf-8") - else: - class_name_bytes = (cls.__module__ + "#" + cls.__qualname__) \ - .encode("utf-8") - class_id = class_id if class_id is not None else self._next_class_id() - assert class_id not in self._used_classes_id, ( - self._used_classes_id, - self._classes_info, - ) - classinfo = ClassInfo( - cls=cls, class_name_bytes=class_name_bytes, class_id=class_id - ) - self._classes_info[cls] = classinfo - self._c_classes_info[cls] = classinfo - if len(self._registered_id2_class_info) <= class_id: - self._registered_id2_class_info.extend( - [None] * (class_id - len(self._registered_id2_class_info) + 1) - ) - self._registered_id2_class_info[class_id] = 
classinfo - self._c_registered_id2_class_info.resize(class_id + 1) - self._c_registered_id2_class_info[class_id] = classinfo - else: - if classinfo.class_id == NO_CLASS_ID: - class_id = class_id if class_id is not None else self._next_class_id() - assert class_id not in self._used_classes_id, ( - self._used_classes_id, - self._classes_info, - ) - classinfo.class_id = class_id - if len(self._registered_id2_class_info) <= class_id: - self._registered_id2_class_info.extend( - [None] * (class_id - len(self._registered_id2_class_info) + 1) - ) - self._registered_id2_class_info[class_id] = classinfo - self._c_registered_id2_class_info.resize(class_id + 1) - self._c_registered_id2_class_info[class_id] = classinfo - else: - if class_id is not None and classinfo.class_id != class_id: - raise ValueError( - f"Inconsistent class id {class_id} vs {classinfo.class_id} " - f"for class {cls}" - ) - - def _next_class_id(self): - class_id = self._class_id_counter = self._class_id_counter + 1 - while class_id in self._used_classes_id: - class_id = self._class_id_counter = self._class_id_counter + 1 - return class_id - - def _add_serializer( - self, - cls: Union[type, TypeVar], - serializer=None, - serializer_cls=None): - if serializer_cls: - serializer = serializer_cls(self.fury, cls) - self.register_serializer(cls, serializer) - - def _add_x_lang_serializer(self, - cls: Union[type, TypeVar], - serializer=None, - serializer_cls=None): - if serializer_cls: - serializer = serializer_cls(self.fury, cls) - type_id = serializer.get_xtype_id() - - assert type_id != NOT_SUPPORT_CROSS_LANGUAGE - self._type_id_and_cls_to_serializer[(type_id, cls)] = serializer - self.register_class(cls) - classinfo = self._classes_info[cls] - classinfo.serializer = serializer - if type_id == FuryType.FURY_TYPE_TAG.value: - type_tag = serializer.get_xtype_tag() - assert type(type_tag) is str - assert type_tag not in self._type_tag_to_class_x_lang_map - classinfo.type_tag_bytes = 
MetaStringBytes(type_tag.encode("utf-8")) - self._type_tag_to_class_x_lang_map[type_tag] = cls - else: - self._type_id_to_serializer[type_id] = serializer - if type_id > NOT_SUPPORT_CROSS_LANGUAGE: - self._type_id_to_class[type_id] = cls - - def _add_default_serializers(self): - self._add_x_lang_serializer(int, serializer_cls=ByteSerializer) - self._add_x_lang_serializer(int, serializer_cls=Int16Serializer) - self._add_x_lang_serializer(int, serializer_cls=Int32Serializer) - self._add_x_lang_serializer(int, serializer_cls=Int64Serializer) - self._add_x_lang_serializer(float, serializer_cls=FloatSerializer) - self._add_x_lang_serializer(float, serializer_cls=DoubleSerializer) - self._add_serializer(type(None), serializer_cls=NoneSerializer) - self._add_serializer(bool, serializer_cls=BooleanSerializer) - self._add_serializer(Int8Type, serializer_cls=ByteSerializer) - self._add_serializer(Int16Type, serializer_cls=Int16Serializer) - self._add_serializer(Int32Type, serializer_cls=Int32Serializer) - self._add_serializer(Int64Type, serializer_cls=Int64Serializer) - self._add_serializer(Float32Type, serializer_cls=FloatSerializer) - self._add_serializer(Float64Type, serializer_cls=DoubleSerializer) - self._add_serializer(str, serializer_cls=StringSerializer) - self._add_serializer(datetime.date, serializer_cls=DateSerializer) - self._add_serializer( - datetime.datetime, serializer_cls=TimestampSerializer - ) - self._add_serializer(bytes, serializer_cls=BytesSerializer) - self._add_serializer(list, serializer_cls=ListSerializer) - self._add_serializer(tuple, serializer_cls=TupleSerializer) - self._add_serializer(dict, serializer_cls=MapSerializer) - self._add_serializer(set, serializer_cls=SetSerializer) - self._add_serializer(enum.Enum, serializer_cls=EnumSerializer) - self._add_serializer(slice, serializer_cls=SliceSerializer) - from pyfury import PickleCacheSerializer, PickleStrongCacheSerializer - - self._add_serializer(PickleStrongCacheStub, - 
serializer=PickleStrongCacheSerializer(self.fury)) - self._add_serializer(PickleCacheStub, - serializer=PickleCacheSerializer(self.fury)) - - try: - import pyarrow as pa - from pyfury.format.serializer import ( - ArrowRecordBatchSerializer, - ArrowTableSerializer, - ) - - self._add_serializer( - pa.RecordBatch, serializer_cls=ArrowRecordBatchSerializer - ) - self._add_serializer(pa.Table, serializer_cls=ArrowTableSerializer) - except Exception: - pass - for typecode in PyArraySerializer.typecode_dict.keys(): - self._add_serializer( - array.array, - serializer=PyArraySerializer(self.fury, array.array, typecode), - ) - self._add_serializer( - PyArraySerializer.typecodearray_type[typecode], - serializer=PyArraySerializer(self.fury, array.array, typecode), - ) - if np: - for dtype in Numpy1DArraySerializer.dtypes_dict.keys(): - self._add_serializer( - np.ndarray, - serializer=Numpy1DArraySerializer(self.fury, array.array, dtype), - ) - - cpdef inline Serializer get_serializer(self, cls=None, type_id=None, obj=None): - """ - Returns - ------- - Returns or create serializer for the provided class - """ - assert cls is not None or type_id is not None or obj is not None - - cdef Serializer serializer_ - if obj is not None: - cls = type(obj) - if cls is int and 2**63 - 1 >= obj >= -(2**63): - type_id = FuryType.INT64.value - elif cls is float: - type_id = FuryType.DOUBLE.value - elif cls is array.array: - info = PyArraySerializer.typecode_dict.get(obj.typecode) - if info is not None: - type_id = info[1] - elif np and cls is np.ndarray and obj.ndim == 1: - info = Numpy1DArraySerializer.dtypes_dict.get(obj.dtype) - if info: - type_id = info[2] - if type_id is not None: - if cls is not None: - serializer_ = self._type_id_and_cls_to_serializer[(type_id, cls)] - else: - serializer_ = self._type_id_to_serializer[type_id] - else: - class_info = self._classes_info.get(cls) - if class_info is not None: - serializer_ = class_info.serializer - else: - self._add_serializer(cls, 
serializer=self.get_or_create_serializer(cls)) - serializer_ = self._classes_info.get(cls).serializer - self._serializer = serializer_ - return serializer_ - - cpdef inline Serializer get_or_create_serializer(self, cls): - return self.get_or_create_classinfo(cls).serializer - - cpdef inline ClassInfo get_or_create_classinfo(self, cls): - cdef PyObject* classinfo_ptr = self._c_classes_info[cls] - cdef ClassInfo class_info - if classinfo_ptr != NULL: - class_info = classinfo_ptr - if class_info.serializer is not None: - return class_info - else: - class_info.serializer = self._create_serializer(cls) - return class_info - else: - serializer = self._create_serializer(cls) - if type(serializer) is PickleSerializer: - class_id = PICKLE_CLASS_ID - else: - class_id = NO_CLASS_ID - class_name_bytes = (cls.__module__ + "#" + cls.__qualname__).encode("utf-8") - class_info = ClassInfo( - cls=cls, class_name_bytes=class_name_bytes, - serializer=serializer, class_id=class_id - ) - self._classes_info[cls] = class_info - self._c_classes_info[cls] = class_info - return class_info - - cdef _create_serializer(self, cls): - mro = cls.__mro__ - classinfo_ = self._classes_info.get(cls) - for clz in mro: - class_info = self._classes_info.get(clz) - if ( - class_info - and class_info.serializer - and class_info.serializer.support_subclass() - ): - if classinfo_ is None or classinfo_.class_id == NO_CLASS_ID: - logger.info("Class %s not registered", cls) - serializer = type(class_info.serializer)(self.fury, cls) - break - else: - if dataclasses.is_dataclass(cls): - if classinfo_ is None or classinfo_.class_id == NO_CLASS_ID: - logger.info("Class %s not registered", cls) - from pyfury import DataClassSerializer - - serializer = DataClassSerializer(self.fury, cls) - else: - serializer = PickleSerializer(self.fury, cls) - return serializer - - cpdef inline write_classinfo(self, Buffer buffer, ClassInfo classinfo): - cdef int32_t class_id = classinfo.class_id - if class_id != NO_CLASS_ID: - 
buffer.write_varint32((class_id << 1)) - return - buffer.write_varint32(1) - self._write_enum_string_bytes(buffer, classinfo.class_name_bytes) - - cpdef inline ClassInfo read_classinfo(self, Buffer buffer): - cdef int32_t h1 = buffer.read_varint32() - cdef int32_t class_id = h1 >> 1 - cdef ClassInfo classinfo - cdef PyObject* classinfo_ptr - # registered class id are greater than `NO_CLASS_ID`. - if h1 & 0b1 == 0: - assert class_id >= 0, class_id - classinfo_ptr = self._c_registered_id2_class_info[class_id] - if classinfo_ptr == NULL: - raise ValueError(f"Unexpected class_id {class_id} " - f"{self._registered_id2_class_info}") - classinfo = classinfo_ptr - if classinfo.serializer is None: - classinfo.serializer = self._create_serializer(classinfo.cls) - return classinfo - cdef int32_t header = buffer.read_varint32() - cdef int32_t length = header >> 1 - if header & 0b1 != 0: - return self._c_dynamic_id_to_classinfo_vec[length - 1] - cdef int64_t class_name_bytes_hash = buffer.read_int64() - cdef int32_t reader_index = buffer.reader_index - buffer.check_bound(reader_index, length) - buffer.reader_index = reader_index + length - classinfo_ptr = self._c_hash_to_classinfo[class_name_bytes_hash] - if classinfo_ptr != NULL: - self._c_dynamic_id_to_classinfo_vec.push_back(classinfo_ptr) - return classinfo_ptr - cdef bytes classname_bytes = buffer.get_bytes(reader_index, length) - cdef str full_class_name = classname_bytes.decode(encoding="utf-8") - cls = load_class(full_class_name) - classinfo = self.get_or_create_classinfo(cls) - classinfo_ptr = classinfo - self._c_hash_to_classinfo[class_name_bytes_hash] = classinfo_ptr - self._c_dynamic_id_to_classinfo_vec.push_back(classinfo_ptr) - return classinfo - - cdef inline _write_enum_string_bytes( - self, Buffer buffer, MetaStringBytes enum_string_bytes): - cdef int16_t dynamic_class_id = enum_string_bytes.dynamic_write_string_id - cdef int32_t length = enum_string_bytes.length - if dynamic_class_id == 
DEFAULT_DYNAMIC_WRITE_STRING_ID: - dynamic_class_id = self.dynamic_write_string_id - enum_string_bytes.dynamic_write_string_id = dynamic_class_id + self._metastr_to_metastr_bytes = dict() + + cpdef inline write_meta_string_bytes( + self, Buffer buffer, MetaStringBytes metastr_bytes): + cdef int16_t dynamic_type_id = metastr_bytes.dynamic_write_string_id + cdef int32_t length = metastr_bytes.length + if dynamic_type_id == DEFAULT_DYNAMIC_WRITE_STRING_ID: + dynamic_type_id = self.dynamic_write_string_id + metastr_bytes.dynamic_write_string_id = dynamic_type_id self.dynamic_write_string_id += 1 - self._c_dynamic_written_enum_string.push_back(enum_string_bytes) - buffer.write_varint32(length << 1) + self._c_dynamic_written_enum_string.push_back( metastr_bytes) + buffer.write_varuint32(length << 1) if length <= SMALL_STRING_THRESHOLD: - buffer.write_int8(Encoding.UTF_8.value) + buffer.write_int8(metastr_bytes.encoding) else: - buffer.write_int64(enum_string_bytes.hashcode) - buffer.write_bytes(enum_string_bytes.data) + buffer.write_int64(metastr_bytes.hashcode) + buffer.write_bytes(metastr_bytes.data) else: - buffer.write_varint32(((dynamic_class_id + 1) << 1) | 1) + buffer.write_varuint32(((dynamic_type_id + 1) << 1) | 1) - cdef inline MetaStringBytes _read_enum_string_bytes(self, Buffer buffer): - cdef int32_t header = buffer.read_varint32() + cpdef inline MetaStringBytes read_meta_string_bytes(self, Buffer buffer): + cdef int32_t header = buffer.read_varuint32() cdef int32_t length = header >> 1 if header & 0b1 != 0: - return self._c_dynamic_id_to_enum_string_vec[length - 1] + return self._c_dynamic_id_to_enum_string_vec[length - 1] cdef int64_t v1 = 0, v2 = 0, hashcode - cdef PyObject* enum_str_ptr + cdef PyObject * enum_str_ptr cdef int32_t reader_index + cdef encoding = 0 if length <= SMALL_STRING_THRESHOLD: - # TODO(chaokunyang) support metastring encoding - buffer.read_int8() + encoding = buffer.read_int8() if length <= 8: v1 = buffer.read_bytes_as_int64(length) 
else: v1 = buffer.read_int64() v2 = buffer.read_bytes_as_int64(length - 8) - hashcode = v1 * 31 + v2 + hashcode = ((v1 * 31 + v2) >> 8 << 8) | encoding enum_str_ptr = self._c_hash_to_small_metastring_bytes[pair[int64_t, int64_t](v1, v2)] if enum_str_ptr == NULL: reader_index = buffer.reader_index str_bytes = buffer.get_bytes(reader_index - length, length) enum_str = MetaStringBytes(str_bytes, hashcode=hashcode) self._enum_str_set.add(enum_str) - enum_str_ptr = enum_str + enum_str_ptr = enum_str self._c_hash_to_small_metastring_bytes[pair[int64_t, int64_t](v1, v2)] = enum_str_ptr else: hashcode = buffer.read_int64() reader_index = buffer.reader_index buffer.check_bound(reader_index, length) buffer.reader_index = reader_index + length - enum_str_ptr = self._c_hash_to_enum_string_bytes[hashcode] + enum_str_ptr = self._c_hash_to_metastr_bytes[hashcode] if enum_str_ptr == NULL: str_bytes = buffer.get_bytes(reader_index, length) enum_str = MetaStringBytes(str_bytes, hashcode=hashcode) self._enum_str_set.add(enum_str) - enum_str_ptr = enum_str - self._c_hash_to_enum_string_bytes[hashcode] = enum_str_ptr + enum_str_ptr = enum_str + self._c_hash_to_metastr_bytes[hashcode] = enum_str_ptr self._c_dynamic_id_to_enum_string_vec.push_back(enum_str_ptr) - return enum_str_ptr - - cpdef inline xwrite_class(self, Buffer buffer, cls): - cdef PyObject* classinfo_ptr = self._c_classes_info[cls] - assert classinfo_ptr != NULL - cdef MetaStringBytes class_name_bytes = (classinfo_ptr).class_name_bytes - self._write_enum_string_bytes(buffer, class_name_bytes) - - cpdef inline xwrite_type_tag(self, Buffer buffer, cls): - cdef PyObject* classinfo_ptr = self._c_classes_info[cls] - assert classinfo_ptr != NULL - cdef MetaStringBytes type_tag_bytes = (classinfo_ptr).type_tag_bytes - self._write_enum_string_bytes(buffer, type_tag_bytes) - - cpdef inline read_class_by_type_tag(self, Buffer buffer): - tag = self.xread_classname(buffer) - return self._type_tag_to_class_x_lang_map[tag] - - cpdef 
inline xread_class(self, Buffer buffer): - cdef MetaStringBytes str_bytes = self._read_enum_string_bytes(buffer) - cdef uint64_t object_id = str_bytes - cdef PyObject* cls_ptr = self._c_str_bytes_to_class[object_id] - if cls_ptr != NULL: - return cls_ptr - cdef str full_class_name = str_bytes.data.decode(encoding="utf-8") - cls = load_class(full_class_name) - self._c_str_bytes_to_class[object_id] = cls - self._class_set.add(cls) - return cls - - cpdef inline str xread_classname(self, Buffer buffer): - cdef MetaStringBytes str_bytes = self._read_enum_string_bytes(buffer) - cdef uint64_t object_id = str_bytes - cdef PyObject* classname_ptr = self._c_enum_str_to_str[object_id] - if classname_ptr != NULL: - return classname_ptr - cdef str full_class_name = str_bytes.data.decode(encoding="utf-8") - self._c_enum_str_to_str[object_id] = full_class_name - self._classname_set.add(full_class_name) - return full_class_name - - cpdef inline get_class_by_type_id(self, int32_t type_id): - return self._type_id_to_class[type_id] + return enum_str_ptr - cpdef inline reset(self): - self.reset_write() - self.reset_read() + def get_metastr_bytes(self, metastr): + metastr_bytes = self._metastr_to_metastr_bytes.get(metastr) + if metastr_bytes is not None: + return metastr_bytes + cdef int64_t v1 = 0, v2 = 0, hashcode + length = len(metastr.encoded_data) + if length <= SMALL_STRING_THRESHOLD: + data_buf = Buffer(metastr.encoded_data) + if length <= 8: + v1 = data_buf.read_bytes_as_int64(length) + else: + v1 = data_buf.read_int64() + v2 = data_buf.read_bytes_as_int64(length - 8) + value_hash = ((v1 * 31 + v2) >> 8 << 8) | metastr.encoding.value + else: + value_hash = mmh3.hash_buffer(metastr.encoded_data, seed=47)[0] + value_hash = value_hash >> 8 << 8 + value_hash |= metastr.encoding.value & 0xFF + self._metastr_to_metastr_bytes[metastr] = metastr_bytes = MetaStringBytes(metastr.encoded_data, value_hash) + return metastr_bytes cpdef inline reset_read(self): 
self._c_dynamic_id_to_enum_string_vec.clear() - self._c_dynamic_id_to_classinfo_vec.clear() cpdef inline reset_write(self): if self.dynamic_write_string_id != 0: self.dynamic_write_string_id = 0 for ptr in self._c_dynamic_written_enum_string: - (ptr).dynamic_write_string_id = \ + ( ptr).dynamic_write_string_id = \ DEFAULT_DYNAMIC_WRITE_STRING_ID self._c_dynamic_written_enum_string.clear() -@cython.final -cdef class MetaStringBytes: - cdef bytes data - cdef int16_t length - cdef int64_t hashcode - cdef int16_t dynamic_write_string_id - - def __init__(self, data, hashcode=None): - self.data = data - self.length = len(data) - if hashcode is None: - hashcodes = mmh3.hash_buffer(data, 47) - hashcode = hashcodes[0] - assert isinstance(hashcode, int) - # FIXME: why using & 0xffffffffffffff00 overflow in cython - hashcode = (hashcode >> 8) << 8 - self.hashcode = hashcode - self.dynamic_write_string_id = DEFAULT_DYNAMIC_WRITE_STRING_ID - - def __eq__(self, other): - return type(other) is MetaStringBytes and other.hashcode == self.hashcode - - def __hash__(self): - return self.hashcode - - @cython.final cdef class ClassInfo: + """ + If dynamic_type is true, the serializer will be a dynamic typed serializer + and it will write type info when writing the data. + In such cases, the `write_classinfo` should not write typeinfo. + In general, if we have 4 type for one class, we will have 5 serializers. + For example, we have int8/16/32/64/128 for python `int` type, then we have 6 serializers + for python `int`: `Int8/1632/64/128Serializer` for `int8/16/32/64/128` each, and another + `IntSerializer` for `int` which will dispatch to different `int8/16/32/64/128` type + according the actual value. + We do not get the acutal type here, because it will introduce extra computing. + For example, we have want to get actual `Int8/16/32/64Serializer`, we must check and + extract the actutal here which will introduce cost, and we will do same thing again + when serializing the actual data. 
+ """ cdef public object cls - cdef public int16_t class_id + cdef public int16_t type_id cdef public Serializer serializer - cdef public MetaStringBytes class_name_bytes - cdef public MetaStringBytes type_tag_bytes + cdef public MetaStringBytes namespace_bytes + cdef public MetaStringBytes typename_bytes + cdef public c_bool dynamic_type def __init__( self, cls: Union[type, TypeVar] = None, - class_id: int = NO_CLASS_ID, + type_id: int = NO_CLASS_ID, serializer: Serializer = None, - class_name_bytes: bytes = None, - type_tag_bytes: bytes = None, + namespace_bytes: MetaStringBytes = None, + typename_bytes: MetaStringBytes = None, + dynamic_type: bool = False, ): self.cls = cls - self.class_id = class_id + self.type_id = type_id self.serializer = serializer - self.class_name_bytes = MetaStringBytes(class_name_bytes) - if type_tag_bytes is None: - self.type_tag_bytes = None - else: - self.type_tag_bytes = MetaStringBytes(type_tag_bytes) + self.namespace_bytes = namespace_bytes + self.typename_bytes = typename_bytes + self.dynamic_type = dynamic_type def __repr__(self): - return f"ClassInfo(cls={self.cls}, class_id={self.class_id}, " \ + return f"ClassInfo(cls={self.cls}, type_id={self.type_id}, " \ f"serializer={self.serializer})" +@cython.final +cdef class ClassResolver: + cdef: + readonly Fury fury + readonly MetaStringResolver metastring_resolver + object _resolver + vector[PyObject *] _c_registered_id_to_class_info + # cls -> ClassInfo + flat_hash_map[uint64_t, PyObject *] _c_classes_info + # hash -> ClassInfo + flat_hash_map[pair[int64_t, int64_t], PyObject *] _c_meta_hash_to_classinfo + MetaStringResolver meta_string_resolver + + def __init__(self, fury): + self.fury = fury + self.metastring_resolver = fury.metastring_resolver + from pyfury._registry import ClassResolver + self._resolver = ClassResolver(fury) + + def initialize(self): + self._resolver.initialize() + for classinfo in self._resolver._classes_info.values(): + self._populate_typeinfo(classinfo) + + 
def register_type( + self, + cls: Union[type, TypeVar], + *, + type_id: int = None, + namespace: str = None, + typename: str = None, + serializer=None, + ): + typeinfo = self._resolver.register_type( + cls, + type_id=type_id, + namespace=namespace, + typename=typename, + serializer=serializer, + ) + self._populate_typeinfo(typeinfo) + + cdef _populate_typeinfo(self, typeinfo): + type_id = typeinfo.type_id + if type_id >= self._c_registered_id_to_class_info.size(): + self._c_registered_id_to_class_info.resize(type_id * 2, NULL) + if type_id > 0 and (self.fury.language == Language.PYTHON or not IsNamespacedType(type_id)): + self._c_registered_id_to_class_info[type_id] = typeinfo + self._c_classes_info[ typeinfo.cls] = typeinfo + if typeinfo.typename_bytes is not None: + self._load_bytes_to_classinfo(type_id, typeinfo.namespace_bytes, typeinfo.typename_bytes) + + def register_serializer(self, cls: Union[type, TypeVar], serializer): + classinfo1 = self._resolver.get_classinfo(cls) + self._resolver.register_serializer(cls, serializer) + classinfo2 = self._resolver.get_classinfo(cls) + if classinfo1.type_id != classinfo2.type_id: + self._c_registered_id_to_class_info[classinfo1.type_id] = NULL + self._populate_typeinfo(classinfo2) + + cpdef inline Serializer get_serializer(self, cls): + """ + Returns + ------- + Returns or create serializer for the provided class + """ + return self.get_classinfo(cls).serializer + + cpdef inline ClassInfo get_classinfo(self, cls, create=True): + cdef PyObject * classinfo_ptr = self._c_classes_info[ cls] + cdef ClassInfo class_info + if classinfo_ptr != NULL: + class_info = classinfo_ptr + if class_info.serializer is not None: + return class_info + else: + class_info.serializer = self._resolver._create_serializer(cls) + return class_info + elif not create: + return None + else: + class_info = self._resolver.get_classinfo(cls, create=create) + self._c_classes_info[ cls] = class_info + self._populate_typeinfo(class_info) + return class_info 
+ + cpdef inline write_classinfo(self, Buffer buffer, ClassInfo classinfo): + if classinfo.dynamic_type: + return + cdef int32_t type_id = classinfo.type_id + if type_id != NO_CLASS_ID: + buffer.write_varuint32((type_id << 1)) + return + buffer.write_varuint32(1) + self.metastring_resolver.write_meta_string_bytes( + buffer, classinfo.namespace_bytes + ) + self.metastring_resolver.write_meta_string_bytes( + buffer, classinfo.typename_bytes + ) + + cpdef inline ClassInfo read_classinfo(self, Buffer buffer): + cdef int32_t h1 = buffer.read_varuint32() + cdef int32_t type_id = h1 >> 1 + cdef ClassInfo classinfo + cdef PyObject * classinfo_ptr + # registered class id are greater than `NO_CLASS_ID`. + if h1 & 0b1 == 0: + if type_id < 0 or type_id >= self._c_registered_id_to_class_info.size(): + raise ValueError(f"Unexpected type_id {type_id}") + classinfo_ptr = self._c_registered_id_to_class_info[type_id] + if classinfo_ptr == NULL: + raise ValueError(f"Unexpected type_id {type_id}") + classinfo = classinfo_ptr + if classinfo.serializer is None: + classinfo.serializer = self._resolver._create_serializer(classinfo.cls) + return classinfo + cdef MetaStringBytes ns_metabytes = self.metastring_resolver.read_meta_string_bytes(buffer) + cdef MetaStringBytes type_metabytes = self.metastring_resolver.read_meta_string_bytes(buffer) + return self._load_bytes_to_classinfo(type_id, ns_metabytes, type_metabytes) + + cdef inline ClassInfo _load_bytes_to_classinfo( + self, int32_t type_id, MetaStringBytes ns_metabytes, MetaStringBytes type_metabytes): + cdef PyObject * classinfo_ptr = self._c_meta_hash_to_classinfo[ + pair[int64_t, int64_t](ns_metabytes.hashcode, type_metabytes.hashcode)] + if classinfo_ptr != NULL: + return classinfo_ptr + classinfo = self._resolver._load_metabytes_to_classinfo(ns_metabytes, type_metabytes) + classinfo_ptr = classinfo + self._c_meta_hash_to_classinfo[pair[int64_t, int64_t]( + ns_metabytes.hashcode, type_metabytes.hashcode)] = classinfo_ptr + return 
classinfo + + cpdef write_typeinfo(self, Buffer buffer, ClassInfo classinfo): + if classinfo.dynamic_type: + return + cdef: + int32_t type_id = classinfo.type_id + int32_t internal_type_id = type_id & 0xFF + buffer.write_varuint32(type_id) + if IsNamespacedType(internal_type_id): + self.metastring_resolver.write_meta_string_bytes(buffer, classinfo.namespace_bytes) + self.metastring_resolver.write_meta_string_bytes(buffer, classinfo.typename_bytes) + + cpdef inline ClassInfo read_typeinfo(self, Buffer buffer): + cdef: + int32_t type_id = buffer.read_varuint32() + int32_t internal_type_id = type_id & 0xFF + cdef MetaStringBytes namespace_bytes, typename_bytes + if IsNamespacedType(internal_type_id): + namespace_bytes = self.metastring_resolver.read_meta_string_bytes(buffer) + typename_bytes = self.metastring_resolver.read_meta_string_bytes(buffer) + return self._load_bytes_to_classinfo(type_id, namespace_bytes, typename_bytes) + if type_id < 0 or type_id > self._c_registered_id_to_class_info.size(): + raise ValueError(f"Unexpected type_id {type_id}") + classinfo_ptr = self._c_registered_id_to_class_info[type_id] + if classinfo_ptr == NULL: + raise ValueError(f"Unexpected type_id {type_id}") + classinfo = classinfo_ptr + return classinfo + + cpdef inline reset(self): + pass + + cpdef inline reset_read(self): + pass + + cpdef inline reset_write(self): + pass + + @cython.final cdef class Fury: cdef readonly object language @@ -849,6 +595,7 @@ cdef class Fury: cdef readonly c_bool require_class_registration cdef readonly MapRefResolver ref_resolver cdef readonly ClassResolver class_resolver + cdef readonly MetaStringResolver metastring_resolver cdef readonly SerializationContext serialization_context cdef Buffer buffer cdef public object pickler # pickle.Pickler @@ -858,14 +605,13 @@ cdef class Fury: cdef object _unsupported_callback cdef object _unsupported_objects # iterator cdef object _peer_language - cdef list _native_objects def __init__( - self, - 
language=Language.XLANG, - ref_tracking: bool = False, - require_class_registration: bool = True, - ): + self, + language=Language.XLANG, + ref_tracking: bool = False, + require_class_registration: bool = True, + ): """ :param require_class_registration: Whether to require registering classes for serialization, enabled by default. @@ -883,6 +629,7 @@ cdef class Fury: self.require_class_registration = False self.ref_tracking = ref_tracking self.ref_resolver = MapRefResolver(ref_tracking) + self.metastring_resolver = MetaStringResolver() self.class_resolver = ClassResolver(self) self.class_resolver.initialize() self.serialization_context = SerializationContext() @@ -903,14 +650,21 @@ cdef class Fury: self._unsupported_callback = None self._unsupported_objects = None self._peer_language = None - self._native_objects = [] def register_serializer(self, cls: Union[type, TypeVar], Serializer serializer): self.class_resolver.register_serializer(cls, serializer) - def register_class(self, cls: Union[type, TypeVar], *, - class_id: int = None, type_tag: str = None): - self.class_resolver.register_class(cls, class_id=class_id, type_tag=type_tag) + def register_type( + self, + cls: Union[type, TypeVar], + *, + type_id: int = None, + namespace: str = None, + typename: str = None, + serializer=None, + ): + self.class_resolver.register_type( + cls, type_id=type_id, namespace=namespace, typename=typename, serializer=serializer) def serialize( self, obj, @@ -968,21 +722,7 @@ cdef class Fury: if self.language == Language.PYTHON: self.serialize_ref(buffer, obj) else: - start_offset = buffer.writer_index - # preserve 4-byte for nativeObjects start offsets. 
- buffer.write_int32(-1) - # preserve 4-byte for nativeObjects size - buffer.write_int32(-1) self.xserialize_ref(buffer, obj) - buffer.put_int32(start_offset, buffer.writer_index) - buffer.put_int32(start_offset + 4, len(self._native_objects)) - self.ref_resolver.reset_write() - # fury write opaque object classname which cause later write of classname - # only write an id. - self.class_resolver.reset_write() - for native_object in self._native_objects: - self.serialize_ref(buffer, native_object) - self.reset_write() if buffer is not self.buffer: return buffer else: @@ -1010,29 +750,29 @@ cdef class Fury: if self.ref_resolver.write_ref_or_null(buffer, obj): return if classinfo is None: - classinfo = self.class_resolver.get_or_create_classinfo(cls) + classinfo = self.class_resolver.get_classinfo(cls) self.class_resolver.write_classinfo(buffer, classinfo) classinfo.serializer.write(buffer, obj) cpdef inline serialize_nonref(self, Buffer buffer, obj): cls = type(obj) if cls is str: - buffer.write_varint32(STRING_CLASS_ID << 1) + buffer.write_varuint32(STRING_CLASS_ID << 1) buffer.write_string(obj) return elif cls is int: - buffer.write_varint32(PYINT_CLASS_ID << 1) + buffer.write_varuint32(PYINT_CLASS_ID << 1) buffer.write_varint64(obj) return elif cls is bool: - buffer.write_varint32(PYBOOL_CLASS_ID << 1) + buffer.write_varuint32(PYBOOL_CLASS_ID << 1) buffer.write_bool(obj) return elif cls is float: - buffer.write_varint32(PYFLOAT_CLASS_ID << 1) + buffer.write_varuint32(PYFLOAT_CLASS_ID << 1) buffer.write_double(obj) return - cdef ClassInfo classinfo = self.class_resolver.get_or_create_classinfo(cls) + cdef ClassInfo classinfo = self.class_resolver.get_classinfo(cls) self.class_resolver.write_classinfo(buffer, classinfo) classinfo.serializer.write(buffer, obj) @@ -1054,27 +794,11 @@ cdef class Fury: cpdef inline xserialize_nonref( self, Buffer buffer, obj, Serializer serializer=None): - cls = type(obj) - serializer = serializer or 
self.class_resolver.get_serializer(obj=obj) - cdef int16_t type_id = serializer.get_xtype_id() - buffer.write_int16(type_id) - if type_id != NOT_SUPPORT_CROSS_LANGUAGE: - if type_id == FuryType.FURY_TYPE_TAG.value: - self.class_resolver.xwrite_type_tag(buffer, cls) - if type_id < NOT_SUPPORT_CROSS_LANGUAGE: - self.class_resolver.xwrite_class(buffer, cls) - serializer.xwrite(buffer, obj) - else: - # Write classname so it can be used for debugging which object doesn't - # support cross-language. - # TODO add a config to disable this to reduce space cost. - self.class_resolver.xwrite_class(buffer, cls) - # serializer may increase reference id multi times internally, thus peer - # cross-language later fields/objects deserialization will use wrong - # reference id since we skip opaque objects deserialization. - # So we stash native objects and serialize all those object at the last. - buffer.write_varint32(len(self._native_objects)) - self._native_objects.append(obj) + if serializer is None: + classinfo = self.class_resolver.get_classinfo(type(obj)) + self.class_resolver.write_typeinfo(buffer, classinfo) + serializer = classinfo.serializer + serializer.xwrite(buffer, obj) def deserialize( self, @@ -1133,17 +857,6 @@ cdef class Fury: ) if not is_target_x_lang: return self.deserialize_ref(buffer) - cdef int32_t native_objects_start_offset = buffer.read_int32() - cdef int32_t native_objects_size = buffer.read_int32() - if self._peer_language == Language.PYTHON: - if native_objects_size > 0: - native_objects_buffer = buffer.slice(native_objects_start_offset) - for i in range(native_objects_size): - self._native_objects.append( - self.deserialize_ref(native_objects_buffer) - ) - self.ref_resolver.reset_read() - self.class_resolver.reset_read() return self.xdeserialize_ref(buffer) cpdef inline deserialize_ref(self, Buffer buffer): @@ -1180,20 +893,18 @@ cdef class Fury: return buffer.read_double() return classinfo.serializer.read(buffer) - cpdef inline xdeserialize_ref( - 
self, Buffer buffer, Serializer serializer=None): + cpdef inline xdeserialize_ref(self, Buffer buffer, Serializer serializer=None): cdef MapRefResolver ref_resolver - cdef int32_t red_id + cdef int32_t ref_id if serializer is None or serializer.need_to_write_ref: ref_resolver = self.ref_resolver - red_id = ref_resolver.try_preserve_ref_id(buffer) - + ref_id = ref_resolver.try_preserve_ref_id(buffer) # indicates that the object is first read. - if red_id >= NOT_NULL_VALUE_FLAG: + if ref_id >= NOT_NULL_VALUE_FLAG: o = self.xdeserialize_nonref( buffer, serializer=serializer ) - ref_resolver.set_read_object(red_id, o) + ref_resolver.set_read_object(ref_id, o) return o else: return ref_resolver.get_read_object() @@ -1206,46 +917,18 @@ cdef class Fury: cpdef inline xdeserialize_nonref( self, Buffer buffer, Serializer serializer=None): - cdef int16_t type_id = buffer.read_int16() - cls = None - if type_id != NOT_SUPPORT_CROSS_LANGUAGE: - if type_id == FuryType.FURY_TYPE_TAG.value: - cls = self.class_resolver.read_class_by_type_tag(buffer) - if type_id < NOT_SUPPORT_CROSS_LANGUAGE: - if self._peer_language is not Language.PYTHON: - self.class_resolver.xread_classname(buffer) - cls = self.class_resolver.get_class_by_type_id(-type_id) - serializer = serializer or self.class_resolver.get_serializer( - type_id=-type_id - ) - else: - cls = self.class_resolver.xread_class(buffer) - serializer = serializer or self.class_resolver.get_serializer( - cls=cls, type_id=type_id - ) - else: - if type_id != FuryType.FURY_TYPE_TAG.value: - cls = self.class_resolver.get_class_by_type_id(type_id) - serializer = serializer or self.class_resolver.get_serializer( - cls=cls, type_id=type_id - ) - assert cls is not None - return serializer.xread(buffer) - cdef str class_name = self.class_resolver.xread_classname(buffer) - cdef int32_t ordinal = buffer.read_varint32() - if self._peer_language != Language.PYTHON: - return OpaqueObject(self._peer_language, class_name, ordinal) - else: - return 
self._native_objects[ordinal] + if serializer is None: + serializer = self.class_resolver.read_typeinfo(buffer).serializer + return serializer.xread(buffer) - cpdef inline write_buffer_object(self, Buffer buffer, BufferObject buffer_object): + cpdef inline write_buffer_object(self, Buffer buffer, buffer_object): if self._buffer_callback is not None and self._buffer_callback(buffer_object): buffer.write_bool(False) return buffer.write_bool(True) cdef int32_t size = buffer_object.total_bytes() # writer length. - buffer.write_varint32(size) + buffer.write_varuint32(size) cdef int32_t writer_index = buffer.writer_index buffer.ensure(writer_index + size) cdef Buffer buf = buffer.slice(buffer.writer_index, size) @@ -1257,7 +940,7 @@ cdef class Fury: if not in_band: assert self._buffers is not None return next(self._buffers) - cdef int32_t size = buffer.read_varint32() + cdef int32_t size = buffer.read_varuint32() cdef Buffer buf = buffer.slice(buffer.reader_index, size) buffer.reader_index += size return buf @@ -1282,7 +965,7 @@ cdef class Fury: if self.ref_resolver.write_ref_or_null(buffer, value): return if classinfo is None: - classinfo = self.class_resolver.get_or_create_classinfo(type(value)) + classinfo = self.class_resolver.get_classinfo(type(value)) self.class_resolver.write_classinfo(buffer, classinfo) classinfo.serializer.write(buffer, value) @@ -1300,16 +983,16 @@ cdef class Fury: cpdef inline reset_write(self): self.ref_resolver.reset_write() self.class_resolver.reset_write() + self.metastring_resolver.reset_write() self.serialization_context.reset() - self._native_objects.clear() self.pickler.clear_memo() self._unsupported_callback = None cpdef inline reset_read(self): self.ref_resolver.reset_read() self.class_resolver.reset_read() + self.metastring_resolver.reset_read() self.serialization_context.reset() - self._native_objects.clear() self._buffers = None self.unpickler = None self._unsupported_objects = None @@ -1318,7 +1001,6 @@ cdef class Fury: 
self.reset_write() self.reset_read() - cpdef inline write_nullable_pybool(Buffer buffer, value): if value is None: buffer.write_int8(NULL_FLAG) @@ -1347,7 +1029,6 @@ cpdef inline write_nullable_pystr(Buffer buffer, value): buffer.write_int8(NOT_NULL_VALUE_FLAG) buffer.write_string(value) - cpdef inline read_nullable_pybool(Buffer buffer): if buffer.read_int8() == NOT_NULL_VALUE_FLAG: return buffer.read_bool() @@ -1396,7 +1077,6 @@ cdef class SerializationContext: if len(self.objects) > 0: self.objects.clear() - cdef class Serializer: cdef readonly Fury fury cdef readonly object type_ @@ -1407,28 +1087,6 @@ cdef class Serializer: self.type_ = type_ self.need_to_write_ref = not is_primitive_type(type_) - cpdef int16_t get_xtype_id(self): - """ - Returns - ------- - Returns NOT_SUPPORT_CROSS_LANGUAGE if the serializer doesn't - support cross-language serialization. - Return a number in range (0, 32767) if the serializer support - cross-language serialization and native serialization data is the - same with cross-language serialization. - Return a negative short in range [-32768, 0) if the serializer - support cross-language serialization and native serialization data - is not the same with cross-language serialization. - """ - return NOT_SUPPORT_CROSS_LANGUAGE - - cpdef str get_xtype_tag(self): - """ - Returns - ------- - a type tag used for setup type mapping between languages. 
- """ - cpdef write(self, Buffer buffer, value): raise NotImplementedError @@ -1445,17 +1103,7 @@ cdef class Serializer: def support_subclass(cls) -> bool: return False - cdef class CrossLanguageCompatibleSerializer(Serializer): - cpdef write(self, Buffer buffer, value): - raise NotImplementedError - - cpdef read(self, Buffer buffer): - raise NotImplementedError - - def __init__(self, fury, type_): - super().__init__(fury, type_) - cpdef xwrite(self, Buffer buffer, value): self.write(buffer, value) @@ -1465,9 +1113,6 @@ cdef class CrossLanguageCompatibleSerializer(Serializer): @cython.final cdef class BooleanSerializer(CrossLanguageCompatibleSerializer): - cpdef inline int16_t get_xtype_id(self): - return FuryType.BOOL.value - cpdef inline write(self, Buffer buffer, value): buffer.write_bool(value) @@ -1475,27 +1120,8 @@ cdef class BooleanSerializer(CrossLanguageCompatibleSerializer): return buffer.read_bool() -@cython.final -cdef class NoneSerializer(Serializer): - - cpdef inline xwrite(self, Buffer buffer, value): - raise NotImplementedError - - cpdef inline xread(self, Buffer buffer): - raise NotImplementedError - - cpdef inline write(self, Buffer buffer, value): - pass - - cpdef inline read(self, Buffer buffer): - return None - - @cython.final cdef class ByteSerializer(CrossLanguageCompatibleSerializer): - cpdef inline int16_t get_xtype_id(self): - return FuryType.INT8.value - cpdef inline write(self, Buffer buffer, value): buffer.write_int8(value) @@ -1505,9 +1131,6 @@ cdef class ByteSerializer(CrossLanguageCompatibleSerializer): @cython.final cdef class Int16Serializer(CrossLanguageCompatibleSerializer): - cpdef inline int16_t get_xtype_id(self): - return FuryType.INT16.value - cpdef inline write(self, Buffer buffer, value): buffer.write_int16(value) @@ -1517,26 +1140,20 @@ cdef class Int16Serializer(CrossLanguageCompatibleSerializer): @cython.final cdef class Int32Serializer(CrossLanguageCompatibleSerializer): - cpdef inline int16_t get_xtype_id(self): - 
return FuryType.INT32.value - cpdef inline write(self, Buffer buffer, value): - buffer.write_int32(value) + buffer.write_varint32(value) cpdef inline read(self, Buffer buffer): - return buffer.read_int32() + return buffer.read_varint32() @cython.final cdef class Int64Serializer(CrossLanguageCompatibleSerializer): - cpdef inline int16_t get_xtype_id(self): - return FuryType.INT64.value - cpdef inline xwrite(self, Buffer buffer, value): - buffer.write_int64(value) + buffer.write_varint64(value) cpdef inline xread(self, Buffer buffer): - return buffer.read_int64() + return buffer.read_varint64() cpdef inline write(self, Buffer buffer, value): buffer.write_varint64(value) @@ -1544,12 +1161,31 @@ cdef class Int64Serializer(CrossLanguageCompatibleSerializer): cpdef inline read(self, Buffer buffer): return buffer.read_varint64() +cdef int64_t INT8_MIN_VALUE = -1 << 7 +cdef int64_t INT8_MAX_VALUE = 1 << 7 - 1 +cdef int64_t INT16_MIN_VALUE = -1 << 15 +cdef int64_t INT16_MAX_VALUE = 1 << 15 - 1 +cdef int64_t INT32_MIN_VALUE = -1 << 31 +cdef int64_t INT32_MAX_VALUE = 1 << 31 - 1 +cdef float FLOAT32_MIN_VALUE = 1.17549e-38 +cdef float FLOAT32_MAX_VALUE = 3.40282e+38 + @cython.final -cdef class FloatSerializer(CrossLanguageCompatibleSerializer): - cpdef inline int16_t get_xtype_id(self): - return FuryType.FLOAT.value +cdef class DynamicIntSerializer(CrossLanguageCompatibleSerializer): + cpdef inline xwrite(self, Buffer buffer, value): + # TODO(chaokunyang) check value range and write type and value + buffer.write_varuint32( TypeId.INT64) + buffer.write_varint64(value) + cpdef inline xread(self, Buffer buffer): + type_id = buffer.read_varuint32() + assert type_id == TypeId.INT64, type_id + return buffer.read_varint64() + + +@cython.final +cdef class FloatSerializer(CrossLanguageCompatibleSerializer): cpdef inline write(self, Buffer buffer, value): buffer.write_float(value) @@ -1559,9 +1195,6 @@ cdef class FloatSerializer(CrossLanguageCompatibleSerializer): @cython.final cdef 
class DoubleSerializer(CrossLanguageCompatibleSerializer): - cpdef inline int16_t get_xtype_id(self): - return FuryType.DOUBLE.value - cpdef inline write(self, Buffer buffer, value): buffer.write_double(value) @@ -1570,10 +1203,20 @@ cdef class DoubleSerializer(CrossLanguageCompatibleSerializer): @cython.final -cdef class StringSerializer(CrossLanguageCompatibleSerializer): - cpdef inline int16_t get_xtype_id(self): - return FuryType.STRING.value +cdef class DynamicFloatSerializer(CrossLanguageCompatibleSerializer): + cpdef inline xwrite(self, Buffer buffer, value): + # TODO(chaokunyang) check value range and write type and value + buffer.write_varuint32( TypeId.FLOAT64) + buffer.write_double(value) + + cpdef inline xread(self, Buffer buffer): + cdef int32_t type_id = buffer.read_varuint32() + assert type_id == TypeId.FLOAT64, type_id + return buffer.read_double() + +@cython.final +cdef class StringSerializer(CrossLanguageCompatibleSerializer): cpdef inline write(self, Buffer buffer, value): buffer.write_string(value) @@ -1586,9 +1229,6 @@ cdef _base_date = datetime.date(1970, 1, 1) @cython.final cdef class DateSerializer(CrossLanguageCompatibleSerializer): - cpdef inline int16_t get_xtype_id(self): - return FuryType.DATE32.value - cpdef inline write(self, Buffer buffer, value): if type(value) is not datetime.date: raise TypeError( @@ -1606,9 +1246,6 @@ cdef class DateSerializer(CrossLanguageCompatibleSerializer): @cython.final cdef class TimestampSerializer(CrossLanguageCompatibleSerializer): - cpdef inline int16_t get_xtype_id(self): - return FuryType.TIMESTAMP.value - cpdef inline write(self, Buffer buffer, value): if type(value) is not datetime.datetime: raise TypeError( @@ -1624,19 +1261,6 @@ cdef class TimestampSerializer(CrossLanguageCompatibleSerializer): return datetime.datetime.fromtimestamp(ts) -@cython.final -cdef class BytesSerializer(CrossLanguageCompatibleSerializer): - cpdef inline int16_t get_xtype_id(self): - return FuryType.BINARY.value - - cpdef 
inline write(self, Buffer buffer, value): - self.fury.write_buffer_object(buffer, BytesBufferObject(value)) - - cpdef inline read(self, Buffer buffer): - fury_buf = self.fury.read_buffer_object(buffer) - return fury_buf.to_pybytes() - - """ Collection serialization format: https://fury.apache.org/docs/specification/fury_xlang_serialization_spec/#list @@ -1648,7 +1272,6 @@ cdef int8_t COLLECTION_DEFAULT_FLAG = 0b0 cdef int8_t COLLECTION_TRACKING_REF = 0b1 cdef int8_t COLLECTION_NOT_SAME_TYPE = 0b1000 - cdef class CollectionSerializer(Serializer): cdef ClassResolver class_resolver cdef MapRefResolver ref_resolver @@ -1660,9 +1283,6 @@ cdef class CollectionSerializer(Serializer): self.ref_resolver = fury.ref_resolver self.elem_serializer = elem_serializer - cpdef int16_t get_xtype_id(self): - return -FuryType.LIST.value - cdef pair[int8_t, int64_t] write_header(self, Buffer buffer, value): cdef int8_t collect_flag = COLLECTION_DEFAULT_FLAG elem_type = type(next(iter(value))) @@ -1682,7 +1302,7 @@ cdef class CollectionSerializer(Serializer): cdef pair[int8_t, int64_t] header_pair = self.write_header(buffer, value) cdef int8_t collect_flag = header_pair.first cdef int64_t elem_type_ptr = header_pair.second - cdef elem_type = int2obj(elem_type_ptr) + cdef elem_type = int2obj(elem_type_ptr) cdef MapRefResolver ref_resolver = self.ref_resolver cdef ClassResolver class_resolver = self.class_resolver if (collect_flag & COLLECTION_NOT_SAME_TYPE) == 0: @@ -1716,7 +1336,7 @@ cdef class CollectionSerializer(Serializer): buffer.write_double(s) else: if not ref_resolver.write_ref_or_null(buffer, s): - classinfo = class_resolver.get_or_create_classinfo(cls) + classinfo = class_resolver.get_classinfo(cls) class_resolver.write_classinfo(buffer, classinfo) classinfo.serializer.write(buffer, s) @@ -1763,7 +1383,7 @@ cdef class CollectionSerializer(Serializer): cpdef _write_same_type_no_ref(self, Buffer buffer, value, elem_type): cdef MapRefResolver ref_resolver = self.ref_resolver cdef 
ClassResolver class_resolver = self.class_resolver - classinfo = class_resolver.get_or_create_classinfo(elem_type) + classinfo = class_resolver.get_classinfo(elem_type) class_resolver.write_classinfo(buffer, classinfo) for s in value: classinfo.serializer.write(buffer, s) @@ -1779,7 +1399,7 @@ cdef class CollectionSerializer(Serializer): cpdef _write_same_type_ref(self, Buffer buffer, value, elem_type): cdef MapRefResolver ref_resolver = self.ref_resolver cdef ClassResolver class_resolver = self.class_resolver - classinfo = class_resolver.get_or_create_classinfo(elem_type) + classinfo = class_resolver.get_classinfo(elem_type) class_resolver.write_classinfo(buffer, classinfo) for s in value: if not ref_resolver.write_ref_or_null(buffer, s): @@ -1808,24 +1428,20 @@ cdef class CollectionSerializer(Serializer): except AttributeError: value = list(value) len_ = len(value) - buffer.write_varint32(len_) + buffer.write_varuint32(len_) for s in value: self.fury.xserialize_ref( buffer, s, serializer=self.elem_serializer ) len_ += 1 - cdef class ListSerializer(CollectionSerializer): - cpdef int16_t get_xtype_id(self): - return FuryType.LIST.value - cpdef read(self, Buffer buffer): cdef MapRefResolver ref_resolver = self.fury.ref_resolver cdef ClassResolver class_resolver = self.fury.class_resolver cdef int64_t len_and_flag = buffer.read_varint64() cdef int64_t len_ = len_and_flag >> 4 - cdef int8_t collect_flag = (len_and_flag & 0xF) + cdef int8_t collect_flag = (len_and_flag & 0xF) cdef list list_ = PyList_New(len_) ref_resolver.reference(list_) if len_ == 0: @@ -1857,7 +1473,7 @@ cdef class ListSerializer(CollectionSerializer): PyList_SET_ITEM(collection_, index, element) cpdef xread(self, Buffer buffer): - cdef int32_t len_ = buffer.read_varint32() + cdef int32_t len_ = buffer.read_varuint32() cdef list collection_ = PyList_New(len_) self.fury.ref_resolver.reference(collection_) for i in range(len_): @@ -1868,7 +1484,6 @@ cdef class ListSerializer(CollectionSerializer): 
PyList_SET_ITEM(collection_, i, elem) return collection_ - cdef inline get_next_elenment( Buffer buffer, MapRefResolver ref_resolver, @@ -1905,7 +1520,7 @@ cdef class TupleSerializer(CollectionSerializer): cdef ClassResolver class_resolver = self.fury.class_resolver cdef int64_t len_and_flag = buffer.read_varint64() cdef int64_t len_ = len_and_flag >> 4 - cdef int8_t collect_flag = (len_and_flag & 0xF) + cdef int8_t collect_flag = (len_and_flag & 0xF) cdef tuple tuple_ = PyTuple_New(len_) if len_ == 0: return tuple_ @@ -1936,7 +1551,7 @@ cdef class TupleSerializer(CollectionSerializer): PyTuple_SET_ITEM(collection_, index, element) cpdef inline xread(self, Buffer buffer): - cdef int32_t len_ = buffer.read_varint32() + cdef int32_t len_ = buffer.read_varuint32() cdef tuple tuple_ = PyTuple_New(len_) for i in range(len_): elem = self.fury.xdeserialize_ref( @@ -1952,15 +1567,9 @@ cdef class StringArraySerializer(ListSerializer): def __init__(self, fury, type_): super().__init__(fury, type_, StringSerializer(fury, str)) - cpdef inline int16_t get_xtype_id(self): - return FuryType.FURY_STRING_ARRAY.value - @cython.final cdef class SetSerializer(CollectionSerializer): - cpdef inline int16_t get_xtype_id(self): - return FuryType.FURY_SET.value - cpdef inline read(self, Buffer buffer): cdef MapRefResolver ref_resolver = self.fury.ref_resolver cdef ClassResolver class_resolver = self.fury.class_resolver @@ -1968,7 +1577,7 @@ cdef class SetSerializer(CollectionSerializer): ref_resolver.reference(instance) cdef int64_t len_and_flag = buffer.read_varint64() cdef int64_t len_ = len_and_flag >> 4 - cdef int8_t collect_flag = (len_and_flag & 0xF) + cdef int8_t collect_flag = (len_and_flag & 0xF) cdef int32_t ref_id cdef ClassInfo classinfo if len_ == 0: @@ -2015,7 +1624,7 @@ cdef class SetSerializer(CollectionSerializer): collection_.add(element) cpdef inline xread(self, Buffer buffer): - cdef int32_t len_ = buffer.read_varint32() + cdef int32_t len_ = buffer.read_varuint32() 
cdef set instance = set() self.fury.ref_resolver.reference(instance) for i in range(len_): @@ -2039,12 +1648,9 @@ cdef class MapSerializer(Serializer): self.key_serializer = key_serializer self.value_serializer = value_serializer - cpdef inline int16_t get_xtype_id(self): - return FuryType.MAP.value - cpdef inline write(self, Buffer buffer, o): cdef dict value = o - buffer.write_varint32(len(value)) + buffer.write_varuint32(len(value)) cdef ClassInfo key_classinfo cdef ClassInfo value_classinfo cdef int64_t key_addr, value_addr @@ -2060,7 +1666,7 @@ cdef class MapSerializer(Serializer): buffer.write_string(k) else: if not self.ref_resolver.write_ref_or_null(buffer, k): - key_classinfo = self.class_resolver.get_or_create_classinfo(key_cls) + key_classinfo = self.class_resolver.get_classinfo(key_cls) self.class_resolver.write_classinfo(buffer, key_classinfo) key_classinfo.serializer.write(buffer, k) value_cls = type(v) @@ -2079,14 +1685,14 @@ cdef class MapSerializer(Serializer): else: if not self.ref_resolver.write_ref_or_null(buffer, v): value_classinfo = self.class_resolver. 
\ - get_or_create_classinfo(value_cls) + get_classinfo(value_cls) self.class_resolver.write_classinfo(buffer, value_classinfo) value_classinfo.serializer.write(buffer, v) cpdef inline read(self, Buffer buffer): cdef MapRefResolver ref_resolver = self.ref_resolver cdef ClassResolver class_resolver = self.class_resolver - cdef int32_t len_ = buffer.read_varint32() + cdef int32_t len_ = buffer.read_varuint32() cdef dict map_ = _PyDict_NewPresized(len_) ref_resolver.reference(map_) cdef int32_t ref_id @@ -2125,7 +1731,7 @@ cdef class MapSerializer(Serializer): cpdef inline xwrite(self, Buffer buffer, o): cdef dict value = o - buffer.write_varint32(len(value)) + buffer.write_varuint32(len(value)) cdef int64_t key_addr, value_addr cdef Py_ssize_t pos = 0 while PyDict_Next(value, &pos, &key_addr, &value_addr) != 0: @@ -2141,7 +1747,7 @@ cdef class MapSerializer(Serializer): ) cpdef inline xread(self, Buffer buffer): - cdef int32_t len_ = buffer.read_varint32() + cdef int32_t len_ = buffer.read_varuint32() cdef dict map_ = _PyDict_NewPresized(len_) self.fury.ref_resolver.reference(map_) for i in range(len_): @@ -2170,7 +1776,7 @@ cdef class SubMapSerializer(Serializer): self.value_serializer = value_serializer cpdef inline write(self, Buffer buffer, value): - buffer.write_varint32(len(value)) + buffer.write_varuint32(len(value)) cdef ClassInfo key_classinfo cdef ClassInfo value_classinfo for k, v in value.items(): @@ -2180,7 +1786,7 @@ cdef class SubMapSerializer(Serializer): buffer.write_string(k) else: if not self.ref_resolver.write_ref_or_null(buffer, k): - key_classinfo = self.class_resolver.get_or_create_classinfo(key_cls) + key_classinfo = self.class_resolver.get_classinfo(key_cls) self.class_resolver.write_classinfo(buffer, key_classinfo) key_classinfo.serializer.write(buffer, k) value_cls = type(v) @@ -2199,7 +1805,7 @@ cdef class SubMapSerializer(Serializer): else: if not self.ref_resolver.write_ref_or_null(buffer, v): value_classinfo = self.class_resolver. 
\ - get_or_create_classinfo(value_cls) + get_classinfo(value_cls) self.class_resolver.write_classinfo(buffer, value_classinfo) value_classinfo.serializer.write(buffer, v) @@ -2208,7 +1814,7 @@ cdef class SubMapSerializer(Serializer): cdef ClassResolver class_resolver = self.fury.class_resolver map_ = self.type_() ref_resolver.reference(map_) - cdef int32_t len_ = buffer.read_varint32() + cdef int32_t len_ = buffer.read_varuint32() cdef int32_t ref_id cdef ClassInfo key_classinfo cdef ClassInfo value_classinfo @@ -2244,218 +1850,6 @@ cdef class SubMapSerializer(Serializer): return map_ -# Use numpy array or python array module. -typecode_dict = { - # use bytes serializer for byte array. - "h": (2, FuryType.FURY_PRIMITIVE_SHORT_ARRAY.value), - "i": (4, FuryType.FURY_PRIMITIVE_INT_ARRAY.value), - "l": (8, FuryType.FURY_PRIMITIVE_LONG_ARRAY.value), - "f": (4, FuryType.FURY_PRIMITIVE_FLOAT_ARRAY.value), - "d": (8, FuryType.FURY_PRIMITIVE_DOUBLE_ARRAY.value), -} -if np: - typecode_dict = { - k: (itemsize, -type_id) for k, (itemsize, type_id) in typecode_dict.items() - } - - -@cython.final -cdef class PyArraySerializer(CrossLanguageCompatibleSerializer): - typecode_dict = typecode_dict - typecodearray_type = { - "h": Int16ArrayType, - "i": Int32ArrayType, - "l": Int64ArrayType, - "f": Float32ArrayType, - "d": Float64ArrayType, - } - cdef str typecode - cdef int8_t itemsize - cdef int16_t type_id - - def __init__(self, fury, type_, str typecode): - super().__init__(fury, type_) - self.typecode = typecode - self.itemsize, self.type_id = PyArraySerializer.typecode_dict[self.typecode] - - cpdef int16_t get_xtype_id(self): - return self.type_id - - cpdef inline xwrite(self, Buffer buffer, value): - assert value.itemsize == self.itemsize - view = memoryview(value) - assert view.format == self.typecode - assert view.itemsize == self.itemsize - assert view.c_contiguous # TODO handle contiguous - cdef int32_t nbytes = len(value) * self.itemsize - buffer.write_varint32(nbytes) - 
buffer.write_buffer(value) - - cpdef inline xread(self, Buffer buffer): - data = buffer.read_bytes_and_size() - arr = array.array(self.typecode, []) - arr.frombytes(data) - return arr - - cpdef inline write(self, Buffer buffer, value: array.array): - cdef int32_t nbytes = len(value) * value.itemsize - buffer.write_string(value.typecode) - buffer.write_varint32(nbytes) - buffer.write_buffer(value) - - cpdef inline read(self, Buffer buffer): - typecode = buffer.read_string() - data = buffer.read_bytes_and_size() - arr = array.array(typecode, []) - arr.frombytes(data) - return arr - - -if np: - _np_dtypes_dict = { - # use bytes serializer for byte array. - np.dtype(np.bool_): (1, "?", FuryType.FURY_PRIMITIVE_BOOL_ARRAY.value), - np.dtype(np.int16): (2, "h", FuryType.FURY_PRIMITIVE_SHORT_ARRAY.value), - np.dtype(np.int32): (4, "i", FuryType.FURY_PRIMITIVE_INT_ARRAY.value), - np.dtype(np.int64): (8, "l", FuryType.FURY_PRIMITIVE_LONG_ARRAY.value), - np.dtype(np.float32): (4, "f", FuryType.FURY_PRIMITIVE_FLOAT_ARRAY.value), - np.dtype(np.float64): (8, "d", FuryType.FURY_PRIMITIVE_DOUBLE_ARRAY.value), - } -else: - _np_dtypes_dict = {} - - -@cython.final -cdef class Numpy1DArraySerializer(CrossLanguageCompatibleSerializer): - dtypes_dict = _np_dtypes_dict - cdef object dtype - cdef str typecode - cdef int8_t itemsize - cdef int16_t type_id - - def __init__(self, fury, type_, dtype): - super().__init__(fury, type_) - self.dtype = dtype - self.itemsize, self.typecode, self.type_id = _np_dtypes_dict[self.dtype] - - cpdef int16_t get_xtype_id(self): - return self.type_id - - cpdef inline xwrite(self, Buffer buffer, value): - assert value.itemsize == self.itemsize - view = memoryview(value) - try: - assert view.format == self.typecode - except AssertionError as e: - raise e - assert view.itemsize == self.itemsize - cdef int32_t nbytes = len(value) * self.itemsize - buffer.write_varint32(nbytes) - if self.dtype == np.dtype("bool") or not view.c_contiguous: - 
buffer.write_bytes(value.tobytes()) - else: - buffer.write_buffer(value) - - cpdef inline xread(self, Buffer buffer): - data = buffer.read_bytes_and_size() - return np.frombuffer(data, dtype=self.dtype) - - cpdef inline write(self, Buffer buffer, value): - self.fury.handle_unsupported_write(buffer, value) - - cpdef inline read(self, Buffer buffer): - return self.fury.handle_unsupported_read(buffer) - - -cdef _get_hash(Fury fury, list field_names, dict type_hints): - from pyfury._struct import StructHashVisitor - - visitor = StructHashVisitor(fury) - for index, key in enumerate(field_names): - infer_field(key, type_hints[key], visitor, types_path=[]) - hash_ = visitor.get_hash() - assert hash_ != 0 - return hash_ - - -cdef class ComplexObjectSerializer(Serializer): - cdef str _type_tag - cdef object _type_hints - cdef list _field_names - cdef list _serializers - cdef int32_t _hash - - def __init__(self, fury, clz: type, type_tag: str): - super().__init__(fury, clz) - self._type_tag = type_tag - self._type_hints = get_type_hints(clz) - self._field_names = sorted(self._type_hints.keys()) - self._serializers = [None] * len(self._field_names) - from pyfury._struct import ComplexTypeVisitor - - visitor = ComplexTypeVisitor(fury) - for index, key in enumerate(self._field_names): - serializer = infer_field(key, self._type_hints[key], visitor, types_path=[]) - self._serializers[index] = serializer - if self.fury.language == Language.PYTHON: - logger.warning( - "Type of class %s shouldn't be serialized using cross-language " - "serializer", - clz, - ) - self._hash = 0 - - cpdef int16_t get_xtype_id(self): - return FuryType.FURY_TYPE_TAG.value - - cpdef str get_xtype_tag(self): - return self._type_tag - - cpdef write(self, Buffer buffer, value): - return self.xwrite(buffer, value) - - cpdef read(self, Buffer buffer): - return self.xread(buffer) - - cpdef xwrite(self, Buffer buffer, value): - if self._hash == 0: - self._hash = _get_hash(self.fury, self._field_names, 
self._type_hints) - buffer.write_int32(self._hash) - cdef Serializer serializer - cdef int32_t index - for index, field_name in enumerate(self._field_names): - field_value = getattr(value, field_name) - serializer = self._serializers[index] - self.fury.xserialize_ref( - buffer, field_value, serializer=serializer - ) - - cpdef xread(self, Buffer buffer): - cdef int32_t hash_ = buffer.read_int32() - if self._hash == 0: - self._hash = _get_hash(self.fury, self._field_names, self._type_hints) - if hash_ != self._hash: - raise ClassNotCompatibleError( - f"Hash {hash_} is not consistent with {self._hash} " - f"for class {self.type_}", - ) - obj = self.type_.__new__(self.type_) - self.fury.ref_resolver.reference(obj) - cdef Serializer serializer - cdef int32_t index - for index, field_name in enumerate(self._field_names): - serializer = self._serializers[index] - field_value = self.fury.xdeserialize_ref( - buffer, serializer=serializer - ) - setattr( - obj, - field_name, - field_value, - ) - return obj - - @cython.final cdef class EnumSerializer(Serializer): @classmethod @@ -2532,18 +1926,3 @@ cdef class SliceSerializer(Serializer): cpdef xread(self, Buffer buffer): raise NotImplementedError - - -@cython.final -cdef class PickleSerializer(Serializer): - cpdef inline xwrite(self, Buffer buffer, value): - raise NotImplementedError - - cpdef inline xread(self, Buffer buffer): - raise NotImplementedError - - cpdef inline write(self, Buffer buffer, value): - self.fury.handle_unsupported_write(buffer, value) - - cpdef inline read(self, Buffer buffer): - return self.fury.handle_unsupported_read(buffer) diff --git a/python/pyfury/_serializer.py b/python/pyfury/_serializer.py index c6f39b1667..ea31e1291b 100644 --- a/python/pyfury/_serializer.py +++ b/python/pyfury/_serializer.py @@ -15,23 +15,20 @@ # specific language governing permissions and limitations # under the License. 
-import array import datetime import logging from abc import ABC, abstractmethod from typing import Dict, Iterable, Any -from pyfury.buffer import Buffer +from pyfury._fury import ( + NOT_NULL_STRING_FLAG, + NOT_NULL_PYINT_FLAG, + NOT_NULL_PYBOOL_FLAG, +) from pyfury.resolver import NOT_NULL_VALUE_FLAG, NULL_FLAG from pyfury.type import ( - FuryType, + TypeId, is_primitive_type, - # Int8ArrayType, - Int16ArrayType, - Int32ArrayType, - Int64ArrayType, - Float32ArrayType, - Float64ArrayType, ) try: @@ -42,74 +39,6 @@ logger = logging.getLogger(__name__) -NOT_SUPPORT_CROSS_LANGUAGE = 0 -USE_CLASSNAME = 0 -USE_CLASS_ID = 1 -# preserve 0 as flag for class id not set in ClassInfo` -NO_CLASS_ID = 0 -PYINT_CLASS_ID = 1 -PYFLOAT_CLASS_ID = 2 -PYBOOL_CLASS_ID = 3 -STRING_CLASS_ID = 4 -PICKLE_CLASS_ID = 5 -PICKLE_STRONG_CACHE_CLASS_ID = 6 -PICKLE_CACHE_CLASS_ID = 7 -# `NOT_NULL_VALUE_FLAG` + `CLASS_ID << 1` in little-endian order -NOT_NULL_PYINT_FLAG = NOT_NULL_VALUE_FLAG & 0b11111111 | (PYINT_CLASS_ID << 9) -NOT_NULL_PYFLOAT_FLAG = NOT_NULL_VALUE_FLAG & 0b11111111 | (PYFLOAT_CLASS_ID << 9) -NOT_NULL_PYBOOL_FLAG = NOT_NULL_VALUE_FLAG & 0b11111111 | (PYBOOL_CLASS_ID << 9) -NOT_NULL_STRING_FLAG = NOT_NULL_VALUE_FLAG & 0b11111111 | (STRING_CLASS_ID << 9) -SMALL_STRING_THRESHOLD = 16 - - -class _PickleStub: - pass - - -class PickleStrongCacheStub: - pass - - -class PickleCacheStub: - pass - - -class BufferObject(ABC): - """ - Fury binary representation of an object. - Note: This class is used for zero-copy out-of-band serialization and shouldn't - be used for any other cases. 
- """ - - @abstractmethod - def total_bytes(self) -> int: - """total size for serialized bytes of an object""" - - @abstractmethod - def write_to(self, buffer: "Buffer"): - """Write serialized object to a buffer.""" - - @abstractmethod - def to_buffer(self) -> "Buffer": - """Write serialized data as Buffer.""" - - -class BytesBufferObject(BufferObject): - __slots__ = ("binary",) - - def __init__(self, binary: bytes): - self.binary = binary - - def total_bytes(self) -> int: - return len(self.binary) - - def write_to(self, buffer: "Buffer"): - buffer.write_bytes(self.binary) - - def to_buffer(self) -> "Buffer": - return Buffer(self.binary) - - class Serializer(ABC): __slots__ = "fury", "type_", "need_to_write_ref" @@ -118,29 +47,6 @@ def __init__(self, fury, type_: type): self.type_: type = type_ self.need_to_write_ref = not is_primitive_type(type_) - def get_xtype_id(self): - """ - Returns - ------- - Returns NOT_SUPPORT_CROSS_LANGUAGE if the serializer doesn't - support cross-language serialization. - Return a number in range (0, 32767) if the serializer support - cross-language serialization and native serialization data is the - same with cross-language serialization. - Return a negative short in range [-32768, 0) if the serializer - support cross-language serialization and native serialization data - is not the same with cross-language serialization. - """ - return NOT_SUPPORT_CROSS_LANGUAGE - - def get_xtype_tag(self): - """ - Returns - ------- - a type tag used for setup type mapping between languages. 
- """ - raise RuntimeError("Tag is only for struct.") - def write(self, buffer, value): raise NotImplementedError @@ -171,24 +77,7 @@ def xread(self, buffer): return self.read(buffer) -class NoneSerializer(Serializer): - def xwrite(self, buffer, value): - raise NotImplementedError - - def xread(self, buffer): - raise NotImplementedError - - def write(self, buffer, value): - pass - - def read(self, buffer): - return None - - class BooleanSerializer(CrossLanguageCompatibleSerializer): - def get_xtype_id(self): - return FuryType.BOOL.value - def write(self, buffer, value): buffer.write_bool(value) @@ -197,9 +86,6 @@ def read(self, buffer): class ByteSerializer(CrossLanguageCompatibleSerializer): - def get_xtype_id(self): - return FuryType.INT8.value - def write(self, buffer, value): buffer.write_int8(value) @@ -208,9 +94,6 @@ def read(self, buffer): class Int16Serializer(CrossLanguageCompatibleSerializer): - def get_xtype_id(self): - return FuryType.INT16.value - def write(self, buffer, value): buffer.write_int16(value) @@ -219,25 +102,19 @@ def read(self, buffer): class Int32Serializer(CrossLanguageCompatibleSerializer): - def get_xtype_id(self): - return FuryType.INT32.value - def write(self, buffer, value): - buffer.write_int32(value) + buffer.write_varint32(value) def read(self, buffer): - return buffer.read_int32() + return buffer.read_varint32() class Int64Serializer(Serializer): - def get_xtype_id(self): - return FuryType.INT64.value - def xwrite(self, buffer, value): - buffer.write_int64(value) + buffer.write_varint64(value) def xread(self, buffer): - return buffer.read_int64() + return buffer.read_varint64() def write(self, buffer, value): buffer.write_varint64(value) @@ -246,10 +123,19 @@ def read(self, buffer): return buffer.read_varint64() -class FloatSerializer(CrossLanguageCompatibleSerializer): - def get_xtype_id(self): - return FuryType.FLOAT.value +class DynamicIntSerializer(CrossLanguageCompatibleSerializer): + def xwrite(self, buffer, value): + # 
TODO(chaokunyang) check value range and write type and value + buffer.write_varuint32(TypeId.INT64) + buffer.write_varint64(value) + + def xread(self, buffer): + type_id = buffer.read_varuint32() + assert type_id == TypeId.INT64, type_id + return buffer.read_varint64() + +class FloatSerializer(CrossLanguageCompatibleSerializer): def write(self, buffer, value): buffer.write_float(value) @@ -258,9 +144,6 @@ def read(self, buffer): class DoubleSerializer(CrossLanguageCompatibleSerializer): - def get_xtype_id(self): - return FuryType.DOUBLE.value - def write(self, buffer, value): buffer.write_double(value) @@ -268,10 +151,19 @@ def read(self, buffer): return buffer.read_double() -class StringSerializer(CrossLanguageCompatibleSerializer): - def get_xtype_id(self): - return FuryType.STRING.value +class DynamicFloatSerializer(CrossLanguageCompatibleSerializer): + def xwrite(self, buffer, value): + # TODO(chaokunyang) check value range and write type and value + buffer.write_varuint32(TypeId.FLOAT64) + buffer.write_double(value) + def xread(self, buffer): + type_id = buffer.read_varuint32() + assert type_id == TypeId.FLOAT64, type_id + return buffer.read_double() + + +class StringSerializer(CrossLanguageCompatibleSerializer): def write(self, buffer, value: str): buffer.write_string(value) @@ -283,9 +175,6 @@ def read(self, buffer): class DateSerializer(CrossLanguageCompatibleSerializer): - def get_xtype_id(self): - return FuryType.DATE32.value - def write(self, buffer, value: datetime.date): if not isinstance(value, datetime.date): raise TypeError( @@ -302,9 +191,6 @@ def read(self, buffer): class TimestampSerializer(CrossLanguageCompatibleSerializer): - def get_xtype_id(self): - return FuryType.TIMESTAMP.value - def write(self, buffer, value: datetime.datetime): if not isinstance(value, datetime.datetime): raise TypeError( @@ -320,130 +206,6 @@ def read(self, buffer): return datetime.datetime.fromtimestamp(ts) -class BytesSerializer(CrossLanguageCompatibleSerializer): - 
def get_xtype_id(self): - return FuryType.BINARY.value - - def write(self, buffer, value: bytes): - assert isinstance(value, bytes) - self.fury.write_buffer_object(buffer, BytesBufferObject(value)) - - def read(self, buffer): - fury_buf = self.fury.read_buffer_object(buffer) - return fury_buf.to_pybytes() - - -# Use numpy array or python array module. -typecode_dict = { - # use bytes serializer for byte array. - "h": (2, FuryType.FURY_PRIMITIVE_SHORT_ARRAY.value), - "i": (4, FuryType.FURY_PRIMITIVE_INT_ARRAY.value), - "l": (8, FuryType.FURY_PRIMITIVE_LONG_ARRAY.value), - "f": (4, FuryType.FURY_PRIMITIVE_FLOAT_ARRAY.value), - "d": (8, FuryType.FURY_PRIMITIVE_DOUBLE_ARRAY.value), -} -if np: - typecode_dict = { - k: (itemsize, -type_id) for k, (itemsize, type_id) in typecode_dict.items() - } - - -class PyArraySerializer(CrossLanguageCompatibleSerializer): - typecode_dict = typecode_dict - typecodearray_type = { - "h": Int16ArrayType, - "i": Int32ArrayType, - "l": Int64ArrayType, - "f": Float32ArrayType, - "d": Float64ArrayType, - } - - def __init__(self, fury, type_, typecode): - super().__init__(fury, type_) - self.typecode = typecode - self.itemsize, self.type_id = PyArraySerializer.typecode_dict[self.typecode] - - def get_xtype_id(self): - return self.type_id - - def xwrite(self, buffer, value): - assert value.itemsize == self.itemsize - view = memoryview(value) - assert view.format == self.typecode - assert view.itemsize == self.itemsize - assert view.c_contiguous # TODO handle contiguous - nbytes = len(value) * self.itemsize - buffer.write_varint32(nbytes) - buffer.write_buffer(value) - - def xread(self, buffer): - data = buffer.read_bytes_and_size() - arr = array.array(self.typecode, []) - arr.frombytes(data) - return arr - - def write(self, buffer, value: array.array): - nbytes = len(value) * value.itemsize - buffer.write_string(value.typecode) - buffer.write_varint32(nbytes) - buffer.write_buffer(value) - - def read(self, buffer): - typecode = 
buffer.read_string() - data = buffer.read_bytes_and_size() - arr = array.array(typecode, []) - arr.frombytes(data) - return arr - - -if np: - _np_dtypes_dict = { - # use bytes serializer for byte array. - np.dtype(np.bool_): (1, "?", FuryType.FURY_PRIMITIVE_BOOL_ARRAY.value), - np.dtype(np.int16): (2, "h", FuryType.FURY_PRIMITIVE_SHORT_ARRAY.value), - np.dtype(np.int32): (4, "i", FuryType.FURY_PRIMITIVE_INT_ARRAY.value), - np.dtype(np.int64): (8, "l", FuryType.FURY_PRIMITIVE_LONG_ARRAY.value), - np.dtype(np.float32): (4, "f", FuryType.FURY_PRIMITIVE_FLOAT_ARRAY.value), - np.dtype(np.float64): (8, "d", FuryType.FURY_PRIMITIVE_DOUBLE_ARRAY.value), - } -else: - _np_dtypes_dict = {} - - -class Numpy1DArraySerializer(CrossLanguageCompatibleSerializer): - dtypes_dict = _np_dtypes_dict - - def __init__(self, fury, type_, dtype): - super().__init__(fury, type_) - self.dtype = dtype - self.itemsize, self.typecode, self.type_id = _np_dtypes_dict[self.dtype] - - def get_xtype_id(self): - return self.type_id - - def xwrite(self, buffer, value): - assert value.itemsize == self.itemsize - view = memoryview(value) - assert view.format == self.typecode - assert view.itemsize == self.itemsize - nbytes = len(value) * self.itemsize - buffer.write_varint32(nbytes) - if self.dtype == np.dtype("bool") or not view.c_contiguous: - buffer.write_bytes(value.tobytes()) - else: - buffer.write_buffer(value) - - def xread(self, buffer): - data = buffer.read_bytes_and_size() - return np.frombuffer(data, dtype=self.dtype) - - def write(self, buffer, value): - self.fury.handle_unsupported_write(buffer, value) - - def read(self, buffer): - return self.fury.handle_unsupported_read(buffer) - - class CollectionSerializer(Serializer): __slots__ = "class_resolver", "ref_resolver", "elem_serializer" @@ -453,15 +215,12 @@ def __init__(self, fury, type_, elem_serializer=None): self.ref_resolver = fury.ref_resolver self.elem_serializer = elem_serializer - def get_xtype_id(self): - return 
-FuryType.LIST.value - def write(self, buffer, value: Iterable[Any]): - buffer.write_varint32(len(value)) + buffer.write_varuint32(len(value)) for s in value: cls = type(s) if cls is str: - buffer.write_int16(NOT_NULL_STRING_FLAG) + buffer.write_int16() buffer.write_string(s) elif cls is int: buffer.write_int16(NOT_NULL_PYINT_FLAG) @@ -471,12 +230,12 @@ def write(self, buffer, value: Iterable[Any]): buffer.write_bool(s) else: if not self.ref_resolver.write_ref_or_null(buffer, s): - classinfo = self.class_resolver.get_or_create_classinfo(cls) + classinfo = self.class_resolver.get_classinfo(cls) self.class_resolver.write_classinfo(buffer, classinfo) classinfo.serializer.write(buffer, s) def read(self, buffer): - len_ = buffer.read_varint32() + len_ = buffer.read_varuint32() collection_ = self.new_instance(self.type_) for i in range(len_): self.handle_read_elem(self.fury.deserialize_ref(buffer), collection_) @@ -497,13 +256,13 @@ def xwrite(self, buffer, value): except AttributeError: value = list(value) len_ = len(value) - buffer.write_varint32(len_) + buffer.write_varuint32(len_) for s in value: self.fury.xserialize_ref(buffer, s, serializer=self.elem_serializer) len_ += 1 def xread(self, buffer): - len_ = buffer.read_varint32() + len_ = buffer.read_varuint32() collection_ = self.new_instance(self.type_) for i in range(len_): self.handle_read_elem( @@ -514,11 +273,8 @@ def xread(self, buffer): class ListSerializer(CollectionSerializer): - def get_xtype_id(self): - return FuryType.LIST.value - def read(self, buffer): - len_ = buffer.read_varint32() + len_ = buffer.read_varuint32() instance = [] self.fury.ref_resolver.reference(instance) for i in range(len_): @@ -528,7 +284,7 @@ def read(self, buffer): class TupleSerializer(CollectionSerializer): def read(self, buffer): - len_ = buffer.read_varint32() + len_ = buffer.read_varuint32() collection_ = [] for i in range(len_): collection_.append(self.fury.deserialize_ref(buffer)) @@ -539,14 +295,8 @@ class 
StringArraySerializer(ListSerializer): def __init__(self, fury, type_): super().__init__(fury, type_, StringSerializer(fury, str)) - def get_xtype_id(self): - return FuryType.FURY_STRING_ARRAY.value - class SetSerializer(CollectionSerializer): - def get_xtype_id(self): - return FuryType.FURY_SET.value - def new_instance(self, type_): instance = set() self.fury.ref_resolver.reference(instance) @@ -571,11 +321,8 @@ def __init__(self, fury, type_, key_serializer=None, value_serializer=None): self.key_serializer = key_serializer self.value_serializer = value_serializer - def get_xtype_id(self): - return FuryType.MAP.value - def write(self, buffer, value: Dict): - buffer.write_varint32(len(value)) + buffer.write_varuint32(len(value)) for k, v in value.items(): key_cls = type(k) if key_cls is str: @@ -583,7 +330,7 @@ def write(self, buffer, value: Dict): buffer.write_string(k) else: if not self.ref_resolver.write_ref_or_null(buffer, k): - classinfo = self.class_resolver.get_or_create_classinfo(key_cls) + classinfo = self.class_resolver.get_classinfo(key_cls) self.class_resolver.write_classinfo(buffer, classinfo) classinfo.serializer.write(buffer, k) value_cls = type(v) @@ -595,12 +342,12 @@ def write(self, buffer, value: Dict): buffer.write_varint64(v) else: if not self.ref_resolver.write_ref_or_null(buffer, v): - classinfo = self.class_resolver.get_or_create_classinfo(value_cls) + classinfo = self.class_resolver.get_classinfo(value_cls) self.class_resolver.write_classinfo(buffer, classinfo) classinfo.serializer.write(buffer, v) def read(self, buffer): - len_ = buffer.read_varint32() + len_ = buffer.read_varuint32() map_ = self.type_() self.fury.ref_resolver.reference(map_) for i in range(len_): @@ -610,13 +357,13 @@ def read(self, buffer): return map_ def xwrite(self, buffer, value: Dict): - buffer.write_varint32(len(value)) + buffer.write_varuint32(len(value)) for k, v in value.items(): self.fury.xserialize_ref(buffer, k, serializer=self.key_serializer) 
self.fury.xserialize_ref(buffer, v, serializer=self.value_serializer) def xread(self, buffer): - len_ = buffer.read_varint32() + len_ = buffer.read_varuint32() map_ = {} self.fury.ref_resolver.reference(map_) for i in range(len_): @@ -702,47 +449,3 @@ def xwrite(self, buffer, value): def xread(self, buffer): raise NotImplementedError - - -class PickleSerializer(Serializer): - def xwrite(self, buffer, value): - raise NotImplementedError - - def xread(self, buffer): - raise NotImplementedError - - def write(self, buffer, value): - self.fury.handle_unsupported_write(buffer, value) - - def read(self, buffer): - return self.fury.handle_unsupported_read(buffer) - - -class SerializationContext: - """ - A context is used to add some context-related information, so that the - serializers can setup relation between serializing different objects. - The context will be reset after finished serializing/deserializing the - object tree. - """ - - __slots__ = ("objects",) - - def __init__(self): - self.objects = dict() - - def add(self, key, obj): - self.objects[id(key)] = obj - - def __contains__(self, key): - return id(key) in self.objects - - def __getitem__(self, key): - return self.objects[id(key)] - - def get(self, key): - return self.objects.get(id(key)) - - def reset(self): - if len(self.objects) > 0: - self.objects.clear() diff --git a/python/pyfury/_struct.py b/python/pyfury/_struct.py index 0591b3076b..bd00f950ce 100644 --- a/python/pyfury/_struct.py +++ b/python/pyfury/_struct.py @@ -19,7 +19,6 @@ import logging import typing -from pyfury._serializer import NOT_SUPPORT_CROSS_LANGUAGE from pyfury.buffer import Buffer from pyfury.error import ClassNotCompatibleError from pyfury.serializer import ( @@ -31,7 +30,7 @@ from pyfury.type import ( TypeVisitor, infer_field, - FuryType, + TypeId, Int8Type, Int16Type, Int32Type, @@ -40,7 +39,6 @@ Float64Type, is_py_array_type, compute_string_hash, - qualified_class_name, ) logger = logging.getLogger(__name__) @@ -103,9 +101,8 @@ 
def _get_hash(fury, field_names: list, type_hints: dict): class ComplexObjectSerializer(Serializer): - def __init__(self, fury, clz: type, type_tag: str): + def __init__(self, fury, clz): super().__init__(fury, clz) - self._type_tag = type_tag self._type_hints = typing.get_type_hints(clz) self._field_names = sorted(self._type_hints.keys()) self._serializers = [None] * len(self._field_names) @@ -123,12 +120,6 @@ def __init__(self, fury, clz: type, type_tag: str): ) self._hash = 0 - def get_xtype_id(self): - return FuryType.FURY_TYPE_TAG.value - - def get_xtype_tag(self): - return self._type_tag - def write(self, buffer, value): return self.xwrite(buffer, value) @@ -176,31 +167,32 @@ def __init__( def visit_list(self, field_name, elem_type, types_path=None): # TODO add list element type to hash. - id_ = abs(ListSerializer(self.fury, list).get_xtype_id()) - self._hash = self._compute_field_hash(self._hash, id_) + xtype_id = self.fury.class_resolver.get_classinfo(list).type_id + self._hash = self._compute_field_hash(self._hash, abs(xtype_id)) def visit_dict(self, field_name, key_type, value_type, types_path=None): # TODO add map key/value type to hash. 
- id_ = abs(MapSerializer(self.fury, dict).get_xtype_id()) - self._hash = self._compute_field_hash(self._hash, id_) + xtype_id = self.fury.class_resolver.get_classinfo(dict).type_id + self._hash = self._compute_field_hash(self._hash, abs(xtype_id)) def visit_customized(self, field_name, type_, types_path=None): - serializer = self.fury.class_resolver.get_serializer(type_) - if serializer.get_xtype_id() != NOT_SUPPORT_CROSS_LANGUAGE: - tag = serializer.get_xtype_tag() - else: - tag = qualified_class_name(type_) - tag_hash = compute_string_hash(tag) - self._hash = self._compute_field_hash(self._hash, tag_hash) + classinfo = self.fury.class_resolver.get_classinfo(type_, create=False) + if classinfo is None: + return + hash_value = classinfo.type_id + if TypeId.is_namespaced_type(classinfo.type_id): + hash_value = compute_string_hash(classinfo.namespace + classinfo.typename) + self._hash = self._compute_field_hash(self._hash, hash_value) def visit_other(self, field_name, type_, types_path=None): - if type_ not in basic_types and not is_py_array_type(type_): - # FIXME ignore unknown types for hash calculation - return None - serializer = self.fury.class_resolver.get_serializer(type_) - assert not isinstance(serializer, (PickleSerializer,)) - id_ = serializer.get_xtype_id() - assert id_ is not None, serializer + classinfo = self.fury.class_resolver.get_classinfo(type_, create=False) + if classinfo is None: + id_ = 0 + else: + serializer = classinfo.serializer + assert not isinstance(serializer, (PickleSerializer,)) + id_ = classinfo.type_id + assert id_ is not None, serializer id_ = abs(id_) self._hash = self._compute_field_hash(self._hash, id_) diff --git a/python/pyfury/_util.pxd b/python/pyfury/_util.pxd index 77f76aceb8..360fe4354b 100644 --- a/python/pyfury/_util.pxd +++ b/python/pyfury/_util.pxd @@ -124,20 +124,22 @@ cdef class Buffer: cpdef inline double read_double(self) - cpdef inline write_flagged_varint32(self, c_bool flag, int32_t v) + cpdef inline 
write_varint64(self, int64_t v) - cpdef inline c_bool read_varint32_flag(self) + cpdef inline write_varuint64(self, int64_t v) - cpdef inline int32_t read_flagged_varint(self) + cpdef inline int64_t read_varint64(self) - cpdef inline write_varint64(self, int64_t v) + cpdef inline int64_t read_varuint64(self) - cpdef inline int64_t read_varint64(self) + cpdef inline write_varuint32(self, int32_t value) cpdef inline write_varint32(self, int32_t value) cpdef inline int32_t read_varint32(self) + cpdef inline int32_t read_varuint32(self) + cpdef put_buffer(self, uint32_t offset, v, int32_t src_index, int32_t length) cdef inline write_c_buffer(self, const uint8_t* value, int32_t length) diff --git a/python/pyfury/_util.pyx b/python/pyfury/_util.pyx index b758705613..9baaca5c04 100644 --- a/python/pyfury/_util.pyx +++ b/python/pyfury/_util.pyx @@ -26,7 +26,7 @@ from libcpp.memory cimport shared_ptr, make_shared from libc.stdint cimport * from libcpp cimport bool as c_bool from pyfury.includes.libutil cimport( - CBuffer, AllocateBuffer, GetBit, SetBit, ClearBit, SetBitTo + CBuffer, AllocateBuffer, GetBit, SetBit, ClearBit, SetBitTo, CStatus, StatusCode ) cdef int32_t max_buffer_size = 2 ** 31 - 1 @@ -212,14 +212,14 @@ cdef class Buffer: cpdef inline write_bytes_and_size(self, bytes value): cdef const unsigned char[:] data = value cdef int32_t length = data.nbytes - self.write_varint32(length) + self.write_varuint32(length) if length > 0: self.grow(length) self.c_buffer.get().CopyFrom(self.writer_index, &data[0], 0, length) self.writer_index += length cpdef inline bytes read_bytes_and_size(self): - cdef int32_t length = self.read_varint32() + cdef int32_t length = self.read_varuint32() value = self.get_bytes(self.reader_index, length) self.reader_index += length return value @@ -238,16 +238,10 @@ cdef class Buffer: return value cpdef inline int64_t read_bytes_as_int64(self, int32_t length): - cdef int32_t size_ = self.c_buffer.get().size() - cdef int64_t result - cdef 
int32_t i - # if offset + length > size_: - if size_- (self.reader_index + 8) > 0: - result = self.get_int64(self.reader_index) - result = result & (0xffffffffffffffffL >> ((8 - length) * 8)) - else: - for i in range(length): - result = result | ((self.read_int8()) & 0xff) << (i * 8) + cdef int64_t result = 0 + cdef CStatus status = self.c_buffer.get().GetBytesAsInt64(self.reader_index, length, &result) + if status.code() != StatusCode.OK: + raise ValueError(status.message()) self.reader_index += length return result @@ -359,6 +353,9 @@ cdef class Buffer: return data cpdef inline write_varint32(self, int32_t value): + return self.write_varuint32((value << 1) ^ (value >> 31)) + + cpdef inline write_varuint32(self, int32_t value): self.grow(5) cdef int32_t actual_bytes_written = self.c_buffer.get()\ .PutVarUint32(self.writer_index, value) @@ -366,6 +363,10 @@ cdef class Buffer: return actual_bytes_written cpdef inline int32_t read_varint32(self): + cdef uint32_t v = self.read_varuint32() + return (v >> 1) ^ -(v & 1) + + cpdef inline int32_t read_varuint32(self): cdef: uint32_t read_length = 0 int8_t b @@ -392,98 +393,10 @@ cdef class Buffer: result |= (b & 0x7F) << 28 return result - cpdef inline write_flagged_varint32(self, c_bool flag, int32_t v): - self.grow(5) - cdef: - int32_t value = v - int32_t offset = self.writer_index - int8_t first = (value & 0x3F) - uint8_t* arr = self.c_buffer.get().data() - if flag: - # Mask first 6 bits, bit 8 is the flag. - first = first | 0x80 - if value >> 6 == 0: - arr[offset] = first - self.writer_index += 1 - return 1 - if value >> 13 == 0: - arr[offset] = first | 0x40 # Set bit 7. - arr[offset + 1] = (value >> 6) - self.writer_index += 2 - return 2 - if value >> 20 == 0: - arr[offset] = first | 0x40 # Set bit 7. - arr[offset + 1] = (value >> 6 | 0x80) - arr[offset + 2] = (value >> 13) - self.writer_index += 3 - return 3 - if value >> 27 == 0: - arr[offset] = first | 0x40 # Set bit 7. 
- arr[offset + 1] = (value >> 6 | 0x80) - arr[offset + 2] = (value >> 13 | 0x80) - arr[offset + 3] = (value >> 20) - self.writer_index += 4 - return 4 - arr[offset] = first | 0x40 # Set bit 7. - arr[offset + 1] = (value >> 6 | 0x80) - arr[offset + 2] = (value >> 13 | 0x80) - arr[offset + 3] = (value >> 20 | 0x80) - arr[offset + 4] = (value >> 27) - self.writer_index += 5 - return 5 - - cpdef inline c_bool read_varint32_flag(self): - cdef int32_t offset = self.reader_index - self.check_bound(offset, 1) - cdef int8_t head = ((self._c_address + offset))[0] - return (head & 0x80) != 0 - - cpdef inline int32_t read_flagged_varint(self): - cdef: - uint32_t read_bytes_length = 1 - int32_t b - int32_t result - uint32_t position = self.reader_index - int8_t * arr = (self.c_buffer.get().data() + position) - if self._c_size - self.reader_index > 5: - b = arr[0] - result = b & 0x3F # Mask first 6 bits. - if (b & 0x40) != 0: # Bit 7 means another byte, bit 8 is flag bit. - read_bytes_length += 1 - b = arr[1] - result |= (b & 0x7F) << 6 - if (b & 0x80) != 0: - read_bytes_length += 1 - b = arr[2] - result |= (b & 0x7F) << 13 - if (b & 0x80) != 0: - read_bytes_length += 1 - b = arr[3] - result |= (b & 0x7F) << 20 - if (b & 0x80) != 0: - read_bytes_length += 1 - b = arr[4] - result |= b << 27 - self.reader_index += read_bytes_length - return result - else: - b = self.read_int8() - result = b & 0x3F # Mask first 6 bits. 
- if (b & 0x40) != 0: - b = self.read_int8() - result |= (b & 0x7F) << 6 - if (b & 0x80) != 0: - b = self.read_int8() - result |= (b & 0x7F) << 13 - if (b & 0x80) != 0: - b = self.read_int8() - result |= (b & 0x7F) << 20 - if (b & 0x80) != 0: - b = self.read_int8() - result |= b << 27 - return result + cpdef inline write_varint64(self, int64_t value): + return self.write_varuint64((value << 1) ^ (value >> 63)) - cpdef inline write_varint64(self, int64_t v): + cpdef inline write_varuint64(self, int64_t v): cdef: uint64_t value = v int64_t offset = self.writer_index @@ -534,6 +447,10 @@ cdef class Buffer: return 9 cpdef inline int64_t read_varint64(self): + cdef uint64_t v = self.read_varuint64() + return ((v >> 1) ^ -(v & 1)) + + cpdef inline int64_t read_varuint64(self): cdef: uint32_t read_length = 1 int64_t b @@ -609,7 +526,7 @@ cdef class Buffer: return result cdef inline write_c_buffer(self, const uint8_t* value, int32_t length): - self.write_varint32(length) + self.write_varuint32(length) if length <= 0: # access an emtpy buffer may raise out-of-bound exception. 
return self.grow(length) @@ -618,7 +535,7 @@ cdef class Buffer: self.writer_index += length cdef inline int32_t read_c_buffer(self, uint8_t** buf): - cdef int32_t length = self.read_varint32() + cdef int32_t length = self.read_varuint32() cdef uint8_t* binary_data = self.c_buffer.get().data() self.check_bound(self.reader_index, length) buf[0] = binary_data + self.reader_index diff --git a/python/pyfury/error.py b/python/pyfury/error.py index 91088d4815..f2e7c7a67b 100644 --- a/python/pyfury/error.py +++ b/python/pyfury/error.py @@ -24,5 +24,9 @@ class ClassNotCompatibleError(FuryError): pass +class TypeUnregisteredError(FuryError): + pass + + class CompileError(FuryError): pass diff --git a/python/pyfury/format/serializer.py b/python/pyfury/format/serializer.py index 1235e70c93..f805c4045d 100644 --- a/python/pyfury/format/serializer.py +++ b/python/pyfury/format/serializer.py @@ -18,7 +18,6 @@ import pyarrow as pa from pyfury.serializer import CrossLanguageCompatibleSerializer, BufferObject from pyfury.buffer import Buffer -from pyfury.type import FuryType class ArrowRecordBatchSerializer(CrossLanguageCompatibleSerializer): @@ -35,9 +34,6 @@ def read(self, buffer: Buffer) -> pa.Table: [batch] = [batch for batch in reader] return batch - def get_xtype_id(self): - return FuryType.FURY_ARROW_RECORD_BATCH.value - class ArrowRecordBatchBufferObject(BufferObject): def __init__(self, batch: pa.RecordBatch): @@ -81,9 +77,6 @@ def read(self, buffer: Buffer) -> pa.Table: batches = [batch for batch in reader] return pa.Table.from_batches(batches) - def get_xtype_id(self): - return FuryType.FURY_ARROW_TABLE.value - class ArrowTableBufferObject(BufferObject): def __init__(self, table: pa.Table): diff --git a/python/pyfury/includes/libserialization.pxd b/python/pyfury/includes/libserialization.pxd new file mode 100644 index 0000000000..c5f3163123 --- /dev/null +++ b/python/pyfury/includes/libserialization.pxd @@ -0,0 +1,71 @@ +# Licensed to the Apache Software Foundation (ASF) 
under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from libc.stdint cimport int32_t +from libcpp cimport bool as c_bool + +cdef extern from "fury/type/type.h" namespace "fury" nogil: + + # Declare the C++ TypeId enum + cdef enum class TypeId(int32_t): + BOOL = 1 + INT8 = 2 + INT16 = 3 + INT32 = 4 + VAR_INT32 = 5 + INT64 = 6 + VAR_INT64 = 7 + SLI_INT64 = 8 + FLOAT16 = 9 + FLOAT32 = 10 + FLOAT64 = 11 + STRING = 12 + ENUM = 13 + NAMED_ENUM = 14 + STRUCT = 15 + POLYMORPHIC_STRUCT = 16 + COMPATIBLE_STRUCT = 17 + POLYMORPHIC_COMPATIBLE_STRUCT = 18 + NAMED_STRUCT = 19 + NAMED_POLYMORPHIC_STRUCT = 20 + NAMED_COMPATIBLE_STRUCT = 21 + NAMED_POLYMORPHIC_COMPATIBLE_STRUCT = 22 + EXT = 23 + POLYMORPHIC_EXT = 24 + NAMED_EXT = 25 + NAMED_POLYMORPHIC_EXT = 26 + LIST = 27 + SET = 28 + MAP = 29 + DURATION = 30 + TIMESTAMP = 31 + LOCAL_DATE = 32 + DECIMAL = 33 + BINARY = 34 + ARRAY = 35 + BOOL_ARRAY = 36 + INT8_ARRAY = 37 + INT16_ARRAY = 38 + INT32_ARRAY = 39 + INT64_ARRAY = 40 + FLOAT16_ARRAY = 41 + FLOAT32_ARRAY = 42 + FLOAT64_ARRAY = 43 + ARROW_RECORD_BATCH = 44 + ARROW_TABLE = 45 + + cdef c_bool IsNamespacedType(int32_t type_id) diff --git a/python/pyfury/includes/libutil.pxd b/python/pyfury/includes/libutil.pxd index 8220cb2225..49f4c71521 100644 --- a/python/pyfury/includes/libutil.pxd +++ 
b/python/pyfury/includes/libutil.pxd @@ -21,6 +21,25 @@ from libcpp.memory cimport shared_ptr from libcpp.string cimport string as c_string cdef extern from "fury/util/buffer.h" namespace "fury" nogil: + cdef cppclass CStatus" fury::Status": + c_string ToString() const + + c_string CodeAsString() const + + c_string message() const + + StatusCode code() const + + cdef enum class StatusCode(char): + OK = 0, + OutOfMemory = 1, + OutOfBound = 2, + KeyError = 3, + TypeError = 4, + Invalid = 5, + IOError = 6, + UnknownError = 7 + cdef cppclass CBuffer" fury::Buffer": CBuffer(uint8_t* data, uint32_t size, c_bool own_data=True) @@ -63,6 +82,8 @@ cdef extern from "fury/util/buffer.h" namespace "fury" nogil: inline double GetDouble(uint32_t offset) + inline CStatus GetBytesAsInt64(uint32_t offset, uint32_t length, int64_t* target) + inline uint32_t PutVarUint32(uint32_t offset, int32_t value) inline int32_t GetVarUint32(uint32_t offset, uint32_t *readBytesLength) diff --git a/python/pyfury/meta/metastring.py b/python/pyfury/meta/metastring.py index 4ff065105b..c1f73585ad 100644 --- a/python/pyfury/meta/metastring.py +++ b/python/pyfury/meta/metastring.py @@ -541,7 +541,7 @@ def _encode_generic(self, chars: List[str], bits_per_char: int) -> bytes: strip_last_char = len(bytes_array) * 8 >= total_bits + bits_per_char if strip_last_char: bytes_array[0] = bytes_array[0] | 0x80 - return bytes_array + return bytes(bytes_array) def _char_to_value(self, c: str, bits_per_char: int) -> int: """ diff --git a/python/pyfury/resolver.py b/python/pyfury/resolver.py index 3b86493e6a..4ba360bcc7 100644 --- a/python/pyfury/resolver.py +++ b/python/pyfury/resolver.py @@ -147,7 +147,7 @@ def write_ref_or_null(self, buffer, obj): # The obj has been written previously. 
if written_id is not None: buffer.write_int8(REF_FLAG) - buffer.write_varint32(written_id[0]) + buffer.write_varuint32(written_id[0]) return True else: written_id = len(self.written_objects) diff --git a/python/pyfury/serializer.py b/python/pyfury/serializer.py index b348be83d0..878c456f63 100644 --- a/python/pyfury/serializer.py +++ b/python/pyfury/serializer.py @@ -15,12 +15,12 @@ # specific language governing permissions and limitations # under the License. +import array import itertools import os import pickle -from weakref import WeakValueDictionary - import typing +from weakref import WeakValueDictionary import pyfury.lib.mmh3 from pyfury.buffer import Buffer @@ -38,77 +38,105 @@ except ImportError: np = None -from pyfury._serializer import ( # noqa: F401 # pylint: disable=unused-import - BufferObject, - BytesBufferObject, - Serializer, - CrossLanguageCompatibleSerializer, - BooleanSerializer, - ByteSerializer, - Int16Serializer, - Int32Serializer, - Int64Serializer, - FloatSerializer, - DoubleSerializer, - StringSerializer, - DateSerializer, - TimestampSerializer, - BytesSerializer, - PyArraySerializer, - Numpy1DArraySerializer, - CollectionSerializer, - ListSerializer, - TupleSerializer, - StringArraySerializer, - SetSerializer, - MapSerializer, - SubMapSerializer, - EnumSerializer, - SliceSerializer, - PickleSerializer, +from pyfury._fury import ( NOT_NULL_PYINT_FLAG, - PickleStrongCacheStub, - PickleCacheStub, - PICKLE_STRONG_CACHE_CLASS_ID, - PICKLE_CACHE_CLASS_ID, + BufferObject, ) -try: - from pyfury._serialization import ENABLE_FURY_CYTHON_SERIALIZATION - - if ENABLE_FURY_CYTHON_SERIALIZATION: - from pyfury._serialization import ( # noqa: F401, F811 - BufferObject, - BytesBufferObject, - Serializer, - CrossLanguageCompatibleSerializer, - BooleanSerializer, - ByteSerializer, - Int16Serializer, - Int32Serializer, - Int64Serializer, - FloatSerializer, - DoubleSerializer, - StringSerializer, - DateSerializer, - TimestampSerializer, - BytesSerializer, - 
PyArraySerializer, - Numpy1DArraySerializer, - CollectionSerializer, - ListSerializer, - TupleSerializer, - StringArraySerializer, - SetSerializer, - MapSerializer, - SubMapSerializer, - EnumSerializer, - SliceSerializer, - PickleSerializer, - PickleStrongCacheStub, - PickleCacheStub, - ) -except ImportError: +from pyfury._serialization import ENABLE_FURY_CYTHON_SERIALIZATION + +if ENABLE_FURY_CYTHON_SERIALIZATION: + from pyfury._serialization import ( # noqa: F401, F811 + Serializer, + CrossLanguageCompatibleSerializer, + BooleanSerializer, + ByteSerializer, + Int16Serializer, + Int32Serializer, + Int64Serializer, + DynamicIntSerializer, + FloatSerializer, + DoubleSerializer, + DynamicFloatSerializer, + StringSerializer, + DateSerializer, + TimestampSerializer, + CollectionSerializer, + ListSerializer, + TupleSerializer, + StringArraySerializer, + SetSerializer, + MapSerializer, + SubMapSerializer, + EnumSerializer, + SliceSerializer, + ) +else: + from pyfury._serializer import ( # noqa: F401 # pylint: disable=unused-import + Serializer, + CrossLanguageCompatibleSerializer, + BooleanSerializer, + ByteSerializer, + Int16Serializer, + Int32Serializer, + Int64Serializer, + FloatSerializer, + DoubleSerializer, + StringSerializer, + DateSerializer, + TimestampSerializer, + CollectionSerializer, + ListSerializer, + TupleSerializer, + StringArraySerializer, + SetSerializer, + MapSerializer, + SubMapSerializer, + EnumSerializer, + SliceSerializer, + DynamicIntSerializer, + DynamicFloatSerializer, + ) + +from pyfury.type import ( + Int16ArrayType, + Int32ArrayType, + Int64ArrayType, + Float32ArrayType, + Float64ArrayType, + BoolNDArrayType, + Int16NDArrayType, + Int32NDArrayType, + Int64NDArrayType, + Float32NDArrayType, + Float64NDArrayType, + TypeId, +) + + +class NoneSerializer(Serializer): + def xwrite(self, buffer, value): + raise NotImplementedError + + def xread(self, buffer): + raise NotImplementedError + + def write(self, buffer, value): + pass + + def read(self, 
buffer): + return None + + +class _PickleStub: + pass + + +class PickleStrongCacheStub: + pass + + +class PickleCacheStub: pass @@ -141,19 +169,6 @@ def xwrite(self, buffer, value): def xread(self, buffer): raise NotImplementedError - @staticmethod - def new_classinfo(fury, clear_threshold: int = 1000): - from pyfury import ClassInfo - - return ClassInfo( - PickleStrongCacheStub, - PICKLE_STRONG_CACHE_CLASS_ID, - serializer=PickleStrongCacheSerializer( - fury, clear_threshold=clear_threshold - ), - class_name_bytes=b"PickleStrongCacheStub", - ) - class PickleCacheSerializer(Serializer): __slots__ = "_cached", "_reverse_cached" @@ -190,17 +205,6 @@ def xwrite(self, buffer, value): def xread(self, buffer): raise NotImplementedError - @staticmethod - def new_classinfo(fury): - from pyfury import ClassInfo - - return ClassInfo( - PickleCacheStub, - PICKLE_CACHE_CLASS_ID, - serializer=PickleCacheSerializer(fury), - class_name_bytes=b"PickleCacheStub", - ) - class PandasRangeIndexSerializer(Serializer): __slots__ = "_cached" @@ -396,3 +400,205 @@ def xwrite(self, buffer: Buffer, value): def xread(self, buffer): raise NotImplementedError + + +# Use numpy array or python array module. +typecode_dict = { + # use bytes serializer for byte array. 
+ "h": (2, Int16ArrayType, TypeId.INT16_ARRAY), + "i": (4, Int32ArrayType, TypeId.INT32_ARRAY), + "l": (8, Int64ArrayType, TypeId.INT64_ARRAY), + "f": (4, Float32ArrayType, TypeId.FLOAT32_ARRAY), + "d": (8, Float64ArrayType, TypeId.FLOAT64_ARRAY), +} + +typeid_code = { + TypeId.INT16_ARRAY: "h", + TypeId.INT32_ARRAY: "i", + TypeId.INT64_ARRAY: "l", + TypeId.FLOAT32_ARRAY: "f", + TypeId.FLOAT64_ARRAY: "d", +} + + +class PyArraySerializer(CrossLanguageCompatibleSerializer): + typecode_dict = typecode_dict + typecodearray_type = { + "h": Int16ArrayType, + "i": Int32ArrayType, + "l": Int64ArrayType, + "f": Float32ArrayType, + "d": Float64ArrayType, + } + + def __init__(self, fury, ftype, type_id: str): + super().__init__(fury, ftype) + self.typecode = typeid_code[type_id] + self.itemsize, ftype, self.type_id = typecode_dict[self.typecode] + + def xwrite(self, buffer, value): + assert value.itemsize == self.itemsize + view = memoryview(value) + assert view.format == self.typecode + assert view.itemsize == self.itemsize + assert view.c_contiguous # TODO handle contiguous + nbytes = len(value) * self.itemsize + buffer.write_varuint32(nbytes) + buffer.write_buffer(value) + + def xread(self, buffer): + data = buffer.read_bytes_and_size() + arr = array.array(self.typecode, []) + arr.frombytes(data) + return arr + + def write(self, buffer, value: array.array): + nbytes = len(value) * value.itemsize + buffer.write_string(value.typecode) + buffer.write_varuint32(nbytes) + buffer.write_buffer(value) + + def read(self, buffer): + typecode = buffer.read_string() + data = buffer.read_bytes_and_size() + arr = array.array(typecode, []) + arr.frombytes(data) + return arr + + +class DynamicPyArraySerializer(Serializer): + def xwrite(self, buffer, value): + itemsize, ftype, type_id = typecode_dict[value.typecode] + view = memoryview(value) + nbytes = len(value) * itemsize + buffer.write_varuint32(type_id) + buffer.write_varuint32(nbytes) + if not view.c_contiguous: + 
buffer.write_bytes(value.tobytes()) + else: + buffer.write_buffer(value) + + def xread(self, buffer): + type_id = buffer.read_varint32() + typecode = typeid_code[type_id] + data = buffer.read_bytes_and_size() + arr = array.array(typecode, []) + arr.frombytes(data) + return arr + + def write(self, buffer, value): + self.fury.handle_unsupported_write(buffer, value) + + def read(self, buffer): + return self.fury.handle_unsupported_read(buffer) + + +if np: + _np_dtypes_dict = { + # use bytes serializer for byte array. + np.dtype(np.bool_): (1, "?", BoolNDArrayType, TypeId.BOOL_ARRAY), + np.dtype(np.int16): (2, "h", Int16NDArrayType, TypeId.INT16_ARRAY), + np.dtype(np.int32): (4, "i", Int32NDArrayType, TypeId.INT32_ARRAY), + np.dtype(np.int64): (8, "l", Int64NDArrayType, TypeId.INT64_ARRAY), + np.dtype(np.float32): (4, "f", Float32NDArrayType, TypeId.FLOAT32_ARRAY), + np.dtype(np.float64): (8, "d", Float64NDArrayType, TypeId.FLOAT64_ARRAY), + } +else: + _np_dtypes_dict = {} + + +class Numpy1DArraySerializer(Serializer): + dtypes_dict = _np_dtypes_dict + + def __init__(self, fury, ftype, dtype): + super().__init__(fury, ftype) + self.dtype = dtype + self.itemsize, self.format, self.typecode, self.type_id = _np_dtypes_dict[ + self.dtype + ] + + def xwrite(self, buffer, value): + assert value.itemsize == self.itemsize + view = memoryview(value) + try: + assert view.format == self.typecode + except AssertionError as e: + raise e + assert view.itemsize == self.itemsize + nbytes = len(value) * self.itemsize + buffer.write_varuint32(nbytes) + if self.dtype == np.dtype("bool") or not view.c_contiguous: + buffer.write_bytes(value.tobytes()) + else: + buffer.write_buffer(value) + + def xread(self, buffer): + data = buffer.read_bytes_and_size() + return np.frombuffer(data, dtype=self.dtype) + + def write(self, buffer, value): + self.fury.handle_unsupported_write(buffer, value) + + def read(self, buffer): + return self.fury.handle_unsupported_read(buffer) + + +class 
NDArraySerializer(Serializer): + def xwrite(self, buffer, value): + itemsize, typecode, ftype, type_id = _np_dtypes_dict[value.dtype] + view = memoryview(value) + nbytes = len(value) * itemsize + buffer.write_varuint32(type_id) + buffer.write_varuint32(nbytes) + if value.dtype == np.dtype("bool") or not view.c_contiguous: + buffer.write_bytes(value.tobytes()) + else: + buffer.write_buffer(value) + + def xread(self, buffer): + raise NotImplementedError("Multi-dimensional array not supported currently") + + def write(self, buffer, value): + self.fury.handle_unsupported_write(buffer, value) + + def read(self, buffer): + return self.fury.handle_unsupported_read(buffer) + + +class BytesSerializer(CrossLanguageCompatibleSerializer): + def write(self, buffer, value): + self.fury.write_buffer_object(buffer, BytesBufferObject(value)) + + def read(self, buffer): + fury_buf = self.fury.read_buffer_object(buffer) + return fury_buf.to_pybytes() + + +class BytesBufferObject(BufferObject): + __slots__ = ("binary",) + + def __init__(self, binary: bytes): + self.binary = binary + + def total_bytes(self) -> int: + return len(self.binary) + + def write_to(self, buffer: "Buffer"): + buffer.write_bytes(self.binary) + + def to_buffer(self) -> "Buffer": + return Buffer(self.binary) + + +class PickleSerializer(Serializer): + def xwrite(self, buffer, value): + raise NotImplementedError + + def xread(self, buffer): + raise NotImplementedError + + def write(self, buffer, value): + self.fury.handle_unsupported_write(buffer, value) + + def read(self, buffer): + return self.fury.handle_unsupported_read(buffer) diff --git a/python/pyfury/tests/test_buffer.py b/python/pyfury/tests/test_buffer.py index 5fe4d7e0d0..f2145fd440 100644 --- a/python/pyfury/tests/test_buffer.py +++ b/python/pyfury/tests/test_buffer.py @@ -92,20 +92,20 @@ def test_empty_buffer(): def test_write_varint32(): buf = Buffer.allocate(32) - for i in range(32): + for i in range(1): for j in range(i): buf.write_int8(1) 
buf.read_int8() - check_positive_varint32(buf, 1, 1) - check_positive_varint32(buf, 1 << 6, 1) - check_positive_varint32(buf, 1 << 7, 2) - check_positive_varint32(buf, 1 << 13, 2) - check_positive_varint32(buf, 1 << 14, 3) - check_positive_varint32(buf, 1 << 20, 3) - check_positive_varint32(buf, 1 << 21, 4) - check_positive_varint32(buf, 1 << 27, 4) - check_positive_varint32(buf, 1 << 28, 5) - check_positive_varint32(buf, 1 << 30, 5) + check_varuint32(buf, 1, 1) + check_varuint32(buf, 1 << 6, 1) + check_varuint32(buf, 1 << 7, 2) + check_varuint32(buf, 1 << 13, 2) + check_varuint32(buf, 1 << 14, 3) + check_varuint32(buf, 1 << 20, 3) + check_varuint32(buf, 1 << 21, 4) + check_varuint32(buf, 1 << 27, 4) + check_varuint32(buf, 1 << 28, 5) + check_varuint32(buf, 1 << 30, 5) check_varint32(buf, -1) check_varint32(buf, -1 << 6) @@ -119,11 +119,11 @@ def test_write_varint32(): check_varint32(buf, -1 << 30) -def check_positive_varint32(buf: Buffer, value: int, bytes_written: int): +def check_varuint32(buf: Buffer, value: int, bytes_written: int): assert buf.writer_index == buf.reader_index - actual_bytes_written = buf.write_varint32(value) + actual_bytes_written = buf.write_varuint32(value) assert actual_bytes_written == bytes_written - varint = buf.read_varint32() + varint = buf.read_varuint32() assert buf.writer_index == buf.reader_index assert value == varint @@ -155,70 +155,70 @@ def test_grow(): assert buffer.own_data() -def test_write_varint64(): +def test_write_varuint64(): buf = Buffer.allocate(32) - check_varint64(buf, -1, 9) + check_varuint64(buf, -1, 9) for i in range(32): for j in range(i): buf.write_int8(1) buf.read_int8() - check_varint64(buf, -1, 9) - check_varint64(buf, 1, 1) - check_varint64(buf, 1 << 6, 1) - check_varint64(buf, 1 << 7, 2) - check_varint64(buf, -(2**6), 9) - check_varint64(buf, -(2**7), 9) - check_varint64(buf, 1 << 13, 2) - check_varint64(buf, 1 << 14, 3) - check_varint64(buf, -(2**13), 9) - check_varint64(buf, -(2**14), 9) - 
check_varint64(buf, 1 << 20, 3) - check_varint64(buf, 1 << 21, 4) - check_varint64(buf, -(2**20), 9) - check_varint64(buf, -(2**21), 9) - check_varint64(buf, 1 << 27, 4) - check_varint64(buf, 1 << 28, 5) - check_varint64(buf, -(2**27), 9) - check_varint64(buf, -(2**28), 9) - check_varint64(buf, 1 << 30, 5) - check_varint64(buf, -(2**30), 9) - check_varint64(buf, 1 << 31, 5) - check_varint64(buf, -(2**31), 9) - check_varint64(buf, 1 << 32, 5) - check_varint64(buf, -(2**32), 9) - check_varint64(buf, 1 << 34, 5) - check_varint64(buf, -(2**34), 9) - check_varint64(buf, 1 << 35, 6) - check_varint64(buf, -(2**35), 9) - check_varint64(buf, 1 << 41, 6) - check_varint64(buf, -(2**41), 9) - check_varint64(buf, 1 << 42, 7) - check_varint64(buf, -(2**42), 9) - check_varint64(buf, 1 << 48, 7) - check_varint64(buf, -(2**48), 9) - check_varint64(buf, 1 << 49, 8) - check_varint64(buf, -(2**49), 9) - check_varint64(buf, 1 << 55, 8) - check_varint64(buf, -(2**55), 9) - check_varint64(buf, 1 << 56, 9) - check_varint64(buf, -(2**56), 9) - check_varint64(buf, 1 << 62, 9) - check_varint64(buf, -(2**62), 9) - check_varint64(buf, 1 << 63 - 1, 9) - check_varint64(buf, -(2**63), 9) - - -def check_varint64(buf: Buffer, value: int, bytes_written: int): + check_varuint64(buf, -1, 9) + check_varuint64(buf, 1, 1) + check_varuint64(buf, 1 << 6, 1) + check_varuint64(buf, 1 << 7, 2) + check_varuint64(buf, -(2**6), 9) + check_varuint64(buf, -(2**7), 9) + check_varuint64(buf, 1 << 13, 2) + check_varuint64(buf, 1 << 14, 3) + check_varuint64(buf, -(2**13), 9) + check_varuint64(buf, -(2**14), 9) + check_varuint64(buf, 1 << 20, 3) + check_varuint64(buf, 1 << 21, 4) + check_varuint64(buf, -(2**20), 9) + check_varuint64(buf, -(2**21), 9) + check_varuint64(buf, 1 << 27, 4) + check_varuint64(buf, 1 << 28, 5) + check_varuint64(buf, -(2**27), 9) + check_varuint64(buf, -(2**28), 9) + check_varuint64(buf, 1 << 30, 5) + check_varuint64(buf, -(2**30), 9) + check_varuint64(buf, 1 << 31, 5) + check_varuint64(buf, 
-(2**31), 9) + check_varuint64(buf, 1 << 32, 5) + check_varuint64(buf, -(2**32), 9) + check_varuint64(buf, 1 << 34, 5) + check_varuint64(buf, -(2**34), 9) + check_varuint64(buf, 1 << 35, 6) + check_varuint64(buf, -(2**35), 9) + check_varuint64(buf, 1 << 41, 6) + check_varuint64(buf, -(2**41), 9) + check_varuint64(buf, 1 << 42, 7) + check_varuint64(buf, -(2**42), 9) + check_varuint64(buf, 1 << 48, 7) + check_varuint64(buf, -(2**48), 9) + check_varuint64(buf, 1 << 49, 8) + check_varuint64(buf, -(2**49), 9) + check_varuint64(buf, 1 << 55, 8) + check_varuint64(buf, -(2**55), 9) + check_varuint64(buf, 1 << 56, 9) + check_varuint64(buf, -(2**56), 9) + check_varuint64(buf, 1 << 62, 9) + check_varuint64(buf, -(2**62), 9) + check_varuint64(buf, 1 << 63 - 1, 9) + check_varuint64(buf, -(2**63), 9) + + +def check_varuint64(buf: Buffer, value: int, bytes_written: int): reader_index = buf.reader_index assert buf.writer_index == buf.reader_index - actual_bytes_written = buf.write_varint64(value) + actual_bytes_written = buf.write_varuint64(value) assert actual_bytes_written == bytes_written - varint = buf.read_varint64() + varint = buf.read_varuint64() assert buf.writer_index == buf.reader_index assert value == varint # test slow read branch in `read_varint64` assert ( - buf.slice(reader_index, buf.reader_index - reader_index).read_varint64() + buf.slice(reader_index, buf.reader_index - reader_index).read_varuint64() == value ) @@ -233,5 +233,22 @@ def test_write_buffer(): assert buf.read(3) == b"123" +def test_read_bytes_as_int64(): + # test small buffer whose length < 8 + buf = Buffer(b"1234") + assert buf.read_bytes_as_int64(0) == 0 + assert buf.read_bytes_as_int64(1) == 49 + + # test big buffer whose length > 8 + buf = Buffer(b"12345678901234") + assert buf.read_bytes_as_int64(0) == 0 + assert buf.read_bytes_as_int64(1) == 49 + assert buf.read_bytes_as_int64(8) == 4123106164818064178 + + # test fix for `OverflowError: Python int too large to convert to C long` + buf = 
Buffer(b"\xa6IOr\x9ch)\x80\x12\x02") + buf.read_bytes_as_int64(8) + + if __name__ == "__main__": test_grow() diff --git a/python/pyfury/tests/test_cross_language.py b/python/pyfury/tests/test_cross_language.py index 9c0cc93e95..795b143848 100644 --- a/python/pyfury/tests/test_cross_language.py +++ b/python/pyfury/tests/test_cross_language.py @@ -243,7 +243,7 @@ def test_buffer(data_file_path): assert buffer.read_int64() == 2**63 - 1 assert math.isclose(buffer.read_float(), -1.1, rel_tol=1e-03) assert math.isclose(buffer.read_double(), -1.1, rel_tol=1e-03) - assert buffer.read_varint32() == 100 + assert buffer.read_varuint32() == 100 binary = b"ab" assert buffer.read_bytes(buffer.read_int32()) == binary buffer.write_bool(True) @@ -253,7 +253,7 @@ def test_buffer(data_file_path): buffer.write_int64(2**63 - 1) buffer.write_float(-1.1) buffer.write_double(-1.1) - buffer.write_varint32(100) + buffer.write_varuint32(100) buffer.write_int32(len(binary)) buffer.write_bytes(binary) with open(data_file_path, "wb+") as f: @@ -286,7 +286,8 @@ def test_cross_language_serializer(data_file_path): assert _deserialize_and_append(fury, buffer, objects) == -(2**15) assert _deserialize_and_append(fury, buffer, objects) == 2**31 - 1 assert _deserialize_and_append(fury, buffer, objects) == -(2**31) - assert _deserialize_and_append(fury, buffer, objects) == 2**63 - 1 + x = _deserialize_and_append(fury, buffer, objects) + assert x == 2**63 - 1, x assert _deserialize_and_append(fury, buffer, objects) == -(2**63) assert _deserialize_and_append(fury, buffer, objects) == -1.0 assert _deserialize_and_append(fury, buffer, objects) == -1.0 @@ -444,7 +445,7 @@ class ComplexObject2: def test_serialize_simple_struct_local(): fury = pyfury.Fury(language=pyfury.Language.XLANG, ref_tracking=True) - fury.register_class(ComplexObject2, type_tag="test.ComplexObject2") + fury.register_type(ComplexObject2, namespace="test", typename="ComplexObject2") obj = ComplexObject2(f1=True, f2={-1: 2}) new_buf = 
fury.serialize(obj) assert fury.deserialize(new_buf) == obj @@ -453,16 +454,31 @@ def test_serialize_simple_struct_local(): @cross_language_test def test_serialize_simple_struct(data_file_path): fury = pyfury.Fury(language=pyfury.Language.XLANG, ref_tracking=True) - fury.register_class(ComplexObject2, type_tag="test.ComplexObject2") + fury.register_type(ComplexObject2, namespace="test", typename="ComplexObject2") obj = ComplexObject2(f1=True, f2={-1: 2}) struct_round_back(data_file_path, fury, obj) +@cross_language_test +def test_struct_hash(data_file_path): + with open(data_file_path, "rb") as f: + data_bytes = f.read() + debug_print(f"len {len(data_bytes)}") + read_hash = pyfury.Buffer(data_bytes).read_int32() + fury = pyfury.Fury(language=pyfury.Language.XLANG, ref_tracking=True) + fury.register_type(ComplexObject1, typename="ComplexObject1") + serializer = fury.class_resolver.get_serializer(ComplexObject1) + from pyfury._struct import _get_hash + + v = _get_hash(fury, serializer._field_names, serializer._type_hints) + assert read_hash == v, (read_hash, v) + + @cross_language_test def test_serialize_complex_struct(data_file_path): fury = pyfury.Fury(language=pyfury.Language.XLANG, ref_tracking=True) - fury.register_class(ComplexObject1, type_tag="test.ComplexObject1") - fury.register_class(ComplexObject2, type_tag="test.ComplexObject2") + fury.register_type(ComplexObject1, namespace="test", typename="ComplexObject1") + fury.register_type(ComplexObject2, namespace="test", typename="ComplexObject2") obj2 = ComplexObject2(f1=True, f2={-1: 2}) obj1 = ComplexObject1( @@ -498,33 +514,7 @@ def struct_round_back(data_file_path, fury, obj1): f.write(new_buf) -@cross_language_test -def test_serialize_opaque_object(data_file_path): - with open(data_file_path, "rb") as f: - data_bytes = f.read() - debug_print(f"len {len(data_bytes)}") - fury = pyfury.Fury(language=pyfury.Language.XLANG, ref_tracking=True) - fury.register_class(ComplexObject1, type_tag="test.ComplexObject1") 
- new_obj = fury.deserialize(data_bytes) - debug_print(new_obj) - assert new_obj.f2 == "abc" - assert isinstance(new_obj.f1, pyfury.OpaqueObject) - assert isinstance(new_obj.f3[0], pyfury.OpaqueObject) - assert isinstance(new_obj.f3[1], pyfury.OpaqueObject) - # new_buf = fury.serialize(new_obj) - # debug_print(f"new_buf size {len(new_buf)}") - # assert fury.deserialize(new_buf) == new_obj - # with open(data_file_path, "wb+") as f: - # f.write(new_buf) - - class ComplexObject1Serializer(pyfury.serializer.Serializer): - def get_xtype_id(self): - return pyfury.type.FuryType.FURY_TYPE_TAG.value - - def get_xtype_tag(self): - return "test.ComplexObject1" - def write(self, buffer, value): self.xwrite(buffer, value) @@ -553,8 +543,10 @@ def test_register_serializer(data_file_path): data_bytes = f.read() buffer = pyfury.Buffer(data_bytes) fury = pyfury.Fury(language=pyfury.Language.XLANG, ref_tracking=True) - fury.register_serializer( - ComplexObject1, ComplexObject1Serializer(fury, ComplexObject1) + fury.register_type( + ComplexObject1, + typename="test.ComplexObject1", + serializer=ComplexObject1Serializer(fury, ComplexObject1), ) new_obj = fury.deserialize(buffer) expected = ComplexObject1(*[None] * 12) diff --git a/python/pyfury/tests/test_metastring_resolver.py b/python/pyfury/tests/test_metastring_resolver.py new file mode 100644 index 0000000000..f0cbaeea73 --- /dev/null +++ b/python/pyfury/tests/test_metastring_resolver.py @@ -0,0 +1,36 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from pyfury import Buffer +from pyfury._serialization import MetaStringResolver, MetaStringBytes +from pyfury.meta.metastring import MetaStringEncoder + + +def test_metastring_resolver(): + resolver = MetaStringResolver() + encoder = MetaStringEncoder("$", "_") + metastr1 = encoder.encode("hello, world") + metabytes1 = resolver.get_metastr_bytes(metastr1) + buffer = Buffer.allocate(32) + resolver.write_meta_string_bytes(buffer, metabytes1) + assert resolver.read_meta_string_bytes(buffer) == metabytes1 + metabytes2 = MetaStringBytes( + data=b"\xbf\x05\xa4q\xa9\x92S\x96\xa6IOr\x9ch)\x80", + hashcode=-5456063526933366015, + ) + resolver.write_meta_string_bytes(buffer, metabytes2) + assert resolver.read_meta_string_bytes(buffer) == metabytes2 diff --git a/python/pyfury/tests/test_serializer.py b/python/pyfury/tests/test_serializer.py index 59da5b1567..361639a6a3 100644 --- a/python/pyfury/tests/test_serializer.py +++ b/python/pyfury/tests/test_serializer.py @@ -41,7 +41,7 @@ Numpy1DArraySerializer, ) from pyfury.tests.core import require_pyarrow -from pyfury.type import FuryType +from pyfury.type import TypeId from pyfury.util import lazy_import pa = lazy_import("pyarrow") @@ -85,14 +85,18 @@ def test_dict(): @pytest.mark.parametrize("language", [Language.XLANG, Language.PYTHON]) def test_basic_serializer(language): fury = Fury(language=language, ref_tracking=True) - datetime_serializer = fury.class_resolver.get_serializer(datetime.datetime) + classinfo = fury.class_resolver.get_classinfo(datetime.datetime) assert isinstance( - datetime_serializer, 
(TimestampSerializer, _serialization.TimestampSerializer) + classinfo.serializer, (TimestampSerializer, _serialization.TimestampSerializer) ) - assert datetime_serializer.get_xtype_id() == FuryType.TIMESTAMP.value - date_serializer = fury.class_resolver.get_serializer(datetime.date) - assert isinstance(date_serializer, (DateSerializer, _serialization.DateSerializer)) - assert date_serializer.get_xtype_id() == FuryType.DATE32.value + if language == Language.XLANG: + assert classinfo.type_id == TypeId.TIMESTAMP + classinfo = fury.class_resolver.get_classinfo(datetime.date) + assert isinstance( + classinfo.serializer, (DateSerializer, _serialization.DateSerializer) + ) + if language == Language.XLANG: + assert classinfo.type_id == TypeId.LOCAL_DATE assert ser_de(fury, True) is True assert ser_de(fury, False) is False assert ser_de(fury, -1) == -1 @@ -227,7 +231,8 @@ def test_array_serializer(language): fury = Fury(language=language, ref_tracking=True, require_class_registration=False) for typecode in PyArraySerializer.typecode_dict.keys(): arr = array.array(typecode, list(range(10))) - assert ser_de(fury, arr) == arr + new_arr = ser_de(fury, arr) + assert np.array_equal(new_arr, arr) for dtype in Numpy1DArraySerializer.dtypes_dict.keys(): arr = np.array(list(range(10)), dtype=dtype) new_arr = ser_de(fury, arr) @@ -330,12 +335,6 @@ def xwrite(self, buffer, value: Bar): def xread(self, buffer): return Bar(buffer.read_int32(), buffer.read_int32()) - def get_xtype_id(self): - return pyfury.FuryType.FURY_TYPE_TAG.value - - def get_xtype_tag(self): - return "test.Bar" - class RegisterClass: def __init__(self, f1=None): @@ -362,7 +361,7 @@ def xwrite(self, buffer, value): def xread(self, buffer): raise NotImplementedError - fury.register_serializer(A, Serializer(fury, RegisterClass)) + fury.register_type(A, serializer=Serializer(fury, RegisterClass)) assert fury.deserialize(fury.serialize(RegisterClass(100))).f1 == 100 @@ -372,7 +371,7 @@ class C: pass -def 
test_register_class(): +def test_register_type(): fury = Fury(language=Language.PYTHON, ref_tracking=True) class Serializer(pyfury.Serializer): @@ -388,9 +387,9 @@ def xwrite(self, buffer, value): def xread(self, buffer): raise NotImplementedError - fury.register_serializer(A, Serializer(fury, A)) - fury.register_serializer(A.B, Serializer(fury, A.B)) - fury.register_serializer(A.B.C, Serializer(fury, A.B.C)) + fury.register_type(A, serializer=Serializer(fury, A)) + fury.register_type(A.B, serializer=Serializer(fury, A.B)) + fury.register_type(A.B.C, serializer=Serializer(fury, A.B.C)) assert isinstance(fury.deserialize(fury.serialize(A())), A) assert isinstance(fury.deserialize(fury.serialize(A.B())), A.B) assert isinstance(fury.deserialize(fury.serialize(A.B.C())), A.B.C) @@ -411,7 +410,9 @@ def test_pickle_fallback(): def test_unsupported_callback(): - fury = Fury(language=Language.PYTHON, ref_tracking=True) + fury = Fury( + language=Language.PYTHON, ref_tracking=True, require_class_registration=False + ) def f1(x): return x @@ -420,8 +421,6 @@ def f2(x): return x + x obj1 = [1, True, f1, f2, {1: 2}] - with pytest.raises(Exception): - fury.serialize(obj1) unsupported_objects = [] binary1 = fury.serialize(obj1, unsupported_callback=unsupported_objects.append) assert len(unsupported_objects) == 2 @@ -469,12 +468,6 @@ def test_enum(): assert ser_de(fury, EnumClass.E3) == EnumClass.E3 assert ser_de(fury, EnumClass.E4) == EnumClass.E4 assert isinstance(fury.class_resolver.get_serializer(EnumClass), EnumSerializer) - assert isinstance( - fury.class_resolver.get_serializer(obj=EnumClass.E1), EnumSerializer - ) - assert isinstance( - fury.class_resolver.get_serializer(obj=EnumClass.E4), EnumSerializer - ) def test_duplicate_serialize(): @@ -494,25 +487,19 @@ class CacheClass1: def test_cache_serializer(): fury = Fury(language=Language.PYTHON, ref_tracking=True) - fury.register_serializer(CacheClass1, pyfury.PickleStrongCacheSerializer(fury)) + 
fury.register_type(CacheClass1, serializer=pyfury.PickleStrongCacheSerializer(fury)) assert ser_de(fury, CacheClass1(1)) == CacheClass1(1) - fury.register_serializer(CacheClass1, pyfury.PickleCacheSerializer(fury)) + fury.register_type(CacheClass1, serializer=pyfury.PickleCacheSerializer(fury)) assert ser_de(fury, CacheClass1(1)) == CacheClass1(1) - classinfo = pyfury.PickleStrongCacheSerializer.new_classinfo(fury) - buffer = Buffer.allocate(32) - fury.serialize_ref(buffer, CacheClass1(1), classinfo) - assert fury.deserialize_ref(buffer) == CacheClass1(1) - classinfo = pyfury.PickleCacheSerializer.new_classinfo(fury) - fury.serialize_ref(buffer, CacheClass1(1), classinfo) - assert fury.deserialize_ref(buffer) == CacheClass1(1) - def test_pandas_range_index(): fury = Fury( language=Language.PYTHON, ref_tracking=True, require_class_registration=False ) - fury.register_serializer(pd.RangeIndex, pyfury.PandasRangeIndexSerializer(fury)) + fury.register_type( + pd.RangeIndex, serializer=pyfury.PandasRangeIndexSerializer(fury) + ) index = pd.RangeIndex(1, 100, 2, name="a") new_index = ser_de(fury, index) pd.testing.assert_index_equal(new_index, new_index) @@ -530,7 +517,9 @@ class PyDataClass1: def test_py_serialize_dataclass(): - fury = Fury(language=Language.PYTHON, ref_tracking=True) + fury = Fury( + language=Language.PYTHON, ref_tracking=True, require_class_registration=False + ) obj1 = PyDataClass1( f1=1, f2=-2.0, f3="abc", f4=True, f5="xyz", f6=[1, 2], f7={"k1": "v1"} ) diff --git a/python/pyfury/tests/test_struct.py b/python/pyfury/tests/test_struct.py index d1412026dd..940259b560 100644 --- a/python/pyfury/tests/test_struct.py +++ b/python/pyfury/tests/test_struct.py @@ -23,6 +23,7 @@ import pyfury from pyfury import Fury, Language +from pyfury.error import TypeUnregisteredError def ser_de(fury, obj): @@ -51,10 +52,10 @@ class ComplexObject: def test_struct(): fury = Fury(language=Language.XLANG, ref_tracking=True) - fury.register_class(SimpleObject, 
type_tag="example.SimpleObject") - fury.register_class(ComplexObject, type_tag="example.ComplexObject") + fury.register_type(SimpleObject, typename="SimpleObject") + fury.register_type(ComplexObject, typename="example.ComplexObject") o = SimpleObject(f1={1: 1.0 / 3}) - # assert ser_de(fury, o) == o + assert ser_de(fury, o) == o o = ComplexObject( f1="str", @@ -92,11 +93,20 @@ class ChildClass1(SuperClass1): f3: Dict[str, pyfury.Float64Type] = None +def test_require_class_registration(): + fury = Fury(language=Language.PYTHON, ref_tracking=True) + obj = ChildClass1(f1="a", f2=-10, f3={"a": -10.0, "b": 1 / 3}) + with pytest.raises(TypeUnregisteredError): + fury.serialize(obj) + + def test_inheritance(): type_hints = typing.get_type_hints(ChildClass1) print(type_hints) assert type_hints.keys() == {"f1", "f2", "f3"} - fury = Fury(language=Language.PYTHON, ref_tracking=True) + fury = Fury( + language=Language.PYTHON, ref_tracking=True, require_class_registration=False + ) obj = ChildClass1(f1="a", f2=-10, f3={"a": -10.0, "b": 1 / 3}) assert ser_de(fury, obj) == obj assert ( diff --git a/python/pyfury/type.py b/python/pyfury/type.py index e2aea26290..dc31817451 100644 --- a/python/pyfury/type.py +++ b/python/pyfury/type.py @@ -17,7 +17,6 @@ import array import dataclasses -import enum import importlib import inspect @@ -25,6 +24,13 @@ from typing import TypeVar from abc import ABC, abstractmethod +try: + import numpy as np + + ndarray = np.ndarray +except ImportError: + np, ndarray = None, None + # modified from `fluent python` def record_class_factory(cls_name, field_names): @@ -118,111 +124,124 @@ def get_qualified_classname(obj): return t.__module__ + "." + t.__name__ -class FuryType(enum.Enum): +class TypeId: """ - Fury added type for cross-language serialization. + Fury type for cross-language serialization. See `org.apache.fury.types.Type` """ - NA = 0 - # BOOL Boolean as 1 bit LSB bit-packed ordering + # a boolean value (true or false). 
BOOL = 1 - # UINT8 Unsigned 8-bit little-endian integer - UINT8 = 2 - # INT8 Signed 8-bit little-endian integer - INT8 = 3 - # UINT16 Unsigned 16-bit little-endian integer - UINT16 = 4 - # INT16 Signed 16-bit little-endian integer - INT16 = 5 - # UINT32 Unsigned 32-bit little-endian integer - UINT32 = 6 - # INT32 Signed 32-bit little-endian integer - INT32 = 7 - # UINT64 Unsigned 64-bit little-endian integer - UINT64 = 8 - # INT64 Signed 64-bit little-endian integer - INT64 = 9 - # HALF_FLOAT 2-byte floating point value - HALF_FLOAT = 10 - # FLOAT 4-byte floating point value - FLOAT = 11 - # DOUBLE 8-byte floating point value - DOUBLE = 12 - # STRING UTF8 variable-length string as List - STRING = 13 - # BINARY Variable-length bytes (no guarantee of UTF8-ness) - BINARY = 14 - # FIXED_SIZE_BINARY Fixed-size binary. Each value occupies the same number of bytes - FIXED_SIZE_BINARY = 15 - # DATE32 int32_t days since the UNIX epoch - DATE32 = 16 - # DATE64 int64_t milliseconds since the UNIX epoch - DATE64 = 17 - # TIMESTAMP Exact timestamp encoded with int64 since UNIX epoch - # Default unit millisecond - TIMESTAMP = 18 - # TIME32 Time as signed 32-bit integer representing either seconds or - # milliseconds since midnight - TIME32 = 19 - # TIME64 Time as signed 64-bit integer representing either microseconds or - # nanoseconds since midnight - TIME64 = 20 - # INTERVAL_MONTHS YEAR_MONTH interval in SQL style - INTERVAL_MONTHS = 21 - # INTERVAL_DAY_TIME DAY_TIME interval in SQL style - INTERVAL_DAY_TIME = 22 - # DECIMAL128 Precision- and scale-based decimal type with 128 bits. - DECIMAL128 = 23 - # DECIMAL256 Precision- and scale-based decimal type with 256 bits. 
- DECIMAL256 = 24 - # LIST A list of some logical data type - LIST = 25 - # STRUCT Struct of logical types - STRUCT = 26 - # SPARSE_UNION Sparse unions of logical types - SPARSE_UNION = 27 - # DENSE_UNION Dense unions of logical types - DENSE_UNION = 28 - # DICTIONARY Dictionary-encoded type also called "categorical" or "factor" - # in other programming languages. Holds the dictionary value - # type but not the dictionary itself which is part of the - # ArrayData struct - DICTIONARY = 29 - # MAP Map a repeated struct logical type - MAP = 30 - # EXTENSION Custom data type implemented by user - EXTENSION = 31 - # FIXED_SIZE_LIST Fixed size list of some logical type - FIXED_SIZE_LIST = 31 - # DURATION Measure of elapsed time in either seconds milliseconds microseconds - # or nanoseconds. - DURATION = 33 - # LARGE_STRING Like STRING but with 64-bit offsets - LARGE_STRING = 34 - # LARGE_BINARY Like BINARY but with 64-bit offsets - LARGE_BINARY = 35 - # LARGE_LIST Like LIST but with 64-bit offsets - LARGE_LIST = 36 - # MAX_ID Leave this at the end - MAX_ID = 37 - DECIMAL = DECIMAL128 - - FURY_TYPE_TAG = 256 - FURY_SET = 257 - FURY_PRIMITIVE_BOOL_ARRAY = 258 - FURY_PRIMITIVE_SHORT_ARRAY = 259 - FURY_PRIMITIVE_INT_ARRAY = 260 - FURY_PRIMITIVE_LONG_ARRAY = 261 - FURY_PRIMITIVE_FLOAT_ARRAY = 262 - FURY_PRIMITIVE_DOUBLE_ARRAY = 263 - FURY_STRING_ARRAY = 264 - FURY_SERIALIZED_OBJECT = 265 - FURY_BUFFER = 266 - FURY_ARROW_RECORD_BATCH = 267 - FURY_ARROW_TABLE = 268 - - + # a 8-bit signed integer. + INT8 = 2 + # a 16-bit signed integer. + INT16 = 3 + # a 32-bit signed integer. + INT32 = 4 + # a 32-bit signed integer which use fury var_int32 encoding. + VAR_INT32 = 5 + # a 64-bit signed integer. + INT64 = 6 + # a 64-bit signed integer which use fury PVL encoding. + VAR_INT64 = 7 + # a 64-bit signed integer which use fury SLI encoding. + SLI_INT64 = 8 + # a 16-bit floating point number. + FLOAT16 = 9 + # a 32-bit floating point number. 
+ FLOAT32 = 10 + # a 64-bit floating point number including NaN and Infinity. + FLOAT64 = 11 + # a text string encoded using Latin1/UTF16/UTF-8 encoding. + STRING = 12 + # a data type consisting of a set of named values. Rust enum with non-predefined field values are not supported as + # an enum + ENUM = 13 + # an enum whose value will be serialized as the registered name. + NAMED_ENUM = 14 + # a morphic(final) type serialized by Fury Struct serializer. i.e. it doesn't have subclasses. Suppose we're + # deserializing `List[SomeClass]`, we can save dynamic serializer dispatch since `SomeClass` is morphic(final). + STRUCT = 15 + # a type which is not morphic(not final). i.e. it have subclasses. Suppose we're deserializing + # `List[SomeClass]`, we must dispatch serializer dynamically since `SomeClass` is polymorphic(non-final). + POLYMORPHIC_STRUCT = 16 + # a morphic(final) type serialized by Fury compatible Struct serializer. + COMPATIBLE_STRUCT = 17 + # a non-morphic(non-final) type serialized by Fury compatible Struct serializer. + POLYMORPHIC_COMPATIBLE_STRUCT = 18 + # a `struct` whose type mapping will be encoded as a name. + NAMED_STRUCT = 19 + # a `polymorphic_struct` whose type mapping will be encoded as a name. + NAMED_POLYMORPHIC_STRUCT = 20 + # a `compatible_struct` whose type mapping will be encoded as a name. + NAMED_COMPATIBLE_STRUCT = 21 + # a `polymorphic_compatible_struct` whose type mapping will be encoded as a name. + NAMED_POLYMORPHIC_COMPATIBLE_STRUCT = 22 + # a type which will be serialized by a customized serializer. + EXT = 23 + # an `ext` type which is not morphic(not final). + POLYMORPHIC_EXT = 24 + # an `ext` type whose type mapping will be encoded as a name. + NAMED_EXT = 25 + # an `polymorphic_ext` type whose type mapping will be encoded as a name. + NAMED_POLYMORPHIC_EXT = 26 + # a sequence of objects. + LIST = 27 + # an unordered set of unique elements. + SET = 28 + # a map of key-value pairs. 
Mutable types such as `list/map/set/array/tensor/arrow` are not allowed as key of map. + MAP = 29 + # an absolute length of time, independent of any calendar/timezone, as a count of nanoseconds. + DURATION = 30 + # a point in time, independent of any calendar/timezone, as a count of nanoseconds. The count is relative + # to an epoch at UTC midnight on January 1, 1970. + TIMESTAMP = 31 + # a naive date without timezone. The count is days relative to an epoch at UTC midnight on Jan 1, 1970. + LOCAL_DATE = 32 + # exact decimal value represented as an integer value in two's complement. + DECIMAL = 33 + # an variable-length array of bytes. + BINARY = 34 + # a multidimensional array which every sub-array can have different sizes but all have same type. + # only allow numeric components. Other arrays will be taken as List. The implementation should support the + # interoperability between array and list. + ARRAY = 35 + # one dimensional bool array. + BOOL_ARRAY = 36 + # one dimensional int16 array. + INT8_ARRAY = 37 + # one dimensional int16 array. + INT16_ARRAY = 38 + # one dimensional int32 array. + INT32_ARRAY = 39 + # one dimensional int64 array. + INT64_ARRAY = 40 + # one dimensional half_float_16 array. + FLOAT16_ARRAY = 41 + # one dimensional float32 array. + FLOAT32_ARRAY = 42 + # one dimensional float64 array. + FLOAT64_ARRAY = 43 + # an arrow [record batch](https://arrow.apache.org/docs/cpp/tables.html#record-batches) object. + ARROW_RECORD_BATCH = 44 + # an arrow [table](https://arrow.apache.org/docs/cpp/tables.html#tables) object. 
+ ARROW_TABLE = 45 + BOUND = 64 + + @staticmethod + def is_namespaced_type(type_id: int) -> bool: + return type_id in __NAMESPACED_TYPES__ + + +__NAMESPACED_TYPES__ = { + TypeId.NAMED_EXT, + TypeId.NAMED_POLYMORPHIC_EXT, + TypeId.NAMED_ENUM, + TypeId.NAMED_STRUCT, + TypeId.NAMED_POLYMORPHIC_STRUCT, + TypeId.NAMED_COMPATIBLE_STRUCT, + TypeId.NAMED_POLYMORPHIC_COMPATIBLE_STRUCT, +} Int8Type = TypeVar("Int8Type", bound=int) Int16Type = TypeVar("Int16Type", bound=int) Int32Type = TypeVar("Int32Type", bound=int) @@ -250,11 +269,18 @@ def is_primitive_type(type_) -> bool: # Int8ArrayType = TypeVar("Int8ArrayType", bound=array.ArrayType) +BoolArrayType = TypeVar("BoolArrayType") Int16ArrayType = TypeVar("Int16ArrayType", bound=array.ArrayType) Int32ArrayType = TypeVar("Int32ArrayType", bound=array.ArrayType) Int64ArrayType = TypeVar("Int64ArrayType", bound=array.ArrayType) Float32ArrayType = TypeVar("Float32ArrayType", bound=array.ArrayType) Float64ArrayType = TypeVar("Float64ArrayType", bound=array.ArrayType) +BoolNDArrayType = TypeVar("BoolNDArrayType", bound=ndarray) +Int16NDArrayType = TypeVar("Int16NDArrayType", bound=ndarray) +Int32NDArrayType = TypeVar("Int32NDArrayType", bound=ndarray) +Int64NDArrayType = TypeVar("Int64NDArrayType", bound=ndarray) +Float32NDArrayType = TypeVar("Float32NDArrayType", bound=ndarray) +Float64NDArrayType = TypeVar("Float64NDArrayType", bound=ndarray) _py_array_types = { From 3c1df17a1edd8b64196475c8c6a7db5acf2e502a Mon Sep 17 00:00:00 2001 From: Aliothmoon <107878625+Aliothmoon@users.noreply.github.com> Date: Mon, 30 Dec 2024 23:18:53 +0800 Subject: [PATCH 10/13] fix(java): Fix the issue caused by not using readCompressedBytesString during deserialization when string compression is enabled. (#1991) Fix the issue caused by not using readCompressedBytesString during deserialization when string compression is enabled. ## What does this PR do? 
fix issue [#1984 ](https://github.com/apache/fury/issues/1984) ## Related issues ## Does this PR introduce any user-facing change? - [ ] Does this PR introduce any public API change? - [ ] Does this PR introduce any binary protocol compatibility change? ## Benchmark --- .../fury/serializer/StringSerializer.java | 6 +++- .../fury/serializer/StringSerializerTest.java | 30 +++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java b/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java index 99c34c260f..fd84c7ca3a 100644 --- a/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java +++ b/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java @@ -176,7 +176,11 @@ public String readString(MemoryBuffer buffer) { public Expression readStringExpr(Expression strSerializer, Expression buffer) { if (isJava) { if (STRING_VALUE_FIELD_IS_BYTES) { - return new Invoke(strSerializer, "readBytesString", STRING_TYPE, buffer); + if (compressString) { + return new Invoke(strSerializer, "readCompressedBytesString", STRING_TYPE, buffer); + } else { + return new Invoke(strSerializer, "readBytesString", STRING_TYPE, buffer); + } } else { if (!STRING_VALUE_FIELD_IS_CHARS) { throw new UnsupportedOperationException(); diff --git a/java/fury-core/src/test/java/org/apache/fury/serializer/StringSerializerTest.java b/java/fury-core/src/test/java/org/apache/fury/serializer/StringSerializerTest.java index 15c46c57f7..123f3e54e6 100644 --- a/java/fury-core/src/test/java/org/apache/fury/serializer/StringSerializerTest.java +++ b/java/fury-core/src/test/java/org/apache/fury/serializer/StringSerializerTest.java @@ -29,6 +29,7 @@ import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.BlockingQueue; import java.util.concurrent.ConcurrentLinkedQueue; +import lombok.Data; import org.apache.fury.Fury; import 
org.apache.fury.FuryTestBase; import org.apache.fury.collection.Tuple2; @@ -160,6 +161,35 @@ public void testJavaStringSimple() { } } + @Data + public static class Simple { + private String str; + + public Simple(String str) { + this.str = str; + } + } + + /** Test for #1984 */ + @Test + public void testJavaCompressedString() { + Fury fury = + Fury.builder() + .withStringCompressed(true) + .withLanguage(Language.JAVA) + .requireClassRegistration(false) + .build(); + + Simple a = + new Simple( + "STG@ON DEMAND Solutions@GeoComputing Switch/ Hub@Digi Edgeport/216 – 16 port Serial Hub"); + + byte[] bytes = fury.serialize(a); + + Simple b = (Simple) fury.deserialize(bytes); + assertEquals(a, b); + } + @Test(dataProvider = "stringCompress") public void testJavaString(boolean stringCompress) { Fury fury = From 1c250e1d2f2098fd4c701f2eb298d04c804c3e1b Mon Sep 17 00:00:00 2001 From: PAN <46820719+pandalee99@users.noreply.github.com> Date: Wed, 1 Jan 2025 21:56:18 +0800 Subject: [PATCH 11/13] feat(c++): Support the UTF-8 to UTF-16 with SIMD (#1990) ## What does this PR do? To support the utf8 utf16 and using simd to accelerate the optimization ``` c++ std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) ``` The logic of converting UTF-8 to UTF-16 isn't that complicated. but there are still lots of optimizations that I haven't come up with yet. So, I'll first design a version that's a bit faster than the original one, and then think about how to make further optimizations. 
Judging from the tests, the logic is correct: ``` text [----------] 9 tests from UTF8ToUTF16Test [ RUN ] UTF8ToUTF16Test.BasicConversion [ OK ] UTF8ToUTF16Test.BasicConversion (0 ms) [ RUN ] UTF8ToUTF16Test.EmptyString [ OK ] UTF8ToUTF16Test.EmptyString (0 ms) [ RUN ] UTF8ToUTF16Test.SurrogatePairs [ OK ] UTF8ToUTF16Test.SurrogatePairs (0 ms) [ RUN ] UTF8ToUTF16Test.BoundaryValues [ OK ] UTF8ToUTF16Test.BoundaryValues (0 ms) [ RUN ] UTF8ToUTF16Test.SpecialCharacters [ OK ] UTF8ToUTF16Test.SpecialCharacters (0 ms) [ RUN ] UTF8ToUTF16Test.LittleEndian [ OK ] UTF8ToUTF16Test.LittleEndian (0 ms) [ RUN ] UTF8ToUTF16Test.BigEndian [ OK ] UTF8ToUTF16Test.BigEndian (0 ms) [ RUN ] UTF8ToUTF16Test.RoundTripConversion [ OK ] UTF8ToUTF16Test.RoundTripConversion (0 ms) ``` image And from the performance perspective, it's improved compared to serial processing: image The speed of execution has been significantly improved Actually, this code doesn't use libraries like AVX2 or really apply SIMD to process. The main reason is that the structure of UTF-8 encoding is complex and not fixed. It involves multi-byte encoding, and we need to analyze it byte by byte when dealing with different bytes. So, without clear rules and a uniform length, it becomes really hard to directly parallelize the processing of each byte. During the process of converting UTF-8 to UTF-16, we have to handle characters of different lengths, ranging from 1 to 4 bytes, which makes it difficult to break it down into structures that can be directly applied to SIMD operations. There are also some code style changes, uniform writing ## Related issues Close #1964 ## Does this PR introduce any user-facing change? - [x] Does this PR introduce any public API change? - [ ] Does this PR introduce any binary protocol compatibility change? 
## Benchmark --- cpp/fury/util/string_util.cc | 209 +++++++++++++++++++++++++- cpp/fury/util/string_util.h | 4 +- cpp/fury/util/string_util_test.cc | 241 +++++++++++++++++++++++++++++- 3 files changed, 451 insertions(+), 3 deletions(-) diff --git a/cpp/fury/util/string_util.cc b/cpp/fury/util/string_util.cc index 5413c72af8..3c0543c913 100644 --- a/cpp/fury/util/string_util.cc +++ b/cpp/fury/util/string_util.cc @@ -58,6 +58,129 @@ inline void utf16SurrogatePairToUtf8(uint16_t high, uint16_t low, char *&utf8) { *utf8++ = static_cast((code_point & 0x3F) | 0x80); } +std::u16string utf8ToUtf16SIMD(const std::string &utf8, bool is_little_endian) { + std::u16string utf16; + utf16.reserve(utf8.size()); // Reserve space to avoid frequent reallocations + + char buffer[64]; // Buffer to hold temporary UTF-16 results + char16_t *output = + reinterpret_cast(buffer); // Use char16_t for output + + size_t i = 0; + size_t n = utf8.size(); + + while (i + 32 <= n) { + + for (int j = 0; j < 32; ++j) { + uint8_t byte = utf8[i + j]; + + if (byte < 0x80) { + // 1-byte character (ASCII) + *output++ = static_cast(byte); + } else if (byte < 0xE0) { + // 2-byte character + uint16_t utf16_char = ((byte & 0x1F) << 6) | (utf8[i + j + 1] & 0x3F); + if (!is_little_endian) { + utf16_char = (utf16_char >> 8) | + (utf16_char << 8); // Swap bytes for big-endian + } + *output++ = utf16_char; + ++j; + } else if (byte < 0xF0) { + // 3-byte character + uint16_t utf16_char = ((byte & 0x0F) << 12) | + ((utf8[i + j + 1] & 0x3F) << 6) | + (utf8[i + j + 2] & 0x3F); + if (!is_little_endian) { + utf16_char = (utf16_char >> 8) | + (utf16_char << 8); // Swap bytes for big-endian + } + *output++ = utf16_char; + j += 2; + } else { + // 4-byte character (surrogate pair handling required) + uint32_t code_point = + ((byte & 0x07) << 18) | ((utf8[i + j + 1] & 0x3F) << 12) | + ((utf8[i + j + 2] & 0x3F) << 6) | (utf8[i + j + 3] & 0x3F); + + // Convert the code point to a surrogate pair + uint16_t high_surrogate = 
0xD800 + ((code_point - 0x10000) >> 10); + uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF); + + if (!is_little_endian) { + high_surrogate = (high_surrogate >> 8) | + (high_surrogate << 8); // Swap bytes for big-endian + low_surrogate = (low_surrogate >> 8) | + (low_surrogate << 8); // Swap bytes for big-endian + } + + *output++ = high_surrogate; + *output++ = low_surrogate; + + j += 3; + } + } + + // Append the processed buffer to the final utf16 string + utf16.append(reinterpret_cast(buffer), + output - reinterpret_cast(buffer)); + output = + reinterpret_cast(buffer); // Reset output buffer pointer + i += 32; + } + + // Handle remaining characters + while (i < n) { + uint8_t byte = utf8[i]; + + if (byte < 0x80) { + *output++ = static_cast(byte); + } else if (byte < 0xE0) { + uint16_t utf16_char = ((byte & 0x1F) << 6) | (utf8[i + 1] & 0x3F); + if (!is_little_endian) { + utf16_char = + (utf16_char >> 8) | (utf16_char << 8); // Swap bytes for big-endian + } + *output++ = utf16_char; + ++i; + } else if (byte < 0xF0) { + uint16_t utf16_char = ((byte & 0x0F) << 12) | + ((utf8[i + 1] & 0x3F) << 6) | (utf8[i + 2] & 0x3F); + if (!is_little_endian) { + utf16_char = + (utf16_char >> 8) | (utf16_char << 8); // Swap bytes for big-endian + } + *output++ = utf16_char; + i += 2; + } else { + uint32_t code_point = ((byte & 0x07) << 18) | + ((utf8[i + 1] & 0x3F) << 12) | + ((utf8[i + 2] & 0x3F) << 6) | (utf8[i + 3] & 0x3F); + + uint16_t high_surrogate = 0xD800 + ((code_point - 0x10000) >> 10); + uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF); + + if (!is_little_endian) { + high_surrogate = (high_surrogate >> 8) | (high_surrogate << 8); + low_surrogate = (low_surrogate >> 8) | (low_surrogate << 8); + } + + *output++ = high_surrogate; + *output++ = low_surrogate; + + i += 3; + } + + ++i; + } + + // Append the last part of the buffer to the utf16 string + utf16.append(reinterpret_cast(buffer), + output - reinterpret_cast(buffer)); + + return utf16; +} + #if 
defined(__x86_64__) || defined(_M_X64) bool isLatin(const std::string &str) { @@ -168,6 +291,10 @@ std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) { return utf8; } +std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian) { + return utf8ToUtf16SIMD(utf8, is_little_endian); +} + #elif defined(__ARM_NEON) || defined(__ARM_NEON__) bool isLatin(const std::string &str) { @@ -264,6 +391,10 @@ std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) { return utf8; } +std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian) { + return utf8ToUtf16SIMD(utf8, is_little_endian); +} + #elif defined(__riscv) && __riscv_vector bool isLatin(const std::string &str) { @@ -365,6 +496,10 @@ std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) { return utf8; } +std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian) { + return utf8ToUtf16SIMD(utf8, is_little_endian); +} + #else bool isLatin(const std::string &str) { @@ -414,6 +549,78 @@ std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) { return utf8; } +// Fallback implementation without SIMD acceleration +std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian) { + std::u16string utf16; // Resulting UTF-16 string + size_t i = 0; // Index for traversing the UTF-8 string + size_t n = utf8.size(); // Total length of the UTF-8 string + + // Loop through each byte of the UTF-8 string + while (i < n) { + uint32_t code_point = 0; // The Unicode code point + unsigned char c = utf8[i]; // Current byte of the UTF-8 string + + // Determine the number of bytes for this character based on its first byte + if ((c & 0x80) == 0) { + // 1-byte character (ASCII) + code_point = c; + ++i; + } else if ((c & 0xE0) == 0xC0) { + // 2-byte character + code_point = c & 0x1F; + code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); + i += 2; + } else if ((c & 0xF0) == 0xE0) { + // 3-byte 
character + code_point = c & 0x0F; + code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); + code_point = (code_point << 6) | (utf8[i + 2] & 0x3F); + i += 3; + } else if ((c & 0xF8) == 0xF0) { + // 4-byte character + code_point = c & 0x07; + code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); + code_point = (code_point << 6) | (utf8[i + 2] & 0x3F); + code_point = (code_point << 6) | (utf8[i + 3] & 0x3F); + i += 4; + } else { + // Invalid UTF-8 byte sequence + throw std::invalid_argument("Invalid UTF-8 encoding."); + } + + // If the code point is beyond the BMP range, use surrogate pairs + if (code_point >= 0x10000) { + code_point -= 0x10000; // Subtract 0x10000 to get the surrogate pair + uint16_t high_surrogate = 0xD800 + (code_point >> 10); // High surrogate + uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF); // Low surrogate + + // If not little-endian, swap bytes of the surrogates + if (!is_little_endian) { + high_surrogate = (high_surrogate >> 8) | (high_surrogate << 8); + low_surrogate = (low_surrogate >> 8) | (low_surrogate << 8); + } + + // Add both high and low surrogates to the UTF-16 string + utf16.push_back(high_surrogate); + utf16.push_back(low_surrogate); + } else { + // For code points within the BMP range, directly store as a 16-bit value + uint16_t utf16_char = static_cast(code_point); + + // If not little-endian, swap the bytes of the 16-bit character + if (!is_little_endian) { + utf16_char = (utf16_char >> 8) | (utf16_char << 8); + } + + // Add the UTF-16 character to the string + utf16.push_back(utf16_char); + } + } + + // Return the resulting UTF-16 string + return utf16; +} + #endif -} // namespace fury +} // namespace fury \ No newline at end of file diff --git a/cpp/fury/util/string_util.h b/cpp/fury/util/string_util.h index 9cb4cc7e83..d5bdde543a 100644 --- a/cpp/fury/util/string_util.h +++ b/cpp/fury/util/string_util.h @@ -27,4 +27,6 @@ bool isLatin(const std::string &str); std::string utf16ToUtf8(const std::u16string &utf16, 
bool is_little_endian); -} // namespace fury +std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian); + +} // namespace fury \ No newline at end of file diff --git a/cpp/fury/util/string_util_test.cc b/cpp/fury/util/string_util_test.cc index 9b2213b9b0..60c72c80b3 100644 --- a/cpp/fury/util/string_util_test.cc +++ b/cpp/fury/util/string_util_test.cc @@ -298,9 +298,248 @@ TEST(UTF16ToUTF8Test, PerformanceTest) { } } +// Generate random UTF-8 string +std::string generateRandomUTF8String(size_t length) { + std::string str; + std::mt19937 generator(std::random_device{}()); + std::uniform_int_distribution distribution(0, 0x10FFFF); + + while (str.size() < length) { + uint32_t code_point = distribution(generator); + + // Skip surrogate pairs (0xD800 to 0xDFFF) and other invalid Unicode code + // points + if ((code_point >= 0xD800 && code_point <= 0xDFFF) || + code_point > 0x10FFFF) { + continue; + } + + if (code_point <= 0x7F) { + str.push_back(static_cast(code_point)); + } else if (code_point <= 0x7FF) { + str.push_back(0xC0 | (code_point >> 6)); + str.push_back(0x80 | (code_point & 0x3F)); + } else if (code_point <= 0xFFFF) { + str.push_back(0xE0 | (code_point >> 12)); + str.push_back(0x80 | ((code_point >> 6) & 0x3F)); + str.push_back(0x80 | (code_point & 0x3F)); + } else if (code_point <= 0x10FFFF) { + str.push_back(0xF0 | (code_point >> 18)); + str.push_back(0x80 | ((code_point >> 12) & 0x3F)); + str.push_back(0x80 | ((code_point >> 6) & 0x3F)); + str.push_back(0x80 | (code_point & 0x3F)); + } + } + + return str; +} + +std::u16string utf8ToUtf16BaseLine(const std::string &utf8, + bool is_little_endian) { + std::u16string utf16; // Resulting UTF-16 string + size_t i = 0; // Index for traversing the UTF-8 string + size_t n = utf8.size(); // Total length of the UTF-8 string + + // Loop through each byte of the UTF-8 string + while (i < n) { + uint32_t code_point = 0; // The Unicode code point + unsigned char c = utf8[i]; // Current byte of the UTF-8 
string + + // Determine the number of bytes for this character based on its first byte + if ((c & 0x80) == 0) { + // 1-byte character (ASCII) + code_point = c; + ++i; + } else if ((c & 0xE0) == 0xC0) { + // 2-byte character + code_point = c & 0x1F; + code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); + i += 2; + } else if ((c & 0xF0) == 0xE0) { + // 3-byte character + code_point = c & 0x0F; + code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); + code_point = (code_point << 6) | (utf8[i + 2] & 0x3F); + i += 3; + } else if ((c & 0xF8) == 0xF0) { + // 4-byte character + code_point = c & 0x07; + code_point = (code_point << 6) | (utf8[i + 1] & 0x3F); + code_point = (code_point << 6) | (utf8[i + 2] & 0x3F); + code_point = (code_point << 6) | (utf8[i + 3] & 0x3F); + i += 4; + } else { + // Invalid UTF-8 byte sequence + throw std::invalid_argument("Invalid UTF-8 encoding."); + } + + // If the code point is beyond the BMP range, use surrogate pairs + if (code_point >= 0x10000) { + code_point -= 0x10000; // Subtract 0x10000 to get the surrogate pair + uint16_t high_surrogate = 0xD800 + (code_point >> 10); // High surrogate + uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF); // Low surrogate + + // If not little-endian, swap bytes of the surrogates + if (!is_little_endian) { + high_surrogate = (high_surrogate >> 8) | (high_surrogate << 8); + low_surrogate = (low_surrogate >> 8) | (low_surrogate << 8); + } + + // Add both high and low surrogates to the UTF-16 string + utf16.push_back(high_surrogate); + utf16.push_back(low_surrogate); + } else { + // For code points within the BMP range, directly store as a 16-bit value + uint16_t utf16_char = static_cast(code_point); + + // If not little-endian, swap the bytes of the 16-bit character + if (!is_little_endian) { + utf16_char = (utf16_char >> 8) | (utf16_char << 8); + } + + // Add the UTF-16 character to the string + utf16.push_back(utf16_char); + } + } + + // Return the resulting UTF-16 string + return utf16; +} + 
+// Testing Basic Logic +TEST(UTF8ToUTF16Test, BasicConversion) { + std::string utf8 = u8"Hello, 世界!"; + std::u16string utf16 = fury::utf8ToUtf16(utf8, true); + ASSERT_EQ(utf16, u"Hello, 世界!"); +} + +// Testing Empty String +TEST(UTF8ToUTF16Test, EmptyString) { + std::string utf8 = ""; + std::u16string utf16 = fury::utf8ToUtf16(utf8, true); + ASSERT_EQ(utf16, u""); +} + +// Testing emoji +TEST(UTF8ToUTF16Test, SurrogatePairs) { + std::string utf8 = "\xF0\x9F\x98\x80"; // 😀 emoji + std::u16string utf16 = fury::utf8ToUtf16(utf8, true); + std::u16string expected_utf16 = {0xD83D, 0xDE00}; // Surrogate pair for emoji + ASSERT_EQ(utf16, expected_utf16); +} + +// Correct Boundary testing for U+FFFD (replacement character) +TEST(UTF8ToUTF16Test, BoundaryValues) { + // "\xEF\xBF\xBD" is the UTF-8 encoding for U+FFFD (replacement character) + std::string utf8 = "\xEF\xBF\xBD"; // U+FFFD in UTF-8 + std::u16string utf16 = fury::utf8ToUtf16(utf8, true); + std::u16string expected_utf16 = { + 0xFFFD}; // Expected UTF-16 representation of U+FFFD + ASSERT_EQ(utf16, expected_utf16); +} + +// Testing Special Characters +TEST(UTF8ToUTF16Test, SpecialCharacters) { + std::string utf8 = " \n\t"; + std::u16string utf16 = fury::utf8ToUtf16(utf8, true); + ASSERT_EQ(utf16, u" \n\t"); +} + +// Testing LittleEndian +TEST(UTF8ToUTF16Test, LittleEndian) { + std::string utf8 = "ab"; + std::u16string utf16 = fury::utf8ToUtf16(utf8, true); + std::u16string expected_utf16 = { + 0x61, 0x62}; // Little-endian UTF-16 representation of "ab" + ASSERT_EQ(utf16, expected_utf16); +} + +// Correct BigEndian testing for BOM (Byte Order Mark) +TEST(UTF8ToUTF16Test, BigEndian) { + std::string utf8 = "\xEF\xBB\xBF"; // BOM in UTF-8 (0xFEFF) + std::u16string utf16 = fury::utf8ToUtf16(utf8, false); // Big-endian + std::u16string expected_utf16 = {0xFFFE}; // Expected BOM in UTF-16 + ASSERT_EQ(utf16, expected_utf16); +} + +// Testing round-trip conversion (UTF-8 -> UTF-16 -> UTF-8) +TEST(UTF8ToUTF16Test, 
RoundTripConversion) { + std::string original_utf8 = u8"Hello, 世界!"; + std::u16string utf16 = fury::utf8ToUtf16(original_utf8, true); + std::string utf8_converted_back = fury::utf16ToUtf8(utf16, true); + ASSERT_EQ(original_utf8, utf8_converted_back); +} + +// Testing Performance +TEST(UTF8ToUTF16Test, PerformanceTest) { + const size_t num_tests = 1000; + const size_t string_length = 1000; + // Default little_endian + bool is_little_endian = true; + + // Random UTF-8 + std::vector test_strings; + for (size_t i = 0; i < num_tests; ++i) { + test_strings.push_back(generateRandomUTF8String(string_length)); + } + + // Standard Library + try { + auto start_time = std::chrono::high_resolution_clock::now(); + std::wstring_convert, wchar_t> convert; + // Loop through test strings and convert each UTF-8 string to UTF-16 + for (const auto &str : test_strings) { + std::wstring wide_str = convert.from_bytes(str); + std::u16string utf16; + for (wchar_t wc : wide_str) { + utf16.push_back(static_cast(wc)); + } + } + auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast( + end_time - start_time) + .count(); + FURY_LOG(INFO) << "Standard Library Running Time: " << duration << " ns"; + } catch (const std::exception &e) { + FURY_LOG(FATAL) << "Caught exception in standard library conversion: " + << e.what(); + } + + // BaseLine + try { + auto start_time = std::chrono::high_resolution_clock::now(); + for (const auto &str : test_strings) { + std::u16string utf16 = utf8ToUtf16BaseLine(str, is_little_endian); + } + auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast( + end_time - start_time) + .count(); + FURY_LOG(INFO) << "BaseLine Running Time: " << duration << " ns"; + } catch (const std::exception &e) { + FURY_LOG(FATAL) << "Caught exception in baseline conversion: " << e.what(); + } + + // Optimized (SIMD) + try { + auto start_time = std::chrono::high_resolution_clock::now(); + for 
(const auto &str : test_strings) { + std::u16string utf16 = fury::utf8ToUtf16(str, is_little_endian); + } + auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast( + end_time - start_time) + .count(); + FURY_LOG(INFO) << "SIMD Optimized Running Time: " << duration << " ns"; + } catch (const std::exception &e) { + FURY_LOG(FATAL) << "Caught exception in SIMD optimized conversion: " + << e.what(); + } +} + } // namespace fury int main(int argc, char **argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); -} +} \ No newline at end of file From d7ddd904f8805948c8dd25d8b469c9f720d7cc0c Mon Sep 17 00:00:00 2001 From: orisgarno <42269241+orisgarno@users.noreply.github.com> Date: Tue, 7 Jan 2025 20:36:30 +0700 Subject: [PATCH 12/13] fix(java): Compatible mode on de/serialize api failed to deserialize (#1996) ## What does this PR do? Read and write class data in COMPATIBLE mode for the de/serializeJavaObject API. When COMPATIBLE mode is on and different POJOs need to be serialized and deserialized, users are required to register the classes that are going to be serialized or deserialized. ## Related issues ## Does this PR introduce any user-facing change? - [ ] Does this PR introduce any public API change? - [ ] Does this PR introduce any binary protocol compatibility change?
## Benchmark --------- Co-authored-by: Shawn Yang --- docs/guide/java_serialization_guide.md | 9 +- .../src/main/java/org/apache/fury/Fury.java | 9 +- .../fury/serializer/EnumSerializerTest.java | 6 + ...DifferentPOJOCompatibleSerializerTest.java | 104 ++++++++++++++++++ .../classes/ClassCompleteField.java | 60 ++++++++++ .../compatible/classes/ClassMissingField.java | 46 ++++++++ .../classes/SubclassCompleteField.java | 55 +++++++++ .../classes/SubclassMissingField.java | 44 ++++++++ 8 files changed, 331 insertions(+), 2 deletions(-) create mode 100644 java/fury-core/src/test/java/org/apache/fury/serializer/compatible/DifferentPOJOCompatibleSerializerTest.java create mode 100644 java/fury-core/src/test/java/org/apache/fury/serializer/compatible/classes/ClassCompleteField.java create mode 100644 java/fury-core/src/test/java/org/apache/fury/serializer/compatible/classes/ClassMissingField.java create mode 100644 java/fury-core/src/test/java/org/apache/fury/serializer/compatible/classes/SubclassCompleteField.java create mode 100644 java/fury-core/src/test/java/org/apache/fury/serializer/compatible/classes/SubclassMissingField.java diff --git a/docs/guide/java_serialization_guide.md b/docs/guide/java_serialization_guide.md index 5270390113..707643935b 100644 --- a/docs/guide/java_serialization_guide.md +++ b/docs/guide/java_serialization_guide.md @@ -102,7 +102,7 @@ public class Example { | `compressLong` | Enables or disables long compression for smaller size. | `true` | | `compressString` | Enables or disables string compression for smaller size. | `false` | | `classLoader` | The classloader should not be updated; Fury caches class metadata. Use `LoaderBinding` or `ThreadSafeFury` for classloader updates. | `Thread.currentThread().getContextClassLoader()` | -| `compatibleMode` | Type forward/backward compatibility config. Also Related to `checkClassVersion` config. 
`SCHEMA_CONSISTENT`: Class schema must be consistent between serialization peer and deserialization peer. `COMPATIBLE`: Class schema can be different between serialization peer and deserialization peer. They can add/delete fields independently. | `CompatibleMode.SCHEMA_CONSISTENT` | +| `compatibleMode` | Type forward/backward compatibility config. Also Related to `checkClassVersion` config. `SCHEMA_CONSISTENT`: Class schema must be consistent between serialization peer and deserialization peer. `COMPATIBLE`: Class schema can be different between serialization peer and deserialization peer. They can add/delete fields independently. [See more](#class-inconsistency-and-class-version-check). | `CompatibleMode.SCHEMA_CONSISTENT` | | `checkClassVersion` | Determines whether to check the consistency of the class schema. If enabled, Fury checks, writes, and checks consistency using the `classVersionHash`. It will be automatically disabled when `CompatibleMode#COMPATIBLE` is enabled. Disabling is not recommended unless you can ensure the class won't evolve. | `false` | | `checkJdkClassSerializable` | Enables or disables checking of `Serializable` interface for classes under `java.*`. If a class under `java.*` is not `Serializable`, Fury will throw an `UnsupportedOperationException`. | `true` | | `registerGuavaTypes` | Whether to pre-register Guava types such as `RegularImmutableMap`/`RegularImmutableList`. These types are not public API, but seem pretty stable. | `true` | @@ -518,6 +518,13 @@ fury with `CompatibleMode.COMPATIBLE` has more performance and space cost, do not set it by default if your classes are always consistent between serialization and deserialization. +### Deserialize POJO into another type + +Fury allows you to serialize one POJO and deserialize it into a different POJO. To achieve this, configure Fury with +`CompatibleMode` set to `org.apache.fury.config.CompatibleMode.COMPATIBLE`. 
Additionally, you only need to register the +specific classes you want to serialize or deserialize to setup type mapping relationship; there's no need to register any nested classes within them. +[See example here](/java/fury-core/src/test/java/org/apache/fury/serializer/compatible/DifferentPOJOCompatibleSerializerTest.java) + ### Use wrong API for deserialization If you serialize an object by invoking `Fury#serialize`, you should invoke `Fury#deserialize` for deserialization diff --git a/java/fury-core/src/main/java/org/apache/fury/Fury.java b/java/fury-core/src/main/java/org/apache/fury/Fury.java index bb5b102a8a..7b364042df 100644 --- a/java/fury-core/src/main/java/org/apache/fury/Fury.java +++ b/java/fury-core/src/main/java/org/apache/fury/Fury.java @@ -1070,6 +1070,7 @@ public void serializeJavaObject(MemoryBuffer buffer, Object obj) { buffer.writeInt32(-1); // preserve 4-byte for meta start offsets. if (!refResolver.writeRefOrNull(buffer, obj)) { ClassInfo classInfo = classResolver.getOrUpdateClassInfo(obj.getClass()); + classResolver.writeClass(buffer, classInfo); writeData(buffer, classInfo, obj); MetaContext metaContext = serializationContext.getMetaContext(); if (metaContext != null && !metaContext.writingClassDefs.isEmpty()) { @@ -1119,7 +1120,13 @@ public T deserializeJavaObject(MemoryBuffer buffer, Class cls) { T obj; int nextReadRefId = refResolver.tryPreserveRefId(buffer); if (nextReadRefId >= NOT_NULL_VALUE_FLAG) { - obj = (T) readDataInternal(buffer, classResolver.getClassInfo(cls)); + ClassInfo classInfo; + if (shareMeta) { + classInfo = classResolver.readClassInfo(buffer); + } else { + classInfo = classResolver.getClassInfo(cls); + } + obj = (T) readDataInternal(buffer, classInfo); return obj; } else { return null; diff --git a/java/fury-core/src/test/java/org/apache/fury/serializer/EnumSerializerTest.java b/java/fury-core/src/test/java/org/apache/fury/serializer/EnumSerializerTest.java index 7a5c350d49..825aa56a04 100644 --- 
a/java/fury-core/src/test/java/org/apache/fury/serializer/EnumSerializerTest.java +++ b/java/fury-core/src/test/java/org/apache/fury/serializer/EnumSerializerTest.java @@ -140,8 +140,10 @@ public void testEnumSerializationAsString() { .build(); // serialize enum "B" + furySerialization.register(cls1); byte[] bytes = furySerialization.serializeJavaObject(cls1.getEnumConstants()[1]); + furyDeserialize.register(cls2); Object data = furyDeserialize.deserializeJavaObject(bytes, cls2); assertEquals(cls2.getEnumConstants()[0], data); } @@ -175,8 +177,10 @@ public void testEnumSerializationAsString_differentClass() { .build(); // serialize enum "B" + furySerialization.register(cls1); byte[] bytes = furySerialization.serializeJavaObject(cls1.getEnumConstants()[1]); + furyDeserialize.register(cls2); Object data = furyDeserialize.deserializeJavaObject(bytes, cls2); assertEquals(cls2.getEnumConstants()[0], data); } @@ -209,9 +213,11 @@ public void testEnumSerializationAsString_invalidEnum() { .withAsyncCompilation(false) .build(); + furySerialization.register(cls1); byte[] bytes = furySerialization.serializeJavaObject(cls1.getEnumConstants()[0]); try { + furyDeserialize.register(cls2); furyDeserialize.deserializeJavaObject(bytes, cls2); fail("expected to throw exception"); } catch (Exception e) { diff --git a/java/fury-core/src/test/java/org/apache/fury/serializer/compatible/DifferentPOJOCompatibleSerializerTest.java b/java/fury-core/src/test/java/org/apache/fury/serializer/compatible/DifferentPOJOCompatibleSerializerTest.java new file mode 100644 index 0000000000..3a7436d9c2 --- /dev/null +++ b/java/fury-core/src/test/java/org/apache/fury/serializer/compatible/DifferentPOJOCompatibleSerializerTest.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.fury.serializer.compatible; + +import org.apache.fury.Fury; +import org.apache.fury.config.CompatibleMode; +import org.apache.fury.config.Language; +import org.apache.fury.serializer.compatible.classes.ClassCompleteField; +import org.apache.fury.serializer.compatible.classes.ClassMissingField; +import org.testng.Assert; +import org.testng.annotations.Test; + +/** + * Test COMPATIBILITY mode that supports - same field type and name can be deserialized to other + * class with different name - scrambled field order to make sure it could handle different field + * order - missing or extra field from source class to target class - generic class + */ +public class DifferentPOJOCompatibleSerializerTest extends Assert { + + Fury getFury(Class... 
classes) { + Fury instance = + Fury.builder() + .withLanguage(Language.JAVA) + .withRefTracking(true) + .withCompatibleMode(CompatibleMode.COMPATIBLE) + .withMetaShare(true) + .withScopedMetaShare(true) + .requireClassRegistration(false) + .withAsyncCompilation(true) + .serializeEnumByName(true) + .build(); + if (classes != null) { + for (Class clazz : classes) { + instance.register(clazz); + } + } + ; + return instance; + } + + @Test + void testTargetHasLessFieldComparedToSourceClass() throws InterruptedException { + + ClassCompleteField subclass = new ClassCompleteField<>("subclass", "subclass2"); + ClassCompleteField> classCompleteField = + new ClassCompleteField<>(subclass, subclass); + byte[] serialized = getFury(ClassCompleteField.class).serializeJavaObject(classCompleteField); + ClassMissingField> classMissingField = + getFury(ClassMissingField.class).deserializeJavaObject(serialized, ClassMissingField.class); + + assertEq(classCompleteField, classMissingField); + } + + @Test + void testTargetHasMoreFieldComparedToSourceClass() throws InterruptedException { + + ClassMissingField subclass = new ClassMissingField<>("subclass"); + ClassMissingField classMissingField = new ClassMissingField(subclass); + byte[] serialized = getFury(ClassMissingField.class).serializeJavaObject(classMissingField); + + ClassCompleteField classCompleteField = + getFury(ClassCompleteField.class) + .deserializeJavaObject(serialized, ClassCompleteField.class); + + assertEq(classCompleteField, classMissingField); + } + + void assertEq(ClassCompleteField classCompleteField, ClassMissingField classMissingField) { + assertEqSubClass( + (ClassCompleteField) classCompleteField.getPrivateFieldSubClass(), + (ClassMissingField) classMissingField.getPrivateFieldSubClass()); + assertEquals(classCompleteField.getPrivateMap(), classMissingField.getPrivateMap()); + assertEquals(classCompleteField.getPrivateList(), classMissingField.getPrivateList()); + 
assertEquals(classCompleteField.getPrivateString(), classMissingField.getPrivateString()); + assertEquals(classCompleteField.getPrivateInt(), classMissingField.getPrivateInt()); + } + + void assertEqSubClass( + ClassCompleteField classCompleteField, ClassMissingField classMissingField) { + assertEquals( + classCompleteField.getPrivateFieldSubClass(), classMissingField.getPrivateFieldSubClass()); + assertEquals(classCompleteField.getPrivateMap(), classMissingField.getPrivateMap()); + assertEquals(classCompleteField.getPrivateList(), classMissingField.getPrivateList()); + assertEquals(classCompleteField.getPrivateString(), classMissingField.getPrivateString()); + assertEquals(classCompleteField.getPrivateInt(), classMissingField.getPrivateInt()); + } +} diff --git a/java/fury-core/src/test/java/org/apache/fury/serializer/compatible/classes/ClassCompleteField.java b/java/fury-core/src/test/java/org/apache/fury/serializer/compatible/classes/ClassCompleteField.java new file mode 100644 index 0000000000..c2b169c648 --- /dev/null +++ b/java/fury-core/src/test/java/org/apache/fury/serializer/compatible/classes/ClassCompleteField.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.fury.serializer.compatible.classes; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import lombok.EqualsAndHashCode; +import lombok.Getter; + +@EqualsAndHashCode +@Getter +public class ClassCompleteField { + private boolean privateBoolean = true; + private int privateInt = 10; + private String privateString = "notNull"; + private Map privateMap; + private List privateList; + private T privateFieldSubClass; + + private boolean privateBoolean2 = true; + private int privateInt2 = 10; + private String privateString2 = "notNull"; + private Map privateMap2; + private List privateList2; + private T privateFieldSubClass2; + + public ClassCompleteField(T privateFieldSubClass, T privateFieldSubClass2) { + privateMap = new HashMap<>(); + privateMap.put("a", "b"); + privateList = new ArrayList<>(); + privateList.add("a"); + + privateMap2 = new HashMap<>(); + privateMap2.put("a", "b"); + privateList2 = new ArrayList<>(); + privateList2.add("a"); + + this.privateFieldSubClass = privateFieldSubClass; + this.privateFieldSubClass2 = privateFieldSubClass2; + } +} diff --git a/java/fury-core/src/test/java/org/apache/fury/serializer/compatible/classes/ClassMissingField.java b/java/fury-core/src/test/java/org/apache/fury/serializer/compatible/classes/ClassMissingField.java new file mode 100644 index 0000000000..32e2a9e6a7 --- /dev/null +++ b/java/fury-core/src/test/java/org/apache/fury/serializer/compatible/classes/ClassMissingField.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.fury.serializer.compatible.classes; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import lombok.EqualsAndHashCode; +import lombok.Getter; + +@EqualsAndHashCode +@Getter +public class ClassMissingField { + private T privateFieldSubClass; + private List privateList; + private Map privateMap; + private String privateString = "missing"; + private int privateInt = 999; + private boolean privateBoolean = false; + + public ClassMissingField(T privateFieldSubClass) { + privateMap = new HashMap<>(); + privateMap.put("z", "x"); + privateList = new ArrayList<>(); + privateList.add("c"); + this.privateFieldSubClass = privateFieldSubClass; + } +} diff --git a/java/fury-core/src/test/java/org/apache/fury/serializer/compatible/classes/SubclassCompleteField.java b/java/fury-core/src/test/java/org/apache/fury/serializer/compatible/classes/SubclassCompleteField.java new file mode 100644 index 0000000000..da183502d1 --- /dev/null +++ b/java/fury-core/src/test/java/org/apache/fury/serializer/compatible/classes/SubclassCompleteField.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.fury.serializer.compatible.classes; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import lombok.EqualsAndHashCode; +import lombok.Getter; + +@Getter +@EqualsAndHashCode +public class SubclassCompleteField { + private boolean privateBoolean = true; + private int privateInt = 10; + private String privateString = "notNull"; + private Map privateMap; + private List privateList; + + private boolean privateBoolean2 = true; + private int privateInt2 = 10; + private String privateString2 = "notNull"; + private Map privateMap2; + private List privateList2; + + public SubclassCompleteField() { + privateMap = new HashMap<>(); + privateMap.put("a", "b"); + privateList = new ArrayList<>(); + privateList.add("a"); + + privateMap2 = new HashMap<>(); + privateMap2.put("a", "b"); + privateList2 = new ArrayList<>(); + privateList2.add("a"); + } +} diff --git a/java/fury-core/src/test/java/org/apache/fury/serializer/compatible/classes/SubclassMissingField.java b/java/fury-core/src/test/java/org/apache/fury/serializer/compatible/classes/SubclassMissingField.java new file mode 100644 index 0000000000..233b14cb74 --- /dev/null +++ b/java/fury-core/src/test/java/org/apache/fury/serializer/compatible/classes/SubclassMissingField.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.fury.serializer.compatible.classes; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import lombok.EqualsAndHashCode; +import lombok.Getter; + +@Getter +@EqualsAndHashCode +public class SubclassMissingField { + private boolean privateBoolean = true; + private int privateInt = 10; + private String privateString = "notNull"; + private Map privateMap; + private List privateList; + + public SubclassMissingField() { + privateMap = new HashMap<>(); + privateMap.put("a", "b"); + privateList = new ArrayList<>(); + privateList.add("a"); + } +} From 880c6e50e93889971eb9426515c19a68f78e9324 Mon Sep 17 00:00:00 2001 From: Shawn Yang Date: Wed, 8 Jan 2025 00:32:10 +0800 Subject: [PATCH 13/13] feat(python): support latin1/utf16 string encoding in python (#1997) ## What does this PR do? Support latin1/utf16 string encoding in Python. For UTF-16, since Python doesn't use surrogate pairs, this PR also adds a vectorized surrogate-pair check function. Note: - Python UCS-2 doesn't contain surrogate pairs, so we must check the UTF-16 data for surrogate pairs before constructing a string from the binary. This is different from Java/Node.js. ## Related issues Closes #1967 ## Does this PR introduce any user-facing change? - [ ] Does this PR introduce any public API change? - [ ] Does this PR introduce any binary protocol compatibility change?
## Benchmark --- cpp/fury/util/string_util.cc | 2 +- cpp/fury/util/string_util.h | 66 ++++++++++++- cpp/fury/util/string_util_test.cc | 21 +++- .../fury/serializer/ArraySerializers.java | 4 +- .../apache/fury/serializer/Serializers.java | 6 +- .../fury/serializer/StringSerializer.java | 97 ++++++------------- python/README.md | 2 +- python/pyfury/_fury.py | 15 +-- python/pyfury/_serialization.pyx | 9 +- python/pyfury/_util.pyx | 56 +++++++++-- python/pyfury/includes/libutil.pxd | 4 + python/pyfury/tests/test_serializer.py | 13 +++ 12 files changed, 191 insertions(+), 104 deletions(-) diff --git a/cpp/fury/util/string_util.cc b/cpp/fury/util/string_util.cc index 3c0543c913..3c28ac7949 100644 --- a/cpp/fury/util/string_util.cc +++ b/cpp/fury/util/string_util.cc @@ -623,4 +623,4 @@ std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian) { #endif -} // namespace fury \ No newline at end of file +} // namespace fury diff --git a/cpp/fury/util/string_util.h b/cpp/fury/util/string_util.h index d5bdde543a..35254eab2c 100644 --- a/cpp/fury/util/string_util.h +++ b/cpp/fury/util/string_util.h @@ -19,14 +19,78 @@ #pragma once +#include #include +// AVX not included here since some older intel cpu doesn't support avx2 +// but the built wheel for avx2 is same as sse2. 
+#if defined(__ARM_NEON) || defined(__ARM_NEON__) +#include +#define USE_NEON_SIMD +#elif defined(__SSE2__) +#include +#define USE_SSE2_SIMD +#endif namespace fury { bool isLatin(const std::string &str); +static inline bool hasSurrogatePairFallback(const uint16_t *data, size_t size) { + for (size_t i = 0; i < size; ++i) { + auto c = data[i]; + if (c >= 0xD800 && c <= 0xDFFF) { + return true; + } + } + return false; +} + +#if defined(USE_NEON_SIMD) +inline bool utf16HasSurrogatePairs(const uint16_t *data, size_t length) { + size_t i = 0; + uint16x8_t lower_bound = vdupq_n_u16(0xD800); + uint16x8_t higher_bound = vdupq_n_u16(0xDFFF); + for (; i + 7 < length; i += 8) { + uint16x8_t chunk = vld1q_u16(data + i); + uint16x8_t mask1 = vcgeq_u16(chunk, lower_bound); + uint16x8_t mask2 = vcleq_u16(chunk, higher_bound); + if (vmaxvq_u16(mask1 & mask2)) { + return true; // Detected a high surrogate + } + } + return hasSurrogatePairFallback(data + i, length - i); +} +#elif defined(USE_SSE2_SIMD) +inline bool utf16HasSurrogatePairs(const uint16_t *data, size_t length) { + size_t i = 0; + __m128i lower_bound = _mm_set1_epi16(0xd7ff); + __m128i higher_bound = _mm_set1_epi16(0xe000); + for (; i + 7 < length; i += 8) { + __m128i chunk = + _mm_loadu_si128(reinterpret_cast(data + i)); + __m128i cmp1 = _mm_cmpgt_epi16(chunk, lower_bound); + __m128i cmp2 = _mm_cmpgt_epi16(higher_bound, chunk); + if (_mm_movemask_epi8(_mm_and_si128(cmp1, cmp2)) != 0) { + return true; // Detected a surrogate + } + } + return hasSurrogatePairFallback(data + i, length - i); +} +#else +inline bool utf16HasSurrogatePairs(const uint16_t *data, size_t length) { + return hasSurrogatePairFallback(data, length); +} +#endif + +inline bool utf16HasSurrogatePairs(const std::u16string &str) { + // Get the data pointer + const std::uint16_t *data = + reinterpret_cast(str.data()); + return utf16HasSurrogatePairs(data, str.size()); +} + std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian); 
std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian); -} // namespace fury \ No newline at end of file +} // namespace fury diff --git a/cpp/fury/util/string_util_test.cc b/cpp/fury/util/string_util_test.cc index 60c72c80b3..5fe800ddaa 100644 --- a/cpp/fury/util/string_util_test.cc +++ b/cpp/fury/util/string_util_test.cc @@ -122,7 +122,24 @@ std::u16string generateRandomUTF16String(size_t length) { return str; } -// Basic implementation +TEST(StringUtilTest, TestUtf16HasSurrogatePairs) { + EXPECT_FALSE(utf16HasSurrogatePairs(std::u16string({0x99, 0x100}))); + std::u16string utf16 = {0xD83D, 0xDE00}; // 😀 emoji + EXPECT_TRUE(utf16HasSurrogatePairs(utf16)); + EXPECT_TRUE(utf16HasSurrogatePairs(generateRandomUTF16String(3) + u"性能好")); + EXPECT_TRUE( + utf16HasSurrogatePairs(generateRandomUTF16String(10) + u"性能好")); + EXPECT_TRUE( + utf16HasSurrogatePairs(generateRandomUTF16String(30) + u"性能好")); + EXPECT_TRUE( + utf16HasSurrogatePairs(generateRandomUTF16String(60) + u"性能好")); + EXPECT_TRUE( + utf16HasSurrogatePairs(generateRandomUTF16String(120) + u"性能好")); + EXPECT_TRUE( + utf16HasSurrogatePairs(generateRandomUTF16String(200) + u"性能好")); + EXPECT_TRUE( + utf16HasSurrogatePairs(generateRandomUTF16String(300) + u"性能好")); +} // Swap bytes to convert from big endian to little endian inline uint16_t swapBytes(uint16_t value) { @@ -542,4 +559,4 @@ TEST(UTF8ToUTF16Test, PerformanceTest) { int main(int argc, char **argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); -} \ No newline at end of file +} diff --git a/java/fury-core/src/main/java/org/apache/fury/serializer/ArraySerializers.java b/java/fury-core/src/main/java/org/apache/fury/serializer/ArraySerializers.java index 1d091894a1..d21de50995 100644 --- a/java/fury-core/src/main/java/org/apache/fury/serializer/ArraySerializers.java +++ b/java/fury-core/src/main/java/org/apache/fury/serializer/ArraySerializers.java @@ -682,7 +682,7 @@ public void xwrite(MemoryBuffer buffer, 
String[] value) { for (String elem : value) { if (elem != null) { buffer.writeByte(Fury.NOT_NULL_VALUE_FLAG); - stringSerializer.writeUTF8String(buffer, elem); + stringSerializer.writeString(buffer, elem); } else { buffer.writeByte(Fury.NULL_FLAG); } @@ -695,7 +695,7 @@ public String[] xread(MemoryBuffer buffer) { String[] value = new String[numElements]; for (int i = 0; i < numElements; i++) { if (buffer.readByte() >= Fury.NOT_NULL_VALUE_FLAG) { - value[i] = stringSerializer.readUTF8String(buffer); + value[i] = stringSerializer.readString(buffer); } else { value[i] = null; } diff --git a/java/fury-core/src/main/java/org/apache/fury/serializer/Serializers.java b/java/fury-core/src/main/java/org/apache/fury/serializer/Serializers.java index e5b01225c1..a57a072559 100644 --- a/java/fury-core/src/main/java/org/apache/fury/serializer/Serializers.java +++ b/java/fury-core/src/main/java/org/apache/fury/serializer/Serializers.java @@ -228,7 +228,7 @@ public AbstractStringBuilderSerializer(Fury fury, Class type) { @Override public void xwrite(MemoryBuffer buffer, T value) { - stringSerializer.writeUTF8String(buffer, value.toString()); + stringSerializer.writeString(buffer, value.toString()); } @Override @@ -276,7 +276,7 @@ public StringBuilder read(MemoryBuffer buffer) { @Override public StringBuilder xread(MemoryBuffer buffer) { - return new StringBuilder(stringSerializer.readUTF8String(buffer)); + return new StringBuilder(stringSerializer.readString(buffer)); } } @@ -299,7 +299,7 @@ public StringBuffer read(MemoryBuffer buffer) { @Override public StringBuffer xread(MemoryBuffer buffer) { - return new StringBuffer(stringSerializer.readUTF8String(buffer)); + return new StringBuffer(stringSerializer.readString(buffer)); } } diff --git a/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java b/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java index fd84c7ca3a..22f3eeca90 100644 --- 
a/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java +++ b/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java @@ -121,7 +121,7 @@ public void write(MemoryBuffer buffer, String value) { @Override public void xwrite(MemoryBuffer buffer, String value) { - writeUTF8String(buffer, value); + writeJavaString(buffer, value); } @Override @@ -131,68 +131,52 @@ public String read(MemoryBuffer buffer) { @Override public String xread(MemoryBuffer buffer) { - return readUTF8String(buffer); + return readJavaString(buffer); } public void writeString(MemoryBuffer buffer, String value) { - if (isJava) { - writeJavaString(buffer, value); - } else { - writeUTF8String(buffer, value); - } + writeJavaString(buffer, value); } public Expression writeStringExpr(Expression strSerializer, Expression buffer, Expression str) { - if (isJava) { - if (STRING_VALUE_FIELD_IS_BYTES) { - if (compressString) { - return new Invoke(strSerializer, "writeCompressedBytesString", buffer, str); - } else { - return new StaticInvoke(StringSerializer.class, "writeBytesString", buffer, str); - } + if (STRING_VALUE_FIELD_IS_BYTES) { + if (compressString) { + return new Invoke(strSerializer, "writeCompressedBytesString", buffer, str); } else { - if (!STRING_VALUE_FIELD_IS_CHARS) { - throw new UnsupportedOperationException(); - } - if (compressString) { - return new Invoke(strSerializer, "writeCompressedCharsString", buffer, str); - } else { - return new Invoke(strSerializer, "writeCharsString", buffer, str); - } + return new StaticInvoke(StringSerializer.class, "writeBytesString", buffer, str); } } else { - return new Invoke(strSerializer, "writeUTF8String", buffer, str); + if (!STRING_VALUE_FIELD_IS_CHARS) { + throw new UnsupportedOperationException(); + } + if (compressString) { + return new Invoke(strSerializer, "writeCompressedCharsString", buffer, str); + } else { + return new Invoke(strSerializer, "writeCharsString", buffer, str); + } } } public String 
readString(MemoryBuffer buffer) { - if (isJava) { - return readJavaString(buffer); - } else { - return readUTF8String(buffer); - } + return readJavaString(buffer); } public Expression readStringExpr(Expression strSerializer, Expression buffer) { - if (isJava) { - if (STRING_VALUE_FIELD_IS_BYTES) { - if (compressString) { - return new Invoke(strSerializer, "readCompressedBytesString", STRING_TYPE, buffer); - } else { - return new Invoke(strSerializer, "readBytesString", STRING_TYPE, buffer); - } + if (STRING_VALUE_FIELD_IS_BYTES) { + if (compressString) { + return new Invoke(strSerializer, "readCompressedBytesString", STRING_TYPE, buffer); } else { - if (!STRING_VALUE_FIELD_IS_CHARS) { - throw new UnsupportedOperationException(); - } - if (compressString) { - return new Invoke(strSerializer, "readCompressedCharsString", STRING_TYPE, buffer); - } else { - return new Invoke(strSerializer, "readCharsString", STRING_TYPE, buffer); - } + return new Invoke(strSerializer, "readBytesString", STRING_TYPE, buffer); } } else { - return new Invoke(strSerializer, "readUTF8String", STRING_TYPE, buffer); + if (!STRING_VALUE_FIELD_IS_CHARS) { + throw new UnsupportedOperationException(); + } + if (compressString) { + return new Invoke(strSerializer, "readCompressedCharsString", STRING_TYPE, buffer); + } else { + return new Invoke(strSerializer, "readCharsString", STRING_TYPE, buffer); + } } } @@ -275,13 +259,6 @@ public void writeJavaString(MemoryBuffer buffer, String value) { } } - @CodegenInvoke - public void writeUTF8String(MemoryBuffer buffer, String value) { - byte[] bytes = value.getBytes(StandardCharsets.UTF_8); - buffer.writeVarUint32(bytes.length); - buffer.writeBytes(bytes); - } - // Invoked by fury JIT public String readJavaString(MemoryBuffer buffer) { if (STRING_VALUE_FIELD_IS_BYTES) { @@ -367,24 +344,6 @@ public void writeCharsString(MemoryBuffer buffer, String value) { } } - @CodegenInvoke - public String readUTF8String(MemoryBuffer buffer) { - int numBytes = 
buffer.readVarUint32Small14(); - buffer.checkReadableBytes(numBytes); - final byte[] targetArray = buffer.getHeapMemory(); - if (targetArray != null) { - String str = - new String( - targetArray, buffer._unsafeHeapReaderIndex(), numBytes, StandardCharsets.UTF_8); - buffer.increaseReaderIndex(numBytes); - return str; - } else { - final byte[] tmpArray = getByteArray(numBytes); - buffer.readBytes(tmpArray, 0, numBytes); - return new String(tmpArray, 0, numBytes, StandardCharsets.UTF_8); - } - } - public char[] readCharsLatin1(MemoryBuffer buffer, int numBytes) { buffer.checkReadableBytes(numBytes); byte[] srcArray = buffer.getHeapMemory(); diff --git a/python/README.md b/python/README.md index d06a9a8c77..4f3859d8e0 100644 --- a/python/README.md +++ b/python/README.md @@ -12,7 +12,7 @@ pip install -v -e . ### Environment Requirements -- python 3.6+ +- python 3.8+ ## Testing diff --git a/python/pyfury/_fury.py b/python/pyfury/_fury.py index 21ca687b90..d97c6230c2 100644 --- a/python/pyfury/_fury.py +++ b/python/pyfury/_fury.py @@ -156,9 +156,10 @@ def __init__( stacklevel=2, ) self.pickler = Pickler(self.buffer) + self.unpickler = Unpickler(self.buffer) else: - self.pickler = _PicklerStub(self.buffer) - self.unpickler = None + self.pickler = _PicklerStub() + self.unpickler = _UnpicklerStub() self._buffer_callback = None self._buffers = None self._unsupported_callback = None @@ -334,10 +335,6 @@ def _deserialize( ): if type(buffer) == bytes: buffer = Buffer(buffer) - if self.require_class_registration: - self.unpickler = _UnpicklerStub(buffer) - else: - self.unpickler = Unpickler(buffer) if unsupported_objects is not None: self._unsupported_objects = iter(unsupported_objects) if self.language == Language.XLANG: @@ -527,9 +524,6 @@ def reset(self): class _PicklerStub: - def __init__(self, buf): - self.buf = buf - def dump(self, o): raise ValueError( f"Class {type(o)} is not registered, " @@ -542,9 +536,6 @@ def clear_memo(self): class _UnpicklerStub: - def 
__init__(self, buf): - self.buf = buf - def load(self): raise ValueError( "pickle is not allowed when class registration enabled, Please register" diff --git a/python/pyfury/_serialization.pyx b/python/pyfury/_serialization.pyx index 86abdf0210..0da4ee9ae2 100644 --- a/python/pyfury/_serialization.pyx +++ b/python/pyfury/_serialization.pyx @@ -643,7 +643,8 @@ cdef class Fury: ) self.pickler = Pickler(self.buffer) else: - self.pickler = _PicklerStub(self.buffer) + self.pickler = _PicklerStub() + self.unpickler = _UnpicklerStub() self.unpickler = None self._buffer_callback = None self._buffers = None @@ -815,9 +816,7 @@ cdef class Fury: cpdef inline _deserialize( self, Buffer buffer, buffers=None, unsupported_objects=None): - if self.require_class_registration: - self.unpickler = _UnpicklerStub(buffer) - else: + if not self.require_class_registration: self.unpickler = Unpickler(buffer) if unsupported_objects is not None: self._unsupported_objects = iter(unsupported_objects) @@ -955,6 +954,8 @@ cdef class Fury: cpdef inline handle_unsupported_read(self, Buffer buffer): cdef c_bool in_band = buffer.read_bool() if in_band: + if self.unpickler is None: + self.unpickler.buffer = Unpickler(buffer) return self.unpickler.load() else: assert self._unsupported_objects is not None diff --git a/python/pyfury/_util.pyx b/python/pyfury/_util.pyx index 9baaca5c04..ca87d81e0c 100644 --- a/python/pyfury/_util.pyx +++ b/python/pyfury/_util.pyx @@ -22,14 +22,16 @@ cimport cython from cpython cimport * +from cpython.unicode cimport * from libcpp.memory cimport shared_ptr, make_shared from libc.stdint cimport * from libcpp cimport bool as c_bool from pyfury.includes.libutil cimport( - CBuffer, AllocateBuffer, GetBit, SetBit, ClearBit, SetBitTo, CStatus, StatusCode + CBuffer, AllocateBuffer, GetBit, SetBit, ClearBit, SetBitTo, CStatus, StatusCode, utf16HasSurrogatePairs ) cdef int32_t max_buffer_size = 2 ** 31 - 1 +cdef int UTF16_LE = -1 @cython.final @@ -149,7 +151,6 @@ cdef class 
Buffer: cpdef inline check_bound(self, int32_t offset, int32_t length): cdef int32_t size_ = self.c_buffer.get().size() - # if offset + length > size_: if offset | length | (offset + length) | (size_- (offset + length)) < 0: raise ValueError(f"Address range {offset, offset + length} " f"out of bound {0, size_}") @@ -543,15 +544,52 @@ cdef class Buffer: return length cpdef inline write_string(self, str value): - cdef Py_ssize_t length - cdef const char * buf = PyUnicode_AsUTF8AndSize(value, &length) - self.write_c_buffer(buf, length) + cdef Py_ssize_t length = PyUnicode_GET_LENGTH(value) + cdef int32_t kind = PyUnicode_KIND(value) + # Note: buffer will be native endian for PyUnicode_2BYTE_KIND + cdef void* buffer = PyUnicode_DATA(value) + cdef uint64_t header = 0 + cdef int32_t buffer_size + if kind == PyUnicode_1BYTE_KIND: + buffer_size = length + header = (length << 2) | 0 + elif kind == PyUnicode_2BYTE_KIND: + buffer_size = length << 1 + header = (length << 3) | 1 + else: + buffer = (PyUnicode_AsUTF8AndSize(value, &length)) + buffer_size = length + header = (buffer_size << 2) | 2 + self.write_varuint64(header) + if buffer_size == 0: # access an emtpy buffer may raise out-of-bound exception. 
+ return + self.grow(buffer_size) + self.check_bound(self.writer_index, buffer_size) + self.c_buffer.get().CopyFrom(self.writer_index, buffer, 0, buffer_size) + self.writer_index += buffer_size cpdef inline str read_string(self): - cdef uint8_t* buf - cdef int32_t length = self.read_c_buffer(&buf) - str_obj = PyUnicode_DecodeUTF8(buf, length, "strict") - return str_obj + cdef uint64_t header = self.read_varuint64() + cdef uint32_t size = header >> 2 + self.check_bound(self.reader_index, size) + cdef const char * buf = (self.c_buffer.get().data() + self.reader_index) + self.reader_index += size + cdef uint32_t encoding = header & 0b11 + if encoding == 0: + # PyUnicode_FromASCII + return PyUnicode_DecodeLatin1(buf, size, "strict") + elif encoding == 1: + if utf16HasSurrogatePairs(buf, size >> 1): + return PyUnicode_DecodeUTF16( + buf, + size, # len of string in bytes + NULL, # special error handling options, we don't need any + &UTF16_LE, # fury use little-endian + ) + else: + return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, buf, size >> 1) + else: + return PyUnicode_DecodeUTF8(buf, size, "strict") def __len__(self): return self._c_size diff --git a/python/pyfury/includes/libutil.pxd b/python/pyfury/includes/libutil.pxd index 49f4c71521..72a640033d 100644 --- a/python/pyfury/includes/libutil.pxd +++ b/python/pyfury/includes/libutil.pxd @@ -107,3 +107,7 @@ cdef extern from "fury/util/bit_util.h" namespace "fury::util" nogil: void SetBitTo(uint8_t *bits, int64_t i, c_bool bit_is_set) c_string hex(uint8_t *data, int32_t length) + + +cdef extern from "fury/util/string_util.h" namespace "fury" nogil: + c_bool utf16HasSurrogatePairs(uint16_t* data, size_t size) diff --git a/python/pyfury/tests/test_serializer.py b/python/pyfury/tests/test_serializer.py index 361639a6a3..ec6d399d4a 100644 --- a/python/pyfury/tests/test_serializer.py +++ b/python/pyfury/tests/test_serializer.py @@ -61,6 +61,15 @@ def test_tuple(): assert ser_de(fury, (-1.0, 2)) == (-1.0, 2) +def 
test_string(): + fury = Fury(language=Language.PYTHON, ref_tracking=True) + assert ser_de(fury, "hello") == "hello" + assert ser_de(fury, "hello,世界") == "hello,世界" + assert ser_de(fury, "hello,世界" * 10) == "hello,世界" * 10 + assert ser_de(fury, "hello,😀") == "hello,😀" + assert ser_de(fury, "hello,😀" * 10) == "hello,😀" * 10 + + def test_dict(): fury = Fury(language=Language.PYTHON, ref_tracking=True) assert ser_de(fury, {1: 2}) == {1: 2} @@ -544,3 +553,7 @@ def func(x): df = pd.DataFrame({"a": list(range(10))}) df_sum = fury.deserialize(fury.serialize(df.sum)) assert df_sum().equals(df.sum()) + + +if __name__ == "__main__": + test_string()