diff --git a/flow/record/adapter/elastic.py b/flow/record/adapter/elastic.py index f5469e3..e1163c7 100644 --- a/flow/record/adapter/elastic.py +++ b/flow/record/adapter/elastic.py @@ -106,7 +106,7 @@ def record_to_document(self, record: Record, index: str) -> dict: } if self.hash_record: - document["_id"] = hashlib.md5(document["_source"].encode()).hexdigest() + document["_id"] = hashlib.md5(document["_source"].encode(errors="surrogateescape")).hexdigest() return document diff --git a/flow/record/adapter/line.py b/flow/record/adapter/line.py index 3765fb0..28a7697 100644 --- a/flow/record/adapter/line.py +++ b/flow/record/adapter/line.py @@ -69,7 +69,7 @@ def write(self, rec: Record) -> None: for key, value in rdict.items(): if rdict_types: key = f"{key} ({rdict_types[key]})" - self.fp.write(fmt.format(key, value).encode()) + self.fp.write(fmt.format(key, value).encode(errors="surrogateescape")) def flush(self) -> None: if self.fp: diff --git a/flow/record/adapter/sqlite.py b/flow/record/adapter/sqlite.py index 2e308d2..7fb4b82 100644 --- a/flow/record/adapter/sqlite.py +++ b/flow/record/adapter/sqlite.py @@ -187,7 +187,7 @@ def read_table(self, table_name: str) -> Iterator[Record]: if value == 0: row[idx] = None elif isinstance(value, str): - row[idx] = value.encode("utf-8") + row[idx] = value.encode(errors="surrogateescape") yield descriptor_cls.init_from_dict(dict(zip(fnames, row))) def __iter__(self) -> Iterator[Record]: diff --git a/flow/record/adapter/text.py b/flow/record/adapter/text.py index 0b36d8a..0d4b359 100644 --- a/flow/record/adapter/text.py +++ b/flow/record/adapter/text.py @@ -41,7 +41,7 @@ def write(self, rec): buf = self.format_spec.format_map(DefaultMissing(rec._asdict())) else: buf = repr(rec) - self.fp.write(buf.encode() + b"\n") + self.fp.write(buf.encode(errors="surrogateescape") + b"\n") # because stdout is usually line buffered we force flush here if wanted if self.auto_flush: diff --git a/flow/record/adapter/xlsx.py b/flow/record/adapter/xlsx.py index a9bf8c7..8742d45 100644 --- a/flow/record/adapter/xlsx.py +++ b/flow/record/adapter/xlsx.py @@ -36,7 +36,7 @@ def sanitize_fieldvalues(values: Iterator[Any]) -> Iterator[Any]: elif isinstance(value, bytes): base64_encode = False try: - new_value = 'b"' + value.decode() + '"' + new_value = 'b"' + value.decode(errors="surrogateescape") + '"' if ILLEGAL_CHARACTERS_RE.search(new_value): base64_encode = True else: @@ -142,7 +142,7 @@ def __iter__(self): if field_types[idx] == "bytes": if value[1] == '"': # If so, we know this is b"" # Cut of the b" at the start and the trailing " - value = value[2:-1].encode() + value = value[2:-1].encode(errors="surrogateescape") else: # If not, we know it is base64 encoded (so we cut of the starting 'base64:') value = b64decode(value[7:]) diff --git a/flow/record/base.py b/flow/record/base.py index 43de77b..2f915cb 100644 --- a/flow/record/base.py +++ b/flow/record/base.py @@ -61,7 +61,7 @@ from collections import OrderedDict -from .utils import to_native_str, to_str +from .utils import to_str from .whitelist import WHITELIST, WHITELIST_TREE log = logging.getLogger(__package__) @@ -513,7 +513,7 @@ def __init__(self, name: str, fields: Optional[Sequence[tuple[str, str]]] = None name, fields = parse_def(name) self.name = name - self._field_tuples = tuple([(to_native_str(k), to_str(v)) for k, v in fields]) + self._field_tuples = tuple([(to_str(k), to_str(v)) for k, v in fields]) self.recordType = _generate_record_class(name, self._field_tuples) self.recordType._desc = self diff --git a/flow/record/fieldtypes/__init__.py b/flow/record/fieldtypes/__init__.py index 9f1bf68..2a02077 100644 --- a/flow/record/fieldtypes/__init__.py +++ b/flow/record/fieldtypes/__init__.py @@ -28,7 +28,6 @@ from flow.record.base import FieldType RE_NORMALIZE_PATH = re.compile(r"[\\/]+") -NATIVE_UNICODE = isinstance("", str) UTC = timezone.utc @@ -207,10 +206,7 @@ def _pack(self): class string(string_type, FieldType): def __new__(cls, value): if isinstance(value, bytes_type): - value = cls._decode(value, "utf-8") - if isinstance(value, bytes_type): - # Still bytes, so decoding failed (Python 2) - return bytes(value) + value = value.decode(errors="surrogateescape") return super().__new__(cls, value) def _pack(self): @@ -221,27 +217,6 @@ def __format__(self, spec): return defang(self) return str.__format__(self, spec) - @classmethod - def _decode(cls, data, encoding): - """Decode a byte-string into a unicode-string. - - Python 3: When `data` contains invalid unicode characters a `UnicodeDecodeError` is raised. - Python 2: When `data` contains invalid unicode characters the original byte-string is returned. - """ - if NATIVE_UNICODE: - # Raises exception on decode error - return data.decode(encoding) - try: - return data.decode(encoding) - except UnicodeDecodeError: - # Fallback to bytes (Python 2 only) - preview = data[:16].encode("hex_codec") + (".." if len(data) > 16 else "") - warnings.warn( - "Got binary data in string field (hex: {}). Compatibility is not guaranteed.".format(preview), - RuntimeWarning, - ) - return data - # Alias for backwards compatibility wstring = string @@ -278,7 +253,7 @@ def __new__(cls, *args, **kwargs): if len(args) == 1 and not kwargs: arg = args[0] if isinstance(arg, bytes_type): - arg = arg.decode("utf-8") + arg = arg.decode(errors="surrogateescape") if isinstance(arg, string_type): # If we are on Python 3.11 or newer, we can use fromisoformat() to parse the string (fast path) # diff --git a/flow/record/fieldtypes/net/ipv4.py b/flow/record/fieldtypes/net/ipv4.py index c212732..86efb22 100644 --- a/flow/record/fieldtypes/net/ipv4.py +++ b/flow/record/fieldtypes/net/ipv4.py @@ -3,7 +3,6 @@ import warnings from flow.record import FieldType -from flow.record.utils import to_native_str def addr_long(s): @@ -45,9 +44,6 @@ def __init__(self, addr, netmask=None): DeprecationWarning, stacklevel=5, ) - if isinstance(addr, type("")): - addr = to_native_str(addr) - if not isinstance(addr, str): raise TypeError("Subnet() argument 1 must be string, not {}".format(type(addr).__name__)) @@ -67,9 +63,6 @@ def __contains__(self, addr): if addr is None: return False - if isinstance(addr, type("")): - addr = to_native_str(addr) - if isinstance(addr, str): addr = addr_long(addr) diff --git a/flow/record/jsonpacker.py b/flow/record/jsonpacker.py index 96c646e..06264c8 100644 --- a/flow/record/jsonpacker.py +++ b/flow/record/jsonpacker.py @@ -47,12 +47,8 @@ def pack_obj(self, obj): serial["_recorddescriptor"] = obj._desc.identifier for field_type, field_name in obj._desc.get_field_tuples(): - # PYTHON2: Because "bytes" are also "str" we have to handle this here - if field_type == "bytes" and isinstance(serial[field_name], str): - serial[field_name] = base64.b64encode(serial[field_name]).decode() - # Boolean field types should be cast to a bool instead of staying ints - elif field_type == "boolean" and isinstance(serial[field_name], int): + if field_type == "boolean" and isinstance(serial[field_name], int): serial[field_name] = bool(serial[field_name]) return serial diff --git a/flow/record/utils.py b/flow/record/utils.py index a359139..a1c5cf4 100644 --- a/flow/record/utils.py +++ b/flow/record/utils.py @@ -3,13 +3,10 @@ import base64 import os import sys +import warnings from functools import wraps from typing import BinaryIO, TextIO -_native = str -_unicode = type("") -_bytes = type(b"") - def get_stdout(binary: bool = False) -> TextIO | BinaryIO: """Return the stdout stream as binary or text stream. @@ -50,33 +47,32 @@ def is_stdout(fp: TextIO | BinaryIO) -> bool: def to_bytes(value): """Convert a value to a byte string.""" - if value is None or isinstance(value, _bytes): + if value is None or isinstance(value, bytes): return value - if isinstance(value, _unicode): - return value.encode("utf-8") - return _bytes(value) + if isinstance(value, str): + return value.encode(errors="surrogateescape") + return bytes(value) def to_str(value): """Convert a value to a unicode string.""" - if value is None or isinstance(value, _unicode): + if value is None or isinstance(value, str): return value - if isinstance(value, _bytes): - return value.decode("utf-8") - return _unicode(value) + if isinstance(value, bytes): + return value.decode(errors="surrogateescape") + return str(value) def to_native_str(value): - """Convert a value to a native `str`.""" - if value is None or isinstance(value, _native): - return value - if isinstance(value, _unicode): - # Python 2: unicode -> str - return value.encode("utf-8") - if isinstance(value, _bytes): - # Python 3: bytes -> str - return value.decode("utf-8") - return _native(value) + warnings.warn( + ( + "The to_native_str() function is deprecated, " + "this function will be removed in flow.record 3.20, " + "use to_str() instead" + ), + DeprecationWarning, + ) + return to_str(value) def to_base64(value): diff --git a/tests/test_adapter_line.py b/tests/test_adapter_line.py new file mode 100644 index 0000000..bfa641a --- /dev/null +++ b/tests/test_adapter_line.py @@ -0,0 +1,29 @@ +from io import BytesIO + +from flow.record import RecordDescriptor +from flow.record.adapter.line import LineWriter + + +def test_line_writer_write_surrogateescape(): + output = BytesIO() + + lw = LineWriter( + path=output, + fields="name", + ) + + TestRecord = RecordDescriptor( + "test/string", + [ + ("string", "name"), + ], + ) + + # construct from 'bytes' but with invalid unicode bytes + record = TestRecord(b"R\xc3\xa9\xeamy") + lw.write(record) + + output.seek(0) + data = output.read() + + assert data == b"--[ RECORD 1 ]--\nname = R\xc3\xa9\xeamy\n" diff --git a/tests/test_adapter_text.py b/tests/test_adapter_text.py new file mode 100644 index 0000000..5dd6ae4 --- /dev/null +++ b/tests/test_adapter_text.py @@ -0,0 +1,28 @@ +from io import BytesIO + +from flow.record import RecordDescriptor +from flow.record.adapter.text import TextWriter + + +def test_text_writer_write_surrogateescape(): + output = BytesIO() + + tw = TextWriter( + path=output, + ) + + TestRecord = RecordDescriptor( + "test/string", + [ + ("string", "name"), + ], + ) + + # construct from 'bytes' but with invalid unicode bytes + record = TestRecord(b"R\xc3\xa9\xeamy") + tw.write(record) + + output.seek(0) + data = output.read() + + assert data == b"\n" diff --git a/tests/test_fieldtypes.py b/tests/test_fieldtypes.py index 81bd2aa..bb28758 100644 --- a/tests/test_fieldtypes.py +++ b/tests/test_fieldtypes.py @@ -213,15 +213,8 @@ def test_string(): assert r.name == "Rémy" # construct from 'bytes' but with invalid unicode bytes - if isinstance("", str): - # Python 3 - with pytest.raises(UnicodeDecodeError): - TestRecord(b"R\xc3\xa9\xeamy") - else: - # Python 2 - with pytest.warns(RuntimeWarning): - r = TestRecord(b"R\xc3\xa9\xeamy") - assert r.name + r = TestRecord(b"R\xc3\xa9\xeamy") + assert r.name == "Ré\udceamy" def test_wstring(): diff --git a/tests/test_json_packer.py b/tests/test_json_packer.py index 2ea2883..acd2edb 100644 --- a/tests/test_json_packer.py +++ b/tests/test_json_packer.py @@ -90,3 +90,23 @@ def test_record_pack_bool_regression() -> None: # pack the json string back to a record and make sure it is the same as before assert packer.unpack(data) == record + + +def test_record_pack_surrogateescape() -> None: + TestRecord = RecordDescriptor( + "test/string", + [ + ("string", "name"), + ], + ) + + record = TestRecord(b"R\xc3\xa9\xeamy") + packer = JsonRecordPacker() + + data = packer.pack(record) + + # pack to json string and check if the 3rd and 4th byte are properly surrogate escaped + assert data.startswith('{"name": "R\\u00e9\\udceamy",') + + # pack the json string back to a record and make sure it is the same as before + assert packer.unpack(data) == record diff --git a/tests/test_record.py b/tests/test_record.py index f6b883b..e9d11a0 100644 --- a/tests/test_record.py +++ b/tests/test_record.py @@ -1,4 +1,5 @@ import importlib +import inspect import os import sys from unittest.mock import patch @@ -27,8 +28,6 @@ from flow.record.exceptions import RecordDescriptorError from flow.record.stream import RecordFieldRewriter -from . import utils_inspect as inspect - def test_record_creation(): TestRecord = RecordDescriptor( @@ -288,8 +287,30 @@ def isatty(): writer.write(record) out, err = capsys.readouterr() - modifier = "" if isinstance("", str) else "u" - expected = "\n".format(u=modifier) + expected = "\n" + assert out == expected + + +def test_record_printer_stdout_surrogateescape(capsys): + Record = RecordDescriptor( + "test/a", + [ + ("string", "name"), + ], + ) + record = Record(b"R\xc3\xa9\xeamy") + + # fake capsys to be a tty. + def isatty(): + return True + + capsys._capture.out.tmpfile.isatty = isatty + + writer = RecordPrinter(getattr(sys.stdout, "buffer", sys.stdout)) + writer.write(record) + + out, err = capsys.readouterr() + expected = "\n" assert out == expected diff --git a/tests/utils_inspect.py b/tests/utils_inspect.py deleted file mode 100644 index 0b84678..0000000 --- a/tests/utils_inspect.py +++ /dev/null @@ -1,58 +0,0 @@ -""" -Backport of `inspect.signature` for Python 2. - -Based on: https://github.com/python/cpython/blob/3.7/Lib/inspect.py -""" - -import collections -import inspect - - -class _empty: - pass - - -class Parameter: - POSITIONAL_ONLY = 0 - POSITIONAL_OR_KEYWORD = 1 - VAR_POSITIONAL = 2 - KEYWORD_ONLY = 3 - VAR_KEYWORD = 4 - - empty = _empty - - def __init__(self, name, kind, default=_empty): - self.name = name - self.kind = kind - self.default = default - - -class Signature: - empty = _empty - - def __init__(self, parameters=None): - self.parameters = parameters - - -def signature(obj): - try: - # Python 3 - return inspect.signature(obj) - except AttributeError: - # Python 2 - spec = inspect.getargspec(obj) - - # Create parameter objects which are compatible with python 3 objects - parameters = collections.OrderedDict() - for i in range(0, len(spec.args)): - arg = spec.args[i] - default = _empty - if spec.defaults and (len(spec.args) - i <= len(spec.defaults)): - default = spec.defaults[i - len(spec.args)] - parameters[arg] = Parameter(name=arg, default=default, kind=Parameter.POSITIONAL_OR_KEYWORD) - if spec.varargs: - parameters[spec.varargs] = Parameter(name=spec.varargs, kind=Parameter.VAR_POSITIONAL) - if spec.keywords: - parameters[spec.keywords] = Parameter(name=spec.keywords, kind=Parameter.VAR_KEYWORD) - - return Signature(parameters=parameters)