From 6c26d5c2bc99427633a76da704efa35acdf20ace Mon Sep 17 00:00:00 2001 From: Alexander Ivanov <11728682+nahk-ivanov@users.noreply.github.com> Date: Wed, 23 Oct 2024 20:35:27 -0700 Subject: [PATCH 1/2] Fix JSON serialization for UTF-32 characters. When serializing the data in JSON-compatible form, 4-byte UTF-32 characters need to be split into two 2-byte UTF-16 code units (a surrogate pair). This change fixes that by introducing a new emitter setting `UseUtf16SurrogatePairs`, which is set when a JSON-compatible builder is requested. --- .../Serialization/SerializationTests.cs | 9 ++++++ YamlDotNet/Core/Emitter.cs | 18 +++++++++-- YamlDotNet/Core/EmitterSettings.cs | 32 ++++++++++++++++++- YamlDotNet/Serialization/SerializerBuilder.cs | 3 +- 4 files changed, 58 insertions(+), 4 deletions(-) diff --git a/YamlDotNet.Test/Serialization/SerializationTests.cs b/YamlDotNet.Test/Serialization/SerializationTests.cs index 7bff67b1..1088f0ad 100644 --- a/YamlDotNet.Test/Serialization/SerializationTests.cs +++ b/YamlDotNet.Test/Serialization/SerializationTests.cs @@ -886,6 +886,15 @@ public void SerializationOfAnchorWorksInJson() .BeEquivalentTo(@"{""x"": {""z"": {""v"": ""1""}}, ""y"": {""k"": {""z"": {""v"": ""1""}}}}"); } + [Fact] + public void SerializationOfUtf32WorksInJson() + { + var obj = new { TestProperty = "Sea life \U0001F99E" }; + + SerializerBuilder.JsonCompatible().Build().Serialize(obj).Trim().Should() + .Be(@"{""TestProperty"": ""Sea life \uD83E\uDD9E""}"); + } + [Fact] // Todo: this is actually roundtrip public void DeserializationOfDefaultsWorkInJson() diff --git a/YamlDotNet/Core/Emitter.cs b/YamlDotNet/Core/Emitter.cs index 01824d10..7931c8a0 100644 --- a/YamlDotNet/Core/Emitter.cs +++ b/YamlDotNet/Core/Emitter.cs @@ -66,6 +66,7 @@ public class Emitter : IEmitter private bool isWhitespace; private bool isIndentation; private readonly bool forceIndentLess; + private readonly bool useUtf16SurrogatePair; private readonly string newLine; private bool isDocumentEndWritten; @@ -148,6 +149,7 
@@ public Emitter(TextWriter output, EmitterSettings settings) this.maxSimpleKeyLength = settings.MaxSimpleKeyLength; this.skipAnchorName = settings.SkipAnchorName; this.forceIndentLess = !settings.IndentSequences; + this.useUtf16SurrogatePair = settings.UseUtf16SurrogatePairs; this.newLine = settings.NewLine; this.output = output; @@ -1189,8 +1191,20 @@ private void WriteDoubleQuotedScalar(string value, bool allowBreaks) { if (index + 1 < value.Length && IsLowSurrogate(value[index + 1])) { - Write('U'); - Write(char.ConvertToUtf32(character, value[index + 1]).ToString("X08", CultureInfo.InvariantCulture)); + if (useUtf16SurrogatePair) + { + Write('u'); + Write(code.ToString("X04", CultureInfo.InvariantCulture)); + Write('\\'); + Write('u'); + Write(((ushort)value[index + 1]).ToString("X04", CultureInfo.InvariantCulture)); + } + else + { + Write('U'); + Write(char.ConvertToUtf32(character, value[index + 1]).ToString("X08", CultureInfo.InvariantCulture)); + } + index++; } else diff --git a/YamlDotNet/Core/EmitterSettings.cs b/YamlDotNet/Core/EmitterSettings.cs index cf44f15c..4cf5feec 100644 --- a/YamlDotNet/Core/EmitterSettings.cs +++ b/YamlDotNet/Core/EmitterSettings.cs @@ -63,13 +63,22 @@ public sealed class EmitterSettings /// </summary> public bool IndentSequences { get; } + /// <summary> + /// If true, then 4-byte UTF-32 characters are escaped as two 2-byte UTF-16 code units (a surrogate pair). + /// </summary> + /// <remarks> + /// This ensures compatibility with the JSON format, as it does not allow '\Uxxxxxxxx' + /// and instead expects two escaped 2-byte characters '\uxxxx\uxxxx'. + /// </remarks> + public bool UseUtf16SurrogatePairs { get; } + public static readonly EmitterSettings Default = new EmitterSettings(); public EmitterSettings() { } - public EmitterSettings(int bestIndent, int bestWidth, bool isCanonical, int maxSimpleKeyLength, bool skipAnchorName = false, bool indentSequences = false, string? 
newLine = null) + public EmitterSettings(int bestIndent, int bestWidth, bool isCanonical, int maxSimpleKeyLength, bool skipAnchorName = false, bool indentSequences = false, bool useUtf16SurrogatePairs = false, string? newLine = null) { if (bestIndent < 2 || bestIndent > 9) { @@ -92,6 +101,7 @@ public EmitterSettings(int bestIndent, int bestWidth, bool isCanonical, int maxS MaxSimpleKeyLength = maxSimpleKeyLength; SkipAnchorName = skipAnchorName; IndentSequences = indentSequences; + UseUtf16SurrogatePairs = useUtf16SurrogatePairs; NewLine = newLine ?? Environment.NewLine; } @@ -104,6 +114,7 @@ public EmitterSettings WithBestIndent(int bestIndent) MaxSimpleKeyLength, SkipAnchorName, IndentSequences, + UseUtf16SurrogatePairs, NewLine ); } @@ -117,6 +128,7 @@ public EmitterSettings WithBestWidth(int bestWidth) MaxSimpleKeyLength, SkipAnchorName, IndentSequences, + UseUtf16SurrogatePairs, NewLine ); } @@ -130,6 +142,7 @@ public EmitterSettings WithMaxSimpleKeyLength(int maxSimpleKeyLength) maxSimpleKeyLength, SkipAnchorName, IndentSequences, + UseUtf16SurrogatePairs, NewLine ); } @@ -143,6 +156,7 @@ public EmitterSettings WithNewLine(string newLine) MaxSimpleKeyLength, SkipAnchorName, IndentSequences, + UseUtf16SurrogatePairs, newLine ); } @@ -167,6 +181,7 @@ public EmitterSettings WithoutAnchorName() MaxSimpleKeyLength, true, IndentSequences, + UseUtf16SurrogatePairs, NewLine ); } @@ -180,6 +195,21 @@ public EmitterSettings WithIndentedSequences() MaxSimpleKeyLength, SkipAnchorName, true, + UseUtf16SurrogatePairs, + NewLine + ); + } + + public EmitterSettings WithUtf16SurrogatePairs() + { + return new EmitterSettings( + BestIndent, + BestWidth, + IsCanonical, + MaxSimpleKeyLength, + SkipAnchorName, + IndentSequences, + true, NewLine ); } diff --git a/YamlDotNet/Serialization/SerializerBuilder.cs b/YamlDotNet/Serialization/SerializerBuilder.cs index bae3896d..ec9a7feb 100755 --- a/YamlDotNet/Serialization/SerializerBuilder.cs +++ 
b/YamlDotNet/Serialization/SerializerBuilder.cs @@ -366,7 +366,8 @@ public SerializerBuilder JsonCompatible() { this.emitterSettings = this.emitterSettings .WithMaxSimpleKeyLength(int.MaxValue) - .WithoutAnchorName(); + .WithoutAnchorName() + .WithUtf16SurrogatePairs(); return this .WithTypeConverter(new GuidConverter(true), w => w.InsteadOf<GuidConverter>()) From 733363547f72cef8e45f419e8cc1c25279e0640a Mon Sep 17 00:00:00 2001 From: Alexander Ivanov <11728682+nahk-ivanov@users.noreply.github.com> Date: Thu, 24 Oct 2024 09:01:42 -0700 Subject: [PATCH 2/2] Address review comments Move `useUtf16SurrogatePairs` to the end of the `EmitterSettings` constructor parameter list (after `newLine`) and enable `WithUtf16SurrogatePairs()` in `StaticSerializerBuilder.JsonCompatible()`. --- YamlDotNet/Core/EmitterSettings.cs | 32 +++++++++---------- .../Serialization/StaticSerializerBuilder.cs | 3 +- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/YamlDotNet/Core/EmitterSettings.cs b/YamlDotNet/Core/EmitterSettings.cs index 4cf5feec..57ee975d 100644 --- a/YamlDotNet/Core/EmitterSettings.cs +++ b/YamlDotNet/Core/EmitterSettings.cs @@ -78,7 +78,7 @@ public EmitterSettings() { } - public EmitterSettings(int bestIndent, int bestWidth, bool isCanonical, int maxSimpleKeyLength, bool skipAnchorName = false, bool indentSequences = false, bool useUtf16SurrogatePairs = false, string? newLine = null) + public EmitterSettings(int bestIndent, int bestWidth, bool isCanonical, int maxSimpleKeyLength, bool skipAnchorName = false, bool indentSequences = false, string? newLine = null, bool useUtf16SurrogatePairs = false) { if (bestIndent < 2 || bestIndent > 9) { @@ -101,8 +101,8 @@ public EmitterSettings(int bestIndent, int bestWidth, bool isCanonical, int maxS MaxSimpleKeyLength = maxSimpleKeyLength; SkipAnchorName = skipAnchorName; IndentSequences = indentSequences; - UseUtf16SurrogatePairs = useUtf16SurrogatePairs; NewLine = newLine ?? 
Environment.NewLine; + UseUtf16SurrogatePairs = useUtf16SurrogatePairs; } public EmitterSettings WithBestIndent(int bestIndent) @@ -114,8 +114,8 @@ public EmitterSettings WithBestIndent(int bestIndent) MaxSimpleKeyLength, SkipAnchorName, IndentSequences, - UseUtf16SurrogatePairs, - NewLine + NewLine, + UseUtf16SurrogatePairs ); } @@ -128,8 +128,8 @@ public EmitterSettings WithBestWidth(int bestWidth) MaxSimpleKeyLength, SkipAnchorName, IndentSequences, - UseUtf16SurrogatePairs, - NewLine + NewLine, + UseUtf16SurrogatePairs ); } @@ -142,8 +142,8 @@ public EmitterSettings WithMaxSimpleKeyLength(int maxSimpleKeyLength) maxSimpleKeyLength, SkipAnchorName, IndentSequences, - UseUtf16SurrogatePairs, - NewLine + NewLine, + UseUtf16SurrogatePairs ); } @@ -156,8 +156,8 @@ public EmitterSettings WithNewLine(string newLine) MaxSimpleKeyLength, SkipAnchorName, IndentSequences, - UseUtf16SurrogatePairs, - newLine + newLine, + UseUtf16SurrogatePairs ); } @@ -181,8 +181,8 @@ public EmitterSettings WithoutAnchorName() MaxSimpleKeyLength, true, IndentSequences, - UseUtf16SurrogatePairs, - NewLine + NewLine, + UseUtf16SurrogatePairs ); } @@ -195,8 +195,8 @@ public EmitterSettings WithIndentedSequences() MaxSimpleKeyLength, SkipAnchorName, true, - UseUtf16SurrogatePairs, - NewLine + NewLine, + UseUtf16SurrogatePairs ); } @@ -209,8 +209,8 @@ public EmitterSettings WithUtf16SurrogatePairs() MaxSimpleKeyLength, SkipAnchorName, IndentSequences, - true, - NewLine + NewLine, + true ); } } diff --git a/YamlDotNet/Serialization/StaticSerializerBuilder.cs b/YamlDotNet/Serialization/StaticSerializerBuilder.cs index 4c165fbc..76726701 100644 --- a/YamlDotNet/Serialization/StaticSerializerBuilder.cs +++ b/YamlDotNet/Serialization/StaticSerializerBuilder.cs @@ -370,7 +370,8 @@ public StaticSerializerBuilder JsonCompatible() { this.emitterSettings = this.emitterSettings .WithMaxSimpleKeyLength(int.MaxValue) - .WithoutAnchorName(); + .WithoutAnchorName() + .WithUtf16SurrogatePairs(); return this 
.WithTypeConverter(new GuidConverter(true), w => w.InsteadOf<GuidConverter>())