From 6c26d5c2bc99427633a76da704efa35acdf20ace Mon Sep 17 00:00:00 2001
From: Alexander Ivanov <11728682+nahk-ivanov@users.noreply.github.com>
Date: Wed, 23 Oct 2024 20:35:27 -0700
Subject: [PATCH 1/2] Fix JSON serialization for UTF-32 characters.
When serializing data in JSON-compatible form, characters outside the Basic Multilingual Plane (a single 4-byte UTF-32 code point) need to be written as a UTF-16 surrogate pair, i.e. two escaped 2-byte code units.
This change fixes that by introducing a new emitter setting, `UseUtf16SurrogatePairs`, which is enabled when a JSON-compatible builder is requested.
---
.../Serialization/SerializationTests.cs | 9 ++++++
YamlDotNet/Core/Emitter.cs | 18 +++++++++--
YamlDotNet/Core/EmitterSettings.cs | 32 ++++++++++++++++++-
YamlDotNet/Serialization/SerializerBuilder.cs | 3 +-
4 files changed, 58 insertions(+), 4 deletions(-)
diff --git a/YamlDotNet.Test/Serialization/SerializationTests.cs b/YamlDotNet.Test/Serialization/SerializationTests.cs
index 7bff67b1..1088f0ad 100644
--- a/YamlDotNet.Test/Serialization/SerializationTests.cs
+++ b/YamlDotNet.Test/Serialization/SerializationTests.cs
@@ -886,6 +886,15 @@ public void SerializationOfAnchorWorksInJson()
.BeEquivalentTo(@"{""x"": {""z"": {""v"": ""1""}}, ""y"": {""k"": {""z"": {""v"": ""1""}}}}");
}
+ [Fact]
+ public void SerializationOfUtf32WorksInJson()
+ {
+ var obj = new { TestProperty = "Sea life \U0001F99E" };
+
+ SerializerBuilder.JsonCompatible().Build().Serialize(obj).Trim().Should()
+ .Be(@"{""TestProperty"": ""Sea life \uD83E\uDD9E""}");
+ }
+
[Fact]
// Todo: this is actually roundtrip
public void DeserializationOfDefaultsWorkInJson()
diff --git a/YamlDotNet/Core/Emitter.cs b/YamlDotNet/Core/Emitter.cs
index 01824d10..7931c8a0 100644
--- a/YamlDotNet/Core/Emitter.cs
+++ b/YamlDotNet/Core/Emitter.cs
@@ -66,6 +66,7 @@ public class Emitter : IEmitter
private bool isWhitespace;
private bool isIndentation;
private readonly bool forceIndentLess;
+ private readonly bool useUtf16SurrogatePair;
private readonly string newLine;
private bool isDocumentEndWritten;
@@ -148,6 +149,7 @@ public Emitter(TextWriter output, EmitterSettings settings)
this.maxSimpleKeyLength = settings.MaxSimpleKeyLength;
this.skipAnchorName = settings.SkipAnchorName;
this.forceIndentLess = !settings.IndentSequences;
+ this.useUtf16SurrogatePair = settings.UseUtf16SurrogatePairs;
this.newLine = settings.NewLine;
this.output = output;
@@ -1189,8 +1191,20 @@ private void WriteDoubleQuotedScalar(string value, bool allowBreaks)
{
if (index + 1 < value.Length && IsLowSurrogate(value[index + 1]))
{
- Write('U');
- Write(char.ConvertToUtf32(character, value[index + 1]).ToString("X08", CultureInfo.InvariantCulture));
+ if (useUtf16SurrogatePair)
+ {
+ Write('u');
+ Write(code.ToString("X04", CultureInfo.InvariantCulture));
+ Write('\\');
+ Write('u');
+ Write(((ushort)value[index + 1]).ToString("X04", CultureInfo.InvariantCulture));
+ }
+ else
+ {
+ Write('U');
+ Write(char.ConvertToUtf32(character, value[index + 1]).ToString("X08", CultureInfo.InvariantCulture));
+ }
+
index++;
}
else
diff --git a/YamlDotNet/Core/EmitterSettings.cs b/YamlDotNet/Core/EmitterSettings.cs
index cf44f15c..4cf5feec 100644
--- a/YamlDotNet/Core/EmitterSettings.cs
+++ b/YamlDotNet/Core/EmitterSettings.cs
@@ -63,13 +63,22 @@ public sealed class EmitterSettings
///
public bool IndentSequences { get; }
+ ///
+ /// If true, then characters outside the Basic Multilingual Plane are escaped as a UTF-16 surrogate pair (two 2-byte code units).
+ ///
+ ///
+ /// This ensures compatibility with the JSON format, which does not allow '\Uxxxxxxxx'
+ /// and instead expects two escaped 2-byte code units, '\uxxxx\uxxxx'.
+ ///
+ public bool UseUtf16SurrogatePairs { get; }
+
public static readonly EmitterSettings Default = new EmitterSettings();
public EmitterSettings()
{
}
- public EmitterSettings(int bestIndent, int bestWidth, bool isCanonical, int maxSimpleKeyLength, bool skipAnchorName = false, bool indentSequences = false, string? newLine = null)
+ public EmitterSettings(int bestIndent, int bestWidth, bool isCanonical, int maxSimpleKeyLength, bool skipAnchorName = false, bool indentSequences = false, bool useUtf16SurrogatePairs = false, string? newLine = null)
{
if (bestIndent < 2 || bestIndent > 9)
{
@@ -92,6 +101,7 @@ public EmitterSettings(int bestIndent, int bestWidth, bool isCanonical, int maxS
MaxSimpleKeyLength = maxSimpleKeyLength;
SkipAnchorName = skipAnchorName;
IndentSequences = indentSequences;
+ UseUtf16SurrogatePairs = useUtf16SurrogatePairs;
NewLine = newLine ?? Environment.NewLine;
}
@@ -104,6 +114,7 @@ public EmitterSettings WithBestIndent(int bestIndent)
MaxSimpleKeyLength,
SkipAnchorName,
IndentSequences,
+ UseUtf16SurrogatePairs,
NewLine
);
}
@@ -117,6 +128,7 @@ public EmitterSettings WithBestWidth(int bestWidth)
MaxSimpleKeyLength,
SkipAnchorName,
IndentSequences,
+ UseUtf16SurrogatePairs,
NewLine
);
}
@@ -130,6 +142,7 @@ public EmitterSettings WithMaxSimpleKeyLength(int maxSimpleKeyLength)
maxSimpleKeyLength,
SkipAnchorName,
IndentSequences,
+ UseUtf16SurrogatePairs,
NewLine
);
}
@@ -143,6 +156,7 @@ public EmitterSettings WithNewLine(string newLine)
MaxSimpleKeyLength,
SkipAnchorName,
IndentSequences,
+ UseUtf16SurrogatePairs,
newLine
);
}
@@ -167,6 +181,7 @@ public EmitterSettings WithoutAnchorName()
MaxSimpleKeyLength,
true,
IndentSequences,
+ UseUtf16SurrogatePairs,
NewLine
);
}
@@ -180,6 +195,21 @@ public EmitterSettings WithIndentedSequences()
MaxSimpleKeyLength,
SkipAnchorName,
true,
+ UseUtf16SurrogatePairs,
+ NewLine
+ );
+ }
+
+ public EmitterSettings WithUtf16SurrogatePairs()
+ {
+ return new EmitterSettings(
+ BestIndent,
+ BestWidth,
+ IsCanonical,
+ MaxSimpleKeyLength,
+ SkipAnchorName,
+ IndentSequences,
+ true,
NewLine
);
}
diff --git a/YamlDotNet/Serialization/SerializerBuilder.cs b/YamlDotNet/Serialization/SerializerBuilder.cs
index bae3896d..ec9a7feb 100755
--- a/YamlDotNet/Serialization/SerializerBuilder.cs
+++ b/YamlDotNet/Serialization/SerializerBuilder.cs
@@ -366,7 +366,8 @@ public SerializerBuilder JsonCompatible()
{
this.emitterSettings = this.emitterSettings
.WithMaxSimpleKeyLength(int.MaxValue)
- .WithoutAnchorName();
+ .WithoutAnchorName()
+ .WithUtf16SurrogatePairs();
return this
.WithTypeConverter(new GuidConverter(true), w => w.InsteadOf())
From 733363547f72cef8e45f419e8cc1c25279e0640a Mon Sep 17 00:00:00 2001
From: Alexander Ivanov <11728682+nahk-ivanov@users.noreply.github.com>
Date: Thu, 24 Oct 2024 09:01:42 -0700
Subject: [PATCH 2/2] Address comments
---
YamlDotNet/Core/EmitterSettings.cs | 32 +++++++++----------
.../Serialization/StaticSerializerBuilder.cs | 3 +-
2 files changed, 18 insertions(+), 17 deletions(-)
diff --git a/YamlDotNet/Core/EmitterSettings.cs b/YamlDotNet/Core/EmitterSettings.cs
index 4cf5feec..57ee975d 100644
--- a/YamlDotNet/Core/EmitterSettings.cs
+++ b/YamlDotNet/Core/EmitterSettings.cs
@@ -78,7 +78,7 @@ public EmitterSettings()
{
}
- public EmitterSettings(int bestIndent, int bestWidth, bool isCanonical, int maxSimpleKeyLength, bool skipAnchorName = false, bool indentSequences = false, bool useUtf16SurrogatePairs = false, string? newLine = null)
+ public EmitterSettings(int bestIndent, int bestWidth, bool isCanonical, int maxSimpleKeyLength, bool skipAnchorName = false, bool indentSequences = false, string? newLine = null, bool useUtf16SurrogatePairs = false)
{
if (bestIndent < 2 || bestIndent > 9)
{
@@ -101,8 +101,8 @@ public EmitterSettings(int bestIndent, int bestWidth, bool isCanonical, int maxS
MaxSimpleKeyLength = maxSimpleKeyLength;
SkipAnchorName = skipAnchorName;
IndentSequences = indentSequences;
- UseUtf16SurrogatePairs = useUtf16SurrogatePairs;
NewLine = newLine ?? Environment.NewLine;
+ UseUtf16SurrogatePairs = useUtf16SurrogatePairs;
}
public EmitterSettings WithBestIndent(int bestIndent)
@@ -114,8 +114,8 @@ public EmitterSettings WithBestIndent(int bestIndent)
MaxSimpleKeyLength,
SkipAnchorName,
IndentSequences,
- UseUtf16SurrogatePairs,
- NewLine
+ NewLine,
+ UseUtf16SurrogatePairs
);
}
@@ -128,8 +128,8 @@ public EmitterSettings WithBestWidth(int bestWidth)
MaxSimpleKeyLength,
SkipAnchorName,
IndentSequences,
- UseUtf16SurrogatePairs,
- NewLine
+ NewLine,
+ UseUtf16SurrogatePairs
);
}
@@ -142,8 +142,8 @@ public EmitterSettings WithMaxSimpleKeyLength(int maxSimpleKeyLength)
maxSimpleKeyLength,
SkipAnchorName,
IndentSequences,
- UseUtf16SurrogatePairs,
- NewLine
+ NewLine,
+ UseUtf16SurrogatePairs
);
}
@@ -156,8 +156,8 @@ public EmitterSettings WithNewLine(string newLine)
MaxSimpleKeyLength,
SkipAnchorName,
IndentSequences,
- UseUtf16SurrogatePairs,
- newLine
+ newLine,
+ UseUtf16SurrogatePairs
);
}
@@ -181,8 +181,8 @@ public EmitterSettings WithoutAnchorName()
MaxSimpleKeyLength,
true,
IndentSequences,
- UseUtf16SurrogatePairs,
- NewLine
+ NewLine,
+ UseUtf16SurrogatePairs
);
}
@@ -195,8 +195,8 @@ public EmitterSettings WithIndentedSequences()
MaxSimpleKeyLength,
SkipAnchorName,
true,
- UseUtf16SurrogatePairs,
- NewLine
+ NewLine,
+ UseUtf16SurrogatePairs
);
}
@@ -209,8 +209,8 @@ public EmitterSettings WithUtf16SurrogatePairs()
MaxSimpleKeyLength,
SkipAnchorName,
IndentSequences,
- true,
- NewLine
+ NewLine,
+ true
);
}
}
diff --git a/YamlDotNet/Serialization/StaticSerializerBuilder.cs b/YamlDotNet/Serialization/StaticSerializerBuilder.cs
index 4c165fbc..76726701 100644
--- a/YamlDotNet/Serialization/StaticSerializerBuilder.cs
+++ b/YamlDotNet/Serialization/StaticSerializerBuilder.cs
@@ -370,7 +370,8 @@ public StaticSerializerBuilder JsonCompatible()
{
this.emitterSettings = this.emitterSettings
.WithMaxSimpleKeyLength(int.MaxValue)
- .WithoutAnchorName();
+ .WithoutAnchorName()
+ .WithUtf16SurrogatePairs();
return this
.WithTypeConverter(new GuidConverter(true), w => w.InsteadOf())