Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix JSON serialization for UTF-32 characters. #998

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions YamlDotNet.Test/Serialization/SerializationTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -886,6 +886,15 @@ public void SerializationOfAnchorWorksInJson()
.BeEquivalentTo(@"{""x"": {""z"": {""v"": ""1""}}, ""y"": {""k"": {""z"": {""v"": ""1""}}}}");
}

[Fact]
public void SerializationOfUtf32WorksInJson()
{
var obj = new { TestProperty = "Sea life \U0001F99E" };

SerializerBuilder.JsonCompatible().Build().Serialize(obj).Trim().Should()
.Be(@"{""TestProperty"": ""Sea life \uD83E\uDD9E""}");
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Apparently, lobster in UTF32 is different from lobster in UTF16. 🤷

https://www.unicodecharacter.org/U+1F99E

}

[Fact]
// Todo: this is actually roundtrip
public void DeserializationOfDefaultsWorkInJson()
Expand Down
18 changes: 16 additions & 2 deletions YamlDotNet/Core/Emitter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ public class Emitter : IEmitter
private bool isWhitespace;
private bool isIndentation;
private readonly bool forceIndentLess;
private readonly bool useUtf16SurrogatePair;
private readonly string newLine;

private bool isDocumentEndWritten;
Expand Down Expand Up @@ -148,6 +149,7 @@ public Emitter(TextWriter output, EmitterSettings settings)
this.maxSimpleKeyLength = settings.MaxSimpleKeyLength;
this.skipAnchorName = settings.SkipAnchorName;
this.forceIndentLess = !settings.IndentSequences;
this.useUtf16SurrogatePair = settings.UseUtf16SurrogatePairs;
this.newLine = settings.NewLine;

this.output = output;
Expand Down Expand Up @@ -1189,8 +1191,20 @@ private void WriteDoubleQuotedScalar(string value, bool allowBreaks)
{
if (index + 1 < value.Length && IsLowSurrogate(value[index + 1]))
{
Write('U');
Write(char.ConvertToUtf32(character, value[index + 1]).ToString("X08", CultureInfo.InvariantCulture));
if (useUtf16SurrogatePair)
{
Write('u');
Write(code.ToString("X04", CultureInfo.InvariantCulture));
Write('\\');
Write('u');
Write(((ushort)value[index + 1]).ToString("X04", CultureInfo.InvariantCulture));
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should you check to make sure this won’t go out of bounds?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure but do you need to advance the index by one to handle the case when utf32 character is in the middle of the string?

Copy link
Author

@nahk-ivanov nahk-ivanov Oct 24, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't all of this already done on lines 1192 and 1208? The previous code worked somehow 🤷 we are just rendering it differently, not reading more or less than what was read before.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I’m not at a computer, just a phone, so reviewing prs can be a bit tricky. If it’s already done then great. Ignore this comment.

}
else
{
Write('U');
Write(char.ConvertToUtf32(character, value[index + 1]).ToString("X08", CultureInfo.InvariantCulture));
}

index++;
}
else
Expand Down
32 changes: 31 additions & 1 deletion YamlDotNet/Core/EmitterSettings.cs
Original file line number Diff line number Diff line change
Expand Up @@ -63,13 +63,22 @@ public sealed class EmitterSettings
/// </summary>
public bool IndentSequences { get; }

/// <summary>
/// If true, then 4-byte UTF-32 characters are broken into two 2-byte code-points.
/// </summary>
/// <remarks>
/// This ensures compatibility with JSON format, as it does not allow '\Uxxxxxxxxx'
/// and instead expects two escaped 2-byte character '\uxxxx\uxxxx'.
/// </remarks>
public bool UseUtf16SurrogatePairs { get; }

public static readonly EmitterSettings Default = new EmitterSettings();

public EmitterSettings()
{
}

public EmitterSettings(int bestIndent, int bestWidth, bool isCanonical, int maxSimpleKeyLength, bool skipAnchorName = false, bool indentSequences = false, string? newLine = null)
public EmitterSettings(int bestIndent, int bestWidth, bool isCanonical, int maxSimpleKeyLength, bool skipAnchorName = false, bool indentSequences = false, bool useUtf16SurrogatePairs = false, string? newLine = null)
{
if (bestIndent < 2 || bestIndent > 9)
{
Expand All @@ -92,6 +101,7 @@ public EmitterSettings(int bestIndent, int bestWidth, bool isCanonical, int maxS
MaxSimpleKeyLength = maxSimpleKeyLength;
SkipAnchorName = skipAnchorName;
IndentSequences = indentSequences;
UseUtf16SurrogatePairs = useUtf16SurrogatePairs;
NewLine = newLine ?? Environment.NewLine;
}

Expand All @@ -104,6 +114,7 @@ public EmitterSettings WithBestIndent(int bestIndent)
MaxSimpleKeyLength,
SkipAnchorName,
IndentSequences,
UseUtf16SurrogatePairs,
NewLine
);
}
Expand All @@ -117,6 +128,7 @@ public EmitterSettings WithBestWidth(int bestWidth)
MaxSimpleKeyLength,
SkipAnchorName,
IndentSequences,
UseUtf16SurrogatePairs,
NewLine
);
}
Expand All @@ -130,6 +142,7 @@ public EmitterSettings WithMaxSimpleKeyLength(int maxSimpleKeyLength)
maxSimpleKeyLength,
SkipAnchorName,
IndentSequences,
UseUtf16SurrogatePairs,
NewLine
);
}
Expand All @@ -143,6 +156,7 @@ public EmitterSettings WithNewLine(string newLine)
MaxSimpleKeyLength,
SkipAnchorName,
IndentSequences,
UseUtf16SurrogatePairs,
newLine
);
}
Expand All @@ -167,6 +181,7 @@ public EmitterSettings WithoutAnchorName()
MaxSimpleKeyLength,
true,
IndentSequences,
UseUtf16SurrogatePairs,
NewLine
);
}
Expand All @@ -180,6 +195,21 @@ public EmitterSettings WithIndentedSequences()
MaxSimpleKeyLength,
SkipAnchorName,
true,
UseUtf16SurrogatePairs,
NewLine
);
}

public EmitterSettings WithUtf16SurrogatePairs()
{
return new EmitterSettings(
BestIndent,
BestWidth,
IsCanonical,
MaxSimpleKeyLength,
SkipAnchorName,
IndentSequences,
true,
NewLine
);
}
Expand Down
3 changes: 2 additions & 1 deletion YamlDotNet/Serialization/SerializerBuilder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,8 @@ public SerializerBuilder JsonCompatible()
{
this.emitterSettings = this.emitterSettings
.WithMaxSimpleKeyLength(int.MaxValue)
.WithoutAnchorName();
.WithoutAnchorName()
.WithUtf16SurrogatePairs();

return this
.WithTypeConverter(new GuidConverter(true), w => w.InsteadOf<GuidConverter>())
Expand Down