Skip to content

Commit

Permalink
* prefer ByteArrayEqualityComparer that uses `Span<>.SequenceEqual…
Browse files Browse the repository at this point in the history
…(ReadOnlySpan<>) over `IEnumerable<>.SequenceEqual(IEnumerable<>)` @ `BaseSaver.IsSameUser()`

* implement custom equality methods to use `ByteArrayEqualityComparer` for prop `byte[] XxHash3` @ `ReplySaver.UniqueSignature`
@ crawler

+ class `ByteArrayEqualityComparer`
@ shared

* rename `CommonEmbeddedMetadataXxHash3ToIgnore/ICC/*` to convert their filename from `ulong` to bytes in hex
* update `MetadataConsumer.CommonEmbeddedMetadataXxHash3ToIgnore.Icc` @ appsettings.json
* replace all `ulong` hash fields with `required byte[]` @ `Db.ImageHash`
@ imagePipeline

* replace all `ulong XxHash3` fields in entity classes with `required byte[]`
@ c#
  • Loading branch information
n0099 committed Apr 30, 2024
1 parent a4e50d6 commit 307ee3b
Show file tree
Hide file tree
Showing 26 changed files with 73 additions and 41 deletions.
2 changes: 1 addition & 1 deletion c#/crawler/src/Db/ReplySignature.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ public class ReplySignature
{
public long UserId { get; set; }
public uint SignatureId { get; set; }
public ulong XxHash3 { get; set; }
public required byte[] XxHash3 { get; set; }
public required byte[] ProtoBufBytes { get; set; }
public uint FirstSeenAt { get; set; }
public uint LastSeenAt { get; set; }
Expand Down
6 changes: 4 additions & 2 deletions c#/crawler/src/Tieba/Crawl/Saver/BaseSaver.cs
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,8 @@ bool IsTimestampingFieldName(string name) => name is nameof(IPost.LastSeenAt)
revisionNullFieldsBitMask |= whichBitToMask; // mask the corresponding field bit with 1
}
}
if (revision != null) revision.NullFieldsBitMask = (NullFieldsBitMask?)revisionNullFieldsBitMask.NullIfZero();
if (revision != null)
revision.NullFieldsBitMask = (NullFieldsBitMask?)revisionNullFieldsBitMask.NullIfZero();
return revision;
}).OfType<TRevision>().ToList();
if (newRevisions.Count == 0) return; // quick exit to prevent execute sql with WHERE FALSE clause
Expand Down Expand Up @@ -130,5 +131,6 @@ private static bool IsLatestReplierUser(string pName, PropertyEntry p, EntityEnt
private static bool IsSameUser(User a, User b) =>
(a.Uid, a.Name, a.DisplayName, a.Portrait, a.PortraitUpdatedAt, a.Gender, a.FansNickname, a.IpGeolocation)
== (b.Uid, b.Name, b.DisplayName, b.Portrait, b.PortraitUpdatedAt, b.Gender, b.FansNickname, b.IpGeolocation)
&& (a.Icon == b.Icon || (a.Icon != null && b.Icon != null && a.Icon.SequenceEqual(b.Icon)));
&& (a.Icon == b.Icon
|| (a.Icon != null && b.Icon != null && new ByteArrayEqualityComparer().Equals(a.Icon, b.Icon)));
}
18 changes: 15 additions & 3 deletions c#/crawler/src/Tieba/Crawl/Saver/Post/ReplySaver.cs
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ private Action SaveReplySignatures(CrawlerDbContext db, IEnumerable<ReplyPost> r
{
UserId = r.AuthorUid,
SignatureId = (uint)r.SignatureId!,
XxHash3 = XxHash3.HashToUInt64(r.Signature!),
XxHash3 = XxHash3.Hash(r.Signature!),
ProtoBufBytes = r.Signature!,
FirstSeenAt = now,
LastSeenAt = now
Expand All @@ -142,7 +142,7 @@ private Action SaveReplySignatures(CrawlerDbContext db, IEnumerable<ReplyPost> r
var existingSignatures = (
from s in db.ReplySignatures.AsTracking().ForUpdate()
where uniqueSignatures.Select(us => us.Id).Contains(s.SignatureId)
&& uniqueSignatures.Select(us => us.XxHash3).Contains(s.XxHash3)
&& uniqueSignatures.Select(us => us.XxHash3).Contains(s.XxHash3, new ByteArrayEqualityComparer())
select s
).ToList();
(from existing in existingSignatures
Expand Down Expand Up @@ -170,5 +170,17 @@ join newInReply in signatures on existing.SignatureId equals newInReply.Signatur
};
}

private sealed record UniqueSignature(uint Id, ulong XxHash3);
/// <summary>
/// Pairs a signature id with the raw bytes of its XxHash3 digest.
/// Equality and hashing are overridden so the digest is compared by content:
/// the compiler-generated record equality would compare the <c>byte[]</c> by reference.
/// </summary>
private sealed record UniqueSignature(uint Id, byte[] XxHash3)
{
    /// <summary>Two signatures are equal when their ids match and their digests hold the same bytes.</summary>
    public bool Equals(UniqueSignature? other)
    {
        if (other is null) return false;
        if (Id != other.Id) return false;
        return new ByteArrayEqualityComparer().Equals(XxHash3, other.XxHash3);
    }

    /// <summary>Combines the id with every digest byte, staying consistent with <see cref="Equals(UniqueSignature?)"/>.</summary>
    public override int GetHashCode()
    {
        HashCode combined = new();
        combined.Add(Id);
        combined.AddBytes(XxHash3);
        return combined.ToHashCode();
    }
}
}
34 changes: 17 additions & 17 deletions c#/imagePipeline/appsettings.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,23 +28,23 @@
"CommonEmbeddedMetadataXxHash3ToIgnore": {
"Exif": [],
"Icc": [
15295336236538839648,
5088525931300380143,
11377371376846116862,
12790265642504860643,
2312022491946436839,
13825194526826816856,
11564595951513508422,
11425690781213542939,
7029493641152027663,
675439972720715839,
17530108583999846437,
8572314333541615503,
2453093087249539934,
6340676997749381937,
10691729796898709084,
11631441974507259315,
317573671597317124
"D443F344DB0EC260",
"469E135FD585DDEF",
"9DE48AFCC29C17FE",
"B18026D60C62B3E3",
"2015F434F12E48E7",
"BFDCF52C522FE558",
"A07DB2BEFCE21A46",
"9E90353AC7B04A1B",
"618DC73907BBDC0F",
"095FA513991E283F",
"F34774E0EF563825",
"76F6FA77E1BCC78F",
"220B232A7279A35E",
"57FE9C2296579B31",
"9460A79A629B5E5C",
"A16B2ED98C12C5B3",
"04683F925D931C04"
],
"Iptc": [],
"Xmp": []
Expand Down
6 changes: 4 additions & 2 deletions c#/imagePipeline/src/Consumer/HashConsumer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ public HashConsumer(FailedImageHandler failedImageHandler)
_failedImageHandler = failedImageHandler;
_imageHashSettersKeyByAlgorithm = new Dictionary<ImgHashBase, Action<ImageHash, byte[]>>
{
{PHash.Create(), (image, bytes) => image.PHash = BitConverter.ToUInt64(bytes)},
{AverageHash.Create(), (image, bytes) => image.AverageHash = BitConverter.ToUInt64(bytes)},
{PHash.Create(), (image, bytes) => image.PHash = bytes},
{AverageHash.Create(), (image, bytes) => image.AverageHash = bytes},
{BlockMeanHash.Create(), (image, bytes) => image.BlockMeanHash = bytes},
{MarrHildrethHash.Create(), (image, bytes) => image.MarrHildrethHash = bytes}
}.AsReadOnly();
Expand All @@ -35,6 +35,8 @@ protected override IEnumerable<ImageId> ConsumeInternal(
{
ImageId = imageKeyWithMatrix.ImageId,
FrameIndex = imageKeyWithMatrix.FrameIndex,
PHash = [],
AverageHash = [],
BlockMeanHash = [],
MarrHildrethHash = [],
ThumbHash = []
Expand Down
15 changes: 8 additions & 7 deletions c#/imagePipeline/src/Consumer/MetadataConsumer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ public partial class MetadataConsumer : IConsumer<ImageWithBytes>
{
private readonly ILogger<MetadataConsumer> _logger;
private readonly FailedImageHandler _failedImageHandler;
private readonly (ulong[] Exif, ulong[] Icc, ulong[] Iptc, ulong[] Xmp) _commonEmbeddedMetadataXxHash3ToIgnore;
private readonly (IEnumerable<byte[]> Exif, IEnumerable<byte[]> Icc, IEnumerable<byte[]> Iptc, IEnumerable<byte[]> Xmp)
_commonEmbeddedMetadataXxHash3ToIgnore;

static MetadataConsumer() => NetTopologySuite.NtsGeometryServices.Instance = new(
coordinateSequenceFactory: NetTopologySuite.Geometries.Implementation.CoordinateArraySequenceFactory.Instance,
Expand All @@ -27,8 +28,8 @@ public MetadataConsumer
{
(_logger, _failedImageHandler) = (logger, failedImageHandler);
var section = config.GetSection("MetadataConsumer").GetSection("CommonEmbeddedMetadataXxHash3ToIgnore");
ulong[] GetCommonXxHash3ToIgnore(string key) =>
section.GetSection(key).Get<ulong[]>() ?? [];
// Reads the hex strings configured under the given key (an absent key yields an empty list)
// and decodes each one into its raw bytes.
// The result is materialized with ToArray(): a bare Select() is deferred, so every later
// enumeration of the returned sequence would decode all hex strings again and allocate
// fresh arrays, and a malformed hex string would only throw on first use instead of here.
IEnumerable<byte[]> GetCommonXxHash3ToIgnore(string key) =>
    (section.GetSection(key).Get<string[]>() ?? []).Select(Convert.FromHexString).ToArray();
_commonEmbeddedMetadataXxHash3ToIgnore = (
Exif: GetCommonXxHash3ToIgnore("Exif"),
Icc: GetCommonXxHash3ToIgnore("Icc"),
Expand Down Expand Up @@ -70,7 +71,7 @@ private Func<ImageWithBytes, ImageMetadata> GetImageMetaData
Height = (ushort)info.Height,
BitsPerPixel = (ushort)info.PixelType.BitsPerPixel,
FrameCount = (uint)info.FrameMetadataCollection.Count,
XxHash3 = XxHash3.HashToUInt64(imageBytes),
XxHash3 = XxHash3.Hash(imageBytes),
DownloadedByteSize = image.ExpectedByteSize == imageBytes.Length
? null
: new() {DownloadedByteSize = (uint)imageBytes.Length},
Expand All @@ -89,7 +90,7 @@ private Func<ImageWithBytes, ImageMetadata> GetImageMetaData
};

private TEmbeddedMetadata? CreateEmbeddedFromProfile<TImageSharpProfile, TEmbeddedMetadata>(
IEnumerable<ulong> commonXxHash3ToIgnore,
IEnumerable<byte[]> commonXxHash3ToIgnore,
TImageSharpProfile? profile,
Func<TImageSharpProfile, byte[]?> rawBytesSelector)
where TImageSharpProfile : class
Expand All @@ -101,11 +102,11 @@ private Func<ImageWithBytes, ImageMetadata> GetImageMetaData
if (rawBytes.Length > 65535)
_logger.LogWarning("Embedded {} in image contains {} bytes",
typeof(TEmbeddedMetadata).Name.ToUpperInvariant(), rawBytes.Length);
var xxHash3 = XxHash3.HashToUInt64(rawBytes);
var xxHash3 = XxHash3.Hash(rawBytes);
return new()
{
XxHash3 = xxHash3,
RawBytes = commonXxHash3ToIgnore.Contains(xxHash3) ? null : rawBytes
RawBytes = commonXxHash3ToIgnore.Contains(xxHash3, new ByteArrayEqualityComparer()) ? null : rawBytes
};
}

Expand Down
4 changes: 2 additions & 2 deletions c#/imagePipeline/src/Db/ImageHash.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ public class ImageHash
{
public uint ImageId { get; set; }
public uint FrameIndex { get; set; }
public ulong PHash { get; set; }
public ulong AverageHash { get; set; }
public required byte[] PHash { get; set; }
public required byte[] AverageHash { get; set; }
public required byte[] BlockMeanHash { get; set; }
public required byte[] MarrHildrethHash { get; set; }
public required byte[] ThumbHash { get; set; }
Expand Down
14 changes: 7 additions & 7 deletions c#/imagePipeline/src/Db/ImageMetadata.cs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ public interface IImageMetadata

public interface IEmbedded : IImageMetadata
{
public ulong XxHash3 { get; set; }
public byte[] XxHash3 { get; set; }
public byte[]? RawBytes { get; set; }
}

Expand All @@ -27,7 +27,7 @@ public interface IEmbedded : IImageMetadata
public ushort Height { get; set; }
public ushort BitsPerPixel { get; set; }
public uint FrameCount { get; set; }
public ulong XxHash3 { get; set; }
public required byte[] XxHash3 { get; set; }
public ByteSize? DownloadedByteSize { get; set; }
public Exif? EmbeddedExif { get; set; }
public Icc? EmbeddedIcc { get; set; }
Expand Down Expand Up @@ -88,12 +88,12 @@ public enum ExifOrientation
public Point? GpsCoordinate { get; set; }
public float? GpsImgDirection { get; set; }
public string? GpsImgDirectionRef { get; set; }
public ulong XxHash3 { get; set; }
public byte[] XxHash3 { get; set; } = null!;
public byte[]? RawBytes { get; set; }

// workaround to work with MetadataConsumer.CreateEmbeddedFromProfile()
// https://stackoverflow.com/questions/75266722/type-cannot-satisfy-the-new-constraint-on-parameter-tparam-because-type
public ICollection<TagName> TagNames { get; set; } = [];
public IEnumerable<TagName> TagNames { get; set; } = [];

public class TagName : IImageMetadata
{
Expand All @@ -105,21 +105,21 @@ public class TagName : IImageMetadata
public class Icc : IEmbedded
{
[Key] public uint ImageId { get; set; }
public ulong XxHash3 { get; set; }
public byte[] XxHash3 { get; set; } = null!;
public byte[]? RawBytes { get; set; }
}

public class Iptc : IEmbedded
{
[Key] public uint ImageId { get; set; }
public ulong XxHash3 { get; set; }
public byte[] XxHash3 { get; set; } = null!;
public byte[]? RawBytes { get; set; }
}

public class Xmp : IEmbedded
{
[Key] public uint ImageId { get; set; }
public ulong XxHash3 { get; set; }
public byte[] XxHash3 { get; set; } = null!;
public byte[]? RawBytes { get; set; }
}

Expand Down
15 changes: 15 additions & 0 deletions c#/shared/src/ByteArrayEqualityComparer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
namespace tbm.Shared
{
    /// <summary>
    /// Equality comparer for <c>byte[]</c> that compares arrays by their content
    /// instead of by reference.
    /// </summary>
    public class ByteArrayEqualityComparer : EqualityComparer<byte[]>
    {
        /// <summary>
        /// Returns <see langword="true"/> when both arguments are the same reference (including both
        /// being <see langword="null"/>) or when both are non-null and contain the same byte sequence.
        /// </summary>
        public override bool Equals(byte[]? x, byte[]? y)
        {
            if (ReferenceEquals(x, y)) return true;
            if (x is null || y is null) return false;

            ReadOnlySpan<byte> left = x;
            ReadOnlySpan<byte> right = y;
            return left.SequenceEqual(right);
        }

        /// <summary>
        /// Computes a hash code from every byte of <paramref name="obj"/>, so arrays with
        /// equal content produce equal hash codes.
        /// </summary>
        public override int GetHashCode(byte[] obj)
        {
            HashCode hasher = new();
            hasher.AddBytes(obj);
            return hasher.ToHashCode();
        }
    }
}

0 comments on commit 307ee3b

Please sign in to comment.