diff --git a/c#/crawler/src/Db/ReplySignature.cs b/c#/crawler/src/Db/ReplySignature.cs index f7228794..d26227a5 100644 --- a/c#/crawler/src/Db/ReplySignature.cs +++ b/c#/crawler/src/Db/ReplySignature.cs @@ -5,7 +5,7 @@ public class ReplySignature { public long UserId { get; set; } public uint SignatureId { get; set; } - public ulong XxHash3 { get; set; } + public required byte[] XxHash3 { get; set; } public required byte[] ProtoBufBytes { get; set; } public uint FirstSeenAt { get; set; } public uint LastSeenAt { get; set; } diff --git a/c#/crawler/src/Tieba/Crawl/Saver/BaseSaver.cs b/c#/crawler/src/Tieba/Crawl/Saver/BaseSaver.cs index 0dc2ef69..ce760beb 100644 --- a/c#/crawler/src/Tieba/Crawl/Saver/BaseSaver.cs +++ b/c#/crawler/src/Tieba/Crawl/Saver/BaseSaver.cs @@ -92,7 +92,8 @@ bool IsTimestampingFieldName(string name) => name is nameof(IPost.LastSeenAt) revisionNullFieldsBitMask |= whichBitToMask; // mask the corresponding field bit with 1 } } - if (revision != null) revision.NullFieldsBitMask = (NullFieldsBitMask?)revisionNullFieldsBitMask.NullIfZero(); + if (revision != null) + revision.NullFieldsBitMask = (NullFieldsBitMask?)revisionNullFieldsBitMask.NullIfZero(); return revision; }).OfType().ToList(); if (newRevisions.Count == 0) return; // quick exit to prevent execute sql with WHERE FALSE clause @@ -130,5 +131,6 @@ private static bool IsLatestReplierUser(string pName, PropertyEntry p, EntityEnt private static bool IsSameUser(User a, User b) => (a.Uid, a.Name, a.DisplayName, a.Portrait, a.PortraitUpdatedAt, a.Gender, a.FansNickname, a.IpGeolocation) == (b.Uid, b.Name, b.DisplayName, b.Portrait, b.PortraitUpdatedAt, b.Gender, b.FansNickname, b.IpGeolocation) - && (a.Icon == b.Icon || (a.Icon != null && b.Icon != null && a.Icon.SequenceEqual(b.Icon))); + && (a.Icon == b.Icon + || (a.Icon != null && b.Icon != null && new ByteArrayEqualityComparer().Equals(a.Icon, b.Icon))); } diff --git a/c#/crawler/src/Tieba/Crawl/Saver/Post/ReplySaver.cs b/c#/crawler/src/Tieba/Crawl/Saver/Post/ReplySaver.cs index a0c71108..a77fd202 100644 --- a/c#/crawler/src/Tieba/Crawl/Saver/Post/ReplySaver.cs +++ b/c#/crawler/src/Tieba/Crawl/Saver/Post/ReplySaver.cs @@ -130,7 +130,7 @@ private Action SaveReplySignatures(CrawlerDbContext db, IEnumerable r { UserId = r.AuthorUid, SignatureId = (uint)r.SignatureId!, - XxHash3 = XxHash3.HashToUInt64(r.Signature!), + XxHash3 = XxHash3.Hash(r.Signature!), ProtoBufBytes = r.Signature!, FirstSeenAt = now, LastSeenAt = now @@ -142,7 +142,7 @@ private Action SaveReplySignatures(CrawlerDbContext db, IEnumerable r var existingSignatures = ( from s in db.ReplySignatures.AsTracking().ForUpdate() where uniqueSignatures.Select(us => us.Id).Contains(s.SignatureId) - && uniqueSignatures.Select(us => us.XxHash3).Contains(s.XxHash3) + && uniqueSignatures.Select(us => us.XxHash3).Contains(s.XxHash3, new ByteArrayEqualityComparer()) select s ).ToList(); (from existing in existingSignatures @@ -170,5 +170,17 @@ join newInReply in signatures on existing.SignatureId equals newInReply.Signatur }; } - private sealed record UniqueSignature(uint Id, ulong XxHash3); + private sealed record UniqueSignature(uint Id, byte[] XxHash3) + { + public bool Equals(UniqueSignature? other) => + other != null && Id == other.Id && new ByteArrayEqualityComparer().Equals(XxHash3, other.XxHash3); + + public override int GetHashCode() + { + var hash = default(HashCode); + hash.Add(Id); + hash.AddBytes(XxHash3); + return hash.ToHashCode(); + } + } } diff --git a/c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/317573671597317124 b/c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/04683F925D931C04 similarity index 100% rename from c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/317573671597317124 rename to c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/04683F925D931C04 diff --git a/c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/675439972720715839 b/c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/095FA513991E283F similarity index 100% rename from c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/675439972720715839 rename to c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/095FA513991E283F diff --git a/c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/2312022491946436839 b/c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/2015F434F12E48E7 similarity index 100% rename from c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/2312022491946436839 rename to c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/2015F434F12E48E7 diff --git a/c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/2453093087249539934 b/c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/220B232A7279A35E similarity index 100% rename from c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/2453093087249539934 rename to c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/220B232A7279A35E diff --git a/c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/5088525931300380143 b/c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/469E135FD585DDEF similarity index 100% rename from c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/5088525931300380143 rename to c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/469E135FD585DDEF diff --git a/c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/6340676997749381937 b/c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/57FE9C2296579B31 similarity index 100% rename from c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/6340676997749381937 rename to c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/57FE9C2296579B31 diff --git a/c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/7029493641152027663 b/c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/618DC73907BBDC0F similarity index 100% rename from c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/7029493641152027663 rename to c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/618DC73907BBDC0F diff --git a/c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/8572314333541615503 b/c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/76F6FA77E1BCC78F similarity index 100% rename from c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/8572314333541615503 rename to c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/76F6FA77E1BCC78F diff --git a/c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/10691729796898709084 b/c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/9460A79A629B5E5C similarity index 100% rename from c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/10691729796898709084 rename to c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/9460A79A629B5E5C diff --git a/c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/11377371376846116862 b/c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/9DE48AFCC29C17FE similarity index 100% rename from c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/11377371376846116862 rename to c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/9DE48AFCC29C17FE diff --git a/c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/11425690781213542939 b/c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/9E90353AC7B04A1B similarity index 100% rename from c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/11425690781213542939 rename to c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/9E90353AC7B04A1B diff --git a/c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/11564595951513508422 b/c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/A07DB2BEFCE21A46 similarity index 100% rename from c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/11564595951513508422 rename to c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/A07DB2BEFCE21A46 diff --git a/c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/11631441974507259315 b/c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/A16B2ED98C12C5B3 similarity index 100% rename from c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/11631441974507259315 rename to c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/A16B2ED98C12C5B3 diff --git a/c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/12790265642504860643 b/c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/B18026D60C62B3E3 similarity index 100% rename from c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/12790265642504860643 rename to c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/B18026D60C62B3E3 diff --git a/c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/13825194526826816856 b/c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/BFDCF52C522FE558 similarity index 100% rename from c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/13825194526826816856 rename to c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/BFDCF52C522FE558 diff --git a/c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/15295336236538839648 b/c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/D443F344DB0EC260 similarity index 100% rename from c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/15295336236538839648 rename to c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/D443F344DB0EC260 diff --git a/c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/17530108583999846437 b/c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/F34774E0EF563825 similarity index 100% rename from c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/17530108583999846437 rename to c#/imagePipeline/CommonEmbeddedMetadataXxHash3ToIgnore/ICC/F34774E0EF563825 diff --git a/c#/imagePipeline/appsettings.json b/c#/imagePipeline/appsettings.json index 5a36c9dc..78ba0af5 100644 --- a/c#/imagePipeline/appsettings.json +++ b/c#/imagePipeline/appsettings.json @@ -28,23 +28,23 @@ "CommonEmbeddedMetadataXxHash3ToIgnore": { "Exif": [], "Icc": [ - 15295336236538839648, - 5088525931300380143, - 11377371376846116862, - 12790265642504860643, - 2312022491946436839, - 13825194526826816856, - 11564595951513508422, - 11425690781213542939, - 7029493641152027663, - 675439972720715839, - 17530108583999846437, - 8572314333541615503, - 2453093087249539934, - 6340676997749381937, - 10691729796898709084, - 11631441974507259315, - 317573671597317124 + "D443F344DB0EC260", + "469E135FD585DDEF", + "9DE48AFCC29C17FE", + "B18026D60C62B3E3", + "2015F434F12E48E7", + "BFDCF52C522FE558", + "A07DB2BEFCE21A46", + "9E90353AC7B04A1B", + "618DC73907BBDC0F", + "095FA513991E283F", + "F34774E0EF563825", + "76F6FA77E1BCC78F", + "220B232A7279A35E", + "57FE9C2296579B31", + "9460A79A629B5E5C", + "A16B2ED98C12C5B3", + "04683F925D931C04" ], "Iptc": [], "Xmp": [] diff --git a/c#/imagePipeline/src/Consumer/HashConsumer.cs b/c#/imagePipeline/src/Consumer/HashConsumer.cs index a144ed0f..eff05a10 100644 --- a/c#/imagePipeline/src/Consumer/HashConsumer.cs +++ b/c#/imagePipeline/src/Consumer/HashConsumer.cs @@ -15,8 +15,8 @@ public HashConsumer(FailedImageHandler failedImageHandler) _failedImageHandler = failedImageHandler; _imageHashSettersKeyByAlgorithm = new Dictionary> { - {PHash.Create(), (image, bytes) => image.PHash = BitConverter.ToUInt64(bytes)}, - {AverageHash.Create(), (image, bytes) => image.AverageHash = BitConverter.ToUInt64(bytes)}, + {PHash.Create(), (image, bytes) => image.PHash = bytes}, + {AverageHash.Create(), (image, bytes) => image.AverageHash = bytes}, {BlockMeanHash.Create(), (image, bytes) => image.BlockMeanHash = bytes}, {MarrHildrethHash.Create(), (image, bytes) => image.MarrHildrethHash = bytes} }.AsReadOnly(); @@ -35,6 +35,8 @@ protected override IEnumerable ConsumeInternal( { ImageId = imageKeyWithMatrix.ImageId, FrameIndex = imageKeyWithMatrix.FrameIndex, + PHash = [], + AverageHash = [], BlockMeanHash = [], MarrHildrethHash = [], ThumbHash = [] diff --git a/c#/imagePipeline/src/Consumer/MetadataConsumer.cs b/c#/imagePipeline/src/Consumer/MetadataConsumer.cs index 4f298656..afa5d9a6 100644 --- a/c#/imagePipeline/src/Consumer/MetadataConsumer.cs +++ b/c#/imagePipeline/src/Consumer/MetadataConsumer.cs @@ -13,7 +13,8 @@ public partial class MetadataConsumer : IConsumer { private readonly ILogger _logger; private readonly FailedImageHandler _failedImageHandler; - private readonly (ulong[] Exif, ulong[] Icc, ulong[] Iptc, ulong[] Xmp) _commonEmbeddedMetadataXxHash3ToIgnore; + private readonly (IEnumerable Exif, IEnumerable Icc, IEnumerable Iptc, IEnumerable Xmp) + _commonEmbeddedMetadataXxHash3ToIgnore; static MetadataConsumer() => NetTopologySuite.NtsGeometryServices.Instance = new( coordinateSequenceFactory: NetTopologySuite.Geometries.Implementation.CoordinateArraySequenceFactory.Instance, @@ -27,8 +28,8 @@ public MetadataConsumer { (_logger, _failedImageHandler) = (logger, failedImageHandler); var section = config.GetSection("MetadataConsumer").GetSection("CommonEmbeddedMetadataXxHash3ToIgnore"); - ulong[] GetCommonXxHash3ToIgnore(string key) => - section.GetSection(key).Get() ?? []; + IEnumerable GetCommonXxHash3ToIgnore(string key) => + (section.GetSection(key).Get() ?? []).Select(Convert.FromHexString); _commonEmbeddedMetadataXxHash3ToIgnore = ( Exif: GetCommonXxHash3ToIgnore("Exif"), Icc: GetCommonXxHash3ToIgnore("Icc"), @@ -70,7 +71,7 @@ private Func GetImageMetaData Height = (ushort)info.Height, BitsPerPixel = (ushort)info.PixelType.BitsPerPixel, FrameCount = (uint)info.FrameMetadataCollection.Count, - XxHash3 = XxHash3.HashToUInt64(imageBytes), + XxHash3 = XxHash3.Hash(imageBytes), DownloadedByteSize = image.ExpectedByteSize == imageBytes.Length ? null : new() {DownloadedByteSize = (uint)imageBytes.Length}, @@ -89,7 +90,7 @@ private Func GetImageMetaData }; private TEmbeddedMetadata? CreateEmbeddedFromProfile( - IEnumerable commonXxHash3ToIgnore, + IEnumerable commonXxHash3ToIgnore, TImageSharpProfile? profile, Func rawBytesSelector) where TImageSharpProfile : class @@ -101,11 +102,11 @@ private Func GetImageMetaData if (rawBytes.Length > 65535) _logger.LogWarning("Embedded {} in image contains {} bytes", typeof(TEmbeddedMetadata).Name.ToUpperInvariant(), rawBytes.Length); - var xxHash3 = XxHash3.HashToUInt64(rawBytes); + var xxHash3 = XxHash3.Hash(rawBytes); return new() { XxHash3 = xxHash3, - RawBytes = commonXxHash3ToIgnore.Contains(xxHash3) ? null : rawBytes + RawBytes = commonXxHash3ToIgnore.Contains(xxHash3, new ByteArrayEqualityComparer()) ? null : rawBytes }; } diff --git a/c#/imagePipeline/src/Db/ImageHash.cs b/c#/imagePipeline/src/Db/ImageHash.cs index cebc8905..27179d05 100644 --- a/c#/imagePipeline/src/Db/ImageHash.cs +++ b/c#/imagePipeline/src/Db/ImageHash.cs @@ -5,8 +5,8 @@ public class ImageHash { public uint ImageId { get; set; } public uint FrameIndex { get; set; } - public ulong PHash { get; set; } - public ulong AverageHash { get; set; } + public required byte[] PHash { get; set; } + public required byte[] AverageHash { get; set; } public required byte[] BlockMeanHash { get; set; } public required byte[] MarrHildrethHash { get; set; } public required byte[] ThumbHash { get; set; } diff --git a/c#/imagePipeline/src/Db/ImageMetadata.cs b/c#/imagePipeline/src/Db/ImageMetadata.cs index ffa7ee09..20aea734 100644 --- a/c#/imagePipeline/src/Db/ImageMetadata.cs +++ b/c#/imagePipeline/src/Db/ImageMetadata.cs @@ -17,7 +17,7 @@ public interface IImageMetadata public interface IEmbedded : IImageMetadata { - public ulong XxHash3 { get; set; } + public byte[] XxHash3 { get; set; } public byte[]? RawBytes { get; set; } } @@ -27,7 +27,7 @@ public interface IEmbedded : IImageMetadata public ushort Height { get; set; } public ushort BitsPerPixel { get; set; } public uint FrameCount { get; set; } - public ulong XxHash3 { get; set; } + public required byte[] XxHash3 { get; set; } public ByteSize? DownloadedByteSize { get; set; } public Exif? EmbeddedExif { get; set; } public Icc? EmbeddedIcc { get; set; } @@ -88,12 +88,12 @@ public enum ExifOrientation public Point? GpsCoordinate { get; set; } public float? GpsImgDirection { get; set; } public string? GpsImgDirectionRef { get; set; } - public ulong XxHash3 { get; set; } + public byte[] XxHash3 { get; set; } = null!; public byte[]? RawBytes { get; set; } // workaround to work with MetadataConsumer.CreateEmbeddedFromProfile() // https://stackoverflow.com/questions/75266722/type-cannot-satisfy-the-new-constraint-on-parameter-tparam-because-type - public ICollection TagNames { get; set; } = []; + public IEnumerable TagNames { get; set; } = []; public class TagName : IImageMetadata { @@ -105,21 +105,21 @@ public class TagName : IImageMetadata public class Icc : IEmbedded { [Key] public uint ImageId { get; set; } - public ulong XxHash3 { get; set; } + public byte[] XxHash3 { get; set; } = null!; public byte[]? RawBytes { get; set; } } public class Iptc : IEmbedded { [Key] public uint ImageId { get; set; } - public ulong XxHash3 { get; set; } + public byte[] XxHash3 { get; set; } = null!; public byte[]? RawBytes { get; set; } } public class Xmp : IEmbedded { [Key] public uint ImageId { get; set; } - public ulong XxHash3 { get; set; } + public byte[] XxHash3 { get; set; } = null!; public byte[]? RawBytes { get; set; } } diff --git a/c#/shared/src/ByteArrayEqualityComparer.cs b/c#/shared/src/ByteArrayEqualityComparer.cs new file mode 100644 index 00000000..9516c069 --- /dev/null +++ b/c#/shared/src/ByteArrayEqualityComparer.cs @@ -0,0 +1,15 @@ +namespace tbm.Shared +{ + public class ByteArrayEqualityComparer : EqualityComparer + { + public override bool Equals(byte[]? x, byte[]? y) => + x == y || (x != null && y != null && x.AsSpan().SequenceEqual(y.AsSpan())); + + public override int GetHashCode(byte[] obj) + { + var hash = default(HashCode); + hash.AddBytes(obj); + return hash.ToHashCode(); + } + } +}