From 252710bf61ca63cc78eeb6373b4c564e8dbb0217 Mon Sep 17 00:00:00 2001 From: n0099 Date: Tue, 2 Apr 2024 00:49:32 +0800 Subject: [PATCH] * prefer `I?ReadOnlyDictionary<,>` over `I?Dictionary<,>` * add field for auto prop `SplitEntities` @ RevisionWithSplitting.cs * using primary ctor @ SaverChangeSet.cs @ c#/crawler --- .../src/Db/Revision/RevisionWithSplitting.cs | 10 +++--- c#/crawler/src/Tieba/ClientRequester.cs | 4 +-- c#/crawler/src/Tieba/Crawl/CrawlPost.cs | 5 +-- .../src/Tieba/Crawl/Facade/BaseCrawlFacade.cs | 2 +- .../Tieba/Crawl/Facade/SubReplyCrawlFacade.cs | 2 +- .../Crawl/Facade/ThreadArchiveCrawlFacade.cs | 2 +- .../Tieba/Crawl/Facade/ThreadCrawlFacade.cs | 34 +++++++++--------- .../Crawl/Facade/ThreadLateCrawlFacade.cs | 2 +- .../Tieba/Crawl/Saver/IRevisionProperties.cs | 6 ++-- .../src/Tieba/Crawl/Saver/SaverChangeSet.cs | 35 +++++++++---------- .../Tieba/Crawl/Saver/SaverWithRevision.cs | 2 +- c#/imagePipeline/src/Consumer/HashConsumer.cs | 7 ++-- 12 files changed, 56 insertions(+), 55 deletions(-) diff --git a/c#/crawler/src/Db/Revision/RevisionWithSplitting.cs b/c#/crawler/src/Db/Revision/RevisionWithSplitting.cs index 8ad79d14..9c14b5f2 100644 --- a/c#/crawler/src/Db/Revision/RevisionWithSplitting.cs +++ b/c#/crawler/src/Db/Revision/RevisionWithSplitting.cs @@ -3,16 +3,18 @@ namespace tbm.Crawler.Db.Revision; public abstract class RevisionWithSplitting : IRevision where TBaseRevision : class, IRevision { + private readonly Dictionary _splitEntities = []; + public uint TakenAt { get; set; } public ushort? NullFieldsBitMask { get; set; } - public IDictionary SplitEntities { get; } = new Dictionary(); + public IReadOnlyDictionary SplitEntities => _splitEntities; public virtual bool IsAllFieldsIsNullExceptSplit() => throw new NotSupportedException(); protected TValue? GetSplitEntityValue (Func valueSelector) where TSplitEntity : class, TBaseRevision => - SplitEntities.TryGetValue(typeof(TSplitEntity), out var entity) + _splitEntities.TryGetValue(typeof(TSplitEntity), out var entity) ? valueSelector((TSplitEntity)entity) : default; @@ -20,10 +22,10 @@ protected void SetSplitEntityValue (TValue? value, Action valueSetter, Func entityFactory) where TSplitEntity : class, TBaseRevision { - if (SplitEntities.TryGetValue(typeof(TSplitEntity), out var entity)) + if (_splitEntities.TryGetValue(typeof(TSplitEntity), out var entity)) valueSetter((TSplitEntity)entity, value); else - SplitEntities[typeof(TSplitEntity)] = entityFactory(); + _splitEntities[typeof(TSplitEntity)] = entityFactory(); } public class ModelBuilderExtension(ModelBuilder builder, string baseTableName) diff --git a/c#/crawler/src/Tieba/ClientRequester.cs b/c#/crawler/src/Tieba/ClientRequester.cs index f2f71c52..e7a58a22 100644 --- a/c#/crawler/src/Tieba/ClientRequester.cs +++ b/c#/crawler/src/Tieba/ClientRequester.cs @@ -19,7 +19,7 @@ public class ClientRequester( public async Task RequestJson( string url, string clientVersion, - IDictionary postParam, + IReadOnlyDictionary postParam, CancellationToken stoppingToken = default) => await Request(() => PostJson(url, postParam, clientVersion, stoppingToken), stream => { @@ -84,7 +84,7 @@ private static async Task Request private async Task PostJson( string url, - IDictionary postParam, + IReadOnlyDictionary postParam, string clientVersion, CancellationToken stoppingToken = default) { diff --git a/c#/crawler/src/Tieba/Crawl/CrawlPost.cs b/c#/crawler/src/Tieba/Crawl/CrawlPost.cs index e2bce116..cc9ce99a 100644 --- a/c#/crawler/src/Tieba/Crawl/CrawlPost.cs +++ b/c#/crawler/src/Tieba/Crawl/CrawlPost.cs @@ -46,7 +46,8 @@ public async Task CrawlThreads var threadsLatestReplyPostedAt = currentPageChangeSet.AllAfter .Select(th => th.LatestReplyPostedAt).ToList(); minLatestReplyPostedAt = threadsLatestReplyPostedAt.Min(); - if (crawlingPage == 1) _latestReplyPostedAtCheckpointCache[fid] = threadsLatestReplyPostedAt.Max(); + if (crawlingPage == 1) + _latestReplyPostedAtCheckpointCache[fid] = threadsLatestReplyPostedAt.Max(); } else { // retry this page @@ -98,7 +99,7 @@ await Task.WhenAll(shouldCrawlParentPosts.Select(async tid => } public async Task CrawlSubReplies( - IDictionary> savedRepliesKeyByTid, + IReadOnlyDictionary> savedRepliesKeyByTid, Fid fid, CancellationToken stoppingToken = default) { diff --git a/c#/crawler/src/Tieba/Crawl/Facade/BaseCrawlFacade.cs b/c#/crawler/src/Tieba/Crawl/Facade/BaseCrawlFacade.cs index 4ddd4374..ddd0c938 100644 --- a/c#/crawler/src/Tieba/Crawl/Facade/BaseCrawlFacade.cs +++ b/c#/crawler/src/Tieba/Crawl/Facade/BaseCrawlFacade.cs @@ -122,7 +122,7 @@ protected virtual void ThrowIfEmptyUsersEmbedInPosts() { } protected virtual void PostParseHook( TResponse response, CrawlRequestFlag flag, - IDictionary parsedPostsInResponse) { } + IReadOnlyDictionary parsedPostsInResponse) { } protected virtual void BeforeCommitSaveHook(CrawlerDbContext db, UserSaver userSaver) { } protected virtual void PostCommitSaveHook( SaverChangeSet savedPosts, diff --git a/c#/crawler/src/Tieba/Crawl/Facade/SubReplyCrawlFacade.cs b/c#/crawler/src/Tieba/Crawl/Facade/SubReplyCrawlFacade.cs index 95a1542b..b5c00303 100644 --- a/c#/crawler/src/Tieba/Crawl/Facade/SubReplyCrawlFacade.cs +++ b/c#/crawler/src/Tieba/Crawl/Facade/SubReplyCrawlFacade.cs @@ -24,7 +24,7 @@ public class SubReplyCrawlFacade( protected override void PostParseHook( SubReplyResponse response, CrawlRequestFlag flag, - IDictionary parsedPostsInResponse) + IReadOnlyDictionary parsedPostsInResponse) { foreach (var sr in parsedPostsInResponse.Values) { diff --git a/c#/crawler/src/Tieba/Crawl/Facade/ThreadArchiveCrawlFacade.cs b/c#/crawler/src/Tieba/Crawl/Facade/ThreadArchiveCrawlFacade.cs index c3978c94..489c0e41 100644 --- a/c#/crawler/src/Tieba/Crawl/Facade/ThreadArchiveCrawlFacade.cs +++ b/c#/crawler/src/Tieba/Crawl/Facade/ThreadArchiveCrawlFacade.cs @@ -19,7 +19,7 @@ public class ThreadArchiveCrawlFacade( protected override void PostParseHook( ThreadResponse response, CrawlRequestFlag flag, - IDictionary parsedPostsInResponse) + IReadOnlyDictionary parsedPostsInResponse) { // the second respond with flag is as same as the first one so just skip it if (flag == CrawlRequestFlag.ThreadClientVersion602) return; var data = response.Data; diff --git a/c#/crawler/src/Tieba/Crawl/Facade/ThreadCrawlFacade.cs b/c#/crawler/src/Tieba/Crawl/Facade/ThreadCrawlFacade.cs index 3965c216..41b65213 100644 --- a/c#/crawler/src/Tieba/Crawl/Facade/ThreadCrawlFacade.cs +++ b/c#/crawler/src/Tieba/Crawl/Facade/ThreadCrawlFacade.cs @@ -42,6 +42,23 @@ protected override void BeforeCommitSaveHook(CrawlerDbContext db, UserSaver user _ = db.Users.UpsertRange(newLatestRepliersExceptLocked).NoUpdate().Run(); } + protected override void PostParseHook( + ThreadResponse response, + CrawlRequestFlag flag, + IReadOnlyDictionary parsedPostsInResponse) + { + var data = response.Data; + if (flag == CrawlRequestFlag.ThreadClientVersion602) FillFromRequestingWith602(data.ThreadList); + if (flag != CrawlRequestFlag.None) return; + UserParser.Parse(data.UserList); + UserParser.ResetUsersIcon(); + ParseLatestRepliers(data.ThreadList); + + // remove livepost threads since their real parent forum may not match with current crawling fid + data.ThreadList.Where(th => th.LivePostType != "") + .ForEach(th => Posts.TryRemove((Tid)th.Tid, out _)); + } + protected void ParseLatestRepliers(IEnumerable threads) => threads.Select(th => th.LastReplyer ?? null) // LastReplyer will be null when LivePostType != "" .OfType() // filter out nulls @@ -67,21 +84,4 @@ join parsed in Posts.Values on (Tid)inResponse.Tid equals parsed.Tid // LastReplyer will be null when LivePostType != "", but LastTimeInt will have expected timestamp value t.parsed.LatestReplierUid = t.inResponse.LastReplyer?.Uid; }); - - protected override void PostParseHook( - ThreadResponse response, - CrawlRequestFlag flag, - IDictionary parsedPostsInResponse) - { - var data = response.Data; - if (flag == CrawlRequestFlag.ThreadClientVersion602) FillFromRequestingWith602(data.ThreadList); - if (flag != CrawlRequestFlag.None) return; - UserParser.Parse(data.UserList); - UserParser.ResetUsersIcon(); - ParseLatestRepliers(data.ThreadList); - - // remove livepost threads since their real parent forum may not match with current crawling fid - data.ThreadList.Where(th => th.LivePostType != "") - .ForEach(th => Posts.TryRemove((Tid)th.Tid, out _)); - } } diff --git a/c#/crawler/src/Tieba/Crawl/Facade/ThreadLateCrawlFacade.cs b/c#/crawler/src/Tieba/Crawl/Facade/ThreadLateCrawlFacade.cs index 36b0235c..65411620 100644 --- a/c#/crawler/src/Tieba/Crawl/Facade/ThreadLateCrawlFacade.cs +++ b/c#/crawler/src/Tieba/Crawl/Facade/ThreadLateCrawlFacade.cs @@ -8,7 +8,7 @@ public class ThreadLateCrawlFacade( public delegate ThreadLateCrawlFacade New(Fid fid); public async Task CrawlThenSave( - IDictionary failureCountsKeyByTid, + IReadOnlyDictionary failureCountsKeyByTid, CancellationToken stoppingToken = default) { var threads = await Task.WhenAll( diff --git a/c#/crawler/src/Tieba/Crawl/Saver/IRevisionProperties.cs b/c#/crawler/src/Tieba/Crawl/Saver/IRevisionProperties.cs index 3e7a209b..7086bf96 100644 --- a/c#/crawler/src/Tieba/Crawl/Saver/IRevisionProperties.cs +++ b/c#/crawler/src/Tieba/Crawl/Saver/IRevisionProperties.cs @@ -2,10 +2,10 @@ namespace tbm.Crawler.Tieba.Crawl.Saver; public interface IRevisionProperties { - protected static IDictionary> Cache { get; } = GetPropsKeyByType( + protected static IReadOnlyDictionary> Cache { get; } = GetPropsKeyByType( [typeof(ThreadRevision), typeof(ReplyRevision), typeof(SubReplyRevision), typeof(UserRevision)]); - private static IDictionary> GetPropsKeyByType(IEnumerable types) => + private static IReadOnlyDictionary> GetPropsKeyByType(IEnumerable types) => types.ToDictionary(type => type, type => - (IDictionary)type.GetProperties().ToDictionary(prop => prop.Name)); + (IReadOnlyDictionary)type.GetProperties().ToDictionary(prop => prop.Name)); } diff --git a/c#/crawler/src/Tieba/Crawl/Saver/SaverChangeSet.cs b/c#/crawler/src/Tieba/Crawl/Saver/SaverChangeSet.cs index 772f28cc..2728c322 100644 --- a/c#/crawler/src/Tieba/Crawl/Saver/SaverChangeSet.cs +++ b/c#/crawler/src/Tieba/Crawl/Saver/SaverChangeSet.cs @@ -2,27 +2,24 @@ namespace tbm.Crawler.Tieba.Crawl.Saver; -public class SaverChangeSet where TPost : class, IPost +public class SaverChangeSet( + IReadOnlyCollection existingBefore, + ICollection existingAfterAndNewlyAdded, + Func postIdSelector) + where TPost : class, IPost { - public SaverChangeSet( - IReadOnlyCollection existingBefore, - ICollection existingAfterAndNewlyAdded, - Func postIdSelector) - { - Existing = existingBefore - .OrderBy(postIdSelector) - .EquiZip(existingAfterAndNewlyAdded + public IReadOnlyCollection<(TPost Before, TPost After)> Existing { get; } = existingBefore + .OrderBy(postIdSelector) + .EquiZip(existingAfterAndNewlyAdded .IntersectBy(existingBefore.Select(postIdSelector), postIdSelector) .OrderBy(postIdSelector), - (before, after) => (before, after)) - .ToList().AsReadOnly(); - NewlyAdded = existingAfterAndNewlyAdded - .ExceptBy(existingBefore.Select(postIdSelector), postIdSelector) - .ToList().AsReadOnly(); - AllAfter = existingAfterAndNewlyAdded.ToList().AsReadOnly(); - } + (before, after) => (before, after)) + .ToList().AsReadOnly(); - public IReadOnlyCollection<(TPost Before, TPost After)> Existing { get; } - public IReadOnlyCollection NewlyAdded { get; } - public IReadOnlyCollection AllAfter { get; } + public IReadOnlyCollection NewlyAdded { get; } = existingAfterAndNewlyAdded + .ExceptBy(existingBefore.Select(postIdSelector), postIdSelector) + .ToList().AsReadOnly(); + + public IReadOnlyCollection AllAfter { get; } = existingAfterAndNewlyAdded + .ToList().AsReadOnly(); } diff --git a/c#/crawler/src/Tieba/Crawl/Saver/SaverWithRevision.cs b/c#/crawler/src/Tieba/Crawl/Saver/SaverWithRevision.cs index 0dbe91fa..c1bf5f67 100644 --- a/c#/crawler/src/Tieba/Crawl/Saver/SaverWithRevision.cs +++ b/c#/crawler/src/Tieba/Crawl/Saver/SaverWithRevision.cs @@ -5,7 +5,7 @@ public abstract class SaverWithRevision : IRevisionProperties { protected delegate void RevisionUpsertDelegate(CrawlerDbContext db, IEnumerable revision); - protected virtual IDictionary RevisionUpsertDelegatesKeyBySplitEntityType => + protected virtual IReadOnlyDictionary RevisionUpsertDelegatesKeyBySplitEntityType => throw new NotSupportedException(); protected virtual NullFieldsBitMask GetRevisionNullFieldBitMask(string fieldName) => diff --git a/c#/imagePipeline/src/Consumer/HashConsumer.cs b/c#/imagePipeline/src/Consumer/HashConsumer.cs index 16d59f3c..a144ed0f 100644 --- a/c#/imagePipeline/src/Consumer/HashConsumer.cs +++ b/c#/imagePipeline/src/Consumer/HashConsumer.cs @@ -1,3 +1,4 @@ +using System.Collections.ObjectModel; using OpenCvSharp.ImgHash; using Size = OpenCvSharp.Size; @@ -6,19 +7,19 @@ namespace tbm.ImagePipeline.Consumer; public sealed class HashConsumer : MatrixConsumer, IDisposable { private readonly FailedImageHandler _failedImageHandler; - private readonly Dictionary> _imageHashSettersKeyByAlgorithm; + private readonly ReadOnlyDictionary> _imageHashSettersKeyByAlgorithm; [SuppressMessage("Correctness", "SS004:Implement Equals() and GetHashcode() methods for a type used in a collection.")] public HashConsumer(FailedImageHandler failedImageHandler) { _failedImageHandler = failedImageHandler; - _imageHashSettersKeyByAlgorithm = new() + _imageHashSettersKeyByAlgorithm = new Dictionary> { {PHash.Create(), (image, bytes) => image.PHash = BitConverter.ToUInt64(bytes)}, {AverageHash.Create(), (image, bytes) => image.AverageHash = BitConverter.ToUInt64(bytes)}, {BlockMeanHash.Create(), (image, bytes) => image.BlockMeanHash = bytes}, {MarrHildrethHash.Create(), (image, bytes) => image.MarrHildrethHash = bytes} - }; + }.AsReadOnly(); } public void Dispose() => _imageHashSettersKeyByAlgorithm.Keys.ForEach(hash => hash.Dispose());