From a57af6701e3710998039b8541f676ab46c8c2606 Mon Sep 17 00:00:00 2001 From: n0099 Date: Sun, 12 May 2024 02:49:06 +0800 Subject: [PATCH] * move methods `GetNowTimestamp()` & `UnescapedJsonSerialize()` & field `JsonSerializerOptions` to `tbm.Shared.BaseHelper` @ Helper.cs @ crawler + static class `BaseHelper` * now will log at warn level and serialize entities count group by type to json @ `TbmDbContext.LogDbUpdateConcurrencyException()` @ shared @ c# --- c#/crawler/src/Db/CrawlerDbContext.cs | 2 +- c#/crawler/src/Helper.cs | 16 +-------------- c#/crawler/src/Tieba/Crawl/CrawlPost.cs | 2 +- .../Tieba/Crawl/Crawler/ThreadLateCrawler.cs | 2 +- c#/crawler/src/Tieba/Crawl/CrawlerLocks.cs | 6 +++--- .../src/Tieba/Crawl/Facade/CrawlFacade.cs | 4 ++-- .../Tieba/Crawl/Facade/ThreadCrawlFacade.cs | 2 +- .../Tieba/Crawl/Parser/Post/ReplyParser.cs | 2 +- .../src/Tieba/Crawl/Parser/UserParser.cs | 2 +- .../Tieba/Crawl/Saver/AuthorRevisionSaver.cs | 2 +- c#/crawler/src/Tieba/Crawl/Saver/BaseSaver.cs | 4 ++-- .../src/Tieba/Crawl/Saver/Post/ReplySaver.cs | 2 +- .../ForumModeratorRevisionCrawlWorker.cs | 2 +- c#/crawler/src/Worker/RetryCrawlWorker.cs | 2 +- c#/shared/BaseHelper.cs | 20 +++++++++++++++++++ c#/shared/src/Db/TbmDbContext.cs | 7 ++++--- 16 files changed, 42 insertions(+), 35 deletions(-) create mode 100644 c#/shared/BaseHelper.cs diff --git a/c#/crawler/src/Db/CrawlerDbContext.cs b/c#/crawler/src/Db/CrawlerDbContext.cs index ed6efca9..f76deb30 100644 --- a/c#/crawler/src/Db/CrawlerDbContext.cs +++ b/c#/crawler/src/Db/CrawlerDbContext.cs @@ -32,7 +32,7 @@ public void TimestampingEntities() => // https://www.entityframeworktutorial.net/faq/set-created-and-modified-date-in-efcore.aspx ChangeTracker.Entries().ForEach(e => { - Helper.GetNowTimestamp(out var now); + BaseHelper.GetNowTimestamp(out var now); var originalEntityState = e.State; // copy e.State since it might change after any prop value updated var createdAtProp = e.Property(ie => ie.CreatedAt); var 
updatedAtProp = e.Property(ie => ie.UpdatedAt); diff --git a/c#/crawler/src/Helper.cs b/c#/crawler/src/Helper.cs index e3aa7cdf..fc4fd077 100644 --- a/c#/crawler/src/Helper.cs +++ b/c#/crawler/src/Helper.cs @@ -1,10 +1,7 @@ -using System.Text.Encodings.Web; -using System.Text.Unicode; - namespace tbm.Crawler; #pragma warning disable AV1708 // Type name contains term that should be avoided -public abstract partial class Helper +public static class Helper #pragma warning restore AV1708 // Type name contains term that should be avoided { [SuppressMessage("Member Design", "AV1130:Return type in method signature should be an interface to an unchangeable collection")] @@ -23,15 +20,4 @@ public static byte[]? SerializedProtoBufWrapperOrNullIfEmpty public static PostContentWrapper? WrapPostContent(IEnumerable? contents) => contents == null ? null : new() {Value = {contents}}; - - public static void GetNowTimestamp(out Time now) => now = GetNowTimestamp(); - [SuppressMessage("Maintainability", "AV1551:Method overload should call another overload")] - public static Time GetNowTimestamp() => (Time)DateTimeOffset.Now.ToUnixTimeSeconds(); -} -public abstract partial class Helper -{ - private static readonly JsonSerializerOptions UnescapedSerializeOptions = - new() {Encoder = JavaScriptEncoder.Create(UnicodeRanges.All)}; - public static string UnescapedJsonSerialize(TValue value) => - JsonSerializer.Serialize(value, UnescapedSerializeOptions); } diff --git a/c#/crawler/src/Tieba/Crawl/CrawlPost.cs b/c#/crawler/src/Tieba/Crawl/CrawlPost.cs index ad386a30..535359cd 100644 --- a/c#/crawler/src/Tieba/Crawl/CrawlPost.cs +++ b/c#/crawler/src/Tieba/Crawl/CrawlPost.cs @@ -145,7 +145,7 @@ private Action SaveThreadMissingFirstReply Pid = thread.FirstReplyPid, Excerpt = Helper.SerializedProtoBufWrapperOrNullIfEmpty(thread.FirstReplyExcerpt, () => new ThreadAbstractWrapper {Value = {thread.FirstReplyExcerpt}}), - LastSeenAt = Helper.GetNowTimestamp() + LastSeenAt = 
BaseHelper.GetNowTimestamp() }; if (newEntity.Pid == null && newEntity.Excerpt == null) return; // skip if all fields are empty diff --git a/c#/crawler/src/Tieba/Crawl/Crawler/ThreadLateCrawler.cs b/c#/crawler/src/Tieba/Crawl/Crawler/ThreadLateCrawler.cs index 812e7fbe..01554bd2 100644 --- a/c#/crawler/src/Tieba/Crawl/Crawler/ThreadLateCrawler.cs +++ b/c#/crawler/src/Tieba/Crawl/Crawler/ThreadLateCrawler.cs @@ -92,7 +92,7 @@ public class ThreadLateCrawler( { logger.LogWarning("TiebaException: {} {}", string.Join(' ', e.GetInnerExceptions().Select(ex => ex.Message)), - Helper.UnescapedJsonSerialize(e.Data)); + BaseHelper.UnescapedJsonSerialize(e.Data)); } else { diff --git a/c#/crawler/src/Tieba/Crawl/CrawlerLocks.cs b/c#/crawler/src/Tieba/Crawl/CrawlerLocks.cs index 97e9f053..6704ece8 100644 --- a/c#/crawler/src/Tieba/Crawl/CrawlerLocks.cs +++ b/c#/crawler/src/Tieba/Crawl/CrawlerLocks.cs @@ -23,7 +23,7 @@ public IReadOnlySet AcquireRange(LockId lockId, IEnumerable pages) var acquiredPages = pages.ToHashSet(); lock (_crawling) { // lock the entire ConcurrentDictionary since following bulk insert should be a single atomic operation - Helper.GetNowTimestamp(out var now); + BaseHelper.GetNowTimestamp(out var now); if (!_crawling.ContainsKey(lockId)) { // if no one is locking any page in lockId, just insert pages then return it as is var pageTimeDict = acquiredPages.Select(page => KeyValuePair.Create(page, now)); @@ -114,9 +114,9 @@ protected override void LogTrace() logger.LogTrace("Lock: type={} crawlingIdCount={} crawlingPageCount={} crawlingPageCountsKeyById={}" + " failedIdCount={} failedPageCount={} failures={}", LockType, _crawling.Count, _crawling.Values.Sum(d => d.Count), - Helper.UnescapedJsonSerialize(_crawling.ToDictionary(pair => pair.Key.ToString(), pair => pair.Value.Count)), + BaseHelper.UnescapedJsonSerialize(_crawling.ToDictionary(pair => pair.Key.ToString(), pair => pair.Value.Count)), _failed.Count, _failed.Values.Sum(d => d.Count), - 
Helper.UnescapedJsonSerialize(_failed.ToDictionary(pair => pair.Key.ToString(), pair => pair.Value))); + BaseHelper.UnescapedJsonSerialize(_failed.ToDictionary(pair => pair.Key.ToString(), pair => pair.Value))); } } diff --git a/c#/crawler/src/Tieba/Crawl/Facade/CrawlFacade.cs b/c#/crawler/src/Tieba/Crawl/Facade/CrawlFacade.cs index e2487628..150217e4 100644 --- a/c#/crawler/src/Tieba/Crawl/Facade/CrawlFacade.cs +++ b/c#/crawler/src/Tieba/Crawl/Facade/CrawlFacade.cs @@ -161,7 +161,7 @@ private async Task CrawlPages( .Select(page => (Page)page) .SequenceEqual(pages) ? $"within the range [{pages[0]}-{pages[^1]}]" - : JsonSerializer.Serialize(pages); + : BaseHelper.UnescapedJsonSerialize(pages); Logger.LogInformation("Cannot crawl any page within {} for lock type {}, id {} since they've already been locked", pagesText, locks.LockType, lockId); } @@ -198,7 +198,7 @@ private async Task LogException( { if (!te.ShouldSilent) Logger.LogWarning("TiebaException: {} {}", string.Join(' ', e.GetInnerExceptions().Select(ex => ex.Message)), - Helper.UnescapedJsonSerialize(e.Data)); + BaseHelper.UnescapedJsonSerialize(e.Data)); } else { diff --git a/c#/crawler/src/Tieba/Crawl/Facade/ThreadCrawlFacade.cs b/c#/crawler/src/Tieba/Crawl/Facade/ThreadCrawlFacade.cs index d3afc35b..e785e395 100644 --- a/c#/crawler/src/Tieba/Crawl/Facade/ThreadCrawlFacade.cs +++ b/c#/crawler/src/Tieba/Crawl/Facade/ThreadCrawlFacade.cs @@ -36,7 +36,7 @@ protected override void OnBeforeCommitSave(CrawlerDbContext db, UserSaver userSa .IntersectBy(newlyLockedLatestRepliers, u => u.Uid) .Select(u => { - u.CreatedAt = Helper.GetNowTimestamp(); + u.CreatedAt = BaseHelper.GetNowTimestamp(); return u; }); db.Users.AddRange(newLatestRepliersExceptLocked); diff --git a/c#/crawler/src/Tieba/Crawl/Parser/Post/ReplyParser.cs b/c#/crawler/src/Tieba/Crawl/Parser/Post/ReplyParser.cs index 2f55e79c..93ad9eb8 100644 --- a/c#/crawler/src/Tieba/Crawl/Parser/Post/ReplyParser.cs +++ 
b/c#/crawler/src/Tieba/Crawl/Parser/Post/ReplyParser.cs @@ -42,7 +42,7 @@ protected override ReplyPost Convert(Reply inPost) { logger.LogInformation("Detected an image in the content of reply with pid {} references to {}" + " instead of common domains of tieba image hosting service, content={}", - o.Pid, c.OriginSrc, Helper.UnescapedJsonSerialize(c)); + o.Pid, c.OriginSrc, BaseHelper.UnescapedJsonSerialize(c)); } } o.Content = Helper.SerializedProtoBufWrapperOrNullIfEmpty(inPost.Content, diff --git a/c#/crawler/src/Tieba/Crawl/Parser/UserParser.cs b/c#/crawler/src/Tieba/Crawl/Parser/UserParser.cs index 71bfd197..54c39377 100644 --- a/c#/crawler/src/Tieba/Crawl/Parser/UserParser.cs +++ b/c#/crawler/src/Tieba/Crawl/Parser/UserParser.cs @@ -40,7 +40,7 @@ public void Parse(IEnumerable inUsers) => } catch (Exception e) { - e.Data["raw"] = Helper.UnescapedJsonSerialize(el); + e.Data["raw"] = BaseHelper.UnescapedJsonSerialize(el); throw new InvalidDataException("User parse error.", e); } }).OfType().ForEach(u => users[u.Uid] = u); diff --git a/c#/crawler/src/Tieba/Crawl/Saver/AuthorRevisionSaver.cs b/c#/crawler/src/Tieba/Crawl/Saver/AuthorRevisionSaver.cs index df202930..311ab34f 100644 --- a/c#/crawler/src/Tieba/Crawl/Saver/AuthorRevisionSaver.cs +++ b/c#/crawler/src/Tieba/Crawl/Saver/AuthorRevisionSaver.cs @@ -49,7 +49,7 @@ private void SaveAuthorRevisions( where TPost : BasePost where TRevision : AuthorRevision { - Helper.GetNowTimestamp(out var now); + BaseHelper.GetNowTimestamp(out var now); var existingRevisionOfExistingUsers = dbSet.AsNoTracking() .Where(e => e.Fid == db.Fid && posts.Select(p => p.AuthorUid).Distinct().Contains(e.Uid)) diff --git a/c#/crawler/src/Tieba/Crawl/Saver/BaseSaver.cs b/c#/crawler/src/Tieba/Crawl/Saver/BaseSaver.cs index 6bd591af..84e74489 100644 --- a/c#/crawler/src/Tieba/Crawl/Saver/BaseSaver.cs +++ b/c#/crawler/src/Tieba/Crawl/Saver/BaseSaver.cs @@ -66,8 +66,8 @@ bool IsTimestampingFieldName(string name) => name is 
nameof(BasePost.LastSeenAt) logger.LogWarning("Updating field {} is not existing in revision table, " + "newValue={}, oldValue={}, newObject={}, oldObject={}", pName, ToHexWhenByteArray(p.CurrentValue), ToHexWhenByteArray(p.OriginalValue), - Helper.UnescapedJsonSerialize(newPostOrUser), - Helper.UnescapedJsonSerialize(entry.OriginalValues.ToObject())); + BaseHelper.UnescapedJsonSerialize(newPostOrUser), + BaseHelper.UnescapedJsonSerialize(entry.OriginalValues.ToObject())); } else { diff --git a/c#/crawler/src/Tieba/Crawl/Saver/Post/ReplySaver.cs b/c#/crawler/src/Tieba/Crawl/Saver/Post/ReplySaver.cs index 9dc34bad..7bcc8714 100644 --- a/c#/crawler/src/Tieba/Crawl/Saver/Post/ReplySaver.cs +++ b/c#/crawler/src/Tieba/Crawl/Saver/Post/ReplySaver.cs @@ -122,7 +122,7 @@ public partial class ReplySaver private Action SaveReplySignatures(CrawlerDbContext db, IEnumerable replies) { - Helper.GetNowTimestamp(out var now); + BaseHelper.GetNowTimestamp(out var now); var signatures = replies .Where(r => r is {SignatureId: not null, Signature: not null}) .DistinctBy(r => r.SignatureId) diff --git a/c#/crawler/src/Worker/ForumModeratorRevisionCrawlWorker.cs b/c#/crawler/src/Worker/ForumModeratorRevisionCrawlWorker.cs index 7cf80590..f7818f3a 100644 --- a/c#/crawler/src/Worker/ForumModeratorRevisionCrawlWorker.cs +++ b/c#/crawler/src/Worker/ForumModeratorRevisionCrawlWorker.cs @@ -63,7 +63,7 @@ private async Task Save( await using var transaction = await db.Database.BeginTransactionAsync (IsolationLevel.ReadCommitted, stoppingToken); - Helper.GetNowTimestamp(out var now); + BaseHelper.GetNowTimestamp(out var now); var revisions = moderators .GroupBy(t => t.Portrait) .Select(g => new ForumModeratorRevision diff --git a/c#/crawler/src/Worker/RetryCrawlWorker.cs b/c#/crawler/src/Worker/RetryCrawlWorker.cs index 2a61f625..81e57aff 100644 --- a/c#/crawler/src/Worker/RetryCrawlWorker.cs +++ b/c#/crawler/src/Worker/RetryCrawlWorker.cs @@ -70,7 +70,7 @@ FailureCount FailureCountSelector(Tid 
tid) => var failureCountsKeyByTid = tidGroupByFid .Cast().ToDictionary(tid => tid, FailureCountSelector); logger.LogTrace("Retrying previous failed thread late crawl with fid={}, threadsId={}", - fid, Helper.UnescapedJsonSerialize(tidGroupByFid)); + fid, BaseHelper.UnescapedJsonSerialize(tidGroupByFid)); await threadLateFacade.Value(fid).CrawlThenSave(failureCountsKeyByTid, stoppingToken); } } diff --git a/c#/shared/BaseHelper.cs b/c#/shared/BaseHelper.cs new file mode 100644 index 00000000..6ac98faf --- /dev/null +++ b/c#/shared/BaseHelper.cs @@ -0,0 +1,20 @@ +using System.Diagnostics.CodeAnalysis; +using System.Text.Encodings.Web; +using System.Text.Json; +using System.Text.Unicode; + +namespace tbm.Shared; + +#pragma warning disable AV1708 // Type name contains term that should be avoided +public static class BaseHelper +#pragma warning restore AV1708 // Type name contains term that should be avoided +{ + public static void GetNowTimestamp(out UInt32 now) => now = GetNowTimestamp(); + [SuppressMessage("Maintainability", "AV1551:Method overload should call another overload")] + public static UInt32 GetNowTimestamp() => (UInt32)DateTimeOffset.Now.ToUnixTimeSeconds(); + + private static readonly JsonSerializerOptions UnescapedSerializeOptions = + new() {Encoder = JavaScriptEncoder.Create(UnicodeRanges.All)}; + public static string UnescapedJsonSerialize(TValue value) => + JsonSerializer.Serialize(value, UnescapedSerializeOptions); +} diff --git a/c#/shared/src/Db/TbmDbContext.cs b/c#/shared/src/Db/TbmDbContext.cs index 61b79319..6f49a6cf 100644 --- a/c#/shared/src/Db/TbmDbContext.cs +++ b/c#/shared/src/Db/TbmDbContext.cs @@ -12,9 +12,10 @@ namespace tbm.Shared.Db; public abstract class TbmDbContext(ILogger logger) : DbContext { public void LogDbUpdateConcurrencyException(DbUpdateConcurrencyException e) => - logger.LogError(e, "DbUpdateConcurrencyException: {}", - e.Entries.GroupBy(ee => ee.Entity.GetType()) - .ToDictionary(g => g.Key, g => g.Count())); + 
logger.LogWarning(e, "DbUpdateConcurrencyException: {}", + BaseHelper.UnescapedJsonSerialize(e.Entries + .GroupBy(ee => ee.Entity.GetType()) + /* key by type name: System.Type is not a supported System.Text.Json dictionary key, serializing it would throw NotSupportedException while logging */ .ToDictionary(g => g.Key.FullName ?? g.Key.Name, g => g.Count()))); public int SaveChangesForUpdate() {