diff --git a/c#/crawler/src/Tieba/Crawl/Facade/ThreadCrawlFacade.cs b/c#/crawler/src/Tieba/Crawl/Facade/ThreadCrawlFacade.cs index 747d7863..afb69ef1 100644 --- a/c#/crawler/src/Tieba/Crawl/Facade/ThreadCrawlFacade.cs +++ b/c#/crawler/src/Tieba/Crawl/Facade/ThreadCrawlFacade.cs @@ -21,7 +21,7 @@ public class ThreadCrawlFacade( protected override void OnBeforeCommitSave(CrawlerDbContext db, UserSaver userSaver) { // OnBeforeCommitSave() should get invoked after UserSaver.Save() by the base.SaveCrawled() // so only latest repliers that not exists in parsed users are being inserted - // note this will bypass user revision detection since not invoking BaseSaver.SavePostsOrUsers() but directly DbContext.AddRange() + // note this will bypass user revision detection since not invoking BaseSaver.SaveEntitiesWithRevision() but directly DbContext.AddRange() // users has already been added into DbContext and tracking var existingUsersId = db.ChangeTracker.Entries().Select(ee => ee.Entity.Uid); diff --git a/c#/crawler/src/Tieba/Crawl/Saver/BaseSaver.cs b/c#/crawler/src/Tieba/Crawl/Saver/BaseSaver.cs index 2404796e..16b57b14 100644 --- a/c#/crawler/src/Tieba/Crawl/Saver/BaseSaver.cs +++ b/c#/crawler/src/Tieba/Crawl/Saver/BaseSaver.cs @@ -8,56 +8,58 @@ public abstract class BaseSaver(ILogger> #pragma warning restore S1939 // Inheritance list should not be redundant where TBaseRevision : BaseRevisionWithSplitting { - protected void SavePostsOrUsers( + protected void SaveEntitiesWithRevision( CrawlerDbContext db, IFieldChangeIgnorance.FieldChangeIgnoranceDelegates userFieldChangeIgnorance, - Func revisionFactory, - ILookup existingOrNewLookup, - Func existingSelector) - where TPostOrUser : class + Func revisionFactory, + ILookup existingOrNewLookup, + Func existingSelector) + where TEntity : class where TRevision : BaseRevisionWithSplitting { - db.Set().AddRange(existingOrNewLookup[false]); // newly added - var newRevisions = existingOrNewLookup[true].Select(newPostOrUser => + db.Set().AddRange(existingOrNewLookup[false]); // newly added + var newRevisions = existingOrNewLookup[true].Select(newEntity => { - var postOrUserInTracking = existingSelector(newPostOrUser); - var entry = db.Entry(postOrUserInTracking); + var entityInTracking = existingSelector(newEntity); + var entityEntry = db.Entry(entityInTracking); - // this will mutate postOrUserInTracking which is referenced by entry - entry.CurrentValues.SetValues(newPostOrUser); + // this will mutate existingEntity which is referenced by entry + entityEntry.CurrentValues.SetValues(newEntity); bool IsTimestampingFieldName(string name) => name is nameof(BasePost.LastSeenAt) or nameof(TimestampedEntity.CreatedAt) or nameof(TimestampedEntity.UpdatedAt); // rollback changes that overwrite original values with the default value 0 or null // for all fields of TimestampedEntity and BasePost.LastSeenAt - // this will also affect the entity instance which postOrUserInTracking references to it - entry.Properties + // this will also affect the entity instance which entityInTracking references to it + entityEntry.Properties .Where(prop => prop.IsModified && IsTimestampingFieldName(prop.Metadata.Name)) .ForEach(prop => prop.IsModified = false); var revision = default(TRevision); var revisionNullFieldsBitMask = 0; - var whichPostType = typeof(TPostOrUser); + var whichPostType = typeof(TEntity); var entryIsUser = whichPostType == typeof(User); - foreach (var p in entry.Properties) + foreach (var p in entityEntry.Properties) { var pName = p.Metadata.Name; if (!p.IsModified || IsTimestampingFieldName(pName)) continue; - if (IFieldChangeIgnorance.GlobalFieldChangeIgnorance.Update(whichPostType, pName, p.OriginalValue, p.CurrentValue) + if (IFieldChangeIgnorance.GlobalFieldChangeIgnorance.Update( + whichPostType, pName, p.OriginalValue, p.CurrentValue) || (entryIsUser && userFieldChangeIgnorance.Update( whichPostType, pName, p.OriginalValue, p.CurrentValue))) { p.IsModified = false; continue; // skip following revision check } - if (IFieldChangeIgnorance.GlobalFieldChangeIgnorance.Revision(whichPostType, pName, p.OriginalValue, p.CurrentValue) + if (IFieldChangeIgnorance.GlobalFieldChangeIgnorance.Revision( + whichPostType, pName, p.OriginalValue, p.CurrentValue) || (entryIsUser && userFieldChangeIgnorance.Revision( whichPostType, pName, p.OriginalValue, p.CurrentValue))) continue; - if (IsLatestReplierUser(pName, p, entry)) return null; + if (IsLatestReplierUser(pName, p, entityEntry)) return null; if (!IRevisionProperties.Cache[typeof(TRevision)].TryGetValue(pName, out var revisionProp)) { @@ -66,12 +68,12 @@ bool IsTimestampingFieldName(string name) => name is nameof(BasePost.LastSeenAt) logger.LogWarning("Updating field {} is not existing in revision table, " + "newValue={}, oldValue={}, newObject={}, oldObject={}", pName, ToHexWhenByteArray(p.CurrentValue), ToHexWhenByteArray(p.OriginalValue), - SharedHelper.UnescapedJsonSerialize(newPostOrUser), - SharedHelper.UnescapedJsonSerialize(entry.OriginalValues.ToObject())); + SharedHelper.UnescapedJsonSerialize(newEntity), + SharedHelper.UnescapedJsonSerialize(entityEntry.OriginalValues.ToObject())); } else { - revision ??= revisionFactory(postOrUserInTracking); + revision ??= revisionFactory(entityInTracking); // quote from MSDN https://learn.microsoft.com/en-us/dotnet/api/system.reflection.propertyinfo.setvalue // If the property type of this PropertyInfo object is a value type and value is null @@ -109,14 +111,14 @@ bool IsTimestampingFieldName(string name) => name is nameof(BasePost.LastSeenAt) private static bool IsLatestReplierUser(string pName, PropertyEntry p, EntityEntry entry) { // ThreadCrawlFacade.ParseLatestRepliers() will save users with empty string as portrait - // they will soon be updated by (sub) reply crawler after it find out the latest reply + // they may soon be updated by (sub) reply crawler after it find out the latest reply // so we should ignore its revision update for all fields // ignore entire record is not possible via IFieldChangeIgnorance.GlobalFieldChangeIgnorance.Revision() // since it can only determine one field at the time if (pName != nameof(User.Portrait) || p.OriginalValue is not "") return false; // invokes OriginalValues.ToObject() to get a new instance - // since postOrUserInTracking is reference to the changed one + // since entityInTracking is reference to the changed one var user = (User)entry.OriginalValues.ToObject(); // create another user instance with only fields of latest replier filled diff --git a/c#/crawler/src/Tieba/Crawl/Saver/Post/PostSaver.cs b/c#/crawler/src/Tieba/Crawl/Saver/Post/PostSaver.cs index 1cb49460..0730050d 100644 --- a/c#/crawler/src/Tieba/Crawl/Saver/Post/PostSaver.cs +++ b/c#/crawler/src/Tieba/Crawl/Saver/Post/PostSaver.cs @@ -30,13 +30,13 @@ protected SaverChangeSet Save( ExpressionStarter existingPostPredicate) where TRevision : BaseRevisionWithSplitting { - var existingPostsKeyById = db.Set() + var existingPostsKeyById = db.Set().AsTracking() .Where(existingPostPredicate).ToDictionary(postIdSelector); - // deep copy before entities get mutated by BaseSaver.SavePostsOrUsers() + // deep copy before entities get mutated by BaseSaver.SaveEntitiesWithRevision() var existingBeforeMerge = existingPostsKeyById.Select(pair => (TPost)pair.Value.Clone()).ToList(); - SavePostsOrUsers(db, UserFieldChangeIgnorance, revisionFactory, + SaveEntitiesWithRevision(db, UserFieldChangeIgnorance, revisionFactory, Posts.Values.ToLookup(p => existingPostsKeyById.ContainsKey(postIdSelector(p))), p => existingPostsKeyById[postIdSelector(p)]); return new(existingBeforeMerge, Posts.Values, postIdSelector); diff --git a/c#/crawler/src/Tieba/Crawl/Saver/UserSaver.cs b/c#/crawler/src/Tieba/Crawl/Saver/UserSaver.cs index b549736e..9c2803d1 100644 --- a/c#/crawler/src/Tieba/Crawl/Saver/UserSaver.cs +++ b/c#/crawler/src/Tieba/Crawl/Saver/UserSaver.cs @@ -33,23 +33,23 @@ protected override Dictionary } public partial class UserSaver( ILogger logger, SaverLocks locks, - ConcurrentDictionary users) + IDictionary users) : BaseSaver(logger) { - public delegate UserSaver New(ConcurrentDictionary users); + public delegate UserSaver New(IDictionary users); public void Save( CrawlerDbContext db, PostType postType, IFieldChangeIgnorance.FieldChangeIgnoranceDelegates userFieldChangeIgnorance) { - if (users.IsEmpty) return; + if (users.Count == 0) return; locks.AcquireLocksThen(newlyLocked => { var existingUsersKeyByUid = (from user in db.Users.AsTracking() where newlyLocked.Select(u => u.Uid).Contains(user.Uid) select user).ToDictionary(u => u.Uid); - SavePostsOrUsers(db, userFieldChangeIgnorance, + SaveEntitiesWithRevision(db, userFieldChangeIgnorance, u => new UserRevision { TakenAt = u.UpdatedAt ?? u.CreatedAt, diff --git a/c#/crawler/src/Worker/PushAllPostContentsIntoSonicWorker.cs b/c#/crawler/src/Worker/PushAllPostContentsIntoSonicWorker.cs index 57e92107..40e1aba6 100644 --- a/c#/crawler/src/Worker/PushAllPostContentsIntoSonicWorker.cs +++ b/c#/crawler/src/Worker/PushAllPostContentsIntoSonicWorker.cs @@ -21,10 +21,11 @@ protected override async Task DoWork(CancellationToken stoppingToken) var forumPostCountsTuples = db.Database.GetDbConnection() #pragma warning restore IDISP004 // Don't ignore created IDisposable .Query<(Fid Fid, int ReplyCount, int SubReplyCount)>( - string.Join(" UNION ALL ", (from f in db.Forums select f.Fid).AsEnumerable().Select(fid => - $"SELECT '{fid}'," - + $"COALESCE((SELECT id FROM \"tbmc_f{fid}_reply\" ORDER BY id DESC LIMIT 1), 0)," - + $"COALESCE((SELECT id FROM \"tbmc_f{fid}_subReply\" ORDER BY id DESC LIMIT 1), 0)"))) + string.Join(" UNION ALL ", (from f in db.Forums.AsNoTracking() select f.Fid) + .AsEnumerable().Select(fid => + $"SELECT '{fid}'," + + $"COALESCE((SELECT id FROM \"tbmc_f{fid}_reply\" ORDER BY id DESC LIMIT 1), 0)," + + $"COALESCE((SELECT id FROM \"tbmc_f{fid}_subReply\" ORDER BY id DESC LIMIT 1), 0)"))) .ToList(); var forumCount = forumPostCountsTuples.Count * 2; // reply and sub reply var totalPostCount = forumPostCountsTuples.Sum(t => t.ReplyCount) diff --git a/c#/imagePipeline/src/ImageBatchConsumingWorker.cs b/c#/imagePipeline/src/ImageBatchConsumingWorker.cs index 6f6df2e8..da9ce333 100644 --- a/c#/imagePipeline/src/ImageBatchConsumingWorker.cs +++ b/c#/imagePipeline/src/ImageBatchConsumingWorker.cs @@ -197,7 +197,7 @@ private async Task ConsumeOcrConsumer( IQueryable forumScripts, CancellationToken stoppingToken = default) { - var scriptGroupings = forumScripts + var scriptGroupings = forumScripts.AsNoTracking() .GroupBy(e => e.Fid, e => e.Script).ToList(); var scripts = scriptGroupings.SelectMany(i => i).Distinct().ToList(); var recognizedTextLinesKeyByScript = new Dictionary>(scripts.Count); @@ -217,7 +217,7 @@ IEnumerable GetImagesInCurrentFid() // try to know which fid owns current image batch return imageKeysWithMatrix.IntersectBy( - from replyContentImage in db.ReplyContentImages + from replyContentImage in db.ReplyContentImages.AsNoTracking() where imageKeysWithMatrix .Select(imageKeyWithMatrix => imageKeyWithMatrix.ImageId) .Contains(replyContentImage.ImageId)