Skip to content

Commit

Permalink
* rename all names related to PostOrUser with Entity @ `SaveEntit…
Browse files Browse the repository at this point in the history
…iesWithRevision()`

* rename method `SavePostsOrUsers()` to `SaveEntitiesWithRevision()`
@ BaseSaver.cs

* replace the type of primary ctor param `users` from `ConcurrentDictionary<,>` to `IDictionary<,>` since the saver should only be called once @ UserSaver.cs
* explicitly invoke `IQueryable<>.As(No)Tracking()` @ `PostSaver.Save()` & `PushAllPostContentsIntoSonicWorker.DoWork()`
@ crawler

* explicitly invoke `IQueryable<>.AsNoTracking()` @ `ImageBatchConsumingWorker.ConsumeOcrConsumer()`
@ imagePipeline
@ c#
  • Loading branch information
n0099 committed May 14, 2024
1 parent f91793e commit 5ca7b35
Show file tree
Hide file tree
Showing 6 changed files with 40 additions and 37 deletions.
2 changes: 1 addition & 1 deletion c#/crawler/src/Tieba/Crawl/Facade/ThreadCrawlFacade.cs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ public class ThreadCrawlFacade(
protected override void OnBeforeCommitSave(CrawlerDbContext db, UserSaver userSaver)
{ // OnBeforeCommitSave() should get invoked after UserSaver.Save() by the base.SaveCrawled()
// so only latest repliers that do not exist in parsed users are being inserted
// note this will bypass user revision detection since not invoking BaseSaver.SavePostsOrUsers() but directly DbContext.AddRange()
// note this will bypass user revision detection since not invoking BaseSaver.SaveEntitiesWithRevision() but directly DbContext.AddRange()

// users has already been added into DbContext and tracking
var existingUsersId = db.ChangeTracker.Entries<User>().Select(ee => ee.Entity.Uid);
Expand Down
48 changes: 25 additions & 23 deletions c#/crawler/src/Tieba/Crawl/Saver/BaseSaver.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,56 +8,58 @@ public abstract class BaseSaver<TBaseRevision>(ILogger<BaseSaver<TBaseRevision>>
#pragma warning restore S1939 // Inheritance list should not be redundant
where TBaseRevision : BaseRevisionWithSplitting
{
protected void SavePostsOrUsers<TPostOrUser, TRevision>(
protected void SaveEntitiesWithRevision<TEntity, TRevision>(
CrawlerDbContext db,
IFieldChangeIgnorance.FieldChangeIgnoranceDelegates userFieldChangeIgnorance,
Func<TPostOrUser, TRevision> revisionFactory,
ILookup<bool, TPostOrUser> existingOrNewLookup,
Func<TPostOrUser, TPostOrUser> existingSelector)
where TPostOrUser : class
Func<TEntity, TRevision> revisionFactory,
ILookup<bool, TEntity> existingOrNewLookup,
Func<TEntity, TEntity> existingSelector)
where TEntity : class
where TRevision : BaseRevisionWithSplitting
{
db.Set<TPostOrUser>().AddRange(existingOrNewLookup[false]); // newly added
var newRevisions = existingOrNewLookup[true].Select(newPostOrUser =>
db.Set<TEntity>().AddRange(existingOrNewLookup[false]); // newly added
var newRevisions = existingOrNewLookup[true].Select(newEntity =>
{
var postOrUserInTracking = existingSelector(newPostOrUser);
var entry = db.Entry(postOrUserInTracking);
var entityInTracking = existingSelector(newEntity);
var entityEntry = db.Entry(entityInTracking);

// this will mutate postOrUserInTracking which is referenced by entry
entry.CurrentValues.SetValues(newPostOrUser);
// this will mutate entityInTracking which is referenced by entityEntry
entityEntry.CurrentValues.SetValues(newEntity);

bool IsTimestampingFieldName(string name) => name is nameof(BasePost.LastSeenAt)
or nameof(TimestampedEntity.CreatedAt) or nameof(TimestampedEntity.UpdatedAt);

// rollback changes that overwrite original values with the default value 0 or null
// for all fields of TimestampedEntity and BasePost.LastSeenAt
// this will also affect the entity instance which postOrUserInTracking references to it
entry.Properties
// this will also affect the entity instance which entityInTracking references to it
entityEntry.Properties
.Where(prop => prop.IsModified && IsTimestampingFieldName(prop.Metadata.Name))
.ForEach(prop => prop.IsModified = false);

var revision = default(TRevision);
var revisionNullFieldsBitMask = 0;
var whichPostType = typeof(TPostOrUser);
var whichPostType = typeof(TEntity);
var entryIsUser = whichPostType == typeof(User);
foreach (var p in entry.Properties)
foreach (var p in entityEntry.Properties)
{
var pName = p.Metadata.Name;
if (!p.IsModified || IsTimestampingFieldName(pName)) continue;

if (IFieldChangeIgnorance.GlobalFieldChangeIgnorance.Update(whichPostType, pName, p.OriginalValue, p.CurrentValue)
if (IFieldChangeIgnorance.GlobalFieldChangeIgnorance.Update(
whichPostType, pName, p.OriginalValue, p.CurrentValue)
|| (entryIsUser && userFieldChangeIgnorance.Update(
whichPostType, pName, p.OriginalValue, p.CurrentValue)))
{
p.IsModified = false;
continue; // skip following revision check
}
if (IFieldChangeIgnorance.GlobalFieldChangeIgnorance.Revision(whichPostType, pName, p.OriginalValue, p.CurrentValue)
if (IFieldChangeIgnorance.GlobalFieldChangeIgnorance.Revision(
whichPostType, pName, p.OriginalValue, p.CurrentValue)
|| (entryIsUser && userFieldChangeIgnorance.Revision(
whichPostType, pName, p.OriginalValue, p.CurrentValue)))
continue;

if (IsLatestReplierUser(pName, p, entry)) return null;
if (IsLatestReplierUser(pName, p, entityEntry)) return null;

if (!IRevisionProperties.Cache[typeof(TRevision)].TryGetValue(pName, out var revisionProp))
{
Expand All @@ -66,12 +68,12 @@ bool IsTimestampingFieldName(string name) => name is nameof(BasePost.LastSeenAt)
logger.LogWarning("Updating field {} is not existing in revision table, " +
"newValue={}, oldValue={}, newObject={}, oldObject={}",
pName, ToHexWhenByteArray(p.CurrentValue), ToHexWhenByteArray(p.OriginalValue),
SharedHelper.UnescapedJsonSerialize(newPostOrUser),
SharedHelper.UnescapedJsonSerialize(entry.OriginalValues.ToObject()));
SharedHelper.UnescapedJsonSerialize(newEntity),
SharedHelper.UnescapedJsonSerialize(entityEntry.OriginalValues.ToObject()));
}
else
{
revision ??= revisionFactory(postOrUserInTracking);
revision ??= revisionFactory(entityInTracking);

// quote from MSDN https://learn.microsoft.com/en-us/dotnet/api/system.reflection.propertyinfo.setvalue
// If the property type of this PropertyInfo object is a value type and value is null
Expand Down Expand Up @@ -109,14 +111,14 @@ bool IsTimestampingFieldName(string name) => name is nameof(BasePost.LastSeenAt)
private static bool IsLatestReplierUser(string pName, PropertyEntry p, EntityEntry entry)
{
// ThreadCrawlFacade.ParseLatestRepliers() will save users with empty string as portrait
// they will soon be updated by (sub) reply crawler after it find out the latest reply
// they may soon be updated by (sub) reply crawler after it finds out the latest reply
// so we should ignore its revision update for all fields
// ignoring the entire record is not possible via IFieldChangeIgnorance.GlobalFieldChangeIgnorance.Revision()
// since it can only determine one field at a time
if (pName != nameof(User.Portrait) || p.OriginalValue is not "") return false;

// invokes OriginalValues.ToObject() to get a new instance
// since postOrUserInTracking is reference to the changed one
// since entityInTracking is a reference to the changed one
var user = (User)entry.OriginalValues.ToObject();

// create another user instance with only fields of latest replier filled
Expand Down
6 changes: 3 additions & 3 deletions c#/crawler/src/Tieba/Crawl/Saver/Post/PostSaver.cs
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,13 @@ protected SaverChangeSet<TPost> Save<TRevision>(
ExpressionStarter<TPost> existingPostPredicate)
where TRevision : BaseRevisionWithSplitting
{
var existingPostsKeyById = db.Set<TPost>()
var existingPostsKeyById = db.Set<TPost>().AsTracking()
.Where(existingPostPredicate).ToDictionary(postIdSelector);

// deep copy before entities get mutated by BaseSaver.SavePostsOrUsers()
// deep copy before entities get mutated by BaseSaver.SaveEntitiesWithRevision()
var existingBeforeMerge = existingPostsKeyById.Select(pair => (TPost)pair.Value.Clone()).ToList();

SavePostsOrUsers(db, UserFieldChangeIgnorance, revisionFactory,
SaveEntitiesWithRevision(db, UserFieldChangeIgnorance, revisionFactory,
Posts.Values.ToLookup(p => existingPostsKeyById.ContainsKey(postIdSelector(p))),
p => existingPostsKeyById[postIdSelector(p)]);
return new(existingBeforeMerge, Posts.Values, postIdSelector);
Expand Down
8 changes: 4 additions & 4 deletions c#/crawler/src/Tieba/Crawl/Saver/UserSaver.cs
Original file line number Diff line number Diff line change
Expand Up @@ -33,23 +33,23 @@ protected override Dictionary<Type, AddRevisionDelegate>
}
public partial class UserSaver(
ILogger<UserSaver> logger, SaverLocks<Uid> locks,
ConcurrentDictionary<Uid, User> users)
IDictionary<Uid, User> users)
: BaseSaver<BaseUserRevision>(logger)
{
public delegate UserSaver New(ConcurrentDictionary<Uid, User> users);
public delegate UserSaver New(IDictionary<Uid, User> users);

public void Save(
CrawlerDbContext db,
PostType postType,
IFieldChangeIgnorance.FieldChangeIgnoranceDelegates userFieldChangeIgnorance)
{
if (users.IsEmpty) return;
if (users.Count == 0) return;
locks.AcquireLocksThen(newlyLocked =>
{
var existingUsersKeyByUid = (from user in db.Users.AsTracking()
where newlyLocked.Select(u => u.Uid).Contains(user.Uid)
select user).ToDictionary(u => u.Uid);
SavePostsOrUsers(db, userFieldChangeIgnorance,
SaveEntitiesWithRevision(db, userFieldChangeIgnorance,
u => new UserRevision
{
TakenAt = u.UpdatedAt ?? u.CreatedAt,
Expand Down
9 changes: 5 additions & 4 deletions c#/crawler/src/Worker/PushAllPostContentsIntoSonicWorker.cs
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,11 @@ protected override async Task DoWork(CancellationToken stoppingToken)
var forumPostCountsTuples = db.Database.GetDbConnection()
#pragma warning restore IDISP004 // Don't ignore created IDisposable
.Query<(Fid Fid, int ReplyCount, int SubReplyCount)>(
string.Join(" UNION ALL ", (from f in db.Forums select f.Fid).AsEnumerable().Select(fid =>
$"SELECT '{fid}',"
+ $"COALESCE((SELECT id FROM \"tbmc_f{fid}_reply\" ORDER BY id DESC LIMIT 1), 0),"
+ $"COALESCE((SELECT id FROM \"tbmc_f{fid}_subReply\" ORDER BY id DESC LIMIT 1), 0)")))
string.Join(" UNION ALL ", (from f in db.Forums.AsNoTracking() select f.Fid)
.AsEnumerable().Select(fid =>
$"SELECT '{fid}',"
+ $"COALESCE((SELECT id FROM \"tbmc_f{fid}_reply\" ORDER BY id DESC LIMIT 1), 0),"
+ $"COALESCE((SELECT id FROM \"tbmc_f{fid}_subReply\" ORDER BY id DESC LIMIT 1), 0)")))
.ToList();
var forumCount = forumPostCountsTuples.Count * 2; // reply and sub reply
var totalPostCount = forumPostCountsTuples.Sum(t => t.ReplyCount)
Expand Down
4 changes: 2 additions & 2 deletions c#/imagePipeline/src/ImageBatchConsumingWorker.cs
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ private async Task ConsumeOcrConsumer(
IQueryable<ForumScript> forumScripts,
CancellationToken stoppingToken = default)
{
var scriptGroupings = forumScripts
var scriptGroupings = forumScripts.AsNoTracking()
.GroupBy(e => e.Fid, e => e.Script).ToList();
var scripts = scriptGroupings.SelectMany(i => i).Distinct().ToList();
var recognizedTextLinesKeyByScript = new Dictionary<string, List<ImageOcrLine>>(scripts.Count);
Expand All @@ -217,7 +217,7 @@ IEnumerable<ImageKeyWithMatrix> GetImagesInCurrentFid()

// try to know which fid owns current image batch
return imageKeysWithMatrix.IntersectBy(
from replyContentImage in db.ReplyContentImages
from replyContentImage in db.ReplyContentImages.AsNoTracking()
where imageKeysWithMatrix
.Select(imageKeyWithMatrix => imageKeyWithMatrix.ImageId)
.Contains(replyContentImage.ImageId)
Expand Down

0 comments on commit 5ca7b35

Please sign in to comment.