diff --git a/c#/crawler/src/Db/CrawlerDbContext.cs b/c#/crawler/src/Db/CrawlerDbContext.cs index 8129f31b..9d64fbe1 100644 --- a/c#/crawler/src/Db/CrawlerDbContext.cs +++ b/c#/crawler/src/Db/CrawlerDbContext.cs @@ -78,25 +78,25 @@ protected override void OnModelCreating(ModelBuilder b) b.Entity().ToTable($"tbmc_f{Fid}_subReply_content"); _ = new RevisionWithSplitting - .ModelBuilder(b, "tbmcr_thread", e => new {e.Tid, e.TakenAt}) + .ModelBuilder(b, "tbmcr_thread", e => new {e.Tid, e.TakenAt, e.DuplicateIndex}) .HasBaseTable() .SplitToTable("viewCount"); _ = new RevisionWithSplitting - .ModelBuilder(b, "tbmcr_reply", e => new {e.Pid, e.TakenAt}) + .ModelBuilder(b, "tbmcr_reply", e => new {e.Pid, e.TakenAt, e.DuplicateIndex}) .HasBaseTable() .SplitToTable("agreeCount") .SplitToTable("subReplyCount") .SplitToTable("floor"); _ = new RevisionWithSplitting - .ModelBuilder(b, "tbmcr_subReply", e => new {e.Spid, e.TakenAt}) + .ModelBuilder(b, "tbmcr_subReply", e => new {e.Spid, e.TakenAt, e.DuplicateIndex}) .HasBaseTable() .SplitToTable("agreeCount") .SplitToTable("disagreeCount"); _ = new RevisionWithSplitting - .ModelBuilder(b, "tbmcr_user", e => new {e.Uid, e.TakenAt}) + .ModelBuilder(b, "tbmcr_user", e => new {e.Uid, e.TakenAt, e.DuplicateIndex}) .HasBaseTable() .SplitToTable("ipGeolocation") .SplitToTable("portraitUpdatedAt") diff --git a/c#/crawler/src/Db/Revision/Splitting/BaseRevisionWithSplitting.cs b/c#/crawler/src/Db/Revision/Splitting/BaseRevisionWithSplitting.cs index fe8122c2..e965fa8e 100644 --- a/c#/crawler/src/Db/Revision/Splitting/BaseRevisionWithSplitting.cs +++ b/c#/crawler/src/Db/Revision/Splitting/BaseRevisionWithSplitting.cs @@ -5,6 +5,7 @@ namespace tbm.Crawler.Db.Revision.Splitting; public abstract class BaseRevisionWithSplitting : RowVersionedEntity { public uint TakenAt { get; set; } + public ushort? DuplicateIndex { get; set; } public ushort? 
NullFieldsBitMask { get; set; } public virtual bool IsAllFieldsIsNullExceptSplit() => throw new NotSupportedException(); } diff --git a/c#/crawler/src/Db/Revision/Splitting/RevisionWithSplitting.cs b/c#/crawler/src/Db/Revision/Splitting/RevisionWithSplitting.cs index 2892a30f..4bd00a2d 100644 --- a/c#/crawler/src/Db/Revision/Splitting/RevisionWithSplitting.cs +++ b/c#/crawler/src/Db/Revision/Splitting/RevisionWithSplitting.cs @@ -1,5 +1,3 @@ -using System.Collections.ObjectModel; - namespace tbm.Crawler.Db.Revision.Splitting; public abstract class RevisionWithSplitting : BaseRevisionWithSplitting @@ -50,26 +48,5 @@ public ModelBuilder SplitToTable(string tableNameSuffix) visitor.Visit(keySelector)); return this; } - - /// https://stackoverflow.com/questions/38316519/replace-parameter-type-in-lambda-expression/38345590#38345590 - private sealed class ReplaceParameterTypeVisitor : ExpressionVisitor - { - private ReadOnlyCollection? _parameters; - - protected override Expression VisitParameter(ParameterExpression node) => - _parameters?.FirstOrDefault(p => p.Name == node.Name) ?? - (node.Type == typeof(TSource) ? Expression.Parameter(typeof(TTarget), node.Name) : node); - - protected override Expression VisitLambda(Expression node) - { - _parameters = VisitAndConvert(node.Parameters, nameof(VisitLambda)); - return Expression.Lambda(Visit(node.Body), _parameters); - } - - protected override Expression VisitMember(MemberExpression node) => - node.Member.DeclaringType == typeof(TSource) - ? 
Expression.Property(Visit(node.Expression)!, node.Member.Name) - : base.VisitMember(node); - } } } diff --git a/c#/crawler/src/EntryPoint.cs b/c#/crawler/src/EntryPoint.cs index 4759d0ec..49bc550a 100644 --- a/c#/crawler/src/EntryPoint.cs +++ b/c#/crawler/src/EntryPoint.cs @@ -37,7 +37,7 @@ protected override void ConfigureContainer(HostBuilderContext context, Container builder.RegisterImplementsOfBaseTypes(typeof(EntryPoint).Assembly, [ typeof(ICrawler<,>), typeof(ICrawlFacade<>), - typeof(IPostParser<,>), typeof(SaverWithRevision<>) + typeof(IPostParser<,>), typeof(SaverWithRevision<,>) ]); builder.RegisterType(); builder.RegisterType(); diff --git a/c#/crawler/src/GlobalUsings.cs b/c#/crawler/src/GlobalUsings.cs index ab2c271b..3ac5a9a2 100644 --- a/c#/crawler/src/GlobalUsings.cs +++ b/c#/crawler/src/GlobalUsings.cs @@ -62,6 +62,7 @@ global using PostId = System.UInt64; global using Tid = System.UInt64; global using Pid = System.UInt64; +global using Spid = System.UInt64; global using Time = System.UInt32; global using Page = System.UInt32; global using FailureCount = System.UInt16; diff --git a/c#/crawler/src/ReplaceParameterTypeVisitor.cs b/c#/crawler/src/ReplaceParameterTypeVisitor.cs new file mode 100644 index 00000000..872cf7a5 --- /dev/null +++ b/c#/crawler/src/ReplaceParameterTypeVisitor.cs @@ -0,0 +1,24 @@ +using System.Collections.ObjectModel; + +namespace tbm.Crawler; + +/// https://stackoverflow.com/questions/38316519/replace-parameter-type-in-lambda-expression/38345590#38345590 +public class ReplaceParameterTypeVisitor : ExpressionVisitor +{ + private ReadOnlyCollection? _parameters; + + protected override Expression VisitParameter(ParameterExpression node) => + _parameters?.FirstOrDefault(p => p.Name == node.Name) ?? + (node.Type == typeof(TSource) ? 
Expression.Parameter(typeof(TTarget), node.Name) : node); + + protected override Expression VisitLambda(Expression node) + { + _parameters = VisitAndConvert(node.Parameters, nameof(VisitLambda)); + return Expression.Lambda(Visit(node.Body), _parameters); + } + + protected override Expression VisitMember(MemberExpression node) => + node.Member.DeclaringType == typeof(TSource) + ? Expression.Property(Visit(node.Expression)!, node.Member.Name) + : base.VisitMember(node); +} diff --git a/c#/crawler/src/Tieba/Crawl/Saver/Post/PostSaver.cs b/c#/crawler/src/Tieba/Crawl/Saver/Post/PostSaver.cs index 50b589c6..1e4d59d2 100644 --- a/c#/crawler/src/Tieba/Crawl/Saver/Post/PostSaver.cs +++ b/c#/crawler/src/Tieba/Crawl/Saver/Post/PostSaver.cs @@ -2,12 +2,12 @@ namespace tbm.Crawler.Tieba.Crawl.Saver.Post; -public abstract class PostSaver( - ILogger> logger, +public abstract class PostSaver( + ILogger> logger, ConcurrentDictionary posts, AuthorRevisionSaver.New authorRevisionSaverFactory, PostType currentPostType) - : SaverWithRevision(logger), IPostSaver + : SaverWithRevision(logger), IPostSaver where TPost : BasePost where TBaseRevision : BaseRevisionWithSplitting { diff --git a/c#/crawler/src/Tieba/Crawl/Saver/Post/ReplySaver.cs b/c#/crawler/src/Tieba/Crawl/Saver/Post/ReplySaver.cs index d44ba4ea..093c00ee 100644 --- a/c#/crawler/src/Tieba/Crawl/Saver/Post/ReplySaver.cs +++ b/c#/crawler/src/Tieba/Crawl/Saver/Post/ReplySaver.cs @@ -6,30 +6,25 @@ public class ReplySaver( ReplyContentImageSaver replyContentImageSaver, ReplySignatureSaver replySignatureSaver, AuthorRevisionSaver.New authorRevisionSaverFactory) - : PostSaver( + : PostSaver( logger, posts, authorRevisionSaverFactory, PostType.Reply) { public delegate ReplySaver New(ConcurrentDictionary posts); - protected override Dictionary - AddRevisionDelegatesKeyBySplitEntityType { get; } = new() - { - { - typeof(ReplyRevision.SplitFloor), (db, revisions) => - db.Set() - .AddRange(revisions.OfType()) - }, - { - 
typeof(ReplyRevision.SplitSubReplyCount), (db, revisions) => - db.Set() - .AddRange(revisions.OfType()) - }, + private Lazy>? _addSplitRevisionsDelegatesKeyByEntityType; + protected override Lazy> + AddSplitRevisionsDelegatesKeyByEntityType => + _addSplitRevisionsDelegatesKeyByEntityType ??= new(() => new() { - typeof(ReplyRevision.SplitAgreeCount), (db, revisions) => - db.Set() - .AddRange(revisions.OfType()) - } - }; + {typeof(ReplyRevision.SplitFloor), AddSplitRevisions}, + {typeof(ReplyRevision.SplitSubReplyCount), AddSplitRevisions}, + {typeof(ReplyRevision.SplitAgreeCount), AddSplitRevisions} + }); + + protected override Pid RevisionEntityIdSelector(BaseReplyRevision entity) => entity.Pid; + protected override Expression> + IsRevisionEntityIdEqualsExpression(BaseReplyRevision newRevision) => + existingRevision => existingRevision.Pid == newRevision.Pid; public override bool UserFieldUpdateIgnorance(string propName, object? oldValue, object? newValue) => propName switch { // FansNickname in reply response will always be null diff --git a/c#/crawler/src/Tieba/Crawl/Saver/Post/SubReplySaver.cs b/c#/crawler/src/Tieba/Crawl/Saver/Post/SubReplySaver.cs index 65909144..2a18dc08 100644 --- a/c#/crawler/src/Tieba/Crawl/Saver/Post/SubReplySaver.cs +++ b/c#/crawler/src/Tieba/Crawl/Saver/Post/SubReplySaver.cs @@ -4,25 +4,24 @@ public class SubReplySaver( ILogger logger, ConcurrentDictionary posts, AuthorRevisionSaver.New authorRevisionSaverFactory) - : PostSaver( + : PostSaver( logger, posts, authorRevisionSaverFactory, PostType.SubReply) { public delegate SubReplySaver New(ConcurrentDictionary posts); - protected override Dictionary - AddRevisionDelegatesKeyBySplitEntityType { get; } = new() - { - { - typeof(SubReplyRevision.SplitAgreeCount), (db, revisions) => - db.Set() - .AddRange(revisions.OfType()) - }, + private Lazy>? 
_addSplitRevisionsDelegatesKeyByEntityType; + protected override Lazy> + AddSplitRevisionsDelegatesKeyByEntityType => + _addSplitRevisionsDelegatesKeyByEntityType ??= new(() => new() { - typeof(SubReplyRevision.SplitDisagreeCount), (db, revisions) => - db.Set() - .AddRange(revisions.OfType()) - } - }; + {typeof(SubReplyRevision.SplitAgreeCount), AddSplitRevisions}, + {typeof(SubReplyRevision.SplitDisagreeCount), AddSplitRevisions}, + }); + + protected override Spid RevisionEntityIdSelector(BaseSubReplyRevision entity) => entity.Spid; + protected override Expression> + IsRevisionEntityIdEqualsExpression(BaseSubReplyRevision newRevision) => + existingRevision => existingRevision.Spid == newRevision.Spid; public override bool UserFieldUpdateIgnorance (string propName, object? oldValue, object? newValue) => propName switch diff --git a/c#/crawler/src/Tieba/Crawl/Saver/Post/ThreadSaver.cs b/c#/crawler/src/Tieba/Crawl/Saver/Post/ThreadSaver.cs index 8ac56330..e1e05e8e 100644 --- a/c#/crawler/src/Tieba/Crawl/Saver/Post/ThreadSaver.cs +++ b/c#/crawler/src/Tieba/Crawl/Saver/Post/ThreadSaver.cs @@ -6,20 +6,23 @@ public class ThreadSaver( ILogger logger, ConcurrentDictionary posts, AuthorRevisionSaver.New authorRevisionSaverFactory) - : PostSaver( + : PostSaver( logger, posts, authorRevisionSaverFactory, PostType.Thread) { public delegate ThreadSaver New(ConcurrentDictionary posts); - protected override Dictionary - AddRevisionDelegatesKeyBySplitEntityType { get; } = new() - { + private Lazy>? 
_addSplitRevisionsDelegatesKeyByEntityType; + protected override Lazy> + AddSplitRevisionsDelegatesKeyByEntityType => + _addSplitRevisionsDelegatesKeyByEntityType ??= new(() => new() { - typeof(ThreadRevision.SplitViewCount), (db, revisions) => - db.Set() - .AddRange(revisions.OfType()) - } - }; + {typeof(ThreadRevision.SplitViewCount), AddSplitRevisions} + }); + + protected override Tid RevisionEntityIdSelector(BaseThreadRevision entity) => entity.Tid; + protected override Expression> + IsRevisionEntityIdEqualsExpression(BaseThreadRevision newRevision) => + existingRevision => existingRevision.Tid == newRevision.Tid; public override bool UserFieldUpdateIgnorance (string propName, object? oldValue, object? newValue) => propName switch diff --git a/c#/crawler/src/Tieba/Crawl/Saver/SaverWithRevision.cs b/c#/crawler/src/Tieba/Crawl/Saver/SaverWithRevision.cs index d545475a..b193e49d 100644 --- a/c#/crawler/src/Tieba/Crawl/Saver/SaverWithRevision.cs +++ b/c#/crawler/src/Tieba/Crawl/Saver/SaverWithRevision.cs @@ -2,15 +2,21 @@ namespace tbm.Crawler.Tieba.Crawl.Saver; -public abstract partial class SaverWithRevision( - ILogger> logger) +public abstract partial class SaverWithRevision( + ILogger> logger) : IRevisionProperties where TBaseRevision : BaseRevisionWithSplitting { - protected delegate void AddRevisionDelegate(CrawlerDbContext db, IEnumerable revision); - protected abstract IReadOnlyDictionary AddRevisionDelegatesKeyBySplitEntityType { get; } + protected delegate void AddSplitRevisionsDelegate(CrawlerDbContext db, IEnumerable revisions); + protected abstract Lazy> + AddSplitRevisionsDelegatesKeyByEntityType { get; } + protected abstract NullFieldsBitMask GetRevisionNullFieldBitMask(string fieldName); + protected abstract TEntityId RevisionEntityIdSelector(TBaseRevision entity); + protected abstract Expression> + IsRevisionEntityIdEqualsExpression(TBaseRevision newRevision); + protected virtual bool ShouldIgnoreEntityRevision(string propName, PropertyEntry 
propEntry, EntityEntry entityEntry) => false;
     protected virtual bool FieldUpdateIgnorance(string propName, object? oldValue, object? newValue) => false;
     protected virtual bool FieldRevisionIgnorance(string propName, object? oldValue, object? newValue) => false;
@@ -19,8 +25,43 @@ public abstract partial class SaverWithRevision(
         nameof(BasePost.AuthorUid) when newValue is 0L && oldValue is not null => true,
         _ => false
     };
+
+    /// <summary>
+    /// Adds the revisions in <paramref name="revisions"/> that are of type TRevision to the db set of that type.
+    /// Beforehand, queries the stored rows whose TakenAt and entity id both equal those of some new revision,
+    /// pairs them with the new revisions by entity id and sets DuplicateIndex of each paired new revision.
+    /// </summary>
+    /// <param name="db">Context whose set of TRevision is queried and then appended to.</param>
+    /// <param name="revisions">Candidate revisions; only the ones of type TRevision are kept.</param>
+    protected void AddSplitRevisions(CrawlerDbContext db, IEnumerable revisions)
+        where TRevision : TBaseRevision
+    {
+        var newRevisions = revisions.OfType().ToList();
+        var dbSet = db.Set();
+        var visitor = new ReplaceParameterTypeVisitor();
+        var existingRevisions = dbSet
+            .Where(newRevisions.Aggregate(
+
+                // https://github.com/npgsql/npgsql/issues/4437
+                // https://github.com/dotnet/efcore/issues/32092
+                LinqKit.PredicateBuilder.New(),
+                (predicate, newRevision) => predicate.Or(LinqKit.PredicateBuilder
+                    .New(existingRevision => existingRevision.TakenAt == newRevision.TakenAt)
+                    .And((Expression>)
+                        visitor.Visit(IsRevisionEntityIdEqualsExpression(newRevision))))))
+            .ToList();
+        (from existingRevision in existingRevisions
+                join newRevision in newRevisions
+                    on RevisionEntityIdSelector(existingRevision) equals RevisionEntityIdSelector(newRevision)
+                select (existingRevision, newRevision))
+            .ForEach(t =>
+                // `a ?? 0 + 1` parses as `a ?? (0 + 1)` and would copy a non-null existing index unchanged;
+                // parenthesize so the new row always gets existing + 1 (a null existing index counts as 0).
+                t.newRevision.DuplicateIndex = (ushort)((t.existingRevision.DuplicateIndex ?? 0) + 1));
+        dbSet.AddRange(newRevisions);
+    }
 }
-public abstract partial class SaverWithRevision
+public abstract partial class SaverWithRevision
 {
     protected void SaveEntitiesWithRevision(
         CrawlerDbContext db,
@@ -120,6 +161,6 @@ bool IsTimestampingFieldName(string name) => name is nameof(BasePost.LastSeenAt)
             newRevisions.OfType>()
                 .SelectMany(rev => rev.SplitEntities)
                 .GroupBy(pair => pair.Key, pair => pair.Value)
-                .ForEach(g => AddRevisionDelegatesKeyBySplitEntityType[g.Key](db, g));
+                .ForEach(g => AddSplitRevisionsDelegatesKeyByEntityType.Value[g.Key](db, g));
     }
 }
diff --git a/c#/crawler/src/Tieba/Crawl/Saver/UserSaver.cs b/c#/crawler/src/Tieba/Crawl/Saver/UserSaver.cs
index eadf5c4d..0b48c5cb 100644
--- a/c#/crawler/src/Tieba/Crawl/Saver/UserSaver.cs
+++ b/c#/crawler/src/Tieba/Crawl/Saver/UserSaver.cs
@@ -4,25 +4,20 @@ namespace tbm.Crawler.Tieba.Crawl.Saver;
 
 public partial class UserSaver
 {
-    protected override Dictionary
-        AddRevisionDelegatesKeyBySplitEntityType { get; } = new()
-    {
-        {
-            typeof(UserRevision.SplitDisplayName), (db, revisions) =>
-                db.Set()
-                    .AddRange(revisions.OfType())
-        },
-        {
-            typeof(UserRevision.SplitPortraitUpdatedAt), (db, revisions) =>
-                db.Set()
-                    .AddRange(revisions.OfType())
-        },
+    private Lazy>?
_addSplitRevisionsDelegatesKeyByEntityType; + protected override Lazy> + AddSplitRevisionsDelegatesKeyByEntityType => + _addSplitRevisionsDelegatesKeyByEntityType ??= new(() => new() { - typeof(UserRevision.SplitIpGeolocation), (db, revisions) => - db.Set() - .AddRange(revisions.OfType()) - } - }; + {typeof(UserRevision.SplitDisplayName), AddSplitRevisions}, + {typeof(UserRevision.SplitPortraitUpdatedAt), AddSplitRevisions}, + {typeof(UserRevision.SplitIpGeolocation), AddSplitRevisions} + }); + + protected override Uid RevisionEntityIdSelector(BaseUserRevision entity) => entity.Uid; + protected override Expression> + IsRevisionEntityIdEqualsExpression(BaseUserRevision newRevision) => + existingRevision => existingRevision.Uid == newRevision.Uid; protected override bool ShouldIgnoreEntityRevision(string propName, PropertyEntry propEntry, EntityEntry entityEntry) { @@ -87,7 +82,7 @@ protected override bool FieldRevisionIgnorance public partial class UserSaver( ILogger logger, SaverLocks locks, IDictionary users) - : SaverWithRevision(logger) + : SaverWithRevision(logger) { public delegate UserSaver New(IDictionary users); public delegate bool FieldChangeIgnorance(string propName, object? oldValue, object? newValue);