Skip to content

Commit

Permalink
* split class UserParserAndSaver into two classes `User(Saver|Parse…
Browse files Browse the repository at this point in the history
…r)` to fix `AV1000: Type '' contains the word 'and', which suggests it has multiple purposes`

+ primary ctor param `user(Saver|Parser)Factory`
+ fields `_users`, and `_usersParser` with its get-only prop for initialization
- required prop `Users`
+ param `userSaver` for its only usage in class `ThreadCrawlFacade` @ `BeforeCommitSaveHook()`
@ BaseCrawlFacade.cs
@ c#/crawler
  • Loading branch information
n0099 committed Mar 30, 2024
1 parent 8dd056f commit 086352d
Show file tree
Hide file tree
Showing 9 changed files with 172 additions and 155 deletions.
2 changes: 1 addition & 1 deletion c#/crawler/src/EntryPoint.cs
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ protected override void ConfigureContainer(HostBuilderContext context, Container
// in order to sync their timer of WithLogTrace
.AutoActivate());
builder.RegisterType<AuthorRevisionSaver>();
builder.RegisterType<UserParserAndSaver>();
builder.RegisterType<UserParser>();
builder.RegisterType<ThreadLateCrawlerAndSaver>();
builder.RegisterType<SonicPusher>();
builder.RegisterType<CrawlPost>();
Expand Down
20 changes: 14 additions & 6 deletions c#/crawler/src/Tieba/Crawl/Facade/BaseCrawlFacade.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ public abstract class BaseCrawlFacade<TPost, TBaseRevision, TResponse, TPostProt
BaseCrawler<TResponse, TPostProtoBuf> crawler,
BaseParser<TPost, TPostProtoBuf> parser,
Func<ConcurrentDictionary<PostId, TPost>, BaseSaver<TPost, TBaseRevision>> saverFactory,
Func<ConcurrentDictionary<Uid, User>, UserSaver> userSaverFactory,
Func<ConcurrentDictionary<Uid, User>, UserParser> userParserFactory,
CrawlerLocks locks,
CrawlerLocks.LockId lockId,
Fid fid)
Expand All @@ -16,6 +18,8 @@ public abstract class BaseCrawlFacade<TPost, TBaseRevision, TResponse, TPostProt
where TPostProtoBuf : class, IMessage<TPostProtoBuf>
{
private readonly HashSet<Page> _lockingPages = [];
private readonly ConcurrentDictionary<Uid, User> _users = new();
private UserParser? _userParser;
private ExceptionHandler _exceptionHandler = _ => { };

public delegate void ExceptionHandler(Exception ex);
Expand All @@ -25,11 +29,11 @@ public required ILogger<BaseCrawlFacade<TPost, TBaseRevision, TResponse, TPostPr
Logger { private get; init; }
public required CrawlerDbContext.New DbContextFactory { private get; init; }
public required ClientRequesterTcs RequesterTcs { private get; init; }
public required UserParserAndSaver Users { protected get; init; }

// ReSharper restore UnusedAutoPropertyAccessor.Global
protected Fid Fid { get; } = fid;
protected ConcurrentDictionary<PostId, TPost> Posts { get; } = new();
protected UserParser UserParser => _userParser ??= userParserFactory(_users);

public virtual void Dispose()
{
Expand All @@ -41,10 +45,14 @@ public virtual void Dispose()
{
var db = DbContextFactory(Fid);
using var transaction = db.Database.BeginTransaction(IsolationLevel.ReadCommitted);

var saver = saverFactory(Posts);
var savedPosts = Posts.IsEmpty ? null : saver.SavePosts(db);
Users.SaveUsers(db, saver.PostType, saver.UserFieldChangeIgnorance);
BeforeCommitSaveHook(db);

var userSaver = userSaverFactory(_users);
userSaver.SaveUsers(db, saver.PostType, saver.UserFieldChangeIgnorance);

BeforeCommitSaveHook(db, userSaver);
try
{
db.TimestampingEntities();
Expand All @@ -55,7 +63,7 @@ public virtual void Dispose()
finally
{
saver.OnPostSaveEvent();
Users.PostSaveHook();
userSaver.PostSaveHook();
}
return savedPosts;
}
Expand Down Expand Up @@ -115,7 +123,7 @@ protected virtual void PostParseHook(
TResponse response,
CrawlRequestFlag flag,
IDictionary<PostId, TPost> parsedPostsInResponse) { }
protected virtual void BeforeCommitSaveHook(CrawlerDbContext db) { }
protected virtual void BeforeCommitSaveHook(CrawlerDbContext db, UserSaver userSaver) { }
protected virtual void PostCommitSaveHook(
SaverChangeSet<TPost> savedPosts,
CancellationToken stoppingToken = default) { }
Expand All @@ -129,7 +137,7 @@ private void ValidateThenParse(BaseCrawler<TResponse, TPostProtoBuf>.Response re
if (flag == CrawlRequestFlag.None)
{
if (postsEmbeddedUsers.Count == 0 && postsInResponse.Any()) ThrowIfEmptyUsersEmbedInPosts();
if (postsEmbeddedUsers.Count != 0) Users.ParseUsers(postsEmbeddedUsers);
if (postsEmbeddedUsers.Count != 0) UserParser.ParseUsers(postsEmbeddedUsers);
}
PostParseHook(response, flag, parsedPostsInResponse);
}
Expand Down
6 changes: 4 additions & 2 deletions c#/crawler/src/Tieba/Crawl/Facade/ReplyCrawlFacade.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@ public class ReplyCrawlFacade(
ReplyCrawler.New crawler,
ReplyParser parser,
ReplySaver.New saver,
UserSaver.New userSaver,
UserParser.New userParser,
SonicPusher pusher,
IIndex<string, CrawlerLocks> locks,
Fid fid,
Tid tid)
: BaseCrawlFacade<ReplyPost, BaseReplyRevision, ReplyResponse, Reply>
(crawler(fid, tid), parser, saver.Invoke, locks["reply"], new(fid, tid), fid)
(crawler(fid, tid), parser, saver.Invoke, userSaver.Invoke, userParser.Invoke, locks["reply"], new(fid, tid), fid)
{
public delegate ReplyCrawlFacade New(Fid fid, Tid tid);

Expand All @@ -21,7 +23,7 @@ protected override void PostParseHook(
{
parsedPostsInResponse.Values.ForEach(r => r.Tid = tid);
var data = response.Data;
Users.ParseUsers(data.UserList);
UserParser.ParseUsers(data.UserList);
FillAuthorInfoBackToReply(data.UserList, parsedPostsInResponse.Values);
if (data.Page.CurrentPage == 1) SaveParentThreadTitle(data.PostList);
}
Expand Down
6 changes: 4 additions & 2 deletions c#/crawler/src/Tieba/Crawl/Facade/SubReplyCrawlFacade.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@ public class SubReplyCrawlFacade(
SubReplyCrawler.New crawler,
SubReplyParser parser,
SubReplySaver.New saver,
UserSaver.New userSaver,
UserParser.New userParser,
SonicPusher pusher,
IIndex<string, CrawlerLocks> locks,
Fid fid,
Tid tid,
Pid pid)
: BaseCrawlFacade<SubReplyPost, BaseSubReplyRevision, SubReplyResponse, SubReply>
(crawler(tid, pid), parser, saver.Invoke, locks["subReply"], new(fid, tid, pid), fid)
(crawler(tid, pid), parser, saver.Invoke, userSaver.Invoke, userParser.Invoke, locks["subReply"], new(fid, tid, pid), fid)
{
public delegate SubReplyCrawlFacade New(Fid fid, Tid tid, Pid pid);

Expand All @@ -27,7 +29,7 @@ protected override void PostParseHook(
sr.Tid = tid;
sr.Pid = pid;
}
Users.ResetUsersIcon();
UserParser.ResetUsersIcon();
}

protected override void PostCommitSaveHook(
Expand Down
6 changes: 4 additions & 2 deletions c#/crawler/src/Tieba/Crawl/Facade/ThreadArchiveCrawlFacade.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@ public class ThreadArchiveCrawlFacade(
ThreadArchiveCrawler.New crawler,
ThreadParser parser,
ThreadSaver.New saver,
UserSaver.New userSaver,
UserParser.New userParser,
IIndex<string, CrawlerLocks> locks,
Fid fid,
string forumName)
: ThreadCrawlFacade(crawler.Invoke, parser, saver, locks, fid, forumName)
: ThreadCrawlFacade(crawler.Invoke, parser, saver, userSaver.Invoke, userParser.Invoke, locks, fid, forumName)
{
public new delegate ThreadArchiveCrawlFacade New(Fid fid, string forumName);

Expand All @@ -18,7 +20,7 @@ protected override void PostParseHook(
{ // the second response (the one requested with this flag) is the same as the first one, so just skip it
if (flag == CrawlRequestFlag.ThreadClientVersion602) return;
var data = response.Data;
Users.ParseUsers(data.ThreadList.Select(th => th.Author));
UserParser.ParseUsers(data.ThreadList.Select(th => th.Author));
ParseLatestRepliers(data.ThreadList);
FillFromRequestingWith602(data.ThreadList);

Expand Down
12 changes: 7 additions & 5 deletions c#/crawler/src/Tieba/Crawl/Facade/ThreadCrawlFacade.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,19 @@ public class ThreadCrawlFacade(
ThreadCrawler.New crawler,
ThreadParser parser,
ThreadSaver.New saver,
UserSaver.New userSaver,
UserParser.New userParser,
IIndex<string, CrawlerLocks> locks,
Fid fid,
string forumName)
: BaseCrawlFacade<ThreadPost, BaseThreadRevision, ThreadResponse, Thread>
(crawler(forumName), parser, saver.Invoke, locks["thread"], new(fid), fid)
(crawler(forumName), parser, saver.Invoke, userSaver.Invoke, userParser.Invoke, locks["thread"], new(fid), fid)
{
private readonly Dictionary<Uid, User> _latestRepliers = [];

public delegate ThreadCrawlFacade New(Fid fid, string forumName);

protected override void BeforeCommitSaveHook(CrawlerDbContext db)
protected override void BeforeCommitSaveHook(CrawlerDbContext db, UserSaver userSaver)
{ // BeforeCommitSaveHook() should get invoked after UserSaver.SaveUsers() by the base.SaveCrawled()
// so only latest repliers that do not exist in the parsed users are inserted
// note this bypasses user revision detection since it does not invoke CommonInSavers.SavePostsOrUsers() but calls DbContext.AddRange() directly
Expand All @@ -27,7 +29,7 @@ protected override void BeforeCommitSaveHook(CrawlerDbContext db)
.ToList();
if (newLatestRepliers.Count == 0) return;

var newlyLockedLatestRepliers = Users.AcquireUidLocksForSave(newLatestRepliers.Select(u => u.Uid));
var newlyLockedLatestRepliers = userSaver.AcquireUidLocksForSave(newLatestRepliers.Select(u => u.Uid));
var newLatestRepliersExceptLocked = newLatestRepliers
.IntersectBy(newlyLockedLatestRepliers, u => u.Uid)
.Select(u =>
Expand Down Expand Up @@ -72,8 +74,8 @@ protected override void PostParseHook(
var data = response.Data;
if (flag == CrawlRequestFlag.ThreadClientVersion602) FillFromRequestingWith602(data.ThreadList);
if (flag != CrawlRequestFlag.None) return;
Users.ParseUsers(data.UserList);
Users.ResetUsersIcon();
UserParser.ParseUsers(data.UserList);
UserParser.ResetUsersIcon();
ParseLatestRepliers(data.ThreadList);

// remove livepost threads since their real parent forum may not match with current crawling fid
Expand Down
52 changes: 52 additions & 0 deletions c#/crawler/src/Tieba/Crawl/Parser/UserParser.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
namespace tbm.Crawler.Tieba.Crawl.Parser;

/// <summary>
/// Converts users found in crawled responses (<c>TbClient.User</c>) into <see cref="User"/> entities
/// and stores them, keyed by uid, in the dictionary handed to the constructor.
/// </summary>
public partial class UserParser(ConcurrentDictionary<Uid, User> users)
{
    public delegate UserParser New(ConcurrentDictionary<Uid, User> users);

    /// <summary>
    /// Parses every element of <paramref name="inUsers"/> and writes each resulting entity into the shared
    /// dictionary, overwriting any entry that already has the same uid. Elements with uid 0 are skipped.
    /// </summary>
    /// <param name="inUsers">Raw users as they appear in a response.</param>
    /// <exception cref="InvalidDataException">Mapping the fields of a regular user failed.</exception>
    public void ParseUsers(IEnumerable<TbClient.User> inUsers) =>
        inUsers
            .Select(rawUser => ParseUser(rawUser))
            .OfType<User>() // drops the nulls produced for skipped users
            .ForEach(parsed => users[parsed.Uid] = parsed);

    /// <summary>Sets <c>Icon</c> to null on every user currently held in the shared dictionary.</summary>
    public void ResetUsersIcon() => users.Values.ForEach(u => u.Icon = null);

    /// <summary>
    /// Maps a single raw user to an entity.
    /// Returns null for uid 0 so the caller can filter it out.
    /// </summary>
    private static User? ParseUser(TbClient.User el)
    {
        var uid = el.Uid;

        // in client version 12.x the last user in list will be an empty user with uid 0
        if (uid == 0) return null;

        var (portrait, portraitUpdatedAt) = ExtractPortrait(el.Portrait);

        // historical anonymous user: only these four fields get populated
        if (uid < 0)
        {
            return new User
            {
                Uid = uid,
                Name = el.NameShow,
                Portrait = portrait,
                PortraitUpdatedAt = portraitUpdatedAt
            };
        }

        // will be an empty string when the user hasn't set a username for their baidu account yet
        var name = el.Name.NullIfEmpty();
        var nameShow = el.NameShow.NullIfEmpty();
        var parsed = new User();
        try
        {
            parsed.Uid = uid;
            parsed.Name = name;

            // a display name identical to the name is stored as null
            parsed.DisplayName = name == nameShow ? null : nameShow;
            parsed.Portrait = portrait;
            parsed.PortraitUpdatedAt = portraitUpdatedAt;

            // 0 when the user hasn't explicitly set their gender
            parsed.Gender = (byte)el.Gender;
            parsed.FansNickname = el.FansNickname.NullIfEmpty();
            parsed.Icon = Helper.SerializedProtoBufWrapperOrNullIfEmpty(el.Iconinfo,
                () => new UserIconWrapper {Value = {el.Iconinfo}});
            parsed.IpGeolocation = el.IpAddress.NullIfEmpty();
            return parsed;
        }
        catch (Exception e)
        {
            // attach the raw user so the failure can be diagnosed from the exception alone
            e.Data["raw"] = Helper.UnescapedJsonSerialize(el);
            throw new InvalidDataException("User parse error.", e);
        }
    }

    /// <summary>
    /// Splits a portrait of the form <c>{portrait}?t={timestamp}</c> into its two parts.
    /// When the value does not match that form it is returned unchanged with a null update time.
    /// </summary>
    private static (string Portrait, uint? UpdateTime) ExtractPortrait(string portrait)
    {
        var match = ExtractPortraitRegex().Match(portrait);
        if (!match.Success) return (portrait, null);

        return (
            match.Groups["portrait"].Value,
            Time.Parse(match.Groups["timestamp"].ValueSpan, CultureInfo.InvariantCulture));
    }

    [GeneratedRegex("^(?<portrait>.+)\\?t=(?<timestamp>[0-9]+)$", RegexOptions.Compiled, matchTimeoutMilliseconds: 100)]
    private static partial Regex ExtractPortraitRegex();
}
86 changes: 86 additions & 0 deletions c#/crawler/src/Tieba/Crawl/Saver/UserSaver.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
namespace tbm.Crawler.Tieba.Crawl.Saver;

/// <summary>
/// Part of <see cref="UserSaver"/> that holds the revision-related overrides:
/// the upsert delegate for each split user revision entity type and the null-field bit mask of user fields.
/// </summary>
public partial class UserSaver
{
    /// <summary>
    /// Maps each split revision entity type to a delegate that upserts the revisions of that type
    /// without updating rows that already exist (<c>NoUpdate()</c>).
    /// </summary>
    protected override Dictionary<Type, RevisionUpsertDelegate>
        RevisionUpsertDelegatesKeyBySplitEntityType { get; } = new()
    {
        [typeof(UserRevision.SplitDisplayName)] = (db, revisions) =>
            db.Set<UserRevision.SplitDisplayName>()
                .UpsertRange(revisions.OfType<UserRevision.SplitDisplayName>()).NoUpdate().Run(),
        [typeof(UserRevision.SplitPortraitUpdatedAt)] = (db, revisions) =>
            db.Set<UserRevision.SplitPortraitUpdatedAt>()
                .UpsertRange(revisions.OfType<UserRevision.SplitPortraitUpdatedAt>()).NoUpdate().Run(),
        [typeof(UserRevision.SplitIpGeolocation)] = (db, revisions) =>
            db.Set<UserRevision.SplitIpGeolocation>()
                .UpsertRange(revisions.OfType<UserRevision.SplitIpGeolocation>()).NoUpdate().Run()
    };

    /// <summary>
    /// Returns the bit mask assigned to <paramref name="fieldName"/>,
    /// or 0 for any field that has no bit assigned here.
    /// </summary>
    /// <param name="fieldName">Name of a <see cref="User"/> property.</param>
    [SuppressMessage("StyleCop.CSharp.SpacingRules", "SA1025:Code should not contain multiple whitespace in a row")]
    protected override NullFieldsBitMask GetRevisionNullFieldBitMask(string fieldName)
    {
        switch (fieldName)
        {
            case nameof(User.Name):
                return 1;
            case nameof(User.Gender):
                return 1 << 3;
            case nameof(User.Icon):
                return 1 << 5;
            default:
                return 0;
        }
    }
}
public partial class UserSaver(ILogger<UserSaver> logger, ConcurrentDictionary<Uid, User> users)
: CommonInSavers<BaseUserRevision>(logger)
{
public delegate UserSaver New(ConcurrentDictionary<Uid, User> users);

private static readonly HashSet<Uid> UserIdLocks = [];

Check failure on line 39 in c#/crawler/src/Tieba/Crawl/Saver/UserSaver.cs

View workflow job for this annotation

GitHub Actions / build (crawler)

Check failure on line 39 in c#/crawler/src/Tieba/Crawl/Saver/UserSaver.cs

View workflow job for this annotation

GitHub Actions / build (crawler)

private readonly List<Uid> _savedUsersId = [];

/// <summary>
/// Saves the users held by this saver that are not currently locked in the static <c>UserIdLocks</c> set.
/// The uids being saved are added to <c>UserIdLocks</c> and remembered in <c>_savedUsersId</c>
/// so that <see cref="PostSaveHook"/> can release them later.
/// </summary>
/// <param name="db">Context used to query existing users and passed on to <c>SavePostsOrUsers()</c>.</param>
/// <param name="postType">Stored as <c>TriggeredBy</c> on every created <see cref="UserRevision"/>.</param>
/// <param name="userFieldChangeIgnorance">Passed through to <c>SavePostsOrUsers()</c>.</param>
/// <remarks>
/// Does nothing when there are no users, or when all of them are already locked.
/// If querying or saving throws, the uids locked by this call are released again before the exception
/// propagates; otherwise they would stay in the static lock set whenever the caller
/// does not reach <see cref="PostSaveHook"/>.
/// </remarks>
public void SaveUsers(
    CrawlerDbContext db,
    string postType,
    FieldChangeIgnoranceDelegates userFieldChangeIgnorance)
{
    if (users.IsEmpty) return;
    lock (UserIdLocks)
    {
        // only handle users whose uid is not in the static lock set
        var usersExceptLocked = new Dictionary<Uid, User>(users.ExceptBy(UserIdLocks, pair => pair.Key));
        if (usersExceptLocked.Count == 0) return;
        _savedUsersId.AddRange(usersExceptLocked.Keys);
        UserIdLocks.UnionWith(_savedUsersId);

        try
        {
            var existingUsersKeyByUid = (from user in db.Users.AsTracking().ForUpdate()
                where usersExceptLocked.Keys.Contains(user.Uid)
                select user).ToDictionary(u => u.Uid);
            SavePostsOrUsers(db, userFieldChangeIgnorance,
                u => new UserRevision
                {
                    TakenAt = u.UpdatedAt ?? u.CreatedAt,
                    Uid = u.Uid,
                    TriggeredBy = postType
                },

                // the lookup key is true for users that already exist in the database
                usersExceptLocked.Values.ToLookup(u => existingUsersKeyByUid.ContainsKey(u.Uid)),
                u => existingUsersKeyByUid[u.Uid]);
        }
        catch
        {
            // undo exactly the locks taken above so a failed save cannot leave these uids locked
            UserIdLocks.ExceptWith(usersExceptLocked.Keys);
            _savedUsersId.RemoveAll(uid => usersExceptLocked.ContainsKey(uid));
            throw;
        }
    }
}

/// <summary>
/// Locks every uid in <paramref name="usersId"/> that is not already in the static <c>UserIdLocks</c> set
/// and records them in <c>_savedUsersId</c> so that <see cref="PostSaveHook"/> releases them later.
/// </summary>
/// <param name="usersId">Uids the caller wants to save.</param>
/// <returns>The uids that were newly locked by this call; empty when all of them were already locked.</returns>
public IEnumerable<Uid> AcquireUidLocksForSave(IEnumerable<Uid> usersId)
{
    lock (UserIdLocks)
    {
        var newlyLocked = usersId.Except(UserIdLocks).ToList();
        if (newlyLocked.Count != 0)
        {
            // assume all given users are saved
            _savedUsersId.AddRange(newlyLocked);
            UserIdLocks.UnionWith(newlyLocked);
        }

        return newlyLocked;
    }
}

/// <summary>
/// Removes every uid recorded in <c>_savedUsersId</c> from the static <c>UserIdLocks</c> set.
/// </summary>
public void PostSaveHook()
{
    lock (UserIdLocks)
    {
        if (_savedUsersId.Count == 0) return;
        UserIdLocks.ExceptWith(_savedUsersId);
    }
}
}
Loading

0 comments on commit 086352d

Please sign in to comment.