Skip to content

Commit

Permalink
* now will log multiple entities with different value of revisioning …
Browse files Browse the repository at this point in the history
…field sharing the same author uid

* rename param `postAuthorFieldValueSelector` to `postRevisioningFieldSelector`
@ `AuthorRevisionSaver.Save()`

* now logs a warning when multiple entities sharing the same signature id have different values of the revisioning field @ `Save()`
+ record `ReplySignatureProjection`
@ ReplySignatureSaver.cs

* move parts of method `ToHexWhenByteArray()` from `SaverWithRevision.SaveEntitiesWithRevision()` to `shared.SharedHelper`
@ c#/crawler
  • Loading branch information
n0099 committed Jun 2, 2024
1 parent 407373f commit 4338ccf
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 9 deletions.
23 changes: 19 additions & 4 deletions c#/crawler/src/Tieba/Crawl/Saver/AuthorRevisionSaver.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ namespace tbm.Crawler.Tieba.Crawl.Saver;
// locks only using AuthorRevision.Fid and Uid, ignoring TriggeredBy
// this prevents inserting multiple entities with similar time and other fields with the same values
public class AuthorRevisionSaver(
ILogger<AuthorRevisionSaver> logger,
SaverLocks<(Fid Fid, Uid Uid)> authorExpGradeLocks,
PostType triggeredByPostType)
{
Expand Down Expand Up @@ -34,19 +35,33 @@ public Action SaveAuthorExpGradeRevisions<TPostWithAuthorExpGrade>
return authorExpGradeLocks.Dispose;
}

private static void Save<TPost, TRevision, TValue>(
private void Save<TPost, TRevision, TValue>(
CrawlerDbContext db,
IReadOnlyCollection<TPost> posts,
SaverLocks<(Fid Fid, Uid Uid)> locks,
IQueryable<TRevision> dbSet,
Func<TPost, TValue?> postAuthorFieldValueSelector,
Func<TPost, TValue?> postRevisioningFieldSelector,
Func<TValue?, TValue?, bool> isValueChangedPredicate,
Expression<Func<TRevision, LatestAuthorRevisionProjection<TValue>>> latestRevisionProjectionFactory,
Func<(Uid Uid, TValue? Value, Time DiscoveredAt), TRevision> revisionFactory)
where TPost : BasePost
where TRevision : AuthorRevision
{ // only takes the first of multiple post from the same author
var uniquePosts = posts.DistinctBy(p => p.AuthorUid).ToList();
if (uniquePosts.Count != posts.Count) (
from p in posts
group p by p.AuthorUid into g
where g.Count() > 1
from p in g
group p by (p.AuthorUid, postRevisioningFieldSelector(p)) into g
group g by g.Key.AuthorUid into gg
where gg.Count() > 1
from g in gg
select g)
.ForEach(g => logger.LogWarning(
"Multiple entities with different value of revisioning field sharing the same TPost.AuthorUid {}: {}",
g.Key, SharedHelper.UnescapedJsonSerialize(g)));

SharedHelper.GetNowTimestamp(out var now);
var existingRevisionOfExistingUsers = dbSet.AsNoTracking()
.Where(e => e.Fid == db.Fid
Expand All @@ -59,7 +74,7 @@ private static void Save<TPost, TRevision, TValue>(
(
e.Uid,
Existing: (e.DiscoveredAt, e.Value),
NewInPost: (DiscoveredAt: now, Value: postAuthorFieldValueSelector(p))
NewInPost: (DiscoveredAt: now, Value: postRevisioningFieldSelector(p))
)).ToList();
var newRevisionOfExistingUsers = existingRevisionOfExistingUsers

Expand All @@ -70,7 +85,7 @@ private static void Save<TPost, TRevision, TValue>(
.Select(t => (t.Uid, t.NewInPost.Value, t.NewInPost.DiscoveredAt));
var newRevisionOfNewUsers = uniquePosts
.ExceptBy(existingRevisionOfExistingUsers.Select(t => t.Uid), p => p.AuthorUid)
.Select(p => (Uid: p.AuthorUid, Value: postAuthorFieldValueSelector(p), DiscoveredAt: now));
.Select(p => (Uid: p.AuthorUid, Value: postRevisioningFieldSelector(p), DiscoveredAt: now));
var newRevisions = newRevisionOfNewUsers
.Concat(newRevisionOfExistingUsers)
.Select(revisionFactory)
Expand Down
46 changes: 42 additions & 4 deletions c#/crawler/src/Tieba/Crawl/Saver/ReplySignatureSaver.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,18 @@

namespace tbm.Crawler.Tieba.Crawl.Saver;

public class ReplySignatureSaver(SaverLocks<ReplySignatureSaver.UniqueSignature> locks)
public class ReplySignatureSaver(
ILogger<ReplySignatureSaver> logger,
SaverLocks<ReplySignatureSaver.UniqueSignature> locks)
{
public Action Save(CrawlerDbContext db, IEnumerable<ReplyPost> replies)
{
SharedHelper.GetNowTimestamp(out var now);
var signatures = replies
.Where(r => r is {SignatureId: not null, Signature: not null})
var repliesWithSignature = replies
.Where(r => r is {SignatureId: not null, Signature: not null}).ToList();
var signatures = repliesWithSignature

// only takes the first of multiple signature sharing the same id
.DistinctBy(r => r.SignatureId)
.Select(r => new ReplySignature
{
Expand All @@ -20,6 +25,21 @@ public Action Save(CrawlerDbContext db, IEnumerable<ReplyPost> replies)
LastSeenAt = now
}).ToList();
if (signatures.Count == 0) return () => { };
if (signatures.Count != repliesWithSignature
.GroupBy(r => new ReplySignatureProjection(r.SignatureId!.Value, r.Signature!))
.Count()) (
from r in repliesWithSignature
group r by r.SignatureId into g
where g.Count() > 1
from r in g
group r by new ReplySignatureProjection(r.SignatureId!.Value, r.Signature!) into g
group g by g.Key.SignatureId into gg
where gg.Count() > 1
from g in gg
select g)
.ForEach(g => logger.LogWarning(
"Multiple entities with different value of revisioning field sharing the same signature id {}: {}",
g.Key.SignatureId, SharedHelper.UnescapedJsonSerialize(g)));

var existingSignatures = (
from s in db.ReplySignatures.AsTracking()
Expand All @@ -43,10 +63,28 @@ join newInReply in signatures on existing.SignatureId equals newInReply.Signatur
return locks.Dispose;
}

/// <summary>
/// Pairs a signature id with its signature bytes.
/// Equality and hashing are overridden so that <see cref="Signature"/> takes part by content:
/// equality delegates to <c>ByteArrayEqualityComparer.Instance</c>
/// and the hash code is fed with the bytes themselves.
/// </summary>
private sealed record ReplySignatureProjection(uint SignatureId, byte[] Signature)
{
    /// <summary>Two projections are equal when both the id and the signature bytes match.</summary>
    public bool Equals(ReplySignatureProjection? other)
    {
        if (other is null) return false;
        if (SignatureId != other.SignatureId) return false;
        return ByteArrayEqualityComparer.Instance.Equals(Signature, other.Signature);
    }

    /// <summary>Combines the id and then the signature bytes, in that order, into one hash code.</summary>
    public override int GetHashCode()
    {
        var hashBuilder = new HashCode();
        hashBuilder.Add(SignatureId);
        hashBuilder.AddBytes(Signature);
        return hashBuilder.ToHashCode();
    }
}

public sealed record UniqueSignature(uint Id, byte[] XxHash3)
{
public bool Equals(UniqueSignature? other) =>
other != null && Id == other.Id && ByteArrayEqualityComparer.Instance.Equals(XxHash3, other.XxHash3);
other != null
&& Id == other.Id
&& ByteArrayEqualityComparer.Instance.Equals(XxHash3, other.XxHash3);

public override int GetHashCode()
{
Expand Down
2 changes: 1 addition & 1 deletion c#/crawler/src/Tieba/Crawl/Saver/SaverWithRevision.cs
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ bool IsTimestampingFieldName(string name) => name is nameof(BasePost.LastSeenAt)
if (!IRevisionProperties.Cache[typeof(TRevision)].TryGetValue(pName, out var revisionProp))
{
object? ToHexWhenByteArray(object? value) =>
value is byte[] bytes ? $"0x{Convert.ToHexString(bytes).ToLowerInvariant()}" : value;
value is byte[] bytes ? bytes.ToHex() : value;
logger.LogWarning("Updating field {} is not existing in revision table, " +
"newValue={}, oldValue={}, newObject={}, oldObject={}",
pName, ToHexWhenByteArray(p.CurrentValue), ToHexWhenByteArray(p.OriginalValue),
Expand Down
1 change: 1 addition & 0 deletions c#/shared/src/SharedHelper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,5 @@ public static string UnescapedJsonSerialize<TValue>(TValue value) =>
/// <summary>Stores the value returned by <see cref="GetNowTimestamp()"/> into <paramref name="now"/>.</summary>
/// <param name="now">Receives the current Unix timestamp in seconds.</param>
public static void GetNowTimestamp(out UInt32 now)
{
    now = GetNowTimestamp();
}

/// <summary>Returns the current time as Unix seconds, cast to an unsigned 32-bit integer.</summary>
[SuppressMessage("Maintainability", "AV1551:Method overload should call another overload")]
public static UInt32 GetNowTimestamp()
{
    var unixSeconds = DateTimeOffset.Now.ToUnixTimeSeconds();
    return (UInt32)unixSeconds;
}
/// <summary>Formats <paramref name="bytes"/> as lowercase hexadecimal digits prefixed with <c>0x</c>.</summary>
/// <param name="bytes">The bytes to format.</param>
/// <returns>A string such as <c>0xab01</c>.</returns>
public static string ToHex(this byte[] bytes)
{
    var lowerHex = Convert.ToHexString(bytes).ToLowerInvariant();
    return "0x" + lowerHex;
}
}

0 comments on commit 4338ccf

Please sign in to comment.