Skip to content

Commit

Permalink
+ class ReplyContentImageSaver to extract the method `SaveReplyCont…
Browse files Browse the repository at this point in the history
…entImages()` from `ReplySaver`

* rename method `ReplySignatureSaver.SaveReplySignatures()` to `Save()`
* fix namespace not matching with path hierarchy  @ `ReplySaver` & `ReplyParser`
@ c#/crawler
  • Loading branch information
n0099 committed May 16, 2024
1 parent c6adc12 commit 114f3c8
Show file tree
Hide file tree
Showing 5 changed files with 61 additions and 58 deletions.
1 change: 1 addition & 0 deletions c#/crawler/src/EntryPoint.cs
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ protected override void ConfigureContainer(HostBuilderContext context, Container
// eagerly initialize all keyed CrawlerLocks singleton instances
// in order to sync their timer of WithLogTrace
.AutoActivate());
builder.RegisterType<ReplyContentImageSaver>();
builder.RegisterType<ReplySignatureSaver>();
builder.RegisterType<AuthorRevisionSaver>();
builder.RegisterType<UserParser>();
Expand Down
2 changes: 1 addition & 1 deletion c#/crawler/src/Tieba/Crawl/Parser/Post/ReplyParser.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
namespace tbm.Crawler.Tieba.Crawl.Parser;
namespace tbm.Crawler.Tieba.Crawl.Parser.Post;

public partial class ReplyParser(ILogger<ReplyParser> logger)
: PostParser<ReplyPost, Reply>
Expand Down
61 changes: 5 additions & 56 deletions c#/crawler/src/Tieba/Crawl/Saver/Post/ReplySaver.cs
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
using PredicateBuilder = LinqKit.PredicateBuilder;

namespace tbm.Crawler.Tieba.Crawl.Saver;
namespace tbm.Crawler.Tieba.Crawl.Saver.Post;

public partial class ReplySaver(
public class ReplySaver(
ILogger<ReplySaver> logger,
ConcurrentDictionary<PostId, ReplyPost> posts,
SaverLocks<string> imageInReplyLocks,
ReplyContentImageSaver replyContentImageSaver,
ReplySignatureSaver replySignatureSaver,
AuthorRevisionSaver.New authorRevisionSaverFactory)
: PostSaver<ReplyPost, BaseReplyRevision>(
Expand Down Expand Up @@ -69,61 +69,10 @@ public override SaverChangeSet<ReplyPost> Save(CrawlerDbContext db)

db.ReplyContents.AddRange(changeSet.NewlyAdded
.Select(r => new ReplyContent {Pid = r.Pid, ProtoBufBytes = r.Content}));
SaveReplyContentImages(db, changeSet.NewlyAdded);
replyContentImageSaver.Save(db, changeSet.NewlyAdded);
PostSaveHandlers += AuthorRevisionSaver.SaveAuthorExpGradeRevisions(db, changeSet.AllAfter).Invoke;
PostSaveHandlers += replySignatureSaver.SaveReplySignatures(db, changeSet.AllAfter).Invoke;
PostSaveHandlers += replySignatureSaver.Save(db, changeSet.AllAfter).Invoke;

return changeSet;
}
}
public partial class ReplySaver
{
/// <summary>
/// Queues <see cref="ReplyContentImage"/> rows on <paramref name="db"/> that link each reply
/// to the images referenced by its original contents.
/// </summary>
/// <param name="db">The context whose <c>ImageInReplies</c> set is queried (with tracking)
/// and whose <c>ReplyContentImages</c> set receives the new rows.
/// This method only queues changes; it does not call <c>SaveChanges()</c> itself.</param>
/// <param name="replies">The replies whose <c>OriginalContents</c> are scanned.</param>
private void SaveReplyContentImages(CrawlerDbContext db, IEnumerable<ReplyPost> replies)
{
// Flatten every reply into (Pid, ImageInReply) pairs, one pair per matching content item,
// then drop repeated (Pid, UrlFilename) combinations.
var pidAndImageList = (
from r in replies
from c in r.OriginalContents
// NOTE(review): 3 is presumably the content type of an image - confirm against the protobuf schema
where c.Type == 3
where // only save image filenames (without extension) that were extracted from the url by ReplyParser.Convert()
ReplyParser.ValidateContentImageFilenameRegex().IsMatch(c.OriginSrc)
select (r.Pid, Image: new ImageInReply
{
UrlFilename = c.OriginSrc,
ExpectedByteSize = c.OriginSize
}))
.DistinctBy(t => (t.Pid, t.Image.UrlFilename))
.ToList();
// nothing to link, so skip the database query below
if (pidAndImageList.Count == 0) return;

// Keep exactly one ImageInReply instance per UrlFilename (the first one encountered).
var imagesKeyByUrlFilename = pidAndImageList.Select(t => t.Image)
.DistinctBy(image => image.UrlFilename).ToDictionary(image => image.UrlFilename);
// Load the already stored images with the same filenames as tracked entities,
// so the property assignment below is recorded on the tracked instances.
var existingImages = (
from e in db.ImageInReplies.AsTracking()
where imagesKeyByUrlFilename.Keys.Contains(e.UrlFilename)
select e)
.ToDictionary(e => e.UrlFilename);
// Backfill ExpectedByteSize on stored images where it is 0
// with the value carried by the image of the same filename in the current replies.
(from existing in existingImages.Values
where existing.ExpectedByteSize == 0 // randomly respond with 0
join newInContent in imagesKeyByUrlFilename.Values
on existing.UrlFilename equals newInContent.UrlFilename
select (existing, newInContent))
.ForEach(t => t.existing.ExpectedByteSize = t.newInContent.ExpectedByteSize);
// Filenames seen in the current replies that are not stored yet.
var newImagesUrlFilename = imagesKeyByUrlFilename.ExceptByKey(existingImages.Keys).Keys().ToList();
// Pairs whose filename is in the collection returned by AcquireLocks() are left out.
// NOTE(review): the meaning of that returned collection is defined by SaverLocks,
// which is not visible here - confirm which filenames it contains.
db.ReplyContentImages.AddRange(pidAndImageList
.ExceptBy(imageInReplyLocks.AcquireLocks(newImagesUrlFilename), t => t.Image.UrlFilename)
.Select(t => new ReplyContentImage
{
Pid = t.Pid,

// no need to manually invoke DbContext.AddRange(images) since EF Core will do this chore
// https://stackoverflow.com/questions/5212751/how-can-i-retrieve-id-of-inserted-entity-using-entity-framework/41146434#41146434
// reusing the same instance from imagesKeyByUrlFilename
// prevents assigning multiple different instances with the same key,
// which would cause EF Core to insert the identical entry more than once, leading to a duplicated entry error
// https://github.com/dotnet/efcore/issues/30236
ImageInReply = existingImages.TryGetValue(t.Image.UrlFilename, out var e)
? e
: imagesKeyByUrlFilename[t.Image.UrlFilename]
}));
}
}
53 changes: 53 additions & 0 deletions c#/crawler/src/Tieba/Crawl/Saver/ReplyContentImageSaver.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
namespace tbm.Crawler.Tieba.Crawl.Saver;

/// <summary>
/// Queues the rows that link replies to the images referenced by their original contents.
/// </summary>
public class ReplyContentImageSaver(SaverLocks<string> locks)
{
    /// <summary>
    /// Scans <paramref name="replies"/> for image contents and adds one
    /// <see cref="ReplyContentImage"/> per distinct (Pid, UrlFilename) pair to <paramref name="db"/>.
    /// Only queues changes on the context; <c>SaveChanges()</c> is not called here.
    /// </summary>
    /// <param name="db">Context that is queried for stored images and receives the new rows.</param>
    /// <param name="replies">Replies whose <c>OriginalContents</c> are inspected.</param>
    public void Save(CrawlerDbContext db, IEnumerable<ReplyPost> replies)
    {
        var pidImagePairs = replies
            .SelectMany(reply => reply.OriginalContents, (reply, content) => (reply, content))
            .Where(t => t.content.Type == 3

                // only keep filenames (without extension) that ReplyParser.Convert() extracted from the url
                && ReplyParser.ValidateContentImageFilenameRegex().IsMatch(t.content.OriginSrc))
            .Select(t => (t.reply.Pid, Image: new ImageInReply
            {
                UrlFilename = t.content.OriginSrc,
                ExpectedByteSize = t.content.OriginSize
            }))
            .DistinctBy(pair => (pair.Pid, pair.Image.UrlFilename))
            .ToList();
        if (pidImagePairs.Count == 0)
        {
            return;
        }

        // one shared ImageInReply instance per filename, the first occurrence wins
        var imageByFilename = pidImagePairs
            .Select(pair => pair.Image)
            .DistinctBy(image => image.UrlFilename)
            .ToDictionary(image => image.UrlFilename);

        // tracked entities, so that the size backfill below is picked up by the context
        var storedByFilename = db.ImageInReplies.AsTracking()
            .Where(stored => imageByFilename.Keys.Contains(stored.UrlFilename))
            .ToDictionary(stored => stored.UrlFilename);

        foreach (var stored in storedByFilename.Values)
        {
            // randomly respond with 0
            if (stored.ExpectedByteSize != 0)
            {
                continue;
            }

            if (imageByFilename.TryGetValue(stored.UrlFilename, out var fresh))
            {
                stored.ExpectedByteSize = fresh.ExpectedByteSize;
            }
        }

        var unseenFilenames = imageByFilename.ExceptByKey(storedByFilename.Keys).Keys().ToList();
        var excludedFilenames = locks.AcquireLocks(unseenFilenames);

        // EF Core inserts the referenced ImageInReply on its own, no DbContext.AddRange(images) needed
        // https://stackoverflow.com/questions/5212751/how-can-i-retrieve-id-of-inserted-entity-using-entity-framework/41146434#41146434
        // always hand out the very same instance for a given filename:
        // different instances sharing one key make EF Core insert the identical entry
        // more than once, which ends in a duplicated entry error
        // https://github.com/dotnet/efcore/issues/30236
        ImageInReply ResolveImage(string urlFilename) =>
            storedByFilename.TryGetValue(urlFilename, out var tracked)
                ? tracked
                : imageByFilename[urlFilename];

        var links = pidImagePairs
            .ExceptBy(excludedFilenames, pair => pair.Image.UrlFilename)
            .Select(pair => new ReplyContentImage
            {
                Pid = pair.Pid,
                ImageInReply = ResolveImage(pair.Image.UrlFilename)
            });
        db.ReplyContentImages.AddRange(links);
    }
}
2 changes: 1 addition & 1 deletion c#/crawler/src/Tieba/Crawl/Saver/ReplySignatureSaver.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ namespace tbm.Crawler.Tieba.Crawl.Saver;

public class ReplySignatureSaver(SaverLocks<ReplySignatureSaver.UniqueSignature> locks)
{
public Action SaveReplySignatures(CrawlerDbContext db, IEnumerable<ReplyPost> replies)
public Action Save(CrawlerDbContext db, IEnumerable<ReplyPost> replies)
{
SharedHelper.GetNowTimestamp(out var now);
var signatures = replies
Expand Down

0 comments on commit 114f3c8

Please sign in to comment.