From 30c29705e929771ae7e8e4ad62953e1aa5d9899c Mon Sep 17 00:00:00 2001 From: n0099 Date: Wed, 12 Jun 2024 04:34:49 +0800 Subject: [PATCH] * fix `BasePostContent.ProtoBufBytes` possibly being null, in which case it cannot be parsed as protobuf encoding * add logging of the start and the duration of processing each forum @ `SimplifyImagesInAllReplyContentsWorker.DoWork()` @ crawler * rename all variables of type `Stopwatch` from `sw` to `stopwatch` @ ImageBatchConsumingWorker.cs @ imagePipeline * round the fractional part of `processMemory` in the log message to two digits @ `TransformEntityWorker.Transform()` @ shared @ c# --- .../SimplifyImagesInAllReplyContentsWorker.cs | 22 ++++++++++++++----- .../src/ImageBatchConsumingWorker.cs | 14 ++++++------ c#/shared/src/TransformEntityWorker.cs | 5 +++-- 3 files changed, 26 insertions(+), 15 deletions(-) diff --git a/c#/crawler/src/Worker/SimplifyImagesInAllReplyContentsWorker.cs b/c#/crawler/src/Worker/SimplifyImagesInAllReplyContentsWorker.cs index f570b196..257ab796 100644 --- a/c#/crawler/src/Worker/SimplifyImagesInAllReplyContentsWorker.cs +++ b/c#/crawler/src/Worker/SimplifyImagesInAllReplyContentsWorker.cs @@ -9,10 +9,13 @@ public class SimplifyImagesInAllReplyContentsWorker( { protected override async Task DoWork(CancellationToken stoppingToken) { + var stopwatch = new Stopwatch(); + stopwatch.Start(); await using var dbDefaultFactory = dbContextDefaultFactory(); var db = dbDefaultFactory.Value(); foreach (var fid in from e in db.Forums select e.Fid) { + logger.LogInformation("Simplify images in reply contents of fid {} started", fid); await using var dbFactory = dbContextFactory(); await Transform( () => dbFactory.Value(fid), @@ -20,6 +23,8 @@ await Transform( readingEntity => readingEntity.Pid, readingEntity => { + if (readingEntity.ProtoBufBytes == null) + return new() {Pid = readingEntity.Pid, ProtoBufBytes = null}; var protoBuf = Reply.Parser.ParseFrom(readingEntity.ProtoBufBytes); ReplyParser.SimplifyImagesInReplyContent(logger, ref 
protoBuf); return new() {Pid = readingEntity.Pid, ProtoBufBytes = protoBuf.ToByteArray()}; @@ -30,13 +35,18 @@ await Transform( p.IsModified = !ByteArrayEqualityComparer.Instance.Equals(p.OriginalValue, p.CurrentValue); }, (writingDb, writingEntities) => replyContentImageSaver - .Save(writingDb, writingEntities.Select(e => new ReplyPost - { - Pid = e.Pid, - Content = null!, - ContentsProtoBuf = Reply.Parser.ParseFrom(e.ProtoBufBytes).Content - })), + .Save(writingDb, writingEntities + .Where(e => e.ProtoBufBytes != null) + .Select(e => new ReplyPost + { + Pid = e.Pid, + Content = null!, + ContentsProtoBuf = Reply.Parser.ParseFrom(e.ProtoBufBytes).Content + })), stoppingToken); + logger.LogInformation("Simplify images in reply contents of fid {} finished after {:F2}s", + fid, stopwatch.Elapsed.TotalSeconds); + stopwatch.Restart(); } } } diff --git a/c#/imagePipeline/src/ImageBatchConsumingWorker.cs b/c#/imagePipeline/src/ImageBatchConsumingWorker.cs index a16def59..267425cc 100644 --- a/c#/imagePipeline/src/ImageBatchConsumingWorker.cs +++ b/c#/imagePipeline/src/ImageBatchConsumingWorker.cs @@ -76,10 +76,10 @@ void MarkImagesInReplyAsConsumed logger.LogTrace("Start to consume {} image(s): [{}]", imagesWithBytes.Count, string.Join(',', imagesInReply.Select(i => i.ImageId))); - var sw = new Stopwatch(); + var stopwatch = new Stopwatch(); void LogStopwatch(string consumerType, IReadOnlyCollection imagesId) => logger.LogTrace("Spend {}ms to {} for {} image(s): [{}]", - sw.ElapsedMilliseconds, consumerType, imagesId.Count, string.Join(',', imagesId)); + stopwatch.ElapsedMilliseconds, consumerType, imagesId.Count, string.Join(',', imagesId)); void ConsumeConsumer( Expression> selector, IReadOnlyCollection images, @@ -87,7 +87,7 @@ void ConsumeConsumer( where TConsumer : IConsumer { using var consumer = consumerFactory(); - sw.Restart(); + stopwatch.Restart(); #pragma warning disable IDE0042 // Deconstruct variable declaration var imagesId = consumer.Value.Consume(db, 
images, stoppingToken); #pragma warning restore IDE0042 // Deconstruct variable declaration @@ -262,12 +262,12 @@ async Task> ConsumeByFidWithScript( var ocrConsumer = consumerFactory.Value(script); await ocrConsumer.InitializePaddleOcr(stoppingToken); - var sw = new Stopwatch(); - sw.Start(); + var stopwatch = new Stopwatch(); + stopwatch.Start(); #pragma warning disable IDE0042 // Deconstruct variable declaration var imagesId = ocrConsumer.Consume(db, imagesInCurrentFid, stoppingToken); #pragma warning restore IDE0042 // Deconstruct variable declaration - sw.Stop(); + stopwatch.Stop(); markImageInReplyAsConsumed(imagesId.Consumed); var failed = imagesId.Failed.ToList(); @@ -275,7 +275,7 @@ async Task> ConsumeByFidWithScript( logger.LogError("Failed to detect and recognize {} script text for fid {} in {} image(s): [{}]", script, fid, failed.Count, string.Join(',', failed)); logger.LogTrace("Spend {}ms to detect and recognize {} script text for fid {} in {} image(s): [{}]", - sw.ElapsedMilliseconds, script, fid, imagesInCurrentFid.Count, + stopwatch.ElapsedMilliseconds, script, fid, imagesInCurrentFid.Count, string.Join(',', imagesInCurrentFid.Select(i => i.ImageId))); return ocrConsumer.RecognizedTextLines; diff --git a/c#/shared/src/TransformEntityWorker.cs b/c#/shared/src/TransformEntityWorker.cs index a88c9bc6..1c9f4f46 100644 --- a/c#/shared/src/TransformEntityWorker.cs +++ b/c#/shared/src/TransformEntityWorker.cs @@ -55,7 +55,7 @@ async Task SaveThenLog(int processedCount, Process currentProcess) writingEntities.Clear(); writingDb.ChangeTracker.Clear(); - logger.LogTrace("processedEntityCount:{} updatedEntityCount:{} elapsed:{}ms processMemory:{}MiB exceptions:{}", + logger.LogTrace("processedEntityCount:{} updatedEntityCount:{} elapsed:{}ms processMemory:{:F2}MiB exceptions:{}", processedCount, updatedEntityCount, stopwatch.ElapsedMilliseconds, currentProcess.PrivateMemorySize64 / 1024f / 1024, @@ -66,7 +66,8 @@ async Task SaveThenLog(int processedCount, 
Process currentProcess) foreach (var readingEntity in readingEntities) { processedEntityCount++; - if (processedEntityCount % saveByNthEntityCount == 0) await SaveThenLog(processedEntityCount, process); + if (processedEntityCount % saveByNthEntityCount == 0) + await SaveThenLog(processedEntityCount, process); if (stoppingToken.IsCancellationRequested) break; try {