From e63d418817eace48612812355fc2d9b62069e63e Mon Sep 17 00:00:00 2001 From: Bill Wagner Date: Tue, 28 Jan 2025 15:35:33 -0500 Subject: [PATCH 1/7] First spike: Get dates This commit adds the code to get all the dates for every published file (markdown or YAML) in a repo. --- cleanrepo/Program.cs | 59 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/cleanrepo/Program.cs b/cleanrepo/Program.cs index a332c7e..a022ee6 100644 --- a/cleanrepo/Program.cs +++ b/cleanrepo/Program.cs @@ -24,7 +24,8 @@ class Program "FilterImagesForText", "ReplaceRedirectTargets", "ReplaceWithRelativeLinks", - "RemoveRedirectHops" + "RemoveRedirectHops", + "AuditMSDate" ]; static void Main(string[] args) @@ -349,6 +350,36 @@ static void RunOptions(Options options) docFxRepo.RemoveAllRedirectHops(); Console.WriteLine("\nFinished removing redirect hops."); + break; + } + // Audit the 'ms.date' property in all markdown files. + case "AuditMSDate": + { + Console.WriteLine($"\nAuditing the 'ms.date' property in all markdown files in '{options.TargetDirectory}'..."); + + if (docFxRepo.AllTocFiles is null) + return; + + List articleFiles = HelperMethods.GetMarkdownFiles(options.TargetDirectory, "snippets", "includes"); + + + articleFiles.AddRange(HelperMethods.GetYAMLFiles(options.TargetDirectory)); + + if (articleFiles is null) + return; + + var linkedArticles = from article in articleFiles + where (string.Compare(article.Name, "toc.md", true) != 0) && + (string.Compare(article.Name, "toc.yml", true) != 0) && + docFxRepo.AllTocFiles.Any(tocFile => IsFileLinkedFromTocFile(article, tocFile)) + select article; + foreach (var article in linkedArticles) + { + // Get the ms.date value: + DateOnly? 
msDate = GetmsDate(article.FullName); + Console.WriteLine($"{article.FullName}: {msDate}"); + } + break; } default: @@ -363,6 +394,32 @@ static void RunOptions(Options options) Console.WriteLine($"Elapsed time: {stopwatch.Elapsed.ToHumanReadableString()}"); } + private static DateOnly? GetmsDate(string filePath) + { + DateOnly? msDate = default; + foreach (var line in File.ReadLines(filePath)) + { + if (line.Contains("ms.date")) + { + string[] parts = line.Split(":"); + if (parts.Length > 1) + { + string date = parts[1].Trim().Replace("\"", ""); // yeah, remove quotes. + if (DateOnly.TryParse(date, out DateOnly parsedDate)) + { + msDate = parsedDate; + break; + } + else + { + Console.Error.WriteLine($"Invalid date format in {filePath}: {date}"); + } + } + } + } + return msDate; + } + #region Replace site-relative links private static void ReplaceLinks(List linkingFiles, string urlBasePath, string rootDirectory) { From 68b20bc84c2d179d3855a96a9b0e1bc5812c913e Mon Sep 17 00:00:00 2001 From: Bill Wagner Date: Tue, 28 Jan 2025 17:25:24 -0500 Subject: [PATCH 2/7] Query and write diffs First pass at a query to find diffs since the ms.date value has been updated. 
--- DotNet.DocsTools/GitHubObjects/FileHistory.cs | 87 +++++++++++++++++++ .../GitHubObjects/ResponseExtractors.cs | 6 ++ cleanrepo/CleanRepo.csproj | 3 + cleanrepo/Program.cs | 44 ++++++++-- 4 files changed, 133 insertions(+), 7 deletions(-) create mode 100644 DotNet.DocsTools/GitHubObjects/FileHistory.cs diff --git a/DotNet.DocsTools/GitHubObjects/FileHistory.cs b/DotNet.DocsTools/GitHubObjects/FileHistory.cs new file mode 100644 index 0000000..0fa6d91 --- /dev/null +++ b/DotNet.DocsTools/GitHubObjects/FileHistory.cs @@ -0,0 +1,87 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Text.Json; +using System.Threading.Tasks; +using DotNetDocs.Tools.GitHubCommunications; + +namespace DotNet.DocsTools.GitHubObjects; + +/// +/// The query variables for file history +/// +/// The GitHub org for the repository +/// The repository name +/// The path to the file +public readonly record struct FileHistoryVariables(string owner, string repo, string path); + +public class FileHistory : IGitHubQueryResult +{ + private const string FileHistoryQueryText = """ + query FileHistory($owner: String!, $repo: String!, $path: String!, $cursor: String) { + repository(owner: $owner, name: $repo) { + defaultBranchRef { + target { + ... on Commit { + history( + first: 25 + after: $cursor + path: $path + ) { + nodes { + committedDate + changedFilesIfAvailable + additions + deletions + } + pageInfo { + hasNextPage + endCursor + } + } + } + } + } + } + } + """; + + + public DateTime CommittedDate { get; } + + public int? 
ChangedFilesIfAvailable { get; } + + public int Additions { get; } + + public int Deletions { get; } + private FileHistory(JsonElement element) + { + CommittedDate = ResponseExtractors.DateTimeProperty(element, "committedDate"); + ChangedFilesIfAvailable = ResponseExtractors.IntProperty(element, "changedFilesIfAvailable"); + Additions = ResponseExtractors.IntProperty(element, "additions"); + Deletions = ResponseExtractors.IntProperty(element, "deletions"); + } + + public static GraphQLPacket GetQueryPacket(FileHistoryVariables variables, bool isScalar) => + (isScalar) + ? throw new InvalidOperationException("This query is not a scalar query") + : new() + { + query = FileHistoryQueryText, + variables = + { + ["owner"] = variables.owner, + ["repo"] = variables.repo, + ["path"] = variables.path, + } + }; + + public static IEnumerable NavigationToNodes(bool isScalar) => + (isScalar) + ? throw new InvalidOperationException("This query is not a scalar query") + : ["repository", "defaultBranchRef", "target", "history"]; + + public static FileHistory? FromJsonElement(JsonElement element, FileHistoryVariables variables) => + new FileHistory(element); +} diff --git a/DotNet.DocsTools/GitHubObjects/ResponseExtractors.cs b/DotNet.DocsTools/GitHubObjects/ResponseExtractors.cs index 031a1a6..e080452 100644 --- a/DotNet.DocsTools/GitHubObjects/ResponseExtractors.cs +++ b/DotNet.DocsTools/GitHubObjects/ResponseExtractors.cs @@ -112,4 +112,10 @@ internal static int IntProperty(JsonElement element, string propertyName) } throw new ArgumentException($"Property {propertyName} not found in Json element. Did you possibly access the parent node?", nameof(element)); } + + internal static DateTime DateTimeProperty(JsonElement element, string propertyName) + { + return OptionalDateProperty(element, propertyName) + ?? 
throw new ArgumentException("Requested property shouldn't be null", nameof(propertyName)); + } } diff --git a/cleanrepo/CleanRepo.csproj b/cleanrepo/CleanRepo.csproj index 992e7fc..f1b3c8c 100644 --- a/cleanrepo/CleanRepo.csproj +++ b/cleanrepo/CleanRepo.csproj @@ -21,6 +21,9 @@ + + + PreserveNewest diff --git a/cleanrepo/Program.cs b/cleanrepo/Program.cs index a022ee6..6f03fdd 100644 --- a/cleanrepo/Program.cs +++ b/cleanrepo/Program.cs @@ -4,6 +4,9 @@ using System.Text.Json; using System.Text.RegularExpressions; using CleanRepo.Extensions; +using DotNet.DocsTools.GitHubObjects; +using DotNetDocs.Tools.GitHubCommunications; +using DotNetDocs.Tools.GraphQLQueries; using Microsoft.Build.Construction; using Microsoft.Extensions.Configuration; using Microsoft.Extensions.Hosting; @@ -28,7 +31,7 @@ class Program "AuditMSDate" ]; - static void Main(string[] args) + static async Task Main(string[] args) { HostApplicationBuilder builder = Host.CreateApplicationBuilder(args); builder.Configuration.Sources.Clear(); @@ -46,10 +49,10 @@ static void Main(string[] args) builder.Configuration.GetSection(nameof(Options)) .Bind(options); - RunOptions(options); + await RunOptions(options); } - static void RunOptions(Options options) + static async Task RunOptions(Options options) { if (String.IsNullOrEmpty(options.Function)) { @@ -357,6 +360,12 @@ static void RunOptions(Options options) { Console.WriteLine($"\nAuditing the 'ms.date' property in all markdown files in '{options.TargetDirectory}'..."); + var config = new ConfigurationBuilder() + .AddEnvironmentVariables() + .Build(); + string key = config["GitHubKey"]!; + IGitHubClient client = IGitHubClient.CreateGitHubClient(key); + if (docFxRepo.AllTocFiles is null) return; @@ -376,8 +385,29 @@ static void RunOptions(Options options) foreach (var article in linkedArticles) { // Get the ms.date value: - DateOnly? msDate = GetmsDate(article.FullName); - Console.WriteLine($"{article.FullName}: {msDate}"); + DateOnly? 
msDate = await GetmsDate(article.FullName); + + var query = new EnumerationQuery(client); + var path = article.FullName.Replace(options.DocFxDirectory, "").Replace('\\', '/').Remove(0,1); + + var variables = new FileHistoryVariables("dotnet", "docs", path); + DateOnly? commitDate = default; + int numberChanges = 0; + int numberPRs = 0; + await foreach (var history in query.PerformQuery(variables)) + { + numberPRs++; + commitDate ??= DateOnly.FromDateTime(history.CommittedDate); + if (msDate >= DateOnly.FromDateTime(history.CommittedDate.AddDays(-7))) // edit vs. merge. + { + break; + } + numberChanges += Math.Max(history.Deletions, history.Additions); + } + if (numberChanges > 0) + { + Console.WriteLine($"msDate: {msDate}, commitDate: {commitDate}, {numberPRs} merged with {numberChanges} changes to {path}"); + } } break; @@ -394,10 +424,10 @@ static void RunOptions(Options options) Console.WriteLine($"Elapsed time: {stopwatch.Elapsed.ToHumanReadableString()}"); } - private static DateOnly? GetmsDate(string filePath) + private static async Task GetmsDate(string filePath) { DateOnly? msDate = default; - foreach (var line in File.ReadLines(filePath)) + await foreach (var line in File.ReadLinesAsync(filePath)) { if (line.Contains("ms.date")) { From a05e13944f4a2ba0bdc3d515f4a2b5775eabf52b Mon Sep 17 00:00:00 2001 From: Bill Wagner Date: Thu, 30 Jan 2025 14:56:11 -0500 Subject: [PATCH 3/7] This works well. Functionality is all there. Time to do some refactoring. Then, update the readme --- cleanrepo/Program.cs | 74 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 67 insertions(+), 7 deletions(-) diff --git a/cleanrepo/Program.cs b/cleanrepo/Program.cs index 6f03fdd..d89df7c 100644 --- a/cleanrepo/Program.cs +++ b/cleanrepo/Program.cs @@ -358,8 +358,11 @@ static async Task RunOptions(Options options) // Audit the 'ms.date' property in all markdown files. 
case "AuditMSDate": { + string[] progressMarkers = ["| -", "/ \\", "- |", "\\ /"]; + string erase = "\b\b\b\b\b\b\b\b\b\b\b"; Console.WriteLine($"\nAuditing the 'ms.date' property in all markdown files in '{options.TargetDirectory}'..."); + var config = new ConfigurationBuilder() .AddEnvironmentVariables() .Build(); @@ -374,6 +377,8 @@ static async Task RunOptions(Options options) articleFiles.AddRange(HelperMethods.GetYAMLFiles(options.TargetDirectory)); + Console.WriteLine($"Total number of files to process: {articleFiles.Count}"); + if (articleFiles is null) return; @@ -382,34 +387,67 @@ static async Task RunOptions(Options options) (string.Compare(article.Name, "toc.yml", true) != 0) && docFxRepo.AllTocFiles.Any(tocFile => IsFileLinkedFromTocFile(article, tocFile)) select article; + int totalArticles = 0; + int freshArticles = 0; + int trulyStateArticles = 0; + int falseStaleArticles = 0; + // Make this configurable: + DateOnly staleContentDate = DateOnly.FromDateTime(DateTime.Now.AddYears(-1)); + Console.WriteLine($"PRs Changes Last Commit ms.date Path"); + Console.Write($"{totalArticles,7} {progressMarkers[totalArticles % progressMarkers.Length]}"); foreach (var article in linkedArticles) { - // Get the ms.date value: + totalArticles++; + Console.Write($"{erase}{totalArticles,7} {progressMarkers[totalArticles % progressMarkers.Length]}"); + // Do the cheapest test first: Is this fresh? DateOnly? msDate = await GetmsDate(article.FullName); + if (msDate > staleContentDate) + { + freshArticles++; + continue; + } + + // Next cheapest test: Are there recent commits? + DateOnly? commitDate = GetCommitDate(options.DocFxDirectory, article.FullName); + if (commitDate < staleContentDate) + { + trulyStateArticles++; + continue; + } + + // reset commit Date. That way, the value from GitHub is used + // instead of the file value. + commitDate = null; + // Give a week from msDate to allow for PR edits before merging. 
+ DateOnly msDateMergeDate = DateOnly.FromDateTime(new DateTime(msDate.Value, default).AddDays(7)); var query = new EnumerationQuery(client); var path = article.FullName.Replace(options.DocFxDirectory, "").Replace('\\', '/').Remove(0,1); var variables = new FileHistoryVariables("dotnet", "docs", path); - DateOnly? commitDate = default; int numberChanges = 0; int numberPRs = 0; await foreach (var history in query.PerformQuery(variables)) { - numberPRs++; commitDate ??= DateOnly.FromDateTime(history.CommittedDate); - if (msDate >= DateOnly.FromDateTime(history.CommittedDate.AddDays(-7))) // edit vs. merge. + if ((DateOnly.FromDateTime(history.CommittedDate) <= msDateMergeDate) || + (DateOnly.FromDateTime(history.CommittedDate) <= staleContentDate)) { break; } - numberChanges += Math.Max(history.Deletions, history.Additions); + numberPRs++; + if (history.ChangedFilesIfAvailable < 500) + numberChanges += Math.Max(history.Deletions, history.Additions); } if (numberChanges > 0) { - Console.WriteLine($"msDate: {msDate}, commitDate: {commitDate}, {numberPRs} merged with {numberChanges} changes to {path}"); + Console.Write(erase); + falseStaleArticles++; + Console.WriteLine($"{numberPRs,3} {numberChanges,7} {commitDate:MM-dd-yyyy} {msDate:MM-dd-yyyy} {path}"); + Console.Write($"{totalArticles,7} {progressMarkers[totalArticles % progressMarkers.Length]}"); } } - + Console.WriteLine($"{erase} {totalArticles} checked. Fresh: {freshArticles}. Truly stale: {trulyStateArticles}. 
Updated but not fresh: {falseStaleArticles}"); break; } default: @@ -424,6 +462,28 @@ static async Task RunOptions(Options options) Console.WriteLine($"Elapsed time: {stopwatch.Elapsed.ToHumanReadableString()}"); } + private static DateOnly GetCommitDate(string folder, string path) + { + // Create a new process + Process process = new Process(); + process.StartInfo.FileName = "git"; + process.StartInfo.Arguments = $"""log -1 --format="%cd" --date=short {path}"""; + process.StartInfo.RedirectStandardOutput = true; + process.StartInfo.UseShellExecute = false; + process.StartInfo.CreateNoWindow = true; + process.StartInfo.WorkingDirectory = folder; + + // Start the process + process.Start(); + + // Read the output + string output = process.StandardOutput.ReadToEnd(); + + // Wait for the process to exit + process.WaitForExit(); + return DateOnly.Parse(output); + } + private static async Task GetmsDate(string filePath) { DateOnly? msDate = default; From 3a2bd80bc8f44ca6fafacb6c98ea8481e5e5f776 Mon Sep 17 00:00:00 2001 From: Bill Wagner Date: Thu, 30 Jan 2025 16:15:18 -0500 Subject: [PATCH 4/7] refactor and add docs Refactor the code a bit, and update the readme file. --- cleanrepo/Program.cs | 235 ++++++++++++++++++++++--------------------- cleanrepo/README.md | 2 + 2 files changed, 125 insertions(+), 112 deletions(-) diff --git a/cleanrepo/Program.cs b/cleanrepo/Program.cs index d89df7c..fb460f8 100644 --- a/cleanrepo/Program.cs +++ b/cleanrepo/Program.cs @@ -358,23 +358,13 @@ static async Task RunOptions(Options options) // Audit the 'ms.date' property in all markdown files. 
case "AuditMSDate": { - string[] progressMarkers = ["| -", "/ \\", "- |", "\\ /"]; - string erase = "\b\b\b\b\b\b\b\b\b\b\b"; Console.WriteLine($"\nAuditing the 'ms.date' property in all markdown files in '{options.TargetDirectory}'..."); - - var config = new ConfigurationBuilder() - .AddEnvironmentVariables() - .Build(); - string key = config["GitHubKey"]!; - IGitHubClient client = IGitHubClient.CreateGitHubClient(key); - if (docFxRepo.AllTocFiles is null) return; List articleFiles = HelperMethods.GetMarkdownFiles(options.TargetDirectory, "snippets", "includes"); - articleFiles.AddRange(HelperMethods.GetYAMLFiles(options.TargetDirectory)); Console.WriteLine($"Total number of files to process: {articleFiles.Count}"); @@ -382,72 +372,7 @@ static async Task RunOptions(Options options) if (articleFiles is null) return; - var linkedArticles = from article in articleFiles - where (string.Compare(article.Name, "toc.md", true) != 0) && - (string.Compare(article.Name, "toc.yml", true) != 0) && - docFxRepo.AllTocFiles.Any(tocFile => IsFileLinkedFromTocFile(article, tocFile)) - select article; - int totalArticles = 0; - int freshArticles = 0; - int trulyStateArticles = 0; - int falseStaleArticles = 0; - // Make this configurable: - DateOnly staleContentDate = DateOnly.FromDateTime(DateTime.Now.AddYears(-1)); - Console.WriteLine($"PRs Changes Last Commit ms.date Path"); - Console.Write($"{totalArticles,7} {progressMarkers[totalArticles % progressMarkers.Length]}"); - foreach (var article in linkedArticles) - { - totalArticles++; - Console.Write($"{erase}{totalArticles,7} {progressMarkers[totalArticles % progressMarkers.Length]}"); - // Do the cheapest test first: Is this fresh? - DateOnly? msDate = await GetmsDate(article.FullName); - if (msDate > staleContentDate) - { - freshArticles++; - continue; - } - - // Next cheapest test: Are there recent commits? - DateOnly? 
commitDate = GetCommitDate(options.DocFxDirectory, article.FullName); - if (commitDate < staleContentDate) - { - trulyStateArticles++; - continue; - } - - // reset commit Date. That way, the value from GitHub is used - // instead of the file value. - commitDate = null; - // Give a week from msDate to allow for PR edits before merging. - DateOnly msDateMergeDate = DateOnly.FromDateTime(new DateTime(msDate.Value, default).AddDays(7)); - - var query = new EnumerationQuery(client); - var path = article.FullName.Replace(options.DocFxDirectory, "").Replace('\\', '/').Remove(0,1); - - var variables = new FileHistoryVariables("dotnet", "docs", path); - int numberChanges = 0; - int numberPRs = 0; - await foreach (var history in query.PerformQuery(variables)) - { - commitDate ??= DateOnly.FromDateTime(history.CommittedDate); - if ((DateOnly.FromDateTime(history.CommittedDate) <= msDateMergeDate) || - (DateOnly.FromDateTime(history.CommittedDate) <= staleContentDate)) - { - break; - } - numberPRs++; - if (history.ChangedFilesIfAvailable < 500) - numberChanges += Math.Max(history.Deletions, history.Additions); - } - if (numberChanges > 0) - { - Console.Write(erase); - falseStaleArticles++; - Console.WriteLine($"{numberPRs,3} {numberChanges,7} {commitDate:MM-dd-yyyy} {msDate:MM-dd-yyyy} {path}"); - Console.Write($"{totalArticles,7} {progressMarkers[totalArticles % progressMarkers.Length]}"); - } - } - Console.WriteLine($"{erase} {totalArticles} checked. Fresh: {freshArticles}. Truly stale: {trulyStateArticles}. 
Updated but not fresh: {falseStaleArticles}"); + await AuditMSDateAccuracy(options, docFxRepo, articleFiles); break; } default: @@ -462,52 +387,90 @@ static async Task RunOptions(Options options) Console.WriteLine($"Elapsed time: {stopwatch.Elapsed.ToHumanReadableString()}"); } - private static DateOnly GetCommitDate(string folder, string path) + private static async Task AuditMSDateAccuracy(Options options, DocFxRepo docFxRepo, List articleFiles) { - // Create a new process - Process process = new Process(); - process.StartInfo.FileName = "git"; - process.StartInfo.Arguments = $"""log -1 --format="%cd" --date=short {path}"""; - process.StartInfo.RedirectStandardOutput = true; - process.StartInfo.UseShellExecute = false; - process.StartInfo.CreateNoWindow = true; - process.StartInfo.WorkingDirectory = folder; + var config = new ConfigurationBuilder() + .AddEnvironmentVariables() + .Build(); + string key = config["GitHubKey"]!; + IGitHubClient client = IGitHubClient.CreateGitHubClient(key); + + int totalArticles = 0; + int freshArticles = 0; + int trulyStateArticles = 0; + int falseStaleArticles = 0; + // Make this configurable: + DateOnly staleContentDate = DateOnly.FromDateTime(DateTime.Now.AddYears(-1)); + + var linkedArticles = from article in articleFiles + where (string.Compare(article.Name, "toc.md", true) != 0) && + (string.Compare(article.Name, "toc.yml", true) != 0) && + docFxRepo.AllTocFiles.Any(tocFile => IsFileLinkedFromTocFile(article, tocFile)) + select article; + string[] progressMarkers = ["| -", "/ \\", "- |", "\\ /"]; + const string removeProgressMarkers = "\b\b\b\b\b\b\b\b\b\b\b"; + + Console.WriteLine($"PRs Changes Last Commit ms.date Path"); + Console.Write($"{totalArticles,7} {progressMarkers[totalArticles % progressMarkers.Length]}"); + foreach (var article in linkedArticles) + { + totalArticles++; + Console.Write($"{removeProgressMarkers}{totalArticles,7} {progressMarkers[totalArticles % progressMarkers.Length]}"); + // First, don't do more 
work on fresh artricles. This is the + // least expensive (in time) test to look for. + DateOnly? msDate = await HelperMethods.GetmsDate(article.FullName); + if (msDate > staleContentDate) + { + freshArticles++; + continue; + } - // Start the process - process.Start(); + // Next, use git history to get the last commit. This starts a process, + // so it's quite a bit more expensive than the msDate check. + DateOnly? commitDate = await HelperMethods.GetCommitDate(options.DocFxDirectory, article.FullName); + if (commitDate < staleContentDate) + { + trulyStateArticles++; + continue; + } - // Read the output - string output = process.StandardOutput.ReadToEnd(); + // Give a week from msDate to allow for PR edits before merging. + // Without this buffer of time, the checks below often include + // the PR where the date was updated. That creates in a lot of + // false positives. + DateOnly msDateMergeDate = DateOnly.FromDateTime(new DateTime(msDate.Value, default).AddDays(7)); - // Wait for the process to exit - process.WaitForExit(); - return DateOnly.Parse(output); - } + var query = new EnumerationQuery(client); - private static async Task GetmsDate(string filePath) - { - DateOnly? msDate = default; - await foreach (var line in File.ReadLinesAsync(filePath)) - { - if (line.Contains("ms.date")) + // Even on windows, the paths need to be unix-style for the GitHub API, + // and the opening slash must be removed. + var path = article.FullName.Replace(options.DocFxDirectory, "").Replace('\\', '/').Remove(0, 1); + + var variables = new FileHistoryVariables("dotnet", "docs", path); + int numberChanges = 0; + int numberPRs = 0; + await foreach (var history in query.PerformQuery(variables)) { - string[] parts = line.Split(":"); - if (parts.Length > 1) + if ((DateOnly.FromDateTime(history.CommittedDate) <= msDateMergeDate) || + (DateOnly.FromDateTime(history.CommittedDate) <= staleContentDate)) { - string date = parts[1].Trim().Replace("\"", ""); // yeah, remove quotes. 
- if (DateOnly.TryParse(date, out DateOnly parsedDate)) - { - msDate = parsedDate; - break; - } - else - { - Console.Error.WriteLine($"Invalid date format in {filePath}: {date}"); - } + break; + } + if (history.ChangedFilesIfAvailable < 100) // not a bulk PR + { + numberPRs++; + numberChanges += Math.Max(history.Deletions, history.Additions); } } + if (numberChanges > 0) + { + Console.Write(removeProgressMarkers); + falseStaleArticles++; + Console.WriteLine($"{numberPRs,3} {numberChanges,7} {commitDate:MM-dd-yyyy} {msDate:MM-dd-yyyy} {path}"); + Console.Write($"{totalArticles,7} {progressMarkers[totalArticles % progressMarkers.Length]}"); + } } - return msDate; + Console.WriteLine($"{removeProgressMarkers} {totalArticles} checked. Fresh: {freshArticles}. Truly stale: {trulyStateArticles}. Updated but not fresh: {falseStaleArticles}"); } #region Replace site-relative links @@ -1824,5 +1787,53 @@ public static Dictionary>> FilterMedia return dir; } + + internal static async Task GetmsDate(string filePath) + { + DateOnly? msDate = default; + await foreach (var line in File.ReadLinesAsync(filePath)) + { + if (line.Contains("ms.date")) + { + string[] parts = line.Split(":"); + if (parts.Length > 1) + { + string date = parts[1].Trim().Replace("\"", ""); // yeah, remove quotes. 
+ if (DateOnly.TryParse(date, out DateOnly parsedDate)) + { + msDate = parsedDate; + break; + } + else + { + Console.Error.WriteLine($"Invalid date format in {filePath}: {date}"); + } + } + } + } + return msDate; + } + + internal static async Task GetCommitDate(string folder, string path) + { + // Create a new process + Process process = new Process(); + process.StartInfo.FileName = "git"; + process.StartInfo.Arguments = $"""log -1 --format="%cd" --date=short {path}"""; + process.StartInfo.RedirectStandardOutput = true; + process.StartInfo.UseShellExecute = false; + process.StartInfo.CreateNoWindow = true; + process.StartInfo.WorkingDirectory = folder; + + // Start the process + process.Start(); + + // Read the output + string output = await process.StandardOutput.ReadToEndAsync(); + + // Wait for the process to exit + await process.WaitForExitAsync(); + return DateOnly.Parse(output); + } } #endregion diff --git a/cleanrepo/README.md b/cleanrepo/README.md index 9ba4eb6..2f9625b 100644 --- a/cleanrepo/README.md +++ b/cleanrepo/README.md @@ -11,6 +11,7 @@ This command-line tool helps you clean up a DocFx-based content repo. It can: - Remove daisy chains (or hops) within the redirection files for the docset. - Replace site-relative links with file-relative links (includes image links). - Filter image list based on strings found in images. +- Compare `ms.date` metadata and recent commit data. ## Usage @@ -31,6 +32,7 @@ The available functions are described in the following table. | ReplaceWithRelativeLinks | Replace site-relative links with file-relative links. | | CatalogImagesWithText | Map images to the markdown/YAML files that reference them, with all text found in images. The output file is prefixed with `OcrImageFiles-`. | | FilterImagesForText | Filter images for text. The output file is prefixed with `FilteredOcrImageFiles-`. | +| AuditMSDate | Compare `ms.date` metadata to most recent commits. This can take a long time on a full repo. 
It also requires a [GitHub PAT](https://github.com/settings/tokens) with read privileges for the repository you want to check. Store this PAT in an environment variable named `GitHubKey`. ## Image to text examples From 8ac391106b131eeb627ac056ace56188e51c2779 Mon Sep 17 00:00:00 2001 From: Bill Wagner Date: Thu, 30 Jan 2025 16:51:14 -0500 Subject: [PATCH 5/7] note a couple build warnings. --- cleanrepo/Program.cs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cleanrepo/Program.cs b/cleanrepo/Program.cs index fb460f8..576d86e 100644 --- a/cleanrepo/Program.cs +++ b/cleanrepo/Program.cs @@ -389,6 +389,8 @@ static async Task RunOptions(Options options) private static async Task AuditMSDateAccuracy(Options options, DocFxRepo docFxRepo, List articleFiles) { + if (options.DocFxDirectory is null) + return; var config = new ConfigurationBuilder() .AddEnvironmentVariables() .Build(); @@ -399,7 +401,7 @@ private static async Task AuditMSDateAccuracy(Options options, DocFxRepo docFxRe int freshArticles = 0; int trulyStateArticles = 0; int falseStaleArticles = 0; - // Make this configurable: + // This could be configurable in time (or now, even): DateOnly staleContentDate = DateOnly.FromDateTime(DateTime.Now.AddYears(-1)); var linkedArticles = from article in articleFiles From e10af11aa12867afc306d673c7dbece495c4ef60 Mon Sep 17 00:00:00 2001 From: Bill Wagner Date: Fri, 31 Jan 2025 08:59:54 -0500 Subject: [PATCH 6/7] Apply suggestions from code review Co-authored-by: Genevieve Warren <24882762+gewarren@users.noreply.github.com> --- cleanrepo/Program.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cleanrepo/Program.cs b/cleanrepo/Program.cs index 576d86e..606d35f 100644 --- a/cleanrepo/Program.cs +++ b/cleanrepo/Program.cs @@ -438,7 +438,7 @@ private static async Task AuditMSDateAccuracy(Options options, DocFxRepo docFxRe // Give a week from msDate to allow for PR edits before merging. 
// Without this buffer of time, the checks below often include - // the PR where the date was updated. That creates in a lot of + // the PR where the date was updated. That results in a lot of // false positives. DateOnly msDateMergeDate = DateOnly.FromDateTime(new DateTime(msDate.Value, default).AddDays(7)); From 2302f30c713118df6b188c763bf9e82a96c2c505 Mon Sep 17 00:00:00 2001 From: Bill Wagner Date: Fri, 31 Jan 2025 09:25:26 -0500 Subject: [PATCH 7/7] respond to feedback. --- cleanrepo/Program.cs | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/cleanrepo/Program.cs b/cleanrepo/Program.cs index 606d35f..9933a8d 100644 --- a/cleanrepo/Program.cs +++ b/cleanrepo/Program.cs @@ -369,9 +369,6 @@ static async Task RunOptions(Options options) Console.WriteLine($"Total number of files to process: {articleFiles.Count}"); - if (articleFiles is null) - return; - await AuditMSDateAccuracy(options, docFxRepo, articleFiles); break; } @@ -404,23 +401,22 @@ private static async Task AuditMSDateAccuracy(Options options, DocFxRepo docFxRe // This could be configurable in time (or now, even): DateOnly staleContentDate = DateOnly.FromDateTime(DateTime.Now.AddYears(-1)); - var linkedArticles = from article in articleFiles - where (string.Compare(article.Name, "toc.md", true) != 0) && - (string.Compare(article.Name, "toc.yml", true) != 0) && - docFxRepo.AllTocFiles.Any(tocFile => IsFileLinkedFromTocFile(article, tocFile)) - select article; string[] progressMarkers = ["| -", "/ \\", "- |", "\\ /"]; const string removeProgressMarkers = "\b\b\b\b\b\b\b\b\b\b\b"; Console.WriteLine($"PRs Changes Last Commit ms.date Path"); Console.Write($"{totalArticles,7} {progressMarkers[totalArticles % progressMarkers.Length]}"); - foreach (var article in linkedArticles) + foreach (var article in articleFiles) { totalArticles++; Console.Write($"{removeProgressMarkers}{totalArticles,7} {progressMarkers[totalArticles % progressMarkers.Length]}"); // First, don't do more 
work on fresh artricles. This is the // least expensive (in time) test to look for. DateOnly? msDate = await HelperMethods.GetmsDate(article.FullName); + if (msDate is null) + { + continue; + } if (msDate > staleContentDate) { freshArticles++;