Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a command to CleanRepo to provide data on the difference between the ms.date value and latest commit #469

Merged
merged 7 commits into from
Jan 31, 2025
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 87 additions & 0 deletions DotNet.DocsTools/GitHubObjects/FileHistory.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
using DotNetDocs.Tools.GitHubCommunications;

namespace DotNet.DocsTools.GitHubObjects;

/// <summary>
/// The query variables for file history. Each value is copied into the
/// GraphQL variable of the same name (<c>$owner</c>, <c>$repo</c>, <c>$path</c>)
/// when <see cref="FileHistory.GetQueryPacket"/> builds the request.
/// </summary>
/// <param name="owner">The GitHub org for the repository</param>
/// <param name="repo">The repository name</param>
/// <param name="path">The path to the file; passed as the <c>path</c> argument of the commit <c>history</c> field</param>
public readonly record struct FileHistoryVariables(string owner, string repo, string path);

/// <summary>
/// One commit node from the <c>FileHistory</c> GraphQL query: the commit history
/// of a single path on the repository's default branch.
/// </summary>
public class FileHistory : IGitHubQueryResult<FileHistory, FileHistoryVariables>
{
    // Paged query (25 commits per page, continued with $cursor) that walks
    // repository -> defaultBranchRef -> target (Commit) -> history for $path.
    private const string FileHistoryQueryText = """
        query FileHistory($owner: String!, $repo: String!, $path: String!, $cursor: String) {
        repository(owner: $owner, name: $repo) {
        defaultBranchRef {
        target {
        ... on Commit {
        history(
        first: 25
        after: $cursor
        path: $path
        ) {
        nodes {
        committedDate
        changedFilesIfAvailable
        additions
        deletions
        }
        pageInfo {
        hasNextPage
        endCursor
        }
        }
        }
        }
        }
        }
        }
        """;

    /// <summary>The <c>committedDate</c> value of the commit node.</summary>
    public DateTime CommittedDate { get; }

    /// <summary>The <c>changedFilesIfAvailable</c> value of the commit node.</summary>
    public int? ChangedFilesIfAvailable { get; }

    /// <summary>The <c>additions</c> value of the commit node.</summary>
    public int Additions { get; }

    /// <summary>The <c>deletions</c> value of the commit node.</summary>
    public int Deletions { get; }

    /// <summary>
    /// Populates the object from a single commit node of the query response.
    /// </summary>
    /// <param name="node">The JSON element for one entry of <c>history.nodes</c>.</param>
    private FileHistory(JsonElement node)
    {
        CommittedDate = ResponseExtractors.DateTimeProperty(node, "committedDate");
        ChangedFilesIfAvailable = ResponseExtractors.IntProperty(node, "changedFilesIfAvailable");
        Additions = ResponseExtractors.IntProperty(node, "additions");
        Deletions = ResponseExtractors.IntProperty(node, "deletions");
    }

    /// <summary>
    /// Builds the GraphQL request packet for the file history query.
    /// </summary>
    /// <param name="variables">The owner, repository, and file path to query.</param>
    /// <param name="isScalar">Must be <c>false</c>; this type only supports enumeration queries.</param>
    /// <returns>A packet containing the query text and its three variables.</returns>
    /// <exception cref="InvalidOperationException"><paramref name="isScalar"/> is <c>true</c>.</exception>
    public static GraphQLPacket GetQueryPacket(FileHistoryVariables variables, bool isScalar)
    {
        if (isScalar)
        {
            throw new InvalidOperationException("This query is not a scalar query");
        }

        GraphQLPacket packet = new()
        {
            query = FileHistoryQueryText
        };
        packet.variables["owner"] = variables.owner;
        packet.variables["repo"] = variables.repo;
        packet.variables["path"] = variables.path;
        return packet;
    }

    /// <summary>
    /// Gets the property names to follow from the response root down to the
    /// element that holds the <c>nodes</c> and <c>pageInfo</c> members.
    /// </summary>
    /// <param name="isScalar">Must be <c>false</c>; this type only supports enumeration queries.</param>
    /// <returns>The navigation path, outermost property first.</returns>
    /// <exception cref="InvalidOperationException"><paramref name="isScalar"/> is <c>true</c>.</exception>
    public static IEnumerable<string> NavigationToNodes(bool isScalar)
    {
        if (isScalar)
        {
            throw new InvalidOperationException("This query is not a scalar query");
        }

        return ["repository", "defaultBranchRef", "target", "history"];
    }

    /// <summary>
    /// Creates a <see cref="FileHistory"/> from one commit node of the response.
    /// </summary>
    /// <param name="element">The JSON element for the commit node.</param>
    /// <param name="variables">The variables used for the query; not used by this type.</param>
    /// <returns>The constructed history entry.</returns>
    public static FileHistory? FromJsonElement(JsonElement element, FileHistoryVariables variables)
    {
        return new FileHistory(element);
    }
}
6 changes: 6 additions & 0 deletions DotNet.DocsTools/GitHubObjects/ResponseExtractors.cs
Original file line number Diff line number Diff line change
Expand Up @@ -112,4 +112,10 @@ internal static int IntProperty(JsonElement element, string propertyName)
}
throw new ArgumentException($"Property {propertyName} not found in Json element. Did you possibly access the parent node?", nameof(element));
}

/// <summary>
/// Reads a date property that is required to have a value.
/// </summary>
/// <param name="element">The JSON element that contains the property.</param>
/// <param name="propertyName">The name of the property to read.</param>
/// <returns>The value produced by <c>OptionalDateProperty</c> for the same arguments.</returns>
/// <exception cref="ArgumentException"><c>OptionalDateProperty</c> returned <c>null</c>.</exception>
internal static DateTime DateTimeProperty(JsonElement element, string propertyName) =>
    OptionalDateProperty(element, propertyName)
        ?? throw new ArgumentException("Requested property shouldn't be null", nameof(propertyName));
}
3 changes: 3 additions & 0 deletions cleanrepo/CleanRepo.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@
<PackageReference Include="Tesseract" Version="5.2.0" />
<PackageReference Include="Microsoft.Build" Version="17.12.6" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\DotNet.DocsTools\DotNet.DocsTools.csproj" />
</ItemGroup>
<ItemGroup>
<None Update="appSettings.json">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
Expand Down
168 changes: 164 additions & 4 deletions cleanrepo/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
using System.Text.Json;
using System.Text.RegularExpressions;
using CleanRepo.Extensions;
using DotNet.DocsTools.GitHubObjects;
using DotNetDocs.Tools.GitHubCommunications;
using DotNetDocs.Tools.GraphQLQueries;
using Microsoft.Build.Construction;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.Hosting;
Expand All @@ -24,10 +27,11 @@ class Program
"FilterImagesForText",
"ReplaceRedirectTargets",
"ReplaceWithRelativeLinks",
"RemoveRedirectHops"
"RemoveRedirectHops",
"AuditMSDate"
];

static void Main(string[] args)
static async Task Main(string[] args)
{
HostApplicationBuilder builder = Host.CreateApplicationBuilder(args);
builder.Configuration.Sources.Clear();
Expand All @@ -45,10 +49,10 @@ static void Main(string[] args)
builder.Configuration.GetSection(nameof(Options))
.Bind(options);

RunOptions(options);
await RunOptions(options);
}

static void RunOptions(Options options)
static async Task RunOptions(Options options)
{
if (String.IsNullOrEmpty(options.Function))
{
Expand Down Expand Up @@ -351,6 +355,26 @@ static void RunOptions(Options options)
Console.WriteLine("\nFinished removing redirect hops.");
break;
}
// Audit the 'ms.date' property in all markdown files.
case "AuditMSDate":
{
Console.WriteLine($"\nAuditing the 'ms.date' property in all markdown files in '{options.TargetDirectory}'...");

if (docFxRepo.AllTocFiles is null)
BillWagner marked this conversation as resolved.
Show resolved Hide resolved
return;

List<FileInfo> articleFiles = HelperMethods.GetMarkdownFiles(options.TargetDirectory, "snippets", "includes");

articleFiles.AddRange(HelperMethods.GetYAMLFiles(options.TargetDirectory));

Console.WriteLine($"Total number of files to process: {articleFiles.Count}");

if (articleFiles is null)
return;

await AuditMSDateAccuracy(options, docFxRepo, articleFiles);
break;
}
default:
{
Console.WriteLine($"\nUnknown function '{options.Function}'. " +
Expand All @@ -363,6 +387,94 @@ static void RunOptions(Options options)
Console.WriteLine($"Elapsed time: {stopwatch.Elapsed.ToHumanReadableString()}");
}

/// <summary>
/// Reports TOC-linked articles whose <c>ms.date</c> is more than a year old even
/// though the file has received non-bulk commits since that date.
/// </summary>
/// <remarks>
/// Each article is classified with the cheapest check first: the <c>ms.date</c>
/// value in the file, then the last local git commit date, and finally the GitHub
/// commit history for the file. One line is printed per article that has changes
/// after its <c>ms.date</c>, followed by a summary of the counts.
/// </remarks>
/// <param name="options">The run options; <c>DocFxDirectory</c> is the git working directory and the root for relative paths.</param>
/// <param name="docFxRepo">The repo model that supplies the TOC files.</param>
/// <param name="articleFiles">The candidate markdown and YAML files.</param>
private static async Task AuditMSDateAccuracy(Options options, DocFxRepo docFxRepo, List<FileInfo> articleFiles)
{
    if (options.DocFxDirectory is null)
    {
        return;
    }
    string docFxDirectory = options.DocFxDirectory;

    // Read the TOC list once instead of once per article inside the filter below.
    var tocFiles = docFxRepo.AllTocFiles;
    if (tocFiles is null)
    {
        return;
    }

    var config = new ConfigurationBuilder()
        .AddEnvironmentVariables()
        .Build();
    string? key = config["GitHubKey"];
    if (string.IsNullOrWhiteSpace(key))
    {
        // Without a token every GitHub query would fail; stop with a clear message.
        Console.Error.WriteLine("The 'GitHubKey' environment variable must contain a GitHub personal access token to audit ms.date values.");
        return;
    }
    IGitHubClient client = IGitHubClient.CreateGitHubClient(key);

    int totalArticles = 0;
    int freshArticles = 0;
    int trulyStaleArticles = 0;
    int falseStaleArticles = 0;
    int missingDateArticles = 0;
    // This could be configurable in time (or now, even):
    DateOnly staleContentDate = DateOnly.FromDateTime(DateTime.Now.AddYears(-1));

    // Only audit articles that are linked from a TOC; the TOC files themselves are excluded.
    // File names are compared ordinally so the result doesn't depend on the current culture.
    var linkedArticles = articleFiles.Where(article =>
        !string.Equals(article.Name, "toc.md", StringComparison.OrdinalIgnoreCase) &&
        !string.Equals(article.Name, "toc.yml", StringComparison.OrdinalIgnoreCase) &&
        tocFiles.Any(tocFile => IsFileLinkedFromTocFile(article, tocFile)));

    string[] progressMarkers = ["| -", "/ \\", "- |", "\\ /"];
    // 11 backspaces: 7 for the counter, 1 for the space, 3 for the marker.
    const string removeProgressMarkers = "\b\b\b\b\b\b\b\b\b\b\b";

    Console.WriteLine($"PRs Changes Last Commit ms.date Path");
    Console.Write($"{totalArticles,7} {progressMarkers[totalArticles % progressMarkers.Length]}");
    foreach (FileInfo article in linkedArticles)
    {
        totalArticles++;
        Console.Write($"{removeProgressMarkers}{totalArticles,7} {progressMarkers[totalArticles % progressMarkers.Length]}");

        // First, don't do more work on fresh articles. This is the
        // least expensive (in time) test to look for.
        DateOnly? msDate = await HelperMethods.GetmsDate(article.FullName);
        if (msDate is not DateOnly articleDate)
        {
            // No parseable ms.date: there is nothing to compare against.
            missingDateArticles++;
            continue;
        }
        if (articleDate > staleContentDate)
        {
            freshArticles++;
            continue;
        }

        // Next, use git history to get the last commit. This starts a process,
        // so it's quite a bit more expensive than the msDate check.
        DateOnly commitDate = await HelperMethods.GetCommitDate(docFxDirectory, article.FullName);
        if (commitDate < staleContentDate)
        {
            trulyStaleArticles++;
            continue;
        }

        // Give a week from msDate to allow for PR edits before merging.
        // Without this buffer of time, the checks below often include
        // the PR where the date was updated. That creates a lot of
        // false positives.
        DateOnly msDateMergeDate = articleDate.AddDays(7);

        var query = new EnumerationQuery<FileHistory, FileHistoryVariables>(client);

        // Even on Windows, the paths need to be unix-style for the GitHub API,
        // and relative to the repository root without a leading slash.
        string path = Path.GetRelativePath(docFxDirectory, article.FullName).Replace('\\', '/');

        // NOTE(review): the owner and repository are hard-coded; confirm whether
        // other repositories need to be supported.
        var variables = new FileHistoryVariables("dotnet", "docs", path);
        int numberChanges = 0;
        int numberPRs = 0;
        await foreach (var history in query.PerformQuery(variables))
        {
            DateOnly committed = DateOnly.FromDateTime(history.CommittedDate);
            // Stop at the first commit that is not newer than both cut-off dates.
            if ((committed <= msDateMergeDate) || (committed <= staleContentDate))
            {
                break;
            }
            if (history.ChangedFilesIfAvailable < 100) // not a bulk PR
            {
                numberPRs++;
                numberChanges += Math.Max(history.Deletions, history.Additions);
            }
        }
        if (numberChanges > 0)
        {
            Console.Write(removeProgressMarkers);
            falseStaleArticles++;
            Console.WriteLine($"{numberPRs,3} {numberChanges,7} {commitDate:MM-dd-yyyy} {articleDate:MM-dd-yyyy} {path}");
            Console.Write($"{totalArticles,7} {progressMarkers[totalArticles % progressMarkers.Length]}");
        }
    }
    Console.WriteLine($"{removeProgressMarkers} {totalArticles} checked. Fresh: {freshArticles}. Truly stale: {trulyStaleArticles}. Updated but not fresh: {falseStaleArticles}. No ms.date: {missingDateArticles}");
}

#region Replace site-relative links
private static void ReplaceLinks(List<FileInfo> linkingFiles, string urlBasePath, string rootDirectory)
{
Expand Down Expand Up @@ -1677,5 +1789,53 @@ public static Dictionary<string, List<KeyValuePair<string, string>>> FilterMedia

return dir;
}

/// <summary>
/// Finds the first parseable <c>ms.date</c> value in a file.
/// </summary>
/// <remarks>
/// Lines are scanned in order. A line that contains <c>ms.date</c> but whose value
/// can't be parsed is reported on standard error, and scanning continues.
/// </remarks>
/// <param name="filePath">The path of the file to read.</param>
/// <returns>The parsed date, or <c>null</c> when the file has no parseable <c>ms.date</c> value.</returns>
internal static async Task<DateOnly?> GetmsDate(string filePath)
{
    await foreach (string line in File.ReadLinesAsync(filePath))
    {
        if (!line.Contains("ms.date", StringComparison.Ordinal))
        {
            continue;
        }

        string[] parts = line.Split(":");
        if (parts.Length <= 1)
        {
            continue;
        }

        // YAML permits either quote style around the value, so strip both.
        string date = parts[1].Trim().Replace("\"", "").Replace("'", "");

        // ms.date values are written as MM/dd/yyyy; parse with the invariant culture
        // so the result doesn't depend on the regional settings of the machine.
        if (DateOnly.TryParse(
            date,
            System.Globalization.CultureInfo.InvariantCulture,
            System.Globalization.DateTimeStyles.None,
            out DateOnly parsedDate))
        {
            return parsedDate;
        }

        Console.Error.WriteLine($"Invalid date format in {filePath}: {date}");
    }

    return null;
}

/// <summary>
/// Gets the committer date of the most recent commit that touched a file by
/// running <c>git log -1 --format=%cd --date=short</c>.
/// </summary>
/// <param name="folder">The working directory for the git process.</param>
/// <param name="path">The path of the file to query.</param>
/// <returns>The date printed by git.</returns>
/// <exception cref="FormatException">
/// git printed nothing, or printed text that is not a <c>yyyy-MM-dd</c> date.
/// </exception>
internal static async Task<DateOnly> GetCommitDate(string folder, string path)
{
    // Dispose the process so its OS handles are released after every call;
    // this method runs once per audited article.
    using Process process = new();
    process.StartInfo.FileName = "git";

    // ArgumentList escapes each argument individually, so paths that contain
    // spaces are passed intact. "--" marks everything after it as a path.
    foreach (string argument in new[] { "log", "-1", "--format=%cd", "--date=short", "--", path })
    {
        process.StartInfo.ArgumentList.Add(argument);
    }
    process.StartInfo.RedirectStandardOutput = true;
    process.StartInfo.UseShellExecute = false;
    process.StartInfo.CreateNoWindow = true;
    process.StartInfo.WorkingDirectory = folder;

    process.Start();

    // Read all output before waiting for exit.
    string output = await process.StandardOutput.ReadToEndAsync();
    await process.WaitForExitAsync();

    string commitDate = output.Trim();
    if (commitDate.Length == 0)
    {
        throw new FormatException($"git returned no commit date for '{path}' (working directory '{folder}').");
    }

    // --date=short always prints yyyy-MM-dd, independent of the machine's culture.
    return DateOnly.ParseExact(commitDate, "yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture);
}
}
#endregion
2 changes: 2 additions & 0 deletions cleanrepo/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ This command-line tool helps you clean up a DocFx-based content repo. It can:
- Remove daisy chains (or hops) within the redirection files for the docset.
- Replace site-relative links with file-relative links (includes image links).
- Filter image list based on strings found in images.
- Compare `ms.date` metadata and recent commit data.

## Usage

Expand All @@ -31,6 +32,7 @@ The available functions are described in the following table.
| ReplaceWithRelativeLinks | Replace site-relative links with file-relative links. |
| CatalogImagesWithText | Map images to the markdown/YAML files that reference them, with all text found in images. The output file is prefixed with `OcrImageFiles-`. |
| FilterImagesForText | Filter images for text. The output file is prefixed with `FilteredOcrImageFiles-`. |
| AuditMSDate | Compare `ms.date` metadata to most recent commits. This can take a long time on a full repo. It also requires a [GitHub PAT](https://github.com/settings/tokens) with read privileges for the repository you want to check. Store this PAT in an environment variable named `GitHubKey`. |

## Image to text examples

Expand Down
Loading