Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

landing_worker: add and use metadata from Revision rather than HgPatchHelper (bug 1936171) #200

Merged
merged 5 commits into from
Feb 3, 2025
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 10 additions & 19 deletions src/lando/api/legacy/workers/landing_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import subprocess
from contextlib import contextmanager
from datetime import datetime
from io import StringIO
from pathlib import Path
from typing import (
Optional,
Expand All @@ -15,7 +14,6 @@
from django.db import transaction

from lando.api.legacy.commit_message import bug_list_to_commit_string, parse_bugs
from lando.api.legacy.hgexports import HgPatchHelper
from lando.api.legacy.notifications import (
notify_user_of_bug_update_failure,
notify_user_of_landing_failure,
Expand All @@ -30,6 +28,7 @@
from lando.main.scm.abstract_scm import AbstractSCM
from lando.main.scm.exceptions import (
AutoformattingException,
NoDiffStartLine,
PatchConflict,
SCMException,
SCMInternalServerError,
Expand Down Expand Up @@ -262,13 +261,15 @@ def run_job(self, job: LandingJob) -> bool:

# Run through the patches one by one and try to apply them.
for revision in job.revisions.all():
# TODO: Rather than parsing the patch details from the full HG patch
# stored in the job, we should read the revision's metadata (and
# move to only store the diff in the patch_string, rather than an
# export).
# https://bugzilla.mozilla.org/show_bug.cgi?id=1936171
patch_helper = HgPatchHelper(StringIO(revision.patch_string))
if not patch_helper.diff_start_line:

try:
scm.apply_patch(
revision.diff,
revision.commit_message,
revision.author,
revision.timestamp,
)
except NoDiffStartLine:
message = (
"Lando encountered a malformed patch, please try again. "
"If this error persists please file a bug: "
Expand All @@ -282,16 +283,6 @@ def run_job(self, job: LandingJob) -> bool:
self.notify_user_of_landing_failure(job)
return True

date = patch_helper.get_header("Date")
user = patch_helper.get_header("User")

try:
scm.apply_patch(
patch_helper.get_diff(),
patch_helper.get_commit_description(),
user,
date,
)
except PatchConflict as exc:
breakdown = scm.process_merge_conflict(
repo.pull_path, revision.revision_id, str(exc)
Expand Down
3 changes: 1 addition & 2 deletions src/lando/api/tests/test_landings.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,8 @@
add_job_with_revisions,
)
from lando.main.models.revision import Revision
from lando.main.scm import SCM_TYPE_HG
from lando.main.scm import SCM_TYPE_GIT, SCM_TYPE_HG
from lando.main.scm.abstract_scm import AbstractSCM
from lando.main.scm.consts import SCM_TYPE_GIT
from lando.main.scm.git import GitSCM
from lando.main.scm.hg import HgSCM, LostPushRace
from lando.utils import HgPatchHelper
Expand Down
3 changes: 2 additions & 1 deletion src/lando/api/tests/test_transplants.py
Original file line number Diff line number Diff line change
Expand Up @@ -780,6 +780,7 @@ def test_integrated_transplant_records_approvers_peers_and_owners(

worker = LandingWorker(repos=Repo.objects.all(), sleep_seconds=0.01)
assert worker.run_job(job)
assert job.status == LandingJobStatus.LANDED
for revision in job.revisions.all():
if revision.revision_id == 1:
assert revision.data["peers_and_owners"] == [101]
Expand Down Expand Up @@ -1184,7 +1185,7 @@ def test_integrated_transplant_sec_approval_group_is_excluded_from_reviewers_lis
# Check the transplanted patch for our alternate commit message.
transplanted_patch = Revision.get_from_revision_id(revision["id"])
assert transplanted_patch is not None, "Transplanted patch should be retrievable."
assert sec_approval_project["name"] not in transplanted_patch.patch_string
assert sec_approval_project["name"] not in transplanted_patch.patch


def test_warning_wip_commit_message(phabdouble):
Expand Down
122 changes: 108 additions & 14 deletions src/lando/main/models/revision.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,16 @@
from __future__ import annotations

import logging
from typing import Any
import re
from io import StringIO
from typing import Any, Optional

from django.db import models
from django.utils.translation import gettext_lazy

from lando.api.legacy.hgexports import HgPatchHelper
from lando.main.models.base import BaseModel
from lando.main.scm.exceptions import NoDiffStartLine
from lando.utils import build_patch_for_revision

logger = logging.getLogger(__name__)
Expand All @@ -33,20 +37,22 @@ class Revision(BaseModel):
Includes a reference to the related Phabricator revision and diff ID if one exists.
"""

def __str__(self):
return f"Revision {self.revision_id} Diff {self.diff_id}"

# revision_id and diff_id map to Phabricator IDs (integers).
revision_id = models.IntegerField(blank=True, null=True, unique=True)

# diff_id is that of the latest diff on the revision at landing request time. It
# does not track all diffs.
diff_id = models.IntegerField(blank=True, null=True)

# The actual patch.
# The actual patch with Mercurial metadata format.
patch = models.TextField(blank=True, default="")

# Patch metadata, such as author, timestamp, etc...
# Patch metadata, such as
# - author_name
# - author_email
# - commit_message
# - timestamp
# - ...
patch_data = models.JSONField(blank=True, default=dict)

# A general purpose data field to store arbitrary information about this revision.
Expand All @@ -55,6 +61,11 @@ def __str__(self):
# The commit ID generated by the landing worker, before pushing to remote repo.
commit_id = models.CharField(max_length=40, null=True, blank=True)

_patch_helper: Optional[HgPatchHelper] = None

def __str__(self):
return f"Revision {self.revision_id} Diff {self.diff_id}"

def __repr__(self) -> str:
"""Return a human-readable representation of the instance."""
# Add an identifier for the Phabricator revision if it exists.
Expand All @@ -67,13 +78,6 @@ def __repr__(self) -> str:
def patch_bytes(self) -> bytes:
return self.patch.encode("utf-8")

@property
def patch_string(self) -> str:
"""Return the patch as a UTF-8 encoded string."""
# Here for compatiblity, as an alias.
# TODO: remove this in the near future.
return self.patch

@classmethod
def get_from_revision_id(cls, revision_id: int) -> "Revision" | None:
"""Return a Revision object from a given ID."""
Expand All @@ -82,7 +86,14 @@ def get_from_revision_id(cls, revision_id: int) -> "Revision" | None:

@classmethod
def new_from_patch(cls, raw_diff: str, patch_data: dict[str, str]) -> Revision:
"""Construct a new Revision from patch data."""
"""Construct a new Revision from patch data.

`patch_data` is expected to contain the following keys:
- author_name
- author_email
- commit_message
- timestamp (unix timestamp as a string)
"""
rev = Revision()
rev.set_patch(raw_diff, patch_data)
rev.save()
Expand All @@ -104,6 +115,89 @@ def serialize(self) -> dict[str, Any]:
"updated_at": self.updated_at,
}

@property
def author(self):
"""Get the full author string in "Name <Email>" format."""
parts = []
if self.author_name:
parts.append(self.author_name)
if self.author_email:
parts.append(f"<{self.author_email}>")

return " ".join(parts)

@property
def author_name(self) -> Optional[str]:
metadata = self._parse_metadata_from_patch()
return metadata.get("author_name")

@property
def author_email(self) -> Optional[str]:
metadata = self._parse_metadata_from_patch()
return metadata.get("author_email")

@property
def commit_message(self) -> Optional[str]:
metadata = self._parse_metadata_from_patch()
return metadata.get("commit_message")

@property
def timestamp(self) -> Optional[str]:
metadata = self._parse_metadata_from_patch()
if ts := metadata.get("timestamp"):
# Some codepaths (via Phabricator) have the timestamp set as an int.
# We make sure it's always a string.
return str(ts)
return None
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Feels a bit repetitive to call _parse_metadata_from_patch in each method. Maybe all these values should be determined at the same time, whenever self.patch_data is set.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The metadata is only parsed one, and cached, as part of the _parse_metadata_from_patch() method https://github.com/mozilla-conduit/lando/pull/200/files#diff-6f00d8fe4977345e9360a87afcf4861324a3b6ff012d6a6a5716640b4b44c04dR153-R168

But calling it in every method makes sure we call it the first time we ever need any metadata.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The repetitiveness aspect (i.e., calling _parse_metadata_from_patch in every method) was more so what I meant, not so much efficiency. I think changing _parse_metadata_from_patch to a property (e.g., _parsed_metadata_from_patch) and then using it directly in these methods would alleviate this issue.

This would become:

    @property
    def timestamp(self) -> Optional[str]:
        if ts := self._parsed_metadata_from_patch.get("timestamp"):
            # Some codepaths (via Phabricator) have the timestamp set as an int.
            # We make sure it's always a string.
            return str(ts)
        return None

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah! Yeah, that's better.


def _parse_metadata_from_patch(self) -> dict[str, str]:
"""Parse Hg metadata out of the raw patch, and update the patch_data if empty."""
if not self.patch_data:
commit_message = self.patch_helper.get_commit_description()
author_name, author_email = self._parse_author_string(
self.patch_helper.get_header("User")
)
timestamp = self.patch_helper.get_timestamp()

self.patch_data = {"commit_message": commit_message, "timestamp": timestamp}
if author_name:
self.patch_data["author_name"] = author_name
if author_email:
self.patch_data["author_email"] = author_email

return self.patch_data

@staticmethod
def _parse_author_string(author: str) -> tuple[str, str]:
"""Parse a Git author string into author name and email.

The returned tuple will have the empty string "" for unmatched parts.
"""
r = re.compile(
r"^(?P<author_name>.*?)? *<?(?P<author_email>[^ \t\n\r\f\v<]+@[^ \t\n\r\f\v>]+)>?"
)
m = r.match(author)
if not m:
return (author, "")
return m.groups()

@property
def diff(self) -> str:
"""Return the unified diff text without any metadata"""
# The HgPatchHelper currently returns leading newline, which we don't want to
# return here, so we strip it.
return self.patch_helper.get_diff().lstrip()

@property
def patch_helper(self) -> HgPatchHelper:
"""Create and cache an HgPatchHelper to parse the raw patch with Hg metadata."""
if not self._patch_helper:
self._patch_helper = HgPatchHelper(StringIO(self.patch))
if not self._patch_helper.diff_start_line:
raise NoDiffStartLine

return self._patch_helper
Comment on lines +181 to +196
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These two methods are hg-specific, wonder if we should be making them more scm-agnostic?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, the purpose of this change is a stop gap to start hiding the HgPatchHelper.

Historically, we transformed Phab revision into a Hg Patch, which we no longer need to do.

In this first cut, we hide the patch helper from everything else, and let the interface of the Revision be sufficient, and we can change the implementation later on.



class DiffWarningStatus(models.TextChoices):
ACTIVE = "ACTIVE", gettext_lazy("Active")
Expand Down
4 changes: 4 additions & 0 deletions src/lando/main/scm/hg.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,10 @@ def apply_patch(
# patcher since both attempts failed.
raise exc

if re.match("^[0-9]+$", commit_date):
# If the commit_date is a unix timestamp, convert to Hg internal format.
commit_date = f"{commit_date} 0"

self.run_hg(
["commit"]
+ ["--date", commit_date]
Expand Down
72 changes: 72 additions & 0 deletions src/lando/main/tests/test_models.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,26 @@
from datetime import datetime
from unittest.mock import MagicMock, patch

import pytest
from django.conf import settings
from django.core.exceptions import ValidationError

from lando.main.models import Repo
from lando.main.models.revision import Revision
from lando.main.scm import (
SCM_TYPE_GIT,
SCM_TYPE_HG,
)

DIFF_ONLY = """
diff --git a/test.txt b/test.txt
--- a/test.txt
+++ b/test.txt
@@ -1,1 +1,2 @@
TEST
+adding another line
""".lstrip()


@pytest.mark.parametrize(
"git_returncode,hg_returncode,scm_type",
Expand Down Expand Up @@ -84,3 +95,64 @@ def test__models__Repo__system_path_validator(path, expected_exception):
repo.clean_fields()
else:
repo.clean_fields() # Should not raise any exception


@pytest.mark.parametrize(
"author, expected",
[
(
"A. Uthor <[email protected]>",
("A. Uthor", "[email protected]"),
),
(
"[email protected]",
("", "[email protected]"),
),
(
"<[email protected]>",
("", "[email protected]"),
),
(
"A. Uthor",
("A. Uthor", ""),
),
(
"@ Uthor",
("@ Uthor", ""),
),
(
"<@ Uthor>",
("<@ Uthor>", ""),
),
],
)
def test__models__Revision___parse_author_string(author, expected):
assert Revision._parse_author_string(author) == expected


@pytest.mark.django_db()
def test__models__Revision__metadata():
author = "A. Uthor"
email = "[email protected]"
commit_message = """Multiline Commit Message

More lines
"""
timestamp = datetime.now().strftime("%s")

r = Revision.new_from_patch(
raw_diff=DIFF_ONLY,
patch_data={
"author_name": author,
"author_email": email,
"commit_message": commit_message,
"timestamp": timestamp,
},
)

assert r.author_name == author
assert r.author_email == email
assert r.author == f"{author} <{email}>"
assert r.commit_message == commit_message
assert r.timestamp == timestamp
assert r.diff == DIFF_ONLY
Loading