From 0d4ea2c5ebab5175f2a948c39dfd9bd572e6b7e4 Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Thu, 16 Mar 2023 15:52:41 +0100 Subject: [PATCH 01/23] mypy: `pglookout/logutil.py` [BF-1560] --- Makefile | 4 ++-- pglookout/logutil.py | 33 ++++++++++++++++++++++----------- pyproject.toml | 1 - stubs/__init__.pyi | 1 + stubs/systemd/__init__.pyi | 1 + stubs/systemd/daemon.pyi | 19 +++++++++++++++++++ 6 files changed, 45 insertions(+), 14 deletions(-) create mode 100644 stubs/__init__.pyi create mode 100644 stubs/systemd/__init__.pyi create mode 100644 stubs/systemd/daemon.pyi diff --git a/Makefile b/Makefile index bc3696c..e1df801 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ long_ver = $(shell git describe --long 2>/dev/null || echo $(short_ver)-0-unknow generated = pglookout/version.py PYTHON ?= python3 -PYTHON_SOURCE_DIRS = pglookout/ test/ +PYTHON_SOURCE_DIRS = pglookout/ test/ stubs/ all: $(generated) : 'try "make rpm" or "make deb" or "make test"' @@ -17,7 +17,7 @@ unittest: $(generated) $(PYTHON) -m pytest mypy: $(generated) - $(PYTHON) -m mypy + MYPYPATH=stubs $(PYTHON) -m mypy flake8: $(generated) $(PYTHON) -m flake8 $(PYTHON_SOURCE_DIRS) diff --git a/pglookout/logutil.py b/pglookout/logutil.py index c5d861a..f0150ef 100644 --- a/pglookout/logutil.py +++ b/pglookout/logutil.py @@ -5,39 +5,50 @@ Copyright (c) 2015 Ohmu Ltd See LICENSE for details """ +from __future__ import annotations + +from logging.handlers import SysLogHandler +from typing import Final, TYPE_CHECKING import logging -import logging.handlers import os +with_systemd: bool = False try: from systemd import daemon # pylint: disable=no-name-in-module + + with_systemd = True except ImportError: - daemon = None + if not TYPE_CHECKING: + daemon = None -LOG_FORMAT = "%(asctime)s\t%(name)s\t%(threadName)s\t%(levelname)s\t%(message)s" -LOG_FORMAT_SHORT = "%(levelname)s\t%(message)s" -LOG_FORMAT_SYSLOG = "%(name)s %(threadName)s %(levelname)s: %(message)s" +LOG_FORMAT: Final[str] = "%(asctime)s\t%(name)s\t%(threadName)s\t%(levelname)s\t%(message)s" +LOG_FORMAT_SHORT: Final[str] = "%(levelname)s\t%(message)s" +LOG_FORMAT_SYSLOG: Final[str] = "%(name)s %(threadName)s %(levelname)s: %(message)s" -def set_syslog_handler(address, facility, logger): - syslog_handler = logging.handlers.SysLogHandler(address=address, facility=facility) +def set_syslog_handler(address: str, facility: str | int, logger: logging.Logger) -> SysLogHandler: + if isinstance(facility, str): + facility_id: int = SysLogHandler.facility_names.get(facility, SysLogHandler.LOG_LOCAL2) + else: + facility_id = facility + syslog_handler = SysLogHandler(address=address, facility=facility_id) logger.addHandler(syslog_handler) formatter = logging.Formatter(LOG_FORMAT_SYSLOG) syslog_handler.setFormatter(formatter) return syslog_handler -def configure_logging(level=logging.DEBUG, short_log=False): +def configure_logging(level: int = logging.DEBUG, short_log: bool = False) -> None: # Are we running under systemd? 
if os.getenv("NOTIFY_SOCKET"): logging.basicConfig(level=level, format=LOG_FORMAT_SYSLOG) - if not daemon: + if not with_systemd: print("WARNING: Running under systemd but python-systemd not available, systemd won't see our notifications") else: logging.basicConfig(level=level, format=LOG_FORMAT_SHORT if short_log else LOG_FORMAT) -def notify_systemd(status): - if daemon: +def notify_systemd(status: str) -> None: + if with_systemd: daemon.notify(status) diff --git a/pyproject.toml b/pyproject.toml index 16ce87c..7b1701f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,6 @@ exclude = [ 'pglookout/cluster_monitor.py', 'pglookout/common.py', 'pglookout/current_master.py', - 'pglookout/logutil.py', 'pglookout/pglookout.py', 'pglookout/pgutil.py', 'pglookout/statsd.py', diff --git a/stubs/__init__.pyi b/stubs/__init__.pyi new file mode 100644 index 0000000..3bf9baf --- /dev/null +++ b/stubs/__init__.pyi @@ -0,0 +1 @@ +# Copyright (c) 2023 Aiven, Helsinki, Finland. https://aiven.io/ diff --git a/stubs/systemd/__init__.pyi b/stubs/systemd/__init__.pyi new file mode 100644 index 0000000..3bf9baf --- /dev/null +++ b/stubs/systemd/__init__.pyi @@ -0,0 +1 @@ +# Copyright (c) 2023 Aiven, Helsinki, Finland. https://aiven.io/ diff --git a/stubs/systemd/daemon.pyi b/stubs/systemd/daemon.pyi new file mode 100644 index 0000000..4ddc10b --- /dev/null +++ b/stubs/systemd/daemon.pyi @@ -0,0 +1,19 @@ +# Copyright (c) 2023 Aiven, Helsinki, Finland. https://aiven.io/ + +from __future__ import annotations + +from typing import AnyStr, IO + +def notify( + status: str, + unset_environment: bool = False, + pid: int = 0, + fds: IO[AnyStr] | None = None, +) -> bool: + """ + notify(status, unset_environment=False, pid=0, fds=None) -> bool + + Send a message to the init system about a status change. + Wraps sd_notify(3). + """ + ... From 238eb76eacff197d704f197b5bd02a3c8027465d Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Thu, 16 Mar 2023 17:30:50 +0100 Subject: [PATCH 02/23] mypy: `pglookout/pgutil.py` [BF-1560] --- pglookout/pgutil.py | 105 +++++++++++++++++++++++++++++++++++++------- pyproject.toml | 2 - test/test_pgutil.py | 8 ++-- 3 files changed, 94 insertions(+), 21 deletions(-) diff --git a/pglookout/pgutil.py b/pglookout/pgutil.py index 3b8562b..7d5f692 100644 --- a/pglookout/pgutil.py +++ b/pglookout/pgutil.py @@ -5,16 +5,80 @@ Copyright (c) 2015 Ohmu Ltd See LICENSE for details """ +from __future__ import annotations + +from typing import cast, Literal, TypedDict from urllib.parse import parse_qs, urlparse # pylint: disable=no-name-in-module, import-error import psycopg2.extensions -def create_connection_string(connection_info): - return psycopg2.extensions.make_dsn(**connection_info) +class DsnDictBase(TypedDict, total=False): + user: str + password: str + host: str + port: str | int + + +class DsnDict(DsnDictBase, total=False): + dbname: str + + +class DsnDictDeprecated(DsnDictBase, total=False): + database: str + + +class ConnectionParameterKeywords(TypedDict, total=False): + """Parameter Keywords for Connection. 
+ See: + https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-PARAMKEYWORDS + """ -def mask_connection_info(info): + host: str + hostaddr: str + port: str + dbname: str + user: str + password: str + passfile: str + channel_binding: Literal["require", "prefer", "disable"] + connect_timeout: str + client_encoding: str + options: str + application_name: str + fallback_application_name: str + keepalives: Literal["0", "1"] + keepalives_idle: str + keepalives_interval: str + keepalives_count: str + tcp_user_timeout: str + replication: Literal["true", "on", "yes", "1", "database", "false", "off", "no", "0"] + gssencmode: Literal["disable", "prefer", "require"] + sslmode: Literal["disable", "allow", "prefer", "require", "verify-ca", "verify-full"] + requiressl: Literal["0", "1"] + sslcompression: Literal["0", "1"] + sslcert: str + sslkey: str + sslpassword: str + sslrootcert: str + sslcrl: str + sslcrldir: str + sslsni: Literal["0", "1"] + requirepeer: str + ssl_min_protocol_version: Literal["TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3"] + ssl_max_protocol_version: Literal["TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3"] + krbsrvname: str + gsslib: str + service: str + target_session_attrs: Literal["any", "read-write", "read-only", "primary", "standby", "prefer-standby"] + + +def create_connection_string(connection_info: DsnDict | DsnDictDeprecated | ConnectionParameterKeywords) -> str: + return str(psycopg2.extensions.make_dsn(**connection_info)) + + +def mask_connection_info(info: str) -> str: masked_info = get_connection_info(info) password = masked_info.pop("password", None) connection_string = create_connection_string(masked_info) @@ -22,24 +86,29 @@ def mask_connection_info(info): return f"{connection_string}; {message}" -def get_connection_info_from_config_line(line): +def get_connection_info_from_config_line(line: str) -> ConnectionParameterKeywords: _, value = line.split("=", 1) value = value.strip()[1:-1].replace("''", "'") return get_connection_info(value) -def get_connection_info(info): - """turn a connection info object into a dict or return it if it was a - dict already. supports both the traditional libpq format and the new - url format""" +def get_connection_info( + info: str | DsnDict | DsnDictDeprecated | ConnectionParameterKeywords, +) -> ConnectionParameterKeywords: + """Get a normalized connection info dict from a connection string or a dict. + + Supports both the traditional libpq format and the new url format. + """ if isinstance(info, dict): - return info.copy() + # Potentially, we might clean deprecated DSN dicts: `database` -> `dbname`. + # Also, psycopg2 will validate the keys and values. + return parse_connection_string_libpq(create_connection_string(info)) if info.startswith("postgres://") or info.startswith("postgresql://"): return parse_connection_string_url(info) return parse_connection_string_libpq(info) -def parse_connection_string_url(url): +def parse_connection_string_url(url: str) -> ConnectionParameterKeywords: # drop scheme from the url as some versions of urlparse don't handle # query and path properly for urls with a non-http scheme schemeless_url = url.split(":", 1)[1] @@ -57,12 +126,15 @@ def parse_connection_string_url(url): fields["dbname"] = p.path[1:] for k, v in parse_qs(p.query).items(): fields[k] = v[-1] - return fields + return cast(ConnectionParameterKeywords, fields) + +def parse_connection_string_libpq(connection_string: str) -> ConnectionParameterKeywords: + """Parse a postgresql connection string. 
-def parse_connection_string_libpq(connection_string): - """parse a postgresql connection string as defined in - http://www.postgresql.org/docs/current/static/libpq-connect.html#LIBPQ-CONNSTRING""" + See: + http://www.postgresql.org/docs/current/static/libpq-connect.html#LIBPQ-CONNSTRING + """ fields = {} while True: connection_string = connection_string.strip() @@ -92,5 +164,8 @@ def parse_connection_string_libpq(connection_string): value, connection_string = res else: value, connection_string = rem, "" + # This one is case-insensitive. To continue benefiting from mypy, we make it lowercase. + if key == "replication": + value = value.lower() fields[key] = value - return fields + return cast(ConnectionParameterKeywords, fields) diff --git a/pyproject.toml b/pyproject.toml index 7b1701f..b3197ac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,6 @@ exclude = [ 'pglookout/common.py', 'pglookout/current_master.py', 'pglookout/pglookout.py', - 'pglookout/pgutil.py', 'pglookout/statsd.py', 'pglookout/version.py', 'pglookout/webserver.py', @@ -44,7 +43,6 @@ exclude = [ 'test/test_cluster_monitor.py', 'test/test_common.py', 'test/test_lookout.py', - 'test/test_pgutil.py', 'test/test_webserver.py', # Other. 'setup.py', diff --git a/test/test_pgutil.py b/test/test_pgutil.py index d70e6f5..7dc969f 100644 --- a/test/test_pgutil.py +++ b/test/test_pgutil.py @@ -6,14 +6,14 @@ See LICENSE for details """ -from pglookout.pgutil import create_connection_string, get_connection_info, mask_connection_info +from pglookout.pgutil import ConnectionParameterKeywords, create_connection_string, get_connection_info, mask_connection_info from pytest import raises -def test_connection_info(): +def test_connection_info() -> None: url = "postgres://hannu:secret@dbhost.local:5555/abc?replication=true&sslmode=foobar&sslmode=require" cs = "host=dbhost.local user='hannu' dbname='abc'\nreplication=true password=secret sslmode=require port=5555" - ci = { + ci: ConnectionParameterKeywords = { "host": "dbhost.local", "port": "5555", "user": "hannu", @@ -39,7 +39,7 @@ def test_connection_info(): get_connection_info("foo=bar bar='x") -def test_mask_connection_info(): +def test_mask_connection_info() -> None: url = "postgres://michael:secret@dbhost.local:5555/abc?replication=true&sslmode=foobar&sslmode=require" cs = "host=dbhost.local user='michael' dbname='abc'\nreplication=true password=secret sslmode=require port=5555" ci = get_connection_info(cs) From 3db5684adadc3a06e9f56e5e8a089a741cdf566a Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Fri, 17 Mar 2023 10:10:37 +0100 Subject: [PATCH 03/23] mypy: `pglookout/current-master.py` [BF-1560] Also in this commit: - removed `python2` leftovers (from pglookout 2.0.0 (2018-10-12)) --- debian/rules | 2 +- pglookout/current_master.py | 4 ++-- pyproject.toml | 1 - 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/debian/rules b/debian/rules index 5184822..c33a948 100755 --- a/debian/rules +++ b/debian/rules @@ -3,4 +3,4 @@ export PYBUILD_NAME=pglookout %: - dh $@ --with python2 --with systemd --buildsystem=pybuild + dh $@ --with python3 --with systemd --buildsystem=pybuild diff --git a/pglookout/current_master.py b/pglookout/current_master.py index 0824feb..8f7154f 100644 --- a/pglookout/current_master.py +++ b/pglookout/current_master.py @@ -8,7 +8,7 @@ See the file `LICENSE` for details. """ -from __future__ import print_function +from __future__ import annotations from . 
import version @@ -19,7 +19,7 @@ import time -def main(args=None): +def main(args: list[str] | None = None) -> int: if args is None: args = sys.argv[1:] diff --git a/pyproject.toml b/pyproject.toml index b3197ac..5fb6e35 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,6 @@ exclude = [ 'pglookout/__main__.py', 'pglookout/cluster_monitor.py', 'pglookout/common.py', - 'pglookout/current_master.py', 'pglookout/pglookout.py', 'pglookout/statsd.py', 'pglookout/version.py', From 2ce13ab9002303040ea44cbc4f11fa6bb65722bf Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Fri, 17 Mar 2023 15:06:01 +0100 Subject: [PATCH 04/23] mypy: `pglookout/statsd.py` [BF-1560] --- pglookout/statsd.py | 63 ++++++++++++++++++++++++++++++++++++++------- pyproject.toml | 1 - 2 files changed, 53 insertions(+), 11 deletions(-) diff --git a/pglookout/statsd.py b/pglookout/statsd.py index d1490f2..4e1bee6 100644 --- a/pglookout/statsd.py +++ b/pglookout/statsd.py @@ -9,28 +9,65 @@ https://github.com/influxdata/telegraf/tree/master/plugins/inputs/statsd """ +from __future__ import annotations + +from typing import Literal import logging import socket +StatsdMetricType = Literal[ + b"g", # gauge + b"c", # counter + b"s", # set + b"ms", # timing + b"h", # histogram + b"d", # distribution +] + class StatsClient: - def __init__(self, host="127.0.0.1", port=8125, tags=None): - self.log = logging.getLogger("StatsClient") - self._dest_addr = (host, port) - self._socket = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) - self._tags = tags or {} + def __init__( + self, + host: str | None = "127.0.0.1", + port: int = 8125, + tags: dict[str, str] | None = None, + ) -> None: + self.log: logging.Logger = logging.getLogger("StatsClient") + self._dest_addr: tuple[str | None, int] = (host, port) + self._socket: socket.socket = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + self._tags: dict[str, str] = tags or {} - def gauge(self, metric, value, tags=None): + def gauge( + self, + metric: str, + value: int | float | str, + tags: dict[str, str] | None = None, + ) -> None: self._send(metric, b"g", value, tags) - def increase(self, metric, inc_value=1, tags=None): + def increase( + self, + metric: str, + inc_value: int | float = 1, + tags: dict[str, str] | None = None, + ) -> None: self._send(metric, b"c", inc_value, tags) - def timing(self, metric, value, tags=None): + def timing( + self, + metric: str, + value: int | float, + tags: dict[str, str] | None = None, + ) -> None: self._send(metric, b"ms", value, tags) - def unexpected_exception(self, ex, where, tags=None): + def unexpected_exception( + self, + ex: Exception, + where: str, + tags: dict[str, str] | None = None, + ) -> None: all_tags = { "exception": ex.__class__.__name__, "where": where, @@ -38,7 +75,13 @@ def unexpected_exception(self, ex, where, tags=None): all_tags.update(tags or {}) self.increase("exception", tags=all_tags) - def _send(self, metric, metric_type, value, tags): + def _send( + self, + metric: str, + metric_type: StatsdMetricType, + value: int | float | str, + tags: dict[str, str] | None, + ) -> None: if None in self._dest_addr: # stats sending is disabled return diff --git a/pyproject.toml b/pyproject.toml index 5fb6e35..829cb98 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,6 @@ exclude = [ 'pglookout/cluster_monitor.py', 'pglookout/common.py', 'pglookout/pglookout.py', - 'pglookout/statsd.py', 'pglookout/version.py', 'pglookout/webserver.py', # Tests. 
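
For reference, the client typed above is fire-and-forget UDP, so it can be exercised without a running statsd/telegraf agent. A minimal sketch of the annotated API as it appears in the diff (metric names and tag values here are illustrative, not taken from the codebase):

    from pglookout.statsd import StatsClient

    # Defaults match the annotated signature: host 127.0.0.1, port 8125.
    stats = StatsClient(tags={"app": "pglookout"})

    stats.gauge("replication_time_lag", 0.25)  # value: int | float | str
    stats.increase("failover_decisions")       # inc_value defaults to 1
    stats.timing("monitor_poll", 42, tags={"node": "test1db"})

    try:
        raise ValueError("boom")
    except ValueError as ex:
        # Sends an "exception" counter tagged with the class name and location.
        stats.unexpected_exception(ex, where="main_loop")
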
From 25617d1442ecd8fc524980661191fa06ff6808e8 Mon Sep 17 00:00:00 2001
From: Samuel Giffard
Date: Thu, 16 Mar 2023 15:14:19 +0100
Subject: [PATCH 05/23] mypy: `pglookout/common.py` [BF-1560]

Also in this commit:

- More unit tests;
- Removed redundant `()` in regex.
---
 pglookout/common.py | 37 ++++++++++++++++---------
 pyproject.toml      |  2 --
 test/test_common.py | 66 +++++++++++++++++++++++++++++++++++++--------
 3 files changed, 79 insertions(+), 26 deletions(-)

diff --git a/pglookout/common.py b/pglookout/common.py
index 4d1f032..6975650 100644
--- a/pglookout/common.py
+++ b/pglookout/common.py
@@ -4,40 +4,51 @@ Copyright (c) 2015 Ohmu Ltd
 See LICENSE for details
 """
-import datetime
+from __future__ import annotations
+
+from datetime import datetime, timedelta
+from typing import Final
+
 import re


-def convert_xlog_location_to_offset(wal_location):
+def convert_xlog_location_to_offset(wal_location: str) -> int:
     log_id, offset = wal_location.split("/")
     return int(log_id, 16) << 32 | int(offset, 16)


-ISO_EXT_RE = re.compile(
+ISO_EXT_RE: Final[re.Pattern[str]] = re.compile(
     r"(?P<year>\d{4})-(?P<month>\d\d)-(?P<day>\d\d)(T(?P<hour>\d\d):(?P<minute>\d\d)"
     r"(:(?P<second>\d\d)(.(?P<microsecond>\d{6}))?)?Z)?$"
 )
-ISO_BASIC_RE = re.compile(
+ISO_BASIC_RE: Final[re.Pattern[str]] = re.compile(
     r"(?P<year>\d{4})(?P<month>\d\d)(?P<day>\d\d)(T(?P<hour>\d\d)(?P<minute>\d\d)"
     r"((?P<second>\d\d)((?P<microsecond>\d{6}))?)?Z)?$"
 )
+ISO_GROUP_NAMES: Final[tuple[str, ...]] = (
+    "year",
+    "month",
+    "day",
+    "hour",
+    "minute",
+    "second",
+    "microsecond",
+)


-def parse_iso_datetime(value):
+def parse_iso_datetime(value: str) -> datetime:
     match = ISO_EXT_RE.match(value)
     if not match:
         match = ISO_BASIC_RE.match(value)
         if not match:
             raise ValueError(f"Invalid ISO timestamp {value!r}")
-    parts = dict(
-        (key, int(match.group(key) or "0")) for key in ("year", "month", "day", "hour", "minute", "second", "microsecond")
-    )
-    return datetime.datetime(tzinfo=None, **parts)
+    parts = {key: int(match.group(key) or "0") for key in ISO_GROUP_NAMES}
+    return datetime(tzinfo=None, **parts)


-def get_iso_timestamp(fetch_time=None):
+def get_iso_timestamp(fetch_time: datetime | None = None) -> str:
     if not fetch_time:
-        fetch_time = datetime.datetime.utcnow()
-    elif fetch_time.tzinfo:
-        fetch_time = fetch_time.replace(tzinfo=None) - datetime.timedelta(seconds=fetch_time.utcoffset().seconds)
+        fetch_time = datetime.utcnow()
+    elif (offset := fetch_time.utcoffset()) is not None:
+        fetch_time = fetch_time.replace(tzinfo=None) - timedelta(seconds=offset.seconds)
     return fetch_time.isoformat() + "Z"
diff --git a/pyproject.toml b/pyproject.toml
index 829cb98..7b59a5b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,14 +32,12 @@ exclude = [
     # Implementation.
     'pglookout/__main__.py',
     'pglookout/cluster_monitor.py',
-    'pglookout/common.py',
     'pglookout/pglookout.py',
     'pglookout/version.py',
     'pglookout/webserver.py',
     # Tests.
     'test/conftest.py',
     'test/test_cluster_monitor.py',
-    'test/test_common.py',
     'test/test_lookout.py',
     'test/test_webserver.py',
     # Other.
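
As context for the test additions below, a short sketch of the behaviour these annotations pin down (the timestamps and WAL location are illustrative):

    from datetime import datetime, timedelta, timezone

    from pglookout.common import convert_xlog_location_to_offset, get_iso_timestamp, parse_iso_datetime

    # Aware timestamps are normalized to naive UTC before serialization.
    aware = datetime(2021, 1, 1, 12, 0, 0, tzinfo=timezone(timedelta(hours=2)))
    assert get_iso_timestamp(aware) == "2021-01-01T10:00:00Z"

    # The extended and basic ISO 8601 forms parse to the same datetime.
    assert parse_iso_datetime("20210101T1000Z") == parse_iso_datetime("2021-01-01T10:00Z")

    # A WAL location "<hex log id>/<hex offset>" packs into a single 64-bit offset.
    assert convert_xlog_location_to_offset("F/AAAAAAAA") == (0xF << 32) | 0xAAAAAAAA
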
diff --git a/test/test_common.py b/test/test_common.py index 8139d6d..6df4b55 100644 --- a/test/test_common.py +++ b/test/test_common.py @@ -4,33 +4,77 @@ Copyright (c) 2015 Ohmu Ltd See LICENSE for details """ - +from datetime import datetime from pglookout.common import convert_xlog_location_to_offset, get_iso_timestamp, ISO_EXT_RE, parse_iso_datetime -from pytest import raises -import datetime +import pytest -def test_convert_xlog_location_to_offset(): +def test_convert_xlog_location_to_offset() -> None: assert convert_xlog_location_to_offset("1/00000000") == 1 << 32 assert convert_xlog_location_to_offset("F/AAAAAAAA") == (0xF << 32) | 0xAAAAAAAA - with raises(ValueError): + with pytest.raises(ValueError): convert_xlog_location_to_offset("x") - with raises(ValueError): + with pytest.raises(ValueError): convert_xlog_location_to_offset("x/y") -def test_parse_iso_datetime(): - date = datetime.datetime.utcnow() +def test_parse_iso_datetime() -> None: + date = datetime.utcnow() date.replace(microsecond=0) assert date == parse_iso_datetime(date.isoformat() + "Z") - with raises(ValueError): + with pytest.raises(ValueError): parse_iso_datetime("foobar") -def test_get_iso_timestamp(): +def test_get_iso_timestamp() -> None: v = get_iso_timestamp() assert ISO_EXT_RE.match(v) - ts = datetime.datetime.now() + ts = datetime.now() v = get_iso_timestamp(ts) assert parse_iso_datetime(v) == ts + + +@pytest.mark.parametrize( + "timestamp", + [ + datetime(2021, 1, 1, 23, 42, 11, 123456), + datetime(2021, 1, 1, 23, 42, 11), + datetime(2021, 1, 1, 23, 42), + datetime(2021, 1, 1, 23), + datetime(2021, 1, 1), + ], +) +def test_roundtrip(timestamp: datetime) -> None: + ts2 = parse_iso_datetime(get_iso_timestamp(timestamp)) + + assert ts2 == timestamp + + +@pytest.mark.parametrize( + ("value", "normalized_value"), + # fmt: off + [ + # Extended format + ("2021-01-01T00:00:00.000000Z", "2021-01-01T00:00:00Z"), # noqa: E241 + ("2021-01-01T23:42:11.123456Z", "2021-01-01T23:42:11.123456Z"), # noqa: E241 + ("2021-01-01T00:00:00Z", "2021-01-01T00:00:00Z"), # noqa: E241 + ("2021-01-01T23:42:11Z", "2021-01-01T23:42:11Z"), # noqa: E241 + ("2021-01-01T00:00Z", "2021-01-01T00:00:00Z"), # noqa: E241 + ("2021-01-01T23:42Z", "2021-01-01T23:42:00Z"), # noqa: E241 + ("2021-01-01", "2021-01-01T00:00:00Z"), # noqa: E241 + # Basic format + ("20210101T000000Z", "2021-01-01T00:00:00Z"), # noqa: E241 + ("20210101T234211123456Z", "2021-01-01T23:42:11.123456Z"), # noqa: E241 + ("20210101T000000Z", "2021-01-01T00:00:00Z"), # noqa: E241 + ("20210101T234211Z", "2021-01-01T23:42:11Z"), # noqa: E241 + ("20210101T0000Z", "2021-01-01T00:00:00Z"), # noqa: E241 + ("20210101T2342Z", "2021-01-01T23:42:00Z"), # noqa: E241 + ("20210101", "2021-01-01T00:00:00Z"), # noqa: E241 + ], + # fmt: on +) +def test_reverse_roundtrip(value: str, normalized_value: str) -> None: + v2 = get_iso_timestamp(parse_iso_datetime(value)) + + assert v2 == normalized_value From 016d14db3722dd1002a765112672afb0ba3f2d09 Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Fri, 17 Mar 2023 10:21:29 +0100 Subject: [PATCH 06/23] mypy: `pglookout/config.py` [BF-1560] New file to type hint the configuration. Also: - Alphabetically sort configuration keys in the `README`. This has the effect of grouping related keys closer. 
- Add missing key: `replication_catchup_timeout` --- README.rst | 153 +++++++++++++++++++++++--------------------- pglookout/config.py | 51 +++++++++++++++ 2 files changed, 130 insertions(+), 74 deletions(-) create mode 100644 pglookout/config.py diff --git a/README.rst b/README.rst index 145fd01..8b5d4e7 100644 --- a/README.rst +++ b/README.rst @@ -201,6 +201,11 @@ description of the current state of the cluster which is under monitoring. Configuration keys ================== +``alert_file_dir`` (default ``os.getcwd()``) + +Directory in which alert files for replication warning and failover +are created. + ``autofollow`` (default ``false``) Do you want pglookout to try to start following the new primary. Useful @@ -210,38 +215,24 @@ to start following the new primary. Requires ``pg_data_directory``, ``pg_start_command`` and ``pg_stop_command`` configuration keys to be set. +``cluster_monitor_health_timeout_seconds`` (default: ``2 * replication_state_check_interval``) + +If set, it will increase the statsd counter `cluster_monitor_health_timeout` if the +`cluster_monitor` thread has not successfully completed a check since +`cluster_monitor_health_timeout_seconds`. + ``db_poll_interval`` (default ``5.0``) Interval on how often should the connections defined in remote_conns be polled for information on DB replication state. -``remote_conns`` (default ``{}``) - -PG database connection strings that the pglookout process should monitor. -Keys of the object should be names of the remotes and values must be valid -PostgreSQL connection strings or connection info objects. - -``primary_conninfo_template`` - -Connection string or connection info object template to use when setting a new -primary_conninfo value for recovery.conf after a failover has happened. Any -provided hostname and database name in the template is ignored and they are -replaced with a replication connection to the new primary node. - -Required when ``autofollow`` is true. - -``observers`` (default ``{}``) +``failover_command`` (default ``""``) -This object contains key value pairs like ``{"1.2.3.4": -"http://2.3.4.5:15000"}``. They are used to determine the location of -pglookout observer processes. Observers are processes that don't take any -actions, but simply give a third party viewpoint on the state of the -cluster. Useful especially during net splits. +Shell command to execute in case the node has deemed itself in need of promotion -``poll_observers_on_warning_only`` (default ``False``) +``failover_sleep_time`` (default ``0.0``) -this allows observers to be polled only when replication lag is over -``warning_replication_time_lag`` +Time to sleep after a failover command has been issued. ``http_address`` (default ``""``) @@ -251,55 +242,36 @@ HTTP webserver address, by default pglookout binds to all interfaces. HTTP webserver port. -``replication_state_check_interval`` (default ``10.0``) +``json_state_file_path`` (default ``"/tmp/pglookout_state.json"``) -How often should pglookout check the replication state in order to -make decisions on should the node be promoted. +Location of a JSON state file which describes the state of the +pglookout process. -``failover_sleep_time`` (default ``0.0``) +``known_gone_nodes`` (default ``[]``) -Time to sleep after a failover command has been issued. +Lists nodes that are explicitly known to have left the cluster. If the old +primary is removed in a controlled manner it should be added to this list to +ensure there's no extra delay when making promotion decision. 
+ +``log_level`` (default ``"INFO"``) + +Determines log level of pglookout. ``maintenance_mode_file`` (default ``"/tmp/pglookout_maintenance_mode_file"``) If a file exists in this location, this node will not be considered for promotion to primary. -``missing_master_from_config_timeout`` (default ``15``) - -In seconds the amount of time before we do a failover decision if a -previously existing primary has been removed from the config file and -we have gotten a SIGHUP. - -``alert_file_dir`` (default ``os.getcwd()``) - -Directory in which alert files for replication warning and failover -are created. - -``json_state_file_path`` (default ``"/tmp/pglookout_state.json"``) - -Location of a JSON state file which describes the state of the -pglookout process. - ``max_failover_replication_time_lag`` (default ``120.0``) Replication time lag after which failover_command will be executed and a failover_has_happened file will be created. -``warning_replication_time_lag`` (default ``30.0``) - -Replication time lag at which point to execute -over_warning_limit_command and to create a warning file. - -``failover_command`` (default ``""``) - -Shell command to execute in case the node has deemed itself in need of promotion - -``known_gone_nodes`` (default ``[]``) +``missing_master_from_config_timeout`` (default ``15``) -Lists nodes that are explicitly known to have left the cluster. If the old -primary is removed in a controlled manner it should be added to this list to -ensure there's no extra delay when making promotion decision. +In seconds the amount of time before we do a failover decision if a +previously existing primary has been removed from the config file and +we have gotten a SIGHUP. ``never_promote_these_nodes`` (default ``[]``) @@ -308,6 +280,14 @@ you have primary ``p`` which fails and replicas ``a`` and ```b``, even if ``b`` is ahead but is listed in ``never_promote_these_nodes``, ``a`` will be promoted. +``observers`` (default ``{}``) + +This object contains key value pairs like ``{"1.2.3.4": +"http://2.3.4.5:15000"}``. They are used to determine the location of +pglookout observer processes. Observers are processes that don't take any +actions, but simply give a third party viewpoint on the state of the +cluster. Useful especially during net splits. + ``over_warning_limit_command`` (default ``null``) Shell command to be executed once replication lag is warning_replication_time_lag @@ -316,10 +296,6 @@ Shell command to be executed once replication lag is warning_replication_time_la The key of the entry in ``remote_conns`` that matches this node. -``log_level`` (default ``"INFO"``) - -Determines log level of pglookout. - ``pg_data_directory`` (default ``"/var/lib/pgsql/data"``) PG data directory that needs to be set when autofollow has been turned on. @@ -336,18 +312,35 @@ true. Usually something like "sudo systemctl start postgresql". Command to stop a PostgreSQL process on a node which has autofollow set to true. Usually something like "sudo systemctl start postgresql". -``syslog`` (default ``false``) +``poll_observers_on_warning_only`` (default ``False``) -Determines whether syslog logging should be turned on or not. 
+this allows observers to be polled only when replication lag is over +``warning_replication_time_lag`` -``syslog_address`` (default ``"/dev/log"``) +``primary_conninfo_template`` -Determines syslog address to use in logging (requires syslog to be -true as well) +Connection string or connection info object template to use when setting a new +primary_conninfo value for recovery.conf after a failover has happened. Any +provided hostname and database name in the template is ignored and they are +replaced with a replication connection to the new primary node. -``syslog_facility`` (default ``"local2"``) +Required when ``autofollow`` is true. -Determines syslog log facility. (requires syslog to be true as well) +``remote_conns`` (default ``{}``) + +PG database connection strings that the pglookout process should monitor. +Keys of the object should be names of the remotes and values must be valid +PostgreSQL connection strings or connection info objects. + +``replication_catchup_timeout`` (default ``300.0``) + +How long should pglookout wait for a node to catch up with the primary +before it decides to promote itself. + +``replication_state_check_interval`` (default ``10.0``) + +How often should pglookout check the replication state in order to +make decisions on should the node be promoted. ``statsd`` (default: disabled) @@ -368,13 +361,25 @@ The ``tags`` setting can be used to enter optional tag values for the metrics. Metrics sending follows the `Telegraf spec`_. -.. _`Telegraf spec`: https://github.com/influxdata/telegraf/tree/master/plugins/inputs/statsd +``syslog`` (default ``false``) -``cluster_monitor_health_timeout_seconds`` (default: ``2 * replication_state_check_interval``) +Determines whether syslog logging should be turned on or not. -If set, it will increase the statsd counter `cluster_monitor_health_timeout` if the -`cluster_monitor` thread has not successfully completed a check since -`cluster_monitor_health_timeout_seconds`. +``syslog_address`` (default ``"/dev/log"``) + +Determines syslog address to use in logging (requires syslog to be +true as well) + +``syslog_facility`` (default ``"local2"``) + +Determines syslog log facility. (requires syslog to be true as well) + +``warning_replication_time_lag`` (default ``30.0``) + +Replication time lag at which point to execute +over_warning_limit_command and to create a warning file. + +.. _`Telegraf spec`: https://github.com/influxdata/telegraf/tree/master/plugins/inputs/statsd License ======= diff --git a/pglookout/config.py b/pglookout/config.py new file mode 100644 index 0000000..97f8f69 --- /dev/null +++ b/pglookout/config.py @@ -0,0 +1,51 @@ +# Copyright (c) 2023 Aiven, Helsinki, Finland. 
https://aiven.io/ +from __future__ import annotations + +from typing import Literal, TypedDict + + +class Statsd(TypedDict, total=False): + host: str + port: int + tags: dict[str, str] + + +class Config(TypedDict, total=False): + alert_file_dir: str + autofollow: bool + cluster_monitor_health_timeout_seconds: float + db_poll_interval: float + failover_command: str + failover_sleep_time: float + http_address: str + http_port: int + json_state_file_path: str + known_gone_nodes: list[str] + log_level: Literal["NOTSET", "DEBUG", "INFO", "WARNING", "WARN", "ERROR", "FATAL", "CRITICAL"] + maintenance_mode_file: str + max_failover_replication_time_lag: float + missing_master_from_config_timeout: float + never_promote_these_nodes: list[str] + observers: dict[str, str] + over_warning_limit_command: str + own_db: str + pg_data_directory: str + pg_start_command: str + pg_stop_command: str + poll_observers_on_warning_only: bool + primary_conninfo_template: str + remote_conns: dict[str, str] + replication_catchup_timeout: float + replication_state_check_interval: float + statsd: Statsd + syslog: bool + syslog_address: str + # fmt: off + # https://docs.python.org/3/library/logging.handlers.html#logging.handlers.SysLogHandler.encodePriority + syslog_facility: Literal[ + "auth", "authpriv", "console", "cron", "daemon", "ftp", "kern", "lpr", + "mail", "news", "ntp", "security", "solaris-cron", "syslog", "user", "uucp", + "local0", "local1", "local2", "local3", "local4", "local5", "local6", "local7", + ] + # fmt: on + warning_replication_time_lag: float From bf067b315c30f3f53ec791b5cce6ecaec759d61d Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Thu, 23 Mar 2023 16:23:27 +0100 Subject: [PATCH 07/23] mypy: `pglookout/common_types.py` [BF-1560] --- pglookout/common_types.py | 53 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 pglookout/common_types.py diff --git a/pglookout/common_types.py b/pglookout/common_types.py new file mode 100644 index 0000000..3c20cdf --- /dev/null +++ b/pglookout/common_types.py @@ -0,0 +1,53 @@ +# Copyright (c) 2023 Aiven, Helsinki, Finland. https://aiven.io/ +from __future__ import annotations + +from datetime import datetime +from typing import Any, Dict, TypedDict + + +class ReplicationSlotAsDict(TypedDict, total=True): + slot_name: str + plugin: str + slot_type: str + database: str + catalog_xmin: str + restart_lsn: str + confirmed_flush_lsn: str + state_data: str + + +class MemberState(TypedDict, total=False): + """Represents the state of a member of the cluster. + + Note: + This is a very loose type as no key is mandatory. This is because + it is too dangerous to impose a stricter type until we have a + better test coverage, as it would change some behaviour in the + code (some unconventional behaviour was detected, and it may be a + bug or a feature). + """ + + # Connection Status + connection: bool + fetch_time: str + # Queried Status + db_time: str | datetime + pg_is_in_recovery: bool + pg_last_xact_replay_timestamp: datetime | str | None + pg_last_xlog_receive_location: str | None + pg_last_xlog_replay_location: str | None + # Replication info + replication_slots: list[ReplicationSlotAsDict] + replication_time_lag: float | None + min_replication_time_lag: float + replication_start_time: float | None + + +# Note for future improvements: +# If we want ObserverState to accept arbitrary keys, we have three choices: +# - Use a different type (pydantic, dataclasses, etc.) 
+# - Use a TypedDict for static keys (connection, fetch_time) and a sub-dict for +# dynamic keys (received from state.json). +# - Wait for something like `allow_extra` to be implemented into TypedDict (unlikely) +# https://github.com/python/mypy/issues/4617 +ObserverState = Dict[str, Any] From a462bf92d1f2121c89296e5dac6b5c646894cf6f Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Fri, 17 Mar 2023 10:21:29 +0100 Subject: [PATCH 08/23] mypy: `pglookout/webserver.py` [BF-1560] --- pglookout/webserver.py | 80 +++++++++++++++++++++++++++--------------- pyproject.toml | 2 -- test/test_webserver.py | 19 ++++++---- 3 files changed, 65 insertions(+), 36 deletions(-) diff --git a/pglookout/webserver.py b/pglookout/webserver.py index 0394ca6..09b990c 100644 --- a/pglookout/webserver.py +++ b/pglookout/webserver.py @@ -7,8 +7,13 @@ This file is under the Apache License, Version 2.0. See the file `LICENSE` for details. """ -from http.server import HTTPServer, SimpleHTTPRequestHandler -from logging import getLogger +from __future__ import annotations + +from http.server import BaseHTTPRequestHandler, HTTPServer, SimpleHTTPRequestHandler +from logging import getLogger, Logger +from pglookout.common_types import MemberState +from pglookout.config import Config +from queue import Queue from socketserver import ThreadingMixIn from threading import Thread @@ -17,43 +22,62 @@ class ThreadedWebServer(ThreadingMixIn, HTTPServer): - cluster_state = None - log = None - cluster_monitor_check_queue = None - allow_reuse_address = True + allow_reuse_address: bool = True + + def __init__( + self, + address: str, + port: int, + RequestHandlerClass: type[BaseHTTPRequestHandler], + cluster_state: dict[str, MemberState], + log: Logger, + cluster_monitor_check_queue: Queue[str], + ) -> None: + super().__init__((address, port), RequestHandlerClass) + self.cluster_state: dict[str, MemberState] = cluster_state + self.log: Logger = log + self.cluster_monitor_check_queue: Queue[str] = cluster_monitor_check_queue class WebServer(Thread): - def __init__(self, config, cluster_state, cluster_monitor_check_queue): - Thread.__init__(self) - self.config = config - self.cluster_state = cluster_state - self.cluster_monitor_check_queue = cluster_monitor_check_queue - self.log = getLogger("WebServer") - self.address = self.config.get("http_address", "") - self.port = self.config.get("http_port", 15000) - self.server = None + def __init__( + self, config: Config, cluster_state: dict[str, MemberState], cluster_monitor_check_queue: Queue[str] + ) -> None: + super().__init__() + self.config: Config = config + self.cluster_state: dict[str, MemberState] = cluster_state + self.cluster_monitor_check_queue: Queue[str] = cluster_monitor_check_queue + self.log: Logger = getLogger("WebServer") + self.address: str = self.config.get("http_address", "") + self.port: int = self.config.get("http_port", 15000) + self.server: ThreadedWebServer | None = None self.log.debug("WebServer initialized with address: %r port: %r", self.address, self.port) - self.is_initialized = threading.Event() + self.is_initialized: threading.Event = threading.Event() - def run(self): + def run(self) -> None: # We bind the port only when we start running - self.server = ThreadedWebServer((self.address, self.port), RequestHandler) - self.server.cluster_state = self.cluster_state - self.server.log = self.log - self.server.cluster_monitor_check_queue = self.cluster_monitor_check_queue + self.server = ThreadedWebServer( + address=self.address, + port=self.port, + 
RequestHandlerClass=RequestHandler, + cluster_state=self.cluster_state, + log=self.log, + cluster_monitor_check_queue=self.cluster_monitor_check_queue, + ) self.is_initialized.set() self.server.serve_forever() - def close(self): - if self.server: - self.log.debug("Closing WebServer") - self.server.shutdown() - self.log.debug("Closed WebServer") + def close(self) -> None: + if self.server is None: + return + + self.log.debug("Closing WebServer") + self.server.shutdown() + self.log.debug("Closed WebServer") class RequestHandler(SimpleHTTPRequestHandler): - def do_GET(self): + def do_GET(self) -> None: assert isinstance(self.server, ThreadedWebServer), f"server: {self.server!r}" self.server.log.debug("Got request: %r", self.path) if self.path.startswith("/state.json"): @@ -66,7 +90,7 @@ def do_GET(self): else: self.send_response(404) - def do_POST(self): + def do_POST(self) -> None: assert isinstance(self.server, ThreadedWebServer), f"server: {self.server!r}" self.server.log.debug("Got request: %r", self.path) if self.path.startswith("/check"): diff --git a/pyproject.toml b/pyproject.toml index 7b59a5b..8cae98b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,12 +34,10 @@ exclude = [ 'pglookout/cluster_monitor.py', 'pglookout/pglookout.py', 'pglookout/version.py', - 'pglookout/webserver.py', # Tests. 'test/conftest.py', 'test/test_cluster_monitor.py', 'test/test_lookout.py', - 'test/test_webserver.py', # Other. 'setup.py', 'version.py', diff --git a/test/test_webserver.py b/test/test_webserver.py index d81e71d..a4dffb5 100644 --- a/test/test_webserver.py +++ b/test/test_webserver.py @@ -6,6 +6,10 @@ This file is under the Apache License, Version 2.0. See the file `LICENSE` for details. """ +from __future__ import annotations + +from pglookout.common_types import MemberState +from pglookout.config import Config from pglookout.webserver import WebServer from queue import Queue @@ -13,21 +17,24 @@ import requests -def test_webserver(): - config = { +def test_webserver() -> None: + config: Config = { "http_port": random.randint(10000, 32000), } - cluster_state = { - "hello": 123, + cluster_state: dict[str, MemberState] = { + "1.2.3.4": { + "connection": False, + "fetch_time": "2021-01-01T23:42:11.123456Z", + } } http_port = config["http_port"] base_url = f"http://127.0.0.1:{http_port}" - cluster_monitor_check_queue = Queue() + cluster_monitor_check_queue: Queue[str] = Queue() web = WebServer(config=config, cluster_state=cluster_state, cluster_monitor_check_queue=cluster_monitor_check_queue) try: web.start() - # wait for the thread to have started, else we're blocking forever as web.close can't shutdown the thread + # wait for the thread to have started, else we're blocking forever as web.close can't shut down the thread web.is_initialized.wait(timeout=30.0) result = requests.get(f"{base_url}/state.json", timeout=5).json() From 9b4e13fe3340acc28d35086c8a0ac4bff7b6633e Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Fri, 17 Mar 2023 15:57:59 +0100 Subject: [PATCH 09/23] mypy: `pglookout/cluster_monitor.py` [BF-1560] --- pglookout/cluster_monitor.py | 226 ++++++++++++++++++++++------------- pglookout/config.py | 2 +- pyproject.toml | 6 +- requirements.dev.txt | 1 - test/test_cluster_monitor.py | 40 ++++--- 5 files changed, 169 insertions(+), 106 deletions(-) diff --git a/pglookout/cluster_monitor.py b/pglookout/cluster_monitor.py index 287a79a..a642972 100644 --- a/pglookout/cluster_monitor.py +++ b/pglookout/cluster_monitor.py @@ -7,17 +7,23 @@ This file is under the Apache License, 
Version 2.0. See the file `LICENSE` for details. """ +from __future__ import annotations -from . import logutil -from .common import get_iso_timestamp, parse_iso_datetime -from .pgutil import mask_connection_info from concurrent.futures import as_completed, ThreadPoolExecutor from dataclasses import asdict, dataclass from email.utils import parsedate -from psycopg2.extras import RealDictCursor -from queue import Empty +from logging.handlers import SysLogHandler +from pglookout import logutil +from pglookout.common import get_iso_timestamp, parse_iso_datetime +from pglookout.common_types import MemberState, ObserverState, ReplicationSlotAsDict +from pglookout.config import Config +from pglookout.pgutil import mask_connection_info +from pglookout.statsd import StatsClient +from psycopg2.extensions import POLL_OK, POLL_READ, POLL_WRITE +from psycopg2.extras import RealDictCursor, RealDictRow +from queue import Empty, Queue from threading import Thread -from typing import List +from typing import Callable, cast, Final import datetime import errno @@ -27,6 +33,9 @@ import select import time +# https://www.psycopg.org/docs/connection.html#connection.server_version +PG_VERSION_10: Final[int] = 10_00_00 # 10.0.0 + class PglookoutTimeout(Exception): pass @@ -44,17 +53,17 @@ class ReplicationSlot: state_data: str -def wait_select(conn, timeout=5.0): +def wait_select(conn: psycopg2.connection, timeout: float = 5.0) -> None: end_time = time.monotonic() + timeout while time.monotonic() < end_time: time_left = end_time - time.monotonic() state = conn.poll() try: - if state == psycopg2.extensions.POLL_OK: + if state == POLL_OK: return - if state == psycopg2.extensions.POLL_READ: + if state == POLL_READ: select.select([conn.fileno()], [], [], min(timeout, time_left)) - elif state == psycopg2.extensions.POLL_WRITE: + elif state == POLL_WRITE: select.select([], [conn.fileno()], [], min(timeout, time_left)) else: raise psycopg2.OperationalError(f"bad state from poll: {state}") @@ -67,14 +76,14 @@ def wait_select(conn, timeout=5.0): class ClusterMonitor(Thread): def __init__( self, - config, - cluster_state, - observer_state, - create_alert_file, - cluster_monitor_check_queue, - failover_decision_queue, - is_replication_lag_over_warning_limit, - stats, + config: Config, + cluster_state: dict[str, MemberState], + observer_state: dict[str, ObserverState], + create_alert_file: Callable[[str], None], + cluster_monitor_check_queue: Queue[str], + failover_decision_queue: Queue[str], + is_replication_lag_over_warning_limit: Callable[[], bool], + stats: StatsClient, ): """Thread which collects cluster state. @@ -83,36 +92,42 @@ def __init__( in the cluster_state/observer_state dictionaries, which are shared with the main thread. 
""" Thread.__init__(self) - self.log = logging.getLogger("ClusterMonitor") - self.stats = stats - self.running = True - self.cluster_state = cluster_state - self.observer_state = observer_state - self.config = config - self.create_alert_file = create_alert_file - self.db_conns = {} - self.cluster_monitor_check_queue = cluster_monitor_check_queue - self.failover_decision_queue = failover_decision_queue - self.is_replication_lag_over_warning_limit = is_replication_lag_over_warning_limit - self.session = requests.Session() + self.log: logging.Logger = logging.getLogger("ClusterMonitor") + self.stats: StatsClient = stats + self.running: bool = True + self.cluster_state: dict[str, MemberState] = cluster_state + self.observer_state: dict[str, ObserverState] = observer_state + self.config: Config = config + self.create_alert_file: Callable[[str], None] = create_alert_file + self.db_conns: dict[str, psycopg2.connection | None] = {} + self.cluster_monitor_check_queue: Queue[str] = cluster_monitor_check_queue + self.failover_decision_queue: Queue[str] = failover_decision_queue + self.is_replication_lag_over_warning_limit: Callable[[], bool] = is_replication_lag_over_warning_limit + self.session: requests.Session = requests.Session() if self.config.get("syslog"): - self.syslog_handler = logutil.set_syslog_handler( + # Function `set_syslog_handler` already adds the handler to the provided logger. + # We just keep a reference to it here. + self.syslog_handler: SysLogHandler = logutil.set_syslog_handler( address=self.config.get("syslog_address", "/dev/log"), facility=self.config.get("syslog_facility", "local2"), logger=self.log, ) - self.last_monitoring_success_time = None + self.last_monitoring_success_time: float | None = None self.log.debug("Initialized ClusterMonitor with: %r", cluster_state) - def _connect_to_db(self, instance, dsn): + def _connect_to_db(self, instance: str, dsn: str | None) -> psycopg2.connection | None: conn = self.db_conns.get(instance) + if conn: return conn + if not dsn: self.log.warning("Can't connect to %s, dsn is %r", instance, dsn) return None + masked_connection_info = mask_connection_info(dsn) inst_info_str = f"{instance!r} ({masked_connection_info})" + try: self.log.info("Connecting to %s", inst_info_str) conn = psycopg2.connect(dsn=dsn, async_=True) @@ -133,19 +148,29 @@ def _connect_to_db(self, instance, dsn): self.log.exception("Failed to connect to %s (%s)", instance, inst_info_str) self.stats.unexpected_exception(ex, where="_connect_to_db") conn = None + self.db_conns[instance] = conn return conn - def _fetch_observer_state(self, instance, uri): - result = {"fetch_time": get_iso_timestamp(), "connection": True} + def _fetch_observer_state(self, instance: str, uri: str) -> ObserverState | None: + now_iso = get_iso_timestamp() + result = {"fetch_time": now_iso, "connection": True} fetch_uri = uri + "/state.json" + try: response = self.session.get(fetch_uri, timeout=5.0) # check time difference for large skews - remote_server_time = parsedate(response.headers["date"]) - remote_server_time = datetime.datetime.fromtimestamp(time.mktime(remote_server_time)) - time_diff = parse_iso_datetime(result["fetch_time"]) - remote_server_time + remote_server_ptime = parsedate(response.headers["date"]) + if remote_server_ptime is None: + self.log.error( + "Failed to parse date from observer node %r, response: %r, ignoring response", + instance, + response.json(), + ) + return None + remote_server_time = datetime.datetime.fromtimestamp(time.mktime(remote_server_ptime)) + time_diff = 
parse_iso_datetime(now_iso) - remote_server_time if time_diff > datetime.timedelta(seconds=5): self.log.error( "Time difference between us and observer node %r is %r, response: %r, ignoring response", @@ -168,16 +193,19 @@ def _fetch_observer_state(self, instance, uri): self.log.exception("Problem in fetching state from observer: %r, %r", instance, fetch_uri) self.stats.unexpected_exception(ex, where="_fetch_observer_state") result["connection"] = False + return result - def fetch_observer_state(self, instance, uri): + def fetch_observer_state(self, instance: str, uri: str) -> None: start_time = time.monotonic() result = self._fetch_observer_state(instance, uri) + if result: if instance in self.observer_state: self.observer_state[instance].update(result) else: self.observer_state[instance] = result + self.log.debug( "Observer: %r state was: %r, took: %.4fs to fetch", instance, @@ -185,18 +213,20 @@ def fetch_observer_state(self, instance, uri): time.monotonic() - start_time, ) - def connect_to_cluster_nodes_and_cleanup_old_nodes(self): + def connect_to_cluster_nodes_and_cleanup_old_nodes(self) -> None: leftover_conns = set(self.db_conns) - set(self.config.get("remote_conns", {})) + for leftover_instance in leftover_conns: self.log.debug("Removing leftover state for: %r", leftover_instance) self.db_conns.pop(leftover_instance) - self.cluster_state.pop(leftover_instance, "") - self.observer_state.pop(leftover_instance, "") + self.cluster_state.pop(leftover_instance, None) + self.observer_state.pop(leftover_instance, None) + # Making sure we have a connection to all currently configured db hosts for instance, connect_string in self.config.get("remote_conns", {}).items(): self._connect_to_db(instance, dsn=connect_string) - def _fetch_replication_slot_info(self, instance: str, cursor: RealDictCursor) -> List[ReplicationSlot]: + def _fetch_replication_slot_info(self, instance: str, cursor: RealDictCursor) -> list[ReplicationSlot]: """Fetch logical replication slot definitions""" self.log.debug("reading replication slot state from %r", instance) @@ -217,59 +247,49 @@ def _fetch_replication_slot_info(self, instance: str, cursor: RealDictCursor) -> """ ) wait_select(cursor.connection) - replication_slots = [ReplicationSlot(**slot) for slot in cursor.fetchall()] + replication_slots = [ + ReplicationSlot(**cast(RealDictRow, slot)) for slot in cursor.fetchall() # type: ignore[redundant-cast] + ] self.log.debug("found %d replication slot(s)", len(replication_slots)) return replication_slots - def _query_cluster_member_state(self, instance, db_conn): + def _query_cluster_member_state(self, instance: str, db_conn: psycopg2.connection | None) -> MemberState: """Query a single cluster member for its state""" - f_result = None - result = {"fetch_time": get_iso_timestamp(), "connection": False} + f_result: MemberState | None = None + result: MemberState = {"fetch_time": get_iso_timestamp(), "connection": False} + if not db_conn: - db_conn = self._connect_to_db(instance, self.config["remote_conns"].get(instance)) + dsn: str | None = self.config["remote_conns"].get(instance) + db_conn = self._connect_to_db(instance, dsn) if not db_conn: return result + phase = "querying status from" try: self.log.debug("%s %r", phase, instance) + c = db_conn.cursor(cursor_factory=RealDictCursor) - if db_conn.server_version >= 100000: - fields = [ - "now() AS db_time", - "pg_is_in_recovery()", - "pg_last_xact_replay_timestamp()", - "pg_last_wal_receive_lsn() AS pg_last_xlog_receive_location", - "pg_last_wal_replay_lsn() AS 
pg_last_xlog_replay_location", - ] - else: - fields = [ - "now() AS db_time", - "pg_is_in_recovery()", - "pg_last_xact_replay_timestamp()", - "pg_last_xlog_receive_location()", - "pg_last_xlog_replay_location()", - ] - joined_fields = ", ".join(fields) - c.execute(f"SELECT {joined_fields}") + + c.execute(self._get_statement_query_status(db_conn.server_version)) wait_select(c.connection) - maybe_standby_result = c.fetchone() + maybe_standby_result: MemberState = cast(MemberState, c.fetchone()) + if maybe_standby_result["pg_is_in_recovery"]: f_result = maybe_standby_result else: # First try reading current WAL LSN separately as txid_current may fail in some cases phase = "getting master LSN position" - if db_conn.server_version >= 100000: - wal_lsn_column = "pg_current_wal_lsn() AS pg_last_xlog_replay_location" - else: - wal_lsn_column = "pg_current_xlog_location() AS pg_last_xlog_replay_location" - c.execute(f"SELECT {wal_lsn_column}") + + c.execute(self._get_statement_query_master_lsn_position(db_conn.server_version)) wait_select(c.connection) - master_position = c.fetchone() + master_position: RealDictRow = cast(RealDictRow, c.fetchone()) maybe_standby_result["pg_last_xlog_replay_location"] = master_position["pg_last_xlog_replay_location"] f_result = maybe_standby_result - if db_conn.server_version >= 100000: - f_result["replication_slots"] = [asdict(slot) for slot in self._fetch_replication_slot_info(instance, c)] + if db_conn.server_version >= PG_VERSION_10: + f_result["replication_slots"] = [ + cast(ReplicationSlotAsDict, asdict(slot)) for slot in self._fetch_replication_slot_info(instance, c) + ] # This is only run on masters to create txid traffic every db_poll_interval phase = "updating transaction on" @@ -277,9 +297,10 @@ def _query_cluster_member_state(self, instance, db_conn): # With pg_current_wal_lsn we simulate replay_location on the master # With txid_current we force a new transaction to occur every poll interval to ensure there's # a heartbeat for the replication lag. 
- c.execute(f"SELECT txid_current(), {wal_lsn_column}") + c.execute(self._get_statement_query_updating_transaction(db_conn.server_version)) wait_select(c.connection) - master_result = c.fetchone() + master_result: RealDictRow = cast(RealDictRow, c.fetchone()) + f_result["pg_last_xlog_replay_location"] = master_result["pg_last_xlog_replay_location"] except ( PglookoutTimeout, @@ -296,12 +317,45 @@ def _query_cluster_member_state(self, instance, db_conn): return result @staticmethod - def _parse_status_query_result(result): + def _get_statement_query_status(server_version: int) -> str: + if server_version >= PG_VERSION_10: + return ( + "SELECT now() AS db_time, " + "pg_is_in_recovery(), " + "pg_last_xact_replay_timestamp(), " + "pg_last_wal_receive_lsn() AS pg_last_xlog_receive_location, " + "pg_last_wal_replay_lsn() AS pg_last_xlog_replay_location" + ) + return ( + "SELECT now() AS db_time, " + "pg_is_in_recovery(), " + "pg_last_xact_replay_timestamp(), " + "pg_last_xlog_receive_location(), " + "pg_last_xlog_replay_location()" + ) + + @staticmethod + def _get_statement_query_master_lsn_position(server_version: int) -> str: + if server_version >= PG_VERSION_10: + return "SELECT pg_current_wal_lsn() AS pg_last_xlog_replay_location" + return "SELECT pg_current_xlog_location() AS pg_last_xlog_replay_location" + + @staticmethod + def _get_statement_query_updating_transaction(server_version: int) -> str: + if server_version >= PG_VERSION_10: + return "SELECT txid_current(), pg_current_wal_lsn() AS pg_last_xlog_replay_location" + return "SELECT txid_current(), pg_current_xlog_location() AS pg_last_xlog_replay_location" + + # FIXME: Find a tighter input + return type + @staticmethod + def _parse_status_query_result(result: MemberState) -> MemberState: if not result: return {} + + db_time = cast(datetime.datetime, result["db_time"]) # abs is for catching time travel (as in going from the future to the past - if result["pg_last_xact_replay_timestamp"]: - replication_time_lag = abs(result["db_time"] - result["pg_last_xact_replay_timestamp"]) + if isinstance(result["pg_last_xact_replay_timestamp"], datetime.datetime): + replication_time_lag: datetime.timedelta = abs(db_time - result["pg_last_xact_replay_timestamp"]) result["replication_time_lag"] = replication_time_lag.total_seconds() result["pg_last_xact_replay_timestamp"] = get_iso_timestamp(result["pg_last_xact_replay_timestamp"]) @@ -317,10 +371,10 @@ def _parse_status_query_result(result): "replication_time_lag": None, # differentiate from actual lag=0.0 } ) - result.update({"db_time": get_iso_timestamp(result["db_time"]), "connection": True}) + result.update({"db_time": get_iso_timestamp(db_time), "connection": True}) return result - def update_cluster_member_state(self, instance, db_conn): + def update_cluster_member_state(self, instance: str, db_conn: psycopg2.connection | None) -> None: """Update the cluster state entry for a single cluster member""" start_time = time.monotonic() result = self._query_cluster_member_state(instance, db_conn) @@ -348,7 +402,7 @@ def update_cluster_member_state(self, instance, db_conn): else: self.cluster_state[instance]["min_replication_time_lag"] = min(min_lag, now_lag) - def main_monitoring_loop(self, requested_check=False): + def main_monitoring_loop(self, requested_check: bool = False) -> None: self.connect_to_cluster_nodes_and_cleanup_old_nodes() thread_count = len(self.db_conns) + len(self.config.get("observers", {})) futures = [] @@ -367,12 +421,14 @@ def main_monitoring_loop(self, requested_check=False): 
        self.last_monitoring_success_time = time.monotonic()

-    def run(self):
+    def run(self) -> None:
         self.main_monitoring_loop()
         while self.running:
             requested_check = False
             try:
-                requested_check = self.cluster_monitor_check_queue.get(timeout=self.config.get("db_poll_interval", 5.0))
+                requested_check = bool(
+                    self.cluster_monitor_check_queue.get(timeout=self.config.get("db_poll_interval", 5.0))
+                )
             except Empty:
                 pass
             self.main_monitoring_loop(requested_check)
diff --git a/pglookout/config.py b/pglookout/config.py
index 97f8f69..761263d 100644
--- a/pglookout/config.py
+++ b/pglookout/config.py
@@ -34,7 +34,7 @@ class Config(TypedDict, total=False):
     pg_stop_command: str
     poll_observers_on_warning_only: bool
     primary_conninfo_template: str
-    remote_conns: dict[str, str]
+    remote_conns: dict[str, str]  # instance name -> dsn
     replication_catchup_timeout: float
     replication_state_check_interval: float
     statsd: Statsd
diff --git a/pyproject.toml b/pyproject.toml
index 8cae98b..e01c7e7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,18 +31,20 @@ files = [
 exclude = [
     # Implementation.
     'pglookout/__main__.py',
-    'pglookout/cluster_monitor.py',
     'pglookout/pglookout.py',
     'pglookout/version.py',
     # Tests.
     'test/conftest.py',
-    'test/test_cluster_monitor.py',
     'test/test_lookout.py',
     # Other.
     'setup.py',
     'version.py',
 ]

+[[tool.mypy.overrides]]
+module = "test.test_cluster_monitor"
+# ignore no-untyped-call because conftest can only be type hinted at the end. Remove at the end.
+disallow_untyped_calls = false

 [tool.pylint.'MESSAGES CONTROL']
 disable = [
diff --git a/requirements.dev.txt b/requirements.dev.txt
index dca3637..1dd3f89 100644
--- a/requirements.dev.txt
+++ b/requirements.dev.txt
@@ -2,7 +2,6 @@ black==22.10.0
 flake8
 flake8-pyproject==1.2.2
 isort==5.10.1
-mock
 mypy==1.1.1
 types-psycopg2==2.9.21.8
 types-requests==2.26.1
diff --git a/test/test_cluster_monitor.py b/test/test_cluster_monitor.py
index 58d3ab7..710ccee 100644
--- a/test/test_cluster_monitor.py
+++ b/test/test_cluster_monitor.py
@@ -7,12 +7,15 @@
 from .conftest import TestPG
 from contextlib import closing
 from datetime import datetime, timedelta
-from mock import patch
 from packaging import version
 from pglookout import statsd
 from pglookout.cluster_monitor import ClusterMonitor
+from pglookout.common_types import MemberState, ObserverState
+from pglookout.config import Config
 from psycopg2.extras import RealDictCursor
 from queue import Queue
+from typing import NoReturn
+from unittest.mock import patch

 import base64
 import psycopg2
@@ -20,28 +23,31 @@
 import time


-def test_replication_lag():
+def test_replication_lag() -> None:
     # pylint: disable=protected-access
     now = datetime.now()
-    status = {
+    status: MemberState = {
         "db_time": now,
         "pg_is_in_recovery": True,
         "pg_last_xact_replay_timestamp": now,
         "pg_last_xlog_receive_location": "0/0000001",
         "pg_last_xlog_replay_location": "0/0000002",
     }
+
     result = ClusterMonitor._parse_status_query_result(status.copy())
     assert result["replication_time_lag"] == 0.0
-    status["db_time"] += timedelta(seconds=50, microseconds=42)
+
+    status["db_time"] = now + timedelta(seconds=50, microseconds=42)
     result = ClusterMonitor._parse_status_query_result(status.copy())
     assert result["replication_time_lag"] == 50.000042
+
     status["db_time"] = now + timedelta(hours=42)
     result = ClusterMonitor._parse_status_query_result(status.copy())
     assert result["replication_time_lag"] == 151200.0


-def test_main_loop(db):
-    config = {
+def test_main_loop(db: TestPG) -> None:
+    config: Config = {
         "remote_conns": {
             "test1db":
db.connection_string("testuser"), "test2db": db.connection_string("otheruser"), @@ -49,14 +55,14 @@ def test_main_loop(db): "observers": {"local": "URL"}, "poll_observers_on_warning_only": True, } - cluster_state = {} - observer_state = {} + cluster_state: dict[str, MemberState] = {} + observer_state: dict[str, ObserverState] = {} - def create_alert_file(arg): + def create_alert_file(arg: str) -> NoReturn: raise Exception(arg) - cluster_monitor_check_queue = Queue() - failover_decision_queue = Queue() + cluster_monitor_check_queue: Queue[str] = Queue() + failover_decision_queue: Queue[str] = Queue() cm = ClusterMonitor( config=config, @@ -103,7 +109,7 @@ def test_fetch_replication_slot_info(db: TestPG) -> None: if version.parse(db.pgver) < version.parse("10"): pytest.skip(f"unsupported pg version: {db.pgver}") - config = { + config: Config = { "remote_conns": { "test1db": db.connection_string("testuser"), "test2db": db.connection_string("otheruser"), @@ -111,14 +117,14 @@ def test_fetch_replication_slot_info(db: TestPG) -> None: "observers": {"local": "URL"}, "poll_observers_on_warning_only": True, } - cluster_state = {} - observer_state = {} + cluster_state: dict[str, MemberState] = {} + observer_state: dict[str, ObserverState] = {} - def create_alert_file(arg): + def create_alert_file(arg: str) -> NoReturn: raise Exception(arg) - cluster_monitor_check_queue = Queue() - failover_decision_queue = Queue() + cluster_monitor_check_queue: Queue[str] = Queue() + failover_decision_queue: Queue[str] = Queue() cm = ClusterMonitor( config=config, From 45d413b20e0e362c2080e80484fb626db7058752 Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Thu, 23 Mar 2023 15:35:45 +0100 Subject: [PATCH 10/23] mypy: `pglookout/pglookout.py` [BF-1560] --- pglookout/cluster_monitor.py | 10 +- pglookout/common_types.py | 21 +- pglookout/current_master.py | 3 +- pglookout/default.py | 10 + pglookout/pglookout.py | 561 ++++++++++++++++++++++------------- pyproject.toml | 1 - test/test_cluster_monitor.py | 6 +- 7 files changed, 395 insertions(+), 217 deletions(-) create mode 100644 pglookout/default.py diff --git a/pglookout/cluster_monitor.py b/pglookout/cluster_monitor.py index a642972..29eab48 100644 --- a/pglookout/cluster_monitor.py +++ b/pglookout/cluster_monitor.py @@ -15,7 +15,7 @@ from logging.handlers import SysLogHandler from pglookout import logutil from pglookout.common import get_iso_timestamp, parse_iso_datetime -from pglookout.common_types import MemberState, ObserverState, ReplicationSlotAsDict +from pglookout.common_types import MemberState, ObservedState, ReplicationSlotAsDict from pglookout.config import Config from pglookout.pgutil import mask_connection_info from pglookout.statsd import StatsClient @@ -78,7 +78,7 @@ def __init__( self, config: Config, cluster_state: dict[str, MemberState], - observer_state: dict[str, ObserverState], + observer_state: dict[str, ObservedState], create_alert_file: Callable[[str], None], cluster_monitor_check_queue: Queue[str], failover_decision_queue: Queue[str], @@ -96,7 +96,7 @@ def __init__( self.stats: StatsClient = stats self.running: bool = True self.cluster_state: dict[str, MemberState] = cluster_state - self.observer_state: dict[str, ObserverState] = observer_state + self.observer_state: dict[str, ObservedState] = observer_state self.config: Config = config self.create_alert_file: Callable[[str], None] = create_alert_file self.db_conns: dict[str, psycopg2.connection | None] = {} @@ -152,7 +152,7 @@ def _connect_to_db(self, instance: str, dsn: str | None) -> 
psycopg2.connection self.db_conns[instance] = conn return conn - def _fetch_observer_state(self, instance: str, uri: str) -> ObserverState | None: + def _fetch_observer_state(self, instance: str, uri: str) -> ObservedState | None: now_iso = get_iso_timestamp() result = {"fetch_time": now_iso, "connection": True} fetch_uri = uri + "/state.json" @@ -194,7 +194,7 @@ def _fetch_observer_state(self, instance: str, uri: str) -> ObserverState | None self.stats.unexpected_exception(ex, where="_fetch_observer_state") result["connection"] = False - return result + return cast(ObservedState, result) def fetch_observer_state(self, instance: str, uri: str) -> None: start_time = time.monotonic() diff --git a/pglookout/common_types.py b/pglookout/common_types.py index 3c20cdf..43fd3fa 100644 --- a/pglookout/common_types.py +++ b/pglookout/common_types.py @@ -2,7 +2,7 @@ from __future__ import annotations from datetime import datetime -from typing import Any, Dict, TypedDict +from typing import TypedDict, Union class ReplicationSlotAsDict(TypedDict, total=True): @@ -44,10 +44,25 @@ class MemberState(TypedDict, total=False): # Note for future improvements: -# If we want ObserverState to accept arbitrary keys, we have three choices: +# If we want ObservedState to accept arbitrary keys, we have three choices: # - Use a different type (pydantic, dataclasses, etc.) # - Use a TypedDict for static keys (connection, fetch_time) and a sub-dict for # dynamic keys (received from state.json). # - Wait for something like `allow_extra` to be implemented into TypedDict (unlikely) # https://github.com/python/mypy/issues/4617 -ObserverState = Dict[str, Any] +class ObservedState(dict[str, Union[MemberState, bool, str]]): + """Represents an observed state, from the perspective of an observer. + + Note: + The content of this type is dynamic, as it depends on the number of + members in the cluster. There are two static keys, connection and + fetch_time, and N dynamic keys, one for each member of the cluster. + Like so:: + + connection: bool + fetch_time: str + name_or_ip_1: MemberState + name_or_ip_2: MemberState + ... + name_or_ip_N: MemberState + """ diff --git a/pglookout/current_master.py b/pglookout/current_master.py index 8f7154f..f6dfb54 100644 --- a/pglookout/current_master.py +++ b/pglookout/current_master.py @@ -11,6 +11,7 @@ from __future__ import annotations from . import version +from pglookout.default import JSON_STATE_FILE_PATH import argparse import json @@ -43,7 +44,7 @@ def main(args: list[str] | None = None) -> int: try: with open(arg.state, "r") as fp: config = json.load(fp) - state_file_path = config.get("json_state_file_path", "/tmp/pglookout_state.json") # pylint: disable=no-member + state_file_path = config.get("json_state_file_path", JSON_STATE_FILE_PATH) # pylint: disable=no-member if time.monotonic() - os.stat(state_file_path).st_mtime > 60.0: # file older than one minute, pglookout probably dead, exit with minus one return -1 diff --git a/pglookout/default.py b/pglookout/default.py new file mode 100644 index 0000000..e6e2288 --- /dev/null +++ b/pglookout/default.py @@ -0,0 +1,10 @@ +# Copyright (c) 2023 Aiven, Helsinki, Finland. 
https://aiven.io/ +from typing import Final + +WARNING_REPLICATION_TIME_LAG: Final[float] = 30.0 +MAX_FAILOVER_REPLICATION_TIME_LAG: Final[float] = 120.0 +REPLICATION_CATCHUP_TIMEOUT: Final[float] = 300.0 +MISSING_MASTER_FROM_CONFIG_TIMEOUT: Final[float] = 15.0 +MAINTENANCE_MODE_FILE: Final[str] = "/tmp/pglookout_maintenance_mode_file" +PG_DATA_DIRECTORY: Final[str] = "/var/lib/pgsql/data" +JSON_STATE_FILE_PATH: Final[str] = "/tmp/pglookout_state.json" diff --git a/pglookout/pglookout.py b/pglookout/pglookout.py index 1c1bc16..abda47a 100755 --- a/pglookout/pglookout.py +++ b/pglookout/pglookout.py @@ -7,72 +7,96 @@ This file is under the Apache License, Version 2.0. See the file `LICENSE` for details. """ -from . import logutil, statsd, version -from .cluster_monitor import ClusterMonitor -from .common import convert_xlog_location_to_offset, get_iso_timestamp, parse_iso_datetime -from .pgutil import create_connection_string, get_connection_info, get_connection_info_from_config_line -from .webserver import WebServer -from packaging.version import parse -from psycopg2.extensions import adapt +from __future__ import annotations + +from argparse import ArgumentParser +from copy import deepcopy +from datetime import datetime, timedelta +from logging import DEBUG, getLevelNamesMapping, getLogger, Logger +from logging.handlers import SysLogHandler +from packaging.version import parse as parse_version +from pathlib import Path +from pglookout.cluster_monitor import ClusterMonitor +from pglookout.common import convert_xlog_location_to_offset, get_iso_timestamp, parse_iso_datetime +from pglookout.common_types import MemberState, ObservedState +from pglookout.config import Config, Statsd +from pglookout.default import ( + JSON_STATE_FILE_PATH, + MAINTENANCE_MODE_FILE, + MAX_FAILOVER_REPLICATION_TIME_LAG, + MISSING_MASTER_FROM_CONFIG_TIMEOUT, + PG_DATA_DIRECTORY, + REPLICATION_CATCHUP_TIMEOUT, + WARNING_REPLICATION_TIME_LAG, +) +from pglookout.logutil import configure_logging, notify_systemd, set_syslog_handler +from pglookout.pgutil import ( + ConnectionParameterKeywords, + create_connection_string, + get_connection_info, + get_connection_info_from_config_line, +) +from pglookout.statsd import StatsClient +from pglookout.version import __version__ +from pglookout.webserver import WebServer +from psycopg2.extensions import adapt, QuotedString from queue import Empty, Queue -from typing import Optional +from signal import SIGHUP, SIGINT, signal, SIGTERM +from socket import gethostname +from subprocess import CalledProcessError, check_call +from types import FrameType +from typing import cast, Final, Optional -import argparse -import copy -import datetime import json -import logging -import logging.handlers -import os -import signal -import socket -import subprocess import sys import time +DEFAULT_LOG_LEVEL: Final[str] = "DEBUG" +LOG_LEVEL_NAMES_MAPPING: Final[dict[str, int]] = getLevelNamesMapping() + class PgLookout: - def __init__(self, config_path): - self.log = logging.getLogger("pglookout") + def __init__(self, config_path: Path | str) -> None: + self.log: Logger = getLogger("pglookout") # dummy to make sure we never get an AttributeError -> gets overwritten after the first config loading - self.stats = statsd.StatsClient(host=None) - self.running = True - self.replication_lag_over_warning_limit = False - - self.config_path = config_path - self.config = {} - self.log_level = "DEBUG" - - self.connected_master_nodes = {} - self.disconnected_master_nodes = {} - self.connected_observer_nodes = {} - 
self.disconnected_observer_nodes = {} - self.replication_catchup_timeout = None - self.replication_lag_warning_boundary = None - self.replication_lag_failover_timeout = None - self.missing_master_from_config_timeout = None - self.own_db = None - self.current_master = None - self.failover_command = None - self.known_gone_nodes = None - self.over_warning_limit_command = None - self.never_promote_these_nodes = None - self.primary_conninfo_template = None - self.cluster_monitor = None - self.syslog_handler = None - self.cluster_nodes_change_time = time.monotonic() - self.cluster_monitor_check_queue = Queue() - self.failover_decision_queue = Queue() - self.observer_state_newer_than = datetime.datetime.min - self._start_time = None + self.stats: StatsClient = StatsClient(host=None) + self.running: bool = True + self.replication_lag_over_warning_limit: bool = False + + self.config_path: Path = Path(config_path) + self.config: Config = {} + self.log_level: int = LOG_LEVEL_NAMES_MAPPING[DEFAULT_LOG_LEVEL] + + self.connected_master_nodes: dict[str, MemberState] = {} + self.disconnected_master_nodes: dict[str, MemberState] = {} + self.connected_observer_nodes: dict[str, str | None] = {} # name => ISO fetch time + self.disconnected_observer_nodes: dict[str, str | None] = {} # name => ISO fetch time + self.replication_catchup_timeout: float = REPLICATION_CATCHUP_TIMEOUT + self.replication_lag_warning_boundary: float = WARNING_REPLICATION_TIME_LAG + self.replication_lag_failover_timeout: float = MAX_FAILOVER_REPLICATION_TIME_LAG + self.missing_master_from_config_timeout: float = MISSING_MASTER_FROM_CONFIG_TIMEOUT + self.own_db: str = "" + self.current_master: str | None = None + self.failover_command: list[str] = [] + self.known_gone_nodes: list[str] = [] + self.over_warning_limit_command: str | None = None + self.never_promote_these_nodes: list[str] = [] + self.primary_conninfo_template: ConnectionParameterKeywords = {} + self.cluster_monitor: ClusterMonitor | None = None + self.syslog_handler: SysLogHandler | None = None + self.cluster_nodes_change_time: float = time.monotonic() + self.cluster_monitor_check_queue: Queue[str] = Queue() + self.failover_decision_queue: Queue[str] = Queue() + self.observer_state_newer_than: datetime = datetime.min + self._start_time: float | None = None self.load_config() - signal.signal(signal.SIGHUP, self.load_config) - signal.signal(signal.SIGINT, self.quit) - signal.signal(signal.SIGTERM, self.quit) + signal(SIGHUP, self.load_config_from_signal) + signal(SIGINT, self.quit) + signal(SIGTERM, self.quit) - self.cluster_state = {} - self.observer_state = {} + self.cluster_state: dict[str, MemberState] = {} + self.observer_state: dict[str, ObservedState] = {} self.cluster_monitor = ClusterMonitor( config=self.config, @@ -86,33 +110,71 @@ def __init__(self, config_path): ) # cluster_monitor doesn't exist at the time of reading the config initially self.cluster_monitor.log.setLevel(self.log_level) - self.webserver = WebServer(self.config, self.cluster_state, self.cluster_monitor_check_queue) + self.webserver: WebServer = WebServer(self.config, self.cluster_state, self.cluster_monitor_check_queue) + + if not self._is_initial_state_valid(): + self.log.error("Initial state is invalid, exiting.") + sys.exit(1) - logutil.notify_systemd("READY=1") + notify_systemd("READY=1") self.log.info( "PGLookout initialized, local hostname: %r, own_db: %r, cwd: %r", - socket.gethostname(), + gethostname(), self.own_db, - os.getcwd(), + Path.cwd(), ) - def quit(self, _signal=None, 
_frame=None): + def _is_initial_state_valid(self) -> bool: + """Check if the initial state of PgLookout is valid. + + Note: + This method is only needed because we had to pick some default values before loading the config. + A better approach would be to load the config first, and then initialize PgLookout. + For that, :any:`load_config` should be split into two methods: one for loading the config, and one for + applying the config when doing hot reloads. The method loading the config should make minimal usage of + ``self`` and return a new ``Config`` object. The method applying the config should accept a ``Config`` + object and apply it to ``self``. Meanwhile, ``__init__`` should use this ``Config`` object to initialize + itself, and it could be in different ways than during hot reloads (some internals don't depend on the config). + + Note: + To developers: Any attribute that should be mandatory in the config should be checked here. + """ + is_valid = True + + if not self.config or not self.config_path.is_file(): + self.log.error("Config is empty!") + is_valid = False + + if not self.own_db: + self.log.error("`own_db` has not been set!") + is_valid = False + + return is_valid + + def quit(self, _signal: int | None = None, _frame: FrameType | None = None) -> None: + if self.cluster_monitor is None: + raise RuntimeError("Cluster monitor is not initialized!") + self.log.warning("Quitting, signal: %r, frame: %r", _signal, _frame) self.cluster_monitor.running = False self.running = False self.webserver.close() - def load_config(self, _signal=None, _frame=None): + def load_config_from_signal(self, _signal: int, _frame: FrameType | None = None) -> None: self.log.debug( "Loading JSON config from: %r, signal: %r, frame: %r", self.config_path, _signal, _frame, ) + self.load_config() + + def load_config(self) -> None: + self.log.debug("Loading JSON config from: %r", self.config_path) previous_remote_conns = self.config.get("remote_conns") try: - with open(self.config_path) as fp: + with self.config_path.open() as fp: self.config = json.load(fp) except Exception as ex: # pylint: disable=broad-except self.log.exception("Invalid JSON config, exiting") @@ -120,35 +182,35 @@ def load_config(self, _signal=None, _frame=None): sys.exit(1) # statsd settings may have changed - stats = self.config.get("statsd", {}) - self.stats = statsd.StatsClient(host=stats.get("host"), port=stats.get("port"), tags=stats.get("tags")) + stats: Statsd = self.config.get("statsd", {}) + self.stats = StatsClient(**stats) if previous_remote_conns != self.config.get("remote_conns"): self.cluster_nodes_change_time = time.monotonic() - if self.config.get("autofollow"): + if self.config.get("autofollow", False): try: self.primary_conninfo_template = get_connection_info(self.config["primary_conninfo_template"]) except (KeyError, ValueError): self.log.exception("Invalid or missing primary_conninfo_template; not enabling autofollow") self.config["autofollow"] = False - if self.cluster_monitor: - self.cluster_monitor.config = copy.deepcopy(self.config) + if self.cluster_monitor is not None: + self.cluster_monitor.config = deepcopy(self.config) - if self.config.get("syslog") and not self.syslog_handler: - self.syslog_handler = logutil.set_syslog_handler( + if self.config.get("syslog") and self.syslog_handler is None: + self.syslog_handler = set_syslog_handler( address=self.config.get("syslog_address", "/dev/log"), facility=self.config.get("syslog_facility", "local2"), - logger=logging.getLogger(), + logger=getLogger(), ) - self.own_db = 
self.config.get("own_db")
+        self.own_db = self.config.get("own_db", "")

-        log_level_name = self.config.get("log_level", "DEBUG")
-        self.log_level = getattr(logging, log_level_name)
+        log_level_name = self.config.get("log_level", DEFAULT_LOG_LEVEL)
+        self.log_level = LOG_LEVEL_NAMES_MAPPING[log_level_name]
         try:
             self.log.setLevel(self.log_level)
-            if self.cluster_monitor:
+            if self.cluster_monitor is not None:
                 self.cluster_monitor.log.setLevel(self.log_level)
         except ValueError:
             print(f"Problem setting log level {self.log_level!r}")
@@ -156,12 +218,17 @@ def load_config(self, _signal=None, _frame=None):
         self.known_gone_nodes = self.config.get("known_gone_nodes", [])
         self.never_promote_these_nodes = self.config.get("never_promote_these_nodes", [])
         # we need the failover_command to be converted into subprocess [] format
+        # XXX BF-1971: The next two lines are potentially unsafe. We should use shlex.split instead.
         self.failover_command = self.config.get("failover_command", "").split()
         self.over_warning_limit_command = self.config.get("over_warning_limit_command")
-        self.replication_lag_warning_boundary = self.config.get("warning_replication_time_lag", 30.0)
-        self.replication_lag_failover_timeout = self.config.get("max_failover_replication_time_lag", 120.0)
-        self.replication_catchup_timeout = self.config.get("replication_catchup_timeout", 300.0)
-        self.missing_master_from_config_timeout = self.config.get("missing_master_from_config_timeout", 15.0)
+        self.replication_lag_warning_boundary = self.config.get("warning_replication_time_lag", WARNING_REPLICATION_TIME_LAG)
+        self.replication_lag_failover_timeout = self.config.get(
+            "max_failover_replication_time_lag", MAX_FAILOVER_REPLICATION_TIME_LAG
+        )
+        self.replication_catchup_timeout = self.config.get("replication_catchup_timeout", REPLICATION_CATCHUP_TIMEOUT)
+        self.missing_master_from_config_timeout = self.config.get(
+            "missing_master_from_config_timeout", MISSING_MASTER_FROM_CONFIG_TIMEOUT
+        )

         if self.replication_lag_warning_boundary >= self.replication_lag_failover_timeout:
             msg = "Replication lag warning boundary (%s) is not lower than its failover timeout (%s)"
@@ -177,14 +244,14 @@ def load_config(self, _signal=None, _frame=None):
         self.log.debug("Loaded config: %r from: %r", self.config, self.config_path)
         self.cluster_monitor_check_queue.put("new config came, recheck")

-    def write_cluster_state_to_json_file(self):
+    def write_cluster_state_to_json_file(self) -> None:
         """Periodically write a JSON state file to disk

         Currently only used to share state with the current_master helper command; pglookout
         itself does not rely on this file.
""" start_time = time.monotonic() - state_file_path = self.config.get("json_state_file_path", "/tmp/pglookout_state.json") + state_file_path = Path(self.config.get("json_state_file_path", JSON_STATE_FILE_PATH)) overall_state = { "db_nodes": self.cluster_state, "observer_nodes": self.observer_state, @@ -193,13 +260,15 @@ def write_cluster_state_to_json_file(self): try: json_to_dump = json.dumps(overall_state, indent=4) self.log.debug( - "Writing JSON state file to: %r, file_size: %r", + "Writing JSON state file to: %s, file_size: %r", state_file_path, len(json_to_dump), ) - with open(state_file_path + ".tmp", "w") as fp: - fp.write(json_to_dump) - os.rename(state_file_path + ".tmp", state_file_path) + + state_file_path_tmp = state_file_path.with_name(f"{state_file_path.name}.tmp") + state_file_path_tmp.write_text(json_to_dump) + state_file_path_tmp.rename(state_file_path) + self.log.debug( "Wrote JSON state file to disk, took %.4fs", time.monotonic() - start_time, @@ -212,88 +281,130 @@ def write_cluster_state_to_json_file(self): ) self.stats.unexpected_exception(ex, where="write_cluster_state_to_json_file") - def create_node_map(self, cluster_state, observer_state): + def create_node_map( + self, cluster_state: dict[str, MemberState], observer_state: dict[str, ObservedState] + ) -> tuple[str | None, MemberState | None, dict[str, MemberState]]: """Computes roles for each known member of cluster. - Use the information gathered in the cluster_state and observer_state to figure out the roles of each member.""" - standby_nodes, master_node, master_instance = {}, None, None - connected_master_nodes, disconnected_master_nodes = {}, {} - connected_observer_nodes, disconnected_observer_nodes = {}, {} + Use the information gathered in the ``cluster_state`` and ``observer_state`` to figure out the roles of each member. 
+ + Returns: + A 3-tuple with the following elements: + - The name of the master instance + - The state of the master instance + - A dictionary of the standby instances and their states + """ + master_instance: str | None = None + master_node: MemberState | None = None + standby_nodes: dict[str, MemberState] = {} + + connected_master_nodes: dict[str, MemberState] = {} + disconnected_master_nodes: dict[str, MemberState] = {} + connected_observer_nodes: dict[str, str | None] = {} + disconnected_observer_nodes: dict[str, str | None] = {} + self.log.debug( "Creating node map out of cluster_state: %r and observer_state: %r", cluster_state, observer_state, ) - for instance, state in cluster_state.items(): - if "pg_is_in_recovery" in state: - if state["pg_is_in_recovery"]: - standby_nodes[instance] = state - elif state["connection"]: - connected_master_nodes[instance] = state - elif not state["connection"]: - disconnected_master_nodes[instance] = state + + for instance_name, member_state in cluster_state.items(): + if "pg_is_in_recovery" in member_state: + if member_state["pg_is_in_recovery"]: + standby_nodes[instance_name] = member_state + elif member_state["connection"]: + connected_master_nodes[instance_name] = member_state + else: + disconnected_master_nodes[instance_name] = member_state else: self.log.debug( "No knowledge on instance: %r state: %r of whether it's in recovery or not", - instance, - state, + instance_name, + member_state, ) - for observer_name, state in observer_state.items(): # pylint: disable=too-many-nested-blocks - connected = state.get("connection", False) + for observer_name, observed_state in observer_state.items(): # pylint: disable=too-many-nested-blocks + connected = cast(bool, observed_state.get("connection", False)) + state_fetch_time = cast(Optional[str], observed_state.get("fetch_time")) if connected: - connected_observer_nodes[observer_name] = state.get("fetch_time") + connected_observer_nodes[observer_name] = state_fetch_time else: - disconnected_observer_nodes[observer_name] = state.get("fetch_time") - for instance, db_state in state.items(): - if instance not in cluster_state: + disconnected_observer_nodes[observer_name] = state_fetch_time + for ob_state_key, ob_state_value in observed_state.items(): + if ob_state_key in ["connection", "fetch_time"]: + continue + + observed_member_name = ob_state_key + observed_member_state = cast(MemberState, ob_state_value) + + if observed_member_name not in cluster_state: # A single observer can observe multiple different replication clusters. # Ignore data on nodes that don't belong in our own cluster self.log.debug( "Ignoring instance: %r since it does not belong into our own replication cluster", - instance, + observed_member_name, ) continue - if isinstance(db_state, dict): # other keys are "connection" and "fetch_time" - own_fetch_time = parse_iso_datetime(cluster_state[instance]["fetch_time"]) - observer_fetch_time = parse_iso_datetime(db_state["fetch_time"]) - self.log.debug( - "observer_name: %r, instance: %r, state: %r, observer_fetch_time: %r", + + if not isinstance(observed_member_state, dict): # other keys are "connection" and "fetch_time" + self.log.error( + "Observer %r has invalid state for instance %r: %r", observer_name, - instance, - db_state, - observer_fetch_time, + observed_member_name, + observed_member_state, + ) + self.log.error( + "Allowed keys are 'connection' (bool) and 'fetch_time' (str)" + " and all observed instance names for the cluster." 
) - if "pg_is_in_recovery" in db_state: - if db_state["pg_is_in_recovery"]: - # we always trust ourselves the most for localhost, and - # in case we are actually connected to the other node - if observer_fetch_time >= own_fetch_time and instance != self.own_db: - if instance not in standby_nodes or standby_nodes[instance]["connection"] is False: - standby_nodes[instance] = db_state - else: - master_node = connected_master_nodes.get(instance, {}) - connected = master_node.get("connection", False) - self.log.debug( - "Observer: %r sees %r as master, we see: %r, same_master: %r, connection: %r", - observer_name, - instance, - self.current_master, - instance == self.current_master, - db_state.get("connection"), - ) - if self.within_dbpoll_time(observer_fetch_time, own_fetch_time) and instance != self.own_db: - if connected or db_state["connection"]: - connected_master_nodes[instance] = db_state - else: - disconnected_master_nodes[instance] = db_state + continue + + own_fetch_time = parse_iso_datetime(cluster_state[observed_member_name]["fetch_time"]) + observer_fetch_time = parse_iso_datetime(observed_member_state["fetch_time"]) + self.log.debug( + "observer_name: %r, instance: %r, state: %r, observer_fetch_time: %r", + observer_name, + observed_member_name, + observed_member_state, + observer_fetch_time, + ) + if "pg_is_in_recovery" in observed_member_state: + if observed_member_state["pg_is_in_recovery"]: + # we always trust ourselves the most for localhost, and + # in case we are actually connected to the other node + if observer_fetch_time >= own_fetch_time and observed_member_name != self.own_db: + if ( + observed_member_name not in standby_nodes + or standby_nodes[observed_member_name]["connection"] is False + ): + standby_nodes[observed_member_name] = observed_member_state else: - self.log.warning( - "No knowledge on %r %r from observer: %r is in recovery", - instance, - db_state, + master_node = connected_master_nodes.get(observed_member_name, MemberState()) + connected = master_node.get("connection", False) + self.log.debug( + "Observer: %r sees %r as master, we see: %r, same_master: %r, connection: %r", observer_name, + observed_member_name, + self.current_master, + observed_member_name == self.current_master, + observed_member_state.get("connection"), ) + if ( + self.within_dbpoll_time(observer_fetch_time, own_fetch_time) + and observed_member_name != self.own_db + ): + if connected or observed_member_state["connection"]: + connected_master_nodes[observed_member_name] = observed_member_state + else: + disconnected_master_nodes[observed_member_name] = observed_member_state + else: + self.log.warning( + "No knowledge on %r %r from observer: %r is in recovery", + observed_member_name, + observed_member_state, + observer_name, + ) self.connected_master_nodes = connected_master_nodes self.disconnected_master_nodes = disconnected_master_nodes @@ -325,19 +436,22 @@ def create_node_map(self, cluster_state, observer_state): return master_instance, master_node, standby_nodes - def is_restoring_or_catching_up_normally(self, state): + def is_restoring_or_catching_up_normally(self, state: MemberState) -> bool: """ Return True if node is still in the replication catchup phase and replication lag alerts/metrics should not yet be generated. 
""" replication_start_time = state.get("replication_start_time") min_lag = state.get("min_replication_time_lag", self.replication_lag_warning_boundary) + if replication_start_time and time.monotonic() - replication_start_time > self.replication_catchup_timeout: # we've been replicating for too long and should have caught up with the master already return False + if not state.get("pg_last_xlog_receive_location"): # node has not received anything from the master yet return True + if min_lag >= self.replication_lag_warning_boundary: # node is catching up the master and has not gotten close enough yet return True @@ -345,7 +459,7 @@ def is_restoring_or_catching_up_normally(self, state): # node has caught up with the master so we should be in sync return False - def emit_stats(self, state): + def emit_stats(self, state: MemberState) -> None: if self.is_restoring_or_catching_up_normally(state): # do not emit misleading lag stats during catchup at restore return @@ -354,9 +468,10 @@ def emit_stats(self, state): if replication_time_lag is not None: self.stats.gauge("pg.replication_lag", replication_time_lag) - def is_master_observer_new_enough(self, observer_state): + def is_master_observer_new_enough(self, observer_state: dict[str, ObservedState]) -> bool: if not self.replication_lag_over_warning_limit: return True + if not self.current_master or self.current_master not in self.config.get("observers", {}): self.log.warning( "Replication lag is over warning limit, but" @@ -364,33 +479,37 @@ def is_master_observer_new_enough(self, observer_state): self.current_master, ) return True - db_poll_intervals = datetime.timedelta(seconds=5 * self.config.get("db_poll_interval", 5.0)) - now = datetime.datetime.utcnow() + + db_poll_intervals = timedelta(seconds=5 * self.config.get("db_poll_interval", 5.0)) + now = datetime.utcnow() if (now - self.observer_state_newer_than) < db_poll_intervals: self.log.warning( "Replication lag is over warning limit, but" " not waiting for observers to be polled because 5 db_poll_intervals have passed" ) return True + if self.current_master not in observer_state: self.log.warning( "Replication lag is over warning limit, but observer for master (%s) has not been polled yet", self.current_master, ) return False - fetch_time = parse_iso_datetime(observer_state[self.current_master]["fetch_time"]) + + fetch_time = parse_iso_datetime(cast(str, observer_state[self.current_master]["fetch_time"])) if fetch_time < self.observer_state_newer_than: self.log.warning( "Replication lag is over warning limit, but observer's data for master is stale, older than %r", self.observer_state_newer_than, ) return False + return True - def check_cluster_state(self): - master_node = None - cluster_state = copy.deepcopy(self.cluster_state) - observer_state = copy.deepcopy(self.observer_state) + def check_cluster_state(self) -> None: + # master_node = None + cluster_state = deepcopy(self.cluster_state) + observer_state = deepcopy(self.observer_state) configured_node_count = len(self.config.get("remote_conns", {})) if not cluster_state or len(cluster_state) != configured_node_count: self.log.warning( @@ -451,7 +570,12 @@ def check_cluster_state(self): return self.consider_failover(own_state, master_node, standby_nodes) - def consider_failover(self, own_state, master_node, standby_nodes): + def consider_failover( + self, + own_state: MemberState, + master_node: MemberState | None, + standby_nodes: dict[str, MemberState], + ) -> None: if not master_node: # no master node at all in the cluster? 
self.log.warning( @@ -486,10 +610,10 @@ def consider_failover(self, own_state, master_node, standby_nodes): return self.check_replication_lag(own_state, standby_nodes) - def is_replication_lag_over_warning_limit(self): + def is_replication_lag_over_warning_limit(self) -> bool: return self.replication_lag_over_warning_limit - def check_replication_lag(self, own_state, standby_nodes): + def check_replication_lag(self, own_state: MemberState, standby_nodes: dict[str, MemberState]) -> None: if self.is_restoring_or_catching_up_normally(own_state): # do not raise alerts during catchup at restore return @@ -508,7 +632,7 @@ def check_replication_lag(self, own_state, standby_nodes): if not self.replication_lag_over_warning_limit: # we just went over the boundary self.replication_lag_over_warning_limit = True if self.config.get("poll_observers_on_warning_only"): - self.observer_state_newer_than = datetime.datetime.utcnow() + self.observer_state_newer_than = datetime.utcnow() self.create_alert_file("replication_delay_warning") if self.over_warning_limit_command: self.log.warning( @@ -528,7 +652,7 @@ def check_replication_lag(self, own_state, standby_nodes): elif self.replication_lag_over_warning_limit: self.replication_lag_over_warning_limit = False self.delete_alert_file("replication_delay_warning") - self.observer_state_newer_than = datetime.datetime.min + self.observer_state_newer_than = datetime.min if replication_lag >= self.replication_lag_failover_timeout: self.log.warning( @@ -545,14 +669,14 @@ def check_replication_lag(self, own_state, standby_nodes): standby_nodes, ) - def get_replication_positions(self, standby_nodes): + def get_replication_positions(self, standby_nodes: dict[str, MemberState]) -> dict[int, set[str]]: self.log.debug("Getting replication positions from: %r", standby_nodes) - known_replication_positions = {} + known_replication_positions: dict[int, set[str]] = {} for instance, node_state in standby_nodes.items(): - now = datetime.datetime.utcnow() + now = datetime.utcnow() if ( node_state["connection"] - and now - parse_iso_datetime(node_state["fetch_time"]) < datetime.timedelta(seconds=20) + and now - parse_iso_datetime(node_state["fetch_time"]) < timedelta(seconds=20) and instance not in self.never_promote_these_nodes ): # noqa # pylint: disable=line-too-long # use pg_last_xlog_receive_location if it's available, @@ -569,13 +693,14 @@ def get_replication_positions(self, standby_nodes): known_replication_positions.setdefault(wal_pos, set()).add(instance) return known_replication_positions - def _been_in_contact_with_master_within_failover_timeout(self): + def _been_in_contact_with_master_within_failover_timeout(self) -> bool: # no need to do anything here if there are no disconnected masters if self.disconnected_master_nodes: disconnected_master_node = list(self.disconnected_master_nodes.values())[0] db_time = disconnected_master_node.get("db_time", get_iso_timestamp()) or get_iso_timestamp() - time_since_last_contact = datetime.datetime.utcnow() - parse_iso_datetime(db_time) - if time_since_last_contact < datetime.timedelta(seconds=self.replication_lag_failover_timeout): + db_time_as_dt = db_time if isinstance(db_time, datetime) else parse_iso_datetime(db_time) + time_since_last_contact = datetime.utcnow() - db_time_as_dt + if time_since_last_contact < timedelta(seconds=self.replication_lag_failover_timeout): self.log.debug( "We've had contact with master: %r at: %r within the last %.2fs, not failing over", disconnected_master_node, @@ -585,7 +710,7 @@ def 
_been_in_contact_with_master_within_failover_timeout(self): return True return False - def do_failover_decision(self, own_state, standby_nodes): + def do_failover_decision(self, own_state: MemberState, standby_nodes: dict[str, MemberState]) -> None: if self.connected_master_nodes: self.log.warning( "We still have some connected masters: %r, not failing over", @@ -637,7 +762,7 @@ def do_failover_decision(self, own_state, standby_nodes): self.log.warning( "Canceling failover even though we were the node the furthest along, since " "this node has an existing maintenance_mode_file: %r", - self.config.get("maintenance_mode_file", "/tmp/pglookout_maintenance_mode_file"), + self.config.get("maintenance_mode_file", MAINTENANCE_MODE_FILE), ) elif self.own_db in self.never_promote_these_nodes: self.log.warning( @@ -674,18 +799,18 @@ def do_failover_decision(self, own_state, standby_nodes): furthest_along_instance, ) - def modify_recovery_conf_to_point_at_new_master(self, new_master_instance): - with open(os.path.join(self.config.get("pg_data_directory"), "PG_VERSION"), "r") as fp: - pg_version = fp.read().strip() + def modify_recovery_conf_to_point_at_new_master(self, new_master_instance: str) -> bool: + pg_data_directory = Path(self.config.get("pg_data_directory", PG_DATA_DIRECTORY)) + pg_version = (pg_data_directory / "PG_VERSION").read_text().strip() - if parse(pg_version) >= parse("12"): + if parse_version(pg_version) >= parse_version("12"): recovery_conf_filename = "postgresql.auto.conf" else: recovery_conf_filename = "recovery.conf" - path_to_recovery_conf = os.path.join(self.config.get("pg_data_directory"), recovery_conf_filename) - with open(path_to_recovery_conf, "r") as fp: - old_conf = fp.read().splitlines() + path_to_recovery_conf = pg_data_directory / recovery_conf_filename + old_conf = path_to_recovery_conf.read_text().splitlines() + has_recovery_target_timeline = False new_conf = [] old_conn_info = None @@ -707,34 +832,44 @@ def modify_recovery_conf_to_point_at_new_master(self, new_master_instance): master_instance_conn_info = get_connection_info(self.config["remote_conns"][new_master_instance]) assert "host" in master_instance_conn_info new_conn_info["host"] = master_instance_conn_info["host"] + if "port" in master_instance_conn_info: new_conn_info["port"] = master_instance_conn_info["port"] + if new_conn_info == old_conn_info: self.log.debug( "recovery.conf already contains conninfo matching %r, not updating", new_master_instance, ) return False + # Otherwise we append the new primary_conninfo - quoted_connection_string = adapt(create_connection_string(new_conn_info)) + # Mypy: ignore the typing of `adapt`, as we cannot override it ourselves. The provided stubs are incomplete. + # Writing our own stubs would discard all the other type information from `psycopg2.extensions`. 
+ quoted_connection_string: QuotedString = adapt( + create_connection_string(new_conn_info) + ) # type: ignore[no-untyped-call] new_conf.append(f"primary_conninfo = {quoted_connection_string}") + # The timeline of the recovery.conf will require a higher timeline target if not has_recovery_target_timeline: new_conf.append("recovery_target_timeline = 'latest'") + # prepend our tag iso_timestamp = get_iso_timestamp() new_conf.insert( 0, f"# pglookout updated primary_conninfo for instance {new_master_instance} at {iso_timestamp}", ) + # Replace old recovery.conf with a fresh copy - with open(path_to_recovery_conf + "_temp", "w") as fp: - fp.write("\n".join(new_conf) + "\n") + path_to_recovery_conf_new = path_to_recovery_conf.with_name(f"{path_to_recovery_conf.name}._temp") + path_to_recovery_conf_new.write_text("\n".join(new_conf) + "\n") + path_to_recovery_conf_new.rename(path_to_recovery_conf) - os.rename(path_to_recovery_conf + "_temp", path_to_recovery_conf) return True - def start_following_new_master(self, new_master_instance): + def start_following_new_master(self, new_master_instance: str) -> None: start_time = time.monotonic() updated_config = self.modify_recovery_conf_to_point_at_new_master(new_master_instance) if not updated_config: @@ -760,12 +895,12 @@ def start_following_new_master(self, new_master_instance): time.monotonic() - start_time, ) - def execute_external_command(self, command): + def execute_external_command(self, command: list[str] | str) -> int: self.log.warning("Executing external command: %r", command) return_code, output = 0, "" try: - output = subprocess.check_call(command) - except subprocess.CalledProcessError as err: + check_call(command) + except CalledProcessError as err: self.log.exception( "Problem with executing: %r, return_code: %r, output: %r", command, @@ -777,33 +912,37 @@ def execute_external_command(self, command): self.log.warning("Executed external command: %r, output: %r", return_code, output) return return_code - def check_for_maintenance_mode_file(self): - return os.path.exists(self.config.get("maintenance_mode_file", "/tmp/pglookout_maintenance_mode_file")) + def check_for_maintenance_mode_file(self) -> bool: + return Path(self.config.get("maintenance_mode_file", MAINTENANCE_MODE_FILE)).is_file() - def create_alert_file(self, filename): + def create_alert_file(self, filename: str) -> None: + alert_file_dir = Path(self.config.get("alert_file_dir", Path.cwd())) + filepath = alert_file_dir / filename + self.log.debug("Creating alert file: %r", str(filepath)) try: - filepath = os.path.join(self.config.get("alert_file_dir", os.getcwd()), filename) - self.log.debug("Creating alert file: %r", filepath) - with open(filepath, "w") as fp: - fp.write("alert") + filepath.write_text("alert") except Exception as ex: # pylint: disable=broad-except self.log.exception("Problem writing alert file: %r", filename) self.stats.unexpected_exception(ex, where="create_alert_file") - def delete_alert_file(self, filename): - filepath = os.path.join(self.config.get("alert_file_dir", os.getcwd()), filename) + def delete_alert_file(self, filename: str) -> None: + alert_file_dir = Path(self.config.get("alert_file_dir", Path.cwd())) + filepath = alert_file_dir / filename try: - if os.path.exists(filepath): + if filepath.is_file(): self.log.debug("Deleting alert file: %r", filepath) - os.unlink(filepath) + filepath.unlink(missing_ok=True) except Exception as ex: # pylint: disable=broad-except self.log.exception("Problem unlinking: %r", filepath) 
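            # Report through the stats client too, so a failed alert-file cleanup is visible in metrics as well as logs.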
self.stats.unexpected_exception(ex, where="delete_alert_file") - def within_dbpoll_time(self, time1, time2): + def within_dbpoll_time(self, time1: datetime, time2: datetime) -> bool: return abs((time1 - time2).total_seconds()) < self.config.get("db_poll_interval", 5.0) def _check_cluster_monitor_thread_health(self, now: float) -> None: + if self.cluster_monitor is None: + raise RuntimeError("Cluster Monitor is not initialized!") + health_timeout_seconds = self._get_health_timeout_seconds() if health_timeout_seconds: last_successful_run = self.cluster_monitor.last_monitoring_success_time or self._start_time @@ -814,7 +953,7 @@ def _check_cluster_monitor_thread_health(self, now: float) -> None: self.stats.increase("cluster_monitor_health_timeout") self.log.warning("cluster_monitor has not been running for %.1f seconds", seconds_since_last_run) - def _get_health_timeout_seconds(self) -> Optional[float]: + def _get_health_timeout_seconds(self) -> float | None: if "cluster_monitor_health_timeout_seconds" in self.config: config_value = self.config.get("cluster_monitor_health_timeout_seconds") return config_value if config_value is None else float(config_value) @@ -824,7 +963,7 @@ def _get_health_timeout_seconds(self) -> Optional[float]: def _get_check_interval(self) -> float: return float(self.config.get("replication_state_check_interval", 5.0)) - def main_loop(self): + def main_loop(self) -> None: while self.running: try: self.check_cluster_state() @@ -832,11 +971,13 @@ def main_loop(self): except Exception as ex: # pylint: disable=broad-except self.log.exception("Failed to check cluster state") self.stats.unexpected_exception(ex, where="main_loop_check_cluster_state") + try: self.write_cluster_state_to_json_file() except Exception as ex: # pylint: disable=broad-except self.log.exception("Failed to write cluster state") self.stats.unexpected_exception(ex, where="main_loop_writer_cluster_state") + try: self.failover_decision_queue.get(timeout=self._get_check_interval()) q = self.failover_decision_queue @@ -849,18 +990,18 @@ def main_loop(self): except Empty: pass - def run(self): + def run(self) -> None: + if self.cluster_monitor is None: + raise RuntimeError("Cluster Monitor is not initialized!") + self._start_time = time.monotonic() self.cluster_monitor.start() self.webserver.start() self.main_loop() -def main(args=None): - if args is None: - args = sys.argv[1:] - - parser = argparse.ArgumentParser( +def get_argument_parser() -> ArgumentParser: + parser = ArgumentParser( prog="pglookout", description="postgresql replication monitoring and failover daemon", ) @@ -868,19 +1009,31 @@ def main(args=None): "--version", action="version", help="show program version", - version=version.__version__, + version=__version__, ) - parser.add_argument("config", help="configuration file") + # it's a type of filepath + parser.add_argument("config", type=Path, help="configuration file") + + return parser + + +def main(args: list[str] | None = None) -> int: + if args is None: + args = sys.argv[1:] + + parser = get_argument_parser() arg = parser.parse_args(args) - if not os.path.exists(arg.config): + if not arg.config.is_file(): print(f"pglookout: {arg.config!r} doesn't exist") return 1 - logutil.configure_logging() + configure_logging() pglookout = PgLookout(arg.config) - return pglookout.run() + pglookout.run() + + return 0 if __name__ == "__main__": diff --git a/pyproject.toml b/pyproject.toml index e01c7e7..21912c6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,6 @@ files = [ exclude = [ # 
Implementation.
     'pglookout/__main__.py',
-    'pglookout/pglookout.py',
     'pglookout/version.py',
     # Tests.
     'test/conftest.py',
diff --git a/test/test_cluster_monitor.py b/test/test_cluster_monitor.py
index 710ccee..b841be2 100644
--- a/test/test_cluster_monitor.py
+++ b/test/test_cluster_monitor.py
@@ -10,7 +10,7 @@
 from packaging import version
 from pglookout import statsd
 from pglookout.cluster_monitor import ClusterMonitor
-from pglookout.common_types import MemberState, ObserverState
+from pglookout.common_types import MemberState, ObservedState
 from pglookout.config import Config
 from psycopg2.extras import RealDictCursor
 from queue import Queue
@@ -56,7 +56,7 @@ def test_main_loop(db: TestPG) -> None:
         "poll_observers_on_warning_only": True,
     }
     cluster_state: dict[str, MemberState] = {}
-    observer_state: dict[str, ObserverState] = {}
+    observer_state: dict[str, ObservedState] = {}

     def create_alert_file(arg: str) -> NoReturn:
         raise Exception(arg)
@@ -118,7 +118,7 @@ def test_fetch_replication_slot_info(db: TestPG) -> None:
         "poll_observers_on_warning_only": True,
     }
     cluster_state: dict[str, MemberState] = {}
-    observer_state: dict[str, ObserverState] = {}
+    observer_state: dict[str, ObservedState] = {}

     def create_alert_file(arg: str) -> NoReturn:
         raise Exception(arg)

From 6ce993ed4d63e1eedd8a87d33dd37fd4562ebd05 Mon Sep 17 00:00:00 2001
From: Samuel Giffard
Date: Fri, 24 Mar 2023 19:02:21 +0100
Subject: [PATCH 11/23] mypy: Refactor `check_cluster_state` [BF-1560]

This refactor simplifies the branching in part of this method. It helps
`mypy` and makes the different preconditions clearer.
---
 pglookout/pglookout.py | 111 ++++++++++++++++++++++++++++++-----------
 1 file changed, 83 insertions(+), 28 deletions(-)

diff --git a/pglookout/pglookout.py b/pglookout/pglookout.py
index abda47a..a6cb009 100755
--- a/pglookout/pglookout.py
+++ b/pglookout/pglookout.py
@@ -537,38 +537,93 @@ def check_cluster_state(self) -> None:
         if self.own_db and self.own_db != master_instance and self.config.get("autofollow"):
             self.start_following_new_master(master_instance)

+        self._make_failover_decision(observer_state, standby_nodes, master_node)
+
+    def _make_failover_decision(
+        self,
+        observer_state: dict[str, ObservedState],
+        standby_nodes: dict[str, MemberState],
+        master_node: MemberState | None,
+    ) -> None:
+        assert self.own_db is not None
         own_state = self.cluster_state.get(self.own_db)
-        observer_info = ",".join(observer_state) or "no"
-        if self.own_db:  # Emit stats if we're a non-observer node
-            self.emit_stats(own_state)
-        else:  # We're an observer ourselves, grab the IP address from HTTP server address
-            observer_info = self.config.get("http_address", observer_info)
+        consider_failover = False

-        standby_info = ",".join(standby_nodes) or "no"
-        self.log.debug(
-            "Cluster has %s standbys, %s observers and %s as master, own_db: %r, own_state: %r",
-            standby_info,
-            observer_info,
-            self.current_master,
-            self.own_db,
-            own_state or "observer",
-        )
+        if not self.own_db:
+            self._log_state_when_own_db_is_not_in_cluster(
+                observer_state=observer_state,
+                standby_nodes=standby_nodes,
+            )
+        else:
+            assert own_state is not None
+            consider_failover = self._is_failover_consideration_needed(
+                own_state=own_state,
+                observer_state=observer_state,
+                standby_nodes=standby_nodes,
+                master_node=master_node,
+            )

-        if self.own_db:
-            if self.own_db == self.current_master:
-                # We are the master of this cluster, nothing to do
-                self.log.debug(
-                    "We %r: %r are still the master node: %r of this
cluster, nothing to do.", - self.own_db, - own_state, - master_node, - ) - return - if not standby_nodes: - self.log.warning("No standby nodes set, master node: %r", master_node) - return - self.consider_failover(own_state, master_node, standby_nodes) + if consider_failover and own_state: + self.consider_failover( + own_state=own_state, + master_node=master_node, + standby_nodes=standby_nodes, + ) + + def _log_state_when_own_db_is_not_in_cluster( + self, + observer_state: dict[str, ObservedState], + standby_nodes: dict[str, MemberState], + ) -> None: + if self.log.isEnabledFor(DEBUG): + observer_info = self.config.get("http_address", ",".join(observer_state.keys()) or "no") + standby_info = ",".join(standby_nodes) or "no" + self.log.debug( + "Cluster has %s standbys, %s observers and %s as master, own_db: %r, own_state: %r", + standby_info, + observer_info, + self.current_master, + self.own_db, + "observer", + ) + + def _is_failover_consideration_needed( + self, + own_state: MemberState, + observer_state: dict[str, ObservedState], + standby_nodes: dict[str, MemberState], + master_node: MemberState | None, + ) -> bool: + self.emit_stats(own_state) # Emit stats if we're a non-observer node + + if self.log.isEnabledFor(DEBUG): + observer_info = ",".join(observer_state) or "no" + standby_info = ",".join(standby_nodes) or "no" + self.log.debug( + "Cluster has %s standbys, %s observers and %s as master, own_db: %r, own_state: %r", + standby_info, + observer_info, + self.current_master, + self.own_db, + own_state, + ) + + if self.own_db == self.current_master: + # We are the master of this cluster, nothing to do + self.log.debug( + "We %r: %r are still the master node: %r of this cluster, nothing to do.", + self.own_db, + own_state, + master_node, + ) + return False + + if not standby_nodes: + self.log.warning("No standby nodes set, master node: %r", master_node) + return False + + return True def consider_failover( self, From 0d438e9040253dea10dbfb75d67c622773de1ba1 Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Mon, 27 Mar 2023 12:19:21 +0200 Subject: [PATCH 12/23] mypy: `test/test_pglookout.py` [BF-1560] Rename `test_lookout.py` -> `test_pglookout.py` to follow conventions. --- pglookout/config.py | 2 +- pyproject.toml | 1 - test/{test_lookout.py => test_pglookout.py} | 402 ++++++++++---------- 3 files changed, 210 insertions(+), 195 deletions(-) rename test/{test_lookout.py => test_pglookout.py} (73%) diff --git a/pglookout/config.py b/pglookout/config.py index 761263d..9575841 100644 --- a/pglookout/config.py +++ b/pglookout/config.py @@ -13,7 +13,7 @@ class Statsd(TypedDict, total=False): class Config(TypedDict, total=False): alert_file_dir: str autofollow: bool - cluster_monitor_health_timeout_seconds: float + cluster_monitor_health_timeout_seconds: float | None db_poll_interval: float failover_command: str failover_sleep_time: float diff --git a/pyproject.toml b/pyproject.toml index 21912c6..48a7c04 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,6 @@ exclude = [ 'pglookout/version.py', # Tests. 'test/conftest.py', - 'test/test_lookout.py', # Other. 'setup.py', 'version.py', diff --git a/test/test_lookout.py b/test/test_pglookout.py similarity index 73% rename from test/test_lookout.py rename to test/test_pglookout.py index 3b0a902..9aa3421 100644 --- a/test/test_lookout.py +++ b/test/test_pglookout.py @@ -7,53 +7,58 @@ This file is under the Apache License, Version 2.0. See the file `LICENSE` for details. 
""" +from __future__ import annotations + +from datetime import datetime, timedelta +from pathlib import Path from pglookout.common import get_iso_timestamp +from pglookout.common_types import MemberState, ObservedState +from pglookout.pglookout import PgLookout from pglookout.pgutil import get_connection_info, get_connection_info_from_config_line -from typing import Optional, Union -from unittest.mock import patch +from psycopg2._psycopg import connection as pg_connection # pylint: disable=no-name-in-module +from typing import cast +from unittest.mock import Mock, patch -import datetime import json import os import pytest import time -def test_connect_to_cluster_nodes_and_cleanup_old_nodes(pgl): +def test_connect_to_cluster_nodes_and_cleanup_old_nodes(pgl: PgLookout) -> None: + assert pgl.cluster_monitor is not None pgl.cluster_monitor.db_conns = { - "1.2.3.4": "bar", - "2.3.4.5": "foo", - "3.4.5.6": "foo", - None: "foo", + "1.2.3.4": Mock(spec_set=pg_connection), + "2.3.4.5": Mock(spec_set=pg_connection), + "3.4.5.6": Mock(spec_set=pg_connection), } pgl.cluster_monitor.connect_to_cluster_nodes_and_cleanup_old_nodes() assert pgl.cluster_monitor.db_conns == {} -def test_state_file_write(pgl, tmpdir): - state_file_path = tmpdir.join("state_file").strpath - pgl.config["json_state_file_path"] = state_file_path +def test_state_file_write(pgl: PgLookout, tmp_path: Path) -> None: + state_file_path = tmp_path / "state_file" + pgl.config["json_state_file_path"] = str(state_file_path) pgl.write_cluster_state_to_json_file() - assert os.path.exists(state_file_path) - with open(state_file_path, "r") as fp: - state = json.load(fp) + assert state_file_path.is_file() + state = json.loads(state_file_path.read_text()) assert isinstance(state, dict) -def test_load_config(pgl): +def test_load_config(pgl: PgLookout) -> None: pgl.own_db = "old_value" pgl.load_config() assert pgl.own_db == "1.2.3.4" def _create_db_node_state( - pg_last_xlog_receive_location=None, - pg_is_in_recovery=True, - connection=True, - replication_time_lag=None, - fetch_time=None, - db_time=None, -): + pg_last_xlog_receive_location: str | None = None, + pg_is_in_recovery: bool = True, + connection: bool = True, + replication_time_lag: float | None = None, + fetch_time: datetime | None = None, + db_time: datetime | None = None, +) -> MemberState: return { "connection": connection, "db_time": get_iso_timestamp(db_time), @@ -68,16 +73,16 @@ def _create_db_node_state( def _add_to_observer_state( - pgl, - observer_name, - db_name, - pg_last_xlog_receive_location=None, - pg_is_in_recovery=True, - connection=True, - replication_time_lag=None, - fetch_time=None, - db_time=None, -): + pgl: PgLookout, + observer_name: str, + db_name: str, + pg_last_xlog_receive_location: str | None = None, + pg_is_in_recovery: bool = True, + connection: bool = True, + replication_time_lag: float | None = None, + fetch_time: datetime | None = None, + db_time: datetime | None = None, +) -> None: db_node_state = _create_db_node_state( pg_last_xlog_receive_location, pg_is_in_recovery, @@ -86,28 +91,30 @@ def _add_to_observer_state( fetch_time=fetch_time, db_time=db_time, ) - update_dict = { - "fetch_time": get_iso_timestamp(), - "connection": True, - db_name: db_node_state, - } - if observer_name in pgl.observer_state: + update_dict = ObservedState( + { + "fetch_time": get_iso_timestamp(), + "connection": True, + db_name: db_node_state, + } + ) + if observer_name in pgl.observer_state.keys(): pgl.observer_state[observer_name].update(update_dict) else: 
pgl.observer_state[observer_name] = update_dict def _add_db_to_cluster_state( - pgl, - instance, - pg_last_xlog_receive_location=None, - pg_is_in_recovery=True, - connection=True, - replication_time_lag=None, - fetch_time=None, - db_time=None, - conn_info=None, -): + pgl: PgLookout, + instance: str, + pg_last_xlog_receive_location: str | None = None, + pg_is_in_recovery: bool = True, + connection: bool = True, + replication_time_lag: float | None = None, + fetch_time: datetime | None = None, + db_time: datetime | None = None, + conn_info: str | None = None, +) -> None: db_node_state = _create_db_node_state( pg_last_xlog_receive_location, pg_is_in_recovery, @@ -120,7 +127,7 @@ def _add_db_to_cluster_state( pgl.config["remote_conns"][instance] = conn_info or {"host": instance} -def test_check_cluster_state_warning(pgl): +def test_check_cluster_state_warning(pgl: PgLookout) -> None: _add_db_to_cluster_state( pgl, "kuu", @@ -134,16 +141,16 @@ def test_check_cluster_state_warning(pgl): pgl.current_master = "old_master" pgl.own_db = "kuu" pgl.over_warning_limit_command = "fake_command" - pgl.execute_external_command.return_value = 0 + cast(Mock, pgl.execute_external_command).return_value = 0 pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 1 - assert pgl.create_alert_file.call_count == 1 + assert cast(Mock, pgl.execute_external_command).call_count == 1 + assert cast(Mock, pgl.create_alert_file).call_count == 1 pgl.check_cluster_state() # call count does not change when we have sent a single warning - assert pgl.execute_external_command.call_count == 1 + assert cast(Mock, pgl.execute_external_command).call_count == 1 assert pgl.replication_lag_over_warning_limit - assert pgl.create_alert_file.call_count == 1 + assert cast(Mock, pgl.create_alert_file).call_count == 1 # and then the replication catches up _add_db_to_cluster_state( @@ -159,13 +166,13 @@ def test_check_cluster_state_warning(pgl): assert pgl.replication_lag_over_warning_limit is False -def test_check_cluster_do_failover_one_standby(pgl): +def test_check_cluster_do_failover_one_standby(pgl: PgLookout) -> None: _add_db_to_cluster_state( pgl, "old_master", pg_is_in_recovery=False, connection=False, - db_time=datetime.datetime(year=2014, month=1, day=1), + db_time=datetime(year=2014, month=1, day=1), ) _add_db_to_cluster_state( @@ -178,23 +185,23 @@ def test_check_cluster_do_failover_one_standby(pgl): ) pgl.own_db = "own_db" - pgl.execute_external_command.return_value = 0 + cast(Mock, pgl.execute_external_command).return_value = 0 pgl.replication_lag_over_warning_limit = False pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 0 + assert cast(Mock, pgl.execute_external_command).call_count == 0 assert pgl.replication_lag_over_warning_limit is True pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 1 + assert cast(Mock, pgl.execute_external_command).call_count == 1 assert pgl.replication_lag_over_warning_limit is False -def test_check_cluster_master_gone_one_standby_one_observer(pgl): +def test_check_cluster_master_gone_one_standby_one_observer(pgl: PgLookout) -> None: _add_db_to_cluster_state( pgl, "old_master", pg_is_in_recovery=False, connection=False, - db_time=datetime.datetime(year=2014, month=1, day=1), + db_time=datetime(year=2014, month=1, day=1), ) _add_db_to_cluster_state( @@ -212,7 +219,7 @@ def test_check_cluster_master_gone_one_standby_one_observer(pgl): "old_master", pg_is_in_recovery=False, connection=False, - 
db_time=datetime.datetime(year=2014, month=1, day=1), + db_time=datetime(year=2014, month=1, day=1), ) _add_to_observer_state( pgl, @@ -226,7 +233,7 @@ def test_check_cluster_master_gone_one_standby_one_observer(pgl): # Simulate existing master connection pgl.current_master = "old_master" - pgl.execute_external_command.return_value = 0 + cast(Mock, pgl.execute_external_command).return_value = 0 pgl.replication_lag_over_warning_limit = False del pgl.config["remote_conns"]["old_master"] @@ -237,23 +244,23 @@ def test_check_cluster_master_gone_one_standby_one_observer(pgl): # First call does not promote due to missing master because config has been updated just recently and there's # by default a grace period that's waited after list of known cluster nodes changes pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 0 + assert cast(Mock, pgl.execute_external_command).call_count == 0 assert pgl.replication_lag_over_warning_limit is False # If we say that master is known to be gone promotion will happen even though config was updated recently pgl.known_gone_nodes.append("old_master") pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 1 + assert cast(Mock, pgl.execute_external_command).call_count == 1 assert pgl.replication_lag_over_warning_limit is False -def test_check_cluster_do_failover_one_standby_one_observer(pgl): +def test_check_cluster_do_failover_one_standby_one_observer(pgl: PgLookout) -> None: _add_db_to_cluster_state( pgl, "old_master", pg_is_in_recovery=False, connection=False, - db_time=datetime.datetime(year=2014, month=1, day=1), + db_time=datetime(year=2014, month=1, day=1), ) _add_db_to_cluster_state( @@ -271,7 +278,7 @@ def test_check_cluster_do_failover_one_standby_one_observer(pgl): "old_master", pg_is_in_recovery=False, connection=False, - db_time=datetime.datetime(year=2014, month=1, day=1), + db_time=datetime(year=2014, month=1, day=1), ) _add_to_observer_state( pgl, @@ -283,23 +290,23 @@ def test_check_cluster_do_failover_one_standby_one_observer(pgl): replication_time_lag=130.0, ) - pgl.execute_external_command.return_value = 0 + cast(Mock, pgl.execute_external_command).return_value = 0 pgl.replication_lag_over_warning_limit = False pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 0 + assert cast(Mock, pgl.execute_external_command).call_count == 0 assert pgl.replication_lag_over_warning_limit is True pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 1 + assert cast(Mock, pgl.execute_external_command).call_count == 1 assert pgl.replication_lag_over_warning_limit is False -def test_check_cluster_do_failover_with_a_node_which_is_is_maintenance(pgl): +def test_check_cluster_do_failover_with_a_node_which_is_is_maintenance(pgl: PgLookout) -> None: _add_db_to_cluster_state( pgl, "old_master", pg_is_in_recovery=False, connection=False, - db_time=datetime.datetime(year=2014, month=1, day=1), + db_time=datetime(year=2014, month=1, day=1), ) _add_db_to_cluster_state( @@ -313,16 +320,16 @@ def test_check_cluster_do_failover_with_a_node_which_is_is_maintenance(pgl): pgl.never_promote_these_nodes = [] pgl.own_db = "kuu" - pgl.execute_external_command.return_value = 0 + cast(Mock, pgl.execute_external_command).return_value = 0 pgl.replication_lag_over_warning_limit = True - pgl.check_for_maintenance_mode_file.return_value = True + cast(Mock, pgl.check_for_maintenance_mode_file).return_value = True pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 0 
+ assert cast(Mock, pgl.execute_external_command).call_count == 0 assert pgl.replication_lag_over_warning_limit is True - assert pgl.check_for_maintenance_mode_file.call_count == 1 + assert cast(Mock, pgl.check_for_maintenance_mode_file).call_count == 1 -def test_check_cluster_do_failover_with_a_node_which_should_never_be_promoted(pgl): +def test_check_cluster_do_failover_with_a_node_which_should_never_be_promoted(pgl: PgLookout) -> None: _add_db_to_cluster_state(pgl, "old_master", pg_is_in_recovery=False, connection=False) _add_db_to_cluster_state( @@ -335,14 +342,14 @@ def test_check_cluster_do_failover_with_a_node_which_should_never_be_promoted(pg ) pgl.never_promote_these_nodes = ["kuu"] pgl.own_db = "kuu" - pgl.execute_external_command.return_value = 0 + cast(Mock, pgl.execute_external_command).return_value = 0 pgl.replication_lag_over_warning_limit = True pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 0 + assert cast(Mock, pgl.execute_external_command).call_count == 0 assert pgl.replication_lag_over_warning_limit is True -def test_check_cluster_do_failover_two_standbys(pgl): +def test_check_cluster_do_failover_two_standbys(pgl: PgLookout) -> None: _add_db_to_cluster_state(pgl, "old_master", pg_is_in_recovery=False, connection=False) _add_db_to_cluster_state( @@ -364,22 +371,22 @@ def test_check_cluster_do_failover_two_standbys(pgl): replication_time_lag=130.0, ) - pgl.execute_external_command.return_value = 0 + cast(Mock, pgl.execute_external_command).return_value = 0 pgl.replication_lag_over_warning_limit = True pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 0 + assert cast(Mock, pgl.execute_external_command).call_count == 0 assert pgl.replication_lag_over_warning_limit is True # we keep the warning on def test_check_cluster_do_failover_two_standbys_when_the_one_ahead_can_never_be_promoted( - pgl, -): + pgl: PgLookout, +) -> None: _add_db_to_cluster_state( pgl, "old_master", pg_is_in_recovery=False, connection=False, - db_time=datetime.datetime(year=2014, month=1, day=1), + db_time=datetime(year=2014, month=1, day=1), ) _add_db_to_cluster_state( @@ -401,14 +408,14 @@ def test_check_cluster_do_failover_two_standbys_when_the_one_ahead_can_never_be_ replication_time_lag=130.0, ) pgl.never_promote_these_nodes = ["puu"] - pgl.execute_external_command.return_value = 0 + cast(Mock, pgl.execute_external_command).return_value = 0 pgl.replication_lag_over_warning_limit = True pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 1 + assert cast(Mock, pgl.execute_external_command).call_count == 1 assert pgl.replication_lag_over_warning_limit is False -def test_failover_with_no_master_anymore(pgl): +def test_failover_with_no_master_anymore(pgl: PgLookout) -> None: # this should trigger an immediate failover as we have two # standbys online but we've never seen a master pgl.own_db = "kuu" @@ -429,12 +436,12 @@ def test_failover_with_no_master_anymore(pgl): replication_time_lag=1, ) - pgl.execute_external_command.return_value = 0 + cast(Mock, pgl.execute_external_command).return_value = 0 pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 1 + assert cast(Mock, pgl.execute_external_command).call_count == 1 -def test_failover_over_replication_lag_when_still_connected_to_master(pgl): +def test_failover_over_replication_lag_when_still_connected_to_master(pgl: PgLookout) -> None: _add_db_to_cluster_state(pgl, "old_master", pg_is_in_recovery=False, connection=False) # We will make our own 
node to be the furthest along so we get considered for promotion @@ -449,13 +456,13 @@ def test_failover_over_replication_lag_when_still_connected_to_master(pgl): pgl.own_db = "kuu" pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 0 + assert cast(Mock, pgl.execute_external_command).call_count == 0 assert pgl.replication_lag_over_warning_limit is True # we keep the warning on def test_failover_over_replication_lag_with_one_observer_one_standby_no_connections( - pgl, -): + pgl: PgLookout, +) -> None: _add_db_to_cluster_state(pgl, "old_master", pg_is_in_recovery=False, connection=False) # We will make our own node to be the furthest along so we get considered for promotion @@ -475,7 +482,7 @@ def test_failover_over_replication_lag_with_one_observer_one_standby_no_connecti "old_master", pg_is_in_recovery=False, connection=False, - db_time=datetime.datetime(year=2014, month=1, day=1), + db_time=datetime(year=2014, month=1, day=1), ) _add_to_observer_state( pgl, @@ -488,11 +495,11 @@ def test_failover_over_replication_lag_with_one_observer_one_standby_no_connecti ) pgl.observer_state["observer"]["connection"] = False pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 0 + assert cast(Mock, pgl.execute_external_command).call_count == 0 assert pgl.replication_lag_over_warning_limit is True # we keep the warning on -def test_cluster_state_when_observer_has_also_non_members_of_our_current_cluster(pgl): +def test_cluster_state_when_observer_has_also_non_members_of_our_current_cluster(pgl: PgLookout) -> None: _add_db_to_cluster_state(pgl, "old_master", pg_is_in_recovery=False, connection=True) # We will make our own node to be the furthest along so we get considered for promotion @@ -512,7 +519,7 @@ def test_cluster_state_when_observer_has_also_non_members_of_our_current_cluster "old_master", pg_is_in_recovery=False, connection=False, - db_time=datetime.datetime(year=2014, month=1, day=1), + db_time=datetime(year=2014, month=1, day=1), ) _add_to_observer_state( pgl, @@ -537,7 +544,7 @@ def test_cluster_state_when_observer_has_also_non_members_of_our_current_cluster assert "old_master" in pgl.connected_master_nodes -def test_failover_no_connections(pgl): +def test_failover_no_connections(pgl: PgLookout) -> None: _add_db_to_cluster_state(pgl, "old_master", pg_is_in_recovery=False, connection=False) # We will make our own node to be the furthest along so we get considered for promotion @@ -561,17 +568,17 @@ def test_failover_no_connections(pgl): replication_time_lag=130.0, ) pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 0 + assert cast(Mock, pgl.execute_external_command).call_count == 0 assert pgl.replication_lag_over_warning_limit is True # we keep the warning on -def test_failover_master_two_standbys_one_observer_no_connection_between_standbys(pgl): +def test_failover_master_two_standbys_one_observer_no_connection_between_standbys(pgl: PgLookout) -> None: _add_db_to_cluster_state( pgl, "old_master", pg_is_in_recovery=False, connection=False, - db_time=datetime.datetime(year=2014, month=1, day=1), + db_time=datetime(year=2014, month=1, day=1), ) # We will make our own node to be the furthest along so we get considered for promotion _add_db_to_cluster_state( @@ -600,7 +607,7 @@ def test_failover_master_two_standbys_one_observer_no_connection_between_standby "old_master", pg_is_in_recovery=False, connection=False, - db_time=datetime.datetime(year=2014, month=1, day=1), + db_time=datetime(year=2014, month=1, day=1), ) 
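+    # the stale 2014 db_time above marks the old master as long dead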
_add_to_observer_state( pgl, @@ -620,17 +627,17 @@ def test_failover_master_two_standbys_one_observer_no_connection_between_standby connection=True, replication_time_lag=130.0, ) - pgl.execute_external_command.return_value = 0 + cast(Mock, pgl.execute_external_command).return_value = 0 pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 0 + assert cast(Mock, pgl.execute_external_command).call_count == 0 assert pgl.replication_lag_over_warning_limit is True pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 1 + assert cast(Mock, pgl.execute_external_command).call_count == 1 assert pgl.replication_lag_over_warning_limit is False -def test_failover_master_one_standby_one_observer_no_connections(pgl): +def test_failover_master_one_standby_one_observer_no_connections(pgl: PgLookout) -> None: pgl.own_db = "own" # Add observer state @@ -649,7 +656,7 @@ def test_failover_master_one_standby_one_observer_no_connections(pgl): pgl.check_cluster_state() assert pgl.replication_lag_over_warning_limit is True # we keep the warning on - assert pgl.execute_external_command.call_count == 0 + assert cast(Mock, pgl.execute_external_command).call_count == 0 # Add observer state _add_to_observer_state(pgl, "observer", "old_master", pg_is_in_recovery=False, connection=True) @@ -675,7 +682,7 @@ def test_failover_master_one_standby_one_observer_no_connections(pgl): pgl.check_cluster_state() # No failover yet - assert pgl.execute_external_command.call_count == 0 + assert cast(Mock, pgl.execute_external_command).call_count == 0 assert pgl.replication_lag_over_warning_limit is True # we keep the warning on # observer state @@ -685,7 +692,7 @@ def test_failover_master_one_standby_one_observer_no_connections(pgl): "old_master", pg_is_in_recovery=False, connection=False, - db_time=datetime.datetime(year=2014, month=1, day=1), + db_time=datetime(year=2014, month=1, day=1), ) _add_to_observer_state( pgl, @@ -702,14 +709,14 @@ def test_failover_master_one_standby_one_observer_no_connections(pgl): "old_master", pg_is_in_recovery=False, connection=False, - db_time=datetime.datetime(year=2014, month=1, day=1), + db_time=datetime(year=2014, month=1, day=1), ) # now do failover pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 1 + assert cast(Mock, pgl.execute_external_command).call_count == 1 -def test_find_current_master(pgl): +def test_find_current_master(pgl: PgLookout) -> None: _add_db_to_cluster_state(pgl, "master", pg_is_in_recovery=False, connection=True) # We will make our own node to be the furthest along so we get considered for promotion _add_db_to_cluster_state( @@ -725,13 +732,13 @@ def test_find_current_master(pgl): assert pgl.current_master == "master" -def test_two_standby_failover_and_autofollow(pgl, tmpdir): +def test_two_standby_failover_and_autofollow(pgl: PgLookout, tmp_path: Path) -> None: _add_db_to_cluster_state( pgl, "old_master", pg_is_in_recovery=False, connection=False, - fetch_time=datetime.datetime(year=2014, month=1, day=1), + fetch_time=datetime(year=2014, month=1, day=1), ) # We will make our own node to be the furthest from master so we don't get considered for promotion _add_db_to_cluster_state( @@ -754,7 +761,7 @@ def test_two_standby_failover_and_autofollow(pgl, tmpdir): pgl.check_cluster_state() assert pgl.replication_lag_over_warning_limit is True # we keep the warning on - assert pgl.execute_external_command.call_count == 0 + assert cast(Mock, pgl.execute_external_command).call_count == 0 assert 
pgl.current_master == "old_master" _add_db_to_cluster_state( @@ -767,29 +774,26 @@ def test_two_standby_failover_and_autofollow(pgl, tmpdir): conn_info={"host": "otherhost.example.com", "port": 11111}, ) - pg_data_dir = tmpdir.join("test_pgdata").strpath - os.makedirs(pg_data_dir) + pg_data_dir = tmp_path / "test_pgdata" + pg_data_dir.mkdir(parents=True, exist_ok=True) primary_conninfo = ( "user=replication password=vjsh8l7sv4a902y1tsdz " "host=old_master port=5432 " "sslmode=prefer sslcompression=1 krbsrvname=postgres" ) old_recovery_conf = f"standby_mode = 'on'\nprimary_conninfo = '{primary_conninfo}'\n" - with open(os.path.join(pg_data_dir, "recovery.conf"), "w") as fp: - fp.write(old_recovery_conf) + (pg_data_dir / "recovery.conf").write_text(old_recovery_conf) - pgl.config["pg_data_directory"] = pg_data_dir + pgl.config["pg_data_directory"] = str(pg_data_dir) pgl.config["autofollow"] = True pgl.primary_conninfo_template = get_connection_info(primary_conninfo) - with open(os.path.join(pg_data_dir, "PG_VERSION"), "w") as fp: - fp.write("11\n") + (pg_data_dir / "PG_VERSION").write_text("11\n") pgl.check_cluster_state() assert pgl.current_master == "other" - with open(os.path.join(pg_data_dir, "recovery.conf"), "r") as fp: - new_lines = fp.read().splitlines() + new_lines = (pg_data_dir / "recovery.conf").read_text().splitlines() assert new_lines.pop(0).startswith("# pglookout updated primary_conninfo") assert new_lines.pop(0) == "standby_mode = 'on'" assert new_lines[0].startswith("primary_conninfo = ") @@ -801,8 +805,8 @@ def test_two_standby_failover_and_autofollow(pgl, tmpdir): assert new_conn_info == dict(old_conn_info, host="otherhost.example.com", port="11111") -def test_replication_positions(pgl): - standby_nodes = { +def test_replication_positions(pgl: PgLookout) -> None: + standby_nodes: dict[str, MemberState] = { "10.255.255.10": { "connection": True, "db_time": "2014-08-28T14:09:57.919301+00:00Z", @@ -814,24 +818,31 @@ def test_replication_positions(pgl): "replication_time_lag": 254.341944, }, } + # the above node shouldn't show up as its fetch_time is (way) older than 20 seconds - positions = {} + positions: dict[int, set[str]] = {} assert pgl.get_replication_positions(standby_nodes) == positions standby_nodes["10.255.255.10"]["fetch_time"] = get_iso_timestamp() - positions[0x9000090] = set(["10.255.255.10"]) + positions[0x9000090] = {"10.255.255.10"} assert pgl.get_replication_positions(standby_nodes) == positions + # add another standby, further ahead - standby_nodes["10.255.255.11"] = dict(standby_nodes["10.255.255.10"], pg_last_xlog_receive_location="1/0000AAAA") - positions[1 << 32 | 0xAAAA] = set(["10.255.255.11"]) + standby_nodes["10.255.255.11"] = cast( + MemberState, dict(standby_nodes["10.255.255.10"], pg_last_xlog_receive_location="1/0000AAAA") + ) + positions[1 << 32 | 0xAAAA] = {"10.255.255.11"} assert pgl.get_replication_positions(standby_nodes) == positions + # add another standby which hasn't received anything - standby_nodes["10.255.255.12"] = dict(standby_nodes["10.255.255.10"], pg_last_xlog_receive_location=None) + standby_nodes["10.255.255.12"] = cast( + MemberState, dict(standby_nodes["10.255.255.10"], pg_last_xlog_receive_location=None) + ) positions[0x9000090].add("10.255.255.12") assert pgl.get_replication_positions(standby_nodes) == positions -def test_node_map(pgl): - cluster_state = { +def test_node_map(pgl: PgLookout) -> None: + cluster_state: dict[str, MemberState] = { "10.255.255.10": { "connection": True, "db_time": 
"2014-08-28T14:26:51.067084+00:00Z", @@ -848,38 +859,40 @@ def test_node_map(pgl): }, } observer_state = { - "10.255.255.11": { - "10.255.255.10": { + "10.255.255.11": ObservedState( + { + "10.255.255.10": { + "connection": True, + "db_time": "2014-08-28T14:26:47.105901+00:00Z", + "fetch_time": "2014-08-28T14:26:47.104849Z", + "pg_is_in_recovery": False, + "pg_last_xact_replay_timestamp": "2014-08-28T14:05:43.577357+00:00Z", + "pg_last_xlog_receive_location": "0/9000090", + "pg_last_xlog_replay_location": "0/9000090", + "replication_time_lag": 1263.528544, + }, + "10.255.255.9": { + "connection": False, + "db_time": "2014-08-28T14:06:15.172820+00:00Z", + "fetch_time": "2014-08-28T14:26:47.107115Z", + "pg_is_in_recovery": False, + "pg_last_xact_replay_timestamp": None, + "pg_last_xlog_receive_location": None, + "pg_last_xlog_replay_location": None, + }, "connection": True, - "db_time": "2014-08-28T14:26:47.105901+00:00Z", - "fetch_time": "2014-08-28T14:26:47.104849Z", - "pg_is_in_recovery": False, - "pg_last_xact_replay_timestamp": "2014-08-28T14:05:43.577357+00:00Z", - "pg_last_xlog_receive_location": "0/9000090", - "pg_last_xlog_replay_location": "0/9000090", - "replication_time_lag": 1263.528544, - }, - "10.255.255.9": { - "connection": False, - "db_time": "2014-08-28T14:06:15.172820+00:00Z", - "fetch_time": "2014-08-28T14:26:47.107115Z", - "pg_is_in_recovery": False, - "pg_last_xact_replay_timestamp": None, - "pg_last_xlog_receive_location": None, - "pg_last_xlog_replay_location": None, - }, - "connection": True, - "fetch_time": "2014-08-28T14:26:51.069891Z", - } + "fetch_time": "2014-08-28T14:26:51.069891Z", + } + ) } master_host, _, standby_nodes = pgl.create_node_map(cluster_state, observer_state) assert master_host == "10.255.255.10" assert standby_nodes == {} -def test_node_map_disconnected_current_master(pgl): +def test_node_map_disconnected_current_master(pgl: PgLookout) -> None: pgl.current_master = "10.255.255.7" - cluster_state = { + cluster_state: dict[str, MemberState] = { "10.255.255.7": { "connection": False, "db_time": "2014-09-07T15:26:23.957151+00:00Z", @@ -900,14 +913,14 @@ def test_node_map_disconnected_current_master(pgl): "replication_time_lag": 43.586525, }, } - observer_state = {} + observer_state: dict[str, ObservedState] = {} master_host, _, standby_nodes = pgl.create_node_map(cluster_state, observer_state) assert master_host == "10.255.255.7" assert list(standby_nodes.keys())[0] == "10.255.255.8" -def test_standbys_failover_equal_replication_positions(pgl): - now = datetime.datetime.utcnow() +def test_standbys_failover_equal_replication_positions(pgl: PgLookout) -> None: + now = datetime.utcnow() _add_db_to_cluster_state( pgl, instance="192.168.54.183", @@ -926,8 +939,8 @@ def test_standbys_failover_equal_replication_positions(pgl): pg_is_in_recovery=False, connection=False, replication_time_lag=0.0, - fetch_time=now - datetime.timedelta(seconds=3600), - db_time=now - datetime.timedelta(seconds=3600), + fetch_time=now - timedelta(seconds=3600), + db_time=now - timedelta(seconds=3600), conn_info="foobar", ) _add_db_to_cluster_state( @@ -947,15 +960,15 @@ def test_standbys_failover_equal_replication_positions(pgl): # highest standby currently. 
pgl.own_db = "192.168.54.183" pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 0 + assert cast(Mock, pgl.execute_external_command).call_count == 0 # If we're the highest we should see call_count increment pgl.own_db = "192.168.63.4" pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 1 + assert cast(Mock, pgl.execute_external_command).call_count == 1 -def test_node_map_when_only_observer_sees_master(pgl): - cluster_state = { +def test_node_map_when_only_observer_sees_master(pgl: PgLookout) -> None: + cluster_state: dict[str, MemberState] = { "10.255.255.10": { "connection": False, "db_time": "2014-08-28T14:26:51.067084+00:00Z", @@ -967,21 +980,23 @@ def test_node_map_when_only_observer_sees_master(pgl): "replication_time_lag": 1267.489727, }, } - observer_state = { - "10.255.255.11": { - "10.255.255.10": { + observer_state: dict[str, ObservedState] = { + "10.255.255.11": ObservedState( + { + "10.255.255.10": { + "connection": True, + "db_time": "2014-08-28T14:26:47.105901+00:00Z", + "fetch_time": "2014-08-28T14:26:50.104849Z", + "pg_is_in_recovery": False, + "pg_last_xact_replay_timestamp": "2014-08-28T14:05:43.577357+00:00Z", + "pg_last_xlog_receive_location": "0/9000090", + "pg_last_xlog_replay_location": "0/9000090", + "replication_time_lag": 1263.528544, + }, "connection": True, - "db_time": "2014-08-28T14:26:47.105901+00:00Z", - "fetch_time": "2014-08-28T14:26:50.104849Z", - "pg_is_in_recovery": False, - "pg_last_xact_replay_timestamp": "2014-08-28T14:05:43.577357+00:00Z", - "pg_last_xlog_receive_location": "0/9000090", - "pg_last_xlog_replay_location": "0/9000090", - "replication_time_lag": 1263.528544, - }, - "connection": True, - "fetch_time": "2014-08-28T14:26:51.069891Z", - } + "fetch_time": "2014-08-28T14:26:51.069891Z", + } + ) } master_instance, _, _ = pgl.create_node_map(cluster_state, observer_state) assert master_instance == "10.255.255.10" @@ -989,7 +1004,7 @@ def test_node_map_when_only_observer_sees_master(pgl): assert master_instance in pgl.connected_master_nodes -def test_poll_observers_on_warning_only(pgl): +def test_poll_observers_on_warning_only(pgl: PgLookout) -> None: pgl.config["poll_observers_on_warning_only"] = True pgl.config["observers"] = {"local": "URL"} pgl.own_db = "kuu" @@ -998,7 +1013,7 @@ def test_poll_observers_on_warning_only(pgl): "master", pg_is_in_recovery=False, connection=True, - db_time=datetime.datetime.min, + db_time=datetime.min, ) _add_db_to_cluster_state( pgl, @@ -1010,7 +1025,7 @@ def test_poll_observers_on_warning_only(pgl): pgl.check_cluster_state() assert "master" not in pgl.disconnected_master_nodes assert "master" in pgl.connected_master_nodes - assert pgl.execute_external_command.call_count == 0 + assert cast(Mock, pgl.execute_external_command).call_count == 0 assert pgl.replication_lag_over_warning_limit assert pgl.observer_state_newer_than is not None @@ -1019,7 +1034,7 @@ def test_poll_observers_on_warning_only(pgl): "master", pg_is_in_recovery=False, connection=False, - db_time=datetime.datetime.min, + db_time=datetime.min, ) _add_db_to_cluster_state( pgl, @@ -1034,17 +1049,17 @@ def test_poll_observers_on_warning_only(pgl): "master", pg_is_in_recovery=False, connection=False, - db_time=datetime.datetime.min, + db_time=datetime.min, ) pgl.check_cluster_state() # this check makes sure we did not skip doing checks because db_poll_inteval has # pass but because observer data is available and create_node_map was called assert "master" in pgl.disconnected_master_nodes - assert 
pgl.execute_external_command.call_count == 1 + assert cast(Mock, pgl.execute_external_command).call_count == 1 @pytest.mark.parametrize( - "start_time_offset, last_run_offset, timeout_configured, signaled", + ("start_time_offset", "last_run_offset", "timeout_configured", "signaled"), [ # timeout disabled, should never alert (None, None, "null", False), @@ -1080,12 +1095,13 @@ def test_poll_observers_on_warning_only(pgl): ], ) def test_check_cluster_monitor_health( - pgl, - start_time_offset: Optional[float], - last_run_offset: Optional[float], - timeout_configured: Optional[Union[float, str]], + pgl: PgLookout, + start_time_offset: float | None, + last_run_offset: float | None, + timeout_configured: float | str | None, signaled: bool, ) -> None: + assert pgl.cluster_monitor is not None now = time.monotonic() if timeout_configured is not None: if timeout_configured == "null": From 1f471551d9e001239387cacb1ee6b06f3fce7f67 Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Mon, 27 Mar 2023 13:58:12 +0200 Subject: [PATCH 13/23] mypy: New method to normalize config [BF-1560] Change `remote_conns` type hint. The temporary `str` type is no longer accepted. When loading the config, a conversion is done to only have the dict-style type in the config's `remote_conns`. --- pglookout/cluster_monitor.py | 9 +++++---- pglookout/config.py | 3 ++- pglookout/pglookout.py | 22 +++++++++++++++++++--- pglookout/pgutil.py | 12 +++++++----- test/test_pglookout.py | 9 +++------ 5 files changed, 36 insertions(+), 19 deletions(-) diff --git a/pglookout/cluster_monitor.py b/pglookout/cluster_monitor.py index 29eab48..93ea4bf 100644 --- a/pglookout/cluster_monitor.py +++ b/pglookout/cluster_monitor.py @@ -17,7 +17,7 @@ from pglookout.common import get_iso_timestamp, parse_iso_datetime from pglookout.common_types import MemberState, ObservedState, ReplicationSlotAsDict from pglookout.config import Config -from pglookout.pgutil import mask_connection_info +from pglookout.pgutil import ConnectionInfo, create_connection_string, mask_connection_info from pglookout.statsd import StatsClient from psycopg2.extensions import POLL_OK, POLL_READ, POLL_WRITE from psycopg2.extras import RealDictCursor, RealDictRow @@ -115,7 +115,7 @@ def __init__( self.last_monitoring_success_time: float | None = None self.log.debug("Initialized ClusterMonitor with: %r", cluster_state) - def _connect_to_db(self, instance: str, dsn: str | None) -> psycopg2.connection | None: + def _connect_to_db(self, instance: str, dsn: ConnectionInfo | None) -> psycopg2.connection | None: conn = self.db_conns.get(instance) if conn: @@ -130,7 +130,8 @@ def _connect_to_db(self, instance: str, dsn: str | None) -> psycopg2.connection try: self.log.info("Connecting to %s", inst_info_str) - conn = psycopg2.connect(dsn=dsn, async_=True) + dsn_string = create_connection_string(dsn) + conn = psycopg2.connect(dsn=dsn_string, async_=True) wait_select(conn) self.log.debug("Connected to %s", inst_info_str) except (PglookoutTimeout, psycopg2.OperationalError) as ex: @@ -259,7 +260,7 @@ def _query_cluster_member_state(self, instance: str, db_conn: psycopg2.connectio result: MemberState = {"fetch_time": get_iso_timestamp(), "connection": False} if not db_conn: - dsn: str | None = self.config["remote_conns"].get(instance) + dsn: ConnectionInfo | None = self.config["remote_conns"].get(instance) db_conn = self._connect_to_db(instance, dsn) if not db_conn: return result diff --git a/pglookout/config.py b/pglookout/config.py index 9575841..5860dee 100644 --- a/pglookout/config.py 
+++ b/pglookout/config.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2023 Aiven, Helsinki, Finland. https://aiven.io/
 from __future__ import annotations
 
+from pglookout.pgutil import ConnectionInfo
 from typing import Literal, TypedDict
 
 
@@ -34,7 +35,7 @@ class Config(TypedDict, total=False):
     pg_stop_command: str
     poll_observers_on_warning_only: bool
     primary_conninfo_template: str
-    remote_conns: dict[str, str]  # instance name -> dsn
+    remote_conns: dict[str, ConnectionInfo]
     replication_catchup_timeout: float
     replication_state_check_interval: float
     statsd: Statsd
diff --git a/pglookout/pglookout.py b/pglookout/pglookout.py
index a6cb009..2ccf2d7 100755
--- a/pglookout/pglookout.py
+++ b/pglookout/pglookout.py
@@ -45,7 +45,7 @@
 from socket import gethostname
 from subprocess import CalledProcessError, check_call
 from types import FrameType
-from typing import cast, Final, Optional
+from typing import Any, cast, Final, Optional
 
 import json
 import sys
@@ -169,13 +169,29 @@ def load_config_from_signal(self, _signal: int, _frame: FrameType | None = None)
         )
         self.load_config()
 
+    @staticmethod
+    def normalize_config(config: dict[str, Any]) -> Config:
+        """Normalize the config to a known format.
+
+        Warning:
+            This method does not give any guarantees about the validity of the config.
+
+        Note:
+            A better approach would be to use a schema to validate the config, or
+            to use something like ``pydantic`` or ``dataclasses`` to define the config.
+        """
+        if "remote_conns" in config:
+            config["remote_conns"] = {name: get_connection_info(info) for name, info in config["remote_conns"].items()}
+
+        return cast(Config, config)
+
     def load_config(self) -> None:
         self.log.debug("Loading JSON config from: %r", self.config_path)
 
         previous_remote_conns = self.config.get("remote_conns")
         try:
-            with self.config_path.open() as fp:
-                self.config = json.load(fp)
+            config = json.loads(self.config_path.read_text())
+            self.config = self.normalize_config(config)
         except Exception as ex:  # pylint: disable=broad-except
             self.log.exception("Invalid JSON config, exiting")
             self.stats.unexpected_exception(ex, where="load_config")
diff --git a/pglookout/pgutil.py b/pglookout/pgutil.py
index 7d5f692..799fcca 100644
--- a/pglookout/pgutil.py
+++ b/pglookout/pgutil.py
@@ -7,7 +7,7 @@
 """
 from __future__ import annotations
 
-from typing import cast, Literal, TypedDict
+from typing import cast, Literal, TypedDict, Union
 from urllib.parse import parse_qs, urlparse  # pylint: disable=no-name-in-module, import-error
 
 import psycopg2.extensions
@@ -74,11 +74,15 @@ class ConnectionParameterKeywords(TypedDict, total=False):
     target_session_attrs: Literal["any", "read-write", "read-only", "primary", "standby", "prefer-standby"]
 
 
+#: Type alias for allowed connection info types.
+ConnectionInfo = Union[DsnDict, DsnDictDeprecated, ConnectionParameterKeywords] + + def create_connection_string(connection_info: DsnDict | DsnDictDeprecated | ConnectionParameterKeywords) -> str: return str(psycopg2.extensions.make_dsn(**connection_info)) -def mask_connection_info(info: str) -> str: +def mask_connection_info(info: str | ConnectionInfo) -> str: masked_info = get_connection_info(info) password = masked_info.pop("password", None) connection_string = create_connection_string(masked_info) @@ -92,9 +96,7 @@ def get_connection_info_from_config_line(line: str) -> ConnectionParameterKeywor return get_connection_info(value) -def get_connection_info( - info: str | DsnDict | DsnDictDeprecated | ConnectionParameterKeywords, -) -> ConnectionParameterKeywords: +def get_connection_info(info: str | ConnectionInfo) -> ConnectionParameterKeywords: """Get a normalized connection info dict from a connection string or a dict. Supports both the traditional libpq format and the new url format. diff --git a/test/test_pglookout.py b/test/test_pglookout.py index 9aa3421..4286011 100644 --- a/test/test_pglookout.py +++ b/test/test_pglookout.py @@ -14,7 +14,7 @@ from pglookout.common import get_iso_timestamp from pglookout.common_types import MemberState, ObservedState from pglookout.pglookout import PgLookout -from pglookout.pgutil import get_connection_info, get_connection_info_from_config_line +from pglookout.pgutil import ConnectionInfo, DsnDict, get_connection_info, get_connection_info_from_config_line from psycopg2._psycopg import connection as pg_connection # pylint: disable=no-name-in-module from typing import cast from unittest.mock import Mock, patch @@ -113,7 +113,7 @@ def _add_db_to_cluster_state( replication_time_lag: float | None = None, fetch_time: datetime | None = None, db_time: datetime | None = None, - conn_info: str | None = None, + conn_info: ConnectionInfo | None = None, ) -> None: db_node_state = _create_db_node_state( pg_last_xlog_receive_location, @@ -124,7 +124,7 @@ def _add_db_to_cluster_state( db_time=db_time, ) pgl.cluster_state[instance] = db_node_state - pgl.config["remote_conns"][instance] = conn_info or {"host": instance} + pgl.config["remote_conns"][instance] = get_connection_info(conn_info or cast(DsnDict, {"host": instance})) def test_check_cluster_state_warning(pgl: PgLookout) -> None: @@ -930,7 +930,6 @@ def test_standbys_failover_equal_replication_positions(pgl: PgLookout) -> None: replication_time_lag=400.435871, fetch_time=now, db_time=now, - conn_info="foobar", ) _add_db_to_cluster_state( pgl, @@ -941,7 +940,6 @@ def test_standbys_failover_equal_replication_positions(pgl: PgLookout) -> None: replication_time_lag=0.0, fetch_time=now - timedelta(seconds=3600), db_time=now - timedelta(seconds=3600), - conn_info="foobar", ) _add_db_to_cluster_state( pgl, @@ -952,7 +950,6 @@ def test_standbys_failover_equal_replication_positions(pgl: PgLookout) -> None: replication_time_lag=401.104655, fetch_time=now, db_time=now, - conn_info="foobar", ) pgl.current_master = "192.168.57.180" From 5d4ac6e8a331a3031a2bdf5e1ccc152322ecc57c Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Mon, 27 Mar 2023 15:51:13 +0200 Subject: [PATCH 14/23] mypy: `version.py` [BF-1560] Extraction of the 3 read functions: from file, git, and Makefile. 
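
Roughly, the new helpers compose like this (a simplified sketch, not code
from this diff; the real `get_project_version` also persists the result via
`save_version` and raises when no source yields a version):

    version = (
        get_git_version(ROOT_DIR)
        or get_makefile_version(ROOT_DIR / "Makefile")
        or get_existing_version(ROOT_DIR / "pglookout/version.py")
    )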
--- Makefile | 2 +- pyproject.toml | 5 +-- version.py | 101 ++++++++++++++++++++++++++++++++++--------------- 3 files changed, 72 insertions(+), 36 deletions(-) diff --git a/Makefile b/Makefile index e1df801..a958cd6 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ long_ver = $(shell git describe --long 2>/dev/null || echo $(short_ver)-0-unknow generated = pglookout/version.py PYTHON ?= python3 -PYTHON_SOURCE_DIRS = pglookout/ test/ stubs/ +PYTHON_SOURCE_DIRS = pglookout/ test/ stubs/ version.py all: $(generated) : 'try "make rpm" or "make deb" or "make test"' diff --git a/pyproject.toml b/pyproject.toml index 48a7c04..94dd101 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,16 +27,13 @@ strict = true files = [ 'pglookout/', 'test/', + 'version.py', ] exclude = [ - # Implementation. - 'pglookout/__main__.py', - 'pglookout/version.py', # Tests. 'test/conftest.py', # Other. 'setup.py', - 'version.py', ] [[tool.mypy.overrides]] diff --git a/version.py b/version.py index a3e88b6..1f12706 100644 --- a/version.py +++ b/version.py @@ -5,56 +5,95 @@ Copyright (c) 2015 Ohmu Ltd See LICENSE for details """ +from __future__ import annotations -import imp -import os +from datetime import datetime +from pathlib import Path +from textwrap import dedent +from typing import Final + +import imp # pylint: disable=deprecated-module import subprocess +ROOT_DIR: Final[Path] = Path(__file__).parent + -def save_version(new_ver, old_ver, version_file): +def save_version(new_ver: str, old_ver: str | None, version_file: Path) -> bool: + """Write new version to the version file if it has changed.""" if not new_ver: return False - version_file = os.path.join(os.path.dirname(__file__), version_file) + if not old_ver or new_ver != old_ver: - with open(version_file, "w") as fp: - fp.write("__version__ = '{}'\n".format(new_ver)) + version_file.write_text( + dedent( + f"""\ + # Copyright (c) {datetime.now():%Y} Aiven, Helsinki, Finland. https://aiven.io/ + __version__ = '{new_ver}' + """ + ) + ) + return True -def get_project_version(version_file): - version_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), version_file) +def get_existing_version(version_file: Path) -> str | None: + """Read version from the version file""" try: - module = imp.load_source("verfile", version_file) - file_ver = module.__version__ - except IOError: - file_ver = None + module = imp.load_source("verfile", str(version_file)) + return str(module.__version__) + except (IOError, AttributeError): + return None - os.chdir(os.path.dirname(__file__) or ".") + +def get_makefile_version(makefile_path: Path) -> str | None: + """Read version from ``Makefile``.""" + if makefile_path.is_file(): + lines = makefile_path.read_text().splitlines() + try: + it_short_ver = (line.split("=", 1)[1].strip() for line in lines if line.startswith("short_ver")) + return next(it_short_ver) + except StopIteration: + pass + return None + + +def get_git_version(repo_dir: Path) -> str | None: + """Read version from git.""" try: - git_out = subprocess.check_output(["git", "describe", "--always"], - stderr=getattr(subprocess, "DEVNULL", None)) + git_out = subprocess.check_output( + ["git", "describe", "--always"], cwd=repo_dir, stderr=getattr(subprocess, "DEVNULL", None) + ) except (OSError, subprocess.CalledProcessError): - pass - else: - git_ver = git_out.splitlines()[0].strip().decode("utf-8") - if "." 
not in git_ver: - git_ver = "0.0.1-0-unknown-{}".format(git_ver) - if save_version(git_ver, file_ver, version_file): - return git_ver - - makefile = os.path.join(os.path.dirname(__file__), "Makefile") - if os.path.exists(makefile): - with open(makefile, "r") as fp: - lines = fp.readlines() - short_ver = [line.split("=", 1)[1].strip() for line in lines if line.startswith("short_ver")][0] - if save_version(short_ver, file_ver, version_file): - return short_ver + return None + + git_ver = git_out.splitlines()[0].strip().decode("utf-8") + if "." not in git_ver: + git_ver = f"0.0.1-0-unknown-{git_ver}" + + return git_ver + + +def get_project_version(version_file_name: str) -> str: + """Read version from git or from the version file""" + version_file = ROOT_DIR / version_file_name + file_ver = get_existing_version(version_file) + + git_ver = get_git_version(ROOT_DIR) + if git_ver and save_version(git_ver, file_ver, version_file): + return git_ver + + makefile = ROOT_DIR / "Makefile" + short_ver = get_makefile_version(makefile) + if short_ver and save_version(short_ver, file_ver, version_file): + return short_ver if not file_ver: - raise Exception("version not available from git or from file {!r}".format(version_file)) + raise Exception(f"version not available from git or from file {version_file!r}") return file_ver + if __name__ == "__main__": import sys + get_project_version(sys.argv[1]) From 51bb85e34a7e4bfa5324435f001992f4428c1626 Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Mon, 27 Mar 2023 16:07:01 +0200 Subject: [PATCH 15/23] mypy: `setup.py` [BF-1560] --- Makefile | 2 +- pyproject.toml | 3 +-- requirements.dev.txt | 1 + setup.py | 12 ++++++------ 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index a958cd6..955ea1e 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ long_ver = $(shell git describe --long 2>/dev/null || echo $(short_ver)-0-unknow generated = pglookout/version.py PYTHON ?= python3 -PYTHON_SOURCE_DIRS = pglookout/ test/ stubs/ version.py +PYTHON_SOURCE_DIRS = pglookout/ test/ stubs/ version.py setup.py all: $(generated) : 'try "make rpm" or "make deb" or "make test"' diff --git a/pyproject.toml b/pyproject.toml index 94dd101..4a92069 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,12 +28,11 @@ files = [ 'pglookout/', 'test/', 'version.py', + 'setup.py', ] exclude = [ # Tests. 'test/conftest.py', - # Other. - 'setup.py', ] [[tool.mypy.overrides]] diff --git a/requirements.dev.txt b/requirements.dev.txt index 1dd3f89..2635d85 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -5,5 +5,6 @@ isort==5.10.1 mypy==1.1.1 types-psycopg2==2.9.21.8 types-requests==2.26.1 +types-setuptools==67.6.0.5 pylint==2.15.4 pytest diff --git a/setup.py b/setup.py index b068297..ee3789a 100644 --- a/setup.py +++ b/setup.py @@ -1,12 +1,12 @@ -from setuptools import setup, find_packages -import os +# Copyright (c) 2023 Aiven, Helsinki, Finland. 
https://aiven.io/ +from pathlib import Path +from setuptools import find_packages, setup + import sys import version - -readme_path = os.path.join(os.path.dirname(__file__), "README.rst") -with open(readme_path, "r") as fp: - readme_text = fp.read() +readme_path = Path(__file__).parent / "README.rst" +readme_text = readme_path.read_text() version_for_setup_py = version.get_project_version("pglookout/version.py") From e9834d4092e3411602bba91a3b6ee219fe8e4088 Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Mon, 27 Mar 2023 16:08:10 +0200 Subject: [PATCH 16/23] mypy: Remove python2 dependency [BF-1560] Package https://pypi.org/project/futures/ is not needed in Python3. --- setup.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/setup.py b/setup.py index ee3789a..a3564ac 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,6 @@ from pathlib import Path from setuptools import find_packages, setup -import sys import version readme_path = Path(__file__).parent / "README.rst" @@ -18,9 +17,6 @@ "requests >= 1.2.0", ] -if sys.version_info[0] == 2: - requires.append("futures") - setup( name="pglookout", From a3ad6f850a0ef8386d574d432d667dbac43dda27 Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Mon, 27 Mar 2023 16:54:19 +0200 Subject: [PATCH 17/23] mypy: `conftest.py` [BF-1560] Also includes some cleanup to better leverage pytest features. --- pyproject.toml | 4 -- test/conftest.py | 125 ++++++++++++++++++----------------- test/test_cluster_monitor.py | 11 +-- 3 files changed, 72 insertions(+), 68 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4a92069..10f5aa9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,10 +30,6 @@ files = [ 'version.py', 'setup.py', ] -exclude = [ - # Tests. - 'test/conftest.py', -] [[tool.mypy.overrides]] module = "test.test_cluster_monitor" diff --git a/test/conftest.py b/test/conftest.py index 869ad7e..a27b015 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -4,32 +4,37 @@ Copyright (c) 2016 Ohmu Ltd See LICENSE for details """ -from pglookout import logutil, pgutil +# pylint: disable=protected-access +from __future__ import annotations + +from pathlib import Path +from pglookout import logutil from pglookout.pglookout import PgLookout -from py import path as py_path # pylint: disable=no-name-in-module -from unittest.mock import Mock +from pglookout.pgutil import DsnDict +from textwrap import dedent +from typing import cast, Final, Generator +from unittest.mock import Mock, patch import os import pytest import signal import subprocess -import tempfile import time -PG_VERSIONS = ["15", "14", "13", "12", "11", "10"] +PG_VERSIONS: Final[list[str]] = ["15", "14", "13", "12", "11", "10"] logutil.configure_logging() @pytest.fixture -def pgl(): +def pgl() -> Generator[PgLookout, None, None]: pgl_ = PgLookout("pglookout.json") + assert pgl_.cluster_monitor is not None pgl_.config["remote_conns"] = {} - pgl_.check_for_maintenance_mode_file = Mock() - pgl_.check_for_maintenance_mode_file.return_value = False - pgl_.cluster_monitor._connect_to_db = Mock() # pylint: disable=protected-access - pgl_.create_alert_file = Mock() - pgl_.execute_external_command = Mock() + pgl_.check_for_maintenance_mode_file = Mock(return_value=False) # type: ignore[method-assign] + pgl_.cluster_monitor._connect_to_db = Mock() # type: ignore[method-assign] + pgl_.create_alert_file = Mock() # type: ignore[method-assign] + pgl_.execute_external_command = Mock() # type: ignore[method-assign] try: yield pgl_ finally: @@ -37,52 +42,51 @@ def pgl(): class TestPG: - def 
__init__(self, pgdata): - self.pgbin = self.find_pgbin() - self.pgdata = pgdata - self.pg = None + def __init__(self, pgdata: Path) -> None: + self.pgbin: Path = self.find_pgbin() + self.pgdata: Path = pgdata + self.pg: subprocess.Popen[bytes] | None = None @staticmethod - def find_pgbin(versions=None): + def find_pgbin(versions: list[str] | None = None) -> Path: pathformats = ["/usr/pgsql-{ver}/bin", "/usr/lib/postgresql/{ver}/bin"] for ver in versions or PG_VERSIONS: for pathfmt in pathformats: - pgbin = pathfmt.format(ver=ver) - if os.path.exists(pgbin): + pgbin = Path(pathfmt.format(ver=ver)) + if pgbin.is_dir(): return pgbin - return "/usr/bin" + return Path("/usr/bin") @property - def pgver(self): - with open(os.path.join(self.pgdata, "PG_VERSION"), "r") as fp: - return fp.read().strip() + def pgver(self) -> str: + return (self.pgdata / "PG_VERSION").read_text().strip() - def connection_string(self, user="testuser", dbname="postgres"): - return pgutil.create_connection_string( + def connection_dict(self, user: str = "testuser", dbname: str = "postgres") -> DsnDict: + return cast( + DsnDict, { "dbname": dbname, "host": self.pgdata, "port": 5432, "user": user, - } + }, ) - def createuser(self, user="testuser"): - self.run_cmd("createuser", "-h", self.pgdata, "-p", "5432", "-s", user) + def createuser(self, user: str = "testuser") -> None: + self.run_cmd("createuser", "-h", str(self.pgdata), "-p", "5432", "-s", user) - def run_cmd(self, cmd, *args): - argv = [os.path.join(self.pgbin, cmd)] - argv.extend(args) + def run_cmd(self, cmd: str, *args: str) -> None: + argv = [str(self.pgbin / cmd), *args] subprocess.check_call(argv) - def run_pg(self): + def run_pg(self) -> None: self.pg = subprocess.Popen( # pylint: disable=bad-option-value,consider-using-with [ - os.path.join(self.pgbin, "postgres"), + str(self.pgbin / "postgres"), "-D", - self.pgdata, + str(self.pgdata), "-k", - self.pgdata, + str(self.pgdata), "-p", "5432", "-c", @@ -91,7 +95,7 @@ def run_pg(self): ) time.sleep(1.0) # let pg start - def kill(self, force=True, immediate=True): + def kill(self, force: bool = True, immediate: bool = True) -> None: if self.pg is None: return if force: @@ -107,24 +111,25 @@ def kill(self, force=True, immediate=True): raise Exception(f"PG pid {self.pg.pid} not dead") -# NOTE: cannot use 'tmpdir' fixture here, it only works in 'function' scope @pytest.fixture(scope="session") -def db(): - tmpdir_obj = py_path.local(tempfile.mkdtemp(prefix="pglookout_dbtest_")) - tmpdir = str(tmpdir_obj) +def db(tmp_path_factory: pytest.TempPathFactory) -> Generator[TestPG, None, None]: + tmpdir = tmp_path_factory.mktemp(basename="pglookout_dbtest_") # try to find the binaries for these versions in some path - pgdata = os.path.join(tmpdir, "pgdata") + pgdata = tmpdir / "pgdata" db = TestPG(pgdata) # pylint: disable=redefined-outer-name - db.run_cmd("initdb", "-D", pgdata, "--encoding", "utf-8") - # NOTE: point $HOME to tmpdir - $HOME shouldn't affect most tests, but - # psql triest to find .pgpass file from there as do our functions that - # manipulate pgpass. By pointing $HOME there we make sure we're not - # making persistent changes to the environment. 
- os.environ["HOME"] = tmpdir + db.run_cmd("initdb", "-D", str(pgdata), "--encoding", "utf-8") + # allow replication connections - with open(os.path.join(pgdata, "pg_hba.conf"), "w") as fp: - fp.write("local all all trust\nlocal replication all trust\n") - with open(os.path.join(pgdata, "postgresql.conf"), "a") as fp: + (pgdata / "pg_hba.conf").write_text( + dedent( + """\ + local all all trust + local replication all trust + """ + ) + ) + + with (pgdata / "postgresql.conf").open("a") as fp: fp.write( "max_wal_senders = 2\n" "wal_level = logical\n" @@ -136,14 +141,16 @@ def db(): ) if db.pgver < "13": fp.write("wal_keep_segments = 100\n") - db.run_pg() - try: - db.createuser() - db.createuser("otheruser") - yield db - finally: - db.kill() + + # NOTE: point $HOME to tmpdir - $HOME shouldn't affect most tests, but + # psql triest to find .pgpass file from there as do our functions that + # manipulate pgpass. By pointing $HOME there we make sure we're not + # making persistent changes to the environment. + with patch.dict(os.environ, {"HOME": str(tmpdir)}): + db.run_pg() try: - tmpdir_obj.remove(rec=1) - except: # pylint: disable=bare-except - pass + db.createuser() + db.createuser("otheruser") + yield db + finally: + db.kill() diff --git a/test/test_cluster_monitor.py b/test/test_cluster_monitor.py index b841be2..845d862 100644 --- a/test/test_cluster_monitor.py +++ b/test/test_cluster_monitor.py @@ -12,6 +12,7 @@ from pglookout.cluster_monitor import ClusterMonitor from pglookout.common_types import MemberState, ObservedState from pglookout.config import Config +from pglookout.pgutil import create_connection_string from psycopg2.extras import RealDictCursor from queue import Queue from typing import NoReturn @@ -49,8 +50,8 @@ def test_replication_lag() -> None: def test_main_loop(db: TestPG) -> None: config: Config = { "remote_conns": { - "test1db": db.connection_string("testuser"), - "test2db": db.connection_string("otheruser"), + "test1db": db.connection_dict("testuser"), + "test2db": db.connection_dict("otheruser"), }, "observers": {"local": "URL"}, "poll_observers_on_warning_only": True, @@ -111,8 +112,8 @@ def test_fetch_replication_slot_info(db: TestPG) -> None: config: Config = { "remote_conns": { - "test1db": db.connection_string("testuser"), - "test2db": db.connection_string("otheruser"), + "test1db": db.connection_dict("testuser"), + "test2db": db.connection_dict("otheruser"), }, "observers": {"local": "URL"}, "poll_observers_on_warning_only": True, @@ -138,7 +139,7 @@ def create_alert_file(arg: str) -> NoReturn: ) cm.main_monitoring_loop(requested_check=True) - with closing(psycopg2.connect(db.connection_string(), connect_timeout=15)) as conn: + with closing(psycopg2.connect(create_connection_string(db.connection_dict()), connect_timeout=15)) as conn: with conn.cursor(cursor_factory=RealDictCursor) as cursor: cursor.execute("SELECT pg_catalog.pg_create_logical_replication_slot('testslot1', 'test_decoding')") From 7434873c76aa912d76cf2a3fe76974036af20e34 Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Mon, 27 Mar 2023 16:56:49 +0200 Subject: [PATCH 18/23] Bump `Makefile` version to `2.1.0` [BF-1560] --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 955ea1e..700617c 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -short_ver = 2.0.3 +short_ver = 2.1.0 long_ver = $(shell git describe --long 2>/dev/null || echo $(short_ver)-0-unknown-g`git describe --always`) generated = pglookout/version.py From 
30bcd0d4eca27f21a44f94063897ba57283be4c9 Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Mon, 27 Mar 2023 17:15:03 +0200 Subject: [PATCH 19/23] mypy: Compatible with Python 3.9 logging [BF-1560] --- pglookout/pglookout.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pglookout/pglookout.py b/pglookout/pglookout.py index 2ccf2d7..aa9828f 100755 --- a/pglookout/pglookout.py +++ b/pglookout/pglookout.py @@ -12,7 +12,7 @@ from argparse import ArgumentParser from copy import deepcopy from datetime import datetime, timedelta -from logging import DEBUG, getLevelNamesMapping, getLogger, Logger +from logging import _nameToLevel, DEBUG, getLogger, Logger from logging.handlers import SysLogHandler from packaging.version import parse as parse_version from pathlib import Path @@ -52,7 +52,7 @@ import time DEFAULT_LOG_LEVEL: Final[str] = "DEBUG" -LOG_LEVEL_NAMES_MAPPING: Final[dict[str, int]] = getLevelNamesMapping() +LOG_LEVEL_NAMES_MAPPING: Final[dict[str, int]] = deepcopy(_nameToLevel) class PgLookout: From e4e771ae39928b023b352660fd83e95f8b0a8eaf Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Mon, 27 Mar 2023 18:12:33 +0200 Subject: [PATCH 20/23] pylint: Enable more messages [BF-1560] --- pglookout/cluster_monitor.py | 5 ++--- pglookout/current_master.py | 19 +++++++++---------- pglookout/logutil.py | 2 +- pglookout/pglookout.py | 30 ++++++++++++++---------------- pglookout/pgutil.py | 8 +++++--- pyproject.toml | 32 ++++++++------------------------ test/conftest.py | 2 +- 7 files changed, 40 insertions(+), 58 deletions(-) diff --git a/pglookout/cluster_monitor.py b/pglookout/cluster_monitor.py index 93ea4bf..70b4348 100644 --- a/pglookout/cluster_monitor.py +++ b/pglookout/cluster_monitor.py @@ -178,9 +178,9 @@ def _fetch_observer_state(self, instance: str, uri: str) -> ObservedState | None instance, time_diff, response.json(), - ) # pylint: disable=no-member + ) return None - result.update(response.json()) # pylint: disable=no-member + result.update(response.json()) except requests.ConnectionError as ex: self.log.warning( "%s (%s) fetching state from observer: %r, %r", @@ -347,7 +347,6 @@ def _get_statement_query_updating_transaction(server_version: int) -> str: return "SELECT txid_current(), pg_current_wal_lsn() AS pg_last_xlog_replay_location" return "SELECT txid_current(), pg_current_xlog_location() AS pg_last_xlog_replay_location" - # FIXME: Find a tighter input + return type @staticmethod def _parse_status_query_result(result: MemberState) -> MemberState: if not result: diff --git a/pglookout/current_master.py b/pglookout/current_master.py index f6dfb54..b9e068c 100644 --- a/pglookout/current_master.py +++ b/pglookout/current_master.py @@ -11,11 +11,11 @@ from __future__ import annotations from . 
+from pathlib import Path
 from pglookout.default import JSON_STATE_FILE_PATH
 
 import argparse
 import json
-import os
 import sys
 import time
 
@@ -34,22 +34,21 @@ def main(args: list[str] | None = None) -> int:
         help="show program version",
         version=version.__version__,
     )
-    parser.add_argument("state", help="pglookout state file")
+    parser.add_argument("state", type=Path, help="pglookout state file")
     arg = parser.parse_args(args)
 
-    if not os.path.exists(arg.state):
-        print(f"pglookout_current_master: {arg.state!r} doesn't exist")
+    state_file: Path = arg.state
+    if not state_file.is_file():
+        print(f"pglookout_current_master: {arg.state!s} doesn't exist")
         return 1
 
     try:
-        with open(arg.state, "r") as fp:
-            config = json.load(fp)
-        state_file_path = config.get("json_state_file_path", JSON_STATE_FILE_PATH)  # pylint: disable=no-member
-        if time.monotonic() - os.stat(state_file_path).st_mtime > 60.0:
+        config = json.loads(state_file.read_text(encoding="utf-8"))
+        state_file_path = Path(config.get("json_state_file_path", JSON_STATE_FILE_PATH))
+        if time.time() - state_file_path.stat().st_mtime > 60.0:
             # file older than one minute, pglookout probably dead, exit with minus one
             return -1
-        with open(state_file_path, "r") as fp:
-            state_dict = json.load(fp)
+        state_dict = json.loads(state_file_path.read_text(encoding="utf-8"))
         current_master = state_dict["current_master"]
         print(current_master)
     except:  # pylint: disable=bare-except
diff --git a/pglookout/logutil.py b/pglookout/logutil.py
index f0150ef..ee70b52 100644
--- a/pglookout/logutil.py
+++ b/pglookout/logutil.py
@@ -15,7 +15,7 @@
 
 with_systemd: bool = False
 try:
-    from systemd import daemon  # pylint: disable=no-name-in-module
+    from systemd import daemon
 
     with_systemd = True
 except ImportError:
diff --git a/pglookout/pglookout.py b/pglookout/pglookout.py
index aa9828f..ec4a84b 100755
--- a/pglookout/pglookout.py
+++ b/pglookout/pglookout.py
@@ -190,7 +190,7 @@ def load_config(self) -> None:
         previous_remote_conns = self.config.get("remote_conns")
 
         try:
-            config = json.loads(self.config_path.read_text())
+            config = json.loads(self.config_path.read_text(encoding="utf-8"))
             self.config = self.normalize_config(config)
         except Exception as ex:  # pylint: disable=broad-except
             self.log.exception("Invalid JSON config, exiting")
@@ -282,7 +282,7 @@ def write_cluster_state_to_json_file(self) -> None:
             )
 
             state_file_path_tmp = state_file_path.with_name(f"{state_file_path.name}.tmp")
-            state_file_path_tmp.write_text(json_to_dump)
+            state_file_path_tmp.write_text(json_to_dump, encoding="utf-8")
             state_file_path_tmp.rename(state_file_path)
 
             self.log.debug(
@@ -452,7 +452,7 @@ def create_node_map(
 
         return master_instance, master_node, standby_nodes
 
-    def is_restoring_or_catching_up_normally(self, state: MemberState) -> bool:
+    def _is_restoring_or_catching_up_normally(self, state: MemberState) -> bool:
         """
         Return True if node is still in the replication catchup phase and
         replication lag alerts/metrics should not yet be generated.
@@ -476,7 +476,7 @@ def is_restoring_or_catching_up_normally(self, state: MemberState) -> bool: return False def emit_stats(self, state: MemberState) -> None: - if self.is_restoring_or_catching_up_normally(state): + if self._is_restoring_or_catching_up_normally(state): # do not emit misleading lag stats during catchup at restore return @@ -484,7 +484,7 @@ def emit_stats(self, state: MemberState) -> None: if replication_time_lag is not None: self.stats.gauge("pg.replication_lag", replication_time_lag) - def is_master_observer_new_enough(self, observer_state: dict[str, ObservedState]) -> bool: + def _is_master_observer_new_enough(self, observer_state: dict[str, ObservedState]) -> bool: if not self.replication_lag_over_warning_limit: return True @@ -536,7 +536,7 @@ def check_cluster_state(self) -> None: ) return - if self.config.get("poll_observers_on_warning_only") and not self.is_master_observer_new_enough(observer_state): + if self.config.get("poll_observers_on_warning_only") and not self._is_master_observer_new_enough(observer_state): self.log.warning("observer data is not good enough, skipping check") return @@ -551,7 +551,7 @@ def check_cluster_state(self) -> None: ) self.current_master = master_instance if self.own_db and self.own_db != master_instance and self.config.get("autofollow"): - self.start_following_new_master(master_instance) + self._start_following_new_master(master_instance) self._make_failover_decision(observer_state, standby_nodes, master_node) @@ -685,7 +685,7 @@ def is_replication_lag_over_warning_limit(self) -> bool: return self.replication_lag_over_warning_limit def check_replication_lag(self, own_state: MemberState, standby_nodes: dict[str, MemberState]) -> None: - if self.is_restoring_or_catching_up_normally(own_state): + if self._is_restoring_or_catching_up_normally(own_state): # do not raise alerts during catchup at restore return @@ -749,8 +749,7 @@ def get_replication_positions(self, standby_nodes: dict[str, MemberState]) -> di node_state["connection"] and now - parse_iso_datetime(node_state["fetch_time"]) < timedelta(seconds=20) and instance not in self.never_promote_these_nodes - ): # noqa # pylint: disable=line-too-long - # use pg_last_xlog_receive_location if it's available, + ): # use pg_last_xlog_receive_location if it's available, # otherwise fall back to pg_last_xlog_replay_location but # note that both of them can be None. 
We prefer # receive_location over replay_location as some nodes may @@ -870,7 +869,7 @@ def do_failover_decision(self, own_state: MemberState, standby_nodes: dict[str, furthest_along_instance, ) - def modify_recovery_conf_to_point_at_new_master(self, new_master_instance: str) -> bool: + def _modify_recovery_conf_to_point_at_new_master(self, new_master_instance: str) -> bool: pg_data_directory = Path(self.config.get("pg_data_directory", PG_DATA_DIRECTORY)) pg_version = (pg_data_directory / "PG_VERSION").read_text().strip() @@ -940,9 +939,9 @@ def modify_recovery_conf_to_point_at_new_master(self, new_master_instance: str) return True - def start_following_new_master(self, new_master_instance: str) -> None: + def _start_following_new_master(self, new_master_instance: str) -> None: start_time = time.monotonic() - updated_config = self.modify_recovery_conf_to_point_at_new_master(new_master_instance) + updated_config = self._modify_recovery_conf_to_point_at_new_master(new_master_instance) if not updated_config: self.log.info( "Already following master %r, no need to start following it again", @@ -979,7 +978,7 @@ def execute_external_command(self, command: list[str] | str) -> int: err.output, ) self.stats.unexpected_exception(err, where="execute_external_command") - return_code = err.returncode # pylint: disable=no-member + return_code = err.returncode self.log.warning("Executed external command: %r, output: %r", return_code, output) return return_code @@ -1028,8 +1027,7 @@ def _get_health_timeout_seconds(self) -> float | None: if "cluster_monitor_health_timeout_seconds" in self.config: config_value = self.config.get("cluster_monitor_health_timeout_seconds") return config_value if config_value is None else float(config_value) - else: - return self._get_check_interval() * 2 + return self._get_check_interval() * 2 def _get_check_interval(self) -> float: return float(self.config.get("replication_state_check_interval", 5.0)) diff --git a/pglookout/pgutil.py b/pglookout/pgutil.py index 799fcca..1ea70be 100644 --- a/pglookout/pgutil.py +++ b/pglookout/pgutil.py @@ -8,7 +8,7 @@ from __future__ import annotations from typing import cast, Literal, TypedDict, Union -from urllib.parse import parse_qs, urlparse # pylint: disable=no-name-in-module, import-error +from urllib.parse import parse_qs, urlparse import psycopg2.extensions @@ -131,7 +131,9 @@ def parse_connection_string_url(url: str) -> ConnectionParameterKeywords: return cast(ConnectionParameterKeywords, fields) -def parse_connection_string_libpq(connection_string: str) -> ConnectionParameterKeywords: +def parse_connection_string_libpq( + connection_string: str, +) -> ConnectionParameterKeywords: """Parse a postgresql connection string. See: @@ -159,7 +161,7 @@ def parse_connection_string_libpq(connection_string: str) -> ConnectionParameter value += rem[i] else: raise ValueError(f"invalid connection_string fragment {rem!r}") - connection_string = rem[i + 1 :] # pylint: disable=undefined-loop-variable + connection_string = rem[i + 1 :] else: res = rem.split(None, 1) if len(res) > 1: diff --git a/pyproject.toml b/pyproject.toml index 10f5aa9..51b1995 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,43 +27,23 @@ strict = true files = [ 'pglookout/', 'test/', - 'version.py', - 'setup.py', + './*.py', ] -[[tool.mypy.overrides]] -module = "test.test_cluster_monitor" -# ignore no-untyped-call because conftest can only type hinted at the end. Remove at the end. 
-disallow_untyped_calls = false - [tool.pylint.'MESSAGES CONTROL'] +enable = [ + 'useless-suppression', +] disable = [ - 'bad-option-value', 'duplicate-code', 'fixme', - 'import-outside-toplevel', 'invalid-name', - 'len-as-condition', - 'locally-disabled', 'missing-docstring', - 'no-else-raise', - 'no-else-return', - 'no-self-use', - 'raise-missing-from', - 'too-few-public-methods', - 'too-many-ancestors', - 'too-many-arguments', - 'too-many-boolean-expressions', 'too-many-branches', - 'too-many-function-args', 'too-many-instance-attributes', 'too-many-locals', - 'too-many-public-methods', 'too-many-statements', - 'ungrouped-imports', - 'unspecified-encoding', 'wrong-import-order', - 'wrong-import-position', ] [tool.pylint.'FORMAT'] @@ -76,3 +56,7 @@ reports = 'no' [tool.pylint.'TYPECHECK'] extension-pkg-whitelist = 'pydantic' + +[tool.pylint.'DESIGN'] +max-args = 10 +max-attributes = 10 diff --git a/test/conftest.py b/test/conftest.py index a27b015..8165f39 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -80,7 +80,7 @@ def run_cmd(self, cmd: str, *args: str) -> None: subprocess.check_call(argv) def run_pg(self) -> None: - self.pg = subprocess.Popen( # pylint: disable=bad-option-value,consider-using-with + self.pg = subprocess.Popen( # pylint: disable=consider-using-with [ str(self.pgbin / "postgres"), "-D", From 56c01a875bd558da18056f2b5e620f2e5945d98a Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Tue, 28 Mar 2023 12:10:54 +0200 Subject: [PATCH 21/23] pylint: Fix broad exception raised [BF-1560] --- pglookout/pglookout.py | 1 - test/conftest.py | 2 +- test/test_cluster_monitor.py | 8 ++++++-- version.py | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pglookout/pglookout.py b/pglookout/pglookout.py index ec4a84b..d8d4ae1 100755 --- a/pglookout/pglookout.py +++ b/pglookout/pglookout.py @@ -523,7 +523,6 @@ def _is_master_observer_new_enough(self, observer_state: dict[str, ObservedState return True def check_cluster_state(self) -> None: - # master_node = None cluster_state = deepcopy(self.cluster_state) observer_state = deepcopy(self.observer_state) configured_node_count = len(self.config.get("remote_conns", {})) diff --git a/test/conftest.py b/test/conftest.py index 8165f39..47937e6 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -108,7 +108,7 @@ def kill(self, force: bool = True, immediate: bool = True) -> None: while (self.pg.poll() is None) and (time.monotonic() < timeout): time.sleep(0.1) if not force and self.pg.poll() is None: - raise Exception(f"PG pid {self.pg.pid} not dead") + raise TimeoutError(f"PG pid {self.pg.pid} not dead") @pytest.fixture(scope="session") diff --git a/test/test_cluster_monitor.py b/test/test_cluster_monitor.py index 845d862..2fe4e40 100644 --- a/test/test_cluster_monitor.py +++ b/test/test_cluster_monitor.py @@ -24,6 +24,10 @@ import time +class AlertFileCreatedException(Exception): + pass + + def test_replication_lag() -> None: # pylint: disable=protected-access now = datetime.now() @@ -60,7 +64,7 @@ def test_main_loop(db: TestPG) -> None: observer_state: dict[str, ObservedState] = {} def create_alert_file(arg: str) -> NoReturn: - raise Exception(arg) + raise AlertFileCreatedException(arg) cluster_monitor_check_queue: Queue[str] = Queue() failover_decision_queue: Queue[str] = Queue() @@ -122,7 +126,7 @@ def test_fetch_replication_slot_info(db: TestPG) -> None: observer_state: dict[str, ObservedState] = {} def create_alert_file(arg: str) -> NoReturn: - raise Exception(arg) + raise AlertFileCreatedException(arg) 
     cluster_monitor_check_queue: Queue[str] = Queue()
     failover_decision_queue: Queue[str] = Queue()
diff --git a/version.py b/version.py
index 1f12706..5e2ccf0 100644
--- a/version.py
+++ b/version.py
@@ -88,7 +88,7 @@ def get_project_version(version_file_name: str) -> str:
             return short_ver
 
     if not file_ver:
-        raise Exception(f"version not available from git or from file {version_file!r}")
+        raise ValueError(f"version not available from git or from file {version_file!r}")
 
     return file_ver
 
From 09a15b1d4aabe2feb4c512a1cff1d24837cafeab Mon Sep 17 00:00:00 2001
From: Samuel Giffard
Date: Tue, 28 Mar 2023 12:11:49 +0200
Subject: [PATCH 22/23] pylint: Dev requirements are now pinned [BF-1560]

This fixes some behaviour discrepancies between local and CI.
---
 requirements.dev.txt | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/requirements.dev.txt b/requirements.dev.txt
index 2635d85..f2148a8 100644
--- a/requirements.dev.txt
+++ b/requirements.dev.txt
@@ -1,10 +1,10 @@
-black==22.10.0
-flake8
-flake8-pyproject==1.2.2
-isort==5.10.1
+black==23.1.0
+flake8==6.0.0
+flake8-pyproject==1.2.3
+isort==5.12.0
 mypy==1.1.1
-types-psycopg2==2.9.21.8
-types-requests==2.26.1
+types-psycopg2==2.9.21.9
+types-requests==2.28.11.16
 types-setuptools==67.6.0.5
-pylint==2.15.4
-pytest
+pylint==2.17.1
+pytest==7.2.2

From 9a0abaeaab31f37a787c73e343f755231acc92b8 Mon Sep 17 00:00:00 2001
From: Samuel Giffard
Date: Tue, 4 Apr 2023 16:42:00 +0200
Subject: [PATCH 23/23] make: Dev reqs are now in isolation [BF-1560]

This makes sure that the tests run in their own virtualenv, without
touching the system dependencies, while still being able to import them
(the venv is created with --system-site-packages).
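
Roughly what the new $(VENV) bootstrap and local-install targets automate,
as a Python sketch (.venv-test matches the VENV path in the Makefile below):

    import subprocess
    import sys
    from pathlib import Path

    venv = Path(".venv-test")
    if not venv.exists():
        # Same interpreter that runs the tests; --system-site-packages lets
        # the venv see distro-provided deps such as python3-psycopg2.
        subprocess.check_call([sys.executable, "-m", "venv", "--system-site-packages", str(venv)])
        subprocess.check_call([str(venv / "bin" / "pip"), "install", "-r", "requirements.dev.txt"])
    # Editable install of pglookout itself, i.e. what `make local-install` does.
    subprocess.check_call([str(venv / "bin" / "python"), "-m", "pip", "install", "-e", "."])

---
 .github/workflows/build.yml |  4 +--
 .gitignore                  |  2 ++
 Makefile                    | 58 ++++++++++++++++++++++---------------
 pglookout.spec              |  1 -
 requirements.dev.txt        |  1 +
 5 files changed, 38 insertions(+), 28 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index c9bafd8..08157fa 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -28,9 +28,7 @@ jobs:
           python-version: ${{ matrix.python-version }}
 
       - id: dependencies
-        run: |
-          pip install -e .
-          pip install -r requirements.dev.txt
+        run: make local-install
 
       - id: mypy
         run: make mypy
diff --git a/.gitignore b/.gitignore
index ec949a5..4d2565b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,5 +14,7 @@ coverage.xml
 rpm/
 dist/
 
+.venv-test/
+
 # Generated targets
 pglookout-rpm-src.tar
diff --git a/Makefile b/Makefile
index 700617c..cb7f2b7 100644
--- a/Makefile
+++ b/Makefile
@@ -2,6 +2,8 @@ short_ver = 2.1.0
 long_ver = $(shell git describe --long 2>/dev/null || echo $(short_ver)-0-unknown-g`git describe --always`)
 generated = pglookout/version.py
 
+VENV = .venv-test
+VENV_PYTHON = $(VENV)/bin/python
 PYTHON ?= python3
 PYTHON_SOURCE_DIRS = pglookout/ test/ stubs/ version.py setup.py
 
@@ -11,35 +13,45 @@ all: $(generated)
 pglookout/version.py: version.py
 	$(PYTHON) $^ $@
 
-test: mypy flake8 pylint unittest
+# This venv is only used for tests and development. It has access to system site packages, because pglookout
+# would need them to run. Development deps are kept in this venv, so that they don't interfere with the
+# system python.
+$(VENV):
+	$(PYTHON) -m venv --system-site-packages $(VENV)
+	$(VENV)/bin/pip install -r requirements.dev.txt
 
-unittest: $(generated)
-	$(PYTHON) -m pytest
+local-install: $(VENV)
+	$(VENV_PYTHON) -m pip install -e .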
-mypy: $(generated) - MYPYPATH=stubs $(PYTHON) -m mypy +test: mypy flake8 pylint unittest fmt-check -flake8: $(generated) - $(PYTHON) -m flake8 $(PYTHON_SOURCE_DIRS) +unittest: $(generated) $(VENV) + $(VENV_PYTHON) -m pytest -pylint: $(generated) - $(PYTHON) -m pylint $(PYTHON_SOURCE_DIRS) +mypy: $(generated) $(VENV) + MYPYPATH=stubs $(VENV_PYTHON) -m mypy -fmt: $(generated) - isort $(PYTHON_SOURCE_DIRS) - black $(PYTHON_SOURCE_DIRS) +flake8: $(generated) $(VENV) + $(VENV_PYTHON) -m flake8 $(PYTHON_SOURCE_DIRS) -fmt-check: $(generated) - isort --check $(PYTHON_SOURCE_DIRS) - black --check $(PYTHON_SOURCE_DIRS) +pylint: $(generated) $(VENV) + $(VENV_PYTHON) -m pylint $(PYTHON_SOURCE_DIRS) -coverage: - $(PYTHON) -m pytest $(PYTEST_ARG) --cov-report term-missing --cov-branch \ +fmt: $(generated) $(VENV) + $(VENV_PYTHON) -m isort $(PYTHON_SOURCE_DIRS) + $(VENV_PYTHON) -m black $(PYTHON_SOURCE_DIRS) + +fmt-check: $(generated) $(VENV) + $(VENV_PYTHON) -m isort --check $(PYTHON_SOURCE_DIRS) + $(VENV_PYTHON) -m black --check $(PYTHON_SOURCE_DIRS) + +coverage: $(VENV) + $(VENV_PYTHON) -m pytest $(PYTEST_ARG) --cov-report term-missing --cov-branch \ --cov-report xml:coverage.xml --cov pglookout test/ clean: - $(RM) -r *.egg-info/ build/ dist/ - $(RM) ../pglookout_* test-*.xml $(generated) + $(RM) -r *.egg-info/ build/ dist/ $(VENV) .hypothesis + $(RM) ../pglookout_* test-*.xml $(generated) .coverage coverage.xml deb: $(generated) cp debian/changelog.in debian/changelog @@ -59,10 +71,8 @@ rpm: $(generated) build-dep-fed: sudo dnf -y install --best --allowerasing \ - python3-devel python3-pytest python3-pylint \ - python3-mock python3-psycopg2 \ - python3-requests rpm-build systemd-python3 \ - python3-flake8 python3-pytest-cov python3-packaging + python3-devel python3-psycopg2 python3-requests \ + rpm-build systemd-python3 python3-packaging build-dep-deb: sudo apt-get install \ @@ -70,4 +80,4 @@ build-dep-deb: python3-all python3-setuptools python3-psycopg2 python3-requests \ python3-packaging -.PHONY: rpm +.PHONY: rpm local-install diff --git a/pglookout.spec b/pglookout.spec index 4bf34c8..2e6fd79 100644 --- a/pglookout.spec +++ b/pglookout.spec @@ -8,7 +8,6 @@ Source0: pglookout-rpm-src.tar Obsoletes: python3-pglookout Requires: python3-packaging, python3-psycopg2, python3-requests, python3-setuptools, systemd-python3, systemd BuildRequires: python3-packaging, python3-psycopg2, python3-requests, python3-setuptools, systemd-python3, systemd -BuildRequires: python3-pytest, python3-pylint BuildArch: noarch %description diff --git a/requirements.dev.txt b/requirements.dev.txt index f2148a8..6eb07d9 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -8,3 +8,4 @@ types-requests==2.28.11.16 types-setuptools==67.6.0.5 pylint==2.17.1 pytest==7.2.2 +pytest-cov==4.0.0