From 0d4ea2c5ebab5175f2a948c39dfd9bd572e6b7e4 Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Thu, 16 Mar 2023 15:52:41 +0100 Subject: [PATCH 01/23] mypy: `pglookout/logutil.py` [BF-1560] --- Makefile | 4 ++-- pglookout/logutil.py | 33 ++++++++++++++++++++++----------- pyproject.toml | 1 - stubs/__init__.pyi | 1 + stubs/systemd/__init__.pyi | 1 + stubs/systemd/daemon.pyi | 19 +++++++++++++++++++ 6 files changed, 45 insertions(+), 14 deletions(-) create mode 100644 stubs/__init__.pyi create mode 100644 stubs/systemd/__init__.pyi create mode 100644 stubs/systemd/daemon.pyi diff --git a/Makefile b/Makefile index bc3696c..e1df801 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ long_ver = $(shell git describe --long 2>/dev/null || echo $(short_ver)-0-unknow generated = pglookout/version.py PYTHON ?= python3 -PYTHON_SOURCE_DIRS = pglookout/ test/ +PYTHON_SOURCE_DIRS = pglookout/ test/ stubs/ all: $(generated) : 'try "make rpm" or "make deb" or "make test"' @@ -17,7 +17,7 @@ unittest: $(generated) $(PYTHON) -m pytest mypy: $(generated) - $(PYTHON) -m mypy + MYPYPATH=stubs $(PYTHON) -m mypy flake8: $(generated) $(PYTHON) -m flake8 $(PYTHON_SOURCE_DIRS) diff --git a/pglookout/logutil.py b/pglookout/logutil.py index c5d861a..f0150ef 100644 --- a/pglookout/logutil.py +++ b/pglookout/logutil.py @@ -5,39 +5,50 @@ Copyright (c) 2015 Ohmu Ltd See LICENSE for details """ +from __future__ import annotations + +from logging.handlers import SysLogHandler +from typing import Final, TYPE_CHECKING import logging -import logging.handlers import os +with_systemd: bool = False try: from systemd import daemon # pylint: disable=no-name-in-module + + with_systemd = True except ImportError: - daemon = None + if not TYPE_CHECKING: + daemon = None -LOG_FORMAT = "%(asctime)s\t%(name)s\t%(threadName)s\t%(levelname)s\t%(message)s" -LOG_FORMAT_SHORT = "%(levelname)s\t%(message)s" -LOG_FORMAT_SYSLOG = "%(name)s %(threadName)s %(levelname)s: %(message)s" +LOG_FORMAT: Final[str] = "%(asctime)s\t%(name)s\t%(threadName)s\t%(levelname)s\t%(message)s" +LOG_FORMAT_SHORT: Final[str] = "%(levelname)s\t%(message)s" +LOG_FORMAT_SYSLOG: Final[str] = "%(name)s %(threadName)s %(levelname)s: %(message)s" -def set_syslog_handler(address, facility, logger): - syslog_handler = logging.handlers.SysLogHandler(address=address, facility=facility) +def set_syslog_handler(address: str, facility: str | int, logger: logging.Logger) -> SysLogHandler: + if isinstance(facility, str): + facility_id: int = SysLogHandler.facility_names.get(facility, SysLogHandler.LOG_LOCAL2) + else: + facility_id = facility + syslog_handler = SysLogHandler(address=address, facility=facility_id) logger.addHandler(syslog_handler) formatter = logging.Formatter(LOG_FORMAT_SYSLOG) syslog_handler.setFormatter(formatter) return syslog_handler -def configure_logging(level=logging.DEBUG, short_log=False): +def configure_logging(level: int = logging.DEBUG, short_log: bool = False) -> None: # Are we running under systemd? 
if os.getenv("NOTIFY_SOCKET"): logging.basicConfig(level=level, format=LOG_FORMAT_SYSLOG) - if not daemon: + if not with_systemd: print("WARNING: Running under systemd but python-systemd not available, systemd won't see our notifications") else: logging.basicConfig(level=level, format=LOG_FORMAT_SHORT if short_log else LOG_FORMAT) -def notify_systemd(status): - if daemon: +def notify_systemd(status: str) -> None: + if with_systemd: daemon.notify(status) diff --git a/pyproject.toml b/pyproject.toml index 16ce87c..7b1701f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,6 @@ exclude = [ 'pglookout/cluster_monitor.py', 'pglookout/common.py', 'pglookout/current_master.py', - 'pglookout/logutil.py', 'pglookout/pglookout.py', 'pglookout/pgutil.py', 'pglookout/statsd.py', diff --git a/stubs/__init__.pyi b/stubs/__init__.pyi new file mode 100644 index 0000000..3bf9baf --- /dev/null +++ b/stubs/__init__.pyi @@ -0,0 +1 @@ +# Copyright (c) 2023 Aiven, Helsinki, Finland. https://aiven.io/ diff --git a/stubs/systemd/__init__.pyi b/stubs/systemd/__init__.pyi new file mode 100644 index 0000000..3bf9baf --- /dev/null +++ b/stubs/systemd/__init__.pyi @@ -0,0 +1 @@ +# Copyright (c) 2023 Aiven, Helsinki, Finland. https://aiven.io/ diff --git a/stubs/systemd/daemon.pyi b/stubs/systemd/daemon.pyi new file mode 100644 index 0000000..4ddc10b --- /dev/null +++ b/stubs/systemd/daemon.pyi @@ -0,0 +1,19 @@ +# Copyright (c) 2023 Aiven, Helsinki, Finland. https://aiven.io/ + +from __future__ import annotations + +from typing import AnyStr, IO + +def notify( + status: str, + unset_environment: bool = False, + pid: int = 0, + fds: IO[AnyStr] | None = None, +) -> bool: + """ + notify(status, unset_environment=False, pid=0, fds=None) -> bool + + Send a message to the init system about a status change. + Wraps sd_notify(3). + """ + ... From 238eb76eacff197d704f197b5bd02a3c8027465d Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Thu, 16 Mar 2023 17:30:50 +0100 Subject: [PATCH 02/23] mypy: `pglookout/pgutil.py` [BF-1560] --- pglookout/pgutil.py | 105 +++++++++++++++++++++++++++++++++++++------- pyproject.toml | 2 - test/test_pgutil.py | 8 ++-- 3 files changed, 94 insertions(+), 21 deletions(-) diff --git a/pglookout/pgutil.py b/pglookout/pgutil.py index 3b8562b..7d5f692 100644 --- a/pglookout/pgutil.py +++ b/pglookout/pgutil.py @@ -5,16 +5,80 @@ Copyright (c) 2015 Ohmu Ltd See LICENSE for details """ +from __future__ import annotations + +from typing import cast, Literal, TypedDict from urllib.parse import parse_qs, urlparse # pylint: disable=no-name-in-module, import-error import psycopg2.extensions -def create_connection_string(connection_info): - return psycopg2.extensions.make_dsn(**connection_info) +class DsnDictBase(TypedDict, total=False): + user: str + password: str + host: str + port: str | int + + +class DsnDict(DsnDictBase, total=False): + dbname: str + + +class DsnDictDeprecated(DsnDictBase, total=False): + database: str + + +class ConnectionParameterKeywords(TypedDict, total=False): + """Parameter Keywords for Connection. 
+ See: + https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-PARAMKEYWORDS + """ -def mask_connection_info(info): + host: str + hostaddr: str + port: str + dbname: str + user: str + password: str + passfile: str + channel_binding: Literal["require", "prefer", "disable"] + connect_timeout: str + client_encoding: str + options: str + application_name: str + fallback_application_name: str + keepalives: Literal["0", "1"] + keepalives_idle: str + keepalives_interval: str + keepalives_count: str + tcp_user_timeout: str + replication: Literal["true", "on", "yes", "1", "database", "false", "off", "no", "0"] + gssencmode: Literal["disable", "prefer", "require"] + sslmode: Literal["disable", "allow", "prefer", "require", "verify-ca", "verify-full"] + requiressl: Literal["0", "1"] + sslcompression: Literal["0", "1"] + sslcert: str + sslkey: str + sslpassword: str + sslrootcert: str + sslcrl: str + sslcrldir: str + sslsni: Literal["0", "1"] + requirepeer: str + ssl_min_protocol_version: Literal["TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3"] + ssl_max_protocol_version: Literal["TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3"] + krbsrvname: str + gsslib: str + service: str + target_session_attrs: Literal["any", "read-write", "read-only", "primary", "standby", "prefer-standby"] + + +def create_connection_string(connection_info: DsnDict | DsnDictDeprecated | ConnectionParameterKeywords) -> str: + return str(psycopg2.extensions.make_dsn(**connection_info)) + + +def mask_connection_info(info: str) -> str: masked_info = get_connection_info(info) password = masked_info.pop("password", None) connection_string = create_connection_string(masked_info) @@ -22,24 +86,29 @@ def mask_connection_info(info): return f"{connection_string}; {message}" -def get_connection_info_from_config_line(line): +def get_connection_info_from_config_line(line: str) -> ConnectionParameterKeywords: _, value = line.split("=", 1) value = value.strip()[1:-1].replace("''", "'") return get_connection_info(value) -def get_connection_info(info): - """turn a connection info object into a dict or return it if it was a - dict already. supports both the traditional libpq format and the new - url format""" +def get_connection_info( + info: str | DsnDict | DsnDictDeprecated | ConnectionParameterKeywords, +) -> ConnectionParameterKeywords: + """Get a normalized connection info dict from a connection string or a dict. + + Supports both the traditional libpq format and the new url format. + """ if isinstance(info, dict): - return info.copy() + # Potentially, we might clean deprecated DSN dicts: `database` -> `dbname`. + # Also, psycopg2 will validate the keys and values. + return parse_connection_string_libpq(create_connection_string(info)) if info.startswith("postgres://") or info.startswith("postgresql://"): return parse_connection_string_url(info) return parse_connection_string_libpq(info) -def parse_connection_string_url(url): +def parse_connection_string_url(url: str) -> ConnectionParameterKeywords: # drop scheme from the url as some versions of urlparse don't handle # query and path properly for urls with a non-http scheme schemeless_url = url.split(":", 1)[1] @@ -57,12 +126,15 @@ def parse_connection_string_url(url): fields["dbname"] = p.path[1:] for k, v in parse_qs(p.query).items(): fields[k] = v[-1] - return fields + return cast(ConnectionParameterKeywords, fields) + +def parse_connection_string_libpq(connection_string: str) -> ConnectionParameterKeywords: + """Parse a postgresql connection string. 
-def parse_connection_string_libpq(connection_string): - """parse a postgresql connection string as defined in - http://www.postgresql.org/docs/current/static/libpq-connect.html#LIBPQ-CONNSTRING""" + See: + http://www.postgresql.org/docs/current/static/libpq-connect.html#LIBPQ-CONNSTRING + """ fields = {} while True: connection_string = connection_string.strip() @@ -92,5 +164,8 @@ def parse_connection_string_libpq(connection_string): value, connection_string = res else: value, connection_string = rem, "" + # This one is case-insensitive. To continue benefiting from mypy, we make it lowercase. + if key == "replication": + value = value.lower() fields[key] = value - return fields + return cast(ConnectionParameterKeywords, fields) diff --git a/pyproject.toml b/pyproject.toml index 7b1701f..b3197ac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,6 @@ exclude = [ 'pglookout/common.py', 'pglookout/current_master.py', 'pglookout/pglookout.py', - 'pglookout/pgutil.py', 'pglookout/statsd.py', 'pglookout/version.py', 'pglookout/webserver.py', @@ -44,7 +43,6 @@ exclude = [ 'test/test_cluster_monitor.py', 'test/test_common.py', 'test/test_lookout.py', - 'test/test_pgutil.py', 'test/test_webserver.py', # Other. 'setup.py', diff --git a/test/test_pgutil.py b/test/test_pgutil.py index d70e6f5..7dc969f 100644 --- a/test/test_pgutil.py +++ b/test/test_pgutil.py @@ -6,14 +6,14 @@ See LICENSE for details """ -from pglookout.pgutil import create_connection_string, get_connection_info, mask_connection_info +from pglookout.pgutil import ConnectionParameterKeywords, create_connection_string, get_connection_info, mask_connection_info from pytest import raises -def test_connection_info(): +def test_connection_info() -> None: url = "postgres://hannu:secret@dbhost.local:5555/abc?replication=true&sslmode=foobar&sslmode=require" cs = "host=dbhost.local user='hannu' dbname='abc'\nreplication=true password=secret sslmode=require port=5555" - ci = { + ci: ConnectionParameterKeywords = { "host": "dbhost.local", "port": "5555", "user": "hannu", @@ -39,7 +39,7 @@ def test_connection_info(): get_connection_info("foo=bar bar='x") -def test_mask_connection_info(): +def test_mask_connection_info() -> None: url = "postgres://michael:secret@dbhost.local:5555/abc?replication=true&sslmode=foobar&sslmode=require" cs = "host=dbhost.local user='michael' dbname='abc'\nreplication=true password=secret sslmode=require port=5555" ci = get_connection_info(cs) From 3db5684adadc3a06e9f56e5e8a089a741cdf566a Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Fri, 17 Mar 2023 10:10:37 +0100 Subject: [PATCH 03/23] mypy: `pglookout/current-master.py` [BF-1560] Also in this commit: - removed `python2` leftovers (from pglookout 2.0.0 (2018-10-12)) --- debian/rules | 2 +- pglookout/current_master.py | 4 ++-- pyproject.toml | 1 - 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/debian/rules b/debian/rules index 5184822..c33a948 100755 --- a/debian/rules +++ b/debian/rules @@ -3,4 +3,4 @@ export PYBUILD_NAME=pglookout %: - dh $@ --with python2 --with systemd --buildsystem=pybuild + dh $@ --with python3 --with systemd --buildsystem=pybuild diff --git a/pglookout/current_master.py b/pglookout/current_master.py index 0824feb..8f7154f 100644 --- a/pglookout/current_master.py +++ b/pglookout/current_master.py @@ -8,7 +8,7 @@ See the file `LICENSE` for details. """ -from __future__ import print_function +from __future__ import annotations from . 
import version @@ -19,7 +19,7 @@ import time -def main(args=None): +def main(args: list[str] | None = None) -> int: if args is None: args = sys.argv[1:] diff --git a/pyproject.toml b/pyproject.toml index b3197ac..5fb6e35 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,6 @@ exclude = [ 'pglookout/__main__.py', 'pglookout/cluster_monitor.py', 'pglookout/common.py', - 'pglookout/current_master.py', 'pglookout/pglookout.py', 'pglookout/statsd.py', 'pglookout/version.py', From 2ce13ab9002303040ea44cbc4f11fa6bb65722bf Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Fri, 17 Mar 2023 15:06:01 +0100 Subject: [PATCH 04/23] mypy: `pglookout/statsd.py` [BF-1560] --- pglookout/statsd.py | 63 ++++++++++++++++++++++++++++++++++++++------- pyproject.toml | 1 - 2 files changed, 53 insertions(+), 11 deletions(-) diff --git a/pglookout/statsd.py b/pglookout/statsd.py index d1490f2..4e1bee6 100644 --- a/pglookout/statsd.py +++ b/pglookout/statsd.py @@ -9,28 +9,65 @@ https://github.com/influxdata/telegraf/tree/master/plugins/inputs/statsd """ +from __future__ import annotations + +from typing import Literal import logging import socket +StatsdMetricType = Literal[ + b"g", # gauge + b"c", # counter + b"s", # set + b"ms", # timing + b"h", # histogram + b"d", # distribution +] + class StatsClient: - def __init__(self, host="127.0.0.1", port=8125, tags=None): - self.log = logging.getLogger("StatsClient") - self._dest_addr = (host, port) - self._socket = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) - self._tags = tags or {} + def __init__( + self, + host: str | None = "127.0.0.1", + port: int = 8125, + tags: dict[str, str] | None = None, + ) -> None: + self.log: logging.Logger = logging.getLogger("StatsClient") + self._dest_addr: tuple[str | None, int] = (host, port) + self._socket: socket.socket = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + self._tags: dict[str, str] = tags or {} - def gauge(self, metric, value, tags=None): + def gauge( + self, + metric: str, + value: int | float | str, + tags: dict[str, str] | None = None, + ) -> None: self._send(metric, b"g", value, tags) - def increase(self, metric, inc_value=1, tags=None): + def increase( + self, + metric: str, + inc_value: int | float = 1, + tags: dict[str, str] | None = None, + ) -> None: self._send(metric, b"c", inc_value, tags) - def timing(self, metric, value, tags=None): + def timing( + self, + metric: str, + value: int | float, + tags: dict[str, str] | None = None, + ) -> None: self._send(metric, b"ms", value, tags) - def unexpected_exception(self, ex, where, tags=None): + def unexpected_exception( + self, + ex: Exception, + where: str, + tags: dict[str, str] | None = None, + ) -> None: all_tags = { "exception": ex.__class__.__name__, "where": where, @@ -38,7 +75,13 @@ def unexpected_exception(self, ex, where, tags=None): all_tags.update(tags or {}) self.increase("exception", tags=all_tags) - def _send(self, metric, metric_type, value, tags): + def _send( + self, + metric: str, + metric_type: StatsdMetricType, + value: int | float | str, + tags: dict[str, str] | None, + ) -> None: if None in self._dest_addr: # stats sending is disabled return diff --git a/pyproject.toml b/pyproject.toml index 5fb6e35..829cb98 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,6 @@ exclude = [ 'pglookout/cluster_monitor.py', 'pglookout/common.py', 'pglookout/pglookout.py', - 'pglookout/statsd.py', 'pglookout/version.py', 'pglookout/webserver.py', # Tests. 
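
For reference, the client typed above is fire-and-forget UDP, so it can be exercised without a running statsd/telegraf agent. A minimal sketch of the annotated API as it appears in the diff (metric names and tag values here are illustrative, not taken from the codebase):

    from pglookout.statsd import StatsClient

    # Defaults match the annotated signature: host 127.0.0.1, port 8125.
    stats = StatsClient(tags={"app": "pglookout"})

    stats.gauge("replication_time_lag", 0.25)  # value: int | float | str
    stats.increase("failover_decisions")       # inc_value defaults to 1
    stats.timing("monitor_poll", 42, tags={"node": "test1db"})

    try:
        raise ValueError("boom")
    except ValueError as ex:
        # Sends an "exception" counter tagged with the class name and location.
        stats.unexpected_exception(ex, where="main_loop")
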
From 25617d1442ecd8fc524980661191fa06ff6808e8 Mon Sep 17 00:00:00 2001
From: Samuel Giffard
Date: Thu, 16 Mar 2023 15:14:19 +0100
Subject: [PATCH 05/23] mypy: `pglookout/common.py` [BF-1560]

Also in this commit:

- More unit tests;
- Removed redundant `()` in regex.
---
 pglookout/common.py | 37 ++++++++++++++++---------
 pyproject.toml      |  2 --
 test/test_common.py | 66 +++++++++++++++++++++++++++++++++++++--------
 3 files changed, 79 insertions(+), 26 deletions(-)

diff --git a/pglookout/common.py b/pglookout/common.py
index 4d1f032..6975650 100644
--- a/pglookout/common.py
+++ b/pglookout/common.py
@@ -4,40 +4,51 @@ Copyright (c) 2015 Ohmu Ltd
 See LICENSE for details
 """
-import datetime
+from __future__ import annotations
+
+from datetime import datetime, timedelta
+from typing import Final
+
 import re


-def convert_xlog_location_to_offset(wal_location):
+def convert_xlog_location_to_offset(wal_location: str) -> int:
     log_id, offset = wal_location.split("/")
     return int(log_id, 16) << 32 | int(offset, 16)


-ISO_EXT_RE = re.compile(
+ISO_EXT_RE: Final[re.Pattern[str]] = re.compile(
     r"(?P<year>\d{4})-(?P<month>\d\d)-(?P<day>\d\d)(T(?P<hour>\d\d):(?P<minute>\d\d)"
     r"(:(?P<second>\d\d)(.(?P<microsecond>\d{6}))?)?Z)?$"
 )
-ISO_BASIC_RE = re.compile(
+ISO_BASIC_RE: Final[re.Pattern[str]] = re.compile(
     r"(?P<year>\d{4})(?P<month>\d\d)(?P<day>\d\d)(T(?P<hour>\d\d)(?P<minute>\d\d)"
     r"((?P<second>\d\d)((?P<microsecond>\d{6}))?)?Z)?$"
 )
+ISO_GROUP_NAMES: Final[tuple[str, ...]] = (
+    "year",
+    "month",
+    "day",
+    "hour",
+    "minute",
+    "second",
+    "microsecond",
+)


-def parse_iso_datetime(value):
+def parse_iso_datetime(value: str) -> datetime:
     match = ISO_EXT_RE.match(value)
     if not match:
         match = ISO_BASIC_RE.match(value)
         if not match:
             raise ValueError(f"Invalid ISO timestamp {value!r}")
-    parts = dict(
-        (key, int(match.group(key) or "0")) for key in ("year", "month", "day", "hour", "minute", "second", "microsecond")
-    )
-    return datetime.datetime(tzinfo=None, **parts)
+    parts = {key: int(match.group(key) or "0") for key in ISO_GROUP_NAMES}
+    return datetime(tzinfo=None, **parts)


-def get_iso_timestamp(fetch_time=None):
+def get_iso_timestamp(fetch_time: datetime | None = None) -> str:
     if not fetch_time:
-        fetch_time = datetime.datetime.utcnow()
-    elif fetch_time.tzinfo:
-        fetch_time = fetch_time.replace(tzinfo=None) - datetime.timedelta(seconds=fetch_time.utcoffset().seconds)
+        fetch_time = datetime.utcnow()
+    elif (offset := fetch_time.utcoffset()) is not None:
+        fetch_time = fetch_time.replace(tzinfo=None) - timedelta(seconds=offset.seconds)
     return fetch_time.isoformat() + "Z"
diff --git a/pyproject.toml b/pyproject.toml
index 829cb98..7b59a5b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,14 +32,12 @@ exclude = [
     # Implementation.
     'pglookout/__main__.py',
     'pglookout/cluster_monitor.py',
-    'pglookout/common.py',
     'pglookout/pglookout.py',
     'pglookout/version.py',
     'pglookout/webserver.py',
     # Tests.
     'test/conftest.py',
     'test/test_cluster_monitor.py',
-    'test/test_common.py',
     'test/test_lookout.py',
     'test/test_webserver.py',
     # Other.
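
As context for the test additions below, a short sketch of the behaviour these annotations pin down (the timestamps and WAL location are illustrative):

    from datetime import datetime, timedelta, timezone

    from pglookout.common import convert_xlog_location_to_offset, get_iso_timestamp, parse_iso_datetime

    # Aware timestamps are normalized to naive UTC before serialization.
    aware = datetime(2021, 1, 1, 12, 0, 0, tzinfo=timezone(timedelta(hours=2)))
    assert get_iso_timestamp(aware) == "2021-01-01T10:00:00Z"

    # The extended and basic ISO 8601 forms parse to the same datetime.
    assert parse_iso_datetime("20210101T1000Z") == parse_iso_datetime("2021-01-01T10:00Z")

    # A WAL location "<hex log id>/<hex offset>" packs into a single 64-bit offset.
    assert convert_xlog_location_to_offset("F/AAAAAAAA") == (0xF << 32) | 0xAAAAAAAA
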
diff --git a/test/test_common.py b/test/test_common.py index 8139d6d..6df4b55 100644 --- a/test/test_common.py +++ b/test/test_common.py @@ -4,33 +4,77 @@ Copyright (c) 2015 Ohmu Ltd See LICENSE for details """ - +from datetime import datetime from pglookout.common import convert_xlog_location_to_offset, get_iso_timestamp, ISO_EXT_RE, parse_iso_datetime -from pytest import raises -import datetime +import pytest -def test_convert_xlog_location_to_offset(): +def test_convert_xlog_location_to_offset() -> None: assert convert_xlog_location_to_offset("1/00000000") == 1 << 32 assert convert_xlog_location_to_offset("F/AAAAAAAA") == (0xF << 32) | 0xAAAAAAAA - with raises(ValueError): + with pytest.raises(ValueError): convert_xlog_location_to_offset("x") - with raises(ValueError): + with pytest.raises(ValueError): convert_xlog_location_to_offset("x/y") -def test_parse_iso_datetime(): - date = datetime.datetime.utcnow() +def test_parse_iso_datetime() -> None: + date = datetime.utcnow() date.replace(microsecond=0) assert date == parse_iso_datetime(date.isoformat() + "Z") - with raises(ValueError): + with pytest.raises(ValueError): parse_iso_datetime("foobar") -def test_get_iso_timestamp(): +def test_get_iso_timestamp() -> None: v = get_iso_timestamp() assert ISO_EXT_RE.match(v) - ts = datetime.datetime.now() + ts = datetime.now() v = get_iso_timestamp(ts) assert parse_iso_datetime(v) == ts + + +@pytest.mark.parametrize( + "timestamp", + [ + datetime(2021, 1, 1, 23, 42, 11, 123456), + datetime(2021, 1, 1, 23, 42, 11), + datetime(2021, 1, 1, 23, 42), + datetime(2021, 1, 1, 23), + datetime(2021, 1, 1), + ], +) +def test_roundtrip(timestamp: datetime) -> None: + ts2 = parse_iso_datetime(get_iso_timestamp(timestamp)) + + assert ts2 == timestamp + + +@pytest.mark.parametrize( + ("value", "normalized_value"), + # fmt: off + [ + # Extended format + ("2021-01-01T00:00:00.000000Z", "2021-01-01T00:00:00Z"), # noqa: E241 + ("2021-01-01T23:42:11.123456Z", "2021-01-01T23:42:11.123456Z"), # noqa: E241 + ("2021-01-01T00:00:00Z", "2021-01-01T00:00:00Z"), # noqa: E241 + ("2021-01-01T23:42:11Z", "2021-01-01T23:42:11Z"), # noqa: E241 + ("2021-01-01T00:00Z", "2021-01-01T00:00:00Z"), # noqa: E241 + ("2021-01-01T23:42Z", "2021-01-01T23:42:00Z"), # noqa: E241 + ("2021-01-01", "2021-01-01T00:00:00Z"), # noqa: E241 + # Basic format + ("20210101T000000Z", "2021-01-01T00:00:00Z"), # noqa: E241 + ("20210101T234211123456Z", "2021-01-01T23:42:11.123456Z"), # noqa: E241 + ("20210101T000000Z", "2021-01-01T00:00:00Z"), # noqa: E241 + ("20210101T234211Z", "2021-01-01T23:42:11Z"), # noqa: E241 + ("20210101T0000Z", "2021-01-01T00:00:00Z"), # noqa: E241 + ("20210101T2342Z", "2021-01-01T23:42:00Z"), # noqa: E241 + ("20210101", "2021-01-01T00:00:00Z"), # noqa: E241 + ], + # fmt: on +) +def test_reverse_roundtrip(value: str, normalized_value: str) -> None: + v2 = get_iso_timestamp(parse_iso_datetime(value)) + + assert v2 == normalized_value From 016d14db3722dd1002a765112672afb0ba3f2d09 Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Fri, 17 Mar 2023 10:21:29 +0100 Subject: [PATCH 06/23] mypy: `pglookout/config.py` [BF-1560] New file to type hint the configuration. Also: - Alphabetically sort configuration keys in the `README`. This has the effect of grouping related keys closer. 
- Add missing key: `replication_catchup_timeout` --- README.rst | 153 +++++++++++++++++++++++--------------------- pglookout/config.py | 51 +++++++++++++++ 2 files changed, 130 insertions(+), 74 deletions(-) create mode 100644 pglookout/config.py diff --git a/README.rst b/README.rst index 145fd01..8b5d4e7 100644 --- a/README.rst +++ b/README.rst @@ -201,6 +201,11 @@ description of the current state of the cluster which is under monitoring. Configuration keys ================== +``alert_file_dir`` (default ``os.getcwd()``) + +Directory in which alert files for replication warning and failover +are created. + ``autofollow`` (default ``false``) Do you want pglookout to try to start following the new primary. Useful @@ -210,38 +215,24 @@ to start following the new primary. Requires ``pg_data_directory``, ``pg_start_command`` and ``pg_stop_command`` configuration keys to be set. +``cluster_monitor_health_timeout_seconds`` (default: ``2 * replication_state_check_interval``) + +If set, it will increase the statsd counter `cluster_monitor_health_timeout` if the +`cluster_monitor` thread has not successfully completed a check since +`cluster_monitor_health_timeout_seconds`. + ``db_poll_interval`` (default ``5.0``) Interval on how often should the connections defined in remote_conns be polled for information on DB replication state. -``remote_conns`` (default ``{}``) - -PG database connection strings that the pglookout process should monitor. -Keys of the object should be names of the remotes and values must be valid -PostgreSQL connection strings or connection info objects. - -``primary_conninfo_template`` - -Connection string or connection info object template to use when setting a new -primary_conninfo value for recovery.conf after a failover has happened. Any -provided hostname and database name in the template is ignored and they are -replaced with a replication connection to the new primary node. - -Required when ``autofollow`` is true. - -``observers`` (default ``{}``) +``failover_command`` (default ``""``) -This object contains key value pairs like ``{"1.2.3.4": -"http://2.3.4.5:15000"}``. They are used to determine the location of -pglookout observer processes. Observers are processes that don't take any -actions, but simply give a third party viewpoint on the state of the -cluster. Useful especially during net splits. +Shell command to execute in case the node has deemed itself in need of promotion -``poll_observers_on_warning_only`` (default ``False``) +``failover_sleep_time`` (default ``0.0``) -this allows observers to be polled only when replication lag is over -``warning_replication_time_lag`` +Time to sleep after a failover command has been issued. ``http_address`` (default ``""``) @@ -251,55 +242,36 @@ HTTP webserver address, by default pglookout binds to all interfaces. HTTP webserver port. -``replication_state_check_interval`` (default ``10.0``) +``json_state_file_path`` (default ``"/tmp/pglookout_state.json"``) -How often should pglookout check the replication state in order to -make decisions on should the node be promoted. +Location of a JSON state file which describes the state of the +pglookout process. -``failover_sleep_time`` (default ``0.0``) +``known_gone_nodes`` (default ``[]``) -Time to sleep after a failover command has been issued. +Lists nodes that are explicitly known to have left the cluster. If the old +primary is removed in a controlled manner it should be added to this list to +ensure there's no extra delay when making promotion decision. 
+ +``log_level`` (default ``"INFO"``) + +Determines log level of pglookout. ``maintenance_mode_file`` (default ``"/tmp/pglookout_maintenance_mode_file"``) If a file exists in this location, this node will not be considered for promotion to primary. -``missing_master_from_config_timeout`` (default ``15``) - -In seconds the amount of time before we do a failover decision if a -previously existing primary has been removed from the config file and -we have gotten a SIGHUP. - -``alert_file_dir`` (default ``os.getcwd()``) - -Directory in which alert files for replication warning and failover -are created. - -``json_state_file_path`` (default ``"/tmp/pglookout_state.json"``) - -Location of a JSON state file which describes the state of the -pglookout process. - ``max_failover_replication_time_lag`` (default ``120.0``) Replication time lag after which failover_command will be executed and a failover_has_happened file will be created. -``warning_replication_time_lag`` (default ``30.0``) - -Replication time lag at which point to execute -over_warning_limit_command and to create a warning file. - -``failover_command`` (default ``""``) - -Shell command to execute in case the node has deemed itself in need of promotion - -``known_gone_nodes`` (default ``[]``) +``missing_master_from_config_timeout`` (default ``15``) -Lists nodes that are explicitly known to have left the cluster. If the old -primary is removed in a controlled manner it should be added to this list to -ensure there's no extra delay when making promotion decision. +In seconds the amount of time before we do a failover decision if a +previously existing primary has been removed from the config file and +we have gotten a SIGHUP. ``never_promote_these_nodes`` (default ``[]``) @@ -308,6 +280,14 @@ you have primary ``p`` which fails and replicas ``a`` and ```b``, even if ``b`` is ahead but is listed in ``never_promote_these_nodes``, ``a`` will be promoted. +``observers`` (default ``{}``) + +This object contains key value pairs like ``{"1.2.3.4": +"http://2.3.4.5:15000"}``. They are used to determine the location of +pglookout observer processes. Observers are processes that don't take any +actions, but simply give a third party viewpoint on the state of the +cluster. Useful especially during net splits. + ``over_warning_limit_command`` (default ``null``) Shell command to be executed once replication lag is warning_replication_time_lag @@ -316,10 +296,6 @@ Shell command to be executed once replication lag is warning_replication_time_la The key of the entry in ``remote_conns`` that matches this node. -``log_level`` (default ``"INFO"``) - -Determines log level of pglookout. - ``pg_data_directory`` (default ``"/var/lib/pgsql/data"``) PG data directory that needs to be set when autofollow has been turned on. @@ -336,18 +312,35 @@ true. Usually something like "sudo systemctl start postgresql". Command to stop a PostgreSQL process on a node which has autofollow set to true. Usually something like "sudo systemctl start postgresql". -``syslog`` (default ``false``) +``poll_observers_on_warning_only`` (default ``False``) -Determines whether syslog logging should be turned on or not. 
+this allows observers to be polled only when replication lag is over +``warning_replication_time_lag`` -``syslog_address`` (default ``"/dev/log"``) +``primary_conninfo_template`` -Determines syslog address to use in logging (requires syslog to be -true as well) +Connection string or connection info object template to use when setting a new +primary_conninfo value for recovery.conf after a failover has happened. Any +provided hostname and database name in the template is ignored and they are +replaced with a replication connection to the new primary node. -``syslog_facility`` (default ``"local2"``) +Required when ``autofollow`` is true. -Determines syslog log facility. (requires syslog to be true as well) +``remote_conns`` (default ``{}``) + +PG database connection strings that the pglookout process should monitor. +Keys of the object should be names of the remotes and values must be valid +PostgreSQL connection strings or connection info objects. + +``replication_catchup_timeout`` (default ``300.0``) + +How long should pglookout wait for a node to catch up with the primary +before it decides to promote itself. + +``replication_state_check_interval`` (default ``10.0``) + +How often should pglookout check the replication state in order to +make decisions on should the node be promoted. ``statsd`` (default: disabled) @@ -368,13 +361,25 @@ The ``tags`` setting can be used to enter optional tag values for the metrics. Metrics sending follows the `Telegraf spec`_. -.. _`Telegraf spec`: https://github.com/influxdata/telegraf/tree/master/plugins/inputs/statsd +``syslog`` (default ``false``) -``cluster_monitor_health_timeout_seconds`` (default: ``2 * replication_state_check_interval``) +Determines whether syslog logging should be turned on or not. -If set, it will increase the statsd counter `cluster_monitor_health_timeout` if the -`cluster_monitor` thread has not successfully completed a check since -`cluster_monitor_health_timeout_seconds`. +``syslog_address`` (default ``"/dev/log"``) + +Determines syslog address to use in logging (requires syslog to be +true as well) + +``syslog_facility`` (default ``"local2"``) + +Determines syslog log facility. (requires syslog to be true as well) + +``warning_replication_time_lag`` (default ``30.0``) + +Replication time lag at which point to execute +over_warning_limit_command and to create a warning file. + +.. _`Telegraf spec`: https://github.com/influxdata/telegraf/tree/master/plugins/inputs/statsd License ======= diff --git a/pglookout/config.py b/pglookout/config.py new file mode 100644 index 0000000..97f8f69 --- /dev/null +++ b/pglookout/config.py @@ -0,0 +1,51 @@ +# Copyright (c) 2023 Aiven, Helsinki, Finland. 
https://aiven.io/ +from __future__ import annotations + +from typing import Literal, TypedDict + + +class Statsd(TypedDict, total=False): + host: str + port: int + tags: dict[str, str] + + +class Config(TypedDict, total=False): + alert_file_dir: str + autofollow: bool + cluster_monitor_health_timeout_seconds: float + db_poll_interval: float + failover_command: str + failover_sleep_time: float + http_address: str + http_port: int + json_state_file_path: str + known_gone_nodes: list[str] + log_level: Literal["NOTSET", "DEBUG", "INFO", "WARNING", "WARN", "ERROR", "FATAL", "CRITICAL"] + maintenance_mode_file: str + max_failover_replication_time_lag: float + missing_master_from_config_timeout: float + never_promote_these_nodes: list[str] + observers: dict[str, str] + over_warning_limit_command: str + own_db: str + pg_data_directory: str + pg_start_command: str + pg_stop_command: str + poll_observers_on_warning_only: bool + primary_conninfo_template: str + remote_conns: dict[str, str] + replication_catchup_timeout: float + replication_state_check_interval: float + statsd: Statsd + syslog: bool + syslog_address: str + # fmt: off + # https://docs.python.org/3/library/logging.handlers.html#logging.handlers.SysLogHandler.encodePriority + syslog_facility: Literal[ + "auth", "authpriv", "console", "cron", "daemon", "ftp", "kern", "lpr", + "mail", "news", "ntp", "security", "solaris-cron", "syslog", "user", "uucp", + "local0", "local1", "local2", "local3", "local4", "local5", "local6", "local7", + ] + # fmt: on + warning_replication_time_lag: float From bf067b315c30f3f53ec791b5cce6ecaec759d61d Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Thu, 23 Mar 2023 16:23:27 +0100 Subject: [PATCH 07/23] mypy: `pglookout/common_types.py` [BF-1560] --- pglookout/common_types.py | 53 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 pglookout/common_types.py diff --git a/pglookout/common_types.py b/pglookout/common_types.py new file mode 100644 index 0000000..3c20cdf --- /dev/null +++ b/pglookout/common_types.py @@ -0,0 +1,53 @@ +# Copyright (c) 2023 Aiven, Helsinki, Finland. https://aiven.io/ +from __future__ import annotations + +from datetime import datetime +from typing import Any, Dict, TypedDict + + +class ReplicationSlotAsDict(TypedDict, total=True): + slot_name: str + plugin: str + slot_type: str + database: str + catalog_xmin: str + restart_lsn: str + confirmed_flush_lsn: str + state_data: str + + +class MemberState(TypedDict, total=False): + """Represents the state of a member of the cluster. + + Note: + This is a very loose type as no key is mandatory. This is because + it is too dangerous to impose a stricter type until we have a + better test coverage, as it would change some behaviour in the + code (some unconventional behaviour was detected, and it may be a + bug or a feature). + """ + + # Connection Status + connection: bool + fetch_time: str + # Queried Status + db_time: str | datetime + pg_is_in_recovery: bool + pg_last_xact_replay_timestamp: datetime | str | None + pg_last_xlog_receive_location: str | None + pg_last_xlog_replay_location: str | None + # Replication info + replication_slots: list[ReplicationSlotAsDict] + replication_time_lag: float | None + min_replication_time_lag: float + replication_start_time: float | None + + +# Note for future improvements: +# If we want ObserverState to accept arbitrary keys, we have three choices: +# - Use a different type (pydantic, dataclasses, etc.) 
+# - Use a TypedDict for static keys (connection, fetch_time) and a sub-dict for +# dynamic keys (received from state.json). +# - Wait for something like `allow_extra` to be implemented into TypedDict (unlikely) +# https://github.com/python/mypy/issues/4617 +ObserverState = Dict[str, Any] From a462bf92d1f2121c89296e5dac6b5c646894cf6f Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Fri, 17 Mar 2023 10:21:29 +0100 Subject: [PATCH 08/23] mypy: `pglookout/webserver.py` [BF-1560] --- pglookout/webserver.py | 80 +++++++++++++++++++++++++++--------------- pyproject.toml | 2 -- test/test_webserver.py | 19 ++++++---- 3 files changed, 65 insertions(+), 36 deletions(-) diff --git a/pglookout/webserver.py b/pglookout/webserver.py index 0394ca6..09b990c 100644 --- a/pglookout/webserver.py +++ b/pglookout/webserver.py @@ -7,8 +7,13 @@ This file is under the Apache License, Version 2.0. See the file `LICENSE` for details. """ -from http.server import HTTPServer, SimpleHTTPRequestHandler -from logging import getLogger +from __future__ import annotations + +from http.server import BaseHTTPRequestHandler, HTTPServer, SimpleHTTPRequestHandler +from logging import getLogger, Logger +from pglookout.common_types import MemberState +from pglookout.config import Config +from queue import Queue from socketserver import ThreadingMixIn from threading import Thread @@ -17,43 +22,62 @@ class ThreadedWebServer(ThreadingMixIn, HTTPServer): - cluster_state = None - log = None - cluster_monitor_check_queue = None - allow_reuse_address = True + allow_reuse_address: bool = True + + def __init__( + self, + address: str, + port: int, + RequestHandlerClass: type[BaseHTTPRequestHandler], + cluster_state: dict[str, MemberState], + log: Logger, + cluster_monitor_check_queue: Queue[str], + ) -> None: + super().__init__((address, port), RequestHandlerClass) + self.cluster_state: dict[str, MemberState] = cluster_state + self.log: Logger = log + self.cluster_monitor_check_queue: Queue[str] = cluster_monitor_check_queue class WebServer(Thread): - def __init__(self, config, cluster_state, cluster_monitor_check_queue): - Thread.__init__(self) - self.config = config - self.cluster_state = cluster_state - self.cluster_monitor_check_queue = cluster_monitor_check_queue - self.log = getLogger("WebServer") - self.address = self.config.get("http_address", "") - self.port = self.config.get("http_port", 15000) - self.server = None + def __init__( + self, config: Config, cluster_state: dict[str, MemberState], cluster_monitor_check_queue: Queue[str] + ) -> None: + super().__init__() + self.config: Config = config + self.cluster_state: dict[str, MemberState] = cluster_state + self.cluster_monitor_check_queue: Queue[str] = cluster_monitor_check_queue + self.log: Logger = getLogger("WebServer") + self.address: str = self.config.get("http_address", "") + self.port: int = self.config.get("http_port", 15000) + self.server: ThreadedWebServer | None = None self.log.debug("WebServer initialized with address: %r port: %r", self.address, self.port) - self.is_initialized = threading.Event() + self.is_initialized: threading.Event = threading.Event() - def run(self): + def run(self) -> None: # We bind the port only when we start running - self.server = ThreadedWebServer((self.address, self.port), RequestHandler) - self.server.cluster_state = self.cluster_state - self.server.log = self.log - self.server.cluster_monitor_check_queue = self.cluster_monitor_check_queue + self.server = ThreadedWebServer( + address=self.address, + port=self.port, + 
RequestHandlerClass=RequestHandler, + cluster_state=self.cluster_state, + log=self.log, + cluster_monitor_check_queue=self.cluster_monitor_check_queue, + ) self.is_initialized.set() self.server.serve_forever() - def close(self): - if self.server: - self.log.debug("Closing WebServer") - self.server.shutdown() - self.log.debug("Closed WebServer") + def close(self) -> None: + if self.server is None: + return + + self.log.debug("Closing WebServer") + self.server.shutdown() + self.log.debug("Closed WebServer") class RequestHandler(SimpleHTTPRequestHandler): - def do_GET(self): + def do_GET(self) -> None: assert isinstance(self.server, ThreadedWebServer), f"server: {self.server!r}" self.server.log.debug("Got request: %r", self.path) if self.path.startswith("/state.json"): @@ -66,7 +90,7 @@ def do_GET(self): else: self.send_response(404) - def do_POST(self): + def do_POST(self) -> None: assert isinstance(self.server, ThreadedWebServer), f"server: {self.server!r}" self.server.log.debug("Got request: %r", self.path) if self.path.startswith("/check"): diff --git a/pyproject.toml b/pyproject.toml index 7b59a5b..8cae98b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,12 +34,10 @@ exclude = [ 'pglookout/cluster_monitor.py', 'pglookout/pglookout.py', 'pglookout/version.py', - 'pglookout/webserver.py', # Tests. 'test/conftest.py', 'test/test_cluster_monitor.py', 'test/test_lookout.py', - 'test/test_webserver.py', # Other. 'setup.py', 'version.py', diff --git a/test/test_webserver.py b/test/test_webserver.py index d81e71d..a4dffb5 100644 --- a/test/test_webserver.py +++ b/test/test_webserver.py @@ -6,6 +6,10 @@ This file is under the Apache License, Version 2.0. See the file `LICENSE` for details. """ +from __future__ import annotations + +from pglookout.common_types import MemberState +from pglookout.config import Config from pglookout.webserver import WebServer from queue import Queue @@ -13,21 +17,24 @@ import requests -def test_webserver(): - config = { +def test_webserver() -> None: + config: Config = { "http_port": random.randint(10000, 32000), } - cluster_state = { - "hello": 123, + cluster_state: dict[str, MemberState] = { + "1.2.3.4": { + "connection": False, + "fetch_time": "2021-01-01T23:42:11.123456Z", + } } http_port = config["http_port"] base_url = f"http://127.0.0.1:{http_port}" - cluster_monitor_check_queue = Queue() + cluster_monitor_check_queue: Queue[str] = Queue() web = WebServer(config=config, cluster_state=cluster_state, cluster_monitor_check_queue=cluster_monitor_check_queue) try: web.start() - # wait for the thread to have started, else we're blocking forever as web.close can't shutdown the thread + # wait for the thread to have started, else we're blocking forever as web.close can't shut down the thread web.is_initialized.wait(timeout=30.0) result = requests.get(f"{base_url}/state.json", timeout=5).json() From 9b4e13fe3340acc28d35086c8a0ac4bff7b6633e Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Fri, 17 Mar 2023 15:57:59 +0100 Subject: [PATCH 09/23] mypy: `pglookout/cluster_monitor.py` [BF-1560] --- pglookout/cluster_monitor.py | 226 ++++++++++++++++++++++------------- pglookout/config.py | 2 +- pyproject.toml | 6 +- requirements.dev.txt | 1 - test/test_cluster_monitor.py | 40 ++++--- 5 files changed, 169 insertions(+), 106 deletions(-) diff --git a/pglookout/cluster_monitor.py b/pglookout/cluster_monitor.py index 287a79a..a642972 100644 --- a/pglookout/cluster_monitor.py +++ b/pglookout/cluster_monitor.py @@ -7,17 +7,23 @@ This file is under the Apache License, 
Version 2.0. See the file `LICENSE` for details. """ +from __future__ import annotations -from . import logutil -from .common import get_iso_timestamp, parse_iso_datetime -from .pgutil import mask_connection_info from concurrent.futures import as_completed, ThreadPoolExecutor from dataclasses import asdict, dataclass from email.utils import parsedate -from psycopg2.extras import RealDictCursor -from queue import Empty +from logging.handlers import SysLogHandler +from pglookout import logutil +from pglookout.common import get_iso_timestamp, parse_iso_datetime +from pglookout.common_types import MemberState, ObserverState, ReplicationSlotAsDict +from pglookout.config import Config +from pglookout.pgutil import mask_connection_info +from pglookout.statsd import StatsClient +from psycopg2.extensions import POLL_OK, POLL_READ, POLL_WRITE +from psycopg2.extras import RealDictCursor, RealDictRow +from queue import Empty, Queue from threading import Thread -from typing import List +from typing import Callable, cast, Final import datetime import errno @@ -27,6 +33,9 @@ import select import time +# https://www.psycopg.org/docs/connection.html#connection.server_version +PG_VERSION_10: Final[int] = 10_00_00 # 10.0.0 + class PglookoutTimeout(Exception): pass @@ -44,17 +53,17 @@ class ReplicationSlot: state_data: str -def wait_select(conn, timeout=5.0): +def wait_select(conn: psycopg2.connection, timeout: float = 5.0) -> None: end_time = time.monotonic() + timeout while time.monotonic() < end_time: time_left = end_time - time.monotonic() state = conn.poll() try: - if state == psycopg2.extensions.POLL_OK: + if state == POLL_OK: return - if state == psycopg2.extensions.POLL_READ: + if state == POLL_READ: select.select([conn.fileno()], [], [], min(timeout, time_left)) - elif state == psycopg2.extensions.POLL_WRITE: + elif state == POLL_WRITE: select.select([], [conn.fileno()], [], min(timeout, time_left)) else: raise psycopg2.OperationalError(f"bad state from poll: {state}") @@ -67,14 +76,14 @@ def wait_select(conn, timeout=5.0): class ClusterMonitor(Thread): def __init__( self, - config, - cluster_state, - observer_state, - create_alert_file, - cluster_monitor_check_queue, - failover_decision_queue, - is_replication_lag_over_warning_limit, - stats, + config: Config, + cluster_state: dict[str, MemberState], + observer_state: dict[str, ObserverState], + create_alert_file: Callable[[str], None], + cluster_monitor_check_queue: Queue[str], + failover_decision_queue: Queue[str], + is_replication_lag_over_warning_limit: Callable[[], bool], + stats: StatsClient, ): """Thread which collects cluster state. @@ -83,36 +92,42 @@ def __init__( in the cluster_state/observer_state dictionaries, which are shared with the main thread. 
""" Thread.__init__(self) - self.log = logging.getLogger("ClusterMonitor") - self.stats = stats - self.running = True - self.cluster_state = cluster_state - self.observer_state = observer_state - self.config = config - self.create_alert_file = create_alert_file - self.db_conns = {} - self.cluster_monitor_check_queue = cluster_monitor_check_queue - self.failover_decision_queue = failover_decision_queue - self.is_replication_lag_over_warning_limit = is_replication_lag_over_warning_limit - self.session = requests.Session() + self.log: logging.Logger = logging.getLogger("ClusterMonitor") + self.stats: StatsClient = stats + self.running: bool = True + self.cluster_state: dict[str, MemberState] = cluster_state + self.observer_state: dict[str, ObserverState] = observer_state + self.config: Config = config + self.create_alert_file: Callable[[str], None] = create_alert_file + self.db_conns: dict[str, psycopg2.connection | None] = {} + self.cluster_monitor_check_queue: Queue[str] = cluster_monitor_check_queue + self.failover_decision_queue: Queue[str] = failover_decision_queue + self.is_replication_lag_over_warning_limit: Callable[[], bool] = is_replication_lag_over_warning_limit + self.session: requests.Session = requests.Session() if self.config.get("syslog"): - self.syslog_handler = logutil.set_syslog_handler( + # Function `set_syslog_handler` already adds the handler to the provided logger. + # We just keep a reference to it here. + self.syslog_handler: SysLogHandler = logutil.set_syslog_handler( address=self.config.get("syslog_address", "/dev/log"), facility=self.config.get("syslog_facility", "local2"), logger=self.log, ) - self.last_monitoring_success_time = None + self.last_monitoring_success_time: float | None = None self.log.debug("Initialized ClusterMonitor with: %r", cluster_state) - def _connect_to_db(self, instance, dsn): + def _connect_to_db(self, instance: str, dsn: str | None) -> psycopg2.connection | None: conn = self.db_conns.get(instance) + if conn: return conn + if not dsn: self.log.warning("Can't connect to %s, dsn is %r", instance, dsn) return None + masked_connection_info = mask_connection_info(dsn) inst_info_str = f"{instance!r} ({masked_connection_info})" + try: self.log.info("Connecting to %s", inst_info_str) conn = psycopg2.connect(dsn=dsn, async_=True) @@ -133,19 +148,29 @@ def _connect_to_db(self, instance, dsn): self.log.exception("Failed to connect to %s (%s)", instance, inst_info_str) self.stats.unexpected_exception(ex, where="_connect_to_db") conn = None + self.db_conns[instance] = conn return conn - def _fetch_observer_state(self, instance, uri): - result = {"fetch_time": get_iso_timestamp(), "connection": True} + def _fetch_observer_state(self, instance: str, uri: str) -> ObserverState | None: + now_iso = get_iso_timestamp() + result = {"fetch_time": now_iso, "connection": True} fetch_uri = uri + "/state.json" + try: response = self.session.get(fetch_uri, timeout=5.0) # check time difference for large skews - remote_server_time = parsedate(response.headers["date"]) - remote_server_time = datetime.datetime.fromtimestamp(time.mktime(remote_server_time)) - time_diff = parse_iso_datetime(result["fetch_time"]) - remote_server_time + remote_server_ptime = parsedate(response.headers["date"]) + if remote_server_ptime is None: + self.log.error( + "Failed to parse date from observer node %r, response: %r, ignoring response", + instance, + response.json(), + ) + return None + remote_server_time = datetime.datetime.fromtimestamp(time.mktime(remote_server_ptime)) + time_diff = 
parse_iso_datetime(now_iso) - remote_server_time if time_diff > datetime.timedelta(seconds=5): self.log.error( "Time difference between us and observer node %r is %r, response: %r, ignoring response", @@ -168,16 +193,19 @@ def _fetch_observer_state(self, instance, uri): self.log.exception("Problem in fetching state from observer: %r, %r", instance, fetch_uri) self.stats.unexpected_exception(ex, where="_fetch_observer_state") result["connection"] = False + return result - def fetch_observer_state(self, instance, uri): + def fetch_observer_state(self, instance: str, uri: str) -> None: start_time = time.monotonic() result = self._fetch_observer_state(instance, uri) + if result: if instance in self.observer_state: self.observer_state[instance].update(result) else: self.observer_state[instance] = result + self.log.debug( "Observer: %r state was: %r, took: %.4fs to fetch", instance, @@ -185,18 +213,20 @@ def fetch_observer_state(self, instance, uri): time.monotonic() - start_time, ) - def connect_to_cluster_nodes_and_cleanup_old_nodes(self): + def connect_to_cluster_nodes_and_cleanup_old_nodes(self) -> None: leftover_conns = set(self.db_conns) - set(self.config.get("remote_conns", {})) + for leftover_instance in leftover_conns: self.log.debug("Removing leftover state for: %r", leftover_instance) self.db_conns.pop(leftover_instance) - self.cluster_state.pop(leftover_instance, "") - self.observer_state.pop(leftover_instance, "") + self.cluster_state.pop(leftover_instance, None) + self.observer_state.pop(leftover_instance, None) + # Making sure we have a connection to all currently configured db hosts for instance, connect_string in self.config.get("remote_conns", {}).items(): self._connect_to_db(instance, dsn=connect_string) - def _fetch_replication_slot_info(self, instance: str, cursor: RealDictCursor) -> List[ReplicationSlot]: + def _fetch_replication_slot_info(self, instance: str, cursor: RealDictCursor) -> list[ReplicationSlot]: """Fetch logical replication slot definitions""" self.log.debug("reading replication slot state from %r", instance) @@ -217,59 +247,49 @@ def _fetch_replication_slot_info(self, instance: str, cursor: RealDictCursor) -> """ ) wait_select(cursor.connection) - replication_slots = [ReplicationSlot(**slot) for slot in cursor.fetchall()] + replication_slots = [ + ReplicationSlot(**cast(RealDictRow, slot)) for slot in cursor.fetchall() # type: ignore[redundant-cast] + ] self.log.debug("found %d replication slot(s)", len(replication_slots)) return replication_slots - def _query_cluster_member_state(self, instance, db_conn): + def _query_cluster_member_state(self, instance: str, db_conn: psycopg2.connection | None) -> MemberState: """Query a single cluster member for its state""" - f_result = None - result = {"fetch_time": get_iso_timestamp(), "connection": False} + f_result: MemberState | None = None + result: MemberState = {"fetch_time": get_iso_timestamp(), "connection": False} + if not db_conn: - db_conn = self._connect_to_db(instance, self.config["remote_conns"].get(instance)) + dsn: str | None = self.config["remote_conns"].get(instance) + db_conn = self._connect_to_db(instance, dsn) if not db_conn: return result + phase = "querying status from" try: self.log.debug("%s %r", phase, instance) + c = db_conn.cursor(cursor_factory=RealDictCursor) - if db_conn.server_version >= 100000: - fields = [ - "now() AS db_time", - "pg_is_in_recovery()", - "pg_last_xact_replay_timestamp()", - "pg_last_wal_receive_lsn() AS pg_last_xlog_receive_location", - "pg_last_wal_replay_lsn() AS 
pg_last_xlog_replay_location", - ] - else: - fields = [ - "now() AS db_time", - "pg_is_in_recovery()", - "pg_last_xact_replay_timestamp()", - "pg_last_xlog_receive_location()", - "pg_last_xlog_replay_location()", - ] - joined_fields = ", ".join(fields) - c.execute(f"SELECT {joined_fields}") + + c.execute(self._get_statement_query_status(db_conn.server_version)) wait_select(c.connection) - maybe_standby_result = c.fetchone() + maybe_standby_result: MemberState = cast(MemberState, c.fetchone()) + if maybe_standby_result["pg_is_in_recovery"]: f_result = maybe_standby_result else: # First try reading current WAL LSN separately as txid_current may fail in some cases phase = "getting master LSN position" - if db_conn.server_version >= 100000: - wal_lsn_column = "pg_current_wal_lsn() AS pg_last_xlog_replay_location" - else: - wal_lsn_column = "pg_current_xlog_location() AS pg_last_xlog_replay_location" - c.execute(f"SELECT {wal_lsn_column}") + + c.execute(self._get_statement_query_master_lsn_position(db_conn.server_version)) wait_select(c.connection) - master_position = c.fetchone() + master_position: RealDictRow = cast(RealDictRow, c.fetchone()) maybe_standby_result["pg_last_xlog_replay_location"] = master_position["pg_last_xlog_replay_location"] f_result = maybe_standby_result - if db_conn.server_version >= 100000: - f_result["replication_slots"] = [asdict(slot) for slot in self._fetch_replication_slot_info(instance, c)] + if db_conn.server_version >= PG_VERSION_10: + f_result["replication_slots"] = [ + cast(ReplicationSlotAsDict, asdict(slot)) for slot in self._fetch_replication_slot_info(instance, c) + ] # This is only run on masters to create txid traffic every db_poll_interval phase = "updating transaction on" @@ -277,9 +297,10 @@ def _query_cluster_member_state(self, instance, db_conn): # With pg_current_wal_lsn we simulate replay_location on the master # With txid_current we force a new transaction to occur every poll interval to ensure there's # a heartbeat for the replication lag. 
- c.execute(f"SELECT txid_current(), {wal_lsn_column}") + c.execute(self._get_statement_query_updating_transaction(db_conn.server_version)) wait_select(c.connection) - master_result = c.fetchone() + master_result: RealDictRow = cast(RealDictRow, c.fetchone()) + f_result["pg_last_xlog_replay_location"] = master_result["pg_last_xlog_replay_location"] except ( PglookoutTimeout, @@ -296,12 +317,45 @@ def _query_cluster_member_state(self, instance, db_conn): return result @staticmethod - def _parse_status_query_result(result): + def _get_statement_query_status(server_version: int) -> str: + if server_version >= PG_VERSION_10: + return ( + "SELECT now() AS db_time, " + "pg_is_in_recovery(), " + "pg_last_xact_replay_timestamp(), " + "pg_last_wal_receive_lsn() AS pg_last_xlog_receive_location, " + "pg_last_wal_replay_lsn() AS pg_last_xlog_replay_location" + ) + return ( + "SELECT now() AS db_time, " + "pg_is_in_recovery(), " + "pg_last_xact_replay_timestamp(), " + "pg_last_xlog_receive_location(), " + "pg_last_xlog_replay_location()" + ) + + @staticmethod + def _get_statement_query_master_lsn_position(server_version: int) -> str: + if server_version >= PG_VERSION_10: + return "SELECT pg_current_wal_lsn() AS pg_last_xlog_replay_location" + return "SELECT pg_current_xlog_location() AS pg_last_xlog_replay_location" + + @staticmethod + def _get_statement_query_updating_transaction(server_version: int) -> str: + if server_version >= PG_VERSION_10: + return "SELECT txid_current(), pg_current_wal_lsn() AS pg_last_xlog_replay_location" + return "SELECT txid_current(), pg_current_xlog_location() AS pg_last_xlog_replay_location" + + # FIXME: Find a tighter input + return type + @staticmethod + def _parse_status_query_result(result: MemberState) -> MemberState: if not result: return {} + + db_time = cast(datetime.datetime, result["db_time"]) # abs is for catching time travel (as in going from the future to the past - if result["pg_last_xact_replay_timestamp"]: - replication_time_lag = abs(result["db_time"] - result["pg_last_xact_replay_timestamp"]) + if isinstance(result["pg_last_xact_replay_timestamp"], datetime.datetime): + replication_time_lag: datetime.timedelta = abs(db_time - result["pg_last_xact_replay_timestamp"]) result["replication_time_lag"] = replication_time_lag.total_seconds() result["pg_last_xact_replay_timestamp"] = get_iso_timestamp(result["pg_last_xact_replay_timestamp"]) @@ -317,10 +371,10 @@ def _parse_status_query_result(result): "replication_time_lag": None, # differentiate from actual lag=0.0 } ) - result.update({"db_time": get_iso_timestamp(result["db_time"]), "connection": True}) + result.update({"db_time": get_iso_timestamp(db_time), "connection": True}) return result - def update_cluster_member_state(self, instance, db_conn): + def update_cluster_member_state(self, instance: str, db_conn: psycopg2.connection | None) -> None: """Update the cluster state entry for a single cluster member""" start_time = time.monotonic() result = self._query_cluster_member_state(instance, db_conn) @@ -348,7 +402,7 @@ def update_cluster_member_state(self, instance, db_conn): else: self.cluster_state[instance]["min_replication_time_lag"] = min(min_lag, now_lag) - def main_monitoring_loop(self, requested_check=False): + def main_monitoring_loop(self, requested_check: bool = False) -> None: self.connect_to_cluster_nodes_and_cleanup_old_nodes() thread_count = len(self.db_conns) + len(self.config.get("observers", {})) futures = [] @@ -367,12 +421,14 @@ def main_monitoring_loop(self, requested_check=False): 
        self.last_monitoring_success_time = time.monotonic()

-    def run(self):
+    def run(self) -> None:
         self.main_monitoring_loop()
         while self.running:
             requested_check = False
             try:
-                requested_check = self.cluster_monitor_check_queue.get(timeout=self.config.get("db_poll_interval", 5.0))
+                requested_check = bool(
+                    self.cluster_monitor_check_queue.get(timeout=self.config.get("db_poll_interval", 5.0))
+                )
             except Empty:
                 pass
             self.main_monitoring_loop(requested_check)
diff --git a/pglookout/config.py b/pglookout/config.py
index 97f8f69..761263d 100644
--- a/pglookout/config.py
+++ b/pglookout/config.py
@@ -34,7 +34,7 @@ class Config(TypedDict, total=False):
     pg_stop_command: str
     poll_observers_on_warning_only: bool
     primary_conninfo_template: str
-    remote_conns: dict[str, str]
+    remote_conns: dict[str, str]  # instance name -> dsn
     replication_catchup_timeout: float
     replication_state_check_interval: float
     statsd: Statsd
diff --git a/pyproject.toml b/pyproject.toml
index 8cae98b..e01c7e7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,18 +31,20 @@ files = [
 exclude = [
     # Implementation.
     'pglookout/__main__.py',
-    'pglookout/cluster_monitor.py',
     'pglookout/pglookout.py',
     'pglookout/version.py',
     # Tests.
     'test/conftest.py',
-    'test/test_cluster_monitor.py',
     'test/test_lookout.py',
     # Other.
     'setup.py',
     'version.py',
 ]

+[[tool.mypy.overrides]]
+module = "test.test_cluster_monitor"
+# ignore no-untyped-call because conftest can only be type hinted at the end. Remove at the end.
+disallow_untyped_calls = false

 [tool.pylint.'MESSAGES CONTROL']
 disable = [
diff --git a/requirements.dev.txt b/requirements.dev.txt
index dca3637..1dd3f89 100644
--- a/requirements.dev.txt
+++ b/requirements.dev.txt
@@ -2,7 +2,6 @@ black==22.10.0
 flake8
 flake8-pyproject==1.2.2
 isort==5.10.1
-mock
 mypy==1.1.1
 types-psycopg2==2.9.21.8
 types-requests==2.26.1
diff --git a/test/test_cluster_monitor.py b/test/test_cluster_monitor.py
index 58d3ab7..710ccee 100644
--- a/test/test_cluster_monitor.py
+++ b/test/test_cluster_monitor.py
@@ -7,12 +7,15 @@
 from .conftest import TestPG
 from contextlib import closing
 from datetime import datetime, timedelta
-from mock import patch
 from packaging import version
 from pglookout import statsd
 from pglookout.cluster_monitor import ClusterMonitor
+from pglookout.common_types import MemberState, ObserverState
+from pglookout.config import Config
 from psycopg2.extras import RealDictCursor
 from queue import Queue
+from typing import NoReturn
+from unittest.mock import patch

 import base64
 import psycopg2
@@ -20,28 +23,31 @@
 import time


-def test_replication_lag():
+def test_replication_lag() -> None:
     # pylint: disable=protected-access
     now = datetime.now()
-    status = {
+    status: MemberState = {
         "db_time": now,
         "pg_is_in_recovery": True,
         "pg_last_xact_replay_timestamp": now,
         "pg_last_xlog_receive_location": "0/0000001",
         "pg_last_xlog_replay_location": "0/0000002",
     }
+
     result = ClusterMonitor._parse_status_query_result(status.copy())
     assert result["replication_time_lag"] == 0.0
-    status["db_time"] += timedelta(seconds=50, microseconds=42)
+
+    status["db_time"] = now + timedelta(seconds=50, microseconds=42)
     result = ClusterMonitor._parse_status_query_result(status.copy())
     assert result["replication_time_lag"] == 50.000042
+
     status["db_time"] = now + timedelta(hours=42)
     result = ClusterMonitor._parse_status_query_result(status.copy())
     assert result["replication_time_lag"] == 151200.0


-def test_main_loop(db):
-    config = {
+def test_main_loop(db: TestPG) -> None:
+    config: Config = {
         "remote_conns": {
             "test1db":
db.connection_string("testuser"), "test2db": db.connection_string("otheruser"), @@ -49,14 +55,14 @@ def test_main_loop(db): "observers": {"local": "URL"}, "poll_observers_on_warning_only": True, } - cluster_state = {} - observer_state = {} + cluster_state: dict[str, MemberState] = {} + observer_state: dict[str, ObserverState] = {} - def create_alert_file(arg): + def create_alert_file(arg: str) -> NoReturn: raise Exception(arg) - cluster_monitor_check_queue = Queue() - failover_decision_queue = Queue() + cluster_monitor_check_queue: Queue[str] = Queue() + failover_decision_queue: Queue[str] = Queue() cm = ClusterMonitor( config=config, @@ -103,7 +109,7 @@ def test_fetch_replication_slot_info(db: TestPG) -> None: if version.parse(db.pgver) < version.parse("10"): pytest.skip(f"unsupported pg version: {db.pgver}") - config = { + config: Config = { "remote_conns": { "test1db": db.connection_string("testuser"), "test2db": db.connection_string("otheruser"), @@ -111,14 +117,14 @@ def test_fetch_replication_slot_info(db: TestPG) -> None: "observers": {"local": "URL"}, "poll_observers_on_warning_only": True, } - cluster_state = {} - observer_state = {} + cluster_state: dict[str, MemberState] = {} + observer_state: dict[str, ObserverState] = {} - def create_alert_file(arg): + def create_alert_file(arg: str) -> NoReturn: raise Exception(arg) - cluster_monitor_check_queue = Queue() - failover_decision_queue = Queue() + cluster_monitor_check_queue: Queue[str] = Queue() + failover_decision_queue: Queue[str] = Queue() cm = ClusterMonitor( config=config, From 45d413b20e0e362c2080e80484fb626db7058752 Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Thu, 23 Mar 2023 15:35:45 +0100 Subject: [PATCH 10/23] mypy: `pglookout/pglookout.py` [BF-1560] --- pglookout/cluster_monitor.py | 10 +- pglookout/common_types.py | 21 +- pglookout/current_master.py | 3 +- pglookout/default.py | 10 + pglookout/pglookout.py | 561 ++++++++++++++++++++++------------- pyproject.toml | 1 - test/test_cluster_monitor.py | 6 +- 7 files changed, 395 insertions(+), 217 deletions(-) create mode 100644 pglookout/default.py diff --git a/pglookout/cluster_monitor.py b/pglookout/cluster_monitor.py index a642972..29eab48 100644 --- a/pglookout/cluster_monitor.py +++ b/pglookout/cluster_monitor.py @@ -15,7 +15,7 @@ from logging.handlers import SysLogHandler from pglookout import logutil from pglookout.common import get_iso_timestamp, parse_iso_datetime -from pglookout.common_types import MemberState, ObserverState, ReplicationSlotAsDict +from pglookout.common_types import MemberState, ObservedState, ReplicationSlotAsDict from pglookout.config import Config from pglookout.pgutil import mask_connection_info from pglookout.statsd import StatsClient @@ -78,7 +78,7 @@ def __init__( self, config: Config, cluster_state: dict[str, MemberState], - observer_state: dict[str, ObserverState], + observer_state: dict[str, ObservedState], create_alert_file: Callable[[str], None], cluster_monitor_check_queue: Queue[str], failover_decision_queue: Queue[str], @@ -96,7 +96,7 @@ def __init__( self.stats: StatsClient = stats self.running: bool = True self.cluster_state: dict[str, MemberState] = cluster_state - self.observer_state: dict[str, ObserverState] = observer_state + self.observer_state: dict[str, ObservedState] = observer_state self.config: Config = config self.create_alert_file: Callable[[str], None] = create_alert_file self.db_conns: dict[str, psycopg2.connection | None] = {} @@ -152,7 +152,7 @@ def _connect_to_db(self, instance: str, dsn: str | None) -> 
psycopg2.connection self.db_conns[instance] = conn return conn - def _fetch_observer_state(self, instance: str, uri: str) -> ObserverState | None: + def _fetch_observer_state(self, instance: str, uri: str) -> ObservedState | None: now_iso = get_iso_timestamp() result = {"fetch_time": now_iso, "connection": True} fetch_uri = uri + "/state.json" @@ -194,7 +194,7 @@ def _fetch_observer_state(self, instance: str, uri: str) -> ObserverState | None self.stats.unexpected_exception(ex, where="_fetch_observer_state") result["connection"] = False - return result + return cast(ObservedState, result) def fetch_observer_state(self, instance: str, uri: str) -> None: start_time = time.monotonic() diff --git a/pglookout/common_types.py b/pglookout/common_types.py index 3c20cdf..43fd3fa 100644 --- a/pglookout/common_types.py +++ b/pglookout/common_types.py @@ -2,7 +2,7 @@ from __future__ import annotations from datetime import datetime -from typing import Any, Dict, TypedDict +from typing import TypedDict, Union class ReplicationSlotAsDict(TypedDict, total=True): @@ -44,10 +44,25 @@ class MemberState(TypedDict, total=False): # Note for future improvements: -# If we want ObserverState to accept arbitrary keys, we have three choices: +# If we want ObservedState to accept arbitrary keys, we have three choices: # - Use a different type (pydantic, dataclasses, etc.) # - Use a TypedDict for static keys (connection, fetch_time) and a sub-dict for # dynamic keys (received from state.json). # - Wait for something like `allow_extra` to be implemented into TypedDict (unlikely) # https://github.com/python/mypy/issues/4617 -ObserverState = Dict[str, Any] +class ObservedState(dict[str, Union[MemberState, bool, str]]): + """Represents an observed state, from the perspective of an observer. + + Note: + The content of this type is dynamic, as it depends on the number of + members in the cluster. There are two static keys, connection and + fetch_time, and N dynamic keys, one for each member of the cluster. + Like so:: + + connection: bool + fetch_time: str + name_or_ip_1: MemberState + name_or_ip_2: MemberState + ... + name_or_ip_N: MemberState + """ diff --git a/pglookout/current_master.py b/pglookout/current_master.py index 8f7154f..f6dfb54 100644 --- a/pglookout/current_master.py +++ b/pglookout/current_master.py @@ -11,6 +11,7 @@ from __future__ import annotations from . import version +from pglookout.default import JSON_STATE_FILE_PATH import argparse import json @@ -43,7 +44,7 @@ def main(args: list[str] | None = None) -> int: try: with open(arg.state, "r") as fp: config = json.load(fp) - state_file_path = config.get("json_state_file_path", "/tmp/pglookout_state.json") # pylint: disable=no-member + state_file_path = config.get("json_state_file_path", JSON_STATE_FILE_PATH) # pylint: disable=no-member if time.monotonic() - os.stat(state_file_path).st_mtime > 60.0: # file older than one minute, pglookout probably dead, exit with minus one return -1 diff --git a/pglookout/default.py b/pglookout/default.py new file mode 100644 index 0000000..e6e2288 --- /dev/null +++ b/pglookout/default.py @@ -0,0 +1,10 @@ +# Copyright (c) 2023 Aiven, Helsinki, Finland. 
https://aiven.io/ +from typing import Final + +WARNING_REPLICATION_TIME_LAG: Final[float] = 30.0 +MAX_FAILOVER_REPLICATION_TIME_LAG: Final[float] = 120.0 +REPLICATION_CATCHUP_TIMEOUT: Final[float] = 300.0 +MISSING_MASTER_FROM_CONFIG_TIMEOUT: Final[float] = 15.0 +MAINTENANCE_MODE_FILE: Final[str] = "/tmp/pglookout_maintenance_mode_file" +PG_DATA_DIRECTORY: Final[str] = "/var/lib/pgsql/data" +JSON_STATE_FILE_PATH: Final[str] = "/tmp/pglookout_state.json" diff --git a/pglookout/pglookout.py b/pglookout/pglookout.py index 1c1bc16..abda47a 100755 --- a/pglookout/pglookout.py +++ b/pglookout/pglookout.py @@ -7,72 +7,96 @@ This file is under the Apache License, Version 2.0. See the file `LICENSE` for details. """ -from . import logutil, statsd, version -from .cluster_monitor import ClusterMonitor -from .common import convert_xlog_location_to_offset, get_iso_timestamp, parse_iso_datetime -from .pgutil import create_connection_string, get_connection_info, get_connection_info_from_config_line -from .webserver import WebServer -from packaging.version import parse -from psycopg2.extensions import adapt +from __future__ import annotations + +from argparse import ArgumentParser +from copy import deepcopy +from datetime import datetime, timedelta +from logging import DEBUG, getLevelNamesMapping, getLogger, Logger +from logging.handlers import SysLogHandler +from packaging.version import parse as parse_version +from pathlib import Path +from pglookout.cluster_monitor import ClusterMonitor +from pglookout.common import convert_xlog_location_to_offset, get_iso_timestamp, parse_iso_datetime +from pglookout.common_types import MemberState, ObservedState +from pglookout.config import Config, Statsd +from pglookout.default import ( + JSON_STATE_FILE_PATH, + MAINTENANCE_MODE_FILE, + MAX_FAILOVER_REPLICATION_TIME_LAG, + MISSING_MASTER_FROM_CONFIG_TIMEOUT, + PG_DATA_DIRECTORY, + REPLICATION_CATCHUP_TIMEOUT, + WARNING_REPLICATION_TIME_LAG, +) +from pglookout.logutil import configure_logging, notify_systemd, set_syslog_handler +from pglookout.pgutil import ( + ConnectionParameterKeywords, + create_connection_string, + get_connection_info, + get_connection_info_from_config_line, +) +from pglookout.statsd import StatsClient +from pglookout.version import __version__ +from pglookout.webserver import WebServer +from psycopg2.extensions import adapt, QuotedString from queue import Empty, Queue -from typing import Optional +from signal import SIGHUP, SIGINT, signal, SIGTERM +from socket import gethostname +from subprocess import CalledProcessError, check_call +from types import FrameType +from typing import cast, Final, Optional -import argparse -import copy -import datetime import json -import logging -import logging.handlers -import os -import signal -import socket -import subprocess import sys import time +DEFAULT_LOG_LEVEL: Final[str] = "DEBUG" +LOG_LEVEL_NAMES_MAPPING: Final[dict[str, int]] = getLevelNamesMapping() + class PgLookout: - def __init__(self, config_path): - self.log = logging.getLogger("pglookout") + def __init__(self, config_path: Path | str) -> None: + self.log: Logger = getLogger("pglookout") # dummy to make sure we never get an AttributeError -> gets overwritten after the first config loading - self.stats = statsd.StatsClient(host=None) - self.running = True - self.replication_lag_over_warning_limit = False - - self.config_path = config_path - self.config = {} - self.log_level = "DEBUG" - - self.connected_master_nodes = {} - self.disconnected_master_nodes = {} - self.connected_observer_nodes = {} - 
self.disconnected_observer_nodes = {} - self.replication_catchup_timeout = None - self.replication_lag_warning_boundary = None - self.replication_lag_failover_timeout = None - self.missing_master_from_config_timeout = None - self.own_db = None - self.current_master = None - self.failover_command = None - self.known_gone_nodes = None - self.over_warning_limit_command = None - self.never_promote_these_nodes = None - self.primary_conninfo_template = None - self.cluster_monitor = None - self.syslog_handler = None - self.cluster_nodes_change_time = time.monotonic() - self.cluster_monitor_check_queue = Queue() - self.failover_decision_queue = Queue() - self.observer_state_newer_than = datetime.datetime.min - self._start_time = None + self.stats: StatsClient = StatsClient(host=None) + self.running: bool = True + self.replication_lag_over_warning_limit: bool = False + + self.config_path: Path = Path(config_path) + self.config: Config = {} + self.log_level: int = LOG_LEVEL_NAMES_MAPPING[DEFAULT_LOG_LEVEL] + + self.connected_master_nodes: dict[str, MemberState] = {} + self.disconnected_master_nodes: dict[str, MemberState] = {} + self.connected_observer_nodes: dict[str, str | None] = {} # name => ISO fetch time + self.disconnected_observer_nodes: dict[str, str | None] = {} # name => ISO fetch time + self.replication_catchup_timeout: float = REPLICATION_CATCHUP_TIMEOUT + self.replication_lag_warning_boundary: float = WARNING_REPLICATION_TIME_LAG + self.replication_lag_failover_timeout: float = MAX_FAILOVER_REPLICATION_TIME_LAG + self.missing_master_from_config_timeout: float = MISSING_MASTER_FROM_CONFIG_TIMEOUT + self.own_db: str = "" + self.current_master: str | None = None + self.failover_command: list[str] = [] + self.known_gone_nodes: list[str] = [] + self.over_warning_limit_command: str | None = None + self.never_promote_these_nodes: list[str] = [] + self.primary_conninfo_template: ConnectionParameterKeywords = {} + self.cluster_monitor: ClusterMonitor | None = None + self.syslog_handler: SysLogHandler | None = None + self.cluster_nodes_change_time: float = time.monotonic() + self.cluster_monitor_check_queue: Queue[str] = Queue() + self.failover_decision_queue: Queue[str] = Queue() + self.observer_state_newer_than: datetime = datetime.min + self._start_time: float | None = None self.load_config() - signal.signal(signal.SIGHUP, self.load_config) - signal.signal(signal.SIGINT, self.quit) - signal.signal(signal.SIGTERM, self.quit) + signal(SIGHUP, self.load_config_from_signal) + signal(SIGINT, self.quit) + signal(SIGTERM, self.quit) - self.cluster_state = {} - self.observer_state = {} + self.cluster_state: dict[str, MemberState] = {} + self.observer_state: dict[str, ObservedState] = {} self.cluster_monitor = ClusterMonitor( config=self.config, @@ -86,33 +110,71 @@ def __init__(self, config_path): ) # cluster_monitor doesn't exist at the time of reading the config initially self.cluster_monitor.log.setLevel(self.log_level) - self.webserver = WebServer(self.config, self.cluster_state, self.cluster_monitor_check_queue) + self.webserver: WebServer = WebServer(self.config, self.cluster_state, self.cluster_monitor_check_queue) + + if not self._is_initial_state_valid(): + self.log.error("Initial state is invalid, exiting.") + sys.exit(1) - logutil.notify_systemd("READY=1") + notify_systemd("READY=1") self.log.info( "PGLookout initialized, local hostname: %r, own_db: %r, cwd: %r", - socket.gethostname(), + gethostname(), self.own_db, - os.getcwd(), + Path.cwd(), ) - def quit(self, _signal=None, 
_frame=None): + def _is_initial_state_valid(self) -> bool: + """Check if the initial state of PgLookout is valid. + + Note: + This method is only needed because we had to pick some default values before loading the config. + A better approach would be to load the config first, and then initialize PgLookout. + For that, :any:`load_config` should be split into two methods: one for loading the config, and one for + applying the config when doing hot reloads. The method loading the config should make minimal usage of + ``self`` and return a new ``Config`` object. The method applying the config should accept a ``Config`` + object and apply it to ``self``. Meanwhile, ``__init__`` should use this ``Config`` object to initialize + itself, and it could be in different ways than during hot reloads (some internals don't depend on the config). + + Note: + To developers: Any attribute that should be mandatory in the config should be checked here. + """ + is_valid = True + + if not self.config or not self.config_path.is_file(): + self.log.error("Config is empty!") + is_valid = False + + if not self.own_db: + self.log.error("`own_db` has not been set!") + is_valid = False + + return is_valid + + def quit(self, _signal: int | None = None, _frame: FrameType | None = None) -> None: + if self.cluster_monitor is None: + raise RuntimeError("Cluster monitor is not initialized!") + self.log.warning("Quitting, signal: %r, frame: %r", _signal, _frame) self.cluster_monitor.running = False self.running = False self.webserver.close() - def load_config(self, _signal=None, _frame=None): + def load_config_from_signal(self, _signal: int, _frame: FrameType | None = None) -> None: self.log.debug( "Loading JSON config from: %r, signal: %r, frame: %r", self.config_path, _signal, _frame, ) + self.load_config() + + def load_config(self) -> None: + self.log.debug("Loading JSON config from: %r", self.config_path) previous_remote_conns = self.config.get("remote_conns") try: - with open(self.config_path) as fp: + with self.config_path.open() as fp: self.config = json.load(fp) except Exception as ex: # pylint: disable=broad-except self.log.exception("Invalid JSON config, exiting") @@ -120,35 +182,35 @@ def load_config(self, _signal=None, _frame=None): sys.exit(1) # statsd settings may have changed - stats = self.config.get("statsd", {}) - self.stats = statsd.StatsClient(host=stats.get("host"), port=stats.get("port"), tags=stats.get("tags")) + stats: Statsd = self.config.get("statsd", {}) + self.stats = StatsClient(**stats) if previous_remote_conns != self.config.get("remote_conns"): self.cluster_nodes_change_time = time.monotonic() - if self.config.get("autofollow"): + if self.config.get("autofollow", False): try: self.primary_conninfo_template = get_connection_info(self.config["primary_conninfo_template"]) except (KeyError, ValueError): self.log.exception("Invalid or missing primary_conninfo_template; not enabling autofollow") self.config["autofollow"] = False - if self.cluster_monitor: - self.cluster_monitor.config = copy.deepcopy(self.config) + if self.cluster_monitor is not None: + self.cluster_monitor.config = deepcopy(self.config) - if self.config.get("syslog") and not self.syslog_handler: - self.syslog_handler = logutil.set_syslog_handler( + if self.config.get("syslog") and self.syslog_handler is None: + self.syslog_handler = set_syslog_handler( address=self.config.get("syslog_address", "/dev/log"), facility=self.config.get("syslog_facility", "local2"), - logger=logging.getLogger(), + logger=getLogger(), ) - self.own_db = 
self.config.get("own_db")
+        self.own_db = self.config.get("own_db", "")

-        log_level_name = self.config.get("log_level", "DEBUG")
-        self.log_level = getattr(logging, log_level_name)
+        log_level_name = self.config.get("log_level", DEFAULT_LOG_LEVEL)
+        self.log_level = LOG_LEVEL_NAMES_MAPPING[log_level_name]
         try:
             self.log.setLevel(self.log_level)
-            if self.cluster_monitor:
+            if self.cluster_monitor is not None:
                 self.cluster_monitor.log.setLevel(self.log_level)
         except ValueError:
             print(f"Problem setting log level {self.log_level!r}")
@@ -156,12 +218,17 @@ def load_config(self, _signal=None, _frame=None):
         self.known_gone_nodes = self.config.get("known_gone_nodes", [])
         self.never_promote_these_nodes = self.config.get("never_promote_these_nodes", [])
         # we need the failover_command to be converted into subprocess [] format
+        # XXX BF-1971: The next two lines are potentially unsafe. We should use shlex.split instead.
         self.failover_command = self.config.get("failover_command", "").split()
         self.over_warning_limit_command = self.config.get("over_warning_limit_command")
-        self.replication_lag_warning_boundary = self.config.get("warning_replication_time_lag", 30.0)
-        self.replication_lag_failover_timeout = self.config.get("max_failover_replication_time_lag", 120.0)
-        self.replication_catchup_timeout = self.config.get("replication_catchup_timeout", 300.0)
-        self.missing_master_from_config_timeout = self.config.get("missing_master_from_config_timeout", 15.0)
+        self.replication_lag_warning_boundary = self.config.get("warning_replication_time_lag", WARNING_REPLICATION_TIME_LAG)
+        self.replication_lag_failover_timeout = self.config.get(
+            "max_failover_replication_time_lag", MAX_FAILOVER_REPLICATION_TIME_LAG
+        )
+        self.replication_catchup_timeout = self.config.get("replication_catchup_timeout", REPLICATION_CATCHUP_TIMEOUT)
+        self.missing_master_from_config_timeout = self.config.get(
+            "missing_master_from_config_timeout", MISSING_MASTER_FROM_CONFIG_TIMEOUT
+        )

         if self.replication_lag_warning_boundary >= self.replication_lag_failover_timeout:
             msg = "Replication lag warning boundary (%s) is not lower than its failover timeout (%s)"
@@ -177,14 +244,14 @@ def load_config(self, _signal=None, _frame=None):
         self.log.debug("Loaded config: %r from: %r", self.config, self.config_path)
         self.cluster_monitor_check_queue.put("new config came, recheck")

-    def write_cluster_state_to_json_file(self):
+    def write_cluster_state_to_json_file(self) -> None:
         """Periodically write a JSON state file to disk

         Currently only used to share state with the current_master helper command; pglookout
         itself does not rely on this file.
""" start_time = time.monotonic() - state_file_path = self.config.get("json_state_file_path", "/tmp/pglookout_state.json") + state_file_path = Path(self.config.get("json_state_file_path", JSON_STATE_FILE_PATH)) overall_state = { "db_nodes": self.cluster_state, "observer_nodes": self.observer_state, @@ -193,13 +260,15 @@ def write_cluster_state_to_json_file(self): try: json_to_dump = json.dumps(overall_state, indent=4) self.log.debug( - "Writing JSON state file to: %r, file_size: %r", + "Writing JSON state file to: %s, file_size: %r", state_file_path, len(json_to_dump), ) - with open(state_file_path + ".tmp", "w") as fp: - fp.write(json_to_dump) - os.rename(state_file_path + ".tmp", state_file_path) + + state_file_path_tmp = state_file_path.with_name(f"{state_file_path.name}.tmp") + state_file_path_tmp.write_text(json_to_dump) + state_file_path_tmp.rename(state_file_path) + self.log.debug( "Wrote JSON state file to disk, took %.4fs", time.monotonic() - start_time, @@ -212,88 +281,130 @@ def write_cluster_state_to_json_file(self): ) self.stats.unexpected_exception(ex, where="write_cluster_state_to_json_file") - def create_node_map(self, cluster_state, observer_state): + def create_node_map( + self, cluster_state: dict[str, MemberState], observer_state: dict[str, ObservedState] + ) -> tuple[str | None, MemberState | None, dict[str, MemberState]]: """Computes roles for each known member of cluster. - Use the information gathered in the cluster_state and observer_state to figure out the roles of each member.""" - standby_nodes, master_node, master_instance = {}, None, None - connected_master_nodes, disconnected_master_nodes = {}, {} - connected_observer_nodes, disconnected_observer_nodes = {}, {} + Use the information gathered in the ``cluster_state`` and ``observer_state`` to figure out the roles of each member. 
+ + Returns: + A 3-tuple with the following elements: + - The name of the master instance + - The state of the master instance + - A dictionary of the standby instances and their states + """ + master_instance: str | None = None + master_node: MemberState | None = None + standby_nodes: dict[str, MemberState] = {} + + connected_master_nodes: dict[str, MemberState] = {} + disconnected_master_nodes: dict[str, MemberState] = {} + connected_observer_nodes: dict[str, str | None] = {} + disconnected_observer_nodes: dict[str, str | None] = {} + self.log.debug( "Creating node map out of cluster_state: %r and observer_state: %r", cluster_state, observer_state, ) - for instance, state in cluster_state.items(): - if "pg_is_in_recovery" in state: - if state["pg_is_in_recovery"]: - standby_nodes[instance] = state - elif state["connection"]: - connected_master_nodes[instance] = state - elif not state["connection"]: - disconnected_master_nodes[instance] = state + + for instance_name, member_state in cluster_state.items(): + if "pg_is_in_recovery" in member_state: + if member_state["pg_is_in_recovery"]: + standby_nodes[instance_name] = member_state + elif member_state["connection"]: + connected_master_nodes[instance_name] = member_state + else: + disconnected_master_nodes[instance_name] = member_state else: self.log.debug( "No knowledge on instance: %r state: %r of whether it's in recovery or not", - instance, - state, + instance_name, + member_state, ) - for observer_name, state in observer_state.items(): # pylint: disable=too-many-nested-blocks - connected = state.get("connection", False) + for observer_name, observed_state in observer_state.items(): # pylint: disable=too-many-nested-blocks + connected = cast(bool, observed_state.get("connection", False)) + state_fetch_time = cast(Optional[str], observed_state.get("fetch_time")) if connected: - connected_observer_nodes[observer_name] = state.get("fetch_time") + connected_observer_nodes[observer_name] = state_fetch_time else: - disconnected_observer_nodes[observer_name] = state.get("fetch_time") - for instance, db_state in state.items(): - if instance not in cluster_state: + disconnected_observer_nodes[observer_name] = state_fetch_time + for ob_state_key, ob_state_value in observed_state.items(): + if ob_state_key in ["connection", "fetch_time"]: + continue + + observed_member_name = ob_state_key + observed_member_state = cast(MemberState, ob_state_value) + + if observed_member_name not in cluster_state: # A single observer can observe multiple different replication clusters. # Ignore data on nodes that don't belong in our own cluster self.log.debug( "Ignoring instance: %r since it does not belong into our own replication cluster", - instance, + observed_member_name, ) continue - if isinstance(db_state, dict): # other keys are "connection" and "fetch_time" - own_fetch_time = parse_iso_datetime(cluster_state[instance]["fetch_time"]) - observer_fetch_time = parse_iso_datetime(db_state["fetch_time"]) - self.log.debug( - "observer_name: %r, instance: %r, state: %r, observer_fetch_time: %r", + + if not isinstance(observed_member_state, dict): # other keys are "connection" and "fetch_time" + self.log.error( + "Observer %r has invalid state for instance %r: %r", observer_name, - instance, - db_state, - observer_fetch_time, + observed_member_name, + observed_member_state, + ) + self.log.error( + "Allowed keys are 'connection' (bool) and 'fetch_time' (str)" + " and all observed instance names for the cluster." 
) - if "pg_is_in_recovery" in db_state: - if db_state["pg_is_in_recovery"]: - # we always trust ourselves the most for localhost, and - # in case we are actually connected to the other node - if observer_fetch_time >= own_fetch_time and instance != self.own_db: - if instance not in standby_nodes or standby_nodes[instance]["connection"] is False: - standby_nodes[instance] = db_state - else: - master_node = connected_master_nodes.get(instance, {}) - connected = master_node.get("connection", False) - self.log.debug( - "Observer: %r sees %r as master, we see: %r, same_master: %r, connection: %r", - observer_name, - instance, - self.current_master, - instance == self.current_master, - db_state.get("connection"), - ) - if self.within_dbpoll_time(observer_fetch_time, own_fetch_time) and instance != self.own_db: - if connected or db_state["connection"]: - connected_master_nodes[instance] = db_state - else: - disconnected_master_nodes[instance] = db_state + continue + + own_fetch_time = parse_iso_datetime(cluster_state[observed_member_name]["fetch_time"]) + observer_fetch_time = parse_iso_datetime(observed_member_state["fetch_time"]) + self.log.debug( + "observer_name: %r, instance: %r, state: %r, observer_fetch_time: %r", + observer_name, + observed_member_name, + observed_member_state, + observer_fetch_time, + ) + if "pg_is_in_recovery" in observed_member_state: + if observed_member_state["pg_is_in_recovery"]: + # we always trust ourselves the most for localhost, and + # in case we are actually connected to the other node + if observer_fetch_time >= own_fetch_time and observed_member_name != self.own_db: + if ( + observed_member_name not in standby_nodes + or standby_nodes[observed_member_name]["connection"] is False + ): + standby_nodes[observed_member_name] = observed_member_state else: - self.log.warning( - "No knowledge on %r %r from observer: %r is in recovery", - instance, - db_state, + master_node = connected_master_nodes.get(observed_member_name, MemberState()) + connected = master_node.get("connection", False) + self.log.debug( + "Observer: %r sees %r as master, we see: %r, same_master: %r, connection: %r", observer_name, + observed_member_name, + self.current_master, + observed_member_name == self.current_master, + observed_member_state.get("connection"), ) + if ( + self.within_dbpoll_time(observer_fetch_time, own_fetch_time) + and observed_member_name != self.own_db + ): + if connected or observed_member_state["connection"]: + connected_master_nodes[observed_member_name] = observed_member_state + else: + disconnected_master_nodes[observed_member_name] = observed_member_state + else: + self.log.warning( + "No knowledge on %r %r from observer: %r is in recovery", + observed_member_name, + observed_member_state, + observer_name, + ) self.connected_master_nodes = connected_master_nodes self.disconnected_master_nodes = disconnected_master_nodes @@ -325,19 +436,22 @@ def create_node_map(self, cluster_state, observer_state): return master_instance, master_node, standby_nodes - def is_restoring_or_catching_up_normally(self, state): + def is_restoring_or_catching_up_normally(self, state: MemberState) -> bool: """ Return True if node is still in the replication catchup phase and replication lag alerts/metrics should not yet be generated. 
""" replication_start_time = state.get("replication_start_time") min_lag = state.get("min_replication_time_lag", self.replication_lag_warning_boundary) + if replication_start_time and time.monotonic() - replication_start_time > self.replication_catchup_timeout: # we've been replicating for too long and should have caught up with the master already return False + if not state.get("pg_last_xlog_receive_location"): # node has not received anything from the master yet return True + if min_lag >= self.replication_lag_warning_boundary: # node is catching up the master and has not gotten close enough yet return True @@ -345,7 +459,7 @@ def is_restoring_or_catching_up_normally(self, state): # node has caught up with the master so we should be in sync return False - def emit_stats(self, state): + def emit_stats(self, state: MemberState) -> None: if self.is_restoring_or_catching_up_normally(state): # do not emit misleading lag stats during catchup at restore return @@ -354,9 +468,10 @@ def emit_stats(self, state): if replication_time_lag is not None: self.stats.gauge("pg.replication_lag", replication_time_lag) - def is_master_observer_new_enough(self, observer_state): + def is_master_observer_new_enough(self, observer_state: dict[str, ObservedState]) -> bool: if not self.replication_lag_over_warning_limit: return True + if not self.current_master or self.current_master not in self.config.get("observers", {}): self.log.warning( "Replication lag is over warning limit, but" @@ -364,33 +479,37 @@ def is_master_observer_new_enough(self, observer_state): self.current_master, ) return True - db_poll_intervals = datetime.timedelta(seconds=5 * self.config.get("db_poll_interval", 5.0)) - now = datetime.datetime.utcnow() + + db_poll_intervals = timedelta(seconds=5 * self.config.get("db_poll_interval", 5.0)) + now = datetime.utcnow() if (now - self.observer_state_newer_than) < db_poll_intervals: self.log.warning( "Replication lag is over warning limit, but" " not waiting for observers to be polled because 5 db_poll_intervals have passed" ) return True + if self.current_master not in observer_state: self.log.warning( "Replication lag is over warning limit, but observer for master (%s) has not been polled yet", self.current_master, ) return False - fetch_time = parse_iso_datetime(observer_state[self.current_master]["fetch_time"]) + + fetch_time = parse_iso_datetime(cast(str, observer_state[self.current_master]["fetch_time"])) if fetch_time < self.observer_state_newer_than: self.log.warning( "Replication lag is over warning limit, but observer's data for master is stale, older than %r", self.observer_state_newer_than, ) return False + return True - def check_cluster_state(self): - master_node = None - cluster_state = copy.deepcopy(self.cluster_state) - observer_state = copy.deepcopy(self.observer_state) + def check_cluster_state(self) -> None: + # master_node = None + cluster_state = deepcopy(self.cluster_state) + observer_state = deepcopy(self.observer_state) configured_node_count = len(self.config.get("remote_conns", {})) if not cluster_state or len(cluster_state) != configured_node_count: self.log.warning( @@ -451,7 +570,12 @@ def check_cluster_state(self): return self.consider_failover(own_state, master_node, standby_nodes) - def consider_failover(self, own_state, master_node, standby_nodes): + def consider_failover( + self, + own_state: MemberState, + master_node: MemberState | None, + standby_nodes: dict[str, MemberState], + ) -> None: if not master_node: # no master node at all in the cluster? 
self.log.warning( @@ -486,10 +610,10 @@ def consider_failover(self, own_state, master_node, standby_nodes): return self.check_replication_lag(own_state, standby_nodes) - def is_replication_lag_over_warning_limit(self): + def is_replication_lag_over_warning_limit(self) -> bool: return self.replication_lag_over_warning_limit - def check_replication_lag(self, own_state, standby_nodes): + def check_replication_lag(self, own_state: MemberState, standby_nodes: dict[str, MemberState]) -> None: if self.is_restoring_or_catching_up_normally(own_state): # do not raise alerts during catchup at restore return @@ -508,7 +632,7 @@ def check_replication_lag(self, own_state, standby_nodes): if not self.replication_lag_over_warning_limit: # we just went over the boundary self.replication_lag_over_warning_limit = True if self.config.get("poll_observers_on_warning_only"): - self.observer_state_newer_than = datetime.datetime.utcnow() + self.observer_state_newer_than = datetime.utcnow() self.create_alert_file("replication_delay_warning") if self.over_warning_limit_command: self.log.warning( @@ -528,7 +652,7 @@ def check_replication_lag(self, own_state, standby_nodes): elif self.replication_lag_over_warning_limit: self.replication_lag_over_warning_limit = False self.delete_alert_file("replication_delay_warning") - self.observer_state_newer_than = datetime.datetime.min + self.observer_state_newer_than = datetime.min if replication_lag >= self.replication_lag_failover_timeout: self.log.warning( @@ -545,14 +669,14 @@ def check_replication_lag(self, own_state, standby_nodes): standby_nodes, ) - def get_replication_positions(self, standby_nodes): + def get_replication_positions(self, standby_nodes: dict[str, MemberState]) -> dict[int, set[str]]: self.log.debug("Getting replication positions from: %r", standby_nodes) - known_replication_positions = {} + known_replication_positions: dict[int, set[str]] = {} for instance, node_state in standby_nodes.items(): - now = datetime.datetime.utcnow() + now = datetime.utcnow() if ( node_state["connection"] - and now - parse_iso_datetime(node_state["fetch_time"]) < datetime.timedelta(seconds=20) + and now - parse_iso_datetime(node_state["fetch_time"]) < timedelta(seconds=20) and instance not in self.never_promote_these_nodes ): # noqa # pylint: disable=line-too-long # use pg_last_xlog_receive_location if it's available, @@ -569,13 +693,14 @@ def get_replication_positions(self, standby_nodes): known_replication_positions.setdefault(wal_pos, set()).add(instance) return known_replication_positions - def _been_in_contact_with_master_within_failover_timeout(self): + def _been_in_contact_with_master_within_failover_timeout(self) -> bool: # no need to do anything here if there are no disconnected masters if self.disconnected_master_nodes: disconnected_master_node = list(self.disconnected_master_nodes.values())[0] db_time = disconnected_master_node.get("db_time", get_iso_timestamp()) or get_iso_timestamp() - time_since_last_contact = datetime.datetime.utcnow() - parse_iso_datetime(db_time) - if time_since_last_contact < datetime.timedelta(seconds=self.replication_lag_failover_timeout): + db_time_as_dt = db_time if isinstance(db_time, datetime) else parse_iso_datetime(db_time) + time_since_last_contact = datetime.utcnow() - db_time_as_dt + if time_since_last_contact < timedelta(seconds=self.replication_lag_failover_timeout): self.log.debug( "We've had contact with master: %r at: %r within the last %.2fs, not failing over", disconnected_master_node, @@ -585,7 +710,7 @@ def 
_been_in_contact_with_master_within_failover_timeout(self): return True return False - def do_failover_decision(self, own_state, standby_nodes): + def do_failover_decision(self, own_state: MemberState, standby_nodes: dict[str, MemberState]) -> None: if self.connected_master_nodes: self.log.warning( "We still have some connected masters: %r, not failing over", @@ -637,7 +762,7 @@ def do_failover_decision(self, own_state, standby_nodes): self.log.warning( "Canceling failover even though we were the node the furthest along, since " "this node has an existing maintenance_mode_file: %r", - self.config.get("maintenance_mode_file", "/tmp/pglookout_maintenance_mode_file"), + self.config.get("maintenance_mode_file", MAINTENANCE_MODE_FILE), ) elif self.own_db in self.never_promote_these_nodes: self.log.warning( @@ -674,18 +799,18 @@ def do_failover_decision(self, own_state, standby_nodes): furthest_along_instance, ) - def modify_recovery_conf_to_point_at_new_master(self, new_master_instance): - with open(os.path.join(self.config.get("pg_data_directory"), "PG_VERSION"), "r") as fp: - pg_version = fp.read().strip() + def modify_recovery_conf_to_point_at_new_master(self, new_master_instance: str) -> bool: + pg_data_directory = Path(self.config.get("pg_data_directory", PG_DATA_DIRECTORY)) + pg_version = (pg_data_directory / "PG_VERSION").read_text().strip() - if parse(pg_version) >= parse("12"): + if parse_version(pg_version) >= parse_version("12"): recovery_conf_filename = "postgresql.auto.conf" else: recovery_conf_filename = "recovery.conf" - path_to_recovery_conf = os.path.join(self.config.get("pg_data_directory"), recovery_conf_filename) - with open(path_to_recovery_conf, "r") as fp: - old_conf = fp.read().splitlines() + path_to_recovery_conf = pg_data_directory / recovery_conf_filename + old_conf = path_to_recovery_conf.read_text().splitlines() + has_recovery_target_timeline = False new_conf = [] old_conn_info = None @@ -707,34 +832,44 @@ def modify_recovery_conf_to_point_at_new_master(self, new_master_instance): master_instance_conn_info = get_connection_info(self.config["remote_conns"][new_master_instance]) assert "host" in master_instance_conn_info new_conn_info["host"] = master_instance_conn_info["host"] + if "port" in master_instance_conn_info: new_conn_info["port"] = master_instance_conn_info["port"] + if new_conn_info == old_conn_info: self.log.debug( "recovery.conf already contains conninfo matching %r, not updating", new_master_instance, ) return False + # Otherwise we append the new primary_conninfo - quoted_connection_string = adapt(create_connection_string(new_conn_info)) + # Mypy: ignore the typing of `adapt`, as we cannot override it ourselves. The provided stubs are incomplete. + # Writing our own stubs would discard all the other type information from `psycopg2.extensions`. 
+ quoted_connection_string: QuotedString = adapt( + create_connection_string(new_conn_info) + ) # type: ignore[no-untyped-call] new_conf.append(f"primary_conninfo = {quoted_connection_string}") + # The timeline of the recovery.conf will require a higher timeline target if not has_recovery_target_timeline: new_conf.append("recovery_target_timeline = 'latest'") + # prepend our tag iso_timestamp = get_iso_timestamp() new_conf.insert( 0, f"# pglookout updated primary_conninfo for instance {new_master_instance} at {iso_timestamp}", ) + # Replace old recovery.conf with a fresh copy - with open(path_to_recovery_conf + "_temp", "w") as fp: - fp.write("\n".join(new_conf) + "\n") + path_to_recovery_conf_new = path_to_recovery_conf.with_name(f"{path_to_recovery_conf.name}._temp") + path_to_recovery_conf_new.write_text("\n".join(new_conf) + "\n") + path_to_recovery_conf_new.rename(path_to_recovery_conf) - os.rename(path_to_recovery_conf + "_temp", path_to_recovery_conf) return True - def start_following_new_master(self, new_master_instance): + def start_following_new_master(self, new_master_instance: str) -> None: start_time = time.monotonic() updated_config = self.modify_recovery_conf_to_point_at_new_master(new_master_instance) if not updated_config: @@ -760,12 +895,12 @@ def start_following_new_master(self, new_master_instance): time.monotonic() - start_time, ) - def execute_external_command(self, command): + def execute_external_command(self, command: list[str] | str) -> int: self.log.warning("Executing external command: %r", command) return_code, output = 0, "" try: - output = subprocess.check_call(command) - except subprocess.CalledProcessError as err: + check_call(command) + except CalledProcessError as err: self.log.exception( "Problem with executing: %r, return_code: %r, output: %r", command, @@ -777,33 +912,37 @@ def execute_external_command(self, command): self.log.warning("Executed external command: %r, output: %r", return_code, output) return return_code - def check_for_maintenance_mode_file(self): - return os.path.exists(self.config.get("maintenance_mode_file", "/tmp/pglookout_maintenance_mode_file")) + def check_for_maintenance_mode_file(self) -> bool: + return Path(self.config.get("maintenance_mode_file", MAINTENANCE_MODE_FILE)).is_file() - def create_alert_file(self, filename): + def create_alert_file(self, filename: str) -> None: + alert_file_dir = Path(self.config.get("alert_file_dir", Path.cwd())) + filepath = alert_file_dir / filename + self.log.debug("Creating alert file: %r", str(filepath)) try: - filepath = os.path.join(self.config.get("alert_file_dir", os.getcwd()), filename) - self.log.debug("Creating alert file: %r", filepath) - with open(filepath, "w") as fp: - fp.write("alert") + filepath.write_text("alert") except Exception as ex: # pylint: disable=broad-except self.log.exception("Problem writing alert file: %r", filename) self.stats.unexpected_exception(ex, where="create_alert_file") - def delete_alert_file(self, filename): - filepath = os.path.join(self.config.get("alert_file_dir", os.getcwd()), filename) + def delete_alert_file(self, filename: str) -> None: + alert_file_dir = Path(self.config.get("alert_file_dir", Path.cwd())) + filepath = alert_file_dir / filename try: - if os.path.exists(filepath): + if filepath.is_file(): self.log.debug("Deleting alert file: %r", filepath) - os.unlink(filepath) + filepath.unlink(missing_ok=True) except Exception as ex: # pylint: disable=broad-except self.log.exception("Problem unlinking: %r", filepath) 
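            # Report through the stats client too, so a failed alert-file cleanup is visible in metrics as well as logs.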
self.stats.unexpected_exception(ex, where="delete_alert_file") - def within_dbpoll_time(self, time1, time2): + def within_dbpoll_time(self, time1: datetime, time2: datetime) -> bool: return abs((time1 - time2).total_seconds()) < self.config.get("db_poll_interval", 5.0) def _check_cluster_monitor_thread_health(self, now: float) -> None: + if self.cluster_monitor is None: + raise RuntimeError("Cluster Monitor is not initialized!") + health_timeout_seconds = self._get_health_timeout_seconds() if health_timeout_seconds: last_successful_run = self.cluster_monitor.last_monitoring_success_time or self._start_time @@ -814,7 +953,7 @@ def _check_cluster_monitor_thread_health(self, now: float) -> None: self.stats.increase("cluster_monitor_health_timeout") self.log.warning("cluster_monitor has not been running for %.1f seconds", seconds_since_last_run) - def _get_health_timeout_seconds(self) -> Optional[float]: + def _get_health_timeout_seconds(self) -> float | None: if "cluster_monitor_health_timeout_seconds" in self.config: config_value = self.config.get("cluster_monitor_health_timeout_seconds") return config_value if config_value is None else float(config_value) @@ -824,7 +963,7 @@ def _get_health_timeout_seconds(self) -> Optional[float]: def _get_check_interval(self) -> float: return float(self.config.get("replication_state_check_interval", 5.0)) - def main_loop(self): + def main_loop(self) -> None: while self.running: try: self.check_cluster_state() @@ -832,11 +971,13 @@ def main_loop(self): except Exception as ex: # pylint: disable=broad-except self.log.exception("Failed to check cluster state") self.stats.unexpected_exception(ex, where="main_loop_check_cluster_state") + try: self.write_cluster_state_to_json_file() except Exception as ex: # pylint: disable=broad-except self.log.exception("Failed to write cluster state") self.stats.unexpected_exception(ex, where="main_loop_writer_cluster_state") + try: self.failover_decision_queue.get(timeout=self._get_check_interval()) q = self.failover_decision_queue @@ -849,18 +990,18 @@ def main_loop(self): except Empty: pass - def run(self): + def run(self) -> None: + if self.cluster_monitor is None: + raise RuntimeError("Cluster Monitor is not initialized!") + self._start_time = time.monotonic() self.cluster_monitor.start() self.webserver.start() self.main_loop() -def main(args=None): - if args is None: - args = sys.argv[1:] - - parser = argparse.ArgumentParser( +def get_argument_parser() -> ArgumentParser: + parser = ArgumentParser( prog="pglookout", description="postgresql replication monitoring and failover daemon", ) @@ -868,19 +1009,31 @@ def main(args=None): "--version", action="version", help="show program version", - version=version.__version__, + version=__version__, ) - parser.add_argument("config", help="configuration file") + # it's a type of filepath + parser.add_argument("config", type=Path, help="configuration file") + + return parser + + +def main(args: list[str] | None = None) -> int: + if args is None: + args = sys.argv[1:] + + parser = get_argument_parser() arg = parser.parse_args(args) - if not os.path.exists(arg.config): + if not arg.config.is_file(): print(f"pglookout: {arg.config!r} doesn't exist") return 1 - logutil.configure_logging() + configure_logging() pglookout = PgLookout(arg.config) - return pglookout.run() + pglookout.run() + + return 0 if __name__ == "__main__": diff --git a/pyproject.toml b/pyproject.toml index e01c7e7..21912c6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,6 @@ files = [ exclude = [ # 
Implementation.
     'pglookout/__main__.py',
-    'pglookout/pglookout.py',
     'pglookout/version.py',
     # Tests.
     'test/conftest.py',
diff --git a/test/test_cluster_monitor.py b/test/test_cluster_monitor.py
index 710ccee..b841be2 100644
--- a/test/test_cluster_monitor.py
+++ b/test/test_cluster_monitor.py
@@ -10,7 +10,7 @@
 from packaging import version
 from pglookout import statsd
 from pglookout.cluster_monitor import ClusterMonitor
-from pglookout.common_types import MemberState, ObserverState
+from pglookout.common_types import MemberState, ObservedState
 from pglookout.config import Config
 from psycopg2.extras import RealDictCursor
 from queue import Queue
@@ -56,7 +56,7 @@ def test_main_loop(db: TestPG) -> None:
         "poll_observers_on_warning_only": True,
     }
     cluster_state: dict[str, MemberState] = {}
-    observer_state: dict[str, ObserverState] = {}
+    observer_state: dict[str, ObservedState] = {}

     def create_alert_file(arg: str) -> NoReturn:
         raise Exception(arg)
@@ -118,7 +118,7 @@ def test_fetch_replication_slot_info(db: TestPG) -> None:
         "poll_observers_on_warning_only": True,
     }
     cluster_state: dict[str, MemberState] = {}
-    observer_state: dict[str, ObserverState] = {}
+    observer_state: dict[str, ObservedState] = {}

     def create_alert_file(arg: str) -> NoReturn:
         raise Exception(arg)

From 6ce993ed4d63e1eedd8a87d33dd37fd4562ebd05 Mon Sep 17 00:00:00 2001
From: Samuel Giffard
Date: Fri, 24 Mar 2023 19:02:21 +0100
Subject: [PATCH 11/23] mypy: Refactor `check_cluster_state` [BF-1560]

This refactor simplifies the branching in part of this method. It helps
`mypy` and makes the different preconditions clearer.
---
 pglookout/pglookout.py | 111 ++++++++++++++++++++++++++++++-----------
 1 file changed, 83 insertions(+), 28 deletions(-)

diff --git a/pglookout/pglookout.py b/pglookout/pglookout.py
index abda47a..a6cb009 100755
--- a/pglookout/pglookout.py
+++ b/pglookout/pglookout.py
@@ -537,38 +537,93 @@ def check_cluster_state(self) -> None:
         if self.own_db and self.own_db != master_instance and self.config.get("autofollow"):
             self.start_following_new_master(master_instance)

+        self._make_failover_decision(observer_state, standby_nodes, master_node)
+
+    def _make_failover_decision(
+        self,
+        observer_state: dict[str, ObservedState],
+        standby_nodes: dict[str, MemberState],
+        master_node: MemberState | None,
+    ) -> None:
+        assert self.own_db is not None
         own_state = self.cluster_state.get(self.own_db)
-        observer_info = ",".join(observer_state) or "no"
-        if self.own_db:  # Emit stats if we're a non-observer node
-            self.emit_stats(own_state)
-        else:  # We're an observer ourselves, grab the IP address from HTTP server address
-            observer_info = self.config.get("http_address", observer_info)
+        consider_failover = False

-        standby_info = ",".join(standby_nodes) or "no"
-        self.log.debug(
-            "Cluster has %s standbys, %s observers and %s as master, own_db: %r, own_state: %r",
-            standby_info,
-            observer_info,
-            self.current_master,
-            self.own_db,
-            own_state or "observer",
-        )
+        if not self.own_db:
+            self._log_state_when_own_db_is_not_in_cluster(
+                observer_state=observer_state,
+                standby_nodes=standby_nodes,
+            )
+        else:
+            assert own_state is not None
+            consider_failover = self._is_failover_consideration_needed(
+                own_state=own_state,
+                observer_state=observer_state,
+                standby_nodes=standby_nodes,
+                master_node=master_node,
+            )

-        if self.own_db:
-            if self.own_db == self.current_master:
-                # We are the master of this cluster, nothing to do
-                self.log.debug(
-                    "We %r: %r are still the master node: %r of this
cluster, nothing to do.", - self.own_db, - own_state, - master_node, - ) - return - if not standby_nodes: - self.log.warning("No standby nodes set, master node: %r", master_node) - return - self.consider_failover(own_state, master_node, standby_nodes) + if consider_failover and own_state: + self.consider_failover( + own_state=own_state, + master_node=master_node, + standby_nodes=standby_nodes, + ) + + def _log_state_when_own_db_is_not_in_cluster( + self, + observer_state: dict[str, ObservedState], + standby_nodes: dict[str, MemberState], + ) -> None: + if self.log.isEnabledFor(DEBUG): + observer_info = self.config.get("http_address", ",".join(observer_state.keys()) or "no") + standby_info = ",".join(standby_nodes) or "no" + self.log.debug( + "Cluster has %s standbys, %s observers and %s as master, own_db: %r, own_state: %r", + standby_info, + observer_info, + self.current_master, + self.own_db, + "observer", + ) + + def _is_failover_consideration_needed( + self, + own_state: MemberState, + observer_state: dict[str, ObservedState], + standby_nodes: dict[str, MemberState], + master_node: MemberState | None, + ) -> bool: + self.emit_stats(own_state) # Emit stats if we're a non-observer node + + if self.log.isEnabledFor(DEBUG): + observer_info = ",".join(observer_state) or "no" + standby_info = ",".join(standby_nodes) or "no" + self.log.debug( + "Cluster has %s standbys, %s observers and %s as master, own_db: %r, own_state: %r", + standby_info, + observer_info, + self.current_master, + self.own_db, + own_state, + ) + + if self.own_db == self.current_master: + # We are the master of this cluster, nothing to do + self.log.debug( + "We %r: %r are still the master node: %r of this cluster, nothing to do.", + self.own_db, + own_state, + master_node, + ) + return False + + if not standby_nodes: + self.log.warning("No standby nodes set, master node: %r", master_node) + return False + + return True def consider_failover( self, From 0d438e9040253dea10dbfb75d67c622773de1ba1 Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Mon, 27 Mar 2023 12:19:21 +0200 Subject: [PATCH 12/23] mypy: `test/test_pglookout.py` [BF-1560] Rename `test_lookout.py` -> `test_pglookout.py` to follow conventions. --- pglookout/config.py | 2 +- pyproject.toml | 1 - test/{test_lookout.py => test_pglookout.py} | 402 ++++++++++---------- 3 files changed, 210 insertions(+), 195 deletions(-) rename test/{test_lookout.py => test_pglookout.py} (73%) diff --git a/pglookout/config.py b/pglookout/config.py index 761263d..9575841 100644 --- a/pglookout/config.py +++ b/pglookout/config.py @@ -13,7 +13,7 @@ class Statsd(TypedDict, total=False): class Config(TypedDict, total=False): alert_file_dir: str autofollow: bool - cluster_monitor_health_timeout_seconds: float + cluster_monitor_health_timeout_seconds: float | None db_poll_interval: float failover_command: str failover_sleep_time: float diff --git a/pyproject.toml b/pyproject.toml index 21912c6..48a7c04 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,6 @@ exclude = [ 'pglookout/version.py', # Tests. 'test/conftest.py', - 'test/test_lookout.py', # Other. 'setup.py', 'version.py', diff --git a/test/test_lookout.py b/test/test_pglookout.py similarity index 73% rename from test/test_lookout.py rename to test/test_pglookout.py index 3b0a902..9aa3421 100644 --- a/test/test_lookout.py +++ b/test/test_pglookout.py @@ -7,53 +7,58 @@ This file is under the Apache License, Version 2.0. See the file `LICENSE` for details. 
""" +from __future__ import annotations + +from datetime import datetime, timedelta +from pathlib import Path from pglookout.common import get_iso_timestamp +from pglookout.common_types import MemberState, ObservedState +from pglookout.pglookout import PgLookout from pglookout.pgutil import get_connection_info, get_connection_info_from_config_line -from typing import Optional, Union -from unittest.mock import patch +from psycopg2._psycopg import connection as pg_connection # pylint: disable=no-name-in-module +from typing import cast +from unittest.mock import Mock, patch -import datetime import json import os import pytest import time -def test_connect_to_cluster_nodes_and_cleanup_old_nodes(pgl): +def test_connect_to_cluster_nodes_and_cleanup_old_nodes(pgl: PgLookout) -> None: + assert pgl.cluster_monitor is not None pgl.cluster_monitor.db_conns = { - "1.2.3.4": "bar", - "2.3.4.5": "foo", - "3.4.5.6": "foo", - None: "foo", + "1.2.3.4": Mock(spec_set=pg_connection), + "2.3.4.5": Mock(spec_set=pg_connection), + "3.4.5.6": Mock(spec_set=pg_connection), } pgl.cluster_monitor.connect_to_cluster_nodes_and_cleanup_old_nodes() assert pgl.cluster_monitor.db_conns == {} -def test_state_file_write(pgl, tmpdir): - state_file_path = tmpdir.join("state_file").strpath - pgl.config["json_state_file_path"] = state_file_path +def test_state_file_write(pgl: PgLookout, tmp_path: Path) -> None: + state_file_path = tmp_path / "state_file" + pgl.config["json_state_file_path"] = str(state_file_path) pgl.write_cluster_state_to_json_file() - assert os.path.exists(state_file_path) - with open(state_file_path, "r") as fp: - state = json.load(fp) + assert state_file_path.is_file() + state = json.loads(state_file_path.read_text()) assert isinstance(state, dict) -def test_load_config(pgl): +def test_load_config(pgl: PgLookout) -> None: pgl.own_db = "old_value" pgl.load_config() assert pgl.own_db == "1.2.3.4" def _create_db_node_state( - pg_last_xlog_receive_location=None, - pg_is_in_recovery=True, - connection=True, - replication_time_lag=None, - fetch_time=None, - db_time=None, -): + pg_last_xlog_receive_location: str | None = None, + pg_is_in_recovery: bool = True, + connection: bool = True, + replication_time_lag: float | None = None, + fetch_time: datetime | None = None, + db_time: datetime | None = None, +) -> MemberState: return { "connection": connection, "db_time": get_iso_timestamp(db_time), @@ -68,16 +73,16 @@ def _create_db_node_state( def _add_to_observer_state( - pgl, - observer_name, - db_name, - pg_last_xlog_receive_location=None, - pg_is_in_recovery=True, - connection=True, - replication_time_lag=None, - fetch_time=None, - db_time=None, -): + pgl: PgLookout, + observer_name: str, + db_name: str, + pg_last_xlog_receive_location: str | None = None, + pg_is_in_recovery: bool = True, + connection: bool = True, + replication_time_lag: float | None = None, + fetch_time: datetime | None = None, + db_time: datetime | None = None, +) -> None: db_node_state = _create_db_node_state( pg_last_xlog_receive_location, pg_is_in_recovery, @@ -86,28 +91,30 @@ def _add_to_observer_state( fetch_time=fetch_time, db_time=db_time, ) - update_dict = { - "fetch_time": get_iso_timestamp(), - "connection": True, - db_name: db_node_state, - } - if observer_name in pgl.observer_state: + update_dict = ObservedState( + { + "fetch_time": get_iso_timestamp(), + "connection": True, + db_name: db_node_state, + } + ) + if observer_name in pgl.observer_state.keys(): pgl.observer_state[observer_name].update(update_dict) else: 
pgl.observer_state[observer_name] = update_dict def _add_db_to_cluster_state( - pgl, - instance, - pg_last_xlog_receive_location=None, - pg_is_in_recovery=True, - connection=True, - replication_time_lag=None, - fetch_time=None, - db_time=None, - conn_info=None, -): + pgl: PgLookout, + instance: str, + pg_last_xlog_receive_location: str | None = None, + pg_is_in_recovery: bool = True, + connection: bool = True, + replication_time_lag: float | None = None, + fetch_time: datetime | None = None, + db_time: datetime | None = None, + conn_info: str | None = None, +) -> None: db_node_state = _create_db_node_state( pg_last_xlog_receive_location, pg_is_in_recovery, @@ -120,7 +127,7 @@ def _add_db_to_cluster_state( pgl.config["remote_conns"][instance] = conn_info or {"host": instance} -def test_check_cluster_state_warning(pgl): +def test_check_cluster_state_warning(pgl: PgLookout) -> None: _add_db_to_cluster_state( pgl, "kuu", @@ -134,16 +141,16 @@ def test_check_cluster_state_warning(pgl): pgl.current_master = "old_master" pgl.own_db = "kuu" pgl.over_warning_limit_command = "fake_command" - pgl.execute_external_command.return_value = 0 + cast(Mock, pgl.execute_external_command).return_value = 0 pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 1 - assert pgl.create_alert_file.call_count == 1 + assert cast(Mock, pgl.execute_external_command).call_count == 1 + assert cast(Mock, pgl.create_alert_file).call_count == 1 pgl.check_cluster_state() # call count does not change when we have sent a single warning - assert pgl.execute_external_command.call_count == 1 + assert cast(Mock, pgl.execute_external_command).call_count == 1 assert pgl.replication_lag_over_warning_limit - assert pgl.create_alert_file.call_count == 1 + assert cast(Mock, pgl.create_alert_file).call_count == 1 # and then the replication catches up _add_db_to_cluster_state( @@ -159,13 +166,13 @@ def test_check_cluster_state_warning(pgl): assert pgl.replication_lag_over_warning_limit is False -def test_check_cluster_do_failover_one_standby(pgl): +def test_check_cluster_do_failover_one_standby(pgl: PgLookout) -> None: _add_db_to_cluster_state( pgl, "old_master", pg_is_in_recovery=False, connection=False, - db_time=datetime.datetime(year=2014, month=1, day=1), + db_time=datetime(year=2014, month=1, day=1), ) _add_db_to_cluster_state( @@ -178,23 +185,23 @@ def test_check_cluster_do_failover_one_standby(pgl): ) pgl.own_db = "own_db" - pgl.execute_external_command.return_value = 0 + cast(Mock, pgl.execute_external_command).return_value = 0 pgl.replication_lag_over_warning_limit = False pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 0 + assert cast(Mock, pgl.execute_external_command).call_count == 0 assert pgl.replication_lag_over_warning_limit is True pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 1 + assert cast(Mock, pgl.execute_external_command).call_count == 1 assert pgl.replication_lag_over_warning_limit is False -def test_check_cluster_master_gone_one_standby_one_observer(pgl): +def test_check_cluster_master_gone_one_standby_one_observer(pgl: PgLookout) -> None: _add_db_to_cluster_state( pgl, "old_master", pg_is_in_recovery=False, connection=False, - db_time=datetime.datetime(year=2014, month=1, day=1), + db_time=datetime(year=2014, month=1, day=1), ) _add_db_to_cluster_state( @@ -212,7 +219,7 @@ def test_check_cluster_master_gone_one_standby_one_observer(pgl): "old_master", pg_is_in_recovery=False, connection=False, - 
db_time=datetime.datetime(year=2014, month=1, day=1), + db_time=datetime(year=2014, month=1, day=1), ) _add_to_observer_state( pgl, @@ -226,7 +233,7 @@ def test_check_cluster_master_gone_one_standby_one_observer(pgl): # Simulate existing master connection pgl.current_master = "old_master" - pgl.execute_external_command.return_value = 0 + cast(Mock, pgl.execute_external_command).return_value = 0 pgl.replication_lag_over_warning_limit = False del pgl.config["remote_conns"]["old_master"] @@ -237,23 +244,23 @@ def test_check_cluster_master_gone_one_standby_one_observer(pgl): # First call does not promote due to missing master because config has been updated just recently and there's # by default a grace period that's waited after list of known cluster nodes changes pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 0 + assert cast(Mock, pgl.execute_external_command).call_count == 0 assert pgl.replication_lag_over_warning_limit is False # If we say that master is known to be gone promotion will happen even though config was updated recently pgl.known_gone_nodes.append("old_master") pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 1 + assert cast(Mock, pgl.execute_external_command).call_count == 1 assert pgl.replication_lag_over_warning_limit is False -def test_check_cluster_do_failover_one_standby_one_observer(pgl): +def test_check_cluster_do_failover_one_standby_one_observer(pgl: PgLookout) -> None: _add_db_to_cluster_state( pgl, "old_master", pg_is_in_recovery=False, connection=False, - db_time=datetime.datetime(year=2014, month=1, day=1), + db_time=datetime(year=2014, month=1, day=1), ) _add_db_to_cluster_state( @@ -271,7 +278,7 @@ def test_check_cluster_do_failover_one_standby_one_observer(pgl): "old_master", pg_is_in_recovery=False, connection=False, - db_time=datetime.datetime(year=2014, month=1, day=1), + db_time=datetime(year=2014, month=1, day=1), ) _add_to_observer_state( pgl, @@ -283,23 +290,23 @@ def test_check_cluster_do_failover_one_standby_one_observer(pgl): replication_time_lag=130.0, ) - pgl.execute_external_command.return_value = 0 + cast(Mock, pgl.execute_external_command).return_value = 0 pgl.replication_lag_over_warning_limit = False pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 0 + assert cast(Mock, pgl.execute_external_command).call_count == 0 assert pgl.replication_lag_over_warning_limit is True pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 1 + assert cast(Mock, pgl.execute_external_command).call_count == 1 assert pgl.replication_lag_over_warning_limit is False -def test_check_cluster_do_failover_with_a_node_which_is_is_maintenance(pgl): +def test_check_cluster_do_failover_with_a_node_which_is_is_maintenance(pgl: PgLookout) -> None: _add_db_to_cluster_state( pgl, "old_master", pg_is_in_recovery=False, connection=False, - db_time=datetime.datetime(year=2014, month=1, day=1), + db_time=datetime(year=2014, month=1, day=1), ) _add_db_to_cluster_state( @@ -313,16 +320,16 @@ def test_check_cluster_do_failover_with_a_node_which_is_is_maintenance(pgl): pgl.never_promote_these_nodes = [] pgl.own_db = "kuu" - pgl.execute_external_command.return_value = 0 + cast(Mock, pgl.execute_external_command).return_value = 0 pgl.replication_lag_over_warning_limit = True - pgl.check_for_maintenance_mode_file.return_value = True + cast(Mock, pgl.check_for_maintenance_mode_file).return_value = True pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 0 
+ assert cast(Mock, pgl.execute_external_command).call_count == 0 assert pgl.replication_lag_over_warning_limit is True - assert pgl.check_for_maintenance_mode_file.call_count == 1 + assert cast(Mock, pgl.check_for_maintenance_mode_file).call_count == 1 -def test_check_cluster_do_failover_with_a_node_which_should_never_be_promoted(pgl): +def test_check_cluster_do_failover_with_a_node_which_should_never_be_promoted(pgl: PgLookout) -> None: _add_db_to_cluster_state(pgl, "old_master", pg_is_in_recovery=False, connection=False) _add_db_to_cluster_state( @@ -335,14 +342,14 @@ def test_check_cluster_do_failover_with_a_node_which_should_never_be_promoted(pg ) pgl.never_promote_these_nodes = ["kuu"] pgl.own_db = "kuu" - pgl.execute_external_command.return_value = 0 + cast(Mock, pgl.execute_external_command).return_value = 0 pgl.replication_lag_over_warning_limit = True pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 0 + assert cast(Mock, pgl.execute_external_command).call_count == 0 assert pgl.replication_lag_over_warning_limit is True -def test_check_cluster_do_failover_two_standbys(pgl): +def test_check_cluster_do_failover_two_standbys(pgl: PgLookout) -> None: _add_db_to_cluster_state(pgl, "old_master", pg_is_in_recovery=False, connection=False) _add_db_to_cluster_state( @@ -364,22 +371,22 @@ def test_check_cluster_do_failover_two_standbys(pgl): replication_time_lag=130.0, ) - pgl.execute_external_command.return_value = 0 + cast(Mock, pgl.execute_external_command).return_value = 0 pgl.replication_lag_over_warning_limit = True pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 0 + assert cast(Mock, pgl.execute_external_command).call_count == 0 assert pgl.replication_lag_over_warning_limit is True # we keep the warning on def test_check_cluster_do_failover_two_standbys_when_the_one_ahead_can_never_be_promoted( - pgl, -): + pgl: PgLookout, +) -> None: _add_db_to_cluster_state( pgl, "old_master", pg_is_in_recovery=False, connection=False, - db_time=datetime.datetime(year=2014, month=1, day=1), + db_time=datetime(year=2014, month=1, day=1), ) _add_db_to_cluster_state( @@ -401,14 +408,14 @@ def test_check_cluster_do_failover_two_standbys_when_the_one_ahead_can_never_be_ replication_time_lag=130.0, ) pgl.never_promote_these_nodes = ["puu"] - pgl.execute_external_command.return_value = 0 + cast(Mock, pgl.execute_external_command).return_value = 0 pgl.replication_lag_over_warning_limit = True pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 1 + assert cast(Mock, pgl.execute_external_command).call_count == 1 assert pgl.replication_lag_over_warning_limit is False -def test_failover_with_no_master_anymore(pgl): +def test_failover_with_no_master_anymore(pgl: PgLookout) -> None: # this should trigger an immediate failover as we have two # standbys online but we've never seen a master pgl.own_db = "kuu" @@ -429,12 +436,12 @@ def test_failover_with_no_master_anymore(pgl): replication_time_lag=1, ) - pgl.execute_external_command.return_value = 0 + cast(Mock, pgl.execute_external_command).return_value = 0 pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 1 + assert cast(Mock, pgl.execute_external_command).call_count == 1 -def test_failover_over_replication_lag_when_still_connected_to_master(pgl): +def test_failover_over_replication_lag_when_still_connected_to_master(pgl: PgLookout) -> None: _add_db_to_cluster_state(pgl, "old_master", pg_is_in_recovery=False, connection=False) # We will make our own 
node to be the furthest along so we get considered for promotion @@ -449,13 +456,13 @@ def test_failover_over_replication_lag_when_still_connected_to_master(pgl): pgl.own_db = "kuu" pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 0 + assert cast(Mock, pgl.execute_external_command).call_count == 0 assert pgl.replication_lag_over_warning_limit is True # we keep the warning on def test_failover_over_replication_lag_with_one_observer_one_standby_no_connections( - pgl, -): + pgl: PgLookout, +) -> None: _add_db_to_cluster_state(pgl, "old_master", pg_is_in_recovery=False, connection=False) # We will make our own node to be the furthest along so we get considered for promotion @@ -475,7 +482,7 @@ def test_failover_over_replication_lag_with_one_observer_one_standby_no_connecti "old_master", pg_is_in_recovery=False, connection=False, - db_time=datetime.datetime(year=2014, month=1, day=1), + db_time=datetime(year=2014, month=1, day=1), ) _add_to_observer_state( pgl, @@ -488,11 +495,11 @@ def test_failover_over_replication_lag_with_one_observer_one_standby_no_connecti ) pgl.observer_state["observer"]["connection"] = False pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 0 + assert cast(Mock, pgl.execute_external_command).call_count == 0 assert pgl.replication_lag_over_warning_limit is True # we keep the warning on -def test_cluster_state_when_observer_has_also_non_members_of_our_current_cluster(pgl): +def test_cluster_state_when_observer_has_also_non_members_of_our_current_cluster(pgl: PgLookout) -> None: _add_db_to_cluster_state(pgl, "old_master", pg_is_in_recovery=False, connection=True) # We will make our own node to be the furthest along so we get considered for promotion @@ -512,7 +519,7 @@ def test_cluster_state_when_observer_has_also_non_members_of_our_current_cluster "old_master", pg_is_in_recovery=False, connection=False, - db_time=datetime.datetime(year=2014, month=1, day=1), + db_time=datetime(year=2014, month=1, day=1), ) _add_to_observer_state( pgl, @@ -537,7 +544,7 @@ def test_cluster_state_when_observer_has_also_non_members_of_our_current_cluster assert "old_master" in pgl.connected_master_nodes -def test_failover_no_connections(pgl): +def test_failover_no_connections(pgl: PgLookout) -> None: _add_db_to_cluster_state(pgl, "old_master", pg_is_in_recovery=False, connection=False) # We will make our own node to be the furthest along so we get considered for promotion @@ -561,17 +568,17 @@ def test_failover_no_connections(pgl): replication_time_lag=130.0, ) pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 0 + assert cast(Mock, pgl.execute_external_command).call_count == 0 assert pgl.replication_lag_over_warning_limit is True # we keep the warning on -def test_failover_master_two_standbys_one_observer_no_connection_between_standbys(pgl): +def test_failover_master_two_standbys_one_observer_no_connection_between_standbys(pgl: PgLookout) -> None: _add_db_to_cluster_state( pgl, "old_master", pg_is_in_recovery=False, connection=False, - db_time=datetime.datetime(year=2014, month=1, day=1), + db_time=datetime(year=2014, month=1, day=1), ) # We will make our own node to be the furthest along so we get considered for promotion _add_db_to_cluster_state( @@ -600,7 +607,7 @@ def test_failover_master_two_standbys_one_observer_no_connection_between_standby "old_master", pg_is_in_recovery=False, connection=False, - db_time=datetime.datetime(year=2014, month=1, day=1), + db_time=datetime(year=2014, month=1, day=1), ) 
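+    # the stale 2014 db_time above marks the old master as long dead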
_add_to_observer_state( pgl, @@ -620,17 +627,17 @@ def test_failover_master_two_standbys_one_observer_no_connection_between_standby connection=True, replication_time_lag=130.0, ) - pgl.execute_external_command.return_value = 0 + cast(Mock, pgl.execute_external_command).return_value = 0 pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 0 + assert cast(Mock, pgl.execute_external_command).call_count == 0 assert pgl.replication_lag_over_warning_limit is True pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 1 + assert cast(Mock, pgl.execute_external_command).call_count == 1 assert pgl.replication_lag_over_warning_limit is False -def test_failover_master_one_standby_one_observer_no_connections(pgl): +def test_failover_master_one_standby_one_observer_no_connections(pgl: PgLookout) -> None: pgl.own_db = "own" # Add observer state @@ -649,7 +656,7 @@ def test_failover_master_one_standby_one_observer_no_connections(pgl): pgl.check_cluster_state() assert pgl.replication_lag_over_warning_limit is True # we keep the warning on - assert pgl.execute_external_command.call_count == 0 + assert cast(Mock, pgl.execute_external_command).call_count == 0 # Add observer state _add_to_observer_state(pgl, "observer", "old_master", pg_is_in_recovery=False, connection=True) @@ -675,7 +682,7 @@ def test_failover_master_one_standby_one_observer_no_connections(pgl): pgl.check_cluster_state() # No failover yet - assert pgl.execute_external_command.call_count == 0 + assert cast(Mock, pgl.execute_external_command).call_count == 0 assert pgl.replication_lag_over_warning_limit is True # we keep the warning on # observer state @@ -685,7 +692,7 @@ def test_failover_master_one_standby_one_observer_no_connections(pgl): "old_master", pg_is_in_recovery=False, connection=False, - db_time=datetime.datetime(year=2014, month=1, day=1), + db_time=datetime(year=2014, month=1, day=1), ) _add_to_observer_state( pgl, @@ -702,14 +709,14 @@ def test_failover_master_one_standby_one_observer_no_connections(pgl): "old_master", pg_is_in_recovery=False, connection=False, - db_time=datetime.datetime(year=2014, month=1, day=1), + db_time=datetime(year=2014, month=1, day=1), ) # now do failover pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 1 + assert cast(Mock, pgl.execute_external_command).call_count == 1 -def test_find_current_master(pgl): +def test_find_current_master(pgl: PgLookout) -> None: _add_db_to_cluster_state(pgl, "master", pg_is_in_recovery=False, connection=True) # We will make our own node to be the furthest along so we get considered for promotion _add_db_to_cluster_state( @@ -725,13 +732,13 @@ def test_find_current_master(pgl): assert pgl.current_master == "master" -def test_two_standby_failover_and_autofollow(pgl, tmpdir): +def test_two_standby_failover_and_autofollow(pgl: PgLookout, tmp_path: Path) -> None: _add_db_to_cluster_state( pgl, "old_master", pg_is_in_recovery=False, connection=False, - fetch_time=datetime.datetime(year=2014, month=1, day=1), + fetch_time=datetime(year=2014, month=1, day=1), ) # We will make our own node to be the furthest from master so we don't get considered for promotion _add_db_to_cluster_state( @@ -754,7 +761,7 @@ def test_two_standby_failover_and_autofollow(pgl, tmpdir): pgl.check_cluster_state() assert pgl.replication_lag_over_warning_limit is True # we keep the warning on - assert pgl.execute_external_command.call_count == 0 + assert cast(Mock, pgl.execute_external_command).call_count == 0 assert 
pgl.current_master == "old_master" _add_db_to_cluster_state( @@ -767,29 +774,26 @@ def test_two_standby_failover_and_autofollow(pgl, tmpdir): conn_info={"host": "otherhost.example.com", "port": 11111}, ) - pg_data_dir = tmpdir.join("test_pgdata").strpath - os.makedirs(pg_data_dir) + pg_data_dir = tmp_path / "test_pgdata" + pg_data_dir.mkdir(parents=True, exist_ok=True) primary_conninfo = ( "user=replication password=vjsh8l7sv4a902y1tsdz " "host=old_master port=5432 " "sslmode=prefer sslcompression=1 krbsrvname=postgres" ) old_recovery_conf = f"standby_mode = 'on'\nprimary_conninfo = '{primary_conninfo}'\n" - with open(os.path.join(pg_data_dir, "recovery.conf"), "w") as fp: - fp.write(old_recovery_conf) + (pg_data_dir / "recovery.conf").write_text(old_recovery_conf) - pgl.config["pg_data_directory"] = pg_data_dir + pgl.config["pg_data_directory"] = str(pg_data_dir) pgl.config["autofollow"] = True pgl.primary_conninfo_template = get_connection_info(primary_conninfo) - with open(os.path.join(pg_data_dir, "PG_VERSION"), "w") as fp: - fp.write("11\n") + (pg_data_dir / "PG_VERSION").write_text("11\n") pgl.check_cluster_state() assert pgl.current_master == "other" - with open(os.path.join(pg_data_dir, "recovery.conf"), "r") as fp: - new_lines = fp.read().splitlines() + new_lines = (pg_data_dir / "recovery.conf").read_text().splitlines() assert new_lines.pop(0).startswith("# pglookout updated primary_conninfo") assert new_lines.pop(0) == "standby_mode = 'on'" assert new_lines[0].startswith("primary_conninfo = ") @@ -801,8 +805,8 @@ def test_two_standby_failover_and_autofollow(pgl, tmpdir): assert new_conn_info == dict(old_conn_info, host="otherhost.example.com", port="11111") -def test_replication_positions(pgl): - standby_nodes = { +def test_replication_positions(pgl: PgLookout) -> None: + standby_nodes: dict[str, MemberState] = { "10.255.255.10": { "connection": True, "db_time": "2014-08-28T14:09:57.919301+00:00Z", @@ -814,24 +818,31 @@ def test_replication_positions(pgl): "replication_time_lag": 254.341944, }, } + # the above node shouldn't show up as its fetch_time is (way) older than 20 seconds - positions = {} + positions: dict[int, set[str]] = {} assert pgl.get_replication_positions(standby_nodes) == positions standby_nodes["10.255.255.10"]["fetch_time"] = get_iso_timestamp() - positions[0x9000090] = set(["10.255.255.10"]) + positions[0x9000090] = {"10.255.255.10"} assert pgl.get_replication_positions(standby_nodes) == positions + # add another standby, further ahead - standby_nodes["10.255.255.11"] = dict(standby_nodes["10.255.255.10"], pg_last_xlog_receive_location="1/0000AAAA") - positions[1 << 32 | 0xAAAA] = set(["10.255.255.11"]) + standby_nodes["10.255.255.11"] = cast( + MemberState, dict(standby_nodes["10.255.255.10"], pg_last_xlog_receive_location="1/0000AAAA") + ) + positions[1 << 32 | 0xAAAA] = {"10.255.255.11"} assert pgl.get_replication_positions(standby_nodes) == positions + # add another standby which hasn't received anything - standby_nodes["10.255.255.12"] = dict(standby_nodes["10.255.255.10"], pg_last_xlog_receive_location=None) + standby_nodes["10.255.255.12"] = cast( + MemberState, dict(standby_nodes["10.255.255.10"], pg_last_xlog_receive_location=None) + ) positions[0x9000090].add("10.255.255.12") assert pgl.get_replication_positions(standby_nodes) == positions -def test_node_map(pgl): - cluster_state = { +def test_node_map(pgl: PgLookout) -> None: + cluster_state: dict[str, MemberState] = { "10.255.255.10": { "connection": True, "db_time": 
"2014-08-28T14:26:51.067084+00:00Z", @@ -848,38 +859,40 @@ def test_node_map(pgl): }, } observer_state = { - "10.255.255.11": { - "10.255.255.10": { + "10.255.255.11": ObservedState( + { + "10.255.255.10": { + "connection": True, + "db_time": "2014-08-28T14:26:47.105901+00:00Z", + "fetch_time": "2014-08-28T14:26:47.104849Z", + "pg_is_in_recovery": False, + "pg_last_xact_replay_timestamp": "2014-08-28T14:05:43.577357+00:00Z", + "pg_last_xlog_receive_location": "0/9000090", + "pg_last_xlog_replay_location": "0/9000090", + "replication_time_lag": 1263.528544, + }, + "10.255.255.9": { + "connection": False, + "db_time": "2014-08-28T14:06:15.172820+00:00Z", + "fetch_time": "2014-08-28T14:26:47.107115Z", + "pg_is_in_recovery": False, + "pg_last_xact_replay_timestamp": None, + "pg_last_xlog_receive_location": None, + "pg_last_xlog_replay_location": None, + }, "connection": True, - "db_time": "2014-08-28T14:26:47.105901+00:00Z", - "fetch_time": "2014-08-28T14:26:47.104849Z", - "pg_is_in_recovery": False, - "pg_last_xact_replay_timestamp": "2014-08-28T14:05:43.577357+00:00Z", - "pg_last_xlog_receive_location": "0/9000090", - "pg_last_xlog_replay_location": "0/9000090", - "replication_time_lag": 1263.528544, - }, - "10.255.255.9": { - "connection": False, - "db_time": "2014-08-28T14:06:15.172820+00:00Z", - "fetch_time": "2014-08-28T14:26:47.107115Z", - "pg_is_in_recovery": False, - "pg_last_xact_replay_timestamp": None, - "pg_last_xlog_receive_location": None, - "pg_last_xlog_replay_location": None, - }, - "connection": True, - "fetch_time": "2014-08-28T14:26:51.069891Z", - } + "fetch_time": "2014-08-28T14:26:51.069891Z", + } + ) } master_host, _, standby_nodes = pgl.create_node_map(cluster_state, observer_state) assert master_host == "10.255.255.10" assert standby_nodes == {} -def test_node_map_disconnected_current_master(pgl): +def test_node_map_disconnected_current_master(pgl: PgLookout) -> None: pgl.current_master = "10.255.255.7" - cluster_state = { + cluster_state: dict[str, MemberState] = { "10.255.255.7": { "connection": False, "db_time": "2014-09-07T15:26:23.957151+00:00Z", @@ -900,14 +913,14 @@ def test_node_map_disconnected_current_master(pgl): "replication_time_lag": 43.586525, }, } - observer_state = {} + observer_state: dict[str, ObservedState] = {} master_host, _, standby_nodes = pgl.create_node_map(cluster_state, observer_state) assert master_host == "10.255.255.7" assert list(standby_nodes.keys())[0] == "10.255.255.8" -def test_standbys_failover_equal_replication_positions(pgl): - now = datetime.datetime.utcnow() +def test_standbys_failover_equal_replication_positions(pgl: PgLookout) -> None: + now = datetime.utcnow() _add_db_to_cluster_state( pgl, instance="192.168.54.183", @@ -926,8 +939,8 @@ def test_standbys_failover_equal_replication_positions(pgl): pg_is_in_recovery=False, connection=False, replication_time_lag=0.0, - fetch_time=now - datetime.timedelta(seconds=3600), - db_time=now - datetime.timedelta(seconds=3600), + fetch_time=now - timedelta(seconds=3600), + db_time=now - timedelta(seconds=3600), conn_info="foobar", ) _add_db_to_cluster_state( @@ -947,15 +960,15 @@ def test_standbys_failover_equal_replication_positions(pgl): # highest standby currently. 
pgl.own_db = "192.168.54.183" pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 0 + assert cast(Mock, pgl.execute_external_command).call_count == 0 # If we're the highest we should see call_count increment pgl.own_db = "192.168.63.4" pgl.check_cluster_state() - assert pgl.execute_external_command.call_count == 1 + assert cast(Mock, pgl.execute_external_command).call_count == 1 -def test_node_map_when_only_observer_sees_master(pgl): - cluster_state = { +def test_node_map_when_only_observer_sees_master(pgl: PgLookout) -> None: + cluster_state: dict[str, MemberState] = { "10.255.255.10": { "connection": False, "db_time": "2014-08-28T14:26:51.067084+00:00Z", @@ -967,21 +980,23 @@ def test_node_map_when_only_observer_sees_master(pgl): "replication_time_lag": 1267.489727, }, } - observer_state = { - "10.255.255.11": { - "10.255.255.10": { + observer_state: dict[str, ObservedState] = { + "10.255.255.11": ObservedState( + { + "10.255.255.10": { + "connection": True, + "db_time": "2014-08-28T14:26:47.105901+00:00Z", + "fetch_time": "2014-08-28T14:26:50.104849Z", + "pg_is_in_recovery": False, + "pg_last_xact_replay_timestamp": "2014-08-28T14:05:43.577357+00:00Z", + "pg_last_xlog_receive_location": "0/9000090", + "pg_last_xlog_replay_location": "0/9000090", + "replication_time_lag": 1263.528544, + }, "connection": True, - "db_time": "2014-08-28T14:26:47.105901+00:00Z", - "fetch_time": "2014-08-28T14:26:50.104849Z", - "pg_is_in_recovery": False, - "pg_last_xact_replay_timestamp": "2014-08-28T14:05:43.577357+00:00Z", - "pg_last_xlog_receive_location": "0/9000090", - "pg_last_xlog_replay_location": "0/9000090", - "replication_time_lag": 1263.528544, - }, - "connection": True, - "fetch_time": "2014-08-28T14:26:51.069891Z", - } + "fetch_time": "2014-08-28T14:26:51.069891Z", + } + ) } master_instance, _, _ = pgl.create_node_map(cluster_state, observer_state) assert master_instance == "10.255.255.10" @@ -989,7 +1004,7 @@ def test_node_map_when_only_observer_sees_master(pgl): assert master_instance in pgl.connected_master_nodes -def test_poll_observers_on_warning_only(pgl): +def test_poll_observers_on_warning_only(pgl: PgLookout) -> None: pgl.config["poll_observers_on_warning_only"] = True pgl.config["observers"] = {"local": "URL"} pgl.own_db = "kuu" @@ -998,7 +1013,7 @@ def test_poll_observers_on_warning_only(pgl): "master", pg_is_in_recovery=False, connection=True, - db_time=datetime.datetime.min, + db_time=datetime.min, ) _add_db_to_cluster_state( pgl, @@ -1010,7 +1025,7 @@ def test_poll_observers_on_warning_only(pgl): pgl.check_cluster_state() assert "master" not in pgl.disconnected_master_nodes assert "master" in pgl.connected_master_nodes - assert pgl.execute_external_command.call_count == 0 + assert cast(Mock, pgl.execute_external_command).call_count == 0 assert pgl.replication_lag_over_warning_limit assert pgl.observer_state_newer_than is not None @@ -1019,7 +1034,7 @@ def test_poll_observers_on_warning_only(pgl): "master", pg_is_in_recovery=False, connection=False, - db_time=datetime.datetime.min, + db_time=datetime.min, ) _add_db_to_cluster_state( pgl, @@ -1034,17 +1049,17 @@ def test_poll_observers_on_warning_only(pgl): "master", pg_is_in_recovery=False, connection=False, - db_time=datetime.datetime.min, + db_time=datetime.min, ) pgl.check_cluster_state() # this check makes sure we did not skip doing checks because db_poll_inteval has # pass but because observer data is available and create_node_map was called assert "master" in pgl.disconnected_master_nodes - assert 
pgl.execute_external_command.call_count == 1 + assert cast(Mock, pgl.execute_external_command).call_count == 1 @pytest.mark.parametrize( - "start_time_offset, last_run_offset, timeout_configured, signaled", + ("start_time_offset", "last_run_offset", "timeout_configured", "signaled"), [ # timeout disabled, should never alert (None, None, "null", False), @@ -1080,12 +1095,13 @@ def test_poll_observers_on_warning_only(pgl): ], ) def test_check_cluster_monitor_health( - pgl, - start_time_offset: Optional[float], - last_run_offset: Optional[float], - timeout_configured: Optional[Union[float, str]], + pgl: PgLookout, + start_time_offset: float | None, + last_run_offset: float | None, + timeout_configured: float | str | None, signaled: bool, ) -> None: + assert pgl.cluster_monitor is not None now = time.monotonic() if timeout_configured is not None: if timeout_configured == "null": From 1f471551d9e001239387cacb1ee6b06f3fce7f67 Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Mon, 27 Mar 2023 13:58:12 +0200 Subject: [PATCH 13/23] mypy: New method to normalize config [BF-1560] Change `remote_conns` type hint. The temporary `str` type is no longer accepted. When loading the config, a conversion is done to only have the dict-style type in the config's `remote_conns`. --- pglookout/cluster_monitor.py | 9 +++++---- pglookout/config.py | 3 ++- pglookout/pglookout.py | 22 +++++++++++++++++++--- pglookout/pgutil.py | 12 +++++++----- test/test_pglookout.py | 9 +++------ 5 files changed, 36 insertions(+), 19 deletions(-) diff --git a/pglookout/cluster_monitor.py b/pglookout/cluster_monitor.py index 29eab48..93ea4bf 100644 --- a/pglookout/cluster_monitor.py +++ b/pglookout/cluster_monitor.py @@ -17,7 +17,7 @@ from pglookout.common import get_iso_timestamp, parse_iso_datetime from pglookout.common_types import MemberState, ObservedState, ReplicationSlotAsDict from pglookout.config import Config -from pglookout.pgutil import mask_connection_info +from pglookout.pgutil import ConnectionInfo, create_connection_string, mask_connection_info from pglookout.statsd import StatsClient from psycopg2.extensions import POLL_OK, POLL_READ, POLL_WRITE from psycopg2.extras import RealDictCursor, RealDictRow @@ -115,7 +115,7 @@ def __init__( self.last_monitoring_success_time: float | None = None self.log.debug("Initialized ClusterMonitor with: %r", cluster_state) - def _connect_to_db(self, instance: str, dsn: str | None) -> psycopg2.connection | None: + def _connect_to_db(self, instance: str, dsn: ConnectionInfo | None) -> psycopg2.connection | None: conn = self.db_conns.get(instance) if conn: @@ -130,7 +130,8 @@ def _connect_to_db(self, instance: str, dsn: str | None) -> psycopg2.connection try: self.log.info("Connecting to %s", inst_info_str) - conn = psycopg2.connect(dsn=dsn, async_=True) + dsn_string = create_connection_string(dsn) + conn = psycopg2.connect(dsn=dsn_string, async_=True) wait_select(conn) self.log.debug("Connected to %s", inst_info_str) except (PglookoutTimeout, psycopg2.OperationalError) as ex: @@ -259,7 +260,7 @@ def _query_cluster_member_state(self, instance: str, db_conn: psycopg2.connectio result: MemberState = {"fetch_time": get_iso_timestamp(), "connection": False} if not db_conn: - dsn: str | None = self.config["remote_conns"].get(instance) + dsn: ConnectionInfo | None = self.config["remote_conns"].get(instance) db_conn = self._connect_to_db(instance, dsn) if not db_conn: return result diff --git a/pglookout/config.py b/pglookout/config.py index 9575841..5860dee 100644 --- a/pglookout/config.py 
+++ b/pglookout/config.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2023 Aiven, Helsinki, Finland. https://aiven.io/
 from __future__ import annotations
 
+from pglookout.pgutil import ConnectionInfo
 from typing import Literal, TypedDict
 
 
@@ -34,7 +35,7 @@ class Config(TypedDict, total=False):
     pg_stop_command: str
     poll_observers_on_warning_only: bool
     primary_conninfo_template: str
-    remote_conns: dict[str, str]  # instance name -> dsn
+    remote_conns: dict[str, ConnectionInfo]
     replication_catchup_timeout: float
     replication_state_check_interval: float
     statsd: Statsd
diff --git a/pglookout/pglookout.py b/pglookout/pglookout.py
index a6cb009..2ccf2d7 100755
--- a/pglookout/pglookout.py
+++ b/pglookout/pglookout.py
@@ -45,7 +45,7 @@
 from socket import gethostname
 from subprocess import CalledProcessError, check_call
 from types import FrameType
-from typing import cast, Final, Optional
+from typing import Any, cast, Final, Optional
 
 import json
 import sys
@@ -169,13 +169,29 @@ def load_config_from_signal(self, _signal: int, _frame: FrameType | None = None)
         )
         self.load_config()
 
+    @staticmethod
+    def normalize_config(config: dict[str, Any]) -> Config:
+        """Normalize the config to a known format.
+
+        Warning:
+            This method does not give any guarantees about the validity of the config.
+
+        Note:
+            A better approach would be to use a schema to validate the config, or
+            to use something like ``pydantic`` or ``dataclasses`` to define the config.
+        """
+        if "remote_conns" in config:
+            config["remote_conns"] = {name: get_connection_info(info) for name, info in config["remote_conns"].items()}
+
+        return cast(Config, config)
+
     def load_config(self) -> None:
         self.log.debug("Loading JSON config from: %r", self.config_path)
 
         previous_remote_conns = self.config.get("remote_conns")
         try:
-            with self.config_path.open() as fp:
-                self.config = json.load(fp)
+            config = json.loads(self.config_path.read_text())
+            self.config = self.normalize_config(config)
         except Exception as ex:  # pylint: disable=broad-except
             self.log.exception("Invalid JSON config, exiting")
             self.stats.unexpected_exception(ex, where="load_config")
diff --git a/pglookout/pgutil.py b/pglookout/pgutil.py
index 7d5f692..799fcca 100644
--- a/pglookout/pgutil.py
+++ b/pglookout/pgutil.py
@@ -7,7 +7,7 @@
 """
 from __future__ import annotations
 
-from typing import cast, Literal, TypedDict
+from typing import cast, Literal, TypedDict, Union
 from urllib.parse import parse_qs, urlparse  # pylint: disable=no-name-in-module, import-error
 
 import psycopg2.extensions
@@ -74,11 +74,15 @@ class ConnectionParameterKeywords(TypedDict, total=False):
     target_session_attrs: Literal["any", "read-write", "read-only", "primary", "standby", "prefer-standby"]
 
 
+#: Type alias for allowed connection info types.
+ConnectionInfo = Union[DsnDict, DsnDictDeprecated, ConnectionParameterKeywords] + + def create_connection_string(connection_info: DsnDict | DsnDictDeprecated | ConnectionParameterKeywords) -> str: return str(psycopg2.extensions.make_dsn(**connection_info)) -def mask_connection_info(info: str) -> str: +def mask_connection_info(info: str | ConnectionInfo) -> str: masked_info = get_connection_info(info) password = masked_info.pop("password", None) connection_string = create_connection_string(masked_info) @@ -92,9 +96,7 @@ def get_connection_info_from_config_line(line: str) -> ConnectionParameterKeywor return get_connection_info(value) -def get_connection_info( - info: str | DsnDict | DsnDictDeprecated | ConnectionParameterKeywords, -) -> ConnectionParameterKeywords: +def get_connection_info(info: str | ConnectionInfo) -> ConnectionParameterKeywords: """Get a normalized connection info dict from a connection string or a dict. Supports both the traditional libpq format and the new url format. diff --git a/test/test_pglookout.py b/test/test_pglookout.py index 9aa3421..4286011 100644 --- a/test/test_pglookout.py +++ b/test/test_pglookout.py @@ -14,7 +14,7 @@ from pglookout.common import get_iso_timestamp from pglookout.common_types import MemberState, ObservedState from pglookout.pglookout import PgLookout -from pglookout.pgutil import get_connection_info, get_connection_info_from_config_line +from pglookout.pgutil import ConnectionInfo, DsnDict, get_connection_info, get_connection_info_from_config_line from psycopg2._psycopg import connection as pg_connection # pylint: disable=no-name-in-module from typing import cast from unittest.mock import Mock, patch @@ -113,7 +113,7 @@ def _add_db_to_cluster_state( replication_time_lag: float | None = None, fetch_time: datetime | None = None, db_time: datetime | None = None, - conn_info: str | None = None, + conn_info: ConnectionInfo | None = None, ) -> None: db_node_state = _create_db_node_state( pg_last_xlog_receive_location, @@ -124,7 +124,7 @@ def _add_db_to_cluster_state( db_time=db_time, ) pgl.cluster_state[instance] = db_node_state - pgl.config["remote_conns"][instance] = conn_info or {"host": instance} + pgl.config["remote_conns"][instance] = get_connection_info(conn_info or cast(DsnDict, {"host": instance})) def test_check_cluster_state_warning(pgl: PgLookout) -> None: @@ -930,7 +930,6 @@ def test_standbys_failover_equal_replication_positions(pgl: PgLookout) -> None: replication_time_lag=400.435871, fetch_time=now, db_time=now, - conn_info="foobar", ) _add_db_to_cluster_state( pgl, @@ -941,7 +940,6 @@ def test_standbys_failover_equal_replication_positions(pgl: PgLookout) -> None: replication_time_lag=0.0, fetch_time=now - timedelta(seconds=3600), db_time=now - timedelta(seconds=3600), - conn_info="foobar", ) _add_db_to_cluster_state( pgl, @@ -952,7 +950,6 @@ def test_standbys_failover_equal_replication_positions(pgl: PgLookout) -> None: replication_time_lag=401.104655, fetch_time=now, db_time=now, - conn_info="foobar", ) pgl.current_master = "192.168.57.180" From 5d4ac6e8a331a3031a2bdf5e1ccc152322ecc57c Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Mon, 27 Mar 2023 15:51:13 +0200 Subject: [PATCH 14/23] mypy: `version.py` [BF-1560] Extraction of the 3 read functions: from file, git, and Makefile. 
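
Roughly, the new helpers compose like this (a simplified sketch, not code
from this diff; the real `get_project_version` also persists the result via
`save_version` and raises when no source yields a version):

    version = (
        get_git_version(ROOT_DIR)
        or get_makefile_version(ROOT_DIR / "Makefile")
        or get_existing_version(ROOT_DIR / "pglookout/version.py")
    )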
--- Makefile | 2 +- pyproject.toml | 5 +-- version.py | 101 ++++++++++++++++++++++++++++++++++--------------- 3 files changed, 72 insertions(+), 36 deletions(-) diff --git a/Makefile b/Makefile index e1df801..a958cd6 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ long_ver = $(shell git describe --long 2>/dev/null || echo $(short_ver)-0-unknow generated = pglookout/version.py PYTHON ?= python3 -PYTHON_SOURCE_DIRS = pglookout/ test/ stubs/ +PYTHON_SOURCE_DIRS = pglookout/ test/ stubs/ version.py all: $(generated) : 'try "make rpm" or "make deb" or "make test"' diff --git a/pyproject.toml b/pyproject.toml index 48a7c04..94dd101 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,16 +27,13 @@ strict = true files = [ 'pglookout/', 'test/', + 'version.py', ] exclude = [ - # Implementation. - 'pglookout/__main__.py', - 'pglookout/version.py', # Tests. 'test/conftest.py', # Other. 'setup.py', - 'version.py', ] [[tool.mypy.overrides]] diff --git a/version.py b/version.py index a3e88b6..1f12706 100644 --- a/version.py +++ b/version.py @@ -5,56 +5,95 @@ Copyright (c) 2015 Ohmu Ltd See LICENSE for details """ +from __future__ import annotations -import imp -import os +from datetime import datetime +from pathlib import Path +from textwrap import dedent +from typing import Final + +import imp # pylint: disable=deprecated-module import subprocess +ROOT_DIR: Final[Path] = Path(__file__).parent + -def save_version(new_ver, old_ver, version_file): +def save_version(new_ver: str, old_ver: str | None, version_file: Path) -> bool: + """Write new version to the version file if it has changed.""" if not new_ver: return False - version_file = os.path.join(os.path.dirname(__file__), version_file) + if not old_ver or new_ver != old_ver: - with open(version_file, "w") as fp: - fp.write("__version__ = '{}'\n".format(new_ver)) + version_file.write_text( + dedent( + f"""\ + # Copyright (c) {datetime.now():%Y} Aiven, Helsinki, Finland. https://aiven.io/ + __version__ = '{new_ver}' + """ + ) + ) + return True -def get_project_version(version_file): - version_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), version_file) +def get_existing_version(version_file: Path) -> str | None: + """Read version from the version file""" try: - module = imp.load_source("verfile", version_file) - file_ver = module.__version__ - except IOError: - file_ver = None + module = imp.load_source("verfile", str(version_file)) + return str(module.__version__) + except (IOError, AttributeError): + return None - os.chdir(os.path.dirname(__file__) or ".") + +def get_makefile_version(makefile_path: Path) -> str | None: + """Read version from ``Makefile``.""" + if makefile_path.is_file(): + lines = makefile_path.read_text().splitlines() + try: + it_short_ver = (line.split("=", 1)[1].strip() for line in lines if line.startswith("short_ver")) + return next(it_short_ver) + except StopIteration: + pass + return None + + +def get_git_version(repo_dir: Path) -> str | None: + """Read version from git.""" try: - git_out = subprocess.check_output(["git", "describe", "--always"], - stderr=getattr(subprocess, "DEVNULL", None)) + git_out = subprocess.check_output( + ["git", "describe", "--always"], cwd=repo_dir, stderr=getattr(subprocess, "DEVNULL", None) + ) except (OSError, subprocess.CalledProcessError): - pass - else: - git_ver = git_out.splitlines()[0].strip().decode("utf-8") - if "." 
not in git_ver: - git_ver = "0.0.1-0-unknown-{}".format(git_ver) - if save_version(git_ver, file_ver, version_file): - return git_ver - - makefile = os.path.join(os.path.dirname(__file__), "Makefile") - if os.path.exists(makefile): - with open(makefile, "r") as fp: - lines = fp.readlines() - short_ver = [line.split("=", 1)[1].strip() for line in lines if line.startswith("short_ver")][0] - if save_version(short_ver, file_ver, version_file): - return short_ver + return None + + git_ver = git_out.splitlines()[0].strip().decode("utf-8") + if "." not in git_ver: + git_ver = f"0.0.1-0-unknown-{git_ver}" + + return git_ver + + +def get_project_version(version_file_name: str) -> str: + """Read version from git or from the version file""" + version_file = ROOT_DIR / version_file_name + file_ver = get_existing_version(version_file) + + git_ver = get_git_version(ROOT_DIR) + if git_ver and save_version(git_ver, file_ver, version_file): + return git_ver + + makefile = ROOT_DIR / "Makefile" + short_ver = get_makefile_version(makefile) + if short_ver and save_version(short_ver, file_ver, version_file): + return short_ver if not file_ver: - raise Exception("version not available from git or from file {!r}".format(version_file)) + raise Exception(f"version not available from git or from file {version_file!r}") return file_ver + if __name__ == "__main__": import sys + get_project_version(sys.argv[1]) From 51bb85e34a7e4bfa5324435f001992f4428c1626 Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Mon, 27 Mar 2023 16:07:01 +0200 Subject: [PATCH 15/23] mypy: `setup.py` [BF-1560] --- Makefile | 2 +- pyproject.toml | 3 +-- requirements.dev.txt | 1 + setup.py | 12 ++++++------ 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index a958cd6..955ea1e 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ long_ver = $(shell git describe --long 2>/dev/null || echo $(short_ver)-0-unknow generated = pglookout/version.py PYTHON ?= python3 -PYTHON_SOURCE_DIRS = pglookout/ test/ stubs/ version.py +PYTHON_SOURCE_DIRS = pglookout/ test/ stubs/ version.py setup.py all: $(generated) : 'try "make rpm" or "make deb" or "make test"' diff --git a/pyproject.toml b/pyproject.toml index 94dd101..4a92069 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,12 +28,11 @@ files = [ 'pglookout/', 'test/', 'version.py', + 'setup.py', ] exclude = [ # Tests. 'test/conftest.py', - # Other. - 'setup.py', ] [[tool.mypy.overrides]] diff --git a/requirements.dev.txt b/requirements.dev.txt index 1dd3f89..2635d85 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -5,5 +5,6 @@ isort==5.10.1 mypy==1.1.1 types-psycopg2==2.9.21.8 types-requests==2.26.1 +types-setuptools==67.6.0.5 pylint==2.15.4 pytest diff --git a/setup.py b/setup.py index b068297..ee3789a 100644 --- a/setup.py +++ b/setup.py @@ -1,12 +1,12 @@ -from setuptools import setup, find_packages -import os +# Copyright (c) 2023 Aiven, Helsinki, Finland. 
https://aiven.io/ +from pathlib import Path +from setuptools import find_packages, setup + import sys import version - -readme_path = os.path.join(os.path.dirname(__file__), "README.rst") -with open(readme_path, "r") as fp: - readme_text = fp.read() +readme_path = Path(__file__).parent / "README.rst" +readme_text = readme_path.read_text() version_for_setup_py = version.get_project_version("pglookout/version.py") From e9834d4092e3411602bba91a3b6ee219fe8e4088 Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Mon, 27 Mar 2023 16:08:10 +0200 Subject: [PATCH 16/23] mypy: Remove python2 dependency [BF-1560] Package https://pypi.org/project/futures/ is not needed in Python3. --- setup.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/setup.py b/setup.py index ee3789a..a3564ac 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,6 @@ from pathlib import Path from setuptools import find_packages, setup -import sys import version readme_path = Path(__file__).parent / "README.rst" @@ -18,9 +17,6 @@ "requests >= 1.2.0", ] -if sys.version_info[0] == 2: - requires.append("futures") - setup( name="pglookout", From a3ad6f850a0ef8386d574d432d667dbac43dda27 Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Mon, 27 Mar 2023 16:54:19 +0200 Subject: [PATCH 17/23] mypy: `conftest.py` [BF-1560] Also includes some cleanup to better leverage pytest features. --- pyproject.toml | 4 -- test/conftest.py | 125 ++++++++++++++++++----------------- test/test_cluster_monitor.py | 11 +-- 3 files changed, 72 insertions(+), 68 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4a92069..10f5aa9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,10 +30,6 @@ files = [ 'version.py', 'setup.py', ] -exclude = [ - # Tests. - 'test/conftest.py', -] [[tool.mypy.overrides]] module = "test.test_cluster_monitor" diff --git a/test/conftest.py b/test/conftest.py index 869ad7e..a27b015 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -4,32 +4,37 @@ Copyright (c) 2016 Ohmu Ltd See LICENSE for details """ -from pglookout import logutil, pgutil +# pylint: disable=protected-access +from __future__ import annotations + +from pathlib import Path +from pglookout import logutil from pglookout.pglookout import PgLookout -from py import path as py_path # pylint: disable=no-name-in-module -from unittest.mock import Mock +from pglookout.pgutil import DsnDict +from textwrap import dedent +from typing import cast, Final, Generator +from unittest.mock import Mock, patch import os import pytest import signal import subprocess -import tempfile import time -PG_VERSIONS = ["15", "14", "13", "12", "11", "10"] +PG_VERSIONS: Final[list[str]] = ["15", "14", "13", "12", "11", "10"] logutil.configure_logging() @pytest.fixture -def pgl(): +def pgl() -> Generator[PgLookout, None, None]: pgl_ = PgLookout("pglookout.json") + assert pgl_.cluster_monitor is not None pgl_.config["remote_conns"] = {} - pgl_.check_for_maintenance_mode_file = Mock() - pgl_.check_for_maintenance_mode_file.return_value = False - pgl_.cluster_monitor._connect_to_db = Mock() # pylint: disable=protected-access - pgl_.create_alert_file = Mock() - pgl_.execute_external_command = Mock() + pgl_.check_for_maintenance_mode_file = Mock(return_value=False) # type: ignore[method-assign] + pgl_.cluster_monitor._connect_to_db = Mock() # type: ignore[method-assign] + pgl_.create_alert_file = Mock() # type: ignore[method-assign] + pgl_.execute_external_command = Mock() # type: ignore[method-assign] try: yield pgl_ finally: @@ -37,52 +42,51 @@ def pgl(): class TestPG: - def 
__init__(self, pgdata): - self.pgbin = self.find_pgbin() - self.pgdata = pgdata - self.pg = None + def __init__(self, pgdata: Path) -> None: + self.pgbin: Path = self.find_pgbin() + self.pgdata: Path = pgdata + self.pg: subprocess.Popen[bytes] | None = None @staticmethod - def find_pgbin(versions=None): + def find_pgbin(versions: list[str] | None = None) -> Path: pathformats = ["/usr/pgsql-{ver}/bin", "/usr/lib/postgresql/{ver}/bin"] for ver in versions or PG_VERSIONS: for pathfmt in pathformats: - pgbin = pathfmt.format(ver=ver) - if os.path.exists(pgbin): + pgbin = Path(pathfmt.format(ver=ver)) + if pgbin.is_dir(): return pgbin - return "/usr/bin" + return Path("/usr/bin") @property - def pgver(self): - with open(os.path.join(self.pgdata, "PG_VERSION"), "r") as fp: - return fp.read().strip() + def pgver(self) -> str: + return (self.pgdata / "PG_VERSION").read_text().strip() - def connection_string(self, user="testuser", dbname="postgres"): - return pgutil.create_connection_string( + def connection_dict(self, user: str = "testuser", dbname: str = "postgres") -> DsnDict: + return cast( + DsnDict, { "dbname": dbname, "host": self.pgdata, "port": 5432, "user": user, - } + }, ) - def createuser(self, user="testuser"): - self.run_cmd("createuser", "-h", self.pgdata, "-p", "5432", "-s", user) + def createuser(self, user: str = "testuser") -> None: + self.run_cmd("createuser", "-h", str(self.pgdata), "-p", "5432", "-s", user) - def run_cmd(self, cmd, *args): - argv = [os.path.join(self.pgbin, cmd)] - argv.extend(args) + def run_cmd(self, cmd: str, *args: str) -> None: + argv = [str(self.pgbin / cmd), *args] subprocess.check_call(argv) - def run_pg(self): + def run_pg(self) -> None: self.pg = subprocess.Popen( # pylint: disable=bad-option-value,consider-using-with [ - os.path.join(self.pgbin, "postgres"), + str(self.pgbin / "postgres"), "-D", - self.pgdata, + str(self.pgdata), "-k", - self.pgdata, + str(self.pgdata), "-p", "5432", "-c", @@ -91,7 +95,7 @@ def run_pg(self): ) time.sleep(1.0) # let pg start - def kill(self, force=True, immediate=True): + def kill(self, force: bool = True, immediate: bool = True) -> None: if self.pg is None: return if force: @@ -107,24 +111,25 @@ def kill(self, force=True, immediate=True): raise Exception(f"PG pid {self.pg.pid} not dead") -# NOTE: cannot use 'tmpdir' fixture here, it only works in 'function' scope @pytest.fixture(scope="session") -def db(): - tmpdir_obj = py_path.local(tempfile.mkdtemp(prefix="pglookout_dbtest_")) - tmpdir = str(tmpdir_obj) +def db(tmp_path_factory: pytest.TempPathFactory) -> Generator[TestPG, None, None]: + tmpdir = tmp_path_factory.mktemp(basename="pglookout_dbtest_") # try to find the binaries for these versions in some path - pgdata = os.path.join(tmpdir, "pgdata") + pgdata = tmpdir / "pgdata" db = TestPG(pgdata) # pylint: disable=redefined-outer-name - db.run_cmd("initdb", "-D", pgdata, "--encoding", "utf-8") - # NOTE: point $HOME to tmpdir - $HOME shouldn't affect most tests, but - # psql triest to find .pgpass file from there as do our functions that - # manipulate pgpass. By pointing $HOME there we make sure we're not - # making persistent changes to the environment. 
- os.environ["HOME"] = tmpdir + db.run_cmd("initdb", "-D", str(pgdata), "--encoding", "utf-8") + # allow replication connections - with open(os.path.join(pgdata, "pg_hba.conf"), "w") as fp: - fp.write("local all all trust\nlocal replication all trust\n") - with open(os.path.join(pgdata, "postgresql.conf"), "a") as fp: + (pgdata / "pg_hba.conf").write_text( + dedent( + """\ + local all all trust + local replication all trust + """ + ) + ) + + with (pgdata / "postgresql.conf").open("a") as fp: fp.write( "max_wal_senders = 2\n" "wal_level = logical\n" @@ -136,14 +141,16 @@ def db(): ) if db.pgver < "13": fp.write("wal_keep_segments = 100\n") - db.run_pg() - try: - db.createuser() - db.createuser("otheruser") - yield db - finally: - db.kill() + + # NOTE: point $HOME to tmpdir - $HOME shouldn't affect most tests, but + # psql triest to find .pgpass file from there as do our functions that + # manipulate pgpass. By pointing $HOME there we make sure we're not + # making persistent changes to the environment. + with patch.dict(os.environ, {"HOME": str(tmpdir)}): + db.run_pg() try: - tmpdir_obj.remove(rec=1) - except: # pylint: disable=bare-except - pass + db.createuser() + db.createuser("otheruser") + yield db + finally: + db.kill() diff --git a/test/test_cluster_monitor.py b/test/test_cluster_monitor.py index b841be2..845d862 100644 --- a/test/test_cluster_monitor.py +++ b/test/test_cluster_monitor.py @@ -12,6 +12,7 @@ from pglookout.cluster_monitor import ClusterMonitor from pglookout.common_types import MemberState, ObservedState from pglookout.config import Config +from pglookout.pgutil import create_connection_string from psycopg2.extras import RealDictCursor from queue import Queue from typing import NoReturn @@ -49,8 +50,8 @@ def test_replication_lag() -> None: def test_main_loop(db: TestPG) -> None: config: Config = { "remote_conns": { - "test1db": db.connection_string("testuser"), - "test2db": db.connection_string("otheruser"), + "test1db": db.connection_dict("testuser"), + "test2db": db.connection_dict("otheruser"), }, "observers": {"local": "URL"}, "poll_observers_on_warning_only": True, @@ -111,8 +112,8 @@ def test_fetch_replication_slot_info(db: TestPG) -> None: config: Config = { "remote_conns": { - "test1db": db.connection_string("testuser"), - "test2db": db.connection_string("otheruser"), + "test1db": db.connection_dict("testuser"), + "test2db": db.connection_dict("otheruser"), }, "observers": {"local": "URL"}, "poll_observers_on_warning_only": True, @@ -138,7 +139,7 @@ def create_alert_file(arg: str) -> NoReturn: ) cm.main_monitoring_loop(requested_check=True) - with closing(psycopg2.connect(db.connection_string(), connect_timeout=15)) as conn: + with closing(psycopg2.connect(create_connection_string(db.connection_dict()), connect_timeout=15)) as conn: with conn.cursor(cursor_factory=RealDictCursor) as cursor: cursor.execute("SELECT pg_catalog.pg_create_logical_replication_slot('testslot1', 'test_decoding')") From 7434873c76aa912d76cf2a3fe76974036af20e34 Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Mon, 27 Mar 2023 16:56:49 +0200 Subject: [PATCH 18/23] Bump `Makefile` version to `2.1.0` [BF-1560] --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 955ea1e..700617c 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -short_ver = 2.0.3 +short_ver = 2.1.0 long_ver = $(shell git describe --long 2>/dev/null || echo $(short_ver)-0-unknown-g`git describe --always`) generated = pglookout/version.py From 
30bcd0d4eca27f21a44f94063897ba57283be4c9 Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Mon, 27 Mar 2023 17:15:03 +0200 Subject: [PATCH 19/23] mypy: Compatible with Python 3.9 logging [BF-1560] --- pglookout/pglookout.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pglookout/pglookout.py b/pglookout/pglookout.py index 2ccf2d7..aa9828f 100755 --- a/pglookout/pglookout.py +++ b/pglookout/pglookout.py @@ -12,7 +12,7 @@ from argparse import ArgumentParser from copy import deepcopy from datetime import datetime, timedelta -from logging import DEBUG, getLevelNamesMapping, getLogger, Logger +from logging import _nameToLevel, DEBUG, getLogger, Logger from logging.handlers import SysLogHandler from packaging.version import parse as parse_version from pathlib import Path @@ -52,7 +52,7 @@ import time DEFAULT_LOG_LEVEL: Final[str] = "DEBUG" -LOG_LEVEL_NAMES_MAPPING: Final[dict[str, int]] = getLevelNamesMapping() +LOG_LEVEL_NAMES_MAPPING: Final[dict[str, int]] = deepcopy(_nameToLevel) class PgLookout: From e4e771ae39928b023b352660fd83e95f8b0a8eaf Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Mon, 27 Mar 2023 18:12:33 +0200 Subject: [PATCH 20/23] pylint: Enable more messages [BF-1560] --- pglookout/cluster_monitor.py | 5 ++--- pglookout/current_master.py | 19 +++++++++---------- pglookout/logutil.py | 2 +- pglookout/pglookout.py | 30 ++++++++++++++---------------- pglookout/pgutil.py | 8 +++++--- pyproject.toml | 32 ++++++++------------------------ test/conftest.py | 2 +- 7 files changed, 40 insertions(+), 58 deletions(-) diff --git a/pglookout/cluster_monitor.py b/pglookout/cluster_monitor.py index 93ea4bf..70b4348 100644 --- a/pglookout/cluster_monitor.py +++ b/pglookout/cluster_monitor.py @@ -178,9 +178,9 @@ def _fetch_observer_state(self, instance: str, uri: str) -> ObservedState | None instance, time_diff, response.json(), - ) # pylint: disable=no-member + ) return None - result.update(response.json()) # pylint: disable=no-member + result.update(response.json()) except requests.ConnectionError as ex: self.log.warning( "%s (%s) fetching state from observer: %r, %r", @@ -347,7 +347,6 @@ def _get_statement_query_updating_transaction(server_version: int) -> str: return "SELECT txid_current(), pg_current_wal_lsn() AS pg_last_xlog_replay_location" return "SELECT txid_current(), pg_current_xlog_location() AS pg_last_xlog_replay_location" - # FIXME: Find a tighter input + return type @staticmethod def _parse_status_query_result(result: MemberState) -> MemberState: if not result: diff --git a/pglookout/current_master.py b/pglookout/current_master.py index f6dfb54..b9e068c 100644 --- a/pglookout/current_master.py +++ b/pglookout/current_master.py @@ -11,11 +11,11 @@ from __future__ import annotations from . 
+from pathlib import Path
 from pglookout.default import JSON_STATE_FILE_PATH
 
 import argparse
 import json
-import os
 import sys
 import time
 
@@ -34,22 +34,21 @@ def main(args: list[str] | None = None) -> int:
         help="show program version",
         version=version.__version__,
     )
-    parser.add_argument("state", help="pglookout state file")
+    parser.add_argument("state", type=Path, help="pglookout state file")
     arg = parser.parse_args(args)
 
-    if not os.path.exists(arg.state):
-        print(f"pglookout_current_master: {arg.state!r} doesn't exist")
+    state_file: Path = arg.state
+    if not state_file.is_file():
+        print(f"pglookout_current_master: {arg.state!s} doesn't exist")
         return 1
 
     try:
-        with open(arg.state, "r") as fp:
-            config = json.load(fp)
-        state_file_path = config.get("json_state_file_path", JSON_STATE_FILE_PATH)  # pylint: disable=no-member
-        if time.monotonic() - os.stat(state_file_path).st_mtime > 60.0:
+        config = json.loads(state_file.read_text(encoding="utf-8"))
+        state_file_path = Path(config.get("json_state_file_path", JSON_STATE_FILE_PATH))
+        if time.time() - state_file_path.stat().st_mtime > 60.0:
             # file older than one minute, pglookout probably dead, exit with minus one
             return -1
-        with open(state_file_path, "r") as fp:
-            state_dict = json.load(fp)
+        state_dict = json.loads(state_file_path.read_text(encoding="utf-8"))
         current_master = state_dict["current_master"]
         print(current_master)
     except:  # pylint: disable=bare-except
diff --git a/pglookout/logutil.py b/pglookout/logutil.py
index f0150ef..ee70b52 100644
--- a/pglookout/logutil.py
+++ b/pglookout/logutil.py
@@ -15,7 +15,7 @@
 
 with_systemd: bool = False
 try:
-    from systemd import daemon  # pylint: disable=no-name-in-module
+    from systemd import daemon
 
     with_systemd = True
 except ImportError:
diff --git a/pglookout/pglookout.py b/pglookout/pglookout.py
index aa9828f..ec4a84b 100755
--- a/pglookout/pglookout.py
+++ b/pglookout/pglookout.py
@@ -190,7 +190,7 @@ def load_config(self) -> None:
         previous_remote_conns = self.config.get("remote_conns")
 
         try:
-            config = json.loads(self.config_path.read_text())
+            config = json.loads(self.config_path.read_text(encoding="utf-8"))
             self.config = self.normalize_config(config)
         except Exception as ex:  # pylint: disable=broad-except
             self.log.exception("Invalid JSON config, exiting")
@@ -282,7 +282,7 @@ def write_cluster_state_to_json_file(self) -> None:
             )
 
             state_file_path_tmp = state_file_path.with_name(f"{state_file_path.name}.tmp")
-            state_file_path_tmp.write_text(json_to_dump)
+            state_file_path_tmp.write_text(json_to_dump, encoding="utf-8")
             state_file_path_tmp.rename(state_file_path)
 
             self.log.debug(
@@ -452,7 +452,7 @@ def create_node_map(
 
         return master_instance, master_node, standby_nodes
 
-    def is_restoring_or_catching_up_normally(self, state: MemberState) -> bool:
+    def _is_restoring_or_catching_up_normally(self, state: MemberState) -> bool:
         """
         Return True if node is still in the replication catchup phase and
         replication lag alerts/metrics should not yet be generated.
@@ -476,7 +476,7 @@ def is_restoring_or_catching_up_normally(self, state: MemberState) -> bool: return False def emit_stats(self, state: MemberState) -> None: - if self.is_restoring_or_catching_up_normally(state): + if self._is_restoring_or_catching_up_normally(state): # do not emit misleading lag stats during catchup at restore return @@ -484,7 +484,7 @@ def emit_stats(self, state: MemberState) -> None: if replication_time_lag is not None: self.stats.gauge("pg.replication_lag", replication_time_lag) - def is_master_observer_new_enough(self, observer_state: dict[str, ObservedState]) -> bool: + def _is_master_observer_new_enough(self, observer_state: dict[str, ObservedState]) -> bool: if not self.replication_lag_over_warning_limit: return True @@ -536,7 +536,7 @@ def check_cluster_state(self) -> None: ) return - if self.config.get("poll_observers_on_warning_only") and not self.is_master_observer_new_enough(observer_state): + if self.config.get("poll_observers_on_warning_only") and not self._is_master_observer_new_enough(observer_state): self.log.warning("observer data is not good enough, skipping check") return @@ -551,7 +551,7 @@ def check_cluster_state(self) -> None: ) self.current_master = master_instance if self.own_db and self.own_db != master_instance and self.config.get("autofollow"): - self.start_following_new_master(master_instance) + self._start_following_new_master(master_instance) self._make_failover_decision(observer_state, standby_nodes, master_node) @@ -685,7 +685,7 @@ def is_replication_lag_over_warning_limit(self) -> bool: return self.replication_lag_over_warning_limit def check_replication_lag(self, own_state: MemberState, standby_nodes: dict[str, MemberState]) -> None: - if self.is_restoring_or_catching_up_normally(own_state): + if self._is_restoring_or_catching_up_normally(own_state): # do not raise alerts during catchup at restore return @@ -749,8 +749,7 @@ def get_replication_positions(self, standby_nodes: dict[str, MemberState]) -> di node_state["connection"] and now - parse_iso_datetime(node_state["fetch_time"]) < timedelta(seconds=20) and instance not in self.never_promote_these_nodes - ): # noqa # pylint: disable=line-too-long - # use pg_last_xlog_receive_location if it's available, + ): # use pg_last_xlog_receive_location if it's available, # otherwise fall back to pg_last_xlog_replay_location but # note that both of them can be None. 
We prefer # receive_location over replay_location as some nodes may @@ -870,7 +869,7 @@ def do_failover_decision(self, own_state: MemberState, standby_nodes: dict[str, furthest_along_instance, ) - def modify_recovery_conf_to_point_at_new_master(self, new_master_instance: str) -> bool: + def _modify_recovery_conf_to_point_at_new_master(self, new_master_instance: str) -> bool: pg_data_directory = Path(self.config.get("pg_data_directory", PG_DATA_DIRECTORY)) pg_version = (pg_data_directory / "PG_VERSION").read_text().strip() @@ -940,9 +939,9 @@ def modify_recovery_conf_to_point_at_new_master(self, new_master_instance: str) return True - def start_following_new_master(self, new_master_instance: str) -> None: + def _start_following_new_master(self, new_master_instance: str) -> None: start_time = time.monotonic() - updated_config = self.modify_recovery_conf_to_point_at_new_master(new_master_instance) + updated_config = self._modify_recovery_conf_to_point_at_new_master(new_master_instance) if not updated_config: self.log.info( "Already following master %r, no need to start following it again", @@ -979,7 +978,7 @@ def execute_external_command(self, command: list[str] | str) -> int: err.output, ) self.stats.unexpected_exception(err, where="execute_external_command") - return_code = err.returncode # pylint: disable=no-member + return_code = err.returncode self.log.warning("Executed external command: %r, output: %r", return_code, output) return return_code @@ -1028,8 +1027,7 @@ def _get_health_timeout_seconds(self) -> float | None: if "cluster_monitor_health_timeout_seconds" in self.config: config_value = self.config.get("cluster_monitor_health_timeout_seconds") return config_value if config_value is None else float(config_value) - else: - return self._get_check_interval() * 2 + return self._get_check_interval() * 2 def _get_check_interval(self) -> float: return float(self.config.get("replication_state_check_interval", 5.0)) diff --git a/pglookout/pgutil.py b/pglookout/pgutil.py index 799fcca..1ea70be 100644 --- a/pglookout/pgutil.py +++ b/pglookout/pgutil.py @@ -8,7 +8,7 @@ from __future__ import annotations from typing import cast, Literal, TypedDict, Union -from urllib.parse import parse_qs, urlparse # pylint: disable=no-name-in-module, import-error +from urllib.parse import parse_qs, urlparse import psycopg2.extensions @@ -131,7 +131,9 @@ def parse_connection_string_url(url: str) -> ConnectionParameterKeywords: return cast(ConnectionParameterKeywords, fields) -def parse_connection_string_libpq(connection_string: str) -> ConnectionParameterKeywords: +def parse_connection_string_libpq( + connection_string: str, +) -> ConnectionParameterKeywords: """Parse a postgresql connection string. See: @@ -159,7 +161,7 @@ def parse_connection_string_libpq(connection_string: str) -> ConnectionParameter value += rem[i] else: raise ValueError(f"invalid connection_string fragment {rem!r}") - connection_string = rem[i + 1 :] # pylint: disable=undefined-loop-variable + connection_string = rem[i + 1 :] else: res = rem.split(None, 1) if len(res) > 1: diff --git a/pyproject.toml b/pyproject.toml index 10f5aa9..51b1995 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,43 +27,23 @@ strict = true files = [ 'pglookout/', 'test/', - 'version.py', - 'setup.py', + './*.py', ] -[[tool.mypy.overrides]] -module = "test.test_cluster_monitor" -# ignore no-untyped-call because conftest can only type hinted at the end. Remove at the end. 
-disallow_untyped_calls = false - [tool.pylint.'MESSAGES CONTROL'] +enable = [ + 'useless-suppression', +] disable = [ - 'bad-option-value', 'duplicate-code', 'fixme', - 'import-outside-toplevel', 'invalid-name', - 'len-as-condition', - 'locally-disabled', 'missing-docstring', - 'no-else-raise', - 'no-else-return', - 'no-self-use', - 'raise-missing-from', - 'too-few-public-methods', - 'too-many-ancestors', - 'too-many-arguments', - 'too-many-boolean-expressions', 'too-many-branches', - 'too-many-function-args', 'too-many-instance-attributes', 'too-many-locals', - 'too-many-public-methods', 'too-many-statements', - 'ungrouped-imports', - 'unspecified-encoding', 'wrong-import-order', - 'wrong-import-position', ] [tool.pylint.'FORMAT'] @@ -76,3 +56,7 @@ reports = 'no' [tool.pylint.'TYPECHECK'] extension-pkg-whitelist = 'pydantic' + +[tool.pylint.'DESIGN'] +max-args = 10 +max-attributes = 10 diff --git a/test/conftest.py b/test/conftest.py index a27b015..8165f39 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -80,7 +80,7 @@ def run_cmd(self, cmd: str, *args: str) -> None: subprocess.check_call(argv) def run_pg(self) -> None: - self.pg = subprocess.Popen( # pylint: disable=bad-option-value,consider-using-with + self.pg = subprocess.Popen( # pylint: disable=consider-using-with [ str(self.pgbin / "postgres"), "-D", From 56c01a875bd558da18056f2b5e620f2e5945d98a Mon Sep 17 00:00:00 2001 From: Samuel Giffard Date: Tue, 28 Mar 2023 12:10:54 +0200 Subject: [PATCH 21/23] pylint: Fix broad exception raised [BF-1560] --- pglookout/pglookout.py | 1 - test/conftest.py | 2 +- test/test_cluster_monitor.py | 8 ++++++-- version.py | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pglookout/pglookout.py b/pglookout/pglookout.py index ec4a84b..d8d4ae1 100755 --- a/pglookout/pglookout.py +++ b/pglookout/pglookout.py @@ -523,7 +523,6 @@ def _is_master_observer_new_enough(self, observer_state: dict[str, ObservedState return True def check_cluster_state(self) -> None: - # master_node = None cluster_state = deepcopy(self.cluster_state) observer_state = deepcopy(self.observer_state) configured_node_count = len(self.config.get("remote_conns", {})) diff --git a/test/conftest.py b/test/conftest.py index 8165f39..47937e6 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -108,7 +108,7 @@ def kill(self, force: bool = True, immediate: bool = True) -> None: while (self.pg.poll() is None) and (time.monotonic() < timeout): time.sleep(0.1) if not force and self.pg.poll() is None: - raise Exception(f"PG pid {self.pg.pid} not dead") + raise TimeoutError(f"PG pid {self.pg.pid} not dead") @pytest.fixture(scope="session") diff --git a/test/test_cluster_monitor.py b/test/test_cluster_monitor.py index 845d862..2fe4e40 100644 --- a/test/test_cluster_monitor.py +++ b/test/test_cluster_monitor.py @@ -24,6 +24,10 @@ import time +class AlertFileCreatedException(Exception): + pass + + def test_replication_lag() -> None: # pylint: disable=protected-access now = datetime.now() @@ -60,7 +64,7 @@ def test_main_loop(db: TestPG) -> None: observer_state: dict[str, ObservedState] = {} def create_alert_file(arg: str) -> NoReturn: - raise Exception(arg) + raise AlertFileCreatedException(arg) cluster_monitor_check_queue: Queue[str] = Queue() failover_decision_queue: Queue[str] = Queue() @@ -122,7 +126,7 @@ def test_fetch_replication_slot_info(db: TestPG) -> None: observer_state: dict[str, ObservedState] = {} def create_alert_file(arg: str) -> NoReturn: - raise Exception(arg) + raise AlertFileCreatedException(arg) 
     cluster_monitor_check_queue: Queue[str] = Queue()
     failover_decision_queue: Queue[str] = Queue()
diff --git a/version.py b/version.py
index 1f12706..5e2ccf0 100644
--- a/version.py
+++ b/version.py
@@ -88,7 +88,7 @@ def get_project_version(version_file_name: str) -> str:
             return short_ver
 
     if not file_ver:
-        raise Exception(f"version not available from git or from file {version_file!r}")
+        raise ValueError(f"version not available from git or from file {version_file!r}")
 
     return file_ver
 
From 09a15b1d4aabe2feb4c512a1cff1d24837cafeab Mon Sep 17 00:00:00 2001
From: Samuel Giffard
Date: Tue, 28 Mar 2023 12:11:49 +0200
Subject: [PATCH 22/23] pylint: Dev requirements are now pinned [BF-1560]

This fixes some behaviour discrepancies between local and CI.
---
 requirements.dev.txt | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/requirements.dev.txt b/requirements.dev.txt
index 2635d85..f2148a8 100644
--- a/requirements.dev.txt
+++ b/requirements.dev.txt
@@ -1,10 +1,10 @@
-black==22.10.0
-flake8
-flake8-pyproject==1.2.2
-isort==5.10.1
+black==23.1.0
+flake8==6.0.0
+flake8-pyproject==1.2.3
+isort==5.12.0
 mypy==1.1.1
-types-psycopg2==2.9.21.8
-types-requests==2.26.1
+types-psycopg2==2.9.21.9
+types-requests==2.28.11.16
 types-setuptools==67.6.0.5
-pylint==2.15.4
-pytest
+pylint==2.17.1
+pytest==7.2.2

From 9a0abaeaab31f37a787c73e343f755231acc92b8 Mon Sep 17 00:00:00 2001
From: Samuel Giffard
Date: Tue, 4 Apr 2023 16:42:00 +0200
Subject: [PATCH 23/23] make: Dev reqs are now in isolation [BF-1560]

This makes sure that the tests run in their own virtualenv, without
touching the system dependencies, while still being able to import them
(the venv is created with --system-site-packages).
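
Roughly what the new $(VENV) bootstrap and local-install targets automate,
as a Python sketch (.venv-test matches the VENV path in the Makefile below):

    import subprocess
    import sys
    from pathlib import Path

    venv = Path(".venv-test")
    if not venv.exists():
        # Same interpreter that runs the tests; --system-site-packages lets
        # the venv see distro-provided deps such as python3-psycopg2.
        subprocess.check_call([sys.executable, "-m", "venv", "--system-site-packages", str(venv)])
        subprocess.check_call([str(venv / "bin" / "pip"), "install", "-r", "requirements.dev.txt"])
    # Editable install of pglookout itself, i.e. what `make local-install` does.
    subprocess.check_call([str(venv / "bin" / "python"), "-m", "pip", "install", "-e", "."])

---
 .github/workflows/build.yml |  4 +--
 .gitignore                  |  2 ++
 Makefile                    | 58 ++++++++++++++++++++++---------------
 pglookout.spec              |  1 -
 requirements.dev.txt        |  1 +
 5 files changed, 38 insertions(+), 28 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index c9bafd8..08157fa 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -28,9 +28,7 @@ jobs:
           python-version: ${{ matrix.python-version }}
 
       - id: dependencies
-        run: |
-          pip install -e .
-          pip install -r requirements.dev.txt
+        run: make local-install
 
       - id: mypy
         run: make mypy
diff --git a/.gitignore b/.gitignore
index ec949a5..4d2565b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,5 +14,7 @@ coverage.xml
 rpm/
 dist/
 
+.venv-test/
+
 # Generated targets
 pglookout-rpm-src.tar
diff --git a/Makefile b/Makefile
index 700617c..cb7f2b7 100644
--- a/Makefile
+++ b/Makefile
@@ -2,6 +2,8 @@ short_ver = 2.1.0
 long_ver = $(shell git describe --long 2>/dev/null || echo $(short_ver)-0-unknown-g`git describe --always`)
 generated = pglookout/version.py
 
+VENV = .venv-test
+VENV_PYTHON = $(VENV)/bin/python
 PYTHON ?= python3
 PYTHON_SOURCE_DIRS = pglookout/ test/ stubs/ version.py setup.py
 
@@ -11,35 +13,45 @@ all: $(generated)
 pglookout/version.py: version.py
 	$(PYTHON) $^ $@
 
-test: mypy flake8 pylint unittest
+# This venv is only used for tests and development. It has access to system site packages, because pglookout
+# would need them to run. Development deps are kept in this venv, so that they don't interfere with the
+# system python.
+$(VENV):
+	$(PYTHON) -m venv --system-site-packages $(VENV)
+	$(VENV)/bin/pip install -r requirements.dev.txt
 
-unittest: $(generated)
-	$(PYTHON) -m pytest
+local-install: $(VENV)
+	$(VENV_PYTHON) -m pip install -e .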
-mypy: $(generated) - MYPYPATH=stubs $(PYTHON) -m mypy +test: mypy flake8 pylint unittest fmt-check -flake8: $(generated) - $(PYTHON) -m flake8 $(PYTHON_SOURCE_DIRS) +unittest: $(generated) $(VENV) + $(VENV_PYTHON) -m pytest -pylint: $(generated) - $(PYTHON) -m pylint $(PYTHON_SOURCE_DIRS) +mypy: $(generated) $(VENV) + MYPYPATH=stubs $(VENV_PYTHON) -m mypy -fmt: $(generated) - isort $(PYTHON_SOURCE_DIRS) - black $(PYTHON_SOURCE_DIRS) +flake8: $(generated) $(VENV) + $(VENV_PYTHON) -m flake8 $(PYTHON_SOURCE_DIRS) -fmt-check: $(generated) - isort --check $(PYTHON_SOURCE_DIRS) - black --check $(PYTHON_SOURCE_DIRS) +pylint: $(generated) $(VENV) + $(VENV_PYTHON) -m pylint $(PYTHON_SOURCE_DIRS) -coverage: - $(PYTHON) -m pytest $(PYTEST_ARG) --cov-report term-missing --cov-branch \ +fmt: $(generated) $(VENV) + $(VENV_PYTHON) -m isort $(PYTHON_SOURCE_DIRS) + $(VENV_PYTHON) -m black $(PYTHON_SOURCE_DIRS) + +fmt-check: $(generated) $(VENV) + $(VENV_PYTHON) -m isort --check $(PYTHON_SOURCE_DIRS) + $(VENV_PYTHON) -m black --check $(PYTHON_SOURCE_DIRS) + +coverage: $(VENV) + $(VENV_PYTHON) -m pytest $(PYTEST_ARG) --cov-report term-missing --cov-branch \ --cov-report xml:coverage.xml --cov pglookout test/ clean: - $(RM) -r *.egg-info/ build/ dist/ - $(RM) ../pglookout_* test-*.xml $(generated) + $(RM) -r *.egg-info/ build/ dist/ $(VENV) .hypothesis + $(RM) ../pglookout_* test-*.xml $(generated) .coverage coverage.xml deb: $(generated) cp debian/changelog.in debian/changelog @@ -59,10 +71,8 @@ rpm: $(generated) build-dep-fed: sudo dnf -y install --best --allowerasing \ - python3-devel python3-pytest python3-pylint \ - python3-mock python3-psycopg2 \ - python3-requests rpm-build systemd-python3 \ - python3-flake8 python3-pytest-cov python3-packaging + python3-devel python3-psycopg2 python3-requests \ + rpm-build systemd-python3 python3-packaging build-dep-deb: sudo apt-get install \ @@ -70,4 +80,4 @@ build-dep-deb: python3-all python3-setuptools python3-psycopg2 python3-requests \ python3-packaging -.PHONY: rpm +.PHONY: rpm local-install diff --git a/pglookout.spec b/pglookout.spec index 4bf34c8..2e6fd79 100644 --- a/pglookout.spec +++ b/pglookout.spec @@ -8,7 +8,6 @@ Source0: pglookout-rpm-src.tar Obsoletes: python3-pglookout Requires: python3-packaging, python3-psycopg2, python3-requests, python3-setuptools, systemd-python3, systemd BuildRequires: python3-packaging, python3-psycopg2, python3-requests, python3-setuptools, systemd-python3, systemd -BuildRequires: python3-pytest, python3-pylint BuildArch: noarch %description diff --git a/requirements.dev.txt b/requirements.dev.txt index f2148a8..6eb07d9 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -8,3 +8,4 @@ types-requests==2.28.11.16 types-setuptools==67.6.0.5 pylint==2.17.1 pytest==7.2.2 +pytest-cov==4.0.0