diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..8af22b5 --- /dev/null +++ b/.flake8 @@ -0,0 +1,35 @@ +[flake8] +select = + E, W, # pep8 errors and warnings + F, # pyflakes + C9, # McCabe + N8, # Naming Conventions + #B, S, # bandit + #C, # commas + #D, # docstrings + #P, # string-format + #Q, # quotes + +ignore = + E122, # continuation line missing indentation or outdented + E123, # closing bracket does not match indentation of opening bracket's line + E127, # continuation line over-indented for visual indent + E131, # continuation line unaligned for hanging + E203, # whitespace before ':' + E225, # missing whitespace around operator + E226, # missing whitespace around arithmetic operator + E24, # multiple spaces after ',' or tab after ',' + E275, # missing whitespace after keyword + E305, # expected 2 blank lines after end of function or class + E306, # expected 1 blank line before a nested definition + E402, # module level import not at top of file + E722, # do not use bare except, specify exception instead + E731, # do not assign a lambda expression, use a def + E741, # do not use variables named 'l', 'O', or 'I' + + F722, # syntax error in forward annotation + + W503, # line break before binary operator + W504, # line break after binary operator + +max-line-length = 200 diff --git a/.github/workflows/docker-jepsen.yml b/.github/workflows/docker-jepsen.yml new file mode 100644 index 0000000..056043c --- /dev/null +++ b/.github/workflows/docker-jepsen.yml @@ -0,0 +1,23 @@ +name: Jepsen tests + +on: + schedule: + - cron: '0 0 * * *' + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + test: + name: jepsen + runs-on: ubuntu-22.04 + steps: + - name: Test + run: make jepsen + + - uses: actions/upload-artifact@v3 + if: failure() + with: + name: logs + path: tests/logs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1a94725 --- /dev/null +++ b/.gitignore @@ -0,0 +1,14 @@ +*.pyc +*.swp +.tox/ +*.egg-info/ +htmlcov/ +./Dockerfile +docker/zookeeper/zookeeper*.tar.gz +test_ssh_key* +.idea +junit_report/ +logs/ +venv/ +.python-version +.mypy_cache diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 0000000..6db8ecb --- /dev/null +++ b/.pylintrc @@ -0,0 +1,407 @@ +[MASTER] + +# Specify a configuration file. +#rcfile= + +# Python code to execute, usually for sys.path manipulation such as +# pygtk.require(). +#init-hook= + +# Add files or directories to the blacklist. They should be base names, not +# paths. +ignore= + +# Add files or directories matching the regex patterns to the blacklist. The +# regex matches against base names, not paths. +ignore-patterns= + +# Pickle collected data for later comparisons. +persistent=yes + +# List of plugins (as comma separated values of python modules names) to load, +# usually to register additional checkers. +load-plugins= + +# Use multiple processes to speed up Pylint. +jobs=4 + +# Allow loading of arbitrary C extensions. Extensions are imported into the +# active Python interpreter and may run arbitrary code. +unsafe-load-any-extension=no + +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code +extension-pkg-whitelist= + +# Allow optimization of some AST trees. This will activate a peephole AST +# optimizer, which will apply various small optimizations. 
For instance, it can +# be used to obtain the result of joining multiple strings with the addition +# operator. Joining a lot of strings can lead to a maximum recursion error in +# Pylint and this flag can prevent that. It has one side effect, the resulting +# AST will be different than the one from reality. This option is deprecated +# and it will be removed in Pylint 2.0. +optimize-ast=no + + +[MESSAGES CONTROL] + +# Only show warnings with the listed confidence levels. Leave empty to show +# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED +confidence= + +# Enable the message, report, category or checker with the given id(s). You can +# either give multiple identifier separated by comma (,) or put this option +# multiple time (only on the command line, not in the configuration file where +# it should appear only once). See also the "--disable" option for examples. +#enable= + +# Disable the message, report, category or checker with the given id(s). You +# can either give multiple identifiers separated by comma (,) or put this +# option multiple times (only on the command line, not in the configuration +# file where it should appear only once).You can also use "--disable=all" to +# disable everything first and then reenable specific checks. For example, if +# you want to run only the similarities checker, you can use "--disable=all +# --enable=similarities". If you want to run only the classes checker, but have +# no Warning level messages displayed, use"--disable=all --enable=classes +# --disable=W" +disable=broad-except + + +[REPORTS] + +# Set the output format. Available formats are text, parseable, colorized, msvs +# (visual studio) and html. You can also give a reporter class, eg +# mypackage.mymodule.MyReporterClass. +output-format=text + +# Put messages in a separate file for each module / package specified on the +# command line instead of printing them on stdout. Reports (if any) will be +# written in a file name "pylint_global.[txt|html]". This option is deprecated +# and it will be removed in Pylint 2.0. +files-output=no + +# Tells whether to display a full report or only the messages +reports=yes + +# Python expression which should return a note less than 10 (10 is the highest +# note). You have access to the variables errors warning, statement which +# respectively contain the number of errors / warnings messages and the total +# number of statements analyzed. This is used by the global evaluation report +# (RP0004). +evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) + +# Template used to display messages. This is a python new-style format string +# used to format the message information. See doc for all details +#msg-template= + + +[BASIC] + +# Good variable names which should always be accepted, separated by a comma +good-names=db,zk,i,j,k,ex,Run,_ + +# Bad variable names which should always be refused, separated by a comma +bad-names=foo,bar,baz,toto,tutu,tata + +# Colon-delimited sets of names that determine each other's naming style when +# the name regexes allow several styles. +name-group= + +# Include a hint for the correct naming format with invalid-name +include-naming-hint=no + +# List of decorators that produce properties, such as abc.abstractproperty. Add +# to this list to register other decorators that produce valid properties. 
+property-classes=abc.abstractproperty + +# Regular expression matching correct variable names +variable-rgx=[a-z_][a-z0-9_]{2,30}$ + +# Naming hint for variable names +variable-name-hint=[a-z_][a-z0-9_]{2,30}$ + +# Regular expression matching correct function names +function-rgx=[a-z_][a-z0-9_]{2,30}$ + +# Naming hint for function names +function-name-hint=[a-z_][a-z0-9_]{2,30}$ + +# Regular expression matching correct argument names +argument-rgx=[a-z_][a-z0-9_]{2,30}$ + +# Naming hint for argument names +argument-name-hint=[a-z_][a-z0-9_]{2,30}$ + +# Regular expression matching correct attribute names +attr-rgx=[a-z_][a-z0-9_]{2,30}$ + +# Naming hint for attribute names +attr-name-hint=[a-z_][a-z0-9_]{2,30}$ + +# Regular expression matching correct method names +method-rgx=[a-z_][a-z0-9_]{2,30}$ + +# Naming hint for method names +method-name-hint=[a-z_][a-z0-9_]{2,30}$ + +# Regular expression matching correct module names +module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ + +# Naming hint for module names +module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ + +# Regular expression matching correct class attribute names +class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ + +# Naming hint for class attribute names +class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ + +# Regular expression matching correct class names +class-rgx=[A-Z_][a-zA-Z0-9]+$ + +# Naming hint for class names +class-name-hint=[A-Z_][a-zA-Z0-9]+$ + +# Regular expression matching correct constant names +const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$ + +# Naming hint for constant names +const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$ + +# Regular expression matching correct inline iteration names +inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$ + +# Naming hint for inline iteration names +inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$ + +# Regular expression which should only match function or class names that do +# not require a docstring. +no-docstring-rgx=^_ + +# Minimum line length for functions/classes that require docstrings, shorter +# ones are exempt. +docstring-min-length=-1 + + +[ELIF] + +# Maximum number of nested blocks for function / method body +max-nested-blocks=5 + + +[FORMAT] + +# Maximum number of characters on a single line. +max-line-length=120 + +# Regexp for a line that is allowed to be longer than the limit. +ignore-long-lines=^\s*(# )??$ + +# Allow the body of an if to be on the same line as the test if there is no +# else. +single-line-if-stmt=no + +# List of optional constructs for which whitespace checking is disabled. `dict- +# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. +# `trailing-comma` allows a space between comma and closing bracket: (a, ). +# `empty-line` allows space-only lines. +no-space-check=trailing-comma,dict-separator + +# Maximum number of lines in a module +max-module-lines=1000 + +# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 +# tab). +indent-string=' ' + +# Number of spaces of indent required inside a hanging or continued line. +indent-after-paren=4 + +# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. +expected-line-ending-format= + + +[LOGGING] + +# Logging modules to check that the string format arguments are in logging +# function parameter format +logging-modules=logging + + +[MISCELLANEOUS] + +# List of note tags to take in consideration, separated by a comma. +notes=FIXME,XXX,TODO + + +[SIMILARITIES] + +# Minimum lines number of a similarity. 
+min-similarity-lines=4 + +# Ignore comments when computing similarities. +ignore-comments=yes + +# Ignore docstrings when computing similarities. +ignore-docstrings=yes + +# Ignore imports when computing similarities. +ignore-imports=no + + +[SPELLING] + +# Spelling dictionary name. Available dictionaries: none. To make it working +# install python-enchant package. +spelling-dict= + +# List of comma separated words that should not be checked. +spelling-ignore-words= + +# A path to a file that contains private dictionary; one word per line. +spelling-private-dict-file= + +# Tells whether to store unknown words to indicated private dictionary in +# --spelling-private-dict-file option instead of raising a message. +spelling-store-unknown-words=no + + +[TYPECHECK] + +# Tells whether missing members accessed in mixin class should be ignored. A +# mixin class is detected if its name ends with "mixin" (case insensitive). +ignore-mixin-members=yes + +# List of module names for which member attributes should not be checked +# (useful for modules/projects where namespaces are manipulated during runtime +# and thus existing member attributes cannot be deduced by static analysis. It +# supports qualified module names, as well as Unix pattern matching. +ignored-modules= + +# List of class names for which member attributes should not be checked (useful +# for classes with dynamically set attributes). This supports the use of +# qualified names. +ignored-classes=optparse.Values,thread._local,_thread._local + +# List of members which are set dynamically and missed by pylint inference +# system, and so shouldn't trigger E1101 when accessed. Python regular +# expressions are accepted. +generated-members= + +# List of decorators that produce context managers, such as +# contextlib.contextmanager. Add to this list to register other decorators that +# produce valid context managers. +contextmanager-decorators=contextlib.contextmanager + + +[VARIABLES] + +# Tells whether we should check for unused import in __init__ files. +init-import=no + +# A regular expression matching the name of dummy variables (i.e. expectedly +# not used). +dummy-variables-rgx=(_+[a-zA-Z0-9]*?$)|dummy + +# List of additional names supposed to be defined in builtins. Remember that +# you should avoid to define new builtins when possible. +additional-builtins= + +# List of strings which can identify a callback function by name. A callback +# name must start or end with one of those strings. +callbacks=cb_,_cb + +# List of qualified module names which can have objects that can redefine +# builtins. +redefining-builtins-modules=six.moves,future.builtins + + +[CLASSES] + +# List of method names used to declare (i.e. assign) instance attributes. +defining-attr-methods=__init__,__new__,setUp + +# List of valid names for the first argument in a class method. +valid-classmethod-first-arg=cls + +# List of valid names for the first argument in a metaclass class method. +valid-metaclass-classmethod-first-arg=mcs + +# List of member names, which should be excluded from the protected access +# warning. +exclude-protected=_asdict,_fields,_replace,_source,_make + + +[DESIGN] + +# Maximum number of arguments for function / method +max-args=5 + +# Argument names that match this expression will be ignored. 
Default to name +# with leading underscore +ignored-argument-names=_.* + +# Maximum number of locals for function / method body +max-locals=15 + +# Maximum number of return / yield for function / method body +max-returns=6 + +# Maximum number of branch for function / method body +max-branches=12 + +# Maximum number of statements in function / method body +max-statements=50 + +# Maximum number of parents for a class (see R0901). +max-parents=7 + +# Maximum number of attributes for a class (see R0902). +max-attributes=7 + +# Minimum number of public methods for a class (see R0903). +min-public-methods=2 + +# Maximum number of public methods for a class (see R0904). +max-public-methods=20 + +# Maximum number of boolean expressions in a if statement +max-bool-expr=5 + + +[IMPORTS] + +# Deprecated modules which should not be used, separated by a comma +deprecated-modules=optparse + +# Create a graph of every (i.e. internal and external) dependencies in the +# given file (report RP0402 must not be disabled) +import-graph= + +# Create a graph of external dependencies in the given file (report RP0402 must +# not be disabled) +ext-import-graph= + +# Create a graph of internal dependencies in the given file (report RP0402 must +# not be disabled) +int-import-graph= + +# Force import order to recognize a module as part of the standard +# compatibility libraries. +known-standard-library= + +# Force import order to recognize a module as part of a third party library. +known-third-party=enchant + +# Analyse import fallback blocks. This can be used to support both Python 2 and +# 3 compatible code, which means that the block might have code that exists +# only in one or another interpreter, leading to false positives when analysed. +analyse-fallback-blocks=no + + +[EXCEPTIONS] + +# Exceptions that will emit a warning when being caught. Defaults to +# "Exception" +overgeneral-exceptions=Exception diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 0000000..61909df --- /dev/null +++ b/AUTHORS @@ -0,0 +1,20 @@ +The following authors have created the source code of "PgConsul" +published and distributed by YANDEX LLC as the owner: + +Anna Krkhanbarova annkpx@yandex-team.com +Evgeny Arhipov arhipov@yandex-team.com +Vladimir Borodin d0uble@yandex-team.com +Denis Volkov denchick@yandex-team.com +Dmitriy Sarafannikov dsarafan@yandex-team.com +Evgeny Efimkin efimkin@yandex-team.com +Georgy Rylov godjan@yandex-team.com +Ilya Sivanev isiv@yandex-team.com +Dmitry Smal mialinx@yandex-team.com +Sviatoslav Ermilin munakoiso@yandex-team.com +Evgeny Dyukov secwall@yandex-team.com +Alexander Shadchin shadchin@yandex-team.com +Daniil Zakhlystov usernamedt@yandex-team.com +Victor Popov vicpopov@yandex-team.com +Vladimir Leskov vladimirlesk@yandex-team.com +Leonid Borchuk xifos@yandex-team.com +Aleksandr Ovsyannikov zxczxc@yandex-team.com diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..bb6c67e --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,35 @@ +# Notice to external contributors + + +## General info + +Hello! In order for us (YANDEX LLC) to accept patches and other contributions from you, you will have to adopt our Yandex Contributor License Agreement (the CLA). The current version of the CLA can be found here: +1) https://yandex.ru/legal/cla/?lang=en (in English) and +2) https://yandex.ru/legal/cla/?lang=ru (in Russian). 
+ +By adopting the CLA, you state the following: + +* You obviously wish and are willingly licensing your contributions to us for our open source projects under the terms of the CLA, +* You have read the terms and conditions of the CLA and agree with them in full, +* You are legally able to provide and license your contributions as stated, +* We may use your contributions for our open source projects and for any other our project too, +* We rely on your assurances concerning the rights of third parties in relation to your contributions. + +If you agree with these principles, please read and adopt our CLA. By providing us your contributions, you hereby declare that you have already read and adopt our CLA, and we may freely merge your contributions with our corresponding open source project and use it in further in accordance with terms and conditions of the CLA. + +## Provide contributions + +If you have already adopted terms and conditions of the CLA, you are able to provide your contributions. When you submit your pull request, please add the following information into it: + +``` +I hereby agree to the terms of the CLA available at: [link]. +``` + +Replace the bracketed text as follows: +* [link] is the link to the current version of the CLA: https://yandex.ru/legal/cla/?lang=en (in English) or https://yandex.ru/legal/cla/?lang=ru (in Russian). + +It is enough to provide us such notification once. + +## Other questions + +If you have any questions, please mail us at opensource@yandex-team.ru. diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..da3adca --- /dev/null +++ b/Dockerfile @@ -0,0 +1,53 @@ +FROM ubuntu:bionic +ENV container docker +ENV DEBIAN_FRONTEND noninteractive +ADD https://www.postgresql.org/media/keys/ACCC4CF8.asc keyring.asc +RUN echo 'APT::Install-Recommends "0"; \n\ +APT::Get::Assume-Yes "true"; \n\ +APT::Get::allow-downgrades "true"; \n\ +APT::Install-Suggests "0";' > /etc/apt/apt.conf.d/01buildconfig && \ + apt-get update && \ + apt-get install -qq --no-install-recommends gpg gpg-agent && \ + apt-key add keyring.asc + +RUN echo "deb http://apt.postgresql.org/pub/repos/apt bionic-pgdg main" > /etc/apt/sources.list.d/pgdg.list && \ + apt-get update && apt-get install wget gnupg ca-certificates locales && \ + locale-gen en_US.UTF-8 && \ + apt-get update && \ + apt-get install \ + openjdk-11-jre-headless \ + less \ + bind9-host \ + net-tools \ + iputils-ping \ + sudo \ + telnet \ + git \ + pgbouncer \ + python3-dev \ + python3-pip \ + python3-venv \ + python3-wheel \ + python3-setuptools \ + openssh-server \ + libpq-dev \ + gcc \ + faketime \ + rsync \ + openssl \ + iptables \ + coreutils && \ + pip3 install git+https://github.com/Supervisor/supervisor.git@4619168a4d820b37641a4719e211cf867bd7f49d && \ + pip3 install wheel && \ + rm -rf /var/run && \ + ln -s /dev/shm /var/run +COPY ./ /var/lib/dist +COPY tests/generate_certs.sh /usr/local/bin/generate_certs.sh +RUN chmod 755 /usr/local/bin/generate_certs.sh +RUN mkdir /root/.ssh && \ + chmod 700 /root/.ssh && \ + cp /var/lib/dist/test_ssh_key.pub /root/.ssh/authorized_keys && \ + mkdir -p /etc/supervisor/conf.d && \ + cp /var/lib/dist/tests/conf/supervisord.conf /etc/supervisor/supervisord.conf && \ + cp /var/lib/dist/docker/base/ssh.conf /etc/supervisor/conf.d/ssh.conf +CMD ["/usr/local/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..a57bb8e --- /dev/null +++ b/LICENSE @@ -0,0 +1,13 @@ +Copyright 2023 YANDEX LLC + +Licensed 
under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..aaa821c --- /dev/null +++ b/Makefile @@ -0,0 +1,111 @@ +.PHONY: clean all + +PG_MAJOR=14 + +PGCONSUL_IMAGE=pgconsul:behave +PROJECT=pgconsul +ZK_VERSION=3.7.1 +export ZK_VERSION +INSTALL_DIR=$(DESTDIR)opt/yandex/pgconsul +REPLICATION_TYPE=quorum + +clean_report: + rm -rf htmlcov + +clean: clean_report + rm -rf ../yamail-pgconsul_*.build ../yamail-pgconsul_*.changes ../yamail-pgconsul_*.deb Dockerfile* docker/zookeeper/zookeeper-*.tar.gz test_ssh_key* + mv --force static/pgconsul.sudoers.d.orig static/pgconsul.sudoers.d 2>/dev/null || true + mv --force static/pgconsul.init.d.orig static/pgconsul.init.d 2>/dev/null || true + rm -rf .tox __pycache__ pgconsul.egg-info .mypy_cache + rm -rf junit_report + +install: + echo "Installing into $(INSTALL_DIR)" + # Create installation directories + mkdir -p $(DESTDIR)/opt/yandex + mkdir -p $(DESTDIR)/usr/local/bin + mkdir -p $(DESTDIR)/etc/pgconsul/plugins + # Make venv + python3 -m venv $(INSTALL_DIR) +# echo `git rev-list HEAD --count`-`git rev-parse --short HEAD` > $(INSTALL_DIR)/package.release + echo "1-0303030" > $(INSTALL_DIR)/package.release + # Install dependencies and pgconsul as python packages in venv + $(INSTALL_DIR)/bin/pip install wheel + $(INSTALL_DIR)/bin/pip install --pre -r requirements.txt + $(INSTALL_DIR)/bin/pip install --pre . + # Deliver pgconsul static files + make -C static install + mkdir -p $(DESTDIR)/etc/pgconsul/plugins + # Fix "ValueError: bad marshal data (unknown type code)" + find $(INSTALL_DIR) -name __pycache__ -type d -exec rm -rf {} + + # Make symlinks in /usr/local/bin + ln -s /opt/yandex/pgconsul/bin/pgconsul $(DESTDIR)/usr/local/bin + ln -s /opt/yandex/pgconsul/bin/pgconsul-util $(DESTDIR)/usr/local/bin + # Replace redundant paths with actual ones + # E.g. /tmp/build/opt/yandex/pgconsul -> /opt/yandex/pgconsul + test -n '$(DESTDIR)' \ + && grep -l -r -F '$(INSTALL_DIR)' $(INSTALL_DIR) \ + | xargs sed -i -e 's|$(INSTALL_DIR)|/opt/yandex/pgconsul|' \ + || true + +build: + cp -f docker/base/Dockerfile . + yes | ssh-keygen -m PEM -t rsa -N '' -f test_ssh_key -C jepsen || true + wget https://mirror.yandex.ru/mirrors/apache/zookeeper/zookeeper-$(ZK_VERSION)/apache-zookeeper-$(ZK_VERSION)-bin.tar.gz -nc -O docker/zookeeper/zookeeper-$(ZK_VERSION).tar.gz || true + docker compose -p $(PROJECT) down --rmi all --remove-orphans + docker compose -p $(PROJECT) -f jepsen-compose.yml down --rmi all --remove-orphans + docker build -t pgconsulbase:latest . --label pgconsul_tests + docker compose -p $(PROJECT) build --build-arg replication_type=$(REPLICATION_TYPE) --build-arg pg_major=$(PG_MAJOR) + +build_package: + docker build -f ./docker/dpkg/Dockerfile . 
--tag pgconsul_package_build:1.0 && docker run -e VERSION=$(BUILD_VERSION) -e BUILD_NUMBER=$(BUILD_NUM) pgconsul_package_build:1.0 + +build_pgconsul: + rm -rf logs/ + cp -f tests/Dockerfile ./Dockerfile_pgconsul_behave + docker build -t $(PGCONSUL_IMAGE) \ + --build-arg pg_major=$(PG_MAJOR) \ + -f ./Dockerfile_pgconsul_behave . \ + --label pgconsul_tests + +jepsen_test: + docker compose -p $(PROJECT) -f jepsen-compose.yml up -d + docker exec pgconsul_postgresql1_1 /usr/local/bin/generate_certs.sh + docker exec pgconsul_postgresql2_1 /usr/local/bin/generate_certs.sh + docker exec pgconsul_postgresql3_1 /usr/local/bin/generate_certs.sh + docker exec pgconsul_zookeeper1_1 bash -c '/usr/local/bin/generate_certs.sh && supervisorctl restart zookeeper' + docker exec pgconsul_zookeeper2_1 bash -c '/usr/local/bin/generate_certs.sh && supervisorctl restart zookeeper' + docker exec pgconsul_zookeeper3_1 bash -c '/usr/local/bin/generate_certs.sh && supervisorctl restart zookeeper' + docker exec pgconsul_postgresql1_1 chmod +x /usr/local/bin/setup.sh + docker exec pgconsul_postgresql2_1 chmod +x /usr/local/bin/setup.sh + docker exec pgconsul_postgresql3_1 chmod +x /usr/local/bin/setup.sh + timeout 600 docker exec pgconsul_postgresql1_1 /usr/local/bin/setup.sh $(PG_MAJOR) + timeout 600 docker exec pgconsul_postgresql2_1 /usr/local/bin/setup.sh $(PG_MAJOR) pgconsul_postgresql1_1.pgconsul_pgconsul_net + timeout 600 docker exec pgconsul_postgresql3_1 /usr/local/bin/setup.sh $(PG_MAJOR) pgconsul_postgresql1_1.pgconsul_pgconsul_net + mkdir -p logs + docker exec pgconsul_jepsen_1 chmod +x /root/jepsen/run.sh + (docker exec pgconsul_jepsen_1 /root/jepsen/run.sh >logs/jepsen.log 2>&1 && tail -n 4 logs/jepsen.log && ./docker/jepsen/save_logs.sh $PG_MAJOR) || (./docker/jepsen/save_logs.sh $PG_MAJOR && tail -n 18 logs/jepsen.log && exit 1) + docker compose -p $(PROJECT) -f jepsen-compose.yml down --rmi all + +check_test: build_pgconsul + PROJECT=$(PROJECT) \ + PGCONSUL_IMAGE=$(PGCONSUL_IMAGE) \ + PG_MAJOR=$(PG_MAJOR) \ + tox -e behave -- $(TEST_ARGS) + +check_test_unstoppable: build_pgconsul + PROJECT=$(PROJECT) \ + PGCONSUL_IMAGE=$(PGCONSUL_IMAGE) \ + PG_MAJOR=$(PG_MAJOR) \ + tox -e behave_unstoppable -- $(TEST_ARGS) + +lint: + tox -e yapf,flake8,pylint,bandit + +jepsen: build jepsen_test + +check: build check_test + +check_unstoppable: build check_test_unstoppable + +check-world: clean build check_test jepsen_test diff --git a/README.md b/README.md new file mode 100644 index 0000000..8336307 --- /dev/null +++ b/README.md @@ -0,0 +1,291 @@ +# PgConsul + +## Purpose +PgConsul is a tool for maintaining High-Availability Postgresql cluster configurations. It is responsible for cluster recovery in case of emergencies. + +## Scope +* Adjusting replication mode depending on cluster nodes connectivity. +* Switching the primary role across cluster members, if needed. +* Isolating a node from the load in the event of an abnormal situation. + +## How it works + +### Overview + +Once started, pgconsul enters the processing loop where it performs the following once a second: + +1. Checks for Zookeper lock. +2. Collects information about the replicas and primary status. +3. Writes the collected information to ZK. +4. Decides whether to interfere with the cluster operation. + +Step 4 currently depends on the following factors: +* Interaction with ZK is running smoothly. +* Whether the current instance holds a primary or synchronous replica lock. +* Whether there is a primary lock in ZK. 
+* Whether there are active replicas in the cluster. +* Replication type meets the requirements from the config. + +### Sequence of actions +1. Initialization. Loading plugins. +* 1.1. Making sure there is no stop flag. If there is, pgconsul won't start. +* 1.2. Checking connectivity between PG and ZK. If ZK can't be reached and the current role is primary, connection pooler stops. If PG is not operable, while ZK is up and running, then primary lock is released. +* 1.3. Checking whether pg_rewind can be run. If not, pgconsul terminates. +* 1.4. Making sure the active cluster is not run on "empty" ZK, for this is most likely caused by an error in the configuration. If both of these conditions are met, pgconsul terminates: +* 1.4.1. Checking if there are child nodes in ZK at ```/all_hosts```. +* 1.4.2. Checking the timeline of the current instance: it must exceed 1. +* 1.5. In blocking mode, creating a child node with the current instance's hostname at ```/all_hosts/```. +2. Main loop. +* 2.1. Waiting during ```global:iteration_timeout``` +* 2.2. Identifying the current local role and status of PG. +* 2.3. Identifying the status of ZK (if there is a connect). +* 2.4. Writing a status file with information from 2.2. and 2.3. +* 2.5. Depending on the role (primary, replica, or malfunction), different checks and actions are performed. For details, see steps 3, 4, and 5 below. +* 2.6. Reinitializing a connection to PG and ZK if it is lost. + +3. Actions and checks performed if the local role is "primary" +* 3.1. Trying to get a primary lock. If the attempt fails: +* 3.1.1. Local connection pooler stops. +* 3.1.2. If the lock holder is not determined, a connection is reinitialized. Return to (step 2) +* 3.1.3. If the holder is determined and this is not the current instance, the host actually turns into a replica. The role transfer procedure is described in 4.3. +* 3.2. Writing replica information to ZK if the ZK information about the timeline matches that received from PG. +* 3.3. If ZK timeline matches the local timeline but step 3.2. failed, stop connection pooler and return to step 2. +* 3.4. If ZK has no timeline information, the local one is written. +* 3.5. If the local and ZK timelines are different: +* 3.5.1. Make a checkpoint. +* 3.5.2. If the ZK timeline exceeds the local one, stop connection pooler and go to step 2. +* 3.5.3. If the local timeline exceeds the ZK timeline, information in ZK is overwritten. +* 3.6. Starting connection pooler. +* 3.7. If ```primary:change_replication_type``` is set: +* 3.7.1. Compare the current and desired replication type: sync or async. +* 3.7.2. Set the appropriate replication type. If sync, also set the name of the host holding a sync_replica lock. + +4. Actions and checks performed if the local role is "replica" +* 4.1. Checking if there is connectivity to ZK. If not, return to step 2. +* 4.2. Checking if there is a primary lock. If not, do a failover: (any exception is captured and pgconsul is aborted). +* 4.2.1. Making some checks. If any of them fails, return to step 2: +* 4.2.1.1. Checking if the current instance is a sync replica. +* 4.2.1.2. Checking if enough time has passed since the last failover attempt (set by the ```replica:min_failover_timeout``` option). +* 4.2.1.3. If the ZK timeline is determined, compare it with the current one. If not, skip this check. +* 4.2.1.4. Making sure a sufficient number of loops completed (see ```replica:failover_checks```). +* 4.2.1.5. 
Making sure the primary is actually dead by making SELECT 42 from the host specified in the recovery.conf file (make an attempt each time; if no response is received, increment the counter until it exceeds the ```replica:dead_primary_checks``` value).
+* 4.2.2. Getting a ZK primary lock. If the attempt fails, return to step 2.
+* 4.2.3. Trying to delete information about the status of the previous failover. If the attempt fails, release the primary lock and return to step 2.
+* 4.2.4. If replication_slots is used:
+* 4.2.4.1. Marking in ZK that the failover status is "creating_slots".
+* 4.2.4.2. Reading the list of cluster members in the shard and excluding the current instance from the list. If the attempt fails, release the lock and return to step 2.
+* 4.2.4.3. Creating replication_slots.
+* 4.2.5. Marking in ZK that the failover status is "promoting". Trying to run pg_ctl promote. If the attempt fails, release the lock and return to step 2.
+* 4.2.6. Waiting until PG is up and running (during ```global:iteration_timeout```).
+* 4.2.7. Marking in ZK that the failover status is "checkpointing" and making a checkpoint.
+* 4.2.8. Writing the current timeline in ZK, updating the failover status to "finished", and setting the current time to last_failover_time.
+* 4.3. Checking if the local information about the primary role location matches the address of the host holding the primary lock. Otherwise:
+* 4.3.1. Stop connection pooler.
+* 4.3.2. If the number of primary_switch attempts does not exceed the ```replica:primary_switch_checks``` value or the instance is under transition, go to step 2. (```self._return_to_cluster()```)
+* 4.3.3. If PG is already in the failover state, go to step 2.
+* 4.3.4. If PG is being restored from an archive, do a failover:
+* 4.3.4.1. Creating and filling out a recovery.conf file pointing to the new primary.
+* 4.3.4.2. Waiting until PG gets consistent.
+* 4.3.4.3. Waiting until the primary starts sending WALs.
+* 4.3.4.4. Return to step 2.
+* 4.3.5. If the rewind retries counter exceeds the ```global:max_rewind_retries``` value, set the stop flag (see 1.1.) and abort pgconsul with an error message.
+* 4.3.6. Making a rewind retry because step 4.3.4 failed. If the attempt does not succeed, go to step 2.
+* 4.3.6.1. Stopping PG if it does not change to the normal state (in terms of pg_control).
+* 4.3.6.2. Deleting the recovery.conf file and disabling archiving for a while.
+* 4.3.6.3. Running PG and resetting postgresql.auto.conf.
+* 4.3.6.4. Setting a rewind lock in ZK, running a rewind, and releasing the lock.
+* 4.3.6.5. Repeating the actions of step 4.3.4.
+* 4.3.7. If the replication slots are enabled, add them.
+* 4.4. Checking that ZK contains information about the current replica and that it is marked as "streaming". Otherwise:
+* 4.4.1. If the current replica's type is "sync", release the sync replica lock.
+* 4.4.2. Making a checkpoint.
+* 4.4.3. If the current timeline is behind the ZK timeline by 1:
+* 4.4.3.1. Wait for logs from the primary during ```replica:recovery_timeout```. Otherwise, make a failover retry (step 4.3).
+* 4.5. If ```replica:start_pooler``` is set to "yes", start connection pooler.
+* 4.6. If the current replica is marked as "streaming", try to get a sync replica lock.
+
+5. Actions and checks performed if the local role can't be identified
+* 5.1. Stopping connection pooler.
+* 5.2. Releasing the primary and sync replica locks in ZK if the current instance is holding them.
+* 5.3.
Based on the previously saved state (see 2.2 and 2.3), trying to determine the role, primary, timeline, PG version, and pgdata directory location. In the event of a failure or if this information is unavailable, assign the "replica" role and set the last primary to None. +* 5.4. Checking if there is an active primary lock: +* 5.4.1. Comparing the hostname of its holder with information from 5.3. If the previous local role was "replica" and the primary has not changed, try to run PG and return to step 2. +* 5.4.2. If the primary has changed or the previous local role was different from "replica", switch the local instance to "replica" mode and then follow step 4.3. +* 5.5. If there are no active locks (the cluster is inactive): +* 5.5.1. If the previous role was "primary" and the timeline information from ZK does not match the latest local timeline, return to step 2. + +### pgconsul-util + +The delivery kit includes pgconsul-util that enables you to switch to another primary or initialize a cluster if it is run from a backup or the ZK address changes. +For a detailed key description, see ```pgconsul-util --help``` and ```pgconsul-util --help```. + +#### Scheduled primary switch +PgConsul supports scheduled switching over to a different primary. This functionality assumes that the primary role switches over to the current synchronous replica. +To initiate this, use ```switchover``` mode, e.g.: +``` +pgconsul-util -c pgconsul.conf switchover +2017-01-19 15:50:32,583 DEBUG: lock holders: {u'sync_replica': u'pgtest01i.some.net', u'primary': u'pgtest01h.some.net', u'timeline': 38} +2017-01-19 15:50:32,583 INFO: switchover pgtest01h.some.net (timeline: 38) to pgtest01i.some.net +type "yes" to continue: yes +2017-01-19 15:50:35,157 INFO: initiating switchover with {u'timeline': 38, u'hostname': u'pgtest01h.some.net'} +2017-01-19 15:50:35,173 DEBUG: No lock instance for switchover/primary. Creating one. 
+2017-01-19 15:50:35,531 DEBUG: state: {u'info': {u'timeline': 38, u'hostname': u'pgtest01h.some.net'}, u'progress': u'scheduled', u'failover': u'finished', u'replicas': [{u'replay_location_diff': 128, u'write_location_diff': 0, u'sync_state': u'sync', u'sent_location_diff': 0, u'primary_location': u'5/760B6700', u'client_hostname': u'pgtest01i.some.net', u'state': u'streaming'}, {u'replay_location_diff': 128, u'write_location_diff': 0, u'sync_state': u'async', u'sent_location_diff': 0, u'primary_location': u'5/760B6700', u'client_hostname': u'pgtest01f.some.net', u'state': u'streaming'}]} +2017-01-19 15:50:35,673 DEBUG: current switchover status: scheduled, failover: finished +2017-01-19 15:50:36,832 DEBUG: current switchover status: initiated, failover: switchover_initiated +2017-01-19 15:50:38,258 DEBUG: current switchover status: initiated, failover: switchover_primary_shut +2017-01-19 15:50:39,401 DEBUG: current switchover status: promoting_replica, failover: promoting +2017-01-19 15:50:40,559 DEBUG: current switchover status: promoting_replica, failover: promoting +2017-01-19 15:50:41,689 DEBUG: current switchover status: promoting_replica, failover: promoting +2017-01-19 15:50:42,897 DEBUG: current switchover status: promoting_replica, failover: promoting +2017-01-19 15:50:45,079 INFO: primary is now pgtest01i.some.net +2017-01-19 15:50:45,142 DEBUG: full state: {u'info': {u'timeline': 38, u'hostname': u'pgtest01h.some.net'}, u'progress': u'finished', u'failover': u'finished', u'replicas': [{u'replay_location_diff': 128, u'write_location_diff': 0, u'sync_state': u'sync', u'sent_location_diff': 0, u'primary_location': u'5/760B6780', u'client_hostname': u'pgtest01i.some.net', u'state': u'streaming'}, {u'replay_location_diff': 128, u'write_location_diff': 0, u'sync_state': u'async', u'sent_location_diff': 0, u'primary_location': u'5/760B6780', u'client_hostname': u'pgtest01f.some.net', u'state': u'streaming'}]} +2017-01-19 15:50:45,142 DEBUG: waiting for replicas to appear... +2017-01-19 15:50:46,206 DEBUG: replicas up: pgtest01h.some.net@5/77002098 +2017-01-19 15:50:47,270 DEBUG: replicas up: pgtest01h.some.net@5/77002198 +2017-01-19 15:50:48,335 DEBUG: replicas up: pgtest01h.some.net@5/77002198 +2017-01-19 15:50:49,416 DEBUG: replicas up: pgtest01h.some.net@5/770024F8 +2017-01-19 15:50:50,497 DEBUG: replicas up: pgtest01h.some.net@5/770024F8 +2017-01-19 15:50:51,561 DEBUG: replicas up: pgtest01f.some.net@5/77002580, pgtest01h.some.net@5/77002580 +2017-01-19 15:50:51,561 INFO: switchover finished, status "finished" +``` +By default, the 60s timeout is set for switching over, starting the primary, and having replicas appear in ```streaming``` status (for each stage). You can override the parameter value with the```--timeout``` option. The expected amount of replicas is set using the ```--replicas``` option and defaults to 2. +If the switchover fails for some reason and/or it is required to reset the switchover status (for example, there is a typo when explicitly setting the primary or timeline), use the ```--reset``` option. However, as this functionality involves intervening in the distributed algorithm, you should only do this if there is a guarantee that no switchover will occur. Otherwise, there is a risk of failover for the cluster. +In addition, you can explicitly set the primary and timeline to switch over. Please keep in mind that, if they differ from the actual ones, the pgconsul logic will ignore them. 
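+
+For reference, a run with non-default settings might look like this (an illustrative sketch based only on the options described above; the stage timeout and replica count are arbitrary example values, and the config path is the default mentioned below):
+```
+# Wait up to 120 seconds per stage and expect a single replica to reach "streaming" status.
+pgconsul-util -c /etc/pgconsul.conf switchover --timeout 120 --replicas 1
+
+# Reset a stuck switchover status; only safe when it is guaranteed that no switchover will actually occur.
+pgconsul-util -c /etc/pgconsul.conf switchover --reset
+```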
+ +#### Migration to a different prefix or address in ZK +PgConsul has protection set up against running a working cluster in "empty" ZK. This is done to avoid the consequences of the configuration error (see 1.4.) +If the current instance's timeline exceeds 1 at startup (meaning that primary promote was performed at least once), while ```/all_hosts@ZK``` contains no child node, pgconsul crashes. + +At the same time, startup like this may be required, for example, under a managed change of the ZK address or prefix. + +To do this, you can use the utility's ```initzk``` mode, e.g.: +``` +pgconsul-init --config pgconsul.conf --zk new.zk.addr:port --prefix /new_prefix pg01a.fq.dn pg01b.fq.dn pg01c.fq.dn +``` + +Unless otherwise specified, the ZK prefixes and addresses are used from the configuration (by default, ```/etc/pgconsul.conf```). +The only required parameter is a list of space-separated hostnames. + +### Configuration +Pay special attention to the following: + +1. You can set up the ```change_replication_type``` and ```change_replication_metric``` parameters so that pgconsul does not change the replication type at all. Or, in the event of issues, it only degrades to asynchronous replication at daytime, while always performs synchronous replication at nighttime and weekends when the load is lower. + +2. The ```allow_potential_data_loss``` parameter assumes switching the primary even if none of the replicas is synchronous (i.e., with data loss). In this case, the replica with the older xlog position becomes a new primary. + +#### Sample configuration with a description + +```ini +[global] +# Username the daemon will run under. +daemon_user = postgres + +# Log file path. If the path is relative, the parent directory will be working_dir (below) +log_file = /var/log/pgconsul/pgconsul.log + +# Startup without going to background mode +foreground = no + +# Log details. Possible values: debug, info, warning, error, and critical. +log_level = debug + +# Path to the pid file. +pid_file = /var/run/pgconsul/pgconsul.pid + +# Daemon working directory (cwd) +working_dir = /tmp + +# Local PG instance connection string. +local_conn_string = dbname=postgres user=postgres connect_timeout=1 + +# Additional parameters in case of connecting to the primary. +# Used to invoke pg_rewind. +append_rewind_conn_string = port=5432 dbname=postgres user=xxx password=xxx connect_timeout=10 sslmode=verify-full + +# Connection string used to verify if PG is available. +append_primary_conn_string = port=6432 dbname=postgres user=xxx password=xxx connect_timeout=1 sslmode=verify-full + +# Timeout in seconds between main loop iterations (see above). +iteration_timeout = 1 + +# Zookeeper connection string +zk_hosts = zk02d.some.net:2181,zk02e.some.net:2181,zk02g.some.net:2181 + +# Path to the directory with executable files from the PG delivery kit (pg_rewind, pg_controldata, pg_ctl) +bin_path = /usr/lib/postgresql/9.6/bin + +# Whether to use replication_slots if the roles change +use_replication_slots = yes + +# Command to generate the recovery.conf file. The following arguments are passed to the command: +# # %m is the primary hostname +# # %p is the full path to the recovery.conf file +generate_recovery_conf = /usr/local/yandex/populate_recovery_conf.py -s -r -p %p %m + +# Maximum number pg_rewind retries. 
Once this number is reached, pgconsul sets the stop flag and aborts (see 4.3.5 above).
+max_rewind_retries = 3
+
+# Whether connection pooler is used as a standalone instance
+standalone_pooler = yes
+
+# Address at which the connection pooler check is running if standalone_pooler = yes
+pooler_addr = localhost
+
+# Port at which the connection pooler check is running if standalone_pooler = yes
+pooler_port = 6432
+
+# Timeout of the connection pooler check at address:port in seconds
+pooler_conn_timeout = 1
+
+[primary]
+# Whether to change the replication type to synchronous (or asynchronous)
+# Only done if there is a lock in ZK.
+change_replication_type = yes
+
+# Criterion for changing the replication type:
+# 'count' means that replication becomes asynchronous if all replicas are down
+# and synchronous if at least one replica is available.
+# 'load' means that replication becomes asynchronous if the number of sessions exceeds overload_sessions_ratio.
+# If this parameter returns to the normal value, replication becomes synchronous again.
+# 'time' indicates that the replication type will only change at the specified time. Requires that count or load is also present (see above)
+change_replication_metric = count,load,time
+
+# Session number threshold (including inactive ones), after reaching which the replication type should be changed (if the respective argument is set above)
+overload_sessions_ratio = 75
+
+# Schedule for disabling synchronous replication: if the current time falls within the set interval, pgconsul may disable synchronous replication.
+# In the example below, the weekday change hours are specified and weekend ones are set to "never".
+weekday_change_hours = 10-22
+weekend_change_hours = 0-0
+
+# Number of checks after which the old primary becomes a replica of the new primary.
+primary_switch_checks = 3
+
+[replica]
+# Number of checks after which a synchronous replica becomes the primary.
+failover_checks = 3
+
+# Whether to start connection pooler on the replica if no anomalies are detected.
+start_pooler = yes
+
+# Number of checks after which the replica will change the primary (replication source).
+primary_switch_checks = 5
+
+# Interval (sec) during which new failover attempts are not allowed. The counter is started after the last failover.
+min_failover_timeout = 3600
+
+# Allow a failover if the cluster has no synchronous replicas.
+allow_potential_data_loss = no
+
+# Cluster instance recovery timeout. Once the set threshold is reached, pg_rewind is started.
+recovery_timeout = 60
+
+# Number of primary availability check retries via the PG protocol before a failover is run.
+# Relevant if there is no connectivity between ZK and the current primary.
+dead_primary_checks = 86400
+```
+
+### External components
+
+* [Kazoo](https://github.com/python-zk/kazoo) is used to interact with Zookeeper
+* [Psycopg2](https://github.com/psycopg/psycopg2) is used to interact with Postgresql
diff --git a/bin/pgconsul-util b/bin/pgconsul-util
new file mode 100755
index 0000000..3f03565
--- /dev/null
+++ b/bin/pgconsul-util
@@ -0,0 +1,9 @@
+#!/usr/bin/env python
+# coding: utf-8
+"""
+Entry point for the controlling utility.
+""" +from pgconsul import cli + +if __name__ == '__main__': + cli.entry() diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..988f98a --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,146 @@ +version: '2.2' + +services: + zookeeper1: + build: + context: ./docker/zookeeper + args: + - VERSION=${ZK_VERSION} + privileged: true + hostname: pgconsul_zookeeper1_1 + domainname: pgconsul_pgconsul_net + init: true + networks: + pgconsul_net: + ipv4_address: 192.168.233.10 + zookeeper2: + build: + context: ./docker/zookeeper + args: + - VERSION=${ZK_VERSION} + privileged: true + hostname: pgconsul_zookeeper2_1 + domainname: pgconsul_pgconsul_net + init: true + networks: + pgconsul_net: + ipv4_address: 192.168.233.11 + zookeeper3: + build: + context: ./docker/zookeeper + args: + - VERSION=${ZK_VERSION} + privileged: true + hostname: pgconsul_zookeeper3_1 + domainname: pgconsul_pgconsul_net + init: true + networks: + pgconsul_net: + ipv4_address: 192.168.233.12 + backup1: + build: ./docker/backup + hostname: pgconsul_backup_1 + domainname: pgconsul_pgconsul_net + init: true + networks: + pgconsul_net: + ipv4_address: 192.168.233.13 + postgresql1: + build: ./docker/pgconsul + privileged: true + hostname: pgconsul_postgresql1_1 + domainname: pgconsul_pgconsul_net + init: true + extra_hosts: + - "pgconsul_postgresql2_1.pgconsul_pgconsul_net:192.168.233.15" + - "pgconsul_postgresql3_1.pgconsul_pgconsul_net:192.168.233.16" + - "pgconsul_postgresql4_1.pgconsul_pgconsul_net:192.168.233.17" + - "pgconsul_postgresql5_1.pgconsul_pgconsul_net:192.168.233.18" + - "pgconsul_zookeeper1_1.pgconsul_pgconsul_net:192.168.233.10" + - "pgconsul_zookeeper2_1.pgconsul_pgconsul_net:192.168.233.11" + - "pgconsul_zookeeper3_1.pgconsul_pgconsul_net:192.168.233.12" + - "pgconsul_backup1_1.pgconsul_pgconsul_net:192.168.233.13" + networks: + pgconsul_net: + ipv4_address: 192.168.233.14 + postgresql2: + build: ./docker/pgconsul + privileged: true + hostname: pgconsul_postgresql2_1 + domainname: pgconsul_pgconsul_net + init: true + extra_hosts: + - "pgconsul_postgresql1_1.pgconsul_pgconsul_net:192.168.233.14" + - "pgconsul_postgresql3_1.pgconsul_pgconsul_net:192.168.233.16" + - "pgconsul_postgresql4_1.pgconsul_pgconsul_net:192.168.233.17" + - "pgconsul_postgresql5_1.pgconsul_pgconsul_net:192.168.233.18" + - "pgconsul_zookeeper1_1.pgconsul_pgconsul_net:192.168.233.10" + - "pgconsul_zookeeper2_1.pgconsul_pgconsul_net:192.168.233.11" + - "pgconsul_zookeeper3_1.pgconsul_pgconsul_net:192.168.233.12" + - "pgconsul_backup1_1.pgconsul_pgconsul_net:192.168.233.13" + networks: + pgconsul_net: + ipv4_address: 192.168.233.15 + postgresql3: + build: ./docker/pgconsul + privileged: true + hostname: pgconsul_postgresql3_1 + domainname: pgconsul_pgconsul_net + init: true + extra_hosts: + - "pgconsul_postgresql1_1.pgconsul_pgconsul_net:192.168.233.14" + - "pgconsul_postgresql2_1.pgconsul_pgconsul_net:192.168.233.15" + - "pgconsul_postgresql4_1.pgconsul_pgconsul_net:192.168.233.17" + - "pgconsul_postgresql5_1.pgconsul_pgconsul_net:192.168.233.18" + - "pgconsul_zookeeper1_1.pgconsul_pgconsul_net:192.168.233.10" + - "pgconsul_zookeeper2_1.pgconsul_pgconsul_net:192.168.233.11" + - "pgconsul_zookeeper3_1.pgconsul_pgconsul_net:192.168.233.12" + - "pgconsul_backup1_1.pgconsul_pgconsul_net:192.168.233.13" + networks: + pgconsul_net: + ipv4_address: 192.168.233.16 + postgresql4: + build: ./docker/pgconsul + privileged: true + hostname: pgconsul_postgresql4_1 + domainname: pgconsul_pgconsul_net + init: true + 
extra_hosts: + - "pgconsul_postgresql1_1.pgconsul_pgconsul_net:192.168.233.14" + - "pgconsul_postgresql2_1.pgconsul_pgconsul_net:192.168.233.15" + - "pgconsul_postgresql3_1.pgconsul_pgconsul_net:192.168.233.16" + - "pgconsul_postgresql5_1.pgconsul_pgconsul_net:192.168.233.18" + - "pgconsul_zookeeper1_1.pgconsul_pgconsul_net:192.168.233.10" + - "pgconsul_zookeeper2_1.pgconsul_pgconsul_net:192.168.233.11" + - "pgconsul_zookeeper3_1.pgconsul_pgconsul_net:192.168.233.12" + - "pgconsul_backup1_1.pgconsul_pgconsul_net:192.168.233.13" + networks: + pgconsul_net: + ipv4_address: 192.168.233.17 + postgresql5: + build: ./docker/pgconsul + privileged: true + hostname: pgconsul_postgresql5_1 + domainname: pgconsul_pgconsul_net + init: true + extra_hosts: + - "pgconsul_postgresql1_1.pgconsul_pgconsul_net:192.168.233.14" + - "pgconsul_postgresql2_1.pgconsul_pgconsul_net:192.168.233.15" + - "pgconsul_postgresql3_1.pgconsul_pgconsul_net:192.168.233.16" + - "pgconsul_postgresql4_1.pgconsul_pgconsul_net:192.168.233.17" + - "pgconsul_zookeeper1_1.pgconsul_pgconsul_net:192.168.233.10" + - "pgconsul_zookeeper2_1.pgconsul_pgconsul_net:192.168.233.11" + - "pgconsul_zookeeper3_1.pgconsul_pgconsul_net:192.168.233.12" + - "pgconsul_backup1_1.pgconsul_pgconsul_net:192.168.233.13" + networks: + pgconsul_net: + ipv4_address: 192.168.233.18 + +networks: + pgconsul_net: + driver: bridge + ipam: + driver: default + config: + - subnet: 192.168.233.0/24 + gateway: 192.168.233.1 diff --git a/docker/backup/Dockerfile b/docker/backup/Dockerfile new file mode 100644 index 0000000..e607e6c --- /dev/null +++ b/docker/backup/Dockerfile @@ -0,0 +1,6 @@ +FROM pgconsulbase:latest +RUN mkdir /archive && chown backup.backup /archive && \ + cp /var/lib/dist/docker/backup/rsync.secrets /etc && \ + chmod 0600 /etc/rsync.secrets && \ + cp /var/lib/dist/docker/backup/rsyncd.conf /etc && \ + cp /var/lib/dist/docker/backup/rsync.conf /etc/supervisor/conf.d diff --git a/docker/backup/rsync.conf b/docker/backup/rsync.conf new file mode 100644 index 0000000..da08bd9 --- /dev/null +++ b/docker/backup/rsync.conf @@ -0,0 +1,4 @@ +[program:rsync] +command = /usr/bin/rsync --no-detach --daemon --config="/etc/rsyncd.conf" +redirect_stderr = true +autorestart=true diff --git a/docker/backup/rsync.secrets b/docker/backup/rsync.secrets new file mode 100755 index 0000000..e1faf47 --- /dev/null +++ b/docker/backup/rsync.secrets @@ -0,0 +1 @@ +archive:123456 diff --git a/docker/backup/rsyncd.conf b/docker/backup/rsyncd.conf new file mode 100644 index 0000000..2b964d3 --- /dev/null +++ b/docker/backup/rsyncd.conf @@ -0,0 +1,13 @@ +uid = root +gid = root +use chroot = no +max connections = 100 +pid file = /var/run/rsyncd.pid +log file = /var/log/rsync.log +[archive] + path = /archive + auth users = archive + read only = false + write only = false + transfer logging = true + secrets file = /etc/rsync.secrets diff --git a/docker/base/Dockerfile b/docker/base/Dockerfile new file mode 100644 index 0000000..da3adca --- /dev/null +++ b/docker/base/Dockerfile @@ -0,0 +1,53 @@ +FROM ubuntu:bionic +ENV container docker +ENV DEBIAN_FRONTEND noninteractive +ADD https://www.postgresql.org/media/keys/ACCC4CF8.asc keyring.asc +RUN echo 'APT::Install-Recommends "0"; \n\ +APT::Get::Assume-Yes "true"; \n\ +APT::Get::allow-downgrades "true"; \n\ +APT::Install-Suggests "0";' > /etc/apt/apt.conf.d/01buildconfig && \ + apt-get update && \ + apt-get install -qq --no-install-recommends gpg gpg-agent && \ + apt-key add keyring.asc + +RUN echo "deb 
http://apt.postgresql.org/pub/repos/apt bionic-pgdg main" > /etc/apt/sources.list.d/pgdg.list && \ + apt-get update && apt-get install wget gnupg ca-certificates locales && \ + locale-gen en_US.UTF-8 && \ + apt-get update && \ + apt-get install \ + openjdk-11-jre-headless \ + less \ + bind9-host \ + net-tools \ + iputils-ping \ + sudo \ + telnet \ + git \ + pgbouncer \ + python3-dev \ + python3-pip \ + python3-venv \ + python3-wheel \ + python3-setuptools \ + openssh-server \ + libpq-dev \ + gcc \ + faketime \ + rsync \ + openssl \ + iptables \ + coreutils && \ + pip3 install git+https://github.com/Supervisor/supervisor.git@4619168a4d820b37641a4719e211cf867bd7f49d && \ + pip3 install wheel && \ + rm -rf /var/run && \ + ln -s /dev/shm /var/run +COPY ./ /var/lib/dist +COPY tests/generate_certs.sh /usr/local/bin/generate_certs.sh +RUN chmod 755 /usr/local/bin/generate_certs.sh +RUN mkdir /root/.ssh && \ + chmod 700 /root/.ssh && \ + cp /var/lib/dist/test_ssh_key.pub /root/.ssh/authorized_keys && \ + mkdir -p /etc/supervisor/conf.d && \ + cp /var/lib/dist/tests/conf/supervisord.conf /etc/supervisor/supervisord.conf && \ + cp /var/lib/dist/docker/base/ssh.conf /etc/supervisor/conf.d/ssh.conf +CMD ["/usr/local/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"] diff --git a/docker/base/ssh.conf b/docker/base/ssh.conf new file mode 100644 index 0000000..d082ef3 --- /dev/null +++ b/docker/base/ssh.conf @@ -0,0 +1,23 @@ +[program:prestart_sshd] +user=root +command=bash -c "mkdir -p /run/sshd" +autostart=true +autorestart=unexpected +exitcodes=0 +startsecs=0 +priority=1 +stdout_logfile=/proc/self/fd/1 +stdout_logfile_maxbytes=0 +stderr_logfile=/proc/self/fd/2 +stderr_logfile_maxbytes=0 + +[program:sshd] +user=root +command=/usr/sbin/sshd -D +autostart=true +autorestart=true +stdout_logfile=/proc/self/fd/1 +stdout_logfile_maxbytes=0 +stderr_logfile=/proc/self/fd/2 +stderr_logfile_maxbytes=0 +priority=10 diff --git a/docker/dpkg/Dockerfile b/docker/dpkg/Dockerfile new file mode 100644 index 0000000..7096307 --- /dev/null +++ b/docker/dpkg/Dockerfile @@ -0,0 +1,20 @@ +FROM ubuntu:18.04 + +ENV DEBIAN_FRONTEND=noninteractive +ENV TZ=Europe/Moskow +RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone + +RUN sed -i "s/archive.ubuntu.com/mirror.yandex.ru/g" /etc/apt/sources.list \ + && apt-get update \ + && apt-get install -y --no-install-recommends \ + lsb-release libssl-dev gnupg openssl libssl-dev\ + debhelper debootstrap devscripts make equivs \ + python3=3.6.7-1~18.04 + +RUN alias python3.6=python3 + +WORKDIR /root/pgconsul +COPY . 
/root/pgconsul + +RUN mk-build-deps --build-dep --install --tool='apt-get -o Debug::pkgProblemResolver=yes --no-install-recommends --yes' debian/control +RUN dpkg-buildpackage -us -uc diff --git a/docker/jepsen/Dockerfile b/docker/jepsen/Dockerfile new file mode 100644 index 0000000..49f4c86 --- /dev/null +++ b/docker/jepsen/Dockerfile @@ -0,0 +1,13 @@ +FROM pgconsulbase:latest +ENV LEIN_ROOT 1 +RUN apt-get -qq update && apt-get install libjna-java \ + gnuplot \ + wget && \ + cp /var/lib/dist/test_ssh_key /root/.ssh/id_rsa && \ + chmod 600 /root/.ssh/id_rsa && \ + wget https://raw.githubusercontent.com/technomancy/leiningen/stable/bin/lein -O /usr/bin/lein && \ + chmod +x /usr/bin/lein && \ + cp -r /var/lib/dist/docker/jepsen/jepsen /root/ && \ + cd /root/jepsen && \ + lein install && \ + lein deps diff --git a/docker/jepsen/jepsen/project.clj b/docker/jepsen/jepsen/project.clj new file mode 100644 index 0000000..1e6b290 --- /dev/null +++ b/docker/jepsen/jepsen/project.clj @@ -0,0 +1,11 @@ +(defproject jepsen.pgconsul "0.1.0-SNAPSHOT" + :description "PgConsul tests" + :url "https://yandex.com" + :license {:name "Eclipse Public License" + :url "http://www.eclipse.org/legal/epl-v10.html"} + :dependencies [[org.clojure/clojure "1.10.3"] + [org.clojure/tools.nrepl "0.2.13"] + [clojure-complete "0.2.5"] + [jepsen "0.2.6"] + [org.clojure/java.jdbc "0.7.12"] + [org.postgresql/postgresql "42.3.2"]]) diff --git a/docker/jepsen/jepsen/run.sh b/docker/jepsen/jepsen/run.sh new file mode 100755 index 0000000..388b16e --- /dev/null +++ b/docker/jepsen/jepsen/run.sh @@ -0,0 +1,12 @@ +#!/bin/sh + +set -e +set -x + +cd "$(dirname "$0")" +export LEIN_ROOT=1 +for i in zookeeper1 zookeeper2 zookeeper3 postgresql1 postgresql2 postgresql3 +do + ssh-keyscan -t rsa pgconsul_${i}_1.pgconsul_pgconsul_net >> /root/.ssh/known_hosts +done +lein test diff --git a/docker/jepsen/jepsen/src/jepsen/pgconsul.clj b/docker/jepsen/jepsen/src/jepsen/pgconsul.clj new file mode 100644 index 0000000..4bd79bd --- /dev/null +++ b/docker/jepsen/jepsen/src/jepsen/pgconsul.clj @@ -0,0 +1,264 @@ +(ns jepsen.pgconsul + "Tests for PgConsul" + (:require [clojure.tools.logging :refer :all] + [clojure.core.reducers :as r] + [clojure.set :as set] + [clojure.string :as string] + [jepsen [tests :as tests] + [os :as os] + [db :as db] + [client :as client] + [control :as control] + [nemesis :as nemesis] + [generator :as gen] + [checker :as checker] + [util :as util :refer [timeout]] + [net :as net]] + [knossos [op :as op]] + [clojure.java.jdbc :as j])) + +(def register (atom 0)) + +(defn open-conn + "Given a JDBC connection spec, opens a new connection unless one already + exists. JDBC represents open connections as a map with a :connection key. + Won't open if a connection is already open." + [spec] + (if (:connection spec) + spec + (j/add-connection spec (j/get-connection spec)))) + +(defn close-conn + "Given a spec with JDBC connection, closes connection and returns the spec w/o connection." + [spec] + (when-let [conn (:connection spec)] + (.close conn)) + {:classname (:classname spec) + :subprotocol (:subprotocol spec) + :subname (:subname spec) + :user (:user spec) + :password (:password spec)}) + +(defmacro with-conn + "This macro takes that atom and binds a connection for the duration of + its body, automatically reconnecting on any + exception." + [[conn-sym conn-atom] & body] + `(let [~conn-sym (locking ~conn-atom + (swap! ~conn-atom open-conn))] + (try + ~@body + (catch Throwable t# + (locking ~conn-atom + (swap! 
~conn-atom (comp open-conn close-conn))) + (throw t#))))) + +(defn conn-spec + "Return postgresql connection spec for given node name" + [node] + {:classname "org.postgresql.Driver" + :subprotocol "postgresql" + :subname (str "//" (name node) ":6432/postgres?prepareThreshold=0") + :user "repl" + :password "repl"}) + +(defn noop-client + "Noop client" + [] + (reify client/Client + (setup! [_ test] + (info "noop-client setup")) + (invoke! [this test op] + (assoc op :type :info, :error "noop")) + (close! [_ test]) + (teardown! [_ test] (info "teardown")) + client/Reusable + (reusable? [_ test] true))) + +(defn pg-client + "PostgreSQL client" + [conn] + (reify client/Client + (setup! [_ test] + (info "pg-client setup")) + (open! [_ test node] + (let [conn (atom (conn-spec node))] + (cond (string/includes? (name node) "postgresql") + (pg-client conn) + true + (noop-client)))) + (invoke! [this test op] + (try + (timeout 5000 (assoc op :type :info, :error "timeout") + (with-conn [c conn] + (case (:f op) + :read (assoc op :type :ok, + :value (->> (j/query c ["select value from set for update"] + {:row-fn :value}) + (vec) + (set))) + :add (do (j/execute! c [(str "insert into set values (" + (get op :value) ")")]) + (assoc op :type :ok))))) + (catch Throwable t# + (let [m# (.getMessage t#)] + (cond (re-find #"ERROR: cannot execute .* in a read-only transaction" m#) + (assoc op :type :info, :error "read-only") + true + (assoc op :type :info, :error m#)))))) + (close! [_ test] (close-conn conn)) + (teardown! [_ test]) + client/Reusable + (reusable? [_ test] true))) + +(defn db + "PostgreSQL database" + [] + (reify db/DB + (setup! [_ test node] + (info (str (name node) " setup"))) + + (teardown! [_ test node] + (info (str (name node) " teardown"))))) + +(defn r [_ _] {:type :invoke, :f :read, :value nil}) +(defn a [_ _] {:type :invoke, :f :add, :value (swap! register (fn [current-state] (+ current-state 1)))}) + +(def pgconsul-set + "Given a set of :add operations followed by a final :read, verifies that + every successfully added element is present in the read, and that the read + contains only elements for which an add was attempted." + (reify checker/Checker + (check [this test history opts] + (let [attempts (->> history + (r/filter op/invoke?) + (r/filter #(= :add (:f %))) + (r/map :value) + (into #{})) + adds (->> history + (r/filter op/ok?) + (r/filter #(= :add (:f %))) + (r/map :value) + (into #{})) + final-read (->> history + (r/filter op/ok?) + (r/filter #(= :read (:f %))) + (r/map :value) + (reduce (fn [_ x] x) nil))] + (if-not final-read + {:valid? false + :error "Set was never read"} + + (let [; The OK set is every read value which we tried to add + ok (set/intersection final-read attempts) + + ; Unexpected records are those we *never* attempted. + unexpected (set/difference final-read attempts) + + ; Lost records are those we definitely added but weren't read + lost (set/difference adds final-read) + + ; Recovered records are those where we didn't know if the add + ; succeeded or not, but we found them in the final set. + recovered (set/difference ok adds)] + + {:valid? (and (empty? lost) (empty? 
unexpected)) + :ok (util/integer-interval-set-str ok) + :lost (util/integer-interval-set-str lost) + :unexpected (util/integer-interval-set-str unexpected) + :recovered (util/integer-interval-set-str recovered) + :ok-frac (util/fraction (count ok) (count attempts)) + :unexpected-frac (util/fraction (count unexpected) (count attempts)) + :lost-frac (util/fraction (count lost) (count attempts)) + :recovered-frac (util/fraction (count recovered) (count attempts))})))))) + +(defn killer + "Executes pkill -9 `procname`" + [] + (reify nemesis/Nemesis + (setup! [this test] + this) + (invoke! [this test op] + (case (:f op) + :kill (assoc op :value + (try + (let [procname (rand-nth [:postgres + :pgconsul]) + node (rand-nth (filter (fn [x] (string/includes? (name x) "postgresql")) + (:nodes test)))] + (control/on node + (control/exec :pkill :-9 procname)) + (assoc op :value [:killed procname :on node])) + (catch Throwable t# + (let [m# (.getMessage t#)] + (do (warn (str "Unable to run pkill: " + m#)) + m#))))))) + (teardown! [this test] + (info (str "Stopping killer"))) + nemesis/Reflection + (fs [this] #{}))) + +(defn switcher + "Executes switchover" + [] + (reify nemesis/Nemesis + (setup! [this test] + this) + (invoke! [this test op] + (case (:f op) + :switch (assoc op :value + (try + (let [node (rand-nth (filter (fn [x] (string/includes? (name x) "postgresql")) + (:nodes test)))] + (control/on node + (control/exec :timeout :10 :pgconsul-util :switchover :-y)) + (assoc op :value [:switchover :on node])) + (catch Throwable t# + (let [m# (.getMessage t#)] + (do (warn (str "Unable to run switch: " + m#)) + m#))))))) + (teardown! [this test] + (info (str "Stopping switcher"))) + nemesis/Reflection + (fs [this] #{}))) + +(def nemesis-starts [:start-halves :start-ring :start-one :switch :kill]) + +(defn pgconsul-test + [pg-nodes zk-nodes] + {:nodes (concat pg-nodes zk-nodes) + :name "pgconsul" + :os os/noop + :db (db) + :ssh {:private-key-path "/root/.ssh/id_rsa"} + :net net/iptables + :client (pg-client nil) + :nemesis (nemesis/compose {{:start-halves :start} (nemesis/partition-random-halves) + {:start-ring :start} (nemesis/partition-majorities-ring) + {:start-one :start + ; All partitioners heal all nodes on stop so we define stop once + :stop :stop} (nemesis/partition-random-node) + #{:switch} (switcher) + #{:kill} (killer)}) + :generator (gen/phases + (->> a + (gen/stagger 1/50) + (gen/nemesis + (fn [] (map gen/once + [{:type :info, :f (rand-nth nemesis-starts)} + {:type :info, :f (rand-nth nemesis-starts)} + {:type :sleep, :value 60} + {:type :info, :f :stop} + {:type :sleep, :value 60}]))) + (gen/time-limit 7200)) + (->> r + (gen/stagger 1) + (gen/nemesis + (fn [] (map gen/once + [{:type :info, :f :stop} + {:type :sleep, :value 60}]))) + (gen/time-limit 600))) + :checker pgconsul-set + :remote control/ssh}) diff --git a/docker/jepsen/jepsen/test/jepsen/pgconsul_test.clj b/docker/jepsen/jepsen/test/jepsen/pgconsul_test.clj new file mode 100644 index 0000000..7a2002f --- /dev/null +++ b/docker/jepsen/jepsen/test/jepsen/pgconsul_test.clj @@ -0,0 +1,15 @@ +(ns jepsen.pgconsul-test + (:require [clojure.test :refer :all] + [jepsen.core :as jepsen] + [jepsen.pgconsul :as pgconsul])) + +(def pg_nodes ["pgconsul_postgresql1_1.pgconsul_pgconsul_net" + "pgconsul_postgresql2_1.pgconsul_pgconsul_net" + "pgconsul_postgresql3_1.pgconsul_pgconsul_net"]) + +(def zk_nodes ["pgconsul_zookeeper1_1.pgconsul_pgconsul_net" + "pgconsul_zookeeper2_1.pgconsul_pgconsul_net" + "pgconsul_zookeeper3_1.pgconsul_pgconsul_net"]) 
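For readers who don't parse Clojure quickly, the `pgconsul-set` checker above reduces to plain set arithmetic over the operation history. The sketch below restates those invariants in Python; the function name, the three input sets, and the sample values are illustrative stand-ins, not part of the repository.

```python
# A minimal sketch of the set-checker arithmetic performed by pgconsul-set.
def check_set(attempted, acknowledged, final_read):
    """attempted: values we tried to add; acknowledged: adds that returned :ok;
    final_read: values returned by the final read."""
    ok = final_read & attempted            # read values we actually tried to add
    unexpected = final_read - attempted    # read values we never attempted
    lost = acknowledged - final_read       # acknowledged adds missing from the read
    recovered = ok - acknowledged          # indeterminate adds that survived anyway
    return {
        'valid': not lost and not unexpected,
        'ok': sorted(ok),
        'lost': sorted(lost),
        'unexpected': sorted(unexpected),
        'recovered': sorted(recovered),
    }

if __name__ == '__main__':
    # Add 1..5, acks for 1..4, final read sees {1, 2, 3, 5}: 4 is lost, 5 is recovered.
    print(check_set({1, 2, 3, 4, 5}, {1, 2, 3, 4}, {1, 2, 3, 5}))
```

As in the Clojure version, the history is valid only when nothing acknowledged was lost and nothing unattempted appeared in the final read.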
+ +(deftest pgconsul-test + (is (:valid? (:results (jepsen/run! (pgconsul/pgconsul-test pg_nodes zk_nodes)))))) diff --git a/docker/jepsen/save_logs.sh b/docker/jepsen/save_logs.sh new file mode 100755 index 0000000..f037f71 --- /dev/null +++ b/docker/jepsen/save_logs.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +for i in 1 2 3 +do + mkdir -p logs/postgresql${i} + mkdir -p logs/zookeeper${i} + for service in pgbouncer pgconsul + do + docker exec pgconsul_postgresql${i}_1 cat \ + /var/log/${service}.log > \ + logs/postgresql${i}/${service}.log + done + docker exec pgconsul_postgresql${i}_1 cat \ + /var/log/postgresql/postgresql-$1-main.log > \ + logs/postgresql${i}/postgresql.log + docker exec pgconsul_zookeeper${i}_1 cat \ + /var/log/zookeeper/zookeeper--server-pgconsul_zookeeper${i}_1.log > \ + logs/zookeeper${i}/zk.log 2>&1 +done diff --git a/docker/pgconsul/Dockerfile b/docker/pgconsul/Dockerfile new file mode 100644 index 0000000..64b4d67 --- /dev/null +++ b/docker/pgconsul/Dockerfile @@ -0,0 +1,52 @@ +FROM pgconsulbase:latest +ARG replication_type +ARG pg_major +ARG pg_version +ARG pg_common_version +ARG pgbouncer_version +ARG libpq_version + +ENV PG_MAJOR $pg_major + +RUN apt-get update && \ + apt-get install \ + libpq5 \ + postgresql-$PG_MAJOR \ + postgresql-client-$PG_MAJOR \ + postgresql-common \ + postgresql-client-common \ + postgresql-server-dev-$PG_MAJOR \ + build-essential \ + python-daemon \ + python-psycopg2 \ + python-setuptools \ + python-kazoo \ + pgbouncer + +RUN git clone https://github.com/g0djan/lwaldump.git lwaldump && \ + cd lwaldump && git checkout REL_13_STABLE && make -s && make -s install && cd .. + +RUN pg_dropcluster --stop $PG_MAJOR main && \ + ln -s /usr/lib/postgresql/$PG_MAJOR/bin /usr/bin/postgresql && \ + mkdir -p /etc/pgconsul/plugins && \ + cd /var/lib/dist && \ + DESTDIR=/ make install && \ + cp /var/lib/dist/docker/pgconsul/pgconsul_${replication_type}.conf /etc/pgconsul.conf && \ + cp /var/lib/dist/docker/pgconsul/gen_rec_conf.sh /usr/local/bin/gen_rec_conf.sh && \ + echo "*:*:*:repl:repl" > /var/lib/postgresql/.pgpass && \ + chmod 600 /var/lib/postgresql/.pgpass && \ + chown postgres:postgres /var/lib/postgresql/.pgpass && \ + echo "*:*:*:repl:repl" > /root/.pgpass && \ + chmod 600 /root/.pgpass && \ + mkdir -p /etc/pgbouncer && \ + cp /var/lib/dist/docker/pgconsul/postgresql.conf /root/postgresql.conf && \ + cp /var/lib/dist/docker/pgconsul/pg_hba.conf /root/pg_hba.conf && \ + cp /var/lib/dist/docker/pgconsul/supervisor.conf /etc/supervisor/conf.d/pgconsul.conf && \ + cp /var/lib/dist/docker/pgconsul/pgbouncer.ini /etc/pgbouncer/pgbouncer.ini && \ + cp /var/lib/dist/docker/pgconsul/userlist.txt /etc/pgbouncer/userlist.txt && \ + cp /var/lib/dist/docker/pgconsul/sudoers /etc/sudoers.d/pgconsul && \ + cp /var/lib/dist/docker/pgconsul/setup.sh /usr/local/bin/setup.sh && \ + cp /var/lib/dist/docker/pgconsul/archive.passwd /etc/archive.passwd && \ + chown postgres:postgres /etc/archive.passwd && \ + chmod 600 /etc/archive.passwd + diff --git a/docker/pgconsul/archive.passwd b/docker/pgconsul/archive.passwd new file mode 100644 index 0000000..9f358a4 --- /dev/null +++ b/docker/pgconsul/archive.passwd @@ -0,0 +1 @@ +123456 diff --git a/docker/pgconsul/gen_rec_conf.sh b/docker/pgconsul/gen_rec_conf.sh new file mode 100755 index 0000000..08c488e --- /dev/null +++ b/docker/pgconsul/gen_rec_conf.sh @@ -0,0 +1,8 @@ +#!/bin/sh + +set -xe + +nm=$(hostname -f | sed -e 's/\./_/g' -e 's/\-/_/g') +echo "recovery_target_timeline = 'latest'\nprimary_conninfo = 'host=$1 
port=5432 user=repl application_name=$nm options=''-c wal_sender_timeout=30000'''\nprimary_slot_name = '$nm'" > $2 +pgdata=$(pg_lsclusters | tail -n 1 | awk '{print $6}') +touch ${pgdata}/standby.signal diff --git a/docker/pgconsul/pg_hba.conf b/docker/pgconsul/pg_hba.conf new file mode 100644 index 0000000..c5d5f3f --- /dev/null +++ b/docker/pgconsul/pg_hba.conf @@ -0,0 +1,9 @@ +local all postgres ident +local all all peer +host all all 127.0.0.1/32 scram-sha-256 +host all all ::1/128 scram-sha-256 +local replication postgres peer +host replication repl 0.0.0.0/0 scram-sha-256 +host replication repl ::/0 scram-sha-256 +host all repl 0.0.0.0/0 scram-sha-256 +host all repl ::/0 scram-sha-256 diff --git a/docker/pgconsul/pgbouncer.ini b/docker/pgconsul/pgbouncer.ini new file mode 100644 index 0000000..93bcc80 --- /dev/null +++ b/docker/pgconsul/pgbouncer.ini @@ -0,0 +1,25 @@ +[databases] +* = host=localhost +[pgbouncer] +logfile = /var/log/postgresql/pgbouncer.log +pidfile = /var/run/postgresql/pgbouncer.pid +listen_addr = * +listen_port = 6432 +auth_type = plain +auth_file = /etc/pgbouncer/userlist.txt +admin_users = postgres +stats_users = postgres +pool_mode = session +server_reset_query = +server_reset_query_always = 0 +ignore_startup_parameters = extra_float_digits +server_check_delay = 30 +application_name_add_host = 1 +max_client_conn = 1000 +default_pool_size = 50 +min_pool_size = 0 +log_connections = 1 +log_disconnections = 1 +log_pooler_errors = 1 +server_idle_timeout = 20 +server_connect_timeout = 3 diff --git a/docker/pgconsul/pgconsul_quorum.conf b/docker/pgconsul/pgconsul_quorum.conf new file mode 100644 index 0000000..a0510ba --- /dev/null +++ b/docker/pgconsul/pgconsul_quorum.conf @@ -0,0 +1,54 @@ +[global] +zk_lockpath_prefix = /pgconsul/postgresql/ +daemon_user = postgres +log_level = debug +log_file = /var/log/pgconsul/pgconsul.log +pid_file = /var/run/pgconsul/pgconsul.pid +working_dir = /tmp +local_conn_string = dbname=postgres user=postgres connect_timeout=1 +append_primary_conn_string = dbname=postgres user=repl password=repl connect_timeout=1 +iteration_timeout = 1 +zk_hosts = pgconsul_zookeeper1_1.pgconsul_pgconsul_net:2281,pgconsul_zookeeper2_1.pgconsul_pgconsul_net:2281,pgconsul_zookeeper3_1.pgconsul_pgconsul_net:2281 +use_replication_slots = yes +standalone_pooler = yes +quorum_commit = yes +use_lwaldump = yes +recovery_conf_rel_path = conf.d/recovery.conf +zk_connect_max_delay = 20 +zk_auth = yes +zk_username = user1 +zk_password = testpassword123 +zk_ssl = yes +keyfile = /etc/zk-ssl/server.key +certfile = /etc/zk-ssl/server.crt +ca_cert = /etc/zk-ssl/ca.cert.pem +verify_certs = yes + +[primary] +change_replication_type = yes +change_replication_metric = count,time +weekday_change_hours = 0-0 +weekend_change_hours = 0-0 +primary_switch_checks = 3 + +[replica] +failover_checks = 3 +primary_unavailability_timeout = 6 +start_pooler = yes +primary_switch_checks = 5 +min_failover_timeout = 300 +allow_potential_data_loss = no +recovery_timeout = 60 + +[commands] +promote = /bin/bash -c 'sleep 0.5; /usr/bin/postgresql/pg_ctl promote -D %p' +rewind = /usr/bin/postgresql/pg_rewind --target-pgdata=%p --source-server='host=%m dbname=postgres user=repl password=repl connect_timeout=1' +pg_start = /usr/bin/postgresql/pg_ctl start -s -w -t %t -D %p --log=/var/log/postgresql/postgresql.log +pg_stop = /usr/bin/postgresql/pg_ctl stop -s -m fast -w -t %t -D %p +pg_status = /usr/bin/postgresql/pg_ctl status -s -D %p +pg_reload = /bin/bash -c "/bin/bash -c 'sleep 0.2; 
/usr/bin/postgresql/pg_ctl reload -s -D %p' &" +pooler_start = sudo supervisorctl start pgbouncer +pooler_stop = sudo supervisorctl stop pgbouncer +pooler_status = sudo supervisorctl status pgbouncer >/dev/null 2>&1 +generate_recovery_conf = /usr/local/bin/gen_rec_conf.sh %m %p +get_control_parameter = /usr/bin/postgresql/pg_controldata %p | grep '%a:' diff --git a/docker/pgconsul/pgconsul_sync.conf b/docker/pgconsul/pgconsul_sync.conf new file mode 100644 index 0000000..3f81980 --- /dev/null +++ b/docker/pgconsul/pgconsul_sync.conf @@ -0,0 +1,54 @@ +[global] +zk_lockpath_prefix = /pgconsul/postgresql/ +daemon_user = postgres +log_level = debug +log_file = /var/log/pgconsul/pgconsul.log +pid_file = /var/run/pgconsul/pgconsul.pid +working_dir = /tmp +local_conn_string = dbname=postgres user=postgres connect_timeout=1 +append_primary_conn_string = dbname=postgres user=repl password=repl connect_timeout=1 +iteration_timeout = 1 +zk_hosts = pgconsul_zookeeper1_1.pgconsul_pgconsul_net:2281,pgconsul_zookeeper2_1.pgconsul_pgconsul_net:2281,pgconsul_zookeeper3_1.pgconsul_pgconsul_net:2281 +use_replication_slots = yes +standalone_pooler = yes +quorum_commit = no +use_lwaldump = yes +recovery_conf_rel_path = conf.d/recovery.conf +zk_connect_max_delay = 20 +zk_auth = yes +zk_username = user1 +zk_password = testpassword123 +zk_ssl = yes +keyfile = /etc/zk-ssl/server.key +certfile = /etc/zk-ssl/server.crt +ca_cert = /etc/zk-ssl/ca.cert.pem +verify_certs = yes + +[primary] +change_replication_type = yes +change_replication_metric = count,time +weekday_change_hours = 0-0 +weekend_change_hours = 0-0 +primary_switch_checks = 3 + +[replica] +failover_checks = 3 +primary_unavailability_timeout = 6 +start_pooler = yes +primary_switch_checks = 5 +min_failover_timeout = 300 +allow_potential_data_loss = no +recovery_timeout = 60 + +[commands] +promote = /usr/bin/postgresql/pg_ctl promote -D %p +rewind = /usr/bin/postgresql/pg_rewind --target-pgdata=%p --source-server='host=%m dbname=postgres user=repl password=repl connect_timeout=1' +pg_start = /usr/bin/postgresql/pg_ctl start -s -w -t %t -D %p --log=/var/log/postgresql/postgresql.log +pg_stop = /usr/bin/postgresql/pg_ctl stop -s -m fast -w -t %t -D %p +pg_status = /usr/bin/postgresql/pg_ctl status -s -D %p +pg_reload = /usr/bin/postgresql/pg_ctl reload -s -D %p +pooler_start = sudo supervisorctl start pgbouncer +pooler_stop = sudo supervisorctl stop pgbouncer +pooler_status = sudo supervisorctl status pgbouncer >/dev/null 2>&1 +generate_recovery_conf = /usr/local/bin/gen_rec_conf.sh %m %p +get_control_parameter = /usr/bin/postgresql/pg_controldata %p | grep '%a:' diff --git a/docker/pgconsul/postgresql.conf b/docker/pgconsul/postgresql.conf new file mode 100644 index 0000000..5cfff3f --- /dev/null +++ b/docker/pgconsul/postgresql.conf @@ -0,0 +1,34 @@ +listen_addresses = '*' +external_pid_file = '/var/run/postgresql/main.pid' +port = 5432 +max_connections = 100 +unix_socket_directories = '/var/run/postgresql,/tmp' +ssl = false +shared_buffers = 128MB +dynamic_shared_memory_type = posix +log_min_messages = info +log_line_prefix = '%t [%p-%l] %q%u@%d ' +log_timezone = 'UTC' +log_hostname = on +stats_temp_directory = '/var/run/postgresql/pg_stat_tmp' +datestyle = 'iso, mdy' +timezone = 'UTC' +lc_messages = 'C' +lc_monetary = 'C' +lc_numeric = 'C' +lc_time = 'C' +default_text_search_config = 'pg_catalog.english' +wal_level = hot_standby +wal_compression = on +wal_log_hints = on +min_wal_size = 1GB +max_wal_size = 16GB +max_replication_slots = 10 +max_wal_senders = 
10 +hot_standby = on +wal_receiver_status_interval = 1s +hot_standby_feedback = on +archive_mode = on +archive_command = 'echo "%p" && rsync --contimeout=1 --timeout=1 -a --password-file=/etc/archive.passwd %p rsync://archive@pgconsul_backup1_1.pgconsul_pgconsul_net:/archive/%f' +archive_timeout = 30 +restore_command = 'rsync -a --password-file=/etc/archive.passwd rsync://archive@pgconsul_backup1_1.pgconsul_pgconsul_net:/archive/%f %p' diff --git a/docker/pgconsul/setup.sh b/docker/pgconsul/setup.sh new file mode 100644 index 0000000..fe0cddb --- /dev/null +++ b/docker/pgconsul/setup.sh @@ -0,0 +1,90 @@ +#!/bin/bash +set -ex + +PG_MAJOR=$1 +PRIMARY=$2 +PGDATA="/var/lib/postgresql/${PG_MAJOR}/main" + +wait_pg() { + tries=0 + ret=1 + while [ ${tries} -le 60 ] + do + if (echo "select 1" | su - postgres -c "psql --set ON_ERROR_STOP=1" >/dev/null 2>&1) + then + ret=0 + break + else + tries=$(( tries + 1 )) + sleep 1 + fi + done + return ${ret} +} + +make_config() { + cat /root/postgresql.conf > ${PGDATA}/postgresql.conf + cat /root/pg_hba.conf > ${PGDATA}/pg_hba.conf + chown postgres:postgres ${PGDATA}/postgresql.conf ${PGDATA}/pg_hba.conf + echo "include = '${PGDATA}/postgresql.conf'" > /etc/postgresql/${PG_MAJOR}/main/postgresql.conf + echo "data_directory = '${PGDATA}'" >> /etc/postgresql/${PG_MAJOR}/main/postgresql.conf + echo "hba_file = '${PGDATA}/pg_hba.conf'" >> ${PGDATA}/postgresql.conf + echo "ident_file = '/etc/postgresql/${PG_MAJOR}/main/pg_ident.conf'" >> ${PGDATA}/postgresql.conf + echo "include_if_exists = '${PGDATA}/conf.d/recovery.conf'" >> ${PGDATA}/postgresql.conf +} + +supervisorctl stop pgconsul + +if [ "${PRIMARY}" = "" ] +then + pg_createcluster ${PG_MAJOR} main -- -k --auth-host=md5 + make_config + sudo -u postgres mkdir ${PGDATA}/conf.d + pg_ctlcluster ${PG_MAJOR} main start && \ + if ! wait_pg + then + exit 1 + fi + sudo -u postgres psql --set ON_ERROR_STOP=1 -c 'CREATE EXTENSION IF NOT EXISTS lwaldump' + while : + do + echo "create user repl with encrypted password 'repl' replication superuser;" | su - postgres -c psql >/dev/null 2>&1 + supervisorctl start pgconsul 2>/dev/null >/dev/null || supervisorctl status pgconsul + if psql --set ON_ERROR_STOP=1 -c 'CREATE TABLE IF NOT EXISTS set (value integer primary key)' "host=localhost port=6432 dbname=postgres user=repl" >/dev/null 2>&1 + then + break + else + sleep 1 + fi + done +else + pg_createcluster $PG_MAJOR main -- --auth-host=md5 + make_config + echo -n "Waiting while primary is ready... 
" + while : + do + psql --set ON_ERROR_STOP=1 -c 'select 1' "host=${PRIMARY} port=6432 dbname=postgres user=repl" >/dev/null 2>&1 && \ + if [ -f /tmp/pgconsul_init ] + then + echo "trying to start pgconsul" + supervisorctl start pgconsul 2>/dev/null >/dev/null || supervisorctl status pgconsul + else + echo "starting setup" + rm -rf ${PGDATA}/* && \ + (psql "host=${PRIMARY} port=6432 dbname=postgres user=repl" -c "select pg_drop_replication_slot('$(hostname -f | sed -e 's/\./_/g' -e 's/\-/_/g')');" >/dev/null 2>&1 || true) && \ + psql "host=${PRIMARY} port=6432 dbname=postgres user=repl" -c "select pg_create_physical_replication_slot('$(hostname -f | sed -e 's/\./_/g' -e 's/\-/_/g')');" >/dev/null 2>&1 || true && \ + su - postgres -c "pg_basebackup --pgdata=${PGDATA} --wal-method=fetch --dbname=\"host=${PRIMARY} port=5432 dbname=postgres user=repl\"" && \ + su - postgres -c "/usr/local/bin/gen_rec_conf.sh ${PRIMARY} ${PGDATA}/conf.d/recovery.conf" && \ + pg_ctlcluster $PG_MAJOR main start; \ + wait_pg && \ + touch /tmp/pgconsul_init && \ + (supervisorctl start pgconsul 2>/dev/null >/dev/null || supervisorctl status pgconsul) + fi + if psql --set ON_ERROR_STOP=1 -c 'select 1' "host=localhost port=6432 dbname=postgres user=repl" >/dev/null 2>&1 + then + break + else + sleep 1 + fi + done +fi diff --git a/docker/pgconsul/sudoers b/docker/pgconsul/sudoers new file mode 100644 index 0000000..d488ee6 --- /dev/null +++ b/docker/pgconsul/sudoers @@ -0,0 +1 @@ +postgres ALL, ALL = NOPASSWD: /usr/local/bin/supervisorctl * diff --git a/docker/pgconsul/supervisor.conf b/docker/pgconsul/supervisor.conf new file mode 100644 index 0000000..4f025cd --- /dev/null +++ b/docker/pgconsul/supervisor.conf @@ -0,0 +1,50 @@ +[program:pgbouncer] +command=/usr/sbin/pgbouncer /etc/pgbouncer/pgbouncer.ini +process_name=%(program_name)s +autostart=false +autorestart=false +stopsignal=TERM +user=postgres +stdout_logfile=/var/log/pgbouncer.log +stdout_logfile_maxbytes=0 +stderr_logfile=/var/log/pgbouncer.log +stderr_logfile_maxbytes=0 + +[program:prestart_pgconsul] +user=root +command=bash -c "mkdir -p /var/run/pgconsul && chown postgres:postgres /var/run/pgconsul && mkdir -p /var/log/pgconsul && chown postgres:postgres /var/log/pgconsul" +autostart=true +autorestart=unexpected +exitcodes=0 +startsecs=0 +priority=1 +stdout_logfile=/proc/self/fd/1 +stdout_logfile_maxbytes=0 +stderr_logfile=/proc/self/fd/2 +stderr_logfile_maxbytes=0 + +[program:pgconsul] +command=/usr/local/bin/pgconsul -f yes +process_name=%(program_name)s +autostart=false +autorestart=true +stopsignal=TERM +user=postgres +priority=10 +stdout_logfile=/var/log/pgconsul.log +stdout_logfile_maxbytes=0 +stderr_logfile=/var/log/pgconsul.log +stderr_logfile_maxbytes=0 + +[program:prestart_postgresql] +user=root +command=bash -c "mkdir -p /var/run/postgresql/pg_stat_tmp && chown -R postgres:postgres /var/run/postgresql" +autostart=true +autorestart=unexpected +exitcodes=0 +startsecs=0 +priority=1 +stdout_logfile=/proc/self/fd/1 +stdout_logfile_maxbytes=0 +stderr_logfile=/proc/self/fd/2 +stderr_logfile_maxbytes=0 diff --git a/docker/pgconsul/userlist.txt b/docker/pgconsul/userlist.txt new file mode 100644 index 0000000..1a98a66 --- /dev/null +++ b/docker/pgconsul/userlist.txt @@ -0,0 +1 @@ +"repl" "repl" diff --git a/docker/zookeeper/Dockerfile b/docker/zookeeper/Dockerfile new file mode 100644 index 0000000..171de98 --- /dev/null +++ b/docker/zookeeper/Dockerfile @@ -0,0 +1,11 @@ +FROM pgconsulbase:latest +ARG VERSION +ENV ZK_VERSION ${VERSION} +RUN tar -xzf 
/var/lib/dist/docker/zookeeper/zookeeper-${VERSION}.tar.gz -C /opt && \ + mv /opt/apache-zookeeper-${VERSION}-bin /opt/zookeeper && \ + cp /var/lib/dist/docker/zookeeper/zoo.cfg /opt/zookeeper/conf/zoo.cfg && \ + cp /var/lib/dist/docker/zookeeper/pre.sh /opt/zookeeper/bin/pre.sh && \ + chmod +x /opt/zookeeper/bin/pre.sh && \ + sed -i "s/{zk_version}/${VERSION}/g" /var/lib/dist/docker/zookeeper/zookeeper.conf && \ + cp /var/lib/dist/docker/zookeeper/zookeeper.conf /etc/supervisor/conf.d && \ + mkdir -p /var/log/zookeeper diff --git a/docker/zookeeper/pre.sh b/docker/zookeeper/pre.sh new file mode 100755 index 0000000..6305d1e --- /dev/null +++ b/docker/zookeeper/pre.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +# This puts zookeeper node id to its 'myid' file +# For more info look here at steps 4, 5 : http://zookeeper.apache.org/doc/r3.5.7/zookeeperAdmin.html#sc_zkMulitServerSetup +mkdir -p /tmp/zookeeper +for ip in $(ifconfig -a | grep 'inet' | awk '{print $2}') +do + ID=$(grep -F "$ip" /opt/zookeeper/conf/zoo.cfg | cut -d= -f1 | cut -d. -f2) + if [ -n "$ID" ] + then + echo "$ID" > /tmp/zookeeper/myid + break + fi +done diff --git a/docker/zookeeper/zoo.cfg b/docker/zookeeper/zoo.cfg new file mode 100644 index 0000000..902c9f1 --- /dev/null +++ b/docker/zookeeper/zoo.cfg @@ -0,0 +1,50 @@ +# The number of milliseconds of each tick +tickTime=2000 +# The number of ticks that the initial +# synchronization phase can take +initLimit=100 +# The number of ticks that can pass between +# sending a request and getting an acknowledgement +syncLimit=20 +# the directory where the snapshot is stored. +# do not use /tmp for storage, /tmp here is just +# example sakes. +dataDir=/tmp/zookeeper +# the port at which the clients will connect +clientPort=2181 +# the maximum session timeout in milliseconds that the server +# will allow the client to negotiate. +maxSessionTimeout=60000 +snapCount=1000000 +forceSync=no +# the timeout value for opening connections for leader election notifications. +cnxTimeout=3000 +# the maximum number of client connections. +# increase this if you need to handle more clients +#maxClientCnxns=60 +# +# Be sure to read the maintenance section of the +# administrator guide before turning on autopurge. 
+# +# http://zookeeper.apache.org/doc/current/zookeeperAdmin.html#sc_maintenance +# +# The number of snapshots to retain in dataDir +autopurge.snapRetainCount=3 +# Purge task interval in hours +# Set to "0" to disable auto purge feature +autopurge.purgeInterval=0 +leaderServes=yes +quorumListenOnAllIPs=true +jute.maxbuffer=16777216 +secureClientPort=2281 +serverCnxnFactory=org.apache.zookeeper.server.NettyServerCnxnFactory +portUnification=false +skipACL=no +ssl.trustStore.password=testpassword123 +ssl.trustStore.location=/etc/zk-ssl/truststore.jks +ssl.keyStore.password=testpassword321 +ssl.keyStore.location=/etc/zk-ssl/server.jks + +server.1=192.168.233.10:2188:2189 +server.2=192.168.233.11:2188:2189 +server.3=192.168.233.12:2188:2189 diff --git a/docker/zookeeper/zookeeper.conf b/docker/zookeeper/zookeeper.conf new file mode 100644 index 0000000..75bd8a4 --- /dev/null +++ b/docker/zookeeper/zookeeper.conf @@ -0,0 +1,25 @@ +[program:zookeeper_pre] +command=/opt/zookeeper/bin/pre.sh +process_name=%(program_name)s +autostart=true +autorestart=true +stdout_logfile=/proc/self/fd/1 +stdout_logfile_maxbytes=0 +stderr_logfile=/proc/self/fd/2 +stderr_logfile_maxbytes=0 +priority=100 + +[program:zookeeper] +command=/opt/zookeeper/bin/zkServer.sh start-foreground +environment=ZOO_LOG_DIR=/var/log/zookeeper,ZOO_LOG4J_PROP='INFO,ROLLINGFILE' +process_name=%(program_name)s +autostart=true +autorestart=true +stopsignal=TERM +log_level=debug +user=root +stdout_logfile=/proc/self/fd/1 +stdout_logfile_maxbytes=0 +stderr_logfile=/proc/self/fd/2 +stderr_logfile_maxbytes=0 +priority=200 diff --git a/jepsen-compose.yml b/jepsen-compose.yml new file mode 100644 index 0000000..73aec05 --- /dev/null +++ b/jepsen-compose.yml @@ -0,0 +1,127 @@ +version: '2.2' + +services: + zookeeper1: + build: + context: ./docker/zookeeper + args: + - VERSION=${ZK_VERSION} + privileged: true + hostname: pgconsul_zookeeper1_1 + domainname: pgconsul_pgconsul_net + container_name: pgconsul_zookeeper1_1 + init: true + networks: + pgconsul_net: + ipv4_address: 192.168.233.10 + zookeeper2: + build: + context: ./docker/zookeeper + args: + - VERSION=${ZK_VERSION} + privileged: true + hostname: pgconsul_zookeeper2_1 + domainname: pgconsul_pgconsul_net + container_name: pgconsul_zookeeper2_1 + init: true + networks: + pgconsul_net: + ipv4_address: 192.168.233.11 + zookeeper3: + build: + context: ./docker/zookeeper + args: + - VERSION=${ZK_VERSION} + privileged: true + hostname: pgconsul_zookeeper3_1 + domainname: pgconsul_pgconsul_net + container_name: pgconsul_zookeeper3_1 + init: true + networks: + pgconsul_net: + ipv4_address: 192.168.233.12 + backup1: + build: ./docker/backup + hostname: pgconsul_backup1_1 + domainname: pgconsul_pgconsul_net + init: true + networks: + pgconsul_net: + ipv4_address: 192.168.233.13 + postgresql1: + build: ./docker/pgconsul + privileged: true + hostname: pgconsul_postgresql1_1 + domainname: pgconsul_pgconsul_net + container_name: pgconsul_postgresql1_1 + init: true + extra_hosts: + - "pgconsul_postgresql2_1.pgconsul_pgconsul_net:192.168.233.15" + - "pgconsul_postgresql3_1.pgconsul_pgconsul_net:192.168.233.16" + - "pgconsul_zookeeper1_1.pgconsul_pgconsul_net:192.168.233.10" + - "pgconsul_zookeeper2_1.pgconsul_pgconsul_net:192.168.233.11" + - "pgconsul_zookeeper3_1.pgconsul_pgconsul_net:192.168.233.12" + - "pgconsul_backup1_1.pgconsul_pgconsul_net:192.168.233.13" + networks: + pgconsul_net: + ipv4_address: 192.168.233.14 + postgresql2: + build: ./docker/pgconsul + privileged: true + hostname: 
pgconsul_postgresql2_1 + domainname: pgconsul_pgconsul_net + container_name: pgconsul_postgresql2_1 + init: true + extra_hosts: + - "pgconsul_postgresql1_1.pgconsul_pgconsul_net:192.168.233.14" + - "pgconsul_postgresql3_1.pgconsul_pgconsul_net:192.168.233.16" + - "pgconsul_zookeeper1_1.pgconsul_pgconsul_net:192.168.233.10" + - "pgconsul_zookeeper2_1.pgconsul_pgconsul_net:192.168.233.11" + - "pgconsul_zookeeper3_1.pgconsul_pgconsul_net:192.168.233.12" + - "pgconsul_backup1_1.pgconsul_pgconsul_net:192.168.233.13" + networks: + pgconsul_net: + ipv4_address: 192.168.233.15 + postgresql3: + build: ./docker/pgconsul + privileged: true + hostname: pgconsul_postgresql3_1 + domainname: pgconsul_pgconsul_net + container_name: pgconsul_postgresql3_1 + init: true + extra_hosts: + - "pgconsul_postgresql1_1.pgconsul_pgconsul_net:192.168.233.14" + - "pgconsul_postgresql2_1.pgconsul_pgconsul_net:192.168.233.15" + - "pgconsul_zookeeper1_1.pgconsul_pgconsul_net:192.168.233.10" + - "pgconsul_zookeeper2_1.pgconsul_pgconsul_net:192.168.233.11" + - "pgconsul_zookeeper3_1.pgconsul_pgconsul_net:192.168.233.12" + - "pgconsul_backup1_1.pgconsul_pgconsul_net:192.168.233.13" + networks: + pgconsul_net: + ipv4_address: 192.168.233.16 + jepsen: + build: ./docker/jepsen + hostname: pgconsul_jepsen_1 + domainname: pgconsul_pgconsul_net + container_name: pgconsul_jepsen_1 + init: true + extra_hosts: + - "pgconsul_postgresql1_1.pgconsul_pgconsul_net:192.168.233.14" + - "pgconsul_postgresql2_1.pgconsul_pgconsul_net:192.168.233.15" + - "pgconsul_postgresql3_1.pgconsul_pgconsul_net:192.168.233.16" + - "pgconsul_zookeeper1_1.pgconsul_pgconsul_net:192.168.233.10" + - "pgconsul_zookeeper2_1.pgconsul_pgconsul_net:192.168.233.11" + - "pgconsul_zookeeper3_1.pgconsul_pgconsul_net:192.168.233.12" + - "pgconsul_backup1_1.pgconsul_pgconsul_net:192.168.233.13" + networks: + pgconsul_net: + ipv4_address: 192.168.233.17 + +networks: + pgconsul_net: + driver: bridge + ipam: + driver: default + config: + - subnet: 192.168.233.0/24 + gateway: 192.168.233.1 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..f4cc611 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +kazoo==2.6.1 +psycopg2-binary==2.8.4 +lockfile==0.12.2 +python-daemon==2.1.1 +pyyaml==5.3.1 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..2d65043 --- /dev/null +++ b/setup.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python +# encoding: utf-8 + +from setuptools import setup + +setup( + name='pgconsul', + version='3.0', + author='Vladimir Borodin', + author_email='d0uble@yandex-team.ru', + url='https://github.com/yandex/pgconsul', + description="Automatic failover of PostgreSQL with help of ZK", + long_description="Automatic failover of PostgreSQL with help of ZK", + license="PostgreSQL", + platforms=["Linux", "BSD", "MacOS"], + zip_safe=False, + packages=['pgconsul'], + package_dir={'pgconsul': 'src'}, + package_data={'pgconsul': ['src/plugins/', 'plugins/*.py']}, + entry_points={ + 'console_scripts': [ + 'pgconsul = pgconsul:main', + ] + }, + scripts=["bin/pgconsul-util"], +) diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..3e20c42 --- /dev/null +++ b/src/__init__.py @@ -0,0 +1,207 @@ +""" +Automatic failover of PostgreSQL with help of ZK +""" +# encoding: utf-8 + +import logging +import os +import sys + +try: + from configparser import RawConfigParser +except ImportError: + from ConfigParser import RawConfigParser +from argparse import ArgumentParser +from pwd import getpwnam + +from lockfile 
import AlreadyLocked +from lockfile.pidlockfile import PIDLockFile +import daemon +from .main import pgconsul + + +def parse_cmd_args(): + """ + Parse args and return result + """ + usage = "Usage: %prog [options]" + parser = ArgumentParser(usage=usage) + parser.add_argument("-c", "--config", dest="config_file", default='/etc/pgconsul.conf') + parser.add_argument("-p", "--pid-file", dest="pid_file", default=None) + parser.add_argument("-l", "--log-file", dest="log_file", default=None) + parser.add_argument("-f", "--foreground", dest="foreground", default="no") + parser.add_argument("--log-level", dest="log_level", default=None) + parser.add_argument("-w", "--working-dir", dest="working_dir", default=None) + return parser.parse_args() + + +def read_config(filename=None, options=None): + """ + Merge config with default values and cmd options + """ + defaults = { + 'global': { + 'log_file': '/var/log/pgconsul/pgconsul.log', + 'log_level': 'debug', + 'pid_file': '/var/run/pgconsul/pgconsul.pid', + 'working_dir': '.', + 'foreground': 'no', + 'local_conn_string': 'dbname=postgres ' + 'user=postgres connect_timeout=1', + 'append_primary_conn_string': 'connect_timeout=1', + 'iteration_timeout': 1.0, + 'zk_hosts': 'localhost:2181', + 'zk_lockpath_prefix': None, + 'plugins_path': '/etc/pgconsul/plugins', + 'recovery_conf_rel_path': 'recovery.conf', + 'use_replication_slots': 'no', + 'max_rewind_retries': 3, + 'postgres_timeout': 60, + 'election_timeout': 5, + 'priority': 0, + 'update_prio_in_zk': 'yes', + 'standalone_pooler': 'yes', + 'pooler_port': 6432, + 'pooler_addr': 'localhost', + 'pooler_conn_timeout': 1, + 'stream_from': None, + 'autofailover': 'yes', + 'do_consecutive_primary_switch': 'no', + 'quorum_commit': 'no', + 'use_lwaldump': 'no', + 'zk_connect_max_delay': 60, + 'zk_auth': 'no', + 'zk_username': None, + 'zk_password': None, + 'zk_ssl': 'no', + 'keyfile': None, + 'certfile': None, + 'ca_cert': None, + 'verify_certs': 'no', + }, + 'primary': { + 'change_replication_type': 'yes', + 'change_replication_metric': 'count,load', + 'overload_sessions_ratio': 75, + 'weekday_change_hours': '10-22', + 'weekend_change_hours': '0-0', + 'primary_switch_checks': 3, + 'sync_replication_in_maintenance': 'yes', + }, + 'replica': { + 'primary_unavailability_timeout': 5, + 'start_pooler': 'yes', + 'primary_switch_checks': 3, + 'min_failover_timeout': 3600, + 'allow_potential_data_loss': 'no', + 'recovery_timeout': 60, + 'can_delayed': 'no', + 'primary_switch_restart': 'yes', + }, + 'commands': { + 'promote': '/usr/lib/postgresql/10/bin/pg_ctl promote -D %p', + 'rewind': "/usr/lib/postgresql/10/bin/pg_rewind" + " --target-pgdata=%p --source-server='host=%m connect_timeout=10'", + 'get_control_parameter': "/usr/lib/postgresql/10/bin/pg_controldata %p | grep '%a:'", + 'pg_start': 'sudo service postgresql-10 start', + 'pg_stop': '/usr/lib/postgresql/10/bin/pg_ctl stop -s -m fast -w -t %t -D %p', + 'pg_status': 'sudo service postgresql-10 status', + 'pg_reload': '/usr/lib/postgresql/10/bin/pg_ctl reload -s -D %p', + 'pooler_start': 'sudo service pgbouncer start', + 'pooler_stop': 'sudo service pgbouncer stop', + 'pooler_status': 'sudo service pgbouncer status >/dev/null 2>&1', + 'list_clusters': 'pg_lsclusters --no-header', + 'generate_recovery_conf': '/usr/local/yandex/populate_recovery_conf.py -s -r -p %p %m', + }, + 'debug': {}, + 'plugins': {'wals_to_upload': 20}, + } + + config = RawConfigParser() + if not filename: + filename = options.config_file + + config.read(filename) + + # + # Appending default 
config with default values. + # + for section in defaults: + if not config.has_section(section): + config.add_section(section) + for key, value in defaults[section].items(): + if not config.has_option(section, key): + config.set(section, key, value) + + # + # Rewriting global config with parameters from command line. + # + if options: + for key, value in vars(options).items(): + if value is not None: + config.set('global', key, value) + + return config + + +def init_logging(config): + """ + Set log level and format + """ + level = getattr(logging, config.get('global', 'log_level').upper()) + logging.getLogger('kazoo').setLevel(logging.WARN) + logging.basicConfig(level=level, format='%(asctime)s %(levelname)s:\t%(message)s') + + +def start(config): + """ + Start daemon + """ + usr = getpwnam(config.get('global', 'daemon_user')) + + init_logging(config) + + pidfile = PIDLockFile(config.get('global', 'pid_file'), timeout=-1) + + try: + pidfile.acquire() + except AlreadyLocked: + try: + os.kill(pidfile.read_pid(), 0) + print('Already running!') + sys.exit(1) + except OSError: + pass + + pidfile.break_lock() + + if config.getboolean('global', 'foreground'): + working_dir = config.get('global', 'working_dir') + with daemon.DaemonContext( + working_directory=working_dir, + uid=usr.pw_uid, + gid=usr.pw_gid, + detach_process=False, + stdout=sys.stdout, + stderr=sys.stderr, + pidfile=pidfile, + ): + pgconsul(config=config).start() + else: + working_dir = config.get('global', 'working_dir') + logfile = open(config.get('global', 'log_file'), 'a') + with daemon.DaemonContext(working_directory=working_dir, stdout=logfile, stderr=logfile, pidfile=pidfile): + pgconsul(config=config).start() + + +def main(): + """ + Main function. All magic is done here + """ + + options = parse_cmd_args() + config = read_config(filename=options.config_file, options=options) + start(config) + + +if __name__ == '__main__': + main() diff --git a/src/cli.py b/src/cli.py new file mode 100644 index 0000000..3215091 --- /dev/null +++ b/src/cli.py @@ -0,0 +1,365 @@ +# coding: utf-8 +""" +Various utility fucntions: + - Zookeeper structures init + - Scheduled switchover +""" +import argparse +import functools +import json +import yaml +import socket +import sys +import logging + +from . import read_config, init_logging, zk as zookeeper +from . import helpers +from . import utils +from .exceptions import SwitchoverException + + +class ParseHosts(argparse.Action): + """ + Check validity of provided hostnames + """ + + def __call__(self, parser, namespace, values, option_string=None): + for value in values: + try: + socket.getaddrinfo(value, 0) + except Exception as exc: + raise ValueError('invalid hostname: %s: %s' % (value, exc)) + namespace.members.append(value) + + +def entry(): + """ + Entry point. + """ + opts = parse_args() + conf = read_config( + filename=opts.config_file, + options=opts, + ) + init_logging(conf) + try: + opts.action(opts, conf) + except (KeyboardInterrupt, EOFError): + logging.error('abort') + sys.exit(1) + except RuntimeError as err: + logging.error(err) + sys.exit(1) + except Exception as exc: + logging.exception(exc) + sys.exit(1) + + +def maintenance_enabled(zk): + """ + Returns True if all hosts confirmed that maintenance is enabled. 
+ """ + for host in zk.get_alive_hosts(): + if zk.get(zk.get_host_maintenance_path(host)) != 'enable': + return False + return True + + +def maintenance_disabled(zk): + """ + Common maintenance node should be deleted + """ + return zk.get(zk.MAINTENANCE_PATH) is None + + +def _wait_maintenance_enabled(zk, timeout): + is_maintenance_enabled = functools.partial(maintenance_enabled, zk) + if not helpers.await_for(is_maintenance_enabled, timeout, 'enabling maintenance mode'): + # Return cluster to last state, i.e. disable maintenance. + zk.write(zk.MAINTENANCE_PATH, 'disable') + raise TimeoutError + logging.info('Success') + + +def _wait_maintenance_disabled(zk, timeout): + is_maintenance_disabled = functools.partial(maintenance_disabled, zk) + if not helpers.await_for(is_maintenance_disabled, timeout, 'disabling maintenance mode'): + # Return cluster to the last state, i.e. enable maintenance. + # There is obvious race condition between time when primary deletes this node + # and we write value here. We assume that big timeout will help us here. + zk.write(zk.MAINTENANCE_PATH, 'enable') + raise TimeoutError + logging.info('Success') + + +def maintenance(opts, conf): + """ + Enable or disable maintenance mode. + """ + zk = zookeeper.Zookeeper(config=conf, plugins=None) + if opts.mode == 'enable': + zk.ensure_path(zk.MAINTENANCE_PATH) + zk.noexcept_write(zk.MAINTENANCE_PATH, 'enable', need_lock=False) + if opts.wait_all: + _wait_maintenance_enabled(zk, opts.timeout) + elif opts.mode == 'disable': + zk.write(zk.MAINTENANCE_PATH, 'disable', need_lock=False) + if opts.wait_all: + _wait_maintenance_disabled(zk, opts.timeout) + elif opts.mode == 'show': + val = zk.get(zk.MAINTENANCE_PATH) or 'disable' + print('{val}d'.format(val=val)) + + +def initzk(opts, conf): + """ + Creates structures in zk.MEMBERS_PATH corresponding + to members` names or checks if it has been done earlier. + ! We override iteration_timeout here because it's timeout for ZK operations, + for initzk is not important how fast zk response, but it's use in cluster restore + and can fail if zk didn't response for 1 second + """ + conf.set('global', 'iteration_timeout', 5) + zk = zookeeper.Zookeeper(config=conf, plugins=None) + for host in opts.members: + path = '{members}/{host}'.format(members=zk.MEMBERS_PATH, host=host) + if opts.test: + logging.debug(f'Fetching path "{path}"...') + if not zk.exists_path(path): + logging.debug(f'Path "{path}" not found in ZK, initialization has not been performed earlier') + sys.exit(2) + else: + logging.debug('creating "%s"...', path) + if not zk.ensure_path(path): + raise RuntimeError(f'Could not create path "{path}" in ZK') + if opts.test: + logging.debug('Initialization for all fqdns has been performed earlier') + else: + logging.debug('ZK structures are initialized') + + +def switchover(opts, conf): + """ + Perform planned switchover. + """ + try: + switch = utils.Switchover( + conf=conf, primary=opts.primary, timeline=opts.timeline, new_primary=opts.destination, timeout=opts.timeout + ) + if opts.reset: + return switch.reset(force=True) + logging.info('switchover %(primary)s (timeline: %(timeline)s) to %(sync_replica)s', switch.plan()) + # ask user confirmation if necessary. + if not opts.yes: + helpers.confirm() + # perform returns False on soft-fail. + # right now it happens when an unexpected host has become + # the new primary instead of intended sync replica. 
+ if not switch.is_possible(): + logging.error('Switchover is impossible now.') + sys.exit(1) + if not switch.perform(opts.replicas, block=opts.block): + sys.exit(2) + except SwitchoverException as exc: + logging.error('unable to switchover: %s', exc) + sys.exit(1) + + +def show_info(opts, conf): + """ + Show cluster's information + """ + info = _show_info(opts, conf) + style = {'sort_keys': True, 'indent': 4} + if info is not None: + if opts.json: + print(json.dumps(info, **style)) + else: + print(yaml.dump(info, **style)) + + +def _show_info(opts, conf): + zk = zookeeper.Zookeeper(config=conf, plugins=None) + zk_state = zk.get_state() + zk_state['primary'] = zk_state.pop('lock_holder') # rename field name to avoid misunderstunding + if zk_state[zk.MAINTENANCE_PATH]['status'] is None: + zk_state[zk.MAINTENANCE_PATH] = None + + if opts.short: + return { + 'alive': zk_state['alive'], + 'primary': zk_state['primary'], + 'last_failover_time': zk_state[zk.LAST_FAILOVER_TIME_PATH], + 'maintenance': zk_state[zk.MAINTENANCE_PATH], + 'replics_info': _short_replica_infos(zk_state['replics_info']), + } + + db_state = _get_db_state(conf) + return {**db_state, **zk_state} + + +def _get_db_state(conf): + fname = '%s/.pgconsul_db_state.cache' % conf.get('global', 'working_dir') + try: + with open(fname, 'r') as fobj: + return json.loads(fobj.read()) + except Exception: + logging.info("Can't load pgconsul status from %s, skipping", fname) + return dict() + + +def _short_replica_infos(replics): + ret = {} + if replics is None: + return ret + for replica in replics: + ret[replica['client_hostname']] = ', '.join( + [ + replica['state'], + 'sync_state {0}'.format(replica['sync_state']), + 'replay_lag_msec {0}'.format(replica['replay_lag_msec']), + ] + ) + return ret + + +def parse_args(): + """ + Parse multiple commands. + """ + arg = argparse.ArgumentParser( + description=""" + pgconsul utility + """ + ) + arg.add_argument( + '-c', + '--config', + dest='config_file', + type=str, + metavar='', + default='/etc/pgconsul.conf', + help='path to pgconsul main config file', + ) + arg.add_argument( + '--zk', + type=str, + dest='zk_hosts', + metavar=',[,...]', + help='override config zookeeper connection string', + ) + arg.add_argument( + '--zk-prefix', + metavar='', + type=str, + dest='zk_lockpath_prefix', + help='override config zookeeper path prefix', + ) + arg.set_defaults(action=lambda *_: arg.print_help()) + + subarg = arg.add_subparsers( + help='possible actions', title='subcommands', description='for more info, see -h' + ) + + # Init ZK command + initzk_arg = subarg.add_parser('initzk', help='define zookeeper structures') + initzk_arg.add_argument( + 'members', + metavar=' [ ...]', + action=ParseHosts, + default=[], + nargs='+', + help='Space-separated list of cluster members hostnames', + ) + initzk_arg.add_argument( + '-t', + '--test', + action='store_true', + default=False, + help='Check if zookeeper intialization had already been performed for given hosts. 
Returns 0 if it had.', + ) + initzk_arg.set_defaults(action=initzk) + + maintenance_arg = subarg.add_parser('maintenance', help='maintenance mode') + maintenance_arg.add_argument( + '-m', '--mode', metavar='[enable, disable, show]', default='enable', help='Enable or disable maintenance mode' + ) + maintenance_arg.add_argument( + '-w', + '--wait_all', + help='Wait for all alive high-availability hosts finish entering/exiting maintenance mode', + action='store_true', + default=False, + ) + maintenance_arg.add_argument( + '-t', '--timeout', help='Set timeout for maintenance command with --wait_all option', type=int, default=5 * 60 + ) + maintenance_arg.set_defaults(action=maintenance) + + # Info command + info_arg = subarg.add_parser('info', help='info about cluster') + info_arg.add_argument( + '-s', + '--short', + help='short output from zookeeper', + action='store_true', + default=False, + ) + info_arg.add_argument( + '-j', + '--json', + help='show output in json format', + action='store_true', + default=False, + ) + info_arg.set_defaults(action=show_info) + + # Scheduled switchover command + switch_arg = subarg.add_parser( + 'switchover', + help='perform graceful switchover', + description=""" + Perform graceful switchover of the current primary. + The default is to auto-detect its hostname and + timeline in ZK. + This behaviour can be overridden with options below. + """, + ) + switch_arg.add_argument('-d', '--destination', help='sets host where to switch', default=None, metavar='') + switch_arg.add_argument( + '-b', '--block', help='block until switchover completes or fails', default=False, action='store_true' + ) + switch_arg.add_argument( + '-t', + '--timeout', + help='limit each step to this amount of seconds', + type=int, + default=60, + metavar='', + ) + switch_arg.add_argument( + '-y', '--yes', help='do not ask confirmation before proceeding', default=False, action='store_true' + ) + switch_arg.add_argument( + '-r', + '--reset', + help='reset switchover state in ZK (potentially disruptive)', + default=False, + action='store_true', + ) + switch_arg.add_argument( + '--replicas', + help='if in blocking mode, wait until this number of replicas become online', + type=int, + default=2, + metavar='', + ) + switch_arg.add_argument('--primary', help='override current primary hostname', default=None, metavar='') + switch_arg.add_argument('--timeline', help='override current primary timeline', default=None, metavar='') + switch_arg.set_defaults(action=switchover) + + try: + return arg.parse_args() + except ValueError as err: + arg.exit(message='%s\n' % err) + exit(1) diff --git a/src/command_manager.py b/src/command_manager.py new file mode 100644 index 0000000..7f5b2ce --- /dev/null +++ b/src/command_manager.py @@ -0,0 +1,74 @@ +from . 
import helpers + + +_substitutions = { + 'pgdata': '%p', + 'primary_host': '%m', + 'timeout': '%t', + 'argument': '%a', +} + + +@helpers.decorate_all_class_methods(helpers.func_name_logger) +class CommandManager: + def __init__(self, config): + self._config = config + + def _prepare_command(self, command_name, **kwargs): + command = self._config.get('commands', command_name) + for arg_name, arg_value in kwargs.items(): + command = command.replace(_substitutions[arg_name], str(arg_value)) + return command + + def _exec_command(self, command_name, **kwargs): + command = self._prepare_command(command_name, **kwargs) + return helpers.subprocess_call(command) + + def promote(self, pgdata): + return self._exec_command('promote', pgdata=pgdata) + + def rewind(self, pgdata, primary_host): + return self._exec_command('rewind', pgdata=pgdata, primary_host=primary_host) + + def get_control_parameter(self, pgdata, parameter, preproc=None, log=True): + command = self._prepare_command('get_control_parameter', pgdata=pgdata, argument=parameter) + res = helpers.subprocess_popen(command, log_cmd=log) + if not res: + return None + value = res.communicate()[0].decode('utf-8').split(':')[-1].strip() + if preproc: + return preproc(value) + else: + return value + + def list_clusters(self, log=True): + command = self._prepare_command('list_clusters') + res = helpers.subprocess_popen(command, log_cmd=log) + if not res: + return None + output, _ = res.communicate() + return output.decode('utf-8').rstrip('\n').split('\n') + + def start_postgresql(self, timeout, pgdata): + return self._exec_command('pg_start', timeout=timeout, pgdata=pgdata) + + def stop_postgresql(self, timeout, pgdata): + return self._exec_command('pg_stop', timeout=timeout, pgdata=pgdata) + + def get_postgresql_status(self, pgdata): + return self._exec_command('pg_status', pgdata=pgdata) + + def reload_postgresql(self, pgdata): + return self._exec_command('pg_reload', pgdata=pgdata) + + def start_pooler(self): + return self._exec_command('pooler_start') + + def stop_pooler(self): + return self._exec_command('pooler_stop') + + def get_pooler_status(self): + return self._exec_command('pooler_status') + + def generate_recovery_conf(self, filepath, primary_host): + return self._exec_command('generate_recovery_conf', pgdata=filepath, primary_host=primary_host) diff --git a/src/exceptions.py b/src/exceptions.py new file mode 100644 index 0000000..f542d70 --- /dev/null +++ b/src/exceptions.py @@ -0,0 +1,36 @@ +# coding: utf8 +""" +Describes exception classes used in pgconsul. +""" + + +class pgconsulException(Exception): + """ + Generic pgconsul exception. + """ + + pass + + +class SwitchoverException(pgconsulException): + """ + Exception for fatal errors during switchover. + """ + + pass + + +class PGIsShuttingDown(pgconsulException): + """ + Postgres is shutting down + """ + + pass + + +class PGIsStartingUp(pgconsulException): + """ + Postgres is starting up + """ + + pass diff --git a/src/failover_election.py b/src/failover_election.py new file mode 100644 index 0000000..dd17df5 --- /dev/null +++ b/src/failover_election.py @@ -0,0 +1,234 @@ +# encoding: utf-8 +import logging +import time + +from . 
import helpers + +STATUS_CLEANUP = 'cleanup' +STATUS_FAILED = 'failed' +STATUS_DONE = 'done' +STATUS_SELECTION = 'selection' +STATUS_REGISTRATION = 'registration' + + +class ElectionError(Exception): + """Base exception for all exceptions in election logic""" + + +class StatusChangeError(ElectionError): + def __str__(self): + return 'Failed to change election status.' + + +class NoWinnerError(ElectionError): + def __str__(self): + return 'No winner found in election.' + + +class VoteFailError(ElectionError): + def __str__(self): + return 'Failed to vote in election.' + + +class CleanupError(ElectionError): + def __str__(self): + return 'Failed to clean up current votes.' + + +class ElectionTimeout(ElectionError): + def __str__(self): + return 'Election process timed out.' + + +class FailoverElection(object): + """ + Contains logic needed for failover election + """ + + def __init__( + self, + config, + _zk, + timeout, + replics_info, + replication_manager, + allow_data_loss, + host_priority, + host_lsn, + quorum_size, + ): + self.config = config + self._zk = _zk + self._timeout = timeout + self._replica_infos = replics_info + self._replication_manager = replication_manager + self._allow_data_loss = allow_data_loss + self._host_priority = host_priority + self._host_lsn = host_lsn + self._quorum_size = quorum_size + + def _get_host_vote(self, hostname): + lsn = self._zk.get(self._zk.get_election_vote_path(hostname) + '/lsn', preproc=int) + if lsn is None: + logging.error("Failed to get '%s' lsn for elections.", hostname) + return None + priority = self._zk.get(self._zk.get_election_vote_path(hostname) + '/prio', preproc=int) + if priority is None: + logging.error("Failed to get '%s' priority for elections.", hostname) + return None + return lsn, priority + + def _collect_votes(self): + votes = {} + app_name_map = {helpers.app_name_from_fqdn(host): host for host in self._zk.get_ha_hosts()} + for info in self._replica_infos: + app_name = info['application_name'] + replica = app_name_map.get(app_name) + if not replica: + continue + vote = self._get_host_vote(replica) + if vote is not None: + votes[replica] = vote + logging.info('Collected votes are: %s', votes) + return votes + + @staticmethod + def _determine_election_winner(votes): + best_vote = None + winner = None + + for replica, vote in votes.items(): + if vote is None: + continue + if best_vote is None or vote > best_vote: + best_vote = vote + winner = replica + if winner is None: + raise NoWinnerError + return winner + + def _vote_in_election(self): + if not self._zk.ensure_path(self._zk.get_election_vote_path()): + raise VoteFailError + if not self._zk.write(self._zk.get_election_vote_path() + '/lsn', self._host_lsn, need_lock=False): + raise VoteFailError + if not self._zk.write(self._zk.get_election_vote_path() + '/prio', self._host_priority, need_lock=False): + raise VoteFailError + + def _is_election_valid(self, votes): + if len(votes) < self._quorum_size: + logging.error('Not enough votes for quorum.') + return False + is_promote_safe = self._replication_manager.is_promote_safe( + votes, + replica_infos=self._replica_infos, + ) + if not self._allow_data_loss and not is_promote_safe: + logging.error('Sync replica vote is required but was not found.') + return False + return True + + def _cleanup_votes(self): + for replica in self._zk.get_ha_hosts(): + if not self._zk.delete(self._zk.get_election_vote_path(replica), recursive=True): + raise CleanupError + + def _await_election_status(self, status): + if not helpers.await_for( + 
lambda: self._zk.get(self._zk.ELECTION_STATUS_PATH) == status, self._timeout, f'election status {status}' + ): + raise ElectionTimeout + + def _await_lock_holder_fits(self, lock, condition, condition_name): + return helpers.await_for( + lambda: condition(self._zk.get_current_lock_holder(lock)), self._timeout, condition_name + ) + + def _write_election_status(self, status): + logging.info('Changing election status to: %s', status) + if not self._zk.write(self._zk.ELECTION_STATUS_PATH, status, need_lock=False): + raise StatusChangeError + + def _participate_in_election(self): + """ + Logic for election participant. + :return: 'True' only if this host became a new leader as a result of election. + """ + # + # The order of actions inside this function is very important and was validated to avoid race conditions. + # + logging.debug('Participate in election') + self._await_election_status(STATUS_REGISTRATION) + self._vote_in_election() + self._await_election_status(STATUS_DONE) + if self._zk.get(self._zk.ELECTION_WINNER_PATH) == helpers.get_hostname(): + if not self._zk.try_acquire_lock(self._zk.PRIMARY_LOCK_PATH, timeout=self._timeout): + return False + if not self._await_lock_holder_fits( + self._zk.ELECTION_MANAGER_LOCK_PATH, + lambda holder: holder is None, + f'lock {self._zk.ELECTION_MANAGER_LOCK_PATH} is empty', + ): + raise ElectionTimeout + if self._zk.get(self._zk.ELECTION_STATUS_PATH) == STATUS_FAILED: + self._zk.release_lock(self._zk.PRIMARY_LOCK_PATH) + return False + return True + return False + + def _manage_election(self): + """ + Logic for election manager. Each election is guaranteed to have single manager. + :return: 'True' only if this host became a new leader as a result of election. + """ + # + # The order of actions inside this function is very important and was validated to avoid race conditions. + # + logging.debug('Manage election') + self._cleanup_votes() + self._write_election_status(STATUS_REGISTRATION) + self._vote_in_election() + time.sleep(self._timeout / 2.0) + self._write_election_status(STATUS_SELECTION) + votes = self._collect_votes() + if not self._is_election_valid(votes): + return False + winner_host = FailoverElection._determine_election_winner(votes) + if not self._zk.write(self._zk.ELECTION_WINNER_PATH, winner_host, need_lock=False): + return False + self._write_election_status(STATUS_DONE) + if winner_host == helpers.get_hostname(): + return self._zk.try_acquire_lock(self._zk.PRIMARY_LOCK_PATH, timeout=self._timeout) + if not self._await_lock_holder_fits( + self._zk.PRIMARY_LOCK_PATH, + lambda holder: holder is not None, + f'lock {self._zk.PRIMARY_LOCK_PATH} is not empty', + ): + self._write_election_status(STATUS_FAILED) + raise ElectionTimeout + return False + + def make_election(self): + """ + Take part in election as participant or as a manager. + Returns True if this host is election winner and False otherwise. + """ + # + # The order of actions inside this function is very important and was validated to avoid race conditions. 
+ # + if not self._zk.try_acquire_lock(self._zk.ELECTION_ENTER_LOCK_PATH, allow_queue=True, timeout=self._timeout): + return False + if self._zk.get_current_lock_holder(self._zk.ELECTION_MANAGER_LOCK_PATH): + self._zk.release_lock(self._zk.ELECTION_ENTER_LOCK_PATH) + return self._participate_in_election() + if self._zk.get_current_lock_holder(self._zk.PRIMARY_LOCK_PATH): + return False + self._write_election_status(STATUS_CLEANUP) + if not self._zk.try_acquire_lock(self._zk.ELECTION_MANAGER_LOCK_PATH, timeout=self._timeout): + return False + try: + self._zk.release_lock(self._zk.ELECTION_ENTER_LOCK_PATH) + is_winner = self._manage_election() + finally: + self._zk.release_lock(self._zk.ELECTION_MANAGER_LOCK_PATH) + return is_winner diff --git a/src/helpers.py b/src/helpers.py new file mode 100644 index 0000000..f5640b2 --- /dev/null +++ b/src/helpers.py @@ -0,0 +1,265 @@ +""" +Some helper functions and decorators +""" +# encoding: utf-8 + +import json +import logging +import operator +import os +import random +import re +import shutil +import socket +import subprocess +import time +import traceback +from functools import wraps + + +def get_input(*args, **kwargs): + """ + Python cross-compatible input function + """ + fun = input + return fun(*args, **kwargs) + + +def confirm(prompt='yes', no_raise=False): + """ + prompt user for confirmation. Raise if doesnt match. + """ + confirmation = get_input('type "%s" to continue: ' % prompt) + if confirmation.lower() == prompt: + return True + if no_raise: + return None + raise RuntimeError('there was no confirmation') + + +def load_json_or_default(data): + if data == '': + return [] + return json.loads(data) + + +def subprocess_popen(cmd, log_cmd=True): + """ + subprocess popen wrapper + """ + try: + if log_cmd: + logging.debug(cmd) + return subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + except Exception: + logging.error("Could not run command '%s'", cmd) + for line in traceback.format_exc().split('\n'): + logging.error(line.rstrip()) + return None + + +def await_for_value(event, timeout, event_name): + return get_exponentially_retrying(timeout, event_name, None, event)() + + +def await_for(event, timeout, event_name): + return get_exponentially_retrying(timeout, event_name, False, return_none_on_false(event))() + + +def subprocess_call(cmd, fail_comment=None, log_cmd=True): + """ + subprocess call wrapper + """ + proc = subprocess_popen(cmd, log_cmd) + if proc.wait() != 0: + for line in proc.stdout: + logging.error(line.rstrip()) + for line in proc.stderr: + logging.error(line.rstrip()) + if fail_comment: + logging.error(fail_comment) + return proc.returncode + + +def app_name_from_fqdn(fqdn): + return fqdn.replace('.', '_').replace('-', '_') + + +def get_hostname(): + """ + return fqdn of local machine + """ + return socket.getfqdn() + + +def backup_dir(src, dst): + """ + This function is basicaly 'rsync --delete -a ' + """ + if os.path.exists(dst): + shutil.rmtree(dst) + shutil.copytree(src, dst) + + +def get_lockpath_prefix(): + """ + return lockpath prefix based on hostname + """ + prefix = re.match('[a-z-]+[0-9]+', get_hostname()).group(0) + return '/pgconsul/%s/' % prefix + + +def get_oldest_replica(replics_info): + # "-1 * priority" used in sorting because we need to sorting like + # ORDER BY write_location_diff ASC, priority DESC + replics = sorted(replics_info, key=lambda x: (x['write_location_diff'], -1 * int(x['priority']))) + if len(replics): + return replics[0]['application_name'] + return None + + 
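# Illustrative sketch (not part of the patch): how get_oldest_replica() above picks
# a host. The entries below are hypothetical; 'write_location_diff' is the replay
# lag and 'priority' is the failover priority taken from config/ZK.
example_replics_info = [
    {'application_name': 'pg_host_a_example', 'write_location_diff': 4096, 'priority': 10},
    {'application_name': 'pg_host_b_example', 'write_location_diff': 0, 'priority': 5},
    {'application_name': 'pg_host_c_example', 'write_location_diff': 0, 'priority': 15},
]
# Candidates are ordered by lag ascending, then priority descending, so the
# caught-up replica with the highest priority wins:
print(get_oldest_replica(example_replics_info))  # -> 'pg_host_c_example'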
+def make_current_replics_quorum(replics_info, alive_hosts): + """ + Returns set of replics which participate in quorum now. + It is intersection of alive replics (holds alive lock) and streaming replics + """ + streaming_replics = filter(lambda x: x['state'] == 'streaming', replics_info) + alive_replics = set(map(operator.itemgetter('application_name'), streaming_replics)) + alive_hosts_map = {host: app_name_from_fqdn(host) for host in alive_hosts} + return {host for host, app_name in alive_hosts_map.items() if app_name in alive_replics} + + +def check_last_failover_time(last, config): + """ + Returns True if last failover has been done quite ago + and False otherwise + """ + min_failover = config.getfloat('replica', 'min_failover_timeout') + now = time.time() + if last: + return (now - last) > min_failover + else: + return True + + +def return_none_on_error(func): + """ + Decorator for function to return None on any exception (and log it) + """ + + @wraps(func) + def wrapper(*args, **kwargs): + """ + wrapper for function + """ + try: + return func(*args, **kwargs) + except Exception: + for line in traceback.format_exc().split('\n'): + logging.error(line.rstrip()) + + return None + + return wrapper + + +def return_none_on_false(func): + @wraps(func) + def wrapper(*args, **kwargs): + if func(*args, **kwargs): + return True + return None + + return wrapper + + +def get_exponentially_retrying(timeout, event_name, timeout_returnvalue, func): + """ + This function returns an exponentially retrying decorator. + If timeout == -1, then we won't stop waiting until we get the result. + """ + + @wraps(func) + def wrapper(*args, **kwargs): + retrying_end = time.time() + timeout + sleep_time = 1 + while timeout == -1 or time.time() < retrying_end: + result = func(*args, **kwargs) + if result is not None: + return result + if timeout == -1: + current_sleep = sleep_time + else: + current_sleep = min(sleep_time, retrying_end - time.time()) + if current_sleep > 0: + logging.info(f'Waiting {current_sleep} for {event_name}') + time.sleep(current_sleep) + sleep_time = 1.1 * sleep_time + 0.1 * random.random() + logging.warning('Retrying timeout expired.') + return timeout_returnvalue + + return wrapper + + +def write_status_file(db_state, zk_state, path): + """ + Save json status file + """ + try: + data = {'zk_state': zk_state, 'db_state': db_state, 'ts': time.time()} + fname = os.path.join(path, 'pgconsul.status') + with open(fname, 'w') as fobj: + fobj.write(json.dumps(data)) + fobj.flush() + except Exception: + logging.warning('Could not write status-file. Ignoring it.') + + +def func_name_logger(func): + @wraps(func) + def wrapper(*args, **kwargs): + logging.info('Called: {}'.format(func.__name__)) + return func(*args, **kwargs) + + return wrapper + + +def decorate_all_class_methods(decorator): + def class_decorator(Cls): + class NewCls(object): + def __init__(self, *args, **kwargs): + self.oInstance = Cls(*args, **kwargs) + + def __getattribute__(self, s): + """ + this is called whenever any attribute of a NewCls object is accessed. This function first tries to + get the attribute off NewCls. If it fails then it tries to fetch the attribute from self.oInstance (an + instance of the decorated class). If it manages to fetch the attribute from self.oInstance, and + the attribute is an instance method then `decorator` is applied. 
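+                Attributes found on NewCls itself (for example `oInstance`) are returned as-is, without decoration.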
+ """ + try: + x = super(NewCls, self).__getattribute__(s) + except AttributeError: + pass + else: + return x + x = self.oInstance.__getattribute__(s) + if isinstance(x, type(self.__init__)): # it is an instance method + return decorator(x) # this is equivalent of just decorating the method + else: + return x + + return NewCls + + return class_decorator + + +class IterationTimer: + def __init__(self): + self.start = time.time() + + def sleep(self, timeout): + now = time.time() + if now - self.start > float(timeout): + return + time.sleep(float(timeout) - (now - self.start)) diff --git a/src/main.py b/src/main.py new file mode 100644 index 0000000..c6672b5 --- /dev/null +++ b/src/main.py @@ -0,0 +1,1870 @@ +""" +Main module. pgconsul class defined here. +""" +# encoding: utf-8 + +import atexit +import functools +import json +import logging +import os +import random +import signal +import sys +import time +import traceback + +import psycopg2 + +from . import helpers, sdnotify +from .command_manager import CommandManager +from .failover_election import ElectionError, FailoverElection +from .helpers import IterationTimer, get_hostname +from .pg import Postgres +from .plugin import PluginRunner, load_plugins +from .replication_manager import QuorumReplicationManager, SingleSyncReplicationManager +from .zk import Zookeeper, ZookeeperException + + +class pgconsul(object): + """ + pgconsul class + """ + + DESTRUCTIVE_OPERATIONS = ['rewind'] + + def __init__(self, **kwargs): + logging.debug('Initializing main class.') + self.config = kwargs.get('config') + self._cmd_manager = CommandManager(self.config) + self._should_run = True + self.is_in_maintenance = False + + random.seed(os.urandom(16)) + + plugins = load_plugins(self.config.get('global', 'plugins_path')) + + self.db = Postgres(config=self.config, plugins=PluginRunner(plugins['Postgres']), cmd_manager=self._cmd_manager) + self.zk = Zookeeper(config=self.config, plugins=PluginRunner(plugins['Zookeeper'])) + self.startup_checks() + + signal.signal(signal.SIGTERM, self._sigterm_handler) + + self.checks = {'primary_switch': 0, 'failover': 0, 'rewind': 0} + self._is_single_node = False + self.notifier = sdnotify.Notifier() + + if self.config.getboolean('global', 'quorum_commit'): + self._replication_manager = QuorumReplicationManager( + self.config, + self.db, + self.zk, + ) + else: + self._replication_manager = SingleSyncReplicationManager( + self.config, + self.db, + self.zk, + ) + + def _sigterm_handler(self, *_): + self._should_run = False + + def re_init_db(self): + """ + Reinit db connection + """ + try: + if not self.db.is_alive(): + db_state = self.db.get_state() + logging.error( + 'Could not get data from PostgreSQL. Seems, ' + 'that it is dead. Getting last role from cached ' + 'file. And trying to reconnect.' + ) + if db_state.get('prev_state'): + self.db.role = db_state['prev_state']['role'] + self.db.pg_version = db_state['prev_state']['pg_version'] + self.db.pgdata = db_state['prev_state']['pgdata'] + self.db.reconnect() + except KeyError: + logging.error('Could not get data from PostgreSQL and cache-file. Exiting.') + for line in traceback.format_exc().split('\n'): + logging.error(line.rstrip()) + sys.exit(1) + except Exception: + for line in traceback.format_exc().split('\n'): + logging.error(line.rstrip()) + + def re_init_zk(self): + """ + Reinit zk connection + """ + try: + if not self.zk.is_alive(): + logging.warning('Some error with ZK client. 
Trying to reconnect.') + self.zk.reconnect() + except Exception: + for line in traceback.format_exc().split('\n'): + logging.error(line.rstrip()) + + def startup_checks(self): + """ + Perform some basic checks on startup + """ + work_dir = self.config.get('global', 'working_dir') + fname = '%s/.pgconsul_rewind_fail.flag' % work_dir + + if os.path.exists(fname): + logging.error('Rewind fail flag exists. Exiting.') + sys.exit(1) + + if self.db.is_alive() and not self.zk.is_alive(): + if self.db.role == 'primary' and self.db.pgpooler('status'): + self.db.pgpooler('stop') + + if not self.db.is_alive() and self.zk.is_alive(): + if self.zk.get_current_lock_holder() == helpers.get_hostname(): + res = self.zk.release_lock() + if res: + logging.info('Released lock in ZK since postgres is dead.') + + db_state = self.db.get_state() + if db_state['prev_state'] is not None: + # Ok, it means that current start is not the first one. + # In this case we should check that we are able to do pg_rewind. + if not db_state['alive']: + self.db.pgdata = db_state['prev_state']['pgdata'] + if not self.db.is_ready_for_pg_rewind(): + sys.exit(0) + + # Abort startup if zk.MEMBERS_PATH is empty + # (no one is participating in cluster), but + # timeline indicates a mature (tli>1) and operating database system. + tli = self.db.get_state().get('timeline', 0) + if not self._get_zk_members() and tli > 1: + logging.error( + 'ZK "%s" empty but timeline indicates operating cluster (%i > 1)', + self.zk.MEMBERS_PATH, + tli, + ) + self.db.pgpooler('stop') + sys.exit(1) + + if ( + self.config.getboolean('global', 'quorum_commit') + and not self.config.getboolean('global', 'use_lwaldump') + and not self.config.getboolean('replica', 'allow_potential_data_loss') + ): + logging.error("Using quorum_commit allow only with use_lwaldump or with allow_potential_data_loss") + exit(1) + + if ( + self.db.is_alive() + and not self.db.check_extension_installed('lwaldump') + and self.config.getboolean('global', 'use_lwaldump') + ): + logging.error("lwaldump is not installed") + exit(1) + + if self.db.is_alive() and not self.db.ensure_archive_mode(): + logging.error("archive mode is not enabled on instance - pgconsul support only archive mode yet ") + exit(1) + + # pylint: disable=W0212 + def stop(self, *_): + """ + Stop iterations + """ + logging.info('Stopping') + atexit._run_exitfuncs() + os._exit(0) + + def _init_zk(self, my_prio): + if not self._replication_manager.init_zk(): + return False + + if not self.config.getboolean('global', 'update_prio_in_zk') and helpers.get_hostname() in self.zk.get_children( + self.zk.MEMBERS_PATH + ): + logging.info("Don't have to write priority to ZK") + return True + + return self.zk.ensure_path(self.zk.get_host_prio_path()) and self.zk.noexcept_write( + self.zk.get_host_prio_path(), my_prio, need_lock=False + ) + + def start(self): + """ + Start iterations + """ + my_prio = self.config.get('global', 'priority') + self.notifier.ready() + while True: + if self._init_zk(my_prio): + break + logging.error('Failed to init ZK') + self.re_init_zk() + + while self._should_run: + try: + self.run_iteration(my_prio) + except Exception: + for line in traceback.format_exc().split('\n'): + logging.error(line.rstrip()) + self.stop() + + def update_maintenance_status(self, role, primary_fqdn): + maintenance_status = self.zk.get(self.zk.MAINTENANCE_PATH) # can be None, 'enable', 'disable' + + if maintenance_status == 'enable': + # maintenance node exists with 'enable' value, we are in maintenance now + self.is_in_maintenance 
= True + if role == 'primary' and self._update_replication_on_maintenance_enter(): + return + # Write current ts to zk on maintenance enabled, it's be dropped on disable + maintenance_ts = self.zk.get(self.zk.MAINTENANCE_TIME_PATH) + if maintenance_ts is None: + self.zk.write(self.zk.MAINTENANCE_TIME_PATH, time.time(), need_lock=False) + # Write current primary to zk on maintenance enabled, it's be dropped on disable + current_primary = self.zk.get(self.zk.MAINTENANCE_PRIMARY_PATH) + if current_primary is None and primary_fqdn is not None: + self.zk.write(self.zk.MAINTENANCE_PRIMARY_PATH, primary_fqdn, need_lock=False) + elif maintenance_status == 'disable': + # maintenance node exists with 'disable' value, we are not in maintenance now + # and should delete this node. We delete it recursively, we don't won't to wait + # all cluster members to delete each own node, because some of them may be + # already dead and we can wait it infinitely. Maybe we should wait each member + # with timeout and then delete recursively (TODO). + logging.debug('Disabling maintenance mode, deleting maintenance node') + self.zk.delete(self.zk.MAINTENANCE_PATH, recursive=True) + self.is_in_maintenance = False + elif maintenance_status is None: + # maintenance node doesn't exists, we are not in maintenance mode + self.is_in_maintenance = False + + def _update_replication_on_maintenance_enter(self): + if not self.config.getboolean('primary', 'change_replication_type'): + # Replication type change is restricted, we do nothing here + return True + if self.config.getboolean('primary', 'sync_replication_in_maintenance'): + # It is allowed to have sync replication in maintenance here + return True + current_replication = self.db.get_replication_state() + if current_replication[0] == 'async': + # Ok, it is already async + return True + return self._replication_manager.change_replication_to_async() + + def run_iteration(self, my_prio): + timer = IterationTimer() + _, terminal_state = self.db.is_alive_and_in_terminal_state() + if not terminal_state: + logging.debug('Database is starting up or shutting down') + role = self.db.get_role() + logging.debug('Role: %s', str(role)) + + db_state = self.db.get_state() + self.notifier.notify() + logging.debug(db_state) + try: + zk_state = self.zk.get_state() + logging.debug(zk_state) + helpers.write_status_file(db_state, zk_state, self.config.get('global', 'working_dir')) + self.update_maintenance_status(role, db_state.get('primary_fqdn')) + self._zk_alive_refresh(role, db_state, zk_state) + if self.is_in_maintenance: + logging.warning('Cluster in maintenance mode') + self.zk.reconnect() + self.zk.write(self.zk.get_host_maintenance_path(), 'enable') + return + except ZookeeperException: + logging.error("Zookeeper exception while getting ZK state") + for line in traceback.format_exc().split('\n'): + logging.error(line.rstrip()) + if role == 'primary' and not self.is_in_maintenance and not self._is_single_node: + logging.error("Upper exception was for primary") + my_hostname = helpers.get_hostname() + self.resolve_zk_primary_lock(my_hostname) + else: + self.re_init_zk() + return + stream_from = self.config.get('global', 'stream_from') + if role is None: + is_dead = terminal_state + self.dead_iter(db_state, zk_state, is_actually_dead=is_dead) + elif role == 'primary': + if self._is_single_node: + self.single_node_primary_iter(db_state, zk_state) + else: + self.primary_iter(db_state, zk_state) + elif role == 'replica': + if stream_from: + self.non_ha_replica_iter(db_state, zk_state) + 
else: + self.replica_iter(db_state, zk_state) + self.re_init_db() + self.re_init_zk() + + # Dead PostgreSQL probably means + # that our node is being removed. + # No point in updating all_hosts + # in this case + all_hosts = self.zk.get_children(self.zk.MEMBERS_PATH) + prio = self.zk.noexcept_get(self.zk.get_host_prio_path()) + if role and all_hosts and not prio: + if not self.zk.noexcept_write(self.zk.get_host_prio_path(), my_prio, need_lock=False): + logging.warning('Could not write priority to ZK') + + logging.debug('Finished iteration.') + timer.sleep(self.config.getfloat('global', 'iteration_timeout')) + + def release_lock_and_return_to_cluster(self): + my_hostname = helpers.get_hostname() + self.db.pgpooler('stop') + holder = self.zk.get_current_lock_holder() + if holder == my_hostname: + self.zk.release_lock() + elif holder is not None: + logging.warning('Lock in ZK is being held by %s. We should return to cluster here.', holder) + self._return_to_cluster(holder, 'primary') + + def single_node_primary_iter(self, db_state, zk_state): + """ + Iteration if local postgresql is single node + """ + logging.info('primary is in single node state') + self.zk.try_acquire_lock() + + self._store_replics_info(db_state, zk_state) + + self.zk.write(self.zk.TIMELINE_INFO_PATH, db_state['timeline']) + + if not self.db.pgpooler('status'): + logging.debug('Here we should open for load.') + self.db.pgpooler('start') + + self.db.ensure_archiving_wal() + + # Enable async replication + current_replication = self.db.get_replication_state() + if current_replication[0] != 'async': + self._replication_manager.change_replication_to_async() + + def primary_iter(self, db_state, zk_state): + """ + Iteration if local postgresql is primary + """ + my_hostname = helpers.get_hostname() + try: + stream_from = self.config.get('global', 'stream_from') + last_op = self.zk.get('%s/%s/op' % (self.zk.MEMBERS_PATH, my_hostname)) + # If we were promoting or rewinding + # and failed we should not acquire lock + if self.is_op_destructive(last_op): + logging.warning('Could not acquire lock due to destructive operation fail: %s', last_op) + return self.release_lock_and_return_to_cluster() + if stream_from: + logging.warning('Host not in HA group We should return to stream_from.') + return self.release_lock_and_return_to_cluster() + + current_promoting_host = zk_state.get(self.zk.CURRENT_PROMOTING_HOST) + if current_promoting_host and current_promoting_host != helpers.get_hostname(): + logging.warning( + 'Host %s was promoted. We should not be primary', zk_state[self.zk.CURRENT_PROMOTING_HOST] + ) + self.resolve_zk_primary_lock(my_hostname) + return None + + if not self.zk.try_acquire_lock(): + self.resolve_zk_primary_lock(my_hostname) + return None + self.zk.write(self.zk.LAST_PRIMARY_AVAILABILITY_TIME, time.time()) + + self._reset_simple_primary_switch_try() + + self.checks['primary_switch'] = 0 + + self._store_replics_info(db_state, zk_state) + + # Make sure local timeline corresponds to that of the cluster. + if not self._verify_timeline(db_state, zk_state): + return None + + if zk_state[self.zk.FAILOVER_MUST_BE_RESET]: + self.reset_failover_node(zk_state) + return None + + # Check for unfinished failover and if self is last promoted host + # In this case self is fully operational primary, need to reset + # failover state in ZK. 
Otherwise we need to try to return to the cluster as a replica
+            if zk_state[self.zk.FAILOVER_INFO_PATH] in ('promoting', 'checkpointing'):
+                if zk_state[self.zk.CURRENT_PROMOTING_HOST] == helpers.get_hostname():
+                    self.reset_failover_node(zk_state)
+                    return None  # so zk_state will be updated in the next iter
+                else:
+                    logging.info(
+                        'Failover state was "%s" and last promoted host was "%s"',
+                        zk_state[self.zk.FAILOVER_INFO_PATH],
+                        zk_state[self.zk.CURRENT_PROMOTING_HOST],
+                    )
+                    return self.release_lock_and_return_to_cluster()
+
+            self._drop_stale_switchover(db_state)
+
+            if not self.db.pgpooler('status'):
+                logging.debug('Here we should open for load.')
+                self.db.pgpooler('start')
+
+            # Ensure that wal archiving is enabled. It can be disabled earlier due to
+            # some zk connectivity issues.
+            self.db.ensure_archiving_wal()
+
+            # Check if replication type (sync/normal) change is needed.
+            ha_replics_config = self._get_ha_replics()
+            if ha_replics_config is None:
+                return None
+            try:
+                logging.debug('Checking ha replics for aliveness')
+                alive_hosts = self.zk.get_alive_hosts(timeout=3, catch_except=False)
+                ha_replics = {replica for replica in ha_replics_config if replica in alive_hosts}
+            except Exception:
+                logging.exception('Failed to get replica status')
+                ha_replics = ha_replics_config
+            if len(ha_replics) != len(ha_replics_config):
+                logging.debug(
+                    'Some of the replics are unavailable, config replics %s, alive replics %s',
+                    str(ha_replics_config),
+                    str(ha_replics),
+                )
+            logging.debug('Checking if changing replication type is needed.')
+            change_replication = self.config.getboolean('primary', 'change_replication_type')
+            if change_replication:
+                self._replication_manager.update_replication_type(db_state, ha_replics)
+
+            # Check if scheduled switchover conditions exist
+            # and local cluster state can handle switchover.
+            if not self._check_primary_switchover(db_state, zk_state):
+                return None
+
+            # Perform switchover: shutdown user service,
+            # release lock, write state.
+            if not self._do_primary_switchover(zk_state):
+                return None
+
+            # Ensure that new primary will appear in time,
+            # and transition current instance to replica.
+            # Rollback state if this does not happen.
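+            # If the transition below fails, we then wait (up to 'postgres_timeout')
+            # for the primary lock to be released before giving up.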
+ if not self._transition_primary_switchover(): + # wait while replica free primary lock + limit = self.config.getfloat('global', 'postgres_timeout') + + def wait_for_no_one_holds_primary_lock(): + primary = self.zk.get_current_lock_holder() + logging.debug("Current primary lock holder: {0}".format(primary)) + return primary is None + + return helpers.await_for(wait_for_no_one_holds_primary_lock, limit, 'no-one holds primary lock') + + except ZookeeperException: + if not self.zk.try_acquire_lock(): + logging.error("Zookeeper error during primary iteration:") + self.resolve_zk_primary_lock(my_hostname) + return None + except Exception: + for line in traceback.format_exc().split('\n'): + logging.error(line.rstrip()) + return None + + def reset_failover_node(self, zk_state): + if ( + self.zk.get(self.zk.FAILOVER_INFO_PATH) == 'finished' + or self.zk.write(self.zk.FAILOVER_INFO_PATH, 'finished') + ) and self.zk.delete(self.zk.CURRENT_PROMOTING_HOST): + self.zk.delete(self.zk.FAILOVER_MUST_BE_RESET) + logging.info('Resetting failover info (was "%s", now "finished")', zk_state[self.zk.FAILOVER_INFO_PATH]) + else: + self.zk.ensure_path(self.zk.FAILOVER_MUST_BE_RESET) + logging.info('Resetting failover failed, will try on next iteration.') + + def resolve_zk_primary_lock(self, my_hostname): + holder = self.zk.get_current_lock_holder() + if holder is None: + if self._replication_manager.should_close(): + self.db.pgpooler('stop') + # We need to stop archiving WAL because when network connectivity + # returns, it can be another primary in cluster. We need to stop + # archiving to prevent "wrong" WAL appears in archive. + self.db.stop_archiving_wal() + else: + self.start_pooler() + logging.warning('Lock in ZK is released but could not be acquired. Reconnecting to ZK.') + self.zk.reconnect() + elif holder != my_hostname: + self.db.pgpooler('stop') + logging.warning('Lock in ZK is being held by %s. We should return to cluster here.', holder) + self._return_to_cluster(holder, 'primary') + + def write_host_stat(self, hostname, db_state): + stream_from = self.config.get('global', 'stream_from') + replics_info = db_state.get('replics_info') + wal_receiver_info = db_state['wal_receiver'] + host_path = '{member_path}/{hostname}'.format(member_path=self.zk.MEMBERS_PATH, hostname=hostname) + replics_info_path = '{host_path}/replics_info'.format(host_path=host_path) + ha_path = '{host_path}/ha'.format(host_path=host_path) + wal_receiver_path = '{host_path}/wal_receiver'.format(host_path=host_path) + if not stream_from: + if not self.zk.ensure_path(ha_path): + logging.warning('Could not write ha host in ZK.') + return False + else: + if self.zk.exists_path(ha_path) and not self.zk.delete(ha_path): + logging.warning('Could not delete ha host in ZK.') + return False + if wal_receiver_info is not None: + if not self.zk.write(wal_receiver_path, wal_receiver_info, preproc=json.dumps, need_lock=False): + logging.warning('Could not write host wal_receiver_info to ZK.') + return False + if replics_info is not None: + if not self.zk.write(replics_info_path, replics_info, preproc=json.dumps, need_lock=False): + logging.warning('Could not write host replics_info to ZK.') + return False + + def remove_stale_operation(self, hostname): + op_path = '%s/%s/op' % (self.zk.MEMBERS_PATH, hostname) + last_op = self.zk.noexcept_get(op_path) + if self.is_op_destructive(last_op): + logging.warning('Stale operation %s detected. 
Removing track from zk.', last_op) + self.zk.delete(op_path) + + def start_pooler(self): + start_pooler = self.config.getboolean('replica', 'start_pooler') + if not self.db.pgpooler('status') and start_pooler: + self.db.pgpooler('start') + + def get_replics_info(self, zk_state): + stream_from = self.config.get('global', 'stream_from') + if stream_from: + replics_info_path = '{member_path}/{hostname}/replics_info'.format( + member_path=self.zk.MEMBERS_PATH, hostname=stream_from + ) + replics_info = self.zk.noexcept_get(replics_info_path, preproc=json.loads) + else: + replics_info = zk_state[self.zk.REPLICS_INFO_PATH] + return replics_info + + def change_primary(self, db_state, primary): + logging.warning( + 'Seems that primary has been switched to %s ' + 'while we are streaming WAL from %s. ' + 'We should switch primary ' + 'here.', + primary, + db_state['primary_fqdn'], + ) + return self._return_to_cluster(primary, 'replica') + + def replica_return(self, db_state, zk_state): + my_hostname = helpers.get_hostname() + self.write_host_stat(my_hostname, db_state) + holder = zk_state['lock_holder'] + + self.checks['failover'] = 0 + limit = self.config.getfloat('replica', 'recovery_timeout') + + # Try to resume WAL replaying, it can be paused earlier + self.db.pg_wal_replay_resume() + + if not self._check_archive_recovery(limit) and not self._wait_for_streaming(limit): + # Wal receiver is not running and + # postgresql isn't in archive recovery + # We should try to restart + logging.warning('We should try switch primary one more time here.') + return self._return_to_cluster(holder, 'replica', is_dead=False) + + def _get_streaming_replica_from_replics_info(self, fqdn, replics_info): + if not replics_info: + return None + app_name = helpers.app_name_from_fqdn(fqdn) + for replica in replics_info: + if replica['application_name'] == app_name and replica['state'] == 'streaming': + return replica + return None + + def non_ha_replica_iter(self, db_state, zk_state): + try: + logging.info('Current replica is non ha.') + if not zk_state['alive']: + return None + my_hostname = helpers.get_hostname() + self.remove_stale_operation(my_hostname) + self.write_host_stat(my_hostname, db_state) + stream_from = self.config.get('global', 'stream_from') + can_delayed = self.config.getboolean('replica', 'can_delayed') + replics_info = self.get_replics_info(zk_state) or [] + self.checks['failover'] = 0 + streaming = self._get_streaming_replica_from_replics_info(my_hostname, replics_info) and bool( + db_state['wal_receiver'] + ) + streaming_from_primary = self._get_streaming_replica_from_replics_info( + my_hostname, zk_state.get(self.zk.REPLICS_INFO_PATH) + ) and bool(db_state['wal_receiver']) + logging.error( + 'Streaming: %s, streaming from primary: %s, wal_receiver: %s, replics_info: %s', + streaming, + streaming_from_primary, + db_state['wal_receiver'], + replics_info, + ) + if not streaming and not can_delayed: + logging.warning('Seems that we are not really streaming WAL from %s.', stream_from) + self._replication_manager.leave_sync_group() + replication_source_is_dead = self._check_host_is_really_dead(primary=stream_from) + replication_source_replica_info = self._get_streaming_replica_from_replics_info( + stream_from, zk_state.get(self.zk.REPLICS_INFO_PATH) + ) + wal_receiver_info = self._zk_get_wal_receiver_info(stream_from) + replication_source_streams = bool( + wal_receiver_info and wal_receiver_info[0].get('status') == 'streaming' + ) + logging.error(replication_source_replica_info) + current_primary = 
zk_state['lock_holder']
+                if replication_source_is_dead:
+                    # Replication source is dead. Stream from the current primary until
+                    # the source comes back alive, then switch back to it.
+                    if stream_from == current_primary or current_primary is None:
+                        logging.warning(
+                            'My replication source %s seems dead and it was the primary. Waiting for a new primary to appear in the cluster or for the old one to come back.',
+                            stream_from,
+                        )
+                    elif not streaming_from_primary:
+                        logging.warning(
+                            'My replication source %s seems dead. Trying to stream from primary %s.',
+                            stream_from,
+                            current_primary,
+                        )
+                        return self._return_to_cluster(current_primary, 'replica', is_dead=False)
+                    else:
+                        logging.warning(
+                            'My replication source %s seems dead. We are already streaming from primary %s. Waiting for the replication source to come back alive.',
+                            stream_from,
+                            current_primary,
+                        )
+                else:
+                    # Replication source is alive. Wait until it starts streaming from the
+                    # primary, then stream from it.
+                    if replication_source_streams:
+                        logging.warning(
+                            'My replication source %s seems alive and is streaming, trying to stream from it.',
+                            stream_from,
+                        )
+                        return self._return_to_cluster(stream_from, 'replica', is_dead=False)
+                    elif stream_from == current_primary:
+                        logging.warning(
+                            'My replication source %s seems alive and it is the current primary, trying to stream from it.',
+                            stream_from,
+                        )
+                        return self._return_to_cluster(stream_from, 'replica', is_dead=False)
+                    else:
+                        logging.warning(
+                            'My replication source %s seems alive but is not streaming yet. Waiting for it to start streaming from the primary.',
+                            stream_from,
+                        )
+            self.checks['primary_switch'] = 0
+            self.start_pooler()
+            self._reset_simple_primary_switch_try()
+        except Exception:
+            for line in traceback.format_exc().split('\n'):
+                logging.error(line.rstrip())
+            return None
+
+    def _accept_switchover(self, lock_holder, previous_primary):
+        if not self._can_do_switchover():
+            return None
+
+        # WARNING: we shouldn't allow multiple hosts to enter this branch
+        if not self.zk.write(self.zk.SWITCHOVER_STATE_PATH, 'candidate_found', need_lock=False):
+            logging.error('Failed to state that we are the new primary candidate in ZK.')
+            return None
+
+        #
+        # All checks are done. Waiting for primary shutdown, acquiring the lock in ZK,
+        # promoting and writing last switchover timestamp to ZK.
+        #
+        limit = self.config.getfloat('global', 'postgres_timeout')
+        # Current primary is the lock holder. Otherwise consider the last known primary as current.
+        current_primary = lock_holder or previous_primary
+        if current_primary is not None and not helpers.await_for(
+            lambda: self._check_primary_is_really_dead(current_primary), limit, 'primary is down'
+        ):
+            return None
+
+        # Wait for the switchover_master_shut state only if the current primary is alive, i.e. the lock holder exists.
+        if lock_holder is not None and not helpers.await_for(
+            lambda: self.zk.get(self.zk.FAILOVER_INFO_PATH) == 'switchover_master_shut',
+            limit,
+            'failover state is switchover_master_shut',
+        ):
+            # Mark the switchover node as failed
+            self.zk.write(self.zk.SWITCHOVER_STATE_PATH, 'master_timed_out', need_lock=False)
+            return False
+
+        if not self.zk.try_acquire_lock(allow_queue=True, timeout=limit):
+            logging.info('Could not acquire lock in ZK. 
Not doing anything.') + return None + + if not self._do_failover(): + return False + + self._cleanup_switchover() + self.zk.write(self.zk.LAST_SWITCHOVER_TIME_PATH, time.time()) + + def replica_iter(self, db_state, zk_state): + """ + Iteration if local postgresql is replica + """ + try: + if not zk_state['alive']: + return None + my_hostname = helpers.get_hostname() + my_app_name = helpers.app_name_from_fqdn(my_hostname) + self.remove_stale_operation(my_hostname) + holder = zk_state['lock_holder'] + self.write_host_stat(my_hostname, db_state) + + if self._is_single_node: + logging.error("HA replica shouldn't exist inside a single node cluster") + return None + + replics_info = zk_state[self.zk.REPLICS_INFO_PATH] + streaming = False + for i in replics_info or []: + if i['application_name'] != my_app_name: + continue + if i['state'] == 'streaming': + streaming = True + + if self._detect_replica_switchover(): + logging.warning('Planned switchover condition detected') + self._replication_manager.enter_sync_group(replica_infos=replics_info) + return self._accept_switchover(holder, db_state.get('primary_fqdn')) + + # If there is no primary lock holder and it is not a switchover + # then we should consider current cluster state as failover. + if holder is None: + logging.error('According to ZK primary has died. We should verify it and do failover if possible.') + return self._accept_failover() + + self.checks['failover'] = 0 + + if holder != db_state['primary_fqdn'] and holder != my_hostname: + self._replication_manager.leave_sync_group() + return self.change_primary(db_state, holder) + + self.db.ensure_replaying_wal() + + if not streaming: + logging.warning('Seems that we are not really streaming WAL from %s.', holder) + self._replication_manager.leave_sync_group() + + return self.replica_return(db_state, zk_state) + + self.checks['primary_switch'] = 0 + + self.start_pooler() + self._reset_simple_primary_switch_try() + + self._replication_manager.enter_sync_group(replica_infos=replics_info) + except Exception: + for line in traceback.format_exc().split('\n'): + logging.error(line.rstrip()) + return None + + def dead_iter(self, db_state, zk_state, is_actually_dead): + """ + Iteration if local postgresql is dead + """ + if not zk_state['alive'] or db_state['alive']: + return None + + self.db.pgpooler('stop') + + self._replication_manager.leave_sync_group() + self.zk.release_if_hold(self.zk.PRIMARY_LOCK_PATH) + + role = self.db.role + last_tli = self.db.get_data_from_control_file('Latest checkpoint.s TimeLineID', preproc=int, log=False) + last_primary = None + if role == 'replica' and db_state.get('prev_state'): + last_primary = db_state['prev_state'].get('primary_fqdn') + + holder = self.zk.get_current_lock_holder() + if holder and holder != helpers.get_hostname(): + if role == 'replica' and holder == last_primary: + logging.info('Seems that primary has not changed but PostgreSQL is dead. Starting it.') + return self.db.start_postgresql() + + # + # We can get here in two cases: + # We were primary and now we are dead. + # We were replica, primary has changed and now we are dead. + # + logging.warning( + 'Seems that primary is %s and local PostgreSQL is dead. We should return to cluster here.', holder + ) + return self._return_to_cluster(holder, role, is_dead=is_actually_dead) + + else: + # + # The only case we get here is absence of primary (no one holds the + # lock) and our PostgreSQL is dead. + # + logging.error('Seems that all hosts (including me) are dead. 
Trying to start PostgreSQL.') + if role == 'primary': + zk_timeline = zk_state[self.zk.TIMELINE_INFO_PATH] + if zk_timeline is not None and zk_timeline != last_tli: + logging.error( + 'Seems that I was primary before but not the last one in the cluster. Not doing anything.' + ) + return None + # + # Role was primary. We need to disable archive_command before + # starting postgres to prevent "wrong" last WAL in archive. + # + self.db.stop_archiving_wal_stopped() + return self.db.start_postgresql() + + def _drop_stale_switchover(self, db_state): + if not self.zk.try_acquire_lock(self.zk.SWITCHOVER_LOCK_PATH): + return + try: + switchover_info = self.zk.get(self.zk.SWITCHOVER_PRIMARY_PATH, preproc=json.loads) + if not switchover_info: + return + switchover_state = self.zk.get(self.zk.SWITCHOVER_STATE_PATH) + if ( + switchover_state != 'scheduled' + or switchover_info.get(self.zk.TIMELINE_INFO_PATH) is None + or switchover_info[self.zk.TIMELINE_INFO_PATH] < db_state['timeline'] + ): + logging.warning('Dropping stale switchover') + logging.debug( + 'Switchover info: state %s; info %s; db timeline %s', + switchover_state, + switchover_info, + db_state['timeline'], + ) + self._cleanup_switchover() + finally: + # We want to release this lock regardless of what happened in 'try' block + self.zk.release_lock(self.zk.SWITCHOVER_LOCK_PATH) + + def _cleanup_switchover(self): + self.zk.delete(self.zk.SWITCHOVER_LSN_PATH) + self.zk.delete(self.zk.SWITCHOVER_PRIMARY_PATH) + self.zk.delete(self.zk.SWITCHOVER_STATE_PATH) + self.zk.delete(self.zk.FAILOVER_INFO_PATH) + + def _update_single_node_status(self, role): + """ + In case if current role is 'primary', we should determine new status + and update it locally and in ZK. + Otherwise, we should just update the status from ZK + """ + if role == 'primary': + ha_hosts = self.zk.get_ha_hosts() + if ha_hosts is None: + logging.error('Failed to update single node status because of empty ha host list.') + return + self._is_single_node = len(ha_hosts) == 1 + if self._is_single_node: + self.zk.ensure_path(self.zk.SINGLE_NODE_PATH) + else: + self.zk.delete(self.zk.SINGLE_NODE_PATH) + else: + self._is_single_node = self.zk.exists_path(self.zk.SINGLE_NODE_PATH) + + def _verify_timeline(self, db_state, zk_state): + """ + Make sure current timeline corresponds to the rest of the cluster (@ZK). + Save timeline and some related info into zk + """ + # Skip if role is not primary + if self.db.role != 'primary': + logging.error('We are not primary. Not doing anything.') + return None + + # Establish whether local timeline corresponds to primary timeline at ZK. + tli_res = zk_state[self.zk.TIMELINE_INFO_PATH] == db_state['timeline'] + # If it does, but there is no info on replicas, + # close local PG instance. + if tli_res: + if zk_state['replics_info_written'] is False: + logging.error('Some error with ZK.') + # Actually we should never get here but checking it just in case. + # Here we should end iteration and check and probably close primary + # at the begin of primary_iter + return None + # If ZK does not have timeline info, write it. + elif zk_state[self.zk.TIMELINE_INFO_PATH] is None: + logging.warning('Could not get timeline from ZK. Saving it.') + self.zk.write(self.zk.TIMELINE_INFO_PATH, db_state['timeline']) + # If there is a mismatch in timeline: + # - If ZK timeline is greater than local, there must be another primary. + # In that case local instance have no business holding the lock. 
+ # - If local timeline is greater, local instance has likely been + # promoted recently. + # Update ZK structure to reflect that. + elif tli_res is False: + self.db.checkpoint() + zk_tli = zk_state[self.zk.TIMELINE_INFO_PATH] + db_tli = db_state['timeline'] + if zk_tli and zk_tli > db_tli: + logging.error('ZK timeline is newer than local. Releasing leader lock') + self.db.pgpooler('stop') + + self.zk.release_lock() + # + # This timeout is needed for primary with newer timeline + # to acquire the lock in ZK. + # + time.sleep(10 * self.config.getfloat('global', 'iteration_timeout')) + return None + elif zk_tli and zk_tli < db_tli: + logging.warning('Timeline in ZK is older than ours. Updating it it ZK.') + self.zk.write(self.zk.TIMELINE_INFO_PATH, db_tli) + logging.debug('Timeline verification succeeded') + return True + + def _reset_simple_primary_switch_try(self): + simple_primary_switch_path = self.zk.get_simple_primary_switch_try_path(get_hostname()) + if self.zk.noexcept_get(simple_primary_switch_path) != 'no': + self.zk.noexcept_write(simple_primary_switch_path, 'no', need_lock=False) + + def _set_simple_primary_switch_try(self): + simple_primary_switch_path = self.zk.get_simple_primary_switch_try_path(get_hostname()) + self.zk.noexcept_write(simple_primary_switch_path, 'yes', need_lock=False) + + def _is_simple_primary_switch_tried(self): + if self.zk.noexcept_get(self.zk.get_simple_primary_switch_try_path(get_hostname())) == 'yes': + return True + return False + + def _try_simple_primary_switch_with_lock(self, *args, **kwargs): + if not self.config.getboolean('global', 'do_consecutive_primary_switch'): + return self._simple_primary_switch(*args, **kwargs) + lock_holder = self.zk.get_current_lock_holder(self.zk.PRIMARY_SWITCH_LOCK_PATH) + if ( + lock_holder is None and not self.zk.try_acquire_lock(self.zk.PRIMARY_SWITCH_LOCK_PATH) + ) or lock_holder != helpers.get_hostname(): + return True + result = self._simple_primary_switch(*args, **kwargs) + self.zk.release_lock(self.zk.PRIMARY_SWITCH_LOCK_PATH) + return result + + def _simple_primary_switch(self, limit, new_primary, is_dead): + primary_switch_checks = self.config.getint('replica', 'primary_switch_checks') + need_restart = self.config.getboolean('replica', 'primary_switch_restart') + + logging.info('Starting simple primary switch.') + if self.checks['primary_switch'] >= primary_switch_checks: + self._set_simple_primary_switch_try() + + if need_restart and not is_dead and self.db.stop_postgresql(timeout=limit) != 0: + logging.error('Could not stop PostgreSQL. Will retry.') + self.checks['primary_switch'] = 0 + return True + + if self.db.recovery_conf('create', new_primary) != 0: + logging.error('Could not generate recovery.conf. Will retry.') + self.checks['primary_switch'] = 0 + return True + + if not is_dead and not need_restart: + if not self.db.reload(): + logging.error('Could not reload PostgreSQL. Skipping it.') + self.db.ensure_replaying_wal() + else: + if self.db.start_postgresql() != 0: + logging.error('Could not start PostgreSQL. Skipping it.') + + if self._wait_for_recovery(limit) and self._check_archive_recovery(limit): + # + # We have reached consistent state but there is a small + # chance that we are not streaming changes from new primary + # with: "new timeline N forked off current database system + # timeline N-1 before current recovery point M". + # Checking it with the info from ZK. + # + if self._wait_for_streaming(limit, new_primary): + # + # The easy way succeeded. 
+ # + logging.info('Simple primary switch succeeded.') + self._primary_switch_handle_slots() + return True + else: + return False + + def _rewind_from_source(self, is_postgresql_dead, limit, new_primary): + logging.info("Starting pg_rewind") + + # Trying to connect to a new_primary. If not succeeded - exiting + if not helpers.await_for( + lambda: not self._check_host_is_really_dead(new_primary), + limit, + 'source database alive and ready for rewind', + ): + return None + + if not self.zk.write('%s/%s/op' % (self.zk.MEMBERS_PATH, helpers.get_hostname()), 'rewind', need_lock=False): + logging.error('Unable to save destructive op state: rewind') + return None + + self.db.pgpooler('stop') + + if not is_postgresql_dead and self.db.stop_postgresql(timeout=limit) != 0: + logging.error('Could not stop PostgreSQL. Will retry.') + return None + + self.checks['rewind'] += 1 + if self.db.do_rewind(new_primary) != 0: + logging.error('Error while using pg_rewind. Will retry.') + return True + + # Rewind has finished successfully so we can drop its operation node + self.zk.delete('%s/%s/op' % (self.zk.MEMBERS_PATH, helpers.get_hostname())) + return self._attach_to_primary(new_primary, limit) + + def _attach_to_primary(self, new_primary, limit): + """ + Generate recovery.conf and start PostgreSQL. + """ + logging.info('Converting role to replica of %s.', new_primary) + if self.db.recovery_conf('create', new_primary) != 0: + logging.error('Could not generate recovery.conf. Will retry.') + self.checks['primary_switch'] = 0 + return None + + if self.db.start_postgresql() != 0: + logging.error('Could not start PostgreSQL. Skipping it.') + + if not self._wait_for_recovery(limit): + self.checks['primary_switch'] = 0 + return None + + self._primary_switch_handle_slots() + + if not self._wait_for_streaming(limit): + self.checks['primary_switch'] = 0 + return None + + logging.info('Seems, that returning to cluster succeeded. Unbelievable!') + self.db.checkpoint() + return True + + def _primary_switch_handle_slots(self): + need_slots = self.config.getboolean('global', 'use_replication_slots') + if need_slots: + my_hostname = helpers.get_hostname() + hosts = self.zk.get_children(self.zk.MEMBERS_PATH) + if hosts: + if my_hostname in hosts: + hosts.remove(my_hostname) + hosts = [i.replace('.', '_').replace('-', '_') for i in hosts] + logging.debug(hosts) + if not self.db.replication_slots('drop', hosts): + logging.warning('Could not drop replication slots. Do not forget to do it manually!') + else: + logging.warning( + 'Could not get all hosts list from ZK. ' + 'Replication slots should be dropped but we ' + 'are unable to do it. Skipping it.' 
+ ) + + def _get_db_state(self): + state = self.db.get_data_from_control_file('Database cluster state') + if not state or state == '': + logging.error('Could not get info from controlfile about current cluster state.') + return None + logging.info('Database cluster state is: %s' % state) + return state + + def _return_to_cluster(self, new_primary, role, is_dead=False): + """ + Return to cluster (try stupid method, if it fails we try rewind) + """ + logging.info('Starting returning to cluster.') + if self.checks['primary_switch'] >= 0: + self.checks['primary_switch'] += 1 + else: + self.checks['primary_switch'] = 1 + logging.debug("primary_switch checks is %d", self.checks['primary_switch']) + + failover_state = self.zk.noexcept_get(self.zk.FAILOVER_INFO_PATH) + if failover_state is not None and failover_state not in ('finished', 'promoting', 'checkpointing'): + logging.info( + 'We are not able to return to cluster since failover is still in progress - %s.', failover_state + ) + return None + + limit = self.config.getfloat('replica', 'recovery_timeout') + try: + # + # First we try to know if the cluster + # has been turned off correctly. + # + state = self._get_db_state() + if not state: + return None + + # + # If we are alive replica, we should first try an easy way: + # stop PostgreSQL, regenerate recovery.conf, start PostgreSQL + # and wait for recovery to finish. If last fails within + # a reasonable time, we should go a way harder (see below). + # Simple primary switch will not work if we were promoting or + # rewinding and failed. So only hard way possible in this case. + # + last_op = self.zk.noexcept_get('%s/%s/op' % (self.zk.MEMBERS_PATH, helpers.get_hostname())) + logging.info('Last op is: %s' % str(last_op)) + if role != 'primary' and not self.is_op_destructive(last_op) and not self._is_simple_primary_switch_tried(): + logging.info('Trying to do a simple primary switch.') + result = self._try_simple_primary_switch_with_lock(limit, new_primary, is_dead) + logging.info('Primary switch count: %s finish with result: %s', self.checks['primary_switch'], result) + return None + + # + # If our rewind attempts fail several times + # we should create special flag-file, stop posgresql and then exit. + # + max_rewind_retries = self.config.getint('global', 'max_rewind_retries') + if self.checks['rewind'] > max_rewind_retries: + self.db.pgpooler('stop') + self.db.stop_postgresql(timeout=limit) + work_dir = self.config.get('global', 'working_dir') + fname = '%s/.pgconsul_rewind_fail.flag' % work_dir + with open(fname, 'w') as fobj: + fobj.write(str(time.time())) + logging.error('Could not rewind %d times. Exiting.', max_rewind_retries) + sys.exit(1) + + # + # The hard way starts here. + # + if not self._rewind_from_source(is_dead, limit, new_primary): + return None + + except Exception: + logging.error('Unexpected error while trying to return to cluster. Exiting.') + for line in traceback.format_exc().split('\n'): + logging.error(line.rstrip()) + sys.exit(1) + + def _promote(self): + if not self.zk.write(self.zk.FAILOVER_INFO_PATH, 'promoting'): + logging.error('Could not write failover state to ZK.') + return False + + if not self.zk.write(self.zk.CURRENT_PROMOTING_HOST, helpers.get_hostname()): + logging.error('Could not write self as last promoted host.') + return False + + if not self.db.promote(): + logging.error('Could not promote me as a new primary. We should release the lock in ZK here.') + # We need to close here and recheck postgres role. 
If it was no actual + # promote, we need too delete self as last promoted host, mark failover "finished" + # and return to cluster. If self primary we need to continue promote despite on exit code + # because self already accepted some data modification which will be loss if + # we simply return False here. + if self.db.get_role() != 'primary': + self.db.pgpooler('stop') + if not self.zk.delete(self.zk.CURRENT_PROMOTING_HOST): + logging.error('Could not remove self as current promoting host.') + if not self.zk.write(self.zk.FAILOVER_INFO_PATH, 'finished'): + logging.error('Could not write failover state to ZK.') + return False + + logging.info('Promote command failed but we are current primary. Continue') + + if not self.zk.noexcept_write(self.zk.FAILOVER_INFO_PATH, 'checkpointing'): + logging.warning('Could not write failover state to ZK.') + + logging.debug('Doing checkpoint after promoting.') + if not self.db.checkpoint(query=self.config.get('debug', 'promote_checkpoint_sql', fallback=None)): + logging.warning('Could not checkpoint after failover.') + + my_tli = self.db.get_data_from_control_file('Latest checkpoint.s TimeLineID', preproc=int, log=False) + + if not self.zk.write(self.zk.TIMELINE_INFO_PATH, my_tli): + logging.warning('Could not write timeline to ZK.') + + if not self.zk.write(self.zk.FAILOVER_INFO_PATH, 'finished'): + logging.error('Could not write failover state to ZK.') + + if not self.zk.delete(self.zk.CURRENT_PROMOTING_HOST): + logging.error('Could not remove self as current promoting host.') + + return True + + def _promote_handle_slots(self): + need_slots = self.config.getboolean('global', 'use_replication_slots') + if need_slots: + if not self.zk.write(self.zk.FAILOVER_INFO_PATH, 'creating_slots'): + logging.warning('Could not write failover state to ZK.') + + hosts = self._get_ha_replics() + if hosts is None: + logging.error( + 'Could not get all hosts list from ZK. ' + 'Replication slots should be created but we ' + 'are unable to do it. Releasing the lock.' + ) + return False + + hosts = [i.replace('.', '_').replace('-', '_') for i in hosts] + if not self.db.replication_slots('create', hosts): + logging.error('Could not create replication slots. Releasing the lock in ZK.') + return False + + return True + + def _check_my_timeline_sync(self): + my_tli = self.db.get_data_from_control_file('Latest checkpoint.s TimeLineID', preproc=int, log=False) + try: + zk_tli = self.zk.get(self.zk.TIMELINE_INFO_PATH, preproc=int) + except ZookeeperException: + logging.error('Could not get timeline from ZK.') + return False + if zk_tli is None: + logging.warning('There was no timeline in ZK. Skipping this check.') + elif zk_tli != my_tli: + logging.error( + 'My timeline (%d) differs from timeline in ZK (%d). Checkpointing and skipping iteration.', + my_tli, + zk_tli, + ) + self.db.checkpoint() + return False + return True + + def _check_last_failover_timeout(self): + try: + last_failover_ts = self.zk.get(self.zk.LAST_FAILOVER_TIME_PATH, preproc=float) + except ZookeeperException: + logging.error('Can\'t get last failover time from ZK.') + return False + + if last_failover_ts is None: + logging.warning('There was no last failover ts in ZK. Skipping this check.') + last_failover_ts = 0.0 + diff = time.time() - last_failover_ts + if not helpers.check_last_failover_time(last_failover_ts, self.config): + logging.info('Last time failover has been done %f seconds ago. 
Not doing anything.', diff) + return False + logging.info('Last failover has been done %f seconds ago.', diff) + return True + + def _check_primary_unavailability_timeout(self): + previous_primary_availability_time = self.zk.noexcept_get(self.zk.LAST_PRIMARY_AVAILABILITY_TIME, preproc=float) + if previous_primary_availability_time is None: + logging.error('Failed to get last primary availability time.') + return False + time_passed = time.time() - previous_primary_availability_time + if time_passed < self.config.getfloat('replica', 'primary_unavailability_timeout'): + logging.info('Last time we seen primary %f seconds ago, not doing anything.', time_passed) + return False + return True + + def _is_older_then_primary(self): + try: + lsn = self.zk.get(self.zk.SWITCHOVER_LSN_PATH) + # If there is no lsn in ZK it means that primary is dead + if lsn is None: + return True + # Our LSN should be greater than LSN in primary's pg_control + # because of shutdown record. For more info about address: + # https://www.postgresql.org/message-id/flat/A7683985-2EC2-40AD-AAAC-B44BD0F29723%40simply.name + return self.db.get_replay_diff(lsn) > 0 + except ZookeeperException: + return False + + def _can_do_failover(self): + autofailover = self.config.getboolean('global', 'autofailover') + + if not autofailover: + logging.info("Autofailover is disabled. Not doing anything.") + return False + + if not self._check_my_timeline_sync(): + return False + + if not self._check_last_failover_timeout(): + return False + if not self._check_host_is_really_dead(): + logging.warning( + 'According to ZK primary has died but it is still accessible through libpq. Not doing anything.' + ) + return False + if not self._check_primary_unavailability_timeout(): + return False + if self.db.is_replaying_wal(self.config.getfloat('global', 'iteration_timeout')): + logging.info("Host is still replaying WAL, so it can't be promoted.") + return False + + replica_infos = self.zk.noexcept_get(self.zk.REPLICS_INFO_PATH, preproc=json.loads) + if replica_infos is None: + logging.error('Unable to get replics info from ZK.') + return False + + allow_data_loss = self.config.getboolean('replica', 'allow_potential_data_loss') + logging.info(f'Data loss is: {allow_data_loss}') + is_promote_safe = self._replication_manager.is_promote_safe( + self.zk.get_alive_hosts(), + replica_infos=replica_infos, + ) + if not allow_data_loss and not is_promote_safe: + logging.warning('Promote is not allowed with given configuration.') + return False + self.db.pg_wal_replay_pause() + election_timeout = self.config.getint('global', 'election_timeout') + priority = self.config.getint('global', 'priority') + election = FailoverElection( + self.config, + self.zk, + election_timeout, + replica_infos, + self._replication_manager, + allow_data_loss, + priority, + self.db.get_wal_receive_lsn(), + len(helpers.make_current_replics_quorum(replica_infos, self.zk.get_alive_hosts(election_timeout / 2))), + ) + try: + return election.make_election() + except (ZookeeperException, ElectionError): + for line in traceback.format_exc().split('\n'): + logging.error(line.rstrip()) + return False + + def _get_switchover_candidate(self): + switchover_info = self.zk.get(self.zk.SWITCHOVER_PRIMARY_PATH, preproc=json.loads) + if switchover_info is None: + return None + + if switchover_info.get('destination') is not None: + return switchover_info.get('destination') + replica_infos = self._get_extended_replica_infos() + if replica_infos is None: + return None + if 
self.config.getboolean('replica', 'allow_potential_data_loss'): + app_name_map = {helpers.app_name_from_fqdn(host): host for host in self.zk.get_ha_hosts()} + return app_name_map.get(helpers.get_oldest_replica(replica_infos)) + return self._replication_manager.get_ensured_sync_replica(replica_infos) + + def _get_extended_replica_infos(self): + replica_infos = self.zk.get(self.zk.REPLICS_INFO_PATH, preproc=json.loads) + if replica_infos is None: + logging.error('Unable to get replica infos from ZK.') + return None + app_name_map = {helpers.app_name_from_fqdn(host): host for host in self.zk.get_ha_hosts()} + for info in replica_infos: + hostname = app_name_map.get(info['application_name']) + if not hostname: + continue + info['priority'] = self.zk.get(self.zk.get_host_prio_path(hostname), preproc=int) + return replica_infos + + def _can_do_switchover(self): + if not self._is_older_then_primary(): + return False + + if not self._check_my_timeline_sync(): + return False + + switchover_candidate = self._get_switchover_candidate() + + # Make sanity check of switchover conditions, and proceed to + # promotion immediately without failover or dead primary checks. + if switchover_candidate != helpers.get_hostname(): + logging.info( + f"Switchover candidate is: {switchover_candidate}. " "We are not a candidate, so we can't promote." + ) + return False + + logging.info('We are switchover candidate, so we have to promote here.') + # If primary is alive and it participates in switchover, then we can proceed + if self.zk.get(self.zk.FAILOVER_INFO_PATH) == 'switchover_initiated': + return True + # If primary is dead but we can't failover, then we also proceed + if self.zk.get_current_lock_holder(self.zk.PRIMARY_LOCK_PATH) is None: + return True + logging.warning("Primary holds the lock but didn't initiate switchover yet. " "Waiting for it...") + return False + + def _accept_failover(self): + """ + Failover magic is here + """ + try: + if not self._can_do_failover(): + return None + + # + # All checks are done. Acquiring the lock in ZK, promoting and + # writing last failover timestamp to ZK. + # + if not self.zk.try_acquire_lock(): + logging.info('Could not acquire lock in ZK. Not doing anything.') + return None + self.db.pg_wal_replay_resume() + + if not self._do_failover(): + return False + + self.zk.write(self.zk.LAST_FAILOVER_TIME_PATH, time.time()) + except Exception: + logging.error('Unexpected error while trying to do failover. Exiting.') + for line in traceback.format_exc().split('\n'): + logging.error(line.rstrip()) + sys.exit(1) + + def _do_failover(self): + if not self.zk.delete(self.zk.FAILOVER_INFO_PATH): + logging.error('Could not remove previous failover state. Releasing the lock.') + self.zk.release_lock() + return False + + if not self._promote_handle_slots() or not self._promote(): + self.zk.release_lock() + return False + self._replication_manager.leave_sync_group() + return True + + def _wait_for_recovery(self, limit=-1): + """ + Stop until postgresql complete recovery. + With limit=-1 the loop here can be infinite. + """ + + def check_recovery_completion(): + is_db_alive, terminal_state = self.db.is_alive_and_in_terminal_state() + if not terminal_state: + logging.debug('PostgreSQL in nonterminal state.') + return None + if is_db_alive: + logging.debug('PostgreSQL has completed recovery.') + return True + if self.db.get_postgresql_status() != 0: + logging.error('PostgreSQL service seems to be dead. 
No recovery is possible in this case.') + return False + return None + + return helpers.await_for_value(check_recovery_completion, limit, "PostgreSQL has completed recovery") + + def _check_archive_recovery(self, limit): + """ + Returns True if postgresql is in recovery from archive + and False if it hasn't started recovery within `limit` seconds + """ + + def check_recovery_start(): + if self._check_postgresql_streaming(): + logging.debug('PostgreSQL is already streaming from primary') + return True + + # we can get here with another role or + # have role changed during this retrying cycle + role = self.db.get_role() + if role != 'replica': + logging.warning('PostgreSQL role changed during archive recovery check. Now it doesn\'t make sense') + self.db.pgpooler('stop') + return False + + if self.db.is_replaying_wal(1): + logging.debug('PostgreSQL is in archive recovery') + return True + return None + + return helpers.await_for_value(check_recovery_start, limit, 'PostgreSQL started archive recovery') + + def _get_replics_info_from_zk(self, primary): + if primary: + replics_info_path = '{member_path}/{hostname}/replics_info'.format( + member_path=self.zk.MEMBERS_PATH, hostname=primary + ) + else: + replics_info_path = self.zk.REPLICS_INFO_PATH + return self.zk.get(replics_info_path, preproc=json.loads) + + @staticmethod + def _is_caught_up(replica_infos): + my_app_name = helpers.app_name_from_fqdn(helpers.get_hostname()) + for replica in replica_infos: + if replica['application_name'] == my_app_name and replica['state'] == 'streaming': + return True + return False + + def _check_postgresql_streaming(self, primary=None): + is_db_alive, terminal_state = self.db.is_alive_and_in_terminal_state() + if not terminal_state: + logging.debug('PostgreSQL in nonterminal state.') + return None + + if not is_db_alive: + logging.error('PostgreSQL is dead. Waiting for streaming is useless.') + return False + + # we can get here with another role or + # have role changed during this retrying cycle + if self.db.get_role() != 'replica': + self.db.pgpooler('stop') + logging.warning("PostgreSQL is not a replica, so it can't be streaming.") + return False + + try: + replica_infos = self._get_replics_info_from_zk(primary) + except ZookeeperException: + logging.error("Can't get replics_info from ZK. Won't wait for timeout.") + return False + + if replica_infos is not None and (pgconsul._is_caught_up(replica_infos) and self.db.check_walreceiver()): + logging.debug('PostgreSQL has started streaming from primary.') + return True + + return None + + def _wait_for_streaming(self, limit=-1, primary=None): + """ + Stop until postgresql start streaming from primary. + With limit=-1 the loop here can be infinite. + """ + check_streaming = functools.partial(self._check_postgresql_streaming, primary) + return helpers.await_for_value(check_streaming, limit, 'PostgreSQL started streaming from primary') + + def _wait_for_lock(self, lock, limit=-1): + """ + Wait until lock acquired + """ + + def is_lock_acquired(): + if self.zk.try_acquire_lock(lock): + return True + # There is a chance that our connection with ZK is dead + # (and that is actual reason of not getting lock). + # So we reinit connection here. 
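+ # helpers.await_for() presumably keeps calling is_lock_acquired() until it
+ # returns True or `limit` seconds elapse (limit=-1 appears to mean "wait
+ # forever", as in _wait_for_recovery), so re-initializing ZK here gives the
+ # next polling attempt a fresh session to retry the lock with.
+ # A hypothetical call for illustration:
+ #   self._wait_for_lock(self.zk.PRIMARY_LOCK_PATH, limit=30)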
+ self.re_init_zk() + return False + + return helpers.await_for(is_lock_acquired, limit, f'acquired {lock} lock in ZK') + + def _check_host_is_really_dead(self, primary=None): + return self._check_primary_is_really_dead(primary=primary, check_primary=False) + + def _check_primary_is_really_dead(self, primary=None, check_primary=True): + """ + Returns True if primary is not accessible via postgres protocol + and False otherwise + """ + if not primary: + primary = self.db.recovery_conf('get_primary') + if not primary: + return False + append = self.config.get('global', 'append_primary_conn_string') + if check_primary and ('target_session_attrs' not in append): + ensure_connect_primary = 'target_session_attrs=primary' + else: + ensure_connect_primary = '' + + try: + conn = psycopg2.connect('host=%s %s %s' % (primary, append, ensure_connect_primary)) + conn.autocommit = True + cur = conn.cursor() + cur.execute('SELECT 42') + if cur.fetchone()[0] == 42: + return False + return True + except Exception as err: + logging.debug('%s while trying to check primary health.', str(err)) + return True + + def _get_ha_replics(self): + hosts = self.zk.get_ha_hosts() + if not hosts: + return None + my_hostname = helpers.get_hostname() + if my_hostname in hosts: + hosts.remove(my_hostname) + return set(hosts) + + def _get_zk_members(self): + """ + Checks the presence of subnodes in MEMBERS_PATH at ZK. + """ + while True: + timer = IterationTimer() + self.zk.ensure_path(self.zk.MEMBERS_PATH) + members = self.zk.get_children(self.zk.MEMBERS_PATH) + if members is not None: + return members + self.re_init_zk() + timer.sleep(self.config.getfloat('global', 'iteration_timeout')) + + def _check_primary_switchover(self, db_state, zk_state): + """ + Check if scheduled switchover is initiated. + Perform sanity check on current local and cluster condition. + Abort or postpone switchover if any of them fail. + """ + switchover_info = zk_state[self.zk.SWITCHOVER_ROOT_PATH] + + # Scheduled switchover node exists. + if not switchover_info: + return None + + # The node contains hostname of current instance + if switchover_info.get('hostname') != helpers.get_hostname(): + return None + + # Current instance is primary + if self.db.get_role() != 'primary': + logging.error('Current role is %s, but switchover requested.', self.db.get_role()) + return None + + # There were no failed attempts in the past + state = self.zk.get(self.zk.SWITCHOVER_STATE_PATH) + # Ignore silently if node does not exist + if state is None: + return None + # Ignore failed or in-progress switchovers + if state != 'scheduled': + logging.warning('Switchover state is %s, will not proceed.', state) + return None + + # Timeline of the current instance matches the timeline defined in + # SS node. 
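+ # The switchover node records the timeline that was current when the
+ # switchover was scheduled; if the local timeline has advanced since then,
+ # the request is treated as stale and ignored below. The node payload is
+ # assumed to look roughly like {'hostname': <fqdn>, <timeline key>: <int>},
+ # where the timeline key reuses the zk.TIMELINE_INFO_PATH name.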
+ if int(switchover_info.get(self.zk.TIMELINE_INFO_PATH)) != db_state['timeline']: + logging.warning( + 'Switchover node has timeline %s, but local is %s, ignoring switchover.', + switchover_info.get(self.zk.TIMELINE_INFO_PATH), + db_state['timeline'], + ) + return None + + # Last switchover was more than N sec ago + last_failover_ts = self.zk.get(self.zk.LAST_FAILOVER_TIME_PATH, preproc=float) + + last_switchover_ts = self.zk.get(self.zk.LAST_SWITCHOVER_TIME_PATH, preproc=float) + + last_role_transition_ts = None + if last_failover_ts is not None or last_switchover_ts is not None: + last_role_transition_ts = max(filter(lambda x: x is not None, [last_switchover_ts, last_failover_ts])) + + alive_replics_number = len([i for i in db_state['replics_info'] if i['state'] == 'streaming']) + + ha_replics = self._get_ha_replics() + if ha_replics is None: + return None + ha_replic_cnt = len(ha_replics) + + if not helpers.check_last_failover_time(last_role_transition_ts, self.config) and ( + alive_replics_number < ha_replic_cnt + ): + logging.warning( + 'Last role transition was %.1f seconds ago,' + ' and alive host count less than HA hosts in zk (HA: %d, ZK: %d) ignoring switchover.', + time.time() - last_role_transition_ts, + ha_replic_cnt, + alive_replics_number, + ) + return None + + # Ensure there is no other failover in progress. + failover_state = self.zk.get(self.zk.FAILOVER_INFO_PATH) + if failover_state not in ('finished', None): + logging.error('Switchover requested, but current failover state is %s.', failover_state) + return None + + switchover_candidate = self._get_switchover_candidate() + if switchover_candidate is None: + return False + + if not self._candidate_is_sync_with_primary(db_state, switchover_candidate): + return False + + logging.info('Scheduled switchover checks passed OK.') + return True + + def _do_primary_switchover(self, zk_state): + """ + Perform steps required on scheduled switchover + if current role is primary + """ + logging.warning('Starting scheduled switchover') + self.zk.write(self.zk.SWITCHOVER_STATE_PATH, 'initiated') + # Deny user requests + self.db.pgpooler('stop') + logging.warning('cluster was closed from user requests') + # check once more if replica is sync with primary + limit = self.config.getfloat('global', 'postgres_timeout') + switchover_candidate = self._get_switchover_candidate() + if not helpers.await_for( + lambda: self._candidate_is_sync_with_primary_with_get_state(switchover_candidate=switchover_candidate), + limit, + "replay lag become zero", + ): + logging.error('check replica lsn diff failed - do not swtichover') + return False + + # Store replics info + db_state = self.db.get_state() + if not self._store_replics_info(db_state, zk_state): + logging.error('replics_info was not stored - do not switchover') + return False + + # Announce intention to perform switchover to the rest of the cluster. + if not self.zk.write(self.zk.FAILOVER_INFO_PATH, 'switchover_initiated'): + logging.error(f'unable to write failover state to zk ({self.zk.FAILOVER_INFO_PATH})') + return False + + if not helpers.await_for( + lambda: self.zk.get(self.zk.SWITCHOVER_STATE_PATH) == "candidate_found", limit, "switchover candidate found" + ): + return False + + # Attempt to shut down local PG instance. + # Failure is not critical. 
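+ # On a clean shutdown the REDO location from pg_control is published to
+ # SWITCHOVER_LSN_PATH, so the promoting replica can compare it with its own
+ # replay position (see _is_older_then_primary) before taking over, and
+ # 'switchover_master_shut' tells the rest of the cluster that the old
+ # primary is already down.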
+ if self.db.stop_postgresql(timeout=limit) == 0: + lsn = self._cmd_manager.get_control_parameter(db_state['pgdata'], "REDO location") + self.zk.noexcept_write(self.zk.SWITCHOVER_LSN_PATH, lsn) + if not self.zk.noexcept_write(self.zk.FAILOVER_INFO_PATH, 'switchover_master_shut'): + logging.error(f'unable to write failover state to zk ({self.zk.FAILOVER_INFO_PATH})') + return False + else: + logging.error('Unable to stop postgresql') + return False + + # Release leader-lock. + # Wait 5 secs for the actual release. + self.zk.release_lock(lock_type=self.zk.PRIMARY_LOCK_PATH, wait=5) + + return True + + def _candidate_is_sync_with_primary_with_get_state(self, switchover_candidate): + db_state = self.db.get_state() + return self._candidate_is_sync_with_primary(db_state, switchover_candidate) + + def _candidate_is_sync_with_primary(self, db_state, switchover_candidate): + if switchover_candidate is None: + # nothing to check + return True + + replics_info = db_state.get('replics_info', list()) + for replica in replics_info: + if replica.get('sync_state', '') != 'quorum': + continue + if replica.get('application_name', '') != helpers.app_name_from_fqdn(switchover_candidate): + continue + replay_lag = replica.get('replay_location_diff', -1) + if replay_lag != 0: + if not self.config.getboolean('replica', 'allow_potential_data_loss'): + logging.warning( + f"Replica {switchover_candidate} has replay lag {replay_lag} so cannot be primary for switchover" + ) + return None + else: + logging.warning(f"Replica {switchover_candidate} has replay lag {replay_lag} and allow data loss") + return True + + return True + + def _transition_primary_switchover(self): + """ + Wait for N seconds trying to find out new primary, + then transition to replica. + If timeout passed and no one took the lock, rollback + the procedure. + """ + timeout = self.config.getfloat('global', 'postgres_timeout') + if helpers.await_for( + lambda: self.zk.get(self.zk.SWITCHOVER_STATE_PATH) is None, timeout, 'new primary finished switchover' + ): + primary = self.zk.get_current_lock_holder(self.zk.PRIMARY_LOCK_PATH) + if primary is not None: + # From here switchover can be considered successful regardless of this host state + self.zk.delete('%s/%s/op' % (self.zk.MEMBERS_PATH, helpers.get_hostname())) + self._attach_to_primary(primary, self.config.getfloat('replica', 'recovery_timeout')) + return True + # Mark switchover node as failure + self.zk.write(self.zk.SWITCHOVER_STATE_PATH, 'replica_timed_out', need_lock=False) + return False + + def _detect_replica_switchover(self): + """ + Detect planned switchover condition. + """ + + if self.zk.get(self.zk.SWITCHOVER_STATE_PATH) is None: + return False + + db_state = self.db.get_state() + + switchover_info = self.zk.get(self.zk.SWITCHOVER_PRIMARY_PATH, preproc=json.loads) + if not switchover_info: + return False + + # We check that switchover should happen from current timeline + zk_tli = self.zk.get(self.zk.TIMELINE_INFO_PATH, preproc=int) + if zk_tli != switchover_info[self.zk.TIMELINE_INFO_PATH]: + return False + + # Scheduled switchover node with primary (fqdn, tli) info exists. + + # The scheduled switchover was commenced by primary: + # 'switchover_initiated': the primary is in the process + # of shutting itself down + + # If there is an ability to do failover instead of switchover, than let's do it. 
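+ # That is, while the primary has not yet reported 'switchover_initiated'
+ # or 'switchover_master_shut' and autofailover is enabled, a dead primary
+ # is left to the regular failover path rather than to this switchover
+ # handler. A minimal config sketch (option names as read below, boolean
+ # value format assumed):
+ #   [global]
+ #   autofailover = yes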
+ autofailover = self.config.getboolean('global', 'autofailover') + failover_state = self.zk.get(self.zk.FAILOVER_INFO_PATH) + if failover_state not in ['switchover_initiated', 'switchover_master_shut'] and autofailover: + return False + + # The node contains hostname of current instance + switchover_primary = switchover_info.get('hostname') + if switchover_primary is not None and switchover_primary != db_state['primary_fqdn']: + logging.error('current primary FQDN is not equal to hostname in switchover node, ignoring switchover') + return False + + return True + + def _zk_alive_refresh(self, role, db_state, zk_state): + self._replication_manager.drop_zk_fail_timestamp() + if role is None: + self.zk.release_lock(self.zk.get_host_alive_lock_path()) + else: + self._update_single_node_status(role) + if self.zk.get_current_lock_holder(self.zk.get_host_alive_lock_path()) is None: + logging.warning("I don't hold my alive lock, let's acquire it") + self.zk.try_acquire_lock(self.zk.get_host_alive_lock_path()) + + def _zk_get_wal_receiver_info(self, host): + return self.zk.get(f'{self.zk.MEMBERS_PATH}/{host}/wal_receiver', preproc=json.loads) + + def is_op_destructive(self, op): + return op in self.DESTRUCTIVE_OPERATIONS + + def _store_replics_info(self, db_state, zk_state): + tli_res = None + if zk_state[self.zk.TIMELINE_INFO_PATH]: + tli_res = zk_state[self.zk.TIMELINE_INFO_PATH] == db_state['timeline'] + + replics_info = db_state.get('replics_info') + + zk_state['replics_info_written'] = None + if tli_res and replics_info is not None: + zk_state['replics_info_written'] = self.zk.write( + self.zk.REPLICS_INFO_PATH, replics_info, preproc=json.dumps + ) + self.write_host_stat(helpers.get_hostname(), db_state) + return True + + return False diff --git a/src/pg.py b/src/pg.py new file mode 100644 index 0000000..47c4cf6 --- /dev/null +++ b/src/pg.py @@ -0,0 +1,831 @@ +""" +Pg wrapper module. Postgres class defined here. +""" +# encoding: utf-8 + +import contextlib +import json +import logging +from functools import partial +import os +import re +import signal +import socket +import sys +import time +import traceback + +import psycopg2 +import psycopg2.errors +from psycopg2.sql import SQL, Identifier + +from . 
import helpers, exceptions + +if sys.version_info < (3, 0): + DEC2INT_TYPE = psycopg2.extensions.new_type( + psycopg2.extensions.DECIMAL.values, b'DEC2INT', lambda value, curs: int(value) if value is not None else None + ) +else: + DEC2INT_TYPE = psycopg2.extensions.new_type( + psycopg2.extensions.DECIMAL.values, 'DEC2INT', lambda value, curs: int(value) if value is not None else None + ) + +psycopg2.extensions.register_type(DEC2INT_TYPE) + + +def _get_names(cur): + return [r[0].lower() for r in cur.description] + + +def _plain_format(cur): + names = _get_names(cur) + for row in cur.fetchall(): + yield dict(zip(names, tuple(row))) + + +class Postgres(object): + """ + Postgres class + """ + + DISABLED_ARCHIVE_COMMAND = '/bin/false' + + def __init__(self, config, plugins, cmd_manager): + self.config = config + self._plugins = plugins + self._cmd_manager = cmd_manager + + self.state = dict() + + self.conn_local = None + self.role = None + self.pgdata = None + self.pg_version = None + self._offline_detect_pgdata() + self.reconnect() + self.use_lwaldump = self.config.getboolean('global', 'use_lwaldump') or self.config.getboolean( + 'global', 'quorum_commit' + ) + + def _create_cursor(self): + try: + if self.conn_local: + cursor = self.conn_local.cursor() + cursor.execute('SELECT 1;') + return cursor + else: + raise RuntimeError('Local conn is dead') + except Exception: + for line in traceback.format_exc().split('\n'): + logging.debug(line.rstrip()) + self.reconnect() + + def _exec_query(self, query, **kwargs): + cur = self._create_cursor() + if not cur: + raise RuntimeError('Local conn is dead') + cur.execute(query, kwargs) + return cur + + def _get(self, query, **kwargs): + with contextlib.closing(self._exec_query(query, **kwargs)) as cur: + records = list(_plain_format(cur)) + return records + + def _exec_without_result(self, query): + try: + self._exec_query(query) + return True + except Exception: + for line in traceback.format_exc().split('\n'): + logging.error(line.rstrip()) + return False + + def get_data_from_control_file(self, parameter, preproc=None, log=True): + """ + Run pg_controldata and grep it's output + """ + return self._cmd_manager.get_control_parameter(self.pgdata, parameter, preproc, log) + + def _local_conn_string_get_port(self): + for param in self.config.get('global', 'local_conn_string').split(): + key, value = param.strip().split('=') + if key == 'port': + port = value + break + else: + port = '5432' + return port + + def _offline_detect_pgdata(self): + """ + Try to find pgdata and version parameter from list_clusters command by port + """ + try: + state = {} + need_port = self._local_conn_string_get_port() + rows = self._cmd_manager.list_clusters() + logging.debug(rows) + for row in rows: + if not row: + continue + version, _, port, pgstate, _, pgdata, _ = row.split() + if port != need_port: + continue + if state.get('pg_version'): + logging.error('Found more than one cluster on %s port', need_port) + return + self.pg_version = state['pg_version'] = version + self.role = state['role'] = 'replica' if 'recovery' in pgstate else 'primary' + self.pgdata = state['pgdata'] = pgdata + except Exception: + for line in traceback.format_exc().split('\n'): + logging.error(line.rstrip()) + + @helpers.return_none_on_error + def _get_replication_slots(self): + res = self._exec_query('SELECT slot_name FROM pg_replication_slots;').fetchall() + return [i[0] for i in res] + + def _create_replication_slot(self, slot_name): + query = f"SELECT 
pg_create_physical_replication_slot('{slot_name}', true)" + return self._exec_without_result(query) + + def _drop_replication_slot(self, slot_name): + query = f"SELECT pg_drop_replication_slot('{slot_name}')" + return self._exec_without_result(query) + + def reconnect(self): + """ + Reestablish connection with local postgresql + """ + nonfatal_errors = { + 'FATAL: the database system is starting up': exceptions.PGIsStartingUp, + 'FATAL: the database system is shutting down': exceptions.PGIsShuttingDown, + } + try: + if self.conn_local: + self.conn_local.close() + if not self.state.get('running', False): + logging.error('PostgreSQL is dead. Unable to reconnect.') + self.conn_local = None + return + self.conn_local = psycopg2.connect(self.config.get('global', 'local_conn_string')) + self.conn_local.autocommit = True + + self.role = self.get_role() + self.pg_version = self._get_pg_version() + self.pgdata = self._get_pgdata_path() + except psycopg2.OperationalError: + logging.error('Could not connect to "%s".', self.config.get('global', 'local_conn_string')) + error_lines = traceback.format_exc().split('\n') + for line in error_lines: + logging.error(line.rstrip()) + for line in error_lines: + for substr, exc in nonfatal_errors.items(): + if substr in line: + raise exc() + + def get_state(self): + """ + Get current database state (if possible) + """ + fname = '%s/.pgconsul_db_state.cache' % self.config.get('global', 'working_dir') + try: + with open(fname, 'r') as fobj: + prev = json.loads(fobj.read()) + except Exception: + prev = None + + data = {'alive': False, 'prev_state': prev} + try: + try: + is_db_alive, terminal_state = self.is_alive_and_in_terminal_state() + if terminal_state: + data['running'] = is_db_alive + data['alive'] = is_db_alive + else: + data['running'] = True + data['alive'] = False + except Exception: + data['running'] = False + data['alive'] = False + # Explicitly update "running" to avoid dead loop + self.state['running'] = data['running'] + + if not data['alive']: + raise RuntimeError('PostgreSQL is dead') + data['role'] = self.get_role() + self.role = data['role'] + data['pg_version'] = self._get_pg_version() + data['pgdata'] = self._get_pgdata_path() + data['opened'] = self.pgpooler('status') + data['timeline'] = self.get_data_from_control_file('Latest checkpoint.s TimeLineID', preproc=int, log=False) + data['wal_receiver'] = self._get_wal_receiver_info() + + if data['role'] == 'primary': + data['replics_info'] = self.get_replics_info('primary') + data['replication_state'] = self.get_replication_state() + data['sessions_ratio'] = self.get_sessions_ratio() + elif data['role'] == 'replica': + data['primary_fqdn'] = self.recovery_conf('get_primary') + data['replics_info'] = self.get_replics_info('replica') + + # + # We ask health of PostgreSQL one more time since it could die + # while we were asking all other things here. It can lead to + # unpredictable results. + # + data['alive'] = self.is_alive() + except Exception: + for line in traceback.format_exc().split('\n'): + logging.error(line.rstrip()) + + if data['alive']: + try: + with open(fname, 'w') as fobj: + save_data = data.copy() + del save_data['prev_state'] + fobj.write(json.dumps(save_data)) + except IOError: + logging.warning('Could not write cache file. 
Skipping it.') + + self.state = data + return data + + def is_alive(self): + return self.is_alive_and_in_terminal_state()[0] + + def is_alive_and_in_terminal_state(self): + """ + Check that postgresql is alive + """ + try: + # In order to check that postgresql is really alive + # we need to check if service is running then + # drop current connection and establish a new one + if self.state.get('running', False): + self.reconnect() + res = self._exec_query('SELECT 42;').fetchone() + if res[0] == 42: + return True, True + else: + self.state['running'] = self.get_postgresql_status() == 0 + return False, True + except (exceptions.PGIsShuttingDown, exceptions.PGIsStartingUp): + return False, False + except Exception: + for line in traceback.format_exc().split('\n'): + logging.debug(line.rstrip()) + return False, True + + def get_role(self): + """ + Get role of local postgresql (replica, primary or None if dead) + """ + try: + res = self._exec_query('SELECT pg_is_in_recovery();') + if res is None: + return None + elif res.fetchone()[0]: + return 'replica' + else: + return 'primary' + except Exception: + return None + + @helpers.return_none_on_error + def _get_pg_version(self): + """ + Get local postgresql version + """ + res = self._exec_query("SHOW server_version_num") + return int(res.fetchone()[0]) + + @helpers.return_none_on_error + def _get_pgdata_path(self): + """ + Get local pg_data + """ + res = self._exec_query('SHOW data_directory;').fetchone() + return res[0] + + @helpers.return_none_on_error + def get_replics_info(self, role): + """ + Get replicas from pg_stat_replication + """ + version = self._get_pg_version() + if version >= 100000: + current_lsn = {'primary': 'pg_current_wal_lsn()', 'replica': 'pg_last_wal_replay_lsn()'} + wal_func = { + 'current_lsn': current_lsn[role], + 'diff_lsn': 'pg_wal_lsn_diff', + 'app_name': 'pg_receivewal', + 'sent_lsn': 'sent_lsn', + 'write_lsn': 'write_lsn', + 'replay_lsn': 'replay_lsn', + } + replay_lag = 'COALESCE(1000*EXTRACT(epoch from replay_lag), 0)::bigint AS replay_lag_msec,' + else: + current_lsn = {'primary': 'pg_current_xlog_location()', 'replica': 'pg_last_xlog_replay_location()'} + wal_func = { + 'current_lsn': current_lsn[role], + 'diff_lsn': 'pg_xlog_location_diff', + 'app_name': 'pg_receivexlog', + 'sent_lsn': 'sent_location', + 'write_lsn': 'sent_location', + 'replay_lsn': 'replay_location', + } + replay_lag = '' + query = """SELECT pid, application_name, + client_hostname, client_addr, state, + {current_lsn} + AS primary_location, + {diff_lsn}({current_lsn}, {sent_lsn}) + AS sent_location_diff, + {diff_lsn}({current_lsn}, {write_lsn}) + AS write_location_diff, + {diff_lsn}({current_lsn}, + {replay_lsn}) + AS replay_location_diff, + {replay_lag} + extract(epoch from backend_start)::bigint AS backend_start_ts, + sync_state FROM pg_stat_replication + WHERE application_name != 'pg_basebackup' + AND application_name != '{app_name}' + AND state = 'streaming'""".format( + current_lsn=wal_func['current_lsn'], + diff_lsn=wal_func['diff_lsn'], + app_name=wal_func['app_name'], + sent_lsn=wal_func['sent_lsn'], + write_lsn=wal_func['write_lsn'], + replay_lag=replay_lag, + replay_lsn=wal_func['replay_lsn'], + ) + return self._get(query) + + @helpers.return_none_on_error + def _get_wal_receiver_info(self): + """ + Get wal_receiver info from pg_stat_wal_receiver + """ + query = """SELECT pid, status, slot_name, + conninfo FROM pg_stat_wal_receiver""" + return self._get(query) + + @helpers.return_none_on_error + def get_replication_state(self): + """ 
+ Get replication type (sync/async) + """ + res = self._exec_query('SHOW synchronous_standby_names;').fetchone() + res = ('async', None) if res[0] == '' else ('sync', res[0]) + return res + + @helpers.return_none_on_error + def get_sessions_ratio(self): + """ + Get ratio of active sessions/max sessions (in percents) + """ + cur = self._exec_query("SELECT count(*) FROM pg_stat_activity WHERE state!='idle';") + cur = cur.fetchone()[0] + max_sessions = self._exec_query('SHOW max_connections;').fetchone()[0] + return (cur / int(max_sessions)) * 100 + + def _execute_versioned_query(self, old_version_query, new_version_query): + version = self._get_pg_version() + if version >= 100000: + return self._exec_query(new_version_query) + else: + return self._exec_query(old_version_query) + + @helpers.return_none_on_error + def lwaldump(self): + """Protected from kill -9 postgres""" + query = """SELECT pg_wal_lsn_diff( + lwaldump(), + '0/00000000')::bigint""" + res = self._exec_query(query).fetchone() + return res[0] + + @helpers.return_none_on_error + def get_wal_receive_lsn(self): + if self.use_lwaldump: + return self.lwaldump() + old_query = """SELECT pg_xlog_location_diff( + pg_last_xlog_receive_location(), + '0/00000000')::bigint""" + new_query = """SELECT pg_wal_lsn_diff( + pg_last_wal_receive_lsn(), + '0/00000000')::bigint""" + res = self._execute_versioned_query(old_query, new_query).fetchone() + return res[0] + + def check_walsender(self, replics_info, holder_fqdn): + """ + Check walsender in sync state and sync holder is same + """ + if not replics_info: + return True + holder_app_name = helpers.app_name_from_fqdn(holder_fqdn) + for replica in replics_info: + try: + if replica['sync_state'] == 'sync' and replica['application_name'] != holder_app_name: + logging.warning('It seems sync replica and sync replica holder are different. 
Killing walsender.') + os.kill(replica['pid'], signal.SIGTERM) + break + except Exception as exc: + logging.error('Check walsender error: %s', repr(exc)) + return True + + def check_walreceiver(self): + """ + Check if walreceiver is running using pg_stat_wal_receiver view + """ + try: + cur = self._exec_query('SELECT pid FROM pg_stat_wal_receiver WHERE status = \'streaming\'') + except Exception as exc: + logging.error('Unable to get wal receiver state: %s', repr(exc)) + return False + return bool(cur.fetchall()) + + def is_ready_for_pg_rewind(self): + """ + Check if pg_rewind could be used on local postgresql + """ + res = self.get_data_from_control_file('Data page checksum version', preproc=int) + if res: + logging.info("Checksums are enabled, host is ready for pg_rewind.") + return True + + res = self.get_data_from_control_file('wal_log_hints setting') + if res == 'on': + logging.info("Checksums are disabled but wal_log_hints = on, host is ready for pg_rewind.") + return True + + logging.error("Checksums or wal_log_hints should be enabled for pg_rewind to work properly.") + return False + + @helpers.return_none_on_error + def get_replay_diff(self, diff_from='0/00000000'): + new_query = f"""SELECT pg_wal_lsn_diff( + pg_last_wal_replay_lsn(), + '{diff_from}')::bigint""" + old_query = f"""SELECT pg_xlog_location_diff( + pg_last_xlog_replay_location(), + '{diff_from}')::bigint""" + res = self._execute_versioned_query(old_query, new_query).fetchone() + return res[0] + + def recovery_conf(self, action, primary_host=None): + """ + Perform recovery conf action (create, remove, get_primary) + """ + recovery_filepath = os.path.join(self.pgdata, self.config.get('global', 'recovery_conf_rel_path')) + + if action == 'create': + self._plugins.run('before_populate_recovery_conf', primary_host) + res = self._cmd_manager.generate_recovery_conf(recovery_filepath, primary_host) + self._plugins.run('after_populate_recovery_conf', primary_host) + return res + elif action == 'remove': + cmd = 'rm -f ' + recovery_filepath + return helpers.subprocess_call(cmd) + else: + if os.path.exists(recovery_filepath): + with open(recovery_filepath, 'r') as recovery_file: + for i in recovery_file.read().split('\n'): + if 'primary_conninfo' in i: + primary = re.search(r'host=([\w\-\._]*)', i).group(0).split('=')[-1] + return primary + return None + + def promote(self): + """ + Make local postgresql primary + """ + # TODO : potential split brain here in this case: + # 1. We requested for switchover + # 2. Host A was chosen to become a new primary + # 3. Host A promote took too much time, so old primary decided to rollback switchover + # 4. After switchover rollback and old primary returned back as a primary promote finished + # 5. In the end we have old primary with open pooler and host A as a primary with open pooler. 
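+ # The promote path below: archiving is disabled first (archive_command set
+ # to /bin/false via ALTER SYSTEM) so a failed promote cannot push a wrong
+ # history file to the archive, WAL replay is resumed, the actual promote
+ # goes through the command manager, archiving is re-enabled and the role is
+ # polled until get_role() reports 'primary'. Plugins may hook around this;
+ # a hypothetical plugin for illustration (hook signatures as defined in
+ # plugin.PostgresPlugin):
+ #   class PrePromoteCheckpoint(plugin.PostgresPlugin):
+ #       def before_promote(self, conn, config):
+ #           conn.cursor().execute('CHECKPOINT')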
+ self._plugins.run('before_promote', self.conn_local, self.config) + + # We need to stop archiving WAL and resume after promote + # to prevent wrong history file in archive in case of failure + if not self.stop_archiving_wal(): + logging.error('Could not stop archiving WAL') + return False + + # We need to resume replaying WAL before promote + self.pg_wal_replay_resume() + + promoted = self._cmd_manager.promote(self.pgdata) == 0 + if promoted: + if not self.resume_archiving_wal(): + logging.error('Could not resume archiving WAL') + if self._wait_for_primary_role(): + self._plugins.run('after_promote', self.conn_local, self.config) + return promoted + + def _wait_for_primary_role(self): + """ + Wait until promotion succeeds + """ + sleep_time = self.config.getfloat('global', 'iteration_timeout') + role = self.get_role() + while role != 'primary': + logging.debug('Our role should be primary but we are now "%s".', role) + if role is None: + return False + logging.info('Waiting %.1f second(s) to become primary.', sleep_time) + time.sleep(sleep_time) + role = self.get_role() + return True + + def pgpooler(self, action): + """ + Start/stop/status pooler wrapper + """ + if action == 'stop': + if bool(self._cmd_manager.get_pooler_status()): + return True + self._plugins.run('before_close_from_load') + res = self._cmd_manager.stop_pooler() + after = 'after_close_from_load' + elif action == 'status': + standalone_pooler = self.config.getboolean('global', 'standalone_pooler') + pooler_addr = self.config.get('global', 'pooler_addr') + pooler_port = self.config.get('global', 'pooler_port') + pooler_conn_timeout = self.config.getfloat('global', 'pooler_conn_timeout') + if standalone_pooler: + try: + sock = socket.create_connection((pooler_addr, pooler_port), pooler_conn_timeout) + sock.close() + return True + except socket.error: + return not bool(self._cmd_manager.get_pooler_status()) + else: + return not bool(self._cmd_manager.get_pooler_status()) + elif action == 'start': + if not bool(self._cmd_manager.get_pooler_status()): + return True + self._plugins.run('before_open_for_load') + res = self._cmd_manager.start_pooler() + after = 'after_open_for_load' + else: + raise RuntimeError('Unknown pooler action: %s' % action) + if res == 0: + self._plugins.run(after) + return True + return False + + def do_rewind(self, primary_host): + """ + Run pg_rewind on localhost against primary_host + """ + if self.config.getboolean('global', 'use_replication_slots'): + # + # We should move pg_replslot directory somewhere before rewind + # and move it back after it since pg_rewind doesn't do it. + # + try: + helpers.backup_dir('%s/pg_replslot' % self.pgdata, '/tmp/pgconsul_replslots_backup') + except Exception: + logging.warning('Could not backup replication slots before rewinding. Skipping it.') + res = self._cmd_manager.rewind(self.pgdata, primary_host) + + if self.config.getboolean('global', 'use_replication_slots') and res == 0: + if os.path.exists('/tmp/pgconsul_replslots_backup'): + try: + helpers.backup_dir('/tmp/pgconsul_replslots_backup', '%s/pg_replslot' % self.pgdata) + except Exception: + logging.warning('Could not restore replication slots after rewinding. 
Skipping it.') + return res + + def change_replication_to_async(self): + return self._change_replication_type('') + + def change_replication_to_sync_host(self, host_fqdn): + return self._change_replication_type(helpers.app_name_from_fqdn(host_fqdn)) + + def change_replication_to_quorum(self, replica_list): + quorum_size = (len(replica_list) + 1) // 2 + replica_list = list(map(helpers.app_name_from_fqdn, replica_list)) + return self._change_replication_type(f"ANY {quorum_size}({','.join(replica_list)})") + + def _get_param_value(self, param): + cursor = self._exec_query(f'SHOW {param}') + (value,) = cursor.fetchone() + return value + + def _alter_system_set_param(self, param, value=None, reset=False): + def equal(): + return self._get_param_value(param) == value + + def unequal(prev_value): + return self._get_param_value(param) != prev_value + + try: + if reset: + prev_value = self._get_param_value(param) + logging.debug(f'Resetting {param} with ALTER SYSTEM') + query = SQL("ALTER SYSTEM RESET {param}").format(param=Identifier(param)) + self._exec_query(query.as_string(self.conn_local)) + await_func = partial(unequal, prev_value) + await_message = f'{param} is reset after reload' + else: + logging.debug(f'Setting {param} to {value} with ALTER SYSTEM') + query = SQL("ALTER SYSTEM SET {param} TO %(value)s").format(param=Identifier(param)) + self._exec_query(query.as_string(self.conn_local), value=value) + await_func = equal + await_message = f'{param} is set to {value} after reload' + except Exception: + for line in traceback.format_exc().split('\n'): + logging.error(line.rstrip()) + return False + reload_result = self._cmd_manager.reload_postgresql(self.pgdata) + if reload_result: + logging.debug(f'Reload has failed, not waiting for param {param} change') + return False + + postgres_timeout = self.config.getfloat('global', 'postgres_timeout') + return helpers.await_for(await_func, postgres_timeout, await_message) + + def _change_replication_type(self, synchronous_standby_names): + return self._alter_system_set_param('synchronous_standby_names', synchronous_standby_names) + + def ensure_archive_mode(self): + archive_mode = self._get_param_value('archive_mode') + if archive_mode == 'off': + return False + return True + + def ensure_archiving_wal(self): + archive_command = self._get_param_value('archive_command') + if archive_command == self.DISABLED_ARCHIVE_COMMAND: + logging.info('Archive command was disabled, enabling it') + self.resume_archiving_wal() + config = self._get_postgresql_auto_conf() + if config.get('archive_command') == self.DISABLED_ARCHIVE_COMMAND: + logging.info('Archive command was disabled in postgresql.auto.conf, resetting it') + self.resume_archiving_wal() + + def stop_archiving_wal(self): + return self._alter_system_set_param('archive_command', self.DISABLED_ARCHIVE_COMMAND) + + def resume_archiving_wal(self): + return self._alter_system_set_param('archive_command', reset=True) + + def stop_archiving_wal_stopped(self): + return self._alter_system_stopped('archive_command', self.DISABLED_ARCHIVE_COMMAND) + + def _get_postgresql_auto_conf(self): + config = {} + current_file = os.path.join(self.pgdata, 'postgresql.auto.conf') + with open(current_file, 'r') as fobj: + for line in fobj: + if line.lstrip().startswith('#'): + continue + key, value = line.rstrip('\n').split('=', maxsplit=1) + config[key.strip()] = value.lstrip().lstrip('\'').rstrip('\'') + return config + + # + # We do it with writing to file and not with ALTER SYSTEM command since + # PostgreSQL is stopped when 
this method is called. + # We are not afraid of future rewriting postgresql.auto.conf with ALTER + # SYSTEM command since this change is temporary. + # + def _alter_system_stopped(self, param, set_value): + """ + Set param to value while PostgreSQL is stopped. + Method should be called only with stopped PostgreSQL. + """ + try: + logging.debug(f'Setting {param} to {set_value} in postgresql.auto.conf') + config = self._get_postgresql_auto_conf() + current_file = os.path.join(self.pgdata, 'postgresql.auto.conf') + new_file = os.path.join(self.pgdata, 'postgresql.auto.conf.new') + old_value = config.get(param) + if old_value == set_value: + logging.debug(f'Param {param} already has value {set_value} in postgresql.auto.conf') + return True + logging.debug(f'Changing {param} from {old_value} to {set_value} in postgresql.auto.conf') + config[param] = set_value + with open(new_file, 'w') as fobj: + fobj.write('# Do not edit this file manually!\n') + fobj.write('# It will be overwritten by the ALTER SYSTEM command.\n') + for key, value in config.items(): + fobj.write(f'{key} = \'{value}\'\n') + os.replace(new_file, current_file) + return True + except Exception: + for line in traceback.format_exc().split('\n'): + logging.error(line.rstrip()) + return False + + def checkpoint(self, query=None): + """ + Perform checkpoint + """ + logging.warning('Initiating checkpoint') + if not query: + query = 'CHECKPOINT' + return self._exec_without_result(query) + + def start_postgresql(self, timeout=60): + """ + Start PG server on current host + """ + return self._cmd_manager.start_postgresql(timeout, self.pgdata) + + def get_postgresql_status(self): + """ + Returns PG status on current host + """ + return self._cmd_manager.get_postgresql_status(self.pgdata) + + def stop_postgresql(self, timeout=60): + """ + Stop PG server on current host + + If synchronous replication is ON, but sync replica is dead, then we aren't able to stop PG. 
+ """ + try: + self.change_replication_to_async() # TODO : it can lead to data loss + except Exception: + logging.warning('Could not disable synchronous replication.') + for line in traceback.format_exc().split('\n'): + logging.warning(line.rstrip()) + return self._cmd_manager.stop_postgresql(timeout, self.pgdata) + + def replication_slots(self, action, slots): + """ + Perform replication slots action (create/drop) + """ + current = self._get_replication_slots() + for slot in slots: + if action == 'create': + if current and slot in current: + logging.warning('Slot %s already exists.', slot) + continue + if not self._create_replication_slot(slot): + return False + else: + if current and slot not in current: + logging.warning('Slot %s does not exist.', slot) + continue + if not self._drop_replication_slot(slot): + return False + return True + + def is_replaying_wal(self, check_time): + prev_replay_diff = self.get_replay_diff() + time.sleep(check_time) + replay_diff = self.get_replay_diff() + return prev_replay_diff < replay_diff + + def pg_wal_replay_pause(self): + self._pg_wal_replay("pause") + + def pg_wal_replay_resume(self): + self._pg_wal_replay("resume") + + def is_wal_replay_paused(self): + cur = self._execute_versioned_query( + 'SELECT pg_is_xlog_replay_paused();', + 'SELECT pg_is_wal_replay_paused();', + ) + (paused,) = cur.fetchone() + return paused + + def ensure_replaying_wal(self): + if self.is_wal_replay_paused(): + logging.warning('WAL replay is paused') + self.pg_wal_replay_resume() + + def terminate_backend(self, pid): + """ + Send sigterm to backend by pid + """ + # Note that pid could be already dead by this moment + # So we do not check result + self._exec_without_result(f'SELECT pg_terminate_backend({pid})') + + def _pg_wal_replay(self, pause_or_resume): + logging.debug('WAL replay: %s', pause_or_resume) + self._execute_versioned_query( + f'SELECT pg_xlog_replay_{pause_or_resume}();', + f'SELECT pg_wal_replay_{pause_or_resume}();', + ) + + def check_extension_installed(self, name): + cur = self._exec_query(f"SELECT * FROM pg_extension WHERE extname = '{name}';") + result = cur.fetchall() + return len(result) == 1 + + def reload(self): + return not bool(self._cmd_manager.reload_postgresql(self.pgdata)) diff --git a/src/plugin.py b/src/plugin.py new file mode 100644 index 0000000..f6c73d1 --- /dev/null +++ b/src/plugin.py @@ -0,0 +1,136 @@ +""" +pgconsul plugin support module +""" +# encoding: utf-8 + +import inspect +import os +import sys + + +class PostgresPlugin(object): + """ + Abstract class for postgresql plugin + """ + + def before_close_from_load(self): + """ + This method executed before stopping pooler + """ + pass + + def after_close_from_load(self): + """ + This method executed right after stopping pooler + """ + pass + + def before_promote(self, conn, config): + """ + This method executed before calling pg_ctl promote + """ + pass + + def after_promote(self, conn, config): + """ + This method executed right after calling pg_ctl promote + """ + pass + + def before_open_for_load(self): + """ + This method executed before starting pooler + """ + pass + + def after_open_for_load(self): + """ + This method executed right after starting pooler + """ + pass + + def before_populate_recovery_conf(self, primary_host): + """ + This method executed before generating recovery.conf + """ + pass + + def after_populate_recovery_conf(self, primary_host): + """ + This method executed right after generating recovery.conf + """ + pass + + +class ZookeeperPlugin(object): + """ + 
Abstract class for zookeeper plugin + """ + + def on_lost(self): + """ + This method executed on zk conn lost + """ + pass + + def on_suspend(self): + """ + This method executed on zk disconnection start + """ + pass + + def on_connect(self): + """ + This method executed on after zk connection is established + """ + pass + + +def load_plugins(path): + """ + Load plugins and return dict with Plugin lists + """ + if path not in sys.path: + sys.path.insert(0, path) + + ret = {'Postgres': [], 'Zookeeper': []} + for i in os.listdir(path): + if not i.endswith('.py'): + continue + + module = __import__(i.split('.')[0]) + + for j in [j for j in dir(module) if not j.startswith('__')]: + try: + j_class = getattr(module, j) + for mro in inspect.getmro(j_class): + if mro == PostgresPlugin: + ret['Postgres'].append(j_class()) + elif mro == ZookeeperPlugin: + ret['Zookeeper'].append(j_class()) + except Exception: + pass + + return ret + + +class PluginRunner(object): + """ + Plugin support helper + """ + + def __init__(self, plugins): + self._plugins = plugins + + def list(self): + """ + Return list of plugins + """ + return self._plugins[:] + + def run(self, method, *args): + """ + Execute method for each plugin + """ + for i in self._plugins: + getattr(i, method)(*args) diff --git a/src/plugins/pgbouncer.py b/src/plugins/pgbouncer.py new file mode 100644 index 0000000..6a44f8d --- /dev/null +++ b/src/plugins/pgbouncer.py @@ -0,0 +1,12 @@ +from pgconsul import plugin +from pgconsul import helpers + + +class PgbouncerPlugin(plugin.PostgresPlugin): + def before_populate_recovery_conf(self, primary_host): + cmd = 'sudo sed -i /etc/pgbouncer/pgbouncer.ini -e "/^* = /s/host=.*$/host=' + primary_host + ' port=6432/"' + helpers.subprocess_popen(cmd) + + def before_promote(self): + cmd = 'sudo sed -i /etc/pgbouncer/pgbouncer.ini -e "/^* = /s/host=.*$/host=localhost/"' + helpers.subprocess_popen(cmd) diff --git a/src/plugins/upload_wals.py b/src/plugins/upload_wals.py new file mode 100644 index 0000000..fe27e8a --- /dev/null +++ b/src/plugins/upload_wals.py @@ -0,0 +1,51 @@ +from pgconsul import plugin +from pgconsul import helpers +import os +import struct +import logging + + +class UploadWals(plugin.PostgresPlugin): + def after_promote(self, conn, config): + # We should finish promote if upload_wals is fail + try: + with conn.cursor() as cur: + cur.execute("SHOW server_version_num") + pg_version = cur.fetchone()[0] + queries = {"pgdata": "SHOW data_directory;", "archive_command": "SHOW archive_command;"} + if int(pg_version) >= 100000: + queries["wal_location"] = "SELECT pg_walfile_name(pg_current_wal_lsn())" + wal_dir = 'pg_wal' + logging.info(queries) + else: + queries["wal_location"] = "SELECT pg_xlogfile_name(pg_current_xlog_location())" + wal_dir = 'pg_xlog' + cur.execute(queries['wal_location']) + current_wal = cur.fetchone()[0] + cur.execute(queries['archive_command']) + archive_command = cur.fetchone()[0] + # wal-g upload in parallel by default + if 'envdir' in archive_command: + archive_command = "/usr/bin/envdir /etc/wal-g/envdir sh -c 'WALG_UPLOAD_CONCURRENCY=1 {}'".format( + archive_command.replace('/usr/bin/envdir /etc/wal-g/envdir ', '') + ) + cur.execute(queries['pgdata']) + pgdata = cur.fetchone()[0] + wals = os.listdir('{pgdata}/{wal_dir}/'.format(pgdata=pgdata, wal_dir=wal_dir)) + wals.sort() + wals_to_upload = [] + for wal in wals: + if wal < current_wal: + try: + logging.info(wal) + struct.unpack('>3I', bytearray.fromhex(wal)) + wals_to_upload.append(wal) + except (struct.error, 
ValueError): + continue + wals_count = config.get('plugins', 'wals_to_upload') + for wal in wals_to_upload[-wals_count:]: + path = '{pgdata}/{wal_dir}/{wal}'.format(pgdata=pgdata, wal_dir=wal_dir, wal=wal) + cmd = archive_command.replace('%p', path).replace('%f', wal) + helpers.subprocess_call(cmd) + except Exception as error_message: + logging.info(error_message) diff --git a/src/replication_manager.py b/src/replication_manager.py new file mode 100644 index 0000000..40e355e --- /dev/null +++ b/src/replication_manager.py @@ -0,0 +1,353 @@ +import json +import logging +import time + +from . import helpers + + +class SingleSyncReplicationManager: + def __init__(self, config, db, _zk): + self._config = config + self._db = db + self._zk = _zk + self._zk_fail_timestamp = None + + def init_zk(self): + return True + + def drop_zk_fail_timestamp(self): + """ + Reset fail timestamp flag + """ + self._zk_fail_timestamp = None + + def should_close(self): + """ + Check if we are safe to stay open on zk conn loss + """ + try: + if self._zk_fail_timestamp is None: + self._zk_fail_timestamp = time.time() + info = self._db.get_replics_info(self._db.role) + should_wait = False + for replica in info: + if replica['backend_start_ts'] < self._zk_fail_timestamp: + should_wait = True + self._db.terminate_backend(replica['pid']) + if should_wait: + time.sleep(self._config.getfloat('replica', 'primary_unavailability_timeout')) + info = self._db.get_replics_info(self._db.role) + + connected = sum([1 for x in info if x['sync_state'] == 'sync']) + repl_state = self._db.get_replication_state() + if repl_state[0] == 'async': + return False + elif repl_state[0] == 'sync': + logging.info( + 'Probably connect to ZK lost, check the need to close. Connected replicas(sync) num %s', + connected, + ) + return connected < 1 + else: + raise RuntimeError(f'Unexpected replication state: {repl_state}') + except Exception as exc: + logging.error('Error while checking for close conditions: %s', repr(exc)) + return True + + def update_replication_type(self, db_state, ha_replics): + """ + Change replication (if we should). + """ + holder_fqdn = self._zk.get_current_lock_holder(self._zk.SYNC_REPLICA_LOCK_PATH) + if holder_fqdn == helpers.get_hostname(): + logging.info('We are primary but holding sync_replica lock. Releasing it now.') + self._zk.release_lock(self._zk.SYNC_REPLICA_LOCK_PATH) + return + + current = self._db.get_replication_state() + logging.info('Current replication type is %s.', current) + needed = _get_needed_replication_type(self._config, self._db, db_state, ha_replics) + logging.info('Needed replication type is %s.', needed) + + if needed == 'async': + if current[0] == 'async': + logging.debug('We should not change replication type here.') + else: + self.change_replication_to_async() + return + + if holder_fqdn is None: + logging.error( + 'Sync replication type requires explicit ' + 'lock holder but no one seem to hold lock ' + 'right now. Not doing anything.' 
+ ) + return + + if current == (needed, helpers.app_name_from_fqdn(holder_fqdn)): + logging.debug('We should not change replication type here.') + # https://www.postgresql.org/message-id/15617-8dfbde784d8e3258%40postgresql.org + self._db.check_walsender(db_state['replics_info'], holder_fqdn) + else: + logging.info("Here we should turn synchronous replication on.") + if self._db.change_replication_to_sync_host(holder_fqdn): + logging.info('Turned synchronous replication ON.') + + def change_replication_to_async(self): + logging.warning("We should kill synchronous replication here.") + # + # We need to reset `sync` state of replication in `replics_info` + # node in zk before killing synchronous replication here. + # We have race condition between the moment of turning off sync + # replication and the moment of delivering this information to zk. + # (I.e. `change_replication_type` here and `write_host_stat` with + # actual async status in next iteration). + # If connection between primary (we here) and zookeeper will be lost + # then current sync replica will think that it is actual sync and + # will decide that it can promote, but actually status is async. + # To prevent this we rewrite replication status of sync replica + # in zk to async. + # + if not self._reset_sync_replication_in_zk(): + logging.warning('Unable to reset replication status to async in ZK') + logging.warning('Killing synchronous replication is impossible') + return False + if self._db.change_replication_to_async(): + logging.info('Turned synchronous replication OFF.') + return True + return False + + def enter_sync_group(self, replica_infos): + sync_replica_lock_holder = self._zk.get_current_lock_holder(self._zk.SYNC_REPLICA_LOCK_PATH) + if sync_replica_lock_holder is None: + self._zk.acquire_lock(self._zk.SYNC_REPLICA_LOCK_PATH) + return None + + if sync_replica_lock_holder == helpers.get_hostname(): + other = self._zk.get_lock_contenders(self._zk.SYNC_REPLICA_LOCK_PATH) + if len(other) > 1: + logging.info( + 'We are holding sync_replica lock in ZK ' + 'but %s is alive and has higher priority. ' + 'Releasing sync_replica lock.' % other[1] + ) + self._zk.release_lock(self._zk.SYNC_REPLICA_LOCK_PATH) + + if self._check_if_we_are_priority_replica(replica_infos, sync_replica_lock_holder): + logging.info('We have higher priority than current synchronous replica. Trying to acquire the lock.') + self._zk.acquire_lock(self._zk.SYNC_REPLICA_LOCK_PATH, allow_queue=True) + + def leave_sync_group(self): + self._zk.release_if_hold(self._zk.SYNC_REPLICA_LOCK_PATH) + + def is_promote_safe(self, host_group, replica_infos): + sync_replica = self.get_ensured_sync_replica(replica_infos) + logging.info(f'sync replica is {sync_replica}') + return sync_replica in host_group + + def get_ensured_sync_replica(self, replica_infos): + app_name_map = {helpers.app_name_from_fqdn(host): host for host in self._zk.get_ha_hosts()} + for replica in replica_infos: + if replica['sync_state'] == 'sync': + return app_name_map.get(replica['application_name']) + return None + + def _check_if_we_are_priority_replica(self, replica_infos, sync_replica_lock_holder): + """ + Check if we are asynchronous replica and we have higher priority than + current synchronous replica. 
+ """ + prefix = self._zk.MEMBERS_PATH + my_hostname = helpers.get_hostname() + my_app_name = helpers.app_name_from_fqdn(my_hostname) + if sync_replica_lock_holder is None: + return False + + for replica in replica_infos: + if replica['application_name'] != my_app_name: + continue + if replica['sync_state'] != 'async': + return False + + my_priority = self._config.getint('global', 'priority') + sync_priority = self._zk.get(f'{prefix}/{sync_replica_lock_holder}/prio', preproc=int) + if sync_priority is None: + sync_priority = 0 + if my_priority > sync_priority: + return True + + return False + + def _reset_sync_replication_in_zk(self): + """ + This is ugly hack to prevent race condition between 2 moments: + 1. Actual replication status in PostgreSQL became `async` + 2. Information about this will be appear in zookeeper. + We need to reset `sync` replication status in replics_info + """ + replics_info = self._zk.get(self._zk.REPLICS_INFO_PATH, preproc=json.loads) + if replics_info is None: + return False + for replica in replics_info: + if replica['sync_state'] == 'sync': + replica['sync_state'] = 'async' + return self._zk.write(self._zk.REPLICS_INFO_PATH, replics_info, preproc=json.dumps) + + +class QuorumReplicationManager: + def __init__(self, config, db, _zk): + self._config = config + self._db = db + self._zk = _zk + self._zk_fail_timestamp = None + + def drop_zk_fail_timestamp(self): + """ + Reset fail timestamp flag + """ + self._zk_fail_timestamp = None + + def should_close(self): + """ + Check if we are safe to stay open on zk conn loss + """ + try: + if self._zk_fail_timestamp is None: + self._zk_fail_timestamp = time.time() + info = self._db.get_replics_info(self._db.role) + should_wait = False + for replica in info: + if replica['backend_start_ts'] < self._zk_fail_timestamp: + should_wait = True + self._db.terminate_backend(replica['pid']) + if should_wait: + time.sleep(self._config.getfloat('replica', 'primary_unavailability_timeout')) + info = self._db.get_replics_info(self._db.role) + + connected = sum([1 for x in info if x['sync_state'] == 'quorum']) + repl_state = self._db.get_replication_state() + if repl_state[0] == 'async': + return False + elif repl_state[0] == 'sync': + expected = int(repl_state[1].split('(')[0].split(' ')[1]) + logging.info( + 'Probably connect to ZK lost, check the need to close. ' + 'Expected replicas num: %s, connected replicas(quorum) num %s', + expected, + connected, + ) + return connected < expected + else: + raise RuntimeError(f'Unexpected replication state: {repl_state}') + except Exception as exc: + logging.error('Error while checking for close conditions: %s', repr(exc)) + return True + + def init_zk(self): + if not self._zk.ensure_path(self._zk.QUORUM_PATH): + logging.error("Can't create quorum path in ZK") + return False + return True + + def update_replication_type(self, db_state, ha_replics): + """ + Change replication (if we should). 
+ """ + current = self._db.get_replication_state() + logging.info('Current replication type is %s.', current) + needed = _get_needed_replication_type(self._config, self._db, db_state, ha_replics) + logging.info('Needed replication type is %s.', needed) + + if needed == 'async': + if current[0] == 'async': + logging.debug('We should not change replication type here.') + return + self._zk.write(self._zk.QUORUM_PATH, [], preproc=json.dumps) + self.change_replication_to_async() + else: # needed == 'sync' + if current[0] == 'async': + logging.info("Here we should turn synchronous replication on.") + quorum_hosts = self._zk.get_sync_quorum_hosts() + logging.info(f'Quorum hosts will be: {quorum_hosts}') + if not quorum_hosts: + logging.error('No quorum: Not doing anything.') + return + quorum = self._zk.get(self._zk.QUORUM_PATH, preproc=helpers.load_json_or_default) + if quorum is None: + quorum = [] + if set(quorum_hosts) == set(quorum) and current[0] != 'async': + return + if self._db.change_replication_to_quorum(quorum_hosts): + self._zk.write(self._zk.QUORUM_PATH, quorum_hosts, preproc=json.dumps) + logging.info('Turned synchronous replication ON.') + + def change_replication_to_async(self): + self._zk.write(self._zk.QUORUM_PATH, [], preproc=json.dumps) + logging.warning("We should kill synchronous replication here.") + if self._db.change_replication_to_async(): + logging.info('Turned synchronous replication OFF.') + return True + return False + + def enter_sync_group(self, **_kwargs): + self._zk.acquire_lock(self._zk.get_host_quorum_path()) + + def leave_sync_group(self): + self._zk.release_if_hold(self._zk.get_host_quorum_path()) + + def is_promote_safe(self, host_group, **kwargs): + sync_quorum = self._zk.get(self._zk.QUORUM_PATH, preproc=helpers.load_json_or_default) + alive_replics = helpers.make_current_replics_quorum(kwargs['replica_infos'], host_group) + logging.info('Sync quorum was: %s', sync_quorum) + logging.info('Alive hosts was: %s', host_group) + logging.info('Alive replics was: %s', alive_replics) + if sync_quorum is None: + sync_quorum = [] + hosts_in_quorum = len(set(sync_quorum) & alive_replics) + logging.info('%s >= %s', hosts_in_quorum, len(sync_quorum) // 2 + 1) + return hosts_in_quorum >= len(sync_quorum) // 2 + 1 + + def get_ensured_sync_replica(self, replica_infos): + quorum = self._zk.get(self._zk.QUORUM_PATH, preproc=helpers.load_json_or_default) + if quorum is None: + quorum = [] + sync_quorum = {helpers.app_name_from_fqdn(host): host for host in quorum} + quorum_info = [info for info in replica_infos if info['application_name'] in sync_quorum] + return sync_quorum.get(helpers.get_oldest_replica(quorum_info)) + + +def _get_needed_replication_type(config, db, db_state, ha_replics): + """ + return replication type we should set at this moment + """ + # Number of alive-and-well replica instances + streaming_replicas = {i['application_name'] for i in db_state['replics_info'] if i['state'] == 'streaming'} + replics_number = len(streaming_replicas & {helpers.app_name_from_fqdn(host) for host in ha_replics}) + + metric = config.get('primary', 'change_replication_metric') + logging.info(f"Check needed repl type: Metric is {metric}, replics_number is {replics_number}.") + + if 'count' in metric: + if replics_number == 0: + return 'async' + + if 'time' in metric: + current_day = time.localtime().tm_wday + current_hour = time.localtime().tm_hour + key = 'end' if current_day in (5, 6) else 'day' + sync_hours = config.get('primary', 'week%s_change_hours' % key) + + start, stop 
= [int(i) for i in sync_hours.split('-')] + if not start <= current_hour <= stop: + return 'sync' + + if 'load' in metric: + over = config.getfloat('primary', 'overload_sessions_ratio') + try: + ratio = float(db.get_sessions_ratio()) + except Exception: + ratio = 0.0 + if ratio >= over: + return 'async' + + return 'sync' diff --git a/src/sdnotify.py b/src/sdnotify.py new file mode 100644 index 0000000..034171c --- /dev/null +++ b/src/sdnotify.py @@ -0,0 +1,62 @@ +import os +import socket + + +class Notifier: + def __init__(self, debug=False): + """Instantiate a new notifier object. This will initiate a connection + to the systemd notification socket. + Normally this method silently ignores exceptions (for example, if the + systemd notification socket is not available) to allow applications to + function on non-systemd based systems. However, setting debug=True will + cause this method to raise any exceptions generated to the caller, to + aid in debugging. + """ + self.debug = debug + try: + self.socket = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM) + address = os.getenv('NOTIFY_SOCKET') + if address[0] == '@': + address = '\0' + address[1:] + self.socket.connect(address) + except Exception: + self.socket = None + if self.debug: + raise + + def _send(self, msg): + """Send string `msg` as bytes on the notification socket""" + if self.enabled(): + try: + self.socket.sendall(msg.encode()) + except Exception: + if self.debug: + raise + + def enabled(self): + """Return a boolean stating whether watchdog is enabled""" + return bool(self.socket) + + def ready(self): + """Report ready service state, i.e. completed initialisation""" + self._send("READY=1\n") + + def status(self, msg): + """Set a service status message""" + self._send("STATUS=%s\n" % (msg,)) + + def notify(self): + """Report a healthy service state""" + self._send("WATCHDOG=1\n") + + def notify_error(self, msg=None): + """ + Report a watchdog error. This program will likely be killed by the + service manager. + If `msg` is not None, it will be reported as an error message to the + service manager. + """ + if msg: + self.status(msg) + + self._send("WATCHDOG=trigger\n") diff --git a/src/utils.py b/src/utils.py new file mode 100644 index 0000000..d518b91 --- /dev/null +++ b/src/utils.py @@ -0,0 +1,250 @@ +""" +Utility functions for various tasks like switchover, ZK init, etc +""" +# encoding: utf-8 + +import copy +import json +import logging +import time +from operator import itemgetter + +from . import read_config, zk +from .exceptions import SwitchoverException +from .helpers import app_name_from_fqdn + + +class Switchover: + """ + 1. Collect coordinates of the systems being switched over + 2. Check if there is already a switchover in progress. If there is, + signal its state and coordinates in log. + 3. Initiate switchover. + 4. in blocking mode, attach to ZK and wait for changes in state (either fail + or success.) + 5. If not in progress, initate. If nonblocking mode is enabled, return. + """ + + def __init__( + self, + conf=None, + primary=None, + syncrep=None, + timeline=None, + new_primary=None, + timeout=60, + config_path='/etc/pgconsul.conf', + ): + """ + Define configuration of the switchover: if None, then autodetect from + ZK. + """ + self.timeout = timeout + self._log = logging.getLogger('switchover') + # Might be useful to read from default config in case the class is being + # called from outside of the CLI utility. 
+ if conf is None: + conf = read_config({'config_file': config_path}) + self._conf = conf + self._zk = zk.Zookeeper(config=conf, plugins=None) + # If primary or syncrep or timeline is provided, use them instead. + # Autodetect (from ZK) if none. + self._new_primary = new_primary + self._plan = self._get_lock_owners(primary, syncrep, timeline) + + def is_possible(self): + """ + Check, whether it's possible to perform switchover now. + """ + if self.in_progress(): + logging.error('Switchover is already in progress: %s', self.state()) + return False + if self._new_primary is not None: + is_alive = self._zk.is_host_alive(self._new_primary, self.timeout / 2) + if not is_alive: + logging.error('Cannot promote dead host: %s', self._new_primary) + return False + is_ha = self._is_ha(self._new_primary) + if not is_ha: + logging.error('Cannot promote non ha host: %s', self._new_primary) + return False + else: + replicas_info = self._zk.get(self._zk.REPLICS_INFO_PATH, preproc=json.loads) + if replicas_info: + connected_app_names = set(map(itemgetter('application_name'), replicas_info)) + ha_hosts = self._zk.get_ha_hosts() + replicas = {host: app_name_from_fqdn(host) for host in ha_hosts} + + for replica, app_name in replicas.items(): + if self._zk.is_host_alive(replica, 1) and app_name in connected_app_names: + # Ok, there is a suitable candidate for switchover + return True + logging.error('Cannot promote because there are no suitable replica for switchover.') + return False + return True + + def perform(self, min_replicas, block=True, timeout=None): + """ + Perform the actual switchover. + """ + min_replicas = min(min_replicas, len(self._zk.get_alive_hosts(1)) - 1) + if timeout is None: + timeout = self.timeout + self._initiate_switchover( + primary=self._plan['primary'], timeline=self._plan['timeline'], new_primary=self._new_primary + ) + if not block: + return True + limit = timeout + while self.in_progress(): + self._log.debug('current switchover status: %(progress)s, failover: %(failover)s', self.state()) + if limit <= 0: + raise SwitchoverException(f'timeout exceeded, current status: {self.in_progress()}') + time.sleep(1) + limit -= 1 + self._wait_for_primary() + state = self.state() + self._log.debug('full state: %s', state) + self._wait_for_replicas(min_replicas) + # We delete all zk states after switchover complete + self._log.info('switchover finished, zk status "%(progress)s"', state) + result = state['progress'] is None + return result + + def in_progress(self, primary=None, timeline=None): + """ + Return True if the cluster is currently in the process of switching + over. + Optionally check for specific hostname being currently the primary + and having a particular timeline. + """ + state = self.state() + # Check if cluster is in process of switching over + if state['progress'] in ('failed', None): + return False + # The constraint, if specified, must match for this function to return + # True (actual state) + conditions = [ + primary is None or primary == state['info'].get('primary'), + timeline is None or timeline == state['info'].get(self._zk.TIMELINE_INFO_PATH), + self._zk.get_current_lock_holder(self._zk.SWITCHOVER_LOCK_PATH) is not None, + ] + if all(conditions): + return state['progress'] + return False + + def state(self): + """ + Current cluster state. 
+ """ + return { + 'progress': self._zk.noexcept_get(self._zk.SWITCHOVER_STATE_PATH), + 'info': self._zk.noexcept_get(self._zk.SWITCHOVER_PRIMARY_PATH, preproc=json.loads) or {}, + 'failover': self._zk.noexcept_get(self._zk.FAILOVER_INFO_PATH), + 'replicas': self._zk.noexcept_get(self._zk.REPLICS_INFO_PATH, preproc=json.loads) or {}, + } + + def plan(self): + """ + Get switchover plan + """ + return copy.deepcopy(self._plan) + + def _get_lock_owners(self, primary=None, syncrep=None, timeline=None): + """ + Get leader and syncreplica lock owners, and timeline. + """ + owners = { + 'primary': primary or self._zk.get_current_lock_holder(self._zk.PRIMARY_LOCK_PATH), + 'sync_replica': syncrep or self._zk.get_current_lock_holder(self._zk.SYNC_REPLICA_LOCK_PATH), + 'timeline': timeline or self._zk.noexcept_get(self._zk.TIMELINE_INFO_PATH, preproc=int), + } + self._log.debug('lock holders: %s', owners) + return owners + + def reset(self, force=False): + """ + Reset state and hostname-timeline + """ + self._log.info('resetting ZK switchover nodes') + if not force and self.in_progress(): + raise SwitchoverException('attempted to reset state while switchover is in progress') + self._lock(self._zk.SWITCHOVER_LOCK_PATH) + if not self._zk.noexcept_write(self._zk.SWITCHOVER_PRIMARY_PATH, '{}', need_lock=False): + raise SwitchoverException(f'unable to reset node {self._zk.SWITCHOVER_PRIMARY_PATH}') + if not self._zk.write(self._zk.SWITCHOVER_STATE_PATH, 'failed', need_lock=False): + raise SwitchoverException(f'unable to reset node {self._zk.SWITCHOVER_STATE_PATH}') + return True + + def _is_ha(self, hostname): + """ + Checks whether given host is ha replica. + """ + ha_path = f'{self._zk.MEMBERS_PATH}/{hostname}/ha' + return self._zk.exists_path(ha_path) + + def _lock(self, node): + """ + Lock switchover structure in ZK + """ + if not self._zk.ensure_path(node): + raise SwitchoverException(f'unable to create switchover node ({node})') + if not self._zk.try_acquire_lock(lock_type=node, allow_queue=True, timeout=self.timeout): + raise SwitchoverException(f'unable to lock switchover node ({node})') + + def _initiate_switchover(self, primary, timeline, new_primary): + """ + Write primary coordinates and 'scheduled' into state node to + initiate switchover. + 1. Lock the hostname-timeline json node. + 2. Set hostname, timeline and destination. + 3. 
Set state to 'scheduled' + """ + switchover_task = { + 'hostname': primary, + self._zk.TIMELINE_INFO_PATH: timeline, + 'destination': new_primary, + } + self._log.info('initiating switchover with %s', switchover_task) + self._lock(self._zk.SWITCHOVER_LOCK_PATH) + if not self._zk.write(self._zk.SWITCHOVER_PRIMARY_PATH, switchover_task, preproc=json.dumps, need_lock=False): + raise SwitchoverException(f'unable to write to {self._zk.SWITCHOVER_PRIMARY_PATH}') + if not self._zk.write(self._zk.SWITCHOVER_STATE_PATH, 'scheduled', need_lock=False): + raise SwitchoverException(f'unable to write to {self._zk.SWITCHOVER_STATE_PATH}') + self._log.debug('state: %s', self.state()) + + def _wait_for_replicas(self, min_replicas, timeout=None): + """ + Wait for replicas to appear + """ + if timeout is None: + timeout = self.timeout + self._log.debug('waiting for replicas to appear...') + for _ in range(timeout): + time.sleep(1) + replicas = [ + f'{x["application_name"]}@{x["primary_location"]}' + for x in self.state()['replicas'] + if x['state'] == 'streaming' + ] + self._log.debug('replicas up: %s', (', '.join(replicas) if replicas else 'none')) + if len(replicas) >= min_replicas: + return replicas + raise SwitchoverException( + f'expected {min_replicas} replicas to appear within {timeout} secs, got {len(self.state()["replicas"])}' + ) + + def _wait_for_primary(self, timeout=None): + """ + Wait for primary to hold the lock + """ + if timeout is None: + timeout = self.timeout + for _ in range(timeout): + time.sleep(1) + holder = self._zk.get_current_lock_holder(self._zk.PRIMARY_LOCK_PATH) + if holder is not None and holder != self._plan['primary']: + self._log.info('primary is now %s', holder) + return holder + self._log.debug('current holder %s, waiting for new primary to acquire lock...', holder) + raise SwitchoverException(f'no one took primary lock in {timeout} secs') diff --git a/src/yapf_check.py b/src/yapf_check.py new file mode 100644 index 0000000..1b38c4c --- /dev/null +++ b/src/yapf_check.py @@ -0,0 +1,14 @@ +#!/usr/bin/env python +""" +Run yapf on code and check that diff is empty +""" + +import subprocess +import sys + + +OUT = subprocess.check_output(['yapf', sys.argv[1], '-rpd']) + +if len(OUT.decode('utf-8').splitlines()) > 1: + print(OUT.decode('utf-8')) + sys.exit(1) diff --git a/src/zk.py b/src/zk.py new file mode 100644 index 0000000..abcd3ad --- /dev/null +++ b/src/zk.py @@ -0,0 +1,571 @@ +# encoding: utf-8 +""" +Zookeeper wrapper module. Zookeeper class defined here. +""" + +import json +import logging +import os +import traceback +import time + +from kazoo.client import KazooClient, KazooState +from kazoo.exceptions import LockTimeout, NoNodeError, KazooException, ConnectionClosedError +from kazoo.handlers.threading import KazooTimeoutError, SequentialThreadingHandler +from kazoo.security import make_digest_acl + +from . 
import helpers + + +def _get_host_path(path, hostname): + if hostname is None: + hostname = helpers.get_hostname() + return path % hostname + + +class ZookeeperException(Exception): + """Exception for wrapping all zookeeper connector inner exceptions""" + + +class Zookeeper(object): + """ + Zookeeper class + """ + + PRIMARY_LOCK_PATH = 'leader' + PRIMARY_SWITCH_LOCK_PATH = 'remaster' + SYNC_REPLICA_LOCK_PATH = 'sync_replica' + + QUORUM_PATH = 'quorum' + QUORUM_MEMBER_LOCK_PATH = f'{QUORUM_PATH}/members/%s' + + REPLICS_INFO_PATH = 'replics_info' + TIMELINE_INFO_PATH = 'timeline' + FAILOVER_INFO_PATH = 'failover_state' + FAILOVER_MUST_BE_RESET = 'failover_must_be_reset' + CURRENT_PROMOTING_HOST = 'current_promoting_host' + LAST_FAILOVER_TIME_PATH = 'last_failover_time' + LAST_PRIMARY_AVAILABILITY_TIME = 'last_master_activity_time' + LAST_SWITCHOVER_TIME_PATH = 'last_switchover_time' + SWITCHOVER_ROOT_PATH = 'switchover' + SWITCHOVER_LOCK_PATH = f'{SWITCHOVER_ROOT_PATH}/lock' + SWITCHOVER_LSN_PATH = f'{SWITCHOVER_ROOT_PATH}/lsn' + # A JSON string with primary fqmdn and its timeline + SWITCHOVER_PRIMARY_PATH = f'{SWITCHOVER_ROOT_PATH}/master' + # A simple string with current scheduled switchover state + SWITCHOVER_STATE_PATH = f'{SWITCHOVER_ROOT_PATH}/state' + MAINTENANCE_PATH = 'maintenance' + MAINTENANCE_TIME_PATH = f'{MAINTENANCE_PATH}/ts' + MAINTENANCE_PRIMARY_PATH = f'{MAINTENANCE_PATH}/master' + HOST_MAINTENANCE_PATH = f'{MAINTENANCE_PATH}/%s' + HOST_ALIVE_LOCK_PATH = 'alive/%s' + + SINGLE_NODE_PATH = 'is_single_node' + + ELECTION_ENTER_LOCK_PATH = 'enter_election' + ELECTION_MANAGER_LOCK_PATH = 'epoch_manager' + ELECTION_WINNER_PATH = 'election_winner' + ELECTION_STATUS_PATH = 'election_status' + ELECTION_VOTE_PATH = 'election_vote/%s' + + MEMBERS_PATH = 'all_hosts' + SIMPLE_PRIMARY_SWITCH_TRY_PATH = f'{MEMBERS_PATH}/%s/tried_remaster' + HOST_PRIO_PATH = f'{MEMBERS_PATH}/%s/prio' + + def __init__(self, config, plugins): + self._plugins = plugins + self._zk_hosts = config.get('global', 'zk_hosts') + self._timeout = config.getfloat('global', 'iteration_timeout') + self._zk_connect_max_delay = config.getfloat('global', 'zk_connect_max_delay') + self._zk_auth = config.getboolean('global', 'zk_auth') + self._zk_ssl = config.getboolean('global', 'zk_ssl') + self._verify_certs = config.getboolean('global', 'verify_certs') + if self._zk_auth: + self._zk_username = config.get('global', 'zk_username') + self._zk_password = config.get('global', 'zk_password') + if not self._zk_username or not self._zk_password: + logging.error('zk_username, zk_password required when zk_auth enabled') + if self._zk_ssl: + self._cert = config.get('global', 'certfile') + self._key = config.get('global', 'keyfile') + self._ca = config.get('global', 'ca_cert') + if not self._cert or not self._key or not self._ca: + logging.error('certfile, keyfile, ca_cert required when zk_auth enabled') + try: + self._locks = {} + prefix = config.get('global', 'zk_lockpath_prefix') + self._path_prefix = prefix if prefix is not None else helpers.get_lockpath_prefix() + self._lockpath = self._path_prefix + self.PRIMARY_LOCK_PATH + + self._create_kazoo_client() + event = self._zk.start_async() + event.wait(self._timeout) + if not self._zk.connected: + raise Exception('Could not connect to ZK.') + self._zk.add_listener(self._listener) + self._init_lock(self.PRIMARY_LOCK_PATH) + except Exception: + for line in traceback.format_exc().split('\n'): + logging.error(line.rstrip()) + + def __del__(self): + 
self._zk.remove_listener(self._listener) + self._zk.stop() + + def _create_kazoo_client(self): + conn_retry_options = {'max_tries': 10, 'delay': 0.5, 'backoff': 1.5, 'max_delay': self._zk_connect_max_delay} + command_retry_options = {'max_tries': 0, 'delay': 0, 'backoff': 1, 'max_delay': 5} + args = { + 'hosts': self._zk_hosts, + 'handler': SequentialThreadingHandler(), + 'timeout': self._timeout, + 'connection_retry': conn_retry_options, + 'command_retry': command_retry_options, + } + if self._zk_auth: + acl = make_digest_acl(self._zk_username, self._zk_password, all=True) + args.update( + { + 'default_acl': [acl], + 'auth_data': [ + ( + 'digest', + '{username}:{password}'.format(username=self._zk_username, password=self._zk_password), + ) + ], + } + ) + if self._zk_ssl: + args.update( + { + 'use_ssl': True, + 'certfile': self._cert, + 'keyfile': self._key, + 'ca': self._ca, + 'verify_certs': self._verify_certs, + } + ) + self._zk = KazooClient(**args) + + def _listener(self, state): + if state == KazooState.LOST: + # In the event that a LOST state occurs, its certain that the lock and/or the lease has been lost. + logging.error("Connection to ZK lost, clean all locks") + self._locks = {} + self._plugins.run('on_lost') + elif state == KazooState.SUSPENDED: + logging.warning("Being disconnected from ZK.") + self._plugins.run('on_suspend') + elif state == KazooState.CONNECTED: + logging.info("Reconnected to ZK.") + self._plugins.run('on_connect') + + def _wait(self, event): + event.wait(self._timeout) + + def _get(self, path): + event = self._zk.get_async(path) + self._wait(event) + return event.get_nowait() + + # + # We assume data is already converted to text. + # + def _write(self, path, data, need_lock=True): + if need_lock and self.get_current_lock_holder() != helpers.get_hostname(): + return False + event = self._zk.exists_async(path) + self._wait(event) + if event.get_nowait(): # Node exists + event = self._zk.set_async(path, data.encode()) + else: + event = self._zk.create_async(path, value=data.encode()) + self._wait(event) + if event.exception: + logging.error('Failed to write to node: %s.' % path) + logging.error(event.exception) + return not event.exception + + def _init_lock(self, name): + path = self._path_prefix + name + self._locks[name] = self._zk.Lock(path, helpers.get_hostname()) + + def _acquire_lock(self, name, allow_queue, timeout): + if timeout is None: + timeout = self._timeout + if self._zk.state != KazooState.CONNECTED: + logging.warning('Not able to acquire %s ' % name + 'lock without alive connection.') + return False + if name in self._locks: + lock = self._locks[name] + else: + logging.debug('No lock instance for %s. 
Creating one.', name) + self._init_lock(name) + lock = self._locks[name] + contenders = lock.contenders() + if len(contenders) != 0: + if contenders[0] == helpers.get_hostname(): + logging.debug('We already hold the %s lock.', name) + return True + if not allow_queue: + logging.warning('%s lock is already taken by %s.', name[0].upper() + name[1:], contenders[0]) + return False + try: + return lock.acquire(blocking=True, timeout=timeout) + except LockTimeout: + logging.warning('Unable to obtain lock %s within timeout (%s s)', name, timeout) + return False + except Exception: + for line in traceback.format_exc().split('\n'): + logging.error(line.rstrip()) + return False + + def _release_lock(self, name): + if name in self._locks: + return self._locks[name].release() + + def is_alive(self): + """ + Return True if we are connected to zk + """ + if self._zk.state == KazooState.CONNECTED: + return True + return False + + def reconnect(self): + """ + Reconnect to zk + """ + try: + for lock in self._locks.items(): + if lock[1]: + lock[1].release() + except (KazooException, KazooTimeoutError): + pass + + try: + self._locks = {} + self._zk.stop() + self._zk.close() + self._create_kazoo_client() + event = self._zk.start_async() + event.wait(self._timeout) + if not self._zk.connected: + return False + + self._zk.add_listener(self._listener) + self._init_lock(self.PRIMARY_LOCK_PATH) + return self.is_alive() + except Exception: + for line in traceback.format_exc().split('\n'): + logging.error(line.rstrip()) + return False + + def get(self, key, preproc=None): + """ + Get key value from zk + """ + path = self._path_prefix + key + try: + res = self._get(path) + except NoNodeError: + return None + except (KazooException, KazooTimeoutError) as exception: + raise ZookeeperException(exception) + value = res[0].decode('utf-8') + if preproc: + try: + return preproc(value) + except ValueError: + return None + else: + return value + + @helpers.return_none_on_error + def noexcept_get(self, key, preproc=None): + """ + Get key value from zk, without ZK exception forwarding + """ + return self.get(key, preproc) + + @helpers.return_none_on_error + def get_mtime(self, key): + """ + Returns modification time of ZK node + """ + return getattr(self._get_meta(key), 'last_modified', None) + + def _get_meta(self, key): + """ + Get metadata from key. 
+ returns kazoo.protocol.states.ZnodeStat + """ + path = self._path_prefix + key + try: + (_, meta) = self._get(path) + except NoNodeError: + return None + except Exception: + for line in traceback.format_exc().split('\n'): + logging.error(line.rstrip()) + return None + else: + return meta + + def ensure_path(self, path): + """ + Check that path exists and create if not + """ + if not path.startswith(self._path_prefix): + path = os.path.join(self._path_prefix, path) + event = self._zk.ensure_path_async(path) + try: + return event.get(timeout=self._timeout) + except (KazooException, KazooTimeoutError): + for line in traceback.format_exc().split('\n'): + logging.error(line.rstrip()) + return None + + def exists_path(self, path): + if not path.startswith(self._path_prefix): + path = os.path.join(self._path_prefix, path) + event = self._zk.exists_async(path) + try: + self._wait(event) + except (KazooException, KazooTimeoutError): + for line in traceback.format_exc().split('\n'): + logging.error(line.rstrip()) + return False + else: + return bool(event.get_nowait()) + + def get_children(self, path): + """ + Get children nodes of path + """ + try: + if not path.startswith(self._path_prefix): + path = os.path.join(self._path_prefix, path) + event = self._zk.get_children_async(path) + self._wait(event) + return event.get_nowait() + except NoNodeError: + for line in traceback.format_exc().split('\n'): + logging.debug(line.rstrip()) + return None + except Exception: + for line in traceback.format_exc().split('\n'): + logging.error(line.rstrip()) + return None + + def get_state(self): + """ + Get current zk state (if possible) + """ + data = {'alive': self.is_alive()} + if not data['alive']: + raise ZookeeperException("Zookeeper connection is unavailable now") + data[self.REPLICS_INFO_PATH] = self.get(self.REPLICS_INFO_PATH, preproc=json.loads) + data[self.LAST_FAILOVER_TIME_PATH] = self.get(self.LAST_FAILOVER_TIME_PATH, preproc=float) + data[self.FAILOVER_INFO_PATH] = self.get(self.FAILOVER_INFO_PATH) + data[self.FAILOVER_MUST_BE_RESET] = self.exists_path(self.FAILOVER_MUST_BE_RESET) + data[self.CURRENT_PROMOTING_HOST] = self.get(self.CURRENT_PROMOTING_HOST) + data['lock_version'] = self.get_current_lock_version() + data['lock_holder'] = self.get_current_lock_holder() + data['single_node'] = self.exists_path(self.SINGLE_NODE_PATH) + data[self.TIMELINE_INFO_PATH] = self.get(self.TIMELINE_INFO_PATH, preproc=int) + data[self.SWITCHOVER_ROOT_PATH] = self.get(self.SWITCHOVER_PRIMARY_PATH, preproc=json.loads) + data[self.MAINTENANCE_PATH] = { + 'status': self.get(self.MAINTENANCE_PATH), + 'ts': self.get(self.MAINTENANCE_TIME_PATH), + } + + data['alive'] = self.is_alive() + if not data['alive']: + raise ZookeeperException("Zookeeper connection is unavailable now") + return data + + def _preproc_write(self, key, data, preproc): + path = self._path_prefix + key + if preproc: + sdata = preproc(data) + else: + sdata = str(data) + return path, sdata + + def write(self, key, data, preproc=None, need_lock=True): + """ + Write value to key in zk + """ + path, sdata = self._preproc_write(key, data, preproc) + try: + return self._write(path, sdata, need_lock=need_lock) + except (KazooException, KazooTimeoutError) as exception: + raise ZookeeperException(exception) + + def noexcept_write(self, key, data, preproc=None, need_lock=True): + """ + Write value to key in zk without zk exceptions forwarding + """ + path, sdata = self._preproc_write(key, data, preproc) + try: + return self._write(path, sdata, 
need_lock=need_lock) + except Exception: + for line in traceback.format_exc().split('\n'): + logging.error(line.rstrip()) + return False + + def delete(self, key, recursive=False): + """ + Delete key from zk + """ + path = self._path_prefix + key + try: + self._zk.delete(path, recursive=recursive) + return True + except NoNodeError: + logging.info('No node %s was found in ZK to delete it.' % key) + return True + except Exception: + for line in traceback.format_exc().split('\n'): + logging.error(line.rstrip()) + return False + + def get_current_lock_version(self): + """ + Get current leader lock version + """ + children = self.get_children(self._lockpath) + if children and len(children) > 0: + return min([i.split('__')[-1] for i in children]) + return None + + def get_lock_contenders(self, name, catch_except=True): + """ + Get a list of all hostnames that are competing for the lock, + including the holder. + """ + try: + if name not in self._locks: + self._init_lock(name) + contenders = self._locks[name].contenders() + if len(contenders) > 0: + return contenders + except Exception as e: + for line in traceback.format_exc().split('\n'): + logging.debug(line.rstrip()) + if not catch_except: + raise e + return [] + + def get_current_lock_holder(self, name=None, catch_except=True): + """ + Get hostname of lock holder + """ + name = name or self.PRIMARY_LOCK_PATH + lock_contenders = self.get_lock_contenders(name, catch_except) + if len(lock_contenders) > 0: + return lock_contenders[0] + else: + return None + + def acquire_lock(self, lock_type, allow_queue=False, timeout=None): + result = self._acquire_lock(lock_type, allow_queue, timeout) + if not result: + raise ZookeeperException(f'Failed to acquire lock {lock_type}') + logging.debug(f'Success acquire lock: {lock_type}') + + def try_acquire_lock(self, lock_type=None, allow_queue=False, timeout=None): + """ + Acquire lock (leader by default) + """ + lock_type = lock_type or self.PRIMARY_LOCK_PATH + return self._acquire_lock(lock_type, allow_queue, timeout) + + def release_lock(self, lock_type=None, wait=0): + """ + Release lock (leader by default) + """ + lock_type = lock_type or self.PRIMARY_LOCK_PATH + # If caller decides to rely on kazoo internal API, + # release the lock and return immediately. + if not wait: + return self._release_lock(lock_type) + + # Otherwise, make sure the lock is actually released. 
+ hostname = helpers.get_hostname() + for _ in range(wait): + try: + self._release_lock(lock_type) + holder = self.get_current_lock_holder(name=lock_type) + if holder != hostname: + return True + except ConnectionClosedError: + # ok, shit happens, now we should reconnect to ensure that we actually released the lock + self.reconnect() + logging.warning('Unable to release lock "%s", retrying', lock_type) + time.sleep(1) + raise RuntimeError('unable to release lock after %i attempts' % wait) + + def release_if_hold(self, lock_type, wait=0): + holder = self.get_current_lock_holder(lock_type) + if holder != helpers.get_hostname(): + return True + return self.release_lock(lock_type, wait) + + def get_host_alive_lock_path(self, hostname=None): + return _get_host_path(self.HOST_ALIVE_LOCK_PATH, hostname) + + def get_host_maintenance_path(self, hostname=None): + return _get_host_path(self.HOST_MAINTENANCE_PATH, hostname) + + def get_host_quorum_path(self, hostname=None): + return _get_host_path(self.QUORUM_MEMBER_LOCK_PATH, hostname) + + def get_host_prio_path(self, hostname=None): + return _get_host_path(self.HOST_PRIO_PATH, hostname) + + def get_simple_primary_switch_try_path(self, hostname=None): + return _get_host_path(self.SIMPLE_PRIMARY_SWITCH_TRY_PATH, hostname) + + def get_election_vote_path(self, hostname=None): + if hostname is None: + hostname = helpers.get_hostname() + return self.ELECTION_VOTE_PATH % hostname + + def get_ha_hosts(self): + all_hosts = self.get_children(self.MEMBERS_PATH) + if all_hosts is None: + logging.error('Failed to get HA host list from ZK') + return None + ha_hosts = [] + for host in all_hosts: + path = f"{self.MEMBERS_PATH}/{host}/ha" + if self.exists_path(path): + ha_hosts.append(host) + logging.debug(f"HA hosts are: {ha_hosts}") + return ha_hosts + + def is_host_alive(self, hostname, timeout=0.0, catch_except=True): + alive_path = self.get_host_alive_lock_path(hostname) + return helpers.await_for( + lambda: self.get_current_lock_holder(alive_path, catch_except) is not None, timeout, f'{hostname} is alive' + ) + + def _is_host_in_sync_quorum(self, hostname): + host_quorum_path = self.get_host_quorum_path(hostname) + return self.get_current_lock_holder(host_quorum_path) is not None + + def get_sync_quorum_hosts(self): + all_hosts = self.get_children(self.MEMBERS_PATH) + if all_hosts is None: + logging.error('Failed to get HA host list from ZK') + return [] + return [host for host in all_hosts if self._is_host_in_sync_quorum(host)] + + def get_alive_hosts(self, timeout=1, catch_except=True): + ha_hosts = self.get_ha_hosts() + if ha_hosts is None: + return [] + alive_hosts = [host for host in ha_hosts if self.is_host_alive(host, timeout, catch_except)] + return alive_hosts diff --git a/static/Makefile b/static/Makefile new file mode 100644 index 0000000..80c6356 --- /dev/null +++ b/static/Makefile @@ -0,0 +1,11 @@ +install: + install -d $(DESTDIR)/etc + install -d $(DESTDIR)/etc/init.d + install -d $(DESTDIR)/etc/sudoers.d + install -d $(DESTDIR)/etc/logrotate.d + install -d $(DESTDIR)/etc/cron.d + install -d $(DESTDIR)/etc/cron.yandex + install -m755 pgconsul.init.d $(DESTDIR)/etc/init.d/pgconsul + install -m400 pgconsul.sudoers.d $(DESTDIR)/etc/sudoers.d/pgconsul + install -m644 pgconsul.logrotate $(DESTDIR)/etc/logrotate.d/pgconsul + install -m644 wd-pgconsul.cron.d $(DESTDIR)/etc/cron.d/wd-pgconsul diff --git a/static/pgconsul.init.d b/static/pgconsul.init.d new file mode 100644 index 0000000..4ee4cdc --- /dev/null +++ b/static/pgconsul.init.d @@ -0,0 +1,107 
@@ +#!/bin/sh +# +# chkconfig: 2345 99 01 +# description: pgconsul + +### BEGIN INIT INFO +# Provides: pgconsul +# Required-Start: $remote_fs $syslog +# Required-Stop: $remote_fs $syslog +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# Short-Description: Start pgconsul +# Description: Start pgconsul +### END INIT INFO + +CONFIG=/etc/pgconsul.conf +PIDFILE=$(awk '/^pid_file/ {print $NF}' $CONFIG) +STOPFILE="$(awk '/^working_dir/ {print $NF}' $CONFIG)/pgconsul.stopped" + +start() { + if status >/dev/null 2>&1 + then + echo "Already running" + return 0 + else + # Removing stale pidfile + rm -f ${PIDFILE} + + echo -n "Starting pgconsul: " + ulimit -n 1024 + mkdir -p /var/run/pgconsul/ + mkdir -p /var/log/pgconsul/ + chown -R postgres:postgres /var/run/pgconsul/ + chown -R postgres:postgres /var/log/pgconsul/ + start-stop-daemon -c postgres --exec /usr/local/bin/pgconsul --start + sleep 1 + if status >/dev/null 2>&1 + then + echo "OK." + return 0 + else + echo "FAIL" + return 1 + fi + fi +} + +stop() { + if ! status >/dev/null 2>&1 + then + echo "Already stopped" + else + echo -n "Stopping pgconsul: " + kill $(cat "$PIDFILE") + sleep 1 + if ! status >/dev/null 2>&1 + then + echo "OK." + else + echo "FAIL" + kill -9 $(cat "$PIDFILE") + echo "Killing pgconsul: OK" + fi + fi + + return 0 +} + +status() { + echo -n "pgconsul is " + if [ -f "$PIDFILE" ] + then + if kill -0 $(cat "$PIDFILE") + then + echo "running (with pid $(cat $PIDFILE))" + return 0 + else + echo "not running" + return 1 + fi + else + echo "not running" + return 1 + fi +} + +case "$1" in + start) + rm -f ${STOPFILE} && start + ;; + stop) + touch ${STOPFILE} && stop + ;; + restart) + rm -f ${STOPFILE} && stop && start + ;; + status) + status + ;; + force-reload) + stop && start + ;; + *) + echo "$(basename $0) {start|stop|status|restart|force-reload}" + exit 1 +esac +exit $? 
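The SysV init script above covers one supervision path; when pgconsul runs under systemd with a watchdog, the hook is the Notifier class added in src/sdnotify.py. A minimal sketch of how a daemon loop might drive it, assuming src/ is importable and with the loop body elided (neither is part of this patch):

    import time

    from sdnotify import Notifier  # assumed import path

    def run(iteration_timeout=1.0):
        notifier = Notifier()       # silently no-ops when NOTIFY_SOCKET is unset
        notifier.ready()            # READY=1 once initialisation has finished
        while True:
            try:
                # ... one pgconsul iteration would go here ...
                notifier.notify()   # WATCHDOG=1, pet the watchdog
                notifier.status('iteration finished')
            except Exception as exc:
                notifier.notify_error(str(exc))  # WATCHDOG=trigger, then re-raise
                raise
            time.sleep(iteration_timeout)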
diff --git a/static/pgconsul.logrotate b/static/pgconsul.logrotate new file mode 100644 index 0000000..90b69a7 --- /dev/null +++ b/static/pgconsul.logrotate @@ -0,0 +1,8 @@ +/var/log/pgconsul/pgconsul.log { + rotate 30 + daily + compress + missingok + nodateext + copytruncate +} diff --git a/static/pgconsul.sudoers.d b/static/pgconsul.sudoers.d new file mode 100644 index 0000000..9b719ba --- /dev/null +++ b/static/pgconsul.sudoers.d @@ -0,0 +1,3 @@ +postgres ALL, ALL = NOPASSWD: /bin/systemctl restart postgresql@1[0-9]-data.service +postgres ALL, ALL = NOPASSWD: /bin/systemctl start odyssey.service +postgres ALL, ALL = NOPASSWD: /bin/systemctl stop odyssey.service diff --git a/static/wd-pgconsul.cron.d b/static/wd-pgconsul.cron.d new file mode 100644 index 0000000..b730505 --- /dev/null +++ b/static/wd-pgconsul.cron.d @@ -0,0 +1,5 @@ +SHELL=/bin/sh +PATH=/bin:/sbin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/sbin +MAILTO=mail-root@yandex-team.ru + +*/1 * * * * root /etc/cron.yandex/wd_pgconsul >/dev/null 2>&1 diff --git a/tests/Dockerfile b/tests/Dockerfile new file mode 100644 index 0000000..d5e9b40 --- /dev/null +++ b/tests/Dockerfile @@ -0,0 +1,102 @@ +FROM ubuntu:bionic + +ARG pg_major +ARG confpath="tests/conf" + +ENV DEBIAN_FRONTEND noninteractive +ENV PG_MAJOR $pg_major +ENV pgconsul_CONFIG /etc/pgconsul.conf +ENV PGBOUNCER_CONFIG /etc/pgbouncer/pgbouncer.ini +ENV POSTGRESQL_DATADIR /var/lib/postgresql/$PG_MAJOR/main +ENV POSTGRESQL_CONFIG $POSTGRESQL_DATADIR/postgresql.conf +ENV POSTGRESQL_PGHBA $POSTGRESQL_DATADIR/pg_hba.conf + +ADD https://www.postgresql.org/media/keys/ACCC4CF8.asc keyring.asc +RUN echo 'APT::Install-Recommends "0"; \n\ +APT::Get::Assume-Yes "true"; \n\ +APT::Get::allow-downgrades "true"; \n\ +APT::Install-Suggests "0";' > /etc/apt/apt.conf.d/01buildconfig && \ + apt-get update && \ + apt-get install -qq --no-install-recommends gpg gpg-agent wget gnupg ca-certificates && \ + apt-key add keyring.asc + +RUN echo "deb http://apt.postgresql.org/pub/repos/apt bionic-pgdg main" > /etc/apt/sources.list.d/pgdg.list && \ + apt-get update && apt-get -y install \ + postgresql-common \ + postgresql-client-common +RUN echo "wal_level = replica" >> /etc/postgresql-common/createcluster.conf + +RUN apt-get -y install \ + git \ + pgbouncer \ + postgresql-$PG_MAJOR \ + postgresql-client-$PG_MAJOR \ + postgresql-server-dev-$PG_MAJOR \ + build-essential \ + python3-dev \ + python3-pip \ + python3-venv \ + python3-setuptools \ + libpq-dev \ + gcc \ + lsof \ + sudo \ + rsync + +RUN git clone https://github.com/g0djan/lwaldump.git lwaldump && \ + cd lwaldump && git checkout REL_13_STABLE && make -s && make -s install && cd .. + +ENV USE_PGXS=1 +RUN git clone https://github.com/man-brain/repl_mon repl_mon && \ + cd repl_mon && make -s && make -s install && cd .. 
+ +RUN pip3 install \ + git+https://github.com/Supervisor/supervisor.git@2c601dbe1a09c98446dbff404d3f046d0c6a4cc9 + +RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 1 + +COPY $confpath/postgresql.conf $POSTGRESQL_CONFIG +COPY $confpath/pg_hba.conf $POSTGRESQL_PGHBA +RUN chmod 666 $POSTGRESQL_PGHBA && chmod 666 $POSTGRESQL_CONFIG +RUN cp /etc/postgresql/$PG_MAJOR/main/pg_ident.conf $POSTGRESQL_DATADIR \ + && chown postgres:postgres $POSTGRESQL_DATADIR/pg_ident.conf +RUN ln -s /usr/lib/postgresql/$PG_MAJOR/bin /usr/bin/postgresql + +USER postgres +RUN echo "CHECKPOINT;" | /usr/bin/postgresql/postgres --single \ + -D $POSTGRESQL_DATADIR postgres +RUN mkdir /var/lib/postgresql/$PG_MAJOR/main/conf.d +USER root + +RUN mkdir /var/log/pgconsul && chown postgres:postgres /var/log/pgconsul && \ + mkdir -p /etc/pgconsul/plugins && mkdir /var/log/supervisor && \ + mkdir -p /etc/pgbouncer + +RUN echo '"postgres" ""' >> /etc/pgbouncer/userlist.txt + +COPY $confpath/pgconsul.conf $pgconsul_CONFIG +COPY $confpath/pgbouncer.ini $PGBOUNCER_CONFIG +COPY $confpath/supervisord.conf /etc/supervisor/supervisord.conf +COPY $confpath/pgbouncer.supervisor /etc/supervisor/conf.d/pgbouncer.conf +COPY $confpath/pgconsul.supervisor /etc/supervisor/conf.d/pgconsul.conf +COPY $confpath/gen_rec_conf_with_slot.sh /usr/local/bin/ +COPY $confpath/gen_rec_conf_without_slot.sh /usr/local/bin/ + +COPY $confpath/archive.passwd /etc +RUN chown postgres:postgres /etc/archive.passwd && chmod 600 /etc/archive.passwd +RUN echo "su - postgres -c '/usr/lib/postgresql/$PG_MAJOR/bin/pg_ctl -D /var/lib/postgresql/$PG_MAJOR/main/ promote'" > /usr/bin/promote > /usr/bin/promote && chmod +x /usr/bin/promote + +COPY tests/setup.sh /usr/local/bin/setup.sh + +RUN chmod +x /usr/local/bin/setup.sh && /usr/local/bin/setup.sh $PG_MAJOR + +CMD ["/usr/local/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"] + +RUN mkdir /repo +COPY ./ /repo/ +RUN cd /repo && DESTDIR=/ make install +RUN ln -s /repo/src/plugins/upload_wals.py /etc/pgconsul/plugins/upload_wals.py +RUN chown -R postgres:postgres /opt/yandex/pgconsul /etc/pgconsul/plugins /repo/src/plugins + +COPY tests/generate_certs.sh /usr/local/bin/generate_certs.sh +RUN chmod 755 /usr/local/bin/generate_certs.sh diff --git a/tests/behave.py b/tests/behave.py new file mode 100644 index 0000000..b5f75d8 --- /dev/null +++ b/tests/behave.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python3 + +import sys + +from behave.__main__ import main as behave_main + + +def main(): + sys.exit(behave_main()) + + +if __name__ == "__main__": + main() diff --git a/tests/conf/archive.passwd b/tests/conf/archive.passwd new file mode 100644 index 0000000..9f358a4 --- /dev/null +++ b/tests/conf/archive.passwd @@ -0,0 +1 @@ +123456 diff --git a/tests/conf/coverage.conf b/tests/conf/coverage.conf new file mode 100644 index 0000000..e725f9f --- /dev/null +++ b/tests/conf/coverage.conf @@ -0,0 +1,12 @@ +[run] +branch = True +data_file = ${COVERAGE_FILE} +cover_pylib = False +parallel = True +include = */pgconsul/* + +[report] +include = */pgconsul/* + +[html] +directory = /coverage/htmlcov diff --git a/tests/conf/gen_rec_conf_with_slot.sh b/tests/conf/gen_rec_conf_with_slot.sh new file mode 100755 index 0000000..4844ab7 --- /dev/null +++ b/tests/conf/gen_rec_conf_with_slot.sh @@ -0,0 +1,7 @@ +#!/bin/sh +set -e + +nm=$(hostname -f | sed -e 's/\./_/g' -e 's/\-/_/g') +echo "recovery_target_timeline = 'latest'\nprimary_conninfo = 'host=$1 application_name=$nm options=''-c 
wal_sender_timeout=30000'''\nprimary_slot_name = '$nm'" > $2 +pgdata=$(pg_lsclusters | tail -n 1 | awk '{print $6}') +touch ${pgdata}/standby.signal diff --git a/tests/conf/gen_rec_conf_without_slot.sh b/tests/conf/gen_rec_conf_without_slot.sh new file mode 100755 index 0000000..31895c7 --- /dev/null +++ b/tests/conf/gen_rec_conf_without_slot.sh @@ -0,0 +1,7 @@ +#!/bin/sh +set -e + +nm=$(hostname -f | sed -e 's/\./_/g' -e 's/\-/_/g') +echo "recovery_target_timeline = 'latest'\nprimary_conninfo = 'host=$1 application_name=$nm options=''-c wal_sender_timeout=30000'''" > $2 +pgdata=$(pg_lsclusters | tail -n 1 | awk '{print $6}') +touch ${pgdata}/standby.signal diff --git a/tests/conf/pg_hba.conf b/tests/conf/pg_hba.conf new file mode 100644 index 0000000..e4227c2 --- /dev/null +++ b/tests/conf/pg_hba.conf @@ -0,0 +1,5 @@ +local all all trust +host all all 0.0.0.0/0 trust +host all all ::/0 trust +host replication all 0.0.0.0/0 trust +host replication all ::/0 trust diff --git a/tests/conf/pgbouncer.ini b/tests/conf/pgbouncer.ini new file mode 100644 index 0000000..fd71e8b --- /dev/null +++ b/tests/conf/pgbouncer.ini @@ -0,0 +1,25 @@ +[databases] +* = host=localhost +[pgbouncer] +logfile = /var/log/postgresql/pgbouncer.log +pidfile = /var/run/postgresql/pgbouncer.pid +listen_addr = * +listen_port = 6432 +auth_type = trust +auth_file = /etc/pgbouncer/userlist.txt +admin_users = postgres +stats_users = postgres +pool_mode = session +server_reset_query = +server_reset_query_always = 0 +ignore_startup_parameters = extra_float_digits +server_check_delay = 30 +application_name_add_host = 1 +max_client_conn = 1000 +default_pool_size = 50 +min_pool_size = 0 +log_connections = 1 +log_disconnections = 1 +log_pooler_errors = 1 +server_idle_timeout = 20 +server_connect_timeout = 3 diff --git a/tests/conf/pgbouncer.supervisor b/tests/conf/pgbouncer.supervisor new file mode 100644 index 0000000..985b20e --- /dev/null +++ b/tests/conf/pgbouncer.supervisor @@ -0,0 +1,9 @@ +[program:pgbouncer] +command=/usr/sbin/pgbouncer /etc/pgbouncer/pgbouncer.ini +process_name=%(program_name)s +autostart=false +autorestart=false +stopsignal=TERM +user=postgres +stderr_logfile=/var/log/supervisor/pgbouncer.log +stdout_logfile=/var/log/supervisor/pgbouncer.log diff --git a/tests/conf/pgconsul.conf b/tests/conf/pgconsul.conf new file mode 100644 index 0000000..2b88824 --- /dev/null +++ b/tests/conf/pgconsul.conf @@ -0,0 +1,54 @@ +[global] +zk_lockpath_prefix = /pgconsul/postgresql/ +daemon_user = postgres +log_level = debug +log_file = /var/log/pgconsul/pgconsul.log +pid_file = /var/run/pgconsul/pgconsul.pid +working_dir = /tmp +local_conn_string = host=/var/run/postgresql dbname=postgres user=postgres connect_timeout=1 +append_primary_conn_string = dbname=postgres user=postgres connect_timeout=1 +iteration_timeout = 1 +zk_hosts = pgconsul_zookeeper1_1.pgconsul_pgconsul_net:2281,pgconsul_zookeeper2_1.pgconsul_pgconsul_net:2281,pgconsul_zookeeper3_1.pgconsul_pgconsul_net:2281 +use_replication_slots = yes +standalone_pooler = yes +use_lwaldump = yes +recovery_conf_rel_path = conf.d/recovery.conf +zk_connect_max_delay = 20 +election_timeout = 20 +zk_auth = yes +zk_username = user1 +zk_password = testpassword123 +zk_ssl = yes +keyfile = /etc/zk-ssl/server.key +certfile = /etc/zk-ssl/server.crt +ca_cert = /etc/zk-ssl/ca.cert.pem +verify_certs = yes + +[primary] +change_replication_type = yes +change_replication_metric = count,time +weekday_change_hours = 0-0 +weekend_change_hours = 0-0 +primary_switch_checks = 3 + +[replica] 
+primary_unavailability_timeout = 3 +start_pooler = yes +primary_switch_checks = 5 +min_failover_timeout = 300 +allow_potential_data_loss = no +recovery_timeout = 60 +primary_switch_restart = no + +[commands] +promote = /bin/bash -c 'sleep 0.5; /usr/bin/postgresql/pg_ctl promote -D %p' +rewind = touch /tmp/rewind_called && /usr/bin/postgresql/pg_rewind --restore-target-wal --target-pgdata=%p --source-server='host=%m dbname=postgres user=postgres connect_timeout=1' +get_control_parameter = /usr/bin/postgresql/pg_controldata %p | grep '%a:' +pg_start = /usr/bin/postgresql/pg_ctl start -s -w -t %t -D %p --log=/var/log/postgresql/postgresql.log +pg_stop = /usr/bin/postgresql/pg_ctl stop -s -m fast -w -t %t -D %p +pg_status = /usr/bin/postgresql/pg_ctl status -s -D %p +pg_reload = /bin/bash -c "/bin/bash -c 'sleep 0.2; /usr/bin/postgresql/pg_ctl reload -s -D %p' &" +pooler_start = supervisorctl start pgbouncer +pooler_stop = supervisorctl stop pgbouncer +pooler_status = supervisorctl status pgbouncer >/dev/null 2>&1 +generate_recovery_conf = /usr/local/bin/gen_rec_conf_with_slot.sh %m %p diff --git a/tests/conf/pgconsul.supervisor b/tests/conf/pgconsul.supervisor new file mode 100644 index 0000000..13baa9b --- /dev/null +++ b/tests/conf/pgconsul.supervisor @@ -0,0 +1,21 @@ +[program:prestart_pgconsul] +user=root +command=bash -c "rm -rf /var/run/pgconsul && mkdir -p /var/run/pgconsul && chown postgres:postgres /var/run/pgconsul \ + && rm -f /var/run/postgresql/postmaster.pid && mkdir -p /var/run/postgresql && chown postgres:postgres /var/run/postgresql" +autostart=true +autorestart=unexpected +exitcodes=0 +startsecs=0 +priority=1 + +[program:pgconsul] +command=/usr/local/bin/pgconsul -f yes +process_name=%(program_name)s +autostart=true +autorestart=false +stopsignal=TERM +stopwaitsecs=5 +user=postgres +priority=10 +stderr_logfile=/var/log/supervisor/pgconsul.log +stdout_logfile=/var/log/supervisor/pgconsul.log diff --git a/tests/conf/postgresql.conf b/tests/conf/postgresql.conf new file mode 100644 index 0000000..c41b95d --- /dev/null +++ b/tests/conf/postgresql.conf @@ -0,0 +1,39 @@ +listen_addresses = '*' +external_pid_file = '/var/run/postgresql/postmaster.pid' +port = 5432 +max_connections = 100 +unix_socket_directories = '/var/run/postgresql' +ssl = false +shared_buffers = 128MB +dynamic_shared_memory_type = posix +log_line_prefix = '%m [%p-%l] %q%u@%d ' +log_timezone = 'UTC' +log_hostname = on +stats_temp_directory = '/var/run/postgresql' +datestyle = 'iso, mdy' +timezone = 'UTC' +lc_messages = 'C' +lc_monetary = 'C' +lc_numeric = 'C' +lc_time = 'C' +default_text_search_config = 'pg_catalog.english' +wal_level = replica +wal_compression = on +wal_log_hints = on +min_wal_size = 1GB +max_wal_size = 16GB +max_replication_slots = 10 +max_wal_senders = 10 +hot_standby = on +wal_receiver_status_interval = 1s +hot_standby_feedback = on +# FIXME: Due to unexpected behaviour of postgres without write activity in docker on some machines +# we temporary use postgresql builded with repl_mon from yandex repository +shared_preload_libraries = 'repl_mon' +archive_mode = on +archive_command = 'echo "%p" && rsync --contimeout=1 --timeout=1 -a --password-file=/etc/archive.passwd %p rsync://archive@pgconsul_backup1_1.pgconsul_pgconsul_net:/archive/%f' +archive_timeout = 30 +wal_sender_timeout = '9s' +restore_command = 'rsync -a --password-file=/etc/archive.passwd rsync://archive@pgconsul_backup1_1.pgconsul_pgconsul_net:/archive/%f %p' +include_if_exists = 'conf.d/recovery.conf' +log_min_messages = debug5 
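The recovery-conf generators in tests/conf build application_name and the replication slot name from the host FQDN with hostname -f | sed -e 's/\./_/g' -e 's/\-/_/g', while the replication managers above match replics_info entries against helpers.app_name_from_fqdn, which is not shown in this part of the patch. A sketch of the mapping the two are assumed to share:

    # Assumed equivalent of helpers.app_name_from_fqdn (the helper itself is
    # not shown here); it mirrors the sed substitution in gen_rec_conf_*.sh.
    def app_name_from_fqdn(fqdn):
        return fqdn.replace('.', '_').replace('-', '_')

    # The application_name postgresql2 reports in the test cluster:
    print(app_name_from_fqdn('pgconsul_postgresql2_1.pgconsul_pgconsul_net'))
    # pgconsul_postgresql2_1_pgconsul_pgconsul_net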
diff --git a/tests/conf/supervisord.conf b/tests/conf/supervisord.conf new file mode 100644 index 0000000..7c1ce6f --- /dev/null +++ b/tests/conf/supervisord.conf @@ -0,0 +1,20 @@ +[unix_http_server] +file=/var/run/supervisor.sock +chmod=0777 + +[supervisord] +logfile=/var/log/supervisor.log +logfile_maxbytes=0 +log_level=debug +pidfile=/var/run/supervisord.pid +minfds=1024 +nodaemon=true + +[rpcinterface:supervisor] +supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface + +[supervisorctl] +serverurl=unix:///var/run/supervisor.sock + +[include] +files = /etc/supervisor/conf.d/*.conf diff --git a/tests/environment.py b/tests/environment.py new file mode 100644 index 0000000..e32b3c5 --- /dev/null +++ b/tests/environment.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import os +import signal + +import docker +import yaml + +import steps.helpers as helpers + + +def before_all(context): + """ + Setup environment + """ + # Connect to docker daemon + context.docker = docker.from_env(timeout=600) + client = context.docker + + context.timeout = float(os.environ.get('TIMEOUT', 360)) + context.interval = float(os.environ.get('INTERVAL', 1)) + context.project = str(os.environ.get('PROJECT')) + + context.config = {} + + context.zk_locks = {} + context.zk = None + + context.compose = {} + context.networks = {} + context.containers = {} + context.pg_start_time = {} + with open('docker-compose.yml', 'r') as compose_file: + context.compose = yaml.safe_load(compose_file) + + # Clean all containers + for name in context.compose.get('services', dict()): + try: + container = helpers.DOCKER.containers.get(name) + container.remove(force=True) + except (docker.errors.NotFound, docker.errors.APIError): + pass + + # Create networks from docker-compose.yml + net_opts = { + 'com.docker.network.bridge.enable_ip_masquerade': 'true', + 'com.docker.network.bridge.enable_icc': 'true', + 'com.docker.network.bridge.name': 'test_bridge', + } + for name, network in context.compose.get('networks', dict()).items(): + if 'external' in network: + context.networks[name] = client.networks.get(network['external']['name']) + continue + existing_net = client.networks.list(names=[name]) + if existing_net: + existing_net[0].remove() + context.networks[name] = client.networks.create( + name, driver=network.get('driver'), options=net_opts, ipam=network.get('ipam') + ) + + +def after_all(context): + """ + Cleanup environment after tests run + """ + # Cleanup networks + for network in context.networks.values(): + network.remove() + + +def after_scenario(context, _): + # Cleanup containers + for container in context.containers.values(): + # Simply kill container if it not exited + if helpers.container_get_status(container) != 'exited': + helpers.kill(container, int(signal.SIGKILL)) + + # Remove container's file system + container.remove(v=True, force=True) + + context.containers.clear() + # Cleanup config + context.config.clear() + + # Cleanup zk locks and close connection + context.zk_locks = {} + if context.zk: + context.zk.stop() + context.zk.close() + context.zk = None + + +def extract_log_file(container, cont_base_dir, log_path, log_filename): + log_fullpath = os.path.join(log_path, log_filename) + container_log_file = helpers.container_get_filestream(container, log_fullpath) + with open(os.path.join(cont_base_dir, log_filename), 'w') as log_file: + for line in container_log_file: + log_file.write(line.decode('utf-8')) + + +# Uncomment if you want to debug failed step via pdb +def 
after_step(context, step): + if step.status == 'failed': + if step.filename == '': + # Sub-step without filename, we don't need its output. + # Same logs will be captured from outer failed step + return + base_dir = 'logs' + os.makedirs(base_dir, exist_ok=True) + for container in context.containers.values(): + hostname = container.attrs['Config']['Hostname'] + cont_base_dir = os.path.join(base_dir, step.filename, str(step.line), hostname) + os.makedirs(cont_base_dir, exist_ok=True) + log_files = [ + ('/var/log/supervisor', 'pgconsul.log'), + ('/var/log/postgresql', 'postgresql.log'), + ('/var/log/zookeeper', 'zookeeper--server-{hostname}.log'.format(hostname=hostname)), + ('/var/log/postgresql', 'pgbouncer.log'), + ] + for log_path, log_file in log_files: + try: + extract_log_file(container, cont_base_dir, log_path, log_file) + except Exception: + pass # Ok, there is no such log file in this container, let's move on + print('Logs for this run were placed in dir %s' % base_dir) + if os.environ.get('DEBUG'): + # -- ENTER DEBUGGER: Zoom in on failure location. + # NOTE: Use pdb++ AKA pdbpp debugger, + # same for pdb (basic python debugger). + import pdb + + pdb.post_mortem(step.exc_traceback) diff --git a/tests/features/archive.feature b/tests/features/archive.feature new file mode 100644 index 0000000..197a008 --- /dev/null +++ b/tests/features/archive.feature @@ -0,0 +1,47 @@ +Feature: Check WAL archiving works correctly + + @archiving + Scenario Outline: Check that archive enabled after restart postgres without maintenance + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + autofailover: 'no' + quorum_commit: 'yes' + primary: + change_replication_type: 'yes' + primary_switch_checks: 3 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 3 + min_failover_timeout: 60 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + postgresql.conf: + archive_command: '/bin/true' + """ + Given a following cluster with "" with replication slots + """ + postgresql1: + role: primary + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + And postgresql in container "postgresql1" has value "/bin/true" for option "archive_command" + And container "postgresql1" has following config + """ + postgresql.auto.conf: {} + """ + When we set value "/bin/false" for option "archive_command" in "postgresql.auto.conf" config in container "postgresql1" + Then postgresql in container "postgresql1" has value "/bin/true" for option "archive_command" + And container "postgresql1" has following config + """ + postgresql.auto.conf: {} + """ + + Examples: , + | lock_type | lock_host | + | zookeeper | zookeeper1 | diff --git a/tests/features/async.feature b/tests/features/async.feature new file mode 100644 index 0000000..bd4f454 --- /dev/null +++ b/tests/features/async.feature @@ -0,0 +1,75 @@ +Feature: Asynchronous replication + Check some cases in mode "change_replication_type = no" + + + @failover + Scenario Outline: No failover in "allow_potential_data_loss = no" mode + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: '' + quorum_commit: '' + primary: + change_replication_type: 'no' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 
1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf__slot.sh %m %p + """ + Given a following cluster with "" replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + postgresql3: + role: replica + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + sync_state: async + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + sync_state: async + """ + When we stop container "postgresql1" + Then "" has holder "None" for lock "/pgconsul/postgresql/leader" + Then "" has value "None" for key "/pgconsul/postgresql/failover_state" + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + sync_state: async + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + sync_state: async + """ + When we start container "postgresql1" + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + sync_state: async + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + sync_state: async + """ + Then container "postgresql2" is a replica of container "postgresql1" + Then container "postgresql3" is a replica of container "postgresql1" + + Examples: , replication slots + | lock_type | lock_host | with_slots | use_slots | quorum_commit | + | zookeeper | zookeeper1 | without | no | yes | + | zookeeper | zookeeper1 | with | yes | yes | + | zookeeper | zookeeper1 | without | no | no | + | zookeeper | zookeeper1 | with | yes | no | diff --git a/tests/features/autofailover.feature b/tests/features/autofailover.feature new file mode 100644 index 0000000..32faf06 --- /dev/null +++ b/tests/features/autofailover.feature @@ -0,0 +1,209 @@ +Feature: Check pgconsul with disabled autofailover + @switchover_test + Scenario Outline: Check switchover with disabled autofailover + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + autofailover: 'no' + quorum_commit: '' + primary: + change_replication_type: 'yes' + primary_switch_checks: 3 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 3 + min_failover_timeout: 60 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "" with replication slots + """ + postgresql1: + role: primary + config: + pgconsul.conf: + global: + priority: 2 + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 3 + + """ + Then container "postgresql3" is in group + When we lock "/pgconsul/postgresql/switchover/lock" in "" + And we set value "{"hostname": "pgconsul_postgresql1_1.pgconsul_pgconsul_net","timeline": 1}" for key "/pgconsul/postgresql/switchover/master" in "" + And we set value "scheduled" for key "/pgconsul/postgresql/switchover/state" in "" + And we release lock "/pgconsul/postgresql/switchover/lock" in "" + Then container 
"postgresql3" became a primary + And container "postgresql2" is a replica of container "postgresql3" + And container "postgresql1" is a replica of container "postgresql3" + Then postgresql in container "postgresql2" was not rewinded + Then postgresql in container "postgresql1" was not rewinded + Then container "postgresql1" is in group + When we lock "/pgconsul/postgresql/switchover/lock" in "" + And we set value "{"hostname": "pgconsul_postgresql3_1.pgconsul_pgconsul_net","timeline": 2}" for key "/pgconsul/postgresql/switchover/master" in "" + And we set value "scheduled" for key "/pgconsul/postgresql/switchover/state" in "" + And we release lock "/pgconsul/postgresql/switchover/lock" in "" + Then container "postgresql1" became a primary + And container "postgresql3" is a replica of container "postgresql1" + And container "postgresql2" is a replica of container "postgresql1" + When we stop container "postgresql2" + And we lock "/pgconsul/postgresql/switchover/lock" in "" + And we set value "{"hostname": "pgconsul_postgresql1_1.pgconsul_pgconsul_net","timeline": 3}" for key "/pgconsul/postgresql/switchover/master" in "" + And we set value "scheduled" for key "/pgconsul/postgresql/switchover/state" in "" + And we release lock "/pgconsul/postgresql/switchover/lock" in "" + And we wait "30.0" seconds + Then container "postgresql1" is primary + When we wait "30.0" seconds + Then container "postgresql3" became a primary + And container "postgresql1" is a replica of container "postgresql3" + + Examples: , + | lock_type | lock_host | quorum_commit | replication_type | + | zookeeper | zookeeper1 | yes | quorum | + | zookeeper | zookeeper1 | no | sync | + + @failover + Scenario Outline: Check kill primary with disabled autofailover + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + autofailover: 'no' + quorum_commit: '' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "" with replication slots + """ + postgresql1: + role: primary + config: + pgconsul.conf: + global: + priority: 2 + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 3 + + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + And "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + """ + When we container "postgresql1" + And we wait "30.0" seconds + Then "" has holder "None" for lock "/pgconsul/postgresql/leader" + When we container "postgresql1" + Then container "postgresql2" is a replica of container "postgresql1" + And container "postgresql3" is a replica of container "postgresql1" + + Examples: , , , + | lock_type | lock_host | destroy | repair | quorum_commit | + | zookeeper | zookeeper1 | stop | start | yes | + | zookeeper | zookeeper1 | disconnect from network | connect to network | yes | + | zookeeper | zookeeper1 | stop | start | no | + | zookeeper | zookeeper1 | disconnect from network | 
connect to network | no | + + Scenario Outline: Check suddenly external promote replica + We consider unexpected external promote as an error, so we leave old primary as it is. + Moreover, pgconsul should switch off pgbouncer on suddenly promoted host to avoid split brain state. + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + autofailover: 'no' + quorum_commit: '' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "" with replication slots + """ + postgresql1: + role: primary + config: + pgconsul.conf: + global: + priority: 2 + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 3 + + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + And "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + """ + When we promote host "postgresql2" + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + And container "postgresql1" is primary + And pgbouncer is not running in container "postgresql2" + And pgbouncer is running in container "postgresql1" + And pgbouncer is running in container "postgresql3" + + Examples: , + | lock_type | lock_host | quorum_commit | + | zookeeper | zookeeper1 | yes | + | zookeeper | zookeeper1 | no | diff --git a/tests/features/cascade.feature b/tests/features/cascade.feature new file mode 100644 index 0000000..481ea6b --- /dev/null +++ b/tests/features/cascade.feature @@ -0,0 +1,356 @@ +Feature: Check not HA hosts + + @failover + Scenario Outline: Check not ha host from primary + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'no' + postgres_timeout: 5 + quorum_commit: '' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + recovery_timeout: 5 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_without_slot.sh %m %p + """ + Given a following cluster with "" without replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + postgresql3: + role: replica + config: + pgconsul.conf: + global: + stream_from: pgconsul_postgresql1_1.pgconsul_pgconsul_net + stream_from: postgresql1 + """ + + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + And container "postgresql2" is in group + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + When we disconnect from network container "postgresql1" + Then "" has holder 
"pgconsul_postgresql2_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql2" became a primary + When we connect to network container "postgresql1" + Then container "postgresql3" is a replica of container "postgresql1" + And container "postgresql1" is in group + + Examples: , + | lock_type | lock_host | quorum_commit | replication_type | + | zookeeper | zookeeper1 | yes | quorum | + | zookeeper | zookeeper1 | no | sync | + + @failover + Scenario Outline: Check cascade replica + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'no' + postgres_timeout: 5 + quorum_commit: '' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + recovery_timeout: 5 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_without_slot.sh %m %p + """ + Given a following cluster with "" without replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + postgresql3: + role: replica + config: + pgconsul.conf: + global: + stream_from: pgconsul_postgresql2_1.pgconsul_pgconsul_net + stream_from: postgresql2 + """ + + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + And container "postgresql2" is in group + Then container "postgresql3" is a replica of container "postgresql2" + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + """ + When we disconnect from network container "postgresql1" + Then "" has holder "pgconsul_postgresql2_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql2" became a primary + When we connect to network container "postgresql1" + Then container "postgresql1" is in group + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql1_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + Examples: , + | lock_type | lock_host | quorum_commit | replication_type | + | zookeeper | zookeeper1 | yes | quorum | + | zookeeper | zookeeper1 | no | sync | + + + @auto_stream_from + Scenario Outline: Cascade replica streams from primary when replication source fails + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + quorum_commit: 'yes' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + recovery_timeout: 30 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_without_slot.sh %m %p + """ + Given a following cluster with "" without replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + postgresql3: + role: replica + config: + pgconsul.conf: + global: + stream_from: pgconsul_postgresql2_1.pgconsul_pgconsul_net + stream_from: postgresql2 + """ + + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + And container "postgresql2" is in quorum group + Then "" has following values for key 
"/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + """ + Then container "postgresql3" is a replica of container "postgresql2" + When we disconnect from network container "postgresql2" + Then container "postgresql3" is a replica of container "postgresql1" + When we connect to network container "postgresql2" + Then container "postgresql3" is a replica of container "postgresql2" + + Examples: , + | lock_type | lock_host | + | zookeeper | zookeeper1 | + + + @auto_stream_from + Scenario Outline: Cascade replica streams from new primary when old primary fails and it is replication source + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + quorum_commit: 'yes' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + recovery_timeout: 30 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_without_slot.sh %m %p + """ + Given a following cluster with "" without replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + postgresql3: + role: replica + config: + pgconsul.conf: + global: + stream_from: pgconsul_postgresql1_1.pgconsul_pgconsul_net + stream_from: postgresql1 + """ + + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + And container "postgresql2" is in quorum group + And "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + And container "postgresql3" is a replica of container "postgresql1" + When we disconnect from network container "postgresql1" + Then container "postgresql2" became a primary + And container "postgresql3" is a replica of container "postgresql2" + When we connect to network container "postgresql1" + Then container "postgresql1" is a replica of container "postgresql2" + And container "postgresql3" is a replica of container "postgresql1" + + Examples: , + | lock_type | lock_host | + | zookeeper | zookeeper1 | + + + @auto_stream_from + Scenario Outline: Cascade replica waits new primary if there are no hosts for streaming in HA + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + quorum_commit: 'yes' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + recovery_timeout: 30 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_without_slot.sh %m %p + postgresql.conf: + wal_sender_timeout: '2s' + wal_receiver_timeout: '2s' + """ + Given a following cluster with "" without replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + config: + pgconsul.conf: + global: + stream_from: pgconsul_postgresql1_1.pgconsul_pgconsul_net + stream_from: postgresql1 + """ + + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - 
client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + """ + When we remember postgresql start time in container "postgresql2" + When we disconnect from network container "postgresql1" + And we wait "10.0" seconds + When we connect to network container "postgresql1" + Then postgresql in container "postgresql2" was not restarted + And postgresql in container "postgresql2" was not rewinded + Then container "postgresql2" is a replica of container "postgresql1" + + Examples: , + | lock_type | lock_host | + | zookeeper | zookeeper1 | + + + + @auto_stream_from + Scenario Outline: Cascade replica returns stream from replication source if it is cascade replica too + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + quorum_commit: 'yes' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + recovery_timeout: 30 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_without_slot.sh %m %p + """ + Given a following cluster with "" without replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + postgresql3: + role: replica + config: + pgconsul.conf: + global: + stream_from: pgconsul_postgresql2_1.pgconsul_pgconsul_net + stream_from: postgresql2 + postgresql4: + role: replica + config: + pgconsul.conf: + global: + stream_from: pgconsul_postgresql3_1.pgconsul_pgconsul_net + stream_from: postgresql3 + """ + + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + And container "postgresql2" is in quorum group + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + """ + Then container "postgresql3" is a replica of container "postgresql2" + When we disconnect from network container "postgresql3" + Then container "postgresql4" is a replica of container "postgresql1" + When we connect to network container "postgresql3" + Then container "postgresql4" is a replica of container "postgresql3" + + Examples: , + | lock_type | lock_host | + | zookeeper | zookeeper1 | diff --git a/tests/features/consecutive_switch.feature b/tests/features/consecutive_switch.feature new file mode 100644 index 0000000..a58cbdd --- /dev/null +++ b/tests/features/consecutive_switch.feature @@ -0,0 +1,80 @@ +Feature: Check that replicas change primary consecutively + + @switchover + Scenario Outline: Change consecutively on failover + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: '' + do_consecutive_primary_switch: 'yes' + election_timeout: 10 + quorum_commit: '' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf__slot.sh %m %p + """ + Given a following cluster with "" replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 4 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 3 + postgresql4: + role: replica + config: + 
pgconsul.conf: + global: + priority: 2 + postgresql5: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + """ + Then container "postgresql2" is in group + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql4_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql5_1.pgconsul_pgconsul_net + state: streaming + """ + When we stop container "postgresql1" + Then container "postgresql2" became a primary + Then "3" containers are replicas of "postgresql2" within "120.0" seconds + And at least "3" postgresql instances were running simultaneously during test + Then postgresql in container "postgresql3" was not rewinded + Then postgresql in container "postgresql4" was not rewinded + Then postgresql in container "postgresql5" was not rewinded + + Examples: , replication slots + | lock_type | lock_host | with_slots | use_slots | quorum_commit | replication_type | + | zookeeper | zookeeper1 | without | no | yes | quorum | + | zookeeper | zookeeper1 | with | yes | yes | quorum | + | zookeeper | zookeeper1 | without | no | no | sync | + | zookeeper | zookeeper1 | with | yes | no | sync | diff --git a/tests/features/coordinator.feature b/tests/features/coordinator.feature new file mode 100644 index 0000000..f59c0a2 --- /dev/null +++ b/tests/features/coordinator.feature @@ -0,0 +1,82 @@ +Feature: Interacting with coordinator + + @failover + Scenario Outline: Destroying most of the cluster (including ZK quorum) + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + quorum_commit: '' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + And a following cluster with "zookeeper" with replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 2 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + """ + Then zookeeper "zookeeper1" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + And container "postgresql2" is in group + And zookeeper "zookeeper1" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + When we container "zookeeper1" + And we container "zookeeper2" + And we container "postgresql1" + Then pgbouncer is not running in container "postgresql1" + When we container "zookeeper1" + And we container "zookeeper2" + Then zookeeper "zookeeper3" has holder "pgconsul_postgresql2_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + And container "postgresql2" became a primary + And zookeeper "zookeeper3" has value "finished" for key "/pgconsul/postgresql/failover_state" + And container "postgresql3" is in group + And zookeeper "zookeeper3" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: 
pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + And container "postgresql3" is a replica of container "postgresql2" + When we container "postgresql1" + Then zookeeper "zookeeper3" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql1_1.pgconsul_pgconsul_net + state: streaming + """ + And container "postgresql1" is a replica of container "postgresql2" + Then postgresql in container "postgresql3" was not rewinded + Then postgresql in container "postgresql1" was rewinded + Examples: / + | destroy | repair | quorum_commit | replication_type | + | stop | start | yes | quorum | + | disconnect from network | connect to network | yes | quorum | + | stop | start | no | sync | + | disconnect from network | connect to network | no | sync | diff --git a/tests/features/coordinator_fail.feature b/tests/features/coordinator_fail.feature new file mode 100644 index 0000000..f69bb4c --- /dev/null +++ b/tests/features/coordinator_fail.feature @@ -0,0 +1,125 @@ +Feature: Check availability on coordinator failure + + @coordinator_fail + Scenario Outline: Kill coordinator + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: '' + quorum_commit: '' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf__slot.sh %m %p + """ + Given a following cluster with "" replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 2 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql2" is in group + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + When we disconnect from network container "zookeeper1" + And we disconnect from network container "zookeeper2" + And we disconnect from network container "zookeeper3" + And we wait "10.0" seconds + Then pgbouncer is running in container "postgresql1" + And pgbouncer is running in container "postgresql2" + And pgbouncer is running in container "postgresql3" + + Examples: , slots + | lock_type | lock_host | with_slots | use_slots | quorum_commit | replication_type | + | zookeeper | zookeeper1 | without | no | yes | quorum | + | zookeeper | zookeeper1 | with | yes | yes | quorum | + | zookeeper | zookeeper1 | without | no | no | sync | + | zookeeper | zookeeper1 | with | yes | no | sync | + + @coordinator_fail + Scenario Outline: Kill coordinator and both replicas + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: '' + quorum_commit: '' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + 
generate_recovery_conf: /usr/local/bin/gen_rec_conf__slot.sh %m %p + """ + Given a following cluster with "" replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 2 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql2" is in group + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + When we disconnect from network container "zookeeper1" + And we disconnect from network container "zookeeper2" + And we disconnect from network container "zookeeper3" + And we disconnect from network container "postgresql2" + And we disconnect from network container "postgresql3" + And we wait "10.0" seconds + Then pgbouncer is not running in container "postgresql1" + + Examples: , slots + | lock_type | lock_host | with_slots | use_slots | quorum_commit | replication_type | + | zookeeper | zookeeper1 | without | no | yes | quorum | + | zookeeper | zookeeper1 | with | yes | yes | quorum | + | zookeeper | zookeeper1 | without | no | no | sync | + | zookeeper | zookeeper1 | with | yes | no | sync | diff --git a/tests/features/dead_primary_switchover.feature b/tests/features/dead_primary_switchover.feature new file mode 100644 index 0000000..c581eac --- /dev/null +++ b/tests/features/dead_primary_switchover.feature @@ -0,0 +1,76 @@ +Feature: Switchover with dead primary + + @switchover + Scenario Outline: Check successful switchover with dead primary + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + autofailover: '' + quorum_commit: '' + primary: + change_replication_type: 'yes' + primary_switch_checks: 3 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 3 + min_failover_timeout: 120 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "" with replication slots + """ + postgresql1: + role: primary + config: + pgconsul.conf: + global: + priority: 3 + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 2 + + """ + Then container "postgresql3" is in group + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + When we disconnect from network container "postgresql1" + And we make switchover task with params "" in container "postgresql2" + Then container "" became a primary + And container "" is a replica of container "" + Then postgresql in container "" was not rewinded + When we connect to network container "postgresql1" + Then container "" is primary + And container "postgresql1" is a replica of container "" + And container "postgresql1" is in group + And postgresql in container "" was not rewinded + And postgresql in container "postgresql1" was rewinded + + Examples: , + | lock_type | lock_host | destination | new_primary | 
replica | autofailover | quorum_commit | replication_type | + | zookeeper | zookeeper1 | None | postgresql3 | postgresql2 | no | yes | quorum | + | zookeeper | zookeeper1 | -d pgconsul_postgresql2_1.pgconsul_pgconsul_net | postgresql2 | postgresql3 | no | yes | quorum | + | zookeeper | zookeeper1 | None | postgresql3 | postgresql2 | yes | yes | quorum | + | zookeeper | zookeeper1 | -d pgconsul_postgresql2_1.pgconsul_pgconsul_net | postgresql3 | postgresql2 | yes | yes | quorum | + | zookeeper | zookeeper1 | None | postgresql3 | postgresql2 | no | no | sync | + | zookeeper | zookeeper1 | -d pgconsul_postgresql2_1.pgconsul_pgconsul_net | postgresql2 | postgresql3 | no | no | sync | + | zookeeper | zookeeper1 | None | postgresql3 | postgresql2 | yes | no | sync | + | zookeeper | zookeeper1 | -d pgconsul_postgresql2_1.pgconsul_pgconsul_net | postgresql3 | postgresql2 | yes | no | sync | diff --git a/tests/features/disable_sync.feature b/tests/features/disable_sync.feature new file mode 100644 index 0000000..917b0d9 --- /dev/null +++ b/tests/features/disable_sync.feature @@ -0,0 +1,134 @@ +Feature: Check disable sync replication + Scenario Outline: Disable sync replication when overload + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: yes + quorum_commit: '' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + weekday_change_hours: 0-24 + weekend_change_hours: 0-24 + overload_sessions_ratio: 50 + change_replication_metric: count,time,load + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "" with replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 2 + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql3" is in group + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + Then container "postgresql1" is primary + When run in container "postgresql1" "88" sessions with timeout 3600 + Then postgresql in container "postgresql1" has empty option "synchronous_standby_names" + + + Examples: / + | lock_type | lock_host | quorum_commit | replication_type | + | zookeeper | zookeeper1 | yes | quorum | + | zookeeper | zookeeper1 | no | sync | + + + Scenario Outline: Destroy all replicas when time to change async is possible + Given a "pgconsul" container common config: + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: yes + quorum_commit: '' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + weekday_change_hours: 0-24 + weekend_change_hours: 0-24 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "" with replication slots: + """ + postgresql1: + role: 
primary + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 2 + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql3" is in group + Then "" has following values for key "/pgconsul/postgresql/replics_info": + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + When we container "postgresql3" + And we container "postgresql2" + Then container "postgresql1" is primary + Then postgresql in container "postgresql1" has empty option "synchronous_standby_names" + When we container "postgresql3" + When we container "postgresql2" + Then container "postgresql3" is a replica of container "postgresql1" + Then container "postgresql2" is a replica of container "postgresql1" + Then container "postgresql3" is in group + Then "" has following values for key "/pgconsul/postgresql/replics_info": + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + + Examples: / + | lock_type | lock_host | destroy | repair | quorum_commit | replication_type | + | zookeeper | zookeeper1 | stop | start | yes | quorum | + | zookeeper | zookeeper1 | disconnect from network | connect to network | yes | quorum | + | zookeeper | zookeeper1 | stop | start | no | sync | + | zookeeper | zookeeper1 | disconnect from network | connect to network | no | sync | + diff --git a/tests/features/failed_promote.feature b/tests/features/failed_promote.feature new file mode 100644 index 0000000..48978eb --- /dev/null +++ b/tests/features/failed_promote.feature @@ -0,0 +1,171 @@ +Feature: Destroy new primary after promote and before sync with zookeeper + + @failed_promote + Scenario Outline: New primary will continue to be primary after restart during promote + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: '' + quorum_commit: '' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf__slot.sh %m %p + debug: + promote_checkpoint_sql: CHECKPOINT; SELECT pg_sleep('infinity'); + """ + Given a following cluster with "" replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 2 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql2" is in group + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + When we container "postgresql1" + Then "" has holder "pgconsul_postgresql2_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql2" became a primary + When we stop container "postgresql2" + When we start container "postgresql2" + Then "" has value 
"finished" for key "/pgconsul/postgresql/failover_state" + Then container "postgresql3" is in group + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + Then container "postgresql3" is a replica of container "postgresql2" + Then postgresql in container "postgresql3" was not rewinded + When we container "postgresql1" + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql1_1.pgconsul_pgconsul_net + state: streaming + """ + Then container "postgresql1" is a replica of container "postgresql2" + Then pgconsul in container "postgresql1" is connected to zookeeper + Then postgresql in container "postgresql1" was rewinded + + Examples: , replication slots, / + | lock_type | lock_host | destroy | repair | with_slots | use_slots | quorum_commit | replication_type | + | zookeeper | zookeeper1 | stop | start | without | no | yes | quorum | + | zookeeper | zookeeper1 | stop | start | with | yes | yes | quorum | + | zookeeper | zookeeper1 | disconnect from network | connect to network | without | no | yes | quorum | + | zookeeper | zookeeper1 | disconnect from network | connect to network | with | yes | yes | quorum | + | zookeeper | zookeeper1 | stop | start | without | no | no | sync | + | zookeeper | zookeeper1 | stop | start | with | yes | no | sync | + | zookeeper | zookeeper1 | disconnect from network | connect to network | without | no | no | sync | + | zookeeper | zookeeper1 | disconnect from network | connect to network | with | yes | no | sync | + + + @failed_promote_return_primary + Scenario Outline: New primary will continue to be primary after returning old primary during restart in promote section + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: '' + quorum_commit: '' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf__slot.sh %m %p + debug: + promote_checkpoint_sql: CHECKPOINT; SELECT pg_sleep('infinity'); + """ + Given a following cluster with "" replication slots + """ + postgresql1: + role: primary + config: + pgconsul.conf: + global: + priority: 3 + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 2 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql2" is in group + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + When we container "postgresql1" + Then "" has holder "pgconsul_postgresql2_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql2" became a primary + When we stop container "postgresql2" + When we container "postgresql1" + When we start container "postgresql2" + Then "" has value "finished" for key "/pgconsul/postgresql/failover_state" + Then container "postgresql1" is in group + Then 
"" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql1_1.pgconsul_pgconsul_net + state: streaming + """ + Then container "postgresql3" is a replica of container "postgresql2" + Then container "postgresql1" is a replica of container "postgresql2" + Then pgconsul in container "postgresql1" is connected to zookeeper + Then postgresql in container "postgresql3" was not rewinded + Then postgresql in container "postgresql1" was rewinded + + Examples: , replication slots, / + | lock_type | lock_host | destroy | repair | with_slots | use_slots | quorum_commit | replication_type | + | zookeeper | zookeeper1 | stop | start | without | no | yes | quorum | + | zookeeper | zookeeper1 | stop | start | with | yes | yes | quorum | + | zookeeper | zookeeper1 | disconnect from network | connect to network | without | no | yes | quorum | + | zookeeper | zookeeper1 | disconnect from network | connect to network | with | yes | yes | quorum | + | zookeeper | zookeeper1 | stop | start | without | no | no | sync | + | zookeeper | zookeeper1 | stop | start | with | yes | no | sync | + | zookeeper | zookeeper1 | disconnect from network | connect to network | without | no | no | sync | + | zookeeper | zookeeper1 | disconnect from network | connect to network | with | yes | no | sync | diff --git a/tests/features/failover_timeout.feature b/tests/features/failover_timeout.feature new file mode 100644 index 0000000..c181927 --- /dev/null +++ b/tests/features/failover_timeout.feature @@ -0,0 +1,197 @@ +Feature: Testing min_failover_timeout setting + + @failover + Scenario Outline: Destroy primary and wait min_failover_timeout seconds + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: '' + quorum_commit: '' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 240 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf__slot.sh %m %p + """ + Given a following cluster with "" replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 2 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql2" is in group + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + When we container "postgresql1" + Then "" has holder "pgconsul_postgresql2_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql2" became a primary + Then "" has value "finished" for key "/pgconsul/postgresql/failover_state" + Then container "postgresql3" is in group + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + Then container "postgresql3" is a replica of container "postgresql2" + Then postgresql in container "postgresql3" was not rewinded + When we container 
"postgresql1" + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql1_1.pgconsul_pgconsul_net + state: streaming + """ + Then container "postgresql1" is a replica of container "postgresql2" + Then postgresql in container "postgresql1" was rewinded + When we container "postgresql2" + Then "" has holder "None" for lock "/pgconsul/postgresql/leader" + Then container "postgresql3" is in group + When we wait until "10.0" seconds to failover of "postgresql3" left in "" + Then "" has holder "None" for lock "/pgconsul/postgresql/leader" + Then container "postgresql3" is in group + When we wait "10.0" seconds + Then "" has one of holders "pgconsul_postgresql1_1.pgconsul_pgconsul_net,pgconsul_postgresql3_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then one of containers "postgresql1,postgresql3" became a primary + Then "" has value "finished" for key "/pgconsul/postgresql/failover_state" + Then "" has "1" values for key "/pgconsul/postgresql/replics_info" + When we container "postgresql2" + Then "" has "2" values for key "/pgconsul/postgresql/replics_info" + + Examples: , hronous replication slots, / + | lock_type | lock_host | destroy | repair | with_slots | use_slots | quorum_commit | replication_type | + | zookeeper | zookeeper1 | disconnect from network | connect to network | without | no | yes | quorum | + | zookeeper | zookeeper1 | disconnect from network | connect to network | with | yes | yes | quorum | + | zookeeper | zookeeper1 | disconnect from network | connect to network | without | no | no | sync | + | zookeeper | zookeeper1 | disconnect from network | connect to network | with | yes | no | sync | + + + Scenario Outline: Destroy primary and wait min_failover_timeout seconds with async replication + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: '' + primary: + change_replication_type: 'no' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'yes' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 240 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf__slot.sh %m %p + """ + Given a following cluster with "" replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 2 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then "" has holder "pgconsul_postgresql2_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/sync_replica" + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + sync_state: async + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + sync_state: async + """ + When we container "postgresql1" + Then "" has holder "pgconsul_postgresql2_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql2" became a primary + Then "" has value "finished" for key "/pgconsul/postgresql/failover_state" + Then "" has holder "pgconsul_postgresql3_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/sync_replica" + Then "" has following values for key 
"/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + sync_state: async + """ + Then container "postgresql3" is a replica of container "postgresql2" + Then postgresql in container "postgresql3" was not rewinded + When we container "postgresql1" + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + sync_state: async + - client_hostname: pgconsul_postgresql1_1.pgconsul_pgconsul_net + state: streaming + sync_state: async + """ + Then container "postgresql1" is a replica of container "postgresql2" + Then postgresql in container "postgresql1" was rewinded + When we container "postgresql2" + Then "" has holder "None" for lock "/pgconsul/postgresql/leader" + Then "" has holder "pgconsul_postgresql3_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/sync_replica" + When we wait until "10.0" seconds to failover of "postgresql3" left in "" + Then "" has holder "None" for lock "/pgconsul/postgresql/leader" + Then "" has holder "pgconsul_postgresql3_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/sync_replica" + When we wait "10.0" seconds + Then "" has holder "pgconsul_postgresql3_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql3" became a primary + Then "" has value "finished" for key "/pgconsul/postgresql/failover_state" + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/sync_replica" + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql1_1.pgconsul_pgconsul_net + state: streaming + sync_state: async + """ + When we container "postgresql2" + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql1_1.pgconsul_pgconsul_net + state: streaming + sync_state: async + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + sync_state: async + """ + + Examples: , hronous replication slots, / + | lock_type | lock_host | destroy | repair | with_slots | use_slots | + | zookeeper | zookeeper1 | disconnect from network | connect to network | without | no | + | zookeeper | zookeeper1 | disconnect from network | connect to network | with | yes | diff --git a/tests/features/initialization.feature b/tests/features/initialization.feature new file mode 100644 index 0000000..c1b1b96 --- /dev/null +++ b/tests/features/initialization.feature @@ -0,0 +1,94 @@ +Feature: Test that tests infrastructure works correctly. + We need to check that all system starts and works + as expected as is, without any intervention. 
+ + @init + Scenario: pgconsul container check configuration + Given a "pgconsul" container common config + """ + pgconsul.conf: + replica: + primary_switch_checks: 100500 + pgbouncer.ini: + pgbouncer: + query_wait_timeout: 100500 + server_reset_query_always: 2 + postgresql.conf: + checkpoint_timeout: '30s' + """ + Given a "zookeeper" container "zookeeper1" + Given a "zookeeper" container "zookeeper2" + Given a "zookeeper" container "zookeeper3" + Given a "backup" container "backup1" + Given a "pgconsul" container "postgresql1" + Given a replication slot "pgconsul_postgresql2_1_pgconsul_pgconsul_net" in container "postgresql1" + Given a "pgconsul" container "postgresql2" with following config + """ + pgconsul.conf: + replica: + primary_unavailability_timeout: 100500 + allow_potential_data_loss: 'yes' + pgbouncer.ini: + pgbouncer: + server_reset_query_always: 1 + postgresql.conf: + fsync: 'off' + restore_command: 'rsync -a --password-file=/etc/archive.passwd rsync://archive@pgconsul_backup1_1.pgconsul_pgconsul_net:/archive/%f %p' + """ + Then container "postgresql2" has following config + """ + pgconsul.conf: + replica: + primary_unavailability_timeout: 100500 + allow_potential_data_loss: 'yes' + pgbouncer.ini: + pgbouncer: + server_reset_query_always: 1 + postgresql.conf: + fsync: 'off' + restore_command: 'rsync -a --password-file=/etc/archive.passwd rsync://archive@pgconsul_backup1_1.pgconsul_pgconsul_net:/archive/%f %p' + """ + Then postgresql in container "postgresql2" has value "off" for option "fsync" + Then postgresql in container "postgresql2" has value "30s" for option "checkpoint_timeout" + Then pgbouncer is running in container "postgresql2" + Then pgbouncer in container "postgresql2" has value "1" for option "server_reset_query_always" + Then pgbouncer in container "postgresql2" has value "100500" for option "query_wait_timeout" + + Scenario Outline: Check cluster initialization + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + use_replication_slots: 'no' + quorum_commit: '' + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_without_slot.sh %m %p + """ + Given a following cluster with "" without replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + postgresql3: + role: replica + """ + Then container "postgresql1" is primary + Then container "postgresql2" is a replica of container "postgresql1" + Then container "postgresql3" is a replica of container "postgresql1" + Then pgbouncer is running in container "postgresql1" + Then pgbouncer is running in container "postgresql2" + Then pgbouncer is running in container "postgresql3" + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + + Examples: + | lock_type | lock_host | quorum_commit | + | zookeeper | zookeeper1 | no | + | zookeeper | zookeeper1 | yes | diff --git a/tests/features/kill_non_ha_replica.feature b/tests/features/kill_non_ha_replica.feature new file mode 100644 index 0000000..db011fe --- /dev/null +++ b/tests/features/kill_non_ha_replica.feature @@ -0,0 +1,80 @@ +Feature: Destroy non HA replica in various scenarios + + Scenario Outline: Destroy non HA replica + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: 
+ priority: 0 + use_replication_slots: '' + quorum_commit: '' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf__slot.sh %m %p + """ + Given a following cluster with "" replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + postgresql3: + role: replica + config: + pgconsul.conf: + global: + stream_from: pgconsul_postgresql1_1.pgconsul_pgconsul_net + stream_from: postgresql1 + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql2" is in group + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + When we container "postgresql3" + Then container "postgresql1" is primary + Then container "postgresql2" is a replica of container "postgresql1" + Then container "postgresql2" is in group + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + """ + When we container "postgresql3" + Then container "postgresql3" is a replica of container "postgresql1" + Then container "postgresql2" is in group + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + Then container "postgresql1" is primary + Then container "postgresql2" is a replica of container "postgresql1" + Then container "postgresql3" is a replica of container "postgresql1" + Then pgconsul in container "postgresql3" is connected to zookeeper + + Examples: , replication slots, / + | lock_type | lock_host | destroy | repair | with_slots | use_slots | quorum_commit | replication_type | + | zookeeper | zookeeper1 | stop | start | without | no | yes | quorum | + | zookeeper | zookeeper1 | stop | start | with | yes | yes | quorum | + | zookeeper | zookeeper1 | disconnect from network | connect to network | without | no | yes | quorum | + | zookeeper | zookeeper1 | disconnect from network | connect to network | with | yes | yes | quorum | + | zookeeper | zookeeper1 | stop | start | without | no | no | sync | + | zookeeper | zookeeper1 | stop | start | with | yes | no | sync | + | zookeeper | zookeeper1 | disconnect from network | connect to network | without | no | no | sync | + | zookeeper | zookeeper1 | disconnect from network | connect to network | with | yes | no | sync | + diff --git a/tests/features/kill_primary.feature b/tests/features/kill_primary.feature new file mode 100644 index 0000000..d9d5f03 --- /dev/null +++ b/tests/features/kill_primary.feature @@ -0,0 +1,377 @@ +Feature: Destroy primary in various scenarios + + + @failover + Scenario Outline: Destroy primary on 2-hosts cluster with primary_switch_restart = + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: '' + quorum_commit: 'yes' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 
'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + primary_switch_restart: + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf__slot.sh %m %p + """ + Given a following cluster with "" replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 2 + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql2" is in quorum group + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + """ + When we container "postgresql1" + Then "" has holder "pgconsul_postgresql2_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql2" became a primary + Then "" has value "finished" for key "/pgconsul/postgresql/failover_state" + Examples: , synchronous replication slots, / + | lock_type | lock_host | destroy | repair | with_slots | use_slots | primary_switch_restart | + | zookeeper | zookeeper1 | disconnect from network | connect to network | with | yes | no | + + + @failover + Scenario Outline: Destroy primary one by one with primary_switch_restart = + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: '' + quorum_commit: 'yes' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + primary_switch_restart: + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf__slot.sh %m %p + """ + Given a following cluster with "" replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 2 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql2" is in quorum group + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + When we container "postgresql1" + Then "" has holder "pgconsul_postgresql2_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql2" became a primary + Then "" has value "finished" for key "/pgconsul/postgresql/failover_state" + Then container "postgresql3" is in quorum group + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + Then container "postgresql3" is a replica of container "postgresql2" + Then postgresql in container "postgresql3" was not rewinded + Then "" has value "["pgconsul_postgresql3_1.pgconsul_pgconsul_net"]" for key "/pgconsul/postgresql/quorum" + When we container "postgresql2" + Then "" has holder "pgconsul_postgresql3_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql3" became a primary + Examples: , synchronous replication slots, / + | lock_type | lock_host | destroy | repair | with_slots | 
use_slots | quorum_commit | replication_type | primary_switch_restart | + | zookeeper | zookeeper1 | disconnect from network | connect to network | with | yes | yes | quorum | no | + + + @failover + Scenario Outline: Destroy primary with primary_switch_restart = + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: '' + quorum_commit: '' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + primary_switch_restart: + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf__slot.sh %m %p + """ + Given a following cluster with "" replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 2 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql2" is in group + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + When we container "postgresql1" + Then "" has holder "pgconsul_postgresql2_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql2" became a primary + Then "" has value "finished" for key "/pgconsul/postgresql/failover_state" + Then container "postgresql3" is in group + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + Then container "postgresql3" is a replica of container "postgresql2" + Then postgresql in container "postgresql3" was not rewinded + When we container "postgresql1" + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql1_1.pgconsul_pgconsul_net + state: streaming + """ + Then container "postgresql1" is a replica of container "postgresql2" + Then pgconsul in container "postgresql1" is connected to zookeeper + Then postgresql in container "postgresql1" was rewinded + + Examples: , synchronous replication slots, / + | lock_type | lock_host | destroy | repair | with_slots | use_slots | quorum_commit | replication_type | primary_switch_restart | + | zookeeper | zookeeper1 | stop | start | without | no | yes | quorum | yes | + | zookeeper | zookeeper1 | stop | start | with | yes | yes | quorum | yes | + | zookeeper | zookeeper1 | disconnect from network | connect to network | without | no | yes | quorum | yes | + | zookeeper | zookeeper1 | disconnect from network | connect to network | with | yes | yes | quorum | yes | + | zookeeper | zookeeper1 | stop | start | without | no | no | sync | yes | + | zookeeper | zookeeper1 | stop | start | with | yes | no | sync | yes | + | zookeeper | zookeeper1 | disconnect from network | connect to network | without | no | no | sync | yes | + | zookeeper | zookeeper1 | disconnect from network | connect to network | with | yes | no | sync | yes | + | zookeeper | zookeeper1 | stop | start | without | no | yes | quorum | no | + | zookeeper | 
zookeeper1 | stop | start | with | yes | yes | quorum | no | + | zookeeper | zookeeper1 | disconnect from network | connect to network | without | no | yes | quorum | no | + | zookeeper | zookeeper1 | disconnect from network | connect to network | with | yes | yes | quorum | no | + | zookeeper | zookeeper1 | stop | start | without | no | no | sync | no | + | zookeeper | zookeeper1 | stop | start | with | yes | no | sync | no | + | zookeeper | zookeeper1 | disconnect from network | connect to network | without | no | no | sync | no | + | zookeeper | zookeeper1 | disconnect from network | connect to network | with | yes | no | sync | no | + + + @failover + Scenario Outline: Destroy primary with async replication + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: '<use_slots>' + primary: + change_replication_type: '<change_replication>' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: '<data_loss>' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_<with_slots>_slot.sh %m %p + """ + Given a following cluster with "<with_slots>" replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 2 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + """ + Then "<lock_type>" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then "<lock_type>" has holder "pgconsul_postgresql2_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/sync_replica" + Then "<lock_type>" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + sync_state: <sync_state> + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + sync_state: async + """ + When we <destroy> container "postgresql1" + Then "<lock_type>" has holder "pgconsul_postgresql2_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql2" became a primary + Then "<lock_type>" has value "finished" for key "/pgconsul/postgresql/failover_state" + Then "<lock_type>" has holder "pgconsul_postgresql3_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/sync_replica" + Then "<lock_type>" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + sync_state: async + """ + Then container "postgresql3" is a replica of container "postgresql2" + Then postgresql in container "postgresql3" was not rewinded + When we <repair> container "postgresql1" + Then "<lock_type>" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + sync_state: <sync_state> + - client_hostname: pgconsul_postgresql1_1.pgconsul_pgconsul_net + state: streaming + sync_state: async + """ + Then container "postgresql1" is a replica of container "postgresql2" + Then pgconsul in container "postgresql1" is connected to zookeeper + Then postgresql in container "postgresql1" was rewinded + + Examples: <lock_type>, <sync_state>hronous replication slots, <destroy>/<repair> + | lock_type | lock_host | destroy | repair | with_slots | use_slots | sync_state | change_replication | data_loss | + | zookeeper | zookeeper1 | stop | start | without | no | async | no | yes | + | zookeeper | zookeeper1 | stop | start | with | yes | async | no | yes | + | zookeeper | zookeeper1 | disconnect from network | connect to network | 
without | no | async | no | yes | + | zookeeper | zookeeper1 | disconnect from network | connect to network | with | yes | async | no | yes | + + + @failover_archive + Scenario Outline: Destroy primary with one replica in archive recovery + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + quorum_commit: 'yes' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 3 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + primary_switch_restart: 'no' + recovery_timeout: 20 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "" with replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 2 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql2" is in quorum group + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + When we stop container "postgresql3" + When we drop replication slot "pgconsul_postgresql3_1_pgconsul_pgconsul_net" in container "postgresql1" + When we start container "postgresql3" + Then "" has value "["pgconsul_postgresql2_1.pgconsul_pgconsul_net"]" for key "/pgconsul/postgresql/quorum" + When we wait "10.0" seconds + When we container "postgresql1" + Then "" has holder "pgconsul_postgresql2_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql2" became a primary + Then "" has value "finished" for key "/pgconsul/postgresql/failover_state" + Then container "postgresql3" is in quorum group + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + Then container "postgresql3" is a replica of container "postgresql2" + Then postgresql in container "postgresql3" was not rewinded + When we container "postgresql1" + Then container "postgresql1" is a replica of container "postgresql2" + Then pgconsul in container "postgresql1" is connected to zookeeper + Then postgresql in container "postgresql1" was rewinded + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql1_1.pgconsul_pgconsul_net + state: streaming + """ + Examples: , synchronous replication with slots, / + | lock_type | lock_host | destroy | repair | + | zookeeper | zookeeper1 | disconnect from network | connect to network | + | zookeeper | zookeeper1 | stop | start | diff --git a/tests/features/kill_replica.feature b/tests/features/kill_replica.feature new file mode 100644 index 0000000..9a02825 --- /dev/null +++ b/tests/features/kill_replica.feature @@ -0,0 +1,287 @@ +Feature: Destroy synchronous replica in various scenarios + + + Scenario Outline: Destroy synchronous replica + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: '' + 
quorum_commit: '<quorum_commit>' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_<with_slots>_slot.sh %m %p + """ + Given a following cluster with "<with_slots>" replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 2 + """ + Then "<lock_type>" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql3" is in <replication_type> group + Then "<lock_type>" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + When we <destroy> container "postgresql3" + Then container "postgresql1" is primary + Then container "postgresql2" is a replica of container "postgresql1" + Then container "postgresql2" is in <replication_type> group + Then "<lock_type>" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + """ + When we <repair> container "postgresql3" + Then container "postgresql3" is a replica of container "postgresql1" + Then container "postgresql3" is in <replication_type> group + Then "<lock_type>" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + Then container "postgresql1" is primary + Then container "postgresql2" is a replica of container "postgresql1" + Then container "postgresql3" is a replica of container "postgresql1" + Then pgconsul in container "postgresql3" is connected to zookeeper + + Examples: <lock_type>, <with_slots> replication slots, <destroy>/<repair> + | lock_type | lock_host | destroy | repair | with_slots | use_slots | quorum_commit | replication_type | + | zookeeper | zookeeper1 | stop | start | without | no | yes | quorum | + | zookeeper | zookeeper1 | stop | start | with | yes | yes | quorum | + | zookeeper | zookeeper1 | disconnect from network | connect to network | without | no | yes | quorum | + | zookeeper | zookeeper1 | disconnect from network | connect to network | with | yes | yes | quorum | + | zookeeper | zookeeper1 | stop | start | without | no | no | sync | + | zookeeper | zookeeper1 | stop | start | with | yes | no | sync | + | zookeeper | zookeeper1 | disconnect from network | connect to network | without | no | no | sync | + | zookeeper | zookeeper1 | disconnect from network | connect to network | with | yes | no | sync | + + Scenario Outline: Loss of zookeeper connectivity + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: '<use_slots>' + quorum_commit: <quorum_commit> + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_<with_slots>_slot.sh %m %p + """ + Given a following cluster with "<with_slots>" replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + config: + pgconsul.conf: + global: + 
priority: 1 + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql2" is in group + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + sync_state: + """ + When we kill "pgconsul" in container "postgresql2" with signal "SIGKILL" + And we wait "10.0" seconds + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + sync_state: async + """ + When we start "pgconsul" in container "postgresql2" + Then container "postgresql2" is a replica of container "postgresql1" + Then container "postgresql2" is in group + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + sync_state: + """ + + Examples: , replication slots, / + | lock_type | lock_host | with_slots | use_slots | quorum_commit | replication_type | + | zookeeper | zookeeper1 | without | no | yes | quorum | + | zookeeper | zookeeper1 | with | yes | yes | quorum | + | zookeeper | zookeeper1 | without | no | no | sync | + | zookeeper | zookeeper1 | with | yes | no | sync | + + + Scenario Outline: Loss connect to last quorum replica + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: '' + quorum_commit: + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf__slot.sh %m %p + """ + Given a following cluster with "" replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql2" is in group + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + sync_state: + """ + When we disconnect from network container "postgresql2" + And we wait "35.0" seconds + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + """ + When we connect to network container "postgresql2" + Then container "postgresql2" is a replica of container "postgresql1" + Then container "postgresql2" is in group + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + sync_state: + """ + + Examples: , replication slots, / + | lock_type | lock_host | with_slots | use_slots | quorum_commit | replication_type | + | zookeeper | zookeeper1 | without | no | yes | quorum | + | zookeeper | zookeeper1 | with | yes | yes | quorum | + | zookeeper | zookeeper1 | without | no | no | sync | + | zookeeper | zookeeper1 | with | yes | no | sync | + + + @pause_replication + Scenario Outline: Pause replication on replica + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + quorum_commit: 'yes' + primary: + change_replication_type: 'yes' + 
primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "" with replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 2 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql2" is in quorum group + Then container "postgresql3" is in quorum group + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + When we pause replaying WAL in container "postgresql2" + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql2" is in quorum group + Then container "postgresql3" is in quorum group + Then container "postgresql2" is replaying WAL + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + When we pause replaying WAL in container "postgresql3" + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql2" is in quorum group + Then container "postgresql3" is in quorum group + Then container "postgresql3" is replaying WAL + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + + Examples: + | lock_type | lock_host | + | zookeeper | zookeeper1 | diff --git a/tests/features/maintenance.feature b/tests/features/maintenance.feature new file mode 100644 index 0000000..9af9276 --- /dev/null +++ b/tests/features/maintenance.feature @@ -0,0 +1,296 @@ +Feature: Check maintenance mode + + @maintenance_exit + Scenario Outline: Single-host cluster should exit from the maintenance mode when Postgres is dead. 
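+ # The single node is put into maintenance, Postgres is stopped, and after maintenance is disabled pgconsul is expected to clear the flag and bring postgresql1 back as primary.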
+ Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + quorum_commit: 'yes' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + sync_replication_in_maintenance: 'no' + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "" with replication slots + """ + postgresql1: + role: primary + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + When we set value "enable" for key "/pgconsul/postgresql/maintenance" in "" + And we gracefully stop "postgres" in container "postgresql1" + When we set value "disable" for key "/pgconsul/postgresql/maintenance" in "" + Then "" has value "None" for key "/pgconsul/postgresql/maintenance" + And container "postgresql1" became a primary + Examples: , + | lock_type | lock_host | + | zookeeper | zookeeper1 | + + + + @maintenance_exit + Scenario Outline: Single-host cluster should exit from the maintenance mode when the container is unavailable. + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + quorum_commit: 'yes' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + sync_replication_in_maintenance: 'no' + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "" with replication slots + """ + postgresql1: + role: primary + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + When we set value "enable" for key "/pgconsul/postgresql/maintenance" in "" + When we container "postgresql1" + And we wait "10.0" seconds + When we container "postgresql1" + And we wait "10.0" seconds + When we set value "disable" for key "/pgconsul/postgresql/maintenance" in "" + Then "" has value "None" for key "/pgconsul/postgresql/maintenance" + And container "postgresql1" became a primary + Examples: , , , + | lock_type | lock_host | destroy | repair | + | zookeeper | zookeeper1 | stop | start | + | zookeeper | zookeeper1 | disconnect from network | connect to network | + | zookeeper | zookeeper1 | stop | start | + | zookeeper | zookeeper1 | disconnect from network | connect to network | + + + + Scenario Outline: Check container stop in maintanence mode + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + quorum_commit: '' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "" with replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 2 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql2" is in group + And container "postgresql2" is a replica of container "postgresql1" + And container "postgresql3" is a replica of container "postgresql1" + When we set value "enable" for key "/pgconsul/postgresql/maintenance" in "" + And we 
stop container "postgresql1" + And we wait "10.0" seconds + And we start container "postgresql1" + And we start "postgres" in container "postgresql1" + And we wait "10.0" seconds + When we set value "disable" for key "/pgconsul/postgresql/maintenance" in "" + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + And container "postgresql1" became a primary + Then container "postgresql2" is in group + And container "postgresql2" is a replica of container "postgresql1" + And container "postgresql3" is a replica of container "postgresql1" + Examples: , , , + | lock_type | lock_host | quorum_commit | replication_type | + | zookeeper | zookeeper1 | yes | quorum | + | zookeeper | zookeeper1 | no | sync | + + + + Scenario Outline: Check pgbouncer is untouchable in maintenance mode + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + quorum_commit: '' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "" with replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 2 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + And container "postgresql2" is a replica of container "postgresql1" + And container "postgresql3" is a replica of container "postgresql1" + When we set value "enable" for key "/pgconsul/postgresql/maintenance" in "" + And we wait "10.0" seconds + Then pgbouncer is running in container "postgresql1" + When we disconnect from network container "postgresql1" + And we wait "10.0" seconds + Then pgbouncer is running in container "postgresql1" + When we connect to network container "postgresql1" + And we wait "10.0" seconds + Then pgbouncer is running in container "postgresql1" + Examples: , , + | lock_type | lock_host | quorum_commit | + | zookeeper | zookeeper1 | yes | + | zookeeper | zookeeper1 | no | + + + + Scenario Outline: Sync replication turns off in maintenance + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + quorum_commit: '' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + sync_replication_in_maintenance: 'no' + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "" with replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 2 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql2" is in group + And container "postgresql2" is a replica of container "postgresql1" + And container "postgresql3" is a replica of container "postgresql1" + When we set value 
"enable" for key "/pgconsul/postgresql/maintenance" in "" + And we wait "10.0" seconds + Then container "postgresql1" replication state is "async" + And postgresql in container "postgresql1" has empty option "synchronous_standby_names" + When we set value "disable" for key "/pgconsul/postgresql/maintenance" in "" + And we wait "10.0" seconds + Then container "postgresql2" is in group + Examples: , , , + | lock_type | lock_host | quorum_commit | replication_type | + | zookeeper | zookeeper1 | yes | quorum | + | zookeeper | zookeeper1 | no | sync | + + + @maintenance_primary + Scenario Outline: Node with current primary exists in maintenance + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + quorum_commit: 'yes' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + sync_replication_in_maintenance: 'no' + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "" with replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 2 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + And container "postgresql2" is a replica of container "postgresql1" + And container "postgresql3" is a replica of container "postgresql1" + And container "postgresql2" is in quorum group + And container "postgresql3" is in quorum group + When we set value "enable" for key "/pgconsul/postgresql/maintenance" in "" + Then "" has value "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for key "/pgconsul/postgresql/maintenance/master" + When we set value "disable" for key "/pgconsul/postgresql/maintenance" in "" + Then "" has no value for key "/pgconsul/postgresql/maintenance/master" + Examples: , + | lock_type | lock_host | + | zookeeper | zookeeper1 | diff --git a/tests/features/op_track.feature b/tests/features/op_track.feature new file mode 100644 index 0000000..52dd17a --- /dev/null +++ b/tests/features/op_track.feature @@ -0,0 +1,48 @@ +Feature: Destructive operation tracking + + Scenario Outline: No lock on primary if unfinished op is present + Given a "pgconsul" container common config + """ + pgconsul.conf: + replica: + primary_unavailability_timeout: 100500 + """ + And a following cluster with "" without replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + postgresql3: + role: replica + """ + When we set value "rewind" for key "/pgconsul/postgresql/all_hosts/pgconsul_postgresql1_1.pgconsul_pgconsul_net/op" in "" + Then "" has holder "None" for lock "/pgconsul/postgresql/leader" + And pgbouncer is not running in container "postgresql1" + + Examples: + | lock_type | lock_host | + | zookeeper | zookeeper1 | + + Scenario Outline: Unfinished op is properly cleaned up on replica + Given a "pgconsul" container common config + """ + pgconsul.conf: + replica: + primary_unavailability_timeout: 100500 + """ + And a following cluster with "" without replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + postgresql3: + role: replica + """ + When we set value "rewind" for key 
"/pgconsul/postgresql/all_hosts/pgconsul_postgresql2_1.pgconsul_pgconsul_net/op" in "" + Then "" has value "None" for key "/pgconsul/postgresql/all_hosts/pgconsul_postgresql2_1.pgconsul_pgconsul_net/op" + + Examples: + | lock_type | lock_host | + | zookeeper | zookeeper1 | diff --git a/tests/features/pgconsul_util.feature b/tests/features/pgconsul_util.feature new file mode 100644 index 0000000..482c501 --- /dev/null +++ b/tests/features/pgconsul_util.feature @@ -0,0 +1,884 @@ +Feature: Check pgconsul-util features + + @pgconsul_util_maintenance + Scenario Outline: Check pgconsul-util maintenance works + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + quorum_commit: 'yes' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "" with replication slots + """ + postgresql1: + role: primary + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + When we run following command on host "postgresql1" + """ + pgconsul-util maintenance -m show + """ + Then command exit with return code "0" + And command result contains following output + """ + disabled + """ + When we run following command on host "postgresql1" + """ + pgconsul-util maintenance -m enable + """ + Then command exit with return code "0" + And "" has value "enable" for key "/pgconsul/postgresql/maintenance" + When we run following command on host "postgresql1" + """ + pgconsul-util maintenance -m show + """ + Then command exit with return code "0" + And command result contains following output + """ + enabled + """ + When we run following command on host "postgresql1" + """ + pgconsul-util maintenance -m disable + """ + Then command exit with return code "0" + And "" has value "None" for key "/pgconsul/postgresql/maintenance" + When we run following command on host "postgresql1" + """ + pgconsul-util maintenance -m show + """ + Then command exit with return code "0" + And command result contains following output + """ + disabled + """ + Examples: , + | lock_type | lock_host | + | zookeeper | zookeeper1 | + + + @pgconsul_util_maintenance + Scenario Outline: Check pgconsul-util maintenance enable with wait_all option works fails + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + quorum_commit: 'yes' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "" with replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + And container "postgresql2" is in quorum group + When we gracefully stop "pgconsul" in container "postgresql2" + And we gracefully stop "pgconsul" in container "postgresql1" + And we lock "/pgconsul/postgresql/alive/pgconsul_postgresql1_1.pgconsul_pgconsul_net" in "" + 
And we lock "/pgconsul/postgresql/alive/pgconsul_postgresql2_1.pgconsul_pgconsul_net" in "" + When we run following command on host "postgresql1" + """ + pgconsul-util maintenance -m enable --wait_all --timeout 10 + """ + Then command exit with return code "1" + And command result contains following output + """ + TimeoutError + """ + When we release lock "/pgconsul/postgresql/alive/pgconsul_postgresql1_1.pgconsul_pgconsul_net" in "" + And we release lock "/pgconsul/postgresql/alive/pgconsul_postgresql2_1.pgconsul_pgconsul_net" in "" + Examples: , + | lock_type | lock_host | + | zookeeper | zookeeper1 | + + + @pgconsul_util_maintenance + Scenario Outline: Check pgconsul-util maintenance with wait_all option works works + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + quorum_commit: 'yes' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "" with replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + And container "postgresql2" is in quorum group + When we run following command on host "postgresql1" + """ + pgconsul-util maintenance -m enable --wait_all --timeout 10 + """ + Then command exit with return code "0" + And command result contains following output + """ + Success + """ + And "" has value "enable" for key "/pgconsul/postgresql/maintenance" + When we run following command on host "postgresql1" + """ + pgconsul-util maintenance -m disable --wait_all --timeout 10 + """ + Then command exit with return code "0" + And command result contains following output + """ + Success + """ + And "" has value "None" for key "/pgconsul/postgresql/maintenance" + Examples: , + | lock_type | lock_host | + | zookeeper | zookeeper1 | + + + @pgconsul_util_maintenance + Scenario Outline: Check pgconsul-util maintenance disable with wait_all option works fails + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + quorum_commit: 'yes' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "" with replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + And container "postgresql2" is in quorum group + When we run following command on host "postgresql1" + """ + pgconsul-util maintenance -m enable --wait_all --timeout 10 + """ + Then command exit with return code "0" + And command result contains following output + """ + Success + """ + And "" has value "enable" for key "/pgconsul/postgresql/maintenance" + When we gracefully stop "pgconsul" in container "postgresql2" + And we gracefully stop "pgconsul" in container "postgresql1" + And we lock 
"/pgconsul/postgresql/alive/pgconsul_postgresql1_1.pgconsul_pgconsul_net" in "" + And we lock "/pgconsul/postgresql/alive/pgconsul_postgresql2_1.pgconsul_pgconsul_net" in "" + When we run following command on host "postgresql1" + """ + pgconsul-util maintenance -m disable --wait_all --timeout 10 + """ + Then command exit with return code "1" + And command result contains following output + """ + TimeoutError + """ + When we release lock "/pgconsul/postgresql/alive/pgconsul_postgresql1_1.pgconsul_pgconsul_net" in "" + And we release lock "/pgconsul/postgresql/alive/pgconsul_postgresql2_1.pgconsul_pgconsul_net" in "" + Examples: , + | lock_type | lock_host | + | zookeeper | zookeeper1 | + + + @pgconsul_util_switchover_single + Scenario Outline: Check pgconsul-util switchover single-node cluster works as expected + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + quorum_commit: 'yes' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "" with replication slots + """ + postgresql1: + role: primary + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + When we run following command on host "postgresql1" + """ + pgconsul-util switchover --yes --block + """ + Then command exit with return code "1" + And command result contains following output + """ + Switchover is impossible now + """ + Examples: , + | lock_type | lock_host | + | zookeeper | zookeeper1 | + + + @pgconsul_util_switchover_stream_from + Scenario Outline: Check pgconsul-util switchover single-node cluster works as expected + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + quorum_commit: 'yes' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "" with replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + config: + pgconsul.conf: + global: + stream_from: pgconsul_postgresql1_1.pgconsul_pgconsul_net + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + When we run following command on host "postgresql1" + """ + pgconsul-util switchover --yes --block + """ + Then command exit with return code "1" + And command result contains following output + """ + Switchover is impossible now + """ + Examples: , + | lock_type | lock_host | + | zookeeper | zookeeper1 | + + + @pgconsul_util_switchover + Scenario Outline: Check pgconsul-util switchover works as expected + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + quorum_commit: '' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + 
primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "" with replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql2" is in group + When we run following command on host "postgresql1" + """ + pgconsul-util switchover --yes --block + """ + Then command exit with return code "0" + And command result contains following output + """ + switchover finished, zk status "None" + """ + Then container "postgresql2" became a primary + And container "postgresql1" is a replica of container "postgresql2" + And container "postgresql1" is in group + And postgresql in container "postgresql1" was not rewinded + Examples: , + | lock_type | lock_host | replication_type | quorum_commit | + | zookeeper | zookeeper1 | sync | no | + | zookeeper | zookeeper1 | quorum | yes | + + + @pgconsul_util_switchover + Scenario Outline: Check pgconsul-util targeted switchover works as expected + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + quorum_commit: '' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "" with replication slots + """ + postgresql1: + role: primary + config: + pgconsul.conf: + global: + priority: 1 + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 2 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 3 + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql3" is in group + When we run following command on host "postgresql1" + """ + pgconsul-util switchover --yes --block --destination pgconsul_postgresql2_1.pgconsul_pgconsul_net + """ + Then command exit with return code "0" + And command result contains following output + """ + switchover finished, zk status "None" + """ + Then container "postgresql2" became a primary + And container "postgresql1" is a replica of container "postgresql2" + And container "postgresql3" is a replica of container "postgresql2" + And container "postgresql3" is in group + And postgresql in container "postgresql1" was not rewinded + And postgresql in container "postgresql3" was not rewinded + Examples: , + | lock_type | lock_host | replication_type | quorum_commit | + | zookeeper | zookeeper1 | sync | no | + | zookeeper | zookeeper1 | quorum | yes | + + + @pgconsul_util_switchover_reset + Scenario Outline: Check pgconsul-util switchover reset works as expected + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + quorum_commit: 'yes' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "" with replication 
slots + """ + postgresql1: + role: primary + config: + pgconsul.conf: + global: + priority: 1 + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 2 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 3 + """ + When we gracefully stop "pgconsul" in container "postgresql1" + And we gracefully stop "pgconsul" in container "postgresql2" + And we gracefully stop "pgconsul" in container "postgresql3" + And we lock "/pgconsul/postgresql/alive/pgconsul_postgresql1_1.pgconsul_pgconsul_net" in "" + And we lock "/pgconsul/postgresql/alive/pgconsul_postgresql2_1.pgconsul_pgconsul_net" in "" + And we lock "/pgconsul/postgresql/alive/pgconsul_postgresql3_1.pgconsul_pgconsul_net" in "" + And we lock "/pgconsul/postgresql/leader" in "" with value "pgconsul_postgresql1_1.pgconsul_pgconsul_net" + And we run following command on host "postgresql1" + """ + pgconsul-util switchover --yes + """ + Then command exit with return code "0" + And command result contains following output + """ + scheduled + """ + Then "" has value "scheduled" for key "/pgconsul/postgresql/switchover/state" + And "" has value "{"hostname": "pgconsul_postgresql1_1.pgconsul_pgconsul_net", "timeline": 1, "destination": null}" for key "/pgconsul/postgresql/switchover/master" + When we run following command on host "postgresql1" + """ + pgconsul-util switchover --reset + """ + Then command exit with return code "0" + And command result contains following output + """ + resetting ZK switchover nodes + """ + Then "" has value "failed" for key "/pgconsul/postgresql/switchover/state" + And "" has value "{}" for key "/pgconsul/postgresql/switchover/master" + When we run following command on host "postgresql1" + """ + pgconsul-util switchover --yes + """ + Then command exit with return code "0" + And command result contains following output + """ + scheduled + """ + Then "" has value "scheduled" for key "/pgconsul/postgresql/switchover/state" + And "" has value "{"hostname": "pgconsul_postgresql1_1.pgconsul_pgconsul_net", "timeline": 1, "destination": null}" for key "/pgconsul/postgresql/switchover/master" + When we release lock "/pgconsul/postgresql/alive/pgconsul_postgresql1_1.pgconsul_pgconsul_net" in "" + And we release lock "/pgconsul/postgresql/alive/pgconsul_postgresql2_1.pgconsul_pgconsul_net" in "" + And we release lock "/pgconsul/postgresql/alive/pgconsul_postgresql3_1.pgconsul_pgconsul_net" in "" + And we release lock "/pgconsul/postgresql/leader" in "" with value "pgconsul_postgresql1_1.pgconsul_pgconsul_net" + And we start "pgconsul" in container "postgresql1" + And we start "pgconsul" in container "postgresql2" + And we start "pgconsul" in container "postgresql3" + Then "" has no value for key "/pgconsul/postgresql/switchover/state" + And "" has no value for key "/pgconsul/postgresql/switchover/master" + Examples: , + | lock_type | lock_host | + | zookeeper | zookeeper1 | + + @pgconsul_util_initzk @pgconsul_util_initzk_test + Scenario Outline: Check pgconsul-util initzk --test works as expected + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + quorum_commit: 'yes' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a 
following cluster with "" with replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + """ + When we gracefully stop "pgconsul" in container "postgresql1" + And we gracefully stop "pgconsul" in container "postgresql2" + And we remove key "/pgconsul/postgresql" in "" + And we run following command on host "postgresql1" + """ + pgconsul-util initzk --test pgconsul_postgresql1_1.pgconsul_pgconsul_net pgconsul_postgresql2_1.pgconsul_pgconsul_net + """ + Then command exit with return code "2" + And command result contains following output + """ + Path "all_hosts/pgconsul_postgresql1_1.pgconsul_pgconsul_net" not found in ZK, initialization has not been performed earlier + """ + + When we start "pgconsul" in container "postgresql1" + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + And "" has value "0" for key "/pgconsul/postgresql/all_hosts/pgconsul_postgresql1_1.pgconsul_pgconsul_net/prio" + + When we run following command on host "postgresql1" + """ + pgconsul-util initzk --test pgconsul_postgresql1_1.pgconsul_pgconsul_net pgconsul_postgresql2_1.pgconsul_pgconsul_net + """ + Then command exit with return code "2" + And command result contains following output + """ + Path "all_hosts/pgconsul_postgresql2_1.pgconsul_pgconsul_net" not found in ZK, initialization has not been performed earlier + """ + + When we start "pgconsul" in container "postgresql2" + Then container "postgresql2" is in quorum group + + When we run following command on host "postgresql1" + """ + pgconsul-util initzk --test pgconsul_postgresql1_1.pgconsul_pgconsul_net pgconsul_postgresql2_1.pgconsul_pgconsul_net + """ + Then command exit with return code "0" + And command result contains following output + """ + Initialization for all fqdns has been performed earlier + """ + Examples: , + | lock_type | lock_host | + | zookeeper | zookeeper1 | + + @pgconsul_util_initzk @pgconsul_util_initzk_do_init + Scenario Outline: Check pgconsul-util initzk works as expected + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + quorum_commit: 'yes' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "" with replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + And container "postgresql2" is in quorum group + When we run following command on host "postgresql1" + """ + pgconsul-util switchover --yes --block + """ + Then command exit with return code "0" + + When we gracefully stop "pgconsul" in container "postgresql1" + And we gracefully stop "pgconsul" in container "postgresql2" + And we remove key "/pgconsul/postgresql" in "" + And we start "pgconsul" in container "postgresql1" + And we start "pgconsul" in container "postgresql2" + And we wait "10.0" seconds + Then "pgconsul" is not running in container "postgresql1" + And "pgconsul" is not running in container "postgresql2" + + When we run following command on host "postgresql1" + """ + pgconsul-util initzk pgconsul_postgresql1_1.pgconsul_pgconsul_net 
pgconsul_postgresql2_1.pgconsul_pgconsul_net + """ + Then command exit with return code "0" + And command result contains following output + """ + ZK structures are initialized + """ + + When we start "pgconsul" in container "postgresql1" + And we start "pgconsul" in container "postgresql2" + Then "" has holder "pgconsul_postgresql2_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + And container "postgresql1" is in quorum group + Examples: , + | lock_type | lock_host | + | zookeeper | zookeeper1 | + + @pgconsul_util_initzk @pgconsul_util_initzk_errors_handling + Scenario Outline: Check pgconsul-util initzk works as expected + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + quorum_commit: 'yes' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "" with replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + """ + When we disconnect from network container "zookeeper1" + And we disconnect from network container "zookeeper2" + And we disconnect from network container "zookeeper3" + And we run following command on host "postgresql1" + """ + pgconsul-util initzk --test pgconsul_postgresql1_1.pgconsul_pgconsul_net pgconsul_postgresql2_1.pgconsul_pgconsul_net + """ + Then command exit with return code "1" + And command result contains following output + """ + KazooTimeoutError + """ + + When we run following command on host "postgresql1" + """ + pgconsul-util initzk pgconsul_postgresql1_1.pgconsul_pgconsul_net pgconsul_postgresql2_1.pgconsul_pgconsul_net + """ + Then command exit with return code "1" + And command result contains following output + """ + Could not create path "all_hosts/pgconsul_postgresql1_1.pgconsul_pgconsul_net" in ZK + """ + Examples: , + | lock_type | lock_host | + | zookeeper | zookeeper1 | + + + @pgconsul_util_info + Scenario Outline: Check pgconsul-util info for single-host cluster. 
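+ # Smoke check for `pgconsul-util info` on a one-node cluster: the plain output should report "alive: true", and `-s --json` should dump the stored state with this host as primary and an empty replics_info.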
+ Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + + use_replication_slots: 'yes' + quorum_commit: 'yes' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + sync_replication_in_maintenance: 'no' + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "" with replication slots + """ + postgresql1: + role: primary + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + When we run following command on host "postgresql1" + """ + pgconsul-util info + """ + Then command exit with return code "0" + And command result contains following output + """ + alive: true + """ + When we run following command on host "postgresql1" + """ + pgconsul-util info -s --json + """ + Then command exit with return code "0" + And command result contains following output + """ + { + "alive": true, + "last_failover_time": null, + "maintenance": null, + "primary": "pgconsul_postgresql1_1.pgconsul_pgconsul_net", + "replics_info": {} + } + """ + Examples: , + | lock_type | lock_host | + | zookeeper | zookeeper1 | + + + @pgconsul_util_info + Scenario Outline: Check pgconsul-util info for HA cluster works + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + quorum_commit: 'yes' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "" with replication slots + """ + postgresql1: + role: primary + config: + pgconsul.conf: + global: + priority: 2 + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 3 + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + When we run following command on host "postgresql1" + """ + pgconsul-util info + """ + Then command exit with return code "0" + When we run following command on host "postgresql2" + """ + pgconsul-util info -js + """ + Then command exit with return code "0" + Examples: , + | lock_type | lock_host | + | zookeeper | zookeeper1 | + + + Scenario Outline: Check pgconsul-util info with cascade replica + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'no' + postgres_timeout: 5 + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + recovery_timeout: 5 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_without_slot.sh %m %p + """ + Given a following cluster with "" without replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + postgresql3: + role: replica + config: + pgconsul.conf: + global: + stream_from: pgconsul_postgresql2_1.pgconsul_pgconsul_net + stream_from: postgresql2 + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + When we run following command on host "postgresql1" + """ + 
pgconsul-util info + """ + Then command exit with return code "0" + When we run following command on host "postgresql2" + """ + pgconsul-util info -js + """ + Then command exit with return code "0" + + Examples: <lock_type>, <lock_host> + | lock_type | lock_host | + | zookeeper | zookeeper1 | diff --git a/tests/features/plugins.feature b/tests/features/plugins.feature new file mode 100644 index 0000000..ebe55c0 --- /dev/null +++ b/tests/features/plugins.feature @@ -0,0 +1,50 @@ +Feature: Check plugins + + @plugins + Scenario Outline: Check upload_wals plugin + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + use_lwaldump: 'no' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "<lock_type>" with replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 2 + """ + Then "<lock_host>" has holder "pgconsul_postgresql3_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/sync_replica" + When we disable archiving in "postgresql1" + And we switch wal in "postgresql1" "10" times + And we <destroy> container "postgresql1" + Then container "postgresql3" became a primary + And wals present on backup "<backup_host>" + Examples: <lock_type>, <backup_host>, <lock_host>, <destroy> + | lock_type | backup_host | lock_host | destroy | + | zookeeper | backup1 | zookeeper1 | stop | + | zookeeper | backup1 | zookeeper1 | disconnect from network | diff --git a/tests/features/primary_switch.feature b/tests/features/primary_switch.feature new file mode 100644 index 0000000..4cd39df --- /dev/null +++ b/tests/features/primary_switch.feature @@ -0,0 +1,64 @@ +Feature: Check primary switch logic + + @switchover + Scenario Outline: Correct primary switch from shut down after switchover + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + quorum_commit: '<quorum_commit>' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 120 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "<lock_type>" with replication slots + """ + postgresql1: + role: primary + config: + pgconsul.conf: + global: + priority: 2 + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 3 + + """ + Then container "postgresql3" is in <replication_type> group + And container "postgresql2" is a replica of container "postgresql1" + When we gracefully stop "pgconsul" in container "postgresql2" + And we kill "postgres" in container "postgresql2" with signal "9" + And we lock "/pgconsul/postgresql/switchover/lock" in "<lock_host>" + And we set value "{"hostname": "pgconsul_postgresql1_1.pgconsul_pgconsul_net","timeline": 1}" for key "/pgconsul/postgresql/switchover/master" in "<lock_host>" + And we set value "scheduled" for key "/pgconsul/postgresql/switchover/state" in "<lock_host>" + And we release lock "/pgconsul/postgresql/switchover/lock" in "<lock_host>" + Then container "postgresql3" became a primary + And container 
"postgresql1" is a replica of container "postgresql3" + And postgresql in container "postgresql1" was not rewinded + When we start "pgconsul" in container "postgresql2" + Then "" has value "yes" for key "/pgconsul/postgresql/all_hosts/pgconsul_postgresql2_1.pgconsul_pgconsul_net/tried_remaster" + And container "postgresql2" is a replica of container "postgresql3" + And postgresql in container "postgresql2" was not rewinded + + Examples: , + | lock_type | lock_host | quorum_commit | replication_type | + | zookeeper | zookeeper1 | yes | quorum | + | zookeeper | zookeeper1 | no | sync | diff --git a/tests/features/priority.feature b/tests/features/priority.feature new file mode 100644 index 0000000..8c67fb6 --- /dev/null +++ b/tests/features/priority.feature @@ -0,0 +1,180 @@ +Feature: Replicas priority + + + + Scenario Outline: Asynchronous replica with higher priority promoted if replicas have same LSN + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: '' + quorum_commit: '' + primary: + change_replication_type: 'no' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'yes' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf__slot.sh %m %p + """ + Given a following cluster with "" replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 2 + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + write_location_diff: 0 + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + write_location_diff: 0 + """ + When we stop container "postgresql1" + Then "" has holder "pgconsul_postgresql3_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql3" became a primary + Then "" has value "finished" for key "/pgconsul/postgresql/failover_state" + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + """ + Then container "postgresql2" is a replica of container "postgresql3" + When we start container "postgresql1" + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql1_1.pgconsul_pgconsul_net + state: streaming + """ + Then container "postgresql1" is a replica of container "postgresql3" + + Examples: , replication slots + | lock_type | lock_host | with_slots | use_slots | quorum_commit | + | zookeeper | zookeeper1 | without | no | yes | + | zookeeper | zookeeper1 | with | yes | yes | + | zookeeper | zookeeper1 | without | no | no | + | zookeeper | zookeeper1 | with | yes | no | + + + + + Scenario Outline: Change synchronous replicas priority + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: '' + quorum_commit: '' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 
'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf__slot.sh %m %p + """ + Given a following cluster with "" replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 2 + """ + Then container "postgresql3" is in group + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + When we set value "10" for option "priority" in section "global" in pgconsul config in container "postgresql2" + And we restart "pgconsul" in container "postgresql2" + Then container "postgresql2" is in group + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + When we set value "1" for option "priority" in section "global" in pgconsul config in container "postgresql2" + And we restart "pgconsul" in container "postgresql2" + Then container "postgresql3" is in group + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + + Examples: , replication slots + | lock_type | lock_host | with_slots | use_slots | quorum_commit | replication_type | + | zookeeper | zookeeper1 | without | no | yes | quorum | + | zookeeper | zookeeper1 | with | yes | yes | quorum | + | zookeeper | zookeeper1 | without | no | no | sync | + | zookeeper | zookeeper1 | with | yes | no | sync | + + + + Scenario Outline: Missing priority key is always filled from config + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 10 + """ + And a following cluster with "" without replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + postgresql3: + role: replica + """ + When we remove key "/pgconsul/postgresql/all_hosts/pgconsul_postgresql2_1.pgconsul_pgconsul_net/prio" in "" + And we remove key "/pgconsul/postgresql/all_hosts/pgconsul_postgresql1_1.pgconsul_pgconsul_net/prio" in "" + Then "" has value "10" for key "/pgconsul/postgresql/all_hosts/pgconsul_postgresql1_1.pgconsul_pgconsul_net/prio" + And "" has value "10" for key "/pgconsul/postgresql/all_hosts/pgconsul_postgresql2_1.pgconsul_pgconsul_net/prio" + + Examples: + | lock_type | lock_host | + | zookeeper | zookeeper1 | diff --git a/tests/features/reset_sync.feature b/tests/features/reset_sync.feature new file mode 100644 index 0000000..2f2396e --- /dev/null +++ b/tests/features/reset_sync.feature @@ -0,0 +1,63 @@ +Feature: Reset sync replication without HA replics + + Scenario Outline: Reset sync replication without HA replics + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + quorum_commit: '' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + change_replication_metric: count + replica: + allow_potential_data_loss: 'no' + 
primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "" with replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + postgresql3: + role: replica + config: + pgconsul.conf: + global: + stream_from: pgconsul_postgresql1_1.pgconsul_pgconsul_net + stream_from: postgresql1 + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql2" is in group + And "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + When we stop container "postgresql2" + Then container "postgresql1" is primary + And container "postgresql3" is a replica of container "postgresql1" + When we wait "10.0" seconds + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + And container "postgresql1" replication state is "async" + And postgresql in container "postgresql1" has empty option "synchronous_standby_names" + + Examples: + | lock_type | lock_host | quorum_commit | replication_type | + | zookeeper | zookeeper1 | yes | quorum | + | zookeeper | zookeeper1 | no | sync | + diff --git a/tests/features/single_node.feature b/tests/features/single_node.feature new file mode 100644 index 0000000..f02914c --- /dev/null +++ b/tests/features/single_node.feature @@ -0,0 +1,94 @@ +Feature: Single node + + Scenario Outline: Single node primary is open with dead ZK + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'no' + postgres_timeout: 5 + election_timeout: 5 + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + recovery_timeout: 5 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_without_slot.sh %m %p + """ + Given a following cluster with "" without replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + config: + pgconsul.conf: + global: + stream_from: pgconsul_postgresql1_1.pgconsul_pgconsul_net + stream_from: postgresql1 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + stream_from: pgconsul_postgresql1_1.pgconsul_pgconsul_net + stream_from: postgresql1 + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + sync_state: async + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + sync_state: async + """ + When we container "zookeeper1" + And we container "zookeeper2" + And we container "zookeeper3" + And we wait "10.0" seconds + Then container "postgresql1" is primary + And pgbouncer is running in container "postgresql1" + And pgbouncer is running in container "postgresql2" + And pgbouncer is running in container "postgresql3" + And container 
"postgresql2" is a replica of container "postgresql1" + And container "postgresql3" is a replica of container "postgresql1" + When we container "zookeeper1" + And we container "zookeeper2" + And we container "zookeeper3" + Then pgbouncer is running in container "postgresql1" + And pgbouncer is running in container "postgresql2" + And pgbouncer is running in container "postgresql3" + And container "postgresql2" is a replica of container "postgresql1" + And container "postgresql3" is a replica of container "postgresql1" + + Examples: , , , + | lock_type | lock_host | destroy | repair | + | zookeeper | zookeeper1 | stop | start | + | zookeeper | zookeeper1 | disconnect from network | connect to network | + + Scenario Outline: Check async in single node + Given a "pgconsul" container common config + """ + postgresql.conf: + synchronous_standby_names: 'test' + """ + Given a following cluster with "" without replication slots + """ + postgresql1: + role: primary + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + And postgresql in container "postgresql1" has empty option "synchronous_standby_names" + + Examples: + | lock_type | lock_host | + | zookeeper | zookeeper1 | diff --git a/tests/features/slot.feature b/tests/features/slot.feature new file mode 100644 index 0000000..ed1f8ff --- /dev/null +++ b/tests/features/slot.feature @@ -0,0 +1,76 @@ +Feature: Replication slots + + @slots + Scenario Outline: Slots created on promoted replica + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + primary: + change_replication_type: 'no' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'yes' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "" with replication slots + """ + postgresql1: + role: primary + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 2 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + """ + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + sync_state: async + write_location_diff: 0 + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + sync_state: async + write_location_diff: 0 + """ + Then container "postgresql1" has following replication slots + """ + - slot_name: pgconsul_postgresql2_1_pgconsul_pgconsul_net + slot_type: physical + - slot_name: pgconsul_postgresql3_1_pgconsul_pgconsul_net + slot_type: physical + """ + Then container "postgresql2" has following replication slots + """ + """ + Then container "postgresql3" has following replication slots + """ + """ + When we stop container "postgresql1" + Then container "postgresql2" became a primary + Then container "postgresql2" has following replication slots + """ + - slot_name: pgconsul_postgresql1_1_pgconsul_pgconsul_net + slot_type: physical + - slot_name: pgconsul_postgresql3_1_pgconsul_pgconsul_net + slot_type: physical + """ + + Examples: + | lock_type | lock_host | + | zookeeper | zookeeper1 | diff --git a/tests/features/start.feature b/tests/features/start.feature new file mode 100644 index 0000000..1e7c7a6 --- /dev/null +++ b/tests/features/start.feature @@ -0,0 +1,78 @@ 
+Feature: Check startup logic + + Scenario Outline: pgconsul restarts without zookeeper + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'no' + quorum_commit: '' + primary: + change_replication_type: 'yes' + primary_switch_checks: 1 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 1 + min_failover_timeout: 1 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_without_slot.sh %m %p + """ + Given a following cluster with "" without replication slots + """ + postgresql1: + role: primary + config: + pgconsul.conf: + global: + priority: 2 + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 3 + """ + Then "" has holder "pgconsul_postgresql1_1.pgconsul_pgconsul_net" for lock "/pgconsul/postgresql/leader" + Then container "postgresql3" is in group + And "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + When we lock "/pgconsul/postgresql/switchover/lock" in "" + And we set value "{"hostname": "pgconsul_postgresql1_1.pgconsul_pgconsul_net","timeline": 1}" for key "/pgconsul/postgresql/switchover/master" in "" + And we set value "scheduled" for key "/pgconsul/postgresql/switchover/state" in "" + And we release lock "/pgconsul/postgresql/switchover/lock" in "" + Then container "postgresql3" became a primary + And container "postgresql2" is a replica of container "postgresql3" + And container "postgresql1" is a replica of container "postgresql3" + Then container "postgresql1" is in group + When we disconnect from network container "postgresql1" + And we gracefully stop "pgconsul" in container "postgresql1" + And we start "pgconsul" in container "postgresql1" + And we wait "40.0" seconds + And we connect to network container "postgresql1" + Then container "postgresql1" is in group + And "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql1_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + """ + + Examples: + | lock_type | lock_host | quorum_commit | replication_type | + | zookeeper | zookeeper1 | yes | quorum | + | zookeeper | zookeeper1 | no | sync | diff --git a/tests/features/switchover.feature b/tests/features/switchover.feature new file mode 100644 index 0000000..77f0287 --- /dev/null +++ b/tests/features/switchover.feature @@ -0,0 +1,208 @@ +Feature: Check switchover + + @switchover + Scenario Outline: Check switchover restart + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + quorum_commit: '' + primary: + change_replication_type: 'yes' + primary_switch_checks: 3 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 3 + min_failover_timeout: 120 + primary_unavailability_timeout: 2 + primary_switch_restart: '' + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "" with replication slots + """ + postgresql1: + role: primary + config: + pgconsul.conf: + global: + priority: 2 + 
postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 3 + + """ + When we remember postgresql start time in container "postgresql1" + When we remember postgresql start time in container "postgresql2" + When we remember postgresql start time in container "postgresql3" + Then container "postgresql3" is in group + When we lock "/pgconsul/postgresql/switchover/lock" in "" + And we set value "{"hostname": "pgconsul_postgresql1_1.pgconsul_pgconsul_net","timeline": 1}" for key "/pgconsul/postgresql/switchover/master" in "" + And we set value "scheduled" for key "/pgconsul/postgresql/switchover/state" in "" + And we release lock "/pgconsul/postgresql/switchover/lock" in "" + Then container "postgresql3" became a primary + And container "postgresql2" is a replica of container "postgresql3" + And container "postgresql1" is a replica of container "postgresql3" + And postgresql in container "postgresql3" was not restarted + And postgresql in container "postgresql2" restarted + And postgresql in container "postgresql1" was restarted + Then container "postgresql1" is in group + When we lock "/pgconsul/postgresql/switchover/lock" in "" + And we set value "{"hostname": "pgconsul_postgresql3_1.pgconsul_pgconsul_net","timeline": 2}" for key "/pgconsul/postgresql/switchover/master" in "" + And we set value "scheduled" for key "/pgconsul/postgresql/switchover/state" in "" + And we release lock "/pgconsul/postgresql/switchover/lock" in "" + Then container "postgresql1" became a primary + And container "postgresql3" is a replica of container "postgresql1" + And container "postgresql2" is a replica of container "postgresql1" + And postgresql in container "postgresql3" was not rewinded + And postgresql in container "postgresql2" was not rewinded + When we stop container "postgresql2" + And we lock "/pgconsul/postgresql/switchover/lock" in "" + And we set value "{"hostname": "pgconsul_postgresql1_1.pgconsul_pgconsul_net","timeline": 3}" for key "/pgconsul/postgresql/switchover/master" in "" + And we set value "scheduled" for key "/pgconsul/postgresql/switchover/state" in "" + And we release lock "/pgconsul/postgresql/switchover/lock" in "" + And we wait "30.0" seconds + Then container "postgresql1" is primary + When we wait "90.0" seconds + Then container "postgresql3" became a primary + And container "postgresql1" is a replica of container "postgresql3" + + Examples: , + | lock_type | lock_host | quorum_commit | replication_type | restart | primary_switch_restart | restarted | + | zookeeper | zookeeper1 | yes | quorum | with | yes | was | + | zookeeper | zookeeper1 | no | sync | with | yes | was | + | zookeeper | zookeeper1 | yes | quorum | without | no | was not | + | zookeeper | zookeeper1 | no | sync | without | no | was not | + + + Scenario Outline: Check failed promote on switchover + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + postgres_timeout: 5 + quorum_commit: '' + primary: + change_replication_type: 'yes' + primary_switch_checks: 3 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 3 + min_failover_timeout: 120 + primary_unavailability_timeout: 2 + recovery_timeout: 5 + commands: + promote: sleep 3 && false + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "" with replication slots + """ + postgresql1: + 
role: primary + config: + pgconsul.conf: + global: + priority: 2 + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 3 + + """ + Then container "postgresql3" is in group + When we lock "/pgconsul/postgresql/switchover/lock" in "" + And we set value "{"hostname": "pgconsul_postgresql1_1.pgconsul_pgconsul_net","timeline": 1}" for key "/pgconsul/postgresql/switchover/master" in "" + And we set value "scheduled" for key "/pgconsul/postgresql/switchover/state" in "" + And we release lock "/pgconsul/postgresql/switchover/lock" in "" + When we wait "30.0" seconds + Then container "postgresql1" is primary + And container "postgresql2" is a replica of container "postgresql1" + And container "postgresql3" is a replica of container "postgresql1" + And container "postgresql3" is in group + + Examples: , + | lock_type | lock_host | quorum_commit | replication_type | + | zookeeper | zookeeper1 | yes | quorum | + | zookeeper | zookeeper1 | no | sync | + + + @switchover_drop + Scenario Outline: Incorrect switchover nodes being dropped + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + primary: + change_replication_type: 'yes' + primary_switch_checks: 3 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 3 + min_failover_timeout: 120 + primary_unavailability_timeout: 2 + commands: + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "" with replication slots + """ + postgresql1: + role: primary + config: + pgconsul.conf: + global: + priority: 2 + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 3 + + """ + When we lock "/pgconsul/postgresql/switchover/lock" in "" + And we set value "{"hostname": null,"timeline": null}" for key "/pgconsul/postgresql/switchover/master" in "" + And we set value "scheduled" for key "/pgconsul/postgresql/switchover/state" in "" + And we release lock "/pgconsul/postgresql/switchover/lock" in "" + Then "zookeeper1" has value "None" for key "/pgconsul/postgresql/switchover/master" + Then "zookeeper1" has value "None" for key "/pgconsul/postgresql/switchover/state" + Then "zookeeper1" has value "None" for key "/pgconsul/postgresql/switchover/lsn" + Then "zookeeper1" has value "None" for key "/pgconsul/postgresql/failover_state" + Then container "postgresql1" is primary + And container "postgresql2" is a replica of container "postgresql1" + And container "postgresql3" is a replica of container "postgresql1" + + Examples: , + | lock_type | lock_host | + | zookeeper | zookeeper1 | diff --git a/tests/features/targeted_switchover.feature b/tests/features/targeted_switchover.feature new file mode 100644 index 0000000..d6fbf50 --- /dev/null +++ b/tests/features/targeted_switchover.feature @@ -0,0 +1,140 @@ +Feature: Targeted switchover + + @switchover + Scenario Outline: Check targeted switchover + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + quorum_commit: '' + primary: + change_replication_type: 'yes' + primary_switch_checks: 3 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 3 + min_failover_timeout: 120 + primary_unavailability_timeout: 2 + commands: + 
generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "" with replication slots + """ + postgresql1: + role: primary + config: + pgconsul.conf: + global: + priority: 3 + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 2 + + """ + Then container "postgresql3" is in group + When we lock "/pgconsul/postgresql/switchover/lock" in "" + And we set value "{"hostname": "pgconsul_postgresql1_1.pgconsul_pgconsul_net", "timeline": 1, "destination": "pgconsul_postgresql2_1.pgconsul_pgconsul_net"}" for key "/pgconsul/postgresql/switchover/master" in "" + And we set value "scheduled" for key "/pgconsul/postgresql/switchover/state" in "" + And we release lock "/pgconsul/postgresql/switchover/lock" in "" + Then container "postgresql2" became a primary + And container "postgresql3" is a replica of container "postgresql2" + And container "postgresql1" is a replica of container "postgresql2" + And container "postgresql1" is in group + And postgresql in container "postgresql3" was not rewinded + And postgresql in container "postgresql1" was not rewinded + + Examples: , + | lock_type | lock_host | quorum_commit | replication_type | + | zookeeper | zookeeper1 | yes | quorum | + | zookeeper | zookeeper1 | no | sync | + + @switchover + Scenario Outline: Host fail targeted switchover + Given a "pgconsul" container common config + """ + pgconsul.conf: + global: + priority: 0 + use_replication_slots: 'yes' + postgres_timeout: 20 + quorum_commit: '' + primary: + change_replication_type: 'yes' + primary_switch_checks: 3 + replica: + allow_potential_data_loss: 'no' + primary_unavailability_timeout: 1 + primary_switch_checks: 3 + min_failover_timeout: 120 + primary_unavailability_timeout: 2 + commands: + pg_stop: sleep 10 && /usr/bin/postgresql/pg_ctl stop -s -m fast -w -t %t -D %p + generate_recovery_conf: /usr/local/bin/gen_rec_conf_with_slot.sh %m %p + """ + Given a following cluster with "" with replication slots + """ + postgresql1: + role: primary + config: + pgconsul.conf: + global: + priority: 3 + postgresql2: + role: replica + config: + pgconsul.conf: + global: + priority: 1 + postgresql3: + role: replica + config: + pgconsul.conf: + global: + priority: 2 + + """ + Then container "postgresql3" is in group + When we lock "/pgconsul/postgresql/switchover/lock" in "" + And we set value "{"hostname": "pgconsul_postgresql1_1.pgconsul_pgconsul_net", "timeline": 1, "destination": "pgconsul_postgresql2_1.pgconsul_pgconsul_net"}" for key "/pgconsul/postgresql/switchover/master" in "" + And we set value "scheduled" for key "/pgconsul/postgresql/switchover/state" in "" + And we release lock "/pgconsul/postgresql/switchover/lock" in "" + And we disconnect from network container "postgresql2" + And we wait "60.0" seconds + Then container "postgresql1" is primary + And container "postgresql3" is a replica of container "postgresql1" + When we connect to network container "postgresql2" + Then "" has following values for key "/pgconsul/postgresql/replics_info" + """ + - client_hostname: pgconsul_postgresql2_1.pgconsul_pgconsul_net + state: streaming + - client_hostname: pgconsul_postgresql3_1.pgconsul_pgconsul_net + state: streaming + """ + And container "postgresql2" is a replica of container "postgresql1" + When we lock "/pgconsul/postgresql/switchover/lock" in "" + And we set value "{"hostname": "pgconsul_postgresql1_1.pgconsul_pgconsul_net", "timeline": 1, 
"destination": "pgconsul_postgresql2_1.pgconsul_pgconsul_net"}" for key "/pgconsul/postgresql/switchover/master" in "" + And we set value "scheduled" for key "/pgconsul/postgresql/switchover/state" in "" + And we release lock "/pgconsul/postgresql/switchover/lock" in "" + Then container "postgresql2" became a primary + And container "postgresql3" is a replica of container "postgresql2" + And container "postgresql1" is a replica of container "postgresql2" + And container "postgresql1" is in group + And postgresql in container "postgresql3" was not rewinded + And postgresql in container "postgresql1" was not rewinded + + Examples: , + | lock_type | lock_host | quorum_commit | replication_type | + | zookeeper | zookeeper1 | yes | quorum | + | zookeeper | zookeeper1 | no | sync | diff --git a/tests/generate_certs.sh b/tests/generate_certs.sh new file mode 100644 index 0000000..c4b7daf --- /dev/null +++ b/tests/generate_certs.sh @@ -0,0 +1,103 @@ +#!/bin/bash +set -ex + +FQDN=$(hostname) + +ls /etc/zk-ssl/truststore.jks && exit 0 || true +mkdir /etc/zk-ssl + +echo "-----BEGIN CERTIFICATE----- +MIIE/TCCAuWgAwIBAgIUU9e6chP84r3iZk3JtvnWb1V2N1YwDQYJKoZIhvcNAQEL +BQAwDTELMAkGA1UEBhMCUlUwIBcNMjMwMzEwMDgzNTUzWhgPMzAyMjA3MTEwODM1 +NTNaMA0xCzAJBgNVBAYTAlJVMIICIjANBgkqhkiG9w0BAQEFAAOCAg8AMIICCgKC +AgEAwJuy394cK127yT8nGHVPKF6TG6xL0WpxahyaKwIYp5lbv9wDvzjMPE7KmONU +8GhCFUdEJTRqBkaRdZNYxnOUxufU3+jIf1hq1Csg8q1NXICVWVwfFL2F5mKHgeHQ +n3FaJM2pZQ5iIWFY1c18MgV8qqNWbtyLeppcyZOL9duLM9A8XpYb0JOZis82d+lh +kcxzE1XM+MZEgZfHImh0zod9OMtSAOwQzVXpiA3JO/eHkLQGYcy6KNTm42mubVlX +kBcu/BplnP7gXGOYDt/JyRhGSLAfn762+jRbAlAvbPzOy67hc4pW7aloU5zPBhYf +BaTxM9UPqPtyp7Lxkp9HL68QXtm5MobDuDtZ6ePQtHgHrl7P7PXvEUPwK7BZzgZy +MerVhxIssutA2yBCuu5T7dMSwIsUdvXtgdHRdHDwn1D/V1CxnujDv9l6/T3sCmRv +tWPwTOCUf5BLLw6N6TnSsVR5I9NALKCLYE8LsfCuLdyi363JZqubkdJr1Ro8yI5J +m0GX5pypwZJPV2Ivt6kKVTQiN2hoWNe+3TNPS+7ysqit37s71YRDajZaZ55DopmF ++oIYdA3MqUZEVZyKFifWvo/l2gYarlEtcEJl++OwydirWLAjCPHh9UvDhjKS43bQ +zSlRC+d4CfRqXftmETHVAxMokai3WvAdUpJrW2RrjiuR0MkCAwEAAaNTMFEwHQYD +VR0OBBYEFJGDr6xmoKJFU6cgS90aFg6lUGbhMB8GA1UdIwQYMBaAFJGDr6xmoKJF +U6cgS90aFg6lUGbhMA8GA1UdEwEB/wQFMAMBAf8wDQYJKoZIhvcNAQELBQADggIB +ACj87ymjBlgY9UZTUbudHREPPXfqMi2TgWt5hygQSiTrNeQOodnq+Swp86qX/y8w +xtnvc+iILfFnh9ZevHKmLx+JziN4kD4ywEpHW7zS7c3+2QEjIZUwj5qlIg0ByOBd +0M/kpimmuTwlDylBaY12GcFlZcsbuezzm4hU+0qoCV/zi2DvSdAPKXMAeZ3lOkde +PUYJUpRz/QkkxEhSdM3BQYI51mUiltCHMhe6COoN4MHV7tix0Pj9vPjhAVN/4sot +2PgUiCwY8eNQugZhpTosMTSBLZvg/EKG+4slY75/voNTIxWHAHmnPMOAzVgNTya0 +/eP6NB3MCjFuY2E+fGox9YTomjI5oxBr+1LlwVy7wbwXTrgBz9Z4izScAsVbPrk6 +jSrqNeNWK1f+JVnYZkjgPGgPaQVCJ22vdLmkW7U/ATdeedQS3RCApMnb9VCRTUaO +eY4ccuEvj0huhdcUguw6fBjrhPjoPxKMn6S93ginW8Wz9vo8qLkEg2NtQDFu1Omb +cJM5F8uLRr8NotPV5QPg1koHeBv/N2WTRZiUoavAogR9XdyOtrB8+MBu1nsp4Goi +7/suv9XzMJ7IpgXiQfCM++1x7oooyWWdeFTCzqNDJ1IbQDeOCc9cQgeOAPWcIqWO +nAWt08+eToI1YUvjl6UT0bpVaJEACv+/HfBr1T26u4Jh +-----END CERTIFICATE-----" > /etc/zk-ssl/ca.cert.pem + +echo "-----BEGIN RSA PRIVATE KEY----- +MIIJKAIBAAKCAgEAwJuy394cK127yT8nGHVPKF6TG6xL0WpxahyaKwIYp5lbv9wD +vzjMPE7KmONU8GhCFUdEJTRqBkaRdZNYxnOUxufU3+jIf1hq1Csg8q1NXICVWVwf +FL2F5mKHgeHQn3FaJM2pZQ5iIWFY1c18MgV8qqNWbtyLeppcyZOL9duLM9A8XpYb +0JOZis82d+lhkcxzE1XM+MZEgZfHImh0zod9OMtSAOwQzVXpiA3JO/eHkLQGYcy6 +KNTm42mubVlXkBcu/BplnP7gXGOYDt/JyRhGSLAfn762+jRbAlAvbPzOy67hc4pW +7aloU5zPBhYfBaTxM9UPqPtyp7Lxkp9HL68QXtm5MobDuDtZ6ePQtHgHrl7P7PXv +EUPwK7BZzgZyMerVhxIssutA2yBCuu5T7dMSwIsUdvXtgdHRdHDwn1D/V1CxnujD +v9l6/T3sCmRvtWPwTOCUf5BLLw6N6TnSsVR5I9NALKCLYE8LsfCuLdyi363JZqub 
+kdJr1Ro8yI5Jm0GX5pypwZJPV2Ivt6kKVTQiN2hoWNe+3TNPS+7ysqit37s71YRD +ajZaZ55DopmF+oIYdA3MqUZEVZyKFifWvo/l2gYarlEtcEJl++OwydirWLAjCPHh +9UvDhjKS43bQzSlRC+d4CfRqXftmETHVAxMokai3WvAdUpJrW2RrjiuR0MkCAwEA +AQKCAgAgemC4RTDE00J2FfMWublGWmQ991i1kFhdh0Mr22ei40ZIXOY42W/+/15E +V5kcDMiP4/uGtobmVgHzLIx8skK1I6SOuScN6i/hZQBiS3zPC1OjxNfs3GR2y8iD +yzstl6SWriNRShKcBFlBfCvkF27FK1PIz+GpI9xflUS1iXa4nvV/EZrRGgJ7GKPb +pnvwZORGr2In1O76V0iZ8bk4ljo0WHyUcToIFeOSMJjtRrkSWnj1BtuhRP1F/a0O +/VC5mF8w3Zai2YulqJmccHoLMc+wNBqxCiy6lhd+lVzZ6OtKB0w2+m3cF4PjDX8P +TK2gewa9McE5QmU8B/2aNsd/L+r3eGEvWAF/1vRq6NcrFwigq8uCTtgw9edRlDnm +RvICkfAbrwhNaixWwqBVQHoy53H29TohxGNNKa6TTKeJvYEdYKgHx55TxkB9X9jc +iSisqb3fgEl4Yh1Izpu+6nULOqdlldfkKPgKJqVB1AT/avR8J09zmMvW5fPa6fFx +alZ1iVahR5bIFEu1lXygsrBP6N+K/ogyztg7ZKLTIN/FguwMKnXMaUbN/Y/ZZXV1 +oGil9vHKnDrRnUGfcm9tyH2Ddcy6RDoDz+O4cYgMGxDhHran2cicVY1q+Yi08q5h +Napk1phNra5HIHnNHwMxQ75ZKZZ3TOGJL+HMF4yRDj19C/6sAQKCAQEA8a9ZQhWw +0vhZENmSYZgGZLa7RZLSbBzQOX/cetdI6/kvmZVcMvNz4q0/UI9XLkqokL1wJiku +O0zXkaVrBVAsgozp4I3oFqwtcAAGw0KwF4FDAS36k4gkE4SmIUl2eI0XMZCPQIKp +3TB81+XdBITtwfPl5yG+IZDkXNu16qUHEhnhvs/kKhMr8flhFC1J4gdrrQhfuRHY +Jv8e1RLJzMhu/ErRjh82LkzB6m3jp0YxBeIA+9Kkw+OX6SlzRbJPirKxJTaZnB8o +wQmzOy1kTRG4qjKswjdTbzf6549721i8QHwSpwPI3NZQhlSkfsvZ5QL4qPW0nRta +m76YeLlS12yQSQKCAQEAzAQz6OcE6yS2q5UfTZluGaU54Zkm0YSnS394pitJpHoh +JSZlvkL1DzpacquDxa3uQLDikai5TqpNnkuufeMJf7I2ygg4n/v4OFaE+/qj5uNA +3QnL3BVT9DCJ0JvQ1qA5Q/6P5WpUHYB7JHBM9BpaE8e4xocJyWSdcSJDaEXns4Hx +WzhpBdVpPSamqB0VHYg1bv6OGFPfwUaRafWhNzljtxbY8RYcz7IfPmnLImFePTtZ +AjzIoAwUIRFzvmoduda0kQKogRVoEeaW1q6ebPUjYjIZvohnpe27EvgCiTNkcaSf +C96uIxHrSvI8114z9CBXer60xQ0Kz+ds18LtY6w8gQKCAQEAkP/JxlsrHje/f9t4 +9jJ2S4BSNLiUpCZZStYKWmzFJEX5J+SzTyI+uZWFcfi9rlk+brApE8wLH6rHfmtH +HQXv3ldajc21m7yq+hIZ/JYK/d8gaxnBxzebpVYlMb1YZZUIgEUhnOuHq9vGWuVe +x7JUztNccGIPJyY9y/RJXUCrUFHU3Vzun8umxuL+OlO9iu02zbZDb85j52mSfvVp +uwHZjGX6+ZCCOh71DIfnWFlFWikwu+Sx05C9eDbVINCM5kK1AwWR/Ve4ZLBEJtHh +5lcmen4ypcb5uLVWRA0SmxPOxcVqj2c24D94Sk+H7UayMLKqqvvW45cgsmYUJgHR +0MsieQKCAQB9goBk4erWtmliuYTeemuPf2RSc6O79b3t5mfU4oCVnUTS1AJ3wD1+ +tsl6DiYs8MnIJoncTk5iJMdHgQvCCnCHjJ3EQLaFRb/4+NErK5C1tEztLt+pb72M +VmgSXCloQH26ZNslqfpBhA895ZCSA7wyuwXjrKPKsAlj1k5d0dOvTVusYNHLcvUh +V6vjdLDO0EL/G79THBZlkwJWi3Q4wyejNX0VJCNpaw1pmjAL4JbXWLFzfO13+LZR +eakZFbNf5sSDCX2cnAzAJnnZbOet5El2WZgY7VXGcLBMBSOaQHGksD/gT4gVrypv +mwLvA9c2cscejkArkdB7AsalHhho30cBAoIBAFJBO0RU7o0S+F6KHIP5aFbItcUd +NfUgoJTAFUD3EnBirvDv0pu8T8zkgKf7PRFkZQIOXocvpX0Zy6N7fiPbvzTA/vH3 +mFqias89pTUAgv43R8ZsAC/qlozUuByegigEz2zeVd34w7MdkgGo1jnqmijAIXZE +INBo0swkxAbix+W1Pur/yvGUpC6xu3ISmdrn0p20B7QhyuoqC3ea/az7ePwx+Pu9 +Jl8tzMujbHNHhw+OQAQOPHi6EUPs/H37euj3G7oBaVUwXJq3Tbwg95W5Jih+CgTB +Sbe6eYpR/j/SYGwbS6/DbHi3IjvblN+2pSPI05JvXMhLC/lAeqcdVJAgTvw= +-----END RSA PRIVATE KEY-----" > /etc/zk-ssl/ca.key + +openssl genrsa -out /etc/zk-ssl/server.key -passout pass:testpassword123 4096 +openssl req -new -key /etc/zk-ssl/server.key -out /etc/zk-ssl/server.csr -passin pass:testpassword123 -subj "/C=NL/ST=Test/L=Test/O=Test/OU=Test/CN=${FQDN}" +openssl x509 -req -days 365 -in /etc/zk-ssl/server.csr -CA /etc/zk-ssl/ca.cert.pem -CAkey /etc/zk-ssl/ca.key -CAcreateserial -out /etc/zk-ssl/server.crt -passin pass:testpassword123 + +if [[ "${FQDN}" == *"zookeeper"* ]]; +then + keytool -import -trustcacerts -alias yandex -file /etc/zk-ssl/ca.cert.pem -keystore /etc/zk-ssl/truststore.jks -storepass testpassword123 -noprompt && \ + openssl pkcs12 -export -in /etc/zk-ssl/server.crt -inkey /etc/zk-ssl/server.key -out /etc/zk-ssl/server.p12 -passout pass:testpassword321 -name ${FQDN} && \ + keytool 
-importkeystore -destkeystore /etc/zk-ssl/server.jks -srckeystore /etc/zk-ssl/server.p12 -deststorepass testpassword321 -srcstoretype PKCS12 -srcstorepass testpassword321 -alias ${FQDN} && \ + rm -f /etc/zk-ssl/server.p12 +fi + +chmod 755 /etc/zk-ssl/* diff --git a/tests/pgconsul.featureset b/tests/pgconsul.featureset new file mode 100644 index 0000000..ad61569 --- /dev/null +++ b/tests/pgconsul.featureset @@ -0,0 +1,27 @@ +features/pgconsul_util.feature +features/initialization.feature +features/priority.feature +features/async.feature +features/kill_primary.feature +features/kill_replica.feature +features/kill_non_ha_replica.feature +features/slot.feature +features/coordinator.feature +features/coordinator_fail.feature +features/single_node.feature +features/op_track.feature +features/maintenance.feature +features/plugins.feature +features/switchover.feature +features/failover_timeout.feature +features/cascade.feature +features/disable_sync.feature +features/autofailover.feature +features/consecutive_switch.feature +features/targeted_switchover.feature +features/dead_primary_switchover.feature +features/start.feature +features/reset_sync.feature +features/primary_switch.feature +features/failed_promote.feature +features/archive.feature diff --git a/tests/setup.sh b/tests/setup.sh new file mode 100755 index 0000000..219ce18 --- /dev/null +++ b/tests/setup.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +PG_MAJOR=$1 + +sudo -u postgres /usr/lib/postgresql/$PG_MAJOR/bin/postgres --single -D /var/lib/postgresql/$PG_MAJOR/main <<- EOF +CREATE EXTENSION IF NOT EXISTS lwaldump; +EOF diff --git a/tests/steps/cluster.py b/tests/steps/cluster.py new file mode 100644 index 0000000..6cd3fb4 --- /dev/null +++ b/tests/steps/cluster.py @@ -0,0 +1,956 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import copy +import operator +import os +import time + +import psycopg2 +import yaml + +from tests.steps import config +from tests.steps import helpers +from tests.steps import zk +from tests.steps.database import Postgres +from behave import given, register_type, then, when +from parse_type import TypeBuilder + + +register_type(WasOrNot=TypeBuilder.make_enum({"was": True, "was not": False})) +register_type(IsOrNot=TypeBuilder.make_enum({"is": True, "is not": False})) + + +@given('a "{cont_type}" container common config') +def step_common_config(context, cont_type): + context.config[cont_type] = yaml.safe_load(context.text) or {} + + +def _set_use_slots_in_pgconsul_config(config, use_slots): + if 'config' not in config: + config['config'] = {} + if 'pgconsul.conf' not in config['config']: + config['config']['pgconsul.conf'] = {} + pgconsul_conf = config['config']['pgconsul.conf'] + if 'global' not in pgconsul_conf: + pgconsul_conf['global'] = {} + if 'commands' not in pgconsul_conf: + pgconsul_conf['commands'] = {} + if use_slots: + pgconsul_conf['global']['use_replication_slots'] = 'yes' + pgconsul_conf['commands']['generate_recovery_conf'] = '/usr/local/bin/gen_rec_conf_with_slot.sh %m %p' + else: + pgconsul_conf['global']['use_replication_slots'] = 'no' + pgconsul_conf['commands']['generate_recovery_conf'] = '/usr/local/bin/gen_rec_conf_without_slot.sh %m %p' + + +class PGCluster(object): + def __init__(self, members, docker_compose, use_slots=False): + assert isinstance(members, dict) + self.members = members + self.services = docker_compose['services'] + + self.primary = None + self.replicas = {} + + # check all members and remember who is primary and replicas + for member, conf in members.items(): + 
self.add_primary(member, conf) + self.add_replica(member, conf) + _set_use_slots_in_pgconsul_config(conf, use_slots) + # add recovery.conf config to all replicas + for replica in self.replicas.keys(): + assert replica in self.services, 'missing config for "{name}" in compose'.format(name=replica) + if 'config' not in members[replica]: + members[replica]['config'] = {} + members[replica]['config'].update( + { + 'recovery.conf': { + 'recovery_target_timeline': 'latest', + 'primary_conninfo': 'host={host} application_name={app}'.format( + host=self.member_fqdn(self.replicas[replica]), app=self.member_appname(replica) + ), + 'restore_command': 'rsync -a --password-file=/etc/archive.passwd' + ' rsync://archive@pgconsul_backup1_1.pgconsul_pgconsul_net:' + '/archive/%f %p', + }, + 'standby.signal': {}, + } + ) + # add primary_slot_name to recovery.conf if we are using slots + if use_slots: + members[replica]['config']['recovery.conf'].update( + { + 'primary_slot_name': self.member_slotname(replica), + } + ) + + def add_primary(self, member, conf): + role = conf['role'] + if role == 'primary': + assert self.primary is None, 'detected more than 1 primary {primaries}'.format( + primaries=[self.primary, member] + ) + self.primary = member + + def add_replica(self, member, conf): + role = conf['role'] + if role == 'replica': + self.replicas[member] = conf.get('stream_from', self.primary) + + def member_type(self, member): + return self.members[member].get('type', 'pgconsul') + + def member_fqdn(self, member): + return '{host}.{domain}'.format( + host=self.services[member]['hostname'], + domain=self.services[member]['domainname'], + ) + + def member_appname(self, member): + return self.member_fqdn(member).replace('.', '_') + + def member_slotname(self, member): + return self.member_appname(member) + + def config(self, member): + return self.members[member].get('config', dict()) + + def get_primary(self): + return self.primary + + def get_replicas(self): + return self.replicas + + def get_pg_members(self): + return [self.get_primary()] + list(self.get_replicas().keys()) + + +def execute_step_with_config(context, step, step_config): + context.execute_steps('{step}\n"""\n{config}\n"""'.format(step=step, config=step_config)) + + +@given('a following cluster with "{lock_type}" {with_slots} replication slots') +def step_cluster(context, lock_type, with_slots): + use_slots = with_slots == 'with' + + cluster = PGCluster(yaml.safe_load(context.text) or {}, context.compose, use_slots) + + context.execute_steps(""" Given a "backup" container "backup1" """) + + zk_names = [] + # If we use zookeeper we need to create it in separate containers. 
+ if lock_type == 'zookeeper': + # Find all zookeepers in compose and start it + for name, service_config in context.compose['services'].items(): + image_type = helpers.build_config_get_path(service_config['build']) + if not image_type.endswith('zookeeper'): + continue + zk_names.append(name) + context.execute_steps( + """ + Given a "zookeeper" container "{name}" + """.format( + name=name + ) + ) + + # Start containers + for member in cluster.members: + execute_step_with_config( + context, + 'Given a "{cont_type}" container "{name}" with following config'.format( + cont_type=cluster.member_type(member), name=member + ), + yaml.dump(cluster.config(member), default_flow_style=False), + ) + + # Wait while containers starts in a separate cycle + # after creation of all containers + for member in cluster.members: + context.execute_steps( + """ + Then container "{name}" has status "running" + """.format( + name=member + ) + ) + + if use_slots: + # create replication slots on primary + for replica in cluster.get_replicas().keys(): + context.execute_steps( + """ + Given a replication slot "{name}" in container "{primary}" + """.format( + primary=cluster.get_replicas()[replica], name=cluster.member_slotname(replica) + ) + ) + + # Check that expected to be primary container is primary + context.execute_steps( + """ + Then container "{name}" became a primary + """.format( + name=cluster.get_primary() + ) + ) + + # Check that all replicas are replicas + for replica in cluster.get_replicas().keys(): + context.execute_steps( + """ + Then container "{replica}" is a replica of container "{primary}" + """.format( + replica=replica, primary=cluster.get_replicas()[replica] + ) + ) + + if use_slots: + # Check that replication follows via slots if we using it + # or otherwise not via slots if they are not used + slots = [] + for replica in cluster.get_replicas().keys(): + if cluster.get_replicas()[replica] == cluster.get_primary(): + slots.append( + { + 'slot_type': 'physical', + 'slot_name': cluster.member_slotname(replica), + 'active': use_slots, + } + ) + execute_step_with_config( + context, + 'Then container "{name}" has following replication slots'.format(name=cluster.get_primary()), + yaml.dump(slots, default_flow_style=False), + ) + + # Check that all zk nodes is alive + for name in zk_names: + context.execute_steps( + """ + Then zookeeper "{name}" node is alive + """.format( + name=name + ) + ) + + # Check that pgbouncer running on all dbs and tried_remaster flag for all hosts in 'no' + for container in cluster.get_pg_members(): + context.execute_steps( + """ + Then pgbouncer is running in container "{name}" + And zookeeper "{zk_name}" has value "no" for key "/pgconsul/postgresql/all_hosts/pgconsul_{name}_1.pgconsul_pgconsul_net/tried_remaster" + """.format( + name=container, zk_name=zk_names[0] + ) + ) + + +@given('a "{cont_type}" container "{name}"') +def step_container(context, cont_type, name): + context.execute_steps( + ''' + Given a "{cont_type}" container "{name}" with following config + """ + """ + '''.format( + name=name, cont_type=cont_type + ) + ) + + +@given('a "{cont_type}" container "{name}" with following config') +def step_container_with_config(context, cont_type, name): + conf = yaml.safe_load(context.text) or {} + docker_config = copy.deepcopy(context.compose['services'][name]) + + # Check that image type is correct + build = docker_config.pop('build') + image_type = helpers.build_config_get_path(build) + assert image_type.endswith(cont_type), ( + 'invalid container type, ' + 
'expected "{cont_type}", docker-compose.yml has ' + 'build "{build}"'.format(cont_type=cont_type, build=image_type) + ) + + # Pop keys that will be changed + networks = docker_config.pop('networks') + docker_config.pop('name', None) + docker_config.pop('ports', None) + + # while jepsen test use another image for container pgconsul + # we need to create pgconsul container from our custom image + # not image from docker-compose.yml + image = ( + os.environ.get('PGCONSUL_IMAGE') + if cont_type == 'pgconsul' + else '{project}-{name}'.format(project=context.project, name=name) + ) + + # create dict {container_port: None} for each container's + # exposed port (docker will use next free port automatically) + ports = {} + for port in helpers.CONTAINER_PORTS[cont_type]: + ports[port] = ('0.0.0.0', None) + + # Create container + container = helpers.DOCKER.containers.create(image, **docker_config, name=name, ports=ports) + + context.containers[name] = container + + # Connect container to network + for netname, network in networks.items(): + context.networks[netname].connect(container, **network) + + # Process configs + common_config = context.config.get(cont_type, {}) + filenames = set(list(common_config.keys()) + list(conf.keys())) + for conffile in filenames: + confobj = config.fromfile(conffile, helpers.container_get_conffile(container, conffile)) + # merge existing config with common config + confobj.merge(common_config.get(conffile, {})) + # merge existing config with step config + confobj.merge(conf.get(conffile, {})) + helpers.container_inject_config(container, conffile, confobj) + + container.start() + container.reload() + + container.exec_run("/usr/local/bin/generate_certs.sh") + container.exec_run("/usr/local/bin/supervisorctl restart zookeeper") + container.exec_run("/usr/local/bin/supervisorctl restart pgconsul") + + +@given('a replication slot "{slot_name}" in container "{name}"') +@helpers.retry_on_assert +def step_replication_slot(context, slot_name, name): + container = context.containers[name] + db = Postgres(host=helpers.container_get_host(), port=helpers.container_get_tcp_port(container, 5432)) + db.create_replication_slot(slot_name) + + +@then('container "{name}" has following replication slots') +@helpers.retry_on_assert +def step_container_has_replication_slots(context, name): + container = context.containers[name] + db = Postgres(host=helpers.container_get_host(), port=helpers.container_get_tcp_port(container, 5432)) + exp_values = sorted(yaml.safe_load(context.text) or [], key=operator.itemgetter('slot_name')) + assert isinstance(exp_values, list), 'expected list, got {got}'.format(got=type(exp_values)) + + actual_values = sorted(db.get_replication_slots(), key=operator.itemgetter('slot_name')) + result_equal, err = helpers.are_dicts_subsets_of(exp_values, actual_values) + + assert result_equal, err + + +@when('we drop replication slot "{slot}" in container "{name}"') +def step_container_drop_replication_slot(context, slot, name): + container = context.containers[name] + db = Postgres(host=helpers.container_get_host(), port=helpers.container_get_tcp_port(container, 5432)) + db.drop_replication_slot(slot) + + +@then('container "{name}" is primary') +def step_container_is_primary(context, name): + container = context.containers[name] + db = Postgres(host=helpers.container_get_host(), port=helpers.container_get_tcp_port(container, 5432)) + assert db.is_primary(), 'container "{name}" is not primary'.format(name=name) + + +@then('container "{name}" replication state is "{state}"') 
+@helpers.retry_on_assert +def step_container_replication_state(context, name, state): + container = context.containers[name] + db = Postgres(host=helpers.container_get_host(), port=helpers.container_get_tcp_port(container, 5432)) + actual_state = db.get_replication_state()[0] + assert ( + actual_state == state + ), f'container "{name}" replication state is "{actual_state}", while expected is "{state}"' + + +@then('one of containers "{containers}" became a primary') +@helpers.retry_on_assert +def step_on_of_containers_became_primary(context, containers): + containers = containers.split(',') + primaries = [] + for container in containers: + try: + step_container_became_primary_no_retries(context, container) + primaries.append(container) + except AssertionError: + continue + assert len(primaries) == 1, 'expected one of {containers} is primary, but primaries are "{primaries}"'.format( + containers=containers, primaries=primaries + ) + + +def step_container_became_primary_no_retries(context, name): + container = context.containers[name] + db = Postgres(host=helpers.container_get_host(), port=helpers.container_get_tcp_port(container, 5432)) + assert db.is_primary(), 'container "{name}" is not primary'.format(name=name) + + +@then('container "{name}" became a primary') +@helpers.retry_on_assert +def step_container_became_primary(context, name): + step_container_became_primary_no_retries(context, name) + + +def assert_container_is_replica(context, replica_name, primary_name): + replica = context.containers[replica_name] + primary = context.containers[primary_name] + try: + replicadb = Postgres(host=helpers.container_get_host(), port=helpers.container_get_tcp_port(replica, 5432)) + + assert replicadb.is_primary() is False, 'container "{name}" is primary'.format(name=replica_name) + + assert replicadb.get_walreceiver_stat(), 'wal receiver not started' + + primarydb = Postgres(host=helpers.container_get_host(), port=helpers.container_get_tcp_port(primary, 5432)) + replicas = primarydb.get_replication_stat() + except psycopg2.Error as error: + raise AssertionError(error.pgerror) + + ips = list(helpers.container_get_ip_address(replica)) + myfqdn = helpers.container_get_fqdn(replica) + + # Find replica by one of container ip addresses + # and check that fqdn is same as container fqdn + for stat_replica in replicas: + if any(stat_replica['client_addr'] == ip for ip in ips): + assert ( + stat_replica['client_hostname'] == myfqdn + ), 'incorrect replica fqdn on primary "{fqdn}", expected "{exp}"'.format( + fqdn=stat_replica['client_hostname'], exp=myfqdn + ) + break + else: + assert False, 'container {replica} is not replica of container "{primary}"'.format( + replica=replica_name, primary=primary_name + ) + + +@then('container "{replica_name}" is a replica of container "{primary_name}"') +@helpers.retry_on_assert +def step_container_is_replica(context, replica_name, primary_name): + return assert_container_is_replica(context, replica_name, primary_name) + + +@then('pgbouncer is running in container "{name}"') +@helpers.retry_on_assert +def step_pgbouncer_running(context, name): + container = context.containers[name] + db = Postgres(host=helpers.container_get_host(), port=helpers.container_get_tcp_port(container, 6432)) + assert db.ping(), 'pgbouncer is not running in container "{name}"'.format(name=name) + + +@then('pgbouncer is not running in container "{name}"') +@helpers.retry_on_assert +def step_pgbouncer_not_running(context, name): + container = context.containers[name] + try: + 
Postgres(host=helpers.container_get_host(), port=helpers.container_get_tcp_port(container, 6432)) + except AssertionError as ae: + err = ae.args[0] + if isinstance(err, psycopg2.OperationalError) and any( + match in err.args[0] + for match in [ + 'Connection refused', # If container is shut and docker-proxy is not listening + 'timeout expired', # If container is disconnected from network and not reachable within timeout + 'server closed the connection unexpectedly', # If docker-proxy accepted connection but bouncer is down + ] + ): + # pgbouncer is really not running, it is what we want + return + + raise AssertionError( + f'pgbouncer is running in container "{name}" but connection can\'t be established. Error is {err!r}' + ) + # pgbouncer is running + raise AssertionError('pgbouncer is running in container "{name}"'.format(name=name)) + + +@then('container "{name}" has following config') +@helpers.retry_on_assert +def step_container_has_config(context, name): + container = context.containers[name] + conf = yaml.safe_load(context.text) or {} + for conffile, confvalue in conf.items(): + confobj = config.fromfile(conffile, helpers.container_get_conffile(container, conffile)) + valid, err = confobj.check_values_equal(confvalue) + assert valid, err + + +@then('postgresql in container "{name:w}" has value "{value}" for option "{option:w}"') +@helpers.retry_on_assert +def step_postgresql_option_has_value(context, name, value, option): + container = context.containers[name] + db = Postgres(host=helpers.container_get_host(), port=helpers.container_get_tcp_port(container, 5432)) + val = db.get_config_option(option) + assert val == value, 'option "{opt}" has value "{val}", expected "{exp}"'.format(opt=option, val=val, exp=value) + + +@then('postgresql in container "{name:w}" has empty option "{option}"') +@helpers.retry_on_assert +def step_postgresql_empty_option(context, name, option): + step_postgresql_option_has_value(context, name, '', option) + + +@when('run in container "{name:w}" "{sessions:d}" sessions with timeout {timeout:d}') +@helpers.retry_on_assert +def step_postgresql_make_sessions(context, name, sessions, timeout): + container = context.containers[name] + for connect in range(sessions): + db = Postgres( + host=helpers.container_get_host(), port=helpers.container_get_tcp_port(container, 5432), async_=True + ) + db.pg_sleep(timeout) + + +@then('pgbouncer in container "{name}" has value "{value}" for option "{option}"') +@helpers.retry_on_assert +def step_pgbouncer_option_has_value(context, name, value, option): + container = context.containers[name] + db = Postgres( + dbname='pgbouncer', + host=helpers.container_get_host(), + port=helpers.container_get_tcp_port(container, 6432), + autocommit=True, + ) + db.cursor.execute('SHOW config') + for row in db.cursor.fetchall(): + if str(row['key']) == str(option): + assert row['value'] == value, 'option "{opt}" has value "{val}", expected "{exp}"'.format( + opt=option, val=row['value'], exp=value + ) + break + else: + assert False, 'missing option "{opt}" in pgboncer config'.format(opt=option) + + +@then('container "{name}" has status "{status}"') +@helpers.retry_on_assert +def step_container_status(context, name, status): + container = context.containers[name] + container.reload() + current_status = helpers.container_get_status(container) + expected_status = str(status).lower() + assert current_status == expected_status, 'Unexpected container state "{state}", expected "{exp}"'.format( + state=current_status, exp=status + ) + + +@when('we kill 
container "{name}" with signal "{signal}"') +def step_kill_container(context, name, signal): + container = context.containers[name] + helpers.kill(container, signal) + container.reload() + + +def ensure_exec(context, container_name, cmd): + container = context.containers[container_name] + return helpers.exec(container, cmd) + + +@when('we kill "{service}" in container "{name}" with signal "{signal}"') +def step_kill_service(context, service, name, signal): + ensure_exec(context, name, 'pkill --signal %s %s' % (signal, service)) + + +@when('we gracefully stop "{service}" in container "{name}"') +def step_stop_service(context, service, name): + if service == 'postgres': + pgdata = _container_get_pgdata(context, name) + code, output = ensure_exec( + context, name, f'sudo -u postgres /usr/bin/postgresql/pg_ctl stop -s -m fast -w -t 60 -D {pgdata}' + ) + assert code == 0, f'Could not stop postgres: {output}' + else: + ensure_exec(context, name, 'supervisorctl stop %s' % service) + + +def _parse_pgdata(lsclusters_output): + """ + Parse pgdata from 1st row + """ + for row in lsclusters_output.split('\n'): + if not row: + continue + _, _, _, _, _, pgdata, _ = row.split() + return pgdata + + +def _container_get_pgdata(context, name): + """ + Get pgdata in container by name + """ + code, clusters_str = ensure_exec(context, name, 'pg_lsclusters --no-header') + assert code == 0, f'Could not list clusters: {clusters_str}' + return _parse_pgdata(clusters_str) + + +@when('we start "{service}" in container "{name}"') +def step_start_service(context, service, name): + if service == 'postgres': + pgdata = _container_get_pgdata(context, name) + code, output = ensure_exec(context, name, f'sudo -u postgres /usr/bin/postgresql/pg_ctl start -D {pgdata}') + assert code == 0, f'Could not start postgres: {output}' + else: + ensure_exec(context, name, 'supervisorctl start %s' % service) + + +@when('we stop container "{name}"') +def step_stop_container(context, name): + context.execute_steps( + """ + When we kill container "{name}" with signal "SIGTERM" + Then container "{name}" has status "exited" + """.format( + name=name + ) + ) + + +@when('we start container "{name}"') +def step_start_container(context, name): + container = context.containers[name] + container.reload() + status = helpers.container_get_status(container) + assert status == 'exited', 'Unexpected container state "{state}", expected "exited"'.format(state=status) + container.start() + container.reload() + + +@when('we disconnect from network container "{name}"') +def step_disconnect_container(context, name): + networks = context.compose['services'][name]['networks'] + container = context.containers[name] + for netname in networks: + context.networks[netname].disconnect(container) + + +@when('we connect to network container "{name}"') +def step_connect_container(context, name): + networks = context.compose['services'][name]['networks'] + container = context.containers[name] + for netname, network in networks.items(): + context.networks[netname].connect(container, **network) + + +@then('we fail') +def step_fail(_): + raise AssertionError('You asked - we failed') + + +@when('we wait "{interval:f}" seconds') +def step_sleep(_, interval): + time.sleep(interval) + + +@when('we wait until "{interval:f}" seconds to failover of "{container_name}" left in zookeeper "{zk_name}"') +def step_sleep_until_failover_cooldown(context, interval, container_name, zk_name): + last_failover_ts = helpers.get_zk_value(context, zk_name, 
'/pgconsul/postgresql/last_failover_time') + assert last_failover_ts is not None, 'last_failover_ts should not be "None"' + last_failover_ts = float(last_failover_ts) + + timeout = config.getint(context, container_name, 'pgconsul.conf', 'replica', 'min_failover_timeout') + now = time.time() + wait_duration = (last_failover_ts + timeout) - now - interval + assert wait_duration >= 0, 'we can\'t wait negative amount of time' + time.sleep(wait_duration) + + +@when('we disable archiving in "{name}"') +def step_disable_archiving(context, name): + container = context.containers[name] + db = Postgres(host=helpers.container_get_host(), port=helpers.container_get_tcp_port(container, 5432)) + db.disable_archiving() + + +@when('we switch wal in "{name}" "{times:d}" times') +def switch_wal(context, name, times): + container = context.containers[name] + db = Postgres(host=helpers.container_get_host(), port=helpers.container_get_tcp_port(container, 5432)) + context.wals = [] + for _ in range(times): + context.wals.append(db.switch_and_get_wal()) + time.sleep(1) + + +@then('wals present on backup "{name}"') +@helpers.retry_on_assert +def check_wals(context, name): + container = context.containers[name] + for wal in context.wals: + assert helpers.container_check_file_exists( + container, '/archive/{wal}'.format(wal=wal) + ), 'wal "{wal}" not present '.format(wal=wal) + + +@when('we run following command on host "{name}"') +def step_host_run_command(context, name): + context.last_exit_code, context.last_output = ensure_exec(context, name, context.text) + + +@then('command exit with return code "{code:d}"') +def step_command_return_code(context, code): + assert ( + code == context.last_exit_code + ), f'Expected "{code}", got "{context.last_exit_code}", output was "{context.last_output}"' + + +@then('command result is following output') +def step_command_output_exact(context): + assert context.text == context.last_output, f'Expected "{context.text}", got "{context.last_output}"' + + +@then('command result contains following output') +def step_command_output_contains(context): + assert context.text in context.last_output, f'Expected "{context.text}" not found in got "{context.last_output}"' + + +@when('we promote host "{name}"') +@helpers.retry_on_assert +def promote(context, name): + container = context.containers[name] + helpers.promote_host(container) + + +@when('we make switchover task with params "{params}" in container "{name}"') +def set_switchover_task(context, params, name): + container = context.containers[name] + if params == "None": + params = "" + helpers.set_switchover(container, params) + + +@then('pgconsul in container "{name}" is connected to zookeeper') +@helpers.retry_on_assert +def step_check_pgconsul_zk_connection(context, name): + container = context.containers[name] + _, output = container.exec_run("bash -c '/usr/bin/lsof -i -a -p `supervisorctl pid pgconsul`'", privileged=True) + pgconsul_conns = [] + for line in output.decode().split('\n'): + conns = line.split()[8:] + if '(ESTABLISHED)' in conns: + pgconsul_conns += [c.split('->')[1].rsplit(':', 1) for c in conns if c != '(ESTABLISHED)'] + pgconsul_zk_conns = [c for c in pgconsul_conns if 'zookeeper' in c[0] and '2281' == c[1]] + assert pgconsul_zk_conns, "pgconsul in container {name} is not connected to zookeper".format(name=name) + + +@then('"{x:d}" containers are replicas of "{primary_name}" within "{sec:f}" seconds') +def step_x_containers_are_replicas_of(context, x, primary_name, sec): + timeout = time.time() + sec + while 
time.time() < timeout: + replicas_count = 0 + for container_name in context.containers: + if 'postgres' not in container_name: + continue + try: + assert_container_is_replica(context, container_name, primary_name) + except AssertionError: + # this container is not a replica of primary, ok + pass + else: + replicas_count += 1 + if replicas_count == x: + return + time.sleep(context.interval) + assert False, "{x} containers are not replicas of {primary}".format(x=x, primary=primary_name) + + +@then('at least "{x}" postgresql instances are running for "{interval:f}" seconds') +def step_x_postgresql_are_running(context, x, interval): + start_time = time.time() + while time.time() < start_time + interval: + x = int(x) + running_count = 0 + for container in context.containers.values(): + try: + db = Postgres(host=helpers.container_get_host(), port=helpers.container_get_tcp_port(container, 6432)) + except AssertionError: + # ok, this db is not running right now + pass + else: + if db.ping(): + running_count += 1 + assert ( + running_count >= x + ), "postgresql should be running in " + "{x} containers, but it is running in {y} containers".format( + x=x, y=running_count + ) + + +def get_minimal_simultaneously_running_count(state_changes, cluster_size): + running_count = 0 + is_cluster_completed = False + minimal_running_count = None + for change in state_changes: + if change.new_state == helpers.DBState.shut_down: + running_count -= 1 + if is_cluster_completed: + minimal_running_count = min(minimal_running_count, running_count) + elif change.new_state == helpers.DBState.working: + running_count += 1 + if running_count == cluster_size: + is_cluster_completed = True + minimal_running_count = cluster_size + return minimal_running_count + + +@then('container "{name}" is in quorum group') +@helpers.retry_on_assert +def step_container_is_in_quorum_group(context, name): + service = context.compose['services'][name] + fqdn = f'{service["hostname"]}.{service["domainname"]}' + assert zk.has_value_in_list(context, 'zookeeper1', '/pgconsul/postgresql/quorum', fqdn) + assert zk.has_subset_of_values( + context, + 'zookeeper1', + '/pgconsul/postgresql/replics_info', + { + fqdn: { + 'state': 'streaming', + } + }, + ) + + +@then('container "{name}" is in sync group') +@helpers.retry_on_assert +def step_container_is_in_sync_group(context, name): + service = context.compose['services'][name] + fqdn = f'{service["hostname"]}.{service["domainname"]}' + context.execute_steps( + f''' + Then zookeeper "zookeeper1" has holder "{fqdn}" for lock "/pgconsul/postgresql/sync_replica" + ''' + ) + assert zk.has_subset_of_values( + context, + 'zookeeper1', + '/pgconsul/postgresql/replics_info', + { + fqdn: { + 'state': 'streaming', + 'sync_state': 'sync', + } + }, + ) + + +@then('quorum replication is in normal state') +def step_quorum_replication_is_in_normal_state(context): + pass + + +@then('sync replication is in normal state') +def step_single_sync_replication_is_in_normal_state(context): + pass + + +@then('at least "{x}" postgresql instances were running simultaneously during test') +def step_x_postgresql_were_running_simultaneously(context, x): + x = int(x) + state_changes = [] + cluster_size = 0 + for name, container in context.containers.items(): + if 'postgres' not in name: + continue + cluster_size += 1 + log_stream = helpers.container_get_filestream(container, "/var/log/postgresql/postgresql.log") + logs = list(map(lambda line: line.decode('u8'), log_stream)) + 
state_changes.extend(helpers.extract_state_changes_from_postgresql_logs(logs)) + state_changes = sorted(state_changes) + min_running = get_minimal_simultaneously_running_count(state_changes, cluster_size) + assert ( + min_running >= x + ), "postgresql had to be running in " + "{x} containers, but it was running in {y} containers".format( + x=x, y=min_running + ) + + +@when('we set value "{value}" for option "{option}" in section "{section}" in pgconsul config in container "{name}"') +def step_change_pgconsul_option(context, value, option, section, name): + container = context.containers[name] + conffile = 'pgconsul.conf' + confobj = config.fromfile(conffile, helpers.container_get_conffile(container, conffile)) + confobj.merge({section: {option: value}}) + helpers.container_inject_config(container, conffile, confobj) + + +@when('we set value "{value}" for option "{option}" in "{conffile}" config in container "{name}"') +def step_change_option(context, value, option, conffile, name): + container = context.containers[name] + confobj = config.fromfile(conffile, helpers.container_get_conffile(container, conffile)) + confobj.merge({option: value}) + helpers.container_inject_config(container, conffile, confobj) + + +@when('we restart "{service}" in container "{name}"') +def step_restart_service(context, service, name): + if service == 'postgres': + pgdata = _container_get_pgdata(context, name) + code, output = ensure_exec( + context, name, f'sudo -u postgres /usr/bin/postgresql/pg_ctl restart -s -m fast -w -t 60 -D {pgdata}' + ) + assert code == 0, f'Could not restart postgres: {output}' + else: + ensure_exec(context, name, f'supervisorctl restart {service}') + + +@then('"{service}" {running:IsOrNot} running in container "{name}"') +def step_service_running(context, service, running, name): + exit_code, output = ensure_exec(context, name, f'supervisorctl status {service}') + if running: + assert exit_code == 0, f'Service {service} is not running in container {name}' + else: + assert exit_code != 0, f'Service {service} is running in container {name}' + + +def get_postgres_start_time(context, name): + container = context.containers[name] + try: + postgres = Postgres(host=helpers.container_get_host(), port=helpers.container_get_tcp_port(container, 5432)) + return postgres.get_start_time() + except psycopg2.Error as error: + raise AssertionError(error.pgerror) + + +@when('we remember postgresql start time in container "{name}"') +def step_remember_pg_start_time(context, name): + context.pg_start_time[name] = get_postgres_start_time(context, name) + + +@then('postgresql in container "{name}" {restarted:WasOrNot} restarted') +def step_was_pg_restarted(context, name, restarted): + if restarted: + assert get_postgres_start_time(context, name) != context.pg_start_time[name] + else: + assert get_postgres_start_time(context, name) == context.pg_start_time[name] + + +@then('postgresql in container "{name}" {rewinded:WasOrNot} rewinded') +def step_was_pg_rewinded(context, name, rewinded): + container = context.containers[name] + actual_rewinded = helpers.container_file_exists(container, '/tmp/rewind_called') + assert rewinded == actual_rewinded + + +@then('container "{name}" is replaying WAL') +@helpers.retry_on_assert +def step_container_replaying_wal(context, name): + container = context.containers[name] + try: + db = Postgres(host=helpers.container_get_host(), port=helpers.container_get_tcp_port(container, 5432)) + assert not db.is_wal_replay_paused() + except psycopg2.Error as error: + raise 
AssertionError(error.pgerror) + + +@when('we pause replaying WAL in container "{name}"') +def step_container_pause_replaying_wal(context, name): + container = context.containers[name] + db = Postgres(host=helpers.container_get_host(), port=helpers.container_get_tcp_port(container, 5432)) + db.wal_replay_pause() diff --git a/tests/steps/config.py b/tests/steps/config.py new file mode 100644 index 0000000..92f09a0 --- /dev/null +++ b/tests/steps/config.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from configparser import RawConfigParser + +from tests.steps import helpers + + +def fromfile(filename, fileobj): + if filename in ['pgconsul.conf', 'pgbouncer.ini']: + return ConfigINI(fileobj) + elif filename in ['postgresql.conf', 'postgresql.auto.conf', 'recovery.conf']: + return ConfigPG(fileobj) + elif filename == 'standby.signal': + return EmptyConfig(fileobj) + else: + raise NotImplementedError('Unknown config file {filename}'.format(filename=filename)) + + +def getint(context, container_name, conf_name, section, key): + container = context.containers[container_name] + config = RawConfigParser() + config.read_file(helpers.container_get_conffile(container, conf_name)) + return config.getint(section, key) + + +class Config(object): + def __init__(self): + raise NotImplementedError() + + def merge(self, config): + raise NotImplementedError() + + def write(self): + raise NotImplementedError() + + def check_values_equal(self, config): + raise NotImplementedError() + + +class EmptyConfig(Config): + def __init__(self, _): + pass + + def merge(self, _): + pass + + def write(self, _): + pass + + def check_values_equal(self, _): + pass + + +class ConfigINI(Config): + def __init__(self, fileobj): + self.config = RawConfigParser() + self.config.readfp(fileobj) + + def merge(self, config): + assert isinstance(config, dict) + for section, values in config.items(): + if not self.config.has_section(section): + self.config.add_section(section) + for key, value in values.items(): + self.config.set(section, key, str(value)) + + def write(self, fileobj): + return self.config.write(fileobj) + + def check_values_equal(self, config): + assert isinstance(config, dict) + for section, values in config.items(): + if not self.config.has_section(section): + return False, 'missing section "{sec}"'.format(sec=section) + for key, expected_value in values.items(): + if not self.config.has_option(section, key): + return False, 'missing option "{opt}" in section "{sec}"'.format(opt=key, sec=section) + value = self.config.get(section, key) + if str(value) != str(expected_value): + return False, 'option "{opt}" in section "{sec}" has value "{val}" expected "{exp}"'.format( + opt=key, sec=section, val=value, exp=expected_value + ) + return True, None + + +class ConfigPG(Config): + def __init__(self, fileobj): + self.config = {} + for line in fileobj: + if line.strip().startswith('#'): + continue + tup = line.strip('\n').split('=', maxsplit=1) + assert len(tup) == 2, 'unexpected tuple {tup}'.format(tup=tup) + self.config[tup[0].strip()] = tup[1].strip() + + def merge(self, config): + assert isinstance(config, dict) + for key, value in config.items(): + self.config[key.strip()] = "'{value}'".format(value=str(value).strip().replace("'", r"\'")) + + def write(self, fileobj): + for key, value in self.config.items(): + fileobj.write('{key} = {value}\n'.format(key=key, value=value)) + + def check_values_equal(self, config): + assert isinstance(config, dict) + for key, val in config.items(): + expected_value = 
str(val).strip() + stripped_key = key.strip() + if stripped_key not in self.config: + return False, 'missing option "{opt}"'.format(opt=key) + + # NOTE: We need to be more carefully here. It is wrong to + # simply replace "'" to "" if value has escaped quote "\'". + # But seems that we have not this cases. + value = self.config[stripped_key].replace("'", '') + if str(value) != str(expected_value): + return False, 'option "{opt}" has value "{val}", expected "{exp}"'.format( + opt=stripped_key, val=value, exp=expected_value + ) + return True, None diff --git a/tests/steps/database.py b/tests/steps/database.py new file mode 100644 index 0000000..002199b --- /dev/null +++ b/tests/steps/database.py @@ -0,0 +1,193 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# We are using deepcopy in every data return to help +# python run destructor on connection to prevent +# database connection leakage +from copy import deepcopy + +import psycopg2 +from psycopg2.extras import RealDictCursor +import select + + +class Postgres(object): + def __init__( + self, host='localhost', dbname='postgres', user='postgres', port='5432', autocommit=True, async_=False + ): + self.conn = None + try: + self.conn = psycopg2.connect(user=user, host=host, port=port, dbname=dbname, async_=async_) + + if async_: + self.wait(self.conn) + else: + self.conn.autocommit = autocommit + self.cursor = self.conn.cursor(cursor_factory=RealDictCursor) + # Sometimes we leave connections closed on our side + # but open in PostgreSQL (some issue with docker?) + # This query ensures that none of such connections + # will leak + if dbname != 'pgbouncer': + self.cursor.execute( + """-- noinspection SqlResolveForFile + + SELECT pg_terminate_backend(pid) + FROM pg_stat_activity + WHERE client_addr IS NOT NULL + AND state = 'idle' + AND pid != pg_backend_pid() + """ + ) + if async_: + self.wait(self.cursor.connection) + except psycopg2.OperationalError as error: + assert False, error + except psycopg2.DatabaseError as error: + assert False, error + + def __del__(self): + if self.conn: + self.conn.close() + + def ping(self): + try: + self.cursor.execute( + """ + SELECT true AS ping + """ + ) + return deepcopy(self.cursor.fetchone())['ping'] + except psycopg2.OperationalError: + return False + + def pg_sleep(self, timeout=1): + self.cursor.execute( + """ + SELECT pg_sleep({timeout}) + """.format( + timeout=timeout + ) + ) + return None + + def is_primary(self): + self.cursor.execute( + """ + SELECT pg_is_in_recovery() AS in_recovery + """ + ) + return self.cursor.fetchone()['in_recovery'] is False + + def get_replication_stat(self): + self.cursor.execute( + """-- noinspection SqlResolveForFile + SELECT * FROM pg_stat_replication + """ + ) + return deepcopy(self.cursor.fetchall()) + + def get_replication_state(self): + self.cursor.execute('SHOW synchronous_standby_names;') + res = self.cursor.fetchone()['synchronous_standby_names'] + res = ('async', None) if res == '' else ('sync', res) + return res + + def get_walreceiver_stat(self): + self.cursor.execute( + """-- noinspection SqlResolveForFile + SELECT * FROM pg_stat_wal_receiver + """ + ) + return deepcopy(self.cursor.fetchone()) + + def get_config_option(self, option): + self.cursor.execute( + """ + SELECT current_setting(%(option)s) AS opt + """, + {'option': option}, + ) + return deepcopy(self.cursor.fetchone())['opt'] + + def create_replication_slot(self, slot_name): + self.cursor.execute( + """ + SELECT pg_create_physical_replication_slot(%(name)s) + """, + {'name': slot_name}, + ) + 
return deepcopy(self.cursor.fetchone()) + + def get_replication_slots(self): + self.cursor.execute( + """-- noinspection SqlResolveForFile + SELECT * FROM pg_replication_slots + """ + ) + return deepcopy(self.cursor.fetchall()) + + def drop_replication_slot(self, slot_name): + self.cursor.execute( + """ + SELECT pg_drop_replication_slot(%(name)s) + """, + {'name': slot_name}, + ) + return deepcopy(self.cursor.fetchone()) + + def switch_and_get_wal(self): + self.cursor.execute( + """ + SELECT pg_walfile_name(pg_switch_wal()) + """ + ) + return deepcopy(self.cursor.fetchone())['pg_walfile_name'] + + def disable_archiving(self): + self.cursor.execute( + """ + ALTER SYSTEM SET archive_command = '/bin/true' + """ + ) + self.cursor.execute( + """ + SELECT pg_reload_conf() + """ + ) + return deepcopy(self.cursor.fetchone()) + + def get_start_time(self): + self.cursor.execute( + """ + SELECT pg_postmaster_start_time() AS time + """ + ) + return deepcopy(self.cursor.fetchone())['time'] + + def is_wal_replay_paused(self): + self.cursor.execute( + """ + SELECT pg_is_wal_replay_paused() as paused + """ + ) + return deepcopy(self.cursor.fetchone())['paused'] + + def wal_replay_pause(self): + self.cursor.execute( + """ + SELECT pg_wal_replay_pause() + """ + ) + + def wait(self, conn): + while True: + state = conn.poll() + if state == psycopg2.extensions.POLL_OK: + break + elif state == psycopg2.extensions.POLL_WRITE: + select.select([], [conn.fileno()], []) + elif state == psycopg2.extensions.POLL_READ: + select.select([conn.fileno()], [], []) + else: + raise psycopg2.OperationalError("poll() returned %s" % state) diff --git a/tests/steps/helpers.py b/tests/steps/helpers.py new file mode 100644 index 0000000..f303f68 --- /dev/null +++ b/tests/steps/helpers.py @@ -0,0 +1,326 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import io +import logging +import os +import subprocess +import tarfile +import time +import datetime +import enum +import contextlib +from kazoo.client import KazooClient +from kazoo.exceptions import NoNodeError +from kazoo.security import make_digest_acl +from kazoo.handlers.threading import KazooTimeoutError + +import docker +from docker.errors import APIError + +# Connect to docker daemon +DOCKER = docker.from_env(timeout=600) + +PGDATA = '/var/lib/postgresql/{pg_major}/main'.format(pg_major=os.environ.get('PG_MAJOR')) + +CONFIG_ENVS = { + 'pgconsul.conf': '/etc/pgconsul.conf', + 'postgresql.conf': '{pgdata}/postgresql.conf'.format(pgdata=PGDATA), + 'postgresql.auto.conf': '{pgdata}/postgresql.auto.conf'.format(pgdata=PGDATA), + 'recovery.conf': '{pgdata}/conf.d/recovery.conf'.format(pgdata=PGDATA), + 'standby.signal': '{pgdata}/standby.signal'.format(pgdata=PGDATA), + 'pgbouncer.ini': '/etc/pgbouncer/pgbouncer.ini', +} + +CONTAINER_PORTS = {'pgconsul': ['5432', '6432'], 'zookeeper': ['2181', '2281', '2188', '2189'], 'backup': ['873']} + +LOG = logging.getLogger('helpers') + +DB_SHUTDOWN_MESSAGE = 'database system is shut down' +DB_READY_MESSAGE = 'database system is ready to accept' +POSTGRES_LOG_TIME_FMT = '%Y-%m-%d %H:%M:%S.%f' + + +class DBState(enum.Enum): + shut_down = 1 + working = 2 + + +class DBStateChange(object): + def __init__(self, time, new_state): + self.time = time + self.new_state = new_state + + def __lt__(self, other): + return self.time < other.time + + +def retry_on_error(function, errType): + """ + Decorator for retrying. It catchs AssertionError + while timeout not exceeded. 
+ """ + + def wrapper(*args, **kwargs): + context = args[0] + timeout = time.time() + float(context.timeout) + while True: + try: + return function(*args, **kwargs) + except errType as error: + LOG.info( + '{time}: {func} call: {err}'.format( + time=datetime.datetime.now().strftime("%H:%M:%S"), func=str(function.__name__), err=error + ) + ) + # raise exception if timeout exceeded + if time.time() > timeout: + raise + time.sleep(context.interval) + + return wrapper + + +def retry_on_assert(function): + return retry_on_error(function, AssertionError) + + +def retry_on_kazoo_timeout(function): + return retry_on_error(function, KazooTimeoutError) + + +def is_dict_subset_of(left, right): + for key, value in left.items(): + if key not in right: + return False, f'missing "{key}", expected "{value}"' + if value != right[key]: + message = f'key "{key}" has value "{right[key]}" expected "{value}"' + return False, message + return True, None + + +def is_2d_dict_subset_of(subset, superset): + for key, val in subset.items(): + if key not in superset: + return False + is_subset, _ = is_dict_subset_of(val, superset[key]) + if not is_subset: + return False + return True + + +def are_dicts_subsets_of(exp_values, actual_values): + if len(actual_values) != len(exp_values): + return False, 'expected {exp} values, got {got}'.format(exp=len(exp_values), got=len(actual_values)) + + for i, expected in enumerate(exp_values): + is_subset, err = is_dict_subset_of(expected, actual_values[i]) + + # return immediately if values are not equal + if not is_subset: + return is_subset, err + + return True, None + + +def extract_time_from_log_line(line): + str_time, _ = line.split('UTC') + return datetime.datetime.strptime(str_time.strip(), POSTGRES_LOG_TIME_FMT) + + +def extract_state_changes_from_postgresql_logs(logs): + state_changes = [] + for line in logs: + if DB_READY_MESSAGE in line: + state_changes.append(DBStateChange(extract_time_from_log_line(line), DBState.working)) + elif DB_SHUTDOWN_MESSAGE in line: + state_changes.append(DBStateChange(extract_time_from_log_line(line), DBState.shut_down)) + return state_changes + + +def container_env(container, env_var): + return container.exec_run('/bin/bash -c "echo ${env_var}"'.format(env_var=env_var)).decode().strip('\n') + + +def container_get_fqdn(container): + return '{hostname}.{domain}'.format( + hostname=container.attrs['Config']['Hostname'], domain=container.attrs['Config']['Domainname'] + ) + + +def container_get_ip_address(container): + for network in container.attrs['NetworkSettings']['Networks'].values(): + yield network['IPAddress'] + + +def container_get_host(): + """ + Get exposed host (differs from localhost if you use docker-machine) + """ + machine_name = os.getenv('DOCKER_MACHINE_NAME') + if machine_name: + return subprocess.check_output(['docker-machine', 'ip', machine_name]).decode('utf-8').rstrip() + + return 'localhost' + + +def container_get_tcp_port(container, port): + binding = container.attrs['NetworkSettings']['Ports'].get('{port}/tcp'.format(port=port)) + if binding: + return binding[0]['HostPort'] + + +def container_get_env(container, env): + for env_str in container.attrs['Config']['Env']: + var, value = env_str.split('=') + if var == str(env): + return value + + +def container_get_status(container): + container.reload() + return container.status.strip().lower() + + +def container_file_exists(container, path): + try: + _, _ = container.get_archive(path) + return True + except docker.errors.NotFound: + return False + + +def 
container_get_tar(container, path): + archive, _ = container.get_archive(path) + raw_tarfile = io.BytesIO() + for chunk in archive: + raw_tarfile.write(chunk) + raw_tarfile.seek(0) + return raw_tarfile + + +def container_get_files(container, path): + tar = tarfile.open(mode='r', fileobj=container_get_tar(container, path)) + for member in tar.getmembers(): + if not member.isfile(): + continue + yield tar.extractfile(member) + tar.close() + + +def container_get_filecontent(container, filepath): + tar = tarfile.open(mode='r', fileobj=container_get_tar(container, filepath)) + fname = os.path.split(filepath)[1] + file_content = tar.extractfile(fname).read() + tar.close() + return file_content + + +def container_get_filestream(container, filepath): + tar = tarfile.open(mode='r', fileobj=container_get_tar(container, filepath)) + fname = os.path.split(filepath)[1] + for line in tar.extractfile(fname).readlines(): + yield line + tar.close() + + +def container_get_conffile(container, filename): + filepath = CONFIG_ENVS.get(filename, filename) + try: + file_content = container_get_filecontent(container, filepath) + return io.StringIO(file_content.decode()) + except docker.errors.NotFound: + return io.StringIO() + + +def kill(container, signal): + """ + Stop container by Sending signal (not fails if container is not running) + """ + try: + container.kill(signal) + except APIError as exc: + if 'is not running' not in str(exc): + raise + + +def container_inject_file(container, filename, fileobj): + # convert file to byte via BytesIO + content = fileobj.read().encode() + infile = io.BytesIO(content) + outfile = io.BytesIO() + filepath = CONFIG_ENVS.get(filename, filename) + path, name = os.path.split(filepath) + + # create tar archive + tar = tarfile.open(mode='w', fileobj=outfile) + tarinfo = tarfile.TarInfo(name) + tarinfo.size = len(content) + tarinfo.mode = 0o0666 + tar.addfile(tarinfo, infile) + tar.close() + container.put_archive(path, outfile.getvalue()) + + +def container_inject_config(container, filename, confobj): + # Write config into StringIO file + conffile = io.StringIO() + confobj.write(conffile) + # We need to seek into begin after write + conffile.seek(os.SEEK_SET) + container_inject_file(container, filename, conffile) + + +def build_config_get_path(build): + if isinstance(build, dict): + return build['context'] + return build + + +def container_check_file_exists(container, filepath): + try: + container.get_archive(filepath) + return True + except docker.errors.NotFound: + return False + + +def promote_host(container): + container.exec_run('bash /usr/bin/promote') + + +def set_switchover(container, params): + return container.exec_run('pgconsul-util switchover -y {params}'.format(params=params)) + + +def get_zk(context, name): + container = context.containers[name] + acl = make_digest_acl('user1', 'testpassword123', all=True) + return KazooClient( + '{host}:{port}'.format(host=container_get_host(), port=container_get_tcp_port(container, 2181)), + default_acl=[acl], + auth_data=[('digest', '{username}:{password}'.format(username='user1', password='testpassword123'))], + ) + + +def get_zk_value(context, zk_name, key): + with contextlib.suppress(Exception): + zk = get_zk(context, zk_name) + zk.start() + try: + value = zk.get(key)[0].decode() + except NoNodeError: + return None + finally: + zk.stop() + zk.close() + return value + return None + + +def exec(container, cmd): + """ + Execute command inside of given container + """ + result = container.exec_run(cmd) + return result.exit_code, 
result.output.decode().rstrip('\n') diff --git a/tests/steps/zk.py b/tests/steps/zk.py new file mode 100644 index 0000000..5d6a4d0 --- /dev/null +++ b/tests/steps/zk.py @@ -0,0 +1,185 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from datetime import datetime +import json +import operator + +import kazoo.exceptions +from kazoo.handlers.threading import KazooTimeoutError +import steps.helpers as helpers +import yaml +from behave import then, when + + +@then('zookeeper "{name}" has holder "{holders}" for lock "{key}"') +@then('zookeeper "{name}" has one of holders "{holders}" for lock "{key}"') +@helpers.retry_on_assert +def step_zk_check_holders(context, name, holders, key): + try: + zk = helpers.get_zk(context, name) + contender = None + zk.start() + lock = zk.Lock(key) + contenders = lock.contenders() + if contenders: + contender = contenders[0] + finally: + zk.stop() + zk.close() + for holder in holders.split(','): + if str(contender) == str(holder): + return + raise AssertionError( + '{time}: lock "{key}" holder is "{holder}", expected one of "{exp}"'.format( + key=key, holder=contender, exp=holders, time=datetime.now().strftime("%H:%M:%S") + ) + ) + + +@when('we lock "{key}" in zookeeper "{name}"') +@when('we lock "{key}" in zookeeper "{name}" with value "{value}"') +def step_zk_lock(context, key, name, value=None): + if not context.zk: + context.zk = helpers.get_zk(context, name) + context.zk.start() + lock = context.zk.Lock(key, value) + lock.acquire() + context.zk_locks[key] = lock + + +@when('we release lock "{key}" in zookeeper "{name}"') +def step_zk_release_lock(context, key, name): + if key in context.zk_locks: + context.zk_locks[key].release() + + +@then('zookeeper "{name}" has no value for key "{key}"') +@helpers.retry_on_assert +def step_zk_no_value(context, name, key): + zk_value = helpers.get_zk_value(context, name, key) + assert zk_value is None, '{time}: node "{key}" exists and has value "{val}"'.format( + key=key, val=zk_value, time=datetime.now().strftime("%H:%M:%S") + ) + + +@then('zookeeper "{name}" node is alive') +def step_zk_is_alive(context, name): + key = '/test_is_{0}_alive'.format(name) + try: + step_zk_set_value(context, name, key, name) + step_zk_value(context, name, name, key) + except (AssertionError, KazooTimeoutError): + helpers.LOG.warn( + '{time}: {name} zookeeper looks dead, try to repair'.format( + name=name, time=datetime.now().strftime("%H:%M:%S") + ) + ) + try_to_repair_zk_host(context, name) + step_zk_set_value_with_retries(context, name, key, name) + step_zk_value(context, name, name, key) + + +def try_to_repair_zk_host(context, name): + container = context.containers[name] + # https://stackoverflow.com/questions/57574298/zookeeper-error-the-current-epoch-is-older-than-the-last-zxid + err = 'is older than the last zxid' + container.exec_run( + "grep '{err}' /var/log/zookeeper/zookeeper--server-pgconsul_{name}_1.log && rm -rf /tmp/zookeeper/version-2".format( + err=err, name=name + ) + ) + container.exec_run("/usr/local/bin/supervisorctl restart zookeeper") + + +@then('zookeeper "{name}" has value "{value}" for key "{key}"') +@helpers.retry_on_assert +def step_zk_value(context, name, value, key): + zk_value = helpers.get_zk_value(context, name, key) + assert str(zk_value) == str(value), '{time}: expected value "{exp}", got "{val}"'.format( + exp=value, val=zk_value, time=datetime.now().strftime("%H:%M:%S") + ) + + +@then('zookeeper "{name}" has "{n}" values for key "{key}"') +@helpers.retry_on_assert +def step_zk_key_has_n_values(context, 
name, n, key): + n = int(n) + zk_value = helpers.get_zk_value(context, name, key) + assert zk_value is not None, 'key {key} does not exists'.format(key=key) + actual_values = json.loads(zk_value) + assert n == len(actual_values), 'expected {n} values in key {key}, but values are {values}'.format( + n=n, key=key, values=actual_values + ) + + +@then('zookeeper "{name}" has following values for key "{key}"') +@helpers.retry_on_assert +def step_zk_key_values(context, name, key): + exp_values = sorted(yaml.safe_load(context.text) or [], key=operator.itemgetter('client_hostname')) + assert isinstance(exp_values, list), '{time}: expected list, got {got}'.format( + got=type(exp_values), time=datetime.now().strftime("%H:%M:%S") + ) + zk_value = helpers.get_zk_value(context, name, key) + assert zk_value is not None, '{time}: key {key} does not exists'.format( + key=key, time=datetime.now().strftime("%H:%M:%S") + ) + + actual_values = sorted(json.loads(zk_value), key=operator.itemgetter('client_hostname')) + + equal, error = helpers.are_dicts_subsets_of(exp_values, actual_values) + assert equal, error + + +def has_value_in_list(context, zk_name, key, value): + zk_value = helpers.get_zk_value(context, zk_name, key) + if zk_value is None or zk_value == "": + return False + + zk_list = json.loads(zk_value) + return value in zk_list + + +def has_subset_of_values(context, zk_name, key, exp_values): + zk_value = helpers.get_zk_value(context, zk_name, key) + if zk_value is None: + return False + + zk_dicts = json.loads(zk_value) + actual_values = {d['client_hostname']: d for d in zk_dicts} + + equal = helpers.is_2d_dict_subset_of(exp_values, actual_values) + return equal + + +@helpers.retry_on_kazoo_timeout +def step_zk_set_value_with_retries(context, value, key, name): + return step_zk_set_value(context, value, key, name) + + +@when('we set value "{value}" for key "{key}" in zookeeper "{name}"') +def step_zk_set_value(context, value, key, name): + try: + zk = helpers.get_zk(context, name) + zk.start() + zk.ensure_path(key) + # There is race condition, node can be deleted after ensure_path and + # before set called. We need to catch exception and create it again. 
+ try: + zk.set(key, value.encode()) + except kazoo.exceptions.NoNodeError: + zk.create(key, value.encode()) + finally: + zk.stop() + zk.close() + + +@when('we remove key "{key}" in zookeeper "{name}"') +def step_zk_remove_key(context, key, name): + try: + zk = helpers.get_zk(context, name) + zk.start() + zk.delete(key, recursive=True) + finally: + zk.stop() + zk.close() diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..81ab11c --- /dev/null +++ b/tox.ini @@ -0,0 +1,53 @@ +[tox] +envlist = behave, yapf, flake8, pylint, bandit + +[testenv:behave] +passenv = * +basepython=python3.10 +commands = behave {posargs} --show-timings --stop tests/features --junit --junit-directory junit_report +deps = behave==1.2.6 + retrying==1.3.3 + docker==4.2.2 + PyYAML==5.3.1 + psycopg2-binary==2.8.5 + kazoo==2.8.0 + coverage==4.5.4 + lockfile + daemon + urllib3<2 + +[testenv:behave_unstoppable] +passenv = * +basepython=python3.10 +commands = behave {posargs} --show-timings tests/features --junit --junit-directory junit_report +deps = behave==1.2.6 + retrying==1.3.3 + docker==4.2.2 + PyYAML==5.3.1 + psycopg2-binary==2.8.5 + kazoo==2.8.0 + coverage==4.5.4 + +[testenv:yapf] +basepython=python3.10 +commands = yapf -rpd src +deps = yapf + +[testenv:flake8] +basepython=python3.10 +commands = flake8 src +deps = flake8 + flake8-string-format + flake8-isort + flake8-commas + flake8-quotes + +[testenv:pylint] +basepython=python3.10 +commands = pylint src +deps = pylint + +[testenv:bandit] +basepython=python3.10 +commands = bandit -r src +deps = bandit diff --git a/wd_pgconsul/wd_pgconsul.py b/wd_pgconsul/wd_pgconsul.py new file mode 100644 index 0000000..77fcde5 --- /dev/null +++ b/wd_pgconsul/wd_pgconsul.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python + +import json +import os +import subprocess +import sys +import time + +from pgconsul import read_config + + +def restart(comment=None): + subprocess.call('/etc/init.d/pgconsul restart', shell=True, stdout=sys.stdout, stderr=sys.stderr) + print('pgconsul has been restarted due to %s.' % comment) + sys.exit(0) + + +def rewind_running(): + pids = [pid for pid in os.listdir('/proc') if pid.isdigit()] + for pid in pids: + try: + cmd = open(os.path.join('/proc', pid, 'cmdline'), 'rb').read() + if 'pg_rewind' in cmd: + return True + except IOError: + # proc has already terminated + continue + return False + + +def main(): + config = read_config(filename='/etc/pgconsul.conf') + work_dir = config.get('global', 'working_dir') + stop_file = os.path.join(work_dir, 'pgconsul.stopped') + + if os.path.exists(stop_file): + print('pgconsul has been stoppped gracefully. Not doing anything.') + sys.exit(0) + + p = subprocess.call('/etc/init.d/pgconsul status', shell=True, stdout=sys.stdout, stderr=sys.stderr) + if p != 0: + restart('dead service') + + status_file = os.path.join(work_dir, 'pgconsul.status') + # We multiply on 3 because sanity checks and pg_rewind may take + # some time without updating status-file + timeout = config.getint('replica', 'recovery_timeout') * 3 + f = open(status_file, 'r') + state = json.loads(f.read()) + f.close() + if float(state['ts']) <= time.time() - timeout and not rewind_running(): + restart('stale info in status-file') + + +if __name__ == '__main__': + main()
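wd_pgconsul.py treats the status file as stale only after three times the configured recovery_timeout, since sanity checks and pg_rewind may legitimately run that long without the file being refreshed. A minimal sketch of that arithmetic, with hypothetical numbers (not part of the diff):

import time

recovery_timeout = 60              # hypothetical [replica] recovery_timeout from pgconsul.conf
grace = recovery_timeout * 3       # the watchdog tolerates up to 180 seconds of silence

state = {'ts': time.time() - 200}  # hypothetical pgconsul.status contents: last update 200 s ago

if float(state['ts']) <= time.time() - grace:
    print('restart pgconsul: stale info in status-file')
else:
    print('status file is fresh enough')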
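The retry_on_error()/retry_on_assert() decorators in tests/steps/helpers.py drive most of the Then-steps: a step is re-run until context.timeout seconds have elapsed, sleeping context.interval seconds between attempts, and the last AssertionError is re-raised once the budget is spent. A standalone sketch of that pattern; the SimpleNamespace context and the step below are hypothetical stand-ins, not part of the diff.

import time
import types


def retry_on_assert(function):
    # Same shape as helpers.retry_on_assert()/retry_on_error(), reproduced
    # here for illustration: retry while the overall budget is not spent,
    # then re-raise the last AssertionError.
    def wrapper(context, *args, **kwargs):
        deadline = time.time() + float(context.timeout)
        while True:
            try:
                return function(context, *args, **kwargs)
            except AssertionError:
                if time.time() > deadline:
                    raise
                time.sleep(context.interval)

    return wrapper


# Hypothetical stand-in for the behave context used by the real steps.
context = types.SimpleNamespace(timeout=2, interval=0.5, ready_at=time.time() + 1)


@retry_on_assert
def step_wait_until_ready(context):
    assert time.time() >= context.ready_at, 'not ready yet'


step_wait_until_ready(context)  # succeeds after roughly one second of retries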
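The quorum- and sync-group checks compare the expected replica attributes against the replics_info payload from ZooKeeper via helpers.is_2d_dict_subset_of(). A usage sketch, assuming the repository root is on sys.path so the tests.steps import (as used in tests/steps/config.py) resolves and the test dependencies are installed; the hostnames are hypothetical.

from tests.steps import helpers  # assumes repo root on sys.path and test deps installed

# replics_info as pgconsul publishes it, keyed by client_hostname (hypothetical hosts).
actual = {
    'pgconsul_postgresql2_1.pgconsul_net': {'state': 'streaming', 'sync_state': 'sync'},
    'pgconsul_postgresql3_1.pgconsul_net': {'state': 'streaming', 'sync_state': 'async'},
}

# Only the keys listed here are checked; extra keys in the actual dict are ignored.
expected = {
    'pgconsul_postgresql2_1.pgconsul_net': {'state': 'streaming'},
}

print(helpers.is_2d_dict_subset_of(expected, actual))  # True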
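The config helpers in tests/steps/config.py back both the 'has following config' assertions and the 'we set value ... in config' steps. A short ConfigPG round-trip sketch under the same import assumptions as above:

import io

from tests.steps import config  # assumes repo root on sys.path and test deps installed

# Parse a postgresql.conf-style fragment the way ConfigPG does.
original = io.StringIO("max_connections = 100\nwal_level = 'replica'\n")
conf = config.fromfile('postgresql.conf', original)

# merge() is what the "we set value ..." steps call; it single-quotes values.
conf.merge({'max_connections': 200})

rendered = io.StringIO()
conf.write(rendered)
# rendered.getvalue() is now:
#   max_connections = '200'
#   wal_level = 'replica'

# check_values_equal() strips quotes before comparing, so plain numbers match.
ok, err = conf.check_values_equal({'max_connections': 200, 'wal_level': 'replica'})
assert ok, err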
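get_minimal_simultaneously_running_count() only starts tracking the minimum once the cluster has been fully assembled, so staggered startup entries in postgresql.log do not drag the count down. A self-contained restatement of that loop on synthetic events; the DBState enum below is a stand-in mirroring helpers.DBState.

import datetime as dt
import enum


class DBState(enum.Enum):
    # Mirrors helpers.DBState for this illustration.
    shut_down = 1
    working = 2


# (timestamp, new_state) pairs for a hypothetical three-node cluster.
events = [
    (dt.datetime(2024, 1, 1, 0, 0), DBState.working),    # node 1 ready
    (dt.datetime(2024, 1, 1, 0, 1), DBState.working),    # node 2 ready
    (dt.datetime(2024, 1, 1, 0, 2), DBState.working),    # node 3 ready -> cluster complete
    (dt.datetime(2024, 1, 1, 0, 3), DBState.shut_down),  # one node is stopped by the scenario
]

cluster_size = 3
running = 0
cluster_completed = False
minimal = None
for _, state in sorted(events):
    if state == DBState.shut_down:
        running -= 1
        if cluster_completed:
            minimal = min(minimal, running)
    elif state == DBState.working:
        running += 1
        if running == cluster_size:
            cluster_completed = True
            minimal = cluster_size

print(minimal)  # 2: the staggered startup (1, then 2 nodes) never counts against the minimum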