From 34fdaf58c123f92fe3fe3f12e673f7e01028de30 Mon Sep 17 00:00:00 2001 From: hbpatre Date: Tue, 14 Nov 2017 18:23:20 +0530 Subject: [PATCH] Health-check, Collectinfo and Summary Improvements (#28) * TOOLS-741: [asadm-collectinfo] Use optimal netstat / ss command line option, when running it for collectinfo. * TOOLS-811: [asadm-collectinfo] Add hdparm output in ACT and collectinfo * TOOLS-851: [asadm-healthcheck] Parse dmesg and perform OS version / OOM / Process blocking health check. * TOOLS-885: [asadm-healthcheck] Collect / Compare sysctl and CPU configuration across nodes in cluster. * TOOLS-945: [asadm-healthcheck] Show numbers for failed tests in healthcheck * TOOLS-953: [asadm] Modify Summary output * TOOLS-957: [asadm-healthcheck] Add health check rule to print INFO in case nodes in cluster are running with firewall rules. * TOOLS-969: [asadm-healthcheck] Check for OOM Kill and warn in sysinfo * TOOLS-970: [asadm-healthcheck] Add query to check hwm breach * TOOLS-972: [asadm] Modify Namespace modifier filtration to use regular expression strictly * TOOLS-997: [asadm-collectInfo] Add cat /proc/pgreg asd/limits and health check. 
* TOOLS-998: [asadm] Fix TLS error capturing to show proper description * TOOLS-999: [asadm] Make collectinfo error messages user-friendly * TOOLS-1002: [asadm-healthcheck] Add query to check non-default defrag-lwm-pct * TOOLS-1015: [asadm] Reorganize info tables * TOOLS-1017: [asadm-healthcheck] Add new operation to find out values in the minority * TOOLS-1021: [asadm-healthcheck] Show details for cluster remote dc_size difference check * TOOLS-1022: [asadm] WARN at the top of Summary command if migrations are going on * TOOLS-1023: [asadm-healthcheck] Remove the thousand delimiter for config params * TOOLS-1024: [asadm] Modify ssh options and provide documentation * TOOLS-1025: [asadm-healthcheck] Modify SELECT clause to ignore unwanted key/s * TOOLS-1029: [asadm] Modify to display Summary namespace output in table and list view --- README.md | 3 +- asadm-deps/deb/pyasn1/install.sh | 51 + asadm-deps/mac/pyasn1/install.sh | 44 + asadm-deps/rpm/pyasn1/install.sh | 56 + asadm.py | 104 +- lib/basiccontroller.py | 557 ++++---- lib/client/cluster.py | 2 +- lib/client/node.py | 110 +- lib/client/ssl_context.py | 30 +- lib/collectinfo/cinfolog.py | 14 +- lib/collectinfo/loghdlr.py | 6 +- lib/collectinfo_parser/section_filter_list.py | 67 +- lib/collectinfo_parser/sys_section_parser.py | 164 +++ lib/collectinfocontroller.py | 114 +- lib/getcontroller.py | 60 +- lib/health/commands.py | 30 +- lib/health/constants.py | 3 + lib/health/healthchecker.py | 41 +- lib/health/operation.py | 412 ++++-- lib/health/parser.py | 191 ++- lib/health/query.py | 1119 +++++++++++++---- lib/health/query/health.hql | 1119 +++++++++++++---- lib/health/util.py | 190 ++- lib/log/loghdlr.py | 2 +- lib/log/reader.py | 10 +- lib/logcontroller.py | 14 +- lib/utils/util.py | 143 ++- lib/view/view.py | 581 ++++++--- test/e2e/test_info.py | 49 +- test/e2e/test_show.py | 288 ++--- test/unit/test_controller.py | 9 +- 31 files changed, 3998 insertions(+), 1585 deletions(-) create mode 100755 
asadm-deps/deb/pyasn1/install.sh create mode 100755 asadm-deps/mac/pyasn1/install.sh create mode 100755 asadm-deps/rpm/pyasn1/install.sh diff --git a/README.md b/README.md index 44e29486..7dff476d 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,8 @@ Admin> help ### Python Modules - ply: >= 3.4 - pexpect: >= 3.0 -- pyOpenSSL: 16.2.0 +- pyOpenSSL: >= 16.2.0 +- pyasn1: >= 0.3.1 ### Installing Python Module Dependencies ``` diff --git a/asadm-deps/deb/pyasn1/install.sh b/asadm-deps/deb/pyasn1/install.sh new file mode 100755 index 00000000..e2682a20 --- /dev/null +++ b/asadm-deps/deb/pyasn1/install.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash + +# Copyright 2013-2017 Aerospike, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +################################################################################ + +PYMODULE=pyasn1 + +################################################################################ + +command_exists () { + type "$1" &> /dev/null ; +} + +################################################################################ + +if [ $EUID -ne 0 ]; then + echo "This script requires root or sudo privileges." + exit 1 +fi + +if ! command_exists pip ; then + echo Installing python-dev + apt-get install -y python-dev + + echo Installing python-setuptools + apt-get install -y python-setuptools + + echo Installing pip + apt-get install python-pip + + if ! command_exists pip ; then + echo "Error while installing pip. 
Please install pip and run this installation again." + exit 1 + fi +fi + +echo Installing ${PYMODULE} +pip install --upgrade ${PYMODULE} diff --git a/asadm-deps/mac/pyasn1/install.sh b/asadm-deps/mac/pyasn1/install.sh new file mode 100755 index 00000000..56c7dbc7 --- /dev/null +++ b/asadm-deps/mac/pyasn1/install.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash + +# Copyright 2013-2017 Aerospike, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +################################################################################ + +PYMODULE=pyasn1 + +################################################################################ + +command_exists () { + type "$1" &> /dev/null ; +} + +################################################################################ + +if [ $EUID -ne 0 ]; then + echo "This script requires root or sudo privileges." + exit 1 +fi + +if ! command_exists pip ; then + echo Installing pip + easy_install pip + + if ! command_exists pip ; then + echo "Error while installing pip. Please install pip and run this installation again." + exit 1 + fi +fi +echo Installing ${PYMODULE} +pip install ${PYMODULE} --upgrade \ No newline at end of file diff --git a/asadm-deps/rpm/pyasn1/install.sh b/asadm-deps/rpm/pyasn1/install.sh new file mode 100755 index 00000000..bcd9aeca --- /dev/null +++ b/asadm-deps/rpm/pyasn1/install.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash + +# Copyright 2013-2017 Aerospike, Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +################################################################################ + +PYMODULE=pyasn1 + +################################################################################ + +command_exists () { + type "$1" &> /dev/null ; +} + +################################################################################ + +if [ $EUID -ne 0 ]; then + echo "This script requires root or sudo privileges." + exit 1 +fi + +pip_command="pip" +if ! command_exists ${pip_command} ; then + echo Installing epel-release + yum install epel-release + + echo Installing pip + yum install -y python-pip + + if ! command_exists ${pip_command} ; then + if command_exists pip-python ; then + pip_command="pip-python" + elif command_exists python-pip ; then + pip_command="python-pip" + fi + fi + if ! command_exists ${pip_command} ; then + echo "Error while installing pip. Please install pip and run this installation again." 
+ exit 1 + fi +fi + +echo Installing ${PYMODULE} +${pip_command} install --upgrade ${PYMODULE} diff --git a/asadm.py b/asadm.py index 2431362d..ff120284 100755 --- a/asadm.py +++ b/asadm.py @@ -22,6 +22,7 @@ import shlex import sys import logging +import traceback if '-e' not in sys.argv and '--asinfo' not in sys.argv: # asinfo mode or non-interactive mode does not need readline @@ -41,25 +42,44 @@ class BaseLogger(logging.Logger, object): def __init__(self, name, level=logging.WARNING): return super(BaseLogger, self).__init__(name, level=level) + def _handle_exception(self, msg): + if isinstance(msg, Exception) and not isinstance(msg, ShellException): + traceback.print_exc() + + def _print_message(self, msg, level, red_color=False, *args, **kwargs): + try: + message = str(msg).format(*args, **kwargs) + except Exception: + message = str(msg) + + message = level + ": " + message + + if red_color: + message = terminal.fg_red() + message + terminal.fg_clear() + + print message + def debug(self, msg, *args, **kwargs): if self.level <= logging.DEBUG: - print "DEBUG: " + str(msg) + self._print_message(msg=msg, level="DEBUG", red_color=False, *args, **kwargs) def info(self, msg, *args, **kwargs): if self.level <= logging.INFO: - print "INFO: " + str(msg) + self._print_message(msg=msg, level="INFO", red_color=False, *args, **kwargs) def warning(self, msg, *args, **kwargs): if self.level <= logging.WARNING: - print terminal.fg_red() + "ERROR: " + str(msg) + terminal.fg_clear() + self._print_message(msg=msg, level="WARNING", red_color=True, *args, **kwargs) def error(self, msg, *args, **kwargs): if self.level <= logging.ERROR: - print terminal.fg_red() + "ERROR: " + str(msg) + terminal.fg_clear() + self._print_message(msg=msg, level="ERROR", red_color=True, *args, **kwargs) + self._handle_exception(msg) def critical(self, msg, *args, **kwargs): if self.level <= logging.CRITICAL: - print terminal.fg_red() + "ERROR: " + str(msg) + terminal.fg_clear() + 
self._print_message(msg=msg, level="ERROR", red_color=True, *args, **kwargs) + self._handle_exception(msg) exit(1) logging.setLoggerClass(BaseLogger) @@ -68,6 +88,7 @@ def critical(self, msg, *args, **kwargs): logger = logging.getLogger('asadm') logger.setLevel(logging.INFO) +from lib.controllerlib import ShellException from lib.basiccontroller import BasicRootController from lib.client import info from lib.client.assocket import ASSocket @@ -87,8 +108,8 @@ def critical(self, msg, *args, **kwargs): MULTILEVEL_COMMANDS = ["show", "info"] -class AerospikeShell(cmd.Cmd): +class AerospikeShell(cmd.Cmd): def __init__(self, seed, user=None, password=None, use_services_alumni=False, use_services_alt=False, log_path="", log_analyser=False, collectinfo=False, ssl_context=None, only_connect_seed=False, execute_only_mode=False, timeout=5): @@ -129,7 +150,7 @@ def __init__(self, seed, user=None, password=None, use_services_alumni=False, us if not execute_only_mode: self.intro = str(self.ctrl.loghdlr) else: - if user != None: + if user is not None: if password == "prompt": if sys.stdin.isatty(): password = getpass.getpass("Enter Password:") @@ -143,9 +164,8 @@ def __init__(self, seed, user=None, password=None, use_services_alumni=False, us only_connect_seed=only_connect_seed, timeout=timeout) if not self.ctrl.cluster.get_live_nodes(): - logger.error("Not able to connect any cluster.") self.do_exit('') - exit(0) + logger.critical("Not able to connect any cluster.") self.prompt = "Admin> " self.intro = "" @@ -165,12 +185,11 @@ def __init__(self, seed, user=None, password=None, use_services_alumni=False, us if self.use_rawinput: self.prompt = "\001" + terminal.bold() + terminal.fg_red() + "\002" +\ - self.prompt + "\001" +\ - terminal.unbold() + terminal.fg_clear() + "\002" + self.prompt + "\001" +\ + terminal.unbold() + terminal.fg_clear() + "\002" except Exception as e: - logger.error(e) self.do_exit('') - exit(1) + logger.critical(str(e)) if not execute_only_mode: try: @@ 
-229,7 +248,7 @@ def precmd(self, line, max_commands_to_print_header=1, if not lines: # allow empty lines return "" except Exception as e: - logger.error(e) + logger.error(str(e)) return "" for line in lines: @@ -243,10 +262,8 @@ def precmd(self, line, max_commands_to_print_header=1, # If single level command then print from first index. For example: health, features, grep etc. index = 0 - print "\n~~~ %s%s%s ~~~" % (terminal.bold(), - ' '.join( - line[index:]), - terminal.reset()) + print "\n~~~ %s%s%s ~~~" % ( + terminal.bold(), ' '.join(line[index:]), terminal.reset()) sys.stdout.write(terminal.reset()) try: @@ -254,7 +271,7 @@ def precmd(self, line, max_commands_to_print_header=1, if response == "EXIT": return "exit" except Exception as e: - logger.error(e) + logger.error(str(e)) return "" # line was handled by execute def _listdir(self, root): @@ -449,7 +466,7 @@ def parse_tls_input(cli_args): crl_check_all=cli_args.crl_check_all).ctx except Exception as e: - print terminal.fg_red() + "SSLContext creation Exception: " + str(e) + terminal.fg_clear() + logger.error("SSLContext creation Exception: " + str(e)) exit(1) @@ -463,7 +480,7 @@ def execute_asinfo_commands(commands_arg, seed, user=None, password=None, ssl_co if not cmds: return - if user != None: + if user is not None: if password == "prompt": if sys.stdin.isatty(): password = getpass.getpass("Enter Password:") @@ -475,16 +492,18 @@ def execute_asinfo_commands(commands_arg, seed, user=None, password=None, ssl_co if not assock.connect(): raise Exception("Could not connect to node") - node_name = "%s:%s"%(seed[0],seed[1]) + node_name = "%s:%s" % (seed[0], seed[1]) + + for command in cmds: + if command: + command = util.strip_string(command) + + result = assock.execute(command) - for cmd in cmds: - if cmd: - cmd = util.strip_string(cmd) - result = assock.execute(cmd) - if result == -1 or result == None: - result = IOError("Error: Invalid command '%s'" % cmd) + if result == -1 or result is None: + result = 
IOError("Error: Invalid command '%s'" % command) - view.CliView.asinfo({node_name:result}, line_separator, False, None) + view.CliView.asinfo({node_name: result}, line_separator, False, None) return @@ -508,7 +527,7 @@ def main(): parser.add_argument("-l", "--log_analyser", dest="log_analyser", action="store_true", help="Start asadm in log-analyser mode and analyse data from log files") parser.add_argument("--asinfo", dest="asinfo_mode", action="store_true", - #help="Enable asinfo mode to connect directly to seed node without cluster creation. By default asadm connects to all nodes and creates cluster.", + # help="Enable asinfo mode to connect directly to seed node without cluster creation. By default asadm connects to all nodes and creates cluster.", help=argparse.SUPPRESS) parser.add_argument("-e", "--execute", dest="execute", @@ -518,8 +537,9 @@ def main(): parser.add_argument("--no-color", dest="no_color", action="store_true", help="Disable colored output") - parser.add_argument("--profile", dest="profile", action="store_true" # , help="Profile Aerospike Admin for performance issues" - , help=argparse.SUPPRESS) + parser.add_argument("--profile", dest="profile", action="store_true", + # , help="Profile Aerospike Admin for performance issues" + help=argparse.SUPPRESS) parser.add_argument("-u", "--help", dest="help", action="store_true", help="show program usage") parser.add_argument("--version", dest="show_version", @@ -532,7 +552,7 @@ def main(): parser.add_argument("--single_node_cluster", dest="only_connect_seed", action="store_true", help="Enable asadm mode to connect only seed node. By default asadm connects to all nodes in cluster.") parser.add_argument("--timeout", dest="timeout", type=float, default=5, - help="Set timeout value in seconds to node level operations. TLS connection does not support timeout. Default: 5 seconds") + help="Set timeout value in seconds to node level operations. TLS connection does not support timeout. 
Default: 5 seconds") parser.add_argument("--lineseperator", dest="line_separator", action="store_true", # help="Print output in separate lines. This works only for asinfo mode." @@ -583,8 +603,8 @@ def main(): parser.add_option("-t", "--tls_name", dest="tls_name", help="TLS name of host to verify for TLS connection. It is required if tls_enable is set.") parser.add_option("-U", "--user", dest="user", help="user name") - parser.add_option("-P", "--password", dest="password", action="store_const" # , nargs="?" - , const="prompt", help="password") + parser.add_option("-P", "--password", dest="password", action="store_const", # , nargs="?" + const="prompt", help="password") parser.add_option("-c", "--collectinfo", dest="collectinfo", action="store_true", help="Start asadm to run against offline collectinfo files.") @@ -601,8 +621,8 @@ def main(): parser.add_option("--no-color", dest="no_color", action="store_true", help="Disable colored output") - parser.add_option("--profile", dest="profile", action="store_true" # , help="Profile Aerospike Admin for performance issues" - , help=optparse.SUPPRESS_USAGE) + parser.add_option("--profile", dest="profile", action="store_true", # , help="Profile Aerospike Admin for performance issues" + help=optparse.SUPPRESS_USAGE) parser.add_option("-u", "--help", dest="help", action="store_true", help="show program usage") parser.add_option("--version", dest="show_version", @@ -614,7 +634,7 @@ def main(): action="store_true", help="Enable use of services-alternate instead of services in info request during cluster tending") parser.add_option("--single_node_cluster", dest="only_connect_seed", action="store_true", help="Enable asadm mode to connect only seed node. By default asadm connects to all nodes in cluster.") - parser.add_option("--timeout", dest="timeout",type=float, default=5, + parser.add_option("--timeout", dest="timeout", type=float, default=5, help="Set timeout value in seconds to node level operations. 
TLS connection does not support timeout. Default: 5 seconds") parser.add_option("--lineseperator", dest="line_separator", action="store_true", @@ -692,8 +712,9 @@ def main(): commands_arg = parse_commands(commands_arg) try: - execute_asinfo_commands(commands_arg, seed, user=cli_args.user, - password=cli_args.password, ssl_context=ssl_context, line_separator=cli_args.line_separator) + execute_asinfo_commands( + commands_arg, seed, user=cli_args.user, password=cli_args.password, + ssl_context=ssl_context, line_separator=cli_args.line_separator) exit(0) except Exception as e: logger.error(str(e)) @@ -705,7 +726,7 @@ def main(): shell = AerospikeShell(seed, user=cli_args.user, password=cli_args.password, use_services_alumni=cli_args.use_services_alumni, - use_services_alt = cli_args.use_services_alternate, + use_services_alt=cli_args.use_services_alternate, log_path=cli_args.log_path, log_analyser=cli_args.log_analyser, collectinfo=cli_args.collectinfo, @@ -812,5 +833,6 @@ def parse_commands(file): commands = line return commands + if __name__ == '__main__': main() diff --git a/lib/basiccontroller.py b/lib/basiccontroller.py index 93122495..e8aff3fe 100644 --- a/lib/basiccontroller.py +++ b/lib/basiccontroller.py @@ -13,37 +13,42 @@ # limitations under the License. 
import copy -from distutils.version import LooseVersion import json -import time import os -import sys import platform import shutil -import urllib2 import socket +import sys +import time +import urllib2 import zipfile +from distutils.version import LooseVersion from lib.client.cluster import Cluster from lib.collectinfocontroller import CollectinfoRootController -from lib.controllerlib import BaseController, CommandController, CommandHelp, ShellException -from lib.getcontroller import GetConfigController, GetStatisticsController, GetDistributionController, get_sindex_stats, \ - GetPmapController -from lib.health.util import create_health_input_dict, h_eval, create_snapshot_key +from lib.controllerlib import (BaseController, CommandController, CommandHelp, + ShellException) +from lib.getcontroller import (GetConfigController, GetDistributionController, + GetPmapController, GetStatisticsController, + get_sindex_stats) +from lib.health.util import (create_health_input_dict, create_snapshot_key, + h_eval) from lib.utils import util from lib.utils.data import lsof_file_type_desc -from lib.view.view import CliView from lib.view import terminal +from lib.view.view import CliView aslogfile = "" aslogdir = "" + class BasicCommandController(CommandController): cluster = None def __init__(self, cluster): BasicCommandController.cluster = cluster + @CommandHelp('Aerospike Admin') class BasicRootController(BaseController): @@ -111,15 +116,23 @@ def do_watch(self, line): @CommandHelp('The "info" command provides summary tables for various aspects', 'of Aerospike functionality.') class InfoController(BasicCommandController): - def __init__(self): self.modifiers = set(['with']) + self.controller_map = dict( + namespace=InfoNamespaceController) + @CommandHelp('Displays network, namespace, and XDR summary information.') def _do_default(self, line): - actions = (util.Future(self.do_network, line).start(), - util.Future(self.do_namespace, line).start(), - util.Future(self.do_xdr, 
line).start()) + actions = [util.Future(self.do_network, line).start()] + # We are not using line for any of subcommand, but if user enters 'info object' or 'info usage' then it will + # give error for unexpected format. We can catch this inside InfoNamespaceController but in that case + # it will show incomplete output, for ex. 'info object' will print output of 'info network', 'info xdr' and + # 'info namespace object', but since it is not correct command it should print output for partial correct + # command, in this case it should print data for 'info'. To keep consistent output format, we are passing empty + # list as line. + actions.extend(self.controller_map['namespace'](get_futures=True)([])['futures']) + actions.append(util.Future(self.do_xdr, line).start()) return [action.result() for action in actions] @@ -161,18 +174,6 @@ def do_set(self, line): stats = self.cluster.info_set_statistics(nodes=self.nodes) return util.Future(self.view.info_set, stats, self.cluster, **self.mods) - @CommandHelp('Displays summary information for each namespace.') - def do_namespace(self, line): - stats = self.cluster.info_all_namespace_statistics(nodes=self.nodes) - return util.Future(self.view.info_namespace, stats, self.cluster, - **self.mods) - - @CommandHelp('Displays summary information for objects of each namespace.') - def do_object(self, line): - stats = self.cluster.info_all_namespace_statistics(nodes=self.nodes) - return util.Future(self.view.info_object, stats, self.cluster, - **self.mods) - @CommandHelp('Displays summary information for Cross Datacenter', 'Replication (XDR).') def do_xdr(self, line): @@ -226,12 +227,49 @@ def do_sindex(self, line): return util.Future(self.view.info_sindex, sindex_stats, self.cluster, **self.mods) -@CommandHelp('"asinfo" provides raw access to the info protocol.', - ' Options:', - ' -v - The command to execute', - ' -p - Port to use in case of XDR info command', - ' and XDR is not in asd', - ' -l - Replace semicolons ";" with 
newlines.') + +@CommandHelp('The "namespace" command provides summary tables for various aspects', + 'of Aerospike namespaces.') +class InfoNamespaceController(BasicCommandController): + def __init__(self, get_futures=False): + self.modifiers = set(['with']) + self.get_futures = get_futures + + @CommandHelp('Displays usage and objects information for namespaces') + def _do_default(self, line): + actions = [util.Future(self.do_usage, line).start(), + util.Future(self.do_object, line).start()] + + if self.get_futures: + # Wrapped to prevent base class from calling result. + return dict(futures=actions) + + return [action.result() for action in actions] + + @CommandHelp('Displays usage information for each namespace.') + def do_usage(self, line): + stats = self.cluster.info_all_namespace_statistics(nodes=self.nodes) + return util.Future(self.view.info_namespace_usage, stats, self.cluster, + **self.mods) + + @CommandHelp('Displays object information for each namespace.') + def do_object(self, line): + stats = self.cluster.info_all_namespace_statistics(nodes=self.nodes) + return util.Future(self.view.info_namespace_object, stats, self.cluster, + **self.mods) + + +@CommandHelp( + '"asinfo" provides raw access to the info protocol.', + ' Options:', + ' -v - The command to execute', + ' -p - Port to use in case of XDR info command and XDR is', + ' not in asd', + ' -l - Replace semicolons ";" with newlines. If output does', + ' not contain semicolons "-l" will attempt to use', + ' colons ":" followed by commas ",".', + ' --no_node_name - Force to display output without printing node names.' 
+) class ASInfoController(BasicCommandController): def __init__(self): @@ -267,7 +305,7 @@ def _do_default(self, line): raise ShellException( "Do not understand '%s' in '%s'" % (word, " ".join(line))) except Exception: - self.logger.error( + self.logger.warning( "Do not understand '%s' in '%s'" % (word, " ".join(line))) return if value is not None: @@ -306,8 +344,8 @@ class ShowDistributionController(BasicCommandController): def __init__(self): self.modifiers = set(['with', 'for']) - self.getter = GetDistributionController(self.cluster); - + self.getter = GetDistributionController(self.cluster); + @CommandHelp('Shows the distributions of Time to Live and Object Size') def _do_default(self, line): @@ -318,7 +356,7 @@ def _do_default(self, line): @CommandHelp('Shows the distribution of TTLs for namespaces') def do_time_to_live(self, line): - + histogram = self.getter.do_distribution('ttl', nodes=self.nodes) return util.Future(self.view.show_distribution, 'TTL Distribution', @@ -327,7 +365,7 @@ def do_time_to_live(self, line): @CommandHelp('Shows the distribution of Eviction TTLs for namespaces') def do_eviction(self, line): - + histogram = self.getter.do_distribution('evict', nodes=self.nodes) return util.Future(self.view.show_distribution, 'Eviction Distribution', @@ -345,7 +383,7 @@ def do_object_size(self, line): byte_distribution = util.check_arg_and_delete_from_mods(line=line, arg="-b", default=False, modifiers=self.modifiers, mods=self.mods) - + bucket_count = util.get_arg_and_delete_from_mods(line=line, arg="-k", return_type=int, default=5, modifiers=self.modifiers, mods=self.mods) @@ -356,7 +394,7 @@ def do_object_size(self, line): return util.Future(self.view.show_distribution, 'Object Size Distribution', histogram, 'Record Blocks', 'objsz', self.cluster, like=self.mods['for']) - + histogram = self.getter.do_object_size(byte_distribution = True, bucket_count=bucket_count, nodes=self.nodes) @@ -413,7 +451,7 @@ def _do_default(self, line): 
util.filter_list(list(namespace_set), self.mods['for'])) latency = self.cluster.info_latency( - nodes=self.nodes, back=back, duration=duration, slice_tm=slice_tm, + nodes=self.nodes, back=back, duration=duration, slice_tm=slice_tm, ns_set=namespace_set) hist_latency = {} @@ -504,8 +542,8 @@ def do_namespace(self, line): ns_configs = self.getter.get_namespace(nodes=self.nodes) - return [util.Future(self.view.show_config, - "%s Namespace Configuration" % (ns), configs, self.cluster, + return [util.Future(self.view.show_config, + "%s Namespace Configuration" % (ns), configs, self.cluster, title_every_nth=title_every_nth, flip_output=flip_output, **self.mods) for ns, configs in ns_configs.iteritems()] @@ -646,8 +684,8 @@ def do_namespace(self, line): ns_stats = self.getter.get_namespace(nodes=self.nodes, for_mods=self.mods['for']) return [util.Future(self.view.show_stats, - "%s Namespace Statistics" % (namespace), ns_stats[namespace], - self.cluster, show_total=show_total, + "%s Namespace Statistics" % (namespace), ns_stats[namespace], + self.cluster, show_total=show_total, title_every_nth=title_every_nth, flip_output=flip_output, **self.mods) for namespace in sorted(ns_stats.keys())] @@ -738,10 +776,10 @@ def do_xdr(self, line): @CommandHelp('Displays datacenter statistics') def do_dc(self, line): - + show_total = util.check_arg_and_delete_from_mods(line=line, arg="-t", default=False, modifiers=self.modifiers, mods=self.mods) - + title_every_nth = util.get_arg_and_delete_from_mods(line=line, arg="-r", return_type=int, default=0, modifiers=self.modifiers, mods=self.mods) @@ -757,6 +795,7 @@ def do_dc(self, line): title_every_nth=title_every_nth, flip_output=flip_output, **self.mods) for dc, stats in dc_stats.iteritems()] + @CommandHelp('Displays partition map analysis of Aerospike cluster.') class ShowPmapController(BasicCommandController): def __init__(self): @@ -768,9 +807,9 @@ def _do_default(self, line): return util.Future(self.view.show_pmap, pmap_data, 
self.cluster) + @CommandHelp('"collectinfo" is used to collect cluster info, aerospike conf file and system stats.') class CollectinfoController(BasicCommandController): - def __init__(self): self.modifiers = set(['with']) @@ -780,7 +819,6 @@ def _collect_local_file(self, src, dest_dir): shutil.copy2(src, dest_dir) except Exception, e: self.logger.error(e) - return def _collectinfo_content(self, func, parm='', alt_parm=''): name = '' @@ -799,8 +837,7 @@ def _collectinfo_content(self, func, parm='', alt_parm=''): if func == 'shell': o, e = util.shell_command(parm) if e: - if e: - self.logger.error(str(e)) + self.logger.warning(str(e)) if alt_parm and alt_parm[0]: info_line = "Data collection for alternative command " + \ @@ -814,7 +851,7 @@ def _collectinfo_content(self, func, parm='', alt_parm=''): self.cmds_error.add(alt_parm[0]) if e_alt: - self.logger.error(str(e_alt)) + self.logger.warning(str(e_alt)) if o_alt: o = o_alt @@ -863,7 +900,7 @@ def _get_metadata(self, response_str, prefix='', old_response=''): try: aws_c += self._get_metadata(response, prefix + rsp + "/", old_response=response) except Exception: - aws_c += (prefix + rsp).strip('/') + '\n' + response + "\n\n" + aws_c += (prefix + rsp).strip('/') + '\n' + response + "\n\n" if last_values: aws_c += prefix.strip('/') + '\n' + '\n'.join(last_values) + "\n\n" @@ -1024,9 +1061,20 @@ def _archive_log(self, logdir): util.shell_command(["tar -czvf " + logdir + ".tgz " + aslogdir]) sys.stderr.write("\x1b[2J\x1b[H") print "\n\n\n" - self.logger.info("Files in " + logdir + " and " + logdir + ".tgz saved. 
") + self.logger.info("Files in " + logdir + " and " + logdir + ".tgz saved.") + + def _print_collecinto_summary(self, logdir): + if self.cmds_error: + self.logger.warning("Following commands are either unavailable or giving runtime error...") + self.logger.warning(list(self.cmds_error)) + + print "\n" + self.logger.info("Please provide file " + logdir + ".tgz to Aerospike Support.") self.logger.info("END OF ASCOLLECTINFO") + # If multiple commands are given in execute_only mode then we might need coloring for next commands + terminal.enable_color(True) + def _parse_namespace(self, namespace_data): """ This method will return set of namespaces present given namespace data @@ -1187,12 +1235,19 @@ def _get_meta_for_sec(self, metasec, sec_name, nodeid, metamap): def _get_as_metadata(self): metamap = {} - builds = util.Future(self.cluster.info, 'build', nodes=self.nodes).start().result() - editions = util.Future(self.cluster.info, 'version', nodes=self.nodes).start().result() - xdr_builds = util.Future(self.cluster.info_XDR_build_version, nodes=self.nodes).start().result() - node_ids = util.Future(self.cluster.info_node, nodes=self.nodes).start().result() - ips = util.Future(self.cluster.info_ip_port, nodes=self.nodes).start().result() - udf_data = util.Future(self.cluster.info_udf_list, nodes=self.nodes).start().result() + builds = util.Future(self.cluster.info, 'build', nodes=self.nodes).start() + editions = util.Future(self.cluster.info, 'version', nodes=self.nodes).start() + xdr_builds = util.Future(self.cluster.info_XDR_build_version, nodes=self.nodes).start() + node_ids = util.Future(self.cluster.info_node, nodes=self.nodes).start() + ips = util.Future(self.cluster.info_ip_port, nodes=self.nodes).start() + udf_data = util.Future(self.cluster.info_udf_list, nodes=self.nodes).start() + + builds = builds.result() + editions = editions.result() + xdr_builds = xdr_builds.result() + node_ids = node_ids.result() + ips = ips.result() + udf_data = udf_data.result() for 
nodeid in builds: metamap[nodeid] = {} @@ -1208,9 +1263,14 @@ def _get_as_metadata(self): def _get_as_histograms(self): histogram_map = {} hist_list = ['ttl', 'objsz'] + hist_dumps = [util.Future(self.cluster.info_histogram, hist, + raw_output=True, + nodes=self.nodes).start() + for hist in hist_list] + + for hist, hist_dump in zip(hist_list, hist_dumps): + hist_dump = hist_dump.result() - for hist in hist_list: - hist_dump = util.Future(self.cluster.info_histogram, hist, raw_output=True, nodes=self.nodes).start().result() for node in hist_dump: if node not in histogram_map: histogram_map[node] = {} @@ -1232,8 +1292,8 @@ def _dump_in_json_file(self, as_logfile_prefix, dump): with open(aslogfile, "w") as f: f.write(json.dumps(dump, indent=4, separators=(',', ':'))) - def _get_collectinfo_data_json(self, default_user, default_pwd, - default_ssh_port, default_ssh_key, credential_file): + def _get_collectinfo_data_json(self, default_user, default_pwd, default_ssh_port, + default_ssh_key, credential_file, enable_ssh): dump_map = {} @@ -1244,7 +1304,11 @@ def _get_collectinfo_data_json(self, default_user, default_pwd, pmap_map = self._get_as_pmap() sys_map = self.cluster.info_system_statistics(default_user=default_user, default_pwd=default_pwd, default_ssh_key=default_ssh_key, - default_ssh_port=default_ssh_port, credential_file=credential_file, nodes=self.nodes) + default_ssh_port=default_ssh_port, credential_file=credential_file, nodes=self.nodes, + collect_remote_data=enable_ssh) + + cluster_names = util.Future( + self.cluster.info, 'cluster-name').start() as_map = self._get_as_data_json() @@ -1264,8 +1328,7 @@ def _get_collectinfo_data_json(self, default_user, default_pwd, # Get the cluster name and add one more level in map cluster_name = 'null' - cluster_names = util.Future( - self.cluster.info, 'cluster-name').start().result() + cluster_names = cluster_names.result() # Cluster name. 
for node in cluster_names: @@ -1277,8 +1340,8 @@ def _get_collectinfo_data_json(self, default_user, default_pwd, snp_map[cluster_name] = dump_map return snp_map - def _dump_collectinfo_json(self, timestamp, as_logfile_prefix, default_user, default_pwd, default_ssh_port, default_ssh_key, credential_file, - snp_count, wait_time): + def _dump_collectinfo_json(self, timestamp, as_logfile_prefix, default_user, default_pwd, default_ssh_port, + default_ssh_key, credential_file, enable_ssh, snp_count, wait_time): snpshots = {} for i in range(snp_count): @@ -1286,7 +1349,7 @@ def _dump_collectinfo_json(self, timestamp, as_logfile_prefix, default_user, def snp_timestamp = time.strftime("%Y-%m-%d %H:%M:%S UTC", time.gmtime()) self.logger.info("Data collection for Snapshot: " + str(i + 1) + " in progress..") snpshots[snp_timestamp] = self._get_collectinfo_data_json( - default_user, default_pwd, default_ssh_port, default_ssh_key, credential_file) + default_user, default_pwd, default_ssh_port, default_ssh_key, credential_file, enable_ssh) time.sleep(wait_time) @@ -1318,42 +1381,41 @@ def _dump_collectinfo_pretty_print(self, timestamp, as_logfile_prefix, # maintain proper order for output sys_shell_cmds = [ ['hostname -I', 'hostname'], - ['uname -a', ''], - ['lsb_release -a', - 'ls /etc|grep release|xargs -I f cat /etc/f'], + ['top -n3 -b', 'top -l 3'], + ['lsb_release -a', 'ls /etc|grep release|xargs -I f cat /etc/f'], ['cat /proc/meminfo', 'vmstat -s'], ['cat /proc/interrupts', ''], + ['iostat -x 1 10', ''], + [cmd_dmesg, alt_dmesg], + ['sudo pgrep asd | xargs -I f sh -c "cat /proc/f/limits"', ''], + ['lscpu', ''], + ['sudo sysctl -a | grep -E "shmmax|file-max|maxfiles"'], + ['sudo iptables -L', ''], + ['sudo fdisk -l |grep Disk |grep dev | cut -d " " -f 2 | cut -d ":" -f 1 | xargs sudo hdparm -I 2>/dev/null', ''], + ['df -h', ''], + ['free -m', ''], + ['uname -a', ''], + + # Only in pretty print ['cat /proc/partitions', 'fdisk -l'], - [ - 'ls 
/sys/block/{sd*,xvd*}/queue/rotational |xargs -I f sh -c "echo f; cat f;"', ''], - [ - 'ls /sys/block/{sd*,xvd*}/device/model |xargs -I f sh -c "echo f; cat f;"', ''], - [ - 'ls /sys/block/{sd*,xvd*}/queue/scheduler |xargs -I f sh -c "echo f; cat f;"', ''], - ['rpm -qa|grep -E "citrus|aero"', - 'dpkg -l|grep -E "citrus|aero"'], + ['ls /sys/block/{sd*,xvd*}/queue/rotational |xargs -I f sh -c "echo f; cat f;"', ''], + ['ls /sys/block/{sd*,xvd*}/device/model |xargs -I f sh -c "echo f; cat f;"', ''], + ['ls /sys/block/{sd*,xvd*}/queue/scheduler |xargs -I f sh -c "echo f; cat f;"', ''], + ['rpm -qa|grep -E "citrus|aero"', 'dpkg -l|grep -E "citrus|aero"'], ['ip addr', ''], - ['ip -s link', ''], - ['sudo iptables -L', ''], - ['sudo sysctl -a | grep -E "shmmax|file-max|maxfiles"', - ''], - ['iostat -x 1 10', ''], + ['ip -s link', '', ''], ['sar -n DEV', ''], ['sar -n EDEV', ''], - ['df -h', ''], - ['free -m', ''], - [cmd_dmesg, alt_dmesg], - ['top -n3 -b', 'top -l 3'], ['mpstat -P ALL 2 3', ''], ['uptime', ''], - ['ss -pant | grep %d | grep TIME-WAIT | wc -l' % - (port), 'netstat -pant | grep %d | grep TIME_WAIT | wc -l' % (port)], - ['ss -pant | grep %d | grep CLOSE-WAIT | wc -l' % - (port), 'netstat -pant | grep %d | grep CLOSE_WAIT | wc -l' % (port)], - ['ss -pant | grep %d | grep ESTAB | wc -l' % - (port), 'netstat -pant | grep %d | grep ESTABLISHED | wc -l' % (port)], - ['ss -pant | grep %d | grep LISTEN | wc -l' % - (port), 'netstat -pant | grep %d | grep LISTEN | wc -l' % (port)] + ['ss -ant state time-wait sport = :%d or dport = :%d | wc -l' % + (port,port), 'netstat -ant | grep %d | grep TIME_WAIT | wc -l' % (port)], + ['ss -ant state close-wait sport = :%d or dport = :%d | wc -l' % + (port,port), 'netstat -ant | grep %d | grep CLOSE_WAIT | wc -l' % (port)], + ['ss -ant state established sport = :%d or dport = :%d | wc -l' % + (port,port), 'netstat -ant | grep %d | grep ESTABLISHED | wc -l' % (port)], + ['ss -ant state listen sport = :%d or dport = :%d | wc -l' % 
+ (port,port), 'netstat -ant | grep %d | grep LISTEN | wc -l' % (port)] ] dignostic_info_params = [ 'network', 'namespace', 'set', 'xdr', 'dc', 'sindex'] @@ -1375,7 +1437,7 @@ def _dump_collectinfo_pretty_print(self, timestamp, as_logfile_prefix, ] summary_params = ['summary'] - summary_info_params = ['network', 'namespace', 'object', 'set', 'xdr', 'dc', 'sindex'] + summary_info_params = ['network', 'namespace', 'set', 'xdr', 'dc', 'sindex'] health_params = ['health -v'] hist_list = ['ttl', 'objsz'] @@ -1651,8 +1713,8 @@ def _dump_collectinfo_pretty_print(self, timestamp, as_logfile_prefix, sys.stdout = sys.__stdout__ def _main_collectinfo(self, default_user, default_pwd, default_ssh_port, default_ssh_key, - credential_file, snp_count, wait_time, show_all=False, - verbose=False): + credential_file, snp_count, wait_time, enable_ssh=False, + show_all=False, verbose=False): global aslogdir, output_time timestamp = time.gmtime() output_time = time.strftime("%Y%m%d_%H%M%S", timestamp) @@ -1664,13 +1726,15 @@ def _main_collectinfo(self, default_user, default_pwd, default_ssh_port, default # Coloring might writes extra characters to file, to avoid it we need to disable terminal coloring terminal.enable_color(False) + self.cmds_error = set() + # JSON collectinfo if snp_count < 1: self._archive_log(aslogdir) return self._dump_collectinfo_json(timestamp, as_logfile_prefix, default_user, default_pwd, default_ssh_port, default_ssh_key, - credential_file, snp_count, wait_time,) + credential_file, enable_ssh, snp_count, wait_time,) # Pretty print collectinfo self._dump_collectinfo_pretty_print(timestamp, as_logfile_prefix, show_all=show_all, verbose=verbose) @@ -1678,38 +1742,9 @@ def _main_collectinfo(self, default_user, default_pwd, default_ssh_port, default # Archive collectinfo directory self._archive_log(aslogdir) - # If multiple commands are given in execute_only mode then we might need coloring for next commands - terminal.enable_color(True) + 
self._print_collecinto_summary(aslogdir) - - @CommandHelp('Collects cluster info, aerospike conf file for local node and system stats from all nodes if remote server credentials provided.', - 'If credentials are not available then it will collect system stats from local node only.', - ' Options:', - ' -n - Number of snapshots. Default: 1', - ' -s - Sleep time in seconds between each snapshot. Default: 5 sec', - ' -U - Default user id for remote servers. This is System user id (not Aerospike user id).', - ' -P - Default password or passphrase for key for remote servers. This is System password (not Aerospike password).', - ' -sp - Default SSH port for remote servers. Default: 22', - ' -sk - Default SSH key (file path) for remote servers.', - ' -cf - Remote System Credentials file path. ', - ' If server credentials are not available in credential file then default credentials will be used ', - ' File format : each line should contain ,,,', - ' Example: 1.2.3.4,uid,pwd', - ' 1.2.3.4:3232,uid,pwd', - ' 1.2.3.4:3232,uid,,key_path', - ' 1.2.3.4:3232,uid,passphrase,key_path', - ' [2001::1234:10],uid,pwd', - ' [2001::1234:10]:3232,uid,,key_path', - ) - def _do_default(self, line): - - default_user = util.get_arg_and_delete_from_mods(line=line, - arg="-U", return_type=str, default=None, - modifiers=self.modifiers, mods=self.mods) - - default_pwd = util.get_arg_and_delete_from_mods(line=line, arg="-P", - return_type=str, default=None, modifiers=self.modifiers, - mods=self.mods) + def _collect_info(self, line, show_all=False): snp_count = util.get_arg_and_delete_from_mods(line=line, arg="-n", return_type=int, default=1, modifiers=self.modifiers, @@ -1719,73 +1754,64 @@ def _do_default(self, line): return_type=int, default=5, modifiers=self.modifiers, mods=self.mods) - default_ssh_port = util.get_arg_and_delete_from_mods(line=line, - arg="-sp", return_type=int, default=None, - modifiers=self.modifiers, mods=self.mods) - - default_ssh_key = 
util.get_arg_and_delete_from_mods(line=line, - arg="-sk", return_type=str, default=None, - modifiers=self.modifiers, mods=self.mods) - - credential_file = util.get_arg_and_delete_from_mods(line=line, - arg="-cf", return_type=str, default=None, - modifiers=self.modifiers, mods=self.mods) + enable_ssh = util.check_arg_and_delete_from_mods(line=line, arg="--enable-ssh", default=False, modifiers=self.modifiers, mods=self.mods) - self.cmds_error = set() - self._main_collectinfo(default_user, default_pwd, default_ssh_port, default_ssh_key, - credential_file, snp_count, wait_time, show_all=False, verbose=False) - - if self.cmds_error: - self.logger.error( - "Following commands are either unavailable or giving runtime error") - self.logger.error(self.cmds_error) - - @CommandHelp('Collects all default stats and additional stats like "info dump-*" commands output', - ' Options:', - ' verbose - Enable to collect additional stats with detailed output of "info dump-*" commands' - ) - def do_all(self, line): - default_user = util.get_arg_and_delete_from_mods(line=line, - arg="-U", return_type=str, default=None, - modifiers=self.modifiers, mods=self.mods) - - default_pwd = util.get_arg_and_delete_from_mods(line=line, arg="-P", + default_user = util.get_arg_and_delete_from_mods(line=line, arg="--ssh-user", return_type=str, default=None, modifiers=self.modifiers, mods=self.mods) - snp_count = util.get_arg_and_delete_from_mods(line=line, arg="-n", - return_type=int, default=1, modifiers=self.modifiers, - mods=self.mods) - - wait_time = util.get_arg_and_delete_from_mods(line=line, arg="-t", - return_type=int, default=5, modifiers=self.modifiers, + default_pwd = util.get_arg_and_delete_from_mods(line=line, arg="--ssh-pwd", + return_type=str, default=None, modifiers=self.modifiers, mods=self.mods) default_ssh_port = util.get_arg_and_delete_from_mods(line=line, - arg="-sp", return_type=int, default=None, + arg="--ssh-port", return_type=int, default=None, modifiers=self.modifiers, 
mods=self.mods) default_ssh_key = util.get_arg_and_delete_from_mods(line=line, - arg="-sk", return_type=str, default=None, + arg="--ssh-key", return_type=str, default=None, modifiers=self.modifiers, mods=self.mods) credential_file = util.get_arg_and_delete_from_mods(line=line, - arg="-cf", return_type=str, default=None, + arg="--ssh-cf", return_type=str, default=None, modifiers=self.modifiers, mods=self.mods) verbose = False if 'verbose' in line: verbose = True - self.cmds_error = set() self._main_collectinfo(default_user, default_pwd, default_ssh_port, default_ssh_key, - credential_file, snp_count, wait_time, show_all=True, verbose=verbose) + credential_file, snp_count, wait_time, enable_ssh=enable_ssh, show_all=show_all, verbose=verbose) - if self.cmds_error: - self.logger.error( - "Following commands are either unavailable or giving runtime error") - self.logger.error(self.cmds_error) + @CommandHelp('Collects cluster info, aerospike conf file for local node and system stats from all nodes if remote server credentials provided.', + 'If credentials are not available then it will collect system stats from local node only.', + ' Options:', + ' -n - Number of snapshots. Default: 1', + ' -s - Sleep time in seconds between each snapshot. Default: 5 sec', + ' --enable-ssh - Enable remote server system statistics collection.', + ' --ssh-user - Default user id for remote servers. This is System user id (not Aerospike user id).', + ' --ssh-pwd - Default password or passphrase for key for remote servers. This is System password (not Aerospike password).', + ' --ssh-port - Default SSH port for remote servers. 
Default: 22', + ' --ssh-key - Default SSH key (file path) for remote servers.', + ' --ssh-cf - Remote System Credentials file path.', + ' If server credentials are not available in credential file then default credentials will be used ', + ' File format : each line should contain ,,,', + ' Example: 1.2.3.4,uid,pwd', + ' 1.2.3.4:3232,uid,pwd', + ' 1.2.3.4:3232,uid,,key_path', + ' 1.2.3.4:3232,uid,passphrase,key_path', + ' [2001::1234:10],uid,pwd', + ' [2001::1234:10]:3232,uid,,key_path', + ) + def _do_default(self, line): + self._collect_info(line=line) + @CommandHelp('Collects all default stats and additional stats like "info dump-*" commands output', + ' Options:', + ' verbose - Enable to collect additional stats with detailed output of "info dump-*" commands' + ) + def do_all(self, line): + self._collect_info(line=line, show_all=True) @CommandHelp('Displays features used in running Aerospike cluster.') class FeaturesController(BasicCommandController): @@ -1864,35 +1890,53 @@ def _get_asconfig_data(self, stanza): else: return self.cluster.info_get_config(nodes=self.nodes, stanza=stanza) + def _get_as_meta_data(self, stanza): + if stanza == "build": + return self.cluster.info("build", nodes=self.nodes) + elif stanza == "edition": + editions = self.cluster.info("edition", nodes=self.nodes) + if not editions: + return editions + + editions_in_shortform = {} + for node, edition in editions.iteritems(): + if not edition or isinstance(edition, Exception): + continue + + editions_in_shortform[node] = util.convert_edition_to_shortform(edition) + + return editions_in_shortform + @CommandHelp( 'Displays health summary. If remote server System credentials provided, then it will collect remote system stats', 'and analyse that also. If credentials are not available then it will collect only localhost system statistics.', ' Options:', - ' -f - Query file path. Default: inbuilt health queries.', - ' -o - Output file path. 
', - ' This parameter works if Query file path provided, otherwise health command will work in interactive mode.', - ' -v - Enable to display extra details of assert errors.', - ' -d - Enable to display extra details of exceptions.', - ' -n - Number of snapshots. Default: 3', - ' -s - Sleep time in seconds between each snapshot. Default: 1 sec', - ' -U - Default user id for remote servers. This is System user id (not Aerospike user id).', - ' -P - Default password or passphrase for key for remote servers. This is System password (not Aerospike password).', - ' -sp - Default SSH port for remote servers. Default: 22', - ' -sk - Default SSH key (file path) for remote servers.', - ' -cf - Remote System Credentials file path. ', - ' If server credentials are not available in credential file then default credentials will be used ', - ' File format : each line should contain ,,,', - ' Example: 1.2.3.4,uid,pwd', - ' 1.2.3.4:3232,uid,pwd', - ' 1.2.3.4:3232,uid,,key_path', - ' 1.2.3.4:3232,uid,passphrase,key_path', - ' [2001::1234:10],uid,pwd', - ' [2001::1234:10]:3232,uid,,key_path', - ' -oc - Output filter Category. ', - ' This parameter works if Query file path provided, otherwise health command will work in interactive mode.', - ' Format : string of dot (.) separated category levels', - ' -wl - Output filter Warning level. Expected value CRITICAL or WARNING or INFO ', - ' This parameter works if Query file path provided, otherwise health command will work in interactive mode.', + ' -f - Query file path. Default: inbuilt health queries.', + ' -o - Output file path. ', + ' This parameter works if Query file path provided, otherwise health command will work in interactive mode.', + ' -v - Enable to display extra details of assert errors.', + ' -d - Enable to display extra details of exceptions.', + ' -n - Number of snapshots. Default: 1', + ' -s - Sleep time in seconds between each snapshot. Default: 1 sec', + ' -oc - Output filter Category. 
', + ' This parameter works if Query file path provided, otherwise health command will work in interactive mode.', + ' Format : string of dot (.) separated category levels', + ' -wl - Output filter Warning level. Expected value CRITICAL or WARNING or INFO ', + ' This parameter works if Query file path provided, otherwise health command will work in interactive mode.', + ' --enable-ssh - Enable remote server system statistics collection.', + ' --ssh-user - Default user id for remote servers. This is System user id (not Aerospike user id).', + ' --ssh-pwd - Default password or passphrase for key for remote servers. This is System password (not Aerospike password).', + ' --ssh-port - Default SSH port for remote servers. Default: 22', + ' --ssh-key - Default SSH key (file path) for remote servers.', + ' --ssh-cf - Remote System Credentials file path.', + ' If server credentials are not available in credential file then default credentials will be used ', + ' File format : each line should contain ,,,', + ' Example: 1.2.3.4,uid,pwd', + ' 1.2.3.4:3232,uid,pwd', + ' 1.2.3.4:3232,uid,,key_path', + ' 1.2.3.4:3232,uid,passphrase,key_path', + ' [2001::1234:10],uid,pwd', + ' [2001::1234:10]:3232,uid,,key_path', ) def _do_default(self, line): @@ -1901,7 +1945,7 @@ def _do_default(self, line): mods=self.mods) snap_count = util.get_arg_and_delete_from_mods(line=line, arg="-n", - return_type=int, default=3, modifiers=self.modifiers, + return_type=int, default=1, modifiers=self.modifiers, mods=self.mods) sleep_tm = util.get_arg_and_delete_from_mods(line=line, arg="-s", @@ -1914,32 +1958,34 @@ def _do_default(self, line): debug = util.check_arg_and_delete_from_mods(line=line, arg="-d", default=False, modifiers=self.modifiers, mods=self.mods) - credential_file = util.get_arg_and_delete_from_mods(line=line, - arg="-cf", return_type=str, default=None, + output_filter_category = util.get_arg_and_delete_from_mods(line=line, + arg="-oc", return_type=str, default=None, 
modifiers=self.modifiers, mods=self.mods) - default_user = util.get_arg_and_delete_from_mods(line=line, arg="-U", + output_filter_warning_level = util.get_arg_and_delete_from_mods(line, + arg="-wl", return_type=str, default=None, + modifiers=self.modifiers, mods=self.mods) + + enable_ssh = util.check_arg_and_delete_from_mods(line=line, arg="--enable-ssh", default=False, modifiers=self.modifiers, mods=self.mods) + + default_user = util.get_arg_and_delete_from_mods(line=line, arg="--ssh-user", return_type=str, default=None, modifiers=self.modifiers, mods=self.mods) - default_pwd = util.get_arg_and_delete_from_mods(line=line, arg="-P", + default_pwd = util.get_arg_and_delete_from_mods(line=line, arg="--ssh-pwd", return_type=str, default=None, modifiers=self.modifiers, mods=self.mods) default_ssh_port = util.get_arg_and_delete_from_mods(line=line, - arg="-sp", return_type=int, default=None, + arg="--ssh-port", return_type=int, default=None, modifiers=self.modifiers, mods=self.mods) default_ssh_key = util.get_arg_and_delete_from_mods(line=line, - arg="-sk", return_type=str, default=None, - modifiers=self.modifiers, mods=self.mods) - - output_filter_category = util.get_arg_and_delete_from_mods(line=line, - arg="-oc", return_type=str, default=None, + arg="--ssh-key", return_type=str, default=None, modifiers=self.modifiers, mods=self.mods) - output_filter_warning_level = util.get_arg_and_delete_from_mods(line, - arg="-wl", return_type=str, default=None, + credential_file = util.get_arg_and_delete_from_mods(line=line, + arg="--ssh-cf", return_type=str, default=None, modifiers=self.modifiers, mods=self.mods) # Query file can be specified without -f @@ -1948,6 +1994,9 @@ def _do_default(self, line): return_type=str, default=None, modifiers=self.modifiers, mods=self.mods) + if not query_file and line: + query_file = line[0] + if query_file: query_file = util.strip_string(query_file) @@ -1999,9 +2048,11 @@ def _do_default(self, line): ("namespace", "NAMESPACE", True, True, [ 
("CLUSTER", cluster_name), ("NODE", None), (None, None), ("NAMESPACE", None)]) ]), - "cluster": (self.cluster.info, [ + "cluster": (self._get_as_meta_data, [ ("build", "METADATA", False, False, [ ("CLUSTER", cluster_name), ("NODE", None), ("KEY", "version")]), + ("edition", "METADATA", False, False, [ + ("CLUSTER", cluster_name), ("NODE", None), ("KEY", "edition")]), ]), "endpoints": (self._get_asstat_data, [ ("endpoints", "METADATA", False, False, [ @@ -2026,6 +2077,18 @@ def _do_default(self, line): (None, None), ("CLUSTER", cluster_name), ("NODE", None), (None, None), ("DEVICE", None)]), ("meminfo", "SYSTEM", "MEMINFO", True, [("CLUSTER", cluster_name), ("NODE", None)]), + ("dmesg", "SYSTEM", "DMESG", True, + [("CLUSTER", cluster_name), ("NODE", None)]), + ("lscpu", "SYSTEM", "LSCPU", True, + [("CLUSTER", cluster_name), ("NODE", None), ("LSCPU", None)]), + ("iptables", "SYSTEM", "IPTABLES", True, + [("CLUSTER", cluster_name), ("NODE", None)]), + ("sysctlall", "SYSTEM", "SYSCTLALL", True, + [("CLUSTER", cluster_name), ("NODE", None), ("SYSCTL", None)]), + ("hdparm", "SYSTEM", "HDPARM", True, + [("CLUSTER", cluster_name), ("NODE", None), ("HDPARM", None)]), + ("limits", "SYSTEM", "LIMITS", True, + [("CLUSTER", cluster_name), ("NODE", None), ("LIMITS", None)]), ("interrupts", "SYSTEM", "INTERRUPTS", False, [(None, None), ("CLUSTER", cluster_name), ("NODE", None), (None, None), ("INTERRUPT_TYPE", None), (None, None), ("INTERRUPT_ID", None), (None, None), ("INTERRUPT_DEVICE", None)]), ("df", "SYSTEM", "DF", True, [ @@ -2044,7 +2107,7 @@ def _do_default(self, line): # Collecting data sys_stats = self.cluster.info_system_statistics(nodes=self.nodes, default_user=default_user, default_pwd=default_pwd, default_ssh_key=default_ssh_key, - default_ssh_port=default_ssh_port, credential_file=credential_file) + default_ssh_port=default_ssh_port, credential_file=credential_file, collect_remote_data=enable_ssh) for _key, (info_function, stanza_list) in stanza_dict.iteritems(): 
@@ -2139,33 +2202,31 @@ def _do_default(self, line): health_summary = self.health_checker.execute(query_file=query_file) if health_summary: - try: - self.view.print_health_output(health_summary, verbose, debug, - output_file, output_filter_category, - output_filter_warning_level) - if not verbose: - self.logger.info("Please use -v option for more details on failure. \n") - - except Exception as e: - self.logger.error(e) + self.view.print_health_output(health_summary, verbose, debug, + output_file, output_filter_category, + output_filter_warning_level) + if not verbose: + self.logger.info("Please use -v option for more details on failure. \n") @CommandHelp( 'Displays summary of Aerospike cluster.', ' Options:', - ' -U - Default user id for remote servers. This is System user id (not Aerospike user id).', - ' -P - Default password or passphrase for key for remote servers. This is System password (not Aerospike password).', - ' -sp - Default SSH port for remote servers. Default: 22', - ' -sk - Default SSH key (file path) for remote servers.', - ' -cf - Remote System Credentials file path. ', - ' If server credentials are not available in credential file then default credentials will be used ', - ' File format : each line should contain ,,,', - ' Example: 1.2.3.4,uid,pwd', - ' 1.2.3.4:3232,uid,pwd', - ' 1.2.3.4:3232,uid,,key_path', - ' 1.2.3.4:3232,uid,passphrase,key_path', - ' [2001::1234:10],uid,pwd', - ' [2001::1234:10]:3232,uid,,key_path', + ' -l - Enable to display namespace output in List view. Default: Table view', + ' --enable-ssh - Enable remote server system statistics collection.', + ' --ssh-user - Default user id for remote servers. This is System user id (not Aerospike user id).', + ' --ssh-pwd - Default password or passphrase for key for remote servers. This is System password (not Aerospike password).', + ' --ssh-port - Default SSH port for remote servers. 
Default: 22', + ' --ssh-key - Default SSH key (file path) for remote servers.', + ' --ssh-cf - Remote System Credentials file path.', + ' If server credentials are not available in credential file then default credentials will be used ', + ' File format : each line should contain ,,,', + ' Example: 1.2.3.4,uid,pwd', + ' 1.2.3.4:3232,uid,pwd', + ' 1.2.3.4:3232,uid,,key_path', + ' 1.2.3.4:3232,uid,passphrase,key_path', + ' [2001::1234:10],uid,pwd', + ' [2001::1234:10]:3232,uid,,key_path', ) class SummaryController(BasicCommandController): @@ -2173,24 +2234,28 @@ def __init__(self): self.modifiers = set(['with']) def _do_default(self, line): - default_user = util.get_arg_and_delete_from_mods(line=line, arg="-U", + enable_list_view = util.check_arg_and_delete_from_mods(line=line, arg="-l", default=False, modifiers=self.modifiers, mods=self.mods) + + enable_ssh = util.check_arg_and_delete_from_mods(line=line, arg="--enable-ssh", default=False, modifiers=self.modifiers, mods=self.mods) + + default_user = util.get_arg_and_delete_from_mods(line=line, arg="--ssh-user", return_type=str, default=None, modifiers=self.modifiers, mods=self.mods) - default_pwd = util.get_arg_and_delete_from_mods(line=line, arg="-P", + default_pwd = util.get_arg_and_delete_from_mods(line=line, arg="--ssh-pwd", return_type=str, default=None, modifiers=self.modifiers, mods=self.mods) default_ssh_port = util.get_arg_and_delete_from_mods(line=line, - arg="-sp", return_type=int, default=None, + arg="--ssh-port", return_type=int, default=None, modifiers=self.modifiers, mods=self.mods) default_ssh_key = util.get_arg_and_delete_from_mods(line=line, - arg="-sk", return_type=str, default=None, + arg="--ssh-key", return_type=str, default=None, modifiers=self.modifiers, mods=self.mods) credential_file = util.get_arg_and_delete_from_mods(line=line, - arg="-cf", return_type=str, default=None, + arg="--ssh-cf", return_type=str, default=None, modifiers=self.modifiers, mods=self.mods) service_stats = 
util.Future(self.cluster.info_statistics, nodes=self.nodes).start() @@ -2198,7 +2263,7 @@ def _do_default(self, line): set_stats = util.Future(self.cluster.info_set_statistics, nodes=self.nodes).start() os_version = self.cluster.info_system_statistics(nodes=self.nodes, default_user=default_user, default_pwd=default_pwd, default_ssh_key=default_ssh_key, - default_ssh_port=default_ssh_port, credential_file=credential_file, commands=["lsb"]) + default_ssh_port=default_ssh_port, credential_file=credential_file, commands=["lsb"], collect_remote_data=enable_ssh) server_version = util.Future(self.cluster.info, 'build', nodes=self.nodes).start() service_stats = service_stats.result() @@ -2214,6 +2279,4 @@ def _do_default(self, line): metadata["os_version"] = os_version return util.Future(self.view.print_summary, util.create_summary(service_stats=service_stats, namespace_stats=namespace_stats, - set_stats=set_stats, metadata=metadata)) - - + set_stats=set_stats, metadata=metadata), list_view=enable_list_view) diff --git a/lib/client/cluster.py b/lib/client/cluster.py index 4402d9b4..91d9861e 100644 --- a/lib/client/cluster.py +++ b/lib/client/cluster.py @@ -28,7 +28,7 @@ class Cluster(object): # Kinda like a singleton... All instantiated classes will share the same - # state... This makes the class no + # state. 
cluster_state = {} use_services_alumni = False use_services_alt = False diff --git a/lib/client/node.py b/lib/client/node.py index dc704cc3..f43034d4 100644 --- a/lib/client/node.py +++ b/lib/client/node.py @@ -18,8 +18,9 @@ import re import socket import threading -from lib.client.assocket import ASSocket + from lib.client import util +from lib.client.assocket import ASSocket from lib.collectinfo_parser.full_parser import parse_system_live_command #### Remote Server connection module @@ -40,6 +41,7 @@ except ImportError: PEXPECT_VERSION = NO_MODULE + def getfqdn(address, timeout=0.5): # note: cannot use timeout lib because signal must be run from the # main thread @@ -75,7 +77,7 @@ class Node(object): pool_lock = threading.Lock() def __init__(self, address, port=3000, tls_name=None, timeout=5, user=None, - password=None, ssl_context=None, consider_alumni=False, use_services_alt=False): + password=None, ssl_context=None, consider_alumni=False, use_services_alt=False): """ address -- ip or fqdn for this node port -- info port for this node @@ -123,6 +125,12 @@ def __init__(self, address, port=3000, tls_name=None, timeout=5, user=None, ('meminfo', ['cat /proc/meminfo', 'vmstat -s']), ('interrupts', ['cat /proc/interrupts', '']), ('iostat', ['iostat -x 1 1', '']), + ('dmesg', ['dmesg -T', 'dmesg']), + ('limits', ['sudo pgrep asd | xargs -I f sh -c "sudo cat /proc/f/limits"', '']), + ('lscpu', ['lscpu', '']), + ('sysctlall', ['sudo sysctl vm fs', '']), + ('iptables', ['sudo iptables -S', '']), + ('hdparm', ['sudo fdisk -l |grep Disk |grep dev | cut -d " " -f 2 | cut -d ":" -f 1 | xargs sudo hdparm -I 2>/dev/null', '']), ('df', ['df -h', '']), ('free-m', ['free -m', '']), ('uname', ['uname -a', '']) @@ -177,8 +185,8 @@ def connect(self, address, port): # service addresses self.close() self._initialize_socket_pool() - if (not self.service_addresses - or (self.ip, self.port, self.tls_name) not in + if (not self.service_addresses or + (self.ip, self.port, self.tls_name) 
not in self.service_addresses): # if asd >= 3.10 and node has only IPv6 address @@ -249,9 +257,10 @@ def __eq__(self, other): def _update_IP(self, address, port): if address not in self.dns_cache: - self.dns_cache[address] = (socket.getaddrinfo(address, port, - socket.AF_UNSPEC, socket.SOCK_STREAM)[0][4][0], - getfqdn(address)) + self.dns_cache[address] = ( + socket.getaddrinfo(address, port, socket.AF_UNSPEC, + socket.SOCK_STREAM)[0][4][0], + getfqdn(address)) self.ip, self.fqdn = self.dns_cache[address] @@ -321,7 +330,8 @@ def _get_connection(self, ip, port): if sock: return sock - sock = ASSocket(ip, port, self.tls_name, self.user, self.password, self.ssl_context, timeout=self._timeout) + sock = ASSocket(ip, port, self.tls_name, self.user, self.password, + self.ssl_context, timeout=self._timeout) if sock.connect(): return sock @@ -358,9 +368,9 @@ def close(self): def _info_cinfo(self, command, ip=None, port=None): # TODO: citrusleaf.py does not support passing a timeout default is # 0.5s - if ip == None: + if ip is None: ip = self.ip - if port == None: + if port is None: port = self.port result = None @@ -414,11 +424,7 @@ def xdr_info(self, command): command -- the info command to execute on this node """ - try: - return self._info_cinfo(command, self.ip, self.xdr_port) - except Exception as e: - self.logger.error("Couldn't get XDR info: " + str(e)) - return e + return self._info_cinfo(command, self.ip, self.xdr_port) @return_exceptions def info_node(self): @@ -476,7 +482,7 @@ def _info_peers_list_helper(self, peers): tls_name = util.find_dns(endpoints) endpoint_list = [] for e in endpoints: - if "[" in e and not "]:" in e: + if "[" in e and "]:" not in e: addr_port = util._parse_string(e, delim=",") else: addr_port = util._parse_string(e, delim=":") @@ -486,8 +492,8 @@ def _info_peers_list_helper(self, peers): if addr.endswith("]"): addr = addr[:-1].strip() - if (len(addr_port) > 1 and addr_port[1] - and len(addr_port[1]) > 0): + if (len(addr_port) > 1 and 
addr_port[1] and + len(addr_port[1]) > 0): port = addr_port[1] else: port = default_port @@ -842,7 +848,7 @@ def info_latency(self, back=None, duration=None, slice_tm=None, ns_set=None): hist_info = self.info(cmd) except Exception: return data - #tdata = hist_info.split(';')[:-1] + # tdata = hist_info.split(';')[:-1] tdata = hist_info.split(';') hist_name = None ns = None @@ -1090,8 +1096,8 @@ def _set_system_credentials_from_file(self): try: f = open(self.sys_credential_file, 'r') except IOError as e: - self.logger.error("Can not open credential file. error: " + str(e)) - raise + self.logger.warning("Ignoring credential file. Can not open credential file. \n%s." %(str(e))) + return result for line in f.readlines(): if not line or not line.strip(): @@ -1141,9 +1147,8 @@ def _set_system_credentials_from_file(self): except Exception: pass - except Exception: - self.logger.error("Couldn't set credential from given file.") - pass + except Exception as e: + self.logger.warning("Ignoring credential file.\n%s." %(str(e))) finally: if f: f.close() @@ -1167,7 +1172,7 @@ def _set_system_credentials(self): @return_exceptions def info_system_statistics(self, default_user=None, default_pwd=None, default_ssh_key=None, - default_ssh_port=None, credential_file=None, commands=[]): + default_ssh_port=None, credential_file=None, commands=[], collect_remote_data=False): """ Get statistics for a system. 
@@ -1181,11 +1186,14 @@ def info_system_statistics(self, default_user=None, default_pwd=None, default_ss if self.localhost: return self._get_localhost_system_statistics(cmd_list) - else: + + if collect_remote_data: self._set_default_system_credentials(default_user, default_pwd, default_ssh_key, default_ssh_port, credential_file) return self._get_remote_host_system_statistics(cmd_list) + return {} + @return_exceptions def _get_localhost_system_statistics(self, commands): sys_stats = {} @@ -1231,9 +1239,9 @@ def _spawn_remote_system(self, ip, user, pwd, ssh_key=None, port=None): if ssh_key is not None: try: os.path.isfile(ssh_key) - except: - self.logger.error('private ssh key does not exist, please check and confirm ssh_key ' + str(ssh_key)) - raise + except Exception: + raise Exception('private ssh key %s does not exist'%(str(ssh_key))) + ssh_options += ' -i %s' % (ssh_key) s = pexpect.spawn('ssh %s -l %s %s'%(ssh_options, str(user), str(ip))) @@ -1245,6 +1253,9 @@ def _spawn_remote_system(self, ip, user, pwd, ssh_key=None, port=None): i = s.expect([ssh_newkey_msg, self.remote_system_command_prompt, pwd_passphrase_msg, permission_denied_msg, terminal_prompt_msg, pexpect.TIMEOUT]) if i == 2: # password or passphrase + if pwd is None: + raise Exception("Wrong SSH Password None.") + s.sendline(pwd) i = s.expect([ssh_newkey_msg, self.remote_system_command_prompt, pwd_passphrase_msg, permission_denied_msg, terminal_prompt_msg, pexpect.TIMEOUT]) if i == 4: @@ -1292,12 +1303,12 @@ def _spawn_remote_system(self, ip, user, pwd, ssh_key=None, port=None): s.sendline("unset PROMPT_COMMAND") # sh style - s.sendline ("PS1='[PEXPECT]\$ '") - i = s.expect ([pexpect.TIMEOUT, self.remote_system_command_prompt], timeout=10) + s.sendline("PS1='[PEXPECT]\$ '") + i = s.expect([pexpect.TIMEOUT, self.remote_system_command_prompt], timeout=10) if i == 0: # csh-style. 
- s.sendline ("set prompt='[PEXPECT]\$ '") - i = s.expect ([pexpect.TIMEOUT, self.remote_system_command_prompt], timeout=10) + s.sendline("set prompt='[PEXPECT]\$ '") + i = s.expect([pexpect.TIMEOUT, self.remote_system_command_prompt], timeout=10) if i == 0: return None @@ -1306,6 +1317,9 @@ def _spawn_remote_system(self, ip, user, pwd, ssh_key=None, port=None): @return_exceptions def _create_ssh_connection(self, ip, user, pwd, ssh_key=None, port=None): + if user is None and pwd is None and ssh_key is None: + raise Exception("Insufficient credentials to connect.") + if PEXPECT_VERSION == NEW_MODULE: return self._login_remote_system(ip, user, pwd, ssh_key, port) @@ -1323,7 +1337,7 @@ def _execute_remote_system_command(self, conn, cmd): if PEXPECT_VERSION == NEW_MODULE: conn.prompt() elif PEXPECT_VERSION == OLD_MODULE: - conn.expect (self.remote_system_command_prompt) + conn.expect(self.remote_system_command_prompt) else: return None return conn.before @@ -1351,9 +1365,9 @@ def _stop_ssh_connection(self, conn): if conn: conn.close() elif PEXPECT_VERSION == OLD_MODULE: - conn.sendline ('exit') + conn.sendline('exit') i = conn.expect([pexpect.EOF, "(?i)there are stopped jobs"]) - if i==1: + if i == 1: conn.sendline("exit") conn.expect(pexpect.EOF) if conn: @@ -1366,12 +1380,11 @@ def _get_remote_host_system_statistics(self, commands): sys_stats = {} if PEXPECT_VERSION == NO_MODULE: - self.logger.error("No module named pexpect. Please install it to collect remote server system statistics.") + self.logger.warning("Ignoring system statistics collection from node %s. 
No module named pexpect."%(str(self.ip))) return sys_stats sys_stats_collected = False self._set_system_credentials() - # 1 for previous saved credential and one from new inputs max_tries = 1 tries = 0 @@ -1381,14 +1394,16 @@ def _get_remote_host_system_statistics(self, commands): try: s = self._create_ssh_connection(self.ip, self.sys_user_id, self.sys_pwd, self.sys_ssh_key, self.sys_ssh_port) - if not s or isinstance(s, Exception): - s = None - raise - except Exception: + if not s: + raise Exception("Wrong credentials to connect.") + + if isinstance(s, Exception): + raise s + + except Exception as e: if tries >= max_tries: - self.logger.error("Couldn't make SSH login to remote server %s:%s, please provide correct credentials."%(str(self.ip), "22" if self.sys_ssh_port is None else str(self.sys_ssh_port))) - if s: - s.close() + self.logger.warning("Ignoring system statistics collection. Couldn't make SSH login to remote server %s:%s. \n%s" % (str(self.ip), "22" if self.sys_ssh_port is None else str(self.sys_ssh_port), str(e))) + continue try: @@ -1410,10 +1425,9 @@ def _get_remote_host_system_statistics(self, commands): sys_stats_collected = True self._stop_ssh_connection(s) - except Exception: + except Exception as e: if tries >= max_tries: - self.logger.error("Couldn't get or parse remote system stats for remote server %s:%s."%(str(self.ip), "22" if self.sys_ssh_port is None else str(self.sys_ssh_port))) - pass + self.logger.error("Ignoring system statistics collection. Couldn't get or parse remote system stats for remote server %s:%s. 
\n%s" % (str(self.ip), "22" if self.sys_ssh_port is None else str(self.sys_ssh_port), str(e))) finally: if s and not isinstance(s, Exception): diff --git a/lib/client/ssl_context.py b/lib/client/ssl_context.py index acdee4b6..9d8085c8 100644 --- a/lib/client/ssl_context.py +++ b/lib/client/ssl_context.py @@ -334,7 +334,7 @@ def _match_tlsname(self, cert, tls_name): pass cnnames.add(value) else: - raise ImportError("No module named pyasn1") + raise ImportError("No module named pyasn1. It is required for dnsname_match.") if len(cnnames) > 1: raise Exception("tls_name %r doesn't match either of %s" % ( @@ -466,10 +466,32 @@ def _create_ssl_context(self, enable_tls=False, encrypt_only=False, self.ctx.set_verify( SSL.VERIFY_PEER | SSL.VERIFY_CLIENT_ONCE, self._verify_cb) if cafile or capath: - self.ctx.load_verify_locations(cafile, capath) + try: + self.ctx.load_verify_locations(cafile, capath) + except Exception as e: + path = "" + + if cafile: + path = "cafile=%s"%(str(cafile)) + + if capath: + if path: + path += " and " + path += "capath=%s"%(str(capath)) + + raise Exception("Failed to load CA certificate from %s \n %s"%(path, str(e))) + if certfile: - self.ctx.use_certificate_chain_file(certfile) + try: + self.ctx.use_certificate_chain_file(certfile) + except Exception as e: + raise Exception("Failed to load certificate chain file %s \n %s"%(certfile, str(e))) + if keyfile: - self.ctx.use_privatekey_file(keyfile) + try: + self.ctx.use_privatekey_file(keyfile) + except Exception as e: + raise Exception("Failed to load private key %s \n %s"%(keyfile, str(e))) + if cipher_suite: self.ctx.set_cipher_list(cipher_suite) diff --git a/lib/collectinfo/cinfolog.py b/lib/collectinfo/cinfolog.py index 49cbfa84..537c7538 100644 --- a/lib/collectinfo/cinfolog.py +++ b/lib/collectinfo/cinfolog.py @@ -13,8 +13,9 @@ # limitations under the License. 
import copy -from lib.collectinfo_parser.full_parser import parse_info_all +from lib.collectinfo_parser.full_parser import parse_info_all +from lib.utils import util class CollectinfoNode(object): @@ -52,12 +53,7 @@ def set_cluster_name(self, cluster_name): self.cluster_name = cluster_name def set_asd_version(self, asd_version): - if asd_version.lower() in ['enterprise', 'true', 'ee'] or 'enterprise' in asd_version.lower(): - self.asd_version = "Enterprise" - elif asd_version.lower() in ['community', 'false', 'ce'] or 'community' in asd_version.lower(): - self.asd_version = "Community" - else: - self.asd_version = "N/E" + self.asd_version = util.convert_edition_to_shortform(asd_version) class CollectinfoSnapshot(object): @@ -155,6 +151,10 @@ def get_data(self, type="", stanza=""): except Exception: data[node] = copy.deepcopy(d[stanza]) + elif type == "meta_data" and stanza == "edition": + edition = copy.deepcopy(d[stanza]) + data[node] = util.convert_edition_to_shortform(edition) + else: data[node] = copy.deepcopy(d[stanza]) diff --git a/lib/collectinfo/loghdlr.py b/lib/collectinfo/loghdlr.py index 012fa155..1973fa4a 100644 --- a/lib/collectinfo/loghdlr.py +++ b/lib/collectinfo/loghdlr.py @@ -14,9 +14,7 @@ import ntpath import os -import logging import shutil -import sys import tarfile import zipfile @@ -52,14 +50,12 @@ def __init__(self, cinfo_path): self.cinfo_path = cinfo_path self._validate_and_extract_compressed_files(cinfo_path, dest_dir=self.COLLECTINFO_DIR) self.cinfo_timestamp = None - self.logger = logging.getLogger('asadm') self.reader = CollectinfoReader() snapshot_added, err_cinfo = self._add_cinfo_log_files(cinfo_path) if snapshot_added == 0: - self.logger.error(err_cinfo) - sys.exit(1) + raise Exception(str(err_cinfo)) def __str__(self): status_str = "" diff --git a/lib/collectinfo_parser/section_filter_list.py b/lib/collectinfo_parser/section_filter_list.py index c1d10499..b71e7c0f 100644 --- a/lib/collectinfo_parser/section_filter_list.py +++ 
b/lib/collectinfo_parser/section_filter_list.py @@ -339,8 +339,9 @@ # 'parser_func': }, 'ID_42': { - 'enable': False, + 'enable': True, 'raw_section_name': 'dmesg', + 'final_section_name': 'dmesg', 'regex_new': 'dmesg', 'regex_old': CMD_PREFIX + 'dmesg' # 'parser_func': @@ -730,14 +731,68 @@ 'regex_new': "\['config', 'cluster'\]" # 'parser_func' }, - 'ID_102': { + # Leave 102 for merge from pensive + 'ID_103': { 'enable': True, - 'raw_section_name': 'config_cluster', - 'final_section_name': 'cluster', - 'parent_section_name': 'config', - 'regex_new': "\['config', 'cluster'\]" + 'raw_section_name': 'ss -ant state time-wait sport = :%d or dport = :%d | wc -l', + 'regex_new': 'ss -ant state time-wait sport = :%d or dport = :%d [|] wc -l', + # 'parser_func' + }, + 'ID_104': { + 'enable': True, + 'raw_section_name': 'ss -ant state close-wait sport = :%d or dport = :%d | wc -l', + 'regex_new': 'ss -ant state close-wait sport = :%d or dport = :%d [|] wc -l', + # 'parser_func' + }, + 'ID_105': { + 'enable': True, + 'raw_section_name': 'ss -ant state established sport = :%d or dport = :%d | wc -l', + 'regex_new': 'ss -ant state established sport = :%d or dport = :%d [|] wc -l', + # 'parser_func' + }, + 'ID_106': { + 'enable': True, + 'raw_section_name': 'netstat -ant|grep 3000', + 'regex_new': 'netstat -ant[|]grep 3000|^netstat\n', + 'regex_old': CMD_PREFIX + 'netstat -ant[|]grep 3000' # 'parser_func' }, + 'ID_107': { + 'enable': True, + 'raw_section_name': 'lscpu', + 'final_section_name': 'lscpu', + 'regex_new': "[cpu] lscpu\n" + # 'parser_func' + }, + 'ID_108': { + 'enable': True, + 'raw_section_name': 'iptables -S', + 'final_section_name': 'iptables', + 'regex_new': 'iptables', + # 'parser_func': + }, + 'ID_109': { + 'enable': True, + 'raw_section_name': 'sysctl vm sys', + 'final_section_name': 'sysctlall', + 'regex_new': 'sysctlall', + # 'parser_func': + }, + 'ID_110': { + 'enable': True, + 'raw_section_name': 'sudo fdisk -l |grep Disk |grep dev | cut -d " " -f 2 | cut 
-d ":" -f 1 | xargs sudo hdparm -I 2>/dev/null', + 'final_section_name': 'hdparm', + 'regex_new': 'hdparm', + # 'parser_func': + }, + 'ID_111': { + 'enable': True, + 'raw_section_name': 'sudo pgrep asd | xargs -I f sh -c "cat /proc/f/limits"', + 'final_section_name': 'limits', + 'regex_new': 'limits', + # 'parser_func': + } + #{ # 'enable': True, # 'raw_section_name': 'set', diff --git a/lib/collectinfo_parser/sys_section_parser.py b/lib/collectinfo_parser/sys_section_parser.py index f1d22754..0f047b57 100644 --- a/lib/collectinfo_parser/sys_section_parser.py +++ b/lib/collectinfo_parser/sys_section_parser.py @@ -61,6 +61,24 @@ def parse_sys_section(section_list, imap, parsed_map): elif section == 'ip_addr': _parse_ipaddr_section(imap, parsed_map) + elif section == 'dmesg': + _parse_dmesg_section(imap, parsed_map) + + elif section == 'lscpu': + _parse_lscpu_section(imap, parsed_map) + + elif section == 'iptables': + _parse_iptables_section(imap, parsed_map) + + elif section == 'sysctlall': + _parse_sysctlall_section(imap, parsed_map) + + elif section == 'hdparm': + _parse_hdparm_section(imap, parsed_map) + + elif section == 'limits': + _parse_limits_section(imap, parsed_map) + else: logger.warning( "Section unknown, can not be parsed. Check SYS_SECTION_NAME_LIST. 
Section: " + section) @@ -569,6 +587,152 @@ def _modify_keys_in_iostat_section(iostatobj_list): change_key_name_in_map(obj, ['rkB/s'], 'rk_b/s') change_key_name_in_map(obj, ['wkB/s'], 'wk_b/s') +def _parse_dmesg_section(imap, parsed_map): + sec_id = 'ID_42' + raw_section_name, final_section_name, _ = get_section_name_from_id(sec_id) + + logger.info("Parsing section: " + final_section_name) + + if not is_valid_section(imap, raw_section_name, final_section_name): + return + + dmesg_section = imap[raw_section_name][0] + + parsed_map[final_section_name] = {} + cpu_list = [] + + parsed_map[final_section_name]["OOM"] = False + parsed_map[final_section_name]["Blocked"] = False + + for line in dmesg_section: + if 'OOM' in line: + parsed_map[final_section_name]["OOM"] |= True + + if 'blocked for more than 120 seconds' in line: + parsed_map[final_section_name]["Blocked"] |= True + + if 'Linux version' in line: + parsed_map[final_section_name]["OS"] = line + +def _parse_lscpu_section(imap, parsed_map): + sec_id = 'ID_107' + raw_section_name, final_section_name, _ = get_section_name_from_id(sec_id) + + logger.info("Parsing section: " + final_section_name) + + if not is_valid_section(imap, raw_section_name, final_section_name): + return + + lscpu_section = imap[raw_section_name][0] + + parsed_map[final_section_name] = {} + + for line in lscpu_section: + if line == "": + continue + lineobj = line.rstrip().split(':') + key = str(lineobj[0]) + val = str(lineobj[1]) + parsed_map[final_section_name][key.strip()] = val.strip() + +def _parse_iptables_section(imap, parsed_map): + sec_id = 'ID_108' + raw_section_name, final_section_name, _ = get_section_name_from_id(sec_id) + + logger.info("Parsing section: " + final_section_name) + + if not is_valid_section(imap, raw_section_name, final_section_name): + return + + iptables_section = imap[raw_section_name][0] + + parsed_map[final_section_name] = {} + + for line in iptables_section: + if "DROP" in line: + 
parsed_map[final_section_name]["has_firewall"] = True + return + + parsed_map[final_section_name]["has_firewall"] = False + +def _parse_sysctlall_section(imap, parsed_map): + sec_id = 'ID_109' + raw_section_name, final_section_name, _ = get_section_name_from_id(sec_id) + + logger.info("Parsing section: " + final_section_name) + + if not is_valid_section(imap, raw_section_name, final_section_name): + return + + sysctlall_section = imap[raw_section_name][0] + + parsed_map[final_section_name] = {} + + for line in sysctlall_section: + if line == "": + continue + lineobj = line.rstrip().split('=') + key = str(lineobj[0]) + val = str(lineobj[1]) + parsed_map[final_section_name][key.strip()] = val.strip() + +def _parse_hdparm_section(imap, parsed_map): + sec_id = 'ID_110' + raw_section_name, final_section_name, _ = get_section_name_from_id(sec_id) + + logger.info("Parsing section: " + final_section_name) + + if not is_valid_section(imap, raw_section_name, final_section_name): + return + + device_info = {} + hdparm_section = imap[raw_section_name][0] + + for line in hdparm_section: + + if re.search("/dev.*:", line, re.IGNORECASE): + device = line + + if ('Sector size' in line + or 'device size' in line + or 'Model Number' in line + or 'Serial Number' in line + or 'Firmware Revision' in line + or 'Transport' in line + or 'Queue Depth' in line): + + lineobj = line.rstrip().split(':') + key = str(device) + str(lineobj[0]).strip() + val = str(lineobj[1]).strip() + + device_info[key] = val + + parsed_map[final_section_name] = device_info + +def _parse_limits_section(imap, parsed_map): + sec_id = 'ID_111' + raw_section_name, final_section_name, _ = get_section_name_from_id(sec_id) + + logger.info("Parsing section: " + final_section_name) + + if not is_valid_section(imap, raw_section_name, final_section_name): + return + + limits = {} + limits_section = imap[raw_section_name][0] + + for line in limits_section: + + if "Max" not in line: + continue + + lineobj = filter(None, 
line.rstrip().split(' ')) + key = str(lineobj[0]).strip() + limits["Soft " + key] = str(lineobj[1]).strip() + limits["Hard " + key] = str(lineobj[2]).strip() + + parsed_map[final_section_name] = limits + ### "iostat -x 1 10\n", ### "Linux 2.6.32-279.el6.x86_64 (bfs-dl360g8-02) \t02/02/15 \t_x86_64_\t(24 CPU)\n", diff --git a/lib/collectinfocontroller.py b/lib/collectinfocontroller.py index f74e07c5..dcf532a9 100644 --- a/lib/collectinfocontroller.py +++ b/lib/collectinfocontroller.py @@ -79,15 +79,17 @@ def do_help(self, line): 'The "info" command provides summary tables for various aspects', 'of Aerospike functionality.') class InfoController(CollectinfoCommandController): - def __init__(self): self.modifiers = set() + self.controller_map = dict( + namespace=InfoNamespaceController) + @CommandHelp( 'Displays network, namespace, and xdr summary information.') def _do_default(self, line): self.do_network(line) - self.do_namespace(line) + self.controller_map['namespace']()(line[:]) self.do_xdr(line) @CommandHelp( @@ -113,32 +115,7 @@ def do_network(self, line): versions, builds, cluster=cinfo_log, title_suffix=" (%s)" % (timestamp), **self.mods) - @CommandHelp( - 'Displays namespace summary information.') - def do_namespace(self, line): - ns_stats = self.loghdlr.info_statistics(stanza=STAT_NAMESPACE, flip=True) - - for timestamp in sorted(ns_stats.keys()): - if not ns_stats[timestamp]: - continue - - self.view.info_namespace(util.flip_keys(ns_stats[timestamp]), - self.loghdlr.get_cinfo_log_at(timestamp=timestamp), - title_suffix=" (%s)" % (timestamp), **self.mods) - - @CommandHelp('Displays summary information for objects of each namespace.') - def do_object(self, line): - ns_stats = self.loghdlr.info_statistics(stanza=STAT_NAMESPACE, flip=True) - - for timestamp in sorted(ns_stats.keys()): - if not ns_stats[timestamp]: - continue - - self.view.info_object(util.flip_keys(ns_stats[timestamp]), - self.loghdlr.get_cinfo_log_at(timestamp=timestamp), - title_suffix=" 
(%s)" % (timestamp), **self.mods) - - def convert_key_to_tuple(self, stats): + def _convert_key_to_tuple(self, stats): for key in stats.keys(): key_tuple = tuple(key.split()) stats[key_tuple] = stats[key] @@ -153,7 +130,7 @@ def do_set(self, line): if not set_stats[timestamp]: continue - self.convert_key_to_tuple(set_stats[timestamp]) + self._convert_key_to_tuple(set_stats[timestamp]) self.view.info_set(util.flip_keys(set_stats[timestamp]), self.loghdlr.get_cinfo_log_at(timestamp=timestamp), title_suffix=" (%s)" % (timestamp), **self.mods) @@ -226,6 +203,44 @@ def do_sindex(self, line): title_suffix=" (%s)" % (timestamp), **self.mods) +@CommandHelp('The "namespace" command provides summary tables for various aspects', + 'of Aerospike namespaces.') +class InfoNamespaceController(CollectinfoCommandController): + def __init__(self): + self.modifiers = set() + + @CommandHelp('Displays usage and objects information for namespaces') + def _do_default(self, line): + self.do_usage(line) + self.do_object(line) + + @CommandHelp('Displays usage information for each namespace.') + def do_usage(self, line): + ns_stats = self.loghdlr.info_statistics(stanza=STAT_NAMESPACE, flip=True) + + for timestamp in sorted(ns_stats.keys()): + if not ns_stats[timestamp]: + continue + + self.view.info_namespace_usage( + util.flip_keys(ns_stats[timestamp]), + self.loghdlr.get_cinfo_log_at(timestamp=timestamp), + title_suffix=" (%s)" % (timestamp), **self.mods) + + @CommandHelp('Displays object information for each namespace.') + def do_object(self, line): + ns_stats = self.loghdlr.info_statistics(stanza=STAT_NAMESPACE, flip=True) + + for timestamp in sorted(ns_stats.keys()): + if not ns_stats[timestamp]: + continue + + self.view.info_namespace_object( + util.flip_keys(ns_stats[timestamp]), + self.loghdlr.get_cinfo_log_at(timestamp=timestamp), + title_suffix=" (%s)" % (timestamp), **self.mods) + + @CommandHelp( '"show" is used to display Aerospike Statistics and', 'configuration.') @@ -812,6 
+827,8 @@ def _do_default(self, line): "cluster": (self.loghdlr.info_meta_data, [ ("asd_build", "METADATA", "CLUSTER", True, [ ("CLUSTER", cluster_name), ("NODE", None), ("KEY", "version")]), + ("edition", "METADATA", "CLUSTER", True, [ + ("CLUSTER", cluster_name), ("NODE", None), ("KEY", "edition")]), ]), "endpoints": (self.loghdlr.info_meta_data, [ ("endpoints", "METADATA", "ENDPOINTS", True, [ @@ -835,6 +852,18 @@ def _do_default(self, line): (None, None), ("CLUSTER", cluster_name), ("NODE", None), (None, None), ("DEVICE", None)]), ("meminfo", "SYSTEM", "MEMINFO", True, [("CLUSTER", cluster_name), ("NODE", None)]), + ("dmesg", "SYSTEM", "DMESG", True, + [("CLUSTER", cluster_name), ("NODE", None)]), + ("lscpu", "SYSTEM", "LSCPU", True, + [("CLUSTER", cluster_name), ("NODE", None), ("LSCPU", None)]), + ("sysctlall", "SYSTEM", "SYSCTLALL", True, + [("CLUSTER", cluster_name), ("NODE", None), ("SYSCTL", None)]), + ("iptables", "SYSTEM", "IPTABLES", True, + [("CLUSTER", cluster_name), ("NODE", None)]), + ("hdparm", "SYSTEM", "HDPARM", True, + [("CLUSTER", cluster_name), ("NODE", None), ("HDPARM", None)]), + ("limits", "SYSTEM", "LIMITS", True, + [("CLUSTER", cluster_name), ("NODE", None), ("LIMITS", None)]), ("interrupts", "SYSTEM", "INTERRUPTS", False, [(None, None), ("CLUSTER", cluster_name), ("NODE", None), (None, None), ("INTERRUPT_TYPE", None), (None, None), ("INTERRUPT_ID", None), (None, None), ("INTERRUPT_DEVICE", None)]), ("df", "SYSTEM", "DF", True, [ @@ -882,17 +911,12 @@ def _do_default(self, line): health_summary = self.health_checker.execute(query_file=query_file) if health_summary: - try: - self.view.print_health_output(health_summary, debug=debug, - verbose=verbose, output_file=output_file, - output_filter_category=output_filter_category, - output_filter_warning_level=output_filter_warning_level) - if not verbose: - self.logger.info("Please use -v option for more details on failure. 
\n") - - except Exception as e: - self.logger.error(e) - + self.view.print_health_output(health_summary, debug=debug, + verbose=verbose, output_file=output_file, + output_filter_category=output_filter_category, + output_filter_warning_level=output_filter_warning_level) + if not verbose: + self.logger.info("Please use -v option for more details on failure. \n") class ListController(CollectinfoCommandController): @@ -934,13 +958,18 @@ def do_scroll(self, line): CliView.pager = CliView.SCROLL -@CommandHelp('Displays summary of Aerospike cluster.') +@CommandHelp('Displays summary of Aerospike cluster.', + ' Options:', + ' -l - Enable to display namespace output in List view. Default: Table view', + ) class SummaryController(CollectinfoCommandController): def __init__(self): self.modifiers = set([]) def _do_default(self, line): + enable_list_view = util.check_arg_and_delete_from_mods(line=line, arg="-l", default=False, modifiers=self.modifiers, mods=self.mods) + service_stats = self.loghdlr.info_statistics(stanza=STAT_SERVICE) namespace_stats = self.loghdlr.info_statistics(stanza=STAT_NAMESPACE) set_stats = self.loghdlr.info_statistics(stanza=STAT_SETS) @@ -955,4 +984,5 @@ def _do_default(self, line): metadata["os_version"] = os_version[last_timestamp] self.view.print_summary(util.create_summary(service_stats=service_stats[last_timestamp], namespace_stats=namespace_stats[last_timestamp], - set_stats=set_stats[last_timestamp], metadata=metadata)) \ No newline at end of file + set_stats=set_stats[last_timestamp], metadata=metadata), + list_view=enable_list_view) diff --git a/lib/getcontroller.py b/lib/getcontroller.py index 1bfe40aa..6bb31152 100644 --- a/lib/getcontroller.py +++ b/lib/getcontroller.py @@ -104,13 +104,14 @@ def __init__(self, cluster): self.cluster = cluster def get_all(self, nodes='all'): - config_map = {'service': (util.Future(self.get_service, nodes=nodes).start()).result(), - 'namespace': (util.Future(self.get_namespace, 
nodes=nodes).start()).result(), - 'network': (util.Future(self.get_network, nodes=nodes).start()).result(), - 'xdr': (util.Future(self.get_xdr, nodes=nodes).start()).result(), - 'dc': (util.Future(self.get_dc, nodes=nodes).start()).result(), - 'cluster': (util.Future(self.get_cluster, nodes=nodes).start()).result() - } + futures = [('service', (util.Future(self.get_service, nodes=nodes).start())), + ('namespace', (util.Future(self.get_namespace, nodes=nodes).start())), + ('network', (util.Future(self.get_network, nodes=nodes).start())), + ('xdr', (util.Future(self.get_xdr, nodes=nodes).start())), + ('dc', (util.Future(self.get_dc, nodes=nodes).start())), + ('cluster', (util.Future(self.get_cluster, nodes=nodes).start()))] + config_map = dict(((k, f.result()) for k, f in futures)) + return config_map def get_service(self, nodes='all'): @@ -234,14 +235,15 @@ def __init__(self, cluster): self.cluster = cluster def get_all(self, nodes='all'): - stat_map = {'service': (util.Future(self.get_service, nodes=nodes).start()).result(), - 'namespace': (util.Future(self.get_namespace, nodes=nodes).start()).result(), - 'set': (util.Future(self.get_sets, nodes=nodes).start()).result(), - 'bin': (util.Future(self.get_bins, nodes=nodes).start()).result(), - 'sindex': (util.Future(self.get_sindex, nodes=nodes).start()).result(), - 'xdr': (util.Future(self.get_xdr, nodes=nodes).start()).result(), - 'dc': (util.Future(self.get_dc, nodes=nodes).start()).result() - } + futures = [('service', (util.Future(self.get_service, nodes=nodes).start())), + ('namespace', (util.Future(self.get_namespace, nodes=nodes).start())), + ('set', (util.Future(self.get_sets, nodes=nodes).start())), + ('bin', (util.Future(self.get_bins, nodes=nodes).start())), + ('sindex', (util.Future(self.get_sindex, nodes=nodes).start())), + ('xdr', (util.Future(self.get_xdr, nodes=nodes).start())), + ('dc', (util.Future(self.get_dc, nodes=nodes).start()))] + stat_map = dict(((k, f.result()) for k, f in futures)) + return 
stat_map def get_service(self, nodes='all'): @@ -250,20 +252,24 @@ def get_service(self, nodes='all'): def get_namespace(self, nodes='all', for_mods=[]): namespaces = self.cluster.info_namespaces(nodes=nodes) - namespaces = namespaces.values() namespace_set = set() + for namespace in namespaces: if isinstance(namespace, Exception): continue + namespace_set.update(namespace) - namespace_list = util.filter_list(list(namespace_set), for_mods) + namespace_list = util.filter_list(list(namespace_set), for_mods) + futures = [(namespace, util.Future( + self.cluster.info_namespace_statistics, namespace, nodes=nodes).start()) + for namespace in namespace_list] ns_stats = {} - for namespace in namespace_list: - ns_stats[namespace] = util.Future( - self.cluster.info_namespace_statistics, namespace, - nodes=nodes).start().result() + + for namespace, stat_future in futures: + ns_stats[namespace] = stat_future.result() + for _k in ns_stats[namespace].keys(): if not ns_stats[namespace][_k]: ns_stats[namespace].pop(_k) @@ -396,7 +402,15 @@ def _get_namespace_data(self, namespace_stats, cluster_keys): if cluster_keys[node] not in repl_factor: repl_factor[cluster_keys[node]] = 0 - repl_factor[cluster_keys[node]] = max(repl_factor[cluster_keys[node]], int(params['repl-factor'])) + repl_factor[cluster_keys[node]] = max( + repl_factor[cluster_keys[node]], + util.get_value_from_dict( + params, + ('repl-factor', + 'effective_replication_factor'), # introduced post 3.15.0.1 + default_value=0, + return_type=int + )) for ck in repl_factor: if ck not in ns_info: @@ -553,4 +567,4 @@ def get_pmap(self, nodes='all'): pmap_data = self._get_pmap_data(pmap_info, ns_info, cluster_keys, node_ids) - return pmap_data \ No newline at end of file + return pmap_data diff --git a/lib/health/commands.py b/lib/health/commands.py index 04ed02be..5fc3c040 100644 --- a/lib/health/commands.py +++ b/lib/health/commands.py @@ -15,8 +15,8 @@ import re from lib.health.exceptions import HealthException -from 
lib.health.operation import SimpleOperation, AggOperation, ComplexOperation, AssertDetailOperation -from lib.health.util import fetch_keys_from_dict, create_snapshot_key +from lib.health.operation import select_keys_from_dict, SimpleOperation, AggOperation, ComplexOperation, AssertDetailOperation +from lib.health.util import create_health_internal_tuple, create_snapshot_key SNAPSHOT_KEY_PREFIX = "SNAPSHOT" SNAPSHOT_KEY_PATTERN = r"SNAPSHOT(\d+)$" @@ -34,6 +34,7 @@ "<=": SimpleOperation("<=").operate, "==": SimpleOperation("==").operate, "!=": SimpleOperation("!=").operate, + "%%": SimpleOperation("%%").operate, "&&": SimpleOperation("AND").operate, "||": SimpleOperation("OR").operate, "AND": AggOperation("AND").operate, @@ -47,7 +48,8 @@ "COUNT": AggOperation("COUNT").operate, "COUNT_ALL": AggOperation("COUNT_ALL").operate, "DIFF": ComplexOperation("DIFF").operate, - "SD_ANOMALY": ComplexOperation("SD_ANOMALY").operate + "SD_ANOMALY": ComplexOperation("SD_ANOMALY").operate, + "NO_MATCH": ComplexOperation("NO_MATCH").operate } assert_op_list = { @@ -56,20 +58,16 @@ def do_operation(op=None, arg1=None, arg2=None, group_by=None, - result_comp_op=None, result_comp_val=None, on_common_only=False): + result_comp_op=None, result_comp_val=None, on_common_only=False, save_param=None): if op in op_list: return op_list[op](arg1, arg2, group_by, result_comp_op, - result_comp_val, on_common_only=on_common_only) - - if op == "%%" and (isinstance(arg1, int) or isinstance(arg1, float)): - return op_list["*"](arg2, float(arg1) / 100, group_by, - on_common_only=on_common_only) + result_comp_val, on_common_only=on_common_only, save_param=save_param) return None -def select_keys(data={}, select_keys=[], select_from_keys=[]): +def select_keys(data={}, select_keys=[], select_from_keys=[], ignore_keys=[], save_param=None): if not data or not isinstance(data, dict): raise HealthException("Wrong Input Data for select operation.") @@ -85,8 +83,12 @@ def select_keys(data={}, select_keys=[], 
select_from_keys=[]): elif select_from_keys[0].startswith(SNAPSHOT_KEY_PREFIX): select_from_keys[0] = create_snapshot_key(int(re.search(SNAPSHOT_KEY_PATTERN, select_from_keys[0]).group(1))) - result = fetch_keys_from_dict(data=data, keys=select_keys, - from_keys=select_from_keys) + config_param = False + if "CONFIG" in select_from_keys: + config_param = True + + result = select_keys_from_dict(data=data, keys=select_keys, from_keys=select_from_keys, ignore_keys=ignore_keys, + save_param=save_param, config_param=config_param) if not result: raise HealthException( @@ -95,7 +97,7 @@ def select_keys(data={}, select_keys=[], select_from_keys=[]): return result -def do_assert(op=None, data={}, check_val=True, error=None, category=None, +def do_assert(op=None, data={}, check_val=create_health_internal_tuple(True,[]), error=None, category=None, level=None, description=None, success_msg=None): if op in assert_op_list: return assert_op_list[op](data, check_val, error, category, level, @@ -116,7 +118,7 @@ def do_assert_if_check(op=None, arg1=None, arg2=None): arg1 = do_operation(op=op, arg1=arg1, arg2=arg2) # return filter argument should be in boolean form, True for key to skip and False for key to check - return not is_data_true(arg1), do_operation(op="==", arg1=arg1, arg2=False) + return not is_data_true(arg1), do_operation(op="==", arg1=arg1, arg2=(False,[])) def is_data_true(data): """ diff --git a/lib/health/constants.py b/lib/health/constants.py index 9abc2f5c..0024a2df 100644 --- a/lib/health/constants.py +++ b/lib/health/constants.py @@ -13,6 +13,9 @@ # limitations under the License. 
+HEALTH_PARSER_VAR = "health_parser_var" +MAJORITY = "MAJORITY_VALUE" + class AssertLevel(object): CRITICAL = 0 WARNING = 1 diff --git a/lib/health/healthchecker.py b/lib/health/healthchecker.py index 9e41d71f..6788239a 100644 --- a/lib/health/healthchecker.py +++ b/lib/health/healthchecker.py @@ -15,14 +15,14 @@ import copy from distutils.version import LooseVersion import re -import logging -from lib.view import terminal from lib.health.constants import ParserResultType, HealthResultType, HealthResultCounter, AssertResultKey from lib.health.exceptions import SyntaxException, HealthException from lib.health.parser import HealthParser from lib.health.query import QUERIES +from lib.health.util import is_health_parser_variable from lib.utils.util import parse_queries +from lib.view import terminal VERSION_CONSTRAINT_PATTERN = "SET CONSTRAINT VERSION(.+)" @@ -39,7 +39,6 @@ def __init__(self): self.verbose = False self.no_valid_version = False - self.logger = logging.getLogger('asadm') self.filtered_data_set_to_parser = False def _reset_counters(self): @@ -298,21 +297,17 @@ def _add_assert_output(self, assert_out): def _execute_queries(self, query_source=None, is_source_file=True): self._reset_counters() if not self.health_input_data or not isinstance(self.health_input_data, dict): - self.logger.error("No Health Input Data available") - return False + raise Exception("No Health Input Data available") if not query_source: - self.logger.error("No Input Query Source.") - return False + raise Exception("No Input Query Source.") if not isinstance(query_source, str): - self.logger.error("Query input source is not valid") - return False + raise Exception("Query input source is not valid") queries = parse_queries(query_source, is_file=is_source_file) if not queries: - self.logger.error("Wrong Health query source.") - return False + raise Exception("Wrong Health query source.") try: for query in queries: @@ -372,7 +367,7 @@ def _execute_queries(self, query_source=None, 
is_source_file=True): else: self._increment_counter(HealthResultCounter.ASSERT_FAILED_COUNTER) self._add_assert_output(result[1]) - else: + elif is_health_parser_variable(result): self._increment_counter( HealthResultCounter.DEBUG_COUNTER) self.debug_outputs.append(result) @@ -385,23 +380,19 @@ def _execute_queries(self, query_source=None, is_source_file=True): def execute(self, query_file=None): health_summary = None - try: - if query_file is None: - if not self._execute_queries(query_source=QUERIES, is_source_file=False): - return {} - health_summary = self._create_health_result_dict() - elif query_file: - if not self._execute_queries(query_source=query_file, is_source_file=True): - return {} - health_summary = self._create_health_result_dict() + if query_file is None: + if not self._execute_queries(query_source=QUERIES, is_source_file=False): + return {} + health_summary = self._create_health_result_dict() - else: - self.logger.error("Wrong Input to execute") + elif query_file: + if not self._execute_queries(query_source=query_file, is_source_file=True): return {} + health_summary = self._create_health_result_dict() - except Exception: - pass + else: + raise Exception("Wrong Query-file input for Health-Checker to execute") self.no_valid_version = False self._reset_parser() diff --git a/lib/health/operation.py b/lib/health/operation.py index 96280a82..8665faf6 100644 --- a/lib/health/operation.py +++ b/lib/health/operation.py @@ -12,17 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import copy import itertools from math import sqrt import operator +import re -from lib.health.constants import AssertResultKey, ParserResultType +from lib.health.constants import AssertResultKey, MAJORITY, ParserResultType from lib.health.exceptions import HealthException -from lib.health.util import deep_merge_dicts, get_kv, merge_key, make_map, make_key +from lib.health.util import create_health_internal_tuple, create_value_list_to_save, deep_merge_dicts,\ + find_majority_element, get_kv, get_value_from_health_internal_tuple, merge_key, make_map, make_key RESULT_TUPLE_HEADER = "RESULT" NOKEY = "" +# Simple Operations + operators = { "+": operator.add, "-": operator.sub, @@ -35,6 +40,7 @@ '<=': operator.le, '==': operator.eq, '!=': operator.ne, + '%%': lambda p, v: find_pct_value(p, v), 'AND': operator.and_, 'OR': operator.or_, 'MAX': max, @@ -43,6 +49,14 @@ } +def find_pct_value(pct, v): + if (isinstance(v, int) or isinstance(v, float)) and (isinstance(pct, int) or isinstance(pct, float)): + return float(v) * (float(pct)/100.0) + + return None + +# Aggregation Operations + def basic_vector_to_scalar_operation(op, kv, typecast=int, initial_value=None): """ Passed Vector values and type of value @@ -66,7 +80,7 @@ def basic_vector_to_scalar_operation(op, kv, typecast=int, initial_value=None): for i in kv: k1, v1 = get_kv(i) - + v1 = get_value_from_health_internal_tuple(v1) try: if not found_first: res = typecast(v1) @@ -89,6 +103,10 @@ def int_vector_to_scalar_operation(op, v): r, _ = basic_vector_to_scalar_operation(op, v, typecast=int) return r +def float_vector_to_scalar_operation(op, v): + r, _ = basic_vector_to_scalar_operation(op, v, typecast=float) + return r + def bool_vector_to_scalar_operation(op, v): r, _ = basic_vector_to_scalar_operation(op, v, typecast=bool) @@ -116,11 +134,13 @@ def vector_to_scalar_equal_operation(op, v): i0 = v[0] k1, v1 = get_kv(i0) + v1 = get_value_from_health_internal_tuple(v1) if v1 and isinstance(v1, list): v1 = sorted(v1) 
for i in v[1:]: k2, v2 = get_kv(i) + v2 = get_value_from_health_internal_tuple(v2) if v2 and isinstance(v2, list): v2 = sorted(v2) @@ -130,7 +150,9 @@ def vector_to_scalar_equal_operation(op, v): return True -def vector_to_vector_diff_operation(kv, op, a): +# Complex Operations + +def vector_to_vector_diff_operation(kv, op, a, save_param): """ Passed Vector values [ {(name, tag) : value}, {(name, tag) : value} ... @@ -141,6 +163,7 @@ def vector_to_vector_diff_operation(kv, op, a): """ res = {} + temp_res = {} if not kv or not a: raise HealthException("Insufficient input for Diff operation ") @@ -149,27 +172,37 @@ def vector_to_vector_diff_operation(kv, op, a): for x, y in itertools.combinations(kv, 2): k1, v1 = get_kv(x) k2, v2 = get_kv(y) - if op(abs(v1 - v2), a): + + _v1 = get_value_from_health_internal_tuple(v1) + _v2 = get_value_from_health_internal_tuple(v2) + + if op(abs(_v1 - _v2), a): try: - res[make_key(k1)] |= True + temp_res[make_key(k1)] |= True + except Exception: - res[make_key(k1)] = True + temp_res[make_key(k1)] = True try: - res[make_key(k2)] |= True + temp_res[make_key(k2)] |= True except Exception: - res[make_key(k2)] = True + temp_res[make_key(k2)] = True else: try: - res[make_key(k1)] |= False + temp_res[make_key(k1)] |= False except Exception: - res[make_key(k1)] = False + temp_res[make_key(k1)] = False try: - res[make_key(k2)] |= False + temp_res[make_key(k2)] |= False except Exception: - res[make_key(k2)] = False + temp_res[make_key(k2)] = False + + for i in kv: + k, v = get_kv(i) + val_to_save = create_value_list_to_save(save_param, value=temp_res[make_key(k)], op1=v) + res[make_key(k)] = create_health_internal_tuple(temp_res[make_key(k)], val_to_save) except Exception: exception_found = True @@ -177,25 +210,77 @@ def vector_to_vector_diff_operation(kv, op, a): if exception_found: for x in kv: k, v = get_kv(x) - res[make_key(k)] = None + res[make_key(k)] = create_health_internal_tuple(None, None) return res -def 
vector_to_vector_sd_anomaly_operation(kv, op, a): +def _find_match_operand_value(v, value_list): + if not v or not value_list: + return v + + if v == MAJORITY: + return find_majority_element(value_list) + + return v + + +def vector_to_vector_no_match_operation(kv, op, a, save_param): """ Passed Vector values [ {(name, tag) : value}, {(name, tag) : value} ... - Return boolean dictionary result + Return health internal tuple - { (name, tag) : True/False , (name, tag) : True/False, ... } + (True/False , [(key, value, formatting), (key, value, formatting), ...]) """ res = {} - if not kv or not a: + operand = get_value_from_health_internal_tuple(a) + if not kv: + raise HealthException("Insufficient input for NO_MATCH operation ") + + try: + values = [get_value_from_health_internal_tuple(get_kv(m)[1]) for m in kv] + match_operand = _find_match_operand_value(operand, values) + + result = False + val_to_save = [] + for x in kv: + k, v = get_kv(x) + _val = get_value_from_health_internal_tuple(v) + + if not op(_val, match_operand): + result |= True + val_to_save += create_value_list_to_save(save_param=None, value=result, op1=v) + + if operand and operand == MAJORITY: + key = "Majority Value" + else: + key = "Expected Value" + + val_to_save += create_value_list_to_save(save_param=save_param, key=key, value=match_operand) + res = create_health_internal_tuple(result, val_to_save) + + except Exception: + res = create_health_internal_tuple(False, None) + + return res + + +def vector_to_vector_sd_anomaly_operation(kv, op, a, save_param): + """ + Passed Vector values + [ {(name, tag) : value}, {(name, tag) : value} ... 
+ + Return health internal tuple + + (True/False , [(key, value, formatting), (key, value, formatting), ...]) + """ + res = {} + sd_multiplier = get_value_from_health_internal_tuple(a) + if not kv or not sd_multiplier: raise HealthException("Insufficient input for SD_ANOMALY operation ") - exception_found = False try: n = len(kv) if n < 3: @@ -203,7 +288,7 @@ def vector_to_vector_sd_anomaly_operation(kv, op, a): range_start = 0 range_end = 0 else: - values = [get_kv(m)[1] for m in kv] + values = [get_value_from_health_internal_tuple(get_kv(m)[1]) for m in kv] no_anomaly = False try: @@ -219,27 +304,29 @@ def vector_to_vector_sd_anomaly_operation(kv, op, a): variance += pow((v - mean), 2) variance = float(variance) / float(n) sd = sqrt(variance) - range_start = mean - (a * sd) - range_end = mean + (a * sd) + range_start = mean - (sd_multiplier * sd) + range_end = mean + (sd_multiplier * sd) + result = False + val_to_save = [] for x in kv: k, v = get_kv(x) - if (no_anomaly or (float(v) >= float(range_start) - and float(v) <= float(range_end))): - res[make_key(k)] = False - else: - res[make_key(k)] = True + _val = get_value_from_health_internal_tuple(v) - except Exception: - exception_found = True + if not no_anomaly and (float(_val) < float(range_start) or float(_val) > float(range_end)): + result |= True + val_to_save += create_value_list_to_save(save_param=None, value=result, op1=v) - if exception_found: - for x in kv: - k, v = get_kv(x) - res[make_key(k)] = None + val_to_save += create_value_list_to_save(save_param=save_param, value=result) + res = create_health_internal_tuple(result, val_to_save) + + except Exception: + res = create_health_internal_tuple(False, None) return res +### + class SimpleOperation(): @@ -263,10 +350,32 @@ class SimpleOperation(): def __init__(self, op): self.op = operators[op] - def _operate_each_key(self, arg1, arg2): + def _operate_each_key(self, arg1, arg2, save_param=None): if isinstance(arg1, dict) and isinstance(arg2, dict): return 
None + if not isinstance(arg1, dict) and not isinstance(arg2, dict): + + try: + raw_arg1 = get_value_from_health_internal_tuple(arg1) + raw_arg2 = get_value_from_health_internal_tuple(arg2) + if self.op == operator.div and raw_arg2 == 0: + val_to_save = create_value_list_to_save(save_param, value=0, op1=arg1, op2=arg2) + return (0, val_to_save) + + # if any of the arg is type float or operation is division + # cast all argument to float + if self.op == operator.div or isinstance(raw_arg1, float) or isinstance(raw_arg2, float): + raw_arg1 = float(raw_arg1) + raw_arg2 = float(raw_arg2) + + result = self.op(raw_arg1, raw_arg2) + val_to_save = create_value_list_to_save(save_param, value=result, op1=arg1, op2=arg2) + return create_health_internal_tuple(result, val_to_save) + + except Exception: + return create_health_internal_tuple(None, []) + dict_first = True if isinstance(arg1, dict): d = arg1 @@ -275,31 +384,17 @@ def _operate_each_key(self, arg1, arg2): d = arg2 v = arg1 dict_first = False - else: - try: - # if any of the arg is type float or operation is division - # cast all argument to float - if self.op == operator.div and arg2 == 0: - return 0 - - if self.op == operator.div or isinstance(arg1, float) or isinstance(arg2, float): - arg1 = float(arg1) - arg2 = float(arg2) - - return self.op(arg1, arg2) - except Exception: - return None res_dict = {} for _k in d: if dict_first: - res_dict[_k] = self._operate_each_key(d[_k], v) + res_dict[_k] = self._operate_each_key(d[_k], v, save_param=save_param) else: - res_dict[_k] = self._operate_each_key(v, d[_k]) + res_dict[_k] = self._operate_each_key(v, d[_k], save_param=save_param) return res_dict - def _operate_dicts(self, arg1, arg2, on_common_only=False): + def _operate_dicts(self, arg1, arg2, on_common_only=False, save_param=None): if isinstance(arg1, dict) and isinstance(arg2, dict): k1_set = set(arg1.keys()) k2_set = set(arg2.keys()) @@ -310,30 +405,30 @@ def _operate_dicts(self, arg1, arg2, on_common_only=False): 
res_dict = {} for _k in k1_set.intersection(k2_set): res_dict[_k] = self._operate_dicts( - arg1[_k], arg2[_k], on_common_only=on_common_only) + arg1[_k], arg2[_k], on_common_only=on_common_only, save_param=save_param) return res_dict else: - return self._operate_each_key(arg1, arg2) + return self._operate_each_key(arg1, arg2, save_param=save_param) def operate(self, arg1, arg2, group_by=None, result_comp_op=None, - result_comp_val=None, on_common_only=False): + result_comp_val=None, on_common_only=False, save_param=None): if arg1 is None or arg2 is None: raise HealthException("Wrong operands for Simple operation.") # No Group By So No Key Merging - return self._operate_dicts(arg1, arg2, on_common_only=on_common_only) + return self._operate_dicts(arg1, arg2, on_common_only=on_common_only, save_param=save_param) class AggOperation(): operator_and_function = { - '+': lambda v: int_vector_to_scalar_operation(operators["+"], v), - '*': lambda v: int_vector_to_scalar_operation(operators["*"], v), + '+': lambda v: float_vector_to_scalar_operation(operators["+"], v), + '*': lambda v: float_vector_to_scalar_operation(operators["*"], v), 'AND': lambda v: bool_vector_to_scalar_operation(operators["AND"], v), 'OR': lambda v: bool_vector_to_scalar_operation(operators["OR"], v), 'AVG': lambda v: vector_to_scalar_avg_operation(operators["+"], v), - 'MAX': lambda v: int_vector_to_scalar_operation(operators["MAX"], v), - 'MIN': lambda v: int_vector_to_scalar_operation(operators["MIN"], v), + 'MAX': lambda v: float_vector_to_scalar_operation(operators["MAX"], v), + 'MIN': lambda v: float_vector_to_scalar_operation(operators["MIN"], v), '==': lambda v: vector_to_scalar_equal_operation(operators["=="], v), 'COUNT': operators["COUNT"], 'COUNT_ALL': operators["COUNT"], @@ -341,10 +436,17 @@ class AggOperation(): def __init__(self, op): self.op = op - self.op_fn = AggOperation.operator_and_function[op] + self.op_fn = self.op_fn_distributor + + def op_fn_distributor(self, v, save_param): 
+ result = AggOperation.operator_and_function[self.op](v) + + val_to_save = create_value_list_to_save(save_param, value=result, op1=v) + + return create_health_internal_tuple(result, val_to_save) def operate(self, arg1, arg2=None, group_by=None, result_comp_op=None, - result_comp_val=None, on_common_only=False): + result_comp_val=None, on_common_only=False, save_param=None): if not arg1: raise HealthException("Wrong operand for Aggregation operation.") @@ -357,7 +459,9 @@ def operate(self, arg1, arg2=None, group_by=None, result_comp_op=None, "Invalid group ids %s for Aggregation operation." % (str(group_by))) try: - return apply_operator(arg1, NOKEY, self.op_fn, group_by[-1] if group_by else "CLUSTER", on_all_keys=False if self.op=="COUNT" else True) + return apply_operator(arg1, NOKEY, self.op_fn, group_by[-1] if group_by else "CLUSTER", + on_all_keys=False if self.op=="COUNT" else True, save_param=save_param, + update_saved_list=True) except Exception as e: raise HealthException(str(e) + " for Aggregation Operation") @@ -365,14 +469,17 @@ def operate(self, arg1, arg2=None, group_by=None, result_comp_op=None, class ComplexOperation(): operator_and_function = { - 'DIFF': lambda kv, op, a: vector_to_vector_diff_operation(kv, op, a), - 'SD_ANOMALY': lambda kv, op, a: vector_to_vector_sd_anomaly_operation(kv, op, a), + 'DIFF': lambda kv, op, a, sp: vector_to_vector_diff_operation(kv, op, a, sp), + 'SD_ANOMALY': lambda kv, op, a, sp: vector_to_vector_sd_anomaly_operation(kv, op, a, sp), + 'NO_MATCH': lambda kv, op, a, sp: vector_to_vector_no_match_operation(kv, op, a, sp), } def __init__(self, op): + self.op = op self.op_fn = ComplexOperation.operator_and_function[op] - def operate(self, arg1, arg2=None, group_by=None, result_comp_op=None, result_comp_val=None, on_common_only=False): + def operate(self, arg1, arg2=None, group_by=None, result_comp_op=None, result_comp_val=None, + on_common_only=False, save_param=None): if not arg1: # if empty opearand raise 
HealthException("Wrong operand for Complex operation.") @@ -386,40 +493,15 @@ def operate(self, arg1, arg2=None, group_by=None, result_comp_op=None, result_co "Invalid group ids %s for Complex operation." % (str(group_by))) try: - return apply_operator(arg1, NOKEY, - lambda kv: self.op_fn(kv, operators[result_comp_op], - result_comp_val), group_by[-1] - if group_by else "CLUSTER") + return apply_operator(arg1, NOKEY, + lambda kv, sp: self.op_fn(kv, operators[result_comp_op], result_comp_val, sp), + group_by[-1] if group_by else "CLUSTER", save_param=save_param, + update_saved_list=True) except Exception as e: raise HealthException(str(e) + " for Complex Operation") -class AssertOperation(): - - def __init__(self, op): - self.op = operators[op] - - def operate(self, data={}, check_val=True, error=None): - if not data: - raise HealthException("Wrong Input Data for ASSERT operation.") - - if not isinstance(data, dict): - if not self.op(data, check_val): - return ("ASSERT", error) - return None - - v = find_data_vector(data) - - if not v: - return ("ASSERT", error) - for i in v: - if not self.op(i, check_val): - return ("ASSERT", error) - - return None - - class AssertDetailOperation(): """ @@ -431,7 +513,7 @@ class AssertDetailOperation(): def __init__(self, op): self.op = operators[op] - def operate(self, data={}, check_val=True, error=None, category=None, + def operate(self, data={}, check_val=create_health_internal_tuple(True,[]), error=None, category=None, level=None, description=None, success_msg=None): if not data: raise HealthException("Wrong Input Data for ASSERT operation.") @@ -449,11 +531,11 @@ def operate(self, data={}, check_val=True, error=None, category=None, res[AssertResultKey.LEVEL] = level if not isinstance(data, dict): - if not self.op(data, check_val): + if not self.op(get_value_from_health_internal_tuple(data), get_value_from_health_internal_tuple(check_val)): return (ParserResultType.ASSERT, res) return None - kv = find_kv_vector(NOKEY, data, 
True) + kv = find_kv_vector(NOKEY, data, recurse=True, update_saved_list=False) if not kv: return (ParserResultType.ASSERT, res) @@ -462,10 +544,15 @@ def operate(self, data={}, check_val=True, error=None, category=None, for i in kv: k, v = get_kv(i) - if not self.op(v, check_val): + kv_tuple = (k, None) + value_to_check = get_value_from_health_internal_tuple(v) + if v[1]: + kv_tuple = (k, v[1]) + + if not self.op(value_to_check, get_value_from_health_internal_tuple(check_val)): res[AssertResultKey.SUCCESS] = False fail = True - res[AssertResultKey.KEYS].append(str(k)) + res[AssertResultKey.KEYS].append(kv_tuple) if not fail: res[AssertResultKey.SUCCESS] = True @@ -534,10 +621,95 @@ def do_multiple_group_by(d, group_by_list): return res + +# Select operation + +def _is_key_in_ignore_keys(key, ignore_keys): + if not key or not ignore_keys: + return False + + return any(re.search(ik[1], key) if ik[0] else key==ik[1] for ik in ignore_keys) + +def select_keys_from_dict(data={}, keys=[], from_keys=[], ignore_keys=[], save_param=None, config_param=False): + """ + Function takes dictionary, list of keys to fetch, list of from_keys to filter scope + + Returns dictionary of selected keys and values + """ + + if not data or not isinstance(data, dict): + raise HealthException("Wrong Input Data for select operation.") + + result_dict = {} + if not keys: + raise HealthException("No key provided for select operation.") + + for _key in data: + if from_keys: + f_key = from_keys[0] + if isinstance(_key, tuple): + # from_keys work with static component keys only, if we get + # tuple keys means we have done with checking of all component + # keys and not found any from key match so no need to check + # further in this direction + break + + if (f_key == "ALL") or (_key == f_key): + # from_key is ALL or matching with _key + child_res = select_keys_from_dict(data[_key], keys=keys, + from_keys=from_keys[1:] if len(from_keys) > 1 else [], + ignore_keys=ignore_keys, + 
save_param=save_param, config_param=config_param) + + else: + # no key match, need to check further + child_res = select_keys_from_dict(data[_key], keys=keys, + from_keys=from_keys, ignore_keys=ignore_keys, + save_param=save_param, config_param=config_param) + + if child_res: + if f_key == "ALL": + # It assumes ALL is only for top snapshot level + result_dict[(_key, "SNAPSHOT")] = copy.deepcopy(child_res) + else: + result_dict = deep_merge_dicts( + result_dict, copy.deepcopy(child_res)) + + else: + # if (False, "*", None) in keys and isinstance(_key, tuple): + # result_dict[_key] = copy.deepcopy(data[_key]) + if isinstance(_key, tuple) and _key[1] == "KEY": + for check_substring, s_key, new_name in keys: + if ((s_key == "*" and not _is_key_in_ignore_keys(_key[0], ignore_keys)) or (check_substring and re.search(s_key, _key[0])) + or (not check_substring and _key[0] == s_key)): + + val_to_save = create_value_list_to_save(save_param=save_param, key=_key[0], value=data[_key], + formatting=not config_param) + + if new_name: + result_dict[(new_name, "KEY")] = create_health_internal_tuple(data[_key], val_to_save) + + else: + result_dict[_key] = create_health_internal_tuple(data[_key], val_to_save) + + break + + elif data[_key] and isinstance(data[_key], dict): + child_res = select_keys_from_dict(data[_key], keys=keys, ignore_keys=ignore_keys, save_param=save_param, config_param=config_param) + if child_res: + if isinstance(_key, tuple): + result_dict[_key] = copy.deepcopy(child_res) + else: + result_dict = deep_merge_dicts(result_dict, + copy.deepcopy(child_res)) + + return result_dict + + # Recursive worker functions to apply operation -def apply_operator(data, key, op_fn, group_by=None, arg2=None, recurse=False, on_all_keys = True): +def apply_operator(data, key, op_fn, group_by=None, arg2=None, recurse=False, on_all_keys=True, save_param=None, update_saved_list=False): res_dict = {} if not data or not isinstance(data, dict): raise HealthException("Wrong Input Data ") 
@@ -551,18 +723,18 @@ def apply_operator(data, key, op_fn, group_by=None, arg2=None, recurse=False, on # User merged key for aggregation result if on_all_keys: # Apply operation on all leaf values - res_dict[k] = op_fn(find_kv_vector(NOKEY, data[_key], True)) + res_dict[k] = op_fn(find_kv_vector(NOKEY, data[_key], recurse=True, update_saved_list=update_saved_list), save_param) else: # Apply operation on next level only, no further if isinstance(data[_key], dict): # Next level is dict, so apply operation on keys - res_dict[k] = op_fn(data[_key].keys()) + res_dict[k] = op_fn(data[_key].keys(), save_param) else: # Next level is not dict, so apply operation on value - res_dict[k] = op_fn([data[_key]]) + res_dict[k] = op_fn([data[_key]], save_param) else: res_dict[_key] = apply_operator( - data[_key], k, op_fn, group_by, arg2, recurse, on_all_keys=on_all_keys) + data[_key], k, op_fn, group_by, arg2, recurse, on_all_keys=on_all_keys, save_param=save_param, update_saved_list=update_saved_list) return res_dict @@ -582,7 +754,21 @@ def find_data_vector(data_dict): return v -def find_kv_vector(key, data, recurse=False): +def add_prefix_to_saved_keys(prefix, data): + if not prefix or not data or not data[1]: + return data + + new_saved_value_list = [] + for i in data[1]: + _k = prefix + if i[0] and len(i[0].strip()) > 0: + _k += "/%s"%(i[0]) + new_saved_value_list.append((_k, i[1], i[2])) + + return create_health_internal_tuple(data[0], new_saved_value_list) + + +def find_kv_vector(key, data, recurse=False, update_saved_list=False): """ Function takes a arbitrary next dictionary and creates vector of based level key and value pair in form @@ -598,11 +784,23 @@ def find_kv_vector(key, data, recurse=False): if data is None: return v + if not isinstance(data, dict): + k = merge_key(key, " ", recurse) + v.append(make_map(k, data)) + return v + for _key in sorted(data.keys()): k = merge_key(key, _key, recurse) if not isinstance(data[_key], dict): - v.append(make_map(k, 
data[_key])) + + if _key[1] == "KEY": + _k = key + + else: + _k = k + v.append(make_map(k, add_prefix_to_saved_keys(_k, data[_key]) if update_saved_list else data[_key])) + # v.append(make_map(k, data[_key])) else: - v.extend(find_kv_vector(k, data[_key], recurse)) + v.extend(find_kv_vector(k, data[_key], recurse=recurse, update_saved_list=update_saved_list)) return v diff --git a/lib/health/parser.py b/lib/health/parser.py index 808a1861..26fea263 100644 --- a/lib/health/parser.py +++ b/lib/health/parser.py @@ -16,10 +16,10 @@ import re from lib.health.commands import select_keys, do_assert, do_operation, do_assert_if_check -from lib.health.constants import AssertLevel +from lib.health.constants import AssertLevel, HEALTH_PARSER_VAR, MAJORITY from lib.health.exceptions import SyntaxException from lib.health.operation import do_multiple_group_by -from lib.health.util import h_eval, create_snapshot_key +from lib.health.util import create_health_internal_tuple, create_snapshot_key, h_eval, is_health_parser_variable try: from ply import lex, yacc @@ -49,10 +49,13 @@ class HealthLexer(object): 'DEVICE_INTERRUPTS': 'DEVICE_INTERRUPTS', 'DEVICE_STAT': 'DEVICE_STAT', 'DF': 'DF', + 'DMESG': 'DMESG', 'ENDPOINTS': 'ENDPOINTS', 'FREE': 'FREE', 'INTERRUPTS': 'INTERRUPTS', 'IOSTAT': 'IOSTAT', + 'LSCPU': 'LSCPU', + 'LIMITS' : 'LIMITS', 'MEM': 'MEM', 'MEMINFO': 'MEMINFO', 'METADATA': 'METADATA', @@ -65,6 +68,9 @@ class HealthLexer(object): 'SERVICES': 'SERVICES', 'STATISTICS': 'STATISTICS', 'SWAP': 'SWAP', + 'SYSCTLALL': 'SYSCTLALL', + 'HDPARM': 'HDPARM', + 'IPTABLES' : 'IPTABLES', 'TASKS': 'TASKS', 'TOP': 'TOP', 'UDF': 'UDF', @@ -99,19 +105,24 @@ class HealthLexer(object): agg_ops = { 'AND': 'AND', - 'OR' : 'OR', 'AVG': 'AVG', - 'SUM': 'SUM', + 'COUNT': 'COUNT', + 'COUNT_ALL': 'COUNT_ALL', 'EQUAL': 'EQUAL', 'MAX': 'MAX', 'MIN': 'MIN', - 'COUNT': 'COUNT', - 'COUNT_ALL': 'COUNT_ALL' + 'OR' : 'OR', + 'SUM': 'SUM' } complex_ops = { 'DIFF': 'DIFF', - 'SD_ANOMALY': 'SD_ANOMALY' + 
'SD_ANOMALY': 'SD_ANOMALY', + 'NO_MATCH': 'NO_MATCH' + } + + complex_params = { + 'MAJORITY': MAJORITY, } assert_ops = { @@ -124,21 +135,23 @@ class HealthLexer(object): } reserved = { - 'select': 'SELECT', - 'from': 'FROM', 'as': 'AS', + 'by': 'BY', + 'common': 'COMMON', 'do': 'DO', + 'from': 'FROM', 'group': 'GROUP', - 'by': 'BY', + 'ignore': 'IGNORE', 'like': 'LIKE', 'on': 'ON', - 'common': 'COMMON' + 'save': 'SAVE', + 'select': 'SELECT' } - tokens = ['NUMBER', 'BOOL_VAL', + tokens = ['NUMBER', 'FLOAT', 'BOOL_VAL', 'VAR', 'NEW_VAR', 'COMPONENT', 'GROUP_ID', 'COMPONENT_AND_GROUP_ID', - 'AGG_OP', 'COMPLEX_OP', 'ASSERT_OP', 'ASSERT_LEVEL', + 'AGG_OP', 'COMPLEX_OP', 'COMPLEX_PARAM', 'ASSERT_OP', 'ASSERT_LEVEL', 'STRING', 'COMMA', 'DOT', 'PLUS', 'MINUS', @@ -152,6 +165,11 @@ class HealthLexer(object): 'PCT', ] + list(reserved.values()) + def t_FLOAT(self, t): + r'\d+(\.(\d+)?([eE][-+]?\d+)?|[eE][-+]?\d+)' + t.value = float(t.value) + return t + def t_NUMBER(self, t): r'\d+' t.value = int(t.value) @@ -180,6 +198,9 @@ def t_VAR(self, t): t.type = "AGG_OP" elif t.value in HealthLexer.complex_ops.keys(): t.type = "COMPLEX_OP" + elif t.value in HealthLexer.complex_params.keys(): + t.value = HealthLexer.complex_params[t.value] + t.type = "COMPLEX_PARAM" elif t.value in HealthLexer.assert_ops.keys(): t.type = "ASSERT_OP" elif t.value in HealthLexer.assert_levels.keys(): @@ -187,7 +208,7 @@ def t_VAR(self, t): t.type = "ASSERT_LEVEL" elif t.value in HealthVars: t.type = "VAR" - t.value = (t.value, copy.deepcopy(HealthVars[t.value])) + t.value = (HEALTH_PARSER_VAR, t.value, copy.deepcopy(HealthVars[t.value])) return t def t_STRING(self, t): @@ -257,12 +278,13 @@ def p_statement(self, p): if len(p) > 2 and p[2] is not None: if isinstance(p[2], Exception): val = None - elif isinstance(p[2], tuple): - val = p[2][1] + elif is_health_parser_variable(p[2]): + val = p[2][2] else: val = p[2] - if isinstance(p[1], tuple): - HealthVars[p[1][0]] = val + + if 
is_health_parser_variable(p[1]): + HealthVars[p[1][1]] = val else: HealthVars[p[1]] = val p[0] = val @@ -295,27 +317,46 @@ def p_agg_operation(self, p): def p_complex_operation(self, p): """ - complex_operation : COMPLEX_OP LPAREN operand COMMA comparison_op COMMA operand RPAREN + complex_operation : COMPLEX_OP LPAREN operand COMMA comparison_op COMMA complex_comparison_operand RPAREN """ p[0] = (p[1], p[3], None, p[5], p[7], False) + def p_complex_comparison_operand(self, p): + """ + complex_comparison_operand : COMPLEX_PARAM + | operand + """ + if is_health_parser_variable(p[1]): + p[0] = p[1][2] + + elif not isinstance(p[1], tuple): + p[0] = create_health_internal_tuple(p[1], []) + + else: + p[0] = p[1] + def p_operand(self, p): """ operand : VAR - | number - | STRING - | BOOL_VAL + | constant """ - if isinstance(p[1], tuple): - p[0] = p[1][1] + if is_health_parser_variable(p[1]): + p[0] = p[1][2] else: - p[0] = h_eval(p[1]) + p[0] = create_health_internal_tuple(p[1], []) + + def p_value(self, p): + """ + value : NUMBER + | FLOAT + """ + p[0] = p[1] def p_number(self, p): """ - number : NUMBER - | PLUS NUMBER - | MINUS NUMBER + number : value + | PLUS value + | MINUS value """ if len(p) == 2: p[0] = p[1] @@ -389,7 +430,7 @@ def p_group_by_statement(self, p): group_by_statement : group_by_clause VAR """ try: - p[0] = do_multiple_group_by(p[2][1], p[1]) + p[0] = do_multiple_group_by(p[2][2], p[1]) except Exception as e: p[0] = e @@ -419,16 +460,32 @@ def p_cmd_statement(self, p): def p_op_statement(self, p): """ - op_statement : opt_group_by_clause DO simple_operation - | opt_group_by_clause DO agg_operation - | opt_group_by_clause DO complex_operation + op_statement : opt_group_by_clause DO simple_operation opt_save_clause + | opt_group_by_clause DO agg_operation opt_save_clause + | opt_group_by_clause DO complex_operation opt_save_clause """ try: p[0] = do_operation(op=p[3][0], arg1=p[3][1], arg2=p[3][2], group_by=p[ - 1], result_comp_op=p[3][3], 
result_comp_val=p[3][4], on_common_only=p[3][5]) + 1], result_comp_op=p[3][3], result_comp_val=p[3][4], on_common_only=p[3][5], save_param=p[4]) except Exception as e: p[0] = e + def p_opt_save_clause(self, p): + """ + opt_save_clause : SAVE opt_as_clause + | + """ + if len(p) == 3: + if p[2] is None: + # No keyname entered so use same as value keyname + p[0] = "" + else: + # Keyname entered + p[0] = p[2] + else: + # No data saving + p[0] = None + def p_assert_statement(self, p): """ assert_statement : ASSERT_OP LPAREN assert_arg COMMA assert_comparison_arg COMMA error_string COMMA assert_category COMMA ASSERT_LEVEL COMMA assert_desc_string COMMA assert_success_msg COMMA assert_if_condition RPAREN @@ -461,7 +518,7 @@ def p_assert_statement(self, p): pass p[0] = do_assert( - op=p[1], data=data, check_val=True, error=p[7], category=p[9], level=p[11], description=p[13], success_msg=p[15]) + op=p[1], data=data, check_val=create_health_internal_tuple(True,[]), error=p[7], category=p[9], level=p[11], description=p[13], success_msg=p[15]) else: p[0] = do_assert( op=p[1], data=p[3], check_val=p[5], error=p[7], category=p[9], level=p[11], description=p[13], success_msg=p[15]) @@ -485,23 +542,23 @@ def p_opt_assert_if_arg2(self, p): def p_assert_arg(self, p): """ - assert_arg : VAR - | number - | STRING - | BOOL_VAL + assert_arg : operand """ - if isinstance(p[1], tuple): - p[0] = p[1][1] - else: - p[0] = p[1] + p[0] = p[1] def p_assert_comparison_arg(self, p): """ - assert_comparison_arg : number - | STRING - | BOOL_VAL + assert_comparison_arg : constant """ - p[0] = p[1] + p[0] = create_health_internal_tuple(p[1], []) + + def p_constant(self, p): + """ + constant : number + | STRING + | BOOL_VAL + """ + p[0] = h_eval(p[1]) def p_assert_category(self, p): """ @@ -532,13 +589,13 @@ def p_error_string(self, p): def p_select_statement(self, p): """ - select_statement : SELECT select_keys opt_from_clause + select_statement : SELECT select_keys opt_from_clause opt_ignore_clause 
opt_save_clause | operand """ if len(p) > 2: try: - p[0] = select_keys( - data=self.health_input_data, select_keys=p[2], select_from_keys=p[3]) + p[0] = select_keys(data=self.health_input_data, select_keys=p[2], + select_from_keys=p[3], ignore_keys=p[4], save_param=p[5]) except Exception as e: p[0] = e else: @@ -630,6 +687,42 @@ def p_select_key(self, p): else: p[0] = (False, p[1], p[2]) + def p_opt_ignore_clause(self, p): + """ + opt_ignore_clause : IGNORE ignore_keys + | + """ + if len(p) == 1: + p[0] = [] + else: + p[0] = p[2] + + def p_ignore_keys(self, p): + """ + ignore_keys : ignore_keys COMMA ignore_key + | ignore_key + """ + if len(p) > 2: + p[1].append(p[3]) + p[0] = p[1] + else: + p[0] = [p[1]] + + def p_ignore_key(self, p): + """ + ignore_key : LIKE LPAREN key RPAREN + | key + """ + if len(p) > 2: + pattern = p[3] + if not pattern.startswith("^"): + pattern = "^" + str(pattern) + if not pattern.endswith("$"): + pattern += "$" + p[0] = (True, pattern) + else: + p[0] = (False, p[1]) + def p_key(self, p): """ key : STRING diff --git a/lib/health/query.py b/lib/health/query.py index 4ec1212f..301761b1 100644 --- a/lib/health/query.py +++ b/lib/health/query.py @@ -25,8 +25,57 @@ // SET CONSTRAINT VERSION IN [3.8.4, 3.10.0]; SET CONSTRAINT VERSION ALL; +/* System checks */ + +limit = select "Soft_Max_open_files" as "fd" from SYSTEM.LIMITS save; +limit = group by CLUSTER, NODE, KEY do SUM(limit); +config = select "proto-fd-max" as "fd" from SERVICE.CONFIG save; +r = do config < limit; +ASSERT(r, True, "File descriptor is configured higher than limit.", "LIMITS", INFO, + "Listed node[s] have proto-fd-limit set higher than system soft limit of Max open files. 
Aerospike process may run out of file descriptor, Possible misconfiguration.", + "System open file descriptor limit check."); + +s = select * from SYSTEM.HDPARM save; +r = group by KEY do NO_MATCH(s, ==, MAJORITY) save; +ASSERT(r, False, "Different Disk Hardware in cluster.", "OPERATIONS", INFO, + "Different disk hardware configuration across multiple nodes in cluster.", "Disk hardware check."); + +s = select "OOM" from SYSTEM.DMESG save; +ASSERT(s, False, "DMESG: Process Out of Memory kill.", "OPERATIONS", INFO, + "Certain process was killed due to Out Of Memory. Check dmesg or system log.", + "System OOM kill check."); + +s = select "Blocked" from SYSTEM.DMESG save; +ASSERT(s, False, "DMESG: Process blocking.", "OPERATIONS", INFO, + "Certain process was blocked for more than 120sec. Check dmesg or system log.", + "System process blocking Check."); + +s = select "OS" from SYSTEM.DMESG save; +r = group by NODE do NO_MATCH(s, ==, MAJORITY) save; +ASSERT(r, False, "Different OS version in cluster.", "OPERATIONS", INFO, + "Different version of OS running across multiple nodes in cluster.", "OS version check."); + +s = select * from SYSTEM.LSCPU save; +r = group by KEY do NO_MATCH(s, ==, MAJORITY) save; +ASSERT(r, False, "CPU configuration mismatch.", "OPERATIONS", INFO, + "Listed node[s] in the cluster are running with different CPU or CPU setting, performance may be skewed. Please run 'lscpu' to check CPU configuration.", + "CPU config check."); + +s = select "vm_drop_caches", "vm_nr_hugepages", "vm_nr_hugepages_policy", "vm_numa_zonelist_order", "vm_oom_dump_tasks", "vm_oom_kill_allocating_task", "vm_zone_reclaim_mode", "vm_swapiness", + "vm_nr_overcommit_hugepages", "kernel_shmmax", "kernel_shmall", "kernel_version" from SYSTEM.SYSCTLALL save; +r = group by KEY do NO_MATCH(s, ==, MAJORITY); +ASSERT(r, False, "Sysctl configuration mismatch.", "OPERATIONS", INFO, + "Listed node[s] in the cluster are running with different Sysctl setting. 
Please run 'sysctl -a' to check CPU configuration.", + "Sysctl config check."); + +s = select "has_firewall" from SYSTEM.IPTABLES; +ASSERT(s, False, "Node in cluster have firewall setting.", "OPERATIONS", INFO, + "Listed node[s] have firewall setting. Could cause cluster formation issue if misconfigured. Please run 'iptables -L' to check firewall rules.", + "Firewall Check."); + + /* Disk */ -s = select "%util" from SYSTEM.IOSTAT; +s = select "%util" from SYSTEM.IOSTAT save; r = do s > 90; ASSERT(r, False, "High system disk utilization.", "PERFORMANCE", CRITICAL, "Listed disks show higher than normal (> 90%) disk utilization at the time of sampling. Please run 'iostat' command to check disk utilization. Possible causes can be disk overload due to undersized cluster or some issue with disk hardware itself. If running on cloud, can be a problem with cloud instance itself.", @@ -37,9 +86,9 @@ "Disk utilization Anomaly."); -avail=select like(".*available_pct") as "free_disk" from NAMESPACE.STATISTICS; -disk_free = select "device_free_pct" as "free_disk", "free-pct-disk" as "free_disk" from NAMESPACE.STATISTICS; -r = do disk_free - avail; +avail=select like(".*available_pct") as "free_disk" from NAMESPACE.STATISTICS save; +disk_free = select "device_free_pct" as "free_disk", "free-pct-disk" as "free_disk" from NAMESPACE.STATISTICS save; +r = do disk_free - avail save as "fragmented blocks pct"; r = do r <= 30; r = group by CLUSTER, NAMESPACE r; ASSERT(r, True, "High (> 30%) fragmented blocks.", "PERFORMANCE", WARNING, @@ -47,7 +96,7 @@ "Fragmented Blocks check."); -s = select "%iowait" from SYSTEM.IOSTAT; +s = select "%iowait" from SYSTEM.IOSTAT save; r = do s > 10; ASSERT(r, False, "High (> 10%) CPU IO wait time.", "PERFORMANCE", WARNING, "Listed nodes show higher than normal (> 10%) CPU spent in io wait. Please run 'iostat' command to check utilization. 
Possible cause can be slow disk or network leading to lot of CPU time spent waiting for IO.", @@ -58,7 +107,7 @@ "CPU IO wait time anomaly."); -s = select "await" from SYSTEM.IOSTAT; +s = select "await" from SYSTEM.IOSTAT save; r = do s > 4; ASSERT(r, False, "High system disk average wait time.", "PERFORMANCE", WARNING, "Listed disks show higher than normal (> 4ms) disk average wait time. Please run 'iostat' command to check average wait time (await). Possible cause can be issue with disk hardware or VM instance in case you are running in cloud environment. This may also be caused by having storage over network like say SAN device or EBS.", @@ -69,7 +118,7 @@ "Disk average wait time anomaly check."); -s = select "avgqu-sz" from SYSTEM.IOSTAT; +s = select "avgqu-sz" from SYSTEM.IOSTAT save; r = do s > 7; ASSERT(r, False, "High disk average queue size.", "PERFORMANCE", INFO, "Listed disks show higher than normal (> 7) disk average queue size. This is not a issue if using NVME drives which support more queues. Please run 'iostat' command to check average wait time (avgqu-sz). Possible disk overload. This may be non-issue of disk has more than 7 queues. Please analyze this number in conjunction with utilization.", @@ -80,8 +129,8 @@ "Disk avg queue size anomaly check."); -s = select "id" as "cpu_use" from SYSTEM.TOP.CPU_UTILIZATION; -s = do 100 - s; +s = select "id" as "cpu_use" from SYSTEM.TOP.CPU_UTILIZATION save as "cpu_idle_pct"; +s = do 100 - s save as "cpu utilization pct"; r = do s > 70; ASSERT(r, False, "High system CPU utilization.", "PERFORMANCE", CRITICAL, "Listed node[s] are showing higher than normal (> 70%) CPU utilization. Please check top output. 
Possible system overload.", @@ -92,21 +141,21 @@ "CPU utilization anomaly check."); -s = select "resident_memory" from SYSTEM.TOP; +s = select "resident_memory" from SYSTEM.TOP save; r = group by KEY do SD_ANOMALY(s, ==, 3); ASSERT(r, False, "Skewed cluster resident memory utilization.", "ANOMALY", WARNING, "Listed node[s] show different resident memory usage compared to other node[s]. Please run top command on those node[s] to confirm such behavior. Possible skewed data distribution. This may be non-issue in case migrations are going on.", "Resident memory utilization anomaly."); -s = select "system_swapping" from SERVICE.STATISTICS; +s = select "system_swapping" from SERVICE.STATISTICS save; r = do s == true; ASSERT(r, False, "System memory swapping.", "LIMITS", INFO, "Listed node[s] are swapping. Please run 'show statistics service like system_swapping' to confirm such behaviour. Possible misconfiguration. This may be non-issue if amount of swap is small and good amount of memory available.", "System swap check."); /* TODO - is it really actually an issue */ -s = select "system_free_mem_pct"; +s = select "system_free_mem_pct" from SERVICE.STATISTICS save; r = do s < 20; ASSERT(r, False, "Low system memory percentage.", "LIMITS", CRITICAL, "Listed node[s] have lower than normal (< 20%) system free memory percentage. Please run 'show statistics service like system_free_mem_pct' to get actual values. Possible misconfiguration.", @@ -115,7 +164,7 @@ /* NB : ADD CHECKS IF NODES ARE NOT HOMOGENOUS MEM / NUM CPU etc */ -s = select "available_bin_names", "available-bin-names" from NAMESPACE; +s = select "available_bin_names", "available-bin-names" from NAMESPACE save; r = group by NAMESPACE do s > 3200; ASSERT(r, True, "Low namespace available bin names.", "LIMITS", WARNING, "Listed node[s] have low available bin name (< 3200) for corresponding namespace[s]. Maximum unique bin names allowed per namespace are 32k. 
Please run 'show statistics namespace like available' to get actual values. Possible improperly modeled data.", @@ -124,29 +173,29 @@ /* Holds only upto 4B key */ SET CONSTRAINT VERSION < 3.12; -s = select "memory-size" from NAMESPACE; -r = group by NODE, NAMESPACE do SUM(s); +s = select "memory-size" from NAMESPACE.CONFIG save; +r = group by CLUSTER, NODE, NAMESPACE do SUM(s); e = do r <= 274877906944; ASSERT(e, True, "Namespace configured to use more than 256G.", "LIMITS", WARNING, - "On list nodes namespace as mentioned have configured more than 256G of memory. Namespace with data not in memory can have max upto 4billion keys and can utilize only up to 256G. Please run 'show statistics namespace like memory-size' to check configured memory.", + "On listed nodes namespace as mentioned have configured more than 256G of memory. Namespace with data not in memory can have max upto 4 billion keys and can utilize only up to 256G. Please run 'show statistics namespace like memory-size' to check configured memory.", "Namespace per node memory limit check."); SET CONSTRAINT VERSION ALL; /* -Following query selects assigned memory-size from namespace statistics and total ram size from system statistics. +Following query selects assigned memory-size from namespace config and total ram size from system statistics. group by for namespace stats sums all memory size and gives node level memory size. group by for system stats helps to remove key, this is requirement for proper matching for simple operations. 
*/ -s = select "memory-size" from NAMESPACE; -n = group by NODE do SUM(s); +s = select "memory-size" from NAMESPACE.CONFIG save; +n = group by NODE do SUM(s) save as "sum of memory-size"; s = select "total" from SYSTEM.FREE.MEM; -m = group by NODE do SUM(s); +m = group by NODE do SUM(s) save as "total physical memory"; r = do n <= m on common; ASSERT(r, True, "Namespace memory misconfiguration.", "LIMITS", WARNING, "Listed node[s] have more namespace memory configured than available physical memory. Please run 'show statistics namespace like memory-size' to check configured memory and check output of 'free' for system memory. Possible namespace misconfiguration.", "Namespace memory configuration check."); -r = do m - n on common; +r = do m - n on common save as "runtime memory"; r = do r >= 5368709120; ASSERT(r, True, "Aerospike runtime memory configured < 5G.", "LIMITS", INFO, "Listed node[s] have less than 5G free memory available for Aerospike runtime. Please run 'show statistics namespace like memory-size' to check configured memory and check output of 'free' for system memory. Possible misconfiguration.", @@ -157,30 +206,40 @@ Following query selects proto-fd-max from service config and client_connections from service statistics. It uses as clause to get proper matching structure for simple operation. */ -max = select "proto-fd-max" as "fd" from SERVICE.CONFIG; -conn = select "client_connections" as "fd" from SERVICE.STATISTICS; +max = select "proto-fd-max" as "fd" from SERVICE.CONFIG save; +conn = select "client_connections" as "fd" from SERVICE.STATISTICS save; bound = do 80 %% max; r = do conn > bound; ASSERT(r, False, "High system client connections.", "OPERATIONS", WARNING, - "Listed node[s] show higher than normal (> 80%) client-connections of the max configured proto-fd-max. Please run 'show config like proto-fd-max' and 'show statistics like client_connections' for actual values. 
Possible can be network issue / improper client behavior / FD leak.", + "Listed node[s] show higher than normal client-connections (> 80% of the max configured proto-fd-max). Please run 'show config like proto-fd-max' and 'show statistics like client_connections' for actual values. Possible can be network issue / improper client behavior / FD leak.", "Client connections check."); -s = select like(".*available_pct") from NAMESPACE.STATISTICS; +s = select like(".*available_pct") from NAMESPACE.STATISTICS save; r = do s < 20; ASSERT(r, False, "Low namespace disk available pct.", "OPERATIONS", WARNING, - "Listed namespace[s] have lower than normal (< 20 %). Please run 'show statistics namespace like available_pct' to check available disk space. Probable cause - namespace size misconfiguration.", + "Listed namespace[s] have lower than normal (< 20 %) available disk space. Please run 'show statistics namespace like available_pct' to check available disk space. Probable cause - namespace size misconfiguration.", "Namespace disk available pct check."); -s = select * from SERVICE.CONFIG; -r = group by KEY do EQUAL(s); -ASSERT(r, True, "Different service configurations.", "OPERATIONS", WARNING, +s = select * from SERVICE.CONFIG ignore "pidfile", "heartbeat.mtu", like(".*address"), like(".*port") save; +r = group by CLUSTER, KEY do NO_MATCH(s, ==, MAJORITY) save; +ASSERT(r, False, "Different service configurations.", "OPERATIONS", WARNING, "Listed Service configuration[s] are different across multiple nodes in cluster. Please run 'show config service diff' to check different configuration values. 
Probable cause - config file misconfiguration.", "Service configurations difference check."); +multicast_mode_enabled = select like(".*mode") from NETWORK.CONFIG; +multicast_mode_enabled = do multicast_mode_enabled == "multicast"; +multicast_mode_enabled = group by CLUSTER, NODE do OR(multicast_mode_enabled); +s = select like(".*mtu") from SERVICE.CONFIG save; +r = group by CLUSTER do NO_MATCH(s, ==, MAJORITY) save; +ASSERT(r, False, "Different heartbeat.mtu.", "OPERATIONS", WARNING, + "Listed node[s] have a different heartbeat.mtu configured. A multicast packet can only be as large as the interface mtu. Different mtu values might create cluster stability issue. Please contact Aerospike Support team.", + "heartbeat.mtu check.", + multicast_mode_enabled); + -s = select "migrate-threads", "migrate_threads" from SERVICE.CONFIG; +s = select "migrate-threads", "migrate_threads" from SERVICE.CONFIG save; r = do s > 1; ASSERT(r, False, "> 1 migrate thread configured.", "OPERATIONS", INFO, "Listed node[s] are running with higher than normal (> 1) migrate threads. Please run 'show config service like migrate-threads' to check migration configuration. Is a non-issue if requirement is to run migration aggressively. Otherwise possible operational misconfiguration.", @@ -188,27 +247,35 @@ /* Device Configuration */ -s = select "device_total_bytes", "device-total-bytes", "total-bytes-disk" from NAMESPACE.STATISTICS; -r = group by NAMESPACE do EQUAL(s); -ASSERT(r, True, "Different namespace device size configuration.", "OPERATIONS", WARNING, +s = select "device_total_bytes", "device-total-bytes", "total-bytes-disk" from NAMESPACE.STATISTICS save; +r = group by CLUSTER, NAMESPACE do NO_MATCH(s, ==, MAJORITY) save; +ASSERT(r, False, "Different namespace device size configuration.", "OPERATIONS", WARNING, "Listed namespace[s] have difference in configured disk size. Please run 'show statistics namespace like bytes' to check total device size. 
Probable cause - config file misconfiguration.", "Namespace device size configuration difference check."); -hwm = select "high-water-disk-pct" from NAMESPACE.CONFIG; +hwm = select "high-water-disk-pct" from NAMESPACE.CONFIG save; hwm = group by CLUSTER, NAMESPACE hwm; r = do hwm == 50; ASSERT(r, True, "Non-default namespace device high water mark configuration.", "OPERATIONS", INFO, "Listed namespace[s] have non-default high water mark configuration. Please run 'show config namespace like high-water-disk-pct' to check value. Probable cause - config file misconfiguration.", "Non-default namespace device high water mark check."); -hwm = select "high-water-disk-pct" as "defrag-lwm-pct" from NAMESPACE.CONFIG; -lwm = select like(".*defrag-lwm-pct") as "defrag-lwm-pct" from NAMESPACE.CONFIG; +lwm = select like(".*defrag-lwm-pct") from NAMESPACE.CONFIG save; +lwm = group by CLUSTER, NAMESPACE lwm; +r = do lwm == 50; +ASSERT(r, True, "Non-default namespace device low water mark configuration.", "OPERATIONS", INFO, + "Listed namespace[s] have non-default low water mark configuration. Probable cause - config file misconfiguration.", + "Non-default namespace device low water mark check."); + +hwm = select "high-water-disk-pct" as "defrag-lwm-pct" from NAMESPACE.CONFIG save; +lwm = select like(".*defrag-lwm-pct") as "defrag-lwm-pct" from NAMESPACE.CONFIG save; r = do lwm < hwm on common; r = group by CLUSTER, NAMESPACE r; ASSERT(r, False, "Defrag low water mark misconfigured.", "OPERATIONS", WARNING, "Listed namespace[s] have defrag-lwm-pct lower than high-water-disk-pct. This might create situation like no block to write, no eviction and no defragmentation. Please run 'show config namespace like high-water-disk-pct defrag-lwm-pct' to check configured values. 
Probable cause - namespace watermark misconfiguration.", "Defrag low water mark misconfiguration check."); + /* Following query collects used device space and total device space and computes available free space on each node per namespace per cluster (group by CLUSTER, NAMESPACE, NODE). It collects cluster-size and uses it to find out expected data distribution for each node in case that node is down. It checks max of this computed value per namespace @@ -219,13 +286,13 @@ u = select "used-bytes-disk" as "disk_space", "device_used_bytes" as "disk_space" from NAMESPACE.STATISTICS; /* Available extra space */ e = do t - u; -e = group by CLUSTER, NAMESPACE, NODE do SUM(e); +e = group by CLUSTER, NAMESPACE, NODE do SUM(e) save as "available device space"; s = select "cluster_size" as "size" from SERVICE; -n = do AVG(s); +n = do MAX(s); n = do n - 1; /* Extra space need if 1 node goes down */ e1 = do u / n; -e1 = group by CLUSTER, NAMESPACE do MAX(e1); +e1 = group by CLUSTER, NAMESPACE do MAX(e1) save as "distribution share of used device space per node"; r = do e > e1; ASSERT(r, True, "Namespace under configured (disk) for single node failure.", "OPERATIONS", WARNING, "Listed namespace[s] does not have enough disk space configured to deal with increase in data per node in case of 1 node failure. Please run 'show statistics namespace like bytes' to check device space. 
It is non-issue if single replica limit is set to larger values, i.e if number of replica copies are reduced in case of node loss.", @@ -234,18 +301,17 @@ /* Same as above query but for memory */ -t = select "memory-size" as "mem" from NAMESPACE; +t = select "memory-size" as "mem" from NAMESPACE.CONFIG; u = select "used-bytes-memory" as "mem", "memory_used_bytes" as "mem" from NAMESPACE.STATISTICS; /* Available extra space */ e = do t - u; -e = group by CLUSTER, NAMESPACE, NODE do SUM(e); - +e = group by CLUSTER, NAMESPACE, NODE do SUM(e) save as "available memory space"; s = select "cluster_size" as "size" from SERVICE; -n = do AVG(s); +n = do MAX(s); n = do n - 1; /* Extra space need if 1 node goes down */ e1 = do u / n; -e1 = group by CLUSTER, NAMESPACE do MAX(e1); +e1 = group by CLUSTER, NAMESPACE do MAX(e1) save as "distribution share of used memory space per node"; r = do e > e1; ASSERT(r, True, "Namespace under configured (memory) for single node failure.", "OPERATIONS", WARNING, "Listed namespace[s] does not have enough memory space configured to deal with increase in data per node in case of 1 node failure. Please run 'show statistics namespace like bytes' to check memory space. It is non-issue if single replica limit is set to larger values, i.e number of replica copies reduce.", @@ -254,54 +320,55 @@ /* Namespace Configuration */ +SET CONSTRAINT VERSION < 3.13; + nsid = select "nsid" from NAMESPACE.CONFIG; -r = group by CLUSTER, NAMESPACE do EQUAL(nsid); -ASSERT(r, True, "Different namespace order in aerospike conf.", "OPERATIONS", CRITICAL, +r = group by CLUSTER, NAMESPACE do NO_MATCH(nsid, ==, MAJORITY) save; +ASSERT(r, False, "Different namespace order in aerospike conf.", "OPERATIONS", CRITICAL, "Listed namespace[s] have different order on different nodes. 
Please check aerospike conf file on all nodes and change configuration to make namespace order same.", "Namespace order check."); +SET CONSTRAINT VERSION ALL; + repl = select "replication-factor", "repl-factor" from NAMESPACE.CONFIG; repl = group by CLUSTER, NAMESPACE repl; -ns_count = group by CLUSTER do COUNT(repl); -ns_count_per_node = group by CLUSTER, NODE do COUNT(repl); +ns_count = group by CLUSTER do COUNT(repl) save as "total available namespaces for cluster"; +ns_count_per_node = group by CLUSTER, NODE do COUNT(repl) save as "namespace count"; r = do ns_count_per_node == ns_count; ASSERT(r, True, "Disparate namespaces.", "OPERATIONS", WARNING, "Listed node[s] do not have all namespaces configured. Please check aerospike conf file on all nodes and change namespace configuration as per requirement.", "Namespaces per node count check."); -r = select "replication-factor", "repl-factor" from NAMESPACE.CONFIG; +r = select "replication-factor", "repl-factor" from NAMESPACE.CONFIG save; r = group by CLUSTER, NAMESPACE r; r = do r == 2; ASSERT(r, True, "Non-default namespace replication-factor configuration.", "OPERATIONS", INFO, "Listed namespace[s] have non-default replication-factor configuration. Please run 'show config namespace like repl' to check value. It may be non-issue in case namespace are configured for user requirement. Ignore those.", "Non-default namespace replication-factor check."); -s = select * from NAMESPACE.CONFIG; -r = group by NAMESPACE, KEY do EQUAL(s); -ASSERT(r, True, "Different namespace configurations.", "OPERATIONS", WARNING, +s = select * from NAMESPACE.CONFIG ignore "rack-id", like(".*device"), like(".*file") save; +r = group by CLUSTER, NAMESPACE, KEY do NO_MATCH(s, ==, MAJORITY) save; +ASSERT(r, False, "Different namespace configurations.", "OPERATIONS", WARNING, "Listed namespace configuration[s] are different across multiple nodes in cluster. Please run 'show config namespace diff' to get actual difference. 
It may be non-issue in case namespace are configured with different device or file name etc. Ignore those.", "Namespace configurations difference check."); - -s = select like(".*_err.*") from SERVICE.STATISTICS; +/* Errors */ +s = select like(".*_err.*") from SERVICE.STATISTICS save; u = select "uptime" from SERVICE.STATISTICS; -u = group by CLUSTER, NODE do SUM(u); +u = group by CLUSTER, NODE do MAX(u); s = do s / u; r = group by KEY do SD_ANOMALY(s, ==, 3); -ASSERT(r, False, "Skewed cluster service errors count.", "ANOMALY", WARNING, +ASSERT(r, False, "Skewed cluster service errors count.", "ANOMALY", INFO, "Listed service errors[s] show skew in error count patterns (for listed node[s]). Please run 'show statistics service like err' for details.", "Service errors count anomaly check."); -s = select like(".*_error") from NAMESPACE.STATISTICS; -u = select "uptime" from SERVICE.STATISTICS; -u = group by CLUSTER, NODE do MAX(u); -s = do s / u on common; -d = group by NAMESPACE, KEY do SUM(s); -e = do d == 0; -ASSERT(e, True, "Non-zero namespace errors count.", "OPERATIONS", WARNING, - "Listed namespace error[s] show skew in count (for nodes). It may or may not be an issue depending on the error type. Please run 'show statistics namespace like error' for details.", - "Namespace errors count check."); +e = select "hwm_breached", "hwm-breached" from NAMESPACE.STATISTICS; +e = group by CLUSTER, NAMESPACE e; +r = do e == False; +ASSERT(r, True, "Namespace HWM breached.", "OPERATIONS", WARNING, + "Listed namespace[s] show HWM breached for memory or Disks.", + "Namespace HWM breach check."); /* Following query collects master_objects, prole_objects and replication_factor, and computes proles for one replication (prole_objects/(replication_factor-1)). 
@@ -312,82 +379,82 @@ m = select "master_objects" as "cnt", "master-objects" as "cnt" from NAMESPACE.STATISTICS; p = select "prole_objects" as "cnt", "prole-objects" as "cnt" from NAMESPACE.STATISTICS; r = select "replication-factor", "repl-factor" from NAMESPACE.CONFIG; -m = select "migrate_rx_partitions_active", "migrate_progress_recv", "migrate-rx-partitions-active" from NAMESPACE.STATISTICS; -mt = group by NAMESPACE do SUM(m); +mg = select "migrate_rx_partitions_active", "migrate_progress_recv", "migrate-rx-partitions-active" from NAMESPACE.STATISTICS; +mt = group by NAMESPACE do SUM(m) save as "master_objects"; pt = group by NAMESPACE do SUM(p); r = group by NAMESPACE do MAX(r); -m = group by NAMESPACE do MAX(m); -migration_in_progress = do m > 0; +mg = group by NAMESPACE do MAX(mg); +no_migration = do mg == 0; + replication_enabled = do r > 1; r = do r - 1; -pt = do pt / r; -discounted_pt = do 95 %% pt; +pt = do pt / r save as "unique prole_objects"; +discounted_pt = do 95 %% pt save as "95% of unique prole_objects"; d = do discounted_pt > mt; d = do d && replication_enabled; -d = do d && migration_in_progress; +d = do d && no_migration; ASSERT(d, False, "Skewed namespace data distribution, prole objects exceed master objects by > 5%.", "DATA", INFO, "Listed namespace[s] show abnormal object distribution. It may not be an issue if migrations are in progress. Please run 'show statistics namespace like object' for actual counts.", "Namespace data distribution check (prole objects exceed master objects by > 5%)."); -discounted_mt = do 95 %% mt; +discounted_mt = do 95 %% mt save as "95% of master_objects"; d = group by NAMESPACE do discounted_mt > pt; d = do d && replication_enabled; -d = do d && migration_in_progress; +d = do d && no_migration; ASSERT(d, False, "Skewed namespace data distribution, master objects exceed prole objects by > 5%.", "DATA", INFO, "Listed namespace[s] show abnormal object distribution. 
It may not be an issue if migrations are in progress. Please run 'show statistics namespace like object' for actual counts.", "Namespace data distribution check (master objects exceed prole objects by > 5%)."); -s = select "set-delete", "deleting" as "set-delete" from SET; -r = group by NAMESPACE, SET do EQUAL(s); -ASSERT(r, True, "Different set delete status.", "OPERATIONS", INFO, +s = select "set-delete", "deleting" as "set-delete" from SET save; +r = group by CLUSTER, NAMESPACE, SET do NO_MATCH(s, ==, MAJORITY) save; +ASSERT(r, False, "Different set delete status.", "OPERATIONS", INFO, "Listed set[s] have different set delete status across multiple nodes in cluster. This is non-issue if set-delete is being performed. Nodes reset the status asynchronously. Please check if nsup is still delete data for the set.", "Set delete status check."); -s = select like ("disable-eviction") from SET; -r = group by NAMESPACE, SET do EQUAL(s); -ASSERT(r, True, "Different set eviction configuration.", "OPERATIONS", WARNING, +s = select like ("disable-eviction") from SET save; +r = group by CLUSTER, NAMESPACE, SET do NO_MATCH(s, ==, MAJORITY) save; +ASSERT(r, False, "Different set eviction configuration.", "OPERATIONS", WARNING, "Listed set[s] have different eviction setting across multiple nodes in cluster. Please run 'show statistics set like disable-eviction' to check values. Possible operational misconfiguration.", "Set eviction configuration difference check."); -s = select like ("set-enable-xdr") from SET; -r = group by NAMESPACE, SET do EQUAL(s); -ASSERT(r, True, "Different set xdr configuration.", "OPERATIONS", WARNING, +s = select like ("set-enable-xdr") from SET save; +r = group by CLUSTER, NAMESPACE, SET do NO_MATCH(s, ==, MAJORITY) save; +ASSERT(r, False, "Different set xdr configuration.", "OPERATIONS", WARNING, "Listed set[s] have different XDR replication setting across multiple nodes in cluster. Please run 'show statistics set like set-enable-xdr' to check values. 
Possible operational misconfiguration.", "Set xdr configuration difference check."); -s = select "n_objects", "objects" as "n_objects" from SET; -/* Should be Anomaly */ -r = group by NAMESPACE, SET do SD_ANOMALY(s, ==, 3); +s = select "n_objects", "objects" from SET save; +r = group by CLUSTER, NAMESPACE, SET do SD_ANOMALY(s, ==, 3); ASSERT(r, False, "Skewed cluster set object count.", "ANOMALY", WARNING, "Listed set[s] have skewed object distribution. Please run 'show statistics set like object' to check counts. It may be non-issue if cluster is undergoing migrations.", "Set object count anomaly check."); /* XDR */ -s = select * from XDR.CONFIG; -r = GROUP by KEY do EQUAL(s); -ASSERT(r, True, "Different XDR configurations.", "OPERATIONS", WARNING, +s = select * from XDR.CONFIG save; +r = GROUP by CLUSTER, KEY do NO_MATCH(s, ==, MAJORITY) save; +ASSERT(r, False, "Different XDR configurations.", "OPERATIONS", WARNING, "Listed XDR configuration[s] are different across multiple nodes in cluster. Please run 'show config xdr diff' to get difference. Possible operational misconfiguration.", "XDR configurations difference check."); -s = select * from XDR.STATISTICS; +s = select * from XDR.STATISTICS save; u = select "uptime" from SERVICE.STATISTICS; -u = group by CLUSTER, NODE do SUM(u); +u = group by CLUSTER, NODE do MAX(u); s = do s / u; -r = group by KEY do SD_ANOMALY(s, ==, 3); +r = group by CLUSTER, KEY do SD_ANOMALY(s, ==, 3); ASSERT(r, False, "Skewed cluster XDR statistics.", "ANOMALY", WARNING, "Listed XDR statistic[s] show skew for the listed node[s]. 
It may or may not be an issue depending on the statistic type.", "XDR statistics anomaly check."); -s = select * from DC.STATISTICS; +s = select * from DC.STATISTICS ignore "dc_size", "dc_state" save; u = select "uptime" from SERVICE.STATISTICS; -u = group by CLUSTER, NODE do SUM(u); +u = group by CLUSTER, NODE do MAX(u); s = do s / u on common; -r = group by DC, KEY do SD_ANOMALY(s, ==, 3); +r = group by CLUSTER, DC, KEY do SD_ANOMALY(s, ==, 3); ASSERT(r, False, "Skewed cluster remote DC statistics.", "ANOMALY", WARNING, "Listed DC statistic[s] show skew for the listed node[s]. Please run 'show statistics dc' to get all DC stats. May be non-issue if remote Data center connectivity behavior for nodes is not same.", "Remote DC statistics anomaly check."); @@ -399,51 +466,58 @@ */ xdr_enabled = select "enable-xdr" from XDR.CONFIG; xdr_enabled = group by CLUSTER, NODE do OR(xdr_enabled); +cluster_xdr_enabled = group by CLUSTER do OR(xdr_enabled); -s = select "xdr-dc-state", "dc_state" from DC.STATISTICS; -r = group by DC do EQUAL(s); -ASSERT(r, True, "Different remote DC states.", "OPERATIONS", WARNING, - "Listed node[s] have a different remote DC visibility. Please run 'show statistics dc like state' to see DC state. Possible network issue between data centers.", +s = select "xdr-dc-state", "dc_state" from DC.STATISTICS save; +r = group by CLUSTER, DC do NO_MATCH(s, ==, MAJORITY) save; +ASSERT(r, False, "Different remote DC states.", "OPERATIONS", WARNING, + "Listed DC[s] have a different remote DC visibility. Please run 'show statistics dc like state' to see DC state. Possible network issue between data centers.", "Remote DC state check.", xdr_enabled); -s = select "free-dlog-pct", "dlog_free_pct", "free_dlog_pct" from XDR; +s = select "dc_size" from DC.STATISTICS save; +r = group by CLUSTER, DC do NO_MATCH(s, ==, MAJORITY) save; +ASSERT(r, False, "Different remote DC sizes.", "OPERATIONS", WARNING, + "Listed DC[s] have a different remote DC size. 
Please run 'show statistics dc like size' to see DC size. Possible network issue between data centers.", + "Remote DC size check."); + +s = select "free-dlog-pct", "dlog_free_pct", "free_dlog_pct" from XDR save; r = do s < 95; ASSERT(r, False, "Low XDR free digest log space.", "OPERATIONS", INFO, "Listed node[s] have lower than ideal (95%) free digest log space. Please run 'show statistics xdr like free' to see digest log space. Probable cause - low XDR throughput or a failed node processing in progress.", "XDR free digest log space check.", xdr_enabled); -r = group by CLUSTER, NODE do SD_ANOMALY(s, ==, 3); +r = group by CLUSTER do SD_ANOMALY(s, ==, 3); ASSERT(r, False, "Skewed cluster XDR free digest log space.", "ANOMALY", WARNING, "Listed node[s] have different digest log free size pattern. Please run 'show statistics xdr like free' to see digest log space. May not be an issue if the nodes are newly added or have been restarted with noresume or if remote Datacenter connectivity behavior differs for nodes.", "XDR free digest log space anomaly check.", - xdr_enabled); + cluster_xdr_enabled); /* Needs normalization but not sure on what ?? */ -s = select "timediff_lastship_cur_secs", "xdr_timelag" from XDR.STATISTICS; +s = select "timediff_lastship_cur_secs", "xdr_timelag" from XDR.STATISTICS save; r = do s > 10; ASSERT(r, False, "High XDR shipping lag (> 10s).", "PERFORMANCE", WARNING, "Listed node[s] have higher than healthy ( > 10 sec) ship lag to remote data center. Please run 'show statistics xdr like time' to see shipping lag. Probable cause - connectivity issue to remote datacenter or spike in write throughput on the local cluster.", "XDR shipping lag check.", xdr_enabled); -r = group by CLUSTER, NODE do SD_ANOMALY(s, ==, 3); +r = group by CLUSTER do SD_ANOMALY(s, ==, 3); ASSERT(r, False, "Cluster XDR shipping lag skewed.", "ANOMALY", WARNING, "Listed node[s] have different ship lag patterns. Please run 'show statistics xdr like time' to see shipping lag. 
May not be an issue if the nodes are newly added or have been restarted with noresume or if remote Datacenter connectivity behavior differs for nodes.", "XDR shipping lag anomaly check.", - xdr_enabled); + cluster_xdr_enabled); -s = select "xdr-dc-timelag", "dc_timelag" from DC.STATISTICS; -r = group by DC do SD_ANOMALY(s, ==, 3); +s = select "xdr-dc-timelag", "dc_timelag" from DC.STATISTICS save; +r = group by CLUSTER, DC do SD_ANOMALY(s, ==, 3); ASSERT(r, False, "Skewed cluster remote DC Lag.", "ANOMALY", WARNING, "Listed node[s] have different latency to remote data center. Please run 'show statistics dc like timelag' to see timelag. Possible Data center connectivity issue.", "Remote DC lag anomaly check.", - xdr_enabled); + cluster_xdr_enabled); /* XDR xdr_read_latency_avg check */ -s = select "xdr_read_latency_avg", "local_recs_fetch_avg_latency" from XDR.STATISTICS; +s = select "xdr_read_latency_avg", "local_recs_fetch_avg_latency" from XDR.STATISTICS save; r = do s > 2; ASSERT(r, False, "High XDR average read latency (>2 sec).", "PERFORMANCE", WARNING, "Listed node[s] have higher than normal (> 2sec) local read latencies. Please run 'show statistics xdr like latency' to see XDR read latency. Probable cause - system overload causing transaction queue to back up.", @@ -451,9 +525,9 @@ xdr_enabled); -s = select "dc_open_conn" as "conn" from DC.STATISTICS; -ds = select "dc_size" as "conn" from DC.STATISTICS; -ds = do ds * 64; +s = select "dc_open_conn" as "conn" from DC.STATISTICS save; +ds = select "dc_size" as "conn" from DC.STATISTICS save; +ds = do ds * 64 save as "max expected dc connections"; r = do s > ds; ASSERT(r, False, "High remote DC connections.", "LIMITS", WARNING, "Listed node[s] have higher than normal remote datacenter connections. Generally accepted number is (64*No of nodes in remote DC) per node. Please run 'show statistics dc like dc_open_conn dc_size' to see DC connection statistics. 
Ignore if XDR is not pipelined.", @@ -461,7 +535,7 @@ xdr_enabled); -s = select "xdr_uninitialized_destination_error", "noship_recs_uninitialized_destination" from XDR.STATISTICS; +s = select "xdr_uninitialized_destination_error", "noship_recs_uninitialized_destination" from XDR.STATISTICS save; r = do s > 0; ASSERT(r, False, "Uninitialized destination cluster.", "OPERATIONS", WARNING, "Listed node[s] have a non zero value for this uninitialized DC. Please check the configuration.", @@ -469,7 +543,7 @@ xdr_enabled); -s = select "xdr_unknown_namespace_error", "noship_recs_unknown_namespace" from XDR.STATISTICS; +s = select "xdr_unknown_namespace_error", "noship_recs_unknown_namespace" from XDR.STATISTICS save; r = do s > 0; ASSERT(r, False, "Missing namespace in remote data center.", "OPERATIONS", WARNING, "Certain namespace not found in remote DC. Please check the configuration to ascertain if remote DC has all the namespace being shipped.", @@ -477,7 +551,7 @@ xdr_enabled); /* XDR failednode_sessions_pending check */ -s = select "failednode_sessions_pending", "xdr_active_failed_node_sessions" from XDR.STATISTICS; +s = select "failednode_sessions_pending", "xdr_active_failed_node_sessions" from XDR.STATISTICS save; r = do s > 0; ASSERT(r, False, "Active failed node sessions.", "OPERATIONS", INFO, "Listed node[s] have failed node sessions pending. Please check if there are any failed nodes on the source cluster.", @@ -485,7 +559,7 @@ xdr_enabled); /* XDR linkdown_sessions_pending check */ -s = select "linkdown_sessions_pending", "xdr_active_link_down_sessions" from XDR.STATISTICS; +s = select "linkdown_sessions_pending", "xdr_active_link_down_sessions" from XDR.STATISTICS save; r = do s > 0; ASSERT(r, False, "Active linkdown sessions.", "OPERATIONS", INFO, "Listed node[s] have link down sessions pending. 
Please check the connectivity of remote datacenter.", @@ -493,7 +567,7 @@ xdr_enabled); /* XDR xdr_ship_outstanding_objects check */ -s = select "xdr_ship_outstanding_objects", "stat_recs_outstanding" from XDR.STATISTICS; +s = select "xdr_ship_outstanding_objects", "stat_recs_outstanding" from XDR.STATISTICS save; r = do s > 10000; ASSERT(r, False, "Too many outstanding objects (>10000) to ship !!.", "OPERATIONS", WARNING, "Listed node[s] have too many records outstanding. Please check relogging and error statistics.", @@ -501,7 +575,7 @@ xdr_enabled); /* XDR xdr_ship_inflight_objects check */ -s = select "xdr_ship_inflight_objects", "stat_recs_inflight" from XDR.STATISTICS; +s = select "xdr_ship_inflight_objects", "stat_recs_inflight" from XDR.STATISTICS save; r = do s > 5000; ASSERT(r, False, "Too many inflight objects (>5000).", "PERFORMANCE", WARNING, "Listed node[s] have too many objects inflight. This might lead to XDR throttling itself, consider tuning this parameter to a lower value.", @@ -509,7 +583,7 @@ xdr_enabled); /* XDR xdr_ship_latency_avg check */ -s = select "xdr_ship_latency_avg", "latency_avg_ship" from XDR.STATISTICS; +s = select "xdr_ship_latency_avg", "latency_avg_ship" from XDR.STATISTICS save; // Following value is not fixed yet r = do s > 5000; ASSERT(r, False, "Record shipping takes too long (>5 sec).", "PERFORMANCE", WARNING, @@ -520,21 +594,21 @@ /* CLUSTER STATE */ -r = select "cluster_integrity" from SERVICE.STATISTICS; +r = select "cluster_integrity" from SERVICE.STATISTICS save; r = do r == True; ASSERT(r, True, "Cluster integrity fault.", "OPERATIONS", CRITICAL, "Listed node[s] have cluster integrity fault. This indicates cluster is not completely wellformed. Please check server logs for more information. 
Probable cause - issue with network.", "Cluster integrity fault check."); r = select "cluster_key" from SERVICE.STATISTICS; -r = do EQUAL(r); -ASSERT(r, True, "Different Cluster Key.", "OPERATIONS", CRITICAL, +r = do NO_MATCH(r, ==, MAJORITY) save; +ASSERT(r, False, "Different Cluster Key.", "OPERATIONS", CRITICAL, "Listed cluster[s] have different cluster keys for nodes. This indicates cluster is not completely wellformed. Please check server logs for more information. Probable cause - issue with network.", "Cluster Key difference check."); u = select "uptime" from SERVICE.STATISTICS; -total_nodes = group by CLUSTER do COUNT(u); -r = select "cluster_size" from SERVICE.STATISTICS; +total_nodes = group by CLUSTER do COUNT(u) save as "total nodes"; +r = select "cluster_size" from SERVICE.STATISTICS save; r = do r == total_nodes; ASSERT(r, True, "Unstable Cluster.", "OPERATIONS", CRITICAL, "Listed node[s] have cluster size not matching total number of available nodes. This indicates cluster is not completely wellformed. Please check server logs for more information. Probable cause - issue with network.", @@ -543,18 +617,20 @@ hp = select "heartbeat.protocol", "heartbeat-protocol" from NETWORK.CONFIG; heartbeat_proto_v2 = do hp == "v2"; heartbeat_proto_v2 = group by CLUSTER, NODE do OR(heartbeat_proto_v2); -cs = select "cluster_size" from SERVICE.STATISTICS; -mcs = select "paxos-max-cluster-size" as "cluster_size" from SERVICE.CONFIG; +cs = select "cluster_size" from SERVICE.STATISTICS save; +mcs = select "paxos-max-cluster-size" as "cluster_size" from SERVICE.CONFIG save; +cs_without_saved_value = select "cluster_size" from SERVICE.STATISTICS; +mcs_without_saved_value = select "paxos-max-cluster-size" as "cluster_size" from SERVICE.CONFIG; r = do cs < mcs; ASSERT(r, True, "Critical cluster size.", "OPERATIONS", CRITICAL, "Listed node[s] have cluster size higher than configured paxos-max-cluster-size. 
Please run 'show config service like paxos-max-cluster-size' to check configured max cluster size.", "Critical cluster size check.", heartbeat_proto_v2); -small_max_configured = do mcs < 20; +small_max_configured = do mcs_without_saved_value < 20; critical_size = do cs >= mcs; -correct_size = do mcs - 10; -correct_size = do cs <= correct_size; +correct_size = do mcs_without_saved_value - 10; +correct_size = do cs_without_saved_value <= correct_size; r = do small_max_configured || critical_size; r = do r || correct_size; ASSERT(r, True, "Cluster size is near the max configured cluster size.", "OPERATIONS", WARNING, @@ -566,12 +642,12 @@ /* UDF */ u = select * from UDF.METADATA; -r = group by FILENAME, KEY do EQUAL(u); -ASSERT(r, True, "UDF not in sync (file not matching).", "OPERATIONS", CRITICAL, +r = group by FILENAME, KEY do NO_MATCH(u, ==, MAJORITY) save; +ASSERT(r, False, "UDF not in sync (file not matching).", "OPERATIONS", CRITICAL, "Listed UDF definitions do not match across the nodes. This may lead to incorrect UDF behavior. Run command 'asinfo -v udf-list' to see list of UDF. Re-register the latest version of the not in sync UDF[s].", "UDF sync (file not matching) check."); -total_nodes = group by CLUSTER do COUNT(u); -c = group by CLUSTER, FILENAME do COUNT(u); +total_nodes = group by CLUSTER do COUNT(u) save as "expected node count"; +c = group by CLUSTER, FILENAME do COUNT(u) save as "node count"; r = do c == total_nodes; ASSERT(r, True, "UDF not in sync (not available on all node).", "OPERATIONS", CRITICAL, "Listed UDF[s] are not available on all the nodes. This may lead to incorrect UDF behavior. Run command 'asinfo -v udf-list' to see list of UDF. 
Re-register missing UDF in cluster.", @@ -579,15 +655,15 @@ /* SINDEX */ -s = select "sync_state" from SINDEX.STATISTICS; +s = select "sync_state" from SINDEX.STATISTICS save; s = group by CLUSTER, NAMESPACE, SET, SINDEX s; r = do s == "synced"; ASSERT(r, True, "SINDEX not in sync with primary.", "OPERATIONS", CRITICAL, "Listed sindex[es] are not in sync with primary. This can lead to wrong query results. Consider dropping and recreating secondary index[es].", "SINDEX sync state check."); u = select "uptime" from SERVICE.STATISTICS; -total_nodes = group by CLUSTER do COUNT(u); -c = group by CLUSTER, NAMESPACE, SET, SINDEX do COUNT(s); +total_nodes = group by CLUSTER do COUNT(u) save as "cluster node count"; +c = group by CLUSTER, NAMESPACE, SET, SINDEX do COUNT(s) save as "nodes with SINDEX"; r = do c == total_nodes; ASSERT(r, True, "SINDEX not in sync (not available on all node).", "OPERATIONS", CRITICAL, "Listed sindex[es] not available on all nodes. This can lead to wrong query results. Consider dropping and recreating missing secondary index[es].", @@ -620,133 +696,629 @@ */ SET CONSTRAINT VERSION >= 3.9; +// Uptime u = select "uptime" from SERVICE.STATISTICS; -u = GROUP BY CLUSTER, NODE do SUM(u); +u = GROUP BY CLUSTER, NODE do MAX(u); -e = select "client_write_error" from NAMESPACE.STATISTICS; -s = select "client_write_success" from NAMESPACE.STATISTICS; -s = GROUP BY CLUSTER, NODE, NAMESPACE do SUM(s); -r = do e / s; -r = do r/u on common; -r = do r == 0; -ASSERT(r, True, "Non-zero namespace write errors count", "OPERATIONS", INFO, - "Listed namespace write error[s] show skew in count across nodes in cluster. It may or may not be an issue depending on the error type (e.g gen check errors may be expected if client is using check and set kind of operations). 
Please run 'show statistics namespace like client_write' to see values.", - "Namespace write errors count check"); -e = select "client_read_error" from NAMESPACE.STATISTICS; -s = select "client_read_success" from NAMESPACE.STATISTICS; -s = GROUP BY CLUSTER, NODE, NAMESPACE do SUM(s); -r = do e / s; -r = do r/u on common; -r = do r == 0; -ASSERT(r, True, "Non-zero namespace read errors count", "OPERATIONS", INFO, - "Listed namespace read error[s] show skew in count across nodes in the cluster. It may or may not be an issue depending on the error type (e.g key not found may be expected). Please run 'show statistics namespace like client_read' to see values.", - "Namespace read errors count check"); +// Read statistics -e = select "client_delete_error" from NAMESPACE.STATISTICS; -s = select "client_delete_success" from NAMESPACE.STATISTICS; -s = GROUP BY CLUSTER, NODE, NAMESPACE do SUM(s); -r = do e / s; -r = do r/u on common; -r = do r == 0; -ASSERT(r, True, "Non-zero namespace delete errors count", "OPERATIONS", INFO, - "Listed namespace delete error[s] show skew in count across nodes in the cluster. It may or may not be an issue depending on the error type (e.g key not found). 
Please run 'show statistics namespace like client_delete' to see values.", - "Namespace delete errors count check"); +nf = select "client_read_not_found" as "cnt" from NAMESPACE.STATISTICS; +s = select "client_read_success" as "cnt" from NAMESPACE.STATISTICS; +t = select "client_read_timeout" as "cnt" from NAMESPACE.STATISTICS; +e = select "client_read_error" as "cnt" from NAMESPACE.STATISTICS; +total_reads = do s + nf; +total_reads = do total_reads + t; +total_reads = do total_reads + e save as "total client reads"; +total_reads_per_sec = do total_reads/u; +total_reads = group by CLUSTER, NAMESPACE, NODE do MAX(total_reads); +total_reads_per_sec = group by CLUSTER, NAMESPACE, NODE do MAX(total_reads_per_sec); -e = select "batch_sub_tsvc_timeout" from NAMESPACE.STATISTICS; -e = do e/u on common; +e = select "client_read_error" from NAMESPACE.STATISTICS save; +e = do e/u save as "errors per second (by using uptime)"; e = group by CLUSTER, NAMESPACE e; -r = do e > 0; -ASSERT(r, False, "Non-zero batch-index read sub-transaction timeouts.", "OPERATIONS", INFO, - "Listed namespace[s] have non-zero batch-index read sub-transaction timeouts across the nodes. Please run 'show statistics namespace like batch_sub_tsvc_timeout' to see the values.", - "Namespace batch-index read sub-transaction timeout count check"); +p = do e/total_reads_per_sec; +p = do p * 100 save as "client_read_error % of total reads"; +r = do p <= 5; +ASSERT(r, True, "High client read errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal read errors (> 5% client reads). Please run 'show statistics namespace like client_read' to see values.", + "High read error check"); +warning_breached = do p > 5; +r = do p == 0; +r = do r || warning_breached; +ASSERT(r, True, "Non-zero client read errors", "OPERATIONS", INFO, + "Listed namespace[s] show non-zero read errors. 
Please run 'show statistics namespace like client_read' to see values.", + "Non-zero read error check"); + +t = select "client_read_timeout" from NAMESPACE.STATISTICS save; +t = group by CLUSTER, NAMESPACE t; +r = do t/total_reads; +r = do r * 100 save as "client_read_timeout % of total reads"; +r = do r <= 5; +ASSERT(r, True, "High client read timeouts", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal read timeouts (> 5% client reads). Please run 'show statistics namespace like client_read' to see values.", + "High read timeouts check"); + +c = select "client_read_not_found" from NAMESPACE.STATISTICS save; +c = group by CLUSTER, NAMESPACE c; + +r = do c / total_reads; +r = do r * 100 save as "client_read_not_found % of total reads"; +r = do r <= 20; +ASSERT(r, True, "High read not found errors", "OPERATIONS", INFO, + "Listed namespace[s] show higher than normal read not found errors (> 20% client reads). Please run 'show statistics namespace like client_read' to see values.", + "High read not found error check"); -e = select "client_tsvc_timeout" from NAMESPACE.STATISTICS; -e = do e/u on common; -e = group by CLUSTER, NAMESPACE e; -r = do e > 0; -ASSERT(r, False, "Non-zero client transaction timeouts.", "OPERATIONS", INFO, - "Listed namespace[s] have non-zero client transaction timeouts (for nodes). Please run 'show statistics namespace like client_tsvc_timeout' to see values. Probable cause - congestion in the transaction queue (transaction threads not able to process efficiently enough), or it could also be that the timeout set by the client is too aggressive.", - "Namespace client transaction timeout count check"); -e = select "client_udf_error" from NAMESPACE.STATISTICS; -e = do e/u on common; -e = group by CLUSTER, NAMESPACE e; -r = do e > 0; -ASSERT(r, False, "Non-zero UDF transaction failure.", "OPERATIONS", INFO, - "Listed namespace[s] have non-zero UDF transaction failures (for nodes). 
Please run 'show statistics namespace like client_udf_error' to see values.", - "Namespace UDF transaction failure check"); +// Delete statistics -e = select "client_udf_timeout" from NAMESPACE.STATISTICS; -e = do e/u on common; -e = group by CLUSTER, NAMESPACE e; -r = do e > 0; -ASSERT(r, False, "Non-zero UDF transaction timeouts.", "OPERATIONS", INFO, - "Listed namespace[s] have non-zero UDF transaction timeouts (for nodes). Please run 'show statistics namespace like client_udf_timeout' to see values.", - "Namespace UDF transaction timeout check"); +nf = select "client_delete_not_found" as "cnt" from NAMESPACE.STATISTICS; +s = select "client_delete_success" as "cnt" from NAMESPACE.STATISTICS; +t = select "client_delete_timeout" as "cnt" from NAMESPACE.STATISTICS; +e = select "client_delete_error" as "cnt" from NAMESPACE.STATISTICS; +total_deletes = do s + nf; +total_deletes = do total_deletes + t; +total_deletes = do total_deletes + e save as "total client deletes"; +total_deletes_per_sec = do total_deletes/u; +total_deletes = group by CLUSTER, NAMESPACE, NODE do MAX(total_deletes); +total_deletes_per_sec = group by CLUSTER, NAMESPACE, NODE do MAX(total_deletes_per_sec); -e = select "udf_sub_udf_error" from NAMESPACE.STATISTICS; -e = do e/u on common; +e = select "client_delete_error" from NAMESPACE.STATISTICS save; +e = do e/u save as "errors per second (by using uptime)"; e = group by CLUSTER, NAMESPACE e; -r = do e > 0; -ASSERT(r, False, "Non-zero UDF sub-transaction failures.", "OPERATIONS", INFO, - "Listed namespace[s] have non-zero UDF sub-transaction failures across nodes in cluster for scan/query background udf jobs. 
Please run 'show statistics namespace like udf_sub_udf_error udf_sub_lang_' to see details.", - "Namespace UDF sub-transaction failure check"); - -e = select "client_write_timeout" from NAMESPACE.STATISTICS; -e = do e/u on common; +p = do e/total_deletes_per_sec; +p = do p * 100 save as "client_delete_error % of total deletes"; +r = do p <= 5; +ASSERT(r, True, "High client delete errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal delete errors (> 5% client deletes). Please run 'show statistics namespace like client_delete' to see values.", + "High delete error check"); +warning_breached = do p > 5; +r = do p == 0; +r = do r || warning_breached; +ASSERT(r, True, "Non-zero client delete errors", "OPERATIONS", INFO, + "Listed namespace[s] show non-zero delete errors. Please run 'show statistics namespace like client_delete' to see values.", + "Non-zero delete error check"); + +t = select "client_delete_timeout" from NAMESPACE.STATISTICS save; +t = group by CLUSTER, NAMESPACE t; +r = do t/total_deletes; +r = do r * 100 save as "client_delete_timeout % of total deletes"; +r = do r <= 5; +ASSERT(r, True, "High client delete timeouts", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal delete timeouts (> 5% client deletes). Please run 'show statistics namespace like client_delete' to see values.", + "High delete timeouts check"); + +c = select "client_delete_not_found" from NAMESPACE.STATISTICS save; +c = group by CLUSTER, NAMESPACE c; +r = do c / total_deletes; +r = do r * 100 save as "client_delete_not_found % of total deletes"; +r = do r <= 20; +ASSERT(r, True, "High delete not found errors", "OPERATIONS", INFO, + "Listed namespace[s] show higher than normal delete not found errors (> 20% client deletes). 
Please run 'show statistics namespace like client_delete' to see values.", + "High delete not found error check"); + + +// Write statistics + +s = select "client_write_success" as "cnt" from NAMESPACE.STATISTICS; +t = select "client_write_timeout" as "cnt" from NAMESPACE.STATISTICS; +e = select "client_write_error" as "cnt" from NAMESPACE.STATISTICS; +total_writes = do s + t; +total_writes = do total_writes + e save as "total client writes"; +total_writes_per_sec = do total_writes/u; +total_writes = group by CLUSTER, NAMESPACE, NODE do MAX(total_writes); +total_writes_per_sec = group by CLUSTER, NAMESPACE, NODE do MAX(total_writes_per_sec); + +e = select "client_write_error" from NAMESPACE.STATISTICS save; +e = do e/u save as "errors per second (by using uptime)"; +e = group by CLUSTER, NAMESPACE e; +p = do e/total_writes_per_sec; +p = do p * 100 save as "client_write_error % of total writes"; +r = do p <= 5; +ASSERT(r, True, "High client write errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal write errors (> 5% client writes). Please run 'show statistics namespace like client_write' to see values.", + "High write error check"); +warning_breached = do p > 5; +r = do p == 0; +r = do r || warning_breached; +ASSERT(r, True, "Non-zero client write errors", "OPERATIONS", INFO, + "Listed namespace[s] show non-zero write errors. Please run 'show statistics namespace like client_write' to see values.", + "Non-zero write error check"); + +t = select "client_write_timeout" from NAMESPACE.STATISTICS save; +t = group by CLUSTER, NAMESPACE t; +r = do t/total_writes; +r = do r * 100 save as "client_write_timeout % of total writes"; +r = do r <= 5; +ASSERT(r, True, "High client write timeouts", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal write timeouts (> 5% client writes). 
Please run 'show statistics namespace like client_write' to see values.", + "High write timeouts check"); + + +// Client Proxy transaction statistics + +s = select "client_proxy_complete" as "cnt" from NAMESPACE.STATISTICS; +t = select "client_proxy_timeout" as "cnt" from NAMESPACE.STATISTICS; +e = select "client_proxy_error" as "cnt" from NAMESPACE.STATISTICS; +total_client_proxy = do s + t; +total_client_proxy = do total_client_proxy + e save as "total client proxy transactions"; +total_client_proxy_per_sec = do total_client_proxy/u; +total_client_proxy = group by CLUSTER, NAMESPACE, NODE do MAX(total_client_proxy); +total_client_proxy_per_sec = group by CLUSTER, NAMESPACE, NODE do MAX(total_client_proxy_per_sec); + +e = select "client_proxy_error" from NAMESPACE.STATISTICS save; +e = do e/u save as "errors per second (by using uptime)"; +e = group by CLUSTER, NAMESPACE e; +p = do e/total_client_proxy_per_sec; +p = do p * 100 save as "client_proxy_error % of total proxy transactions"; +r = do p <= 5; +ASSERT(r, True, "High client proxy transaction errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal proxy transaction errors (> 5% client proxy transactions). Please run 'show statistics namespace like client_proxy' to see values.", + "High proxy transaction error check"); +warning_breached = do p > 5; +r = do p == 0; +r = do r || warning_breached; +ASSERT(r, True, "Non-zero client proxy transaction errors", "OPERATIONS", INFO, + "Listed namespace[s] show non-zero proxy transaction errors. 
Please run 'show statistics namespace like client_proxy' to see values.", + "Non-zero proxy transaction error check"); + + +t = select "client_proxy_timeout" from NAMESPACE.STATISTICS save; +t = group by CLUSTER, NAMESPACE t; +r = do t/total_client_proxy; +r = do r * 100 save as "client_proxy_timeout % of total proxy transactions"; +r = do r <= 5; +ASSERT(r, True, "High client proxy transaction timeouts", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal proxy transaction timeouts (> 5% client proxy transactions). Please run 'show statistics namespace like client_proxy' to see values.", + "High proxy transaction timeouts check"); + + + +// XDR Write statistics + +s = select "xdr_write_success" as "cnt" from NAMESPACE.STATISTICS; +t = select "xdr_write_timeout" as "cnt" from NAMESPACE.STATISTICS; +e = select "xdr_write_error" as "cnt" from NAMESPACE.STATISTICS; +total_xdr_writes = do s + t; +total_xdr_writes = do total_xdr_writes + e save as "total xdr writes"; +total_xdr_writes_per_sec = do total_xdr_writes/u; +total_xdr_writes = group by CLUSTER, NAMESPACE, NODE do MAX(total_xdr_writes); +total_xdr_writes_per_sec = group by CLUSTER, NAMESPACE, NODE do MAX(total_xdr_writes_per_sec); + +e = select "xdr_write_error" from NAMESPACE.STATISTICS save; +e = do e/u save as "errors per second (by using uptime)"; +e = group by CLUSTER, NAMESPACE e; +p = do e/total_xdr_writes_per_sec; +p = do p * 100 save as "xdr_write_error % of total xdr writes"; +r = do p <= 5; +ASSERT(r, True, "High xdr write errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal xdr write errors (> 5% xdr writes). Please run 'show statistics namespace like xdr_write' to see values.", + "High xdr write error check"); +warning_breached = do p > 5; +r = do p == 0; +r = do r || warning_breached; +ASSERT(r, True, "Non-zero xdr write errors", "OPERATIONS", INFO, + "Listed namespace[s] show non-zero xdr write errors. 
Please run 'show statistics namespace like xdr_write' to see values.", + "Non-zero xdr write error check"); + +t = select "xdr_write_timeout" from NAMESPACE.STATISTICS save; +t = group by CLUSTER, NAMESPACE t; +r = do t/total_xdr_writes; +r = do r * 100 save as "xdr_write_timeout % of total xdr writes"; +r = do r <= 5; +ASSERT(r, True, "High xdr write timeouts", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal xdr write timeouts (> 5% xdr writes). Please run 'show statistics namespace like xdr_write' to see values.", + "High xdr write timeouts check"); + + +// UDF Transaction statistics + +s = select "client_udf_complete" as "cnt" from NAMESPACE.STATISTICS; +t = select "client_udf_timeout" as "cnt" from NAMESPACE.STATISTICS; +e = select "client_udf_error" as "cnt" from NAMESPACE.STATISTICS; +total_udf_transactions = do s + t; +total_udf_transactions = do total_udf_transactions + e save as "total udf transactions"; +total_udf_transactions_per_sec = do total_udf_transactions/u; +total_udf_transactions = group by CLUSTER, NAMESPACE, NODE do MAX(total_udf_transactions); +total_udf_transactions_per_sec = group by CLUSTER, NAMESPACE, NODE do MAX(total_udf_transactions_per_sec); + +e = select "client_udf_error" from NAMESPACE.STATISTICS save; +e = do e/u save as "errors per second (by using uptime)"; +e = group by CLUSTER, NAMESPACE e; +p = do e/total_udf_transactions_per_sec; +p = do p * 100 save as "client_udf_error % of total udf transactions"; +r = do p <= 5; +ASSERT(r, True, "High udf transaction errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal udf transaction errors (> 5% udf transactions). Please run 'show statistics namespace like client_udf' to see values.", + "High udf transaction error check"); +warning_breached = do p > 5; +r = do p == 0; +r = do r || warning_breached; +ASSERT(r, True, "Non-zero udf transaction errors", "OPERATIONS", INFO, + "Listed namespace[s] show non-zero udf transaction errors. 
Please run 'show statistics namespace like client_udf' to see values.", + "Non-zero udf transaction error check"); + +t = select "client_udf_timeout" from NAMESPACE.STATISTICS save; +t = group by CLUSTER, NAMESPACE t; +r = do t/total_udf_transactions; +r = do r * 100 save as "client_udf_timeout % of total udf transactions"; +r = do r <= 5; +ASSERT(r, True, "High udf transaction timeouts", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal udf transaction timeouts (> 5% udf transaction). Please run 'show statistics namespace like client_udf' to see values.", + "High udf transaction timeouts check"); + + +// UDF Sub-Transaction statistics + +s = select "udf_sub_udf_complete" as "cnt" from NAMESPACE.STATISTICS; +t = select "udf_sub_udf_timeout" as "cnt" from NAMESPACE.STATISTICS; +e = select "udf_sub_udf_error" as "cnt" from NAMESPACE.STATISTICS; +total_udf_sub_transactions = do s + t; +total_udf_sub_transactions = do total_udf_sub_transactions + e save as "total udf sub-transactions"; +total_udf_sub_transactions_per_sec = do total_udf_sub_transactions/u; +total_udf_sub_transactions = group by CLUSTER, NAMESPACE, NODE do MAX(total_udf_sub_transactions); +total_udf_sub_transactions_per_sec = group by CLUSTER, NAMESPACE, NODE do MAX(total_udf_sub_transactions_per_sec); + +e = select "udf_sub_udf_error" from NAMESPACE.STATISTICS save; +e = do e/u save as "errors per second (by using uptime)"; +e = group by CLUSTER, NAMESPACE e; +p = do e/total_udf_sub_transactions_per_sec; +p = do p * 100 save as "udf_sub_udf_error % of total udf sub-transactions"; +r = do p <= 5; +ASSERT(r, True, "High udf sub-transaction errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal udf sub-transaction errors (> 5% udf sub-transactions). 
Please run 'show statistics namespace like udf_sub_udf' to see values.", + "High udf sub-transaction error check"); +warning_breached = do p > 5; +r = do p == 0; +r = do r || warning_breached; +ASSERT(r, True, "Non-zero udf sub-transaction errors", "OPERATIONS", INFO, + "Listed namespace[s] show non-zero udf sub-transaction errors. Please run 'show statistics namespace like udf_sub_udf' to see values.", + "Non-zero udf sub-transaction error check"); + +t = select "udf_sub_udf_timeout" from NAMESPACE.STATISTICS save; +t = group by CLUSTER, NAMESPACE t; +r = do t/total_udf_sub_transactions; +r = do r * 100 save as "udf_sub_udf_timeout % of total udf sub-transactions"; +r = do r <= 5; +ASSERT(r, True, "High udf sub-transaction timeouts", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal udf sub-transaction timeouts (> 5% udf sub-transaction). Please run 'show statistics namespace like udf_sub_udf' to see values.", + "High udf sub-transaction timeouts check"); + + +// Proxied Batch-index Sub-Transaction statistics + +s = select "batch_sub_proxy_complete" as "cnt" from NAMESPACE.STATISTICS; +t = select "batch_sub_proxy_error" as "cnt" from NAMESPACE.STATISTICS; +e = select "batch_sub_proxy_timeout" as "cnt" from NAMESPACE.STATISTICS; +total_transactions = do s + t; +total_transactions = do total_transactions + e save as "total batch-index sub-transactions"; +total_transactions_per_sec = do total_transactions/u; +total_transactions = group by CLUSTER, NAMESPACE, NODE do MAX(total_transactions); +total_transactions_per_sec = group by CLUSTER, NAMESPACE, NODE do MAX(total_transactions_per_sec); + +e = select "batch_sub_proxy_error" from NAMESPACE.STATISTICS save; +e = do e/u save as "errors per second (by using uptime)"; +e = group by CLUSTER, NAMESPACE e; +p = do e/total_transactions_per_sec; +p = do p * 100 save as "batch_sub_proxy_error % of total batch-index sub-transactions"; +r = do p <= 5; +ASSERT(r, True, "High batch-index sub-transaction 
errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal batch-index sub-transaction errors (> 5% batch-index sub-transactions). Please run 'show statistics namespace like batch_sub_proxy' to see values.", + "High batch-index sub-transaction error check"); +warning_breached = do p > 5; +r = do p == 0; +r = do r || warning_breached; +ASSERT(r, True, "Non-zero batch-index sub-transaction errors", "OPERATIONS", INFO, + "Listed namespace[s] show non-zero batch-index sub-transaction errors. Please run 'show statistics namespace like batch_sub_proxy' to see values.", + "Non-zero batch-index sub-transaction error check"); + +t = select "batch_sub_proxy_timeout" from NAMESPACE.STATISTICS save; +t = group by CLUSTER, NAMESPACE t; +r = do t/total_transactions; +r = do r * 100 save as "batch_sub_proxy_timeout % of total batch-index sub-transactions"; +r = do r <= 5; +ASSERT(r, True, "High batch-index sub-transaction timeouts", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal batch-index sub-transaction timeouts (> 5% batch-index sub-transaction). 
Please run 'show statistics namespace like batch_sub_proxy' to see values.", + "High batch-index sub-transaction timeouts check"); + + +// Batch-index read Sub-Transaction statistics + +nf = select "batch_sub_read_not_found" as "cnt" from NAMESPACE.STATISTICS; +s = select "batch_sub_read_success" as "cnt" from NAMESPACE.STATISTICS; +t = select "batch_sub_read_timeout" as "cnt" from NAMESPACE.STATISTICS; +e = select "batch_sub_read_error" as "cnt" from NAMESPACE.STATISTICS; +total_transactions = do s + nf; +total_transactions = do total_transactions + t; +total_transactions = do total_transactions + e save as "total batch-index read sub-transactions"; +total_transactions_per_sec = do total_transactions/u; +total_transactions = group by CLUSTER, NAMESPACE, NODE do MAX(total_transactions); +total_transactions_per_sec = group by CLUSTER, NAMESPACE, NODE do MAX(total_transactions_per_sec); + +e = select "batch_sub_read_error" from NAMESPACE.STATISTICS save; +e = do e/u save as "errors per second (by using uptime)"; +e = group by CLUSTER, NAMESPACE e; +p = do e/total_transactions_per_sec; +p = do p * 100 save as "batch_sub_read_error % of total reads"; +r = do p <= 5; +ASSERT(r, True, "High batch-index read sub-transaction errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal batch-index read sub-transaction errors (> 5% batch-index read sub-transactions). Please run 'show statistics namespace like batch_sub_read' to see values.", + "High batch-index read sub-transaction error check"); +warning_breached = do p > 5; +r = do p == 0; +r = do r || warning_breached; +ASSERT(r, True, "Non-zero batch-index read sub-transaction errors", "OPERATIONS", INFO, + "Listed namespace[s] show non-zero batch-index read sub-transaction errors. 
Please run 'show statistics namespace like batch_sub_read' to see values.", + "Non-zero batch-index read sub-transaction error check"); + +t = select "batch_sub_read_timeout" from NAMESPACE.STATISTICS save; +t = group by CLUSTER, NAMESPACE t; +r = do t/total_transactions; +r = do r * 100 save as "batch_sub_read_timeout % of total batch-index read sub-transactions"; +r = do r <= 5; +ASSERT(r, True, "High batch-index read sub-transaction timeouts", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal batch-index read sub-transaction timeouts (> 5% batch-index read sub-transactions). Please run 'show statistics namespace like batch_sub_read' to see values.", + "High batch-index read sub-transaction timeouts check"); + +c = select "batch_sub_read_not_found" from NAMESPACE.STATISTICS save; +c = group by CLUSTER, NAMESPACE c; +r = do c / total_transactions; +r = do r * 100 save as "batch_sub_read_not_found % of total batch-index read sub-transactions"; +r = do r <= 20; +ASSERT(r, True, "High batch-index read sub-transaction not found errors", "OPERATIONS", INFO, + "Listed namespace[s] show higher than normal batch-index read sub-transaction not found errors (> 20% batch-index read sub-transactions). 
Please run 'show statistics namespace like batch_sub_read' to see values.", + "High batch-index read sub-transaction not found error check"); + + +// Client UDF Transaction statistics + +rs = select "client_lang_read_success" as "cnt" from NAMESPACE.STATISTICS; +ds = select "client_lang_delete_success" as "cnt" from NAMESPACE.STATISTICS; +ws = select "client_lang_write_success" as "cnt" from NAMESPACE.STATISTICS; +e = select "client_lang_error" as "cnt" from NAMESPACE.STATISTICS; +total_client_udf_transactions = do rs + ds; +total_client_udf_transactions = do total_client_udf_transactions + ws; +total_client_udf_transactions = do total_client_udf_transactions + e save as "total client_lang"; +total_client_udf_transactions_per_sec = do total_client_udf_transactions/u; +total_client_udf_transactions_per_sec = group by CLUSTER, NAMESPACE, NODE do MAX(total_client_udf_transactions_per_sec); + +e = select "client_lang_error" from NAMESPACE.STATISTICS save; +e = do e/u save as "errors per second (by using uptime)"; +e = group by CLUSTER, NAMESPACE e; +p = do e/total_client_udf_transactions_per_sec; +p = do p * 100 save as "client_lang_error % of total client_lang"; +r = do p <= 5; +ASSERT(r, True, "High client initiated udf transactions errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal client initiated udf transactions errors (> 5% client initiated udf transactions). Please run 'show statistics namespace like client_lang' to see values.", + "High client initiated udf transactions error check"); +warning_breached = do p > 5; +r = do p == 0; +r = do r || warning_breached; +ASSERT(r, True, "Non-zero client initiated udf transaction errors", "OPERATIONS", INFO, + "Listed namespace[s] show non-zero client initiated udf transaction errors. 
Please run 'show statistics namespace like client_lang' to see values.", + "Non-zero client initiated udf transaction error check"); + + +// UDF Sub-Transaction statistics + +rs = select "udf_sub_lang_read_success" as "cnt" from NAMESPACE.STATISTICS; +ds = select "udf_sub_lang_delete_success" as "cnt" from NAMESPACE.STATISTICS; +ws = select "udf_sub_lang_write_success" as "cnt" from NAMESPACE.STATISTICS; +e = select "udf_sub_lang_error" as "cnt" from NAMESPACE.STATISTICS; +total_transactions = do rs + ds; +total_transactions = do total_transactions + ws; +total_transactions = do total_transactions + e save as "total udf_sub_lang"; +total_transactions_per_sec = do total_transactions/u; +total_transactions_per_sec = group by CLUSTER, NAMESPACE, NODE do MAX(total_transactions_per_sec); + +e = select "udf_sub_lang_error" from NAMESPACE.STATISTICS save; +e = do e/u save as "errors per second (by using uptime)"; +e = group by CLUSTER, NAMESPACE e; +p = do e/total_transactions_per_sec; +p = do p * 100 save as "udf_sub_lang_error % of total udf_sub_lang"; +r = do p <= 5; +ASSERT(r, True, "High udf sub-transaction errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal udf sub-transaction errors (> 5% udf sub-transactions). Please run 'show statistics namespace like udf_sub_lang' to see values.", + "High udf sub-transaction error check"); +warning_breached = do p > 5; +r = do p == 0; +r = do r || warning_breached; +ASSERT(r, True, "Non-zero udf sub-transaction errors", "OPERATIONS", INFO, + "Listed namespace[s] show non-zero udf sub-transaction errors. 
Please run 'show statistics namespace like udf_sub_lang' to see values.", + "Non-zero udf sub-transaction error check"); + + +// Query Agg statistics + +total_transactions = select "query_agg" from NAMESPACE.STATISTICS save as "total query aggregations"; +total_transactions_per_sec = do total_transactions/u; +total_transactions_per_sec = group by CLUSTER, NAMESPACE, NODE do MAX(total_transactions_per_sec); + +e = select "query_agg_error" from NAMESPACE.STATISTICS save; +e = do e/u save as "errors per second (by using uptime)"; +e = group by CLUSTER, NAMESPACE e; +p = do e/total_transactions_per_sec; +p = do p * 100 save as "query_agg_error % of total query aggregations"; +r = do p <= 5; +ASSERT(r, True, "High query aggregation errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal query aggregation errors (> 5% query aggregations). Please run 'show statistics namespace like query_agg' to see values.", + "High query aggregation error check"); +warning_breached = do p > 5; +r = do p == 0; +r = do r || warning_breached; +ASSERT(r, True, "Non-zero query aggregation errors", "OPERATIONS", INFO, + "Listed namespace[s] show non-zero query aggregation errors. 
Please run 'show statistics namespace like query_agg' to see values.", + "Non-zero query aggregation error check"); + + +// Query Lookup statistics + +total_transactions = select "query_lookups" from NAMESPACE.STATISTICS save as "total query lookups"; +total_transactions_per_sec = do total_transactions/u; +total_transactions_per_sec = group by CLUSTER, NAMESPACE, NODE do MAX(total_transactions_per_sec); + +e = select "query_lookup_error" from NAMESPACE.STATISTICS save; +e = do e/u save as "errors per second (by using uptime)"; +e = group by CLUSTER, NAMESPACE e; +p = do e/total_transactions_per_sec; +p = do p * 100 save as "query_lookup_error % of total query lookups"; +r = do p <= 5; +ASSERT(r, True, "High query lookup errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal query lookup errors (> 5% query lookups). Please run 'show statistics namespace like query_lookup' to see values.", + "High query lookup error check"); +warning_breached = do p > 5; +r = do p == 0; +r = do r || warning_breached; +ASSERT(r, True, "Non-zero query lookup errors", "OPERATIONS", INFO, + "Listed namespace[s] show non-zero query lookup errors. 
Please run 'show statistics namespace like query_lookup' to see values.", + "Non-zero query lookup error check"); + + +// Scan Agg statistics +s = select "scan_aggr_complete" as "cnt" from NAMESPACE.STATISTICS; +e = select "scan_aggr_error" as "cnt" from NAMESPACE.STATISTICS; +total_transactions = do s + e save as "total scan aggregations"; +total_transactions_per_sec = do total_transactions/u; +total_transactions_per_sec = group by CLUSTER, NAMESPACE, NODE do MAX(total_transactions_per_sec); + +e = select "scan_aggr_error" from NAMESPACE.STATISTICS save; +e = do e/u save as "errors per second (by using uptime)"; +e = group by CLUSTER, NAMESPACE e; +p = do e/total_transactions_per_sec; +p = do p * 100 save as "scan_aggr_error % of total scan aggregations"; +r = do p <= 5; +ASSERT(r, True, "High scan aggregation errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal scan aggregation errors (> 5% scan aggregations). Please run 'show statistics namespace like scan_agg' to see values.", + "High scan aggregation error check"); +warning_breached = do p > 5; +r = do p == 0; +r = do r || warning_breached; +ASSERT(r, True, "Non-zero scan aggregation errors", "OPERATIONS", INFO, + "Listed namespace[s] show non-zero scan aggregation errors. 
Please run 'show statistics namespace like scan_agg' to see values.", + "Non-zero scan aggregation error check"); + + +// Scan Basic statistics +s = select "scan_basic_complete" as "cnt" from NAMESPACE.STATISTICS; +e = select "scan_basic_error" as "cnt" from NAMESPACE.STATISTICS; +total_transactions = do s + e save as "total basic scans"; +total_transactions_per_sec = do total_transactions/u; +total_transactions_per_sec = group by CLUSTER, NAMESPACE, NODE do MAX(total_transactions_per_sec); + +e = select "scan_basic_error" from NAMESPACE.STATISTICS save; +e = do e/u save as "errors per second (by using uptime)"; +e = group by CLUSTER, NAMESPACE e; +p = do e/total_transactions_per_sec; +p = do p * 100 save as "scan_basic_error % of total basic scans"; +r = do p <= 5; +ASSERT(r, True, "High basic scan errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal basic scan errors (> 5% basic scans). Please run 'show statistics namespace like scan_basic' to see values.", + "High basic scan error check"); +warning_breached = do p > 5; +r = do p == 0; +r = do r || warning_breached; +ASSERT(r, True, "Non-zero basic scan errors", "OPERATIONS", INFO, + "Listed namespace[s] show non-zero basic scan errors. 
Please run 'show statistics namespace like scan_basic' to see values.", + "Non-zero basic scan error check"); + + +// Scan Background UDF statistics +s = select "scan_udf_bg_complete" as "cnt" from NAMESPACE.STATISTICS; +e = select "scan_udf_bg_error" as "cnt" from NAMESPACE.STATISTICS; +total_transactions = do s + e save as "total scan background udf"; +total_transactions_per_sec = do total_transactions/u; +total_transactions_per_sec = group by CLUSTER, NAMESPACE, NODE do MAX(total_transactions_per_sec); + +e = select "scan_udf_bg_error" from NAMESPACE.STATISTICS save; +e = do e/u save as "errors per second (by using uptime)"; +e = group by CLUSTER, NAMESPACE e; +p = do e/total_transactions_per_sec; +p = do p * 100 save as "scan_udf_bg_error % of total scan background udf"; +r = do p <= 5; +ASSERT(r, True, "High scan background udf errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal scan background udf errors (> 5% scan background udf). Please run 'show statistics namespace like scan_udf_bg' to see values.", + "High scan background udf error check"); +warning_breached = do p > 5; +r = do p == 0; +r = do r || warning_breached; +ASSERT(r, True, "Non-zero scan background udf errors", "OPERATIONS", INFO, + "Listed namespace[s] show non-zero scan background udf errors. Please run 'show statistics namespace like scan_udf_bg' to see values.", + "Non-zero scan background udf error check"); + + +// Client transaction statistics + +e = select "client_tsvc_error" from NAMESPACE.STATISTICS save; +e = do e/u on common save as "errors per second"; e = group by CLUSTER, NAMESPACE e; r = do e > 0; -ASSERT(r, False, "Non-zero write transaction timeouts.", "OPERATIONS", INFO, - "Listed namespace[s] have non-zero write transaction timeouts (for nodes). 
Please run 'show statistics namespace like client_write_timeout' to see values.", - "Namespace write transaction timeout check"); +ASSERT(r, False, "Non-zero client transaction error.", "OPERATIONS", INFO, + "Listed namespace[s] have non-zero client transaction errors (for nodes). Please run 'show statistics namespace like client_tsvc_error' to see values. Probable cause - protocol errors or security permission mismatch.", + "Namespace client transaction error count check"); -e = select "client_read_not_found" from NAMESPACE.STATISTICS; -e = group by CLUSTER, NAMESPACE e; -s = select "client_read_success" from NAMESPACE.STATISTICS; -s = group by CLUSTER, NAMESPACE, NODE do MAX(s); -s = do 50 %% s; -r = do e <= s; -ASSERT(r, True, "High read not found errors", "OPERATIONS", INFO, - "Listed namespace[s] show higher than normal read not found errors (> 50% client read success). Please run 'show statistics namespace like client_read_not_found client_read_success' to see values.", - "High read not found error check"); -e = select "xdr_write_error" from NAMESPACE.STATISTICS; -e = do e/u on common; +// UDF Sub-Transactions (transaction service) statistics + +e = select "udf_sub_tsvc_error" from NAMESPACE.STATISTICS save; +e = do e/u on common save as "errors per second"; e = group by CLUSTER, NAMESPACE e; r = do e > 0; -ASSERT(r, False, "Non-zero XDR write errors count.", "OPERATIONS", INFO, - "Listed namespace[s] have non-zero XDR write transaction failures (for nodes). Please run 'show statistics namespace like xdr_write_error' to see values.", - "Namespace XDR write failure check"); +ASSERT(r, False, "Non-zero udf sub-transaction error in the transaction service.", "OPERATIONS", INFO, + "Listed namespace[s] have non-zero udf sub-transaction errors in the transaction service (for nodes). 
Probable cause - protocol errors or security permission mismatch.", + "Namespace udf sub-transaction transaction service error count check"); + + +// Batch-index read Sub-Transaction (transaction service) statistics -e = select "xdr_write_timeout" from NAMESPACE.STATISTICS; -e = do e/u on common; +e = select "batch_sub_tsvc_error" from NAMESPACE.STATISTICS save; +e = do e/u on common save as "errors per second"; e = group by CLUSTER, NAMESPACE e; r = do e > 0; -ASSERT(r, False, "Non-zero XDR write timeouts.", "OPERATIONS", INFO, - "Listed namespace[s] have non-zero XDR write transaction timeouts (for nodes). Please run 'show statistics namespace like xdr_write_timeout' to see values.", - "Namespace XDR write timeout check"); +ASSERT(r, False, "Non-zero batch-index read sub-transaction errors in the transaction service.", "OPERATIONS", INFO, + "Listed namespace[s] have non-zero batch-index read sub-transaction errors in the transaction service across the nodes. Please run 'show statistics namespace like batch_sub_tsvc_error' to see the values.", + "Namespace batch-index read sub-transaction transaction service error count check"); + SET CONSTRAINT VERSION < 3.9; -e = select "stat_write_errs" from SERVICE.STATISTICS; -s = select "stat_write_success" from SERVICE.STATISTICS; -s = GROUP BY CLUSTER, NODE do SUM(s); -u = select "uptime" from SERVICE.STATISTICS; -u = GROUP BY CLUSTER, NODE do SUM(u); -r = do e / s; -r = do r/u on common; -r = do r == 0; -ASSERT(r, True, "Non-zero node write errors count", "OPERATIONS", INFO, - "Listed write error[s] show skew in count (for nodes). It may or may not be an issue depending on the error type. 
Please run 'show statistics service like stat_write' to see values.", - "Node write errors count check"); +// Read statistics -e = select "stat_read_errs_other" from SERVICE.STATISTICS; +t = select "stat_read_reqs" as "cnt" from SERVICE.STATISTICS save; + +e = select "stat_read_errs_other" from SERVICE.STATISTICS save; +r = do e/t; +r = do r * 100 save as "stat_read_errs_other % of total reads"; +r = do r <= 5; +ASSERT(r, True, "High read errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal read errors (> 5% reads). Please run 'show statistics service like stat_read' to see values.", + "High read error check"); + +nf = select "stat_read_errs_notfound" from SERVICE.STATISTICS save; +r = do nf/t; +r = do r * 100 save as "stat_read_errs_notfound % of total reads"; +r = do r <= 20; +ASSERT(r, True, "High read not found errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal read not found errors (> 20% reads). Please run 'show statistics service like stat_read' to see values.", + "High read not found error check"); + + +// Write statistics + +t = select "stat_write_reqs" as "cnt" from SERVICE.STATISTICS save; + +e = select "stat_write_errs" from SERVICE.STATISTICS save; +r = do e/t; +r = do r * 100 save as "stat_write_errs % of total writes"; +r = do r <= 5; +ASSERT(r, True, "High write errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal write errors (> 5% writes). 
Please run 'show statistics service like stat_write' to see values.", + "High write error check"); + + +e = select "stat_read_errs_other" from SERVICE.STATISTICS save; s = select "stat_read_success" from SERVICE.STATISTICS; s = GROUP BY CLUSTER, NODE do SUM(s); -u = select "uptime" from SERVICE.STATISTICS; -u = GROUP BY CLUSTER, NODE do SUM(u); r = do e / s; r = do r/u on common; r = do r == 0; @@ -757,7 +1329,7 @@ SET CONSTRAINT VERSION >= 3.3.17; -defslp= select "defrag-sleep", "storage-engine.defrag-sleep" from NAMESPACE.CONFIG; +defslp= select "defrag-sleep", "storage-engine.defrag-sleep" from NAMESPACE.CONFIG save; defslp = group by CLUSTER, NAMESPACE defslp; r = do defslp == 1000; ASSERT(r, True, "Non-default namespace defrag-sleep configuration.", "OPERATIONS",INFO, @@ -766,4 +1338,77 @@ SET CONSTRAINT VERSION ALL; + +/* +Queries Requested by SA Team (Ronen) +*/ + +SET CONSTRAINT VERSION >= 3.9; + +crp = select "cache_read_pct" as "post-write-queue", "cache-read-pct" as "post-write-queue" from NAMESPACE.STATISTICS save; +pwq = select "post-write-queue", "storage-engine.post-write-queue" as "post-write-queue" from NAMESPACE.CONFIG save; +crp = do crp >= 10; +pwq = do pwq == 256; +r = do crp && pwq; +r = group by CLUSTER, NAMESPACE, NODE r; +ASSERT(r, False, "Sub-optimal post-write-queue", "OPERATIONS", INFO, + "Listed namespace[s] show high cache hit rate (> 10%) but post-write-queue value is default. It might be sub-optimal. 
Please contact Aerospike support team or SA team.", + "Namespace post-write-queue check"); + + +SET CONSTRAINT VERSION >= 3.11; + +ptl = select "partition-tree-locks" from NAMESPACE.CONFIG save; +cs = select "cluster_size" from SERVICE.STATISTICS; +cs = group by CLUSTER do MAX(cs) save as "cluster_size"; +r = do cs/ptl; +r = group by CLUSTER, NAMESPACE, NODE r; +r = do r < 2; + +ASSERT(r, True, "Non-recommended partition-tree-locks", "OPERATIONS", WARNING, + "Listed namespace[s] show low value for partition-tree-locks with respect to cluster size. It should be 8 for cluster-size < 16, 16 for cluster sizes 16 to 31, 32 for cluster sizes 32 to 63, etc. Please contact Aerospike support team or SA team.", + "Namespace partition-tree-locks check"); + + +m = select "memory-size" as "cnt" from NAMESPACE.CONFIG; +s = select "stop-writes-pct" as "cnt" from NAMESPACE.CONFIG; +s = do 100 - s; +s = do s/100; +extra_space = do m * s save as "breathing space (over stop-write)"; +extra_space = group by CLUSTER, NODE, NAMESPACE do SUM(extra_space); + +p = select "partition-tree-sprigs" from NAMESPACE.CONFIG save; +p = do p/16; + +overhead1 = do 64 * 1024; +overhead2 = do 1024 * 1024; +overhead = do overhead1 + overhead2; + +total_overhead = do p * overhead save as "partition-tree-sprigs overhead"; +r = do total_overhead < extra_space; + +e = select "edition" from METADATA; +e = do e == "Community"; +e = group by CLUSTER, NODE do OR(e); +ASSERT(r, False, "Non-recommended partition-tree-sprigs for Community edition", "OPERATIONS", INFO, + "Listed namespace[s] show low value for partition-tree-sprigs with respect to memory-size. partition-tree-sprigs overhead is less than (100 - stop-write-pct) % memory-size. It should be increased. 
Please contact Aerospike support team or SA team.", + "Namespace partition-tree-sprigs check for Community edition", + e); + +ee_overhead = do 320 * 1024; +overhead = do overhead + ee_overhead; + +total_overhead = do p * overhead save as "partition-tree-sprigs overhead"; +r = do total_overhead < extra_space; + +e = select "edition" from METADATA; +e = do e == "Enterprise"; +e = group by CLUSTER, NODE do OR(e); +ASSERT(r, False, "Non-recommended partition-tree-sprigs for Enterprise edition", "OPERATIONS", INFO, + "Listed namespace[s] show low value for partition-tree-sprigs with respect to memory-size. partition-tree-sprigs overhead is less than (100 - stop-write-pct) % memory-size. It should be increased. Please contact Aerospike support team or SA team.", + "Namespace partition-tree-sprigs check for Enterprise edition", + e); + +SET CONSTRAINT VERSION ALL; + ''' diff --git a/lib/health/query/health.hql b/lib/health/query/health.hql index fb5ef234..853ba466 100644 --- a/lib/health/query/health.hql +++ b/lib/health/query/health.hql @@ -10,8 +10,57 @@ // SET CONSTRAINT VERSION IN [3.8.4, 3.10.0]; SET CONSTRAINT VERSION ALL; +/* System checks */ + +limit = select "Soft_Max_open_files" as "fd" from SYSTEM.LIMITS save; +limit = group by CLUSTER, NODE, KEY do SUM(limit); +config = select "proto-fd-max" as "fd" from SERVICE.CONFIG save; +r = do config < limit; +ASSERT(r, True, "File descriptor is configured higher than limit.", "LIMITS", INFO, + "Listed node[s] have proto-fd-limit set higher than system soft limit of Max open files. 
Aerospike process may run out of file descriptor, Possible misconfiguration.", + "System open file descriptor limit check."); + +s = select * from SYSTEM.HDPARM save; +r = group by KEY do NO_MATCH(s, ==, MAJORITY) save; +ASSERT(r, False, "Different Disk Hardware in cluster.", "OPERATIONS", INFO, + "Different disk hardware configuration across multiple nodes in cluster.", "Disk hardware check."); + +s = select "OOM" from SYSTEM.DMESG save; +ASSERT(s, False, "DMESG: Process Out of Memory kill.", "OPERATIONS", INFO, + "Certain process was killed due to Out Of Memory. Check dmesg or system log.", + "System OOM kill check."); + +s = select "Blocked" from SYSTEM.DMESG save; +ASSERT(s, False, "DMESG: Process blocking.", "OPERATIONS", INFO, + "Certain process was blocked for more than 120sec. Check dmesg or system log.", + "System process blocking Check."); + +s = select "OS" from SYSTEM.DMESG save; +r = group by NODE do NO_MATCH(s, ==, MAJORITY) save; +ASSERT(r, False, "Different OS version in cluster.", "OPERATIONS", INFO, + "Different version of OS running across multiple nodes in cluster.", "OS version check."); + +s = select * from SYSTEM.LSCPU save; +r = group by KEY do NO_MATCH(s, ==, MAJORITY) save; +ASSERT(r, False, "CPU configuration mismatch.", "OPERATIONS", INFO, + "Listed node[s] in the cluster are running with different CPU or CPU setting, performance may be skewed. Please run 'lscpu' to check CPU configuration.", + "CPU config check."); + +s = select "vm_drop_caches", "vm_nr_hugepages", "vm_nr_hugepages_policy", "vm_numa_zonelist_order", "vm_oom_dump_tasks", "vm_oom_kill_allocating_task", "vm_zone_reclaim_mode", "vm_swapiness", + "vm_nr_overcommit_hugepages", "kernel_shmmax", "kernel_shmall", "kernel_version" from SYSTEM.SYSCTLALL save; +r = group by KEY do NO_MATCH(s, ==, MAJORITY); +ASSERT(r, False, "Sysctl configuration mismatch.", "OPERATIONS", INFO, + "Listed node[s] in the cluster are running with different Sysctl setting. 
Please run 'sysctl -a' to check CPU configuration.", + "Sysctl config check."); + +s = select "has_firewall" from SYSTEM.IPTABLES; +ASSERT(s, False, "Node in cluster have firewall setting.", "OPERATIONS", INFO, + "Listed node[s] have firewall setting. Could cause cluster formation issue if misconfigured. Please run 'iptables -L' to check firewall rules.", + "Firewall Check."); + + /* Disk */ -s = select "%util" from SYSTEM.IOSTAT; +s = select "%util" from SYSTEM.IOSTAT save; r = do s > 90; ASSERT(r, False, "High system disk utilization.", "PERFORMANCE", CRITICAL, "Listed disks show higher than normal (> 90%) disk utilization at the time of sampling. Please run 'iostat' command to check disk utilization. Possible causes can be disk overload due to undersized cluster or some issue with disk hardware itself. If running on cloud, can be a problem with cloud instance itself.", @@ -22,9 +71,9 @@ ASSERT(r1, False, "Skewed cluster disk utilization.", "ANOMALY", WARNING, "Disk utilization Anomaly."); -avail=select like(".*available_pct") as "free_disk" from NAMESPACE.STATISTICS; -disk_free = select "device_free_pct" as "free_disk", "free-pct-disk" as "free_disk" from NAMESPACE.STATISTICS; -r = do disk_free - avail; +avail=select like(".*available_pct") as "free_disk" from NAMESPACE.STATISTICS save; +disk_free = select "device_free_pct" as "free_disk", "free-pct-disk" as "free_disk" from NAMESPACE.STATISTICS save; +r = do disk_free - avail save as "fragmented blocks pct"; r = do r <= 30; r = group by CLUSTER, NAMESPACE r; ASSERT(r, True, "High (> 30%) fragmented blocks.", "PERFORMANCE", WARNING, @@ -32,7 +81,7 @@ ASSERT(r, True, "High (> 30%) fragmented blocks.", "PERFORMANCE", WARNING, "Fragmented Blocks check."); -s = select "%iowait" from SYSTEM.IOSTAT; +s = select "%iowait" from SYSTEM.IOSTAT save; r = do s > 10; ASSERT(r, False, "High (> 10%) CPU IO wait time.", "PERFORMANCE", WARNING, "Listed nodes show higher than normal (> 10%) CPU spent in io wait. 
Please run 'iostat' command to check utilization. Possible cause can be slow disk or network leading to lot of CPU time spent waiting for IO.", @@ -43,7 +92,7 @@ ASSERT(r1, False, "Skewed CPU IO wait time.", "ANOMALY", WARNING, "CPU IO wait time anomaly."); -s = select "await" from SYSTEM.IOSTAT; +s = select "await" from SYSTEM.IOSTAT save; r = do s > 4; ASSERT(r, False, "High system disk average wait time.", "PERFORMANCE", WARNING, "Listed disks show higher than normal (> 4ms) disk average wait time. Please run 'iostat' command to check average wait time (await). Possible cause can be issue with disk hardware or VM instance in case you are running in cloud environment. This may also be caused by having storage over network like say SAN device or EBS.", @@ -54,7 +103,7 @@ ASSERT(r1, False, "Skewed cluster disk average wait time", "ANOMALY", WARNING, "Disk average wait time anomaly check."); -s = select "avgqu-sz" from SYSTEM.IOSTAT; +s = select "avgqu-sz" from SYSTEM.IOSTAT save; r = do s > 7; ASSERT(r, False, "High disk average queue size.", "PERFORMANCE", INFO, "Listed disks show higher than normal (> 7) disk average queue size. This is not a issue if using NVME drives which support more queues. Please run 'iostat' command to check average wait time (avgqu-sz). Possible disk overload. This may be non-issue of disk has more than 7 queues. Please analyze this number in conjunction with utilization.", @@ -65,8 +114,8 @@ ASSERT(r1, False, "Skewed cluster disk avg queue size.", "ANOMALY", WARNING, "Disk avg queue size anomaly check."); -s = select "id" as "cpu_use" from SYSTEM.TOP.CPU_UTILIZATION; -s = do 100 - s; +s = select "id" as "cpu_use" from SYSTEM.TOP.CPU_UTILIZATION save as "cpu_idle_pct"; +s = do 100 - s save as "cpu utilization pct"; r = do s > 70; ASSERT(r, False, "High system CPU utilization.", "PERFORMANCE", CRITICAL, "Listed node[s] are showing higher than normal (> 70%) CPU utilization. Please check top output. 
Possible system overload.", @@ -77,21 +126,21 @@ ASSERT(r1, False, "Skewed cluster CPU utilization.", "ANOMALY", WARNING, "CPU utilization anomaly check."); -s = select "resident_memory" from SYSTEM.TOP; +s = select "resident_memory" from SYSTEM.TOP save; r = group by KEY do SD_ANOMALY(s, ==, 3); ASSERT(r, False, "Skewed cluster resident memory utilization.", "ANOMALY", WARNING, "Listed node[s] show different resident memory usage compared to other node[s]. Please run top command on those node[s] to confirm such behavior. Possible skewed data distribution. This may be non-issue in case migrations are going on.", "Resident memory utilization anomaly."); -s = select "system_swapping" from SERVICE.STATISTICS; +s = select "system_swapping" from SERVICE.STATISTICS save; r = do s == true; ASSERT(r, False, "System memory swapping.", "LIMITS", INFO, "Listed node[s] are swapping. Please run 'show statistics service like system_swapping' to confirm such behaviour. Possible misconfiguration. This may be non-issue if amount of swap is small and good amount of memory available.", "System swap check."); /* TODO - is it really actually an issue */ -s = select "system_free_mem_pct"; +s = select "system_free_mem_pct" from SERVICE.STATISTICS save; r = do s < 20; ASSERT(r, False, "Low system memory percentage.", "LIMITS", CRITICAL, "Listed node[s] have lower than normal (< 20%) system free memory percentage. Please run 'show statistics service like system_free_mem_pct' to get actual values. 
Possible misconfiguration.", @@ -100,7 +149,7 @@ ASSERT(r, False, "Low system memory percentage.", "LIMITS", CRITICAL, /* NB : ADD CHECKS IF NODES ARE NOT HOMOGENOUS MEM / NUM CPU etc */ -s = select "available_bin_names", "available-bin-names" from NAMESPACE; +s = select "available_bin_names", "available-bin-names" from NAMESPACE save; r = group by NAMESPACE do s > 3200; ASSERT(r, True, "Low namespace available bin names.", "LIMITS", WARNING, "Listed node[s] have low available bin name (< 3200) for corresponding namespace[s]. Maximum unique bin names allowed per namespace are 32k. Please run 'show statistics namespace like available' to get actual values. Possible improperly modeled data.", @@ -109,29 +158,29 @@ ASSERT(r, True, "Low namespace available bin names.", "LIMITS", WARNING, /* Holds only upto 4B key */ SET CONSTRAINT VERSION < 3.12; -s = select "memory-size" from NAMESPACE; -r = group by NODE, NAMESPACE do SUM(s); +s = select "memory-size" from NAMESPACE.CONFIG save; +r = group by CLUSTER, NODE, NAMESPACE do SUM(s); e = do r <= 274877906944; ASSERT(e, True, "Namespace configured to use more than 256G.", "LIMITS", WARNING, - "On list nodes namespace as mentioned have configured more than 256G of memory. Namespace with data not in memory can have max upto 4billion keys and can utilize only up to 256G. Please run 'show statistics namespace like memory-size' to check configured memory.", + "On listed nodes namespace as mentioned have configured more than 256G of memory. Namespace with data not in memory can have max upto 4 billion keys and can utilize only up to 256G. Please run 'show statistics namespace like memory-size' to check configured memory.", "Namespace per node memory limit check."); SET CONSTRAINT VERSION ALL; /* -Following query selects assigned memory-size from namespace statistics and total ram size from system statistics. +Following query selects assigned memory-size from namespace config and total ram size from system statistics. 
group by for namespace stats sums all memory size and gives node level memory size. group by for system stats helps to remove key, this is requirement for proper matching for simple operations. */ -s = select "memory-size" from NAMESPACE; -n = group by NODE do SUM(s); +s = select "memory-size" from NAMESPACE.CONFIG save; +n = group by NODE do SUM(s) save as "sum of memory-size"; s = select "total" from SYSTEM.FREE.MEM; -m = group by NODE do SUM(s); +m = group by NODE do SUM(s) save as "total physical memory"; r = do n <= m on common; ASSERT(r, True, "Namespace memory misconfiguration.", "LIMITS", WARNING, "Listed node[s] have more namespace memory configured than available physical memory. Please run 'show statistics namespace like memory-size' to check configured memory and check output of 'free' for system memory. Possible namespace misconfiguration.", "Namespace memory configuration check."); -r = do m - n on common; +r = do m - n on common save as "runtime memory"; r = do r >= 5368709120; ASSERT(r, True, "Aerospike runtime memory configured < 5G.", "LIMITS", INFO, "Listed node[s] have less than 5G free memory available for Aerospike runtime. Please run 'show statistics namespace like memory-size' to check configured memory and check output of 'free' for system memory. Possible misconfiguration.", @@ -142,30 +191,40 @@ ASSERT(r, True, "Aerospike runtime memory configured < 5G.", "LIMITS", INFO, Following query selects proto-fd-max from service config and client_connections from service statistics. It uses as clause to get proper matching structure for simple operation. 
*/ -max = select "proto-fd-max" as "fd" from SERVICE.CONFIG; -conn = select "client_connections" as "fd" from SERVICE.STATISTICS; +max = select "proto-fd-max" as "fd" from SERVICE.CONFIG save; +conn = select "client_connections" as "fd" from SERVICE.STATISTICS save; bound = do 80 %% max; r = do conn > bound; ASSERT(r, False, "High system client connections.", "OPERATIONS", WARNING, - "Listed node[s] show higher than normal (> 80%) client-connections of the max configured proto-fd-max. Please run 'show config like proto-fd-max' and 'show statistics like client_connections' for actual values. Possible can be network issue / improper client behavior / FD leak.", + "Listed node[s] show higher than normal client-connections (> 80% of the max configured proto-fd-max). Please run 'show config like proto-fd-max' and 'show statistics like client_connections' for actual values. Possible can be network issue / improper client behavior / FD leak.", "Client connections check."); -s = select like(".*available_pct") from NAMESPACE.STATISTICS; +s = select like(".*available_pct") from NAMESPACE.STATISTICS save; r = do s < 20; ASSERT(r, False, "Low namespace disk available pct.", "OPERATIONS", WARNING, - "Listed namespace[s] have lower than normal (< 20 %). Please run 'show statistics namespace like available_pct' to check available disk space. Probable cause - namespace size misconfiguration.", + "Listed namespace[s] have lower than normal (< 20 %) available disk space. Please run 'show statistics namespace like available_pct' to check available disk space. 
Probable cause - namespace size misconfiguration.", "Namespace disk available pct check."); -s = select * from SERVICE.CONFIG; -r = group by KEY do EQUAL(s); -ASSERT(r, True, "Different service configurations.", "OPERATIONS", WARNING, +s = select * from SERVICE.CONFIG ignore "pidfile", "heartbeat.mtu", like(".*address"), like(".*port") save; +r = group by CLUSTER, KEY do NO_MATCH(s, ==, MAJORITY) save; +ASSERT(r, False, "Different service configurations.", "OPERATIONS", WARNING, "Listed Service configuration[s] are different across multiple nodes in cluster. Please run 'show config service diff' to check different configuration values. Probable cause - config file misconfiguration.", "Service configurations difference check."); +multicast_mode_enabled = select like(".*mode") from NETWORK.CONFIG; +multicast_mode_enabled = do multicast_mode_enabled == "multicast"; +multicast_mode_enabled = group by CLUSTER, NODE do OR(multicast_mode_enabled); +s = select like(".*mtu") from SERVICE.CONFIG save; +r = group by CLUSTER do NO_MATCH(s, ==, MAJORITY) save; +ASSERT(r, False, "Different heartbeat.mtu.", "OPERATIONS", WARNING, + "Listed node[s] have a different heartbeat.mtu configured. A multicast packet can only be as large as the interface mtu. Different mtu values might create cluster stability issue. Please contact Aerospike Support team.", + "heartbeat.mtu check.", + multicast_mode_enabled); + -s = select "migrate-threads", "migrate_threads" from SERVICE.CONFIG; +s = select "migrate-threads", "migrate_threads" from SERVICE.CONFIG save; r = do s > 1; ASSERT(r, False, "> 1 migrate thread configured.", "OPERATIONS", INFO, "Listed node[s] are running with higher than normal (> 1) migrate threads. Please run 'show config service like migrate-threads' to check migration configuration. Is a non-issue if requirement is to run migration aggressively. 
Otherwise possible operational misconfiguration.", @@ -173,27 +232,35 @@ ASSERT(r, False, "> 1 migrate thread configured.", "OPERATIONS", INFO, /* Device Configuration */ -s = select "device_total_bytes", "device-total-bytes", "total-bytes-disk" from NAMESPACE.STATISTICS; -r = group by NAMESPACE do EQUAL(s); -ASSERT(r, True, "Different namespace device size configuration.", "OPERATIONS", WARNING, +s = select "device_total_bytes", "device-total-bytes", "total-bytes-disk" from NAMESPACE.STATISTICS save; +r = group by CLUSTER, NAMESPACE do NO_MATCH(s, ==, MAJORITY) save; +ASSERT(r, False, "Different namespace device size configuration.", "OPERATIONS", WARNING, "Listed namespace[s] have difference in configured disk size. Please run 'show statistics namespace like bytes' to check total device size. Probable cause - config file misconfiguration.", "Namespace device size configuration difference check."); -hwm = select "high-water-disk-pct" from NAMESPACE.CONFIG; +hwm = select "high-water-disk-pct" from NAMESPACE.CONFIG save; hwm = group by CLUSTER, NAMESPACE hwm; r = do hwm == 50; ASSERT(r, True, "Non-default namespace device high water mark configuration.", "OPERATIONS", INFO, "Listed namespace[s] have non-default high water mark configuration. Please run 'show config namespace like high-water-disk-pct' to check value. Probable cause - config file misconfiguration.", "Non-default namespace device high water mark check."); -hwm = select "high-water-disk-pct" as "defrag-lwm-pct" from NAMESPACE.CONFIG; -lwm = select like(".*defrag-lwm-pct") as "defrag-lwm-pct" from NAMESPACE.CONFIG; +lwm = select like(".*defrag-lwm-pct") from NAMESPACE.CONFIG save; +lwm = group by CLUSTER, NAMESPACE lwm; +r = do lwm == 50; +ASSERT(r, True, "Non-default namespace device low water mark configuration.", "OPERATIONS", INFO, + "Listed namespace[s] have non-default low water mark configuration. 
Probable cause - config file misconfiguration.", + "Non-default namespace device low water mark check."); + +hwm = select "high-water-disk-pct" as "defrag-lwm-pct" from NAMESPACE.CONFIG save; +lwm = select like(".*defrag-lwm-pct") as "defrag-lwm-pct" from NAMESPACE.CONFIG save; r = do lwm < hwm on common; r = group by CLUSTER, NAMESPACE r; ASSERT(r, False, "Defrag low water mark misconfigured.", "OPERATIONS", WARNING, "Listed namespace[s] have defrag-lwm-pct lower than high-water-disk-pct. This might create situation like no block to write, no eviction and no defragmentation. Please run 'show config namespace like high-water-disk-pct defrag-lwm-pct' to check configured values. Probable cause - namespace watermark misconfiguration.", "Defrag low water mark misconfiguration check."); + /* Following query collects used device space and total device space and computes available free space on each node per namespace per cluster (group by CLUSTER, NAMESPACE, NODE). It collects cluster-size and uses it to find out expected data distribution for each node in case that node is down. 
It checks max of this computed value per namespace @@ -204,13 +271,13 @@ t = select "device_total_bytes" as "disk_space", "device-total-bytes" as "disk_s u = select "used-bytes-disk" as "disk_space", "device_used_bytes" as "disk_space" from NAMESPACE.STATISTICS; /* Available extra space */ e = do t - u; -e = group by CLUSTER, NAMESPACE, NODE do SUM(e); +e = group by CLUSTER, NAMESPACE, NODE do SUM(e) save as "available device space"; s = select "cluster_size" as "size" from SERVICE; -n = do AVG(s); +n = do MAX(s); n = do n - 1; /* Extra space need if 1 node goes down */ e1 = do u / n; -e1 = group by CLUSTER, NAMESPACE do MAX(e1); +e1 = group by CLUSTER, NAMESPACE do MAX(e1) save as "distribution share of used device space per node"; r = do e > e1; ASSERT(r, True, "Namespace under configured (disk) for single node failure.", "OPERATIONS", WARNING, "Listed namespace[s] does not have enough disk space configured to deal with increase in data per node in case of 1 node failure. Please run 'show statistics namespace like bytes' to check device space. 
It is non-issue if single replica limit is set to larger values, i.e if number of replica copies are reduced in case of node loss.", @@ -219,18 +286,17 @@ ASSERT(r, True, "Namespace under configured (disk) for single node failure.", "O /* Same as above query but for memory */ -t = select "memory-size" as "mem" from NAMESPACE; +t = select "memory-size" as "mem" from NAMESPACE.CONFIG; u = select "used-bytes-memory" as "mem", "memory_used_bytes" as "mem" from NAMESPACE.STATISTICS; /* Available extra space */ e = do t - u; -e = group by CLUSTER, NAMESPACE, NODE do SUM(e); - +e = group by CLUSTER, NAMESPACE, NODE do SUM(e) save as "available memory space"; s = select "cluster_size" as "size" from SERVICE; -n = do AVG(s); +n = do MAX(s); n = do n - 1; /* Extra space need if 1 node goes down */ e1 = do u / n; -e1 = group by CLUSTER, NAMESPACE do MAX(e1); +e1 = group by CLUSTER, NAMESPACE do MAX(e1) save as "distribution share of used memory space per node"; r = do e > e1; ASSERT(r, True, "Namespace under configured (memory) for single node failure.", "OPERATIONS", WARNING, "Listed namespace[s] does not have enough memory space configured to deal with increase in data per node in case of 1 node failure. Please run 'show statistics namespace like bytes' to check memory space. It is non-issue if single replica limit is set to larger values, i.e number of replica copies reduce.", @@ -239,54 +305,55 @@ ASSERT(r, True, "Namespace under configured (memory) for single node failure.", /* Namespace Configuration */ +SET CONSTRAINT VERSION < 3.13; + nsid = select "nsid" from NAMESPACE.CONFIG; -r = group by CLUSTER, NAMESPACE do EQUAL(nsid); -ASSERT(r, True, "Different namespace order in aerospike conf.", "OPERATIONS", CRITICAL, +r = group by CLUSTER, NAMESPACE do NO_MATCH(nsid, ==, MAJORITY) save; +ASSERT(r, False, "Different namespace order in aerospike conf.", "OPERATIONS", CRITICAL, "Listed namespace[s] have different order on different nodes. 
Please check aerospike conf file on all nodes and change configuration to make namespace order same.", "Namespace order check."); +SET CONSTRAINT VERSION ALL; + repl = select "replication-factor", "repl-factor" from NAMESPACE.CONFIG; repl = group by CLUSTER, NAMESPACE repl; -ns_count = group by CLUSTER do COUNT(repl); -ns_count_per_node = group by CLUSTER, NODE do COUNT(repl); +ns_count = group by CLUSTER do COUNT(repl) save as "total available namespaces for cluster"; +ns_count_per_node = group by CLUSTER, NODE do COUNT(repl) save as "namespace count"; r = do ns_count_per_node == ns_count; ASSERT(r, True, "Disparate namespaces.", "OPERATIONS", WARNING, "Listed node[s] do not have all namespaces configured. Please check aerospike conf file on all nodes and change namespace configuration as per requirement.", "Namespaces per node count check."); -r = select "replication-factor", "repl-factor" from NAMESPACE.CONFIG; +r = select "replication-factor", "repl-factor" from NAMESPACE.CONFIG save; r = group by CLUSTER, NAMESPACE r; r = do r == 2; ASSERT(r, True, "Non-default namespace replication-factor configuration.", "OPERATIONS", INFO, "Listed namespace[s] have non-default replication-factor configuration. Please run 'show config namespace like repl' to check value. It may be non-issue in case namespace are configured for user requirement. Ignore those.", "Non-default namespace replication-factor check."); -s = select * from NAMESPACE.CONFIG; -r = group by NAMESPACE, KEY do EQUAL(s); -ASSERT(r, True, "Different namespace configurations.", "OPERATIONS", WARNING, +s = select * from NAMESPACE.CONFIG ignore "rack-id", like(".*device"), like(".*file") save; +r = group by CLUSTER, NAMESPACE, KEY do NO_MATCH(s, ==, MAJORITY) save; +ASSERT(r, False, "Different namespace configurations.", "OPERATIONS", WARNING, "Listed namespace configuration[s] are different across multiple nodes in cluster. Please run 'show config namespace diff' to get actual difference. 
It may be non-issue in case namespace are configured with different device or file name etc. Ignore those.", "Namespace configurations difference check."); - -s = select like(".*_err.*") from SERVICE.STATISTICS; +/* Errors */ +s = select like(".*_err.*") from SERVICE.STATISTICS save; u = select "uptime" from SERVICE.STATISTICS; -u = group by CLUSTER, NODE do SUM(u); +u = group by CLUSTER, NODE do MAX(u); s = do s / u; r = group by KEY do SD_ANOMALY(s, ==, 3); -ASSERT(r, False, "Skewed cluster service errors count.", "ANOMALY", WARNING, +ASSERT(r, False, "Skewed cluster service errors count.", "ANOMALY", INFO, "Listed service errors[s] show skew in error count patterns (for listed node[s]). Please run 'show statistics service like err' for details.", "Service errors count anomaly check."); -s = select like(".*_error") from NAMESPACE.STATISTICS; -u = select "uptime" from SERVICE.STATISTICS; -u = group by CLUSTER, NODE do MAX(u); -s = do s / u on common; -d = group by NAMESPACE, KEY do SUM(s); -e = do d == 0; -ASSERT(e, True, "Non-zero namespace errors count.", "OPERATIONS", WARNING, - "Listed namespace error[s] show skew in count (for nodes). It may or may not be an issue depending on the error type. Please run 'show statistics namespace like error' for details.", - "Namespace errors count check."); +e = select "hwm_breached", "hwm-breached" from NAMESPACE.STATISTICS; +e = group by CLUSTER, NAMESPACE e; +r = do e == False; +ASSERT(r, True, "Namespace HWM breached.", "OPERATIONS", WARNING, + "Listed namespace[s] show HWM breached for memory or Disks.", + "Namespace HWM breach check."); /* Following query collects master_objects, prole_objects and replication_factor, and computes proles for one replication (prole_objects/(replication_factor-1)). 
@@ -297,82 +364,82 @@ this last result will 'AND' with replication_enabled and migration_in_progress b m = select "master_objects" as "cnt", "master-objects" as "cnt" from NAMESPACE.STATISTICS; p = select "prole_objects" as "cnt", "prole-objects" as "cnt" from NAMESPACE.STATISTICS; r = select "replication-factor", "repl-factor" from NAMESPACE.CONFIG; -m = select "migrate_rx_partitions_active", "migrate_progress_recv", "migrate-rx-partitions-active" from NAMESPACE.STATISTICS; -mt = group by NAMESPACE do SUM(m); +mg = select "migrate_rx_partitions_active", "migrate_progress_recv", "migrate-rx-partitions-active" from NAMESPACE.STATISTICS; +mt = group by NAMESPACE do SUM(m) save as "master_objects"; pt = group by NAMESPACE do SUM(p); r = group by NAMESPACE do MAX(r); -m = group by NAMESPACE do MAX(m); -migration_in_progress = do m > 0; +mg = group by NAMESPACE do MAX(mg); +no_migration = do mg == 0; + replication_enabled = do r > 1; r = do r - 1; -pt = do pt / r; -discounted_pt = do 95 %% pt; +pt = do pt / r save as "unique prole_objects"; +discounted_pt = do 95 %% pt save as "95% of unique prole_objects"; d = do discounted_pt > mt; d = do d && replication_enabled; -d = do d && migration_in_progress; +d = do d && no_migration; ASSERT(d, False, "Skewed namespace data distribution, prole objects exceed master objects by > 5%.", "DATA", INFO, "Listed namespace[s] show abnormal object distribution. It may not be an issue if migrations are in progress. 
Please run 'show statistics namespace like object' for actual counts.", "Namespace data distribution check (prole objects exceed master objects by > 5%)."); -discounted_mt = do 95 %% mt; +discounted_mt = do 95 %% mt save as "95% of master_objects"; d = group by NAMESPACE do discounted_mt > pt; d = do d && replication_enabled; -d = do d && migration_in_progress; +d = do d && no_migration; ASSERT(d, False, "Skewed namespace data distribution, master objects exceed prole objects by > 5%.", "DATA", INFO, "Listed namespace[s] show abnormal object distribution. It may not be an issue if migrations are in progress. Please run 'show statistics namespace like object' for actual counts.", "Namespace data distribution check (master objects exceed prole objects by > 5%)."); -s = select "set-delete", "deleting" as "set-delete" from SET; -r = group by NAMESPACE, SET do EQUAL(s); -ASSERT(r, True, "Different set delete status.", "OPERATIONS", INFO, +s = select "set-delete", "deleting" as "set-delete" from SET save; +r = group by CLUSTER, NAMESPACE, SET do NO_MATCH(s, ==, MAJORITY) save; +ASSERT(r, False, "Different set delete status.", "OPERATIONS", INFO, "Listed set[s] have different set delete status across multiple nodes in cluster. This is non-issue if set-delete is being performed. Nodes reset the status asynchronously. Please check if nsup is still delete data for the set.", "Set delete status check."); -s = select like ("disable-eviction") from SET; -r = group by NAMESPACE, SET do EQUAL(s); -ASSERT(r, True, "Different set eviction configuration.", "OPERATIONS", WARNING, +s = select like ("disable-eviction") from SET save; +r = group by CLUSTER, NAMESPACE, SET do NO_MATCH(s, ==, MAJORITY) save; +ASSERT(r, False, "Different set eviction configuration.", "OPERATIONS", WARNING, "Listed set[s] have different eviction setting across multiple nodes in cluster. Please run 'show statistics set like disable-eviction' to check values. 
Possible operational misconfiguration.", "Set eviction configuration difference check."); -s = select like ("set-enable-xdr") from SET; -r = group by NAMESPACE, SET do EQUAL(s); -ASSERT(r, True, "Different set xdr configuration.", "OPERATIONS", WARNING, +s = select like ("set-enable-xdr") from SET save; +r = group by CLUSTER, NAMESPACE, SET do NO_MATCH(s, ==, MAJORITY) save; +ASSERT(r, False, "Different set xdr configuration.", "OPERATIONS", WARNING, "Listed set[s] have different XDR replication setting across multiple nodes in cluster. Please run 'show statistics set like set-enable-xdr' to check values. Possible operational misconfiguration.", "Set xdr configuration difference check."); -s = select "n_objects", "objects" as "n_objects" from SET; -/* Should be Anomaly */ -r = group by NAMESPACE, SET do SD_ANOMALY(s, ==, 3); +s = select "n_objects", "objects" from SET save; +r = group by CLUSTER, NAMESPACE, SET do SD_ANOMALY(s, ==, 3); ASSERT(r, False, "Skewed cluster set object count.", "ANOMALY", WARNING, "Listed set[s] have skewed object distribution. Please run 'show statistics set like object' to check counts. It may be non-issue if cluster is undergoing migrations.", "Set object count anomaly check."); /* XDR */ -s = select * from XDR.CONFIG; -r = GROUP by KEY do EQUAL(s); -ASSERT(r, True, "Different XDR configurations.", "OPERATIONS", WARNING, +s = select * from XDR.CONFIG save; +r = GROUP by CLUSTER, KEY do NO_MATCH(s, ==, MAJORITY) save; +ASSERT(r, False, "Different XDR configurations.", "OPERATIONS", WARNING, "Listed XDR configuration[s] are different across multiple nodes in cluster. Please run 'show config xdr diff' to get difference. 
Possible operational misconfiguration.", "XDR configurations difference check."); -s = select * from XDR.STATISTICS; +s = select * from XDR.STATISTICS save; u = select "uptime" from SERVICE.STATISTICS; -u = group by CLUSTER, NODE do SUM(u); +u = group by CLUSTER, NODE do MAX(u); s = do s / u; -r = group by KEY do SD_ANOMALY(s, ==, 3); +r = group by CLUSTER, KEY do SD_ANOMALY(s, ==, 3); ASSERT(r, False, "Skewed cluster XDR statistics.", "ANOMALY", WARNING, "Listed XDR statistic[s] show skew for the listed node[s]. It may or may not be an issue depending on the statistic type.", "XDR statistics anomaly check."); -s = select * from DC.STATISTICS; +s = select * from DC.STATISTICS ignore "dc_size", "dc_state" save; u = select "uptime" from SERVICE.STATISTICS; -u = group by CLUSTER, NODE do SUM(u); +u = group by CLUSTER, NODE do MAX(u); s = do s / u on common; -r = group by DC, KEY do SD_ANOMALY(s, ==, 3); +r = group by CLUSTER, DC, KEY do SD_ANOMALY(s, ==, 3); ASSERT(r, False, "Skewed cluster remote DC statistics.", "ANOMALY", WARNING, "Listed DC statistic[s] show skew for the listed node[s]. Please run 'show statistics dc' to get all DC stats. May be non-issue if remote Data center connectivity behavior for nodes is not same.", "Remote DC statistics anomaly check."); @@ -384,51 +451,58 @@ assert input data structure, only exceptions are data which grouped by DC, in th */ xdr_enabled = select "enable-xdr" from XDR.CONFIG; xdr_enabled = group by CLUSTER, NODE do OR(xdr_enabled); +cluster_xdr_enabled = group by CLUSTER do OR(xdr_enabled); -s = select "xdr-dc-state", "dc_state" from DC.STATISTICS; -r = group by DC do EQUAL(s); -ASSERT(r, True, "Different remote DC states.", "OPERATIONS", WARNING, - "Listed node[s] have a different remote DC visibility. Please run 'show statistics dc like state' to see DC state. 
Possible network issue between data centers.", +s = select "xdr-dc-state", "dc_state" from DC.STATISTICS save; +r = group by CLUSTER, DC do NO_MATCH(s, ==, MAJORITY) save; +ASSERT(r, False, "Different remote DC states.", "OPERATIONS", WARNING, + "Listed DC[s] have a different remote DC visibility. Please run 'show statistics dc like state' to see DC state. Possible network issue between data centers.", "Remote DC state check.", xdr_enabled); -s = select "free-dlog-pct", "dlog_free_pct", "free_dlog_pct" from XDR; +s = select "dc_size" from DC.STATISTICS save; +r = group by CLUSTER, DC do NO_MATCH(s, ==, MAJORITY) save; +ASSERT(r, False, "Different remote DC sizes.", "OPERATIONS", WARNING, + "Listed DC[s] have a different remote DC size. Please run 'show statistics dc like size' to see DC size. Possible network issue between data centers.", + "Remote DC size check."); + +s = select "free-dlog-pct", "dlog_free_pct", "free_dlog_pct" from XDR save; r = do s < 95; ASSERT(r, False, "Low XDR free digest log space.", "OPERATIONS", INFO, "Listed node[s] have lower than ideal (95%) free digest log space. Please run 'show statistics xdr like free' to see digest log space. Probable cause - low XDR throughput or a failed node processing in progress.", "XDR free digest log space check.", xdr_enabled); -r = group by CLUSTER, NODE do SD_ANOMALY(s, ==, 3); +r = group by CLUSTER do SD_ANOMALY(s, ==, 3); ASSERT(r, False, "Skewed cluster XDR free digest log space.", "ANOMALY", WARNING, "Listed node[s] have different digest log free size pattern. Please run 'show statistics xdr like free' to see digest log space. May not be an issue if the nodes are newly added or have been restarted with noresume or if remote Datacenter connectivity behavior differs for nodes.", "XDR free digest log space anomaly check.", - xdr_enabled); + cluster_xdr_enabled); /* Needs normalization but not sure on what ?? 
*/ -s = select "timediff_lastship_cur_secs", "xdr_timelag" from XDR.STATISTICS; +s = select "timediff_lastship_cur_secs", "xdr_timelag" from XDR.STATISTICS save; r = do s > 10; ASSERT(r, False, "High XDR shipping lag (> 10s).", "PERFORMANCE", WARNING, "Listed node[s] have higher than healthy ( > 10 sec) ship lag to remote data center. Please run 'show statistics xdr like time' to see shipping lag. Probable cause - connectivity issue to remote datacenter or spike in write throughput on the local cluster.", "XDR shipping lag check.", xdr_enabled); -r = group by CLUSTER, NODE do SD_ANOMALY(s, ==, 3); +r = group by CLUSTER do SD_ANOMALY(s, ==, 3); ASSERT(r, False, "Cluster XDR shipping lag skewed.", "ANOMALY", WARNING, "Listed node[s] have different ship lag patterns. Please run 'show statistics xdr like time' to see shipping lag. May not be an issue if the nodes are newly added or have been restarted with noresume or if remote Datacenter connectivity behavior differs for nodes.", "XDR shipping lag anomaly check.", - xdr_enabled); + cluster_xdr_enabled); -s = select "xdr-dc-timelag", "dc_timelag" from DC.STATISTICS; -r = group by DC do SD_ANOMALY(s, ==, 3); +s = select "xdr-dc-timelag", "dc_timelag" from DC.STATISTICS save; +r = group by CLUSTER, DC do SD_ANOMALY(s, ==, 3); ASSERT(r, False, "Skewed cluster remote DC Lag.", "ANOMALY", WARNING, "Listed node[s] have different latency to remote data center. Please run 'show statistics dc like timelag' to see timelag. Possible Data center connectivity issue.", "Remote DC lag anomaly check.", - xdr_enabled); + cluster_xdr_enabled); /* XDR xdr_read_latency_avg check */ -s = select "xdr_read_latency_avg", "local_recs_fetch_avg_latency" from XDR.STATISTICS; +s = select "xdr_read_latency_avg", "local_recs_fetch_avg_latency" from XDR.STATISTICS save; r = do s > 2; ASSERT(r, False, "High XDR average read latency (>2 sec).", "PERFORMANCE", WARNING, "Listed node[s] have higher than normal (> 2sec) local read latencies. 
Please run 'show statistics xdr like latency' to see XDR read latency. Probable cause - system overload causing transaction queue to back up.", @@ -436,9 +510,9 @@ ASSERT(r, False, "High XDR average read latency (>2 sec).", "PERFORMANCE", WARNI xdr_enabled); -s = select "dc_open_conn" as "conn" from DC.STATISTICS; -ds = select "dc_size" as "conn" from DC.STATISTICS; -ds = do ds * 64; +s = select "dc_open_conn" as "conn" from DC.STATISTICS save; +ds = select "dc_size" as "conn" from DC.STATISTICS save; +ds = do ds * 64 save as "max expected dc connections"; r = do s > ds; ASSERT(r, False, "High remote DC connections.", "LIMITS", WARNING, "Listed node[s] have higher than normal remote datacenter connections. Generally accepted number is (64*No of nodes in remote DC) per node. Please run 'show statistics dc like dc_open_conn dc_size' to see DC connection statistics. Ignore if XDR is not pipelined.", @@ -446,7 +520,7 @@ ASSERT(r, False, "High remote DC connections.", "LIMITS", WARNING, xdr_enabled); -s = select "xdr_uninitialized_destination_error", "noship_recs_uninitialized_destination" from XDR.STATISTICS; +s = select "xdr_uninitialized_destination_error", "noship_recs_uninitialized_destination" from XDR.STATISTICS save; r = do s > 0; ASSERT(r, False, "Uninitialized destination cluster.", "OPERATIONS", WARNING, "Listed node[s] have a non zero value for this uninitialized DC. Please check the configuration.", @@ -454,7 +528,7 @@ ASSERT(r, False, "Uninitialized destination cluster.", "OPERATIONS", WARNING, xdr_enabled); -s = select "xdr_unknown_namespace_error", "noship_recs_unknown_namespace" from XDR.STATISTICS; +s = select "xdr_unknown_namespace_error", "noship_recs_unknown_namespace" from XDR.STATISTICS save; r = do s > 0; ASSERT(r, False, "Missing namespace in remote data center.", "OPERATIONS", WARNING, "Certain namespace not found in remote DC. 
Please check the configuration to ascertain if remote DC has all the namespace being shipped.", @@ -462,7 +536,7 @@ ASSERT(r, False, "Missing namespace in remote data center.", "OPERATIONS", WARNI xdr_enabled); /* XDR failednode_sessions_pending check */ -s = select "failednode_sessions_pending", "xdr_active_failed_node_sessions" from XDR.STATISTICS; +s = select "failednode_sessions_pending", "xdr_active_failed_node_sessions" from XDR.STATISTICS save; r = do s > 0; ASSERT(r, False, "Active failed node sessions.", "OPERATIONS", INFO, "Listed node[s] have failed node sessions pending. Please check if there are any failed nodes on the source cluster.", @@ -470,7 +544,7 @@ ASSERT(r, False, "Active failed node sessions.", "OPERATIONS", INFO, xdr_enabled); /* XDR linkdown_sessions_pending check */ -s = select "linkdown_sessions_pending", "xdr_active_link_down_sessions" from XDR.STATISTICS; +s = select "linkdown_sessions_pending", "xdr_active_link_down_sessions" from XDR.STATISTICS save; r = do s > 0; ASSERT(r, False, "Active linkdown sessions.", "OPERATIONS", INFO, "Listed node[s] have link down sessions pending. Please check the connectivity of remote datacenter.", @@ -478,7 +552,7 @@ ASSERT(r, False, "Active linkdown sessions.", "OPERATIONS", INFO, xdr_enabled); /* XDR xdr_ship_outstanding_objects check */ -s = select "xdr_ship_outstanding_objects", "stat_recs_outstanding" from XDR.STATISTICS; +s = select "xdr_ship_outstanding_objects", "stat_recs_outstanding" from XDR.STATISTICS save; r = do s > 10000; ASSERT(r, False, "Too many outstanding objects (>10000) to ship !!.", "OPERATIONS", WARNING, "Listed node[s] have too many records outstanding. 
Please check relogging and error statistics.", @@ -486,7 +560,7 @@ ASSERT(r, False, "Too many outstanding objects (>10000) to ship !!.", "OPERATION xdr_enabled); /* XDR xdr_ship_inflight_objects check */ -s = select "xdr_ship_inflight_objects", "stat_recs_inflight" from XDR.STATISTICS; +s = select "xdr_ship_inflight_objects", "stat_recs_inflight" from XDR.STATISTICS save; r = do s > 5000; ASSERT(r, False, "Too many inflight objects (>5000).", "PERFORMANCE", WARNING, "Listed node[s] have too many objects inflight. This might lead to XDR throttling itself, consider tuning this parameter to a lower value.", @@ -494,7 +568,7 @@ ASSERT(r, False, "Too many inflight objects (>5000).", "PERFORMANCE", WARNING, xdr_enabled); /* XDR xdr_ship_latency_avg check */ -s = select "xdr_ship_latency_avg", "latency_avg_ship" from XDR.STATISTICS; +s = select "xdr_ship_latency_avg", "latency_avg_ship" from XDR.STATISTICS save; // Following value is not fixed yet r = do s > 5000; ASSERT(r, False, "Record shipping takes too long (>5 sec).", "PERFORMANCE", WARNING, @@ -505,21 +579,21 @@ ASSERT(r, False, "Record shipping takes too long (>5 sec).", "PERFORMANCE", WARN /* CLUSTER STATE */ -r = select "cluster_integrity" from SERVICE.STATISTICS; +r = select "cluster_integrity" from SERVICE.STATISTICS save; r = do r == True; ASSERT(r, True, "Cluster integrity fault.", "OPERATIONS", CRITICAL, "Listed node[s] have cluster integrity fault. This indicates cluster is not completely wellformed. Please check server logs for more information. Probable cause - issue with network.", "Cluster integrity fault check."); r = select "cluster_key" from SERVICE.STATISTICS; -r = do EQUAL(r); -ASSERT(r, True, "Different Cluster Key.", "OPERATIONS", CRITICAL, +r = do NO_MATCH(r, ==, MAJORITY) save; +ASSERT(r, False, "Different Cluster Key.", "OPERATIONS", CRITICAL, "Listed cluster[s] have different cluster keys for nodes. This indicates cluster is not completely wellformed. 
Please check server logs for more information. Probable cause - issue with network.", "Cluster Key difference check."); u = select "uptime" from SERVICE.STATISTICS; -total_nodes = group by CLUSTER do COUNT(u); -r = select "cluster_size" from SERVICE.STATISTICS; +total_nodes = group by CLUSTER do COUNT(u) save as "total nodes"; +r = select "cluster_size" from SERVICE.STATISTICS save; r = do r == total_nodes; ASSERT(r, True, "Unstable Cluster.", "OPERATIONS", CRITICAL, "Listed node[s] have cluster size not matching total number of available nodes. This indicates cluster is not completely wellformed. Please check server logs for more information. Probable cause - issue with network.", @@ -528,18 +602,20 @@ ASSERT(r, True, "Unstable Cluster.", "OPERATIONS", CRITICAL, hp = select "heartbeat.protocol", "heartbeat-protocol" from NETWORK.CONFIG; heartbeat_proto_v2 = do hp == "v2"; heartbeat_proto_v2 = group by CLUSTER, NODE do OR(heartbeat_proto_v2); -cs = select "cluster_size" from SERVICE.STATISTICS; -mcs = select "paxos-max-cluster-size" as "cluster_size" from SERVICE.CONFIG; +cs = select "cluster_size" from SERVICE.STATISTICS save; +mcs = select "paxos-max-cluster-size" as "cluster_size" from SERVICE.CONFIG save; +cs_without_saved_value = select "cluster_size" from SERVICE.STATISTICS; +mcs_without_saved_value = select "paxos-max-cluster-size" as "cluster_size" from SERVICE.CONFIG; r = do cs < mcs; ASSERT(r, True, "Critical cluster size.", "OPERATIONS", CRITICAL, "Listed node[s] have cluster size higher than configured paxos-max-cluster-size. 
Please run 'show config service like paxos-max-cluster-size' to check configured max cluster size.", "Critical cluster size check.", heartbeat_proto_v2); -small_max_configured = do mcs < 20; +small_max_configured = do mcs_without_saved_value < 20; critical_size = do cs >= mcs; -correct_size = do mcs - 10; -correct_size = do cs <= correct_size; +correct_size = do mcs_without_saved_value - 10; +correct_size = do cs_without_saved_value <= correct_size; r = do small_max_configured || critical_size; r = do r || correct_size; ASSERT(r, True, "Cluster size is near the max configured cluster size.", "OPERATIONS", WARNING, @@ -551,12 +627,12 @@ ASSERT(r, True, "Cluster size is near the max configured cluster size.", "OPERAT /* UDF */ u = select * from UDF.METADATA; -r = group by FILENAME, KEY do EQUAL(u); -ASSERT(r, True, "UDF not in sync (file not matching).", "OPERATIONS", CRITICAL, +r = group by FILENAME, KEY do NO_MATCH(u, ==, MAJORITY) save; +ASSERT(r, False, "UDF not in sync (file not matching).", "OPERATIONS", CRITICAL, "Listed UDF definitions do not match across the nodes. This may lead to incorrect UDF behavior. Run command 'asinfo -v udf-list' to see list of UDF. Re-register the latest version of the not in sync UDF[s].", "UDF sync (file not matching) check."); -total_nodes = group by CLUSTER do COUNT(u); -c = group by CLUSTER, FILENAME do COUNT(u); +total_nodes = group by CLUSTER do COUNT(u) save as "expected node count"; +c = group by CLUSTER, FILENAME do COUNT(u) save as "node count"; r = do c == total_nodes; ASSERT(r, True, "UDF not in sync (not available on all node).", "OPERATIONS", CRITICAL, "Listed UDF[s] are not available on all the nodes. This may lead to incorrect UDF behavior. Run command 'asinfo -v udf-list' to see list of UDF. 
Re-register missing UDF in cluster.", @@ -564,15 +640,15 @@ ASSERT(r, True, "UDF not in sync (not available on all node).", "OPERATIONS", CR /* SINDEX */ -s = select "sync_state" from SINDEX.STATISTICS; +s = select "sync_state" from SINDEX.STATISTICS save; s = group by CLUSTER, NAMESPACE, SET, SINDEX s; r = do s == "synced"; ASSERT(r, True, "SINDEX not in sync with primary.", "OPERATIONS", CRITICAL, "Listed sindex[es] are not in sync with primary. This can lead to wrong query results. Consider dropping and recreating secondary index[es].", "SINDEX sync state check."); u = select "uptime" from SERVICE.STATISTICS; -total_nodes = group by CLUSTER do COUNT(u); -c = group by CLUSTER, NAMESPACE, SET, SINDEX do COUNT(s); +total_nodes = group by CLUSTER do COUNT(u) save as "cluster node count"; +c = group by CLUSTER, NAMESPACE, SET, SINDEX do COUNT(s) save as "nodes with SINDEX"; r = do c == total_nodes; ASSERT(r, True, "SINDEX not in sync (not available on all node).", "OPERATIONS", CRITICAL, "Listed sindex[es] not available on all nodes. This can lead to wrong query results. Consider dropping and recreating missing secondary index[es].", @@ -605,133 +681,629 @@ ASSERT(r, True, "Services list discrepancy.", "OPERATIONS", WARNING, */ SET CONSTRAINT VERSION >= 3.9; +// Uptime u = select "uptime" from SERVICE.STATISTICS; -u = GROUP BY CLUSTER, NODE do SUM(u); +u = GROUP BY CLUSTER, NODE do MAX(u); -e = select "client_write_error" from NAMESPACE.STATISTICS; -s = select "client_write_success" from NAMESPACE.STATISTICS; -s = GROUP BY CLUSTER, NODE, NAMESPACE do SUM(s); -r = do e / s; -r = do r/u on common; -r = do r == 0; -ASSERT(r, True, "Non-zero namespace write errors count", "OPERATIONS", INFO, - "Listed namespace write error[s] show skew in count across nodes in cluster. It may or may not be an issue depending on the error type (e.g gen check errors may be expected if client is using check and set kind of operations). 
Please run 'show statistics namespace like client_write' to see values.", - "Namespace write errors count check"); -e = select "client_read_error" from NAMESPACE.STATISTICS; -s = select "client_read_success" from NAMESPACE.STATISTICS; -s = GROUP BY CLUSTER, NODE, NAMESPACE do SUM(s); -r = do e / s; -r = do r/u on common; -r = do r == 0; -ASSERT(r, True, "Non-zero namespace read errors count", "OPERATIONS", INFO, - "Listed namespace read error[s] show skew in count across nodes in the cluster. It may or may not be an issue depending on the error type (e.g key not found may be expected). Please run 'show statistics namespace like client_read' to see values.", - "Namespace read errors count check"); +// Read statistics -e = select "client_delete_error" from NAMESPACE.STATISTICS; -s = select "client_delete_success" from NAMESPACE.STATISTICS; -s = GROUP BY CLUSTER, NODE, NAMESPACE do SUM(s); -r = do e / s; -r = do r/u on common; -r = do r == 0; -ASSERT(r, True, "Non-zero namespace delete errors count", "OPERATIONS", INFO, - "Listed namespace delete error[s] show skew in count across nodes in the cluster. It may or may not be an issue depending on the error type (e.g key not found). 
Please run 'show statistics namespace like client_delete' to see values.", - "Namespace delete errors count check"); +nf = select "client_read_not_found" as "cnt" from NAMESPACE.STATISTICS; +s = select "client_read_success" as "cnt" from NAMESPACE.STATISTICS; +t = select "client_read_timeout" as "cnt" from NAMESPACE.STATISTICS; +e = select "client_read_error" as "cnt" from NAMESPACE.STATISTICS; +total_reads = do s + nf; +total_reads = do total_reads + t; +total_reads = do total_reads + e save as "total client reads"; +total_reads_per_sec = do total_reads/u; +total_reads = group by CLUSTER, NAMESPACE, NODE do MAX(total_reads); +total_reads_per_sec = group by CLUSTER, NAMESPACE, NODE do MAX(total_reads_per_sec); -e = select "batch_sub_tsvc_timeout" from NAMESPACE.STATISTICS; -e = do e/u on common; +e = select "client_read_error" from NAMESPACE.STATISTICS save; +e = do e/u save as "errors per second (by using uptime)"; e = group by CLUSTER, NAMESPACE e; -r = do e > 0; -ASSERT(r, False, "Non-zero batch-index read sub-transaction timeouts.", "OPERATIONS", INFO, - "Listed namespace[s] have non-zero batch-index read sub-transaction timeouts across the nodes. Please run 'show statistics namespace like batch_sub_tsvc_timeout' to see the values.", - "Namespace batch-index read sub-transaction timeout count check"); +p = do e/total_reads_per_sec; +p = do p * 100 save as "client_read_error % of total reads"; +r = do p <= 5; +ASSERT(r, True, "High client read errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal read errors (> 5% client reads). Please run 'show statistics namespace like client_read' to see values.", + "High read error check"); +warning_breached = do p > 5; +r = do p == 0; +r = do r || warning_breached; +ASSERT(r, True, "Non-zero client read errors", "OPERATIONS", INFO, + "Listed namespace[s] show non-zero read errors. 
Please run 'show statistics namespace like client_read' to see values.", + "Non-zero read error check"); + +t = select "client_read_timeout" from NAMESPACE.STATISTICS save; +t = group by CLUSTER, NAMESPACE t; +r = do t/total_reads; +r = do r * 100 save as "client_read_timeout % of total reads"; +r = do r <= 5; +ASSERT(r, True, "High client read timeouts", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal read timeouts (> 5% client reads). Please run 'show statistics namespace like client_read' to see values.", + "High read timeouts check"); + +c = select "client_read_not_found" from NAMESPACE.STATISTICS save; +c = group by CLUSTER, NAMESPACE c; + +r = do c / total_reads; +r = do r * 100 save as "client_read_not_found % of total reads"; +r = do r <= 20; +ASSERT(r, True, "High read not found errors", "OPERATIONS", INFO, + "Listed namespace[s] show higher than normal read not found errors (> 20% client reads). Please run 'show statistics namespace like client_read' to see values.", + "High read not found error check"); -e = select "client_tsvc_timeout" from NAMESPACE.STATISTICS; -e = do e/u on common; -e = group by CLUSTER, NAMESPACE e; -r = do e > 0; -ASSERT(r, False, "Non-zero client transaction timeouts.", "OPERATIONS", INFO, - "Listed namespace[s] have non-zero client transaction timeouts (for nodes). Please run 'show statistics namespace like client_tsvc_timeout' to see values. Probable cause - congestion in the transaction queue (transaction threads not able to process efficiently enough), or it could also be that the timeout set by the client is too aggressive.", - "Namespace client transaction timeout count check"); -e = select "client_udf_error" from NAMESPACE.STATISTICS; -e = do e/u on common; -e = group by CLUSTER, NAMESPACE e; -r = do e > 0; -ASSERT(r, False, "Non-zero UDF transaction failure.", "OPERATIONS", INFO, - "Listed namespace[s] have non-zero UDF transaction failures (for nodes). 
Please run 'show statistics namespace like client_udf_error' to see values.", - "Namespace UDF transaction failure check"); +// Delete statistics -e = select "client_udf_timeout" from NAMESPACE.STATISTICS; -e = do e/u on common; -e = group by CLUSTER, NAMESPACE e; -r = do e > 0; -ASSERT(r, False, "Non-zero UDF transaction timeouts.", "OPERATIONS", INFO, - "Listed namespace[s] have non-zero UDF transaction timeouts (for nodes). Please run 'show statistics namespace like client_udf_timeout' to see values.", - "Namespace UDF transaction timeout check"); +nf = select "client_delete_not_found" as "cnt" from NAMESPACE.STATISTICS; +s = select "client_delete_success" as "cnt" from NAMESPACE.STATISTICS; +t = select "client_delete_timeout" as "cnt" from NAMESPACE.STATISTICS; +e = select "client_delete_error" as "cnt" from NAMESPACE.STATISTICS; +total_deletes = do s + nf; +total_deletes = do total_deletes + t; +total_deletes = do total_deletes + e save as "total client deletes"; +total_deletes_per_sec = do total_deletes/u; +total_deletes = group by CLUSTER, NAMESPACE, NODE do MAX(total_deletes); +total_deletes_per_sec = group by CLUSTER, NAMESPACE, NODE do MAX(total_deletes_per_sec); -e = select "udf_sub_udf_error" from NAMESPACE.STATISTICS; -e = do e/u on common; +e = select "client_delete_error" from NAMESPACE.STATISTICS save; +e = do e/u save as "errors per second (by using uptime)"; e = group by CLUSTER, NAMESPACE e; -r = do e > 0; -ASSERT(r, False, "Non-zero UDF sub-transaction failures.", "OPERATIONS", INFO, - "Listed namespace[s] have non-zero UDF sub-transaction failures across nodes in cluster for scan/query background udf jobs. 
Please run 'show statistics namespace like udf_sub_udf_error udf_sub_lang_' to see details.", - "Namespace UDF sub-transaction failure check"); - -e = select "client_write_timeout" from NAMESPACE.STATISTICS; -e = do e/u on common; +p = do e/total_deletes_per_sec; +p = do p * 100 save as "client_delete_error % of total deletes"; +r = do p <= 5; +ASSERT(r, True, "High client delete errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal delete errors (> 5% client deletes). Please run 'show statistics namespace like client_delete' to see values.", + "High delete error check"); +warning_breached = do p > 5; +r = do p == 0; +r = do r || warning_breached; +ASSERT(r, True, "Non-zero client delete errors", "OPERATIONS", INFO, + "Listed namespace[s] show non-zero delete errors. Please run 'show statistics namespace like client_delete' to see values.", + "Non-zero delete error check"); + +t = select "client_delete_timeout" from NAMESPACE.STATISTICS save; +t = group by CLUSTER, NAMESPACE t; +r = do t/total_deletes; +r = do r * 100 save as "client_delete_timeout % of total deletes"; +r = do r <= 5; +ASSERT(r, True, "High client delete timeouts", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal delete timeouts (> 5% client deletes). Please run 'show statistics namespace like client_delete' to see values.", + "High delete timeouts check"); + +c = select "client_delete_not_found" from NAMESPACE.STATISTICS save; +c = group by CLUSTER, NAMESPACE c; +r = do c / total_deletes; +r = do r * 100 save as "client_delete_not_found % of total deletes"; +r = do r <= 20; +ASSERT(r, True, "High delete not found errors", "OPERATIONS", INFO, + "Listed namespace[s] show higher than normal delete not found errors (> 20% client deletes). 
Please run 'show statistics namespace like client_delete' to see values.", + "High delete not found error check"); + + +// Write statistics + +s = select "client_write_success" as "cnt" from NAMESPACE.STATISTICS; +t = select "client_write_timeout" as "cnt" from NAMESPACE.STATISTICS; +e = select "client_write_error" as "cnt" from NAMESPACE.STATISTICS; +total_writes = do s + t; +total_writes = do total_writes + e save as "total client writes"; +total_writes_per_sec = do total_writes/u; +total_writes = group by CLUSTER, NAMESPACE, NODE do MAX(total_writes); +total_writes_per_sec = group by CLUSTER, NAMESPACE, NODE do MAX(total_writes_per_sec); + +e = select "client_write_error" from NAMESPACE.STATISTICS save; +e = do e/u save as "errors per second (by using uptime)"; +e = group by CLUSTER, NAMESPACE e; +p = do e/total_writes_per_sec; +p = do p * 100 save as "client_write_error % of total writes"; +r = do p <= 5; +ASSERT(r, True, "High client write errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal write errors (> 5% client writes). Please run 'show statistics namespace like client_write' to see values.", + "High write error check"); +warning_breached = do p > 5; +r = do p == 0; +r = do r || warning_breached; +ASSERT(r, True, "Non-zero client write errors", "OPERATIONS", INFO, + "Listed namespace[s] show non-zero write errors. Please run 'show statistics namespace like client_write' to see values.", + "Non-zero write error check"); + +t = select "client_write_timeout" from NAMESPACE.STATISTICS save; +t = group by CLUSTER, NAMESPACE t; +r = do t/total_writes; +r = do r * 100 save as "client_write_timeout % of total writes"; +r = do r <= 5; +ASSERT(r, True, "High client write timeouts", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal write timeouts (> 5% client writes). 
Please run 'show statistics namespace like client_write' to see values.", + "High write timeouts check"); + + +// Client Proxy transaction statistics + +s = select "client_proxy_complete" as "cnt" from NAMESPACE.STATISTICS; +t = select "client_proxy_timeout" as "cnt" from NAMESPACE.STATISTICS; +e = select "client_proxy_error" as "cnt" from NAMESPACE.STATISTICS; +total_client_proxy = do s + t; +total_client_proxy = do total_client_proxy + e save as "total client proxy transactions"; +total_client_proxy_per_sec = do total_client_proxy/u; +total_client_proxy = group by CLUSTER, NAMESPACE, NODE do MAX(total_client_proxy); +total_client_proxy_per_sec = group by CLUSTER, NAMESPACE, NODE do MAX(total_client_proxy_per_sec); + +e = select "client_proxy_error" from NAMESPACE.STATISTICS save; +e = do e/u save as "errors per second (by using uptime)"; +e = group by CLUSTER, NAMESPACE e; +p = do e/total_client_proxy_per_sec; +p = do p * 100 save as "client_proxy_error % of total proxy transactions"; +r = do p <= 5; +ASSERT(r, True, "High client proxy transaction errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal proxy transaction errors (> 5% client proxy transactions). Please run 'show statistics namespace like client_proxy' to see values.", + "High proxy transaction error check"); +warning_breached = do p > 5; +r = do p == 0; +r = do r || warning_breached; +ASSERT(r, True, "Non-zero client proxy transaction errors", "OPERATIONS", INFO, + "Listed namespace[s] show non-zero proxy transaction errors. 
Please run 'show statistics namespace like client_proxy' to see values.", + "Non-zero proxy transaction error check"); + + +t = select "client_proxy_timeout" from NAMESPACE.STATISTICS save; +t = group by CLUSTER, NAMESPACE t; +r = do t/total_client_proxy; +r = do r * 100 save as "client_proxy_timeout % of total proxy transactions"; +r = do r <= 5; +ASSERT(r, True, "High client proxy transaction timeouts", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal proxy transaction timeouts (> 5% client proxy transactions). Please run 'show statistics namespace like client_proxy' to see values.", + "High proxy transaction timeouts check"); + + + +// XDR Write statistics + +s = select "xdr_write_success" as "cnt" from NAMESPACE.STATISTICS; +t = select "xdr_write_timeout" as "cnt" from NAMESPACE.STATISTICS; +e = select "xdr_write_error" as "cnt" from NAMESPACE.STATISTICS; +total_xdr_writes = do s + t; +total_xdr_writes = do total_xdr_writes + e save as "total xdr writes"; +total_xdr_writes_per_sec = do total_xdr_writes/u; +total_xdr_writes = group by CLUSTER, NAMESPACE, NODE do MAX(total_xdr_writes); +total_xdr_writes_per_sec = group by CLUSTER, NAMESPACE, NODE do MAX(total_xdr_writes_per_sec); + +e = select "xdr_write_error" from NAMESPACE.STATISTICS save; +e = do e/u save as "errors per second (by using uptime)"; +e = group by CLUSTER, NAMESPACE e; +p = do e/total_xdr_writes_per_sec; +p = do p * 100 save as "xdr_write_error % of total xdr writes"; +r = do p <= 5; +ASSERT(r, True, "High xdr write errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal xdr write errors (> 5% xdr writes). Please run 'show statistics namespace like xdr_write' to see values.", + "High xdr write error check"); +warning_breached = do p > 5; +r = do p == 0; +r = do r || warning_breached; +ASSERT(r, True, "Non-zero xdr write errors", "OPERATIONS", INFO, + "Listed namespace[s] show non-zero xdr write errors. 
Please run 'show statistics namespace like xdr_write' to see values.", + "Non-zero xdr write error check"); + +t = select "xdr_write_timeout" from NAMESPACE.STATISTICS save; +t = group by CLUSTER, NAMESPACE t; +r = do t/total_xdr_writes; +r = do r * 100 save as "xdr_write_timeout % of total xdr writes"; +r = do r <= 5; +ASSERT(r, True, "High xdr write timeouts", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal xdr write timeouts (> 5% xdr writes). Please run 'show statistics namespace like xdr_write' to see values.", + "High xdr write timeouts check"); + + +// UDF Transaction statistics + +s = select "client_udf_complete" as "cnt" from NAMESPACE.STATISTICS; +t = select "client_udf_timeout" as "cnt" from NAMESPACE.STATISTICS; +e = select "client_udf_error" as "cnt" from NAMESPACE.STATISTICS; +total_udf_transactions = do s + t; +total_udf_transactions = do total_udf_transactions + e save as "total udf transactions"; +total_udf_transactions_per_sec = do total_udf_transactions/u; +total_udf_transactions = group by CLUSTER, NAMESPACE, NODE do MAX(total_udf_transactions); +total_udf_transactions_per_sec = group by CLUSTER, NAMESPACE, NODE do MAX(total_udf_transactions_per_sec); + +e = select "client_udf_error" from NAMESPACE.STATISTICS save; +e = do e/u save as "errors per second (by using uptime)"; +e = group by CLUSTER, NAMESPACE e; +p = do e/total_udf_transactions_per_sec; +p = do p * 100 save as "client_udf_error % of total udf transactions"; +r = do p <= 5; +ASSERT(r, True, "High udf transaction errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal udf transaction errors (> 5% udf transactions). Please run 'show statistics namespace like client_udf' to see values.", + "High udf transaction error check"); +warning_breached = do p > 5; +r = do p == 0; +r = do r || warning_breached; +ASSERT(r, True, "Non-zero udf transaction errors", "OPERATIONS", INFO, + "Listed namespace[s] show non-zero udf transaction errors. 
Please run 'show statistics namespace like client_udf' to see values.", + "Non-zero udf transaction error check"); + +t = select "client_udf_timeout" from NAMESPACE.STATISTICS save; +t = group by CLUSTER, NAMESPACE t; +r = do t/total_udf_transactions; +r = do r * 100 save as "client_udf_timeout % of total udf transactions"; +r = do r <= 5; +ASSERT(r, True, "High udf transaction timeouts", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal udf transaction timeouts (> 5% udf transaction). Please run 'show statistics namespace like client_udf' to see values.", + "High udf transaction timeouts check"); + + +// UDF Sub-Transaction statistics + +s = select "udf_sub_udf_complete" as "cnt" from NAMESPACE.STATISTICS; +t = select "udf_sub_udf_timeout" as "cnt" from NAMESPACE.STATISTICS; +e = select "udf_sub_udf_error" as "cnt" from NAMESPACE.STATISTICS; +total_udf_sub_transactions = do s + t; +total_udf_sub_transactions = do total_udf_sub_transactions + e save as "total udf sub-transactions"; +total_udf_sub_transactions_per_sec = do total_udf_sub_transactions/u; +total_udf_sub_transactions = group by CLUSTER, NAMESPACE, NODE do MAX(total_udf_sub_transactions); +total_udf_sub_transactions_per_sec = group by CLUSTER, NAMESPACE, NODE do MAX(total_udf_sub_transactions_per_sec); + +e = select "udf_sub_udf_error" from NAMESPACE.STATISTICS save; +e = do e/u save as "errors per second (by using uptime)"; +e = group by CLUSTER, NAMESPACE e; +p = do e/total_udf_sub_transactions_per_sec; +p = do p * 100 save as "udf_sub_udf_error % of total udf sub-transactions"; +r = do p <= 5; +ASSERT(r, True, "High udf sub-transaction errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal udf sub-transaction errors (> 5% udf sub-transactions). 
Please run 'show statistics namespace like udf_sub_udf' to see values.", + "High udf sub-transaction error check"); +warning_breached = do p > 5; +r = do p == 0; +r = do r || warning_breached; +ASSERT(r, True, "Non-zero udf sub-transaction errors", "OPERATIONS", INFO, + "Listed namespace[s] show non-zero udf sub-transaction errors. Please run 'show statistics namespace like udf_sub_udf' to see values.", + "Non-zero udf sub-transaction error check"); + +t = select "udf_sub_udf_timeout" from NAMESPACE.STATISTICS save; +t = group by CLUSTER, NAMESPACE t; +r = do t/total_udf_sub_transactions; +r = do r * 100 save as "udf_sub_udf_timeout % of total udf sub-transactions"; +r = do r <= 5; +ASSERT(r, True, "High udf sub-transaction timeouts", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal udf sub-transaction timeouts (> 5% udf sub-transaction). Please run 'show statistics namespace like udf_sub_udf' to see values.", + "High udf sub-transaction timeouts check"); + + +// Proxied Batch-index Sub-Transaction statistics + +s = select "batch_sub_proxy_complete" as "cnt" from NAMESPACE.STATISTICS; +t = select "batch_sub_proxy_timeout" as "cnt" from NAMESPACE.STATISTICS; +e = select "batch_sub_proxy_error" as "cnt" from NAMESPACE.STATISTICS; +total_transactions = do s + t; +total_transactions = do total_transactions + e save as "total batch-index sub-transactions"; +total_transactions_per_sec = do total_transactions/u; +total_transactions = group by CLUSTER, NAMESPACE, NODE do MAX(total_transactions); +total_transactions_per_sec = group by CLUSTER, NAMESPACE, NODE do MAX(total_transactions_per_sec); + +e = select "batch_sub_proxy_error" from NAMESPACE.STATISTICS save; +e = do e/u save as "errors per second (by using uptime)"; +e = group by CLUSTER, NAMESPACE e; +p = do e/total_transactions_per_sec; +p = do p * 100 save as "batch_sub_proxy_error % of total batch-index sub-transactions"; +r = do p <= 5; +ASSERT(r, True, "High batch-index sub-transaction 
errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal batch-index sub-transaction errors (> 5% batch-index sub-transactions). Please run 'show statistics namespace like batch_sub_proxy' to see values.", + "High batch-index sub-transaction error check"); +warning_breached = do p > 5; +r = do p == 0; +r = do r || warning_breached; +ASSERT(r, True, "Non-zero batch-index sub-transaction errors", "OPERATIONS", INFO, + "Listed namespace[s] show non-zero batch-index sub-transaction errors. Please run 'show statistics namespace like batch_sub_proxy' to see values.", + "Non-zero batch-index sub-transaction error check"); + +t = select "batch_sub_proxy_timeout" from NAMESPACE.STATISTICS save; +t = group by CLUSTER, NAMESPACE t; +r = do t/total_transactions; +r = do r * 100 save as "batch_sub_proxy_timeout % of total batch-index sub-transactions"; +r = do r <= 5; +ASSERT(r, True, "High batch-index sub-transaction timeouts", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal batch-index sub-transaction timeouts (> 5% batch-index sub-transaction). 
Please run 'show statistics namespace like batch_sub_proxy' to see values.", + "High batch-index sub-transaction timeouts check"); + + +// Batch-index read Sub-Transaction statistics + +nf = select "batch_sub_read_not_found" as "cnt" from NAMESPACE.STATISTICS; +s = select "batch_sub_read_success" as "cnt" from NAMESPACE.STATISTICS; +t = select "batch_sub_read_timeout" as "cnt" from NAMESPACE.STATISTICS; +e = select "batch_sub_read_error" as "cnt" from NAMESPACE.STATISTICS; +total_transactions = do s + nf; +total_transactions = do total_transactions + t; +total_transactions = do total_transactions + e save as "total batch-index read sub-transactions"; +total_transactions_per_sec = do total_transactions/u; +total_transactions = group by CLUSTER, NAMESPACE, NODE do MAX(total_transactions); +total_transactions_per_sec = group by CLUSTER, NAMESPACE, NODE do MAX(total_transactions_per_sec); + +e = select "batch_sub_read_error" from NAMESPACE.STATISTICS save; +e = do e/u save as "errors per second (by using uptime)"; +e = group by CLUSTER, NAMESPACE e; +p = do e/total_transactions_per_sec; +p = do p * 100 save as "batch_sub_read_error % of total reads"; +r = do p <= 5; +ASSERT(r, True, "High batch-index read sub-transaction errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal batch-index read sub-transaction errors (> 5% batch-index read sub-transactions). Please run 'show statistics namespace like batch_sub_read' to see values.", + "High batch-index read sub-transaction error check"); +warning_breached = do p > 5; +r = do p == 0; +r = do r || warning_breached; +ASSERT(r, True, "Non-zero batch-index read sub-transaction errors", "OPERATIONS", INFO, + "Listed namespace[s] show non-zero batch-index read sub-transaction errors. 
Please run 'show statistics namespace like batch_sub_read' to see values.", + "Non-zero batch-index read sub-transaction error check"); + +t = select "batch_sub_read_timeout" from NAMESPACE.STATISTICS save; +t = group by CLUSTER, NAMESPACE t; +r = do t/total_transactions; +r = do r * 100 save as "batch_sub_read_timeout % of total batch-index read sub-transactions"; +r = do r <= 5; +ASSERT(r, True, "High batch-index read sub-transaction timeouts", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal batch-index read sub-transaction timeouts (> 5% batch-index read sub-transactions). Please run 'show statistics namespace like batch_sub_read' to see values.", + "High batch-index read sub-transaction timeouts check"); + +c = select "batch_sub_read_not_found" from NAMESPACE.STATISTICS save; +c = group by CLUSTER, NAMESPACE c; +r = do c / total_transactions; +r = do r * 100 save as "batch_sub_read_not_found % of total batch-index read sub-transactions"; +r = do r <= 20; +ASSERT(r, True, "High batch-index read sub-transaction not found errors", "OPERATIONS", INFO, + "Listed namespace[s] show higher than normal batch-index read sub-transaction not found errors (> 20% batch-index read sub-transactions). 
Please run 'show statistics namespace like batch_sub_read' to see values.", + "High batch-index read sub-transaction not found error check"); + + +// Client UDF Transaction statistics + +rs = select "client_lang_read_success" as "cnt" from NAMESPACE.STATISTICS; +ds = select "client_lang_delete_success" as "cnt" from NAMESPACE.STATISTICS; +ws = select "client_lang_write_success" as "cnt" from NAMESPACE.STATISTICS; +e = select "client_lang_error" as "cnt" from NAMESPACE.STATISTICS; +total_client_udf_transactions = do rs + ds; +total_client_udf_transactions = do total_client_udf_transactions + ws; +total_client_udf_transactions = do total_client_udf_transactions + e save as "total client_lang"; +total_client_udf_transactions_per_sec = do total_client_udf_transactions/u; +total_client_udf_transactions_per_sec = group by CLUSTER, NAMESPACE, NODE do MAX(total_client_udf_transactions_per_sec); + +e = select "client_lang_error" from NAMESPACE.STATISTICS save; +e = do e/u save as "errors per second (by using uptime)"; +e = group by CLUSTER, NAMESPACE e; +p = do e/total_client_udf_transactions_per_sec; +p = do p * 100 save as "client_lang_error % of total client_lang"; +r = do p <= 5; +ASSERT(r, True, "High client initiated udf transactions errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal client initiated udf transactions errors (> 5% client initiated udf transactions). Please run 'show statistics namespace like client_lang' to see values.", + "High client initiated udf transactions error check"); +warning_breached = do p > 5; +r = do p == 0; +r = do r || warning_breached; +ASSERT(r, True, "Non-zero client initiated udf transaction errors", "OPERATIONS", INFO, + "Listed namespace[s] show non-zero client initiated udf transaction errors. 
Please run 'show statistics namespace like client_lang' to see values.", + "Non-zero client initiated udf transaction error check"); + + +// UDF Sub-Transaction (lang) statistics + +rs = select "udf_sub_lang_read_success" as "cnt" from NAMESPACE.STATISTICS; +ds = select "udf_sub_lang_delete_success" as "cnt" from NAMESPACE.STATISTICS; +ws = select "udf_sub_lang_write_success" as "cnt" from NAMESPACE.STATISTICS; +e = select "udf_sub_lang_error" as "cnt" from NAMESPACE.STATISTICS; +total_transactions = do rs + ds; +total_transactions = do total_transactions + ws; +total_transactions = do total_transactions + e save as "total udf_sub_lang"; +total_transactions_per_sec = do total_transactions/u; +total_transactions_per_sec = group by CLUSTER, NAMESPACE, NODE do MAX(total_transactions_per_sec); + +e = select "udf_sub_lang_error" from NAMESPACE.STATISTICS save; +e = do e/u save as "errors per second (by using uptime)"; +e = group by CLUSTER, NAMESPACE e; +p = do e/total_transactions_per_sec; +p = do p * 100 save as "udf_sub_lang_error % of total udf_sub_lang"; +r = do p <= 5; +ASSERT(r, True, "High udf sub-transaction errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal udf sub-transaction errors (> 5% udf sub-transactions). Please run 'show statistics namespace like udf_sub_lang' to see values.", + "High udf sub-transaction error check"); +warning_breached = do p > 5; +r = do p == 0; +r = do r || warning_breached; +ASSERT(r, True, "Non-zero udf sub-transaction errors", "OPERATIONS", INFO, + "Listed namespace[s] show non-zero udf sub-transaction errors. 
Please run 'show statistics namespace like udf_sub_lang' to see values.", + "Non-zero udf sub-transaction error check"); + + +// Query Agg statistics + +total_transactions = select "query_agg" from NAMESPACE.STATISTICS save as "total query aggregations"; +total_transactions_per_sec = do total_transactions/u; +total_transactions_per_sec = group by CLUSTER, NAMESPACE, NODE do MAX(total_transactions_per_sec); + +e = select "query_agg_error" from NAMESPACE.STATISTICS save; +e = do e/u save as "errors per second (by using uptime)"; +e = group by CLUSTER, NAMESPACE e; +p = do e/total_transactions_per_sec; +p = do p * 100 save as "query_agg_error % of total query aggregations"; +r = do p <= 5; +ASSERT(r, True, "High query aggregation errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal query aggregation errors (> 5% query aggregations). Please run 'show statistics namespace like query_agg' to see values.", + "High query aggregation error check"); +warning_breached = do p > 5; +r = do p == 0; +r = do r || warning_breached; +ASSERT(r, True, "Non-zero query aggregation errors", "OPERATIONS", INFO, + "Listed namespace[s] show non-zero query aggregation errors. 
Please run 'show statistics namespace like query_agg' to see values.", + "Non-zero query aggregation error check"); + + +// Query Lookup statistics + +total_transactions = select "query_lookups" from NAMESPACE.STATISTICS save as "total query lookups"; +total_transactions_per_sec = do total_transactions/u; +total_transactions_per_sec = group by CLUSTER, NAMESPACE, NODE do MAX(total_transactions_per_sec); + +e = select "query_lookup_error" from NAMESPACE.STATISTICS save; +e = do e/u save as "errors per second (by using uptime)"; +e = group by CLUSTER, NAMESPACE e; +p = do e/total_transactions_per_sec; +p = do p * 100 save as "query_lookup_error % of total query lookups"; +r = do p <= 5; +ASSERT(r, True, "High query lookup errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal query lookup errors (> 5% query lookups). Please run 'show statistics namespace like query_lookup' to see values.", + "High query lookup error check"); +warning_breached = do p > 5; +r = do p == 0; +r = do r || warning_breached; +ASSERT(r, True, "Non-zero query lookup errors", "OPERATIONS", INFO, + "Listed namespace[s] show non-zero query lookup errors. 
Please run 'show statistics namespace like query_lookup' to see values.", + "Non-zero query lookup error check"); + + +// Scan Agg statistics +s = select "scan_aggr_complete" as "cnt" from NAMESPACE.STATISTICS; +e = select "scan_aggr_error" as "cnt" from NAMESPACE.STATISTICS; +total_transactions = do s + e save as "total scan aggregations"; +total_transactions_per_sec = do total_transactions/u; +total_transactions_per_sec = group by CLUSTER, NAMESPACE, NODE do MAX(total_transactions_per_sec); + +e = select "scan_aggr_error" from NAMESPACE.STATISTICS save; +e = do e/u save as "errors per second (by using uptime)"; +e = group by CLUSTER, NAMESPACE e; +p = do e/total_transactions_per_sec; +p = do p * 100 save as "scan_aggr_error % of total scan aggregations"; +r = do p <= 5; +ASSERT(r, True, "High scan aggregation errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal scan aggregation errors (> 5% scan aggregations). Please run 'show statistics namespace like scan_agg' to see values.", + "High scan aggregation error check"); +warning_breached = do p > 5; +r = do p == 0; +r = do r || warning_breached; +ASSERT(r, True, "Non-zero scan aggregation errors", "OPERATIONS", INFO, + "Listed namespace[s] show non-zero scan aggregation errors. 
Please run 'show statistics namespace like scan_agg' to see values.", + "Non-zero scan aggregation error check"); + + +// Scan Basic statistics +s = select "scan_basic_complete" as "cnt" from NAMESPACE.STATISTICS; +e = select "scan_basic_error" as "cnt" from NAMESPACE.STATISTICS; +total_transactions = do s + e save as "total basic scans"; +total_transactions_per_sec = do total_transactions/u; +total_transactions_per_sec = group by CLUSTER, NAMESPACE, NODE do MAX(total_transactions_per_sec); + +e = select "scan_basic_error" from NAMESPACE.STATISTICS save; +e = do e/u save as "errors per second (by using uptime)"; +e = group by CLUSTER, NAMESPACE e; +p = do e/total_transactions_per_sec; +p = do p * 100 save as "scan_basic_error % of total basic scans"; +r = do p <= 5; +ASSERT(r, True, "High basic scan errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal basic scan errors (> 5% basic scans). Please run 'show statistics namespace like scan_basic' to see values.", + "High basic scan error check"); +warning_breached = do p > 5; +r = do p == 0; +r = do r || warning_breached; +ASSERT(r, True, "Non-zero basic scan errors", "OPERATIONS", INFO, + "Listed namespace[s] show non-zero basic scan errors. 
Please run 'show statistics namespace like scan_basic' to see values.", + "Non-zero basic scan error check"); + + +// Scan Background UDF statistics +s = select "scan_udf_bg_complete" as "cnt" from NAMESPACE.STATISTICS; +e = select "scan_udf_bg_error" as "cnt" from NAMESPACE.STATISTICS; +total_transactions = do s + e save as "total scan background udf"; +total_transactions_per_sec = do total_transactions/u; +total_transactions_per_sec = group by CLUSTER, NAMESPACE, NODE do MAX(total_transactions_per_sec); + +e = select "scan_udf_bg_error" from NAMESPACE.STATISTICS save; +e = do e/u save as "errors per second (by using uptime)"; +e = group by CLUSTER, NAMESPACE e; +p = do e/total_transactions_per_sec; +p = do p * 100 save as "scan_udf_bg_error % of total scan background udf"; +r = do p <= 5; +ASSERT(r, True, "High scan background udf errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal scan background udf errors (> 5% scan background udf). Please run 'show statistics namespace like scan_udf_bg' to see values.", + "High scan background udf error check"); +warning_breached = do p > 5; +r = do p == 0; +r = do r || warning_breached; +ASSERT(r, True, "Non-zero scan background udf errors", "OPERATIONS", INFO, + "Listed namespace[s] show non-zero scan background udf errors. Please run 'show statistics namespace like scan_udf_bg' to see values.", + "Non-zero scan background udf error check"); + + +// Client transaction statistics + +e = select "client_tsvc_error" from NAMESPACE.STATISTICS save; +e = do e/u on common save as "errors per second"; e = group by CLUSTER, NAMESPACE e; r = do e > 0; -ASSERT(r, False, "Non-zero write transaction timeouts.", "OPERATIONS", INFO, - "Listed namespace[s] have non-zero write transaction timeouts (for nodes). 
Please run 'show statistics namespace like client_write_timeout' to see values.", - "Namespace write transaction timeout check"); +ASSERT(r, False, "Non-zero client transaction error.", "OPERATIONS", INFO, + "Listed namespace[s] have non-zero client transaction errors (for nodes). Please run 'show statistics namespace like client_tsvc_error' to see values. Probable cause - protocol errors or security permission mismatch.", + "Namespace client transaction error count check"); -e = select "client_read_not_found" from NAMESPACE.STATISTICS; -e = group by CLUSTER, NAMESPACE e; -s = select "client_read_success" from NAMESPACE.STATISTICS; -s = group by CLUSTER, NAMESPACE, NODE do MAX(s); -s = do 50 %% s; -r = do e <= s; -ASSERT(r, True, "High read not found errors", "OPERATIONS", INFO, - "Listed namespace[s] show higher than normal read not found errors (> 50% client read success). Please run 'show statistics namespace like client_read_not_found client_read_success' to see values.", - "High read not found error check"); -e = select "xdr_write_error" from NAMESPACE.STATISTICS; -e = do e/u on common; +// UDF Sub-Transactions (transaction service) statistics + +e = select "udf_sub_tsvc_error" from NAMESPACE.STATISTICS save; +e = do e/u on common save as "errors per second"; e = group by CLUSTER, NAMESPACE e; r = do e > 0; -ASSERT(r, False, "Non-zero XDR write errors count.", "OPERATIONS", INFO, - "Listed namespace[s] have non-zero XDR write transaction failures (for nodes). Please run 'show statistics namespace like xdr_write_error' to see values.", - "Namespace XDR write failure check"); +ASSERT(r, False, "Non-zero udf sub-transaction error in the transaction service.", "OPERATIONS", INFO, + "Listed namespace[s] have non-zero udf sub-transaction errors in the transaction service (for nodes). 
Probable cause - protocol errors or security permission mismatch.", + "Namespace udf sub-transaction transaction service error count check"); + + +// Batch-index read Sub-Transaction (transaction service) statistics -e = select "xdr_write_timeout" from NAMESPACE.STATISTICS; -e = do e/u on common; +e = select "batch_sub_tsvc_error" from NAMESPACE.STATISTICS save; +e = do e/u on common save as "errors per second"; e = group by CLUSTER, NAMESPACE e; r = do e > 0; -ASSERT(r, False, "Non-zero XDR write timeouts.", "OPERATIONS", INFO, - "Listed namespace[s] have non-zero XDR write transaction timeouts (for nodes). Please run 'show statistics namespace like xdr_write_timeout' to see values.", - "Namespace XDR write timeout check"); +ASSERT(r, False, "Non-zero batch-index read sub-transaction errors in the transaction service.", "OPERATIONS", INFO, + "Listed namespace[s] have non-zero batch-index read sub-transaction errors in the transaction service across the nodes. Please run 'show statistics namespace like batch_sub_tsvc_error' to see the values.", + "Namespace batch-index read sub-transaction transaction service error count check"); + SET CONSTRAINT VERSION < 3.9; -e = select "stat_write_errs" from SERVICE.STATISTICS; -s = select "stat_write_success" from SERVICE.STATISTICS; -s = GROUP BY CLUSTER, NODE do SUM(s); -u = select "uptime" from SERVICE.STATISTICS; -u = GROUP BY CLUSTER, NODE do SUM(u); -r = do e / s; -r = do r/u on common; -r = do r == 0; -ASSERT(r, True, "Non-zero node write errors count", "OPERATIONS", INFO, - "Listed write error[s] show skew in count (for nodes). It may or may not be an issue depending on the error type. 
Please run 'show statistics service like stat_write' to see values.", - "Node write errors count check"); +// Read statistics -e = select "stat_read_errs_other" from SERVICE.STATISTICS; +t = select "stat_read_reqs" as "cnt" from SERVICE.STATISTICS save; + +e = select "stat_read_errs_other" from SERVICE.STATISTICS save; +r = do e/t; +r = do r * 100 save as "stat_read_errs_other % of total reads"; +r = do r <= 5; +ASSERT(r, True, "High read errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal read errors (> 5% reads). Please run 'show statistics service like stat_read' to see values.", + "High read error check"); + +nf = select "stat_read_errs_notfound" from SERVICE.STATISTICS save; +r = do nf/t; +r = do r * 100 save as "stat_read_errs_notfound % of total reads"; +r = do r <= 20; +ASSERT(r, True, "High read not found errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal read not found errors (> 20% reads). Please run 'show statistics service like stat_read' to see values.", + "High read not found error check"); + + +// Write statistics + +t = select "stat_write_reqs" as "cnt" from SERVICE.STATISTICS save; + +e = select "stat_write_errs" from SERVICE.STATISTICS save; +r = do e/t; +r = do r * 100 save as "stat_write_errs % of total writes"; +r = do r <= 5; +ASSERT(r, True, "High write errors", "OPERATIONS", WARNING, + "Listed namespace[s] show higher than normal write errors (> 5% writes). 
Please run 'show statistics service like stat_write' to see values.", + "High write error check"); + + +e = select "stat_read_errs_other" from SERVICE.STATISTICS save; s = select "stat_read_success" from SERVICE.STATISTICS; s = GROUP BY CLUSTER, NODE do SUM(s); -u = select "uptime" from SERVICE.STATISTICS; -u = GROUP BY CLUSTER, NODE do SUM(u); r = do e / s; r = do r/u on common; r = do r == 0; @@ -742,7 +1314,7 @@ ASSERT(r, True, "Non-zero node read errors count", "OPERATIONS", INFO, SET CONSTRAINT VERSION >= 3.3.17; -defslp= select "defrag-sleep", "storage-engine.defrag-sleep" from NAMESPACE.CONFIG; +defslp= select "defrag-sleep", "storage-engine.defrag-sleep" from NAMESPACE.CONFIG save; defslp = group by CLUSTER, NAMESPACE defslp; r = do defslp == 1000; ASSERT(r, True, "Non-default namespace defrag-sleep configuration.", "OPERATIONS",INFO, @@ -750,3 +1322,76 @@ ASSERT(r, True, "Non-default namespace defrag-sleep configuration.", "OPERATIONS "Non-default namespace defrag-sleep check."); SET CONSTRAINT VERSION ALL; + + +/* +Queries Requested by SA Team (Ronen) +*/ + +SET CONSTRAINT VERSION >= 3.9; + +crp = select "cache_read_pct" as "post-write-queue", "cache-read-pct" as "post-write-queue" from NAMESPACE.STATISTICS save; +pwq = select "post-write-queue", "storage-engine.post-write-queue" as "post-write-queue" from NAMESPACE.CONFIG save; +crp = do crp >= 10; +pwq = do pwq == 256; +r = do crp && pwq; +r = group by CLUSTER, NAMESPACE, NODE r; +ASSERT(r, False, "Sub-optimal post-write-queue", "OPERATIONS", INFO, + "Listed namespace[s] show high cache hit rate (> 10%) but post-write-queue value is default. It might be sub-optimal. 
Please contact Aerospike support team or SA team.", + "Namespace post-write-queue check"); + + +SET CONSTRAINT VERSION >= 3.11; + +ptl = select "partition-tree-locks" from NAMESPACE.CONFIG save; +cs = select "cluster_size" from SERVICE.STATISTICS; +cs = group by CLUSTER do MAX(cs) save as "cluster_size"; +r = do cs/ptl; +r = group by CLUSTER, NAMESPACE, NODE r; +r = do r < 2; + +ASSERT(r, True, "Non-recommended partition-tree-locks", "OPERATIONS", WARNING, + "Listed namespace[s] show low value for partition-tree-locks with respect to cluster size. It should be 8 for cluster-size < 16, 16 for cluster sizes 16 to 31, 32 for cluster sizes 32 to 63, etc. Please contact Aerospike support team or SA team.", + "Namespace partition-tree-locks check"); + + +m = select "memory-size" as "cnt" from NAMESPACE.CONFIG; +s = select "stop-writes-pct" as "cnt" from NAMESPACE.CONFIG; +s = do 100 - s; +s = do s/100; +extra_space = do m * s save as "breathing space (over stop-write)"; +extra_space = group by CLUSTER, NODE, NAMESPACE do SUM(extra_space); + +p = select "partition-tree-sprigs" from NAMESPACE.CONFIG save; +p = do p/16; + +overhead1 = do 64 * 1024; +overhead2 = do 1024 * 1024; +overhead = do overhead1 + overhead2; + +total_overhead = do p * overhead save as "partition-tree-sprigs overhead"; +r = do total_overhead < extra_space; + +e = select "edition" from METADATA; +e = do e == "Community"; +e = group by CLUSTER, NODE do OR(e); +ASSERT(r, False, "Non-recommended partition-tree-sprigs for Community edition", "OPERATIONS", INFO, + "Listed namespace[s] show low value for partition-tree-sprigs with respect to memory-size. partition-tree-sprigs overhead is less than (100 - stop-write-pct) % memory-size. It should be increased. 
Please contact Aerospike support team or SA team.", + "Namespace partition-tree-sprigs check for Community edition", + e); + +ee_overhead = do 320 * 1024; +overhead = do overhead + ee_overhead; + +total_overhead = do p * overhead save as "partition-tree-sprigs overhead"; +r = do total_overhead < extra_space; + +e = select "edition" from METADATA; +e = do e == "Enterprise"; +e = group by CLUSTER, NODE do OR(e); +ASSERT(r, False, "Non-recommended partition-tree-sprigs for Enterprise edition", "OPERATIONS", INFO, + "Listed namespace[s] show low value for partition-tree-sprigs with respect to memory-size. partition-tree-sprigs overhead is less than (100 - stop-write-pct) % memory-size. It should be increased. Please contact Aerospike support team or SA team.", + "Namespace partition-tree-sprigs check for Enterprise edition", + e); + +SET CONSTRAINT VERSION ALL; diff --git a/lib/health/util.py b/lib/health/util.py index 5f15f7ac..78774b0d 100644 --- a/lib/health/util.py +++ b/lib/health/util.py @@ -15,6 +15,7 @@ import copy import re +from lib.health.constants import HEALTH_PARSER_VAR from lib.health.exceptions import HealthException from lib.utils.util import get_value_from_dict @@ -49,73 +50,6 @@ def deep_merge_dicts(dict_to, dict_from): return dict_to -def fetch_keys_from_dict(data={}, keys=[], from_keys=[]): - """ - Function takes dictionary, list of keys to fetch, list of from_keys to filter scope - - Returns dictionary of selected keys and values - """ - - if not data or not isinstance(data, dict): - raise HealthException("Wrong Input Data for select operation.") - - result_dict = {} - if not keys: - raise HealthException("No key provided for select operation.") - - for _key in data: - if from_keys: - f_key = from_keys[0] - if isinstance(_key, tuple): - # from_keys work with static component keys only, if we get - # tuple keys means we have done with checking of all component - # keys and not found any from key match so no need to check - # further in this 
direction - break - - if (f_key == "ALL") or (_key == f_key): - # from_key is ALL or matching with _key - child_res = fetch_keys_from_dict(data[_key], keys=keys, - from_keys=from_keys[1:] if len(from_keys) > 1 else []) - - else: - # no key match, need to check further - child_res = fetch_keys_from_dict(data[_key], keys=keys, - from_keys=from_keys) - - if child_res: - if f_key == "ALL": - # It assumes ALL is only for top snapshot level - result_dict[(_key, "SNAPSHOT")] = copy.deepcopy(child_res) - else: - result_dict = deep_merge_dicts( - result_dict, copy.deepcopy(child_res)) - - else: - if (False, "*", None) in keys and isinstance(_key, tuple): - result_dict[_key] = copy.deepcopy(data[_key]) - elif isinstance(_key, tuple) and _key[1] == "KEY": - for check_substring, s_key, new_name in keys: - if ((check_substring and re.search(s_key, _key[0])) - or (not check_substring and _key[0] == s_key)): - if new_name: - result_dict[(new_name, "KEY")] = data[_key] - else: - result_dict[_key] = data[_key] - break - - elif data[_key] and isinstance(data[_key], dict): - child_res = fetch_keys_from_dict(data[_key], keys=keys) - if child_res: - if isinstance(_key, tuple): - result_dict[_key] = copy.deepcopy(child_res) - else: - result_dict = deep_merge_dicts(result_dict, - copy.deepcopy(child_res)) - - return result_dict - - def add_component_keys(data, component_key_list): if not component_key_list: return data @@ -369,6 +303,80 @@ def make_key(key): return (key, "KEY") +def _remove_duplicates_from_saved_value_list(v_list): + """ + Remove items with duplicate keys and create single tuple entry with last possible value for key in list. 
+ + """ + + if not v_list: + return v_list + + tmp_dict = {} + for i in v_list: + tmp_dict[i[0]] = (i[1], i[2]) + + res_list = [] + for i in v_list: + t = (i[0], tmp_dict[i[0]][0], tmp_dict[i[0]][1]) + if t not in res_list: + res_list.append(t) + + return res_list + + +def _extract_saved_value_list_from_value_vector(v): + val_to_save = [] + + for i in v: + try: + _k, _v = get_kv(i) + + if _v[1]: + val_to_save += _v[1] + + except Exception: + # Not expected Input format (list of kv map) + pass + + return val_to_save + + +def create_value_list_to_save(save_param=None, key=" ", value=None, op1=None, op2=None, formatting=True): + """ + Merge saved value lists of operand/s. + + """ + + value_list = [] + + if op1: + if isinstance(op1, list): + value_list += _extract_saved_value_list_from_value_vector(op1) + else: + value_list += op1[1] + + if op2: + if isinstance(op2, list): + value_list += _extract_saved_value_list_from_value_vector(op2) + else: + value_list += op2[1] + + if save_param is None: + # Not saving value (result) + return _remove_duplicates_from_saved_value_list(value_list) + + if save_param == "": + # Saving value (result) with key + value_list.append((key, value, formatting)) + + else: + # Saving value (result) with save_param as key + value_list.append((save_param, value, formatting)) + + return _remove_duplicates_from_saved_value_list(value_list) + + def create_snapshot_key(id, snapshot_prefix="SNAPSHOT"): id = str(id) if len(id) > 2: @@ -381,3 +389,51 @@ def create_snapshot_key(id, snapshot_prefix="SNAPSHOT"): return snapshot_prefix + "00" + id return None + + +def create_health_internal_tuple(val, saved_value_list=[]): + return (val, saved_value_list) + + +def get_value_from_health_internal_tuple(t): + if not t or not isinstance(t, tuple): + return t + + return t[0] + + +def is_health_parser_variable(var): + """ + + :param var: variable to check + :return: True/False + + """ + if not var: + return False + + if isinstance(var, tuple) and var[0] == 
HEALTH_PARSER_VAR: + return True + + return False + + +def find_majority_element(value_list): + if not value_list: + return None + + m_value = value_list[0] + tmp_dict = {} + tmp_dict[m_value] = 1 + + for i in range(1, len(value_list)): + v = value_list[i] + if v in tmp_dict: + tmp_dict[v] += 1 + else: + tmp_dict[v] = 1 + + if v != m_value and tmp_dict[v] > tmp_dict[m_value]: + m_value = v + + return m_value \ No newline at end of file diff --git a/lib/log/loghdlr.py b/lib/log/loghdlr.py index f56bc2f9..49b3a424 100644 --- a/lib/log/loghdlr.py +++ b/lib/log/loghdlr.py @@ -160,7 +160,7 @@ def remove_logs_by_index(self, indices='all'): del self.selected_logs[log] except Exception as e: - self.logger.error(e) + self.logger.warning("Ignoring remove operation for index %s. Error: %s"%(str(index), str(e))) continue def select_logs_by_index(self, indices="all"): diff --git a/lib/log/reader.py b/lib/log/reader.py index 5de46072..3982feac 100644 --- a/lib/log/reader.py +++ b/lib/log/reader.py @@ -167,8 +167,8 @@ def parse_init_dt(self, arg_from, tail_dt): try: init_dt = tail_dt - self.parse_timedelta(arg_from.strip("- ")) except Exception: - self.logger.error( - "Can't parse relative start time " + arg_from) + self.logger.warning( + "Ignoring relative start time. Can't parse relative start time " + arg_from) return 0 else: # Absolute start time: @@ -176,8 +176,8 @@ def parse_init_dt(self, arg_from, tail_dt): init_dt = datetime.datetime( *(time.strptime(arg_from, DT_FMT)[0:6])) except Exception as e: - self.logger.error( - "Can't parse absolute start time " + arg_from + " " + str(e)) + self.logger.warning( + "Ignoring absolute start time. 
Can't parse absolute start time " + arg_from + " " + str(e)) return 0 return init_dt @@ -304,4 +304,4 @@ def read_line(self, f): except Exception: pass - return ln \ No newline at end of file + return ln diff --git a/lib/logcontroller.py b/lib/logcontroller.py index 8be475bc..b0acea71 100644 --- a/lib/logcontroller.py +++ b/lib/logcontroller.py @@ -366,7 +366,7 @@ def do_show(self, line): try: output_page_size = int(util.strip_string(tline.pop(0))) except Exception: - self.logger.error( + self.logger.warning( "Wrong output page size, setting default value") elif word == '-n': try: @@ -449,13 +449,13 @@ def do_count(self, line): try: output_page_size = int(util.strip_string(tline.pop(0))) except Exception: - self.logger.error( + self.logger.warning( "Wrong output page size, setting default value") elif word == '-r': try: title_every_nth = int(util.strip_string(tline.pop(0))) except Exception: - self.logger.error( + self.logger.warning( "Wrong output title repetition value, setting default value") elif word == '-f': start_tm = tline.pop(0) @@ -557,13 +557,13 @@ def do_diff(self, line): try: output_page_size = int(util.strip_string(tline.pop(0))) except Exception: - self.logger.error( + self.logger.warning( "Wrong output page size, setting default value") elif word == '-r': try: title_every_nth = int(util.strip_string(tline.pop(0))) except Exception: - self.logger.error( + self.logger.warning( "Wrong output title repetition value, setting default value") elif word == '-n': try: @@ -652,13 +652,13 @@ def do_latency(self, line): try: output_page_size = int(util.strip_string(tline.pop(0))) except Exception: - self.logger.error( + self.logger.warning( "Wrong output page size, setting default value") elif word == '-r': try: title_every_nth = int(util.strip_string(tline.pop(0))) except Exception: - self.logger.error( + self.logger.warning( "Wrong output title repetition value, setting default value") elif word == '-n': try: diff --git a/lib/utils/util.py b/lib/utils/util.py 
index cd865870..32735f22 100644 --- a/lib/utils/util.py +++ b/lib/utils/util.py @@ -13,18 +13,18 @@ # limitations under the License. import copy from distutils.version import LooseVersion - +import pipes import re -import threading +import StringIO import subprocess -import pipes import sys -import StringIO +import threading -# Dictionary to contain feature and related stats to identify state of that feature -# Format : { feature1: ((service_stat1, service_stat2, ....), (namespace_stat1, namespace_stat2, ...), ...} from lib.utils import filesize + +# Dictionary to contain feature and related stats to identify state of that feature +# Format : { feature1: ((service_stat1, service_stat2, ....), (namespace_stat1, namespace_stat2, ...), ...} FEATURE_KEYS = { "KVS": (('stat_read_reqs', 'stat_write_reqs'), ('client_read_error', 'client_read_success', 'client_write_error', 'client_write_success')), "UDF": (('udf_read_reqs', 'udf_write_reqs'), ('client_udf_complete', 'client_udf_error')), @@ -112,7 +112,7 @@ def capture_stdout(func, line=''): def compile_likes(likes): - likes = map(re.escape, likes) + likes = ["(" + like.translate(None, '\'"') + ")" for like in likes] likes = "|".join(likes) likes = re.compile(likes) return likes @@ -252,9 +252,18 @@ def get_value_from_dict(d, keys, default_value=None, return_type=None): if not return_type: return val + try: + if return_type == bool: + if val.lower() == "false": + return False + if val.lower() == "true": + return True + except Exception: + pass + try: return return_type(val) - except: + except Exception: pass return default_value @@ -410,7 +419,7 @@ def pct_to_value(data, d_pct): return out_map -def is_keyval_greater_than_value(data={}, keys=(), value=0, is_and=False, type_check=int): +def _is_keyval_greater_than_value(data={}, keys=(), value=0, is_and=False, type_check=int): """ Function takes dictionary, keys and value to compare. Returns boolean to indicate value for key is greater than comparing value or not. 
@@ -442,19 +451,19 @@ def check_feature_by_keys(service_data=None, service_keys=None, ns_data=None, ns """ if service_data and not isinstance(service_data, Exception) and service_keys: - if is_keyval_greater_than_value(service_data, service_keys): + if _is_keyval_greater_than_value(service_data, service_keys): return True if ns_data and ns_keys: for ns, nsval in ns_data.iteritems(): if not nsval or isinstance(nsval, Exception): continue - if is_keyval_greater_than_value(nsval, ns_keys): + if _is_keyval_greater_than_value(nsval, ns_keys): return True return False -def find_features_for_cluster(service_data, ns_data): +def _find_features_for_cluster(service_data, ns_data): """ Function takes dictionary of service data and dictionary of namespace data. Returns list of active (used) features identifying by comparing respective keys for non-zero value. @@ -476,7 +485,7 @@ def find_features_for_cluster(service_data, ns_data): return features -def compute_set_overhead_for_ns(set_stats, ns): +def _compute_set_overhead_for_ns(set_stats, ns): """ Function takes set stat and namespace name. Returns set overhead for input namespace name. @@ -500,9 +509,9 @@ def compute_set_overhead_for_ns(set_stats, ns): return overhead -def compute_license_data_size(namespace_stats, set_stats, cluster_dict, ns_dict): +def _compute_license_data_size(namespace_stats, set_stats, cluster_dict, ns_dict): """ - Function takes dictionary of service stats, dictionary of namespace stats, cluster output dictionary and namespace output dictionary. + Function takes dictionary of set stats, dictionary of namespace stats, cluster output dictionary and namespace output dictionary. Function finds license data size per namespace, and per cluster and updates output dictionaries. 
""" @@ -535,7 +544,7 @@ def compute_license_data_size(namespace_stats, set_stats, cluster_dict, ns_dict) device_data_size = sum(get_value_from_second_level_of_dict(ns_stats, ("device_used_bytes", "used-bytes-disk"), default_value=0, return_type=int).values()) if device_data_size > 0: - set_overhead = compute_set_overhead_for_ns(set_stats, ns) + set_overhead = _compute_set_overhead_for_ns(set_stats, ns) device_data_size = device_data_size - set_overhead if device_data_size > 0: @@ -548,20 +557,40 @@ def compute_license_data_size(namespace_stats, set_stats, cluster_dict, ns_dict) device_record_overhead = master_objects * 64 device_data_size = device_data_size - device_record_overhead - ns_dict[ns]["license_data"] = {} + ns_dict[ns]["license_data_in_memory"] = 0 + ns_dict[ns]["license_data_on_disk"] = 0 if memory_data_size is not None: - ns_dict[ns]["license_data"]["memory_size"] = memory_data_size + ns_dict[ns]["license_data_in_memory"] = memory_data_size cl_memory_data_size += memory_data_size if device_data_size is not None: - ns_dict[ns]["license_data"]["device_size"] = device_data_size + ns_dict[ns]["license_data_on_disk"] = device_data_size cl_device_data_size += device_data_size cluster_dict["license_data"] = {} cluster_dict["license_data"]["memory_size"] = cl_memory_data_size cluster_dict["license_data"]["device_size"] = cl_device_data_size -def initialize_summary_output(ns_list): +def _set_migration_status(namespace_stats, cluster_dict, ns_dict): + """ + Function takes dictionary of namespace stats, cluster output dictionary and namespace output dictionary. + Function finds migration status per namespace, and per cluster and updates output dictionaries. 
+ """ + + if not namespace_stats: + return + + for ns, ns_stats in namespace_stats.iteritems(): + if not ns_stats or isinstance(ns_stats, Exception): + continue + + migrations_in_progress = any(get_value_from_second_level_of_dict(ns_stats, ("migrate_tx_partitions_remaining", "migrate-tx-partitions-remaining"), + default_value=0, return_type=int).values()) + if migrations_in_progress: + ns_dict[ns]["migrations_in_progress"] = True + cluster_dict["migrations_in_progress"] = True + +def _initialize_summary_output(ns_list): """ Function takes list of namespace names. Returns dictionary with summary fields set. @@ -573,6 +602,7 @@ def initialize_summary_output(ns_list): summary_dict["CLUSTER"]["server_version"] = [] summary_dict["CLUSTER"]["os_version"] = [] summary_dict["CLUSTER"]["active_features"] = [] + summary_dict["CLUSTER"]["migrations_in_progress"] = False summary_dict["CLUSTER"]["device"] = {} summary_dict["CLUSTER"]["device"]["count"] = 0 @@ -587,6 +617,7 @@ def initialize_summary_output(ns_list): summary_dict["CLUSTER"]["memory"]["aval_pct"] = 0 summary_dict["CLUSTER"]["active_ns"] = 0 + summary_dict["CLUSTER"]["ns_count"] = 0 summary_dict["CLUSTER"]["license_data"] = {} summary_dict["CLUSTER"]["license_data"]["memory_size"] = 0 @@ -598,24 +629,23 @@ def initialize_summary_output(ns_list): for ns in ns_list: summary_dict["FEATURES"]["NAMESPACE"][ns] = {} - summary_dict["FEATURES"]["NAMESPACE"][ns]["device"] = {} - summary_dict["FEATURES"]["NAMESPACE"][ns]["device"]["count"] = 0 - summary_dict["FEATURES"]["NAMESPACE"][ns]["device"]["count_per_node"] = 0 - summary_dict["FEATURES"]["NAMESPACE"][ns]["device"]["count_same_across_nodes"] = True - summary_dict["FEATURES"]["NAMESPACE"][ns]["device"]["total"] = 0 - summary_dict["FEATURES"]["NAMESPACE"][ns]["device"]["used_pct"] = 0 - summary_dict["FEATURES"]["NAMESPACE"][ns]["device"]["aval_pct"] = 0 + summary_dict["FEATURES"]["NAMESPACE"][ns]["devices_total"] = 0 + 
summary_dict["FEATURES"]["NAMESPACE"][ns]["devices_per_node"] = 0 + summary_dict["FEATURES"]["NAMESPACE"][ns]["memory_total"] = 0 + summary_dict["FEATURES"]["NAMESPACE"][ns]["memory_available_pct"] = 0 - summary_dict["FEATURES"]["NAMESPACE"][ns]["memory"] = {} - summary_dict["FEATURES"]["NAMESPACE"][ns]["memory"]["total"] = 0 - summary_dict["FEATURES"]["NAMESPACE"][ns]["memory"]["aval_pct"] = 0 + summary_dict["FEATURES"]["NAMESPACE"][ns]["disk_total"] = 0 + summary_dict["FEATURES"]["NAMESPACE"][ns]["disk_used_pct"] = 0 + summary_dict["FEATURES"]["NAMESPACE"][ns]["disk_available_pct"] = 0 summary_dict["FEATURES"]["NAMESPACE"][ns]["repl_factor"] = 0 summary_dict["FEATURES"]["NAMESPACE"][ns]["master_objects"] = 0 summary_dict["FEATURES"]["NAMESPACE"][ns]["license_data"] = {} + summary_dict["FEATURES"]["NAMESPACE"][ns]["migrations_in_progress"] = False + return summary_dict def create_summary(service_stats, namespace_stats, set_stats, metadata): @@ -624,12 +654,12 @@ def create_summary(service_stats, namespace_stats, set_stats, metadata): Returns dictionary with summary information. 
""" - features = find_features_for_cluster(service_stats, namespace_stats) + features = _find_features_for_cluster(service_stats, namespace_stats) namespace_stats = flip_keys(namespace_stats) set_stats = flip_keys(set_stats) - summary_dict = initialize_summary_output(namespace_stats.keys()) + summary_dict = _initialize_summary_output(namespace_stats.keys()) total_nodes = len(service_stats.keys()) @@ -642,7 +672,8 @@ def create_summary(service_stats, namespace_stats, set_stats, metadata): cl_nodewise_device_used = {} cl_nodewise_device_aval = {} - compute_license_data_size(namespace_stats, set_stats, summary_dict["CLUSTER"], summary_dict["FEATURES"]["NAMESPACE"]) + _compute_license_data_size(namespace_stats, set_stats, summary_dict["CLUSTER"], summary_dict["FEATURES"]["NAMESPACE"]) + _set_migration_status(namespace_stats, summary_dict["CLUSTER"], summary_dict["FEATURES"]["NAMESPACE"]) summary_dict["CLUSTER"]["active_features"] = features summary_dict["CLUSTER"]["cluster_size"]= list(set(get_value_from_second_level_of_dict(service_stats, ("cluster_size",), default_value=0, return_type=int).values())) @@ -665,18 +696,16 @@ def create_summary(service_stats, namespace_stats, set_stats, metadata): ns_total_nodes = len(ns_stats.keys()) if ns_total_devices: - summary_dict["FEATURES"]["NAMESPACE"][ns]["device"]["count"] = ns_total_devices - summary_dict["FEATURES"]["NAMESPACE"][ns]["device"]["count_per_node"] = int((float(ns_total_devices)/float(ns_total_nodes)) + 0.5) - if len(set(device_counts.values())) > 1: - summary_dict["FEATURES"]["NAMESPACE"][ns]["device"]["count_same_across_nodes"] = False + summary_dict["FEATURES"]["NAMESPACE"][ns]["devices_total"] = ns_total_devices + summary_dict["FEATURES"]["NAMESPACE"][ns]["devices_per_node"] = int((float(ns_total_devices)/float(ns_total_nodes)) + 0.5) mem_size = get_value_from_second_level_of_dict(ns_stats, ("memory-size",), default_value=0, return_type=int) mem_aval_pct = get_value_from_second_level_of_dict(ns_stats, 
("memory_free_pct", "free-pct-memory"), default_value=0, return_type=int) mem_aval = pct_to_value(mem_size, mem_aval_pct) cl_nodewise_mem_size = add_dicts(cl_nodewise_mem_size, mem_size) cl_nodewise_mem_aval = add_dicts(cl_nodewise_mem_aval, mem_aval) - summary_dict["FEATURES"]["NAMESPACE"][ns]["memory"]["total"] = sum(mem_size.values()) - summary_dict["FEATURES"]["NAMESPACE"][ns]["memory"]["aval_pct"] = (float(sum(mem_aval.values()))/float(sum(mem_size.values())))*100.0 + summary_dict["FEATURES"]["NAMESPACE"][ns]["memory_total"] = sum(mem_size.values()) + summary_dict["FEATURES"]["NAMESPACE"][ns]["memory_available_pct"] = (float(sum(mem_aval.values()))/float(sum(mem_size.values())))*100.0 device_size = get_value_from_second_level_of_dict(ns_stats, ("device_total_bytes", "total-bytes-disk"), default_value=0, return_type=int) device_used = get_value_from_second_level_of_dict(ns_stats, ("device_used_bytes", "used-bytes-disk"), default_value=0, return_type=int) @@ -687,12 +716,23 @@ def create_summary(service_stats, namespace_stats, set_stats, metadata): cl_nodewise_device_aval = add_dicts(cl_nodewise_device_aval, device_aval) device_size_total = sum(device_size.values()) if device_size_total > 0: - summary_dict["FEATURES"]["NAMESPACE"][ns]["device"]["total"] = device_size_total - summary_dict["FEATURES"]["NAMESPACE"][ns]["device"]["used_pct"] = (float(sum(device_used.values()))/float(device_size_total))*100.0 - summary_dict["FEATURES"]["NAMESPACE"][ns]["device"]["aval_pct"] = (float(sum(device_aval.values()))/float(device_size_total))*100.0 + summary_dict["FEATURES"]["NAMESPACE"][ns]["disk_total"] = device_size_total + summary_dict["FEATURES"]["NAMESPACE"][ns]["disk_used_pct"] = (float(sum(device_used.values()))/float(device_size_total))*100.0 + summary_dict["FEATURES"]["NAMESPACE"][ns]["disk_available_pct"] = (float(sum(device_aval.values()))/float(device_size_total))*100.0 summary_dict["FEATURES"]["NAMESPACE"][ns]["repl_factor"] = 
list(set(get_value_from_second_level_of_dict(ns_stats, ("repl-factor",), default_value=0, return_type=int).values())) + + data_in_memory = get_value_from_second_level_of_dict(ns_stats, ("storage-engine.data-in-memory", "data-in-memory"), default_value=False, return_type=bool).values()[0] + + if data_in_memory: + cache_read_pcts = get_value_from_second_level_of_dict(ns_stats, ("cache_read_pct", "cache-read-pct"), default_value="N/E", return_type=int).values() + if cache_read_pcts: + try: + summary_dict["FEATURES"]["NAMESPACE"][ns]["cache_read_pct"] = sum(cache_read_pcts)/len(cache_read_pcts) + except Exception: + pass master_objects = sum(get_value_from_second_level_of_dict(ns_stats, ("master_objects", "master-objects"), default_value=0, return_type=int).values()) + summary_dict["CLUSTER"]["ns_count"] += 1 if master_objects > 0: summary_dict["FEATURES"]["NAMESPACE"][ns]["master_objects"] = master_objects summary_dict["CLUSTER"]["active_ns"] += 1 @@ -924,3 +964,22 @@ def create_histogram_output(histogram_name, histogram_data, **params): return _create_bytewise_histogram_percentiles_output(histogram_data, params["bucket_count"], params["builds"]) +def find_delimiter_in(value): + """Find a good delimiter to split the value by""" + + for d in [';', ':', ',']: + if d in value: + return d + + return ';' + +def convert_edition_to_shortform(edition): + """Convert edition to shortform EE or CE or N/E""" + + if edition.lower() in ['enterprise', 'true', 'ee'] or 'enterprise' in edition.lower(): + return "Enterprise" + + if edition.lower() in ['community', 'false', 'ce'] or 'community' in edition.lower(): + return "Community" + + return "N/E" diff --git a/lib/view/view.py b/lib/view/view.py index b8537e2c..bb4ac578 100644 --- a/lib/view/view.py +++ b/lib/view/view.py @@ -14,21 +14,22 @@ import datetime import itertools -import math -from pydoc import pipepager -import re -import types +import locale +import sys import time +import types from cStringIO import StringIO -import 
sys +from pydoc import pipepager -from lib.health.constants import HealthResultType, HealthResultCounter, AssertResultKey, AssertLevel +from lib.health.constants import (AssertLevel, AssertResultKey, + HealthResultCounter, HealthResultType) from lib.health.util import print_dict from lib.utils import filesize -from lib.utils.util import get_value_from_dict, set_value_in_dict from lib.utils.constants import COUNT_RESULT_KEY, DT_FMT -from lib.view.table import Table, Extractors, TitleFormats, Styles +from lib.utils.util import (compile_likes, find_delimiter_in, + get_value_from_dict, set_value_in_dict) from lib.view import terminal +from lib.view.table import Extractors, Styles, Table, TitleFormats H1_offset = 13 H2_offset = 15 @@ -39,13 +40,6 @@ class CliView(object): NO_PAGER, LESS, MORE, SCROLL = range(4) pager = NO_PAGER - @staticmethod - def compile_likes(likes): - likes = map(re.escape, likes) - likes = "|".join(likes) - likes = re.compile(likes) - return likes - @staticmethod def print_result(out): if type(out) is not str: @@ -76,26 +70,29 @@ def info_network(stats, cluster_names, versions, builds, cluster, title_suffix=" principal = cluster.get_expected_principal() hosts = cluster.nodes title = "Network Information%s" % (title_suffix) - column_names = (('cluster-name', 'Cluster Name'), 'node', 'node_id', 'ip', 'build', 'cluster_size', 'cluster_key', - '_cluster_integrity', ('_paxos_principal', 'Principal'), 'rackaware_mode', ('client_connections', 'Client Conns'), '_uptime') + column_names = ( + ('cluster-name', 'Cluster Name'), 'node', 'node_id', 'ip', 'build', + 'cluster_size', ('_migrations', 'Migrations'), 'cluster_key', + '_cluster_integrity', ('_paxos_principal', 'Principal'), + 'rackaware_mode', ('client_connections', 'Client Conns'), '_uptime' + ) t = Table(title, column_names, group_by=0, sort_by=1) - t.add_cell_alert('node_id', lambda data: data[ - 'real_node_id'] == principal, color=terminal.fg_green) - + t.add_data_source('Enterprise', lambda 
data: 'N/E' if data['version'] == 'N/E' else( + True if "Enterprise" in data['version'] else False)) t.add_data_source('_cluster_integrity', lambda data: True if row['cluster_integrity'] == 'true' else False) + t.add_data_source('_migrations', + Extractors.sif_extractor('migrate_partitions_remaining')) t.add_data_source('_uptime', Extractors.time_extractor('uptime')) - t.add_cell_alert( - '_cluster_integrity', lambda data: data['cluster_integrity'] != 'true') - + t.add_cell_alert('node_id', lambda data: data[ + 'real_node_id'] == principal, color=terminal.fg_green) t.add_cell_alert( 'node', lambda data: data['real_node_id'] == principal, color=terminal.fg_green) - - t.add_data_source('Enterprise', lambda data: 'N/E' if data['version'] == 'N/E' else( - True if "Enterprise" in data['version'] else False)) + t.add_cell_alert( + '_cluster_integrity', lambda data: data['cluster_integrity'] != 'true') for node_key, n_stats in stats.iteritems(): if isinstance(n_stats, Exception): @@ -108,6 +105,7 @@ def info_network(stats, cluster_names, versions, builds, cluster, title_suffix=" row['ip'] = hosts[node_key].sock_name(use_fqdn=False) row['node_id'] = node.node_id if node.node_id != principal else "*%s" % ( node.node_id) + try: paxos_node = cluster.get_node(row['paxos_principal'])[0] row['_paxos_principal'] = paxos_node.node_id @@ -147,23 +145,30 @@ def info_network(stats, cluster_names, versions, builds, cluster, title_suffix=" CliView.print_result(t) @staticmethod - def info_namespace(stats, cluster, title_suffix="", **ignore): + def info_namespace_usage(stats, cluster, title_suffix="", **ignore): prefixes = cluster.get_node_names() principal = cluster.get_expected_principal() - title = "Namespace Information%s" % (title_suffix) - column_names = ('namespace', 'node', ('available_pct', 'Avail%'), ('_evicted_objects', 'Evictions'), ('_total_objects', 'Total Objects'), ('_expired_objects', 'Expirations'), 'repl-factor', 'stop_writes', ('_migrates', 'Pending Migrates 
(tx,rx)'), - ('_used_bytes_disk', 'Disk Used'), ('_used_disk_pct', 'Disk Used%'), ('high-water-disk-pct', 'HWM Disk%'), ('_used_bytes_memory', 'Mem Used'), ('_used_mem_pct', 'Mem Used%'), ('high-water-memory-pct', 'HWM Mem%'), ('stop-writes-pct', 'Stop Writes%'), - ('rack-id', 'Rack ID') - ) + title = "Namespace Usage Information%s" % (title_suffix) + column_names = ( + 'namespace', 'node', + ('_total_records', 'Total Records'), + ('_expired_and_evicted', 'Expirations,Evictions'), 'stop_writes', + ('_used_bytes_disk', 'Disk Used'), ('_used_disk_pct', 'Disk Used%'), + ('high-water-disk-pct', 'HWM Disk%'), ('available_pct', 'Avail%'), + ('_used_bytes_memory', 'Mem Used'), ('_used_mem_pct', 'Mem Used%'), + ('high-water-memory-pct', 'HWM Mem%'), + ('stop-writes-pct', 'Stop Writes%') + ) t = Table(title, column_names, sort_by=0) - t.add_data_source('_total_objects', Extractors.sif_extractor( - ('total_objects'))) - t.add_data_source('_evicted_objects', Extractors.sif_extractor( - ('evicted-objects', 'evicted_objects'))) - t.add_data_source('_expired_objects', Extractors.sif_extractor( - ('expired-objects', 'expired_objects'))) + + t.add_data_source('_total_records', Extractors.sif_extractor( + ('_total_records'))) + t.add_data_source_tuple( + '_expired_and_evicted', + Extractors.sif_extractor(('expired-objects', 'expired_objects')), + Extractors.sif_extractor(('evicted-objects', 'evicted_objects'))) t.add_data_source('_used_bytes_disk', Extractors.byte_extractor( ('used-bytes-disk', 'device_used_bytes'))) t.add_data_source('_used_bytes_memory', Extractors.byte_extractor( @@ -179,13 +184,6 @@ def info_namespace(stats, cluster, title_suffix="", **ignore): t.add_cell_alert('stop_writes', lambda data: data['stop_writes'] != " " and data['stop_writes'] != 'false') - t.add_data_source_tuple( - '_migrates', - Extractors.sif_extractor(('migrate_tx_partitions_remaining', - 'migrate-tx-partitions-remaining')), - Extractors.sif_extractor(('migrate_rx_partitions_remaining', - 
'migrate-rx-partitions-remaining'))) - t.add_cell_alert('_used_mem_pct', lambda data: data['free_pct_memory'] != " " and (100 - int(data['free_pct_memory'])) >= int(data['high-water-memory-pct'])) t.add_cell_alert('_used_disk_pct', lambda data: data['free_pct_disk'] != " " and (100 - int(data['free_pct_disk'])) >= int(data['high-water-disk-pct'])) @@ -196,21 +194,18 @@ def info_namespace(stats, cluster, title_suffix="", **ignore): t.add_cell_alert( 'namespace', lambda data: data['node'] is " ", color=terminal.fg_blue) t.add_cell_alert( - '_total_objects', lambda data: data['node'] is " ", color=terminal.fg_blue) + '_total_records', lambda data: data['node'] is " ", color=terminal.fg_blue) t.add_cell_alert( '_used_bytes_memory', lambda data: data['node'] is " ", color=terminal.fg_blue) t.add_cell_alert( '_used_bytes_disk', lambda data: data['node'] is " ", color=terminal.fg_blue) t.add_cell_alert( - '_evicted_objects', lambda data: data['node'] is " ", color=terminal.fg_blue) - t.add_cell_alert( - '_expired_objects', lambda data: data['node'] is " ", color=terminal.fg_blue) - t.add_cell_alert( - '_migrates', lambda data: data['node'] is " ", color=terminal.fg_blue) + '_expired_and_evicted', lambda data: data['node'] is " ", color=terminal.fg_blue) total_res = {} - # Need to maintain Node column ascending order per namespace. If set sort_by in table, it will affect total rows. + # Need to maintain Node column ascending order per namespace. + # If set sort_by in table, it will affect total rows. TODO: implement group_by # So we need to add rows as Nodes ascending order. So need to sort # stats.keys as per respective Node value (prefixes[node_key]). 
node_key_list = stats.keys() @@ -218,8 +213,6 @@ def info_namespace(stats, cluster, title_suffix="", **ignore): sorted_node_list = [x for (y, x) in sorted( zip(node_column_list, node_key_list), key=lambda pair: pair[0])] - rack_id_available = False - for node_key in sorted_node_list: n_stats = stats[node_key] node = cluster.get_node(node_key)[0] @@ -235,50 +228,48 @@ def info_namespace(stats, cluster, title_suffix="", **ignore): else: row = ns_stats - if "rack-id" in row: - rack_id_available = True - - total_objects = 0 + _total_records = 0 if ns not in total_res: total_res[ns] = {} - total_res[ns]["total_objects"] = 0 + total_res[ns]["_total_records"] = 0 total_res[ns]["used-bytes-memory"] = 0 total_res[ns]["used-bytes-disk"] = 0 total_res[ns]["evicted_objects"] = 0 total_res[ns]["expired_objects"] = 0 - total_res[ns]["migrate_tx_partitions_remaining"] = 0 - total_res[ns]["migrate_rx_partitions_remaining"] = 0 try: - total_objects += get_value_from_dict( - ns_stats, ('master-objects', 'master_objects'), default_value=0, return_type=int) + _total_records += get_value_from_dict( + ns_stats, ('master-objects', 'master_objects'), + default_value=0, return_type=int) except Exception: pass try: - total_objects += get_value_from_dict( + _total_records += get_value_from_dict( ns_stats, ('master_tombstones'), default_value=0, return_type=int) except Exception: pass try: - total_objects += get_value_from_dict( - ns_stats, ('prole-objects', 'prole_objects'), default_value=0, return_type=int) + _total_records += get_value_from_dict( + ns_stats, ('prole-objects', 'prole_objects'), default_value=0, + return_type=int) except Exception: pass try: - total_objects += get_value_from_dict( + _total_records += get_value_from_dict( ns_stats, ('prole_tombstones'), default_value=0, return_type=int) except Exception: pass try: - total_objects += get_value_from_dict( - ns_stats, ('non-replica-objects', 'non_replica_objects'), default_value=0, return_type=int) + _total_records += 
get_value_from_dict( + ns_stats, ('non-replica-objects', 'non_replica_objects'), + default_value=0, return_type=int) except Exception: pass try: - total_objects += get_value_from_dict( + _total_records += get_value_from_dict( ns_stats, ('non_replica_tombstones'), default_value=0, return_type=int) except Exception: pass @@ -306,18 +297,6 @@ def info_namespace(stats, cluster, title_suffix="", **ignore): except Exception: pass - try: - total_res[ns]["migrate_tx_partitions_remaining"] += get_value_from_dict( - ns_stats, ('migrate-tx-partitions-remaining', 'migrate_tx_partitions_remaining'), return_type=int) - except Exception: - pass - - try: - total_res[ns]["migrate_rx_partitions_remaining"] += get_value_from_dict( - ns_stats, ('migrate-rx-partitions-remaining', 'migrate_rx_partitions_remaining'), return_type=int) - except Exception: - pass - row['namespace'] = ns row['real_node_id'] = node.node_id row['node'] = prefixes[node_key] @@ -330,8 +309,8 @@ def info_namespace(stats, cluster, title_suffix="", **ignore): set_value_in_dict( row, "stop_writes", get_value_from_dict(row, ('stop-writes', 'stop_writes'))) set_value_in_dict( - row, "total_objects", total_objects) - total_res[ns]["total_objects"] += total_objects + row, "_total_records", _total_records) + total_res[ns]["_total_records"] += _total_records t.insert_row(row) @@ -339,55 +318,72 @@ def info_namespace(stats, cluster, title_suffix="", **ignore): row = {} row['node'] = " " row['available_pct'] = " " - row["repl-factor"] = " " row["stop_writes"] = " " row["high-water-disk-pct"] = " " row["free_pct_disk"] = " " row["free_pct_memory"] = " " row["high-water-memory-pct"] = " " row["stop-writes-pct"] = " " - if rack_id_available: - row["rack-id"] = " " row['namespace'] = ns - row["total_objects"] = str(total_res[ns]["total_objects"]) + row["_total_records"] = str(total_res[ns]["_total_records"]) row["used-bytes-memory"] = str(total_res[ns]["used-bytes-memory"]) row["used-bytes-disk"] = 
str(total_res[ns]["used-bytes-disk"]) row["evicted_objects"] = str(total_res[ns]["evicted_objects"]) row["expired_objects"] = str(total_res[ns]["expired_objects"]) - row["migrate_tx_partitions_remaining"] = str(total_res[ns]["migrate_tx_partitions_remaining"]) - row["migrate_rx_partitions_remaining"] = str(total_res[ns]["migrate_rx_partitions_remaining"]) t.insert_row(row) CliView.print_result(t) @staticmethod - def info_object(stats, cluster, title_suffix="", **ignore): + def info_namespace_object(stats, cluster, title_suffix="", **ignore): prefixes = cluster.get_node_names() principal = cluster.get_expected_principal() - title = "Object Information%s" % (title_suffix) - column_names = ('namespace', 'node', ('_master_objects', 'Master (Objects,Tombstones)'), ('_prole_objects', 'Replica (Objects,Tombstones)'), - ('_non_replica_objects', 'Non-Replica (Objects,Tombstones)'), ('_migration', 'Migration') - ) + title = "Namespace Object Information%s" % (title_suffix) + column_names = ( + 'namespace', 'node', + ('_total_records', 'Total Records'), + ('_repl_factor', "Repl Factor"), + ('_objects', 'Objects (Master,Prole,Non-Replica)'), + ('_tombstones', 'Tombstones (Master,Prole,Non-Replica)'), + ('_migrates', 'Pending Migrates (tx,rx)'), ('rack-id', 'Rack ID') + ) t = Table(title, column_names, sort_by=0) + t.add_data_source( + '_total_records', + Extractors.sif_extractor('_total_records')) + + t.add_data_source( + '_repl_factor', + lambda data: get_value_from_dict( + data, ('repl-factor', + 'effective_replication_factor') # introduced post 3.15.0.1 + )) + t.add_data_source_tuple( - '_master_objects', + '_objects', Extractors.sif_extractor(('master-objects', 'master_objects')), - Extractors.sif_extractor(('master_tombstones'))) + Extractors.sif_extractor(('prole-objects', 'prole_objects')), + Extractors.sif_extractor(('non_replica_objects')) + ) t.add_data_source_tuple( - '_prole_objects', - Extractors.sif_extractor(('prole-objects', 'prole_objects')), - 
Extractors.sif_extractor(('prole_tombstones'))) + '_tombstones', + Extractors.sif_extractor(('master_tombstones')), + Extractors.sif_extractor(('prole_tombstones')), + Extractors.sif_extractor(('non_replica_tombstones')) + ) t.add_data_source_tuple( - '_non_replica_objects', - Extractors.sif_extractor(('non_replica_objects')), - Extractors.sif_extractor(('non_replica_tombstones'))) + '_migrates', + Extractors.sif_extractor(('migrate_tx_partitions_remaining', + 'migrate-tx-partitions-remaining')), + Extractors.sif_extractor(('migrate_rx_partitions_remaining', + 'migrate-rx-partitions-remaining'))) t.add_cell_alert( 'node', lambda data: data['real_node_id'] == principal, color=terminal.fg_green) @@ -395,23 +391,18 @@ def info_object(stats, cluster, title_suffix="", **ignore): t.add_cell_alert( 'namespace', lambda data: data['node'] is " ", color=terminal.fg_blue) t.add_cell_alert( - '_master_objects', lambda data: data['node'] is " ", color=terminal.fg_blue) - t.add_cell_alert( - '_master_tombstones', lambda data: data['node'] is " ", color=terminal.fg_blue) + '_total_records', lambda data: data['node'] is " ", color=terminal.fg_blue) t.add_cell_alert( - '_prole_objects', lambda data: data['node'] is " ", color=terminal.fg_blue) + '_objects', lambda data: data['node'] is " ", color=terminal.fg_blue) t.add_cell_alert( - '_prole_tombstones', lambda data: data['node'] is " ", color=terminal.fg_blue) + '_tombstones', lambda data: data['node'] is " ", color=terminal.fg_blue) t.add_cell_alert( - '_non_replica_objects', lambda data: data['node'] is " ", color=terminal.fg_blue) - t.add_cell_alert( - '_non_replica_tombstones', lambda data: data['node'] is " ", color=terminal.fg_blue) - t.add_cell_alert( - '_migration', lambda data: data['node'] is " ", color=terminal.fg_blue) + '_migrates', lambda data: data['node'] is " ", color=terminal.fg_blue) total_res = {} - # Need to maintain Node column ascending order per namespace. If set sort_by in table, it will affect total rows. 
+ # Need to maintain Node column ascending order per namespace. + # If set sort_by in table, it will affect total rows. # So we need to add rows as Nodes ascending order. So need to sort # stats.keys as per respective Node value (prefixes[node_key]). node_key_list = stats.keys() @@ -419,6 +410,8 @@ def info_object(stats, cluster, title_suffix="", **ignore): sorted_node_list = [x for (y, x) in sorted( zip(node_column_list, node_key_list), key=lambda pair: pair[0])] + rack_id_available = False + for node_key in sorted_node_list: n_stats = stats[node_key] node = cluster.get_node(node_key)[0] @@ -428,73 +421,92 @@ def info_object(stats, cluster, title_suffix="", **ignore): continue for ns, ns_stats in n_stats.iteritems(): - if isinstance(ns_stats, Exception): row = {} else: row = ns_stats - pending_migration = False - if ns not in total_res: total_res[ns] = {} + total_res[ns]["_total_records"] = 0 total_res[ns]["master_objects"] = 0 total_res[ns]["master_tombstones"] = 0 total_res[ns]["prole_objects"] = 0 total_res[ns]["prole_tombstones"] = 0 total_res[ns]["non_replica_objects"] = 0 total_res[ns]["non_replica_tombstones"] = 0 - total_res[ns]["migration"] = False + total_res[ns]["migrate_tx_partitions_remaining"] = 0 + total_res[ns]["migrate_rx_partitions_remaining"] = 0 + + ns_stats['_total_records'] = 0 + + if "rack-id" in row: + rack_id_available = True try: - total_res[ns]["master_objects"] += get_value_from_dict( + value = get_value_from_dict( ns_stats, ('master-objects', 'master_objects'), return_type=int) + total_res[ns]["master_objects"] += value + ns_stats['_total_records'] += value except Exception: pass + try: - total_res[ns][ - "master_tombstones"] += get_value_from_dict(ns_stats, ('master_tombstones'), return_type=int) + value = get_value_from_dict(ns_stats, ('master_tombstones'), return_type=int) + total_res[ns]["master_tombstones"] += value + ns_stats['_total_records'] += value except Exception: pass + try: - total_res[ns]["prole_objects"] += 
get_value_from_dict( + value = get_value_from_dict( ns_stats, ('prole-objects', 'prole_objects'), return_type=int) + total_res[ns]["prole_objects"] += value + ns_stats['_total_records'] += value except Exception: pass + try: - total_res[ns][ - "prole_tombstones"] += get_value_from_dict(ns_stats, ('prole_tombstones'), return_type=int) + value = get_value_from_dict(ns_stats, ('prole_tombstones'), return_type=int) + total_res[ns]["prole_tombstones"] += value + ns_stats['_total_records'] += value except Exception: pass + try: - total_res[ns]["non_replica_objects"] += get_value_from_dict( + value = get_value_from_dict( ns_stats, ('non_replica_objects'), return_type=int) + total_res[ns]["non_replica_objects"] += value + ns_stats['_total_records'] += value except Exception: pass + try: - total_res[ns][ - "non_replica_tombstones"] += get_value_from_dict(ns_stats, ('non_replica_tombstones'), return_type=int) + value = get_value_from_dict(ns_stats, ('non_replica_tombstones'), return_type=int) + total_res[ns]["non_replica_tombstones"] += value + ns_stats['_total_records'] += value except Exception: pass try: - if get_value_from_dict(ns_stats, ('migrate-tx-partitions-remaining', 'migrate_tx_partitions_remaining'), default_value=0, return_type=int): - pending_migration = True - total_res[ns]["migration"] = True + total_res[ns]["migrate_tx_partitions_remaining"] += get_value_from_dict( + ns_stats, ('migrate-tx-partitions-remaining', + 'migrate_tx_partitions_remaining'), return_type=int) except Exception: pass try: - if get_value_from_dict(ns_stats, ('migrate-rx-partitions-remaining', 'migrate_rx_partitions_remaining'), default_value=0, return_type=int): - pending_migration = True - total_res[ns]["migration"] = True + total_res[ns]["migrate_rx_partitions_remaining"] += get_value_from_dict( + ns_stats, ('migrate-rx-partitions-remaining', + 'migrate_rx_partitions_remaining'), return_type=int) except Exception: pass + total_res[ns]['_total_records'] += ns_stats['_total_records'] + 
row['namespace'] = ns row['real_node_id'] = node.node_id row['node'] = prefixes[node_key] - row['_migration'] = str(pending_migration).lower() t.insert_row(row) for ns in total_res: @@ -502,13 +514,21 @@ def info_object(stats, cluster, title_suffix="", **ignore): row['node'] = " " row['namespace'] = ns + row["_total_records"] = str(total_res[ns]["_total_records"]) + row["effective_replication_factor"] = " " row["master_objects"] = str(total_res[ns]["master_objects"]) row["master_tombstones"] = str(total_res[ns]["master_tombstones"]) row["prole_objects"] = str(total_res[ns]["prole_objects"]) row["prole_tombstones"] = str(total_res[ns]["prole_tombstones"]) row["non_replica_objects"] = str(total_res[ns]["non_replica_objects"]) row["non_replica_tombstones"] = str(total_res[ns]["non_replica_tombstones"]) - row['_migration'] = str(total_res[ns]["migration"]).lower() + row["migrate_tx_partitions_remaining"] = str( + total_res[ns]["migrate_tx_partitions_remaining"]) + row["migrate_rx_partitions_remaining"] = str( + total_res[ns]["migrate_rx_partitions_remaining"]) + + if rack_id_available: + row["rack-id"] = " " t.insert_row(row) @@ -784,7 +804,7 @@ def info_string(title, summary): def show_distribution(title, histogram, unit, hist, cluster, like=None, title_suffix="", **ignore): prefixes = cluster.get_node_names() - likes = CliView.compile_likes(like) + likes = compile_likes(like) columns = ["%s%%" % (n) for n in xrange(10, 110, 10)] percentages = columns[:] @@ -815,7 +835,7 @@ def show_distribution(title, histogram, unit, hist, cluster, like=None, title_su def show_object_distribution(title, histogram, unit, hist, bucket_count, set_bucket_count, cluster, like=None, title_suffix="", loganalyser_mode=False, **ignore): prefixes = cluster.get_node_names() - likes = CliView.compile_likes(like) + likes = compile_likes(like) description = "Number of records having %s in the range " % (hist) + \ "measured in %s" % (unit) @@ -856,7 +876,7 @@ def show_object_distribution(title, 
histogram, unit, hist, bucket_count, set_buc def show_latency(latency, cluster, machine_wise_display=False, show_ns_details=False, like=None, **ignore): prefixes = cluster.get_node_names() if like: - likes = CliView.compile_likes(like) + likes = compile_likes(like) if not machine_wise_display: if like: histograms = set(filter(likes.search, latency.keys())) @@ -946,7 +966,7 @@ def show_config(title, service_configs, cluster, like=None, diff=None, show_tota column_names = sorted(column_names) if like: - likes = CliView.compile_likes(like) + likes = compile_likes(like) column_names = filter(likes.search, column_names) @@ -1182,13 +1202,13 @@ def show_mapping(col1, col2, mapping, like=None, **ignore): t = Table("%s to %s Mapping" % (col1, col2), column_names, title_format=TitleFormats.no_change, style=Styles.HORIZONTAL) if like: - likes = CliView.compile_likes(like) + likes = compile_likes(like) filtered_keys = filter(likes.search, mapping.keys()) else: filtered_keys = mapping.keys() for col1_val, col2_val in mapping.iteritems(): - if not col1_val in filtered_keys: + if col1_val not in filtered_keys: continue row = {} if not isinstance(col2_val, Exception): @@ -1218,20 +1238,18 @@ def asinfo(results, line_sep, show_node_name, cluster, **kwargs): print "%s%s%s" % (terminal.fg_red(), value, terminal.reset()) print "\n" else: - if type(value) == types.StringType: - # most info commands return a semicolon delimited list of key=value. - # Assuming this is the case here, later we may want to try to detect - # the format. 
+ if isinstance(value, types.StringType): + delimiter = find_delimiter_in(value) + value = value.split(delimiter) + if like: - value = value.split(';') - likes = CliView.compile_likes(like) + likes = compile_likes(like) value = filter(likes.search, value) - if line_sep: - value = "\n".join(value) - else: - value = ";".join(value) - elif line_sep: - value = value.replace(';', '\n') + + if line_sep: + value = "\n".join(value) + else: + value = delimiter.join(value) print value if show_node_name: @@ -1399,7 +1417,7 @@ def print_data(d): if d is None: return if isinstance(d, tuple): - print str(d[0]) + " : " + str(d[1]) + print d elif isinstance(d, dict): print_dict(d) else: @@ -1451,8 +1469,8 @@ def print_debug_messages(ho): try: for d in ho[HealthResultType.DEBUG_MESSAGES]: try: - print "Value of %s:" % (d[0]) - CliView.print_data(d[1]) + print "Value of %s:" % (d[1]) + CliView.print_data(d[2]) except Exception: pass except Exception: @@ -1487,6 +1505,88 @@ def get_msg(msg, level=None): else: return ("\n" + " ".rjust(H2_offset)).join(msg) + @staticmethod + def format_value(val, formatting=True): + if not val or not formatting: + return val + + if isinstance(val, int): + try: + # For python 2.7 + return str(format(val, ',d')) + + except Exception: + try: + # For python 2.6 + locale.setlocale(locale.LC_ALL, 'en_US.UTF-8') + return str(locale.format('%d', val, True)) + + except Exception: + pass + + elif isinstance(val, float): + return_val = None + try: + # For python 2.7 + return_val = format(val, ',f') + + except Exception: + try: + # For python 2.6 + locale.setlocale(locale.LC_ALL, 'en_US.UTF-8') + return_val = locale.format('%f', val, True) + + except Exception: + pass + + if return_val is not None: + return_val = str(return_val) + if '.' 
in return_val: + return_val = return_val.rstrip('0') + return_val = return_val.rstrip('.') + return return_val + + elif isinstance(val, str) and val.isdigit(): + return CliView.format_value(int(val)) + + elif isinstance(val, str): + try: + val = float(val) + return CliView.format_value(val) + except Exception: + pass + + return val + + @staticmethod + def get_kv_msg_list(kv_list): + if not kv_list: + return [] + + res_str = [] + for kv in kv_list: + if not isinstance(kv, tuple): + res_str.append(str(kv)) + continue + + tmp_res_str = str(kv[0]) + if kv[1] and isinstance(kv[1], list): + _str = None + for _kv in kv[1]: + if _kv: + try: + _str += ", " + ("%s:"%(str(_kv[0])) if len(str(_kv[0]).strip())>0 else "") + "%s"%(CliView.format_value(_kv[1], _kv[2])) + except Exception: + _str = ("%s:"%(str(_kv[0])) if len(str(_kv[0]).strip())>0 else "") + "%s"%(CliView.format_value(_kv[1], _kv[2])) + + if _str: + tmp_res_str += " {%s}"%(_str) + + if tmp_res_str: + res_str.append(tmp_res_str) + + return res_str + @staticmethod def get_error_string(data, verbose=False, level=AssertLevel.CRITICAL): if not data: @@ -1505,12 +1605,12 @@ def get_error_string(data, verbose=False, level=AssertLevel.CRITICAL): if d[AssertResultKey.SUCCESS_MSG]: s_msg_str += CliView.get_header(d[AssertResultKey.CATEGORY][0]) + \ - CliView.get_msg([d[AssertResultKey.SUCCESS_MSG]]) + CliView.get_msg([d[AssertResultKey.SUCCESS_MSG]]) s_msg_cnt += 1 - continue; + continue s += CliView.get_header(d[AssertResultKey.CATEGORY][0]) + \ - CliView.get_msg([d[AssertResultKey.FAIL_MSG]], level) + CliView.get_msg([d[AssertResultKey.FAIL_MSG]], level) if verbose: import textwrap @@ -1521,7 +1621,7 @@ def get_error_string(data, verbose=False, level=AssertLevel.CRITICAL): s += "\n" s += CliView.get_header("Keys:") - s += CliView.get_msg(d[AssertResultKey.KEYS]) + s += CliView.get_msg(CliView.get_kv_msg_list(d[AssertResultKey.KEYS])) # Extra new line in case verbose output is printed s += "\n" @@ -1536,8 +1636,8 @@ def 
get_error_string(data, verbose=False, level=AssertLevel.CRITICAL): res_success_msg_str = "" if s_msg_cnt > 0: - #res_success_msg_str = "\n\n" - #res_success_msg_str += (".".join(data[0] + # res_success_msg_str = "\n\n" + # res_success_msg_str += (".".join(data[0] # [AssertResultKey.CATEGORY]) + ":").ljust(25) + "" res_success_msg_str += s_msg_str @@ -1651,8 +1751,6 @@ def print_assert_summary(assert_out, verbose=False, output_filter_category=[], o print "\n\n" + terminal.bold() + str(" %s: count(%d) " %("FAIL", all_fail_cnt)).center(H_width, "_") + terminal.unbold() print all_fail_str - - print "_" * H_width + "\n" @staticmethod @@ -1691,11 +1789,122 @@ def get_summary_line_prefix(index, key): return s @staticmethod - def print_summary(summary): + def _summary_namespace_table_view(stats, **ignore): + title = "Namespaces" + column_names = ('namespace', ('_devices', 'Devices (Total,Per-Node)'), ('_memory', 'Memory (Total,Used%,Avail%)'), + ('_disk', 'Disk (Total,Used%,Avail%)'), ('repl_factor', 'Replication Factor'), ('cache_read_pct','Post-Write-Queue Hit-Rate'), + ('master_objects', 'Master Objects'), + ('license_data_in_memory', 'Usage In-Memory'), ('license_data_on_disk', 'Usage On-Disk') + ) + + + t = Table(title, column_names, sort_by=0) + + t.add_cell_alert( + 'namespace', + lambda data: data['migrations_in_progress'], + color=terminal.fg_red + ) + + t.add_data_source_tuple( + '_devices', + lambda data:str(data['devices_total']), + lambda data:str(data['devices_per_node'])) + + t.add_data_source_tuple( + '_memory', + Extractors.byte_extractor('memory_total'), + lambda data:"%.2f"%data["memory_used_pct"], + lambda data:"%.2f"%data["memory_available_pct"]) + + t.add_data_source_tuple( + '_disk', + Extractors.byte_extractor('disk_total'), + lambda data:"%.2f"%data["disk_used_pct"], + lambda data:"%.2f"%data["disk_available_pct"]) + + t.add_data_source( + 'repl_factor', + lambda data:",".join([str(rf) for rf in data["repl_factor"]]) + ) + + t.add_data_source( + 
'master_objects', + Extractors.sif_extractor('master_objects') + ) + + t.add_data_source( + 'license_data_in_memory', + Extractors.byte_extractor('license_data_in_memory') + ) + + t.add_data_source( + 'license_data_on_disk', + Extractors.byte_extractor('license_data_on_disk') + ) + + for ns, ns_stats in stats.iteritems(): + if isinstance(ns_stats, Exception): + row = {} + else: + row = ns_stats + + row['namespace'] = ns + row['memory_used_pct'] = 100.00 - row['memory_available_pct'] + + t.insert_row(row) + + CliView.print_result(t) + + @staticmethod + def _summary_namespace_list_view(stats, **ignore): + print "Namespaces" + print "==========" + print + for ns in stats: + index = 1 + print " " + ("%s"%(terminal.fg_red() + ns + terminal.fg_clear()) + if stats[ns]["migrations_in_progress"] else ns) + print " " + "=" * len(ns) + + print CliView.get_summary_line_prefix(index, "Devices") + "Total %d, per-node %d"%(stats[ns]["devices_total"], stats[ns]["devices_per_node"]) + index += 1 + + print CliView.get_summary_line_prefix(index, "Memory") + "%s, %.2f%% available"%(filesize.size(stats[ns]["memory_total"]),stats[ns]["memory_available_pct"]) + index += 1 + + if stats[ns]["disk_total"]: + print CliView.get_summary_line_prefix(index, "Disk") + "%s, %.2f%% used, %.2f%% available"%(filesize.size(stats[ns]["disk_total"]), stats[ns]["disk_used_pct"], stats[ns]["disk_available_pct"]) + index += 1 + + print CliView.get_summary_line_prefix(index, "Replication Factor") + "%s"%(",".join([str(rf) for rf in stats[ns]["repl_factor"]])) + index += 1 + + if "cache_read_pct" in stats[ns]: + print CliView.get_summary_line_prefix(index, "Post-Write-Queue Hit-Rate") + "%s"%(filesize.size(stats[ns]["cache_read_pct"], filesize.sif)) + index += 1 + + print CliView.get_summary_line_prefix(index, "Master Objects") + "%s"%(filesize.size(stats[ns]["master_objects"], filesize.sif)) + index += 1 + s = "" + + if "license_data_in_memory" in stats[ns]: + s += "%s 
in-memory"%(filesize.size(stats[ns]["license_data_in_memory"])) + + if "license_data_on_disk" in stats[ns]: + if s: + s += ", " + s += "%s on-disk"%(filesize.size(stats[ns]["license_data_on_disk"])) + print CliView.get_summary_line_prefix(index, "Usage") + s + print + + @staticmethod + def print_summary(summary, list_view=True): index = 1 - print "Cluster" - print "=======" + print "Cluster" + (" (%s)"%(terminal.fg_red() + "Migrations in Progress" + terminal.fg_clear()) + if summary["CLUSTER"]["migrations_in_progress"] else "") + print "=======" + ("==========================" if summary["CLUSTER"]["migrations_in_progress"] else "") print print CliView.get_summary_line_prefix(index, "Server Version") + ", ".join(summary["CLUSTER"]["server_version"]) index += 1 @@ -1705,47 +1914,23 @@ def print_summary(summary): index += 1 print CliView.get_summary_line_prefix(index, "Devices") + "Total %d, per-node %d"%(summary["CLUSTER"]["device"]["count"], summary["CLUSTER"]["device"]["count_per_node"]) index += 1 - print CliView.get_summary_line_prefix(index, "Memory") + "%s, %.2f%% available"%(filesize.size(summary["CLUSTER"]["memory"]["total"]),summary["CLUSTER"]["memory"]["aval_pct"]) + print CliView.get_summary_line_prefix(index, "Memory") + "%s, %.2f%% used, %.2f%% available"%(filesize.size(summary["CLUSTER"]["memory"]["total"]), 100.00-summary["CLUSTER"]["memory"]["aval_pct"], summary["CLUSTER"]["memory"]["aval_pct"]) index += 1 print CliView.get_summary_line_prefix(index, "Disk") + "%s, %.2f%% used, %.2f%% available"%(filesize.size(summary["CLUSTER"]["device"]["total"]), summary["CLUSTER"]["device"]["used_pct"],summary["CLUSTER"]["device"]["aval_pct"]) index += 1 - print CliView.get_summary_line_prefix(index, "License Data") + "%s in-memory, %s on-disk"%(filesize.size(summary["CLUSTER"]["license_data"]["memory_size"]),filesize.size(summary["CLUSTER"]["license_data"]["device_size"])) + print CliView.get_summary_line_prefix(index, "Usage") + "%s in-memory, %s 
on-disk"%(filesize.size(summary["CLUSTER"]["license_data"]["memory_size"]),filesize.size(summary["CLUSTER"]["license_data"]["device_size"])) index += 1 - print CliView.get_summary_line_prefix(index, "Active Namespaces") + "%d"%(summary["CLUSTER"]["active_ns"]) + print CliView.get_summary_line_prefix(index, "Active Namespaces") + "%d of %d"%(summary["CLUSTER"]["active_ns"], summary["CLUSTER"]["ns_count"]) index += 1 print CliView.get_summary_line_prefix(index, "Features") + ", ".join(sorted(summary["CLUSTER"]["active_features"])) print "\n" - print "Namespaces" - print "==========" - print - for ns in summary["FEATURES"]["NAMESPACE"]: - index = 1 - print " " + ns - print " " + "=" * len(ns) - - print CliView.get_summary_line_prefix(index, "Devices") + "Total %d, per-node %d"%(summary["FEATURES"]["NAMESPACE"][ns]["device"]["count"], summary["FEATURES"]["NAMESPACE"][ns]["device"]["count_per_node"]) - index += 1 - print CliView.get_summary_line_prefix(index, "Memory") + "%s, %.2f%% available"%(filesize.size(summary["FEATURES"]["NAMESPACE"][ns]["memory"]["total"]),summary["FEATURES"]["NAMESPACE"][ns]["memory"]["aval_pct"]) - index += 1 - if summary["FEATURES"]["NAMESPACE"][ns]["device"]["total"]: - print CliView.get_summary_line_prefix(index, "Disk") + "%s, %.2f%% used, %.2f%% available"%(filesize.size(summary["FEATURES"]["NAMESPACE"][ns]["device"]["total"]), summary["FEATURES"]["NAMESPACE"][ns]["device"]["used_pct"],summary["FEATURES"]["NAMESPACE"][ns]["device"]["aval_pct"]) - index += 1 - print CliView.get_summary_line_prefix(index, "Replication Factor") + "%s"%(",".join([str(rf) for rf in summary["FEATURES"]["NAMESPACE"][ns]["repl_factor"]])) - index += 1 - print CliView.get_summary_line_prefix(index, "Master Objects") + "%s"%(filesize.size(summary["FEATURES"]["NAMESPACE"][ns]["master_objects"], filesize.sif)) - index += 1 - s = "" - if "memory_size" in summary["FEATURES"]["NAMESPACE"][ns]["license_data"]: - s += "%s 
in-memory"%(filesize.size(summary["FEATURES"]["NAMESPACE"][ns]["license_data"]["memory_size"])) + if list_view: + CliView._summary_namespace_list_view(summary["FEATURES"]["NAMESPACE"]) - if "device_size" in summary["FEATURES"]["NAMESPACE"][ns]["license_data"]: - if s: - s += ", " - s += "%s on-disk"%(filesize.size(summary["FEATURES"]["NAMESPACE"][ns]["license_data"]["device_size"])) - print CliView.get_summary_line_prefix(index, "License Data") + s - print + else: + CliView._summary_namespace_table_view(summary["FEATURES"]["NAMESPACE"]) @staticmethod def show_pmap(pmap_data, cluster, title_suffix="", **ignore): diff --git a/test/e2e/test_info.py b/test/e2e/test_info.py index 88ed6033..7aa8b23a 100644 --- a/test/e2e/test_info.py +++ b/test/e2e/test_info.py @@ -25,8 +25,8 @@ class TestInfo(unittest.TestCase): output_list = list() service_info = '' network_info = '' - namespace_info = '' - object_info = '' + namespace_usage_info = '' + namespace_object_info = '' sindex_info = '' xdr_info = '' @@ -36,18 +36,17 @@ def setUpClass(cls): actual_out = util.capture_stdout(TestInfo.rc.execute, ['info']) TestInfo.output_list = test_util.get_separate_output(actual_out, 'Information') # TestInfo.output_list.append(util.capture_stdout(TestInfo.rc.execute, ['info', 'sindex'])) - TestInfo.output_list.append(util.capture_stdout(TestInfo.rc.execute, ['info', 'object'])) for item in TestInfo.output_list: if "~~Network Information~~" in item: TestInfo.network_info = item - elif "~~Namespace Information~~" in item: - TestInfo.namespace_info = item + elif "~~Namespace Usage Information~~" in item: + TestInfo.namespace_usage_info = item elif "~~Secondary Index Information~~" in item: TestInfo.sindex_info = item elif "~~XDR Information~~" in item: TestInfo.xdr_info = item - elif "~~Object Information~~" in item: - TestInfo.object_info = item + elif "~~Namespace Object Information~~" in item: + TestInfo.namespace_object_info = item @classmethod def tearDownClass(self): @@ -102,18 +101,17 @@ 
def test_sindex(self): self.assertTrue(exp_heading in actual_heading) self.assertEqual(exp_header, actual_header) - def test_namespace(self): + def test_namespace_usage(self): """ - This test will assert info Namespace output for heading, headerline1, headerline2 + This test will assert info namespace usage output for heading, headerline1, headerline2 displayed in output TODO: test for values as well """ - exp_heading = "~~Namespace Information~~" + exp_heading = "~~Namespace Usage Information~~" exp_header = [ 'Node', 'Namespace', - 'Evictions', - 'Expirations', - 'Repl Factor', + 'Total Records', + 'Expirations,Evictions', 'Stop Writes', 'Disk Used', 'Disk Used%', @@ -122,35 +120,34 @@ def test_namespace(self): 'Mem Used%', 'HWM Mem%', 'Stop Writes%', - 'Total Objects', - 'Pending Migrates', - ('Rack ID', None) ] - actual_heading, actual_header, actual_no_of_rows = test_util.parse_output(TestInfo.namespace_info, horizontal = True) + actual_heading, actual_header, actual_no_of_rows = test_util.parse_output(TestInfo.namespace_usage_info, horizontal = True) self.assertTrue(test_util.check_for_subset(actual_header, exp_header)) self.assertTrue(exp_heading in actual_heading) - def test_object(self): + def test_namespace_object(self): """ - This test will assert info Object output for heading, headerline1, headerline2 + This test will assert info namespace Object output for heading, headerline1, headerline2 displayed in output TODO: test for values as well """ - exp_heading = "~~Object Information~~" + exp_heading = "~~Namespace Object Information~~" exp_header = [ 'Namespace', 'Node', - 'Master (Objects,Tombstones)', - 'Replica (Objects,Tombstones)', - 'Non-Replica (Objects,Tombstones)', - 'Migration' + 'Total Records', + 'Repl Factor', + 'Objects (Master,Prole,Non-Replica)', + 'Tombstones (Master,Prole,Non-Replica)', + 'Pending Migrates', + ('Rack ID', None) ] - actual_heading, actual_header, actual_no_of_rows = test_util.parse_output(TestInfo.object_info, horizontal 
= True) + actual_heading, actual_header, actual_no_of_rows = test_util.parse_output(TestInfo.namespace_object_info, horizontal = True) self.assertTrue(test_util.check_for_subset(actual_header, exp_header)) self.assertTrue(exp_heading in actual_heading) - @unittest.skip("Will enable only when xdr is configuired") + @unittest.skip("Will enable only when xdr is configured") def test_xdr(self): """ This test will assert info Namespace output for heading, headerline1, headerline2 diff --git a/test/e2e/test_show.py b/test/e2e/test_show.py index 55191cc0..d9bcd837 100644 --- a/test/e2e/test_show.py +++ b/test/e2e/test_show.py @@ -29,37 +29,37 @@ class TestShowConfig(unittest.TestCase): test_namespace_config = '' bar_namespace_config = '' xdr_config = '' - + @classmethod def setUpClass(cls): rc = controller.BasicRootController() actual_out = util.capture_stdout(rc.execute, ['show', 'config']) TestShowConfig.output_list = test_util.get_separate_output(actual_out, 'Configuration') TestShowConfig.is_bar_present = False - + for item in TestShowConfig.output_list: if "~~Service Configuration~~" in item: - TestShowConfig.service_config = item + TestShowConfig.service_config = item elif "~~Network Configuration~~" in item: - TestShowConfig.network_config = item + TestShowConfig.network_config = item elif "~~test Namespace Configuration~~" in item: - TestShowConfig.test_namespace_config = item + TestShowConfig.test_namespace_config = item elif "~~bar Namespace Configuration~~" in item: TestShowConfig.bar_namespace_config = item TestShowConfig.is_bar_present = True elif "~~XDR Configuration~~" in item: TestShowConfig.xdr_config = item - - @classmethod + + @classmethod def tearDownClass(self): self.rc = None - + def test_network(self): """ This test will assert network output on heading, header, parameters. 
TODO: test for values as well """ - + exp_heading = "~~Network Configuration~~" exp_header = "NODE" exp_params = [('fabric-keepalive-enabled', 'fabric.keepalive-enabled'), @@ -67,7 +67,7 @@ def test_network(self): ('fabric-keepalive-probes', 'fabric.keepalive-probes'), ('fabric-keepalive-time', 'fabric.keepalive-time'), ('fabric-port', 'fabric.port'), - ('heartbeat-address', 'heartbeat.address', 'heartbeat.addresses'), + ('heartbeat-address', 'heartbeat.address', 'heartbeat.addresses', None), ('heartbeat-interval', 'heartbeat.interval'), ('heartbeat-mode', 'heartbeat.mode'), ('heartbeat-port', 'heartbeat.port', None), @@ -79,7 +79,7 @@ def test_network(self): ('service-port','service.port')] actual_heading, actual_header, actual_params = test_util.parse_output(TestShowConfig.network_config) - + self.assertTrue(exp_heading in actual_heading) self.assertTrue(exp_header in actual_header) self.assertTrue(test_util.check_for_subset(actual_params, exp_params)) @@ -89,7 +89,7 @@ def test_service(self): Asserts service config output with heading, header & parameters. 
TODO: test for values as well """ - + exp_heading = "~~Service Configuration~~" exp_header = "NODE" exp_params = [ ('allow-inline-transactions', None), @@ -98,7 +98,7 @@ def test_service(self): 'batch-threads', ('fabric-workers', None), 'info-threads', - 'ldt-benchmarks', + ('ldt-benchmarks', None), 'max-msgs-per-type', ('memory-accounting', None), ('microbenchmarks', None), @@ -108,10 +108,10 @@ def test_service(self): 'nsup-delete-sleep', 'nsup-period', 'nsup-startup-evict', - 'paxos-max-cluster-size', - 'paxos-protocol', + ('paxos-max-cluster-size', None), + ('paxos-protocol', None), ('paxos-recovery-policy', None), - 'paxos-retransmit-period', + ('paxos-retransmit-period', None), 'paxos-single-replica-limit', 'prole-extra-ttl', 'proto-fd-idle-ms', @@ -130,7 +130,7 @@ def test_service(self): 'query-threshold', 'query-worker-threads', ('replication-fire-and-forget', None), - 'respond-client-on-master-completion', + ('respond-client-on-master-completion', None), 'service-threads', ('sindex-data-max-memory', None), ('snub-nodes', None), @@ -139,16 +139,16 @@ def test_service(self): 'transaction-max-ms', 'transaction-pending-limit', 'transaction-queues', - 'transaction-repeatable-read', + ('transaction-repeatable-read', None), 'transaction-retry-ms', 'transaction-threads-per-queue', ('udf-runtime-gmax-memory', None), ('udf-runtime-max-memory', None), ('use-queue-per-device', None), - 'write-duplicate-resolution-disable', + ('write-duplicate-resolution-disable', None) ] - actual_heading, actual_header, actual_params = test_util.parse_output(TestShowConfig.service_config) + actual_heading, actual_header, actual_params = test_util.parse_output(TestShowConfig.service_config) self.assertTrue(exp_heading in actual_heading) self.assertTrue(exp_header in actual_header) @@ -159,7 +159,7 @@ def test_test_namespace(self): Asserts namespace config output with heading, header & parameters. 
TODO: test for values as well """ - + exp_heading = "~~test Namespace Configuration~~" exp_header = "NODE" exp_params_test = [ 'allow-nonxdr-writes', @@ -172,8 +172,8 @@ def test_test_namespace(self): 'evict-tenths-pct', 'high-water-disk-pct', 'high-water-memory-pct', - 'ldt-enabled', - 'ldt-page-size', + ('ldt-enabled', None), + ('ldt-page-size', None), 'max-ttl', 'memory-size', 'ns-forward-xdr-writes', @@ -184,10 +184,10 @@ def test_test_namespace(self): 'stop-writes-pct', ('total-bytes-memory', None), 'write-commit-level-override' - ] - - actual_heading, actual_header, actual_params = test_util.parse_output(TestShowConfig.test_namespace_config) - + ] + + actual_heading, actual_header, actual_params = test_util.parse_output(TestShowConfig.test_namespace_config) + self.assertTrue(exp_heading in actual_heading) self.assertTrue(exp_header in actual_header) self.assertTrue(test_util.check_for_subset(actual_params, exp_params_test)) @@ -199,7 +199,7 @@ def test_bar_namespace(self): """ if not TestShowConfig.is_bar_present: return - + exp_heading = "~~bar Namespace Configuration~~" exp_header = "NODE" exp_params_bar = [ 'allow-nonxdr-writes', @@ -212,8 +212,8 @@ def test_bar_namespace(self): 'evict-tenths-pct', 'high-water-disk-pct', 'high-water-memory-pct', - 'ldt-enabled', - 'ldt-page-size', + ('ldt-enabled', None), + ('ldt-page-size', None), 'max-ttl', 'memory-size', 'ns-forward-xdr-writes', @@ -224,15 +224,15 @@ def test_bar_namespace(self): 'stop-writes-pct', ('total-bytes-memory', None), 'write-commit-level-override' - ] - - actual_heading, actual_header, actual_params = test_util.parse_output(TestShowConfig.bar_namespace_config) - + ] + + actual_heading, actual_header, actual_params = test_util.parse_output(TestShowConfig.bar_namespace_config) + self.assertTrue(exp_heading in actual_heading) self.assertTrue(exp_header in actual_header) self.assertTrue(test_util.check_for_subset(actual_params, exp_params_bar)) - @unittest.skip("Will enable only when xdr is 
configuired") + @unittest.skip("Will enable only when xdr is configuired") def test_xdr(self): """ Asserts XDR config output with heading, header & parameters. @@ -259,10 +259,10 @@ def test_xdr(self): 'xdr-shipping-enabled', 'xdr-timeout', 'xdr-write-batch-size' - ] - - actual_heading, actual_header, actual_params = test_util.parse_output(TestShowConfig.xdr_config) - + ] + + actual_heading, actual_header, actual_params = test_util.parse_output(TestShowConfig.xdr_config) + self.assertTrue(exp_heading in actual_heading) self.assertTrue(exp_header in actual_header) self.assertTrue(set(exp_params).issubset(set(actual_params))) @@ -282,42 +282,42 @@ def setUpClass(cls): TestShowLatency.rc = controller.BasicRootController() actual_out = util.capture_stdout(TestShowLatency.rc.execute, ['show', 'latency']) TestShowLatency.output_list = test_util.get_separate_output(actual_out, 'Latency') - + for item in TestShowLatency.output_list: if "~~~proxy Latency~~" in item: - TestShowLatency.proxy_latency = item + TestShowLatency.proxy_latency = item elif "~~query Latency~~" in item: - TestShowLatency.query_latency = item + TestShowLatency.query_latency = item elif "~~reads Latency~~" in item or "~~read Latency~~" in item: - TestShowLatency.reads_latency = item + TestShowLatency.reads_latency = item elif "~~udf Latency~~" in item: - TestShowLatency.udf_latency = item + TestShowLatency.udf_latency = item elif "~~writes_master Latency~~" in item: TestShowLatency.writes_master_latency = item elif "~~writes_reply Latency~~" in item: TestShowLatency.writes_reply_latency = item elif "~~write Latency~~" in item: TestShowLatency.write_latency = item - - @classmethod + + @classmethod def tearDownClass(self): self.rc = None - def test_proxy_latency(self): + def test_proxy_latency(self): """ Asserts proxy latency output with heading, header & no of node processed(based on row count). 
TODO: test for values as well """ exp_heading = "~~proxy Latency~~" - exp_header= ['Node', - 'Time Span', - 'Ops/Sec', - '>1Ms', - '>8Ms', - '>64Ms'] + exp_header= ['Node', + 'Time Span', + 'Ops/Sec', + '>1Ms', + '>8Ms', + '>64Ms'] exp_no_of_rows = len(TestShowLatency.rc.cluster._live_nodes) - - actual_heading, actual_header, actual_no_of_rows = test_util.parse_output(TestShowLatency.proxy_latency, horizontal = True) + + actual_heading, actual_header, actual_no_of_rows = test_util.parse_output(TestShowLatency.proxy_latency, horizontal = True) if actual_heading: self.assertTrue(exp_heading in actual_heading) @@ -328,23 +328,23 @@ def test_proxy_latency(self): if actual_no_of_rows: self.assertEqual(exp_no_of_rows, int(actual_no_of_rows.strip())) - def test_query_latency(self): + def test_query_latency(self): """ Asserts query latency output with heading, header & no of node processed(based on row count). TODO: test for values as well """ exp_heading = "~~query Latency~~" - exp_header= ['Node', - 'Time Span', - 'Ops/Sec', - '>1Ms', - '>8Ms', - '>64Ms'] - + exp_header= ['Node', + 'Time Span', + 'Ops/Sec', + '>1Ms', + '>8Ms', + '>64Ms'] + exp_no_of_rows = len(TestShowLatency.rc.cluster._live_nodes) - - actual_heading, actual_header, actual_no_of_rows = test_util.parse_output(TestShowLatency.query_latency, horizontal = True) - + + actual_heading, actual_header, actual_no_of_rows = test_util.parse_output(TestShowLatency.query_latency, horizontal = True) + if actual_heading: self.assertTrue(exp_heading in actual_heading) @@ -354,23 +354,23 @@ def test_query_latency(self): if actual_no_of_rows: self.assertEqual(exp_no_of_rows, int(actual_no_of_rows.strip())) - def test_reads_latency(self): + def test_reads_latency(self): """ Asserts reads latency output with heading, header & no of node processed(based on row count). 
TODO: test for values as well """ exp_heading = [("~~reads Latency~~", "~~read Latency~~")] - exp_header= ['Node', - 'Time Span', - 'Ops/Sec', - '>1Ms', - '>8Ms', - '>64Ms'] - + exp_header= ['Node', + 'Time Span', + 'Ops/Sec', + '>1Ms', + '>8Ms', + '>64Ms'] + exp_no_of_rows = len(TestShowLatency.rc.cluster._live_nodes) - - actual_heading, actual_header, actual_no_of_rows = test_util.parse_output(TestShowLatency.reads_latency, horizontal = True) - + + actual_heading, actual_header, actual_no_of_rows = test_util.parse_output(TestShowLatency.reads_latency, horizontal = True) + if actual_heading: self.assertTrue(test_util.check_for_subset(actual_heading, exp_heading)) @@ -380,23 +380,23 @@ def test_reads_latency(self): if actual_no_of_rows: self.assertEqual(exp_no_of_rows, int(actual_no_of_rows.strip())) - def test_udf_latency(self): + def test_udf_latency(self): """ Asserts udf latency output with heading, header & no of node processed(based on row count). TODO: test for values as well """ exp_heading = "~~udf Latency~~" - exp_header= ['Node', - 'Time Span', - 'Ops/Sec', - '>1Ms', - '>8Ms', - '>64Ms'] - + exp_header= ['Node', + 'Time Span', + 'Ops/Sec', + '>1Ms', + '>8Ms', + '>64Ms'] + exp_no_of_rows = len(TestShowLatency.rc.cluster._live_nodes) - - actual_heading, actual_header, actual_no_of_rows = test_util.parse_output(TestShowLatency.udf_latency, horizontal = True) - + + actual_heading, actual_header, actual_no_of_rows = test_util.parse_output(TestShowLatency.udf_latency, horizontal = True) + if actual_heading: self.assertTrue(exp_heading in actual_heading) @@ -406,23 +406,23 @@ def test_udf_latency(self): if actual_no_of_rows: self.assertEqual(exp_no_of_rows, int(actual_no_of_rows.strip())) - def test_writes_master_latency(self): + def test_writes_master_latency(self): """ Asserts writes_master latency output with heading, header & no of node processed(based on row count). 
TODO: test for values as well """ exp_heading = "~~writes_master Latency~~" - exp_header= ['Node', - 'Time Span', - 'Ops/Sec', - '>1Ms', - '>8Ms', - '>64Ms'] - + exp_header= ['Node', + 'Time Span', + 'Ops/Sec', + '>1Ms', + '>8Ms', + '>64Ms'] + exp_no_of_rows = len(TestShowLatency.rc.cluster._live_nodes) - + actual_heading, actual_header, actual_no_of_rows = test_util.parse_output(TestShowLatency.writes_master_latency, horizontal = True) - + if actual_heading: self.assertTrue(exp_heading in actual_heading) @@ -462,30 +462,30 @@ class TestShowDistribution(unittest.TestCase): output_list = list() test_ttl_distri = '' bar_ttl_distri = '' - + @classmethod def setUpClass(cls): rc = controller.BasicRootController() actual_out = util.capture_stdout(rc.execute, ['show', 'distribution']) - # use regex in get_separate_output(~.+Distribution.*~.+) + # use regex in get_separate_output(~.+Distribution.*~.+) #if you are changing below Distribution keyword TestShowDistribution.output_list = test_util.get_separate_output(actual_out, 'Distribution in Seconds') TestShowDistribution.is_bar_present = False - + for item in TestShowDistribution.output_list: if "~~test - TTL Distribution in Seconds~~" in item: - TestShowDistribution.test_ttl_distri = item + TestShowDistribution.test_ttl_distri = item elif "~~bar - TTL Distribution in Seconds~~" in item: TestShowDistribution.bar_ttl_distri = item TestShowDistribution.is_bar_present = True elif "~~~~" in item: - TestShowDistribution.test_namespace_config = item - - - @classmethod + TestShowDistribution.test_namespace_config = item + + + @classmethod def tearDownClass(self): self.rc = None - + def test_test_ttl(self): """ Asserts TTL Distribution in Seconds for test namespace with heading, header & parameters. 
@@ -495,7 +495,7 @@ def test_test_ttl(self): exp_header = """Percentage of records having ttl less than or equal to value measured in Seconds Node 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%""" - + actual_heading, actual_header, actual_params = test_util.parse_output(TestShowDistribution.test_ttl_distri, horizontal=True, mearge_header = False) if 'Node' not in actual_header: actual_header += ' ' + TestShowDistribution.test_ttl_distri.split('\n')[3] @@ -517,7 +517,7 @@ def test_bar_ttl(self): exp_header = """Percentage of records having ttl less than or equal to value measured in Seconds Node 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%""" - + actual_heading, actual_header, actual_params = test_util.parse_output(TestShowDistribution.bar_ttl_distri, horizontal=True, mearge_header = False) if 'Node' not in actual_header: actual_header += ' ' + TestShowDistribution.bar_ttl_distri.split('\n')[3] @@ -536,22 +536,22 @@ class TestShowStatistics(unittest.TestCase): bar_namespace_stats = '' test_namespace_stats = '' xdr_stats = '' - + @classmethod def setUpClass(cls): rc = controller.BasicRootController() actual_out = util.capture_stdout(rc.execute, ['show', 'statistics']) TestShowStatistics.output_list = test_util.get_separate_output(actual_out, 'Statistics') TestShowStatistics.is_bar_present = False - + for item in TestShowStatistics.output_list: if "~~test Bin Statistics~~" in item: - TestShowStatistics.test_bin_stats = item + TestShowStatistics.test_bin_stats = item elif "~~bar Bin Statistics~~" in item: TestShowStatistics.bar_bin_stats = item TestShowStatistics.is_bar_present = True elif "~~Service Statistics~~" in item: - TestShowStatistics.service_stats = item + TestShowStatistics.service_stats = item elif "~~bar Namespace Statistics~~" in item: TestShowStatistics.bar_namespace_stats = item TestShowStatistics.is_bar_present = True @@ -559,11 +559,11 @@ def setUpClass(cls): TestShowStatistics.test_namespace_stats = item elif "~~XDR Statistics~~" in item: 
TestShowStatistics.xdr_stats = item - - @classmethod + + @classmethod def tearDownClass(self): self.rc = None - + def test_test_bin(self): """ This test will assert test Bin Statistics output for heading, header and parameters. @@ -572,13 +572,13 @@ def test_test_bin(self): exp_heading = "~test Bin Statistics~" exp_header = "NODE" exp_params = [('bin-names-quota','bin_names_quota'), ('num-bin-names','bin_names')] - + actual_heading, actual_header, actual_params = test_util.parse_output(TestShowStatistics.test_bin_stats) - + self.assertTrue(exp_heading in actual_heading) self.assertTrue(exp_header in actual_header) self.assertTrue(test_util.check_for_subset(actual_params, exp_params)) - + def test_bar_bin(self): """ This test will assert bar Bin Statistics output for heading, header and parameters. @@ -589,13 +589,13 @@ def test_bar_bin(self): exp_heading = "~bar Bin Statistics~" exp_header = "NODE" exp_params = [('bin-names-quota','bin_names_quota'), ('num-bin-names','bin_names')] - + actual_heading, actual_header, actual_params = test_util.parse_output(TestShowStatistics.bar_bin_stats) - + self.assertTrue(exp_heading in actual_heading) self.assertTrue(exp_header in actual_header) self.assertTrue(test_util.check_for_subset(actual_params, exp_params)) - + def test_service(self): """ This test will assert Service Statistics output for heading, header and parameters. 
@@ -698,10 +698,10 @@ def test_service(self): ('rw_err_write_internal', None), ('rw_err_write_send', None), ('sindex-used-bytes-memory', None), - 'sindex_gc_activity_dur', + ('sindex_gc_activity_dur', None), 'sindex_gc_garbage_cleaned', 'sindex_gc_garbage_found', - 'sindex_gc_inactivity_dur', + ('sindex_gc_inactivity_dur', None), 'sindex_gc_list_creation_time', 'sindex_gc_list_deletion_time', 'sindex_gc_locktimedout', @@ -734,7 +734,7 @@ def test_service(self): ('stat_write_success', None), ('stat_zero_bin_records', None), ('storage_defrag_corrupt_record', None), - ('sub-records', 'sub_objects'), + ('sub-records', 'sub_objects', None), 'system_free_mem_pct', 'system_swapping', ('total-bytes-disk', None), @@ -766,7 +766,7 @@ def test_service(self): self.assertTrue(exp_heading in actual_heading) self.assertTrue(exp_header in actual_header) self.assertTrue(test_util.check_for_subset(actual_params, exp_params)) - + def test_bar_namespace(self): """ This test will assert bar Namespace Statistics output for heading, header and parameters. 
@@ -794,10 +794,10 @@ def test_bar_namespace(self): 'high-water-memory-pct', ('hwm-breached','hwm_breached'), ('index-used-bytes-memory','memory_used_index_bytes'), - 'ldt-enabled', - 'ldt-page-size', + ('ldt-enabled', None), + ('ldt-page-size', None), ('master-objects','master_objects'), - ('master-sub-objects','master_sub_objects'), + ('master-sub-objects','master_sub_objects', None), 'max-ttl', ('max-void-time','max_void_time', None), 'memory-size', @@ -812,24 +812,24 @@ def test_bar_namespace(self): ('nsup-cycle-sleep-pct','nsup_cycle_sleep_pct'), 'objects', ('prole-objects','prole_objects'), - ('prole-sub-objects','prole_sub_objects'), + ('prole-sub-objects','prole_sub_objects', None), 'read-consistency-level-override', 'repl-factor', - ('set-deleted-objects','set_deleted_objects'), + ('set-deleted-objects','set_deleted_objects', None), 'sets-enable-xdr', ('sindex-used-bytes-memory','memory_used_sindex_bytes'), 'single-bin', ('stop-writes','stop_writes'), 'stop-writes-pct', - ('sub-objects','sub_objects'), + ('sub-objects','sub_objects', None), ('total-bytes-memory',None), ('type',None), ('used-bytes-memory','memory_used_bytes'), 'write-commit-level-override', ] - + actual_heading, actual_header, actual_params = test_util.parse_output(TestShowStatistics.bar_namespace_stats) - + self.assertTrue(exp_heading in actual_heading) self.assertTrue(exp_header in actual_header) self.assertTrue(test_util.check_for_subset(actual_params,exp_params)) @@ -859,10 +859,10 @@ def test_test_namespace(self): 'high-water-memory-pct', ('hwm-breached', 'hwm_breached'), ('index-used-bytes-memory', 'memory_used_index_bytes'), - 'ldt-enabled', - 'ldt-page-size', + ('ldt-enabled', None), + ('ldt-page-size', None), ('master-objects', 'master_objects'), - ('master-sub-objects', 'master_sub_objects'), + ('master-sub-objects', 'master_sub_objects', None), 'max-ttl', ('max-void-time', 'max_void_time', None), 'memory-size', @@ -877,28 +877,28 @@ def test_test_namespace(self): 
('nsup-cycle-sleep-pct', 'nsup_cycle_sleep_pct'), 'objects', ('prole-objects', 'prole_objects'), - ('prole-sub-objects', 'prole_sub_objects'), + ('prole-sub-objects', 'prole_sub_objects', None), 'read-consistency-level-override', 'repl-factor', - ('set-deleted-objects', 'set_deleted_objects'), + ('set-deleted-objects', 'set_deleted_objects', None), 'sets-enable-xdr', ('sindex-used-bytes-memory', 'memory_used_sindex_bytes'), 'single-bin', ('stop-writes', 'stop_writes'), 'stop-writes-pct', - ('sub-objects', 'sub_objects'), + ('sub-objects', 'sub_objects', None), ('total-bytes-memory', None), ('type', None), ('used-bytes-memory', 'memory_used_bytes'), 'write-commit-level-override', ] - + actual_heading, actual_header, actual_params = test_util.parse_output(TestShowStatistics.test_namespace_stats) - + self.assertTrue(exp_heading in actual_heading) self.assertTrue(exp_header in actual_header) self.assertTrue(test_util.check_for_subset(actual_params, exp_params)) - + @unittest.skip("Will enable only when xdr is configuired") def test_xdr(self): """ @@ -951,7 +951,7 @@ def test_xdr(self): 'xdr_deletes_shipped', ] actual_heading, actual_header, actual_params = test_util.parse_output(TestShowStatistics.xdr_stats) - + self.assertTrue(exp_heading in actual_heading) self.assertTrue(exp_header in actual_header) self.assertTrue(set(exp_params).issubset(set(actual_params))) diff --git a/test/unit/test_controller.py b/test/unit/test_controller.py index 903e940a..e9f6384f 100644 --- a/test/unit/test_controller.py +++ b/test/unit/test_controller.py @@ -63,9 +63,16 @@ def test_info_controller(self): ic.pre_command([""]) ic.do_network(["network"]) # TODO: view.info_network needs a "real" node - ic.do_namespace(["namespace"]) ic.do_xdr(["xdr"]) + def test_info_namespace_controller(self): + inc = InfoNamespaceController() + + inc.pre_command([""]) + + inc.do_usage(["namespace usage"]) + inc.do_object(["namespace object"]) + def test_show_distribution_controller(self): sdc = 
ShowDistributionController()