From 98dbe24e904256637f9cab78efd83dbfb9370015 Mon Sep 17 00:00:00 2001 From: hbpatre Date: Wed, 24 Jan 2018 16:37:07 +0530 Subject: [PATCH] asadm improvements (#33) * TOOLS-971: (ASADM) Fix command history for Mac. * TOOLS-975: (ASADM) Provide sindex and set filter for 'show statistics' command. * TOOLS-1026: (ASADM) Fix collectinfo JSON dump error. * TOOLS-1049: (ASADM) Modify show latency columns to display % sign. * TOOLS-1052: (ASADM-HEALTHCHECK) Add more configurations in ignore list. * TOOLS-1057: (ASADM) Modify to collect system stats for offline node * TOOLS-1060: (ASADM) Use optimal lsof command line option, when running it for collectinfo. * TOOLS-1061: (ASADM) Modify collectinfo to collect nvme* drive info. * TOOLS-1064: (ASADM) Improve to show stack trace for exceptions. --- README.md | 6 + asadm.py | 42 +- lib/basiccontroller.py | 440 ++----------- lib/collectinfo/cinfolog.py | 13 +- lib/collectinfocontroller.py | 46 +- lib/getcontroller.py | 40 +- lib/health/query.py | 24 +- lib/health/query/health.hql | 24 +- lib/utils/common.py | 1135 ++++++++++++++++++++++++++++++++++ lib/utils/constants.py | 3 + lib/utils/util.py | 634 +------------------ lib/view/view.py | 16 +- test/e2e/test_show.py | 36 +- 13 files changed, 1411 insertions(+), 1048 deletions(-) create mode 100644 lib/utils/common.py diff --git a/README.md b/README.md index 7dff476d..4733ef18 100644 --- a/README.md +++ b/README.md @@ -38,6 +38,12 @@ Admin> help sudo ./asadm-deps/install.sh ``` +### Mac OSX +Run following command to ensure asadm history works properly: +``` +sudo easy_install -a readline +``` + ## Tests ### Dependencies - unittest2: 0.5.1 diff --git a/asadm.py b/asadm.py index ff120284..349f622e 100755 --- a/asadm.py +++ b/asadm.py @@ -95,7 +95,7 @@ def critical(self, msg, *args, **kwargs): from lib.client.ssl_context import SSLContext from lib.collectinfocontroller import CollectinfoRootController from lib.logcontroller import LogRootController -from lib.utils import util +from lib.utils import common, util from lib.utils.constants import ADMIN_HOME from lib.view import terminal, view @@ -114,6 +114,9 @@ def __init__(self, seed, user=None, password=None, use_services_alumni=False, us log_path="", log_analyser=False, collectinfo=False, ssl_context=None, only_connect_seed=False, execute_only_mode=False, timeout=5): + # indicates shell created successfully and connected to cluster/collectinfo/logfile + self.connected = True + self.execute_only_mode = execute_only_mode if log_analyser: @@ -165,7 +168,12 @@ def __init__(self, seed, user=None, password=None, use_services_alumni=False, us if not self.ctrl.cluster.get_live_nodes(): self.do_exit('') - logger.critical("Not able to connect any cluster.") + if self.execute_only_mode: + logger.error("Not able to connect any cluster.") + self.connected = False + return + else: + logger.critical("Not able to connect any cluster.") self.prompt = "Admin> " self.intro = "" @@ -248,7 +256,7 @@ def precmd(self, line, max_commands_to_print_header=1, if not lines: # allow empty lines return "" except Exception as e: - logger.error(str(e)) + logger.error(e) return "" for line in lines: @@ -271,7 +279,7 @@ def precmd(self, line, max_commands_to_print_header=1, if response == "EXIT": return "exit" except Exception as e: - logger.error(str(e)) + logger.error(e) return "" # line was handled by execute def _listdir(self, root): @@ -717,7 +725,7 @@ def main(): ssl_context=ssl_context, line_separator=cli_args.line_separator) exit(0) except Exception as e: - logger.error(str(e)) + 
logger.error(e) exit(1) if not execute_only_mode: @@ -750,8 +758,12 @@ def main(): single_command = True real_stdout = sys.stdout if not execute_only_mode: + if not shell.connected: + exit(1) + func = shell.cmdloop single_command = False + else: commands_arg = cli_args.execute max_commands_to_print_header = 1 @@ -771,13 +783,21 @@ def main(): except Exception as e: print e - line = shell.precmd(commands_arg, - max_commands_to_print_header=max_commands_to_print_header, - command_index_to_print_from=command_index_to_print_from) + if shell.connected: + line = shell.precmd(commands_arg, + max_commands_to_print_header=max_commands_to_print_header, + command_index_to_print_from=command_index_to_print_from) - shell.onecmd(line) - func = shell.onecmd - args = (line,) + shell.onecmd(line) + func = shell.onecmd + args = (line,) + + else: + if "collectinfo" in commands_arg: + logger.info("Collecting only System data") + func = common.collect_sys_info(port=cli_args.port) + + exit(1) cmdloop(shell, func, args, use_yappi, single_command) shell.close() diff --git a/lib/basiccontroller.py b/lib/basiccontroller.py index 68e21b08..7669f151 100644 --- a/lib/basiccontroller.py +++ b/lib/basiccontroller.py @@ -17,11 +17,8 @@ import os import platform import shutil -import socket import sys import time -import urllib2 -import zipfile from distutils.version import LooseVersion from lib.client.cluster import Cluster @@ -33,14 +30,10 @@ get_sindex_stats) from lib.health.util import (create_health_input_dict, create_snapshot_key, h_eval) -from lib.utils import util -from lib.utils.data import lsof_file_type_desc +from lib.utils import common, constants, util from lib.view import terminal from lib.view.view import CliView -aslogfile = "" -aslogdir = "" - class BasicCommandController(CommandController): cluster = None @@ -815,6 +808,8 @@ def _do_default(self, line): class CollectinfoController(BasicCommandController): def __init__(self): self.modifiers = set(['with']) + self.aslogfile = "" + self.aslogdir = "" def _collect_local_file(self, src, dest_dir): self.logger.info("Copying file %s to %s" % (src, dest_dir)) @@ -826,264 +821,45 @@ def _collect_local_file(self, src, dest_dir): def _collectinfo_content(self, func, parm='', alt_parms=''): name = '' capture_stdout = util.capture_stdout - sep = "\n====ASCOLLECTINFO====\n" + sep = constants.COLLECTINFO_SEPRATOR try: name = func.func_name except Exception: pass - info_line = "Data collection for " + name + \ - "%s" % (" %s" % (str(parm)) if parm else "") + " in progress.." + info_line = constants.COLLECTINFO_PROGRESS_MSG %(name, "%s" % (" %s" % (str(parm)) if parm else "")) self.logger.info(info_line) if parm: sep += str(parm) + "\n" - if func == 'shell': - o, e = util.shell_command(parm) - if e: - self.logger.warning(str(e)) - success = False - for alt_parm in alt_parms: - if not alt_parm: - continue - - alt_parm = [alt_parm] - info_line = "Data collection for alternative command " + \ - name + str(alt_parm) + " in progress.." 
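# The removed 'shell' branch here ran a primary system command and walked a list
# of alternatives on failure; that fallback now appears to live in
# lib/utils/common.py (used later via common.collect_sys_info). A minimal sketch
# of the pattern, assuming util.shell_command(cmd_list) returns (stdout, stderr)
# and using an illustrative helper name:
def _run_with_alternatives(cmd, alternatives, failed_cmds):
    out, err = util.shell_command([cmd])            # try the primary command first
    for alt in alternatives:
        if not err:
            break
        out, err = util.shell_command([alt])        # fall back to the next variant
    if err:
        failed_cmds.append(cmd)                     # record the command as failed
    return out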
- self.logger.info(info_line) - sep += str(alt_parm) + "\n" - o_alt, e_alt = util.shell_command(alt_parm) - - if e_alt: - e = e_alt - - else: - success = True - - if o_alt: - o = o_alt - break - - if not success: - self.cmds_error.add(parm[0]) - for alt_parm in alt_parms: - self.cmds_error.add(alt_parm) - - elif func == 'cluster': + if func == 'cluster': o = self.cluster.info(parm) else: if self.nodes and isinstance(self.nodes, list): parm += ["with"] + self.nodes o = capture_stdout(func, parm) - self._write_log(sep + str(o)) + util.write_to_file(self.aslogfile, sep + str(o)) return '' - def _write_log(self, collectedinfo): - f = open(str(aslogfile), 'a') - f.write(str(collectedinfo)) - return f.close() - def _write_version(self, line): print "asadm version " + str(self.asadm_version) - def _get_metadata(self, response_str, prefix='', old_response=''): - aws_c = '' - aws_metadata_base_url = 'http://169.254.169.254/latest/meta-data' - - # set of values which will give same old_response, so no need to go further - last_values = [] - for rsp in response_str.split("\n"): - if rsp[-1:] == '/': - rsp_p = rsp.strip('/') - aws_c += self._get_metadata(rsp_p, prefix, old_response=old_response) - else: - meta_url = aws_metadata_base_url + prefix + rsp - - req = urllib2.Request(meta_url) - r = urllib2.urlopen(req) - # r = requests.get(meta_url,timeout=aws_timeout) - if r.code != 404: - response = r.read().strip() - if response == old_response: - last_values.append(rsp.strip()) - continue - try: - aws_c += self._get_metadata(response, prefix + rsp + "/", old_response=response) - except Exception: - aws_c += (prefix + rsp).strip('/') + '\n' + response + "\n\n" - - if last_values: - aws_c += prefix.strip('/') + '\n' + '\n'.join(last_values) + "\n\n" - - return aws_c - - def _get_awsdata(self, line): - aws_rsp = '' - aws_timeout = 1 - socket.setdefaulttimeout(aws_timeout) - aws_metadata_base_url = 'http://169.254.169.254/latest/meta-data' - print "['AWS']" - try: - req = urllib2.Request(aws_metadata_base_url) - r = urllib2.urlopen(req) - # r = requests.get(aws_metadata_base_url,timeout=aws_timeout) - if r.code == 200: - rsp = r.read() - aws_rsp += self._get_metadata(rsp, '/') - print "Requesting... {0} \n{1} \t Successful".format(aws_metadata_base_url, aws_rsp) - else: - aws_rsp = " Not likely in AWS" - print "Requesting... {0} \t FAILED {1} ".format(aws_metadata_base_url, aws_rsp) - - except Exception as e: - print "Requesting... {0} \t {1} ".format(aws_metadata_base_url, e) - print "FAILED! 
Node Is Not likely In AWS" - - def _collect_sys(self, line=''): - print "['cpuinfo']" - cpu_info_cmd = 'cat /proc/cpuinfo | grep "vendor_id"' - o, e = util.shell_command([cpu_info_cmd]) - if o: - o = o.strip().split("\n") - cpu_info = {} - for item in o: - items = item.strip().split(":") - if len(items) == 2: - key = items[1].strip() - if key in cpu_info.keys(): - cpu_info[key] = cpu_info[key] + 1 - else: - cpu_info[key] = 1 - print "vendor_id\tprocessor count" - for key in cpu_info.keys(): - print key + "\t" + str(cpu_info[key]) - - def _get_asd_pids(self): - pids = [] - ps_cmd = 'sudo ps aux|grep -v grep|grep -E "asd|cld"' - ps_o, ps_e = util.shell_command([ps_cmd]) - if ps_o: - ps_o = ps_o.strip().split("\n") - pids = [] - for item in ps_o: - vals = item.strip().split() - if len(vals) >= 2: - pids.append(vals[1]) - return pids - def _collect_logs_from_systemd_journal(self, as_logfile_prefix): - asd_pids = self._get_asd_pids() + asd_pids = common.get_asd_pids() for pid in asd_pids: try: journalctl_cmd = [ 'journalctl _PID=%s --since "24 hours ago" -q -o cat' % (pid)] - aslogfile = as_logfile_prefix + 'aerospike_%s.log' % (pid) - print "[INFO] Data collection for %s to %s in progress..." % (str(journalctl_cmd), aslogfile) + self.aslogfile = as_logfile_prefix + 'aerospike_%s.log' % (pid) + self.logger.info("Data collection for %s to %s in progress..." % (str(journalctl_cmd), self.aslogfile)) o, e = util.shell_command(journalctl_cmd) if e: - print e + self.logger.error(str(e)) else: - self._write_log(o) + util.write_to_file(self.aslogfile, str(o)) except Exception as e1: - print str(e1) + self.logger.error(str(e1)) sys.stdout = sys.__stdout__ - def _collect_lsof(self, verbose=False): - print "['lsof']" - pids = self._get_asd_pids() - if pids and len(pids) > 0: - search_str = pids[0] - for _str in pids[1:len(pids)]: - search_str += "\\|" + _str - lsof_cmd = 'sudo lsof -n |grep "%s"' % (search_str) - lsof_o, lsof_e = util.shell_command([lsof_cmd]) - if lsof_e: - print lsof_e - self.cmds_error.add(lsof_cmd) - if lsof_o: - if verbose: - print lsof_o - else: - lsof_dic = {} - unidentified_protocol_count = 0 - lsof_list = lsof_o.strip().split("\n") - type_ljust_parm = 20 - desc_ljust_parm = 20 - for row in lsof_list: - try: - if "can't identify protocol" in row: - unidentified_protocol_count = unidentified_protocol_count + \ - 1 - except Exception: - pass - - try: - type = row.strip().split()[4] - if type not in lsof_dic: - - if len(type) > type_ljust_parm: - type_ljust_parm = len(type) - - if (type in lsof_file_type_desc - and len(lsof_file_type_desc[type]) > desc_ljust_parm): - desc_ljust_parm = len( - lsof_file_type_desc[type]) - - lsof_dic[type] = 1 - else: - lsof_dic[type] = lsof_dic[type] + 1 - - except Exception: - continue - - print "FileType".ljust(type_ljust_parm) + "Description".ljust(desc_ljust_parm) + "fd count" - for ftype in sorted(lsof_dic.keys()): - desc = "Unknown" - if ftype in lsof_file_type_desc: - desc = lsof_file_type_desc[ftype] - print ftype.ljust(type_ljust_parm) + desc.ljust(desc_ljust_parm) + str(lsof_dic[ftype]) - - print "\nUnidentified Protocols = " + str(unidentified_protocol_count) - - def _zip_files(self, dir_path, _size=1): - """ - If file size is greater then given _size, create zip of file on same location and - remove original one. Won't zip If zlib module is not available. 
- """ - for root, dirs, files in os.walk(dir_path): - for _file in files: - file_path = os.path.join(root, _file) - size_mb = (os.path.getsize(file_path) / (1024 * 1024)) - if size_mb >= _size: - os.chdir(root) - try: - newzip = zipfile.ZipFile( - _file + ".zip", "w", zipfile.ZIP_DEFLATED) - newzip.write(_file) - newzip.close() - os.remove(_file) - except Exception as e: - print e - pass - - def _archive_log(self, logdir): - self._zip_files(logdir) - util.shell_command(["tar -czvf " + logdir + ".tgz " + aslogdir]) - sys.stderr.write("\x1b[2J\x1b[H") - print "\n\n\n" - self.logger.info("Files in " + logdir + " and " + logdir + ".tgz saved.") - - def _print_collecinto_summary(self, logdir): - if self.cmds_error: - self.logger.warning("Following commands are either unavailable or giving runtime error...") - self.logger.warning(list(self.cmds_error)) - - print "\n" - self.logger.info("Please provide file " + logdir + ".tgz to Aerospike Support.") - self.logger.info("END OF ASCOLLECTINFO") - - # If multiple commands are given in execute_only mode then we might need coloring for next commands - terminal.enable_color(True) - def _parse_namespace(self, namespace_data): """ This method will return set of namespaces present given namespace data @@ -1314,9 +1090,13 @@ def _get_as_pmap(self): def _dump_in_json_file(self, as_logfile_prefix, dump): self.logger.info("Dumping collectinfo in JSON format.") - aslogfile = as_logfile_prefix + 'ascinfo.json' - with open(aslogfile, "w") as f: - f.write(json.dumps(dump, indent=4, separators=(',', ':'))) + self.aslogfile = as_logfile_prefix + 'ascinfo.json' + + try: + with open(self.aslogfile, "w") as f: + f.write(json.dumps(dump, indent=4, separators=(',', ':'))) + except Exception as e: + self.logger.error("Failed to write JSON file: " + str(e)) def _get_collectinfo_data_json(self, default_user, default_pwd, default_ssh_port, default_ssh_key, credential_file, enable_ssh): @@ -1397,60 +1177,10 @@ def _dump_collectinfo_pretty_print(self, timestamp, as_logfile_prefix, except Exception: port = 3000 - # Unfortunately timestamp can not be printed in Centos with dmesg, - # storing dmesg logs without timestamp for this particular OS. 
- if 'centos' == (platform.linux_distribution()[0]).lower(): - cmd_dmesg = 'sudo dmesg' - alt_dmesg = '' - else: - cmd_dmesg = 'sudo dmesg -T' - alt_dmesg = 'sudo dmesg' + collect_output = time.strftime("%Y-%m-%d %H:%M:%S UTC\n", timestamp) - global aslogfile, output_time - - # cmd and alternative cmds are stored in list of list instead of dic to - # maintain proper order for output - sys_shell_cmds = [ - ['hostname -I', 'hostname'], - ['top -n3 -b', 'top -l 3'], - ['lsb_release -a', 'ls /etc|grep release|xargs -I f cat /etc/f'], - ['cat /proc/meminfo', 'vmstat -s'], - ['cat /proc/interrupts', ''], - ['iostat -x 1 10', ''], - [cmd_dmesg, alt_dmesg], - ['sudo pgrep asd | xargs -I f sh -c "cat /proc/f/limits"', ''], - ['lscpu', ''], - ['sudo sysctl -a | grep -E "shmmax|file-max|maxfiles"'], - ['sudo iptables -L', ''], - ['sudo fdisk -l |grep Disk |grep dev | cut -d " " -f 2 | cut -d ":" -f 1 | xargs sudo hdparm -I 2>/dev/null', ''], - ['df -h', ''], - ['free -m', ''], - ['uname -a', ''], - - # Only in pretty print - ['cat /proc/partitions', 'fdisk -l'], - ['ls /sys/block/{sd*,xvd*}/queue/rotational |xargs -I f sh -c "echo f; cat f;"', ''], - ['ls /sys/block/{sd*,xvd*}/device/model |xargs -I f sh -c "echo f; cat f;"', ''], - ['ls /sys/block/{sd*,xvd*}/queue/scheduler |xargs -I f sh -c "echo f; cat f;"', ''], - ['rpm -qa|grep -E "citrus|aero"', 'dpkg -l|grep -E "citrus|aero"'], - ['ip addr', ''], - ['ip -s link', '', ''], - ['sar -n DEV', ''], - ['sar -n EDEV', ''], - ['mpstat -P ALL 2 3', ''], - ['uptime', ''], - ['ss -ant state time-wait sport = :%d or dport = :%d | wc -l' % - (port,port), 'netstat -ant | grep %d | grep TIME_WAIT | wc -l' % (port)], - ['ss -ant state close-wait sport = :%d or dport = :%d | wc -l' % - (port,port), 'netstat -ant | grep %d | grep CLOSE_WAIT | wc -l' % (port)], - ['ss -ant state established sport = :%d or dport = :%d | wc -l' % - (port,port), 'netstat -ant | grep %d | grep ESTABLISHED | wc -l' % (port)], - ['ss -ant state listen sport = :%d or dport = :%d | wc -l' % - (port,port), 'netstat -ant | grep %d | grep LISTEN | wc -l' % (port)], - ['arp -n|grep ether|tr -s [:blank:] | cut -d" " -f5 |sort|uniq -c', ''], - ['find /proc/sys/net/ipv4/neigh/default/ -name "gc_thresh*" -print -exec cat {} \;', ''] - ] + dignostic_info_params = [ 'network', 'namespace', 'set', 'xdr', 'dc', 'sindex'] dignostic_features_params = ['features'] @@ -1533,13 +1263,13 @@ def _dump_collectinfo_pretty_print(self, timestamp, as_logfile_prefix, ####### Dignostic info ######## - aslogfile = as_logfile_prefix + 'ascollectinfo.log' - self._write_log(collect_output) + self.aslogfile = as_logfile_prefix + 'ascollectinfo.log' + util.write_to_file(self.aslogfile, collect_output) try: self._collectinfo_content(self._write_version) except Exception as e: - self._write_log(str(e)) + util.write_to_file(self.aslogfile, str(e)) sys.stdout = sys.__stdout__ try: @@ -1547,7 +1277,7 @@ def _dump_collectinfo_pretty_print(self, timestamp, as_logfile_prefix, for info_param in dignostic_info_params: self._collectinfo_content(info_controller, [info_param]) except Exception as e: - self._write_log(str(e)) + util.write_to_file(self.aslogfile, str(e)) sys.stdout = sys.__stdout__ try: @@ -1555,7 +1285,7 @@ def _dump_collectinfo_pretty_print(self, timestamp, as_logfile_prefix, for show_param in dignostic_show_params: self._collectinfo_content(show_controller, show_param.split()) except Exception as e: - self._write_log(str(e)) + util.write_to_file(self.aslogfile, str(e)) sys.stdout = sys.__stdout__ try: @@ -1563,32 
+1293,33 @@ def _dump_collectinfo_pretty_print(self, timestamp, as_logfile_prefix, for cmd in dignostic_features_params: self._collectinfo_content(features_controller, [cmd]) except Exception as e: - self._write_log(str(e)) + util.write_to_file(self.aslogfile, str(e)) sys.stdout = sys.__stdout__ try: for cmd in dignostic_aerospike_cluster_params: self._collectinfo_content('cluster', cmd) except Exception as e: - self._write_log(str(e)) + util.write_to_file(self.aslogfile, str(e)) sys.stdout = sys.__stdout__ ####### Summary ######## - collectinfo_root_controller = CollectinfoRootController(asadm_version=self.asadm_version, clinfo_path=as_logfile_prefix + "ascinfo.json") - aslogfile = as_logfile_prefix + 'summary.log' - self._write_log(collect_output) + collectinfo_root_controller = CollectinfoRootController(asadm_version=self.asadm_version, clinfo_path=self.aslogdir) + + self.aslogfile = as_logfile_prefix + 'summary.log' + util.write_to_file(self.aslogfile, collect_output) try: self._collectinfo_content(self._write_version) except Exception as e: - self._write_log(str(e)) + util.write_to_file(self.aslogfile, str(e)) sys.stdout = sys.__stdout__ try: for summary_param in summary_params: self._collectinfo_content(collectinfo_root_controller.execute, [summary_param]) except Exception as e: - self._write_log(str(e)) + util.write_to_file(self.aslogfile, str(e)) sys.stdout = sys.__stdout__ try: @@ -1596,57 +1327,25 @@ def _dump_collectinfo_pretty_print(self, timestamp, as_logfile_prefix, for info_param in summary_info_params: self._collectinfo_content(info_controller, [info_param]) except Exception as e: - self._write_log(str(e)) + util.write_to_file(self.aslogfile, str(e)) sys.stdout = sys.__stdout__ ####### Health ######## - aslogfile = as_logfile_prefix + 'health.log' - self._write_log(collect_output) + self.aslogfile = as_logfile_prefix + 'health.log' + util.write_to_file(self.aslogfile, collect_output) try: for health_param in health_params: self._collectinfo_content(collectinfo_root_controller.execute, health_param.split()) except Exception as e: - self._write_log(str(e)) + util.write_to_file(self.aslogfile, str(e)) sys.stdout = sys.__stdout__ ####### System info ######## - aslogfile = as_logfile_prefix + 'sysinfo.log' - self._write_log(collect_output) - - try: - for cmds in sys_shell_cmds: - self._collectinfo_content('shell', [cmds[0]], cmds[1:] if len(cmds)>1 else []) - except Exception as e: - self._write_log(str(e)) - sys.stdout = sys.__stdout__ - - try: - self._collectinfo_content(self._collect_sys) - except Exception as e: - self._write_log(str(e)) - sys.stdout = sys.__stdout__ - - try: - self._collectinfo_content(self._get_awsdata) - except Exception as e: - self._write_log(str(e)) - sys.stdout = sys.__stdout__ - - try: - self._collectinfo_content(self._collect_lsof) - except Exception as e: - self._write_log(str(e)) - sys.stdout = sys.__stdout__ - - if show_all and verbose: - try: - self._collectinfo_content(self._collect_lsof, verbose) - except Exception as e: - self._write_log(str(e)) - sys.stdout = sys.__stdout__ + self.aslogfile = as_logfile_prefix + 'sysinfo.log' + self.failed_cmds = common.collect_sys_info(port=port, timestamp=collect_output, outfile=self.aslogfile, verbose=show_all & verbose) ####### Logs and conf ######## @@ -1670,7 +1369,7 @@ def _dump_collectinfo_pretty_print(self, timestamp, as_logfile_prefix, is_xdr_in_asd_version = self.cluster.call_node_method( [_ip], "is_feature_present", "xdr").popitem()[1] except Exception: - from lib.node import Node + from 
lib.client.node import Node temp_node = Node(_ip) is_xdr_in_asd_version = self.cluster.call_node_method( [temp_node.ip], "is_feature_present", "xdr").popitem()[1] @@ -1686,11 +1385,11 @@ def _dump_collectinfo_pretty_print(self, timestamp, as_logfile_prefix, except Exception: xdr_log_location = '/var/log/aerospike/*xdr.log' - aslogfile = as_logfile_prefix + 'asxdr.log' - self._collectinfo_content( - 'shell', ['cat ' + xdr_log_location]) + self.aslogfile = as_logfile_prefix + 'asxdr.log' + self._collect_local_file(xdr_log_location, self.aslogfile) + except Exception as e: - self._write_log(str(e)) + util.write_to_file(self.aslogfile, str(e)) sys.stdout = sys.__stdout__ try: @@ -1698,7 +1397,7 @@ def _dump_collectinfo_pretty_print(self, timestamp, as_logfile_prefix, log_locations = [i.split(':')[1] for i in self.cluster.call_node_method( [_ip], "info", "logs").popitem()[1].split(';')] except Exception: - from lib.node import Node + from lib.client.node import Node temp_node = Node(_ip) log_locations = [i.split(':')[1] for i in self.cluster.call_node_method( [temp_node.ip], "info", "logs").popitem()[1].split(';')] @@ -1724,10 +1423,10 @@ def _dump_collectinfo_pretty_print(self, timestamp, as_logfile_prefix, self._collect_logs_from_systemd_journal( as_logfile_prefix) except Exception as e1: - self._write_log(str(e1)) + util.write_to_file(self.aslogfile, str(e1)) sys.stdout = sys.__stdout__ except Exception as e: - self._write_log(str(e)) + util.write_to_file(self.aslogfile, str(e)) sys.stdout = sys.__stdout__ ##### aerospike conf file ##### @@ -1735,43 +1434,35 @@ def _dump_collectinfo_pretty_print(self, timestamp, as_logfile_prefix, # Comparing with this version because prior to this it was # citrusleaf.conf & citrusleaf.log if LooseVersion(as_version) > LooseVersion("3.0.0"): - aslogfile = as_logfile_prefix + 'aerospike.conf' + self.aslogfile = as_logfile_prefix + 'aerospike.conf' else: - aslogfile = as_logfile_prefix + 'citrusleaf.conf' + self.aslogfile = as_logfile_prefix + 'citrusleaf.conf' - self._write_log(collect_output) - self._collectinfo_content('shell', ['cat %s' % (conf_path)]) + self._collect_local_file(conf_path, self.aslogfile) except Exception as e: - self._write_log(str(e)) + util.write_to_file(self.aslogfile, str(e)) sys.stdout = sys.__stdout__ def _main_collectinfo(self, default_user, default_pwd, default_ssh_port, default_ssh_key, credential_file, snp_count, wait_time, enable_ssh=False, show_all=False, verbose=False, output_prefix=""): - global aslogdir, output_time + + # JSON collectinfo snapshot count check + if snp_count < 1: + self.logger.error("Wrong collectinfo snapshot count") + return + timestamp = time.gmtime() - output_time = time.strftime("%Y%m%d_%H%M%S", timestamp) - aslogdir_prefix = "" - if output_prefix: - output_prefix = output_prefix.strip() - aslogdir_prefix = "%s%s"%(str(output_prefix), '_' if not output_prefix.endswith('_') - and not output_prefix.endswith('-') else "")\ - if output_prefix else "" - aslogdir = '/tmp/%scollect_info_'%(aslogdir_prefix) + output_time - as_logfile_prefix = aslogdir + '/' + output_time + '_' - - os.makedirs(aslogdir) + self.aslogdir, as_logfile_prefix = common.set_collectinfo_path(timestamp, output_prefix=output_prefix) # Coloring might writes extra characters to file, to avoid it we need to disable terminal coloring terminal.enable_color(False) - self.cmds_error = set() + # list of failed system commands + self.failed_cmds = [] # JSON collectinfo - if snp_count < 1: - self._archive_log(aslogdir) - return 
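# common.set_collectinfo_path() above replaces the inline path construction that
# used to live here; judging from the removed code it should yield paths of the
# form below (a sketch, the helper's exact behaviour is assumed):
output_time = time.strftime("%Y%m%d_%H%M%S", timestamp)          # e.g. 20180124_110707
aslogdir = '/tmp/' + output_prefix + 'collect_info_' + output_time
as_logfile_prefix = aslogdir + '/' + output_time + '_'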
self._dump_collectinfo_json(timestamp, as_logfile_prefix, default_user, default_pwd, default_ssh_port, default_ssh_key, credential_file, enable_ssh, snp_count, wait_time,) @@ -1780,9 +1471,10 @@ def _main_collectinfo(self, default_user, default_pwd, default_ssh_port, default self._dump_collectinfo_pretty_print(timestamp, as_logfile_prefix, show_all=show_all, verbose=verbose) # Archive collectinfo directory - self._archive_log(aslogdir) + common.archive_log(self.aslogdir) - self._print_collecinto_summary(aslogdir) + # printing collectinfo summary + common.print_collecinto_summary(self.aslogdir, failed_cmds=self.failed_cmds) def _collect_info(self, line, show_all=False): @@ -2367,6 +2059,6 @@ def _do_default(self, line): metadata["os_version"] = os_version - return util.Future(self.view.print_summary, util.create_summary(service_stats=service_stats, namespace_stats=namespace_stats, + return util.Future(self.view.print_summary, common.create_summary(service_stats=service_stats, namespace_stats=namespace_stats, set_stats=set_stats, metadata=metadata, cluster_configs=cluster_configs), list_view=enable_list_view) diff --git a/lib/collectinfo/cinfolog.py b/lib/collectinfo/cinfolog.py index 4dfad63d..caa7beb4 100644 --- a/lib/collectinfo/cinfolog.py +++ b/lib/collectinfo/cinfolog.py @@ -140,12 +140,21 @@ def get_data(self, type="", stanza=""): elif stanza == "bin" or stanza == "bins": data[node][ns_name] = copy.deepcopy(d[ns_name][stanza]) - elif stanza in ["set", "sindex"]: - + elif stanza == "set": for _name in d[ns_name][stanza]: _key = "%s %s" % (ns_name, _name) data[node][_key] = copy.deepcopy(d[ns_name][stanza][_name]) + elif stanza == "sindex": + for _name in d[ns_name][stanza]: + try: + set = d[ns_name][stanza][_name]["set"] + _key = "%s %s %s" % (ns_name, set, _name) + except Exception: + continue + + data[node][_key] = copy.deepcopy(d[ns_name][stanza][_name]) + except Exception: pass diff --git a/lib/collectinfocontroller.py b/lib/collectinfocontroller.py index bfcc68ab..f59c1d6e 100644 --- a/lib/collectinfocontroller.py +++ b/lib/collectinfocontroller.py @@ -18,7 +18,7 @@ from lib.collectinfo.loghdlr import CollectinfoLoghdlr from lib.health.util import create_health_input_dict, h_eval, create_snapshot_key from lib.utils.constants import * -from lib.utils import util +from lib.utils import common, util from lib.view import terminal from lib.view.view import CliView @@ -415,7 +415,7 @@ def _do_distribution(self, histogram_name, title, unit): for timestamp in sorted(histogram.keys()): if not histogram[timestamp]: continue - self.view.show_distribution(title, util.create_histogram_output(histogram_name, histogram[timestamp]), unit, + self.view.show_distribution(title, common.create_histogram_output(histogram_name, histogram[timestamp]), unit, histogram_name, self.loghdlr.get_cinfo_log_at( timestamp=timestamp), @@ -446,7 +446,7 @@ def do_object_size(self, line): for timestamp in histogram: self.view.show_object_distribution('Object Size Distribution', - util.create_histogram_output(histogram_name, histogram[timestamp], byte_distribution=True, bucket_count=bucket_count, builds=builds), + common.create_histogram_output(histogram_name, histogram[timestamp], byte_distribution=True, bucket_count=bucket_count, builds=builds), 'Bytes', 'objsz', bucket_count, True, self.loghdlr.get_cinfo_log_at(timestamp=timestamp), title_suffix=" (%s)" % (timestamp), @@ -623,10 +623,24 @@ def do_sets(self, line): continue namespace_list = [ns_set.split()[0] for ns_set in set_stats[timestamp].keys()] - 
namespace_list = util.filter_list(namespace_list, self.mods['for']) + + try: + namespace_list = util.filter_list(namespace_list, self.mods['for'][:1]) + except Exception: + pass + + set_list = [ns_set.split()[1] + for ns_set in set_stats[timestamp].keys()] + try: + set_list = util.filter_list(set_list, self.mods['for'][1:2]) + except Exception: + pass + for ns_set, stats in set_stats[timestamp].iteritems(): - if ns_set.split()[0] not in namespace_list: + ns, set = ns_set.split() + if ns not in namespace_list or set not in set_list: continue + self.view.show_stats("%s Set Statistics (%s)" % (ns_set, timestamp), stats, self.loghdlr.get_cinfo_log_at(timestamp=timestamp), @@ -733,12 +747,26 @@ def do_sindex(self, line): for timestamp in sorted(sindex_stats.keys()): if not sindex_stats[timestamp] or isinstance(sindex_stats[timestamp], Exception): continue + namespace_list = [ns_set_sindex.split()[0] for ns_set_sindex in sindex_stats[timestamp].keys()] - namespace_list = util.filter_list(namespace_list, self.mods['for']) + try: + namespace_list = util.filter_list(namespace_list, self.mods['for'][:1]) + except Exception: + pass + + sindex_list = [ns_set_sindex.split()[2] + for ns_set_sindex in sindex_stats[timestamp].keys()] + try: + sindex_list = util.filter_list(sindex_list, self.mods['for'][1:2]) + except Exception: + pass + for sindex, stats in sindex_stats[timestamp].iteritems(): - if sindex.split()[0] not in namespace_list: + ns, set, si = sindex.split() + if ns not in namespace_list or si not in sindex_list: continue + self.view.show_stats("%s Sindex Statistics (%s)" % (sindex, timestamp), stats, self.loghdlr.get_cinfo_log_at(timestamp=timestamp), @@ -784,7 +812,7 @@ def _do_default(self, line): if timestamp in cluster_configs: cl_configs = cluster_configs[timestamp] - features = util.find_nodewise_features(service_data=s_stats, ns_data=ns_stats, cl_data=cl_configs) + features = common.find_nodewise_features(service_data=s_stats, ns_data=ns_stats, cl_data=cl_configs) self.view.show_config( "(%s) Features" % @@ -1113,6 +1141,6 @@ def _do_default(self, line): metadata["os_version"] = os_version - self.view.print_summary(util.create_summary(service_stats=service_stats[last_timestamp], namespace_stats=namespace_stats[last_timestamp], + self.view.print_summary(common.create_summary(service_stats=service_stats[last_timestamp], namespace_stats=namespace_stats[last_timestamp], set_stats=set_stats[last_timestamp], metadata=metadata, cluster_configs=cluster_configs), list_view=enable_list_view) diff --git a/lib/getcontroller.py b/lib/getcontroller.py index b009ac7d..72a7b7c3 100644 --- a/lib/getcontroller.py +++ b/lib/getcontroller.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
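# The hunks below split the 'for' modifier into two positional filters: the
# first token narrows namespaces, the second narrows set/sindex names
# (TOOLS-975). A small illustration, assuming util.filter_list(items, patterns)
# keeps the items matching the given patterns (sample values are made up):
#
#   for_mods = ['test', 'age_idx']                                       # "... for test age_idx"
#   namespaces = util.filter_list(['test', 'bar'], for_mods[:1])         # -> ['test']
#   sindexes = util.filter_list(['age_idx', 'name_idx'], for_mods[1:2])  # -> ['age_idx']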
-from lib.utils import util +from lib.utils import common, util def get_sindex_stats(cluster, nodes='all', for_mods=[]): @@ -23,16 +23,28 @@ def get_sindex_stats(cluster, nodes='all', for_mods=[]): for host, stat_list in stats.iteritems(): if not stat_list or isinstance(stat_list, Exception): continue + namespace_list = [stat['ns'] for stat in stat_list] - namespace_list = util.filter_list(namespace_list, for_mods) + try: + namespace_list = util.filter_list(namespace_list, for_mods[:1]) + except Exception: + pass + + sindex_list = [stat['indexname'] for stat in stat_list] + try: + sindex_list = util.filter_list(sindex_list, for_mods[1:2]) + except Exception: + pass + for stat in stat_list: if not stat or stat['ns'] not in namespace_list: continue + ns = stat['ns'] set = stat['set'] indexname = stat['indexname'] - if not indexname or not ns: + if not indexname or not ns or indexname not in sindex_list: continue sindex_key = "%s %s %s" % (ns, set, indexname) @@ -57,7 +69,7 @@ def __init__(self, cluster): def do_distribution(self, histogram_name, nodes='all'): histogram = self.cluster.info_histogram(histogram_name, nodes=nodes) - return util.create_histogram_output(histogram_name, histogram) + return common.create_histogram_output(histogram_name, histogram) def do_object_size(self, byte_distribution=False, bucket_count=5, nodes='all'): @@ -71,7 +83,7 @@ def do_object_size(self, byte_distribution=False, bucket_count=5, nodes='all'): histogram = histogram.result() builds = builds.result() - return util.create_histogram_output(histogram_name, histogram, byte_distribution=True, bucket_count=bucket_count, builds=builds) + return common.create_histogram_output(histogram_name, histogram, byte_distribution=True, bucket_count=bucket_count, builds=builds) class GetLatencyController(): @@ -287,11 +299,23 @@ def get_sets(self, nodes='all', for_mods=[]): for host_id, key_values in sets.iteritems(): if isinstance(key_values, Exception) or not key_values: continue + namespace_list = [ns_set[0] for ns_set in key_values.keys()] - namespace_list = util.filter_list(namespace_list, for_mods) + try: + namespace_list = util.filter_list(namespace_list, for_mods[:1]) + except Exception: + pass + + set_list = [ns_set[1] for ns_set in key_values.keys()] + try: + set_list = util.filter_list(set_list, for_mods[1:2]) + except Exception: + pass + for key, values in key_values.iteritems(): - if key[0] not in namespace_list: + if key[0] not in namespace_list or key[1] not in set_list: continue + if key not in set_stats: set_stats[key] = {} host_vals = set_stats[key] @@ -372,7 +396,7 @@ def get_features(self, nodes='all'): ns_stats = ns_stats.result() cl_configs = cl_configs.result() - return util.find_nodewise_features(service_data=service_stats, ns_data=ns_stats, cl_data=cl_configs) + return common.find_nodewise_features(service_data=service_stats, ns_data=ns_stats, cl_data=cl_configs) class GetPmapController(): diff --git a/lib/health/query.py b/lib/health/query.py index 5b67f807..30f3533a 100644 --- a/lib/health/query.py +++ b/lib/health/query.py @@ -269,7 +269,7 @@ "Listed namespace[s] have lower than normal (< 20 %) available disk space. 
Probable cause - namespace size misconfiguration.", "Namespace disk available pct check."); -s = select * from SERVICE.CONFIG ignore "pidfile", "heartbeat.mtu", like(".*address"), like(".*port") save; +s = select * from SERVICE.CONFIG ignore "heartbeat.mtu", "node-id-interface", "pidfile", like(".*address"), like(".*port") save; r = group by CLUSTER, KEY do NO_MATCH(s, ==, MAJORITY) save; ASSERT(r, False, "Different service configurations.", "OPERATIONS", WARNING, "Listed Service configuration[s] are different across multiple nodes in cluster. Please run 'show config service diff' to check different configuration values. Probable cause - config file misconfiguration.", @@ -1334,8 +1334,30 @@ "Namespace batch-index read sub-transaction transaction service error count check"); +/* Key busy error */ +s = select "fail_key_busy" from NAMESPACE.STATISTICS save; +u = select "uptime" from SERVICE.STATISTICS; +u = group by CLUSTER, NODE do MAX(u); +s = do s / u; +r = group by KEY do SD_ANOMALY(s, ==, 3); +ASSERT(r, False, "Skewed Fail Key Busy count.", "ANOMALY", INFO, + "fail_key_busy show skew count patterns (for listed node[s]). Please run 'show statistics namespace like fail_key_busy' for details.", + "Key Busy errors count anomaly check."); + + SET CONSTRAINT VERSION < 3.9; +/* Key busy error */ +s = select "err_rw_pending_limit" from SERVICE.STATISTICS save; +u = select "uptime" from SERVICE.STATISTICS; +u = group by CLUSTER, NODE do MAX(u); +s = do s / u; +r = group by KEY do SD_ANOMALY(s, ==, 3); +ASSERT(r, False, "Skewed Fail Key Busy count.", "ANOMALY", INFO, + "err_rw_pending_limit show skew count patterns (for listed node[s]). Please run 'show statistics like err_rw_pending_limit' for details.", + "Key Busy errors count anomaly check."); + + // Read statistics t = select "stat_read_reqs" as "cnt" from SERVICE.STATISTICS save; diff --git a/lib/health/query/health.hql b/lib/health/query/health.hql index 6d87d6fa..847feedd 100644 --- a/lib/health/query/health.hql +++ b/lib/health/query/health.hql @@ -254,7 +254,7 @@ ASSERT(r, True, "Low namespace disk available pct.", "OPERATIONS", WARNING, "Listed namespace[s] have lower than normal (< 20 %) available disk space. Probable cause - namespace size misconfiguration.", "Namespace disk available pct check."); -s = select * from SERVICE.CONFIG ignore "pidfile", "heartbeat.mtu", like(".*address"), like(".*port") save; +s = select * from SERVICE.CONFIG ignore "heartbeat.mtu", "node-id-interface", "pidfile", like(".*address"), like(".*port") save; r = group by CLUSTER, KEY do NO_MATCH(s, ==, MAJORITY) save; ASSERT(r, False, "Different service configurations.", "OPERATIONS", WARNING, "Listed Service configuration[s] are different across multiple nodes in cluster. Please run 'show config service diff' to check different configuration values. Probable cause - config file misconfiguration.", @@ -1319,8 +1319,30 @@ ASSERT(r, False, "Non-zero batch-index read sub-transaction errors in the transa "Namespace batch-index read sub-transaction transaction service error count check"); +/* Key busy error */ +s = select "fail_key_busy" from NAMESPACE.STATISTICS save; +u = select "uptime" from SERVICE.STATISTICS; +u = group by CLUSTER, NODE do MAX(u); +s = do s / u; +r = group by KEY do SD_ANOMALY(s, ==, 3); +ASSERT(r, False, "Skewed Fail Key Busy count.", "ANOMALY", INFO, + "fail_key_busy show skew count patterns (for listed node[s]). 
Please run 'show statistics namespace like fail_key_busy' for details.", + "Key Busy errors count anomaly check."); + + SET CONSTRAINT VERSION < 3.9; +/* Key busy error */ +s = select "err_rw_pending_limit" from SERVICE.STATISTICS save; +u = select "uptime" from SERVICE.STATISTICS; +u = group by CLUSTER, NODE do MAX(u); +s = do s / u; +r = group by KEY do SD_ANOMALY(s, ==, 3); +ASSERT(r, False, "Skewed Fail Key Busy count.", "ANOMALY", INFO, + "err_rw_pending_limit show skew count patterns (for listed node[s]). Please run 'show statistics like err_rw_pending_limit' for details.", + "Key Busy errors count anomaly check."); + + // Read statistics t = select "stat_read_reqs" as "cnt" from SERVICE.STATISTICS save; diff --git a/lib/utils/common.py b/lib/utils/common.py new file mode 100644 index 00000000..2d42b4da --- /dev/null +++ b/lib/utils/common.py @@ -0,0 +1,1135 @@ +# Copyright 2013-2018 Aerospike, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +############################################################################################################# +# Functions common to multiple modes (online cluster / offline cluster / collectinfo-analyser / log-analyser) +############################################################################################################# + +import logging +import os +import platform +import socket +import sys +import time +import urllib2 +import zipfile +from distutils.version import LooseVersion + +from lib.utils import constants, filesize, util +from lib.utils.data import lsof_file_type_desc +from lib.view import terminal + +logger = logging.getLogger("asadm") +########## Feature ########## + +# Dictionary to contain feature and related stats to identify state of that feature +# Format : { feature1: ((service_stat1, service_stat2, ....), (namespace_stat1, namespace_stat2, ...), ...} +FEATURE_KEYS = { + "KVS": (('stat_read_reqs', 'stat_write_reqs'), + ('client_read_error', 'client_read_success', 'client_write_error', 'client_write_success')), + "UDF": (('udf_read_reqs', 'udf_write_reqs'), ('client_udf_complete', 'client_udf_error')), + "Batch": (('batch_initiate', 'batch_index_initiate'), None), + "Scan": (('tscan_initiate', 'basic_scans_succeeded', 'basic_scans_failed', 'aggr_scans_succeeded', + 'aggr_scans_failed', 'udf_bg_scans_succeeded', 'udf_bg_scans_failed'), + ( + 'scan_basic_complete', 'scan_basic_error', 'scan_aggr_complete', 'scan_aggr_error', + 'scan_udf_bg_complete', + 'scan_udf_bg_error')), + "SINDEX": (('sindex-used-bytes-memory'), ('memory_used_sindex_bytes')), + "Query": (('query_reqs', 'query_success'), ('query_reqs', 'query_success')), + "Aggregation": (('query_agg', 'query_agg_success'), ('query_agg', 'query_agg_success')), + "LDT": ( + ('sub-records', 'ldt-writes', 'ldt-reads', 'ldt-deletes', 'ldt_writes', 'ldt_reads', 'ldt_deletes', + 'sub_objects'), + ('ldt-writes', 'ldt-reads', 'ldt-deletes', 'ldt_writes', 'ldt_reads', 'ldt_deletes')), + "XDR Source": (('stat_read_reqs_xdr', 'xdr_read_success', 'xdr_read_error'), None), + 
"XDR Destination": (('stat_write_reqs_xdr'), ('xdr_write_success')), + "Rack-aware": (('self-group-id'), ('rack-id')), +} + + +def _is_keyval_greater_than_value(data={}, keys=(), value=0, is_and=False, type_check=int): + """ + Function takes dictionary, keys and value to compare. + Returns boolean to indicate value for key is greater than comparing value or not. + """ + + if not keys: + return True + + if not data: + return False + + if not isinstance(keys, tuple): + keys = (keys,) + + if is_and: + if all(util.get_value_from_dict(data, k, value, type_check) > value for k in keys): + return True + + else: + if any(util.get_value_from_dict(data, k, value, type_check) > value for k in keys): + return True + + return False + + +def _check_feature_by_keys(service_data=None, service_keys=None, ns_data=None, ns_keys=None): + """ + Function takes dictionary of service data, service keys, dictionary of namespace data and namespace keys. + Returns boolean to indicate service key in service data or namespace key in namespace data has non-zero value or not. + """ + + if service_data and not isinstance(service_data, Exception) and service_keys: + if _is_keyval_greater_than_value(service_data, service_keys): + return True + + if ns_data and ns_keys: + for ns, nsval in ns_data.iteritems(): + if not nsval or isinstance(nsval, Exception): + continue + if _is_keyval_greater_than_value(nsval, ns_keys): + return True + + return False + + +def _find_features_for_cluster(service_data, ns_data, cl_data={}): + """ + Function takes dictionary of service stats, dictionary of namespace stats, and dictionary cluster config. + Returns list of active (used) features identifying by comparing respective keys for non-zero value. + """ + + features = [] + + for node in service_data.keys(): + if cl_data and node in cl_data and cl_data[node] and not isinstance(cl_data[node], Exception): + if service_data[node] and not isinstance(service_data[node], Exception): + service_data[node].update(cl_data[node]) + else: + service_data[node] = cl_data[node] + + for feature, keys in FEATURE_KEYS.iteritems(): + for node, d in service_data.iteritems(): + + ns_d = None + + if node in ns_data and not isinstance(ns_data[node], Exception): + ns_d = ns_data[node] + + if _check_feature_by_keys(d, keys[0], ns_d, keys[1]): + features.append(feature) + break + + return features + + +def find_nodewise_features(service_data, ns_data, cl_data={}): + """ + Function takes dictionary of service stats, dictionary of namespace stats, and dictionary cluster config. + Returns map of active (used) features per node identifying by comparing respective keys for non-zero value. 
+ """ + + features = {} + + for node in service_data.keys(): + if node in cl_data and cl_data[node] and not isinstance(cl_data[node], Exception): + if service_data[node] and not isinstance(service_data[node], Exception): + service_data[node].update(cl_data[node]) + else: + service_data[node] = cl_data[node] + + for feature, keys in FEATURE_KEYS.iteritems(): + for node, s_stats in service_data.iteritems(): + + if node not in features: + features[node] = {} + + features[node][feature.upper()] = "NO" + n_stats = None + + if node in ns_data and not isinstance(ns_data[node], Exception): + n_stats = ns_data[node] + + if _check_feature_by_keys(s_stats, keys[0], n_stats, keys[1]): + features[node][feature.upper()] = "YES" + + return features + + +############################# + +########## Summary ########## + +def _compute_set_overhead_for_ns(set_stats, ns): + """ + Function takes set stat and namespace name. + Returns set overhead for input namespace name. + """ + + if not ns or not set_stats or isinstance(set_stats, Exception): + return 0 + + overhead = 0 + for _k, stats in set_stats.iteritems(): + if not stats or isinstance(stats, Exception): + continue + + ns_name = util.get_value_from_second_level_of_dict(stats, ("ns", "ns_name"), default_value=None, + return_type=str).values()[0] + if ns_name != ns: + continue + + set_name = util.get_value_from_second_level_of_dict(stats, ("set", "set_name"), default_value="", + return_type=str).values()[0] + objects = sum(util.get_value_from_second_level_of_dict(stats, ("objects", "n_objects"), default_value=0, + return_type=int).values()) + overhead += objects * (9 + len(set_name)) + + return overhead + + +def _compute_license_data_size(namespace_stats, set_stats, cluster_dict, ns_dict): + """ + Function takes dictionary of set stats, dictionary of namespace stats, cluster output dictionary and namespace output dictionary. + Function finds license data size per namespace, and per cluster and updates output dictionaries. 
+ """ + + if not namespace_stats: + return + + cl_memory_data_size = 0 + cl_device_data_size = 0 + + for ns, ns_stats in namespace_stats.iteritems(): + if not ns_stats or isinstance(ns_stats, Exception): + continue + repl_factor = max( + util.get_value_from_second_level_of_dict(ns_stats, ("repl-factor", "replication-factor"), default_value=0, + return_type=int).values()) + master_objects = sum( + util.get_value_from_second_level_of_dict(ns_stats, ("master_objects", "master-objects"), default_value=0, + return_type=int).values()) + devices_in_use = list(set(util.get_value_from_second_level_of_dict(ns_stats, ( + "storage-engine.device", "device", "storage-engine.file", "file", "dev"), default_value=None, + return_type=str).values())) + memory_data_size = None + device_data_size = None + + if len(devices_in_use) == 0 or (len(devices_in_use) == 1 and devices_in_use[0] == None): + # Data in memory only + memory_data_size = sum( + util.get_value_from_second_level_of_dict(ns_stats, ("memory_used_data_bytes", "data-used-bytes-memory"), + default_value=0, return_type=int).values()) + memory_data_size = memory_data_size / repl_factor + + if memory_data_size > 0: + memory_record_overhead = master_objects * 2 + memory_data_size = memory_data_size - memory_record_overhead + + else: + # Data on disk + device_data_size = sum( + util.get_value_from_second_level_of_dict(ns_stats, ("device_used_bytes", "used-bytes-disk"), + default_value=0, return_type=int).values()) + + if device_data_size > 0: + set_overhead = _compute_set_overhead_for_ns(set_stats, ns) + device_data_size = device_data_size - set_overhead + + if device_data_size > 0: + tombstones = sum(util.get_value_from_second_level_of_dict(ns_stats, ("tombstones",), default_value=0, + return_type=int).values()) + tombstone_overhead = tombstones * 128 + device_data_size = device_data_size - tombstone_overhead + + device_data_size = device_data_size / repl_factor + if device_data_size > 0: + device_record_overhead = master_objects * 64 + device_data_size = device_data_size - device_record_overhead + + ns_dict[ns]["license_data_in_memory"] = 0 + ns_dict[ns]["license_data_on_disk"] = 0 + if memory_data_size is not None: + ns_dict[ns]["license_data_in_memory"] = memory_data_size + cl_memory_data_size += memory_data_size + + if device_data_size is not None: + ns_dict[ns]["license_data_on_disk"] = device_data_size + cl_device_data_size += device_data_size + + cluster_dict["license_data"] = {} + cluster_dict["license_data"]["memory_size"] = cl_memory_data_size + cluster_dict["license_data"]["device_size"] = cl_device_data_size + + +def _set_migration_status(namespace_stats, cluster_dict, ns_dict): + """ + Function takes dictionary of namespace stats, cluster output dictionary and namespace output dictionary. + Function finds migration status per namespace, and per cluster and updates output dictionaries. + """ + + if not namespace_stats: + return + + for ns, ns_stats in namespace_stats.iteritems(): + if not ns_stats or isinstance(ns_stats, Exception): + continue + + migrations_in_progress = any(util.get_value_from_second_level_of_dict(ns_stats, ( + "migrate_tx_partitions_remaining", "migrate-tx-partitions-remaining"), + default_value=0, + return_type=int).values()) + if migrations_in_progress: + ns_dict[ns]["migrations_in_progress"] = True + cluster_dict["migrations_in_progress"] = True + + +def _initialize_summary_output(ns_list): + """ + Function takes list of namespace names. + Returns dictionary with summary fields set. 
+ """ + + summary_dict = {} + summary_dict["CLUSTER"] = {} + + summary_dict["CLUSTER"]["server_version"] = [] + summary_dict["CLUSTER"]["os_version"] = [] + summary_dict["CLUSTER"]["active_features"] = [] + summary_dict["CLUSTER"]["migrations_in_progress"] = False + + summary_dict["CLUSTER"]["device"] = {} + summary_dict["CLUSTER"]["device"]["count"] = 0 + summary_dict["CLUSTER"]["device"]["count_per_node"] = 0 + summary_dict["CLUSTER"]["device"]["count_same_across_nodes"] = True + summary_dict["CLUSTER"]["device"]["total"] = 0 + summary_dict["CLUSTER"]["device"]["used"] = 0 + summary_dict["CLUSTER"]["device"]["aval"] = 0 + summary_dict["CLUSTER"]["device"]["used_pct"] = 0 + summary_dict["CLUSTER"]["device"]["aval_pct"] = 0 + + summary_dict["CLUSTER"]["memory"] = {} + summary_dict["CLUSTER"]["memory"]["total"] = 0 + summary_dict["CLUSTER"]["memory"]["aval"] = 0 + summary_dict["CLUSTER"]["memory"]["aval_pct"] = 0 + + summary_dict["CLUSTER"]["active_ns"] = 0 + summary_dict["CLUSTER"]["ns_count"] = 0 + + summary_dict["CLUSTER"]["license_data"] = {} + summary_dict["CLUSTER"]["license_data"]["memory_size"] = 0 + summary_dict["CLUSTER"]["license_data"]["device_size"] = 0 + + summary_dict["FEATURES"] = {} + summary_dict["FEATURES"]["NAMESPACE"] = {} + + for ns in ns_list: + summary_dict["FEATURES"]["NAMESPACE"][ns] = {} + + summary_dict["FEATURES"]["NAMESPACE"][ns]["devices_total"] = 0 + summary_dict["FEATURES"]["NAMESPACE"][ns]["devices_per_node"] = 0 + summary_dict["FEATURES"]["NAMESPACE"][ns]["devices_count_same_across_nodes"] = True + + summary_dict["FEATURES"]["NAMESPACE"][ns]["memory_total"] = 0 + summary_dict["FEATURES"]["NAMESPACE"][ns]["memory_aval"] = 0 + summary_dict["FEATURES"]["NAMESPACE"][ns]["memory_available_pct"] = 0 + + summary_dict["FEATURES"]["NAMESPACE"][ns]["disk_total"] = 0 + summary_dict["FEATURES"]["NAMESPACE"][ns]["disk_used"] = 0 + summary_dict["FEATURES"]["NAMESPACE"][ns]["disk_aval"] = 0 + summary_dict["FEATURES"]["NAMESPACE"][ns]["disk_used_pct"] = 0 + summary_dict["FEATURES"]["NAMESPACE"][ns]["disk_available_pct"] = 0 + + summary_dict["FEATURES"]["NAMESPACE"][ns]["repl_factor"] = 0 + summary_dict["FEATURES"]["NAMESPACE"][ns]["master_objects"] = 0 + + summary_dict["FEATURES"]["NAMESPACE"][ns]["license_data"] = {} + + summary_dict["FEATURES"]["NAMESPACE"][ns]["migrations_in_progress"] = False + + return summary_dict + + +def create_summary(service_stats, namespace_stats, set_stats, metadata, cluster_configs={}): + """ + Function takes four dictionaries service stats, namespace stats, set stats and metadata. + Returns dictionary with summary information. 
+ """ + + features = _find_features_for_cluster(service_stats, namespace_stats, cluster_configs) + + namespace_stats = util.flip_keys(namespace_stats) + set_stats = util.flip_keys(set_stats) + + summary_dict = _initialize_summary_output(namespace_stats.keys()) + + total_nodes = len(service_stats.keys()) + + cl_nodewise_device_counts = {} + + cl_nodewise_mem_size = {} + cl_nodewise_mem_aval = {} + + cl_nodewise_device_size = {} + cl_nodewise_device_used = {} + cl_nodewise_device_aval = {} + + _compute_license_data_size(namespace_stats, set_stats, summary_dict["CLUSTER"], + summary_dict["FEATURES"]["NAMESPACE"]) + _set_migration_status(namespace_stats, summary_dict["CLUSTER"], summary_dict["FEATURES"]["NAMESPACE"]) + + summary_dict["CLUSTER"]["active_features"] = features + summary_dict["CLUSTER"]["cluster_size"] = list(set( + util.get_value_from_second_level_of_dict(service_stats, ("cluster_size",), default_value=0, + return_type=int).values())) + + if "server_version" in metadata and metadata["server_version"]: + summary_dict["CLUSTER"]["server_version"] = list(set(metadata["server_version"].values())) + + if "os_version" in metadata and metadata["os_version"]: + summary_dict["CLUSTER"]["os_version"] = list(set( + util.get_value_from_second_level_of_dict(metadata["os_version"], ("description",), default_value="", + return_type=str).values())) + + for ns, ns_stats in namespace_stats.iteritems(): + if not ns_stats or isinstance(ns_stats, Exception): + continue + + device_names_str = util.get_value_from_second_level_of_dict(ns_stats, ( + "storage-engine.device", "device", "storage-engine.file", "file", "dev"), default_value="", return_type=str) + device_counts = dict([(k, len(v.split(',')) if v else 0) for k, v in device_names_str.iteritems()]) + cl_nodewise_device_counts = util.add_dicts(cl_nodewise_device_counts, device_counts) + + ns_total_devices = sum(device_counts.values()) + ns_total_nodes = len(ns_stats.keys()) + + if ns_total_devices: + summary_dict["FEATURES"]["NAMESPACE"][ns]["devices_total"] = ns_total_devices + summary_dict["FEATURES"]["NAMESPACE"][ns]["devices_per_node"] = int( + (float(ns_total_devices) / float(ns_total_nodes)) + 0.5) + if len(set(device_counts.values())) > 1: + summary_dict["FEATURES"]["NAMESPACE"][ns]["devices_count_same_across_nodes"] = False + + mem_size = util.get_value_from_second_level_of_dict(ns_stats, ("memory-size",), default_value=0, + return_type=int) + mem_aval_pct = util.get_value_from_second_level_of_dict(ns_stats, ("memory_free_pct", "free-pct-memory"), + default_value=0, return_type=int) + mem_aval = util.pct_to_value(mem_size, mem_aval_pct) + cl_nodewise_mem_size = util.add_dicts(cl_nodewise_mem_size, mem_size) + cl_nodewise_mem_aval = util.add_dicts(cl_nodewise_mem_aval, mem_aval) + summary_dict["FEATURES"]["NAMESPACE"][ns]["memory_total"] = sum(mem_size.values()) + summary_dict["FEATURES"]["NAMESPACE"][ns]["memory_aval"] = sum(mem_aval.values()) + summary_dict["FEATURES"]["NAMESPACE"][ns]["memory_available_pct"] = (float(sum(mem_aval.values())) / float( + sum(mem_size.values()))) * 100.0 + + device_size = util.get_value_from_second_level_of_dict(ns_stats, ("device_total_bytes", "total-bytes-disk"), + default_value=0, return_type=int) + device_used = util.get_value_from_second_level_of_dict(ns_stats, ("device_used_bytes", "used-bytes-disk"), + default_value=0, return_type=int) + device_aval_pct = util.get_value_from_second_level_of_dict(ns_stats, ("device_available_pct", "available_pct"), + default_value=0, return_type=int) + device_aval = 
util.pct_to_value(device_size, device_aval_pct) + cl_nodewise_device_size = util.add_dicts(cl_nodewise_device_size, device_size) + cl_nodewise_device_used = util.add_dicts(cl_nodewise_device_used, device_used) + cl_nodewise_device_aval = util.add_dicts(cl_nodewise_device_aval, device_aval) + device_size_total = sum(device_size.values()) + if device_size_total > 0: + summary_dict["FEATURES"]["NAMESPACE"][ns]["disk_total"] = device_size_total + summary_dict["FEATURES"]["NAMESPACE"][ns]["disk_used"] = sum(device_used.values()) + summary_dict["FEATURES"]["NAMESPACE"][ns]["disk_aval"] = sum(device_aval.values()) + summary_dict["FEATURES"]["NAMESPACE"][ns]["disk_used_pct"] = (float(sum(device_used.values())) / float( + device_size_total)) * 100.0 + summary_dict["FEATURES"]["NAMESPACE"][ns]["disk_available_pct"] = (float(sum(device_aval.values())) / float( + device_size_total)) * 100.0 + + summary_dict["FEATURES"]["NAMESPACE"][ns]["repl_factor"] = list(set( + util.get_value_from_second_level_of_dict(ns_stats, ("repl-factor", "replication-factor"), default_value=0, + return_type=int).values())) + + data_in_memory = \ + util.get_value_from_second_level_of_dict(ns_stats, ("storage-engine.data-in-memory", "data-in-memory"), + default_value=False, return_type=bool).values()[0] + + if data_in_memory: + cache_read_pcts = util.get_value_from_second_level_of_dict(ns_stats, ("cache_read_pct", "cache-read-pct"), + default_value="N/E", return_type=int).values() + if cache_read_pcts: + try: + summary_dict["FEATURES"]["NAMESPACE"][ns]["cache_read_pct"] = sum(cache_read_pcts) / len( + cache_read_pcts) + except Exception: + pass + master_objects = sum( + util.get_value_from_second_level_of_dict(ns_stats, ("master_objects", "master-objects"), default_value=0, + return_type=int).values()) + summary_dict["CLUSTER"]["ns_count"] += 1 + if master_objects > 0: + summary_dict["FEATURES"]["NAMESPACE"][ns]["master_objects"] = master_objects + summary_dict["CLUSTER"]["active_ns"] += 1 + + try: + rack_ids = util.get_value_from_second_level_of_dict(ns_stats, ("rack-id",), default_value=None, + return_type=int) + rack_ids = list(set(rack_ids.values())) + if len(rack_ids) > 1 or rack_ids[0] is not None: + if any((i is not None and i > 0) for i in rack_ids): + summary_dict["FEATURES"]["NAMESPACE"][ns]["rack-aware"] = True + else: + summary_dict["FEATURES"]["NAMESPACE"][ns]["rack-aware"] = False + except Exception: + pass + + cl_device_counts = sum(cl_nodewise_device_counts.values()) + if cl_device_counts: + summary_dict["CLUSTER"]["device"]["count"] = cl_device_counts + summary_dict["CLUSTER"]["device"]["count_per_node"] = int((float(cl_device_counts) / float(total_nodes)) + 0.5) + if len(set(cl_nodewise_device_counts.values())) > 1: + summary_dict["CLUSTER"]["device"]["count_same_across_nodes"] = False + + cl_memory_size_total = sum(cl_nodewise_mem_size.values()) + if cl_memory_size_total > 0: + summary_dict["CLUSTER"]["memory"]["total"] = cl_memory_size_total + summary_dict["CLUSTER"]["memory"]["aval"] = sum(cl_nodewise_mem_aval.values()) + summary_dict["CLUSTER"]["memory"]["aval_pct"] = (float(sum(cl_nodewise_mem_aval.values())) / float( + cl_memory_size_total)) * 100.0 + + cl_device_size_total = sum(cl_nodewise_device_size.values()) + if cl_device_size_total > 0: + summary_dict["CLUSTER"]["device"]["total"] = cl_device_size_total + summary_dict["CLUSTER"]["device"]["used"] = sum(cl_nodewise_device_used.values()) + summary_dict["CLUSTER"]["device"]["aval"] = sum(cl_nodewise_device_aval.values()) + 
summary_dict["CLUSTER"]["device"]["used_pct"] = (float(sum(cl_nodewise_device_used.values())) / float( + cl_device_size_total)) * 100.0 + summary_dict["CLUSTER"]["device"]["aval_pct"] = (float(sum(cl_nodewise_device_aval.values())) / float( + cl_device_size_total)) * 100.0 + + return summary_dict + + +############################# + +########## Histogram ########## + +def _create_histogram_percentiles_output(histogram_name, histogram_data): + histogram_data = util.flip_keys(histogram_data) + + for namespace, host_data in histogram_data.iteritems(): + for host_id, data in host_data.iteritems(): + hist = data['data'] + width = data['width'] + + cum_total = 0 + total = sum(hist) + percentile = 0.1 + result = [] + + for i, v in enumerate(hist): + cum_total += float(v) + if total > 0: + portion = cum_total / total + else: + portion = 0.0 + + while portion >= percentile: + percentile += 0.1 + result.append(i + 1) + + if percentile > 1.0: + break + + if result == []: + result = [0] * 10 + + if histogram_name is "objsz": + data['percentiles'] = [(r * width) - 1 if r > 0 else r for r in result] + else: + data['percentiles'] = [r * width for r in result] + + return histogram_data + + +def _create_bytewise_histogram_percentiles_output(histogram_data, bucket_count, builds): + histogram_data = util.flip_keys(histogram_data) + + for namespace, host_data in histogram_data.iteritems(): + result = [] + rblock_size_bytes = 128 + width = 1 + + for host_id, data in host_data.iteritems(): + + try: + as_version = builds[host_id] + if (LooseVersion(as_version) < LooseVersion("2.7.0") + or (LooseVersion(as_version) >= LooseVersion("3.0.0") + and LooseVersion(as_version) < LooseVersion("3.1.3"))): + rblock_size_bytes = 512 + + except Exception: + pass + + hist = data['data'] + width = data['width'] + + for i, v in enumerate(hist): + if v and v > 0: + result.append(i) + + result = list(set(result)) + result.sort() + start_buckets = [] + + if len(result) <= bucket_count: + # if asinfo buckets with values>0 are less than + # show_bucket_count then we can show all single buckets as it + # is, no need to merge to show big range + for res in result: + start_buckets.append(res) + start_buckets.append(res + 1) + + else: + # dividing volume buckets (from min possible bucket with + # value>0 to max possible bucket with value>0) into same range + start_bucket = result[0] + size = result[len(result) - 1] - result[0] + 1 + + bucket_width = size / bucket_count + additional_bucket_index = bucket_count - (size % bucket_count) + + bucket_index = 0 + + while bucket_index < bucket_count: + start_buckets.append(start_bucket) + + if bucket_index == additional_bucket_index: + bucket_width += 1 + + start_bucket += bucket_width + bucket_index += 1 + + start_buckets.append(start_bucket) + + columns = [] + need_to_show = {} + + for i, bucket in enumerate(start_buckets): + + if i == len(start_buckets) - 1: + break + + key = _get_bucket_range(bucket, start_buckets[i + 1], width, rblock_size_bytes) + need_to_show[key] = False + columns.append(key) + + for host_id, data in host_data.iteritems(): + + rblock_size_bytes = 128 + + try: + as_version = builds[host_id] + + if (LooseVersion(as_version) < LooseVersion("2.7.0") + or (LooseVersion(as_version) >= LooseVersion("3.0.0") + and LooseVersion(as_version) < LooseVersion("3.1.3"))): + rblock_size_bytes = 512 + + except Exception: + pass + + hist = data['data'] + width = data['width'] + data['values'] = {} + + for i, s in enumerate(start_buckets): + + if i == len(start_buckets) - 1: + break + + 
b_index = s + + key = _get_bucket_range(s, start_buckets[i + 1], width, rblock_size_bytes) + + if key not in columns: + columns.append(key) + + if key not in data["values"]: + data["values"][key] = 0 + + while b_index < start_buckets[i + 1]: + data["values"][key] += hist[b_index] + b_index += 1 + + if data["values"][key] > 0: + need_to_show[key] = True + + else: + if key not in need_to_show: + need_to_show[key] = False + + host_data["columns"] = [] + + for column in columns: + if need_to_show[column]: + host_data["columns"].append(column) + + return histogram_data + + +def _get_bucket_range(current_bucket, next_bucket, width, rblock_size_bytes): + s_b = "0 B" + if current_bucket > 0: + last_bucket_last_rblock_end = ((current_bucket * width) - 1) * rblock_size_bytes + + if last_bucket_last_rblock_end < 1: + last_bucket_last_rblock_end = 0 + + else: + last_bucket_last_rblock_end += 1 + + s_b = filesize.size(last_bucket_last_rblock_end, filesize.byte) + + if current_bucket == 99 or next_bucket > 99: + return ">%s" % (s_b.replace(" ", "")) + + bucket_last_rblock_end = ((next_bucket * width) - 1) * rblock_size_bytes + e_b = filesize.size(bucket_last_rblock_end, filesize.byte) + return "%s to %s" % (s_b.replace(" ", ""), e_b.replace(" ", "")) + + +def create_histogram_output(histogram_name, histogram_data, **params): + if "byte_distribution" not in params or not params["byte_distribution"]: + return _create_histogram_percentiles_output(histogram_name, histogram_data) + + if "bucket_count" not in params or "builds" not in params: + return {} + + return _create_bytewise_histogram_percentiles_output(histogram_data, params["bucket_count"], params["builds"]) + + +################################# + +########## System Collectinfo ########## + + +def _get_metadata(response_str, prefix='', old_response=''): + aws_c = '' + aws_metadata_base_url = 'http://169.254.169.254/latest/meta-data' + + # set of values which will give same old_response, so no need to go further + last_values = [] + for rsp in response_str.split("\n"): + if rsp[-1:] == '/': + rsp_p = rsp.strip('/') + aws_c += _get_metadata(rsp_p, prefix, old_response=old_response) + else: + meta_url = aws_metadata_base_url + prefix + rsp + + req = urllib2.Request(meta_url) + r = urllib2.urlopen(req) + # r = requests.get(meta_url,timeout=aws_timeout) + if r.code != 404: + response = r.read().strip() + if response == old_response: + last_values.append(rsp.strip()) + continue + try: + aws_c += _get_metadata(response, prefix + rsp + "/", old_response=response) + except Exception: + aws_c += (prefix + rsp).strip('/') + '\n' + response + "\n\n" + + if last_values: + aws_c += prefix.strip('/') + '\n' + '\n'.join(last_values) + "\n\n" + + return aws_c + + +def _collect_awsdata(cmd=''): + aws_rsp = '' + aws_timeout = 1 + socket.setdefaulttimeout(aws_timeout) + aws_metadata_base_url = 'http://169.254.169.254/latest/meta-data' + out = "['AWS']" + try: + req = urllib2.Request(aws_metadata_base_url) + r = urllib2.urlopen(req) + # r = requests.get(aws_metadata_base_url,timeout=aws_timeout) + if r.code == 200: + rsp = r.read() + aws_rsp += _get_metadata(rsp, '/') + out += "\n" + "Requesting... {0} \n{1} \t Successful".format(aws_metadata_base_url, aws_rsp) + else: + aws_rsp = " Not likely in AWS" + out += "\n" + "Requesting... {0} \t FAILED {1} ".format(aws_metadata_base_url, aws_rsp) + + except Exception as e: + out += "\n" + "Requesting... {0} \t {1} ".format(aws_metadata_base_url, e) + out += "\n" + "FAILED! 
Node Is Not likely In AWS" + + return out, None + + +def _collect_cpuinfo(cmd=''): + out = "['cpuinfo']" + + cpu_info_cmd = 'cat /proc/cpuinfo | grep "vendor_id"' + o, e = util.shell_command([cpu_info_cmd]) + + if o: + o = o.strip().split("\n") + cpu_info = {} + + for item in o: + items = item.strip().split(":") + + if len(items) == 2: + key = items[1].strip() + if key in cpu_info.keys(): + cpu_info[key] = cpu_info[key] + 1 + else: + cpu_info[key] = 1 + out += "\nvendor_id\tprocessor count" + + for key in cpu_info.keys(): + out += "\n" + key + "\t" + str(cpu_info[key]) + + return out, None + + +def _collect_lsof(verbose=False): + # Collect lsof data + # If verbose true then returns whole output + # If verbose false then returns count and type of fds for aerospike process + + out = "['lsof']" + + pids = get_asd_pids() + + o_dict = {} + unidentified_protocol_count = 0 + type_ljust = 20 + desc_ljust = 20 + + for pid in pids: + cmd = "sudo lsof -n -p %s" % str(pid) + o, e = util.shell_command([cmd]) + + if e or not o: + continue + + if verbose: + out += "\n" + str(o) + continue + + o_rows = o.strip().split("\n") + + # first line is header, so ignore it + if "asd" not in o_rows[0]: + o_rows = o_rows[1:] + + for row in o_rows: + try: + if "can't identify protocol" in row: + unidentified_protocol_count += 1 + + except Exception: + pass + + try: + t = row.strip().split()[4] + if t not in o_dict: + + if len(t) > type_ljust: + type_ljust = len(t) + + if (t in lsof_file_type_desc + and len(lsof_file_type_desc[t]) > desc_ljust): + desc_ljust = len(lsof_file_type_desc[t]) + + o_dict[t] = 1 + else: + o_dict[t] += 1 + + except Exception: + continue + + if verbose: + # sending actual output, no need to compute counts + return out, None + + out += "\n" + "FileType".ljust(type_ljust) + "Description".ljust(desc_ljust) + "fd count" + + for ftype in sorted(o_dict.keys()): + desc = "Unknown" + if ftype in lsof_file_type_desc: + desc = lsof_file_type_desc[ftype] + + out += "\n" + ftype.ljust(type_ljust) + desc.ljust(desc_ljust) + str(o_dict[ftype]) + + out += "\n\n" + "Unidentified Protocols = " + str(unidentified_protocol_count) + + return out, None + + +def _collectinfo_content(func, cmd='', alt_cmds=[]): + fname = '' + try: + fname = func.func_name + except Exception: + pass + + info_line = constants.COLLECTINFO_PROGRESS_MSG % (fname, (" %s" % (str(cmd)) if cmd else "")) + logger.info(info_line) + + o_line = constants.COLLECTINFO_SEPRATOR + + o, e = None, None + + if cmd: + o_line += str(cmd) + "\n" + + failed_cmds = [] + + try: + o, e = func(cmd) + except Exception as e: + return o_line + str(e), failed_cmds + + if e: + logger.warning(str(e)) + if func == util.shell_command: + failed_cmds += cmd + + if alt_cmds: + success = False + for alt_cmd in alt_cmds: + if not alt_cmd: + continue + + alt_cmd = [alt_cmd] + info_line = "Data collection for alternative command %s %s in progress.." % (fname, str(alt_cmd)) + logger.info(info_line) + o_line += str(alt_cmd) + "\n" + o_alt, e_alt = util.shell_command(alt_cmd) + + if e_alt: + e = e_alt + + else: + failed_cmds = [] + success = True + + if o_alt: + o = o_alt + break + + if not success: + if alt_cmds: + failed_cmds += alt_cmds + + if o: + o_line += str(o) + "\n" + + return o_line, failed_cmds + + +def _zip_files(dir_path, _size=1): + """ + If file size is greater then given _size, create zip of file on same location and + remove original one. Won't zip If zlib module is not available. 
+ """ + for root, dirs, files in os.walk(dir_path): + for _file in files: + file_path = os.path.join(root, _file) + size_mb = (os.path.getsize(file_path) / (1024 * 1024)) + if size_mb >= _size: + os.chdir(root) + try: + newzip = zipfile.ZipFile( + _file + ".zip", "w", zipfile.ZIP_DEFLATED) + newzip.write(_file) + newzip.close() + os.remove(_file) + except Exception as e: + print e + pass + + +def get_system_commands(port=3000): + # Unfortunately timestamp can not be printed in Centos with dmesg, + # storing dmesg logs without timestamp for this particular OS. + if 'centos' == (platform.linux_distribution()[0]).lower(): + cmd_dmesg = 'sudo dmesg' + alt_dmesg = '' + else: + cmd_dmesg = 'sudo dmesg -T' + alt_dmesg = 'sudo dmesg' + + # cmd and alternative cmds are stored in list of list instead of dic to + # maintain proper order for output + + + sys_shell_cmds = [ + ['hostname -I', 'hostname'], + ['top -n3 -b', 'top -l 3'], + ['lsb_release -a', 'ls /etc|grep release|xargs -I f cat /etc/f'], + ['cat /proc/meminfo', 'vmstat -s'], + ['cat /proc/interrupts'], + ['iostat -x 1 10'], + [cmd_dmesg, alt_dmesg], + ['sudo pgrep asd | xargs -I f sh -c "cat /proc/f/limits"'], + ['lscpu'], + ['sudo sysctl -a | grep -E "shmmax|file-max|maxfiles"'], + ['sudo iptables -L'], + ['sudo fdisk -l |grep Disk |grep dev | cut -d " " -f 2 | cut -d ":" -f 1 | xargs sudo hdparm -I 2>/dev/null'], + ['df -h'], + ['free -m'], + ['uname -a'], + + # Only in pretty print + ['cat /proc/partitions', 'fdisk -l'], + ['ls /sys/block/{sd*,xvd*,nvme*}/queue/rotational |xargs -I f sh -c "echo f; cat f;"'], + ['ls /sys/block/{sd*,xvd*,nvme*}/device/model |xargs -I f sh -c "echo f; cat f;"'], + ['ls /sys/block/{sd*,xvd*,nvme*}/queue/scheduler |xargs -I f sh -c "echo f; cat f;"'], + ['rpm -qa|grep -E "citrus|aero"', 'dpkg -l|grep -E "citrus|aero"'], + ['ip addr'], + ['ip -s link'], + ['sar -n DEV'], + ['sar -n EDEV'], + ['mpstat -P ALL 2 3'], + ['uptime'], + ['ss -ant state time-wait sport = :%d or dport = :%d | wc -l' % + (port, port), 'netstat -ant | grep %d | grep TIME_WAIT | wc -l' % (port)], + ['ss -ant state close-wait sport = :%d or dport = :%d | wc -l' % + (port, port), 'netstat -ant | grep %d | grep CLOSE_WAIT | wc -l' % (port)], + ['ss -ant state established sport = :%d or dport = :%d | wc -l' % + (port, port), 'netstat -ant | grep %d | grep ESTABLISHED | wc -l' % (port)], + ['ss -ant state listen sport = :%d or dport = :%d | wc -l' % + (port, port), 'netstat -ant | grep %d | grep LISTEN | wc -l' % (port)], + ['arp -n|grep ether|tr -s [:blank:] | cut -d" " -f5 |sort|uniq -c'], + ['find /proc/sys/net/ipv4/neigh/default/ -name "gc_thresh*" -print -exec cat {} \;'] + ] + + return sys_shell_cmds + + +def get_asd_pids(): + pids = [] + ps_cmd = 'sudo ps aux|grep -v grep|grep -E "asd|cld"' + ps_o, ps_e = util.shell_command([ps_cmd]) + if ps_o: + ps_o = ps_o.strip().split("\n") + pids = [] + for item in ps_o: + vals = item.strip().split() + if len(vals) >= 2: + pids.append(vals[1]) + return pids + + +def set_collectinfo_path(timestamp, output_prefix=""): + output_time = time.strftime("%Y%m%d_%H%M%S", timestamp) + aslogdir_prefix = "" + if output_prefix: + output_prefix = output_prefix.strip() + aslogdir_prefix = "%s%s" % (str(output_prefix), '_' if not output_prefix.endswith('_') + and not output_prefix.endswith('-') else "") \ + if output_prefix else "" + aslogdir = '/tmp/%scollect_info_' % (aslogdir_prefix) + output_time + as_logfile_prefix = aslogdir + '/' + output_time + '_' + + os.makedirs(aslogdir) + + return aslogdir, 
as_logfile_prefix + + +def archive_log(logdir): + _zip_files(logdir) + util.shell_command(["tar -czvf " + logdir + ".tgz " + logdir]) + sys.stderr.write("\x1b[2J\x1b[H") + print "\n\n\n" + logger.info("Files in " + logdir + " and " + logdir + ".tgz saved.") + + +def print_collecinto_summary(logdir, failed_cmds): + if failed_cmds: + logger.warning("Following commands are either unavailable or giving runtime error...") + logger.warning(list(set(failed_cmds))) + + print "\n" + logger.info("Please provide file " + logdir + ".tgz to Aerospike Support.") + logger.info("END OF ASCOLLECTINFO") + + # If multiple commands are given in execute_only mode then we might need coloring for next commands + terminal.enable_color(True) + + +def collect_sys_info(port=3000, timestamp="", outfile="", verbose=False): + failed_cmds = [] + + cluster_online = True + aslogdir = "" + + if not timestamp: + cluster_online = False + ts = time.gmtime() + timestamp = time.strftime("%Y-%m-%d %H:%M:%S UTC\n", ts) + aslogdir, as_logfile_prefix = set_collectinfo_path(ts) + outfile = as_logfile_prefix + "sysinfo.log" + + util.write_to_file(outfile, timestamp) + + try: + for cmds in get_system_commands(port=port): + o, f_cmds = _collectinfo_content(func=util.shell_command, cmd=cmds[0:1], + alt_cmds=cmds[1:] if len(cmds) > 1 else []) + failed_cmds += f_cmds + util.write_to_file(outfile, o) + except Exception as e: + print e + util.write_to_file(outfile, str(e)) + + try: + o, f_cmds = _collectinfo_content(func=_collect_cpuinfo) + util.write_to_file(outfile, o) + except Exception as e: + util.write_to_file(outfile, str(e)) + + try: + o, f_cmds = _collectinfo_content(func=_collect_awsdata) + util.write_to_file(outfile, o) + except Exception as e: + util.write_to_file(outfile, str(e)) + + try: + o, f_cmds = _collectinfo_content(func=_collect_lsof) + util.write_to_file(outfile, o) + except Exception as e: + util.write_to_file(outfile, str(e)) + + if verbose: + try: + o, f_cmds = _collectinfo_content(func=_collect_lsof(verbose=verbose)) + util.write_to_file(outfile, o) + except Exception as e: + util.write_to_file(outfile, str(e)) + + if not cluster_online: + # Cluster is offline so collecting only system info and archiving files + archive_log(aslogdir) + print_collecinto_summary(aslogdir, failed_cmds=failed_cmds) + + return failed_cmds + +######################################## diff --git a/lib/utils/constants.py b/lib/utils/constants.py index ec596ac7..4da5ff46 100644 --- a/lib/utils/constants.py +++ b/lib/utils/constants.py @@ -50,3 +50,6 @@ SERVER_FILE = 1 SYSTEM_FILE = 2 JSON_FILE = 3 + +COLLECTINFO_SEPRATOR = "\n====ASCOLLECTINFO====\n" +COLLECTINFO_PROGRESS_MSG = "Data collection for %s%s in progress.." diff --git a/lib/utils/util.py b/lib/utils/util.py index 523c38d9..59f38858 100644 --- a/lib/utils/util.py +++ b/lib/utils/util.py @@ -11,8 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ import copy -from distutils.version import LooseVersion import pipes import re import StringIO @@ -20,26 +20,6 @@ import sys import threading -from lib.utils import filesize - - -# Dictionary to contain feature and related stats to identify state of that feature -# Format : { feature1: ((service_stat1, service_stat2, ....), (namespace_stat1, namespace_stat2, ...), ...} -FEATURE_KEYS = { - "KVS": (('stat_read_reqs', 'stat_write_reqs'), ('client_read_error', 'client_read_success', 'client_write_error', 'client_write_success')), - "UDF": (('udf_read_reqs', 'udf_write_reqs'), ('client_udf_complete', 'client_udf_error')), - "Batch": (('batch_initiate', 'batch_index_initiate'), None), - "Scan": (('tscan_initiate', 'basic_scans_succeeded', 'basic_scans_failed', 'aggr_scans_succeeded', 'aggr_scans_failed', 'udf_bg_scans_succeeded', 'udf_bg_scans_failed'), - ('scan_basic_complete', 'scan_basic_error', 'scan_aggr_complete', 'scan_aggr_error', 'scan_udf_bg_complete', 'scan_udf_bg_error')), - "SINDEX": (('sindex-used-bytes-memory'), ('memory_used_sindex_bytes')), - "Query": (('query_reqs', 'query_success'), ('query_reqs', 'query_success')), - "Aggregation": (('query_agg', 'query_agg_success'), ('query_agg', 'query_agg_success')), - "LDT": (('sub-records', 'ldt-writes', 'ldt-reads', 'ldt-deletes', 'ldt_writes', 'ldt_reads', 'ldt_deletes', 'sub_objects'), - ('ldt-writes', 'ldt-reads', 'ldt-deletes', 'ldt_writes', 'ldt_reads', 'ldt_deletes')), - "XDR Source": (('stat_read_reqs_xdr', 'xdr_read_success', 'xdr_read_error'), None), - "XDR Destination": (('stat_write_reqs_xdr'), ('xdr_write_success')), - "Rack-aware": (('self-group-id'), ('rack-id')), - } class Future(object): @@ -58,8 +38,9 @@ def wrapper(func, *args, **kwargs): self.exc = None try: self._result = func(*args, **kwargs) - except Exception as e: - self.exc = e + except Exception: + # Store original stack trace/exception to be re-thrown later. + self.exc = sys.exc_info() self._worker = threading.Thread(target=wrapper, args=args, kwargs=kwargs) @@ -70,7 +51,8 @@ def start(self): def result(self): if self.exc: - raise self.exc + raise self.exc[0], self.exc[1], self.exc[2] + self._worker.join() return self._result @@ -369,6 +351,7 @@ def restructure_sys_data(content, cmd): return content + def get_value_from_second_level_of_dict(data, keys, default_value=None, return_type=None): """ Function takes dictionary and keys to find values inside all subkeys of dictionary. @@ -387,6 +370,7 @@ def get_value_from_second_level_of_dict(data, keys, default_value=None, return_t return res_dict + def add_dicts(d1, d2): """ Function takes two dictionaries and merges those to one dictionary by adding values for same key. @@ -403,6 +387,7 @@ def add_dicts(d1, d2): return d1 + def pct_to_value(data, d_pct): """ Function takes dictionary with base value, and dictionary with percentage and converts percentage to value. @@ -420,408 +405,6 @@ def pct_to_value(data, d_pct): return out_map -def _is_keyval_greater_than_value(data={}, keys=(), value=0, is_and=False, type_check=int): - """ - Function takes dictionary, keys and value to compare. - Returns boolean to indicate value for key is greater than comparing value or not. 
- """ - - if not keys: - return True - - if not data: - return False - - if not isinstance(keys, tuple): - keys = (keys,) - - if is_and: - if all(get_value_from_dict(data, k, value, type_check) > value for k in keys): - return True - - else: - if any(get_value_from_dict(data, k, value, type_check) > value for k in keys): - return True - - return False - -def check_feature_by_keys(service_data=None, service_keys=None, ns_data=None, ns_keys=None): - """ - Function takes dictionary of service data, service keys, dictionary of namespace data and namespace keys. - Returns boolean to indicate service key in service data or namespace key in namespace data has non-zero value or not. - """ - - if service_data and not isinstance(service_data, Exception) and service_keys: - if _is_keyval_greater_than_value(service_data, service_keys): - return True - - if ns_data and ns_keys: - for ns, nsval in ns_data.iteritems(): - if not nsval or isinstance(nsval, Exception): - continue - if _is_keyval_greater_than_value(nsval, ns_keys): - return True - - return False - -def find_nodewise_features(service_data, ns_data, cl_data={}): - """ - Function takes dictionary of service stats, dictionary of namespace stats, and dictionary cluster config. - Returns map of active (used) features per node identifying by comparing respective keys for non-zero value. - """ - - features = {} - - for node in service_data.keys(): - if node in cl_data and cl_data[node] and not isinstance(cl_data[node], Exception): - if service_data[node] and not isinstance(service_data[node], Exception): - service_data[node].update(cl_data[node]) - else: - service_data[node] = cl_data[node] - - for feature, keys in FEATURE_KEYS.iteritems(): - for node, s_stats in service_data.iteritems(): - - if node not in features: - features[node] = {} - - features[node][feature.upper()] = "NO" - n_stats = None - - if node in ns_data and not isinstance(ns_data[node], Exception): - n_stats = ns_data[node] - - if check_feature_by_keys(s_stats, keys[0], n_stats, keys[1]): - features[node][feature.upper()] = "YES" - - return features - -def _find_features_for_cluster(service_data, ns_data, cl_data={}): - """ - Function takes dictionary of service stats, dictionary of namespace stats, and dictionary cluster config. - Returns list of active (used) features identifying by comparing respective keys for non-zero value. - """ - - features = [] - - for node in service_data.keys(): - if cl_data and node in cl_data and cl_data[node] and not isinstance(cl_data[node], Exception): - if service_data[node] and not isinstance(service_data[node], Exception): - service_data[node].update(cl_data[node]) - else: - service_data[node] = cl_data[node] - - for feature, keys in FEATURE_KEYS.iteritems(): - for node, d in service_data.iteritems(): - - ns_d = None - - if node in ns_data and not isinstance(ns_data[node], Exception): - ns_d = ns_data[node] - - if check_feature_by_keys(d, keys[0], ns_d, keys[1]): - features.append(feature) - break - - return features - -def _compute_set_overhead_for_ns(set_stats, ns): - """ - Function takes set stat and namespace name. - Returns set overhead for input namespace name. 
- """ - - if not ns or not set_stats or isinstance(set_stats, Exception): - return 0 - - overhead = 0 - for _k, stats in set_stats.iteritems(): - if not stats or isinstance(stats, Exception): - continue - - ns_name = get_value_from_second_level_of_dict(stats, ("ns", "ns_name"), default_value=None, return_type=str).values()[0] - if ns_name != ns: - continue - - set_name = get_value_from_second_level_of_dict(stats, ("set", "set_name"), default_value="", return_type=str).values()[0] - objects = sum(get_value_from_second_level_of_dict(stats, ("objects", "n_objects"), default_value=0, return_type=int).values()) - overhead += objects * (9 + len(set_name)) - - return overhead - -def _compute_license_data_size(namespace_stats, set_stats, cluster_dict, ns_dict): - """ - Function takes dictionary of set stats, dictionary of namespace stats, cluster output dictionary and namespace output dictionary. - Function finds license data size per namespace, and per cluster and updates output dictionaries. - """ - - if not namespace_stats: - return - - cl_memory_data_size = 0 - cl_device_data_size = 0 - - for ns, ns_stats in namespace_stats.iteritems(): - if not ns_stats or isinstance(ns_stats, Exception): - continue - repl_factor = max(get_value_from_second_level_of_dict(ns_stats, ("repl-factor", "replication-factor"), default_value=0, return_type=int).values()) - master_objects = sum(get_value_from_second_level_of_dict(ns_stats, ("master_objects", "master-objects"), default_value=0, return_type=int).values()) - devices_in_use = list(set(get_value_from_second_level_of_dict(ns_stats, ("storage-engine.device", "device", "storage-engine.file", "file", "dev"), default_value=None, return_type=str).values())) - memory_data_size = None - device_data_size = None - - if len(devices_in_use) == 0 or (len(devices_in_use) == 1 and devices_in_use[0] == None): - # Data in memory only - memory_data_size = sum(get_value_from_second_level_of_dict(ns_stats, ("memory_used_data_bytes", "data-used-bytes-memory"), default_value=0, return_type=int).values()) - memory_data_size = memory_data_size / repl_factor - - if memory_data_size > 0: - memory_record_overhead = master_objects * 2 - memory_data_size = memory_data_size - memory_record_overhead - - else: - # Data on disk - device_data_size = sum(get_value_from_second_level_of_dict(ns_stats, ("device_used_bytes", "used-bytes-disk"), default_value=0, return_type=int).values()) - - if device_data_size > 0: - set_overhead = _compute_set_overhead_for_ns(set_stats, ns) - device_data_size = device_data_size - set_overhead - - if device_data_size > 0: - tombstones = sum(get_value_from_second_level_of_dict(ns_stats, ("tombstones",), default_value=0, return_type=int).values()) - tombstone_overhead = tombstones * 128 - device_data_size = device_data_size - tombstone_overhead - - device_data_size = device_data_size / repl_factor - if device_data_size > 0: - device_record_overhead = master_objects * 64 - device_data_size = device_data_size - device_record_overhead - - ns_dict[ns]["license_data_in_memory"] = 0 - ns_dict[ns]["license_data_on_disk"] = 0 - if memory_data_size is not None: - ns_dict[ns]["license_data_in_memory"] = memory_data_size - cl_memory_data_size += memory_data_size - - if device_data_size is not None: - ns_dict[ns]["license_data_on_disk"] = device_data_size - cl_device_data_size += device_data_size - - cluster_dict["license_data"] = {} - cluster_dict["license_data"]["memory_size"] = cl_memory_data_size - cluster_dict["license_data"]["device_size"] = cl_device_data_size - -def 
_set_migration_status(namespace_stats, cluster_dict, ns_dict): - """ - Function takes dictionary of namespace stats, cluster output dictionary and namespace output dictionary. - Function finds migration status per namespace, and per cluster and updates output dictionaries. - """ - - if not namespace_stats: - return - - for ns, ns_stats in namespace_stats.iteritems(): - if not ns_stats or isinstance(ns_stats, Exception): - continue - - migrations_in_progress = any(get_value_from_second_level_of_dict(ns_stats, ("migrate_tx_partitions_remaining", "migrate-tx-partitions-remaining"), - default_value=0, return_type=int).values()) - if migrations_in_progress: - ns_dict[ns]["migrations_in_progress"] = True - cluster_dict["migrations_in_progress"] = True - -def _initialize_summary_output(ns_list): - """ - Function takes list of namespace names. - Returns dictionary with summary fields set. - """ - - summary_dict = {} - summary_dict["CLUSTER"] = {} - - summary_dict["CLUSTER"]["server_version"] = [] - summary_dict["CLUSTER"]["os_version"] = [] - summary_dict["CLUSTER"]["active_features"] = [] - summary_dict["CLUSTER"]["migrations_in_progress"] = False - - summary_dict["CLUSTER"]["device"] = {} - summary_dict["CLUSTER"]["device"]["count"] = 0 - summary_dict["CLUSTER"]["device"]["count_per_node"] = 0 - summary_dict["CLUSTER"]["device"]["count_same_across_nodes"] = True - summary_dict["CLUSTER"]["device"]["total"] = 0 - summary_dict["CLUSTER"]["device"]["used"] = 0 - summary_dict["CLUSTER"]["device"]["aval"] = 0 - summary_dict["CLUSTER"]["device"]["used_pct"] = 0 - summary_dict["CLUSTER"]["device"]["aval_pct"] = 0 - - summary_dict["CLUSTER"]["memory"] = {} - summary_dict["CLUSTER"]["memory"]["total"] = 0 - summary_dict["CLUSTER"]["memory"]["aval"] = 0 - summary_dict["CLUSTER"]["memory"]["aval_pct"] = 0 - - summary_dict["CLUSTER"]["active_ns"] = 0 - summary_dict["CLUSTER"]["ns_count"] = 0 - - summary_dict["CLUSTER"]["license_data"] = {} - summary_dict["CLUSTER"]["license_data"]["memory_size"] = 0 - summary_dict["CLUSTER"]["license_data"]["device_size"] = 0 - - summary_dict["FEATURES"] = {} - summary_dict["FEATURES"]["NAMESPACE"] = {} - - for ns in ns_list: - summary_dict["FEATURES"]["NAMESPACE"][ns] = {} - - summary_dict["FEATURES"]["NAMESPACE"][ns]["devices_total"] = 0 - summary_dict["FEATURES"]["NAMESPACE"][ns]["devices_per_node"] = 0 - summary_dict["FEATURES"]["NAMESPACE"][ns]["devices_count_same_across_nodes"] = True - - summary_dict["FEATURES"]["NAMESPACE"][ns]["memory_total"] = 0 - summary_dict["FEATURES"]["NAMESPACE"][ns]["memory_aval"] = 0 - summary_dict["FEATURES"]["NAMESPACE"][ns]["memory_available_pct"] = 0 - - summary_dict["FEATURES"]["NAMESPACE"][ns]["disk_total"] = 0 - summary_dict["FEATURES"]["NAMESPACE"][ns]["disk_used"] = 0 - summary_dict["FEATURES"]["NAMESPACE"][ns]["disk_aval"] = 0 - summary_dict["FEATURES"]["NAMESPACE"][ns]["disk_used_pct"] = 0 - summary_dict["FEATURES"]["NAMESPACE"][ns]["disk_available_pct"] = 0 - - summary_dict["FEATURES"]["NAMESPACE"][ns]["repl_factor"] = 0 - summary_dict["FEATURES"]["NAMESPACE"][ns]["master_objects"] = 0 - - summary_dict["FEATURES"]["NAMESPACE"][ns]["license_data"] = {} - - summary_dict["FEATURES"]["NAMESPACE"][ns]["migrations_in_progress"] = False - - return summary_dict - -def create_summary(service_stats, namespace_stats, set_stats, metadata, cluster_configs={}): - """ - Function takes four dictionaries service stats, namespace stats, set stats and metadata. - Returns dictionary with summary information. 
- """ - - features = _find_features_for_cluster(service_stats, namespace_stats, cluster_configs) - - namespace_stats = flip_keys(namespace_stats) - set_stats = flip_keys(set_stats) - - summary_dict = _initialize_summary_output(namespace_stats.keys()) - - total_nodes = len(service_stats.keys()) - - cl_nodewise_device_counts = {} - - cl_nodewise_mem_size = {} - cl_nodewise_mem_aval = {} - - cl_nodewise_device_size = {} - cl_nodewise_device_used = {} - cl_nodewise_device_aval = {} - - _compute_license_data_size(namespace_stats, set_stats, summary_dict["CLUSTER"], summary_dict["FEATURES"]["NAMESPACE"]) - _set_migration_status(namespace_stats, summary_dict["CLUSTER"], summary_dict["FEATURES"]["NAMESPACE"]) - - summary_dict["CLUSTER"]["active_features"] = features - summary_dict["CLUSTER"]["cluster_size"]= list(set(get_value_from_second_level_of_dict(service_stats, ("cluster_size",), default_value=0, return_type=int).values())) - - if "server_version" in metadata and metadata["server_version"]: - summary_dict["CLUSTER"]["server_version"]= list(set(metadata["server_version"].values())) - - if "os_version" in metadata and metadata["os_version"]: - summary_dict["CLUSTER"]["os_version"]= list(set(get_value_from_second_level_of_dict(metadata["os_version"], ("description",), default_value="", return_type=str).values())) - - for ns, ns_stats in namespace_stats.iteritems(): - if not ns_stats or isinstance(ns_stats, Exception): - continue - - device_names_str = get_value_from_second_level_of_dict(ns_stats, ("storage-engine.device", "device", "storage-engine.file", "file", "dev"), default_value="", return_type=str) - device_counts = dict([(k, len(v.split(',')) if v else 0) for k, v in device_names_str.iteritems()]) - cl_nodewise_device_counts = add_dicts(cl_nodewise_device_counts, device_counts) - - ns_total_devices = sum(device_counts.values()) - ns_total_nodes = len(ns_stats.keys()) - - if ns_total_devices: - summary_dict["FEATURES"]["NAMESPACE"][ns]["devices_total"] = ns_total_devices - summary_dict["FEATURES"]["NAMESPACE"][ns]["devices_per_node"] = int((float(ns_total_devices)/float(ns_total_nodes)) + 0.5) - if len(set(device_counts.values())) > 1: - summary_dict["FEATURES"]["NAMESPACE"][ns]["devices_count_same_across_nodes"] = False - - mem_size = get_value_from_second_level_of_dict(ns_stats, ("memory-size",), default_value=0, return_type=int) - mem_aval_pct = get_value_from_second_level_of_dict(ns_stats, ("memory_free_pct", "free-pct-memory"), default_value=0, return_type=int) - mem_aval = pct_to_value(mem_size, mem_aval_pct) - cl_nodewise_mem_size = add_dicts(cl_nodewise_mem_size, mem_size) - cl_nodewise_mem_aval = add_dicts(cl_nodewise_mem_aval, mem_aval) - summary_dict["FEATURES"]["NAMESPACE"][ns]["memory_total"] = sum(mem_size.values()) - summary_dict["FEATURES"]["NAMESPACE"][ns]["memory_aval"] = sum(mem_aval.values()) - summary_dict["FEATURES"]["NAMESPACE"][ns]["memory_available_pct"] = (float(sum(mem_aval.values()))/float(sum(mem_size.values())))*100.0 - - device_size = get_value_from_second_level_of_dict(ns_stats, ("device_total_bytes", "total-bytes-disk"), default_value=0, return_type=int) - device_used = get_value_from_second_level_of_dict(ns_stats, ("device_used_bytes", "used-bytes-disk"), default_value=0, return_type=int) - device_aval_pct = get_value_from_second_level_of_dict(ns_stats, ("device_available_pct", "available_pct"), default_value=0, return_type=int) - device_aval = pct_to_value(device_size, device_aval_pct) - cl_nodewise_device_size = add_dicts(cl_nodewise_device_size, 
device_size) - cl_nodewise_device_used = add_dicts(cl_nodewise_device_used, device_used) - cl_nodewise_device_aval = add_dicts(cl_nodewise_device_aval, device_aval) - device_size_total = sum(device_size.values()) - if device_size_total > 0: - summary_dict["FEATURES"]["NAMESPACE"][ns]["disk_total"] = device_size_total - summary_dict["FEATURES"]["NAMESPACE"][ns]["disk_used"] = sum(device_used.values()) - summary_dict["FEATURES"]["NAMESPACE"][ns]["disk_aval"] = sum(device_aval.values()) - summary_dict["FEATURES"]["NAMESPACE"][ns]["disk_used_pct"] = (float(sum(device_used.values()))/float(device_size_total))*100.0 - summary_dict["FEATURES"]["NAMESPACE"][ns]["disk_available_pct"] = (float(sum(device_aval.values()))/float(device_size_total))*100.0 - - summary_dict["FEATURES"]["NAMESPACE"][ns]["repl_factor"] = list(set(get_value_from_second_level_of_dict(ns_stats, ("repl-factor", "replication-factor"), default_value=0, return_type=int).values())) - - data_in_memory = get_value_from_second_level_of_dict(ns_stats, ("storage-engine.data-in-memory", "data-in-memory"), default_value=False, return_type=bool).values()[0] - - if data_in_memory: - cache_read_pcts = get_value_from_second_level_of_dict(ns_stats, ("cache_read_pct", "cache-read-pct"), default_value="N/E", return_type=int).values() - if cache_read_pcts: - try: - summary_dict["FEATURES"]["NAMESPACE"][ns]["cache_read_pct"] = sum(cache_read_pcts)/len(cache_read_pcts) - except Exception: - pass - master_objects = sum(get_value_from_second_level_of_dict(ns_stats, ("master_objects", "master-objects"), default_value=0, return_type=int).values()) - summary_dict["CLUSTER"]["ns_count"] += 1 - if master_objects > 0: - summary_dict["FEATURES"]["NAMESPACE"][ns]["master_objects"] = master_objects - summary_dict["CLUSTER"]["active_ns"] += 1 - - try: - rack_ids = get_value_from_second_level_of_dict(ns_stats, ("rack-id",), default_value=None, return_type=int) - rack_ids = list(set(rack_ids.values())) - if len(rack_ids) > 1 or rack_ids[0] is not None: - if any((i is not None and i > 0) for i in rack_ids): - summary_dict["FEATURES"]["NAMESPACE"][ns]["rack-aware"] = True - else: - summary_dict["FEATURES"]["NAMESPACE"][ns]["rack-aware"] = False - except Exception: - pass - - cl_device_counts = sum(cl_nodewise_device_counts.values()) - if cl_device_counts: - summary_dict["CLUSTER"]["device"]["count"] = cl_device_counts - summary_dict["CLUSTER"]["device"]["count_per_node"] = int((float(cl_device_counts)/float(total_nodes)) + 0.5) - if len(set(cl_nodewise_device_counts.values())) > 1: - summary_dict["CLUSTER"]["device"]["count_same_across_nodes"] = False - - cl_memory_size_total = sum(cl_nodewise_mem_size.values()) - if cl_memory_size_total > 0: - summary_dict["CLUSTER"]["memory"]["total"] = cl_memory_size_total - summary_dict["CLUSTER"]["memory"]["aval"] = sum(cl_nodewise_mem_aval.values()) - summary_dict["CLUSTER"]["memory"]["aval_pct"] = (float(sum(cl_nodewise_mem_aval.values()))/float(cl_memory_size_total))*100.0 - - cl_device_size_total = sum(cl_nodewise_device_size.values()) - if cl_device_size_total > 0: - summary_dict["CLUSTER"]["device"]["total"] = cl_device_size_total - summary_dict["CLUSTER"]["device"]["used"] = sum(cl_nodewise_device_used.values()) - summary_dict["CLUSTER"]["device"]["aval"] = sum(cl_nodewise_device_aval.values()) - summary_dict["CLUSTER"]["device"]["used_pct"] = (float(sum(cl_nodewise_device_used.values()))/float(cl_device_size_total))*100.0 - summary_dict["CLUSTER"]["device"]["aval_pct"] = 
(float(sum(cl_nodewise_device_aval.values()))/float(cl_device_size_total))*100.0 - - return summary_dict def mbytes_to_bytes(data): if not data: @@ -838,198 +421,6 @@ def mbytes_to_bytes(data): return data -def _create_histogram_percentiles_output(histogram_name, histogram_data): - histogram_data = flip_keys(histogram_data) - - for namespace, host_data in histogram_data.iteritems(): - for host_id, data in host_data.iteritems(): - hist = data['data'] - width = data['width'] - - cum_total = 0 - total = sum(hist) - percentile = 0.1 - result = [] - - for i, v in enumerate(hist): - cum_total += float(v) - if total > 0: - portion = cum_total / total - else: - portion = 0.0 - - while portion >= percentile: - percentile += 0.1 - result.append(i + 1) - - if percentile > 1.0: - break - - if result == []: - result = [0] * 10 - - if histogram_name is "objsz": - data['percentiles'] = [(r * width) - 1 if r > 0 else r for r in result] - else: - data['percentiles'] = [r * width for r in result] - - return histogram_data - -def _create_bytewise_histogram_percentiles_output(histogram_data, bucket_count, builds): - histogram_data = flip_keys(histogram_data) - - for namespace, host_data in histogram_data.iteritems(): - result = [] - rblock_size_bytes = 128 - width = 1 - - for host_id, data in host_data.iteritems(): - - try: - as_version = builds[host_id] - if (LooseVersion(as_version) < LooseVersion("2.7.0") - or (LooseVersion(as_version) >= LooseVersion("3.0.0") - and LooseVersion(as_version) < LooseVersion("3.1.3"))): - rblock_size_bytes = 512 - - except Exception: - pass - - hist = data['data'] - width = data['width'] - - for i, v in enumerate(hist): - if v and v > 0: - result.append(i) - - result = list(set(result)) - result.sort() - start_buckets = [] - - if len(result) <= bucket_count: - # if asinfo buckets with values>0 are less than - # show_bucket_count then we can show all single buckets as it - # is, no need to merge to show big range - for res in result: - start_buckets.append(res) - start_buckets.append(res + 1) - - else: - # dividing volume buckets (from min possible bucket with - # value>0 to max possible bucket with value>0) into same range - start_bucket = result[0] - size = result[len(result) - 1] - result[0] + 1 - - bucket_width = size / bucket_count - additional_bucket_index = bucket_count - (size % bucket_count) - - bucket_index = 0 - - while bucket_index < bucket_count: - start_buckets.append(start_bucket) - - if bucket_index == additional_bucket_index: - bucket_width += 1 - - start_bucket += bucket_width - bucket_index += 1 - - start_buckets.append(start_bucket) - - columns = [] - need_to_show = {} - - for i, bucket in enumerate(start_buckets): - - if i == len(start_buckets) - 1: - break - - key = _get_bucket_range(bucket, start_buckets[i + 1], width, rblock_size_bytes) - need_to_show[key] = False - columns.append(key) - - for host_id, data in host_data.iteritems(): - - rblock_size_bytes = 128 - - try: - as_version = builds[host_id] - - if (LooseVersion(as_version) < LooseVersion("2.7.0") - or (LooseVersion(as_version) >= LooseVersion("3.0.0") - and LooseVersion(as_version) < LooseVersion("3.1.3"))): - rblock_size_bytes = 512 - - except Exception: - pass - - hist = data['data'] - width = data['width'] - data['values'] = {} - - for i, s in enumerate(start_buckets): - - if i == len(start_buckets) - 1: - break - - b_index = s - - key = _get_bucket_range(s, start_buckets[i + 1], width, rblock_size_bytes) - - if key not in columns: - columns.append(key) - - if key not in data["values"]: - 
data["values"][key] = 0 - - while b_index < start_buckets[i + 1]: - data["values"][key] += hist[b_index] - b_index += 1 - - if data["values"][key] > 0: - need_to_show[key] = True - - else: - if key not in need_to_show: - need_to_show[key] = False - - host_data["columns"] = [] - - for column in columns: - if need_to_show[column]: - host_data["columns"].append(column) - - return histogram_data - -def _get_bucket_range(current_bucket, next_bucket, width, rblock_size_bytes): - s_b = "0 B" - if current_bucket > 0: - last_bucket_last_rblock_end = ((current_bucket * width) - 1) * rblock_size_bytes - - if last_bucket_last_rblock_end < 1: - last_bucket_last_rblock_end = 0 - - else: - last_bucket_last_rblock_end += 1 - - s_b = filesize.size(last_bucket_last_rblock_end, filesize.byte) - - if current_bucket == 99 or next_bucket > 99: - return ">%s" % (s_b.replace(" ", "")) - - bucket_last_rblock_end = ((next_bucket * width) - 1) * rblock_size_bytes - e_b = filesize.size(bucket_last_rblock_end, filesize.byte) - return "%s to %s" % (s_b.replace(" ", ""), e_b.replace(" ", "")) - -def create_histogram_output(histogram_name, histogram_data, **params): - if "byte_distribution" not in params or not params["byte_distribution"]: - return _create_histogram_percentiles_output(histogram_name, histogram_data) - - if "bucket_count" not in params or "builds" not in params: - return {} - - return _create_bytewise_histogram_percentiles_output(histogram_data, params["bucket_count"], params["builds"]) - def find_delimiter_in(value): """Find a good delimiter to split the value by""" @@ -1039,6 +430,7 @@ def find_delimiter_in(value): return ';' + def convert_edition_to_shortform(edition): """Convert edition to shortform Enterprise or Community or N/E""" @@ -1049,3 +441,9 @@ def convert_edition_to_shortform(edition): return "Community" return "N/E" + + +def write_to_file(file, data): + f = open(str(file), 'a') + f.write(str(data)) + return f.close() diff --git a/lib/view/view.py b/lib/view/view.py index 82c587b5..5df92e91 100644 --- a/lib/view/view.py +++ b/lib/view/view.py @@ -426,6 +426,7 @@ def info_namespace_object(stats, cluster, title_suffix="", **ignore): row = {} else: row = ns_stats + ns_stats['_total_records'] = 0 if ns not in total_res: total_res[ns] = {} @@ -439,8 +440,6 @@ def info_namespace_object(stats, cluster, title_suffix="", **ignore): total_res[ns]["migrate_tx_partitions_remaining"] = 0 total_res[ns]["migrate_rx_partitions_remaining"] = 0 - ns_stats['_total_records'] = 0 - if "rack-id" in row: rack_id_available = True @@ -503,7 +502,8 @@ def info_namespace_object(stats, cluster, title_suffix="", **ignore): except Exception: pass - total_res[ns]['_total_records'] += ns_stats['_total_records'] + if not isinstance(ns_stats, Exception): + total_res[ns]['_total_records'] += ns_stats['_total_records'] row['namespace'] = ns row['real_node_id'] = node.node_id @@ -880,8 +880,12 @@ def _update_latency_column_list(data, all_columns): for column in data["columns"]: if column[0] == '>': - column = int(column[1:-2]) - all_columns.add(column) + c = int(column[1:-2]) + all_columns.add((c,(column, "%%>%dMs"%c))) + + elif column[0:2] == "%>": + c = int(column[2:-2]) + all_columns.add((c, column)) @staticmethod def _create_latency_row(data, ns=" "): @@ -935,7 +939,7 @@ def show_latency(latency, cluster, machine_wise_display=False, show_ns_details=F for ns, ns_data in _type_data.iteritems(): CliView._update_latency_column_list(ns_data, all_columns=all_columns) - all_columns = [">%sms" % (c) for c in sorted(all_columns)] + 
all_columns = [c[1] for c in sorted(all_columns, key=lambda c:c[0])] all_columns.insert(0, 'ops/sec') all_columns.insert(0, 'Time Span') if show_ns_details: diff --git a/test/e2e/test_show.py b/test/e2e/test_show.py index e1636389..ee29cca9 100644 --- a/test/e2e/test_show.py +++ b/test/e2e/test_show.py @@ -312,9 +312,9 @@ def test_proxy_latency(self): exp_header= ['Node', 'Time Span', 'Ops/Sec', - '>1Ms', - '>8Ms', - '>64Ms'] + '%>1Ms', + '%>8Ms', + '%>64Ms'] exp_no_of_rows = len(TestShowLatency.rc.cluster._live_nodes) actual_heading, actual_header, actual_no_of_rows = test_util.parse_output(TestShowLatency.proxy_latency, horizontal = True) @@ -337,9 +337,9 @@ def test_query_latency(self): exp_header= ['Node', 'Time Span', 'Ops/Sec', - '>1Ms', - '>8Ms', - '>64Ms'] + '%>1Ms', + '%>8Ms', + '%>64Ms'] exp_no_of_rows = len(TestShowLatency.rc.cluster._live_nodes) @@ -363,9 +363,9 @@ def test_reads_latency(self): exp_header= ['Node', 'Time Span', 'Ops/Sec', - '>1Ms', - '>8Ms', - '>64Ms'] + '%>1Ms', + '%>8Ms', + '%>64Ms'] exp_no_of_rows = len(TestShowLatency.rc.cluster._live_nodes) @@ -389,9 +389,9 @@ def test_udf_latency(self): exp_header= ['Node', 'Time Span', 'Ops/Sec', - '>1Ms', - '>8Ms', - '>64Ms'] + '%>1Ms', + '%>8Ms', + '%>64Ms'] exp_no_of_rows = len(TestShowLatency.rc.cluster._live_nodes) @@ -415,9 +415,9 @@ def test_writes_master_latency(self): exp_header= ['Node', 'Time Span', 'Ops/Sec', - '>1Ms', - '>8Ms', - '>64Ms'] + '%>1Ms', + '%>8Ms', + '%>64Ms'] exp_no_of_rows = len(TestShowLatency.rc.cluster._live_nodes) @@ -441,9 +441,9 @@ def test_write_latency(self): exp_header= ['Node', 'Time Span', 'Ops/Sec', - '>1Ms', - '>8Ms', - '>64Ms'] + '%>1Ms', + '%>8Ms', + '%>64Ms'] exp_no_of_rows = len(TestShowLatency.rc.cluster._live_nodes)
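# Illustrative sketch (not part of the patch): the view.py change above keeps
# each latency column as a (threshold, label) pair so the "%>NMs" headers sort
# numerically rather than lexically. Minimal, self-contained example; the
# sample labels are hypothetical, not taken from the patch:

all_columns = set()
for column in ["%>64Ms", "%>1Ms", "%>8Ms"]:   # hypothetical sample labels
    threshold = int(column[2:-2])             # "%>64Ms" -> 64
    all_columns.add((threshold, column))

ordered = [label for _, label in sorted(all_columns)]
print(ordered)  # ['%>1Ms', '%>8Ms', '%>64Ms'] -- a plain lexical sort would give 1Ms, 64Ms, 8Ms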