From ba7e43a7aa936a06b292ff4cc654cc4b275c8789 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 11 Jul 2017 16:25:42 -0400 Subject: [PATCH 01/19] mgr/PyModules: add 'pg_status' dump This is summary info, same as what's in 'ceph status'. Signed-off-by: Sage Weil --- src/mgr/PyModules.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/mgr/PyModules.cc b/src/mgr/PyModules.cc index 74628dbbf1c7b..d63b94b939f6d 100644 --- a/src/mgr/PyModules.cc +++ b/src/mgr/PyModules.cc @@ -254,6 +254,14 @@ PyObject *PyModules::get_python(const std::string &what) } ); return f.get(); + } else if (what == "pg_status") { + PyFormatter f; + cluster_state.with_pgmap( + [&f](const PGMap &pg_map) { + pg_map.print_summary(&f, nullptr); + } + ); + return f.get(); } else if (what == "df") { PyFormatter f; From 7686fd1f26018e21a15b4b80eaca780e384ce935 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 27 Jul 2017 10:06:45 -0400 Subject: [PATCH 02/19] mgr/PyModules: add 'pg_dump' get Signed-off-by: Sage Weil --- src/mgr/PyModules.cc | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/mgr/PyModules.cc b/src/mgr/PyModules.cc index d63b94b939f6d..526917f68b761 100644 --- a/src/mgr/PyModules.cc +++ b/src/mgr/PyModules.cc @@ -262,7 +262,14 @@ PyObject *PyModules::get_python(const std::string &what) } ); return f.get(); - + } else if (what == "pg_dump") { + PyFormatter f; + cluster_state.with_pgmap( + [&f](const PGMap &pg_map) { + pg_map.dump(&f); + } + ); + return f.get(); } else if (what == "df") { PyFormatter f; From 4337ec8fee6895c39a954eb1947f497cb7afbb9d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 10 Jul 2017 23:23:19 -0400 Subject: [PATCH 03/19] mgr: add trivial OSDMap wrapper class Signed-off-by: Sage Weil --- src/CMakeLists.txt | 1 + src/mgr/MgrPyModule.cc | 4 + src/mgr/PyModules.cc | 23 ++++++ src/mgr/PyModules.h | 1 + src/mgr/PyOSDMap.cc | 125 +++++++++++++++++++++++++++++ src/mgr/PyOSDMap.h | 10 +++ src/mgr/PyState.cc | 9 +++ 
src/pybind/mgr/dashboard/module.py | 3 + src/pybind/mgr/mgr_module.py | 49 +++++++++++ 9 files changed, 225 insertions(+) create mode 100644 src/mgr/PyOSDMap.cc create mode 100644 src/mgr/PyOSDMap.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2a8639aa062ff..424a7e5fe14c7 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -688,6 +688,7 @@ if (WITH_MGR) mgr/ClusterState.cc mgr/PyModules.cc mgr/PyFormatter.cc + mgr/PyOSDMap.cc mgr/PyState.cc mgr/MgrPyModule.cc mgr/MgrStandby.cc diff --git a/src/mgr/MgrPyModule.cc b/src/mgr/MgrPyModule.cc index fda9bf6528dbd..c5f54361465c3 100644 --- a/src/mgr/MgrPyModule.cc +++ b/src/mgr/MgrPyModule.cc @@ -12,6 +12,7 @@ */ #include "PyState.h" +#include "PyOSDMap.h" #include "Gil.h" #include "PyFormatter.h" @@ -109,6 +110,9 @@ MgrPyModule::MgrPyModule(const std::string &module_name_, const std::string &sys } // Populate python namespace with callable hooks Py_InitModule("ceph_state", CephStateMethods); + Py_InitModule("ceph_osdmap", OSDMapMethods); + Py_InitModule("ceph_osdmap_incremental", OSDMapIncrementalMethods); + Py_InitModule("ceph_crushmap", CRUSHMapMethods); PySys_SetPath(const_cast(sys_path.c_str())); } diff --git a/src/mgr/PyModules.cc b/src/mgr/PyModules.cc index 526917f68b761..14372249f646d 100644 --- a/src/mgr/PyModules.cc +++ b/src/mgr/PyModules.cc @@ -786,6 +786,29 @@ PyObject *PyModules::get_context() return capsule; } +static void delete_osdmap(PyObject *object) +{ + OSDMap *osdmap = static_cast(PyCapsule_GetPointer(object, nullptr)); + assert(osdmap); + dout(10) << __func__ << " " << osdmap << dendl; + delete osdmap; +} + +PyObject *PyModules::get_osdmap() +{ + PyThreadState *tstate = PyEval_SaveThread(); + Mutex::Locker l(lock); + PyEval_RestoreThread(tstate); + + // Construct a capsule containing an OSDMap. 
+ OSDMap *newmap = new OSDMap; + cluster_state.with_osdmap([&](const OSDMap& o) { + newmap->deepish_copy_from(o); + }); + dout(10) << __func__ << " " << newmap << dendl; + return PyCapsule_New(newmap, nullptr, &delete_osdmap); +} + static void _list_modules( const std::string path, std::set *modules) diff --git a/src/mgr/PyModules.h b/src/mgr/PyModules.h index 1c6751f4e51b9..b2540777163e3 100644 --- a/src/mgr/PyModules.h +++ b/src/mgr/PyModules.h @@ -82,6 +82,7 @@ class PyModules const std::string svc_type, const std::string &svc_id); PyObject *get_context(); + PyObject *get_osdmap(); std::map config_cache; diff --git a/src/mgr/PyOSDMap.cc b/src/mgr/PyOSDMap.cc new file mode 100644 index 0000000000000..9b1bdc4e3b79d --- /dev/null +++ b/src/mgr/PyOSDMap.cc @@ -0,0 +1,125 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "Mgr.h" + +#include "osd/OSDMap.h" +#include "common/errno.h" +#include "common/version.h" + +#include "PyOSDMap.h" +#include "PyFormatter.h" +#include "Gil.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr + +// ---------- + +static PyObject *osdmap_get_epoch(PyObject *self, PyObject *obj) +{ + OSDMap *osdmap = static_cast(PyCapsule_GetPointer(obj, nullptr)); + return PyInt_FromLong(osdmap->get_epoch()); +} + +static PyObject *osdmap_dump(PyObject *self, PyObject *obj) +{ + OSDMap *osdmap = static_cast(PyCapsule_GetPointer(obj, nullptr)); + PyFormatter f; + osdmap->dump(&f); + return f.get(); +} + + +static void delete_osdmap_incremental(PyObject *object) +{ + OSDMap::Incremental *inc = static_cast( + PyCapsule_GetPointer(object, nullptr)); + derr << __func__ << " " << inc << dendl; + delete inc; +} + +static PyObject *osdmap_new_incremental(PyObject *self, PyObject *obj) +{ + OSDMap *osdmap = static_cast(PyCapsule_GetPointer(obj, nullptr)); + + // Construct a capsule containing an OSDMap. 
+ OSDMap::Incremental *inc = new OSDMap::Incremental; + inc->fsid = osdmap->get_fsid(); + inc->epoch = osdmap->get_epoch() + 1; + return PyCapsule_New(inc, nullptr, &delete_osdmap_incremental); +} + +static PyObject *osdmap_calc_pg_upmaps(PyObject *self, PyObject *args) +{ + PyObject *mapobj, *incobj, *pool_list; + double max_deviation = 0; + int max_iterations = 0; + if (!PyArg_ParseTuple(args, "OOdiO:calc_pg_upmaps", + &mapobj, &incobj, &max_deviation, + &max_iterations, &pool_list)) { + return nullptr; + } + + OSDMap *osdmap = static_cast(PyCapsule_GetPointer(mapobj, nullptr)); + OSDMap::Incremental *inc = static_cast( + PyCapsule_GetPointer(incobj, nullptr)); + + dout(10) << __func__ << " osdmap " << osdmap << " inc " << inc + << " max_deviation " << max_deviation + << " max_iterations " << max_iterations + << dendl; + set pools; + // FIXME: unpack pool_list and translate to pools set + int r = osdmap->calc_pg_upmaps(g_ceph_context, + max_deviation, + max_iterations, + pools, + inc); + dout(10) << __func__ << " r = " << r << dendl; + return PyInt_FromLong(r); +} + +PyMethodDef OSDMapMethods[] = { + {"get_epoch", osdmap_get_epoch, METH_O, "Get OSDMap epoch"}, + {"dump", osdmap_dump, METH_O, "Dump OSDMap::Incremental"}, + {"new_incremental", osdmap_new_incremental, METH_O, + "Create OSDMap::Incremental"}, + {"calc_pg_upmaps", osdmap_calc_pg_upmaps, METH_VARARGS, + "Calculate new pg-upmap values"}, + {NULL, NULL, 0, NULL} +}; + +// ---------- + +static PyObject *osdmap_inc_get_epoch(PyObject *self, PyObject *obj) +{ + OSDMap::Incremental *inc = static_cast( + PyCapsule_GetPointer(obj, nullptr)); + return PyInt_FromLong(inc->epoch); +} + +static PyObject *osdmap_inc_dump(PyObject *self, PyObject *obj) +{ + OSDMap::Incremental *inc = static_cast( + PyCapsule_GetPointer(obj, nullptr)); + PyFormatter f; + inc->dump(&f); + return f.get(); +} + +PyMethodDef OSDMapIncrementalMethods[] = { + {"get_epoch", osdmap_inc_get_epoch, METH_O, "Get OSDMap::Incremental epoch"}, + 
{"dump", osdmap_inc_dump, METH_O, "Dump OSDMap::Incremental"}, + {NULL, NULL, 0, NULL} +}; + + +// ---------- + + + +PyMethodDef CRUSHMapMethods[] = { +// {"get_epoch", osdmap_get_epoch, METH_O, "Get OSDMap epoch"}, + {NULL, NULL, 0, NULL} +}; diff --git a/src/mgr/PyOSDMap.h b/src/mgr/PyOSDMap.h new file mode 100644 index 0000000000000..bfa851cd66012 --- /dev/null +++ b/src/mgr/PyOSDMap.h @@ -0,0 +1,10 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "Python.h" + +extern PyMethodDef OSDMapMethods[]; +extern PyMethodDef OSDMapIncrementalMethods[]; +extern PyMethodDef CRUSHMapMethods[]; diff --git a/src/mgr/PyState.cc b/src/mgr/PyState.cc index ae237b2e599e6..d70a96d316cc2 100644 --- a/src/mgr/PyState.cc +++ b/src/mgr/PyState.cc @@ -348,6 +348,13 @@ get_perf_schema(PyObject *self, PyObject *args) return global_handle->get_perf_schema_python(handle, type_str, svc_id); } +static PyObject * +ceph_get_osdmap(PyObject *self, PyObject *args) +{ + return global_handle->get_osdmap(); +} + + PyMethodDef CephStateMethods[] = { {"get", ceph_state_get, METH_VARARGS, "Get a cluster object"}, @@ -377,6 +384,8 @@ PyMethodDef CephStateMethods[] = { "Get the ceph version of this process"}, {"get_context", ceph_get_context, METH_NOARGS, "Get a CephContext* in a python capsule"}, + {"get_osdmap", ceph_get_osdmap, METH_NOARGS, + "Get an OSDMap handle"}, {NULL, NULL, 0, NULL} }; diff --git a/src/pybind/mgr/dashboard/module.py b/src/pybind/mgr/dashboard/module.py index 7e2a4260a03e6..c9039a831f7f4 100644 --- a/src/pybind/mgr/dashboard/module.py +++ b/src/pybind/mgr/dashboard/module.py @@ -802,6 +802,9 @@ def get_perf_schema(self, **args): 'engine.autoreload.on': False }) + osdmap = self.get_osdmap() + log.info("latest osdmap is %d" % osdmap.get_epoch()) + static_dir = os.path.join(current_dir, 'static') conf = { "/static": { diff --git a/src/pybind/mgr/mgr_module.py b/src/pybind/mgr/mgr_module.py index 
49c7efe15ca8d..2b64a2ea9ccfc 100644 --- a/src/pybind/mgr/mgr_module.py +++ b/src/pybind/mgr/mgr_module.py @@ -1,5 +1,8 @@ import ceph_state #noqa +import ceph_osdmap #noqa +import ceph_osdmap_incremental #noqa +import ceph_crushmap #noqa import json import logging import threading @@ -30,6 +33,44 @@ def wait(self): return self.r, self.outb, self.outs +class OSDMap(object): + def __init__(self, handle): + self._handle = handle + + def get_epoch(self): + return ceph_osdmap.get_epoch(self._handle) + + def dump(self): + return ceph_osdmap.dump(self._handle) + + def new_incremental(self): + return OSDMapIncremental(ceph_osdmap.new_incremental(self._handle)) + + def calc_pg_upmaps(self, inc, max_deviation=.01, max_iterations=10, pools=[]): + return ceph_osdmap.calc_pg_upmaps( + self._handle, + inc._handle, + max_deviation, max_iterations, pools) + + +class OSDMapIncremental(object): + def __init__(self, handle): + self._handle = handle + + def get_epoch(self): + return ceph_osdmap_incremental.get_epoch(self._handle) + + def dump(self): + return ceph_osdmap_incremental.dump(self._handle) + +class CRUSHMap(object): + def __init__(self, handle): + self._handle = handle + +# def get_epoch(self): +# return ceph_crushmap.get_epoch(self._handle) + + class MgrModule(object): COMMANDS = [] @@ -275,3 +316,11 @@ def self_test(self): :return: bool """ pass + + def get_osdmap(self): + """ + Get a handle to an OSDMap. If epoch==0, get a handle for the latest + OSDMap. 
+ :return: OSDMap + """ + return OSDMap(ceph_state.get_osdmap()) From 57ba13f322170bfad1a1c12b124202f93c9cc864 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 11 Jul 2017 16:26:16 -0400 Subject: [PATCH 04/19] pybind/mgr/mgr_module: add default arg to get_config Signed-off-by: Sage Weil --- src/pybind/mgr/mgr_module.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/pybind/mgr/mgr_module.py b/src/pybind/mgr/mgr_module.py index 2b64a2ea9ccfc..346bc6e26dd0f 100644 --- a/src/pybind/mgr/mgr_module.py +++ b/src/pybind/mgr/mgr_module.py @@ -236,14 +236,18 @@ def get_mgr_id(self): """ return ceph_state.get_mgr_id() - def get_config(self, key): + def get_config(self, key, default=None): """ Retrieve the value of a persistent configuration setting :param key: str :return: str """ - return ceph_state.get_config(self._handle, key) + r = ceph_state.get_config(self._handle, key) + if r is None: + return default + else: + return r def get_config_prefix(self, key_prefix): """ From 4f6305f7adb97e98e50bda5aff7a392ca303136c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 11 Jul 2017 16:27:08 -0400 Subject: [PATCH 05/19] pybind/mgr/balancer: add balancer module - wake up every minute - back off when unknown, inactive, degraded - throttle against misplaced ratio - apply some optimization step - initially implement 'upmap' only Signed-off-by: Sage Weil --- src/pybind/mgr/balancer/__init__.py | 2 + src/pybind/mgr/balancer/module.py | 109 ++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+) create mode 100644 src/pybind/mgr/balancer/__init__.py create mode 100644 src/pybind/mgr/balancer/module.py diff --git a/src/pybind/mgr/balancer/__init__.py b/src/pybind/mgr/balancer/__init__.py new file mode 100644 index 0000000000000..79f5b86fd5045 --- /dev/null +++ b/src/pybind/mgr/balancer/__init__.py @@ -0,0 +1,2 @@ + +from module import * # NOQA diff --git a/src/pybind/mgr/balancer/module.py b/src/pybind/mgr/balancer/module.py new file mode 100644 
index 0000000000000..883be130c3135 --- /dev/null +++ b/src/pybind/mgr/balancer/module.py @@ -0,0 +1,109 @@ + +""" +Balance PG distribution across OSDs. +""" + +import json +import time +from mgr_module import MgrModule, CommandResult +from threading import Event + +# available modes: 'none', 'crush', 'crush-compat', 'upmap', 'osd_weight' +default_mode = 'upmap' +default_sleep_interval = 30 # seconds +default_max_misplaced = .03 # max ratio of pgs replaced at a time + +class Module(MgrModule): + run = True + + def __init__(self, *args, **kwargs): + super(Module, self).__init__(*args, **kwargs) + self.event = Event() + + def handle_command(self, command): + return (-errno.EINVAL, '', + "Command not found '{0}'".format(command['prefix'])) + + def shutdown(self): + self.log.info('Stopping') + self.run = False + self.event.set() + + def serve(self): + self.log.info('Starting') + while self.run: + mode = self.get_config('mode', default_mode) + sleep_interval = float(self.get_config('sleep_interval', + default_sleep_interval)) + max_misplaced = float(self.get_config('max_misplaced', + default_max_misplaced)) + self.log.info('Mode %s, sleep interval %f, max misplaced %f' % + (mode, sleep_interval, max_misplaced)) + + info = self.get('pg_status') + if info.get('unknown_ratio', 0.0) + info.get('degraded_ratio', 0.0) + info.get('inactive_ratio', 0.0) > 0.0: + self.log.info('Some PGs are unknown, degraded, or inactive; waiting') + elif info.get('misplaced_ratio', 0.0) >= max_misplaced: + self.log.info('Too many PGs (%f > max %f) are misplaced; waiting' % (info.get('misplaced_ratio', 0.0), max_misplaced)) + else: + if mode == 'upmap': + self.do_upmap() + elif mode == 'crush': + self.do_crush(compat=False) + elif mode == 'crush-compat': + self.do_crush(compat=True) + elif mode == 'osd_weight': + self.do_osd_weight() + elif mode == 'none': + self.log.info('Idle') + else: + self.log.info('Unrecognized mode %s' % mode) + self.event.wait(sleep_interval) + + def do_upmap(self): + 
self.log.info('do_upmap') + max_iterations = self.get_config('upmap_max_iterations', 10) + max_deviation = self.get_config('upmap_max_deviation', .01) + osdmap = self.get_osdmap() + inc = osdmap.new_incremental() + osdmap.calc_pg_upmaps(inc, max_deviation, max_iterations, []) + incdump = inc.dump() + self.log.info('inc is %s' % incdump) + + for pgid in incdump.get('old_pg_upmap_items', []): + self.log.info('ceph osd rm-pg-upmap-items %s', pgid) + result = CommandResult('foo') + self.send_command(result, 'mon', '', json.dumps({ + 'prefix': 'osd rm-pg-upmap-items', + 'format': 'json', + 'pgid': pgid, + }), 'foo') + r, outb, outs = result.wait() + if r != 0: + self.log.error('Error removing pg-upmap on %s' % pgid) + break; + + for item in incdump.get('new_pg_upmap_items', []): + self.log.info('ceph osd pg-upmap-items %s mappings %s', item['pgid'], + item['mappings']) + osdlist = [] + for m in item['mappings']: + osdlist += [m['from'], m['to']] + result = CommandResult('foo') + self.send_command(result, 'mon', '', json.dumps({ + 'prefix': 'osd pg-upmap-items', + 'format': 'json', + 'pgid': item['pgid'], + 'id': osdlist, + }), 'foo') + r, outb, outs = result.wait() + if r != 0: + self.log.error('Error setting pg-upmap on %s' % item['pgid']) + break; + + + def do_crush(self, compat): + self.log.info('do_crush compat=%s' % compat) + + def do_osd_weight(self): + self.log.info('do_osd_weight') From f7f017a7297f6ff891c21f06ce6ae0c90508c5a5 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 22 Jul 2017 23:17:18 -0400 Subject: [PATCH 06/19] pybind/mgr/balancer: do upmap by pool, in random order Signed-off-by: Sage Weil --- src/pybind/mgr/balancer/module.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/src/pybind/mgr/balancer/module.py b/src/pybind/mgr/balancer/module.py index 883be130c3135..c95f88de0591d 100644 --- a/src/pybind/mgr/balancer/module.py +++ b/src/pybind/mgr/balancer/module.py @@ -4,6 +4,7 @@ """ import json +import 
random import time from mgr_module import MgrModule, CommandResult from threading import Event @@ -64,11 +65,30 @@ def do_upmap(self): self.log.info('do_upmap') max_iterations = self.get_config('upmap_max_iterations', 10) max_deviation = self.get_config('upmap_max_deviation', .01) + + osdmap_dump = self.get('osd_map') + pools = [str(i['pool_name']) for i in osdmap_dump.get('pools',[])] + if len(pools) == 0: + self.log.info('no pools, nothing to do') + return + # shuffle pool list so they all get equal (in)attention + random.shuffle(pools) + self.log.info('pools %s' % pools) + osdmap = self.get_osdmap() inc = osdmap.new_incremental() - osdmap.calc_pg_upmaps(inc, max_deviation, max_iterations, []) + total_did = 0 + left = max_iterations + for pool in pools: + did = osdmap.calc_pg_upmaps(inc, max_deviation, left, [pool]) + total_did += did + left -= did + if left <= 0: + break + self.log.info('prepared %d/%d changes' % (total_did, max_iterations)) + incdump = inc.dump() - self.log.info('inc is %s' % incdump) + self.log.debug('resulting inc is %s' % incdump) for pgid in incdump.get('old_pg_upmap_items', []): self.log.info('ceph osd rm-pg-upmap-items %s', pgid) From 826b10e02760a8aef9fa8d5af63bf4150204aaec Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 22 Jul 2017 23:50:27 -0400 Subject: [PATCH 07/19] crush/CrushWrapper: refactor get_rule_weight_osd_map to work with roots too Allow us to specify a root node in the hierarchy instead of a rule. This way we can use it in conjunction with find_takes(). 
Signed-off-by: Sage Weil --- src/crush/CrushWrapper.cc | 80 +++++++++++++++++++++++++-------------- src/crush/CrushWrapper.h | 20 +++++++++- 2 files changed, 71 insertions(+), 29 deletions(-) diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc index c01856607afdb..2b02f24131548 100644 --- a/src/crush/CrushWrapper.cc +++ b/src/crush/CrushWrapper.cc @@ -1577,7 +1577,56 @@ int CrushWrapper::add_simple_rule( rule_type, -1, err); } -int CrushWrapper::get_rule_weight_osd_map(unsigned ruleno, map *pmap) +float CrushWrapper::_get_take_weight_osd_map(int root, + map *pmap) const +{ + float sum = 0.0; + list q; + q.push_back(root); + //breadth first iterate the OSD tree + while (!q.empty()) { + int bno = q.front(); + q.pop_front(); + crush_bucket *b = crush->buckets[-1-bno]; + assert(b); + for (unsigned j=0; jsize; ++j) { + int item_id = b->items[j]; + if (item_id >= 0) { //it's an OSD + float w = crush_get_bucket_item_weight(b, j); + (*pmap)[item_id] = w; + sum += w; + } else { //not an OSD, expand the child later + q.push_back(item_id); + } + } + } + return sum; +} + +void CrushWrapper::_normalize_weight_map(float sum, + const map& m, + map *pmap) const +{ + for (auto& p : m) { + map::iterator q = pmap->find(p.first); + if (q == pmap->end()) { + (*pmap)[p.first] = p.second / sum; + } else { + q->second += p.second / sum; + } + } +} + +int CrushWrapper::get_take_weight_osd_map(int root, map *pmap) const +{ + map m; + float sum = _get_take_weight_osd_map(root, &m); + _normalize_weight_map(sum, m, pmap); + return 0; +} + +int CrushWrapper::get_rule_weight_osd_map(unsigned ruleno, + map *pmap) const { if (ruleno >= crush->max_rules) return -ENOENT; @@ -1600,35 +1649,10 @@ int CrushWrapper::get_rule_weight_osd_map(unsigned ruleno, map *pmap) m[n] = 1.0; sum = 1.0; } else { - list q; - q.push_back(n); - //breadth first iterate the OSD tree - while (!q.empty()) { - int bno = q.front(); - q.pop_front(); - crush_bucket *b = crush->buckets[-1-bno]; - assert(b); - for 
(unsigned j=0; jsize; ++j) { - int item_id = b->items[j]; - if (item_id >= 0) { //it's an OSD - float w = crush_get_bucket_item_weight(b, j); - m[item_id] = w; - sum += w; - } else { //not an OSD, expand the child later - q.push_back(item_id); - } - } - } - } - } - for (map::iterator p = m.begin(); p != m.end(); ++p) { - map::iterator q = pmap->find(p->first); - if (q == pmap->end()) { - (*pmap)[p->first] = p->second / sum; - } else { - q->second += p->second / sum; + sum += _get_take_weight_osd_map(n, &m); } } + _normalize_weight_map(sum, m, pmap); } return 0; diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h index ec6a9ea85816f..8b3fd438a30c8 100644 --- a/src/crush/CrushWrapper.h +++ b/src/crush/CrushWrapper.h @@ -1006,6 +1006,12 @@ class CrushWrapper { return s->arg2; } +private: + float _get_take_weight_osd_map(int root, map *pmap) const; + void _normalize_weight_map(float sum, const map& m, + map *pmap) const; + +public: /** * calculate a map of osds to weights for a given rule * @@ -1016,7 +1022,19 @@ class CrushWrapper { * @param pmap [out] map of osd to weight * @return 0 for success, or negative error code */ - int get_rule_weight_osd_map(unsigned ruleno, map *pmap); + int get_rule_weight_osd_map(unsigned ruleno, map *pmap) const; + + /** + * calculate a map of osds to weights for a given starting root + * + * Generate a map of which OSDs get how much relative weight for a + * given starting root + * + * @param root node + * @param pmap [out] map of osd to weight + * @return 0 for success, or negative error code + */ + int get_take_weight_osd_map(int root, map *pmap) const; /* modifiers */ From 626059d474cf66264ed173d0a97a77ba6a14a4ff Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 22 Jul 2017 23:51:47 -0400 Subject: [PATCH 08/19] crush/CrushWrapper: fix output arg for find_{takes,roots}() Signed-off-by: Sage Weil --- src/crush/CrushTreeDumper.h | 2 +- src/crush/CrushWrapper.cc | 20 ++++++++++---------- src/crush/CrushWrapper.h | 6 
+++--- src/tools/crushtool.cc | 2 +- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/crush/CrushTreeDumper.h b/src/crush/CrushTreeDumper.h index 199b3c39be3ec..8d48069fe1b7b 100644 --- a/src/crush/CrushTreeDumper.h +++ b/src/crush/CrushTreeDumper.h @@ -68,7 +68,7 @@ namespace CrushTreeDumper { explicit Dumper(const CrushWrapper *crush_, const name_map_t& weight_set_names_) : crush(crush_), weight_set_names(weight_set_names_) { - crush->find_nonshadow_roots(roots); + crush->find_nonshadow_roots(&roots); root = roots.begin(); } diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc index 2b02f24131548..428bf29c9c0ad 100644 --- a/src/crush/CrushWrapper.cc +++ b/src/crush/CrushWrapper.cc @@ -291,7 +291,7 @@ int CrushWrapper::rename_bucket(const string& srcname, return set_item_name(oldid, dstname); } -void CrushWrapper::find_takes(set& roots) const +void CrushWrapper::find_takes(set *roots) const { for (unsigned i=0; imax_rules; i++) { crush_rule *r = crush->rules[i]; @@ -299,23 +299,23 @@ void CrushWrapper::find_takes(set& roots) const continue; for (unsigned j=0; jlen; j++) { if (r->steps[j].op == CRUSH_RULE_TAKE) - roots.insert(r->steps[j].arg1); + roots->insert(r->steps[j].arg1); } } } -void CrushWrapper::find_roots(set& roots) const +void CrushWrapper::find_roots(set *roots) const { for (int i = 0; i < crush->max_buckets; i++) { if (!crush->buckets[i]) continue; crush_bucket *b = crush->buckets[i]; if (!_search_item_exists(b->id)) - roots.insert(b->id); + roots->insert(b->id); } } -void CrushWrapper::find_nonshadow_roots(set& roots) const +void CrushWrapper::find_nonshadow_roots(set *roots) const { for (int i = 0; i < crush->max_buckets; i++) { if (!crush->buckets[i]) @@ -326,7 +326,7 @@ void CrushWrapper::find_nonshadow_roots(set& roots) const const char *name = get_item_name(b->id); if (name && !is_valid_crush_name(name)) continue; - roots.insert(b->id); + roots->insert(b->id); } } @@ -1381,7 +1381,7 @@ bool 
CrushWrapper::class_is_in_use(int class_id) int CrushWrapper::populate_classes() { set roots; - find_roots(roots); + find_roots(&roots); for (auto &r : roots) { if (r >= 0) continue; @@ -1405,7 +1405,7 @@ int CrushWrapper::cleanup_classes() int CrushWrapper::trim_roots_with_class(bool unused) { set roots; - find_roots(roots); + find_roots(&roots); for (auto &r : roots) { if (r >= 0) continue; @@ -1449,7 +1449,7 @@ int32_t CrushWrapper::_alloc_class_id() const { void CrushWrapper::reweight(CephContext *cct) { set roots; - find_roots(roots); + find_roots(&roots); for (set::iterator p = roots.begin(); p != roots.end(); ++p) { if (*p >= 0) continue; @@ -2387,7 +2387,7 @@ namespace { void dump(Formatter *f) { set roots; - crush->find_roots(roots); + crush->find_roots(&roots); for (set::iterator root = roots.begin(); root != roots.end(); ++root) { dump_item(Item(*root, 0, 0, crush->get_bucket_weightf(*root)), f); } diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h index 8b3fd438a30c8..cd272d5e4134a 100644 --- a/src/crush/CrushWrapper.h +++ b/src/crush/CrushWrapper.h @@ -588,14 +588,14 @@ class CrushWrapper { * * Note that these may not be parentless roots. */ - void find_takes(set& roots) const; + void find_takes(set *roots) const; /** * find tree roots * * These are parentless nodes in the map. */ - void find_roots(set& roots) const; + void find_roots(set *roots) const; /** * find tree roots that are not shadow (device class) items @@ -603,7 +603,7 @@ class CrushWrapper { * These are parentless nodes in the map that are not shadow * items for device classes. 
*/ - void find_nonshadow_roots(set& roots) const; + void find_nonshadow_roots(set *roots) const; /** * see if an item is contained within a subtree diff --git a/src/tools/crushtool.cc b/src/tools/crushtool.cc index 2a1bc83b8f86f..c2dbaae84fce2 100644 --- a/src/tools/crushtool.cc +++ b/src/tools/crushtool.cc @@ -829,7 +829,7 @@ int main(int argc, const char **argv) { set roots; - crush.find_roots(roots); + crush.find_roots(&roots); if (roots.size() > 1) dout(1) << "The crush rulesets will use the root " << root << "\n" << "and ignore the others.\n" From 1b8335f64ee92b53afe0379592bcf5021d365a2c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 27 Jul 2017 10:07:31 -0400 Subject: [PATCH 09/19] crush/CrushWrapper: rule_has_take Signed-off-by: Sage Weil --- src/crush/CrushWrapper.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h index cd272d5e4134a..0f858d1678f15 100644 --- a/src/crush/CrushWrapper.h +++ b/src/crush/CrushWrapper.h @@ -965,6 +965,17 @@ class CrushWrapper { return true; return false; } + bool rule_has_take(unsigned ruleno, int take) const { + if (!crush) return false; + crush_rule *rule = get_rule(ruleno); + for (unsigned i = 0; i < rule->len; ++i) { + if (rule->steps[i].op == CRUSH_RULE_TAKE && + rule->steps[i].arg1 == take) { + return true; + } + } + return false; + } int get_rule_len(unsigned ruleno) const { crush_rule *r = get_rule(ruleno); if (IS_ERR(r)) return PTR_ERR(r); From 19ab2a7d7b99f6cbf2d262475740e5bd05bdf07a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 23 Jul 2017 00:10:56 -0400 Subject: [PATCH 10/19] mgr/PyOSDMap: get_crush, find_takes, get_take_weight_osd_map These let us identify distinct CRUSH hierarchies that rules distribute data over, and create relative weight maps for the OSDs they map to. 
Signed-off-by: Sage Weil --- src/mgr/PyOSDMap.cc | 212 ++++++++++++++++++++++++++++++++++- src/pybind/mgr/mgr_module.py | 46 +++++++- 2 files changed, 250 insertions(+), 8 deletions(-) diff --git a/src/mgr/PyOSDMap.cc b/src/mgr/PyOSDMap.cc index 9b1bdc4e3b79d..69bcea42a2ade 100644 --- a/src/mgr/PyOSDMap.cc +++ b/src/mgr/PyOSDMap.cc @@ -6,6 +6,7 @@ #include "osd/OSDMap.h" #include "common/errno.h" #include "common/version.h" +#include "include/stringify.h" #include "PyOSDMap.h" #include "PyFormatter.h" @@ -22,6 +23,12 @@ static PyObject *osdmap_get_epoch(PyObject *self, PyObject *obj) return PyInt_FromLong(osdmap->get_epoch()); } +static PyObject *osdmap_get_crush_version(PyObject *self, PyObject *obj) +{ + OSDMap *osdmap = static_cast(PyCapsule_GetPointer(obj, nullptr)); + return PyInt_FromLong(osdmap->get_crush_version()); +} + static PyObject *osdmap_dump(PyObject *self, PyObject *obj) { OSDMap *osdmap = static_cast(PyCapsule_GetPointer(obj, nullptr)); @@ -35,21 +42,84 @@ static void delete_osdmap_incremental(PyObject *object) { OSDMap::Incremental *inc = static_cast( PyCapsule_GetPointer(object, nullptr)); - derr << __func__ << " " << inc << dendl; + dout(10) << __func__ << " " << inc << dendl; delete inc; } static PyObject *osdmap_new_incremental(PyObject *self, PyObject *obj) { OSDMap *osdmap = static_cast(PyCapsule_GetPointer(obj, nullptr)); - - // Construct a capsule containing an OSDMap. OSDMap::Incremental *inc = new OSDMap::Incremental; inc->fsid = osdmap->get_fsid(); inc->epoch = osdmap->get_epoch() + 1; + // always include latest crush map here... this is okay since we never + // actually use this map in the real world (and even if we did it would + // be a no-op). 
+ osdmap->crush->encode(inc->crush, CEPH_FEATURES_ALL); + dout(10) << __func__ << " " << inc << dendl; return PyCapsule_New(inc, nullptr, &delete_osdmap_incremental); } +static void delete_osdmap(PyObject *object) +{ + OSDMap *osdmap = static_cast(PyCapsule_GetPointer(object, nullptr)); + assert(osdmap); + dout(10) << __func__ << " " << osdmap << dendl; + delete osdmap; +} + +static PyObject *osdmap_apply_incremental(PyObject *self, PyObject *args) +{ + PyObject *mapobj, *incobj; + if (!PyArg_ParseTuple(args, "OO:apply_incremental", + &mapobj, &incobj)) { + return nullptr; + } + OSDMap *osdmap = static_cast(PyCapsule_GetPointer(mapobj, nullptr)); + OSDMap::Incremental *inc = static_cast( + PyCapsule_GetPointer(incobj, nullptr)); + if (!osdmap || !inc) { + return nullptr; + } + + bufferlist bl; + osdmap->encode(bl, CEPH_FEATURES_ALL|CEPH_FEATURE_RESERVED); + OSDMap *next = new OSDMap; + next->decode(bl); + next->apply_incremental(*inc); + dout(10) << __func__ << " map " << osdmap << " inc " << inc + << " next " << next << dendl; + return PyCapsule_New(next, nullptr, &delete_osdmap); +} + +static PyObject *osdmap_get_crush(PyObject *self, PyObject *obj) +{ + OSDMap *osdmap = static_cast(PyCapsule_GetPointer(obj, nullptr)); + + // Construct a capsule containing a the CrushWrapper. 
+ return PyCapsule_New(osdmap->crush.get(), nullptr, nullptr); +} + +static PyObject *osdmap_get_pools_by_take(PyObject *self, PyObject *args) +{ + PyObject *mapobj; + int take; + if (!PyArg_ParseTuple(args, "Oi:get_pools_by_take", + &mapobj, &take)) { + return nullptr; + } + OSDMap *osdmap = static_cast(PyCapsule_GetPointer(mapobj, nullptr)); + PyFormatter f; + f.open_array_section("pools"); + for (auto& p : osdmap->get_pools()) { + if (osdmap->crush->rule_has_take(p.second.crush_rule, take)) { + f.dump_int("pool", p.first); + } + } + f.close_section(); + return f.get(); +} + static PyObject *osdmap_calc_pg_upmaps(PyObject *self, PyObject *args) { PyObject *mapobj, *incobj, *pool_list; @@ -82,9 +152,15 @@ static PyObject *osdmap_calc_pg_upmaps(PyObject *self, PyObject *args) PyMethodDef OSDMapMethods[] = { {"get_epoch", osdmap_get_epoch, METH_O, "Get OSDMap epoch"}, + {"get_crush_version", osdmap_get_crush_version, METH_O, "Get CRUSH version"}, {"dump", osdmap_dump, METH_O, "Dump OSDMap::Incremental"}, {"new_incremental", osdmap_new_incremental, METH_O, "Create OSDMap::Incremental"}, + {"apply_incremental", osdmap_apply_incremental, METH_VARARGS, + "Apply OSDMap::Incremental and return the resulting OSDMap"}, + {"get_crush", osdmap_get_crush, METH_O, "Get CrushWrapper"}, + {"get_pools_by_take", osdmap_get_pools_by_take, METH_VARARGS, + "Get pools that have CRUSH rules that TAKE the given root"}, {"calc_pg_upmaps", osdmap_calc_pg_upmaps, METH_VARARGS, "Calculate new pg-upmap values"}, {NULL, NULL, 0, NULL} @@ -108,18 +184,146 @@ static PyObject *osdmap_inc_dump(PyObject *self, PyObject *obj) return f.get(); } +static int get_int_float_map(PyObject *obj, map *out) +{ + PyObject *ls = PyDict_Items(obj); + for (int j = 0; j < PyList_Size(ls); ++j) { + PyObject *pair = PyList_GET_ITEM(ls, j); + if (!PyTuple_Check(pair)) { + derr << __func__ << " item " << j << " not a tuple" << dendl; + return -1; + } + int k; + double v; + if (!PyArg_ParseTuple(pair, "id:pair", &k, 
&v)) { + derr << __func__ << " item " << j << " not a size 2 tuple" << dendl; + return -1; + } + (*out)[k] = v; + } + return 0; +} + +static PyObject *osdmap_inc_set_osd_reweights(PyObject *self, PyObject *args) +{ + PyObject *incobj, *weightobj; + if (!PyArg_ParseTuple(args, "OO:set_osd_reweights", + &incobj, &weightobj)) { + return nullptr; + } + OSDMap::Incremental *inc = static_cast( + PyCapsule_GetPointer(incobj, nullptr)); + map wm; + if (get_int_float_map(weightobj, &wm) < 0) { + return nullptr; + } + + for (auto i : wm) { + inc->new_weight[i.first] = std::max(0.0, std::min(1.0, i.second)) * 0x10000; + } + Py_RETURN_NONE; +} + +static PyObject *osdmap_inc_set_compat_weight_set_weights( + PyObject *self, PyObject *args) +{ + PyObject *incobj, *weightobj; + if (!PyArg_ParseTuple(args, "OO:set_compat_weight_set_weights", + &incobj, &weightobj)) { + return nullptr; + } + OSDMap::Incremental *inc = static_cast( + PyCapsule_GetPointer(incobj, nullptr)); + map wm; + if (get_int_float_map(weightobj, &wm) < 0) { + return nullptr; + } + + CrushWrapper crush; + assert(inc->crush.length()); // see new_incremental + auto p = inc->crush.begin(); + ::decode(crush, p); + crush.create_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS, 1); + for (auto i : wm) { + crush.choose_args_adjust_item_weightf( + g_ceph_context, + crush.choose_args_get(CrushWrapper::DEFAULT_CHOOSE_ARGS), + i.first, + { i.second }, + nullptr); + } + inc->crush.clear(); + crush.encode(inc->crush, CEPH_FEATURES_ALL); + Py_RETURN_NONE; +} + + + PyMethodDef OSDMapIncrementalMethods[] = { {"get_epoch", osdmap_inc_get_epoch, METH_O, "Get OSDMap::Incremental epoch"}, {"dump", osdmap_inc_dump, METH_O, "Dump OSDMap::Incremental"}, + {"set_osd_reweights", osdmap_inc_set_osd_reweights, METH_VARARGS, + "Set osd reweight values"}, + {"set_crush_compat_weight_set_weights", + osdmap_inc_set_compat_weight_set_weights, METH_VARARGS, + "Set weight values in the pending CRUSH compat weight-set"}, {NULL, NULL, 0, NULL} }; // 
---------- +static PyObject *crush_dump(PyObject *self, PyObject *obj) +{ + CrushWrapper *crush = static_cast( + PyCapsule_GetPointer(obj, nullptr)); + PyFormatter f; + crush->dump(&f); + return f.get(); +} + +static PyObject *crush_find_takes(PyObject *self, PyObject *obj) +{ + CrushWrapper *crush = static_cast( + PyCapsule_GetPointer(obj, nullptr)); + set takes; + crush->find_takes(&takes); + PyFormatter f; + f.open_array_section("takes"); + for (auto root : takes) { + f.dump_int("root", root); + } + f.close_section(); + return f.get(); +} + +static PyObject *crush_get_take_weight_osd_map(PyObject *self, PyObject *args) +{ + PyObject *obj; + int root; + if (!PyArg_ParseTuple(args, "Oi:get_take_weight_osd_map", + &obj, &root)) { + return nullptr; + } + CrushWrapper *crush = static_cast( + PyCapsule_GetPointer(obj, nullptr)); + map wmap; + crush->get_take_weight_osd_map(root, &wmap); + PyFormatter f; + f.open_object_section("weights"); + for (auto& p : wmap) { + string n = stringify(p.first); // ick + f.dump_float(n.c_str(), p.second); + } + f.close_section(); + return f.get(); +} PyMethodDef CRUSHMapMethods[] = { -// {"get_epoch", osdmap_get_epoch, METH_O, "Get OSDMap epoch"}, + {"dump", crush_dump, METH_O, "Dump map"}, + {"find_takes", crush_find_takes, METH_O, "Find distinct TAKE roots"}, + {"get_take_weight_osd_map", crush_get_take_weight_osd_map, METH_VARARGS, + "Get OSD weight map for a given TAKE root node"}, {NULL, NULL, 0, NULL} }; diff --git a/src/pybind/mgr/mgr_module.py b/src/pybind/mgr/mgr_module.py index 346bc6e26dd0f..5dbb6c1f01dea 100644 --- a/src/pybind/mgr/mgr_module.py +++ b/src/pybind/mgr/mgr_module.py @@ -40,13 +40,26 @@ def __init__(self, handle): def get_epoch(self): return ceph_osdmap.get_epoch(self._handle) + def get_crush_version(self): + return ceph_osdmap.get_crush_version(self._handle) + def dump(self): return ceph_osdmap.dump(self._handle) def new_incremental(self): return OSDMapIncremental(ceph_osdmap.new_incremental(self._handle)) - 
def calc_pg_upmaps(self, inc, max_deviation=.01, max_iterations=10, pools=[]): + def apply_incremental(self, inc): + return OSDMap(ceph_osdmap.apply_incremental(self._handle, inc._handle)) + + def get_crush(self): + return CRUSHMap(ceph_osdmap.get_crush(self._handle), self) + + def get_pools_by_take(self, take): + return ceph_osdmap.get_pools_by_take(self._handle, take).get('pools', []) + + def calc_pg_upmaps(self, inc, + max_deviation=.01, max_iterations=10, pools=[]): return ceph_osdmap.calc_pg_upmaps( self._handle, inc._handle, @@ -63,12 +76,37 @@ def get_epoch(self): def dump(self): return ceph_osdmap_incremental.dump(self._handle) + def set_osd_reweights(self, weightmap): + """ + weightmap is a dict, int to float. e.g. { 0: .9, 1: 1.0, 3: .997 } + """ + return ceph_osdmap_incremental.set_osd_reweights(self._handle, weightmap) + + def set_crush_compat_weight_set_weights(self, weightmap): + """ + weightmap is a dict, int to float. devices only. e.g., + { 0: 3.4, 1: 3.3, 2: 3.334 } + """ + return ceph_osdmap_incremental.set_crush_compat_weight_set_weights( + self._handle, weightmap) + + + class CRUSHMap(object): - def __init__(self, handle): + def __init__(self, handle, parent_osdmap): self._handle = handle + # keep ref to parent osdmap since handle lifecycle is owned by it + self._parent_osdmap = parent_osdmap + + def dump(self): + return ceph_crushmap.dump(self._handle) + + def find_takes(self): + return ceph_crushmap.find_takes(self._handle).get('takes',[]) -# def get_epoch(self): -# return ceph_crushmap.get_epoch(self._handle) + def get_take_weight_osd_map(self, root): + uglymap = ceph_crushmap.get_take_weight_osd_map(self._handle, root) + return { int(k): v for k, v in uglymap.get('weights', {}).iteritems() } class MgrModule(object): From 2905b7b69a80dfbd9855c5afc2906ea0accd5c7c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 27 Jul 2017 23:33:06 -0400 Subject: [PATCH 11/19] mgr/PyOSDMap: OSDMap.map_pool_pgs_up, CRUSHMap.get_item_name Signed-off-by: Sage 
Weil --- src/mgr/PyOSDMap.cc | 50 ++++++++++++++++++++++++++++++++++++ src/pybind/mgr/mgr_module.py | 5 ++++ 2 files changed, 55 insertions(+) diff --git a/src/mgr/PyOSDMap.cc b/src/mgr/PyOSDMap.cc index 69bcea42a2ade..e630cadef3779 100644 --- a/src/mgr/PyOSDMap.cc +++ b/src/mgr/PyOSDMap.cc @@ -150,6 +150,37 @@ static PyObject *osdmap_calc_pg_upmaps(PyObject *self, PyObject *args) return PyInt_FromLong(r); } +static PyObject *osdmap_map_pool_pgs_up(PyObject *self, PyObject *args) +{ + PyObject *mapobj; + int poolid; + if (!PyArg_ParseTuple(args, "Oi:map_pool_pgs_up", + &mapobj, &poolid)) { + return nullptr; + } + OSDMap *osdmap = static_cast(PyCapsule_GetPointer(mapobj, nullptr)); + if (!osdmap) + return nullptr; + auto pi = osdmap->get_pg_pool(poolid); + if (!pi) + return nullptr; + map> pm; + for (unsigned ps = 0; ps < pi->get_pg_num(); ++ps) { + pg_t pgid(ps, poolid); + osdmap->pg_to_up_acting_osds(pgid, &pm[pgid], nullptr, nullptr, nullptr); + } + PyFormatter f; + for (auto p : pm) { + string pg = stringify(p.first); + f.open_array_section(pg.c_str()); + for (auto o : p.second) { + f.dump_int("osd", o); + } + f.close_section(); + } + return f.get(); +} + PyMethodDef OSDMapMethods[] = { {"get_epoch", osdmap_get_epoch, METH_O, "Get OSDMap epoch"}, {"get_crush_version", osdmap_get_crush_version, METH_O, "Get CRUSH version"}, @@ -163,6 +194,8 @@ PyMethodDef OSDMapMethods[] = { "Get pools that have CRUSH rules that TAKE the given root"}, {"calc_pg_upmaps", osdmap_calc_pg_upmaps, METH_VARARGS, "Calculate new pg-upmap values"}, + {"map_pool_pgs_up", osdmap_map_pool_pgs_up, METH_VARARGS, + "Calculate up set mappings for all PGs in a pool"}, {NULL, NULL, 0, NULL} }; @@ -282,6 +315,22 @@ static PyObject *crush_dump(PyObject *self, PyObject *obj) return f.get(); } +static PyObject *crush_get_item_name(PyObject *self, PyObject *args) +{ + PyObject *obj; + int item; + if (!PyArg_ParseTuple(args, "Oi:get_item_name", + &obj, &item)) { + return nullptr; + } + CrushWrapper 
*crush = static_cast( + PyCapsule_GetPointer(obj, nullptr)); + if (!crush->item_exists(item)) { + Py_RETURN_NONE; + } + return PyString_FromString(crush->get_item_name(item)); +} + static PyObject *crush_find_takes(PyObject *self, PyObject *obj) { CrushWrapper *crush = static_cast( @@ -322,6 +371,7 @@ static PyObject *crush_get_take_weight_osd_map(PyObject *self, PyObject *args) PyMethodDef CRUSHMapMethods[] = { {"dump", crush_dump, METH_O, "Dump map"}, + {"get_item_name", crush_get_item_name, METH_VARARGS, "Get item name"}, {"find_takes", crush_find_takes, METH_O, "Find distinct TAKE roots"}, {"get_take_weight_osd_map", crush_get_take_weight_osd_map, METH_VARARGS, "Get OSD weight map for a given TAKE root node"}, diff --git a/src/pybind/mgr/mgr_module.py b/src/pybind/mgr/mgr_module.py index 5dbb6c1f01dea..4e3554e1f26d3 100644 --- a/src/pybind/mgr/mgr_module.py +++ b/src/pybind/mgr/mgr_module.py @@ -65,6 +65,8 @@ def calc_pg_upmaps(self, inc, inc._handle, max_deviation, max_iterations, pools) + def map_pool_pgs_up(self, poolid): + return ceph_osdmap.map_pool_pgs_up(self._handle, poolid) class OSDMapIncremental(object): def __init__(self, handle): @@ -101,6 +103,9 @@ def __init__(self, handle, parent_osdmap): def dump(self): return ceph_crushmap.dump(self._handle) + def get_item_name(self, item): + return ceph_crushmap.get_item_name(self._handle, item) + def find_takes(self): return ceph_crushmap.find_takes(self._handle).get('takes',[]) From eb193894f59ce406a881e35a26f8d72197c7add5 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Aug 2017 16:23:08 -0400 Subject: [PATCH 12/19] pybind/mgr/balancer: rough framework Signed-off-by: Sage Weil --- src/pybind/mgr/balancer/module.py | 757 ++++++++++++++++++++++++++++-- src/vstart.sh | 2 +- 2 files changed, 709 insertions(+), 50 deletions(-) diff --git a/src/pybind/mgr/balancer/module.py b/src/pybind/mgr/balancer/module.py index c95f88de0591d..4c270b3280ade 100644 --- a/src/pybind/mgr/balancer/module.py +++ 
b/src/pybind/mgr/balancer/module.py @@ -3,27 +3,268 @@ Balance PG distribution across OSDs. """ +import errno import json +import math import random import time from mgr_module import MgrModule, CommandResult from threading import Event # available modes: 'none', 'crush', 'crush-compat', 'upmap', 'osd_weight' -default_mode = 'upmap' -default_sleep_interval = 30 # seconds +default_mode = 'none' +default_sleep_interval = 60 # seconds default_max_misplaced = .03 # max ratio of pgs replaced at a time +TIME_FORMAT = '%Y-%m-%d %H:%M:%S %Z' + + +class MappingState: + def __init__(self, osdmap, pg_dump, desc=''): + self.desc = desc + self.osdmap = osdmap + self.osdmap_dump = self.osdmap.dump() + self.crush = osdmap.get_crush() + self.crush_dump = self.crush.dump() + self.pg_dump = pg_dump + self.pg_stat = { + i['pgid']: i['stat_sum'] for i in pg_dump.get('pg_stats', []) + } + + +class Plan: + def __init__(self, name, ms): + self.mode = 'unknown' + self.name = name + self.initial = ms + + self.osd_weights = {} + self.compat_ws = {} + self.inc = ms.osdmap.new_incremental() + + def final_state(self): + self.inc.set_osd_reweights(self.osd_weights) + self.inc.set_crush_compat_weight_set_weights(self.compat_ws) + return MappingState(self.initial.osdmap.apply_incremental(self.inc), + self.initial.pg_dump, + 'plan %s final' % self.name) + + def dump(self): + return json.dumps(self.inc.dump(), indent=4) + + def show(self): + ls = [] + ls.append('# starting osdmap epoch %d' % self.initial.osdmap.get_epoch()) + ls.append('# starting crush version %d' % + self.initial.osdmap.get_crush_version()) + ls.append('# mode %s' % self.mode) + if len(self.compat_ws) and \ + '-1' in self.crush_dump.get('choose_args', {}): + ls.append('ceph osd crush weight-set create-compat') + for osd, weight in self.compat_ws.iteritems(): + ls.append('ceph osd crush weight-set reweight-compat %s %f' % + (osd, weight)) + for osd, weight in self.osd_weights.iteritems(): + ls.append('ceph osd reweight osd.%d %f' 
% (osd, weight)) + incdump = self.inc.dump() + for pgid in incdump.get('old_pg_upmap_items', []): + ls.append('ceph osd rm-pg-upmap-items %s' % pgid) + for item in incdump.get('new_pg_upmap_items', []): + osdlist = [] + for m in item['mappings']: + osdlist += [m['from'], m['to']] + ls.append('ceph osd pg-upmap-items %s %s' % + (item['pgid'], ' '.join([str(a) for a in osdlist]))) + return '\n'.join(ls) + + +class Eval: + pool_name = {} # pool id -> pool name + pool_id = {} # pool name -> id + pool_roots = {} # pool name -> root name + target_by_root = {} # root name -> target weight map + count_by_pool = {} + count_by_root = {} + actual_by_pool = {} # pool -> by_* -> actual weight map + actual_by_root = {} # pool -> by_* -> actual weight map + total_by_pool = {} # pool -> by_* -> total + total_by_root = {} # root -> by_* -> total + stats_by_pool = {} # pool -> by_* -> stddev or avg -> value + stats_by_root = {} # root -> by_* -> stddev or avg -> value + + score_by_pool = {} + score_by_root = {} + + score = 0.0 + + def __init__(self, ms): + self.ms = ms + + def show(self): + r = self.ms.desc + '\n' + r += 'target_by_root %s\n' % self.target_by_root + r += 'actual_by_pool %s\n' % self.actual_by_pool + r += 'actual_by_root %s\n' % self.actual_by_root + r += 'count_by_pool %s\n' % self.count_by_pool + r += 'count_by_root %s\n' % self.count_by_root + r += 'total_by_pool %s\n' % self.total_by_pool + r += 'total_by_root %s\n' % self.total_by_root + r += 'stats_by_root %s\n' % self.stats_by_root + r += 'score_by_pool %s\n' % self.score_by_pool + r += 'score_by_root %s\n' % self.score_by_root + r += 'score %f (lower is better)\n' % self.score + return r + + def calc_stats(self, count, target, total): + num = max(len(target), 1) + r = {} + for t in ('pgs', 'objects', 'bytes'): + avg = float(total[t]) / float(num) + dev = 0.0 + for k, v in count[t].iteritems(): + # adjust/normalize by weight + adjusted = float(v) / target[k] / float(num) + dev += (avg - adjusted) * (avg - 
adjusted) + stddev = math.sqrt(dev / float(max(num - 1, 1))) + r[t] = { + 'avg': avg, + 'stddev': stddev, + } + return r + class Module(MgrModule): + COMMANDS = [ + { + "cmd": "balancer status", + "desc": "Show balancer status", + "perm": "r", + }, + { + "cmd": "balancer mode name=mode,type=CephChoices,strings=none|crush-compat|upmap", + "desc": "Set balancer mode", + "perm": "rw", + }, + { + "cmd": "balancer on", + "desc": "Enable automatic balancing", + "perm": "rw", + }, + { + "cmd": "balancer off", + "desc": "Disable automatic balancing", + "perm": "rw", + }, + { + "cmd": "balancer eval name=plan,type=CephString,req=false", + "desc": "Evaluate data distribution for the current cluster or specific plan", + "perm": "r", + }, + { + "cmd": "balancer optimize name=plan,type=CephString", + "desc": "Run optimizer to create a new plan", + "perm": "rw", + }, + { + "cmd": "balancer show name=plan,type=CephString", + "desc": "Show details of an optimization plan", + "perm": "r", + }, + { + "cmd": "balancer rm name=plan,type=CephString", + "desc": "Discard an optimization plan", + "perm": "rw", + }, + { + "cmd": "balancer reset", + "desc": "Discard all optimization plans", + "perm": "rw", + }, + { + "cmd": "balancer dump name=plan,type=CephString", + "desc": "Show an optimization plan", + "perm": "r", + }, + { + "cmd": "balancer execute name=plan,type=CephString", + "desc": "Execute an optimization plan", + "perm": "r", + }, + ] + active = False run = True + plans = {} + mode = '' def __init__(self, *args, **kwargs): super(Module, self).__init__(*args, **kwargs) self.event = Event() def handle_command(self, command): - return (-errno.EINVAL, '', - "Command not found '{0}'".format(command['prefix'])) + self.log.warn("Handling command: '%s'" % str(command)) + if command['prefix'] == 'balancer status': + s = { + 'plans': self.plans.keys(), + 'active': self.active, + 'mode': self.get_config('mode', default_mode), + } + return (0, json.dumps(s, indent=4), '') + elif 
command['prefix'] == 'balancer mode': + self.set_config('mode', command['mode']) + return (0, '', '') + elif command['prefix'] == 'balancer on': + if not self.active: + self.set_config('active', '1') + self.active = True + self.event.set() + return (0, '', '') + elif command['prefix'] == 'balancer off': + if self.active: + self.set_config('active', '') + self.active = False + self.event.set() + return (0, '', '') + elif command['prefix'] == 'balancer eval': + if 'plan' in command: + plan = self.plans.get(command['plan']) + if not plan: + return (-errno.ENOENT, '', 'plan %s not found' % + command['plan']) + ms = plan.final_state() + else: + ms = MappingState(self.get_osdmap(), + self.get("pg_dump"), + 'current cluster') + return (0, self.evaluate(ms), '') + elif command['prefix'] == 'balancer optimize': + plan = self.plan_create(command['plan']) + self.optimize(plan) + return (0, '', '') + elif command['prefix'] == 'balancer rm': + self.plan_rm(command['name']) + return (0, '', '') + elif command['prefix'] == 'balancer reset': + self.plans = {} + return (0, '', '') + elif command['prefix'] == 'balancer dump': + plan = self.plans.get(command['plan']) + if not plan: + return (-errno.ENOENT, '', 'plan %s not found' % command['plan']) + return (0, plan.dump(), '') + elif command['prefix'] == 'balancer show': + plan = self.plans.get(command['plan']) + if not plan: + return (-errno.ENOENT, '', 'plan %s not found' % command['plan']) + return (0, plan.show(), '') + elif command['prefix'] == 'balancer execute': + plan = self.plans.get(command['plan']) + if not plan: + return (-errno.ENOENT, '', 'plan %s not found' % command['plan']) + self.execute(plan) + self.plan_rm(plan) + return (0, '', '') + else: + return (-errno.EINVAL, '', + "Command not found '{0}'".format(command['prefix'])) def shutdown(self): self.log.info('Stopping') @@ -33,41 +274,252 @@ def shutdown(self): def serve(self): self.log.info('Starting') while self.run: - mode = self.get_config('mode', default_mode) 
+ self.log.debug('Waking up') + self.active = self.get_config('active', '') is not '' sleep_interval = float(self.get_config('sleep_interval', default_sleep_interval)) - max_misplaced = float(self.get_config('max_misplaced', - default_max_misplaced)) - self.log.info('Mode %s, sleep interval %f, max misplaced %f' % - (mode, sleep_interval, max_misplaced)) - - info = self.get('pg_status') - if info.get('unknown_ratio', 0.0) + info.get('degraded_ratio', 0.0) + info.get('inactive_ratio', 0.0) > 0.0: - self.log.info('Some PGs are unknown, degraded, or inactive; waiting') - elif info.get('misplaced_ratio', 0.0) >= max_misplaced: - self.log.info('Too many PGs (%f > max %f) are misplaced; waiting' % (info.get('misplaced_ratio', 0.0), max_misplaced)) - else: - if mode == 'upmap': - self.do_upmap() - elif mode == 'crush': - self.do_crush(compat=False) - elif mode == 'crush-compat': - self.do_crush(compat=True) - elif mode == 'osd_weight': - self.osd_weight() - elif mode == 'none': - self.log.info('Idle') - else: - self.log.info('Unrecognized mode %s' % mode) + if self.active: + self.log.debug('Running') + plan = self.plan_create('auto-foo') + self.optimize(plan) + #self.plan_apply(plan) + self.log.debug('Sleeping for %d', sleep_interval) self.event.wait(sleep_interval) + self.event.clear() + + def plan_create(self, name): + plan = Plan(name, MappingState(self.get_osdmap(), + self.get("pg_dump"), + 'plan %s initial' % name)) + self.plans[name] = plan + return plan + + def plan_rm(self, name): + if name in self.plans: + del self.plans[name] + + def calc_eval(self, ms): + pe = Eval(ms) + pool_rule = {} + for p in ms.osdmap_dump.get('pools',[]): + pe.pool_name[p['pool']] = p['pool_name'] + pe.pool_id[p['pool_name']] = p['pool'] + pool_rule[p['pool_name']] = p['crush_rule'] + pe.pool_roots[p['pool_name']] = [] + pools = pe.pool_id.keys() + if len(pools) == 0: + return pe + self.log.debug('pool_name %s' % pe.pool_name) + self.log.debug('pool_id %s' % pe.pool_id) + 
self.log.debug('pools %s' % pools) + self.log.debug('pool_rule %s' % pool_rule) + + # get expected distributions by root + actual_by_root = {} + roots = ms.crush.find_takes() + for root in roots: + rname = ms.crush.get_item_name(root) + ls = ms.osdmap.get_pools_by_take(root) + for poolid in ls: + pe.pool_roots[pe.pool_name[poolid]].append(rname) + pe.target_by_root[rname] = ms.crush.get_take_weight_osd_map(root) + actual_by_root[rname] = { + 'pgs': {}, + 'objects': {}, + 'bytes': {}, + } + for osd in pe.target_by_root[rname].iterkeys(): + actual_by_root[rname]['pgs'][osd] = 0 + actual_by_root[rname]['objects'][osd] = 0 + actual_by_root[rname]['bytes'][osd] = 0 + pe.total_by_root[rname] = { + 'pgs': 0, + 'objects': 0, + 'bytes': 0, + } + self.log.debug('pool_roots %s' % pe.pool_roots) + self.log.debug('target_by_root %s' % pe.target_by_root) + + # pool and root actual + for pool in pools: + pi = [p for p in ms.osdmap_dump.get('pools',[]) + if p['pool_name'] == pool][0] + poolid = pi['pool'] + pm = ms.osdmap.map_pool_pgs_up(poolid) + pgs = 0 + objects = 0 + bytes = 0 + pgs_by_osd = {} + objects_by_osd = {} + bytes_by_osd = {} + for root in pe.pool_roots[pool]: + for osd in pe.target_by_root[root].iterkeys(): + pgs_by_osd[osd] = 0 + objects_by_osd[osd] = 0 + bytes_by_osd[osd] = 0 + for pgid, up in pm.iteritems(): + for osd in [int(osd) for osd in up]: + pgs_by_osd[osd] += 1 + objects_by_osd[osd] += ms.pg_stat[pgid]['num_objects'] + bytes_by_osd[osd] += ms.pg_stat[pgid]['num_bytes'] + # pick a root to associate this pg instance with. + # note that this is imprecise if the roots have + # overlapping children. + # FIXME: divide bytes by k for EC pools. 
+ for root in pe.pool_roots[pool]: + if osd in pe.target_by_root[root]: + actual_by_root[root]['pgs'][osd] += 1 + actual_by_root[root]['objects'][osd] += ms.pg_stat[pgid]['num_objects'] + actual_by_root[root]['bytes'][osd] += ms.pg_stat[pgid]['num_bytes'] + pgs += 1 + objects += ms.pg_stat[pgid]['num_objects'] + bytes += ms.pg_stat[pgid]['num_bytes'] + pe.total_by_root[root]['pgs'] += 1 + pe.total_by_root[root]['objects'] += ms.pg_stat[pgid]['num_objects'] + pe.total_by_root[root]['bytes'] += ms.pg_stat[pgid]['num_bytes'] + break + pe.count_by_pool[pool] = { + 'pgs': { + k: v + for k, v in pgs_by_osd.iteritems() + }, + 'objects': { + k: v + for k, v in objects_by_osd.iteritems() + }, + 'bytes': { + k: v + for k, v in bytes_by_osd.iteritems() + }, + } + pe.actual_by_pool[pool] = { + 'pgs': { + k: float(v) / float(max(pgs, 1)) + for k, v in pgs_by_osd.iteritems() + }, + 'objects': { + k: float(v) / float(max(objects, 1)) + for k, v in objects_by_osd.iteritems() + }, + 'bytes': { + k: float(v) / float(max(bytes, 1)) + for k, v in bytes_by_osd.iteritems() + }, + } + pe.total_by_pool[pool] = { + 'pgs': pgs, + 'objects': objects, + 'bytes': bytes, + } + for root, m in pe.total_by_root.iteritems(): + pe.count_by_root[root] = { + 'pgs': { + k: float(v) + for k, v in actual_by_root[root]['pgs'].iteritems() + }, + 'objects': { + k: float(v) + for k, v in actual_by_root[root]['objects'].iteritems() + }, + 'bytes': { + k: float(v) + for k, v in actual_by_root[root]['bytes'].iteritems() + }, + } + pe.actual_by_root[root] = { + 'pgs': { + k: float(v) / float(max(pe.total_by_root[root]['pgs'], 1)) + for k, v in actual_by_root[root]['pgs'].iteritems() + }, + 'objects': { + k: float(v) / float(max(pe.total_by_root[root]['objects'], 1)) + for k, v in actual_by_root[root]['objects'].iteritems() + }, + 'bytes': { + k: float(v) / float(max(pe.total_by_root[root]['bytes'], 1)) + for k, v in actual_by_root[root]['bytes'].iteritems() + }, + } + self.log.debug('actual_by_pool %s' % 
pe.actual_by_pool) + self.log.debug('actual_by_root %s' % pe.actual_by_root) + + # average and stddev + pe.stats_by_root = { + a: pe.calc_stats( + b, + pe.target_by_root[a], + pe.total_by_root[a] + ) for a, b in pe.count_by_root.iteritems() + } + + # aggregate score (normalize the stddev by count) + pe.score_by_root = { + r: { + 'pgs': pe.stats_by_root[r]['pgs']['stddev'] / max(1, pe.total_by_root[r]['pgs']), + 'objects': pe.stats_by_root[r]['objects']['stddev'] / max(1, pe.total_by_root[r]['objects']), + 'bytes': pe.stats_by_root[r]['bytes']['stddev'] / max(1, pe.total_by_root[r]['bytes']), + } for r in pe.total_by_root.keys() + } + + # total score is just average of normalized stddevs + pe.score = 0.0 + for r, vs in pe.score_by_root.iteritems(): + for k, v in vs.iteritems(): + pe.score += v + pe.score /= 3 * len(roots) + return pe - def do_upmap(self): + def evaluate(self, ms): + pe = self.calc_eval(ms) + return pe.show() + + def optimize(self, plan): + self.log.info('Optimize plan %s' % plan.name) + plan.mode = self.get_config('mode', default_mode) + max_misplaced = float(self.get_config('max_misplaced', + default_max_misplaced)) + self.log.info('Mode %s, max misplaced %f' % + (plan.mode, max_misplaced)) + + info = self.get('pg_status') + unknown = info.get('unknown_pgs_ratio', 0.0) + degraded = info.get('degraded_ratio', 0.0) + inactive = info.get('inactive_pgs_ratio', 0.0) + misplaced = info.get('misplaced_ratio', 0.0) + self.log.debug('unknown %f degraded %f inactive %f misplaced %g', + unknown, degraded, inactive, misplaced) + if unknown > 0.0: + self.log.info('Some PGs (%f) are unknown; waiting', unknown) + elif degraded > 0.0: + self.log.info('Some PGs (%f) are degraded; waiting', degraded) + elif inactive > 0.0: + self.log.info('Some PGs (%f) are inactive; waiting', inactive) + elif misplaced >= max_misplaced: + self.log.info('Too many PGs (%f > %f) are misplaced; waiting', + misplaced, max_misplaced) + else: + if plan.mode == 'upmap': + 
self.do_upmap(plan) + elif plan.mode == 'crush': + self.do_crush() + elif plan.mode == 'crush-compat': + self.do_crush_compat() + elif plan.mode == 'osd_weight': + self.osd_weight() + elif plan.mode == 'none': + self.log.info('Idle') + else: + self.log.info('Unrecognized mode %s' % plan.mode) + + ## + + def do_upmap(self, plan): self.log.info('do_upmap') max_iterations = self.get_config('upmap_max_iterations', 10) max_deviation = self.get_config('upmap_max_deviation', .01) - osdmap_dump = self.get('osd_map') - pools = [str(i['pool_name']) for i in osdmap_dump.get('pools',[])] + ms = plan.initial + pools = [str(i['pool_name']) for i in ms.osdmap_dump.get('pools',[])] if len(pools) == 0: self.log.info('no pools, nothing to do') return @@ -75,21 +527,234 @@ def do_upmap(self): random.shuffle(pools) self.log.info('pools %s' % pools) - osdmap = self.get_osdmap() - inc = osdmap.new_incremental() + inc = plan.inc total_did = 0 left = max_iterations for pool in pools: - did = osdmap.calc_pg_upmaps(inc, max_deviation, left, [pool]) + did = ms.osdmap.calc_pg_upmaps(inc, max_deviation, left, [pool]) total_did += did left -= did if left <= 0: break self.log.info('prepared %d/%d changes' % (total_did, max_iterations)) - incdump = inc.dump() - self.log.debug('resulting inc is %s' % incdump) + def do_crush_compat(self): + self.log.info('do_crush_compat') + osdmap = self.get_osdmap() + crush = osdmap.get_crush() + + weight_set = self.get_compat_weight_set_weights() + + # get subtree weight maps, check for overlap + roots = crush.find_takes() + self.log.debug('roots %s', roots) + weight_maps = {} + visited = {} + overlap = {} + for root in roots: + weight_maps[root] = crush.get_take_weight_osd_map(root) + self.log.debug(' map for %d: %s' % (root, weight_maps[root])) + for osd in weight_maps[root].iterkeys(): + if osd in visited: + overlap[osd] = 1 + visited[osd] = 1 + if len(overlap) > 0: + self.log.err('error: some osds belong to multiple subtrees: %s' % + overlap) + return + + # 
select a cost mode: pg, pgsize, or utilization + # FIXME: when we add utilization support, we need to throttle back + # so that we don't run if *any* objects are misplaced. with 'pgs' we + # can look at up mappings, which lets us look ahead a bit. + cost_mode = 'pg' + # go + random.shuffle(roots) + for root in roots: + pools = osdmap.get_pools_by_take(root) + self.log.info('Balancing root %s pools %s' % (root, pools)) + wm = weight_maps[root] + util = self.get_util(cost_mode, pools, wm.keys()) + self.log.info('wm %s util %s' % (wm, util)) + if len(util) == 0: + self.log.info('no utilization information, stopping') + return + target = self.get_target(util) + self.log.info('target %s' % target) + if target == 0: + continue + + queue = sorted(util, key=lambda osd: -abs(target - util[osd])) + self.log.info('queue %s' % queue) + + for osd in queue: + deviation = float(util[osd]) - float(target) + if deviation == 0: + break + self.log.debug('osd.%d deviation %f', osd, deviation) + weight = weight_set[osd] + calc_weight = (float(target) / float(util[osd])) * weight + new_weight = weight * .7 + calc_weight * .3 + self.log.debug('Reweight osd.%d %f -> %f', osd, weight, + new_weight) + self.compat_weight_set_reweight(osd, new_weight) + + def compat_weight_set_reweight(self, osd, new_weight): + self.log.debug('ceph osd crush weight-set reweight-compat') + result = CommandResult('') + self.send_command(result, 'mon', '', json.dumps({ + 'prefix': 'osd crush weight-set reweight-compat', + 'format': 'json', + 'item': 'osd.%d' % osd, + 'weight': [new_weight], + }), '') + r, outb, outs = result.wait() + if r != 0: + self.log.error('Error setting compat weight-set osd.%d to %f' % + (osd, new_weight)) + return + + def get_compat_weight_set_weights(self): + # enable compat weight-set + self.log.debug('ceph osd crush weight-set create-compat') + result = CommandResult('') + self.send_command(result, 'mon', '', json.dumps({ + 'prefix': 'osd crush weight-set create-compat', + 'format': 
'json', + }), '') + r, outb, outs = result.wait() + if r != 0: + self.log.error('Error creating compat weight-set') + return + + result = CommandResult('') + self.send_command(result, 'mon', '', json.dumps({ + 'prefix': 'osd crush dump', + 'format': 'json', + }), '') + r, outb, outs = result.wait() + if r != 0: + self.log.error('Error dumping crush map') + return + try: + crushmap = json.loads(outb) + except: + raise RuntimeError('unable to parse crush map') + + raw = crushmap.get('choose_args',{}).get('-1', []) + weight_set = {} + for b in raw: + bucket = None + for t in crushmap['buckets']: + if t['id'] == b['bucket_id']: + bucket = t + break + if not bucket: + raise RuntimeError('could not find bucket %s' % b['bucket_id']) + self.log.debug('bucket items %s' % bucket['items']) + self.log.debug('weight set %s' % b['weight_set'][0]) + if len(bucket['items']) != len(b['weight_set'][0]): + raise RuntimeError('weight-set size does not match bucket items') + for pos in range(len(bucket['items'])): + weight_set[bucket['items'][pos]['id']] = b['weight_set'][0][pos] + + self.log.debug('weight_set weights %s' % weight_set) + return weight_set + + def get_util(self, cost_mode, pools, osds): + if cost_mode == 'pg' or \ + cost_mode == 'pg_bytes' or \ + cost_mode == 'pg_objects': + util_map = {} + for osd in osds: + util_map[osd] = 0 + dump = self.get('pg_dump') + #self.log.info('dump %s' % dump) + self.log.info('osds %s' % osds) + for pg in dump['pg_stats']: + inpool = False + for pool in pools: + if pg['pgid'].startswith(str(pool) + '.'): + inpool = True + break + if not inpool: + self.log.info('skipping %s' % pg['pgid']) + continue + self.log.info('pg %s osds %s' % (pg['pgid'], pg['up'])) + for osd in [int(a) for a in pg['up']]: + if osd in osds: + if cost_mode == 'pg': + util_map[osd] += 1 + elif cost_mode == 'pg_bytes': + util_map[osd] += pg['stat_sum']['num_bytes'] + elif cost_mode == 'pg_objects': + util_map[osd] += pg['stat_sum']['num_objects'] + return util_map + 
else: + raise RuntimeError('unsupported cost mode %s' % cost_mode) + + def get_target(self, util_map): + total = 0 + count = 0 + for k, v in util_map.iteritems(): + total += v; + count += 1 + return total / count + + def do_crush(self): + self.log.info('do_crush (not yet implemented)') + + def do_osd_weight(self): + self.log.info('do_osd_weight (not yet implemented)') + + def execute(self, plan): + self.log.info('Executing plan %s' % plan.name) + + commands = [] + + # compat weight-set + if len(plan.compat_ws) and \ + '-1' in plan.crush_dump.get('choose_args', {}): + self.log.debug('ceph osd crush weight-set create-compat') + result = CommandResult('') + self.send_command(result, 'mon', '', json.dumps({ + 'prefix': 'osd crush weight-set create-compat', + 'format': 'json', + }), '') + r, outb, outs = result.wait() + if r != 0: + self.log.error('Error creating compat weight-set') + return + + for osd, weight in plan.compat_ws.iteritems(): + self.log.info('ceph osd crush weight-set reweight-compat osd.%d %f', + osd, weight) + result = CommandResult('foo') + self.send_command(result, 'mon', '', json.dumps({ + 'prefix': 'osd crush weight-set reweight-compat', + 'format': 'json', + 'item': 'osd.%d' % osd, + 'weight': [weight], + }), 'foo') + commands.append(result) + + # new_weight + reweightn = {} + for osd, weight in plan.osd_weights.iteritems(): + reweightn[int(osd)] = float(weight) / float(0x10000) + if len(reweightn): + self.log.info('ceph osd reweightn %s', reweightn) + result = CommandResult('foo') + self.send_command(result, 'mon', '', json.dumps({ + 'prefix': 'osd reweightn', + 'format': 'json', + 'weights': json.dumps(reweightn), + }), 'foo') + commands.append(result) + + # upmap + incdump = plan.inc.dump() for pgid in incdump.get('old_pg_upmap_items', []): self.log.info('ceph osd rm-pg-upmap-items %s', pgid) result = CommandResult('foo') @@ -98,10 +763,7 @@ def do_upmap(self): 'format': 'json', 'pgid': pgid, }), 'foo') - r, outb, outs = result.wait() - if r != 
0: - self.log.error('Error removing pg-upmap on %s' % pgid) - break; + commands.append(result) for item in incdump.get('new_pg_upmap_items', []): self.log.info('ceph osd pg-upmap-items %s mappings %s', item['pgid'], @@ -116,14 +778,11 @@ def do_upmap(self): 'pgid': item['pgid'], 'id': osdlist, }), 'foo') + commands.append(result) + + # wait for commands + for result in commands: r, outb, outs = result.wait() if r != 0: - self.log.error('Error setting pg-upmap on %s' % item['pgid']) - break; - - - def do_crush(self, compat): - self.log.info('do_crush compat=%b' % compat) - - def do_osd_weight(self): - self.log.info('do_osd_weight') + self.log.error('Error on command') + return diff --git a/src/vstart.sh b/src/vstart.sh index d1285bd061a7b..27cb6084fd5b5 100755 --- a/src/vstart.sh +++ b/src/vstart.sh @@ -516,7 +516,7 @@ $COSDMEMSTORE $COSDSHORT $extra_conf [mon] - mgr initial modules = restful status dashboard + mgr initial modules = restful status dashboard balancer mon pg warn min per osd = 3 mon osd allow primary affinity = true mon reweight min pgs per osd = 4 From a14d20ff064c90fee0a28a09d3c292a097e6c105 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Aug 2017 23:47:40 -0400 Subject: [PATCH 13/19] ... 
--- src/pybind/mgr/balancer/module.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/pybind/mgr/balancer/module.py b/src/pybind/mgr/balancer/module.py index 4c270b3280ade..8e2d528b1cb7c 100644 --- a/src/pybind/mgr/balancer/module.py +++ b/src/pybind/mgr/balancer/module.py @@ -538,13 +538,17 @@ def do_upmap(self, plan): break self.log.info('prepared %d/%d changes' % (total_did, max_iterations)) - def do_crush_compat(self): + def do_crush_compat(self, plan): self.log.info('do_crush_compat') osdmap = self.get_osdmap() crush = osdmap.get_crush() + # get current compat weight-set weights weight_set = self.get_compat_weight_set_weights() + ms = plan.initial + pe = self.calc_eval(ms) + # get subtree weight maps, check for overlap roots = crush.find_takes() self.log.debug('roots %s', roots) From 930ca8b04ea93a9d8327deec75556e0408df1ab4 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 4 Aug 2017 17:59:20 -0400 Subject: [PATCH 14/19] pybind/mgr/balancer: make 'crush-compat' sort of work Signed-off-by: Sage Weil --- src/pybind/mgr/balancer/module.py | 95 ++++++++++++++----------------- 1 file changed, 43 insertions(+), 52 deletions(-) diff --git a/src/pybind/mgr/balancer/module.py b/src/pybind/mgr/balancer/module.py index 8e2d528b1cb7c..6e880be4b23da 100644 --- a/src/pybind/mgr/balancer/module.py +++ b/src/pybind/mgr/balancer/module.py @@ -59,7 +59,7 @@ def show(self): self.initial.osdmap.get_crush_version()) ls.append('# mode %s' % self.mode) if len(self.compat_ws) and \ - '-1' in self.crush_dump.get('choose_args', {}): + '-1' not in self.initial.crush_dump.get('choose_args', {}): ls.append('ceph osd crush weight-set create-compat') for osd, weight in self.compat_ws.iteritems(): ls.append('ceph osd crush weight-set reweight-compat %s %f' % @@ -82,6 +82,7 @@ class Eval: pool_name = {} # pool id -> pool name pool_id = {} # pool name -> id pool_roots = {} # pool name -> root name + root_pools = {} # root name -> pools target_by_root = {} # root 
name -> target weight map count_by_pool = {} count_by_root = {} @@ -316,28 +317,33 @@ def calc_eval(self, ms): # get expected distributions by root actual_by_root = {} - roots = ms.crush.find_takes() - for root in roots: - rname = ms.crush.get_item_name(root) - ls = ms.osdmap.get_pools_by_take(root) + rootids = ms.crush.find_takes() + roots = [] + for rootid in rootids: + root = ms.crush.get_item_name(rootid) + roots.append(root) + ls = ms.osdmap.get_pools_by_take(rootid) + pe.root_pools[root] = [] for poolid in ls: - pe.pool_roots[pe.pool_name[poolid]].append(rname) - pe.target_by_root[rname] = ms.crush.get_take_weight_osd_map(root) - actual_by_root[rname] = { + pe.pool_roots[pe.pool_name[poolid]].append(root) + pe.root_pools[root].append(pe.pool_name[poolid]) + pe.target_by_root[root] = ms.crush.get_take_weight_osd_map(rootid) + actual_by_root[root] = { 'pgs': {}, 'objects': {}, 'bytes': {}, } - for osd in pe.target_by_root[rname].iterkeys(): - actual_by_root[rname]['pgs'][osd] = 0 - actual_by_root[rname]['objects'][osd] = 0 - actual_by_root[rname]['bytes'][osd] = 0 - pe.total_by_root[rname] = { + for osd in pe.target_by_root[root].iterkeys(): + actual_by_root[root]['pgs'][osd] = 0 + actual_by_root[root]['objects'][osd] = 0 + actual_by_root[root]['bytes'][osd] = 0 + pe.total_by_root[root] = { 'pgs': 0, 'objects': 0, 'bytes': 0, } self.log.debug('pool_roots %s' % pe.pool_roots) + self.log.debug('root_pools %s' % pe.root_pools) self.log.debug('target_by_root %s' % pe.target_by_root) # pool and root actual @@ -500,12 +506,8 @@ def optimize(self, plan): else: if plan.mode == 'upmap': self.do_upmap(plan) - elif plan.mode == 'crush': - self.do_crush() elif plan.mode == 'crush-compat': - self.do_crush_compat() - elif plan.mode == 'osd_weight': - self.osd_weight() + self.do_crush_compat(plan) elif plan.mode == 'none': self.log.info('Idle') else: @@ -544,21 +546,20 @@ def do_crush_compat(self, plan): crush = osdmap.get_crush() # get current compat weight-set weights - 
weight_set = self.get_compat_weight_set_weights() + old_ws = self.get_compat_weight_set_weights() ms = plan.initial pe = self.calc_eval(ms) - # get subtree weight maps, check for overlap - roots = crush.find_takes() + # Make sure roots don't overlap their devices. If so, we + # can't proceed. + roots = pe.target_by_root.keys() self.log.debug('roots %s', roots) - weight_maps = {} visited = {} overlap = {} - for root in roots: - weight_maps[root] = crush.get_take_weight_osd_map(root) - self.log.debug(' map for %d: %s' % (root, weight_maps[root])) - for osd in weight_maps[root].iterkeys(): + root_ids = {} + for root, wm in pe.target_by_root.iteritems(): + for osd in wm.iterkeys(): if osd in visited: overlap[osd] = 1 visited[osd] = 1 @@ -567,42 +568,30 @@ def do_crush_compat(self, plan): overlap) return - # select a cost mode: pg, pgsize, or utilization - # FIXME: when we add utilization support, we need to throttle back - # so that we don't run if *any* objects are misplaced. with 'pgs' we - # can look at up mappings, which lets us look ahead a bit. 
- cost_mode = 'pg' + key = 'pgs' # pgs objects or bytes # go random.shuffle(roots) for root in roots: - pools = osdmap.get_pools_by_take(root) - self.log.info('Balancing root %s pools %s' % (root, pools)) - wm = weight_maps[root] - util = self.get_util(cost_mode, pools, wm.keys()) - self.log.info('wm %s util %s' % (wm, util)) - if len(util) == 0: - self.log.info('no utilization information, stopping') - return - target = self.get_target(util) - self.log.info('target %s' % target) - if target == 0: - continue - - queue = sorted(util, key=lambda osd: -abs(target - util[osd])) - self.log.info('queue %s' % queue) - + pools = pe.root_pools[root] + self.log.info('Balancing root %s (pools %s) by %s' % + (root, pools, key)) + target = pe.target_by_root[root] + actual = pe.actual_by_root[root][key] + queue = sorted(actual.keys(), + key=lambda osd: -abs(target[osd] - actual[osd])) + self.log.debug('queue %s' % queue) for osd in queue: - deviation = float(util[osd]) - float(target) + deviation = target[osd] - actual[osd] if deviation == 0: break self.log.debug('osd.%d deviation %f', osd, deviation) - weight = weight_set[osd] - calc_weight = (float(target) / float(util[osd])) * weight + weight = old_ws[osd] + calc_weight = target[osd] / actual[osd] * weight new_weight = weight * .7 + calc_weight * .3 self.log.debug('Reweight osd.%d %f -> %f', osd, weight, new_weight) - self.compat_weight_set_reweight(osd, new_weight) + plan.compat_ws[osd] = new_weight def compat_weight_set_reweight(self, osd, new_weight): self.log.debug('ceph osd crush weight-set reweight-compat') @@ -719,7 +708,7 @@ def execute(self, plan): # compat weight-set if len(plan.compat_ws) and \ - '-1' in plan.crush_dump.get('choose_args', {}): + '-1' not in plan.initial.crush_dump.get('choose_args', {}): self.log.debug('ceph osd crush weight-set create-compat') result = CommandResult('') self.send_command(result, 'mon', '', json.dumps({ @@ -785,8 +774,10 @@ def execute(self, plan): commands.append(result) # wait 
for commands + self.log.debug('commands %s' % commands) for result in commands: r, outb, outs = result.wait() if r != 0: self.log.error('Error on command') return + self.log.debug('done') From b3f9a723b453c13868873a3be1d7c23b894843be Mon Sep 17 00:00:00 2001 From: Spandan Kumar Sahu Date: Mon, 7 Aug 2017 04:01:57 +0530 Subject: [PATCH 15/19] src/pybind/mgr/balancer/module.py: improve scoring method * score lies in [0, 1), 0 being perfect distribution * use shifted and scaled cdf of normal distribution to prioritize highly over-weighted device. * consider only over-weighted devices to calculate score Signed-off-by: Spandan Kumar Sahu --- src/pybind/mgr/balancer/module.py | 41 +++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/src/pybind/mgr/balancer/module.py b/src/pybind/mgr/balancer/module.py index 6e880be4b23da..5f79a5d8aec82 100644 --- a/src/pybind/mgr/balancer/module.py +++ b/src/pybind/mgr/balancer/module.py @@ -122,14 +122,45 @@ def calc_stats(self, count, target, total): for t in ('pgs', 'objects', 'bytes'): avg = float(total[t]) / float(num) dev = 0.0 + + # score is a measure of how uneven the data distribution is. + # score lies between [0, 1), 0 means perfect distribution. + score = 0.0 + sum_weight = 0.0 + for k, v in count[t].iteritems(): # adjust/normalize by weight adjusted = float(v) / target[k] / float(num) + + # Overweighted devices and their weights are factors to calculate reweight_urgency. + # One 10% underfilled device with 5 2% overfilled devices, is arguably a better + # situation than one 10% overfilled with 5 2% underfilled devices + if adjusted > avg: + ''' + F(x) = 2*phi(x) - 1, where phi(x) = cdf of standard normal distribution + x = (adjusted - avg)/avg. + Since, we're considering only over-weighted devices, x >= 0, and so phi(x) lies in [0.5, 1). + To bring range of F(x) in range [0, 1), we need to make the above modification. 
+ + In general, we need to use a function F(x), where x = (adjusted - avg)/avg + 1. which is bounded between 0 and 1, so that ultimately reweight_urgency will also be bounded. + 2. A larger value of x, should imply more urgency to reweight. + 3. Also, the difference between F(x) when x is large, should be minimal. + 4. The value of F(x) should get close to 1 (highest urgency to reweight) with steeply. + + Could have used F(x) = (1 - e^(-x)). But that had slower convergence to 1, compared to the one currently in use. + + cdf of standard normal distribution: https://stackoverflow.com/a/29273201 + ''' + score += target[k] * (math.erf(((adjusted - avg)/avg) / math.sqrt(2.0))) + sum_weight += target[k] dev += (avg - adjusted) * (avg - adjusted) stddev = math.sqrt(dev / float(max(num - 1, 1))) + score = score / max(sum_weight, 1) r[t] = { 'avg': avg, 'stddev': stddev, + 'score': sum_weight, } return r @@ -449,7 +480,7 @@ def calc_eval(self, ms): self.log.debug('actual_by_pool %s' % pe.actual_by_pool) self.log.debug('actual_by_root %s' % pe.actual_by_root) - # average and stddev + # average and stddev and score pe.stats_by_root = { a: pe.calc_stats( b, @@ -458,12 +489,12 @@ def calc_eval(self, ms): ) for a, b in pe.count_by_root.iteritems() } - # aggregate score (normalize the stddev by count) + # the scores are already normalized pe.score_by_root = { r: { - 'pgs': pe.stats_by_root[r]['pgs']['stddev'] / max(1, pe.total_by_root[r]['pgs']), - 'objects': pe.stats_by_root[r]['objects']['stddev'] / max(1, pe.total_by_root[r]['objects']), - 'bytes': pe.stats_by_root[r]['bytes']['stddev'] / max(1, pe.total_by_root[r]['bytes']), + 'pgs': pe.stats_by_root[r]['pgs']['score'], + 'objects': pe.stats_by_root[r]['objects']['score'], + 'bytes': pe.stats_by_root[r]['bytes']['score'], } for r in pe.total_by_root.keys() } From 1ac2a8773dcd3129d1106e652cb9feb5576f1b9c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 8 Aug 2017 16:36:23 -0400 Subject: [PATCH 16/19] pybind/mgr/balancer: 
make auto mode work (with upmap at least) Signed-off-by: Sage Weil --- src/pybind/mgr/balancer/module.py | 49 +++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/src/pybind/mgr/balancer/module.py b/src/pybind/mgr/balancer/module.py index 5f79a5d8aec82..0767ba23dd4a1 100644 --- a/src/pybind/mgr/balancer/module.py +++ b/src/pybind/mgr/balancer/module.py @@ -16,7 +16,7 @@ default_sleep_interval = 60 # seconds default_max_misplaced = .03 # max ratio of pgs replaced at a time -TIME_FORMAT = '%Y-%m-%d %H:%M:%S %Z' +TIME_FORMAT = '%Y-%m-%d_%H:%M:%S' class MappingState: @@ -312,9 +312,11 @@ def serve(self): default_sleep_interval)) if self.active: self.log.debug('Running') - plan = self.plan_create('auto-foo') - self.optimize(plan) - #self.plan_apply(plan) + name = 'auto_%s' % time.strftime(TIME_FORMAT, time.gmtime()) + plan = self.plan_create(name) + if self.optimize(plan): + self.execute(plan) + self.plan_rm(name) self.log.debug('Sleeping for %d', sleep_interval) self.event.wait(sleep_interval) self.event.clear() @@ -536,13 +538,14 @@ def optimize(self, plan): misplaced, max_misplaced) else: if plan.mode == 'upmap': - self.do_upmap(plan) + return self.do_upmap(plan) elif plan.mode == 'crush-compat': - self.do_crush_compat(plan) + return self.do_crush_compat(plan) elif plan.mode == 'none': self.log.info('Idle') else: self.log.info('Unrecognized mode %s' % plan.mode) + return False ## @@ -555,7 +558,7 @@ def do_upmap(self, plan): pools = [str(i['pool_name']) for i in ms.osdmap_dump.get('pools',[])] if len(pools) == 0: self.log.info('no pools, nothing to do') - return + return False # shuffle pool list so they all get equal (in)attention random.shuffle(pools) self.log.info('pools %s' % pools) @@ -570,6 +573,7 @@ def do_upmap(self, plan): if left <= 0: break self.log.info('prepared %d/%d changes' % (total_did, max_iterations)) + return True def do_crush_compat(self, plan): self.log.info('do_crush_compat') @@ -597,7 +601,7 @@ def 
do_crush_compat(self, plan): if len(overlap) > 0: self.log.err('error: some osds belong to multiple subtrees: %s' % overlap) - return + return False key = 'pgs' # pgs objects or bytes @@ -623,6 +627,7 @@ def do_crush_compat(self, plan): self.log.debug('Reweight osd.%d %f -> %f', osd, weight, new_weight) plan.compat_ws[osd] = new_weight + return True def compat_weight_set_reweight(self, osd, new_weight): self.log.debug('ceph osd crush weight-set reweight-compat') @@ -761,7 +766,10 @@ def execute(self, plan): 'item': 'osd.%d' % osd, 'weight': [weight], }), 'foo') - commands.append(result) + r, outb, outs = result.wait() + if r != 0: + self.log.error('Error on command') + return # new_weight reweightn = {} @@ -775,7 +783,10 @@ def execute(self, plan): 'format': 'json', 'weights': json.dumps(reweightn), }), 'foo') - commands.append(result) + r, outb, outs = result.wait() + if r != 0: + self.log.error('Error on command') + return # upmap incdump = plan.inc.dump() @@ -787,7 +798,10 @@ def execute(self, plan): 'format': 'json', 'pgid': pgid, }), 'foo') - commands.append(result) + r, outb, outs = result.wait() + if r != 0: + self.log.error('Error on command') + return for item in incdump.get('new_pg_upmap_items', []): self.log.info('ceph osd pg-upmap-items %s mappings %s', item['pgid'], @@ -802,13 +816,16 @@ def execute(self, plan): 'pgid': item['pgid'], 'id': osdlist, }), 'foo') - commands.append(result) - - # wait for commands - self.log.debug('commands %s' % commands) - for result in commands: r, outb, outs = result.wait() if r != 0: self.log.error('Error on command') return + + # wait for commands + #self.log.debug('commands %s' % commands) + #for result in commands: + # r, outb, outs = result.wait() + # if r != 0: + # self.log.error('Error on command') + # return self.log.debug('done') From 34729c5eddb199e537f875f57a1c233646a99f69 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 8 Aug 2017 16:36:23 -0400 Subject: [PATCH 17/19] pybind/mgr/balancer: make auto mode 
work (with upmap at least) Signed-off-by: Sage Weil --- src/pybind/mgr/balancer/module.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/pybind/mgr/balancer/module.py b/src/pybind/mgr/balancer/module.py index 0767ba23dd4a1..22f085673b46d 100644 --- a/src/pybind/mgr/balancer/module.py +++ b/src/pybind/mgr/balancer/module.py @@ -547,8 +547,6 @@ def optimize(self, plan): self.log.info('Unrecognized mode %s' % plan.mode) return False - ## - def do_upmap(self, plan): self.log.info('do_upmap') max_iterations = self.get_config('upmap_max_iterations', 10) @@ -816,6 +814,10 @@ def execute(self, plan): 'pgid': item['pgid'], 'id': osdlist, }), 'foo') + commands.append(result) + + # wait for commands + for result in commands: r, outb, outs = result.wait() if r != 0: self.log.error('Error on command') @@ -829,3 +831,4 @@ def execute(self, plan): # self.log.error('Error on command') # return self.log.debug('done') + From 6cf5c1cbfa8a9ee7c4685a88170f59e844a81aab Mon Sep 17 00:00:00 2001 From: Spandan Kumar Sahu Date: Sat, 29 Jul 2017 14:53:19 +0530 Subject: [PATCH 18/19] src/pybind/mgr/balancer/module.py: implement 'do_osd_weight' * make cost_model a config * use existing reweight-by-utilization and reweight-by-pg Signed-off-by: Spandan Kumar Sahu --- src/pybind/mgr/balancer/module.py | 48 ++++++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 7 deletions(-) diff --git a/src/pybind/mgr/balancer/module.py b/src/pybind/mgr/balancer/module.py index 22f085673b46d..1c9d75d73cd4c 100644 --- a/src/pybind/mgr/balancer/module.py +++ b/src/pybind/mgr/balancer/module.py @@ -13,12 +13,12 @@ # available modes: 'none', 'crush', 'crush-compat', 'upmap', 'osd_weight' default_mode = 'none' +default_cost_mode = 'pg' default_sleep_interval = 60 # seconds default_max_misplaced = .03 # max ratio of pgs replaced at a time TIME_FORMAT = '%Y-%m-%d_%H:%M:%S' - class MappingState: def __init__(self, osdmap, pg_dump, desc=''): self.desc = desc @@ -172,10 +172,15 @@ 
class Module(MgrModule): "perm": "r", }, { - "cmd": "balancer mode name=mode,type=CephChoices,strings=none|crush-compat|upmap", + "cmd": "balancer mode name=mode,type=CephChoices,strings=none|crush-compat|upmap|osd_weight", "desc": "Set balancer mode", "perm": "rw", }, + { + "cmd": "balancer cost-mode name=cost-mode,type=CephChoices,strings=pg|pg-size|utilization", + "desc": "Set balancer cost mode", + "perm": "rw", + }, { "cmd": "balancer on", "desc": "Enable automatic balancing", @@ -238,11 +243,15 @@ def handle_command(self, command): 'plans': self.plans.keys(), 'active': self.active, 'mode': self.get_config('mode', default_mode), + 'cost-mode': self.get_config('cost_mode', default_cost_mode), } return (0, json.dumps(s, indent=4), '') elif command['prefix'] == 'balancer mode': self.set_config('mode', command['mode']) return (0, '', '') + elif command['prefix'] == 'balancer cost-mode': + self.set_config('cost_mode', command['cost-mode']) + return (0, '', '') elif command['prefix'] == 'balancer on': if not self.active: self.set_config('active', '1') @@ -541,6 +550,8 @@ def optimize(self, plan): return self.do_upmap(plan) elif plan.mode == 'crush-compat': return self.do_crush_compat(plan) + elif plan.mode == 'osd_weight': + self.do_osd_weight() elif plan.mode == 'none': self.log.info('Idle') else: @@ -602,6 +613,11 @@ def do_crush_compat(self, plan): return False key = 'pgs' # pgs objects or bytes + # select a cost mode: pg, pgsize, or utilization + # FIXME: when we add utilization support, we need to throttle back + # so that we don't run if *any* objects are misplaced. with 'pgs' we + # can look at up mappings, which lets us look ahead a bit. 
+ cost_mode = self.get_config('cost_mode', default_cost_mode) # go random.shuffle(roots) @@ -733,7 +749,29 @@ def do_crush(self): self.log.info('do_crush (not yet implemented)') def do_osd_weight(self): - self.log.info('do_osd_weight (not yet implemented)') + self.log.info('do_osd_weight') + + cost_mode = self.get_config('cost_mode', default_cost_mode) + + # Reweight any over-utilized OSD + result = CommandResult('balancer_osd_weight') + if cost_mode == 'pg': + self.send_command(result, 'mon', '', json.dumps({ + 'prefix': 'osd reweight-by-pg', + 'oload': '101', + }), 'balancer_osd_weight') + r, outb, outs = result.wait() + elif cost_mode == 'utilization': + self.send_command(result, 'mon', '', json.dumps({ + 'prefix': 'osd reweight-by-utilization', + 'oload': '101', + }), 'balancer_osd_weight') + r, outb, outs = result.wait() + else: + raise RuntimeError('unsupported cost mode %s' % cost_mode) + if r != 0: + self.log.error(outs) + return def execute(self, plan): self.log.info('Executing plan %s' % plan.name) @@ -814,10 +852,6 @@ def execute(self, plan): 'pgid': item['pgid'], 'id': osdlist, }), 'foo') - commands.append(result) - - # wait for commands - for result in commands: r, outb, outs = result.wait() if r != 0: self.log.error('Error on command') From 6bc153fab2cd22537a4bfa52bda0b0139067ee5f Mon Sep 17 00:00:00 2001 From: Spandan Kumar Sahu Date: Sun, 30 Jul 2017 00:15:00 +0530 Subject: [PATCH 19/19] src/mon/MonCommands.h: add 'reweight-by-pg' and 'reweight-by-utilization' as mon commands Signed-off-by: Spandan Kumar Sahu --- src/mon/MonCommands.h | 14 ++++++++++++++ src/pybind/mgr/balancer/module.py | 1 + 2 files changed, 15 insertions(+) diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index 41e3f79f1175a..1668055880a2e 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -808,6 +808,20 @@ COMMAND("osd reweightn " \ "name=weights,type=CephString", "reweight osds with {: ,...})", "osd", "rw", "cli,rest") +COMMAND("osd 
reweight-by-utilization " \ + "name=oload,type=CephInt,req=false " \ + "name=max_change,type=CephFloat,req=false " \ + "name=max_osds,type=CephInt,req=false " \ + "name=no_increasing,type=CephChoices,strings=--no-increasing,req=false",\ + "reweight OSDs by utilization [overload-percentage-for-consideration, default 120]", \ + "osd", "rw", "cli,rest") +COMMAND("osd reweight-by-pg " \ + "name=oload,type=CephInt,req=false " \ + "name=max_change,type=CephFloat,req=false " \ + "name=max_osds,type=CephInt,req=false " \ + "name=pools,type=CephPoolname,n=N,req=false", \ + "reweight OSDs by PG distribution [overload-percentage-for-consideration, default 120]", \ + "osd", "rw", "cli,rest") COMMAND("osd force-create-pg " \ "name=pgid,type=CephPgid ", "force creation of pg ", diff --git a/src/pybind/mgr/balancer/module.py b/src/pybind/mgr/balancer/module.py index 1c9d75d73cd4c..ae834f040041b 100644 --- a/src/pybind/mgr/balancer/module.py +++ b/src/pybind/mgr/balancer/module.py @@ -19,6 +19,7 @@ TIME_FORMAT = '%Y-%m-%d_%H:%M:%S' + class MappingState: def __init__(self, osdmap, pg_dump, desc=''): self.desc = desc