From ba7e43a7aa936a06b292ff4cc654cc4b275c8789 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 11 Jul 2017 16:25:42 -0400 Subject: [PATCH 01/19] mgr/PyModules: add 'pg_status' dump This is summary info, same as what's in 'ceph status'. Signed-off-by: Sage Weil --- src/mgr/PyModules.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/mgr/PyModules.cc b/src/mgr/PyModules.cc index 74628dbbf1c7b..d63b94b939f6d 100644 --- a/src/mgr/PyModules.cc +++ b/src/mgr/PyModules.cc @@ -254,6 +254,14 @@ PyObject *PyModules::get_python(const std::string &what) } ); return f.get(); + } else if (what == "pg_status") { + PyFormatter f; + cluster_state.with_pgmap( + [&f](const PGMap &pg_map) { + pg_map.print_summary(&f, nullptr); + } + ); + return f.get(); } else if (what == "df") { PyFormatter f; From 7686fd1f26018e21a15b4b80eaca780e384ce935 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 27 Jul 2017 10:06:45 -0400 Subject: [PATCH 02/19] mgr/PyModules: add 'pg_dump' get Signed-off-by: Sage Weil --- src/mgr/PyModules.cc | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/mgr/PyModules.cc b/src/mgr/PyModules.cc index d63b94b939f6d..526917f68b761 100644 --- a/src/mgr/PyModules.cc +++ b/src/mgr/PyModules.cc @@ -262,7 +262,14 @@ PyObject *PyModules::get_python(const std::string &what) } ); return f.get(); - + } else if (what == "pg_dump") { + PyFormatter f; + cluster_state.with_pgmap( + [&f](const PGMap &pg_map) { + pg_map.dump(&f); + } + ); + return f.get(); } else if (what == "df") { PyFormatter f; From 4337ec8fee6895c39a954eb1947f497cb7afbb9d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 10 Jul 2017 23:23:19 -0400 Subject: [PATCH 03/19] mgr: add trivial OSDMap wrapper class Signed-off-by: Sage Weil --- src/CMakeLists.txt | 1 + src/mgr/MgrPyModule.cc | 4 + src/mgr/PyModules.cc | 23 ++++++ src/mgr/PyModules.h | 1 + src/mgr/PyOSDMap.cc | 125 +++++++++++++++++++++++++++++ src/mgr/PyOSDMap.h | 10 +++ src/mgr/PyState.cc | 9 +++ 
src/pybind/mgr/dashboard/module.py | 3 + src/pybind/mgr/mgr_module.py | 49 +++++++++++ 9 files changed, 225 insertions(+) create mode 100644 src/mgr/PyOSDMap.cc create mode 100644 src/mgr/PyOSDMap.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2a8639aa062ff..424a7e5fe14c7 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -688,6 +688,7 @@ if (WITH_MGR) mgr/ClusterState.cc mgr/PyModules.cc mgr/PyFormatter.cc + mgr/PyOSDMap.cc mgr/PyState.cc mgr/MgrPyModule.cc mgr/MgrStandby.cc diff --git a/src/mgr/MgrPyModule.cc b/src/mgr/MgrPyModule.cc index fda9bf6528dbd..c5f54361465c3 100644 --- a/src/mgr/MgrPyModule.cc +++ b/src/mgr/MgrPyModule.cc @@ -12,6 +12,7 @@ */ #include "PyState.h" +#include "PyOSDMap.h" #include "Gil.h" #include "PyFormatter.h" @@ -109,6 +110,9 @@ MgrPyModule::MgrPyModule(const std::string &module_name_, const std::string &sys } // Populate python namespace with callable hooks Py_InitModule("ceph_state", CephStateMethods); + Py_InitModule("ceph_osdmap", OSDMapMethods); + Py_InitModule("ceph_osdmap_incremental", OSDMapIncrementalMethods); + Py_InitModule("ceph_crushmap", CRUSHMapMethods); PySys_SetPath(const_cast(sys_path.c_str())); } diff --git a/src/mgr/PyModules.cc b/src/mgr/PyModules.cc index 526917f68b761..14372249f646d 100644 --- a/src/mgr/PyModules.cc +++ b/src/mgr/PyModules.cc @@ -786,6 +786,29 @@ PyObject *PyModules::get_context() return capsule; } +static void delete_osdmap(PyObject *object) +{ + OSDMap *osdmap = static_cast(PyCapsule_GetPointer(object, nullptr)); + assert(osdmap); + dout(10) << __func__ << " " << osdmap << dendl; + delete osdmap; +} + +PyObject *PyModules::get_osdmap() +{ + PyThreadState *tstate = PyEval_SaveThread(); + Mutex::Locker l(lock); + PyEval_RestoreThread(tstate); + + // Construct a capsule containing an OSDMap. 
+ OSDMap *newmap = new OSDMap; + cluster_state.with_osdmap([&](const OSDMap& o) { + newmap->deepish_copy_from(o); + }); + dout(10) << __func__ << " " << newmap << dendl; + return PyCapsule_New(newmap, nullptr, &delete_osdmap); +} + static void _list_modules( const std::string path, std::set *modules) diff --git a/src/mgr/PyModules.h b/src/mgr/PyModules.h index 1c6751f4e51b9..b2540777163e3 100644 --- a/src/mgr/PyModules.h +++ b/src/mgr/PyModules.h @@ -82,6 +82,7 @@ class PyModules const std::string svc_type, const std::string &svc_id); PyObject *get_context(); + PyObject *get_osdmap(); std::map config_cache; diff --git a/src/mgr/PyOSDMap.cc b/src/mgr/PyOSDMap.cc new file mode 100644 index 0000000000000..9b1bdc4e3b79d --- /dev/null +++ b/src/mgr/PyOSDMap.cc @@ -0,0 +1,125 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "Mgr.h" + +#include "osd/OSDMap.h" +#include "common/errno.h" +#include "common/version.h" + +#include "PyOSDMap.h" +#include "PyFormatter.h" +#include "Gil.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mgr + +// ---------- + +static PyObject *osdmap_get_epoch(PyObject *self, PyObject *obj) +{ + OSDMap *osdmap = static_cast(PyCapsule_GetPointer(obj, nullptr)); + return PyInt_FromLong(osdmap->get_epoch()); +} + +static PyObject *osdmap_dump(PyObject *self, PyObject *obj) +{ + OSDMap *osdmap = static_cast(PyCapsule_GetPointer(obj, nullptr)); + PyFormatter f; + osdmap->dump(&f); + return f.get(); +} + + +static void delete_osdmap_incremental(PyObject *object) +{ + OSDMap::Incremental *inc = static_cast( + PyCapsule_GetPointer(object, nullptr)); + derr << __func__ << " " << inc << dendl; + delete inc; +} + +static PyObject *osdmap_new_incremental(PyObject *self, PyObject *obj) +{ + OSDMap *osdmap = static_cast(PyCapsule_GetPointer(obj, nullptr)); + + // Construct a capsule containing an OSDMap. 
+ OSDMap::Incremental *inc = new OSDMap::Incremental; + inc->fsid = osdmap->get_fsid(); + inc->epoch = osdmap->get_epoch() + 1; + return PyCapsule_New(inc, nullptr, &delete_osdmap_incremental); +} + +static PyObject *osdmap_calc_pg_upmaps(PyObject *self, PyObject *args) +{ + PyObject *mapobj, *incobj, *pool_list; + double max_deviation = 0; + int max_iterations = 0; + if (!PyArg_ParseTuple(args, "OOdiO:calc_pg_upmaps", + &mapobj, &incobj, &max_deviation, + &max_iterations, &pool_list)) { + return nullptr; + } + + OSDMap *osdmap = static_cast(PyCapsule_GetPointer(mapobj, nullptr)); + OSDMap::Incremental *inc = static_cast( + PyCapsule_GetPointer(incobj, nullptr)); + + dout(10) << __func__ << " osdmap " << osdmap << " inc " << inc + << " max_deviation " << max_deviation + << " max_iterations " << max_iterations + << dendl; + set pools; + // FIXME: unpack pool_list and translate to pools set + int r = osdmap->calc_pg_upmaps(g_ceph_context, + max_deviation, + max_iterations, + pools, + inc); + dout(10) << __func__ << " r = " << r << dendl; + return PyInt_FromLong(r); +} + +PyMethodDef OSDMapMethods[] = { + {"get_epoch", osdmap_get_epoch, METH_O, "Get OSDMap epoch"}, + {"dump", osdmap_dump, METH_O, "Dump OSDMap::Incremental"}, + {"new_incremental", osdmap_new_incremental, METH_O, + "Create OSDMap::Incremental"}, + {"calc_pg_upmaps", osdmap_calc_pg_upmaps, METH_VARARGS, + "Calculate new pg-upmap values"}, + {NULL, NULL, 0, NULL} +}; + +// ---------- + +static PyObject *osdmap_inc_get_epoch(PyObject *self, PyObject *obj) +{ + OSDMap::Incremental *inc = static_cast( + PyCapsule_GetPointer(obj, nullptr)); + return PyInt_FromLong(inc->epoch); +} + +static PyObject *osdmap_inc_dump(PyObject *self, PyObject *obj) +{ + OSDMap::Incremental *inc = static_cast( + PyCapsule_GetPointer(obj, nullptr)); + PyFormatter f; + inc->dump(&f); + return f.get(); +} + +PyMethodDef OSDMapIncrementalMethods[] = { + {"get_epoch", osdmap_inc_get_epoch, METH_O, "Get OSDMap::Incremental epoch"}, + 
{"dump", osdmap_inc_dump, METH_O, "Dump OSDMap::Incremental"}, + {NULL, NULL, 0, NULL} +}; + + +// ---------- + + + +PyMethodDef CRUSHMapMethods[] = { +// {"get_epoch", osdmap_get_epoch, METH_O, "Get OSDMap epoch"}, + {NULL, NULL, 0, NULL} +}; diff --git a/src/mgr/PyOSDMap.h b/src/mgr/PyOSDMap.h new file mode 100644 index 0000000000000..bfa851cd66012 --- /dev/null +++ b/src/mgr/PyOSDMap.h @@ -0,0 +1,10 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "Python.h" + +extern PyMethodDef OSDMapMethods[]; +extern PyMethodDef OSDMapIncrementalMethods[]; +extern PyMethodDef CRUSHMapMethods[]; diff --git a/src/mgr/PyState.cc b/src/mgr/PyState.cc index ae237b2e599e6..d70a96d316cc2 100644 --- a/src/mgr/PyState.cc +++ b/src/mgr/PyState.cc @@ -348,6 +348,13 @@ get_perf_schema(PyObject *self, PyObject *args) return global_handle->get_perf_schema_python(handle, type_str, svc_id); } +static PyObject * +ceph_get_osdmap(PyObject *self, PyObject *args) +{ + return global_handle->get_osdmap(); +} + + PyMethodDef CephStateMethods[] = { {"get", ceph_state_get, METH_VARARGS, "Get a cluster object"}, @@ -377,6 +384,8 @@ PyMethodDef CephStateMethods[] = { "Get the ceph version of this process"}, {"get_context", ceph_get_context, METH_NOARGS, "Get a CephContext* in a python capsule"}, + {"get_osdmap", ceph_get_osdmap, METH_NOARGS, + "Get an OSDMap handle"}, {NULL, NULL, 0, NULL} }; diff --git a/src/pybind/mgr/dashboard/module.py b/src/pybind/mgr/dashboard/module.py index 7e2a4260a03e6..c9039a831f7f4 100644 --- a/src/pybind/mgr/dashboard/module.py +++ b/src/pybind/mgr/dashboard/module.py @@ -802,6 +802,9 @@ def get_perf_schema(self, **args): 'engine.autoreload.on': False }) + osdmap = self.get_osdmap() + log.info("latest osdmap is %d" % osdmap.get_epoch()) + static_dir = os.path.join(current_dir, 'static') conf = { "/static": { diff --git a/src/pybind/mgr/mgr_module.py b/src/pybind/mgr/mgr_module.py index 
49c7efe15ca8d..2b64a2ea9ccfc 100644 --- a/src/pybind/mgr/mgr_module.py +++ b/src/pybind/mgr/mgr_module.py @@ -1,5 +1,8 @@ import ceph_state #noqa +import ceph_osdmap #noqa +import ceph_osdmap_incremental #noqa +import ceph_crushmap #noqa import json import logging import threading @@ -30,6 +33,44 @@ def wait(self): return self.r, self.outb, self.outs +class OSDMap(object): + def __init__(self, handle): + self._handle = handle + + def get_epoch(self): + return ceph_osdmap.get_epoch(self._handle) + + def dump(self): + return ceph_osdmap.dump(self._handle) + + def new_incremental(self): + return OSDMapIncremental(ceph_osdmap.new_incremental(self._handle)) + + def calc_pg_upmaps(self, inc, max_deviation=.01, max_iterations=10, pools=[]): + return ceph_osdmap.calc_pg_upmaps( + self._handle, + inc._handle, + max_deviation, max_iterations, pools) + + +class OSDMapIncremental(object): + def __init__(self, handle): + self._handle = handle + + def get_epoch(self): + return ceph_osdmap_incremental.get_epoch(self._handle) + + def dump(self): + return ceph_osdmap_incremental.dump(self._handle) + +class CRUSHMap(object): + def __init__(self, handle): + self._handle = handle + +# def get_epoch(self): +# return ceph_crushmap.get_epoch(self._handle) + + class MgrModule(object): COMMANDS = [] @@ -275,3 +316,11 @@ def self_test(self): :return: bool """ pass + + def get_osdmap(self): + """ + Get a handle to an OSDMap. If epoch==0, get a handle for the latest + OSDMap. 
+ :return: OSDMap + """ + return OSDMap(ceph_state.get_osdmap()) From 57ba13f322170bfad1a1c12b124202f93c9cc864 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 11 Jul 2017 16:26:16 -0400 Subject: [PATCH 04/19] pybind/mgr/mgr_module: add default arg to get_config Signed-off-by: Sage Weil --- src/pybind/mgr/mgr_module.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/pybind/mgr/mgr_module.py b/src/pybind/mgr/mgr_module.py index 2b64a2ea9ccfc..346bc6e26dd0f 100644 --- a/src/pybind/mgr/mgr_module.py +++ b/src/pybind/mgr/mgr_module.py @@ -236,14 +236,18 @@ def get_mgr_id(self): """ return ceph_state.get_mgr_id() - def get_config(self, key): + def get_config(self, key, default=None): """ Retrieve the value of a persistent configuration setting :param key: str :return: str """ - return ceph_state.get_config(self._handle, key) + r = ceph_state.get_config(self._handle, key) + if r is None: + return default + else: + return r def get_config_prefix(self, key_prefix): """ From 4f6305f7adb97e98e50bda5aff7a392ca303136c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 11 Jul 2017 16:27:08 -0400 Subject: [PATCH 05/19] pybind/mgr/balancer: add balancer module - wake up every minute - back off when unknown, inactive, degraded - throttle against misplaced ratio - apply some optimization step - initially implement 'upmap' only Signed-off-by: Sage Weil --- src/pybind/mgr/balancer/__init__.py | 2 + src/pybind/mgr/balancer/module.py | 109 ++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+) create mode 100644 src/pybind/mgr/balancer/__init__.py create mode 100644 src/pybind/mgr/balancer/module.py diff --git a/src/pybind/mgr/balancer/__init__.py b/src/pybind/mgr/balancer/__init__.py new file mode 100644 index 0000000000000..79f5b86fd5045 --- /dev/null +++ b/src/pybind/mgr/balancer/__init__.py @@ -0,0 +1,2 @@ + +from module import * # NOQA diff --git a/src/pybind/mgr/balancer/module.py b/src/pybind/mgr/balancer/module.py new file mode 100644 
index 0000000000000..883be130c3135 --- /dev/null +++ b/src/pybind/mgr/balancer/module.py @@ -0,0 +1,109 @@ + +""" +Balance PG distribution across OSDs. +""" + +import json +import time +from mgr_module import MgrModule, CommandResult +from threading import Event + +# available modes: 'none', 'crush', 'crush-compat', 'upmap', 'osd_weight' +default_mode = 'upmap' +default_sleep_interval = 30 # seconds +default_max_misplaced = .03 # max ratio of pgs replaced at a time + +class Module(MgrModule): + run = True + + def __init__(self, *args, **kwargs): + super(Module, self).__init__(*args, **kwargs) + self.event = Event() + + def handle_command(self, command): + return (-errno.EINVAL, '', + "Command not found '{0}'".format(command['prefix'])) + + def shutdown(self): + self.log.info('Stopping') + self.run = False + self.event.set() + + def serve(self): + self.log.info('Starting') + while self.run: + mode = self.get_config('mode', default_mode) + sleep_interval = float(self.get_config('sleep_interval', + default_sleep_interval)) + max_misplaced = float(self.get_config('max_misplaced', + default_max_misplaced)) + self.log.info('Mode %s, sleep interval %f, max misplaced %f' % + (mode, sleep_interval, max_misplaced)) + + info = self.get('pg_status') + if info.get('unknown_ratio', 0.0) + info.get('degraded_ratio', 0.0) + info.get('inactive_ratio', 0.0) > 0.0: + self.log.info('Some PGs are unknown, degraded, or inactive; waiting') + elif info.get('misplaced_ratio', 0.0) >= max_misplaced: + self.log.info('Too many PGs (%f > max %f) are misplaced; waiting' % (info.get('misplaced_ratio', 0.0), max_misplaced)) + else: + if mode == 'upmap': + self.do_upmap() + elif mode == 'crush': + self.do_crush(compat=False) + elif mode == 'crush-compat': + self.do_crush(compat=True) + elif mode == 'osd_weight': + self.do_osd_weight() + elif mode == 'none': + self.log.info('Idle') + else: + self.log.info('Unrecognized mode %s' % mode) + self.event.wait(sleep_interval) + + def do_upmap(self): + 
self.log.info('do_upmap') + max_iterations = self.get_config('upmap_max_iterations', 10) + max_deviation = self.get_config('upmap_max_deviation', .01) + osdmap = self.get_osdmap() + inc = osdmap.new_incremental() + osdmap.calc_pg_upmaps(inc, max_deviation, max_iterations, []) + incdump = inc.dump() + self.log.info('inc is %s' % incdump) + + for pgid in incdump.get('old_pg_upmap_items', []): + self.log.info('ceph osd rm-pg-upmap-items %s', pgid) + result = CommandResult('foo') + self.send_command(result, 'mon', '', json.dumps({ + 'prefix': 'osd rm-pg-upmap-items', + 'format': 'json', + 'pgid': pgid, + }), 'foo') + r, outb, outs = result.wait() + if r != 0: + self.log.error('Error removing pg-upmap on %s' % pgid) + break; + + for item in incdump.get('new_pg_upmap_items', []): + self.log.info('ceph osd pg-upmap-items %s mappings %s', item['pgid'], + item['mappings']) + osdlist = [] + for m in item['mappings']: + osdlist += [m['from'], m['to']] + result = CommandResult('foo') + self.send_command(result, 'mon', '', json.dumps({ + 'prefix': 'osd pg-upmap-items', + 'format': 'json', + 'pgid': item['pgid'], + 'id': osdlist, + }), 'foo') + r, outb, outs = result.wait() + if r != 0: + self.log.error('Error setting pg-upmap on %s' % item['pgid']) + break; + + + def do_crush(self, compat): + self.log.info('do_crush compat=%s' % compat) + + def do_osd_weight(self): + self.log.info('do_osd_weight') From f7f017a7297f6ff891c21f06ce6ae0c90508c5a5 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 22 Jul 2017 23:17:18 -0400 Subject: [PATCH 06/19] pybind/mgr/balancer: do upmap by pool, in random order Signed-off-by: Sage Weil --- src/pybind/mgr/balancer/module.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/src/pybind/mgr/balancer/module.py b/src/pybind/mgr/balancer/module.py index 883be130c3135..c95f88de0591d 100644 --- a/src/pybind/mgr/balancer/module.py +++ b/src/pybind/mgr/balancer/module.py @@ -4,6 +4,7 @@ """ import json +import 
random import time from mgr_module import MgrModule, CommandResult from threading import Event @@ -64,11 +65,30 @@ def do_upmap(self): self.log.info('do_upmap') max_iterations = self.get_config('upmap_max_iterations', 10) max_deviation = self.get_config('upmap_max_deviation', .01) + + osdmap_dump = self.get('osd_map') + pools = [str(i['pool_name']) for i in osdmap_dump.get('pools',[])] + if len(pools) == 0: + self.log.info('no pools, nothing to do') + return + # shuffle pool list so they all get equal (in)attention + random.shuffle(pools) + self.log.info('pools %s' % pools) + osdmap = self.get_osdmap() inc = osdmap.new_incremental() - osdmap.calc_pg_upmaps(inc, max_deviation, max_iterations, []) + total_did = 0 + left = max_iterations + for pool in pools: + did = osdmap.calc_pg_upmaps(inc, max_deviation, left, [pool]) + total_did += did + left -= did + if left <= 0: + break + self.log.info('prepared %d/%d changes' % (total_did, max_iterations)) + incdump = inc.dump() - self.log.info('inc is %s' % incdump) + self.log.debug('resulting inc is %s' % incdump) for pgid in incdump.get('old_pg_upmap_items', []): self.log.info('ceph osd rm-pg-upmap-items %s', pgid) From 826b10e02760a8aef9fa8d5af63bf4150204aaec Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 22 Jul 2017 23:50:27 -0400 Subject: [PATCH 07/19] crush/CrushWrapper: refactor get_rule_weight_osd_map to work with roots too Allow us to specify a root node in the hierarchy instead of a rule. This way we can use it in conjunction with find_takes(). 
Signed-off-by: Sage Weil --- src/crush/CrushWrapper.cc | 80 +++++++++++++++++++++++++-------------- src/crush/CrushWrapper.h | 20 +++++++++- 2 files changed, 71 insertions(+), 29 deletions(-) diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc index c01856607afdb..2b02f24131548 100644 --- a/src/crush/CrushWrapper.cc +++ b/src/crush/CrushWrapper.cc @@ -1577,7 +1577,56 @@ int CrushWrapper::add_simple_rule( rule_type, -1, err); } -int CrushWrapper::get_rule_weight_osd_map(unsigned ruleno, map *pmap) +float CrushWrapper::_get_take_weight_osd_map(int root, + map *pmap) const +{ + float sum = 0.0; + list q; + q.push_back(root); + //breadth first iterate the OSD tree + while (!q.empty()) { + int bno = q.front(); + q.pop_front(); + crush_bucket *b = crush->buckets[-1-bno]; + assert(b); + for (unsigned j=0; jsize; ++j) { + int item_id = b->items[j]; + if (item_id >= 0) { //it's an OSD + float w = crush_get_bucket_item_weight(b, j); + (*pmap)[item_id] = w; + sum += w; + } else { //not an OSD, expand the child later + q.push_back(item_id); + } + } + } + return sum; +} + +void CrushWrapper::_normalize_weight_map(float sum, + const map& m, + map *pmap) const +{ + for (auto& p : m) { + map::iterator q = pmap->find(p.first); + if (q == pmap->end()) { + (*pmap)[p.first] = p.second / sum; + } else { + q->second += p.second / sum; + } + } +} + +int CrushWrapper::get_take_weight_osd_map(int root, map *pmap) const +{ + map m; + float sum = _get_take_weight_osd_map(root, &m); + _normalize_weight_map(sum, m, pmap); + return 0; +} + +int CrushWrapper::get_rule_weight_osd_map(unsigned ruleno, + map *pmap) const { if (ruleno >= crush->max_rules) return -ENOENT; @@ -1600,35 +1649,10 @@ int CrushWrapper::get_rule_weight_osd_map(unsigned ruleno, map *pmap) m[n] = 1.0; sum = 1.0; } else { - list q; - q.push_back(n); - //breadth first iterate the OSD tree - while (!q.empty()) { - int bno = q.front(); - q.pop_front(); - crush_bucket *b = crush->buckets[-1-bno]; - assert(b); - for 
(unsigned j=0; jsize; ++j) { - int item_id = b->items[j]; - if (item_id >= 0) { //it's an OSD - float w = crush_get_bucket_item_weight(b, j); - m[item_id] = w; - sum += w; - } else { //not an OSD, expand the child later - q.push_back(item_id); - } - } - } - } - } - for (map::iterator p = m.begin(); p != m.end(); ++p) { - map::iterator q = pmap->find(p->first); - if (q == pmap->end()) { - (*pmap)[p->first] = p->second / sum; - } else { - q->second += p->second / sum; + sum += _get_take_weight_osd_map(n, &m); } } + _normalize_weight_map(sum, m, pmap); } return 0; diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h index ec6a9ea85816f..8b3fd438a30c8 100644 --- a/src/crush/CrushWrapper.h +++ b/src/crush/CrushWrapper.h @@ -1006,6 +1006,12 @@ class CrushWrapper { return s->arg2; } +private: + float _get_take_weight_osd_map(int root, map *pmap) const; + void _normalize_weight_map(float sum, const map& m, + map *pmap) const; + +public: /** * calculate a map of osds to weights for a given rule * @@ -1016,7 +1022,19 @@ class CrushWrapper { * @param pmap [out] map of osd to weight * @return 0 for success, or negative error code */ - int get_rule_weight_osd_map(unsigned ruleno, map *pmap); + int get_rule_weight_osd_map(unsigned ruleno, map *pmap) const; + + /** + * calculate a map of osds to weights for a given starting root + * + * Generate a map of which OSDs get how much relative weight for a + * given starting root + * + * @param root node + * @param pmap [out] map of osd to weight + * @return 0 for success, or negative error code + */ + int get_take_weight_osd_map(int root, map *pmap) const; /* modifiers */ From 626059d474cf66264ed173d0a97a77ba6a14a4ff Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 22 Jul 2017 23:51:47 -0400 Subject: [PATCH 08/19] crush/CrushWrapper: fix output arg for find_{takes,roots}() Signed-off-by: Sage Weil --- src/crush/CrushTreeDumper.h | 2 +- src/crush/CrushWrapper.cc | 20 ++++++++++---------- src/crush/CrushWrapper.h | 6 
+++--- src/tools/crushtool.cc | 2 +- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/crush/CrushTreeDumper.h b/src/crush/CrushTreeDumper.h index 199b3c39be3ec..8d48069fe1b7b 100644 --- a/src/crush/CrushTreeDumper.h +++ b/src/crush/CrushTreeDumper.h @@ -68,7 +68,7 @@ namespace CrushTreeDumper { explicit Dumper(const CrushWrapper *crush_, const name_map_t& weight_set_names_) : crush(crush_), weight_set_names(weight_set_names_) { - crush->find_nonshadow_roots(roots); + crush->find_nonshadow_roots(&roots); root = roots.begin(); } diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc index 2b02f24131548..428bf29c9c0ad 100644 --- a/src/crush/CrushWrapper.cc +++ b/src/crush/CrushWrapper.cc @@ -291,7 +291,7 @@ int CrushWrapper::rename_bucket(const string& srcname, return set_item_name(oldid, dstname); } -void CrushWrapper::find_takes(set& roots) const +void CrushWrapper::find_takes(set *roots) const { for (unsigned i=0; imax_rules; i++) { crush_rule *r = crush->rules[i]; @@ -299,23 +299,23 @@ void CrushWrapper::find_takes(set& roots) const continue; for (unsigned j=0; jlen; j++) { if (r->steps[j].op == CRUSH_RULE_TAKE) - roots.insert(r->steps[j].arg1); + roots->insert(r->steps[j].arg1); } } } -void CrushWrapper::find_roots(set& roots) const +void CrushWrapper::find_roots(set *roots) const { for (int i = 0; i < crush->max_buckets; i++) { if (!crush->buckets[i]) continue; crush_bucket *b = crush->buckets[i]; if (!_search_item_exists(b->id)) - roots.insert(b->id); + roots->insert(b->id); } } -void CrushWrapper::find_nonshadow_roots(set& roots) const +void CrushWrapper::find_nonshadow_roots(set *roots) const { for (int i = 0; i < crush->max_buckets; i++) { if (!crush->buckets[i]) @@ -326,7 +326,7 @@ void CrushWrapper::find_nonshadow_roots(set& roots) const const char *name = get_item_name(b->id); if (name && !is_valid_crush_name(name)) continue; - roots.insert(b->id); + roots->insert(b->id); } } @@ -1381,7 +1381,7 @@ bool 
CrushWrapper::class_is_in_use(int class_id) int CrushWrapper::populate_classes() { set roots; - find_roots(roots); + find_roots(&roots); for (auto &r : roots) { if (r >= 0) continue; @@ -1405,7 +1405,7 @@ int CrushWrapper::cleanup_classes() int CrushWrapper::trim_roots_with_class(bool unused) { set roots; - find_roots(roots); + find_roots(&roots); for (auto &r : roots) { if (r >= 0) continue; @@ -1449,7 +1449,7 @@ int32_t CrushWrapper::_alloc_class_id() const { void CrushWrapper::reweight(CephContext *cct) { set roots; - find_roots(roots); + find_roots(&roots); for (set::iterator p = roots.begin(); p != roots.end(); ++p) { if (*p >= 0) continue; @@ -2387,7 +2387,7 @@ namespace { void dump(Formatter *f) { set roots; - crush->find_roots(roots); + crush->find_roots(&roots); for (set::iterator root = roots.begin(); root != roots.end(); ++root) { dump_item(Item(*root, 0, 0, crush->get_bucket_weightf(*root)), f); } diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h index 8b3fd438a30c8..cd272d5e4134a 100644 --- a/src/crush/CrushWrapper.h +++ b/src/crush/CrushWrapper.h @@ -588,14 +588,14 @@ class CrushWrapper { * * Note that these may not be parentless roots. */ - void find_takes(set& roots) const; + void find_takes(set *roots) const; /** * find tree roots * * These are parentless nodes in the map. */ - void find_roots(set& roots) const; + void find_roots(set *roots) const; /** * find tree roots that are not shadow (device class) items @@ -603,7 +603,7 @@ class CrushWrapper { * These are parentless nodes in the map that are not shadow * items for device classes. 
*/ - void find_nonshadow_roots(set& roots) const; + void find_nonshadow_roots(set *roots) const; /** * see if an item is contained within a subtree diff --git a/src/tools/crushtool.cc b/src/tools/crushtool.cc index 2a1bc83b8f86f..c2dbaae84fce2 100644 --- a/src/tools/crushtool.cc +++ b/src/tools/crushtool.cc @@ -829,7 +829,7 @@ int main(int argc, const char **argv) { set roots; - crush.find_roots(roots); + crush.find_roots(&roots); if (roots.size() > 1) dout(1) << "The crush rulesets will use the root " << root << "\n" << "and ignore the others.\n" From 1b8335f64ee92b53afe0379592bcf5021d365a2c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 27 Jul 2017 10:07:31 -0400 Subject: [PATCH 09/19] crush/CrushWrapper: rule_has_take Signed-off-by: Sage Weil --- src/crush/CrushWrapper.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h index cd272d5e4134a..0f858d1678f15 100644 --- a/src/crush/CrushWrapper.h +++ b/src/crush/CrushWrapper.h @@ -965,6 +965,17 @@ class CrushWrapper { return true; return false; } + bool rule_has_take(unsigned ruleno, int take) const { + if (!crush) return false; + crush_rule *rule = get_rule(ruleno); + for (unsigned i = 0; i < rule->len; ++i) { + if (rule->steps[i].op == CRUSH_RULE_TAKE && + rule->steps[i].arg1 == take) { + return true; + } + } + return false; + } int get_rule_len(unsigned ruleno) const { crush_rule *r = get_rule(ruleno); if (IS_ERR(r)) return PTR_ERR(r); From 19ab2a7d7b99f6cbf2d262475740e5bd05bdf07a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 23 Jul 2017 00:10:56 -0400 Subject: [PATCH 10/19] mgr/PyOSDMap: get_crush, find_takes, get_take_weight_osd_map These let us identify distinct CRUSH hierarchies that rules distribute data over, and create relative weight maps for the OSDs they map to. 
Signed-off-by: Sage Weil --- src/mgr/PyOSDMap.cc | 212 ++++++++++++++++++++++++++++++++++- src/pybind/mgr/mgr_module.py | 46 +++++++- 2 files changed, 250 insertions(+), 8 deletions(-) diff --git a/src/mgr/PyOSDMap.cc b/src/mgr/PyOSDMap.cc index 9b1bdc4e3b79d..69bcea42a2ade 100644 --- a/src/mgr/PyOSDMap.cc +++ b/src/mgr/PyOSDMap.cc @@ -6,6 +6,7 @@ #include "osd/OSDMap.h" #include "common/errno.h" #include "common/version.h" +#include "include/stringify.h" #include "PyOSDMap.h" #include "PyFormatter.h" @@ -22,6 +23,12 @@ static PyObject *osdmap_get_epoch(PyObject *self, PyObject *obj) return PyInt_FromLong(osdmap->get_epoch()); } +static PyObject *osdmap_get_crush_version(PyObject *self, PyObject *obj) +{ + OSDMap *osdmap = static_cast(PyCapsule_GetPointer(obj, nullptr)); + return PyInt_FromLong(osdmap->get_crush_version()); +} + static PyObject *osdmap_dump(PyObject *self, PyObject *obj) { OSDMap *osdmap = static_cast(PyCapsule_GetPointer(obj, nullptr)); @@ -35,21 +42,84 @@ static void delete_osdmap_incremental(PyObject *object) { OSDMap::Incremental *inc = static_cast( PyCapsule_GetPointer(object, nullptr)); - derr << __func__ << " " << inc << dendl; + dout(10) << __func__ << " " << inc << dendl; delete inc; } static PyObject *osdmap_new_incremental(PyObject *self, PyObject *obj) { OSDMap *osdmap = static_cast(PyCapsule_GetPointer(obj, nullptr)); - - // Construct a capsule containing an OSDMap. OSDMap::Incremental *inc = new OSDMap::Incremental; inc->fsid = osdmap->get_fsid(); inc->epoch = osdmap->get_epoch() + 1; + // always include latest crush map here... this is okay since we never + // actually use this map in the real world (and even if we did it would + // be a no-op). 
+ osdmap->crush->encode(inc->crush, CEPH_FEATURES_ALL); + dout(10) << __func__ << " " << inc << dendl; return PyCapsule_New(inc, nullptr, &delete_osdmap_incremental); } +static void delete_osdmap(PyObject *object) +{ + OSDMap *osdmap = static_cast(PyCapsule_GetPointer(object, nullptr)); + assert(osdmap); + dout(10) << __func__ << " " << osdmap << dendl; + delete osdmap; +} + +static PyObject *osdmap_apply_incremental(PyObject *self, PyObject *args) +{ + PyObject *mapobj, *incobj; + if (!PyArg_ParseTuple(args, "OO:apply_incremental", + &mapobj, &incobj)) { + return nullptr; + } + OSDMap *osdmap = static_cast(PyCapsule_GetPointer(mapobj, nullptr)); + OSDMap::Incremental *inc = static_cast( + PyCapsule_GetPointer(incobj, nullptr)); + if (!osdmap || !inc) { + return nullptr; + } + + bufferlist bl; + osdmap->encode(bl, CEPH_FEATURES_ALL|CEPH_FEATURE_RESERVED); + OSDMap *next = new OSDMap; + next->decode(bl); + next->apply_incremental(*inc); + dout(10) << __func__ << " map " << osdmap << " inc " << inc + << " next " << next << dendl; + return PyCapsule_New(next, nullptr, &delete_osdmap); +} + +static PyObject *osdmap_get_crush(PyObject *self, PyObject *obj) +{ + OSDMap *osdmap = static_cast(PyCapsule_GetPointer(obj, nullptr)); + + // Construct a capsule containing a the CrushWrapper. 
+ return PyCapsule_New(osdmap->crush.get(), nullptr, nullptr); +} + +static PyObject *osdmap_get_pools_by_take(PyObject *self, PyObject *args) +{ + PyObject *mapobj; + int take; + if (!PyArg_ParseTuple(args, "Oi:get_pools_by_take", + &mapobj, &take)) { + return nullptr; + } + OSDMap *osdmap = static_cast(PyCapsule_GetPointer(mapobj, nullptr)); + PyFormatter f; + f.open_array_section("pools"); + for (auto& p : osdmap->get_pools()) { + if (osdmap->crush->rule_has_take(p.second.crush_rule, take)) { + f.dump_int("pool", p.first); + } + } + f.close_section(); + return f.get(); +} + static PyObject *osdmap_calc_pg_upmaps(PyObject *self, PyObject *args) { PyObject *mapobj, *incobj, *pool_list; @@ -82,9 +152,15 @@ static PyObject *osdmap_calc_pg_upmaps(PyObject *self, PyObject *args) PyMethodDef OSDMapMethods[] = { {"get_epoch", osdmap_get_epoch, METH_O, "Get OSDMap epoch"}, + {"get_crush_version", osdmap_get_crush_version, METH_O, "Get CRUSH version"}, {"dump", osdmap_dump, METH_O, "Dump OSDMap::Incremental"}, {"new_incremental", osdmap_new_incremental, METH_O, "Create OSDMap::Incremental"}, + {"apply_incremental", osdmap_apply_incremental, METH_VARARGS, + "Apply OSDMap::Incremental and return the resulting OSDMap"}, + {"get_crush", osdmap_get_crush, METH_O, "Get CrushWrapper"}, + {"get_pools_by_take", osdmap_get_pools_by_take, METH_VARARGS, + "Get pools that have CRUSH rules that TAKE the given root"}, {"calc_pg_upmaps", osdmap_calc_pg_upmaps, METH_VARARGS, "Calculate new pg-upmap values"}, {NULL, NULL, 0, NULL} @@ -108,18 +184,146 @@ static PyObject *osdmap_inc_dump(PyObject *self, PyObject *obj) return f.get(); } +static int get_int_float_map(PyObject *obj, map *out) +{ + PyObject *ls = PyDict_Items(obj); + for (int j = 0; j < PyList_Size(ls); ++j) { + PyObject *pair = PyList_GET_ITEM(ls, j); + if (!PyTuple_Check(pair)) { + derr << __func__ << " item " << j << " not a tuple" << dendl; + return -1; + } + int k; + double v; + if (!PyArg_ParseTuple(pair, "id:pair", &k, 
&v)) { + derr << __func__ << " item " << j << " not a size 2 tuple" << dendl; + return -1; + } + (*out)[k] = v; + } + return 0; +} + +static PyObject *osdmap_inc_set_osd_reweights(PyObject *self, PyObject *args) +{ + PyObject *incobj, *weightobj; + if (!PyArg_ParseTuple(args, "OO:set_osd_reweights", + &incobj, &weightobj)) { + return nullptr; + } + OSDMap::Incremental *inc = static_cast( + PyCapsule_GetPointer(incobj, nullptr)); + map wm; + if (get_int_float_map(weightobj, &wm) < 0) { + return nullptr; + } + + for (auto i : wm) { + inc->new_weight[i.first] = std::max(0.0, std::min(1.0, i.second)) * 0x10000; + } + Py_RETURN_NONE; +} + +static PyObject *osdmap_inc_set_compat_weight_set_weights( + PyObject *self, PyObject *args) +{ + PyObject *incobj, *weightobj; + if (!PyArg_ParseTuple(args, "OO:set_compat_weight_set_weights", + &incobj, &weightobj)) { + return nullptr; + } + OSDMap::Incremental *inc = static_cast( + PyCapsule_GetPointer(incobj, nullptr)); + map wm; + if (get_int_float_map(weightobj, &wm) < 0) { + return nullptr; + } + + CrushWrapper crush; + assert(inc->crush.length()); // see new_incremental + auto p = inc->crush.begin(); + ::decode(crush, p); + crush.create_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS, 1); + for (auto i : wm) { + crush.choose_args_adjust_item_weightf( + g_ceph_context, + crush.choose_args_get(CrushWrapper::DEFAULT_CHOOSE_ARGS), + i.first, + { i.second }, + nullptr); + } + inc->crush.clear(); + crush.encode(inc->crush, CEPH_FEATURES_ALL); + Py_RETURN_NONE; +} + + + PyMethodDef OSDMapIncrementalMethods[] = { {"get_epoch", osdmap_inc_get_epoch, METH_O, "Get OSDMap::Incremental epoch"}, {"dump", osdmap_inc_dump, METH_O, "Dump OSDMap::Incremental"}, + {"set_osd_reweights", osdmap_inc_set_osd_reweights, METH_VARARGS, + "Set osd reweight values"}, + {"set_crush_compat_weight_set_weights", + osdmap_inc_set_compat_weight_set_weights, METH_VARARGS, + "Set weight values in the pending CRUSH compat weight-set"}, {NULL, NULL, 0, NULL} }; // 
---------- +static PyObject *crush_dump(PyObject *self, PyObject *obj) +{ + CrushWrapper *crush = static_cast( + PyCapsule_GetPointer(obj, nullptr)); + PyFormatter f; + crush->dump(&f); + return f.get(); +} + +static PyObject *crush_find_takes(PyObject *self, PyObject *obj) +{ + CrushWrapper *crush = static_cast( + PyCapsule_GetPointer(obj, nullptr)); + set takes; + crush->find_takes(&takes); + PyFormatter f; + f.open_array_section("takes"); + for (auto root : takes) { + f.dump_int("root", root); + } + f.close_section(); + return f.get(); +} + +static PyObject *crush_get_take_weight_osd_map(PyObject *self, PyObject *args) +{ + PyObject *obj; + int root; + if (!PyArg_ParseTuple(args, "Oi:get_take_weight_osd_map", + &obj, &root)) { + return nullptr; + } + CrushWrapper *crush = static_cast( + PyCapsule_GetPointer(obj, nullptr)); + map wmap; + crush->get_take_weight_osd_map(root, &wmap); + PyFormatter f; + f.open_object_section("weights"); + for (auto& p : wmap) { + string n = stringify(p.first); // ick + f.dump_float(n.c_str(), p.second); + } + f.close_section(); + return f.get(); +} PyMethodDef CRUSHMapMethods[] = { -// {"get_epoch", osdmap_get_epoch, METH_O, "Get OSDMap epoch"}, + {"dump", crush_dump, METH_O, "Dump map"}, + {"find_takes", crush_find_takes, METH_O, "Find distinct TAKE roots"}, + {"get_take_weight_osd_map", crush_get_take_weight_osd_map, METH_VARARGS, + "Get OSD weight map for a given TAKE root node"}, {NULL, NULL, 0, NULL} }; diff --git a/src/pybind/mgr/mgr_module.py b/src/pybind/mgr/mgr_module.py index 346bc6e26dd0f..5dbb6c1f01dea 100644 --- a/src/pybind/mgr/mgr_module.py +++ b/src/pybind/mgr/mgr_module.py @@ -40,13 +40,26 @@ def __init__(self, handle): def get_epoch(self): return ceph_osdmap.get_epoch(self._handle) + def get_crush_version(self): + return ceph_osdmap.get_crush_version(self._handle) + def dump(self): return ceph_osdmap.dump(self._handle) def new_incremental(self): return OSDMapIncremental(ceph_osdmap.new_incremental(self._handle)) - 
def calc_pg_upmaps(self, inc, max_deviation=.01, max_iterations=10, pools=[]): + def apply_incremental(self, inc): + return OSDMap(ceph_osdmap.apply_incremental(self._handle, inc._handle)) + + def get_crush(self): + return CRUSHMap(ceph_osdmap.get_crush(self._handle), self) + + def get_pools_by_take(self, take): + return ceph_osdmap.get_pools_by_take(self._handle, take).get('pools', []) + + def calc_pg_upmaps(self, inc, + max_deviation=.01, max_iterations=10, pools=[]): return ceph_osdmap.calc_pg_upmaps( self._handle, inc._handle, @@ -63,12 +76,37 @@ def get_epoch(self): def dump(self): return ceph_osdmap_incremental.dump(self._handle) + def set_osd_reweights(self, weightmap): + """ + weightmap is a dict, int to float. e.g. { 0: .9, 1: 1.0, 3: .997 } + """ + return ceph_osdmap_incremental.set_osd_reweights(self._handle, weightmap) + + def set_crush_compat_weight_set_weights(self, weightmap): + """ + weightmap is a dict, int to float. devices only. e.g., + { 0: 3.4, 1: 3.3, 2: 3.334 } + """ + return ceph_osdmap_incremental.set_crush_compat_weight_set_weights( + self._handle, weightmap) + + + class CRUSHMap(object): - def __init__(self, handle): + def __init__(self, handle, parent_osdmap): self._handle = handle + # keep ref to parent osdmap since handle lifecycle is owned by it + self._parent_osdmap = parent_osdmap + + def dump(self): + return ceph_crushmap.dump(self._handle) + + def find_takes(self): + return ceph_crushmap.find_takes(self._handle).get('takes',[]) -# def get_epoch(self): -# return ceph_crushmap.get_epoch(self._handle) + def get_take_weight_osd_map(self, root): + uglymap = ceph_crushmap.get_take_weight_osd_map(self._handle, root) + return { int(k): v for k, v in uglymap.get('weights', {}).iteritems() } class MgrModule(object): From 2905b7b69a80dfbd9855c5afc2906ea0accd5c7c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 27 Jul 2017 23:33:06 -0400 Subject: [PATCH 11/19] mgr/PyOSDMap: OSDMap.map_pool_pgs_up, CRUSHMap.get_item_name Signed-off-by: Sage 
Weil --- src/mgr/PyOSDMap.cc | 50 ++++++++++++++++++++++++++++++++++++ src/pybind/mgr/mgr_module.py | 5 ++++ 2 files changed, 55 insertions(+) diff --git a/src/mgr/PyOSDMap.cc b/src/mgr/PyOSDMap.cc index 69bcea42a2ade..e630cadef3779 100644 --- a/src/mgr/PyOSDMap.cc +++ b/src/mgr/PyOSDMap.cc @@ -150,6 +150,37 @@ static PyObject *osdmap_calc_pg_upmaps(PyObject *self, PyObject *args) return PyInt_FromLong(r); } +static PyObject *osdmap_map_pool_pgs_up(PyObject *self, PyObject *args) +{ + PyObject *mapobj; + int poolid; + if (!PyArg_ParseTuple(args, "Oi:map_pool_pgs_up", + &mapobj, &poolid)) { + return nullptr; + } + OSDMap *osdmap = static_cast(PyCapsule_GetPointer(mapobj, nullptr)); + if (!osdmap) + return nullptr; + auto pi = osdmap->get_pg_pool(poolid); + if (!pi) + return nullptr; + map> pm; + for (unsigned ps = 0; ps < pi->get_pg_num(); ++ps) { + pg_t pgid(ps, poolid); + osdmap->pg_to_up_acting_osds(pgid, &pm[pgid], nullptr, nullptr, nullptr); + } + PyFormatter f; + for (auto p : pm) { + string pg = stringify(p.first); + f.open_array_section(pg.c_str()); + for (auto o : p.second) { + f.dump_int("osd", o); + } + f.close_section(); + } + return f.get(); +} + PyMethodDef OSDMapMethods[] = { {"get_epoch", osdmap_get_epoch, METH_O, "Get OSDMap epoch"}, {"get_crush_version", osdmap_get_crush_version, METH_O, "Get CRUSH version"}, @@ -163,6 +194,8 @@ PyMethodDef OSDMapMethods[] = { "Get pools that have CRUSH rules that TAKE the given root"}, {"calc_pg_upmaps", osdmap_calc_pg_upmaps, METH_VARARGS, "Calculate new pg-upmap values"}, + {"map_pool_pgs_up", osdmap_map_pool_pgs_up, METH_VARARGS, + "Calculate up set mappings for all PGs in a pool"}, {NULL, NULL, 0, NULL} }; @@ -282,6 +315,22 @@ static PyObject *crush_dump(PyObject *self, PyObject *obj) return f.get(); } +static PyObject *crush_get_item_name(PyObject *self, PyObject *args) +{ + PyObject *obj; + int item; + if (!PyArg_ParseTuple(args, "Oi:get_item_name", + &obj, &item)) { + return nullptr; + } + CrushWrapper 
*crush = static_cast( + PyCapsule_GetPointer(obj, nullptr)); + if (!crush->item_exists(item)) { + Py_RETURN_NONE; + } + return PyString_FromString(crush->get_item_name(item)); +} + static PyObject *crush_find_takes(PyObject *self, PyObject *obj) { CrushWrapper *crush = static_cast( @@ -322,6 +371,7 @@ static PyObject *crush_get_take_weight_osd_map(PyObject *self, PyObject *args) PyMethodDef CRUSHMapMethods[] = { {"dump", crush_dump, METH_O, "Dump map"}, + {"get_item_name", crush_get_item_name, METH_VARARGS, "Get item name"}, {"find_takes", crush_find_takes, METH_O, "Find distinct TAKE roots"}, {"get_take_weight_osd_map", crush_get_take_weight_osd_map, METH_VARARGS, "Get OSD weight map for a given TAKE root node"}, diff --git a/src/pybind/mgr/mgr_module.py b/src/pybind/mgr/mgr_module.py index 5dbb6c1f01dea..4e3554e1f26d3 100644 --- a/src/pybind/mgr/mgr_module.py +++ b/src/pybind/mgr/mgr_module.py @@ -65,6 +65,8 @@ def calc_pg_upmaps(self, inc, inc._handle, max_deviation, max_iterations, pools) + def map_pool_pgs_up(self, poolid): + return ceph_osdmap.map_pool_pgs_up(self._handle, poolid) class OSDMapIncremental(object): def __init__(self, handle): @@ -101,6 +103,9 @@ def __init__(self, handle, parent_osdmap): def dump(self): return ceph_crushmap.dump(self._handle) + def get_item_name(self, item): + return ceph_crushmap.get_item_name(self._handle, item) + def find_takes(self): return ceph_crushmap.find_takes(self._handle).get('takes',[]) From eb193894f59ce406a881e35a26f8d72197c7add5 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Aug 2017 16:23:08 -0400 Subject: [PATCH 12/19] pybind/mgr/balancer: rough framework Signed-off-by: Sage Weil --- src/pybind/mgr/balancer/module.py | 757 ++++++++++++++++++++++++++++-- src/vstart.sh | 2 +- 2 files changed, 709 insertions(+), 50 deletions(-) diff --git a/src/pybind/mgr/balancer/module.py b/src/pybind/mgr/balancer/module.py index c95f88de0591d..4c270b3280ade 100644 --- a/src/pybind/mgr/balancer/module.py +++ 
b/src/pybind/mgr/balancer/module.py @@ -3,27 +3,268 @@ Balance PG distribution across OSDs. """ +import errno import json +import math import random import time from mgr_module import MgrModule, CommandResult from threading import Event # available modes: 'none', 'crush', 'crush-compat', 'upmap', 'osd_weight' -default_mode = 'upmap' -default_sleep_interval = 30 # seconds +default_mode = 'none' +default_sleep_interval = 60 # seconds default_max_misplaced = .03 # max ratio of pgs replaced at a time +TIME_FORMAT = '%Y-%m-%d %H:%M:%S %Z' + + +class MappingState: + def __init__(self, osdmap, pg_dump, desc=''): + self.desc = desc + self.osdmap = osdmap + self.osdmap_dump = self.osdmap.dump() + self.crush = osdmap.get_crush() + self.crush_dump = self.crush.dump() + self.pg_dump = pg_dump + self.pg_stat = { + i['pgid']: i['stat_sum'] for i in pg_dump.get('pg_stats', []) + } + + +class Plan: + def __init__(self, name, ms): + self.mode = 'unknown' + self.name = name + self.initial = ms + + self.osd_weights = {} + self.compat_ws = {} + self.inc = ms.osdmap.new_incremental() + + def final_state(self): + self.inc.set_osd_reweights(self.osd_weights) + self.inc.set_crush_compat_weight_set_weights(self.compat_ws) + return MappingState(self.initial.osdmap.apply_incremental(self.inc), + self.initial.pg_dump, + 'plan %s final' % self.name) + + def dump(self): + return json.dumps(self.inc.dump(), indent=4) + + def show(self): + ls = [] + ls.append('# starting osdmap epoch %d' % self.initial.osdmap.get_epoch()) + ls.append('# starting crush version %d' % + self.initial.osdmap.get_crush_version()) + ls.append('# mode %s' % self.mode) + if len(self.compat_ws) and \ + '-1' in self.crush_dump.get('choose_args', {}): + ls.append('ceph osd crush weight-set create-compat') + for osd, weight in self.compat_ws.iteritems(): + ls.append('ceph osd crush weight-set reweight-compat %s %f' % + (osd, weight)) + for osd, weight in self.osd_weights.iteritems(): + ls.append('ceph osd reweight osd.%d %f' 
% (osd, weight)) + incdump = self.inc.dump() + for pgid in incdump.get('old_pg_upmap_items', []): + ls.append('ceph osd rm-pg-upmap-items %s' % pgid) + for item in incdump.get('new_pg_upmap_items', []): + osdlist = [] + for m in item['mappings']: + osdlist += [m['from'], m['to']] + ls.append('ceph osd pg-upmap-items %s %s' % + (item['pgid'], ' '.join([str(a) for a in osdlist]))) + return '\n'.join(ls) + + +class Eval: + pool_name = {} # pool id -> pool name + pool_id = {} # pool name -> id + pool_roots = {} # pool name -> root name + target_by_root = {} # root name -> target weight map + count_by_pool = {} + count_by_root = {} + actual_by_pool = {} # pool -> by_* -> actual weight map + actual_by_root = {} # pool -> by_* -> actual weight map + total_by_pool = {} # pool -> by_* -> total + total_by_root = {} # root -> by_* -> total + stats_by_pool = {} # pool -> by_* -> stddev or avg -> value + stats_by_root = {} # root -> by_* -> stddev or avg -> value + + score_by_pool = {} + score_by_root = {} + + score = 0.0 + + def __init__(self, ms): + self.ms = ms + + def show(self): + r = self.ms.desc + '\n' + r += 'target_by_root %s\n' % self.target_by_root + r += 'actual_by_pool %s\n' % self.actual_by_pool + r += 'actual_by_root %s\n' % self.actual_by_root + r += 'count_by_pool %s\n' % self.count_by_pool + r += 'count_by_root %s\n' % self.count_by_root + r += 'total_by_pool %s\n' % self.total_by_pool + r += 'total_by_root %s\n' % self.total_by_root + r += 'stats_by_root %s\n' % self.stats_by_root + r += 'score_by_pool %s\n' % self.score_by_pool + r += 'score_by_root %s\n' % self.score_by_root + r += 'score %f (lower is better)\n' % self.score + return r + + def calc_stats(self, count, target, total): + num = max(len(target), 1) + r = {} + for t in ('pgs', 'objects', 'bytes'): + avg = float(total[t]) / float(num) + dev = 0.0 + for k, v in count[t].iteritems(): + # adjust/normalize by weight + adjusted = float(v) / target[k] / float(num) + dev += (avg - adjusted) * (avg - 
adjusted) + stddev = math.sqrt(dev / float(max(num - 1, 1))) + r[t] = { + 'avg': avg, + 'stddev': stddev, + } + return r + class Module(MgrModule): + COMMANDS = [ + { + "cmd": "balancer status", + "desc": "Show balancer status", + "perm": "r", + }, + { + "cmd": "balancer mode name=mode,type=CephChoices,strings=none|crush-compat|upmap", + "desc": "Set balancer mode", + "perm": "rw", + }, + { + "cmd": "balancer on", + "desc": "Enable automatic balancing", + "perm": "rw", + }, + { + "cmd": "balancer off", + "desc": "Disable automatic balancing", + "perm": "rw", + }, + { + "cmd": "balancer eval name=plan,type=CephString,req=false", + "desc": "Evaluate data distribution for the current cluster or specific plan", + "perm": "r", + }, + { + "cmd": "balancer optimize name=plan,type=CephString", + "desc": "Run optimizer to create a new plan", + "perm": "rw", + }, + { + "cmd": "balancer show name=plan,type=CephString", + "desc": "Show details of an optimization plan", + "perm": "r", + }, + { + "cmd": "balancer rm name=plan,type=CephString", + "desc": "Discard an optimization plan", + "perm": "rw", + }, + { + "cmd": "balancer reset", + "desc": "Discard all optimization plans", + "perm": "rw", + }, + { + "cmd": "balancer dump name=plan,type=CephString", + "desc": "Show an optimization plan", + "perm": "r", + }, + { + "cmd": "balancer execute name=plan,type=CephString", + "desc": "Execute an optimization plan", + "perm": "r", + }, + ] + active = False run = True + plans = {} + mode = '' def __init__(self, *args, **kwargs): super(Module, self).__init__(*args, **kwargs) self.event = Event() def handle_command(self, command): - return (-errno.EINVAL, '', - "Command not found '{0}'".format(command['prefix'])) + self.log.warn("Handling command: '%s'" % str(command)) + if command['prefix'] == 'balancer status': + s = { + 'plans': self.plans.keys(), + 'active': self.active, + 'mode': self.get_config('mode', default_mode), + } + return (0, json.dumps(s, indent=4), '') + elif 
command['prefix'] == 'balancer mode': + self.set_config('mode', command['mode']) + return (0, '', '') + elif command['prefix'] == 'balancer on': + if not self.active: + self.set_config('active', '1') + self.active = True + self.event.set() + return (0, '', '') + elif command['prefix'] == 'balancer off': + if self.active: + self.set_config('active', '') + self.active = False + self.event.set() + return (0, '', '') + elif command['prefix'] == 'balancer eval': + if 'plan' in command: + plan = self.plans.get(command['plan']) + if not plan: + return (-errno.ENOENT, '', 'plan %s not found' % + command['plan']) + ms = plan.final_state() + else: + ms = MappingState(self.get_osdmap(), + self.get("pg_dump"), + 'current cluster') + return (0, self.evaluate(ms), '') + elif command['prefix'] == 'balancer optimize': + plan = self.plan_create(command['plan']) + self.optimize(plan) + return (0, '', '') + elif command['prefix'] == 'balancer rm': + self.plan_rm(command['name']) + return (0, '', '') + elif command['prefix'] == 'balancer reset': + self.plans = {} + return (0, '', '') + elif command['prefix'] == 'balancer dump': + plan = self.plans.get(command['plan']) + if not plan: + return (-errno.ENOENT, '', 'plan %s not found' % command['plan']) + return (0, plan.dump(), '') + elif command['prefix'] == 'balancer show': + plan = self.plans.get(command['plan']) + if not plan: + return (-errno.ENOENT, '', 'plan %s not found' % command['plan']) + return (0, plan.show(), '') + elif command['prefix'] == 'balancer execute': + plan = self.plans.get(command['plan']) + if not plan: + return (-errno.ENOENT, '', 'plan %s not found' % command['plan']) + self.execute(plan) + self.plan_rm(plan) + return (0, '', '') + else: + return (-errno.EINVAL, '', + "Command not found '{0}'".format(command['prefix'])) def shutdown(self): self.log.info('Stopping') @@ -33,41 +274,252 @@ def shutdown(self): def serve(self): self.log.info('Starting') while self.run: - mode = self.get_config('mode', default_mode) 
+ self.log.debug('Waking up') + self.active = self.get_config('active', '') is not '' sleep_interval = float(self.get_config('sleep_interval', default_sleep_interval)) - max_misplaced = float(self.get_config('max_misplaced', - default_max_misplaced)) - self.log.info('Mode %s, sleep interval %f, max misplaced %f' % - (mode, sleep_interval, max_misplaced)) - - info = self.get('pg_status') - if info.get('unknown_ratio', 0.0) + info.get('degraded_ratio', 0.0) + info.get('inactive_ratio', 0.0) > 0.0: - self.log.info('Some PGs are unknown, degraded, or inactive; waiting') - elif info.get('misplaced_ratio', 0.0) >= max_misplaced: - self.log.info('Too many PGs (%f > max %f) are misplaced; waiting' % (info.get('misplaced_ratio', 0.0), max_misplaced)) - else: - if mode == 'upmap': - self.do_upmap() - elif mode == 'crush': - self.do_crush(compat=False) - elif mode == 'crush-compat': - self.do_crush(compat=True) - elif mode == 'osd_weight': - self.osd_weight() - elif mode == 'none': - self.log.info('Idle') - else: - self.log.info('Unrecognized mode %s' % mode) + if self.active: + self.log.debug('Running') + plan = self.plan_create('auto-foo') + self.optimize(plan) + #self.plan_apply(plan) + self.log.debug('Sleeping for %d', sleep_interval) self.event.wait(sleep_interval) + self.event.clear() + + def plan_create(self, name): + plan = Plan(name, MappingState(self.get_osdmap(), + self.get("pg_dump"), + 'plan %s initial' % name)) + self.plans[name] = plan + return plan + + def plan_rm(self, name): + if name in self.plans: + del self.plans[name] + + def calc_eval(self, ms): + pe = Eval(ms) + pool_rule = {} + for p in ms.osdmap_dump.get('pools',[]): + pe.pool_name[p['pool']] = p['pool_name'] + pe.pool_id[p['pool_name']] = p['pool'] + pool_rule[p['pool_name']] = p['crush_rule'] + pe.pool_roots[p['pool_name']] = [] + pools = pe.pool_id.keys() + if len(pools) == 0: + return pe + self.log.debug('pool_name %s' % pe.pool_name) + self.log.debug('pool_id %s' % pe.pool_id) + 
self.log.debug('pools %s' % pools) + self.log.debug('pool_rule %s' % pool_rule) + + # get expected distributions by root + actual_by_root = {} + roots = ms.crush.find_takes() + for root in roots: + rname = ms.crush.get_item_name(root) + ls = ms.osdmap.get_pools_by_take(root) + for poolid in ls: + pe.pool_roots[pe.pool_name[poolid]].append(rname) + pe.target_by_root[rname] = ms.crush.get_take_weight_osd_map(root) + actual_by_root[rname] = { + 'pgs': {}, + 'objects': {}, + 'bytes': {}, + } + for osd in pe.target_by_root[rname].iterkeys(): + actual_by_root[rname]['pgs'][osd] = 0 + actual_by_root[rname]['objects'][osd] = 0 + actual_by_root[rname]['bytes'][osd] = 0 + pe.total_by_root[rname] = { + 'pgs': 0, + 'objects': 0, + 'bytes': 0, + } + self.log.debug('pool_roots %s' % pe.pool_roots) + self.log.debug('target_by_root %s' % pe.target_by_root) + + # pool and root actual + for pool in pools: + pi = [p for p in ms.osdmap_dump.get('pools',[]) + if p['pool_name'] == pool][0] + poolid = pi['pool'] + pm = ms.osdmap.map_pool_pgs_up(poolid) + pgs = 0 + objects = 0 + bytes = 0 + pgs_by_osd = {} + objects_by_osd = {} + bytes_by_osd = {} + for root in pe.pool_roots[pool]: + for osd in pe.target_by_root[root].iterkeys(): + pgs_by_osd[osd] = 0 + objects_by_osd[osd] = 0 + bytes_by_osd[osd] = 0 + for pgid, up in pm.iteritems(): + for osd in [int(osd) for osd in up]: + pgs_by_osd[osd] += 1 + objects_by_osd[osd] += ms.pg_stat[pgid]['num_objects'] + bytes_by_osd[osd] += ms.pg_stat[pgid]['num_bytes'] + # pick a root to associate this pg instance with. + # note that this is imprecise if the roots have + # overlapping children. + # FIXME: divide bytes by k for EC pools. 
+ for root in pe.pool_roots[pool]: + if osd in pe.target_by_root[root]: + actual_by_root[root]['pgs'][osd] += 1 + actual_by_root[root]['objects'][osd] += ms.pg_stat[pgid]['num_objects'] + actual_by_root[root]['bytes'][osd] += ms.pg_stat[pgid]['num_bytes'] + pgs += 1 + objects += ms.pg_stat[pgid]['num_objects'] + bytes += ms.pg_stat[pgid]['num_bytes'] + pe.total_by_root[root]['pgs'] += 1 + pe.total_by_root[root]['objects'] += ms.pg_stat[pgid]['num_objects'] + pe.total_by_root[root]['bytes'] += ms.pg_stat[pgid]['num_bytes'] + break + pe.count_by_pool[pool] = { + 'pgs': { + k: v + for k, v in pgs_by_osd.iteritems() + }, + 'objects': { + k: v + for k, v in objects_by_osd.iteritems() + }, + 'bytes': { + k: v + for k, v in bytes_by_osd.iteritems() + }, + } + pe.actual_by_pool[pool] = { + 'pgs': { + k: float(v) / float(max(pgs, 1)) + for k, v in pgs_by_osd.iteritems() + }, + 'objects': { + k: float(v) / float(max(objects, 1)) + for k, v in objects_by_osd.iteritems() + }, + 'bytes': { + k: float(v) / float(max(bytes, 1)) + for k, v in bytes_by_osd.iteritems() + }, + } + pe.total_by_pool[pool] = { + 'pgs': pgs, + 'objects': objects, + 'bytes': bytes, + } + for root, m in pe.total_by_root.iteritems(): + pe.count_by_root[root] = { + 'pgs': { + k: float(v) + for k, v in actual_by_root[root]['pgs'].iteritems() + }, + 'objects': { + k: float(v) + for k, v in actual_by_root[root]['objects'].iteritems() + }, + 'bytes': { + k: float(v) + for k, v in actual_by_root[root]['bytes'].iteritems() + }, + } + pe.actual_by_root[root] = { + 'pgs': { + k: float(v) / float(max(pe.total_by_root[root]['pgs'], 1)) + for k, v in actual_by_root[root]['pgs'].iteritems() + }, + 'objects': { + k: float(v) / float(max(pe.total_by_root[root]['objects'], 1)) + for k, v in actual_by_root[root]['objects'].iteritems() + }, + 'bytes': { + k: float(v) / float(max(pe.total_by_root[root]['bytes'], 1)) + for k, v in actual_by_root[root]['bytes'].iteritems() + }, + } + self.log.debug('actual_by_pool %s' % 
pe.actual_by_pool) + self.log.debug('actual_by_root %s' % pe.actual_by_root) + + # average and stddev + pe.stats_by_root = { + a: pe.calc_stats( + b, + pe.target_by_root[a], + pe.total_by_root[a] + ) for a, b in pe.count_by_root.iteritems() + } + + # aggregate score (normalize the stddev by count) + pe.score_by_root = { + r: { + 'pgs': pe.stats_by_root[r]['pgs']['stddev'] / max(1, pe.total_by_root[r]['pgs']), + 'objects': pe.stats_by_root[r]['objects']['stddev'] / max(1, pe.total_by_root[r]['objects']), + 'bytes': pe.stats_by_root[r]['bytes']['stddev'] / max(1, pe.total_by_root[r]['bytes']), + } for r in pe.total_by_root.keys() + } + + # total score is just average of normalized stddevs + pe.score = 0.0 + for r, vs in pe.score_by_root.iteritems(): + for k, v in vs.iteritems(): + pe.score += v + pe.score /= 3 * len(roots) + return pe - def do_upmap(self): + def evaluate(self, ms): + pe = self.calc_eval(ms) + return pe.show() + + def optimize(self, plan): + self.log.info('Optimize plan %s' % plan.name) + plan.mode = self.get_config('mode', default_mode) + max_misplaced = float(self.get_config('max_misplaced', + default_max_misplaced)) + self.log.info('Mode %s, max misplaced %f' % + (plan.mode, max_misplaced)) + + info = self.get('pg_status') + unknown = info.get('unknown_pgs_ratio', 0.0) + degraded = info.get('degraded_ratio', 0.0) + inactive = info.get('inactive_pgs_ratio', 0.0) + misplaced = info.get('misplaced_ratio', 0.0) + self.log.debug('unknown %f degraded %f inactive %f misplaced %g', + unknown, degraded, inactive, misplaced) + if unknown > 0.0: + self.log.info('Some PGs (%f) are unknown; waiting', unknown) + elif degraded > 0.0: + self.log.info('Some PGs (%f) are degraded; waiting', degraded) + elif inactive > 0.0: + self.log.info('Some PGs (%f) are inactive; waiting', inactive) + elif misplaced >= max_misplaced: + self.log.info('Too many PGs (%f > %f) are misplaced; waiting', + misplaced, max_misplaced) + else: + if plan.mode == 'upmap': + 
self.do_upmap(plan) + elif plan.mode == 'crush': + self.do_crush() + elif plan.mode == 'crush-compat': + self.do_crush_compat() + elif plan.mode == 'osd_weight': + self.osd_weight() + elif plan.mode == 'none': + self.log.info('Idle') + else: + self.log.info('Unrecognized mode %s' % plan.mode) + + ## + + def do_upmap(self, plan): self.log.info('do_upmap') max_iterations = self.get_config('upmap_max_iterations', 10) max_deviation = self.get_config('upmap_max_deviation', .01) - osdmap_dump = self.get('osd_map') - pools = [str(i['pool_name']) for i in osdmap_dump.get('pools',[])] + ms = plan.initial + pools = [str(i['pool_name']) for i in ms.osdmap_dump.get('pools',[])] if len(pools) == 0: self.log.info('no pools, nothing to do') return @@ -75,21 +527,234 @@ def do_upmap(self): random.shuffle(pools) self.log.info('pools %s' % pools) - osdmap = self.get_osdmap() - inc = osdmap.new_incremental() + inc = plan.inc total_did = 0 left = max_iterations for pool in pools: - did = osdmap.calc_pg_upmaps(inc, max_deviation, left, [pool]) + did = ms.osdmap.calc_pg_upmaps(inc, max_deviation, left, [pool]) total_did += did left -= did if left <= 0: break self.log.info('prepared %d/%d changes' % (total_did, max_iterations)) - incdump = inc.dump() - self.log.debug('resulting inc is %s' % incdump) + def do_crush_compat(self): + self.log.info('do_crush_compat') + osdmap = self.get_osdmap() + crush = osdmap.get_crush() + + weight_set = self.get_compat_weight_set_weights() + + # get subtree weight maps, check for overlap + roots = crush.find_takes() + self.log.debug('roots %s', roots) + weight_maps = {} + visited = {} + overlap = {} + for root in roots: + weight_maps[root] = crush.get_take_weight_osd_map(root) + self.log.debug(' map for %d: %s' % (root, weight_maps[root])) + for osd in weight_maps[root].iterkeys(): + if osd in visited: + overlap[osd] = 1 + visited[osd] = 1 + if len(overlap) > 0: + self.log.err('error: some osds belong to multiple subtrees: %s' % + overlap) + return + + # 
select a cost mode: pg, pgsize, or utilization + # FIXME: when we add utilization support, we need to throttle back + # so that we don't run if *any* objects are misplaced. with 'pgs' we + # can look at up mappings, which lets us look ahead a bit. + cost_mode = 'pg' + # go + random.shuffle(roots) + for root in roots: + pools = osdmap.get_pools_by_take(root) + self.log.info('Balancing root %s pools %s' % (root, pools)) + wm = weight_maps[root] + util = self.get_util(cost_mode, pools, wm.keys()) + self.log.info('wm %s util %s' % (wm, util)) + if len(util) == 0: + self.log.info('no utilization information, stopping') + return + target = self.get_target(util) + self.log.info('target %s' % target) + if target == 0: + continue + + queue = sorted(util, key=lambda osd: -abs(target - util[osd])) + self.log.info('queue %s' % queue) + + for osd in queue: + deviation = float(util[osd]) - float(target) + if deviation == 0: + break + self.log.debug('osd.%d deviation %f', osd, deviation) + weight = weight_set[osd] + calc_weight = (float(target) / float(util[osd])) * weight + new_weight = weight * .7 + calc_weight * .3 + self.log.debug('Reweight osd.%d %f -> %f', osd, weight, + new_weight) + self.compat_weight_set_reweight(osd, new_weight) + + def compat_weight_set_reweight(self, osd, new_weight): + self.log.debug('ceph osd crush weight-set reweight-compat') + result = CommandResult('') + self.send_command(result, 'mon', '', json.dumps({ + 'prefix': 'osd crush weight-set reweight-compat', + 'format': 'json', + 'item': 'osd.%d' % osd, + 'weight': [new_weight], + }), '') + r, outb, outs = result.wait() + if r != 0: + self.log.error('Error setting compat weight-set osd.%d to %f' % + (osd, new_weight)) + return + + def get_compat_weight_set_weights(self): + # enable compat weight-set + self.log.debug('ceph osd crush weight-set create-compat') + result = CommandResult('') + self.send_command(result, 'mon', '', json.dumps({ + 'prefix': 'osd crush weight-set create-compat', + 'format': 
'json', + }), '') + r, outb, outs = result.wait() + if r != 0: + self.log.error('Error creating compat weight-set') + return + + result = CommandResult('') + self.send_command(result, 'mon', '', json.dumps({ + 'prefix': 'osd crush dump', + 'format': 'json', + }), '') + r, outb, outs = result.wait() + if r != 0: + self.log.error('Error dumping crush map') + return + try: + crushmap = json.loads(outb) + except: + raise RuntimeError('unable to parse crush map') + + raw = crushmap.get('choose_args',{}).get('-1', []) + weight_set = {} + for b in raw: + bucket = None + for t in crushmap['buckets']: + if t['id'] == b['bucket_id']: + bucket = t + break + if not bucket: + raise RuntimeError('could not find bucket %s' % b['bucket_id']) + self.log.debug('bucket items %s' % bucket['items']) + self.log.debug('weight set %s' % b['weight_set'][0]) + if len(bucket['items']) != len(b['weight_set'][0]): + raise RuntimeError('weight-set size does not match bucket items') + for pos in range(len(bucket['items'])): + weight_set[bucket['items'][pos]['id']] = b['weight_set'][0][pos] + + self.log.debug('weight_set weights %s' % weight_set) + return weight_set + + def get_util(self, cost_mode, pools, osds): + if cost_mode == 'pg' or \ + cost_mode == 'pg_bytes' or \ + cost_mode == 'pg_objects': + util_map = {} + for osd in osds: + util_map[osd] = 0 + dump = self.get('pg_dump') + #self.log.info('dump %s' % dump) + self.log.info('osds %s' % osds) + for pg in dump['pg_stats']: + inpool = False + for pool in pools: + if pg['pgid'].startswith(str(pool) + '.'): + inpool = True + break + if not inpool: + self.log.info('skipping %s' % pg['pgid']) + continue + self.log.info('pg %s osds %s' % (pg['pgid'], pg['up'])) + for osd in [int(a) for a in pg['up']]: + if osd in osds: + if cost_mode == 'pg': + util_map[osd] += 1 + elif cost_mode == 'pg_bytes': + util_map[osd] += pg['stat_sum']['num_bytes'] + elif cost_mode == 'pg_objects': + util_map[osd] += pg['stat_sum']['num_objects'] + return util_map + 
else: + raise RuntimeError('unsupported cost mode %s' % cost_mode) + + def get_target(self, util_map): + total = 0 + count = 0 + for k, v in util_map.iteritems(): + total += v; + count += 1 + return total / count + + def do_crush(self): + self.log.info('do_crush (not yet implemented)') + + def do_osd_weight(self): + self.log.info('do_osd_weight (not yet implemented)') + + def execute(self, plan): + self.log.info('Executing plan %s' % plan.name) + + commands = [] + + # compat weight-set + if len(plan.compat_ws) and \ + '-1' in plan.crush_dump.get('choose_args', {}): + self.log.debug('ceph osd crush weight-set create-compat') + result = CommandResult('') + self.send_command(result, 'mon', '', json.dumps({ + 'prefix': 'osd crush weight-set create-compat', + 'format': 'json', + }), '') + r, outb, outs = result.wait() + if r != 0: + self.log.error('Error creating compat weight-set') + return + + for osd, weight in plan.compat_ws.iteritems(): + self.log.info('ceph osd crush weight-set reweight-compat osd.%d %f', + osd, weight) + result = CommandResult('foo') + self.send_command(result, 'mon', '', json.dumps({ + 'prefix': 'osd crush weight-set reweight-compat', + 'format': 'json', + 'item': 'osd.%d' % osd, + 'weight': [weight], + }), 'foo') + commands.append(result) + + # new_weight + reweightn = {} + for osd, weight in plan.osd_weights.iteritems(): + reweightn[int(osd)] = float(weight) / float(0x10000) + if len(reweightn): + self.log.info('ceph osd reweightn %s', reweightn) + result = CommandResult('foo') + self.send_command(result, 'mon', '', json.dumps({ + 'prefix': 'osd reweightn', + 'format': 'json', + 'weights': json.dumps(reweightn), + }), 'foo') + commands.append(result) + + # upmap + incdump = plan.inc.dump() for pgid in incdump.get('old_pg_upmap_items', []): self.log.info('ceph osd rm-pg-upmap-items %s', pgid) result = CommandResult('foo') @@ -98,10 +763,7 @@ def do_upmap(self): 'format': 'json', 'pgid': pgid, }), 'foo') - r, outb, outs = result.wait() - if r != 
0: - self.log.error('Error removing pg-upmap on %s' % pgid) - break; + commands.append(result) for item in incdump.get('new_pg_upmap_items', []): self.log.info('ceph osd pg-upmap-items %s mappings %s', item['pgid'], @@ -116,14 +778,11 @@ def do_upmap(self): 'pgid': item['pgid'], 'id': osdlist, }), 'foo') + commands.append(result) + + # wait for commands + for result in commands: r, outb, outs = result.wait() if r != 0: - self.log.error('Error setting pg-upmap on %s' % item['pgid']) - break; - - - def do_crush(self, compat): - self.log.info('do_crush compat=%b' % compat) - - def do_osd_weight(self): - self.log.info('do_osd_weight') + self.log.error('Error on command') + return diff --git a/src/vstart.sh b/src/vstart.sh index d1285bd061a7b..27cb6084fd5b5 100755 --- a/src/vstart.sh +++ b/src/vstart.sh @@ -516,7 +516,7 @@ $COSDMEMSTORE $COSDSHORT $extra_conf [mon] - mgr initial modules = restful status dashboard + mgr initial modules = restful status dashboard balancer mon pg warn min per osd = 3 mon osd allow primary affinity = true mon reweight min pgs per osd = 4 From a14d20ff064c90fee0a28a09d3c292a097e6c105 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Aug 2017 23:47:40 -0400 Subject: [PATCH 13/19] ... 
--- src/pybind/mgr/balancer/module.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/pybind/mgr/balancer/module.py b/src/pybind/mgr/balancer/module.py index 4c270b3280ade..8e2d528b1cb7c 100644 --- a/src/pybind/mgr/balancer/module.py +++ b/src/pybind/mgr/balancer/module.py @@ -538,13 +538,17 @@ def do_upmap(self, plan): break self.log.info('prepared %d/%d changes' % (total_did, max_iterations)) - def do_crush_compat(self): + def do_crush_compat(self, plan): self.log.info('do_crush_compat') osdmap = self.get_osdmap() crush = osdmap.get_crush() + # get current compat weight-set weights weight_set = self.get_compat_weight_set_weights() + ms = plan.initial + pe = self.calc_eval(ms) + # get subtree weight maps, check for overlap roots = crush.find_takes() self.log.debug('roots %s', roots) From 930ca8b04ea93a9d8327deec75556e0408df1ab4 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 4 Aug 2017 17:59:20 -0400 Subject: [PATCH 14/19] pybind/mgr/balancer: make 'crush-compat' sort of work Signed-off-by: Sage Weil --- src/pybind/mgr/balancer/module.py | 95 ++++++++++++++----------------- 1 file changed, 43 insertions(+), 52 deletions(-) diff --git a/src/pybind/mgr/balancer/module.py b/src/pybind/mgr/balancer/module.py index 8e2d528b1cb7c..6e880be4b23da 100644 --- a/src/pybind/mgr/balancer/module.py +++ b/src/pybind/mgr/balancer/module.py @@ -59,7 +59,7 @@ def show(self): self.initial.osdmap.get_crush_version()) ls.append('# mode %s' % self.mode) if len(self.compat_ws) and \ - '-1' in self.crush_dump.get('choose_args', {}): + '-1' not in self.initial.crush_dump.get('choose_args', {}): ls.append('ceph osd crush weight-set create-compat') for osd, weight in self.compat_ws.iteritems(): ls.append('ceph osd crush weight-set reweight-compat %s %f' % @@ -82,6 +82,7 @@ class Eval: pool_name = {} # pool id -> pool name pool_id = {} # pool name -> id pool_roots = {} # pool name -> root name + root_pools = {} # root name -> pools target_by_root = {} # root 
name -> target weight map count_by_pool = {} count_by_root = {} @@ -316,28 +317,33 @@ def calc_eval(self, ms): # get expected distributions by root actual_by_root = {} - roots = ms.crush.find_takes() - for root in roots: - rname = ms.crush.get_item_name(root) - ls = ms.osdmap.get_pools_by_take(root) + rootids = ms.crush.find_takes() + roots = [] + for rootid in rootids: + root = ms.crush.get_item_name(rootid) + roots.append(root) + ls = ms.osdmap.get_pools_by_take(rootid) + pe.root_pools[root] = [] for poolid in ls: - pe.pool_roots[pe.pool_name[poolid]].append(rname) - pe.target_by_root[rname] = ms.crush.get_take_weight_osd_map(root) - actual_by_root[rname] = { + pe.pool_roots[pe.pool_name[poolid]].append(root) + pe.root_pools[root].append(pe.pool_name[poolid]) + pe.target_by_root[root] = ms.crush.get_take_weight_osd_map(rootid) + actual_by_root[root] = { 'pgs': {}, 'objects': {}, 'bytes': {}, } - for osd in pe.target_by_root[rname].iterkeys(): - actual_by_root[rname]['pgs'][osd] = 0 - actual_by_root[rname]['objects'][osd] = 0 - actual_by_root[rname]['bytes'][osd] = 0 - pe.total_by_root[rname] = { + for osd in pe.target_by_root[root].iterkeys(): + actual_by_root[root]['pgs'][osd] = 0 + actual_by_root[root]['objects'][osd] = 0 + actual_by_root[root]['bytes'][osd] = 0 + pe.total_by_root[root] = { 'pgs': 0, 'objects': 0, 'bytes': 0, } self.log.debug('pool_roots %s' % pe.pool_roots) + self.log.debug('root_pools %s' % pe.root_pools) self.log.debug('target_by_root %s' % pe.target_by_root) # pool and root actual @@ -500,12 +506,8 @@ def optimize(self, plan): else: if plan.mode == 'upmap': self.do_upmap(plan) - elif plan.mode == 'crush': - self.do_crush() elif plan.mode == 'crush-compat': - self.do_crush_compat() - elif plan.mode == 'osd_weight': - self.osd_weight() + self.do_crush_compat(plan) elif plan.mode == 'none': self.log.info('Idle') else: @@ -544,21 +546,20 @@ def do_crush_compat(self, plan): crush = osdmap.get_crush() # get current compat weight-set weights - 
weight_set = self.get_compat_weight_set_weights() + old_ws = self.get_compat_weight_set_weights() ms = plan.initial pe = self.calc_eval(ms) - # get subtree weight maps, check for overlap - roots = crush.find_takes() + # Make sure roots don't overlap their devices. If so, we + # can't proceed. + roots = pe.target_by_root.keys() self.log.debug('roots %s', roots) - weight_maps = {} visited = {} overlap = {} - for root in roots: - weight_maps[root] = crush.get_take_weight_osd_map(root) - self.log.debug(' map for %d: %s' % (root, weight_maps[root])) - for osd in weight_maps[root].iterkeys(): + root_ids = {} + for root, wm in pe.target_by_root.iteritems(): + for osd in wm.iterkeys(): if osd in visited: overlap[osd] = 1 visited[osd] = 1 @@ -567,42 +568,30 @@ def do_crush_compat(self, plan): overlap) return - # select a cost mode: pg, pgsize, or utilization - # FIXME: when we add utilization support, we need to throttle back - # so that we don't run if *any* objects are misplaced. with 'pgs' we - # can look at up mappings, which lets us look ahead a bit. 
- cost_mode = 'pg' + key = 'pgs' # pgs objects or bytes # go random.shuffle(roots) for root in roots: - pools = osdmap.get_pools_by_take(root) - self.log.info('Balancing root %s pools %s' % (root, pools)) - wm = weight_maps[root] - util = self.get_util(cost_mode, pools, wm.keys()) - self.log.info('wm %s util %s' % (wm, util)) - if len(util) == 0: - self.log.info('no utilization information, stopping') - return - target = self.get_target(util) - self.log.info('target %s' % target) - if target == 0: - continue - - queue = sorted(util, key=lambda osd: -abs(target - util[osd])) - self.log.info('queue %s' % queue) - + pools = pe.root_pools[root] + self.log.info('Balancing root %s (pools %s) by %s' % + (root, pools, key)) + target = pe.target_by_root[root] + actual = pe.actual_by_root[root][key] + queue = sorted(actual.keys(), + key=lambda osd: -abs(target[osd] - actual[osd])) + self.log.debug('queue %s' % queue) for osd in queue: - deviation = float(util[osd]) - float(target) + deviation = target[osd] - actual[osd] if deviation == 0: break self.log.debug('osd.%d deviation %f', osd, deviation) - weight = weight_set[osd] - calc_weight = (float(target) / float(util[osd])) * weight + weight = old_ws[osd] + calc_weight = target[osd] / actual[osd] * weight new_weight = weight * .7 + calc_weight * .3 self.log.debug('Reweight osd.%d %f -> %f', osd, weight, new_weight) - self.compat_weight_set_reweight(osd, new_weight) + plan.compat_ws[osd] = new_weight def compat_weight_set_reweight(self, osd, new_weight): self.log.debug('ceph osd crush weight-set reweight-compat') @@ -719,7 +708,7 @@ def execute(self, plan): # compat weight-set if len(plan.compat_ws) and \ - '-1' in plan.crush_dump.get('choose_args', {}): + '-1' not in plan.initial.crush_dump.get('choose_args', {}): self.log.debug('ceph osd crush weight-set create-compat') result = CommandResult('') self.send_command(result, 'mon', '', json.dumps({ @@ -785,8 +774,10 @@ def execute(self, plan): commands.append(result) # wait 
for commands + self.log.debug('commands %s' % commands) for result in commands: r, outb, outs = result.wait() if r != 0: self.log.error('Error on command') return + self.log.debug('done') From b3f9a723b453c13868873a3be1d7c23b894843be Mon Sep 17 00:00:00 2001 From: Spandan Kumar Sahu Date: Mon, 7 Aug 2017 04:01:57 +0530 Subject: [PATCH 15/19] src/pybind/mgr/balancer/module.py: improve scoring method * score lies in [0, 1), 0 being perfect distribution * use shifted and scaled cdf of normal distribution to prioritize highly over-weighted device. * consider only over-weighted devices to calculate score Signed-off-by: Spandan Kumar Sahu --- src/pybind/mgr/balancer/module.py | 41 +++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/src/pybind/mgr/balancer/module.py b/src/pybind/mgr/balancer/module.py index 6e880be4b23da..5f79a5d8aec82 100644 --- a/src/pybind/mgr/balancer/module.py +++ b/src/pybind/mgr/balancer/module.py @@ -122,14 +122,45 @@ def calc_stats(self, count, target, total): for t in ('pgs', 'objects', 'bytes'): avg = float(total[t]) / float(num) dev = 0.0 + + # score is a measure of how uneven the data distribution is. + # score lies between [0, 1), 0 means perfect distribution. + score = 0.0 + sum_weight = 0.0 + for k, v in count[t].iteritems(): # adjust/normalize by weight adjusted = float(v) / target[k] / float(num) + + # Overweighted devices and their weights are factors to calculate reweight_urgency. + # One 10% underfilled device with 5 2% overfilled devices, is arguably a better + # situation than one 10% overfilled with 5 2% underfilled devices + if adjusted > avg: + ''' + F(x) = 2*phi(x) - 1, where phi(x) = cdf of standard normal distribution + x = (adjusted - avg)/avg. + Since, we're considering only over-weighted devices, x >= 0, and so phi(x) lies in [0.5, 1). + To bring range of F(x) in range [0, 1), we need to make the above modification. 
+ + In general, we need to use a function F(x), where x = (adjusted - avg)/avg + 1. which is bounded between 0 and 1, so that ultimately reweight_urgency will also be bounded. + 2. A larger value of x, should imply more urgency to reweight. + 3. Also, the difference between F(x) when x is large, should be minimal. + 4. The value of F(x) should get close to 1 (highest urgency to reweight) with steeply. + + Could have used F(x) = (1 - e^(-x)). But that had slower convergence to 1, compared to the one currently in use. + + cdf of standard normal distribution: https://stackoverflow.com/a/29273201 + ''' + score += target[k] * (math.erf(((adjusted - avg)/avg) / math.sqrt(2.0))) + sum_weight += target[k] dev += (avg - adjusted) * (avg - adjusted) stddev = math.sqrt(dev / float(max(num - 1, 1))) + score = score / max(sum_weight, 1) r[t] = { 'avg': avg, 'stddev': stddev, + 'score': sum_weight, } return r @@ -449,7 +480,7 @@ def calc_eval(self, ms): self.log.debug('actual_by_pool %s' % pe.actual_by_pool) self.log.debug('actual_by_root %s' % pe.actual_by_root) - # average and stddev + # average and stddev and score pe.stats_by_root = { a: pe.calc_stats( b, @@ -458,12 +489,12 @@ def calc_eval(self, ms): ) for a, b in pe.count_by_root.iteritems() } - # aggregate score (normalize the stddev by count) + # the scores are already normalized pe.score_by_root = { r: { - 'pgs': pe.stats_by_root[r]['pgs']['stddev'] / max(1, pe.total_by_root[r]['pgs']), - 'objects': pe.stats_by_root[r]['objects']['stddev'] / max(1, pe.total_by_root[r]['objects']), - 'bytes': pe.stats_by_root[r]['bytes']['stddev'] / max(1, pe.total_by_root[r]['bytes']), + 'pgs': pe.stats_by_root[r]['pgs']['score'], + 'objects': pe.stats_by_root[r]['objects']['score'], + 'bytes': pe.stats_by_root[r]['bytes']['score'], } for r in pe.total_by_root.keys() } From 1ac2a8773dcd3129d1106e652cb9feb5576f1b9c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 8 Aug 2017 16:36:23 -0400 Subject: [PATCH 16/19] pybind/mgr/balancer: 
make auto mode work (with upmap at least) Signed-off-by: Sage Weil --- src/pybind/mgr/balancer/module.py | 49 +++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/src/pybind/mgr/balancer/module.py b/src/pybind/mgr/balancer/module.py index 5f79a5d8aec82..0767ba23dd4a1 100644 --- a/src/pybind/mgr/balancer/module.py +++ b/src/pybind/mgr/balancer/module.py @@ -16,7 +16,7 @@ default_sleep_interval = 60 # seconds default_max_misplaced = .03 # max ratio of pgs replaced at a time -TIME_FORMAT = '%Y-%m-%d %H:%M:%S %Z' +TIME_FORMAT = '%Y-%m-%d_%H:%M:%S' class MappingState: @@ -312,9 +312,11 @@ def serve(self): default_sleep_interval)) if self.active: self.log.debug('Running') - plan = self.plan_create('auto-foo') - self.optimize(plan) - #self.plan_apply(plan) + name = 'auto_%s' % time.strftime(TIME_FORMAT, time.gmtime()) + plan = self.plan_create(name) + if self.optimize(plan): + self.execute(plan) + self.plan_rm(name) self.log.debug('Sleeping for %d', sleep_interval) self.event.wait(sleep_interval) self.event.clear() @@ -536,13 +538,14 @@ def optimize(self, plan): misplaced, max_misplaced) else: if plan.mode == 'upmap': - self.do_upmap(plan) + return self.do_upmap(plan) elif plan.mode == 'crush-compat': - self.do_crush_compat(plan) + return self.do_crush_compat(plan) elif plan.mode == 'none': self.log.info('Idle') else: self.log.info('Unrecognized mode %s' % plan.mode) + return False ## @@ -555,7 +558,7 @@ def do_upmap(self, plan): pools = [str(i['pool_name']) for i in ms.osdmap_dump.get('pools',[])] if len(pools) == 0: self.log.info('no pools, nothing to do') - return + return False # shuffle pool list so they all get equal (in)attention random.shuffle(pools) self.log.info('pools %s' % pools) @@ -570,6 +573,7 @@ def do_upmap(self, plan): if left <= 0: break self.log.info('prepared %d/%d changes' % (total_did, max_iterations)) + return True def do_crush_compat(self, plan): self.log.info('do_crush_compat') @@ -597,7 +601,7 @@ def 
do_crush_compat(self, plan): if len(overlap) > 0: self.log.err('error: some osds belong to multiple subtrees: %s' % overlap) - return + return False key = 'pgs' # pgs objects or bytes @@ -623,6 +627,7 @@ def do_crush_compat(self, plan): self.log.debug('Reweight osd.%d %f -> %f', osd, weight, new_weight) plan.compat_ws[osd] = new_weight + return True def compat_weight_set_reweight(self, osd, new_weight): self.log.debug('ceph osd crush weight-set reweight-compat') @@ -761,7 +766,10 @@ def execute(self, plan): 'item': 'osd.%d' % osd, 'weight': [weight], }), 'foo') - commands.append(result) + r, outb, outs = result.wait() + if r != 0: + self.log.error('Error on command') + return # new_weight reweightn = {} @@ -775,7 +783,10 @@ def execute(self, plan): 'format': 'json', 'weights': json.dumps(reweightn), }), 'foo') - commands.append(result) + r, outb, outs = result.wait() + if r != 0: + self.log.error('Error on command') + return # upmap incdump = plan.inc.dump() @@ -787,7 +798,10 @@ def execute(self, plan): 'format': 'json', 'pgid': pgid, }), 'foo') - commands.append(result) + r, outb, outs = result.wait() + if r != 0: + self.log.error('Error on command') + return for item in incdump.get('new_pg_upmap_items', []): self.log.info('ceph osd pg-upmap-items %s mappings %s', item['pgid'], @@ -802,13 +816,16 @@ def execute(self, plan): 'pgid': item['pgid'], 'id': osdlist, }), 'foo') - commands.append(result) - - # wait for commands - self.log.debug('commands %s' % commands) - for result in commands: r, outb, outs = result.wait() if r != 0: self.log.error('Error on command') return + + # wait for commands + #self.log.debug('commands %s' % commands) + #for result in commands: + # r, outb, outs = result.wait() + # if r != 0: + # self.log.error('Error on command') + # return self.log.debug('done') From 34729c5eddb199e537f875f57a1c233646a99f69 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 8 Aug 2017 16:36:23 -0400 Subject: [PATCH 17/19] pybind/mgr/balancer: make auto mode 
work (with upmap at least) Signed-off-by: Sage Weil --- src/pybind/mgr/balancer/module.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/pybind/mgr/balancer/module.py b/src/pybind/mgr/balancer/module.py index 0767ba23dd4a1..22f085673b46d 100644 --- a/src/pybind/mgr/balancer/module.py +++ b/src/pybind/mgr/balancer/module.py @@ -547,8 +547,6 @@ def optimize(self, plan): self.log.info('Unrecognized mode %s' % plan.mode) return False - ## - def do_upmap(self, plan): self.log.info('do_upmap') max_iterations = self.get_config('upmap_max_iterations', 10) @@ -816,6 +814,10 @@ def execute(self, plan): 'pgid': item['pgid'], 'id': osdlist, }), 'foo') + commands.append(result) + + # wait for commands + for result in commands: r, outb, outs = result.wait() if r != 0: self.log.error('Error on command') @@ -829,3 +831,4 @@ def execute(self, plan): # self.log.error('Error on command') # return self.log.debug('done') + From 6cf5c1cbfa8a9ee7c4685a88170f59e844a81aab Mon Sep 17 00:00:00 2001 From: Spandan Kumar Sahu Date: Sat, 29 Jul 2017 14:53:19 +0530 Subject: [PATCH 18/19] src/pybind/mgr/balancer/module.py: implement 'do_osd_weight' * make cost_model a config * use existing reweight-by-utilization and reweight-by-pg Signed-off-by: Spandan Kumar Sahu --- src/pybind/mgr/balancer/module.py | 48 ++++++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 7 deletions(-) diff --git a/src/pybind/mgr/balancer/module.py b/src/pybind/mgr/balancer/module.py index 22f085673b46d..1c9d75d73cd4c 100644 --- a/src/pybind/mgr/balancer/module.py +++ b/src/pybind/mgr/balancer/module.py @@ -13,12 +13,12 @@ # available modes: 'none', 'crush', 'crush-compat', 'upmap', 'osd_weight' default_mode = 'none' +default_cost_mode = 'pg' default_sleep_interval = 60 # seconds default_max_misplaced = .03 # max ratio of pgs replaced at a time TIME_FORMAT = '%Y-%m-%d_%H:%M:%S' - class MappingState: def __init__(self, osdmap, pg_dump, desc=''): self.desc = desc @@ -172,10 +172,15 @@ 
class Module(MgrModule): "perm": "r", }, { - "cmd": "balancer mode name=mode,type=CephChoices,strings=none|crush-compat|upmap", + "cmd": "balancer mode name=mode,type=CephChoices,strings=none|crush-compat|upmap|osd_weight", "desc": "Set balancer mode", "perm": "rw", }, + { + "cmd": "balancer cost-mode name=cost-mode,type=CephChoices,strings=pg|pg-size|utilization", + "desc": "Set balancer cost mode", + "perm": "rw", + }, { "cmd": "balancer on", "desc": "Enable automatic balancing", @@ -238,11 +243,15 @@ def handle_command(self, command): 'plans': self.plans.keys(), 'active': self.active, 'mode': self.get_config('mode', default_mode), + 'cost-mode': self.get_config('cost_mode', default_cost_mode), } return (0, json.dumps(s, indent=4), '') elif command['prefix'] == 'balancer mode': self.set_config('mode', command['mode']) return (0, '', '') + elif command['prefix'] == 'balancer cost-mode': + self.set_config('cost_mode', command['cost-mode']) + return (0, '', '') elif command['prefix'] == 'balancer on': if not self.active: self.set_config('active', '1') @@ -541,6 +550,8 @@ def optimize(self, plan): return self.do_upmap(plan) elif plan.mode == 'crush-compat': return self.do_crush_compat(plan) + elif plan.mode == 'osd_weight': + self.do_osd_weight() elif plan.mode == 'none': self.log.info('Idle') else: @@ -602,6 +613,11 @@ def do_crush_compat(self, plan): return False key = 'pgs' # pgs objects or bytes + # select a cost mode: pg, pgsize, or utilization + # FIXME: when we add utilization support, we need to throttle back + # so that we don't run if *any* objects are misplaced. with 'pgs' we + # can look at up mappings, which lets us look ahead a bit. 
+ cost_mode = self.get_config('cost_mode', default_cost_mode) # go random.shuffle(roots) @@ -733,7 +749,29 @@ def do_crush(self): self.log.info('do_crush (not yet implemented)') def do_osd_weight(self): - self.log.info('do_osd_weight (not yet implemented)') + self.log.info('do_osd_weight') + + cost_mode = self.get_config('cost_mode', default_cost_mode) + + # Reweight any over-utilized OSD + result = CommandResult('balancer_osd_weight') + if cost_mode == 'pg': + self.send_command(result, 'mon', '', json.dumps({ + 'prefix': 'osd reweight-by-pg', + 'oload': '101', + }), 'balancer_osd_weight') + r, outb, outs = result.wait() + elif cost_mode == 'utilization': + self.send_command(result, 'mon', '', json.dumps({ + 'prefix': 'osd reweight-by-utilization', + 'oload': '101', + }), 'balancer_osd_weight') + r, outb, outs = result.wait() + else: + raise RuntimeError('unsupported cost mode %s' % cost_mode) + if r != 0: + self.log.error(outs) + return def execute(self, plan): self.log.info('Executing plan %s' % plan.name) @@ -814,10 +852,6 @@ def execute(self, plan): 'pgid': item['pgid'], 'id': osdlist, }), 'foo') - commands.append(result) - - # wait for commands - for result in commands: r, outb, outs = result.wait() if r != 0: self.log.error('Error on command') From 6bc153fab2cd22537a4bfa52bda0b0139067ee5f Mon Sep 17 00:00:00 2001 From: Spandan Kumar Sahu Date: Sun, 30 Jul 2017 00:15:00 +0530 Subject: [PATCH 19/19] src/mon/MonCommands.h: add 'reweight-by-pg' and 'reweight-by-utilization' as mon commands Signed-off-by: Spandan Kumar Sahu --- src/mon/MonCommands.h | 14 ++++++++++++++ src/pybind/mgr/balancer/module.py | 1 + 2 files changed, 15 insertions(+) diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index 41e3f79f1175a..1668055880a2e 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -808,6 +808,20 @@ COMMAND("osd reweightn " \ "name=weights,type=CephString", "reweight osds with {: ,...})", "osd", "rw", "cli,rest") +COMMAND("osd 
reweight-by-utilization " \ + "name=oload,type=CephInt,req=false " \ + "name=max_change,type=CephFloat,req=false " \ + "name=max_osds,type=CephInt,req=false " \ + "name=no_increasing,type=CephChoices,strings=--no-increasing,req=false",\ + "reweight OSDs by utilization [overload-percentage-for-consideration, default 120]", \ + "osd", "rw", "cli,rest") +COMMAND("osd reweight-by-pg " \ + "name=oload,type=CephInt,req=false " \ + "name=max_change,type=CephFloat,req=false " \ + "name=max_osds,type=CephInt,req=false " \ + "name=pools,type=CephPoolname,n=N,req=false", \ + "reweight OSDs by PG distribution [overload-percentage-for-consideration, default 120]", \ + "osd", "rw", "cli,rest") COMMAND("osd force-create-pg " \ "name=pgid,type=CephPgid ", "force creation of pg ", diff --git a/src/pybind/mgr/balancer/module.py b/src/pybind/mgr/balancer/module.py index 1c9d75d73cd4c..ae834f040041b 100644 --- a/src/pybind/mgr/balancer/module.py +++ b/src/pybind/mgr/balancer/module.py @@ -19,6 +19,7 @@ TIME_FORMAT = '%Y-%m-%d_%H:%M:%S' + class MappingState: def __init__(self, osdmap, pg_dump, desc=''): self.desc = desc