Skip to content

Commit

Permalink
slurmctld: add support for TLS certificates
Browse files Browse the repository at this point in the history
Add three configuration options to the slurmctld charm that allow the
operator to supply TLS certificates for etcd: two for the TLS certificate
(public certificate and private key files) and an optional one for the CA
public certificate, for when the nodes do not have it installed or an
internal CA was used. etcd will not use that certificate to verify the clients.

Add integration tests to validate that the certs are in place and can be
used correctly.
  • Loading branch information
heitorPB committed Jun 27, 2022
1 parent 4723f6c commit b507db1
Show file tree
Hide file tree
Showing 12 changed files with 333 additions and 17 deletions.
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,17 @@
*.snap
*.resource
*.charm
*.crt
*.key
*.srl
*.csr
*.v3.ext
venv
out
build
tmp
node_modules/
**__pycache__
.env
version
charm-slurm*/version
Expand Down
2 changes: 2 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ This file keeps track of all notable changes to the Slurm Charms.
Unreleased
----------

- added slurmctld configuration options to use TLS certificates for etcd

0.9.1 - 2022-06-02
------------------

Expand Down
16 changes: 16 additions & 0 deletions charm-slurmctld/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,19 @@ options:
This value supplements the charm supplied `acct_gather.conf` file that is
used for configuring the acct_gather plugins.
tls-key:
type: string
default: ""
description: A TLS server private key (`.key` file) to be used.
tls-cert:
type: string
default: ""
description: A TLS server certificate (`.crt` file) to be used.
tls-ca-cert:
type: string
default: ""
description: |
A CA certificate (`.crt` file) to be used for verification of TLS
certificates. Supply a CA certificate only when a custom CA is used and
the nodes do not already have it installed.
1 change: 1 addition & 0 deletions charm-slurmctld/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
ops==1.3.0
influxdb
etcd3gw==1.0.2
jinja2==3.1.2
git+https://github.com/omnivector-solutions/[email protected]
16 changes: 15 additions & 1 deletion charm-slurmctld/src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ def __init__(self, *args):
etcd_configured=False,
etcd_root_pass=str(),
etcd_slurmd_pass=str(),
use_tls=False,
use_tls_ca=False,
)

self._slurm_manager = SlurmManager(self, "slurmctld")
Expand All @@ -65,7 +67,7 @@ def __init__(self, *args):
self._fluentbit = FluentbitClient(self, "fluentbit")

self._user_group = UserGroupProvides(self, "user-group")
self._etcd = EtcdOps()
self._etcd = EtcdOps(self)

event_handler_bindings = {
self.on.install: self._on_install,
Expand Down Expand Up @@ -505,6 +507,18 @@ def _on_write_slurm_config(self, event):
event.defer()
return

# check if both certificates are supplied
tls_key = self.model.config['tls-key']
tls_cert = self.model.config['tls-cert']
self._stored.use_tls = (bool(tls_key) and bool(tls_cert))
self._stored.use_tls_ca = bool(self.model.config['tls-ca-cert'])
logger.debug(f"## _on_write_slurm_config(): use_tls: {self._stored.use_tls}")
logger.debug(f"## _on_write_slurm_config(): use_tls_ca: {self._stored.use_tls_ca}")

# TODO: this fires every time the configuration changes, which is not
# needed. It should happen only when the TLS options change.
self._etcd.setup_tls()

slurm_config = self._assemble_slurm_config()
if slurm_config:
self._slurm_manager.render_slurm_configs(slurm_config)
Expand Down
130 changes: 121 additions & 9 deletions charm-slurmctld/src/etcd_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
from pathlib import Path
from typing import List

from jinja2 import Environment, FileSystemLoader
from slurm_ops_manager.utils import operating_system

from omnietcd3 import Etcd3AuthClient

logger = logging.getLogger()
Expand All @@ -18,13 +21,25 @@
class EtcdOps:
"""ETCD ops."""

def __init__(self, charm):
    """Initialize class.

    Args:
        charm: the charm object that owns this helper. Its stored state
            (``use_tls``/``use_tls_ca``) and its model config are read by
            the TLS related methods.
    """
    self._charm = charm

    # system user and group
    self._etcd_user = "etcd"
    self._etcd_group = "etcd"
    self._etcd_service = "etcd.service"

    # the environment file read by the systemd unit lives in a different
    # directory depending on the operating system
    if operating_system() == 'ubuntu':
        self._etcd_environment_file = Path("/etc/default/etcd")
    else:
        self._etcd_environment_file = Path("/etc/sysconfig/etcd")

    # where the TLS key and certificates from the charm config are stored
    self._certs_path = Path("/var/lib/etcd/tls_certificates/")
    self._tls_key_path = self._certs_path / "tls.key"
    self._tls_crt_path = self._certs_path / "tls.crt"
    self._tls_ca_crt_path = self._certs_path / "tls-ca.crt"

def install(self, resource_path: Path):
"""Install etcd."""
# extract resource tarball
Expand Down Expand Up @@ -63,20 +78,91 @@ def _create_etcd_user_group(self):
def _setup_systemd(self):
    """Render the etcd systemd unit from its template and reload systemd.

    The unit is rendered from ``templates/etcd.service.tmpl`` with the path
    of the environment file and written to ``/etc/systemd/system/``.
    """
    logger.debug("## creating systemd files for etcd")

    template_dir = Path(__file__).parent / "templates"
    environment = Environment(loader=FileSystemLoader(template_dir))

    # service unit: written once, directly from the rendered template (no
    # raw copy of the template beforehand, it would only be overwritten)
    template = environment.get_template("etcd.service.tmpl")
    ctxt = {"environment_file": self._etcd_environment_file}
    dest = Path("/etc/systemd/system/") / self._etcd_service
    dest.write_text(template.render(ctxt))

    # make systemd pick up the new/changed unit file
    subprocess.call(["systemctl", "daemon-reload"])

def _setup_environment_file(self):
    """Render ``etcd.env.tmpl`` and write it to the etcd environment file.

    The template context selects https plus the key/certificate paths when
    the charm's stored ``use_tls`` flag is set, and plain http otherwise.
    """
    logger.debug("## creating environemnt file for etcd")

    templates = Path(__file__).parent / "templates"
    jinja_env = Environment(loader=FileSystemLoader(templates))
    env_template = jinja_env.get_template("etcd.env.tmpl")

    stored = self._charm._stored
    if not stored.use_tls:
        context = {"use_tls": False, "protocol": "http"}
    else:
        context = {
            "use_tls": True,
            "protocol": "https",
            "tls_key_path": self._tls_key_path,
            "tls_cert_path": self._tls_crt_path,
        }
        # the CA path is only part of the context when a CA was supplied
        if stored.use_tls_ca:
            context["ca_cert_path"] = self._tls_ca_crt_path

    rendered = env_template.render(context)
    self._etcd_environment_file.write_text(rendered)

def setup_tls(self):
    """Write the TLS files for etcd, refresh its environment file and restart.

    When the charm's stored ``use_tls`` flag is not set, no certificate file
    is written, but the environment file is still regenerated and etcd is
    restarted, because the user may have just removed the certificates.
    """
    logger.debug("## setting tls files for etcd")

    if self._charm._stored.use_tls:
        self._write_tls_files()
    else:
        logger.debug("## no certificates provided")

    # update configurations and restart (required in both cases)
    self._setup_environment_file()
    self.restart()

def _write_tls_files(self):
    """Store the configured key and certificates under ``self._certs_path``."""
    config = self._charm.model.config

    # create dir to store certs
    if not self._certs_path.exists():
        logger.debug("## creating directory to store certs")
        self._certs_path.mkdir(parents=True)

    # set correct permissions *before* any secret is written, so the private
    # key never sits in a directory that other users can enter
    shutil.chown(self._certs_path, user=self._etcd_user, group=self._etcd_group)
    self._certs_path.chmod(0o500)

    # create the files
    logger.debug("## creating cert files")
    self._tls_key_path.write_text(config["tls-key"])
    self._tls_crt_path.write_text(config["tls-cert"])

    # the CA certificate is optional
    ca_crt = config["tls-ca-cert"]
    if ca_crt:
        logger.debug("## creating ca cert file")
        self._tls_ca_crt_path.write_text(ca_crt)

def stop(self):
    """Ask systemd to stop the etcd unit."""
    logger.debug("## stopping etcd")
    command = ["systemctl", "stop", self._etcd_service]
    subprocess.call(command)

def start(self):
    """Enable the etcd unit in systemd and then start it."""
    logger.debug("## enabling and starting etcd")
    # same two systemctl invocations, in the same order: enable, then start
    for action in ("enable", "start"):
        subprocess.call(["systemctl", action, self._etcd_service])

def restart(self):
    """Ask systemd to restart the etcd unit."""
    logger.debug("## restarting etcd")
    command = ["systemctl", "restart", self._etcd_service]
    subprocess.call(command)

def is_active(self) -> bool:
"""Check if systemd etcd service is active."""
try:
Expand All @@ -88,8 +174,12 @@ def is_active(self) -> bool:
return False

def configure(self, root_pass: str, slurmd_pass: str) -> None:
"""Configure etcd service."""
"""Configure etcd service for the first time."""
logger.debug("## configuring etcd")

# rewrite environment file and start service
self.setup_tls()
self._setup_environment_file()
self.start()

# some configs can only be applied with the server running
Expand All @@ -107,7 +197,7 @@ def setup_default_roles(self, root_pass: str, slurmd_pass: str) -> None:
- has r permissions for munge/* keys
"""
logger.debug("## creating default etcd roles/users")
cmds = [# create root account with random pass
cmds = [# create root account
f"etcdctl user add root:{root_pass}",
# create root role
"etcdctl role add root",
Expand Down Expand Up @@ -146,14 +236,36 @@ def create_new_munge_user(self, root_pass: str, user: str, password: str) -> Non
cmd = f"etcdctl {auth} user grant-role {user} munge-readers"
subprocess.run(shlex.split(cmd))

def _client(self, root_pass: str) -> Etcd3AuthClient:
"""Build an authenticated etcd client for the ``root`` user.

Uses https when the charm's stored ``use_tls`` flag is set and http
otherwise. The certificate paths handed to the client point at the
files kept under ``self._certs_path``.

Args:
    root_pass: password of the etcd ``root`` user.

Returns:
    The client, after ``authenticate()`` has been called on it.
"""
# defaults: plain http, no certificates
protocol = "http"
tls_cert = None
cacert = None

if self._charm._stored.use_tls:
protocol = "https"
tls_cert = self._tls_crt_path.as_posix()

# NOTE(review): the scrape lost the indentation, so it is unclear whether
# this check is nested under the use_tls check above -- confirm.
if self._charm._stored.use_tls_ca:
cacert = self._tls_ca_crt_path.as_posix()
# this message is emitted before the client is actually constructed
logger.debug(f"## Created new etcd client using {protocol}, {tls_cert} and {cacert}")
# NOTE(review): cert_cert receives the path of the server certificate;
# confirm this is what Etcd3AuthClient expects for that argument.
client = Etcd3AuthClient(username="root", password=root_pass,
protocol=protocol, ca_cert=cacert,
cert_cert=tls_cert)
client.authenticate()
return client

def set_list_of_accounted_nodes(self, root_pass: str, nodes: List[str]) -> None:
    """Set list of nodes on etcd.

    The list is stored JSON-encoded under the ``nodes/all_nodes`` key.

    Args:
        root_pass: password of the etcd ``root`` user.
        nodes: the node names to store.
    """
    logger.debug(f"## setting on etcd: nodes/all_nodes/{nodes}")
    # always go through _client(): it selects the protocol and certificates
    # from the TLS configuration, which a directly built client would not
    client = self._client(root_pass)
    client.put(key="nodes/all_nodes", value=json.dumps(nodes))

def store_munge_key(self, root_pass: str, key: str) -> None:
    """Store munge key on etcd.

    The key is stored under the ``munge/key`` key.

    Args:
        root_pass: password of the etcd ``root`` user.
        key: the munge key to store.
    """
    logger.debug("## Storing munge key on etcd: munge/key")
    # always go through _client(): it selects the protocol and certificates
    # from the TLS configuration, which a directly built client would not
    client = self._client(root_pass)
    client.put(key="munge/key", value=key)
19 changes: 19 additions & 0 deletions charm-slurmctld/src/interface_slurmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,9 @@ def _on_relation_created(self, event):

app_relation_data["etcd_slurmd_pass"] = self._charm.etcd_slurmd_password

app_relation_data["tls_cert"] = self._charm.model.config["tls-cert"]
app_relation_data["ca_cert"] = self._charm.model.config["tls-ca-cert"]

def _on_relation_changed(self, event):
"""Emit slurmd available event."""
if event.relation.data[event.app].get("partition_info"):
Expand Down Expand Up @@ -180,6 +183,22 @@ def set_nhc_params(self, params: str = ""):
else:
logger.debug("## slurmd not joined")

def set_tls_settings(self):
    """Send TLS settings to all slurmd.

    Reads ``tls-cert`` and ``tls-ca-cert`` from the charm configuration and
    publishes them in the application relation data of every relation, under
    the ``tls_cert`` and ``ca_cert`` keys. When slurmd has not joined, only a
    debug message is logged.
    """
    config = self._charm.model.config
    tls_cert = config["tls-cert"]
    ca_cert = config["tls-ca-cert"]

    logger.debug(f"## set_tls_settings: {bool(tls_cert)}, {bool(ca_cert)}")

    if not self.is_joined:
        logger.debug("## slurmd not joined")
        return

    relations = self._charm.framework.model.relations.get(self._relation_name)
    app = self.model.app
    for relation in relations:
        # publish the server certificate itself under "tls_cert"; the CA
        # certificate only belongs under "ca_cert"
        relation.data[app]["tls_cert"] = tls_cert
        relation.data[app]["ca_cert"] = ca_cert


def ensure_unique_partitions(partitions):
"""Return a list of unique partitions."""
Expand Down
9 changes: 9 additions & 0 deletions charm-slurmctld/src/templates/etcd.env.tmpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{# Environment file for the etcd service, rendered by
   EtcdOps._setup_environment_file(). Context variables used here:
   protocol ("http" or "https"), use_tls, tls_cert_path and tls_key_path.
   A ca_cert_path may also be passed in the context; this template does not
   reference it. -#}
ETCD_NAME=osd-etcd
ETCD_DATA_DIR=/var/lib/etcd
ETCD_LISTEN_CLIENT_URLS={{ protocol }}://0.0.0.0:2379
ETCD_ADVERTISE_CLIENT_URLS={{ protocol }}://0.0.0.0:2379

{% if use_tls %}
ETCD_CERT_FILE={{ tls_cert_path }}
ETCD_KEY_FILE={{ tls_key_path }}
{% endif %}
5 changes: 1 addition & 4 deletions charm-slurmctld/src/templates/etcd.service.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,7 @@ Wants=network-online.target local-fs.target remote-fs.target time-sync.target
User=etcd
Group=etcd
Type=notify
Environment=ETCD_DATA_DIR=/var/lib/etcd
Environment=ETCD_NAME=osd-etcd
Environment=ETCD_LISTEN_CLIENT_URLS=http://0.0.0.0:2379
Environment=ETCD_ADVERTISE_CLIENT_URLS=http://0.0.0.0:2379
EnvironmentFile=-{{ environment_file }}
ExecStart=/usr/bin/etcd
Restart=always
RestartSec=10s
Expand Down
Loading

0 comments on commit b507db1

Please sign in to comment.