Skip to content

Commit

Permalink
ADD: Obol Alerts (#2018)
Browse files Browse the repository at this point in the history
* ADD: Alert Rules

* CHANGE: name of alert files

* ADD: Function for Obol Alerts in Stereum

* FIX: Correct alarm filtering logic in alert boxes (#2017)

* FIX: update alarm filter in node page

* FIX: update alarm filter in control page

* ADD: Alert Rules

* CHANGE: name of alert files

* ADD: Function for Obol Alerts in Stereum

* Fix: the control alert with obol alerts

* fix: tooltip of yellow obol alert

* FIX: new filter algo for the alarm controls

* IMPROVE: error handling

* FIX: new alarm filter algo in the node alert box

* FIX: wire obol warning in the node alert box

---------

Co-authored-by: mabasian <[email protected]>
  • Loading branch information
NeoPlays and mabasian authored Sep 3, 2024
1 parent ff4a058 commit f17d1a9
Show file tree
Hide file tree
Showing 10 changed files with 1,080 additions and 91 deletions.

Large diffs are not rendered by default.

14 changes: 0 additions & 14 deletions launcher/public/output.css
Original file line number Diff line number Diff line change
Expand Up @@ -1800,10 +1800,6 @@ video {
height: 50px;
}

.h-\[53px\]{
height: 53px;
}

.h-\[554px\]{
height: 554px;
}
Expand Down Expand Up @@ -5017,11 +5013,6 @@ video {
--tw-shadow: var(--tw-shadow-colored);
}

.shadow-zinc-700{
--tw-shadow-color: #3f3f46;
--tw-shadow: var(--tw-shadow-colored);
}

.shadow-zinc-800{
--tw-shadow-color: #27272a;
--tw-shadow: var(--tw-shadow-colored);
Expand Down Expand Up @@ -5626,11 +5617,6 @@ html body {
background-color: rgb(43 48 52 / var(--tw-bg-opacity));
}

.hover\:bg-\[\#2f5a50\]:hover{
--tw-bg-opacity: 1;
background-color: rgb(47 90 80 / var(--tw-bg-opacity));
}

.hover\:bg-\[\#325e5a\]:hover{
--tw-bg-opacity: 1;
background-color: rgb(50 94 90 / var(--tw-bg-opacity));
Expand Down
130 changes: 119 additions & 11 deletions launcher/src/backend/Monitoring.js
Original file line number Diff line number Diff line change
Expand Up @@ -436,11 +436,11 @@ export class Monitoring {
var query =
rpc_method.trim().indexOf("{") < 0
? JSON.stringify({
jsonrpc: "2.0",
method: rpc_method.trim(),
params: rpc_params,
id: 1,
})
jsonrpc: "2.0",
method: rpc_method.trim(),
params: rpc_params,
id: 1,
})
: rpc_method;

// Define default response
Expand Down Expand Up @@ -2623,8 +2623,8 @@ export class Monitoring {
const addr_type = Array.isArray(addr)
? "arr"
: typeof addr === "string" && ["public", "local"].includes(addr)
? "str"
: "invalid";
? "str"
: "invalid";
addr = addr_type == "str" ? addr.toLowerCase().trim() : addr;
if (addr_type == "invalid") {
return {
Expand Down Expand Up @@ -2712,7 +2712,7 @@ export class Monitoring {
for (let i = 0; i < serviceInfos.length; i++) {
const hashDependencies =
serviceInfos[i].config.dependencies.consensusClients.length ||
serviceInfos[i].config.dependencies.executionClients.length
serviceInfos[i].config.dependencies.executionClients.length
? "yes"
: "no";
easyInfos.push({
Expand Down Expand Up @@ -3249,9 +3249,8 @@ rm -rf diskoutput
const parsedJson = JSON.parse(stdoutJson);

let message =
`${parsedJson?.message || ""}${parsedJson?.message && parsedJson?.stacktraces ? "\n" : ""}${
parsedJson?.stacktraces || ""
}`.trim() || output;
`${parsedJson?.message || ""}${parsedJson?.message && parsedJson?.stacktraces ? "\n" : ""}${parsedJson?.stacktraces || ""
}`.trim() || output;

return {
pubkey: pubkey,
Expand Down Expand Up @@ -3341,4 +3340,113 @@ rm -rf diskoutput
];
}
}
/**
* Will gather metrics from Prometheus and evaluate.
* If thresholds are exceeded, an alert will be generated and added to the retuned array.
* @returns {Object[]} Array of alerts e.g. [{name: "Cluster in Unknown Status", level: "warning"}, {name: "Beacon Node Down", level: "critical"}]
*/
async fetchObolCharonAlerts() {
try {
const serviceInfos = await this.getServiceInfos("CharonService");
if (serviceInfos.length < 1) {
return [];
}
const queries = {
app_monitoring_readyz: "max((app_monitoring_readyz)) by (cluster_name, cluster_hash, cluster_peer)",
cluster_missed_attestations: "max(increase(core_tracker_failed_duties_total[10m])) by (cluster_hash, cluster_name)",
cluster_failure_rate: "floor(100 * (max(increase(core_tracker_success_duties_total[15m])) by (cluster_hash, cluster_name) / max(increase(core_tracker_expect_duties_total[15m])) by (cluster_hash, cluster_name)))",
percentage_failed_sync_message_duty: "(\n sum(increase(core_tracker_failed_duties_total[1h])) by (cluster_name,cluster_hash,cluster_peer)\n) \n/ \n(\n sum(increase(core_tracker_failed_duties_total[1h])) by (cluster_name,cluster_hash,cluster_peer) \n + \n sum(increase(core_bcast_broadcast_total[1h])) by (cluster_name,cluster_hash,cluster_peer) \n)",
connected_relays: "group (p2p_relay_connections) by (cluster_peer)",
peer_ping_latency: "histogram_quantile(0.90, sum(rate(p2p_ping_latency_secs_bucket[2m])) by (le,peer))",
}

const queryPromises = Object.entries(queries).map(([key, query]) => {
return this.queryPrometheus(encodeURIComponent(query)).then(result => ({ key, result }));
});

const results = await Promise.all(queryPromises);

let alerts = results.map((metric) => {
if (metric.result.status != "success") {
return;
}
if (metric.key === "peer_ping_latency") {
let value = Math.max(...metric.result.data.result.map((r) => r.value[1]));
return this.parseObolCharonAlerts(metric.key, value);
}
let value = metric.result.data.result[0].value[1];
return this.parseObolCharonAlerts(metric.key, value);
}).filter((alert) => alert);

return alerts;

} catch (error) {
log.error("Fetching Obol Charon Alerts Failed:\n" + error);
return []
}
}

parseObolCharonAlerts(key, value) {
value = 0
//app_monitoring_readyz
if (key === "app_monitoring_readyz") {
switch (value) {
case 0:
return {
name: "Cluster in Unknown Status",
level: "warning"
};
case 2:
return {
name: "Beacon Node Down",
level: "critical"
};
case 4:
return {
name: "Cluster Insufficient Peers",
level: "warning"
};
case 6:
return {
name: "Cluster Missing Validators",
level: "critical"
};
case 7:
return {
name: "Beacon Node Zero Peers",
level: "critical"
};
}
}
if (key === "cluster_missed_attestations" && value > 0) {
return {
name: "Cluster Missed Attestations",
level: "critical"
};
}
if (key === "cluster_failure_rate" && value < 95) {
return {
name: "Cluster Failure Rate",
level: "critical"
};
}
if (key === "percentage_failed_sync_message_duty" && value > 0.1) {
return {
name: "Failed Sync Msg Duty",
level: "critical"
};
}
if (key === "connected_relays" && value < 1) {
return {
name: "Num. Connected Relays",
level: "warning"
};
}
if (key === "peer_ping_latency" && value > 0.4) {
return {
name: "Peer Ping Latency",
level: "warning"
};
}
}
}
7 changes: 5 additions & 2 deletions launcher/src/backend/SSHService.js
Original file line number Diff line number Diff line change
Expand Up @@ -571,7 +571,10 @@ export class SSHService {
* @param {Client} [conn]
* @returns `void`
*/
async uploadFileSSH(localPath, remotePath, conn = this.getConnectionFromPool()) {
async uploadFileSSH(localPath, remotePath, conn) {
if (!conn) {
conn = await this.getConnectionFromPool();
}
return new Promise((resolve, reject) => {
const readStream = fs.createReadStream(localPath);
readStream.on("error", reject);
Expand Down Expand Up @@ -621,7 +624,7 @@ export class SSHService {
if (item.isDirectory()) {
await this.uploadDirectorySSH(localFilePath, remoteFilePath, conn);
} else {
await this.uploadFileSSH(localFilePath, remoteFilePath);
await this.uploadFileSSH(localFilePath, remoteFilePath, conn);
}
}
return true;
Expand Down
2 changes: 1 addition & 1 deletion launcher/src/backend/ValidatorAccountManager.js
Original file line number Diff line number Diff line change
Expand Up @@ -1127,7 +1127,7 @@ export class ValidatorAccountManager {
this.nodeConnection.sshService.exec(`rm -rf ${dataDir}`);
const result = await this.nodeConnection.sshService.uploadDirectorySSH(path.normalize(localPath), dataDir);
if (result) {
log.info("Obol Backup downloaded from: ", localPath);
log.info("Obol Backup uploaded from: ", localPath);
}
} catch (err) {
log.error("Error uploading Obol Backup: ", err);
Expand Down
4 changes: 4 additions & 0 deletions launcher/src/background.js
Original file line number Diff line number Diff line change
Expand Up @@ -741,6 +741,10 @@ ipcMain.handle("readGasConfigFile", async (event, args) => {
return await tekuGasLimitConfig.readGasConfigFile(args);
});

// IPC handler for the "fetchObolCharonAlerts" channel: resolves with the alerts from the monitoring backend.
ipcMain.handle("fetchObolCharonAlerts", async () => {
  const obolAlerts = await monitoring.fetchObolCharonAlerts();
  return obolAlerts;
});

// Scheme must be registered before the app is ready
protocol.registerSchemesAsPrivileged([{ scheme: "app", privileges: { secure: true, standard: true } }]);

Expand Down
Loading

0 comments on commit f17d1a9

Please sign in to comment.