Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP][dashboard] remove DataSource.agents for good. #49903

Closed
wants to merge 10 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions python/ray/dashboard/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,8 +202,8 @@ async def run(self):
http_port = -1 if not self.http_server else self.http_server.http_port
grpc_port = -1 if not self.server else self.grpc_port
await self.gcs_aio_client.internal_kv_put(
f"{dashboard_consts.DASHBOARD_AGENT_PORT_PREFIX}{self.node_id}".encode(),
json.dumps([http_port, grpc_port]).encode(),
f"{dashboard_consts.DASHBOARD_AGENT_ADDR_PREFIX}{self.node_id}".encode(),
json.dumps([self.ip, http_port, grpc_port]).encode(),
True,
namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
)
Expand Down
27 changes: 16 additions & 11 deletions python/ray/dashboard/client/src/common/ProfilingLink.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ import { ClassNameProps } from "./props";
type CpuProfilingLinkProps = PropsWithChildren<
{
pid: string | number | null | undefined;
ip: string | null | undefined;
nodeId: string | null | undefined;
type: string | null;
} & ClassNameProps
>;
Expand All @@ -34,7 +34,7 @@ type TaskProfilingStackTraceProps = {
type MemoryProfilingProps = PropsWithChildren<
{
pid: string | number | null | undefined;
ip: string | null | undefined;
nodeId: string | null | undefined;
type?: string | null;
} & ClassNameProps
>;
Expand Down Expand Up @@ -92,15 +92,20 @@ export const TaskCpuStackTraceLink = ({

export const CpuStackTraceLink = ({
pid,
ip,
nodeId,
type = "",
}: CpuProfilingLinkProps) => {
if (!pid || !ip || typeof pid === "undefined" || typeof ip === "undefined") {
if (
!pid ||
!nodeId ||
typeof pid === "undefined" ||
typeof nodeId === "undefined"
) {
return <div></div>;
}
return (
<Link
href={`worker/traceback?pid=${pid}&ip=${ip}&native=0`}
href={`worker/traceback?pid=${pid}&node_id=${nodeId}&native=0`}
target="_blank"
title="Sample the current Python stack trace for this worker."
rel="noreferrer"
Expand All @@ -112,16 +117,16 @@ export const CpuStackTraceLink = ({

export const CpuProfilingLink = ({
pid,
ip,
nodeId,
type = "",
}: CpuProfilingLinkProps) => {
if (!pid || !ip) {
if (!pid || !nodeId) {
return <div></div>;
}

return (
<Link
href={`worker/cpu_profile?pid=${pid}&ip=${ip}&duration=5&native=0`}
href={`worker/cpu_profile?pid=${pid}&node_id=${nodeId}&duration=5&native=0`}
target="_blank"
title="Profile the Python worker for 5 seconds (default) and display a CPU flame graph."
rel="noreferrer"
Expand Down Expand Up @@ -283,13 +288,13 @@ export const ProfilerButton = ({

export const MemoryProfilingButton = ({
pid,
ip,
nodeId,
type = "",
}: MemoryProfilingProps) => {
if (!pid || !ip) {
if (!pid || !nodeId) {
return <div></div>;
}
const profilerUrl = `memory_profile?pid=${pid}&ip=${ip}`;
const profilerUrl = `memory_profile?pid=${pid}&node_id=${nodeId}`;

return <ProfilerButton profilerUrl={profilerUrl} type={type} />;
};
Expand Down
6 changes: 3 additions & 3 deletions python/ray/dashboard/client/src/components/ActorTable.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -609,19 +609,19 @@ const ActorTable = ({
<br />
<CpuProfilingLink
pid={pid}
ip={address?.ipAddress}
nodeId={address?.rayletId}
type=""
/>
<br />
<CpuStackTraceLink
pid={pid}
ip={address?.ipAddress}
nodeId={address?.rayletId}
type=""
/>
<br />
<MemoryProfilingButton
pid={pid}
ip={address?.ipAddress}
nodeId={address?.rayletId}
/>
</React.Fragment>
</TableCell>
Expand Down
6 changes: 3 additions & 3 deletions python/ray/dashboard/client/src/pages/actor/ActorDetail.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -188,19 +188,19 @@ const ActorDetailPage = () => {
<div>
<CpuStackTraceLink
pid={actorDetail.pid}
ip={actorDetail.address?.ipAddress}
nodeId={actorDetail.address?.rayletId}
type=""
/>
<br />
<CpuProfilingLink
pid={actorDetail.pid}
ip={actorDetail.address?.ipAddress}
nodeId={actorDetail.address?.rayletId}
type=""
/>
<br />
<MemoryProfilingButton
pid={actorDetail.pid}
ip={actorDetail.address?.ipAddress}
nodeId={actorDetail.address?.rayletId}
type=""
/>
</div>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -172,19 +172,19 @@ export const JobMetadataSection = ({ job }: JobMetadataSectionProps) => {
<div>
<CpuStackTraceLink
pid={job.driver_info?.pid}
ip={job.driver_info?.node_ip_address}
nodeId={job.driver_info?.node_id}
type="Driver"
/>
<br />
<CpuProfilingLink
pid={job.driver_info?.pid}
ip={job.driver_info?.node_ip_address}
nodeId={job.driver_info?.node_id}
type="Driver"
/>
<br />
<MemoryProfilingButton
pid={job.driver_info?.pid}
ip={job.driver_info?.node_ip_address}
nodeId={job.driver_info?.node_id}
type="Driver"
/>
</div>
Expand Down
6 changes: 3 additions & 3 deletions python/ray/dashboard/client/src/pages/job/JobRow.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -116,19 +116,19 @@ export const JobRow = ({ job }: JobRowProps) => {
)}
<CpuStackTraceLink
pid={job.driver_info?.pid}
ip={job.driver_info?.node_ip_address}
nodeId={job.driver_info?.node_id}
type="Driver"
/>
<br />
<CpuProfilingLink
pid={job.driver_info?.pid}
ip={job.driver_info?.node_ip_address}
nodeId={job.driver_info?.node_id}
type="Driver"
/>
<br />
<MemoryProfilingButton
pid={job.driver_info?.pid}
ip={job.driver_info?.node_ip_address}
nodeId={job.driver_info?.node_id}
type="Driver"
/>
</TableCell>
Expand Down
7 changes: 3 additions & 4 deletions python/ray/dashboard/client/src/pages/node/NodeRow.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,6 @@ type WorkerRowProps = {
*/
export const WorkerRow = ({ node, worker }: WorkerRowProps) => {
const {
ip,
mem,
raylet: { nodeId },
} = node;
Expand Down Expand Up @@ -278,11 +277,11 @@ export const WorkerRow = ({ node, worker }: WorkerRowProps) => {
Log
</Link>
<br />
<CpuProfilingLink pid={pid} ip={ip} type="" />
<CpuProfilingLink pid={pid} nodeId={nodeId} type="" />
<br />
<CpuStackTraceLink pid={pid} ip={ip} type="" />
<CpuStackTraceLink pid={pid} nodeId={nodeId} type="" />
<br />
<MemoryProfilingButton pid={pid} ip={ip} />
<MemoryProfilingButton pid={pid} nodeId={nodeId} />
</TableCell>
<TableCell>
<PercentageBar num={Number(cpu)} total={100}>
Expand Down
2 changes: 1 addition & 1 deletion python/ray/dashboard/consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from ray._private.ray_constants import env_bool, env_integer

DASHBOARD_LOG_FILENAME = "dashboard.log"
DASHBOARD_AGENT_PORT_PREFIX = "DASHBOARD_AGENT_PORT_PREFIX:"
DASHBOARD_AGENT_ADDR_PREFIX = "DASHBOARD_AGENT_ADDR_PREFIX:"
DASHBOARD_AGENT_LOG_FILENAME = "dashboard_agent.log"
DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_S_ENV_NAME = (
"RAY_DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_S" # noqa
Expand Down
42 changes: 1 addition & 41 deletions python/ray/dashboard/datacenter.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import logging
from typing import Any, List, Optional
from typing import List, Optional

import ray.dashboard.consts as dashboard_consts
from ray._private.utils import (
Expand All @@ -26,9 +26,6 @@ class DataSource:
# {actor id hex(str): actor table data(dict of ActorTableData
# in gcs.proto)}
actors = MutableNotificationDict()
# {job id hex(str): job table data(dict of JobTableData in gcs.proto)}
# {node id hex(str): dashboard agent [http port(int), grpc port(int)]}
agents = Dict()
# {node id hex(str): gcs node info(dict of GcsNodeInfo in gcs.proto)}
nodes = Dict()
# {node id hex(str): worker list}
Expand All @@ -48,7 +45,6 @@ async def purge():
# Purge data that is out of date.
# These data sources are maintained by DashboardHead,
# we do not need to purge them:
# * agents
# * nodes
alive_nodes = {
node_id
Expand Down Expand Up @@ -188,42 +184,6 @@ async def get_all_node_summary(cls):
for node_id in DataSource.nodes.keys()
]

@classmethod
async def get_agent_infos(
cls, target_node_ids: Optional[List[str]] = None
) -> Dict[str, Dict[str, Any]]:
"""Fetches info about the Agent (HTTP/gRPC ports, IP, etc.) running on every node

:param target_node_ids: Target node ids to fetch agent info for. If omitted will
fetch the info for all agents
"""

# Return all available agent infos in case no target node-ids were provided
target_node_ids = target_node_ids or DataSource.agents.keys()

missing_node_ids = [
node_id for node_id in target_node_ids if node_id not in DataSource.agents
]
if missing_node_ids:
logger.warning(
f"Agent info was not found for {missing_node_ids}"
f" (having agent infos for {list(DataSource.agents.keys())})"
)
return {}

def _create_agent_info(node_id: str):
(http_port, grpc_port) = DataSource.agents[node_id]
node_ip = DataSource.nodes[node_id]["nodeManagerAddress"]

return dict(
ipAddress=node_ip,
httpPort=int(http_port or -1),
grpcPort=int(grpc_port or -1),
httpAddress=f"{node_ip}:{http_port}",
)

return {node_id: _create_agent_info(node_id) for node_id in target_node_ids}

@classmethod
async def get_actor_infos(cls, actor_ids: Optional[List[str]] = None):
target_actor_table_entries: dict[str, Optional[dict]]
Expand Down
Loading
Loading