Skip to content

[Core] Bind runtime env agent and dashboard agent server to specified ip instead of 0.0.0.0 #55431

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 23 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions cpp/test_submit_cpp_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,7 @@ def headers():

@pytest.fixture(scope="module")
def job_sdk_client(headers):
with _ray_start(
include_dashboard=True, num_cpus=1, _node_ip_address="0.0.0.0"
) as ctx:
with _ray_start(include_dashboard=True, num_cpus=1) as ctx:
address = ctx.address_info["webui_url"]
assert wait_until_server_available(address)
yield JobSubmissionClient(format_web_url(address), headers=headers)
Expand Down
5 changes: 1 addition & 4 deletions python/ray/_private/runtime_env/agent/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,13 +218,10 @@ def parent_dead_callback(msg):
check_raylet_task = create_check_raylet_task(
args.log_dir, gcs_client, parent_dead_callback, loop
)
runtime_env_agent_ip = (
"127.0.0.1" if args.node_ip_address == "127.0.0.1" else "0.0.0.0"
)
try:
web.run_app(
app,
host=runtime_env_agent_ip,
host=args.node_ip_address,
port=args.runtime_env_agent_port,
loop=loop,
)
Expand Down
9 changes: 6 additions & 3 deletions python/ray/dashboard/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,11 +111,14 @@ def _init_non_minimal(self):
),
) # noqa
)
grpc_ip = "127.0.0.1" if self.ip == "127.0.0.1" else "0.0.0.0"
try:
self.grpc_port = add_port_to_grpc_server(
self.server, build_address(grpc_ip, self.dashboard_agent_port)
self.server, build_address(self.ip, self.dashboard_agent_port)
)
if self.ip != "127.0.0.1" and self.ip != "localhost":
self.grpc_port = add_port_to_grpc_server(
self.server, f"127.0.0.1:{self.dashboard_agent_port}"
)
except Exception:
# TODO(SongGuyang): Catch the exception here because there is a
# port conflict issue caused by the static port. We should
Expand All @@ -129,7 +132,7 @@ def _init_non_minimal(self):
else:
logger.info(
"Dashboard agent grpc address: %s",
build_address(grpc_ip, self.grpc_port),
build_address(self.ip, self.grpc_port),
)

# If the agent is not minimal it should start the http server
Expand Down
9 changes: 8 additions & 1 deletion python/ray/dashboard/http_server_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,17 @@ async def _start_site_with_retry(
try:
site = aiohttp.web.TCPSite(
self.runner,
"127.0.0.1" if self.ip == "127.0.0.1" else "0.0.0.0",
self.ip,
self.listen_port,
)
await site.start()
# if self.ip not in ["127.0.0.1", "localhost"]:
# local_site = aiohttp.web.TCPSite(
# self.runner,
# "127.0.0.1",
# self.listen_port,
# )
# await local_site.start()
if attempt > 0:
logger.info(
f"Successfully started agent on port {self.listen_port} "
Expand Down
26 changes: 16 additions & 10 deletions python/ray/dashboard/modules/job/tests/test_job_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
run_string_as_driver_nonblocking,
wait_until_server_available,
)
from ray._common.network_utils import parse_address, build_address
from ray._common.network_utils import build_address
from ray.dashboard.modules.job.common import (
JOB_ACTOR_NAME_TEMPLATE,
SUPERVISOR_ACTOR_RAY_NAMESPACE,
Expand Down Expand Up @@ -77,8 +77,10 @@ def __init__(self, *args, **kwargs):
@pytest_asyncio.fixture
async def job_sdk_client(make_sure_dashboard_http_port_unused):
with _ray_start(include_dashboard=True, num_cpus=1) as ctx:
ip, _ = parse_address(ctx.address_info["webui_url"])
agent_address = build_address(ip, DEFAULT_DASHBOARD_AGENT_LISTEN_PORT)
# Use the actual node IP address instead of parsing from webui_url
# which might contain localhost/127.0.0.1
node_ip = ctx.address_info["node_ip_address"]
agent_address = build_address(node_ip, DEFAULT_DASHBOARD_AGENT_LISTEN_PORT)
assert wait_until_server_available(agent_address)
head_address = ctx.address_info["webui_url"]
assert wait_until_server_available(head_address)
Expand Down Expand Up @@ -469,8 +471,9 @@ async def test_job_log_in_multiple_node(
dashboard_agent_listen_port=DEFAULT_DASHBOARD_AGENT_LISTEN_PORT + 2
)

ip, _ = parse_address(cluster.webui_url)
agent_address = build_address(ip, DEFAULT_DASHBOARD_AGENT_LISTEN_PORT)
# Get the actual node IP from the cluster head node instead of parsing from webui_url
node_ip = cluster.head_node.node_ip_address
agent_address = build_address(node_ip, DEFAULT_DASHBOARD_AGENT_LISTEN_PORT)
assert wait_until_server_available(agent_address)
client = JobAgentSubmissionClient(format_web_url(agent_address))

Expand Down Expand Up @@ -595,18 +598,21 @@ async def test_non_default_dashboard_agent_http_port(tmp_path):
"""
import subprocess

cmd = (
"ray start --head " f"--dashboard-agent-listen-port {get_current_unused_port()}"
)
dashboard_agent_port = get_current_unused_port()
cmd = "ray start --head " f"--dashboard-agent-listen-port {dashboard_agent_port}"
subprocess.check_output(cmd, shell=True)

try:
# We will need to wait for the ray to be started in the subprocess.
address_info = ray.init("auto", ignore_reinit_error=True).address_info

ip, _ = parse_address(address_info["webui_url"])
# Get the actual node IP address from the nodes list
nodes = list_nodes()
assert len(nodes) > 0, "No nodes found"
node_ip = nodes[0].node_ip

dashboard_agent_listen_port = address_info["dashboard_agent_listen_port"]
agent_address = build_address(ip, dashboard_agent_listen_port)
agent_address = build_address(node_ip, dashboard_agent_listen_port)
print("agent address = ", agent_address)

agent_client = JobAgentSubmissionClient(format_web_url(agent_address))
Expand Down