Skip to content

Commit a4e1bc7

Browse files
Merge pull request #14129 from rabbitmq/mergify/bp/v4.1.x/pr-14125
Re-submit #14087 by @SimonUnge: introduce an opinionated, opt-in way to prevent a node from booting if it's been reset in the past (backport #14125)
2 parents f5bf240 + c3e0472 commit a4e1bc7

File tree

5 files changed

+227
-3
lines changed

5 files changed

+227
-3
lines changed

deps/rabbit/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -265,7 +265,7 @@ PARALLEL_CT_SET_1_D = amqqueue_backward_compatibility channel_interceptor channe
265265

266266
PARALLEL_CT_SET_2_A = cluster confirms_rejects consumer_timeout rabbit_access_control rabbit_confirms rabbit_core_metrics_gc rabbit_cuttlefish rabbit_db_binding rabbit_db_exchange
267267
PARALLEL_CT_SET_2_B = clustering_recovery crashing_queues deprecated_features direct_exchange_routing_v2 disconnect_detected_during_alarm exchanges unit_gen_server2
268-
PARALLEL_CT_SET_2_C = disk_monitor dynamic_qq unit_disk_monitor unit_file_handle_cache unit_log_management unit_operator_policy
268+
PARALLEL_CT_SET_2_C = disk_monitor dynamic_qq unit_disk_monitor unit_file_handle_cache unit_log_management unit_operator_policy prevent_startup_if_node_was_reset
269269
PARALLEL_CT_SET_2_D = queue_length_limits queue_parallel quorum_queue_member_reconciliation rabbit_fifo rabbit_fifo_dlx rabbit_stream_coordinator
270270

271271
PARALLEL_CT_SET_3_A = definition_import per_user_connection_channel_limit_partitions per_vhost_connection_limit_partitions policy priority_queue_recovery rabbit_fifo_v0 rabbit_stream_sac_coordinator_v4 rabbit_stream_sac_coordinator unit_credit_flow unit_queue_consumers unit_queue_location unit_quorum_queue

deps/rabbit/ct.test.spec

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
, disk_monitor_SUITE
4444
, dynamic_qq_SUITE
4545
, exchanges_SUITE
46+
, prevent_startup_if_node_was_reset_SUITE
4647
, rabbit_stream_queue_SUITE
4748
]}.
4849

deps/rabbit/priv/schema/rabbit.schema

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1646,6 +1646,16 @@ end}.
16461646
{datatype, string}
16471647
]}.
16481648

1649+
1650+
%% Whether to check on boot if this node was already initialized before (opt-in, disabled by default).
1651+
%% When enabled, nodes will create a marker file on first startup
1652+
%% and refuse to start if the marker exists but tables are empty.
1653+
%%
1654+
1655+
{mapping, "prevent_startup_if_node_was_reset", "rabbit.prevent_startup_if_node_was_reset", [
1656+
{datatype, {enum, [true, false]}}
1657+
]}.
1658+
16491659
% ==========================
16501660
% Logging section
16511661
% ==========================

deps/rabbit/src/rabbit.erl

Lines changed: 47 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040
%% Boot steps.
4141
-export([update_cluster_tags/0, maybe_insert_default_data/0, boot_delegate/0, recover/0,
4242
pg_local_amqp_session/0,
43-
pg_local_amqp_connection/0]).
43+
pg_local_amqp_connection/0, prevent_startup_if_node_was_reset/0]).
4444

4545
-rabbit_boot_step({pre_boot, [{description, "rabbit boot start"}]}).
4646

@@ -199,10 +199,16 @@
199199
{requires, [core_initialized]},
200200
{enables, routing_ready}]}).
201201

202+
-rabbit_boot_step({prevent_startup_if_node_was_reset,
203+
[{description, "prevents node boot if a prior boot marker file exists but the database is not seeded (requires opt-in configuration in rabbitmq.conf)"},
204+
{mfa, {?MODULE, prevent_startup_if_node_was_reset, []}},
205+
{requires, recovery},
206+
{enables, empty_db_check}]}).
207+
202208
-rabbit_boot_step({empty_db_check,
203209
[{description, "empty DB check"},
204210
{mfa, {?MODULE, maybe_insert_default_data, []}},
205-
{requires, recovery},
211+
{requires, prevent_startup_if_node_was_reset},
206212
{enables, routing_ready}]}).
207213

208214

@@ -235,6 +241,7 @@
235241
{requires, [core_initialized, recovery]},
236242
{enables, routing_ready}]}).
237243

244+
238245
-rabbit_boot_step({pre_flight,
239246
[{description, "ready to communicate with peers and clients"},
240247
{requires, [core_initialized, recovery, routing_ready]}]}).
@@ -1151,6 +1158,44 @@ update_cluster_tags() ->
11511158
#{domain => ?RMQLOG_DOMAIN_GLOBAL}),
11521159
rabbit_runtime_parameters:set_global(cluster_tags, Tags, <<"internal_user">>).
11531160

1161+
1162+
%% Boot step guarding against booting a node that was initialized in the
%% past but whose database now looks empty.
%%
%% Reads `prevent_startup_if_node_was_reset' from the `rabbit' application
%% environment (defaults to `false' when unset):
%%   * `false' - logs at debug level and returns `ok'; no file is read or
%%               written.
%%   * `true'  - looks for "node_initialized.marker" under data_dir():
%%       - marker present and rabbit_table:needs_default_data() is `true':
%%         logs an error and throws
%%         {error, cluster_already_initialized_but_tables_empty};
%%       - marker present and needs_default_data() is `false': logs at
%%         info level and returns `ok';
%%       - marker absent: creates an empty marker file and returns `ok'.
%%
%% The `ok = ...' matches in the marker-creation branch crash with
%% `badmatch' if the directory or the file cannot be created.
-spec prevent_startup_if_node_was_reset() -> 'ok' | no_return().
1163+
1164+
prevent_startup_if_node_was_reset() ->
1165+
case application:get_env(rabbit, prevent_startup_if_node_was_reset, false) of
1166+
false ->
1167+
%% Feature is disabled, skip the check
1168+
?LOG_DEBUG("prevent_startup_if_node_was_reset is disabled",
1169+
#{domain => ?RMQLOG_DOMAIN_GLOBAL}),
1170+
ok;
1171+
true ->
1172+
%% Feature is enabled, perform the check
1173+
DataDir = data_dir(),
1174+
MarkerFile = filename:join(DataDir, "node_initialized.marker"),
1175+
case filelib:is_file(MarkerFile) of
1176+
true ->
1177+
%% Not the first run, check if tables need default data
1178+
case rabbit_table:needs_default_data() of
1179+
true ->
1180+
?LOG_ERROR("Node has already been initialized, but database appears empty. "
1181+
"This could indicate data loss or a split-brain scenario.",
1182+
#{domain => ?RMQLOG_DOMAIN_GLOBAL}),
1183+
throw({error, cluster_already_initialized_but_tables_empty});
1184+
false ->
1185+
?LOG_INFO("Node has already been initialized, proceeding with normal startup",
1186+
#{domain => ?RMQLOG_DOMAIN_GLOBAL}),
1187+
ok
1188+
end;
1189+
false ->
1190+
%% First time starting, create the marker file
1191+
?LOG_INFO("First node startup detected, creating initialization marker",
1192+
#{domain => ?RMQLOG_DOMAIN_GLOBAL}),
1193+
ok = filelib:ensure_dir(MarkerFile),
1194+
%% `exclusive' makes the write fail if the file already exists, so an
%% existing marker is never overwritten here.
ok = file:write_file(MarkerFile, <<>>, [exclusive]), % Empty file.
1195+
ok
1196+
end
1197+
end.
1198+
11541199
-spec maybe_insert_default_data() -> 'ok'.
11551200

11561201
maybe_insert_default_data() ->
Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
%% This Source Code Form is subject to the terms of the Mozilla Public
2+
%% License, v. 2.0. If a copy of the MPL was not distributed with this
3+
%% file, You can obtain one at https://mozilla.org/MPL/2.0/.
4+
%%
5+
%% Copyright (c) 2007-2024 Broadcom. All Rights Reserved. The term "Broadcom" refers to Broadcom Inc. and/or its subsidiaries. All rights reserved.
6+
%%
7+
8+
%% Test suite for the prevent_startup_if_node_was_reset feature.
9+
%% This feature helps detect potential data loss scenarios by maintaining
10+
%% a marker file to track if a node has been initialized before.
11+
12+
-module(prevent_startup_if_node_was_reset_SUITE).
13+
14+
-include_lib("common_test/include/ct.hrl").
15+
-include_lib("eunit/include/eunit.hrl").
16+
17+
-compile(export_all).
18+
19+
%% Common Test callback: the suite runs two groups, one per metadata
%% store (see meta_store/1 for the group name -> store mapping).
all() ->
20+
[
21+
{group, single_node_mnesia},
22+
{group, single_node_khepri}
23+
].
24+
25+
%% Common Test callback: both groups run the same two test cases, in the
%% listed order (the group property list is empty).
groups() ->
26+
[
27+
{single_node_mnesia, [], [
28+
prevent_startup_if_node_was_reset_disabled,
29+
prevent_startup_if_node_was_reset_enabled
30+
]},
31+
{single_node_khepri, [], [
32+
prevent_startup_if_node_was_reset_disabled,
33+
prevent_startup_if_node_was_reset_enabled
34+
]}
35+
].
36+
37+
%% -------------------------------------------------------------------
38+
%% Testsuite setup/teardown.
39+
%% -------------------------------------------------------------------
40+
41+
%% Suite setup: logs the test environment, then returns the result of
%% running the shared rabbit_ct_helpers setup steps on Config.
init_per_suite(Config) ->
42+
rabbit_ct_helpers:log_environment(),
43+
rabbit_ct_helpers:run_setup_steps(Config).
44+
45+
%% Suite teardown: returns the result of running the shared
%% rabbit_ct_helpers teardown steps on Config.
end_per_suite(Config) ->
46+
rabbit_ct_helpers:run_teardown_steps(Config).
47+
48+
%% Group setup: records the metadata store for this group, requests a
%% single, non-clustered node whose name suffix is the group name, then
%% runs the broker setup steps followed by the client setup steps.
init_per_group(Groupname, Config) ->
49+
Config0 = rabbit_ct_helpers:set_config(Config, [
50+
{metadata_store, meta_store(Groupname)},
51+
{rmq_nodes_clustered, false},
52+
{rmq_nodename_suffix, Groupname},
53+
{rmq_nodes_count, 1}
54+
]),
55+
rabbit_ct_helpers:run_steps(
56+
Config0,
57+
rabbit_ct_broker_helpers:setup_steps() ++
58+
rabbit_ct_client_helpers:setup_steps()
59+
).
60+
61+
%% Group teardown: runs the client teardown steps followed by the broker
%% teardown steps (the reverse of the setup order in init_per_group/2).
end_per_group(_, Config) ->
62+
rabbit_ct_helpers:run_steps(
63+
Config,
64+
rabbit_ct_client_helpers:teardown_steps() ++
65+
rabbit_ct_broker_helpers:teardown_steps()
66+
).
67+
68+
%% Per-testcase setup: notifies rabbit_ct_helpers that Testcase started.
%% The return value of testcase_started/2 is discarded and the original
%% Config is handed to the test case unchanged.
init_per_testcase(Testcase, Config) ->
69+
rabbit_ct_helpers:testcase_started(Config, Testcase),
70+
Config.
71+
72+
%% Per-testcase teardown: returns the result of
%% rabbit_ct_helpers:testcase_finished/2.
end_per_testcase(Testcase, Config) ->
73+
rabbit_ct_helpers:testcase_finished(Config, Testcase).
74+
75+
%% -------------------------------------------------------------------
76+
%% Test cases
77+
%% -------------------------------------------------------------------
78+
79+
%% With the setting disabled, no marker file exists before or after an
%% application restart, and the restart itself succeeds.
%%
%% NOTE(review): this relies on running before
%% prevent_startup_if_node_was_reset_enabled/1 in the same group, since
%% that test case creates the marker file - presumably both test cases
%% share the node (and data dir) started in init_per_group/2.
prevent_startup_if_node_was_reset_disabled(Config) ->
80+
% When feature is disabled (default), node should start normally
81+
DataDir = rabbit_ct_broker_helpers:get_node_config(Config, 0, data_dir),
82+
MarkerFile = filename:join(DataDir, "node_initialized.marker"),
83+
% Setting is disabled so no marker file should be present
84+
?assertNot(filelib:is_file(MarkerFile)),
85+
86+
% Restarting the node should work fine
87+
ok = stop_app(Config),
88+
% Explicitly set the flag to false on the node before starting the app again
set_env(Config, false),
89+
ok = start_app(Config),
90+
% Still no marker file
91+
?assertNot(filelib:is_file(MarkerFile)),
92+
ok.
93+
94+
%% With the setting enabled:
%%   1. the first application start creates the marker file;
%%   2. a restart with marker and schema both present succeeds;
%%   3. after the schema file is deleted while the app is stopped, the next
%%      start is expected to fail with {error, 69, _}.
%%
%% NOTE(review): 69 is presumably the exit code reported by
%% `rabbitmqctl start_app' through rabbit_ct_broker_helpers:rabbitmqctl/3;
%% confirm against that helper.
prevent_startup_if_node_was_reset_enabled(Config) ->
95+
DataDir = rabbit_ct_broker_helpers:get_node_config(Config, 0, data_dir),
96+
MarkerFile = filename:join(DataDir, "node_initialized.marker"),
97+
98+
ok = stop_app(Config),
99+
set_env(Config, true),
100+
ok = start_app(Config),
101+
% Setting is enabled so marker file should be present after initial startup
102+
?assert(filelib:is_file(MarkerFile)),
103+
104+
% Restarting the node should be fine, as there is a marker file
105+
% and corresponding schema data (consistent state)
106+
107+
ok = stop_app(Config),
108+
ok = start_app(Config),
109+
110+
SchemaFile = schema_file(Config),
111+
112+
?assert(filelib:is_file(MarkerFile)),
113+
114+
% Stop the node and remove the present schema to simulate data loss
115+
ok = stop_app(Config),
116+
% Assert the deletion: if the schema file is missing or cannot be removed,
% fail here rather than in the less obvious start_app assertion below.
ok = file:delete(SchemaFile),
117+
% Node should fail to start because marker exists but schema is missing,
118+
% indicating potential data loss or corruption
119+
?assertMatch(
120+
{error, 69, _},
121+
start_app(Config)
122+
),
123+
ok.
124+
125+
%% -------------------------------------------------------------------
126+
%% Internal helpers
127+
%% -------------------------------------------------------------------
128+
129+
%% Runs `rabbitmqctl stop_app' against node 0.
%% Returns `ok' when the helper returns {ok, _}; any other result is
%% returned to the caller unchanged.
stop_app(Config) ->
130+
Node = rabbit_ct_broker_helpers:get_node_config(Config, 0, nodename),
131+
case rabbit_ct_broker_helpers:rabbitmqctl(Config, Node, ["stop_app"]) of
132+
{ok, _} -> ok;
133+
Error -> Error
134+
end.
135+
136+
%% Runs `rabbitmqctl start_app' against node 0.
%% Returns `ok' when the helper returns {ok, _}; any other result is
%% returned to the caller unchanged, which lets test cases match on the
%% failure term.
start_app(Config) ->
137+
Node = rabbit_ct_broker_helpers:get_node_config(Config, 0, nodename),
138+
case rabbit_ct_broker_helpers:rabbitmqctl(Config, Node, ["start_app"]) of
139+
{ok, _} -> ok;
140+
Error -> Error
141+
end.
142+
143+
%% Merges {prevent_startup_if_node_was_reset, true} into the `rabbit'
%% application environment of Config when the second argument is the
%% `prevent_startup_if_node_was_reset_enabled' test case name; for any
%% other value Config is returned unchanged.
%%
%% NOTE(review): no callback or test case in this suite calls this helper;
%% the test cases toggle the flag at runtime through set_env/2 instead.
maybe_enable_prevent_startup_if_node_was_reset(Config, prevent_startup_if_node_was_reset_enabled) ->
144+
rabbit_ct_helpers:merge_app_env(
145+
Config, {rabbit, [{prevent_startup_if_node_was_reset, true}]}
146+
);
147+
maybe_enable_prevent_startup_if_node_was_reset(Config, _) ->
148+
Config.
149+
150+
%% Maps a group name to the metadata store it exercises. Any other group
%% name fails with `function_clause'.
meta_store(single_node_mnesia) ->
151+
mnesia;
152+
meta_store(single_node_khepri) ->
153+
khepri.
154+
155+
%% Returns the path of the file the enabled test case deletes to simulate
%% data loss, chosen by the `metadata_store' value stored in Config:
%%   * mnesia - "schema.DAT" directly under node 0's data dir;
%%   * khepri - "names.dets" under "coordination/<nodename>" in the data dir.
%% NOTE(review): presumably removing these files makes the respective
%% store come up empty on the next start - confirm for each store.
schema_file(Config) ->
156+
DataDir = rabbit_ct_broker_helpers:get_node_config(Config, 0, data_dir),
157+
MetaStore = rabbit_ct_helpers:get_config(Config, metadata_store),
158+
case MetaStore of
159+
mnesia ->
160+
filename:join(DataDir, "schema.DAT");
161+
khepri ->
162+
NodeName = rabbit_ct_broker_helpers:get_node_config(Config, 0, nodename),
163+
filename:join([DataDir, "coordination", NodeName, "names.dets"])
164+
end.
165+
166+
%% Sets `prevent_startup_if_node_was_reset' to Bool in the `rabbit'
%% application environment of node 0 via rpc. The `ok =' match crashes the
%% test case if the call returns anything else (e.g. {badrpc, _}).
set_env(Config, Bool) ->
167+
Node = rabbit_ct_broker_helpers:get_node_config(Config, 0, nodename),
168+
ok = rpc:call(Node, application, set_env, [rabbit, prevent_startup_if_node_was_reset, Bool]).

0 commit comments

Comments
 (0)