Skip to content

Commit 92d7cea

Browse files
authored
Merge pull request #161 from hpsim/fix/multinode-issues
Fix/multinode issues
2 parents eb16448 + 816e288 commit 92d7cea

10 files changed

+93
-30
lines changed

include/OGL/DevicePersistent/ExecutorHandler.hpp

+12-4
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,13 @@ struct DeviceIdHandler {
3131
FatalErrorInFunction << "Only parallel runs are supported for OGL"
3232
<< exit(FatalError);
3333
}
34+
35+
if (Pstream::nProcs(0) % ranks_per_gpu != 0) {
36+
FatalErrorInFunction
37+
<< " Total number of ranks = " << Pstream::nProcs(0)
38+
<< " is not a multiple of "
39+
<< " ranksPerGPU " << ranks_per_gpu << exit(FatalError);
40+
}
3441
}
3542

3643
/* @brief compute the local device id
@@ -102,7 +109,6 @@ struct ExecutorInitFunctor {
102109
{
103110
auto host_exec = gko::share(gko::ReferenceExecutor::create());
104111

105-
106112
auto msg = [](auto exec, auto id) {
107113
std::string s;
108114
// auto node_comm = Pstream::commInterHost();
@@ -111,16 +117,17 @@ struct ExecutorInitFunctor {
111117
label global_ranks = Pstream::nProcs(0);
112118
label device_ranks = Pstream::nProcs(node_comm);
113119
label node_id = global_ranks / device_ranks;
120+
114121
// Pstream::barrier(0);
115122
// sleep(0.03 * global_rank);
116123
s += std::string("Create ") + std::string(exec) +
117124
std::string(" executor device ") + std::to_string(id) +
118125
std::string(" node ") + std::to_string(node_id) +
119126
std::string(" local rank [") +
120127
std::to_string(Pstream::myProcNo(node_comm)) +
121-
std::string("/") + std::to_string(device_ranks) +
128+
std::string("/") + std::to_string(device_ranks - 1) +
122129
std::string("] global rank [") + std::to_string(global_rank) +
123-
std::string("/") + std::to_string(global_ranks) +
130+
std::string("/") + std::to_string(global_ranks - 1) +
124131
std::string("]");
125132
return s;
126133
};
@@ -135,7 +142,8 @@ struct ExecutorInitFunctor {
135142
label id = device_id_handler_.compute_device_id(
136143
gko::CudaExecutor::get_num_devices());
137144
LOG_0(verbose_, msg(executor_name_, id))
138-
return gko::share(gko::CudaExecutor::create(id, host_exec));
145+
auto ret = gko::share(gko::CudaExecutor::create(id, host_exec));
146+
return ret;
139147
}
140148
if (executor_name_ == "sycl" || executor_name_ == "dpcpp") {
141149
if (version.dpcpp_version.tag == not_compiled_tag) {

include/OGL/MatrixWrapper/HostMatrix.hpp

+4
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,8 @@ class HostMatrixWrapper {
8181

8282
const word field_name_;
8383

84+
const word folder_;
85+
8486
// Whether the matrix coefficients should be reordered
8587
// during copy or on device
8688
const bool reorder_on_copy_;
@@ -268,6 +270,8 @@ class HostMatrixWrapper {
268270
/* @brief returns a const reference to the executor handler member exec_ */
const ExecutorHandler &get_exec_handler() const { return exec_; }
269271

270272
/* @brief returns a copy of the field name stored in field_name_ */
const word get_field_name() const { return field_name_; }
273+
274+
/* @brief returns a copy of the folder name stored in folder_ */
const word get_folder() const { return folder_; }
271275
};
272276

273277

include/OGL/MatrixWrapper/SparsityPattern.hpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ std::vector<label> sort_permutation(const std::vector<T> &vec, Compare compare)
4949
*to
5050
*/
5151
std::pair<std::vector<std::vector<label>>, std::vector<label>> compress_cols(
52-
std::vector<std::vector<label>> in, std::vector<label> comm_id);
52+
std::vector<std::vector<label>> in, std::vector<label> comm_rank);
5353
} // namespace detail
5454

5555
namespace Foam {

include/OGL/Preconditioner.hpp

+8-8
Original file line numberDiff line numberDiff line change
@@ -302,14 +302,6 @@ class Preconditioner {
302302
.with_max_block_size(static_cast<gko::uint32>(1))
303303
.on(device_exec));
304304

305-
auto coarsest_gen = gko::share(
306-
cg::build()
307-
.with_preconditioner(ras::build()
308-
.with_local_solver(pre_factory)
309-
.on(device_exec))
310-
.with_criteria(gko::stop::Iteration::build().with_max_iters(
311-
static_cast<gko::uint32>(coarse_solver_iters)))
312-
.on(device_exec));
313305

314306
word msg =
315307
"Generate preconditioner: " + name +
@@ -361,6 +353,14 @@ class Preconditioner {
361353
}
362354

363355
if (type == "Distributed") {
356+
auto coarsest_gen = gko::share(
357+
cg::build()
358+
.with_preconditioner(
359+
ras::build().with_local_solver(pre_factory))
360+
.with_criteria(
361+
gko::stop::Iteration::build().with_max_iters(
362+
static_cast<gko::uint32>(coarse_solver_iters)))
363+
.on(device_exec));
364364
auto gkodistmatrix =
365365
gko::as<RepartDistMatrix>(gkomatrix)->get_dist_matrix();
366366
auto smoother_gen = gko::share(

include/OGL/lduLduBase.hpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -272,9 +272,10 @@ class lduLduBase : public OGL_Info,
272272
auto precond = this->init_preconditioner(
273273
dist_A_v, exec_handler_.get_device_exec());)
274274

275+
bool active = repartitioner->get_repart_size() != 0;
275276
bool export_system(
276277
solver_controls_.lookupOrDefault<Switch>("export", false));
277-
if (export_system && db_.time().writeTime()) {
278+
if (export_system && db_.time().writeTime() && active) {
278279
bool write_global(
279280
solver_controls_.lookupOrDefault<Switch>("writeGlobal", true));
280281
LOG_0(verbose_, "Export system")
@@ -293,7 +294,6 @@ class lduLduBase : public OGL_Info,
293294
LOG_1(verbose_, "done create solver")
294295

295296
// solve only on active rank
296-
bool active = repartitioner->get_repart_size() != 0;
297297
label delta_t_solve_ = 0;
298298
bool split_mpi_comm =
299299
solver_controls_.lookupOrDefault<Switch>("splitMPIComm", true);

src/CommunicationPattern.cpp

+5-3
Original file line numberDiff line numberDiff line change
@@ -280,15 +280,17 @@ std::ostream &operator<<(std::ostream &out, const CommunicationPattern &e)
280280
}
281281

282282

283+
// computes a flat vector of all send idxs by concatenating the per-target
// lists in send_idxs in order
283284
std::vector<label> CommunicationPattern::total_rank_send_idx() const
284285
{
285-
std::vector<label> tmp;
286+
// flatten and return
287+
std::vector<label> ret;
286288

287289
for (auto &rows : send_idxs) {
288-
tmp.insert(tmp.end(), rows.begin(), rows.end());
290+
ret.insert(ret.end(), rows.begin(), rows.end());
289291
}
290292

291-
return tmp;
293+
return ret;
292294
}
293295

294296

src/MatrixWrapper/Distributed.cpp

+13
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
// SPDX-License-Identifier: GPL-3.0-or-later
44

55
#include "OGL/MatrixWrapper/Distributed.hpp"
6+
#include <fstream>
67

78
/* helper function to convert a sparsity pattern into a vector of linops with
89
* zero coefficients
@@ -169,6 +170,7 @@ void RepartDistMatrix::write(const ExecutorHandler &exec_handler,
169170
auto non_local = gko::share(
170171
gko::matrix::Coo<scalar, label>::create(exec_handler.get_ref_exec()));
171172

173+
172174
if (fuse_) {
173175
gko::as<LocalMatrixType>(dist_mtx_->get_local_matrix())
174176
->convert_to(local.get());
@@ -461,6 +463,17 @@ std::shared_ptr<RepartDistMatrix> create_impl(
461463
auto [send_counts, send_offsets, recv_sizes, recv_offsets] =
462464
repart_comm_pattern->send_recv_pattern();
463465

466+
if (verbose > 1){
467+
std::ofstream myfile;
468+
std::string folder = host_A->get_folder();
469+
myfile.open(folder + "/host_comm_pattern_" +
470+
std::to_string(Pstream::myProcNo()));
471+
myfile << "repart_comm_pattern " << *repart_comm_pattern.get() << "\n";
472+
myfile << "\nrecv_gather_idxs: " << convert_to_vector(recv_gather_idxs)
473+
<< "\nrecv_sizes: " << recv_sizes
474+
<< "\nrecv_offsets: " << recv_offsets << "\n";
475+
}
476+
464477
if (fuse) {
465478
dist_A = gko::share(dist_mtx::create(
466479
device_exec, device_comm, global_dim, local_linops[0],

src/MatrixWrapper/HostMatrix.cpp

+5-1
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ HostMatrixWrapper::HostMatrixWrapper(
3535
device_id_guard_{db, fieldName, exec_.get_device_exec()},
3636
verbose_(verbose),
3737
field_name_(fieldName),
38+
folder_(db.time().rootPath()),
3839
reorder_on_copy_(
3940
solverControls.lookupOrDefault<Switch>("reorderOnHost", true)),
4041
addr_(addr),
@@ -168,7 +169,10 @@ HostMatrixWrapper::create_communication_pattern() const
168169
// create index_sets
169170
std::vector<std::vector<label>> send_idxs;
170171
for (label proc : target_ids) {
171-
send_idxs.emplace_back(interface_cell_map[proc]);
172+
auto send_idx = interface_cell_map[proc];
173+
// comm pattern send idxs need to be in order
174+
std::stable_sort(send_idx.begin(), send_idx.end());
175+
send_idxs.emplace_back(send_idx);
172176
}
173177

174178
return std::make_shared<CommunicationPattern>(get_exec_handler(),

src/MatrixWrapper/SparsityPattern.cpp

+33-8
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,12 @@ namespace detail {
88

99

1010
std::pair<std::vector<std::vector<label>>, std::vector<label>> compress_cols(
11-
std::vector<std::vector<label>> in, std::vector<label> comm_id)
11+
std::vector<std::vector<label>> in, std::vector<label> comm_rank)
1212
{
13-
// create a sorting map based on the comm ids
14-
// here the ids with higher id should receive data first
15-
auto id_permutation =
16-
sort_permutation(comm_id, [](label a, label b) { return a < b; });
13+
// create a sorting map based on the comm ranks
14+
// here data from lower ranks is received first
15+
auto comm_permutation =
16+
sort_permutation(comm_rank, [](label a, label b) { return a < b; });
1717
std::map<label, label> col_map;
1818

1919
std::vector<label> global_cols;
@@ -23,10 +23,35 @@ std::pair<std::vector<std::vector<label>>, std::vector<label>> compress_cols(
2323
}
2424
}
2525

26-
label ctr = 0;
26+
27+
// interfaces that belong to the same rank need to be fused
28+
// before sorting the interface,
29+
// because we send indices sorted per interface
30+
std::vector<std::vector<label>> fused_in;
31+
label prev = -1;
32+
for (auto id : comm_permutation) {
33+
if (comm_rank[id] == prev) {
34+
auto &back = fused_in.back();
35+
for (auto col : in[id]) {
36+
back.push_back(col);
37+
}
38+
} else {
39+
std::vector<label> ins;
40+
for (auto col : in[id]) {
41+
ins.push_back(col);
42+
}
43+
fused_in.push_back(ins);
44+
prev = comm_rank[id];
45+
}
46+
}
47+
48+
2749
// iterate in the order of communication ranks
28-
for (auto id : id_permutation) {
29-
auto &cols = in[id];
50+
label ctr = 0;
51+
for (auto cols : fused_in) {
52+
// sort by global id because this is the order in which they are sent
53+
std::stable_sort(cols.begin(), cols.end());
54+
// based on global col we compute the compressed recv ctr
3055
for (auto col : cols) {
3156
// new element found
3257
if (col_map.find(col) == col_map.end()) {

src/Repartitioner.cpp

+10-3
Original file line numberDiff line numberDiff line change
@@ -150,9 +150,11 @@ Repartitioner::repartition_comm_pattern(
150150
const ExecutorHandler &exec_handler,
151151
std::shared_ptr<const CommunicationPattern> src_comm_pattern) const
152152
{
153-
if (ranks_per_gpu_ == 1) {
154-
return src_comm_pattern;
155-
}
153+
// TODO: add early return again
154+
// and just sort send_idxs
155+
// if (ranks_per_gpu_ == 1) {
156+
// return src_comm_pattern;
157+
// }
156158

157159
// using comm_size_type = label;
158160
auto exec = exec_handler.get_ref_exec();
@@ -326,6 +328,11 @@ Repartitioner::repartition_comm_pattern(
326328
}
327329
}
328330

331+
// sort the send_idxs so that we send ordered dofs per interface
332+
for (auto &iface_idxs : merged_send_idxs) {
333+
std::stable_sort(iface_idxs.begin(), iface_idxs.end());
334+
}
335+
329336
// recompute send_idxs
330337
send_idxs.clear();
331338

0 commit comments

Comments
 (0)