Skip to content

Commit 8029e6d

Browse files
committed
opal/mca/ofi: select NIC closest to accelerator if requested
When an accelerator is requested, select the NIC closest to the accelerator device. If the accelerator or NIC PCI information is not available, fall back to selecting the NIC on the closest package. Signed-off-by: Wenduo Wang <[email protected]>
1 parent 18a7064 commit 8029e6d

File tree

1 file changed

+198
-9
lines changed

1 file changed

+198
-9
lines changed

opal/mca/common/ofi/common_ofi.c

Lines changed: 198 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030

3131
#include "common_ofi.h"
3232
#include "opal/constants.h"
33+
#include "opal/mca/accelerator/accelerator.h"
3334
#include "opal/mca/base/mca_base_framework.h"
3435
#include "opal/mca/base/mca_base_var.h"
3536
#include "opal/mca/hwloc/base/base.h"
@@ -38,6 +39,7 @@
3839
#include "opal/util/argv.h"
3940
#include "opal/util/show_help.h"
4041

42+
/* Accelerator module instance; this file calls its get_device() and
 * get_device_pci_attr() entry points when looking for the NIC nearest to the accelerator. */
extern opal_accelerator_base_module_t opal_accelerator;
4143
/* Shared state of the common OFI component: the provider include/exclude lists start out
 * unset (NULL), and the .output field (passed to opal_output_verbose() in this file)
 * starts at -1. */
opal_common_ofi_module_t opal_common_ofi = {.prov_include = NULL,
4244
.prov_exclude = NULL,
4345
.output = -1};
@@ -446,6 +448,18 @@ static int check_provider_attr(struct fi_info *provider_info, struct fi_info *pr
446448
}
447449
}
448450

451+
#if OPAL_OFI_PCI_DATA_AVAILABLE
452+
static int get_provider_nic_pci(struct fi_info *provider, struct fi_pci_attr *pci)
453+
{
454+
if (NULL != provider->nic && NULL != provider->nic->bus_attr
455+
&& provider->nic->bus_attr->bus_type == FI_BUS_PCI) {
456+
*pci = provider->nic->bus_attr->attr.pci;
457+
return OPAL_SUCCESS;
458+
}
459+
return OPAL_ERR_NOT_AVAILABLE;
460+
}
461+
#endif /* OPAL_OFI_PCI_DATA_AVAILABLE */
462+
449463
/**
450464
* Calculate device distances
451465
*
@@ -784,6 +798,165 @@ static uint32_t get_package_rank(opal_process_info_t *process_info)
784798
return (uint32_t) process_info->myprocid.rank;
785799
}
786800

801+
/**
 * @brief Compute a non-negative depth for an hwloc object.
 *
 * If the object's own depth is non-negative it is returned as is. Otherwise the parent
 * chain is walked upwards until an ancestor with a non-negative depth is found, and the
 * result is that ancestor's depth plus the number of parent links followed to reach it.
 *
 * @param[in]  obj    object to inspect (a NULL object yields OPAL_ERROR)
 * @param[out] depth  receives the computed depth on success; untouched on failure
 * @return OPAL_SUCCESS if a depth was computed
 *         OPAL_ERROR if neither the object nor any of its ancestors has a non-negative depth
 */
static int get_obj_depth(hwloc_obj_t obj, int *depth)
{
    int depth_from_obj = 0;

    /*
     * For hwloc < 2.0, depth is unsigned type, but it could store a negative value, hence
     * the casts to int. The cursor must advance to its OWN parent on every iteration;
     * re-reading obj->parent would spin forever when the first parent also has a negative
     * depth.
     */
    for (hwloc_obj_t cur = obj; NULL != cur; cur = cur->parent, ++depth_from_obj) {
        if (0 <= (int) cur->depth) {
            *depth = (int) cur->depth + depth_from_obj;
            return OPAL_SUCCESS;
        }
    }

    return OPAL_ERROR;
}
824+
825+
#if OPAL_OFI_PCI_DATA_AVAILABLE
826+
/**
827+
* @brief Attempt to find a nearest provider from the accelerator.
828+
* Check if opal_accelerator is initialized with a valid PCI device, and find a provider from the
829+
* shortest distance.
830+
* Special cases:
831+
* 1. If not accelerator device is available, returns OPAL_ERR_NOT_AVAILABLE.
832+
* 2. If the provider does not have PCI attributers, we do not attempt to make a selection, and
833+
* return OPAL_ERR_NOT_AVAILABLE.
834+
* 3. If there are more than 1 providers with the same equal distance, break the tie using a modulo
835+
* i.e. (local rank on the same accelerator) % (number of nearest providers)
836+
* @param[in] provider_list linked list of providers
837+
* @param[in] num_providers number of providers
838+
* @param[in] device_rank local rank on the accelerator
839+
* @param[out] provider pointer to the selected provider
840+
* @return OPAL_SUCCESS if a provider is successfully selected
841+
* OPAL_ERR_NOT_AVAILABLE if a provider cannot be decided deterministically
842+
* OPAL_ERROR if a fatal error happened
843+
*/
844+
static int find_nearest_provider_from_accelerator(struct fi_info *provider_list,
845+
size_t num_providers, uint32_t device_rank,
846+
struct fi_info **provider)
847+
{
848+
hwloc_obj_t accl_dev = NULL, prov_dev = NULL, common_ancestor = NULL;
849+
int ret = -1, accl_id = -1, depth = -1, max_common_ancestor_depth = -1;
850+
opal_accelerator_pci_attr_t accl_pci_attr = {0};
851+
struct fi_info *current_provider = NULL;
852+
struct fi_pci_attr pci = {0};
853+
uint32_t near_provider_count = 0, provider_rank = 0;
854+
uint32_t distances[num_providers], *distance = distances;
855+
856+
memset(distances, 0, sizeof(distances));
857+
858+
ret = opal_accelerator.get_device(&accl_id);
859+
if (OPAL_SUCCESS != ret) {
860+
opal_output_verbose(1, opal_common_ofi.output, "%s:%d:Accelerator is not available",
861+
__FILE__, __LINE__);
862+
return OPAL_ERR_NOT_AVAILABLE;
863+
}
864+
865+
ret = opal_accelerator.get_device_pci_attr(accl_id, &accl_pci_attr);
866+
if (OPAL_SUCCESS != ret) {
867+
opal_output_verbose(1, opal_common_ofi.output,
868+
"%s:%d:Accelerator PCI info is not available", __FILE__, __LINE__);
869+
return OPAL_ERROR;
870+
}
871+
872+
accl_dev = hwloc_get_pcidev_by_busid(opal_hwloc_topology, accl_pci_attr.domain_id,
873+
accl_pci_attr.bus_id, accl_pci_attr.device_id,
874+
accl_pci_attr.function_id);
875+
if (NULL == accl_dev) {
876+
opal_output_verbose(1, opal_common_ofi.output,
877+
"%s:%d:Failed to find accelerator PCI device", __FILE__, __LINE__);
878+
return OPAL_ERROR;
879+
}
880+
881+
opal_output_verbose(1, opal_common_ofi.output,
882+
"%s:%d:Found accelerator device %d: %04x:%02x:%02x.%x VID: %x DID: %x",
883+
__FILE__, __LINE__, accl_id, accl_pci_attr.domain_id, accl_pci_attr.bus_id,
884+
accl_pci_attr.device_id, accl_pci_attr.function_id,
885+
accl_dev->attr->pcidev.vendor_id, accl_dev->attr->pcidev.device_id);
886+
887+
current_provider = provider_list;
888+
while (NULL != current_provider) {
889+
common_ancestor = NULL;
890+
if (0 == check_provider_attr(provider_list, current_provider)
891+
&& OPAL_SUCCESS == get_provider_nic_pci(current_provider, &pci)) {
892+
prov_dev = hwloc_get_pcidev_by_busid(opal_hwloc_topology, pci.domain_id, pci.bus_id,
893+
pci.device_id, pci.function_id);
894+
if (NULL == prov_dev) {
895+
opal_output_verbose(1, opal_common_ofi.output,
896+
"%s:%d:Failed to find provider PCI device", __FILE__, __LINE__);
897+
return OPAL_ERROR;
898+
}
899+
900+
common_ancestor = hwloc_get_common_ancestor_obj(opal_hwloc_topology, accl_dev,
901+
prov_dev);
902+
if (!common_ancestor) {
903+
opal_output_verbose(
904+
1, opal_common_ofi.output,
905+
"%s:%d:Failed to find common ancestor of accelerator and provider PCI device",
906+
__FILE__, __LINE__);
907+
/**
908+
* Return error because any 2 PCI devices should share at least one common ancestor,
909+
* i.e. root
910+
*/
911+
return OPAL_ERROR;
912+
}
913+
914+
ret = get_obj_depth(common_ancestor, &depth);
915+
if (OPAL_SUCCESS != ret) {
916+
opal_output_verbose(1, opal_common_ofi.output,
917+
"%s:%d:Failed to get common ancestor depth", __FILE__,
918+
__LINE__);
919+
return OPAL_ERROR;
920+
}
921+
922+
if (max_common_ancestor_depth < depth) {
923+
max_common_ancestor_depth = depth;
924+
near_provider_count = 1;
925+
} else if (max_common_ancestor_depth == depth) {
926+
++near_provider_count;
927+
}
928+
}
929+
930+
*(distance++) = !common_ancestor ? 0 : depth;
931+
current_provider = current_provider->next;
932+
}
933+
934+
if (0 == near_provider_count || 0 > max_common_ancestor_depth) {
935+
opal_output_verbose(1, opal_common_ofi.output, "%s:%d:Provider does not have PCI device",
936+
__FILE__, __LINE__);
937+
return OPAL_ERR_NOT_AVAILABLE;
938+
}
939+
940+
provider_rank = device_rank % near_provider_count;
941+
942+
distance = distances;
943+
current_provider = provider_list;
944+
while (NULL != current_provider) {
945+
if (max_common_ancestor_depth == *(distance++) && provider_rank == --near_provider_count) {
946+
*provider = current_provider;
947+
return OPAL_SUCCESS;
948+
}
949+
950+
current_provider = current_provider->next;
951+
}
952+
953+
assert(0 == near_provider_count);
954+
955+
return OPAL_ERROR;
956+
}
957+
#endif /* OPAL_OFI_PCI_DATA_AVAILABLE */
958+
959+
787960
struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
788961
opal_process_info_t *process_info)
789962
{
@@ -809,8 +982,29 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
809982
__FILE__, __LINE__);
810983
}
811984

985+
/* Current process' local rank on the same package(socket) */
986+
package_rank = get_package_rank(process_info);
812987
provider_limit = count_providers(provider_list);
813988

989+
#if OPAL_OFI_PCI_DATA_AVAILABLE
990+
/**
991+
* If accelerator is enabled, select the closest provider to the accelerator.
992+
* Note: the function expects a local rank on the accelerator to break ties if there are
993+
* multiple equidistant providers. package_rank is NOT an accurate measure, but a proxy.
994+
*/
995+
ret = find_nearest_provider_from_accelerator(provider_list, provider_limit, package_rank,
996+
&provider);
997+
if (!ret)
998+
return provider;
999+
1000+
if (OPAL_ERR_NOT_AVAILABLE != ret) {
1001+
opal_output_verbose(1, opal_common_ofi.output,
1002+
"%s:%d:Failed to find a provider close to the accelerator. Error: %d",
1003+
__FILE__, __LINE__, ret);
1004+
return provider_list;
1005+
}
1006+
#endif /* OPAL_OFI_PCI_DATA_AVAILABLE */
1007+
8141008
/* Allocate memory for provider table */
8151009
provider_table = calloc(provider_limit, sizeof(struct fi_info *));
8161010
if (NULL == provider_table) {
@@ -827,20 +1021,15 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
8271021
distances = get_nearest_nics(&num_distances, &pmix_val);
8281022
#endif
8291023

830-
current_provider = provider;
1024+
current_provider = provider_list;
8311025

8321026
/* Cycle through remaining fi_info objects, looking for alike providers */
8331027
while (NULL != current_provider) {
834-
if (!check_provider_attr(provider, current_provider)) {
1028+
if (!check_provider_attr(provider_list, current_provider)) {
8351029
near = false;
8361030
#if OPAL_OFI_PCI_DATA_AVAILABLE
837-
if (NULL != current_provider->nic
838-
&& NULL != current_provider->nic->bus_attr
839-
&& current_provider->nic->bus_attr->bus_type == FI_BUS_PCI) {
840-
pci = current_provider->nic->bus_attr->attr.pci;
841-
near = is_near(distances, num_distances,
842-
opal_hwloc_topology, pci);
843-
}
1031+
if (OPAL_SUCCESS == get_provider_nic_pci(current_provider, &pci))
1032+
near = is_near(distances, num_distances, opal_hwloc_topology, pci);
8441033
#endif
8451034
/* We could have multiple near providers */
8461035
if (near && !provider_found) {

0 commit comments

Comments
 (0)