30
30
31
31
#include "common_ofi.h"
32
32
#include "opal/constants.h"
33
+ #include "opal/mca/accelerator/accelerator.h"
33
34
#include "opal/mca/base/mca_base_framework.h"
34
35
#include "opal/mca/base/mca_base_var.h"
35
36
#include "opal/mca/hwloc/base/base.h"
38
39
#include "opal/util/argv.h"
39
40
#include "opal/util/show_help.h"
40
41
42
+ extern opal_accelerator_base_module_t opal_accelerator ;
41
43
/*
 * File-wide state of the common OFI module.  The provider include/exclude
 * lists start out unset (NULL) and the output handle starts at -1; `output`
 * is the handle the functions in this file pass to opal_output_verbose().
 */
opal_common_ofi_module_t opal_common_ofi = {.prov_include = NULL ,
42
44
.prov_exclude = NULL ,
43
45
.output = -1 };
@@ -446,6 +448,18 @@ static int check_provider_attr(struct fi_info *provider_info, struct fi_info *pr
446
448
}
447
449
}
448
450
451
+ #if OPAL_OFI_PCI_DATA_AVAILABLE
452
+ static int get_provider_nic_pci (struct fi_info * provider , struct fi_pci_attr * pci )
453
+ {
454
+ if (NULL != provider -> nic && NULL != provider -> nic -> bus_attr
455
+ && provider -> nic -> bus_attr -> bus_type == FI_BUS_PCI ) {
456
+ * pci = provider -> nic -> bus_attr -> attr .pci ;
457
+ return OPAL_SUCCESS ;
458
+ }
459
+ return OPAL_ERR_NOT_AVAILABLE ;
460
+ }
461
+ #endif /* OPAL_OFI_PCI_DATA_AVAILABLE */
462
+
449
463
/**
450
464
* Calculate device distances
451
465
*
@@ -784,6 +798,165 @@ static uint32_t get_package_rank(opal_process_info_t *process_info)
784
798
return (uint32_t ) process_info -> myprocid .rank ;
785
799
}
786
800
801
+ static int get_obj_depth (hwloc_obj_t obj , int * depth )
802
+ {
803
+ hwloc_obj_t parent = NULL ;
804
+ int depth_from_obj = 0 ;
805
+
806
+ /* For hwloc < 2.0, depth is unsigned type, but it could store a negative value */
807
+ if (0 <= (int ) obj -> depth ) {
808
+ * depth = obj -> depth ;
809
+ return OPAL_SUCCESS ;
810
+ }
811
+
812
+ parent = obj -> parent ;
813
+ while (parent ) {
814
+ ++ depth_from_obj ;
815
+ if (0 <= (int ) parent -> depth ) {
816
+ * depth = parent -> depth + depth_from_obj ;
817
+ return OPAL_SUCCESS ;
818
+ }
819
+ parent = obj -> parent ;
820
+ }
821
+
822
+ return OPAL_ERROR ;
823
+ }
824
+
825
+ #if OPAL_OFI_PCI_DATA_AVAILABLE
826
+ /**
827
+ * @brief Attempt to find a nearest provider from the accelerator.
828
+ * Check if opal_accelerator is initialized with a valid PCI device, and find a provider from the
829
+ * shortest distance.
830
+ * Special cases:
831
+ * 1. If not accelerator device is available, returns OPAL_ERR_NOT_AVAILABLE.
832
+ * 2. If the provider does not have PCI attributers, we do not attempt to make a selection, and
833
+ * return OPAL_ERR_NOT_AVAILABLE.
834
+ * 3. If there are more than 1 providers with the same equal distance, break the tie using a modulo
835
+ * i.e. (local rank on the same accelerator) % (number of nearest providers)
836
+ * @param[in] provider_list linked list of providers
837
+ * @param[in] num_providers number of providers
838
+ * @param[in] device_rank local rank on the accelerator
839
+ * @param[out] provider pointer to the selected provider
840
+ * @return OPAL_SUCCESS if a provider is successfully selected
841
+ * OPAL_ERR_NOT_AVAILABLE if a provider cannot be decided deterministically
842
+ * OPAL_ERROR if a fatal error happened
843
+ */
844
+ static int find_nearest_provider_from_accelerator (struct fi_info * provider_list ,
845
+ size_t num_providers , uint32_t device_rank ,
846
+ struct fi_info * * provider )
847
+ {
848
+ hwloc_obj_t accl_dev = NULL , prov_dev = NULL , common_ancestor = NULL ;
849
+ int ret = -1 , accl_id = -1 , depth = -1 , max_common_ancestor_depth = -1 ;
850
+ opal_accelerator_pci_attr_t accl_pci_attr = {0 };
851
+ struct fi_info * current_provider = NULL ;
852
+ struct fi_pci_attr pci = {0 };
853
+ uint32_t near_provider_count = 0 , provider_rank = 0 ;
854
+ uint32_t distances [num_providers ], * distance = distances ;
855
+
856
+ memset (distances , 0 , sizeof (distances ));
857
+
858
+ ret = opal_accelerator .get_device (& accl_id );
859
+ if (OPAL_SUCCESS != ret ) {
860
+ opal_output_verbose (1 , opal_common_ofi .output , "%s:%d:Accelerator is not available" ,
861
+ __FILE__ , __LINE__ );
862
+ return OPAL_ERR_NOT_AVAILABLE ;
863
+ }
864
+
865
+ ret = opal_accelerator .get_device_pci_attr (accl_id , & accl_pci_attr );
866
+ if (OPAL_SUCCESS != ret ) {
867
+ opal_output_verbose (1 , opal_common_ofi .output ,
868
+ "%s:%d:Accelerator PCI info is not available" , __FILE__ , __LINE__ );
869
+ return OPAL_ERROR ;
870
+ }
871
+
872
+ accl_dev = hwloc_get_pcidev_by_busid (opal_hwloc_topology , accl_pci_attr .domain_id ,
873
+ accl_pci_attr .bus_id , accl_pci_attr .device_id ,
874
+ accl_pci_attr .function_id );
875
+ if (NULL == accl_dev ) {
876
+ opal_output_verbose (1 , opal_common_ofi .output ,
877
+ "%s:%d:Failed to find accelerator PCI device" , __FILE__ , __LINE__ );
878
+ return OPAL_ERROR ;
879
+ }
880
+
881
+ opal_output_verbose (1 , opal_common_ofi .output ,
882
+ "%s:%d:Found accelerator device %d: %04x:%02x:%02x.%x VID: %x DID: %x" ,
883
+ __FILE__ , __LINE__ , accl_id , accl_pci_attr .domain_id , accl_pci_attr .bus_id ,
884
+ accl_pci_attr .device_id , accl_pci_attr .function_id ,
885
+ accl_dev -> attr -> pcidev .vendor_id , accl_dev -> attr -> pcidev .device_id );
886
+
887
+ current_provider = provider_list ;
888
+ while (NULL != current_provider ) {
889
+ common_ancestor = NULL ;
890
+ if (0 == check_provider_attr (provider_list , current_provider )
891
+ && OPAL_SUCCESS == get_provider_nic_pci (current_provider , & pci )) {
892
+ prov_dev = hwloc_get_pcidev_by_busid (opal_hwloc_topology , pci .domain_id , pci .bus_id ,
893
+ pci .device_id , pci .function_id );
894
+ if (NULL == prov_dev ) {
895
+ opal_output_verbose (1 , opal_common_ofi .output ,
896
+ "%s:%d:Failed to find provider PCI device" , __FILE__ , __LINE__ );
897
+ return OPAL_ERROR ;
898
+ }
899
+
900
+ common_ancestor = hwloc_get_common_ancestor_obj (opal_hwloc_topology , accl_dev ,
901
+ prov_dev );
902
+ if (!common_ancestor ) {
903
+ opal_output_verbose (
904
+ 1 , opal_common_ofi .output ,
905
+ "%s:%d:Failed to find common ancestor of accelerator and provider PCI device" ,
906
+ __FILE__ , __LINE__ );
907
+ /**
908
+ * Return error because any 2 PCI devices should share at least one common ancestor,
909
+ * i.e. root
910
+ */
911
+ return OPAL_ERROR ;
912
+ }
913
+
914
+ ret = get_obj_depth (common_ancestor , & depth );
915
+ if (OPAL_SUCCESS != ret ) {
916
+ opal_output_verbose (1 , opal_common_ofi .output ,
917
+ "%s:%d:Failed to get common ancestor depth" , __FILE__ ,
918
+ __LINE__ );
919
+ return OPAL_ERROR ;
920
+ }
921
+
922
+ if (max_common_ancestor_depth < depth ) {
923
+ max_common_ancestor_depth = depth ;
924
+ near_provider_count = 1 ;
925
+ } else if (max_common_ancestor_depth == depth ) {
926
+ ++ near_provider_count ;
927
+ }
928
+ }
929
+
930
+ * (distance ++ ) = !common_ancestor ? 0 : depth ;
931
+ current_provider = current_provider -> next ;
932
+ }
933
+
934
+ if (0 == near_provider_count || 0 > max_common_ancestor_depth ) {
935
+ opal_output_verbose (1 , opal_common_ofi .output , "%s:%d:Provider does not have PCI device" ,
936
+ __FILE__ , __LINE__ );
937
+ return OPAL_ERR_NOT_AVAILABLE ;
938
+ }
939
+
940
+ provider_rank = device_rank % near_provider_count ;
941
+
942
+ distance = distances ;
943
+ current_provider = provider_list ;
944
+ while (NULL != current_provider ) {
945
+ if (max_common_ancestor_depth == * (distance ++ ) && provider_rank == -- near_provider_count ) {
946
+ * provider = current_provider ;
947
+ return OPAL_SUCCESS ;
948
+ }
949
+
950
+ current_provider = current_provider -> next ;
951
+ }
952
+
953
+ assert (0 == near_provider_count );
954
+
955
+ return OPAL_ERROR ;
956
+ }
957
+ #endif /* OPAL_OFI_PCI_DATA_AVAILABLE */
958
+
959
+
787
960
struct fi_info * opal_common_ofi_select_provider (struct fi_info * provider_list ,
788
961
opal_process_info_t * process_info )
789
962
{
@@ -809,8 +982,29 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
809
982
__FILE__ , __LINE__ );
810
983
}
811
984
985
+ /* Current process' local rank on the same package(socket) */
986
+ package_rank = get_package_rank (process_info );
812
987
provider_limit = count_providers (provider_list );
813
988
989
+ #if OPAL_OFI_PCI_DATA_AVAILABLE
990
+ /**
991
+ * If accelerator is enabled, select the closest provider to the accelerator.
992
+ * Note: the function expects a local rank on the accelerator to break ties if there are
993
+ * multiple equidistant providers. package_rank is NOT an accurate measure, but a proxy.
994
+ */
995
+ ret = find_nearest_provider_from_accelerator (provider_list , provider_limit , package_rank ,
996
+ & provider );
997
+ if (!ret )
998
+ return provider ;
999
+
1000
+ if (OPAL_ERR_NOT_AVAILABLE != ret ) {
1001
+ opal_output_verbose (1 , opal_common_ofi .output ,
1002
+ "%s:%d:Failed to find a provider close to the accelerator. Error: %d" ,
1003
+ __FILE__ , __LINE__ , ret );
1004
+ return provider_list ;
1005
+ }
1006
+ #endif /* OPAL_OFI_PCI_DATA_AVAILABLE */
1007
+
814
1008
/* Allocate memory for provider table */
815
1009
provider_table = calloc (provider_limit , sizeof (struct fi_info * ));
816
1010
if (NULL == provider_table ) {
@@ -827,20 +1021,15 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
827
1021
distances = get_nearest_nics (& num_distances , & pmix_val );
828
1022
#endif
829
1023
830
- current_provider = provider ;
1024
+ current_provider = provider_list ;
831
1025
832
1026
/* Cycle through remaining fi_info objects, looking for alike providers */
833
1027
while (NULL != current_provider ) {
834
- if (!check_provider_attr (provider , current_provider )) {
1028
+ if (!check_provider_attr (provider_list , current_provider )) {
835
1029
near = false;
836
1030
#if OPAL_OFI_PCI_DATA_AVAILABLE
837
- if (NULL != current_provider -> nic
838
- && NULL != current_provider -> nic -> bus_attr
839
- && current_provider -> nic -> bus_attr -> bus_type == FI_BUS_PCI ) {
840
- pci = current_provider -> nic -> bus_attr -> attr .pci ;
841
- near = is_near (distances , num_distances ,
842
- opal_hwloc_topology , pci );
843
- }
1031
+ if (OPAL_SUCCESS == get_provider_nic_pci (current_provider , & pci ))
1032
+ near = is_near (distances , num_distances , opal_hwloc_topology , pci );
844
1033
#endif
845
1034
/* We could have multiple near providers */
846
1035
if (near && !provider_found ) {
0 commit comments