LPD-EPFL/ASCYLIB

Correct hardware-independent thread pinning

schuay opened this issue · 1 comment

Have you thought about using hwloc to do hardware-independent thread pinning? On the one hand it would add a dependency, but on the other it would make things much easier for people running this lib. It would also allow you to remove the horrible the_cores hack ;)

A quick draft of the required changes below:

diff --git a/common/Makefile.common b/common/Makefile.common
index d8bba15..f51050b 100644
--- a/common/Makefile.common
+++ b/common/Makefile.common
@@ -248,7 +248,7 @@ CFLAGS += -Wall
 CFLAGS += -fno-strict-aliasing
 CFLAGS += -I$(LIBAO_INC) -I$(ROOT)/include -I$(LIBSSMEM)/include

-LDFLAGS += -lpthread -lrt -lm
+LDFLAGS += -lpthread -lrt -lm -lhwloc

 ######################
 # compilation settings
diff --git a/include/utils.h b/include/utils.h
index 609fe5a..c5e4f92 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -655,36 +655,25 @@ static __attribute__ ((unused)) double eng_per_test_iter_nj[40][5] =
     return (double)t.tv_sec + ((double)t.tv_usec)/1000000.0;
   }

+#include <hwloc.h>
+  hwloc_topology_t m_topology;
+
   static inline 
   void set_cpu(int cpu) 
   {
-#ifndef NO_SET_CPU
-#  ifdef __sparc__
-    processor_bind(P_LWPID,P_MYID, cpu, NULL);
-#  elif defined(__tile__)
-    if (cpu>=tmc_cpus_grid_total()) {
-      perror("Thread id too high");
+    const int depth = hwloc_get_type_or_below_depth(m_topology, HWLOC_OBJ_CORE);
+    const int ncores = hwloc_get_nbobjs_by_depth(m_topology, depth);
+
+    const hwloc_obj_t obj = hwloc_get_obj_by_depth(m_topology, depth, cpu % ncores);
+
+    hwloc_cpuset_t cpuset = hwloc_bitmap_dup(obj->cpuset);
+    hwloc_bitmap_singlify(cpuset);
+
+    if (hwloc_set_cpubind(m_topology, cpuset, HWLOC_CPUBIND_THREAD) != 0) {
+        fprintf(stderr, "Could not bind to core: %s\n", strerror(errno));
     }
-    // cput_set_t cpus;
-    if (tmc_cpus_set_my_cpu(cpu)<0) {
-      tmc_task_die("tmc_cpus_set_my_cpu() failed."); 
-    }    
-#  else
-    cpu %= (NUMBER_OF_SOCKETS * CORES_PER_SOCKET);
-
-    cpu_set_t mask;
-    CPU_ZERO(&mask);
-    CPU_SET(cpu, &mask);
-#    if defined(PLATFORM_NUMA)
-    numa_set_preferred(get_cluster(cpu));
-#    endif
-    pthread_t thread = pthread_self();
-    if (pthread_setaffinity_np(thread, sizeof(cpu_set_t), &mask) != 0) 
-      {
-   fprintf(stderr, "Error setting thread affinity\n");
-      }
-#  endif
-#endif
+
+    hwloc_bitmap_free(cpuset);    
   }


diff --git a/src/priorityqueue-alistarh/test_simple.c b/src/priorityqueue-alistarh/test_simple.c
index 03099f7..5c1ebdc 100755
--- a/src/priorityqueue-alistarh/test_simple.c
+++ b/src/priorityqueue-alistarh/test_simple.c
@@ -130,7 +130,7 @@ test(void* thread)
   thread_data_t* td = (thread_data_t*) thread;
   uint32_t ID = td->id;
   int phys_id = the_cores[ID];
-  set_cpu(phys_id);
+  set_cpu(ID);
   ssalloc_init();

   DS_TYPE* set = td->set;
@@ -295,10 +295,16 @@ test(void* thread)
   pthread_exit(NULL);
 }

+#include <hwloc.h>
+extern hwloc_topology_t m_topology;
+
 int
 main(int argc, char **argv) 
 {
-  set_cpu(the_cores[0]);
+  hwloc_topology_init(&m_topology);
+  hwloc_topology_load(m_topology);
+
+  set_cpu(0);
   ssalloc_init();
   seeds = seed_rand();

@@ -398,8 +404,6 @@ main(int argc, char **argv)
    }
     }

-
-
   if (!is_power_of_two(initial))
     {
       size_t initial_pow2 = pow2roundup(initial);
@@ -608,6 +612,8 @@ main(int argc, char **argv)
   RR_PRINT_UNPROTECTED(RAPL_PRINT_POW);
   RR_PRINT_CORRECTED();
   RETRY_STATS_PRINT(total, putting_count_total, removing_count_total, putting_count_total_succ + removing_count_total_succ);
+
+  hwloc_topology_destroy(m_topology);

   pthread_exit(NULL);

That is a good idea. We will integrate it as an option as soon as possible.
Thanks.