xref: /llvm-project/compiler-rt/lib/tsan/benchmarks/vts_many_threads_bench.cpp (revision d11b16e1fef5886c73a7a6701b6e0264ae6b44d4)
1*d11b16e1SNico Weber // Mini-benchmark for tsan VTS worst case performance
2*d11b16e1SNico Weber // Idea:
3*d11b16e1SNico Weber // 1) Spawn M + N threads (M >> N)
4*d11b16e1SNico Weber //    We'll call the 'M' threads as 'garbage threads'.
5*d11b16e1SNico Weber // 2) Make sure all threads have created thus no TIDs were reused
6*d11b16e1SNico Weber // 3) Join the garbage threads
7*d11b16e1SNico Weber // 4) Do many sync operations on the remaining N threads
8*d11b16e1SNico Weber //
9*d11b16e1SNico Weber // It turns out that due to O(M+N) VTS complexity the (4) is much slower with
10*d11b16e1SNico Weber // when N is large.
11*d11b16e1SNico Weber //
12*d11b16e1SNico Weber // Some numbers:
13*d11b16e1SNico Weber // a) clang++ native O1 with n_iterations=200kk takes
14*d11b16e1SNico Weber //      5s regardless of M
15*d11b16e1SNico Weber //    clang++ tsanv2 O1 with n_iterations=20kk takes
16*d11b16e1SNico Weber //      23.5s with M=200
17*d11b16e1SNico Weber //      11.5s with M=1
18*d11b16e1SNico Weber //    i.e. tsanv2 is ~23x to ~47x slower than native, depends on M.
19*d11b16e1SNico Weber // b) g++ native O1 with n_iterations=200kk takes
20*d11b16e1SNico Weber //      5.5s regardless of M
21*d11b16e1SNico Weber //    g++ tsanv1 O1 with n_iterations=2kk takes
22*d11b16e1SNico Weber //      39.5s with M=200
23*d11b16e1SNico Weber //      20.5s with M=1
24*d11b16e1SNico Weber //    i.e. tsanv1 is ~370x to ~720x slower than native, depends on M.
25*d11b16e1SNico Weber 
26*d11b16e1SNico Weber #include <assert.h>
27*d11b16e1SNico Weber #include <pthread.h>
28*d11b16e1SNico Weber #include <stdio.h>
29*d11b16e1SNico Weber #include <stdlib.h>
30*d11b16e1SNico Weber 
31*d11b16e1SNico Weber class __attribute__((aligned(64))) Mutex {
32*d11b16e1SNico Weber  public:
Mutex()33*d11b16e1SNico Weber   Mutex()  { pthread_mutex_init(&m_, NULL); }
~Mutex()34*d11b16e1SNico Weber   ~Mutex() { pthread_mutex_destroy(&m_); }
Lock()35*d11b16e1SNico Weber   void Lock() { pthread_mutex_lock(&m_); }
Unlock()36*d11b16e1SNico Weber   void Unlock() { pthread_mutex_unlock(&m_); }
37*d11b16e1SNico Weber 
38*d11b16e1SNico Weber  private:
39*d11b16e1SNico Weber   pthread_mutex_t m_;
40*d11b16e1SNico Weber };
41*d11b16e1SNico Weber 
42*d11b16e1SNico Weber const int kNumMutexes = 1024;
43*d11b16e1SNico Weber Mutex mutexes[kNumMutexes];
44*d11b16e1SNico Weber 
45*d11b16e1SNico Weber int n_threads, n_iterations;
46*d11b16e1SNico Weber 
47*d11b16e1SNico Weber pthread_barrier_t all_threads_ready, main_threads_ready;
48*d11b16e1SNico Weber 
GarbageThread(void * unused)49*d11b16e1SNico Weber void* GarbageThread(void *unused) {
50*d11b16e1SNico Weber   pthread_barrier_wait(&all_threads_ready);
51*d11b16e1SNico Weber   return 0;
52*d11b16e1SNico Weber }
53*d11b16e1SNico Weber 
Thread(void * arg)54*d11b16e1SNico Weber void *Thread(void *arg) {
55*d11b16e1SNico Weber   long idx = (long)arg;
56*d11b16e1SNico Weber   pthread_barrier_wait(&all_threads_ready);
57*d11b16e1SNico Weber 
58*d11b16e1SNico Weber   // Wait for the main thread to join the garbage threads.
59*d11b16e1SNico Weber   pthread_barrier_wait(&main_threads_ready);
60*d11b16e1SNico Weber 
61*d11b16e1SNico Weber   printf("Thread %ld go!\n", idx);
62*d11b16e1SNico Weber   int offset = idx * kNumMutexes / n_threads;
63*d11b16e1SNico Weber   for (int i = 0; i < n_iterations; i++) {
64*d11b16e1SNico Weber     mutexes[(offset + i) % kNumMutexes].Lock();
65*d11b16e1SNico Weber     mutexes[(offset + i) % kNumMutexes].Unlock();
66*d11b16e1SNico Weber   }
67*d11b16e1SNico Weber   printf("Thread %ld done\n", idx);
68*d11b16e1SNico Weber   return 0;
69*d11b16e1SNico Weber }
70*d11b16e1SNico Weber 
main(int argc,char ** argv)71*d11b16e1SNico Weber int main(int argc, char **argv) {
72*d11b16e1SNico Weber   int n_garbage_threads;
73*d11b16e1SNico Weber   if (argc == 1) {
74*d11b16e1SNico Weber     n_threads = 2;
75*d11b16e1SNico Weber     n_garbage_threads = 200;
76*d11b16e1SNico Weber     n_iterations = 20000000;
77*d11b16e1SNico Weber   } else if (argc == 4) {
78*d11b16e1SNico Weber     n_threads = atoi(argv[1]);
79*d11b16e1SNico Weber     assert(n_threads > 0 && n_threads <= 32);
80*d11b16e1SNico Weber     n_garbage_threads = atoi(argv[2]);
81*d11b16e1SNico Weber     assert(n_garbage_threads > 0 && n_garbage_threads <= 16000);
82*d11b16e1SNico Weber     n_iterations = atoi(argv[3]);
83*d11b16e1SNico Weber   } else {
84*d11b16e1SNico Weber     printf("Usage: %s n_threads n_garbage_threads n_iterations\n", argv[0]);
85*d11b16e1SNico Weber     return 1;
86*d11b16e1SNico Weber   }
87*d11b16e1SNico Weber   printf("%s: n_threads=%d n_garbage_threads=%d n_iterations=%d\n",
88*d11b16e1SNico Weber          __FILE__, n_threads, n_garbage_threads, n_iterations);
89*d11b16e1SNico Weber 
90*d11b16e1SNico Weber   pthread_barrier_init(&all_threads_ready, NULL, n_garbage_threads + n_threads + 1);
91*d11b16e1SNico Weber   pthread_barrier_init(&main_threads_ready, NULL, n_threads + 1);
92*d11b16e1SNico Weber 
93*d11b16e1SNico Weber   pthread_t *t = new pthread_t[n_threads];
94*d11b16e1SNico Weber   {
95*d11b16e1SNico Weber     pthread_t *g_t = new pthread_t[n_garbage_threads];
96*d11b16e1SNico Weber     for (int i = 0; i < n_garbage_threads; i++) {
97*d11b16e1SNico Weber       int status = pthread_create(&g_t[i], 0, GarbageThread, NULL);
98*d11b16e1SNico Weber       assert(status == 0);
99*d11b16e1SNico Weber     }
100*d11b16e1SNico Weber     for (int i = 0; i < n_threads; i++) {
101*d11b16e1SNico Weber       int status = pthread_create(&t[i], 0, Thread, (void*)i);
102*d11b16e1SNico Weber       assert(status == 0);
103*d11b16e1SNico Weber     }
104*d11b16e1SNico Weber     pthread_barrier_wait(&all_threads_ready);
105*d11b16e1SNico Weber     printf("All threads started! Killing the garbage threads.\n");
106*d11b16e1SNico Weber     for (int i = 0; i < n_garbage_threads; i++) {
107*d11b16e1SNico Weber       pthread_join(g_t[i], 0);
108*d11b16e1SNico Weber     }
109*d11b16e1SNico Weber     delete [] g_t;
110*d11b16e1SNico Weber   }
111*d11b16e1SNico Weber   printf("Resuming the main threads.\n");
112*d11b16e1SNico Weber   pthread_barrier_wait(&main_threads_ready);
113*d11b16e1SNico Weber 
114*d11b16e1SNico Weber 
115*d11b16e1SNico Weber   for (int i = 0; i < n_threads; i++) {
116*d11b16e1SNico Weber     pthread_join(t[i], 0);
117*d11b16e1SNico Weber   }
118*d11b16e1SNico Weber   delete [] t;
119*d11b16e1SNico Weber   return 0;
120*d11b16e1SNico Weber }
121