/* Source: /dpdk/lib/timer/rte_timer.c (revision 3b78aa7b2317fb385ed7fa5f5535f60050ede618) */
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <inttypes.h>
#include <assert.h>
#include <sys/queue.h>

#include <rte_common.h>
#include <rte_cycles.h>
#include <rte_eal_memconfig.h>
#include <rte_memory.h>
#include <rte_lcore.h>
#include <rte_branch_prediction.h>
#include <rte_spinlock.h>
#include <rte_random.h>
#include <rte_pause.h>
#include <rte_memzone.h>

#include "rte_timer.h"

/**
 * Per-lcore info for timers.
 */
struct priv_timer {
	struct rte_timer pending_head;  /**< dummy timer instance to head up list */
	rte_spinlock_t list_lock;       /**< lock to protect list access */

	/** per-core variable that is true if a timer was updated on this
	 *  core since the last reset of the variable */
	int updated;

	/** track the current depth of the skiplist */
	unsigned curr_skiplist_depth;

	unsigned prev_lcore;              /**< used for lcore round robin */

	/** running timer on this lcore now */
	struct rte_timer *running_tim;

#ifdef RTE_LIBRTE_TIMER_DEBUG
	/** per-lcore statistics */
	struct rte_timer_debug_stats stats;
#endif
} __rte_cache_aligned;

#define FL_ALLOCATED	(1 << 0)
struct rte_timer_data {
	struct priv_timer priv_timer[RTE_MAX_LCORE];
	uint8_t internal_flags;
};

#define RTE_MAX_DATA_ELS 64
static const struct rte_memzone *rte_timer_data_mz;
static int *volatile rte_timer_mz_refcnt;
static struct rte_timer_data *rte_timer_data_arr;
static const uint32_t default_data_id;
static uint32_t rte_timer_subsystem_initialized;

/* when debug is enabled, store some statistics */
#ifdef RTE_LIBRTE_TIMER_DEBUG
#define __TIMER_STAT_ADD(priv_timer, name, n) do {			\
		unsigned __lcore_id = rte_lcore_id();			\
		if (__lcore_id < RTE_MAX_LCORE)				\
			priv_timer[__lcore_id].stats.name += (n);	\
	} while(0)
#else
#define __TIMER_STAT_ADD(priv_timer, name, n) do {} while (0)
#endif

static inline int
timer_data_valid(uint32_t id)
{
	return rte_timer_data_arr &&
		(rte_timer_data_arr[id].internal_flags & FL_ALLOCATED);
}

/* validate ID and retrieve timer data pointer, or return error value */
#define TIMER_DATA_VALID_GET_OR_ERR_RET(id, timer_data, retval) do {	\
	if (id >= RTE_MAX_DATA_ELS || !timer_data_valid(id))		\
		return retval;						\
	timer_data = &rte_timer_data_arr[id];				\
} while (0)

int
rte_timer_data_alloc(uint32_t *id_ptr)
{
	int i;
	struct rte_timer_data *data;

	if (!rte_timer_subsystem_initialized)
		return -ENOMEM;

	for (i = 0; i < RTE_MAX_DATA_ELS; i++) {
		data = &rte_timer_data_arr[i];
		if (!(data->internal_flags & FL_ALLOCATED)) {
			data->internal_flags |= FL_ALLOCATED;

			if (id_ptr)
				*id_ptr = i;

			return 0;
		}
	}

	return -ENOSPC;
}
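
/*
 * Illustrative sketch (not part of the library): how an application might
 * use a private timer data instance with the "alt" API instead of the
 * default one. The callback name and the one-second tick count below are
 * hypothetical example values.
 *
 *	static void my_cb(struct rte_timer *tim, void *arg) { }
 *
 *	uint32_t tdid;
 *	struct rte_timer t;
 *
 *	rte_timer_subsystem_init();
 *	if (rte_timer_data_alloc(&tdid) == 0) {
 *		rte_timer_init(&t);
 *		rte_timer_alt_reset(tdid, &t, rte_get_timer_hz(), SINGLE,
 *				    rte_lcore_id(), my_cb, NULL);
 *	}
 */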

int
rte_timer_data_dealloc(uint32_t id)
{
	struct rte_timer_data *timer_data;
	TIMER_DATA_VALID_GET_OR_ERR_RET(id, timer_data, -EINVAL);

	timer_data->internal_flags &= ~(FL_ALLOCATED);

	return 0;
}

/* Init the timer library. Allocate an array of timer data structs in shared
 * memory, and allocate the zeroth entry for use with original timer
 * APIs. Since the intersection of the sets of lcore ids in primary and
 * secondary processes should be empty, the zeroth entry can be shared by
 * multiple processes.
 */
int
rte_timer_subsystem_init(void)
{
	const struct rte_memzone *mz;
	struct rte_timer_data *data;
	int i, lcore_id;
	static const char *mz_name = "rte_timer_mz";
	const size_t data_arr_size =
			RTE_MAX_DATA_ELS * sizeof(*rte_timer_data_arr);
	const size_t mem_size = data_arr_size + sizeof(*rte_timer_mz_refcnt);
	bool do_full_init = true;

	rte_mcfg_timer_lock();

	if (rte_timer_subsystem_initialized) {
		rte_mcfg_timer_unlock();
		return -EALREADY;
	}

	mz = rte_memzone_lookup(mz_name);
	if (mz == NULL) {
		mz = rte_memzone_reserve_aligned(mz_name, mem_size,
				SOCKET_ID_ANY, 0, RTE_CACHE_LINE_SIZE);
		if (mz == NULL) {
			rte_mcfg_timer_unlock();
			return -ENOMEM;
		}
		do_full_init = true;
	} else
		do_full_init = false;

	rte_timer_data_mz = mz;
	rte_timer_data_arr = mz->addr;
	rte_timer_mz_refcnt = (void *)((char *)mz->addr + data_arr_size);

	if (do_full_init) {
		for (i = 0; i < RTE_MAX_DATA_ELS; i++) {
			data = &rte_timer_data_arr[i];

			for (lcore_id = 0; lcore_id < RTE_MAX_LCORE;
			     lcore_id++) {
				rte_spinlock_init(
					&data->priv_timer[lcore_id].list_lock);
				data->priv_timer[lcore_id].prev_lcore =
					lcore_id;
			}
		}
	}

	rte_timer_data_arr[default_data_id].internal_flags |= FL_ALLOCATED;
	(*rte_timer_mz_refcnt)++;

	rte_timer_subsystem_initialized = 1;

	rte_mcfg_timer_unlock();

	return 0;
}
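
/*
 * Illustrative sketch (not part of the library): the subsystem is normally
 * initialized once after rte_eal_init() and torn down at exit. Error
 * handling below is deliberately minimal.
 *
 *	if (rte_eal_init(argc, argv) < 0)
 *		rte_exit(EXIT_FAILURE, "EAL init failed\n");
 *	if (rte_timer_subsystem_init() < 0)
 *		rte_exit(EXIT_FAILURE, "timer subsystem init failed\n");
 *	... run application ...
 *	rte_timer_subsystem_finalize();
 */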

void
rte_timer_subsystem_finalize(void)
{
	rte_mcfg_timer_lock();

	if (!rte_timer_subsystem_initialized) {
		rte_mcfg_timer_unlock();
		return;
	}

	if (--(*rte_timer_mz_refcnt) == 0)
		rte_memzone_free(rte_timer_data_mz);

	rte_timer_subsystem_initialized = 0;

	rte_mcfg_timer_unlock();
}

/* Initialize the timer handle tim for use */
void
rte_timer_init(struct rte_timer *tim)
{
	union rte_timer_status status;

	status.state = RTE_TIMER_STOP;
	status.owner = RTE_TIMER_NO_OWNER;
	__atomic_store_n(&tim->status.u32, status.u32, __ATOMIC_RELAXED);
}
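
/*
 * Illustrative sketch (not part of the library): a timer handle must be
 * initialized once before it is armed. The names hello_tim/hello_cb and the
 * one-second period below are arbitrary example values.
 *
 *	static struct rte_timer hello_tim;
 *
 *	static void hello_cb(struct rte_timer *tim, void *arg)
 *	{
 *		printf("timer fired\n");
 *	}
 *
 *	rte_timer_init(&hello_tim);
 *	rte_timer_reset(&hello_tim, rte_get_timer_hz(), PERIODICAL,
 *			rte_lcore_id(), hello_cb, NULL);
 */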

/*
 * if the timer is pending or stopped (or running on the same core as
 * us), mark the timer as being configured, and on success return the
 * previous status of the timer
 */
static int
timer_set_config_state(struct rte_timer *tim,
		       union rte_timer_status *ret_prev_status,
		       struct priv_timer *priv_timer)
{
	union rte_timer_status prev_status, status;
	int success = 0;
	unsigned lcore_id;

	lcore_id = rte_lcore_id();

	/* wait until the timer is in a correct state before updating it,
	 * and mark it as being configured */
	prev_status.u32 = __atomic_load_n(&tim->status.u32, __ATOMIC_RELAXED);

	while (success == 0) {
		/* timer is running on another core
		 * or ready to run on local core, exit
		 */
		if (prev_status.state == RTE_TIMER_RUNNING &&
		    (prev_status.owner != (uint16_t)lcore_id ||
		     tim != priv_timer[lcore_id].running_tim))
			return -1;

		/* timer is being configured on another core */
		if (prev_status.state == RTE_TIMER_CONFIG)
			return -1;

		/* here, we know that timer is stopped or pending,
		 * mark it atomically as being configured */
		status.state = RTE_TIMER_CONFIG;
		status.owner = (int16_t)lcore_id;
		/* CONFIG states are acting as locked states. If the
		 * timer is in CONFIG state, the state cannot be changed
		 * by other threads. So, we should use ACQUIRE here.
		 */
		success = __atomic_compare_exchange_n(&tim->status.u32,
					      &prev_status.u32,
					      status.u32, 0,
					      __ATOMIC_ACQUIRE,
					      __ATOMIC_RELAXED);
	}

	ret_prev_status->u32 = prev_status.u32;
	return 0;
}

/*
 * if the timer is pending, mark it as running
 */
static int
timer_set_running_state(struct rte_timer *tim)
{
	union rte_timer_status prev_status, status;
	unsigned lcore_id = rte_lcore_id();
	int success = 0;

	/* wait until the timer is in a correct state before updating it,
	 * and mark it as running */
	prev_status.u32 = __atomic_load_n(&tim->status.u32, __ATOMIC_RELAXED);

	while (success == 0) {
		/* timer is not pending anymore */
		if (prev_status.state != RTE_TIMER_PENDING)
			return -1;

		/* we know that the timer is pending at this point;
		 * mark it atomically as being running
		 */
		status.state = RTE_TIMER_RUNNING;
		status.owner = (int16_t)lcore_id;
		/* RUNNING states are acting as locked states. If the
		 * timer is in RUNNING state, the state cannot be changed
		 * by other threads. So, we should use ACQUIRE here.
		 */
		success = __atomic_compare_exchange_n(&tim->status.u32,
					      &prev_status.u32,
					      status.u32, 0,
					      __ATOMIC_ACQUIRE,
					      __ATOMIC_RELAXED);
	}

	return 0;
}

/*
 * Return a skiplist level for a new entry.
 * This probabilistically gives a level with p=1/4 that an entry at level n
 * will also appear at level n+1.
 */
static uint32_t
timer_get_skiplist_level(unsigned curr_depth)
{
#ifdef RTE_LIBRTE_TIMER_DEBUG
	static uint32_t i, count = 0;
	static uint32_t levels[MAX_SKIPLIST_DEPTH] = {0};
#endif

	/* probability value is 1/4, i.e. all at level 0, 1 in 4 is at level 1,
	 * 1 in 16 at level 2, 1 in 64 at level 3, etc. Calculated using lowest
	 * bit position of a (pseudo)random number.
	 */
	uint32_t rand = rte_rand() & (UINT32_MAX - 1);
	uint32_t level = rand == 0 ? MAX_SKIPLIST_DEPTH : (rte_bsf32(rand)-1) / 2;

	/* limit the levels used to one above our current level, so we don't,
	 * for instance, have a level 0 and a level 7 without anything between
	 */
	if (level > curr_depth)
		level = curr_depth;
	if (level >= MAX_SKIPLIST_DEPTH)
		level = MAX_SKIPLIST_DEPTH-1;
#ifdef RTE_LIBRTE_TIMER_DEBUG
	count ++;
	levels[level]++;
	if (count % 10000 == 0)
		for (i = 0; i < MAX_SKIPLIST_DEPTH; i++)
			printf("Level %u: %u\n", (unsigned)i, (unsigned)levels[i]);
#endif
	return level;
}
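
/*
 * Worked illustration (not part of the library) of the level selection
 * above, under the assumption that rte_bsf32() returns the zero-based
 * position of the lowest set bit: bit 0 of the random value is always
 * cleared, so rte_bsf32() returns at least 1. A lowest set bit at
 * position 1 or 2 gives level (1-1)/2 = 0 or (2-1)/2 = 0, position 3 or 4
 * gives level 1, position 5 or 6 gives level 2, and so on. Two further
 * random bits must both be zero to move up one level, which matches the
 * p = 1/4 promotion probability quoted in the comment above.
 */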

/*
 * For a given time value, get the entries at each level which
 * are <= that time value.
 */
static void
timer_get_prev_entries(uint64_t time_val, unsigned tim_lcore,
		       struct rte_timer **prev, struct priv_timer *priv_timer)
{
	unsigned lvl = priv_timer[tim_lcore].curr_skiplist_depth;
	prev[lvl] = &priv_timer[tim_lcore].pending_head;
	while(lvl != 0) {
		lvl--;
		prev[lvl] = prev[lvl+1];
		while (prev[lvl]->sl_next[lvl] &&
				prev[lvl]->sl_next[lvl]->expire <= time_val)
			prev[lvl] = prev[lvl]->sl_next[lvl];
	}
}

/*
 * Given a timer node in the skiplist, find the previous entries for it at
 * all skiplist levels.
 */
static void
timer_get_prev_entries_for_node(struct rte_timer *tim, unsigned tim_lcore,
				struct rte_timer **prev,
				struct priv_timer *priv_timer)
{
	int i;

	/* to get a specific entry in the list, look for entries expiring just
	 * before its time value, then step forward on each level individually
	 * as necessary
	 */
	timer_get_prev_entries(tim->expire - 1, tim_lcore, prev, priv_timer);
	for (i = priv_timer[tim_lcore].curr_skiplist_depth - 1; i >= 0; i--) {
		while (prev[i]->sl_next[i] != NULL &&
				prev[i]->sl_next[i] != tim &&
				prev[i]->sl_next[i]->expire <= tim->expire)
			prev[i] = prev[i]->sl_next[i];
	}
}

/* call with lock held as necessary
 * add in list
 * timer must be in config state
 * timer must not be in a list
 */
static void
timer_add(struct rte_timer *tim, unsigned int tim_lcore,
	  struct priv_timer *priv_timer)
{
	unsigned lvl;
	struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1];

	/* find where exactly this element goes in the list of elements
	 * for each depth. */
	timer_get_prev_entries(tim->expire, tim_lcore, prev, priv_timer);

	/* now assign it a new level and add at that level */
	const unsigned tim_level = timer_get_skiplist_level(
			priv_timer[tim_lcore].curr_skiplist_depth);
	if (tim_level == priv_timer[tim_lcore].curr_skiplist_depth)
		priv_timer[tim_lcore].curr_skiplist_depth++;

	lvl = tim_level;
	while (lvl > 0) {
		tim->sl_next[lvl] = prev[lvl]->sl_next[lvl];
		prev[lvl]->sl_next[lvl] = tim;
		lvl--;
	}
	tim->sl_next[0] = prev[0]->sl_next[0];
	prev[0]->sl_next[0] = tim;

	/* save the lowest list entry into the expire field of the dummy hdr
	 * NOTE: this is not atomic on 32-bit */
	priv_timer[tim_lcore].pending_head.expire = priv_timer[tim_lcore].\
			pending_head.sl_next[0]->expire;
}

/*
 * del from list, lock if needed
 * timer must be in config state
 * timer must be in a list
 */
static void
timer_del(struct rte_timer *tim, union rte_timer_status prev_status,
	  int local_is_locked, struct priv_timer *priv_timer)
{
	unsigned lcore_id = rte_lcore_id();
	unsigned prev_owner = prev_status.owner;
	int i;
	struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1];

	/* if the timer is pending on another core, we need to lock the
	 * list; if it is on the local core, we need to lock unless we are
	 * called from rte_timer_manage() */
	if (prev_owner != lcore_id || !local_is_locked)
		rte_spinlock_lock(&priv_timer[prev_owner].list_lock);

	/* save the lowest list entry into the expire field of the dummy hdr.
	 * NOTE: this is not atomic on 32-bit */
	if (tim == priv_timer[prev_owner].pending_head.sl_next[0])
		priv_timer[prev_owner].pending_head.expire =
				((tim->sl_next[0] == NULL) ? 0 : tim->sl_next[0]->expire);

	/* adjust pointers from previous entries to point past this */
	timer_get_prev_entries_for_node(tim, prev_owner, prev, priv_timer);
	for (i = priv_timer[prev_owner].curr_skiplist_depth - 1; i >= 0; i--) {
		if (prev[i]->sl_next[i] == tim)
			prev[i]->sl_next[i] = tim->sl_next[i];
	}

	/* in case we deleted last entry at a level, adjust down max level */
	for (i = priv_timer[prev_owner].curr_skiplist_depth - 1; i >= 0; i--)
		if (priv_timer[prev_owner].pending_head.sl_next[i] == NULL)
			priv_timer[prev_owner].curr_skiplist_depth--;
		else
			break;

	if (prev_owner != lcore_id || !local_is_locked)
		rte_spinlock_unlock(&priv_timer[prev_owner].list_lock);
}

/* Reset and start the timer associated with the timer handle (private func) */
static int
__rte_timer_reset(struct rte_timer *tim, uint64_t expire,
		  uint64_t period, unsigned tim_lcore,
		  rte_timer_cb_t fct, void *arg,
		  int local_is_locked,
		  struct rte_timer_data *timer_data)
{
	union rte_timer_status prev_status, status;
	int ret;
	unsigned lcore_id = rte_lcore_id();
	struct priv_timer *priv_timer = timer_data->priv_timer;

	/* round robin for tim_lcore */
	if (tim_lcore == (unsigned)LCORE_ID_ANY) {
		if (lcore_id < RTE_MAX_LCORE) {
			/* EAL thread with valid lcore_id */
			tim_lcore = rte_get_next_lcore(
				priv_timer[lcore_id].prev_lcore,
				0, 1);
			priv_timer[lcore_id].prev_lcore = tim_lcore;
		} else
			/* non-EAL threads do not run rte_timer_manage(),
			 * so schedule the timer on the first enabled lcore. */
			tim_lcore = rte_get_next_lcore(LCORE_ID_ANY, 0, 1);
	}

	/* wait until the timer is in a correct state before updating it,
	 * and mark it as being configured */
	ret = timer_set_config_state(tim, &prev_status, priv_timer);
	if (ret < 0)
		return -1;

	__TIMER_STAT_ADD(priv_timer, reset, 1);
	if (prev_status.state == RTE_TIMER_RUNNING &&
	    lcore_id < RTE_MAX_LCORE) {
		priv_timer[lcore_id].updated = 1;
	}

	/* remove it from list */
	if (prev_status.state == RTE_TIMER_PENDING) {
		timer_del(tim, prev_status, local_is_locked, priv_timer);
		__TIMER_STAT_ADD(priv_timer, pending, -1);
	}

	tim->period = period;
	tim->expire = expire;
	tim->f = fct;
	tim->arg = arg;

	/* if the timer needs to be scheduled on another core, we need to
	 * lock the destination list; if it is on the local core, we need to
	 * lock unless we are called from rte_timer_manage()
	 */
	if (tim_lcore != lcore_id || !local_is_locked)
		rte_spinlock_lock(&priv_timer[tim_lcore].list_lock);

	__TIMER_STAT_ADD(priv_timer, pending, 1);
	timer_add(tim, tim_lcore, priv_timer);

	/* update state: as we are in CONFIG state, only we can modify
	 * the state, so we don't need to use cmpset() here */
	status.state = RTE_TIMER_PENDING;
	status.owner = (int16_t)tim_lcore;
	/* The "RELEASE" ordering guarantees the memory operations above
	 * the status update are observed before the update by all threads
	 */
	__atomic_store_n(&tim->status.u32, status.u32, __ATOMIC_RELEASE);

	if (tim_lcore != lcore_id || !local_is_locked)
		rte_spinlock_unlock(&priv_timer[tim_lcore].list_lock);

	return 0;
}

/* Reset and start the timer associated with the timer handle tim */
int
rte_timer_reset(struct rte_timer *tim, uint64_t ticks,
		      enum rte_timer_type type, unsigned int tim_lcore,
		      rte_timer_cb_t fct, void *arg)
{
	return rte_timer_alt_reset(default_data_id, tim, ticks, type,
				   tim_lcore, fct, arg);
}
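
/*
 * Illustrative sketch (not part of the library): expiry times are expressed
 * in TSC ticks, so wall-clock intervals are usually converted via
 * rte_get_timer_hz(). The 500 ms one-shot, my_tim and my_cb below are
 * arbitrary example values; a negative return typically means the timer is
 * currently being run or configured by another core, in which case
 * rte_timer_reset_sync() can be used to retry until it succeeds.
 *
 *	uint64_t ticks = (rte_get_timer_hz() * 500) / 1000;
 *	int ret = rte_timer_reset(&my_tim, ticks, SINGLE, rte_lcore_id(),
 *				  my_cb, NULL);
 *	if (ret < 0)
 *		rte_timer_reset_sync(&my_tim, ticks, SINGLE, rte_lcore_id(),
 *				     my_cb, NULL);
 */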

int
rte_timer_alt_reset(uint32_t timer_data_id, struct rte_timer *tim,
		    uint64_t ticks, enum rte_timer_type type,
		    unsigned int tim_lcore, rte_timer_cb_t fct, void *arg)
{
	uint64_t cur_time = rte_get_timer_cycles();
	uint64_t period;
	struct rte_timer_data *timer_data;

	TIMER_DATA_VALID_GET_OR_ERR_RET(timer_data_id, timer_data, -EINVAL);

	if (type == PERIODICAL)
		period = ticks;
	else
		period = 0;

	return __rte_timer_reset(tim, cur_time + ticks, period, tim_lcore,
				 fct, arg, 0, timer_data);
}

/* loop until rte_timer_reset() succeeds */
void
rte_timer_reset_sync(struct rte_timer *tim, uint64_t ticks,
		     enum rte_timer_type type, unsigned tim_lcore,
		     rte_timer_cb_t fct, void *arg)
{
	while (rte_timer_reset(tim, ticks, type, tim_lcore,
			       fct, arg) != 0)
		rte_pause();
}

static int
__rte_timer_stop(struct rte_timer *tim, int local_is_locked,
		 struct rte_timer_data *timer_data)
{
	union rte_timer_status prev_status, status;
	unsigned lcore_id = rte_lcore_id();
	int ret;
	struct priv_timer *priv_timer = timer_data->priv_timer;

	/* wait until the timer is in a correct state before updating it,
	 * and mark it as being configured */
	ret = timer_set_config_state(tim, &prev_status, priv_timer);
	if (ret < 0)
		return -1;

	__TIMER_STAT_ADD(priv_timer, stop, 1);
	if (prev_status.state == RTE_TIMER_RUNNING &&
	    lcore_id < RTE_MAX_LCORE) {
		priv_timer[lcore_id].updated = 1;
	}

	/* remove it from list */
	if (prev_status.state == RTE_TIMER_PENDING) {
		timer_del(tim, prev_status, local_is_locked, priv_timer);
		__TIMER_STAT_ADD(priv_timer, pending, -1);
	}

	/* mark timer as stopped */
	status.state = RTE_TIMER_STOP;
	status.owner = RTE_TIMER_NO_OWNER;
	/* The "RELEASE" ordering guarantees the memory operations above
	 * the status update are observed before the update by all threads
	 */
	__atomic_store_n(&tim->status.u32, status.u32, __ATOMIC_RELEASE);

	return 0;
}

/* Stop the timer associated with the timer handle tim */
int
rte_timer_stop(struct rte_timer *tim)
{
	return rte_timer_alt_stop(default_data_id, tim);
}

int
rte_timer_alt_stop(uint32_t timer_data_id, struct rte_timer *tim)
{
	struct rte_timer_data *timer_data;

	TIMER_DATA_VALID_GET_OR_ERR_RET(timer_data_id, timer_data, -EINVAL);

	return __rte_timer_stop(tim, 0, timer_data);
}

/* loop until rte_timer_stop() succeeds */
void
rte_timer_stop_sync(struct rte_timer *tim)
{
	while (rte_timer_stop(tim) != 0)
		rte_pause();
}
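
/*
 * Illustrative sketch (not part of the library): rte_timer_stop() can fail
 * while the callback is running on another lcore, so teardown paths
 * typically use the blocking variant before freeing any state the callback
 * touches. my_tim and my_state are hypothetical names.
 *
 *	rte_timer_stop_sync(&my_tim);
 *	free(my_state);
 */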

/* Test the PENDING status of the timer handle tim */
int
rte_timer_pending(struct rte_timer *tim)
{
	return __atomic_load_n(&tim->status.state,
				__ATOMIC_RELAXED) == RTE_TIMER_PENDING;
}

/* must be called periodically, run all timers that have expired */
static void
__rte_timer_manage(struct rte_timer_data *timer_data)
{
	union rte_timer_status status;
	struct rte_timer *tim, *next_tim;
	struct rte_timer *run_first_tim, **pprev;
	unsigned lcore_id = rte_lcore_id();
	struct rte_timer *prev[MAX_SKIPLIST_DEPTH + 1];
	uint64_t cur_time;
	int i, ret;
	struct priv_timer *priv_timer = timer_data->priv_timer;

	/* timer manager only runs on EAL thread with valid lcore_id */
	assert(lcore_id < RTE_MAX_LCORE);

	__TIMER_STAT_ADD(priv_timer, manage, 1);
	/* optimize for the case where per-cpu list is empty */
	if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL)
		return;
	cur_time = rte_get_timer_cycles();

#ifdef RTE_ARCH_64
	/* on 64-bit the value cached in pending_head.expire will be
	 * updated atomically, so we can consult that for a quick check here
	 * outside the lock */
	if (likely(priv_timer[lcore_id].pending_head.expire > cur_time))
		return;
#endif

	/* browse ordered list, add expired timers in 'expired' list */
	rte_spinlock_lock(&priv_timer[lcore_id].list_lock);

	/* if nothing to do just unlock and return */
	if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL ||
	    priv_timer[lcore_id].pending_head.sl_next[0]->expire > cur_time) {
		rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
		return;
	}

	/* save start of list of expired timers */
	tim = priv_timer[lcore_id].pending_head.sl_next[0];

	/* break the existing list at current time point */
	timer_get_prev_entries(cur_time, lcore_id, prev, priv_timer);
	for (i = priv_timer[lcore_id].curr_skiplist_depth - 1; i >= 0; i--) {
		if (prev[i] == &priv_timer[lcore_id].pending_head)
			continue;
		priv_timer[lcore_id].pending_head.sl_next[i] =
		    prev[i]->sl_next[i];
		if (prev[i]->sl_next[i] == NULL)
			priv_timer[lcore_id].curr_skiplist_depth--;
		prev[i]->sl_next[i] = NULL;
	}

	/* transition run-list from PENDING to RUNNING */
	run_first_tim = tim;
	pprev = &run_first_tim;

	for ( ; tim != NULL; tim = next_tim) {
		next_tim = tim->sl_next[0];

		ret = timer_set_running_state(tim);
		if (likely(ret == 0)) {
			pprev = &tim->sl_next[0];
		} else {
			/* another core is trying to re-config this one,
			 * remove it from local expired list
			 */
			*pprev = next_tim;
		}
	}

	/* update the next to expire timer value */
	priv_timer[lcore_id].pending_head.expire =
	    (priv_timer[lcore_id].pending_head.sl_next[0] == NULL) ? 0 :
		priv_timer[lcore_id].pending_head.sl_next[0]->expire;

	rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);

	/* now scan expired list and call callbacks */
	for (tim = run_first_tim; tim != NULL; tim = next_tim) {
		next_tim = tim->sl_next[0];
		priv_timer[lcore_id].updated = 0;
		priv_timer[lcore_id].running_tim = tim;

		/* execute callback function with list unlocked */
		tim->f(tim, tim->arg);

		__TIMER_STAT_ADD(priv_timer, pending, -1);
		/* the timer was stopped or reloaded by the callback
		 * function, we have nothing to do here */
		if (priv_timer[lcore_id].updated == 1)
			continue;

		if (tim->period == 0) {
			/* remove from done list and mark timer as stopped */
			status.state = RTE_TIMER_STOP;
			status.owner = RTE_TIMER_NO_OWNER;
			/* The "RELEASE" ordering guarantees the memory
			 * operations above the status update are observed
			 * before the update by all threads
			 */
			__atomic_store_n(&tim->status.u32, status.u32,
				__ATOMIC_RELEASE);
		} else {
			/* keep it in list and mark timer as pending */
			rte_spinlock_lock(&priv_timer[lcore_id].list_lock);
			status.state = RTE_TIMER_PENDING;
			__TIMER_STAT_ADD(priv_timer, pending, 1);
			status.owner = (int16_t)lcore_id;
			/* The "RELEASE" ordering guarantees the memory
			 * operations above the status update are observed
			 * before the update by all threads
			 */
			__atomic_store_n(&tim->status.u32, status.u32,
				__ATOMIC_RELEASE);
			__rte_timer_reset(tim, tim->expire + tim->period,
				tim->period, lcore_id, tim->f, tim->arg, 1,
				timer_data);
			rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
		}
	}
	priv_timer[lcore_id].running_tim = NULL;
}

int
rte_timer_manage(void)
{
	struct rte_timer_data *timer_data;

	TIMER_DATA_VALID_GET_OR_ERR_RET(default_data_id, timer_data, -EINVAL);

	__rte_timer_manage(timer_data);

	return 0;
}
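
/*
 * Illustrative sketch (not part of the library): rte_timer_manage() only
 * fires callbacks for timers owned by the calling lcore, so each worker
 * loop polls it itself. Throttling the calls to roughly the timer
 * resolution, as below, is a common pattern; force_quit is a hypothetical
 * application flag and the 10 us figure is an arbitrary example.
 *
 *	uint64_t prev_tsc = 0;
 *	const uint64_t drain_tsc = (rte_get_timer_hz() * 10) / 1000000;
 *
 *	while (!force_quit) {
 *		uint64_t cur_tsc = rte_get_timer_cycles();
 *
 *		... do packet processing ...
 *
 *		if (cur_tsc - prev_tsc > drain_tsc) {
 *			rte_timer_manage();
 *			prev_tsc = cur_tsc;
 *		}
 *	}
 */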

int
rte_timer_alt_manage(uint32_t timer_data_id,
		     unsigned int *poll_lcores,
		     int nb_poll_lcores,
		     rte_timer_alt_manage_cb_t f)
{
	unsigned int default_poll_lcores[] = {rte_lcore_id()};
	union rte_timer_status status;
	struct rte_timer *tim, *next_tim, **pprev;
	struct rte_timer *run_first_tims[RTE_MAX_LCORE];
	unsigned int this_lcore = rte_lcore_id();
	struct rte_timer *prev[MAX_SKIPLIST_DEPTH + 1];
	uint64_t cur_time;
	int i, j, ret;
	int nb_runlists = 0;
	struct rte_timer_data *data;
	struct priv_timer *privp;
	uint32_t poll_lcore;

	TIMER_DATA_VALID_GET_OR_ERR_RET(timer_data_id, data, -EINVAL);

	/* timer manager only runs on EAL thread with valid lcore_id */
	assert(this_lcore < RTE_MAX_LCORE);

	__TIMER_STAT_ADD(data->priv_timer, manage, 1);

	if (poll_lcores == NULL) {
		poll_lcores = default_poll_lcores;
		nb_poll_lcores = RTE_DIM(default_poll_lcores);
	}

	for (i = 0; i < nb_poll_lcores; i++) {
		poll_lcore = poll_lcores[i];
		privp = &data->priv_timer[poll_lcore];

		/* optimize for the case where per-cpu list is empty */
		if (privp->pending_head.sl_next[0] == NULL)
			continue;
		cur_time = rte_get_timer_cycles();

#ifdef RTE_ARCH_64
		/* on 64-bit the value cached in pending_head.expire will
		 * be updated atomically, so we can consult that for a quick
		 * check here outside the lock
		 */
		if (likely(privp->pending_head.expire > cur_time))
			continue;
#endif

		/* browse ordered list, add expired timers in 'expired' list */
		rte_spinlock_lock(&privp->list_lock);

		/* if nothing to do just unlock and continue */
		if (privp->pending_head.sl_next[0] == NULL ||
		    privp->pending_head.sl_next[0]->expire > cur_time) {
			rte_spinlock_unlock(&privp->list_lock);
			continue;
		}

		/* save start of list of expired timers */
		tim = privp->pending_head.sl_next[0];

		/* break the existing list at current time point */
		timer_get_prev_entries(cur_time, poll_lcore, prev,
				       data->priv_timer);
		for (j = privp->curr_skiplist_depth - 1; j >= 0; j--) {
			if (prev[j] == &privp->pending_head)
				continue;
			privp->pending_head.sl_next[j] =
				prev[j]->sl_next[j];
			if (prev[j]->sl_next[j] == NULL)
				privp->curr_skiplist_depth--;

			prev[j]->sl_next[j] = NULL;
		}

		/* transition run-list from PENDING to RUNNING */
		run_first_tims[nb_runlists] = tim;
		pprev = &run_first_tims[nb_runlists];
		nb_runlists++;

		for ( ; tim != NULL; tim = next_tim) {
			next_tim = tim->sl_next[0];

			ret = timer_set_running_state(tim);
			if (likely(ret == 0)) {
				pprev = &tim->sl_next[0];
			} else {
				/* another core is trying to re-config this one,
				 * remove it from local expired list
				 */
				*pprev = next_tim;
			}
		}

		/* update the next to expire timer value */
		privp->pending_head.expire =
		    (privp->pending_head.sl_next[0] == NULL) ? 0 :
			privp->pending_head.sl_next[0]->expire;

		rte_spinlock_unlock(&privp->list_lock);
	}

	/* Now process the run lists */
	while (1) {
		bool done = true;
		uint64_t min_expire = UINT64_MAX;
		int min_idx = 0;

		/* Find the next oldest timer to process */
		for (i = 0; i < nb_runlists; i++) {
			tim = run_first_tims[i];

			if (tim != NULL && tim->expire < min_expire) {
				min_expire = tim->expire;
				min_idx = i;
				done = false;
			}
		}

		if (done)
			break;

		tim = run_first_tims[min_idx];

		/* Move down the runlist from which we picked a timer to
		 * execute
		 */
		run_first_tims[min_idx] = run_first_tims[min_idx]->sl_next[0];

		data->priv_timer[this_lcore].updated = 0;
		data->priv_timer[this_lcore].running_tim = tim;

		/* Call the provided callback function */
		f(tim);

		__TIMER_STAT_ADD(data->priv_timer, pending, -1);

		/* the timer was stopped or reloaded by the callback
		 * function, we have nothing to do here
		 */
		if (data->priv_timer[this_lcore].updated == 1)
			continue;

		if (tim->period == 0) {
			/* remove from done list and mark timer as stopped */
			status.state = RTE_TIMER_STOP;
			status.owner = RTE_TIMER_NO_OWNER;
			/* The "RELEASE" ordering guarantees the memory
			 * operations above the status update are observed
			 * before the update by all threads
			 */
			__atomic_store_n(&tim->status.u32, status.u32,
				__ATOMIC_RELEASE);
		} else {
			/* keep it in list and mark timer as pending */
			rte_spinlock_lock(
				&data->priv_timer[this_lcore].list_lock);
			status.state = RTE_TIMER_PENDING;
			__TIMER_STAT_ADD(data->priv_timer, pending, 1);
			status.owner = (int16_t)this_lcore;
			/* The "RELEASE" ordering guarantees the memory
			 * operations above the status update are observed
			 * before the update by all threads
			 */
			__atomic_store_n(&tim->status.u32, status.u32,
				__ATOMIC_RELEASE);
			__rte_timer_reset(tim, tim->expire + tim->period,
				tim->period, this_lcore, tim->f, tim->arg, 1,
				data);
			rte_spinlock_unlock(
				&data->priv_timer[this_lcore].list_lock);
		}

		data->priv_timer[this_lcore].running_tim = NULL;
	}

	return 0;
}
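
/*
 * Illustrative sketch (not part of the library): a dedicated lcore can
 * expire timers on behalf of a set of worker lcores by passing their ids
 * explicitly. tdid, worker_ids, force_quit and handle_expiry() are all
 * hypothetical names.
 *
 *	static void handle_expiry(struct rte_timer *tim) { }
 *
 *	unsigned int worker_ids[] = {1, 2, 3};
 *
 *	while (!force_quit)
 *		rte_timer_alt_manage(tdid, worker_ids, RTE_DIM(worker_ids),
 *				     handle_expiry);
 */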

/* Walk pending lists, stopping timers and calling user-specified function */
int
rte_timer_stop_all(uint32_t timer_data_id, unsigned int *walk_lcores,
		   int nb_walk_lcores,
		   rte_timer_stop_all_cb_t f, void *f_arg)
{
	int i;
	struct priv_timer *priv_timer;
	uint32_t walk_lcore;
	struct rte_timer *tim, *next_tim;
	struct rte_timer_data *timer_data;

	TIMER_DATA_VALID_GET_OR_ERR_RET(timer_data_id, timer_data, -EINVAL);

	for (i = 0; i < nb_walk_lcores; i++) {
		walk_lcore = walk_lcores[i];
		priv_timer = &timer_data->priv_timer[walk_lcore];

		rte_spinlock_lock(&priv_timer->list_lock);

		for (tim = priv_timer->pending_head.sl_next[0];
		     tim != NULL;
		     tim = next_tim) {
			next_tim = tim->sl_next[0];

			/* Call timer_stop with lock held */
			__rte_timer_stop(tim, 1, timer_data);

			if (f)
				f(tim, f_arg);
		}

		rte_spinlock_unlock(&priv_timer->list_lock);
	}

	return 0;
}

int64_t
rte_timer_next_ticks(void)
{
	unsigned int lcore_id = rte_lcore_id();
	struct rte_timer_data *timer_data;
	struct priv_timer *priv_timer;
	const struct rte_timer *tm;
	uint64_t cur_time;
	int64_t left = -ENOENT;

	TIMER_DATA_VALID_GET_OR_ERR_RET(default_data_id, timer_data, -EINVAL);

	priv_timer = timer_data->priv_timer;
	cur_time = rte_get_timer_cycles();

	rte_spinlock_lock(&priv_timer[lcore_id].list_lock);
	tm = priv_timer[lcore_id].pending_head.sl_next[0];
	if (tm) {
		left = tm->expire - cur_time;
		if (left < 0)
			left = 0;
	}
	rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);

	return left;
}
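
/*
 * Illustrative sketch (not part of the library): the number of ticks until
 * the next local expiry can be used to decide how long an otherwise idle
 * lcore may sleep before it needs to call rte_timer_manage() again.
 *
 *	int64_t ticks = rte_timer_next_ticks();
 *
 *	if (ticks > 0)
 *		rte_delay_us_block((ticks * 1000000) / rte_get_timer_hz());
 *	rte_timer_manage();
 */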

/* dump statistics about timers */
static void
__rte_timer_dump_stats(struct rte_timer_data *timer_data __rte_unused, FILE *f)
{
#ifdef RTE_LIBRTE_TIMER_DEBUG
	struct rte_timer_debug_stats sum;
	unsigned lcore_id;
	struct priv_timer *priv_timer = timer_data->priv_timer;

	memset(&sum, 0, sizeof(sum));
	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
		sum.reset += priv_timer[lcore_id].stats.reset;
		sum.stop += priv_timer[lcore_id].stats.stop;
		sum.manage += priv_timer[lcore_id].stats.manage;
		sum.pending += priv_timer[lcore_id].stats.pending;
	}
	fprintf(f, "Timer statistics:\n");
	fprintf(f, "  reset = %"PRIu64"\n", sum.reset);
	fprintf(f, "  stop = %"PRIu64"\n", sum.stop);
	fprintf(f, "  manage = %"PRIu64"\n", sum.manage);
	fprintf(f, "  pending = %"PRIu64"\n", sum.pending);
#else
	fprintf(f, "No timer statistics, RTE_LIBRTE_TIMER_DEBUG is disabled\n");
#endif
}

int
rte_timer_dump_stats(FILE *f)
{
	return rte_timer_alt_dump_stats(default_data_id, f);
}

int
rte_timer_alt_dump_stats(uint32_t timer_data_id __rte_unused, FILE *f)
{
	struct rte_timer_data *timer_data;

	TIMER_DATA_VALID_GET_OR_ERR_RET(timer_data_id, timer_data, -EINVAL);

	__rte_timer_dump_stats(timer_data, f);

	return 0;
}
1081