xref: /dpdk/lib/timer/rte_timer.c (revision daa02b5cddbb8e11b31d41e2bf7bb1ae64dcae2f)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2014 Intel Corporation
3  */
4 
5 #include <string.h>
6 #include <stdio.h>
7 #include <stdint.h>
8 #include <stdbool.h>
9 #include <inttypes.h>
10 #include <assert.h>
11 #include <sys/queue.h>
12 
13 #include <rte_common.h>
14 #include <rte_cycles.h>
15 #include <rte_eal_memconfig.h>
16 #include <rte_per_lcore.h>
17 #include <rte_memory.h>
18 #include <rte_launch.h>
19 #include <rte_eal.h>
20 #include <rte_lcore.h>
21 #include <rte_branch_prediction.h>
22 #include <rte_spinlock.h>
23 #include <rte_random.h>
24 #include <rte_pause.h>
25 #include <rte_memzone.h>
26 #include <rte_malloc.h>
27 #include <rte_errno.h>
28 
29 #include "rte_timer.h"
30 
31 /**
32  * Per-lcore info for timers.
33  */
34 struct priv_timer {
35 	struct rte_timer pending_head;  /**< dummy timer instance to head up list */
36 	rte_spinlock_t list_lock;       /**< lock to protect list access */
37 
38 	/** per-core variable that is true if a timer was updated on this
39 	 *  core since the last reset of the variable */
40 	int updated;
41 
42 	/** track the current depth of the skiplist */
43 	unsigned curr_skiplist_depth;
44 
45 	unsigned prev_lcore;              /**< used for lcore round robin */
46 
47 	/** running timer on this lcore now */
48 	struct rte_timer *running_tim;
49 
50 #ifdef RTE_LIBRTE_TIMER_DEBUG
51 	/** per-lcore statistics */
52 	struct rte_timer_debug_stats stats;
53 #endif
54 } __rte_cache_aligned;
55 
56 #define FL_ALLOCATED	(1 << 0)
57 struct rte_timer_data {
58 	struct priv_timer priv_timer[RTE_MAX_LCORE];
59 	uint8_t internal_flags;
60 };
61 
62 #define RTE_MAX_DATA_ELS 64
63 static const struct rte_memzone *rte_timer_data_mz;
64 static int *volatile rte_timer_mz_refcnt;
65 static struct rte_timer_data *rte_timer_data_arr;
66 static const uint32_t default_data_id;
67 static uint32_t rte_timer_subsystem_initialized;
68 
69 /* when debug is enabled, store some statistics */
70 #ifdef RTE_LIBRTE_TIMER_DEBUG
71 #define __TIMER_STAT_ADD(priv_timer, name, n) do {			\
72 		unsigned __lcore_id = rte_lcore_id();			\
73 		if (__lcore_id < RTE_MAX_LCORE)				\
74 			priv_timer[__lcore_id].stats.name += (n);	\
75 	} while (0)
76 #else
77 #define __TIMER_STAT_ADD(priv_timer, name, n) do {} while (0)
78 #endif
79 
80 static inline int
81 timer_data_valid(uint32_t id)
82 {
83 	return rte_timer_data_arr &&
84 		(rte_timer_data_arr[id].internal_flags & FL_ALLOCATED);
85 }
86 
87 /* validate ID and retrieve timer data pointer, or return error value */
88 #define TIMER_DATA_VALID_GET_OR_ERR_RET(id, timer_data, retval) do {	\
89 	if (id >= RTE_MAX_DATA_ELS || !timer_data_valid(id))		\
90 		return retval;						\
91 	timer_data = &rte_timer_data_arr[id];				\
92 } while (0)
93 
94 int
95 rte_timer_data_alloc(uint32_t *id_ptr)
96 {
97 	int i;
98 	struct rte_timer_data *data;
99 
100 	if (!rte_timer_subsystem_initialized)
101 		return -ENOMEM;
102 
103 	for (i = 0; i < RTE_MAX_DATA_ELS; i++) {
104 		data = &rte_timer_data_arr[i];
105 		if (!(data->internal_flags & FL_ALLOCATED)) {
106 			data->internal_flags |= FL_ALLOCATED;
107 
108 			if (id_ptr)
109 				*id_ptr = i;
110 
111 			return 0;
112 		}
113 	}
114 
115 	return -ENOSPC;
116 }
117 
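/*
 * Usage sketch (illustrative comment only; variable names are
 * hypothetical): an application that wants timer lists separate from the
 * default instance can allocate an id here, pass it to the
 * rte_timer_alt_*() functions below, then release it when done.
 *
 *	uint32_t app_timer_data_id;
 *	int ret = rte_timer_data_alloc(&app_timer_data_id);
 *
 * ret is 0 on success, -ENOMEM if the subsystem has not been initialized,
 * or -ENOSPC if all RTE_MAX_DATA_ELS entries are already taken.  When the
 * instance is no longer needed:
 *
 *	rte_timer_data_dealloc(app_timer_data_id);
 */
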
118 int
119 rte_timer_data_dealloc(uint32_t id)
120 {
121 	struct rte_timer_data *timer_data;
122 	TIMER_DATA_VALID_GET_OR_ERR_RET(id, timer_data, -EINVAL);
123 
124 	timer_data->internal_flags &= ~(FL_ALLOCATED);
125 
126 	return 0;
127 }
128 
129 /* Init the timer library. Allocate an array of timer data structs in shared
130  * memory, and allocate the zeroth entry for use with original timer
131  * APIs. Since the intersection of the sets of lcore ids in primary and
132  * secondary processes should be empty, the zeroth entry can be shared by
133  * multiple processes.
134  */
135 int
136 rte_timer_subsystem_init(void)
137 {
138 	const struct rte_memzone *mz;
139 	struct rte_timer_data *data;
140 	int i, lcore_id;
141 	static const char *mz_name = "rte_timer_mz";
142 	const size_t data_arr_size =
143 			RTE_MAX_DATA_ELS * sizeof(*rte_timer_data_arr);
144 	const size_t mem_size = data_arr_size + sizeof(*rte_timer_mz_refcnt);
145 	bool do_full_init = true;
146 
147 	rte_mcfg_timer_lock();
148 
149 	if (rte_timer_subsystem_initialized) {
150 		rte_mcfg_timer_unlock();
151 		return -EALREADY;
152 	}
153 
154 	mz = rte_memzone_lookup(mz_name);
155 	if (mz == NULL) {
156 		mz = rte_memzone_reserve_aligned(mz_name, mem_size,
157 				SOCKET_ID_ANY, 0, RTE_CACHE_LINE_SIZE);
158 		if (mz == NULL) {
159 			rte_mcfg_timer_unlock();
160 			return -ENOMEM;
161 		}
162 		do_full_init = true;
163 	} else
164 		do_full_init = false;
165 
166 	rte_timer_data_mz = mz;
167 	rte_timer_data_arr = mz->addr;
168 	rte_timer_mz_refcnt = (void *)((char *)mz->addr + data_arr_size);
169 
170 	if (do_full_init) {
171 		for (i = 0; i < RTE_MAX_DATA_ELS; i++) {
172 			data = &rte_timer_data_arr[i];
173 
174 			for (lcore_id = 0; lcore_id < RTE_MAX_LCORE;
175 			     lcore_id++) {
176 				rte_spinlock_init(
177 					&data->priv_timer[lcore_id].list_lock);
178 				data->priv_timer[lcore_id].prev_lcore =
179 					lcore_id;
180 			}
181 		}
182 	}
183 
184 	rte_timer_data_arr[default_data_id].internal_flags |= FL_ALLOCATED;
185 	(*rte_timer_mz_refcnt)++;
186 
187 	rte_timer_subsystem_initialized = 1;
188 
189 	rte_mcfg_timer_unlock();
190 
191 	return 0;
192 }
193 
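/*
 * Typical startup/teardown order, as a hedged sketch.  It assumes the
 * usual EAL entry points rte_eal_init()/rte_eal_cleanup(); everything
 * else is defined in this file.
 *
 *	int main(int argc, char **argv)
 *	{
 *		int ret = rte_eal_init(argc, argv);
 *		if (ret < 0)
 *			return -1;
 *
 *		ret = rte_timer_subsystem_init();
 *		if (ret < 0 && ret != -EALREADY)
 *			return -1;
 *
 *		... arm timers and launch per-lcore loops that call
 *		... rte_timer_manage() periodically ...
 *
 *		rte_timer_subsystem_finalize();
 *		rte_eal_cleanup();
 *		return 0;
 *	}
 */
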
194 void
195 rte_timer_subsystem_finalize(void)
196 {
197 	rte_mcfg_timer_lock();
198 
199 	if (!rte_timer_subsystem_initialized) {
200 		rte_mcfg_timer_unlock();
201 		return;
202 	}
203 
204 	if (--(*rte_timer_mz_refcnt) == 0)
205 		rte_memzone_free(rte_timer_data_mz);
206 
207 	rte_timer_subsystem_initialized = 0;
208 
209 	rte_mcfg_timer_unlock();
210 }
211 
212 /* Initialize the timer handle tim for use */
213 void
214 rte_timer_init(struct rte_timer *tim)
215 {
216 	union rte_timer_status status;
217 
218 	status.state = RTE_TIMER_STOP;
219 	status.owner = RTE_TIMER_NO_OWNER;
220 	__atomic_store_n(&tim->status.u32, status.u32, __ATOMIC_RELAXED);
221 }
222 
223 /*
224  * if the timer is pending or stopped (or running on the same core as
225  * us), mark the timer as being configured, and on success return the
226  * previous status of the timer
227  */
228 static int
229 timer_set_config_state(struct rte_timer *tim,
230 		       union rte_timer_status *ret_prev_status,
231 		       struct priv_timer *priv_timer)
232 {
233 	union rte_timer_status prev_status, status;
234 	int success = 0;
235 	unsigned lcore_id;
236 
237 	lcore_id = rte_lcore_id();
238 
239 	/* wait until the timer is in a correct state before updating it,
240 	 * and mark it as being configured */
241 	prev_status.u32 = __atomic_load_n(&tim->status.u32, __ATOMIC_RELAXED);
242 
243 	while (success == 0) {
244 		/* timer is running on another core
245 		 * or ready to run on local core, exit
246 		 */
247 		if (prev_status.state == RTE_TIMER_RUNNING &&
248 		    (prev_status.owner != (uint16_t)lcore_id ||
249 		     tim != priv_timer[lcore_id].running_tim))
250 			return -1;
251 
252 		/* timer is being configured on another core */
253 		if (prev_status.state == RTE_TIMER_CONFIG)
254 			return -1;
255 
256 		/* here, we know that the timer is stopped or pending;
257 		 * mark it atomically as being configured */
258 		status.state = RTE_TIMER_CONFIG;
259 		status.owner = (int16_t)lcore_id;
260 		/* CONFIG states are acting as locked states. If the
261 		 * timer is in CONFIG state, the state cannot be changed
262 		 * by other threads. So, we should use ACQUIRE here.
263 		 */
264 		success = __atomic_compare_exchange_n(&tim->status.u32,
265 					      &prev_status.u32,
266 					      status.u32, 0,
267 					      __ATOMIC_ACQUIRE,
268 					      __ATOMIC_RELAXED);
269 	}
270 
271 	ret_prev_status->u32 = prev_status.u32;
272 	return 0;
273 }
274 
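/*
 * Descriptive note, derived from the code in this file: the status word
 * implements a small state machine.
 *
 *   STOP/PENDING --(timer_set_config_state)--> CONFIG
 *   CONFIG --(__rte_timer_reset)--> PENDING
 *   CONFIG --(__rte_timer_stop)--> STOP
 *   PENDING --(timer_set_running_state, from manage)--> RUNNING
 *   RUNNING --(callback done, one-shot)--> STOP
 *   RUNNING --(callback done, periodic, not re-armed)--> PENDING
 *
 * CONFIG and RUNNING act as lock states: only the owning lcore moves a
 * timer out of them, which is why the CAS loops above and below give up
 * when they observe either state owned by another lcore.
 */
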
275 /*
276  * if timer is pending, mark timer as running
277  */
278 static int
279 timer_set_running_state(struct rte_timer *tim)
280 {
281 	union rte_timer_status prev_status, status;
282 	unsigned lcore_id = rte_lcore_id();
283 	int success = 0;
284 
285 	/* wait until the timer is in a correct state before updating it,
286 	 * and mark it as running */
287 	prev_status.u32 = __atomic_load_n(&tim->status.u32, __ATOMIC_RELAXED);
288 
289 	while (success == 0) {
290 		/* timer is not pending anymore */
291 		if (prev_status.state != RTE_TIMER_PENDING)
292 			return -1;
293 
294 		/* here, we know that the timer is pending; mark it
295 		 * atomically as being running
296 		 */
297 		status.state = RTE_TIMER_RUNNING;
298 		status.owner = (int16_t)lcore_id;
299 		/* RUNNING states are acting as locked states. If the
300 		 * timer is in RUNNING state, the state cannot be changed
301 		 * by other threads. So, we should use ACQUIRE here.
302 		 */
303 		success = __atomic_compare_exchange_n(&tim->status.u32,
304 					      &prev_status.u32,
305 					      status.u32, 0,
306 					      __ATOMIC_ACQUIRE,
307 					      __ATOMIC_RELAXED);
308 	}
309 
310 	return 0;
311 }
312 
313 /*
314  * Return a skiplist level for a new entry.
315  * This probabilistically gives a level with p=1/4 that an entry at level n
316  * will also appear at level n+1.
317  */
318 static uint32_t
319 timer_get_skiplist_level(unsigned curr_depth)
320 {
321 #ifdef RTE_LIBRTE_TIMER_DEBUG
322 	static uint32_t i, count = 0;
323 	static uint32_t levels[MAX_SKIPLIST_DEPTH] = {0};
324 #endif
325 
326 	/* probability value is 1/4, i.e. all at level 0, 1 in 4 is at level 1,
327 	 * 1 in 16 at level 2, 1 in 64 at level 3, etc. Calculated using lowest
328 	 * bit position of a (pseudo)random number.
329 	 */
330 	uint32_t rand = rte_rand() & (UINT32_MAX - 1);
331 	uint32_t level = rand == 0 ? MAX_SKIPLIST_DEPTH : (rte_bsf32(rand)-1) / 2;
332 
333 	/* limit the levels used to one above our current level, so we don't,
334 	 * for instance, have a level 0 and a level 7 without anything between
335 	 */
336 	if (level > curr_depth)
337 		level = curr_depth;
338 	if (level >= MAX_SKIPLIST_DEPTH)
339 		level = MAX_SKIPLIST_DEPTH-1;
340 #ifdef RTE_LIBRTE_TIMER_DEBUG
341 	count++;
342 	levels[level]++;
343 	if (count % 10000 == 0)
344 		for (i = 0; i < MAX_SKIPLIST_DEPTH; i++)
345 			printf("Level %u: %u\n", (unsigned)i, (unsigned)levels[i]);
346 #endif
347 	return level;
348 }
349 
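/*
 * Worked example for the level computation above (illustrative note):
 * rte_bsf32() returns the zero-based index of the lowest set bit, and
 * bit 0 is masked off, so that index is always >= 1.  A random value of
 * 0x00000100 (lowest set bit at index 8) yields level (8 - 1) / 2 = 3;
 * an index of 1 or 2 maps to level 0, 3 or 4 to level 1, and so on.
 * Roughly 3/4 of new entries therefore land at level 0, and each higher
 * level holds about a quarter of the entries of the level below it,
 * matching the p = 1/4 noted above.
 */
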
350 /*
351  * For a given time value, get the entries at each level which
352  * are <= that time value.
353  */
354 static void
355 timer_get_prev_entries(uint64_t time_val, unsigned tim_lcore,
356 		       struct rte_timer **prev, struct priv_timer *priv_timer)
357 {
358 	unsigned lvl = priv_timer[tim_lcore].curr_skiplist_depth;
359 	prev[lvl] = &priv_timer[tim_lcore].pending_head;
360 	while (lvl != 0) {
361 		lvl--;
362 		prev[lvl] = prev[lvl+1];
363 		while (prev[lvl]->sl_next[lvl] &&
364 				prev[lvl]->sl_next[lvl]->expire <= time_val)
365 			prev[lvl] = prev[lvl]->sl_next[lvl];
366 	}
367 }
368 
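/*
 * Illustrative note (hypothetical expire values): with a skiplist depth
 * of 3 and time_val = 35, the search above might walk a list such as
 *
 *   lvl2: head ------------------> 40
 *   lvl1: head ------> 20 -------> 40
 *   lvl0: head -> 10 -> 20 -> 30 -> 40
 *
 * leaving prev[2] = head, prev[1] = the node with expire 20 and
 * prev[0] = the node with expire 30, i.e. the last node at each level
 * whose expire is <= time_val.
 */
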
369 /*
370  * Given a timer node in the skiplist, find the previous entries for it at
371  * all skiplist levels.
372  */
373 static void
374 timer_get_prev_entries_for_node(struct rte_timer *tim, unsigned tim_lcore,
375 				struct rte_timer **prev,
376 				struct priv_timer *priv_timer)
377 {
378 	int i;
379 
380 	/* to get a specific entry in the list, look for entries just lower than
381 	 * its time value, then advance on each level individually if necessary
382 	 */
383 	timer_get_prev_entries(tim->expire - 1, tim_lcore, prev, priv_timer);
384 	for (i = priv_timer[tim_lcore].curr_skiplist_depth - 1; i >= 0; i--) {
385 		while (prev[i]->sl_next[i] != NULL &&
386 				prev[i]->sl_next[i] != tim &&
387 				prev[i]->sl_next[i]->expire <= tim->expire)
388 			prev[i] = prev[i]->sl_next[i];
389 	}
390 }
391 
392 /* call with lock held as necessary
393  * add in list
394  * timer must be in config state
395  * timer must not be in a list
396  */
397 static void
398 timer_add(struct rte_timer *tim, unsigned int tim_lcore,
399 	  struct priv_timer *priv_timer)
400 {
401 	unsigned lvl;
402 	struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1];
403 
404 	/* find where exactly this element goes in the list of elements
405 	 * for each depth. */
406 	timer_get_prev_entries(tim->expire, tim_lcore, prev, priv_timer);
407 
408 	/* now assign it a new level and add at that level */
409 	const unsigned tim_level = timer_get_skiplist_level(
410 			priv_timer[tim_lcore].curr_skiplist_depth);
411 	if (tim_level == priv_timer[tim_lcore].curr_skiplist_depth)
412 		priv_timer[tim_lcore].curr_skiplist_depth++;
413 
414 	lvl = tim_level;
415 	while (lvl > 0) {
416 		tim->sl_next[lvl] = prev[lvl]->sl_next[lvl];
417 		prev[lvl]->sl_next[lvl] = tim;
418 		lvl--;
419 	}
420 	tim->sl_next[0] = prev[0]->sl_next[0];
421 	prev[0]->sl_next[0] = tim;
422 
423 	/* save the lowest list entry into the expire field of the dummy hdr
424 	 * NOTE: this is not atomic on 32-bit */
425 	priv_timer[tim_lcore].pending_head.expire =
426 		priv_timer[tim_lcore].pending_head.sl_next[0]->expire;
427 }
428 
429 /*
430  * del from list, lock if needed
431  * timer must be in config state
432  * timer must be in a list
433  */
434 static void
435 timer_del(struct rte_timer *tim, union rte_timer_status prev_status,
436 	  int local_is_locked, struct priv_timer *priv_timer)
437 {
438 	unsigned lcore_id = rte_lcore_id();
439 	unsigned prev_owner = prev_status.owner;
440 	int i;
441 	struct rte_timer *prev[MAX_SKIPLIST_DEPTH+1];
442 
443 	/* if the timer is pending on another core, we need to lock that
444 	 * core's list; if it is on the local core, we need to lock only if
445 	 * we are not called from rte_timer_manage() */
446 	if (prev_owner != lcore_id || !local_is_locked)
447 		rte_spinlock_lock(&priv_timer[prev_owner].list_lock);
448 
449 	/* save the lowest list entry into the expire field of the dummy hdr.
450 	 * NOTE: this is not atomic on 32-bit */
451 	if (tim == priv_timer[prev_owner].pending_head.sl_next[0])
452 		priv_timer[prev_owner].pending_head.expire =
453 				((tim->sl_next[0] == NULL) ? 0 : tim->sl_next[0]->expire);
454 
455 	/* adjust pointers from previous entries to point past this */
456 	timer_get_prev_entries_for_node(tim, prev_owner, prev, priv_timer);
457 	for (i = priv_timer[prev_owner].curr_skiplist_depth - 1; i >= 0; i--) {
458 		if (prev[i]->sl_next[i] == tim)
459 			prev[i]->sl_next[i] = tim->sl_next[i];
460 	}
461 
462 	/* in case we deleted last entry at a level, adjust down max level */
463 	for (i = priv_timer[prev_owner].curr_skiplist_depth - 1; i >= 0; i--)
464 		if (priv_timer[prev_owner].pending_head.sl_next[i] == NULL)
465 			priv_timer[prev_owner].curr_skiplist_depth--;
466 		else
467 			break;
468 
469 	if (prev_owner != lcore_id || !local_is_locked)
470 		rte_spinlock_unlock(&priv_timer[prev_owner].list_lock);
471 }
472 
473 /* Reset and start the timer associated with the timer handle (private func) */
474 static int
475 __rte_timer_reset(struct rte_timer *tim, uint64_t expire,
476 		  uint64_t period, unsigned tim_lcore,
477 		  rte_timer_cb_t fct, void *arg,
478 		  int local_is_locked,
479 		  struct rte_timer_data *timer_data)
480 {
481 	union rte_timer_status prev_status, status;
482 	int ret;
483 	unsigned lcore_id = rte_lcore_id();
484 	struct priv_timer *priv_timer = timer_data->priv_timer;
485 
486 	/* round robin for tim_lcore */
487 	if (tim_lcore == (unsigned)LCORE_ID_ANY) {
488 		if (lcore_id < RTE_MAX_LCORE) {
489 			/* EAL thread with valid lcore_id */
490 			tim_lcore = rte_get_next_lcore(
491 				priv_timer[lcore_id].prev_lcore,
492 				0, 1);
493 			priv_timer[lcore_id].prev_lcore = tim_lcore;
494 		} else
495 			/* non-EAL threads do not run rte_timer_manage(),
496 			 * so schedule the timer on the first enabled lcore. */
497 			tim_lcore = rte_get_next_lcore(LCORE_ID_ANY, 0, 1);
498 	}
499 
500 	/* wait until the timer is in a correct state before updating it,
501 	 * and mark it as being configured */
502 	ret = timer_set_config_state(tim, &prev_status, priv_timer);
503 	if (ret < 0)
504 		return -1;
505 
506 	__TIMER_STAT_ADD(priv_timer, reset, 1);
507 	if (prev_status.state == RTE_TIMER_RUNNING &&
508 	    lcore_id < RTE_MAX_LCORE) {
509 		priv_timer[lcore_id].updated = 1;
510 	}
511 
512 	/* remove it from list */
513 	if (prev_status.state == RTE_TIMER_PENDING) {
514 		timer_del(tim, prev_status, local_is_locked, priv_timer);
515 		__TIMER_STAT_ADD(priv_timer, pending, -1);
516 	}
517 
518 	tim->period = period;
519 	tim->expire = expire;
520 	tim->f = fct;
521 	tim->arg = arg;
522 
523 	/* if the timer needs to be scheduled on another core, we need to
524 	 * lock the destination list; if it is on the local core, we need to
525 	 * lock only if we are not called from rte_timer_manage()
526 	 */
527 	if (tim_lcore != lcore_id || !local_is_locked)
528 		rte_spinlock_lock(&priv_timer[tim_lcore].list_lock);
529 
530 	__TIMER_STAT_ADD(priv_timer, pending, 1);
531 	timer_add(tim, tim_lcore, priv_timer);
532 
533 	/* update state: as we are in CONFIG state, only we can modify
534 	 * the state, so we don't need to use a compare-and-swap here */
535 	status.state = RTE_TIMER_PENDING;
536 	status.owner = (int16_t)tim_lcore;
537 	/* The "RELEASE" ordering guarantees the memory operations above
538 	 * the status update are observed before the update by all threads
539 	 */
540 	__atomic_store_n(&tim->status.u32, status.u32, __ATOMIC_RELEASE);
541 
542 	if (tim_lcore != lcore_id || !local_is_locked)
543 		rte_spinlock_unlock(&priv_timer[tim_lcore].list_lock);
544 
545 	return 0;
546 }
547 
548 /* Reset and start the timer associated with the timer handle tim */
549 int
550 rte_timer_reset(struct rte_timer *tim, uint64_t ticks,
551 		      enum rte_timer_type type, unsigned int tim_lcore,
552 		      rte_timer_cb_t fct, void *arg)
553 {
554 	return rte_timer_alt_reset(default_data_id, tim, ticks, type,
555 				   tim_lcore, fct, arg);
556 }
557 
558 int
559 rte_timer_alt_reset(uint32_t timer_data_id, struct rte_timer *tim,
560 		    uint64_t ticks, enum rte_timer_type type,
561 		    unsigned int tim_lcore, rte_timer_cb_t fct, void *arg)
562 {
563 	uint64_t cur_time = rte_get_timer_cycles();
564 	uint64_t period;
565 	struct rte_timer_data *timer_data;
566 
567 	TIMER_DATA_VALID_GET_OR_ERR_RET(timer_data_id, timer_data, -EINVAL);
568 
569 	if (type == PERIODICAL)
570 		period = ticks;
571 	else
572 		period = 0;
573 
574 	return __rte_timer_reset(tim,  cur_time + ticks, period, tim_lcore,
575 				 fct, arg, 0, timer_data);
576 }
577 
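/*
 * Usage sketch for arming timers (illustrative comment only; the callback
 * and variable names are hypothetical, PERIODICAL/SINGLE and
 * rte_get_timer_hz() come from the public headers):
 *
 *	static struct rte_timer my_tim;
 *
 *	static void
 *	my_timer_cb(struct rte_timer *tim, void *arg)
 *	{
 *		... work to do each time the timer expires ...
 *	}
 *
 * and then, from an EAL thread:
 *
 *	rte_timer_init(&my_tim);
 *	if (rte_timer_reset(&my_tim, rte_get_timer_hz(), PERIODICAL,
 *			    rte_lcore_id(), my_timer_cb, NULL) != 0)
 *		... the timer is being configured or running elsewhere;
 *		... retry, or use rte_timer_reset_sync()
 *
 * A ticks value of rte_get_timer_hz() gives a period of roughly one
 * second; passing SINGLE instead of PERIODICAL arms a one-shot timer.
 */
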
578 /* loop until rte_timer_reset() succeeds */
579 void
580 rte_timer_reset_sync(struct rte_timer *tim, uint64_t ticks,
581 		     enum rte_timer_type type, unsigned tim_lcore,
582 		     rte_timer_cb_t fct, void *arg)
583 {
584 	while (rte_timer_reset(tim, ticks, type, tim_lcore,
585 			       fct, arg) != 0)
586 		rte_pause();
587 }
588 
589 static int
590 __rte_timer_stop(struct rte_timer *tim, int local_is_locked,
591 		 struct rte_timer_data *timer_data)
592 {
593 	union rte_timer_status prev_status, status;
594 	unsigned lcore_id = rte_lcore_id();
595 	int ret;
596 	struct priv_timer *priv_timer = timer_data->priv_timer;
597 
598 	/* wait until the timer is in a correct state before updating it,
599 	 * and mark it as being configured */
600 	ret = timer_set_config_state(tim, &prev_status, priv_timer);
601 	if (ret < 0)
602 		return -1;
603 
604 	__TIMER_STAT_ADD(priv_timer, stop, 1);
605 	if (prev_status.state == RTE_TIMER_RUNNING &&
606 	    lcore_id < RTE_MAX_LCORE) {
607 		priv_timer[lcore_id].updated = 1;
608 	}
609 
610 	/* remove it from list */
611 	if (prev_status.state == RTE_TIMER_PENDING) {
612 		timer_del(tim, prev_status, local_is_locked, priv_timer);
613 		__TIMER_STAT_ADD(priv_timer, pending, -1);
614 	}
615 
616 	/* mark timer as stopped */
617 	status.state = RTE_TIMER_STOP;
618 	status.owner = RTE_TIMER_NO_OWNER;
619 	/* The "RELEASE" ordering guarantees the memory operations above
620 	 * the status update are observed before the update by all threads
621 	 */
622 	__atomic_store_n(&tim->status.u32, status.u32, __ATOMIC_RELEASE);
623 
624 	return 0;
625 }
626 
627 /* Stop the timer associated with the timer handle tim */
628 int
629 rte_timer_stop(struct rte_timer *tim)
630 {
631 	return rte_timer_alt_stop(default_data_id, tim);
632 }
633 
634 int
635 rte_timer_alt_stop(uint32_t timer_data_id, struct rte_timer *tim)
636 {
637 	struct rte_timer_data *timer_data;
638 
639 	TIMER_DATA_VALID_GET_OR_ERR_RET(timer_data_id, timer_data, -EINVAL);
640 
641 	return __rte_timer_stop(tim, 0, timer_data);
642 }
643 
644 /* loop until rte_timer_stop() succeeds */
645 void
646 rte_timer_stop_sync(struct rte_timer *tim)
647 {
648 	while (rte_timer_stop(tim) != 0)
649 		rte_pause();
650 }
651 
652 /* Test the PENDING status of the timer handle tim */
653 int
654 rte_timer_pending(struct rte_timer *tim)
655 {
656 	return __atomic_load_n(&tim->status.state,
657 				__ATOMIC_RELAXED) == RTE_TIMER_PENDING;
658 }
659 
660 /* must be called periodically; run all timers that have expired */
661 static void
662 __rte_timer_manage(struct rte_timer_data *timer_data)
663 {
664 	union rte_timer_status status;
665 	struct rte_timer *tim, *next_tim;
666 	struct rte_timer *run_first_tim, **pprev;
667 	unsigned lcore_id = rte_lcore_id();
668 	struct rte_timer *prev[MAX_SKIPLIST_DEPTH + 1];
669 	uint64_t cur_time;
670 	int i, ret;
671 	struct priv_timer *priv_timer = timer_data->priv_timer;
672 
673 	/* timer manager only runs on EAL thread with valid lcore_id */
674 	assert(lcore_id < RTE_MAX_LCORE);
675 
676 	__TIMER_STAT_ADD(priv_timer, manage, 1);
677 	/* optimize for the case where per-cpu list is empty */
678 	if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL)
679 		return;
680 	cur_time = rte_get_timer_cycles();
681 
682 #ifdef RTE_ARCH_64
683 	/* on 64-bit the value cached in pending_head.expire will be
684 	 * updated atomically, so we can consult that for a quick check here
685 	 * outside the lock */
686 	if (likely(priv_timer[lcore_id].pending_head.expire > cur_time))
687 		return;
688 #endif
689 
690 	/* browse ordered list, add expired timers in 'expired' list */
691 	rte_spinlock_lock(&priv_timer[lcore_id].list_lock);
692 
693 	/* if nothing to do just unlock and return */
694 	if (priv_timer[lcore_id].pending_head.sl_next[0] == NULL ||
695 	    priv_timer[lcore_id].pending_head.sl_next[0]->expire > cur_time) {
696 		rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
697 		return;
698 	}
699 
700 	/* save start of list of expired timers */
701 	tim = priv_timer[lcore_id].pending_head.sl_next[0];
702 
703 	/* break the existing list at current time point */
704 	timer_get_prev_entries(cur_time, lcore_id, prev, priv_timer);
705 	for (i = priv_timer[lcore_id].curr_skiplist_depth - 1; i >= 0; i--) {
706 		if (prev[i] == &priv_timer[lcore_id].pending_head)
707 			continue;
708 		priv_timer[lcore_id].pending_head.sl_next[i] =
709 		    prev[i]->sl_next[i];
710 		if (prev[i]->sl_next[i] == NULL)
711 			priv_timer[lcore_id].curr_skiplist_depth--;
712 		prev[i]->sl_next[i] = NULL;
713 	}
714 
715 	/* transition run-list from PENDING to RUNNING */
716 	run_first_tim = tim;
717 	pprev = &run_first_tim;
718 
719 	for ( ; tim != NULL; tim = next_tim) {
720 		next_tim = tim->sl_next[0];
721 
722 		ret = timer_set_running_state(tim);
723 		if (likely(ret == 0)) {
724 			pprev = &tim->sl_next[0];
725 		} else {
726 			/* another core is trying to re-config this one,
727 			 * remove it from local expired list
728 			 */
729 			*pprev = next_tim;
730 		}
731 	}
732 
733 	/* update the next to expire timer value */
734 	priv_timer[lcore_id].pending_head.expire =
735 	    (priv_timer[lcore_id].pending_head.sl_next[0] == NULL) ? 0 :
736 		priv_timer[lcore_id].pending_head.sl_next[0]->expire;
737 
738 	rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
739 
740 	/* now scan expired list and call callbacks */
741 	for (tim = run_first_tim; tim != NULL; tim = next_tim) {
742 		next_tim = tim->sl_next[0];
743 		priv_timer[lcore_id].updated = 0;
744 		priv_timer[lcore_id].running_tim = tim;
745 
746 		/* execute callback function with list unlocked */
747 		tim->f(tim, tim->arg);
748 
749 		__TIMER_STAT_ADD(priv_timer, pending, -1);
750 		/* the timer was stopped or reloaded by the callback
751 		 * function; we have nothing to do here */
752 		if (priv_timer[lcore_id].updated == 1)
753 			continue;
754 
755 		if (tim->period == 0) {
756 			/* remove from done list and mark timer as stopped */
757 			status.state = RTE_TIMER_STOP;
758 			status.owner = RTE_TIMER_NO_OWNER;
759 			/* The "RELEASE" ordering guarantees the memory
760 			 * operations above the status update are observed
761 			 * before the update by all threads
762 			 */
763 			__atomic_store_n(&tim->status.u32, status.u32,
764 				__ATOMIC_RELEASE);
765 		}
766 		else {
767 			/* keep it in list and mark timer as pending */
768 			rte_spinlock_lock(&priv_timer[lcore_id].list_lock);
769 			status.state = RTE_TIMER_PENDING;
770 			__TIMER_STAT_ADD(priv_timer, pending, 1);
771 			status.owner = (int16_t)lcore_id;
772 			/* The "RELEASE" ordering guarantees the memory
773 			 * operations above the status update are observed
774 			 * before the update by all threads
775 			 */
776 			__atomic_store_n(&tim->status.u32, status.u32,
777 				__ATOMIC_RELEASE);
778 			__rte_timer_reset(tim, tim->expire + tim->period,
779 				tim->period, lcore_id, tim->f, tim->arg, 1,
780 				timer_data);
781 			rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
782 		}
783 	}
784 	priv_timer[lcore_id].running_tim = NULL;
785 }
786 
787 int
788 rte_timer_manage(void)
789 {
790 	struct rte_timer_data *timer_data;
791 
792 	TIMER_DATA_VALID_GET_OR_ERR_RET(default_data_id, timer_data, -EINVAL);
793 
794 	__rte_timer_manage(timer_data);
795 
796 	return 0;
797 }
798 
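/*
 * A minimal per-lcore polling loop, as a sketch (assumes the application
 * launched this function on each worker, e.g. with rte_eal_remote_launch();
 * TIMER_RESOLUTION_CYCLES and the quit flag are hypothetical):
 *
 *	static int
 *	lcore_main_loop(void *arg)
 *	{
 *		uint64_t prev_tsc = 0, cur_tsc, diff_tsc;
 *
 *		while (!quit) {
 *			cur_tsc = rte_get_timer_cycles();
 *			diff_tsc = cur_tsc - prev_tsc;
 *			if (diff_tsc > TIMER_RESOLUTION_CYCLES) {
 *				rte_timer_manage();
 *				prev_tsc = cur_tsc;
 *			}
 *			... other per-lcore work ...
 *		}
 *		return 0;
 *	}
 */
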
799 int
800 rte_timer_alt_manage(uint32_t timer_data_id,
801 		     unsigned int *poll_lcores,
802 		     int nb_poll_lcores,
803 		     rte_timer_alt_manage_cb_t f)
804 {
805 	unsigned int default_poll_lcores[] = {rte_lcore_id()};
806 	union rte_timer_status status;
807 	struct rte_timer *tim, *next_tim, **pprev;
808 	struct rte_timer *run_first_tims[RTE_MAX_LCORE];
809 	unsigned int this_lcore = rte_lcore_id();
810 	struct rte_timer *prev[MAX_SKIPLIST_DEPTH + 1];
811 	uint64_t cur_time;
812 	int i, j, ret;
813 	int nb_runlists = 0;
814 	struct rte_timer_data *data;
815 	struct priv_timer *privp;
816 	uint32_t poll_lcore;
817 
818 	TIMER_DATA_VALID_GET_OR_ERR_RET(timer_data_id, data, -EINVAL);
819 
820 	/* timer manager only runs on EAL thread with valid lcore_id */
821 	assert(this_lcore < RTE_MAX_LCORE);
822 
823 	__TIMER_STAT_ADD(data->priv_timer, manage, 1);
824 
825 	if (poll_lcores == NULL) {
826 		poll_lcores = default_poll_lcores;
827 		nb_poll_lcores = RTE_DIM(default_poll_lcores);
828 	}
829 
830 	for (i = 0; i < nb_poll_lcores; i++) {
831 		poll_lcore = poll_lcores[i];
832 		privp = &data->priv_timer[poll_lcore];
833 
834 		/* optimize for the case where per-cpu list is empty */
835 		if (privp->pending_head.sl_next[0] == NULL)
836 			continue;
837 		cur_time = rte_get_timer_cycles();
838 
839 #ifdef RTE_ARCH_64
840 		/* on 64-bit the value cached in pending_head.expire will
841 		 * be updated atomically, so we can consult that for a quick
842 		 * check here outside the lock
843 		 */
844 		if (likely(privp->pending_head.expire > cur_time))
845 			continue;
846 #endif
847 
848 		/* browse ordered list, add expired timers in 'expired' list */
849 		rte_spinlock_lock(&privp->list_lock);
850 
851 		/* if nothing to do just unlock and return */
852 		if (privp->pending_head.sl_next[0] == NULL ||
853 		    privp->pending_head.sl_next[0]->expire > cur_time) {
854 			rte_spinlock_unlock(&privp->list_lock);
855 			continue;
856 		}
857 
858 		/* save start of list of expired timers */
859 		tim = privp->pending_head.sl_next[0];
860 
861 		/* break the existing list at current time point */
862 		timer_get_prev_entries(cur_time, poll_lcore, prev,
863 				       data->priv_timer);
864 		for (j = privp->curr_skiplist_depth - 1; j >= 0; j--) {
865 			if (prev[j] == &privp->pending_head)
866 				continue;
867 			privp->pending_head.sl_next[j] =
868 				prev[j]->sl_next[j];
869 			if (prev[j]->sl_next[j] == NULL)
870 				privp->curr_skiplist_depth--;
871 
872 			prev[j]->sl_next[j] = NULL;
873 		}
874 
875 		/* transition run-list from PENDING to RUNNING */
876 		run_first_tims[nb_runlists] = tim;
877 		pprev = &run_first_tims[nb_runlists];
878 		nb_runlists++;
879 
880 		for ( ; tim != NULL; tim = next_tim) {
881 			next_tim = tim->sl_next[0];
882 
883 			ret = timer_set_running_state(tim);
884 			if (likely(ret == 0)) {
885 				pprev = &tim->sl_next[0];
886 			} else {
887 				/* another core is trying to re-config this one,
888 				 * remove it from local expired list
889 				 */
890 				*pprev = next_tim;
891 			}
892 		}
893 
894 		/* update the next to expire timer value */
895 		privp->pending_head.expire =
896 		    (privp->pending_head.sl_next[0] == NULL) ? 0 :
897 			privp->pending_head.sl_next[0]->expire;
898 
899 		rte_spinlock_unlock(&privp->list_lock);
900 	}
901 
902 	/* Now process the run lists */
903 	while (1) {
904 		bool done = true;
905 		uint64_t min_expire = UINT64_MAX;
906 		int min_idx = 0;
907 
908 		/* Find the next oldest timer to process */
909 		for (i = 0; i < nb_runlists; i++) {
910 			tim = run_first_tims[i];
911 
912 			if (tim != NULL && tim->expire < min_expire) {
913 				min_expire = tim->expire;
914 				min_idx = i;
915 				done = false;
916 			}
917 		}
918 
919 		if (done)
920 			break;
921 
922 		tim = run_first_tims[min_idx];
923 
924 		/* Move down the runlist from which we picked a timer to
925 		 * execute
926 		 */
927 		run_first_tims[min_idx] = run_first_tims[min_idx]->sl_next[0];
928 
929 		data->priv_timer[this_lcore].updated = 0;
930 		data->priv_timer[this_lcore].running_tim = tim;
931 
932 		/* Call the provided callback function */
933 		f(tim);
934 
935 		__TIMER_STAT_ADD(data->priv_timer, pending, -1);
936 
937 		/* the timer was stopped or reloaded by the callback
938 		 * function; we have nothing to do here
939 		 */
940 		if (data->priv_timer[this_lcore].updated == 1)
941 			continue;
942 
943 		if (tim->period == 0) {
944 			/* remove from done list and mark timer as stopped */
945 			status.state = RTE_TIMER_STOP;
946 			status.owner = RTE_TIMER_NO_OWNER;
947 			/* The "RELEASE" ordering guarantees the memory
948 			 * operations above the status update are observed
949 			 * before the update by all threads
950 			 */
951 			__atomic_store_n(&tim->status.u32, status.u32,
952 				__ATOMIC_RELEASE);
953 		} else {
954 			/* keep it in list and mark timer as pending */
955 			rte_spinlock_lock(
956 				&data->priv_timer[this_lcore].list_lock);
957 			status.state = RTE_TIMER_PENDING;
958 			__TIMER_STAT_ADD(data->priv_timer, pending, 1);
959 			status.owner = (int16_t)this_lcore;
960 			/* The "RELEASE" ordering guarantees the memory
961 			 * operations above the status update are observed
962 			 * before the update by all threads
963 			 */
964 			__atomic_store_n(&tim->status.u32, status.u32,
965 				__ATOMIC_RELEASE);
966 			__rte_timer_reset(tim, tim->expire + tim->period,
967 				tim->period, this_lcore, tim->f, tim->arg, 1,
968 				data);
969 			rte_spinlock_unlock(
970 				&data->priv_timer[this_lcore].list_lock);
971 		}
972 
973 		data->priv_timer[this_lcore].running_tim = NULL;
974 	}
975 
976 	return 0;
977 }
978 
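/*
 * Usage sketch for rte_timer_alt_manage() (illustrative only; the data id,
 * lcore ids and callback name are hypothetical): a single "service" lcore
 * can expire timers that were armed on the lists of several other lcores
 * by passing their ids in poll_lcores and handling expiries in the
 * callback, for example by forwarding to the timer's own handler.
 *
 *	static void
 *	expired_cb(struct rte_timer *tim)
 *	{
 *		tim->f(tim, tim->arg);
 *	}
 *
 *	unsigned int poll_lcores[] = {2, 3, 4};
 *
 *	rte_timer_alt_manage(app_timer_data_id, poll_lcores,
 *			     RTE_DIM(poll_lcores), expired_cb);
 */
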
979 /* Walk pending lists, stopping timers and calling user-specified function */
980 int
981 rte_timer_stop_all(uint32_t timer_data_id, unsigned int *walk_lcores,
982 		   int nb_walk_lcores,
983 		   rte_timer_stop_all_cb_t f, void *f_arg)
984 {
985 	int i;
986 	struct priv_timer *priv_timer;
987 	uint32_t walk_lcore;
988 	struct rte_timer *tim, *next_tim;
989 	struct rte_timer_data *timer_data;
990 
991 	TIMER_DATA_VALID_GET_OR_ERR_RET(timer_data_id, timer_data, -EINVAL);
992 
993 	for (i = 0; i < nb_walk_lcores; i++) {
994 		walk_lcore = walk_lcores[i];
995 		priv_timer = &timer_data->priv_timer[walk_lcore];
996 
997 		rte_spinlock_lock(&priv_timer->list_lock);
998 
999 		for (tim = priv_timer->pending_head.sl_next[0];
1000 		     tim != NULL;
1001 		     tim = next_tim) {
1002 			next_tim = tim->sl_next[0];
1003 
1004 			/* Call timer_stop with lock held */
1005 			__rte_timer_stop(tim, 1, timer_data);
1006 
1007 			if (f)
1008 				f(tim, f_arg);
1009 		}
1010 
1011 		rte_spinlock_unlock(&priv_timer->list_lock);
1012 	}
1013 
1014 	return 0;
1015 }
1016 
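/*
 * Teardown sketch (illustrative only; names are hypothetical): stop every
 * timer still pending on a set of lcores and release per-timer state from
 * the callback.  Note that the callback runs with the list lock held.
 *
 *	static void
 *	free_tim_cb(struct rte_timer *tim, void *arg)
 *	{
 *		rte_free(tim);
 *	}
 *
 *	unsigned int walk_lcores[] = {1, 2};
 *
 *	rte_timer_stop_all(app_timer_data_id, walk_lcores,
 *			   RTE_DIM(walk_lcores), free_tim_cb, NULL);
 */
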
1017 int64_t
1018 rte_timer_next_ticks(void)
1019 {
1020 	unsigned int lcore_id = rte_lcore_id();
1021 	struct rte_timer_data *timer_data;
1022 	struct priv_timer *priv_timer;
1023 	const struct rte_timer *tm;
1024 	uint64_t cur_time;
1025 	int64_t left = -ENOENT;
1026 
1027 	TIMER_DATA_VALID_GET_OR_ERR_RET(default_data_id, timer_data, -EINVAL);
1028 
1029 	priv_timer = timer_data->priv_timer;
1030 	cur_time = rte_get_timer_cycles();
1031 
1032 	rte_spinlock_lock(&priv_timer[lcore_id].list_lock);
1033 	tm = priv_timer[lcore_id].pending_head.sl_next[0];
1034 	if (tm) {
1035 		left = tm->expire - cur_time;
1036 		if (left < 0)
1037 			left = 0;
1038 	}
1039 	rte_spinlock_unlock(&priv_timer[lcore_id].list_lock);
1040 
1041 	return left;
1042 }
1043 
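/*
 * Usage sketch (illustrative only; overflow handling omitted): converting
 * the return value into a sleep interval so an otherwise idle lcore does
 * not spin on rte_timer_manage().  rte_get_timer_hz() and
 * rte_delay_us_sleep() are assumed from rte_cycles.h.
 *
 *	int64_t ticks = rte_timer_next_ticks();
 *
 *	if (ticks < 0) {
 *		... no timer pending on this lcore ...
 *	} else if (ticks > 0) {
 *		uint64_t us = ticks * 1000000 / rte_get_timer_hz();
 *		rte_delay_us_sleep(us);
 *	}
 *	rte_timer_manage();
 */
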
1044 /* dump statistics about timers */
1045 static void
1046 __rte_timer_dump_stats(struct rte_timer_data *timer_data __rte_unused, FILE *f)
1047 {
1048 #ifdef RTE_LIBRTE_TIMER_DEBUG
1049 	struct rte_timer_debug_stats sum;
1050 	unsigned lcore_id;
1051 	struct priv_timer *priv_timer = timer_data->priv_timer;
1052 
1053 	memset(&sum, 0, sizeof(sum));
1054 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1055 		sum.reset += priv_timer[lcore_id].stats.reset;
1056 		sum.stop += priv_timer[lcore_id].stats.stop;
1057 		sum.manage += priv_timer[lcore_id].stats.manage;
1058 		sum.pending += priv_timer[lcore_id].stats.pending;
1059 	}
1060 	fprintf(f, "Timer statistics:\n");
1061 	fprintf(f, "  reset = %"PRIu64"\n", sum.reset);
1062 	fprintf(f, "  stop = %"PRIu64"\n", sum.stop);
1063 	fprintf(f, "  manage = %"PRIu64"\n", sum.manage);
1064 	fprintf(f, "  pending = %"PRIu64"\n", sum.pending);
1065 #else
1066 	fprintf(f, "No timer statistics, RTE_LIBRTE_TIMER_DEBUG is disabled\n");
1067 #endif
1068 }
1069 
1070 int
1071 rte_timer_dump_stats(FILE *f)
1072 {
1073 	return rte_timer_alt_dump_stats(default_data_id, f);
1074 }
1075 
1076 int
1077 rte_timer_alt_dump_stats(uint32_t timer_data_id __rte_unused, FILE *f)
1078 {
1079 	struct rte_timer_data *timer_data;
1080 
1081 	TIMER_DATA_VALID_GET_OR_ERR_RET(timer_data_id, timer_data, -EINVAL);
1082 
1083 	__rte_timer_dump_stats(timer_data, f);
1084 
1085 	return 0;
1086 }
1087