/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2018 Ericsson AB
 */

#ifndef _DSW_EVDEV_H_
#define _DSW_EVDEV_H_

#include <eventdev_pmd.h>

#include <rte_bitset.h>
#include <rte_event_ring.h>
#include <rte_eventdev.h>

#define DSW_PMD_NAME RTE_STR(event_dsw)

#define DSW_MAX_PORTS (64)
#define DSW_MAX_PORT_DEQUEUE_DEPTH (128)
#define DSW_MAX_PORT_ENQUEUE_DEPTH (128)
#define DSW_MAX_PORT_OUT_BUFFER (32)

#define DSW_MAX_QUEUES (16)

#define DSW_MAX_EVENTS (16384)

/* Multiple 24-bit flow ids will map to the same DSW-level flow. The
 * number of DSW flows should be high enough to make it unlikely that
 * flow ids of several large flows hash to the same DSW-level flow.
 * Such collisions will limit parallelism and thus the number of cores
 * that may be utilized. However, configuring a large number of DSW
 * flows might potentially, depending on traffic and the actual
 * application flow id value range, result in each such DSW-level flow
 * being very small. The effect of migrating such flows will be small,
 * in terms of the amount of processing load redistributed. This will
 * in turn reduce the load balancing speed, since the flow migration
 * rate has an upper limit. Code changes are required to allow > 32k
 * DSW-level flows.
 */
#define DSW_MAX_FLOWS_BITS (13)
#define DSW_MAX_FLOWS (1<<(DSW_MAX_FLOWS_BITS))
#define DSW_MAX_FLOWS_MASK (DSW_MAX_FLOWS-1)
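
/* An illustrative sketch only, not necessarily the hash actually used
 * by this PMD: one way to fold a 24-bit eventdev flow id down to a
 * DSW_MAX_FLOWS_BITS-wide DSW-level flow hash is to XOR the id's
 * DSW_MAX_FLOWS_BITS-sized chunks together.
 */
static inline uint16_t
dsw_example_flow_hash(uint32_t flow_id)
{
	uint16_t hash = 0;
	unsigned int offset;

	/* Fold each DSW_MAX_FLOWS_BITS-wide slice of the 24-bit id. */
	for (offset = 0; offset < 24; offset += DSW_MAX_FLOWS_BITS)
		hash ^= (flow_id >> offset) & DSW_MAX_FLOWS_MASK;

	return hash;
}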

/* Eventdev RTE_SCHED_TYPE_PARALLEL doesn't have a concept of flows,
 * but the 'dsw' scheduler (more or less) randomly assigns flow ids to
 * events on parallel queues, to be able to reuse some of the
 * migration mechanism and scheduling logic from
 * RTE_SCHED_TYPE_ATOMIC. By moving one of the parallel "flows" away
 * from a particular port, the likelihood of events being scheduled to
 * this port is reduced, and thus a kind of statistical load balancing
 * is achieved.
 */
#define DSW_PARALLEL_FLOWS (1024)
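
/* An illustrative sketch only (hypothetical helper, not part of the
 * driver): a synthetic flow id for a parallel-queue event could be
 * produced by cycling a per-port counter over the DSW_PARALLEL_FLOWS
 * range, which is enough to let the atomic-queue migration machinery
 * be reused.
 */
static inline uint16_t
dsw_example_parallel_flow_id(uint16_t *next_parallel_flow_id)
{
	uint16_t flow_id = *next_parallel_flow_id;

	*next_parallel_flow_id = (flow_id + 1) % DSW_PARALLEL_FLOWS;

	return flow_id;
}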

/* 'Background tasks' poll the control rings for migration-related
 * messages and flush the output buffers (so that buffered events
 * don't linger too long). This value shouldn't be too low, since the
 * system would then not benefit from the 'batching' effects of the
 * output buffer, and shouldn't be too high, since that would make
 * buffered events linger too long in case the port goes idle.
 */
#define DSW_MAX_PORT_OPS_PER_BG_TASK (128)

/* Avoid making small 'loans' from the central in-flight event credit
 * pool, to improve efficiency.
 */
#define DSW_MIN_CREDIT_LOAN (64)
#define DSW_PORT_MAX_CREDITS (2*DSW_MIN_CREDIT_LOAN)
#define DSW_PORT_MIN_CREDITS (DSW_MIN_CREDIT_LOAN)
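
/* An illustrative sketch only (hypothetical helper, not the driver's
 * actual logic): a port that runs out of locally cached credits
 * borrows at least DSW_MIN_CREDIT_LOAN credits from the central pool
 * in a single loan, keeping accesses to the shared counter rare.
 */
static inline int32_t
dsw_example_credit_loan_size(int32_t missing_credits)
{
	return RTE_MAX(missing_credits, DSW_MIN_CREDIT_LOAN);
}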

/* The rings are dimensioned so that all in-flight events can reside
 * on any one of the port rings, to avoid the trouble of having to
 * care about the case where there's no room on the destination port's
 * input ring.
 */
#define DSW_IN_RING_SIZE (DSW_MAX_EVENTS)

#define DSW_MAX_LOAD (INT16_MAX)
#define DSW_LOAD_FROM_PERCENT(x) ((int16_t)(((x)*DSW_MAX_LOAD)/100))
#define DSW_LOAD_TO_PERCENT(x) ((100*(x))/DSW_MAX_LOAD)
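
/* For example, DSW_LOAD_FROM_PERCENT(70) evaluates to
 * (70*32767)/100 = 22936, and feeding that value back through
 * DSW_LOAD_TO_PERCENT() yields 69, due to integer truncation.
 */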

/* The thought behind keeping the load update interval shorter than
 * the migration interval is that the load from newly migrated flows
 * should 'show up' on the load measurement before new migrations are
 * considered. This is to avoid having too many flows, from too many
 * source ports, being migrated too quickly to a lightly loaded port -
 * in particular since this might cause the system to oscillate.
 */
#define DSW_LOAD_UPDATE_INTERVAL (DSW_MIGRATION_INTERVAL/4)
#define DSW_OLD_LOAD_WEIGHT (1)
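
/* An illustrative sketch only (hypothetical helper, not necessarily
 * how the driver computes it): DSW_OLD_LOAD_WEIGHT suggests the port
 * load estimate is smoothed by giving the previous estimate that
 * relative weight against the load measured in the latest interval.
 */
static inline int16_t
dsw_example_smoothed_load(int16_t old_load, int16_t measured_load)
{
	return (measured_load + old_load * DSW_OLD_LOAD_WEIGHT) /
		(DSW_OLD_LOAD_WEIGHT + 1);
}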

/* The minimum time (in us) between two flow migrations. What puts an
 * upper limit on the actual migration rate is primarily the pace at
 * which the ports send and receive control messages, which in turn is
 * largely a function of how many cycles are spent processing an event
 * burst.
 */
#define DSW_MIGRATION_INTERVAL (1000)
#define DSW_MIN_SOURCE_LOAD_FOR_MIGRATION (DSW_LOAD_FROM_PERCENT(70))
#define DSW_MAX_TARGET_LOAD_FOR_MIGRATION (DSW_LOAD_FROM_PERCENT(95))
#define DSW_REBALANCE_THRESHOLD (DSW_LOAD_FROM_PERCENT(3))
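
/* An illustrative sketch only (hypothetical helper, not the driver's
 * actual decision logic): the three thresholds above are meant to be
 * combined roughly like this when considering emigrating a flow from
 * a loaded source port to a less-loaded target port.
 */
static inline bool
dsw_example_migration_worthwhile(int16_t source_load, int16_t target_load)
{
	return source_load >= DSW_MIN_SOURCE_LOAD_FOR_MIGRATION &&
		target_load < DSW_MAX_TARGET_LOAD_FOR_MIGRATION &&
		(source_load - target_load) > DSW_REBALANCE_THRESHOLD;
}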

#define DSW_MAX_EVENTS_RECORDED (128)

#define DSW_MAX_FLOWS_PER_MIGRATION (8)

/* Only one outstanding migration per port is allowed. */
#define DSW_MAX_PAUSED_FLOWS (DSW_MAX_PORTS*DSW_MAX_FLOWS_PER_MIGRATION)

/* Enough room for pause request/confirm and unpause request/confirm
 * for all possible senders.
 */
#define DSW_CTL_IN_RING_SIZE ((DSW_MAX_PORTS-1)*4)
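
/* With DSW_MAX_PORTS at 64, this amounts to (64-1)*4 = 252 control
 * ring entries per port: one slot each for a pause request, pause
 * confirmation, unpause request and unpause confirmation from every
 * possible peer.
 */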

/* With DSW_SORT_DEQUEUED enabled, the scheduler will, at the point of
 * dequeue(), arrange events so that events with the same flow id on
 * the same queue form a back-to-back "burst", and also so that such
 * bursts of different flow ids, but on the same queue, come
 * consecutively. All this in an attempt to improve data and
 * instruction cache usage for the application, at the cost of a
 * scheduler overhead increase.
 */

/* #define DSW_SORT_DEQUEUED */
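
/* An illustrative sketch only (hypothetical comparison function, not
 * necessarily the ordering the driver uses): sorting dequeued events
 * by (queue_id, flow_id) produces the grouping described above.
 */
static inline int
dsw_example_event_cmp(const struct rte_event *a, const struct rte_event *b)
{
	if (a->queue_id != b->queue_id)
		return a->queue_id < b->queue_id ? -1 : 1;
	if (a->flow_id != b->flow_id)
		return a->flow_id < b->flow_id ? -1 : 1;
	return 0;
}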

struct dsw_queue_flow {
	uint8_t queue_id;
	uint16_t flow_hash;
};

enum dsw_migration_state {
	DSW_MIGRATION_STATE_IDLE,
	DSW_MIGRATION_STATE_FINISH_PENDING,
	DSW_MIGRATION_STATE_PAUSING,
	DSW_MIGRATION_STATE_UNPAUSING
};

struct __rte_cache_aligned dsw_port {
	uint16_t id;

	/* Keeping a pointer here to avoid container_of() calls, which
	 * are expensive since they are very frequent and will result
	 * in an integer multiplication (since the port id is an index
	 * into the dsw_evdev port array).
	 */
	struct dsw_evdev *dsw;

	uint16_t dequeue_depth;
	uint16_t enqueue_depth;

	int32_t inflight_credits;

	int32_t new_event_threshold;

	bool implicit_release;

	uint16_t pending_releases;

	uint16_t next_parallel_flow_id;

	uint16_t ops_since_bg_task;

	/* Most recent 'background' processing. */
	uint64_t last_bg;

	/* For port load measurement. */
	uint64_t next_load_update;
	uint64_t load_update_interval;
	uint64_t measurement_start;
	uint64_t busy_start;
	uint64_t busy_cycles;
	uint64_t total_busy_cycles;

	/* For the ctl interface and flow migration mechanism. */
	uint64_t next_emigration;
	uint64_t migration_interval;
	enum dsw_migration_state migration_state;

	uint64_t emigration_start;
	uint64_t emigrations;
	uint64_t emigration_latency;

	uint8_t emigration_target_port_ids[DSW_MAX_FLOWS_PER_MIGRATION];
	struct dsw_queue_flow
		emigration_target_qfs[DSW_MAX_FLOWS_PER_MIGRATION];
	uint8_t emigration_targets_len;
	uint8_t cfm_cnt;

	uint64_t immigrations;

	uint16_t paused_flows_len;
	struct dsw_queue_flow paused_flows[DSW_MAX_PAUSED_FLOWS];

	/* In a very contrived worst case, all in-flight events may be
	 * lying around paused here.
	 */
	uint16_t paused_events_len;
	struct rte_event paused_events[DSW_MAX_EVENTS];

	uint16_t emigrating_events_len;
	/* Buffer for not-yet-processed events pertaining to a flow
	 * emigrating from this port. These events will be forwarded
	 * to the target port.
	 */
	struct rte_event emigrating_events[DSW_MAX_EVENTS];

	uint16_t seen_events_len;
	uint16_t seen_events_idx;
	struct dsw_queue_flow seen_events[DSW_MAX_EVENTS_RECORDED];

	uint64_t enqueue_calls;
	uint64_t new_enqueued;
	uint64_t forward_enqueued;
	uint64_t release_enqueued;
	uint64_t queue_enqueued[DSW_MAX_QUEUES];

	uint64_t dequeue_calls;
	uint64_t dequeued;
	uint64_t queue_dequeued[DSW_MAX_QUEUES];

	uint16_t out_buffer_len[DSW_MAX_PORTS];
	struct rte_event out_buffer[DSW_MAX_PORTS][DSW_MAX_PORT_OUT_BUFFER];

	uint16_t in_buffer_len;
	uint16_t in_buffer_start;
	/* This buffer may contain events that were read from the
	 * in_ring during the flow migration process.
	 */
	struct rte_event in_buffer[DSW_MAX_EVENTS];

	alignas(RTE_CACHE_LINE_SIZE) struct rte_event_ring *in_ring;

	alignas(RTE_CACHE_LINE_SIZE) struct rte_ring *ctl_in_ring;

	/* Estimate of the current port load. */
	alignas(RTE_CACHE_LINE_SIZE) RTE_ATOMIC(int16_t) load;
	/* Estimate of the load from flows currently migrating to this
	 * port.
	 */
	alignas(RTE_CACHE_LINE_SIZE) RTE_ATOMIC(int32_t) immigration_load;
};

struct dsw_queue {
	uint8_t schedule_type;
	RTE_BITSET_DECLARE(serving_ports, DSW_MAX_PORTS);
	uint16_t num_serving_ports;

	alignas(RTE_CACHE_LINE_SIZE) uint8_t flow_to_port_map[DSW_MAX_FLOWS];
};
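
/* An illustrative sketch only (hypothetical helper, not part of the
 * driver): for atomic queues, scheduling an event essentially boils
 * down to looking up which port currently owns the event's DSW-level
 * flow.
 */
static inline uint8_t
dsw_example_schedule_port(const struct dsw_queue *queue, uint16_t flow_hash)
{
	return queue->flow_to_port_map[flow_hash & DSW_MAX_FLOWS_MASK];
}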

/* Limited by the size of the 'serving_ports' bitset. */
static_assert(DSW_MAX_PORTS <= 64, "Max compile-time port count exceeded");

struct dsw_evdev {
	struct rte_eventdev_data *data;

	struct dsw_port ports[DSW_MAX_PORTS];
	uint16_t num_ports;
	struct dsw_queue queues[DSW_MAX_QUEUES];
	uint8_t num_queues;
	int32_t max_inflight;

	alignas(RTE_CACHE_LINE_SIZE) RTE_ATOMIC(int32_t) credits_on_loan;
};

#define DSW_CTL_PAUSE_REQ (0)
#define DSW_CTL_UNPAUSE_REQ (1)
#define DSW_CTL_CFM (2)

struct __rte_aligned(4) dsw_ctl_msg {
	uint8_t type;
	uint8_t originating_port_id;
	uint8_t qfs_len;
	struct dsw_queue_flow qfs[DSW_MAX_FLOWS_PER_MIGRATION];
};
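
/* An illustrative sketch only (hypothetical helper, not the driver's
 * actual code; it assumes the rte_ring 'elem' API is visible in this
 * include context and that ctl_in_ring holds whole dsw_ctl_msg
 * elements): passing a control message to a peer port could then
 * amount to enqueueing one such element on that port's ctl_in_ring.
 */
static inline int
dsw_example_ctl_send(struct dsw_port *dest_port, struct dsw_ctl_msg *msg)
{
	return rte_ring_enqueue_elem(dest_port->ctl_in_ring, msg,
				     sizeof(struct dsw_ctl_msg));
}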

uint16_t dsw_event_enqueue_burst(void *port,
				 const struct rte_event events[],
				 uint16_t events_len);
uint16_t dsw_event_enqueue_new_burst(void *port,
				     const struct rte_event events[],
				     uint16_t events_len);
uint16_t dsw_event_enqueue_forward_burst(void *port,
					 const struct rte_event events[],
					 uint16_t events_len);

uint16_t dsw_event_dequeue_burst(void *port, struct rte_event *events,
				 uint16_t num, uint64_t wait);
void dsw_event_maintain(void *port, int op);

int dsw_xstats_get_names(const struct rte_eventdev *dev,
			 enum rte_event_dev_xstats_mode mode,
			 uint8_t queue_port_id,
			 struct rte_event_dev_xstats_name *xstats_names,
			 uint64_t *ids, unsigned int size);
int dsw_xstats_get(const struct rte_eventdev *dev,
		   enum rte_event_dev_xstats_mode mode, uint8_t queue_port_id,
		   const uint64_t ids[], uint64_t values[], unsigned int n);
uint64_t dsw_xstats_get_by_name(const struct rte_eventdev *dev,
				const char *name, uint64_t *id);

static inline struct dsw_evdev *
dsw_pmd_priv(const struct rte_eventdev *eventdev)
{
	return eventdev->data->dev_private;
}

extern int event_dsw_logtype;
#define RTE_LOGTYPE_EVENT_DSW event_dsw_logtype
#define DSW_LOG_DP_LINE(level, fmt, ...)				\
	RTE_LOG_DP_LINE(level, EVENT_DSW, "%s() line %u: " fmt,		\
		   __func__, __LINE__, ## __VA_ARGS__)

#define DSW_LOG_DP_PORT_LINE(level, port_id, fmt, ...)			\
	DSW_LOG_DP_LINE(level, "<Port %d> " fmt, port_id, ## __VA_ARGS__)
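
/* Example usage (illustrative only; 'port' is assumed to be a
 * struct dsw_port pointer and 'num' an event count):
 *
 *   DSW_LOG_DP_PORT_LINE(DEBUG, port->id, "Enqueued %d events.", num);
 */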

#endif