xref: /dpdk/drivers/net/memif/rte_eth_memif.c (revision f665790a5dbad7b645ff46f31d65e977324e7bfc)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2018-2019 Cisco Systems, Inc.  All rights reserved.
3  */
4 
5 #include <stdlib.h>
6 #include <fcntl.h>
7 #include <unistd.h>
8 #include <sys/types.h>
9 #include <sys/socket.h>
10 #include <sys/un.h>
11 #include <sys/ioctl.h>
12 #include <sys/mman.h>
13 #include <linux/if_ether.h>
14 #include <errno.h>
15 #include <sys/eventfd.h>
16 
17 #include <rte_version.h>
18 #include <rte_mbuf.h>
19 #include <rte_ether.h>
20 #include <ethdev_driver.h>
21 #include <ethdev_vdev.h>
22 #include <rte_malloc.h>
23 #include <rte_kvargs.h>
24 #include <bus_vdev_driver.h>
25 #include <rte_string_fns.h>
26 #include <rte_errno.h>
27 #include <rte_memory.h>
28 #include <rte_memzone.h>
29 #include <rte_eal_memconfig.h>
30 
31 #include "rte_eth_memif.h"
32 #include "memif_socket.h"
33 
/* Key names of the device arguments handled by this driver. */
#define ETH_MEMIF_ID_ARG		"id"
#define ETH_MEMIF_ROLE_ARG		"role"
#define ETH_MEMIF_PKT_BUFFER_SIZE_ARG	"bsize"
#define ETH_MEMIF_RING_SIZE_ARG		"rsize"
#define ETH_MEMIF_SOCKET_ARG		"socket"
#define ETH_MEMIF_SOCKET_ABSTRACT_ARG	"socket-abstract"
#define ETH_MEMIF_OWNER_UID_ARG		"owner-uid"
#define ETH_MEMIF_OWNER_GID_ARG		"owner-gid"
#define ETH_MEMIF_MAC_ARG		"mac"
#define ETH_MEMIF_ZC_ARG		"zero-copy"
#define ETH_MEMIF_SECRET_ARG		"secret"

/*
 * NULL-terminated list of every argument key defined above.
 * NOTE(review): presumably handed to the kvargs parser as the set of
 * accepted keys - confirm at the call site.
 */
static const char * const valid_arguments[] = {
	ETH_MEMIF_ID_ARG,
	ETH_MEMIF_ROLE_ARG,
	ETH_MEMIF_PKT_BUFFER_SIZE_ARG,
	ETH_MEMIF_RING_SIZE_ARG,
	ETH_MEMIF_SOCKET_ARG,
	ETH_MEMIF_SOCKET_ABSTRACT_ARG,
	ETH_MEMIF_OWNER_UID_ARG,
	ETH_MEMIF_OWNER_GID_ARG,
	ETH_MEMIF_MAC_ARG,
	ETH_MEMIF_ZC_ARG,
	ETH_MEMIF_SECRET_ARG,
	NULL
};
60 
/*
 * Constant link description: 100G, full duplex, autonegotiation enabled,
 * link reported as down.
 * NOTE(review): presumably used as the initial link state of a new port -
 * confirm where it is copied.
 */
static const struct rte_eth_link pmd_link = {
	.link_speed = RTE_ETH_SPEED_NUM_100G,
	.link_duplex = RTE_ETH_LINK_FULL_DUPLEX,
	.link_status = RTE_ETH_LINK_DOWN,
	.link_autoneg = RTE_ETH_LINK_AUTONEG
};
67 
/* Name of the multi-process message used to request a memory region. */
#define MEMIF_MP_SEND_REGION		"memif_mp_send_region"


/*
 * Forward declaration: the function is defined further down but is already
 * needed as the rte_memseg_walk() callback in memif_mp_request_regions().
 */
static int memif_region_init_zc(const struct rte_memseg_list *msl,
				const struct rte_memseg *ms, void *arg);
73 
74 const char *
75 memif_version(void)
76 {
77 	return ("memif-" RTE_STR(MEMIF_VERSION_MAJOR) "." RTE_STR(MEMIF_VERSION_MINOR));
78 }
79 
/*
 * Message header to synchronize regions.
 *
 * Carried in the param area of the rte_mp_msg, both in the request sent by
 * memif_mp_request_regions() and in the reply built by memif_mp_send_region().
 */
struct mp_region_msg {
	/* name of the port whose region is requested */
	char port_name[RTE_DEV_NAME_MAX_LEN];
	/* index of the requested region */
	memif_region_index_t idx;
	/* region size, filled in by the reply; stays 0 if the region does not exist */
	memif_region_size_t size;
};
86 
87 static int
88 memif_mp_send_region(const struct rte_mp_msg *msg, const void *peer)
89 {
90 	struct rte_eth_dev *dev;
91 	struct pmd_process_private *proc_private;
92 	const struct mp_region_msg *msg_param = (const struct mp_region_msg *)msg->param;
93 	struct rte_mp_msg reply;
94 	struct mp_region_msg *reply_param = (struct mp_region_msg *)reply.param;
95 
96 	/* Get requested port */
97 	dev = rte_eth_dev_get_by_name(msg_param->port_name);
98 	if (!dev) {
99 		MIF_LOG(ERR, "Failed to get port id for %s",
100 			msg_param->port_name);
101 		return -1;
102 	}
103 	proc_private = dev->process_private;
104 
105 	memset(&reply, 0, sizeof(reply));
106 	strlcpy(reply.name, msg->name, sizeof(reply.name));
107 	reply_param->idx = msg_param->idx;
108 	if (proc_private->regions[msg_param->idx] != NULL) {
109 		reply_param->size = proc_private->regions[msg_param->idx]->region_size;
110 		reply.fds[0] = proc_private->regions[msg_param->idx]->fd;
111 		reply.num_fds = 1;
112 	}
113 	reply.len_param = sizeof(*reply_param);
114 	if (rte_mp_reply(&reply, peer) < 0) {
115 		MIF_LOG(ERR, "Failed to reply to an add region request");
116 		return -1;
117 	}
118 
119 	return 0;
120 }
121 
122 /*
123  * Request regions
124  * Called by secondary process, when ports link status goes up.
125  */
126 static int
127 memif_mp_request_regions(struct rte_eth_dev *dev)
128 {
129 	int ret, i;
130 	struct timespec timeout = {.tv_sec = 5, .tv_nsec = 0};
131 	struct rte_mp_msg msg, *reply;
132 	struct rte_mp_reply replies;
133 	struct mp_region_msg *msg_param = (struct mp_region_msg *)msg.param;
134 	struct mp_region_msg *reply_param;
135 	struct memif_region *r;
136 	struct pmd_process_private *proc_private = dev->process_private;
137 	struct pmd_internals *pmd = dev->data->dev_private;
138 	/* in case of zero-copy client, only request region 0 */
139 	uint16_t max_region_num = (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) ?
140 				   1 : ETH_MEMIF_MAX_REGION_NUM;
141 
142 	MIF_LOG(DEBUG, "Requesting memory regions");
143 
144 	for (i = 0; i < max_region_num; i++) {
145 		/* Prepare the message */
146 		memset(&msg, 0, sizeof(msg));
147 		strlcpy(msg.name, MEMIF_MP_SEND_REGION, sizeof(msg.name));
148 		strlcpy(msg_param->port_name, dev->data->name,
149 			sizeof(msg_param->port_name));
150 		msg_param->idx = i;
151 		msg.len_param = sizeof(*msg_param);
152 
153 		/* Send message */
154 		ret = rte_mp_request_sync(&msg, &replies, &timeout);
155 		if (ret < 0 || replies.nb_received != 1) {
156 			MIF_LOG(ERR, "Failed to send mp msg: %d",
157 				rte_errno);
158 			return -1;
159 		}
160 
161 		reply = &replies.msgs[0];
162 		reply_param = (struct mp_region_msg *)reply->param;
163 
164 		if (reply_param->size > 0) {
165 			r = rte_zmalloc("region", sizeof(struct memif_region), 0);
166 			if (r == NULL) {
167 				MIF_LOG(ERR, "Failed to alloc memif region.");
168 				free(reply);
169 				return -ENOMEM;
170 			}
171 			r->region_size = reply_param->size;
172 			if (reply->num_fds < 1) {
173 				MIF_LOG(ERR, "Missing file descriptor.");
174 				free(reply);
175 				return -1;
176 			}
177 			r->fd = reply->fds[0];
178 			r->addr = NULL;
179 
180 			proc_private->regions[reply_param->idx] = r;
181 			proc_private->regions_num++;
182 		}
183 		free(reply);
184 	}
185 
186 	if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) {
187 		ret = rte_memseg_walk(memif_region_init_zc, (void *)proc_private);
188 		if (ret < 0)
189 			return ret;
190 	}
191 
192 	return memif_connect(dev);
193 }
194 
195 static int
196 memif_dev_info(struct rte_eth_dev *dev __rte_unused, struct rte_eth_dev_info *dev_info)
197 {
198 	dev_info->max_mac_addrs = 1;
199 	dev_info->max_rx_pktlen = RTE_ETHER_MAX_LEN;
200 	dev_info->max_rx_queues = ETH_MEMIF_MAX_NUM_Q_PAIRS;
201 	dev_info->max_tx_queues = ETH_MEMIF_MAX_NUM_Q_PAIRS;
202 	dev_info->min_rx_bufsize = 0;
203 	dev_info->tx_offload_capa = RTE_ETH_TX_OFFLOAD_MULTI_SEGS;
204 
205 	return 0;
206 }
207 
208 static memif_ring_t *
209 memif_get_ring(struct pmd_internals *pmd, struct pmd_process_private *proc_private,
210 	       memif_ring_type_t type, uint16_t ring_num)
211 {
212 	/* rings only in region 0 */
213 	void *p = proc_private->regions[0]->addr;
214 	int ring_size = sizeof(memif_ring_t) + sizeof(memif_desc_t) *
215 	    (1 << pmd->run.log2_ring_size);
216 
217 	p = (uint8_t *)p + (ring_num + type * pmd->run.num_c2s_rings) * ring_size;
218 
219 	return (memif_ring_t *)p;
220 }
221 
222 static memif_region_offset_t
223 memif_get_ring_offset(struct rte_eth_dev *dev, struct memif_queue *mq,
224 		      memif_ring_type_t type, uint16_t num)
225 {
226 	struct pmd_internals *pmd = dev->data->dev_private;
227 	struct pmd_process_private *proc_private = dev->process_private;
228 
229 	return ((uint8_t *)memif_get_ring(pmd, proc_private, type, num) -
230 		(uint8_t *)proc_private->regions[mq->region]->addr);
231 }
232 
233 static memif_ring_t *
234 memif_get_ring_from_queue(struct pmd_process_private *proc_private,
235 			  struct memif_queue *mq)
236 {
237 	struct memif_region *r;
238 
239 	r = proc_private->regions[mq->region];
240 	if (r == NULL)
241 		return NULL;
242 
243 	return (memif_ring_t *)((uint8_t *)r->addr + mq->ring_offset);
244 }
245 
246 static void *
247 memif_get_buffer(struct pmd_process_private *proc_private, memif_desc_t *d)
248 {
249 	return ((uint8_t *)proc_private->regions[d->region]->addr + d->offset);
250 }
251 
252 /* Free mbufs received by server */
253 static void
254 memif_free_stored_mbufs(struct pmd_process_private *proc_private, struct memif_queue *mq)
255 {
256 	uint16_t cur_tail;
257 	uint16_t mask = (1 << mq->log2_ring_size) - 1;
258 	memif_ring_t *ring = memif_get_ring_from_queue(proc_private, mq);
259 
260 	/* FIXME: improve performance */
261 	/* The ring->tail acts as a guard variable between Tx and Rx
262 	 * threads, so using load-acquire pairs with store-release
263 	 * in function eth_memif_rx for C2S queues.
264 	 */
265 	cur_tail = rte_atomic_load_explicit(&ring->tail, rte_memory_order_acquire);
266 	while (mq->last_tail != cur_tail) {
267 		RTE_MBUF_PREFETCH_TO_FREE(mq->buffers[(mq->last_tail + 1) & mask]);
268 		rte_pktmbuf_free_seg(mq->buffers[mq->last_tail & mask]);
269 		mq->last_tail++;
270 	}
271 }
272 
273 static int
274 memif_pktmbuf_chain(struct rte_mbuf *head, struct rte_mbuf *cur_tail,
275 		    struct rte_mbuf *tail)
276 {
277 	/* Check for number-of-segments-overflow */
278 	if (unlikely(head->nb_segs + tail->nb_segs > RTE_MBUF_MAX_NB_SEGS))
279 		return -EOVERFLOW;
280 
281 	/* Chain 'tail' onto the old tail */
282 	cur_tail->next = tail;
283 
284 	/* accumulate number of segments and total length. */
285 	head->nb_segs = (uint16_t)(head->nb_segs + tail->nb_segs);
286 
287 	tail->pkt_len = tail->data_len;
288 	head->pkt_len += tail->pkt_len;
289 
290 	return 0;
291 }
292 
293 static uint16_t
294 eth_memif_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
295 {
296 	struct memif_queue *mq = queue;
297 	struct pmd_internals *pmd = rte_eth_devices[mq->in_port].data->dev_private;
298 	struct pmd_process_private *proc_private =
299 		rte_eth_devices[mq->in_port].process_private;
300 	memif_ring_t *ring = memif_get_ring_from_queue(proc_private, mq);
301 	uint16_t cur_slot, last_slot, n_slots, ring_size, mask, s0;
302 	uint16_t pkts, rx_pkts, n_rx_pkts = 0;
303 	uint16_t mbuf_size = rte_pktmbuf_data_room_size(mq->mempool) -
304 		RTE_PKTMBUF_HEADROOM;
305 	uint16_t src_len, src_off, dst_len, dst_off, cp_len;
306 	memif_ring_type_t type = mq->type;
307 	memif_desc_t *d0;
308 	struct rte_mbuf *mbuf, *mbuf_head, *mbuf_tail;
309 	uint64_t b;
310 	ssize_t size __rte_unused;
311 	uint16_t head;
312 	int ret;
313 	struct rte_eth_link link;
314 
315 	if (unlikely((pmd->flags & ETH_MEMIF_FLAG_CONNECTED) == 0))
316 		return 0;
317 	if (unlikely(ring == NULL)) {
318 		/* Secondary process will attempt to request regions. */
319 		ret = rte_eth_link_get(mq->in_port, &link);
320 		if (ret < 0)
321 			MIF_LOG(ERR, "Failed to get port %u link info: %s",
322 				mq->in_port, rte_strerror(-ret));
323 		return 0;
324 	}
325 
326 	/* consume interrupt */
327 	if (((ring->flags & MEMIF_RING_FLAG_MASK_INT) == 0) &&
328 	    (rte_intr_fd_get(mq->intr_handle) >= 0))
329 		size = read(rte_intr_fd_get(mq->intr_handle), &b,
330 			    sizeof(b));
331 
332 	ring_size = 1 << mq->log2_ring_size;
333 	mask = ring_size - 1;
334 
335 	if (type == MEMIF_RING_C2S) {
336 		cur_slot = mq->last_head;
337 		last_slot = rte_atomic_load_explicit(&ring->head, rte_memory_order_acquire);
338 	} else {
339 		cur_slot = mq->last_tail;
340 		last_slot = rte_atomic_load_explicit(&ring->tail, rte_memory_order_acquire);
341 	}
342 
343 	if (cur_slot == last_slot)
344 		goto refill;
345 	n_slots = last_slot - cur_slot;
346 
347 	if (likely(mbuf_size >= pmd->cfg.pkt_buffer_size)) {
348 		struct rte_mbuf *mbufs[MAX_PKT_BURST];
349 next_bulk:
350 		ret = rte_pktmbuf_alloc_bulk(mq->mempool, mbufs, MAX_PKT_BURST);
351 		if (unlikely(ret < 0))
352 			goto no_free_bufs;
353 
354 		rx_pkts = 0;
355 		pkts = nb_pkts < MAX_PKT_BURST ? nb_pkts : MAX_PKT_BURST;
356 		while (n_slots && rx_pkts < pkts) {
357 			mbuf_head = mbufs[rx_pkts];
358 			mbuf = mbuf_head;
359 
360 next_slot1:
361 			mbuf->port = mq->in_port;
362 			s0 = cur_slot & mask;
363 			d0 = &ring->desc[s0];
364 
365 			cp_len = d0->length;
366 
367 			rte_pktmbuf_data_len(mbuf) = cp_len;
368 			rte_pktmbuf_pkt_len(mbuf) = cp_len;
369 			if (mbuf != mbuf_head)
370 				rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
371 
372 			rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
373 				(uint8_t *)memif_get_buffer(proc_private, d0), cp_len);
374 
375 			cur_slot++;
376 			n_slots--;
377 
378 			if (d0->flags & MEMIF_DESC_FLAG_NEXT) {
379 				mbuf_tail = mbuf;
380 				mbuf = rte_pktmbuf_alloc(mq->mempool);
381 				if (unlikely(mbuf == NULL)) {
382 					rte_pktmbuf_free_bulk(mbufs + rx_pkts,
383 							MAX_PKT_BURST - rx_pkts);
384 					goto no_free_bufs;
385 				}
386 				ret = memif_pktmbuf_chain(mbuf_head, mbuf_tail, mbuf);
387 				if (unlikely(ret < 0)) {
388 					MIF_LOG(ERR, "number-of-segments-overflow");
389 					rte_pktmbuf_free(mbuf);
390 					rte_pktmbuf_free_bulk(mbufs + rx_pkts,
391 							MAX_PKT_BURST - rx_pkts);
392 					goto no_free_bufs;
393 				}
394 				goto next_slot1;
395 			}
396 
397 			mq->n_bytes += rte_pktmbuf_pkt_len(mbuf_head);
398 			*bufs++ = mbuf_head;
399 			rx_pkts++;
400 			n_rx_pkts++;
401 		}
402 
403 		if (rx_pkts < MAX_PKT_BURST) {
404 			rte_pktmbuf_free_bulk(mbufs + rx_pkts, MAX_PKT_BURST - rx_pkts);
405 		} else {
406 			nb_pkts -= rx_pkts;
407 			if (nb_pkts)
408 				goto next_bulk;
409 		}
410 	} else {
411 		while (n_slots && n_rx_pkts < nb_pkts) {
412 			mbuf_head = rte_pktmbuf_alloc(mq->mempool);
413 			if (unlikely(mbuf_head == NULL))
414 				goto no_free_bufs;
415 			mbuf = mbuf_head;
416 			mbuf->port = mq->in_port;
417 
418 next_slot2:
419 			s0 = cur_slot & mask;
420 			d0 = &ring->desc[s0];
421 
422 			src_len = d0->length;
423 			dst_off = 0;
424 			src_off = 0;
425 
426 			do {
427 				dst_len = mbuf_size - dst_off;
428 				if (dst_len == 0) {
429 					dst_off = 0;
430 					dst_len = mbuf_size;
431 
432 					/* store pointer to tail */
433 					mbuf_tail = mbuf;
434 					mbuf = rte_pktmbuf_alloc(mq->mempool);
435 					if (unlikely(mbuf == NULL))
436 						goto no_free_bufs;
437 					mbuf->port = mq->in_port;
438 					ret = memif_pktmbuf_chain(mbuf_head, mbuf_tail, mbuf);
439 					if (unlikely(ret < 0)) {
440 						MIF_LOG(ERR, "number-of-segments-overflow");
441 						rte_pktmbuf_free(mbuf);
442 						goto no_free_bufs;
443 					}
444 				}
445 				cp_len = RTE_MIN(dst_len, src_len);
446 
447 				rte_pktmbuf_data_len(mbuf) += cp_len;
448 				rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf);
449 				if (mbuf != mbuf_head)
450 					rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
451 
452 				rte_memcpy(rte_pktmbuf_mtod_offset(mbuf, void *,
453 								   dst_off),
454 					(uint8_t *)memif_get_buffer(proc_private, d0) +
455 					src_off, cp_len);
456 
457 				src_off += cp_len;
458 				dst_off += cp_len;
459 				src_len -= cp_len;
460 			} while (src_len);
461 
462 			cur_slot++;
463 			n_slots--;
464 
465 			if (d0->flags & MEMIF_DESC_FLAG_NEXT)
466 				goto next_slot2;
467 
468 			mq->n_bytes += rte_pktmbuf_pkt_len(mbuf_head);
469 			*bufs++ = mbuf_head;
470 			n_rx_pkts++;
471 		}
472 	}
473 
474 no_free_bufs:
475 	if (type == MEMIF_RING_C2S) {
476 		rte_atomic_store_explicit(&ring->tail, cur_slot, rte_memory_order_release);
477 		mq->last_head = cur_slot;
478 	} else {
479 		mq->last_tail = cur_slot;
480 	}
481 
482 refill:
483 	if (type == MEMIF_RING_S2C) {
484 		/* ring->head is updated by the receiver and this function
485 		 * is called in the context of receiver thread. The loads in
486 		 * the receiver do not need to synchronize with its own stores.
487 		 */
488 		head = rte_atomic_load_explicit(&ring->head, rte_memory_order_relaxed);
489 		n_slots = ring_size - head + mq->last_tail;
490 
491 		while (n_slots--) {
492 			s0 = head++ & mask;
493 			d0 = &ring->desc[s0];
494 			d0->length = pmd->run.pkt_buffer_size;
495 		}
496 		rte_atomic_store_explicit(&ring->head, head, rte_memory_order_release);
497 	}
498 
499 	mq->n_pkts += n_rx_pkts;
500 	return n_rx_pkts;
501 }
502 
503 static uint16_t
504 eth_memif_rx_zc(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
505 {
506 	struct memif_queue *mq = queue;
507 	struct pmd_internals *pmd = rte_eth_devices[mq->in_port].data->dev_private;
508 	struct pmd_process_private *proc_private =
509 		rte_eth_devices[mq->in_port].process_private;
510 	memif_ring_t *ring = memif_get_ring_from_queue(proc_private, mq);
511 	uint16_t cur_slot, last_slot, n_slots, ring_size, mask, s0, head;
512 	uint16_t n_rx_pkts = 0;
513 	memif_desc_t *d0;
514 	struct rte_mbuf *mbuf, *mbuf_tail;
515 	struct rte_mbuf *mbuf_head = NULL;
516 	int ret;
517 	struct rte_eth_link link;
518 
519 	if (unlikely((pmd->flags & ETH_MEMIF_FLAG_CONNECTED) == 0))
520 		return 0;
521 	if (unlikely(ring == NULL)) {
522 		/* Secondary process will attempt to request regions. */
523 		rte_eth_link_get(mq->in_port, &link);
524 		return 0;
525 	}
526 
527 	/* consume interrupt */
528 	if ((rte_intr_fd_get(mq->intr_handle) >= 0) &&
529 	    ((ring->flags & MEMIF_RING_FLAG_MASK_INT) == 0)) {
530 		uint64_t b;
531 		ssize_t size __rte_unused;
532 		size = read(rte_intr_fd_get(mq->intr_handle), &b,
533 			    sizeof(b));
534 	}
535 
536 	ring_size = 1 << mq->log2_ring_size;
537 	mask = ring_size - 1;
538 
539 	cur_slot = mq->last_tail;
540 	/* The ring->tail acts as a guard variable between Tx and Rx
541 	 * threads, so using load-acquire pairs with store-release
542 	 * to synchronize it between threads.
543 	 */
544 	last_slot = rte_atomic_load_explicit(&ring->tail, rte_memory_order_acquire);
545 	if (cur_slot == last_slot)
546 		goto refill;
547 	n_slots = last_slot - cur_slot;
548 
549 	while (n_slots && n_rx_pkts < nb_pkts) {
550 		s0 = cur_slot & mask;
551 
552 		d0 = &ring->desc[s0];
553 		mbuf_head = mq->buffers[s0];
554 		mbuf = mbuf_head;
555 
556 next_slot:
557 		/* prefetch next descriptor */
558 		if (n_rx_pkts + 1 < nb_pkts)
559 			rte_prefetch0(&ring->desc[(cur_slot + 1) & mask]);
560 
561 		mbuf->port = mq->in_port;
562 		rte_pktmbuf_data_len(mbuf) = d0->length;
563 		rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf);
564 
565 		mq->n_bytes += rte_pktmbuf_data_len(mbuf);
566 
567 		cur_slot++;
568 		n_slots--;
569 		if (d0->flags & MEMIF_DESC_FLAG_NEXT) {
570 			s0 = cur_slot & mask;
571 			d0 = &ring->desc[s0];
572 			mbuf_tail = mbuf;
573 			mbuf = mq->buffers[s0];
574 			ret = memif_pktmbuf_chain(mbuf_head, mbuf_tail, mbuf);
575 			if (unlikely(ret < 0)) {
576 				MIF_LOG(ERR, "number-of-segments-overflow");
577 				goto refill;
578 			}
579 			goto next_slot;
580 		}
581 
582 		*bufs++ = mbuf_head;
583 		n_rx_pkts++;
584 	}
585 
586 	mq->last_tail = cur_slot;
587 
588 /* Supply server with new buffers */
589 refill:
590 	/* ring->head is updated by the receiver and this function
591 	 * is called in the context of receiver thread. The loads in
592 	 * the receiver do not need to synchronize with its own stores.
593 	 */
594 	head = rte_atomic_load_explicit(&ring->head, rte_memory_order_relaxed);
595 	n_slots = ring_size - head + mq->last_tail;
596 
597 	if (n_slots < 32)
598 		goto no_free_mbufs;
599 
600 	ret = rte_pktmbuf_alloc_bulk(mq->mempool, &mq->buffers[head & mask], n_slots);
601 	if (unlikely(ret < 0))
602 		goto no_free_mbufs;
603 
604 	while (n_slots--) {
605 		s0 = head++ & mask;
606 		if (n_slots > 0)
607 			rte_prefetch0(mq->buffers[head & mask]);
608 		d0 = &ring->desc[s0];
609 		/* store buffer header */
610 		mbuf = mq->buffers[s0];
611 		/* populate descriptor */
612 		d0->length = rte_pktmbuf_data_room_size(mq->mempool) -
613 				RTE_PKTMBUF_HEADROOM;
614 		d0->region = 1;
615 		d0->offset = rte_pktmbuf_mtod(mbuf, uint8_t *) -
616 			(uint8_t *)proc_private->regions[d0->region]->addr;
617 	}
618 no_free_mbufs:
619 	/* The ring->head acts as a guard variable between Tx and Rx
620 	 * threads, so using store-release pairs with load-acquire
621 	 * in function eth_memif_tx.
622 	 */
623 	rte_atomic_store_explicit(&ring->head, head, rte_memory_order_release);
624 
625 	mq->n_pkts += n_rx_pkts;
626 
627 	return n_rx_pkts;
628 }
629 
630 static uint16_t
631 eth_memif_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
632 {
633 	struct memif_queue *mq = queue;
634 	struct pmd_internals *pmd = rte_eth_devices[mq->in_port].data->dev_private;
635 	struct pmd_process_private *proc_private =
636 		rte_eth_devices[mq->in_port].process_private;
637 	memif_ring_t *ring = memif_get_ring_from_queue(proc_private, mq);
638 	uint16_t slot, saved_slot, n_free, ring_size, mask, n_tx_pkts = 0;
639 	uint16_t src_len, src_off, dst_len, dst_off, cp_len, nb_segs;
640 	memif_ring_type_t type = mq->type;
641 	memif_desc_t *d0;
642 	struct rte_mbuf *mbuf;
643 	struct rte_mbuf *mbuf_head;
644 	uint64_t a;
645 	ssize_t size;
646 	struct rte_eth_link link;
647 
648 	if (unlikely((pmd->flags & ETH_MEMIF_FLAG_CONNECTED) == 0))
649 		return 0;
650 	if (unlikely(ring == NULL)) {
651 		int ret;
652 
653 		/* Secondary process will attempt to request regions. */
654 		ret = rte_eth_link_get(mq->in_port, &link);
655 		if (ret < 0)
656 			MIF_LOG(ERR, "Failed to get port %u link info: %s",
657 				mq->in_port, rte_strerror(-ret));
658 		return 0;
659 	}
660 
661 	ring_size = 1 << mq->log2_ring_size;
662 	mask = ring_size - 1;
663 
664 	if (type == MEMIF_RING_C2S) {
665 		/* For C2S queues ring->head is updated by the sender and
666 		 * this function is called in the context of sending thread.
667 		 * The loads in the sender do not need to synchronize with
668 		 * its own stores. Hence, the following load can be a
669 		 * relaxed load.
670 		 */
671 		slot = rte_atomic_load_explicit(&ring->head, rte_memory_order_relaxed);
672 		n_free = ring_size - slot +
673 				rte_atomic_load_explicit(&ring->tail, rte_memory_order_acquire);
674 	} else {
675 		/* For S2C queues ring->tail is updated by the sender and
676 		 * this function is called in the context of sending thread.
677 		 * The loads in the sender do not need to synchronize with
678 		 * its own stores. Hence, the following load can be a
679 		 * relaxed load.
680 		 */
681 		slot = rte_atomic_load_explicit(&ring->tail, rte_memory_order_relaxed);
682 		n_free = rte_atomic_load_explicit(&ring->head, rte_memory_order_acquire) - slot;
683 	}
684 
685 	uint16_t i;
686 	struct rte_mbuf **buf_tmp = bufs;
687 	mbuf_head = *buf_tmp++;
688 	struct rte_mempool *mp = mbuf_head->pool;
689 
690 	for (i = 1; i < nb_pkts; i++) {
691 		mbuf_head = *buf_tmp++;
692 		if (mbuf_head->pool != mp)
693 			break;
694 	}
695 
696 	uint16_t mbuf_size = rte_pktmbuf_data_room_size(mp) - RTE_PKTMBUF_HEADROOM;
697 	if (i == nb_pkts && pmd->cfg.pkt_buffer_size >= mbuf_size) {
698 		buf_tmp = bufs;
699 		while (n_tx_pkts < nb_pkts && n_free) {
700 			mbuf_head = *bufs++;
701 			nb_segs = mbuf_head->nb_segs;
702 			mbuf = mbuf_head;
703 
704 			saved_slot = slot;
705 
706 next_in_chain1:
707 			d0 = &ring->desc[slot & mask];
708 			cp_len = rte_pktmbuf_data_len(mbuf);
709 
710 			rte_memcpy((uint8_t *)memif_get_buffer(proc_private, d0),
711 				rte_pktmbuf_mtod(mbuf, void *), cp_len);
712 
713 			d0->length = cp_len;
714 			mq->n_bytes += cp_len;
715 			slot++;
716 			n_free--;
717 
718 			if (--nb_segs > 0) {
719 				if (n_free) {
720 					d0->flags |= MEMIF_DESC_FLAG_NEXT;
721 					mbuf = mbuf->next;
722 					goto next_in_chain1;
723 				} else {
724 					slot = saved_slot;
725 					goto free_mbufs;
726 				}
727 			}
728 
729 			n_tx_pkts++;
730 		}
731 free_mbufs:
732 		rte_pktmbuf_free_bulk(buf_tmp, n_tx_pkts);
733 	} else {
734 		while (n_tx_pkts < nb_pkts && n_free) {
735 			mbuf_head = *bufs++;
736 			nb_segs = mbuf_head->nb_segs;
737 			mbuf = mbuf_head;
738 
739 			saved_slot = slot;
740 			d0 = &ring->desc[slot & mask];
741 			dst_off = 0;
742 			dst_len = (type == MEMIF_RING_C2S) ?
743 				pmd->run.pkt_buffer_size : d0->length;
744 
745 next_in_chain2:
746 			src_off = 0;
747 			src_len = rte_pktmbuf_data_len(mbuf);
748 
749 			while (src_len) {
750 				if (dst_len == 0) {
751 					if (n_free) {
752 						slot++;
753 						n_free--;
754 						d0->flags |= MEMIF_DESC_FLAG_NEXT;
755 						d0 = &ring->desc[slot & mask];
756 						dst_off = 0;
757 						dst_len = (type == MEMIF_RING_C2S) ?
758 						    pmd->run.pkt_buffer_size : d0->length;
759 						d0->flags = 0;
760 					} else {
761 						slot = saved_slot;
762 						goto no_free_slots;
763 					}
764 				}
765 				cp_len = RTE_MIN(dst_len, src_len);
766 
767 				rte_memcpy((uint8_t *)memif_get_buffer(proc_private,
768 								       d0) + dst_off,
769 					rte_pktmbuf_mtod_offset(mbuf, void *, src_off),
770 					cp_len);
771 
772 				mq->n_bytes += cp_len;
773 				src_off += cp_len;
774 				dst_off += cp_len;
775 				src_len -= cp_len;
776 				dst_len -= cp_len;
777 
778 				d0->length = dst_off;
779 			}
780 
781 			if (--nb_segs > 0) {
782 				mbuf = mbuf->next;
783 				goto next_in_chain2;
784 			}
785 
786 			n_tx_pkts++;
787 			slot++;
788 			n_free--;
789 			rte_pktmbuf_free(mbuf_head);
790 		}
791 	}
792 
793 no_free_slots:
794 	if (type == MEMIF_RING_C2S)
795 		rte_atomic_store_explicit(&ring->head, slot, rte_memory_order_release);
796 	else
797 		rte_atomic_store_explicit(&ring->tail, slot, rte_memory_order_release);
798 
799 	if (((ring->flags & MEMIF_RING_FLAG_MASK_INT) == 0) &&
800 	    (rte_intr_fd_get(mq->intr_handle) >= 0)) {
801 		a = 1;
802 		size = write(rte_intr_fd_get(mq->intr_handle), &a,
803 			     sizeof(a));
804 		if (unlikely(size < 0)) {
805 			MIF_LOG(WARNING,
806 				"Failed to send interrupt. %s", strerror(errno));
807 		}
808 	}
809 
810 	mq->n_pkts += n_tx_pkts;
811 	return n_tx_pkts;
812 }
813 
814 static int
815 memif_tx_one_zc(struct pmd_process_private *proc_private, struct memif_queue *mq,
816 		memif_ring_t *ring, struct rte_mbuf *mbuf, const uint16_t mask,
817 		uint16_t slot, uint16_t n_free)
818 {
819 	memif_desc_t *d0;
820 	uint16_t nb_segs = mbuf->nb_segs;
821 	int used_slots = 1;
822 
823 next_in_chain:
824 	/* store pointer to mbuf to free it later */
825 	mq->buffers[slot & mask] = mbuf;
826 	/* populate descriptor */
827 	d0 = &ring->desc[slot & mask];
828 	d0->length = rte_pktmbuf_data_len(mbuf);
829 	mq->n_bytes += rte_pktmbuf_data_len(mbuf);
830 	/* FIXME: get region index */
831 	d0->region = 1;
832 	d0->offset = rte_pktmbuf_mtod(mbuf, uint8_t *) -
833 		(uint8_t *)proc_private->regions[d0->region]->addr;
834 	d0->flags = 0;
835 
836 	/* check if buffer is chained */
837 	if (--nb_segs > 0) {
838 		if (n_free < 2)
839 			return 0;
840 		/* mark buffer as chained */
841 		d0->flags |= MEMIF_DESC_FLAG_NEXT;
842 		/* advance mbuf */
843 		mbuf = mbuf->next;
844 		/* update counters */
845 		used_slots++;
846 		slot++;
847 		n_free--;
848 		goto next_in_chain;
849 	}
850 	return used_slots;
851 }
852 
853 static uint16_t
854 eth_memif_tx_zc(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
855 {
856 	struct memif_queue *mq = queue;
857 	struct pmd_internals *pmd = rte_eth_devices[mq->in_port].data->dev_private;
858 	struct pmd_process_private *proc_private =
859 		rte_eth_devices[mq->in_port].process_private;
860 	memif_ring_t *ring = memif_get_ring_from_queue(proc_private, mq);
861 	uint16_t slot, n_free, ring_size, mask, n_tx_pkts = 0;
862 	struct rte_eth_link link;
863 
864 	if (unlikely((pmd->flags & ETH_MEMIF_FLAG_CONNECTED) == 0))
865 		return 0;
866 	if (unlikely(ring == NULL)) {
867 		/* Secondary process will attempt to request regions. */
868 		rte_eth_link_get(mq->in_port, &link);
869 		return 0;
870 	}
871 
872 	ring_size = 1 << mq->log2_ring_size;
873 	mask = ring_size - 1;
874 
875 	/* free mbufs received by server */
876 	memif_free_stored_mbufs(proc_private, mq);
877 
878 	/* ring type always MEMIF_RING_C2S */
879 	/* For C2S queues ring->head is updated by the sender and
880 	 * this function is called in the context of sending thread.
881 	 * The loads in the sender do not need to synchronize with
882 	 * its own stores. Hence, the following load can be a
883 	 * relaxed load.
884 	 */
885 	slot = rte_atomic_load_explicit(&ring->head, rte_memory_order_relaxed);
886 	n_free = ring_size - slot + mq->last_tail;
887 
888 	int used_slots;
889 
890 	while (n_free && (n_tx_pkts < nb_pkts)) {
891 		while ((n_free > 4) && ((nb_pkts - n_tx_pkts) > 4)) {
892 			if ((nb_pkts - n_tx_pkts) > 8) {
893 				rte_prefetch0(*bufs + 4);
894 				rte_prefetch0(*bufs + 5);
895 				rte_prefetch0(*bufs + 6);
896 				rte_prefetch0(*bufs + 7);
897 			}
898 			used_slots = memif_tx_one_zc(proc_private, mq, ring, *bufs++,
899 				mask, slot, n_free);
900 			if (unlikely(used_slots < 1))
901 				goto no_free_slots;
902 			n_tx_pkts++;
903 			slot += used_slots;
904 			n_free -= used_slots;
905 
906 			used_slots = memif_tx_one_zc(proc_private, mq, ring, *bufs++,
907 				mask, slot, n_free);
908 			if (unlikely(used_slots < 1))
909 				goto no_free_slots;
910 			n_tx_pkts++;
911 			slot += used_slots;
912 			n_free -= used_slots;
913 
914 			used_slots = memif_tx_one_zc(proc_private, mq, ring, *bufs++,
915 				mask, slot, n_free);
916 			if (unlikely(used_slots < 1))
917 				goto no_free_slots;
918 			n_tx_pkts++;
919 			slot += used_slots;
920 			n_free -= used_slots;
921 
922 			used_slots = memif_tx_one_zc(proc_private, mq, ring, *bufs++,
923 				mask, slot, n_free);
924 			if (unlikely(used_slots < 1))
925 				goto no_free_slots;
926 			n_tx_pkts++;
927 			slot += used_slots;
928 			n_free -= used_slots;
929 		}
930 		used_slots = memif_tx_one_zc(proc_private, mq, ring, *bufs++,
931 			mask, slot, n_free);
932 		if (unlikely(used_slots < 1))
933 			goto no_free_slots;
934 		n_tx_pkts++;
935 		slot += used_slots;
936 		n_free -= used_slots;
937 	}
938 
939 no_free_slots:
940 	/* ring type always MEMIF_RING_C2S */
941 	/* The ring->head acts as a guard variable between Tx and Rx
942 	 * threads, so using store-release pairs with load-acquire
943 	 * in function eth_memif_rx for C2S rings.
944 	 */
945 	rte_atomic_store_explicit(&ring->head, slot, rte_memory_order_release);
946 
947 	/* Send interrupt, if enabled. */
948 	if ((ring->flags & MEMIF_RING_FLAG_MASK_INT) == 0) {
949 		uint64_t a = 1;
950 		if (rte_intr_fd_get(mq->intr_handle) < 0)
951 			return -1;
952 
953 		ssize_t size = write(rte_intr_fd_get(mq->intr_handle),
954 				     &a, sizeof(a));
955 		if (unlikely(size < 0)) {
956 			MIF_LOG(WARNING,
957 				"Failed to send interrupt. %s", strerror(errno));
958 		}
959 	}
960 
961 	/* increment queue counters */
962 	mq->n_pkts += n_tx_pkts;
963 
964 	return n_tx_pkts;
965 }
966 
967 void
968 memif_free_regions(struct rte_eth_dev *dev)
969 {
970 	struct pmd_process_private *proc_private = dev->process_private;
971 	struct pmd_internals *pmd = dev->data->dev_private;
972 	int i;
973 	struct memif_region *r;
974 
975 	/* regions are allocated contiguously, so it's
976 	 * enough to loop until 'proc_private->regions_num'
977 	 */
978 	for (i = 0; i < proc_private->regions_num; i++) {
979 		r = proc_private->regions[i];
980 		if (r != NULL) {
981 			/* This is memzone */
982 			if (i > 0 && (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY)) {
983 				r->addr = NULL;
984 				if (r->fd > 0)
985 					close(r->fd);
986 			}
987 			if (r->addr != NULL) {
988 				munmap(r->addr, r->region_size);
989 				if (r->fd > 0) {
990 					close(r->fd);
991 					r->fd = -1;
992 				}
993 			}
994 			rte_free(r);
995 			proc_private->regions[i] = NULL;
996 		}
997 	}
998 	proc_private->regions_num = 0;
999 }
1000 
1001 static int
1002 memif_region_init_zc(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
1003 		     void *arg)
1004 {
1005 	struct pmd_process_private *proc_private = (struct pmd_process_private *)arg;
1006 	struct memif_region *r;
1007 
1008 	if (proc_private->regions_num < 1) {
1009 		MIF_LOG(ERR, "Missing descriptor region");
1010 		return -1;
1011 	}
1012 
1013 	r = proc_private->regions[proc_private->regions_num - 1];
1014 
1015 	if (r->addr != msl->base_va)
1016 		r = proc_private->regions[++proc_private->regions_num - 1];
1017 
1018 	if (r == NULL) {
1019 		r = rte_zmalloc("region", sizeof(struct memif_region), 0);
1020 		if (r == NULL) {
1021 			MIF_LOG(ERR, "Failed to alloc memif region.");
1022 			return -ENOMEM;
1023 		}
1024 
1025 		r->addr = msl->base_va;
1026 		r->region_size = ms->len;
1027 		r->fd = rte_memseg_get_fd(ms);
1028 		if (r->fd < 0)
1029 			return -1;
1030 		r->pkt_buffer_offset = 0;
1031 
1032 		proc_private->regions[proc_private->regions_num - 1] = r;
1033 	} else {
1034 		r->region_size += ms->len;
1035 	}
1036 
1037 	return 0;
1038 }
1039 
1040 static int
1041 memif_region_init_shm(struct rte_eth_dev *dev, uint8_t has_buffers)
1042 {
1043 	struct pmd_internals *pmd = dev->data->dev_private;
1044 	struct pmd_process_private *proc_private = dev->process_private;
1045 	char shm_name[ETH_MEMIF_SHM_NAME_SIZE];
1046 	int ret = 0;
1047 	struct memif_region *r;
1048 
1049 	if (proc_private->regions_num >= ETH_MEMIF_MAX_REGION_NUM) {
1050 		MIF_LOG(ERR, "Too many regions.");
1051 		return -1;
1052 	}
1053 
1054 	r = rte_zmalloc("region", sizeof(struct memif_region), 0);
1055 	if (r == NULL) {
1056 		MIF_LOG(ERR, "Failed to alloc memif region.");
1057 		return -ENOMEM;
1058 	}
1059 
1060 	/* calculate buffer offset */
1061 	r->pkt_buffer_offset = (pmd->run.num_c2s_rings + pmd->run.num_s2c_rings) *
1062 	    (sizeof(memif_ring_t) + sizeof(memif_desc_t) *
1063 	    (1 << pmd->run.log2_ring_size));
1064 
1065 	r->region_size = r->pkt_buffer_offset;
1066 	/* if region has buffers, add buffers size to region_size */
1067 	if (has_buffers == 1)
1068 		r->region_size += (uint32_t)(pmd->run.pkt_buffer_size *
1069 			(1 << pmd->run.log2_ring_size) *
1070 			(pmd->run.num_c2s_rings +
1071 			 pmd->run.num_s2c_rings));
1072 
1073 	memset(shm_name, 0, sizeof(char) * ETH_MEMIF_SHM_NAME_SIZE);
1074 	snprintf(shm_name, ETH_MEMIF_SHM_NAME_SIZE, "memif_region_%d",
1075 		 proc_private->regions_num);
1076 
1077 	r->fd = memfd_create(shm_name, MFD_ALLOW_SEALING);
1078 	if (r->fd < 0) {
1079 		MIF_LOG(ERR, "Failed to create shm file: %s.", strerror(errno));
1080 		ret = -1;
1081 		goto error;
1082 	}
1083 
1084 	ret = fcntl(r->fd, F_ADD_SEALS, F_SEAL_SHRINK);
1085 	if (ret < 0) {
1086 		MIF_LOG(ERR, "Failed to add seals to shm file: %s.", strerror(errno));
1087 		goto error;
1088 	}
1089 
1090 	ret = ftruncate(r->fd, r->region_size);
1091 	if (ret < 0) {
1092 		MIF_LOG(ERR, "Failed to truncate shm file: %s.", strerror(errno));
1093 		goto error;
1094 	}
1095 
1096 	r->addr = mmap(NULL, r->region_size, PROT_READ |
1097 		       PROT_WRITE, MAP_SHARED, r->fd, 0);
1098 	if (r->addr == MAP_FAILED) {
1099 		MIF_LOG(ERR, "Failed to mmap shm region: %s.", strerror(ret));
1100 		ret = -1;
1101 		goto error;
1102 	}
1103 
1104 	proc_private->regions[proc_private->regions_num] = r;
1105 	proc_private->regions_num++;
1106 
1107 	return ret;
1108 
1109 error:
1110 	if (r->fd > 0)
1111 		close(r->fd);
1112 	r->fd = -1;
1113 
1114 	return ret;
1115 }
1116 
1117 static int
1118 memif_regions_init(struct rte_eth_dev *dev)
1119 {
1120 	struct pmd_internals *pmd = dev->data->dev_private;
1121 	int ret;
1122 
1123 	/*
1124 	 * Zero-copy exposes dpdk memory.
1125 	 * Each memseg list will be represented by memif region.
1126 	 * Zero-copy regions indexing: memseg list idx + 1,
1127 	 * as we already have region 0 reserved for descriptors.
1128 	 */
1129 	if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) {
1130 		/* create region idx 0 containing descriptors */
1131 		ret = memif_region_init_shm(dev, 0);
1132 		if (ret < 0)
1133 			return ret;
1134 		ret = rte_memseg_walk(memif_region_init_zc, (void *)dev->process_private);
1135 		if (ret < 0)
1136 			return ret;
1137 	} else {
1138 		/* create one memory region containing rings and buffers */
1139 		ret = memif_region_init_shm(dev, /* has buffers */ 1);
1140 		if (ret < 0)
1141 			return ret;
1142 	}
1143 
1144 	return 0;
1145 }
1146 
1147 static void
1148 memif_init_rings(struct rte_eth_dev *dev)
1149 {
1150 	struct pmd_internals *pmd = dev->data->dev_private;
1151 	struct pmd_process_private *proc_private = dev->process_private;
1152 	memif_ring_t *ring;
1153 	int i, j;
1154 	uint16_t slot;
1155 
1156 	for (i = 0; i < pmd->run.num_c2s_rings; i++) {
1157 		ring = memif_get_ring(pmd, proc_private, MEMIF_RING_C2S, i);
1158 		rte_atomic_store_explicit(&ring->head, 0, rte_memory_order_relaxed);
1159 		rte_atomic_store_explicit(&ring->tail, 0, rte_memory_order_relaxed);
1160 		ring->cookie = MEMIF_COOKIE;
1161 		ring->flags = 0;
1162 
1163 		if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY)
1164 			continue;
1165 
1166 		for (j = 0; j < (1 << pmd->run.log2_ring_size); j++) {
1167 			slot = i * (1 << pmd->run.log2_ring_size) + j;
1168 			ring->desc[j].region = 0;
1169 			ring->desc[j].offset =
1170 				proc_private->regions[0]->pkt_buffer_offset +
1171 				(uint32_t)(slot * pmd->run.pkt_buffer_size);
1172 			ring->desc[j].length = pmd->run.pkt_buffer_size;
1173 		}
1174 	}
1175 
1176 	for (i = 0; i < pmd->run.num_s2c_rings; i++) {
1177 		ring = memif_get_ring(pmd, proc_private, MEMIF_RING_S2C, i);
1178 		rte_atomic_store_explicit(&ring->head, 0, rte_memory_order_relaxed);
1179 		rte_atomic_store_explicit(&ring->tail, 0, rte_memory_order_relaxed);
1180 		ring->cookie = MEMIF_COOKIE;
1181 		ring->flags = 0;
1182 
1183 		if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY)
1184 			continue;
1185 
1186 		for (j = 0; j < (1 << pmd->run.log2_ring_size); j++) {
1187 			slot = (i + pmd->run.num_c2s_rings) *
1188 			    (1 << pmd->run.log2_ring_size) + j;
1189 			ring->desc[j].region = 0;
1190 			ring->desc[j].offset =
1191 				proc_private->regions[0]->pkt_buffer_offset +
1192 				(uint32_t)(slot * pmd->run.pkt_buffer_size);
1193 			ring->desc[j].length = pmd->run.pkt_buffer_size;
1194 		}
1195 	}
1196 }
1197 
1198 /* called only by client */
1199 static int
1200 memif_init_queues(struct rte_eth_dev *dev)
1201 {
1202 	struct pmd_internals *pmd = dev->data->dev_private;
1203 	struct memif_queue *mq;
1204 	int i;
1205 
1206 	for (i = 0; i < pmd->run.num_c2s_rings; i++) {
1207 		mq = dev->data->tx_queues[i];
1208 		mq->log2_ring_size = pmd->run.log2_ring_size;
1209 		/* queues located only in region 0 */
1210 		mq->region = 0;
1211 		mq->ring_offset = memif_get_ring_offset(dev, mq, MEMIF_RING_C2S, i);
1212 		mq->last_head = 0;
1213 		mq->last_tail = 0;
1214 		if (rte_intr_fd_set(mq->intr_handle, eventfd(0, EFD_NONBLOCK)))
1215 			return -rte_errno;
1216 
1217 		if (rte_intr_fd_get(mq->intr_handle) < 0) {
1218 			MIF_LOG(WARNING,
1219 				"Failed to create eventfd for tx queue %d: %s.", i,
1220 				strerror(errno));
1221 		}
1222 		mq->buffers = NULL;
1223 		if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) {
1224 			mq->buffers = rte_zmalloc("bufs", sizeof(struct rte_mbuf *) *
1225 						  (1 << mq->log2_ring_size), 0);
1226 			if (mq->buffers == NULL)
1227 				return -ENOMEM;
1228 		}
1229 	}
1230 
1231 	for (i = 0; i < pmd->run.num_s2c_rings; i++) {
1232 		mq = dev->data->rx_queues[i];
1233 		mq->log2_ring_size = pmd->run.log2_ring_size;
1234 		/* queues located only in region 0 */
1235 		mq->region = 0;
1236 		mq->ring_offset = memif_get_ring_offset(dev, mq, MEMIF_RING_S2C, i);
1237 		mq->last_head = 0;
1238 		mq->last_tail = 0;
1239 		if (rte_intr_fd_set(mq->intr_handle, eventfd(0, EFD_NONBLOCK)))
1240 			return -rte_errno;
1241 		if (rte_intr_fd_get(mq->intr_handle) < 0) {
1242 			MIF_LOG(WARNING,
1243 				"Failed to create eventfd for rx queue %d: %s.", i,
1244 				strerror(errno));
1245 		}
1246 		mq->buffers = NULL;
1247 		if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) {
1248 			mq->buffers = rte_zmalloc("bufs", sizeof(struct rte_mbuf *) *
1249 						  (1 << mq->log2_ring_size), 0);
1250 			if (mq->buffers == NULL)
1251 				return -ENOMEM;
1252 		}
1253 	}
1254 	return 0;
1255 }
1256 
/*
 * Set up regions, rings and queues, in that order.
 * Returns 0 on success or the negative value reported by the failing step.
 */
int
memif_init_regions_and_queues(struct rte_eth_dev *dev)
{
	int rc = memif_regions_init(dev);

	if (rc >= 0) {
		/* rings live inside the regions created above */
		memif_init_rings(dev);

		rc = memif_init_queues(dev);
		if (rc >= 0)
			rc = 0;
	}

	return rc;
}
1274 
1275 int
1276 memif_connect(struct rte_eth_dev *dev)
1277 {
1278 	struct pmd_internals *pmd = dev->data->dev_private;
1279 	struct pmd_process_private *proc_private = dev->process_private;
1280 	struct memif_region *mr;
1281 	struct memif_queue *mq;
1282 	memif_ring_t *ring;
1283 	int i;
1284 
1285 	for (i = 0; i < proc_private->regions_num; i++) {
1286 		mr = proc_private->regions[i];
1287 		if (mr != NULL) {
1288 			if (mr->addr == NULL) {
1289 				if (mr->fd < 0)
1290 					return -1;
1291 				mr->addr = mmap(NULL, mr->region_size,
1292 						PROT_READ | PROT_WRITE,
1293 						MAP_SHARED, mr->fd, 0);
1294 				if (mr->addr == MAP_FAILED) {
1295 					MIF_LOG(ERR, "mmap failed: %s",
1296 						strerror(errno));
1297 					return -1;
1298 				}
1299 			}
1300 			if (i > 0 && (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY)) {
1301 				/* close memseg file */
1302 				close(mr->fd);
1303 				mr->fd = -1;
1304 			}
1305 		}
1306 	}
1307 
1308 	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
1309 		for (i = 0; i < pmd->run.num_c2s_rings; i++) {
1310 			mq = (pmd->role == MEMIF_ROLE_CLIENT) ?
1311 			    dev->data->tx_queues[i] : dev->data->rx_queues[i];
1312 			ring = memif_get_ring_from_queue(proc_private, mq);
1313 			if (ring == NULL || ring->cookie != MEMIF_COOKIE) {
1314 				MIF_LOG(ERR, "Wrong ring");
1315 				return -1;
1316 			}
1317 			rte_atomic_store_explicit(&ring->head, 0, rte_memory_order_relaxed);
1318 			rte_atomic_store_explicit(&ring->tail, 0, rte_memory_order_relaxed);
1319 			mq->last_head = 0;
1320 			mq->last_tail = 0;
1321 			/* enable polling mode */
1322 			if (pmd->role == MEMIF_ROLE_SERVER)
1323 				ring->flags = MEMIF_RING_FLAG_MASK_INT;
1324 		}
1325 		for (i = 0; i < pmd->run.num_s2c_rings; i++) {
1326 			mq = (pmd->role == MEMIF_ROLE_CLIENT) ?
1327 			    dev->data->rx_queues[i] : dev->data->tx_queues[i];
1328 			ring = memif_get_ring_from_queue(proc_private, mq);
1329 			if (ring == NULL || ring->cookie != MEMIF_COOKIE) {
1330 				MIF_LOG(ERR, "Wrong ring");
1331 				return -1;
1332 			}
1333 			rte_atomic_store_explicit(&ring->head, 0, rte_memory_order_relaxed);
1334 			rte_atomic_store_explicit(&ring->tail, 0, rte_memory_order_relaxed);
1335 			mq->last_head = 0;
1336 			mq->last_tail = 0;
1337 			/* enable polling mode */
1338 			if (pmd->role == MEMIF_ROLE_CLIENT)
1339 				ring->flags = MEMIF_RING_FLAG_MASK_INT;
1340 		}
1341 
1342 		pmd->flags &= ~ETH_MEMIF_FLAG_CONNECTING;
1343 		pmd->flags |= ETH_MEMIF_FLAG_CONNECTED;
1344 		dev->data->dev_link.link_status = RTE_ETH_LINK_UP;
1345 	}
1346 	MIF_LOG(INFO, "Connected.");
1347 	return 0;
1348 }
1349 
1350 static int
1351 memif_dev_start(struct rte_eth_dev *dev)
1352 {
1353 	struct pmd_internals *pmd = dev->data->dev_private;
1354 	int ret = 0;
1355 	uint16_t i;
1356 
1357 	switch (pmd->role) {
1358 	case MEMIF_ROLE_CLIENT:
1359 		ret = memif_connect_client(dev);
1360 		break;
1361 	case MEMIF_ROLE_SERVER:
1362 		ret = memif_connect_server(dev);
1363 		break;
1364 	default:
1365 		MIF_LOG(ERR, "Unknown role: %d.", pmd->role);
1366 		ret = -1;
1367 		break;
1368 	}
1369 
1370 	if (ret == 0) {
1371 		for (i = 0; i < dev->data->nb_rx_queues; i++)
1372 			dev->data->rx_queue_state[i] = RTE_ETH_QUEUE_STATE_STARTED;
1373 		for (i = 0; i < dev->data->nb_tx_queues; i++)
1374 			dev->data->tx_queue_state[i] = RTE_ETH_QUEUE_STATE_STARTED;
1375 	}
1376 
1377 	return ret;
1378 }
1379 
1380 static int
1381 memif_dev_stop(struct rte_eth_dev *dev)
1382 {
1383 	uint16_t i;
1384 
1385 	memif_disconnect(dev);
1386 
1387 	for (i = 0; i < dev->data->nb_rx_queues; i++)
1388 		dev->data->rx_queue_state[i] = RTE_ETH_QUEUE_STATE_STOPPED;
1389 	for (i = 0; i < dev->data->nb_tx_queues; i++)
1390 		dev->data->tx_queue_state[i] = RTE_ETH_QUEUE_STATE_STOPPED;
1391 
1392 	return 0;
1393 }
1394 
1395 static int
1396 memif_dev_close(struct rte_eth_dev *dev)
1397 {
1398 	struct pmd_internals *pmd = dev->data->dev_private;
1399 	int i;
1400 
1401 	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
1402 		memif_msg_enq_disconnect(pmd->cc, "Device closed", 0);
1403 
1404 		for (i = 0; i < dev->data->nb_rx_queues; i++)
1405 			(*dev->dev_ops->rx_queue_release)(dev, i);
1406 		for (i = 0; i < dev->data->nb_tx_queues; i++)
1407 			(*dev->dev_ops->tx_queue_release)(dev, i);
1408 
1409 		memif_socket_remove_device(dev);
1410 	}
1411 
1412 	rte_free(dev->process_private);
1413 
1414 	return 0;
1415 }
1416 
1417 static int
1418 memif_dev_configure(struct rte_eth_dev *dev)
1419 {
1420 	struct pmd_internals *pmd = dev->data->dev_private;
1421 
1422 	/*
1423 	 * CLIENT - TXQ
1424 	 * SERVER - RXQ
1425 	 */
1426 	pmd->cfg.num_c2s_rings = (pmd->role == MEMIF_ROLE_CLIENT) ?
1427 				  dev->data->nb_tx_queues : dev->data->nb_rx_queues;
1428 
1429 	/*
1430 	 * CLIENT - RXQ
1431 	 * SERVER - TXQ
1432 	 */
1433 	pmd->cfg.num_s2c_rings = (pmd->role == MEMIF_ROLE_CLIENT) ?
1434 				  dev->data->nb_rx_queues : dev->data->nb_tx_queues;
1435 
1436 	return 0;
1437 }
1438 
1439 static int
1440 memif_tx_queue_setup(struct rte_eth_dev *dev,
1441 		     uint16_t qid,
1442 		     uint16_t nb_tx_desc __rte_unused,
1443 		     unsigned int socket_id __rte_unused,
1444 		     const struct rte_eth_txconf *tx_conf __rte_unused)
1445 {
1446 	struct pmd_internals *pmd = dev->data->dev_private;
1447 	struct memif_queue *mq;
1448 
1449 	mq = rte_zmalloc("tx-queue", sizeof(struct memif_queue), 0);
1450 	if (mq == NULL) {
1451 		MIF_LOG(ERR, "Failed to allocate tx queue id: %u", qid);
1452 		return -ENOMEM;
1453 	}
1454 
1455 	/* Allocate interrupt instance */
1456 	mq->intr_handle = rte_intr_instance_alloc(RTE_INTR_INSTANCE_F_SHARED);
1457 	if (mq->intr_handle == NULL) {
1458 		MIF_LOG(ERR, "Failed to allocate intr handle");
1459 		return -ENOMEM;
1460 	}
1461 
1462 	mq->type =
1463 	    (pmd->role == MEMIF_ROLE_CLIENT) ? MEMIF_RING_C2S : MEMIF_RING_S2C;
1464 	mq->n_pkts = 0;
1465 	mq->n_bytes = 0;
1466 
1467 	if (rte_intr_fd_set(mq->intr_handle, -1))
1468 		return -rte_errno;
1469 
1470 	if (rte_intr_type_set(mq->intr_handle, RTE_INTR_HANDLE_EXT))
1471 		return -rte_errno;
1472 
1473 	mq->in_port = dev->data->port_id;
1474 	dev->data->tx_queues[qid] = mq;
1475 
1476 	return 0;
1477 }
1478 
1479 static int
1480 memif_rx_queue_setup(struct rte_eth_dev *dev,
1481 		     uint16_t qid,
1482 		     uint16_t nb_rx_desc __rte_unused,
1483 		     unsigned int socket_id __rte_unused,
1484 		     const struct rte_eth_rxconf *rx_conf __rte_unused,
1485 		     struct rte_mempool *mb_pool)
1486 {
1487 	struct pmd_internals *pmd = dev->data->dev_private;
1488 	struct memif_queue *mq;
1489 
1490 	mq = rte_zmalloc("rx-queue", sizeof(struct memif_queue), 0);
1491 	if (mq == NULL) {
1492 		MIF_LOG(ERR, "Failed to allocate rx queue id: %u", qid);
1493 		return -ENOMEM;
1494 	}
1495 
1496 	/* Allocate interrupt instance */
1497 	mq->intr_handle = rte_intr_instance_alloc(RTE_INTR_INSTANCE_F_SHARED);
1498 	if (mq->intr_handle == NULL) {
1499 		MIF_LOG(ERR, "Failed to allocate intr handle");
1500 		return -ENOMEM;
1501 	}
1502 
1503 	mq->type = (pmd->role == MEMIF_ROLE_CLIENT) ? MEMIF_RING_S2C : MEMIF_RING_C2S;
1504 	mq->n_pkts = 0;
1505 	mq->n_bytes = 0;
1506 
1507 	if (rte_intr_fd_set(mq->intr_handle, -1))
1508 		return -rte_errno;
1509 
1510 	if (rte_intr_type_set(mq->intr_handle, RTE_INTR_HANDLE_EXT))
1511 		return -rte_errno;
1512 
1513 	mq->mempool = mb_pool;
1514 	mq->in_port = dev->data->port_id;
1515 	dev->data->rx_queues[qid] = mq;
1516 
1517 	return 0;
1518 }
1519 
1520 static void
1521 memif_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
1522 {
1523 	struct memif_queue *mq = dev->data->rx_queues[qid];
1524 
1525 	if (!mq)
1526 		return;
1527 
1528 	rte_intr_instance_free(mq->intr_handle);
1529 	rte_free(mq);
1530 }
1531 
1532 static void
1533 memif_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
1534 {
1535 	struct memif_queue *mq = dev->data->tx_queues[qid];
1536 
1537 	if (!mq)
1538 		return;
1539 
1540 	rte_free(mq);
1541 }
1542 
1543 static int
1544 memif_link_update(struct rte_eth_dev *dev,
1545 		  int wait_to_complete __rte_unused)
1546 {
1547 	struct pmd_process_private *proc_private;
1548 
1549 	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
1550 		proc_private = dev->process_private;
1551 		if (dev->data->dev_link.link_status == RTE_ETH_LINK_UP &&
1552 				proc_private->regions_num == 0) {
1553 			memif_mp_request_regions(dev);
1554 		} else if (dev->data->dev_link.link_status == RTE_ETH_LINK_DOWN &&
1555 				proc_private->regions_num > 0) {
1556 			memif_free_regions(dev);
1557 		}
1558 	}
1559 	return 0;
1560 }
1561 
1562 static int
1563 memif_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
1564 {
1565 	struct pmd_internals *pmd = dev->data->dev_private;
1566 	struct memif_queue *mq;
1567 	int i;
1568 	uint8_t tmp, nq;
1569 
1570 	stats->ipackets = 0;
1571 	stats->ibytes = 0;
1572 	stats->opackets = 0;
1573 	stats->obytes = 0;
1574 
1575 	tmp = (pmd->role == MEMIF_ROLE_CLIENT) ? pmd->run.num_s2c_rings :
1576 	    pmd->run.num_c2s_rings;
1577 	nq = (tmp < RTE_ETHDEV_QUEUE_STAT_CNTRS) ? tmp :
1578 	    RTE_ETHDEV_QUEUE_STAT_CNTRS;
1579 
1580 	/* RX stats */
1581 	for (i = 0; i < nq; i++) {
1582 		mq = dev->data->rx_queues[i];
1583 		stats->q_ipackets[i] = mq->n_pkts;
1584 		stats->q_ibytes[i] = mq->n_bytes;
1585 		stats->ipackets += mq->n_pkts;
1586 		stats->ibytes += mq->n_bytes;
1587 	}
1588 
1589 	tmp = (pmd->role == MEMIF_ROLE_CLIENT) ? pmd->run.num_c2s_rings :
1590 	    pmd->run.num_s2c_rings;
1591 	nq = (tmp < RTE_ETHDEV_QUEUE_STAT_CNTRS) ? tmp :
1592 	    RTE_ETHDEV_QUEUE_STAT_CNTRS;
1593 
1594 	/* TX stats */
1595 	for (i = 0; i < nq; i++) {
1596 		mq = dev->data->tx_queues[i];
1597 		stats->q_opackets[i] = mq->n_pkts;
1598 		stats->q_obytes[i] = mq->n_bytes;
1599 		stats->opackets += mq->n_pkts;
1600 		stats->obytes += mq->n_bytes;
1601 	}
1602 	return 0;
1603 }
1604 
1605 static int
1606 memif_stats_reset(struct rte_eth_dev *dev)
1607 {
1608 	struct pmd_internals *pmd = dev->data->dev_private;
1609 	int i;
1610 	struct memif_queue *mq;
1611 
1612 	for (i = 0; i < pmd->run.num_c2s_rings; i++) {
1613 		mq = (pmd->role == MEMIF_ROLE_CLIENT) ? dev->data->tx_queues[i] :
1614 		    dev->data->rx_queues[i];
1615 		mq->n_pkts = 0;
1616 		mq->n_bytes = 0;
1617 	}
1618 	for (i = 0; i < pmd->run.num_s2c_rings; i++) {
1619 		mq = (pmd->role == MEMIF_ROLE_CLIENT) ? dev->data->rx_queues[i] :
1620 		    dev->data->tx_queues[i];
1621 		mq->n_pkts = 0;
1622 		mq->n_bytes = 0;
1623 	}
1624 
1625 	return 0;
1626 }
1627 
1628 static const struct eth_dev_ops ops = {
1629 	.dev_start = memif_dev_start,
1630 	.dev_stop = memif_dev_stop,
1631 	.dev_close = memif_dev_close,
1632 	.dev_infos_get = memif_dev_info,
1633 	.dev_configure = memif_dev_configure,
1634 	.tx_queue_setup = memif_tx_queue_setup,
1635 	.rx_queue_setup = memif_rx_queue_setup,
1636 	.rx_queue_release = memif_rx_queue_release,
1637 	.tx_queue_release = memif_tx_queue_release,
1638 	.link_update = memif_link_update,
1639 	.stats_get = memif_stats_get,
1640 	.stats_reset = memif_stats_reset,
1641 };
1642 
1643 static int
1644 memif_create(struct rte_vdev_device *vdev, enum memif_role_t role,
1645 	     memif_interface_id_t id, uint32_t flags,
1646 	     const char *socket_filename, uid_t owner_uid, gid_t owner_gid,
1647 	     memif_log2_ring_size_t log2_ring_size,
1648 	     uint16_t pkt_buffer_size, const char *secret,
1649 	     struct rte_ether_addr *ether_addr)
1650 {
1651 	int ret = 0;
1652 	struct rte_eth_dev *eth_dev;
1653 	struct rte_eth_dev_data *data;
1654 	struct pmd_internals *pmd;
1655 	struct pmd_process_private *process_private;
1656 	const unsigned int numa_node = vdev->device.numa_node;
1657 	const char *name = rte_vdev_device_name(vdev);
1658 
1659 	eth_dev = rte_eth_vdev_allocate(vdev, sizeof(*pmd));
1660 	if (eth_dev == NULL) {
1661 		MIF_LOG(ERR, "%s: Unable to allocate device struct.", name);
1662 		return -1;
1663 	}
1664 
1665 	process_private = (struct pmd_process_private *)
1666 		rte_zmalloc(name, sizeof(struct pmd_process_private),
1667 			    RTE_CACHE_LINE_SIZE);
1668 
1669 	if (process_private == NULL) {
1670 		MIF_LOG(ERR, "Failed to alloc memory for process private");
1671 		return -1;
1672 	}
1673 	eth_dev->process_private = process_private;
1674 
1675 	pmd = eth_dev->data->dev_private;
1676 	memset(pmd, 0, sizeof(*pmd));
1677 
1678 	pmd->id = id;
1679 	pmd->flags = flags;
1680 	pmd->flags |= ETH_MEMIF_FLAG_DISABLED;
1681 	pmd->role = role;
1682 	/* Zero-copy flag irelevant to server. */
1683 	if (pmd->role == MEMIF_ROLE_SERVER)
1684 		pmd->flags &= ~ETH_MEMIF_FLAG_ZERO_COPY;
1685 	pmd->owner_uid = owner_uid;
1686 	pmd->owner_gid = owner_gid;
1687 
1688 	ret = memif_socket_init(eth_dev, socket_filename);
1689 	if (ret < 0)
1690 		return ret;
1691 
1692 	memset(pmd->secret, 0, sizeof(char) * ETH_MEMIF_SECRET_SIZE);
1693 	if (secret != NULL)
1694 		strlcpy(pmd->secret, secret, sizeof(pmd->secret));
1695 
1696 	pmd->cfg.log2_ring_size = log2_ring_size;
1697 	/* set in .dev_configure() */
1698 	pmd->cfg.num_c2s_rings = 0;
1699 	pmd->cfg.num_s2c_rings = 0;
1700 
1701 	pmd->cfg.pkt_buffer_size = pkt_buffer_size;
1702 	rte_spinlock_init(&pmd->cc_lock);
1703 
1704 	data = eth_dev->data;
1705 	data->dev_private = pmd;
1706 	data->numa_node = numa_node;
1707 	data->dev_link = pmd_link;
1708 	data->mac_addrs = ether_addr;
1709 	data->promiscuous = 1;
1710 	data->dev_flags |= RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
1711 
1712 	eth_dev->dev_ops = &ops;
1713 	eth_dev->device = &vdev->device;
1714 	if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) {
1715 		eth_dev->rx_pkt_burst = eth_memif_rx_zc;
1716 		eth_dev->tx_pkt_burst = eth_memif_tx_zc;
1717 	} else {
1718 		eth_dev->rx_pkt_burst = eth_memif_rx;
1719 		eth_dev->tx_pkt_burst = eth_memif_tx;
1720 	}
1721 
1722 	rte_eth_dev_probing_finish(eth_dev);
1723 
1724 	return 0;
1725 }
1726 
1727 static int
1728 memif_set_role(const char *key __rte_unused, const char *value,
1729 	       void *extra_args)
1730 {
1731 	enum memif_role_t *role = (enum memif_role_t *)extra_args;
1732 
1733 	if (strstr(value, "server") != NULL) {
1734 		*role = MEMIF_ROLE_SERVER;
1735 	} else if (strstr(value, "client") != NULL) {
1736 		*role = MEMIF_ROLE_CLIENT;
1737 	} else if (strstr(value, "master") != NULL) {
1738 		MIF_LOG(NOTICE, "Role argument \"master\" is deprecated, use \"server\"");
1739 		*role = MEMIF_ROLE_SERVER;
1740 	} else if (strstr(value, "slave") != NULL) {
1741 		MIF_LOG(NOTICE, "Role argument \"slave\" is deprecated, use \"client\"");
1742 		*role = MEMIF_ROLE_CLIENT;
1743 	} else {
1744 		MIF_LOG(ERR, "Unknown role: %s.", value);
1745 		return -EINVAL;
1746 	}
1747 	return 0;
1748 }
1749 
1750 static int
1751 memif_set_zc(const char *key __rte_unused, const char *value, void *extra_args)
1752 {
1753 	uint32_t *flags = (uint32_t *)extra_args;
1754 
1755 	if (strstr(value, "yes") != NULL) {
1756 		if (!rte_mcfg_get_single_file_segments()) {
1757 			MIF_LOG(ERR, "Zero-copy doesn't support multi-file segments.");
1758 			return -ENOTSUP;
1759 		}
1760 		*flags |= ETH_MEMIF_FLAG_ZERO_COPY;
1761 	} else if (strstr(value, "no") != NULL) {
1762 		*flags &= ~ETH_MEMIF_FLAG_ZERO_COPY;
1763 	} else {
1764 		MIF_LOG(ERR, "Failed to parse zero-copy param: %s.", value);
1765 		return -EINVAL;
1766 	}
1767 	return 0;
1768 }
1769 
1770 static int
1771 memif_set_id(const char *key __rte_unused, const char *value, void *extra_args)
1772 {
1773 	memif_interface_id_t *id = (memif_interface_id_t *)extra_args;
1774 
1775 	/* even if parsing fails, 0 is a valid id */
1776 	*id = strtoul(value, NULL, 10);
1777 	return 0;
1778 }
1779 
1780 static int
1781 memif_set_bs(const char *key __rte_unused, const char *value, void *extra_args)
1782 {
1783 	unsigned long tmp;
1784 	uint16_t *pkt_buffer_size = (uint16_t *)extra_args;
1785 
1786 	tmp = strtoul(value, NULL, 10);
1787 	if (tmp == 0 || tmp > 0xFFFF) {
1788 		MIF_LOG(ERR, "Invalid buffer size: %s.", value);
1789 		return -EINVAL;
1790 	}
1791 	*pkt_buffer_size = tmp;
1792 	return 0;
1793 }
1794 
1795 static int
1796 memif_set_rs(const char *key __rte_unused, const char *value, void *extra_args)
1797 {
1798 	unsigned long tmp;
1799 	memif_log2_ring_size_t *log2_ring_size =
1800 	    (memif_log2_ring_size_t *)extra_args;
1801 
1802 	tmp = strtoul(value, NULL, 10);
1803 	if (tmp == 0 || tmp > ETH_MEMIF_MAX_LOG2_RING_SIZE) {
1804 		MIF_LOG(ERR, "Invalid ring size: %s (max %u).",
1805 			value, ETH_MEMIF_MAX_LOG2_RING_SIZE);
1806 		return -EINVAL;
1807 	}
1808 	*log2_ring_size = tmp;
1809 	return 0;
1810 }
1811 
1812 /* check if directory exists and if we have permission to read/write */
1813 static int
1814 memif_check_socket_filename(const char *filename)
1815 {
1816 	char *dir = NULL, *tmp;
1817 	uint32_t idx;
1818 	int ret = 0;
1819 
1820 	if (strlen(filename) >= MEMIF_SOCKET_UN_SIZE) {
1821 		MIF_LOG(ERR, "Unix socket address too long (max 108).");
1822 		return -1;
1823 	}
1824 
1825 	tmp = strrchr(filename, '/');
1826 	if (tmp != NULL) {
1827 		idx = tmp - filename;
1828 		dir = rte_zmalloc("memif_tmp", sizeof(char) * (idx + 1), 0);
1829 		if (dir == NULL) {
1830 			MIF_LOG(ERR, "Failed to allocate memory.");
1831 			return -1;
1832 		}
1833 		strlcpy(dir, filename, sizeof(char) * (idx + 1));
1834 	}
1835 
1836 	if (dir == NULL || (faccessat(-1, dir, F_OK | R_OK |
1837 					W_OK, AT_EACCESS) < 0)) {
1838 		MIF_LOG(ERR, "Invalid socket directory.");
1839 		ret = -EINVAL;
1840 	}
1841 
1842 	rte_free(dir);
1843 
1844 	return ret;
1845 }
1846 
1847 static int
1848 memif_set_socket_filename(const char *key __rte_unused, const char *value,
1849 			  void *extra_args)
1850 {
1851 	const char **socket_filename = (const char **)extra_args;
1852 
1853 	*socket_filename = value;
1854 	return 0;
1855 }
1856 
1857 static int
1858 memif_set_is_socket_abstract(const char *key __rte_unused, const char *value, void *extra_args)
1859 {
1860 	uint32_t *flags = (uint32_t *)extra_args;
1861 
1862 	if (strstr(value, "yes") != NULL) {
1863 		*flags |= ETH_MEMIF_FLAG_SOCKET_ABSTRACT;
1864 	} else if (strstr(value, "no") != NULL) {
1865 		*flags &= ~ETH_MEMIF_FLAG_SOCKET_ABSTRACT;
1866 	} else {
1867 		MIF_LOG(ERR, "Failed to parse socket-abstract param: %s.", value);
1868 		return -EINVAL;
1869 	}
1870 	return 0;
1871 }
1872 
1873 static int
1874 memif_set_owner(const char *key, const char *value, void *extra_args)
1875 {
1876 	RTE_ASSERT(sizeof(uid_t) == sizeof(uint32_t));
1877 	RTE_ASSERT(sizeof(gid_t) == sizeof(uint32_t));
1878 
1879 	unsigned long val;
1880 	char *end = NULL;
1881 	uint32_t *id = (uint32_t *)extra_args;
1882 
1883 	val = strtoul(value, &end, 10);
1884 	if (*value == '\0' || *end != '\0') {
1885 		MIF_LOG(ERR, "Failed to parse %s: %s.", key, value);
1886 		return -EINVAL;
1887 	}
1888 	if (val >= UINT32_MAX) {
1889 		MIF_LOG(ERR, "Invalid %s: %s.", key, value);
1890 		return -ERANGE;
1891 	}
1892 
1893 	*id = val;
1894 	return 0;
1895 }
1896 
1897 static int
1898 memif_set_mac(const char *key __rte_unused, const char *value, void *extra_args)
1899 {
1900 	struct rte_ether_addr *ether_addr = (struct rte_ether_addr *)extra_args;
1901 
1902 	if (rte_ether_unformat_addr(value, ether_addr) < 0)
1903 		MIF_LOG(WARNING, "Failed to parse mac '%s'.", value);
1904 	return 0;
1905 }
1906 
1907 static int
1908 memif_set_secret(const char *key __rte_unused, const char *value, void *extra_args)
1909 {
1910 	const char **secret = (const char **)extra_args;
1911 
1912 	*secret = value;
1913 	return 0;
1914 }
1915 
1916 static int
1917 rte_pmd_memif_probe(struct rte_vdev_device *vdev)
1918 {
1919 	RTE_BUILD_BUG_ON(sizeof(memif_msg_t) != 128);
1920 	RTE_BUILD_BUG_ON(sizeof(memif_desc_t) != 16);
1921 	int ret = 0;
1922 	struct rte_kvargs *kvlist;
1923 	const char *name = rte_vdev_device_name(vdev);
1924 	enum memif_role_t role = MEMIF_ROLE_CLIENT;
1925 	memif_interface_id_t id = 0;
1926 	uint16_t pkt_buffer_size = ETH_MEMIF_DEFAULT_PKT_BUFFER_SIZE;
1927 	memif_log2_ring_size_t log2_ring_size = ETH_MEMIF_DEFAULT_RING_SIZE;
1928 	const char *socket_filename = ETH_MEMIF_DEFAULT_SOCKET_FILENAME;
1929 	uid_t owner_uid = -1;
1930 	gid_t owner_gid = -1;
1931 	uint32_t flags = 0;
1932 	const char *secret = NULL;
1933 	struct rte_ether_addr *ether_addr = rte_zmalloc("",
1934 		sizeof(struct rte_ether_addr), 0);
1935 	struct rte_eth_dev *eth_dev;
1936 
1937 	rte_eth_random_addr(ether_addr->addr_bytes);
1938 
1939 	MIF_LOG(INFO, "Initialize MEMIF: %s.", name);
1940 
1941 	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
1942 		eth_dev = rte_eth_dev_attach_secondary(name);
1943 		if (!eth_dev) {
1944 			MIF_LOG(ERR, "Failed to probe %s", name);
1945 			return -1;
1946 		}
1947 
1948 		eth_dev->dev_ops = &ops;
1949 		eth_dev->device = &vdev->device;
1950 		eth_dev->rx_pkt_burst = eth_memif_rx;
1951 		eth_dev->tx_pkt_burst = eth_memif_tx;
1952 
1953 		if (!rte_eal_primary_proc_alive(NULL)) {
1954 			MIF_LOG(ERR, "Primary process is missing");
1955 			return -1;
1956 		}
1957 
1958 		eth_dev->process_private = (struct pmd_process_private *)
1959 			rte_zmalloc(name,
1960 				sizeof(struct pmd_process_private),
1961 				RTE_CACHE_LINE_SIZE);
1962 		if (eth_dev->process_private == NULL) {
1963 			MIF_LOG(ERR,
1964 				"Failed to alloc memory for process private");
1965 			return -1;
1966 		}
1967 
1968 		rte_eth_dev_probing_finish(eth_dev);
1969 
1970 		return 0;
1971 	}
1972 
1973 	ret = rte_mp_action_register(MEMIF_MP_SEND_REGION, memif_mp_send_region);
1974 	/*
1975 	 * Primary process can continue probing, but secondary process won't
1976 	 * be able to get memory regions information
1977 	 */
1978 	if (ret < 0 && rte_errno != EEXIST)
1979 		MIF_LOG(WARNING, "Failed to register mp action callback: %s",
1980 			strerror(rte_errno));
1981 
1982 	/* use abstract address by default */
1983 	flags |= ETH_MEMIF_FLAG_SOCKET_ABSTRACT;
1984 
1985 	kvlist = rte_kvargs_parse(rte_vdev_device_args(vdev), valid_arguments);
1986 
1987 	/* parse parameters */
1988 	if (kvlist != NULL) {
1989 		ret = rte_kvargs_process(kvlist, ETH_MEMIF_ROLE_ARG,
1990 					 &memif_set_role, &role);
1991 		if (ret < 0)
1992 			goto exit;
1993 		ret = rte_kvargs_process(kvlist, ETH_MEMIF_ID_ARG,
1994 					 &memif_set_id, &id);
1995 		if (ret < 0)
1996 			goto exit;
1997 		ret = rte_kvargs_process(kvlist, ETH_MEMIF_PKT_BUFFER_SIZE_ARG,
1998 					 &memif_set_bs, &pkt_buffer_size);
1999 		if (ret < 0)
2000 			goto exit;
2001 		ret = rte_kvargs_process(kvlist, ETH_MEMIF_RING_SIZE_ARG,
2002 					 &memif_set_rs, &log2_ring_size);
2003 		if (ret < 0)
2004 			goto exit;
2005 		ret = rte_kvargs_process(kvlist, ETH_MEMIF_SOCKET_ARG,
2006 					 &memif_set_socket_filename,
2007 					 (void *)(&socket_filename));
2008 		if (ret < 0)
2009 			goto exit;
2010 		ret = rte_kvargs_process(kvlist, ETH_MEMIF_SOCKET_ABSTRACT_ARG,
2011 					 &memif_set_is_socket_abstract, &flags);
2012 		if (ret < 0)
2013 			goto exit;
2014 		ret = rte_kvargs_process(kvlist, ETH_MEMIF_OWNER_UID_ARG,
2015 					 &memif_set_owner, &owner_uid);
2016 		if (ret < 0)
2017 			goto exit;
2018 		ret = rte_kvargs_process(kvlist, ETH_MEMIF_OWNER_GID_ARG,
2019 					 &memif_set_owner, &owner_gid);
2020 		if (ret < 0)
2021 			goto exit;
2022 		ret = rte_kvargs_process(kvlist, ETH_MEMIF_MAC_ARG,
2023 					 &memif_set_mac, ether_addr);
2024 		if (ret < 0)
2025 			goto exit;
2026 		ret = rte_kvargs_process(kvlist, ETH_MEMIF_ZC_ARG,
2027 					 &memif_set_zc, &flags);
2028 		if (ret < 0)
2029 			goto exit;
2030 		ret = rte_kvargs_process(kvlist, ETH_MEMIF_SECRET_ARG,
2031 					 &memif_set_secret, (void *)(&secret));
2032 		if (ret < 0)
2033 			goto exit;
2034 	}
2035 
2036 	if (!(flags & ETH_MEMIF_FLAG_SOCKET_ABSTRACT)) {
2037 		ret = memif_check_socket_filename(socket_filename);
2038 		if (ret < 0)
2039 			goto exit;
2040 	}
2041 
2042 	/* create interface */
2043 	ret = memif_create(vdev, role, id, flags, socket_filename, owner_uid, owner_gid,
2044 			   log2_ring_size, pkt_buffer_size, secret, ether_addr);
2045 
2046 exit:
2047 	rte_kvargs_free(kvlist);
2048 	return ret;
2049 }
2050 
2051 static int
2052 rte_pmd_memif_remove(struct rte_vdev_device *vdev)
2053 {
2054 	struct rte_eth_dev *eth_dev;
2055 
2056 	eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(vdev));
2057 	if (eth_dev == NULL)
2058 		return 0;
2059 
2060 	return rte_eth_dev_close(eth_dev->data->port_id);
2061 }
2062 
2063 static struct rte_vdev_driver pmd_memif_drv = {
2064 	.probe = rte_pmd_memif_probe,
2065 	.remove = rte_pmd_memif_remove,
2066 };
2067 
2068 RTE_PMD_REGISTER_VDEV(net_memif, pmd_memif_drv);
2069 
2070 RTE_PMD_REGISTER_PARAM_STRING(net_memif,
2071 			      ETH_MEMIF_ID_ARG "=<int>"
2072 			      ETH_MEMIF_ROLE_ARG "=server|client"
2073 			      ETH_MEMIF_PKT_BUFFER_SIZE_ARG "=<int>"
2074 			      ETH_MEMIF_RING_SIZE_ARG "=<int>"
2075 			      ETH_MEMIF_SOCKET_ARG "=<string>"
2076 			      ETH_MEMIF_SOCKET_ABSTRACT_ARG "=yes|no"
2077 			      ETH_MEMIF_OWNER_UID_ARG "=<int>"
2078 			      ETH_MEMIF_OWNER_GID_ARG "=<int>"
2079 			      ETH_MEMIF_MAC_ARG "=xx:xx:xx:xx:xx:xx"
2080 			      ETH_MEMIF_ZC_ARG "=yes|no"
2081 			      ETH_MEMIF_SECRET_ARG "=<string>");
2082 
2083 RTE_LOG_REGISTER_DEFAULT(memif_logtype, NOTICE);
2084