xref: /dpdk/lib/pcapng/rte_pcapng.c (revision 62774b78a84e9fa5df56d04cffed69bef8c901f1)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2019 Microsoft Corporation
3  */
4 
5 #include <errno.h>
6 #include <stdbool.h>
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h>
10 #include <time.h>
11 #include <unistd.h>
12 
13 #ifndef RTE_EXEC_ENV_WINDOWS
14 #include <net/if.h>
15 #include <sys/uio.h>
16 #endif
17 
18 #include <bus_driver.h>
19 #include <rte_common.h>
20 #include <rte_cycles.h>
21 #include <dev_driver.h>
22 #include <rte_errno.h>
23 #include <rte_ethdev.h>
24 #include <rte_ether.h>
25 #include <rte_mbuf.h>
26 #include <rte_os_shim.h>
27 #include <rte_pcapng.h>
28 #include <rte_reciprocal.h>
29 #include <rte_time.h>
30 
31 #include "pcapng_proto.h"
32 
/*
 * Conversion from DPDK speed to PCAPNG.
 * DPDK reports link speed in Mbps; rte_pcapng_add_interface() multiplies
 * by this factor before storing the value in the interface speed option.
 */
#define PCAPNG_MBPS_SPEED 1000000ull
35 
/*
 * Format of the capture file handle.
 * Allocated by rte_pcapng_fdopen() and released by rte_pcapng_close().
 */
struct rte_pcapng {
	int  outfd;		/* output file */

	unsigned int ports;	/* number of interfaces added */

	/*
	 * DPDK port id to interface index in file.
	 * Every entry starts as UINT32_MAX and is assigned when
	 * rte_pcapng_add_interface() is called for that port.
	 */
	uint32_t port_index[RTE_MAX_ETHPORTS];
};
45 
/*
 * For converting TSC cycles to PCAPNG ns format.
 * Holds a reference point (a TSC value and the wall-clock time sampled
 * alongside it) that pcapng_tsc_to_ns() advances in whole seconds.
 */
static struct pcapng_time {
	uint64_t ns;		/* CLOCK_REALTIME in ns at the reference point */
	uint64_t cycles;	/* TSC value at the reference point */
	uint64_t tsc_hz;	/* TSC frequency; zero until pcapng_init() runs */
	struct rte_reciprocal_u64 tsc_hz_inverse; /* reciprocal used to divide by tsc_hz */
} pcapng_time;
53 
54 
55 #ifdef RTE_EXEC_ENV_WINDOWS
56 /*
57  * Windows does not have writev() call.
58  * Emulate this by copying to a new buffer.
59  * The copy is necessary since pcapng needs to be thread-safe
60  * and do atomic write operations.
61  */
62 
/* Sizes the iovec array used by rte_pcapng_write_packets() */
#define IOV_MAX 128
/* Local definition of the scatter/gather descriptor consumed by writev() */
struct iovec {
	void   *iov_base;	/* start of the buffer */
	size_t  iov_len;	/* number of bytes at iov_base */
};
68 
69 static ssize_t writev(int fd, const struct iovec *iov, int iovcnt)
70 {
71 	size_t bytes = 0;
72 	uint8_t *ptr;
73 	void *tmp_buf;
74 	ssize_t ret;
75 	int i;
76 
77 	for (i = 0; i < iovcnt; i++)
78 		bytes += iov[i].iov_len;
79 
80 	if (unlikely(bytes == 0))
81 		return 0;
82 
83 	tmp_buf = malloc(bytes);
84 	if (unlikely(tmp_buf == NULL)) {
85 		errno = ENOMEM;
86 		return -1;
87 	}
88 
89 	ptr = tmp_buf;
90 	for (i = 0; i < iovcnt; i++) {
91 		rte_memcpy(ptr, iov[i].iov_base, iov[i].iov_len);
92 		ptr += iov[i].iov_len;
93 	}
94 
95 	ret = write(fd, tmp_buf, bytes);
96 	free(tmp_buf);
97 	return ret;
98 }
99 
/* Size of the buffer that holds an interface name */
#define IF_NAMESIZE	16
/*
 * Compatibility wrapper because name is optional: always evaluates to
 * NULL, so rte_pcapng_add_interface() falls back to its generated
 * "dpdk:<port>" name.
 */
#define if_indextoname(ifindex, ifname) NULL
103 #endif
104 
105 static inline void
106 pcapng_init(void)
107 {
108 	struct timespec ts;
109 
110 	pcapng_time.cycles = rte_get_tsc_cycles();
111 	clock_gettime(CLOCK_REALTIME, &ts);
112 	pcapng_time.cycles = (pcapng_time.cycles + rte_get_tsc_cycles()) / 2;
113 	pcapng_time.ns = rte_timespec_to_ns(&ts);
114 
115 	pcapng_time.tsc_hz = rte_get_tsc_hz();
116 	pcapng_time.tsc_hz_inverse = rte_reciprocal_value_u64(pcapng_time.tsc_hz);
117 }
118 
119 /* PCAPNG timestamps are in nanoseconds */
120 static uint64_t pcapng_tsc_to_ns(uint64_t cycles)
121 {
122 	uint64_t delta, secs;
123 
124 	if (!pcapng_time.tsc_hz)
125 		pcapng_init();
126 
127 	/* In essence the calculation is:
128 	 *   delta = (cycles - pcapng_time.cycles) * NSEC_PRE_SEC / rte_get_tsc_hz()
129 	 * but this overflows within 4 to 8 seconds depending on TSC frequency.
130 	 * Instead, if delta >= pcapng_time.tsc_hz:
131 	 *   Increase pcapng_time.ns and pcapng_time.cycles by the number of
132 	 *   whole seconds in delta and reduce delta accordingly.
133 	 * delta will therefore always lie in the interval [0, pcapng_time.tsc_hz),
134 	 * which will not overflow when multiplied by NSEC_PER_SEC provided the
135 	 * TSC frequency < approx 18.4GHz.
136 	 *
137 	 * Currently all TSCs operate below 5GHz.
138 	 */
139 	delta = cycles - pcapng_time.cycles;
140 	if (unlikely(delta >= pcapng_time.tsc_hz)) {
141 		if (likely(delta < pcapng_time.tsc_hz * 2)) {
142 			delta -= pcapng_time.tsc_hz;
143 			pcapng_time.cycles += pcapng_time.tsc_hz;
144 			pcapng_time.ns += NSEC_PER_SEC;
145 		} else {
146 			secs = rte_reciprocal_divide_u64(delta, &pcapng_time.tsc_hz_inverse);
147 			delta -= secs * pcapng_time.tsc_hz;
148 			pcapng_time.cycles += secs * pcapng_time.tsc_hz;
149 			pcapng_time.ns += secs * NSEC_PER_SEC;
150 		}
151 	}
152 
153 	return pcapng_time.ns + rte_reciprocal_divide_u64(delta * NSEC_PER_SEC,
154 							  &pcapng_time.tsc_hz_inverse);
155 }
156 
157 /* length of option including padding */
158 static uint16_t pcapng_optlen(uint16_t len)
159 {
160 	return RTE_ALIGN(sizeof(struct pcapng_option) + len,
161 			 sizeof(uint32_t));
162 }
163 
164 /* build TLV option and return location of next */
165 static struct pcapng_option *
166 pcapng_add_option(struct pcapng_option *popt, uint16_t code,
167 		  const void *data, uint16_t len)
168 {
169 	popt->code = code;
170 	popt->length = len;
171 	memcpy(popt->data, data, len);
172 
173 	return (struct pcapng_option *)((uint8_t *)popt + pcapng_optlen(len));
174 }
175 
176 /*
177  * Write required initial section header describing the capture
178  */
179 static int
180 pcapng_section_block(rte_pcapng_t *self,
181 		    const char *os, const char *hw,
182 		    const char *app, const char *comment)
183 {
184 	struct pcapng_section_header *hdr;
185 	struct pcapng_option *opt;
186 	void *buf;
187 	uint32_t len;
188 	ssize_t cc;
189 
190 	len = sizeof(*hdr);
191 	if (hw)
192 		len += pcapng_optlen(strlen(hw));
193 	if (os)
194 		len += pcapng_optlen(strlen(os));
195 	if (app)
196 		len += pcapng_optlen(strlen(app));
197 	if (comment)
198 		len += pcapng_optlen(strlen(comment));
199 
200 	/* reserve space for OPT_END */
201 	len += pcapng_optlen(0);
202 	len += sizeof(uint32_t);
203 
204 	buf = calloc(1, len);
205 	if (!buf)
206 		return -1;
207 
208 	hdr = (struct pcapng_section_header *)buf;
209 	*hdr = (struct pcapng_section_header) {
210 		.block_type = PCAPNG_SECTION_BLOCK,
211 		.block_length = len,
212 		.byte_order_magic = PCAPNG_BYTE_ORDER_MAGIC,
213 		.major_version = PCAPNG_MAJOR_VERS,
214 		.minor_version = PCAPNG_MINOR_VERS,
215 		.section_length = UINT64_MAX,
216 	};
217 
218 	/* After the section header insert variable length options. */
219 	opt = (struct pcapng_option *)(hdr + 1);
220 	if (comment)
221 		opt = pcapng_add_option(opt, PCAPNG_OPT_COMMENT,
222 					comment, strlen(comment));
223 	if (hw)
224 		opt = pcapng_add_option(opt, PCAPNG_SHB_HARDWARE,
225 					hw, strlen(hw));
226 	if (os)
227 		opt = pcapng_add_option(opt, PCAPNG_SHB_OS,
228 					os, strlen(os));
229 	if (app)
230 		opt = pcapng_add_option(opt, PCAPNG_SHB_USERAPPL,
231 					app, strlen(app));
232 
233 	/* The standard requires last option to be OPT_END */
234 	opt = pcapng_add_option(opt, PCAPNG_OPT_END, NULL, 0);
235 
236 	/* clone block_length after option */
237 	memcpy(opt, &hdr->block_length, sizeof(uint32_t));
238 
239 	cc = write(self->outfd, buf, len);
240 	free(buf);
241 
242 	return cc;
243 }
244 
245 /* Write an interface block for a DPDK port */
246 int
247 rte_pcapng_add_interface(rte_pcapng_t *self, uint16_t port,
248 			 const char *ifname, const char *ifdescr,
249 			 const char *filter)
250 {
251 	struct pcapng_interface_block *hdr;
252 	struct rte_eth_dev_info dev_info;
253 	struct rte_ether_addr *ea, macaddr;
254 	const struct rte_device *dev;
255 	struct rte_eth_link link;
256 	struct pcapng_option *opt;
257 	const uint8_t tsresol = 9;	/* nanosecond resolution */
258 	uint32_t len;
259 	void *buf;
260 	char ifname_buf[IF_NAMESIZE];
261 	char ifhw[256];
262 	uint64_t speed = 0;
263 
264 	if (rte_eth_dev_info_get(port, &dev_info) < 0)
265 		return -1;
266 
267 	/* make something like an interface name */
268 	if (ifname == NULL) {
269 		/* Use kernel name if available */
270 		ifname = if_indextoname(dev_info.if_index, ifname_buf);
271 		if (ifname == NULL) {
272 			snprintf(ifname_buf, IF_NAMESIZE, "dpdk:%u", port);
273 			ifname = ifname_buf;
274 		}
275 	}
276 
277 	/* make a useful device hardware string */
278 	dev = dev_info.device;
279 	if (dev)
280 		snprintf(ifhw, sizeof(ifhw),
281 			 "%s-%s", dev->bus->name, dev->name);
282 
283 	/* DPDK reports in units of Mbps */
284 	if (rte_eth_link_get(port, &link) == 0 &&
285 	    link.link_status == RTE_ETH_LINK_UP)
286 		speed = link.link_speed * PCAPNG_MBPS_SPEED;
287 
288 	if (rte_eth_macaddr_get(port, &macaddr) < 0)
289 		ea = NULL;
290 	else
291 		ea = &macaddr;
292 
293 	/* Compute length of interface block options */
294 	len = sizeof(*hdr);
295 
296 	len += pcapng_optlen(sizeof(tsresol));	/* timestamp */
297 	len += pcapng_optlen(strlen(ifname));	/* ifname */
298 
299 	if (ifdescr)
300 		len += pcapng_optlen(strlen(ifdescr));
301 	if (ea)
302 		len += pcapng_optlen(RTE_ETHER_ADDR_LEN); /* macaddr */
303 	if (speed != 0)
304 		len += pcapng_optlen(sizeof(uint64_t));
305 	if (filter)
306 		len += pcapng_optlen(strlen(filter) + 1);
307 	if (dev)
308 		len += pcapng_optlen(strlen(ifhw));
309 
310 	len += pcapng_optlen(0);
311 	len += sizeof(uint32_t);
312 
313 	buf = alloca(len);
314 	if (!buf)
315 		return -1;
316 
317 	hdr = (struct pcapng_interface_block *)buf;
318 	*hdr = (struct pcapng_interface_block) {
319 		.block_type = PCAPNG_INTERFACE_BLOCK,
320 		.link_type = 1,		/* DLT_EN10MB - Ethernet */
321 		.block_length = len,
322 	};
323 
324 	opt = (struct pcapng_option *)(hdr + 1);
325 	opt = pcapng_add_option(opt, PCAPNG_IFB_TSRESOL,
326 				&tsresol, sizeof(tsresol));
327 	opt = pcapng_add_option(opt, PCAPNG_IFB_NAME,
328 				ifname, strlen(ifname));
329 	if (ifdescr)
330 		opt = pcapng_add_option(opt, PCAPNG_IFB_DESCRIPTION,
331 					ifdescr, strlen(ifdescr));
332 	if (ea)
333 		opt = pcapng_add_option(opt, PCAPNG_IFB_MACADDR,
334 					ea, RTE_ETHER_ADDR_LEN);
335 	if (speed != 0)
336 		opt = pcapng_add_option(opt, PCAPNG_IFB_SPEED,
337 					 &speed, sizeof(uint64_t));
338 	if (dev)
339 		opt = pcapng_add_option(opt, PCAPNG_IFB_HARDWARE,
340 					 ifhw, strlen(ifhw));
341 	if (filter) {
342 		/* Encoding is that the first octet indicates string vs BPF */
343 		size_t len;
344 		char *buf;
345 
346 		len = strlen(filter) + 1;
347 		buf = alloca(len);
348 		*buf = '\0';
349 		memcpy(buf + 1, filter, len);
350 
351 		opt = pcapng_add_option(opt, PCAPNG_IFB_FILTER,
352 					buf, len);
353 	}
354 
355 	opt = pcapng_add_option(opt, PCAPNG_OPT_END, NULL, 0);
356 
357 	/* clone block_length after optionsa */
358 	memcpy(opt, &hdr->block_length, sizeof(uint32_t));
359 
360 	/* remember the file index */
361 	self->port_index[port] = self->ports++;
362 
363 	return write(self->outfd, buf, len);
364 }
365 
366 /*
367  * Write an Interface statistics block at the end of capture.
368  */
369 ssize_t
370 rte_pcapng_write_stats(rte_pcapng_t *self, uint16_t port_id,
371 		       const char *comment,
372 		       uint64_t start_time, uint64_t end_time,
373 		       uint64_t ifrecv, uint64_t ifdrop)
374 {
375 	struct pcapng_statistics *hdr;
376 	struct pcapng_option *opt;
377 	uint32_t optlen, len;
378 	uint8_t *buf;
379 	uint64_t ns;
380 
381 	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
382 
383 	optlen = 0;
384 
385 	if (ifrecv != UINT64_MAX)
386 		optlen += pcapng_optlen(sizeof(ifrecv));
387 	if (ifdrop != UINT64_MAX)
388 		optlen += pcapng_optlen(sizeof(ifdrop));
389 	if (start_time != 0)
390 		optlen += pcapng_optlen(sizeof(start_time));
391 	if (end_time != 0)
392 		optlen += pcapng_optlen(sizeof(end_time));
393 	if (comment)
394 		optlen += pcapng_optlen(strlen(comment));
395 	if (optlen != 0)
396 		optlen += pcapng_optlen(0);
397 
398 	len = sizeof(*hdr) + optlen + sizeof(uint32_t);
399 	buf = alloca(len);
400 	if (buf == NULL)
401 		return -1;
402 
403 	hdr = (struct pcapng_statistics *)buf;
404 	opt = (struct pcapng_option *)(hdr + 1);
405 
406 	if (comment)
407 		opt = pcapng_add_option(opt, PCAPNG_OPT_COMMENT,
408 					comment, strlen(comment));
409 	if (start_time != 0)
410 		opt = pcapng_add_option(opt, PCAPNG_ISB_STARTTIME,
411 					 &start_time, sizeof(start_time));
412 	if (end_time != 0)
413 		opt = pcapng_add_option(opt, PCAPNG_ISB_ENDTIME,
414 					 &end_time, sizeof(end_time));
415 	if (ifrecv != UINT64_MAX)
416 		opt = pcapng_add_option(opt, PCAPNG_ISB_IFRECV,
417 				&ifrecv, sizeof(ifrecv));
418 	if (ifdrop != UINT64_MAX)
419 		opt = pcapng_add_option(opt, PCAPNG_ISB_IFDROP,
420 				&ifdrop, sizeof(ifdrop));
421 	if (optlen != 0)
422 		opt = pcapng_add_option(opt, PCAPNG_OPT_END, NULL, 0);
423 
424 	hdr->block_type = PCAPNG_INTERFACE_STATS_BLOCK;
425 	hdr->block_length = len;
426 	hdr->interface_id = self->port_index[port_id];
427 
428 	ns = pcapng_tsc_to_ns(rte_get_tsc_cycles());
429 	hdr->timestamp_hi = ns >> 32;
430 	hdr->timestamp_lo = (uint32_t)ns;
431 
432 	/* clone block_length after option */
433 	memcpy(opt, &len, sizeof(uint32_t));
434 
435 	return write(self->outfd, buf, len);
436 }
437 
438 uint32_t
439 rte_pcapng_mbuf_size(uint32_t length)
440 {
441 	/* The VLAN and EPB header must fit in the mbuf headroom. */
442 	RTE_ASSERT(sizeof(struct pcapng_enhance_packet_block) +
443 		   sizeof(struct rte_vlan_hdr) <= RTE_PKTMBUF_HEADROOM);
444 
445 	/* The flags and queue information are added at the end. */
446 	return sizeof(struct rte_mbuf)
447 		+ RTE_ALIGN(length, sizeof(uint32_t))
448 		+ pcapng_optlen(sizeof(uint32_t)) /* flag option */
449 		+ pcapng_optlen(sizeof(uint32_t)) /* queue option */
450 		+ sizeof(uint32_t);		  /*  length */
451 }
452 
453 /* More generalized version rte_vlan_insert() */
454 static int
455 pcapng_vlan_insert(struct rte_mbuf *m, uint16_t ether_type, uint16_t tci)
456 {
457 	struct rte_ether_hdr *nh, *oh;
458 	struct rte_vlan_hdr *vh;
459 
460 	if (!RTE_MBUF_DIRECT(m) || rte_mbuf_refcnt_read(m) > 1)
461 		return -EINVAL;
462 
463 	if (rte_pktmbuf_data_len(m) < sizeof(*oh))
464 		return -EINVAL;
465 
466 	oh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
467 	nh = (struct rte_ether_hdr *)
468 		rte_pktmbuf_prepend(m, sizeof(struct rte_vlan_hdr));
469 	if (nh == NULL)
470 		return -ENOSPC;
471 
472 	memmove(nh, oh, 2 * RTE_ETHER_ADDR_LEN);
473 	nh->ether_type = rte_cpu_to_be_16(ether_type);
474 
475 	vh = (struct rte_vlan_hdr *) (nh + 1);
476 	vh->vlan_tci = rte_cpu_to_be_16(tci);
477 
478 	return 0;
479 }
480 
481 /*
482  *   The mbufs created use the Pcapng standard enhanced packet  block.
483  *
484  *                         1                   2                   3
485  *     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
486  *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
487  *  0 |                    Block Type = 0x00000006                    |
488  *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
489  *  4 |                      Block Total Length                       |
490  *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
491  *  8 |                         Interface ID                          |
492  *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
493  * 12 |                        Timestamp (High)                       |
494  *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
495  * 16 |                        Timestamp (Low)                        |
496  *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
497  * 20 |                    Captured Packet Length                     |
498  *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
499  * 24 |                    Original Packet Length                     |
500  *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
501  * 28 /                                                               /
502  *    /                          Packet Data                          /
503  *    /              variable length, padded to 32 bits               /
504  *    /                                                               /
505  *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
506  *    |      Option Code = 0x0002     |     Option Length = 0x004     |
507  *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
508  *    |              Flags (direction)                                |
509  *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *    |      Option Code = 0x0006     |     Option Length = 0x004     |
511  *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
512  *    |              Queue id                                         |
513  *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
514  *    |                      Block Total Length                       |
515  *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
516  */
517 
518 /* Make a copy of original mbuf with pcapng header and options */
519 struct rte_mbuf *
520 rte_pcapng_copy(uint16_t port_id, uint32_t queue,
521 		const struct rte_mbuf *md,
522 		struct rte_mempool *mp,
523 		uint32_t length, uint64_t cycles,
524 		enum rte_pcapng_direction direction,
525 		const char *comment)
526 {
527 	struct pcapng_enhance_packet_block *epb;
528 	uint32_t orig_len, data_len, padding, flags;
529 	struct pcapng_option *opt;
530 	uint16_t optlen;
531 	struct rte_mbuf *mc;
532 	uint64_t ns;
533 	bool rss_hash;
534 
535 #ifdef RTE_LIBRTE_ETHDEV_DEBUG
536 	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, NULL);
537 #endif
538 	ns = pcapng_tsc_to_ns(cycles);
539 
540 	orig_len = rte_pktmbuf_pkt_len(md);
541 
542 	/* Take snapshot of the data */
543 	mc = rte_pktmbuf_copy(md, mp, 0, length);
544 	if (unlikely(mc == NULL))
545 		return NULL;
546 
547 	/* Expand any offloaded VLAN information */
548 	if ((direction == RTE_PCAPNG_DIRECTION_IN &&
549 	     (md->ol_flags & RTE_MBUF_F_RX_VLAN_STRIPPED)) ||
550 	    (direction == RTE_PCAPNG_DIRECTION_OUT &&
551 	     (md->ol_flags & RTE_MBUF_F_TX_VLAN))) {
552 		if (pcapng_vlan_insert(mc, RTE_ETHER_TYPE_VLAN,
553 				       md->vlan_tci) != 0)
554 			goto fail;
555 	}
556 
557 	if ((direction == RTE_PCAPNG_DIRECTION_IN &&
558 	     (md->ol_flags & RTE_MBUF_F_RX_QINQ_STRIPPED)) ||
559 	    (direction == RTE_PCAPNG_DIRECTION_OUT &&
560 	     (md->ol_flags & RTE_MBUF_F_TX_QINQ))) {
561 		if (pcapng_vlan_insert(mc, RTE_ETHER_TYPE_QINQ,
562 				       md->vlan_tci_outer) != 0)
563 			goto fail;
564 	}
565 
566 	/* record HASH on incoming packets */
567 	rss_hash = (direction == RTE_PCAPNG_DIRECTION_IN &&
568 		    (md->ol_flags & RTE_MBUF_F_RX_RSS_HASH));
569 
570 	/* pad the packet to 32 bit boundary */
571 	data_len = rte_pktmbuf_data_len(mc);
572 	padding = RTE_ALIGN(data_len, sizeof(uint32_t)) - data_len;
573 	if (padding > 0) {
574 		void *tail = rte_pktmbuf_append(mc, padding);
575 
576 		if (tail == NULL)
577 			goto fail;
578 		memset(tail, 0, padding);
579 	}
580 
581 	optlen = pcapng_optlen(sizeof(flags));
582 	optlen += pcapng_optlen(sizeof(queue));
583 	if (rss_hash)
584 		optlen += pcapng_optlen(sizeof(uint8_t) + sizeof(uint32_t));
585 
586 	if (comment)
587 		optlen += pcapng_optlen(strlen(comment));
588 
589 	/* reserve trailing options and block length */
590 	opt = (struct pcapng_option *)
591 		rte_pktmbuf_append(mc, optlen + sizeof(uint32_t));
592 	if (unlikely(opt == NULL))
593 		goto fail;
594 
595 	switch (direction) {
596 	case RTE_PCAPNG_DIRECTION_IN:
597 		flags = PCAPNG_IFB_INBOUND;
598 		break;
599 	case RTE_PCAPNG_DIRECTION_OUT:
600 		flags = PCAPNG_IFB_OUTBOUND;
601 		break;
602 	default:
603 		flags = 0;
604 	}
605 
606 	opt = pcapng_add_option(opt, PCAPNG_EPB_FLAGS,
607 				&flags, sizeof(flags));
608 
609 	opt = pcapng_add_option(opt, PCAPNG_EPB_QUEUE,
610 				&queue, sizeof(queue));
611 
612 	if (rss_hash) {
613 		uint8_t hash_opt[5];
614 
615 		/* The algorithm could be something else if
616 		 * using rte_flow_action_rss; but the current API does not
617 		 * have a way for ethdev to report  this on a per-packet basis.
618 		 */
619 		hash_opt[0] = PCAPNG_HASH_TOEPLITZ;
620 
621 		memcpy(&hash_opt[1], &md->hash.rss, sizeof(uint32_t));
622 		opt = pcapng_add_option(opt, PCAPNG_EPB_HASH,
623 					&hash_opt, sizeof(hash_opt));
624 	}
625 
626 	if (comment)
627 		opt = pcapng_add_option(opt, PCAPNG_OPT_COMMENT, comment,
628 					strlen(comment));
629 
630 	/* Note: END_OPT necessary here. Wireshark doesn't do it. */
631 
632 	/* Add PCAPNG packet header */
633 	epb = (struct pcapng_enhance_packet_block *)
634 		rte_pktmbuf_prepend(mc, sizeof(*epb));
635 	if (unlikely(epb == NULL))
636 		goto fail;
637 
638 	epb->block_type = PCAPNG_ENHANCED_PACKET_BLOCK;
639 	epb->block_length = rte_pktmbuf_data_len(mc);
640 
641 	/* Interface index is filled in later during write */
642 	mc->port = port_id;
643 
644 	epb->timestamp_hi = ns >> 32;
645 	epb->timestamp_lo = (uint32_t)ns;
646 	epb->capture_length = data_len;
647 	epb->original_length = orig_len;
648 
649 	/* set trailer of block length */
650 	*(uint32_t *)opt = epb->block_length;
651 
652 	return mc;
653 
654 fail:
655 	rte_pktmbuf_free(mc);
656 	return NULL;
657 }
658 
659 /* Write pre-formatted packets to file. */
660 ssize_t
661 rte_pcapng_write_packets(rte_pcapng_t *self,
662 			 struct rte_mbuf *pkts[], uint16_t nb_pkts)
663 {
664 	struct iovec iov[IOV_MAX];
665 	unsigned int i, cnt = 0;
666 	ssize_t ret, total = 0;
667 
668 	for (i = 0; i < nb_pkts; i++) {
669 		struct rte_mbuf *m = pkts[i];
670 		struct pcapng_enhance_packet_block *epb;
671 
672 		/* sanity check that is really a pcapng mbuf */
673 		epb = rte_pktmbuf_mtod(m, struct pcapng_enhance_packet_block *);
674 		if (unlikely(epb->block_type != PCAPNG_ENHANCED_PACKET_BLOCK ||
675 			     epb->block_length != rte_pktmbuf_data_len(m))) {
676 			rte_errno = EINVAL;
677 			return -1;
678 		}
679 
680 		/* check that this interface was added. */
681 		epb->interface_id = self->port_index[m->port];
682 		if (unlikely(epb->interface_id > RTE_MAX_ETHPORTS)) {
683 			rte_errno = EINVAL;
684 			return -1;
685 		}
686 
687 		/*
688 		 * Handle case of highly fragmented and large burst size
689 		 * Note: this assumes that max segments per mbuf < IOV_MAX
690 		 */
691 		if (unlikely(cnt + m->nb_segs >= IOV_MAX)) {
692 			ret = writev(self->outfd, iov, cnt);
693 			if (unlikely(ret < 0)) {
694 				rte_errno = errno;
695 				return -1;
696 			}
697 			total += ret;
698 			cnt = 0;
699 		}
700 
701 		/*
702 		 * The DPDK port is recorded during pcapng_copy.
703 		 * Map that to PCAPNG interface in file.
704 		 */
705 		do {
706 			iov[cnt].iov_base = rte_pktmbuf_mtod(m, void *);
707 			iov[cnt].iov_len = rte_pktmbuf_data_len(m);
708 			++cnt;
709 		} while ((m = m->next));
710 	}
711 
712 	ret = writev(self->outfd, iov, cnt);
713 	if (unlikely(ret < 0)) {
714 		rte_errno = errno;
715 		return -1;
716 	}
717 	return total + ret;
718 }
719 
720 /* Create new pcapng writer handle */
721 rte_pcapng_t *
722 rte_pcapng_fdopen(int fd,
723 		  const char *osname, const char *hardware,
724 		  const char *appname, const char *comment)
725 {
726 	unsigned int i;
727 	rte_pcapng_t *self;
728 
729 	self = malloc(sizeof(*self));
730 	if (!self) {
731 		rte_errno = ENOMEM;
732 		return NULL;
733 	}
734 
735 	self->outfd = fd;
736 	self->ports = 0;
737 	for (i = 0; i < RTE_MAX_ETHPORTS; i++)
738 		self->port_index[i] = UINT32_MAX;
739 
740 	if (pcapng_section_block(self, osname, hardware, appname, comment) < 0)
741 		goto fail;
742 
743 	return self;
744 fail:
745 	free(self);
746 	return NULL;
747 }
748 
749 void
750 rte_pcapng_close(rte_pcapng_t *self)
751 {
752 	close(self->outfd);
753 	free(self);
754 }
755