/*	$NetBSD: pktqueue.c,v 1.13 2021/03/25 08:18:03 skrll Exp $	*/

/*-
 * Copyright (c) 2014 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Mindaugas Rasiukevicius.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * The packet queue (pktqueue) interface is a lockless IP input queue
 * which also abstracts and handles network ISR scheduling.  It provides
 * a mechanism to enable receiver-side packet steering (RPS).
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: pktqueue.c,v 1.13 2021/03/25 08:18:03 skrll Exp $");

#include <sys/param.h>
#include <sys/types.h>

#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/pcq.h>
#include <sys/intr.h>
#include <sys/mbuf.h>
#include <sys/proc.h>
#include <sys/percpu.h>
#include <sys/xcall.h>

#include <net/pktqueue.h>

struct pktqueue {
	/*
	 * The lock used for a barrier mechanism.  The barrier counter
	 * is managed atomically, though.  Ensure this group is in a
	 * separate cache line.
	 */
	union {
		struct {
			kmutex_t	pq_lock;
			volatile u_int	pq_barrier;
		};
		uint8_t		_pad[COHERENCY_UNIT];
	};

	/* The size of the queue, counters and the interrupt handler. */
	u_int		pq_maxlen;
	percpu_t *	pq_counters;
	void *		pq_sih;

	/* Finally, per-CPU queues. */
	struct percpu *	pq_pcq;	/* struct pcq * */
};

/* The counters of the packet queue. */
#define	PQCNT_ENQUEUE	0
#define	PQCNT_DEQUEUE	1
#define	PQCNT_DROP	2
#define	PQCNT_NCOUNTERS	3

typedef struct {
	uint64_t	count[PQCNT_NCOUNTERS];
} pktq_counters_t;

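/*
 * Illustrative sketch (not part of the implementation): the per-CPU
 * PQCNT_* counters above back the pktq_count_t queries served by
 * pktq_get_count() below.  A monitoring path might read them roughly
 * as follows; "pq" and the local variable names are hypothetical.
 *
 *	uint64_t backlog = pktq_get_count(pq, PKTQ_NITEMS);
 *	uint64_t drops   = pktq_get_count(pq, PKTQ_DROPS);
 *	uint64_t maxlen  = pktq_get_count(pq, PKTQ_MAXLEN);
 *
 * PKTQ_NITEMS is derived as the enqueue count minus the dequeue count
 * summed over all CPUs, so it is a snapshot rather than an exact
 * instantaneous queue length.
 */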

/* Special marker value used by pktq_barrier() mechanism. */
#define	PKTQ_MARKER	((void *)(~0ULL))

static void
pktq_init_cpu(void *vqp, void *vpq, struct cpu_info *ci)
{
	struct pcq **qp = vqp;
	struct pktqueue *pq = vpq;

	*qp = pcq_create(pq->pq_maxlen, KM_SLEEP);
}

static void
pktq_fini_cpu(void *vqp, void *vpq, struct cpu_info *ci)
{
	struct pcq **qp = vqp, *q = *qp;

	KASSERT(pcq_peek(q) == NULL);
	pcq_destroy(q);
	*qp = NULL;		/* paranoia */
}

static struct pcq *
pktq_pcq(struct pktqueue *pq, struct cpu_info *ci)
{
	struct pcq **qp, *q;

	/*
	 * As long as preemption is disabled, the xcall to swap percpu
	 * buffers can't complete, so it is safe to read the pointer.
	 */
	KASSERT(kpreempt_disabled());

	qp = percpu_getptr_remote(pq->pq_pcq, ci);
	q = *qp;

	return q;
}

pktqueue_t *
pktq_create(size_t maxlen, void (*intrh)(void *), void *sc)
{
	const u_int sflags = SOFTINT_NET | SOFTINT_MPSAFE | SOFTINT_RCPU;
	pktqueue_t *pq;
	percpu_t *pc;
	void *sih;

	pc = percpu_alloc(sizeof(pktq_counters_t));
	if ((sih = softint_establish(sflags, intrh, sc)) == NULL) {
		percpu_free(pc, sizeof(pktq_counters_t));
		return NULL;
	}

	pq = kmem_zalloc(sizeof(*pq), KM_SLEEP);
	mutex_init(&pq->pq_lock, MUTEX_DEFAULT, IPL_NONE);
	pq->pq_maxlen = maxlen;
	pq->pq_counters = pc;
	pq->pq_sih = sih;
	pq->pq_pcq = percpu_create(sizeof(struct pcq *),
	    pktq_init_cpu, pktq_fini_cpu, pq);

	return pq;
}

void
pktq_destroy(pktqueue_t *pq)
{

	percpu_free(pq->pq_pcq, sizeof(struct pcq *));
	percpu_free(pq->pq_counters, sizeof(pktq_counters_t));
	softint_disestablish(pq->pq_sih);
	mutex_destroy(&pq->pq_lock);
	kmem_free(pq, sizeof(*pq));
}

/*
 * - pktq_inc_count: increment the counter given an ID.
 * - pktq_collect_counts: handler to sum up the counts from each CPU.
 * - pktq_get_count: return the effective count given an ID.
 */

static inline void
pktq_inc_count(pktqueue_t *pq, u_int i)
{
	percpu_t *pc = pq->pq_counters;
	pktq_counters_t *c;

	c = percpu_getref(pc);
	c->count[i]++;
	percpu_putref(pc);
}

static void
pktq_collect_counts(void *mem, void *arg, struct cpu_info *ci)
{
	const pktq_counters_t *c = mem;
	pktq_counters_t *sum = arg;

	int s = splnet();

	for (u_int i = 0; i < PQCNT_NCOUNTERS; i++) {
		sum->count[i] += c->count[i];
	}

	splx(s);
}

uint64_t
pktq_get_count(pktqueue_t *pq, pktq_count_t c)
{
	pktq_counters_t sum;

	if (c != PKTQ_MAXLEN) {
		memset(&sum, 0, sizeof(sum));
		percpu_foreach_xcall(pq->pq_counters,
		    XC_HIGHPRI_IPL(IPL_SOFTNET), pktq_collect_counts, &sum);
	}
	switch (c) {
	case PKTQ_NITEMS:
		return sum.count[PQCNT_ENQUEUE] - sum.count[PQCNT_DEQUEUE];
	case PKTQ_DROPS:
		return sum.count[PQCNT_DROP];
	case PKTQ_MAXLEN:
		return pq->pq_maxlen;
	}
	return 0;
}

uint32_t
pktq_rps_hash(const struct mbuf *m __unused)
{
	/*
	 * XXX: No distribution yet; the softnet_lock contention
	 * XXX: must be eliminated first.
	 */
	return 0;
}

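/*
 * Usage sketch (illustrative only; example_pktq, example_intr and
 * example_input are hypothetical names): a protocol creates its input
 * queue with a softint handler, and that handler drains the current
 * CPU's sub-queue with pktq_dequeue().  The handler runs in software
 * interrupt context, which satisfies the preemption requirement of
 * pktq_dequeue() below; example_input() stands in for the protocol's
 * actual input routine.
 *
 *	static pktqueue_t *example_pktq;
 *
 *	static void
 *	example_intr(void *arg)
 *	{
 *		struct mbuf *m;
 *
 *		while ((m = pktq_dequeue(example_pktq)) != NULL) {
 *			example_input(m);
 *		}
 *	}
 *
 *	example_pktq = pktq_create(IFQ_MAXLEN, example_intr, NULL);
 *
 * On the producer side, an input path disables preemption, picks a hash
 * with pktq_rps_hash() and enqueues; on failure it must free the packet:
 *
 *	const uint32_t h = pktq_rps_hash(m);
 *
 *	kpreempt_disable();
 *	if (__predict_false(!pktq_enqueue(example_pktq, m, h))) {
 *		m_freem(m);
 *	}
 *	kpreempt_enable();
 */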

/*
 * pktq_enqueue: inject the packet into the end of the queue.
 *
 * => Must be called from interrupt context or with preemption disabled.
 * => Consumes the packet and returns true on success.
 * => Returns false on failure; the caller is responsible for freeing the packet.
 */
bool
pktq_enqueue(pktqueue_t *pq, struct mbuf *m, const u_int hash __unused)
{
#if defined(_RUMPKERNEL) || defined(_RUMP_NATIVE_ABI)
	struct cpu_info *ci = curcpu();
#else
	struct cpu_info *ci = cpu_lookup(hash % ncpu);
#endif

	KASSERT(kpreempt_disabled());

	if (__predict_false(!pcq_put(pktq_pcq(pq, ci), m))) {
		pktq_inc_count(pq, PQCNT_DROP);
		return false;
	}
	softint_schedule_cpu(pq->pq_sih, ci);
	pktq_inc_count(pq, PQCNT_ENQUEUE);
	return true;
}

/*
 * pktq_dequeue: take a packet from the queue.
 *
 * => Must be called with preemption disabled.
 * => The caller must ensure there are no concurrent dequeue calls.
 */
struct mbuf *
pktq_dequeue(pktqueue_t *pq)
{
	struct cpu_info *ci = curcpu();
	struct mbuf *m;

	KASSERT(kpreempt_disabled());

	m = pcq_get(pktq_pcq(pq, ci));
	if (__predict_false(m == PKTQ_MARKER)) {
		/* Note the marker entry. */
		atomic_inc_uint(&pq->pq_barrier);
		return NULL;
	}
	if (__predict_true(m != NULL)) {
		pktq_inc_count(pq, PQCNT_DEQUEUE);
	}
	return m;
}

/*
 * pktq_barrier: waits for a grace period during which all packets
 * enqueued at the moment of calling this routine will have been
 * processed.  This is used to ensure that e.g. packets referencing
 * some interface were drained.
 */
void
pktq_barrier(pktqueue_t *pq)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	u_int pending = 0;

	mutex_enter(&pq->pq_lock);
	KASSERT(pq->pq_barrier == 0);

	for (CPU_INFO_FOREACH(cii, ci)) {
		struct pcq *q;

		kpreempt_disable();
		q = pktq_pcq(pq, ci);
		kpreempt_enable();

		/* If the queue is empty, there is nothing to do. */
		if (pcq_peek(q) == NULL) {
			continue;
		}
		/* Otherwise, insert the marker and count the queue as pending. */
		while (!pcq_put(q, PKTQ_MARKER)) {
			kpause("pktqsync", false, 1, NULL);
		}
		kpreempt_disable();
		softint_schedule_cpu(pq->pq_sih, ci);
		kpreempt_enable();
		pending++;
	}

	/* Wait for each queue to process the markers. */
	while (pq->pq_barrier != pending) {
		kpause("pktqsync", false, 1, NULL);
	}
	pq->pq_barrier = 0;
	mutex_exit(&pq->pq_lock);
}

/*
 * pktq_flush: free mbufs in all queues.
 *
 * => The caller must ensure there are no concurrent writers or flush calls.
 */
void
pktq_flush(pktqueue_t *pq)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct mbuf *m;

	for (CPU_INFO_FOREACH(cii, ci)) {
		struct pcq *q;

		kpreempt_disable();
		q = pktq_pcq(pq, ci);
		kpreempt_enable();

		/*
		 * XXX This can't be right -- if the softint is running
		 * then pcq_get isn't safe here.
		 */
		while ((m = pcq_get(q)) != NULL) {
			pktq_inc_count(pq, PQCNT_DEQUEUE);
			m_freem(m);
		}
	}
}

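/*
 * Teardown sketch (illustrative only; example_pktq is the hypothetical
 * queue from the earlier sketch): before freeing objects that enqueued
 * packets may still reference, e.g. on interface detach, a caller would
 * first make sure no further pktq_enqueue() calls can reference the
 * object, then wait out the packets already in flight, and only then
 * reclaim whatever is left:
 *
 *	pktq_barrier(example_pktq);
 *	pktq_flush(example_pktq);
 *
 * Per the comments above, pktq_flush() additionally requires that no
 * producers or flush calls run concurrently with it.
 */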

static void
pktq_set_maxlen_cpu(void *vpq, void *vqs)
{
	struct pktqueue *pq = vpq;
	struct pcq **qp, *q, **qs = vqs;
	unsigned i = cpu_index(curcpu());
	int s;

	s = splnet();
	qp = percpu_getref(pq->pq_pcq);
	q = *qp;
	*qp = qs[i];
	qs[i] = q;
	percpu_putref(pq->pq_pcq);
	splx(s);
}

/*
 * pktq_set_maxlen: create per-CPU queues using a new size and replace
 * the existing queues without losing any packets.
 *
 * XXX ncpu must remain stable throughout.
 */
int
pktq_set_maxlen(pktqueue_t *pq, size_t maxlen)
{
	const u_int slotbytes = ncpu * sizeof(pcq_t *);
	pcq_t **qs;

	if (!maxlen || maxlen > PCQ_MAXLEN)
		return EINVAL;
	if (pq->pq_maxlen == maxlen)
		return 0;

	/* First, allocate the new queues. */
	qs = kmem_zalloc(slotbytes, KM_SLEEP);
	for (u_int i = 0; i < ncpu; i++) {
		qs[i] = pcq_create(maxlen, KM_SLEEP);
	}

	/*
	 * Issue an xcall to replace the queue pointers on each CPU.
	 * This implies all the necessary memory barriers.
	 */
	mutex_enter(&pq->pq_lock);
	xc_wait(xc_broadcast(XC_HIGHPRI, pktq_set_maxlen_cpu, pq, qs));
	pq->pq_maxlen = maxlen;
	mutex_exit(&pq->pq_lock);

	/*
	 * At this point, the new packets are flowing into the new
	 * queues.  However, the old queues may have some packets
	 * present which are no longer being processed.  We are going
	 * to re-enqueue them.  This may change the order of packet
	 * arrival, but it is not considered an issue.
	 *
	 * There may be in-flight interrupts calling pktq_dequeue()
	 * which reference the old queues.  Issue a barrier to ensure
	 * that we are going to be the only pcq_get() callers on the
	 * old queues.
	 */
	pktq_barrier(pq);

	for (u_int i = 0; i < ncpu; i++) {
		struct pcq *q;
		struct mbuf *m;

		kpreempt_disable();
		q = pktq_pcq(pq, cpu_lookup(i));
		kpreempt_enable();

		while ((m = pcq_get(qs[i])) != NULL) {
			while (!pcq_put(q, m)) {
				kpause("pktqrenq", false, 1, NULL);
			}
		}
		pcq_destroy(qs[i]);
	}

	/* Well, that was fun. */
	kmem_free(qs, slotbytes);
	return 0;
}

int
sysctl_pktq_maxlen(SYSCTLFN_ARGS, pktqueue_t *pq)
{
	u_int nmaxlen = pktq_get_count(pq, PKTQ_MAXLEN);
	struct sysctlnode node = *rnode;
	int error;

	node.sysctl_data = &nmaxlen;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;
	return pktq_set_maxlen(pq, nmaxlen);
}

int
sysctl_pktq_count(SYSCTLFN_ARGS, pktqueue_t *pq, u_int count_id)
{
	uint64_t count = pktq_get_count(pq, count_id);
	struct sysctlnode node = *rnode;

	node.sysctl_data = &count;
	return sysctl_lookup(SYSCTLFN_CALL(&node));
}
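
/*
 * Sysctl hookup sketch (illustrative only; the wrapper names and
 * example_pktq are hypothetical): the helpers above take an extra
 * pktqueue_t argument, so a subsystem exposes them through thin
 * wrappers that match the plain sysctl handler signature and are then
 * registered with sysctl_createv() like any other handler.
 *
 *	static int
 *	example_sysctl_maxlen(SYSCTLFN_ARGS)
 *	{
 *		return sysctl_pktq_maxlen(SYSCTLFN_CALL(rnode), example_pktq);
 *	}
 *
 *	static int
 *	example_sysctl_drops(SYSCTLFN_ARGS)
 *	{
 *		return sysctl_pktq_count(SYSCTLFN_CALL(rnode), example_pktq,
 *		    PKTQ_DROPS);
 *	}
 */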