/*	$NetBSD: pktqueue.c,v 1.12 2020/09/11 14:29:00 riastradh Exp $	*/

/*-
 * Copyright (c) 2014 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Mindaugas Rasiukevicius.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * The packet queue (pktqueue) interface is a lockless IP input queue
 * which also abstracts and handles network ISR scheduling.  It provides
 * a mechanism to enable receiver-side packet steering (RPS).
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: pktqueue.c,v 1.12 2020/09/11 14:29:00 riastradh Exp $");

#include <sys/param.h>
#include <sys/types.h>

#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/pcq.h>
#include <sys/intr.h>
#include <sys/mbuf.h>
#include <sys/proc.h>
#include <sys/percpu.h>
#include <sys/xcall.h>

#include <net/pktqueue.h>

/*
 * WARNING: update this if struct pktqueue changes.
 */
#define	PKTQ_CLPAD	\
    MAX(COHERENCY_UNIT, COHERENCY_UNIT - sizeof(kmutex_t) - sizeof(u_int))

struct pktqueue {
	/*
	 * The lock used for a barrier mechanism.  The barrier counter,
	 * as well as the drop counter, are managed atomically though.
	 * Ensure this group is in a separate cache line.
	 */
	kmutex_t	pq_lock;
	volatile u_int	pq_barrier;
	uint8_t		_pad[PKTQ_CLPAD];

	/* The size of the queue, counters and the interrupt handler. */
	u_int		pq_maxlen;
	percpu_t *	pq_counters;
	void *		pq_sih;

	/* Finally, per-CPU queues. */
	struct percpu *	pq_pcq;		/* struct pcq * */
};

/* The counters of the packet queue. */
#define	PQCNT_ENQUEUE	0
#define	PQCNT_DEQUEUE	1
#define	PQCNT_DROP	2
#define	PQCNT_NCOUNTERS	3

typedef struct {
	uint64_t	count[PQCNT_NCOUNTERS];
} pktq_counters_t;

/* Special marker value used by pktq_barrier() mechanism. */
#define	PKTQ_MARKER	((void *)(~0ULL))

static void
pktq_init_cpu(void *vqp, void *vpq, struct cpu_info *ci)
{
	struct pcq **qp = vqp;
	struct pktqueue *pq = vpq;

	*qp = pcq_create(pq->pq_maxlen, KM_SLEEP);
}

static void
pktq_fini_cpu(void *vqp, void *vpq, struct cpu_info *ci)
{
	struct pcq **qp = vqp, *q = *qp;

	KASSERT(pcq_peek(q) == NULL);
	pcq_destroy(q);
	*qp = NULL;		/* paranoia */
}

static struct pcq *
pktq_pcq(struct pktqueue *pq, struct cpu_info *ci)
{
	struct pcq **qp, *q;

	/*
	 * As long as preemption is disabled, the xcall to swap percpu
	 * buffers can't complete, so it is safe to read the pointer.
	 */
	KASSERT(kpreempt_disabled());

	qp = percpu_getptr_remote(pq->pq_pcq, ci);
	q = *qp;

	return q;
}

pktqueue_t *
pktq_create(size_t maxlen, void (*intrh)(void *), void *sc)
{
	const u_int sflags = SOFTINT_NET | SOFTINT_MPSAFE | SOFTINT_RCPU;
	pktqueue_t *pq;
	percpu_t *pc;
	void *sih;

	pc = percpu_alloc(sizeof(pktq_counters_t));
	if ((sih = softint_establish(sflags, intrh, sc)) == NULL) {
		percpu_free(pc, sizeof(pktq_counters_t));
		return NULL;
	}

	pq = kmem_zalloc(sizeof(*pq), KM_SLEEP);
	mutex_init(&pq->pq_lock, MUTEX_DEFAULT, IPL_NONE);
	pq->pq_maxlen = maxlen;
	pq->pq_counters = pc;
	pq->pq_sih = sih;
	pq->pq_pcq = percpu_create(sizeof(struct pcq *),
	    pktq_init_cpu, pktq_fini_cpu, pq);

	return pq;
}

void
pktq_destroy(pktqueue_t *pq)
{

	percpu_free(pq->pq_pcq, sizeof(struct pcq *));
	percpu_free(pq->pq_counters, sizeof(pktq_counters_t));
	softint_disestablish(pq->pq_sih);
	mutex_destroy(&pq->pq_lock);
	kmem_free(pq, sizeof(*pq));
}

/*
 * - pktq_inc_count: increment the counter given an ID.
 * - pktq_collect_counts: handler to sum up the counts from each CPU.
 * - pktq_get_count: return the effective count given an ID.
 */

static inline void
pktq_inc_count(pktqueue_t *pq, u_int i)
{
	percpu_t *pc = pq->pq_counters;
	pktq_counters_t *c;

	c = percpu_getref(pc);
	c->count[i]++;
	percpu_putref(pc);
}

static void
pktq_collect_counts(void *mem, void *arg, struct cpu_info *ci)
{
	const pktq_counters_t *c = mem;
	pktq_counters_t *sum = arg;

	int s = splnet();

	for (u_int i = 0; i < PQCNT_NCOUNTERS; i++) {
		sum->count[i] += c->count[i];
	}

	splx(s);
}

uint64_t
pktq_get_count(pktqueue_t *pq, pktq_count_t c)
{
	pktq_counters_t sum;

	if (c != PKTQ_MAXLEN) {
		memset(&sum, 0, sizeof(sum));
		percpu_foreach_xcall(pq->pq_counters,
		    XC_HIGHPRI_IPL(IPL_SOFTNET), pktq_collect_counts, &sum);
	}
	switch (c) {
	case PKTQ_NITEMS:
		return sum.count[PQCNT_ENQUEUE] - sum.count[PQCNT_DEQUEUE];
	case PKTQ_DROPS:
		return sum.count[PQCNT_DROP];
	case PKTQ_MAXLEN:
		return pq->pq_maxlen;
	}
	return 0;
}

uint32_t
pktq_rps_hash(const struct mbuf *m __unused)
{
	/*
	 * XXX: No distribution yet; the softnet_lock contention
	 * XXX: must be eliminated first.
	 */
	return 0;
}

/*
 * pktq_enqueue: inject the packet into the end of the queue.
 *
 * => Must be called from interrupt context or with preemption disabled.
 * => Consumes the packet and returns true on success.
 * => Returns false on failure; the caller is responsible for freeing
 *    the packet.
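 *
 * Illustrative sketch (an assumption, not taken from this file): with a
 * queue "examp_pktq" created by pktq_create() and an mbuf "m" from the
 * receive path, a caller in thread context could enqueue as follows,
 * freeing the mbuf itself when the per-CPU queue is full:
 *
 *	kpreempt_disable();
 *	if (!pktq_enqueue(examp_pktq, m, pktq_rps_hash(m)))
 *		m_freem(m);
 *	kpreempt_enable();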
 */
bool
pktq_enqueue(pktqueue_t *pq, struct mbuf *m, const u_int hash __unused)
{
#if defined(_RUMPKERNEL) || defined(_RUMP_NATIVE_ABI)
	struct cpu_info *ci = curcpu();
#else
	struct cpu_info *ci = cpu_lookup(hash % ncpu);
#endif

	KASSERT(kpreempt_disabled());

	if (__predict_false(!pcq_put(pktq_pcq(pq, ci), m))) {
		pktq_inc_count(pq, PQCNT_DROP);
		return false;
	}
	softint_schedule_cpu(pq->pq_sih, ci);
	pktq_inc_count(pq, PQCNT_ENQUEUE);
	return true;
}

/*
 * pktq_dequeue: take a packet from the queue.
 *
 * => Must be called with preemption disabled.
 * => Must ensure there are no concurrent dequeue calls.
 */
struct mbuf *
pktq_dequeue(pktqueue_t *pq)
{
	struct cpu_info *ci = curcpu();
	struct mbuf *m;

	KASSERT(kpreempt_disabled());

	m = pcq_get(pktq_pcq(pq, ci));
	if (__predict_false(m == PKTQ_MARKER)) {
		/* Note the marker entry. */
		atomic_inc_uint(&pq->pq_barrier);
		return NULL;
	}
	if (__predict_true(m != NULL)) {
		pktq_inc_count(pq, PQCNT_DEQUEUE);
	}
	return m;
}

/*
 * pktq_barrier: waits for a grace period during which all packets
 * enqueued at the moment of calling this routine will have been
 * processed.  This is used to ensure that e.g. packets referencing
 * some interface were drained.
 */
void
pktq_barrier(pktqueue_t *pq)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	u_int pending = 0;

	mutex_enter(&pq->pq_lock);
	KASSERT(pq->pq_barrier == 0);

	for (CPU_INFO_FOREACH(cii, ci)) {
		struct pcq *q;

		kpreempt_disable();
		q = pktq_pcq(pq, ci);
		kpreempt_enable();

		/* If the queue is empty - nothing to do. */
		if (pcq_peek(q) == NULL) {
			continue;
		}
		/* Otherwise, insert the marker entry and count it as pending. */
		while (!pcq_put(q, PKTQ_MARKER)) {
			kpause("pktqsync", false, 1, NULL);
		}
		kpreempt_disable();
		softint_schedule_cpu(pq->pq_sih, ci);
		kpreempt_enable();
		pending++;
	}

	/* Wait for each queue to process the markers. */
	while (pq->pq_barrier != pending) {
		kpause("pktqsync", false, 1, NULL);
	}
	pq->pq_barrier = 0;
	mutex_exit(&pq->pq_lock);
}

/*
 * pktq_flush: free mbufs in all queues.
 *
 * => The caller must ensure there are no concurrent writers or flush calls.
 */
void
pktq_flush(pktqueue_t *pq)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct mbuf *m;

	for (CPU_INFO_FOREACH(cii, ci)) {
		struct pcq *q;

		kpreempt_disable();
		q = pktq_pcq(pq, ci);
		kpreempt_enable();

		/*
		 * XXX This can't be right -- if the softint is running
		 * then pcq_get isn't safe here.
		 */
		while ((m = pcq_get(q)) != NULL) {
			pktq_inc_count(pq, PQCNT_DEQUEUE);
			m_freem(m);
		}
	}
}

static void
pktq_set_maxlen_cpu(void *vpq, void *vqs)
{
	struct pktqueue *pq = vpq;
	struct pcq **qp, *q, **qs = vqs;
	unsigned i = cpu_index(curcpu());
	int s;

	s = splnet();
	qp = percpu_getref(pq->pq_pcq);
	q = *qp;
	*qp = qs[i];
	qs[i] = q;
	percpu_putref(pq->pq_pcq);
	splx(s);
}

/*
 * pktq_set_maxlen: create per-CPU queues using a new size and replace
 * the existing queues without losing any packets.
 *
 * XXX ncpu must remain stable throughout.
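 *
 * Illustrative call (a sketch; the queue pointer and the new length are
 * assumptions): this routine allocates with KM_SLEEP and may sleep in
 * pktq_barrier(), so it must be invoked from thread context, typically
 * via sysctl_pktq_maxlen() below.
 *
 *	error = pktq_set_maxlen(examp_pktq, 1024);
 *	if (error != 0)
 *		return error;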
 */
int
pktq_set_maxlen(pktqueue_t *pq, size_t maxlen)
{
	const u_int slotbytes = ncpu * sizeof(pcq_t *);
	pcq_t **qs;

	if (!maxlen || maxlen > PCQ_MAXLEN)
		return EINVAL;
	if (pq->pq_maxlen == maxlen)
		return 0;

	/* First, allocate the new queues. */
	qs = kmem_zalloc(slotbytes, KM_SLEEP);
	for (u_int i = 0; i < ncpu; i++) {
		qs[i] = pcq_create(maxlen, KM_SLEEP);
	}

	/*
	 * Issue an xcall to replace the queue pointers on each CPU.
	 * This implies all the necessary memory barriers.
	 */
	mutex_enter(&pq->pq_lock);
	xc_wait(xc_broadcast(XC_HIGHPRI, pktq_set_maxlen_cpu, pq, qs));
	pq->pq_maxlen = maxlen;
	mutex_exit(&pq->pq_lock);

	/*
	 * At this point, the new packets are flowing into the new
	 * queues.  However, the old queues may have some packets
	 * present which are no longer being processed.  We are going
	 * to re-enqueue them.  This may change the order of packet
	 * arrival, but it is not considered an issue.
	 *
	 * There may be in-flight interrupts calling pktq_dequeue()
	 * which reference the old queues.  Issue a barrier to ensure
	 * that we are going to be the only pcq_get() callers on the
	 * old queues.
	 */
	pktq_barrier(pq);

	for (u_int i = 0; i < ncpu; i++) {
		struct pcq *q;
		struct mbuf *m;

		kpreempt_disable();
		q = pktq_pcq(pq, cpu_lookup(i));
		kpreempt_enable();

		while ((m = pcq_get(qs[i])) != NULL) {
			while (!pcq_put(q, m)) {
				kpause("pktqrenq", false, 1, NULL);
			}
		}
		pcq_destroy(qs[i]);
	}

	/* Well, that was fun. */
	kmem_free(qs, slotbytes);
	return 0;
}

int
sysctl_pktq_maxlen(SYSCTLFN_ARGS, pktqueue_t *pq)
{
	u_int nmaxlen = pktq_get_count(pq, PKTQ_MAXLEN);
	struct sysctlnode node = *rnode;
	int error;

	node.sysctl_data = &nmaxlen;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;
	return pktq_set_maxlen(pq, nmaxlen);
}

int
sysctl_pktq_count(SYSCTLFN_ARGS, pktqueue_t *pq, u_int count_id)
{
	uint64_t count = pktq_get_count(pq, count_id);
	struct sysctlnode node = *rnode;

	node.sysctl_data = &count;
	return sysctl_lookup(SYSCTLFN_CALL(&node));
}
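
/*
 * Illustrative sysctl glue (a sketch, not part of this interface): the
 * helpers above take an extra pktqueue_t argument, so a subsystem would
 * typically wrap them in plain SYSCTLFN_ARGS handlers bound to its own
 * queue.  The wrapper name and the "examp_pktq" pointer are hypothetical.
 *
 *	static int
 *	sysctl_examp_pktq_maxlen(SYSCTLFN_ARGS)
 *	{
 *		return sysctl_pktq_maxlen(SYSCTLFN_CALL(rnode), examp_pktq);
 *	}
 */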