1 /* 2 * Copyright (c) 2003, 2004 Matthew Dillon. All rights reserved. 3 * Copyright (c) 2003, 2004 Jeffrey M. Hsu. All rights reserved. 4 * Copyright (c) 2003 Jonathan Lemon. All rights reserved. 5 * Copyright (c) 2003, 2004 The DragonFly Project. All rights reserved. 6 * 7 * This code is derived from software contributed to The DragonFly Project 8 * by Jonathan Lemon, Jeffrey M. Hsu, and Matthew Dillon. 9 * 10 * Jonathan Lemon gave Jeffrey Hsu permission to combine his copyright 11 * into this one around July 8 2004. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 3. Neither the name of The DragonFly Project nor the names of its 22 * contributors may be used to endorse or promote products derived 23 * from this software without specific, prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 26 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 27 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 28 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 29 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 30 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 31 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 32 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 33 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 34 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 35 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * $DragonFly: src/sys/net/netisr.c,v 1.49 2008/11/01 10:29:31 sephe Exp $ 39 */ 40 41 #include <sys/param.h> 42 #include <sys/systm.h> 43 #include <sys/kernel.h> 44 #include <sys/malloc.h> 45 #include <sys/msgport.h> 46 #include <sys/proc.h> 47 #include <sys/interrupt.h> 48 #include <sys/socket.h> 49 #include <sys/sysctl.h> 50 #include <sys/socketvar.h> 51 #include <net/if.h> 52 #include <net/if_var.h> 53 #include <net/netisr.h> 54 #include <machine/cpufunc.h> 55 56 #include <sys/thread2.h> 57 #include <sys/msgport2.h> 58 #include <net/netmsg2.h> 59 #include <sys/mplock2.h> 60 61 static void netmsg_sync_func(struct netmsg *msg); 62 static void netmsg_service_loop(void *arg); 63 static void cpu0_cpufn(struct mbuf **mp, int hoff); 64 65 struct netmsg_port_registration { 66 TAILQ_ENTRY(netmsg_port_registration) npr_entry; 67 lwkt_port_t npr_port; 68 }; 69 70 struct netmsg_rollup { 71 TAILQ_ENTRY(netmsg_rollup) ru_entry; 72 netisr_ru_t ru_func; 73 }; 74 75 static struct netisr netisrs[NETISR_MAX]; 76 static TAILQ_HEAD(,netmsg_port_registration) netreglist; 77 static TAILQ_HEAD(,netmsg_rollup) netrulist; 78 79 /* Per-CPU thread to handle any protocol. */ 80 static struct thread netisr_cpu[MAXCPU]; 81 lwkt_port netisr_afree_rport; 82 lwkt_port netisr_adone_rport; 83 lwkt_port netisr_apanic_rport; 84 lwkt_port netisr_sync_port; 85 86 static int (*netmsg_fwd_port_fn)(lwkt_port_t, lwkt_msg_t); 87 88 SYSCTL_NODE(_net, OID_AUTO, netisr, CTLFLAG_RW, 0, "netisr"); 89 90 /* 91 * netisr_afree_rport replymsg function, only used to handle async 92 * messages which the sender has abandoned to their fate. 93 */ 94 static void 95 netisr_autofree_reply(lwkt_port_t port, lwkt_msg_t msg) 96 { 97 kfree(msg, M_LWKTMSG); 98 } 99 100 /* 101 * We need a custom putport function to handle the case where the 102 * message target is the current thread's message port. This case 103 * can occur when the TCP or UDP stack does a direct callback to NFS and NFS 104 * then turns around and executes a network operation synchronously. 105 * 106 * To prevent deadlocking, we must execute these self-referential messages 107 * synchronously, effectively turning the message into a glorified direct 108 * procedure call back into the protocol stack. The operation must be 109 * complete on return or we will deadlock, so panic if it isn't. 110 */ 111 static int 112 netmsg_put_port(lwkt_port_t port, lwkt_msg_t lmsg) 113 { 114 netmsg_t netmsg = (void *)lmsg; 115 116 if ((lmsg->ms_flags & MSGF_SYNC) && port == &curthread->td_msgport) { 117 netmsg->nm_dispatch(netmsg); 118 if ((lmsg->ms_flags & MSGF_DONE) == 0) { 119 panic("netmsg_put_port: self-referential " 120 "deadlock on netport"); 121 } 122 return(EASYNC); 123 } else { 124 return(netmsg_fwd_port_fn(port, lmsg)); 125 } 126 } 127 128 /* 129 * UNIX DOMAIN sockets still have to run their uipc functions synchronously, 130 * because they depend on the user proc context for a number of things 131 * (like creds) which we have not yet incorporated into the message structure. 132 * 133 * However, we maintain or message/port abstraction. Having a special 134 * synchronous port which runs the commands synchronously gives us the 135 * ability to serialize operations in one place later on when we start 136 * removing the BGL. 137 */ 138 static int 139 netmsg_sync_putport(lwkt_port_t port, lwkt_msg_t lmsg) 140 { 141 netmsg_t netmsg = (void *)lmsg; 142 143 KKASSERT((lmsg->ms_flags & MSGF_DONE) == 0); 144 145 lmsg->ms_target_port = port; /* required for abort */ 146 netmsg->nm_dispatch(netmsg); 147 return(EASYNC); 148 } 149 150 static void 151 netisr_init(void) 152 { 153 int i; 154 155 TAILQ_INIT(&netreglist); 156 TAILQ_INIT(&netrulist); 157 158 /* 159 * Create default per-cpu threads for generic protocol handling. 160 */ 161 for (i = 0; i < ncpus; ++i) { 162 lwkt_create(netmsg_service_loop, NULL, NULL, 163 &netisr_cpu[i], TDF_STOPREQ, i, 164 "netisr_cpu %d", i); 165 netmsg_service_port_init(&netisr_cpu[i].td_msgport); 166 lwkt_schedule(&netisr_cpu[i]); 167 } 168 169 /* 170 * The netisr_afree_rport is a special reply port which automatically 171 * frees the replied message. The netisr_adone_rport simply marks 172 * the message as being done. The netisr_apanic_rport panics if 173 * the message is replied to. 174 */ 175 lwkt_initport_replyonly(&netisr_afree_rport, netisr_autofree_reply); 176 lwkt_initport_replyonly_null(&netisr_adone_rport); 177 lwkt_initport_panic(&netisr_apanic_rport); 178 179 /* 180 * The netisr_syncport is a special port which executes the message 181 * synchronously and waits for it if EASYNC is returned. 182 */ 183 lwkt_initport_putonly(&netisr_sync_port, netmsg_sync_putport); 184 } 185 186 SYSINIT(netisr, SI_SUB_PRE_DRIVERS, SI_ORDER_FIRST, netisr_init, NULL); 187 188 /* 189 * Finish initializing the message port for a netmsg service. This also 190 * registers the port for synchronous cleanup operations such as when an 191 * ifnet is being destroyed. There is no deregistration API yet. 192 */ 193 void 194 netmsg_service_port_init(lwkt_port_t port) 195 { 196 struct netmsg_port_registration *reg; 197 198 /* 199 * Override the putport function. Our custom function checks for 200 * self-references and executes such commands synchronously. 201 */ 202 if (netmsg_fwd_port_fn == NULL) 203 netmsg_fwd_port_fn = port->mp_putport; 204 KKASSERT(netmsg_fwd_port_fn == port->mp_putport); 205 port->mp_putport = netmsg_put_port; 206 207 /* 208 * Keep track of ports using the netmsg API so we can synchronize 209 * certain operations (such as freeing an ifnet structure) across all 210 * consumers. 211 */ 212 reg = kmalloc(sizeof(*reg), M_TEMP, M_WAITOK|M_ZERO); 213 reg->npr_port = port; 214 TAILQ_INSERT_TAIL(&netreglist, reg, npr_entry); 215 } 216 217 /* 218 * This function synchronizes the caller with all netmsg services. For 219 * example, if an interface is being removed we must make sure that all 220 * packets related to that interface complete processing before the structure 221 * can actually be freed. This sort of synchronization is an alternative to 222 * ref-counting the netif, removing the ref counting overhead in favor of 223 * placing additional overhead in the netif freeing sequence (where it is 224 * inconsequential). 225 */ 226 void 227 netmsg_service_sync(void) 228 { 229 struct netmsg_port_registration *reg; 230 struct netmsg smsg; 231 232 netmsg_init(&smsg, NULL, &curthread->td_msgport, 0, netmsg_sync_func); 233 234 TAILQ_FOREACH(reg, &netreglist, npr_entry) { 235 lwkt_domsg(reg->npr_port, &smsg.nm_lmsg, 0); 236 } 237 } 238 239 /* 240 * The netmsg function simply replies the message. API semantics require 241 * EASYNC to be returned if the netmsg function disposes of the message. 242 */ 243 static void 244 netmsg_sync_func(struct netmsg *msg) 245 { 246 lwkt_replymsg(&msg->nm_lmsg, 0); 247 } 248 249 /* 250 * Generic netmsg service loop. Some protocols may roll their own but all 251 * must do the basic command dispatch function call done here. 252 */ 253 static void 254 netmsg_service_loop(void *arg) 255 { 256 struct netmsg_rollup *ru; 257 struct netmsg *msg; 258 thread_t td = curthread;; 259 int limit; 260 261 while ((msg = lwkt_waitport(&td->td_msgport, 0))) { 262 /* 263 * Run up to 512 pending netmsgs. 264 */ 265 limit = 512; 266 do { 267 KASSERT(msg->nm_dispatch != NULL, 268 ("netmsg_service isr %d badmsg\n", 269 msg->nm_lmsg.u.ms_result)); 270 msg->nm_dispatch(msg); 271 if (--limit == 0) 272 break; 273 } while ((msg = lwkt_getport(&td->td_msgport)) != NULL); 274 275 /* 276 * Run all registered rollup functions for this cpu 277 * (e.g. tcp_willblock()). 278 */ 279 TAILQ_FOREACH(ru, &netrulist, ru_entry) 280 ru->ru_func(); 281 } 282 } 283 284 /* 285 * Forward a packet to a netisr service function. 286 * 287 * If the packet has not been assigned to a protocol thread we call 288 * the port characterization function to assign it. The caller must 289 * clear M_HASH (or not have set it in the first place) if the caller 290 * wishes the packet to be recharacterized. 291 */ 292 int 293 netisr_queue(int num, struct mbuf *m) 294 { 295 struct netisr *ni; 296 struct netmsg_packet *pmsg; 297 lwkt_port_t port; 298 299 KASSERT((num > 0 && num <= (sizeof(netisrs)/sizeof(netisrs[0]))), 300 ("Bad isr %d", num)); 301 302 ni = &netisrs[num]; 303 if (ni->ni_handler == NULL) { 304 kprintf("Unregistered isr %d\n", num); 305 m_freem(m); 306 return (EIO); 307 } 308 309 /* 310 * Figure out which protocol thread to send to. This does not 311 * have to be perfect but performance will be really good if it 312 * is correct. Major protocol inputs such as ip_input() will 313 * re-characterize the packet as necessary. 314 */ 315 if ((m->m_flags & M_HASH) == 0) { 316 ni->ni_cpufn(&m, 0); 317 if (m == NULL) { 318 m_freem(m); 319 return (EIO); 320 } 321 if ((m->m_flags & M_HASH) == 0) { 322 kprintf("netisr_queue(%d): packet hash failed\n", num); 323 m_freem(m); 324 return (EIO); 325 } 326 } 327 328 /* 329 * Get the protocol port based on the packet hash, initialize 330 * the netmsg, and send it off. 331 */ 332 port = cpu_portfn(m->m_pkthdr.hash); 333 pmsg = &m->m_hdr.mh_netmsg; 334 netmsg_init(&pmsg->nm_netmsg, NULL, &netisr_apanic_rport, 335 0, ni->ni_handler); 336 pmsg->nm_packet = m; 337 pmsg->nm_netmsg.nm_lmsg.u.ms_result = num; 338 lwkt_sendmsg(port, &pmsg->nm_netmsg.nm_lmsg); 339 340 return (0); 341 } 342 343 /* 344 * Pre-characterization of a deeper portion of the packet for the 345 * requested isr. 346 * 347 * The base of the ISR type (e.g. IP) that we want to characterize is 348 * at (hoff) relative to the beginning of the mbuf. This allows 349 * e.g. ether_input_chain() to not have to adjust the m_data/m_len. 350 */ 351 void 352 netisr_characterize(int num, struct mbuf **mp, int hoff) 353 { 354 struct netisr *ni; 355 struct mbuf *m; 356 357 /* 358 * Validation 359 */ 360 KASSERT((num > 0 && num <= (sizeof(netisrs)/sizeof(netisrs[0]))), 361 ("Bad isr %d", num)); 362 m = *mp; 363 KKASSERT(m != NULL); 364 365 /* 366 * Valid netisr? 367 */ 368 ni = &netisrs[num]; 369 if (ni->ni_handler == NULL) { 370 kprintf("Unregistered isr %d\n", num); 371 m_freem(m); 372 *mp = NULL; 373 } 374 375 /* 376 * Characterize the packet 377 */ 378 if ((m->m_flags & M_HASH) == 0) { 379 ni->ni_cpufn(mp, hoff); 380 m = *mp; 381 if (m && (m->m_flags & M_HASH) == 0) 382 kprintf("netisr_queue(%d): packet hash failed\n", num); 383 } 384 } 385 386 void 387 netisr_register(int num, netisr_fn_t handler, netisr_cpufn_t cpufn) 388 { 389 struct netisr *ni; 390 391 KASSERT((num > 0 && num <= (sizeof(netisrs)/sizeof(netisrs[0]))), 392 ("netisr_register: bad isr %d", num)); 393 KKASSERT(handler != NULL); 394 395 if (cpufn == NULL) 396 cpufn = cpu0_cpufn; 397 398 ni = &netisrs[num]; 399 400 ni->ni_handler = handler; 401 ni->ni_cpufn = cpufn; 402 netmsg_init(&ni->ni_netmsg, NULL, &netisr_adone_rport, 0, NULL); 403 } 404 405 void 406 netisr_register_rollup(netisr_ru_t ru_func) 407 { 408 struct netmsg_rollup *ru; 409 410 ru = kmalloc(sizeof(*ru), M_TEMP, M_WAITOK|M_ZERO); 411 ru->ru_func = ru_func; 412 TAILQ_INSERT_TAIL(&netrulist, ru, ru_entry); 413 } 414 415 /* 416 * Return the message port for the general protocol message servicing 417 * thread for a particular cpu. 418 */ 419 lwkt_port_t 420 cpu_portfn(int cpu) 421 { 422 KKASSERT(cpu >= 0 && cpu < ncpus); 423 return (&netisr_cpu[cpu].td_msgport); 424 } 425 426 /* 427 * Return the current cpu's network protocol thread. 428 */ 429 lwkt_port_t 430 cur_netport(void) 431 { 432 return(cpu_portfn(mycpu->gd_cpuid)); 433 } 434 435 /* 436 * Return a default protocol mbuf processing thread port 437 */ 438 lwkt_port_t 439 cpu0_soport(struct socket *so __unused, struct sockaddr *nam __unused, 440 struct mbuf **dummy __unused) 441 { 442 return (&netisr_cpu[0].td_msgport); 443 } 444 445 /* 446 * Return a default protocol control message processing thread port 447 */ 448 lwkt_port_t 449 cpu0_ctlport(int cmd __unused, struct sockaddr *sa __unused, 450 void *extra __unused) 451 { 452 return (&netisr_cpu[0].td_msgport); 453 } 454 455 /* 456 * This is a dummy port that causes a message to be executed synchronously 457 * instead of being queued to a port. 458 */ 459 lwkt_port_t 460 sync_soport(struct socket *so __unused, struct sockaddr *nam __unused, 461 struct mbuf **dummy __unused) 462 { 463 return (&netisr_sync_port); 464 } 465 466 /* 467 * This is a default netisr packet characterization function which 468 * sets M_HASH. If a netisr is registered with a NULL cpufn function 469 * this one is assigned. 470 * 471 * This function makes no attempt to validate the packet. 472 */ 473 static void 474 cpu0_cpufn(struct mbuf **mp, int hoff __unused) 475 { 476 struct mbuf *m = *mp; 477 478 m->m_flags |= M_HASH; 479 m->m_pkthdr.hash = 0; 480 } 481 482 /* 483 * schednetisr() is used to call the netisr handler from the appropriate 484 * netisr thread for polling and other purposes. 485 * 486 * This function may be called from a hard interrupt or IPI and must be 487 * MP SAFE and non-blocking. We use a fixed per-cpu message instead of 488 * trying to allocate one. We must get ourselves onto the target cpu 489 * to safely check the MSGF_DONE bit on the message but since the message 490 * will be sent to that cpu anyway this does not add any extra work beyond 491 * what lwkt_sendmsg() would have already had to do to schedule the target 492 * thread. 493 */ 494 static void 495 schednetisr_remote(void *data) 496 { 497 int num = (int)(intptr_t)data; 498 struct netisr *ni = &netisrs[num]; 499 lwkt_port_t port = &netisr_cpu[0].td_msgport; 500 struct netmsg *pmsg; 501 502 pmsg = &netisrs[num].ni_netmsg; 503 if (pmsg->nm_lmsg.ms_flags & MSGF_DONE) { 504 netmsg_init(pmsg, NULL, &netisr_adone_rport, 0, ni->ni_handler); 505 pmsg->nm_lmsg.u.ms_result = num; 506 lwkt_sendmsg(port, &pmsg->nm_lmsg); 507 } 508 } 509 510 void 511 schednetisr(int num) 512 { 513 KASSERT((num > 0 && num <= (sizeof(netisrs)/sizeof(netisrs[0]))), 514 ("schednetisr: bad isr %d", num)); 515 KKASSERT(netisrs[num].ni_handler != NULL); 516 #ifdef SMP 517 if (mycpu->gd_cpuid != 0) { 518 lwkt_send_ipiq(globaldata_find(0), 519 schednetisr_remote, (void *)(intptr_t)num); 520 } else { 521 crit_enter(); 522 schednetisr_remote((void *)(intptr_t)num); 523 crit_exit(); 524 } 525 #else 526 crit_enter(); 527 schednetisr_remote((void *)(intptr_t)num); 528 crit_exit(); 529 #endif 530 } 531