/*	$NetBSD: if_tap.c,v 1.55 2009/04/04 10:12:51 ad Exp $	*/

/*
 *  Copyright (c) 2003, 2004, 2008, 2009 The NetBSD Foundation.
 *  All rights reserved.
 *
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *  1. Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *  2. Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *
 *  THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 *  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 *  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 *  PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 *  BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 *  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 *  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *  POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * tap(4) is a virtual Ethernet interface.  It appears as a real Ethernet
 * device to the system, but can also be accessed by userland through a
 * character device interface, which allows reading and injecting frames.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: if_tap.c,v 1.55 2009/04/04 10:12:51 ad Exp $");

#if defined(_KERNEL_OPT)
#include "bpfilter.h"
#include "opt_modular.h"
#include "opt_compat_netbsd.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/device.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/ksyms.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/select.h>
#include <sys/sockio.h>
#if defined(COMPAT_40) || defined(MODULAR)
#include <sys/sysctl.h>
#endif
#include <sys/kauth.h>
#include <sys/mutex.h>
#include <sys/simplelock.h>
#include <sys/intr.h>

#include <net/if.h>
#include <net/if_dl.h>
#include <net/if_ether.h>
#include <net/if_media.h>
#include <net/if_tap.h>
#if NBPFILTER > 0
#include <net/bpf.h>
#endif

#include <compat/sys/sockio.h>

#if defined(COMPAT_40) || defined(MODULAR)
/*
 * sysctl node management
 *
 * It's not really possible to use a SYSCTL_SETUP block with
 * current module implementation, so it is easier to just define
 * our own function.
 *
 * The handler function is a "helper" in Andrew Brown's sysctl
 * framework terminology.  It is used as a gateway for sysctl
 * requests over the nodes.
 *
 * tap_log allows the module to log creations of nodes and
 * destroy them all at once using sysctl_teardown.
 */
static int tap_node;
static int tap_sysctl_handler(SYSCTLFN_PROTO);
SYSCTL_SETUP_PROTO(sysctl_tap_setup);
#endif

/*
 * Since we're an Ethernet device, we need the 3 following
 * components: a leading struct device, a struct ethercom,
 * and also a struct ifmedia since we don't attach a PHY to
 * ourselves.  We could emulate one, but there's no real
 * point.
 */

struct tap_softc {
	device_t	sc_dev;
	struct ifmedia	sc_im;
	struct ethercom	sc_ec;
	int		sc_flags;
#define	TAP_INUSE	0x00000001	/* tap device can only be opened once */
#define	TAP_ASYNCIO	0x00000002	/* user is using async I/O (SIGIO) on the device */
#define	TAP_NBIO	0x00000004	/* user wants calls to avoid blocking */
#define	TAP_GOING	0x00000008	/* interface is being destroyed */
	struct selinfo	sc_rsel;
	pid_t		sc_pgid;	/* For async. IO */
	kmutex_t	sc_rdlock;
	struct simplelock	sc_kqlock;
	void		*sc_sih;
};

/* autoconf(9) glue */

void	tapattach(int);

static int	tap_match(device_t, cfdata_t, void *);
static void	tap_attach(device_t, device_t, void *);
static int	tap_detach(device_t, int);

CFATTACH_DECL_NEW(tap, sizeof(struct tap_softc),
    tap_match, tap_attach, tap_detach, NULL);
extern struct cfdriver tap_cd;

/* Real device access routines */
static int	tap_dev_close(struct tap_softc *);
static int	tap_dev_read(int, struct uio *, int);
static int	tap_dev_write(int, struct uio *, int);
static int	tap_dev_ioctl(int, u_long, void *, struct lwp *);
static int	tap_dev_poll(int, int, struct lwp *);
static int	tap_dev_kqfilter(int, struct knote *);

/* Fileops access routines */
static int	tap_fops_close(file_t *);
static int	tap_fops_read(file_t *, off_t *, struct uio *,
    kauth_cred_t, int);
static int	tap_fops_write(file_t *, off_t *, struct uio *,
    kauth_cred_t, int);
static int	tap_fops_ioctl(file_t *, u_long, void *);
static int	tap_fops_poll(file_t *, int);
static int	tap_fops_kqfilter(file_t *, struct knote *);

static const struct fileops tap_fileops = {
	.fo_read = tap_fops_read,
	.fo_write = tap_fops_write,
	.fo_ioctl = tap_fops_ioctl,
	.fo_fcntl = fnullop_fcntl,
	.fo_poll = tap_fops_poll,
	.fo_stat = fbadop_stat,
	.fo_close = tap_fops_close,
	.fo_kqfilter = tap_fops_kqfilter,
	.fo_drain = fnullop_drain,
};

/* Helper for cloning open() */
static int	tap_dev_cloner(struct lwp *);

/* Character device routines */
static int	tap_cdev_open(dev_t, int, int, struct lwp *);
static int	tap_cdev_close(dev_t, int, int, struct lwp *);
static int	tap_cdev_read(dev_t, struct uio *, int);
static int	tap_cdev_write(dev_t, struct uio *, int);
static int	tap_cdev_ioctl(dev_t, u_long, void *, int, struct lwp *);
static int	tap_cdev_poll(dev_t, int, struct lwp *);
static int	tap_cdev_kqfilter(dev_t, struct knote *);

const struct cdevsw tap_cdevsw = {
	tap_cdev_open, tap_cdev_close,
	tap_cdev_read, tap_cdev_write,
	tap_cdev_ioctl, nostop, notty,
	tap_cdev_poll, nommap,
	tap_cdev_kqfilter,
	D_OTHER,
};

#define	TAP_CLONER	0xfffff		/* Maximal minor value */

/* kqueue-related routines */
static void	tap_kqdetach(struct knote *);
static int	tap_kqread(struct knote *, long);

/*
 * Those are needed by the if_media interface.
 */

static int	tap_mediachange(struct ifnet *);
static void	tap_mediastatus(struct ifnet *, struct ifmediareq *);

/*
 * Those are needed by the ifnet interface, and would typically be
 * there for any network interface driver.
 * Some other routines are optional: watchdog and drain.
 */

static void	tap_start(struct ifnet *);
static void	tap_stop(struct ifnet *, int);
static int	tap_init(struct ifnet *);
static int	tap_ioctl(struct ifnet *, u_long, void *);

/* Internal functions */
#if defined(COMPAT_40) || defined(MODULAR)
static int	tap_lifaddr(struct ifnet *, u_long, struct ifaliasreq *);
#endif
static void	tap_softintr(void *);

/*
 * tap is a clonable interface, although it is highly unrealistic for
 * an Ethernet device.
 *
 * Here are the bits needed for a clonable interface.
 */
static int	tap_clone_create(struct if_clone *, int);
static int	tap_clone_destroy(struct ifnet *);

struct if_clone tap_cloners = IF_CLONE_INITIALIZER("tap",
					tap_clone_create,
					tap_clone_destroy);

/* Helper functions shared by the two cloning code paths */
static struct tap_softc *	tap_clone_creator(int);
int	tap_clone_destroyer(device_t);

void
tapattach(int n)
{
	int error;

	error = config_cfattach_attach(tap_cd.cd_name, &tap_ca);
	if (error) {
		aprint_error("%s: unable to register cfattach\n",
		    tap_cd.cd_name);
		(void)config_cfdriver_detach(&tap_cd);
		return;
	}

	if_clone_attach(&tap_cloners);
}

/* Pretty much useless for a pseudo-device */
static int
tap_match(device_t parent, cfdata_t cfdata, void *arg)
{

	return (1);
}

void
tap_attach(device_t parent, device_t self, void *aux)
{
	struct tap_softc *sc = device_private(self);
	struct ifnet *ifp;
#if defined(COMPAT_40) || defined(MODULAR)
	const struct sysctlnode *node;
	int error;
#endif
	uint8_t enaddr[ETHER_ADDR_LEN] =
	    { 0xf2, 0x0b, 0xa4, 0xff, 0xff, 0xff };
	char enaddrstr[3 * ETHER_ADDR_LEN];
	struct timeval tv;
	uint32_t ui;

	sc->sc_dev = self;
	sc->sc_sih = softint_establish(SOFTINT_CLOCK, tap_softintr, sc);

	if (!pmf_device_register(self, NULL, NULL))
		aprint_error_dev(self, "couldn't establish power handler\n");

	/*
	 * In order to obtain a unique initial Ethernet address on a host,
	 * do some randomisation using the current uptime.  It's not meant
	 * for anything but avoiding hard-coding an address.
	 */
	getmicrouptime(&tv);
	ui = (tv.tv_sec ^ tv.tv_usec) & 0xffffff;
	memcpy(enaddr+3, (uint8_t *)&ui, 3);

	aprint_verbose_dev(self, "Ethernet address %s\n",
	    ether_snprintf(enaddrstr, sizeof(enaddrstr), enaddr));

	/*
	 * Why 1000baseT? Why not? You can add more.
	 *
	 * Note that there are 3 steps: init, one or several additions to
	 * the list of supported media, and in the end, the selection of one
	 * of them.
	 */
	ifmedia_init(&sc->sc_im, 0, tap_mediachange, tap_mediastatus);
	ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_1000_T, 0, NULL);
	ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_1000_T|IFM_FDX, 0, NULL);
	ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_100_TX, 0, NULL);
	ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_100_TX|IFM_FDX, 0, NULL);
	ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_10_T, 0, NULL);
	ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_10_T|IFM_FDX, 0, NULL);
	ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_AUTO, 0, NULL);
	ifmedia_set(&sc->sc_im, IFM_ETHER|IFM_AUTO);

	/*
	 * One should note that an interface must do multicast in order
	 * to support IPv6.
	 */
	ifp = &sc->sc_ec.ec_if;
	strcpy(ifp->if_xname, device_xname(self));
	ifp->if_softc	= sc;
	ifp->if_flags	= IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_ioctl	= tap_ioctl;
	ifp->if_start	= tap_start;
	ifp->if_stop	= tap_stop;
	ifp->if_init	= tap_init;
	IFQ_SET_READY(&ifp->if_snd);

	sc->sc_ec.ec_capabilities = ETHERCAP_VLAN_MTU | ETHERCAP_JUMBO_MTU;

	/* Those steps are mandatory for an Ethernet driver, the first call
	 * being common to all network interface drivers. */
	if_attach(ifp);
	ether_ifattach(ifp, enaddr);

	sc->sc_flags = 0;

#if defined(COMPAT_40) || defined(MODULAR)
	/*
	 * Add a sysctl node for that interface.
	 *
	 * The pointer transmitted is not a string, but instead a pointer to
	 * the softc structure, which we can use to build the string value on
	 * the fly in the helper function of the node.  See the comments for
	 * tap_sysctl_handler for details.
	 *
	 * Usually sysctl_createv is called with CTL_CREATE as the next-to-last
	 * component.  However, we can allocate a number ourselves, as we are
	 * the only consumer of the net.link.<iface> node.  In this case, the
	 * unit number is conveniently used to number the node.  CTL_CREATE
	 * would just work, too.
	 */
	if ((error = sysctl_createv(NULL, 0, NULL,
	    &node, CTLFLAG_READWRITE,
	    CTLTYPE_STRING, device_xname(self), NULL,
	    tap_sysctl_handler, 0, sc, 18,
	    CTL_NET, AF_LINK, tap_node, device_unit(sc->sc_dev),
	    CTL_EOL)) != 0)
		aprint_error_dev(self, "sysctl_createv returned %d, ignoring\n",
		    error);
#endif

	/*
	 * Initialize the two locks for the device.
	 *
	 * We need a lock here because even though the tap device can be
	 * opened only once, the file descriptor might be passed to another
	 * process, say a fork(2)ed child.
	 *
	 * The Giant saves us from most of the hassle, but since the read
	 * operation can sleep, we don't want two processes to wake up at
	 * the same moment and both try and dequeue a single packet.
	 *
	 * The queue for event listeners (used by kqueue(9), see below) has
	 * to be protected, too, but we don't need the same level of
	 * complexity for that lock, so a simple spinning lock is fine.
	 */
	mutex_init(&sc->sc_rdlock, MUTEX_DEFAULT, IPL_NONE);
	simple_lock_init(&sc->sc_kqlock);

	selinit(&sc->sc_rsel);
}

/*
 * When detaching, we do the inverse of what is done in the attach
 * routine, in reverse order.
 */
static int
tap_detach(device_t self, int flags)
{
	struct tap_softc *sc = device_private(self);
	struct ifnet *ifp = &sc->sc_ec.ec_if;
#if defined(COMPAT_40) || defined(MODULAR)
	int error;
#endif
	int s;

	sc->sc_flags |= TAP_GOING;
	s = splnet();
	tap_stop(ifp, 1);
	if_down(ifp);
	splx(s);

	softint_disestablish(sc->sc_sih);

#if defined(COMPAT_40) || defined(MODULAR)
	/*
	 * Destroying a single leaf is a very straightforward operation using
	 * sysctl_destroyv.  One should be sure to always end the path with
	 * CTL_EOL.
	 */
	if ((error = sysctl_destroyv(NULL, CTL_NET, AF_LINK, tap_node,
	    device_unit(sc->sc_dev), CTL_EOL)) != 0)
		aprint_error_dev(self,
		    "sysctl_destroyv returned %d, ignoring\n", error);
#endif
	ether_ifdetach(ifp);
	if_detach(ifp);
	ifmedia_delete_instance(&sc->sc_im, IFM_INST_ANY);
	seldestroy(&sc->sc_rsel);
	mutex_destroy(&sc->sc_rdlock);

	pmf_device_deregister(self);

	return (0);
}

/*
 * This function is called by the ifmedia layer to notify the driver
 * that the user requested a media change.  A real driver would
 * reconfigure the hardware.
 */
static int
tap_mediachange(struct ifnet *ifp)
{
	return (0);
}

/*
 * Here the user asks for the currently used media.
 */
static void
tap_mediastatus(struct ifnet *ifp, struct ifmediareq *imr)
{
	struct tap_softc *sc = (struct tap_softc *)ifp->if_softc;
	imr->ifm_active = sc->sc_im.ifm_cur->ifm_media;
}

/*
 * This is the function where we SEND packets.
 *
 * There is no 'receive' equivalent.  A typical driver will get
 * interrupts from the hardware, and from there will inject new packets
 * into the network stack.
 *
 * Once handled, a packet must be freed.  A real driver might not be able
 * to fit all the pending packets into the hardware, and is allowed to
 * return before having sent all the packets.  It should then use the
 * if_flags flag IFF_OACTIVE to notify the upper layer.
 *
 * There are also other flags one should check, such as IFF_PAUSE.
 *
 * It is our duty to make packets available to BPF listeners.
 *
 * You should be aware that this function is called by the Ethernet layer
 * at splnet().
 *
 * When the device is opened, we have to pass the packet(s) to the
 * userland.  For that we stay in OACTIVE mode while the userland gets
 * the packets, and we send a signal to the processes waiting to read.
 *
 * wakeup(sc) is the counterpart to the tsleep call in
 * tap_dev_read, while selnotify() is used for kevent(2) and
 * poll(2) (which includes select(2)) listeners.
 */
static void
tap_start(struct ifnet *ifp)
{
	struct tap_softc *sc = (struct tap_softc *)ifp->if_softc;
	struct mbuf *m0;

	if ((sc->sc_flags & TAP_INUSE) == 0) {
		/* Simply drop packets */
		for(;;) {
			IFQ_DEQUEUE(&ifp->if_snd, m0);
			if (m0 == NULL)
				return;

			ifp->if_opackets++;
#if NBPFILTER > 0
			if (ifp->if_bpf)
				bpf_mtap(ifp->if_bpf, m0);
#endif

			m_freem(m0);
		}
	} else if (!IFQ_IS_EMPTY(&ifp->if_snd)) {
		ifp->if_flags |= IFF_OACTIVE;
		wakeup(sc);
		selnotify(&sc->sc_rsel, 0, 1);
		if (sc->sc_flags & TAP_ASYNCIO)
			softint_schedule(sc->sc_sih);
	}
}

static void
tap_softintr(void *cookie)
{
	struct tap_softc *sc;
	struct ifnet *ifp;
	int a, b;

	sc = cookie;

	if (sc->sc_flags & TAP_ASYNCIO) {
		ifp = &sc->sc_ec.ec_if;
		if (ifp->if_flags & IFF_RUNNING) {
			a = POLL_IN;
			b = POLLIN|POLLRDNORM;
		} else {
			a = POLL_HUP;
			b = 0;
		}
		fownsignal(sc->sc_pgid, SIGIO, a, b, NULL);
	}
}

/*
 * A typical driver will only contain the following handlers for
 * ioctl calls, except SIOCSIFPHYADDR.
 * The latter is a hack I used to set the Ethernet address of the
 * faked device.
 *
 * Note that both ifmedia_ioctl() and ether_ioctl() have to be
 * called under splnet().
 */
static int
tap_ioctl(struct ifnet *ifp, u_long cmd, void *data)
{
	struct tap_softc *sc = (struct tap_softc *)ifp->if_softc;
	struct ifreq *ifr = (struct ifreq *)data;
	int s, error;

	s = splnet();

	switch (cmd) {
#ifdef OSIOCSIFMEDIA
	case OSIOCSIFMEDIA:
#endif
	case SIOCSIFMEDIA:
	case SIOCGIFMEDIA:
		error = ifmedia_ioctl(ifp, ifr, &sc->sc_im, cmd);
		break;
#if defined(COMPAT_40) || defined(MODULAR)
	case SIOCSIFPHYADDR:
		error = tap_lifaddr(ifp, cmd, (struct ifaliasreq *)data);
		break;
#endif
	default:
		error = ether_ioctl(ifp, cmd, data);
		if (error == ENETRESET)
			error = 0;
		break;
	}

	splx(s);

	return (error);
}

#if defined(COMPAT_40) || defined(MODULAR)
/*
 * Helper function to set the Ethernet address.  This has been replaced by
 * the generic SIOCALIFADDR ioctl on a PF_LINK socket.
 */
static int
tap_lifaddr(struct ifnet *ifp, u_long cmd, struct ifaliasreq *ifra)
{
	const struct sockaddr *sa = &ifra->ifra_addr;

	if (sa->sa_family != AF_LINK)
		return (EINVAL);

	if_set_sadl(ifp, sa->sa_data, ETHER_ADDR_LEN, false);

	return (0);
}
#endif

/*
 * _init() would typically be called when an interface goes up,
 * meaning it should configure itself into the state in which it
 * can send packets.
 */
static int
tap_init(struct ifnet *ifp)
{
	ifp->if_flags |= IFF_RUNNING;

	tap_start(ifp);

	return (0);
}

/*
 * _stop() is called when an interface goes down.  It is our
 * responsibility to validate that state by clearing the
 * IFF_RUNNING flag.
 *
 * We have to wake up all the sleeping processes to have the pending
 * read requests cancelled.
 */
static void
tap_stop(struct ifnet *ifp, int disable)
{
	struct tap_softc *sc = (struct tap_softc *)ifp->if_softc;

	ifp->if_flags &= ~IFF_RUNNING;
	wakeup(sc);
	selnotify(&sc->sc_rsel, 0, 1);
	if (sc->sc_flags & TAP_ASYNCIO)
		softint_schedule(sc->sc_sih);
}

/*
 * The 'create' command of ifconfig can be used to create
 * any numbered instance of a given device.  Thus we have to
 * make sure we have enough room in cd_devs to create the
 * user-specified instance.  config_attach_pseudo will do this
 * for us.
 */
static int
tap_clone_create(struct if_clone *ifc, int unit)
{
	if (tap_clone_creator(unit) == NULL) {
		aprint_error("%s%d: unable to attach an instance\n",
		    tap_cd.cd_name, unit);
		return (ENXIO);
	}

	return (0);
}

/*
 * tap(4) can be cloned in two ways:
 *   - using 'ifconfig tap0 create', which will use the network
 *     interface cloning API, and call tap_clone_create above;
 *   - opening the cloning device node, whose minor number is TAP_CLONER.
 *     See below for an explanation of how this part works.
 */
static struct tap_softc *
tap_clone_creator(int unit)
{
	struct cfdata *cf;

	cf = malloc(sizeof(*cf), M_DEVBUF, M_WAITOK);
	cf->cf_name = tap_cd.cd_name;
	cf->cf_atname = tap_ca.ca_name;
	if (unit == -1) {
		/* let autoconf find the first free one */
		cf->cf_unit = 0;
		cf->cf_fstate = FSTATE_STAR;
	} else {
		cf->cf_unit = unit;
		cf->cf_fstate = FSTATE_FOUND;
	}

	return device_private(config_attach_pseudo(cf));
}

/*
 * The clean design of if_clone and autoconf(9) makes that part
 * really straightforward.  The second argument of config_detach
 * means neither QUIET nor FORCED.
 */
static int
tap_clone_destroy(struct ifnet *ifp)
{
	struct tap_softc *sc = ifp->if_softc;

	return tap_clone_destroyer(sc->sc_dev);
}

int
tap_clone_destroyer(device_t dev)
{
	cfdata_t cf = device_cfdata(dev);
	int error;

	if ((error = config_detach(dev, 0)) != 0)
		aprint_error_dev(dev, "unable to detach instance\n");
	free(cf, M_DEVBUF);

	return (error);
}

/*
 * tap(4) is a bit of a hybrid device.  It can be used in two different
 * ways:
 * 1. ifconfig tapN create, then use /dev/tapN to read/write off it.
 * 2. open /dev/tap, get a new interface created and read/write off it.
 *    That interface is destroyed when the process that had it created exits.
 *
 * The first way is managed by the cdevsw structure, and you access interfaces
 * through a (major, minor) mapping:  tap4 is obtained by the minor number
 * 4.  The entry points for the cdevsw interface are prefixed by tap_cdev_.
 *
 * The second way is the so-called "cloning" device.  It's a special minor
 * number (chosen as the maximal number, to allow as many tap devices as
 * possible).  The user first opens the cloner (e.g., /dev/tap), and that
 * call ends in tap_cdev_open.  The actual place where it is handled is
 * tap_dev_cloner.
 *
 * A tap device cannot be opened more than once at a time, so the cdevsw
 * part of open() does nothing but noting that the interface is being used and
 * hence ready to actually handle packets.
 */
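
/*
 * As an illustration of the second (cloning) access path, here is a
 * minimal userland sketch.  It is not part of the driver; it assumes the
 * cloner node is /dev/tap, that <net/if_tap.h> provides TAPGIFNAME, and
 * it omits most error handling.  Reads only succeed once the interface
 * has been marked up (e.g. with ifconfig), since tap_dev_read returns
 * EHOSTDOWN otherwise.
 *
 *	#include <sys/types.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/socket.h>
 *	#include <net/if.h>
 *	#include <net/if_tap.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		struct ifreq ifr;
 *		char frame[2048];
 *		ssize_t n;
 *		int fd;
 *
 *		if ((fd = open("/dev/tap", O_RDWR)) == -1)
 *			return 1;
 *		if (ioctl(fd, TAPGIFNAME, &ifr) == -1)
 *			return 1;
 *		printf("cloned interface is %s\n", ifr.ifr_name);
 *		n = read(fd, frame, sizeof(frame));
 *		printf("read %zd bytes (one Ethernet frame)\n", n);
 *		close(fd);
 *		return 0;
 *	}
 *
 * The open() goes through the cloner path and creates a fresh tapN; the
 * TAPGIFNAME ioctl (handled in tap_dev_ioctl below) tells the program
 * which one it got.  Closing the descriptor destroys that interface,
 * whereas a tapN created with "ifconfig tapN create" is accessed through
 * /dev/tapN and has to be destroyed explicitly.
 */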

static int
tap_cdev_open(dev_t dev, int flags, int fmt, struct lwp *l)
{
	struct tap_softc *sc;

	if (minor(dev) == TAP_CLONER)
		return tap_dev_cloner(l);

	sc = device_lookup_private(&tap_cd, minor(dev));
	if (sc == NULL)
		return (ENXIO);

	/* The device can only be opened once */
	if (sc->sc_flags & TAP_INUSE)
		return (EBUSY);
	sc->sc_flags |= TAP_INUSE;
	return (0);
}

/*
 * There are several kinds of cloning devices, and the most simple is the one
 * tap(4) uses.  What it does is change the file descriptor with a new one,
 * with its own fileops structure (which maps to the various read, write,
 * ioctl functions).  It starts allocating a new file descriptor with falloc,
 * then actually creates the new tap devices.
 *
 * Once those two steps are successful, we can re-wire the existing file
 * descriptor to its new self.  This is done with fdclone():  it fills the fp
 * structure as needed (notably f_data gets filled with the fifth parameter
 * passed, the unit of the tap device, which will allow us to identify the
 * device later), and returns EMOVEFD.
 *
 * That magic value is interpreted by sys_open() which then replaces the
 * current file descriptor by the new one (through a magic member of struct
 * lwp, l_dupfd).
 *
 * The tap device is flagged as being busy since it otherwise could be
 * externally accessed through the corresponding device node with the cdevsw
 * interface.
 */

static int
tap_dev_cloner(struct lwp *l)
{
	struct tap_softc *sc;
	file_t *fp;
	int error, fd;

	if ((error = fd_allocfile(&fp, &fd)) != 0)
		return (error);

	if ((sc = tap_clone_creator(-1)) == NULL) {
		fd_abort(curproc, fp, fd);
		return (ENXIO);
	}

	sc->sc_flags |= TAP_INUSE;

	return fd_clone(fp, fd, FREAD|FWRITE, &tap_fileops,
	    (void *)(intptr_t)device_unit(sc->sc_dev));
}

/*
 * While all other operations (read, write, ioctl, poll and kqfilter) are
 * really the same whether we are in cdevsw or fileops mode, the close()
 * function is slightly different in the two cases.
 *
 * As for the others, the core of it is shared in tap_dev_close.  What
 * it does is sufficient for the cdevsw interface, but the cloning interface
 * needs another thing:  the interface is destroyed when the process that
 * created it closes it.
 */
static int
tap_cdev_close(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	struct tap_softc *sc =
	    device_lookup_private(&tap_cd, minor(dev));

	if (sc == NULL)
		return (ENXIO);

	return tap_dev_close(sc);
}

/*
 * It might happen that the administrator used ifconfig to externally destroy
 * the interface.  In that case, tap_fops_close will be called while
 * tap_detach is already happening.  If we called it again from here, we
 * would deadlock.  TAP_GOING ensures that this situation doesn't happen.
 */
static int
tap_fops_close(file_t *fp)
{
	int unit = (intptr_t)fp->f_data;
	struct tap_softc *sc;
	int error;

	sc = device_lookup_private(&tap_cd, unit);
	if (sc == NULL)
		return (ENXIO);

	/* tap_dev_close currently always succeeds, but it might not
	 * always be the case. */
	KERNEL_LOCK(1, NULL);
	if ((error = tap_dev_close(sc)) != 0) {
		KERNEL_UNLOCK_ONE(NULL);
		return (error);
	}

	/* Destroy the device now that it is no longer useful,
	 * unless it's already being destroyed. */
	if ((sc->sc_flags & TAP_GOING) != 0) {
		KERNEL_UNLOCK_ONE(NULL);
		return (0);
	}

	error = tap_clone_destroyer(sc->sc_dev);
	KERNEL_UNLOCK_ONE(NULL);
	return error;
}

static int
tap_dev_close(struct tap_softc *sc)
{
	struct ifnet *ifp;
	int s;

	s = splnet();
	/* Let tap_start handle packets again */
	ifp = &sc->sc_ec.ec_if;
	ifp->if_flags &= ~IFF_OACTIVE;

	/* Purge output queue */
	if (!(IFQ_IS_EMPTY(&ifp->if_snd))) {
		struct mbuf *m;

		for (;;) {
			IFQ_DEQUEUE(&ifp->if_snd, m);
			if (m == NULL)
				break;

			ifp->if_opackets++;
#if NBPFILTER > 0
			if (ifp->if_bpf)
				bpf_mtap(ifp->if_bpf, m);
#endif
		}
	}
	splx(s);

	sc->sc_flags &= ~(TAP_INUSE | TAP_ASYNCIO);

	return (0);
}

static int
tap_cdev_read(dev_t dev, struct uio *uio, int flags)
{
	return tap_dev_read(minor(dev), uio, flags);
}

static int
tap_fops_read(file_t *fp, off_t *offp, struct uio *uio,
    kauth_cred_t cred, int flags)
{
	int error;

	KERNEL_LOCK(1, NULL);
	error = tap_dev_read((intptr_t)fp->f_data, uio, flags);
	KERNEL_UNLOCK_ONE(NULL);
	return error;
}

static int
tap_dev_read(int unit, struct uio *uio, int flags)
{
	struct tap_softc *sc =
	    device_lookup_private(&tap_cd, unit);
	struct ifnet *ifp;
	struct mbuf *m, *n;
	int error = 0, s;

	if (sc == NULL)
		return (ENXIO);

	ifp = &sc->sc_ec.ec_if;
	if ((ifp->if_flags & IFF_UP) == 0)
		return (EHOSTDOWN);

	/*
	 * In the TAP_NBIO case, we have to make sure we won't be sleeping
	 */
	if ((sc->sc_flags & TAP_NBIO) != 0) {
		if (!mutex_tryenter(&sc->sc_rdlock))
			return (EWOULDBLOCK);
	} else {
		mutex_enter(&sc->sc_rdlock);
	}

	s = splnet();
	if (IFQ_IS_EMPTY(&ifp->if_snd)) {
		ifp->if_flags &= ~IFF_OACTIVE;
		/*
		 * We must release the lock before sleeping, and re-acquire it
		 * after.
		 */
		mutex_exit(&sc->sc_rdlock);
		if (sc->sc_flags & TAP_NBIO)
			error = EWOULDBLOCK;
		else
			error = tsleep(sc, PSOCK|PCATCH, "tap", 0);
		splx(s);

		if (error != 0)
			return (error);
		/* The device might have been downed */
		if ((ifp->if_flags & IFF_UP) == 0)
			return (EHOSTDOWN);
		if ((sc->sc_flags & TAP_NBIO)) {
			if (!mutex_tryenter(&sc->sc_rdlock))
				return (EWOULDBLOCK);
		} else {
			mutex_enter(&sc->sc_rdlock);
		}
		s = splnet();
	}

	IFQ_DEQUEUE(&ifp->if_snd, m);
	ifp->if_flags &= ~IFF_OACTIVE;
	splx(s);
	if (m == NULL) {
		error = 0;
		goto out;
	}

	ifp->if_opackets++;
#if NBPFILTER > 0
	if (ifp->if_bpf)
		bpf_mtap(ifp->if_bpf, m);
#endif

	/*
	 * One read is one packet.
	 */
	do {
		error = uiomove(mtod(m, void *),
		    min(m->m_len, uio->uio_resid), uio);
		MFREE(m, n);
		m = n;
	} while (m != NULL && uio->uio_resid > 0 && error == 0);

	if (m != NULL)
		m_freem(m);

out:
	mutex_exit(&sc->sc_rdlock);
	return (error);
}

static int
tap_cdev_write(dev_t dev, struct uio *uio, int flags)
{
	return tap_dev_write(minor(dev), uio, flags);
}

static int
tap_fops_write(file_t *fp, off_t *offp, struct uio *uio,
    kauth_cred_t cred, int flags)
{
	int error;

	KERNEL_LOCK(1, NULL);
	error = tap_dev_write((intptr_t)fp->f_data, uio, flags);
	KERNEL_UNLOCK_ONE(NULL);
	return error;
}

static int
tap_dev_write(int unit, struct uio *uio, int flags)
{
	struct tap_softc *sc =
	    device_lookup_private(&tap_cd, unit);
	struct ifnet *ifp;
	struct mbuf *m, **mp;
	int error = 0;
	int s;

	if (sc == NULL)
		return (ENXIO);

	ifp = &sc->sc_ec.ec_if;

	/* One write, one packet, that's the rule */
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m == NULL) {
		ifp->if_ierrors++;
		return (ENOBUFS);
	}
	m->m_pkthdr.len = uio->uio_resid;

	mp = &m;
	while (error == 0 && uio->uio_resid > 0) {
		if (*mp != m) {
			MGET(*mp, M_DONTWAIT, MT_DATA);
			if (*mp == NULL) {
				error = ENOBUFS;
				break;
			}
		}
		(*mp)->m_len = min(MHLEN, uio->uio_resid);
		error = uiomove(mtod(*mp, void *), (*mp)->m_len, uio);
		mp = &(*mp)->m_next;
	}
	if (error) {
		ifp->if_ierrors++;
		m_freem(m);
		return (error);
	}

	ifp->if_ipackets++;
	m->m_pkthdr.rcvif = ifp;

#if NBPFILTER > 0
	if (ifp->if_bpf)
		bpf_mtap(ifp->if_bpf, m);
#endif
	s = splnet();
	(*ifp->if_input)(ifp, m);
	splx(s);

	return (0);
}

static int
tap_cdev_ioctl(dev_t dev, u_long cmd, void *data, int flags,
    struct lwp *l)
{
	return tap_dev_ioctl(minor(dev), cmd, data, l);
}

static int
tap_fops_ioctl(file_t *fp, u_long cmd, void *data)
{
	return tap_dev_ioctl((intptr_t)fp->f_data, cmd, data, curlwp);
}

static int
tap_dev_ioctl(int unit, u_long cmd, void *data, struct lwp *l)
{
	struct tap_softc *sc =
	    device_lookup_private(&tap_cd, unit);
	int error = 0;

	if (sc == NULL)
		return (ENXIO);

	switch (cmd) {
	case FIONREAD:
		{
			struct ifnet *ifp = &sc->sc_ec.ec_if;
			struct mbuf *m;
			int s;

			s = splnet();
			IFQ_POLL(&ifp->if_snd, m);

			if (m == NULL)
				*(int *)data = 0;
			else
				*(int *)data = m->m_pkthdr.len;
			splx(s);
		} break;
	case TIOCSPGRP:
	case FIOSETOWN:
		error = fsetown(&sc->sc_pgid, cmd, data);
		break;
	case TIOCGPGRP:
	case FIOGETOWN:
		error = fgetown(sc->sc_pgid, cmd, data);
		break;
	case FIOASYNC:
		if (*(int *)data)
			sc->sc_flags |= TAP_ASYNCIO;
		else
			sc->sc_flags &= ~TAP_ASYNCIO;
		break;
	case FIONBIO:
		if (*(int *)data)
			sc->sc_flags |= TAP_NBIO;
		else
			sc->sc_flags &= ~TAP_NBIO;
		break;
#ifdef OTAPGIFNAME
	case OTAPGIFNAME:
#endif
	case TAPGIFNAME:
		{
			struct ifreq *ifr = (struct ifreq *)data;
			struct ifnet *ifp = &sc->sc_ec.ec_if;

			strlcpy(ifr->ifr_name, ifp->if_xname, IFNAMSIZ);
		} break;
	default:
		error = ENOTTY;
		break;
	}

	/* Propagate the result instead of discarding it */
	return (error);
}

static int
tap_cdev_poll(dev_t dev, int events, struct lwp *l)
{
	return tap_dev_poll(minor(dev), events, l);
}

static int
tap_fops_poll(file_t *fp, int events)
{
	return tap_dev_poll((intptr_t)fp->f_data, events, curlwp);
}

static int
tap_dev_poll(int unit, int events, struct lwp *l)
{
	struct tap_softc *sc =
	    device_lookup_private(&tap_cd, unit);
	int revents = 0;

	if (sc == NULL)
		return POLLERR;

	if (events & (POLLIN|POLLRDNORM)) {
		struct ifnet *ifp = &sc->sc_ec.ec_if;
		struct mbuf *m;
		int s;

		s = splnet();
		IFQ_POLL(&ifp->if_snd, m);
		splx(s);

		if (m != NULL)
			revents |= events & (POLLIN|POLLRDNORM);
		else {
			simple_lock(&sc->sc_kqlock);
			selrecord(l, &sc->sc_rsel);
			simple_unlock(&sc->sc_kqlock);
		}
	}
	revents |= events & (POLLOUT|POLLWRNORM);

	return (revents);
}

static struct filterops tap_read_filterops = { 1, NULL, tap_kqdetach,
	tap_kqread };
static struct filterops tap_seltrue_filterops = { 1, NULL, tap_kqdetach,
	filt_seltrue };

static int
tap_cdev_kqfilter(dev_t dev, struct knote *kn)
{
	return tap_dev_kqfilter(minor(dev), kn);
}

static int
tap_fops_kqfilter(file_t *fp, struct knote *kn)
{
	return tap_dev_kqfilter((intptr_t)fp->f_data, kn);
}

static int
tap_dev_kqfilter(int unit, struct knote *kn)
{
	struct tap_softc *sc =
	    device_lookup_private(&tap_cd, unit);

	if (sc == NULL)
		return (ENXIO);

	KERNEL_LOCK(1, NULL);
	switch(kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &tap_read_filterops;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &tap_seltrue_filterops;
		break;
	default:
		KERNEL_UNLOCK_ONE(NULL);
		return (EINVAL);
	}

	kn->kn_hook = sc;
	simple_lock(&sc->sc_kqlock);
	SLIST_INSERT_HEAD(&sc->sc_rsel.sel_klist, kn, kn_selnext);
	simple_unlock(&sc->sc_kqlock);
	KERNEL_UNLOCK_ONE(NULL);
	return (0);
}

static void
tap_kqdetach(struct knote *kn)
{
	struct tap_softc *sc = (struct tap_softc *)kn->kn_hook;

	KERNEL_LOCK(1, NULL);
	simple_lock(&sc->sc_kqlock);
	SLIST_REMOVE(&sc->sc_rsel.sel_klist, kn, knote, kn_selnext);
	simple_unlock(&sc->sc_kqlock);
	KERNEL_UNLOCK_ONE(NULL);
}

static int
tap_kqread(struct knote *kn, long hint)
{
	struct tap_softc *sc = (struct tap_softc *)kn->kn_hook;
	struct ifnet *ifp = &sc->sc_ec.ec_if;
	struct mbuf *m;
	int s, rv;

	KERNEL_LOCK(1, NULL);
	s = splnet();
	IFQ_POLL(&ifp->if_snd, m);

	if (m == NULL)
		kn->kn_data = 0;
	else
		kn->kn_data = m->m_pkthdr.len;
	splx(s);
	rv = (kn->kn_data != 0 ? 1 : 0);
	KERNEL_UNLOCK_ONE(NULL);
	return rv;
}

#if defined(COMPAT_40) || defined(MODULAR)
/*
 * sysctl management routines
 * You can set the address of an interface through:
 * net.link.tap.tap<number>
 *
 * Note the consistent use of tap_log in order to use
 * sysctl_teardown at unload time.
 *
 * In the kernel you will find a lot of SYSCTL_SETUP blocks.  Those
 * blocks register a function in a special section of the kernel
 * (called a link set) which is used at init_sysctl() time to cycle
 * through all those functions to create the kernel's sysctl tree.
 *
 * It is not possible to use link sets in a module, so the
 * easiest is to simply call our own setup routine at load time.
 *
 * In the SYSCTL_SETUP blocks you find in the kernel, nodes have the
 * CTLFLAG_PERMANENT flag, meaning they cannot be removed.  Once the
 * whole kernel sysctl tree is built, it is not possible to add any
 * permanent node.
 *
 * It should be noted that we're not saving the sysctlnode pointer
 * we are returned when creating the "tap" node.  That structure
 * cannot be trusted once out of the calling function, as it might
 * get reused.  So we just save the MIB number, and always give the
 * full path starting from the root for later calls to sysctl_createv
 * and sysctl_destroyv.
 */
SYSCTL_SETUP(sysctl_tap_setup, "sysctl net.link.tap subtree setup")
{
	const struct sysctlnode *node;
	int error = 0;

	if ((error = sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "net", NULL,
	    NULL, 0, NULL, 0,
	    CTL_NET, CTL_EOL)) != 0)
		return;

	if ((error = sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "link", NULL,
	    NULL, 0, NULL, 0,
	    CTL_NET, AF_LINK, CTL_EOL)) != 0)
		return;

	/*
	 * The first four parameters of sysctl_createv are for management.
	 *
	 * The four that follow, here starting with a '0' for the flags,
	 * describe the node.
	 *
	 * The next series of four set its value, through various possible
	 * means.
	 *
	 * Last but not least, the path to the node is described.  That path
	 * is relative to the given root (third argument).  Here we're
	 * starting from the root.
	 */
	if ((error = sysctl_createv(clog, 0, NULL, &node,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "tap", NULL,
	    NULL, 0, NULL, 0,
	    CTL_NET, AF_LINK, CTL_CREATE, CTL_EOL)) != 0)
		return;
	tap_node = node->sysctl_num;
}

/*
 * The helper functions make Andrew Brown's interface really
 * shine.  It makes it possible to create values on the fly, whether
 * the sysctl value is being read or written.
 *
 * As shown in the example in the man page, the first step is to
 * create a copy of the node to have sysctl_lookup work on it.
 *
 * Here, we have more work to do than just a copy, since we have
 * to create the string.  The first step is to collect the actual
 * value of the node, which is a convenient pointer to the softc
 * of the interface.  From there we create the string and use it
 * as the value, but only for the *copy* of the node.
 *
 * Then we let sysctl_lookup do the magic, which consists in
 * setting oldp and newp as required by the operation.  When the
 * value is read, that means that the string will be copied to
 * the user, and when it is written, the new value will be copied
 * over in the addr array.
 *
 * If newp is NULL, the user was reading the value, so we don't
 * have anything else to do.  If a new value was written, we
 * have to check it.
 *
 * If it is incorrect, we can return an error and leave 'node' as
 * it is:  since it is a copy of the actual node, the change will
 * be forgotten.
 *
 * Upon a correct input, we commit the change to the ifnet
 * structure of our interface.
 */
static int
tap_sysctl_handler(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	struct tap_softc *sc;
	struct ifnet *ifp;
	int error;
	size_t len;
	char addr[3 * ETHER_ADDR_LEN];
	uint8_t enaddr[ETHER_ADDR_LEN];

	node = *rnode;
	sc = node.sysctl_data;
	ifp = &sc->sc_ec.ec_if;
	(void)ether_snprintf(addr, sizeof(addr), CLLADDR(ifp->if_sadl));
	node.sysctl_data = addr;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return (error);

	len = strlen(addr);
	if (len < 11 || len > 17)
		return (EINVAL);

	/* Commit change */
	if (ether_nonstatic_aton(enaddr, addr) != 0)
		return (EINVAL);
	if_set_sadl(ifp, enaddr, ETHER_ADDR_LEN, false);
	return (error);
}
#endif
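
/*
 * For reference, the node created above can be driven entirely from
 * userland.  A minimal sketch, assuming a tap0 instance exists and the
 * caller has the required privileges; the address value is only an
 * example:
 *
 *	sysctl net.link.tap.tap0
 *	sysctl -w net.link.tap.tap0=f2:0b:a4:00:00:01
 *
 * or, from C, through sysctlbyname(3):
 *
 *	#include <sys/sysctl.h>
 *	#include <err.h>
 *	#include <string.h>
 *
 *	const char addr[] = "f2:0b:a4:00:00:01";
 *
 *	if (sysctlbyname("net.link.tap.tap0", NULL, NULL,
 *	    addr, strlen(addr) + 1) == -1)
 *		err(1, "sysctlbyname");
 *
 * The handler above rejects strings that do not parse as a unicast
 * Ethernet address, so invalid input leaves the interface untouched.
 */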