1 /* $NetBSD: if_tap.c,v 1.35 2007/12/05 17:20:00 pooka Exp $ */ 2 3 /* 4 * Copyright (c) 2003, 2004 The NetBSD Foundation. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of The NetBSD Foundation nor the names of its 16 * contributors may be used to endorse or promote products derived 17 * from this software without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * tap(4) is a virtual Ethernet interface. It appears as a real Ethernet 34 * device to the system, but can also be accessed by userland through a 35 * character device interface, which allows reading and injecting frames. 36 */ 37 38 #include <sys/cdefs.h> 39 __KERNEL_RCSID(0, "$NetBSD: if_tap.c,v 1.35 2007/12/05 17:20:00 pooka Exp $"); 40 41 #if defined(_KERNEL_OPT) 42 #include "bpfilter.h" 43 #endif 44 45 #include <sys/param.h> 46 #include <sys/systm.h> 47 #include <sys/kernel.h> 48 #include <sys/malloc.h> 49 #include <sys/conf.h> 50 #include <sys/device.h> 51 #include <sys/file.h> 52 #include <sys/filedesc.h> 53 #include <sys/ksyms.h> 54 #include <sys/poll.h> 55 #include <sys/select.h> 56 #include <sys/sockio.h> 57 #include <sys/sysctl.h> 58 #include <sys/kauth.h> 59 #include <sys/mutex.h> 60 61 #include <net/if.h> 62 #include <net/if_dl.h> 63 #include <net/if_ether.h> 64 #include <net/if_media.h> 65 #include <net/if_tap.h> 66 #if NBPFILTER > 0 67 #include <net/bpf.h> 68 #endif 69 70 #include <compat/sys/sockio.h> 71 72 /* 73 * sysctl node management 74 * 75 * It's not really possible to use a SYSCTL_SETUP block with 76 * current LKM implementation, so it is easier to just define 77 * our own function. 78 * 79 * The handler function is a "helper" in Andrew Brown's sysctl 80 * framework terminology. It is used as a gateway for sysctl 81 * requests over the nodes. 82 * 83 * tap_log allows the module to log creations of nodes and 84 * destroy them all at once using sysctl_teardown. 85 */ 86 static int tap_node; 87 static int tap_sysctl_handler(SYSCTLFN_PROTO); 88 SYSCTL_SETUP_PROTO(sysctl_tap_setup); 89 90 /* 91 * Since we're an Ethernet device, we need the 3 following 92 * components: a leading struct device, a struct ethercom, 93 * and also a struct ifmedia since we don't attach a PHY to 94 * ourselves. We could emulate one, but there's no real 95 * point. 96 */ 97 98 struct tap_softc { 99 struct device sc_dev; 100 struct ifmedia sc_im; 101 struct ethercom sc_ec; 102 int sc_flags; 103 #define TAP_INUSE 0x00000001 /* tap device can only be opened once */ 104 #define TAP_ASYNCIO 0x00000002 /* user is using async I/O (SIGIO) on the device */ 105 #define TAP_NBIO 0x00000004 /* user wants calls to avoid blocking */ 106 #define TAP_GOING 0x00000008 /* interface is being destroyed */ 107 struct selinfo sc_rsel; 108 pid_t sc_pgid; /* For async. IO */ 109 kmutex_t sc_rdlock; 110 struct simplelock sc_kqlock; 111 }; 112 113 /* autoconf(9) glue */ 114 115 void tapattach(int); 116 117 static int tap_match(struct device *, struct cfdata *, void *); 118 static void tap_attach(struct device *, struct device *, void *); 119 static int tap_detach(struct device*, int); 120 121 CFATTACH_DECL(tap, sizeof(struct tap_softc), 122 tap_match, tap_attach, tap_detach, NULL); 123 extern struct cfdriver tap_cd; 124 125 /* Real device access routines */ 126 static int tap_dev_close(struct tap_softc *); 127 static int tap_dev_read(int, struct uio *, int); 128 static int tap_dev_write(int, struct uio *, int); 129 static int tap_dev_ioctl(int, u_long, void *, struct lwp *); 130 static int tap_dev_poll(int, int, struct lwp *); 131 static int tap_dev_kqfilter(int, struct knote *); 132 133 /* Fileops access routines */ 134 static int tap_fops_close(struct file *, struct lwp *); 135 static int tap_fops_read(struct file *, off_t *, struct uio *, 136 kauth_cred_t, int); 137 static int tap_fops_write(struct file *, off_t *, struct uio *, 138 kauth_cred_t, int); 139 static int tap_fops_ioctl(struct file *, u_long, void *, 140 struct lwp *); 141 static int tap_fops_poll(struct file *, int, struct lwp *); 142 static int tap_fops_kqfilter(struct file *, struct knote *); 143 144 static const struct fileops tap_fileops = { 145 tap_fops_read, 146 tap_fops_write, 147 tap_fops_ioctl, 148 fnullop_fcntl, 149 tap_fops_poll, 150 fbadop_stat, 151 tap_fops_close, 152 tap_fops_kqfilter, 153 }; 154 155 /* Helper for cloning open() */ 156 static int tap_dev_cloner(struct lwp *); 157 158 /* Character device routines */ 159 static int tap_cdev_open(dev_t, int, int, struct lwp *); 160 static int tap_cdev_close(dev_t, int, int, struct lwp *); 161 static int tap_cdev_read(dev_t, struct uio *, int); 162 static int tap_cdev_write(dev_t, struct uio *, int); 163 static int tap_cdev_ioctl(dev_t, u_long, void *, int, struct lwp *); 164 static int tap_cdev_poll(dev_t, int, struct lwp *); 165 static int tap_cdev_kqfilter(dev_t, struct knote *); 166 167 const struct cdevsw tap_cdevsw = { 168 tap_cdev_open, tap_cdev_close, 169 tap_cdev_read, tap_cdev_write, 170 tap_cdev_ioctl, nostop, notty, 171 tap_cdev_poll, nommap, 172 tap_cdev_kqfilter, 173 D_OTHER, 174 }; 175 176 #define TAP_CLONER 0xfffff /* Maximal minor value */ 177 178 /* kqueue-related routines */ 179 static void tap_kqdetach(struct knote *); 180 static int tap_kqread(struct knote *, long); 181 182 /* 183 * Those are needed by the if_media interface. 184 */ 185 186 static int tap_mediachange(struct ifnet *); 187 static void tap_mediastatus(struct ifnet *, struct ifmediareq *); 188 189 /* 190 * Those are needed by the ifnet interface, and would typically be 191 * there for any network interface driver. 192 * Some other routines are optional: watchdog and drain. 193 */ 194 195 static void tap_start(struct ifnet *); 196 static void tap_stop(struct ifnet *, int); 197 static int tap_init(struct ifnet *); 198 static int tap_ioctl(struct ifnet *, u_long, void *); 199 200 /* This is an internal function to keep tap_ioctl readable */ 201 static int tap_lifaddr(struct ifnet *, u_long, struct ifaliasreq *); 202 203 /* 204 * tap is a clonable interface, although it is highly unrealistic for 205 * an Ethernet device. 206 * 207 * Here are the bits needed for a clonable interface. 208 */ 209 static int tap_clone_create(struct if_clone *, int); 210 static int tap_clone_destroy(struct ifnet *); 211 212 struct if_clone tap_cloners = IF_CLONE_INITIALIZER("tap", 213 tap_clone_create, 214 tap_clone_destroy); 215 216 /* Helper functionis shared by the two cloning code paths */ 217 static struct tap_softc * tap_clone_creator(int); 218 int tap_clone_destroyer(struct device *); 219 220 void 221 tapattach(int n) 222 { 223 int error; 224 225 error = config_cfattach_attach(tap_cd.cd_name, &tap_ca); 226 if (error) { 227 aprint_error("%s: unable to register cfattach\n", 228 tap_cd.cd_name); 229 (void)config_cfdriver_detach(&tap_cd); 230 return; 231 } 232 233 if_clone_attach(&tap_cloners); 234 } 235 236 /* Pretty much useless for a pseudo-device */ 237 static int 238 tap_match(struct device *self, struct cfdata *cfdata, 239 void *arg) 240 { 241 return (1); 242 } 243 244 void 245 tap_attach(struct device *parent, struct device *self, 246 void *aux) 247 { 248 struct tap_softc *sc = (struct tap_softc *)self; 249 struct ifnet *ifp; 250 const struct sysctlnode *node; 251 u_int8_t enaddr[ETHER_ADDR_LEN] = 252 { 0xf2, 0x0b, 0xa4, 0xff, 0xff, 0xff }; 253 char enaddrstr[3 * ETHER_ADDR_LEN]; 254 struct timeval tv; 255 uint32_t ui; 256 int error; 257 258 /* 259 * In order to obtain unique initial Ethernet address on a host, 260 * do some randomisation using the current uptime. It's not meant 261 * for anything but avoiding hard-coding an address. 262 */ 263 getmicrouptime(&tv); 264 ui = (tv.tv_sec ^ tv.tv_usec) & 0xffffff; 265 memcpy(enaddr+3, (u_int8_t *)&ui, 3); 266 267 aprint_verbose("%s: Ethernet address %s\n", device_xname(&sc->sc_dev), 268 ether_snprintf(enaddrstr, sizeof(enaddrstr), enaddr)); 269 270 /* 271 * Why 1000baseT? Why not? You can add more. 272 * 273 * Note that there are 3 steps: init, one or several additions to 274 * list of supported media, and in the end, the selection of one 275 * of them. 276 */ 277 ifmedia_init(&sc->sc_im, 0, tap_mediachange, tap_mediastatus); 278 ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_1000_T, 0, NULL); 279 ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_1000_T|IFM_FDX, 0, NULL); 280 ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_100_TX, 0, NULL); 281 ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_100_TX|IFM_FDX, 0, NULL); 282 ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_10_T, 0, NULL); 283 ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_10_T|IFM_FDX, 0, NULL); 284 ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_AUTO, 0, NULL); 285 ifmedia_set(&sc->sc_im, IFM_ETHER|IFM_AUTO); 286 287 /* 288 * One should note that an interface must do multicast in order 289 * to support IPv6. 290 */ 291 ifp = &sc->sc_ec.ec_if; 292 strcpy(ifp->if_xname, sc->sc_dev.dv_xname); 293 ifp->if_softc = sc; 294 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; 295 ifp->if_ioctl = tap_ioctl; 296 ifp->if_start = tap_start; 297 ifp->if_stop = tap_stop; 298 ifp->if_init = tap_init; 299 IFQ_SET_READY(&ifp->if_snd); 300 301 sc->sc_ec.ec_capabilities = ETHERCAP_VLAN_MTU | ETHERCAP_JUMBO_MTU; 302 303 /* Those steps are mandatory for an Ethernet driver, the fisrt call 304 * being common to all network interface drivers. */ 305 if_attach(ifp); 306 ether_ifattach(ifp, enaddr); 307 308 sc->sc_flags = 0; 309 310 /* 311 * Add a sysctl node for that interface. 312 * 313 * The pointer transmitted is not a string, but instead a pointer to 314 * the softc structure, which we can use to build the string value on 315 * the fly in the helper function of the node. See the comments for 316 * tap_sysctl_handler for details. 317 * 318 * Usually sysctl_createv is called with CTL_CREATE as the before-last 319 * component. However, we can allocate a number ourselves, as we are 320 * the only consumer of the net.link.<iface> node. In this case, the 321 * unit number is conveniently used to number the node. CTL_CREATE 322 * would just work, too. 323 */ 324 if ((error = sysctl_createv(NULL, 0, NULL, 325 &node, CTLFLAG_READWRITE, 326 CTLTYPE_STRING, sc->sc_dev.dv_xname, NULL, 327 tap_sysctl_handler, 0, sc, 18, 328 CTL_NET, AF_LINK, tap_node, device_unit(&sc->sc_dev), 329 CTL_EOL)) != 0) 330 aprint_error("%s: sysctl_createv returned %d, ignoring\n", 331 sc->sc_dev.dv_xname, error); 332 333 /* 334 * Initialize the two locks for the device. 335 * 336 * We need a lock here because even though the tap device can be 337 * opened only once, the file descriptor might be passed to another 338 * process, say a fork(2)ed child. 339 * 340 * The Giant saves us from most of the hassle, but since the read 341 * operation can sleep, we don't want two processes to wake up at 342 * the same moment and both try and dequeue a single packet. 343 * 344 * The queue for event listeners (used by kqueue(9), see below) has 345 * to be protected, too, but we don't need the same level of 346 * complexity for that lock, so a simple spinning lock is fine. 347 */ 348 mutex_init(&sc->sc_rdlock, MUTEX_DEFAULT, IPL_NONE); 349 simple_lock_init(&sc->sc_kqlock); 350 } 351 352 /* 353 * When detaching, we do the inverse of what is done in the attach 354 * routine, in reversed order. 355 */ 356 static int 357 tap_detach(struct device* self, int flags) 358 { 359 struct tap_softc *sc = (struct tap_softc *)self; 360 struct ifnet *ifp = &sc->sc_ec.ec_if; 361 int error, s; 362 363 sc->sc_flags |= TAP_GOING; 364 s = splnet(); 365 tap_stop(ifp, 1); 366 if_down(ifp); 367 splx(s); 368 369 /* 370 * Destroying a single leaf is a very straightforward operation using 371 * sysctl_destroyv. One should be sure to always end the path with 372 * CTL_EOL. 373 */ 374 if ((error = sysctl_destroyv(NULL, CTL_NET, AF_LINK, tap_node, 375 device_unit(&sc->sc_dev), CTL_EOL)) != 0) 376 aprint_error("%s: sysctl_destroyv returned %d, ignoring\n", 377 sc->sc_dev.dv_xname, error); 378 ether_ifdetach(ifp); 379 if_detach(ifp); 380 ifmedia_delete_instance(&sc->sc_im, IFM_INST_ANY); 381 mutex_destroy(&sc->sc_rdlock); 382 383 return (0); 384 } 385 386 /* 387 * This function is called by the ifmedia layer to notify the driver 388 * that the user requested a media change. A real driver would 389 * reconfigure the hardware. 390 */ 391 static int 392 tap_mediachange(struct ifnet *ifp) 393 { 394 return (0); 395 } 396 397 /* 398 * Here the user asks for the currently used media. 399 */ 400 static void 401 tap_mediastatus(struct ifnet *ifp, struct ifmediareq *imr) 402 { 403 struct tap_softc *sc = (struct tap_softc *)ifp->if_softc; 404 imr->ifm_active = sc->sc_im.ifm_cur->ifm_media; 405 } 406 407 /* 408 * This is the function where we SEND packets. 409 * 410 * There is no 'receive' equivalent. A typical driver will get 411 * interrupts from the hardware, and from there will inject new packets 412 * into the network stack. 413 * 414 * Once handled, a packet must be freed. A real driver might not be able 415 * to fit all the pending packets into the hardware, and is allowed to 416 * return before having sent all the packets. It should then use the 417 * if_flags flag IFF_OACTIVE to notify the upper layer. 418 * 419 * There are also other flags one should check, such as IFF_PAUSE. 420 * 421 * It is our duty to make packets available to BPF listeners. 422 * 423 * You should be aware that this function is called by the Ethernet layer 424 * at splnet(). 425 * 426 * When the device is opened, we have to pass the packet(s) to the 427 * userland. For that we stay in OACTIVE mode while the userland gets 428 * the packets, and we send a signal to the processes waiting to read. 429 * 430 * wakeup(sc) is the counterpart to the tsleep call in 431 * tap_dev_read, while selnotify() is used for kevent(2) and 432 * poll(2) (which includes select(2)) listeners. 433 */ 434 static void 435 tap_start(struct ifnet *ifp) 436 { 437 struct tap_softc *sc = (struct tap_softc *)ifp->if_softc; 438 struct mbuf *m0; 439 440 if ((sc->sc_flags & TAP_INUSE) == 0) { 441 /* Simply drop packets */ 442 for(;;) { 443 IFQ_DEQUEUE(&ifp->if_snd, m0); 444 if (m0 == NULL) 445 return; 446 447 ifp->if_opackets++; 448 #if NBPFILTER > 0 449 if (ifp->if_bpf) 450 bpf_mtap(ifp->if_bpf, m0); 451 #endif 452 453 m_freem(m0); 454 } 455 } else if (!IFQ_IS_EMPTY(&ifp->if_snd)) { 456 ifp->if_flags |= IFF_OACTIVE; 457 wakeup(sc); 458 selnotify(&sc->sc_rsel, 1); 459 if (sc->sc_flags & TAP_ASYNCIO) 460 fownsignal(sc->sc_pgid, SIGIO, POLL_IN, 461 POLLIN|POLLRDNORM, NULL); 462 } 463 } 464 465 /* 466 * A typical driver will only contain the following handlers for 467 * ioctl calls, except SIOCSIFPHYADDR. 468 * The latter is a hack I used to set the Ethernet address of the 469 * faked device. 470 * 471 * Note that both ifmedia_ioctl() and ether_ioctl() have to be 472 * called under splnet(). 473 */ 474 static int 475 tap_ioctl(struct ifnet *ifp, u_long cmd, void *data) 476 { 477 struct tap_softc *sc = (struct tap_softc *)ifp->if_softc; 478 struct ifreq *ifr = (struct ifreq *)data; 479 int s, error; 480 481 s = splnet(); 482 483 switch (cmd) { 484 #ifdef OSIOCSIFMEDIA 485 case OSIOCSIFMEDIA: 486 #endif 487 case SIOCSIFMEDIA: 488 case SIOCGIFMEDIA: 489 error = ifmedia_ioctl(ifp, ifr, &sc->sc_im, cmd); 490 break; 491 case SIOCSIFPHYADDR: 492 error = tap_lifaddr(ifp, cmd, (struct ifaliasreq *)data); 493 break; 494 default: 495 error = ether_ioctl(ifp, cmd, data); 496 if (error == ENETRESET) 497 error = 0; 498 break; 499 } 500 501 splx(s); 502 503 return (error); 504 } 505 506 /* 507 * Helper function to set Ethernet address. This shouldn't be done there, 508 * and should actually be available to all Ethernet drivers, real or not. 509 */ 510 static int 511 tap_lifaddr(struct ifnet *ifp, u_long cmd, struct ifaliasreq *ifra) 512 { 513 struct sockaddr *sa = (struct sockaddr *)&ifra->ifra_addr; 514 515 if (sa->sa_family != AF_LINK) 516 return (EINVAL); 517 518 (void)sockaddr_dl_setaddr(ifp->if_sadl, ifp->if_sadl->sdl_len, 519 sa->sa_data, ETHER_ADDR_LEN); 520 521 return (0); 522 } 523 524 /* 525 * _init() would typically be called when an interface goes up, 526 * meaning it should configure itself into the state in which it 527 * can send packets. 528 */ 529 static int 530 tap_init(struct ifnet *ifp) 531 { 532 ifp->if_flags |= IFF_RUNNING; 533 534 tap_start(ifp); 535 536 return (0); 537 } 538 539 /* 540 * _stop() is called when an interface goes down. It is our 541 * responsability to validate that state by clearing the 542 * IFF_RUNNING flag. 543 * 544 * We have to wake up all the sleeping processes to have the pending 545 * read requests cancelled. 546 */ 547 static void 548 tap_stop(struct ifnet *ifp, int disable) 549 { 550 struct tap_softc *sc = (struct tap_softc *)ifp->if_softc; 551 552 ifp->if_flags &= ~IFF_RUNNING; 553 wakeup(sc); 554 selnotify(&sc->sc_rsel, 1); 555 if (sc->sc_flags & TAP_ASYNCIO) 556 fownsignal(sc->sc_pgid, SIGIO, POLL_HUP, 0, NULL); 557 } 558 559 /* 560 * The 'create' command of ifconfig can be used to create 561 * any numbered instance of a given device. Thus we have to 562 * make sure we have enough room in cd_devs to create the 563 * user-specified instance. config_attach_pseudo will do this 564 * for us. 565 */ 566 static int 567 tap_clone_create(struct if_clone *ifc, int unit) 568 { 569 if (tap_clone_creator(unit) == NULL) { 570 aprint_error("%s%d: unable to attach an instance\n", 571 tap_cd.cd_name, unit); 572 return (ENXIO); 573 } 574 575 return (0); 576 } 577 578 /* 579 * tap(4) can be cloned by two ways: 580 * using 'ifconfig tap0 create', which will use the network 581 * interface cloning API, and call tap_clone_create above. 582 * opening the cloning device node, whose minor number is TAP_CLONER. 583 * See below for an explanation on how this part work. 584 */ 585 static struct tap_softc * 586 tap_clone_creator(int unit) 587 { 588 struct cfdata *cf; 589 590 cf = malloc(sizeof(*cf), M_DEVBUF, M_WAITOK); 591 cf->cf_name = tap_cd.cd_name; 592 cf->cf_atname = tap_ca.ca_name; 593 if (unit == -1) { 594 /* let autoconf find the first free one */ 595 cf->cf_unit = 0; 596 cf->cf_fstate = FSTATE_STAR; 597 } else { 598 cf->cf_unit = unit; 599 cf->cf_fstate = FSTATE_NOTFOUND; 600 } 601 602 return (struct tap_softc *)config_attach_pseudo(cf); 603 } 604 605 /* 606 * The clean design of if_clone and autoconf(9) makes that part 607 * really straightforward. The second argument of config_detach 608 * means neither QUIET nor FORCED. 609 */ 610 static int 611 tap_clone_destroy(struct ifnet *ifp) 612 { 613 return tap_clone_destroyer((struct device *)ifp->if_softc); 614 } 615 616 int 617 tap_clone_destroyer(struct device *dev) 618 { 619 struct cfdata *cf = device_cfdata(dev); 620 int error; 621 622 if ((error = config_detach(dev, 0)) != 0) 623 aprint_error("%s: unable to detach instance\n", 624 dev->dv_xname); 625 free(cf, M_DEVBUF); 626 627 return (error); 628 } 629 630 /* 631 * tap(4) is a bit of an hybrid device. It can be used in two different 632 * ways: 633 * 1. ifconfig tapN create, then use /dev/tapN to read/write off it. 634 * 2. open /dev/tap, get a new interface created and read/write off it. 635 * That interface is destroyed when the process that had it created exits. 636 * 637 * The first way is managed by the cdevsw structure, and you access interfaces 638 * through a (major, minor) mapping: tap4 is obtained by the minor number 639 * 4. The entry points for the cdevsw interface are prefixed by tap_cdev_. 640 * 641 * The second way is the so-called "cloning" device. It's a special minor 642 * number (chosen as the maximal number, to allow as much tap devices as 643 * possible). The user first opens the cloner (e.g., /dev/tap), and that 644 * call ends in tap_cdev_open. The actual place where it is handled is 645 * tap_dev_cloner. 646 * 647 * An tap device cannot be opened more than once at a time, so the cdevsw 648 * part of open() does nothing but noting that the interface is being used and 649 * hence ready to actually handle packets. 650 */ 651 652 static int 653 tap_cdev_open(dev_t dev, int flags, int fmt, struct lwp *l) 654 { 655 struct tap_softc *sc; 656 657 if (minor(dev) == TAP_CLONER) 658 return tap_dev_cloner(l); 659 660 sc = (struct tap_softc *)device_lookup(&tap_cd, minor(dev)); 661 if (sc == NULL) 662 return (ENXIO); 663 664 /* The device can only be opened once */ 665 if (sc->sc_flags & TAP_INUSE) 666 return (EBUSY); 667 sc->sc_flags |= TAP_INUSE; 668 return (0); 669 } 670 671 /* 672 * There are several kinds of cloning devices, and the most simple is the one 673 * tap(4) uses. What it does is change the file descriptor with a new one, 674 * with its own fileops structure (which maps to the various read, write, 675 * ioctl functions). It starts allocating a new file descriptor with falloc, 676 * then actually creates the new tap devices. 677 * 678 * Once those two steps are successful, we can re-wire the existing file 679 * descriptor to its new self. This is done with fdclone(): it fills the fp 680 * structure as needed (notably f_data gets filled with the fifth parameter 681 * passed, the unit of the tap device which will allows us identifying the 682 * device later), and returns EMOVEFD. 683 * 684 * That magic value is interpreted by sys_open() which then replaces the 685 * current file descriptor by the new one (through a magic member of struct 686 * lwp, l_dupfd). 687 * 688 * The tap device is flagged as being busy since it otherwise could be 689 * externally accessed through the corresponding device node with the cdevsw 690 * interface. 691 */ 692 693 static int 694 tap_dev_cloner(struct lwp *l) 695 { 696 struct tap_softc *sc; 697 struct file *fp; 698 int error, fd; 699 700 if ((error = falloc(l, &fp, &fd)) != 0) 701 return (error); 702 703 if ((sc = tap_clone_creator(-1)) == NULL) { 704 FILE_UNUSE(fp, l); 705 ffree(fp); 706 return (ENXIO); 707 } 708 709 sc->sc_flags |= TAP_INUSE; 710 711 return fdclone(l, fp, fd, FREAD|FWRITE, &tap_fileops, 712 (void *)(intptr_t)device_unit(&sc->sc_dev)); 713 } 714 715 /* 716 * While all other operations (read, write, ioctl, poll and kqfilter) are 717 * really the same whether we are in cdevsw or fileops mode, the close() 718 * function is slightly different in the two cases. 719 * 720 * As for the other, the core of it is shared in tap_dev_close. What 721 * it does is sufficient for the cdevsw interface, but the cloning interface 722 * needs another thing: the interface is destroyed when the processes that 723 * created it closes it. 724 */ 725 static int 726 tap_cdev_close(dev_t dev, int flags, int fmt, 727 struct lwp *l) 728 { 729 struct tap_softc *sc = 730 (struct tap_softc *)device_lookup(&tap_cd, minor(dev)); 731 732 if (sc == NULL) 733 return (ENXIO); 734 735 return tap_dev_close(sc); 736 } 737 738 /* 739 * It might happen that the administrator used ifconfig to externally destroy 740 * the interface. In that case, tap_fops_close will be called while 741 * tap_detach is already happening. If we called it again from here, we 742 * would dead lock. TAP_GOING ensures that this situation doesn't happen. 743 */ 744 static int 745 tap_fops_close(struct file *fp, struct lwp *l) 746 { 747 int unit = (intptr_t)fp->f_data; 748 struct tap_softc *sc; 749 int error; 750 751 sc = (struct tap_softc *)device_lookup(&tap_cd, unit); 752 if (sc == NULL) 753 return (ENXIO); 754 755 /* tap_dev_close currently always succeeds, but it might not 756 * always be the case. */ 757 if ((error = tap_dev_close(sc)) != 0) 758 return (error); 759 760 /* Destroy the device now that it is no longer useful, 761 * unless it's already being destroyed. */ 762 if ((sc->sc_flags & TAP_GOING) != 0) 763 return (0); 764 765 return tap_clone_destroyer((struct device *)sc); 766 } 767 768 static int 769 tap_dev_close(struct tap_softc *sc) 770 { 771 struct ifnet *ifp; 772 int s; 773 774 s = splnet(); 775 /* Let tap_start handle packets again */ 776 ifp = &sc->sc_ec.ec_if; 777 ifp->if_flags &= ~IFF_OACTIVE; 778 779 /* Purge output queue */ 780 if (!(IFQ_IS_EMPTY(&ifp->if_snd))) { 781 struct mbuf *m; 782 783 for (;;) { 784 IFQ_DEQUEUE(&ifp->if_snd, m); 785 if (m == NULL) 786 break; 787 788 ifp->if_opackets++; 789 #if NBPFILTER > 0 790 if (ifp->if_bpf) 791 bpf_mtap(ifp->if_bpf, m); 792 #endif 793 } 794 } 795 splx(s); 796 797 sc->sc_flags &= ~(TAP_INUSE | TAP_ASYNCIO); 798 799 return (0); 800 } 801 802 static int 803 tap_cdev_read(dev_t dev, struct uio *uio, int flags) 804 { 805 return tap_dev_read(minor(dev), uio, flags); 806 } 807 808 static int 809 tap_fops_read(struct file *fp, off_t *offp, struct uio *uio, 810 kauth_cred_t cred, int flags) 811 { 812 return tap_dev_read((intptr_t)fp->f_data, uio, flags); 813 } 814 815 static int 816 tap_dev_read(int unit, struct uio *uio, int flags) 817 { 818 struct tap_softc *sc = 819 (struct tap_softc *)device_lookup(&tap_cd, unit); 820 struct ifnet *ifp; 821 struct mbuf *m, *n; 822 int error = 0, s; 823 824 if (sc == NULL) 825 return (ENXIO); 826 827 ifp = &sc->sc_ec.ec_if; 828 if ((ifp->if_flags & IFF_UP) == 0) 829 return (EHOSTDOWN); 830 831 /* 832 * In the TAP_NBIO case, we have to make sure we won't be sleeping 833 */ 834 if ((sc->sc_flags & TAP_NBIO) != 0) { 835 if (!mutex_tryenter(&sc->sc_rdlock)) 836 return (EWOULDBLOCK); 837 } else { 838 mutex_enter(&sc->sc_rdlock); 839 } 840 841 s = splnet(); 842 if (IFQ_IS_EMPTY(&ifp->if_snd)) { 843 ifp->if_flags &= ~IFF_OACTIVE; 844 splx(s); 845 /* 846 * We must release the lock before sleeping, and re-acquire it 847 * after. 848 */ 849 mutex_exit(&sc->sc_rdlock); 850 if (sc->sc_flags & TAP_NBIO) 851 error = EWOULDBLOCK; 852 else 853 error = tsleep(sc, PSOCK|PCATCH, "tap", 0); 854 if (error != 0) 855 return (error); 856 /* The device might have been downed */ 857 if ((ifp->if_flags & IFF_UP) == 0) 858 return (EHOSTDOWN); 859 if ((sc->sc_flags & TAP_NBIO)) { 860 if (!mutex_tryenter(&sc->sc_rdlock)) 861 return (EWOULDBLOCK); 862 } else { 863 mutex_enter(&sc->sc_rdlock); 864 } 865 s = splnet(); 866 } 867 868 IFQ_DEQUEUE(&ifp->if_snd, m); 869 ifp->if_flags &= ~IFF_OACTIVE; 870 splx(s); 871 if (m == NULL) { 872 error = 0; 873 goto out; 874 } 875 876 ifp->if_opackets++; 877 #if NBPFILTER > 0 878 if (ifp->if_bpf) 879 bpf_mtap(ifp->if_bpf, m); 880 #endif 881 882 /* 883 * One read is one packet. 884 */ 885 do { 886 error = uiomove(mtod(m, void *), 887 min(m->m_len, uio->uio_resid), uio); 888 MFREE(m, n); 889 m = n; 890 } while (m != NULL && uio->uio_resid > 0 && error == 0); 891 892 if (m != NULL) 893 m_freem(m); 894 895 out: 896 mutex_exit(&sc->sc_rdlock); 897 return (error); 898 } 899 900 static int 901 tap_cdev_write(dev_t dev, struct uio *uio, int flags) 902 { 903 return tap_dev_write(minor(dev), uio, flags); 904 } 905 906 static int 907 tap_fops_write(struct file *fp, off_t *offp, struct uio *uio, 908 kauth_cred_t cred, int flags) 909 { 910 return tap_dev_write((intptr_t)fp->f_data, uio, flags); 911 } 912 913 static int 914 tap_dev_write(int unit, struct uio *uio, int flags) 915 { 916 struct tap_softc *sc = 917 (struct tap_softc *)device_lookup(&tap_cd, unit); 918 struct ifnet *ifp; 919 struct mbuf *m, **mp; 920 int error = 0; 921 int s; 922 923 if (sc == NULL) 924 return (ENXIO); 925 926 ifp = &sc->sc_ec.ec_if; 927 928 /* One write, one packet, that's the rule */ 929 MGETHDR(m, M_DONTWAIT, MT_DATA); 930 if (m == NULL) { 931 ifp->if_ierrors++; 932 return (ENOBUFS); 933 } 934 m->m_pkthdr.len = uio->uio_resid; 935 936 mp = &m; 937 while (error == 0 && uio->uio_resid > 0) { 938 if (*mp != m) { 939 MGET(*mp, M_DONTWAIT, MT_DATA); 940 if (*mp == NULL) { 941 error = ENOBUFS; 942 break; 943 } 944 } 945 (*mp)->m_len = min(MHLEN, uio->uio_resid); 946 error = uiomove(mtod(*mp, void *), (*mp)->m_len, uio); 947 mp = &(*mp)->m_next; 948 } 949 if (error) { 950 ifp->if_ierrors++; 951 m_freem(m); 952 return (error); 953 } 954 955 ifp->if_ipackets++; 956 m->m_pkthdr.rcvif = ifp; 957 958 #if NBPFILTER > 0 959 if (ifp->if_bpf) 960 bpf_mtap(ifp->if_bpf, m); 961 #endif 962 s =splnet(); 963 (*ifp->if_input)(ifp, m); 964 splx(s); 965 966 return (0); 967 } 968 969 static int 970 tap_cdev_ioctl(dev_t dev, u_long cmd, void *data, int flags, 971 struct lwp *l) 972 { 973 return tap_dev_ioctl(minor(dev), cmd, data, l); 974 } 975 976 static int 977 tap_fops_ioctl(struct file *fp, u_long cmd, void *data, struct lwp *l) 978 { 979 return tap_dev_ioctl((intptr_t)fp->f_data, cmd, (void *)data, l); 980 } 981 982 static int 983 tap_dev_ioctl(int unit, u_long cmd, void *data, struct lwp *l) 984 { 985 struct tap_softc *sc = 986 (struct tap_softc *)device_lookup(&tap_cd, unit); 987 int error = 0; 988 989 if (sc == NULL) 990 return (ENXIO); 991 992 switch (cmd) { 993 case FIONREAD: 994 { 995 struct ifnet *ifp = &sc->sc_ec.ec_if; 996 struct mbuf *m; 997 int s; 998 999 s = splnet(); 1000 IFQ_POLL(&ifp->if_snd, m); 1001 1002 if (m == NULL) 1003 *(int *)data = 0; 1004 else 1005 *(int *)data = m->m_pkthdr.len; 1006 splx(s); 1007 } break; 1008 case TIOCSPGRP: 1009 case FIOSETOWN: 1010 error = fsetown(l->l_proc, &sc->sc_pgid, cmd, data); 1011 break; 1012 case TIOCGPGRP: 1013 case FIOGETOWN: 1014 error = fgetown(l->l_proc, sc->sc_pgid, cmd, data); 1015 break; 1016 case FIOASYNC: 1017 if (*(int *)data) 1018 sc->sc_flags |= TAP_ASYNCIO; 1019 else 1020 sc->sc_flags &= ~TAP_ASYNCIO; 1021 break; 1022 case FIONBIO: 1023 if (*(int *)data) 1024 sc->sc_flags |= TAP_NBIO; 1025 else 1026 sc->sc_flags &= ~TAP_NBIO; 1027 break; 1028 #ifdef OTAPGIFNAME 1029 case OTAPGIFNAME: 1030 #endif 1031 case TAPGIFNAME: 1032 { 1033 struct ifreq *ifr = (struct ifreq *)data; 1034 struct ifnet *ifp = &sc->sc_ec.ec_if; 1035 1036 strlcpy(ifr->ifr_name, ifp->if_xname, IFNAMSIZ); 1037 } break; 1038 default: 1039 error = ENOTTY; 1040 break; 1041 } 1042 1043 return (0); 1044 } 1045 1046 static int 1047 tap_cdev_poll(dev_t dev, int events, struct lwp *l) 1048 { 1049 return tap_dev_poll(minor(dev), events, l); 1050 } 1051 1052 static int 1053 tap_fops_poll(struct file *fp, int events, struct lwp *l) 1054 { 1055 return tap_dev_poll((intptr_t)fp->f_data, events, l); 1056 } 1057 1058 static int 1059 tap_dev_poll(int unit, int events, struct lwp *l) 1060 { 1061 struct tap_softc *sc = 1062 (struct tap_softc *)device_lookup(&tap_cd, unit); 1063 int revents = 0; 1064 1065 if (sc == NULL) 1066 return POLLERR; 1067 1068 if (events & (POLLIN|POLLRDNORM)) { 1069 struct ifnet *ifp = &sc->sc_ec.ec_if; 1070 struct mbuf *m; 1071 int s; 1072 1073 s = splnet(); 1074 IFQ_POLL(&ifp->if_snd, m); 1075 splx(s); 1076 1077 if (m != NULL) 1078 revents |= events & (POLLIN|POLLRDNORM); 1079 else { 1080 simple_lock(&sc->sc_kqlock); 1081 selrecord(l, &sc->sc_rsel); 1082 simple_unlock(&sc->sc_kqlock); 1083 } 1084 } 1085 revents |= events & (POLLOUT|POLLWRNORM); 1086 1087 return (revents); 1088 } 1089 1090 static struct filterops tap_read_filterops = { 1, NULL, tap_kqdetach, 1091 tap_kqread }; 1092 static struct filterops tap_seltrue_filterops = { 1, NULL, tap_kqdetach, 1093 filt_seltrue }; 1094 1095 static int 1096 tap_cdev_kqfilter(dev_t dev, struct knote *kn) 1097 { 1098 return tap_dev_kqfilter(minor(dev), kn); 1099 } 1100 1101 static int 1102 tap_fops_kqfilter(struct file *fp, struct knote *kn) 1103 { 1104 return tap_dev_kqfilter((intptr_t)fp->f_data, kn); 1105 } 1106 1107 static int 1108 tap_dev_kqfilter(int unit, struct knote *kn) 1109 { 1110 struct tap_softc *sc = 1111 (struct tap_softc *)device_lookup(&tap_cd, unit); 1112 1113 if (sc == NULL) 1114 return (ENXIO); 1115 1116 switch(kn->kn_filter) { 1117 case EVFILT_READ: 1118 kn->kn_fop = &tap_read_filterops; 1119 break; 1120 case EVFILT_WRITE: 1121 kn->kn_fop = &tap_seltrue_filterops; 1122 break; 1123 default: 1124 return (EINVAL); 1125 } 1126 1127 kn->kn_hook = sc; 1128 simple_lock(&sc->sc_kqlock); 1129 SLIST_INSERT_HEAD(&sc->sc_rsel.sel_klist, kn, kn_selnext); 1130 simple_unlock(&sc->sc_kqlock); 1131 return (0); 1132 } 1133 1134 static void 1135 tap_kqdetach(struct knote *kn) 1136 { 1137 struct tap_softc *sc = (struct tap_softc *)kn->kn_hook; 1138 1139 simple_lock(&sc->sc_kqlock); 1140 SLIST_REMOVE(&sc->sc_rsel.sel_klist, kn, knote, kn_selnext); 1141 simple_unlock(&sc->sc_kqlock); 1142 } 1143 1144 static int 1145 tap_kqread(struct knote *kn, long hint) 1146 { 1147 struct tap_softc *sc = (struct tap_softc *)kn->kn_hook; 1148 struct ifnet *ifp = &sc->sc_ec.ec_if; 1149 struct mbuf *m; 1150 int s; 1151 1152 s = splnet(); 1153 IFQ_POLL(&ifp->if_snd, m); 1154 1155 if (m == NULL) 1156 kn->kn_data = 0; 1157 else 1158 kn->kn_data = m->m_pkthdr.len; 1159 splx(s); 1160 return (kn->kn_data != 0 ? 1 : 0); 1161 } 1162 1163 /* 1164 * sysctl management routines 1165 * You can set the address of an interface through: 1166 * net.link.tap.tap<number> 1167 * 1168 * Note the consistent use of tap_log in order to use 1169 * sysctl_teardown at unload time. 1170 * 1171 * In the kernel you will find a lot of SYSCTL_SETUP blocks. Those 1172 * blocks register a function in a special section of the kernel 1173 * (called a link set) which is used at init_sysctl() time to cycle 1174 * through all those functions to create the kernel's sysctl tree. 1175 * 1176 * It is not (currently) possible to use link sets in a LKM, so the 1177 * easiest is to simply call our own setup routine at load time. 1178 * 1179 * In the SYSCTL_SETUP blocks you find in the kernel, nodes have the 1180 * CTLFLAG_PERMANENT flag, meaning they cannot be removed. Once the 1181 * whole kernel sysctl tree is built, it is not possible to add any 1182 * permanent node. 1183 * 1184 * It should be noted that we're not saving the sysctlnode pointer 1185 * we are returned when creating the "tap" node. That structure 1186 * cannot be trusted once out of the calling function, as it might 1187 * get reused. So we just save the MIB number, and always give the 1188 * full path starting from the root for later calls to sysctl_createv 1189 * and sysctl_destroyv. 1190 */ 1191 SYSCTL_SETUP(sysctl_tap_setup, "sysctl net.link.tap subtree setup") 1192 { 1193 const struct sysctlnode *node; 1194 int error = 0; 1195 1196 if ((error = sysctl_createv(clog, 0, NULL, NULL, 1197 CTLFLAG_PERMANENT, 1198 CTLTYPE_NODE, "net", NULL, 1199 NULL, 0, NULL, 0, 1200 CTL_NET, CTL_EOL)) != 0) 1201 return; 1202 1203 if ((error = sysctl_createv(clog, 0, NULL, NULL, 1204 CTLFLAG_PERMANENT, 1205 CTLTYPE_NODE, "link", NULL, 1206 NULL, 0, NULL, 0, 1207 CTL_NET, AF_LINK, CTL_EOL)) != 0) 1208 return; 1209 1210 /* 1211 * The first four parameters of sysctl_createv are for management. 1212 * 1213 * The four that follows, here starting with a '0' for the flags, 1214 * describe the node. 1215 * 1216 * The next series of four set its value, through various possible 1217 * means. 1218 * 1219 * Last but not least, the path to the node is described. That path 1220 * is relative to the given root (third argument). Here we're 1221 * starting from the root. 1222 */ 1223 if ((error = sysctl_createv(clog, 0, NULL, &node, 1224 CTLFLAG_PERMANENT, 1225 CTLTYPE_NODE, "tap", NULL, 1226 NULL, 0, NULL, 0, 1227 CTL_NET, AF_LINK, CTL_CREATE, CTL_EOL)) != 0) 1228 return; 1229 tap_node = node->sysctl_num; 1230 } 1231 1232 /* 1233 * The helper functions make Andrew Brown's interface really 1234 * shine. It makes possible to create value on the fly whether 1235 * the sysctl value is read or written. 1236 * 1237 * As shown as an example in the man page, the first step is to 1238 * create a copy of the node to have sysctl_lookup work on it. 1239 * 1240 * Here, we have more work to do than just a copy, since we have 1241 * to create the string. The first step is to collect the actual 1242 * value of the node, which is a convenient pointer to the softc 1243 * of the interface. From there we create the string and use it 1244 * as the value, but only for the *copy* of the node. 1245 * 1246 * Then we let sysctl_lookup do the magic, which consists in 1247 * setting oldp and newp as required by the operation. When the 1248 * value is read, that means that the string will be copied to 1249 * the user, and when it is written, the new value will be copied 1250 * over in the addr array. 1251 * 1252 * If newp is NULL, the user was reading the value, so we don't 1253 * have anything else to do. If a new value was written, we 1254 * have to check it. 1255 * 1256 * If it is incorrect, we can return an error and leave 'node' as 1257 * it is: since it is a copy of the actual node, the change will 1258 * be forgotten. 1259 * 1260 * Upon a correct input, we commit the change to the ifnet 1261 * structure of our interface. 1262 */ 1263 static int 1264 tap_sysctl_handler(SYSCTLFN_ARGS) 1265 { 1266 struct sysctlnode node; 1267 struct tap_softc *sc; 1268 struct ifnet *ifp; 1269 int error; 1270 size_t len; 1271 char addr[3 * ETHER_ADDR_LEN]; 1272 uint8_t enaddr[ETHER_ADDR_LEN]; 1273 1274 node = *rnode; 1275 sc = node.sysctl_data; 1276 ifp = &sc->sc_ec.ec_if; 1277 (void)ether_snprintf(addr, sizeof(addr), CLLADDR(ifp->if_sadl)); 1278 node.sysctl_data = addr; 1279 error = sysctl_lookup(SYSCTLFN_CALL(&node)); 1280 if (error || newp == NULL) 1281 return (error); 1282 1283 len = strlen(addr); 1284 if (len < 11 || len > 17) 1285 return (EINVAL); 1286 1287 /* Commit change */ 1288 if (ether_nonstatic_aton(enaddr, addr) != 0 || 1289 sockaddr_dl_setaddr(ifp->if_sadl, ifp->if_sadl->sdl_len, enaddr, 1290 ETHER_ADDR_LEN) == NULL) 1291 return (EINVAL); 1292 return (error); 1293 } 1294