1 /* $NetBSD: if_tap.c,v 1.22 2006/10/12 01:32:28 christos Exp $ */ 2 3 /* 4 * Copyright (c) 2003, 2004 The NetBSD Foundation. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to the NetBSD Foundation 8 * by Quentin Garnier. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the NetBSD 21 * Foundation, Inc. and its contributors. 22 * 4. Neither the name of The NetBSD Foundation nor the names of its 23 * contributors may be used to endorse or promote products derived 24 * from this software without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 29 * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 36 * POSSIBILITY OF SUCH DAMAGE. 37 */ 38 39 /* 40 * tap(4) is a virtual Ethernet interface. It appears as a real Ethernet 41 * device to the system, but can also be accessed by userland through a 42 * character device interface, which allows reading and injecting frames. 43 */ 44 45 #include <sys/cdefs.h> 46 __KERNEL_RCSID(0, "$NetBSD: if_tap.c,v 1.22 2006/10/12 01:32:28 christos Exp $"); 47 48 #if defined(_KERNEL_OPT) 49 #include "bpfilter.h" 50 #endif 51 52 #include <sys/param.h> 53 #include <sys/systm.h> 54 #include <sys/kernel.h> 55 #include <sys/malloc.h> 56 #include <sys/conf.h> 57 #include <sys/device.h> 58 #include <sys/file.h> 59 #include <sys/filedesc.h> 60 #include <sys/ksyms.h> 61 #include <sys/poll.h> 62 #include <sys/select.h> 63 #include <sys/sockio.h> 64 #include <sys/sysctl.h> 65 #include <sys/kauth.h> 66 67 #include <net/if.h> 68 #include <net/if_dl.h> 69 #include <net/if_ether.h> 70 #include <net/if_media.h> 71 #include <net/if_tap.h> 72 #if NBPFILTER > 0 73 #include <net/bpf.h> 74 #endif 75 76 /* 77 * sysctl node management 78 * 79 * It's not really possible to use a SYSCTL_SETUP block with 80 * current LKM implementation, so it is easier to just define 81 * our own function. 82 * 83 * The handler function is a "helper" in Andrew Brown's sysctl 84 * framework terminology. It is used as a gateway for sysctl 85 * requests over the nodes. 
86 * 87 * tap_log allows the module to log creations of nodes and 88 * destroy them all at once using sysctl_teardown. 89 */ 90 static int tap_node; 91 static int tap_sysctl_handler(SYSCTLFN_PROTO); 92 SYSCTL_SETUP_PROTO(sysctl_tap_setup); 93 94 /* 95 * Since we're an Ethernet device, we need the 3 following 96 * components: a leading struct device, a struct ethercom, 97 * and also a struct ifmedia since we don't attach a PHY to 98 * ourselves. We could emulate one, but there's no real 99 * point. 100 */ 101 102 struct tap_softc { 103 struct device sc_dev; 104 struct ifmedia sc_im; 105 struct ethercom sc_ec; 106 int sc_flags; 107 #define TAP_INUSE 0x00000001 /* tap device can only be opened once */ 108 #define TAP_ASYNCIO 0x00000002 /* user is using async I/O (SIGIO) on the device */ 109 #define TAP_NBIO 0x00000004 /* user wants calls to avoid blocking */ 110 #define TAP_GOING 0x00000008 /* interface is being destroyed */ 111 struct selinfo sc_rsel; 112 pid_t sc_pgid; /* For async. IO */ 113 struct lock sc_rdlock; 114 struct simplelock sc_kqlock; 115 }; 116 117 /* autoconf(9) glue */ 118 119 void tapattach(int); 120 121 static int tap_match(struct device *, struct cfdata *, void *); 122 static void tap_attach(struct device *, struct device *, void *); 123 static int tap_detach(struct device*, int); 124 125 /* Ethernet address helper functions */ 126 127 static int tap_ether_aton(u_char *, char *); 128 129 CFATTACH_DECL(tap, sizeof(struct tap_softc), 130 tap_match, tap_attach, tap_detach, NULL); 131 extern struct cfdriver tap_cd; 132 133 /* Real device access routines */ 134 static int tap_dev_close(struct tap_softc *); 135 static int tap_dev_read(int, struct uio *, int); 136 static int tap_dev_write(int, struct uio *, int); 137 static int tap_dev_ioctl(int, u_long, caddr_t, struct lwp *); 138 static int tap_dev_poll(int, int, struct lwp *); 139 static int tap_dev_kqfilter(int, struct knote *); 140 141 /* Fileops access routines */ 142 static int 
tap_fops_close(struct file *, struct lwp *); 143 static int tap_fops_read(struct file *, off_t *, struct uio *, 144 kauth_cred_t, int); 145 static int tap_fops_write(struct file *, off_t *, struct uio *, 146 kauth_cred_t, int); 147 static int tap_fops_ioctl(struct file *, u_long, void *, 148 struct lwp *); 149 static int tap_fops_poll(struct file *, int, struct lwp *); 150 static int tap_fops_kqfilter(struct file *, struct knote *); 151 152 static const struct fileops tap_fileops = { 153 tap_fops_read, 154 tap_fops_write, 155 tap_fops_ioctl, 156 fnullop_fcntl, 157 tap_fops_poll, 158 fbadop_stat, 159 tap_fops_close, 160 tap_fops_kqfilter, 161 }; 162 163 /* Helper for cloning open() */ 164 static int tap_dev_cloner(struct lwp *); 165 166 /* Character device routines */ 167 static int tap_cdev_open(dev_t, int, int, struct lwp *); 168 static int tap_cdev_close(dev_t, int, int, struct lwp *); 169 static int tap_cdev_read(dev_t, struct uio *, int); 170 static int tap_cdev_write(dev_t, struct uio *, int); 171 static int tap_cdev_ioctl(dev_t, u_long, caddr_t, int, struct lwp *); 172 static int tap_cdev_poll(dev_t, int, struct lwp *); 173 static int tap_cdev_kqfilter(dev_t, struct knote *); 174 175 const struct cdevsw tap_cdevsw = { 176 tap_cdev_open, tap_cdev_close, 177 tap_cdev_read, tap_cdev_write, 178 tap_cdev_ioctl, nostop, notty, 179 tap_cdev_poll, nommap, 180 tap_cdev_kqfilter, 181 D_OTHER, 182 }; 183 184 #define TAP_CLONER 0xfffff /* Maximal minor value */ 185 186 /* kqueue-related routines */ 187 static void tap_kqdetach(struct knote *); 188 static int tap_kqread(struct knote *, long); 189 190 /* 191 * Those are needed by the if_media interface. 192 */ 193 194 static int tap_mediachange(struct ifnet *); 195 static void tap_mediastatus(struct ifnet *, struct ifmediareq *); 196 197 /* 198 * Those are needed by the ifnet interface, and would typically be 199 * there for any network interface driver. 200 * Some other routines are optional: watchdog and drain. 
201 */ 202 203 static void tap_start(struct ifnet *); 204 static void tap_stop(struct ifnet *, int); 205 static int tap_init(struct ifnet *); 206 static int tap_ioctl(struct ifnet *, u_long, caddr_t); 207 208 /* This is an internal function to keep tap_ioctl readable */ 209 static int tap_lifaddr(struct ifnet *, u_long, struct ifaliasreq *); 210 211 /* 212 * tap is a clonable interface, although it is highly unrealistic for 213 * an Ethernet device. 214 * 215 * Here are the bits needed for a clonable interface. 216 */ 217 static int tap_clone_create(struct if_clone *, int); 218 static int tap_clone_destroy(struct ifnet *); 219 220 struct if_clone tap_cloners = IF_CLONE_INITIALIZER("tap", 221 tap_clone_create, 222 tap_clone_destroy); 223 224 /* Helper functionis shared by the two cloning code paths */ 225 static struct tap_softc * tap_clone_creator(int); 226 int tap_clone_destroyer(struct device *); 227 228 void 229 tapattach(int n __unused) 230 { 231 int error; 232 233 error = config_cfattach_attach(tap_cd.cd_name, &tap_ca); 234 if (error) { 235 aprint_error("%s: unable to register cfattach\n", 236 tap_cd.cd_name); 237 (void)config_cfdriver_detach(&tap_cd); 238 return; 239 } 240 241 if_clone_attach(&tap_cloners); 242 } 243 244 /* Pretty much useless for a pseudo-device */ 245 static int 246 tap_match(struct device *self __unused, struct cfdata *cfdata __unused, 247 void *arg __unused) 248 { 249 return (1); 250 } 251 252 void 253 tap_attach(struct device *parent __unused, struct device *self, 254 void *aux __unused) 255 { 256 struct tap_softc *sc = (struct tap_softc *)self; 257 struct ifnet *ifp; 258 const struct sysctlnode *node; 259 u_int8_t enaddr[ETHER_ADDR_LEN] = 260 { 0xf2, 0x0b, 0xa4, 0xff, 0xff, 0xff }; 261 char enaddrstr[3 * ETHER_ADDR_LEN]; 262 struct timeval tv; 263 uint32_t ui; 264 int error; 265 266 aprint_normal("%s: faking Ethernet device\n", 267 self->dv_xname); 268 269 /* 270 * In order to obtain unique initial Ethernet address on a host, 271 * do 
some randomisation using the current uptime. It's not meant 272 * for anything but avoiding hard-coding an address. 273 */ 274 getmicrouptime(&tv); 275 ui = (tv.tv_sec ^ tv.tv_usec) & 0xffffff; 276 memcpy(enaddr+3, (u_int8_t *)&ui, 3); 277 278 aprint_normal("%s: Ethernet address %s\n", sc->sc_dev.dv_xname, 279 ether_snprintf(enaddrstr, sizeof(enaddrstr), enaddr)); 280 281 /* 282 * Why 1000baseT? Why not? You can add more. 283 * 284 * Note that there are 3 steps: init, one or several additions to 285 * list of supported media, and in the end, the selection of one 286 * of them. 287 */ 288 ifmedia_init(&sc->sc_im, 0, tap_mediachange, tap_mediastatus); 289 ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_1000_T, 0, NULL); 290 ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_1000_T|IFM_FDX, 0, NULL); 291 ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_100_TX, 0, NULL); 292 ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_100_TX|IFM_FDX, 0, NULL); 293 ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_10_T, 0, NULL); 294 ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_10_T|IFM_FDX, 0, NULL); 295 ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_AUTO, 0, NULL); 296 ifmedia_set(&sc->sc_im, IFM_ETHER|IFM_AUTO); 297 298 /* 299 * One should note that an interface must do multicast in order 300 * to support IPv6. 301 */ 302 ifp = &sc->sc_ec.ec_if; 303 strcpy(ifp->if_xname, sc->sc_dev.dv_xname); 304 ifp->if_softc = sc; 305 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; 306 ifp->if_ioctl = tap_ioctl; 307 ifp->if_start = tap_start; 308 ifp->if_stop = tap_stop; 309 ifp->if_init = tap_init; 310 IFQ_SET_READY(&ifp->if_snd); 311 312 sc->sc_ec.ec_capabilities = ETHERCAP_VLAN_MTU | ETHERCAP_JUMBO_MTU; 313 314 /* Those steps are mandatory for an Ethernet driver, the fisrt call 315 * being common to all network interface drivers. */ 316 if_attach(ifp); 317 ether_ifattach(ifp, enaddr); 318 319 sc->sc_flags = 0; 320 321 /* 322 * Add a sysctl node for that interface. 
323 * 324 * The pointer transmitted is not a string, but instead a pointer to 325 * the softc structure, which we can use to build the string value on 326 * the fly in the helper function of the node. See the comments for 327 * tap_sysctl_handler for details. 328 * 329 * Usually sysctl_createv is called with CTL_CREATE as the before-last 330 * component. However, we can allocate a number ourselves, as we are 331 * the only consumer of the net.link.<iface> node. In this case, the 332 * unit number is conveniently used to number the node. CTL_CREATE 333 * would just work, too. 334 */ 335 if ((error = sysctl_createv(NULL, 0, NULL, 336 &node, CTLFLAG_READWRITE, 337 CTLTYPE_STRING, sc->sc_dev.dv_xname, NULL, 338 tap_sysctl_handler, 0, sc, 18, 339 CTL_NET, AF_LINK, tap_node, device_unit(&sc->sc_dev), 340 CTL_EOL)) != 0) 341 aprint_error("%s: sysctl_createv returned %d, ignoring\n", 342 sc->sc_dev.dv_xname, error); 343 344 /* 345 * Initialize the two locks for the device. 346 * 347 * We need a lock here because even though the tap device can be 348 * opened only once, the file descriptor might be passed to another 349 * process, say a fork(2)ed child. 350 * 351 * The Giant saves us from most of the hassle, but since the read 352 * operation can sleep, we don't want two processes to wake up at 353 * the same moment and both try and dequeue a single packet. 354 * 355 * The queue for event listeners (used by kqueue(9), see below) has 356 * to be protected, too, but we don't need the same level of 357 * complexity for that lock, so a simple spinning lock is fine. 358 */ 359 lockinit(&sc->sc_rdlock, PSOCK|PCATCH, "tapl", 0, LK_SLEEPFAIL); 360 simple_lock_init(&sc->sc_kqlock); 361 } 362 363 /* 364 * When detaching, we do the inverse of what is done in the attach 365 * routine, in reversed order. 
366 */ 367 static int 368 tap_detach(struct device* self, int flags __unused) 369 { 370 struct tap_softc *sc = (struct tap_softc *)self; 371 struct ifnet *ifp = &sc->sc_ec.ec_if; 372 int error, s; 373 374 /* 375 * Some processes might be sleeping on "tap", so we have to make 376 * them release their hold on the device. 377 * 378 * The LK_DRAIN operation will wait for every locked process to 379 * release their hold. 380 */ 381 sc->sc_flags |= TAP_GOING; 382 s = splnet(); 383 tap_stop(ifp, 1); 384 if_down(ifp); 385 splx(s); 386 lockmgr(&sc->sc_rdlock, LK_DRAIN, NULL); 387 388 /* 389 * Destroying a single leaf is a very straightforward operation using 390 * sysctl_destroyv. One should be sure to always end the path with 391 * CTL_EOL. 392 */ 393 if ((error = sysctl_destroyv(NULL, CTL_NET, AF_LINK, tap_node, 394 device_unit(&sc->sc_dev), CTL_EOL)) != 0) 395 aprint_error("%s: sysctl_destroyv returned %d, ignoring\n", 396 sc->sc_dev.dv_xname, error); 397 ether_ifdetach(ifp); 398 if_detach(ifp); 399 ifmedia_delete_instance(&sc->sc_im, IFM_INST_ANY); 400 401 return (0); 402 } 403 404 /* 405 * This function is called by the ifmedia layer to notify the driver 406 * that the user requested a media change. A real driver would 407 * reconfigure the hardware. 408 */ 409 static int 410 tap_mediachange(struct ifnet *ifp __unused) 411 { 412 return (0); 413 } 414 415 /* 416 * Here the user asks for the currently used media. 417 */ 418 static void 419 tap_mediastatus(struct ifnet *ifp, struct ifmediareq *imr) 420 { 421 struct tap_softc *sc = (struct tap_softc *)ifp->if_softc; 422 imr->ifm_active = sc->sc_im.ifm_cur->ifm_media; 423 } 424 425 /* 426 * This is the function where we SEND packets. 427 * 428 * There is no 'receive' equivalent. A typical driver will get 429 * interrupts from the hardware, and from there will inject new packets 430 * into the network stack. 431 * 432 * Once handled, a packet must be freed. 
A real driver might not be able 433 * to fit all the pending packets into the hardware, and is allowed to 434 * return before having sent all the packets. It should then use the 435 * if_flags flag IFF_OACTIVE to notify the upper layer. 436 * 437 * There are also other flags one should check, such as IFF_PAUSE. 438 * 439 * It is our duty to make packets available to BPF listeners. 440 * 441 * You should be aware that this function is called by the Ethernet layer 442 * at splnet(). 443 * 444 * When the device is opened, we have to pass the packet(s) to the 445 * userland. For that we stay in OACTIVE mode while the userland gets 446 * the packets, and we send a signal to the processes waiting to read. 447 * 448 * wakeup(sc) is the counterpart to the tsleep call in 449 * tap_dev_read, while selnotify() is used for kevent(2) and 450 * poll(2) (which includes select(2)) listeners. 451 */ 452 static void 453 tap_start(struct ifnet *ifp) 454 { 455 struct tap_softc *sc = (struct tap_softc *)ifp->if_softc; 456 struct mbuf *m0; 457 458 if ((sc->sc_flags & TAP_INUSE) == 0) { 459 /* Simply drop packets */ 460 for(;;) { 461 IFQ_DEQUEUE(&ifp->if_snd, m0); 462 if (m0 == NULL) 463 return; 464 465 ifp->if_opackets++; 466 #if NBPFILTER > 0 467 if (ifp->if_bpf) 468 bpf_mtap(ifp->if_bpf, m0); 469 #endif 470 471 m_freem(m0); 472 } 473 } else if (!IFQ_IS_EMPTY(&ifp->if_snd)) { 474 ifp->if_flags |= IFF_OACTIVE; 475 wakeup(sc); 476 selnotify(&sc->sc_rsel, 1); 477 if (sc->sc_flags & TAP_ASYNCIO) 478 fownsignal(sc->sc_pgid, SIGIO, POLL_IN, 479 POLLIN|POLLRDNORM, NULL); 480 } 481 } 482 483 /* 484 * A typical driver will only contain the following handlers for 485 * ioctl calls, except SIOCSIFPHYADDR. 486 * The latter is a hack I used to set the Ethernet address of the 487 * faked device. 488 * 489 * Note that both ifmedia_ioctl() and ether_ioctl() have to be 490 * called under splnet(). 
491 */ 492 static int 493 tap_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) 494 { 495 struct tap_softc *sc = (struct tap_softc *)ifp->if_softc; 496 struct ifreq *ifr = (struct ifreq *)data; 497 int s, error; 498 499 s = splnet(); 500 501 switch (cmd) { 502 case SIOCSIFMEDIA: 503 case SIOCGIFMEDIA: 504 error = ifmedia_ioctl(ifp, ifr, &sc->sc_im, cmd); 505 break; 506 case SIOCSIFPHYADDR: 507 error = tap_lifaddr(ifp, cmd, (struct ifaliasreq *)data); 508 break; 509 default: 510 error = ether_ioctl(ifp, cmd, data); 511 if (error == ENETRESET) 512 error = 0; 513 break; 514 } 515 516 splx(s); 517 518 return (error); 519 } 520 521 /* 522 * Helper function to set Ethernet address. This shouldn't be done there, 523 * and should actually be available to all Ethernet drivers, real or not. 524 */ 525 static int 526 tap_lifaddr(struct ifnet *ifp, u_long cmd __unused, struct ifaliasreq *ifra) 527 { 528 struct sockaddr *sa = (struct sockaddr *)&ifra->ifra_addr; 529 530 if (sa->sa_family != AF_LINK) 531 return (EINVAL); 532 533 memcpy(LLADDR(ifp->if_sadl), sa->sa_data, ETHER_ADDR_LEN); 534 535 return (0); 536 } 537 538 /* 539 * _init() would typically be called when an interface goes up, 540 * meaning it should configure itself into the state in which it 541 * can send packets. 542 */ 543 static int 544 tap_init(struct ifnet *ifp) 545 { 546 ifp->if_flags |= IFF_RUNNING; 547 548 tap_start(ifp); 549 550 return (0); 551 } 552 553 /* 554 * _stop() is called when an interface goes down. It is our 555 * responsability to validate that state by clearing the 556 * IFF_RUNNING flag. 557 * 558 * We have to wake up all the sleeping processes to have the pending 559 * read requests cancelled. 
560 */ 561 static void 562 tap_stop(struct ifnet *ifp, int disable __unused) 563 { 564 struct tap_softc *sc = (struct tap_softc *)ifp->if_softc; 565 566 ifp->if_flags &= ~IFF_RUNNING; 567 wakeup(sc); 568 selnotify(&sc->sc_rsel, 1); 569 if (sc->sc_flags & TAP_ASYNCIO) 570 fownsignal(sc->sc_pgid, SIGIO, POLL_HUP, 0, NULL); 571 } 572 573 /* 574 * The 'create' command of ifconfig can be used to create 575 * any numbered instance of a given device. Thus we have to 576 * make sure we have enough room in cd_devs to create the 577 * user-specified instance. config_attach_pseudo will do this 578 * for us. 579 */ 580 static int 581 tap_clone_create(struct if_clone *ifc __unused, int unit) 582 { 583 if (tap_clone_creator(unit) == NULL) { 584 aprint_error("%s%d: unable to attach an instance\n", 585 tap_cd.cd_name, unit); 586 return (ENXIO); 587 } 588 589 return (0); 590 } 591 592 /* 593 * tap(4) can be cloned by two ways: 594 * using 'ifconfig tap0 create', which will use the network 595 * interface cloning API, and call tap_clone_create above. 596 * opening the cloning device node, whose minor number is TAP_CLONER. 597 * See below for an explanation on how this part work. 598 * 599 * config_attach_pseudo can be called with unit = DVUNIT_ANY to have 600 * autoconf(9) choose a unit number for us. This is what happens when 601 * the cloner is openend, while the ifcloner interface creates a device 602 * with a specific unit number. 603 */ 604 static struct tap_softc * 605 tap_clone_creator(int unit) 606 { 607 struct cfdata *cf; 608 609 cf = malloc(sizeof(*cf), M_DEVBUF, M_WAITOK); 610 cf->cf_name = tap_cd.cd_name; 611 cf->cf_atname = tap_ca.ca_name; 612 cf->cf_unit = unit; 613 cf->cf_fstate = FSTATE_STAR; 614 615 return (struct tap_softc *)config_attach_pseudo(cf); 616 } 617 618 /* 619 * The clean design of if_clone and autoconf(9) makes that part 620 * really straightforward. The second argument of config_detach 621 * means neither QUIET nor FORCED. 
622 */ 623 static int 624 tap_clone_destroy(struct ifnet *ifp) 625 { 626 return tap_clone_destroyer((struct device *)ifp->if_softc); 627 } 628 629 int 630 tap_clone_destroyer(struct device *dev) 631 { 632 struct cfdata *cf = device_cfdata(dev); 633 int error; 634 635 if ((error = config_detach(dev, 0)) != 0) 636 aprint_error("%s: unable to detach instance\n", 637 dev->dv_xname); 638 free(cf, M_DEVBUF); 639 640 return (error); 641 } 642 643 /* 644 * tap(4) is a bit of an hybrid device. It can be used in two different 645 * ways: 646 * 1. ifconfig tapN create, then use /dev/tapN to read/write off it. 647 * 2. open /dev/tap, get a new interface created and read/write off it. 648 * That interface is destroyed when the process that had it created exits. 649 * 650 * The first way is managed by the cdevsw structure, and you access interfaces 651 * through a (major, minor) mapping: tap4 is obtained by the minor number 652 * 4. The entry points for the cdevsw interface are prefixed by tap_cdev_. 653 * 654 * The second way is the so-called "cloning" device. It's a special minor 655 * number (chosen as the maximal number, to allow as much tap devices as 656 * possible). The user first opens the cloner (e.g., /dev/tap), and that 657 * call ends in tap_cdev_open. The actual place where it is handled is 658 * tap_dev_cloner. 659 * 660 * An tap device cannot be opened more than once at a time, so the cdevsw 661 * part of open() does nothing but noting that the interface is being used and 662 * hence ready to actually handle packets. 
663 */ 664 665 static int 666 tap_cdev_open(dev_t dev, int flags __unused, int fmt __unused, struct lwp *l) 667 { 668 struct tap_softc *sc; 669 670 if (minor(dev) == TAP_CLONER) 671 return tap_dev_cloner(l); 672 673 sc = (struct tap_softc *)device_lookup(&tap_cd, minor(dev)); 674 if (sc == NULL) 675 return (ENXIO); 676 677 /* The device can only be opened once */ 678 if (sc->sc_flags & TAP_INUSE) 679 return (EBUSY); 680 sc->sc_flags |= TAP_INUSE; 681 return (0); 682 } 683 684 /* 685 * There are several kinds of cloning devices, and the most simple is the one 686 * tap(4) uses. What it does is change the file descriptor with a new one, 687 * with its own fileops structure (which maps to the various read, write, 688 * ioctl functions). It starts allocating a new file descriptor with falloc, 689 * then actually creates the new tap devices. 690 * 691 * Once those two steps are successful, we can re-wire the existing file 692 * descriptor to its new self. This is done with fdclone(): it fills the fp 693 * structure as needed (notably f_data gets filled with the fifth parameter 694 * passed, the unit of the tap device which will allows us identifying the 695 * device later), and returns EMOVEFD. 696 * 697 * That magic value is interpreted by sys_open() which then replaces the 698 * current file descriptor by the new one (through a magic member of struct 699 * lwp, l_dupfd). 700 * 701 * The tap device is flagged as being busy since it otherwise could be 702 * externally accessed through the corresponding device node with the cdevsw 703 * interface. 
704 */ 705 706 static int 707 tap_dev_cloner(struct lwp *l) 708 { 709 struct tap_softc *sc; 710 struct file *fp; 711 int error, fd; 712 713 if ((error = falloc(l, &fp, &fd)) != 0) 714 return (error); 715 716 if ((sc = tap_clone_creator(DVUNIT_ANY)) == NULL) { 717 FILE_UNUSE(fp, l); 718 ffree(fp); 719 return (ENXIO); 720 } 721 722 sc->sc_flags |= TAP_INUSE; 723 724 return fdclone(l, fp, fd, FREAD|FWRITE, &tap_fileops, 725 (void *)(intptr_t)device_unit(&sc->sc_dev)); 726 } 727 728 /* 729 * While all other operations (read, write, ioctl, poll and kqfilter) are 730 * really the same whether we are in cdevsw or fileops mode, the close() 731 * function is slightly different in the two cases. 732 * 733 * As for the other, the core of it is shared in tap_dev_close. What 734 * it does is sufficient for the cdevsw interface, but the cloning interface 735 * needs another thing: the interface is destroyed when the processes that 736 * created it closes it. 737 */ 738 static int 739 tap_cdev_close(dev_t dev, int flags __unused, int fmt __unused, 740 struct lwp *l __unused) 741 { 742 struct tap_softc *sc = 743 (struct tap_softc *)device_lookup(&tap_cd, minor(dev)); 744 745 if (sc == NULL) 746 return (ENXIO); 747 748 return tap_dev_close(sc); 749 } 750 751 /* 752 * It might happen that the administrator used ifconfig to externally destroy 753 * the interface. In that case, tap_fops_close will be called while 754 * tap_detach is already happening. If we called it again from here, we 755 * would dead lock. TAP_GOING ensures that this situation doesn't happen. 756 */ 757 static int 758 tap_fops_close(struct file *fp, struct lwp *l __unused) 759 { 760 int unit = (intptr_t)fp->f_data; 761 struct tap_softc *sc; 762 int error; 763 764 sc = (struct tap_softc *)device_lookup(&tap_cd, unit); 765 if (sc == NULL) 766 return (ENXIO); 767 768 /* tap_dev_close currently always succeeds, but it might not 769 * always be the case. 
*/ 770 if ((error = tap_dev_close(sc)) != 0) 771 return (error); 772 773 /* Destroy the device now that it is no longer useful, 774 * unless it's already being destroyed. */ 775 if ((sc->sc_flags & TAP_GOING) != 0) 776 return (0); 777 778 return tap_clone_destroyer((struct device *)sc); 779 } 780 781 static int 782 tap_dev_close(struct tap_softc *sc) 783 { 784 struct ifnet *ifp; 785 int s; 786 787 s = splnet(); 788 /* Let tap_start handle packets again */ 789 ifp = &sc->sc_ec.ec_if; 790 ifp->if_flags &= ~IFF_OACTIVE; 791 792 /* Purge output queue */ 793 if (!(IFQ_IS_EMPTY(&ifp->if_snd))) { 794 struct mbuf *m; 795 796 for (;;) { 797 IFQ_DEQUEUE(&ifp->if_snd, m); 798 if (m == NULL) 799 break; 800 801 ifp->if_opackets++; 802 #if NBPFILTER > 0 803 if (ifp->if_bpf) 804 bpf_mtap(ifp->if_bpf, m); 805 #endif 806 } 807 } 808 splx(s); 809 810 sc->sc_flags &= ~(TAP_INUSE | TAP_ASYNCIO); 811 812 return (0); 813 } 814 815 static int 816 tap_cdev_read(dev_t dev, struct uio *uio, int flags) 817 { 818 return tap_dev_read(minor(dev), uio, flags); 819 } 820 821 static int 822 tap_fops_read(struct file *fp, off_t *offp __unused, struct uio *uio, 823 kauth_cred_t cred __unused, int flags) 824 { 825 return tap_dev_read((intptr_t)fp->f_data, uio, flags); 826 } 827 828 static int 829 tap_dev_read(int unit, struct uio *uio, int flags __unused) 830 { 831 struct tap_softc *sc = 832 (struct tap_softc *)device_lookup(&tap_cd, unit); 833 struct ifnet *ifp; 834 struct mbuf *m, *n; 835 int error = 0, s; 836 837 if (sc == NULL) 838 return (ENXIO); 839 840 ifp = &sc->sc_ec.ec_if; 841 if ((ifp->if_flags & IFF_UP) == 0) 842 return (EHOSTDOWN); 843 844 /* 845 * In the TAP_NBIO case, we have to make sure we won't be sleeping 846 */ 847 if ((sc->sc_flags & TAP_NBIO) && 848 lockstatus(&sc->sc_rdlock) == LK_EXCLUSIVE) 849 return (EWOULDBLOCK); 850 error = lockmgr(&sc->sc_rdlock, LK_EXCLUSIVE, NULL); 851 if (error != 0) 852 return (error); 853 854 s = splnet(); 855 if (IFQ_IS_EMPTY(&ifp->if_snd)) { 856 
ifp->if_flags &= ~IFF_OACTIVE; 857 splx(s); 858 /* 859 * We must release the lock before sleeping, and re-acquire it 860 * after. 861 */ 862 (void)lockmgr(&sc->sc_rdlock, LK_RELEASE, NULL); 863 if (sc->sc_flags & TAP_NBIO) 864 error = EWOULDBLOCK; 865 else 866 error = tsleep(sc, PSOCK|PCATCH, "tap", 0); 867 868 if (error != 0) 869 return (error); 870 /* The device might have been downed */ 871 if ((ifp->if_flags & IFF_UP) == 0) 872 return (EHOSTDOWN); 873 if ((sc->sc_flags & TAP_NBIO) && 874 lockstatus(&sc->sc_rdlock) == LK_EXCLUSIVE) 875 return (EWOULDBLOCK); 876 error = lockmgr(&sc->sc_rdlock, LK_EXCLUSIVE, NULL); 877 if (error != 0) 878 return (error); 879 s = splnet(); 880 } 881 882 IFQ_DEQUEUE(&ifp->if_snd, m); 883 ifp->if_flags &= ~IFF_OACTIVE; 884 splx(s); 885 if (m == NULL) { 886 error = 0; 887 goto out; 888 } 889 890 ifp->if_opackets++; 891 #if NBPFILTER > 0 892 if (ifp->if_bpf) 893 bpf_mtap(ifp->if_bpf, m); 894 #endif 895 896 /* 897 * One read is one packet. 898 */ 899 do { 900 error = uiomove(mtod(m, caddr_t), 901 min(m->m_len, uio->uio_resid), uio); 902 MFREE(m, n); 903 m = n; 904 } while (m != NULL && uio->uio_resid > 0 && error == 0); 905 906 if (m != NULL) 907 m_freem(m); 908 909 out: 910 (void)lockmgr(&sc->sc_rdlock, LK_RELEASE, NULL); 911 return (error); 912 } 913 914 static int 915 tap_cdev_write(dev_t dev, struct uio *uio, int flags) 916 { 917 return tap_dev_write(minor(dev), uio, flags); 918 } 919 920 static int 921 tap_fops_write(struct file *fp, off_t *offp __unused, struct uio *uio, 922 kauth_cred_t cred __unused, int flags) 923 { 924 return tap_dev_write((intptr_t)fp->f_data, uio, flags); 925 } 926 927 static int 928 tap_dev_write(int unit, struct uio *uio, int flags __unused) 929 { 930 struct tap_softc *sc = 931 (struct tap_softc *)device_lookup(&tap_cd, unit); 932 struct ifnet *ifp; 933 struct mbuf *m, **mp; 934 int error = 0; 935 int s; 936 937 if (sc == NULL) 938 return (ENXIO); 939 940 ifp = &sc->sc_ec.ec_if; 941 942 /* One write, one 
packet, that's the rule */ 943 MGETHDR(m, M_DONTWAIT, MT_DATA); 944 if (m == NULL) { 945 ifp->if_ierrors++; 946 return (ENOBUFS); 947 } 948 m->m_pkthdr.len = uio->uio_resid; 949 950 mp = &m; 951 while (error == 0 && uio->uio_resid > 0) { 952 if (*mp != m) { 953 MGET(*mp, M_DONTWAIT, MT_DATA); 954 if (*mp == NULL) { 955 error = ENOBUFS; 956 break; 957 } 958 } 959 (*mp)->m_len = min(MHLEN, uio->uio_resid); 960 error = uiomove(mtod(*mp, caddr_t), (*mp)->m_len, uio); 961 mp = &(*mp)->m_next; 962 } 963 if (error) { 964 ifp->if_ierrors++; 965 m_freem(m); 966 return (error); 967 } 968 969 ifp->if_ipackets++; 970 m->m_pkthdr.rcvif = ifp; 971 972 #if NBPFILTER > 0 973 if (ifp->if_bpf) 974 bpf_mtap(ifp->if_bpf, m); 975 #endif 976 s =splnet(); 977 (*ifp->if_input)(ifp, m); 978 splx(s); 979 980 return (0); 981 } 982 983 static int 984 tap_cdev_ioctl(dev_t dev, u_long cmd, caddr_t data, int flags __unused, 985 struct lwp *l) 986 { 987 return tap_dev_ioctl(minor(dev), cmd, data, l); 988 } 989 990 static int 991 tap_fops_ioctl(struct file *fp, u_long cmd, void *data, struct lwp *l) 992 { 993 return tap_dev_ioctl((intptr_t)fp->f_data, cmd, (caddr_t)data, l); 994 } 995 996 static int 997 tap_dev_ioctl(int unit, u_long cmd, caddr_t data, struct lwp *l) 998 { 999 struct tap_softc *sc = 1000 (struct tap_softc *)device_lookup(&tap_cd, unit); 1001 int error = 0; 1002 1003 if (sc == NULL) 1004 return (ENXIO); 1005 1006 switch (cmd) { 1007 case FIONREAD: 1008 { 1009 struct ifnet *ifp = &sc->sc_ec.ec_if; 1010 struct mbuf *m; 1011 int s; 1012 1013 s = splnet(); 1014 IFQ_POLL(&ifp->if_snd, m); 1015 1016 if (m == NULL) 1017 *(int *)data = 0; 1018 else 1019 *(int *)data = m->m_pkthdr.len; 1020 splx(s); 1021 } break; 1022 case TIOCSPGRP: 1023 case FIOSETOWN: 1024 error = fsetown(l->l_proc, &sc->sc_pgid, cmd, data); 1025 break; 1026 case TIOCGPGRP: 1027 case FIOGETOWN: 1028 error = fgetown(l->l_proc, sc->sc_pgid, cmd, data); 1029 break; 1030 case FIOASYNC: 1031 if (*(int *)data) 1032 
sc->sc_flags |= TAP_ASYNCIO; 1033 else 1034 sc->sc_flags &= ~TAP_ASYNCIO; 1035 break; 1036 case FIONBIO: 1037 if (*(int *)data) 1038 sc->sc_flags |= TAP_NBIO; 1039 else 1040 sc->sc_flags &= ~TAP_NBIO; 1041 break; 1042 case TAPGIFNAME: 1043 { 1044 struct ifreq *ifr = (struct ifreq *)data; 1045 struct ifnet *ifp = &sc->sc_ec.ec_if; 1046 1047 strlcpy(ifr->ifr_name, ifp->if_xname, IFNAMSIZ); 1048 } break; 1049 default: 1050 error = ENOTTY; 1051 break; 1052 } 1053 1054 return (0); 1055 } 1056 1057 static int 1058 tap_cdev_poll(dev_t dev, int events, struct lwp *l) 1059 { 1060 return tap_dev_poll(minor(dev), events, l); 1061 } 1062 1063 static int 1064 tap_fops_poll(struct file *fp, int events, struct lwp *l) 1065 { 1066 return tap_dev_poll((intptr_t)fp->f_data, events, l); 1067 } 1068 1069 static int 1070 tap_dev_poll(int unit, int events, struct lwp *l) 1071 { 1072 struct tap_softc *sc = 1073 (struct tap_softc *)device_lookup(&tap_cd, unit); 1074 int revents = 0; 1075 1076 if (sc == NULL) 1077 return (ENXIO); 1078 1079 if (events & (POLLIN|POLLRDNORM)) { 1080 struct ifnet *ifp = &sc->sc_ec.ec_if; 1081 struct mbuf *m; 1082 int s; 1083 1084 s = splnet(); 1085 IFQ_POLL(&ifp->if_snd, m); 1086 splx(s); 1087 1088 if (m != NULL) 1089 revents |= events & (POLLIN|POLLRDNORM); 1090 else { 1091 simple_lock(&sc->sc_kqlock); 1092 selrecord(l, &sc->sc_rsel); 1093 simple_unlock(&sc->sc_kqlock); 1094 } 1095 } 1096 revents |= events & (POLLOUT|POLLWRNORM); 1097 1098 return (revents); 1099 } 1100 1101 static struct filterops tap_read_filterops = { 1, NULL, tap_kqdetach, 1102 tap_kqread }; 1103 static struct filterops tap_seltrue_filterops = { 1, NULL, tap_kqdetach, 1104 filt_seltrue }; 1105 1106 static int 1107 tap_cdev_kqfilter(dev_t dev, struct knote *kn) 1108 { 1109 return tap_dev_kqfilter(minor(dev), kn); 1110 } 1111 1112 static int 1113 tap_fops_kqfilter(struct file *fp, struct knote *kn) 1114 { 1115 return tap_dev_kqfilter((intptr_t)fp->f_data, kn); 1116 } 1117 1118 static int 
1119 tap_dev_kqfilter(int unit, struct knote *kn) 1120 { 1121 struct tap_softc *sc = 1122 (struct tap_softc *)device_lookup(&tap_cd, unit); 1123 1124 if (sc == NULL) 1125 return (ENXIO); 1126 1127 switch(kn->kn_filter) { 1128 case EVFILT_READ: 1129 kn->kn_fop = &tap_read_filterops; 1130 break; 1131 case EVFILT_WRITE: 1132 kn->kn_fop = &tap_seltrue_filterops; 1133 break; 1134 default: 1135 return (1); 1136 } 1137 1138 kn->kn_hook = sc; 1139 simple_lock(&sc->sc_kqlock); 1140 SLIST_INSERT_HEAD(&sc->sc_rsel.sel_klist, kn, kn_selnext); 1141 simple_unlock(&sc->sc_kqlock); 1142 return (0); 1143 } 1144 1145 static void 1146 tap_kqdetach(struct knote *kn) 1147 { 1148 struct tap_softc *sc = (struct tap_softc *)kn->kn_hook; 1149 1150 simple_lock(&sc->sc_kqlock); 1151 SLIST_REMOVE(&sc->sc_rsel.sel_klist, kn, knote, kn_selnext); 1152 simple_unlock(&sc->sc_kqlock); 1153 } 1154 1155 static int 1156 tap_kqread(struct knote *kn, long hint __unused) 1157 { 1158 struct tap_softc *sc = (struct tap_softc *)kn->kn_hook; 1159 struct ifnet *ifp = &sc->sc_ec.ec_if; 1160 struct mbuf *m; 1161 int s; 1162 1163 s = splnet(); 1164 IFQ_POLL(&ifp->if_snd, m); 1165 1166 if (m == NULL) 1167 kn->kn_data = 0; 1168 else 1169 kn->kn_data = m->m_pkthdr.len; 1170 splx(s); 1171 return (kn->kn_data != 0 ? 1 : 0); 1172 } 1173 1174 /* 1175 * sysctl management routines 1176 * You can set the address of an interface through: 1177 * net.link.tap.tap<number> 1178 * 1179 * Note the consistent use of tap_log in order to use 1180 * sysctl_teardown at unload time. 1181 * 1182 * In the kernel you will find a lot of SYSCTL_SETUP blocks. Those 1183 * blocks register a function in a special section of the kernel 1184 * (called a link set) which is used at init_sysctl() time to cycle 1185 * through all those functions to create the kernel's sysctl tree. 1186 * 1187 * It is not (currently) possible to use link sets in a LKM, so the 1188 * easiest is to simply call our own setup routine at load time. 
1189 * 1190 * In the SYSCTL_SETUP blocks you find in the kernel, nodes have the 1191 * CTLFLAG_PERMANENT flag, meaning they cannot be removed. Once the 1192 * whole kernel sysctl tree is built, it is not possible to add any 1193 * permanent node. 1194 * 1195 * It should be noted that we're not saving the sysctlnode pointer 1196 * we are returned when creating the "tap" node. That structure 1197 * cannot be trusted once out of the calling function, as it might 1198 * get reused. So we just save the MIB number, and always give the 1199 * full path starting from the root for later calls to sysctl_createv 1200 * and sysctl_destroyv. 1201 */ 1202 SYSCTL_SETUP(sysctl_tap_setup, "sysctl net.link.tap subtree setup") 1203 { 1204 const struct sysctlnode *node; 1205 int error = 0; 1206 1207 if ((error = sysctl_createv(clog, 0, NULL, NULL, 1208 CTLFLAG_PERMANENT, 1209 CTLTYPE_NODE, "net", NULL, 1210 NULL, 0, NULL, 0, 1211 CTL_NET, CTL_EOL)) != 0) 1212 return; 1213 1214 if ((error = sysctl_createv(clog, 0, NULL, NULL, 1215 CTLFLAG_PERMANENT, 1216 CTLTYPE_NODE, "link", NULL, 1217 NULL, 0, NULL, 0, 1218 CTL_NET, AF_LINK, CTL_EOL)) != 0) 1219 return; 1220 1221 /* 1222 * The first four parameters of sysctl_createv are for management. 1223 * 1224 * The four that follows, here starting with a '0' for the flags, 1225 * describe the node. 1226 * 1227 * The next series of four set its value, through various possible 1228 * means. 1229 * 1230 * Last but not least, the path to the node is described. That path 1231 * is relative to the given root (third argument). Here we're 1232 * starting from the root. 1233 */ 1234 if ((error = sysctl_createv(clog, 0, NULL, &node, 1235 CTLFLAG_PERMANENT, 1236 CTLTYPE_NODE, "tap", NULL, 1237 NULL, 0, NULL, 0, 1238 CTL_NET, AF_LINK, CTL_CREATE, CTL_EOL)) != 0) 1239 return; 1240 tap_node = node->sysctl_num; 1241 } 1242 1243 /* 1244 * The helper functions make Andrew Brown's interface really 1245 * shine. 
 * It makes it possible to create values on the fly, whether
 * the sysctl value is read or written.
 *
 * As shown in the example in the man page, the first step is to
 * create a copy of the node to have sysctl_lookup work on it.
 *
 * Here, we have more work to do than just a copy, since we have
 * to create the string.  The first step is to collect the actual
 * value of the node, which is a convenient pointer to the softc
 * of the interface.  From there we create the string and use it
 * as the value, but only for the *copy* of the node.
 *
 * Then we let sysctl_lookup do the magic, which consists of
 * setting oldp and newp as required by the operation.  When the
 * value is read, that means that the string will be copied to
 * the user, and when it is written, the new value will be copied
 * over into the addr array.
 *
 * If newp is NULL, the user was reading the value, so we don't
 * have anything else to do.  If a new value was written, we
 * have to check it.
 *
 * If it is incorrect, we can return an error and leave 'node' as
 * it is: since it is a copy of the actual node, the change will
 * be forgotten.
 *
 * Upon a correct input, we commit the change to the ifnet
 * structure of our interface.
1273 */ 1274 static int 1275 tap_sysctl_handler(SYSCTLFN_ARGS) 1276 { 1277 struct sysctlnode node; 1278 struct tap_softc *sc; 1279 struct ifnet *ifp; 1280 int error; 1281 size_t len; 1282 char addr[3 * ETHER_ADDR_LEN]; 1283 1284 node = *rnode; 1285 sc = node.sysctl_data; 1286 ifp = &sc->sc_ec.ec_if; 1287 (void)ether_snprintf(addr, sizeof(addr), LLADDR(ifp->if_sadl)); 1288 node.sysctl_data = addr; 1289 error = sysctl_lookup(SYSCTLFN_CALL(&node)); 1290 if (error || newp == NULL) 1291 return (error); 1292 1293 len = strlen(addr); 1294 if (len < 11 || len > 17) 1295 return (EINVAL); 1296 1297 /* Commit change */ 1298 if (tap_ether_aton(LLADDR(ifp->if_sadl), addr) != 0) 1299 return (EINVAL); 1300 return (error); 1301 } 1302 1303 /* 1304 * ether_aton implementation, not using a static buffer. 1305 */ 1306 static int 1307 tap_ether_aton(u_char *dest, char *str) 1308 { 1309 int i; 1310 char *cp = str; 1311 u_char val[6]; 1312 1313 #define set_value \ 1314 if (*cp > '9' && *cp < 'a') \ 1315 *cp -= 'A' - 10; \ 1316 else if (*cp > '9') \ 1317 *cp -= 'a' - 10; \ 1318 else \ 1319 *cp -= '0' 1320 1321 for (i = 0; i < 6; i++, cp++) { 1322 if (!isxdigit(*cp)) 1323 return (1); 1324 set_value; 1325 val[i] = *cp++; 1326 if (isxdigit(*cp)) { 1327 set_value; 1328 val[i] *= 16; 1329 val[i] += *cp++; 1330 } 1331 if (*cp == ':' || i == 5) 1332 continue; 1333 else 1334 return (1); 1335 } 1336 memcpy(dest, val, 6); 1337 return (0); 1338 } 1339