/*	$NetBSD: if_tap.c,v 1.18 2006/06/07 22:33:43 kardel Exp $	*/

/*
 * Copyright (c) 2003, 2004 The NetBSD Foundation.
 * All rights reserved.
 *
 * This code is derived from software contributed to the NetBSD Foundation
 * by Quentin Garnier.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *        This product includes software developed by the NetBSD
 *        Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * tap(4) is a virtual Ethernet interface.  It appears as a real Ethernet
 * device to the system, but can also be accessed by userland through a
 * character device interface, which allows reading and injecting frames.
 */
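
/*
 * A minimal userland sketch of the first access mode: the interface is
 * created with "ifconfig tap0 create" and marked up, and frames are then
 * read from and injected through /dev/tap0.  The device path and the
 * buffer size (one standard Ethernet frame) are illustrative assumptions,
 * not part of this driver; one read returns at most one frame, and one
 * write injects exactly one frame.
 *
 *	#include <sys/types.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		char frame[1514];	// one standard Ethernet frame
 *		ssize_t len;
 *		int fd;
 *
 *		fd = open("/dev/tap0", O_RDWR);	// created with ifconfig(8)
 *		if (fd == -1)
 *			return (1);
 *		len = read(fd, frame, sizeof(frame));	// blocks until a frame is queued
 *		if (len > 0) {
 *			printf("got a %zd byte frame\n", len);
 *			(void)write(fd, frame, len);	// inject one frame back
 *		}
 *		close(fd);
 *		return (0);
 *	}
 */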

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: if_tap.c,v 1.18 2006/06/07 22:33:43 kardel Exp $");

#if defined(_KERNEL_OPT)
#include "bpfilter.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/device.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/ksyms.h>
#include <sys/poll.h>
#include <sys/select.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/kauth.h>

#include <net/if.h>
#include <net/if_dl.h>
#include <net/if_ether.h>
#include <net/if_media.h>
#include <net/if_tap.h>
#if NBPFILTER > 0
#include <net/bpf.h>
#endif

/*
 * sysctl node management
 *
 * It's not really possible to use a SYSCTL_SETUP block with the
 * current LKM implementation, so it is easier to just define
 * our own function.
 *
 * The handler function is a "helper" in Andrew Brown's sysctl
 * framework terminology.  It is used as a gateway for sysctl
 * requests over the nodes.
 *
 * tap_log allows the module to log creations of nodes and
 * destroy them all at once using sysctl_teardown.
 */
static int tap_node;
static int tap_sysctl_handler(SYSCTLFN_PROTO);
SYSCTL_SETUP_PROTO(sysctl_tap_setup);

/*
 * Since we're an Ethernet device, we need the following three
 * components: a leading struct device, a struct ethercom, and also
 * a struct ifmedia since we don't attach a PHY to ourselves.
 * We could emulate one, but there's no real point.
 */

struct tap_softc {
	struct device	sc_dev;
	struct ifmedia	sc_im;
	struct ethercom	sc_ec;
	int		sc_flags;
#define	TAP_INUSE	0x00000001	/* tap device can only be opened once */
#define	TAP_ASYNCIO	0x00000002	/* user is using async I/O (SIGIO) on the device */
#define	TAP_NBIO	0x00000004	/* user wants calls to avoid blocking */
#define	TAP_GOING	0x00000008	/* interface is being destroyed */
	struct selinfo	sc_rsel;
	pid_t		sc_pgid;	/* For async. IO */
	struct lock	sc_rdlock;
	struct simplelock	sc_kqlock;
};

/* autoconf(9) glue */

void	tapattach(int);

static int	tap_match(struct device *, struct cfdata *, void *);
static void	tap_attach(struct device *, struct device *, void *);
static int	tap_detach(struct device*, int);

/* Ethernet address helper functions */

static int	tap_ether_aton(u_char *, char *);

CFATTACH_DECL(tap, sizeof(struct tap_softc),
    tap_match, tap_attach, tap_detach, NULL);
extern struct cfdriver tap_cd;

/* Real device access routines */
static int	tap_dev_close(struct tap_softc *);
static int	tap_dev_read(int, struct uio *, int);
static int	tap_dev_write(int, struct uio *, int);
static int	tap_dev_ioctl(int, u_long, caddr_t, struct lwp *);
static int	tap_dev_poll(int, int, struct lwp *);
static int	tap_dev_kqfilter(int, struct knote *);

/* Fileops access routines */
static int	tap_fops_close(struct file *, struct lwp *);
static int	tap_fops_read(struct file *, off_t *, struct uio *,
    kauth_cred_t, int);
static int	tap_fops_write(struct file *, off_t *, struct uio *,
    kauth_cred_t, int);
static int	tap_fops_ioctl(struct file *, u_long, void *,
    struct lwp *);
static int	tap_fops_poll(struct file *, int, struct lwp *);
static int	tap_fops_kqfilter(struct file *, struct knote *);

static const struct fileops tap_fileops = {
	tap_fops_read,
	tap_fops_write,
	tap_fops_ioctl,
	fnullop_fcntl,
	tap_fops_poll,
	fbadop_stat,
	tap_fops_close,
	tap_fops_kqfilter,
};

/* Helper for cloning open() */
static int	tap_dev_cloner(struct lwp *);

/* Character device routines */
static int	tap_cdev_open(dev_t, int, int, struct lwp *);
static int	tap_cdev_close(dev_t, int, int, struct lwp *);
static int	tap_cdev_read(dev_t, struct uio *, int);
static int	tap_cdev_write(dev_t, struct uio *, int);
static int	tap_cdev_ioctl(dev_t, u_long, caddr_t, int, struct lwp *);
static int	tap_cdev_poll(dev_t, int, struct lwp *);
static int	tap_cdev_kqfilter(dev_t, struct knote *);

const struct cdevsw tap_cdevsw = {
	tap_cdev_open, tap_cdev_close,
	tap_cdev_read, tap_cdev_write,
	tap_cdev_ioctl, nostop, notty,
	tap_cdev_poll, nommap,
	tap_cdev_kqfilter,
};

#define	TAP_CLONER	0xfffff		/* Maximal minor value */

/* kqueue-related routines */
static void	tap_kqdetach(struct knote *);
static int	tap_kqread(struct knote *, long);

/*
 * Those are needed by the if_media interface.
 */

static int	tap_mediachange(struct ifnet *);
static void	tap_mediastatus(struct ifnet *, struct ifmediareq *);

/*
 * Those are needed by the ifnet interface, and would typically be
 * there for any network interface driver.
 * Some other routines are optional: watchdog and drain.
 */

static void	tap_start(struct ifnet *);
static void	tap_stop(struct ifnet *, int);
static int	tap_init(struct ifnet *);
static int	tap_ioctl(struct ifnet *, u_long, caddr_t);

/* This is an internal function to keep tap_ioctl readable */
static int	tap_lifaddr(struct ifnet *, u_long, struct ifaliasreq *);

/*
 * tap is a clonable interface, although it is highly unrealistic for
 * an Ethernet device.
 *
 * Here are the bits needed for a clonable interface.
 */
static int	tap_clone_create(struct if_clone *, int);
static int	tap_clone_destroy(struct ifnet *);

struct if_clone tap_cloners = IF_CLONE_INITIALIZER("tap",
					tap_clone_create,
					tap_clone_destroy);

/* Helper functions shared by the two cloning code paths */
static struct tap_softc *	tap_clone_creator(int);
int	tap_clone_destroyer(struct device *);

void
tapattach(int n)
{
	int error;

	error = config_cfattach_attach(tap_cd.cd_name, &tap_ca);
	if (error) {
		aprint_error("%s: unable to register cfattach\n",
		    tap_cd.cd_name);
		(void)config_cfdriver_detach(&tap_cd);
		return;
	}

	if_clone_attach(&tap_cloners);
}

/* Pretty much useless for a pseudo-device */
static int
tap_match(struct device *self, struct cfdata *cfdata, void *arg)
{
	return (1);
}

void
tap_attach(struct device *parent, struct device *self, void *aux)
{
	struct tap_softc *sc = (struct tap_softc *)self;
	struct ifnet *ifp;
	const struct sysctlnode *node;
	u_int8_t enaddr[ETHER_ADDR_LEN] =
	    { 0xf2, 0x0b, 0xa4, 0xff, 0xff, 0xff };
	char enaddrstr[3 * ETHER_ADDR_LEN];
	struct timeval tv;
	uint32_t ui;
	int error;

	aprint_normal("%s: faking Ethernet device\n",
	    self->dv_xname);

	/*
	 * In order to obtain a unique initial Ethernet address on a host,
	 * do some randomisation using the current uptime.  It's not meant
	 * for anything but avoiding hard-coding an address.
	 */
	getmicrouptime(&tv);
	ui = (tv.tv_sec ^ tv.tv_usec) & 0xffffff;
	memcpy(enaddr+3, (u_int8_t *)&ui, 3);

	aprint_normal("%s: Ethernet address %s\n", sc->sc_dev.dv_xname,
	    ether_snprintf(enaddrstr, sizeof(enaddrstr), enaddr));

	/*
	 * Why 1000baseT? Why not? You can add more.
	 *
	 * Note that there are 3 steps: init, one or several additions to
	 * the list of supported media, and in the end, the selection of
	 * one of them.
	 */
	ifmedia_init(&sc->sc_im, 0, tap_mediachange, tap_mediastatus);
	ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_1000_T, 0, NULL);
	ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_1000_T|IFM_FDX, 0, NULL);
	ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_100_TX, 0, NULL);
	ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_100_TX|IFM_FDX, 0, NULL);
	ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_10_T, 0, NULL);
	ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_10_T|IFM_FDX, 0, NULL);
	ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_AUTO, 0, NULL);
	ifmedia_set(&sc->sc_im, IFM_ETHER|IFM_AUTO);

	/*
	 * One should note that an interface must do multicast in order
	 * to support IPv6.
	 */
	ifp = &sc->sc_ec.ec_if;
	strcpy(ifp->if_xname, sc->sc_dev.dv_xname);
	ifp->if_softc = sc;
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_ioctl = tap_ioctl;
	ifp->if_start = tap_start;
	ifp->if_stop = tap_stop;
	ifp->if_init = tap_init;
	IFQ_SET_READY(&ifp->if_snd);

	sc->sc_ec.ec_capabilities = ETHERCAP_VLAN_MTU | ETHERCAP_JUMBO_MTU;

	/* Those steps are mandatory for an Ethernet driver, the first call
	 * being common to all network interface drivers. */
	if_attach(ifp);
	ether_ifattach(ifp, enaddr);

	sc->sc_flags = 0;

	/*
	 * Add a sysctl node for that interface.
	 *
	 * The pointer passed is not a string, but instead a pointer to
	 * the softc structure, which we can use to build the string value
	 * on the fly in the helper function of the node.  See the comments
	 * for tap_sysctl_handler for details.
	 */
	if ((error = sysctl_createv(NULL, 0, NULL,
	    &node, CTLFLAG_READWRITE,
	    CTLTYPE_STRING, sc->sc_dev.dv_xname, NULL,
	    tap_sysctl_handler, 0, sc, 18,
	    CTL_NET, AF_LINK, tap_node, device_unit(&sc->sc_dev),
	    CTL_EOL)) != 0)
		aprint_error("%s: sysctl_createv returned %d, ignoring\n",
		    sc->sc_dev.dv_xname, error);

	/*
	 * Initialize the two locks for the device.
	 *
	 * We need a lock here because even though the tap device can be
	 * opened only once, the file descriptor might be passed to another
	 * process, say a fork(2)ed child.
	 *
	 * The Giant saves us from most of the hassle, but since the read
	 * operation can sleep, we don't want two processes to wake up at
	 * the same moment and both try to dequeue a single packet.
	 *
	 * The queue for event listeners (used by kqueue(9), see below) has
	 * to be protected, too, but we don't need the same level of
	 * complexity for that lock, so a simple spinning lock is fine.
	 */
	lockinit(&sc->sc_rdlock, PSOCK|PCATCH, "tapl", 0, LK_SLEEPFAIL);
	simple_lock_init(&sc->sc_kqlock);
}

/*
 * When detaching, we do the inverse of what is done in the attach
 * routine, in reverse order.
 */
static int
tap_detach(struct device* self, int flags)
{
	struct tap_softc *sc = (struct tap_softc *)self;
	struct ifnet *ifp = &sc->sc_ec.ec_if;
	int error, s;

	/*
	 * Some processes might be sleeping on "tap", so we have to make
	 * them release their hold on the device.
	 *
	 * The LK_DRAIN operation will wait for every locked process to
	 * release its hold.
	 */
	sc->sc_flags |= TAP_GOING;
	s = splnet();
	tap_stop(ifp, 1);
	if_down(ifp);
	splx(s);
	lockmgr(&sc->sc_rdlock, LK_DRAIN, NULL);

	/*
	 * Destroying a single leaf is a very straightforward operation using
	 * sysctl_destroyv.  One should be sure to always end the path with
	 * CTL_EOL.
	 */
	if ((error = sysctl_destroyv(NULL, CTL_NET, AF_LINK, tap_node,
	    device_unit(&sc->sc_dev), CTL_EOL)) != 0)
		aprint_error("%s: sysctl_destroyv returned %d, ignoring\n",
		    sc->sc_dev.dv_xname, error);
	ether_ifdetach(ifp);
	if_detach(ifp);
	ifmedia_delete_instance(&sc->sc_im, IFM_INST_ANY);

	return (0);
}

/*
 * This function is called by the ifmedia layer to notify the driver
 * that the user requested a media change.  A real driver would
 * reconfigure the hardware.
 */
static int
tap_mediachange(struct ifnet *ifp)
{
	return (0);
}

/*
 * Here the user asks for the currently used media.
 */
static void
tap_mediastatus(struct ifnet *ifp, struct ifmediareq *imr)
{
	struct tap_softc *sc = (struct tap_softc *)ifp->if_softc;
	imr->ifm_active = sc->sc_im.ifm_cur->ifm_media;
}

/*
 * This is the function where we SEND packets.
 *
 * There is no 'receive' equivalent.  A typical driver will get
 * interrupts from the hardware, and from there will inject new packets
 * into the network stack.
 *
 * Once handled, a packet must be freed.  A real driver might not be able
 * to fit all the pending packets into the hardware, and is allowed to
 * return before having sent all the packets.  It should then use the
 * if_flags flag IFF_OACTIVE to notify the upper layer.
 *
 * There are also other flags one should check, such as IFF_PAUSE.
 *
 * It is our duty to make packets available to BPF listeners.
 *
 * You should be aware that this function is called by the Ethernet layer
 * at splnet().
 *
 * When the device is opened, we have to pass the packet(s) to userland.
 * For that we stay in OACTIVE mode while userland gets the packets,
 * and we send a signal to the processes waiting to read.
 *
 * wakeup(sc) is the counterpart to the tsleep call in
 * tap_dev_read, while selnotify() is used for kevent(2) and
 * poll(2) (which includes select(2)) listeners.
 */
static void
tap_start(struct ifnet *ifp)
{
	struct tap_softc *sc = (struct tap_softc *)ifp->if_softc;
	struct mbuf *m0;

	if ((sc->sc_flags & TAP_INUSE) == 0) {
		/* Simply drop packets */
		for(;;) {
			IFQ_DEQUEUE(&ifp->if_snd, m0);
			if (m0 == NULL)
				return;

			ifp->if_opackets++;
#if NBPFILTER > 0
			if (ifp->if_bpf)
				bpf_mtap(ifp->if_bpf, m0);
#endif

			m_freem(m0);
		}
	} else if (!IFQ_IS_EMPTY(&ifp->if_snd)) {
		ifp->if_flags |= IFF_OACTIVE;
		wakeup(sc);
		selnotify(&sc->sc_rsel, 1);
		if (sc->sc_flags & TAP_ASYNCIO)
			fownsignal(sc->sc_pgid, SIGIO, POLL_IN,
			    POLLIN|POLLRDNORM, NULL);
	}
}

/*
 * A typical driver will only contain the following handlers for
 * ioctl calls, except SIOCSIFPHYADDR.
 * The latter is a hack I used to set the Ethernet address of the
 * faked device.
 *
 * Note that both ifmedia_ioctl() and ether_ioctl() have to be
 * called under splnet().
 */
static int
tap_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
	struct tap_softc *sc = (struct tap_softc *)ifp->if_softc;
	struct ifreq *ifr = (struct ifreq *)data;
	int s, error;

	s = splnet();

	switch (cmd) {
	case SIOCSIFMEDIA:
	case SIOCGIFMEDIA:
		error = ifmedia_ioctl(ifp, ifr, &sc->sc_im, cmd);
		break;
	case SIOCSIFPHYADDR:
		error = tap_lifaddr(ifp, cmd, (struct ifaliasreq *)data);
		break;
	default:
		error = ether_ioctl(ifp, cmd, data);
		if (error == ENETRESET)
			error = 0;
		break;
	}

	splx(s);

	return (error);
}

/*
 * Helper function to set the Ethernet address.  This shouldn't be done
 * here, and should actually be available to all Ethernet drivers, real
 * or not.
 */
static int
tap_lifaddr(struct ifnet *ifp, u_long cmd, struct ifaliasreq *ifra)
{
	struct sockaddr *sa = (struct sockaddr *)&ifra->ifra_addr;

	if (sa->sa_family != AF_LINK)
		return (EINVAL);

	memcpy(LLADDR(ifp->if_sadl), sa->sa_data, ETHER_ADDR_LEN);

	return (0);
}

/*
 * _init() would typically be called when an interface goes up,
 * meaning it should configure itself into the state in which it
 * can send packets.
 */
static int
tap_init(struct ifnet *ifp)
{
	ifp->if_flags |= IFF_RUNNING;

	tap_start(ifp);

	return (0);
}

/*
 * _stop() is called when an interface goes down.  It is our
 * responsibility to reflect that state by clearing the
 * IFF_RUNNING flag.
 *
 * We have to wake up all the sleeping processes to have the pending
 * read requests cancelled.
 */
static void
tap_stop(struct ifnet *ifp, int disable)
{
	struct tap_softc *sc = (struct tap_softc *)ifp->if_softc;

	ifp->if_flags &= ~IFF_RUNNING;
	wakeup(sc);
	selnotify(&sc->sc_rsel, 1);
	if (sc->sc_flags & TAP_ASYNCIO)
		fownsignal(sc->sc_pgid, SIGIO, POLL_HUP, 0, NULL);
}

/*
 * The 'create' command of ifconfig can be used to create
 * any numbered instance of a given device.  Thus we have to
 * make sure we have enough room in cd_devs to create the
 * user-specified instance.  config_attach_pseudo will do this
 * for us.
 */
static int
tap_clone_create(struct if_clone *ifc, int unit)
{
	if (tap_clone_creator(unit) == NULL) {
		aprint_error("%s%d: unable to attach an instance\n",
		    tap_cd.cd_name, unit);
		return (ENXIO);
	}

	return (0);
}

/*
 * tap(4) can be cloned in two ways:
 *   using 'ifconfig tap0 create', which will use the network
 *     interface cloning API, and call tap_clone_create above.
 *   opening the cloning device node, whose minor number is TAP_CLONER.
 *     See below for an explanation of how that part works.
 *
 * config_attach_pseudo can be called with unit = DVUNIT_ANY to have
 * autoconf(9) choose a unit number for us.  This is what happens when
 * the cloner is opened, while the ifcloner interface creates a device
 * with a specific unit number.
 */
static struct tap_softc *
tap_clone_creator(int unit)
{
	struct cfdata *cf;

	cf = malloc(sizeof(*cf), M_DEVBUF, M_WAITOK);
	cf->cf_name = tap_cd.cd_name;
	cf->cf_atname = tap_ca.ca_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	return (struct tap_softc *)config_attach_pseudo(cf);
}

/*
 * The clean design of if_clone and autoconf(9) makes that part
 * really straightforward.  The second argument of config_detach
 * means neither QUIET nor FORCED.
 */
static int
tap_clone_destroy(struct ifnet *ifp)
{
	return tap_clone_destroyer((struct device *)ifp->if_softc);
}

int
tap_clone_destroyer(struct device *dev)
{
	struct cfdata *cf = device_cfdata(dev);
	int error;

	if ((error = config_detach(dev, 0)) != 0)
		aprint_error("%s: unable to detach instance\n",
		    dev->dv_xname);
	free(cf, M_DEVBUF);

	return (error);
}

/*
 * tap(4) is a bit of a hybrid device.  It can be used in two different
 * ways:
 * 1. ifconfig tapN create, then use /dev/tapN to read/write off it.
 * 2. open /dev/tap, get a new interface created and read/write off it.
 *    That interface is destroyed when the process that had it created
 *    exits.
 *
 * The first way is managed by the cdevsw structure, and you access
 * interfaces through a (major, minor) mapping: tap4 is obtained by the
 * minor number 4.  The entry points for the cdevsw interface are
 * prefixed by tap_cdev_.
 *
 * The second way is the so-called "cloning" device.  It's a special
 * minor number (chosen as the maximal number, to allow as many tap
 * devices as possible).  The user first opens the cloner (e.g., /dev/tap),
 * and that call ends in tap_cdev_open.  The actual place where it is
 * handled is tap_dev_cloner.
 *
 * A tap device cannot be opened more than once at a time, so the cdevsw
 * part of open() does nothing but noting that the interface is being used
 * and hence ready to actually handle packets.
 */
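
/*
 * A minimal userland sketch of the second (cloning) path, under the
 * assumption that a /dev/tap cloner node exists.  Opening it creates a
 * fresh tapN interface; the TAPGIFNAME ioctl (from <net/if_tap.h>) then
 * tells us which one we got, so that it can be configured before frames
 * are exchanged over the descriptor.  Closing the descriptor destroys
 * the interface again.
 *
 *	#include <sys/types.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/socket.h>
 *	#include <net/if.h>
 *	#include <net/if_tap.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		struct ifreq ifr;
 *		int fd;
 *
 *		fd = open("/dev/tap", O_RDWR);	// creates a new tapN
 *		if (fd == -1 || ioctl(fd, TAPGIFNAME, &ifr) == -1)
 *			return (1);
 *		// the interface still has to be brought up, e.g. with
 *		// "ifconfig <name> up", before reads will return frames
 *		printf("attached to %s\n", ifr.ifr_name);
 *		close(fd);	// destroys the interface
 *		return (0);
 *	}
 */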

static int
tap_cdev_open(dev_t dev, int flags, int fmt, struct lwp *l)
{
	struct tap_softc *sc;

	if (minor(dev) == TAP_CLONER)
		return tap_dev_cloner(l);

	sc = (struct tap_softc *)device_lookup(&tap_cd, minor(dev));
	if (sc == NULL)
		return (ENXIO);

	/* The device can only be opened once */
	if (sc->sc_flags & TAP_INUSE)
		return (EBUSY);
	sc->sc_flags |= TAP_INUSE;
	return (0);
}

/*
 * There are several kinds of cloning devices, and the simplest is the one
 * tap(4) uses.  What it does is replace the file descriptor with a new one,
 * with its own fileops structure (which maps to the various read, write,
 * ioctl functions).  It starts by allocating a new file descriptor with
 * falloc, then actually creates the new tap device.
 *
 * Once those two steps are successful, we can re-wire the existing file
 * descriptor to its new self.  This is done with fdclone(): it fills the fp
 * structure as needed (notably f_data gets filled with the fifth parameter
 * passed, the unit number of the tap device, which will allow us to
 * identify the device later), and returns EMOVEFD.
 *
 * That magic value is interpreted by sys_open() which then replaces the
 * current file descriptor by the new one (through a magic member of struct
 * lwp, l_dupfd).
 *
 * The tap device is flagged as being busy since it otherwise could be
 * externally accessed through the corresponding device node with the cdevsw
 * interface.
 */

static int
tap_dev_cloner(struct lwp *l)
{
	struct tap_softc *sc;
	struct file *fp;
	int error, fd;

	if ((error = falloc(l->l_proc, &fp, &fd)) != 0)
		return (error);

	if ((sc = tap_clone_creator(DVUNIT_ANY)) == NULL) {
		FILE_UNUSE(fp, l);
		ffree(fp);
		return (ENXIO);
	}

	sc->sc_flags |= TAP_INUSE;

	return fdclone(l, fp, fd, FREAD|FWRITE, &tap_fileops,
	    (void *)(intptr_t)device_unit(&sc->sc_dev));
}

/*
 * While all other operations (read, write, ioctl, poll and kqfilter) are
 * really the same whether we are in cdevsw or fileops mode, the close()
 * function is slightly different in the two cases.
 *
 * As for the others, the core of it is shared in tap_dev_close.  What
 * it does is sufficient for the cdevsw interface, but the cloning interface
 * needs another thing: the interface is destroyed when the process that
 * created it closes it.
 */
static int
tap_cdev_close(dev_t dev, int flags, int fmt, struct lwp *l)
{
	struct tap_softc *sc =
	    (struct tap_softc *)device_lookup(&tap_cd, minor(dev));

	if (sc == NULL)
		return (ENXIO);

	return tap_dev_close(sc);
}

/*
 * It might happen that the administrator used ifconfig to externally
 * destroy the interface.  In that case, tap_fops_close will be called
 * while tap_detach is already happening.  If we called it again from
 * here, we would deadlock.  TAP_GOING ensures that this situation
 * doesn't happen.
 */
static int
tap_fops_close(struct file *fp, struct lwp *l)
{
	int unit = (intptr_t)fp->f_data;
	struct tap_softc *sc;
	int error;

	sc = (struct tap_softc *)device_lookup(&tap_cd, unit);
	if (sc == NULL)
		return (ENXIO);

	/* tap_dev_close currently always succeeds, but it might not
	 * always be the case. */
	if ((error = tap_dev_close(sc)) != 0)
		return (error);

	/* Destroy the device now that it is no longer useful,
	 * unless it's already being destroyed. */
	if ((sc->sc_flags & TAP_GOING) != 0)
		return (0);

	return tap_clone_destroyer((struct device *)sc);
}

static int
tap_dev_close(struct tap_softc *sc)
{
	struct ifnet *ifp;
	int s;

	s = splnet();
	/* Let tap_start handle packets again */
	ifp = &sc->sc_ec.ec_if;
	ifp->if_flags &= ~IFF_OACTIVE;

	/* Purge output queue */
	if (!(IFQ_IS_EMPTY(&ifp->if_snd))) {
		struct mbuf *m;

		for (;;) {
			IFQ_DEQUEUE(&ifp->if_snd, m);
			if (m == NULL)
				break;

			ifp->if_opackets++;
#if NBPFILTER > 0
			if (ifp->if_bpf)
				bpf_mtap(ifp->if_bpf, m);
#endif
			/* The dequeued packet is handled, free it */
			m_freem(m);
		}
	}
	splx(s);

	sc->sc_flags &= ~(TAP_INUSE | TAP_ASYNCIO);

	return (0);
}

static int
tap_cdev_read(dev_t dev, struct uio *uio, int flags)
{
	return tap_dev_read(minor(dev), uio, flags);
}

static int
tap_fops_read(struct file *fp, off_t *offp, struct uio *uio,
    kauth_cred_t cred, int flags)
{
	return tap_dev_read((intptr_t)fp->f_data, uio, flags);
}

static int
tap_dev_read(int unit, struct uio *uio, int flags)
{
	struct tap_softc *sc =
	    (struct tap_softc *)device_lookup(&tap_cd, unit);
	struct ifnet *ifp;
	struct mbuf *m, *n;
	int error = 0, s;

	if (sc == NULL)
		return (ENXIO);

	ifp = &sc->sc_ec.ec_if;
	if ((ifp->if_flags & IFF_UP) == 0)
		return (EHOSTDOWN);

	/*
	 * In the TAP_NBIO case, we have to make sure we won't be sleeping
	 */
	if ((sc->sc_flags & TAP_NBIO) &&
	    lockstatus(&sc->sc_rdlock) == LK_EXCLUSIVE)
		return (EWOULDBLOCK);
	error = lockmgr(&sc->sc_rdlock, LK_EXCLUSIVE, NULL);
	if (error != 0)
		return (error);

	s = splnet();
	if (IFQ_IS_EMPTY(&ifp->if_snd)) {
		ifp->if_flags &= ~IFF_OACTIVE;
		splx(s);
		/*
		 * We must release the lock before sleeping, and re-acquire
		 * it after.
		 */
		(void)lockmgr(&sc->sc_rdlock, LK_RELEASE, NULL);
		if (sc->sc_flags & TAP_NBIO)
			error = EWOULDBLOCK;
		else
			error = tsleep(sc, PSOCK|PCATCH, "tap", 0);

		if (error != 0)
			return (error);
		/* The device might have been downed */
		if ((ifp->if_flags & IFF_UP) == 0)
			return (EHOSTDOWN);
		if ((sc->sc_flags & TAP_NBIO) &&
		    lockstatus(&sc->sc_rdlock) == LK_EXCLUSIVE)
			return (EWOULDBLOCK);
		error = lockmgr(&sc->sc_rdlock, LK_EXCLUSIVE, NULL);
		if (error != 0)
			return (error);
		s = splnet();
	}

	IFQ_DEQUEUE(&ifp->if_snd, m);
	ifp->if_flags &= ~IFF_OACTIVE;
	splx(s);
	if (m == NULL) {
		error = 0;
		goto out;
	}

	ifp->if_opackets++;
#if NBPFILTER > 0
	if (ifp->if_bpf)
		bpf_mtap(ifp->if_bpf, m);
#endif

	/*
	 * One read is one packet.
	 */
	do {
		error = uiomove(mtod(m, caddr_t),
		    min(m->m_len, uio->uio_resid), uio);
		MFREE(m, n);
		m = n;
	} while (m != NULL && uio->uio_resid > 0 && error == 0);

	if (m != NULL)
		m_freem(m);

out:
	(void)lockmgr(&sc->sc_rdlock, LK_RELEASE, NULL);
	return (error);
}

static int
tap_cdev_write(dev_t dev, struct uio *uio, int flags)
{
	return tap_dev_write(minor(dev), uio, flags);
}

static int
tap_fops_write(struct file *fp, off_t *offp, struct uio *uio,
    kauth_cred_t cred, int flags)
{
	return tap_dev_write((intptr_t)fp->f_data, uio, flags);
}

static int
tap_dev_write(int unit, struct uio *uio, int flags)
{
	struct tap_softc *sc =
	    (struct tap_softc *)device_lookup(&tap_cd, unit);
	struct ifnet *ifp;
	struct mbuf *m, **mp;
	int error = 0;
	int s;

	if (sc == NULL)
		return (ENXIO);

	ifp = &sc->sc_ec.ec_if;

	/* One write, one packet, that's the rule */
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m == NULL) {
		ifp->if_ierrors++;
		return (ENOBUFS);
	}
	m->m_pkthdr.len = uio->uio_resid;

	mp = &m;
	while (error == 0 && uio->uio_resid > 0) {
		if (*mp != m) {
			MGET(*mp, M_DONTWAIT, MT_DATA);
			if (*mp == NULL) {
				error = ENOBUFS;
				break;
			}
		}
		(*mp)->m_len = min(MHLEN, uio->uio_resid);
		error = uiomove(mtod(*mp, caddr_t), (*mp)->m_len, uio);
		mp = &(*mp)->m_next;
	}
	if (error) {
		ifp->if_ierrors++;
		m_freem(m);
		return (error);
	}

	ifp->if_ipackets++;
	m->m_pkthdr.rcvif = ifp;

#if NBPFILTER > 0
	if (ifp->if_bpf)
		bpf_mtap(ifp->if_bpf, m);
#endif
	s = splnet();
	(*ifp->if_input)(ifp, m);
	splx(s);

	return (0);
}

static int
tap_cdev_ioctl(dev_t dev, u_long cmd, caddr_t data, int flags,
    struct lwp *l)
{
	return tap_dev_ioctl(minor(dev), cmd, data, l);
}

static int
tap_fops_ioctl(struct file *fp, u_long cmd, void *data, struct lwp *l)
{
	return tap_dev_ioctl((intptr_t)fp->f_data, cmd, (caddr_t)data, l);
}

static int
tap_dev_ioctl(int unit, u_long cmd, caddr_t data, struct lwp *l)
{
	struct tap_softc *sc =
	    (struct tap_softc *)device_lookup(&tap_cd, unit);
	int error = 0;

	if (sc == NULL)
		return (ENXIO);

	switch (cmd) {
	case FIONREAD:
		{
			struct ifnet *ifp = &sc->sc_ec.ec_if;
			struct mbuf *m;
			int s;

			s = splnet();
			IFQ_POLL(&ifp->if_snd, m);

			if (m == NULL)
				*(int *)data = 0;
			else
				*(int *)data = m->m_pkthdr.len;
			splx(s);
		} break;
	case TIOCSPGRP:
	case FIOSETOWN:
		error = fsetown(l->l_proc, &sc->sc_pgid, cmd, data);
		break;
	case TIOCGPGRP:
	case FIOGETOWN:
		error = fgetown(l->l_proc, sc->sc_pgid, cmd, data);
		break;
	case FIOASYNC:
		if (*(int *)data)
			sc->sc_flags |= TAP_ASYNCIO;
		else
			sc->sc_flags &= ~TAP_ASYNCIO;
		break;
	case FIONBIO:
		if (*(int *)data)
			sc->sc_flags |= TAP_NBIO;
		else
			sc->sc_flags &= ~TAP_NBIO;
		break;
	case TAPGIFNAME:
		{
			struct ifreq *ifr = (struct ifreq *)data;
			struct ifnet *ifp = &sc->sc_ec.ec_if;

			strlcpy(ifr->ifr_name, ifp->if_xname, IFNAMSIZ);
		} break;
	default:
		error = ENOTTY;
		break;
	}

	return (error);
}

static int
tap_cdev_poll(dev_t dev, int events, struct lwp *l)
{
	return tap_dev_poll(minor(dev), events, l);
}

static int
tap_fops_poll(struct file *fp, int events, struct lwp *l)
{
	return tap_dev_poll((intptr_t)fp->f_data, events, l);
}

static int
tap_dev_poll(int unit, int events, struct lwp *l)
{
	struct tap_softc *sc =
	    (struct tap_softc *)device_lookup(&tap_cd, unit);
	int revents = 0;

	if (sc == NULL)
		return (ENXIO);

	if (events & (POLLIN|POLLRDNORM)) {
		struct ifnet *ifp = &sc->sc_ec.ec_if;
		struct mbuf *m;
		int s;

		s = splnet();
		IFQ_POLL(&ifp->if_snd, m);
		splx(s);

		if (m != NULL)
			revents |= events & (POLLIN|POLLRDNORM);
		else {
			simple_lock(&sc->sc_kqlock);
			selrecord(l, &sc->sc_rsel);
			simple_unlock(&sc->sc_kqlock);
		}
	}
	revents |= events & (POLLOUT|POLLWRNORM);

	return (revents);
}

static struct filterops tap_read_filterops = { 1, NULL, tap_kqdetach,
	tap_kqread };
static struct filterops tap_seltrue_filterops = { 1, NULL, tap_kqdetach,
	filt_seltrue };

static int
tap_cdev_kqfilter(dev_t dev, struct knote *kn)
{
	return tap_dev_kqfilter(minor(dev), kn);
}

static int
tap_fops_kqfilter(struct file *fp, struct knote *kn)
{
	return tap_dev_kqfilter((intptr_t)fp->f_data, kn);
}

static int
tap_dev_kqfilter(int unit, struct knote *kn)
{
	struct tap_softc *sc =
	    (struct tap_softc *)device_lookup(&tap_cd, unit);

	if (sc == NULL)
		return (ENXIO);

	switch(kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &tap_read_filterops;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &tap_seltrue_filterops;
		break;
	default:
		return (1);
	}

	kn->kn_hook = sc;
	simple_lock(&sc->sc_kqlock);
	SLIST_INSERT_HEAD(&sc->sc_rsel.sel_klist, kn, kn_selnext);
	simple_unlock(&sc->sc_kqlock);
	return (0);
}

static void
tap_kqdetach(struct knote *kn)
{
	struct tap_softc *sc = (struct tap_softc *)kn->kn_hook;

	simple_lock(&sc->sc_kqlock);
	SLIST_REMOVE(&sc->sc_rsel.sel_klist, kn, knote, kn_selnext);
	simple_unlock(&sc->sc_kqlock);
}

static int
tap_kqread(struct knote *kn, long hint)
{
	struct tap_softc *sc = (struct tap_softc *)kn->kn_hook;
	struct ifnet *ifp = &sc->sc_ec.ec_if;
	struct mbuf *m;
	int s;

	s = splnet();
	IFQ_POLL(&ifp->if_snd, m);

	if (m == NULL)
		kn->kn_data = 0;
	else
		kn->kn_data = m->m_pkthdr.len;
	splx(s);
	return (kn->kn_data != 0 ? 1 : 0);
}
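
/*
 * A minimal userland sketch of waiting for frames with kqueue(2); the
 * same descriptor also works with poll(2) and select(2).  The device
 * path is an illustrative assumption and error handling is reduced to
 * the bare minimum.
 *
 *	#include <sys/types.h>
 *	#include <sys/event.h>
 *	#include <sys/time.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		struct kevent ev;
 *		char frame[1514];
 *		int fd, kq;
 *
 *		fd = open("/dev/tap0", O_RDWR);
 *		kq = kqueue();
 *		if (fd == -1 || kq == -1)
 *			return (1);
 *		EV_SET(&ev, fd, EVFILT_READ, EV_ADD | EV_ENABLE, 0, 0, 0);
 *		if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)	// register
 *			return (1);
 *		if (kevent(kq, NULL, 0, &ev, 1, NULL) == 1)	// wait for a frame
 *			(void)read(fd, frame, sizeof(frame));
 *		return (0);
 *	}
 */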

/*
 * sysctl management routines
 * You can set the address of an interface through:
 * net.link.tap.tap<number>
 *
 * Note the consistent use of tap_log in order to use
 * sysctl_teardown at unload time.
 *
 * In the kernel you will find a lot of SYSCTL_SETUP blocks.  Those
 * blocks register a function in a special section of the kernel
 * (called a link set) which is used at init_sysctl() time to cycle
 * through all those functions to create the kernel's sysctl tree.
 *
 * It is not (currently) possible to use link sets in a LKM, so the
 * easiest is to simply call our own setup routine at load time.
 *
 * In the SYSCTL_SETUP blocks you find in the kernel, nodes have the
 * CTLFLAG_PERMANENT flag, meaning they cannot be removed.  Once the
 * whole kernel sysctl tree is built, it is not possible to add any
 * permanent node.
 *
 * It should be noted that we're not saving the sysctlnode pointer
 * we are returned when creating the "tap" node.  That structure
 * cannot be trusted once out of the calling function, as it might
 * get reused.  So we just save the MIB number, and always give the
 * full path starting from the root for later calls to sysctl_createv
 * and sysctl_destroyv.
 */
SYSCTL_SETUP(sysctl_tap_setup, "sysctl net.link.tap subtree setup")
{
	const struct sysctlnode *node;
	int error = 0;

	if ((error = sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "net", NULL,
	    NULL, 0, NULL, 0,
	    CTL_NET, CTL_EOL)) != 0)
		return;

	if ((error = sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "link", NULL,
	    NULL, 0, NULL, 0,
	    CTL_NET, AF_LINK, CTL_EOL)) != 0)
		return;

	/*
	 * The first four parameters of sysctl_createv are for management.
	 *
	 * The four that follow, here starting with a '0' for the flags,
	 * describe the node.
	 *
	 * The next series of four set its value, through various possible
	 * means.
	 *
	 * Last but not least, the path to the node is described.  That path
	 * is relative to the given root (third argument).  Here we're
	 * starting from the root.
	 */
	if ((error = sysctl_createv(clog, 0, NULL, &node,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "tap", NULL,
	    NULL, 0, NULL, 0,
	    CTL_NET, AF_LINK, CTL_CREATE, CTL_EOL)) != 0)
		return;
	tap_node = node->sysctl_num;
}

/*
 * The helper functions make Andrew Brown's interface really
 * shine.  It makes it possible to create values on the fly whether
 * the sysctl value is read or written.
 *
 * As shown as an example in the man page, the first step is to
 * create a copy of the node to have sysctl_lookup work on it.
 *
 * Here, we have more work to do than just a copy, since we have
 * to create the string.  The first step is to collect the actual
 * value of the node, which is a convenient pointer to the softc
 * of the interface.  From there we create the string and use it
 * as the value, but only for the *copy* of the node.
 *
 * Then we let sysctl_lookup do the magic, which consists in
 * setting oldp and newp as required by the operation.  When the
 * value is read, that means that the string will be copied to
 * the user, and when it is written, the new value will be copied
 * over in the addr array.
 *
 * If newp is NULL, the user was reading the value, so we don't
 * have anything else to do.  If a new value was written, we
 * have to check it.
 *
 * If it is incorrect, we can return an error and leave 'node' as
 * it is: since it is a copy of the actual node, the change will
 * be forgotten.
 *
 * Upon a correct input, we commit the change to the ifnet
 * structure of our interface.
 */
static int
tap_sysctl_handler(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	struct tap_softc *sc;
	struct ifnet *ifp;
	int error;
	size_t len;
	char addr[3 * ETHER_ADDR_LEN];

	node = *rnode;
	sc = node.sysctl_data;
	ifp = &sc->sc_ec.ec_if;
	(void)ether_snprintf(addr, sizeof(addr), LLADDR(ifp->if_sadl));
	node.sysctl_data = addr;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return (error);

	len = strlen(addr);
	if (len < 11 || len > 17)
		return (EINVAL);

	/* Commit change */
	if (tap_ether_aton(LLADDR(ifp->if_sadl), addr) != 0)
		return (EINVAL);
	return (error);
}

/*
 * ether_aton implementation, not using a static buffer.
 */
static int
tap_ether_aton(u_char *dest, char *str)
{
	int i;
	char *cp = str;
	u_char val[6];

#define	set_value			\
	if (*cp > '9' && *cp < 'a')	\
		*cp -= 'A' - 10;	\
	else if (*cp > '9')		\
		*cp -= 'a' - 10;	\
	else				\
		*cp -= '0'

	for (i = 0; i < 6; i++, cp++) {
		if (!isxdigit(*cp))
			return (1);
		set_value;
		val[i] = *cp++;
		if (isxdigit(*cp)) {
			set_value;
			val[i] *= 16;
			val[i] += *cp++;
		}
		if (*cp == ':' || i == 5)
			continue;
		else
			return (1);
	}
	memcpy(dest, val, 6);
	return (0);
}
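
/*
 * A minimal userland sketch of driving the node created above: reading
 * and changing the Ethernet address of tap0 through net.link.tap.tap0.
 * The interface name, the chosen address and the use of sysctlbyname(3)
 * (rather than sysctl(8)) are assumptions made for the example only; the
 * handler accepts the usual colon-separated "xx:xx:xx:xx:xx:xx" form and
 * changing the value requires the appropriate privileges.
 *
 *	#include <sys/param.h>
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *
 *	int
 *	main(void)
 *	{
 *		char oldaddr[18], newaddr[] = "f2:0b:a4:00:00:01";
 *		size_t len = sizeof(oldaddr);
 *
 *		if (sysctlbyname("net.link.tap.tap0", oldaddr, &len,
 *		    newaddr, strlen(newaddr) + 1) == -1)
 *			return (1);
 *		printf("address was %s, now %s\n", oldaddr, newaddr);
 *		return (0);
 *	}
 */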
1252 * 1253 * If newp is NULL, the user was reading the value, so we don't 1254 * have anything else to do. If a new value was written, we 1255 * have to check it. 1256 * 1257 * If it is incorrect, we can return an error and leave 'node' as 1258 * it is: since it is a copy of the actual node, the change will 1259 * be forgotten. 1260 * 1261 * Upon a correct input, we commit the change to the ifnet 1262 * structure of our interface. 1263 */ 1264 static int 1265 tap_sysctl_handler(SYSCTLFN_ARGS) 1266 { 1267 struct sysctlnode node; 1268 struct tap_softc *sc; 1269 struct ifnet *ifp; 1270 int error; 1271 size_t len; 1272 char addr[3 * ETHER_ADDR_LEN]; 1273 1274 node = *rnode; 1275 sc = node.sysctl_data; 1276 ifp = &sc->sc_ec.ec_if; 1277 (void)ether_snprintf(addr, sizeof(addr), LLADDR(ifp->if_sadl)); 1278 node.sysctl_data = addr; 1279 error = sysctl_lookup(SYSCTLFN_CALL(&node)); 1280 if (error || newp == NULL) 1281 return (error); 1282 1283 len = strlen(addr); 1284 if (len < 11 || len > 17) 1285 return (EINVAL); 1286 1287 /* Commit change */ 1288 if (tap_ether_aton(LLADDR(ifp->if_sadl), addr) != 0) 1289 return (EINVAL); 1290 return (error); 1291 } 1292 1293 /* 1294 * ether_aton implementation, not using a static buffer. 1295 */ 1296 static int 1297 tap_ether_aton(u_char *dest, char *str) 1298 { 1299 int i; 1300 char *cp = str; 1301 u_char val[6]; 1302 1303 #define set_value \ 1304 if (*cp > '9' && *cp < 'a') \ 1305 *cp -= 'A' - 10; \ 1306 else if (*cp > '9') \ 1307 *cp -= 'a' - 10; \ 1308 else \ 1309 *cp -= '0' 1310 1311 for (i = 0; i < 6; i++, cp++) { 1312 if (!isxdigit(*cp)) 1313 return (1); 1314 set_value; 1315 val[i] = *cp++; 1316 if (isxdigit(*cp)) { 1317 set_value; 1318 val[i] *= 16; 1319 val[i] += *cp++; 1320 } 1321 if (*cp == ':' || i == 5) 1322 continue; 1323 else 1324 return (1); 1325 } 1326 memcpy(dest, val, 6); 1327 return (0); 1328 } 1329