1 /* $NetBSD: if_tap.c,v 1.14 2006/03/16 15:57:59 christos Exp $ */ 2 3 /* 4 * Copyright (c) 2003, 2004 The NetBSD Foundation. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to the NetBSD Foundation 8 * by Quentin Garnier. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the NetBSD 21 * Foundation, Inc. and its contributors. 22 * 4. Neither the name of The NetBSD Foundation nor the names of its 23 * contributors may be used to endorse or promote products derived 24 * from this software without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 29 * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 36 * POSSIBILITY OF SUCH DAMAGE. 37 */ 38 39 /* 40 * tap(4) is a virtual Ethernet interface. It appears as a real Ethernet 41 * device to the system, but can also be accessed by userland through a 42 * character device interface, which allows reading and injecting frames. 43 */ 44 45 #include <sys/cdefs.h> 46 __KERNEL_RCSID(0, "$NetBSD: if_tap.c,v 1.14 2006/03/16 15:57:59 christos Exp $"); 47 48 #if defined(_KERNEL_OPT) 49 #include "bpfilter.h" 50 #endif 51 52 #include <sys/param.h> 53 #include <sys/systm.h> 54 #include <sys/kernel.h> 55 #include <sys/malloc.h> 56 #include <sys/conf.h> 57 #include <sys/device.h> 58 #include <sys/file.h> 59 #include <sys/filedesc.h> 60 #include <sys/ksyms.h> 61 #include <sys/poll.h> 62 #include <sys/select.h> 63 #include <sys/sockio.h> 64 #include <sys/sysctl.h> 65 66 #include <net/if.h> 67 #include <net/if_dl.h> 68 #include <net/if_ether.h> 69 #include <net/if_media.h> 70 #include <net/if_tap.h> 71 #if NBPFILTER > 0 72 #include <net/bpf.h> 73 #endif 74 75 /* 76 * sysctl node management 77 * 78 * It's not really possible to use a SYSCTL_SETUP block with 79 * current LKM implementation, so it is easier to just define 80 * our own function. 81 * 82 * The handler function is a "helper" in Andrew Brown's sysctl 83 * framework terminology. It is used as a gateway for sysctl 84 * requests over the nodes. 85 * 86 * tap_log allows the module to log creations of nodes and 87 * destroy them all at once using sysctl_teardown. 
88 */ 89 static int tap_node; 90 static int tap_sysctl_handler(SYSCTLFN_PROTO); 91 SYSCTL_SETUP_PROTO(sysctl_tap_setup); 92 93 /* 94 * Since we're an Ethernet device, we need the 3 following 95 * components: a leading struct device, a struct ethercom, 96 * and also a struct ifmedia since we don't attach a PHY to 97 * ourselves. We could emulate one, but there's no real 98 * point. 99 */ 100 101 struct tap_softc { 102 struct device sc_dev; 103 struct ifmedia sc_im; 104 struct ethercom sc_ec; 105 int sc_flags; 106 #define TAP_INUSE 0x00000001 /* tap device can only be opened once */ 107 #define TAP_ASYNCIO 0x00000002 /* user is using async I/O (SIGIO) on the device */ 108 #define TAP_NBIO 0x00000004 /* user wants calls to avoid blocking */ 109 #define TAP_GOING 0x00000008 /* interface is being destroyed */ 110 struct selinfo sc_rsel; 111 pid_t sc_pgid; /* For async. IO */ 112 struct lock sc_rdlock; 113 struct simplelock sc_kqlock; 114 }; 115 116 /* autoconf(9) glue */ 117 118 void tapattach(int); 119 120 static int tap_match(struct device *, struct cfdata *, void *); 121 static void tap_attach(struct device *, struct device *, void *); 122 static int tap_detach(struct device*, int); 123 124 /* Ethernet address helper functions */ 125 126 static int tap_ether_aton(u_char *, char *); 127 128 CFATTACH_DECL(tap, sizeof(struct tap_softc), 129 tap_match, tap_attach, tap_detach, NULL); 130 extern struct cfdriver tap_cd; 131 132 /* Real device access routines */ 133 static int tap_dev_close(struct tap_softc *); 134 static int tap_dev_read(int, struct uio *, int); 135 static int tap_dev_write(int, struct uio *, int); 136 static int tap_dev_ioctl(int, u_long, caddr_t, struct lwp *); 137 static int tap_dev_poll(int, int, struct lwp *); 138 static int tap_dev_kqfilter(int, struct knote *); 139 140 /* Fileops access routines */ 141 static int tap_fops_close(struct file *, struct lwp *); 142 static int tap_fops_read(struct file *, off_t *, struct uio *, 143 struct ucred *, int); 
144 static int tap_fops_write(struct file *, off_t *, struct uio *, 145 struct ucred *, int); 146 static int tap_fops_ioctl(struct file *, u_long, void *, 147 struct lwp *); 148 static int tap_fops_poll(struct file *, int, struct lwp *); 149 static int tap_fops_kqfilter(struct file *, struct knote *); 150 151 static const struct fileops tap_fileops = { 152 tap_fops_read, 153 tap_fops_write, 154 tap_fops_ioctl, 155 fnullop_fcntl, 156 tap_fops_poll, 157 fbadop_stat, 158 tap_fops_close, 159 tap_fops_kqfilter, 160 }; 161 162 /* Helper for cloning open() */ 163 static int tap_dev_cloner(struct lwp *); 164 165 /* Character device routines */ 166 static int tap_cdev_open(dev_t, int, int, struct lwp *); 167 static int tap_cdev_close(dev_t, int, int, struct lwp *); 168 static int tap_cdev_read(dev_t, struct uio *, int); 169 static int tap_cdev_write(dev_t, struct uio *, int); 170 static int tap_cdev_ioctl(dev_t, u_long, caddr_t, int, struct lwp *); 171 static int tap_cdev_poll(dev_t, int, struct lwp *); 172 static int tap_cdev_kqfilter(dev_t, struct knote *); 173 174 const struct cdevsw tap_cdevsw = { 175 tap_cdev_open, tap_cdev_close, 176 tap_cdev_read, tap_cdev_write, 177 tap_cdev_ioctl, nostop, notty, 178 tap_cdev_poll, nommap, 179 tap_cdev_kqfilter, 180 }; 181 182 #define TAP_CLONER 0xfffff /* Maximal minor value */ 183 184 /* kqueue-related routines */ 185 static void tap_kqdetach(struct knote *); 186 static int tap_kqread(struct knote *, long); 187 188 /* 189 * Those are needed by the if_media interface. 190 */ 191 192 static int tap_mediachange(struct ifnet *); 193 static void tap_mediastatus(struct ifnet *, struct ifmediareq *); 194 195 /* 196 * Those are needed by the ifnet interface, and would typically be 197 * there for any network interface driver. 198 * Some other routines are optional: watchdog and drain. 
199 */ 200 201 static void tap_start(struct ifnet *); 202 static void tap_stop(struct ifnet *, int); 203 static int tap_init(struct ifnet *); 204 static int tap_ioctl(struct ifnet *, u_long, caddr_t); 205 206 /* This is an internal function to keep tap_ioctl readable */ 207 static int tap_lifaddr(struct ifnet *, u_long, struct ifaliasreq *); 208 209 /* 210 * tap is a clonable interface, although it is highly unrealistic for 211 * an Ethernet device. 212 * 213 * Here are the bits needed for a clonable interface. 214 */ 215 static int tap_clone_create(struct if_clone *, int); 216 static int tap_clone_destroy(struct ifnet *); 217 218 struct if_clone tap_cloners = IF_CLONE_INITIALIZER("tap", 219 tap_clone_create, 220 tap_clone_destroy); 221 222 /* Helper functionis shared by the two cloning code paths */ 223 static struct tap_softc * tap_clone_creator(int); 224 int tap_clone_destroyer(struct device *); 225 226 void 227 tapattach(int n) 228 { 229 int error; 230 231 error = config_cfattach_attach(tap_cd.cd_name, &tap_ca); 232 if (error) { 233 aprint_error("%s: unable to register cfattach\n", 234 tap_cd.cd_name); 235 (void)config_cfdriver_detach(&tap_cd); 236 return; 237 } 238 239 if_clone_attach(&tap_cloners); 240 } 241 242 /* Pretty much useless for a pseudo-device */ 243 static int 244 tap_match(struct device *self, struct cfdata *cfdata, void *arg) 245 { 246 return (1); 247 } 248 249 void 250 tap_attach(struct device *parent, struct device *self, void *aux) 251 { 252 struct tap_softc *sc = (struct tap_softc *)self; 253 struct ifnet *ifp; 254 u_int8_t enaddr[ETHER_ADDR_LEN] = 255 { 0xf2, 0x0b, 0xa4, 0xff, 0xff, 0xff }; 256 char enaddrstr[3 * ETHER_ADDR_LEN]; 257 uint32_t ui; 258 int error; 259 const struct sysctlnode *node; 260 261 aprint_normal("%s: faking Ethernet device\n", 262 self->dv_xname); 263 264 /* 265 * In order to obtain unique initial Ethernet address on a host, 266 * do some randomisation using mono_time. 
It's not meant for anything 267 * but avoiding hard-coding an address. 268 */ 269 ui = (mono_time.tv_sec ^ mono_time.tv_usec) & 0xffffff; 270 memcpy(enaddr+3, (u_int8_t *)&ui, 3); 271 272 aprint_normal("%s: Ethernet address %s\n", sc->sc_dev.dv_xname, 273 ether_snprintf(enaddrstr, sizeof(enaddrstr), enaddr)); 274 275 /* 276 * Why 1000baseT? Why not? You can add more. 277 * 278 * Note that there are 3 steps: init, one or several additions to 279 * list of supported media, and in the end, the selection of one 280 * of them. 281 */ 282 ifmedia_init(&sc->sc_im, 0, tap_mediachange, tap_mediastatus); 283 ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_1000_T, 0, NULL); 284 ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_1000_T|IFM_FDX, 0, NULL); 285 ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_100_TX, 0, NULL); 286 ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_100_TX|IFM_FDX, 0, NULL); 287 ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_10_T, 0, NULL); 288 ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_10_T|IFM_FDX, 0, NULL); 289 ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_AUTO, 0, NULL); 290 ifmedia_set(&sc->sc_im, IFM_ETHER|IFM_AUTO); 291 292 /* 293 * One should note that an interface must do multicast in order 294 * to support IPv6. 295 */ 296 ifp = &sc->sc_ec.ec_if; 297 strcpy(ifp->if_xname, sc->sc_dev.dv_xname); 298 ifp->if_softc = sc; 299 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; 300 ifp->if_ioctl = tap_ioctl; 301 ifp->if_start = tap_start; 302 ifp->if_stop = tap_stop; 303 ifp->if_init = tap_init; 304 IFQ_SET_READY(&ifp->if_snd); 305 306 sc->sc_ec.ec_capabilities = ETHERCAP_VLAN_MTU | ETHERCAP_JUMBO_MTU; 307 308 /* Those steps are mandatory for an Ethernet driver, the fisrt call 309 * being common to all network interface drivers. */ 310 if_attach(ifp); 311 ether_ifattach(ifp, enaddr); 312 313 sc->sc_flags = 0; 314 315 /* 316 * Add a sysctl node for that interface. 
317 * 318 * The pointer transmitted is not a string, but instead a pointer to 319 * the softc structure, which we can use to build the string value on 320 * the fly in the helper function of the node. See the comments for 321 * tap_sysctl_handler for details. 322 */ 323 if ((error = sysctl_createv(NULL, 0, NULL, 324 &node, CTLFLAG_READWRITE, 325 CTLTYPE_STRING, sc->sc_dev.dv_xname, NULL, 326 tap_sysctl_handler, 0, sc, 18, 327 CTL_NET, AF_LINK, tap_node, sc->sc_dev.dv_unit, CTL_EOL)) != 0) 328 aprint_error("%s: sysctl_createv returned %d, ignoring\n", 329 sc->sc_dev.dv_xname, error); 330 331 /* 332 * Initialize the two locks for the device. 333 * 334 * We need a lock here because even though the tap device can be 335 * opened only once, the file descriptor might be passed to another 336 * process, say a fork(2)ed child. 337 * 338 * The Giant saves us from most of the hassle, but since the read 339 * operation can sleep, we don't want two processes to wake up at 340 * the same moment and both try and dequeue a single packet. 341 * 342 * The queue for event listeners (used by kqueue(9), see below) has 343 * to be protected, too, but we don't need the same level of 344 * complexity for that lock, so a simple spinning lock is fine. 345 */ 346 lockinit(&sc->sc_rdlock, PSOCK|PCATCH, "tapl", 0, LK_SLEEPFAIL); 347 simple_lock_init(&sc->sc_kqlock); 348 } 349 350 /* 351 * When detaching, we do the inverse of what is done in the attach 352 * routine, in reversed order. 353 */ 354 static int 355 tap_detach(struct device* self, int flags) 356 { 357 struct tap_softc *sc = (struct tap_softc *)self; 358 struct ifnet *ifp = &sc->sc_ec.ec_if; 359 int error, s; 360 361 /* 362 * Some processes might be sleeping on "tap", so we have to make 363 * them release their hold on the device. 364 * 365 * The LK_DRAIN operation will wait for every locked process to 366 * release their hold. 
367 */ 368 sc->sc_flags |= TAP_GOING; 369 s = splnet(); 370 tap_stop(ifp, 1); 371 if_down(ifp); 372 splx(s); 373 lockmgr(&sc->sc_rdlock, LK_DRAIN, NULL); 374 375 /* 376 * Destroying a single leaf is a very straightforward operation using 377 * sysctl_destroyv. One should be sure to always end the path with 378 * CTL_EOL. 379 */ 380 if ((error = sysctl_destroyv(NULL, CTL_NET, AF_LINK, tap_node, 381 sc->sc_dev.dv_unit, CTL_EOL)) != 0) 382 aprint_error("%s: sysctl_destroyv returned %d, ignoring\n", 383 sc->sc_dev.dv_xname, error); 384 ether_ifdetach(ifp); 385 if_detach(ifp); 386 ifmedia_delete_instance(&sc->sc_im, IFM_INST_ANY); 387 388 return (0); 389 } 390 391 /* 392 * This function is called by the ifmedia layer to notify the driver 393 * that the user requested a media change. A real driver would 394 * reconfigure the hardware. 395 */ 396 static int 397 tap_mediachange(struct ifnet *ifp) 398 { 399 return (0); 400 } 401 402 /* 403 * Here the user asks for the currently used media. 404 */ 405 static void 406 tap_mediastatus(struct ifnet *ifp, struct ifmediareq *imr) 407 { 408 struct tap_softc *sc = (struct tap_softc *)ifp->if_softc; 409 imr->ifm_active = sc->sc_im.ifm_cur->ifm_media; 410 } 411 412 /* 413 * This is the function where we SEND packets. 414 * 415 * There is no 'receive' equivalent. A typical driver will get 416 * interrupts from the hardware, and from there will inject new packets 417 * into the network stack. 418 * 419 * Once handled, a packet must be freed. A real driver might not be able 420 * to fit all the pending packets into the hardware, and is allowed to 421 * return before having sent all the packets. It should then use the 422 * if_flags flag IFF_OACTIVE to notify the upper layer. 423 * 424 * There are also other flags one should check, such as IFF_PAUSE. 425 * 426 * It is our duty to make packets available to BPF listeners. 427 * 428 * You should be aware that this function is called by the Ethernet layer 429 * at splnet(). 
430 * 431 * When the device is opened, we have to pass the packet(s) to the 432 * userland. For that we stay in OACTIVE mode while the userland gets 433 * the packets, and we send a signal to the processes waiting to read. 434 * 435 * wakeup(sc) is the counterpart to the tsleep call in 436 * tap_dev_read, while selnotify() is used for kevent(2) and 437 * poll(2) (which includes select(2)) listeners. 438 */ 439 static void 440 tap_start(struct ifnet *ifp) 441 { 442 struct tap_softc *sc = (struct tap_softc *)ifp->if_softc; 443 struct mbuf *m0; 444 445 if ((sc->sc_flags & TAP_INUSE) == 0) { 446 /* Simply drop packets */ 447 for(;;) { 448 IFQ_DEQUEUE(&ifp->if_snd, m0); 449 if (m0 == NULL) 450 return; 451 452 ifp->if_opackets++; 453 #if NBPFILTER > 0 454 if (ifp->if_bpf) 455 bpf_mtap(ifp->if_bpf, m0); 456 #endif 457 458 m_freem(m0); 459 } 460 } else if (!IFQ_IS_EMPTY(&ifp->if_snd)) { 461 ifp->if_flags |= IFF_OACTIVE; 462 wakeup(sc); 463 selnotify(&sc->sc_rsel, 1); 464 if (sc->sc_flags & TAP_ASYNCIO) 465 fownsignal(sc->sc_pgid, SIGIO, POLL_IN, 466 POLLIN|POLLRDNORM, NULL); 467 } 468 } 469 470 /* 471 * A typical driver will only contain the following handlers for 472 * ioctl calls, except SIOCSIFPHYADDR. 473 * The latter is a hack I used to set the Ethernet address of the 474 * faked device. 475 * 476 * Note that both ifmedia_ioctl() and ether_ioctl() have to be 477 * called under splnet(). 
478 */ 479 static int 480 tap_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) 481 { 482 struct tap_softc *sc = (struct tap_softc *)ifp->if_softc; 483 struct ifreq *ifr = (struct ifreq *)data; 484 int s, error; 485 486 s = splnet(); 487 488 switch (cmd) { 489 case SIOCSIFMEDIA: 490 case SIOCGIFMEDIA: 491 error = ifmedia_ioctl(ifp, ifr, &sc->sc_im, cmd); 492 break; 493 case SIOCSIFPHYADDR: 494 error = tap_lifaddr(ifp, cmd, (struct ifaliasreq *)data); 495 break; 496 default: 497 error = ether_ioctl(ifp, cmd, data); 498 if (error == ENETRESET) 499 error = 0; 500 break; 501 } 502 503 splx(s); 504 505 return (error); 506 } 507 508 /* 509 * Helper function to set Ethernet address. This shouldn't be done there, 510 * and should actually be available to all Ethernet drivers, real or not. 511 */ 512 static int 513 tap_lifaddr(struct ifnet *ifp, u_long cmd, struct ifaliasreq *ifra) 514 { 515 struct sockaddr *sa = (struct sockaddr *)&ifra->ifra_addr; 516 517 if (sa->sa_family != AF_LINK) 518 return (EINVAL); 519 520 memcpy(LLADDR(ifp->if_sadl), sa->sa_data, ETHER_ADDR_LEN); 521 522 return (0); 523 } 524 525 /* 526 * _init() would typically be called when an interface goes up, 527 * meaning it should configure itself into the state in which it 528 * can send packets. 529 */ 530 static int 531 tap_init(struct ifnet *ifp) 532 { 533 ifp->if_flags |= IFF_RUNNING; 534 535 tap_start(ifp); 536 537 return (0); 538 } 539 540 /* 541 * _stop() is called when an interface goes down. It is our 542 * responsability to validate that state by clearing the 543 * IFF_RUNNING flag. 544 * 545 * We have to wake up all the sleeping processes to have the pending 546 * read requests cancelled. 
547 */ 548 static void 549 tap_stop(struct ifnet *ifp, int disable) 550 { 551 struct tap_softc *sc = (struct tap_softc *)ifp->if_softc; 552 553 ifp->if_flags &= ~IFF_RUNNING; 554 wakeup(sc); 555 selnotify(&sc->sc_rsel, 1); 556 if (sc->sc_flags & TAP_ASYNCIO) 557 fownsignal(sc->sc_pgid, SIGIO, POLL_HUP, 0, NULL); 558 } 559 560 /* 561 * The 'create' command of ifconfig can be used to create 562 * any numbered instance of a given device. Thus we have to 563 * make sure we have enough room in cd_devs to create the 564 * user-specified instance. config_attach_pseudo will do this 565 * for us. 566 */ 567 static int 568 tap_clone_create(struct if_clone *ifc, int unit) 569 { 570 if (tap_clone_creator(unit) == NULL) { 571 aprint_error("%s%d: unable to attach an instance\n", 572 tap_cd.cd_name, unit); 573 return (ENXIO); 574 } 575 576 return (0); 577 } 578 579 /* 580 * tap(4) can be cloned by two ways: 581 * using 'ifconfig tap0 create', which will use the network 582 * interface cloning API, and call tap_clone_create above. 583 * opening the cloning device node, whose minor number is TAP_CLONER. 584 * See below for an explanation on how this part work. 585 * 586 * config_attach_pseudo can be called with unit = DVUNIT_ANY to have 587 * autoconf(9) choose a unit number for us. This is what happens when 588 * the cloner is openend, while the ifcloner interface creates a device 589 * with a specific unit number. 590 */ 591 static struct tap_softc * 592 tap_clone_creator(int unit) 593 { 594 struct cfdata *cf; 595 596 cf = malloc(sizeof(*cf), M_DEVBUF, M_WAITOK); 597 cf->cf_name = tap_cd.cd_name; 598 cf->cf_atname = tap_ca.ca_name; 599 cf->cf_unit = unit; 600 cf->cf_fstate = FSTATE_STAR; 601 602 return (struct tap_softc *)config_attach_pseudo(cf); 603 } 604 605 /* 606 * The clean design of if_clone and autoconf(9) makes that part 607 * really straightforward. The second argument of config_detach 608 * means neither QUIET nor FORCED. 
609 */ 610 static int 611 tap_clone_destroy(struct ifnet *ifp) 612 { 613 return tap_clone_destroyer((struct device *)ifp->if_softc); 614 } 615 616 int 617 tap_clone_destroyer(struct device *dev) 618 { 619 struct cfdata *cf = dev->dv_cfdata; 620 int error; 621 622 if ((error = config_detach(dev, 0)) != 0) 623 aprint_error("%s: unable to detach instance\n", 624 dev->dv_xname); 625 free(cf, M_DEVBUF); 626 627 return (error); 628 } 629 630 /* 631 * tap(4) is a bit of an hybrid device. It can be used in two different 632 * ways: 633 * 1. ifconfig tapN create, then use /dev/tapN to read/write off it. 634 * 2. open /dev/tap, get a new interface created and read/write off it. 635 * That interface is destroyed when the process that had it created exits. 636 * 637 * The first way is managed by the cdevsw structure, and you access interfaces 638 * through a (major, minor) mapping: tap4 is obtained by the minor number 639 * 4. The entry points for the cdevsw interface are prefixed by tap_cdev_. 640 * 641 * The second way is the so-called "cloning" device. It's a special minor 642 * number (chosen as the maximal number, to allow as much tap devices as 643 * possible). The user first opens the cloner (e.g., /dev/tap), and that 644 * call ends in tap_cdev_open. The actual place where it is handled is 645 * tap_dev_cloner. 646 * 647 * An tap device cannot be opened more than once at a time, so the cdevsw 648 * part of open() does nothing but noting that the interface is being used and 649 * hence ready to actually handle packets. 
650 */ 651 652 static int 653 tap_cdev_open(dev_t dev, int flags, int fmt, struct lwp *l) 654 { 655 struct tap_softc *sc; 656 657 if (minor(dev) == TAP_CLONER) 658 return tap_dev_cloner(l); 659 660 sc = (struct tap_softc *)device_lookup(&tap_cd, minor(dev)); 661 if (sc == NULL) 662 return (ENXIO); 663 664 /* The device can only be opened once */ 665 if (sc->sc_flags & TAP_INUSE) 666 return (EBUSY); 667 sc->sc_flags |= TAP_INUSE; 668 return (0); 669 } 670 671 /* 672 * There are several kinds of cloning devices, and the most simple is the one 673 * tap(4) uses. What it does is change the file descriptor with a new one, 674 * with its own fileops structure (which maps to the various read, write, 675 * ioctl functions). It starts allocating a new file descriptor with falloc, 676 * then actually creates the new tap devices. 677 * 678 * Once those two steps are successful, we can re-wire the existing file 679 * descriptor to its new self. This is done with fdclone(): it fills the fp 680 * structure as needed (notably f_data gets filled with the fifth parameter 681 * passed, the unit of the tap device which will allows us identifying the 682 * device later), and returns EMOVEFD. 683 * 684 * That magic value is interpreted by sys_open() which then replaces the 685 * current file descriptor by the new one (through a magic member of struct 686 * lwp, l_dupfd). 687 * 688 * The tap device is flagged as being busy since it otherwise could be 689 * externally accessed through the corresponding device node with the cdevsw 690 * interface. 
691 */ 692 693 static int 694 tap_dev_cloner(struct lwp *l) 695 { 696 struct tap_softc *sc; 697 struct file *fp; 698 int error, fd; 699 700 if ((error = falloc(l->l_proc, &fp, &fd)) != 0) 701 return (error); 702 703 if ((sc = tap_clone_creator(DVUNIT_ANY)) == NULL) { 704 FILE_UNUSE(fp, l); 705 ffree(fp); 706 return (ENXIO); 707 } 708 709 sc->sc_flags |= TAP_INUSE; 710 711 return fdclone(l, fp, fd, FREAD|FWRITE, &tap_fileops, 712 (void *)(intptr_t)sc->sc_dev.dv_unit); 713 } 714 715 /* 716 * While all other operations (read, write, ioctl, poll and kqfilter) are 717 * really the same whether we are in cdevsw or fileops mode, the close() 718 * function is slightly different in the two cases. 719 * 720 * As for the other, the core of it is shared in tap_dev_close. What 721 * it does is sufficient for the cdevsw interface, but the cloning interface 722 * needs another thing: the interface is destroyed when the processes that 723 * created it closes it. 724 */ 725 static int 726 tap_cdev_close(dev_t dev, int flags, int fmt, struct lwp *l) 727 { 728 struct tap_softc *sc = 729 (struct tap_softc *)device_lookup(&tap_cd, minor(dev)); 730 731 if (sc == NULL) 732 return (ENXIO); 733 734 return tap_dev_close(sc); 735 } 736 737 /* 738 * It might happen that the administrator used ifconfig to externally destroy 739 * the interface. In that case, tap_fops_close will be called while 740 * tap_detach is already happening. If we called it again from here, we 741 * would dead lock. TAP_GOING ensures that this situation doesn't happen. 742 */ 743 static int 744 tap_fops_close(struct file *fp, struct lwp *l) 745 { 746 int unit = (intptr_t)fp->f_data; 747 struct tap_softc *sc; 748 int error; 749 750 sc = (struct tap_softc *)device_lookup(&tap_cd, unit); 751 if (sc == NULL) 752 return (ENXIO); 753 754 /* tap_dev_close currently always succeeds, but it might not 755 * always be the case. 
*/ 756 if ((error = tap_dev_close(sc)) != 0) 757 return (error); 758 759 /* Destroy the device now that it is no longer useful, 760 * unless it's already being destroyed. */ 761 if ((sc->sc_flags & TAP_GOING) != 0) 762 return (0); 763 764 return tap_clone_destroyer((struct device *)sc); 765 } 766 767 static int 768 tap_dev_close(struct tap_softc *sc) 769 { 770 struct ifnet *ifp; 771 int s; 772 773 s = splnet(); 774 /* Let tap_start handle packets again */ 775 ifp = &sc->sc_ec.ec_if; 776 ifp->if_flags &= ~IFF_OACTIVE; 777 778 /* Purge output queue */ 779 if (!(IFQ_IS_EMPTY(&ifp->if_snd))) { 780 struct mbuf *m; 781 782 for (;;) { 783 IFQ_DEQUEUE(&ifp->if_snd, m); 784 if (m == NULL) 785 break; 786 787 ifp->if_opackets++; 788 #if NBPFILTER > 0 789 if (ifp->if_bpf) 790 bpf_mtap(ifp->if_bpf, m); 791 #endif 792 } 793 } 794 splx(s); 795 796 sc->sc_flags &= ~(TAP_INUSE | TAP_ASYNCIO); 797 798 return (0); 799 } 800 801 static int 802 tap_cdev_read(dev_t dev, struct uio *uio, int flags) 803 { 804 return tap_dev_read(minor(dev), uio, flags); 805 } 806 807 static int 808 tap_fops_read(struct file *fp, off_t *offp, struct uio *uio, 809 struct ucred *cred, int flags) 810 { 811 return tap_dev_read((intptr_t)fp->f_data, uio, flags); 812 } 813 814 static int 815 tap_dev_read(int unit, struct uio *uio, int flags) 816 { 817 struct tap_softc *sc = 818 (struct tap_softc *)device_lookup(&tap_cd, unit); 819 struct ifnet *ifp; 820 struct mbuf *m, *n; 821 int error = 0, s; 822 823 if (sc == NULL) 824 return (ENXIO); 825 826 ifp = &sc->sc_ec.ec_if; 827 if ((ifp->if_flags & IFF_UP) == 0) 828 return (EHOSTDOWN); 829 830 /* 831 * In the TAP_NBIO case, we have to make sure we won't be sleeping 832 */ 833 if ((sc->sc_flags & TAP_NBIO) && 834 lockstatus(&sc->sc_rdlock) == LK_EXCLUSIVE) 835 return (EWOULDBLOCK); 836 error = lockmgr(&sc->sc_rdlock, LK_EXCLUSIVE, NULL); 837 if (error != 0) 838 return (error); 839 840 s = splnet(); 841 if (IFQ_IS_EMPTY(&ifp->if_snd)) { 842 ifp->if_flags &= 
~IFF_OACTIVE; 843 splx(s); 844 /* 845 * We must release the lock before sleeping, and re-acquire it 846 * after. 847 */ 848 (void)lockmgr(&sc->sc_rdlock, LK_RELEASE, NULL); 849 if (sc->sc_flags & TAP_NBIO) 850 error = EWOULDBLOCK; 851 else 852 error = tsleep(sc, PSOCK|PCATCH, "tap", 0); 853 854 if (error != 0) 855 return (error); 856 /* The device might have been downed */ 857 if ((ifp->if_flags & IFF_UP) == 0) 858 return (EHOSTDOWN); 859 if ((sc->sc_flags & TAP_NBIO) && 860 lockstatus(&sc->sc_rdlock) == LK_EXCLUSIVE) 861 return (EWOULDBLOCK); 862 error = lockmgr(&sc->sc_rdlock, LK_EXCLUSIVE, NULL); 863 if (error != 0) 864 return (error); 865 s = splnet(); 866 } 867 868 IFQ_DEQUEUE(&ifp->if_snd, m); 869 ifp->if_flags &= ~IFF_OACTIVE; 870 splx(s); 871 if (m == NULL) { 872 error = 0; 873 goto out; 874 } 875 876 ifp->if_opackets++; 877 #if NBPFILTER > 0 878 if (ifp->if_bpf) 879 bpf_mtap(ifp->if_bpf, m); 880 #endif 881 882 /* 883 * One read is one packet. 884 */ 885 do { 886 error = uiomove(mtod(m, caddr_t), 887 min(m->m_len, uio->uio_resid), uio); 888 MFREE(m, n); 889 m = n; 890 } while (m != NULL && uio->uio_resid > 0 && error == 0); 891 892 if (m != NULL) 893 m_freem(m); 894 895 out: 896 (void)lockmgr(&sc->sc_rdlock, LK_RELEASE, NULL); 897 return (error); 898 } 899 900 static int 901 tap_cdev_write(dev_t dev, struct uio *uio, int flags) 902 { 903 return tap_dev_write(minor(dev), uio, flags); 904 } 905 906 static int 907 tap_fops_write(struct file *fp, off_t *offp, struct uio *uio, 908 struct ucred *cred, int flags) 909 { 910 return tap_dev_write((intptr_t)fp->f_data, uio, flags); 911 } 912 913 static int 914 tap_dev_write(int unit, struct uio *uio, int flags) 915 { 916 struct tap_softc *sc = 917 (struct tap_softc *)device_lookup(&tap_cd, unit); 918 struct ifnet *ifp; 919 struct mbuf *m, **mp; 920 int error = 0; 921 int s; 922 923 if (sc == NULL) 924 return (ENXIO); 925 926 ifp = &sc->sc_ec.ec_if; 927 928 /* One write, one packet, that's the rule */ 929 MGETHDR(m, 
M_DONTWAIT, MT_DATA); 930 if (m == NULL) { 931 ifp->if_ierrors++; 932 return (ENOBUFS); 933 } 934 m->m_pkthdr.len = uio->uio_resid; 935 936 mp = &m; 937 while (error == 0 && uio->uio_resid > 0) { 938 if (*mp != m) { 939 MGET(*mp, M_DONTWAIT, MT_DATA); 940 if (*mp == NULL) { 941 error = ENOBUFS; 942 break; 943 } 944 } 945 (*mp)->m_len = min(MHLEN, uio->uio_resid); 946 error = uiomove(mtod(*mp, caddr_t), (*mp)->m_len, uio); 947 mp = &(*mp)->m_next; 948 } 949 if (error) { 950 ifp->if_ierrors++; 951 m_freem(m); 952 return (error); 953 } 954 955 ifp->if_ipackets++; 956 m->m_pkthdr.rcvif = ifp; 957 958 #if NBPFILTER > 0 959 if (ifp->if_bpf) 960 bpf_mtap(ifp->if_bpf, m); 961 #endif 962 s =splnet(); 963 (*ifp->if_input)(ifp, m); 964 splx(s); 965 966 return (0); 967 } 968 969 static int 970 tap_cdev_ioctl(dev_t dev, u_long cmd, caddr_t data, int flags, 971 struct lwp *l) 972 { 973 return tap_dev_ioctl(minor(dev), cmd, data, l); 974 } 975 976 static int 977 tap_fops_ioctl(struct file *fp, u_long cmd, void *data, struct lwp *l) 978 { 979 return tap_dev_ioctl((intptr_t)fp->f_data, cmd, (caddr_t)data, l); 980 } 981 982 static int 983 tap_dev_ioctl(int unit, u_long cmd, caddr_t data, struct lwp *l) 984 { 985 struct tap_softc *sc = 986 (struct tap_softc *)device_lookup(&tap_cd, unit); 987 int error = 0; 988 989 if (sc == NULL) 990 return (ENXIO); 991 992 switch (cmd) { 993 case FIONREAD: 994 { 995 struct ifnet *ifp = &sc->sc_ec.ec_if; 996 struct mbuf *m; 997 int s; 998 999 s = splnet(); 1000 IFQ_POLL(&ifp->if_snd, m); 1001 1002 if (m == NULL) 1003 *(int *)data = 0; 1004 else 1005 *(int *)data = m->m_pkthdr.len; 1006 splx(s); 1007 } break; 1008 case TIOCSPGRP: 1009 case FIOSETOWN: 1010 error = fsetown(l->l_proc, &sc->sc_pgid, cmd, data); 1011 break; 1012 case TIOCGPGRP: 1013 case FIOGETOWN: 1014 error = fgetown(l->l_proc, sc->sc_pgid, cmd, data); 1015 break; 1016 case FIOASYNC: 1017 if (*(int *)data) 1018 sc->sc_flags |= TAP_ASYNCIO; 1019 else 1020 sc->sc_flags &= ~TAP_ASYNCIO; 
1021 break; 1022 case FIONBIO: 1023 if (*(int *)data) 1024 sc->sc_flags |= TAP_NBIO; 1025 else 1026 sc->sc_flags &= ~TAP_NBIO; 1027 break; 1028 case TAPGIFNAME: 1029 { 1030 struct ifreq *ifr = (struct ifreq *)data; 1031 struct ifnet *ifp = &sc->sc_ec.ec_if; 1032 1033 strlcpy(ifr->ifr_name, ifp->if_xname, IFNAMSIZ); 1034 } break; 1035 default: 1036 error = ENOTTY; 1037 break; 1038 } 1039 1040 return (0); 1041 } 1042 1043 static int 1044 tap_cdev_poll(dev_t dev, int events, struct lwp *l) 1045 { 1046 return tap_dev_poll(minor(dev), events, l); 1047 } 1048 1049 static int 1050 tap_fops_poll(struct file *fp, int events, struct lwp *l) 1051 { 1052 return tap_dev_poll((intptr_t)fp->f_data, events, l); 1053 } 1054 1055 static int 1056 tap_dev_poll(int unit, int events, struct lwp *l) 1057 { 1058 struct tap_softc *sc = 1059 (struct tap_softc *)device_lookup(&tap_cd, unit); 1060 int revents = 0; 1061 1062 if (sc == NULL) 1063 return (ENXIO); 1064 1065 if (events & (POLLIN|POLLRDNORM)) { 1066 struct ifnet *ifp = &sc->sc_ec.ec_if; 1067 struct mbuf *m; 1068 int s; 1069 1070 s = splnet(); 1071 IFQ_POLL(&ifp->if_snd, m); 1072 splx(s); 1073 1074 if (m != NULL) 1075 revents |= events & (POLLIN|POLLRDNORM); 1076 else { 1077 simple_lock(&sc->sc_kqlock); 1078 selrecord(l, &sc->sc_rsel); 1079 simple_unlock(&sc->sc_kqlock); 1080 } 1081 } 1082 revents |= events & (POLLOUT|POLLWRNORM); 1083 1084 return (revents); 1085 } 1086 1087 static struct filterops tap_read_filterops = { 1, NULL, tap_kqdetach, 1088 tap_kqread }; 1089 static struct filterops tap_seltrue_filterops = { 1, NULL, tap_kqdetach, 1090 filt_seltrue }; 1091 1092 static int 1093 tap_cdev_kqfilter(dev_t dev, struct knote *kn) 1094 { 1095 return tap_dev_kqfilter(minor(dev), kn); 1096 } 1097 1098 static int 1099 tap_fops_kqfilter(struct file *fp, struct knote *kn) 1100 { 1101 return tap_dev_kqfilter((intptr_t)fp->f_data, kn); 1102 } 1103 1104 static int 1105 tap_dev_kqfilter(int unit, struct knote *kn) 1106 { 1107 struct 
tap_softc *sc = 1108 (struct tap_softc *)device_lookup(&tap_cd, unit); 1109 1110 if (sc == NULL) 1111 return (ENXIO); 1112 1113 switch(kn->kn_filter) { 1114 case EVFILT_READ: 1115 kn->kn_fop = &tap_read_filterops; 1116 break; 1117 case EVFILT_WRITE: 1118 kn->kn_fop = &tap_seltrue_filterops; 1119 break; 1120 default: 1121 return (1); 1122 } 1123 1124 kn->kn_hook = sc; 1125 simple_lock(&sc->sc_kqlock); 1126 SLIST_INSERT_HEAD(&sc->sc_rsel.sel_klist, kn, kn_selnext); 1127 simple_unlock(&sc->sc_kqlock); 1128 return (0); 1129 } 1130 1131 static void 1132 tap_kqdetach(struct knote *kn) 1133 { 1134 struct tap_softc *sc = (struct tap_softc *)kn->kn_hook; 1135 1136 simple_lock(&sc->sc_kqlock); 1137 SLIST_REMOVE(&sc->sc_rsel.sel_klist, kn, knote, kn_selnext); 1138 simple_unlock(&sc->sc_kqlock); 1139 } 1140 1141 static int 1142 tap_kqread(struct knote *kn, long hint) 1143 { 1144 struct tap_softc *sc = (struct tap_softc *)kn->kn_hook; 1145 struct ifnet *ifp = &sc->sc_ec.ec_if; 1146 struct mbuf *m; 1147 int s; 1148 1149 s = splnet(); 1150 IFQ_POLL(&ifp->if_snd, m); 1151 1152 if (m == NULL) 1153 kn->kn_data = 0; 1154 else 1155 kn->kn_data = m->m_pkthdr.len; 1156 splx(s); 1157 return (kn->kn_data != 0 ? 1 : 0); 1158 } 1159 1160 /* 1161 * sysctl management routines 1162 * You can set the address of an interface through: 1163 * net.link.tap.tap<number> 1164 * 1165 * Note the consistent use of tap_log in order to use 1166 * sysctl_teardown at unload time. 1167 * 1168 * In the kernel you will find a lot of SYSCTL_SETUP blocks. Those 1169 * blocks register a function in a special section of the kernel 1170 * (called a link set) which is used at init_sysctl() time to cycle 1171 * through all those functions to create the kernel's sysctl tree. 1172 * 1173 * It is not (currently) possible to use link sets in a LKM, so the 1174 * easiest is to simply call our own setup routine at load time. 
1175 * 1176 * In the SYSCTL_SETUP blocks you find in the kernel, nodes have the 1177 * CTLFLAG_PERMANENT flag, meaning they cannot be removed. Once the 1178 * whole kernel sysctl tree is built, it is not possible to add any 1179 * permanent node. 1180 * 1181 * It should be noted that we're not saving the sysctlnode pointer 1182 * we are returned when creating the "tap" node. That structure 1183 * cannot be trusted once out of the calling function, as it might 1184 * get reused. So we just save the MIB number, and always give the 1185 * full path starting from the root for later calls to sysctl_createv 1186 * and sysctl_destroyv. 1187 */ 1188 SYSCTL_SETUP(sysctl_tap_setup, "sysctl net.link.tap subtree setup") 1189 { 1190 const struct sysctlnode *node; 1191 int error = 0; 1192 1193 if ((error = sysctl_createv(clog, 0, NULL, NULL, 1194 CTLFLAG_PERMANENT, 1195 CTLTYPE_NODE, "net", NULL, 1196 NULL, 0, NULL, 0, 1197 CTL_NET, CTL_EOL)) != 0) 1198 return; 1199 1200 if ((error = sysctl_createv(clog, 0, NULL, NULL, 1201 CTLFLAG_PERMANENT, 1202 CTLTYPE_NODE, "link", NULL, 1203 NULL, 0, NULL, 0, 1204 CTL_NET, AF_LINK, CTL_EOL)) != 0) 1205 return; 1206 1207 /* 1208 * The first four parameters of sysctl_createv are for management. 1209 * 1210 * The four that follows, here starting with a '0' for the flags, 1211 * describe the node. 1212 * 1213 * The next series of four set its value, through various possible 1214 * means. 1215 * 1216 * Last but not least, the path to the node is described. That path 1217 * is relative to the given root (third argument). Here we're 1218 * starting from the root. 1219 */ 1220 if ((error = sysctl_createv(clog, 0, NULL, &node, 1221 CTLFLAG_PERMANENT, 1222 CTLTYPE_NODE, "tap", NULL, 1223 NULL, 0, NULL, 0, 1224 CTL_NET, AF_LINK, CTL_CREATE, CTL_EOL)) != 0) 1225 return; 1226 tap_node = node->sysctl_num; 1227 } 1228 1229 /* 1230 * The helper functions make Andrew Brown's interface really 1231 * shine. 
 * They make it possible to create a value on the fly, whether
 * the sysctl value is read or written.
 *
 * As shown in the example in the man page, the first step is to
 * create a copy of the node to have sysctl_lookup work on it.
 *
 * Here, we have more work to do than just a copy, since we have
 * to create the string.  The first step is to collect the actual
 * value of the node, which is a convenient pointer to the softc
 * of the interface.  From there we create the string and use it
 * as the value, but only for the *copy* of the node.
 *
 * Then we let sysctl_lookup do the magic, which consists of
 * setting oldp and newp as required by the operation.  When the
 * value is read, that means that the string will be copied to
 * the user, and when it is written, the new value will be copied
 * into the addr array.
 *
 * If newp is NULL, the user was reading the value, so we don't
 * have anything else to do.  If a new value was written, we
 * have to check it.
 *
 * If it is incorrect, we can return an error and leave 'node' as
 * it is:  since it is a copy of the actual node, the change will
 * be forgotten.
 *
 * Upon a correct input, we commit the change to the ifnet
 * structure of our interface.
1259 */ 1260 static int 1261 tap_sysctl_handler(SYSCTLFN_ARGS) 1262 { 1263 struct sysctlnode node; 1264 struct tap_softc *sc; 1265 struct ifnet *ifp; 1266 int error; 1267 size_t len; 1268 char addr[3 * ETHER_ADDR_LEN]; 1269 1270 node = *rnode; 1271 sc = node.sysctl_data; 1272 ifp = &sc->sc_ec.ec_if; 1273 (void)ether_snprintf(addr, sizeof(addr), LLADDR(ifp->if_sadl)); 1274 node.sysctl_data = addr; 1275 error = sysctl_lookup(SYSCTLFN_CALL(&node)); 1276 if (error || newp == NULL) 1277 return (error); 1278 1279 len = strlen(addr); 1280 if (len < 11 || len > 17) 1281 return (EINVAL); 1282 1283 /* Commit change */ 1284 if (tap_ether_aton(LLADDR(ifp->if_sadl), addr) != 0) 1285 return (EINVAL); 1286 return (error); 1287 } 1288 1289 /* 1290 * ether_aton implementation, not using a static buffer. 1291 */ 1292 static int 1293 tap_ether_aton(u_char *dest, char *str) 1294 { 1295 int i; 1296 char *cp = str; 1297 u_char val[6]; 1298 1299 #define set_value \ 1300 if (*cp > '9' && *cp < 'a') \ 1301 *cp -= 'A' - 10; \ 1302 else if (*cp > '9') \ 1303 *cp -= 'a' - 10; \ 1304 else \ 1305 *cp -= '0' 1306 1307 for (i = 0; i < 6; i++, cp++) { 1308 if (!isxdigit(*cp)) 1309 return (1); 1310 set_value; 1311 val[i] = *cp++; 1312 if (isxdigit(*cp)) { 1313 set_value; 1314 val[i] *= 16; 1315 val[i] += *cp++; 1316 } 1317 if (*cp == ':' || i == 5) 1318 continue; 1319 else 1320 return (1); 1321 } 1322 memcpy(dest, val, 6); 1323 return (0); 1324 } 1325