xref: /minix3/minix/net/lwip/route.c (revision e4dbab1e5368dc2124168836ba46a7d3ff6414b0)
1  /* LWIP service - route.c - route management */
2  /*
3   * This module provides a destination-based routing implementation, roughly
4   * matching the routing as done traditionally by the BSDs and by current NetBSD
5   * in particular.  As such, this implementation almost completely replaces
6   * lwIP's own more limited (and less rigid) routing algorithms.  It does this
7   * using a combination of overriding lwIP functions (ip4_route, ip6_route) with
8   * weak-symbol patching, and lwIP-provided gateway hooks.  Especially the
9   * former gives us a level of control that lwIP's routing hooks do not provide:
10   * not only does such overriding give us the ability to flag that no route was
11   * found at all, we also bypass a number of default decisions taken by lwIP
12   * where the routing hooks are not called at all.
13   *
14   * As a result, the routing tables as visible to the user are an almost
15   * completely accurate reflection of the routing decisions taken by this TCP/IP
16   * stack in practice.  There is currently only one exception: for IPv4 gateway
17   * selection, lwIP will bypass the gateway hook if the given address is on the
18   * local subnet according to the locally assigned IP address and subnet mask.
 * This exception should practically affect no one, though.
20   *
21   * Our routing implementation differs from NetBSD's in various aspects, though.
22   * Perhaps the most important one, also noted elsewhere, is that we do not
23   * support the coexistence of an all-bits-set network route and a host route
24   * for the same IP address.  If necessary, this issue can be resolved.
25   *
26   * We use a custom concept of "immutable" routes for local addresses, which are
27   * a somewhat special case as explained in the ifaddr module.  Since those
28   * RTF_LOCAL routes cannot be deleted, a small change is made to the route(8)
29   * flush-all command to skip them.  Packets directed at local addresses on
30   * non-loopback interfaces are handled in a way that differs from NetBSD's,
31   * too.  This is explained in the ifdev module.
32   *
33   * The BSDs support special routes that reject or blackhole packets, based on
34   * routing flags.  We support such routes as well, but implement them somewhat
35   * differently from the BSDs: such packets always get routed over a loopback
36   * interface (regardless of their associated interface), in order to save on
37   * routing lookups for packets in the common case.
38   *
39   * As general rules of thumb: if there is no route to a destination, assignment
40   * of a local address will already fail with a "no route to host" error.  If
41   * there is an RTF_REJECT route, a local address will be assigned, but actual
42   * packets will be routed to a loopback interface and result in a "no route to
43   * host" error upon reception there - this is what NetBSD seems to do too, even
44   * though the documentation says that RTF_REJECT routes generate ICMP messages
45   * instead.  RTF_BLACKHOLE behaves similarly to RTF_REJECT, except that the
46   * packet is simply discarded upon receipt by the loopback interface.
47   *
48   * In various places, both here and elsewhere, we check to make sure that on
49   * routing and output, scoped IPv6 source and destination addresses never leave
50   * their zone.  For example, a packet must not be sent to an outgoing interface
51   * if its source address is a link-local address with a zone for another
52   * interface.  lwIP does not check for such violations, and so we must make
53   * sure that this does not happen ourselves.
54   *
55   * Normally, one would tell lwIP to use a particular default IPv4 gateway by
56   * associating the gateway address to a particular interface, and then setting
57   * that interface as default interface (netif_default).  We explicitly do
58   * neither of these things.  Instead, the routing hooks should return the
59   * default route whenever applicable, and the gateway hooks should return the
60   * default route's gateway IP address whenever needed.
61   *
62   * Due to lwIP's limited set of error codes, we do not properly distinguish
63   * between cases where EHOSTUNREACH or ENETUNREACH should be thrown, and throw
64   * the former in most cases.
65   */
66  
67  #include "lwip.h"
68  #include "ifaddr.h"
69  #include "rttree.h"
70  #include "rtsock.h"
71  #include "route.h"
72  #include "lldata.h"
73  
74  #include "lwip/nd6.h"
75  
/*
 * Size, in uint8_t bytes, of the largest raw routing address that has to be
 * stored: the larger of the IPv4 size (4 bytes) and the IPv6 size (16 bytes).
 */
#define ROUTE_ADDR_MAX	(MAX(IP4_BITS / NBBY, IP6_BITS / NBBY))
81  
82  /*
83   * We use a shared routing entry data structure for IPv4 and IPv6 routing
84   * entries.  The result is cleaner code at the cost of (currently) about 2.3KB
85   * of memory wasted (costing 12 bytes per address for three addresses for 64 of
86   * the 128 routing entries that would be for IPv4), although with the benefit
87   * that either address family may use more than half of the routing entries.
88   * From that 2.3KB, 1KB can be reclaimed by moving the destination address and
89   * mask into the rttree_entry data structure, at the cost of its generality.
90   */
91  struct route_entry {
92  	struct rttree_entry re_entry;		/* routing tree entry */
93  	union pxfer_re_pu {
94  		struct ifdev *repu_ifdev;	/* associated interface */
95  		SIMPLEQ_ENTRY(route_entry) repu_next;	/* next free pointer */
96  	} re_pu;
97  	unsigned int re_flags;			/* routing flags (RTF_) */
98  	unsigned int re_use;			/* number of times used */
99  	uint8_t re_addr[ROUTE_ADDR_MAX];	/* destination address */
100  	uint8_t re_mask[ROUTE_ADDR_MAX];	/* destination mask */
101  	union ixfer_re_gu {
102  		ip4_addr_p_t regu_gw4;		/* gateway (IPv4) */
103  		ip6_addr_p_t regu_gw6;		/* gateway (IPv6) */
104  	} re_gu;
105  };
106  #define re_ifdev	re_pu.repu_ifdev
107  #define re_next		re_pu.repu_next
108  #define re_gw4		re_gu.regu_gw4
109  #define re_gw6		re_gu.regu_gw6
110  
/*
 * A route is immutable if and only if it is a route for a local address
 * (RTF_LOCAL).  The reasons are explained in the ifdev module.
 */
#define route_is_immutable(route)	((route)->re_flags & RTF_LOCAL)

/*
 * Some of our own per-route state is kept in BSD routing flag bits that have
 * no meaning for stored routes.  Specifically, we need to remember whether a
 * route has an IPv4 or an IPv6 destination.  For that we reuse RTF_DONE: BSD
 * uses that flag on routing sockets only and never on actual routes, whereas
 * RTF_IPV6 is used on actual routes only and is never sent across routing
 * sockets.  Reusing an existing flag is safer than inventing a new one: BSD
 * may add further flags later on, but it can never remove existing ones.
 */
#define RTF_IPV6	RTF_DONE	/* route is for an IPv6 destination */
125  
126  /* The total number of routing entries (IPv4 and IPv6 combined). */
127  #define NR_ROUTE_ENTRY	128
128  
129  static struct route_entry route_array[NR_ROUTE_ENTRY];	/* routing entries */
130  
131  static SIMPLEQ_HEAD(, route_entry) route_freelist;	/* free entry list */
132  
133  /* The routing trees.  There are two: one for IPv4 and one for IPv6. */
134  #define ROUTE_TREE_V4	0
135  #define ROUTE_TREE_V6	1
136  #define NR_ROUTE_TREE	2
137  
138  static struct rttree route_tree[NR_ROUTE_TREE];
139  
140  /* We support a single cached routing entry per address family (IPv4, IPv6). */
141  static int rtcache_v4set;
142  static ip4_addr_t rtcache_v4addr;
143  static struct route_entry *rtcache_v4route;
144  
145  static int rtcache_v6set;
146  static ip6_addr_t rtcache_v6addr;
147  static struct route_entry *rtcache_v6route;
148  
149  /*
150   * Initialize the routing cache.  There are a lot of trivial functions here,
151   * but this is designed to be extended in the future.
152   */
153  static void
154  rtcache_init(void)
155  {
156  
157  	rtcache_v4set = FALSE;
158  	rtcache_v6set = FALSE;
159  }
160  
161  /*
162   * Look up the given IPv4 address in the routing cache.  If there is a match,
163   * return TRUE with the associated route in 'route', possibly NULL if a
164   * negative result was cached.  Return FALSE if the routing cache does not
165   * cache the given IPv4 address.
166   */
167  static inline int
168  rtcache_lookup_v4(const ip4_addr_t * ipaddr, struct route_entry ** route)
169  {
170  
171  	if (rtcache_v4set && ip4_addr_cmp(&rtcache_v4addr, ipaddr)) {
172  		*route = rtcache_v4route;
173  
174  		return TRUE;
175  	} else
176  		return FALSE;
177  }
178  
179  /*
180   * Add the given IPv4 address and the given routing entry (NULL for negative
181   * caching) to the routing cache.
182   */
183  static inline void
184  rtcache_add_v4(const ip4_addr_t * ipaddr, struct route_entry * route)
185  {
186  
187  	rtcache_v4addr = *ipaddr;
188  	rtcache_v4route = route;
189  	rtcache_v4set = TRUE;
190  }
191  
192  /*
193   * Reset the IPv4 routing cache.
194   */
195  static void
196  rtcache_reset_v4(void)
197  {
198  
199  	rtcache_v4set = FALSE;
200  }
201  
202  /*
203   * Look up the given IPv6 address in the routing cache.  If there is a match,
204   * return TRUE with the associated route in 'route', possibly NULL if a
205   * negative result was cached.  Return FALSE if the routing cache does not
206   * cache the given IPv6 address.
207   */
208  static inline int
209  rtcache_lookup_v6(const ip6_addr_t * ipaddr, struct route_entry ** route)
210  {
211  
212  	if (rtcache_v6set && ip6_addr_cmp(&rtcache_v6addr, ipaddr)) {
213  		*route = rtcache_v6route;
214  
215  		return TRUE;
216  	} else
217  		return FALSE;
218  }
219  
220  /*
221   * Add the given IPv6 address and the given routing entry (NULL for negative
222   * caching) to the routing cache.  Caching of scoped addresses without zones is
223   * not supported.
224   */
225  static inline void
226  rtcache_add_v6(const ip6_addr_t * ipaddr, struct route_entry * route)
227  {
228  
229  	rtcache_v6addr = *ipaddr;
230  	rtcache_v6route = route;
231  	rtcache_v6set = TRUE;
232  }
233  
234  /*
235   * Reset the IPv6 routing cache.
236   */
237  static void
238  rtcache_reset_v6(void)
239  {
240  
241  	rtcache_v6set = FALSE;
242  }
243  
244  /*
245   * Initialize the routing module.
246   */
247  void
248  route_init(void)
249  {
250  	unsigned int slot;
251  
252  	/* Initialize the routing trees. */
253  	rttree_init(&route_tree[ROUTE_TREE_V4], IP4_BITS);
254  	rttree_init(&route_tree[ROUTE_TREE_V6], IP6_BITS);
255  
256  	/* Initialize the list of free routing entries. */
257  	SIMPLEQ_INIT(&route_freelist);
258  
259  	for (slot = 0; slot < __arraycount(route_array); slot++)
260  		SIMPLEQ_INSERT_TAIL(&route_freelist, &route_array[slot],
261  		    re_next);
262  
263  	/* Reset the routing cache. */
264  	rtcache_init();
265  }
266  
267  /*
268   * Prepare for a routing tree operation by converting the given IPv4 address
269   * into a raw address that can be used in that routing tree operation.
270   */
271  static inline void
272  route_prepare_v4(const ip4_addr_t * ip4addr, uint8_t rtaddr[ROUTE_ADDR_MAX])
273  {
274  	uint32_t val;
275  
276  	val = ip4_addr_get_u32(ip4addr);
277  
278  	memcpy(rtaddr, &val, sizeof(val));
279  }
280  
281  /*
282   * Prepare for a routing tree operation by converting the given IPv6 address
283   * into a raw address that can be used in that routing tree operation.  If the
284   * given prefix length allows for it, also incorporate the address zone.
285   */
286  static inline void
287  route_prepare_v6(const ip6_addr_t * ip6addr, unsigned int prefix,
288  	uint8_t rtaddr[ROUTE_ADDR_MAX])
289  {
290  
291  	assert(sizeof(ip6addr->addr) == IP6_BITS / NBBY);
292  
293  	/*
294  	 * TODO: in most cases, we could actually return a pointer to the
295  	 * address contained in the given lwIP IP address structure.  However,
296  	 * doing so would make a lot things quite a bit messier around here,
297  	 * but the small performance gain may still make it worth it.
298  	 */
299  	memcpy(rtaddr, ip6addr->addr, sizeof(ip6addr->addr));
300  
301  	/*
302  	 * Embed the zone ID into the address, KAME style.  This is the
303  	 * easiest way to have link-local addresses for multiple interfaces
304  	 * coexist in a single routing tree.  Do this only if the full zone ID
305  	 * would be included in the prefix though, or we might de-normalize the
306  	 * address.
307  	 */
308  	if (ip6_addr_has_zone(ip6addr) && prefix >= 32)
309  		rtaddr[3] = ip6_addr_zone(ip6addr);
310  }
311  
312  /*
313   * Prepare for a routing tree operation by converting the given IP address into
314   * a raw address that can be used in that routing tree operation.  The given
315   * address's zone ID is embedded "KAME-style" into the raw (IPv6) address when
316   * applicable and if the given prefix length allows for it.  Return the index
317   * of the routing tree to use (ROUTE_TREE_V4 or ROUTE_TREE_V6).
318   */
319  static unsigned int
320  route_prepare(const ip_addr_t * ipaddr, unsigned int prefix,
321  	uint8_t rtaddr[ROUTE_ADDR_MAX])
322  {
323  
324  	switch (IP_GET_TYPE(ipaddr)) {
325  	case IPADDR_TYPE_V4:
326  		route_prepare_v4(ip_2_ip4(ipaddr), rtaddr);
327  
328  		return ROUTE_TREE_V4;
329  
330  	case IPADDR_TYPE_V6:
331  		route_prepare_v6(ip_2_ip6(ipaddr), prefix, rtaddr);
332  
333  		return ROUTE_TREE_V6;
334  
335  	default:
336  		panic("unknown IP address type: %u", IP_GET_TYPE(ipaddr));
337  	}
338  }
339  
340  /*
341   * The given routing tree (ROUTE_TREE_V4 or ROUTE_TREE_V6) has been updated.
342   * Invalidate any cache entries that may now have become stale, both locally
343   * and in lwIP.
344   */
345  static void
346  route_updated(unsigned int tree)
347  {
348  
349  	if (tree == ROUTE_TREE_V6) {
350  		rtcache_reset_v6();
351  
352  		/*
353  		 * Also clear the lwIP ND6 destination cache, which may now
354  		 * contain entries for the wrong gateway.
355  		 */
356  		nd6_clear_destination_cache();
357  	} else
358  		rtcache_reset_v4();
359  }
360  
361  /*
362   * Add a route to the appropriate routing table.  The address, address zone,
363   * prefix, and RTF_HOST flag in the flags field make up the identity of the
364   * route.  If the flags field contains RTF_GATEWAY, a gateway must be given;
365   * otherwise, it must be NULL.  The route is associated with the given
366   * interface, which may not be NULL.  The caller must ensure that the flags
367   * field does not contain unsupported flags.  On success, return OK, and also
368   * also announce the addition.  On failure, return a negative error code.
369   */
370  int
371  route_add(const ip_addr_t * addr, unsigned int prefix,
372  	const ip_addr_t * gateway, struct ifdev * ifdev, unsigned int flags,
373  	const struct rtsock_request * rtr)
374  {
375  	struct route_entry *route;
376  	unsigned int tree, byte;
377  	int r;
378  
379  	assert(flags & RTF_UP);
380  	assert(!!(flags & RTF_GATEWAY) == (gateway != NULL));
381  	assert(ifdev != NULL);
382  
383  	/* Get a routing entry, if any are available. */
384  	if (SIMPLEQ_EMPTY(&route_freelist))
385  		return ENOBUFS;
386  
387  	route = SIMPLEQ_FIRST(&route_freelist);
388  
389  	/*
390  	 * Perform sanity checks on the input, and fill in enough of the
391  	 * routing entry to be able to try and add it to the routing tree.
392  	 */
393  	memset(route->re_addr, 0, sizeof(route->re_addr));
394  
395  	tree = route_prepare(addr, prefix, route->re_addr);
396  
397  	switch (tree) {
398  	case ROUTE_TREE_V4:
399  		if (prefix > IP4_BITS ||
400  		    (prefix != IP4_BITS && (flags & RTF_HOST)))
401  			return EINVAL;
402  
403  		flags &= ~RTF_IPV6;
404  
405  		break;
406  
407  	case ROUTE_TREE_V6:
408  		if (prefix > IP6_BITS ||
409  		    (prefix != IP6_BITS && (flags & RTF_HOST)))
410  			return EINVAL;
411  
412  		flags |= RTF_IPV6;
413  
414  		break;
415  
416  	default:
417  		return EINVAL;
418  	}
419  
420  	/* Generate the (raw) network mask.  This is protocol agnostic! */
421  	addr_make_netmask(route->re_mask, sizeof(route->re_mask), prefix);
422  
423  	/* The given address must be normalized to its mask. */
424  	for (byte = 0; byte < __arraycount(route->re_addr); byte++)
425  		if ((route->re_addr[byte] & ~route->re_mask[byte]) != 0)
426  			return EINVAL;
427  
428  	/*
429  	 * Attempt to add the routing entry.  Host-type entries do not have an
430  	 * associated mask, enabling ever-so-slightly faster matching.
431  	 */
432  	if ((r = rttree_add(&route_tree[tree], &route->re_entry,
433  	    route->re_addr, (flags & RTF_HOST) ? NULL : route->re_mask,
434  	    prefix)) != OK)
435  		return r;
436  
437  	/*
438  	 * Success.  Finish the routing entry.  Remove the entry from the free
439  	 * list before assigning re_ifdev, as these two use the same memory.
440  	 */
441  	SIMPLEQ_REMOVE_HEAD(&route_freelist, re_next);
442  
443  	route->re_ifdev = ifdev;
444  	route->re_flags = flags;
445  
446  	/*
447  	 * Store the gateway if one is given.  Store the address in lwIP format
448  	 * because that is the easiest way use it later again.  Store it as a
449  	 * union to keep the route entry structure as small as possible.  Store
450  	 * the address without its zone, because the gateway's address zone is
451  	 * implied by its associated ifdev.
452  	 *
453  	 * If no gateway is given, this is a link-type route, i.e., a route for
454  	 * a local network, with all nodes directly connected and reachable.
455  	 */
456  	if (flags & RTF_GATEWAY) {
457  		if (flags & RTF_IPV6)
458  			ip6_addr_copy_to_packed(route->re_gw6,
459  			    *ip_2_ip6(gateway));
460  		else
461  			ip4_addr_copy(route->re_gw4, *ip_2_ip4(gateway));
462  	}
463  
464  	/* We have made routing changes. */
465  	route_updated(tree);
466  
467  	/* Announce the route addition. */
468  	rtsock_msg_route(route, RTM_ADD, rtr);
469  
470  	return OK;
471  }
472  
473  /*
474   * Check whether it is possible to add a route for the given destination to the
475   * corresponding routing table, that is, a subsequent route_add() call for this
476   * destination address is guaranteed to succeed (if all its parameters are
477   * valid).  Return TRUE if adding the route is guaranteed to succeed, or FALSE
478   * if creating a route for the given destination would fail.
479   */
480  int
481  route_can_add(const ip_addr_t * addr, unsigned int prefix,
482  	int is_host __unused)
483  {
484  	uint8_t rtaddr[ROUTE_ADDR_MAX];
485  	unsigned int tree;
486  
487  	tree = route_prepare(addr, prefix, rtaddr);
488  
489  	/*
490  	 * The corresponding routing tree must not already contain an exact
491  	 * match for the destination.  If the routing tree implementation is
492  	 * ever extended with support for coexisting host and net entries with
493  	 * the same prefix, we should also pass in 'is_host' here.
494  	 */
495  	if (rttree_lookup_exact(&route_tree[tree], rtaddr, prefix) != NULL)
496  		return FALSE;
497  
498  	/* There must be a routing entry on the free list as well. */
499  	return !SIMPLEQ_EMPTY(&route_freelist);
500  }
501  
502  /*
503   * Find a route with the exact given route identity.  Return the route if
504   * found, or NULL if no route exists with this identity.
505   */
506  struct route_entry *
507  route_find(const ip_addr_t * addr, unsigned int prefix, int is_host)
508  {
509  	struct rttree_entry *entry;
510  	struct route_entry *route;
511  	uint8_t rtaddr[ROUTE_ADDR_MAX];
512  	unsigned int tree;
513  
514  	tree = route_prepare(addr, prefix, rtaddr);
515  
516  	entry = rttree_lookup_exact(&route_tree[tree], rtaddr, prefix);
517  	if (entry == NULL)
518  		return NULL;
519  
520  	route = (struct route_entry *)entry;
521  
522  	/*
523  	 * As long as the routing tree code does not support coexisting host
524  	 * and net entries with the same prefix, we have to check the type.
525  	 */
526  	if (!!(route->re_flags & RTF_HOST) != is_host)
527  		return NULL;
528  
529  	return route;
530  }
531  
532  /*
533   * A route lookup failed for the given IP address.  Generate an RTM_MISS
534   * message on routing sockets.
535   */
536  static void
537  route_miss(const ip_addr_t * ipaddr)
538  {
539  	union sockaddr_any addr;
540  	socklen_t addr_len;
541  
542  	addr_len = sizeof(addr);
543  
544  	addr_put_inet(&addr.sa, &addr_len, ipaddr, TRUE /*kame*/, 0 /*port*/);
545  
546  	rtsock_msg_miss(&addr.sa);
547  }
548  
549  /*
550   * A route lookup failed for the given IPv4 address.  Generate an RTM_MISS
551   * message on routing sockets.
552   */
553  static void
554  route_miss_v4(const ip4_addr_t * ip4addr)
555  {
556  	ip_addr_t ipaddr;
557  
558  	ip_addr_copy_from_ip4(ipaddr, *ip4addr);
559  
560  	route_miss(&ipaddr);
561  }
562  
563  /*
564   * A route lookup failed for the given IPv6 address.  Generate an RTM_MISS
565   * message on routing sockets.
566   */
567  static void
568  route_miss_v6(const ip6_addr_t * ip6addr)
569  {
570  	ip_addr_t ipaddr;
571  
572  	ip_addr_copy_from_ip6(ipaddr, *ip6addr);
573  
574  	route_miss(&ipaddr);
575  }
576  
577  /*
578   * Look up the most narrow matching routing entry for the given IPv4 address.
579   * Return the routing entry if one exists at all, or NULL otherwise.  This
580   * function performs caching.
581   */
582  static inline struct route_entry *
583  route_lookup_v4(const ip4_addr_t * ip4addr)
584  {
585  	uint8_t rtaddr[ROUTE_ADDR_MAX];
586  	struct route_entry *route;
587  
588  	/*
589  	 * Look up the route for the destination IP address, unless we have a
590  	 * cached route entry.  We cache negatives in order to avoid generating
591  	 * lots of RTM_MISS messages for the same destination in a row.
592  	 */
593  	if (rtcache_lookup_v4(ip4addr, &route))
594  		return route;
595  
596  	route_prepare_v4(ip4addr, rtaddr);
597  
598  	route = (struct route_entry *)
599  	    rttree_lookup_match(&route_tree[ROUTE_TREE_V4], rtaddr);
600  
601  	/* Cache the result, even if we found no route. */
602  	rtcache_add_v4(ip4addr, route);
603  
604  	return route;
605  }
606  
607  /*
608   * Look up the most narrow matching routing entry for the given IPv6 address,
609   * taking into account its zone ID if applicable.  Return the routing entry if
610   * one exists at all, or NULL otherwise.  This function performs caching.
611   */
612  static inline struct route_entry *
613  route_lookup_v6(const ip6_addr_t * ip6addr)
614  {
615  	uint8_t rtaddr[ROUTE_ADDR_MAX];
616  	struct route_entry *route;
617  	int use_cache;
618  
619  	/*
620  	 * We do not support caching of addresses that should have a zone but
621  	 * do not: in different contexts, such addresses could yield different
622  	 * routes.
623  	 */
624  	use_cache = !ip6_addr_lacks_zone(ip6addr, IP6_UNKNOWN);
625  
626  	if (use_cache && rtcache_lookup_v6(ip6addr, &route))
627  		return route;
628  
629  	route_prepare_v6(ip6addr, IP6_BITS, rtaddr);
630  
631  	route = (struct route_entry *)
632  	    rttree_lookup_match(&route_tree[ROUTE_TREE_V6], rtaddr);
633  
634  	/* Cache the result, even if no route was found. */
635  	if (use_cache)
636  		rtcache_add_v6(ip6addr, route);
637  
638  	return route;
639  }
640  
641  /*
642   * Look up the most narrow matching routing entry for the given IP address,
643   * taking into account its zone ID if applicable.  Return the routing entry if
644   * one exists at all, or NULL otherwise.  This function performs caching.
645   */
646  struct route_entry *
647  route_lookup(const ip_addr_t * addr)
648  {
649  
650  	if (IP_IS_V4(addr))
651  		return route_lookup_v4(ip_2_ip4(addr));
652  	else
653  		return route_lookup_v6(ip_2_ip6(addr));
654  }
655  
656  /*
657   * Change an existing routing entry.  Its flags are always updated to the new
658   * set of given flags, although certain flags are always preserved.  If the
659   * new flags set has RTF_GATEWAY set and 'gateway' is not NULL, update the
660   * gateway associated with the route.  If 'ifdev' is not NULL, reassociate the
661   * route with the given interface; this will not affect the zone of the
662   * route's destination address.  On success, return OK, and also announce the
663   * change.  On failure, return a negative error code.
664   */
665  static int
666  route_change(struct route_entry * route, const ip_addr_t * gateway,
667  	struct ifdev * ifdev, unsigned int flags,
668  	const struct rtsock_request * rtr)
669  {
670  	unsigned int tree, preserve;
671  
672  	tree = (route->re_flags & RTF_IPV6) ? ROUTE_TREE_V6 : ROUTE_TREE_V4;
673  
674  	/* Update the associated interface (only) if a new one is given. */
675  	if (ifdev != NULL)
676  		route->re_ifdev = ifdev;
677  
678  	/*
679  	 * These flags may not be changed.  RTF_UP should always be set anyway.
680  	 * RTF_HOST and RTF_IPV6 are part of the route's identity.  RTF_LOCAL
681  	 * should be preserved as well, although we will not get here if either
682  	 * the old or the new flags have it set anyway.
683  	 */
684  	preserve = RTF_UP | RTF_HOST | RTF_IPV6 | RTF_LOCAL;
685  
686  	/* Always update the flags.  There is no way not to. */
687  	route->re_flags = (route->re_flags & preserve) | (flags & ~preserve);
688  
689  	/*
690  	 * If a new gateway is given *and* RTF_GATEWAY is set, update the
691  	 * gateway.  If RTF_GATEWAY is not set, this is a link-type route with
692  	 * no gateway.  If no new gateway is given, we keep the gateway as is.
693  	 */
694  	if (gateway != NULL && (flags & RTF_GATEWAY)) {
695  		if (flags & RTF_IPV6)
696  			ip6_addr_copy_to_packed(route->re_gw6,
697  			    *ip_2_ip6(gateway));
698  		else
699  			ip4_addr_copy(route->re_gw4, *ip_2_ip4(gateway));
700  	}
701  
702  	/* We have made routing changes. */
703  	route_updated(tree);
704  
705  	/* Announce the route change. */
706  	rtsock_msg_route(route, RTM_CHANGE, rtr);
707  
708  	return OK;
709  }
710  
711  /*
712   * Delete the given route, and announce its deletion.
713   */
714  void
715  route_delete(struct route_entry * route, const struct rtsock_request * rtr)
716  {
717  	unsigned int tree;
718  
719  	/* First announce the deletion, while the route is still around. */
720  	tree = (route->re_flags & RTF_IPV6) ? ROUTE_TREE_V6 : ROUTE_TREE_V4;
721  
722  	rtsock_msg_route(route, RTM_DELETE, rtr);
723  
724  	/* Then actually delete the route. */
725  	rttree_delete(&route_tree[tree], &route->re_entry);
726  
727  	SIMPLEQ_INSERT_HEAD(&route_freelist, route, re_next);
728  
729  	/* We have made routing changes. */
730  	route_updated(tree);
731  }
732  
733  /*
734   * Delete all routes associated with the given interface, typically as part of
735   * destroying the interface.
736   */
737  void
738  route_clear(struct ifdev * ifdev)
739  {
740  	struct rttree_entry *entry, *parent;
741  	struct route_entry *route;
742  	unsigned int tree;
743  
744  	/*
745  	 * Delete all routes associated with the given interface.  Fortunately,
746  	 * we need not also delete addresses zoned to the given interface,
747  	 * because no route can be created with a zone ID that does not match
748  	 * the associated interface.  That is the main reason why we ignore
749  	 * zone IDs for gateways when adding or changing routes..
750  	 */
751  	for (tree = 0; tree < NR_ROUTE_TREE; tree++) {
752  		parent = NULL;
753  
754  		while ((entry = rttree_enum(&route_tree[tree],
755  		    parent)) != NULL) {
756  			route = (struct route_entry *)entry;
757  
758  			if (route->re_ifdev == ifdev)
759  				route_delete(route, NULL /*request*/);
760  			else
761  				parent = entry;
762  		}
763  	}
764  }
765  
766  /*
767   * Process a routing command specifically for an IPv4 or IPv6 route, as one of
768   * the specific continuations of processing started by route_process().  The
769   * RTM_ routing command is given as 'type'.  The route destination is given as
770   * 'dst_addr'; its address type determines whether the operation is for IPv4 or
771   * IPv6.  The sockaddr structures for 'mask' and 'gateway' are passed on as is
772   * and may have to be parsed here if not NULL.  'ifdev' is the interface to be
773   * associated with a route; it is non-NULL only if an interface name (IFP) or
774   * address (IFA) was given.  The RTF_ flags field 'flags' has been checked
775   * against the globally supported flags, but may have to be checked for flags
776   * that do not apply to IPv4/IPv6 routes.  Return OK or a negative error code,
777   * following the same semantics as route_process().
778   */
779  static int
780  route_process_inet(unsigned int type, const ip_addr_t * dst_addr,
781  	const struct sockaddr * mask, const struct sockaddr * gateway,
782  	struct ifdev * ifdev, unsigned int flags,
783  	const struct rtsock_request * rtr)
784  {
785  	struct route_entry *route;
786  	ip_addr_t gw_storage, *gw_addr;
787  	struct ifdev *ifdev2;
788  	uint32_t zone;
789  	unsigned int prefix;
790  	int r;
791  
792  	assert(!(flags & RTF_LLDATA));
793  
794  	if ((flags & (RTF_DYNAMIC | RTF_MODIFIED | RTF_DONE | RTF_XRESOLVE |
795  	    RTF_LLINFO | RTF_CLONED | RTF_SRC | RTF_ANNOUNCE |
796  	    RTF_BROADCAST)) != 0)
797  		return EINVAL;
798  
799  	/*
800  	 * For network entries, a network mask must be provided in all cases.
801  	 * For host entries, the network mask is ignored, and we use a prefix
802  	 * with all bits set.
803  	 */
804  	if (!(flags & RTF_HOST)) {
805  		if (mask == NULL)
806  			return EINVAL;
807  
808  		if ((r = addr_get_netmask(mask, mask->sa_len,
809  		    IP_GET_TYPE(dst_addr), &prefix, NULL /*ipaddr*/)) != OK)
810  			return r;
811  	} else {
812  		if (IP_IS_V4(dst_addr))
813  			prefix = IP4_BITS;
814  		else
815  			prefix = IP6_BITS;
816  	}
817  
818  	gw_addr = NULL;
819  
820  	/*
821  	 * Determine the gateway and interface for the routing entry, if
822  	 * applicable.
823  	 */
824  	if (type == RTM_ADD || type == RTM_CHANGE) {
825  		/*
826  		 * The RTF_UP flag must always be set, but only if the flags
827  		 * field is used at all.
828  		 */
829  		if (!(flags & RTF_UP))
830  			return EINVAL;
831  
832  		if ((flags & RTF_GATEWAY) && gateway != NULL) {
833  			if ((r = addr_get_inet(gateway, gateway->sa_len,
834  			    IP_GET_TYPE(dst_addr), &gw_storage, TRUE /*kame*/,
835  			    NULL /*port*/)) != OK)
836  				return r;
837  
838  			gw_addr = &gw_storage;
839  
840  			/*
841  			 * We use the zone of the gateway to help determine the
842  			 * interface, but we do not reject a mismatching zone
843  			 * here.  The reason for this is that we do not want
844  			 * routes that have zones for an interface other than
845  			 * the one associated with the route, as that could
846  			 * create a world of trouble: packets leaving their
847  			 * zone, complications with cleaning up interfaces..
848  			 */
849  			if (IP_IS_V6(gw_addr) &&
850  			    ip6_addr_has_zone(ip_2_ip6(gw_addr))) {
851  				zone = ip6_addr_zone(ip_2_ip6(gw_addr));
852  
853  				ifdev2 = ifdev_get_by_index(zone);
854  
855  				if (ifdev != NULL && ifdev != ifdev2)
856  					return EINVAL;
857  				else
858  					ifdev = ifdev2;
859  			}
860  
861  			/*
862  			 * If we still have no interface at this point, see if
863  			 * we can find one based on just the gateway address.
864  			 * See if a locally attached network owns the address.
865  			 * That may not succeed, leaving ifdev set to NULL.
866  			 */
867  			if (ifdev == NULL)
868  				ifdev = ifaddr_map_by_subnet(gw_addr);
869  		}
870  
871  		/*
872  		 * When adding routes, all necessary information must be given.
873  		 * When changing routes, we can leave some settings as is.
874  		 */
875  		if (type == RTM_ADD) {
876  			if ((flags & RTF_GATEWAY) && gw_addr == NULL)
877  				return EINVAL;
878  
879  			/* TODO: try harder to find a matching interface.. */
880  			if (ifdev == NULL)
881  				return ENETUNREACH;
882  		}
883  	}
884  
885  	/*
886  	 * All route commands except RTM_ADD require that a route exists for
887  	 * the given identity, although RTM_GET, when requesting a host entry,
888  	 * may return a wider (network) route based on just the destination
889  	 * address.
890  	 */
891  	if (type != RTM_ADD) {
892  		/* For RTM_GET (only), a host query may return a net route. */
893  		if (type == RTM_GET && (flags & RTF_HOST))
894  			route = route_lookup(dst_addr);
895  		else
896  			route = route_find(dst_addr, prefix,
897  			    !!(flags & RTF_HOST));
898  
899  		if (route == NULL)
900  			return ESRCH;
901  	} else
902  		route = NULL;
903  
904  	/* Process the actual routing command. */
905  	switch (type) {
906  	case RTM_ADD:
907  		return route_add(dst_addr, prefix, gw_addr, ifdev, flags, rtr);
908  
909  	case RTM_CHANGE:
910  		/* Routes for local addresses are immutable. */
911  		if (route_is_immutable(route))
912  			return EPERM;
913  
914  		return route_change(route, gw_addr, ifdev, flags, rtr);
915  
916  	case RTM_DELETE:
917  		/* Routes for local addresses are immutable. */
918  		if (route_is_immutable(route))
919  			return EPERM;
920  
921  		route_delete(route, rtr);
922  
923  		return OK;
924  
925  	case RTM_LOCK:
926  		/*
927  		 * TODO: implement even the suggestion that we support this.
928  		 * For now, we do not keep per-route metrics, let alone change
929  		 * them dynamically ourselves, so "locking" metrics is really
930  		 * not a concept that applies to us.  We may however have to
931  		 * save the lock mask and return it in queries..
932  		 */
933  		/* FALLTHROUGH */
934  	case RTM_GET:
935  		/* Simply generate a message for the route we just found. */
936  		rtsock_msg_route(route, type, rtr);
937  
938  		return OK;
939  
940  	default:
941  		return EINVAL;
942  	}
943  }
944  
945  /*
946   * Process a routing command from a routing socket.  The RTM_ type of command
947   * is given as 'type', and is one of RTM_ADD, RTM_CHANGE, RTM_DELETE, RTM_GET,
948   * RTM_LOCK.  In addition, the function takes a set of sockaddr pointers as
949   * provided by the routing command.  Each of these sockaddr pointers may be
950   * NULL; if not NULL, the structure is at least large enough to contain the
951   * address length (sa_len) and family (sa_family), and the length never exceeds
952   * the amount of memory used to store the sockaddr structure.  However, the
953   * length itself has not yet been checked against the expected protocol
954   * structure and could even be zero.  The command's RTF_ routing flags and
955   * metrics are provided as well.  On success, return OK, in which case the
956   * caller assumes that a routing socket announcement for the processed command
957   * has been sent already (passing on 'rtr' to the announcement function as is).
958   * On failure, return a negative error code; in that case, the caller will send
959   * a failure response on the original routing socket itself.
960   */
961  int
962  route_process(unsigned int type, const struct sockaddr * dst,
963  	const struct sockaddr * mask, const struct sockaddr * gateway,
964  	const struct sockaddr * ifp, const struct sockaddr * ifa,
965  	unsigned int flags, unsigned long inits,
966  	const struct rt_metrics * rmx, const struct rtsock_request * rtr)
967  {
968  	struct ifdev *ifdev, *ifdev2;
969  	char name[IFNAMSIZ];
970  	ip_addr_t dst_addr, if_addr;
971  	uint32_t zone;
972  	uint8_t addr_type;
973  	int r;
974  
975  	/*
976  	 * The identity of a route is determined by its destination address,
977  	 * destination zone, prefix length, and whether it is a host entry
978  	 * or not.  If it is a host entry (RTF_HOST is set), the prefix length
979  	 * is implied by the protocol; otherwise it should be obtained from the
980  	 * given netmask if necessary.  For link-local addresses, the zone ID
981  	 * must be embedded KAME-style in the destination address.  A
982  	 * destination address must always be given.  The destination address
983  	 * also determines the overall address family.
984  	 */
985  	if (dst == NULL)
986  		return EINVAL;
987  
988  	switch (dst->sa_family) {
989  	case AF_INET:
990  		addr_type = IPADDR_TYPE_V4;
991  		break;
992  #ifdef INET6
993  	case AF_INET6:
994  		addr_type = IPADDR_TYPE_V6;
995  		break;
996  #endif /* INET6 */
997  	default:
998  		return EAFNOSUPPORT;
999  	}
1000  
1001  	if ((r = addr_get_inet(dst, dst->sa_len, addr_type, &dst_addr,
1002  	    TRUE /*kame*/, NULL /*port*/)) != OK)
1003  		return r;
1004  
1005  	/*
1006  	 * Perform a generic test on the given flags.  This covers everything
1007  	 * we support at all, plus a few flags we ignore.  Specific route types
1008  	 * may have further restrictions; those tests are performed later.
1009  	 */
1010  	if ((flags & ~(RTF_UP | RTF_GATEWAY | RTF_HOST | RTF_REJECT |
1011  	    RTF_CLONING | RTF_LLINFO | RTF_LLDATA | RTF_STATIC |
1012  	    RTF_BLACKHOLE | RTF_CLONED | RTF_PROTO2 | RTF_PROTO1)) != 0)
1013  		return EINVAL;
1014  
1015  	ifdev = NULL;
1016  
1017  	if (type == RTM_ADD || type == RTM_CHANGE) {
1018  		/*
1019  		 * If an interface address or name is given, use that to
1020  		 * identify the target interface.  If both are given, make sure
1021  		 * that both identify the same interface--a hopefully helpful
1022  		 * feature to detect wrong route(8) usage (NetBSD simply takes
1023  		 * IFP over IFA).  An empty interface name is ignored on the
1024  		 * basis that libc link_addr(3) is broken.
1025  		 */
1026  		if (ifp != NULL) {
1027  			if ((r = addr_get_link(ifp, ifp->sa_len, name,
1028  			    sizeof(name), NULL /*hwaddr*/,
1029  			    0 /*hwaddr_len*/)) != OK)
1030  				return r;
1031  
1032  			if (name[0] != '\0' &&
1033  			    (ifdev = ifdev_find_by_name(name)) == NULL)
1034  				return ENXIO;
1035  		}
1036  
1037  		if (ifa != NULL) {
1038  			/*
1039  			 * This is similar to retrieval of source addresses in
1040  			 * ipsock, with the difference that we do not impose
1041  			 * that a zone ID be given for link-local addresses.
1042  			 */
1043  			if ((r = addr_get_inet(ifa, ifa->sa_len, addr_type,
1044  			    &if_addr, TRUE /*kame*/, NULL /*port*/)) != OK)
1045  				return r;
1046  
1047  			if ((ifdev2 = ifaddr_map_by_addr(&if_addr)) == NULL)
1048  				return EADDRNOTAVAIL;
1049  
1050  			if (ifdev != NULL && ifdev != ifdev2)
1051  				return EINVAL;
1052  			else
1053  				ifdev = ifdev2;
1054  		}
1055  
1056  		/*
1057  		 * If the destination address has a zone, then it must not
1058  		 * conflict with the interface, if one was given.  If not, we
1059  		 * may use it to decide the interface to use for the route.
1060  		 */
1061  		if (IP_IS_V6(&dst_addr) &&
1062  		    ip6_addr_has_zone(ip_2_ip6(&dst_addr))) {
1063  			if (ifdev == NULL) {
1064  				zone = ip6_addr_zone(ip_2_ip6(&dst_addr));
1065  
1066  				ifdev = ifdev_get_by_index(zone);
1067  			} else {
1068  				if (!ip6_addr_test_zone(ip_2_ip6(&dst_addr),
1069  				    ifdev_get_netif(ifdev)))
1070  					return EADDRNOTAVAIL;
1071  			}
1072  		}
1073  	}
1074  
1075  	/*
1076  	 * For now, no initializers are supported by any of the sub-processing
1077  	 * routines, so outright reject requests that set any initializers.
1078  	 * Most importantly, we do not support per-route MTU settings (RTV_MTU)
1079  	 * because lwIP would not use them, and we do not support non-zero
1080  	 * expiry (RTV_EXPIRE) because for IPv4/IPv6 routes it is not a widely
1081  	 * used feature and for ARP/NDP we would have to change lwIP.
1082  	 * dhcpcd(8) does supply RTV_MTU, we have to ignore that option rather
1083  	 * than reject it, unfortunately.  arp(8) always sets RTV_EXPIRE, so we
1084  	 * reject only non-zero expiry there.
1085  	 */
1086  	if ((inits & ~(RTV_EXPIRE | RTV_MTU)) != 0 ||
1087  	    ((inits & RTV_EXPIRE) != 0 && rmx->rmx_expire != 0))
1088  		return ENOSYS;
1089  
1090  	/*
1091  	 * From here on, the processing differs for ARP, NDP, and IP routes.
1092  	 * As of writing, our userland is from NetBSD 7, which puts link-local
1093  	 * route entries in its main route tables.  This means we would have to
1094  	 * search for existing routes before we can determine whether, say, a
1095  	 * RTM_GET request is for an IP or an ARP route entry.  As of NetBSD 8,
1096  	 * the link-local administration is separated, and all requests use the
1097  	 * RTF_LLDATA flag to indicate that they are for ARP/NDP routes rather
1098  	 * than IP routes.  Since that change makes things much cleaner for us,
1099  	 * we borrow from the future, patching arp(8) and ndp(8) to add the
1100  	 * RTF_LLDATA flag now, so that we can implement a clean split here.
1101  	 */
1102  	if (!(flags & RTF_LLDATA))
1103  		return route_process_inet(type, &dst_addr, mask, gateway,
1104  		    ifdev, flags, rtr);
1105  	else
1106  		return lldata_process(type, &dst_addr, gateway, ifdev, flags,
1107  		    rtr);
1108  }
1109  
1110  /*
1111   * Return the routing flags (RTF_) for the given routing entry.  Strip out any
1112   * internal flags.
1113   */
1114  unsigned int
1115  route_get_flags(const struct route_entry * route)
1116  {
1117  
1118  	return route->re_flags & ~RTF_IPV6;
1119  }
1120  
1121  /*
1122   * Return TRUE if the given routing entry is for the IPv6 address family, or
1123   * FALSE if it is for IPv4.
1124   */
1125  int
1126  route_is_ipv6(const struct route_entry * route)
1127  {
1128  
1129  	return !!(route->re_flags & RTF_IPV6);
1130  }
1131  
1132  /*
1133   * Return the interface associated with the given routing entry.  The resulting
1134   * interface is never NULL.
1135   */
1136  struct ifdev *
1137  route_get_ifdev(const struct route_entry * route)
1138  {
1139  
1140  	return route->re_ifdev;
1141  }
1142  
1143  /*
1144   * Convert the given raw routing address pointed to by 'rtaddr' into a
1145   * lwIP-style IP address 'ipaddr' of type 'type', which must by IPADDR_TYPE_V4
1146   * or IPADDR_TYPE_V6.
1147   */
1148  static void
1149  route_get_addr(ip_addr_t * ipaddr, const uint8_t * rtaddr, uint8_t type)
1150  {
1151  	ip6_addr_t *ip6addr;
1152  	uint32_t val, zone;
1153  
1154  	/*
1155  	 * Convert the routing address to a lwIP-type IP address.  Take out the
1156  	 * KAME-style embedded zone, if needed.
1157  	 */
1158  	memset(ipaddr, 0, sizeof(*ipaddr));
1159  	IP_SET_TYPE(ipaddr, type);
1160  
1161  	switch (type) {
1162  	case IPADDR_TYPE_V4:
1163  		memcpy(&val, rtaddr, sizeof(val));
1164  
1165  		ip_addr_set_ip4_u32(ipaddr, val);
1166  
1167  		break;
1168  
1169  	case IPADDR_TYPE_V6:
1170  		ip6addr = ip_2_ip6(ipaddr);
1171  
1172  		memcpy(ip6addr->addr, rtaddr, sizeof(ip6addr->addr));
1173  
1174  		if (ip6_addr_has_scope(ip6addr, IP6_UNKNOWN)) {
1175  			zone = ntohl(ip6addr->addr[0]) & 0x0000ffffU;
1176  
1177  			ip6addr->addr[0] &= PP_HTONL(0xffff0000U);
1178  
1179  			ip6_addr_set_zone(ip6addr, zone);
1180  		}
1181  
1182  		break;
1183  
1184  	default:
1185  		panic("unknown IP address type: %u", type);
1186  	}
1187  }
1188  
1189  /*
1190   * Obtain information about an IPv4 or IPv6 routing entry, by filling 'addr',
1191   * 'mask', 'gateway', and optionally (if not NULL) 'ifp' and 'ifa' with
1192   * sockaddr-type data for each of those fields.  Also store the associated
1193   * interface in 'ifdevp', the routing entry's flags in 'flags', and the route's
1194   * usage count in 'use'.
1195   */
1196  void
1197  route_get(const struct route_entry * route, union sockaddr_any * addr,
1198  	union sockaddr_any * mask, union sockaddr_any * gateway,
1199  	union sockaddr_any * ifp, union sockaddr_any * ifa,
1200  	struct ifdev ** ifdevp, unsigned int * flags, unsigned int * use)
1201  {
1202  	const ip_addr_t *src_addr;
1203  	ip_addr_t dst_addr, gw_addr;
1204  	struct ifdev *ifdev;
1205  	socklen_t addr_len;
1206  	uint8_t type;
1207  
1208  	type = (route->re_flags & RTF_IPV6) ? IPADDR_TYPE_V6 : IPADDR_TYPE_V4;
1209  
1210  	/* Get the destination address. */
1211  	route_get_addr(&dst_addr, route->re_addr, type);
1212  
1213  	addr_len = sizeof(*addr);
1214  
1215  	addr_put_inet(&addr->sa, &addr_len, &dst_addr, TRUE /*kame*/,
1216  	    0 /*port*/);
1217  
1218  	/* Get the network mask, if applicable. */
1219  	if (!(route->re_flags & RTF_HOST)) {
1220  		addr_len = sizeof(*mask);
1221  
1222  		addr_put_netmask(&mask->sa, &addr_len, type,
1223  		    rttree_get_prefix(&route->re_entry));
1224  	} else
1225  		mask->sa.sa_len = 0;
1226  
1227  	/* Get the gateway, which may be an IP address or a local link. */
1228  	addr_len = sizeof(*gateway);
1229  
1230  	ifdev = route->re_ifdev;
1231  
1232  	if (route->re_flags & RTF_GATEWAY) {
1233  		if (type == IPADDR_TYPE_V4)
1234  			ip_addr_copy_from_ip4(gw_addr, route->re_gw4);
1235  		else
1236  			ip_addr_copy_from_ip6_packed(gw_addr, route->re_gw6);
1237  
1238  		addr_put_inet(&gateway->sa, &addr_len, &gw_addr, TRUE /*kame*/,
1239  		    0 /*port*/);
1240  	} else {
1241  		addr_put_link(&gateway->sa, &addr_len, ifdev_get_index(ifdev),
1242  		    ifdev_get_iftype(ifdev), NULL /*name*/, NULL /*hwaddr*/,
1243  		    0 /*hwaddr_len*/);
1244  	}
1245  
1246  	/* Get the associated interface name. */
1247  	if (ifp != NULL) {
1248  		addr_len = sizeof(*ifp);
1249  
1250  		addr_put_link(&ifp->sa, &addr_len, ifdev_get_index(ifdev),
1251  		    ifdev_get_iftype(ifdev), ifdev_get_name(ifdev),
1252  		    NULL /*hwaddr*/, 0 /*hwaddr_len*/);
1253  	}
1254  
1255  	/* Get the associated source address, if we can determine one. */
1256  	if (ifa != NULL) {
1257  		src_addr = ifaddr_select(&dst_addr, ifdev, NULL /*ifdevp*/);
1258  
1259  		if (src_addr != NULL) {
1260  			addr_len = sizeof(*ifa);
1261  
1262  			addr_put_inet(&ifa->sa, &addr_len, src_addr,
1263  			    TRUE /*kame*/, 0 /*port*/);
1264  		} else
1265  			ifa->sa.sa_len = 0;
1266  	}
1267  
1268  	/* Get other fields. */
1269  	*flags = route_get_flags(route);	/* strip any internal flags */
1270  	*ifdevp = ifdev;
1271  	*use = route->re_use;
1272  }
1273  
1274  /*
1275   * Enumerate IPv4 routing entries.  Return the first IPv4 routing entry if
1276   * 'last' is NULL, or the next routing entry after 'last' if it is not NULL.
1277   * In both cases, the return value may be NULL if there are no more routes.
1278   */
1279  struct route_entry *
1280  route_enum_v4(struct route_entry * last)
1281  {
1282  
1283  	assert(last == NULL || !(last->re_flags & RTF_IPV6));
1284  
1285  	return (struct route_entry *)rttree_enum(&route_tree[ROUTE_TREE_V4],
1286  	    (last != NULL) ? &last->re_entry : NULL);
1287  }
1288  
1289  /*
1290   * Enumerate IPv6 routing entries.  Return the first IPv6 routing entry if
1291   * 'last' is NULL, or the next routing entry after 'last' if it is not NULL.
1292   * In both cases, the return value may be NULL if there are no more routes.
1293   */
1294  struct route_entry *
1295  route_enum_v6(struct route_entry * last)
1296  {
1297  
1298  	assert(last == NULL || (last->re_flags & RTF_IPV6));
1299  
1300  	return (struct route_entry *)rttree_enum(&route_tree[ROUTE_TREE_V6],
1301  	    (last != NULL) ? &last->re_entry : NULL);
1302  }
1303  
1304  /*
1305   * lwIP IPv4 routing function.   Given an IPv4 destination address, look up and
1306   * return the target interface, or NULL if there is no route to the address.
1307   *
1308   * This is a full replacement of the corresponding lwIP function, which should
1309   * be overridden with weak symbols, using patches against the lwIP source code.
1310   * As such, the lwIP headers should already provide the correct prototype for
1311   * this function.  If not, something will have changed in the lwIP
1312   * implementation, and this code must be revised accordingly.
1313   */
1314  struct netif *
1315  ip4_route(const ip4_addr_t * dst)
1316  {
1317  	struct route_entry *route;
1318  	struct ifdev *ifdev;
1319  
1320  	/*
1321  	 * Look up the route for the destination IPv4 address.  If no route is
1322  	 * found at all, return NULL to the caller.
1323  	 */
1324  	if ((route = route_lookup_v4(dst)) == NULL) {
1325  		route_miss_v4(dst);
1326  
1327  		return NULL;
1328  	}
1329  
1330  	/*
1331  	 * For now, we increase the use counter only for actual route lookups,
1332  	 * and not for gateway lookups or user queries.  As of writing,
1333  	 * route(8) does not print this number anyway..
1334  	 */
1335  	route->re_use++;
1336  
1337  	/*
1338  	 * For all packets that are supposed to be rejected or blackholed, use
1339  	 * a loopback interface, regardless of the interface to which the route
1340  	 * is associated (even though it will typically be lo0 anyway).  The
1341  	 * reason for this is that on packet output, we perform another route
1342  	 * route lookup just to check for rejection/blackholing, but for
1343  	 * efficiency reasons, we limit such checks to loopback interfaces:
1344  	 * loopback traffic will typically use only one IP address anyway, thus
1345  	 * limiting route misses from such rejection/blackhole route lookups as
1346  	 * much as we can.  The lookup is implemented in route_output_v4().  We
1347  	 * divert only if the target interface is not a loopback interface
1348  	 * already, mainly to allow userland tests to create blackhole routes
1349  	 * to a specific loopback interface for testing purposes.
1350  	 *
1351  	 * It is not correct to return NULL for RTF_REJECT routes here, because
1352  	 * this could cause e.g. connect() calls to fail immediately, which is
1353  	 * not how rejection should work.  Related: a previous incarnation of
1354  	 * support for these flags used a dedicated netif to eliminate the
1355  	 * extra route lookup on regular output altogether, but in the current
1356  	 * situation, that netif would have to be assigned (IPv4 and IPv6)
1357  	 * addresses in order not to break e.g. connect() in the same way.
1358  	 */
1359  	if ((route->re_flags & (RTF_REJECT | RTF_BLACKHOLE)) &&
1360  	    !ifdev_is_loopback(route->re_ifdev))
1361  		ifdev = ifdev_get_loopback();
1362  	else
1363  		ifdev = route->re_ifdev;
1364  
1365  	return ifdev_get_netif(ifdev);
1366  }
1367  
1368  /*
1369   * lwIP IPv4 routing hook.  Since this hook is called only from lwIP's own
1370   * ip4_route() implementation, this hook must never fire.  If it does, either
1371   * something is wrong with overriding ip4_route(), or lwIP added other places
1372   * from which this hook is called.  Both cases are highly problematic and must
1373   * be resolved somehow, which is why we simply call panic() here.
1374   */
1375  struct netif *
1376  lwip_hook_ip4_route(const ip4_addr_t * dst)
1377  {
1378  
1379  	panic("IPv4 routing hook called - this should not happen!");
1380  }
1381  
1382  /*
1383   * lwIP IPv4 ARP gateway hook.
1384   */
1385  const ip4_addr_t *
1386  lwip_hook_etharp_get_gw(struct netif * netif, const ip4_addr_t * ip4addr)
1387  {
1388  	static ip4_addr_t gw_addr; /* may be returned to the caller */
1389  	struct route_entry *route;
1390  
1391  	/* Look up the route for the destination IP address. */
1392  	if ((route = route_lookup_v4(ip4addr)) == NULL)
1393  		return NULL;
1394  
1395  	/*
1396  	 * This case could only ever trigger as a result of lwIP taking its own
1397  	 * routing decisions instead of calling the IPv4 routing hook.  While
1398  	 * not impossible, such cases should be extremely rare.  We cannot
1399  	 * provide a meaningful gateway address in this case either, though.
1400  	 */
1401  	if (route->re_ifdev != netif_get_ifdev(netif)) {
1402  		printf("LWIP: unexpected interface for gateway lookup\n");
1403  
1404  		return NULL;
1405  	}
1406  
1407  	/*
1408  	 * If this route has a gateway, return the IP address of the gateway.
1409  	 * Otherwise, the route is for a local network, and we would typically
1410  	 * not get here because lwIP performs the local-network check itself.
1411  	 * It is possible that the local network consists of more than one IP
1412  	 * range, and the user has configured a route for the other range.  In
1413  	 * that case, return the IP address of the actual destination.
1414  	 *
1415  	 * We store a packed version of the IPv4 address, so reconstruct the
1416  	 * unpacked version to a static variable first - for consistency with
1417  	 * the IPv6 code.
1418  	 */
1419  	if (route->re_flags & RTF_GATEWAY) {
1420  		ip4_addr_copy(gw_addr, route->re_gw4);
1421  
1422  		return &gw_addr;
1423  	} else
1424  		return ip4addr;
1425  }
1426  
1427  /*
1428   * lwIP IPv6 routing function.   Given an IPv6 source and destination address,
1429   * look up and return the target interface, or NULL if there is no route to the
1430   * address.  Our routing algorithm is destination-based, meaning that the
1431   * source address must be considered only to resolve zone ambiguity.
1432   *
1433   * This is a full replacement of the corresponding lwIP function, which should
1434   * be overridden with weak symbols, using patches against the lwIP source code.
1435   * As such, the lwIP headers should already provide the correct prototype for
1436   * this function.  If not, something will have changed in the lwIP
1437   * implementation, and this code must be revised accordingly.
1438   */
1439  struct netif *
1440  ip6_route(const ip6_addr_t * src, const ip6_addr_t * dst)
1441  {
1442  	struct route_entry *route;
1443  	struct ifdev *ifdev;
1444  	ip6_addr_t dst_addr;
1445  	uint32_t zone;
1446  
1447  	assert(src != NULL);
1448  	assert(dst != NULL);
1449  
1450  	/*
1451  	 * If the destination address is scoped but has no zone, use the source
1452  	 * address to determine a zone, which we then set on the destination
1453  	 * address to find the route, if successful.  Obviously, the interface
1454  	 * is not going to be different from the zone, but we do need to check
1455  	 * other aspects of the route (e.g., one might want to null-route all
1456  	 * multicast traffic).  In the case that no source address is given at
1457  	 * all, first see if the destination address happens to be a locally
1458  	 * assigned address.  In theory this could yield multiple matches, so
1459  	 * pick the first one.  If not even that helps, we have absolutely
1460  	 * nothing we can use to refine route selection.  We could pick an
1461  	 * arbitrary interface in that case, but we currently don't.
1462  	 */
1463  	zone = IP6_NO_ZONE;
1464  
1465  	if (ip6_addr_lacks_zone(dst, IP6_UNKNOWN)) {
1466  		if (ip6_addr_has_zone(src))
1467  			zone = ip6_addr_zone(src);
1468  		else if (!ip6_addr_isany(src)) {
1469  			if ((ifdev = ifaddr_v6_map_by_addr(src)) == NULL)
1470  				return NULL; /* should never happen */
1471  			zone = ifdev_get_index(ifdev);
1472  		} else {
1473  			if ((ifdev = ifaddr_v6_map_by_addr(dst)) != NULL)
1474  				zone = ifdev_get_index(ifdev);
1475  			else
1476  				return NULL; /* TODO: try harder */
1477  		}
1478  
1479  		if (zone != IP6_NO_ZONE) {
1480  			dst_addr = *dst;
1481  
1482  			ip6_addr_set_zone(&dst_addr, zone);
1483  
1484  			dst = &dst_addr;
1485  		}
1486  	}
1487  
1488  	route = route_lookup_v6(dst);
1489  
1490  	/*
1491  	 * Look up the route for the destination IPv6 address.  If no route is
1492  	 * found at all, return NULL to the caller.
1493  	 */
1494  	if (route == NULL) {
1495  		/*
1496  		 * Since we rely on userland to create routes for on-link
1497  		 * prefixes and default routers, we do not have to call lwIP's
1498  		 * nd6_find_route() here.
1499  		 */
1500  
1501  		/* Generate an RTM_MISS message. */
1502  		route_miss_v6(dst);
1503  
1504  		return NULL;
1505  	}
1506  
1507  	/*
1508  	 * We have found a route based on the destination address.  If we did
1509  	 * not pick the destination address zone based on the source address,
1510  	 * we should now check for source address zone violations.  Note that
1511  	 * if even the destination address zone violates its target interface,
1512  	 * this case will be caught by route_lookup_v6().
1513  	 */
1514  	if (zone == IP6_NO_ZONE &&
1515  	    ifaddr_is_zone_mismatch(src, route->re_ifdev))
1516  		return NULL;
1517  
1518  	route->re_use++;
1519  
1520  	/*
1521  	 * See ip4_route() for an explanation of the use of loopback here.  For
1522  	 * the IPv6 case, the matching logic is in route_output_v6().
1523  	 */
1524  	if ((route->re_flags & (RTF_REJECT | RTF_BLACKHOLE)) &&
1525  	    !ifdev_is_loopback(route->re_ifdev))
1526  		ifdev = ifdev_get_loopback();
1527  	else
1528  		ifdev = route->re_ifdev;
1529  
1530  	/*
1531  	 * If the selected interface would cause the destination address to
1532  	 * leave its zone, fail route selection altogether.  This case may
1533  	 * trigger especially for reject routes, for which the interface change
1534  	 * to loopback may introduce a zone violation.
1535  	 */
1536  	if (ip6_addr_has_zone(dst) &&
1537  	    !ip6_addr_test_zone(dst, ifdev_get_netif(ifdev)))
1538  		return NULL;
1539  
1540  	return ifdev_get_netif(ifdev);
1541  }
1542  
1543  /*
1544   * lwIP IPv6 (source) routing hook.  Since this hook is called only from lwIP's
1545   * own ip6_route() implementation, this hook must never fire.  If it does,
1546   * either something is wrong with overriding ip6_route(), or lwIP added other
1547   * places from which this hook is called.  Both cases are highly problematic
1548   * and must be resolved somehow, which is why we simply call panic() here.
1549   */
1550  struct netif *
1551  lwip_hook_ip6_route(const ip6_addr_t * src, const ip6_addr_t * dst)
1552  {
1553  
1554  	panic("IPv6 routing hook called - this should not happen!");
1555  }
1556  
1557  /*
1558   * lwIP IPv6 ND6 gateway hook.
1559   */
1560  const ip6_addr_t *
1561  lwip_hook_nd6_get_gw(struct netif * netif, const ip6_addr_t * ip6addr)
1562  {
1563  	static ip6_addr_t gw_addr; /* may be returned to the caller */
1564  	struct route_entry *route;
1565  	struct ifdev *ifdev;
1566  
1567  	ifdev = netif_get_ifdev(netif);
1568  	assert(ifdev != NULL);
1569  
1570  	/* Look up the route for the destination IP address. */
1571  	if ((route = route_lookup_v6(ip6addr)) == NULL)
1572  		return NULL;
1573  
1574  	/* As for IPv4. */
1575  	if (route->re_ifdev != ifdev) {
1576  		printf("LWIP: unexpected interface for gateway lookup\n");
1577  
1578  		return NULL;
1579  	}
1580  
1581  	/*
1582  	 * We save memory by storing a packed (zoneless) version of the IPv6
1583  	 * gateway address.  That means we cannot return a pointer to it here.
1584  	 * Instead, we have to resort to expanding the address into a static
1585  	 * variable.  The caller will immediately make a copy anyway, though.
1586  	 */
1587  	if (route->re_flags & RTF_GATEWAY) {
1588  		ip6_addr_copy_from_packed(gw_addr, route->re_gw6);
1589  		ip6_addr_assign_zone(&gw_addr, IP6_UNKNOWN, netif);
1590  
1591  		return &gw_addr;
1592  	} else
1593  		return ip6addr;
1594  }
1595  
1596  /*
1597   * Check whether a packet is allowed to be sent to the given destination IPv4
1598   * address 'ipaddr' on the interface 'ifdev', according to route information.
1599   * Return TRUE if the packet should be sent.  Return FALSE if the packet should
1600   * be rejected or discarded, with 'err' set to the error to return to lwIP.
1601   */
1602  int
1603  route_output_v4(struct ifdev * ifdev, const ip4_addr_t * ipaddr, err_t * err)
1604  {
1605  	const struct route_entry *route;
1606  
1607  	/* See if we should reject/blackhole packets to this destination. */
1608  	if (ifdev_is_loopback(ifdev) &&
1609  	    (route = route_lookup_v4(ipaddr)) != NULL &&
1610  	    (route->re_flags & (RTF_REJECT | RTF_BLACKHOLE))) {
1611  		if (route->re_flags & RTF_REJECT)
1612  			*err = ERR_RTE;
1613  		else
1614  			*err = ERR_OK;
1615  
1616  		return FALSE;
1617  	}
1618  
1619  	return TRUE;
1620  }
1621  
1622  /*
1623   * Check whether a packet is allowed to be sent to the given destination IPv6
1624   * address 'ipaddr' on the interface 'ifdev', according to route information.
1625   * Return TRUE if the packet should be sent.  Return FALSE if the packet should
1626   * be rejected or discarded, with 'err' set to the error to return to lwIP.
1627   */
1628  int
1629  route_output_v6(struct ifdev * ifdev, const ip6_addr_t * ipaddr, err_t * err)
1630  {
1631  	const struct route_entry *route;
1632  
1633  	/* Do one more zone violation test, just in case.  It's cheap. */
1634  	if (ip6_addr_has_zone(ipaddr) &&
1635  	    !ip6_addr_test_zone(ipaddr, ifdev_get_netif(ifdev))) {
1636  		*err = ERR_RTE;
1637  
1638  		return FALSE;
1639  	}
1640  
1641  	/* See if we should reject/blackhole packets to this destination. */
1642  	if (ifdev_is_loopback(ifdev) &&
1643  	    (route = route_lookup_v6(ipaddr)) != NULL &&
1644  	    (route->re_flags & (RTF_REJECT | RTF_BLACKHOLE))) {
1645  		if (route->re_flags & RTF_REJECT)
1646  			*err = ERR_RTE;
1647  		else
1648  			*err = ERR_OK;
1649  
1650  		return FALSE;
1651  	}
1652  
1653  	return TRUE;
1654  }
1655