xref: /onnv-gate/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c (revision 0:68f95e015346)
1*0Sstevel@tonic-gate /*
2*0Sstevel@tonic-gate  * CDDL HEADER START
3*0Sstevel@tonic-gate  *
4*0Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
5*0Sstevel@tonic-gate  * Common Development and Distribution License, Version 1.0 only
6*0Sstevel@tonic-gate  * (the "License").  You may not use this file except in compliance
7*0Sstevel@tonic-gate  * with the License.
8*0Sstevel@tonic-gate  *
9*0Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10*0Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
11*0Sstevel@tonic-gate  * See the License for the specific language governing permissions
12*0Sstevel@tonic-gate  * and limitations under the License.
13*0Sstevel@tonic-gate  *
14*0Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
15*0Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16*0Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
17*0Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
18*0Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
19*0Sstevel@tonic-gate  *
20*0Sstevel@tonic-gate  * CDDL HEADER END
21*0Sstevel@tonic-gate  */
22*0Sstevel@tonic-gate /*
23*0Sstevel@tonic-gate  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
24*0Sstevel@tonic-gate  * Use is subject to license terms.
25*0Sstevel@tonic-gate  */
26*0Sstevel@tonic-gate 
27*0Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
28*0Sstevel@tonic-gate 
29*0Sstevel@tonic-gate #include "mpd_defs.h"
30*0Sstevel@tonic-gate #include "mpd_tables.h"
31*0Sstevel@tonic-gate 
32*0Sstevel@tonic-gate int debug = 0;				/* Debug flag */
33*0Sstevel@tonic-gate static int pollfd_num = 0;		/* Num. of poll descriptors */
34*0Sstevel@tonic-gate static struct pollfd *pollfds = NULL;	/* Array of poll descriptors */
35*0Sstevel@tonic-gate 
36*0Sstevel@tonic-gate 					/* All times below in ms */
37*0Sstevel@tonic-gate int	user_failure_detection_time;	/* user specified failure detection */
38*0Sstevel@tonic-gate 					/* time (fdt) */
39*0Sstevel@tonic-gate int	user_probe_interval;		/* derived from user specified fdt */
40*0Sstevel@tonic-gate 
41*0Sstevel@tonic-gate static int	rtsock_v4;		/* AF_INET routing socket */
42*0Sstevel@tonic-gate static int	rtsock_v6;		/* AF_INET6 routing socket */
43*0Sstevel@tonic-gate int	ifsock_v4 = -1;			/* IPv4 socket for ioctls  */
44*0Sstevel@tonic-gate int	ifsock_v6 = -1;			/* IPv6 socket for ioctls  */
45*0Sstevel@tonic-gate static int	lsock_v4;		/* Listen socket to detect mpathd */
46*0Sstevel@tonic-gate static int	lsock_v6;		/* Listen socket to detect mpathd */
47*0Sstevel@tonic-gate static int	mibfd = -1;		/* fd to get mib info */
48*0Sstevel@tonic-gate static boolean_t force_mcast = _B_FALSE; /* Only for test purposes */
49*0Sstevel@tonic-gate 
50*0Sstevel@tonic-gate boolean_t	full_scan_required = _B_FALSE;
51*0Sstevel@tonic-gate static uint_t	last_initifs_time;	/* Time when initifs was last run */
52*0Sstevel@tonic-gate static	char **argv0;			/* Saved for re-exec on SIGHUP */
53*0Sstevel@tonic-gate boolean_t handle_link_notifications = _B_TRUE;
54*0Sstevel@tonic-gate 
55*0Sstevel@tonic-gate static void	initlog(void);
56*0Sstevel@tonic-gate static void	run_timeouts(void);
57*0Sstevel@tonic-gate static void	initifs(void);
58*0Sstevel@tonic-gate static void	check_if_removed(struct phyint_instance *pii);
59*0Sstevel@tonic-gate static void	select_test_ifs(void);
60*0Sstevel@tonic-gate static void	ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len);
61*0Sstevel@tonic-gate static void	ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len);
62*0Sstevel@tonic-gate static void	router_add_v4(mib2_ipRouteEntry_t *rp1,
63*0Sstevel@tonic-gate     struct in_addr nexthop_v4);
64*0Sstevel@tonic-gate static void	router_add_v6(mib2_ipv6RouteEntry_t *rp1,
65*0Sstevel@tonic-gate     struct in6_addr nexthop_v6);
66*0Sstevel@tonic-gate static void	router_add_common(int af, char *ifname,
67*0Sstevel@tonic-gate     struct in6_addr nexthop);
68*0Sstevel@tonic-gate static void	init_router_targets();
69*0Sstevel@tonic-gate static void	cleanup(void);
70*0Sstevel@tonic-gate static int	setup_listener(int af);
71*0Sstevel@tonic-gate static void	check_config(void);
72*0Sstevel@tonic-gate static void	check_addr_unique(int af, char *name);
73*0Sstevel@tonic-gate static void	init_host_targets(void);
74*0Sstevel@tonic-gate static void	dup_host_targets(struct phyint_instance *desired_pii);
75*0Sstevel@tonic-gate static void	loopback_cmd(int sock, int family);
76*0Sstevel@tonic-gate static int	poll_remove(int fd);
77*0Sstevel@tonic-gate static boolean_t daemonize(void);
78*0Sstevel@tonic-gate static int	closefunc(void *, int);
79*0Sstevel@tonic-gate static unsigned int process_cmd(int newfd, union mi_commands *mpi);
80*0Sstevel@tonic-gate static unsigned int process_query(int fd, mi_query_t *miq);
81*0Sstevel@tonic-gate static unsigned int send_groupinfo(int fd, ipmp_groupinfo_t *grinfop);
82*0Sstevel@tonic-gate static unsigned int send_grouplist(int fd, ipmp_grouplist_t *grlistp);
83*0Sstevel@tonic-gate static unsigned int send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop);
84*0Sstevel@tonic-gate static unsigned int send_result(int fd, unsigned int error, int syserror);
85*0Sstevel@tonic-gate 
86*0Sstevel@tonic-gate /*
87*0Sstevel@tonic-gate  * Return the current time in milliseconds (from an arbitrary reference)
88*0Sstevel@tonic-gate  * truncated to fit into an int. Truncation is ok since we are interested
89*0Sstevel@tonic-gate  * only in differences and not the absolute values.
90*0Sstevel@tonic-gate  */
91*0Sstevel@tonic-gate uint_t
92*0Sstevel@tonic-gate getcurrenttime(void)
93*0Sstevel@tonic-gate {
94*0Sstevel@tonic-gate 	uint_t	cur_time;	/* In ms */
95*0Sstevel@tonic-gate 
96*0Sstevel@tonic-gate 	/*
97*0Sstevel@tonic-gate 	 * Use of a non-user-adjustable source of time is
98*0Sstevel@tonic-gate 	 * required. However millisecond precision is sufficient.
99*0Sstevel@tonic-gate 	 * divide by 10^6
100*0Sstevel@tonic-gate 	 */
101*0Sstevel@tonic-gate 	cur_time = (uint_t)(gethrtime() / 1000000LL);
102*0Sstevel@tonic-gate 	return (cur_time);
103*0Sstevel@tonic-gate }
104*0Sstevel@tonic-gate 
105*0Sstevel@tonic-gate /*
106*0Sstevel@tonic-gate  * Add fd to the set being polled. Returns 0 if ok; -1 if failed.
107*0Sstevel@tonic-gate  */
108*0Sstevel@tonic-gate int
109*0Sstevel@tonic-gate poll_add(int fd)
110*0Sstevel@tonic-gate {
111*0Sstevel@tonic-gate 	int i;
112*0Sstevel@tonic-gate 	int new_num;
113*0Sstevel@tonic-gate 	struct pollfd *newfds;
114*0Sstevel@tonic-gate retry:
115*0Sstevel@tonic-gate 	/* Check if already present */
116*0Sstevel@tonic-gate 	for (i = 0; i < pollfd_num; i++) {
117*0Sstevel@tonic-gate 		if (pollfds[i].fd == fd)
118*0Sstevel@tonic-gate 			return (0);
119*0Sstevel@tonic-gate 	}
120*0Sstevel@tonic-gate 	/* Check for empty spot already present */
121*0Sstevel@tonic-gate 	for (i = 0; i < pollfd_num; i++) {
122*0Sstevel@tonic-gate 		if (pollfds[i].fd == -1) {
123*0Sstevel@tonic-gate 			pollfds[i].fd = fd;
124*0Sstevel@tonic-gate 			return (0);
125*0Sstevel@tonic-gate 		}
126*0Sstevel@tonic-gate 	}
127*0Sstevel@tonic-gate 
128*0Sstevel@tonic-gate 	/* Allocate space for 32 more fds and initialize to -1 */
129*0Sstevel@tonic-gate 	new_num = pollfd_num + 32;
130*0Sstevel@tonic-gate 	newfds = realloc(pollfds, new_num * sizeof (struct pollfd));
131*0Sstevel@tonic-gate 	if (newfds == NULL) {
132*0Sstevel@tonic-gate 		logperror("poll_add: realloc");
133*0Sstevel@tonic-gate 		return (-1);
134*0Sstevel@tonic-gate 	}
135*0Sstevel@tonic-gate 	for (i = pollfd_num; i < new_num; i++) {
136*0Sstevel@tonic-gate 		newfds[i].fd = -1;
137*0Sstevel@tonic-gate 		newfds[i].events = POLLIN;
138*0Sstevel@tonic-gate 	}
139*0Sstevel@tonic-gate 	pollfd_num = new_num;
140*0Sstevel@tonic-gate 	pollfds = newfds;
141*0Sstevel@tonic-gate 	goto retry;
142*0Sstevel@tonic-gate }
143*0Sstevel@tonic-gate 
144*0Sstevel@tonic-gate /*
145*0Sstevel@tonic-gate  * Remove fd from the set being polled. Returns 0 if ok; -1 if failed.
146*0Sstevel@tonic-gate  */
147*0Sstevel@tonic-gate static int
148*0Sstevel@tonic-gate poll_remove(int fd)
149*0Sstevel@tonic-gate {
150*0Sstevel@tonic-gate 	int i;
151*0Sstevel@tonic-gate 
152*0Sstevel@tonic-gate 	/* Check if already present */
153*0Sstevel@tonic-gate 	for (i = 0; i < pollfd_num; i++) {
154*0Sstevel@tonic-gate 		if (pollfds[i].fd == fd) {
155*0Sstevel@tonic-gate 			pollfds[i].fd = -1;
156*0Sstevel@tonic-gate 			return (0);
157*0Sstevel@tonic-gate 		}
158*0Sstevel@tonic-gate 	}
159*0Sstevel@tonic-gate 	return (-1);
160*0Sstevel@tonic-gate }
161*0Sstevel@tonic-gate 
162*0Sstevel@tonic-gate /*
163*0Sstevel@tonic-gate  * Extract information about the phyint instance. If the phyint instance still
164*0Sstevel@tonic-gate  * exists in the kernel then set pii_in_use, else clear it. check_if_removed()
165*0Sstevel@tonic-gate  * will use it to detect phyint instances that don't exist any longer and
166*0Sstevel@tonic-gate  * remove them, from our database of phyint instances.
167*0Sstevel@tonic-gate  * Return value:
168*0Sstevel@tonic-gate  *	returns true if the phyint instance exists in the kernel,
169*0Sstevel@tonic-gate  *	returns false otherwise
170*0Sstevel@tonic-gate  */
171*0Sstevel@tonic-gate static boolean_t
172*0Sstevel@tonic-gate pii_process(int af, char *name, struct phyint_instance **pii_p)
173*0Sstevel@tonic-gate {
174*0Sstevel@tonic-gate 	int err;
175*0Sstevel@tonic-gate 	struct phyint_instance *pii;
176*0Sstevel@tonic-gate 	struct phyint_instance *pii_other;
177*0Sstevel@tonic-gate 
178*0Sstevel@tonic-gate 	if (debug & D_PHYINT)
179*0Sstevel@tonic-gate 		logdebug("pii_process(%s %s)\n", AF_STR(af), name);
180*0Sstevel@tonic-gate 
181*0Sstevel@tonic-gate 	pii = phyint_inst_lookup(af, name);
182*0Sstevel@tonic-gate 	if (pii == NULL) {
183*0Sstevel@tonic-gate 		/*
184*0Sstevel@tonic-gate 		 * Phyint instance does not exist in our tables,
185*0Sstevel@tonic-gate 		 * create new phyint instance
186*0Sstevel@tonic-gate 		 */
187*0Sstevel@tonic-gate 		pii = phyint_inst_init_from_k(af, name);
188*0Sstevel@tonic-gate 	} else {
189*0Sstevel@tonic-gate 		/* Phyint exists in our tables */
190*0Sstevel@tonic-gate 		err = phyint_inst_update_from_k(pii);
191*0Sstevel@tonic-gate 
192*0Sstevel@tonic-gate 		switch (err) {
193*0Sstevel@tonic-gate 		case PI_IOCTL_ERROR:
194*0Sstevel@tonic-gate 			/* Some ioctl error. don't change anything */
195*0Sstevel@tonic-gate 			pii->pii_in_use = 1;
196*0Sstevel@tonic-gate 			break;
197*0Sstevel@tonic-gate 
198*0Sstevel@tonic-gate 		case PI_GROUP_CHANGED:
199*0Sstevel@tonic-gate 			/*
200*0Sstevel@tonic-gate 			 * The phyint has changed group.
201*0Sstevel@tonic-gate 			 */
202*0Sstevel@tonic-gate 			restore_phyint(pii->pii_phyint);
203*0Sstevel@tonic-gate 			/* FALLTHRU */
204*0Sstevel@tonic-gate 
205*0Sstevel@tonic-gate 		case PI_IFINDEX_CHANGED:
206*0Sstevel@tonic-gate 			/*
207*0Sstevel@tonic-gate 			 * Interface index has changed. Delete and
208*0Sstevel@tonic-gate 			 * recreate the phyint as it is quite likely
209*0Sstevel@tonic-gate 			 * the interface has been unplumbed and replumbed.
210*0Sstevel@tonic-gate 			 */
211*0Sstevel@tonic-gate 			pii_other = phyint_inst_other(pii);
212*0Sstevel@tonic-gate 			if (pii_other != NULL)
213*0Sstevel@tonic-gate 				phyint_inst_delete(pii_other);
214*0Sstevel@tonic-gate 			phyint_inst_delete(pii);
215*0Sstevel@tonic-gate 			pii = phyint_inst_init_from_k(af, name);
216*0Sstevel@tonic-gate 			break;
217*0Sstevel@tonic-gate 
218*0Sstevel@tonic-gate 		case PI_DELETED:
219*0Sstevel@tonic-gate 			/* Phyint instance has disappeared from kernel */
220*0Sstevel@tonic-gate 			pii->pii_in_use = 0;
221*0Sstevel@tonic-gate 			break;
222*0Sstevel@tonic-gate 
223*0Sstevel@tonic-gate 		case PI_OK:
224*0Sstevel@tonic-gate 			/* Phyint instance exists and is fine */
225*0Sstevel@tonic-gate 			pii->pii_in_use = 1;
226*0Sstevel@tonic-gate 			break;
227*0Sstevel@tonic-gate 
228*0Sstevel@tonic-gate 		default:
229*0Sstevel@tonic-gate 			/* Unknown status */
230*0Sstevel@tonic-gate 			logerr("pii_process: Unknown status %d\n", err);
231*0Sstevel@tonic-gate 			break;
232*0Sstevel@tonic-gate 		}
233*0Sstevel@tonic-gate 	}
234*0Sstevel@tonic-gate 
235*0Sstevel@tonic-gate 	*pii_p = pii;
236*0Sstevel@tonic-gate 	if (pii != NULL)
237*0Sstevel@tonic-gate 		return (pii->pii_in_use ? _B_TRUE : _B_FALSE);
238*0Sstevel@tonic-gate 	else
239*0Sstevel@tonic-gate 		return (_B_FALSE);
240*0Sstevel@tonic-gate }
241*0Sstevel@tonic-gate 
242*0Sstevel@tonic-gate /*
243*0Sstevel@tonic-gate  * This phyint is leaving the group. Try to restore the phyint to its
244*0Sstevel@tonic-gate  * initial state. Return the addresses that belong to other group members,
245*0Sstevel@tonic-gate  * to the group, and take back any addresses owned by this phyint
246*0Sstevel@tonic-gate  */
247*0Sstevel@tonic-gate void
248*0Sstevel@tonic-gate restore_phyint(struct phyint *pi)
249*0Sstevel@tonic-gate {
250*0Sstevel@tonic-gate 	if (pi->pi_group == phyint_anongroup)
251*0Sstevel@tonic-gate 		return;
252*0Sstevel@tonic-gate 
253*0Sstevel@tonic-gate 	/*
254*0Sstevel@tonic-gate 	 * Move everthing to some other member in the group.
255*0Sstevel@tonic-gate 	 * The phyint has changed group in the kernel. But we
256*0Sstevel@tonic-gate 	 * have yet to do it in our tables.
257*0Sstevel@tonic-gate 	 */
258*0Sstevel@tonic-gate 	if (!pi->pi_empty)
259*0Sstevel@tonic-gate 		(void) try_failover(pi, FAILOVER_TO_ANY);
260*0Sstevel@tonic-gate 	/*
261*0Sstevel@tonic-gate 	 * Move all addresses owned by 'pi' back to pi, from each
262*0Sstevel@tonic-gate 	 * of the other members of the group
263*0Sstevel@tonic-gate 	 */
264*0Sstevel@tonic-gate 	(void) try_failback(pi, _B_FALSE);
265*0Sstevel@tonic-gate }
266*0Sstevel@tonic-gate 
/*
 * Scan all interfaces to detect changes as well as new and deleted interfaces.
 *
 * The scan proceeds in this order:
 *   1. record the scan time in last_initifs_time;
 *   2. clear the in_use mark on every known phyint instance and logint;
 *   3. fetch the interface list from the kernel (SIOCGLIFNUM followed by
 *	SIOCGLIFCONF) and run pii_process(), logint_init_from_k() and
 *	check_addr_unique() over each entry;
 *   4. log a recovery message for test addresses that are unique again;
 *   5. call check_if_removed() on every phyint instance;
 *   6. call select_test_ifs() and process_link_state_changes();
 *   7. try/retry pending failovers and failbacks on each phyint.
 *
 * If either ioctl or the buffer allocation fails the function returns right
 * after step 2, without running steps 3-7.
 */
static void
initifs()
{
	int	n;
	int	af;
	char	*cp;
	char	*buf;
	int	numifs;
	struct lifnum	lifn;
	struct lifconf	lifc;
	struct lifreq	*lifr;
	struct logint	*li;
	struct phyint_instance *pii;
	struct phyint_instance *next_pii;
	char	pi_name[LIFNAMSIZ + 1];
	boolean_t exists;
	struct phyint	*pi;

	if (debug & D_PHYINT)
		logdebug("initifs: Scanning interfaces\n");

	last_initifs_time = getcurrenttime();

	/*
	 * Mark the interfaces so that we can find phyints and logints
	 * which have disappeared from the kernel. pii_process() and
	 * logint_init_from_k() will set {pii,li}_in_use when they find
	 * the interface in the kernel. Also, clear dupaddr bit on probe
	 * logint. check_addr_unique() will set the dupaddr bit on the
	 * probe logint, if the test address is not unique.
	 */
	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
		pii->pii_in_use = 0;
		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
			li->li_in_use = 0;
			if (pii->pii_probe_logint == li)
				li->li_dupaddr = 0;
		}
	}

	/* Ask the kernel how many interfaces (of any family) there are */
	lifn.lifn_family = AF_UNSPEC;
	lifn.lifn_flags = 0;
	if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) {
		logperror("initifs: ioctl (get interface numbers)");
		return;
	}
	numifs = lifn.lifn_count;

	/* Buffer sized for exactly the number of interfaces just reported */
	buf = (char *)calloc(numifs, sizeof (struct lifreq));
	if (buf == NULL) {
		logperror("initifs: calloc");
		return;
	}

	lifc.lifc_family = AF_UNSPEC;
	lifc.lifc_flags = 0;
	lifc.lifc_len = numifs * sizeof (struct lifreq);
	lifc.lifc_buf = buf;

	if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) {
		/*
		 * EINVAL is commonly encountered, when things change
		 * underneath us rapidly, (eg. at boot, when new interfaces
		 * are plumbed successively) and the kernel finds the buffer
		 * size we passed as too small. We will retry again
		 * when we see the next routing socket msg, or at worst after
		 * IF_SCAN_INTERVAL ms.
		 */
		if (errno != EINVAL) {
			logperror("initifs: ioctl"
			    " (get interface configuration)");
		}
		free(buf);
		return;
	}

	lifr = (struct lifreq *)lifc.lifc_req;

	/*
	 * For each lifreq returned by SIOCGLIFCONF, call pii_process()
	 * and get the state of the corresponding phyint_instance. If it is
	 * successful, then call logint_init_from_k() to get the state of the
	 * logint.
	 */
	for (n = lifc.lifc_len / sizeof (struct lifreq); n > 0; n--, lifr++) {
		af = lifr->lifr_addr.ss_family;

		/*
		 * Need to pass a phyint name to pii_process. Insert the
		 * null where the ':' IF_SEPARATOR is found in the logical
		 * name.
		 */
		(void) strncpy(pi_name, lifr->lifr_name, sizeof (pi_name));
		pi_name[sizeof (pi_name) - 1] = '\0';
		if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL)
			*cp = '\0';

		exists = pii_process(af, pi_name, &pii);
		if (exists) {
			/* The phyint is fine. So process the logint */
			logint_init_from_k(pii, lifr->lifr_name);
		}
		/* Done for every entry, whether or not the phyint exists */
		check_addr_unique(af, lifr->lifr_name);
	}

	free(buf);

	/*
	 * If the test address is now unique, and if it was not unique
	 * previously, clear the li_dupaddrmsg_printed flag and log a
	 * recovery message
	 */
	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
		/* Note: this 'li' shadows the function-level 'li' */
		struct logint *li;
		char abuf[INET6_ADDRSTRLEN];

		li = pii->pii_probe_logint;
		if ((li != NULL) && !li->li_dupaddr &&
		    li->li_dupaddrmsg_printed) {
			logerr("Test address %s is unique; enabling probe-"
			    "based failure detection\n",
			    pr_addr(pii->pii_af, li->li_addr, abuf,
				sizeof (abuf)));
			li->li_dupaddrmsg_printed = 0;
		}
	}

	/*
	 * Scan for phyints and logints that have disappeared from the
	 * kernel, and delete them. The next pointer is fetched before the
	 * call because check_if_removed() may delete the current instance.
	 */
	pii = phyint_instances;

	while (pii != NULL) {
		next_pii = pii->pii_next;
		check_if_removed(pii);
		pii = next_pii;
	}

	/*
	 * Select a test address for sending probes on each phyint instance
	 */
	select_test_ifs();

	/*
	 * Handle link up/down notifications from the NICs.
	 */
	process_link_state_changes();

	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
		/*
		 * If this is a case of group failure, we don't have much
		 * to do until the group recovers again.
		 */
		if (GROUP_FAILED(pi->pi_group))
			continue;

		/*
		 * Try/Retry any pending failovers / failbacks, that did
		 * not complete, or that could not be initiated previously.
		 * This implements the 3 invariants described in the big block
		 * comment at the beginning of probe.c
		 */
		if (pi->pi_flags & IFF_INACTIVE) {
			if (!pi->pi_empty)
				(void) try_failover(pi, FAILOVER_TO_NONSTANDBY);
		} else {
			/* Note: this 'pii' shadows the function-level 'pii' */
			struct phyint_instance *pii;

			/*
			 * While the link is up, prefer the IPv4 instance and
			 * fall back to the IPv6 one if IPv4 is not probe
			 * capable; skip the phyint if neither is.
			 */
			pii = pi->pi_v4;
			if (LINK_UP(pi) && !PROBE_CAPABLE(pii))
				pii = pi->pi_v6;
			if (LINK_UP(pi) && !PROBE_CAPABLE(pii))
				continue;
			/*
			 * It is possible that the phyint has started
			 * receiving packets, after it has been marked
			 * PI_FAILED. Don't initiate failover, if the
			 * phyint has started recovering. failure_state()
			 * captures this check. A similar logic is used
			 * for failback/repair case.
			 */
			if (pi->pi_state == PI_FAILED && !pi->pi_empty &&
			    (failure_state(pii) == PHYINT_FAILURE)) {
				(void) try_failover(pi, FAILOVER_NORMAL);
			} else if (pi->pi_state == PI_RUNNING && !pi->pi_full) {
				if (try_failback(pi, _B_FALSE) !=
				    IPMP_FAILURE) {
					(void) change_lif_flags(pi, IFF_FAILED,
					    _B_FALSE);
					/* Per state diagram */
					pi->pi_empty = 0;
				}
			}
		}
	}
}
467*0Sstevel@tonic-gate 
468*0Sstevel@tonic-gate /*
469*0Sstevel@tonic-gate  * Check that test/probe addresses are always unique. link-locals and
470*0Sstevel@tonic-gate  * ptp unnumbered may not be unique, and bind to such an (IFF_NOFAILOVER)
471*0Sstevel@tonic-gate  * address can produce unexpected results. Log an error and alert the user.
472*0Sstevel@tonic-gate  */
473*0Sstevel@tonic-gate static void
474*0Sstevel@tonic-gate check_addr_unique(int af, char *name)
475*0Sstevel@tonic-gate {
476*0Sstevel@tonic-gate 	struct lifreq	lifr;
477*0Sstevel@tonic-gate 	struct phyint	*pi;
478*0Sstevel@tonic-gate 	struct in6_addr	addr;
479*0Sstevel@tonic-gate 	struct phyint_instance	*pii;
480*0Sstevel@tonic-gate 	struct sockaddr_in	*sin;
481*0Sstevel@tonic-gate 	struct sockaddr_in6	*sin6;
482*0Sstevel@tonic-gate 	int ifsock;
483*0Sstevel@tonic-gate 	char abuf[INET6_ADDRSTRLEN];
484*0Sstevel@tonic-gate 
485*0Sstevel@tonic-gate 	/* Get the socket for doing ioctls */
486*0Sstevel@tonic-gate 	ifsock = (af == AF_INET) ? ifsock_v4 : ifsock_v6;
487*0Sstevel@tonic-gate 
488*0Sstevel@tonic-gate 	(void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name));
489*0Sstevel@tonic-gate 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
490*0Sstevel@tonic-gate 	/*
491*0Sstevel@tonic-gate 	 * Get the address corresponding to 'name'. We cannot
492*0Sstevel@tonic-gate 	 * do a logint lookup in our tables, because, not all logints
493*0Sstevel@tonic-gate 	 * in the system are tracked by mpathd. (eg. things not in a group)
494*0Sstevel@tonic-gate 	 */
495*0Sstevel@tonic-gate 	if (ioctl(ifsock, SIOCGLIFADDR, (char *)&lifr) < 0) {
496*0Sstevel@tonic-gate 		if (errno == ENXIO) {
497*0Sstevel@tonic-gate 			/* Interface has vanished */
498*0Sstevel@tonic-gate 			return;
499*0Sstevel@tonic-gate 		} else {
500*0Sstevel@tonic-gate 			logperror("ioctl (get addr)");
501*0Sstevel@tonic-gate 			return;
502*0Sstevel@tonic-gate 		}
503*0Sstevel@tonic-gate 	}
504*0Sstevel@tonic-gate 
505*0Sstevel@tonic-gate 	if (af == AF_INET) {
506*0Sstevel@tonic-gate 		sin = (struct sockaddr_in *)&lifr.lifr_addr;
507*0Sstevel@tonic-gate 		IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &addr);
508*0Sstevel@tonic-gate 	} else {
509*0Sstevel@tonic-gate 		sin6 = (struct sockaddr_in6 *)&lifr.lifr_addr;
510*0Sstevel@tonic-gate 		addr = sin6->sin6_addr;
511*0Sstevel@tonic-gate 	}
512*0Sstevel@tonic-gate 
513*0Sstevel@tonic-gate 	/*
514*0Sstevel@tonic-gate 	 * Does the address 'addr' match any known test address ? If so
515*0Sstevel@tonic-gate 	 * it is a duplicate, unless we are looking at the same logint
516*0Sstevel@tonic-gate 	 */
517*0Sstevel@tonic-gate 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
518*0Sstevel@tonic-gate 		pii = PHYINT_INSTANCE(pi, af);
519*0Sstevel@tonic-gate 		if (pii == NULL || pii->pii_probe_logint == NULL)
520*0Sstevel@tonic-gate 			continue;
521*0Sstevel@tonic-gate 
522*0Sstevel@tonic-gate 		if (!IN6_ARE_ADDR_EQUAL(&addr,
523*0Sstevel@tonic-gate 		    &pii->pii_probe_logint->li_addr)) {
524*0Sstevel@tonic-gate 			continue;
525*0Sstevel@tonic-gate 		}
526*0Sstevel@tonic-gate 
527*0Sstevel@tonic-gate 		if (strncmp(pii->pii_probe_logint->li_name, name,
528*0Sstevel@tonic-gate 		    sizeof (pii->pii_probe_logint->li_name)) == 0) {
529*0Sstevel@tonic-gate 			continue;
530*0Sstevel@tonic-gate 		}
531*0Sstevel@tonic-gate 
532*0Sstevel@tonic-gate 		/*
533*0Sstevel@tonic-gate 		 * This test address is not unique. Set the dupaddr bit
534*0Sstevel@tonic-gate 		 */
535*0Sstevel@tonic-gate 		pii->pii_probe_logint->li_dupaddr = 1;
536*0Sstevel@tonic-gate 
537*0Sstevel@tonic-gate 		/*
538*0Sstevel@tonic-gate 		 * Log an error message if not already logged
539*0Sstevel@tonic-gate 		 */
540*0Sstevel@tonic-gate 		if (pii->pii_probe_logint->li_dupaddrmsg_printed)
541*0Sstevel@tonic-gate 			continue;
542*0Sstevel@tonic-gate 
543*0Sstevel@tonic-gate 		logerr("Test address %s is not unique; disabling "
544*0Sstevel@tonic-gate 		    "probe-based failure detection\n",
545*0Sstevel@tonic-gate 		    pr_addr(af, addr, abuf, sizeof (abuf)));
546*0Sstevel@tonic-gate 
547*0Sstevel@tonic-gate 		pii->pii_probe_logint->li_dupaddrmsg_printed = 1;
548*0Sstevel@tonic-gate 	}
549*0Sstevel@tonic-gate }
550*0Sstevel@tonic-gate 
/*
 * Requirements on the li_flags of the logint chosen as pii_probe_logint:
 *
 *	IFF_NOFAILOVER	- must be set (except in singleton group case)
 *	IFF_UP		- must be set
 *	IFF_NOXMIT	- must be clear
 *	IFF_NOLOCAL	- must be clear
 *	IFF_DEPRECATED	- preferably set (for IPv4)
 *
 * CLEAR_FLAG_SET and BEST_FLAG_SET name the flags that have to be clear and
 * the flags that are ideally set. Each TEST_* macro is the mask of flags to
 * examine; select_test_ifs() masks li_flags with TEST_BEST_FLAG_SET and
 * compares the result against BEST_FLAG_SET.
 */
#define	CLEAR_FLAG_SET	(IFF_NOLOCAL | IFF_NOXMIT)
#define	BEST_FLAG_SET	(IFF_DEPRECATED | IFF_UP | IFF_NOFAILOVER)
#define	TEST_CLEAR_FLAG_SET	CLEAR_FLAG_SET
#define	TEST_MINIMAL_FLAG_SET	(CLEAR_FLAG_SET | IFF_UP)
#define	TEST_BEST_FLAG_SET	(CLEAR_FLAG_SET | BEST_FLAG_SET)
565*0Sstevel@tonic-gate 
566*0Sstevel@tonic-gate /*
567*0Sstevel@tonic-gate  * Stop probing an interface.  Called when an interface is offlined.
568*0Sstevel@tonic-gate  * The probe socket is closed on each interface instance, and the
569*0Sstevel@tonic-gate  * interface state set to PI_OFFLINE.
570*0Sstevel@tonic-gate  */
571*0Sstevel@tonic-gate static void
572*0Sstevel@tonic-gate stop_probing(struct phyint *pi)
573*0Sstevel@tonic-gate {
574*0Sstevel@tonic-gate 	struct phyint_instance *pii;
575*0Sstevel@tonic-gate 
576*0Sstevel@tonic-gate 	pii = pi->pi_v4;
577*0Sstevel@tonic-gate 	if (pii != NULL) {
578*0Sstevel@tonic-gate 		if (pii->pii_probe_sock != -1)
579*0Sstevel@tonic-gate 			close_probe_socket(pii, _B_TRUE);
580*0Sstevel@tonic-gate 		pii->pii_probe_logint = NULL;
581*0Sstevel@tonic-gate 	}
582*0Sstevel@tonic-gate 
583*0Sstevel@tonic-gate 	pii = pi->pi_v6;
584*0Sstevel@tonic-gate 	if (pii != NULL) {
585*0Sstevel@tonic-gate 		if (pii->pii_probe_sock != -1)
586*0Sstevel@tonic-gate 			close_probe_socket(pii, _B_TRUE);
587*0Sstevel@tonic-gate 		pii->pii_probe_logint = NULL;
588*0Sstevel@tonic-gate 	}
589*0Sstevel@tonic-gate 
590*0Sstevel@tonic-gate 	phyint_chstate(pi, PI_OFFLINE);
591*0Sstevel@tonic-gate }
592*0Sstevel@tonic-gate 
593*0Sstevel@tonic-gate /*
594*0Sstevel@tonic-gate  * Do the test address selection for each phyint instance. Pick an
595*0Sstevel@tonic-gate  * IFF_NOFAILOVER address as test address. For singleton case,
596*0Sstevel@tonic-gate  * if user didn't configure an IFF_NOFAILOVER address, we will pick a
597*0Sstevel@tonic-gate  * normal address as test address. For (multiple adapter) groups,
598*0Sstevel@tonic-gate  * user is required to configure IFF_NOFAILOVER test address. Call
599*0Sstevel@tonic-gate  * phyint_inst_sockinit() to complete the initializations.
600*0Sstevel@tonic-gate  */
601*0Sstevel@tonic-gate static void
602*0Sstevel@tonic-gate select_test_ifs(void)
603*0Sstevel@tonic-gate {
604*0Sstevel@tonic-gate 	struct phyint		*pi;
605*0Sstevel@tonic-gate 	struct phyint_instance	*pii;
606*0Sstevel@tonic-gate 	struct phyint_instance	*next_pii;
607*0Sstevel@tonic-gate 	struct logint	*li;
608*0Sstevel@tonic-gate 	struct logint	*test_logint;
609*0Sstevel@tonic-gate 	boolean_t target_scan_reqd = _B_FALSE;
610*0Sstevel@tonic-gate 	struct target *tg;
611*0Sstevel@tonic-gate 
612*0Sstevel@tonic-gate 	if (debug & D_PHYINT)
613*0Sstevel@tonic-gate 		logdebug("select_test_ifs\n");
614*0Sstevel@tonic-gate 
615*0Sstevel@tonic-gate 	/*
616*0Sstevel@tonic-gate 	 * For each phyint instance, do the test address selection
617*0Sstevel@tonic-gate 	 */
618*0Sstevel@tonic-gate 	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
619*0Sstevel@tonic-gate 		next_pii = pii->pii_next;
620*0Sstevel@tonic-gate 		/*
621*0Sstevel@tonic-gate 		 * An interface that is offline, should not be probed.
622*0Sstevel@tonic-gate 		 * Offline interfaces should always in PI_OFFLINE state,
623*0Sstevel@tonic-gate 		 * unless some other entity has set the offline flag.
624*0Sstevel@tonic-gate 		 */
625*0Sstevel@tonic-gate 		if (pii->pii_phyint->pi_flags & IFF_OFFLINE) {
626*0Sstevel@tonic-gate 			if (pii->pii_phyint->pi_state != PI_OFFLINE) {
627*0Sstevel@tonic-gate 				logerr("shouldn't be probing offline"
628*0Sstevel@tonic-gate 					" interface %s (state is: %u)."
629*0Sstevel@tonic-gate 					" Stopping probes.\n",
630*0Sstevel@tonic-gate 					pii->pii_phyint->pi_name,
631*0Sstevel@tonic-gate 					pii->pii_phyint->pi_state);
632*0Sstevel@tonic-gate 				stop_probing(pii->pii_phyint);
633*0Sstevel@tonic-gate 			}
634*0Sstevel@tonic-gate 			continue;
635*0Sstevel@tonic-gate 		}
636*0Sstevel@tonic-gate 
637*0Sstevel@tonic-gate 		test_logint = pii->pii_probe_logint;
638*0Sstevel@tonic-gate 
639*0Sstevel@tonic-gate 		if (test_logint != NULL) {
640*0Sstevel@tonic-gate 			if ((test_logint->li_flags & TEST_BEST_FLAG_SET)
641*0Sstevel@tonic-gate 			    == BEST_FLAG_SET)
642*0Sstevel@tonic-gate 				continue;
643*0Sstevel@tonic-gate 
644*0Sstevel@tonic-gate 			/*
645*0Sstevel@tonic-gate 			 * If user configures IFF_NOXMIT or IFF_NOLOCAL
646*0Sstevel@tonic-gate 			 * flags on test addresses after in.mpathd has
647*0Sstevel@tonic-gate 			 * has started, the daemon aborts. In future
648*0Sstevel@tonic-gate 			 * this can be better handling, i.e. instead
649*0Sstevel@tonic-gate 			 * of abort the daemon, a more appropriate
650*0Sstevel@tonic-gate 			 * action may be issuing a warning and choose
651*0Sstevel@tonic-gate 			 * a different test address.
652*0Sstevel@tonic-gate 			 */
653*0Sstevel@tonic-gate 			assert((test_logint->li_flags & TEST_CLEAR_FLAG_SET)
654*0Sstevel@tonic-gate 			    == 0);
655*0Sstevel@tonic-gate 		}
656*0Sstevel@tonic-gate 
657*0Sstevel@tonic-gate 		/*
658*0Sstevel@tonic-gate 		 * Walk the logints of this phyint instance, and select
659*0Sstevel@tonic-gate 		 * the best available test address
660*0Sstevel@tonic-gate 		 */
661*0Sstevel@tonic-gate 		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
662*0Sstevel@tonic-gate 			/*
663*0Sstevel@tonic-gate 			 * Skip any IPv6 logints that are not link-local,
664*0Sstevel@tonic-gate 			 * since we should always have a link-local address
665*0Sstevel@tonic-gate 			 * anyway and in6_data() expects link-local replies.
666*0Sstevel@tonic-gate 			 */
667*0Sstevel@tonic-gate 			if (pii->pii_af == AF_INET6 &&
668*0Sstevel@tonic-gate 			    !IN6_IS_ADDR_LINKLOCAL(&li->li_addr))
669*0Sstevel@tonic-gate 				continue;
670*0Sstevel@tonic-gate 
671*0Sstevel@tonic-gate 			if ((li->li_flags & TEST_MINIMAL_FLAG_SET) == IFF_UP) {
672*0Sstevel@tonic-gate 				/*
673*0Sstevel@tonic-gate 				 * Now we have a testaddress, that satisfies
674*0Sstevel@tonic-gate 				 * the minimal properties.
675*0Sstevel@tonic-gate 				 */
676*0Sstevel@tonic-gate 				if ((li->li_flags & TEST_BEST_FLAG_SET)
677*0Sstevel@tonic-gate 				    == BEST_FLAG_SET) {
678*0Sstevel@tonic-gate 					/*
679*0Sstevel@tonic-gate 					 * This is the best possible address.
680*0Sstevel@tonic-gate 					 * So break, and continue to the
681*0Sstevel@tonic-gate 					 * next phyint
682*0Sstevel@tonic-gate 					 */
683*0Sstevel@tonic-gate 					test_logint = li;
684*0Sstevel@tonic-gate 					break;
685*0Sstevel@tonic-gate 				}
686*0Sstevel@tonic-gate 				if ((test_logint == NULL) ||
687*0Sstevel@tonic-gate 				    (!(test_logint->li_flags &
688*0Sstevel@tonic-gate 				    IFF_NOFAILOVER) &&
689*0Sstevel@tonic-gate 				    (li->li_flags & IFF_NOFAILOVER)))
690*0Sstevel@tonic-gate 					/*
691*0Sstevel@tonic-gate 					 * This is a possible candidate,
692*0Sstevel@tonic-gate 					 * unless we find a better one.
693*0Sstevel@tonic-gate 					 */
694*0Sstevel@tonic-gate 					test_logint = li;
695*0Sstevel@tonic-gate 			}
696*0Sstevel@tonic-gate 		}
697*0Sstevel@tonic-gate 
698*0Sstevel@tonic-gate 		/*
699*0Sstevel@tonic-gate 		 * If we've gone from a singleton group to a multiple adapter
700*0Sstevel@tonic-gate 		 * group, and we haven't found an IFF_NOFAILOVER test address
701*0Sstevel@tonic-gate 		 * by now, the old test address is no longer valid. If we are
702*0Sstevel@tonic-gate 		 * not dealing with a singleton group, and the above test
703*0Sstevel@tonic-gate 		 * address selection loop has selected a non IFF_NOFAILOVER
704*0Sstevel@tonic-gate 		 * address as a candidate, we will correct that here.
705*0Sstevel@tonic-gate 		 */
706*0Sstevel@tonic-gate 		if ((test_logint != NULL) &&
707*0Sstevel@tonic-gate 		    !SINGLETON_GROUP(pii->pii_phyint) &&
708*0Sstevel@tonic-gate 		    !(test_logint->li_flags & IFF_NOFAILOVER)) {
709*0Sstevel@tonic-gate 			test_logint = NULL;
710*0Sstevel@tonic-gate 			if (pii->pii_probe_sock != -1)
711*0Sstevel@tonic-gate 				close_probe_socket(pii, _B_TRUE);
712*0Sstevel@tonic-gate 			pii->pii_probe_logint = NULL;
713*0Sstevel@tonic-gate 		}
714*0Sstevel@tonic-gate 
715*0Sstevel@tonic-gate 		if (test_logint == NULL) {
716*0Sstevel@tonic-gate 			/*
717*0Sstevel@tonic-gate 			 * We don't have a test address. Don't print an
718*0Sstevel@tonic-gate 			 * error message immediately. check_config() will
719*0Sstevel@tonic-gate 			 * take care of it. Zero out the probe stats array
720*0Sstevel@tonic-gate 			 * since it is no longer relevant. Optimize by
721*0Sstevel@tonic-gate 			 * checking if it is already zeroed out.
722*0Sstevel@tonic-gate 			 */
723*0Sstevel@tonic-gate 			int pr_ndx;
724*0Sstevel@tonic-gate 
725*0Sstevel@tonic-gate 			pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
726*0Sstevel@tonic-gate 			if (pii->pii_probes[pr_ndx].pr_status != PR_UNUSED) {
727*0Sstevel@tonic-gate 				clear_pii_probe_stats(pii);
728*0Sstevel@tonic-gate 				reset_crtt_all(pii->pii_phyint);
729*0Sstevel@tonic-gate 			}
730*0Sstevel@tonic-gate 			continue;
731*0Sstevel@tonic-gate 		} else if (test_logint == pii->pii_probe_logint) {
732*0Sstevel@tonic-gate 			/*
733*0Sstevel@tonic-gate 			 * If we didn't find any new test addr, go to the
734*0Sstevel@tonic-gate 			 * next phyint.
735*0Sstevel@tonic-gate 			 */
736*0Sstevel@tonic-gate 			continue;
737*0Sstevel@tonic-gate 		}
738*0Sstevel@tonic-gate 
739*0Sstevel@tonic-gate 		/*
740*0Sstevel@tonic-gate 		 * The phyint is either being assigned a new testaddr
741*0Sstevel@tonic-gate 		 * or is being assigned a testaddr for the 1st time.
742*0Sstevel@tonic-gate 		 * Need to initialize the phyint socket
743*0Sstevel@tonic-gate 		 */
744*0Sstevel@tonic-gate 		pii->pii_probe_logint = test_logint;
745*0Sstevel@tonic-gate 		if (!phyint_inst_sockinit(pii)) {
746*0Sstevel@tonic-gate 			if (debug & D_PHYINT) {
747*0Sstevel@tonic-gate 				logdebug("select_test_ifs: "
748*0Sstevel@tonic-gate 				    "phyint_sockinit failed\n");
749*0Sstevel@tonic-gate 			}
750*0Sstevel@tonic-gate 			phyint_inst_delete(pii);
751*0Sstevel@tonic-gate 			continue;
752*0Sstevel@tonic-gate 		}
753*0Sstevel@tonic-gate 
754*0Sstevel@tonic-gate 		/*
755*0Sstevel@tonic-gate 		 * This phyint instance is now enabled for probes; this
756*0Sstevel@tonic-gate 		 * impacts our state machine in two ways:
757*0Sstevel@tonic-gate 		 *
758*0Sstevel@tonic-gate 		 * 1. If we're probe *capable* as well (i.e., we have
759*0Sstevel@tonic-gate 		 *    probe targets) and the interface is in PI_NOTARGETS,
760*0Sstevel@tonic-gate 		 *    then transition to PI_RUNNING.
761*0Sstevel@tonic-gate 		 *
762*0Sstevel@tonic-gate 		 * 2. If we're not probe capable, and the other phyint
763*0Sstevel@tonic-gate 		 *    instance is also not probe capable, and we were in
764*0Sstevel@tonic-gate 		 *    PI_RUNNING, then transition to PI_NOTARGETS.
765*0Sstevel@tonic-gate 		 *
766*0Sstevel@tonic-gate 		 * Also see the state diagram in mpd_probe.c.
767*0Sstevel@tonic-gate 		 */
768*0Sstevel@tonic-gate 		if (PROBE_CAPABLE(pii)) {
769*0Sstevel@tonic-gate 			if (pii->pii_phyint->pi_state == PI_NOTARGETS)
770*0Sstevel@tonic-gate 				phyint_chstate(pii->pii_phyint, PI_RUNNING);
771*0Sstevel@tonic-gate 		} else if (!PROBE_CAPABLE(phyint_inst_other(pii))) {
772*0Sstevel@tonic-gate 			if (pii->pii_phyint->pi_state == PI_RUNNING)
773*0Sstevel@tonic-gate 				phyint_chstate(pii->pii_phyint, PI_NOTARGETS);
774*0Sstevel@tonic-gate 		}
775*0Sstevel@tonic-gate 
776*0Sstevel@tonic-gate 		if (pii->pii_phyint->pi_flags & IFF_POINTOPOINT) {
777*0Sstevel@tonic-gate 			tg = pii->pii_targets;
778*0Sstevel@tonic-gate 			if (tg != NULL)
779*0Sstevel@tonic-gate 				target_delete(tg);
780*0Sstevel@tonic-gate 			assert(pii->pii_targets == NULL);
781*0Sstevel@tonic-gate 			assert(pii->pii_target_next == NULL);
782*0Sstevel@tonic-gate 			assert(pii->pii_ntargets == 0);
783*0Sstevel@tonic-gate 			target_create(pii, test_logint->li_dstaddr,
784*0Sstevel@tonic-gate 			    _B_TRUE);
785*0Sstevel@tonic-gate 		}
786*0Sstevel@tonic-gate 
787*0Sstevel@tonic-gate 		/*
788*0Sstevel@tonic-gate 		 * If no targets are currently known for this phyint
789*0Sstevel@tonic-gate 		 * we need to call init_router_targets. Since
790*0Sstevel@tonic-gate 		 * init_router_targets() initializes the list of targets
791*0Sstevel@tonic-gate 		 * for all phyints it is done below the loop.
792*0Sstevel@tonic-gate 		 */
793*0Sstevel@tonic-gate 		if (pii->pii_targets == NULL)
794*0Sstevel@tonic-gate 			target_scan_reqd = _B_TRUE;
795*0Sstevel@tonic-gate 
796*0Sstevel@tonic-gate 		/*
797*0Sstevel@tonic-gate 		 * Start the probe timer for this instance.
798*0Sstevel@tonic-gate 		 */
799*0Sstevel@tonic-gate 		if (!pii->pii_basetime_inited && pii->pii_probe_sock != -1) {
800*0Sstevel@tonic-gate 			start_timer(pii);
801*0Sstevel@tonic-gate 			pii->pii_basetime_inited = 1;
802*0Sstevel@tonic-gate 		}
803*0Sstevel@tonic-gate 	}
804*0Sstevel@tonic-gate 
805*0Sstevel@tonic-gate 	/*
806*0Sstevel@tonic-gate 	 * Check the interface list for any interfaces that are marked
807*0Sstevel@tonic-gate 	 * PI_FAILED but no longer enabled to send probes, and call
808*0Sstevel@tonic-gate 	 * phyint_check_for_repair() to see if the link now indicates that the
809*0Sstevel@tonic-gate 	 * interface should be repaired.  Also see the state diagram in
810*0Sstevel@tonic-gate 	 * mpd_probe.c.
811*0Sstevel@tonic-gate 	 */
812*0Sstevel@tonic-gate 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
813*0Sstevel@tonic-gate 		if (pi->pi_state == PI_FAILED &&
814*0Sstevel@tonic-gate 		    !PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) {
815*0Sstevel@tonic-gate 			phyint_check_for_repair(pi);
816*0Sstevel@tonic-gate 		}
817*0Sstevel@tonic-gate 	}
818*0Sstevel@tonic-gate 
819*0Sstevel@tonic-gate 	/*
820*0Sstevel@tonic-gate 	 * Try to populate the target list. init_router_targets populates
821*0Sstevel@tonic-gate 	 * the target list from the routing table. If our target list is
822*0Sstevel@tonic-gate 	 * still empty, init_host_targets adds host targets based on the
823*0Sstevel@tonic-gate 	 * host target list of other phyints in the group.
824*0Sstevel@tonic-gate 	 */
825*0Sstevel@tonic-gate 	if (target_scan_reqd) {
826*0Sstevel@tonic-gate 		init_router_targets();
827*0Sstevel@tonic-gate 		init_host_targets();
828*0Sstevel@tonic-gate 	}
829*0Sstevel@tonic-gate }
830*0Sstevel@tonic-gate 
831*0Sstevel@tonic-gate /*
832*0Sstevel@tonic-gate  * Check phyint group configuration, to detect any inconsistencies,
833*0Sstevel@tonic-gate  * and log an error message. This is called from runtimeouts every
834*0Sstevel@tonic-gate  * 20 secs. But the error message is displayed once. If the
835*0Sstevel@tonic-gate  * consistency is resolved by the admin, a recovery message is displayed
836*0Sstevel@tonic-gate  * once.
837*0Sstevel@tonic-gate  */
838*0Sstevel@tonic-gate static void
839*0Sstevel@tonic-gate check_config(void)
840*0Sstevel@tonic-gate {
841*0Sstevel@tonic-gate 	struct phyint_group *pg;
842*0Sstevel@tonic-gate 	struct phyint *pi;
843*0Sstevel@tonic-gate 	boolean_t v4_in_group;
844*0Sstevel@tonic-gate 	boolean_t v6_in_group;
845*0Sstevel@tonic-gate 
846*0Sstevel@tonic-gate 	/*
847*0Sstevel@tonic-gate 	 * All phyints of a group must be homogenous to ensure that
848*0Sstevel@tonic-gate 	 * failover or failback can be done. If any phyint in a group
849*0Sstevel@tonic-gate 	 * has IPv4 plumbed, check that all phyints have IPv4 plumbed.
850*0Sstevel@tonic-gate 	 * Do a similar check for IPv6.
851*0Sstevel@tonic-gate 	 */
852*0Sstevel@tonic-gate 	for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
853*0Sstevel@tonic-gate 		if (pg == phyint_anongroup)
854*0Sstevel@tonic-gate 			continue;
855*0Sstevel@tonic-gate 
856*0Sstevel@tonic-gate 		v4_in_group = _B_FALSE;
857*0Sstevel@tonic-gate 		v6_in_group = _B_FALSE;
858*0Sstevel@tonic-gate 		/*
859*0Sstevel@tonic-gate 		 * 1st pass. Determine if at least 1 phyint in the group
860*0Sstevel@tonic-gate 		 * has IPv4 plumbed and if so set v4_in_group to true.
861*0Sstevel@tonic-gate 		 * Repeat similarly for IPv6.
862*0Sstevel@tonic-gate 		 */
863*0Sstevel@tonic-gate 		for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
864*0Sstevel@tonic-gate 			if (pi->pi_v4 != NULL)
865*0Sstevel@tonic-gate 				v4_in_group = _B_TRUE;
866*0Sstevel@tonic-gate 			if (pi->pi_v6 != NULL)
867*0Sstevel@tonic-gate 				v6_in_group = _B_TRUE;
868*0Sstevel@tonic-gate 		}
869*0Sstevel@tonic-gate 
870*0Sstevel@tonic-gate 		/*
871*0Sstevel@tonic-gate 		 * 2nd pass. If v4_in_group is true, check that phyint
872*0Sstevel@tonic-gate 		 * has IPv4 plumbed. Repeat similarly for IPv6. Print
873*0Sstevel@tonic-gate 		 * out a message the 1st time only.
874*0Sstevel@tonic-gate 		 */
875*0Sstevel@tonic-gate 		for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
876*0Sstevel@tonic-gate 			if (pi->pi_flags & IFF_OFFLINE)
877*0Sstevel@tonic-gate 				continue;
878*0Sstevel@tonic-gate 
879*0Sstevel@tonic-gate 			if (v4_in_group == _B_TRUE && pi->pi_v4 == NULL) {
880*0Sstevel@tonic-gate 				if (!pi->pi_cfgmsg_printed) {
881*0Sstevel@tonic-gate 					logerr("NIC %s of group %s is"
882*0Sstevel@tonic-gate 					    " not plumbed for IPv4 and may"
883*0Sstevel@tonic-gate 					    " affect failover capability\n",
884*0Sstevel@tonic-gate 					    pi->pi_name,
885*0Sstevel@tonic-gate 					    pi->pi_group->pg_name);
886*0Sstevel@tonic-gate 					pi->pi_cfgmsg_printed = 1;
887*0Sstevel@tonic-gate 				}
888*0Sstevel@tonic-gate 			} else if (v6_in_group == _B_TRUE &&
889*0Sstevel@tonic-gate 			    pi->pi_v6 == NULL) {
890*0Sstevel@tonic-gate 				if (!pi->pi_cfgmsg_printed) {
891*0Sstevel@tonic-gate 					logerr("NIC %s of group %s is"
892*0Sstevel@tonic-gate 					    " not plumbed for IPv6 and may"
893*0Sstevel@tonic-gate 					    " affect failover capability\n",
894*0Sstevel@tonic-gate 					    pi->pi_name,
895*0Sstevel@tonic-gate 					    pi->pi_group->pg_name);
896*0Sstevel@tonic-gate 					pi->pi_cfgmsg_printed = 1;
897*0Sstevel@tonic-gate 				}
898*0Sstevel@tonic-gate 			} else {
899*0Sstevel@tonic-gate 				/*
900*0Sstevel@tonic-gate 				 * The phyint matches the group configuration,
901*0Sstevel@tonic-gate 				 * if we have reached this point. If it was
902*0Sstevel@tonic-gate 				 * improperly configured earlier, log an
903*0Sstevel@tonic-gate 				 * error recovery message
904*0Sstevel@tonic-gate 				 */
905*0Sstevel@tonic-gate 				if (pi->pi_cfgmsg_printed) {
906*0Sstevel@tonic-gate 					logerr("NIC %s is now consistent with "
907*0Sstevel@tonic-gate 					    "group %s and failover capability "
908*0Sstevel@tonic-gate 					    "is restored\n", pi->pi_name,
909*0Sstevel@tonic-gate 					    pi->pi_group->pg_name);
910*0Sstevel@tonic-gate 					pi->pi_cfgmsg_printed = 0;
911*0Sstevel@tonic-gate 				}
912*0Sstevel@tonic-gate 			}
913*0Sstevel@tonic-gate 
914*0Sstevel@tonic-gate 		}
915*0Sstevel@tonic-gate 	}
916*0Sstevel@tonic-gate 
917*0Sstevel@tonic-gate 	/*
918*0Sstevel@tonic-gate 	 * In order to perform probe-based failure detection, a phyint must
919*0Sstevel@tonic-gate 	 * have at least 1 test/probe address for sending and receiving probes
920*0Sstevel@tonic-gate 	 * (either on IPv4 or IPv6 instance or both).  If no test address has
921*0Sstevel@tonic-gate 	 * been configured, notify the administrator, but continue on since we
922*0Sstevel@tonic-gate 	 * can still perform load spreading, along with "link up/down" based
923*0Sstevel@tonic-gate 	 * failure detection.
924*0Sstevel@tonic-gate 	 *
925*0Sstevel@tonic-gate 	 * Note: In the singleton group case, when user didn't configure
926*0Sstevel@tonic-gate 	 * a test address, the probe address is picked by this daemon.
927*0Sstevel@tonic-gate 	 */
928*0Sstevel@tonic-gate 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
929*0Sstevel@tonic-gate 		if (pi->pi_flags & IFF_OFFLINE)
930*0Sstevel@tonic-gate 			continue;
931*0Sstevel@tonic-gate 
932*0Sstevel@tonic-gate 		if ((pi->pi_v4 == NULL ||
933*0Sstevel@tonic-gate 		    pi->pi_v4->pii_probe_logint == NULL) &&
934*0Sstevel@tonic-gate 		    (pi->pi_v6 == NULL ||
935*0Sstevel@tonic-gate 		    pi->pi_v6->pii_probe_logint == NULL)) {
936*0Sstevel@tonic-gate 			if (!pi->pi_taddrmsg_printed) {
937*0Sstevel@tonic-gate 				logerr("No test address configured on "
938*0Sstevel@tonic-gate 				    "interface %s; disabling probe-based "
939*0Sstevel@tonic-gate 				    "failure detection on it\n", pi->pi_name);
940*0Sstevel@tonic-gate 				pi->pi_taddrmsg_printed = 1;
941*0Sstevel@tonic-gate 			}
942*0Sstevel@tonic-gate 		} else if (pi->pi_taddrmsg_printed) {
943*0Sstevel@tonic-gate 			logerr("Test address now configured on interface %s; "
944*0Sstevel@tonic-gate 			    "enabling probe-based failure detection on it\n",
945*0Sstevel@tonic-gate 			    pi->pi_name);
946*0Sstevel@tonic-gate 			pi->pi_taddrmsg_printed = 0;
947*0Sstevel@tonic-gate 		}
948*0Sstevel@tonic-gate 
949*0Sstevel@tonic-gate 	}
950*0Sstevel@tonic-gate }
951*0Sstevel@tonic-gate 
952*0Sstevel@tonic-gate /*
953*0Sstevel@tonic-gate  * Timer mechanism using relative time (in milliseconds) from the
954*0Sstevel@tonic-gate  * previous timer event. Timers exceeding TIMER_INFINITY milliseconds
955*0Sstevel@tonic-gate  * will fire after TIMER_INFINITY milliseconds.
956*0Sstevel@tonic-gate  * Unsigned arithmetic note: We assume a 32-bit circular sequence space for
957*0Sstevel@tonic-gate  * time values. Hence 2 consecutive timer events cannot be spaced farther
958*0Sstevel@tonic-gate  * than 0x7fffffff. We call this TIMER_INFINITY, and it is the maximum value
959*0Sstevel@tonic-gate  * that can be passed for the delay parameter of timer_schedule()
960*0Sstevel@tonic-gate  */
961*0Sstevel@tonic-gate static uint_t timer_next;	/* Currently scheduled timeout */
962*0Sstevel@tonic-gate static boolean_t timer_active = _B_FALSE; /* SIGALRM has not yet occurred */
963*0Sstevel@tonic-gate 
964*0Sstevel@tonic-gate static void
965*0Sstevel@tonic-gate timer_init(void)
966*0Sstevel@tonic-gate {
967*0Sstevel@tonic-gate 	timer_next = getcurrenttime() + TIMER_INFINITY;
968*0Sstevel@tonic-gate 	/*
969*0Sstevel@tonic-gate 	 * The call to run_timeouts() will get the timer started
970*0Sstevel@tonic-gate 	 * Since there are no phyints at this point, the timer will
971*0Sstevel@tonic-gate 	 * be set for IF_SCAN_INTERVAL ms.
972*0Sstevel@tonic-gate 	 */
973*0Sstevel@tonic-gate 	run_timeouts();
974*0Sstevel@tonic-gate }
975*0Sstevel@tonic-gate 
976*0Sstevel@tonic-gate /*
977*0Sstevel@tonic-gate  * Make sure the next SIGALRM occurs delay milliseconds from the current
978*0Sstevel@tonic-gate  * time if not earlier. We are interested only in time differences.
979*0Sstevel@tonic-gate  */
980*0Sstevel@tonic-gate void
981*0Sstevel@tonic-gate timer_schedule(uint_t delay)
982*0Sstevel@tonic-gate {
983*0Sstevel@tonic-gate 	uint_t now;
984*0Sstevel@tonic-gate 	struct itimerval itimerval;
985*0Sstevel@tonic-gate 
986*0Sstevel@tonic-gate 	if (debug & D_TIMER)
987*0Sstevel@tonic-gate 		logdebug("timer_schedule(%u)\n", delay);
988*0Sstevel@tonic-gate 
989*0Sstevel@tonic-gate 	assert(delay <= TIMER_INFINITY);
990*0Sstevel@tonic-gate 
991*0Sstevel@tonic-gate 	now = getcurrenttime();
992*0Sstevel@tonic-gate 	if (delay == 0) {
993*0Sstevel@tonic-gate 		/* Minimum allowed delay */
994*0Sstevel@tonic-gate 		delay = 1;
995*0Sstevel@tonic-gate 	}
996*0Sstevel@tonic-gate 	/* Will this timer occur before the currently scheduled SIGALRM? */
997*0Sstevel@tonic-gate 	if (timer_active && TIME_GE(now + delay, timer_next)) {
998*0Sstevel@tonic-gate 		if (debug & D_TIMER) {
999*0Sstevel@tonic-gate 			logdebug("timer_schedule(%u) - no action: "
1000*0Sstevel@tonic-gate 			    "now %u next %u\n", delay, now, timer_next);
1001*0Sstevel@tonic-gate 		}
1002*0Sstevel@tonic-gate 		return;
1003*0Sstevel@tonic-gate 	}
1004*0Sstevel@tonic-gate 	timer_next = now + delay;
1005*0Sstevel@tonic-gate 
1006*0Sstevel@tonic-gate 	itimerval.it_value.tv_sec = delay / 1000;
1007*0Sstevel@tonic-gate 	itimerval.it_value.tv_usec = (delay % 1000) * 1000;
1008*0Sstevel@tonic-gate 	itimerval.it_interval.tv_sec = 0;
1009*0Sstevel@tonic-gate 	itimerval.it_interval.tv_usec = 0;
1010*0Sstevel@tonic-gate 	if (debug & D_TIMER) {
1011*0Sstevel@tonic-gate 		logdebug("timer_schedule(%u): sec %ld usec %ld\n",
1012*0Sstevel@tonic-gate 		    delay, itimerval.it_value.tv_sec,
1013*0Sstevel@tonic-gate 		    itimerval.it_value.tv_usec);
1014*0Sstevel@tonic-gate 	}
1015*0Sstevel@tonic-gate 	timer_active = _B_TRUE;
1016*0Sstevel@tonic-gate 	if (setitimer(ITIMER_REAL, &itimerval, NULL) < 0) {
1017*0Sstevel@tonic-gate 		logperror("timer_schedule: setitimer");
1018*0Sstevel@tonic-gate 		exit(2);
1019*0Sstevel@tonic-gate 	}
1020*0Sstevel@tonic-gate }
1021*0Sstevel@tonic-gate 
1022*0Sstevel@tonic-gate /*
1023*0Sstevel@tonic-gate  * Timer has fired. Determine when the next timer event will occur by asking
1024*0Sstevel@tonic-gate  * all the timer routines. Should not be called from a timer routine.
1025*0Sstevel@tonic-gate  */
1026*0Sstevel@tonic-gate static void
1027*0Sstevel@tonic-gate run_timeouts(void)
1028*0Sstevel@tonic-gate {
1029*0Sstevel@tonic-gate 	uint_t next;
1030*0Sstevel@tonic-gate 	uint_t next_event_time;
1031*0Sstevel@tonic-gate 	struct phyint_instance *pii;
1032*0Sstevel@tonic-gate 	struct phyint_instance *next_pii;
1033*0Sstevel@tonic-gate 	static boolean_t timeout_running;
1034*0Sstevel@tonic-gate 
1035*0Sstevel@tonic-gate 	/* assert that recursive timeouts don't happen. */
1036*0Sstevel@tonic-gate 	assert(!timeout_running);
1037*0Sstevel@tonic-gate 
1038*0Sstevel@tonic-gate 	timeout_running = _B_TRUE;
1039*0Sstevel@tonic-gate 
1040*0Sstevel@tonic-gate 	if (debug & D_TIMER)
1041*0Sstevel@tonic-gate 		logdebug("run_timeouts()\n");
1042*0Sstevel@tonic-gate 
1043*0Sstevel@tonic-gate 	next = TIMER_INFINITY;
1044*0Sstevel@tonic-gate 
1045*0Sstevel@tonic-gate 	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
1046*0Sstevel@tonic-gate 		next_pii = pii->pii_next;
1047*0Sstevel@tonic-gate 		next_event_time = phyint_inst_timer(pii);
1048*0Sstevel@tonic-gate 		if (next_event_time != TIMER_INFINITY && next_event_time < next)
1049*0Sstevel@tonic-gate 			next = next_event_time;
1050*0Sstevel@tonic-gate 
1051*0Sstevel@tonic-gate 		if (debug & D_TIMER) {
1052*0Sstevel@tonic-gate 			logdebug("run_timeouts(%s %s): next scheduled for"
1053*0Sstevel@tonic-gate 			    " this phyint inst %u, next scheduled global"
1054*0Sstevel@tonic-gate 			    " %u ms\n",
1055*0Sstevel@tonic-gate 			    AF_STR(pii->pii_af), pii->pii_phyint->pi_name,
1056*0Sstevel@tonic-gate 			    next_event_time, next);
1057*0Sstevel@tonic-gate 		}
1058*0Sstevel@tonic-gate 	}
1059*0Sstevel@tonic-gate 
1060*0Sstevel@tonic-gate 	/*
1061*0Sstevel@tonic-gate 	 * Make sure initifs() is called at least once every
1062*0Sstevel@tonic-gate 	 * IF_SCAN_INTERVAL, to make sure that we are in sync
1063*0Sstevel@tonic-gate 	 * with the kernel, in case we have missed any routing
1064*0Sstevel@tonic-gate 	 * socket messages.
1065*0Sstevel@tonic-gate 	 */
1066*0Sstevel@tonic-gate 	if (next > IF_SCAN_INTERVAL)
1067*0Sstevel@tonic-gate 		next = IF_SCAN_INTERVAL;
1068*0Sstevel@tonic-gate 
1069*0Sstevel@tonic-gate 	if ((getcurrenttime() - last_initifs_time) > IF_SCAN_INTERVAL) {
1070*0Sstevel@tonic-gate 		initifs();
1071*0Sstevel@tonic-gate 		check_config();
1072*0Sstevel@tonic-gate 	}
1073*0Sstevel@tonic-gate 
1074*0Sstevel@tonic-gate 	if (debug & D_TIMER)
1075*0Sstevel@tonic-gate 		logdebug("run_timeouts: %u ms\n", next);
1076*0Sstevel@tonic-gate 
1077*0Sstevel@tonic-gate 	timer_schedule(next);
1078*0Sstevel@tonic-gate 	timeout_running = _B_FALSE;
1079*0Sstevel@tonic-gate }
1080*0Sstevel@tonic-gate 
1081*0Sstevel@tonic-gate static int eventpipe_read = -1;	/* Used for synchronous signal delivery */
1082*0Sstevel@tonic-gate static int eventpipe_write = -1;
1083*0Sstevel@tonic-gate static boolean_t cleanup_started = _B_FALSE;
1084*0Sstevel@tonic-gate 				/* Don't write to eventpipe if in cleanup */
1085*0Sstevel@tonic-gate /*
1086*0Sstevel@tonic-gate  * Ensure that signals are processed synchronously with the rest of
1087*0Sstevel@tonic-gate  * the code by just writing a one character signal number on the pipe.
1088*0Sstevel@tonic-gate  * The poll loop will pick this up and process the signal event.
1089*0Sstevel@tonic-gate  */
1090*0Sstevel@tonic-gate static void
1091*0Sstevel@tonic-gate sig_handler(int signo)
1092*0Sstevel@tonic-gate {
1093*0Sstevel@tonic-gate 	uchar_t buf = (uchar_t)signo;
1094*0Sstevel@tonic-gate 
1095*0Sstevel@tonic-gate 	/*
1096*0Sstevel@tonic-gate 	 * Don't write to pipe if cleanup has already begun. cleanup()
1097*0Sstevel@tonic-gate 	 * might have closed the pipe already
1098*0Sstevel@tonic-gate 	 */
1099*0Sstevel@tonic-gate 	if (cleanup_started)
1100*0Sstevel@tonic-gate 		return;
1101*0Sstevel@tonic-gate 
1102*0Sstevel@tonic-gate 	if (eventpipe_write == -1) {
1103*0Sstevel@tonic-gate 		logerr("sig_handler: no pipe found\n");
1104*0Sstevel@tonic-gate 		return;
1105*0Sstevel@tonic-gate 	}
1106*0Sstevel@tonic-gate 	if (write(eventpipe_write, &buf, sizeof (buf)) < 0)
1107*0Sstevel@tonic-gate 		logperror("sig_handler: write");
1108*0Sstevel@tonic-gate }
1109*0Sstevel@tonic-gate 
1110*0Sstevel@tonic-gate extern struct probes_missed probes_missed;
1111*0Sstevel@tonic-gate 
1112*0Sstevel@tonic-gate /*
1113*0Sstevel@tonic-gate  * Pick up a signal "byte" from the pipe and process it.
1114*0Sstevel@tonic-gate  */
1115*0Sstevel@tonic-gate static void
1116*0Sstevel@tonic-gate in_signal(int fd)
1117*0Sstevel@tonic-gate {
1118*0Sstevel@tonic-gate 	uchar_t buf;
1119*0Sstevel@tonic-gate 	uint64_t  sent, acked, lost, unacked, unknown;
1120*0Sstevel@tonic-gate 	struct phyint_instance *pii;
1121*0Sstevel@tonic-gate 	int pr_ndx;
1122*0Sstevel@tonic-gate 
1123*0Sstevel@tonic-gate 	switch (read(fd, &buf, sizeof (buf))) {
1124*0Sstevel@tonic-gate 	case -1:
1125*0Sstevel@tonic-gate 		logperror("in_signal: read");
1126*0Sstevel@tonic-gate 		exit(1);
1127*0Sstevel@tonic-gate 		/* NOTREACHED */
1128*0Sstevel@tonic-gate 	case 1:
1129*0Sstevel@tonic-gate 		break;
1130*0Sstevel@tonic-gate 	case 0:
1131*0Sstevel@tonic-gate 		logerr("in_signal: read end of file\n");
1132*0Sstevel@tonic-gate 		exit(1);
1133*0Sstevel@tonic-gate 		/* NOTREACHED */
1134*0Sstevel@tonic-gate 	default:
1135*0Sstevel@tonic-gate 		logerr("in_signal: read > 1\n");
1136*0Sstevel@tonic-gate 		exit(1);
1137*0Sstevel@tonic-gate 	}
1138*0Sstevel@tonic-gate 
1139*0Sstevel@tonic-gate 	if (debug & D_TIMER)
1140*0Sstevel@tonic-gate 		logdebug("in_signal() got %d\n", buf);
1141*0Sstevel@tonic-gate 
1142*0Sstevel@tonic-gate 	switch (buf) {
1143*0Sstevel@tonic-gate 	case SIGALRM:
1144*0Sstevel@tonic-gate 		if (debug & D_TIMER) {
1145*0Sstevel@tonic-gate 			uint_t now = getcurrenttime();
1146*0Sstevel@tonic-gate 
1147*0Sstevel@tonic-gate 			logdebug("in_signal(SIGALRM) delta %u\n",
1148*0Sstevel@tonic-gate 			    now - timer_next);
1149*0Sstevel@tonic-gate 		}
1150*0Sstevel@tonic-gate 		timer_active = _B_FALSE;
1151*0Sstevel@tonic-gate 		run_timeouts();
1152*0Sstevel@tonic-gate 		break;
1153*0Sstevel@tonic-gate 	case SIGUSR1:
1154*0Sstevel@tonic-gate 		logdebug("Printing configuration:\n");
1155*0Sstevel@tonic-gate 		/* Print out the internal tables */
1156*0Sstevel@tonic-gate 		phyint_inst_print_all();
1157*0Sstevel@tonic-gate 
1158*0Sstevel@tonic-gate 		/*
1159*0Sstevel@tonic-gate 		 * Print out the accumulated statistics about missed
1160*0Sstevel@tonic-gate 		 * probes (happens due to scheduling delay).
1161*0Sstevel@tonic-gate 		 */
1162*0Sstevel@tonic-gate 		logerr("Missed sending total of %d probes spread over"
1163*0Sstevel@tonic-gate 		    " %d occurrences\n", probes_missed.pm_nprobes,
1164*0Sstevel@tonic-gate 		    probes_missed.pm_ntimes);
1165*0Sstevel@tonic-gate 
1166*0Sstevel@tonic-gate 		/*
1167*0Sstevel@tonic-gate 		 * Print out the accumulated statistics about probes
1168*0Sstevel@tonic-gate 		 * that were sent.
1169*0Sstevel@tonic-gate 		 */
1170*0Sstevel@tonic-gate 		for (pii = phyint_instances; pii != NULL;
1171*0Sstevel@tonic-gate 		    pii = pii->pii_next) {
1172*0Sstevel@tonic-gate 			unacked = 0;
1173*0Sstevel@tonic-gate 			acked = pii->pii_cum_stats.acked;
1174*0Sstevel@tonic-gate 			lost = pii->pii_cum_stats.lost;
1175*0Sstevel@tonic-gate 			sent = pii->pii_cum_stats.sent;
1176*0Sstevel@tonic-gate 			unknown = pii->pii_cum_stats.unknown;
1177*0Sstevel@tonic-gate 			for (pr_ndx = 0; pr_ndx < PROBE_STATS_COUNT; pr_ndx++) {
1178*0Sstevel@tonic-gate 				switch (pii->pii_probes[pr_ndx].pr_status) {
1179*0Sstevel@tonic-gate 				case PR_ACKED:
1180*0Sstevel@tonic-gate 					acked++;
1181*0Sstevel@tonic-gate 					break;
1182*0Sstevel@tonic-gate 				case PR_LOST:
1183*0Sstevel@tonic-gate 					lost++;
1184*0Sstevel@tonic-gate 					break;
1185*0Sstevel@tonic-gate 				case PR_UNACKED:
1186*0Sstevel@tonic-gate 					unacked++;
1187*0Sstevel@tonic-gate 					break;
1188*0Sstevel@tonic-gate 				}
1189*0Sstevel@tonic-gate 			}
1190*0Sstevel@tonic-gate 			logerr("\nProbe stats on (%s %s)\n"
1191*0Sstevel@tonic-gate 			    "Number of probes sent %lld\n"
1192*0Sstevel@tonic-gate 			    "Number of probe acks received %lld\n"
1193*0Sstevel@tonic-gate 			    "Number of probes/acks lost %lld\n"
1194*0Sstevel@tonic-gate 			    "Number of valid unacknowled probes %lld\n"
1195*0Sstevel@tonic-gate 			    "Number of ambiguous probe acks received %lld\n",
1196*0Sstevel@tonic-gate 			    AF_STR(pii->pii_af), pii->pii_name,
1197*0Sstevel@tonic-gate 			    sent, acked, lost, unacked, unknown);
1198*0Sstevel@tonic-gate 		}
1199*0Sstevel@tonic-gate 		break;
1200*0Sstevel@tonic-gate 	case SIGHUP:
1201*0Sstevel@tonic-gate 		logerr("SIGHUP: restart and reread config file\n");
1202*0Sstevel@tonic-gate 		cleanup();
1203*0Sstevel@tonic-gate 		(void) execv(argv0[0], argv0);
1204*0Sstevel@tonic-gate 		_exit(0177);
1205*0Sstevel@tonic-gate 		/* NOTREACHED */
1206*0Sstevel@tonic-gate 	case SIGINT:
1207*0Sstevel@tonic-gate 	case SIGTERM:
1208*0Sstevel@tonic-gate 	case SIGQUIT:
1209*0Sstevel@tonic-gate 		cleanup();
1210*0Sstevel@tonic-gate 		exit(0);
1211*0Sstevel@tonic-gate 		/* NOTREACHED */
1212*0Sstevel@tonic-gate 	default:
1213*0Sstevel@tonic-gate 		logerr("in_signal: unknown signal: %d\n", buf);
1214*0Sstevel@tonic-gate 	}
1215*0Sstevel@tonic-gate }
1216*0Sstevel@tonic-gate 
1217*0Sstevel@tonic-gate static void
1218*0Sstevel@tonic-gate cleanup(void)
1219*0Sstevel@tonic-gate {
1220*0Sstevel@tonic-gate 	struct phyint_instance *pii;
1221*0Sstevel@tonic-gate 	struct phyint_instance *next_pii;
1222*0Sstevel@tonic-gate 
1223*0Sstevel@tonic-gate 	/*
1224*0Sstevel@tonic-gate 	 * Make sure that we don't write to eventpipe in
1225*0Sstevel@tonic-gate 	 * sig_handler() if any signal notably SIGALRM,
1226*0Sstevel@tonic-gate 	 * occurs after we close the eventpipe descriptor below
1227*0Sstevel@tonic-gate 	 */
1228*0Sstevel@tonic-gate 	cleanup_started = _B_TRUE;
1229*0Sstevel@tonic-gate 
1230*0Sstevel@tonic-gate 	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
1231*0Sstevel@tonic-gate 		next_pii = pii->pii_next;
1232*0Sstevel@tonic-gate 		phyint_inst_delete(pii);
1233*0Sstevel@tonic-gate 	}
1234*0Sstevel@tonic-gate 
1235*0Sstevel@tonic-gate 	(void) close(ifsock_v4);
1236*0Sstevel@tonic-gate 	(void) close(ifsock_v6);
1237*0Sstevel@tonic-gate 	(void) close(rtsock_v4);
1238*0Sstevel@tonic-gate 	(void) close(rtsock_v6);
1239*0Sstevel@tonic-gate 	(void) close(lsock_v4);
1240*0Sstevel@tonic-gate 	(void) close(lsock_v6);
1241*0Sstevel@tonic-gate 	(void) close(0);
1242*0Sstevel@tonic-gate 	(void) close(1);
1243*0Sstevel@tonic-gate 	(void) close(2);
1244*0Sstevel@tonic-gate 	(void) close(mibfd);
1245*0Sstevel@tonic-gate 	(void) close(eventpipe_read);
1246*0Sstevel@tonic-gate 	(void) close(eventpipe_write);
1247*0Sstevel@tonic-gate }
1248*0Sstevel@tonic-gate 
1249*0Sstevel@tonic-gate /*
1250*0Sstevel@tonic-gate  * Create pipe for signal delivery and set up signal handlers.
1251*0Sstevel@tonic-gate  */
1252*0Sstevel@tonic-gate static void
1253*0Sstevel@tonic-gate setup_eventpipe(void)
1254*0Sstevel@tonic-gate {
1255*0Sstevel@tonic-gate 	int fds[2];
1256*0Sstevel@tonic-gate 	struct sigaction act;
1257*0Sstevel@tonic-gate 
1258*0Sstevel@tonic-gate 	if ((pipe(fds)) < 0) {
1259*0Sstevel@tonic-gate 		logperror("setup_eventpipe: pipe");
1260*0Sstevel@tonic-gate 		exit(1);
1261*0Sstevel@tonic-gate 	}
1262*0Sstevel@tonic-gate 	eventpipe_read = fds[0];
1263*0Sstevel@tonic-gate 	eventpipe_write = fds[1];
1264*0Sstevel@tonic-gate 	if (poll_add(eventpipe_read) == -1) {
1265*0Sstevel@tonic-gate 		exit(1);
1266*0Sstevel@tonic-gate 	}
1267*0Sstevel@tonic-gate 
1268*0Sstevel@tonic-gate 	act.sa_handler = sig_handler;
1269*0Sstevel@tonic-gate 	act.sa_flags = SA_RESTART;
1270*0Sstevel@tonic-gate 	(void) sigaction(SIGALRM, &act, NULL);
1271*0Sstevel@tonic-gate 
1272*0Sstevel@tonic-gate 	(void) sigset(SIGHUP, sig_handler);
1273*0Sstevel@tonic-gate 	(void) sigset(SIGUSR1, sig_handler);
1274*0Sstevel@tonic-gate 	(void) sigset(SIGTERM, sig_handler);
1275*0Sstevel@tonic-gate 	(void) sigset(SIGINT, sig_handler);
1276*0Sstevel@tonic-gate 	(void) sigset(SIGQUIT, sig_handler);
1277*0Sstevel@tonic-gate }
1278*0Sstevel@tonic-gate 
/*
 * Create a routing socket for receiving RTM_IFINFO messages.
 *
 * 'af' is passed as the protocol argument of the PF_ROUTE socket.  The
 * socket is switched to non-blocking mode and added to the poll set.
 * Returns the new descriptor; any failure terminates the process with
 * exit status 1.
 */
static int
setup_rtsock(int af)
{
	int	rtfd;
	int	fl;

	rtfd = socket(PF_ROUTE, SOCK_RAW, af);
	if (rtfd == -1) {
		logperror("setup_rtsock: socket PF_ROUTE");
		exit(1);
	}

	fl = fcntl(rtfd, F_GETFL, 0);
	if (fl < 0) {
		logperror("setup_rtsock: fcntl F_GETFL");
		goto fail;
	}

	if (fcntl(rtfd, F_SETFL, fl | O_NONBLOCK) < 0) {
		logperror("setup_rtsock: fcntl F_SETFL");
		goto fail;
	}

	if (poll_add(rtfd) == -1)
		goto fail;

	return (rtfd);

fail:
	/* The socket exists at this point; release it before giving up. */
	(void) close(rtfd);
	exit(1);
	/* NOTREACHED */
}
1309*0Sstevel@tonic-gate 
1310*0Sstevel@tonic-gate /*
1311*0Sstevel@tonic-gate  * Process an RTM_IFINFO message received on a routing socket.
1312*0Sstevel@tonic-gate  * The return value indicates whether a full interface scan is required.
1313*0Sstevel@tonic-gate  * Link up/down notifications from the NICs are reflected in the
1314*0Sstevel@tonic-gate  * IFF_RUNNING flag.
1315*0Sstevel@tonic-gate  * If just the state of the IFF_RUNNING interface flag has changed, a
1316*0Sstevel@tonic-gate  * a full interface scan isn't required.
1317*0Sstevel@tonic-gate  */
1318*0Sstevel@tonic-gate static boolean_t
1319*0Sstevel@tonic-gate process_rtm_ifinfo(if_msghdr_t *ifm, int type)
1320*0Sstevel@tonic-gate {
1321*0Sstevel@tonic-gate 	struct sockaddr_dl *sdl;
1322*0Sstevel@tonic-gate 	struct phyint *pi;
1323*0Sstevel@tonic-gate 	uint64_t old_flags;
1324*0Sstevel@tonic-gate 	struct phyint_instance *pii;
1325*0Sstevel@tonic-gate 
1326*0Sstevel@tonic-gate 	assert(ifm->ifm_type == RTM_IFINFO && ifm->ifm_addrs == RTA_IFP);
1327*0Sstevel@tonic-gate 
1328*0Sstevel@tonic-gate 	/*
1329*0Sstevel@tonic-gate 	 * Although the sockaddr_dl structure is directly after the
1330*0Sstevel@tonic-gate 	 * if_msghdr_t structure. At the time of writing, the size of the
1331*0Sstevel@tonic-gate 	 * if_msghdr_t structure is different on 32 and 64 bit kernels, due
1332*0Sstevel@tonic-gate 	 * to the presence of a timeval structure, which contains longs,
1333*0Sstevel@tonic-gate 	 * in the if_data structure.  Anyway, we know where the message ends,
1334*0Sstevel@tonic-gate 	 * so we work backwards to get the start of the sockaddr_dl structure.
1335*0Sstevel@tonic-gate 	 */
1336*0Sstevel@tonic-gate 	/*LINTED*/
1337*0Sstevel@tonic-gate 	sdl = (struct sockaddr_dl *)((char *)ifm + ifm->ifm_msglen -
1338*0Sstevel@tonic-gate 		sizeof (struct sockaddr_dl));
1339*0Sstevel@tonic-gate 
1340*0Sstevel@tonic-gate 	assert(sdl->sdl_family == AF_LINK);
1341*0Sstevel@tonic-gate 
1342*0Sstevel@tonic-gate 	/*
1343*0Sstevel@tonic-gate 	 * The interface name is in sdl_data.
1344*0Sstevel@tonic-gate 	 * RTM_IFINFO messages are only generated for logical interface
1345*0Sstevel@tonic-gate 	 * zero, so there is no colon and logical interface number to
1346*0Sstevel@tonic-gate 	 * strip from the name.	 The name is not null terminated, but
1347*0Sstevel@tonic-gate 	 * there should be enough space in sdl_data to add the null.
1348*0Sstevel@tonic-gate 	 */
1349*0Sstevel@tonic-gate 	if (sdl->sdl_nlen >= sizeof (sdl->sdl_data)) {
1350*0Sstevel@tonic-gate 		if (debug & D_LINKNOTE)
1351*0Sstevel@tonic-gate 			logdebug("process_rtm_ifinfo: "
1352*0Sstevel@tonic-gate 				"phyint name too long\n");
1353*0Sstevel@tonic-gate 		return (_B_TRUE);
1354*0Sstevel@tonic-gate 	}
1355*0Sstevel@tonic-gate 	sdl->sdl_data[sdl->sdl_nlen] = 0;
1356*0Sstevel@tonic-gate 
1357*0Sstevel@tonic-gate 	pi = phyint_lookup(sdl->sdl_data);
1358*0Sstevel@tonic-gate 	if (pi == NULL) {
1359*0Sstevel@tonic-gate 		if (debug & D_LINKNOTE)
1360*0Sstevel@tonic-gate 			logdebug("process_rtm_ifinfo: phyint lookup failed"
1361*0Sstevel@tonic-gate 				" for %s\n", sdl->sdl_data);
1362*0Sstevel@tonic-gate 		return (_B_TRUE);
1363*0Sstevel@tonic-gate 	}
1364*0Sstevel@tonic-gate 
1365*0Sstevel@tonic-gate 	/*
1366*0Sstevel@tonic-gate 	 * We want to try and avoid doing a full interface scan for
1367*0Sstevel@tonic-gate 	 * link state notifications from the NICs, as indicated
1368*0Sstevel@tonic-gate 	 * by the state of the IFF_RUNNING flag.  If just the
1369*0Sstevel@tonic-gate 	 * IFF_RUNNING flag has changed state, the link state changes
1370*0Sstevel@tonic-gate 	 * are processed without a full scan.
1371*0Sstevel@tonic-gate 	 * If there is both an IPv4 and IPv6 instance associated with
1372*0Sstevel@tonic-gate 	 * the physical interface, we will get an RTM_IFINFO message
1373*0Sstevel@tonic-gate 	 * for each instance.  If we just maintained a single copy of
1374*0Sstevel@tonic-gate 	 * the physical interface flags, it would appear that no flags
1375*0Sstevel@tonic-gate 	 * had changed when the second message is processed, leading us
1376*0Sstevel@tonic-gate 	 * to believe that the message wasn't generated by a flags change,
1377*0Sstevel@tonic-gate 	 * and that a full interface scan is required.
1378*0Sstevel@tonic-gate 	 * To get around this problem, two additional copies of the flags
1379*0Sstevel@tonic-gate 	 * are kept, one copy for each instance.  These are only used in
1380*0Sstevel@tonic-gate 	 * this routine.  At any one time, all three copies of the flags
1381*0Sstevel@tonic-gate 	 * should be identical except for the IFF_RUNNING flag.	 The
1382*0Sstevel@tonic-gate 	 * copy of the flags in the "phyint" structure is always up to
1383*0Sstevel@tonic-gate 	 * date.
1384*0Sstevel@tonic-gate 	 */
1385*0Sstevel@tonic-gate 	pii = (type == AF_INET) ? pi->pi_v4 : pi->pi_v6;
1386*0Sstevel@tonic-gate 	if (pii == NULL) {
1387*0Sstevel@tonic-gate 		if (debug & D_LINKNOTE)
1388*0Sstevel@tonic-gate 			logdebug("process_rtm_ifinfo: no instance of address "
1389*0Sstevel@tonic-gate 			    "family %s for %s\n", AF_STR(type), pi->pi_name);
1390*0Sstevel@tonic-gate 		return (_B_TRUE);
1391*0Sstevel@tonic-gate 	}
1392*0Sstevel@tonic-gate 
1393*0Sstevel@tonic-gate 	old_flags = pii->pii_flags;
1394*0Sstevel@tonic-gate 	pii->pii_flags = PHYINT_FLAGS(ifm->ifm_flags);
1395*0Sstevel@tonic-gate 	pi->pi_flags = pii->pii_flags;
1396*0Sstevel@tonic-gate 
1397*0Sstevel@tonic-gate 	if (debug & D_LINKNOTE) {
1398*0Sstevel@tonic-gate 		logdebug("process_rtm_ifinfo: %s address family: %s, "
1399*0Sstevel@tonic-gate 		    "old flags: %llx, new flags: %llx\n", pi->pi_name,
1400*0Sstevel@tonic-gate 		    AF_STR(type), old_flags, pi->pi_flags);
1401*0Sstevel@tonic-gate 	}
1402*0Sstevel@tonic-gate 
1403*0Sstevel@tonic-gate 	/*
1404*0Sstevel@tonic-gate 	 * If IFF_STANDBY has changed, indicate that the interface has changed
1405*0Sstevel@tonic-gate 	 * types.
1406*0Sstevel@tonic-gate 	 */
1407*0Sstevel@tonic-gate 	if ((old_flags ^ pii->pii_flags) & IFF_STANDBY)
1408*0Sstevel@tonic-gate 		phyint_newtype(pi);
1409*0Sstevel@tonic-gate 
1410*0Sstevel@tonic-gate 	/*
1411*0Sstevel@tonic-gate 	 * If IFF_INACTIVE has been set, then no data addresses should be
1412*0Sstevel@tonic-gate 	 * hosted on the interface.  If IFF_INACTIVE has been cleared, then
1413*0Sstevel@tonic-gate 	 * move previously failed-over addresses back to it, provided it is
1414*0Sstevel@tonic-gate 	 * not failed.	For details, see the state diagram in mpd_probe.c.
1415*0Sstevel@tonic-gate 	 */
1416*0Sstevel@tonic-gate 	if ((old_flags ^ pii->pii_flags) & IFF_INACTIVE) {
1417*0Sstevel@tonic-gate 		if (pii->pii_flags & IFF_INACTIVE) {
1418*0Sstevel@tonic-gate 			assert(pii->pii_flags & IFF_STANDBY);
1419*0Sstevel@tonic-gate 			if (!pi->pi_empty) {
1420*0Sstevel@tonic-gate 				(void) try_failover(pi, FAILOVER_TO_NONSTANDBY);
1421*0Sstevel@tonic-gate 			}
1422*0Sstevel@tonic-gate 		} else {
1423*0Sstevel@tonic-gate 			if (pi->pi_state == PI_RUNNING && !pi->pi_full) {
1424*0Sstevel@tonic-gate 				pi->pi_empty = 0;
1425*0Sstevel@tonic-gate 				(void) try_failback(pi, _B_FALSE);
1426*0Sstevel@tonic-gate 			}
1427*0Sstevel@tonic-gate 		}
1428*0Sstevel@tonic-gate 	}
1429*0Sstevel@tonic-gate 
1430*0Sstevel@tonic-gate 	/* Has just the IFF_RUNNING flag changed state ? */
1431*0Sstevel@tonic-gate 	if ((old_flags ^ pii->pii_flags) != IFF_RUNNING) {
1432*0Sstevel@tonic-gate 		struct phyint_instance *pii_other;
1433*0Sstevel@tonic-gate 		/*
1434*0Sstevel@tonic-gate 		 * It wasn't just a link state change.	Update
1435*0Sstevel@tonic-gate 		 * the other instance's copy of the flags.
1436*0Sstevel@tonic-gate 		 */
1437*0Sstevel@tonic-gate 		pii_other = phyint_inst_other(pii);
1438*0Sstevel@tonic-gate 		if (pii_other != NULL)
1439*0Sstevel@tonic-gate 			pii_other->pii_flags = pii->pii_flags;
1440*0Sstevel@tonic-gate 		return (_B_TRUE);
1441*0Sstevel@tonic-gate 	}
1442*0Sstevel@tonic-gate 
1443*0Sstevel@tonic-gate 	return (_B_FALSE);
1444*0Sstevel@tonic-gate }
1445*0Sstevel@tonic-gate 
1446*0Sstevel@tonic-gate /*
1447*0Sstevel@tonic-gate  * Retrieve as many routing socket messages as possible, and try to
1448*0Sstevel@tonic-gate  * empty the routing sockets. Initiate full scan of targets or interfaces
1449*0Sstevel@tonic-gate  * as needed.
1450*0Sstevel@tonic-gate  * We listen on separate IPv4 an IPv6 sockets so that we can accurately
1451*0Sstevel@tonic-gate  * detect changes in certain flags (see "process_rtm_ifinfo()" above).
1452*0Sstevel@tonic-gate  */
1453*0Sstevel@tonic-gate static void
1454*0Sstevel@tonic-gate process_rtsock(int rtsock_v4, int rtsock_v6)
1455*0Sstevel@tonic-gate {
1456*0Sstevel@tonic-gate 	int	nbytes;
1457*0Sstevel@tonic-gate 	int64_t msg[2048 / 8];
1458*0Sstevel@tonic-gate 	struct rt_msghdr *rtm;
1459*0Sstevel@tonic-gate 	boolean_t need_if_scan = _B_FALSE;
1460*0Sstevel@tonic-gate 	boolean_t need_rt_scan = _B_FALSE;
1461*0Sstevel@tonic-gate 	boolean_t rtm_ifinfo_seen = _B_FALSE;
1462*0Sstevel@tonic-gate 	int type;
1463*0Sstevel@tonic-gate 
1464*0Sstevel@tonic-gate 	/* Read as many messages as possible and try to empty the sockets */
1465*0Sstevel@tonic-gate 	for (type = AF_INET; ; type = AF_INET6) {
1466*0Sstevel@tonic-gate 		for (;;) {
1467*0Sstevel@tonic-gate 			nbytes = read((type == AF_INET) ? rtsock_v4 :
1468*0Sstevel@tonic-gate 				rtsock_v6, msg, sizeof (msg));
1469*0Sstevel@tonic-gate 			if (nbytes <= 0) {
1470*0Sstevel@tonic-gate 				/* No more messages */
1471*0Sstevel@tonic-gate 				break;
1472*0Sstevel@tonic-gate 			}
1473*0Sstevel@tonic-gate 			rtm = (struct rt_msghdr *)msg;
1474*0Sstevel@tonic-gate 			if (rtm->rtm_version != RTM_VERSION) {
1475*0Sstevel@tonic-gate 				logerr("process_rtsock: version %d "
1476*0Sstevel@tonic-gate 				    "not understood\n", rtm->rtm_version);
1477*0Sstevel@tonic-gate 				break;
1478*0Sstevel@tonic-gate 			}
1479*0Sstevel@tonic-gate 
1480*0Sstevel@tonic-gate 			if (debug & D_PHYINT) {
1481*0Sstevel@tonic-gate 				logdebug("process_rtsock: message %d\n",
1482*0Sstevel@tonic-gate 				    rtm->rtm_type);
1483*0Sstevel@tonic-gate 			}
1484*0Sstevel@tonic-gate 
1485*0Sstevel@tonic-gate 			switch (rtm->rtm_type) {
1486*0Sstevel@tonic-gate 			case RTM_NEWADDR:
1487*0Sstevel@tonic-gate 			case RTM_DELADDR:
1488*0Sstevel@tonic-gate 				/*
1489*0Sstevel@tonic-gate 				 * Some logical interface has changed,
1490*0Sstevel@tonic-gate 				 * have to scan everything to determine
1491*0Sstevel@tonic-gate 				 * what actually changed.
1492*0Sstevel@tonic-gate 				 */
1493*0Sstevel@tonic-gate 				need_if_scan = _B_TRUE;
1494*0Sstevel@tonic-gate 				break;
1495*0Sstevel@tonic-gate 
1496*0Sstevel@tonic-gate 			case RTM_IFINFO:
1497*0Sstevel@tonic-gate 				rtm_ifinfo_seen = _B_TRUE;
1498*0Sstevel@tonic-gate 				need_if_scan |=
1499*0Sstevel@tonic-gate 					process_rtm_ifinfo((if_msghdr_t *)rtm,
1500*0Sstevel@tonic-gate 					type);
1501*0Sstevel@tonic-gate 				break;
1502*0Sstevel@tonic-gate 
1503*0Sstevel@tonic-gate 			case RTM_ADD:
1504*0Sstevel@tonic-gate 			case RTM_DELETE:
1505*0Sstevel@tonic-gate 			case RTM_CHANGE:
1506*0Sstevel@tonic-gate 			case RTM_OLDADD:
1507*0Sstevel@tonic-gate 			case RTM_OLDDEL:
1508*0Sstevel@tonic-gate 				need_rt_scan = _B_TRUE;
1509*0Sstevel@tonic-gate 				break;
1510*0Sstevel@tonic-gate 
1511*0Sstevel@tonic-gate 			default:
1512*0Sstevel@tonic-gate 				/* Not interesting */
1513*0Sstevel@tonic-gate 				break;
1514*0Sstevel@tonic-gate 			}
1515*0Sstevel@tonic-gate 		}
1516*0Sstevel@tonic-gate 		if (type == AF_INET6)
1517*0Sstevel@tonic-gate 			break;
1518*0Sstevel@tonic-gate 	}
1519*0Sstevel@tonic-gate 
1520*0Sstevel@tonic-gate 	if (need_if_scan) {
1521*0Sstevel@tonic-gate 		if (debug & D_LINKNOTE && rtm_ifinfo_seen)
1522*0Sstevel@tonic-gate 			logdebug("process_rtsock: synchronizing with kernel\n");
1523*0Sstevel@tonic-gate 		initifs();
1524*0Sstevel@tonic-gate 	} else if (rtm_ifinfo_seen) {
1525*0Sstevel@tonic-gate 		if (debug & D_LINKNOTE)
1526*0Sstevel@tonic-gate 			logdebug("process_rtsock: "
1527*0Sstevel@tonic-gate 			    "link up/down notification(s) seen\n");
1528*0Sstevel@tonic-gate 		process_link_state_changes();
1529*0Sstevel@tonic-gate 	}
1530*0Sstevel@tonic-gate 
1531*0Sstevel@tonic-gate 	if (need_rt_scan)
1532*0Sstevel@tonic-gate 		init_router_targets();
1533*0Sstevel@tonic-gate }
1534*0Sstevel@tonic-gate 
1535*0Sstevel@tonic-gate /*
1536*0Sstevel@tonic-gate  * Look if the phyint instance or one of its logints have been removed from
1537*0Sstevel@tonic-gate  * the kernel and take appropriate action.
1538*0Sstevel@tonic-gate  * Uses {pii,li}_in_use.
1539*0Sstevel@tonic-gate  */
1540*0Sstevel@tonic-gate static void
1541*0Sstevel@tonic-gate check_if_removed(struct phyint_instance *pii)
1542*0Sstevel@tonic-gate {
1543*0Sstevel@tonic-gate 	struct logint *li;
1544*0Sstevel@tonic-gate 	struct logint *next_li;
1545*0Sstevel@tonic-gate 
1546*0Sstevel@tonic-gate 	/* Detect phyints that have been removed from the kernel. */
1547*0Sstevel@tonic-gate 	if (!pii->pii_in_use) {
1548*0Sstevel@tonic-gate 		logtrace("%s %s has been removed from kernel\n",
1549*0Sstevel@tonic-gate 		    AF_STR(pii->pii_af), pii->pii_phyint->pi_name);
1550*0Sstevel@tonic-gate 		phyint_inst_delete(pii);
1551*0Sstevel@tonic-gate 	} else {
1552*0Sstevel@tonic-gate 		/* Detect logints that have been removed. */
1553*0Sstevel@tonic-gate 		for (li = pii->pii_logint; li != NULL; li = next_li) {
1554*0Sstevel@tonic-gate 			next_li = li->li_next;
1555*0Sstevel@tonic-gate 			if (!li->li_in_use) {
1556*0Sstevel@tonic-gate 				logint_delete(li);
1557*0Sstevel@tonic-gate 			}
1558*0Sstevel@tonic-gate 		}
1559*0Sstevel@tonic-gate 	}
1560*0Sstevel@tonic-gate }
1561*0Sstevel@tonic-gate 
1562*0Sstevel@tonic-gate /*
1563*0Sstevel@tonic-gate  * Send down a T_OPTMGMT_REQ to ip asking for all data in the various
1564*0Sstevel@tonic-gate  * tables defined by mib2.h. Parse the returned data and extract
1565*0Sstevel@tonic-gate  * the 'routing' information table. Process the 'routing' table
1566*0Sstevel@tonic-gate  * to get the list of known onlink routers, and update our database.
1567*0Sstevel@tonic-gate  * These onlink routers will serve as our probe targets.
1568*0Sstevel@tonic-gate  * Returns false, if any system calls resulted in errors, true otherwise.
1569*0Sstevel@tonic-gate  */
1570*0Sstevel@tonic-gate static boolean_t
1571*0Sstevel@tonic-gate update_router_list(int fd)
1572*0Sstevel@tonic-gate {
1573*0Sstevel@tonic-gate 	union {
1574*0Sstevel@tonic-gate 		char	ubuf[1024];
1575*0Sstevel@tonic-gate 		union T_primitives uprim;
1576*0Sstevel@tonic-gate 	} buf;
1577*0Sstevel@tonic-gate 
1578*0Sstevel@tonic-gate 	int			flags;
1579*0Sstevel@tonic-gate 	struct strbuf		ctlbuf;
1580*0Sstevel@tonic-gate 	struct strbuf		databuf;
1581*0Sstevel@tonic-gate 	struct T_optmgmt_req	*tor;
1582*0Sstevel@tonic-gate 	struct T_optmgmt_ack	*toa;
1583*0Sstevel@tonic-gate 	struct T_error_ack	*tea;
1584*0Sstevel@tonic-gate 	struct opthdr		*optp;
1585*0Sstevel@tonic-gate 	struct opthdr		*req;
1586*0Sstevel@tonic-gate 	int			status;
1587*0Sstevel@tonic-gate 	t_scalar_t		prim;
1588*0Sstevel@tonic-gate 
1589*0Sstevel@tonic-gate 	tor = (struct T_optmgmt_req *)&buf;
1590*0Sstevel@tonic-gate 
1591*0Sstevel@tonic-gate 	tor->PRIM_type = T_SVR4_OPTMGMT_REQ;
1592*0Sstevel@tonic-gate 	tor->OPT_offset = sizeof (struct T_optmgmt_req);
1593*0Sstevel@tonic-gate 	tor->OPT_length = sizeof (struct opthdr);
1594*0Sstevel@tonic-gate 	tor->MGMT_flags = T_CURRENT;
1595*0Sstevel@tonic-gate 
1596*0Sstevel@tonic-gate 	req = (struct opthdr *)&tor[1];
1597*0Sstevel@tonic-gate 	req->level = MIB2_IP;	/* any MIB2_xxx value ok here */
1598*0Sstevel@tonic-gate 	req->name  = 0;
1599*0Sstevel@tonic-gate 	req->len   = 0;
1600*0Sstevel@tonic-gate 
1601*0Sstevel@tonic-gate 	ctlbuf.buf = (char *)&buf;
1602*0Sstevel@tonic-gate 	ctlbuf.len = tor->OPT_length + tor->OPT_offset;
1603*0Sstevel@tonic-gate 	ctlbuf.maxlen = sizeof (buf);
1604*0Sstevel@tonic-gate 	flags = 0;
1605*0Sstevel@tonic-gate 	if (putmsg(fd, &ctlbuf, NULL, flags) == -1) {
1606*0Sstevel@tonic-gate 		logperror("update_router_list: putmsg(ctl)");
1607*0Sstevel@tonic-gate 		return (_B_FALSE);
1608*0Sstevel@tonic-gate 	}
1609*0Sstevel@tonic-gate 
1610*0Sstevel@tonic-gate 	/*
1611*0Sstevel@tonic-gate 	 * The response consists of multiple T_OPTMGMT_ACK msgs, 1 msg for
1612*0Sstevel@tonic-gate 	 * each table defined in mib2.h.  Each T_OPTMGMT_ACK msg contains
1613*0Sstevel@tonic-gate 	 * a control and data part. The control part contains a struct
1614*0Sstevel@tonic-gate 	 * T_optmgmt_ack followed by a struct opthdr. The 'opthdr' identifies
1615*0Sstevel@tonic-gate 	 * the level, name and length of the data in the data part. The
1616*0Sstevel@tonic-gate 	 * data part contains the actual table data. The last message
1617*0Sstevel@tonic-gate 	 * is an end-of-data (EOD), consisting of a T_OPTMGMT_ACK and a
1618*0Sstevel@tonic-gate 	 * single option with zero optlen.
1619*0Sstevel@tonic-gate 	 */
1620*0Sstevel@tonic-gate 
1621*0Sstevel@tonic-gate 	for (;;) {
1622*0Sstevel@tonic-gate 		/*
1623*0Sstevel@tonic-gate 		 * Go around this loop once for each table. Ignore
1624*0Sstevel@tonic-gate 		 * all tables except the routing information table.
1625*0Sstevel@tonic-gate 		 */
1626*0Sstevel@tonic-gate 		flags = 0;
1627*0Sstevel@tonic-gate 		status = getmsg(fd, &ctlbuf, NULL, &flags);
1628*0Sstevel@tonic-gate 		if (status < 0) {
1629*0Sstevel@tonic-gate 			if (errno == EINTR)
1630*0Sstevel@tonic-gate 				continue;
1631*0Sstevel@tonic-gate 			logperror("update_router_list: getmsg(ctl)");
1632*0Sstevel@tonic-gate 			return (_B_FALSE);
1633*0Sstevel@tonic-gate 		}
1634*0Sstevel@tonic-gate 		if (ctlbuf.len < sizeof (t_scalar_t)) {
1635*0Sstevel@tonic-gate 			logerr("update_router_list: ctlbuf.len %d\n",
1636*0Sstevel@tonic-gate 			    ctlbuf.len);
1637*0Sstevel@tonic-gate 			return (_B_FALSE);
1638*0Sstevel@tonic-gate 		}
1639*0Sstevel@tonic-gate 
1640*0Sstevel@tonic-gate 		prim = buf.uprim.type;
1641*0Sstevel@tonic-gate 
1642*0Sstevel@tonic-gate 		switch (prim) {
1643*0Sstevel@tonic-gate 
1644*0Sstevel@tonic-gate 		case T_ERROR_ACK:
1645*0Sstevel@tonic-gate 			tea = &buf.uprim.error_ack;
1646*0Sstevel@tonic-gate 			if (ctlbuf.len < sizeof (struct T_error_ack)) {
1647*0Sstevel@tonic-gate 				logerr("update_router_list: T_ERROR_ACK"
1648*0Sstevel@tonic-gate 				    " ctlbuf.len %d\n", ctlbuf.len);
1649*0Sstevel@tonic-gate 				return (_B_FALSE);
1650*0Sstevel@tonic-gate 			}
1651*0Sstevel@tonic-gate 			logerr("update_router_list: T_ERROR_ACK:"
1652*0Sstevel@tonic-gate 			    " TLI_error = 0x%lx, UNIX_error = 0x%lx\n",
1653*0Sstevel@tonic-gate 			    tea->TLI_error, tea->UNIX_error);
1654*0Sstevel@tonic-gate 			return (_B_FALSE);
1655*0Sstevel@tonic-gate 
1656*0Sstevel@tonic-gate 		case T_OPTMGMT_ACK:
1657*0Sstevel@tonic-gate 			toa = &buf.uprim.optmgmt_ack;
1658*0Sstevel@tonic-gate 			optp = (struct opthdr *)&toa[1];
1659*0Sstevel@tonic-gate 			if (ctlbuf.len < sizeof (struct T_optmgmt_ack)) {
1660*0Sstevel@tonic-gate 				logerr("update_router_list: ctlbuf.len %d\n",
1661*0Sstevel@tonic-gate 				    ctlbuf.len);
1662*0Sstevel@tonic-gate 				return (_B_FALSE);
1663*0Sstevel@tonic-gate 			}
1664*0Sstevel@tonic-gate 			if (toa->MGMT_flags != T_SUCCESS) {
1665*0Sstevel@tonic-gate 				logerr("update_router_list: MGMT_flags 0x%lx\n",
1666*0Sstevel@tonic-gate 				    toa->MGMT_flags);
1667*0Sstevel@tonic-gate 				return (_B_FALSE);
1668*0Sstevel@tonic-gate 			}
1669*0Sstevel@tonic-gate 			break;
1670*0Sstevel@tonic-gate 
1671*0Sstevel@tonic-gate 		default:
1672*0Sstevel@tonic-gate 			logerr("update_router_list: unknown primitive %ld\n",
1673*0Sstevel@tonic-gate 			    prim);
1674*0Sstevel@tonic-gate 			return (_B_FALSE);
1675*0Sstevel@tonic-gate 		}
1676*0Sstevel@tonic-gate 
1677*0Sstevel@tonic-gate 		/* Process the T_OPGMGMT_ACK below */
1678*0Sstevel@tonic-gate 		assert(prim == T_OPTMGMT_ACK);
1679*0Sstevel@tonic-gate 
1680*0Sstevel@tonic-gate 		switch (status) {
1681*0Sstevel@tonic-gate 		case 0:
1682*0Sstevel@tonic-gate 			/*
1683*0Sstevel@tonic-gate 			 * We have reached the end of this T_OPTMGMT_ACK
1684*0Sstevel@tonic-gate 			 * message. If this is the last message i.e EOD,
1685*0Sstevel@tonic-gate 			 * return, else process the next T_OPTMGMT_ACK msg.
1686*0Sstevel@tonic-gate 			 */
1687*0Sstevel@tonic-gate 			if ((ctlbuf.len == sizeof (struct T_optmgmt_ack) +
1688*0Sstevel@tonic-gate 			    sizeof (struct opthdr)) && optp->len == 0 &&
1689*0Sstevel@tonic-gate 			    optp->name == 0 && optp->level == 0) {
1690*0Sstevel@tonic-gate 				/*
1691*0Sstevel@tonic-gate 				 * This is the EOD message. Return
1692*0Sstevel@tonic-gate 				 */
1693*0Sstevel@tonic-gate 				return (_B_TRUE);
1694*0Sstevel@tonic-gate 			}
1695*0Sstevel@tonic-gate 			continue;
1696*0Sstevel@tonic-gate 
1697*0Sstevel@tonic-gate 		case MORECTL:
1698*0Sstevel@tonic-gate 		case MORECTL | MOREDATA:
1699*0Sstevel@tonic-gate 			/*
1700*0Sstevel@tonic-gate 			 * This should not happen. We should be able to read
1701*0Sstevel@tonic-gate 			 * the control portion in a single getmsg.
1702*0Sstevel@tonic-gate 			 */
1703*0Sstevel@tonic-gate 			logerr("update_router_list: MORECTL\n");
1704*0Sstevel@tonic-gate 			return (_B_FALSE);
1705*0Sstevel@tonic-gate 
1706*0Sstevel@tonic-gate 		case MOREDATA:
1707*0Sstevel@tonic-gate 			databuf.maxlen = optp->len;
1708*0Sstevel@tonic-gate 			/* malloc of 0 bytes is ok */
1709*0Sstevel@tonic-gate 			databuf.buf = malloc((size_t)optp->len);
1710*0Sstevel@tonic-gate 			if (databuf.maxlen != 0 && databuf.buf == NULL) {
1711*0Sstevel@tonic-gate 				logperror("update_router_list: malloc");
1712*0Sstevel@tonic-gate 				return (_B_FALSE);
1713*0Sstevel@tonic-gate 			}
1714*0Sstevel@tonic-gate 			databuf.len = 0;
1715*0Sstevel@tonic-gate 			flags = 0;
1716*0Sstevel@tonic-gate 			for (;;) {
1717*0Sstevel@tonic-gate 				status = getmsg(fd, NULL, &databuf, &flags);
1718*0Sstevel@tonic-gate 				if (status >= 0) {
1719*0Sstevel@tonic-gate 					break;
1720*0Sstevel@tonic-gate 				} else if (errno == EINTR) {
1721*0Sstevel@tonic-gate 					continue;
1722*0Sstevel@tonic-gate 				} else {
1723*0Sstevel@tonic-gate 					logperror("update_router_list:"
1724*0Sstevel@tonic-gate 					    " getmsg(data)");
1725*0Sstevel@tonic-gate 					free(databuf.buf);
1726*0Sstevel@tonic-gate 					return (_B_FALSE);
1727*0Sstevel@tonic-gate 				}
1728*0Sstevel@tonic-gate 			}
1729*0Sstevel@tonic-gate 
1730*0Sstevel@tonic-gate 			if (optp->level == MIB2_IP &&
1731*0Sstevel@tonic-gate 			    optp->name == MIB2_IP_ROUTE) {
1732*0Sstevel@tonic-gate 				/* LINTED */
1733*0Sstevel@tonic-gate 				ire_process_v4((mib2_ipRouteEntry_t *)
1734*0Sstevel@tonic-gate 				    databuf.buf, databuf.len);
1735*0Sstevel@tonic-gate 			} else if (optp->level == MIB2_IP6 &&
1736*0Sstevel@tonic-gate 			    optp->name == MIB2_IP6_ROUTE) {
1737*0Sstevel@tonic-gate 				/* LINTED */
1738*0Sstevel@tonic-gate 				ire_process_v6((mib2_ipv6RouteEntry_t *)
1739*0Sstevel@tonic-gate 				    databuf.buf, databuf.len);
1740*0Sstevel@tonic-gate 			}
1741*0Sstevel@tonic-gate 			free(databuf.buf);
1742*0Sstevel@tonic-gate 		}
1743*0Sstevel@tonic-gate 	}
1744*0Sstevel@tonic-gate 	/* NOTREACHED */
1745*0Sstevel@tonic-gate }
1746*0Sstevel@tonic-gate 
1747*0Sstevel@tonic-gate /*
1748*0Sstevel@tonic-gate  * Examine the IPv4 routing table, for default routers. For each default
1749*0Sstevel@tonic-gate  * router, populate the list of targets of each phyint that is on the same
1750*0Sstevel@tonic-gate  * link as the default router
1751*0Sstevel@tonic-gate  */
1752*0Sstevel@tonic-gate static void
1753*0Sstevel@tonic-gate ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len)
1754*0Sstevel@tonic-gate {
1755*0Sstevel@tonic-gate 	mib2_ipRouteEntry_t	*rp;
1756*0Sstevel@tonic-gate 	mib2_ipRouteEntry_t	*rp1;
1757*0Sstevel@tonic-gate 	struct	in_addr		nexthop_v4;
1758*0Sstevel@tonic-gate 	mib2_ipRouteEntry_t	*endp;
1759*0Sstevel@tonic-gate 
1760*0Sstevel@tonic-gate 	if (len == 0)
1761*0Sstevel@tonic-gate 		return;
1762*0Sstevel@tonic-gate 	assert((len % sizeof (mib2_ipRouteEntry_t)) == 0);
1763*0Sstevel@tonic-gate 
1764*0Sstevel@tonic-gate 	endp = buf + (len / sizeof (mib2_ipRouteEntry_t));
1765*0Sstevel@tonic-gate 
1766*0Sstevel@tonic-gate 	/*
1767*0Sstevel@tonic-gate 	 * Loop thru the routing table entries. Process any IRE_DEFAULT,
1768*0Sstevel@tonic-gate 	 * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others.
1769*0Sstevel@tonic-gate 	 * For each such IRE_OFFSUBNET ire, get the nexthop gateway address.
1770*0Sstevel@tonic-gate 	 * This is a potential target for probing, which we try to add
1771*0Sstevel@tonic-gate 	 * to the list of probe targets.
1772*0Sstevel@tonic-gate 	 */
1773*0Sstevel@tonic-gate 	for (rp = buf; rp < endp; rp++) {
1774*0Sstevel@tonic-gate 		if (!(rp->ipRouteInfo.re_ire_type & IRE_OFFSUBNET))
1775*0Sstevel@tonic-gate 			continue;
1776*0Sstevel@tonic-gate 
1777*0Sstevel@tonic-gate 		/*  Get the nexthop address. */
1778*0Sstevel@tonic-gate 		nexthop_v4.s_addr = rp->ipRouteNextHop;
1779*0Sstevel@tonic-gate 
1780*0Sstevel@tonic-gate 		/*
1781*0Sstevel@tonic-gate 		 * Get the nexthop address. Then determine the outgoing
1782*0Sstevel@tonic-gate 		 * interface, by examining all interface IREs, and picking the
1783*0Sstevel@tonic-gate 		 * match. We don't look at the interface specified in the route
1784*0Sstevel@tonic-gate 		 * because we need to add the router target on all matching
1785*0Sstevel@tonic-gate 		 * interfaces anyway; the goal is to avoid falling back to
1786*0Sstevel@tonic-gate 		 * multicast when some interfaces are in the same subnet but
1787*0Sstevel@tonic-gate 		 * not in the same group.
1788*0Sstevel@tonic-gate 		 */
1789*0Sstevel@tonic-gate 		for (rp1 = buf; rp1 < endp; rp1++) {
1790*0Sstevel@tonic-gate 			if (!(rp1->ipRouteInfo.re_ire_type & IRE_INTERFACE)) {
1791*0Sstevel@tonic-gate 				continue;
1792*0Sstevel@tonic-gate 			}
1793*0Sstevel@tonic-gate 
1794*0Sstevel@tonic-gate 			/*
1795*0Sstevel@tonic-gate 			 * Determine the interface IRE that matches the nexthop.
1796*0Sstevel@tonic-gate 			 * i.e.	 (IRE addr & IRE mask) == (nexthop & IRE mask)
1797*0Sstevel@tonic-gate 			 */
1798*0Sstevel@tonic-gate 			if ((rp1->ipRouteDest & rp1->ipRouteMask) ==
1799*0Sstevel@tonic-gate 			    (nexthop_v4.s_addr & rp1->ipRouteMask)) {
1800*0Sstevel@tonic-gate 				/*
1801*0Sstevel@tonic-gate 				 * We found the interface ire
1802*0Sstevel@tonic-gate 				 */
1803*0Sstevel@tonic-gate 				router_add_v4(rp1, nexthop_v4);
1804*0Sstevel@tonic-gate 			}
1805*0Sstevel@tonic-gate 		}
1806*0Sstevel@tonic-gate 	}
1807*0Sstevel@tonic-gate }
1808*0Sstevel@tonic-gate 
1809*0Sstevel@tonic-gate void
1810*0Sstevel@tonic-gate router_add_v4(mib2_ipRouteEntry_t *rp1, struct in_addr nexthop_v4)
1811*0Sstevel@tonic-gate {
1812*0Sstevel@tonic-gate 	char *cp;
1813*0Sstevel@tonic-gate 	char ifname[LIFNAMSIZ + 1];
1814*0Sstevel@tonic-gate 	struct in6_addr	nexthop;
1815*0Sstevel@tonic-gate 	int len;
1816*0Sstevel@tonic-gate 
1817*0Sstevel@tonic-gate 	if (debug & D_TARGET)
1818*0Sstevel@tonic-gate 		logdebug("router_add_v4()\n");
1819*0Sstevel@tonic-gate 
1820*0Sstevel@tonic-gate 	len = MIN(rp1->ipRouteIfIndex.o_length, sizeof (ifname) - 1);
1821*0Sstevel@tonic-gate 	(void) memcpy(ifname, rp1->ipRouteIfIndex.o_bytes, len);
1822*0Sstevel@tonic-gate 	ifname[len] = '\0';
1823*0Sstevel@tonic-gate 
1824*0Sstevel@tonic-gate 	if (ifname[0] == '\0')
1825*0Sstevel@tonic-gate 		return;
1826*0Sstevel@tonic-gate 
1827*0Sstevel@tonic-gate 	cp = strchr(ifname, IF_SEPARATOR);
1828*0Sstevel@tonic-gate 	if (cp != NULL)
1829*0Sstevel@tonic-gate 		*cp = '\0';
1830*0Sstevel@tonic-gate 
1831*0Sstevel@tonic-gate 	IN6_INADDR_TO_V4MAPPED(&nexthop_v4, &nexthop);
1832*0Sstevel@tonic-gate 	router_add_common(AF_INET, ifname, nexthop);
1833*0Sstevel@tonic-gate }
1834*0Sstevel@tonic-gate 
1835*0Sstevel@tonic-gate void
1836*0Sstevel@tonic-gate router_add_common(int af, char *ifname, struct in6_addr nexthop)
1837*0Sstevel@tonic-gate {
1838*0Sstevel@tonic-gate 	struct phyint_instance *pii;
1839*0Sstevel@tonic-gate 	struct phyint *pi;
1840*0Sstevel@tonic-gate 
1841*0Sstevel@tonic-gate 	if (debug & D_TARGET)
1842*0Sstevel@tonic-gate 		logdebug("router_add_common(%s %s)\n", AF_STR(af), ifname);
1843*0Sstevel@tonic-gate 
1844*0Sstevel@tonic-gate 	/*
1845*0Sstevel@tonic-gate 	 * Retrieve the phyint instance; bail if it's not known to us yet.
1846*0Sstevel@tonic-gate 	 */
1847*0Sstevel@tonic-gate 	pii = phyint_inst_lookup(af, ifname);
1848*0Sstevel@tonic-gate 	if (pii == NULL)
1849*0Sstevel@tonic-gate 		return;
1850*0Sstevel@tonic-gate 
1851*0Sstevel@tonic-gate 	/*
1852*0Sstevel@tonic-gate 	 * Don't use our own addresses as targets.
1853*0Sstevel@tonic-gate 	 */
1854*0Sstevel@tonic-gate 	if (own_address(pii->pii_af, nexthop))
1855*0Sstevel@tonic-gate 		return;
1856*0Sstevel@tonic-gate 
1857*0Sstevel@tonic-gate 	/*
1858*0Sstevel@tonic-gate 	 * If the phyint is part a named group, then add the address to all
1859*0Sstevel@tonic-gate 	 * members of the group; note that this is suboptimal in the IPv4 case
1860*0Sstevel@tonic-gate 	 * as it has already been added to all matching interfaces in
1861*0Sstevel@tonic-gate 	 * ire_process_v4(). Otherwise, add the address only to the phyint
1862*0Sstevel@tonic-gate 	 * itself, since other phyints in the anongroup may not be on the same
1863*0Sstevel@tonic-gate 	 * subnet.
1864*0Sstevel@tonic-gate 	 */
1865*0Sstevel@tonic-gate 	pi = pii->pii_phyint;
1866*0Sstevel@tonic-gate 	if (pi->pi_group == phyint_anongroup) {
1867*0Sstevel@tonic-gate 		target_add(pii, nexthop, _B_TRUE);
1868*0Sstevel@tonic-gate 	} else {
1869*0Sstevel@tonic-gate 		pi = pi->pi_group->pg_phyint;
1870*0Sstevel@tonic-gate 		for (; pi != NULL; pi = pi->pi_pgnext)
1871*0Sstevel@tonic-gate 			target_add(PHYINT_INSTANCE(pi, af), nexthop, _B_TRUE);
1872*0Sstevel@tonic-gate 	}
1873*0Sstevel@tonic-gate }
1874*0Sstevel@tonic-gate 
1875*0Sstevel@tonic-gate /*
1876*0Sstevel@tonic-gate  * Examine the IPv6 routing table, for default routers. For each default
1877*0Sstevel@tonic-gate  * router, populate the list of targets of each phyint that is on the same
1878*0Sstevel@tonic-gate  * link as the default router
1879*0Sstevel@tonic-gate  */
1880*0Sstevel@tonic-gate static void
1881*0Sstevel@tonic-gate ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len)
1882*0Sstevel@tonic-gate {
1883*0Sstevel@tonic-gate 	mib2_ipv6RouteEntry_t	*rp;
1884*0Sstevel@tonic-gate 	mib2_ipv6RouteEntry_t	*endp;
1885*0Sstevel@tonic-gate 	struct	in6_addr nexthop_v6;
1886*0Sstevel@tonic-gate 
1887*0Sstevel@tonic-gate 	if (debug & D_TARGET)
1888*0Sstevel@tonic-gate 		logdebug("ire_process_v6(len %d)\n", len);
1889*0Sstevel@tonic-gate 
1890*0Sstevel@tonic-gate 	if (len == 0)
1891*0Sstevel@tonic-gate 		return;
1892*0Sstevel@tonic-gate 
1893*0Sstevel@tonic-gate 	assert((len % sizeof (mib2_ipv6RouteEntry_t)) == 0);
1894*0Sstevel@tonic-gate 	endp = buf + (len / sizeof (mib2_ipv6RouteEntry_t));
1895*0Sstevel@tonic-gate 
1896*0Sstevel@tonic-gate 	/*
1897*0Sstevel@tonic-gate 	 * Loop thru the routing table entries. Process any IRE_DEFAULT,
1898*0Sstevel@tonic-gate 	 * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others.
1899*0Sstevel@tonic-gate 	 * For each such IRE_OFFSUBNET ire, get the nexthop gateway address.
1900*0Sstevel@tonic-gate 	 * This is a potential target for probing, which we try to add
1901*0Sstevel@tonic-gate 	 * to the list of probe targets.
1902*0Sstevel@tonic-gate 	 */
1903*0Sstevel@tonic-gate 	for (rp = buf; rp < endp; rp++) {
1904*0Sstevel@tonic-gate 		if (!(rp->ipv6RouteInfo.re_ire_type & IRE_OFFSUBNET))
1905*0Sstevel@tonic-gate 			continue;
1906*0Sstevel@tonic-gate 
1907*0Sstevel@tonic-gate 		/*
1908*0Sstevel@tonic-gate 		 * We have the outgoing interface in ipv6RouteIfIndex
1909*0Sstevel@tonic-gate 		 * if ipv6RouteIfindex.o_length is non-zero. The outgoing
1910*0Sstevel@tonic-gate 		 * interface must be present for link-local addresses. Since
1911*0Sstevel@tonic-gate 		 * we use only link-local addreses for probing, we don't
1912*0Sstevel@tonic-gate 		 * consider the case when the outgoing interface is not
1913*0Sstevel@tonic-gate 		 * known and we need to scan interface ires
1914*0Sstevel@tonic-gate 		 */
1915*0Sstevel@tonic-gate 		nexthop_v6 = rp->ipv6RouteNextHop;
1916*0Sstevel@tonic-gate 		if (rp->ipv6RouteIfIndex.o_length != 0) {
1917*0Sstevel@tonic-gate 			/*
1918*0Sstevel@tonic-gate 			 * We already have the outgoing interface
1919*0Sstevel@tonic-gate 			 * in ipv6RouteIfIndex.
1920*0Sstevel@tonic-gate 			 */
1921*0Sstevel@tonic-gate 			router_add_v6(rp, nexthop_v6);
1922*0Sstevel@tonic-gate 		}
1923*0Sstevel@tonic-gate 	}
1924*0Sstevel@tonic-gate }
1925*0Sstevel@tonic-gate 
1926*0Sstevel@tonic-gate 
1927*0Sstevel@tonic-gate void
1928*0Sstevel@tonic-gate router_add_v6(mib2_ipv6RouteEntry_t *rp1, struct in6_addr nexthop_v6)
1929*0Sstevel@tonic-gate {
1930*0Sstevel@tonic-gate 	char ifname[LIFNAMSIZ + 1];
1931*0Sstevel@tonic-gate 	char *cp;
1932*0Sstevel@tonic-gate 	int  len;
1933*0Sstevel@tonic-gate 
1934*0Sstevel@tonic-gate 	if (debug & D_TARGET)
1935*0Sstevel@tonic-gate 		logdebug("router_add_v6()\n");
1936*0Sstevel@tonic-gate 
1937*0Sstevel@tonic-gate 	len = MIN(rp1->ipv6RouteIfIndex.o_length, sizeof (ifname) - 1);
1938*0Sstevel@tonic-gate 	(void) memcpy(ifname, rp1->ipv6RouteIfIndex.o_bytes, len);
1939*0Sstevel@tonic-gate 	ifname[len] = '\0';
1940*0Sstevel@tonic-gate 
1941*0Sstevel@tonic-gate 	if (ifname[0] == '\0')
1942*0Sstevel@tonic-gate 		return;
1943*0Sstevel@tonic-gate 
1944*0Sstevel@tonic-gate 	cp = strchr(ifname, IF_SEPARATOR);
1945*0Sstevel@tonic-gate 	if (cp != NULL)
1946*0Sstevel@tonic-gate 		*cp = '\0';
1947*0Sstevel@tonic-gate 
1948*0Sstevel@tonic-gate 	router_add_common(AF_INET6, ifname, nexthop_v6);
1949*0Sstevel@tonic-gate }
1950*0Sstevel@tonic-gate 
1951*0Sstevel@tonic-gate 
1952*0Sstevel@tonic-gate 
1953*0Sstevel@tonic-gate /*
1954*0Sstevel@tonic-gate  * Build a list of target routers, by scanning the routing tables.
1955*0Sstevel@tonic-gate  * It is assumed that interface routes exist, to reach the routers.
1956*0Sstevel@tonic-gate  */
1957*0Sstevel@tonic-gate static void
1958*0Sstevel@tonic-gate init_router_targets(void)
1959*0Sstevel@tonic-gate {
1960*0Sstevel@tonic-gate 	struct	target *tg;
1961*0Sstevel@tonic-gate 	struct	target *next_tg;
1962*0Sstevel@tonic-gate 	struct	phyint_instance *pii;
1963*0Sstevel@tonic-gate 	struct	phyint *pi;
1964*0Sstevel@tonic-gate 
1965*0Sstevel@tonic-gate 	if (force_mcast)
1966*0Sstevel@tonic-gate 		return;
1967*0Sstevel@tonic-gate 
1968*0Sstevel@tonic-gate 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
1969*0Sstevel@tonic-gate 		pi = pii->pii_phyint;
1970*0Sstevel@tonic-gate 		/*
1971*0Sstevel@tonic-gate 		 * Exclude ptp and host targets. Set tg_in_use to false,
1972*0Sstevel@tonic-gate 		 * only for router targets.
1973*0Sstevel@tonic-gate 		 */
1974*0Sstevel@tonic-gate 		if (!pii->pii_targets_are_routers ||
1975*0Sstevel@tonic-gate 		    (pi->pi_flags & IFF_POINTOPOINT))
1976*0Sstevel@tonic-gate 			continue;
1977*0Sstevel@tonic-gate 
1978*0Sstevel@tonic-gate 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next)
1979*0Sstevel@tonic-gate 			tg->tg_in_use = 0;
1980*0Sstevel@tonic-gate 	}
1981*0Sstevel@tonic-gate 
1982*0Sstevel@tonic-gate 	if (mibfd < 0) {
1983*0Sstevel@tonic-gate 		mibfd = open("/dev/ip", O_RDWR);
1984*0Sstevel@tonic-gate 		if (mibfd < 0) {
1985*0Sstevel@tonic-gate 			logperror("mibopen: ip open");
1986*0Sstevel@tonic-gate 			exit(1);
1987*0Sstevel@tonic-gate 		}
1988*0Sstevel@tonic-gate 	}
1989*0Sstevel@tonic-gate 
1990*0Sstevel@tonic-gate 	if (!update_router_list(mibfd)) {
1991*0Sstevel@tonic-gate 		(void) close(mibfd);
1992*0Sstevel@tonic-gate 		mibfd = -1;
1993*0Sstevel@tonic-gate 	}
1994*0Sstevel@tonic-gate 
1995*0Sstevel@tonic-gate 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
1996*0Sstevel@tonic-gate 		if (!pii->pii_targets_are_routers ||
1997*0Sstevel@tonic-gate 		    (pi->pi_flags & IFF_POINTOPOINT))
1998*0Sstevel@tonic-gate 			continue;
1999*0Sstevel@tonic-gate 
2000*0Sstevel@tonic-gate 		for (tg = pii->pii_targets; tg != NULL; tg = next_tg) {
2001*0Sstevel@tonic-gate 			next_tg = tg->tg_next;
2002*0Sstevel@tonic-gate 			if (!tg->tg_in_use) {
2003*0Sstevel@tonic-gate 				target_delete(tg);
2004*0Sstevel@tonic-gate 			}
2005*0Sstevel@tonic-gate 		}
2006*0Sstevel@tonic-gate 	}
2007*0Sstevel@tonic-gate }
2008*0Sstevel@tonic-gate 
2009*0Sstevel@tonic-gate /*
2010*0Sstevel@tonic-gate  * Attempt to assign host targets to any interfaces that do not currently
2011*0Sstevel@tonic-gate  * have probe targets by sharing targets with other interfaces in the group.
2012*0Sstevel@tonic-gate  */
2013*0Sstevel@tonic-gate static void
2014*0Sstevel@tonic-gate init_host_targets(void)
2015*0Sstevel@tonic-gate {
2016*0Sstevel@tonic-gate 	struct phyint_instance *pii;
2017*0Sstevel@tonic-gate 	struct phyint_group *pg;
2018*0Sstevel@tonic-gate 
2019*0Sstevel@tonic-gate 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
2020*0Sstevel@tonic-gate 		pg = pii->pii_phyint->pi_group;
2021*0Sstevel@tonic-gate 		if (pg != phyint_anongroup && pii->pii_targets == NULL)
2022*0Sstevel@tonic-gate 			dup_host_targets(pii);
2023*0Sstevel@tonic-gate 	}
2024*0Sstevel@tonic-gate }
2025*0Sstevel@tonic-gate 
2026*0Sstevel@tonic-gate /*
2027*0Sstevel@tonic-gate  * Duplicate host targets from other phyints of the group to
2028*0Sstevel@tonic-gate  * the phyint instance 'desired_pii'.
2029*0Sstevel@tonic-gate  */
2030*0Sstevel@tonic-gate static void
2031*0Sstevel@tonic-gate dup_host_targets(struct phyint_instance	 *desired_pii)
2032*0Sstevel@tonic-gate {
2033*0Sstevel@tonic-gate 	int af;
2034*0Sstevel@tonic-gate 	struct phyint *pi;
2035*0Sstevel@tonic-gate 	struct phyint_instance *pii;
2036*0Sstevel@tonic-gate 	struct target *tg;
2037*0Sstevel@tonic-gate 
2038*0Sstevel@tonic-gate 	assert(desired_pii->pii_phyint->pi_group != phyint_anongroup);
2039*0Sstevel@tonic-gate 
2040*0Sstevel@tonic-gate 	af = desired_pii->pii_af;
2041*0Sstevel@tonic-gate 
2042*0Sstevel@tonic-gate 	/*
2043*0Sstevel@tonic-gate 	 * For every phyint in the same group as desired_pii, check if
2044*0Sstevel@tonic-gate 	 * it has any host targets. If so add them to desired_pii.
2045*0Sstevel@tonic-gate 	 */
2046*0Sstevel@tonic-gate 	for (pi = desired_pii->pii_phyint; pi != NULL; pi = pi->pi_pgnext) {
2047*0Sstevel@tonic-gate 		pii = PHYINT_INSTANCE(pi, af);
2048*0Sstevel@tonic-gate 		/*
2049*0Sstevel@tonic-gate 		 * We know that we don't have targets on this phyint instance
2050*0Sstevel@tonic-gate 		 * since we have been called. But we still check for
2051*0Sstevel@tonic-gate 		 * pii_targets_are_routers because another phyint instance
2052*0Sstevel@tonic-gate 		 * could have router targets, since IFF_NOFAILOVER addresses
2053*0Sstevel@tonic-gate 		 * on different phyint instances may belong to different
2054*0Sstevel@tonic-gate 		 * subnets.
2055*0Sstevel@tonic-gate 		 */
2056*0Sstevel@tonic-gate 		if ((pii == NULL) || (pii == desired_pii) ||
2057*0Sstevel@tonic-gate 		    pii->pii_targets_are_routers)
2058*0Sstevel@tonic-gate 			continue;
2059*0Sstevel@tonic-gate 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
2060*0Sstevel@tonic-gate 			target_create(desired_pii, tg->tg_address, _B_FALSE);
2061*0Sstevel@tonic-gate 		}
2062*0Sstevel@tonic-gate 	}
2063*0Sstevel@tonic-gate }
2064*0Sstevel@tonic-gate 
/*
 * Print a one-line usage message for the command 'cmd' on stderr.
 */
static void
usage(char *cmd)
{
	(void) fprintf(stderr, "usage: %s\n", cmd);
}
2070*0Sstevel@tonic-gate 
2071*0Sstevel@tonic-gate 
2072*0Sstevel@tonic-gate #define	MPATHD_DEFAULT_FILE	"/etc/default/mpathd"
2073*0Sstevel@tonic-gate 
2074*0Sstevel@tonic-gate /* Get an option from the /etc/default/mpathd file */
2075*0Sstevel@tonic-gate static char *
2076*0Sstevel@tonic-gate getdefault(char *name)
2077*0Sstevel@tonic-gate {
2078*0Sstevel@tonic-gate 	char namebuf[BUFSIZ];
2079*0Sstevel@tonic-gate 	char *value = NULL;
2080*0Sstevel@tonic-gate 
2081*0Sstevel@tonic-gate 	if (defopen(MPATHD_DEFAULT_FILE) == 0) {
2082*0Sstevel@tonic-gate 		char	*cp;
2083*0Sstevel@tonic-gate 		int	flags;
2084*0Sstevel@tonic-gate 
2085*0Sstevel@tonic-gate 		/*
2086*0Sstevel@tonic-gate 		 * ignore case
2087*0Sstevel@tonic-gate 		 */
2088*0Sstevel@tonic-gate 		flags = defcntl(DC_GETFLAGS, 0);
2089*0Sstevel@tonic-gate 		TURNOFF(flags, DC_CASE);
2090*0Sstevel@tonic-gate 		(void) defcntl(DC_SETFLAGS, flags);
2091*0Sstevel@tonic-gate 
2092*0Sstevel@tonic-gate 		/* Add "=" to the name */
2093*0Sstevel@tonic-gate 		(void) strncpy(namebuf, name, sizeof (namebuf) - 2);
2094*0Sstevel@tonic-gate 		(void) strncat(namebuf, "=", 2);
2095*0Sstevel@tonic-gate 
2096*0Sstevel@tonic-gate 		if ((cp = defread(namebuf)) != NULL)
2097*0Sstevel@tonic-gate 			value = strdup(cp);
2098*0Sstevel@tonic-gate 
2099*0Sstevel@tonic-gate 		/* close */
2100*0Sstevel@tonic-gate 		(void) defopen((char *)NULL);
2101*0Sstevel@tonic-gate 	}
2102*0Sstevel@tonic-gate 	return (value);
2103*0Sstevel@tonic-gate }
2104*0Sstevel@tonic-gate 
2105*0Sstevel@tonic-gate 
/*
 * Run-time options below. failback_enabled and track_all_phyints are
 * set by main() from /etc/default/mpathd; adopt and foreground are set
 * by main() from the command line.
 */
boolean_t	failback_enabled = _B_TRUE;	/* failback enabled/disabled */
boolean_t	track_all_phyints = _B_FALSE;	/* option to track all NICs */
static boolean_t adopt = _B_FALSE;	/* -a: exit if nothing to track */
static boolean_t foreground = _B_FALSE;	/* -d/-D: do not daemonize */
2113*0Sstevel@tonic-gate 
2114*0Sstevel@tonic-gate int
2115*0Sstevel@tonic-gate main(int argc, char *argv[])
2116*0Sstevel@tonic-gate {
2117*0Sstevel@tonic-gate 	int i;
2118*0Sstevel@tonic-gate 	int c;
2119*0Sstevel@tonic-gate 	struct phyint_instance *pii;
2120*0Sstevel@tonic-gate 	char *value;
2121*0Sstevel@tonic-gate 
2122*0Sstevel@tonic-gate 	argv0 = argv;		/* Saved for re-exec on SIGHUP */
2123*0Sstevel@tonic-gate 	srandom(gethostid());	/* Initialize the random number generator */
2124*0Sstevel@tonic-gate 
2125*0Sstevel@tonic-gate 	/*
2126*0Sstevel@tonic-gate 	 * NOTE: The messages output by in.mpathd are not suitable for
2127*0Sstevel@tonic-gate 	 * translation, so we do not call textdomain().
2128*0Sstevel@tonic-gate 	 */
2129*0Sstevel@tonic-gate 	(void) setlocale(LC_ALL, "");
2130*0Sstevel@tonic-gate 
2131*0Sstevel@tonic-gate 	/*
2132*0Sstevel@tonic-gate 	 * Get the user specified value of 'failure detection time'
2133*0Sstevel@tonic-gate 	 * from /etc/default/mpathd
2134*0Sstevel@tonic-gate 	 */
2135*0Sstevel@tonic-gate 	value = getdefault("FAILURE_DETECTION_TIME");
2136*0Sstevel@tonic-gate 	if (value != NULL) {
2137*0Sstevel@tonic-gate 		user_failure_detection_time =
2138*0Sstevel@tonic-gate 		    (int)strtol((char *)value, NULL, 0);
2139*0Sstevel@tonic-gate 
2140*0Sstevel@tonic-gate 		if (user_failure_detection_time <= 0) {
2141*0Sstevel@tonic-gate 			user_failure_detection_time = FAILURE_DETECTION_TIME;
2142*0Sstevel@tonic-gate 			logerr("Invalid failure detection time %s, assuming "
2143*0Sstevel@tonic-gate 			    "default %d\n", value, user_failure_detection_time);
2144*0Sstevel@tonic-gate 
2145*0Sstevel@tonic-gate 		} else if (user_failure_detection_time <
2146*0Sstevel@tonic-gate 		    MIN_FAILURE_DETECTION_TIME) {
2147*0Sstevel@tonic-gate 			user_failure_detection_time =
2148*0Sstevel@tonic-gate 			    MIN_FAILURE_DETECTION_TIME;
2149*0Sstevel@tonic-gate 			logerr("Too small failure detection time of %s, "
2150*0Sstevel@tonic-gate 			    "assuming minimum %d\n", value,
2151*0Sstevel@tonic-gate 			    user_failure_detection_time);
2152*0Sstevel@tonic-gate 		}
2153*0Sstevel@tonic-gate 		free(value);
2154*0Sstevel@tonic-gate 	} else {
2155*0Sstevel@tonic-gate 		/* User has not specified the parameter, Use default value */
2156*0Sstevel@tonic-gate 		user_failure_detection_time = FAILURE_DETECTION_TIME;
2157*0Sstevel@tonic-gate 	}
2158*0Sstevel@tonic-gate 
2159*0Sstevel@tonic-gate 	/*
2160*0Sstevel@tonic-gate 	 * This gives the frequency at which probes will be sent.
2161*0Sstevel@tonic-gate 	 * When fdt ms elapses, we should be able to determine
2162*0Sstevel@tonic-gate 	 * whether 5 consecutive probes have failed or not.
2163*0Sstevel@tonic-gate 	 * 1 probe will be sent in every user_probe_interval ms,
2164*0Sstevel@tonic-gate 	 * randomly anytime in the (0.5  - 1.0) 2nd half of every
2165*0Sstevel@tonic-gate 	 * user_probe_interval. Thus when we send out probe 'n' we
2166*0Sstevel@tonic-gate 	 * can be sure that probe 'n - 2' is lost, if we have not
2167*0Sstevel@tonic-gate 	 * got the ack. (since the probe interval is > crtt). But
2168*0Sstevel@tonic-gate 	 * probe 'n - 1' may be a valid unacked probe, since the
2169*0Sstevel@tonic-gate 	 * time between 2 successive probes could be as small as
2170*0Sstevel@tonic-gate 	 * 0.5 * user_probe_interval.  Hence the NUM_PROBE_FAILS + 2
2171*0Sstevel@tonic-gate 	 */
2172*0Sstevel@tonic-gate 	user_probe_interval = user_failure_detection_time /
2173*0Sstevel@tonic-gate 	    (NUM_PROBE_FAILS + 2);
2174*0Sstevel@tonic-gate 
2175*0Sstevel@tonic-gate 	/*
2176*0Sstevel@tonic-gate 	 * Get the user specified value of failback_enabled from
2177*0Sstevel@tonic-gate 	 * /etc/default/mpathd
2178*0Sstevel@tonic-gate 	 */
2179*0Sstevel@tonic-gate 	value = getdefault("FAILBACK");
2180*0Sstevel@tonic-gate 	if (value != NULL) {
2181*0Sstevel@tonic-gate 		if (strncasecmp(value, "yes", 3) == 0)
2182*0Sstevel@tonic-gate 			failback_enabled = _B_TRUE;
2183*0Sstevel@tonic-gate 		else if (strncasecmp(value, "no", 2) == 0)
2184*0Sstevel@tonic-gate 			failback_enabled = _B_FALSE;
2185*0Sstevel@tonic-gate 		else
2186*0Sstevel@tonic-gate 			logerr("Invalid value for FAILBACK %s\n", value);
2187*0Sstevel@tonic-gate 		free(value);
2188*0Sstevel@tonic-gate 	} else {
2189*0Sstevel@tonic-gate 		failback_enabled = _B_TRUE;
2190*0Sstevel@tonic-gate 	}
2191*0Sstevel@tonic-gate 
2192*0Sstevel@tonic-gate 	/*
2193*0Sstevel@tonic-gate 	 * Get the user specified value of track_all_phyints from
2194*0Sstevel@tonic-gate 	 * /etc/default/mpathd. The sense is reversed in
2195*0Sstevel@tonic-gate 	 * TRACK_INTERFACES_ONLY_WITH_GROUPS.
2196*0Sstevel@tonic-gate 	 */
2197*0Sstevel@tonic-gate 	value = getdefault("TRACK_INTERFACES_ONLY_WITH_GROUPS");
2198*0Sstevel@tonic-gate 	if (value != NULL) {
2199*0Sstevel@tonic-gate 		if (strncasecmp(value, "yes", 3) == 0)
2200*0Sstevel@tonic-gate 			track_all_phyints = _B_FALSE;
2201*0Sstevel@tonic-gate 		else if (strncasecmp(value, "no", 2) == 0)
2202*0Sstevel@tonic-gate 			track_all_phyints = _B_TRUE;
2203*0Sstevel@tonic-gate 		else
2204*0Sstevel@tonic-gate 			logerr("Invalid value for "
2205*0Sstevel@tonic-gate 			    "TRACK_INTERFACES_ONLY_WITH_GROUPS %s\n", value);
2206*0Sstevel@tonic-gate 		free(value);
2207*0Sstevel@tonic-gate 	} else {
2208*0Sstevel@tonic-gate 		track_all_phyints = _B_FALSE;
2209*0Sstevel@tonic-gate 	}
2210*0Sstevel@tonic-gate 
2211*0Sstevel@tonic-gate 	while ((c = getopt(argc, argv, "adD:ml")) != EOF) {
2212*0Sstevel@tonic-gate 		switch (c) {
2213*0Sstevel@tonic-gate 		case 'a':
2214*0Sstevel@tonic-gate 			adopt = _B_TRUE;
2215*0Sstevel@tonic-gate 			break;
2216*0Sstevel@tonic-gate 		case 'm':
2217*0Sstevel@tonic-gate 			force_mcast = _B_TRUE;
2218*0Sstevel@tonic-gate 			break;
2219*0Sstevel@tonic-gate 		case 'd':
2220*0Sstevel@tonic-gate 			debug = D_ALL;
2221*0Sstevel@tonic-gate 			foreground = _B_TRUE;
2222*0Sstevel@tonic-gate 			break;
2223*0Sstevel@tonic-gate 		case 'D':
2224*0Sstevel@tonic-gate 			i = (int)strtol(optarg, NULL, 0);
2225*0Sstevel@tonic-gate 			if (i == 0) {
2226*0Sstevel@tonic-gate 				(void) fprintf(stderr, "Bad debug flags: %s\n",
2227*0Sstevel@tonic-gate 				    optarg);
2228*0Sstevel@tonic-gate 				exit(1);
2229*0Sstevel@tonic-gate 			}
2230*0Sstevel@tonic-gate 			debug |= i;
2231*0Sstevel@tonic-gate 			foreground = _B_TRUE;
2232*0Sstevel@tonic-gate 			break;
2233*0Sstevel@tonic-gate 		case 'l':
2234*0Sstevel@tonic-gate 			/*
2235*0Sstevel@tonic-gate 			 * Turn off link state notification handling.
2236*0Sstevel@tonic-gate 			 * Undocumented command line flag, for debugging
2237*0Sstevel@tonic-gate 			 * purposes.
2238*0Sstevel@tonic-gate 			 */
2239*0Sstevel@tonic-gate 			handle_link_notifications = _B_FALSE;
2240*0Sstevel@tonic-gate 			break;
2241*0Sstevel@tonic-gate 		default:
2242*0Sstevel@tonic-gate 			usage(argv[0]);
2243*0Sstevel@tonic-gate 			exit(1);
2244*0Sstevel@tonic-gate 		}
2245*0Sstevel@tonic-gate 	}
2246*0Sstevel@tonic-gate 
2247*0Sstevel@tonic-gate 	/*
2248*0Sstevel@tonic-gate 	 * The sockets for the loopback command interface should be listening
2249*0Sstevel@tonic-gate 	 * before we fork and exit in daemonize(). This way, whoever started us
2250*0Sstevel@tonic-gate 	 * can use the loopback interface as soon as they get a zero exit
2251*0Sstevel@tonic-gate 	 * status.
2252*0Sstevel@tonic-gate 	 */
2253*0Sstevel@tonic-gate 	lsock_v4 = setup_listener(AF_INET);
2254*0Sstevel@tonic-gate 	lsock_v6 = setup_listener(AF_INET6);
2255*0Sstevel@tonic-gate 
2256*0Sstevel@tonic-gate 	if (lsock_v4 < 0 && lsock_v6 < 0) {
2257*0Sstevel@tonic-gate 		logerr("main: setup_listener failed for both IPv4 and IPv6\n");
2258*0Sstevel@tonic-gate 		exit(1);
2259*0Sstevel@tonic-gate 	}
2260*0Sstevel@tonic-gate 
2261*0Sstevel@tonic-gate 	if (!foreground) {
2262*0Sstevel@tonic-gate 		if (!daemonize()) {
2263*0Sstevel@tonic-gate 			logerr("cannot daemonize\n");
2264*0Sstevel@tonic-gate 			exit(EXIT_FAILURE);
2265*0Sstevel@tonic-gate 		}
2266*0Sstevel@tonic-gate 		initlog();
2267*0Sstevel@tonic-gate 	}
2268*0Sstevel@tonic-gate 
2269*0Sstevel@tonic-gate 	/*
2270*0Sstevel@tonic-gate 	 * Initializations:
2271*0Sstevel@tonic-gate 	 * 1. Create ifsock* sockets. These are used for performing SIOC*
2272*0Sstevel@tonic-gate 	 *    ioctls. We have 2 sockets 1 each for IPv4 and IPv6.
2273*0Sstevel@tonic-gate 	 * 2. Initialize a pipe for handling/recording signal events.
2274*0Sstevel@tonic-gate 	 * 3. Create the routing sockets,  used for listening
2275*0Sstevel@tonic-gate 	 *    to routing / interface changes.
2276*0Sstevel@tonic-gate 	 * 4. phyint_init() - Initialize physical interface state
2277*0Sstevel@tonic-gate 	 *    (in mpd_tables.c).  Must be done before creating interfaces,
2278*0Sstevel@tonic-gate 	 *    which timer_init() does indirectly.
2279*0Sstevel@tonic-gate 	 * 5. timer_init()  - Initialize timer related stuff
2280*0Sstevel@tonic-gate 	 * 6. initifs() - Initialize our database of all known interfaces
2281*0Sstevel@tonic-gate 	 * 7. init_router_targets() - Initialize our database of all known
2282*0Sstevel@tonic-gate 	 *    router targets.
2283*0Sstevel@tonic-gate 	 */
2284*0Sstevel@tonic-gate 	ifsock_v4 = socket(AF_INET, SOCK_DGRAM, 0);
2285*0Sstevel@tonic-gate 	if (ifsock_v4 < 0) {
2286*0Sstevel@tonic-gate 		logperror("main: IPv4 socket open");
2287*0Sstevel@tonic-gate 		exit(1);
2288*0Sstevel@tonic-gate 	}
2289*0Sstevel@tonic-gate 
2290*0Sstevel@tonic-gate 	ifsock_v6 = socket(AF_INET6, SOCK_DGRAM, 0);
2291*0Sstevel@tonic-gate 	if (ifsock_v6 < 0) {
2292*0Sstevel@tonic-gate 		logperror("main: IPv6 socket open");
2293*0Sstevel@tonic-gate 		exit(1);
2294*0Sstevel@tonic-gate 	}
2295*0Sstevel@tonic-gate 
2296*0Sstevel@tonic-gate 	setup_eventpipe();
2297*0Sstevel@tonic-gate 
2298*0Sstevel@tonic-gate 	rtsock_v4 = setup_rtsock(AF_INET);
2299*0Sstevel@tonic-gate 	rtsock_v6 = setup_rtsock(AF_INET6);
2300*0Sstevel@tonic-gate 
2301*0Sstevel@tonic-gate 	if (phyint_init() == -1) {
2302*0Sstevel@tonic-gate 		logerr("cannot initialize physical interface structures");
2303*0Sstevel@tonic-gate 		exit(1);
2304*0Sstevel@tonic-gate 	}
2305*0Sstevel@tonic-gate 
2306*0Sstevel@tonic-gate 	timer_init();
2307*0Sstevel@tonic-gate 
2308*0Sstevel@tonic-gate 	initifs();
2309*0Sstevel@tonic-gate 
2310*0Sstevel@tonic-gate 	/*
2311*0Sstevel@tonic-gate 	 * If we're operating in "adopt" mode and no interfaces need to be
2312*0Sstevel@tonic-gate 	 * tracked, shut down (ifconfig(1M) will restart us on demand if
2313*0Sstevel@tonic-gate 	 * interfaces are subsequently put into multipathing groups).
2314*0Sstevel@tonic-gate 	 */
2315*0Sstevel@tonic-gate 	if (adopt && phyint_instances == NULL)
2316*0Sstevel@tonic-gate 		exit(0);
2317*0Sstevel@tonic-gate 
2318*0Sstevel@tonic-gate 	/*
2319*0Sstevel@tonic-gate 	 * Main body. Keep listening for activity on any of the sockets
2320*0Sstevel@tonic-gate 	 * that we are monitoring and take appropriate action as necessary.
2321*0Sstevel@tonic-gate 	 * signals are also handled synchronously.
2322*0Sstevel@tonic-gate 	 */
2323*0Sstevel@tonic-gate 	for (;;) {
2324*0Sstevel@tonic-gate 		if (poll(pollfds, pollfd_num, -1) < 0) {
2325*0Sstevel@tonic-gate 			if (errno == EINTR)
2326*0Sstevel@tonic-gate 				continue;
2327*0Sstevel@tonic-gate 			logperror("main: poll");
2328*0Sstevel@tonic-gate 			exit(1);
2329*0Sstevel@tonic-gate 		}
2330*0Sstevel@tonic-gate 		for (i = 0; i < pollfd_num; i++) {
2331*0Sstevel@tonic-gate 			if ((pollfds[i].fd == -1) ||
2332*0Sstevel@tonic-gate 			    !(pollfds[i].revents & POLLIN))
2333*0Sstevel@tonic-gate 				continue;
2334*0Sstevel@tonic-gate 			if (pollfds[i].fd == eventpipe_read) {
2335*0Sstevel@tonic-gate 				in_signal(eventpipe_read);
2336*0Sstevel@tonic-gate 				break;
2337*0Sstevel@tonic-gate 			}
2338*0Sstevel@tonic-gate 			if (pollfds[i].fd == rtsock_v4 ||
2339*0Sstevel@tonic-gate 				pollfds[i].fd == rtsock_v6) {
2340*0Sstevel@tonic-gate 				process_rtsock(rtsock_v4, rtsock_v6);
2341*0Sstevel@tonic-gate 				break;
2342*0Sstevel@tonic-gate 			}
2343*0Sstevel@tonic-gate 			for (pii = phyint_instances; pii != NULL;
2344*0Sstevel@tonic-gate 			    pii = pii->pii_next) {
2345*0Sstevel@tonic-gate 				if (pollfds[i].fd == pii->pii_probe_sock) {
2346*0Sstevel@tonic-gate 					if (pii->pii_af == AF_INET)
2347*0Sstevel@tonic-gate 						in_data(pii);
2348*0Sstevel@tonic-gate 					else
2349*0Sstevel@tonic-gate 						in6_data(pii);
2350*0Sstevel@tonic-gate 					break;
2351*0Sstevel@tonic-gate 				}
2352*0Sstevel@tonic-gate 			}
2353*0Sstevel@tonic-gate 			if (pollfds[i].fd == lsock_v4)
2354*0Sstevel@tonic-gate 				loopback_cmd(lsock_v4, AF_INET);
2355*0Sstevel@tonic-gate 			else if (pollfds[i].fd == lsock_v6)
2356*0Sstevel@tonic-gate 				loopback_cmd(lsock_v6, AF_INET6);
2357*0Sstevel@tonic-gate 		}
2358*0Sstevel@tonic-gate 		if (full_scan_required) {
2359*0Sstevel@tonic-gate 			initifs();
2360*0Sstevel@tonic-gate 			full_scan_required = _B_FALSE;
2361*0Sstevel@tonic-gate 		}
2362*0Sstevel@tonic-gate 	}
2363*0Sstevel@tonic-gate 	/* NOTREACHED */
2364*0Sstevel@tonic-gate 	return (EXIT_SUCCESS);
2365*0Sstevel@tonic-gate }
2366*0Sstevel@tonic-gate 
2367*0Sstevel@tonic-gate static int
2368*0Sstevel@tonic-gate setup_listener(int af)
2369*0Sstevel@tonic-gate {
2370*0Sstevel@tonic-gate 	int sock;
2371*0Sstevel@tonic-gate 	int on;
2372*0Sstevel@tonic-gate 	int len;
2373*0Sstevel@tonic-gate 	int ret;
2374*0Sstevel@tonic-gate 	struct sockaddr_storage laddr;
2375*0Sstevel@tonic-gate 	struct sockaddr_in  *sin;
2376*0Sstevel@tonic-gate 	struct sockaddr_in6 *sin6;
2377*0Sstevel@tonic-gate 	struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
2378*0Sstevel@tonic-gate 
2379*0Sstevel@tonic-gate 	assert(af == AF_INET || af == AF_INET6);
2380*0Sstevel@tonic-gate 
2381*0Sstevel@tonic-gate 	sock = socket(af, SOCK_STREAM, 0);
2382*0Sstevel@tonic-gate 	if (sock < 0) {
2383*0Sstevel@tonic-gate 		logperror("setup_listener: socket");
2384*0Sstevel@tonic-gate 		exit(1);
2385*0Sstevel@tonic-gate 	}
2386*0Sstevel@tonic-gate 
2387*0Sstevel@tonic-gate 	on = 1;
2388*0Sstevel@tonic-gate 	if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&on,
2389*0Sstevel@tonic-gate 	    sizeof (on)) < 0) {
2390*0Sstevel@tonic-gate 		logperror("setup_listener: setsockopt (SO_REUSEADDR)");
2391*0Sstevel@tonic-gate 		exit(1);
2392*0Sstevel@tonic-gate 	}
2393*0Sstevel@tonic-gate 
2394*0Sstevel@tonic-gate 	bzero(&laddr, sizeof (laddr));
2395*0Sstevel@tonic-gate 	laddr.ss_family = af;
2396*0Sstevel@tonic-gate 
2397*0Sstevel@tonic-gate 	if (af == AF_INET) {
2398*0Sstevel@tonic-gate 		sin = (struct sockaddr_in *)&laddr;
2399*0Sstevel@tonic-gate 		sin->sin_port = htons(MPATHD_PORT);
2400*0Sstevel@tonic-gate 		sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
2401*0Sstevel@tonic-gate 		len = sizeof (struct sockaddr_in);
2402*0Sstevel@tonic-gate 	} else {
2403*0Sstevel@tonic-gate 		sin6 = (struct sockaddr_in6 *)&laddr;
2404*0Sstevel@tonic-gate 		sin6->sin6_port = htons(MPATHD_PORT);
2405*0Sstevel@tonic-gate 		sin6->sin6_addr = loopback_addr;
2406*0Sstevel@tonic-gate 		len = sizeof (struct sockaddr_in6);
2407*0Sstevel@tonic-gate 	}
2408*0Sstevel@tonic-gate 
2409*0Sstevel@tonic-gate 	ret = bind(sock, (struct sockaddr *)&laddr, len);
2410*0Sstevel@tonic-gate 	if (ret < 0) {
2411*0Sstevel@tonic-gate 		if (errno == EADDRINUSE) {
2412*0Sstevel@tonic-gate 			/*
2413*0Sstevel@tonic-gate 			 * Another instance of mpathd may be already active.
2414*0Sstevel@tonic-gate 			 */
2415*0Sstevel@tonic-gate 			logerr("main: is another instance of in.mpathd "
2416*0Sstevel@tonic-gate 			    "already active?\n");
2417*0Sstevel@tonic-gate 			exit(1);
2418*0Sstevel@tonic-gate 		} else {
2419*0Sstevel@tonic-gate 			(void) close(sock);
2420*0Sstevel@tonic-gate 			return (-1);
2421*0Sstevel@tonic-gate 		}
2422*0Sstevel@tonic-gate 	}
2423*0Sstevel@tonic-gate 	if (listen(sock, 30) < 0) {
2424*0Sstevel@tonic-gate 		logperror("main: listen");
2425*0Sstevel@tonic-gate 		exit(1);
2426*0Sstevel@tonic-gate 	}
2427*0Sstevel@tonic-gate 	if (poll_add(sock) == -1) {
2428*0Sstevel@tonic-gate 		(void) close(sock);
2429*0Sstevel@tonic-gate 		exit(1);
2430*0Sstevel@tonic-gate 	}
2431*0Sstevel@tonic-gate 
2432*0Sstevel@tonic-gate 	return (sock);
2433*0Sstevel@tonic-gate }
2434*0Sstevel@tonic-gate 
/*
 * Table of commands and their expected size; used by loopback_cmd().
 * NOTE(review): presumably indexed by command number, in which case the
 * order of the entries matters -- confirm against loopback_cmd() and the
 * MI_* definitions.
 */
static struct {
	const char	*name;		/* command name */
	unsigned int	size;		/* expected size in bytes */
} commands[] = {
	{ "MI_PING",		sizeof (uint32_t)	},
	{ "MI_OFFLINE",		sizeof (mi_offline_t)	},
	{ "MI_UNDO_OFFLINE",	sizeof (mi_undo_offline_t) },
	{ "MI_SETOINDEX",	sizeof (mi_setoindex_t) },
	{ "MI_QUERY",		sizeof (mi_query_t)	}
};
2448*0Sstevel@tonic-gate 
2449*0Sstevel@tonic-gate /*
2450*0Sstevel@tonic-gate  * Commands received over the loopback interface come here. Currently
2451*0Sstevel@tonic-gate  * the agents that send commands are ifconfig, if_mpadm and the RCM IPMP
2452*0Sstevel@tonic-gate  * module. ifconfig only makes a connection, and closes it to check if
2453*0Sstevel@tonic-gate  * in.mpathd is running.
2454*0Sstevel@tonic-gate  * if_mpadm sends commands in the format specified by the mpathd_interface
2455*0Sstevel@tonic-gate  * structure.
2456*0Sstevel@tonic-gate  */
2457*0Sstevel@tonic-gate static void
2458*0Sstevel@tonic-gate loopback_cmd(int sock, int family)
2459*0Sstevel@tonic-gate {
2460*0Sstevel@tonic-gate 	int newfd;
2461*0Sstevel@tonic-gate 	ssize_t len;
2462*0Sstevel@tonic-gate 	struct sockaddr_storage	peer;
2463*0Sstevel@tonic-gate 	struct sockaddr_in	*peer_sin;
2464*0Sstevel@tonic-gate 	struct sockaddr_in6	*peer_sin6;
2465*0Sstevel@tonic-gate 	socklen_t peerlen;
2466*0Sstevel@tonic-gate 	union mi_commands mpi;
2467*0Sstevel@tonic-gate 	struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
2468*0Sstevel@tonic-gate 	char abuf[INET6_ADDRSTRLEN];
2469*0Sstevel@tonic-gate 	uint_t cmd;
2470*0Sstevel@tonic-gate 	int retval;
2471*0Sstevel@tonic-gate 
2472*0Sstevel@tonic-gate 	peerlen = sizeof (peer);
2473*0Sstevel@tonic-gate 	newfd = accept(sock, (struct sockaddr *)&peer, &peerlen);
2474*0Sstevel@tonic-gate 	if (newfd < 0) {
2475*0Sstevel@tonic-gate 		logperror("loopback_cmd: accept");
2476*0Sstevel@tonic-gate 		return;
2477*0Sstevel@tonic-gate 	}
2478*0Sstevel@tonic-gate 
2479*0Sstevel@tonic-gate 	switch (family) {
2480*0Sstevel@tonic-gate 	case AF_INET:
2481*0Sstevel@tonic-gate 		/*
2482*0Sstevel@tonic-gate 		 * Validate the address and port to make sure that
2483*0Sstevel@tonic-gate 		 * non privileged processes don't connect and start
2484*0Sstevel@tonic-gate 		 * talking to us.
2485*0Sstevel@tonic-gate 		 */
2486*0Sstevel@tonic-gate 		if (peerlen != sizeof (struct sockaddr_in)) {
2487*0Sstevel@tonic-gate 			logerr("loopback_cmd: AF_INET peerlen %d\n", peerlen);
2488*0Sstevel@tonic-gate 			(void) close(newfd);
2489*0Sstevel@tonic-gate 			return;
2490*0Sstevel@tonic-gate 		}
2491*0Sstevel@tonic-gate 		peer_sin = (struct sockaddr_in *)&peer;
2492*0Sstevel@tonic-gate 		if ((ntohs(peer_sin->sin_port) >= IPPORT_RESERVED) ||
2493*0Sstevel@tonic-gate 		    (ntohl(peer_sin->sin_addr.s_addr) != INADDR_LOOPBACK)) {
2494*0Sstevel@tonic-gate 			(void) inet_ntop(AF_INET, &peer_sin->sin_addr.s_addr,
2495*0Sstevel@tonic-gate 			    abuf, sizeof (abuf));
2496*0Sstevel@tonic-gate 			logerr("Attempt to connect from addr %s port %d\n",
2497*0Sstevel@tonic-gate 			    abuf, ntohs(peer_sin->sin_port));
2498*0Sstevel@tonic-gate 			(void) close(newfd);
2499*0Sstevel@tonic-gate 			return;
2500*0Sstevel@tonic-gate 		}
2501*0Sstevel@tonic-gate 		break;
2502*0Sstevel@tonic-gate 
2503*0Sstevel@tonic-gate 	case AF_INET6:
2504*0Sstevel@tonic-gate 		if (peerlen != sizeof (struct sockaddr_in6)) {
2505*0Sstevel@tonic-gate 			logerr("loopback_cmd: AF_INET6 peerlen %d\n", peerlen);
2506*0Sstevel@tonic-gate 			(void) close(newfd);
2507*0Sstevel@tonic-gate 			return;
2508*0Sstevel@tonic-gate 		}
2509*0Sstevel@tonic-gate 		/*
2510*0Sstevel@tonic-gate 		 * Validate the address and port to make sure that
2511*0Sstevel@tonic-gate 		 * non privileged processes don't connect and start
2512*0Sstevel@tonic-gate 		 * talking to us.
2513*0Sstevel@tonic-gate 		 */
2514*0Sstevel@tonic-gate 		peer_sin6 = (struct sockaddr_in6 *)&peer;
2515*0Sstevel@tonic-gate 		if ((ntohs(peer_sin6->sin6_port) >= IPPORT_RESERVED) ||
2516*0Sstevel@tonic-gate 		    (!IN6_ARE_ADDR_EQUAL(&peer_sin6->sin6_addr,
2517*0Sstevel@tonic-gate 		    &loopback_addr))) {
2518*0Sstevel@tonic-gate 			(void) inet_ntop(AF_INET6, &peer_sin6->sin6_addr, abuf,
2519*0Sstevel@tonic-gate 			    sizeof (abuf));
2520*0Sstevel@tonic-gate 			logerr("Attempt to connect from addr %s port %d\n",
2521*0Sstevel@tonic-gate 			    abuf, ntohs(peer_sin6->sin6_port));
2522*0Sstevel@tonic-gate 			(void) close(newfd);
2523*0Sstevel@tonic-gate 			return;
2524*0Sstevel@tonic-gate 		}
2525*0Sstevel@tonic-gate 
2526*0Sstevel@tonic-gate 	default:
2527*0Sstevel@tonic-gate 		logdebug("loopback_cmd: family %d\n", family);
2528*0Sstevel@tonic-gate 		(void) close(newfd);
2529*0Sstevel@tonic-gate 		return;
2530*0Sstevel@tonic-gate 	}
2531*0Sstevel@tonic-gate 
2532*0Sstevel@tonic-gate 	/*
2533*0Sstevel@tonic-gate 	 * The sizeof the 'mpi' buffer corresponds to the maximum size of
2534*0Sstevel@tonic-gate 	 * all supported commands
2535*0Sstevel@tonic-gate 	 */
2536*0Sstevel@tonic-gate 	len = read(newfd, &mpi, sizeof (mpi));
2537*0Sstevel@tonic-gate 
2538*0Sstevel@tonic-gate 	/*
2539*0Sstevel@tonic-gate 	 * ifconfig does not send any data. Just tests to see if mpathd
2540*0Sstevel@tonic-gate 	 * is already running.
2541*0Sstevel@tonic-gate 	 */
2542*0Sstevel@tonic-gate 	if (len <= 0) {
2543*0Sstevel@tonic-gate 		(void) close(newfd);
2544*0Sstevel@tonic-gate 		return;
2545*0Sstevel@tonic-gate 	}
2546*0Sstevel@tonic-gate 
2547*0Sstevel@tonic-gate 	/*
2548*0Sstevel@tonic-gate 	 * In theory, we can receive any sized message for a stream socket,
2549*0Sstevel@tonic-gate 	 * but we don't expect that to happen for a small message over a
2550*0Sstevel@tonic-gate 	 * loopback connection.
2551*0Sstevel@tonic-gate 	 */
2552*0Sstevel@tonic-gate 	if (len < sizeof (uint32_t)) {
2553*0Sstevel@tonic-gate 		logerr("loopback_cmd: bad command format or read returns "
2554*0Sstevel@tonic-gate 		    "partial data %d\n", len);
2555*0Sstevel@tonic-gate 	}
2556*0Sstevel@tonic-gate 
2557*0Sstevel@tonic-gate 	cmd = mpi.mi_command;
2558*0Sstevel@tonic-gate 	if (cmd >= MI_NCMD) {
2559*0Sstevel@tonic-gate 		logerr("loopback_cmd: unknown command id `%d'\n", cmd);
2560*0Sstevel@tonic-gate 		(void) close(newfd);
2561*0Sstevel@tonic-gate 		return;
2562*0Sstevel@tonic-gate 	}
2563*0Sstevel@tonic-gate 
2564*0Sstevel@tonic-gate 	if (len < commands[cmd].size) {
2565*0Sstevel@tonic-gate 		logerr("loopback_cmd: short %s command (expected %d, got %d)\n",
2566*0Sstevel@tonic-gate 		    commands[cmd].name, commands[cmd].size, len);
2567*0Sstevel@tonic-gate 		(void) close(newfd);
2568*0Sstevel@tonic-gate 		return;
2569*0Sstevel@tonic-gate 	}
2570*0Sstevel@tonic-gate 
2571*0Sstevel@tonic-gate 	retval = process_cmd(newfd, &mpi);
2572*0Sstevel@tonic-gate 	if (retval != IPMP_SUCCESS) {
2573*0Sstevel@tonic-gate 		logerr("failed processing %s: %s\n", commands[cmd].name,
2574*0Sstevel@tonic-gate 		    ipmp_errmsg(retval));
2575*0Sstevel@tonic-gate 	}
2576*0Sstevel@tonic-gate 	(void) close(newfd);
2577*0Sstevel@tonic-gate }
2578*0Sstevel@tonic-gate 
2579*0Sstevel@tonic-gate extern int global_errno;	/* set by failover() or failback() */
2580*0Sstevel@tonic-gate 
2581*0Sstevel@tonic-gate /*
2582*0Sstevel@tonic-gate  * Process the offline, undo offline and set original index commands,
2583*0Sstevel@tonic-gate  * received from if_mpadm(1M)
2584*0Sstevel@tonic-gate  */
2585*0Sstevel@tonic-gate static unsigned int
2586*0Sstevel@tonic-gate process_cmd(int newfd, union mi_commands *mpi)
2587*0Sstevel@tonic-gate {
2588*0Sstevel@tonic-gate 	uint_t	nif = 0;
2589*0Sstevel@tonic-gate 	uint32_t cmd;
2590*0Sstevel@tonic-gate 	struct phyint *pi;
2591*0Sstevel@tonic-gate 	struct phyint *pi2;
2592*0Sstevel@tonic-gate 	struct phyint_group *pg;
2593*0Sstevel@tonic-gate 	boolean_t success;
2594*0Sstevel@tonic-gate 	int error;
2595*0Sstevel@tonic-gate 	struct mi_offline *mio;
2596*0Sstevel@tonic-gate 	struct mi_undo_offline *miu;
2597*0Sstevel@tonic-gate 	struct lifreq lifr;
2598*0Sstevel@tonic-gate 	int ifsock;
2599*0Sstevel@tonic-gate 	struct mi_setoindex *mis;
2600*0Sstevel@tonic-gate 
2601*0Sstevel@tonic-gate 	cmd = mpi->mi_command;
2602*0Sstevel@tonic-gate 
2603*0Sstevel@tonic-gate 	switch (cmd) {
2604*0Sstevel@tonic-gate 	case MI_OFFLINE:
2605*0Sstevel@tonic-gate 		mio = &mpi->mi_ocmd;
2606*0Sstevel@tonic-gate 		/*
2607*0Sstevel@tonic-gate 		 * Lookup the interface that needs to be offlined.
2608*0Sstevel@tonic-gate 		 * If it does not exist, return a suitable error.
2609*0Sstevel@tonic-gate 		 */
2610*0Sstevel@tonic-gate 		pi = phyint_lookup(mio->mio_ifname);
2611*0Sstevel@tonic-gate 		if (pi == NULL)
2612*0Sstevel@tonic-gate 			return (send_result(newfd, IPMP_FAILURE, EINVAL));
2613*0Sstevel@tonic-gate 
2614*0Sstevel@tonic-gate 		/*
2615*0Sstevel@tonic-gate 		 * Verify that the minimum redundancy requirements are met.
2616*0Sstevel@tonic-gate 		 * The multipathing group must have at least the specified
2617*0Sstevel@tonic-gate 		 * number of functional interfaces after offlining the
2618*0Sstevel@tonic-gate 		 * requested interface. Otherwise return a suitable error.
2619*0Sstevel@tonic-gate 		 */
2620*0Sstevel@tonic-gate 		pg = pi->pi_group;
2621*0Sstevel@tonic-gate 		nif = 0;
2622*0Sstevel@tonic-gate 		if (pg != phyint_anongroup) {
2623*0Sstevel@tonic-gate 			for (nif = 0, pi2 = pg->pg_phyint; pi2 != NULL;
2624*0Sstevel@tonic-gate 			    pi2 = pi2->pi_pgnext) {
2625*0Sstevel@tonic-gate 				if ((pi2->pi_state == PI_RUNNING) ||
2626*0Sstevel@tonic-gate 				    (pg->pg_groupfailed &&
2627*0Sstevel@tonic-gate 				    !(pi2->pi_flags & IFF_OFFLINE)))
2628*0Sstevel@tonic-gate 					nif++;
2629*0Sstevel@tonic-gate 			}
2630*0Sstevel@tonic-gate 		}
2631*0Sstevel@tonic-gate 		if (nif < mio->mio_min_redundancy)
2632*0Sstevel@tonic-gate 			return (send_result(newfd, IPMP_EMINRED, 0));
2633*0Sstevel@tonic-gate 
2634*0Sstevel@tonic-gate 		/*
2635*0Sstevel@tonic-gate 		 * The order of operation is to set IFF_OFFLINE, followed by
2636*0Sstevel@tonic-gate 		 * failover. Setting IFF_OFFLINE ensures that no new ipif's
2637*0Sstevel@tonic-gate 		 * can be created. Subsequent failover moves everything on
2638*0Sstevel@tonic-gate 		 * the OFFLINE interface to some other functional interface.
2639*0Sstevel@tonic-gate 		 */
2640*0Sstevel@tonic-gate 		success = change_lif_flags(pi, IFF_OFFLINE, _B_TRUE);
2641*0Sstevel@tonic-gate 		if (success) {
2642*0Sstevel@tonic-gate 			if (!pi->pi_empty) {
2643*0Sstevel@tonic-gate 				error = try_failover(pi, FAILOVER_NORMAL);
2644*0Sstevel@tonic-gate 				if (error != 0) {
2645*0Sstevel@tonic-gate 					if (!change_lif_flags(pi, IFF_OFFLINE,
2646*0Sstevel@tonic-gate 					    _B_FALSE)) {
2647*0Sstevel@tonic-gate 						logerr("process_cmd: couldn't"
2648*0Sstevel@tonic-gate 						    " clear OFFLINE flag on"
2649*0Sstevel@tonic-gate 						    " %s\n", pi->pi_name);
2650*0Sstevel@tonic-gate 						/*
2651*0Sstevel@tonic-gate 						 * Offline interfaces should
2652*0Sstevel@tonic-gate 						 * not be probed.
2653*0Sstevel@tonic-gate 						 */
2654*0Sstevel@tonic-gate 						stop_probing(pi);
2655*0Sstevel@tonic-gate 					}
2656*0Sstevel@tonic-gate 					return (send_result(newfd, error,
2657*0Sstevel@tonic-gate 					    global_errno));
2658*0Sstevel@tonic-gate 				}
2659*0Sstevel@tonic-gate 			}
2660*0Sstevel@tonic-gate 		} else {
2661*0Sstevel@tonic-gate 			return (send_result(newfd, IPMP_FAILURE, errno));
2662*0Sstevel@tonic-gate 		}
2663*0Sstevel@tonic-gate 
2664*0Sstevel@tonic-gate 		/*
2665*0Sstevel@tonic-gate 		 * The interface is now Offline, so stop probing it.
2666*0Sstevel@tonic-gate 		 * Note that if_mpadm(1M) will down the test addresses,
2667*0Sstevel@tonic-gate 		 * after receiving a success reply from us. The routing
2668*0Sstevel@tonic-gate 		 * socket message will then make us close the socket used
2669*0Sstevel@tonic-gate 		 * for sending probes. But it is more logical that an
2670*0Sstevel@tonic-gate 		 * offlined interface must not be probed, even if it has
2671*0Sstevel@tonic-gate 		 * test addresses.
2672*0Sstevel@tonic-gate 		 */
2673*0Sstevel@tonic-gate 		stop_probing(pi);
2674*0Sstevel@tonic-gate 		return (send_result(newfd, IPMP_SUCCESS, 0));
2675*0Sstevel@tonic-gate 
2676*0Sstevel@tonic-gate 	case MI_UNDO_OFFLINE:
2677*0Sstevel@tonic-gate 		miu = &mpi->mi_ucmd;
2678*0Sstevel@tonic-gate 		/*
2679*0Sstevel@tonic-gate 		 * Undo the offline command. As usual lookup the interface.
2680*0Sstevel@tonic-gate 		 * Send an error if it does not exist.
2681*0Sstevel@tonic-gate 		 */
2682*0Sstevel@tonic-gate 		pi = phyint_lookup(miu->miu_ifname);
2683*0Sstevel@tonic-gate 		if (pi == NULL)
2684*0Sstevel@tonic-gate 			return (send_result(newfd, IPMP_FAILURE, EINVAL));
2685*0Sstevel@tonic-gate 
2686*0Sstevel@tonic-gate 		/*
2687*0Sstevel@tonic-gate 		 * Inverse of the offline operation. Do a failback, and then
2688*0Sstevel@tonic-gate 		 * clear the IFF_OFFLINE flag.
2689*0Sstevel@tonic-gate 		 */
2690*0Sstevel@tonic-gate 		error = do_failback(pi, _B_TRUE);
2691*0Sstevel@tonic-gate 		if (error == IPMP_EFBPARTIAL)
2692*0Sstevel@tonic-gate 			return (send_result(newfd, IPMP_EFBPARTIAL, 0));
2693*0Sstevel@tonic-gate 		error = do_failback(pi, _B_FALSE);
2694*0Sstevel@tonic-gate 
2695*0Sstevel@tonic-gate 		switch (error) {
2696*0Sstevel@tonic-gate 		case IPMP_SUCCESS:
2697*0Sstevel@tonic-gate 			if (!change_lif_flags(pi, IFF_OFFLINE, _B_FALSE)) {
2698*0Sstevel@tonic-gate 				logdebug("undo error %X\n", global_errno);
2699*0Sstevel@tonic-gate 				error = IPMP_FAILURE;
2700*0Sstevel@tonic-gate 				break;
2701*0Sstevel@tonic-gate 			}
2702*0Sstevel@tonic-gate 			/* FALLTHROUGH */
2703*0Sstevel@tonic-gate 
2704*0Sstevel@tonic-gate 		case IPMP_EFBPARTIAL:
2705*0Sstevel@tonic-gate 			/*
2706*0Sstevel@tonic-gate 			 * Reset the state of the interface based on the
2707*0Sstevel@tonic-gate 			 * current link state; if this phyint subsequently
2708*0Sstevel@tonic-gate 			 * acquires a test address, the state will be changed
2709*0Sstevel@tonic-gate 			 * again later as a result of the probes.
2710*0Sstevel@tonic-gate 			 */
2711*0Sstevel@tonic-gate 			if (LINK_UP(pi))
2712*0Sstevel@tonic-gate 				phyint_chstate(pi, PI_RUNNING);
2713*0Sstevel@tonic-gate 			else
2714*0Sstevel@tonic-gate 				phyint_chstate(pi, PI_FAILED);
2715*0Sstevel@tonic-gate 			break;
2716*0Sstevel@tonic-gate 
2717*0Sstevel@tonic-gate 		case IPMP_FAILURE:
2718*0Sstevel@tonic-gate 			break;
2719*0Sstevel@tonic-gate 
2720*0Sstevel@tonic-gate 		default:
2721*0Sstevel@tonic-gate 			logdebug("do_failback: unexpected return value\n");
2722*0Sstevel@tonic-gate 			break;
2723*0Sstevel@tonic-gate 		}
2724*0Sstevel@tonic-gate 		return (send_result(newfd, error, global_errno));
2725*0Sstevel@tonic-gate 
2726*0Sstevel@tonic-gate 	case MI_SETOINDEX:
2727*0Sstevel@tonic-gate 		mis = &mpi->mi_scmd;
2728*0Sstevel@tonic-gate 
2729*0Sstevel@tonic-gate 		/* Get the socket for doing ioctls */
2730*0Sstevel@tonic-gate 		ifsock = (mis->mis_iftype == AF_INET) ? ifsock_v4 : ifsock_v6;
2731*0Sstevel@tonic-gate 
2732*0Sstevel@tonic-gate 		/*
2733*0Sstevel@tonic-gate 		 * Get index of new original interface.
2734*0Sstevel@tonic-gate 		 * The index is returned in lifr.lifr_index.
2735*0Sstevel@tonic-gate 		 */
2736*0Sstevel@tonic-gate 		(void) strlcpy(lifr.lifr_name, mis->mis_new_pifname,
2737*0Sstevel@tonic-gate 		    sizeof (lifr.lifr_name));
2738*0Sstevel@tonic-gate 
2739*0Sstevel@tonic-gate 		if (ioctl(ifsock, SIOCGLIFINDEX, (char *)&lifr) < 0)
2740*0Sstevel@tonic-gate 			return (send_result(newfd, IPMP_FAILURE, errno));
2741*0Sstevel@tonic-gate 
2742*0Sstevel@tonic-gate 		/*
2743*0Sstevel@tonic-gate 		 * Set new original interface index.
2744*0Sstevel@tonic-gate 		 * The new index was put into lifr.lifr_index by the
2745*0Sstevel@tonic-gate 		 * SIOCGLIFINDEX ioctl.
2746*0Sstevel@tonic-gate 		 */
2747*0Sstevel@tonic-gate 		(void) strlcpy(lifr.lifr_name, mis->mis_lifname,
2748*0Sstevel@tonic-gate 		    sizeof (lifr.lifr_name));
2749*0Sstevel@tonic-gate 
2750*0Sstevel@tonic-gate 		if (ioctl(ifsock, SIOCSLIFOINDEX, (char *)&lifr) < 0)
2751*0Sstevel@tonic-gate 			return (send_result(newfd, IPMP_FAILURE, errno));
2752*0Sstevel@tonic-gate 
2753*0Sstevel@tonic-gate 		return (send_result(newfd, IPMP_SUCCESS, 0));
2754*0Sstevel@tonic-gate 
2755*0Sstevel@tonic-gate 	case MI_QUERY:
2756*0Sstevel@tonic-gate 		return (process_query(newfd, &mpi->mi_qcmd));
2757*0Sstevel@tonic-gate 
2758*0Sstevel@tonic-gate 	default:
2759*0Sstevel@tonic-gate 		break;
2760*0Sstevel@tonic-gate 	}
2761*0Sstevel@tonic-gate 
2762*0Sstevel@tonic-gate 	return (send_result(newfd, IPMP_EPROTO, 0));
2763*0Sstevel@tonic-gate }
2764*0Sstevel@tonic-gate 
2765*0Sstevel@tonic-gate /*
2766*0Sstevel@tonic-gate  * Process the query request pointed to by `miq' and send a reply on file
2767*0Sstevel@tonic-gate  * descriptor `fd'.  Returns an IPMP error code.
2768*0Sstevel@tonic-gate  */
2769*0Sstevel@tonic-gate static unsigned int
2770*0Sstevel@tonic-gate process_query(int fd, mi_query_t *miq)
2771*0Sstevel@tonic-gate {
2772*0Sstevel@tonic-gate 	ipmp_groupinfo_t	*grinfop;
2773*0Sstevel@tonic-gate 	ipmp_groupinfolist_t	*grlp;
2774*0Sstevel@tonic-gate 	ipmp_grouplist_t	*grlistp;
2775*0Sstevel@tonic-gate 	ipmp_ifinfo_t		*ifinfop;
2776*0Sstevel@tonic-gate 	ipmp_ifinfolist_t	*iflp;
2777*0Sstevel@tonic-gate 	ipmp_snap_t		*snap;
2778*0Sstevel@tonic-gate 	unsigned int		retval;
2779*0Sstevel@tonic-gate 
2780*0Sstevel@tonic-gate 	switch (miq->miq_inforeq) {
2781*0Sstevel@tonic-gate 	case IPMP_GROUPLIST:
2782*0Sstevel@tonic-gate 		retval = getgrouplist(&grlistp);
2783*0Sstevel@tonic-gate 		if (retval != IPMP_SUCCESS)
2784*0Sstevel@tonic-gate 			return (send_result(fd, retval, errno));
2785*0Sstevel@tonic-gate 
2786*0Sstevel@tonic-gate 		retval = send_result(fd, IPMP_SUCCESS, 0);
2787*0Sstevel@tonic-gate 		if (retval == IPMP_SUCCESS)
2788*0Sstevel@tonic-gate 			retval = send_grouplist(fd, grlistp);
2789*0Sstevel@tonic-gate 
2790*0Sstevel@tonic-gate 		ipmp_freegrouplist(grlistp);
2791*0Sstevel@tonic-gate 		return (retval);
2792*0Sstevel@tonic-gate 
2793*0Sstevel@tonic-gate 	case IPMP_GROUPINFO:
2794*0Sstevel@tonic-gate 		miq->miq_grname[LIFGRNAMSIZ - 1] = '\0';
2795*0Sstevel@tonic-gate 		retval = getgroupinfo(miq->miq_ifname, &grinfop);
2796*0Sstevel@tonic-gate 		if (retval != IPMP_SUCCESS)
2797*0Sstevel@tonic-gate 			return (send_result(fd, retval, errno));
2798*0Sstevel@tonic-gate 
2799*0Sstevel@tonic-gate 		retval = send_result(fd, IPMP_SUCCESS, 0);
2800*0Sstevel@tonic-gate 		if (retval == IPMP_SUCCESS)
2801*0Sstevel@tonic-gate 			retval = send_groupinfo(fd, grinfop);
2802*0Sstevel@tonic-gate 
2803*0Sstevel@tonic-gate 		ipmp_freegroupinfo(grinfop);
2804*0Sstevel@tonic-gate 		return (retval);
2805*0Sstevel@tonic-gate 
2806*0Sstevel@tonic-gate 	case IPMP_IFINFO:
2807*0Sstevel@tonic-gate 		miq->miq_ifname[LIFNAMSIZ - 1] = '\0';
2808*0Sstevel@tonic-gate 		retval = getifinfo(miq->miq_ifname, &ifinfop);
2809*0Sstevel@tonic-gate 		if (retval != IPMP_SUCCESS)
2810*0Sstevel@tonic-gate 			return (send_result(fd, retval, errno));
2811*0Sstevel@tonic-gate 
2812*0Sstevel@tonic-gate 		retval = send_result(fd, IPMP_SUCCESS, 0);
2813*0Sstevel@tonic-gate 		if (retval == IPMP_SUCCESS)
2814*0Sstevel@tonic-gate 			retval = send_ifinfo(fd, ifinfop);
2815*0Sstevel@tonic-gate 
2816*0Sstevel@tonic-gate 		ipmp_freeifinfo(ifinfop);
2817*0Sstevel@tonic-gate 		return (retval);
2818*0Sstevel@tonic-gate 
2819*0Sstevel@tonic-gate 	case IPMP_SNAP:
2820*0Sstevel@tonic-gate 		retval = getsnap(&snap);
2821*0Sstevel@tonic-gate 		if (retval != IPMP_SUCCESS)
2822*0Sstevel@tonic-gate 			return (send_result(fd, retval, errno));
2823*0Sstevel@tonic-gate 
2824*0Sstevel@tonic-gate 		retval = send_result(fd, IPMP_SUCCESS, 0);
2825*0Sstevel@tonic-gate 		if (retval != IPMP_SUCCESS)
2826*0Sstevel@tonic-gate 			goto out;
2827*0Sstevel@tonic-gate 
2828*0Sstevel@tonic-gate 		retval = ipmp_writetlv(fd, IPMP_SNAP, sizeof (*snap), snap);
2829*0Sstevel@tonic-gate 		if (retval != IPMP_SUCCESS)
2830*0Sstevel@tonic-gate 			goto out;
2831*0Sstevel@tonic-gate 
2832*0Sstevel@tonic-gate 		retval = send_grouplist(fd, snap->sn_grlistp);
2833*0Sstevel@tonic-gate 		if (retval != IPMP_SUCCESS)
2834*0Sstevel@tonic-gate 			goto out;
2835*0Sstevel@tonic-gate 
2836*0Sstevel@tonic-gate 		iflp = snap->sn_ifinfolistp;
2837*0Sstevel@tonic-gate 		for (; iflp != NULL; iflp = iflp->ifl_next) {
2838*0Sstevel@tonic-gate 			retval = send_ifinfo(fd, iflp->ifl_ifinfop);
2839*0Sstevel@tonic-gate 			if (retval != IPMP_SUCCESS)
2840*0Sstevel@tonic-gate 				goto out;
2841*0Sstevel@tonic-gate 		}
2842*0Sstevel@tonic-gate 
2843*0Sstevel@tonic-gate 		grlp = snap->sn_grinfolistp;
2844*0Sstevel@tonic-gate 		for (; grlp != NULL; grlp = grlp->grl_next) {
2845*0Sstevel@tonic-gate 			retval = send_groupinfo(fd, grlp->grl_grinfop);
2846*0Sstevel@tonic-gate 			if (retval != IPMP_SUCCESS)
2847*0Sstevel@tonic-gate 				goto out;
2848*0Sstevel@tonic-gate 		}
2849*0Sstevel@tonic-gate 	out:
2850*0Sstevel@tonic-gate 		ipmp_snap_free(snap);
2851*0Sstevel@tonic-gate 		return (retval);
2852*0Sstevel@tonic-gate 
2853*0Sstevel@tonic-gate 	default:
2854*0Sstevel@tonic-gate 		break;
2855*0Sstevel@tonic-gate 
2856*0Sstevel@tonic-gate 	}
2857*0Sstevel@tonic-gate 	return (send_result(fd, IPMP_EPROTO, 0));
2858*0Sstevel@tonic-gate }
2859*0Sstevel@tonic-gate 
2860*0Sstevel@tonic-gate /*
2861*0Sstevel@tonic-gate  * Send the group information pointed to by `grinfop' on file descriptor `fd'.
2862*0Sstevel@tonic-gate  * Returns an IPMP error code.
2863*0Sstevel@tonic-gate  */
2864*0Sstevel@tonic-gate static unsigned int
2865*0Sstevel@tonic-gate send_groupinfo(int fd, ipmp_groupinfo_t *grinfop)
2866*0Sstevel@tonic-gate {
2867*0Sstevel@tonic-gate 	ipmp_iflist_t	*iflistp = grinfop->gr_iflistp;
2868*0Sstevel@tonic-gate 	unsigned int	retval;
2869*0Sstevel@tonic-gate 
2870*0Sstevel@tonic-gate 	retval = ipmp_writetlv(fd, IPMP_GROUPINFO, sizeof (*grinfop), grinfop);
2871*0Sstevel@tonic-gate 	if (retval != IPMP_SUCCESS)
2872*0Sstevel@tonic-gate 		return (retval);
2873*0Sstevel@tonic-gate 
2874*0Sstevel@tonic-gate 	return (ipmp_writetlv(fd, IPMP_IFLIST,
2875*0Sstevel@tonic-gate 	    IPMP_IFLIST_SIZE(iflistp->il_nif), iflistp));
2876*0Sstevel@tonic-gate }
2877*0Sstevel@tonic-gate 
2878*0Sstevel@tonic-gate /*
2879*0Sstevel@tonic-gate  * Send the interface information pointed to by `ifinfop' on file descriptor
2880*0Sstevel@tonic-gate  * `fd'.  Returns an IPMP error code.
2881*0Sstevel@tonic-gate  */
2882*0Sstevel@tonic-gate static unsigned int
2883*0Sstevel@tonic-gate send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop)
2884*0Sstevel@tonic-gate {
2885*0Sstevel@tonic-gate 	return (ipmp_writetlv(fd, IPMP_IFINFO, sizeof (*ifinfop), ifinfop));
2886*0Sstevel@tonic-gate }
2887*0Sstevel@tonic-gate 
2888*0Sstevel@tonic-gate /*
2889*0Sstevel@tonic-gate  * Send the group list pointed to by `grlistp' on file descriptor `fd'.
2890*0Sstevel@tonic-gate  * Returns an IPMP error code.
2891*0Sstevel@tonic-gate  */
2892*0Sstevel@tonic-gate static unsigned int
2893*0Sstevel@tonic-gate send_grouplist(int fd, ipmp_grouplist_t *grlistp)
2894*0Sstevel@tonic-gate {
2895*0Sstevel@tonic-gate 	return (ipmp_writetlv(fd, IPMP_GROUPLIST,
2896*0Sstevel@tonic-gate 	    IPMP_GROUPLIST_SIZE(grlistp->gl_ngroup), grlistp));
2897*0Sstevel@tonic-gate }
2898*0Sstevel@tonic-gate 
2899*0Sstevel@tonic-gate /*
2900*0Sstevel@tonic-gate  * Initialize an mi_result_t structure using `error' and `syserror' and
2901*0Sstevel@tonic-gate  * send it on file descriptor `fd'.  Returns an IPMP error code.
2902*0Sstevel@tonic-gate  */
2903*0Sstevel@tonic-gate static unsigned int
2904*0Sstevel@tonic-gate send_result(int fd, unsigned int error, int syserror)
2905*0Sstevel@tonic-gate {
2906*0Sstevel@tonic-gate 	mi_result_t me;
2907*0Sstevel@tonic-gate 
2908*0Sstevel@tonic-gate 	me.me_mpathd_error = error;
2909*0Sstevel@tonic-gate 	if (error == IPMP_FAILURE)
2910*0Sstevel@tonic-gate 		me.me_sys_error = syserror;
2911*0Sstevel@tonic-gate 	else
2912*0Sstevel@tonic-gate 		me.me_sys_error = 0;
2913*0Sstevel@tonic-gate 
2914*0Sstevel@tonic-gate 	return (ipmp_write(fd, &me, sizeof (me)));
2915*0Sstevel@tonic-gate }
2916*0Sstevel@tonic-gate 
2917*0Sstevel@tonic-gate /*
2918*0Sstevel@tonic-gate  * Daemonize the process.
2919*0Sstevel@tonic-gate  */
2920*0Sstevel@tonic-gate static boolean_t
2921*0Sstevel@tonic-gate daemonize(void)
2922*0Sstevel@tonic-gate {
2923*0Sstevel@tonic-gate 	switch (fork()) {
2924*0Sstevel@tonic-gate 	case -1:
2925*0Sstevel@tonic-gate 		return (_B_FALSE);
2926*0Sstevel@tonic-gate 
2927*0Sstevel@tonic-gate 	case  0:
2928*0Sstevel@tonic-gate 		/*
2929*0Sstevel@tonic-gate 		 * Lose our controlling terminal, and become both a session
2930*0Sstevel@tonic-gate 		 * leader and a process group leader.
2931*0Sstevel@tonic-gate 		 */
2932*0Sstevel@tonic-gate 		if (setsid() == -1)
2933*0Sstevel@tonic-gate 			return (_B_FALSE);
2934*0Sstevel@tonic-gate 
2935*0Sstevel@tonic-gate 		/*
2936*0Sstevel@tonic-gate 		 * Under POSIX, a session leader can accidentally (through
2937*0Sstevel@tonic-gate 		 * open(2)) acquire a controlling terminal if it does not
2938*0Sstevel@tonic-gate 		 * have one.  Just to be safe, fork() again so we are not a
2939*0Sstevel@tonic-gate 		 * session leader.
2940*0Sstevel@tonic-gate 		 */
2941*0Sstevel@tonic-gate 		switch (fork()) {
2942*0Sstevel@tonic-gate 		case -1:
2943*0Sstevel@tonic-gate 			return (_B_FALSE);
2944*0Sstevel@tonic-gate 
2945*0Sstevel@tonic-gate 		case 0:
2946*0Sstevel@tonic-gate 			(void) chdir("/");
2947*0Sstevel@tonic-gate 			(void) umask(022);
2948*0Sstevel@tonic-gate 			(void) fdwalk(closefunc, NULL);
2949*0Sstevel@tonic-gate 			break;
2950*0Sstevel@tonic-gate 
2951*0Sstevel@tonic-gate 		default:
2952*0Sstevel@tonic-gate 			_exit(EXIT_SUCCESS);
2953*0Sstevel@tonic-gate 		}
2954*0Sstevel@tonic-gate 		break;
2955*0Sstevel@tonic-gate 
2956*0Sstevel@tonic-gate 	default:
2957*0Sstevel@tonic-gate 		_exit(EXIT_SUCCESS);
2958*0Sstevel@tonic-gate 	}
2959*0Sstevel@tonic-gate 
2960*0Sstevel@tonic-gate 	return (_B_TRUE);
2961*0Sstevel@tonic-gate }
2962*0Sstevel@tonic-gate 
2963*0Sstevel@tonic-gate /*
2964*0Sstevel@tonic-gate  * The parent has created some fds before forking on purpose, keep them open.
2965*0Sstevel@tonic-gate  */
2966*0Sstevel@tonic-gate static int
2967*0Sstevel@tonic-gate closefunc(void *not_used, int fd)
2968*0Sstevel@tonic-gate /* ARGSUSED */
2969*0Sstevel@tonic-gate {
2970*0Sstevel@tonic-gate 	if (fd != lsock_v4 && fd != lsock_v6)
2971*0Sstevel@tonic-gate 		(void) close(fd);
2972*0Sstevel@tonic-gate 	return (0);
2973*0Sstevel@tonic-gate }
2974*0Sstevel@tonic-gate 
2975*0Sstevel@tonic-gate /* LOGGER */
2976*0Sstevel@tonic-gate 
2977*0Sstevel@tonic-gate #include <syslog.h>
2978*0Sstevel@tonic-gate 
2979*0Sstevel@tonic-gate /*
2980*0Sstevel@tonic-gate  * Logging routines.  All routines log to syslog, unless the daemon is
2981*0Sstevel@tonic-gate  * running in the foreground, in which case the logging goes to stderr.
2982*0Sstevel@tonic-gate  *
2983*0Sstevel@tonic-gate  * The following routines are available:
2984*0Sstevel@tonic-gate  *
2985*0Sstevel@tonic-gate  *	logdebug(): A printf-like function for outputting debug messages
2986*0Sstevel@tonic-gate  *	(messages at LOG_DEBUG) that are only of use to developers.
2987*0Sstevel@tonic-gate  *
2988*0Sstevel@tonic-gate  *	logtrace(): A printf-like function for outputting tracing messages
2989*0Sstevel@tonic-gate  *	(messages at LOG_INFO) from the daemon.	 This is typically used
2990*0Sstevel@tonic-gate  *	to log the receipt of interesting network-related conditions.
2991*0Sstevel@tonic-gate  *
2992*0Sstevel@tonic-gate  *	logerr(): A printf-like function for outputting error messages
2993*0Sstevel@tonic-gate  *	(messages at LOG_ERR) from the daemon.
2994*0Sstevel@tonic-gate  *
2995*0Sstevel@tonic-gate  *	logperror*(): A set of functions used to output error messages
2996*0Sstevel@tonic-gate  *	(messages at LOG_ERR); these automatically append strerror(errno)
2997*0Sstevel@tonic-gate  *	and a newline to the message passed to them.
2998*0Sstevel@tonic-gate  *
2999*0Sstevel@tonic-gate  * NOTE: since the logging functions write to syslog, the messages passed
3000*0Sstevel@tonic-gate  *	 to them are not eligible for localization.  Thus, gettext() must
3001*0Sstevel@tonic-gate  *	 *not* be used.
3002*0Sstevel@tonic-gate  */
3003*0Sstevel@tonic-gate 
/* Nonzero once initlog() has been called. */
static int logging = 0;

/*
 * Open the syslog connection for the daemon and note that logging has
 * been initialized.
 */
static void
initlog(void)
{
	openlog("in.mpathd", LOG_PID | LOG_CONS, LOG_DAEMON);
	logging += 1;
}
3012*0Sstevel@tonic-gate 
3013*0Sstevel@tonic-gate /* PRINTFLIKE1 */
3014*0Sstevel@tonic-gate void
3015*0Sstevel@tonic-gate logerr(char *fmt, ...)
3016*0Sstevel@tonic-gate {
3017*0Sstevel@tonic-gate 	va_list ap;
3018*0Sstevel@tonic-gate 
3019*0Sstevel@tonic-gate 	va_start(ap, fmt);
3020*0Sstevel@tonic-gate 
3021*0Sstevel@tonic-gate 	if (logging)
3022*0Sstevel@tonic-gate 		vsyslog(LOG_ERR, fmt, ap);
3023*0Sstevel@tonic-gate 	else
3024*0Sstevel@tonic-gate 		(void) vfprintf(stderr, fmt, ap);
3025*0Sstevel@tonic-gate 	va_end(ap);
3026*0Sstevel@tonic-gate }
3027*0Sstevel@tonic-gate 
3028*0Sstevel@tonic-gate /* PRINTFLIKE1 */
3029*0Sstevel@tonic-gate void
3030*0Sstevel@tonic-gate logtrace(char *fmt, ...)
3031*0Sstevel@tonic-gate {
3032*0Sstevel@tonic-gate 	va_list ap;
3033*0Sstevel@tonic-gate 
3034*0Sstevel@tonic-gate 	va_start(ap, fmt);
3035*0Sstevel@tonic-gate 
3036*0Sstevel@tonic-gate 	if (logging)
3037*0Sstevel@tonic-gate 		vsyslog(LOG_INFO, fmt, ap);
3038*0Sstevel@tonic-gate 	else
3039*0Sstevel@tonic-gate 		(void) vfprintf(stderr, fmt, ap);
3040*0Sstevel@tonic-gate 	va_end(ap);
3041*0Sstevel@tonic-gate }
3042*0Sstevel@tonic-gate 
3043*0Sstevel@tonic-gate /* PRINTFLIKE1 */
3044*0Sstevel@tonic-gate void
3045*0Sstevel@tonic-gate logdebug(char *fmt, ...)
3046*0Sstevel@tonic-gate {
3047*0Sstevel@tonic-gate 	va_list ap;
3048*0Sstevel@tonic-gate 
3049*0Sstevel@tonic-gate 	va_start(ap, fmt);
3050*0Sstevel@tonic-gate 
3051*0Sstevel@tonic-gate 	if (logging)
3052*0Sstevel@tonic-gate 		vsyslog(LOG_DEBUG, fmt, ap);
3053*0Sstevel@tonic-gate 	else
3054*0Sstevel@tonic-gate 		(void) vfprintf(stderr, fmt, ap);
3055*0Sstevel@tonic-gate 	va_end(ap);
3056*0Sstevel@tonic-gate }
3057*0Sstevel@tonic-gate 
3058*0Sstevel@tonic-gate /* PRINTFLIKE1 */
3059*0Sstevel@tonic-gate void
3060*0Sstevel@tonic-gate logperror(char *str)
3061*0Sstevel@tonic-gate {
3062*0Sstevel@tonic-gate 	if (logging)
3063*0Sstevel@tonic-gate 		syslog(LOG_ERR, "%s: %m\n", str);
3064*0Sstevel@tonic-gate 	else
3065*0Sstevel@tonic-gate 		(void) fprintf(stderr, "%s: %s\n", str, strerror(errno));
3066*0Sstevel@tonic-gate }
3067*0Sstevel@tonic-gate 
3068*0Sstevel@tonic-gate void
3069*0Sstevel@tonic-gate logperror_pii(struct phyint_instance *pii, char *str)
3070*0Sstevel@tonic-gate {
3071*0Sstevel@tonic-gate 	if (logging) {
3072*0Sstevel@tonic-gate 		syslog(LOG_ERR, "%s (%s %s): %m\n",
3073*0Sstevel@tonic-gate 		    str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name);
3074*0Sstevel@tonic-gate 	} else {
3075*0Sstevel@tonic-gate 		(void) fprintf(stderr, "%s (%s %s): %s\n",
3076*0Sstevel@tonic-gate 		    str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name,
3077*0Sstevel@tonic-gate 		    strerror(errno));
3078*0Sstevel@tonic-gate 	}
3079*0Sstevel@tonic-gate }
3080*0Sstevel@tonic-gate 
3081*0Sstevel@tonic-gate void
3082*0Sstevel@tonic-gate logperror_li(struct logint *li, char *str)
3083*0Sstevel@tonic-gate {
3084*0Sstevel@tonic-gate 	struct	phyint_instance	*pii = li->li_phyint_inst;
3085*0Sstevel@tonic-gate 
3086*0Sstevel@tonic-gate 	if (logging) {
3087*0Sstevel@tonic-gate 		syslog(LOG_ERR, "%s (%s %s): %m\n",
3088*0Sstevel@tonic-gate 		    str, AF_STR(pii->pii_af), li->li_name);
3089*0Sstevel@tonic-gate 	} else {
3090*0Sstevel@tonic-gate 		(void) fprintf(stderr, "%s (%s %s): %s\n",
3091*0Sstevel@tonic-gate 		    str, AF_STR(pii->pii_af), li->li_name,
3092*0Sstevel@tonic-gate 		    strerror(errno));
3093*0Sstevel@tonic-gate 	}
3094*0Sstevel@tonic-gate }
3095*0Sstevel@tonic-gate 
3096*0Sstevel@tonic-gate void
3097*0Sstevel@tonic-gate close_probe_socket(struct phyint_instance *pii, boolean_t polled)
3098*0Sstevel@tonic-gate {
3099*0Sstevel@tonic-gate 	if (polled)
3100*0Sstevel@tonic-gate 		(void) poll_remove(pii->pii_probe_sock);
3101*0Sstevel@tonic-gate 	(void) close(pii->pii_probe_sock);
3102*0Sstevel@tonic-gate 	pii->pii_probe_sock = -1;
3103*0Sstevel@tonic-gate 	pii->pii_basetime_inited = 0;
3104*0Sstevel@tonic-gate }
3105