xref: /onnv-gate/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_probe.c (revision 0:68f95e015346)
1*0Sstevel@tonic-gate /*
2*0Sstevel@tonic-gate  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
3*0Sstevel@tonic-gate  * Use is subject to license terms.
4*0Sstevel@tonic-gate  */
5*0Sstevel@tonic-gate 
6*0Sstevel@tonic-gate /*
7*0Sstevel@tonic-gate  * Copyright (c) 1987 Regents of the University of California.
8*0Sstevel@tonic-gate  * All rights reserved.
9*0Sstevel@tonic-gate  *
10*0Sstevel@tonic-gate  * Redistribution and use in source and binary forms are permitted
11*0Sstevel@tonic-gate  * provided that the above copyright notice and this paragraph are
12*0Sstevel@tonic-gate  * duplicated in all such forms and that any documentation,
13*0Sstevel@tonic-gate  * advertising materials, and other materials related to such
14*0Sstevel@tonic-gate  * distribution and use acknowledge that the software was developed
15*0Sstevel@tonic-gate  * by the University of California, Berkeley. The name of the
16*0Sstevel@tonic-gate  * University may not be used to endorse or promote products derived
17*0Sstevel@tonic-gate  * from this software without specific prior written permission.
18*0Sstevel@tonic-gate  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
19*0Sstevel@tonic-gate  * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
20*0Sstevel@tonic-gate  * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
21*0Sstevel@tonic-gate  */
22*0Sstevel@tonic-gate 
23*0Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
24*0Sstevel@tonic-gate 
25*0Sstevel@tonic-gate #include "mpd_defs.h"
26*0Sstevel@tonic-gate #include "mpd_tables.h"
27*0Sstevel@tonic-gate 
/*
 * Probe types understood by probe(). The selected value is also placed
 * (in network byte order) in the pr_icmp_mtype field of the outgoing packet.
 *
 *	PROBE_UNI	Unicast probe packet
 *	PROBE_MULTI	Multicast probe packet
 *	PROBE_RTT	RTT only probe packet
 */
#define	PROBE_UNI	0x1234
#define	PROBE_MULTI	0x5678
#define	PROBE_RTT	0x9abc

/* Number of milliseconds in a minute */
#define	MSEC_PERMIN	(60 * MILLISEC)
36*0Sstevel@tonic-gate 
/*
 * Layout of a probe and of a probe response. Both are ICMP Echo messages
 * (request and reply respectively), and the same layout is used for IPv4
 * and IPv6. The first four members form the ICMP echo header proper; the
 * last two are the data portion that in.mpathd places in every probe.
 */
struct pr_icmp {
	uint8_t		pr_icmp_type;		/* ICMP type field */
	uint8_t		pr_icmp_code;		/* ICMP code field */
	uint16_t	pr_icmp_cksum;		/* ICMP checksum field */
	uint16_t	pr_icmp_id;		/* Identification */
	uint16_t	pr_icmp_seq;		/* Sequence number */
	uint32_t	pr_icmp_timestamp;	/* Time stamp */
	uint32_t	pr_icmp_mtype;		/* Message type (PROBE_*) */
};
51*0Sstevel@tonic-gate 
52*0Sstevel@tonic-gate static struct in6_addr all_nodes_mcast_v6 = { { 0xff, 0x2, 0x0, 0x0,
53*0Sstevel@tonic-gate 				    0x0, 0x0, 0x0, 0x0,
54*0Sstevel@tonic-gate 				    0x0, 0x0, 0x0, 0x0,
55*0Sstevel@tonic-gate 				    0x0, 0x0, 0x0, 0x1 } };
56*0Sstevel@tonic-gate 
57*0Sstevel@tonic-gate static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } };
58*0Sstevel@tonic-gate 
59*0Sstevel@tonic-gate static hrtime_t	last_fdt_bumpup_time;	/* When FDT was bumped up last */
60*0Sstevel@tonic-gate 
61*0Sstevel@tonic-gate static void		*find_ancillary(struct msghdr *msg, int cmsg_type);
62*0Sstevel@tonic-gate static void		pi_set_crtt(struct target *tg, int m,
63*0Sstevel@tonic-gate     boolean_t is_probe_uni);
64*0Sstevel@tonic-gate static void		incoming_echo_reply(struct phyint_instance *pii,
65*0Sstevel@tonic-gate     struct pr_icmp *reply, struct in6_addr fromaddr);
66*0Sstevel@tonic-gate static void		incoming_rtt_reply(struct phyint_instance *pii,
67*0Sstevel@tonic-gate     struct pr_icmp *reply, struct in6_addr fromaddr);
68*0Sstevel@tonic-gate static void		incoming_mcast_reply(struct phyint_instance *pii,
69*0Sstevel@tonic-gate     struct pr_icmp *reply, struct in6_addr fromaddr);
70*0Sstevel@tonic-gate 
71*0Sstevel@tonic-gate static boolean_t	check_pg_crtt_improved(struct phyint_group *pg);
72*0Sstevel@tonic-gate static boolean_t	check_pii_crtt_improved(struct phyint_instance *pii);
73*0Sstevel@tonic-gate static boolean_t	check_exception_target(struct phyint_instance *pii,
74*0Sstevel@tonic-gate     struct target *target);
75*0Sstevel@tonic-gate static void		probe_fail_info(struct phyint_instance *pii,
76*0Sstevel@tonic-gate     struct target *cur_tg, struct probe_fail_count *pfinfo);
77*0Sstevel@tonic-gate static void		probe_success_info(struct phyint_instance *pii,
78*0Sstevel@tonic-gate     struct target *cur_tg, struct probe_success_count *psinfo);
79*0Sstevel@tonic-gate static boolean_t	phyint_repaired(struct phyint *pi);
80*0Sstevel@tonic-gate 
81*0Sstevel@tonic-gate static int		failover(struct phyint *from, struct phyint *to);
82*0Sstevel@tonic-gate static int		failback(struct phyint *from, struct phyint *to);
83*0Sstevel@tonic-gate static struct phyint	*get_failover_dst(struct phyint *pi, int failover_type);
84*0Sstevel@tonic-gate 
85*0Sstevel@tonic-gate static boolean_t	highest_ack_tg(uint16_t seq, struct target *tg);
86*0Sstevel@tonic-gate static int 		in_cksum(ushort_t *addr, int len);
87*0Sstevel@tonic-gate static void		reset_snxt_basetimes(void);
88*0Sstevel@tonic-gate 
89*0Sstevel@tonic-gate /*
90*0Sstevel@tonic-gate  * CRTT - Conservative Round Trip Time Estimate
91*0Sstevel@tonic-gate  * Probe success - A matching probe reply received before CRTT ms has elapsed
92*0Sstevel@tonic-gate  *	after sending the probe.
93*0Sstevel@tonic-gate  * Probe failure - No probe reply received and more than CRTT ms has elapsed
94*0Sstevel@tonic-gate  *	after sending the probe.
95*0Sstevel@tonic-gate  *
96*0Sstevel@tonic-gate  * TLS - Time last success. Most recent probe ack received at this time.
97*0Sstevel@tonic-gate  * TFF - Time first fail. The time of the earliest probe failure in
98*0Sstevel@tonic-gate  *	a consecutive series of probe failures.
99*0Sstevel@tonic-gate  * NUM_PROBE_REPAIRS  - Number of consecutive successful probes required
100*0Sstevel@tonic-gate  * 	before declaring phyint repair.
101*0Sstevel@tonic-gate  * NUM_PROBE_FAILS - Number of consecutive probe failures required to
102*0Sstevel@tonic-gate  *	declare a phyint failure.
103*0Sstevel@tonic-gate  *
104*0Sstevel@tonic-gate  * 			Phyint state diagram
105*0Sstevel@tonic-gate  *
106*0Sstevel@tonic-gate  * The state of a phyint that is capable of being probed, is completely
107*0Sstevel@tonic-gate  * specified by the 5-tuple <pi_state, pg_groupfailed, I, pi_empty, pi_full>.
108*0Sstevel@tonic-gate  *
109*0Sstevel@tonic-gate  * A phyint starts in either PI_RUNNING or PI_FAILED, depending on the state
110*0Sstevel@tonic-gate  * of the link (according to the driver).  If the phyint is also configured
111*0Sstevel@tonic-gate  * with a test address (the common case) and probe targets, then a phyint must
112*0Sstevel@tonic-gate  * also successfully be able to send and receive probes in order to remain in
113*0Sstevel@tonic-gate  * the PI_RUNNING state (otherwise, it transitions to PI_FAILED).
114*0Sstevel@tonic-gate  *
115*0Sstevel@tonic-gate  * Further, if a PI_RUNNING phyint is configured with a test address but is
116*0Sstevel@tonic-gate  * unable to find any probe targets, it will transition to the PI_NOTARGETS
117*0Sstevel@tonic-gate  * state, which indicates that the link is apparently functional but that
118*0Sstevel@tonic-gate  * in.mpathd is unable to send probes to verify functionality (in this case,
119*0Sstevel@tonic-gate  * in.mpathd makes the optimistic assumption that the interface is working
120*0Sstevel@tonic-gate  * correctly and thus does not perform a failover, but reports the interface
121*0Sstevel@tonic-gate  * as IPMP_IF_UNKNOWN through the async events and query interfaces).
122*0Sstevel@tonic-gate  *
123*0Sstevel@tonic-gate  * At any point, a phyint may be administratively marked offline via if_mpadm.
124*0Sstevel@tonic-gate  * In this case, the interface always transitions to PI_OFFLINE, regardless
125*0Sstevel@tonic-gate  * of its previous state.  When the interface is later brought back online,
126*0Sstevel@tonic-gate  * in.mpathd acts as if the interface is new (and thus it transitions to
127*0Sstevel@tonic-gate  * PI_RUNNING or PI_FAILED based on the status of the link and the result of
128*0Sstevel@tonic-gate  * its probes, if probes are sent).
129*0Sstevel@tonic-gate  *
130*0Sstevel@tonic-gate  * pi_state -  PI_RUNNING or PI_FAILED
131*0Sstevel@tonic-gate  *	PI_RUNNING: The failure detection logic says the phyint is good.
132*0Sstevel@tonic-gate  *	PI_FAILED: The failure detection logic says the phyint has failed.
133*0Sstevel@tonic-gate  *
134*0Sstevel@tonic-gate  * pg_groupfailed  - Group failure, all interfaces in the group have failed.
135*0Sstevel@tonic-gate  *	The pi_state may be either PI_FAILED or PI_NOTARGETS.
136*0Sstevel@tonic-gate  *	In the case of router targets, we assume that the current list of
137*0Sstevel@tonic-gate  *	targets obtained from the routing table, is still valid, so the
138*0Sstevel@tonic-gate  *	phyint state is PI_FAILED. In the case of host targets, we delete the
139*0Sstevel@tonic-gate  *	list of targets, and multicast to the all hosts, to reconstruct the
140*0Sstevel@tonic-gate  *	target list. So the phyints are in the PI_NOTARGETS state.
141*0Sstevel@tonic-gate  *
142*0Sstevel@tonic-gate  * I -	value of (pi_flags & IFF_INACTIVE)
143*0Sstevel@tonic-gate  *	IFF_INACTIVE: No failovers have been done to the standby, from
144*0Sstevel@tonic-gate  *		other phyints. This phyint is an inactive standby.
145*0Sstevel@tonic-gate  *
146*0Sstevel@tonic-gate  * pi_empty
147*0Sstevel@tonic-gate  *	This phyint has failed over successfully to another phyint, and
148*0Sstevel@tonic-gate  *	this phyint is currently "empty". It does not host any addresses or
149*0Sstevel@tonic-gate  *	multicast membership etc. This is the state of a phyint after a
150*0Sstevel@tonic-gate  *	failover from the phyint has completed successfully and no subsequent
151*0Sstevel@tonic-gate  *	'failover to' or 'failback to' has occurred on the phyint.
152*0Sstevel@tonic-gate  *	IP guarantees that no new logicals will be hosted nor any multicast
153*0Sstevel@tonic-gate  *	joins permitted on the phyint, since the phyint is either failed or
154*0Sstevel@tonic-gate  *	inactive. If pi_empty is set, the phyint is either failed or
155*0Sstevel@tonic-gate  *	inactive.
156*0Sstevel@tonic-gate  *
157*0Sstevel@tonic-gate  * pi_full
158*0Sstevel@tonic-gate  *	The phyint hosts all of its own addresses that it "owns". If the
159*0Sstevel@tonic-gate  *	phyint was previously failed or inactive, failbacks to the phyint
160*0Sstevel@tonic-gate  *	have completed successfully, i.e. no more failbacks to this phyint
161*0Sstevel@tonic-gate  *	can produce any change in system state whatsoever.
162*0Sstevel@tonic-gate  *
163*0Sstevel@tonic-gate  * Not all 32 possible combinations of the above 5-tuple are possible.
164*0Sstevel@tonic-gate  * Furthermore some of the above combinations are transient. They may occur
165*0Sstevel@tonic-gate  * only because the failover or failback did not complete successfully. The
166*0Sstevel@tonic-gate  * failover/failback will be retried and eventually a stable state will be
167*0Sstevel@tonic-gate  * reached.
168*0Sstevel@tonic-gate  *
169*0Sstevel@tonic-gate  * I is tracked by IP. pi_state, pi_empty and pi_full are tracked by mpathd.
170*0Sstevel@tonic-gate  * The following are the state machines. 'from' and 'to' are the src and
171*0Sstevel@tonic-gate  * dst of the failover/failback, below
172*0Sstevel@tonic-gate  *
173*0Sstevel@tonic-gate  *			pi_empty state machine
174*0Sstevel@tonic-gate  * ---------------------------------------------------------------------------
175*0Sstevel@tonic-gate  *	Event				State	->	New State
176*0Sstevel@tonic-gate  * ---------------------------------------------------------------------------
177*0Sstevel@tonic-gate  *	successful completion 		from.pi_empty = 0 -> from.pi_empty = 1
178*0Sstevel@tonic-gate  *	of failover
179*0Sstevel@tonic-gate  *
180*0Sstevel@tonic-gate  *	Initiate failover 		to.pi_empty = X   -> to.pi_empty = 0
181*0Sstevel@tonic-gate  *
182*0Sstevel@tonic-gate  * 	Initiate failback 		to.pi_empty = X   -> to.pi_empty = 0
183*0Sstevel@tonic-gate  *
184*0Sstevel@tonic-gate  * 	group failure			pi_empty = X	  -> pi_empty = 0
185*0Sstevel@tonic-gate  * ---------------------------------------------------------------------------
186*0Sstevel@tonic-gate  *
187*0Sstevel@tonic-gate  *			pi_full state machine
188*0Sstevel@tonic-gate  * ---------------------------------------------------------------------------
189*0Sstevel@tonic-gate  *	Event				State		  -> New State
190*0Sstevel@tonic-gate  * ---------------------------------------------------------------------------
191*0Sstevel@tonic-gate  *	successful completion		to.pi_full = 0    -> to.pi_full = 1
192*0Sstevel@tonic-gate  *	of failback from
193*0Sstevel@tonic-gate  *	each of the other phyints
194*0Sstevel@tonic-gate  *
195*0Sstevel@tonic-gate  *	Initiate failover 		from.pi_full = X  -> from.pi_full = 0
196*0Sstevel@tonic-gate  *
197*0Sstevel@tonic-gate  *	group failure			pi_full = X	  -> pi_full = 0
198*0Sstevel@tonic-gate  * ---------------------------------------------------------------------------
199*0Sstevel@tonic-gate  *
200*0Sstevel@tonic-gate  *			pi_state state machine
201*0Sstevel@tonic-gate  * ---------------------------------------------------------------------------
202*0Sstevel@tonic-gate  *	Event			State			New State
203*0Sstevel@tonic-gate  *				Action:
204*0Sstevel@tonic-gate  * ---------------------------------------------------------------------------
205*0Sstevel@tonic-gate  *	NIC failure		(PI_RUNNING, I == 0) -> (PI_FAILED, I == 0)
206*0Sstevel@tonic-gate  *	detection		: set IFF_FAILED on this phyint
207*0Sstevel@tonic-gate  *				: failover from this phyint to another
208*0Sstevel@tonic-gate  *
209*0Sstevel@tonic-gate  *	NIC failure		(PI_RUNNING, I == 1) -> (PI_FAILED, I == 1)
210*0Sstevel@tonic-gate  *	detection		: set IFF_FAILED on this phyint
211*0Sstevel@tonic-gate  *
212*0Sstevel@tonic-gate  *	NIC repair 		(PI_FAILED, I == 0)  ->	(PI_RUNNING, I == 0)
213*0Sstevel@tonic-gate  *	detection		: to.pi_empty = 0
214*0Sstevel@tonic-gate  *				: failback to this phyint if enabled
215*0Sstevel@tonic-gate  *				: clear IFF_FAILED on this phyint
216*0Sstevel@tonic-gate  *
217*0Sstevel@tonic-gate  *	NIC repair 		(PI_FAILED, I == 1)  ->	(PI_RUNNING, I == 1)
218*0Sstevel@tonic-gate  *	detection		: clear IFF_FAILED on this phyint
219*0Sstevel@tonic-gate  *
220*0Sstevel@tonic-gate  *	Group failure		(perform on all phyints in the group)
221*0Sstevel@tonic-gate  *	detection 		PI_RUNNING		PI_FAILED
222*0Sstevel@tonic-gate  *	(Router targets)	: set IFF_FAILED
223*0Sstevel@tonic-gate  *				: clear pi_empty and pi_full
224*0Sstevel@tonic-gate  *
225*0Sstevel@tonic-gate  *	Group failure		(perform on all phyints in the group)
226*0Sstevel@tonic-gate  *	detection 		PI_RUNNING		PI_NOTARGETS
227*0Sstevel@tonic-gate  *	(Host targets)		: set IFF_FAILED
228*0Sstevel@tonic-gate  *				: clear pi_empty and pi_full
229*0Sstevel@tonic-gate  *				: delete the target list on all phyints
230*0Sstevel@tonic-gate  * ---------------------------------------------------------------------------
231*0Sstevel@tonic-gate  *
232*0Sstevel@tonic-gate  *			I state machine
233*0Sstevel@tonic-gate  * ---------------------------------------------------------------------------
234*0Sstevel@tonic-gate  *	Event		State			Action:
235*0Sstevel@tonic-gate  * ---------------------------------------------------------------------------
236*0Sstevel@tonic-gate  *	Turn on I 	pi_empty == 0 		: failover from standby
237*0Sstevel@tonic-gate  *
238*0Sstevel@tonic-gate  *	Turn off I 	PI_RUNNING,		: pi_empty = 0
239*0Sstevel@tonic-gate  *			pi_full == 0		: failback to this if enabled
240*0Sstevel@tonic-gate  * ---------------------------------------------------------------------------
241*0Sstevel@tonic-gate  *
242*0Sstevel@tonic-gate  * Assertions: (Read '==>' as implies)
243*0Sstevel@tonic-gate  *
244*0Sstevel@tonic-gate  * (pi_empty == 1) ==> (I == 1 || pi_state == PI_FAILED)
245*0Sstevel@tonic-gate  * (pi_empty == 1) ==> (pi_full == 0)
246*0Sstevel@tonic-gate  * (pi_full  == 1) ==> (pi_empty == 0)
247*0Sstevel@tonic-gate  *
248*0Sstevel@tonic-gate  * Invariants
249*0Sstevel@tonic-gate  *
250*0Sstevel@tonic-gate  * pg_groupfailed = 0  &&
251*0Sstevel@tonic-gate  *   1. (I == 1, pi_empty == 0)		 ==> initiate failover from standby
252*0Sstevel@tonic-gate  *   2. (I == 0, PI_FAILED, pi_empty == 0) ==> initiate failover from phyint
253*0Sstevel@tonic-gate  *   3. (I == 0, PI_RUNNING, pi_full == 0) ==> initiate failback to phyint
254*0Sstevel@tonic-gate  *
255*0Sstevel@tonic-gate  * 1. says that an inactive standby, that is not empty, has to be failed
256*0Sstevel@tonic-gate  * over. For a standby to be truly inactive, it should not host any
257*0Sstevel@tonic-gate  * addresses. So we move them to some other phyint. Usually we catch the
258*0Sstevel@tonic-gate  * turn on of IFF_INACTIVE, and perform this action. However if the failover
259*0Sstevel@tonic-gate  * did not complete successfully, then subsequently we have lost the edge
260*0Sstevel@tonic-gate  * trigger, and this invariant kicks in and completes the action.
261*0Sstevel@tonic-gate  *
262*0Sstevel@tonic-gate  * 2. says that any failed phyint that is not empty must be failed over.
263*0Sstevel@tonic-gate  * Usually we do the failover when we detect NIC failure. However if the
264*0Sstevel@tonic-gate  * failover does not complete successfully, this invariant kicks in and
265*0Sstevel@tonic-gate  * completes the failover. We exclude inactive standby which is covered by 1.
266*0Sstevel@tonic-gate  *
267*0Sstevel@tonic-gate  * 3. says that any running phyint that is not full must be failed back.
268*0Sstevel@tonic-gate  * Usually we do the failback when we detect NIC repair. However if the
269*0Sstevel@tonic-gate  * failback does not complete successfully, this invariant kicks in and
270*0Sstevel@tonic-gate  * completes the failback. Note that we don't want to failback to an inactive
271*0Sstevel@tonic-gate  * standby.
272*0Sstevel@tonic-gate  *
273*0Sstevel@tonic-gate  * The invariants 1 - 3 and the actions are in initifs().
274*0Sstevel@tonic-gate  */
275*0Sstevel@tonic-gate 
276*0Sstevel@tonic-gate struct probes_missed probes_missed;
277*0Sstevel@tonic-gate 
278*0Sstevel@tonic-gate /*
279*0Sstevel@tonic-gate  * Compose and transmit an ICMP ECHO REQUEST packet.  The IP header
280*0Sstevel@tonic-gate  * will be added on by the kernel.  The id field identifies this phyint.
281*0Sstevel@tonic-gate  * and the sequence number is an increasing (modulo 2^^16) integer. The data
282*0Sstevel@tonic-gate  * portion holds the time value when the packet is sent. On echo this is
283*0Sstevel@tonic-gate  * extracted to compute the round-trip time. Three different types of
284*0Sstevel@tonic-gate  * probe packets are used.
285*0Sstevel@tonic-gate  *
286*0Sstevel@tonic-gate  * PROBE_UNI: This type is used to do failure detection / failure recovery
287*0Sstevel@tonic-gate  *	and RTT calculation. PROBE_UNI probes are spaced apart in time,
288*0Sstevel@tonic-gate  *	not less than the current CRTT. pii_probes[] stores data
289*0Sstevel@tonic-gate  *	about these probes. These packets consume sequence number space.
290*0Sstevel@tonic-gate  *
291*0Sstevel@tonic-gate  * PROBE_RTT: This type is used to make only rtt measurments. Normally these
292*0Sstevel@tonic-gate  * 	are not used. Under heavy network load, the rtt may go up very high,
293*0Sstevel@tonic-gate  *	due to a spike, or may appear to go high, due to extreme scheduling
294*0Sstevel@tonic-gate  * 	delays. Once the network stress is removed, mpathd takes long time to
295*0Sstevel@tonic-gate  *	recover, because the probe_interval is already high, and it takes
296*0Sstevel@tonic-gate  *	a long time to send out sufficient number of probes to bring down the
297*0Sstevel@tonic-gate  *	rtt. To avoid this problem, PROBE_RTT probes are sent out every
298*0Sstevel@tonic-gate  *	user_probe_interval ms. and will cause only rtt updates. These packets
299*0Sstevel@tonic-gate  *	do not consume sequence number space nor is information about these
300*0Sstevel@tonic-gate  *	packets stored in the pii_probes[]
301*0Sstevel@tonic-gate  *
302*0Sstevel@tonic-gate  * PROBE_MULTI: This type is only used to construct a list of targets, when
303*0Sstevel@tonic-gate  *	no targets are known. The packet is multicast to the all hosts addr.
304*0Sstevel@tonic-gate  */
305*0Sstevel@tonic-gate static void
306*0Sstevel@tonic-gate probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time)
307*0Sstevel@tonic-gate {
308*0Sstevel@tonic-gate 	struct pr_icmp probe_pkt;	/* Probe packet */
309*0Sstevel@tonic-gate 	struct sockaddr_in6 whereto6; 	/* target address IPv6 */
310*0Sstevel@tonic-gate 	struct sockaddr_in whereto; 	/* target address IPv4 */
311*0Sstevel@tonic-gate 	int	pr_ndx;			/* probe index in pii->pii_probes[] */
312*0Sstevel@tonic-gate 	boolean_t sent = _B_TRUE;
313*0Sstevel@tonic-gate 
314*0Sstevel@tonic-gate 	if (debug & D_TARGET) {
315*0Sstevel@tonic-gate 		logdebug("probe(%s %s %d %u)\n", AF_STR(pii->pii_af),
316*0Sstevel@tonic-gate 		    pii->pii_name, probe_type, cur_time);
317*0Sstevel@tonic-gate 	}
318*0Sstevel@tonic-gate 
319*0Sstevel@tonic-gate 	assert(pii->pii_probe_sock != -1);
320*0Sstevel@tonic-gate 	assert(probe_type == PROBE_UNI || probe_type == PROBE_MULTI ||
321*0Sstevel@tonic-gate 	    probe_type == PROBE_RTT);
322*0Sstevel@tonic-gate 
323*0Sstevel@tonic-gate 	probe_pkt.pr_icmp_type = (pii->pii_af == AF_INET) ?
324*0Sstevel@tonic-gate 	    ICMP_ECHO_REQUEST : ICMP6_ECHO_REQUEST;
325*0Sstevel@tonic-gate 	probe_pkt.pr_icmp_code = 0;
326*0Sstevel@tonic-gate 	probe_pkt.pr_icmp_cksum = 0;
327*0Sstevel@tonic-gate 	probe_pkt.pr_icmp_seq = htons(pii->pii_snxt);
328*0Sstevel@tonic-gate 
329*0Sstevel@tonic-gate 	/*
330*0Sstevel@tonic-gate 	 * Since there is no need to do arithmetic on the icmpid,
331*0Sstevel@tonic-gate 	 * (only equality check is done) pii_icmpid is stored in
332*0Sstevel@tonic-gate 	 * network byte order at initialization itself.
333*0Sstevel@tonic-gate 	 */
334*0Sstevel@tonic-gate 	probe_pkt.pr_icmp_id = pii->pii_icmpid;
335*0Sstevel@tonic-gate 	probe_pkt.pr_icmp_timestamp = htonl(cur_time);
336*0Sstevel@tonic-gate 	probe_pkt.pr_icmp_mtype = htonl(probe_type);
337*0Sstevel@tonic-gate 
338*0Sstevel@tonic-gate 	/*
339*0Sstevel@tonic-gate 	 * If probe_type is PROBE_MULTI, this packet will be multicast to
340*0Sstevel@tonic-gate 	 * the all hosts address. Otherwise it is unicast to the next target.
341*0Sstevel@tonic-gate 	 */
342*0Sstevel@tonic-gate 	assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) &&
343*0Sstevel@tonic-gate 	    pii->pii_rtt_target_next != NULL));
344*0Sstevel@tonic-gate 
345*0Sstevel@tonic-gate 	if (pii->pii_af == AF_INET6) {
346*0Sstevel@tonic-gate 		bzero(&whereto6, sizeof (whereto6));
347*0Sstevel@tonic-gate 		whereto6.sin6_family = AF_INET6;
348*0Sstevel@tonic-gate 		if (probe_type == PROBE_MULTI) {
349*0Sstevel@tonic-gate 			whereto6.sin6_addr = all_nodes_mcast_v6;
350*0Sstevel@tonic-gate 		} else if (probe_type == PROBE_UNI) {
351*0Sstevel@tonic-gate 			whereto6.sin6_addr = pii->pii_target_next->tg_address;
352*0Sstevel@tonic-gate 		} else  {
353*0Sstevel@tonic-gate 			/* type is PROBE_RTT */
354*0Sstevel@tonic-gate 			whereto6.sin6_addr =
355*0Sstevel@tonic-gate 			    pii->pii_rtt_target_next->tg_address;
356*0Sstevel@tonic-gate 		}
357*0Sstevel@tonic-gate 		if (sendto(pii->pii_probe_sock, (char *)&probe_pkt,
358*0Sstevel@tonic-gate 		    sizeof (probe_pkt), 0, (struct sockaddr *)&whereto6,
359*0Sstevel@tonic-gate 		    sizeof (whereto6)) != sizeof (probe_pkt)) {
360*0Sstevel@tonic-gate 			logperror_pii(pii, "probe: probe sendto");
361*0Sstevel@tonic-gate 			sent = _B_FALSE;
362*0Sstevel@tonic-gate 		}
363*0Sstevel@tonic-gate 	} else {
364*0Sstevel@tonic-gate 		bzero(&whereto, sizeof (whereto));
365*0Sstevel@tonic-gate 		whereto.sin_family = AF_INET;
366*0Sstevel@tonic-gate 		if (probe_type == PROBE_MULTI) {
367*0Sstevel@tonic-gate 			whereto.sin_addr = all_nodes_mcast_v4;
368*0Sstevel@tonic-gate 		} else if (probe_type == PROBE_UNI) {
369*0Sstevel@tonic-gate 			IN6_V4MAPPED_TO_INADDR(
370*0Sstevel@tonic-gate 			    &pii->pii_target_next->tg_address,
371*0Sstevel@tonic-gate 			    &whereto.sin_addr);
372*0Sstevel@tonic-gate 		} else {
373*0Sstevel@tonic-gate 			/* type is PROBE_RTT */
374*0Sstevel@tonic-gate 			IN6_V4MAPPED_TO_INADDR(
375*0Sstevel@tonic-gate 			    &pii->pii_rtt_target_next->tg_address,
376*0Sstevel@tonic-gate 			    &whereto.sin_addr);
377*0Sstevel@tonic-gate 		}
378*0Sstevel@tonic-gate 
379*0Sstevel@tonic-gate 		/*
380*0Sstevel@tonic-gate 		 * Compute the IPv4 icmp checksum. Does not cover the IP header.
381*0Sstevel@tonic-gate 		 */
382*0Sstevel@tonic-gate 		probe_pkt.pr_icmp_cksum =
383*0Sstevel@tonic-gate 		    in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt));
384*0Sstevel@tonic-gate 		if (sendto(pii->pii_probe_sock, (char *)&probe_pkt,
385*0Sstevel@tonic-gate 		    sizeof (probe_pkt), 0, (struct sockaddr *)&whereto,
386*0Sstevel@tonic-gate 		    sizeof (whereto)) != sizeof (probe_pkt)) {
387*0Sstevel@tonic-gate 			logperror_pii(pii, "probe: probe sendto");
388*0Sstevel@tonic-gate 			sent = _B_FALSE;
389*0Sstevel@tonic-gate 		}
390*0Sstevel@tonic-gate 	}
391*0Sstevel@tonic-gate 
392*0Sstevel@tonic-gate 	/*
393*0Sstevel@tonic-gate 	 * If this is a PROBE_UNI probe packet being unicast to a target, then
394*0Sstevel@tonic-gate 	 * update our tables. We will need this info in processing the probe
395*0Sstevel@tonic-gate 	 * response. PROBE_MULTI and PROBE_RTT packets are not used for
396*0Sstevel@tonic-gate 	 * the purpose of failure or recovery detection. PROBE_MULTI packets
397*0Sstevel@tonic-gate 	 * are only used to construct a list of targets. PROBE_RTT packets are
398*0Sstevel@tonic-gate 	 * used only for updating the rtt and not for failure detection.
399*0Sstevel@tonic-gate 	 */
400*0Sstevel@tonic-gate 	if (probe_type == PROBE_UNI && sent) {
401*0Sstevel@tonic-gate 		pr_ndx = pii->pii_probe_next;
402*0Sstevel@tonic-gate 		assert(pr_ndx >= 0 && pr_ndx < PROBE_STATS_COUNT);
403*0Sstevel@tonic-gate 
404*0Sstevel@tonic-gate 		/* Collect statistics, before we reuse the last slot. */
405*0Sstevel@tonic-gate 		if (pii->pii_probes[pr_ndx].pr_status == PR_LOST)
406*0Sstevel@tonic-gate 			pii->pii_cum_stats.lost++;
407*0Sstevel@tonic-gate 		else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED)
408*0Sstevel@tonic-gate 			pii->pii_cum_stats.acked++;
409*0Sstevel@tonic-gate 		pii->pii_cum_stats.sent++;
410*0Sstevel@tonic-gate 
411*0Sstevel@tonic-gate 		pii->pii_probes[pr_ndx].pr_status = PR_UNACKED;
412*0Sstevel@tonic-gate 		pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next;
413*0Sstevel@tonic-gate 		pii->pii_probes[pr_ndx].pr_time_sent = cur_time;
414*0Sstevel@tonic-gate 		pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next);
415*0Sstevel@tonic-gate 		pii->pii_target_next = target_next(pii->pii_target_next);
416*0Sstevel@tonic-gate 		assert(pii->pii_target_next != NULL);
417*0Sstevel@tonic-gate 		/*
418*0Sstevel@tonic-gate 		 * If we have a single variable to denote the next target to
419*0Sstevel@tonic-gate 		 * probe for both rtt probes and failure detection probes, we
420*0Sstevel@tonic-gate 		 * could end up with a situation where the failure detection
421*0Sstevel@tonic-gate 		 * probe targets become disjoint from the rtt probe targets.
422*0Sstevel@tonic-gate 		 * Eg. if 2 targets and the actual fdt is double the user
423*0Sstevel@tonic-gate 		 * specified fdt. So we have 2 variables. In this scheme
424*0Sstevel@tonic-gate 		 * we also reset pii_rtt_target_next for every fdt probe,
425*0Sstevel@tonic-gate 		 * though that may not be necessary.
426*0Sstevel@tonic-gate 		 */
427*0Sstevel@tonic-gate 		pii->pii_rtt_target_next = pii->pii_target_next;
428*0Sstevel@tonic-gate 		pii->pii_snxt++;
429*0Sstevel@tonic-gate 	} else if (probe_type == PROBE_RTT) {
430*0Sstevel@tonic-gate 		pii->pii_rtt_target_next =
431*0Sstevel@tonic-gate 		    target_next(pii->pii_rtt_target_next);
432*0Sstevel@tonic-gate 		assert(pii->pii_rtt_target_next != NULL);
433*0Sstevel@tonic-gate 	}
434*0Sstevel@tonic-gate }
435*0Sstevel@tonic-gate 
436*0Sstevel@tonic-gate /*
437*0Sstevel@tonic-gate  * Incoming IPv4 data from wire, is received here. Called from main.
438*0Sstevel@tonic-gate  */
439*0Sstevel@tonic-gate void
440*0Sstevel@tonic-gate in_data(struct phyint_instance *pii)
441*0Sstevel@tonic-gate {
442*0Sstevel@tonic-gate 	struct	sockaddr_in 	from;
443*0Sstevel@tonic-gate 	struct	in6_addr	fromaddr;
444*0Sstevel@tonic-gate 	uint_t	fromlen;
445*0Sstevel@tonic-gate 	static uint_t in_packet[(IP_MAXPACKET + 1)/4];
446*0Sstevel@tonic-gate 	struct ip *ip;
447*0Sstevel@tonic-gate 	int 	iphlen;
448*0Sstevel@tonic-gate 	int 	len;
449*0Sstevel@tonic-gate 	char 	abuf[INET_ADDRSTRLEN];
450*0Sstevel@tonic-gate 	struct	pr_icmp	*reply;
451*0Sstevel@tonic-gate 
452*0Sstevel@tonic-gate 	if (debug & D_PROBE) {
453*0Sstevel@tonic-gate 		logdebug("in_data(%s %s)\n",
454*0Sstevel@tonic-gate 		    AF_STR(pii->pii_af), pii->pii_name);
455*0Sstevel@tonic-gate 	}
456*0Sstevel@tonic-gate 
457*0Sstevel@tonic-gate 	/*
458*0Sstevel@tonic-gate 	 * Poll has already told us that a message is waiting,
459*0Sstevel@tonic-gate 	 * on this socket. Read it now. We should not block.
460*0Sstevel@tonic-gate 	 */
461*0Sstevel@tonic-gate 	fromlen = sizeof (from);
462*0Sstevel@tonic-gate 	len = recvfrom(pii->pii_probe_sock, (char *)in_packet,
463*0Sstevel@tonic-gate 	    sizeof (in_packet), 0, (struct sockaddr *)&from, &fromlen);
464*0Sstevel@tonic-gate 	if (len < 0) {
465*0Sstevel@tonic-gate 		logperror_pii(pii, "in_data: recvfrom");
466*0Sstevel@tonic-gate 		return;
467*0Sstevel@tonic-gate 	}
468*0Sstevel@tonic-gate 
469*0Sstevel@tonic-gate 	/*
470*0Sstevel@tonic-gate 	 * If the NIC has indicated the link is down, don't go
471*0Sstevel@tonic-gate 	 * any further.
472*0Sstevel@tonic-gate 	 */
473*0Sstevel@tonic-gate 	if (LINK_DOWN(pii->pii_phyint))
474*0Sstevel@tonic-gate 		return;
475*0Sstevel@tonic-gate 
476*0Sstevel@tonic-gate 	/* Get the printable address for error reporting */
477*0Sstevel@tonic-gate 	(void) inet_ntop(AF_INET, &from.sin_addr, abuf, sizeof (abuf));
478*0Sstevel@tonic-gate 
479*0Sstevel@tonic-gate 	/* Make sure packet contains at least minimum ICMP header */
480*0Sstevel@tonic-gate 	ip = (struct ip *)in_packet;
481*0Sstevel@tonic-gate 	iphlen = ip->ip_hl << 2;
482*0Sstevel@tonic-gate 	if (len < iphlen + ICMP_MINLEN) {
483*0Sstevel@tonic-gate 		if (debug & D_PKTBAD) {
484*0Sstevel@tonic-gate 			logdebug("in_data: packet too short (%d bytes)"
485*0Sstevel@tonic-gate 			    " from %s\n", len, abuf);
486*0Sstevel@tonic-gate 		}
487*0Sstevel@tonic-gate 		return;
488*0Sstevel@tonic-gate 	}
489*0Sstevel@tonic-gate 
490*0Sstevel@tonic-gate 	/*
491*0Sstevel@tonic-gate 	 * Subtract the IP hdr length, 'len' will be length of the probe
492*0Sstevel@tonic-gate 	 * reply, starting from the icmp hdr.
493*0Sstevel@tonic-gate 	 */
494*0Sstevel@tonic-gate 	len -= iphlen;
495*0Sstevel@tonic-gate 	/* LINTED */
496*0Sstevel@tonic-gate 	reply = (struct pr_icmp *)((char *)in_packet + iphlen);
497*0Sstevel@tonic-gate 
498*0Sstevel@tonic-gate 	/* Probe replies are icmp echo replies. Ignore anything else */
499*0Sstevel@tonic-gate 	if (reply->pr_icmp_type != ICMP_ECHO_REPLY)
500*0Sstevel@tonic-gate 		return;
501*0Sstevel@tonic-gate 
502*0Sstevel@tonic-gate 	/*
503*0Sstevel@tonic-gate 	 * The icmp id should match what we sent, which is stored
504*0Sstevel@tonic-gate 	 * in pi_icmpid. The icmp code for reply must be 0.
505*0Sstevel@tonic-gate 	 * The reply content must be a struct pr_icmp
506*0Sstevel@tonic-gate 	 */
507*0Sstevel@tonic-gate 	if (reply->pr_icmp_id != pii->pii_icmpid) {
508*0Sstevel@tonic-gate 		/* Not in response to our probe */
509*0Sstevel@tonic-gate 		return;
510*0Sstevel@tonic-gate 	}
511*0Sstevel@tonic-gate 
512*0Sstevel@tonic-gate 	if (reply->pr_icmp_code != 0) {
513*0Sstevel@tonic-gate 		logtrace("probe reply code %d from %s on %s\n",
514*0Sstevel@tonic-gate 		    reply->pr_icmp_code, abuf, pii->pii_name);
515*0Sstevel@tonic-gate 		return;
516*0Sstevel@tonic-gate 	}
517*0Sstevel@tonic-gate 
518*0Sstevel@tonic-gate 	if (len < sizeof (struct pr_icmp)) {
519*0Sstevel@tonic-gate 		logtrace("probe reply too short: %d bytes from %s on %s\n",
520*0Sstevel@tonic-gate 		    len, abuf, pii->pii_name);
521*0Sstevel@tonic-gate 		return;
522*0Sstevel@tonic-gate 	}
523*0Sstevel@tonic-gate 
524*0Sstevel@tonic-gate 	IN6_INADDR_TO_V4MAPPED(&from.sin_addr, &fromaddr);
525*0Sstevel@tonic-gate 	if (reply->pr_icmp_mtype == htonl(PROBE_UNI))
526*0Sstevel@tonic-gate 		/* Unicast probe reply */
527*0Sstevel@tonic-gate 		incoming_echo_reply(pii, reply, fromaddr);
528*0Sstevel@tonic-gate 	else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
529*0Sstevel@tonic-gate 		/* Multicast reply */
530*0Sstevel@tonic-gate 		incoming_mcast_reply(pii, reply, fromaddr);
531*0Sstevel@tonic-gate 	} else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
532*0Sstevel@tonic-gate 		incoming_rtt_reply(pii, reply, fromaddr);
533*0Sstevel@tonic-gate 	} else {
534*0Sstevel@tonic-gate 		/* Probably not in response to our probe */
535*0Sstevel@tonic-gate 		logtrace("probe reply type: %d from %s on %s\n",
536*0Sstevel@tonic-gate 		    reply->pr_icmp_mtype, abuf, pii->pii_name);
537*0Sstevel@tonic-gate 		return;
538*0Sstevel@tonic-gate 	}
539*0Sstevel@tonic-gate 
540*0Sstevel@tonic-gate }
541*0Sstevel@tonic-gate 
542*0Sstevel@tonic-gate /*
543*0Sstevel@tonic-gate  * Incoming IPv6 data from wire is received here. Called from main.
544*0Sstevel@tonic-gate  */
545*0Sstevel@tonic-gate void
546*0Sstevel@tonic-gate in6_data(struct phyint_instance *pii)
547*0Sstevel@tonic-gate {
548*0Sstevel@tonic-gate 	struct sockaddr_in6 from;
549*0Sstevel@tonic-gate 	static uint64_t in_packet[(IP_MAXPACKET + 1)/8];
550*0Sstevel@tonic-gate 	static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8];
551*0Sstevel@tonic-gate 	int len;
552*0Sstevel@tonic-gate 	char abuf[INET6_ADDRSTRLEN];
553*0Sstevel@tonic-gate 	struct msghdr msg;
554*0Sstevel@tonic-gate 	struct iovec iov;
555*0Sstevel@tonic-gate 	uchar_t *opt;
556*0Sstevel@tonic-gate 	struct	pr_icmp *reply;
557*0Sstevel@tonic-gate 
558*0Sstevel@tonic-gate 	if (debug & D_PROBE) {
559*0Sstevel@tonic-gate 		logdebug("in6_data(%s %s)\n",
560*0Sstevel@tonic-gate 		    AF_STR(pii->pii_af), pii->pii_name);
561*0Sstevel@tonic-gate 	}
562*0Sstevel@tonic-gate 
563*0Sstevel@tonic-gate 	iov.iov_base = (char *)in_packet;
564*0Sstevel@tonic-gate 	iov.iov_len = sizeof (in_packet);
565*0Sstevel@tonic-gate 	msg.msg_iov = &iov;
566*0Sstevel@tonic-gate 	msg.msg_iovlen = 1;
567*0Sstevel@tonic-gate 	msg.msg_name = (struct sockaddr *)&from;
568*0Sstevel@tonic-gate 	msg.msg_namelen = sizeof (from);
569*0Sstevel@tonic-gate 	msg.msg_control = ancillary_data;
570*0Sstevel@tonic-gate 	msg.msg_controllen = sizeof (ancillary_data);
571*0Sstevel@tonic-gate 
572*0Sstevel@tonic-gate 	if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) {
573*0Sstevel@tonic-gate 		logperror_pii(pii, "in6_data: recvfrom");
574*0Sstevel@tonic-gate 		return;
575*0Sstevel@tonic-gate 	}
576*0Sstevel@tonic-gate 
577*0Sstevel@tonic-gate 	/*
578*0Sstevel@tonic-gate 	 * If the NIC has indicated that the link is down, don't go
579*0Sstevel@tonic-gate 	 * any further.
580*0Sstevel@tonic-gate 	 */
581*0Sstevel@tonic-gate 	if (LINK_DOWN(pii->pii_phyint))
582*0Sstevel@tonic-gate 		return;
583*0Sstevel@tonic-gate 
584*0Sstevel@tonic-gate 	/* Get the printable address for error reporting */
585*0Sstevel@tonic-gate 	(void) inet_ntop(AF_INET6, &from.sin6_addr, abuf, sizeof (abuf));
586*0Sstevel@tonic-gate 	if (len < ICMP_MINLEN) {
587*0Sstevel@tonic-gate 		if (debug & D_PKTBAD) {
588*0Sstevel@tonic-gate 			logdebug("Truncated message: msg_flags 0x%x from %s\n",
589*0Sstevel@tonic-gate 			    msg.msg_flags, abuf);
590*0Sstevel@tonic-gate 		}
591*0Sstevel@tonic-gate 		return;
592*0Sstevel@tonic-gate 	}
593*0Sstevel@tonic-gate 	/* Ignore packets > 64k or control buffers that don't fit */
594*0Sstevel@tonic-gate 	if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) {
595*0Sstevel@tonic-gate 		if (debug & D_PKTBAD) {
596*0Sstevel@tonic-gate 			logdebug("Truncated message: msg_flags 0x%x from %s\n",
597*0Sstevel@tonic-gate 			    msg.msg_flags, abuf);
598*0Sstevel@tonic-gate 		}
599*0Sstevel@tonic-gate 		return;
600*0Sstevel@tonic-gate 	}
601*0Sstevel@tonic-gate 
602*0Sstevel@tonic-gate 	reply = (struct pr_icmp *)in_packet;
603*0Sstevel@tonic-gate 	if (reply->pr_icmp_type != ICMP6_ECHO_REPLY)
604*0Sstevel@tonic-gate 		return;
605*0Sstevel@tonic-gate 
606*0Sstevel@tonic-gate 	if (reply->pr_icmp_id != pii->pii_icmpid) {
607*0Sstevel@tonic-gate 		/* Not in response to our probe */
608*0Sstevel@tonic-gate 		return;
609*0Sstevel@tonic-gate 	}
610*0Sstevel@tonic-gate 
611*0Sstevel@tonic-gate 	/*
612*0Sstevel@tonic-gate 	 * The kernel has already verified the the ICMP checksum.
613*0Sstevel@tonic-gate 	 */
614*0Sstevel@tonic-gate 	if (!IN6_IS_ADDR_LINKLOCAL(&from.sin6_addr)) {
615*0Sstevel@tonic-gate 		logtrace("ICMPv6 echo reply source address not linklocal from "
616*0Sstevel@tonic-gate 		    "%s on %s\n", abuf, pii->pii_name);
617*0Sstevel@tonic-gate 		return;
618*0Sstevel@tonic-gate 	}
619*0Sstevel@tonic-gate 	opt = find_ancillary(&msg, IPV6_RTHDR);
620*0Sstevel@tonic-gate 	if (opt != NULL) {
621*0Sstevel@tonic-gate 		/* Can't allow routing headers in probe replies  */
622*0Sstevel@tonic-gate 		logtrace("message with routing header from %s on %s\n",
623*0Sstevel@tonic-gate 		    abuf, pii->pii_name);
624*0Sstevel@tonic-gate 		return;
625*0Sstevel@tonic-gate 	}
626*0Sstevel@tonic-gate 	if (reply->pr_icmp_code != 0) {
627*0Sstevel@tonic-gate 		logtrace("probe reply code: %d from %s on %s\n",
628*0Sstevel@tonic-gate 		    reply->pr_icmp_code, abuf, pii->pii_name);
629*0Sstevel@tonic-gate 		return;
630*0Sstevel@tonic-gate 	}
631*0Sstevel@tonic-gate 	if (len < (sizeof (struct pr_icmp))) {
632*0Sstevel@tonic-gate 		logtrace("probe reply too short: %d bytes from %s on %s\n",
633*0Sstevel@tonic-gate 		    len, abuf, pii->pii_name);
634*0Sstevel@tonic-gate 		return;
635*0Sstevel@tonic-gate 	}
636*0Sstevel@tonic-gate 	if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) {
637*0Sstevel@tonic-gate 		incoming_echo_reply(pii, reply, from.sin6_addr);
638*0Sstevel@tonic-gate 	} else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
639*0Sstevel@tonic-gate 		incoming_mcast_reply(pii, reply, from.sin6_addr);
640*0Sstevel@tonic-gate 	} else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
641*0Sstevel@tonic-gate 		incoming_rtt_reply(pii, reply, from.sin6_addr);
642*0Sstevel@tonic-gate 	} else  {
643*0Sstevel@tonic-gate 		/* Probably not in response to our probe */
644*0Sstevel@tonic-gate 		logtrace("probe reply type: %d from %s on %s\n",
645*0Sstevel@tonic-gate 		    reply->pr_icmp_mtype, abuf, pii->pii_name);
646*0Sstevel@tonic-gate 	}
647*0Sstevel@tonic-gate }
648*0Sstevel@tonic-gate 
649*0Sstevel@tonic-gate /*
650*0Sstevel@tonic-gate  * Process the incoming rtt reply, in response to our rtt probe.
651*0Sstevel@tonic-gate  * Common for both IPv4 and IPv6. Unlike incoming_echo_reply() we don't
652*0Sstevel@tonic-gate  * have any stored information about the probe we sent. So we don't log
653*0Sstevel@tonic-gate  * any errors if we receive bad replies.
654*0Sstevel@tonic-gate  */
655*0Sstevel@tonic-gate static void
656*0Sstevel@tonic-gate incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply,
657*0Sstevel@tonic-gate     struct in6_addr fromaddr)
658*0Sstevel@tonic-gate {
659*0Sstevel@tonic-gate 	int 	m;		/* rtt measurment in ms */
660*0Sstevel@tonic-gate 	uint32_t cur_time;	/* in ms from some arbitrary point */
661*0Sstevel@tonic-gate 	char	abuf[INET6_ADDRSTRLEN];
662*0Sstevel@tonic-gate 	struct	target	*target;
663*0Sstevel@tonic-gate 	uint32_t pr_icmp_timestamp;
664*0Sstevel@tonic-gate 	struct 	phyint_group *pg;
665*0Sstevel@tonic-gate 
666*0Sstevel@tonic-gate 	/* Get the printable address for error reporting */
667*0Sstevel@tonic-gate 	(void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));
668*0Sstevel@tonic-gate 
669*0Sstevel@tonic-gate 	if (debug & D_PROBE) {
670*0Sstevel@tonic-gate 		logdebug("incoming_rtt_reply: %s %s %s\n",
671*0Sstevel@tonic-gate 		    AF_STR(pii->pii_af), pii->pii_name, abuf);
672*0Sstevel@tonic-gate 	}
673*0Sstevel@tonic-gate 
674*0Sstevel@tonic-gate 	/* Do we know this target ? */
675*0Sstevel@tonic-gate 	target = target_lookup(pii, fromaddr);
676*0Sstevel@tonic-gate 	if (target == NULL)
677*0Sstevel@tonic-gate 		return;
678*0Sstevel@tonic-gate 
679*0Sstevel@tonic-gate 	pr_icmp_timestamp  = ntohl(reply->pr_icmp_timestamp);
680*0Sstevel@tonic-gate 	cur_time = getcurrenttime();
681*0Sstevel@tonic-gate 	m = (int)(cur_time - pr_icmp_timestamp);
682*0Sstevel@tonic-gate 
683*0Sstevel@tonic-gate 	/* Invalid rtt. It has wrapped around */
684*0Sstevel@tonic-gate 	if (m < 0)
685*0Sstevel@tonic-gate 		return;
686*0Sstevel@tonic-gate 
687*0Sstevel@tonic-gate 	/*
688*0Sstevel@tonic-gate 	 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
689*0Sstevel@tonic-gate 	 * The initial few responses after the interface is repaired may
690*0Sstevel@tonic-gate 	 * contain high rtt's because they could have been queued up waiting
691*0Sstevel@tonic-gate 	 * for ARP/NDP resolution on a failed interface.
692*0Sstevel@tonic-gate 	 */
693*0Sstevel@tonic-gate 	pg = pii->pii_phyint->pi_group;
694*0Sstevel@tonic-gate 	if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
695*0Sstevel@tonic-gate 		return;
696*0Sstevel@tonic-gate 
697*0Sstevel@tonic-gate 	/*
698*0Sstevel@tonic-gate 	 * Update rtt only if the new rtt is lower than the current rtt.
699*0Sstevel@tonic-gate 	 * (specified by the 3rd parameter to pi_set_crtt).
700*0Sstevel@tonic-gate 	 * If a spike has caused the current probe_interval to be >
701*0Sstevel@tonic-gate 	 * user_probe_interval, then this mechanism is used to bring down
702*0Sstevel@tonic-gate 	 * the rtt rapidly once the network stress is removed.
703*0Sstevel@tonic-gate 	 * If the new rtt is higher than the current rtt, we don't want to
704*0Sstevel@tonic-gate 	 * update the rtt. We are having more than 1 outstanding probe and
705*0Sstevel@tonic-gate 	 * the increase in rtt we are seeing is being unnecessarily weighted
706*0Sstevel@tonic-gate 	 * many times. The regular rtt update will be handled by
707*0Sstevel@tonic-gate 	 * incoming_echo_reply() and will take care of any rtt increase.
708*0Sstevel@tonic-gate 	 */
709*0Sstevel@tonic-gate 	pi_set_crtt(target, m, _B_FALSE);
710*0Sstevel@tonic-gate 	if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
711*0Sstevel@tonic-gate 	    (user_failure_detection_time < pg->pg_fdt) &&
712*0Sstevel@tonic-gate 	    (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
713*0Sstevel@tonic-gate 		/*
714*0Sstevel@tonic-gate 		 * If the crtt has now dropped by a factor of LOWER_FT_TRIGGER,
715*0Sstevel@tonic-gate 		 * investigate if we can improve the failure detection time to
716*0Sstevel@tonic-gate 		 * meet whatever the user specified.
717*0Sstevel@tonic-gate 		 */
718*0Sstevel@tonic-gate 		if (check_pg_crtt_improved(pg)) {
719*0Sstevel@tonic-gate 			pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
720*0Sstevel@tonic-gate 			    user_failure_detection_time);
721*0Sstevel@tonic-gate 			pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
722*0Sstevel@tonic-gate 			if (pii->pii_phyint->pi_group != phyint_anongroup) {
723*0Sstevel@tonic-gate 				logerr("Improved failure detection time %d ms "
724*0Sstevel@tonic-gate 				    "on (%s %s) for group \"%s\"\n",
725*0Sstevel@tonic-gate 				    pg->pg_fdt, AF_STR(pii->pii_af),
726*0Sstevel@tonic-gate 				    pii->pii_name,
727*0Sstevel@tonic-gate 				    pii->pii_phyint->pi_group->pg_name);
728*0Sstevel@tonic-gate 			}
729*0Sstevel@tonic-gate 			if (user_failure_detection_time == pg->pg_fdt) {
730*0Sstevel@tonic-gate 				/* Avoid any truncation or rounding errors */
731*0Sstevel@tonic-gate 				pg->pg_probeint = user_probe_interval;
732*0Sstevel@tonic-gate 				/*
733*0Sstevel@tonic-gate 				 * No more rtt probes will be sent. The actual
734*0Sstevel@tonic-gate 				 * fdt has dropped to the user specified value.
735*0Sstevel@tonic-gate 				 * pii_fd_snxt_basetime and pii_snxt_basetime
736*0Sstevel@tonic-gate 				 * will be in sync henceforth.
737*0Sstevel@tonic-gate 				 */
738*0Sstevel@tonic-gate 				reset_snxt_basetimes();
739*0Sstevel@tonic-gate 			}
740*0Sstevel@tonic-gate 		}
741*0Sstevel@tonic-gate 	}
742*0Sstevel@tonic-gate }
743*0Sstevel@tonic-gate 
744*0Sstevel@tonic-gate /*
745*0Sstevel@tonic-gate  * Process the incoming echo reply, in response to our unicast probe.
746*0Sstevel@tonic-gate  * Common for both IPv4 and IPv6
747*0Sstevel@tonic-gate  */
748*0Sstevel@tonic-gate static void
749*0Sstevel@tonic-gate incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply,
750*0Sstevel@tonic-gate     struct in6_addr fromaddr)
751*0Sstevel@tonic-gate {
752*0Sstevel@tonic-gate 	int 	m;		/* rtt measurment in ms */
753*0Sstevel@tonic-gate 	uint32_t cur_time;	/* in ms from some arbitrary point */
754*0Sstevel@tonic-gate 	char	abuf[INET6_ADDRSTRLEN];
755*0Sstevel@tonic-gate 	int	pr_ndx;
756*0Sstevel@tonic-gate 	struct	target	*target;
757*0Sstevel@tonic-gate 	boolean_t exception;
758*0Sstevel@tonic-gate 	uint32_t pr_icmp_timestamp;
759*0Sstevel@tonic-gate 	uint16_t pr_icmp_seq;
760*0Sstevel@tonic-gate 	struct 	phyint_group *pg = pii->pii_phyint->pi_group;
761*0Sstevel@tonic-gate 
762*0Sstevel@tonic-gate 	/* Get the printable address for error reporting */
763*0Sstevel@tonic-gate 	(void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));
764*0Sstevel@tonic-gate 
765*0Sstevel@tonic-gate 	if (debug & D_PROBE) {
766*0Sstevel@tonic-gate 		logdebug("incoming_echo_reply: %s %s %s seq %u\n",
767*0Sstevel@tonic-gate 		    AF_STR(pii->pii_af), pii->pii_name, abuf,
768*0Sstevel@tonic-gate 		    ntohs(reply->pr_icmp_seq));
769*0Sstevel@tonic-gate 	}
770*0Sstevel@tonic-gate 
771*0Sstevel@tonic-gate 	pr_icmp_timestamp  = ntohl(reply->pr_icmp_timestamp);
772*0Sstevel@tonic-gate 	pr_icmp_seq  = ntohs(reply->pr_icmp_seq);
773*0Sstevel@tonic-gate 
774*0Sstevel@tonic-gate 	/* Reject out of window probe replies */
775*0Sstevel@tonic-gate 	if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) ||
776*0Sstevel@tonic-gate 	    SEQ_LT(pr_icmp_seq, pii->pii_snxt - PROBE_STATS_COUNT)) {
777*0Sstevel@tonic-gate 		logtrace("out of window probe seq %u snxt %u on %s from %s\n",
778*0Sstevel@tonic-gate 		    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
779*0Sstevel@tonic-gate 		pii->pii_cum_stats.unknown++;
780*0Sstevel@tonic-gate 		return;
781*0Sstevel@tonic-gate 	}
782*0Sstevel@tonic-gate 	cur_time = getcurrenttime();
783*0Sstevel@tonic-gate 	m = (int)(cur_time - pr_icmp_timestamp);
784*0Sstevel@tonic-gate 	if (m < 0) {
785*0Sstevel@tonic-gate 		/*
786*0Sstevel@tonic-gate 		 * This is a ridiculously high value of rtt. rtt has wrapped
787*0Sstevel@tonic-gate 		 * around. Log a message, and ignore the rtt.
788*0Sstevel@tonic-gate 		 */
789*0Sstevel@tonic-gate 		logerr("incoming_echo_reply: rtt wraparound cur_time %u reply "
790*0Sstevel@tonic-gate 		    "timestamp %u\n", cur_time, pr_icmp_timestamp);
791*0Sstevel@tonic-gate 	}
792*0Sstevel@tonic-gate 
793*0Sstevel@tonic-gate 	/*
794*0Sstevel@tonic-gate 	 * Get the probe index pr_ndx corresponding to the received icmp seq.
795*0Sstevel@tonic-gate 	 * number in our pii->pii_probes[] array. The icmp sequence number
796*0Sstevel@tonic-gate 	 * pii_snxt corresponds to the probe index pii->pii_probe_next
797*0Sstevel@tonic-gate 	 */
798*0Sstevel@tonic-gate 	pr_ndx = MOD_SUB(pii->pii_probe_next,
799*0Sstevel@tonic-gate 	    (uint16_t)(pii->pii_snxt - pr_icmp_seq), PROBE_STATS_COUNT);
800*0Sstevel@tonic-gate 
801*0Sstevel@tonic-gate 	assert(PR_STATUS_VALID(pii->pii_probes[pr_ndx].pr_status));
802*0Sstevel@tonic-gate 
803*0Sstevel@tonic-gate 	target = pii->pii_probes[pr_ndx].pr_target;
804*0Sstevel@tonic-gate 
805*0Sstevel@tonic-gate 	/*
806*0Sstevel@tonic-gate 	 * Perform sanity checks, whether this probe reply that we
807*0Sstevel@tonic-gate 	 * have received is genuine
808*0Sstevel@tonic-gate 	 */
809*0Sstevel@tonic-gate 	if (target != NULL) {
810*0Sstevel@tonic-gate 		/*
811*0Sstevel@tonic-gate 		 * Compare the src. addr of the received ICMP or ICMPv6
812*0Sstevel@tonic-gate 		 * probe reply with the target address in our tables.
813*0Sstevel@tonic-gate 		 */
814*0Sstevel@tonic-gate 		if (!IN6_ARE_ADDR_EQUAL(&target->tg_address, &fromaddr)) {
815*0Sstevel@tonic-gate 			/*
816*0Sstevel@tonic-gate 			 * We don't have any record of having sent a probe to
817*0Sstevel@tonic-gate 			 * this target. This is a fake probe reply. Log an error
818*0Sstevel@tonic-gate 			 */
819*0Sstevel@tonic-gate 			logtrace("probe status %d Fake probe reply seq %u "
820*0Sstevel@tonic-gate 			    "snxt %u on %s from %s\n",
821*0Sstevel@tonic-gate 			    pii->pii_probes[pr_ndx].pr_status,
822*0Sstevel@tonic-gate 			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
823*0Sstevel@tonic-gate 			pii->pii_cum_stats.unknown++;
824*0Sstevel@tonic-gate 			return;
825*0Sstevel@tonic-gate 		} else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
826*0Sstevel@tonic-gate 			/*
827*0Sstevel@tonic-gate 			 * The address matches, but our tables indicate that
828*0Sstevel@tonic-gate 			 * this probe reply has been acked already. So this
829*0Sstevel@tonic-gate 			 * is a duplicate probe reply. Log an error
830*0Sstevel@tonic-gate 			 */
831*0Sstevel@tonic-gate 			logtrace("probe status %d Duplicate probe reply seq %u "
832*0Sstevel@tonic-gate 			    "snxt %u on %s from %s\n",
833*0Sstevel@tonic-gate 			    pii->pii_probes[pr_ndx].pr_status,
834*0Sstevel@tonic-gate 			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
835*0Sstevel@tonic-gate 			pii->pii_cum_stats.unknown++;
836*0Sstevel@tonic-gate 			return;
837*0Sstevel@tonic-gate 		}
838*0Sstevel@tonic-gate 	} else {
839*0Sstevel@tonic-gate 		/*
840*0Sstevel@tonic-gate 		 * Target must not be NULL in the PR_UNACKED state
841*0Sstevel@tonic-gate 		 */
842*0Sstevel@tonic-gate 		assert(pii->pii_probes[pr_ndx].pr_status != PR_UNACKED);
843*0Sstevel@tonic-gate 		if (pii->pii_probes[pr_ndx].pr_status == PR_UNUSED) {
844*0Sstevel@tonic-gate 			/*
845*0Sstevel@tonic-gate 			 * The probe stats slot is unused. So we didn't
846*0Sstevel@tonic-gate 			 * send out any probe to this target. This is a fake.
847*0Sstevel@tonic-gate 			 * Log an error.
848*0Sstevel@tonic-gate 			 */
849*0Sstevel@tonic-gate 			logtrace("probe status %d Fake probe reply seq %u "
850*0Sstevel@tonic-gate 			    "snxt %u on %s from %s\n",
851*0Sstevel@tonic-gate 			    pii->pii_probes[pr_ndx].pr_status,
852*0Sstevel@tonic-gate 			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
853*0Sstevel@tonic-gate 		}
854*0Sstevel@tonic-gate 		pii->pii_cum_stats.unknown++;
855*0Sstevel@tonic-gate 		return;
856*0Sstevel@tonic-gate 	}
857*0Sstevel@tonic-gate 
858*0Sstevel@tonic-gate 	/*
859*0Sstevel@tonic-gate 	 * If the rtt does not appear to be right, don't update the
860*0Sstevel@tonic-gate 	 * rtt stats. This can happen if the system dropped into the
861*0Sstevel@tonic-gate 	 * debugger, or the system was hung or too busy for a
862*0Sstevel@tonic-gate 	 * substantial time that we didn't get a chance to run.
863*0Sstevel@tonic-gate 	 */
864*0Sstevel@tonic-gate 	if ((m < 0) || (m > PROBE_STATS_COUNT * pg->pg_probeint)) {
865*0Sstevel@tonic-gate 		/*
866*0Sstevel@tonic-gate 		 * If the probe corresponding to this receieved response
867*0Sstevel@tonic-gate 		 * was truly sent 'm' ms. ago, then this response must
868*0Sstevel@tonic-gate 		 * have been rejected by the sequence number checks. The
869*0Sstevel@tonic-gate 		 * fact that it has passed the sequence number checks
870*0Sstevel@tonic-gate 		 * means that the measured rtt is wrong. We were probably
871*0Sstevel@tonic-gate 		 * scheduled long after the packet was received.
872*0Sstevel@tonic-gate 		 */
873*0Sstevel@tonic-gate 		goto out;
874*0Sstevel@tonic-gate 	}
875*0Sstevel@tonic-gate 
876*0Sstevel@tonic-gate 	/*
877*0Sstevel@tonic-gate 	 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
878*0Sstevel@tonic-gate 	 * The initial few responses after the interface is repaired may
879*0Sstevel@tonic-gate 	 * contain high rtt's because they could have been queued up waiting
880*0Sstevel@tonic-gate 	 * for ARP/NDP resolution on a failed interface.
881*0Sstevel@tonic-gate 	 */
882*0Sstevel@tonic-gate 	if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
883*0Sstevel@tonic-gate 		goto out;
884*0Sstevel@tonic-gate 
885*0Sstevel@tonic-gate 	/*
886*0Sstevel@tonic-gate 	 * Don't update the Conservative Round Trip Time estimate for this
887*0Sstevel@tonic-gate 	 * (phint, target) pair if this is the not the highest ack seq seen
888*0Sstevel@tonic-gate 	 * thus far on this target.
889*0Sstevel@tonic-gate 	 */
890*0Sstevel@tonic-gate 	if (!highest_ack_tg(pr_icmp_seq, target))
891*0Sstevel@tonic-gate 		goto out;
892*0Sstevel@tonic-gate 
893*0Sstevel@tonic-gate 	/*
894*0Sstevel@tonic-gate 	 * Always update the rtt. This is a failure detection probe
895*0Sstevel@tonic-gate 	 * and we want to measure both increase / decrease in rtt.
896*0Sstevel@tonic-gate 	 */
897*0Sstevel@tonic-gate 	pi_set_crtt(target, m, _B_TRUE);
898*0Sstevel@tonic-gate 
899*0Sstevel@tonic-gate 	/*
900*0Sstevel@tonic-gate 	 * If the crtt exceeds the average time between probes,
901*0Sstevel@tonic-gate 	 * investigate if this slow target is an exception. If so we
902*0Sstevel@tonic-gate 	 * can avoid this target and still meet the failure detection
903*0Sstevel@tonic-gate 	 * time. Otherwise we can't meet the failure detection time.
904*0Sstevel@tonic-gate 	 */
905*0Sstevel@tonic-gate 	if (target->tg_crtt > pg->pg_probeint) {
906*0Sstevel@tonic-gate 		exception = check_exception_target(pii, target);
907*0Sstevel@tonic-gate 		if (exception) {
908*0Sstevel@tonic-gate 			/*
909*0Sstevel@tonic-gate 			 * This target is exceptionally slow. Don't use it
910*0Sstevel@tonic-gate 			 * for future probes. check_exception_target() has
911*0Sstevel@tonic-gate 			 * made sure that we have at least MIN_PROBE_TARGETS
912*0Sstevel@tonic-gate 			 * other active targets
913*0Sstevel@tonic-gate 			 */
914*0Sstevel@tonic-gate 			if (pii->pii_targets_are_routers) {
915*0Sstevel@tonic-gate 				/*
916*0Sstevel@tonic-gate 				 * This is a slow router, mark it as slow
917*0Sstevel@tonic-gate 				 * and don't use it for further probes. We
918*0Sstevel@tonic-gate 				 * don't delete it, since it will be populated
919*0Sstevel@tonic-gate 				 * again when we do a router scan. Hence we
920*0Sstevel@tonic-gate 				 * need to maintain extra state (unlike the
921*0Sstevel@tonic-gate 				 * host case below).  Mark it as TG_SLOW.
922*0Sstevel@tonic-gate 				 */
923*0Sstevel@tonic-gate 				if (target->tg_status == TG_ACTIVE)
924*0Sstevel@tonic-gate 					pii->pii_ntargets--;
925*0Sstevel@tonic-gate 				target->tg_status = TG_SLOW;
926*0Sstevel@tonic-gate 				target->tg_latime = gethrtime();
927*0Sstevel@tonic-gate 				target->tg_rtt_sa = -1;
928*0Sstevel@tonic-gate 				target->tg_crtt = 0;
929*0Sstevel@tonic-gate 				target->tg_rtt_sd = 0;
930*0Sstevel@tonic-gate 				if (pii->pii_target_next == target) {
931*0Sstevel@tonic-gate 					pii->pii_target_next =
932*0Sstevel@tonic-gate 					    target_next(target);
933*0Sstevel@tonic-gate 				}
934*0Sstevel@tonic-gate 			} else {
935*0Sstevel@tonic-gate 				/*
936*0Sstevel@tonic-gate 				 * the slow target is not a router, we can
937*0Sstevel@tonic-gate 				 * just delete it. Send an icmp multicast and
938*0Sstevel@tonic-gate 				 * pick the fastest responder that is not
939*0Sstevel@tonic-gate 				 * already an active target. target_delete()
940*0Sstevel@tonic-gate 				 * adjusts pii->pii_target_next
941*0Sstevel@tonic-gate 				 */
942*0Sstevel@tonic-gate 				target_delete(target);
943*0Sstevel@tonic-gate 				probe(pii, PROBE_MULTI, cur_time);
944*0Sstevel@tonic-gate 			}
945*0Sstevel@tonic-gate 		} else {
946*0Sstevel@tonic-gate 			/*
947*0Sstevel@tonic-gate 			 * We can't meet the failure detection time.
948*0Sstevel@tonic-gate 			 * Log a message, and update the detection time to
949*0Sstevel@tonic-gate 			 * whatever we can achieve.
950*0Sstevel@tonic-gate 			 */
951*0Sstevel@tonic-gate 			pg->pg_probeint = target->tg_crtt * NEXT_FDT_MULTIPLE;
952*0Sstevel@tonic-gate 			pg->pg_fdt = pg->pg_probeint * (NUM_PROBE_FAILS + 2);
953*0Sstevel@tonic-gate 			last_fdt_bumpup_time = gethrtime();
954*0Sstevel@tonic-gate 			if (pg != phyint_anongroup) {
955*0Sstevel@tonic-gate 				logerr("Cannot meet requested failure detection"
956*0Sstevel@tonic-gate 				    " time of %d ms on (%s %s) new failure"
957*0Sstevel@tonic-gate 				    " detection time for group \"%s\" is %d"
958*0Sstevel@tonic-gate 				    " ms\n", user_failure_detection_time,
959*0Sstevel@tonic-gate 				    AF_STR(pii->pii_af), pii->pii_name,
960*0Sstevel@tonic-gate 				    pg->pg_name, pg->pg_fdt);
961*0Sstevel@tonic-gate 			}
962*0Sstevel@tonic-gate 		}
963*0Sstevel@tonic-gate 	} else if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
964*0Sstevel@tonic-gate 	    (user_failure_detection_time < pg->pg_fdt) &&
965*0Sstevel@tonic-gate 	    (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
966*0Sstevel@tonic-gate 		/*
967*0Sstevel@tonic-gate 		 * If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER
968*0Sstevel@tonic-gate 		 * investigate if we can improve the failure detection time to
969*0Sstevel@tonic-gate 		 * meet whatever the user specified.
970*0Sstevel@tonic-gate 		 */
971*0Sstevel@tonic-gate 		if (check_pg_crtt_improved(pg)) {
972*0Sstevel@tonic-gate 			pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
973*0Sstevel@tonic-gate 			    user_failure_detection_time);
974*0Sstevel@tonic-gate 			pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
975*0Sstevel@tonic-gate 			if (pg != phyint_anongroup) {
976*0Sstevel@tonic-gate 				logerr("Improved failure detection time %d ms "
977*0Sstevel@tonic-gate 				    "on (%s %s) for group \"%s\"\n", pg->pg_fdt,
978*0Sstevel@tonic-gate 				    AF_STR(pii->pii_af), pii->pii_name,
979*0Sstevel@tonic-gate 				    pg->pg_name);
980*0Sstevel@tonic-gate 			}
981*0Sstevel@tonic-gate 			if (user_failure_detection_time == pg->pg_fdt) {
982*0Sstevel@tonic-gate 				/* Avoid any truncation or rounding errors */
983*0Sstevel@tonic-gate 				pg->pg_probeint = user_probe_interval;
984*0Sstevel@tonic-gate 				/*
985*0Sstevel@tonic-gate 				 * No more rtt probes will be sent. The actual
986*0Sstevel@tonic-gate 				 * fdt has dropped to the user specified value.
987*0Sstevel@tonic-gate 				 * pii_fd_snxt_basetime and pii_snxt_basetime
988*0Sstevel@tonic-gate 				 * will be in sync henceforth.
989*0Sstevel@tonic-gate 				 */
990*0Sstevel@tonic-gate 				reset_snxt_basetimes();
991*0Sstevel@tonic-gate 			}
992*0Sstevel@tonic-gate 		}
993*0Sstevel@tonic-gate 	}
994*0Sstevel@tonic-gate out:
995*0Sstevel@tonic-gate 	pii->pii_probes[pr_ndx].pr_status = PR_ACKED;
996*0Sstevel@tonic-gate 	pii->pii_probes[pr_ndx].pr_time_acked = cur_time;
997*0Sstevel@tonic-gate 
998*0Sstevel@tonic-gate 	/*
999*0Sstevel@tonic-gate 	 * Update pii->pii_rack, i.e. the sequence number of the last received
1000*0Sstevel@tonic-gate 	 * probe response, based on the echo reply we have received now, if
1001*0Sstevel@tonic-gate 	 * either of the following conditions are satisfied.
1002*0Sstevel@tonic-gate 	 * a. pii_rack is outside the current receive window of
1003*0Sstevel@tonic-gate 	 *    [pii->pii_snxt - PROBE_STATS_COUNT, pii->pii_snxt).
1004*0Sstevel@tonic-gate 	 *    This means we have not received probe responses for a
1005*0Sstevel@tonic-gate 	 *    long time, and the sequence number has wrapped around.
1006*0Sstevel@tonic-gate 	 * b. pii_rack is within the current receive window and this echo
1007*0Sstevel@tonic-gate 	 *    reply corresponds to the highest sequence number we have seen
1008*0Sstevel@tonic-gate 	 *    so far.
1009*0Sstevel@tonic-gate 	 */
1010*0Sstevel@tonic-gate 	if (SEQ_GE(pii->pii_rack, pii->pii_snxt) ||
1011*0Sstevel@tonic-gate 	    SEQ_LT(pii->pii_rack, pii->pii_snxt - PROBE_STATS_COUNT) ||
1012*0Sstevel@tonic-gate 	    SEQ_GT(pr_icmp_seq, pii->pii_rack)) {
1013*0Sstevel@tonic-gate 		pii->pii_rack = pr_icmp_seq;
1014*0Sstevel@tonic-gate 	}
1015*0Sstevel@tonic-gate }
1016*0Sstevel@tonic-gate 
1017*0Sstevel@tonic-gate /*
1018*0Sstevel@tonic-gate  * Returns true if seq is the highest unacknowledged seq for target tg
1019*0Sstevel@tonic-gate  * else returns false
1020*0Sstevel@tonic-gate  */
1021*0Sstevel@tonic-gate static boolean_t
1022*0Sstevel@tonic-gate highest_ack_tg(uint16_t seq, struct target *tg)
1023*0Sstevel@tonic-gate {
1024*0Sstevel@tonic-gate 	struct phyint_instance *pii;
1025*0Sstevel@tonic-gate 	int	 pr_ndx;
1026*0Sstevel@tonic-gate 	uint16_t pr_seq;
1027*0Sstevel@tonic-gate 
1028*0Sstevel@tonic-gate 	pii = tg->tg_phyint_inst;
1029*0Sstevel@tonic-gate 
1030*0Sstevel@tonic-gate 	/*
1031*0Sstevel@tonic-gate 	 * Get the seq number of the most recent probe sent so far,
1032*0Sstevel@tonic-gate 	 * and also get the corresponding probe index in the probe stats
1033*0Sstevel@tonic-gate 	 * array.
1034*0Sstevel@tonic-gate 	 */
1035*0Sstevel@tonic-gate 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
1036*0Sstevel@tonic-gate 	pr_seq = pii->pii_snxt;
1037*0Sstevel@tonic-gate 	pr_seq--;
1038*0Sstevel@tonic-gate 
1039*0Sstevel@tonic-gate 	/*
1040*0Sstevel@tonic-gate 	 * Start from the most recent probe and walk back, trying to find
1041*0Sstevel@tonic-gate 	 * an acked probe corresponding to target tg.
1042*0Sstevel@tonic-gate 	 */
1043*0Sstevel@tonic-gate 	for (; pr_ndx != pii->pii_probe_next;
1044*0Sstevel@tonic-gate 	    pr_ndx = PROBE_INDEX_PREV(pr_ndx), pr_seq--) {
1045*0Sstevel@tonic-gate 		if (pii->pii_probes[pr_ndx].pr_target == tg &&
1046*0Sstevel@tonic-gate 		    pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
1047*0Sstevel@tonic-gate 			if (SEQ_GT(pr_seq, seq))
1048*0Sstevel@tonic-gate 				return (_B_FALSE);
1049*0Sstevel@tonic-gate 		}
1050*0Sstevel@tonic-gate 	}
1051*0Sstevel@tonic-gate 	return (_B_TRUE);
1052*0Sstevel@tonic-gate }
1053*0Sstevel@tonic-gate 
1054*0Sstevel@tonic-gate /*
1055*0Sstevel@tonic-gate  * Check whether the crtt for the group has improved by a factor of
1056*0Sstevel@tonic-gate  * LOWER_FDT_TRIGGER.  Small crtt improvements are ignored to avoid failure
1057*0Sstevel@tonic-gate  * detection time flapping in the face of small crtt changes.
1058*0Sstevel@tonic-gate  */
1059*0Sstevel@tonic-gate static boolean_t
1060*0Sstevel@tonic-gate check_pg_crtt_improved(struct phyint_group *pg)
1061*0Sstevel@tonic-gate {
1062*0Sstevel@tonic-gate 	struct	phyint *pi;
1063*0Sstevel@tonic-gate 
1064*0Sstevel@tonic-gate 	if (debug & D_PROBE)
1065*0Sstevel@tonic-gate 		logdebug("check_pg_crtt_improved()\n");
1066*0Sstevel@tonic-gate 
1067*0Sstevel@tonic-gate 	/*
1068*0Sstevel@tonic-gate 	 * The crtt for the group is only improved if each phyint_instance
1069*0Sstevel@tonic-gate 	 * for both ipv4 and ipv6 is improved.
1070*0Sstevel@tonic-gate 	 */
1071*0Sstevel@tonic-gate 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
1072*0Sstevel@tonic-gate 		if (!check_pii_crtt_improved(pi->pi_v4) ||
1073*0Sstevel@tonic-gate 		    !check_pii_crtt_improved(pi->pi_v6))
1074*0Sstevel@tonic-gate 			return (_B_FALSE);
1075*0Sstevel@tonic-gate 	}
1076*0Sstevel@tonic-gate 
1077*0Sstevel@tonic-gate 	return (_B_TRUE);
1078*0Sstevel@tonic-gate }
1079*0Sstevel@tonic-gate 
1080*0Sstevel@tonic-gate /*
1081*0Sstevel@tonic-gate  * Check whether the crtt has improved substantially on this phyint_instance.
1082*0Sstevel@tonic-gate  * Returns _B_TRUE if there's no crtt information available, because pii
1083*0Sstevel@tonic-gate  * is NULL or the phyint_instance is not capable of probing.
1084*0Sstevel@tonic-gate  */
1085*0Sstevel@tonic-gate boolean_t
1086*0Sstevel@tonic-gate check_pii_crtt_improved(struct phyint_instance *pii) {
1087*0Sstevel@tonic-gate 	struct 	target *tg;
1088*0Sstevel@tonic-gate 
1089*0Sstevel@tonic-gate 	if (pii == NULL)
1090*0Sstevel@tonic-gate 		return (_B_TRUE);
1091*0Sstevel@tonic-gate 
1092*0Sstevel@tonic-gate 	if (!PROBE_CAPABLE(pii) ||
1093*0Sstevel@tonic-gate 	    pii->pii_phyint->pi_state == PI_FAILED)
1094*0Sstevel@tonic-gate 		return (_B_TRUE);
1095*0Sstevel@tonic-gate 
1096*0Sstevel@tonic-gate 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1097*0Sstevel@tonic-gate 		if (tg->tg_status != TG_ACTIVE)
1098*0Sstevel@tonic-gate 			continue;
1099*0Sstevel@tonic-gate 		if (tg->tg_crtt > (pii->pii_phyint->pi_group->pg_probeint /
1100*0Sstevel@tonic-gate 		    LOWER_FDT_TRIGGER)) {
1101*0Sstevel@tonic-gate 			return (_B_FALSE);
1102*0Sstevel@tonic-gate 		}
1103*0Sstevel@tonic-gate 	}
1104*0Sstevel@tonic-gate 
1105*0Sstevel@tonic-gate 	return (_B_TRUE);
1106*0Sstevel@tonic-gate }
1107*0Sstevel@tonic-gate 
1108*0Sstevel@tonic-gate /*
1109*0Sstevel@tonic-gate  * This target responds very slowly to probes. The target's crtt exceeds
1110*0Sstevel@tonic-gate  * the probe interval of its group. Compare against other targets
1111*0Sstevel@tonic-gate  * and determine if this target is an exception, if so return true, else false
1112*0Sstevel@tonic-gate  */
1113*0Sstevel@tonic-gate static boolean_t
1114*0Sstevel@tonic-gate check_exception_target(struct phyint_instance *pii, struct target *target)
1115*0Sstevel@tonic-gate {
1116*0Sstevel@tonic-gate 	struct	target *tg;
1117*0Sstevel@tonic-gate 	char abuf[INET6_ADDRSTRLEN];
1118*0Sstevel@tonic-gate 
1119*0Sstevel@tonic-gate 	if (debug & D_PROBE) {
1120*0Sstevel@tonic-gate 		logdebug("check_exception_target(%s %s target %s)\n",
1121*0Sstevel@tonic-gate 		    AF_STR(pii->pii_af), pii->pii_name,
1122*0Sstevel@tonic-gate 		    pr_addr(pii->pii_af, target->tg_address,
1123*0Sstevel@tonic-gate 			abuf, sizeof (abuf)));
1124*0Sstevel@tonic-gate 	}
1125*0Sstevel@tonic-gate 
1126*0Sstevel@tonic-gate 	/*
1127*0Sstevel@tonic-gate 	 * We should have at least MIN_PROBE_TARGETS + 1 good targets now,
1128*0Sstevel@tonic-gate 	 * to make a good judgement. Otherwise don't drop this target.
1129*0Sstevel@tonic-gate 	 */
1130*0Sstevel@tonic-gate 	if (pii->pii_ntargets <  MIN_PROBE_TARGETS + 1)
1131*0Sstevel@tonic-gate 		return (_B_FALSE);
1132*0Sstevel@tonic-gate 
1133*0Sstevel@tonic-gate 	/*
1134*0Sstevel@tonic-gate 	 * Determine whether only this particular target is slow.
1135*0Sstevel@tonic-gate 	 * We know that this target's crtt exceeds the group's probe interval.
1136*0Sstevel@tonic-gate 	 * If all other active targets have a
1137*0Sstevel@tonic-gate 	 * crtt < (this group's probe interval) / EXCEPTION_FACTOR,
1138*0Sstevel@tonic-gate 	 * then this target is considered slow.
1139*0Sstevel@tonic-gate 	 */
1140*0Sstevel@tonic-gate 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1141*0Sstevel@tonic-gate 		if (tg != target && tg->tg_status == TG_ACTIVE) {
1142*0Sstevel@tonic-gate 			if (tg->tg_crtt >
1143*0Sstevel@tonic-gate 			    pii->pii_phyint->pi_group->pg_probeint /
1144*0Sstevel@tonic-gate 			    EXCEPTION_FACTOR) {
1145*0Sstevel@tonic-gate 				return (_B_FALSE);
1146*0Sstevel@tonic-gate 			}
1147*0Sstevel@tonic-gate 		}
1148*0Sstevel@tonic-gate 	}
1149*0Sstevel@tonic-gate 
1150*0Sstevel@tonic-gate 	return (_B_TRUE);
1151*0Sstevel@tonic-gate }
1152*0Sstevel@tonic-gate 
1153*0Sstevel@tonic-gate /*
1154*0Sstevel@tonic-gate  * Update the target list. The icmp all hosts multicast has given us
1155*0Sstevel@tonic-gate  * some host to which we can send probes. If we already have sufficient
1156*0Sstevel@tonic-gate  * targets, discard it.
1157*0Sstevel@tonic-gate  */
1158*0Sstevel@tonic-gate static void
1159*0Sstevel@tonic-gate incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply,
1160*0Sstevel@tonic-gate     struct in6_addr fromaddr)
1161*0Sstevel@tonic-gate /* ARGSUSED */
1162*0Sstevel@tonic-gate {
1163*0Sstevel@tonic-gate 	int af;
1164*0Sstevel@tonic-gate 	char abuf[INET6_ADDRSTRLEN];
1165*0Sstevel@tonic-gate 	struct phyint *pi;
1166*0Sstevel@tonic-gate 
1167*0Sstevel@tonic-gate 	if (debug & D_PROBE) {
1168*0Sstevel@tonic-gate 		logdebug("incoming_mcast_reply(%s %s %s)\n",
1169*0Sstevel@tonic-gate 		    AF_STR(pii->pii_af), pii->pii_name,
1170*0Sstevel@tonic-gate 		    pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)));
1171*0Sstevel@tonic-gate 	}
1172*0Sstevel@tonic-gate 
1173*0Sstevel@tonic-gate 	/*
1174*0Sstevel@tonic-gate 	 * Using host targets is a fallback mechanism. If we have
1175*0Sstevel@tonic-gate 	 * found a router, don't add this host target. If we already
1176*0Sstevel@tonic-gate 	 * know MAX_PROBE_TARGETS, don't add another target.
1177*0Sstevel@tonic-gate 	 */
1178*0Sstevel@tonic-gate 	assert(pii->pii_ntargets <= MAX_PROBE_TARGETS);
1179*0Sstevel@tonic-gate 	if (pii->pii_targets != NULL) {
1180*0Sstevel@tonic-gate 		if (pii->pii_targets_are_routers ||
1181*0Sstevel@tonic-gate 		    (pii->pii_ntargets == MAX_PROBE_TARGETS)) {
1182*0Sstevel@tonic-gate 			return;
1183*0Sstevel@tonic-gate 		}
1184*0Sstevel@tonic-gate 	}
1185*0Sstevel@tonic-gate 
1186*0Sstevel@tonic-gate 	if (IN6_IS_ADDR_UNSPECIFIED(&fromaddr) ||
1187*0Sstevel@tonic-gate 	    IN6_IS_ADDR_V4MAPPED_ANY(&fromaddr)) {
1188*0Sstevel@tonic-gate 		/*
1189*0Sstevel@tonic-gate 		 * Guard against response from 0.0.0.0
1190*0Sstevel@tonic-gate 		 * and ::. Log a trace message
1191*0Sstevel@tonic-gate 		 */
1192*0Sstevel@tonic-gate 		logtrace("probe response from %s on %s\n",
1193*0Sstevel@tonic-gate 		    pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)),
1194*0Sstevel@tonic-gate 		    pii->pii_name);
1195*0Sstevel@tonic-gate 		return;
1196*0Sstevel@tonic-gate 	}
1197*0Sstevel@tonic-gate 
1198*0Sstevel@tonic-gate 	/*
1199*0Sstevel@tonic-gate 	 * This address is one of our own, so reject this address as a
1200*0Sstevel@tonic-gate 	 * valid probe target.
1201*0Sstevel@tonic-gate 	 */
1202*0Sstevel@tonic-gate 	af = pii->pii_af;
1203*0Sstevel@tonic-gate 	if (own_address(af, fromaddr))
1204*0Sstevel@tonic-gate 		return;
1205*0Sstevel@tonic-gate 
1206*0Sstevel@tonic-gate 	/*
1207*0Sstevel@tonic-gate 	 * If the phyint is part a named group, then add the address to all
1208*0Sstevel@tonic-gate 	 * members of the group.  Otherwise, add the address only to the
1209*0Sstevel@tonic-gate 	 * phyint itself, since other phyints in the anongroup may not be on
1210*0Sstevel@tonic-gate 	 * the same subnet.
1211*0Sstevel@tonic-gate 	 */
1212*0Sstevel@tonic-gate 	pi = pii->pii_phyint;
1213*0Sstevel@tonic-gate 	if (pi->pi_group == phyint_anongroup) {
1214*0Sstevel@tonic-gate 		target_add(pii, fromaddr, _B_FALSE);
1215*0Sstevel@tonic-gate 	} else {
1216*0Sstevel@tonic-gate 		pi = pi->pi_group->pg_phyint;
1217*0Sstevel@tonic-gate 		for (; pi != NULL; pi = pi->pi_pgnext)
1218*0Sstevel@tonic-gate 			target_add(PHYINT_INSTANCE(pi, af), fromaddr, _B_FALSE);
1219*0Sstevel@tonic-gate 	}
1220*0Sstevel@tonic-gate }
1221*0Sstevel@tonic-gate 
1222*0Sstevel@tonic-gate /*
1223*0Sstevel@tonic-gate  * Compute CRTT given an existing scaled average, scaled deviation estimate
1224*0Sstevel@tonic-gate  * and a new rtt time.  The formula is from Jacobson and Karels'
1225*0Sstevel@tonic-gate  * "Congestion Avoidance and Control" in SIGCOMM '88.  The variable names
1226*0Sstevel@tonic-gate  * are the same as those in Appendix A.2 of that paper.
1227*0Sstevel@tonic-gate  *
1228*0Sstevel@tonic-gate  * m = new measurement
1229*0Sstevel@tonic-gate  * sa = scaled RTT average (8 * average estimates)
1230*0Sstevel@tonic-gate  * sv = scaled mean deviation (mdev) of RTT (4 * deviation estimates).
1231*0Sstevel@tonic-gate  * crtt = Conservative round trip time. Used to determine whether probe
1232*0Sstevel@tonic-gate  * has timed out.
1233*0Sstevel@tonic-gate  *
1234*0Sstevel@tonic-gate  * New scaled average and deviation are passed back via sap and svp
1235*0Sstevel@tonic-gate  */
1236*0Sstevel@tonic-gate static int
1237*0Sstevel@tonic-gate compute_crtt(int *sap, int *svp, int m)
1238*0Sstevel@tonic-gate {
1239*0Sstevel@tonic-gate 	int sa = *sap;
1240*0Sstevel@tonic-gate 	int sv = *svp;
1241*0Sstevel@tonic-gate 	int crtt;
1242*0Sstevel@tonic-gate 	int saved_m = m;
1243*0Sstevel@tonic-gate 
1244*0Sstevel@tonic-gate 	assert(*sap >= -1);
1245*0Sstevel@tonic-gate 	assert(*svp >= 0);
1246*0Sstevel@tonic-gate 
1247*0Sstevel@tonic-gate 	if (sa != -1) {
1248*0Sstevel@tonic-gate 		/*
1249*0Sstevel@tonic-gate 		 * Update average estimator:
1250*0Sstevel@tonic-gate 		 *	new rtt = old rtt + 1/8 Error
1251*0Sstevel@tonic-gate 		 *	    where Error = m - old rtt
1252*0Sstevel@tonic-gate 		 *	i.e. 8 * new rtt = 8 * old rtt + Error
1253*0Sstevel@tonic-gate 		 *	i.e. new sa =  old sa + Error
1254*0Sstevel@tonic-gate 		 */
1255*0Sstevel@tonic-gate 		m -= sa >> 3;		/* m is now Error in estimate. */
1256*0Sstevel@tonic-gate 		if ((sa += m) < 0) {
1257*0Sstevel@tonic-gate 			/* Don't allow the smoothed average to be negative. */
1258*0Sstevel@tonic-gate 			sa = 0;
1259*0Sstevel@tonic-gate 		}
1260*0Sstevel@tonic-gate 
1261*0Sstevel@tonic-gate 		/*
1262*0Sstevel@tonic-gate 		 * Update deviation estimator:
1263*0Sstevel@tonic-gate 		 *	new mdev =  old mdev + 1/4 (abs(Error) - old mdev)
1264*0Sstevel@tonic-gate 		 *	i.e. 4 * new mdev = 4 * old mdev +
1265*0Sstevel@tonic-gate 		 *		(abs(Error) - old mdev)
1266*0Sstevel@tonic-gate 		 * 	i.e. new sv = old sv + (abs(Error) - old mdev)
1267*0Sstevel@tonic-gate 		 */
1268*0Sstevel@tonic-gate 		if (m < 0)
1269*0Sstevel@tonic-gate 			m = -m;
1270*0Sstevel@tonic-gate 		m -= sv >> 2;
1271*0Sstevel@tonic-gate 		sv += m;
1272*0Sstevel@tonic-gate 	} else {
1273*0Sstevel@tonic-gate 		/* Initialization. This is the first response received. */
1274*0Sstevel@tonic-gate 		sa = (m << 3);
1275*0Sstevel@tonic-gate 		sv = (m << 1);
1276*0Sstevel@tonic-gate 	}
1277*0Sstevel@tonic-gate 
1278*0Sstevel@tonic-gate 	crtt = (sa >> 3) + sv;
1279*0Sstevel@tonic-gate 
1280*0Sstevel@tonic-gate 	if (debug & D_PROBE) {
1281*0Sstevel@tonic-gate 		logdebug("compute_crtt: m = %d sa = %d, sv = %d -> crtt = "
1282*0Sstevel@tonic-gate 		    "%d\n", saved_m, sa, sv, crtt);
1283*0Sstevel@tonic-gate 	}
1284*0Sstevel@tonic-gate 
1285*0Sstevel@tonic-gate 	*sap = sa;
1286*0Sstevel@tonic-gate 	*svp = sv;
1287*0Sstevel@tonic-gate 
1288*0Sstevel@tonic-gate 	/*
1289*0Sstevel@tonic-gate 	 * CRTT = average estimates  + 4 * deviation estimates
1290*0Sstevel@tonic-gate 	 *	= sa / 8 + sv
1291*0Sstevel@tonic-gate 	 */
1292*0Sstevel@tonic-gate 	return (crtt);
1293*0Sstevel@tonic-gate }
1294*0Sstevel@tonic-gate 
1295*0Sstevel@tonic-gate static void
1296*0Sstevel@tonic-gate pi_set_crtt(struct target *tg, int m, boolean_t is_probe_uni)
1297*0Sstevel@tonic-gate {
1298*0Sstevel@tonic-gate 	struct phyint_instance *pii = tg->tg_phyint_inst;
1299*0Sstevel@tonic-gate 	int probe_interval = pii->pii_phyint->pi_group->pg_probeint;
1300*0Sstevel@tonic-gate 	int sa = tg->tg_rtt_sa;
1301*0Sstevel@tonic-gate 	int sv = tg->tg_rtt_sd;
1302*0Sstevel@tonic-gate 	int new_crtt;
1303*0Sstevel@tonic-gate 	int i;
1304*0Sstevel@tonic-gate 
1305*0Sstevel@tonic-gate 	if (debug & D_PROBE)
1306*0Sstevel@tonic-gate 		logdebug("pi_set_crtt: target -  m %d\n", m);
1307*0Sstevel@tonic-gate 
1308*0Sstevel@tonic-gate 	/* store the round trip time, in case we need to defer computation */
1309*0Sstevel@tonic-gate 	tg->tg_deferred[tg->tg_num_deferred] = m;
1310*0Sstevel@tonic-gate 
1311*0Sstevel@tonic-gate 	new_crtt = compute_crtt(&sa, &sv, m);
1312*0Sstevel@tonic-gate 
1313*0Sstevel@tonic-gate 	/*
1314*0Sstevel@tonic-gate 	 * If this probe's round trip time would singlehandedly cause an
1315*0Sstevel@tonic-gate 	 * increase in the group's probe interval consider it suspect.
1316*0Sstevel@tonic-gate 	 */
1317*0Sstevel@tonic-gate 	if ((new_crtt > probe_interval) && is_probe_uni) {
1318*0Sstevel@tonic-gate 		if (debug & D_PROBE) {
1319*0Sstevel@tonic-gate 			logdebug("Received a suspect probe on %s, new_crtt ="
1320*0Sstevel@tonic-gate 			    " %d, probe_interval = %d, num_deferred = %d\n",
1321*0Sstevel@tonic-gate 			    pii->pii_probe_logint->li_name, new_crtt,
1322*0Sstevel@tonic-gate 			    probe_interval, tg->tg_num_deferred);
1323*0Sstevel@tonic-gate 		}
1324*0Sstevel@tonic-gate 
1325*0Sstevel@tonic-gate 		/*
1326*0Sstevel@tonic-gate 		 * If we've deferred as many rtts as we plan on deferring, then
1327*0Sstevel@tonic-gate 		 * assume the link really did slow down and process all queued
1328*0Sstevel@tonic-gate 		 * rtts
1329*0Sstevel@tonic-gate 		 */
1330*0Sstevel@tonic-gate 		if (tg->tg_num_deferred == MAXDEFERREDRTT) {
1331*0Sstevel@tonic-gate 			if (debug & D_PROBE) {
1332*0Sstevel@tonic-gate 				logdebug("Received MAXDEFERREDRTT probes which "
1333*0Sstevel@tonic-gate 				    "would cause an increased probe_interval.  "
1334*0Sstevel@tonic-gate 				    "Integrating queued rtt data points.\n");
1335*0Sstevel@tonic-gate 			}
1336*0Sstevel@tonic-gate 
1337*0Sstevel@tonic-gate 			for (i = 0; i <= tg->tg_num_deferred; i++) {
1338*0Sstevel@tonic-gate 				tg->tg_crtt = compute_crtt(&tg->tg_rtt_sa,
1339*0Sstevel@tonic-gate 				    &tg->tg_rtt_sd, tg->tg_deferred[i]);
1340*0Sstevel@tonic-gate 			}
1341*0Sstevel@tonic-gate 
1342*0Sstevel@tonic-gate 			tg->tg_num_deferred = 0;
1343*0Sstevel@tonic-gate 		} else {
1344*0Sstevel@tonic-gate 			tg->tg_num_deferred++;
1345*0Sstevel@tonic-gate 		}
1346*0Sstevel@tonic-gate 		return;
1347*0Sstevel@tonic-gate 	}
1348*0Sstevel@tonic-gate 
1349*0Sstevel@tonic-gate 	/*
1350*0Sstevel@tonic-gate 	 * If this is a normal probe, or an RTT probe that would lead to a
1351*0Sstevel@tonic-gate 	 * reduced CRTT, then update our CRTT data.  Further, if this was
1352*0Sstevel@tonic-gate 	 * a normal probe, pitch any deferred probes since our probes are
1353*0Sstevel@tonic-gate 	 * again being answered within our CRTT estimates.
1354*0Sstevel@tonic-gate 	 */
1355*0Sstevel@tonic-gate 	if (is_probe_uni || new_crtt < tg->tg_crtt) {
1356*0Sstevel@tonic-gate 		tg->tg_rtt_sa = sa;
1357*0Sstevel@tonic-gate 		tg->tg_rtt_sd = sv;
1358*0Sstevel@tonic-gate 		tg->tg_crtt = new_crtt;
1359*0Sstevel@tonic-gate 		if (is_probe_uni)
1360*0Sstevel@tonic-gate 			tg->tg_num_deferred = 0;
1361*0Sstevel@tonic-gate 	}
1362*0Sstevel@tonic-gate }
1363*0Sstevel@tonic-gate 
/*
 * Search the ancillary data of `msg' for an IPPROTO_IPV6 level control
 * message of type `cmsg_type'.  Returns a pointer to the data of the
 * first match, or NULL if there is none.
 */
static void *
find_ancillary(struct msghdr *msg, int cmsg_type)
{
	struct cmsghdr *hdr = CMSG_FIRSTHDR(msg);

	while (hdr != NULL) {
		if (hdr->cmsg_level == IPPROTO_IPV6 &&
		    hdr->cmsg_type == cmsg_type)
			return (CMSG_DATA(hdr));
		hdr = CMSG_NXTHDR(msg, hdr);
	}

	return (NULL);
}
1382*0Sstevel@tonic-gate 
1383*0Sstevel@tonic-gate /*
1384*0Sstevel@tonic-gate  * See if a previously failed interface has started working again.
1385*0Sstevel@tonic-gate  */
1386*0Sstevel@tonic-gate void
1387*0Sstevel@tonic-gate phyint_check_for_repair(struct phyint *pi)
1388*0Sstevel@tonic-gate {
1389*0Sstevel@tonic-gate 	if (phyint_repaired(pi)) {
1390*0Sstevel@tonic-gate 		if (pi->pi_group == phyint_anongroup) {
1391*0Sstevel@tonic-gate 			logerr("NIC repair detected on %s\n", pi->pi_name);
1392*0Sstevel@tonic-gate 		} else {
1393*0Sstevel@tonic-gate 			logerr("NIC repair detected on %s of group %s\n",
1394*0Sstevel@tonic-gate 			    pi->pi_name, pi->pi_group->pg_name);
1395*0Sstevel@tonic-gate 		}
1396*0Sstevel@tonic-gate 
1397*0Sstevel@tonic-gate 		/*
1398*0Sstevel@tonic-gate 		 * If the interface is offline, just clear the FAILED flag,
1399*0Sstevel@tonic-gate 		 * delaying the state change and failback operation until it
1400*0Sstevel@tonic-gate 		 * is brought back online.
1401*0Sstevel@tonic-gate 		 */
1402*0Sstevel@tonic-gate 		if (pi->pi_state == PI_OFFLINE) {
1403*0Sstevel@tonic-gate 			(void) change_lif_flags(pi, IFF_FAILED, _B_FALSE);
1404*0Sstevel@tonic-gate 			return;
1405*0Sstevel@tonic-gate 		}
1406*0Sstevel@tonic-gate 
1407*0Sstevel@tonic-gate 		if (pi->pi_flags & IFF_INACTIVE) {
1408*0Sstevel@tonic-gate 			(void) change_lif_flags(pi, IFF_FAILED, _B_FALSE);
1409*0Sstevel@tonic-gate 		} else {
1410*0Sstevel@tonic-gate 			if (try_failback(pi, _B_FALSE) != IPMP_FAILURE) {
1411*0Sstevel@tonic-gate 				(void) change_lif_flags(pi,
1412*0Sstevel@tonic-gate 				    IFF_FAILED, _B_FALSE);
1413*0Sstevel@tonic-gate 				/* Per state diagram */
1414*0Sstevel@tonic-gate 				pi->pi_empty = 0;
1415*0Sstevel@tonic-gate 			}
1416*0Sstevel@tonic-gate 		}
1417*0Sstevel@tonic-gate 
1418*0Sstevel@tonic-gate 		phyint_chstate(pi, PI_RUNNING);
1419*0Sstevel@tonic-gate 
1420*0Sstevel@tonic-gate 		if (GROUP_FAILED(pi->pi_group)) {
1421*0Sstevel@tonic-gate 			/*
1422*0Sstevel@tonic-gate 			 * This is the 1st phyint to receive a response
1423*0Sstevel@tonic-gate 			 * after group failure.
1424*0Sstevel@tonic-gate 			 */
1425*0Sstevel@tonic-gate 			logerr("At least 1 interface (%s) of group %s has "
1426*0Sstevel@tonic-gate 			    "repaired\n", pi->pi_name, pi->pi_group->pg_name);
1427*0Sstevel@tonic-gate 			phyint_group_chstate(pi->pi_group, PG_RUNNING);
1428*0Sstevel@tonic-gate 		}
1429*0Sstevel@tonic-gate 	}
1430*0Sstevel@tonic-gate }
1431*0Sstevel@tonic-gate 
1432*0Sstevel@tonic-gate /*
1433*0Sstevel@tonic-gate  * See if a previously functioning interface has failed, or if the
1434*0Sstevel@tonic-gate  * whole group of interfaces has failed.
1435*0Sstevel@tonic-gate  */
1436*0Sstevel@tonic-gate static void
1437*0Sstevel@tonic-gate phyint_inst_check_for_failure(struct phyint_instance *pii)
1438*0Sstevel@tonic-gate {
1439*0Sstevel@tonic-gate 	struct	phyint	*pi;
1440*0Sstevel@tonic-gate 	struct	phyint	*pi2;
1441*0Sstevel@tonic-gate 
1442*0Sstevel@tonic-gate 	pi = pii->pii_phyint;
1443*0Sstevel@tonic-gate 
1444*0Sstevel@tonic-gate 	switch (failure_state(pii)) {
1445*0Sstevel@tonic-gate 	case PHYINT_FAILURE:
1446*0Sstevel@tonic-gate 		(void) change_lif_flags(pi, IFF_FAILED, _B_TRUE);
1447*0Sstevel@tonic-gate 		if (pi->pi_group == phyint_anongroup) {
1448*0Sstevel@tonic-gate 			logerr("NIC failure detected on %s\n", pii->pii_name);
1449*0Sstevel@tonic-gate 		} else {
1450*0Sstevel@tonic-gate 			logerr("NIC failure detected on %s of group %s\n",
1451*0Sstevel@tonic-gate 			    pii->pii_name, pi->pi_group->pg_name);
1452*0Sstevel@tonic-gate 		}
1453*0Sstevel@tonic-gate 		/*
1454*0Sstevel@tonic-gate 		 * Do the failover, unless the interface is offline (in
1455*0Sstevel@tonic-gate 		 * which case we've already failed over).
1456*0Sstevel@tonic-gate 		 */
1457*0Sstevel@tonic-gate 		if (pi->pi_state != PI_OFFLINE) {
1458*0Sstevel@tonic-gate 			phyint_chstate(pi, PI_FAILED);
1459*0Sstevel@tonic-gate 			reset_crtt_all(pi);
1460*0Sstevel@tonic-gate 			if (!(pi->pi_flags & IFF_INACTIVE))
1461*0Sstevel@tonic-gate 				(void) try_failover(pi, FAILOVER_NORMAL);
1462*0Sstevel@tonic-gate 		}
1463*0Sstevel@tonic-gate 		break;
1464*0Sstevel@tonic-gate 
1465*0Sstevel@tonic-gate 	case GROUP_FAILURE:
1466*0Sstevel@tonic-gate 		logerr("All Interfaces in group %s have failed\n",
1467*0Sstevel@tonic-gate 		    pi->pi_group->pg_name);
1468*0Sstevel@tonic-gate 		for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL;
1469*0Sstevel@tonic-gate 		    pi2 = pi2->pi_pgnext) {
1470*0Sstevel@tonic-gate 			if (pi2->pi_flags & IFF_OFFLINE)
1471*0Sstevel@tonic-gate 				continue;
1472*0Sstevel@tonic-gate 			(void) change_lif_flags(pi2, IFF_FAILED, _B_TRUE);
1473*0Sstevel@tonic-gate 			reset_crtt_all(pi2);
1474*0Sstevel@tonic-gate 
1475*0Sstevel@tonic-gate 			/*
1476*0Sstevel@tonic-gate 			 * In the case of host targets, we
1477*0Sstevel@tonic-gate 			 * would have flushed the targets,
1478*0Sstevel@tonic-gate 			 * and gone to PI_NOTARGETS state.
1479*0Sstevel@tonic-gate 			 */
1480*0Sstevel@tonic-gate 			if (pi2->pi_state == PI_RUNNING)
1481*0Sstevel@tonic-gate 				phyint_chstate(pi, PI_FAILED);
1482*0Sstevel@tonic-gate 
1483*0Sstevel@tonic-gate 			pi2->pi_empty = 0;
1484*0Sstevel@tonic-gate 			pi2->pi_full = 0;
1485*0Sstevel@tonic-gate 		}
1486*0Sstevel@tonic-gate 		break;
1487*0Sstevel@tonic-gate 
1488*0Sstevel@tonic-gate 	default:
1489*0Sstevel@tonic-gate 		break;
1490*0Sstevel@tonic-gate 	}
1491*0Sstevel@tonic-gate }
1492*0Sstevel@tonic-gate 
1493*0Sstevel@tonic-gate /*
1494*0Sstevel@tonic-gate  * Determines if any timeout event has occurred and returns the number of
1495*0Sstevel@tonic-gate  * milliseconds until the next timeout event for the phyint. Returns
1496*0Sstevel@tonic-gate  * TIMER_INFINITY for "never".
1497*0Sstevel@tonic-gate  */
1498*0Sstevel@tonic-gate uint_t
1499*0Sstevel@tonic-gate phyint_inst_timer(struct phyint_instance *pii)
1500*0Sstevel@tonic-gate {
1501*0Sstevel@tonic-gate 	int 	pr_ndx;
1502*0Sstevel@tonic-gate 	uint_t	timeout;
1503*0Sstevel@tonic-gate 	struct	target	*cur_tg;
1504*0Sstevel@tonic-gate 	struct	probe_stats *pr_statp;
1505*0Sstevel@tonic-gate 	struct	phyint_instance *pii_other;
1506*0Sstevel@tonic-gate 	struct	phyint *pi;
1507*0Sstevel@tonic-gate 	int	valid_unack_count;
1508*0Sstevel@tonic-gate 	int	i;
1509*0Sstevel@tonic-gate 	int	interval;
1510*0Sstevel@tonic-gate 	uint_t	check_time;
1511*0Sstevel@tonic-gate 	uint_t	cur_time;
1512*0Sstevel@tonic-gate 	hrtime_t cur_hrtime;
1513*0Sstevel@tonic-gate 	int	probe_interval = pii->pii_phyint->pi_group->pg_probeint;
1514*0Sstevel@tonic-gate 
1515*0Sstevel@tonic-gate 	cur_time = getcurrenttime();
1516*0Sstevel@tonic-gate 
1517*0Sstevel@tonic-gate 	if (debug & D_TIMER) {
1518*0Sstevel@tonic-gate 		logdebug("phyint_inst_timer(%s %s)\n",
1519*0Sstevel@tonic-gate 		    AF_STR(pii->pii_af), pii->pii_name);
1520*0Sstevel@tonic-gate 	}
1521*0Sstevel@tonic-gate 
1522*0Sstevel@tonic-gate 	pii_other = phyint_inst_other(pii);
1523*0Sstevel@tonic-gate 	if (!PROBE_ENABLED(pii) && !PROBE_ENABLED(pii_other)) {
1524*0Sstevel@tonic-gate 		/*
1525*0Sstevel@tonic-gate 		 * Check to see if we're here due to link up/down flapping; If
1526*0Sstevel@tonic-gate 		 * enough time has passed, then try to bring the interface
1527*0Sstevel@tonic-gate 		 * back up; otherwise, schedule a timer to bring it back up
1528*0Sstevel@tonic-gate 		 * when enough time *has* elapsed.
1529*0Sstevel@tonic-gate 		 */
1530*0Sstevel@tonic-gate 		pi = pii->pii_phyint;
1531*0Sstevel@tonic-gate 		if (pi->pi_state == PI_FAILED && LINK_UP(pi)) {
1532*0Sstevel@tonic-gate 			check_time = pi->pi_whenup[pi->pi_whendx] + MSEC_PERMIN;
1533*0Sstevel@tonic-gate 			if (check_time > cur_time)
1534*0Sstevel@tonic-gate 				return (check_time - cur_time);
1535*0Sstevel@tonic-gate 
1536*0Sstevel@tonic-gate 			phyint_check_for_repair(pi);
1537*0Sstevel@tonic-gate 		}
1538*0Sstevel@tonic-gate 	}
1539*0Sstevel@tonic-gate 
1540*0Sstevel@tonic-gate 	/*
1541*0Sstevel@tonic-gate 	 * If this phyint is not yet initialized for probes,
1542*0Sstevel@tonic-gate 	 * don't proceed further
1543*0Sstevel@tonic-gate 	 */
1544*0Sstevel@tonic-gate 	if (pii->pii_probe_sock == -1)
1545*0Sstevel@tonic-gate 		return (TIMER_INFINITY);
1546*0Sstevel@tonic-gate 
1547*0Sstevel@tonic-gate 	/*
1548*0Sstevel@tonic-gate 	 * If the timer has fired too soon, probably triggered
1549*0Sstevel@tonic-gate 	 * by some other phyint instance, return the remaining
1550*0Sstevel@tonic-gate 	 * time
1551*0Sstevel@tonic-gate 	 */
1552*0Sstevel@tonic-gate 	if (TIME_LT(cur_time, pii->pii_snxt_time))
1553*0Sstevel@tonic-gate 		return (pii->pii_snxt_time - cur_time);
1554*0Sstevel@tonic-gate 
1555*0Sstevel@tonic-gate 	/*
1556*0Sstevel@tonic-gate 	 * If the link is down, don't send any probes for now.
1557*0Sstevel@tonic-gate 	 */
1558*0Sstevel@tonic-gate 	if (LINK_DOWN(pii->pii_phyint))
1559*0Sstevel@tonic-gate 		return (TIMER_INFINITY);
1560*0Sstevel@tonic-gate 
1561*0Sstevel@tonic-gate 	/*
1562*0Sstevel@tonic-gate 	 * Randomize the next probe time, between MIN_RANDOM_FACTOR
1563*0Sstevel@tonic-gate 	 * and MAX_RANDOM_FACTOR with respect to the base probe time.
1564*0Sstevel@tonic-gate 	 * Base probe time is strictly periodic.
1565*0Sstevel@tonic-gate 	 */
1566*0Sstevel@tonic-gate 	interval = GET_RANDOM(
1567*0Sstevel@tonic-gate 	    (int)(MIN_RANDOM_FACTOR * user_probe_interval),
1568*0Sstevel@tonic-gate 	    (int)(MAX_RANDOM_FACTOR * user_probe_interval));
1569*0Sstevel@tonic-gate 	pii->pii_snxt_time = pii->pii_snxt_basetime + interval;
1570*0Sstevel@tonic-gate 
1571*0Sstevel@tonic-gate 	/*
1572*0Sstevel@tonic-gate 	 * Check if the current time > next time to probe. If so, we missed
1573*0Sstevel@tonic-gate 	 * sending 1 or more probes, probably due to heavy system load. At least
1574*0Sstevel@tonic-gate 	 * 'MIN_RANDOM_FACTOR * user_probe_interval' ms has elapsed since we
1575*0Sstevel@tonic-gate 	 * were scheduled. Make adjustments to the times, in multiples of
1576*0Sstevel@tonic-gate 	 * user_probe_interval.
1577*0Sstevel@tonic-gate 	 */
1578*0Sstevel@tonic-gate 	if (TIME_GT(cur_time, pii->pii_snxt_time)) {
1579*0Sstevel@tonic-gate 		int n;
1580*0Sstevel@tonic-gate 
1581*0Sstevel@tonic-gate 		n = (cur_time - pii->pii_snxt_time) / user_probe_interval;
1582*0Sstevel@tonic-gate 		pii->pii_snxt_time 	+= (n + 1) * user_probe_interval;
1583*0Sstevel@tonic-gate 		pii->pii_snxt_basetime 	+= (n + 1) * user_probe_interval;
1584*0Sstevel@tonic-gate 		logtrace("missed sending %d probes cur_time %u snxt_time %u"
1585*0Sstevel@tonic-gate 		    " snxt_basetime %u\n", n + 1, cur_time, pii->pii_snxt_time,
1586*0Sstevel@tonic-gate 		    pii->pii_snxt_basetime);
1587*0Sstevel@tonic-gate 
1588*0Sstevel@tonic-gate 		/* Collect statistics about missed probes */
1589*0Sstevel@tonic-gate 		probes_missed.pm_nprobes += n + 1;
1590*0Sstevel@tonic-gate 		probes_missed.pm_ntimes++;
1591*0Sstevel@tonic-gate 	}
1592*0Sstevel@tonic-gate 	pii->pii_snxt_basetime += user_probe_interval;
1593*0Sstevel@tonic-gate 	interval = pii->pii_snxt_time - cur_time;
1594*0Sstevel@tonic-gate 	if (debug & D_TARGET) {
1595*0Sstevel@tonic-gate 		logdebug("cur_time %u snxt_time %u snxt_basetime %u"
1596*0Sstevel@tonic-gate 		    " interval %u\n", cur_time, pii->pii_snxt_time,
1597*0Sstevel@tonic-gate 		    pii->pii_snxt_basetime, interval);
1598*0Sstevel@tonic-gate 	}
1599*0Sstevel@tonic-gate 
1600*0Sstevel@tonic-gate 	/*
1601*0Sstevel@tonic-gate 	 * If no targets are known, we need to send an ICMP multicast. The
1602*0Sstevel@tonic-gate 	 * probe type is PROBE_MULTI.  We'll check back in 'interval' msec
1603*0Sstevel@tonic-gate 	 * to see if we found a target.
1604*0Sstevel@tonic-gate 	 */
1605*0Sstevel@tonic-gate 	if (pii->pii_target_next == NULL) {
1606*0Sstevel@tonic-gate 		assert(pii->pii_ntargets == 0);
1607*0Sstevel@tonic-gate 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
1608*0Sstevel@tonic-gate 		probe(pii, PROBE_MULTI, cur_time);
1609*0Sstevel@tonic-gate 		return (interval);
1610*0Sstevel@tonic-gate 	}
1611*0Sstevel@tonic-gate 
1612*0Sstevel@tonic-gate 	if ((user_probe_interval != probe_interval) &&
1613*0Sstevel@tonic-gate 	    TIME_LT(pii->pii_snxt_time, pii->pii_fd_snxt_basetime)) {
1614*0Sstevel@tonic-gate 		/*
1615*0Sstevel@tonic-gate 		 * the failure detection (fd) probe timer has not yet fired.
1616*0Sstevel@tonic-gate 		 * Need to send only an rtt probe. The probe type is PROBE_RTT.
1617*0Sstevel@tonic-gate 		 */
1618*0Sstevel@tonic-gate 		probe(pii, PROBE_RTT, cur_time);
1619*0Sstevel@tonic-gate 		return (interval);
1620*0Sstevel@tonic-gate 	}
1621*0Sstevel@tonic-gate 	/*
1622*0Sstevel@tonic-gate 	 * the fd probe timer has fired. Need to do all failure
1623*0Sstevel@tonic-gate 	 * detection / recovery calculations, and then send an fd probe
1624*0Sstevel@tonic-gate 	 * of type PROBE_UNI.
1625*0Sstevel@tonic-gate 	 */
1626*0Sstevel@tonic-gate 	if (user_probe_interval == probe_interval) {
1627*0Sstevel@tonic-gate 		/*
1628*0Sstevel@tonic-gate 		 * We could have missed some probes, and then adjusted
1629*0Sstevel@tonic-gate 		 * pii_snxt_basetime above. Otherwise we could have
1630*0Sstevel@tonic-gate 		 * blindly added probe_interval to pii_fd_snxt_basetime.
1631*0Sstevel@tonic-gate 		 */
1632*0Sstevel@tonic-gate 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
1633*0Sstevel@tonic-gate 	} else {
1634*0Sstevel@tonic-gate 		pii->pii_fd_snxt_basetime += probe_interval;
1635*0Sstevel@tonic-gate 		if (TIME_GT(cur_time, pii->pii_fd_snxt_basetime)) {
1636*0Sstevel@tonic-gate 			int n;
1637*0Sstevel@tonic-gate 
1638*0Sstevel@tonic-gate 			n = (cur_time - pii->pii_fd_snxt_basetime) /
1639*0Sstevel@tonic-gate 			    probe_interval;
1640*0Sstevel@tonic-gate 			pii->pii_fd_snxt_basetime += (n + 1) * probe_interval;
1641*0Sstevel@tonic-gate 		}
1642*0Sstevel@tonic-gate 	}
1643*0Sstevel@tonic-gate 
1644*0Sstevel@tonic-gate 	/*
1645*0Sstevel@tonic-gate 	 * We can have at most, the latest 2 probes that we sent, in
1646*0Sstevel@tonic-gate 	 * the PR_UNACKED state. All previous probes sent, are either
1647*0Sstevel@tonic-gate 	 * PR_LOST or PR_ACKED. An unacknowledged probe is considered
1648*0Sstevel@tonic-gate 	 * timed out if the probe's time_sent + the CRTT < currenttime.
1649*0Sstevel@tonic-gate 	 * For each of the last 2 probes, examine whether it has timed
1650*0Sstevel@tonic-gate 	 * out. If so, mark it PR_LOST. The probe stats is a circular array.
1651*0Sstevel@tonic-gate 	 */
1652*0Sstevel@tonic-gate 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
1653*0Sstevel@tonic-gate 	valid_unack_count = 0;
1654*0Sstevel@tonic-gate 
1655*0Sstevel@tonic-gate 	for (i = 0; i < 2; i++) {
1656*0Sstevel@tonic-gate 		pr_statp = &pii->pii_probes[pr_ndx];
1657*0Sstevel@tonic-gate 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
1658*0Sstevel@tonic-gate 		switch (pr_statp->pr_status) {
1659*0Sstevel@tonic-gate 		case PR_ACKED:
1660*0Sstevel@tonic-gate 			/*
1661*0Sstevel@tonic-gate 			 * We received back an ACK, so the switch clearly
1662*0Sstevel@tonic-gate 			 * is not dropping our traffic, and thus we can
1663*0Sstevel@tonic-gate 			 * enable failure detection immediately.
1664*0Sstevel@tonic-gate 			 */
1665*0Sstevel@tonic-gate 			if (pii->pii_fd_hrtime > gethrtime()) {
1666*0Sstevel@tonic-gate 				if (debug & D_PROBE) {
1667*0Sstevel@tonic-gate 					logdebug("successful probe on %s; "
1668*0Sstevel@tonic-gate 					    "ending quiet period\n",
1669*0Sstevel@tonic-gate 					    pii->pii_phyint->pi_name);
1670*0Sstevel@tonic-gate 				}
1671*0Sstevel@tonic-gate 				pii->pii_fd_hrtime = gethrtime();
1672*0Sstevel@tonic-gate 			}
1673*0Sstevel@tonic-gate 			break;
1674*0Sstevel@tonic-gate 
1675*0Sstevel@tonic-gate 		case PR_UNACKED:
1676*0Sstevel@tonic-gate 			assert(cur_tg != NULL);
1677*0Sstevel@tonic-gate 			/*
1678*0Sstevel@tonic-gate 			 * The crtt could be zero for some reason,
1679*0Sstevel@tonic-gate 			 * Eg. the phyint could be failed. If the crtt is
1680*0Sstevel@tonic-gate 			 * not available use group's probe interval,
1681*0Sstevel@tonic-gate 			 * which is a worst case estimate.
1682*0Sstevel@tonic-gate 			 */
1683*0Sstevel@tonic-gate 			if (cur_tg->tg_crtt != 0) {
1684*0Sstevel@tonic-gate 				timeout = pr_statp->pr_time_sent +
1685*0Sstevel@tonic-gate 				    cur_tg->tg_crtt;
1686*0Sstevel@tonic-gate 			} else {
1687*0Sstevel@tonic-gate 				timeout = pr_statp->pr_time_sent +
1688*0Sstevel@tonic-gate 				    probe_interval;
1689*0Sstevel@tonic-gate 			}
1690*0Sstevel@tonic-gate 			if (TIME_LT(timeout, cur_time)) {
1691*0Sstevel@tonic-gate 				pr_statp->pr_status = PR_LOST;
1692*0Sstevel@tonic-gate 				pr_statp->pr_time_lost = timeout;
1693*0Sstevel@tonic-gate 			} else if (i == 1) {
1694*0Sstevel@tonic-gate 				/*
1695*0Sstevel@tonic-gate 				 * We are forced to consider this probe
1696*0Sstevel@tonic-gate 				 * lost, as we can have at most 2 unack.
1697*0Sstevel@tonic-gate 				 * probes any time, and we will be sending a
1698*0Sstevel@tonic-gate 				 * probe at the end of this function.
1699*0Sstevel@tonic-gate 				 * Normally, we should not be here, but
1700*0Sstevel@tonic-gate 				 * this can happen if an incoming response
1701*0Sstevel@tonic-gate 				 * that was considered lost has increased
1702*0Sstevel@tonic-gate 				 * the crtt for this target, and also bumped
1703*0Sstevel@tonic-gate 				 * up the FDT. Note that we never cancel or
1704*0Sstevel@tonic-gate 				 * increase the current pii_time_left, so
1705*0Sstevel@tonic-gate 				 * when the timer fires, we find 2 valid
1706*0Sstevel@tonic-gate 				 * unacked probes, and they are yet to timeout
1707*0Sstevel@tonic-gate 				 */
1708*0Sstevel@tonic-gate 				pr_statp->pr_status = PR_LOST;
1709*0Sstevel@tonic-gate 				pr_statp->pr_time_lost = cur_time;
1710*0Sstevel@tonic-gate 			} else {
1711*0Sstevel@tonic-gate 				/*
1712*0Sstevel@tonic-gate 				 * Only the most recent probe can enter
1713*0Sstevel@tonic-gate 				 * this 'else' arm. The second most recent
1714*0Sstevel@tonic-gate 				 * probe must take either of the above arms,
1715*0Sstevel@tonic-gate 				 * if it is unacked.
1716*0Sstevel@tonic-gate 				 */
1717*0Sstevel@tonic-gate 				valid_unack_count++;
1718*0Sstevel@tonic-gate 			}
1719*0Sstevel@tonic-gate 			break;
1720*0Sstevel@tonic-gate 		}
1721*0Sstevel@tonic-gate 		pr_ndx = PROBE_INDEX_PREV(pr_ndx);
1722*0Sstevel@tonic-gate 	}
1723*0Sstevel@tonic-gate 
1724*0Sstevel@tonic-gate 	/*
1725*0Sstevel@tonic-gate 	 * We send out 1 probe randomly in the interval between one half
1726*0Sstevel@tonic-gate 	 * and one probe interval for the group. Given that the CRTT is always
1727*0Sstevel@tonic-gate 	 * less than the group's probe interval, we can have at most 1
1728*0Sstevel@tonic-gate 	 * unacknowledged probe now.  All previous probes are either lost or
1729*0Sstevel@tonic-gate 	 * acked.
1730*0Sstevel@tonic-gate 	 */
1731*0Sstevel@tonic-gate 	assert(valid_unack_count == 0 || valid_unack_count == 1);
1732*0Sstevel@tonic-gate 
1733*0Sstevel@tonic-gate 	/*
1734*0Sstevel@tonic-gate 	 * The timer has fired. Take appropriate action depending
1735*0Sstevel@tonic-gate 	 * on the current state of the phyint.
1736*0Sstevel@tonic-gate 	 *
1737*0Sstevel@tonic-gate 	 * PI_RUNNING state 	- Failure detection and failover
1738*0Sstevel@tonic-gate 	 * PI_FAILED state 	- Repair detection and failback
1739*0Sstevel@tonic-gate 	 */
1740*0Sstevel@tonic-gate 	switch (pii->pii_phyint->pi_state) {
1741*0Sstevel@tonic-gate 	case PI_FAILED:
1742*0Sstevel@tonic-gate 		/*
1743*0Sstevel@tonic-gate 		 * If the most recent probe (excluding unacked probes that
1744*0Sstevel@tonic-gate 		 * are yet to time out) has been acked, check whether the
1745*0Sstevel@tonic-gate 		 * phyint is now repaired. If the phyint is repaired, then
1746*0Sstevel@tonic-gate 		 * attempt failback, unless it is an inactive standby.
1747*0Sstevel@tonic-gate 		 */
1748*0Sstevel@tonic-gate 		if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) {
1749*0Sstevel@tonic-gate 			phyint_check_for_repair(pii->pii_phyint);
1750*0Sstevel@tonic-gate 		}
1751*0Sstevel@tonic-gate 		break;
1752*0Sstevel@tonic-gate 
1753*0Sstevel@tonic-gate 	case PI_RUNNING:
1754*0Sstevel@tonic-gate 		/*
1755*0Sstevel@tonic-gate 		 * It's possible our probes have been lost because of a
1756*0Sstevel@tonic-gate 		 * spanning-tree mandated quiet period on the switch.  If so,
1757*0Sstevel@tonic-gate 		 * ignore the lost probes and consider the interface to still
1758*0Sstevel@tonic-gate 		 * be functioning.
1759*0Sstevel@tonic-gate 		 */
1760*0Sstevel@tonic-gate 		cur_hrtime = gethrtime();
1761*0Sstevel@tonic-gate 		if (pii->pii_fd_hrtime - cur_hrtime > 0)
1762*0Sstevel@tonic-gate 			break;
1763*0Sstevel@tonic-gate 
1764*0Sstevel@tonic-gate 		if (pii->pii_rack + valid_unack_count + 1 != pii->pii_snxt) {
1765*0Sstevel@tonic-gate 			/*
1766*0Sstevel@tonic-gate 			 * We have 1 or more failed probes (excluding unacked
1767*0Sstevel@tonic-gate 			 * probes that are yet to time out). Determine if the
1768*0Sstevel@tonic-gate 			 * phyint has failed. If so attempt a failover,
1769*0Sstevel@tonic-gate 			 * unless it is an inactive standby
1770*0Sstevel@tonic-gate 			 */
1771*0Sstevel@tonic-gate 			phyint_inst_check_for_failure(pii);
1772*0Sstevel@tonic-gate 		}
1773*0Sstevel@tonic-gate 		break;
1774*0Sstevel@tonic-gate 
1775*0Sstevel@tonic-gate 	default:
1776*0Sstevel@tonic-gate 		logerr("phyint_inst_timer: invalid state %d\n",
1777*0Sstevel@tonic-gate 		    pii->pii_phyint->pi_state);
1778*0Sstevel@tonic-gate 		abort();
1779*0Sstevel@tonic-gate 	}
1780*0Sstevel@tonic-gate 
1781*0Sstevel@tonic-gate 	/*
1782*0Sstevel@tonic-gate 	 * Start the next probe. probe() will also set pii->pii_probe_time_left
1783*0Sstevel@tonic-gate 	 * to the group's probe interval. If phyint_failed -> target_flush_hosts
1784*0Sstevel@tonic-gate 	 * was called, the target list may be empty.
1785*0Sstevel@tonic-gate 	 */
1786*0Sstevel@tonic-gate 	if (pii->pii_target_next != NULL) {
1787*0Sstevel@tonic-gate 		probe(pii, PROBE_UNI, cur_time);
1788*0Sstevel@tonic-gate 		/*
1789*0Sstevel@tonic-gate 		 * If we have just the one probe target, and we're not using
1790*0Sstevel@tonic-gate 		 * router targets, try to find another as we presently have
1791*0Sstevel@tonic-gate 		 * no resilience.
1792*0Sstevel@tonic-gate 		 */
1793*0Sstevel@tonic-gate 		if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1)
1794*0Sstevel@tonic-gate 			probe(pii, PROBE_MULTI, cur_time);
1795*0Sstevel@tonic-gate 	} else {
1796*0Sstevel@tonic-gate 		probe(pii, PROBE_MULTI, cur_time);
1797*0Sstevel@tonic-gate 	}
1798*0Sstevel@tonic-gate 	return (interval);
1799*0Sstevel@tonic-gate }
1800*0Sstevel@tonic-gate 
1801*0Sstevel@tonic-gate /*
1802*0Sstevel@tonic-gate  * Start the probe timer for an interface instance.
1803*0Sstevel@tonic-gate  */
1804*0Sstevel@tonic-gate void
1805*0Sstevel@tonic-gate start_timer(struct phyint_instance *pii)
1806*0Sstevel@tonic-gate {
1807*0Sstevel@tonic-gate 	uint32_t interval;
1808*0Sstevel@tonic-gate 
1809*0Sstevel@tonic-gate 	/*
1810*0Sstevel@tonic-gate 	 * Spread the base probe times (pi_snxt_basetime) across phyints
1811*0Sstevel@tonic-gate 	 * uniformly over the (curtime..curtime + the group's probe_interval).
1812*0Sstevel@tonic-gate 	 * pi_snxt_basetime is strictly periodic with a frequency of
1813*0Sstevel@tonic-gate 	 * the group's probe interval. The actual probe time pi_snxt_time
1814*0Sstevel@tonic-gate 	 * adds some randomness to pi_snxt_basetime and happens in probe().
1815*0Sstevel@tonic-gate 	 * For the 1st probe on each phyint after the timer is started,
1816*0Sstevel@tonic-gate 	 * pi_snxt_time and pi_snxt_basetime are the same.
1817*0Sstevel@tonic-gate 	 */
1818*0Sstevel@tonic-gate 	interval = GET_RANDOM(0,
1819*0Sstevel@tonic-gate 	    (int)pii->pii_phyint->pi_group->pg_probeint);
1820*0Sstevel@tonic-gate 
1821*0Sstevel@tonic-gate 	pii->pii_snxt_basetime = getcurrenttime() + interval;
1822*0Sstevel@tonic-gate 	pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
1823*0Sstevel@tonic-gate 	pii->pii_snxt_time = pii->pii_snxt_basetime;
1824*0Sstevel@tonic-gate 	timer_schedule(interval);
1825*0Sstevel@tonic-gate }
1826*0Sstevel@tonic-gate 
1827*0Sstevel@tonic-gate /*
1828*0Sstevel@tonic-gate  * Restart the probe timer on an interface instance.
1829*0Sstevel@tonic-gate  */
1830*0Sstevel@tonic-gate static void
1831*0Sstevel@tonic-gate restart_timer(struct phyint_instance *pii)
1832*0Sstevel@tonic-gate {
1833*0Sstevel@tonic-gate 	/*
1834*0Sstevel@tonic-gate 	 * We don't need to restart the timer if it was never started in
1835*0Sstevel@tonic-gate 	 * the first place (pii->pii_basetime_inited not set), as the timer
1836*0Sstevel@tonic-gate 	 * won't have gone off yet.
1837*0Sstevel@tonic-gate 	 */
1838*0Sstevel@tonic-gate 	if (pii->pii_basetime_inited != 0) {
1839*0Sstevel@tonic-gate 
1840*0Sstevel@tonic-gate 		if (debug & D_LINKNOTE)
1841*0Sstevel@tonic-gate 			logdebug("restart timer: restarting timer on %s, "
1842*0Sstevel@tonic-gate 			    "address family %s\n", pii->pii_phyint->pi_name,
1843*0Sstevel@tonic-gate 			    AF_STR(pii->pii_af));
1844*0Sstevel@tonic-gate 
1845*0Sstevel@tonic-gate 		start_timer(pii);
1846*0Sstevel@tonic-gate 	}
1847*0Sstevel@tonic-gate }
1848*0Sstevel@tonic-gate 
1849*0Sstevel@tonic-gate static void
1850*0Sstevel@tonic-gate process_link_state_down(struct phyint *pi)
1851*0Sstevel@tonic-gate {
1852*0Sstevel@tonic-gate 	logerr("The link has gone down on %s\n", pi->pi_name);
1853*0Sstevel@tonic-gate 
1854*0Sstevel@tonic-gate 	/*
1855*0Sstevel@tonic-gate 	 * Clear the probe statistics arrays, we don't want the repair
1856*0Sstevel@tonic-gate 	 * detection logic relying on probes that were succesful prior
1857*0Sstevel@tonic-gate 	 *  to the link going down.
1858*0Sstevel@tonic-gate 	 */
1859*0Sstevel@tonic-gate 	if (PROBE_CAPABLE(pi->pi_v4))
1860*0Sstevel@tonic-gate 		clear_pii_probe_stats(pi->pi_v4);
1861*0Sstevel@tonic-gate 	if (PROBE_CAPABLE(pi->pi_v6))
1862*0Sstevel@tonic-gate 		clear_pii_probe_stats(pi->pi_v6);
1863*0Sstevel@tonic-gate 	/*
1864*0Sstevel@tonic-gate 	 * Check for interface failure.  Although we know the interface
1865*0Sstevel@tonic-gate 	 * has failed, we don't know if all the other interfaces in the
1866*0Sstevel@tonic-gate 	 * group have failed as well.
1867*0Sstevel@tonic-gate 	 */
1868*0Sstevel@tonic-gate 	if ((pi->pi_state == PI_RUNNING) ||
1869*0Sstevel@tonic-gate 	    (pi->pi_state != PI_FAILED && !GROUP_FAILED(pi->pi_group))) {
1870*0Sstevel@tonic-gate 		if (debug & D_LINKNOTE) {
1871*0Sstevel@tonic-gate 			logdebug("process_link_state_down:"
1872*0Sstevel@tonic-gate 			    " checking for failure on %s\n", pi->pi_name);
1873*0Sstevel@tonic-gate 		}
1874*0Sstevel@tonic-gate 
1875*0Sstevel@tonic-gate 		if (pi->pi_v4 != NULL)
1876*0Sstevel@tonic-gate 			phyint_inst_check_for_failure(pi->pi_v4);
1877*0Sstevel@tonic-gate 		else if (pi->pi_v6 != NULL)
1878*0Sstevel@tonic-gate 			phyint_inst_check_for_failure(pi->pi_v6);
1879*0Sstevel@tonic-gate 	}
1880*0Sstevel@tonic-gate }
1881*0Sstevel@tonic-gate 
1882*0Sstevel@tonic-gate static void
1883*0Sstevel@tonic-gate process_link_state_up(struct phyint *pi)
1884*0Sstevel@tonic-gate {
1885*0Sstevel@tonic-gate 	logerr("The link has come up on %s\n", pi->pi_name);
1886*0Sstevel@tonic-gate 
1887*0Sstevel@tonic-gate 	/*
1888*0Sstevel@tonic-gate 	 * We stopped any running timers on each instance when the link
1889*0Sstevel@tonic-gate 	 * went down, so restart them.
1890*0Sstevel@tonic-gate 	 */
1891*0Sstevel@tonic-gate 	if (pi->pi_v4)
1892*0Sstevel@tonic-gate 		restart_timer(pi->pi_v4);
1893*0Sstevel@tonic-gate 	if (pi->pi_v6)
1894*0Sstevel@tonic-gate 		restart_timer(pi->pi_v6);
1895*0Sstevel@tonic-gate 
1896*0Sstevel@tonic-gate 	phyint_check_for_repair(pi);
1897*0Sstevel@tonic-gate 
1898*0Sstevel@tonic-gate 	pi->pi_whenup[pi->pi_whendx++] = getcurrenttime();
1899*0Sstevel@tonic-gate 	if (pi->pi_whendx == LINK_UP_PERMIN)
1900*0Sstevel@tonic-gate 		pi->pi_whendx = 0;
1901*0Sstevel@tonic-gate }
1902*0Sstevel@tonic-gate 
1903*0Sstevel@tonic-gate /*
1904*0Sstevel@tonic-gate  * Process any changes in link state passed up from the interfaces.
1905*0Sstevel@tonic-gate  */
1906*0Sstevel@tonic-gate void
1907*0Sstevel@tonic-gate process_link_state_changes(void)
1908*0Sstevel@tonic-gate {
1909*0Sstevel@tonic-gate 	struct phyint *pi;
1910*0Sstevel@tonic-gate 
1911*0Sstevel@tonic-gate 	/* Look for interfaces where the link state has just changed */
1912*0Sstevel@tonic-gate 
1913*0Sstevel@tonic-gate 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
1914*0Sstevel@tonic-gate 		boolean_t old_link_state_up = LINK_UP(pi);
1915*0Sstevel@tonic-gate 
1916*0Sstevel@tonic-gate 		/*
1917*0Sstevel@tonic-gate 		 * Except when the "phyint" structure is created, this is
1918*0Sstevel@tonic-gate 		 * the only place the link state is updated.  This allows
1919*0Sstevel@tonic-gate 		 * this routine to detect changes in link state, rather
1920*0Sstevel@tonic-gate 		 * than just the current state.
1921*0Sstevel@tonic-gate 		 */
1922*0Sstevel@tonic-gate 		UPDATE_LINK_STATE(pi);
1923*0Sstevel@tonic-gate 
1924*0Sstevel@tonic-gate 		if (LINK_DOWN(pi)) {
1925*0Sstevel@tonic-gate 			/*
1926*0Sstevel@tonic-gate 			 * Has link just gone down?
1927*0Sstevel@tonic-gate 			 */
1928*0Sstevel@tonic-gate 			if (old_link_state_up)
1929*0Sstevel@tonic-gate 				process_link_state_down(pi);
1930*0Sstevel@tonic-gate 		} else {
1931*0Sstevel@tonic-gate 			/*
1932*0Sstevel@tonic-gate 			 * Has link just gone back up?
1933*0Sstevel@tonic-gate 			 */
1934*0Sstevel@tonic-gate 			if (!old_link_state_up)
1935*0Sstevel@tonic-gate 				process_link_state_up(pi);
1936*0Sstevel@tonic-gate 		}
1937*0Sstevel@tonic-gate 	}
1938*0Sstevel@tonic-gate }
1939*0Sstevel@tonic-gate 
1940*0Sstevel@tonic-gate void
1941*0Sstevel@tonic-gate reset_crtt_all(struct phyint *pi)
1942*0Sstevel@tonic-gate {
1943*0Sstevel@tonic-gate 	struct phyint_instance *pii;
1944*0Sstevel@tonic-gate 	struct target *tg;
1945*0Sstevel@tonic-gate 
1946*0Sstevel@tonic-gate 	pii = pi->pi_v4;
1947*0Sstevel@tonic-gate 	if (pii != NULL) {
1948*0Sstevel@tonic-gate 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1949*0Sstevel@tonic-gate 			tg->tg_crtt = 0;
1950*0Sstevel@tonic-gate 			tg->tg_rtt_sa = -1;
1951*0Sstevel@tonic-gate 			tg->tg_rtt_sd = 0;
1952*0Sstevel@tonic-gate 		}
1953*0Sstevel@tonic-gate 	}
1954*0Sstevel@tonic-gate 
1955*0Sstevel@tonic-gate 	pii = pi->pi_v6;
1956*0Sstevel@tonic-gate 	if (pii != NULL) {
1957*0Sstevel@tonic-gate 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1958*0Sstevel@tonic-gate 			tg->tg_crtt = 0;
1959*0Sstevel@tonic-gate 			tg->tg_rtt_sa = -1;
1960*0Sstevel@tonic-gate 			tg->tg_rtt_sd = 0;
1961*0Sstevel@tonic-gate 		}
1962*0Sstevel@tonic-gate 	}
1963*0Sstevel@tonic-gate }
1964*0Sstevel@tonic-gate 
1965*0Sstevel@tonic-gate /*
1966*0Sstevel@tonic-gate  * Check if the phyint has failed the last NUM_PROBE_FAILS consecutive
1967*0Sstevel@tonic-gate  * probes on both instances IPv4 and IPv6.
1968*0Sstevel@tonic-gate  * If the interface has failed, return the time of the first probe failure
1969*0Sstevel@tonic-gate  * in "tff".
1970*0Sstevel@tonic-gate  */
1971*0Sstevel@tonic-gate static int
1972*0Sstevel@tonic-gate phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff)
1973*0Sstevel@tonic-gate {
1974*0Sstevel@tonic-gate 	uint_t	pi_tff;
1975*0Sstevel@tonic-gate 	struct	target *cur_tg;
1976*0Sstevel@tonic-gate 	struct	probe_fail_count pfinfo;
1977*0Sstevel@tonic-gate 	struct	phyint_instance *pii_other;
1978*0Sstevel@tonic-gate 	int	pr_ndx;
1979*0Sstevel@tonic-gate 
1980*0Sstevel@tonic-gate 	/*
1981*0Sstevel@tonic-gate 	 * Get the number of consecutive failed probes on
1982*0Sstevel@tonic-gate 	 * this phyint across all targets. Also get the number
1983*0Sstevel@tonic-gate 	 * of consecutive failed probes on this target only
1984*0Sstevel@tonic-gate 	 */
1985*0Sstevel@tonic-gate 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
1986*0Sstevel@tonic-gate 	cur_tg = pii->pii_probes[pr_ndx].pr_target;
1987*0Sstevel@tonic-gate 	probe_fail_info(pii, cur_tg, &pfinfo);
1988*0Sstevel@tonic-gate 
1989*0Sstevel@tonic-gate 	/* Get the time of first failure, for later use */
1990*0Sstevel@tonic-gate 	pi_tff = pfinfo.pf_tff;
1991*0Sstevel@tonic-gate 
1992*0Sstevel@tonic-gate 	/*
1993*0Sstevel@tonic-gate 	 * If the current target has not responded to the
1994*0Sstevel@tonic-gate 	 * last NUM_PROBE_FAILS probes, and other targets are
1995*0Sstevel@tonic-gate 	 * responding delete this target. Dead gateway detection
1996*0Sstevel@tonic-gate 	 * will eventually remove this target (if router) from the
1997*0Sstevel@tonic-gate 	 * routing tables. If that does not occur, we may end
1998*0Sstevel@tonic-gate 	 * up adding this to our list again.
1999*0Sstevel@tonic-gate 	 */
2000*0Sstevel@tonic-gate 	if (pfinfo.pf_nfail < NUM_PROBE_FAILS &&
2001*0Sstevel@tonic-gate 	    pfinfo.pf_nfail_tg >= NUM_PROBE_FAILS) {
2002*0Sstevel@tonic-gate 		if (pii->pii_targets_are_routers) {
2003*0Sstevel@tonic-gate 			if (cur_tg->tg_status == TG_ACTIVE)
2004*0Sstevel@tonic-gate 				pii->pii_ntargets--;
2005*0Sstevel@tonic-gate 			cur_tg->tg_status = TG_DEAD;
2006*0Sstevel@tonic-gate 			cur_tg->tg_crtt = 0;
2007*0Sstevel@tonic-gate 			cur_tg->tg_rtt_sa = -1;
2008*0Sstevel@tonic-gate 			cur_tg->tg_rtt_sd = 0;
2009*0Sstevel@tonic-gate 			if (pii->pii_target_next == cur_tg)
2010*0Sstevel@tonic-gate 				pii->pii_target_next = target_next(cur_tg);
2011*0Sstevel@tonic-gate 		} else {
2012*0Sstevel@tonic-gate 			target_delete(cur_tg);
2013*0Sstevel@tonic-gate 			probe(pii, PROBE_MULTI, getcurrenttime());
2014*0Sstevel@tonic-gate 		}
2015*0Sstevel@tonic-gate 		return (PHYINT_OK);
2016*0Sstevel@tonic-gate 	}
2017*0Sstevel@tonic-gate 
2018*0Sstevel@tonic-gate 	/*
2019*0Sstevel@tonic-gate 	 * If the phyint has lost NUM_PROBE_FAILS or more
2020*0Sstevel@tonic-gate 	 * consecutive probes, on both IPv4 and IPv6 protocol
2021*0Sstevel@tonic-gate 	 * instances of the phyint, then trigger failure
2022*0Sstevel@tonic-gate 	 * detection, else return false
2023*0Sstevel@tonic-gate 	 */
2024*0Sstevel@tonic-gate 	if (pfinfo.pf_nfail < NUM_PROBE_FAILS)
2025*0Sstevel@tonic-gate 		return (PHYINT_OK);
2026*0Sstevel@tonic-gate 
2027*0Sstevel@tonic-gate 	pii_other = phyint_inst_other(pii);
2028*0Sstevel@tonic-gate 	if (PROBE_CAPABLE(pii_other)) {
2029*0Sstevel@tonic-gate 		probe_fail_info(pii_other, NULL, &pfinfo);
2030*0Sstevel@tonic-gate 		if (pfinfo.pf_nfail >= NUM_PROBE_FAILS) {
2031*0Sstevel@tonic-gate 			/*
2032*0Sstevel@tonic-gate 			 * We have NUM_PROBE_FAILS or more failures
2033*0Sstevel@tonic-gate 			 * on both IPv4 and IPv6. Get the earliest
2034*0Sstevel@tonic-gate 			 * time when failure was detected on this
2035*0Sstevel@tonic-gate 			 * phyint across IPv4 and IPv6.
2036*0Sstevel@tonic-gate 			 */
2037*0Sstevel@tonic-gate 			if (TIME_LT(pfinfo.pf_tff, pi_tff))
2038*0Sstevel@tonic-gate 				pi_tff = pfinfo.pf_tff;
2039*0Sstevel@tonic-gate 		} else {
2040*0Sstevel@tonic-gate 			/*
2041*0Sstevel@tonic-gate 			 * This instance has < NUM_PROBE_FAILS failure.
2042*0Sstevel@tonic-gate 			 * So return false
2043*0Sstevel@tonic-gate 			 */
2044*0Sstevel@tonic-gate 			return (PHYINT_OK);
2045*0Sstevel@tonic-gate 		}
2046*0Sstevel@tonic-gate 	}
2047*0Sstevel@tonic-gate 	*tff = pi_tff;
2048*0Sstevel@tonic-gate 	return (PHYINT_FAILURE);
2049*0Sstevel@tonic-gate }
2050*0Sstevel@tonic-gate 
2051*0Sstevel@tonic-gate /*
2052*0Sstevel@tonic-gate  * Check if the link has gone down on this phyint, or it has failed the
2053*0Sstevel@tonic-gate  * last NUM_PROBE_FAILS consecutive probes on both instances IPv4 and IPv6.
2054*0Sstevel@tonic-gate  * Also look at other phyints of this group, for group failures.
2055*0Sstevel@tonic-gate  */
2056*0Sstevel@tonic-gate int
2057*0Sstevel@tonic-gate failure_state(struct phyint_instance *pii)
2058*0Sstevel@tonic-gate {
2059*0Sstevel@tonic-gate 	struct	probe_success_count psinfo;
2060*0Sstevel@tonic-gate 	uint_t	pi2_tls;		/* time last success */
2061*0Sstevel@tonic-gate 	uint_t	pi_tff;			/* time first fail */
2062*0Sstevel@tonic-gate 	struct	phyint	*pi2;
2063*0Sstevel@tonic-gate 	struct	phyint *pi;
2064*0Sstevel@tonic-gate 	struct	phyint_instance *pii2;
2065*0Sstevel@tonic-gate 	struct  phyint_group *pg;
2066*0Sstevel@tonic-gate 	boolean_t alone;
2067*0Sstevel@tonic-gate 
2068*0Sstevel@tonic-gate 	if (debug & D_FAILOVER)
2069*0Sstevel@tonic-gate 		logdebug("phyint_failed(%s)\n", pii->pii_name);
2070*0Sstevel@tonic-gate 
2071*0Sstevel@tonic-gate 	pi = pii->pii_phyint;
2072*0Sstevel@tonic-gate 	pg = pi->pi_group;
2073*0Sstevel@tonic-gate 
2074*0Sstevel@tonic-gate 	if (LINK_UP(pi) && phyint_inst_probe_failure_state(pii, &pi_tff) ==
2075*0Sstevel@tonic-gate 		PHYINT_OK)
2076*0Sstevel@tonic-gate 		return (PHYINT_OK);
2077*0Sstevel@tonic-gate 
2078*0Sstevel@tonic-gate 	/*
2079*0Sstevel@tonic-gate 	 * At this point, the link is down, or the phyint is suspect,
2080*0Sstevel@tonic-gate 	 * as it has lost NUM_PROBE_FAILS or more probes. If the phyint
2081*0Sstevel@tonic-gate 	 * does not belong to any group, or is the only member of the
2082*0Sstevel@tonic-gate 	 * group capable of being probed, return PHYINT_FAILURE.
2083*0Sstevel@tonic-gate 	 */
2084*0Sstevel@tonic-gate 	alone = _B_TRUE;
2085*0Sstevel@tonic-gate 	if (pg != phyint_anongroup) {
2086*0Sstevel@tonic-gate 		for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
2087*0Sstevel@tonic-gate 			if (pi2 == pi)
2088*0Sstevel@tonic-gate 				continue;
2089*0Sstevel@tonic-gate 			if (PROBE_CAPABLE(pi2->pi_v4) ||
2090*0Sstevel@tonic-gate 			    PROBE_CAPABLE(pi2->pi_v6)) {
2091*0Sstevel@tonic-gate 				alone = _B_FALSE;
2092*0Sstevel@tonic-gate 				break;
2093*0Sstevel@tonic-gate 			}
2094*0Sstevel@tonic-gate 		}
2095*0Sstevel@tonic-gate 	}
2096*0Sstevel@tonic-gate 	if (alone)
2097*0Sstevel@tonic-gate 		return (PHYINT_FAILURE);
2098*0Sstevel@tonic-gate 
2099*0Sstevel@tonic-gate 	/*
2100*0Sstevel@tonic-gate 	 * Need to compare against other phyints of the same group
2101*0Sstevel@tonic-gate 	 * to exclude group failures. If the failure was detected via
2102*0Sstevel@tonic-gate 	 * probing, then if the time of last success (tls) of any
2103*0Sstevel@tonic-gate 	 * phyint is more recent than the time of first fail (tff) of the
2104*0Sstevel@tonic-gate 	 * phyint in question, and the link is up on the phyint,
2105*0Sstevel@tonic-gate 	 * then it is a phyint failure. Otherwise it is a group failure.
2106*0Sstevel@tonic-gate 	 * If failure was detected via a link down notification sent from
2107*0Sstevel@tonic-gate 	 * the driver to IP, we see if any phyints in the group are still
2108*0Sstevel@tonic-gate 	 * running and haven't received a link down notification.  We
2109*0Sstevel@tonic-gate 	 * will usually be processing the link down notification shortly
2110*0Sstevel@tonic-gate 	 * after it was received, so there is no point looking at the tls
2111*0Sstevel@tonic-gate 	 * of other phyints.
2112*0Sstevel@tonic-gate 	 */
2113*0Sstevel@tonic-gate 	for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
2114*0Sstevel@tonic-gate 		/* Exclude ourself from comparison */
2115*0Sstevel@tonic-gate 		if (pi2 == pi)
2116*0Sstevel@tonic-gate 			continue;
2117*0Sstevel@tonic-gate 
2118*0Sstevel@tonic-gate 		if (LINK_DOWN(pi)) {
2119*0Sstevel@tonic-gate 			/*
2120*0Sstevel@tonic-gate 			 * We use FLAGS_TO_LINK_STATE() to test the
2121*0Sstevel@tonic-gate 			 * flags directly, rather then LINK_UP() or
2122*0Sstevel@tonic-gate 			 * LINK_DOWN(), as we may not have got round
2123*0Sstevel@tonic-gate 			 * to processing the link state for the other
2124*0Sstevel@tonic-gate 			 * phyints in the group yet.
2125*0Sstevel@tonic-gate 			 *
2126*0Sstevel@tonic-gate 			 * The check for PI_RUNNING and group
2127*0Sstevel@tonic-gate 			 * failure handles the case when the
2128*0Sstevel@tonic-gate 			 * group begins to recover.  The first
2129*0Sstevel@tonic-gate 			 * phyint to recover should not trigger
2130*0Sstevel@tonic-gate 			 * a failover from the soon-to-recover
2131*0Sstevel@tonic-gate 			 * other phyints to the first recovered
2132*0Sstevel@tonic-gate 			 * phyint. PI_RUNNING will be set, and
2133*0Sstevel@tonic-gate 			 * pg_groupfailed cleared only after
2134*0Sstevel@tonic-gate 			 * receipt of NUM_PROBE_REPAIRS, by
2135*0Sstevel@tonic-gate 			 * which time the other phyints should
2136*0Sstevel@tonic-gate 			 * have received at least 1 packet,
2137*0Sstevel@tonic-gate 			 * and so will not have NUM_PROBE_FAILS.
2138*0Sstevel@tonic-gate 			 */
2139*0Sstevel@tonic-gate 			if ((pi2->pi_state == PI_RUNNING) &&
2140*0Sstevel@tonic-gate 			    !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2))
2141*0Sstevel@tonic-gate 				return (PHYINT_FAILURE);
2142*0Sstevel@tonic-gate 		} else {
2143*0Sstevel@tonic-gate 			/*
2144*0Sstevel@tonic-gate 			 * Need to compare against both IPv4 and
2145*0Sstevel@tonic-gate 			 * IPv6 instances.
2146*0Sstevel@tonic-gate 			 */
2147*0Sstevel@tonic-gate 			pii2 = pi2->pi_v4;
2148*0Sstevel@tonic-gate 			if (pii2 != NULL) {
2149*0Sstevel@tonic-gate 				probe_success_info(pii2, NULL, &psinfo);
2150*0Sstevel@tonic-gate 				if (psinfo.ps_tls_valid) {
2151*0Sstevel@tonic-gate 					pi2_tls = psinfo.ps_tls;
2152*0Sstevel@tonic-gate 					/*
2153*0Sstevel@tonic-gate 					 * See comment above regarding check
2154*0Sstevel@tonic-gate 					 * for PI_RUNNING and group failure.
2155*0Sstevel@tonic-gate 					 */
2156*0Sstevel@tonic-gate 					if (TIME_GT(pi2_tls, pi_tff) &&
2157*0Sstevel@tonic-gate 					    (pi2->pi_state == PI_RUNNING) &&
2158*0Sstevel@tonic-gate 					    !GROUP_FAILED(pg) &&
2159*0Sstevel@tonic-gate 					    FLAGS_TO_LINK_STATE(pi2))
2160*0Sstevel@tonic-gate 						return (PHYINT_FAILURE);
2161*0Sstevel@tonic-gate 				}
2162*0Sstevel@tonic-gate 			}
2163*0Sstevel@tonic-gate 
2164*0Sstevel@tonic-gate 			pii2 = pi2->pi_v6;
2165*0Sstevel@tonic-gate 			if (pii2 != NULL) {
2166*0Sstevel@tonic-gate 				probe_success_info(pii2, NULL, &psinfo);
2167*0Sstevel@tonic-gate 				if (psinfo.ps_tls_valid) {
2168*0Sstevel@tonic-gate 					pi2_tls = psinfo.ps_tls;
2169*0Sstevel@tonic-gate 					/*
2170*0Sstevel@tonic-gate 					 * See comment above regarding check
2171*0Sstevel@tonic-gate 					 * for PI_RUNNING and group failure.
2172*0Sstevel@tonic-gate 					 */
2173*0Sstevel@tonic-gate 					if (TIME_GT(pi2_tls, pi_tff) &&
2174*0Sstevel@tonic-gate 					    (pi2->pi_state == PI_RUNNING) &&
2175*0Sstevel@tonic-gate 					    !GROUP_FAILED(pg) &&
2176*0Sstevel@tonic-gate 					    FLAGS_TO_LINK_STATE(pi2))
2177*0Sstevel@tonic-gate 						return (PHYINT_FAILURE);
2178*0Sstevel@tonic-gate 				}
2179*0Sstevel@tonic-gate 			}
2180*0Sstevel@tonic-gate 		}
2181*0Sstevel@tonic-gate 	}
2182*0Sstevel@tonic-gate 
2183*0Sstevel@tonic-gate 	/*
2184*0Sstevel@tonic-gate 	 * Change the group state to PG_FAILED if it's not already.
2185*0Sstevel@tonic-gate 	 */
2186*0Sstevel@tonic-gate 	if (!GROUP_FAILED(pg))
2187*0Sstevel@tonic-gate 		phyint_group_chstate(pg, PG_FAILED);
2188*0Sstevel@tonic-gate 
2189*0Sstevel@tonic-gate 	return (GROUP_FAILURE);
2190*0Sstevel@tonic-gate }
2191*0Sstevel@tonic-gate 
2192*0Sstevel@tonic-gate /*
2193*0Sstevel@tonic-gate  * Return the information associated with consecutive probe successes
2194*0Sstevel@tonic-gate  * starting with the most recent probe. At most the last 2 probes can be
2195*0Sstevel@tonic-gate  * in the unacknowledged state. All previous probes have either failed
2196*0Sstevel@tonic-gate  * or succeeded.
2197*0Sstevel@tonic-gate  */
2198*0Sstevel@tonic-gate static void
2199*0Sstevel@tonic-gate probe_success_info(struct phyint_instance *pii, struct target *cur_tg,
2200*0Sstevel@tonic-gate     struct probe_success_count *psinfo)
2201*0Sstevel@tonic-gate {
2202*0Sstevel@tonic-gate 	uint_t	i;
2203*0Sstevel@tonic-gate 	struct probe_stats *pr_statp;
2204*0Sstevel@tonic-gate 	uint_t most_recent;
2205*0Sstevel@tonic-gate 	uint_t second_most_recent;
2206*0Sstevel@tonic-gate 	boolean_t pi_found_failure = _B_FALSE;
2207*0Sstevel@tonic-gate 	boolean_t tg_found_failure = _B_FALSE;
2208*0Sstevel@tonic-gate 	uint_t now;
2209*0Sstevel@tonic-gate 	uint_t timeout;
2210*0Sstevel@tonic-gate 	struct target *tg;
2211*0Sstevel@tonic-gate 
2212*0Sstevel@tonic-gate 	if (debug & D_FAILOVER)
2213*0Sstevel@tonic-gate 		logdebug("probe_success_info(%s)\n", pii->pii_name);
2214*0Sstevel@tonic-gate 
2215*0Sstevel@tonic-gate 	bzero(psinfo, sizeof (*psinfo));
2216*0Sstevel@tonic-gate 	now = getcurrenttime();
2217*0Sstevel@tonic-gate 
2218*0Sstevel@tonic-gate 	/*
2219*0Sstevel@tonic-gate 	 * Start with the most recent probe, and count the number
2220*0Sstevel@tonic-gate 	 * of consecutive probe successes. Latch the number of successes
2221*0Sstevel@tonic-gate 	 * on hitting a failure.
2222*0Sstevel@tonic-gate 	 */
2223*0Sstevel@tonic-gate 	most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
2224*0Sstevel@tonic-gate 	second_most_recent = PROBE_INDEX_PREV(most_recent);
2225*0Sstevel@tonic-gate 
2226*0Sstevel@tonic-gate 	for (i = most_recent; i != pii->pii_probe_next;
2227*0Sstevel@tonic-gate 	    i = PROBE_INDEX_PREV(i)) {
2228*0Sstevel@tonic-gate 		pr_statp = &pii->pii_probes[i];
2229*0Sstevel@tonic-gate 
2230*0Sstevel@tonic-gate 		switch (pr_statp->pr_status) {
2231*0Sstevel@tonic-gate 		case PR_UNACKED:
2232*0Sstevel@tonic-gate 			/*
2233*0Sstevel@tonic-gate 			 * Only the most recent 2 probes can be unacknowledged
2234*0Sstevel@tonic-gate 			 */
2235*0Sstevel@tonic-gate 			assert(i == most_recent || i == second_most_recent);
2236*0Sstevel@tonic-gate 
2237*0Sstevel@tonic-gate 			tg = pr_statp->pr_target;
2238*0Sstevel@tonic-gate 			assert(tg != NULL);
2239*0Sstevel@tonic-gate 			/*
2240*0Sstevel@tonic-gate 			 * The crtt could be zero for some reason,
2241*0Sstevel@tonic-gate 			 * Eg. the phyint could be failed. If the crtt is
2242*0Sstevel@tonic-gate 			 * not available use the value of the group's probe
2243*0Sstevel@tonic-gate 			 * interval which is a worst case estimate.
2244*0Sstevel@tonic-gate 			 */
2245*0Sstevel@tonic-gate 			if (tg->tg_crtt != 0) {
2246*0Sstevel@tonic-gate 				timeout = pr_statp->pr_time_sent + tg->tg_crtt;
2247*0Sstevel@tonic-gate 			} else {
2248*0Sstevel@tonic-gate 				timeout = pr_statp->pr_time_sent +
2249*0Sstevel@tonic-gate 				    pii->pii_phyint->pi_group->pg_probeint;
2250*0Sstevel@tonic-gate 			}
2251*0Sstevel@tonic-gate 
2252*0Sstevel@tonic-gate 			if (TIME_LT(timeout, now)) {
2253*0Sstevel@tonic-gate 				/*
2254*0Sstevel@tonic-gate 				 * We hit a failure. Latch the total number of
2255*0Sstevel@tonic-gate 				 * recent consecutive successes.
2256*0Sstevel@tonic-gate 				 */
2257*0Sstevel@tonic-gate 				pr_statp->pr_time_lost = timeout;
2258*0Sstevel@tonic-gate 				pr_statp->pr_status = PR_LOST;
2259*0Sstevel@tonic-gate 				pi_found_failure = _B_TRUE;
2260*0Sstevel@tonic-gate 				if (cur_tg != NULL && tg == cur_tg) {
2261*0Sstevel@tonic-gate 					/*
2262*0Sstevel@tonic-gate 					 * We hit a failure for the desired
2263*0Sstevel@tonic-gate 					 * target. Latch the number of recent
2264*0Sstevel@tonic-gate 					 * consecutive successes for this target
2265*0Sstevel@tonic-gate 					 */
2266*0Sstevel@tonic-gate 					tg_found_failure = _B_TRUE;
2267*0Sstevel@tonic-gate 				}
2268*0Sstevel@tonic-gate 			}
2269*0Sstevel@tonic-gate 			break;
2270*0Sstevel@tonic-gate 
2271*0Sstevel@tonic-gate 		case PR_ACKED:
2272*0Sstevel@tonic-gate 			/*
2273*0Sstevel@tonic-gate 			 * Bump up the count of probe successes, if we
2274*0Sstevel@tonic-gate 			 * have not seen any failure so far.
2275*0Sstevel@tonic-gate 			 */
2276*0Sstevel@tonic-gate 			if (!pi_found_failure)
2277*0Sstevel@tonic-gate 				psinfo->ps_nsucc++;
2278*0Sstevel@tonic-gate 
2279*0Sstevel@tonic-gate 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
2280*0Sstevel@tonic-gate 			    !tg_found_failure) {
2281*0Sstevel@tonic-gate 				psinfo->ps_nsucc_tg++;
2282*0Sstevel@tonic-gate 			}
2283*0Sstevel@tonic-gate 
2284*0Sstevel@tonic-gate 			/*
2285*0Sstevel@tonic-gate 			 * Record the time of last success, if this is
2286*0Sstevel@tonic-gate 			 * the most recent probe success.
2287*0Sstevel@tonic-gate 			 */
2288*0Sstevel@tonic-gate 			if (!psinfo->ps_tls_valid) {
2289*0Sstevel@tonic-gate 				psinfo->ps_tls = pr_statp->pr_time_acked;
2290*0Sstevel@tonic-gate 				psinfo->ps_tls_valid = _B_TRUE;
2291*0Sstevel@tonic-gate 			}
2292*0Sstevel@tonic-gate 			break;
2293*0Sstevel@tonic-gate 
2294*0Sstevel@tonic-gate 		case PR_LOST:
2295*0Sstevel@tonic-gate 			/*
2296*0Sstevel@tonic-gate 			 * We hit a failure. Latch the total number of
2297*0Sstevel@tonic-gate 			 * recent consecutive successes.
2298*0Sstevel@tonic-gate 			 */
2299*0Sstevel@tonic-gate 			pi_found_failure = _B_TRUE;
2300*0Sstevel@tonic-gate 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
2301*0Sstevel@tonic-gate 				/*
2302*0Sstevel@tonic-gate 				 * We hit a failure for the desired target.
2303*0Sstevel@tonic-gate 				 * Latch the number of recent consecutive
2304*0Sstevel@tonic-gate 				 * successes for this target
2305*0Sstevel@tonic-gate 				 */
2306*0Sstevel@tonic-gate 				tg_found_failure = _B_TRUE;
2307*0Sstevel@tonic-gate 			}
2308*0Sstevel@tonic-gate 			break;
2309*0Sstevel@tonic-gate 
2310*0Sstevel@tonic-gate 		default:
2311*0Sstevel@tonic-gate 			return;
2312*0Sstevel@tonic-gate 
2313*0Sstevel@tonic-gate 		}
2314*0Sstevel@tonic-gate 	}
2315*0Sstevel@tonic-gate }
2316*0Sstevel@tonic-gate 
2317*0Sstevel@tonic-gate /*
2318*0Sstevel@tonic-gate  * Return the information associated with consecutive probe failures
2319*0Sstevel@tonic-gate  * starting with the most recent probe. Only the last 2 probes can be in the
2320*0Sstevel@tonic-gate  * unacknowledged state. All previous probes have either failed or succeeded.
2321*0Sstevel@tonic-gate  */
2322*0Sstevel@tonic-gate static void
2323*0Sstevel@tonic-gate probe_fail_info(struct phyint_instance *pii, struct target *cur_tg,
2324*0Sstevel@tonic-gate     struct probe_fail_count *pfinfo)
2325*0Sstevel@tonic-gate {
2326*0Sstevel@tonic-gate 	int	i;
2327*0Sstevel@tonic-gate 	struct probe_stats *pr_statp;
2328*0Sstevel@tonic-gate 	boolean_t	tg_found_success = _B_FALSE;
2329*0Sstevel@tonic-gate 	boolean_t	pi_found_success = _B_FALSE;
2330*0Sstevel@tonic-gate 	int	most_recent;
2331*0Sstevel@tonic-gate 	int	second_most_recent;
2332*0Sstevel@tonic-gate 	uint_t	now;
2333*0Sstevel@tonic-gate 	uint_t	timeout;
2334*0Sstevel@tonic-gate 	struct	target *tg;
2335*0Sstevel@tonic-gate 
2336*0Sstevel@tonic-gate 	if (debug & D_FAILOVER)
2337*0Sstevel@tonic-gate 		logdebug("probe_fail_info(%s)\n", pii->pii_name);
2338*0Sstevel@tonic-gate 
2339*0Sstevel@tonic-gate 	bzero(pfinfo, sizeof (*pfinfo));
2340*0Sstevel@tonic-gate 	now = getcurrenttime();
2341*0Sstevel@tonic-gate 
2342*0Sstevel@tonic-gate 	/*
2343*0Sstevel@tonic-gate 	 * Start with the most recent probe, and count the number
2344*0Sstevel@tonic-gate 	 * of consecutive probe failures. Latch the number of failures
2345*0Sstevel@tonic-gate 	 * on hitting a probe success.
2346*0Sstevel@tonic-gate 	 */
2347*0Sstevel@tonic-gate 	most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
2348*0Sstevel@tonic-gate 	second_most_recent = PROBE_INDEX_PREV(most_recent);
2349*0Sstevel@tonic-gate 
2350*0Sstevel@tonic-gate 	for (i = most_recent; i != pii->pii_probe_next;
2351*0Sstevel@tonic-gate 	    i = PROBE_INDEX_PREV(i)) {
2352*0Sstevel@tonic-gate 		pr_statp = &pii->pii_probes[i];
2353*0Sstevel@tonic-gate 
2354*0Sstevel@tonic-gate 		assert(PR_STATUS_VALID(pr_statp->pr_status));
2355*0Sstevel@tonic-gate 
2356*0Sstevel@tonic-gate 		switch (pr_statp->pr_status) {
2357*0Sstevel@tonic-gate 		case PR_UNACKED:
2358*0Sstevel@tonic-gate 			/*
2359*0Sstevel@tonic-gate 			 * Only the most recent 2 probes can be unacknowledged
2360*0Sstevel@tonic-gate 			 */
2361*0Sstevel@tonic-gate 			assert(i == most_recent || i == second_most_recent);
2362*0Sstevel@tonic-gate 
2363*0Sstevel@tonic-gate 			tg = pr_statp->pr_target;
2364*0Sstevel@tonic-gate 			/*
2365*0Sstevel@tonic-gate 			 * Target is guaranteed to exist in the unack. state
2366*0Sstevel@tonic-gate 			 */
2367*0Sstevel@tonic-gate 			assert(tg != NULL);
2368*0Sstevel@tonic-gate 			/*
2369*0Sstevel@tonic-gate 			 * The crtt could be zero for some reason,
2370*0Sstevel@tonic-gate 			 * Eg. the phyint could be failed. If the crtt is
2371*0Sstevel@tonic-gate 			 * not available use the group's probe interval,
2372*0Sstevel@tonic-gate 			 * which is a worst case estimate.
2373*0Sstevel@tonic-gate 			 */
2374*0Sstevel@tonic-gate 			if (tg->tg_crtt != 0) {
2375*0Sstevel@tonic-gate 				timeout = pr_statp->pr_time_sent + tg->tg_crtt;
2376*0Sstevel@tonic-gate 			} else {
2377*0Sstevel@tonic-gate 				timeout = pr_statp->pr_time_sent +
2378*0Sstevel@tonic-gate 				    pii->pii_phyint->pi_group->pg_probeint;
2379*0Sstevel@tonic-gate 			}
2380*0Sstevel@tonic-gate 
2381*0Sstevel@tonic-gate 			if (TIME_GT(timeout, now))
2382*0Sstevel@tonic-gate 				break;
2383*0Sstevel@tonic-gate 
2384*0Sstevel@tonic-gate 			pr_statp->pr_time_lost = timeout;
2385*0Sstevel@tonic-gate 			pr_statp->pr_status = PR_LOST;
2386*0Sstevel@tonic-gate 			/* FALLTHRU */
2387*0Sstevel@tonic-gate 
2388*0Sstevel@tonic-gate 		case PR_LOST:
2389*0Sstevel@tonic-gate 			if (!pi_found_success) {
2390*0Sstevel@tonic-gate 				pfinfo->pf_nfail++;
2391*0Sstevel@tonic-gate 				pfinfo->pf_tff = pr_statp->pr_time_lost;
2392*0Sstevel@tonic-gate 			}
2393*0Sstevel@tonic-gate 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
2394*0Sstevel@tonic-gate 			    !tg_found_success)  {
2395*0Sstevel@tonic-gate 				pfinfo->pf_nfail_tg++;
2396*0Sstevel@tonic-gate 			}
2397*0Sstevel@tonic-gate 			break;
2398*0Sstevel@tonic-gate 
2399*0Sstevel@tonic-gate 		default:
2400*0Sstevel@tonic-gate 			/*
2401*0Sstevel@tonic-gate 			 * We hit a success or unused slot. Latch the
2402*0Sstevel@tonic-gate 			 * total number of recent consecutive failures.
2403*0Sstevel@tonic-gate 			 */
2404*0Sstevel@tonic-gate 			pi_found_success = _B_TRUE;
2405*0Sstevel@tonic-gate 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
2406*0Sstevel@tonic-gate 				/*
2407*0Sstevel@tonic-gate 				 * We hit a success for the desired target.
2408*0Sstevel@tonic-gate 				 * Latch the number of recent consecutive
2409*0Sstevel@tonic-gate 				 * failures for this target
2410*0Sstevel@tonic-gate 				 */
2411*0Sstevel@tonic-gate 				tg_found_success = _B_TRUE;
2412*0Sstevel@tonic-gate 			}
2413*0Sstevel@tonic-gate 		}
2414*0Sstevel@tonic-gate 	}
2415*0Sstevel@tonic-gate }
2416*0Sstevel@tonic-gate 
2417*0Sstevel@tonic-gate /*
2418*0Sstevel@tonic-gate  * Check if the phyint has been repaired.  If no test address has been
2419*0Sstevel@tonic-gate  * configured, then consider the interface repaired if the link is up (unless
2420*0Sstevel@tonic-gate  * the link is flapping; see below).  Otherwise, look for proof of probes
2421*0Sstevel@tonic-gate  * being sent and received. If last NUM_PROBE_REPAIRS probes are fine on
2422*0Sstevel@tonic-gate  * either IPv4 or IPv6 instance, the phyint can be considered repaired.
2423*0Sstevel@tonic-gate  */
2424*0Sstevel@tonic-gate static boolean_t
2425*0Sstevel@tonic-gate phyint_repaired(struct phyint *pi)
2426*0Sstevel@tonic-gate {
2427*0Sstevel@tonic-gate 	struct	probe_success_count psinfo;
2428*0Sstevel@tonic-gate 	struct	phyint_instance *pii;
2429*0Sstevel@tonic-gate 	struct	target *cur_tg;
2430*0Sstevel@tonic-gate 	int	pr_ndx;
2431*0Sstevel@tonic-gate 	uint_t	cur_time;
2432*0Sstevel@tonic-gate 
2433*0Sstevel@tonic-gate 	if (debug & D_FAILOVER)
2434*0Sstevel@tonic-gate 		logdebug("phyint_repaired(%s)\n", pi->pi_name);
2435*0Sstevel@tonic-gate 
2436*0Sstevel@tonic-gate 	if (LINK_DOWN(pi))
2437*0Sstevel@tonic-gate 		return (_B_FALSE);
2438*0Sstevel@tonic-gate 
2439*0Sstevel@tonic-gate 	/*
2440*0Sstevel@tonic-gate 	 * If we don't have any test addresses and the link is up, then
2441*0Sstevel@tonic-gate 	 * consider the interface repaired, unless we've received more than
2442*0Sstevel@tonic-gate 	 * LINK_UP_PERMIN link up notifications in the last minute, in
2443*0Sstevel@tonic-gate 	 * which case we keep the link down until we drop back below
2444*0Sstevel@tonic-gate 	 * the threshold.
2445*0Sstevel@tonic-gate 	 */
2446*0Sstevel@tonic-gate 	if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) {
2447*0Sstevel@tonic-gate 		cur_time = getcurrenttime();
2448*0Sstevel@tonic-gate 		if ((pi->pi_whenup[pi->pi_whendx] == 0 ||
2449*0Sstevel@tonic-gate 		    (cur_time - pi->pi_whenup[pi->pi_whendx]) > MSEC_PERMIN)) {
2450*0Sstevel@tonic-gate 			pi->pi_lfmsg_printed = 0;
2451*0Sstevel@tonic-gate 			return (_B_TRUE);
2452*0Sstevel@tonic-gate 		}
2453*0Sstevel@tonic-gate 		if (!pi->pi_lfmsg_printed) {
2454*0Sstevel@tonic-gate 			logerr("The link has come up on %s more than %d times "
2455*0Sstevel@tonic-gate 			    "in the last minute; disabling failback until it "
2456*0Sstevel@tonic-gate 			    "stabilizes\n", pi->pi_name, LINK_UP_PERMIN);
2457*0Sstevel@tonic-gate 			pi->pi_lfmsg_printed = 1;
2458*0Sstevel@tonic-gate 		}
2459*0Sstevel@tonic-gate 
2460*0Sstevel@tonic-gate 		return (_B_FALSE);
2461*0Sstevel@tonic-gate 	}
2462*0Sstevel@tonic-gate 
2463*0Sstevel@tonic-gate 	pii = pi->pi_v4;
2464*0Sstevel@tonic-gate 	if (PROBE_CAPABLE(pii)) {
2465*0Sstevel@tonic-gate 		pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
2466*0Sstevel@tonic-gate 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
2467*0Sstevel@tonic-gate 		probe_success_info(pii, cur_tg, &psinfo);
2468*0Sstevel@tonic-gate 		if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
2469*0Sstevel@tonic-gate 		    psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
2470*0Sstevel@tonic-gate 			return (_B_TRUE);
2471*0Sstevel@tonic-gate 	}
2472*0Sstevel@tonic-gate 
2473*0Sstevel@tonic-gate 	pii = pi->pi_v6;
2474*0Sstevel@tonic-gate 	if (PROBE_CAPABLE(pii)) {
2475*0Sstevel@tonic-gate 		pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
2476*0Sstevel@tonic-gate 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
2477*0Sstevel@tonic-gate 		probe_success_info(pii, cur_tg, &psinfo);
2478*0Sstevel@tonic-gate 		if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
2479*0Sstevel@tonic-gate 		    psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
2480*0Sstevel@tonic-gate 			return (_B_TRUE);
2481*0Sstevel@tonic-gate 	}
2482*0Sstevel@tonic-gate 
2483*0Sstevel@tonic-gate 	return (_B_FALSE);
2484*0Sstevel@tonic-gate }
2485*0Sstevel@tonic-gate 
2486*0Sstevel@tonic-gate /*
2487*0Sstevel@tonic-gate  * Try failover from phyint 'pi' to a suitable destination.
2488*0Sstevel@tonic-gate  */
2489*0Sstevel@tonic-gate int
2490*0Sstevel@tonic-gate try_failover(struct phyint *pi, int failover_type)
2491*0Sstevel@tonic-gate {
2492*0Sstevel@tonic-gate 	struct phyint *dst;
2493*0Sstevel@tonic-gate 	int err;
2494*0Sstevel@tonic-gate 
2495*0Sstevel@tonic-gate 	if (debug & D_FAILOVER)
2496*0Sstevel@tonic-gate 		logdebug("try_failover(%s %d)\n", pi->pi_name, failover_type);
2497*0Sstevel@tonic-gate 
2498*0Sstevel@tonic-gate 	/*
2499*0Sstevel@tonic-gate 	 * Attempt to find a failover destination 'dst'.
2500*0Sstevel@tonic-gate 	 * dst will be null if any of the following is true
2501*0Sstevel@tonic-gate 	 * Phyint is not part of a group  OR
2502*0Sstevel@tonic-gate 	 * Phyint is the only member of a group OR
2503*0Sstevel@tonic-gate 	 * No suitable failover dst was available
2504*0Sstevel@tonic-gate 	 */
2505*0Sstevel@tonic-gate 	dst = get_failover_dst(pi, failover_type);
2506*0Sstevel@tonic-gate 	if (dst == NULL)
2507*0Sstevel@tonic-gate 		return (IPMP_EMINRED);
2508*0Sstevel@tonic-gate 
2509*0Sstevel@tonic-gate 	dst->pi_empty = 0;			/* Per state diagram */
2510*0Sstevel@tonic-gate 	pi->pi_full = 0;			/* Per state diagram */
2511*0Sstevel@tonic-gate 
2512*0Sstevel@tonic-gate 	err = failover(pi, dst);
2513*0Sstevel@tonic-gate 
2514*0Sstevel@tonic-gate 	if (debug & D_FAILOVER) {
2515*0Sstevel@tonic-gate 		logdebug("failed over from %s to %s ret %d\n",
2516*0Sstevel@tonic-gate 		    pi->pi_name, dst->pi_name, err);
2517*0Sstevel@tonic-gate 	}
2518*0Sstevel@tonic-gate 	if (err == 0) {
2519*0Sstevel@tonic-gate 		pi->pi_empty = 1;		/* Per state diagram */
2520*0Sstevel@tonic-gate 		/*
2521*0Sstevel@tonic-gate 		 * we don't want to print out this message if a
2522*0Sstevel@tonic-gate 		 * phyint is leaving the group, nor for failover from
2523*0Sstevel@tonic-gate 		 * standby
2524*0Sstevel@tonic-gate 		 */
2525*0Sstevel@tonic-gate 		if (failover_type == FAILOVER_NORMAL) {
2526*0Sstevel@tonic-gate 			logerr("Successfully failed over from NIC %s to NIC "
2527*0Sstevel@tonic-gate 			    "%s\n", pi->pi_name, dst->pi_name);
2528*0Sstevel@tonic-gate 		}
2529*0Sstevel@tonic-gate 		return (0);
2530*0Sstevel@tonic-gate 	} else {
2531*0Sstevel@tonic-gate 		/*
2532*0Sstevel@tonic-gate 		 * The failover did not succeed. We must retry the failover
2533*0Sstevel@tonic-gate 		 * only after resyncing our state based on the kernel's.
2534*0Sstevel@tonic-gate 		 * For eg. either the src or the dst might have been unplumbed
2535*0Sstevel@tonic-gate 		 * causing this failure. initifs() will be called again,
2536*0Sstevel@tonic-gate 		 * from main, since full_scan_required has been set to true
2537*0Sstevel@tonic-gate 		 * by failover();
2538*0Sstevel@tonic-gate 		 */
2539*0Sstevel@tonic-gate 		return (IPMP_FAILURE);
2540*0Sstevel@tonic-gate 	}
2541*0Sstevel@tonic-gate }
2542*0Sstevel@tonic-gate 
2543*0Sstevel@tonic-gate /*
2544*0Sstevel@tonic-gate  * global_errno captures the errno value, if failover() or failback()
2545*0Sstevel@tonic-gate  * fails. This is sent to if_mpadm(1M).
2546*0Sstevel@tonic-gate  */
2547*0Sstevel@tonic-gate int global_errno;
2548*0Sstevel@tonic-gate 
2549*0Sstevel@tonic-gate /*
2550*0Sstevel@tonic-gate  * Attempt failover from phyint 'from' to phyint 'to'.
2551*0Sstevel@tonic-gate  * IP moves everything from phyint 'from' to phyint 'to'.
2552*0Sstevel@tonic-gate  */
2553*0Sstevel@tonic-gate static int
2554*0Sstevel@tonic-gate failover(struct phyint *from, struct phyint *to)
2555*0Sstevel@tonic-gate {
2556*0Sstevel@tonic-gate 	struct	lifreq	lifr;
2557*0Sstevel@tonic-gate 	int 	ret;
2558*0Sstevel@tonic-gate 
2559*0Sstevel@tonic-gate 	if (debug & D_FAILOVER) {
2560*0Sstevel@tonic-gate 		logdebug("failing over from %s to %s\n",
2561*0Sstevel@tonic-gate 		    from->pi_name, to->pi_name);
2562*0Sstevel@tonic-gate 	}
2563*0Sstevel@tonic-gate 
2564*0Sstevel@tonic-gate 	/*
2565*0Sstevel@tonic-gate 	 * Perform the failover. Both IPv4 and IPv6 are failed over
2566*0Sstevel@tonic-gate 	 * using a single ioctl by passing in AF_UNSPEC family.
2567*0Sstevel@tonic-gate 	 */
2568*0Sstevel@tonic-gate 	lifr.lifr_addr.ss_family = AF_UNSPEC;
2569*0Sstevel@tonic-gate 	(void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name));
2570*0Sstevel@tonic-gate 	lifr.lifr_movetoindex = to->pi_ifindex;
2571*0Sstevel@tonic-gate 
2572*0Sstevel@tonic-gate 	ret = ioctl(ifsock_v4, SIOCLIFFAILOVER, (caddr_t)&lifr);
2573*0Sstevel@tonic-gate 	if (ret < 0) {
2574*0Sstevel@tonic-gate 		global_errno = errno;
2575*0Sstevel@tonic-gate 		logperror("failover: ioctl (failover)");
2576*0Sstevel@tonic-gate 	}
2577*0Sstevel@tonic-gate 
2578*0Sstevel@tonic-gate 	/*
2579*0Sstevel@tonic-gate 	 * Set full_scan_required to true. This will make us read
2580*0Sstevel@tonic-gate 	 * the state from the kernel in initifs() and update our tables,
2581*0Sstevel@tonic-gate 	 * to reflect the current state after the failover. If the
2582*0Sstevel@tonic-gate 	 * failover has failed it will then reissue the failover.
2583*0Sstevel@tonic-gate 	 */
2584*0Sstevel@tonic-gate 	full_scan_required = _B_TRUE;
2585*0Sstevel@tonic-gate 	return (ret);
2586*0Sstevel@tonic-gate }
2587*0Sstevel@tonic-gate 
2588*0Sstevel@tonic-gate /*
2589*0Sstevel@tonic-gate  * phyint 'pi' has recovered. Attempt failback from every phyint in the same
2590*0Sstevel@tonic-gate  * group as phyint 'pi' that is a potential failback source, to phyint 'pi'.
2591*0Sstevel@tonic-gate  * Return values:
2592*0Sstevel@tonic-gate  * IPMP_SUCCESS:		Failback successful from each of the other
2593*0Sstevel@tonic-gate  *				phyints in the group.
2594*0Sstevel@tonic-gate  * IPMP_EFBPARTIAL: 		Failback successful from some of the other
2595*0Sstevel@tonic-gate  *				phyints in the group.
2596*0Sstevel@tonic-gate  * IPMP_FAILURE:		Failback syscall failed with some error.
2597*0Sstevel@tonic-gate  *
2598*0Sstevel@tonic-gate  * Note that failback is attempted regardless of the setting of the
2599*0Sstevel@tonic-gate  * failback_enabled flag.
2600*0Sstevel@tonic-gate  */
2601*0Sstevel@tonic-gate int
2602*0Sstevel@tonic-gate do_failback(struct phyint *pi, boolean_t check_only)
2603*0Sstevel@tonic-gate {
2604*0Sstevel@tonic-gate 	struct  phyint *from;
2605*0Sstevel@tonic-gate 	boolean_t done;
2606*0Sstevel@tonic-gate 	boolean_t partial;
2607*0Sstevel@tonic-gate 	boolean_t attempted_failback = _B_FALSE;
2608*0Sstevel@tonic-gate 
2609*0Sstevel@tonic-gate 	if (debug & D_FAILOVER)
2610*0Sstevel@tonic-gate 		logdebug("do_failback(%s)\n", pi->pi_name);
2611*0Sstevel@tonic-gate 
2612*0Sstevel@tonic-gate 	/* If this phyint is not part of a named group, return. */
2613*0Sstevel@tonic-gate 	if (pi->pi_group == phyint_anongroup) {
2614*0Sstevel@tonic-gate 		pi->pi_full = 1;
2615*0Sstevel@tonic-gate 		return (IPMP_SUCCESS);
2616*0Sstevel@tonic-gate 	}
2617*0Sstevel@tonic-gate 
2618*0Sstevel@tonic-gate 	/*
2619*0Sstevel@tonic-gate 	 * Attempt failback from every phyint in the group to 'pi'.
2620*0Sstevel@tonic-gate 	 * The reason for doing this, instead of only from the
2621*0Sstevel@tonic-gate 	 * phyint to which we did the failover is given below.
2622*0Sstevel@tonic-gate 	 *
2623*0Sstevel@tonic-gate 	 * After 'pi' failed, if any app. tries to join on a multicast
2624*0Sstevel@tonic-gate 	 * address (IPv6), on the failed phyint, IP picks any arbitrary
2625*0Sstevel@tonic-gate 	 * non-failed phyint in the group, instead of the failed phyint,
2626*0Sstevel@tonic-gate 	 * in.mpathd is not aware of this. Thus failing back only from the
2627*0Sstevel@tonic-gate 	 * interface to which 'pi' failed over, will failback the ipif's
2628*0Sstevel@tonic-gate 	 * but not the ilm's. So we need to failback from all members of
2629*0Sstevel@tonic-gate 	 * the phyint group
2630*0Sstevel@tonic-gate 	 */
2631*0Sstevel@tonic-gate 	done = _B_TRUE;
2632*0Sstevel@tonic-gate 	partial = _B_FALSE;
2633*0Sstevel@tonic-gate 	for (from = pi->pi_group->pg_phyint; from != NULL;
2634*0Sstevel@tonic-gate 	    from = from->pi_pgnext) {
2635*0Sstevel@tonic-gate 		/* Exclude ourself as a failback src */
2636*0Sstevel@tonic-gate 		if (from == pi)
2637*0Sstevel@tonic-gate 			continue;
2638*0Sstevel@tonic-gate 
2639*0Sstevel@tonic-gate 		/*
2640*0Sstevel@tonic-gate 		 * If the 'from' phyint has IPv4 plumbed, the 'to'
2641*0Sstevel@tonic-gate 		 * phyint must also have IPv4 plumbed. Similar check
2642*0Sstevel@tonic-gate 		 * for IPv6. IP makes the same check. Otherwise the
2643*0Sstevel@tonic-gate 		 * failback will fail.
2644*0Sstevel@tonic-gate 		 */
2645*0Sstevel@tonic-gate 		if ((from->pi_v4 != NULL && pi->pi_v4 == NULL) ||
2646*0Sstevel@tonic-gate 		    (from->pi_v6 != NULL && pi->pi_v6 == NULL)) {
2647*0Sstevel@tonic-gate 			partial = _B_TRUE;
2648*0Sstevel@tonic-gate 			continue;
2649*0Sstevel@tonic-gate 		}
2650*0Sstevel@tonic-gate 
2651*0Sstevel@tonic-gate 		if (!check_only) {
2652*0Sstevel@tonic-gate 			pi->pi_empty = 0;	/* Per state diagram */
2653*0Sstevel@tonic-gate 			attempted_failback = _B_TRUE;
2654*0Sstevel@tonic-gate 			if (failback(from, pi) != 0) {
2655*0Sstevel@tonic-gate 				done = _B_FALSE;
2656*0Sstevel@tonic-gate 				break;
2657*0Sstevel@tonic-gate 			}
2658*0Sstevel@tonic-gate 		}
2659*0Sstevel@tonic-gate 	}
2660*0Sstevel@tonic-gate 
2661*0Sstevel@tonic-gate 	if (check_only) {
2662*0Sstevel@tonic-gate 		return (partial ? IPMP_EFBPARTIAL : IPMP_SUCCESS);
2663*0Sstevel@tonic-gate 	}
2664*0Sstevel@tonic-gate 
2665*0Sstevel@tonic-gate 	/*
2666*0Sstevel@tonic-gate 	 * We are done. No more phyint from which we can src the failback
2667*0Sstevel@tonic-gate 	 */
2668*0Sstevel@tonic-gate 	if (done) {
2669*0Sstevel@tonic-gate 		if (!partial)
2670*0Sstevel@tonic-gate 			pi->pi_full = 1;	/* Per state diagram */
2671*0Sstevel@tonic-gate 		/*
2672*0Sstevel@tonic-gate 		 * Don't print out a message unless there is a
2673*0Sstevel@tonic-gate 		 * transition from FAILED to RUNNING. For eg.
2674*0Sstevel@tonic-gate 		 * we don't want to print out this message if a
2675*0Sstevel@tonic-gate 		 * phyint is leaving the group, or at startup
2676*0Sstevel@tonic-gate 		 */
2677*0Sstevel@tonic-gate 		if (attempted_failback && (pi->pi_flags &
2678*0Sstevel@tonic-gate 		    (IFF_FAILED | IFF_OFFLINE))) {
2679*0Sstevel@tonic-gate 			logerr("Successfully failed back to NIC %s\n",
2680*0Sstevel@tonic-gate 			    pi->pi_name);
2681*0Sstevel@tonic-gate 		}
2682*0Sstevel@tonic-gate 		return (partial ? IPMP_EFBPARTIAL : IPMP_SUCCESS);
2683*0Sstevel@tonic-gate 	}
2684*0Sstevel@tonic-gate 
2685*0Sstevel@tonic-gate 	return (IPMP_FAILURE);
2686*0Sstevel@tonic-gate }
2687*0Sstevel@tonic-gate 
2688*0Sstevel@tonic-gate /*
2689*0Sstevel@tonic-gate  * This function is similar to do_failback() above, but respects the
2690*0Sstevel@tonic-gate  * failback_enabled flag for phyints in named groups.
2691*0Sstevel@tonic-gate  */
2692*0Sstevel@tonic-gate int
2693*0Sstevel@tonic-gate try_failback(struct phyint *pi, boolean_t check_only)
2694*0Sstevel@tonic-gate {
2695*0Sstevel@tonic-gate 	if (debug & D_FAILOVER)
2696*0Sstevel@tonic-gate 		logdebug("try_failback(%s)\n", pi->pi_name);
2697*0Sstevel@tonic-gate 
2698*0Sstevel@tonic-gate 	if (pi->pi_group != phyint_anongroup && !failback_enabled)
2699*0Sstevel@tonic-gate 		return (IPMP_EFBDISABLED);
2700*0Sstevel@tonic-gate 
2701*0Sstevel@tonic-gate 	return (do_failback(pi, check_only));
2702*0Sstevel@tonic-gate }
2703*0Sstevel@tonic-gate 
2704*0Sstevel@tonic-gate /*
2705*0Sstevel@tonic-gate  * Failback everything from phyint 'from' that has the same ifindex
2706*0Sstevel@tonic-gate  * as phyint to's ifindex.
2707*0Sstevel@tonic-gate  */
2708*0Sstevel@tonic-gate static int
2709*0Sstevel@tonic-gate failback(struct phyint *from, struct phyint *to)
2710*0Sstevel@tonic-gate {
2711*0Sstevel@tonic-gate 	struct lifreq lifr;
2712*0Sstevel@tonic-gate 	int ret;
2713*0Sstevel@tonic-gate 
2714*0Sstevel@tonic-gate 	if (debug & D_FAILOVER)
2715*0Sstevel@tonic-gate 		logdebug("failback(%s %s)\n", from->pi_name, to->pi_name);
2716*0Sstevel@tonic-gate 
2717*0Sstevel@tonic-gate 	lifr.lifr_addr.ss_family = AF_UNSPEC;
2718*0Sstevel@tonic-gate 	(void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name));
2719*0Sstevel@tonic-gate 	lifr.lifr_movetoindex = to->pi_ifindex;
2720*0Sstevel@tonic-gate 
2721*0Sstevel@tonic-gate 	ret = ioctl(ifsock_v4, SIOCLIFFAILBACK, (caddr_t)&lifr);
2722*0Sstevel@tonic-gate 	if (ret < 0) {
2723*0Sstevel@tonic-gate 		global_errno = errno;
2724*0Sstevel@tonic-gate 		logperror("failback: ioctl (failback)");
2725*0Sstevel@tonic-gate 	}
2726*0Sstevel@tonic-gate 
2727*0Sstevel@tonic-gate 	/*
2728*0Sstevel@tonic-gate 	 * Set full_scan_required to true. This will make us read
2729*0Sstevel@tonic-gate 	 * the state from the kernel in initifs() and update our tables,
2730*0Sstevel@tonic-gate 	 * to reflect the current state after the failback. If the
2731*0Sstevel@tonic-gate 	 * failback has failed it will then reissue the failback.
2732*0Sstevel@tonic-gate 	 */
2733*0Sstevel@tonic-gate 	full_scan_required = _B_TRUE;
2734*0Sstevel@tonic-gate 
2735*0Sstevel@tonic-gate 	return (ret);
2736*0Sstevel@tonic-gate }
2737*0Sstevel@tonic-gate 
2738*0Sstevel@tonic-gate /*
2739*0Sstevel@tonic-gate  * Select a target phyint for failing over from 'pi'.
2740*0Sstevel@tonic-gate  * In the normal case i.e. failover_type is FAILOVER_NORMAL, the preferred
2741*0Sstevel@tonic-gate  * target phyint is chosen as follows,
2742*0Sstevel@tonic-gate  *	1. Pick any inactive standby interface.
2743*0Sstevel@tonic-gate  *	2. If no inactive standby is available, select any phyint in the
2744*0Sstevel@tonic-gate  *	   same group that has the least number of logints, (excluding
2745*0Sstevel@tonic-gate  *	   IFF_NOFAILOVER and !IFF_UP logints)
2746*0Sstevel@tonic-gate  * If we are failing over from a standby, failover_type is
2747*0Sstevel@tonic-gate  * FAILOVER_TO_NONSTANDBY, and we won't pick a standby for the destination.
2748*0Sstevel@tonic-gate  * If a phyint is leaving the group, then failover_type is FAILOVER_TO_ANY,
2749*0Sstevel@tonic-gate  * and we won't return NULL, as long as there is at least 1 other phyint
2750*0Sstevel@tonic-gate  * in the group.
2751*0Sstevel@tonic-gate  */
2752*0Sstevel@tonic-gate static struct phyint *
2753*0Sstevel@tonic-gate get_failover_dst(struct phyint *pi, int failover_type)
2754*0Sstevel@tonic-gate {
2755*0Sstevel@tonic-gate 	struct phyint	*maybe = NULL;
2756*0Sstevel@tonic-gate 	struct phyint	*pi2;
2757*0Sstevel@tonic-gate 	struct phyint 	*last_choice = NULL;
2758*0Sstevel@tonic-gate 
2759*0Sstevel@tonic-gate 	if (pi->pi_group == phyint_anongroup)
2760*0Sstevel@tonic-gate 		return (NULL);
2761*0Sstevel@tonic-gate 
2762*0Sstevel@tonic-gate 	/*
2763*0Sstevel@tonic-gate 	 * Loop thru the phyints in the group, and pick the preferred
2764*0Sstevel@tonic-gate 	 * phyint for the target.
2765*0Sstevel@tonic-gate 	 */
2766*0Sstevel@tonic-gate 	for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
2767*0Sstevel@tonic-gate 		/* Exclude ourself and offlined interfaces */
2768*0Sstevel@tonic-gate 		if (pi2 == pi || pi2->pi_state == PI_OFFLINE)
2769*0Sstevel@tonic-gate 			continue;
2770*0Sstevel@tonic-gate 
2771*0Sstevel@tonic-gate 		/*
2772*0Sstevel@tonic-gate 		 * The chosen target phyint must have IPv4 instance
2773*0Sstevel@tonic-gate 		 * plumbed, if the src phyint has IPv4 plumbed. Similarly
2774*0Sstevel@tonic-gate 		 * for IPv6.
2775*0Sstevel@tonic-gate 		 */
2776*0Sstevel@tonic-gate 		if ((pi2->pi_v4 == NULL && pi->pi_v4 != NULL) ||
2777*0Sstevel@tonic-gate 		    (pi2->pi_v6 == NULL && pi->pi_v6 != NULL))
2778*0Sstevel@tonic-gate 			continue;
2779*0Sstevel@tonic-gate 
2780*0Sstevel@tonic-gate 		/* The chosen target must be PI_RUNNING. */
2781*0Sstevel@tonic-gate 		if (pi2->pi_state != PI_RUNNING) {
2782*0Sstevel@tonic-gate 			last_choice = pi2;
2783*0Sstevel@tonic-gate 			continue;
2784*0Sstevel@tonic-gate 		}
2785*0Sstevel@tonic-gate 
2786*0Sstevel@tonic-gate 		if ((pi2->pi_flags & IFF_INACTIVE) &&
2787*0Sstevel@tonic-gate 		    (failover_type != FAILOVER_TO_NONSTANDBY)) {
2788*0Sstevel@tonic-gate 			return (pi2);
2789*0Sstevel@tonic-gate 		} else {
2790*0Sstevel@tonic-gate 			if (maybe == NULL)
2791*0Sstevel@tonic-gate 				maybe = pi2;
2792*0Sstevel@tonic-gate 			else if (logint_upcount(pi2) < logint_upcount(maybe))
2793*0Sstevel@tonic-gate 				maybe = pi2;
2794*0Sstevel@tonic-gate 		}
2795*0Sstevel@tonic-gate 	}
2796*0Sstevel@tonic-gate 	if (maybe == NULL && failover_type == FAILOVER_TO_ANY)
2797*0Sstevel@tonic-gate 		return (last_choice);
2798*0Sstevel@tonic-gate 	else
2799*0Sstevel@tonic-gate 		return (maybe);
2800*0Sstevel@tonic-gate }
2801*0Sstevel@tonic-gate 
2802*0Sstevel@tonic-gate /*
2803*0Sstevel@tonic-gate  * Used to set/clear phyint flags, by making a SIOCSLIFFLAGS call.
2804*0Sstevel@tonic-gate  */
2805*0Sstevel@tonic-gate boolean_t
2806*0Sstevel@tonic-gate change_lif_flags(struct phyint *pi, uint64_t flags, boolean_t setfl)
2807*0Sstevel@tonic-gate {
2808*0Sstevel@tonic-gate 	int ifsock;
2809*0Sstevel@tonic-gate 	struct lifreq lifr;
2810*0Sstevel@tonic-gate 
2811*0Sstevel@tonic-gate 	if (debug & D_FAILOVER) {
2812*0Sstevel@tonic-gate 		logdebug("change_lif_flags(%s): flags %llx setfl %d\n",
2813*0Sstevel@tonic-gate 		    pi->pi_name, flags, (int)setfl);
2814*0Sstevel@tonic-gate 	}
2815*0Sstevel@tonic-gate 
2816*0Sstevel@tonic-gate 	if (pi->pi_v4 != NULL) {
2817*0Sstevel@tonic-gate 		ifsock = ifsock_v4;
2818*0Sstevel@tonic-gate 	} else  {
2819*0Sstevel@tonic-gate 		ifsock = ifsock_v6;
2820*0Sstevel@tonic-gate 	}
2821*0Sstevel@tonic-gate 
2822*0Sstevel@tonic-gate 	/*
2823*0Sstevel@tonic-gate 	 * Get the current flags from the kernel, and set/clear the
2824*0Sstevel@tonic-gate 	 * desired phyint flags. Since we set only phyint flags, we can
2825*0Sstevel@tonic-gate 	 * do it on either IPv4 or IPv6 instance.
2826*0Sstevel@tonic-gate 	 */
2827*0Sstevel@tonic-gate 	(void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name));
2828*0Sstevel@tonic-gate 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
2829*0Sstevel@tonic-gate 	if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
2830*0Sstevel@tonic-gate 		if (errno != ENXIO)
2831*0Sstevel@tonic-gate 			logperror("change_lif_flags: ioctl (get flags)");
2832*0Sstevel@tonic-gate 		return (_B_FALSE);
2833*0Sstevel@tonic-gate 	}
2834*0Sstevel@tonic-gate 	if (setfl)
2835*0Sstevel@tonic-gate 		lifr.lifr_flags |= flags;
2836*0Sstevel@tonic-gate 	else
2837*0Sstevel@tonic-gate 		lifr.lifr_flags &= ~flags;
2838*0Sstevel@tonic-gate 	if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) {
2839*0Sstevel@tonic-gate 		if (errno != ENXIO)
2840*0Sstevel@tonic-gate 			logperror("change_lif_flags: ioctl (set flags)");
2841*0Sstevel@tonic-gate 		return (_B_FALSE);
2842*0Sstevel@tonic-gate 	}
2843*0Sstevel@tonic-gate 
2844*0Sstevel@tonic-gate 	/*
2845*0Sstevel@tonic-gate 	 * Keep pi_flags in synch. with actual flags. Assumes flags are
2846*0Sstevel@tonic-gate 	 * phyint flags.
2847*0Sstevel@tonic-gate 	 */
2848*0Sstevel@tonic-gate 	if (setfl)
2849*0Sstevel@tonic-gate 		pi->pi_flags |= flags;
2850*0Sstevel@tonic-gate 	else
2851*0Sstevel@tonic-gate 		pi->pi_flags &= ~flags;
2852*0Sstevel@tonic-gate 
2853*0Sstevel@tonic-gate 	if (pi->pi_v4)
2854*0Sstevel@tonic-gate 		pi->pi_v4->pii_flags = pi->pi_flags;
2855*0Sstevel@tonic-gate 
2856*0Sstevel@tonic-gate 	if (pi->pi_v6)
2857*0Sstevel@tonic-gate 		pi->pi_v6->pii_flags = pi->pi_flags;
2858*0Sstevel@tonic-gate 
2859*0Sstevel@tonic-gate 	return (_B_TRUE);
2860*0Sstevel@tonic-gate }
2861*0Sstevel@tonic-gate 
/*
 * icmp cksum computation for IPv4.
 *
 * Sums the 'len' bytes starting at 'addr' as a sequence of 16 bit words
 * (a trailing odd byte is placed in the first byte of an otherwise zero
 * word), folds the carries back into the low 16 bits and returns the one's
 * complement of the result, truncated to 16 bits.
 */
static int
in_cksum(ushort_t *addr, int len)
{
	ushort_t	*wordp = addr;
	ushort_t	tail = 0;
	ushort_t	cksum;
	int		remaining;
	int		acc = 0;

	/*
	 * Accumulate consecutive 16 bit words in a 32 bit accumulator;
	 * the carries collect in its upper half.
	 */
	for (remaining = len; remaining > 1; remaining -= 2)
		acc += *wordp++;

	/* A single leftover byte goes into the first byte of 'tail' */
	if (remaining == 1) {
		*(uchar_t *)(&tail) = *(uchar_t *)wordp;
		acc += tail;
	}

	/* Fold the upper 16 bits into the lower 16, twice for the carry */
	acc = (acc >> 16) + (acc & 0xffff);
	acc += (acc >> 16);

	/* Complement; the assignment truncates to 16 bits */
	cksum = ~acc;
	return (cksum);
}
2899*0Sstevel@tonic-gate 
2900*0Sstevel@tonic-gate static void
2901*0Sstevel@tonic-gate reset_snxt_basetimes(void)
2902*0Sstevel@tonic-gate {
2903*0Sstevel@tonic-gate 	struct phyint_instance *pii;
2904*0Sstevel@tonic-gate 
2905*0Sstevel@tonic-gate 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
2906*0Sstevel@tonic-gate 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
2907*0Sstevel@tonic-gate 	}
2908*0Sstevel@tonic-gate }
2909*0Sstevel@tonic-gate 
2910*0Sstevel@tonic-gate /*
2911*0Sstevel@tonic-gate  * Is the address one of our own addresses? Unfortunately,
2912*0Sstevel@tonic-gate  * we cannot check our phyint tables to determine if the address
2913*0Sstevel@tonic-gate  * is our own. This is because, we don't track interfaces that
2914*0Sstevel@tonic-gate  * are not part of any group. We have to either use a 'bind' or
2915*0Sstevel@tonic-gate  * get the complete list of all interfaces using SIOCGLIFCONF,
2916*0Sstevel@tonic-gate  * to do this check. We choose to use 'bind'. We could use
2917*0Sstevel@tonic-gate  * SIOCTMYADDR, but bind is preferred, since it is stronger.
2918*0Sstevel@tonic-gate  * SIOCTMYADDR excludes down interfaces, while bind includes even
2919*0Sstevel@tonic-gate  * down interfaces.
2920*0Sstevel@tonic-gate  */
2921*0Sstevel@tonic-gate boolean_t
2922*0Sstevel@tonic-gate own_address(int af, struct in6_addr addr)
2923*0Sstevel@tonic-gate {
2924*0Sstevel@tonic-gate 	int sock;
2925*0Sstevel@tonic-gate 	boolean_t ours = _B_TRUE;
2926*0Sstevel@tonic-gate 
2927*0Sstevel@tonic-gate 	sock = socket(AF_INET6, SOCK_DGRAM, 0);
2928*0Sstevel@tonic-gate 	if (sock  == -1) {
2929*0Sstevel@tonic-gate 		logperror("own_address: socket");
2930*0Sstevel@tonic-gate 		/*
2931*0Sstevel@tonic-gate 		 * If the socket call fails, err on the side of caution,
2932*0Sstevel@tonic-gate 		 * and return true.
2933*0Sstevel@tonic-gate 		 */
2934*0Sstevel@tonic-gate 	} else {
2935*0Sstevel@tonic-gate 		struct sockaddr_in6 sin6;
2936*0Sstevel@tonic-gate 
2937*0Sstevel@tonic-gate 		(void) memset(&sin6, 0, sizeof (struct sockaddr_in6));
2938*0Sstevel@tonic-gate 		sin6.sin6_family = AF_INET6;
2939*0Sstevel@tonic-gate 		sin6.sin6_addr = addr;
2940*0Sstevel@tonic-gate 		/*
2941*0Sstevel@tonic-gate 		 * If the bind succeeds, then this address is one of our
2942*0Sstevel@tonic-gate 		 * addresses.
2943*0Sstevel@tonic-gate 		 * If bind returns error EADDRNOTAVAIL, the address is
2944*0Sstevel@tonic-gate 		 * not one of ours.
2945*0Sstevel@tonic-gate 		 * If bind returns an error other than EADDRNOTAVAIL, err
2946*0Sstevel@tonic-gate 		 * on the side of caution and report the address as one of
2947*0Sstevel@tonic-gate 		 * our own.
2948*0Sstevel@tonic-gate 		 */
2949*0Sstevel@tonic-gate 		if (bind(sock, (struct sockaddr *)&sin6,
2950*0Sstevel@tonic-gate 		    sizeof (struct sockaddr_in6)) == -1) {
2951*0Sstevel@tonic-gate 			if (errno == EADDRNOTAVAIL)
2952*0Sstevel@tonic-gate 				ours = _B_FALSE;
2953*0Sstevel@tonic-gate 			else
2954*0Sstevel@tonic-gate 				logperror("own_address: bind");
2955*0Sstevel@tonic-gate 		}
2956*0Sstevel@tonic-gate 		(void) close(sock);
2957*0Sstevel@tonic-gate 	}
2958*0Sstevel@tonic-gate 	if (debug & D_TARGET) {
2959*0Sstevel@tonic-gate 		char abuf[INET6_ADDRSTRLEN];
2960*0Sstevel@tonic-gate 
2961*0Sstevel@tonic-gate 		logdebug("own_address: addr %s is %s ours\n",
2962*0Sstevel@tonic-gate 		    pr_addr(af, addr, abuf, sizeof (abuf)),
2963*0Sstevel@tonic-gate 		    ours ? "one of" : "not");
2964*0Sstevel@tonic-gate 	}
2965*0Sstevel@tonic-gate 	return (ours);
2966*0Sstevel@tonic-gate }
2967